Files
galaxy-game/backend/internal/telemetry/runtime.go
T
2026-05-06 10:14:55 +03:00

294 lines
9.0 KiB
Go

// Package telemetry owns the OpenTelemetry runtime for the backend process.
//
// New constructs the configured tracer and meter providers, registers them
// globally, and exposes a Shutdown method for orderly process exit. The
// supported exporter set follows README §4:
//
// - traces: none, otlp (gRPC or HTTP/protobuf), stdout
// - metrics: none, otlp (gRPC or HTTP/protobuf), stdout, prometheus
//
// When metrics use the prometheus exporter, the runtime also retains the
// Prometheus HTTP handler and listen address so a separate metrics listener
// (`internal/metricsapi`) can serve the scrape endpoint.
package telemetry
import (
"context"
"errors"
"fmt"
"net/http"
"galaxy/backend/internal/config"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp"
otelprom "go.opentelemetry.io/otel/exporters/prometheus"
"go.opentelemetry.io/otel/exporters/stdout/stdoutmetric"
"go.opentelemetry.io/otel/exporters/stdout/stdouttrace"
"go.opentelemetry.io/otel/metric"
"go.opentelemetry.io/otel/propagation"
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
"go.opentelemetry.io/otel/sdk/resource"
sdktrace "go.opentelemetry.io/otel/sdk/trace"
"go.opentelemetry.io/otel/trace"
"go.uber.org/zap"
)
// Runtime bundles the OpenTelemetry SDK providers built by New together with
// the optional Prometheus scrape handler and its listen address.
type Runtime struct {
	// logger is the logger handed to New (a no-op logger when nil was passed).
	logger *zap.Logger

	// tracerProvider is the SDK tracer provider built by New.
	tracerProvider *sdktrace.TracerProvider

	// meterProvider is the SDK meter provider built by New.
	meterProvider *sdkmetric.MeterProvider

	// promHandler serves Prometheus scrapes; it is nil unless the metrics
	// exporter is "prometheus".
	promHandler http.Handler

	// prometheusListenAddr is the configured scrape listen address; it is
	// empty unless the metrics exporter is "prometheus".
	prometheusListenAddr string
}
// New constructs the backend telemetry runtime, registers global providers,
// and wires the optional Prometheus scrape handler. Callers must invoke
// Runtime.Shutdown during process exit.
//
// A nil logger is replaced with a no-op logger. The returned error wraps the
// failing construction step (resource, tracer provider, or meter provider).
// When the meter provider cannot be built, the already-constructed tracer
// provider is shut down before the error is returned; a failure of that
// cleanup is logged as a warning and never replaces the original error.
//
// On success the tracer provider, the meter provider, and a composite
// TraceContext+Baggage propagator are installed as the OpenTelemetry globals.
func New(ctx context.Context, logger *zap.Logger, cfg config.TelemetryConfig) (*Runtime, error) {
	if logger == nil {
		logger = zap.NewNop()
	}

	res, err := resource.New(
		ctx,
		resource.WithAttributes(attribute.String("service.name", cfg.ServiceName)),
	)
	if err != nil {
		return nil, fmt.Errorf("build telemetry resource: %w", err)
	}

	tracerProvider, err := newTracerProvider(ctx, cfg, res)
	if err != nil {
		return nil, fmt.Errorf("build tracer provider: %w", err)
	}

	meterProvider, promHandler, err := newMeterProvider(ctx, cfg, res)
	if err != nil {
		// The tracer provider was already constructed; release its resources
		// before surfacing the meter-provider error. The cleanup stays
		// best-effort, but a failure is logged instead of silently dropped.
		if shutdownErr := tracerProvider.Shutdown(ctx); shutdownErr != nil {
			logger.Warn(
				"shutdown tracer provider after meter provider failure",
				zap.Error(shutdownErr),
			)
		}
		return nil, fmt.Errorf("build meter provider: %w", err)
	}

	// Globals are only touched once both providers exist, so a failed New
	// leaves the previously registered providers in place.
	otel.SetTracerProvider(tracerProvider)
	otel.SetMeterProvider(meterProvider)
	otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
		propagation.TraceContext{},
		propagation.Baggage{},
	))

	rt := &Runtime{
		logger:         logger,
		tracerProvider: tracerProvider,
		meterProvider:  meterProvider,
		promHandler:    promHandler,
	}
	// The listen address is only meaningful when the Prometheus exporter is
	// selected; otherwise it stays empty.
	if cfg.MetricsExporter == "prometheus" {
		rt.prometheusListenAddr = cfg.PrometheusListenAddr
	}
	return rt, nil
}
// TracerProvider exposes the tracer provider owned by r. A nil receiver or a
// runtime without a tracer provider yields the globally registered provider
// instead.
func (r *Runtime) TracerProvider() trace.TracerProvider {
	if r != nil && r.tracerProvider != nil {
		return r.tracerProvider
	}
	return otel.GetTracerProvider()
}
// MeterProvider exposes the meter provider owned by r. A nil receiver or a
// runtime without a meter provider yields the globally registered provider
// instead.
func (r *Runtime) MeterProvider() metric.MeterProvider {
	if r != nil && r.meterProvider != nil {
		return r.meterProvider
	}
	return otel.GetMeterProvider()
}
// Handler exposes the Prometheus scrape handler held by r. When r is nil or
// no scrape handler was configured, it returns http.NotFoundHandler() so the
// result is always safe to mount.
func (r *Runtime) Handler() http.Handler {
	if r != nil && r.promHandler != nil {
		return r.promHandler
	}
	return http.NotFoundHandler()
}
// PrometheusListenAddr reports the listen address stored on r for the
// Prometheus scrape endpoint. It is the empty string for a nil receiver or
// when no address was recorded.
func (r *Runtime) PrometheusListenAddr() string {
	if r != nil {
		return r.prometheusListenAddr
	}
	return ""
}
// Shutdown shuts down the meter provider and then the tracer provider, both
// bounded by ctx. Errors from the two shutdowns are combined with errors.Join;
// the result is nil when neither fails. Calling Shutdown on a nil receiver is
// a no-op.
func (r *Runtime) Shutdown(ctx context.Context) error {
	if r == nil {
		return nil
	}

	var combined error
	// Metrics first, traces second.
	if mp := r.meterProvider; mp != nil {
		combined = errors.Join(combined, mp.Shutdown(ctx))
	}
	if tp := r.tracerProvider; tp != nil {
		combined = errors.Join(combined, tp.Shutdown(ctx))
	}
	return combined
}
// TraceFieldsFromContext extracts the span context carried by ctx and renders
// it as two zap string fields, "otel_trace_id" and "otel_span_id". It returns
// nil when ctx is nil or does not carry a valid span context. Keeping the
// helper here spares callers from importing the OTel API themselves.
func TraceFieldsFromContext(ctx context.Context) []zap.Field {
	if ctx == nil {
		return nil
	}

	if sc := trace.SpanContextFromContext(ctx); sc.IsValid() {
		traceID := sc.TraceID().String()
		spanID := sc.SpanID().String()
		return []zap.Field{
			zap.String("otel_trace_id", traceID),
			zap.String("otel_span_id", spanID),
		}
	}
	return nil
}
// newTracerProvider builds the SDK tracer provider selected by
// cfg.TracesExporter.
//
// "none" yields a provider carrying only the resource; "stdout" and "otlp"
// additionally attach a batching span processor around the matching exporter.
// Any other value is rejected with an error. Errors from the OTLP exporter
// constructor are returned as-is because that helper already adds context.
func newTracerProvider(ctx context.Context, cfg config.TelemetryConfig, res *resource.Resource) (*sdktrace.TracerProvider, error) {
	var providerOpts []sdktrace.TracerProviderOption

	switch kind := cfg.TracesExporter; kind {
	case "none":
		// No exporter: the provider is created with the resource only.
	case "stdout":
		stdoutExp, err := stdouttrace.New()
		if err != nil {
			return nil, fmt.Errorf("stdout trace exporter: %w", err)
		}
		providerOpts = append(providerOpts, sdktrace.WithBatcher(stdoutExp))
	case "otlp":
		otlpExp, err := newOTLPTraceExporter(ctx, cfg)
		if err != nil {
			return nil, err
		}
		providerOpts = append(providerOpts, sdktrace.WithBatcher(otlpExp))
	default:
		return nil, fmt.Errorf("unsupported traces exporter %q", kind)
	}

	// The resource option goes last, after any batcher option.
	providerOpts = append(providerOpts, sdktrace.WithResource(res))
	return sdktrace.NewTracerProvider(providerOpts...), nil
}
// newOTLPTraceExporter creates the OTLP span exporter for cfg.Protocol, which
// must be "grpc" or "http/protobuf"; any other value is an error. A non-empty
// cfg.Endpoint is passed to the exporter via WithEndpoint; when it is empty no
// endpoint option is supplied.
func newOTLPTraceExporter(ctx context.Context, cfg config.TelemetryConfig) (sdktrace.SpanExporter, error) {
	endpoint := cfg.Endpoint

	switch proto := cfg.Protocol; proto {
	case "http/protobuf":
		var httpOpts []otlptracehttp.Option
		if endpoint != "" {
			httpOpts = append(httpOpts, otlptracehttp.WithEndpoint(endpoint))
		}
		exp, err := otlptracehttp.New(ctx, httpOpts...)
		if err != nil {
			return nil, fmt.Errorf("otlp http trace exporter: %w", err)
		}
		return exp, nil
	case "grpc":
		var grpcOpts []otlptracegrpc.Option
		if endpoint != "" {
			grpcOpts = append(grpcOpts, otlptracegrpc.WithEndpoint(endpoint))
		}
		exp, err := otlptracegrpc.New(ctx, grpcOpts...)
		if err != nil {
			return nil, fmt.Errorf("otlp grpc trace exporter: %w", err)
		}
		return exp, nil
	default:
		return nil, fmt.Errorf("unsupported otel protocol %q", proto)
	}
}
// newMeterProvider builds the SDK meter provider selected by
// cfg.MetricsExporter and, for the "prometheus" exporter only, the HTTP
// handler that serves the scrape output.
//
// "none" yields a provider carrying only the resource; "stdout" and "otlp"
// attach a periodic reader around the matching exporter; "prometheus"
// registers the exporter on a fresh registry and returns a handler for that
// registry. Any other value is rejected with an error. The returned handler
// is nil for every exporter except "prometheus".
func newMeterProvider(ctx context.Context, cfg config.TelemetryConfig, res *resource.Resource) (*sdkmetric.MeterProvider, http.Handler, error) {
	// The resource option always comes first; a reader option may follow.
	providerOpts := []sdkmetric.Option{sdkmetric.WithResource(res)}
	var scrapeHandler http.Handler

	switch kind := cfg.MetricsExporter; kind {
	case "none":
		// No reader: the provider is created with the resource only.
	case "stdout":
		stdoutExp, err := stdoutmetric.New()
		if err != nil {
			return nil, nil, fmt.Errorf("stdout metric exporter: %w", err)
		}
		providerOpts = append(providerOpts, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(stdoutExp)))
	case "otlp":
		otlpExp, err := newOTLPMetricExporter(ctx, cfg)
		if err != nil {
			return nil, nil, err
		}
		providerOpts = append(providerOpts, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(otlpExp)))
	case "prometheus":
		// A dedicated registry is used rather than the client library default.
		registry := prometheus.NewRegistry()
		promExp, err := otelprom.New(otelprom.WithRegisterer(registry))
		if err != nil {
			return nil, nil, fmt.Errorf("prometheus metric exporter: %w", err)
		}
		providerOpts = append(providerOpts, sdkmetric.WithReader(promExp))
		scrapeHandler = promhttp.HandlerFor(registry, promhttp.HandlerOpts{})
	default:
		return nil, nil, fmt.Errorf("unsupported metrics exporter %q", kind)
	}

	return sdkmetric.NewMeterProvider(providerOpts...), scrapeHandler, nil
}
// newOTLPMetricExporter creates the OTLP metric exporter for cfg.Protocol,
// which must be "grpc" or "http/protobuf"; any other value is an error. A
// non-empty cfg.Endpoint is passed to the exporter via WithEndpoint; when it
// is empty no endpoint option is supplied.
func newOTLPMetricExporter(ctx context.Context, cfg config.TelemetryConfig) (sdkmetric.Exporter, error) {
	endpoint := cfg.Endpoint

	switch proto := cfg.Protocol; proto {
	case "http/protobuf":
		var httpOpts []otlpmetrichttp.Option
		if endpoint != "" {
			httpOpts = append(httpOpts, otlpmetrichttp.WithEndpoint(endpoint))
		}
		exp, err := otlpmetrichttp.New(ctx, httpOpts...)
		if err != nil {
			return nil, fmt.Errorf("otlp http metric exporter: %w", err)
		}
		return exp, nil
	case "grpc":
		var grpcOpts []otlpmetricgrpc.Option
		if endpoint != "" {
			grpcOpts = append(grpcOpts, otlpmetricgrpc.WithEndpoint(endpoint))
		}
		exp, err := otlpmetricgrpc.New(ctx, grpcOpts...)
		if err != nil {
			return nil, fmt.Errorf("otlp grpc metric exporter: %w", err)
		}
		return exp, nil
	default:
		return nil, fmt.Errorf("unsupported otel protocol %q", proto)
	}
}