// Package telemetry provides lightweight OpenTelemetry helpers and
// low-cardinality Runtime Manager instruments used by the runnable
// skeleton. Later stages emit into the instruments declared here without
// touching this package.
package telemetry

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"os"
	"strings"
	"sync"
	"time"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
	"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp"
	"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
	"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp"
	"go.opentelemetry.io/otel/exporters/stdout/stdoutmetric"
	"go.opentelemetry.io/otel/exporters/stdout/stdouttrace"
	"go.opentelemetry.io/otel/metric"
	"go.opentelemetry.io/otel/propagation"
	sdkmetric "go.opentelemetry.io/otel/sdk/metric"
	"go.opentelemetry.io/otel/sdk/resource"
	sdktrace "go.opentelemetry.io/otel/sdk/trace"
	oteltrace "go.opentelemetry.io/otel/trace"
)

const meterName = "galaxy/rtmanager"

const (
	defaultServiceName = "galaxy-rtmanager"

	processExporterNone = "none"
	processExporterOTLP = "otlp"

	processProtocolHTTPProtobuf = "http/protobuf"
	processProtocolGRPC         = "grpc"
)

// ProcessConfig configures the process-wide OpenTelemetry runtime.
type ProcessConfig struct {
	// ServiceName overrides the default OpenTelemetry service name.
	ServiceName string

	// TracesExporter selects the external traces exporter. Supported values
	// are `none` and `otlp`.
	TracesExporter string

	// MetricsExporter selects the external metrics exporter. Supported
	// values are `none` and `otlp`.
	MetricsExporter string

	// TracesProtocol selects the OTLP traces protocol when TracesExporter is
	// `otlp`.
	TracesProtocol string

	// MetricsProtocol selects the OTLP metrics protocol when
	// MetricsExporter is `otlp`.
	MetricsProtocol string

	// StdoutTracesEnabled enables the additional stdout trace exporter used
	// for local development and debugging.
	StdoutTracesEnabled bool

	// StdoutMetricsEnabled enables the additional stdout metric exporter
	// used for local development and debugging.
	StdoutMetricsEnabled bool
}

// Validate reports whether cfg contains a supported OpenTelemetry exporter
// configuration.
func (cfg ProcessConfig) Validate() error {
	switch cfg.TracesExporter {
	case processExporterNone, processExporterOTLP:
	default:
		return fmt.Errorf("unsupported traces exporter %q", cfg.TracesExporter)
	}
	switch cfg.MetricsExporter {
	case processExporterNone, processExporterOTLP:
	default:
		return fmt.Errorf("unsupported metrics exporter %q", cfg.MetricsExporter)
	}
	if cfg.TracesProtocol != "" && cfg.TracesProtocol != processProtocolHTTPProtobuf && cfg.TracesProtocol != processProtocolGRPC {
		return fmt.Errorf("unsupported OTLP traces protocol %q", cfg.TracesProtocol)
	}
	if cfg.MetricsProtocol != "" && cfg.MetricsProtocol != processProtocolHTTPProtobuf && cfg.MetricsProtocol != processProtocolGRPC {
		return fmt.Errorf("unsupported OTLP metrics protocol %q", cfg.MetricsProtocol)
	}
	return nil
}
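
// A minimal process wiring sketch (the exporter and protocol choices here
// are illustrative, not defaults enforced by this package):
//
//	cfg := telemetry.ProcessConfig{
//		TracesExporter:  "otlp",
//		MetricsExporter: "otlp",
//		TracesProtocol:  "grpc",
//	}
//	runtime, err := telemetry.NewProcess(ctx, cfg, slog.Default())
//	if err != nil {
//		return err
//	}
//	defer runtime.Shutdown(context.Background())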

// Runtime owns the Runtime Manager OpenTelemetry providers and the
// low-cardinality custom instruments listed in `rtmanager/README.md`
// §Observability.
type Runtime struct {
	tracerProvider oteltrace.TracerProvider
	meterProvider  metric.MeterProvider
	meter          metric.Meter

	shutdownMu   sync.Mutex
	shutdownDone bool
	shutdownErr  error
	shutdownFns  []func(context.Context) error

	internalHTTPRequests metric.Int64Counter
	internalHTTPDuration metric.Float64Histogram

	startOutcomes       metric.Int64Counter
	stopOutcomes        metric.Int64Counter
	restartOutcomes     metric.Int64Counter
	patchOutcomes       metric.Int64Counter
	cleanupOutcomes     metric.Int64Counter
	healthEvents        metric.Int64Counter
	reconcileDrift      metric.Int64Counter
	notificationIntents metric.Int64Counter

	dockerOpLatency     metric.Float64Histogram
	leaseAcquireLatency metric.Float64Histogram

	runtimeRecordsByStatus metric.Int64ObservableGauge
	gaugeMu                sync.Mutex
	gaugeRegistration      metric.Registration
}

// NewWithProviders constructs a telemetry runtime around explicitly supplied
// meterProvider and tracerProvider values.
func NewWithProviders(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider) (*Runtime, error) {
	if meterProvider == nil {
		meterProvider = otel.GetMeterProvider()
	}
	if tracerProvider == nil {
		tracerProvider = otel.GetTracerProvider()
	}
	if meterProvider == nil {
		return nil, errors.New("new rtmanager telemetry runtime: nil meter provider")
	}
	if tracerProvider == nil {
		return nil, errors.New("new rtmanager telemetry runtime: nil tracer provider")
	}
	return buildRuntime(meterProvider, tracerProvider, nil)
}

// NewProcess constructs the process-wide Runtime Manager OpenTelemetry
// runtime from cfg, installs the resulting providers globally, and
// returns the runtime.
func NewProcess(ctx context.Context, cfg ProcessConfig, logger *slog.Logger) (*Runtime, error) {
	if ctx == nil {
		return nil, errors.New("new rtmanager telemetry process: nil context")
	}
	if err := cfg.Validate(); err != nil {
		return nil, fmt.Errorf("new rtmanager telemetry process: %w", err)
	}
	if logger == nil {
		logger = slog.Default()
	}

	serviceName := strings.TrimSpace(cfg.ServiceName)
	if serviceName == "" {
		serviceName = defaultServiceName
	}
	res := resource.NewSchemaless(attribute.String("service.name", serviceName))

	tracerProvider, err := newTracerProvider(ctx, res, cfg)
	if err != nil {
		return nil, fmt.Errorf("new rtmanager telemetry process: tracer provider: %w", err)
	}
	meterProvider, err := newMeterProvider(ctx, res, cfg)
	if err != nil {
		return nil, fmt.Errorf("new rtmanager telemetry process: meter provider: %w", err)
	}

	otel.SetTracerProvider(tracerProvider)
	otel.SetMeterProvider(meterProvider)
	otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
		propagation.TraceContext{},
		propagation.Baggage{},
	))

	runtime, err := buildRuntime(meterProvider, tracerProvider, []func(context.Context) error{
		meterProvider.Shutdown,
		tracerProvider.Shutdown,
	})
	if err != nil {
		return nil, fmt.Errorf("new rtmanager telemetry process: runtime: %w", err)
	}

	logger.Info("rtmanager telemetry configured",
		"service_name", serviceName,
		"traces_exporter", cfg.TracesExporter,
		"metrics_exporter", cfg.MetricsExporter,
	)
	return runtime, nil
}

// TracerProvider returns the runtime tracer provider.
func (runtime *Runtime) TracerProvider() oteltrace.TracerProvider {
	if runtime == nil || runtime.tracerProvider == nil {
		return otel.GetTracerProvider()
	}
	return runtime.tracerProvider
}
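
// Both provider accessors fall back to the global OpenTelemetry providers
// when the runtime or its provider is nil, so call sites may hold a nil
// *Runtime safely. An illustrative span around an operation (the tracer
// and span names are assumptions, not fixed by this package):
//
//	tracer := runtime.TracerProvider().Tracer("galaxy/rtmanager/ops")
//	ctx, span := tracer.Start(ctx, "start_runtime")
//	defer span.End()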

// MeterProvider returns the runtime meter provider.
func (runtime *Runtime) MeterProvider() metric.MeterProvider {
	if runtime == nil || runtime.meterProvider == nil {
		return otel.GetMeterProvider()
	}
	return runtime.meterProvider
}

// Shutdown flushes and stops the configured telemetry providers. Shutdown
// is idempotent.
func (runtime *Runtime) Shutdown(ctx context.Context) error {
	if runtime == nil {
		return nil
	}
	// Hold the mutex for the whole shutdown so a concurrent caller blocks
	// until the first shutdown finishes and then observes its result,
	// instead of returning a premature nil while teardown is in flight.
	runtime.shutdownMu.Lock()
	defer runtime.shutdownMu.Unlock()
	if runtime.shutdownDone {
		return runtime.shutdownErr
	}
	runtime.shutdownDone = true

	runtime.gaugeMu.Lock()
	if runtime.gaugeRegistration != nil {
		_ = runtime.gaugeRegistration.Unregister()
		runtime.gaugeRegistration = nil
	}
	runtime.gaugeMu.Unlock()

	var shutdownErr error
	for index := len(runtime.shutdownFns) - 1; index >= 0; index-- {
		shutdownErr = errors.Join(shutdownErr, runtime.shutdownFns[index](ctx))
	}
	runtime.shutdownErr = shutdownErr
	return shutdownErr
}

// RecordInternalHTTPRequest records one internal HTTP request outcome.
func (runtime *Runtime) RecordInternalHTTPRequest(ctx context.Context, attrs []attribute.KeyValue, duration time.Duration) {
	if runtime == nil || runtime.internalHTTPRequests == nil || runtime.internalHTTPDuration == nil {
		return
	}
	ctx = normalizeContext(ctx)
	options := metric.WithAttributes(attrs...)
	runtime.internalHTTPRequests.Add(ctx, 1, options)
	runtime.internalHTTPDuration.Record(ctx, duration.Seconds()*1000, options)
}

// RecordStartOutcome records one terminal outcome of the start operation.
// outcome is `success` or `failure`; errorCode is `replay_no_op` or one of
// the stable failure codes from `rtmanager/README.md` §Error Model;
// opSource is `lobby_stream`, `gm_rest`, or `admin_rest`.
func (runtime *Runtime) RecordStartOutcome(ctx context.Context, outcome, errorCode, opSource string) {
	if runtime == nil || runtime.startOutcomes == nil {
		return
	}
	runtime.startOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
		attribute.String("outcome", outcome),
		attribute.String("error_code", errorCode),
		attribute.String("op_source", opSource),
	))
}

// RecordStopOutcome records one terminal outcome of the stop operation.
// reason is the value carried on `runtime:stop_jobs` or the matching REST
// reason; opSource is `lobby_stream`, `gm_rest`, or `admin_rest`.
func (runtime *Runtime) RecordStopOutcome(ctx context.Context, outcome, reason, opSource string) {
	if runtime == nil || runtime.stopOutcomes == nil {
		return
	}
	runtime.stopOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
		attribute.String("outcome", outcome),
		attribute.String("reason", reason),
		attribute.String("op_source", opSource),
	))
}

// RecordRestartOutcome records one terminal outcome of the restart
// operation.
func (runtime *Runtime) RecordRestartOutcome(ctx context.Context, outcome, errorCode string) {
	if runtime == nil || runtime.restartOutcomes == nil {
		return
	}
	runtime.restartOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
		attribute.String("outcome", outcome),
		attribute.String("error_code", errorCode),
	))
}

// RecordPatchOutcome records one terminal outcome of the patch operation.
func (runtime *Runtime) RecordPatchOutcome(ctx context.Context, outcome, errorCode string) {
	if runtime == nil || runtime.patchOutcomes == nil {
		return
	}
	runtime.patchOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
		attribute.String("outcome", outcome),
		attribute.String("error_code", errorCode),
	))
}
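
// The recorders above share one shape: guard against a nil runtime or
// instrument, normalize the context, then add a single data point with a
// small frozen attribute set. An illustrative call site (the route and
// method values are assumptions, not part of this package):
//
//	start := time.Now()
//	// ... handle the request ...
//	runtime.RecordInternalHTTPRequest(ctx, []attribute.KeyValue{
//		attribute.String("route", "/internal/runtimes"),
//		attribute.String("method", "POST"),
//	}, time.Since(start))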

// RecordCleanupOutcome records one terminal outcome of the cleanup
// operation. opSource is `auto_ttl` for the periodic cleanup worker and
// `admin_rest` for explicit administrative removal.
func (runtime *Runtime) RecordCleanupOutcome(ctx context.Context, outcome, opSource string) {
	if runtime == nil || runtime.cleanupOutcomes == nil {
		return
	}
	runtime.cleanupOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
		attribute.String("outcome", outcome),
		attribute.String("op_source", opSource),
	))
}

// RecordHealthEvent records one technical runtime event published on
// `runtime:health_events`. eventType comes from the frozen vocabulary in
// `rtmanager/README.md` §Async Stream Contracts.
func (runtime *Runtime) RecordHealthEvent(ctx context.Context, eventType string) {
	if runtime == nil || runtime.healthEvents == nil {
		return
	}
	runtime.healthEvents.Add(normalizeContext(ctx), 1, metric.WithAttributes(
		attribute.String("event_type", eventType),
	))
}

// RecordReconcileDrift records one drift outcome from the reconciler. kind
// is `adopt`, `dispose`, or `observed_exited`.
func (runtime *Runtime) RecordReconcileDrift(ctx context.Context, kind string) {
	if runtime == nil || runtime.reconcileDrift == nil {
		return
	}
	runtime.reconcileDrift.Add(normalizeContext(ctx), 1, metric.WithAttributes(
		attribute.String("kind", kind),
	))
}

// RecordNotificationIntent records one admin-only notification intent
// publish attempt. notificationType is `runtime.image_pull_failed`,
// `runtime.container_start_failed`, or `runtime.start_config_invalid`.
func (runtime *Runtime) RecordNotificationIntent(ctx context.Context, notificationType string) {
	if runtime == nil || runtime.notificationIntents == nil {
		return
	}
	runtime.notificationIntents.Add(normalizeContext(ctx), 1, metric.WithAttributes(
		attribute.String("notification_type", notificationType),
	))
}

// RecordDockerOpLatency records the wall-clock duration of one Docker SDK
// call. op is one of `pull`, `create`, `start`, `stop`, `rm`, `inspect`,
// `events`.
func (runtime *Runtime) RecordDockerOpLatency(ctx context.Context, op string, duration time.Duration) {
	if runtime == nil || runtime.dockerOpLatency == nil {
		return
	}
	runtime.dockerOpLatency.Record(normalizeContext(ctx), duration.Seconds()*1000, metric.WithAttributes(
		attribute.String("op", op),
	))
}

// RecordLeaseAcquireLatency records the wall-clock latency of one
// per-game Redis lease acquisition.
func (runtime *Runtime) RecordLeaseAcquireLatency(ctx context.Context, duration time.Duration) {
	if runtime == nil || runtime.leaseAcquireLatency == nil {
		return
	}
	runtime.leaseAcquireLatency.Record(normalizeContext(ctx), duration.Seconds()*1000)
}

// RuntimeRecordsByStatusProbe reports the number of runtime_records rows
// per status. The production probe wraps the runtime record store; tests
// may pass a stub.
type RuntimeRecordsByStatusProbe interface {
	CountByStatus(ctx context.Context) (map[string]int, error)
}

// GaugeDependencies groups the collaborators required by RegisterGauges.
type GaugeDependencies struct {
	// RuntimeRecordsByStatus probes the per-status row count for
	// `rtmanager.runtime_records_by_status`.
	RuntimeRecordsByStatus RuntimeRecordsByStatusProbe

	// Logger records non-fatal probe errors. Defaults to slog.Default
	// when nil.
	Logger *slog.Logger
}

// RegisterGauges installs the observable-gauge callback that reports
// `rtmanager.runtime_records_by_status`. It is safe to call once per
// Runtime; a second call replaces the previous registration. The runtime
// keeps no strong reference to deps beyond the callback closure.
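//
// A stub probe sufficient for tests might look like this (a sketch; the
// stub type is an assumption, not part of this package):
//
//	type stubProbe map[string]int
//
//	func (p stubProbe) CountByStatus(context.Context) (map[string]int, error) {
//		return p, nil
//	}
//
//	_ = runtime.RegisterGauges(telemetry.GaugeDependencies{
//		RuntimeRecordsByStatus: stubProbe{"running": 3},
//	})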
//
// The wiring layer registers the gauge once the persistence adapters
// are constructed.
func (runtime *Runtime) RegisterGauges(deps GaugeDependencies) error {
	if runtime == nil {
		return errors.New("register rtmanager gauges: nil runtime")
	}
	if deps.RuntimeRecordsByStatus == nil {
		return errors.New("register rtmanager gauges: nil runtime records probe")
	}
	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}

	runtime.gaugeMu.Lock()
	defer runtime.gaugeMu.Unlock()
	if runtime.gaugeRegistration != nil {
		_ = runtime.gaugeRegistration.Unregister()
		runtime.gaugeRegistration = nil
	}

	callback := func(ctx context.Context, observer metric.Observer) error {
		counts, err := deps.RuntimeRecordsByStatus.CountByStatus(ctx)
		if err != nil {
			logger.WarnContext(ctx, "runtime records probe failed",
				"err", err.Error(),
			)
			return nil
		}
		for status, count := range counts {
			observer.ObserveInt64(runtime.runtimeRecordsByStatus, int64(count), metric.WithAttributes(
				attribute.String("status", status),
			))
		}
		return nil
	}
	registration, err := runtime.meter.RegisterCallback(callback, runtime.runtimeRecordsByStatus)
	if err != nil {
		return fmt.Errorf("register rtmanager gauges: %w", err)
	}
	runtime.gaugeRegistration = registration
	return nil
}

func buildRuntime(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider, shutdownFns []func(context.Context) error) (*Runtime, error) {
	meter := meterProvider.Meter(meterName)
	runtime := &Runtime{
		tracerProvider: tracerProvider,
		meterProvider:  meterProvider,
		meter:          meter,
		shutdownFns:    append([]func(context.Context) error(nil), shutdownFns...),
	}

	internalHTTPRequests, err := meter.Int64Counter("rtmanager.internal_http.requests")
	if err != nil {
		return nil, fmt.Errorf("build rtmanager telemetry runtime: internal_http.requests: %w", err)
	}
	internalHTTPDuration, err := meter.Float64Histogram("rtmanager.internal_http.duration", metric.WithUnit("ms"))
	if err != nil {
		return nil, fmt.Errorf("build rtmanager telemetry runtime: internal_http.duration: %w", err)
	}
	runtime.internalHTTPRequests = internalHTTPRequests
	runtime.internalHTTPDuration = internalHTTPDuration

	if err := registerCounters(meter, runtime); err != nil {
		return nil, err
	}
	if err := registerHistograms(meter, runtime); err != nil {
		return nil, err
	}
	if err := registerObservableGauges(meter, runtime); err != nil {
		return nil, err
	}
	return runtime, nil
}

func registerCounters(meter metric.Meter, runtime *Runtime) error {
	specs := []struct {
		name   string
		target *metric.Int64Counter
	}{
		{"rtmanager.start_outcomes", &runtime.startOutcomes},
		{"rtmanager.stop_outcomes", &runtime.stopOutcomes},
		{"rtmanager.restart_outcomes", &runtime.restartOutcomes},
		{"rtmanager.patch_outcomes", &runtime.patchOutcomes},
		{"rtmanager.cleanup_outcomes", &runtime.cleanupOutcomes},
		{"rtmanager.health_events", &runtime.healthEvents},
		{"rtmanager.reconcile_drift", &runtime.reconcileDrift},
		{"rtmanager.notification_intents", &runtime.notificationIntents},
	}
	for _, spec := range specs {
		counter, err := meter.Int64Counter(spec.name)
		if err != nil {
			return fmt.Errorf("build rtmanager telemetry runtime: %s: %w", spec.name, err)
		}
		*spec.target = counter
	}
	return nil
}

func registerHistograms(meter metric.Meter, runtime *Runtime) error {
	specs := []struct {
		name   string
		unit   string
		target *metric.Float64Histogram
	}{
		{"rtmanager.docker_op_latency", "ms", &runtime.dockerOpLatency},
		{"rtmanager.lease_acquire_latency", "ms", &runtime.leaseAcquireLatency},
	}
	for _, spec := range specs {
		options := []metric.Float64HistogramOption{}
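		// Attach the declared unit only when present; a spec with an
		// empty unit registers a unitless histogram.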
spec.unit != "" { options = append(options, metric.WithUnit(spec.unit)) } histogram, err := meter.Float64Histogram(spec.name, options...) if err != nil { return fmt.Errorf("build rtmanager telemetry runtime: %s: %w", spec.name, err) } *spec.target = histogram } return nil } func registerObservableGauges(meter metric.Meter, runtime *Runtime) error { gauge, err := meter.Int64ObservableGauge("rtmanager.runtime_records_by_status") if err != nil { return fmt.Errorf("build rtmanager telemetry runtime: runtime_records_by_status: %w", err) } runtime.runtimeRecordsByStatus = gauge return nil } func newTracerProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig) (*sdktrace.TracerProvider, error) { options := []sdktrace.TracerProviderOption{ sdktrace.WithResource(res), } if exporter, err := traceExporter(ctx, cfg); err != nil { return nil, err } else if exporter != nil { options = append(options, sdktrace.WithBatcher(exporter)) } if cfg.StdoutTracesEnabled { exporter, err := stdouttrace.New(stdouttrace.WithWriter(os.Stdout)) if err != nil { return nil, fmt.Errorf("stdout traces exporter: %w", err) } options = append(options, sdktrace.WithBatcher(exporter)) } return sdktrace.NewTracerProvider(options...), nil } func newMeterProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig) (*sdkmetric.MeterProvider, error) { options := []sdkmetric.Option{ sdkmetric.WithResource(res), } if exporter, err := metricExporter(ctx, cfg); err != nil { return nil, err } else if exporter != nil { options = append(options, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter))) } if cfg.StdoutMetricsEnabled { exporter, err := stdoutmetric.New(stdoutmetric.WithWriter(os.Stdout)) if err != nil { return nil, fmt.Errorf("stdout metrics exporter: %w", err) } options = append(options, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter))) } return sdkmetric.NewMeterProvider(options...), nil } func traceExporter(ctx context.Context, cfg ProcessConfig) (sdktrace.SpanExporter, error) { if cfg.TracesExporter != processExporterOTLP { return nil, nil } switch normalizeProtocol(cfg.TracesProtocol) { case processProtocolGRPC: exporter, err := otlptracegrpc.New(ctx) if err != nil { return nil, fmt.Errorf("otlp grpc traces exporter: %w", err) } return exporter, nil default: exporter, err := otlptracehttp.New(ctx) if err != nil { return nil, fmt.Errorf("otlp http traces exporter: %w", err) } return exporter, nil } } func metricExporter(ctx context.Context, cfg ProcessConfig) (sdkmetric.Exporter, error) { if cfg.MetricsExporter != processExporterOTLP { return nil, nil } switch normalizeProtocol(cfg.MetricsProtocol) { case processProtocolGRPC: exporter, err := otlpmetricgrpc.New(ctx) if err != nil { return nil, fmt.Errorf("otlp grpc metrics exporter: %w", err) } return exporter, nil default: exporter, err := otlpmetrichttp.New(ctx) if err != nil { return nil, fmt.Errorf("otlp http metrics exporter: %w", err) } return exporter, nil } } func normalizeProtocol(value string) string { switch strings.TrimSpace(value) { case processProtocolGRPC: return processProtocolGRPC default: return processProtocolHTTPProtobuf } } func normalizeContext(ctx context.Context) context.Context { if ctx == nil { return context.Background() } return ctx }