// Package telemetry provides lightweight OpenTelemetry helpers and // low-cardinality Game Master instruments used by the runnable skeleton. // Later stages emit into the instruments declared here without touching // this package. package telemetry import ( "context" "errors" "fmt" "log/slog" "os" "strings" "sync" "time" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc" "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp" "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp" "go.opentelemetry.io/otel/exporters/stdout/stdoutmetric" "go.opentelemetry.io/otel/exporters/stdout/stdouttrace" "go.opentelemetry.io/otel/metric" "go.opentelemetry.io/otel/propagation" sdkmetric "go.opentelemetry.io/otel/sdk/metric" "go.opentelemetry.io/otel/sdk/resource" sdktrace "go.opentelemetry.io/otel/sdk/trace" oteltrace "go.opentelemetry.io/otel/trace" ) const meterName = "galaxy/gamemaster" const ( defaultServiceName = "galaxy-gamemaster" processExporterNone = "none" processExporterOTLP = "otlp" processProtocolHTTPProtobuf = "http/protobuf" processProtocolGRPC = "grpc" ) // ProcessConfig configures the process-wide OpenTelemetry runtime. type ProcessConfig struct { // ServiceName overrides the default OpenTelemetry service name. ServiceName string // TracesExporter selects the external traces exporter. Supported values // are `none` and `otlp`. TracesExporter string // MetricsExporter selects the external metrics exporter. Supported // values are `none` and `otlp`. MetricsExporter string // TracesProtocol selects the OTLP traces protocol when TracesExporter is // `otlp`. TracesProtocol string // MetricsProtocol selects the OTLP metrics protocol when // MetricsExporter is `otlp`. MetricsProtocol string // StdoutTracesEnabled enables the additional stdout trace exporter used // for local development and debugging. StdoutTracesEnabled bool // StdoutMetricsEnabled enables the additional stdout metric exporter // used for local development and debugging. StdoutMetricsEnabled bool } // Validate reports whether cfg contains a supported OpenTelemetry exporter // configuration. func (cfg ProcessConfig) Validate() error { switch cfg.TracesExporter { case processExporterNone, processExporterOTLP: default: return fmt.Errorf("unsupported traces exporter %q", cfg.TracesExporter) } switch cfg.MetricsExporter { case processExporterNone, processExporterOTLP: default: return fmt.Errorf("unsupported metrics exporter %q", cfg.MetricsExporter) } if cfg.TracesProtocol != "" && cfg.TracesProtocol != processProtocolHTTPProtobuf && cfg.TracesProtocol != processProtocolGRPC { return fmt.Errorf("unsupported OTLP traces protocol %q", cfg.TracesProtocol) } if cfg.MetricsProtocol != "" && cfg.MetricsProtocol != processProtocolHTTPProtobuf && cfg.MetricsProtocol != processProtocolGRPC { return fmt.Errorf("unsupported OTLP metrics protocol %q", cfg.MetricsProtocol) } return nil } // Runtime owns the Game Master OpenTelemetry providers and the // low-cardinality custom instruments listed in `gamemaster/README.md` // §Observability. type Runtime struct { tracerProvider oteltrace.TracerProvider meterProvider metric.MeterProvider meter metric.Meter shutdownMu sync.Mutex shutdownDone bool shutdownErr error shutdownFns []func(context.Context) error internalHTTPRequests metric.Int64Counter internalHTTPDuration metric.Float64Histogram registerRuntimeOutcomes metric.Int64Counter turnGenerationOutcomes metric.Int64Counter commandExecuteOutcomes metric.Int64Counter orderPutOutcomes metric.Int64Counter reportGetOutcomes metric.Int64Counter banishOutcomes metric.Int64Counter healthEventsConsumed metric.Int64Counter lobbyEventsPublished metric.Int64Counter notificationPublishAttempts metric.Int64Counter membershipCacheHits metric.Int64Counter engineCallLatency metric.Float64Histogram runtimeRecordsByStatus metric.Int64ObservableGauge schedulerDueGames metric.Int64ObservableGauge engineVersionsTotal metric.Int64ObservableGauge gaugeMu sync.Mutex gaugeRegistration metric.Registration } // NewWithProviders constructs a telemetry runtime around explicitly supplied // meterProvider and tracerProvider values. func NewWithProviders(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider) (*Runtime, error) { if meterProvider == nil { meterProvider = otel.GetMeterProvider() } if tracerProvider == nil { tracerProvider = otel.GetTracerProvider() } if meterProvider == nil { return nil, errors.New("new gamemaster telemetry runtime: nil meter provider") } if tracerProvider == nil { return nil, errors.New("new gamemaster telemetry runtime: nil tracer provider") } return buildRuntime(meterProvider, tracerProvider, nil) } // NewProcess constructs the process-wide Game Master OpenTelemetry runtime // from cfg, installs the resulting providers globally, and returns the // runtime. func NewProcess(ctx context.Context, cfg ProcessConfig, logger *slog.Logger) (*Runtime, error) { if ctx == nil { return nil, errors.New("new gamemaster telemetry process: nil context") } if err := cfg.Validate(); err != nil { return nil, fmt.Errorf("new gamemaster telemetry process: %w", err) } if logger == nil { logger = slog.Default() } serviceName := strings.TrimSpace(cfg.ServiceName) if serviceName == "" { serviceName = defaultServiceName } res := resource.NewSchemaless(attribute.String("service.name", serviceName)) tracerProvider, err := newTracerProvider(ctx, res, cfg) if err != nil { return nil, fmt.Errorf("new gamemaster telemetry process: tracer provider: %w", err) } meterProvider, err := newMeterProvider(ctx, res, cfg) if err != nil { return nil, fmt.Errorf("new gamemaster telemetry process: meter provider: %w", err) } otel.SetTracerProvider(tracerProvider) otel.SetMeterProvider(meterProvider) otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator( propagation.TraceContext{}, propagation.Baggage{}, )) runtime, err := buildRuntime(meterProvider, tracerProvider, []func(context.Context) error{ meterProvider.Shutdown, tracerProvider.Shutdown, }) if err != nil { return nil, fmt.Errorf("new gamemaster telemetry process: runtime: %w", err) } logger.Info("gamemaster telemetry configured", "service_name", serviceName, "traces_exporter", cfg.TracesExporter, "metrics_exporter", cfg.MetricsExporter, ) return runtime, nil } // TracerProvider returns the runtime tracer provider. func (runtime *Runtime) TracerProvider() oteltrace.TracerProvider { if runtime == nil || runtime.tracerProvider == nil { return otel.GetTracerProvider() } return runtime.tracerProvider } // MeterProvider returns the runtime meter provider. func (runtime *Runtime) MeterProvider() metric.MeterProvider { if runtime == nil || runtime.meterProvider == nil { return otel.GetMeterProvider() } return runtime.meterProvider } // Shutdown flushes and stops the configured telemetry providers. Shutdown // is idempotent. func (runtime *Runtime) Shutdown(ctx context.Context) error { if runtime == nil { return nil } runtime.shutdownMu.Lock() if runtime.shutdownDone { err := runtime.shutdownErr runtime.shutdownMu.Unlock() return err } runtime.shutdownDone = true runtime.shutdownMu.Unlock() runtime.gaugeMu.Lock() if runtime.gaugeRegistration != nil { _ = runtime.gaugeRegistration.Unregister() runtime.gaugeRegistration = nil } runtime.gaugeMu.Unlock() var shutdownErr error for index := len(runtime.shutdownFns) - 1; index >= 0; index-- { shutdownErr = errors.Join(shutdownErr, runtime.shutdownFns[index](ctx)) } runtime.shutdownMu.Lock() runtime.shutdownErr = shutdownErr runtime.shutdownMu.Unlock() return shutdownErr } // RecordInternalHTTPRequest records one internal HTTP request outcome. func (runtime *Runtime) RecordInternalHTTPRequest(ctx context.Context, attrs []attribute.KeyValue, duration time.Duration) { if runtime == nil { return } options := metric.WithAttributes(attrs...) runtime.internalHTTPRequests.Add(normalizeContext(ctx), 1, options) runtime.internalHTTPDuration.Record(normalizeContext(ctx), duration.Seconds()*1000, options) } // RecordRegisterRuntimeOutcome records one terminal outcome of the // register-runtime operation. func (runtime *Runtime) RecordRegisterRuntimeOutcome(ctx context.Context, outcome, errorCode string) { if runtime == nil || runtime.registerRuntimeOutcomes == nil { return } runtime.registerRuntimeOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes( attribute.String("outcome", outcome), attribute.String("error_code", errorCode), )) } // RecordTurnGenerationOutcome records one terminal outcome of a turn // generation. trigger is `scheduler` or `force`. func (runtime *Runtime) RecordTurnGenerationOutcome(ctx context.Context, outcome, errorCode, trigger string) { if runtime == nil || runtime.turnGenerationOutcomes == nil { return } runtime.turnGenerationOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes( attribute.String("outcome", outcome), attribute.String("error_code", errorCode), attribute.String("trigger", trigger), )) } // RecordCommandExecuteOutcome records one terminal outcome of a command // execute call. func (runtime *Runtime) RecordCommandExecuteOutcome(ctx context.Context, outcome, errorCode string) { if runtime == nil || runtime.commandExecuteOutcomes == nil { return } runtime.commandExecuteOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes( attribute.String("outcome", outcome), attribute.String("error_code", errorCode), )) } // RecordOrderPutOutcome records one terminal outcome of an order put call. func (runtime *Runtime) RecordOrderPutOutcome(ctx context.Context, outcome, errorCode string) { if runtime == nil || runtime.orderPutOutcomes == nil { return } runtime.orderPutOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes( attribute.String("outcome", outcome), attribute.String("error_code", errorCode), )) } // RecordReportGetOutcome records one terminal outcome of a report get // call. func (runtime *Runtime) RecordReportGetOutcome(ctx context.Context, outcome, errorCode string) { if runtime == nil || runtime.reportGetOutcomes == nil { return } runtime.reportGetOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes( attribute.String("outcome", outcome), attribute.String("error_code", errorCode), )) } // RecordBanishOutcome records one terminal outcome of a banish call. func (runtime *Runtime) RecordBanishOutcome(ctx context.Context, outcome, errorCode string) { if runtime == nil || runtime.banishOutcomes == nil { return } runtime.banishOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes( attribute.String("outcome", outcome), attribute.String("error_code", errorCode), )) } // RecordHealthEventConsumed records one consumed `runtime:health_events` // entry. func (runtime *Runtime) RecordHealthEventConsumed(ctx context.Context) { if runtime == nil || runtime.healthEventsConsumed == nil { return } runtime.healthEventsConsumed.Add(normalizeContext(ctx), 1) } // RecordLobbyEventPublished records one publication on `gm:lobby_events`. // eventType is `runtime_snapshot_update` or `game_finished`. func (runtime *Runtime) RecordLobbyEventPublished(ctx context.Context, eventType string) { if runtime == nil || runtime.lobbyEventsPublished == nil { return } runtime.lobbyEventsPublished.Add(normalizeContext(ctx), 1, metric.WithAttributes( attribute.String("event_type", eventType), )) } // RecordNotificationPublishAttempt records one publication attempt to // `notification:intents`. result is `ok` or `error`. func (runtime *Runtime) RecordNotificationPublishAttempt(ctx context.Context, notificationType, result string) { if runtime == nil || runtime.notificationPublishAttempts == nil { return } runtime.notificationPublishAttempts.Add(normalizeContext(ctx), 1, metric.WithAttributes( attribute.String("notification_type", notificationType), attribute.String("result", result), )) } // RecordMembershipCacheResult records one membership cache lookup outcome. // result is `hit`, `miss`, or `invalidate`. func (runtime *Runtime) RecordMembershipCacheResult(ctx context.Context, result string) { if runtime == nil || runtime.membershipCacheHits == nil { return } runtime.membershipCacheHits.Add(normalizeContext(ctx), 1, metric.WithAttributes( attribute.String("result", result), )) } // RecordEngineCall records the wall-clock duration of one engine HTTP // call. op is one of `init`, `status`, `turn`, `banish`, `command`, // `order`, `report`. func (runtime *Runtime) RecordEngineCall(ctx context.Context, op string, duration time.Duration) { if runtime == nil || runtime.engineCallLatency == nil { return } runtime.engineCallLatency.Record(normalizeContext(ctx), duration.Seconds()*1000, metric.WithAttributes( attribute.String("op", op), )) } // RuntimeRecordsByStatusProbe reports the number of `runtime_records` // rows per status. The production probe wraps the runtime record store; // tests may pass a stub. type RuntimeRecordsByStatusProbe interface { CountByStatus(ctx context.Context) (map[string]int, error) } // SchedulerDueGamesProbe reports how many runtime records are currently // due for a scheduler-driven turn generation. type SchedulerDueGamesProbe interface { CountDue(ctx context.Context) (int, error) } // EngineVersionsTotalProbe reports how many engine_versions rows are // registered. type EngineVersionsTotalProbe interface { CountVersions(ctx context.Context) (int, error) } // GaugeDependencies groups the collaborators required by RegisterGauges. type GaugeDependencies struct { // RuntimeRecordsByStatus probes the per-status row count for // `gamemaster.runtime_records_by_status`. RuntimeRecordsByStatus RuntimeRecordsByStatusProbe // SchedulerDueGames probes the due-now count for // `gamemaster.scheduler.due_games`. SchedulerDueGames SchedulerDueGamesProbe // EngineVersionsTotal probes the engine_versions row count for // `gamemaster.engine_versions_total`. EngineVersionsTotal EngineVersionsTotalProbe // Logger records non-fatal probe errors. Defaults to slog.Default // when nil. Logger *slog.Logger } // RegisterGauges installs the observable-gauge callback that reports // `gamemaster.runtime_records_by_status`, // `gamemaster.scheduler.due_games`, and // `gamemaster.engine_versions_total`. It is safe to call once per // Runtime; a second call replaces the previous registration. The runtime // keeps no strong reference to deps beyond the callback closure. // // The wiring layer registers the gauges once the persistence adapters // and scheduler probe are constructed. func (runtime *Runtime) RegisterGauges(deps GaugeDependencies) error { if runtime == nil { return errors.New("register gamemaster gauges: nil runtime") } if deps.RuntimeRecordsByStatus == nil { return errors.New("register gamemaster gauges: nil runtime records probe") } if deps.SchedulerDueGames == nil { return errors.New("register gamemaster gauges: nil scheduler probe") } if deps.EngineVersionsTotal == nil { return errors.New("register gamemaster gauges: nil engine versions probe") } logger := deps.Logger if logger == nil { logger = slog.Default() } runtime.gaugeMu.Lock() defer runtime.gaugeMu.Unlock() if runtime.gaugeRegistration != nil { _ = runtime.gaugeRegistration.Unregister() runtime.gaugeRegistration = nil } callback := func(ctx context.Context, observer metric.Observer) error { if counts, err := deps.RuntimeRecordsByStatus.CountByStatus(ctx); err != nil { logger.WarnContext(ctx, "runtime records probe failed", "err", err.Error(), ) } else { for status, count := range counts { observer.ObserveInt64(runtime.runtimeRecordsByStatus, int64(count), metric.WithAttributes( attribute.String("status", status), )) } } if due, err := deps.SchedulerDueGames.CountDue(ctx); err != nil { logger.WarnContext(ctx, "scheduler due games probe failed", "err", err.Error(), ) } else { observer.ObserveInt64(runtime.schedulerDueGames, int64(due)) } if versions, err := deps.EngineVersionsTotal.CountVersions(ctx); err != nil { logger.WarnContext(ctx, "engine versions probe failed", "err", err.Error(), ) } else { observer.ObserveInt64(runtime.engineVersionsTotal, int64(versions)) } return nil } registration, err := runtime.meter.RegisterCallback(callback, runtime.runtimeRecordsByStatus, runtime.schedulerDueGames, runtime.engineVersionsTotal, ) if err != nil { return fmt.Errorf("register gamemaster gauges: %w", err) } runtime.gaugeRegistration = registration return nil } func buildRuntime(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider, shutdownFns []func(context.Context) error) (*Runtime, error) { meter := meterProvider.Meter(meterName) runtime := &Runtime{ tracerProvider: tracerProvider, meterProvider: meterProvider, meter: meter, shutdownFns: append([]func(context.Context) error(nil), shutdownFns...), } internalHTTPRequests, err := meter.Int64Counter("gamemaster.internal_http.requests") if err != nil { return nil, fmt.Errorf("build gamemaster telemetry runtime: internal_http.requests: %w", err) } internalHTTPDuration, err := meter.Float64Histogram("gamemaster.internal_http.duration", metric.WithUnit("ms")) if err != nil { return nil, fmt.Errorf("build gamemaster telemetry runtime: internal_http.duration: %w", err) } runtime.internalHTTPRequests = internalHTTPRequests runtime.internalHTTPDuration = internalHTTPDuration if err := registerCounters(meter, runtime); err != nil { return nil, err } if err := registerHistograms(meter, runtime); err != nil { return nil, err } if err := registerObservableGauges(meter, runtime); err != nil { return nil, err } return runtime, nil } func registerCounters(meter metric.Meter, runtime *Runtime) error { specs := []struct { name string target *metric.Int64Counter }{ {"gamemaster.register_runtime.outcomes", &runtime.registerRuntimeOutcomes}, {"gamemaster.turn_generation.outcomes", &runtime.turnGenerationOutcomes}, {"gamemaster.command_execute.outcomes", &runtime.commandExecuteOutcomes}, {"gamemaster.order_put.outcomes", &runtime.orderPutOutcomes}, {"gamemaster.report_get.outcomes", &runtime.reportGetOutcomes}, {"gamemaster.banish.outcomes", &runtime.banishOutcomes}, {"gamemaster.health_events.consumed", &runtime.healthEventsConsumed}, {"gamemaster.lobby_events.published", &runtime.lobbyEventsPublished}, {"gamemaster.notification.publish_attempts", &runtime.notificationPublishAttempts}, {"gamemaster.membership_cache.hits", &runtime.membershipCacheHits}, } for _, spec := range specs { counter, err := meter.Int64Counter(spec.name) if err != nil { return fmt.Errorf("build gamemaster telemetry runtime: %s: %w", spec.name, err) } *spec.target = counter } return nil } func registerHistograms(meter metric.Meter, runtime *Runtime) error { specs := []struct { name string unit string target *metric.Float64Histogram }{ {"gamemaster.engine_call.latency", "ms", &runtime.engineCallLatency}, } for _, spec := range specs { options := []metric.Float64HistogramOption{} if spec.unit != "" { options = append(options, metric.WithUnit(spec.unit)) } histogram, err := meter.Float64Histogram(spec.name, options...) if err != nil { return fmt.Errorf("build gamemaster telemetry runtime: %s: %w", spec.name, err) } *spec.target = histogram } return nil } func registerObservableGauges(meter metric.Meter, runtime *Runtime) error { gauge, err := meter.Int64ObservableGauge("gamemaster.runtime_records_by_status") if err != nil { return fmt.Errorf("build gamemaster telemetry runtime: runtime_records_by_status: %w", err) } runtime.runtimeRecordsByStatus = gauge due, err := meter.Int64ObservableGauge("gamemaster.scheduler.due_games") if err != nil { return fmt.Errorf("build gamemaster telemetry runtime: scheduler.due_games: %w", err) } runtime.schedulerDueGames = due versions, err := meter.Int64ObservableGauge("gamemaster.engine_versions_total") if err != nil { return fmt.Errorf("build gamemaster telemetry runtime: engine_versions_total: %w", err) } runtime.engineVersionsTotal = versions return nil } func newTracerProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig) (*sdktrace.TracerProvider, error) { options := []sdktrace.TracerProviderOption{ sdktrace.WithResource(res), } if exporter, err := traceExporter(ctx, cfg); err != nil { return nil, err } else if exporter != nil { options = append(options, sdktrace.WithBatcher(exporter)) } if cfg.StdoutTracesEnabled { exporter, err := stdouttrace.New(stdouttrace.WithWriter(os.Stdout)) if err != nil { return nil, fmt.Errorf("stdout traces exporter: %w", err) } options = append(options, sdktrace.WithBatcher(exporter)) } return sdktrace.NewTracerProvider(options...), nil } func newMeterProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig) (*sdkmetric.MeterProvider, error) { options := []sdkmetric.Option{ sdkmetric.WithResource(res), } if exporter, err := metricExporter(ctx, cfg); err != nil { return nil, err } else if exporter != nil { options = append(options, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter))) } if cfg.StdoutMetricsEnabled { exporter, err := stdoutmetric.New(stdoutmetric.WithWriter(os.Stdout)) if err != nil { return nil, fmt.Errorf("stdout metrics exporter: %w", err) } options = append(options, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter))) } return sdkmetric.NewMeterProvider(options...), nil } func traceExporter(ctx context.Context, cfg ProcessConfig) (sdktrace.SpanExporter, error) { if cfg.TracesExporter != processExporterOTLP { return nil, nil } switch normalizeProtocol(cfg.TracesProtocol) { case processProtocolGRPC: exporter, err := otlptracegrpc.New(ctx) if err != nil { return nil, fmt.Errorf("otlp grpc traces exporter: %w", err) } return exporter, nil default: exporter, err := otlptracehttp.New(ctx) if err != nil { return nil, fmt.Errorf("otlp http traces exporter: %w", err) } return exporter, nil } } func metricExporter(ctx context.Context, cfg ProcessConfig) (sdkmetric.Exporter, error) { if cfg.MetricsExporter != processExporterOTLP { return nil, nil } switch normalizeProtocol(cfg.MetricsProtocol) { case processProtocolGRPC: exporter, err := otlpmetricgrpc.New(ctx) if err != nil { return nil, fmt.Errorf("otlp grpc metrics exporter: %w", err) } return exporter, nil default: exporter, err := otlpmetrichttp.New(ctx) if err != nil { return nil, fmt.Errorf("otlp http metrics exporter: %w", err) } return exporter, nil } } func normalizeProtocol(value string) string { switch strings.TrimSpace(value) { case processProtocolGRPC: return processProtocolGRPC default: return processProtocolHTTPProtobuf } } func normalizeContext(ctx context.Context) context.Context { if ctx == nil { return context.Background() } return ctx }