// Source: galaxy-game/rtmanager/internal/telemetry/runtime.go
// (snapshot 2026-04-28 20:39:18 +02:00; 652 lines, 21 KiB, Go)
// Package telemetry provides lightweight OpenTelemetry helpers and
// low-cardinality Runtime Manager instruments used by the runnable
// skeleton. Later stages emit into the instruments declared here without
// touching this package.
package telemetry
import (
"context"
"errors"
"fmt"
"log/slog"
"os"
"strings"
"sync"
"time"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp"
"go.opentelemetry.io/otel/exporters/stdout/stdoutmetric"
"go.opentelemetry.io/otel/exporters/stdout/stdouttrace"
"go.opentelemetry.io/otel/metric"
"go.opentelemetry.io/otel/propagation"
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
"go.opentelemetry.io/otel/sdk/resource"
sdktrace "go.opentelemetry.io/otel/sdk/trace"
oteltrace "go.opentelemetry.io/otel/trace"
)
// meterName is the instrumentation-scope name under which every custom
// Runtime Manager instrument in this package is created.
const meterName = "galaxy/rtmanager"
const (
	// defaultServiceName is used when ProcessConfig.ServiceName is blank.
	defaultServiceName = "galaxy-rtmanager"
	// Supported exporter selectors.
	processExporterNone = "none"
	processExporterOTLP = "otlp"
	// Supported OTLP transport protocols.
	processProtocolHTTPProtobuf = "http/protobuf"
	processProtocolGRPC         = "grpc"
)

// ProcessConfig configures the process-wide OpenTelemetry runtime.
type ProcessConfig struct {
	// ServiceName replaces the default OpenTelemetry service name when
	// non-blank.
	ServiceName string
	// TracesExporter picks the external traces exporter; `none` or `otlp`.
	TracesExporter string
	// MetricsExporter picks the external metrics exporter; `none` or
	// `otlp`.
	MetricsExporter string
	// TracesProtocol picks the OTLP transport for traces when
	// TracesExporter is `otlp`.
	TracesProtocol string
	// MetricsProtocol picks the OTLP transport for metrics when
	// MetricsExporter is `otlp`.
	MetricsProtocol string
	// StdoutTracesEnabled adds a stdout trace exporter for local
	// development and debugging.
	StdoutTracesEnabled bool
	// StdoutMetricsEnabled adds a stdout metric exporter for local
	// development and debugging.
	StdoutMetricsEnabled bool
}

// Validate reports whether cfg contains a supported OpenTelemetry exporter
// configuration. Exporter fields must be `none` or `otlp`; protocol fields
// may be empty (defaulted later) or one of the supported OTLP protocols.
func (cfg ProcessConfig) Validate() error {
	if cfg.TracesExporter != processExporterNone && cfg.TracesExporter != processExporterOTLP {
		return fmt.Errorf("unsupported traces exporter %q", cfg.TracesExporter)
	}
	if cfg.MetricsExporter != processExporterNone && cfg.MetricsExporter != processExporterOTLP {
		return fmt.Errorf("unsupported metrics exporter %q", cfg.MetricsExporter)
	}
	switch cfg.TracesProtocol {
	case "", processProtocolHTTPProtobuf, processProtocolGRPC:
	default:
		return fmt.Errorf("unsupported OTLP traces protocol %q", cfg.TracesProtocol)
	}
	switch cfg.MetricsProtocol {
	case "", processProtocolHTTPProtobuf, processProtocolGRPC:
	default:
		return fmt.Errorf("unsupported OTLP metrics protocol %q", cfg.MetricsProtocol)
	}
	return nil
}
// Runtime owns the Runtime Manager OpenTelemetry providers and the
// low-cardinality custom instruments listed in `rtmanager/README.md`
// §Observability.
type Runtime struct {
	// tracerProvider/meterProvider are the providers this runtime was
	// built around; accessors fall back to the otel globals when nil.
	tracerProvider oteltrace.TracerProvider
	meterProvider  metric.MeterProvider
	// meter is the meterName-scoped meter all instruments below were
	// created from; RegisterGauges also registers callbacks on it.
	meter metric.Meter
	// shutdownMu guards shutdownDone/shutdownErr so Shutdown stays
	// idempotent.
	shutdownMu   sync.Mutex
	shutdownDone bool
	shutdownErr  error
	// shutdownFns are provider shutdown hooks, invoked in reverse order
	// by Shutdown.
	shutdownFns []func(context.Context) error
	// Internal HTTP server instruments.
	internalHTTPRequests metric.Int64Counter
	internalHTTPDuration metric.Float64Histogram
	// Per-operation terminal outcome counters.
	startOutcomes   metric.Int64Counter
	stopOutcomes    metric.Int64Counter
	restartOutcomes metric.Int64Counter
	patchOutcomes   metric.Int64Counter
	cleanupOutcomes metric.Int64Counter
	// Event counters for health events, reconciler drift, and
	// notification intents.
	healthEvents        metric.Int64Counter
	reconcileDrift      metric.Int64Counter
	notificationIntents metric.Int64Counter
	// Latency histograms (milliseconds).
	dockerOpLatency     metric.Float64Histogram
	leaseAcquireLatency metric.Float64Histogram
	// runtimeRecordsByStatus is observed via the RegisterGauges callback.
	runtimeRecordsByStatus metric.Int64ObservableGauge
	// gaugeMu guards gaugeRegistration across RegisterGauges/Shutdown.
	gaugeMu           sync.Mutex
	gaugeRegistration metric.Registration
}
// NewWithProviders builds a telemetry runtime around caller-supplied
// meterProvider and tracerProvider values, substituting the otel globals
// for any nil argument.
func NewWithProviders(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider) (*Runtime, error) {
	if meterProvider == nil {
		meterProvider = otel.GetMeterProvider()
	}
	if tracerProvider == nil {
		tracerProvider = otel.GetTracerProvider()
	}
	// Defensive re-check in case the global fallbacks themselves were nil.
	switch {
	case meterProvider == nil:
		return nil, errors.New("new rtmanager telemetry runtime: nil meter provider")
	case tracerProvider == nil:
		return nil, errors.New("new rtmanager telemetry runtime: nil tracer provider")
	}
	return buildRuntime(meterProvider, tracerProvider, nil)
}
// NewProcess constructs the process-wide Runtime Manager OpenTelemetry
// runtime from cfg, installs the resulting providers globally, and
// returns the runtime.
//
// If a later construction step fails, any provider already started is
// shut down best-effort so a half-built process does not leak exporter
// goroutines, and the otel globals are installed only after construction
// fully succeeds (a failed call leaves the previous globals untouched).
func NewProcess(ctx context.Context, cfg ProcessConfig, logger *slog.Logger) (*Runtime, error) {
	if ctx == nil {
		return nil, errors.New("new rtmanager telemetry process: nil context")
	}
	if err := cfg.Validate(); err != nil {
		return nil, fmt.Errorf("new rtmanager telemetry process: %w", err)
	}
	if logger == nil {
		logger = slog.Default()
	}
	serviceName := strings.TrimSpace(cfg.ServiceName)
	if serviceName == "" {
		serviceName = defaultServiceName
	}
	res := resource.NewSchemaless(attribute.String("service.name", serviceName))
	tracerProvider, err := newTracerProvider(ctx, res, cfg)
	if err != nil {
		return nil, fmt.Errorf("new rtmanager telemetry process: tracer provider: %w", err)
	}
	meterProvider, err := newMeterProvider(ctx, res, cfg)
	if err != nil {
		// The tracer provider is already running; release it best-effort
		// instead of leaking its batcher goroutines.
		_ = tracerProvider.Shutdown(ctx)
		return nil, fmt.Errorf("new rtmanager telemetry process: meter provider: %w", err)
	}
	runtime, err := buildRuntime(meterProvider, tracerProvider, []func(context.Context) error{
		meterProvider.Shutdown,
		tracerProvider.Shutdown,
	})
	if err != nil {
		// Best-effort cleanup of both providers before reporting failure.
		_ = meterProvider.Shutdown(ctx)
		_ = tracerProvider.Shutdown(ctx)
		return nil, fmt.Errorf("new rtmanager telemetry process: runtime: %w", err)
	}
	otel.SetTracerProvider(tracerProvider)
	otel.SetMeterProvider(meterProvider)
	otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
		propagation.TraceContext{},
		propagation.Baggage{},
	))
	logger.Info("rtmanager telemetry configured",
		"service_name", serviceName,
		"traces_exporter", cfg.TracesExporter,
		"metrics_exporter", cfg.MetricsExporter,
	)
	return runtime, nil
}
// TracerProvider returns the tracer provider owned by the runtime, or the
// global otel provider when the runtime (or its provider) is absent.
func (runtime *Runtime) TracerProvider() oteltrace.TracerProvider {
	if runtime != nil && runtime.tracerProvider != nil {
		return runtime.tracerProvider
	}
	return otel.GetTracerProvider()
}
// MeterProvider returns the meter provider owned by the runtime, or the
// global otel provider when the runtime (or its provider) is absent.
func (runtime *Runtime) MeterProvider() metric.MeterProvider {
	if runtime != nil && runtime.meterProvider != nil {
		return runtime.meterProvider
	}
	return otel.GetMeterProvider()
}
// Shutdown flushes and stops the configured telemetry providers. Shutdown
// is idempotent; concurrent callers block until the first invocation
// completes and then observe the same final error.
func (runtime *Runtime) Shutdown(ctx context.Context) error {
	if runtime == nil {
		return nil
	}
	// Hold shutdownMu for the whole teardown. The previous
	// unlock/work/relock pattern let a concurrent Shutdown see
	// shutdownDone=true and return a nil shutdownErr while the first
	// call was still flushing providers.
	runtime.shutdownMu.Lock()
	defer runtime.shutdownMu.Unlock()
	if runtime.shutdownDone {
		return runtime.shutdownErr
	}
	runtime.shutdownDone = true
	// Drop the gauge callback first so no observation races provider
	// teardown.
	runtime.gaugeMu.Lock()
	if runtime.gaugeRegistration != nil {
		_ = runtime.gaugeRegistration.Unregister()
		runtime.gaugeRegistration = nil
	}
	runtime.gaugeMu.Unlock()
	// Stop providers in reverse registration order, collecting every
	// failure rather than aborting on the first one.
	var shutdownErr error
	for index := len(runtime.shutdownFns) - 1; index >= 0; index-- {
		shutdownErr = errors.Join(shutdownErr, runtime.shutdownFns[index](ctx))
	}
	runtime.shutdownErr = shutdownErr
	return shutdownErr
}
// RecordInternalHTTPRequest records one internal HTTP request outcome:
// the request counter is incremented and the duration histogram receives
// the latency in milliseconds, both tagged with attrs.
func (runtime *Runtime) RecordInternalHTTPRequest(ctx context.Context, attrs []attribute.KeyValue, duration time.Duration) {
	// Guard the instruments as well as the receiver, matching every other
	// Record* method; a zero-value Runtime must not panic here.
	if runtime == nil || runtime.internalHTTPRequests == nil || runtime.internalHTTPDuration == nil {
		return
	}
	ctx = normalizeContext(ctx)
	options := metric.WithAttributes(attrs...)
	runtime.internalHTTPRequests.Add(ctx, 1, options)
	runtime.internalHTTPDuration.Record(ctx, duration.Seconds()*1000, options)
}
// RecordStartOutcome counts one terminal outcome of the start operation.
// outcome is `success` or `failure`; errorCode is `replay_no_op` or one of
// the stable failure codes from `rtmanager/README.md` §Error Model;
// opSource is `lobby_stream`, `gm_rest`, or `admin_rest`.
func (runtime *Runtime) RecordStartOutcome(ctx context.Context, outcome, errorCode, opSource string) {
	if runtime == nil {
		return
	}
	counter := runtime.startOutcomes
	if counter == nil {
		return
	}
	labels := metric.WithAttributes(
		attribute.String("outcome", outcome),
		attribute.String("error_code", errorCode),
		attribute.String("op_source", opSource),
	)
	counter.Add(normalizeContext(ctx), 1, labels)
}
// RecordStopOutcome counts one terminal outcome of the stop operation.
// reason is the value carried on `runtime:stop_jobs` or the matching REST
// reason; opSource is `lobby_stream`, `gm_rest`, or `admin_rest`.
func (runtime *Runtime) RecordStopOutcome(ctx context.Context, outcome, reason, opSource string) {
	if runtime == nil {
		return
	}
	counter := runtime.stopOutcomes
	if counter == nil {
		return
	}
	counter.Add(normalizeContext(ctx), 1, metric.WithAttributes(
		attribute.String("outcome", outcome),
		attribute.String("reason", reason),
		attribute.String("op_source", opSource),
	))
}
// RecordRestartOutcome counts one terminal outcome of the restart
// operation, tagged with outcome and errorCode.
func (runtime *Runtime) RecordRestartOutcome(ctx context.Context, outcome, errorCode string) {
	if runtime == nil || runtime.restartOutcomes == nil {
		return
	}
	labels := []attribute.KeyValue{
		attribute.String("outcome", outcome),
		attribute.String("error_code", errorCode),
	}
	runtime.restartOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(labels...))
}
// RecordPatchOutcome counts one terminal outcome of the patch operation,
// tagged with outcome and errorCode.
func (runtime *Runtime) RecordPatchOutcome(ctx context.Context, outcome, errorCode string) {
	if runtime == nil || runtime.patchOutcomes == nil {
		return
	}
	labels := []attribute.KeyValue{
		attribute.String("outcome", outcome),
		attribute.String("error_code", errorCode),
	}
	runtime.patchOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(labels...))
}
// RecordCleanupOutcome counts one terminal outcome of the cleanup
// operation. opSource is `auto_ttl` for the periodic cleanup worker and
// `admin_rest` for explicit administrative removal.
func (runtime *Runtime) RecordCleanupOutcome(ctx context.Context, outcome, opSource string) {
	if runtime == nil {
		return
	}
	counter := runtime.cleanupOutcomes
	if counter == nil {
		return
	}
	counter.Add(normalizeContext(ctx), 1, metric.WithAttributes(
		attribute.String("outcome", outcome),
		attribute.String("op_source", opSource),
	))
}
// RecordHealthEvent counts one technical runtime event published on
// `runtime:health_events`. eventType comes from the frozen vocabulary in
// `rtmanager/README.md` §Async Stream Contracts.
func (runtime *Runtime) RecordHealthEvent(ctx context.Context, eventType string) {
	if runtime == nil || runtime.healthEvents == nil {
		return
	}
	label := attribute.String("event_type", eventType)
	runtime.healthEvents.Add(normalizeContext(ctx), 1, metric.WithAttributes(label))
}
// RecordReconcileDrift counts one drift outcome from the reconciler. kind
// is `adopt`, `dispose`, or `observed_exited`.
func (runtime *Runtime) RecordReconcileDrift(ctx context.Context, kind string) {
	if runtime == nil || runtime.reconcileDrift == nil {
		return
	}
	label := attribute.String("kind", kind)
	runtime.reconcileDrift.Add(normalizeContext(ctx), 1, metric.WithAttributes(label))
}
// RecordNotificationIntent counts one admin-only notification intent
// publish attempt. notificationType is `runtime.image_pull_failed`,
// `runtime.container_start_failed`, or `runtime.start_config_invalid`.
func (runtime *Runtime) RecordNotificationIntent(ctx context.Context, notificationType string) {
	if runtime == nil || runtime.notificationIntents == nil {
		return
	}
	label := attribute.String("notification_type", notificationType)
	runtime.notificationIntents.Add(normalizeContext(ctx), 1, metric.WithAttributes(label))
}
// RecordDockerOpLatency records the wall-clock duration of one Docker SDK
// call, in milliseconds. op is one of `pull`, `create`, `start`, `stop`,
// `rm`, `inspect`, `events`.
func (runtime *Runtime) RecordDockerOpLatency(ctx context.Context, op string, duration time.Duration) {
	if runtime == nil || runtime.dockerOpLatency == nil {
		return
	}
	millis := duration.Seconds() * 1000
	runtime.dockerOpLatency.Record(normalizeContext(ctx), millis, metric.WithAttributes(
		attribute.String("op", op),
	))
}
// RecordLeaseAcquireLatency records the wall-clock latency of one
// per-game Redis lease acquisition, in milliseconds.
func (runtime *Runtime) RecordLeaseAcquireLatency(ctx context.Context, duration time.Duration) {
	if runtime == nil {
		return
	}
	histogram := runtime.leaseAcquireLatency
	if histogram == nil {
		return
	}
	histogram.Record(normalizeContext(ctx), duration.Seconds()*1000)
}
// RuntimeRecordsByStatusProbe reports the number of runtime_records rows
// per status. The production probe wraps the runtime record store; tests
// may pass a stub.
type RuntimeRecordsByStatusProbe interface {
	// CountByStatus returns a status→row-count map; errors are treated as
	// non-fatal by the gauge callback (logged and skipped for that cycle).
	CountByStatus(ctx context.Context) (map[string]int, error)
}
// GaugeDependencies groups the collaborators required by RegisterGauges.
type GaugeDependencies struct {
	// RuntimeRecordsByStatus probes the per-status row count for
	// `rtmanager.runtime_records_by_status`. Required; RegisterGauges
	// rejects a nil probe.
	RuntimeRecordsByStatus RuntimeRecordsByStatusProbe
	// Logger records non-fatal probe errors. Defaults to slog.Default
	// when nil.
	Logger *slog.Logger
}
// RegisterGauges installs the observable-gauge callback that reports
// `rtmanager.runtime_records_by_status`. It may be called more than once;
// each call replaces the previous registration. Nothing from deps is
// retained beyond the callback closure.
//
// The wiring layer registers the gauge once the persistence adapters
// are constructed.
func (runtime *Runtime) RegisterGauges(deps GaugeDependencies) error {
	if runtime == nil {
		return errors.New("register rtmanager gauges: nil runtime")
	}
	probe := deps.RuntimeRecordsByStatus
	if probe == nil {
		return errors.New("register rtmanager gauges: nil runtime records probe")
	}
	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}
	runtime.gaugeMu.Lock()
	defer runtime.gaugeMu.Unlock()
	if previous := runtime.gaugeRegistration; previous != nil {
		_ = previous.Unregister()
		runtime.gaugeRegistration = nil
	}
	gauge := runtime.runtimeRecordsByStatus
	observe := func(ctx context.Context, observer metric.Observer) error {
		counts, err := probe.CountByStatus(ctx)
		if err != nil {
			// Probe failures are logged, never returned: one broken probe
			// must not abort the whole collection cycle.
			logger.WarnContext(ctx, "runtime records probe failed",
				"err", err.Error(),
			)
			return nil
		}
		for status, count := range counts {
			observer.ObserveInt64(gauge, int64(count), metric.WithAttributes(
				attribute.String("status", status),
			))
		}
		return nil
	}
	registration, err := runtime.meter.RegisterCallback(observe, gauge)
	if err != nil {
		return fmt.Errorf("register rtmanager gauges: %w", err)
	}
	runtime.gaugeRegistration = registration
	return nil
}
// buildRuntime assembles a Runtime around the given providers, creating
// every custom instrument from a meterName-scoped meter. shutdownFns is
// copied so the caller's slice stays independent.
func buildRuntime(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider, shutdownFns []func(context.Context) error) (*Runtime, error) {
	meter := meterProvider.Meter(meterName)
	fns := make([]func(context.Context) error, len(shutdownFns))
	copy(fns, shutdownFns)
	runtime := &Runtime{
		tracerProvider: tracerProvider,
		meterProvider:  meterProvider,
		meter:          meter,
		shutdownFns:    fns,
	}
	var err error
	if runtime.internalHTTPRequests, err = meter.Int64Counter("rtmanager.internal_http.requests"); err != nil {
		return nil, fmt.Errorf("build rtmanager telemetry runtime: internal_http.requests: %w", err)
	}
	if runtime.internalHTTPDuration, err = meter.Float64Histogram("rtmanager.internal_http.duration", metric.WithUnit("ms")); err != nil {
		return nil, fmt.Errorf("build rtmanager telemetry runtime: internal_http.duration: %w", err)
	}
	// Remaining instruments are created by the grouped helpers below.
	for _, register := range []func(metric.Meter, *Runtime) error{
		registerCounters,
		registerHistograms,
		registerObservableGauges,
	} {
		if err := register(meter, runtime); err != nil {
			return nil, err
		}
	}
	return runtime, nil
}
// registerCounters creates every low-cardinality outcome/event counter
// and stores it into the matching Runtime field.
func registerCounters(meter metric.Meter, runtime *Runtime) error {
	type counterSpec struct {
		name   string
		target *metric.Int64Counter
	}
	for _, spec := range []counterSpec{
		{"rtmanager.start_outcomes", &runtime.startOutcomes},
		{"rtmanager.stop_outcomes", &runtime.stopOutcomes},
		{"rtmanager.restart_outcomes", &runtime.restartOutcomes},
		{"rtmanager.patch_outcomes", &runtime.patchOutcomes},
		{"rtmanager.cleanup_outcomes", &runtime.cleanupOutcomes},
		{"rtmanager.health_events", &runtime.healthEvents},
		{"rtmanager.reconcile_drift", &runtime.reconcileDrift},
		{"rtmanager.notification_intents", &runtime.notificationIntents},
	} {
		counter, err := meter.Int64Counter(spec.name)
		if err != nil {
			return fmt.Errorf("build rtmanager telemetry runtime: %s: %w", spec.name, err)
		}
		*spec.target = counter
	}
	return nil
}
// registerHistograms creates every latency histogram (unit: ms) and
// stores it into the matching Runtime field.
func registerHistograms(meter metric.Meter, runtime *Runtime) error {
	type histogramSpec struct {
		name   string
		unit   string
		target *metric.Float64Histogram
	}
	for _, spec := range []histogramSpec{
		{"rtmanager.docker_op_latency", "ms", &runtime.dockerOpLatency},
		{"rtmanager.lease_acquire_latency", "ms", &runtime.leaseAcquireLatency},
	} {
		var options []metric.Float64HistogramOption
		if spec.unit != "" {
			options = append(options, metric.WithUnit(spec.unit))
		}
		histogram, err := meter.Float64Histogram(spec.name, options...)
		if err != nil {
			return fmt.Errorf("build rtmanager telemetry runtime: %s: %w", spec.name, err)
		}
		*spec.target = histogram
	}
	return nil
}
// registerObservableGauges creates the runtime-records observable gauge;
// its callback is attached later via RegisterGauges.
func registerObservableGauges(meter metric.Meter, runtime *Runtime) error {
	var err error
	runtime.runtimeRecordsByStatus, err = meter.Int64ObservableGauge("rtmanager.runtime_records_by_status")
	if err != nil {
		return fmt.Errorf("build rtmanager telemetry runtime: runtime_records_by_status: %w", err)
	}
	return nil
}
// newTracerProvider builds the SDK tracer provider for cfg, batching an
// OTLP exporter when configured and optionally mirroring spans to stdout.
func newTracerProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig) (*sdktrace.TracerProvider, error) {
	options := []sdktrace.TracerProviderOption{sdktrace.WithResource(res)}
	exporter, err := traceExporter(ctx, cfg)
	if err != nil {
		return nil, err
	}
	if exporter != nil {
		options = append(options, sdktrace.WithBatcher(exporter))
	}
	if cfg.StdoutTracesEnabled {
		debugExporter, err := stdouttrace.New(stdouttrace.WithWriter(os.Stdout))
		if err != nil {
			return nil, fmt.Errorf("stdout traces exporter: %w", err)
		}
		options = append(options, sdktrace.WithBatcher(debugExporter))
	}
	return sdktrace.NewTracerProvider(options...), nil
}
// newMeterProvider builds the SDK meter provider for cfg, attaching a
// periodic reader per configured exporter (OTLP and/or stdout).
func newMeterProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig) (*sdkmetric.MeterProvider, error) {
	options := []sdkmetric.Option{sdkmetric.WithResource(res)}
	exporter, err := metricExporter(ctx, cfg)
	if err != nil {
		return nil, err
	}
	if exporter != nil {
		options = append(options, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter)))
	}
	if cfg.StdoutMetricsEnabled {
		debugExporter, err := stdoutmetric.New(stdoutmetric.WithWriter(os.Stdout))
		if err != nil {
			return nil, fmt.Errorf("stdout metrics exporter: %w", err)
		}
		options = append(options, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(debugExporter)))
	}
	return sdkmetric.NewMeterProvider(options...), nil
}
// traceExporter returns the external OTLP span exporter selected by cfg,
// or (nil, nil) when no external traces exporter is configured. The
// protocol defaults to http/protobuf.
func traceExporter(ctx context.Context, cfg ProcessConfig) (sdktrace.SpanExporter, error) {
	if cfg.TracesExporter != processExporterOTLP {
		return nil, nil
	}
	if normalizeProtocol(cfg.TracesProtocol) == processProtocolGRPC {
		exporter, err := otlptracegrpc.New(ctx)
		if err != nil {
			return nil, fmt.Errorf("otlp grpc traces exporter: %w", err)
		}
		return exporter, nil
	}
	exporter, err := otlptracehttp.New(ctx)
	if err != nil {
		return nil, fmt.Errorf("otlp http traces exporter: %w", err)
	}
	return exporter, nil
}
// metricExporter returns the external OTLP metric exporter selected by
// cfg, or (nil, nil) when no external metrics exporter is configured. The
// protocol defaults to http/protobuf.
func metricExporter(ctx context.Context, cfg ProcessConfig) (sdkmetric.Exporter, error) {
	if cfg.MetricsExporter != processExporterOTLP {
		return nil, nil
	}
	if normalizeProtocol(cfg.MetricsProtocol) == processProtocolGRPC {
		exporter, err := otlpmetricgrpc.New(ctx)
		if err != nil {
			return nil, fmt.Errorf("otlp grpc metrics exporter: %w", err)
		}
		return exporter, nil
	}
	exporter, err := otlpmetrichttp.New(ctx)
	if err != nil {
		return nil, fmt.Errorf("otlp http metrics exporter: %w", err)
	}
	return exporter, nil
}
// normalizeProtocol maps a raw protocol selector to a supported OTLP
// protocol: a trimmed `grpc` stays grpc, everything else (including the
// empty string) falls back to http/protobuf.
func normalizeProtocol(value string) string {
	if strings.TrimSpace(value) == processProtocolGRPC {
		return processProtocolGRPC
	}
	return processProtocolHTTPProtobuf
}
func normalizeContext(ctx context.Context) context.Context {
if ctx == nil {
return context.Background()
}
return ctx
}