// Package telemetry provides shared OpenTelemetry runtime helpers and // low-cardinality user-service instruments. package telemetry import ( "context" "errors" "fmt" "io" "log/slog" "net/http" "os" "strings" "sync" "time" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc" "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp" "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp" otelprom "go.opentelemetry.io/otel/exporters/prometheus" "go.opentelemetry.io/otel/exporters/stdout/stdoutmetric" "go.opentelemetry.io/otel/exporters/stdout/stdouttrace" "go.opentelemetry.io/otel/metric" "go.opentelemetry.io/otel/propagation" sdkmetric "go.opentelemetry.io/otel/sdk/metric" "go.opentelemetry.io/otel/sdk/resource" sdktrace "go.opentelemetry.io/otel/sdk/trace" oteltrace "go.opentelemetry.io/otel/trace" ) const meterName = "galaxy/user" const ( defaultServiceName = "galaxy-user" processExporterNone = "none" processExporterOTLP = "otlp" processProtocolHTTPProtobuf = "http/protobuf" processProtocolGRPC = "grpc" ) // ProcessConfig configures the process-wide OpenTelemetry runtime. type ProcessConfig struct { // ServiceName overrides the default OpenTelemetry service name. ServiceName string // TracesExporter selects the external traces exporter. Supported values are // `none` and `otlp`. TracesExporter string // MetricsExporter selects the external metrics exporter. Supported values // are `none` and `otlp`. MetricsExporter string // TracesProtocol selects the OTLP traces protocol when TracesExporter is // `otlp`. TracesProtocol string // MetricsProtocol selects the OTLP metrics protocol when MetricsExporter is // `otlp`. MetricsProtocol string // StdoutTracesEnabled enables the additional stdout trace exporter used for // local development and debugging. StdoutTracesEnabled bool // StdoutMetricsEnabled enables the additional stdout metric exporter used // for local development and debugging. StdoutMetricsEnabled bool } // Validate reports whether cfg contains a supported OpenTelemetry exporter // configuration. func (cfg ProcessConfig) Validate() error { switch cfg.TracesExporter { case processExporterNone, processExporterOTLP: default: return fmt.Errorf("unsupported traces exporter %q", cfg.TracesExporter) } switch cfg.MetricsExporter { case processExporterNone, processExporterOTLP: default: return fmt.Errorf("unsupported metrics exporter %q", cfg.MetricsExporter) } if cfg.TracesProtocol != "" && cfg.TracesProtocol != processProtocolHTTPProtobuf && cfg.TracesProtocol != processProtocolGRPC { return fmt.Errorf("unsupported OTLP traces protocol %q", cfg.TracesProtocol) } if cfg.MetricsProtocol != "" && cfg.MetricsProtocol != processProtocolHTTPProtobuf && cfg.MetricsProtocol != processProtocolGRPC { return fmt.Errorf("unsupported OTLP metrics protocol %q", cfg.MetricsProtocol) } return nil } // Runtime owns the user-service OpenTelemetry providers, the Prometheus // metrics handler, and the custom low-cardinality instruments. type Runtime struct { tracerProvider oteltrace.TracerProvider meterProvider metric.MeterProvider promHandler http.Handler shutdownMu sync.Mutex shutdownDone bool shutdownErr error shutdownFns []func(context.Context) error internalHTTPRequests metric.Int64Counter internalHTTPDuration metric.Float64Histogram authResolutionOutcomes metric.Int64Counter userCreationOutcomes metric.Int64Counter userNameConflicts metric.Int64Counter entitlementMutations metric.Int64Counter sanctionMutations metric.Int64Counter limitMutations metric.Int64Counter lifecycleMutations metric.Int64Counter eventPublicationFailures metric.Int64Counter } // New constructs a lightweight telemetry runtime around meterProvider for // tests and embedded use cases that do not need process-level exporter wiring. func New(meterProvider metric.MeterProvider) (*Runtime, error) { return NewWithProviders(meterProvider, nil) } // NewWithProviders constructs a telemetry runtime around explicitly supplied // meterProvider and tracerProvider values. func NewWithProviders(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider) (*Runtime, error) { if meterProvider == nil { meterProvider = otel.GetMeterProvider() } if tracerProvider == nil { tracerProvider = otel.GetTracerProvider() } if meterProvider == nil { return nil, errors.New("new user telemetry runtime: nil meter provider") } if tracerProvider == nil { return nil, errors.New("new user telemetry runtime: nil tracer provider") } return buildRuntime(meterProvider, tracerProvider, http.NotFoundHandler(), nil) } // NewProcess constructs the process-wide user-service OpenTelemetry runtime // from cfg, installs the resulting providers globally, and returns the // runtime. func NewProcess(ctx context.Context, cfg ProcessConfig, logger *slog.Logger) (*Runtime, error) { return newProcess(ctx, cfg, logger, os.Stdout, os.Stdout) } // TracerProvider returns the runtime tracer provider. func (r *Runtime) TracerProvider() oteltrace.TracerProvider { if r == nil || r.tracerProvider == nil { return otel.GetTracerProvider() } return r.tracerProvider } // MeterProvider returns the runtime meter provider. func (r *Runtime) MeterProvider() metric.MeterProvider { if r == nil || r.meterProvider == nil { return otel.GetMeterProvider() } return r.meterProvider } // Handler returns the Prometheus handler that should be mounted on the admin // listener. func (r *Runtime) Handler() http.Handler { if r == nil || r.promHandler == nil { return http.NotFoundHandler() } return r.promHandler } // Shutdown flushes and stops the configured telemetry providers. Shutdown is // idempotent. func (r *Runtime) Shutdown(ctx context.Context) error { if r == nil { return nil } r.shutdownMu.Lock() if r.shutdownDone { err := r.shutdownErr r.shutdownMu.Unlock() return err } r.shutdownDone = true r.shutdownMu.Unlock() var shutdownErr error for index := len(r.shutdownFns) - 1; index >= 0; index-- { shutdownErr = errors.Join(shutdownErr, r.shutdownFns[index](ctx)) } r.shutdownMu.Lock() r.shutdownErr = shutdownErr r.shutdownMu.Unlock() return shutdownErr } // RecordInternalHTTPRequest records one internal HTTP request outcome. func (r *Runtime) RecordInternalHTTPRequest(ctx context.Context, attrs []attribute.KeyValue, duration time.Duration) { if r == nil { return } options := metric.WithAttributes(attrs...) r.internalHTTPRequests.Add(normalizeContext(ctx), 1, options) r.internalHTTPDuration.Record(normalizeContext(ctx), duration.Seconds()*1000, options) } // RecordAuthResolutionOutcome records one auth-facing resolution outcome. func (r *Runtime) RecordAuthResolutionOutcome(ctx context.Context, operation string, outcome string) { if r == nil { return } r.authResolutionOutcomes.Add( normalizeContext(ctx), 1, metric.WithAttributes( attribute.String("operation", strings.TrimSpace(operation)), attribute.String("outcome", strings.TrimSpace(outcome)), ), ) } // RecordUserCreationOutcome records one ensure-by-email coarse outcome. func (r *Runtime) RecordUserCreationOutcome(ctx context.Context, outcome string) { if r == nil { return } r.userCreationOutcomes.Add( normalizeContext(ctx), 1, metric.WithAttributes(attribute.String("outcome", strings.TrimSpace(outcome))), ) } // RecordUserNameConflict records one user-name generation conflict observed // during operation. func (r *Runtime) RecordUserNameConflict(ctx context.Context, operation string) { if r == nil { return } r.userNameConflicts.Add( normalizeContext(ctx), 1, metric.WithAttributes(attribute.String("operation", strings.TrimSpace(operation))), ) } // RecordEntitlementMutation records one entitlement command outcome. func (r *Runtime) RecordEntitlementMutation(ctx context.Context, command string, outcome string) { if r == nil { return } r.entitlementMutations.Add( normalizeContext(ctx), 1, metric.WithAttributes( attribute.String("command", strings.TrimSpace(command)), attribute.String("outcome", strings.TrimSpace(outcome)), ), ) } // RecordSanctionMutation records one sanction command outcome. func (r *Runtime) RecordSanctionMutation(ctx context.Context, command string, outcome string) { if r == nil { return } r.sanctionMutations.Add( normalizeContext(ctx), 1, metric.WithAttributes( attribute.String("command", strings.TrimSpace(command)), attribute.String("outcome", strings.TrimSpace(outcome)), ), ) } // RecordLimitMutation records one limit command outcome. func (r *Runtime) RecordLimitMutation(ctx context.Context, command string, outcome string) { if r == nil { return } r.limitMutations.Add( normalizeContext(ctx), 1, metric.WithAttributes( attribute.String("command", strings.TrimSpace(command)), attribute.String("outcome", strings.TrimSpace(outcome)), ), ) } // RecordUserLifecycleMutation records one trusted user-lifecycle command // outcome (currently `apply_permanent_block` and `delete`). func (r *Runtime) RecordUserLifecycleMutation(ctx context.Context, command string, outcome string) { if r == nil { return } r.lifecycleMutations.Add( normalizeContext(ctx), 1, metric.WithAttributes( attribute.String("command", strings.TrimSpace(command)), attribute.String("outcome", strings.TrimSpace(outcome)), ), ) } // RecordEventPublicationFailure records one post-commit auxiliary event // publication failure. func (r *Runtime) RecordEventPublicationFailure(ctx context.Context, eventType string) { if r == nil { return } r.eventPublicationFailures.Add( normalizeContext(ctx), 1, metric.WithAttributes(attribute.String("event_type", strings.TrimSpace(eventType))), ) } func newProcess(ctx context.Context, cfg ProcessConfig, logger *slog.Logger, stdoutTraceWriter io.Writer, stdoutMetricWriter io.Writer) (*Runtime, error) { if ctx == nil { return nil, errors.New("new user telemetry process: nil context") } if err := cfg.Validate(); err != nil { return nil, fmt.Errorf("new user telemetry process: %w", err) } if logger == nil { logger = slog.Default() } if strings.TrimSpace(cfg.ServiceName) == "" { cfg.ServiceName = defaultServiceName } res, err := resource.New( ctx, resource.WithAttributes(attribute.String("service.name", cfg.ServiceName)), ) if err != nil { return nil, fmt.Errorf("new user telemetry process: resource: %w", err) } tracerProvider, err := newTracerProvider(ctx, res, cfg, stdoutTraceWriter) if err != nil { return nil, fmt.Errorf("new user telemetry process: tracer provider: %w", err) } registry := prometheus.NewRegistry() prometheusExporter, err := otelprom.New(otelprom.WithRegisterer(registry)) if err != nil { return nil, fmt.Errorf("new user telemetry process: prometheus exporter: %w", err) } meterProvider, err := newMeterProvider(ctx, res, cfg, prometheusExporter, stdoutMetricWriter) if err != nil { return nil, fmt.Errorf("new user telemetry process: meter provider: %w", err) } otel.SetTracerProvider(tracerProvider) otel.SetMeterProvider(meterProvider) otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator( propagation.TraceContext{}, propagation.Baggage{}, )) runtime, err := buildRuntime( meterProvider, tracerProvider, promhttp.HandlerFor(registry, promhttp.HandlerOpts{}), []func(context.Context) error{ meterProvider.Shutdown, tracerProvider.Shutdown, }, ) if err != nil { return nil, fmt.Errorf("new user telemetry process: %w", err) } logger.InfoContext(ctx, "user telemetry configured", "service_name", cfg.ServiceName, "traces_exporter", cfg.TracesExporter, "metrics_exporter", cfg.MetricsExporter, "stdout_traces_enabled", cfg.StdoutTracesEnabled, "stdout_metrics_enabled", cfg.StdoutMetricsEnabled, ) return runtime, nil } func buildRuntime( meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider, promHandler http.Handler, shutdownFns []func(context.Context) error, ) (*Runtime, error) { meter := meterProvider.Meter(meterName) internalHTTPRequests, err := meter.Int64Counter("user.internal_http.requests") if err != nil { return nil, fmt.Errorf("build user telemetry runtime: internal_http.requests: %w", err) } internalHTTPDuration, err := meter.Float64Histogram("user.internal_http.duration", metric.WithUnit("ms")) if err != nil { return nil, fmt.Errorf("build user telemetry runtime: internal_http.duration: %w", err) } authResolutionOutcomes, err := meter.Int64Counter("user.auth_resolution.outcomes") if err != nil { return nil, fmt.Errorf("build user telemetry runtime: auth_resolution.outcomes: %w", err) } userCreationOutcomes, err := meter.Int64Counter("user.user_creation.outcomes") if err != nil { return nil, fmt.Errorf("build user telemetry runtime: user_creation.outcomes: %w", err) } userNameConflicts, err := meter.Int64Counter("user.user_name.conflicts") if err != nil { return nil, fmt.Errorf("build user telemetry runtime: user_name.conflicts: %w", err) } entitlementMutations, err := meter.Int64Counter("user.entitlement.mutations") if err != nil { return nil, fmt.Errorf("build user telemetry runtime: entitlement.mutations: %w", err) } sanctionMutations, err := meter.Int64Counter("user.sanction.mutations") if err != nil { return nil, fmt.Errorf("build user telemetry runtime: sanction.mutations: %w", err) } limitMutations, err := meter.Int64Counter("user.limit.mutations") if err != nil { return nil, fmt.Errorf("build user telemetry runtime: limit.mutations: %w", err) } lifecycleMutations, err := meter.Int64Counter("user.lifecycle.mutations") if err != nil { return nil, fmt.Errorf("build user telemetry runtime: lifecycle.mutations: %w", err) } eventPublicationFailures, err := meter.Int64Counter("user.event_publication_failures") if err != nil { return nil, fmt.Errorf("build user telemetry runtime: event_publication_failures: %w", err) } if promHandler == nil { promHandler = http.NotFoundHandler() } return &Runtime{ tracerProvider: tracerProvider, meterProvider: meterProvider, promHandler: promHandler, shutdownFns: shutdownFns, internalHTTPRequests: internalHTTPRequests, internalHTTPDuration: internalHTTPDuration, authResolutionOutcomes: authResolutionOutcomes, userCreationOutcomes: userCreationOutcomes, userNameConflicts: userNameConflicts, entitlementMutations: entitlementMutations, sanctionMutations: sanctionMutations, limitMutations: limitMutations, lifecycleMutations: lifecycleMutations, eventPublicationFailures: eventPublicationFailures, }, nil } func newTracerProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig, stdoutWriter io.Writer) (*sdktrace.TracerProvider, error) { options := []sdktrace.TracerProviderOption{sdktrace.WithResource(res)} if cfg.TracesExporter == processExporterOTLP { exporter, err := newOTLPTraceExporter(ctx, cfg.TracesProtocol) if err != nil { return nil, err } options = append(options, sdktrace.WithBatcher(exporter)) } if cfg.StdoutTracesEnabled { exporter, err := stdouttrace.New( stdouttrace.WithPrettyPrint(), stdouttrace.WithWriter(stdoutWriter), ) if err != nil { return nil, err } options = append(options, sdktrace.WithBatcher(exporter)) } return sdktrace.NewTracerProvider(options...), nil } func newMeterProvider( ctx context.Context, res *resource.Resource, cfg ProcessConfig, prometheusExporter sdkmetric.Reader, stdoutWriter io.Writer, ) (*sdkmetric.MeterProvider, error) { options := []sdkmetric.Option{ sdkmetric.WithResource(res), sdkmetric.WithReader(prometheusExporter), } if cfg.MetricsExporter == processExporterOTLP { exporter, err := newOTLPMetricExporter(ctx, cfg.MetricsProtocol) if err != nil { return nil, err } options = append(options, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter))) } if cfg.StdoutMetricsEnabled { exporter, err := stdoutmetric.New( stdoutmetric.WithPrettyPrint(), stdoutmetric.WithWriter(stdoutWriter), ) if err != nil { return nil, err } options = append(options, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter))) } return sdkmetric.NewMeterProvider(options...), nil } func newOTLPTraceExporter(ctx context.Context, protocol string) (sdktrace.SpanExporter, error) { switch protocol { case "", processProtocolHTTPProtobuf: return otlptracehttp.New(ctx) case processProtocolGRPC: return otlptracegrpc.New(ctx) default: return nil, fmt.Errorf("unsupported OTLP traces protocol %q", protocol) } } func newOTLPMetricExporter(ctx context.Context, protocol string) (sdkmetric.Exporter, error) { switch protocol { case "", processProtocolHTTPProtobuf: return otlpmetrichttp.New(ctx) case processProtocolGRPC: return otlpmetricgrpc.New(ctx) default: return nil, fmt.Errorf("unsupported OTLP metrics protocol %q", protocol) } } func normalizeContext(ctx context.Context) context.Context { if ctx == nil { return context.Background() } return ctx }