// Package telemetry provides lightweight OpenTelemetry helpers and // low-cardinality Notification Service instruments. package telemetry import ( "context" "errors" "fmt" "log/slog" "os" "strings" "sync" "time" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc" "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp" "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp" "go.opentelemetry.io/otel/exporters/stdout/stdoutmetric" "go.opentelemetry.io/otel/exporters/stdout/stdouttrace" "go.opentelemetry.io/otel/metric" "go.opentelemetry.io/otel/propagation" sdkmetric "go.opentelemetry.io/otel/sdk/metric" "go.opentelemetry.io/otel/sdk/resource" sdktrace "go.opentelemetry.io/otel/sdk/trace" oteltrace "go.opentelemetry.io/otel/trace" ) const meterName = "galaxy/notification" const ( defaultServiceName = "galaxy-notification" processExporterNone = "none" processExporterOTLP = "otlp" processProtocolHTTPProtobuf = "http/protobuf" processProtocolGRPC = "grpc" ) // ProcessConfig configures the process-wide OpenTelemetry runtime. type ProcessConfig struct { // ServiceName overrides the default OpenTelemetry service name. ServiceName string // TracesExporter selects the external traces exporter. Supported values are // `none` and `otlp`. TracesExporter string // MetricsExporter selects the external metrics exporter. Supported values // are `none` and `otlp`. MetricsExporter string // TracesProtocol selects the OTLP traces protocol when TracesExporter is // `otlp`. TracesProtocol string // MetricsProtocol selects the OTLP metrics protocol when MetricsExporter is // `otlp`. MetricsProtocol string // StdoutTracesEnabled enables the additional stdout trace exporter used for // local development and debugging. StdoutTracesEnabled bool // StdoutMetricsEnabled enables the additional stdout metric exporter used // for local development and debugging. StdoutMetricsEnabled bool } // Validate reports whether cfg contains a supported OpenTelemetry exporter // configuration. func (cfg ProcessConfig) Validate() error { switch cfg.TracesExporter { case processExporterNone, processExporterOTLP: default: return fmt.Errorf("unsupported traces exporter %q", cfg.TracesExporter) } switch cfg.MetricsExporter { case processExporterNone, processExporterOTLP: default: return fmt.Errorf("unsupported metrics exporter %q", cfg.MetricsExporter) } if cfg.TracesProtocol != "" && cfg.TracesProtocol != processProtocolHTTPProtobuf && cfg.TracesProtocol != processProtocolGRPC { return fmt.Errorf("unsupported OTLP traces protocol %q", cfg.TracesProtocol) } if cfg.MetricsProtocol != "" && cfg.MetricsProtocol != processProtocolHTTPProtobuf && cfg.MetricsProtocol != processProtocolGRPC { return fmt.Errorf("unsupported OTLP metrics protocol %q", cfg.MetricsProtocol) } return nil } // Runtime owns the Notification Service OpenTelemetry providers and // low-cardinality custom instruments. type Runtime struct { tracerProvider oteltrace.TracerProvider meterProvider metric.MeterProvider shutdownMu sync.Mutex shutdownDone bool shutdownErr error shutdownFns []func(context.Context) error routeScheduleReaderMu sync.RWMutex routeScheduleReader RouteScheduleSnapshotReader intentStreamLagReaderMu sync.RWMutex intentStreamLagReader IntentStreamLagSnapshotReader internalHTTPRequests metric.Int64Counter internalHTTPDuration metric.Float64Histogram internalHTTPLifecycle metric.Int64Counter intentOutcomes metric.Int64Counter malformedIntents metric.Int64Counter userEnrichment metric.Int64Counter routePublishAttempts metric.Int64Counter routeRetries metric.Int64Counter routeDeadLetters metric.Int64Counter } // RouteScheduleSnapshot stores the current observable state of the durable // notification route schedule. type RouteScheduleSnapshot struct { // Depth stores how many route keys are currently present in the route // schedule. Depth int64 // OldestScheduledFor stores the oldest currently scheduled due time when // one exists. OldestScheduledFor *time.Time } // RouteScheduleSnapshotReader loads one current route-schedule snapshot for // observable gauge reporting. type RouteScheduleSnapshotReader interface { // ReadRouteScheduleSnapshot returns the current route-schedule depth and // its oldest scheduled timestamp when one exists. ReadRouteScheduleSnapshot(context.Context) (RouteScheduleSnapshot, error) } // IntentStreamLagSnapshot stores the current observable lag of the plain-XREAD // notification-intent consumer. type IntentStreamLagSnapshot struct { // OldestUnprocessedAt stores the Redis Stream timestamp of the oldest // entry that has not yet been durably processed. OldestUnprocessedAt *time.Time } // IntentStreamLagSnapshotReader loads one current intent-stream lag snapshot // for observable gauge reporting. type IntentStreamLagSnapshotReader interface { // ReadIntentStreamLagSnapshot returns the oldest unprocessed stream entry // timestamp when one exists. ReadIntentStreamLagSnapshot(context.Context) (IntentStreamLagSnapshot, error) } // New constructs a lightweight telemetry runtime around meterProvider for // tests and embedded use cases that do not need process-level exporter wiring. func New(meterProvider metric.MeterProvider) (*Runtime, error) { return NewWithProviders(meterProvider, nil) } // NewWithProviders constructs a telemetry runtime around explicitly supplied // meterProvider and tracerProvider values. func NewWithProviders(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider) (*Runtime, error) { if meterProvider == nil { meterProvider = otel.GetMeterProvider() } if tracerProvider == nil { tracerProvider = otel.GetTracerProvider() } if meterProvider == nil { return nil, errors.New("new notification telemetry runtime: nil meter provider") } if tracerProvider == nil { return nil, errors.New("new notification telemetry runtime: nil tracer provider") } return buildRuntime(meterProvider, tracerProvider, nil) } // NewProcess constructs the process-wide Notification Service OpenTelemetry // runtime from cfg, installs the resulting providers globally, and returns the // runtime. func NewProcess(ctx context.Context, cfg ProcessConfig, logger *slog.Logger) (*Runtime, error) { if ctx == nil { return nil, errors.New("new notification telemetry process: nil context") } if err := cfg.Validate(); err != nil { return nil, fmt.Errorf("new notification telemetry process: %w", err) } if logger == nil { logger = slog.Default() } serviceName := strings.TrimSpace(cfg.ServiceName) if serviceName == "" { serviceName = defaultServiceName } res := resource.NewSchemaless(attribute.String("service.name", serviceName)) tracerProvider, err := newTracerProvider(ctx, res, cfg) if err != nil { return nil, fmt.Errorf("new notification telemetry process: tracer provider: %w", err) } meterProvider, err := newMeterProvider(ctx, res, cfg) if err != nil { return nil, fmt.Errorf("new notification telemetry process: meter provider: %w", err) } otel.SetTracerProvider(tracerProvider) otel.SetMeterProvider(meterProvider) otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator( propagation.TraceContext{}, propagation.Baggage{}, )) runtime, err := buildRuntime(meterProvider, tracerProvider, []func(context.Context) error{ meterProvider.Shutdown, tracerProvider.Shutdown, }) if err != nil { return nil, fmt.Errorf("new notification telemetry process: runtime: %w", err) } logger.Info("notification telemetry configured", "service_name", serviceName, "traces_exporter", cfg.TracesExporter, "metrics_exporter", cfg.MetricsExporter, ) return runtime, nil } // TracerProvider returns the runtime tracer provider. func (runtime *Runtime) TracerProvider() oteltrace.TracerProvider { if runtime == nil || runtime.tracerProvider == nil { return otel.GetTracerProvider() } return runtime.tracerProvider } // MeterProvider returns the runtime meter provider. func (runtime *Runtime) MeterProvider() metric.MeterProvider { if runtime == nil || runtime.meterProvider == nil { return otel.GetMeterProvider() } return runtime.meterProvider } // Shutdown flushes and stops the configured telemetry providers. Shutdown is // idempotent. func (runtime *Runtime) Shutdown(ctx context.Context) error { if runtime == nil { return nil } runtime.shutdownMu.Lock() if runtime.shutdownDone { err := runtime.shutdownErr runtime.shutdownMu.Unlock() return err } runtime.shutdownDone = true runtime.shutdownMu.Unlock() var shutdownErr error for index := len(runtime.shutdownFns) - 1; index >= 0; index-- { shutdownErr = errors.Join(shutdownErr, runtime.shutdownFns[index](ctx)) } runtime.shutdownMu.Lock() runtime.shutdownErr = shutdownErr runtime.shutdownMu.Unlock() return shutdownErr } // RecordInternalHTTPRequest records one internal HTTP request outcome. func (runtime *Runtime) RecordInternalHTTPRequest(ctx context.Context, attrs []attribute.KeyValue, duration time.Duration) { if runtime == nil { return } options := metric.WithAttributes(attrs...) runtime.internalHTTPRequests.Add(normalizeContext(ctx), 1, options) runtime.internalHTTPDuration.Record(normalizeContext(ctx), duration.Seconds()*1000, options) } // RecordInternalHTTPEvent records one internal HTTP server lifecycle event. func (runtime *Runtime) RecordInternalHTTPEvent(ctx context.Context, event string) { if runtime == nil { return } runtime.internalHTTPLifecycle.Add( normalizeContext(ctx), 1, metric.WithAttributes(attribute.String("event", strings.TrimSpace(event))), ) } // RecordIntentOutcome records one accepted notification-intent outcome. func (runtime *Runtime) RecordIntentOutcome(ctx context.Context, notificationType string, producer string, audienceKind string, outcome string) { if runtime == nil { return } runtime.intentOutcomes.Add( normalizeContext(ctx), 1, metric.WithAttributes( attribute.String("notification_type", cleanAttribute(notificationType, "unknown")), attribute.String("producer", cleanAttribute(producer, "unknown")), attribute.String("audience_kind", cleanAttribute(audienceKind, "unknown")), attribute.String("outcome", cleanAttribute(outcome, "unknown")), ), ) } // RecordMalformedIntent records one malformed or rejected notification intent. func (runtime *Runtime) RecordMalformedIntent(ctx context.Context, failureCode string, notificationType string, producer string) { if runtime == nil { return } runtime.malformedIntents.Add( normalizeContext(ctx), 1, metric.WithAttributes( attribute.String("failure_code", cleanAttribute(failureCode, "unknown")), attribute.String("notification_type", cleanAttribute(notificationType, "unknown")), attribute.String("producer", cleanAttribute(producer, "unknown")), ), ) } // RecordUserEnrichmentAttempt records one User Service enrichment lookup // outcome. func (runtime *Runtime) RecordUserEnrichmentAttempt(ctx context.Context, notificationType string, result string) { if runtime == nil { return } runtime.userEnrichment.Add( normalizeContext(ctx), 1, metric.WithAttributes( attribute.String("notification_type", cleanAttribute(notificationType, "unknown")), attribute.String("result", cleanAttribute(result, "unknown")), ), ) } // RecordRoutePublishAttempt records one route publication attempt outcome. func (runtime *Runtime) RecordRoutePublishAttempt(ctx context.Context, channel string, notificationType string, result string, failureClassification string) { if runtime == nil { return } runtime.routePublishAttempts.Add( normalizeContext(ctx), 1, metric.WithAttributes( attribute.String("channel", cleanAttribute(channel, "unknown")), attribute.String("notification_type", cleanAttribute(notificationType, "unknown")), attribute.String("result", cleanAttribute(result, "unknown")), attribute.String("failure_classification", cleanAttribute(failureClassification, "none")), ), ) } // RecordRouteRetry records one route retry scheduling event. func (runtime *Runtime) RecordRouteRetry(ctx context.Context, channel string, notificationType string) { if runtime == nil { return } runtime.routeRetries.Add( normalizeContext(ctx), 1, metric.WithAttributes( attribute.String("channel", cleanAttribute(channel, "unknown")), attribute.String("notification_type", cleanAttribute(notificationType, "unknown")), ), ) } // RecordRouteDeadLetter records one route transition to dead_letter. func (runtime *Runtime) RecordRouteDeadLetter(ctx context.Context, channel string, notificationType string, failureClassification string) { if runtime == nil { return } runtime.routeDeadLetters.Add( normalizeContext(ctx), 1, metric.WithAttributes( attribute.String("channel", cleanAttribute(channel, "unknown")), attribute.String("notification_type", cleanAttribute(notificationType, "unknown")), attribute.String("failure_classification", cleanAttribute(failureClassification, "unknown")), ), ) } // SetRouteScheduleSnapshotReader installs the route-schedule reader used by // the observable route schedule gauges. func (runtime *Runtime) SetRouteScheduleSnapshotReader(reader RouteScheduleSnapshotReader) { if runtime == nil { return } runtime.routeScheduleReaderMu.Lock() runtime.routeScheduleReader = reader runtime.routeScheduleReaderMu.Unlock() } // SetIntentStreamLagSnapshotReader installs the intent-stream lag reader used // by the observable lag gauge. func (runtime *Runtime) SetIntentStreamLagSnapshotReader(reader IntentStreamLagSnapshotReader) { if runtime == nil { return } runtime.intentStreamLagReaderMu.Lock() runtime.intentStreamLagReader = reader runtime.intentStreamLagReaderMu.Unlock() } func buildRuntime(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider, shutdownFns []func(context.Context) error) (*Runtime, error) { meter := meterProvider.Meter(meterName) runtime := &Runtime{ tracerProvider: tracerProvider, meterProvider: meterProvider, shutdownFns: append([]func(context.Context) error(nil), shutdownFns...), } internalHTTPRequests, err := meter.Int64Counter("notification.internal_http.requests") if err != nil { return nil, fmt.Errorf("build notification telemetry runtime: internal_http.requests: %w", err) } internalHTTPDuration, err := meter.Float64Histogram("notification.internal_http.duration_ms", metric.WithUnit("ms")) if err != nil { return nil, fmt.Errorf("build notification telemetry runtime: internal_http.duration_ms: %w", err) } internalHTTPLifecycle, err := meter.Int64Counter("notification.internal_http.lifecycle") if err != nil { return nil, fmt.Errorf("build notification telemetry runtime: internal_http.lifecycle: %w", err) } intentOutcomes, err := meter.Int64Counter("notification.intent.outcomes") if err != nil { return nil, fmt.Errorf("build notification telemetry runtime: intent.outcomes: %w", err) } malformedIntents, err := meter.Int64Counter("notification.intent.malformed") if err != nil { return nil, fmt.Errorf("build notification telemetry runtime: intent.malformed: %w", err) } userEnrichment, err := meter.Int64Counter("notification.user_enrichment.attempts") if err != nil { return nil, fmt.Errorf("build notification telemetry runtime: user_enrichment.attempts: %w", err) } routePublishAttempts, err := meter.Int64Counter("notification.route.publish_attempts") if err != nil { return nil, fmt.Errorf("build notification telemetry runtime: route.publish_attempts: %w", err) } routeRetries, err := meter.Int64Counter("notification.route.retries") if err != nil { return nil, fmt.Errorf("build notification telemetry runtime: route.retries: %w", err) } routeDeadLetters, err := meter.Int64Counter("notification.route.dead_letters") if err != nil { return nil, fmt.Errorf("build notification telemetry runtime: route.dead_letters: %w", err) } routeScheduleDepth, err := meter.Int64ObservableGauge("notification.route_schedule.depth") if err != nil { return nil, fmt.Errorf("build notification telemetry runtime: route_schedule.depth: %w", err) } routeScheduleOldestAge, err := meter.Int64ObservableGauge("notification.route_schedule.oldest_age_ms", metric.WithUnit("ms")) if err != nil { return nil, fmt.Errorf("build notification telemetry runtime: route_schedule.oldest_age_ms: %w", err) } intentStreamOldestUnprocessedAge, err := meter.Int64ObservableGauge("notification.intent_stream.oldest_unprocessed_age_ms", metric.WithUnit("ms")) if err != nil { return nil, fmt.Errorf("build notification telemetry runtime: intent_stream.oldest_unprocessed_age_ms: %w", err) } registration, err := meter.RegisterCallback(func(ctx context.Context, observer metric.Observer) error { runtime.observeRouteSchedule(ctx, observer, routeScheduleDepth, routeScheduleOldestAge) runtime.observeIntentStreamLag(ctx, observer, intentStreamOldestUnprocessedAge) return nil }, routeScheduleDepth, routeScheduleOldestAge, intentStreamOldestUnprocessedAge) if err != nil { return nil, fmt.Errorf("build notification telemetry runtime: observable callbacks: %w", err) } runtime.shutdownFns = append(runtime.shutdownFns, func(context.Context) error { return registration.Unregister() }) runtime.internalHTTPRequests = internalHTTPRequests runtime.internalHTTPDuration = internalHTTPDuration runtime.internalHTTPLifecycle = internalHTTPLifecycle runtime.intentOutcomes = intentOutcomes runtime.malformedIntents = malformedIntents runtime.userEnrichment = userEnrichment runtime.routePublishAttempts = routePublishAttempts runtime.routeRetries = routeRetries runtime.routeDeadLetters = routeDeadLetters return runtime, nil } func newTracerProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig) (*sdktrace.TracerProvider, error) { options := []sdktrace.TracerProviderOption{ sdktrace.WithResource(res), } if exporter, err := traceExporter(ctx, cfg); err != nil { return nil, err } else if exporter != nil { options = append(options, sdktrace.WithBatcher(exporter)) } if cfg.StdoutTracesEnabled { exporter, err := stdouttrace.New(stdouttrace.WithWriter(os.Stdout)) if err != nil { return nil, fmt.Errorf("stdout traces exporter: %w", err) } options = append(options, sdktrace.WithBatcher(exporter)) } return sdktrace.NewTracerProvider(options...), nil } func newMeterProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig) (*sdkmetric.MeterProvider, error) { options := []sdkmetric.Option{ sdkmetric.WithResource(res), } if exporter, err := metricExporter(ctx, cfg); err != nil { return nil, err } else if exporter != nil { options = append(options, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter))) } if cfg.StdoutMetricsEnabled { exporter, err := stdoutmetric.New(stdoutmetric.WithWriter(os.Stdout)) if err != nil { return nil, fmt.Errorf("stdout metrics exporter: %w", err) } options = append(options, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter))) } return sdkmetric.NewMeterProvider(options...), nil } func traceExporter(ctx context.Context, cfg ProcessConfig) (sdktrace.SpanExporter, error) { if cfg.TracesExporter != processExporterOTLP { return nil, nil } switch normalizeProtocol(cfg.TracesProtocol) { case processProtocolGRPC: exporter, err := otlptracegrpc.New(ctx) if err != nil { return nil, fmt.Errorf("otlp grpc traces exporter: %w", err) } return exporter, nil default: exporter, err := otlptracehttp.New(ctx) if err != nil { return nil, fmt.Errorf("otlp http traces exporter: %w", err) } return exporter, nil } } func metricExporter(ctx context.Context, cfg ProcessConfig) (sdkmetric.Exporter, error) { if cfg.MetricsExporter != processExporterOTLP { return nil, nil } switch normalizeProtocol(cfg.MetricsProtocol) { case processProtocolGRPC: exporter, err := otlpmetricgrpc.New(ctx) if err != nil { return nil, fmt.Errorf("otlp grpc metrics exporter: %w", err) } return exporter, nil default: exporter, err := otlpmetrichttp.New(ctx) if err != nil { return nil, fmt.Errorf("otlp http metrics exporter: %w", err) } return exporter, nil } } func normalizeProtocol(value string) string { switch strings.TrimSpace(value) { case processProtocolGRPC: return processProtocolGRPC default: return processProtocolHTTPProtobuf } } func normalizeContext(ctx context.Context) context.Context { if ctx == nil { return context.Background() } return ctx } func cleanAttribute(value string, fallback string) string { trimmed := strings.TrimSpace(value) if trimmed == "" { return fallback } return trimmed } func (runtime *Runtime) observeRouteSchedule( ctx context.Context, observer metric.Observer, depthGauge metric.Int64ObservableGauge, oldestAgeGauge metric.Int64ObservableGauge, ) { depth := int64(0) oldestAge := int64(0) reader := runtime.currentRouteScheduleReader() if reader != nil { snapshot, err := reader.ReadRouteScheduleSnapshot(ctx) if err != nil { otel.Handle(fmt.Errorf("observe notification route schedule: %w", err)) } else { if snapshot.Depth > 0 { depth = snapshot.Depth } if snapshot.OldestScheduledFor != nil { oldestAge = time.Since(snapshot.OldestScheduledFor.UTC()).Milliseconds() if oldestAge < 0 { oldestAge = 0 } } } } observer.ObserveInt64(depthGauge, depth) observer.ObserveInt64(oldestAgeGauge, oldestAge) } func (runtime *Runtime) observeIntentStreamLag( ctx context.Context, observer metric.Observer, oldestUnprocessedAgeGauge metric.Int64ObservableGauge, ) { oldestAge := int64(0) reader := runtime.currentIntentStreamLagReader() if reader != nil { snapshot, err := reader.ReadIntentStreamLagSnapshot(ctx) if err != nil { otel.Handle(fmt.Errorf("observe notification intent stream lag: %w", err)) } else if snapshot.OldestUnprocessedAt != nil { oldestAge = time.Since(snapshot.OldestUnprocessedAt.UTC()).Milliseconds() if oldestAge < 0 { oldestAge = 0 } } } observer.ObserveInt64(oldestUnprocessedAgeGauge, oldestAge) } func (runtime *Runtime) currentRouteScheduleReader() RouteScheduleSnapshotReader { runtime.routeScheduleReaderMu.RLock() defer runtime.routeScheduleReaderMu.RUnlock() return runtime.routeScheduleReader } func (runtime *Runtime) currentIntentStreamLagReader() IntentStreamLagSnapshotReader { runtime.intentStreamLagReaderMu.RLock() defer runtime.intentStreamLagReaderMu.RUnlock() return runtime.intentStreamLagReader }