Files
galaxy-game/notification/internal/telemetry/runtime.go
T
2026-04-22 08:49:45 +02:00

695 lines
22 KiB
Go

// Package telemetry provides lightweight OpenTelemetry helpers and
// low-cardinality Notification Service instruments.
package telemetry
import (
"context"
"errors"
"fmt"
"log/slog"
"os"
"strings"
"sync"
"time"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp"
"go.opentelemetry.io/otel/exporters/stdout/stdoutmetric"
"go.opentelemetry.io/otel/exporters/stdout/stdouttrace"
"go.opentelemetry.io/otel/metric"
"go.opentelemetry.io/otel/propagation"
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
"go.opentelemetry.io/otel/sdk/resource"
sdktrace "go.opentelemetry.io/otel/sdk/trace"
oteltrace "go.opentelemetry.io/otel/trace"
)
// meterName is the instrumentation scope name passed to MeterProvider.Meter
// when the runtime creates its instruments.
const meterName = "galaxy/notification"
// Service-name default plus the exporter and protocol identifiers that
// ProcessConfig accepts.
const (
// defaultServiceName is used when ProcessConfig.ServiceName is blank.
defaultServiceName = "galaxy-notification"
// processExporterNone disables the external exporter for one signal.
processExporterNone = "none"
// processExporterOTLP selects the OTLP exporter for one signal.
processExporterOTLP = "otlp"
// processProtocolHTTPProtobuf selects OTLP over HTTP. normalizeProtocol also
// falls back to it for every value other than processProtocolGRPC.
processProtocolHTTPProtobuf = "http/protobuf"
// processProtocolGRPC selects OTLP over gRPC.
processProtocolGRPC = "grpc"
)
// ProcessConfig configures the process-wide OpenTelemetry runtime built by
// NewProcess. Call Validate to check the exporter and protocol selections.
type ProcessConfig struct {
// ServiceName overrides the default OpenTelemetry service name. A value that
// is blank after trimming falls back to defaultServiceName.
ServiceName string
// TracesExporter selects the external traces exporter. Supported values are
// `none` and `otlp`; Validate rejects everything else, including the empty
// string.
TracesExporter string
// MetricsExporter selects the external metrics exporter. Supported values
// are `none` and `otlp`; Validate rejects everything else, including the
// empty string.
MetricsExporter string
// TracesProtocol selects the OTLP traces protocol when TracesExporter is
// `otlp`. Supported values are `http/protobuf` and `grpc`; the empty string
// is accepted and treated as `http/protobuf`.
TracesProtocol string
// MetricsProtocol selects the OTLP metrics protocol when MetricsExporter is
// `otlp`. Supported values are `http/protobuf` and `grpc`; the empty string
// is accepted and treated as `http/protobuf`.
MetricsProtocol string
// StdoutTracesEnabled enables the additional stdout trace exporter used for
// local development and debugging. It is added alongside any external
// exporter rather than replacing it.
StdoutTracesEnabled bool
// StdoutMetricsEnabled enables the additional stdout metric exporter used
// for local development and debugging. It is added alongside any external
// exporter rather than replacing it.
StdoutMetricsEnabled bool
}
// Validate checks that cfg names only supported exporters and OTLP protocols.
//
// Both exporters must be exactly `none` or `otlp`. Each protocol must be
// empty, `http/protobuf`, or `grpc`. The first violation found is returned,
// checked in the order traces exporter, metrics exporter, traces protocol,
// metrics protocol. A nil result means cfg is usable.
func (cfg ProcessConfig) Validate() error {
	if cfg.TracesExporter != processExporterNone && cfg.TracesExporter != processExporterOTLP {
		return fmt.Errorf("unsupported traces exporter %q", cfg.TracesExporter)
	}
	if cfg.MetricsExporter != processExporterNone && cfg.MetricsExporter != processExporterOTLP {
		return fmt.Errorf("unsupported metrics exporter %q", cfg.MetricsExporter)
	}

	// An empty protocol is allowed: it means "use the default".
	switch cfg.TracesProtocol {
	case "", processProtocolHTTPProtobuf, processProtocolGRPC:
	default:
		return fmt.Errorf("unsupported OTLP traces protocol %q", cfg.TracesProtocol)
	}
	switch cfg.MetricsProtocol {
	case "", processProtocolHTTPProtobuf, processProtocolGRPC:
	default:
		return fmt.Errorf("unsupported OTLP metrics protocol %q", cfg.MetricsProtocol)
	}

	return nil
}
// Runtime owns the Notification Service OpenTelemetry providers and
// low-cardinality custom instruments. It contains mutexes and is handed out
// as a pointer by its constructors; methods use a pointer receiver.
type Runtime struct {
// tracerProvider and meterProvider are the providers returned by the
// TracerProvider and MeterProvider accessors.
tracerProvider oteltrace.TracerProvider
meterProvider metric.MeterProvider
// shutdownMu guards shutdownDone and shutdownErr.
shutdownMu sync.Mutex
// shutdownDone is set by the first Shutdown call.
shutdownDone bool
// shutdownErr keeps the joined result of the shutdown hooks so that repeated
// Shutdown calls can return it.
shutdownErr error
// shutdownFns holds the shutdown hooks; Shutdown runs them last to first.
shutdownFns []func(context.Context) error
// routeScheduleReaderMu guards routeScheduleReader, the optional source for
// the route-schedule observable gauges.
routeScheduleReaderMu sync.RWMutex
routeScheduleReader RouteScheduleSnapshotReader
// intentStreamLagReaderMu guards intentStreamLagReader, the optional source
// for the intent-stream lag observable gauge.
intentStreamLagReaderMu sync.RWMutex
intentStreamLagReader IntentStreamLagSnapshotReader
// Instruments created by buildRuntime. The comment on each field is the
// instrument name it is registered under.
// notification.internal_http.requests
internalHTTPRequests metric.Int64Counter
// notification.internal_http.duration_ms (unit "ms")
internalHTTPDuration metric.Float64Histogram
// notification.internal_http.lifecycle
internalHTTPLifecycle metric.Int64Counter
// notification.intent.outcomes
intentOutcomes metric.Int64Counter
// notification.intent.malformed
malformedIntents metric.Int64Counter
// notification.user_enrichment.attempts
userEnrichment metric.Int64Counter
// notification.route.publish_attempts
routePublishAttempts metric.Int64Counter
// notification.route.retries
routeRetries metric.Int64Counter
// notification.route.dead_letters
routeDeadLetters metric.Int64Counter
}
// RouteScheduleSnapshot stores the current observable state of the durable
// notification route schedule. It is the value a RouteScheduleSnapshotReader
// hands to the route-schedule gauges.
type RouteScheduleSnapshot struct {
// Depth stores how many route keys are currently present in the route
// schedule. The depth gauge reports non-positive values as 0.
Depth int64
// OldestScheduledFor stores the oldest currently scheduled due time when
// one exists. Nil makes the oldest-age gauge report 0; a time in the future
// is also reported as 0.
OldestScheduledFor *time.Time
}
// RouteScheduleSnapshotReader loads one current route-schedule snapshot for
// observable gauge reporting. Install an implementation with
// Runtime.SetRouteScheduleSnapshotReader.
type RouteScheduleSnapshotReader interface {
// ReadRouteScheduleSnapshot returns the current route-schedule depth and
// its oldest scheduled timestamp when one exists. It is invoked from the
// metric callback with that callback's context. A returned error is passed
// to otel.Handle and both route-schedule gauges report 0 for that
// observation.
ReadRouteScheduleSnapshot(context.Context) (RouteScheduleSnapshot, error)
}
// IntentStreamLagSnapshot stores the current observable lag of the plain-XREAD
// notification-intent consumer. It is the value an
// IntentStreamLagSnapshotReader hands to the lag gauge.
type IntentStreamLagSnapshot struct {
// OldestUnprocessedAt stores the Redis Stream timestamp of the oldest
// entry that has not yet been durably processed. Nil makes the lag gauge
// report 0; a time in the future is also reported as 0.
OldestUnprocessedAt *time.Time
}
// IntentStreamLagSnapshotReader loads one current intent-stream lag snapshot
// for observable gauge reporting. Install an implementation with
// Runtime.SetIntentStreamLagSnapshotReader.
type IntentStreamLagSnapshotReader interface {
// ReadIntentStreamLagSnapshot returns the oldest unprocessed stream entry
// timestamp when one exists. It is invoked from the metric callback with
// that callback's context. A returned error is passed to otel.Handle and
// the lag gauge reports 0 for that observation.
ReadIntentStreamLagSnapshot(context.Context) (IntentStreamLagSnapshot, error)
}
// New builds a lightweight telemetry runtime on top of meterProvider. It is
// meant for tests and embedded use cases that do not need process-level
// exporter wiring. The tracer provider is left unset, so NewWithProviders
// substitutes the globally registered one.
func New(meterProvider metric.MeterProvider) (*Runtime, error) {
	// A nil tracer provider tells NewWithProviders to use the global one.
	var tracerProvider oteltrace.TracerProvider
	return NewWithProviders(meterProvider, tracerProvider)
}
// NewWithProviders builds a telemetry runtime around the supplied
// meterProvider and tracerProvider. A nil argument is replaced by the
// corresponding globally registered provider. An error is returned if a
// provider is still nil after that substitution, or if instrument creation
// fails. The resulting runtime has no provider shutdown hooks of its own:
// the caller keeps ownership of the providers.
func NewWithProviders(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider) (*Runtime, error) {
	if meterProvider == nil {
		meterProvider = otel.GetMeterProvider()
	}
	if tracerProvider == nil {
		tracerProvider = otel.GetTracerProvider()
	}

	switch {
	case meterProvider == nil:
		return nil, errors.New("new notification telemetry runtime: nil meter provider")
	case tracerProvider == nil:
		return nil, errors.New("new notification telemetry runtime: nil tracer provider")
	}

	return buildRuntime(meterProvider, tracerProvider, nil)
}
// NewProcess constructs the process-wide Notification Service OpenTelemetry
// runtime from cfg, installs the resulting providers and the W3C
// trace-context/baggage propagator globally, and returns the runtime.
//
// ctx must be non-nil; it is used to create the exporters. cfg is validated
// first. A nil logger is replaced by slog.Default(). A blank cfg.ServiceName
// falls back to defaultServiceName.
//
// On failure nothing is installed globally, and every provider that was
// already created is shut down so that its exporters and background workers
// do not leak. Errors from that cleanup are joined onto the returned error.
func NewProcess(ctx context.Context, cfg ProcessConfig, logger *slog.Logger) (*Runtime, error) {
	if ctx == nil {
		return nil, errors.New("new notification telemetry process: nil context")
	}
	if err := cfg.Validate(); err != nil {
		return nil, fmt.Errorf("new notification telemetry process: %w", err)
	}
	if logger == nil {
		logger = slog.Default()
	}

	serviceName := strings.TrimSpace(cfg.ServiceName)
	if serviceName == "" {
		serviceName = defaultServiceName
	}
	res := resource.NewSchemaless(attribute.String("service.name", serviceName))

	tracerProvider, err := newTracerProvider(ctx, res, cfg)
	if err != nil {
		return nil, fmt.Errorf("new notification telemetry process: tracer provider: %w", err)
	}

	meterProvider, err := newMeterProvider(ctx, res, cfg)
	if err != nil {
		// The tracer provider is already live; release it before bailing out.
		return nil, errors.Join(
			fmt.Errorf("new notification telemetry process: meter provider: %w", err),
			tracerProvider.Shutdown(ctx),
		)
	}

	runtime, err := buildRuntime(meterProvider, tracerProvider, []func(context.Context) error{
		meterProvider.Shutdown,
		tracerProvider.Shutdown,
	})
	if err != nil {
		// No runtime owns the providers, so they must be released here.
		return nil, errors.Join(
			fmt.Errorf("new notification telemetry process: runtime: %w", err),
			meterProvider.Shutdown(ctx),
			tracerProvider.Shutdown(ctx),
		)
	}

	// Publish globally only once construction can no longer fail, so a failed
	// call never leaves dead providers registered.
	otel.SetTracerProvider(tracerProvider)
	otel.SetMeterProvider(meterProvider)
	otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
		propagation.TraceContext{},
		propagation.Baggage{},
	))

	logger.Info("notification telemetry configured",
		"service_name", serviceName,
		"traces_exporter", cfg.TracesExporter,
		"metrics_exporter", cfg.MetricsExporter,
	)

	return runtime, nil
}
// TracerProvider returns the tracer provider held by the runtime. A nil
// runtime, or one without a tracer provider, yields the globally registered
// tracer provider instead.
func (runtime *Runtime) TracerProvider() oteltrace.TracerProvider {
	if runtime != nil && runtime.tracerProvider != nil {
		return runtime.tracerProvider
	}
	return otel.GetTracerProvider()
}
// MeterProvider returns the meter provider held by the runtime. A nil
// runtime, or one without a meter provider, yields the globally registered
// meter provider instead.
func (runtime *Runtime) MeterProvider() metric.MeterProvider {
	if runtime != nil && runtime.meterProvider != nil {
		return runtime.meterProvider
	}
	return otel.GetMeterProvider()
}
// Shutdown flushes and stops the configured telemetry providers by running
// the registered shutdown hooks last to first and joining their errors.
//
// Shutdown is idempotent and safe for concurrent use. Only the first call
// runs the hooks; every later call returns the result of that first run. The
// shutdown mutex is held while the hooks run, so a concurrent caller blocks
// until the in-flight shutdown has finished. It then observes the real
// outcome rather than a premature nil. A nil runtime is a no-op, and a nil
// ctx is replaced by context.Background().
func (runtime *Runtime) Shutdown(ctx context.Context) error {
	if runtime == nil {
		return nil
	}
	ctx = normalizeContext(ctx)

	runtime.shutdownMu.Lock()
	defer runtime.shutdownMu.Unlock()

	if runtime.shutdownDone {
		return runtime.shutdownErr
	}
	// Mark completion up front so the hooks run at most once even if one of
	// them panics.
	runtime.shutdownDone = true

	var shutdownErr error
	for index := len(runtime.shutdownFns) - 1; index >= 0; index-- {
		shutdownErr = errors.Join(shutdownErr, runtime.shutdownFns[index](ctx))
	}
	runtime.shutdownErr = shutdownErr

	return shutdownErr
}
// RecordInternalHTTPRequest records one internal HTTP request outcome. It
// adds 1 to the request counter and records duration, converted to
// milliseconds, in the duration histogram. Both measurements carry attrs.
// A nil runtime is a no-op; a nil ctx is replaced by context.Background().
func (runtime *Runtime) RecordInternalHTTPRequest(ctx context.Context, attrs []attribute.KeyValue, duration time.Duration) {
	if runtime == nil {
		return
	}

	ctx = normalizeContext(ctx)
	labels := metric.WithAttributes(attrs...)
	elapsedMs := duration.Seconds() * 1000

	runtime.internalHTTPRequests.Add(ctx, 1, labels)
	runtime.internalHTTPDuration.Record(ctx, elapsedMs, labels)
}
// RecordInternalHTTPEvent records one internal HTTP server lifecycle event by
// adding 1 to the lifecycle counter with an "event" attribute.
//
// event is trimmed. A blank event is reported as "unknown", matching how the
// other Record* methods treat blank attribute values, so the counter never
// carries an empty label. A nil runtime is a no-op; a nil ctx is replaced by
// context.Background().
func (runtime *Runtime) RecordInternalHTTPEvent(ctx context.Context, event string) {
	if runtime == nil {
		return
	}

	runtime.internalHTTPLifecycle.Add(
		normalizeContext(ctx),
		1,
		metric.WithAttributes(attribute.String("event", cleanAttribute(event, "unknown"))),
	)
}
// RecordIntentOutcome counts one accepted notification-intent outcome.
// The measurement is labelled with notification_type, producer,
// audience_kind, and outcome. Each value is trimmed and a blank one is
// reported as "unknown". A nil runtime is a no-op.
func (runtime *Runtime) RecordIntentOutcome(ctx context.Context, notificationType string, producer string, audienceKind string, outcome string) {
	if runtime == nil {
		return
	}

	labels := metric.WithAttributes(
		attribute.String("notification_type", cleanAttribute(notificationType, "unknown")),
		attribute.String("producer", cleanAttribute(producer, "unknown")),
		attribute.String("audience_kind", cleanAttribute(audienceKind, "unknown")),
		attribute.String("outcome", cleanAttribute(outcome, "unknown")),
	)
	runtime.intentOutcomes.Add(normalizeContext(ctx), 1, labels)
}
// RecordMalformedIntent counts one malformed or rejected notification intent.
// The measurement is labelled with failure_code, notification_type, and
// producer. Each value is trimmed and a blank one is reported as "unknown".
// A nil runtime is a no-op.
func (runtime *Runtime) RecordMalformedIntent(ctx context.Context, failureCode string, notificationType string, producer string) {
	if runtime == nil {
		return
	}

	labels := metric.WithAttributes(
		attribute.String("failure_code", cleanAttribute(failureCode, "unknown")),
		attribute.String("notification_type", cleanAttribute(notificationType, "unknown")),
		attribute.String("producer", cleanAttribute(producer, "unknown")),
	)
	runtime.malformedIntents.Add(normalizeContext(ctx), 1, labels)
}
// RecordUserEnrichmentAttempt counts one User Service enrichment lookup
// outcome. The measurement is labelled with notification_type and result.
// Each value is trimmed and a blank one is reported as "unknown". A nil
// runtime is a no-op.
func (runtime *Runtime) RecordUserEnrichmentAttempt(ctx context.Context, notificationType string, result string) {
	if runtime == nil {
		return
	}

	labels := metric.WithAttributes(
		attribute.String("notification_type", cleanAttribute(notificationType, "unknown")),
		attribute.String("result", cleanAttribute(result, "unknown")),
	)
	runtime.userEnrichment.Add(normalizeContext(ctx), 1, labels)
}
// RecordRoutePublishAttempt counts one route publication attempt outcome.
// The measurement is labelled with channel, notification_type, result, and
// failure_classification. Values are trimmed. A blank channel,
// notification type, or result is reported as "unknown". A blank failure
// classification is reported as "none". A nil runtime is a no-op.
func (runtime *Runtime) RecordRoutePublishAttempt(ctx context.Context, channel string, notificationType string, result string, failureClassification string) {
	if runtime == nil {
		return
	}

	labels := metric.WithAttributes(
		attribute.String("channel", cleanAttribute(channel, "unknown")),
		attribute.String("notification_type", cleanAttribute(notificationType, "unknown")),
		attribute.String("result", cleanAttribute(result, "unknown")),
		attribute.String("failure_classification", cleanAttribute(failureClassification, "none")),
	)
	runtime.routePublishAttempts.Add(normalizeContext(ctx), 1, labels)
}
// RecordRouteRetry counts one route retry scheduling event. The measurement
// is labelled with channel and notification_type. Each value is trimmed and
// a blank one is reported as "unknown". A nil runtime is a no-op.
func (runtime *Runtime) RecordRouteRetry(ctx context.Context, channel string, notificationType string) {
	if runtime == nil {
		return
	}

	labels := metric.WithAttributes(
		attribute.String("channel", cleanAttribute(channel, "unknown")),
		attribute.String("notification_type", cleanAttribute(notificationType, "unknown")),
	)
	runtime.routeRetries.Add(normalizeContext(ctx), 1, labels)
}
// RecordRouteDeadLetter counts one route transition to dead_letter. The
// measurement is labelled with channel, notification_type, and
// failure_classification. Each value is trimmed and a blank one is reported
// as "unknown". A nil runtime is a no-op.
func (runtime *Runtime) RecordRouteDeadLetter(ctx context.Context, channel string, notificationType string, failureClassification string) {
	if runtime == nil {
		return
	}

	labels := metric.WithAttributes(
		attribute.String("channel", cleanAttribute(channel, "unknown")),
		attribute.String("notification_type", cleanAttribute(notificationType, "unknown")),
		attribute.String("failure_classification", cleanAttribute(failureClassification, "unknown")),
	)
	runtime.routeDeadLetters.Add(normalizeContext(ctx), 1, labels)
}
// SetRouteScheduleSnapshotReader installs the reader consulted by the
// observable route-schedule gauges. The gauges look the reader up on every
// observation, so a replacement takes effect from the next one. A nil reader
// makes the gauges report 0. Calling this on a nil runtime is a no-op.
func (runtime *Runtime) SetRouteScheduleSnapshotReader(reader RouteScheduleSnapshotReader) {
	if runtime != nil {
		runtime.routeScheduleReaderMu.Lock()
		defer runtime.routeScheduleReaderMu.Unlock()
		runtime.routeScheduleReader = reader
	}
}
// SetIntentStreamLagSnapshotReader installs the reader consulted by the
// observable intent-stream lag gauge. The gauge looks the reader up on every
// observation, so a replacement takes effect from the next one. A nil reader
// makes the gauge report 0. Calling this on a nil runtime is a no-op.
func (runtime *Runtime) SetIntentStreamLagSnapshotReader(reader IntentStreamLagSnapshotReader) {
	if runtime != nil {
		runtime.intentStreamLagReaderMu.Lock()
		defer runtime.intentStreamLagReaderMu.Unlock()
		runtime.intentStreamLagReader = reader
	}
}
// buildRuntime creates every Notification Service instrument on a meter
// obtained from meterProvider and wires the observable-gauge callback.
//
// shutdownFns is copied, so the caller's slice is never aliased. One extra
// hook that unregisters the gauge callback is appended after the copy. Since
// Shutdown runs hooks last to first, that hook runs before the caller's.
// Instruments are created in a fixed order and the first failure aborts
// construction with an error naming the instrument.
func buildRuntime(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider, shutdownFns []func(context.Context) error) (*Runtime, error) {
	meter := meterProvider.Meter(meterName)

	runtime := &Runtime{
		tracerProvider: tracerProvider,
		meterProvider:  meterProvider,
		shutdownFns:    append([]func(context.Context) error(nil), shutdownFns...),
	}

	// Synchronous instruments are stored straight into the runtime. A
	// half-populated runtime is never returned because every failure below
	// returns nil.
	var err error
	if runtime.internalHTTPRequests, err = meter.Int64Counter("notification.internal_http.requests"); err != nil {
		return nil, fmt.Errorf("build notification telemetry runtime: internal_http.requests: %w", err)
	}
	if runtime.internalHTTPDuration, err = meter.Float64Histogram("notification.internal_http.duration_ms", metric.WithUnit("ms")); err != nil {
		return nil, fmt.Errorf("build notification telemetry runtime: internal_http.duration_ms: %w", err)
	}
	if runtime.internalHTTPLifecycle, err = meter.Int64Counter("notification.internal_http.lifecycle"); err != nil {
		return nil, fmt.Errorf("build notification telemetry runtime: internal_http.lifecycle: %w", err)
	}
	if runtime.intentOutcomes, err = meter.Int64Counter("notification.intent.outcomes"); err != nil {
		return nil, fmt.Errorf("build notification telemetry runtime: intent.outcomes: %w", err)
	}
	if runtime.malformedIntents, err = meter.Int64Counter("notification.intent.malformed"); err != nil {
		return nil, fmt.Errorf("build notification telemetry runtime: intent.malformed: %w", err)
	}
	if runtime.userEnrichment, err = meter.Int64Counter("notification.user_enrichment.attempts"); err != nil {
		return nil, fmt.Errorf("build notification telemetry runtime: user_enrichment.attempts: %w", err)
	}
	if runtime.routePublishAttempts, err = meter.Int64Counter("notification.route.publish_attempts"); err != nil {
		return nil, fmt.Errorf("build notification telemetry runtime: route.publish_attempts: %w", err)
	}
	if runtime.routeRetries, err = meter.Int64Counter("notification.route.retries"); err != nil {
		return nil, fmt.Errorf("build notification telemetry runtime: route.retries: %w", err)
	}
	if runtime.routeDeadLetters, err = meter.Int64Counter("notification.route.dead_letters"); err != nil {
		return nil, fmt.Errorf("build notification telemetry runtime: route.dead_letters: %w", err)
	}

	// Observable gauges stay local: only the callback below needs them.
	scheduleDepth, err := meter.Int64ObservableGauge("notification.route_schedule.depth")
	if err != nil {
		return nil, fmt.Errorf("build notification telemetry runtime: route_schedule.depth: %w", err)
	}
	scheduleOldestAge, err := meter.Int64ObservableGauge("notification.route_schedule.oldest_age_ms", metric.WithUnit("ms"))
	if err != nil {
		return nil, fmt.Errorf("build notification telemetry runtime: route_schedule.oldest_age_ms: %w", err)
	}
	streamOldestAge, err := meter.Int64ObservableGauge("notification.intent_stream.oldest_unprocessed_age_ms", metric.WithUnit("ms"))
	if err != nil {
		return nil, fmt.Errorf("build notification telemetry runtime: intent_stream.oldest_unprocessed_age_ms: %w", err)
	}

	// One callback feeds all three gauges. Reader failures are reported
	// through otel.Handle inside the observe helpers, so the callback itself
	// never fails.
	observe := func(ctx context.Context, observer metric.Observer) error {
		runtime.observeRouteSchedule(ctx, observer, scheduleDepth, scheduleOldestAge)
		runtime.observeIntentStreamLag(ctx, observer, streamOldestAge)
		return nil
	}
	registration, err := meter.RegisterCallback(observe, scheduleDepth, scheduleOldestAge, streamOldestAge)
	if err != nil {
		return nil, fmt.Errorf("build notification telemetry runtime: observable callbacks: %w", err)
	}
	runtime.shutdownFns = append(runtime.shutdownFns, func(context.Context) error {
		return registration.Unregister()
	})

	return runtime, nil
}
// newTracerProvider builds the SDK tracer provider for res. It batches spans
// to the external exporter selected by cfg, when there is one, and
// additionally to a stdout exporter when cfg.StdoutTracesEnabled is set.
//
// Both exporters are created before either is wrapped in a batch processor.
// If the stdout exporter cannot be created, the already-created external
// exporter is shut down instead of being leaked, and any shutdown error is
// joined onto the returned error. The processor order is unchanged: external
// first, stdout second.
func newTracerProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig) (*sdktrace.TracerProvider, error) {
	external, err := traceExporter(ctx, cfg)
	if err != nil {
		return nil, err
	}

	var stdout sdktrace.SpanExporter
	if cfg.StdoutTracesEnabled {
		exporter, err := stdouttrace.New(stdouttrace.WithWriter(os.Stdout))
		if err != nil {
			err = fmt.Errorf("stdout traces exporter: %w", err)
			if external != nil {
				// Release the external exporter's client resources.
				err = errors.Join(err, external.Shutdown(ctx))
			}
			return nil, err
		}
		stdout = exporter
	}

	options := []sdktrace.TracerProviderOption{
		sdktrace.WithResource(res),
	}
	if external != nil {
		options = append(options, sdktrace.WithBatcher(external))
	}
	if stdout != nil {
		options = append(options, sdktrace.WithBatcher(stdout))
	}

	return sdktrace.NewTracerProvider(options...), nil
}
// newMeterProvider builds the SDK meter provider for res. It attaches a
// periodic reader for the external exporter selected by cfg, when there is
// one, and an additional periodic reader for a stdout exporter when
// cfg.StdoutMetricsEnabled is set.
//
// Both exporters are created before either is wrapped in a periodic reader.
// If the stdout exporter cannot be created, the already-created external
// exporter is shut down instead of being leaked, and any shutdown error is
// joined onto the returned error. The reader order is unchanged: external
// first, stdout second.
func newMeterProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig) (*sdkmetric.MeterProvider, error) {
	external, err := metricExporter(ctx, cfg)
	if err != nil {
		return nil, err
	}

	var stdout sdkmetric.Exporter
	if cfg.StdoutMetricsEnabled {
		exporter, err := stdoutmetric.New(stdoutmetric.WithWriter(os.Stdout))
		if err != nil {
			err = fmt.Errorf("stdout metrics exporter: %w", err)
			if external != nil {
				// Release the external exporter's client resources.
				err = errors.Join(err, external.Shutdown(ctx))
			}
			return nil, err
		}
		stdout = exporter
	}

	options := []sdkmetric.Option{
		sdkmetric.WithResource(res),
	}
	if external != nil {
		options = append(options, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(external)))
	}
	if stdout != nil {
		options = append(options, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(stdout)))
	}

	return sdkmetric.NewMeterProvider(options...), nil
}
// traceExporter creates the external span exporter selected by cfg. It
// returns a nil exporter and a nil error when cfg.TracesExporter is not
// `otlp`. Otherwise it builds the OTLP gRPC exporter when the normalized
// protocol is `grpc`, and the OTLP HTTP exporter for every other protocol
// value.
func traceExporter(ctx context.Context, cfg ProcessConfig) (sdktrace.SpanExporter, error) {
	if cfg.TracesExporter != processExporterOTLP {
		return nil, nil
	}

	if normalizeProtocol(cfg.TracesProtocol) == processProtocolGRPC {
		grpcExporter, err := otlptracegrpc.New(ctx)
		if err != nil {
			return nil, fmt.Errorf("otlp grpc traces exporter: %w", err)
		}
		return grpcExporter, nil
	}

	httpExporter, err := otlptracehttp.New(ctx)
	if err != nil {
		return nil, fmt.Errorf("otlp http traces exporter: %w", err)
	}
	return httpExporter, nil
}
// metricExporter creates the external metric exporter selected by cfg. It
// returns a nil exporter and a nil error when cfg.MetricsExporter is not
// `otlp`. Otherwise it builds the OTLP gRPC exporter when the normalized
// protocol is `grpc`, and the OTLP HTTP exporter for every other protocol
// value.
func metricExporter(ctx context.Context, cfg ProcessConfig) (sdkmetric.Exporter, error) {
	if cfg.MetricsExporter != processExporterOTLP {
		return nil, nil
	}

	if normalizeProtocol(cfg.MetricsProtocol) == processProtocolGRPC {
		grpcExporter, err := otlpmetricgrpc.New(ctx)
		if err != nil {
			return nil, fmt.Errorf("otlp grpc metrics exporter: %w", err)
		}
		return grpcExporter, nil
	}

	httpExporter, err := otlpmetrichttp.New(ctx)
	if err != nil {
		return nil, fmt.Errorf("otlp http metrics exporter: %w", err)
	}
	return httpExporter, nil
}
// normalizeProtocol maps a configured OTLP protocol onto one of the two
// supported identifiers. Surrounding whitespace is ignored. `grpc` stays
// `grpc`; every other value, including the empty string, becomes
// `http/protobuf`.
func normalizeProtocol(value string) string {
	if strings.TrimSpace(value) == processProtocolGRPC {
		return processProtocolGRPC
	}
	return processProtocolHTTPProtobuf
}
// normalizeContext returns ctx unchanged when it is non-nil and
// context.Background() otherwise, so instruments never receive a nil context.
func normalizeContext(ctx context.Context) context.Context {
	if ctx != nil {
		return ctx
	}
	return context.Background()
}
// cleanAttribute returns value with surrounding whitespace removed. When
// nothing remains after trimming it returns fallback, exactly as given.
func cleanAttribute(value string, fallback string) string {
	if trimmed := strings.TrimSpace(value); trimmed != "" {
		return trimmed
	}
	return fallback
}
// observeRouteSchedule reports the route-schedule depth and the age in
// milliseconds of its oldest scheduled entry.
//
// Both gauges are always observed. They report 0 when no reader is
// installed or when the reader fails; the failure is passed to otel.Handle.
// A non-positive depth is reported as 0. A missing timestamp, or one in the
// future, is reported as an age of 0.
func (runtime *Runtime) observeRouteSchedule(
	ctx context.Context,
	observer metric.Observer,
	depthGauge metric.Int64ObservableGauge,
	oldestAgeGauge metric.Int64ObservableGauge,
) {
	var depth, oldestAge int64

	if reader := runtime.currentRouteScheduleReader(); reader != nil {
		snapshot, err := reader.ReadRouteScheduleSnapshot(ctx)
		switch {
		case err != nil:
			otel.Handle(fmt.Errorf("observe notification route schedule: %w", err))
		default:
			if snapshot.Depth > 0 {
				depth = snapshot.Depth
			}
			if due := snapshot.OldestScheduledFor; due != nil {
				// Clamp at zero: an entry due in the future has no age yet.
				if age := time.Since(due.UTC()).Milliseconds(); age > 0 {
					oldestAge = age
				}
			}
		}
	}

	observer.ObserveInt64(depthGauge, depth)
	observer.ObserveInt64(oldestAgeGauge, oldestAge)
}
// observeIntentStreamLag reports the age in milliseconds of the oldest
// unprocessed intent-stream entry.
//
// The gauge is always observed. It reports 0 when no reader is installed or
// when the reader fails; the failure is passed to otel.Handle. It also
// reports 0 when the snapshot has no timestamp or the timestamp lies in the
// future.
func (runtime *Runtime) observeIntentStreamLag(
	ctx context.Context,
	observer metric.Observer,
	oldestUnprocessedAgeGauge metric.Int64ObservableGauge,
) {
	var oldestAge int64

	if reader := runtime.currentIntentStreamLagReader(); reader != nil {
		snapshot, err := reader.ReadIntentStreamLagSnapshot(ctx)
		switch {
		case err != nil:
			otel.Handle(fmt.Errorf("observe notification intent stream lag: %w", err))
		case snapshot.OldestUnprocessedAt != nil:
			// Clamp at zero: a timestamp in the future has no lag yet.
			if age := time.Since(snapshot.OldestUnprocessedAt.UTC()).Milliseconds(); age > 0 {
				oldestAge = age
			}
		}
	}

	observer.ObserveInt64(oldestUnprocessedAgeGauge, oldestAge)
}
// currentRouteScheduleReader returns the route-schedule reader that is
// installed right now, or nil when none is. The field is read under the
// read lock.
func (runtime *Runtime) currentRouteScheduleReader() RouteScheduleSnapshotReader {
	runtime.routeScheduleReaderMu.RLock()
	reader := runtime.routeScheduleReader
	runtime.routeScheduleReaderMu.RUnlock()

	return reader
}
// currentIntentStreamLagReader returns the intent-stream lag reader that is
// installed right now, or nil when none is. The field is read under the
// read lock.
func (runtime *Runtime) currentIntentStreamLagReader() IntentStreamLagSnapshotReader {
	runtime.intentStreamLagReaderMu.RLock()
	reader := runtime.intentStreamLagReader
	runtime.intentStreamLagReaderMu.RUnlock()

	return reader
}