// Package telemetry provides shared OpenTelemetry runtime helpers and // low-cardinality authsession instruments. package telemetry import ( "context" "errors" "fmt" "galaxy/authsession/internal/domain/devicesession" "io" "os" "strings" "sync" "time" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc" "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp" "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp" "go.opentelemetry.io/otel/exporters/stdout/stdoutmetric" "go.opentelemetry.io/otel/exporters/stdout/stdouttrace" "go.opentelemetry.io/otel/metric" "go.opentelemetry.io/otel/propagation" sdkmetric "go.opentelemetry.io/otel/sdk/metric" "go.opentelemetry.io/otel/sdk/resource" sdktrace "go.opentelemetry.io/otel/sdk/trace" oteltrace "go.opentelemetry.io/otel/trace" "go.uber.org/zap" ) const meterName = "galaxy/authsession" const ( processExporterNone = "none" processExporterOTLP = "otlp" processProtocolHTTPProtobuf = "http/protobuf" processProtocolGRPC = "grpc" ) // ProcessConfig configures the process-wide OpenTelemetry runtime. type ProcessConfig struct { // ServiceName overrides the default OpenTelemetry service name. ServiceName string // TracesExporter selects the external traces exporter. Supported values are // `none` and `otlp`. TracesExporter string // MetricsExporter selects the external metrics exporter. Supported values // are `none` and `otlp`. MetricsExporter string // TracesProtocol selects the OTLP traces protocol when TracesExporter is // `otlp`. TracesProtocol string // MetricsProtocol selects the OTLP metrics protocol when MetricsExporter is // `otlp`. MetricsProtocol string // StdoutTracesEnabled enables the additional stdout trace exporter used for // local development and debugging. StdoutTracesEnabled bool // StdoutMetricsEnabled enables the additional stdout metric exporter used // for local development and debugging. StdoutMetricsEnabled bool } // Validate reports whether cfg contains a supported OpenTelemetry exporter // configuration. func (cfg ProcessConfig) Validate() error { switch cfg.TracesExporter { case processExporterNone, processExporterOTLP: default: return fmt.Errorf("unsupported traces exporter %q", cfg.TracesExporter) } switch cfg.MetricsExporter { case processExporterNone, processExporterOTLP: default: return fmt.Errorf("unsupported metrics exporter %q", cfg.MetricsExporter) } if cfg.TracesProtocol != "" && cfg.TracesProtocol != processProtocolHTTPProtobuf && cfg.TracesProtocol != processProtocolGRPC { return fmt.Errorf("unsupported OTLP traces protocol %q", cfg.TracesProtocol) } if cfg.MetricsProtocol != "" && cfg.MetricsProtocol != processProtocolHTTPProtobuf && cfg.MetricsProtocol != processProtocolGRPC { return fmt.Errorf("unsupported OTLP metrics protocol %q", cfg.MetricsProtocol) } return nil } // SendEmailCodeOutcome identifies the coarse send-email-code result recorded // by authsession metrics. type SendEmailCodeOutcome string const ( // SendEmailCodeOutcomeSent reports that the login code was handed off for // delivery successfully. SendEmailCodeOutcomeSent SendEmailCodeOutcome = "sent" // SendEmailCodeOutcomeSuppressed reports that outward send stayed // success-shaped while actual delivery was skipped intentionally. SendEmailCodeOutcomeSuppressed SendEmailCodeOutcome = "suppressed" // SendEmailCodeOutcomeThrottled reports that a fresh challenge was created // but delivery was skipped because the resend cooldown was active. SendEmailCodeOutcomeThrottled SendEmailCodeOutcome = "throttled" // SendEmailCodeOutcomeFailed reports that the send flow reached an explicit // failure after a source-of-truth write. SendEmailCodeOutcomeFailed SendEmailCodeOutcome = "failed" ) // IsKnown reports whether SendEmailCodeOutcome belongs to the stable // authsession send-flow metric surface. func (o SendEmailCodeOutcome) IsKnown() bool { switch o { case SendEmailCodeOutcomeSent, SendEmailCodeOutcomeSuppressed, SendEmailCodeOutcomeThrottled, SendEmailCodeOutcomeFailed: return true default: return false } } // SendEmailCodeReason identifies the low-cardinality send-flow reason recorded // for suppressed, throttled, or failed outcomes. type SendEmailCodeReason string const ( // SendEmailCodeReasonBlocked reports that delivery was suppressed because // user policy already marked the e-mail as blocked. SendEmailCodeReasonBlocked SendEmailCodeReason = "blocked" // SendEmailCodeReasonMailSender reports that the delivery adapter itself // suppressed or failed the send attempt. SendEmailCodeReasonMailSender SendEmailCodeReason = "mail_sender" // SendEmailCodeReasonThrottled reports that delivery was skipped because the // resend cooldown was active. SendEmailCodeReasonThrottled SendEmailCodeReason = "throttled" ) // IsKnown reports whether SendEmailCodeReason belongs to the stable authsession // send-flow metric surface. func (r SendEmailCodeReason) IsKnown() bool { switch r { case "", SendEmailCodeReasonBlocked, SendEmailCodeReasonMailSender, SendEmailCodeReasonThrottled: return true default: return false } } // ConfirmEmailCodeOutcome identifies the coarse confirm-email-code result // recorded by authsession metrics. type ConfirmEmailCodeOutcome string const ( // ConfirmEmailCodeOutcomeSuccess reports that a device session was created // or idempotently recovered successfully. ConfirmEmailCodeOutcomeSuccess ConfirmEmailCodeOutcome = "success" ) // Runtime owns the authsession OpenTelemetry providers and custom // low-cardinality instruments. type Runtime struct { tracerProvider oteltrace.TracerProvider meterProvider metric.MeterProvider shutdownMu sync.Mutex shutdownDone bool shutdownErr error shutdownFns []func(context.Context) error publicHTTPRequests metric.Int64Counter publicHTTPDuration metric.Float64Histogram internalHTTPRequests metric.Int64Counter internalHTTPDuration metric.Float64Histogram sendEmailCodeAttempts metric.Int64Counter confirmEmailCodeAttempts metric.Int64Counter challengesCreated metric.Int64Counter sessionsCreated metric.Int64Counter sessionLimitRejections metric.Int64Counter projectionPublishFailures metric.Int64Counter userDirectoryOutcomes metric.Int64Counter sessionsRevoked metric.Int64Counter } // New constructs a lightweight telemetry runtime around meterProvider for // tests and embedded use cases that do not need process-level exporter wiring. func New(meterProvider metric.MeterProvider) (*Runtime, error) { return NewWithProviders(meterProvider, nil) } // NewWithProviders constructs a telemetry runtime around explicitly supplied // meterProvider and tracerProvider values. func NewWithProviders(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider) (*Runtime, error) { if meterProvider == nil { meterProvider = otel.GetMeterProvider() } if tracerProvider == nil { tracerProvider = otel.GetTracerProvider() } if meterProvider == nil { return nil, errors.New("new authsession telemetry runtime: nil meter provider") } if tracerProvider == nil { return nil, errors.New("new authsession telemetry runtime: nil tracer provider") } return buildRuntime(meterProvider, tracerProvider, nil) } // NewProcess constructs the process-wide authsession OpenTelemetry runtime from // cfg, installs the resulting providers globally, and returns the runtime. func NewProcess(ctx context.Context, cfg ProcessConfig, logger *zap.Logger) (*Runtime, error) { return newProcess(ctx, cfg, logger, os.Stdout, os.Stdout) } // TracerProvider returns the runtime tracer provider. func (r *Runtime) TracerProvider() oteltrace.TracerProvider { if r == nil || r.tracerProvider == nil { return otel.GetTracerProvider() } return r.tracerProvider } // MeterProvider returns the runtime meter provider. func (r *Runtime) MeterProvider() metric.MeterProvider { if r == nil || r.meterProvider == nil { return otel.GetMeterProvider() } return r.meterProvider } // Shutdown flushes and stops the configured telemetry providers. Shutdown is // idempotent. func (r *Runtime) Shutdown(ctx context.Context) error { if r == nil { return nil } r.shutdownMu.Lock() if r.shutdownDone { err := r.shutdownErr r.shutdownMu.Unlock() return err } r.shutdownDone = true shutdownFns := append([]func(context.Context) error(nil), r.shutdownFns...) r.shutdownMu.Unlock() var joined error for _, shutdownFn := range shutdownFns { joined = errors.Join(joined, shutdownFn(ctx)) } r.shutdownMu.Lock() r.shutdownErr = joined r.shutdownMu.Unlock() return joined } // RecordPublicHTTPRequest records one public HTTP request outcome. func (r *Runtime) RecordPublicHTTPRequest(ctx context.Context, attrs []attribute.KeyValue, duration time.Duration) { if r == nil { return } options := metric.WithAttributes(attrs...) r.publicHTTPRequests.Add(normalizeContext(ctx), 1, options) r.publicHTTPDuration.Record(normalizeContext(ctx), duration.Seconds()*1000, options) } // RecordInternalHTTPRequest records one trusted internal HTTP request outcome. func (r *Runtime) RecordInternalHTTPRequest(ctx context.Context, attrs []attribute.KeyValue, duration time.Duration) { if r == nil { return } options := metric.WithAttributes(attrs...) r.internalHTTPRequests.Add(normalizeContext(ctx), 1, options) r.internalHTTPDuration.Record(normalizeContext(ctx), duration.Seconds()*1000, options) } // RecordSendEmailCode records one low-cardinality send-email-code outcome. func (r *Runtime) RecordSendEmailCode(ctx context.Context, outcome SendEmailCodeOutcome, reason SendEmailCodeReason) { if r == nil || !outcome.IsKnown() || !reason.IsKnown() { return } attrs := []attribute.KeyValue{ attribute.String("outcome", string(outcome)), } if reason != "" { attrs = append(attrs, attribute.String("reason", string(reason))) } r.sendEmailCodeAttempts.Add(normalizeContext(ctx), 1, metric.WithAttributes(attrs...)) } // RecordConfirmEmailCode records one low-cardinality confirm-email-code // outcome. Success uses the stable value `success`; failures should pass the // stable service/public error code. func (r *Runtime) RecordConfirmEmailCode(ctx context.Context, outcome string) { if r == nil || outcome == "" { return } r.confirmEmailCodeAttempts.Add( normalizeContext(ctx), 1, metric.WithAttributes(attribute.String("outcome", outcome)), ) } // RecordChallengeCreated records one newly persisted challenge. func (r *Runtime) RecordChallengeCreated(ctx context.Context) { if r == nil { return } r.challengesCreated.Add(normalizeContext(ctx), 1) } // RecordSessionCreated records one newly persisted device session. func (r *Runtime) RecordSessionCreated(ctx context.Context) { if r == nil { return } r.sessionsCreated.Add(normalizeContext(ctx), 1) } // RecordSessionLimitRejection records one rejected confirmation caused by the // active-session limit. func (r *Runtime) RecordSessionLimitRejection(ctx context.Context) { if r == nil { return } r.sessionLimitRejections.Add(normalizeContext(ctx), 1) } // RecordProjectionPublishFailure records one exhausted projection publish // failure for operation. func (r *Runtime) RecordProjectionPublishFailure(ctx context.Context, operation string) { if r == nil || strings.TrimSpace(operation) == "" { return } r.projectionPublishFailures.Add( normalizeContext(ctx), 1, metric.WithAttributes(attribute.String("operation", operation)), ) } // RecordUserDirectoryOutcome records one user-directory boundary outcome for // operation. func (r *Runtime) RecordUserDirectoryOutcome(ctx context.Context, operation string, outcome string) { if r == nil || strings.TrimSpace(operation) == "" || strings.TrimSpace(outcome) == "" { return } r.userDirectoryOutcomes.Add( normalizeContext(ctx), 1, metric.WithAttributes( attribute.String("operation", operation), attribute.String("outcome", outcome), ), ) } // RecordSessionRevocations records count revoked sessions for operation and a // low-cardinality revoke-reason bucket. func (r *Runtime) RecordSessionRevocations(ctx context.Context, operation string, reasonCode string, count int64) { if r == nil || strings.TrimSpace(operation) == "" || count <= 0 { return } r.sessionsRevoked.Add( normalizeContext(ctx), count, metric.WithAttributes( attribute.String("operation", operation), attribute.String("reason_bucket", revokeReasonBucket(reasonCode)), ), ) } func newProcess(ctx context.Context, cfg ProcessConfig, logger *zap.Logger, stdoutTraceWriter io.Writer, stdoutMetricWriter io.Writer) (*Runtime, error) { if ctx == nil { return nil, errors.New("new authsession process telemetry: nil context") } if logger == nil { logger = zap.NewNop() } if err := cfg.Validate(); err != nil { return nil, fmt.Errorf("new authsession process telemetry: %w", err) } res, err := resource.New( ctx, resource.WithAttributes(attribute.String("service.name", cfg.ServiceName)), ) if err != nil { return nil, fmt.Errorf("new authsession process telemetry: resource: %w", err) } tracerProvider, err := newTracerProvider(ctx, res, cfg, stdoutTraceWriter) if err != nil { return nil, fmt.Errorf("new authsession process telemetry: tracer provider: %w", err) } meterProvider, err := newMeterProvider(ctx, res, cfg, stdoutMetricWriter) if err != nil { return nil, fmt.Errorf("new authsession process telemetry: meter provider: %w", err) } logger.Info( "authsession telemetry configured", zap.String("service_name", cfg.ServiceName), zap.String("traces_exporter", cfg.TracesExporter), zap.String("metrics_exporter", cfg.MetricsExporter), zap.Bool("stdout_traces_enabled", cfg.StdoutTracesEnabled), zap.Bool("stdout_metrics_enabled", cfg.StdoutMetricsEnabled), ) otel.SetTracerProvider(tracerProvider) otel.SetMeterProvider(meterProvider) otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator( propagation.TraceContext{}, propagation.Baggage{}, )) return buildRuntime( meterProvider, tracerProvider, []func(context.Context) error{ meterProvider.Shutdown, tracerProvider.Shutdown, }, ) } func buildRuntime(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider, shutdownFns []func(context.Context) error) (*Runtime, error) { meter := meterProvider.Meter(meterName) publicHTTPRequests, err := meter.Int64Counter("authsession.public_http.requests") if err != nil { return nil, fmt.Errorf("build authsession telemetry runtime: public HTTP requests counter: %w", err) } publicHTTPDuration, err := meter.Float64Histogram("authsession.public_http.duration", metric.WithUnit("ms")) if err != nil { return nil, fmt.Errorf("build authsession telemetry runtime: public HTTP duration histogram: %w", err) } internalHTTPRequests, err := meter.Int64Counter("authsession.internal_http.requests") if err != nil { return nil, fmt.Errorf("build authsession telemetry runtime: internal HTTP requests counter: %w", err) } internalHTTPDuration, err := meter.Float64Histogram("authsession.internal_http.duration", metric.WithUnit("ms")) if err != nil { return nil, fmt.Errorf("build authsession telemetry runtime: internal HTTP duration histogram: %w", err) } sendEmailCodeAttempts, err := meter.Int64Counter("authsession.send_email_code.attempts") if err != nil { return nil, fmt.Errorf("build authsession telemetry runtime: send email code attempts counter: %w", err) } confirmEmailCodeAttempts, err := meter.Int64Counter("authsession.confirm_email_code.attempts") if err != nil { return nil, fmt.Errorf("build authsession telemetry runtime: confirm email code attempts counter: %w", err) } challengesCreated, err := meter.Int64Counter("authsession.challenges.created") if err != nil { return nil, fmt.Errorf("build authsession telemetry runtime: challenges created counter: %w", err) } sessionsCreated, err := meter.Int64Counter("authsession.sessions.created") if err != nil { return nil, fmt.Errorf("build authsession telemetry runtime: sessions created counter: %w", err) } sessionLimitRejections, err := meter.Int64Counter("authsession.session_limit.rejections") if err != nil { return nil, fmt.Errorf("build authsession telemetry runtime: session limit rejections counter: %w", err) } projectionPublishFailures, err := meter.Int64Counter("authsession.projection.publish_failures") if err != nil { return nil, fmt.Errorf("build authsession telemetry runtime: projection publish failures counter: %w", err) } userDirectoryOutcomes, err := meter.Int64Counter("authsession.user_directory.outcomes") if err != nil { return nil, fmt.Errorf("build authsession telemetry runtime: user directory outcomes counter: %w", err) } sessionsRevoked, err := meter.Int64Counter("authsession.sessions.revoked") if err != nil { return nil, fmt.Errorf("build authsession telemetry runtime: sessions revoked counter: %w", err) } return &Runtime{ tracerProvider: tracerProvider, meterProvider: meterProvider, shutdownFns: shutdownFns, publicHTTPRequests: publicHTTPRequests, publicHTTPDuration: publicHTTPDuration, internalHTTPRequests: internalHTTPRequests, internalHTTPDuration: internalHTTPDuration, sendEmailCodeAttempts: sendEmailCodeAttempts, confirmEmailCodeAttempts: confirmEmailCodeAttempts, challengesCreated: challengesCreated, sessionsCreated: sessionsCreated, sessionLimitRejections: sessionLimitRejections, projectionPublishFailures: projectionPublishFailures, userDirectoryOutcomes: userDirectoryOutcomes, sessionsRevoked: sessionsRevoked, }, nil } func newTracerProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig, stdoutWriter io.Writer) (*sdktrace.TracerProvider, error) { options := []sdktrace.TracerProviderOption{sdktrace.WithResource(res)} if cfg.TracesExporter == processExporterOTLP { exporter, err := newOTLPTraceExporter(ctx, cfg.TracesProtocol) if err != nil { return nil, err } options = append(options, sdktrace.WithBatcher(exporter)) } if cfg.StdoutTracesEnabled { exporter, err := stdouttrace.New( stdouttrace.WithPrettyPrint(), stdouttrace.WithWriter(stdoutWriter), ) if err != nil { return nil, err } options = append(options, sdktrace.WithBatcher(exporter)) } return sdktrace.NewTracerProvider(options...), nil } func newMeterProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig, stdoutWriter io.Writer) (*sdkmetric.MeterProvider, error) { options := []sdkmetric.Option{sdkmetric.WithResource(res)} if cfg.MetricsExporter == processExporterOTLP { exporter, err := newOTLPMetricExporter(ctx, cfg.MetricsProtocol) if err != nil { return nil, err } options = append(options, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter))) } if cfg.StdoutMetricsEnabled { exporter, err := stdoutmetric.New( stdoutmetric.WithPrettyPrint(), stdoutmetric.WithWriter(stdoutWriter), ) if err != nil { return nil, err } options = append(options, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter))) } return sdkmetric.NewMeterProvider(options...), nil } func newOTLPTraceExporter(ctx context.Context, protocol string) (sdktrace.SpanExporter, error) { switch protocol { case "", "http/protobuf": return otlptracehttp.New(ctx) case "grpc": return otlptracegrpc.New(ctx) default: return nil, fmt.Errorf("unsupported OTLP traces protocol %q", protocol) } } func newOTLPMetricExporter(ctx context.Context, protocol string) (sdkmetric.Exporter, error) { switch protocol { case "", "http/protobuf": return otlpmetrichttp.New(ctx) case "grpc": return otlpmetricgrpc.New(ctx) default: return nil, fmt.Errorf("unsupported OTLP metrics protocol %q", protocol) } } func revokeReasonBucket(reasonCode string) string { switch strings.TrimSpace(reasonCode) { case devicesession.RevokeReasonUserBlocked.String(): return "user_blocked" case "confirm_race_repair": return "confirm_race_repair" default: return "custom" } } func normalizeContext(ctx context.Context) context.Context { if ctx == nil { return context.Background() } return ctx }