feat: authsession service

This commit is contained in:
Ilia Denisov
2026-04-08 16:23:07 +02:00
committed by GitHub
parent 28f04916af
commit 86a68ed9d0
174 changed files with 31732 additions and 112 deletions
+620
View File
@@ -0,0 +1,620 @@
// Package telemetry provides shared OpenTelemetry runtime helpers and
// low-cardinality authsession instruments.
package telemetry
import (
"context"
"errors"
"fmt"
"galaxy/authsession/internal/domain/devicesession"
"io"
"os"
"strings"
"sync"
"time"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp"
"go.opentelemetry.io/otel/exporters/stdout/stdoutmetric"
"go.opentelemetry.io/otel/exporters/stdout/stdouttrace"
"go.opentelemetry.io/otel/metric"
"go.opentelemetry.io/otel/propagation"
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
"go.opentelemetry.io/otel/sdk/resource"
sdktrace "go.opentelemetry.io/otel/sdk/trace"
oteltrace "go.opentelemetry.io/otel/trace"
"go.uber.org/zap"
)
// meterName is the instrumentation scope name passed to the meter provider
// when the authsession instruments are created.
const meterName = "galaxy/authsession"
// Accepted values for the exporter and OTLP protocol fields of ProcessConfig.
const (
	processExporterNone         = "none"
	processExporterOTLP         = "otlp"
	processProtocolHTTPProtobuf = "http/protobuf"
	processProtocolGRPC         = "grpc"
)

// ProcessConfig describes how the process-wide OpenTelemetry runtime is wired.
type ProcessConfig struct {
	// ServiceName is the value reported as the OpenTelemetry `service.name`
	// resource attribute.
	ServiceName string

	// TracesExporter names the external traces exporter: `none` or `otlp`.
	TracesExporter string

	// MetricsExporter names the external metrics exporter: `none` or `otlp`.
	MetricsExporter string

	// TracesProtocol is the OTLP transport used for traces when TracesExporter
	// is `otlp`: `http/protobuf`, `grpc`, or empty.
	TracesProtocol string

	// MetricsProtocol is the OTLP transport used for metrics when
	// MetricsExporter is `otlp`: `http/protobuf`, `grpc`, or empty.
	MetricsProtocol string

	// StdoutTracesEnabled additionally exports traces to stdout, which is
	// meant for local development and debugging.
	StdoutTracesEnabled bool

	// StdoutMetricsEnabled additionally exports metrics to stdout, which is
	// meant for local development and debugging.
	StdoutMetricsEnabled bool
}

// Validate checks the exporter and protocol selections in cfg and returns an
// error describing the first unsupported value it finds. Fields are checked in
// the order traces exporter, metrics exporter, traces protocol, metrics
// protocol. An empty protocol is accepted; an empty exporter is not.
func (cfg ProcessConfig) Validate() error {
	exporterOK := func(name string) bool {
		return name == processExporterNone || name == processExporterOTLP
	}
	protocolOK := func(name string) bool {
		return name == "" || name == processProtocolHTTPProtobuf || name == processProtocolGRPC
	}

	switch {
	case !exporterOK(cfg.TracesExporter):
		return fmt.Errorf("unsupported traces exporter %q", cfg.TracesExporter)
	case !exporterOK(cfg.MetricsExporter):
		return fmt.Errorf("unsupported metrics exporter %q", cfg.MetricsExporter)
	case !protocolOK(cfg.TracesProtocol):
		return fmt.Errorf("unsupported OTLP traces protocol %q", cfg.TracesProtocol)
	case !protocolOK(cfg.MetricsProtocol):
		return fmt.Errorf("unsupported OTLP metrics protocol %q", cfg.MetricsProtocol)
	default:
		return nil
	}
}
// SendEmailCodeOutcome is the coarse result label attached to the
// send-email-code metric.
type SendEmailCodeOutcome string

const (
	// SendEmailCodeOutcomeSent means the login code was handed off for
	// delivery successfully.
	SendEmailCodeOutcomeSent SendEmailCodeOutcome = "sent"

	// SendEmailCodeOutcomeSuppressed means the outward response stayed
	// success-shaped while actual delivery was intentionally skipped.
	SendEmailCodeOutcomeSuppressed SendEmailCodeOutcome = "suppressed"

	// SendEmailCodeOutcomeThrottled means a fresh challenge was created but
	// delivery was skipped because the resend cooldown was active.
	SendEmailCodeOutcomeThrottled SendEmailCodeOutcome = "throttled"

	// SendEmailCodeOutcomeFailed means the send flow reached an explicit
	// failure after a source-of-truth write.
	SendEmailCodeOutcomeFailed SendEmailCodeOutcome = "failed"
)

// IsKnown reports whether o is one of the declared SendEmailCodeOutcome
// constants. Any other value, including the empty string, is unknown.
func (o SendEmailCodeOutcome) IsKnown() bool {
	return o == SendEmailCodeOutcomeSent ||
		o == SendEmailCodeOutcomeSuppressed ||
		o == SendEmailCodeOutcomeThrottled ||
		o == SendEmailCodeOutcomeFailed
}
// SendEmailCodeReason is the low-cardinality reason label that accompanies a
// suppressed, throttled, or failed send-email-code outcome.
type SendEmailCodeReason string

const (
	// SendEmailCodeReasonBlocked means delivery was suppressed because user
	// policy already marked the e-mail as blocked.
	SendEmailCodeReasonBlocked SendEmailCodeReason = "blocked"

	// SendEmailCodeReasonMailSender means the delivery adapter itself
	// suppressed or failed the send attempt.
	SendEmailCodeReasonMailSender SendEmailCodeReason = "mail_sender"

	// SendEmailCodeReasonThrottled means delivery was skipped because the
	// resend cooldown was active.
	SendEmailCodeReasonThrottled SendEmailCodeReason = "throttled"
)

// IsKnown reports whether r is the empty reason or one of the declared
// SendEmailCodeReason constants. The empty value counts as known so that
// outcomes without a reason can still be recorded.
func (r SendEmailCodeReason) IsKnown() bool {
	return r == "" ||
		r == SendEmailCodeReasonBlocked ||
		r == SendEmailCodeReasonMailSender ||
		r == SendEmailCodeReasonThrottled
}
// ConfirmEmailCodeOutcome identifies the coarse confirm-email-code result
// recorded by authsession metrics.
//
// NOTE(review): RecordConfirmEmailCode accepts a plain string outcome, so
// callers presumably convert these values with string(...) — confirm usage.
type ConfirmEmailCodeOutcome string
const (
// ConfirmEmailCodeOutcomeSuccess reports that a device session was created
// or idempotently recovered successfully. Its value matches the stable
// `success` label described on RecordConfirmEmailCode.
ConfirmEmailCodeOutcomeSuccess ConfirmEmailCodeOutcome = "success"
)
// Runtime owns the authsession OpenTelemetry providers and custom
// low-cardinality instruments. Shutdown and the Record* methods are no-ops on
// a nil *Runtime; the provider accessors fall back to the global providers.
type Runtime struct {
// tracerProvider and meterProvider are the providers returned by the
// TracerProvider and MeterProvider accessors.
tracerProvider oteltrace.TracerProvider
meterProvider metric.MeterProvider
// shutdownMu guards the shutdown bookkeeping fields below.
shutdownMu sync.Mutex
// shutdownDone records that Shutdown has already been invoked.
shutdownDone bool
// shutdownErr stores the joined error produced by the shutdown hooks.
shutdownErr error
// shutdownFns are the hooks Shutdown runs, in order. It is nil for runtimes
// built through New and NewWithProviders.
shutdownFns []func(context.Context) error
// Public HTTP request counter and duration histogram (milliseconds).
publicHTTPRequests metric.Int64Counter
publicHTTPDuration metric.Float64Histogram
// Internal HTTP request counter and duration histogram (milliseconds).
internalHTTPRequests metric.Int64Counter
internalHTTPDuration metric.Float64Histogram
// Counters for the send-email-code and confirm-email-code flows, labelled by
// outcome.
sendEmailCodeAttempts metric.Int64Counter
confirmEmailCodeAttempts metric.Int64Counter
// Counters for persisted challenges and device sessions, and for
// confirmations rejected by the active-session limit.
challengesCreated metric.Int64Counter
sessionsCreated metric.Int64Counter
sessionLimitRejections metric.Int64Counter
// Counter for projection publish failures, labelled by operation.
projectionPublishFailures metric.Int64Counter
// Counter for user-directory outcomes, labelled by operation and outcome.
userDirectoryOutcomes metric.Int64Counter
// Counter for revoked sessions, labelled by operation and reason bucket.
sessionsRevoked metric.Int64Counter
}
// New constructs a lightweight telemetry runtime around meterProvider for
// tests and embedded use cases that do not need process-level exporter wiring.
// It delegates to NewWithProviders with a nil tracer provider, so the global
// OpenTelemetry tracer provider is used; a nil meterProvider likewise falls
// back to the global meter provider. The returned runtime has no shutdown
// hooks registered.
func New(meterProvider metric.MeterProvider) (*Runtime, error) {
return NewWithProviders(meterProvider, nil)
}
// NewWithProviders builds a telemetry runtime on top of the supplied
// providers. A nil meterProvider or tracerProvider is replaced by the
// corresponding global OpenTelemetry provider; an error is returned if a
// provider is still nil after that fallback or if an instrument cannot be
// created. The resulting runtime has no shutdown hooks.
func NewWithProviders(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider) (*Runtime, error) {
	meters := meterProvider
	if meters == nil {
		meters = otel.GetMeterProvider()
	}
	tracers := tracerProvider
	if tracers == nil {
		tracers = otel.GetTracerProvider()
	}

	switch {
	case meters == nil:
		return nil, errors.New("new authsession telemetry runtime: nil meter provider")
	case tracers == nil:
		return nil, errors.New("new authsession telemetry runtime: nil tracer provider")
	}

	return buildRuntime(meters, tracers, nil)
}
// NewProcess constructs the process-wide authsession OpenTelemetry runtime from
// cfg, installs the resulting providers globally, and returns the runtime.
// Output of the optional stdout trace and metric exporters is written to
// os.Stdout. It returns an error when ctx is nil, cfg fails validation, or the
// telemetry stack cannot be constructed; a nil logger is tolerated.
func NewProcess(ctx context.Context, cfg ProcessConfig, logger *zap.Logger) (*Runtime, error) {
return newProcess(ctx, cfg, logger, os.Stdout, os.Stdout)
}
// TracerProvider returns the tracer provider held by the runtime. When the
// receiver is nil or holds no tracer provider, the global OpenTelemetry tracer
// provider is returned instead.
func (r *Runtime) TracerProvider() oteltrace.TracerProvider {
	if r != nil && r.tracerProvider != nil {
		return r.tracerProvider
	}
	return otel.GetTracerProvider()
}
// MeterProvider returns the meter provider held by the runtime. When the
// receiver is nil or holds no meter provider, the global OpenTelemetry meter
// provider is returned instead.
func (r *Runtime) MeterProvider() metric.MeterProvider {
	if r != nil && r.meterProvider != nil {
		return r.meterProvider
	}
	return otel.GetMeterProvider()
}
// Shutdown flushes and stops the configured telemetry providers by running the
// registered shutdown hooks in order and joining their errors.
//
// Shutdown is idempotent: the hooks run at most once, and every later call
// returns the error recorded by the first. The mutex is held for the whole
// shutdown so that a concurrent caller waits for the in-flight shutdown and
// observes its real result instead of returning early with a nil error. Hooks
// must therefore not call Shutdown on the same runtime.
//
// A nil receiver is a no-op. A nil ctx is treated as context.Background().
func (r *Runtime) Shutdown(ctx context.Context) error {
	if r == nil {
		return nil
	}
	if ctx == nil {
		ctx = context.Background()
	}

	r.shutdownMu.Lock()
	defer r.shutdownMu.Unlock()

	if r.shutdownDone {
		return r.shutdownErr
	}
	// Mark completion before running the hooks so they are never re-run, even
	// if one of them panics.
	r.shutdownDone = true

	var joined error
	for _, shutdownFn := range r.shutdownFns {
		joined = errors.Join(joined, shutdownFn(ctx))
	}
	r.shutdownErr = joined

	return joined
}
// RecordPublicHTTPRequest counts one public HTTP request and records its
// duration in milliseconds, both tagged with attrs. It is a no-op on a nil
// receiver; a nil ctx is replaced by a background context.
func (r *Runtime) RecordPublicHTTPRequest(ctx context.Context, attrs []attribute.KeyValue, duration time.Duration) {
	if r == nil {
		return
	}

	attrOption := metric.WithAttributes(attrs...)
	recordCtx := normalizeContext(ctx)
	elapsedMillis := duration.Seconds() * 1000

	r.publicHTTPRequests.Add(recordCtx, 1, attrOption)
	r.publicHTTPDuration.Record(recordCtx, elapsedMillis, attrOption)
}
// RecordInternalHTTPRequest counts one trusted internal HTTP request and
// records its duration in milliseconds, both tagged with attrs. It is a no-op
// on a nil receiver; a nil ctx is replaced by a background context.
func (r *Runtime) RecordInternalHTTPRequest(ctx context.Context, attrs []attribute.KeyValue, duration time.Duration) {
	if r == nil {
		return
	}

	attrOption := metric.WithAttributes(attrs...)
	recordCtx := normalizeContext(ctx)
	elapsedMillis := duration.Seconds() * 1000

	r.internalHTTPRequests.Add(recordCtx, 1, attrOption)
	r.internalHTTPDuration.Record(recordCtx, elapsedMillis, attrOption)
}
// RecordSendEmailCode counts one send-email-code attempt labelled with
// outcome and, when reason is non-empty, with reason. Calls with an unknown
// outcome or reason, or on a nil receiver, are dropped.
func (r *Runtime) RecordSendEmailCode(ctx context.Context, outcome SendEmailCodeOutcome, reason SendEmailCodeReason) {
	if r == nil {
		return
	}
	if !(outcome.IsKnown() && reason.IsKnown()) {
		return
	}

	labels := make([]attribute.KeyValue, 0, 2)
	labels = append(labels, attribute.String("outcome", string(outcome)))
	// The reason label is optional and only attached when one was supplied.
	if reason != "" {
		labels = append(labels, attribute.String("reason", string(reason)))
	}

	r.sendEmailCodeAttempts.Add(normalizeContext(ctx), 1, metric.WithAttributes(labels...))
}
// RecordConfirmEmailCode records one low-cardinality confirm-email-code
// outcome. Success uses the stable value `success`; failures should pass the
// stable service/public error code.
//
// The call is dropped on a nil receiver and when outcome is empty or consists
// only of whitespace, matching the guard used by the other string-labelled
// recorders so that blank label values never reach the metric. The outcome is
// recorded exactly as given.
func (r *Runtime) RecordConfirmEmailCode(ctx context.Context, outcome string) {
	if r == nil || strings.TrimSpace(outcome) == "" {
		return
	}

	r.confirmEmailCodeAttempts.Add(
		normalizeContext(ctx),
		1,
		metric.WithAttributes(attribute.String("outcome", outcome)),
	)
}
// RecordChallengeCreated increments the created-challenges counter by one. It
// does nothing on a nil receiver.
func (r *Runtime) RecordChallengeCreated(ctx context.Context) {
	if r != nil {
		r.challengesCreated.Add(normalizeContext(ctx), 1)
	}
}
// RecordSessionCreated increments the created-sessions counter by one. It
// does nothing on a nil receiver.
func (r *Runtime) RecordSessionCreated(ctx context.Context) {
	if r != nil {
		r.sessionsCreated.Add(normalizeContext(ctx), 1)
	}
}
// RecordSessionLimitRejection increments by one the counter of confirmations
// rejected because of the active-session limit. It does nothing on a nil
// receiver.
func (r *Runtime) RecordSessionLimitRejection(ctx context.Context) {
	if r != nil {
		r.sessionLimitRejections.Add(normalizeContext(ctx), 1)
	}
}
// RecordProjectionPublishFailure counts one exhausted projection publish
// failure labelled with operation. The call is dropped on a nil receiver or
// when operation is blank.
func (r *Runtime) RecordProjectionPublishFailure(ctx context.Context, operation string) {
	if r == nil {
		return
	}
	if strings.TrimSpace(operation) == "" {
		return
	}

	labels := metric.WithAttributes(attribute.String("operation", operation))
	r.projectionPublishFailures.Add(normalizeContext(ctx), 1, labels)
}
// RecordUserDirectoryOutcome counts one user-directory boundary call labelled
// with operation and outcome. The call is dropped on a nil receiver or when
// either label is blank.
func (r *Runtime) RecordUserDirectoryOutcome(ctx context.Context, operation string, outcome string) {
	if r == nil {
		return
	}
	if strings.TrimSpace(operation) == "" || strings.TrimSpace(outcome) == "" {
		return
	}

	labels := metric.WithAttributes(
		attribute.String("operation", operation),
		attribute.String("outcome", outcome),
	)
	r.userDirectoryOutcomes.Add(normalizeContext(ctx), 1, labels)
}
// RecordSessionRevocations adds count to the revoked-sessions counter,
// labelled with operation and the low-cardinality bucket derived from
// reasonCode. The call is dropped on a nil receiver, when operation is blank,
// or when count is not positive.
func (r *Runtime) RecordSessionRevocations(ctx context.Context, operation string, reasonCode string, count int64) {
	if r == nil {
		return
	}
	if count <= 0 || strings.TrimSpace(operation) == "" {
		return
	}

	labels := metric.WithAttributes(
		attribute.String("operation", operation),
		attribute.String("reason_bucket", revokeReasonBucket(reasonCode)),
	)
	r.sessionsRevoked.Add(normalizeContext(ctx), count, labels)
}
// newProcess builds the process-wide telemetry runtime described by cfg.
//
// It validates cfg, creates a resource carrying the `service.name` attribute,
// constructs the SDK tracer and meter providers (stdout exporter output goes to
// stdoutTraceWriter and stdoutMetricWriter), builds the runtime instruments,
// and only then installs the providers and a TraceContext+Baggage propagator
// as the OpenTelemetry globals.
//
// A nil ctx is rejected; a nil logger is replaced with a no-op logger. When a
// later construction step fails, the providers created by earlier steps are
// shut down so their exporters are released, and the globals are left
// untouched. The returned runtime shuts down the meter provider and then the
// tracer provider.
func newProcess(ctx context.Context, cfg ProcessConfig, logger *zap.Logger, stdoutTraceWriter io.Writer, stdoutMetricWriter io.Writer) (*Runtime, error) {
	if ctx == nil {
		return nil, errors.New("new authsession process telemetry: nil context")
	}
	if logger == nil {
		logger = zap.NewNop()
	}
	if err := cfg.Validate(); err != nil {
		return nil, fmt.Errorf("new authsession process telemetry: %w", err)
	}

	// release shuts down already-constructed providers after cause aborted the
	// build, appending any cleanup failure to cause.
	release := func(cause error, shutdownFns ...func(context.Context) error) error {
		for _, shutdownFn := range shutdownFns {
			if err := shutdownFn(ctx); err != nil {
				cause = errors.Join(cause, fmt.Errorf("new authsession process telemetry: cleanup: %w", err))
			}
		}
		return cause
	}

	res, err := resource.New(
		ctx,
		resource.WithAttributes(attribute.String("service.name", cfg.ServiceName)),
	)
	if err != nil {
		return nil, fmt.Errorf("new authsession process telemetry: resource: %w", err)
	}

	tracerProvider, err := newTracerProvider(ctx, res, cfg, stdoutTraceWriter)
	if err != nil {
		return nil, fmt.Errorf("new authsession process telemetry: tracer provider: %w", err)
	}

	meterProvider, err := newMeterProvider(ctx, res, cfg, stdoutMetricWriter)
	if err != nil {
		// The tracer provider already exists; do not leak it.
		return nil, release(
			fmt.Errorf("new authsession process telemetry: meter provider: %w", err),
			tracerProvider.Shutdown,
		)
	}

	shutdownFns := []func(context.Context) error{
		meterProvider.Shutdown,
		tracerProvider.Shutdown,
	}

	// Build the runtime before touching global state so that a failure here
	// cannot leave the process with globals nobody will ever shut down.
	rt, err := buildRuntime(meterProvider, tracerProvider, shutdownFns)
	if err != nil {
		return nil, release(err, shutdownFns...)
	}

	logger.Info(
		"authsession telemetry configured",
		zap.String("service_name", cfg.ServiceName),
		zap.String("traces_exporter", cfg.TracesExporter),
		zap.String("metrics_exporter", cfg.MetricsExporter),
		zap.Bool("stdout_traces_enabled", cfg.StdoutTracesEnabled),
		zap.Bool("stdout_metrics_enabled", cfg.StdoutMetricsEnabled),
	)

	otel.SetTracerProvider(tracerProvider)
	otel.SetMeterProvider(meterProvider)
	otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
		propagation.TraceContext{},
		propagation.Baggage{},
	))

	return rt, nil
}
// buildRuntime obtains the authsession meter from meterProvider, creates every
// instrument the runtime exposes, and returns a Runtime holding the providers,
// the instruments, and shutdownFns. Instruments are created in field order and
// creation stops at the first failure, whose wrapped error is returned.
func buildRuntime(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider, shutdownFns []func(context.Context) error) (*Runtime, error) {
	meter := meterProvider.Meter(meterName)

	// buildErr is sticky: once set, the helpers below stop creating
	// instruments so that only the first failure is reported.
	var buildErr error

	// newCounter creates an Int64Counter; errFormat must contain one %w verb.
	newCounter := func(name string, errFormat string) metric.Int64Counter {
		if buildErr != nil {
			return nil
		}
		counter, err := meter.Int64Counter(name)
		if err != nil {
			buildErr = fmt.Errorf(errFormat, err)
		}
		return counter
	}

	// newMillisHistogram creates a Float64Histogram with unit "ms"; errFormat
	// must contain one %w verb.
	newMillisHistogram := func(name string, errFormat string) metric.Float64Histogram {
		if buildErr != nil {
			return nil
		}
		histogram, err := meter.Float64Histogram(name, metric.WithUnit("ms"))
		if err != nil {
			buildErr = fmt.Errorf(errFormat, err)
		}
		return histogram
	}

	// Calls inside the literal run in lexical order, which fixes the
	// instrument creation order.
	rt := &Runtime{
		tracerProvider: tracerProvider,
		meterProvider:  meterProvider,
		shutdownFns:    shutdownFns,
		publicHTTPRequests: newCounter(
			"authsession.public_http.requests",
			"build authsession telemetry runtime: public HTTP requests counter: %w",
		),
		publicHTTPDuration: newMillisHistogram(
			"authsession.public_http.duration",
			"build authsession telemetry runtime: public HTTP duration histogram: %w",
		),
		internalHTTPRequests: newCounter(
			"authsession.internal_http.requests",
			"build authsession telemetry runtime: internal HTTP requests counter: %w",
		),
		internalHTTPDuration: newMillisHistogram(
			"authsession.internal_http.duration",
			"build authsession telemetry runtime: internal HTTP duration histogram: %w",
		),
		sendEmailCodeAttempts: newCounter(
			"authsession.send_email_code.attempts",
			"build authsession telemetry runtime: send email code attempts counter: %w",
		),
		confirmEmailCodeAttempts: newCounter(
			"authsession.confirm_email_code.attempts",
			"build authsession telemetry runtime: confirm email code attempts counter: %w",
		),
		challengesCreated: newCounter(
			"authsession.challenges.created",
			"build authsession telemetry runtime: challenges created counter: %w",
		),
		sessionsCreated: newCounter(
			"authsession.sessions.created",
			"build authsession telemetry runtime: sessions created counter: %w",
		),
		sessionLimitRejections: newCounter(
			"authsession.session_limit.rejections",
			"build authsession telemetry runtime: session limit rejections counter: %w",
		),
		projectionPublishFailures: newCounter(
			"authsession.projection.publish_failures",
			"build authsession telemetry runtime: projection publish failures counter: %w",
		),
		userDirectoryOutcomes: newCounter(
			"authsession.user_directory.outcomes",
			"build authsession telemetry runtime: user directory outcomes counter: %w",
		),
		sessionsRevoked: newCounter(
			"authsession.sessions.revoked",
			"build authsession telemetry runtime: sessions revoked counter: %w",
		),
	}
	if buildErr != nil {
		return nil, buildErr
	}

	return rt, nil
}
// newTracerProvider assembles an SDK tracer provider bound to res. A batching
// OTLP exporter is attached when cfg selects the `otlp` traces exporter, and a
// batching pretty-printed stdout exporter writing to stdoutWriter is attached
// when cfg.StdoutTracesEnabled is set. Exporter construction errors are
// returned unwrapped.
func newTracerProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig, stdoutWriter io.Writer) (*sdktrace.TracerProvider, error) {
	providerOptions := make([]sdktrace.TracerProviderOption, 0, 3)
	providerOptions = append(providerOptions, sdktrace.WithResource(res))

	if cfg.TracesExporter == processExporterOTLP {
		otlpExporter, err := newOTLPTraceExporter(ctx, cfg.TracesProtocol)
		if err != nil {
			return nil, err
		}
		providerOptions = append(providerOptions, sdktrace.WithBatcher(otlpExporter))
	}

	if !cfg.StdoutTracesEnabled {
		return sdktrace.NewTracerProvider(providerOptions...), nil
	}

	stdoutExporter, err := stdouttrace.New(
		stdouttrace.WithPrettyPrint(),
		stdouttrace.WithWriter(stdoutWriter),
	)
	if err != nil {
		return nil, err
	}
	providerOptions = append(providerOptions, sdktrace.WithBatcher(stdoutExporter))

	return sdktrace.NewTracerProvider(providerOptions...), nil
}
// newMeterProvider assembles an SDK meter provider bound to res. A periodic
// reader over an OTLP exporter is attached when cfg selects the `otlp` metrics
// exporter, and a periodic reader over a pretty-printed stdout exporter
// writing to stdoutWriter is attached when cfg.StdoutMetricsEnabled is set.
// Exporter construction errors are returned unwrapped.
func newMeterProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig, stdoutWriter io.Writer) (*sdkmetric.MeterProvider, error) {
	providerOptions := make([]sdkmetric.Option, 0, 3)
	providerOptions = append(providerOptions, sdkmetric.WithResource(res))

	if cfg.MetricsExporter == processExporterOTLP {
		otlpExporter, err := newOTLPMetricExporter(ctx, cfg.MetricsProtocol)
		if err != nil {
			return nil, err
		}
		otlpReader := sdkmetric.NewPeriodicReader(otlpExporter)
		providerOptions = append(providerOptions, sdkmetric.WithReader(otlpReader))
	}

	if !cfg.StdoutMetricsEnabled {
		return sdkmetric.NewMeterProvider(providerOptions...), nil
	}

	stdoutExporter, err := stdoutmetric.New(
		stdoutmetric.WithPrettyPrint(),
		stdoutmetric.WithWriter(stdoutWriter),
	)
	if err != nil {
		return nil, err
	}
	stdoutReader := sdkmetric.NewPeriodicReader(stdoutExporter)
	providerOptions = append(providerOptions, sdkmetric.WithReader(stdoutReader))

	return sdkmetric.NewMeterProvider(providerOptions...), nil
}
// newOTLPTraceExporter creates the OTLP span exporter for protocol. An empty
// protocol or `http/protobuf` selects the HTTP exporter and `grpc` selects the
// gRPC exporter; any other value is reported as unsupported.
func newOTLPTraceExporter(ctx context.Context, protocol string) (sdktrace.SpanExporter, error) {
	if protocol == "" || protocol == "http/protobuf" {
		return otlptracehttp.New(ctx)
	}
	if protocol == "grpc" {
		return otlptracegrpc.New(ctx)
	}
	return nil, fmt.Errorf("unsupported OTLP traces protocol %q", protocol)
}
// newOTLPMetricExporter creates the OTLP metric exporter for protocol. An
// empty protocol or `http/protobuf` selects the HTTP exporter and `grpc`
// selects the gRPC exporter; any other value is reported as unsupported.
func newOTLPMetricExporter(ctx context.Context, protocol string) (sdkmetric.Exporter, error) {
	if protocol == "" || protocol == "http/protobuf" {
		return otlpmetrichttp.New(ctx)
	}
	if protocol == "grpc" {
		return otlpmetricgrpc.New(ctx)
	}
	return nil, fmt.Errorf("unsupported OTLP metrics protocol %q", protocol)
}
// revokeReasonBucket maps a revoke reason code to a low-cardinality metric
// label. After trimming surrounding whitespace, the user-blocked reason maps
// to `user_blocked`, `confirm_race_repair` maps to itself, and every other
// code is bucketed as `custom`.
func revokeReasonBucket(reasonCode string) string {
	trimmed := strings.TrimSpace(reasonCode)
	if trimmed == devicesession.RevokeReasonUserBlocked.String() {
		return "user_blocked"
	}
	if trimmed == "confirm_race_repair" {
		return "confirm_race_repair"
	}
	return "custom"
}
// normalizeContext returns ctx unchanged when it is non-nil and a background
// context otherwise, so instruments are never handed a nil context.
func normalizeContext(ctx context.Context) context.Context {
	if ctx != nil {
		return ctx
	}
	return context.Background()
}