Files
galaxy-game/user/internal/telemetry/runtime.go
T
2026-04-10 19:05:02 +02:00

550 lines
17 KiB
Go

// Package telemetry provides shared OpenTelemetry runtime helpers and
// low-cardinality user-service instruments.
package telemetry
import (
"context"
"errors"
"fmt"
"io"
"log/slog"
"net/http"
"os"
"strings"
"sync"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp"
otelprom "go.opentelemetry.io/otel/exporters/prometheus"
"go.opentelemetry.io/otel/exporters/stdout/stdoutmetric"
"go.opentelemetry.io/otel/exporters/stdout/stdouttrace"
"go.opentelemetry.io/otel/metric"
"go.opentelemetry.io/otel/propagation"
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
"go.opentelemetry.io/otel/sdk/resource"
sdktrace "go.opentelemetry.io/otel/sdk/trace"
oteltrace "go.opentelemetry.io/otel/trace"
)
// meterName is the instrumentation scope name passed to
// MeterProvider.Meter when the user-service instruments are created.
const meterName = "galaxy/user"
// Default and accepted configuration values for ProcessConfig.
const (
// defaultServiceName is the `service.name` resource attribute used when
// ProcessConfig.ServiceName is blank.
defaultServiceName = "galaxy-user"
// processExporterNone selects no external exporter for a signal.
processExporterNone = "none"
// processExporterOTLP selects the OTLP exporter for a signal.
processExporterOTLP = "otlp"
// processProtocolHTTPProtobuf selects OTLP over HTTP; it is also the
// protocol used when the protocol field is left empty.
processProtocolHTTPProtobuf = "http/protobuf"
// processProtocolGRPC selects OTLP over gRPC.
processProtocolGRPC = "grpc"
)
// ProcessConfig configures the process-wide OpenTelemetry runtime.
//
// Validate reports whether the exporter and protocol fields hold supported
// values.
type ProcessConfig struct {
// ServiceName overrides the default OpenTelemetry service name. A blank or
// whitespace-only value falls back to defaultServiceName.
ServiceName string
// TracesExporter selects the external traces exporter. Supported values are
// `none` and `otlp`; the empty string is rejected by Validate.
TracesExporter string
// MetricsExporter selects the external metrics exporter. Supported values
// are `none` and `otlp`; the empty string is rejected by Validate.
MetricsExporter string
// TracesProtocol selects the OTLP traces protocol when TracesExporter is
// `otlp`. Supported values are `http/protobuf` and `grpc`; the empty string
// selects `http/protobuf`.
TracesProtocol string
// MetricsProtocol selects the OTLP metrics protocol when MetricsExporter is
// `otlp`. Supported values are `http/protobuf` and `grpc`; the empty string
// selects `http/protobuf`.
MetricsProtocol string
// StdoutTracesEnabled enables the additional pretty-printed stdout trace
// exporter used for local development and debugging. It is independent of
// TracesExporter.
StdoutTracesEnabled bool
// StdoutMetricsEnabled enables the additional pretty-printed stdout metric
// exporter used for local development and debugging. It is independent of
// MetricsExporter.
StdoutMetricsEnabled bool
}
// Validate checks that cfg names only supported exporters and OTLP protocols.
//
// Each exporter field must be exactly `none` or `otlp`; an empty exporter is
// rejected. Each protocol field may be empty, `http/protobuf`, or `grpc`, and
// is checked even when the matching exporter is `none`. Checks run in the
// order traces exporter, metrics exporter, traces protocol, metrics protocol,
// and the first violation is returned.
func (cfg ProcessConfig) Validate() error {
	if cfg.TracesExporter != processExporterNone && cfg.TracesExporter != processExporterOTLP {
		return fmt.Errorf("unsupported traces exporter %q", cfg.TracesExporter)
	}
	if cfg.MetricsExporter != processExporterNone && cfg.MetricsExporter != processExporterOTLP {
		return fmt.Errorf("unsupported metrics exporter %q", cfg.MetricsExporter)
	}

	switch cfg.TracesProtocol {
	case "", processProtocolHTTPProtobuf, processProtocolGRPC:
		// Supported, or left empty to use the default protocol.
	default:
		return fmt.Errorf("unsupported OTLP traces protocol %q", cfg.TracesProtocol)
	}

	switch cfg.MetricsProtocol {
	case "", processProtocolHTTPProtobuf, processProtocolGRPC:
		// Supported, or left empty to use the default protocol.
	default:
		return fmt.Errorf("unsupported OTLP metrics protocol %q", cfg.MetricsProtocol)
	}

	return nil
}
// Runtime owns the user-service OpenTelemetry providers, the Prometheus
// metrics handler, and the custom low-cardinality instruments.
//
// The accessor, Shutdown, and Record* methods defined alongside it accept a
// nil *Runtime.
type Runtime struct {
// tracerProvider is the provider returned by TracerProvider.
tracerProvider oteltrace.TracerProvider
// meterProvider is the provider returned by MeterProvider; the instruments
// below are created from it.
meterProvider metric.MeterProvider
// promHandler is the HTTP handler returned by Handler.
promHandler http.Handler
// shutdownMu guards shutdownDone and shutdownErr.
shutdownMu sync.Mutex
// shutdownDone is set once Shutdown has started running the shutdown
// functions.
shutdownDone bool
// shutdownErr holds the joined result of the shutdown functions and is
// returned by repeated Shutdown calls.
shutdownErr error
// shutdownFns are invoked by Shutdown in reverse order.
shutdownFns []func(context.Context) error
// internalHTTPRequests counts internal HTTP requests
// (`user.internal_http.requests`).
internalHTTPRequests metric.Int64Counter
// internalHTTPDuration records internal HTTP request durations in
// milliseconds (`user.internal_http.duration`).
internalHTTPDuration metric.Float64Histogram
// authResolutionOutcomes counts auth-facing resolution outcomes
// (`user.auth_resolution.outcomes`).
authResolutionOutcomes metric.Int64Counter
// userCreationOutcomes counts user creation outcomes
// (`user.user_creation.outcomes`).
userCreationOutcomes metric.Int64Counter
// raceNameReservationConflicts counts race-name reservation conflicts
// (`user.race_name.reservation_conflicts`).
raceNameReservationConflicts metric.Int64Counter
// entitlementMutations counts entitlement command outcomes
// (`user.entitlement.mutations`).
entitlementMutations metric.Int64Counter
// sanctionMutations counts sanction command outcomes
// (`user.sanction.mutations`).
sanctionMutations metric.Int64Counter
// limitMutations counts limit command outcomes (`user.limit.mutations`).
limitMutations metric.Int64Counter
// eventPublicationFailures counts event publication failures
// (`user.event_publication_failures`).
eventPublicationFailures metric.Int64Counter
}
// New builds a lightweight telemetry runtime on top of meterProvider. It is
// meant for tests and embedded use cases that need no process-level exporter
// wiring. No tracer provider is supplied, so NewWithProviders falls back to
// the global one.
func New(meterProvider metric.MeterProvider) (*Runtime, error) {
	var globalTracerProvider oteltrace.TracerProvider // nil selects the global provider
	return NewWithProviders(meterProvider, globalTracerProvider)
}
// NewWithProviders builds a telemetry runtime from explicitly supplied
// providers. A nil meterProvider or tracerProvider is replaced with the
// corresponding global OpenTelemetry provider. The resulting runtime serves
// 404 from Handler and has no shutdown functions.
//
// An error is returned when a provider is still nil after the global
// fallback, or when an instrument cannot be created.
func NewWithProviders(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider) (*Runtime, error) {
	meters := meterProvider
	if meters == nil {
		meters = otel.GetMeterProvider()
	}
	tracers := tracerProvider
	if tracers == nil {
		tracers = otel.GetTracerProvider()
	}

	switch {
	case meters == nil:
		return nil, errors.New("new user telemetry runtime: nil meter provider")
	case tracers == nil:
		return nil, errors.New("new user telemetry runtime: nil tracer provider")
	}

	return buildRuntime(meters, tracers, http.NotFoundHandler(), nil)
}
// NewProcess builds the process-wide user-service OpenTelemetry runtime
// described by cfg, installs the resulting providers globally, and returns
// the runtime. Output of the optional stdout trace and metric exporters is
// written to os.Stdout.
func NewProcess(ctx context.Context, cfg ProcessConfig, logger *slog.Logger) (*Runtime, error) {
	stdout := os.Stdout
	return newProcess(ctx, cfg, logger, stdout, stdout)
}
// TracerProvider returns the tracer provider owned by the runtime. When r is
// nil or holds no tracer provider, the global OpenTelemetry tracer provider
// is returned instead.
func (r *Runtime) TracerProvider() oteltrace.TracerProvider {
	if r != nil && r.tracerProvider != nil {
		return r.tracerProvider
	}
	return otel.GetTracerProvider()
}
// MeterProvider returns the meter provider owned by the runtime. When r is
// nil or holds no meter provider, the global OpenTelemetry meter provider is
// returned instead.
func (r *Runtime) MeterProvider() metric.MeterProvider {
	if r != nil && r.meterProvider != nil {
		return r.meterProvider
	}
	return otel.GetMeterProvider()
}
// Handler returns the Prometheus handler that should be mounted on the admin
// listener. When r is nil or has no Prometheus handler, a handler that
// answers every request with 404 is returned.
func (r *Runtime) Handler() http.Handler {
	if r != nil && r.promHandler != nil {
		return r.promHandler
	}
	return http.NotFoundHandler()
}
// Shutdown flushes and stops the configured telemetry providers by running
// the registered shutdown functions in reverse registration order with ctx.
// Their errors are combined with errors.Join.
//
// Shutdown is idempotent: the shutdown functions run at most once and every
// later call returns the result of that first run. A call made while another
// Shutdown is still in progress blocks until it finishes, so it never reports
// success before the providers have actually been flushed. Shutdown on a nil
// receiver returns nil.
func (r *Runtime) Shutdown(ctx context.Context) error {
	if r == nil {
		return nil
	}

	// Hold the lock for the whole run. Releasing it between marking the
	// shutdown as started and storing its result would let a concurrent caller
	// read a not-yet-written (nil) shutdownErr.
	r.shutdownMu.Lock()
	defer r.shutdownMu.Unlock()

	if r.shutdownDone {
		return r.shutdownErr
	}
	r.shutdownDone = true

	var shutdownErr error
	for index := len(r.shutdownFns) - 1; index >= 0; index-- {
		shutdownErr = errors.Join(shutdownErr, r.shutdownFns[index](ctx))
	}
	r.shutdownErr = shutdownErr

	return shutdownErr
}
// RecordInternalHTTPRequest records one internal HTTP request outcome.
//
// It increments the `user.internal_http.requests` counter and records
// duration, converted to milliseconds, in the `user.internal_http.duration`
// histogram. Both measurements carry attrs. A nil ctx is replaced with
// context.Background().
//
// The call is a no-op on a nil receiver and on a Runtime whose instruments
// were never built (for example a zero-value Runtime), matching the nil-field
// tolerance of the accessor methods.
func (r *Runtime) RecordInternalHTTPRequest(ctx context.Context, attrs []attribute.KeyValue, duration time.Duration) {
	if r == nil || r.internalHTTPRequests == nil || r.internalHTTPDuration == nil {
		return
	}

	ctx = normalizeContext(ctx)
	options := metric.WithAttributes(attrs...)
	r.internalHTTPRequests.Add(ctx, 1, options)
	r.internalHTTPDuration.Record(ctx, duration.Seconds()*1000, options)
}
// RecordAuthResolutionOutcome records one auth-facing resolution outcome.
//
// It increments the `user.auth_resolution.outcomes` counter with the
// whitespace-trimmed operation and outcome as attributes. A nil ctx is
// replaced with context.Background().
//
// The call is a no-op on a nil receiver and on a Runtime whose counter was
// never built (for example a zero-value Runtime).
func (r *Runtime) RecordAuthResolutionOutcome(ctx context.Context, operation string, outcome string) {
	if r == nil || r.authResolutionOutcomes == nil {
		return
	}

	r.authResolutionOutcomes.Add(
		normalizeContext(ctx),
		1,
		metric.WithAttributes(
			attribute.String("operation", strings.TrimSpace(operation)),
			attribute.String("outcome", strings.TrimSpace(outcome)),
		),
	)
}
// RecordUserCreationOutcome records one ensure-by-email coarse outcome.
//
// It increments the `user.user_creation.outcomes` counter with the
// whitespace-trimmed outcome as its attribute. A nil ctx is replaced with
// context.Background().
//
// The call is a no-op on a nil receiver and on a Runtime whose counter was
// never built (for example a zero-value Runtime).
func (r *Runtime) RecordUserCreationOutcome(ctx context.Context, outcome string) {
	if r == nil || r.userCreationOutcomes == nil {
		return
	}

	r.userCreationOutcomes.Add(
		normalizeContext(ctx),
		1,
		metric.WithAttributes(attribute.String("outcome", strings.TrimSpace(outcome))),
	)
}
// RecordRaceNameReservationConflict records one race-name reservation conflict
// for operation.
//
// It increments the `user.race_name.reservation_conflicts` counter with the
// whitespace-trimmed operation as its attribute. A nil ctx is replaced with
// context.Background().
//
// The call is a no-op on a nil receiver and on a Runtime whose counter was
// never built (for example a zero-value Runtime).
func (r *Runtime) RecordRaceNameReservationConflict(ctx context.Context, operation string) {
	if r == nil || r.raceNameReservationConflicts == nil {
		return
	}

	r.raceNameReservationConflicts.Add(
		normalizeContext(ctx),
		1,
		metric.WithAttributes(attribute.String("operation", strings.TrimSpace(operation))),
	)
}
// RecordEntitlementMutation records one entitlement command outcome.
//
// It increments the `user.entitlement.mutations` counter with the
// whitespace-trimmed command and outcome as attributes. A nil ctx is replaced
// with context.Background().
//
// The call is a no-op on a nil receiver and on a Runtime whose counter was
// never built (for example a zero-value Runtime).
func (r *Runtime) RecordEntitlementMutation(ctx context.Context, command string, outcome string) {
	if r == nil || r.entitlementMutations == nil {
		return
	}

	r.entitlementMutations.Add(
		normalizeContext(ctx),
		1,
		metric.WithAttributes(
			attribute.String("command", strings.TrimSpace(command)),
			attribute.String("outcome", strings.TrimSpace(outcome)),
		),
	)
}
// RecordSanctionMutation records one sanction command outcome.
//
// It increments the `user.sanction.mutations` counter with the
// whitespace-trimmed command and outcome as attributes. A nil ctx is replaced
// with context.Background().
//
// The call is a no-op on a nil receiver and on a Runtime whose counter was
// never built (for example a zero-value Runtime).
func (r *Runtime) RecordSanctionMutation(ctx context.Context, command string, outcome string) {
	if r == nil || r.sanctionMutations == nil {
		return
	}

	r.sanctionMutations.Add(
		normalizeContext(ctx),
		1,
		metric.WithAttributes(
			attribute.String("command", strings.TrimSpace(command)),
			attribute.String("outcome", strings.TrimSpace(outcome)),
		),
	)
}
// RecordLimitMutation records one limit command outcome.
//
// It increments the `user.limit.mutations` counter with the
// whitespace-trimmed command and outcome as attributes. A nil ctx is replaced
// with context.Background().
//
// The call is a no-op on a nil receiver and on a Runtime whose counter was
// never built (for example a zero-value Runtime).
func (r *Runtime) RecordLimitMutation(ctx context.Context, command string, outcome string) {
	if r == nil || r.limitMutations == nil {
		return
	}

	r.limitMutations.Add(
		normalizeContext(ctx),
		1,
		metric.WithAttributes(
			attribute.String("command", strings.TrimSpace(command)),
			attribute.String("outcome", strings.TrimSpace(outcome)),
		),
	)
}
// RecordEventPublicationFailure records one post-commit auxiliary event
// publication failure.
//
// It increments the `user.event_publication_failures` counter with the
// whitespace-trimmed eventType as the `event_type` attribute. A nil ctx is
// replaced with context.Background().
//
// The call is a no-op on a nil receiver and on a Runtime whose counter was
// never built (for example a zero-value Runtime).
func (r *Runtime) RecordEventPublicationFailure(ctx context.Context, eventType string) {
	if r == nil || r.eventPublicationFailures == nil {
		return
	}

	r.eventPublicationFailures.Add(
		normalizeContext(ctx),
		1,
		metric.WithAttributes(attribute.String("event_type", strings.TrimSpace(eventType))),
	)
}
// newProcess builds the process-wide telemetry runtime described by cfg.
//
// It rejects a nil ctx and an invalid cfg, defaults logger to slog.Default()
// and a blank service name to defaultServiceName, and then creates the
// resource, the tracer provider, a private Prometheus registry with its
// OpenTelemetry exporter, and the meter provider. After the runtime and its
// instruments have been built, the providers and a TraceContext+Baggage
// propagator are installed globally and the configuration is logged.
// stdoutTraceWriter and stdoutMetricWriter receive the output of the optional
// stdout exporters.
//
// Providers created before a failing step are shut down before the error is
// returned, so a failed call leaves no batch processors, periodic readers, or
// exporter connections running. Global OpenTelemetry state is modified only
// on success.
func newProcess(ctx context.Context, cfg ProcessConfig, logger *slog.Logger, stdoutTraceWriter io.Writer, stdoutMetricWriter io.Writer) (*Runtime, error) {
	if ctx == nil {
		return nil, errors.New("new user telemetry process: nil context")
	}
	if err := cfg.Validate(); err != nil {
		return nil, fmt.Errorf("new user telemetry process: %w", err)
	}
	if logger == nil {
		logger = slog.Default()
	}
	if strings.TrimSpace(cfg.ServiceName) == "" {
		cfg.ServiceName = defaultServiceName
	}

	// abort releases the providers created so far. cause is returned untouched
	// when cleanup succeeds; shutdown failures are joined onto it otherwise.
	abort := func(cause error, shutdownFns ...func(context.Context) error) error {
		for _, shutdown := range shutdownFns {
			if err := shutdown(ctx); err != nil {
				cause = errors.Join(cause, err)
			}
		}
		return cause
	}

	res, err := resource.New(
		ctx,
		resource.WithAttributes(attribute.String("service.name", cfg.ServiceName)),
	)
	if err != nil {
		return nil, fmt.Errorf("new user telemetry process: resource: %w", err)
	}

	tracerProvider, err := newTracerProvider(ctx, res, cfg, stdoutTraceWriter)
	if err != nil {
		return nil, fmt.Errorf("new user telemetry process: tracer provider: %w", err)
	}

	registry := prometheus.NewRegistry()
	prometheusExporter, err := otelprom.New(otelprom.WithRegisterer(registry))
	if err != nil {
		return nil, abort(
			fmt.Errorf("new user telemetry process: prometheus exporter: %w", err),
			tracerProvider.Shutdown,
		)
	}

	meterProvider, err := newMeterProvider(ctx, res, cfg, prometheusExporter, stdoutMetricWriter)
	if err != nil {
		return nil, abort(
			fmt.Errorf("new user telemetry process: meter provider: %w", err),
			tracerProvider.Shutdown,
		)
	}

	// Build the runtime before touching global state so that a failure here
	// cannot leave abandoned providers installed process-wide.
	runtime, err := buildRuntime(
		meterProvider,
		tracerProvider,
		promhttp.HandlerFor(registry, promhttp.HandlerOpts{}),
		[]func(context.Context) error{
			meterProvider.Shutdown,
			tracerProvider.Shutdown,
		},
	)
	if err != nil {
		return nil, abort(
			fmt.Errorf("new user telemetry process: %w", err),
			tracerProvider.Shutdown,
			meterProvider.Shutdown,
		)
	}

	otel.SetTracerProvider(tracerProvider)
	otel.SetMeterProvider(meterProvider)
	otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
		propagation.TraceContext{},
		propagation.Baggage{},
	))

	logger.InfoContext(ctx, "user telemetry configured",
		"service_name", cfg.ServiceName,
		"traces_exporter", cfg.TracesExporter,
		"metrics_exporter", cfg.MetricsExporter,
		"stdout_traces_enabled", cfg.StdoutTracesEnabled,
		"stdout_metrics_enabled", cfg.StdoutMetricsEnabled,
	)

	return runtime, nil
}
// buildRuntime assembles a Runtime around the given providers and creates
// every user-service instrument on a meter named meterName.
//
// A nil promHandler is replaced with a handler that answers 404. shutdownFns
// is stored as given for Shutdown to run later. The first instrument that
// fails to build aborts construction, and its error is wrapped with the
// instrument name without the `user.` prefix.
func buildRuntime(
	meterProvider metric.MeterProvider,
	tracerProvider oteltrace.TracerProvider,
	promHandler http.Handler,
	shutdownFns []func(context.Context) error,
) (*Runtime, error) {
	meter := meterProvider.Meter(meterName)

	// newCounter creates one Int64 counter and labels a failure with the
	// instrument name minus its "user." prefix.
	newCounter := func(name string) (metric.Int64Counter, error) {
		counter, err := meter.Int64Counter(name)
		if err != nil {
			return nil, fmt.Errorf("build user telemetry runtime: %s: %w", strings.TrimPrefix(name, "user."), err)
		}
		return counter, nil
	}

	rt := &Runtime{
		tracerProvider: tracerProvider,
		meterProvider:  meterProvider,
		promHandler:    promHandler,
		shutdownFns:    shutdownFns,
	}
	if rt.promHandler == nil {
		rt.promHandler = http.NotFoundHandler()
	}

	var err error
	if rt.internalHTTPRequests, err = newCounter("user.internal_http.requests"); err != nil {
		return nil, err
	}
	if rt.internalHTTPDuration, err = meter.Float64Histogram("user.internal_http.duration", metric.WithUnit("ms")); err != nil {
		return nil, fmt.Errorf("build user telemetry runtime: internal_http.duration: %w", err)
	}
	if rt.authResolutionOutcomes, err = newCounter("user.auth_resolution.outcomes"); err != nil {
		return nil, err
	}
	if rt.userCreationOutcomes, err = newCounter("user.user_creation.outcomes"); err != nil {
		return nil, err
	}
	if rt.raceNameReservationConflicts, err = newCounter("user.race_name.reservation_conflicts"); err != nil {
		return nil, err
	}
	if rt.entitlementMutations, err = newCounter("user.entitlement.mutations"); err != nil {
		return nil, err
	}
	if rt.sanctionMutations, err = newCounter("user.sanction.mutations"); err != nil {
		return nil, err
	}
	if rt.limitMutations, err = newCounter("user.limit.mutations"); err != nil {
		return nil, err
	}
	if rt.eventPublicationFailures, err = newCounter("user.event_publication_failures"); err != nil {
		return nil, err
	}

	return rt, nil
}
// newTracerProvider creates the SDK tracer provider for res. A batching OTLP
// exporter is attached when cfg.TracesExporter is `otlp`. When
// cfg.StdoutTracesEnabled is set, a batching, pretty-printing stdout exporter
// writing to stdoutWriter is attached as well. Exporter construction errors
// are returned unwrapped.
func newTracerProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig, stdoutWriter io.Writer) (*sdktrace.TracerProvider, error) {
	providerOptions := []sdktrace.TracerProviderOption{sdktrace.WithResource(res)}

	if cfg.TracesExporter == processExporterOTLP {
		otlpExporter, err := newOTLPTraceExporter(ctx, cfg.TracesProtocol)
		if err != nil {
			return nil, err
		}
		providerOptions = append(providerOptions, sdktrace.WithBatcher(otlpExporter))
	}

	if cfg.StdoutTracesEnabled {
		stdoutExporter, err := stdouttrace.New(
			stdouttrace.WithPrettyPrint(),
			stdouttrace.WithWriter(stdoutWriter),
		)
		if err != nil {
			return nil, err
		}
		providerOptions = append(providerOptions, sdktrace.WithBatcher(stdoutExporter))
	}

	provider := sdktrace.NewTracerProvider(providerOptions...)
	return provider, nil
}
// newMeterProvider creates the SDK meter provider for res. prometheusExporter
// is always registered as a reader. A periodic reader around an OTLP exporter
// is added when cfg.MetricsExporter is `otlp`. When cfg.StdoutMetricsEnabled
// is set, a periodic reader around a pretty-printing stdout exporter writing
// to stdoutWriter is added as well. Exporter construction errors are returned
// unwrapped.
func newMeterProvider(
	ctx context.Context,
	res *resource.Resource,
	cfg ProcessConfig,
	prometheusExporter sdkmetric.Reader,
	stdoutWriter io.Writer,
) (*sdkmetric.MeterProvider, error) {
	providerOptions := []sdkmetric.Option{
		sdkmetric.WithResource(res),
		sdkmetric.WithReader(prometheusExporter),
	}

	if cfg.MetricsExporter == processExporterOTLP {
		otlpExporter, err := newOTLPMetricExporter(ctx, cfg.MetricsProtocol)
		if err != nil {
			return nil, err
		}
		otlpReader := sdkmetric.NewPeriodicReader(otlpExporter)
		providerOptions = append(providerOptions, sdkmetric.WithReader(otlpReader))
	}

	if cfg.StdoutMetricsEnabled {
		stdoutExporter, err := stdoutmetric.New(
			stdoutmetric.WithPrettyPrint(),
			stdoutmetric.WithWriter(stdoutWriter),
		)
		if err != nil {
			return nil, err
		}
		stdoutReader := sdkmetric.NewPeriodicReader(stdoutExporter)
		providerOptions = append(providerOptions, sdkmetric.WithReader(stdoutReader))
	}

	provider := sdkmetric.NewMeterProvider(providerOptions...)
	return provider, nil
}
// newOTLPTraceExporter creates the OTLP span exporter for protocol. An empty
// protocol or `http/protobuf` selects the HTTP exporter and `grpc` selects the
// gRPC exporter. Any other value yields an error.
func newOTLPTraceExporter(ctx context.Context, protocol string) (sdktrace.SpanExporter, error) {
	if protocol == processProtocolGRPC {
		return otlptracegrpc.New(ctx)
	}
	if protocol == "" || protocol == processProtocolHTTPProtobuf {
		return otlptracehttp.New(ctx)
	}
	return nil, fmt.Errorf("unsupported OTLP traces protocol %q", protocol)
}
// newOTLPMetricExporter creates the OTLP metric exporter for protocol. An
// empty protocol or `http/protobuf` selects the HTTP exporter and `grpc`
// selects the gRPC exporter. Any other value yields an error.
func newOTLPMetricExporter(ctx context.Context, protocol string) (sdkmetric.Exporter, error) {
	if protocol == processProtocolGRPC {
		return otlpmetricgrpc.New(ctx)
	}
	if protocol == "" || protocol == processProtocolHTTPProtobuf {
		return otlpmetrichttp.New(ctx)
	}
	return nil, fmt.Errorf("unsupported OTLP metrics protocol %q", protocol)
}
// normalizeContext returns ctx unchanged when it is non-nil and
// context.Background() otherwise, so callers can always hand the result to
// an instrument.
func normalizeContext(ctx context.Context) context.Context {
	if ctx != nil {
		return ctx
	}
	return context.Background()
}