feat: user service

This commit is contained in:
Ilia Denisov
2026-04-10 19:05:02 +02:00
committed by GitHub
parent 710bad712e
commit 23ffcb7535
140 changed files with 33418 additions and 952 deletions
+549
View File
@@ -0,0 +1,549 @@
// Package telemetry provides shared OpenTelemetry runtime helpers and
// low-cardinality user-service instruments.
package telemetry
import (
"context"
"errors"
"fmt"
"io"
"log/slog"
"net/http"
"os"
"strings"
"sync"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp"
otelprom "go.opentelemetry.io/otel/exporters/prometheus"
"go.opentelemetry.io/otel/exporters/stdout/stdoutmetric"
"go.opentelemetry.io/otel/exporters/stdout/stdouttrace"
"go.opentelemetry.io/otel/metric"
"go.opentelemetry.io/otel/propagation"
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
"go.opentelemetry.io/otel/sdk/resource"
sdktrace "go.opentelemetry.io/otel/sdk/trace"
oteltrace "go.opentelemetry.io/otel/trace"
)
// meterName is the instrumentation scope name passed to
// MeterProvider.Meter when the runtime instruments are created.
const meterName = "galaxy/user"
const (
// defaultServiceName is the service name used when
// ProcessConfig.ServiceName is blank.
defaultServiceName = "galaxy-user"
// processExporterNone and processExporterOTLP are the exporter selections
// accepted by ProcessConfig.Validate.
processExporterNone = "none"
processExporterOTLP = "otlp"
// processProtocolHTTPProtobuf and processProtocolGRPC are the OTLP
// transport selections accepted by ProcessConfig.Validate.
processProtocolHTTPProtobuf = "http/protobuf"
processProtocolGRPC = "grpc"
)
// ProcessConfig configures the process-wide OpenTelemetry runtime. Exporter
// and protocol selections are checked by Validate.
type ProcessConfig struct {
// ServiceName overrides the default OpenTelemetry service name. A blank
// value falls back to defaultServiceName.
ServiceName string
// TracesExporter selects the external traces exporter. Supported values are
// `none` and `otlp`; any other value, including the empty string, is
// rejected by Validate.
TracesExporter string
// MetricsExporter selects the external metrics exporter. Supported values
// are `none` and `otlp`; any other value, including the empty string, is
// rejected by Validate.
MetricsExporter string
// TracesProtocol selects the OTLP traces protocol when TracesExporter is
// `otlp`. Supported values are `http/protobuf` and `grpc`; the empty string
// selects `http/protobuf`.
TracesProtocol string
// MetricsProtocol selects the OTLP metrics protocol when MetricsExporter is
// `otlp`. Supported values are `http/protobuf` and `grpc`; the empty string
// selects `http/protobuf`.
MetricsProtocol string
// StdoutTracesEnabled enables the additional stdout trace exporter used for
// local development and debugging. It is independent of TracesExporter.
StdoutTracesEnabled bool
// StdoutMetricsEnabled enables the additional stdout metric exporter used
// for local development and debugging. It is independent of
// MetricsExporter.
StdoutMetricsEnabled bool
}
// Validate checks the exporter and protocol selections in cfg and returns an
// error describing the first unsupported value, or nil when all of them are
// supported. Exporters must be `none` or `otlp`; protocols may additionally be
// left empty.
func (cfg ProcessConfig) Validate() error {
	if cfg.TracesExporter != processExporterNone && cfg.TracesExporter != processExporterOTLP {
		return fmt.Errorf("unsupported traces exporter %q", cfg.TracesExporter)
	}
	if cfg.MetricsExporter != processExporterNone && cfg.MetricsExporter != processExporterOTLP {
		return fmt.Errorf("unsupported metrics exporter %q", cfg.MetricsExporter)
	}

	// Protocols are checked regardless of the exporter selection.
	switch cfg.TracesProtocol {
	case "", processProtocolHTTPProtobuf, processProtocolGRPC:
	default:
		return fmt.Errorf("unsupported OTLP traces protocol %q", cfg.TracesProtocol)
	}
	switch cfg.MetricsProtocol {
	case "", processProtocolHTTPProtobuf, processProtocolGRPC:
	default:
		return fmt.Errorf("unsupported OTLP metrics protocol %q", cfg.MetricsProtocol)
	}

	return nil
}
// Runtime owns the user-service OpenTelemetry providers, the Prometheus
// metrics handler, and the custom low-cardinality instruments.
type Runtime struct {
// tracerProvider and meterProvider are the providers handed out by the
// TracerProvider and MeterProvider accessors.
tracerProvider oteltrace.TracerProvider
meterProvider metric.MeterProvider
// promHandler is the handler returned by Handler.
promHandler http.Handler
// shutdownMu guards shutdownDone and shutdownErr.
shutdownMu sync.Mutex
// shutdownDone is set once Shutdown has been entered.
shutdownDone bool
// shutdownErr holds the joined result of the shutdown functions.
shutdownErr error
// shutdownFns are invoked by Shutdown in reverse slice order.
shutdownFns []func(context.Context) error
// The instruments below are created by buildRuntime and written by the
// corresponding Record* methods.
internalHTTPRequests metric.Int64Counter
internalHTTPDuration metric.Float64Histogram
authResolutionOutcomes metric.Int64Counter
userCreationOutcomes metric.Int64Counter
raceNameReservationConflicts metric.Int64Counter
entitlementMutations metric.Int64Counter
sanctionMutations metric.Int64Counter
limitMutations metric.Int64Counter
eventPublicationFailures metric.Int64Counter
}
// New constructs a lightweight telemetry runtime around meterProvider for
// tests and embedded use cases that do not need process-level exporter wiring.
// It delegates to NewWithProviders with a nil tracer provider, so the tracer
// provider falls back to otel.GetTracerProvider().
func New(meterProvider metric.MeterProvider) (*Runtime, error) {
return NewWithProviders(meterProvider, nil)
}
// NewWithProviders builds a telemetry runtime on top of the supplied
// providers. A nil meterProvider or tracerProvider is replaced by the
// corresponding global OpenTelemetry provider. The resulting runtime serves
// http.NotFoundHandler() from Handler and has no shutdown functions.
func NewWithProviders(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider) (*Runtime, error) {
	if meterProvider == nil {
		meterProvider = otel.GetMeterProvider()
	}
	if tracerProvider == nil {
		tracerProvider = otel.GetTracerProvider()
	}

	// Defensive guard in case a global accessor yields nil.
	switch {
	case meterProvider == nil:
		return nil, errors.New("new user telemetry runtime: nil meter provider")
	case tracerProvider == nil:
		return nil, errors.New("new user telemetry runtime: nil tracer provider")
	}

	return buildRuntime(meterProvider, tracerProvider, http.NotFoundHandler(), nil)
}
// NewProcess constructs the process-wide user-service OpenTelemetry runtime
// from cfg, installs the resulting providers globally, and returns the
// runtime. The optional stdout trace and metric exporters both write to
// os.Stdout. A nil logger is replaced by slog.Default().
func NewProcess(ctx context.Context, cfg ProcessConfig, logger *slog.Logger) (*Runtime, error) {
return newProcess(ctx, cfg, logger, os.Stdout, os.Stdout)
}
// TracerProvider returns the tracer provider owned by the runtime. A nil
// receiver or an unset provider yields the global OpenTelemetry tracer
// provider instead.
func (r *Runtime) TracerProvider() oteltrace.TracerProvider {
	if r != nil && r.tracerProvider != nil {
		return r.tracerProvider
	}

	return otel.GetTracerProvider()
}
// MeterProvider returns the meter provider owned by the runtime. A nil
// receiver or an unset provider yields the global OpenTelemetry meter provider
// instead.
func (r *Runtime) MeterProvider() metric.MeterProvider {
	if r != nil && r.meterProvider != nil {
		return r.meterProvider
	}

	return otel.GetMeterProvider()
}
// Handler returns the Prometheus handler intended for the admin listener. A
// nil receiver or an unset handler yields http.NotFoundHandler().
func (r *Runtime) Handler() http.Handler {
	if r != nil && r.promHandler != nil {
		return r.promHandler
	}

	return http.NotFoundHandler()
}
// Shutdown flushes and stops the configured telemetry providers by invoking
// the registered shutdown functions in reverse registration order and joining
// their errors.
//
// Shutdown is idempotent and safe for concurrent use: the first call performs
// the shutdown, and every other call, including one that arrives while the
// first is still running, waits for it to finish and returns the same result.
// A nil receiver is a no-op.
func (r *Runtime) Shutdown(ctx context.Context) error {
	if r == nil {
		return nil
	}

	// The lock is held for the whole shutdown. Releasing it before the shutdown
	// functions have run would let a concurrent caller observe shutdownDone with
	// shutdownErr still unset and report success for a shutdown that has not
	// completed (and may yet fail).
	r.shutdownMu.Lock()
	defer r.shutdownMu.Unlock()

	if r.shutdownDone {
		return r.shutdownErr
	}
	r.shutdownDone = true

	for index := len(r.shutdownFns) - 1; index >= 0; index-- {
		r.shutdownErr = errors.Join(r.shutdownErr, r.shutdownFns[index](ctx))
	}

	return r.shutdownErr
}
// RecordInternalHTTPRequest counts one internal HTTP request and records its
// duration in milliseconds, both tagged with attrs. A nil receiver is a no-op
// and a nil ctx is replaced by a background context.
func (r *Runtime) RecordInternalHTTPRequest(ctx context.Context, attrs []attribute.KeyValue, duration time.Duration) {
	if r == nil {
		return
	}

	recordCtx := normalizeContext(ctx)
	labels := metric.WithAttributes(attrs...)
	elapsedMillis := duration.Seconds() * 1000

	r.internalHTTPRequests.Add(recordCtx, 1, labels)
	r.internalHTTPDuration.Record(recordCtx, elapsedMillis, labels)
}
// RecordAuthResolutionOutcome counts one auth-facing resolution outcome,
// tagged with the whitespace-trimmed operation and outcome. A nil receiver is
// a no-op.
func (r *Runtime) RecordAuthResolutionOutcome(ctx context.Context, operation string, outcome string) {
	if r == nil {
		return
	}

	labels := metric.WithAttributes(
		attribute.String("operation", strings.TrimSpace(operation)),
		attribute.String("outcome", strings.TrimSpace(outcome)),
	)
	r.authResolutionOutcomes.Add(normalizeContext(ctx), 1, labels)
}
// RecordUserCreationOutcome counts one ensure-by-email coarse outcome, tagged
// with the whitespace-trimmed outcome. A nil receiver is a no-op.
func (r *Runtime) RecordUserCreationOutcome(ctx context.Context, outcome string) {
	if r == nil {
		return
	}

	labels := metric.WithAttributes(
		attribute.String("outcome", strings.TrimSpace(outcome)),
	)
	r.userCreationOutcomes.Add(normalizeContext(ctx), 1, labels)
}
// RecordRaceNameReservationConflict counts one race-name reservation conflict,
// tagged with the whitespace-trimmed operation. A nil receiver is a no-op.
func (r *Runtime) RecordRaceNameReservationConflict(ctx context.Context, operation string) {
	if r == nil {
		return
	}

	labels := metric.WithAttributes(
		attribute.String("operation", strings.TrimSpace(operation)),
	)
	r.raceNameReservationConflicts.Add(normalizeContext(ctx), 1, labels)
}
// RecordEntitlementMutation counts one entitlement command outcome, tagged
// with the whitespace-trimmed command and outcome. A nil receiver is a no-op.
func (r *Runtime) RecordEntitlementMutation(ctx context.Context, command string, outcome string) {
	if r == nil {
		return
	}

	labels := metric.WithAttributes(
		attribute.String("command", strings.TrimSpace(command)),
		attribute.String("outcome", strings.TrimSpace(outcome)),
	)
	r.entitlementMutations.Add(normalizeContext(ctx), 1, labels)
}
// RecordSanctionMutation counts one sanction command outcome, tagged with the
// whitespace-trimmed command and outcome. A nil receiver is a no-op.
func (r *Runtime) RecordSanctionMutation(ctx context.Context, command string, outcome string) {
	if r == nil {
		return
	}

	labels := metric.WithAttributes(
		attribute.String("command", strings.TrimSpace(command)),
		attribute.String("outcome", strings.TrimSpace(outcome)),
	)
	r.sanctionMutations.Add(normalizeContext(ctx), 1, labels)
}
// RecordLimitMutation counts one limit command outcome, tagged with the
// whitespace-trimmed command and outcome. A nil receiver is a no-op.
func (r *Runtime) RecordLimitMutation(ctx context.Context, command string, outcome string) {
	if r == nil {
		return
	}

	labels := metric.WithAttributes(
		attribute.String("command", strings.TrimSpace(command)),
		attribute.String("outcome", strings.TrimSpace(outcome)),
	)
	r.limitMutations.Add(normalizeContext(ctx), 1, labels)
}
// RecordEventPublicationFailure counts one post-commit auxiliary event
// publication failure, tagged with the whitespace-trimmed event type. A nil
// receiver is a no-op.
func (r *Runtime) RecordEventPublicationFailure(ctx context.Context, eventType string) {
	if r == nil {
		return
	}

	labels := metric.WithAttributes(
		attribute.String("event_type", strings.TrimSpace(eventType)),
	)
	r.eventPublicationFailures.Add(normalizeContext(ctx), 1, labels)
}
// newProcess builds the process-wide telemetry runtime from cfg.
//
// It validates cfg, creates the tracer provider, a Prometheus-backed meter
// provider and the runtime instruments, and only then installs the providers
// and a TraceContext+Baggage propagator as OpenTelemetry globals. The stdout
// exporters, when enabled, write to stdoutTraceWriter and stdoutMetricWriter.
//
// A nil ctx or an invalid cfg is rejected with an error. A nil logger is
// replaced by slog.Default() and a blank cfg.ServiceName by
// defaultServiceName. If any step fails after a provider has been created,
// that provider is shut down before returning and the globals are left
// untouched.
func newProcess(ctx context.Context, cfg ProcessConfig, logger *slog.Logger, stdoutTraceWriter io.Writer, stdoutMetricWriter io.Writer) (*Runtime, error) {
	if ctx == nil {
		return nil, errors.New("new user telemetry process: nil context")
	}
	if err := cfg.Validate(); err != nil {
		return nil, fmt.Errorf("new user telemetry process: %w", err)
	}
	if logger == nil {
		logger = slog.Default()
	}
	if strings.TrimSpace(cfg.ServiceName) == "" {
		cfg.ServiceName = defaultServiceName
	}

	res, err := resource.New(
		ctx,
		resource.WithAttributes(attribute.String("service.name", cfg.ServiceName)),
	)
	if err != nil {
		return nil, fmt.Errorf("new user telemetry process: resource: %w", err)
	}

	tracerProvider, err := newTracerProvider(ctx, res, cfg, stdoutTraceWriter)
	if err != nil {
		return nil, fmt.Errorf("new user telemetry process: tracer provider: %w", err)
	}

	// From here on a provider exists that must not be abandoned: release
	// whatever has been created so far if a later step fails.
	var meterProvider *sdkmetric.MeterProvider
	completed := false
	defer func() {
		if completed {
			return
		}
		// Best-effort cleanup: the construction error returned to the caller is
		// the actionable one, so shutdown errors are intentionally dropped.
		if meterProvider != nil {
			_ = meterProvider.Shutdown(ctx)
		}
		_ = tracerProvider.Shutdown(ctx)
	}()

	registry := prometheus.NewRegistry()
	prometheusExporter, err := otelprom.New(otelprom.WithRegisterer(registry))
	if err != nil {
		return nil, fmt.Errorf("new user telemetry process: prometheus exporter: %w", err)
	}

	meterProvider, err = newMeterProvider(ctx, res, cfg, prometheusExporter, stdoutMetricWriter)
	if err != nil {
		return nil, fmt.Errorf("new user telemetry process: meter provider: %w", err)
	}

	// Shutdown runs these in reverse order: tracer provider first, then meter
	// provider.
	rt, err := buildRuntime(
		meterProvider,
		tracerProvider,
		promhttp.HandlerFor(registry, promhttp.HandlerOpts{}),
		[]func(context.Context) error{
			meterProvider.Shutdown,
			tracerProvider.Shutdown,
		},
	)
	if err != nil {
		return nil, fmt.Errorf("new user telemetry process: %w", err)
	}

	// Install globals only once the runtime is fully built, so a failed
	// construction never leaves the process pointing at stopped providers.
	otel.SetTracerProvider(tracerProvider)
	otel.SetMeterProvider(meterProvider)
	otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
		propagation.TraceContext{},
		propagation.Baggage{},
	))
	completed = true

	logger.InfoContext(ctx, "user telemetry configured",
		"service_name", cfg.ServiceName,
		"traces_exporter", cfg.TracesExporter,
		"metrics_exporter", cfg.MetricsExporter,
		"stdout_traces_enabled", cfg.StdoutTracesEnabled,
		"stdout_metrics_enabled", cfg.StdoutMetricsEnabled,
	)

	return rt, nil
}
// buildRuntime creates every user-service instrument on a meter obtained from
// meterProvider and assembles the Runtime around the supplied providers,
// handler and shutdown functions. Instruments are created in a fixed order and
// the first creation failure aborts the build with a wrapped error. A nil
// promHandler is replaced by http.NotFoundHandler().
func buildRuntime(
	meterProvider metric.MeterProvider,
	tracerProvider oteltrace.TracerProvider,
	promHandler http.Handler,
	shutdownFns []func(context.Context) error,
) (*Runtime, error) {
	const instrumentPrefix = "user."

	meter := meterProvider.Meter(meterName)

	// failure remembers the first instrument error; once it is set no further
	// instruments are created.
	var failure error
	fail := func(name string, err error) {
		failure = fmt.Errorf("build user telemetry runtime: %s: %w", name[len(instrumentPrefix):], err)
	}
	counter := func(name string) metric.Int64Counter {
		if failure != nil {
			return nil
		}
		instrument, err := meter.Int64Counter(name)
		if err != nil {
			fail(name, err)
			return nil
		}
		return instrument
	}

	requests := counter("user.internal_http.requests")

	var duration metric.Float64Histogram
	if failure == nil {
		histogram, err := meter.Float64Histogram("user.internal_http.duration", metric.WithUnit("ms"))
		if err != nil {
			fail("user.internal_http.duration", err)
		} else {
			duration = histogram
		}
	}

	authResolutions := counter("user.auth_resolution.outcomes")
	userCreations := counter("user.user_creation.outcomes")
	raceNameConflicts := counter("user.race_name.reservation_conflicts")
	entitlements := counter("user.entitlement.mutations")
	sanctions := counter("user.sanction.mutations")
	limits := counter("user.limit.mutations")
	publicationFailures := counter("user.event_publication_failures")

	if failure != nil {
		return nil, failure
	}

	handler := promHandler
	if handler == nil {
		handler = http.NotFoundHandler()
	}

	return &Runtime{
		tracerProvider:               tracerProvider,
		meterProvider:                meterProvider,
		promHandler:                  handler,
		shutdownFns:                  shutdownFns,
		internalHTTPRequests:         requests,
		internalHTTPDuration:         duration,
		authResolutionOutcomes:       authResolutions,
		userCreationOutcomes:         userCreations,
		raceNameReservationConflicts: raceNameConflicts,
		entitlementMutations:         entitlements,
		sanctionMutations:            sanctions,
		limitMutations:               limits,
		eventPublicationFailures:     publicationFailures,
	}, nil
}
// newTracerProvider assembles an SDK tracer provider for res. An OTLP exporter
// is attached when cfg.TracesExporter is `otlp`, and a pretty-printing stdout
// exporter writing to stdoutWriter when cfg.StdoutTracesEnabled is set; both
// are attached through sdktrace.WithBatcher. Exporter construction errors are
// returned unchanged.
func newTracerProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig, stdoutWriter io.Writer) (*sdktrace.TracerProvider, error) {
	providerOptions := []sdktrace.TracerProviderOption{
		sdktrace.WithResource(res),
	}

	if cfg.TracesExporter == processExporterOTLP {
		otlpExporter, err := newOTLPTraceExporter(ctx, cfg.TracesProtocol)
		if err != nil {
			return nil, err
		}
		providerOptions = append(providerOptions, sdktrace.WithBatcher(otlpExporter))
	}

	if cfg.StdoutTracesEnabled {
		stdoutExporter, err := stdouttrace.New(
			stdouttrace.WithPrettyPrint(),
			stdouttrace.WithWriter(stdoutWriter),
		)
		if err != nil {
			return nil, err
		}
		providerOptions = append(providerOptions, sdktrace.WithBatcher(stdoutExporter))
	}

	provider := sdktrace.NewTracerProvider(providerOptions...)
	return provider, nil
}
// newMeterProvider assembles an SDK meter provider for res that always reads
// through prometheusExporter. A periodic reader over an OTLP exporter is added
// when cfg.MetricsExporter is `otlp`, and one over a pretty-printing stdout
// exporter writing to stdoutWriter when cfg.StdoutMetricsEnabled is set.
// Exporter construction errors are returned unchanged.
func newMeterProvider(
	ctx context.Context,
	res *resource.Resource,
	cfg ProcessConfig,
	prometheusExporter sdkmetric.Reader,
	stdoutWriter io.Writer,
) (*sdkmetric.MeterProvider, error) {
	providerOptions := []sdkmetric.Option{
		sdkmetric.WithResource(res),
		sdkmetric.WithReader(prometheusExporter),
	}
	// addPeriodic registers exporter behind its own periodic reader.
	addPeriodic := func(exporter sdkmetric.Exporter) {
		providerOptions = append(providerOptions, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter)))
	}

	if cfg.MetricsExporter == processExporterOTLP {
		otlpExporter, err := newOTLPMetricExporter(ctx, cfg.MetricsProtocol)
		if err != nil {
			return nil, err
		}
		addPeriodic(otlpExporter)
	}

	if cfg.StdoutMetricsEnabled {
		stdoutExporter, err := stdoutmetric.New(
			stdoutmetric.WithPrettyPrint(),
			stdoutmetric.WithWriter(stdoutWriter),
		)
		if err != nil {
			return nil, err
		}
		addPeriodic(stdoutExporter)
	}

	provider := sdkmetric.NewMeterProvider(providerOptions...)
	return provider, nil
}
// newOTLPTraceExporter creates the OTLP span exporter for protocol: gRPC for
// `grpc`, HTTP for `http/protobuf` or the empty string. Any other protocol is
// reported as an error.
func newOTLPTraceExporter(ctx context.Context, protocol string) (sdktrace.SpanExporter, error) {
	if protocol == processProtocolGRPC {
		return otlptracegrpc.New(ctx)
	}
	if protocol == "" || protocol == processProtocolHTTPProtobuf {
		return otlptracehttp.New(ctx)
	}

	return nil, fmt.Errorf("unsupported OTLP traces protocol %q", protocol)
}
// newOTLPMetricExporter creates the OTLP metric exporter for protocol: gRPC
// for `grpc`, HTTP for `http/protobuf` or the empty string. Any other protocol
// is reported as an error.
func newOTLPMetricExporter(ctx context.Context, protocol string) (sdkmetric.Exporter, error) {
	if protocol == processProtocolGRPC {
		return otlpmetricgrpc.New(ctx)
	}
	if protocol == "" || protocol == processProtocolHTTPProtobuf {
		return otlpmetrichttp.New(ctx)
	}

	return nil, fmt.Errorf("unsupported OTLP metrics protocol %q", protocol)
}
// normalizeContext returns ctx unchanged when it is non-nil and
// context.Background() otherwise, so callers can pass the result on without a
// nil check.
func normalizeContext(ctx context.Context) context.Context {
	if ctx != nil {
		return ctx
	}

	return context.Background()
}
+186
View File
@@ -0,0 +1,186 @@
package telemetry
import (
"bytes"
"context"
"io"
"log/slog"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.opentelemetry.io/otel/attribute"
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
"go.opentelemetry.io/otel/sdk/metric/metricdata"
sdktrace "go.opentelemetry.io/otel/sdk/trace"
)
func TestNewProcessBuildsWithoutExporters(t *testing.T) {
t.Parallel()
runtime, err := newProcess(context.Background(), ProcessConfig{
ServiceName: "galaxy-user-test",
TracesExporter: processExporterNone,
MetricsExporter: processExporterNone,
}, slog.New(slog.NewTextHandler(io.Discard, nil)), io.Discard, io.Discard)
require.NoError(t, err)
assert.NotNil(t, runtime.TracerProvider())
assert.NotNil(t, runtime.MeterProvider())
assert.NotNil(t, runtime.Handler())
require.NoError(t, runtime.Shutdown(context.Background()))
require.NoError(t, runtime.Shutdown(context.Background()))
}
func TestNewProcessBuildsWithStdoutExporters(t *testing.T) {
t.Parallel()
traceBuffer := &bytes.Buffer{}
metricBuffer := &bytes.Buffer{}
runtime, err := newProcess(context.Background(), ProcessConfig{
ServiceName: "galaxy-user-test",
TracesExporter: processExporterNone,
MetricsExporter: processExporterNone,
StdoutTracesEnabled: true,
StdoutMetricsEnabled: true,
}, slog.New(slog.NewTextHandler(io.Discard, nil)), traceBuffer, metricBuffer)
require.NoError(t, err)
ctx, span := runtime.TracerProvider().Tracer("test").Start(context.Background(), "internal-request")
runtime.RecordUserCreationOutcome(ctx, "created")
span.End()
require.NoError(t, runtime.Shutdown(context.Background()))
assert.NotEmpty(t, traceBuffer.String())
assert.NotEmpty(t, metricBuffer.String())
}
// TestNewPreservesBusinessMetrics records one measurement through every
// Record* method of a runtime backed by a manual reader and checks that each
// instrument reports exactly one data point with the expected attributes.
func TestNewPreservesBusinessMetrics(t *testing.T) {
	t.Parallel()

	reader := sdkmetric.NewManualReader()
	rt, err := NewWithProviders(
		sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)),
		sdktrace.NewTracerProvider(),
	)
	require.NoError(t, err)

	ctx := context.Background()
	rt.RecordInternalHTTPRequest(ctx, []attribute.KeyValue{
		attribute.String("route", "/api/v1/internal/users/:user_id/exists"),
		attribute.String("method", "GET"),
		attribute.String("edge_outcome", "success"),
	}, 125*time.Millisecond)
	rt.RecordAuthResolutionOutcome(ctx, "resolve_by_email", "existing")
	rt.RecordUserCreationOutcome(ctx, "created")
	rt.RecordRaceNameReservationConflict(ctx, "update_my_profile")
	rt.RecordEntitlementMutation(ctx, "grant", "success")
	rt.RecordSanctionMutation(ctx, "apply", "conflict")
	rt.RecordLimitMutation(ctx, "remove", "subject_not_found")
	rt.RecordEventPublicationFailure(ctx, "user.profile.changed")

	httpAttrs := map[string]string{
		"route":        "/api/v1/internal/users/:user_id/exists",
		"method":       "GET",
		"edge_outcome": "success",
	}
	assertMetricCount(t, reader, "user.internal_http.requests", httpAttrs, 1)
	assertHistogramCount(t, reader, "user.internal_http.duration", httpAttrs, 1)

	counterExpectations := []struct {
		name  string
		attrs map[string]string
	}{
		{"user.auth_resolution.outcomes", map[string]string{"operation": "resolve_by_email", "outcome": "existing"}},
		{"user.user_creation.outcomes", map[string]string{"outcome": "created"}},
		{"user.race_name.reservation_conflicts", map[string]string{"operation": "update_my_profile"}},
		{"user.entitlement.mutations", map[string]string{"command": "grant", "outcome": "success"}},
		{"user.sanction.mutations", map[string]string{"command": "apply", "outcome": "conflict"}},
		{"user.limit.mutations", map[string]string{"command": "remove", "outcome": "subject_not_found"}},
		{"user.event_publication_failures", map[string]string{"event_type": "user.profile.changed"}},
	}
	for _, expectation := range counterExpectations {
		assertMetricCount(t, reader, expectation.name, expectation.attrs, 1)
	}
}
// assertMetricCount collects from reader and asserts that the int64 sum named
// metricName has a data point whose attributes match wantAttrs and whose value
// equals wantValue. The test fails immediately when no such data point exists
// or when the metric is not an int64 sum.
func assertMetricCount(t *testing.T, reader *sdkmetric.ManualReader, metricName string, wantAttrs map[string]string, wantValue int64) {
	t.Helper()

	var collected metricdata.ResourceMetrics
	require.NoError(t, reader.Collect(context.Background(), &collected))

	for _, scope := range collected.ScopeMetrics {
		for _, candidate := range scope.Metrics {
			if candidate.Name == metricName {
				sum, isSum := candidate.Data.(metricdata.Sum[int64])
				require.True(t, isSum)

				for _, dataPoint := range sum.DataPoints {
					if !hasMetricAttributes(dataPoint.Attributes.ToSlice(), wantAttrs) {
						continue
					}
					assert.Equal(t, wantValue, dataPoint.Value)
					return
				}
			}
		}
	}

	require.Failf(t, "test failed", "metric %q with attrs %v not found", metricName, wantAttrs)
}
// assertHistogramCount collects from reader and asserts that the float64
// histogram named metricName has a data point whose attributes match wantAttrs
// and whose sample count equals wantCount. The test fails immediately when no
// such data point exists or when the metric is not a float64 histogram.
func assertHistogramCount(t *testing.T, reader *sdkmetric.ManualReader, metricName string, wantAttrs map[string]string, wantCount uint64) {
	t.Helper()

	var collected metricdata.ResourceMetrics
	require.NoError(t, reader.Collect(context.Background(), &collected))

	for _, scope := range collected.ScopeMetrics {
		for _, candidate := range scope.Metrics {
			if candidate.Name == metricName {
				histogram, isHistogram := candidate.Data.(metricdata.Histogram[float64])
				require.True(t, isHistogram)

				for _, dataPoint := range histogram.DataPoints {
					if !hasMetricAttributes(dataPoint.Attributes.ToSlice(), wantAttrs) {
						continue
					}
					assert.Equal(t, wantCount, dataPoint.Count)
					return
				}
			}
		}
	}

	require.Failf(t, "test failed", "histogram %q with attrs %v not found", metricName, wantAttrs)
}
// hasMetricAttributes reports whether values is exactly the attribute set
// described by want: the same number of entries, every key present in want,
// and every value a string equal to the wanted one.
func hasMetricAttributes(values []attribute.KeyValue, want map[string]string) bool {
	if len(values) != len(want) {
		return false
	}

	for _, value := range values {
		wantValue, ok := want[string(value.Key)]
		// A plain map lookup would treat a missing key as "", which matches an
		// attribute with an empty value. AsString likewise returns "" for
		// non-string values. Both must count as a mismatch.
		if !ok || value.Value.Type() != attribute.STRING || value.Value.AsString() != wantValue {
			return false
		}
	}

	return true
}