feat: game lobby service
This commit is contained in:
@@ -0,0 +1,781 @@
|
||||
// Package telemetry provides lightweight OpenTelemetry helpers and
|
||||
// low-cardinality Game Lobby Service instruments used by the runnable
|
||||
// skeleton. Later stages extend the instrument set.
|
||||
package telemetry
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"go.opentelemetry.io/otel"
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
|
||||
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp"
|
||||
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
|
||||
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp"
|
||||
"go.opentelemetry.io/otel/exporters/stdout/stdoutmetric"
|
||||
"go.opentelemetry.io/otel/exporters/stdout/stdouttrace"
|
||||
"go.opentelemetry.io/otel/metric"
|
||||
"go.opentelemetry.io/otel/propagation"
|
||||
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
|
||||
"go.opentelemetry.io/otel/sdk/resource"
|
||||
sdktrace "go.opentelemetry.io/otel/sdk/trace"
|
||||
oteltrace "go.opentelemetry.io/otel/trace"
|
||||
)
|
||||
|
||||
// meterName is the instrumentation scope name requested from the meter
// provider when the lobby instruments are created.
const meterName = "galaxy/lobby"

const (
	// defaultServiceName is used when ProcessConfig.ServiceName is blank.
	defaultServiceName = "galaxy-lobby"

	// Exporter selectors accepted by ProcessConfig.Validate.
	processExporterNone = "none"
	processExporterOTLP = "otlp"

	// OTLP transport protocols accepted by ProcessConfig.Validate.
	processProtocolHTTPProtobuf = "http/protobuf"
	processProtocolGRPC         = "grpc"
)
|
||||
|
||||
// ProcessConfig carries the settings used to assemble the process-wide
// OpenTelemetry providers.
type ProcessConfig struct {
	// ServiceName is reported as the `service.name` resource attribute. A
	// blank value falls back to the package default service name.
	ServiceName string

	// TracesExporter picks the external traces exporter: `none` or `otlp`.
	TracesExporter string

	// MetricsExporter picks the external metrics exporter: `none` or
	// `otlp`.
	MetricsExporter string

	// TracesProtocol picks the OTLP transport for traces when
	// TracesExporter is `otlp`: `http/protobuf` or `grpc`. An empty value
	// selects `http/protobuf`.
	TracesProtocol string

	// MetricsProtocol picks the OTLP transport for metrics when
	// MetricsExporter is `otlp`: `http/protobuf` or `grpc`. An empty value
	// selects `http/protobuf`.
	MetricsProtocol string

	// StdoutTracesEnabled adds a stdout trace exporter next to the external
	// one; intended for local development and debugging.
	StdoutTracesEnabled bool

	// StdoutMetricsEnabled adds a stdout metric exporter next to the
	// external one; intended for local development and debugging.
	StdoutMetricsEnabled bool
}
|
||||
|
||||
// Validate reports whether cfg contains a supported OpenTelemetry exporter
|
||||
// configuration.
|
||||
func (cfg ProcessConfig) Validate() error {
|
||||
switch cfg.TracesExporter {
|
||||
case processExporterNone, processExporterOTLP:
|
||||
default:
|
||||
return fmt.Errorf("unsupported traces exporter %q", cfg.TracesExporter)
|
||||
}
|
||||
|
||||
switch cfg.MetricsExporter {
|
||||
case processExporterNone, processExporterOTLP:
|
||||
default:
|
||||
return fmt.Errorf("unsupported metrics exporter %q", cfg.MetricsExporter)
|
||||
}
|
||||
|
||||
if cfg.TracesProtocol != "" && cfg.TracesProtocol != processProtocolHTTPProtobuf && cfg.TracesProtocol != processProtocolGRPC {
|
||||
return fmt.Errorf("unsupported OTLP traces protocol %q", cfg.TracesProtocol)
|
||||
}
|
||||
if cfg.MetricsProtocol != "" && cfg.MetricsProtocol != processProtocolHTTPProtobuf && cfg.MetricsProtocol != processProtocolGRPC {
|
||||
return fmt.Errorf("unsupported OTLP metrics protocol %q", cfg.MetricsProtocol)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Runtime owns the Game Lobby Service OpenTelemetry providers and the
|
||||
// low-cardinality custom instruments listed in `lobby/README.md`
|
||||
// §Observability.
|
||||
type Runtime struct {
|
||||
tracerProvider oteltrace.TracerProvider
|
||||
meterProvider metric.MeterProvider
|
||||
meter metric.Meter
|
||||
|
||||
shutdownMu sync.Mutex
|
||||
shutdownDone bool
|
||||
shutdownErr error
|
||||
shutdownFns []func(context.Context) error
|
||||
|
||||
publicHTTPRequests metric.Int64Counter
|
||||
publicHTTPDuration metric.Float64Histogram
|
||||
internalHTTPRequests metric.Int64Counter
|
||||
internalHTTPDuration metric.Float64Histogram
|
||||
|
||||
gameTransitions metric.Int64Counter
|
||||
applicationOutcomes metric.Int64Counter
|
||||
inviteOutcomes metric.Int64Counter
|
||||
membershipChanges metric.Int64Counter
|
||||
startFlowOutcomes metric.Int64Counter
|
||||
notificationPublishAttempts metric.Int64Counter
|
||||
enrollmentAutomationChecks metric.Int64Counter
|
||||
raceNameOutcomes metric.Int64Counter
|
||||
pendingRegistrationExpires metric.Int64Counter
|
||||
userLifecycleCascadeReleases metric.Int64Counter
|
||||
capabilityEvaluations metric.Int64Counter
|
||||
|
||||
activeGames metric.Int64ObservableGauge
|
||||
gmEventsOldestUnprocessedAge metric.Int64ObservableGauge
|
||||
runtimeResultsOldestUnprocessedAge metric.Int64ObservableGauge
|
||||
userLifecycleOldestUnprocessedAge metric.Int64ObservableGauge
|
||||
|
||||
gaugeMu sync.Mutex
|
||||
gaugeRegistration metric.Registration
|
||||
}
|
||||
|
||||
// NewWithProviders constructs a telemetry runtime around explicitly supplied
|
||||
// meterProvider and tracerProvider values.
|
||||
func NewWithProviders(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider) (*Runtime, error) {
|
||||
if meterProvider == nil {
|
||||
meterProvider = otel.GetMeterProvider()
|
||||
}
|
||||
if tracerProvider == nil {
|
||||
tracerProvider = otel.GetTracerProvider()
|
||||
}
|
||||
if meterProvider == nil {
|
||||
return nil, errors.New("new lobby telemetry runtime: nil meter provider")
|
||||
}
|
||||
if tracerProvider == nil {
|
||||
return nil, errors.New("new lobby telemetry runtime: nil tracer provider")
|
||||
}
|
||||
|
||||
return buildRuntime(meterProvider, tracerProvider, nil)
|
||||
}
|
||||
|
||||
// NewProcess constructs the process-wide Game Lobby Service OpenTelemetry
|
||||
// runtime from cfg, installs the resulting providers globally, and returns
|
||||
// the runtime.
|
||||
func NewProcess(ctx context.Context, cfg ProcessConfig, logger *slog.Logger) (*Runtime, error) {
|
||||
if ctx == nil {
|
||||
return nil, errors.New("new lobby telemetry process: nil context")
|
||||
}
|
||||
if err := cfg.Validate(); err != nil {
|
||||
return nil, fmt.Errorf("new lobby telemetry process: %w", err)
|
||||
}
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
|
||||
serviceName := strings.TrimSpace(cfg.ServiceName)
|
||||
if serviceName == "" {
|
||||
serviceName = defaultServiceName
|
||||
}
|
||||
|
||||
res := resource.NewSchemaless(attribute.String("service.name", serviceName))
|
||||
|
||||
tracerProvider, err := newTracerProvider(ctx, res, cfg)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("new lobby telemetry process: tracer provider: %w", err)
|
||||
}
|
||||
meterProvider, err := newMeterProvider(ctx, res, cfg)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("new lobby telemetry process: meter provider: %w", err)
|
||||
}
|
||||
|
||||
otel.SetTracerProvider(tracerProvider)
|
||||
otel.SetMeterProvider(meterProvider)
|
||||
otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
|
||||
propagation.TraceContext{},
|
||||
propagation.Baggage{},
|
||||
))
|
||||
|
||||
runtime, err := buildRuntime(meterProvider, tracerProvider, []func(context.Context) error{
|
||||
meterProvider.Shutdown,
|
||||
tracerProvider.Shutdown,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("new lobby telemetry process: runtime: %w", err)
|
||||
}
|
||||
|
||||
logger.Info("lobby telemetry configured",
|
||||
"service_name", serviceName,
|
||||
"traces_exporter", cfg.TracesExporter,
|
||||
"metrics_exporter", cfg.MetricsExporter,
|
||||
)
|
||||
|
||||
return runtime, nil
|
||||
}
|
||||
|
||||
// TracerProvider returns the runtime tracer provider.
|
||||
func (runtime *Runtime) TracerProvider() oteltrace.TracerProvider {
|
||||
if runtime == nil || runtime.tracerProvider == nil {
|
||||
return otel.GetTracerProvider()
|
||||
}
|
||||
|
||||
return runtime.tracerProvider
|
||||
}
|
||||
|
||||
// MeterProvider returns the runtime meter provider.
|
||||
func (runtime *Runtime) MeterProvider() metric.MeterProvider {
|
||||
if runtime == nil || runtime.meterProvider == nil {
|
||||
return otel.GetMeterProvider()
|
||||
}
|
||||
|
||||
return runtime.meterProvider
|
||||
}
|
||||
|
||||
// Shutdown flushes and stops the configured telemetry providers. Shutdown is
|
||||
// idempotent.
|
||||
func (runtime *Runtime) Shutdown(ctx context.Context) error {
|
||||
if runtime == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
runtime.shutdownMu.Lock()
|
||||
if runtime.shutdownDone {
|
||||
err := runtime.shutdownErr
|
||||
runtime.shutdownMu.Unlock()
|
||||
return err
|
||||
}
|
||||
runtime.shutdownDone = true
|
||||
runtime.shutdownMu.Unlock()
|
||||
|
||||
runtime.gaugeMu.Lock()
|
||||
if runtime.gaugeRegistration != nil {
|
||||
_ = runtime.gaugeRegistration.Unregister()
|
||||
runtime.gaugeRegistration = nil
|
||||
}
|
||||
runtime.gaugeMu.Unlock()
|
||||
|
||||
var shutdownErr error
|
||||
for index := len(runtime.shutdownFns) - 1; index >= 0; index-- {
|
||||
shutdownErr = errors.Join(shutdownErr, runtime.shutdownFns[index](ctx))
|
||||
}
|
||||
|
||||
runtime.shutdownMu.Lock()
|
||||
runtime.shutdownErr = shutdownErr
|
||||
runtime.shutdownMu.Unlock()
|
||||
|
||||
return shutdownErr
|
||||
}
|
||||
|
||||
// RecordPublicHTTPRequest records one public HTTP request outcome.
|
||||
func (runtime *Runtime) RecordPublicHTTPRequest(ctx context.Context, attrs []attribute.KeyValue, duration time.Duration) {
|
||||
if runtime == nil {
|
||||
return
|
||||
}
|
||||
|
||||
options := metric.WithAttributes(attrs...)
|
||||
runtime.publicHTTPRequests.Add(normalizeContext(ctx), 1, options)
|
||||
runtime.publicHTTPDuration.Record(normalizeContext(ctx), duration.Seconds()*1000, options)
|
||||
}
|
||||
|
||||
// RecordInternalHTTPRequest records one internal HTTP request outcome.
|
||||
func (runtime *Runtime) RecordInternalHTTPRequest(ctx context.Context, attrs []attribute.KeyValue, duration time.Duration) {
|
||||
if runtime == nil {
|
||||
return
|
||||
}
|
||||
|
||||
options := metric.WithAttributes(attrs...)
|
||||
runtime.internalHTTPRequests.Add(normalizeContext(ctx), 1, options)
|
||||
runtime.internalHTTPDuration.Record(normalizeContext(ctx), duration.Seconds()*1000, options)
|
||||
}
|
||||
|
||||
// RecordGameTransition records one applied game-status transition with the
|
||||
// frozen attribute set from `lobby/README.md` §Observability.
|
||||
func (runtime *Runtime) RecordGameTransition(ctx context.Context, fromStatus, toStatus, trigger string) {
|
||||
if runtime == nil || runtime.gameTransitions == nil {
|
||||
return
|
||||
}
|
||||
runtime.gameTransitions.Add(normalizeContext(ctx), 1, metric.WithAttributes(
|
||||
attribute.String("from_status", fromStatus),
|
||||
attribute.String("to_status", toStatus),
|
||||
attribute.String("trigger", trigger),
|
||||
))
|
||||
}
|
||||
|
||||
// RecordApplicationOutcome records one terminal application outcome from the
|
||||
// `submitted` / `approved` / `rejected` vocabulary.
|
||||
func (runtime *Runtime) RecordApplicationOutcome(ctx context.Context, outcome string) {
|
||||
if runtime == nil || runtime.applicationOutcomes == nil {
|
||||
return
|
||||
}
|
||||
runtime.applicationOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
|
||||
attribute.String("outcome", outcome),
|
||||
))
|
||||
}
|
||||
|
||||
// RecordInviteOutcome records one terminal invite outcome from the
|
||||
// `created` / `redeemed` / `declined` / `revoked` / `expired` vocabulary.
|
||||
func (runtime *Runtime) RecordInviteOutcome(ctx context.Context, outcome string) {
|
||||
if runtime == nil || runtime.inviteOutcomes == nil {
|
||||
return
|
||||
}
|
||||
runtime.inviteOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
|
||||
attribute.String("outcome", outcome),
|
||||
))
|
||||
}
|
||||
|
||||
// RecordMembershipChange records one membership change from the
|
||||
// `activated` / `removed` / `blocked` / `external_block` vocabulary.
|
||||
func (runtime *Runtime) RecordMembershipChange(ctx context.Context, change string) {
|
||||
if runtime == nil || runtime.membershipChanges == nil {
|
||||
return
|
||||
}
|
||||
runtime.membershipChanges.Add(normalizeContext(ctx), 1, metric.WithAttributes(
|
||||
attribute.String("change", change),
|
||||
))
|
||||
}
|
||||
|
||||
// RecordStartFlowOutcome records one terminal outcome of the start flow
|
||||
// from the `running` / `paused` / `start_failed` vocabulary.
|
||||
func (runtime *Runtime) RecordStartFlowOutcome(ctx context.Context, outcome string) {
|
||||
if runtime == nil || runtime.startFlowOutcomes == nil {
|
||||
return
|
||||
}
|
||||
runtime.startFlowOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
|
||||
attribute.String("outcome", outcome),
|
||||
))
|
||||
}
|
||||
|
||||
// RecordNotificationPublish records one notification intent publish attempt.
|
||||
// result is `ok` on success and `error` on transport failure.
|
||||
func (runtime *Runtime) RecordNotificationPublish(ctx context.Context, notificationType, result string) {
|
||||
if runtime == nil || runtime.notificationPublishAttempts == nil {
|
||||
return
|
||||
}
|
||||
runtime.notificationPublishAttempts.Add(normalizeContext(ctx), 1, metric.WithAttributes(
|
||||
attribute.String("notification_type", notificationType),
|
||||
attribute.String("result", result),
|
||||
))
|
||||
}
|
||||
|
||||
// RecordEnrollmentAutomationCheck records one enrollment-automation tick
|
||||
// outcome per inspected game from the `no_op` / `transitioned` vocabulary.
|
||||
func (runtime *Runtime) RecordEnrollmentAutomationCheck(ctx context.Context, result string) {
|
||||
if runtime == nil || runtime.enrollmentAutomationChecks == nil {
|
||||
return
|
||||
}
|
||||
runtime.enrollmentAutomationChecks.Add(normalizeContext(ctx), 1, metric.WithAttributes(
|
||||
attribute.String("result", result),
|
||||
))
|
||||
}
|
||||
|
||||
// RecordRaceNameOutcome records one Race Name Directory side effect from the
|
||||
// `reserved` / `reservation_released` / `pending_created` /
|
||||
// `pending_released` / `registered` / `registered_released` vocabulary.
|
||||
func (runtime *Runtime) RecordRaceNameOutcome(ctx context.Context, outcome string) {
|
||||
if runtime == nil || runtime.raceNameOutcomes == nil {
|
||||
return
|
||||
}
|
||||
runtime.raceNameOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
|
||||
attribute.String("outcome", outcome),
|
||||
))
|
||||
}
|
||||
|
||||
// RecordPendingRegistrationExpiration records one pending-registration entry
|
||||
// released by the expiration worker. trigger is `tick` for periodic releases
|
||||
// and `manual` for direct administrative releases.
|
||||
func (runtime *Runtime) RecordPendingRegistrationExpiration(ctx context.Context, trigger string) {
|
||||
if runtime == nil || runtime.pendingRegistrationExpires == nil {
|
||||
return
|
||||
}
|
||||
runtime.pendingRegistrationExpires.Add(normalizeContext(ctx), 1, metric.WithAttributes(
|
||||
attribute.String("trigger", trigger),
|
||||
))
|
||||
}
|
||||
|
||||
// RecordUserLifecycleCascadeRelease records one cascade-release event from
|
||||
// the `permanent_blocked` / `deleted` vocabulary.
|
||||
func (runtime *Runtime) RecordUserLifecycleCascadeRelease(ctx context.Context, event string) {
|
||||
if runtime == nil || runtime.userLifecycleCascadeReleases == nil {
|
||||
return
|
||||
}
|
||||
runtime.userLifecycleCascadeReleases.Add(normalizeContext(ctx), 1, metric.WithAttributes(
|
||||
attribute.String("event", event),
|
||||
))
|
||||
}
|
||||
|
||||
// RecordCapabilityEvaluation records one per-membership capability decision
|
||||
// from the `capable` / `incapable` / `noop` vocabulary.
|
||||
func (runtime *Runtime) RecordCapabilityEvaluation(ctx context.Context, result string) {
|
||||
if runtime == nil || runtime.capabilityEvaluations == nil {
|
||||
return
|
||||
}
|
||||
runtime.capabilityEvaluations.Add(normalizeContext(ctx), 1, metric.WithAttributes(
|
||||
attribute.String("result", result),
|
||||
))
|
||||
}
|
||||
|
||||
// ActiveGamesProbe supplies the number of game records grouped by status.
// The production implementation wraps GameStore.CountByStatus; tests may
// substitute a stub.
type ActiveGamesProbe interface {
	// CountByStatus returns the game count keyed by status.
	CountByStatus(ctx context.Context) (map[string]int, error)
}
|
||||
|
||||
// StreamLagProbe measures how old the oldest unprocessed entry on a Redis
// Stream is, relative to a saved consumer offset.
type StreamLagProbe interface {
	// OldestUnprocessedAge returns the age of the oldest entry after
	// savedOffset on stream. The boolean result is false when no value
	// could be computed at all: no offset is stored yet, or no entries
	// follow it.
	OldestUnprocessedAge(ctx context.Context, stream, savedOffset string) (time.Duration, bool, error)
}
|
||||
|
||||
// StreamOffsetReader provides the latest persisted offset of a stream, which
// the lag probe needs to compute the age of the oldest unprocessed entry.
type StreamOffsetReader interface {
	// Load returns the persisted entry ID for stream and whether one was
	// found.
	Load(ctx context.Context, stream string) (entryID string, found bool, err error)
}
|
||||
|
||||
// StreamGaugeBinding ties together the two names one stream is known by: the
// stable offset label given to the offset reader and the stream name given
// to the lag probe.
type StreamGaugeBinding struct {
	// OffsetLabel is the stable label handed to the offset reader. Lobby
	// uses `gm_lobby_events`, `runtime_results`, and `user_lifecycle`.
	OffsetLabel string

	// StreamName is the Redis Stream key the lag probe reads from. Lobby
	// configures these through env vars (e.g. `gm:lobby_events`).
	StreamName string
}
|
||||
|
||||
// GaugeDependencies groups the collaborators required by RegisterGauges.
|
||||
type GaugeDependencies struct {
|
||||
// ActiveGames probes the per-status game count for `lobby.active_games`.
|
||||
ActiveGames ActiveGamesProbe
|
||||
|
||||
// StreamLag probes the oldest unprocessed entry age for the three
|
||||
// `*.oldest_unprocessed_age_ms` gauges.
|
||||
StreamLag StreamLagProbe
|
||||
|
||||
// Offsets supplies the saved consumer offset per stream.
|
||||
Offsets StreamOffsetReader
|
||||
|
||||
// GMEvents binds the GM events offset label and stream name.
|
||||
GMEvents StreamGaugeBinding
|
||||
|
||||
// RuntimeResults binds the Runtime Manager job-result offset label
|
||||
// and stream name.
|
||||
RuntimeResults StreamGaugeBinding
|
||||
|
||||
// UserLifecycle binds the User Service lifecycle offset label and
|
||||
// stream name.
|
||||
UserLifecycle StreamGaugeBinding
|
||||
|
||||
// Logger records non-fatal probe errors. Defaults to slog.Default
|
||||
// when nil.
|
||||
Logger *slog.Logger
|
||||
}
|
||||
|
||||
func (binding StreamGaugeBinding) validate() error {
|
||||
if strings.TrimSpace(binding.OffsetLabel) == "" {
|
||||
return errors.New("empty offset label")
|
||||
}
|
||||
if strings.TrimSpace(binding.StreamName) == "" {
|
||||
return errors.New("empty stream name")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// RegisterGauges installs the observable-gauge callback that reports
|
||||
// `lobby.active_games` and the three `*.oldest_unprocessed_age_ms` gauges.
|
||||
// It is safe to call once per Runtime; a second call replaces the previous
|
||||
// registration. The runtime keeps no strong reference to deps beyond the
|
||||
// callback closure.
|
||||
func (runtime *Runtime) RegisterGauges(deps GaugeDependencies) error {
|
||||
if runtime == nil {
|
||||
return errors.New("register lobby gauges: nil runtime")
|
||||
}
|
||||
if deps.ActiveGames == nil {
|
||||
return errors.New("register lobby gauges: nil active games probe")
|
||||
}
|
||||
if deps.StreamLag == nil {
|
||||
return errors.New("register lobby gauges: nil stream lag probe")
|
||||
}
|
||||
if deps.Offsets == nil {
|
||||
return errors.New("register lobby gauges: nil offset reader")
|
||||
}
|
||||
if err := deps.GMEvents.validate(); err != nil {
|
||||
return fmt.Errorf("register lobby gauges: gm events: %w", err)
|
||||
}
|
||||
if err := deps.RuntimeResults.validate(); err != nil {
|
||||
return fmt.Errorf("register lobby gauges: runtime results: %w", err)
|
||||
}
|
||||
if err := deps.UserLifecycle.validate(); err != nil {
|
||||
return fmt.Errorf("register lobby gauges: user lifecycle: %w", err)
|
||||
}
|
||||
|
||||
logger := deps.Logger
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
|
||||
runtime.gaugeMu.Lock()
|
||||
defer runtime.gaugeMu.Unlock()
|
||||
|
||||
if runtime.gaugeRegistration != nil {
|
||||
_ = runtime.gaugeRegistration.Unregister()
|
||||
runtime.gaugeRegistration = nil
|
||||
}
|
||||
|
||||
streams := []struct {
|
||||
binding StreamGaugeBinding
|
||||
gauge metric.Int64ObservableGauge
|
||||
}{
|
||||
{deps.GMEvents, runtime.gmEventsOldestUnprocessedAge},
|
||||
{deps.RuntimeResults, runtime.runtimeResultsOldestUnprocessedAge},
|
||||
{deps.UserLifecycle, runtime.userLifecycleOldestUnprocessedAge},
|
||||
}
|
||||
|
||||
callback := func(ctx context.Context, observer metric.Observer) error {
|
||||
counts, err := deps.ActiveGames.CountByStatus(ctx)
|
||||
if err != nil {
|
||||
logger.WarnContext(ctx, "active games probe failed",
|
||||
"err", err.Error(),
|
||||
)
|
||||
} else {
|
||||
for status, count := range counts {
|
||||
observer.ObserveInt64(runtime.activeGames, int64(count), metric.WithAttributes(
|
||||
attribute.String("status", status),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
for _, stream := range streams {
|
||||
savedOffset, _, err := deps.Offsets.Load(ctx, stream.binding.OffsetLabel)
|
||||
if err != nil {
|
||||
logger.WarnContext(ctx, "stream offset load failed",
|
||||
"stream", stream.binding.StreamName,
|
||||
"err", err.Error(),
|
||||
)
|
||||
continue
|
||||
}
|
||||
age, ok, err := deps.StreamLag.OldestUnprocessedAge(ctx, stream.binding.StreamName, savedOffset)
|
||||
if err != nil {
|
||||
logger.WarnContext(ctx, "stream lag probe failed",
|
||||
"stream", stream.binding.StreamName,
|
||||
"err", err.Error(),
|
||||
)
|
||||
continue
|
||||
}
|
||||
if !ok {
|
||||
observer.ObserveInt64(stream.gauge, 0)
|
||||
continue
|
||||
}
|
||||
observer.ObserveInt64(stream.gauge, age.Milliseconds())
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
registration, err := runtime.meter.RegisterCallback(
|
||||
callback,
|
||||
runtime.activeGames,
|
||||
runtime.gmEventsOldestUnprocessedAge,
|
||||
runtime.runtimeResultsOldestUnprocessedAge,
|
||||
runtime.userLifecycleOldestUnprocessedAge,
|
||||
)
|
||||
if err != nil {
|
||||
return fmt.Errorf("register lobby gauges: %w", err)
|
||||
}
|
||||
runtime.gaugeRegistration = registration
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func buildRuntime(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider, shutdownFns []func(context.Context) error) (*Runtime, error) {
|
||||
meter := meterProvider.Meter(meterName)
|
||||
runtime := &Runtime{
|
||||
tracerProvider: tracerProvider,
|
||||
meterProvider: meterProvider,
|
||||
meter: meter,
|
||||
shutdownFns: append([]func(context.Context) error(nil), shutdownFns...),
|
||||
}
|
||||
|
||||
publicHTTPRequests, err := meter.Int64Counter("lobby.public_http.requests")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("build lobby telemetry runtime: public_http.requests: %w", err)
|
||||
}
|
||||
publicHTTPDuration, err := meter.Float64Histogram("lobby.public_http.duration", metric.WithUnit("ms"))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("build lobby telemetry runtime: public_http.duration: %w", err)
|
||||
}
|
||||
internalHTTPRequests, err := meter.Int64Counter("lobby.internal_http.requests")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("build lobby telemetry runtime: internal_http.requests: %w", err)
|
||||
}
|
||||
internalHTTPDuration, err := meter.Float64Histogram("lobby.internal_http.duration", metric.WithUnit("ms"))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("build lobby telemetry runtime: internal_http.duration: %w", err)
|
||||
}
|
||||
|
||||
runtime.publicHTTPRequests = publicHTTPRequests
|
||||
runtime.publicHTTPDuration = publicHTTPDuration
|
||||
runtime.internalHTTPRequests = internalHTTPRequests
|
||||
runtime.internalHTTPDuration = internalHTTPDuration
|
||||
|
||||
if err := registerLobbyCounters(meter, runtime); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if err := registerLobbyGauges(meter, runtime); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return runtime, nil
|
||||
}
|
||||
|
||||
func registerLobbyCounters(meter metric.Meter, runtime *Runtime) error {
|
||||
specs := []struct {
|
||||
name string
|
||||
target *metric.Int64Counter
|
||||
}{
|
||||
{"lobby.game.transitions", &runtime.gameTransitions},
|
||||
{"lobby.application.outcomes", &runtime.applicationOutcomes},
|
||||
{"lobby.invite.outcomes", &runtime.inviteOutcomes},
|
||||
{"lobby.membership.changes", &runtime.membershipChanges},
|
||||
{"lobby.start_flow.outcomes", &runtime.startFlowOutcomes},
|
||||
{"lobby.notification.publish_attempts", &runtime.notificationPublishAttempts},
|
||||
{"lobby.enrollment_automation.checks", &runtime.enrollmentAutomationChecks},
|
||||
{"lobby.race_name.outcomes", &runtime.raceNameOutcomes},
|
||||
{"lobby.pending_registration.expirations", &runtime.pendingRegistrationExpires},
|
||||
{"lobby.user_lifecycle.cascade_releases", &runtime.userLifecycleCascadeReleases},
|
||||
{"lobby.capability_evaluations", &runtime.capabilityEvaluations},
|
||||
}
|
||||
for _, spec := range specs {
|
||||
counter, err := meter.Int64Counter(spec.name)
|
||||
if err != nil {
|
||||
return fmt.Errorf("build lobby telemetry runtime: %s: %w", spec.name, err)
|
||||
}
|
||||
*spec.target = counter
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func registerLobbyGauges(meter metric.Meter, runtime *Runtime) error {
|
||||
gauges := []struct {
|
||||
name string
|
||||
unit string
|
||||
target *metric.Int64ObservableGauge
|
||||
}{
|
||||
{"lobby.active_games", "", &runtime.activeGames},
|
||||
{"lobby.gm_events.oldest_unprocessed_age_ms", "ms", &runtime.gmEventsOldestUnprocessedAge},
|
||||
{"lobby.runtime_results.oldest_unprocessed_age_ms", "ms", &runtime.runtimeResultsOldestUnprocessedAge},
|
||||
{"lobby.user_lifecycle.oldest_unprocessed_age_ms", "ms", &runtime.userLifecycleOldestUnprocessedAge},
|
||||
}
|
||||
for _, spec := range gauges {
|
||||
options := []metric.Int64ObservableGaugeOption{}
|
||||
if spec.unit != "" {
|
||||
options = append(options, metric.WithUnit(spec.unit))
|
||||
}
|
||||
gauge, err := meter.Int64ObservableGauge(spec.name, options...)
|
||||
if err != nil {
|
||||
return fmt.Errorf("build lobby telemetry runtime: %s: %w", spec.name, err)
|
||||
}
|
||||
*spec.target = gauge
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func newTracerProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig) (*sdktrace.TracerProvider, error) {
|
||||
options := []sdktrace.TracerProviderOption{
|
||||
sdktrace.WithResource(res),
|
||||
}
|
||||
|
||||
if exporter, err := traceExporter(ctx, cfg); err != nil {
|
||||
return nil, err
|
||||
} else if exporter != nil {
|
||||
options = append(options, sdktrace.WithBatcher(exporter))
|
||||
}
|
||||
|
||||
if cfg.StdoutTracesEnabled {
|
||||
exporter, err := stdouttrace.New(stdouttrace.WithWriter(os.Stdout))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("stdout traces exporter: %w", err)
|
||||
}
|
||||
options = append(options, sdktrace.WithBatcher(exporter))
|
||||
}
|
||||
|
||||
return sdktrace.NewTracerProvider(options...), nil
|
||||
}
|
||||
|
||||
func newMeterProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig) (*sdkmetric.MeterProvider, error) {
|
||||
options := []sdkmetric.Option{
|
||||
sdkmetric.WithResource(res),
|
||||
}
|
||||
|
||||
if exporter, err := metricExporter(ctx, cfg); err != nil {
|
||||
return nil, err
|
||||
} else if exporter != nil {
|
||||
options = append(options, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter)))
|
||||
}
|
||||
|
||||
if cfg.StdoutMetricsEnabled {
|
||||
exporter, err := stdoutmetric.New(stdoutmetric.WithWriter(os.Stdout))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("stdout metrics exporter: %w", err)
|
||||
}
|
||||
options = append(options, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter)))
|
||||
}
|
||||
|
||||
return sdkmetric.NewMeterProvider(options...), nil
|
||||
}
|
||||
|
||||
func traceExporter(ctx context.Context, cfg ProcessConfig) (sdktrace.SpanExporter, error) {
|
||||
if cfg.TracesExporter != processExporterOTLP {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
switch normalizeProtocol(cfg.TracesProtocol) {
|
||||
case processProtocolGRPC:
|
||||
exporter, err := otlptracegrpc.New(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("otlp grpc traces exporter: %w", err)
|
||||
}
|
||||
return exporter, nil
|
||||
default:
|
||||
exporter, err := otlptracehttp.New(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("otlp http traces exporter: %w", err)
|
||||
}
|
||||
return exporter, nil
|
||||
}
|
||||
}
|
||||
|
||||
func metricExporter(ctx context.Context, cfg ProcessConfig) (sdkmetric.Exporter, error) {
|
||||
if cfg.MetricsExporter != processExporterOTLP {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
switch normalizeProtocol(cfg.MetricsProtocol) {
|
||||
case processProtocolGRPC:
|
||||
exporter, err := otlpmetricgrpc.New(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("otlp grpc metrics exporter: %w", err)
|
||||
}
|
||||
return exporter, nil
|
||||
default:
|
||||
exporter, err := otlpmetrichttp.New(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("otlp http metrics exporter: %w", err)
|
||||
}
|
||||
return exporter, nil
|
||||
}
|
||||
}
|
||||
|
||||
func normalizeProtocol(value string) string {
|
||||
switch strings.TrimSpace(value) {
|
||||
case processProtocolGRPC:
|
||||
return processProtocolGRPC
|
||||
default:
|
||||
return processProtocolHTTPProtobuf
|
||||
}
|
||||
}
|
||||
|
||||
// normalizeContext returns ctx unchanged, or context.Background when ctx is
// nil, so instruments never receive a nil context.
func normalizeContext(ctx context.Context) context.Context {
	if ctx != nil {
		return ctx
	}

	return context.Background()
}
|
||||
Reference in New Issue
Block a user