feat: gamemaster
This commit is contained in:
@@ -0,0 +1,721 @@
|
||||
// Package telemetry provides lightweight OpenTelemetry helpers and
|
||||
// low-cardinality Game Master instruments used by the runnable skeleton.
|
||||
// Later stages emit into the instruments declared here without touching
|
||||
// this package.
|
||||
package telemetry
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"go.opentelemetry.io/otel"
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
|
||||
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp"
|
||||
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
|
||||
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp"
|
||||
"go.opentelemetry.io/otel/exporters/stdout/stdoutmetric"
|
||||
"go.opentelemetry.io/otel/exporters/stdout/stdouttrace"
|
||||
"go.opentelemetry.io/otel/metric"
|
||||
"go.opentelemetry.io/otel/propagation"
|
||||
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
|
||||
"go.opentelemetry.io/otel/sdk/resource"
|
||||
sdktrace "go.opentelemetry.io/otel/sdk/trace"
|
||||
oteltrace "go.opentelemetry.io/otel/trace"
|
||||
)
|
||||
|
||||
// meterName is the instrumentation scope name under which every Game
// Master instrument is created.
const meterName = "galaxy/gamemaster"

// Defaults and accepted configuration values for the process-wide
// telemetry runtime.
const (
	// defaultServiceName is used when ProcessConfig.ServiceName is blank.
	defaultServiceName = "galaxy-gamemaster"

	// Exporter selectors accepted by ProcessConfig.Validate.
	processExporterNone = "none"
	processExporterOTLP = "otlp"

	// OTLP transport protocols accepted by ProcessConfig.Validate.
	processProtocolHTTPProtobuf = "http/protobuf"
	processProtocolGRPC         = "grpc"
)
|
||||
|
||||
// ProcessConfig describes how the process-wide OpenTelemetry runtime is
// assembled by NewProcess.
type ProcessConfig struct {
	// ServiceName is reported as the `service.name` resource attribute.
	// A blank value falls back to the package default.
	ServiceName string

	// TracesExporter picks the external traces exporter: `none` or
	// `otlp`.
	TracesExporter string

	// MetricsExporter picks the external metrics exporter: `none` or
	// `otlp`.
	MetricsExporter string

	// TracesProtocol picks the OTLP transport for traces and is only
	// consulted when TracesExporter is `otlp`.
	TracesProtocol string

	// MetricsProtocol picks the OTLP transport for metrics and is only
	// consulted when MetricsExporter is `otlp`.
	MetricsProtocol string

	// StdoutTracesEnabled adds a stdout trace exporter next to the
	// configured one; intended for local development and debugging.
	StdoutTracesEnabled bool

	// StdoutMetricsEnabled adds a stdout metric exporter next to the
	// configured one; intended for local development and debugging.
	StdoutMetricsEnabled bool
}
|
||||
|
||||
// Validate reports whether cfg contains a supported OpenTelemetry exporter
|
||||
// configuration.
|
||||
func (cfg ProcessConfig) Validate() error {
|
||||
switch cfg.TracesExporter {
|
||||
case processExporterNone, processExporterOTLP:
|
||||
default:
|
||||
return fmt.Errorf("unsupported traces exporter %q", cfg.TracesExporter)
|
||||
}
|
||||
|
||||
switch cfg.MetricsExporter {
|
||||
case processExporterNone, processExporterOTLP:
|
||||
default:
|
||||
return fmt.Errorf("unsupported metrics exporter %q", cfg.MetricsExporter)
|
||||
}
|
||||
|
||||
if cfg.TracesProtocol != "" && cfg.TracesProtocol != processProtocolHTTPProtobuf && cfg.TracesProtocol != processProtocolGRPC {
|
||||
return fmt.Errorf("unsupported OTLP traces protocol %q", cfg.TracesProtocol)
|
||||
}
|
||||
if cfg.MetricsProtocol != "" && cfg.MetricsProtocol != processProtocolHTTPProtobuf && cfg.MetricsProtocol != processProtocolGRPC {
|
||||
return fmt.Errorf("unsupported OTLP metrics protocol %q", cfg.MetricsProtocol)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Runtime owns the Game Master OpenTelemetry providers and the
|
||||
// low-cardinality custom instruments listed in `gamemaster/README.md`
|
||||
// §Observability.
|
||||
type Runtime struct {
|
||||
tracerProvider oteltrace.TracerProvider
|
||||
meterProvider metric.MeterProvider
|
||||
meter metric.Meter
|
||||
|
||||
shutdownMu sync.Mutex
|
||||
shutdownDone bool
|
||||
shutdownErr error
|
||||
shutdownFns []func(context.Context) error
|
||||
|
||||
internalHTTPRequests metric.Int64Counter
|
||||
internalHTTPDuration metric.Float64Histogram
|
||||
|
||||
registerRuntimeOutcomes metric.Int64Counter
|
||||
turnGenerationOutcomes metric.Int64Counter
|
||||
commandExecuteOutcomes metric.Int64Counter
|
||||
orderPutOutcomes metric.Int64Counter
|
||||
reportGetOutcomes metric.Int64Counter
|
||||
banishOutcomes metric.Int64Counter
|
||||
healthEventsConsumed metric.Int64Counter
|
||||
lobbyEventsPublished metric.Int64Counter
|
||||
notificationPublishAttempts metric.Int64Counter
|
||||
membershipCacheHits metric.Int64Counter
|
||||
engineCallLatency metric.Float64Histogram
|
||||
|
||||
runtimeRecordsByStatus metric.Int64ObservableGauge
|
||||
schedulerDueGames metric.Int64ObservableGauge
|
||||
engineVersionsTotal metric.Int64ObservableGauge
|
||||
|
||||
gaugeMu sync.Mutex
|
||||
gaugeRegistration metric.Registration
|
||||
}
|
||||
|
||||
// NewWithProviders constructs a telemetry runtime around explicitly supplied
|
||||
// meterProvider and tracerProvider values.
|
||||
func NewWithProviders(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider) (*Runtime, error) {
|
||||
if meterProvider == nil {
|
||||
meterProvider = otel.GetMeterProvider()
|
||||
}
|
||||
if tracerProvider == nil {
|
||||
tracerProvider = otel.GetTracerProvider()
|
||||
}
|
||||
if meterProvider == nil {
|
||||
return nil, errors.New("new gamemaster telemetry runtime: nil meter provider")
|
||||
}
|
||||
if tracerProvider == nil {
|
||||
return nil, errors.New("new gamemaster telemetry runtime: nil tracer provider")
|
||||
}
|
||||
|
||||
return buildRuntime(meterProvider, tracerProvider, nil)
|
||||
}
|
||||
|
||||
// NewProcess constructs the process-wide Game Master OpenTelemetry runtime
|
||||
// from cfg, installs the resulting providers globally, and returns the
|
||||
// runtime.
|
||||
func NewProcess(ctx context.Context, cfg ProcessConfig, logger *slog.Logger) (*Runtime, error) {
|
||||
if ctx == nil {
|
||||
return nil, errors.New("new gamemaster telemetry process: nil context")
|
||||
}
|
||||
if err := cfg.Validate(); err != nil {
|
||||
return nil, fmt.Errorf("new gamemaster telemetry process: %w", err)
|
||||
}
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
|
||||
serviceName := strings.TrimSpace(cfg.ServiceName)
|
||||
if serviceName == "" {
|
||||
serviceName = defaultServiceName
|
||||
}
|
||||
|
||||
res := resource.NewSchemaless(attribute.String("service.name", serviceName))
|
||||
|
||||
tracerProvider, err := newTracerProvider(ctx, res, cfg)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("new gamemaster telemetry process: tracer provider: %w", err)
|
||||
}
|
||||
meterProvider, err := newMeterProvider(ctx, res, cfg)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("new gamemaster telemetry process: meter provider: %w", err)
|
||||
}
|
||||
|
||||
otel.SetTracerProvider(tracerProvider)
|
||||
otel.SetMeterProvider(meterProvider)
|
||||
otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
|
||||
propagation.TraceContext{},
|
||||
propagation.Baggage{},
|
||||
))
|
||||
|
||||
runtime, err := buildRuntime(meterProvider, tracerProvider, []func(context.Context) error{
|
||||
meterProvider.Shutdown,
|
||||
tracerProvider.Shutdown,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("new gamemaster telemetry process: runtime: %w", err)
|
||||
}
|
||||
|
||||
logger.Info("gamemaster telemetry configured",
|
||||
"service_name", serviceName,
|
||||
"traces_exporter", cfg.TracesExporter,
|
||||
"metrics_exporter", cfg.MetricsExporter,
|
||||
)
|
||||
|
||||
return runtime, nil
|
||||
}
|
||||
|
||||
// TracerProvider returns the runtime tracer provider.
|
||||
func (runtime *Runtime) TracerProvider() oteltrace.TracerProvider {
|
||||
if runtime == nil || runtime.tracerProvider == nil {
|
||||
return otel.GetTracerProvider()
|
||||
}
|
||||
|
||||
return runtime.tracerProvider
|
||||
}
|
||||
|
||||
// MeterProvider returns the runtime meter provider.
|
||||
func (runtime *Runtime) MeterProvider() metric.MeterProvider {
|
||||
if runtime == nil || runtime.meterProvider == nil {
|
||||
return otel.GetMeterProvider()
|
||||
}
|
||||
|
||||
return runtime.meterProvider
|
||||
}
|
||||
|
||||
// Shutdown flushes and stops the configured telemetry providers. Shutdown
|
||||
// is idempotent.
|
||||
func (runtime *Runtime) Shutdown(ctx context.Context) error {
|
||||
if runtime == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
runtime.shutdownMu.Lock()
|
||||
if runtime.shutdownDone {
|
||||
err := runtime.shutdownErr
|
||||
runtime.shutdownMu.Unlock()
|
||||
return err
|
||||
}
|
||||
runtime.shutdownDone = true
|
||||
runtime.shutdownMu.Unlock()
|
||||
|
||||
runtime.gaugeMu.Lock()
|
||||
if runtime.gaugeRegistration != nil {
|
||||
_ = runtime.gaugeRegistration.Unregister()
|
||||
runtime.gaugeRegistration = nil
|
||||
}
|
||||
runtime.gaugeMu.Unlock()
|
||||
|
||||
var shutdownErr error
|
||||
for index := len(runtime.shutdownFns) - 1; index >= 0; index-- {
|
||||
shutdownErr = errors.Join(shutdownErr, runtime.shutdownFns[index](ctx))
|
||||
}
|
||||
|
||||
runtime.shutdownMu.Lock()
|
||||
runtime.shutdownErr = shutdownErr
|
||||
runtime.shutdownMu.Unlock()
|
||||
|
||||
return shutdownErr
|
||||
}
|
||||
|
||||
// RecordInternalHTTPRequest records one internal HTTP request outcome.
|
||||
func (runtime *Runtime) RecordInternalHTTPRequest(ctx context.Context, attrs []attribute.KeyValue, duration time.Duration) {
|
||||
if runtime == nil {
|
||||
return
|
||||
}
|
||||
|
||||
options := metric.WithAttributes(attrs...)
|
||||
runtime.internalHTTPRequests.Add(normalizeContext(ctx), 1, options)
|
||||
runtime.internalHTTPDuration.Record(normalizeContext(ctx), duration.Seconds()*1000, options)
|
||||
}
|
||||
|
||||
// RecordRegisterRuntimeOutcome records one terminal outcome of the
|
||||
// register-runtime operation.
|
||||
func (runtime *Runtime) RecordRegisterRuntimeOutcome(ctx context.Context, outcome, errorCode string) {
|
||||
if runtime == nil || runtime.registerRuntimeOutcomes == nil {
|
||||
return
|
||||
}
|
||||
runtime.registerRuntimeOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
|
||||
attribute.String("outcome", outcome),
|
||||
attribute.String("error_code", errorCode),
|
||||
))
|
||||
}
|
||||
|
||||
// RecordTurnGenerationOutcome records one terminal outcome of a turn
|
||||
// generation. trigger is `scheduler` or `force`.
|
||||
func (runtime *Runtime) RecordTurnGenerationOutcome(ctx context.Context, outcome, errorCode, trigger string) {
|
||||
if runtime == nil || runtime.turnGenerationOutcomes == nil {
|
||||
return
|
||||
}
|
||||
runtime.turnGenerationOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
|
||||
attribute.String("outcome", outcome),
|
||||
attribute.String("error_code", errorCode),
|
||||
attribute.String("trigger", trigger),
|
||||
))
|
||||
}
|
||||
|
||||
// RecordCommandExecuteOutcome records one terminal outcome of a command
|
||||
// execute call.
|
||||
func (runtime *Runtime) RecordCommandExecuteOutcome(ctx context.Context, outcome, errorCode string) {
|
||||
if runtime == nil || runtime.commandExecuteOutcomes == nil {
|
||||
return
|
||||
}
|
||||
runtime.commandExecuteOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
|
||||
attribute.String("outcome", outcome),
|
||||
attribute.String("error_code", errorCode),
|
||||
))
|
||||
}
|
||||
|
||||
// RecordOrderPutOutcome records one terminal outcome of an order put call.
|
||||
func (runtime *Runtime) RecordOrderPutOutcome(ctx context.Context, outcome, errorCode string) {
|
||||
if runtime == nil || runtime.orderPutOutcomes == nil {
|
||||
return
|
||||
}
|
||||
runtime.orderPutOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
|
||||
attribute.String("outcome", outcome),
|
||||
attribute.String("error_code", errorCode),
|
||||
))
|
||||
}
|
||||
|
||||
// RecordReportGetOutcome records one terminal outcome of a report get
|
||||
// call.
|
||||
func (runtime *Runtime) RecordReportGetOutcome(ctx context.Context, outcome, errorCode string) {
|
||||
if runtime == nil || runtime.reportGetOutcomes == nil {
|
||||
return
|
||||
}
|
||||
runtime.reportGetOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
|
||||
attribute.String("outcome", outcome),
|
||||
attribute.String("error_code", errorCode),
|
||||
))
|
||||
}
|
||||
|
||||
// RecordBanishOutcome records one terminal outcome of a banish call.
|
||||
func (runtime *Runtime) RecordBanishOutcome(ctx context.Context, outcome, errorCode string) {
|
||||
if runtime == nil || runtime.banishOutcomes == nil {
|
||||
return
|
||||
}
|
||||
runtime.banishOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
|
||||
attribute.String("outcome", outcome),
|
||||
attribute.String("error_code", errorCode),
|
||||
))
|
||||
}
|
||||
|
||||
// RecordHealthEventConsumed records one consumed `runtime:health_events`
|
||||
// entry.
|
||||
func (runtime *Runtime) RecordHealthEventConsumed(ctx context.Context) {
|
||||
if runtime == nil || runtime.healthEventsConsumed == nil {
|
||||
return
|
||||
}
|
||||
runtime.healthEventsConsumed.Add(normalizeContext(ctx), 1)
|
||||
}
|
||||
|
||||
// RecordLobbyEventPublished records one publication on `gm:lobby_events`.
|
||||
// eventType is `runtime_snapshot_update` or `game_finished`.
|
||||
func (runtime *Runtime) RecordLobbyEventPublished(ctx context.Context, eventType string) {
|
||||
if runtime == nil || runtime.lobbyEventsPublished == nil {
|
||||
return
|
||||
}
|
||||
runtime.lobbyEventsPublished.Add(normalizeContext(ctx), 1, metric.WithAttributes(
|
||||
attribute.String("event_type", eventType),
|
||||
))
|
||||
}
|
||||
|
||||
// RecordNotificationPublishAttempt records one publication attempt to
|
||||
// `notification:intents`. result is `ok` or `error`.
|
||||
func (runtime *Runtime) RecordNotificationPublishAttempt(ctx context.Context, notificationType, result string) {
|
||||
if runtime == nil || runtime.notificationPublishAttempts == nil {
|
||||
return
|
||||
}
|
||||
runtime.notificationPublishAttempts.Add(normalizeContext(ctx), 1, metric.WithAttributes(
|
||||
attribute.String("notification_type", notificationType),
|
||||
attribute.String("result", result),
|
||||
))
|
||||
}
|
||||
|
||||
// RecordMembershipCacheResult records one membership cache lookup outcome.
|
||||
// result is `hit`, `miss`, or `invalidate`.
|
||||
func (runtime *Runtime) RecordMembershipCacheResult(ctx context.Context, result string) {
|
||||
if runtime == nil || runtime.membershipCacheHits == nil {
|
||||
return
|
||||
}
|
||||
runtime.membershipCacheHits.Add(normalizeContext(ctx), 1, metric.WithAttributes(
|
||||
attribute.String("result", result),
|
||||
))
|
||||
}
|
||||
|
||||
// RecordEngineCall records the wall-clock duration of one engine HTTP
|
||||
// call. op is one of `init`, `status`, `turn`, `banish`, `command`,
|
||||
// `order`, `report`.
|
||||
func (runtime *Runtime) RecordEngineCall(ctx context.Context, op string, duration time.Duration) {
|
||||
if runtime == nil || runtime.engineCallLatency == nil {
|
||||
return
|
||||
}
|
||||
runtime.engineCallLatency.Record(normalizeContext(ctx), duration.Seconds()*1000, metric.WithAttributes(
|
||||
attribute.String("op", op),
|
||||
))
|
||||
}
|
||||
|
||||
// RuntimeRecordsByStatusProbe supplies the number of `runtime_records`
// rows for each status. In production it wraps the runtime record store;
// tests may substitute a stub.
type RuntimeRecordsByStatusProbe interface {
	// CountByStatus returns the row count keyed by status.
	CountByStatus(ctx context.Context) (map[string]int, error)
}
|
||||
|
||||
// SchedulerDueGamesProbe supplies the number of runtime records that are
// currently due for a scheduler-driven turn generation.
type SchedulerDueGamesProbe interface {
	// CountDue returns the number of due runtime records.
	CountDue(ctx context.Context) (int, error)
}
|
||||
|
||||
// EngineVersionsTotalProbe supplies the number of registered
// engine_versions rows.
type EngineVersionsTotalProbe interface {
	// CountVersions returns the number of engine_versions rows.
	CountVersions(ctx context.Context) (int, error)
}
|
||||
|
||||
// GaugeDependencies groups the collaborators required by RegisterGauges.
|
||||
type GaugeDependencies struct {
|
||||
// RuntimeRecordsByStatus probes the per-status row count for
|
||||
// `gamemaster.runtime_records_by_status`.
|
||||
RuntimeRecordsByStatus RuntimeRecordsByStatusProbe
|
||||
|
||||
// SchedulerDueGames probes the due-now count for
|
||||
// `gamemaster.scheduler.due_games`.
|
||||
SchedulerDueGames SchedulerDueGamesProbe
|
||||
|
||||
// EngineVersionsTotal probes the engine_versions row count for
|
||||
// `gamemaster.engine_versions_total`.
|
||||
EngineVersionsTotal EngineVersionsTotalProbe
|
||||
|
||||
// Logger records non-fatal probe errors. Defaults to slog.Default
|
||||
// when nil.
|
||||
Logger *slog.Logger
|
||||
}
|
||||
|
||||
// RegisterGauges installs the observable-gauge callback that reports
|
||||
// `gamemaster.runtime_records_by_status`,
|
||||
// `gamemaster.scheduler.due_games`, and
|
||||
// `gamemaster.engine_versions_total`. It is safe to call once per
|
||||
// Runtime; a second call replaces the previous registration. The runtime
|
||||
// keeps no strong reference to deps beyond the callback closure.
|
||||
//
|
||||
// The wiring layer registers the gauges once the persistence adapters
|
||||
// and scheduler probe are constructed.
|
||||
func (runtime *Runtime) RegisterGauges(deps GaugeDependencies) error {
|
||||
if runtime == nil {
|
||||
return errors.New("register gamemaster gauges: nil runtime")
|
||||
}
|
||||
if deps.RuntimeRecordsByStatus == nil {
|
||||
return errors.New("register gamemaster gauges: nil runtime records probe")
|
||||
}
|
||||
if deps.SchedulerDueGames == nil {
|
||||
return errors.New("register gamemaster gauges: nil scheduler probe")
|
||||
}
|
||||
if deps.EngineVersionsTotal == nil {
|
||||
return errors.New("register gamemaster gauges: nil engine versions probe")
|
||||
}
|
||||
|
||||
logger := deps.Logger
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
|
||||
runtime.gaugeMu.Lock()
|
||||
defer runtime.gaugeMu.Unlock()
|
||||
|
||||
if runtime.gaugeRegistration != nil {
|
||||
_ = runtime.gaugeRegistration.Unregister()
|
||||
runtime.gaugeRegistration = nil
|
||||
}
|
||||
|
||||
callback := func(ctx context.Context, observer metric.Observer) error {
|
||||
if counts, err := deps.RuntimeRecordsByStatus.CountByStatus(ctx); err != nil {
|
||||
logger.WarnContext(ctx, "runtime records probe failed",
|
||||
"err", err.Error(),
|
||||
)
|
||||
} else {
|
||||
for status, count := range counts {
|
||||
observer.ObserveInt64(runtime.runtimeRecordsByStatus, int64(count), metric.WithAttributes(
|
||||
attribute.String("status", status),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
if due, err := deps.SchedulerDueGames.CountDue(ctx); err != nil {
|
||||
logger.WarnContext(ctx, "scheduler due games probe failed",
|
||||
"err", err.Error(),
|
||||
)
|
||||
} else {
|
||||
observer.ObserveInt64(runtime.schedulerDueGames, int64(due))
|
||||
}
|
||||
|
||||
if versions, err := deps.EngineVersionsTotal.CountVersions(ctx); err != nil {
|
||||
logger.WarnContext(ctx, "engine versions probe failed",
|
||||
"err", err.Error(),
|
||||
)
|
||||
} else {
|
||||
observer.ObserveInt64(runtime.engineVersionsTotal, int64(versions))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
registration, err := runtime.meter.RegisterCallback(callback,
|
||||
runtime.runtimeRecordsByStatus,
|
||||
runtime.schedulerDueGames,
|
||||
runtime.engineVersionsTotal,
|
||||
)
|
||||
if err != nil {
|
||||
return fmt.Errorf("register gamemaster gauges: %w", err)
|
||||
}
|
||||
runtime.gaugeRegistration = registration
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func buildRuntime(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider, shutdownFns []func(context.Context) error) (*Runtime, error) {
|
||||
meter := meterProvider.Meter(meterName)
|
||||
runtime := &Runtime{
|
||||
tracerProvider: tracerProvider,
|
||||
meterProvider: meterProvider,
|
||||
meter: meter,
|
||||
shutdownFns: append([]func(context.Context) error(nil), shutdownFns...),
|
||||
}
|
||||
|
||||
internalHTTPRequests, err := meter.Int64Counter("gamemaster.internal_http.requests")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("build gamemaster telemetry runtime: internal_http.requests: %w", err)
|
||||
}
|
||||
internalHTTPDuration, err := meter.Float64Histogram("gamemaster.internal_http.duration", metric.WithUnit("ms"))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("build gamemaster telemetry runtime: internal_http.duration: %w", err)
|
||||
}
|
||||
runtime.internalHTTPRequests = internalHTTPRequests
|
||||
runtime.internalHTTPDuration = internalHTTPDuration
|
||||
|
||||
if err := registerCounters(meter, runtime); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if err := registerHistograms(meter, runtime); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if err := registerObservableGauges(meter, runtime); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return runtime, nil
|
||||
}
|
||||
|
||||
func registerCounters(meter metric.Meter, runtime *Runtime) error {
|
||||
specs := []struct {
|
||||
name string
|
||||
target *metric.Int64Counter
|
||||
}{
|
||||
{"gamemaster.register_runtime.outcomes", &runtime.registerRuntimeOutcomes},
|
||||
{"gamemaster.turn_generation.outcomes", &runtime.turnGenerationOutcomes},
|
||||
{"gamemaster.command_execute.outcomes", &runtime.commandExecuteOutcomes},
|
||||
{"gamemaster.order_put.outcomes", &runtime.orderPutOutcomes},
|
||||
{"gamemaster.report_get.outcomes", &runtime.reportGetOutcomes},
|
||||
{"gamemaster.banish.outcomes", &runtime.banishOutcomes},
|
||||
{"gamemaster.health_events.consumed", &runtime.healthEventsConsumed},
|
||||
{"gamemaster.lobby_events.published", &runtime.lobbyEventsPublished},
|
||||
{"gamemaster.notification.publish_attempts", &runtime.notificationPublishAttempts},
|
||||
{"gamemaster.membership_cache.hits", &runtime.membershipCacheHits},
|
||||
}
|
||||
for _, spec := range specs {
|
||||
counter, err := meter.Int64Counter(spec.name)
|
||||
if err != nil {
|
||||
return fmt.Errorf("build gamemaster telemetry runtime: %s: %w", spec.name, err)
|
||||
}
|
||||
*spec.target = counter
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func registerHistograms(meter metric.Meter, runtime *Runtime) error {
|
||||
specs := []struct {
|
||||
name string
|
||||
unit string
|
||||
target *metric.Float64Histogram
|
||||
}{
|
||||
{"gamemaster.engine_call.latency", "ms", &runtime.engineCallLatency},
|
||||
}
|
||||
for _, spec := range specs {
|
||||
options := []metric.Float64HistogramOption{}
|
||||
if spec.unit != "" {
|
||||
options = append(options, metric.WithUnit(spec.unit))
|
||||
}
|
||||
histogram, err := meter.Float64Histogram(spec.name, options...)
|
||||
if err != nil {
|
||||
return fmt.Errorf("build gamemaster telemetry runtime: %s: %w", spec.name, err)
|
||||
}
|
||||
*spec.target = histogram
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func registerObservableGauges(meter metric.Meter, runtime *Runtime) error {
|
||||
gauge, err := meter.Int64ObservableGauge("gamemaster.runtime_records_by_status")
|
||||
if err != nil {
|
||||
return fmt.Errorf("build gamemaster telemetry runtime: runtime_records_by_status: %w", err)
|
||||
}
|
||||
runtime.runtimeRecordsByStatus = gauge
|
||||
|
||||
due, err := meter.Int64ObservableGauge("gamemaster.scheduler.due_games")
|
||||
if err != nil {
|
||||
return fmt.Errorf("build gamemaster telemetry runtime: scheduler.due_games: %w", err)
|
||||
}
|
||||
runtime.schedulerDueGames = due
|
||||
|
||||
versions, err := meter.Int64ObservableGauge("gamemaster.engine_versions_total")
|
||||
if err != nil {
|
||||
return fmt.Errorf("build gamemaster telemetry runtime: engine_versions_total: %w", err)
|
||||
}
|
||||
runtime.engineVersionsTotal = versions
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func newTracerProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig) (*sdktrace.TracerProvider, error) {
|
||||
options := []sdktrace.TracerProviderOption{
|
||||
sdktrace.WithResource(res),
|
||||
}
|
||||
|
||||
if exporter, err := traceExporter(ctx, cfg); err != nil {
|
||||
return nil, err
|
||||
} else if exporter != nil {
|
||||
options = append(options, sdktrace.WithBatcher(exporter))
|
||||
}
|
||||
|
||||
if cfg.StdoutTracesEnabled {
|
||||
exporter, err := stdouttrace.New(stdouttrace.WithWriter(os.Stdout))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("stdout traces exporter: %w", err)
|
||||
}
|
||||
options = append(options, sdktrace.WithBatcher(exporter))
|
||||
}
|
||||
|
||||
return sdktrace.NewTracerProvider(options...), nil
|
||||
}
|
||||
|
||||
func newMeterProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig) (*sdkmetric.MeterProvider, error) {
|
||||
options := []sdkmetric.Option{
|
||||
sdkmetric.WithResource(res),
|
||||
}
|
||||
|
||||
if exporter, err := metricExporter(ctx, cfg); err != nil {
|
||||
return nil, err
|
||||
} else if exporter != nil {
|
||||
options = append(options, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter)))
|
||||
}
|
||||
|
||||
if cfg.StdoutMetricsEnabled {
|
||||
exporter, err := stdoutmetric.New(stdoutmetric.WithWriter(os.Stdout))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("stdout metrics exporter: %w", err)
|
||||
}
|
||||
options = append(options, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter)))
|
||||
}
|
||||
|
||||
return sdkmetric.NewMeterProvider(options...), nil
|
||||
}
|
||||
|
||||
func traceExporter(ctx context.Context, cfg ProcessConfig) (sdktrace.SpanExporter, error) {
|
||||
if cfg.TracesExporter != processExporterOTLP {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
switch normalizeProtocol(cfg.TracesProtocol) {
|
||||
case processProtocolGRPC:
|
||||
exporter, err := otlptracegrpc.New(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("otlp grpc traces exporter: %w", err)
|
||||
}
|
||||
return exporter, nil
|
||||
default:
|
||||
exporter, err := otlptracehttp.New(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("otlp http traces exporter: %w", err)
|
||||
}
|
||||
return exporter, nil
|
||||
}
|
||||
}
|
||||
|
||||
func metricExporter(ctx context.Context, cfg ProcessConfig) (sdkmetric.Exporter, error) {
|
||||
if cfg.MetricsExporter != processExporterOTLP {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
switch normalizeProtocol(cfg.MetricsProtocol) {
|
||||
case processProtocolGRPC:
|
||||
exporter, err := otlpmetricgrpc.New(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("otlp grpc metrics exporter: %w", err)
|
||||
}
|
||||
return exporter, nil
|
||||
default:
|
||||
exporter, err := otlpmetrichttp.New(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("otlp http metrics exporter: %w", err)
|
||||
}
|
||||
return exporter, nil
|
||||
}
|
||||
}
|
||||
|
||||
func normalizeProtocol(value string) string {
|
||||
switch strings.TrimSpace(value) {
|
||||
case processProtocolGRPC:
|
||||
return processProtocolGRPC
|
||||
default:
|
||||
return processProtocolHTTPProtobuf
|
||||
}
|
||||
}
|
||||
|
||||
// normalizeContext returns ctx unchanged when it is non-nil and
// context.Background otherwise, so instruments never receive a nil
// context.
func normalizeContext(ctx context.Context) context.Context {
	if ctx != nil {
		return ctx
	}

	return context.Background()
}
|
||||
@@ -0,0 +1,190 @@
|
||||
package telemetry
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
"go.opentelemetry.io/otel/sdk/metric"
|
||||
"go.opentelemetry.io/otel/sdk/metric/metricdata"
|
||||
)
|
||||
|
||||
func TestProcessConfigValidate(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
require.NoError(t, ProcessConfig{
|
||||
TracesExporter: "none",
|
||||
MetricsExporter: "none",
|
||||
}.Validate())
|
||||
|
||||
require.NoError(t, ProcessConfig{
|
||||
TracesExporter: "otlp",
|
||||
MetricsExporter: "otlp",
|
||||
TracesProtocol: "grpc",
|
||||
MetricsProtocol: "http/protobuf",
|
||||
}.Validate())
|
||||
|
||||
require.Error(t, ProcessConfig{
|
||||
TracesExporter: "stdout",
|
||||
MetricsExporter: "none",
|
||||
}.Validate())
|
||||
|
||||
require.Error(t, ProcessConfig{
|
||||
TracesExporter: "none",
|
||||
MetricsExporter: "kafka",
|
||||
}.Validate())
|
||||
|
||||
require.Error(t, ProcessConfig{
|
||||
TracesExporter: "otlp",
|
||||
MetricsExporter: "none",
|
||||
TracesProtocol: "thrift",
|
||||
}.Validate())
|
||||
}
|
||||
|
||||
func TestNewWithProvidersBuildsRuntime(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
reader := metric.NewManualReader()
|
||||
meterProvider := metric.NewMeterProvider(metric.WithReader(reader))
|
||||
|
||||
runtime, err := NewWithProviders(meterProvider, nil)
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, runtime)
|
||||
require.NotNil(t, runtime.MeterProvider())
|
||||
require.NotNil(t, runtime.TracerProvider())
|
||||
}
|
||||
|
||||
func TestRecordHelpersEmitInstruments(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
reader := metric.NewManualReader()
|
||||
meterProvider := metric.NewMeterProvider(metric.WithReader(reader))
|
||||
runtime, err := NewWithProviders(meterProvider, nil)
|
||||
require.NoError(t, err)
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
runtime.RecordInternalHTTPRequest(ctx, []attribute.KeyValue{
|
||||
attribute.String("route", "/healthz"),
|
||||
attribute.String("method", "GET"),
|
||||
attribute.String("status_code", "200"),
|
||||
}, 10*time.Millisecond)
|
||||
runtime.RecordRegisterRuntimeOutcome(ctx, "success", "")
|
||||
runtime.RecordTurnGenerationOutcome(ctx, "success", "", "scheduler")
|
||||
runtime.RecordCommandExecuteOutcome(ctx, "success", "")
|
||||
runtime.RecordOrderPutOutcome(ctx, "success", "")
|
||||
runtime.RecordReportGetOutcome(ctx, "success", "")
|
||||
runtime.RecordBanishOutcome(ctx, "success", "")
|
||||
runtime.RecordHealthEventConsumed(ctx)
|
||||
runtime.RecordLobbyEventPublished(ctx, "runtime_snapshot_update")
|
||||
runtime.RecordNotificationPublishAttempt(ctx, "game.turn.ready", "ok")
|
||||
runtime.RecordMembershipCacheResult(ctx, "hit")
|
||||
runtime.RecordEngineCall(ctx, "init", 25*time.Millisecond)
|
||||
|
||||
var rm metricdata.ResourceMetrics
|
||||
require.NoError(t, reader.Collect(ctx, &rm))
|
||||
|
||||
names := collectInstrumentNames(rm)
|
||||
expected := []string{
|
||||
"gamemaster.internal_http.requests",
|
||||
"gamemaster.internal_http.duration",
|
||||
"gamemaster.register_runtime.outcomes",
|
||||
"gamemaster.turn_generation.outcomes",
|
||||
"gamemaster.command_execute.outcomes",
|
||||
"gamemaster.order_put.outcomes",
|
||||
"gamemaster.report_get.outcomes",
|
||||
"gamemaster.banish.outcomes",
|
||||
"gamemaster.health_events.consumed",
|
||||
"gamemaster.lobby_events.published",
|
||||
"gamemaster.notification.publish_attempts",
|
||||
"gamemaster.membership_cache.hits",
|
||||
"gamemaster.engine_call.latency",
|
||||
}
|
||||
for _, name := range expected {
|
||||
require.Contains(t, names, name, "expected instrument %s to be recorded", name)
|
||||
}
|
||||
}
|
||||
|
||||
func collectInstrumentNames(rm metricdata.ResourceMetrics) map[string]struct{} {
|
||||
names := make(map[string]struct{})
|
||||
for _, sm := range rm.ScopeMetrics {
|
||||
for _, m := range sm.Metrics {
|
||||
names[m.Name] = struct{}{}
|
||||
}
|
||||
}
|
||||
return names
|
||||
}
|
||||
|
||||
// stubRuntimeProbe is a canned RuntimeRecordsByStatusProbe for tests.
type stubRuntimeProbe struct {
	counts map[string]int
	err    error
}

// CountByStatus returns the configured counts and error, ignoring the
// context.
func (probe stubRuntimeProbe) CountByStatus(context.Context) (map[string]int, error) {
	counts, err := probe.counts, probe.err
	return counts, err
}
|
||||
|
||||
// stubSchedulerProbe is a canned SchedulerDueGamesProbe for tests.
type stubSchedulerProbe struct {
	due int
	err error
}

// CountDue returns the configured due count and error, ignoring the
// context.
func (probe stubSchedulerProbe) CountDue(context.Context) (int, error) {
	due, err := probe.due, probe.err
	return due, err
}
|
||||
|
||||
// stubVersionsProbe is a canned EngineVersionsTotalProbe for tests.
type stubVersionsProbe struct {
	count int
	err   error
}

// CountVersions returns the configured count and error, ignoring the
// context.
func (probe stubVersionsProbe) CountVersions(context.Context) (int, error) {
	count, err := probe.count, probe.err
	return count, err
}
|
||||
|
||||
func TestRegisterGaugesEmitsObservations(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
reader := metric.NewManualReader()
|
||||
meterProvider := metric.NewMeterProvider(metric.WithReader(reader))
|
||||
runtime, err := NewWithProviders(meterProvider, nil)
|
||||
require.NoError(t, err)
|
||||
|
||||
require.NoError(t, runtime.RegisterGauges(GaugeDependencies{
|
||||
RuntimeRecordsByStatus: stubRuntimeProbe{counts: map[string]int{"running": 3}},
|
||||
SchedulerDueGames: stubSchedulerProbe{due: 2},
|
||||
EngineVersionsTotal: stubVersionsProbe{count: 5},
|
||||
}))
|
||||
|
||||
var rm metricdata.ResourceMetrics
|
||||
require.NoError(t, reader.Collect(context.Background(), &rm))
|
||||
|
||||
names := collectInstrumentNames(rm)
|
||||
require.Contains(t, names, "gamemaster.runtime_records_by_status")
|
||||
require.Contains(t, names, "gamemaster.scheduler.due_games")
|
||||
require.Contains(t, names, "gamemaster.engine_versions_total")
|
||||
}
|
||||
|
||||
func TestRegisterGaugesRejectsNilDependencies(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
reader := metric.NewManualReader()
|
||||
meterProvider := metric.NewMeterProvider(metric.WithReader(reader))
|
||||
runtime, err := NewWithProviders(meterProvider, nil)
|
||||
require.NoError(t, err)
|
||||
|
||||
require.Error(t, runtime.RegisterGauges(GaugeDependencies{
|
||||
SchedulerDueGames: stubSchedulerProbe{},
|
||||
EngineVersionsTotal: stubVersionsProbe{},
|
||||
}))
|
||||
require.Error(t, runtime.RegisterGauges(GaugeDependencies{
|
||||
RuntimeRecordsByStatus: stubRuntimeProbe{},
|
||||
EngineVersionsTotal: stubVersionsProbe{},
|
||||
}))
|
||||
require.Error(t, runtime.RegisterGauges(GaugeDependencies{
|
||||
RuntimeRecordsByStatus: stubRuntimeProbe{},
|
||||
SchedulerDueGames: stubSchedulerProbe{},
|
||||
}))
|
||||
}
|
||||
Reference in New Issue
Block a user