feat: notification service
This commit is contained in:
@@ -0,0 +1,694 @@
|
||||
// Package telemetry provides lightweight OpenTelemetry helpers and
|
||||
// low-cardinality Notification Service instruments.
|
||||
package telemetry
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"go.opentelemetry.io/otel"
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
|
||||
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp"
|
||||
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
|
||||
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp"
|
||||
"go.opentelemetry.io/otel/exporters/stdout/stdoutmetric"
|
||||
"go.opentelemetry.io/otel/exporters/stdout/stdouttrace"
|
||||
"go.opentelemetry.io/otel/metric"
|
||||
"go.opentelemetry.io/otel/propagation"
|
||||
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
|
||||
"go.opentelemetry.io/otel/sdk/resource"
|
||||
sdktrace "go.opentelemetry.io/otel/sdk/trace"
|
||||
oteltrace "go.opentelemetry.io/otel/trace"
|
||||
)
|
||||
|
||||
// Identifiers shared by the telemetry runtime: the instrumentation scope
// name, the fallback service name, and the exporter and protocol values
// accepted by ProcessConfig.
const (
	// meterName is the instrumentation scope name used for every custom
	// instrument created by this package.
	meterName = "galaxy/notification"

	// defaultServiceName is applied when ProcessConfig.ServiceName is blank.
	defaultServiceName = "galaxy-notification"

	// Supported exporter selectors.
	processExporterNone = "none"
	processExporterOTLP = "otlp"

	// Supported OTLP transport protocols.
	processProtocolHTTPProtobuf = "http/protobuf"
	processProtocolGRPC         = "grpc"
)
|
||||
|
||||
// ProcessConfig describes how the process-wide OpenTelemetry runtime is
// wired: which service name is reported and which exporters are attached.
type ProcessConfig struct {
	// ServiceName is the reported OpenTelemetry service name. A blank value
	// falls back to the package default.
	ServiceName string

	// TracesExporter picks the external traces exporter: `none` or `otlp`.
	TracesExporter string

	// MetricsExporter picks the external metrics exporter: `none` or `otlp`.
	MetricsExporter string

	// TracesProtocol picks the OTLP transport for traces (`http/protobuf` or
	// `grpc`). It only matters when TracesExporter is `otlp`.
	TracesProtocol string

	// MetricsProtocol picks the OTLP transport for metrics (`http/protobuf`
	// or `grpc`). It only matters when MetricsExporter is `otlp`.
	MetricsProtocol string

	// StdoutTracesEnabled attaches an extra stdout trace exporter, intended
	// for local development and debugging.
	StdoutTracesEnabled bool

	// StdoutMetricsEnabled attaches an extra stdout metric exporter, intended
	// for local development and debugging.
	StdoutMetricsEnabled bool
}
|
||||
|
||||
// Validate reports whether cfg contains a supported OpenTelemetry exporter
|
||||
// configuration.
|
||||
func (cfg ProcessConfig) Validate() error {
|
||||
switch cfg.TracesExporter {
|
||||
case processExporterNone, processExporterOTLP:
|
||||
default:
|
||||
return fmt.Errorf("unsupported traces exporter %q", cfg.TracesExporter)
|
||||
}
|
||||
|
||||
switch cfg.MetricsExporter {
|
||||
case processExporterNone, processExporterOTLP:
|
||||
default:
|
||||
return fmt.Errorf("unsupported metrics exporter %q", cfg.MetricsExporter)
|
||||
}
|
||||
|
||||
if cfg.TracesProtocol != "" && cfg.TracesProtocol != processProtocolHTTPProtobuf && cfg.TracesProtocol != processProtocolGRPC {
|
||||
return fmt.Errorf("unsupported OTLP traces protocol %q", cfg.TracesProtocol)
|
||||
}
|
||||
if cfg.MetricsProtocol != "" && cfg.MetricsProtocol != processProtocolHTTPProtobuf && cfg.MetricsProtocol != processProtocolGRPC {
|
||||
return fmt.Errorf("unsupported OTLP metrics protocol %q", cfg.MetricsProtocol)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Runtime owns the Notification Service OpenTelemetry providers and
|
||||
// low-cardinality custom instruments.
|
||||
type Runtime struct {
|
||||
tracerProvider oteltrace.TracerProvider
|
||||
meterProvider metric.MeterProvider
|
||||
|
||||
shutdownMu sync.Mutex
|
||||
shutdownDone bool
|
||||
shutdownErr error
|
||||
shutdownFns []func(context.Context) error
|
||||
|
||||
routeScheduleReaderMu sync.RWMutex
|
||||
routeScheduleReader RouteScheduleSnapshotReader
|
||||
|
||||
intentStreamLagReaderMu sync.RWMutex
|
||||
intentStreamLagReader IntentStreamLagSnapshotReader
|
||||
|
||||
internalHTTPRequests metric.Int64Counter
|
||||
internalHTTPDuration metric.Float64Histogram
|
||||
internalHTTPLifecycle metric.Int64Counter
|
||||
intentOutcomes metric.Int64Counter
|
||||
malformedIntents metric.Int64Counter
|
||||
userEnrichment metric.Int64Counter
|
||||
routePublishAttempts metric.Int64Counter
|
||||
routeRetries metric.Int64Counter
|
||||
routeDeadLetters metric.Int64Counter
|
||||
}
|
||||
|
||||
// RouteScheduleSnapshot is a point-in-time view of the durable notification
// route schedule, used for observable gauge reporting.
type RouteScheduleSnapshot struct {
	// Depth is the number of route keys currently present in the route
	// schedule.
	Depth int64

	// OldestScheduledFor is the oldest currently scheduled due time, or nil
	// when there is none.
	OldestScheduledFor *time.Time
}
|
||||
|
||||
// RouteScheduleSnapshotReader loads one current route-schedule snapshot for
|
||||
// observable gauge reporting.
|
||||
type RouteScheduleSnapshotReader interface {
|
||||
// ReadRouteScheduleSnapshot returns the current route-schedule depth and
|
||||
// its oldest scheduled timestamp when one exists.
|
||||
ReadRouteScheduleSnapshot(context.Context) (RouteScheduleSnapshot, error)
|
||||
}
|
||||
|
||||
// IntentStreamLagSnapshot is a point-in-time view of how far the plain-XREAD
// notification-intent consumer lags behind the stream.
type IntentStreamLagSnapshot struct {
	// OldestUnprocessedAt is the Redis Stream timestamp of the oldest entry
	// that has not yet been durably processed, or nil when there is none.
	OldestUnprocessedAt *time.Time
}
|
||||
|
||||
// IntentStreamLagSnapshotReader loads one current intent-stream lag snapshot
|
||||
// for observable gauge reporting.
|
||||
type IntentStreamLagSnapshotReader interface {
|
||||
// ReadIntentStreamLagSnapshot returns the oldest unprocessed stream entry
|
||||
// timestamp when one exists.
|
||||
ReadIntentStreamLagSnapshot(context.Context) (IntentStreamLagSnapshot, error)
|
||||
}
|
||||
|
||||
// New constructs a lightweight telemetry runtime around meterProvider for
|
||||
// tests and embedded use cases that do not need process-level exporter wiring.
|
||||
func New(meterProvider metric.MeterProvider) (*Runtime, error) {
|
||||
return NewWithProviders(meterProvider, nil)
|
||||
}
|
||||
|
||||
// NewWithProviders constructs a telemetry runtime around explicitly supplied
|
||||
// meterProvider and tracerProvider values.
|
||||
func NewWithProviders(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider) (*Runtime, error) {
|
||||
if meterProvider == nil {
|
||||
meterProvider = otel.GetMeterProvider()
|
||||
}
|
||||
if tracerProvider == nil {
|
||||
tracerProvider = otel.GetTracerProvider()
|
||||
}
|
||||
if meterProvider == nil {
|
||||
return nil, errors.New("new notification telemetry runtime: nil meter provider")
|
||||
}
|
||||
if tracerProvider == nil {
|
||||
return nil, errors.New("new notification telemetry runtime: nil tracer provider")
|
||||
}
|
||||
|
||||
return buildRuntime(meterProvider, tracerProvider, nil)
|
||||
}
|
||||
|
||||
// NewProcess constructs the process-wide Notification Service OpenTelemetry
|
||||
// runtime from cfg, installs the resulting providers globally, and returns the
|
||||
// runtime.
|
||||
func NewProcess(ctx context.Context, cfg ProcessConfig, logger *slog.Logger) (*Runtime, error) {
|
||||
if ctx == nil {
|
||||
return nil, errors.New("new notification telemetry process: nil context")
|
||||
}
|
||||
if err := cfg.Validate(); err != nil {
|
||||
return nil, fmt.Errorf("new notification telemetry process: %w", err)
|
||||
}
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
|
||||
serviceName := strings.TrimSpace(cfg.ServiceName)
|
||||
if serviceName == "" {
|
||||
serviceName = defaultServiceName
|
||||
}
|
||||
|
||||
res := resource.NewSchemaless(attribute.String("service.name", serviceName))
|
||||
|
||||
tracerProvider, err := newTracerProvider(ctx, res, cfg)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("new notification telemetry process: tracer provider: %w", err)
|
||||
}
|
||||
meterProvider, err := newMeterProvider(ctx, res, cfg)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("new notification telemetry process: meter provider: %w", err)
|
||||
}
|
||||
|
||||
otel.SetTracerProvider(tracerProvider)
|
||||
otel.SetMeterProvider(meterProvider)
|
||||
otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
|
||||
propagation.TraceContext{},
|
||||
propagation.Baggage{},
|
||||
))
|
||||
|
||||
runtime, err := buildRuntime(meterProvider, tracerProvider, []func(context.Context) error{
|
||||
meterProvider.Shutdown,
|
||||
tracerProvider.Shutdown,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("new notification telemetry process: runtime: %w", err)
|
||||
}
|
||||
|
||||
logger.Info("notification telemetry configured",
|
||||
"service_name", serviceName,
|
||||
"traces_exporter", cfg.TracesExporter,
|
||||
"metrics_exporter", cfg.MetricsExporter,
|
||||
)
|
||||
|
||||
return runtime, nil
|
||||
}
|
||||
|
||||
// TracerProvider returns the runtime tracer provider.
|
||||
func (runtime *Runtime) TracerProvider() oteltrace.TracerProvider {
|
||||
if runtime == nil || runtime.tracerProvider == nil {
|
||||
return otel.GetTracerProvider()
|
||||
}
|
||||
|
||||
return runtime.tracerProvider
|
||||
}
|
||||
|
||||
// MeterProvider returns the runtime meter provider.
|
||||
func (runtime *Runtime) MeterProvider() metric.MeterProvider {
|
||||
if runtime == nil || runtime.meterProvider == nil {
|
||||
return otel.GetMeterProvider()
|
||||
}
|
||||
|
||||
return runtime.meterProvider
|
||||
}
|
||||
|
||||
// Shutdown flushes and stops the configured telemetry providers. Shutdown is
|
||||
// idempotent.
|
||||
func (runtime *Runtime) Shutdown(ctx context.Context) error {
|
||||
if runtime == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
runtime.shutdownMu.Lock()
|
||||
if runtime.shutdownDone {
|
||||
err := runtime.shutdownErr
|
||||
runtime.shutdownMu.Unlock()
|
||||
return err
|
||||
}
|
||||
runtime.shutdownDone = true
|
||||
runtime.shutdownMu.Unlock()
|
||||
|
||||
var shutdownErr error
|
||||
for index := len(runtime.shutdownFns) - 1; index >= 0; index-- {
|
||||
shutdownErr = errors.Join(shutdownErr, runtime.shutdownFns[index](ctx))
|
||||
}
|
||||
|
||||
runtime.shutdownMu.Lock()
|
||||
runtime.shutdownErr = shutdownErr
|
||||
runtime.shutdownMu.Unlock()
|
||||
|
||||
return shutdownErr
|
||||
}
|
||||
|
||||
// RecordInternalHTTPRequest records one internal HTTP request outcome.
|
||||
func (runtime *Runtime) RecordInternalHTTPRequest(ctx context.Context, attrs []attribute.KeyValue, duration time.Duration) {
|
||||
if runtime == nil {
|
||||
return
|
||||
}
|
||||
|
||||
options := metric.WithAttributes(attrs...)
|
||||
runtime.internalHTTPRequests.Add(normalizeContext(ctx), 1, options)
|
||||
runtime.internalHTTPDuration.Record(normalizeContext(ctx), duration.Seconds()*1000, options)
|
||||
}
|
||||
|
||||
// RecordInternalHTTPEvent records one internal HTTP server lifecycle event.
|
||||
func (runtime *Runtime) RecordInternalHTTPEvent(ctx context.Context, event string) {
|
||||
if runtime == nil {
|
||||
return
|
||||
}
|
||||
|
||||
runtime.internalHTTPLifecycle.Add(
|
||||
normalizeContext(ctx),
|
||||
1,
|
||||
metric.WithAttributes(attribute.String("event", strings.TrimSpace(event))),
|
||||
)
|
||||
}
|
||||
|
||||
// RecordIntentOutcome records one accepted notification-intent outcome.
|
||||
func (runtime *Runtime) RecordIntentOutcome(ctx context.Context, notificationType string, producer string, audienceKind string, outcome string) {
|
||||
if runtime == nil {
|
||||
return
|
||||
}
|
||||
|
||||
runtime.intentOutcomes.Add(
|
||||
normalizeContext(ctx),
|
||||
1,
|
||||
metric.WithAttributes(
|
||||
attribute.String("notification_type", cleanAttribute(notificationType, "unknown")),
|
||||
attribute.String("producer", cleanAttribute(producer, "unknown")),
|
||||
attribute.String("audience_kind", cleanAttribute(audienceKind, "unknown")),
|
||||
attribute.String("outcome", cleanAttribute(outcome, "unknown")),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
// RecordMalformedIntent records one malformed or rejected notification intent.
|
||||
func (runtime *Runtime) RecordMalformedIntent(ctx context.Context, failureCode string, notificationType string, producer string) {
|
||||
if runtime == nil {
|
||||
return
|
||||
}
|
||||
|
||||
runtime.malformedIntents.Add(
|
||||
normalizeContext(ctx),
|
||||
1,
|
||||
metric.WithAttributes(
|
||||
attribute.String("failure_code", cleanAttribute(failureCode, "unknown")),
|
||||
attribute.String("notification_type", cleanAttribute(notificationType, "unknown")),
|
||||
attribute.String("producer", cleanAttribute(producer, "unknown")),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
// RecordUserEnrichmentAttempt records one User Service enrichment lookup
|
||||
// outcome.
|
||||
func (runtime *Runtime) RecordUserEnrichmentAttempt(ctx context.Context, notificationType string, result string) {
|
||||
if runtime == nil {
|
||||
return
|
||||
}
|
||||
|
||||
runtime.userEnrichment.Add(
|
||||
normalizeContext(ctx),
|
||||
1,
|
||||
metric.WithAttributes(
|
||||
attribute.String("notification_type", cleanAttribute(notificationType, "unknown")),
|
||||
attribute.String("result", cleanAttribute(result, "unknown")),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
// RecordRoutePublishAttempt records one route publication attempt outcome.
|
||||
func (runtime *Runtime) RecordRoutePublishAttempt(ctx context.Context, channel string, notificationType string, result string, failureClassification string) {
|
||||
if runtime == nil {
|
||||
return
|
||||
}
|
||||
|
||||
runtime.routePublishAttempts.Add(
|
||||
normalizeContext(ctx),
|
||||
1,
|
||||
metric.WithAttributes(
|
||||
attribute.String("channel", cleanAttribute(channel, "unknown")),
|
||||
attribute.String("notification_type", cleanAttribute(notificationType, "unknown")),
|
||||
attribute.String("result", cleanAttribute(result, "unknown")),
|
||||
attribute.String("failure_classification", cleanAttribute(failureClassification, "none")),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
// RecordRouteRetry records one route retry scheduling event.
|
||||
func (runtime *Runtime) RecordRouteRetry(ctx context.Context, channel string, notificationType string) {
|
||||
if runtime == nil {
|
||||
return
|
||||
}
|
||||
|
||||
runtime.routeRetries.Add(
|
||||
normalizeContext(ctx),
|
||||
1,
|
||||
metric.WithAttributes(
|
||||
attribute.String("channel", cleanAttribute(channel, "unknown")),
|
||||
attribute.String("notification_type", cleanAttribute(notificationType, "unknown")),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
// RecordRouteDeadLetter records one route transition to dead_letter.
|
||||
func (runtime *Runtime) RecordRouteDeadLetter(ctx context.Context, channel string, notificationType string, failureClassification string) {
|
||||
if runtime == nil {
|
||||
return
|
||||
}
|
||||
|
||||
runtime.routeDeadLetters.Add(
|
||||
normalizeContext(ctx),
|
||||
1,
|
||||
metric.WithAttributes(
|
||||
attribute.String("channel", cleanAttribute(channel, "unknown")),
|
||||
attribute.String("notification_type", cleanAttribute(notificationType, "unknown")),
|
||||
attribute.String("failure_classification", cleanAttribute(failureClassification, "unknown")),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
// SetRouteScheduleSnapshotReader installs the route-schedule reader used by
|
||||
// the observable route schedule gauges.
|
||||
func (runtime *Runtime) SetRouteScheduleSnapshotReader(reader RouteScheduleSnapshotReader) {
|
||||
if runtime == nil {
|
||||
return
|
||||
}
|
||||
|
||||
runtime.routeScheduleReaderMu.Lock()
|
||||
runtime.routeScheduleReader = reader
|
||||
runtime.routeScheduleReaderMu.Unlock()
|
||||
}
|
||||
|
||||
// SetIntentStreamLagSnapshotReader installs the intent-stream lag reader used
|
||||
// by the observable lag gauge.
|
||||
func (runtime *Runtime) SetIntentStreamLagSnapshotReader(reader IntentStreamLagSnapshotReader) {
|
||||
if runtime == nil {
|
||||
return
|
||||
}
|
||||
|
||||
runtime.intentStreamLagReaderMu.Lock()
|
||||
runtime.intentStreamLagReader = reader
|
||||
runtime.intentStreamLagReaderMu.Unlock()
|
||||
}
|
||||
|
||||
func buildRuntime(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider, shutdownFns []func(context.Context) error) (*Runtime, error) {
|
||||
meter := meterProvider.Meter(meterName)
|
||||
runtime := &Runtime{
|
||||
tracerProvider: tracerProvider,
|
||||
meterProvider: meterProvider,
|
||||
shutdownFns: append([]func(context.Context) error(nil), shutdownFns...),
|
||||
}
|
||||
|
||||
internalHTTPRequests, err := meter.Int64Counter("notification.internal_http.requests")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("build notification telemetry runtime: internal_http.requests: %w", err)
|
||||
}
|
||||
internalHTTPDuration, err := meter.Float64Histogram("notification.internal_http.duration_ms", metric.WithUnit("ms"))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("build notification telemetry runtime: internal_http.duration_ms: %w", err)
|
||||
}
|
||||
internalHTTPLifecycle, err := meter.Int64Counter("notification.internal_http.lifecycle")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("build notification telemetry runtime: internal_http.lifecycle: %w", err)
|
||||
}
|
||||
intentOutcomes, err := meter.Int64Counter("notification.intent.outcomes")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("build notification telemetry runtime: intent.outcomes: %w", err)
|
||||
}
|
||||
malformedIntents, err := meter.Int64Counter("notification.intent.malformed")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("build notification telemetry runtime: intent.malformed: %w", err)
|
||||
}
|
||||
userEnrichment, err := meter.Int64Counter("notification.user_enrichment.attempts")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("build notification telemetry runtime: user_enrichment.attempts: %w", err)
|
||||
}
|
||||
routePublishAttempts, err := meter.Int64Counter("notification.route.publish_attempts")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("build notification telemetry runtime: route.publish_attempts: %w", err)
|
||||
}
|
||||
routeRetries, err := meter.Int64Counter("notification.route.retries")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("build notification telemetry runtime: route.retries: %w", err)
|
||||
}
|
||||
routeDeadLetters, err := meter.Int64Counter("notification.route.dead_letters")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("build notification telemetry runtime: route.dead_letters: %w", err)
|
||||
}
|
||||
routeScheduleDepth, err := meter.Int64ObservableGauge("notification.route_schedule.depth")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("build notification telemetry runtime: route_schedule.depth: %w", err)
|
||||
}
|
||||
routeScheduleOldestAge, err := meter.Int64ObservableGauge("notification.route_schedule.oldest_age_ms", metric.WithUnit("ms"))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("build notification telemetry runtime: route_schedule.oldest_age_ms: %w", err)
|
||||
}
|
||||
intentStreamOldestUnprocessedAge, err := meter.Int64ObservableGauge("notification.intent_stream.oldest_unprocessed_age_ms", metric.WithUnit("ms"))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("build notification telemetry runtime: intent_stream.oldest_unprocessed_age_ms: %w", err)
|
||||
}
|
||||
registration, err := meter.RegisterCallback(func(ctx context.Context, observer metric.Observer) error {
|
||||
runtime.observeRouteSchedule(ctx, observer, routeScheduleDepth, routeScheduleOldestAge)
|
||||
runtime.observeIntentStreamLag(ctx, observer, intentStreamOldestUnprocessedAge)
|
||||
return nil
|
||||
}, routeScheduleDepth, routeScheduleOldestAge, intentStreamOldestUnprocessedAge)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("build notification telemetry runtime: observable callbacks: %w", err)
|
||||
}
|
||||
runtime.shutdownFns = append(runtime.shutdownFns, func(context.Context) error {
|
||||
return registration.Unregister()
|
||||
})
|
||||
|
||||
runtime.internalHTTPRequests = internalHTTPRequests
|
||||
runtime.internalHTTPDuration = internalHTTPDuration
|
||||
runtime.internalHTTPLifecycle = internalHTTPLifecycle
|
||||
runtime.intentOutcomes = intentOutcomes
|
||||
runtime.malformedIntents = malformedIntents
|
||||
runtime.userEnrichment = userEnrichment
|
||||
runtime.routePublishAttempts = routePublishAttempts
|
||||
runtime.routeRetries = routeRetries
|
||||
runtime.routeDeadLetters = routeDeadLetters
|
||||
|
||||
return runtime, nil
|
||||
}
|
||||
|
||||
func newTracerProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig) (*sdktrace.TracerProvider, error) {
|
||||
options := []sdktrace.TracerProviderOption{
|
||||
sdktrace.WithResource(res),
|
||||
}
|
||||
|
||||
if exporter, err := traceExporter(ctx, cfg); err != nil {
|
||||
return nil, err
|
||||
} else if exporter != nil {
|
||||
options = append(options, sdktrace.WithBatcher(exporter))
|
||||
}
|
||||
|
||||
if cfg.StdoutTracesEnabled {
|
||||
exporter, err := stdouttrace.New(stdouttrace.WithWriter(os.Stdout))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("stdout traces exporter: %w", err)
|
||||
}
|
||||
options = append(options, sdktrace.WithBatcher(exporter))
|
||||
}
|
||||
|
||||
return sdktrace.NewTracerProvider(options...), nil
|
||||
}
|
||||
|
||||
func newMeterProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig) (*sdkmetric.MeterProvider, error) {
|
||||
options := []sdkmetric.Option{
|
||||
sdkmetric.WithResource(res),
|
||||
}
|
||||
|
||||
if exporter, err := metricExporter(ctx, cfg); err != nil {
|
||||
return nil, err
|
||||
} else if exporter != nil {
|
||||
options = append(options, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter)))
|
||||
}
|
||||
|
||||
if cfg.StdoutMetricsEnabled {
|
||||
exporter, err := stdoutmetric.New(stdoutmetric.WithWriter(os.Stdout))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("stdout metrics exporter: %w", err)
|
||||
}
|
||||
options = append(options, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter)))
|
||||
}
|
||||
|
||||
return sdkmetric.NewMeterProvider(options...), nil
|
||||
}
|
||||
|
||||
func traceExporter(ctx context.Context, cfg ProcessConfig) (sdktrace.SpanExporter, error) {
|
||||
if cfg.TracesExporter != processExporterOTLP {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
switch normalizeProtocol(cfg.TracesProtocol) {
|
||||
case processProtocolGRPC:
|
||||
exporter, err := otlptracegrpc.New(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("otlp grpc traces exporter: %w", err)
|
||||
}
|
||||
return exporter, nil
|
||||
default:
|
||||
exporter, err := otlptracehttp.New(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("otlp http traces exporter: %w", err)
|
||||
}
|
||||
return exporter, nil
|
||||
}
|
||||
}
|
||||
|
||||
func metricExporter(ctx context.Context, cfg ProcessConfig) (sdkmetric.Exporter, error) {
|
||||
if cfg.MetricsExporter != processExporterOTLP {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
switch normalizeProtocol(cfg.MetricsProtocol) {
|
||||
case processProtocolGRPC:
|
||||
exporter, err := otlpmetricgrpc.New(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("otlp grpc metrics exporter: %w", err)
|
||||
}
|
||||
return exporter, nil
|
||||
default:
|
||||
exporter, err := otlpmetrichttp.New(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("otlp http metrics exporter: %w", err)
|
||||
}
|
||||
return exporter, nil
|
||||
}
|
||||
}
|
||||
|
||||
func normalizeProtocol(value string) string {
|
||||
switch strings.TrimSpace(value) {
|
||||
case processProtocolGRPC:
|
||||
return processProtocolGRPC
|
||||
default:
|
||||
return processProtocolHTTPProtobuf
|
||||
}
|
||||
}
|
||||
|
||||
// normalizeContext returns ctx unchanged when it is non-nil and
// context.Background() otherwise, so instrument calls never receive a nil
// context.
func normalizeContext(ctx context.Context) context.Context {
	if ctx != nil {
		return ctx
	}

	return context.Background()
}
|
||||
|
||||
// cleanAttribute trims surrounding whitespace from value and returns the
// result, or fallback when nothing is left after trimming.
func cleanAttribute(value string, fallback string) string {
	if cleaned := strings.TrimSpace(value); cleaned != "" {
		return cleaned
	}

	return fallback
}
|
||||
|
||||
func (runtime *Runtime) observeRouteSchedule(
|
||||
ctx context.Context,
|
||||
observer metric.Observer,
|
||||
depthGauge metric.Int64ObservableGauge,
|
||||
oldestAgeGauge metric.Int64ObservableGauge,
|
||||
) {
|
||||
depth := int64(0)
|
||||
oldestAge := int64(0)
|
||||
|
||||
reader := runtime.currentRouteScheduleReader()
|
||||
if reader != nil {
|
||||
snapshot, err := reader.ReadRouteScheduleSnapshot(ctx)
|
||||
if err != nil {
|
||||
otel.Handle(fmt.Errorf("observe notification route schedule: %w", err))
|
||||
} else {
|
||||
if snapshot.Depth > 0 {
|
||||
depth = snapshot.Depth
|
||||
}
|
||||
if snapshot.OldestScheduledFor != nil {
|
||||
oldestAge = time.Since(snapshot.OldestScheduledFor.UTC()).Milliseconds()
|
||||
if oldestAge < 0 {
|
||||
oldestAge = 0
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
observer.ObserveInt64(depthGauge, depth)
|
||||
observer.ObserveInt64(oldestAgeGauge, oldestAge)
|
||||
}
|
||||
|
||||
func (runtime *Runtime) observeIntentStreamLag(
|
||||
ctx context.Context,
|
||||
observer metric.Observer,
|
||||
oldestUnprocessedAgeGauge metric.Int64ObservableGauge,
|
||||
) {
|
||||
oldestAge := int64(0)
|
||||
|
||||
reader := runtime.currentIntentStreamLagReader()
|
||||
if reader != nil {
|
||||
snapshot, err := reader.ReadIntentStreamLagSnapshot(ctx)
|
||||
if err != nil {
|
||||
otel.Handle(fmt.Errorf("observe notification intent stream lag: %w", err))
|
||||
} else if snapshot.OldestUnprocessedAt != nil {
|
||||
oldestAge = time.Since(snapshot.OldestUnprocessedAt.UTC()).Milliseconds()
|
||||
if oldestAge < 0 {
|
||||
oldestAge = 0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
observer.ObserveInt64(oldestUnprocessedAgeGauge, oldestAge)
|
||||
}
|
||||
|
||||
func (runtime *Runtime) currentRouteScheduleReader() RouteScheduleSnapshotReader {
|
||||
runtime.routeScheduleReaderMu.RLock()
|
||||
defer runtime.routeScheduleReaderMu.RUnlock()
|
||||
return runtime.routeScheduleReader
|
||||
}
|
||||
|
||||
func (runtime *Runtime) currentIntentStreamLagReader() IntentStreamLagSnapshotReader {
|
||||
runtime.intentStreamLagReaderMu.RLock()
|
||||
defer runtime.intentStreamLagReaderMu.RUnlock()
|
||||
return runtime.intentStreamLagReader
|
||||
}
|
||||
@@ -0,0 +1,228 @@
|
||||
package telemetry
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
|
||||
"go.opentelemetry.io/otel/sdk/metric/metricdata"
|
||||
sdktrace "go.opentelemetry.io/otel/sdk/trace"
|
||||
)
|
||||
|
||||
func TestRuntimeRecordsMetrics(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
reader := sdkmetric.NewManualReader()
|
||||
meterProvider := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader))
|
||||
tracerProvider := sdktrace.NewTracerProvider()
|
||||
|
||||
runtime, err := NewWithProviders(meterProvider, tracerProvider)
|
||||
require.NoError(t, err)
|
||||
|
||||
runtime.RecordInternalHTTPRequest(context.Background(), []attribute.KeyValue{
|
||||
attribute.String("route", "/healthz"),
|
||||
attribute.String("method", "GET"),
|
||||
attribute.String("edge_outcome", "success"),
|
||||
}, 5*time.Millisecond)
|
||||
runtime.RecordInternalHTTPEvent(context.Background(), "started")
|
||||
runtime.RecordIntentOutcome(context.Background(), "game.turn.ready", "game_master", "user", "accepted")
|
||||
runtime.RecordIntentOutcome(context.Background(), "game.turn.ready", "game_master", "user", "duplicate")
|
||||
runtime.RecordMalformedIntent(context.Background(), "idempotency_conflict", "game.turn.ready", "game_master")
|
||||
runtime.RecordUserEnrichmentAttempt(context.Background(), "game.turn.ready", "success")
|
||||
runtime.RecordUserEnrichmentAttempt(context.Background(), "game.turn.ready", "recipient_not_found")
|
||||
runtime.RecordRoutePublishAttempt(context.Background(), "push", "game.turn.ready", "published", "")
|
||||
runtime.RecordRoutePublishAttempt(context.Background(), "email", "game.turn.ready", "retry", "mail_stream_publish_failed")
|
||||
runtime.RecordRouteRetry(context.Background(), "email", "game.turn.ready")
|
||||
runtime.RecordRouteDeadLetter(context.Background(), "email", "game.turn.ready", "mail_stream_publish_failed")
|
||||
scheduledAt := time.Now().Add(-time.Second).UTC()
|
||||
unprocessedAt := time.Now().Add(-2 * time.Second).UTC()
|
||||
runtime.SetRouteScheduleSnapshotReader(stubRouteScheduleSnapshotReader{
|
||||
snapshot: RouteScheduleSnapshot{
|
||||
Depth: 3,
|
||||
OldestScheduledFor: &scheduledAt,
|
||||
},
|
||||
})
|
||||
runtime.SetIntentStreamLagSnapshotReader(stubIntentStreamLagSnapshotReader{
|
||||
snapshot: IntentStreamLagSnapshot{
|
||||
OldestUnprocessedAt: &unprocessedAt,
|
||||
},
|
||||
})
|
||||
|
||||
assertMetricCount(t, reader, "notification.internal_http.requests", map[string]string{
|
||||
"route": "/healthz",
|
||||
"method": "GET",
|
||||
"edge_outcome": "success",
|
||||
}, 1)
|
||||
assertMetricCount(t, reader, "notification.internal_http.lifecycle", map[string]string{
|
||||
"event": "started",
|
||||
}, 1)
|
||||
assertMetricCount(t, reader, "notification.intent.outcomes", map[string]string{
|
||||
"notification_type": "game.turn.ready",
|
||||
"producer": "game_master",
|
||||
"audience_kind": "user",
|
||||
"outcome": "accepted",
|
||||
}, 1)
|
||||
assertMetricCount(t, reader, "notification.intent.outcomes", map[string]string{
|
||||
"notification_type": "game.turn.ready",
|
||||
"producer": "game_master",
|
||||
"audience_kind": "user",
|
||||
"outcome": "duplicate",
|
||||
}, 1)
|
||||
assertMetricCount(t, reader, "notification.intent.malformed", map[string]string{
|
||||
"failure_code": "idempotency_conflict",
|
||||
"notification_type": "game.turn.ready",
|
||||
"producer": "game_master",
|
||||
}, 1)
|
||||
assertMetricCount(t, reader, "notification.user_enrichment.attempts", map[string]string{
|
||||
"notification_type": "game.turn.ready",
|
||||
"result": "success",
|
||||
}, 1)
|
||||
assertMetricCount(t, reader, "notification.user_enrichment.attempts", map[string]string{
|
||||
"notification_type": "game.turn.ready",
|
||||
"result": "recipient_not_found",
|
||||
}, 1)
|
||||
assertMetricCount(t, reader, "notification.route.publish_attempts", map[string]string{
|
||||
"channel": "push",
|
||||
"notification_type": "game.turn.ready",
|
||||
"result": "published",
|
||||
"failure_classification": "none",
|
||||
}, 1)
|
||||
assertMetricCount(t, reader, "notification.route.publish_attempts", map[string]string{
|
||||
"channel": "email",
|
||||
"notification_type": "game.turn.ready",
|
||||
"result": "retry",
|
||||
"failure_classification": "mail_stream_publish_failed",
|
||||
}, 1)
|
||||
assertMetricCount(t, reader, "notification.route.retries", map[string]string{
|
||||
"channel": "email",
|
||||
"notification_type": "game.turn.ready",
|
||||
}, 1)
|
||||
assertMetricCount(t, reader, "notification.route.dead_letters", map[string]string{
|
||||
"channel": "email",
|
||||
"notification_type": "game.turn.ready",
|
||||
"failure_classification": "mail_stream_publish_failed",
|
||||
}, 1)
|
||||
assertGaugeValue(t, reader, "notification.route_schedule.depth", nil, 3)
|
||||
assertGaugePositive(t, reader, "notification.route_schedule.oldest_age_ms", nil)
|
||||
assertGaugePositive(t, reader, "notification.intent_stream.oldest_unprocessed_age_ms", nil)
|
||||
}
|
||||
|
||||
func assertMetricCount(t *testing.T, reader *sdkmetric.ManualReader, metricName string, wantAttrs map[string]string, wantValue int64) {
|
||||
t.Helper()
|
||||
|
||||
var resourceMetrics metricdata.ResourceMetrics
|
||||
require.NoError(t, reader.Collect(context.Background(), &resourceMetrics))
|
||||
|
||||
for _, scopeMetrics := range resourceMetrics.ScopeMetrics {
|
||||
for _, metric := range scopeMetrics.Metrics {
|
||||
if metric.Name != metricName {
|
||||
continue
|
||||
}
|
||||
|
||||
sum, ok := metric.Data.(metricdata.Sum[int64])
|
||||
require.True(t, ok)
|
||||
|
||||
for _, point := range sum.DataPoints {
|
||||
if hasMetricAttributes(point.Attributes.ToSlice(), wantAttrs) {
|
||||
assert.Equal(t, wantValue, point.Value)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
require.Failf(t, "test failed", "metric %q with attrs %v not found", metricName, wantAttrs)
|
||||
}
|
||||
|
||||
func assertGaugeValue(t *testing.T, reader *sdkmetric.ManualReader, metricName string, wantAttrs map[string]string, wantValue int64) {
|
||||
t.Helper()
|
||||
|
||||
var resourceMetrics metricdata.ResourceMetrics
|
||||
require.NoError(t, reader.Collect(context.Background(), &resourceMetrics))
|
||||
|
||||
for _, scopeMetrics := range resourceMetrics.ScopeMetrics {
|
||||
for _, metric := range scopeMetrics.Metrics {
|
||||
if metric.Name != metricName {
|
||||
continue
|
||||
}
|
||||
|
||||
gauge, ok := metric.Data.(metricdata.Gauge[int64])
|
||||
require.True(t, ok)
|
||||
|
||||
for _, point := range gauge.DataPoints {
|
||||
if hasMetricAttributes(point.Attributes.ToSlice(), wantAttrs) {
|
||||
assert.Equal(t, wantValue, point.Value)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
require.Failf(t, "test failed", "gauge %q with attrs %v not found", metricName, wantAttrs)
|
||||
}
|
||||
|
||||
func assertGaugePositive(t *testing.T, reader *sdkmetric.ManualReader, metricName string, wantAttrs map[string]string) {
|
||||
t.Helper()
|
||||
|
||||
var resourceMetrics metricdata.ResourceMetrics
|
||||
require.NoError(t, reader.Collect(context.Background(), &resourceMetrics))
|
||||
|
||||
for _, scopeMetrics := range resourceMetrics.ScopeMetrics {
|
||||
for _, metric := range scopeMetrics.Metrics {
|
||||
if metric.Name != metricName {
|
||||
continue
|
||||
}
|
||||
|
||||
gauge, ok := metric.Data.(metricdata.Gauge[int64])
|
||||
require.True(t, ok)
|
||||
|
||||
for _, point := range gauge.DataPoints {
|
||||
if hasMetricAttributes(point.Attributes.ToSlice(), wantAttrs) {
|
||||
assert.Greater(t, point.Value, int64(0))
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
require.Failf(t, "test failed", "gauge %q with attrs %v not found", metricName, wantAttrs)
|
||||
}
|
||||
|
||||
func hasMetricAttributes(values []attribute.KeyValue, want map[string]string) bool {
|
||||
if len(want) == 0 {
|
||||
return len(values) == 0
|
||||
}
|
||||
if len(values) != len(want) {
|
||||
return false
|
||||
}
|
||||
|
||||
for _, value := range values {
|
||||
if want[string(value.Key)] != value.Value.AsString() {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
type stubRouteScheduleSnapshotReader struct {
|
||||
snapshot RouteScheduleSnapshot
|
||||
err error
|
||||
}
|
||||
|
||||
func (reader stubRouteScheduleSnapshotReader) ReadRouteScheduleSnapshot(context.Context) (RouteScheduleSnapshot, error) {
|
||||
return reader.snapshot, reader.err
|
||||
}
|
||||
|
||||
type stubIntentStreamLagSnapshotReader struct {
|
||||
snapshot IntentStreamLagSnapshot
|
||||
err error
|
||||
}
|
||||
|
||||
func (reader stubIntentStreamLagSnapshotReader) ReadIntentStreamLagSnapshot(context.Context) (IntentStreamLagSnapshot, error) {
|
||||
return reader.snapshot, reader.err
|
||||
}
|
||||
Reference in New Issue
Block a user