feat: mail service

This commit is contained in:
Ilia Denisov
2026-04-17 18:39:16 +02:00
committed by GitHub
parent 23ffcb7535
commit 5b7593e6f6
183 changed files with 31215 additions and 248 deletions
+661
View File
@@ -0,0 +1,661 @@
// Package telemetry provides lightweight OpenTelemetry helpers and
// low-cardinality Mail Service instruments.
package telemetry
import (
"context"
"errors"
"fmt"
"log/slog"
"os"
"strings"
"sync"
"time"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp"
"go.opentelemetry.io/otel/exporters/stdout/stdoutmetric"
"go.opentelemetry.io/otel/exporters/stdout/stdouttrace"
"go.opentelemetry.io/otel/metric"
"go.opentelemetry.io/otel/propagation"
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
"go.opentelemetry.io/otel/sdk/resource"
sdktrace "go.opentelemetry.io/otel/sdk/trace"
oteltrace "go.opentelemetry.io/otel/trace"
)
// meterName is the instrumentation scope name passed to MeterProvider.Meter
// when the Mail Service instruments are created.
const meterName = "galaxy/mail"
const (
// defaultServiceName is the `service.name` resource attribute used by
// NewProcess when ProcessConfig.ServiceName is empty or whitespace-only.
defaultServiceName = "galaxy-mail"
// processExporterNone disables the external exporter for a signal.
processExporterNone = "none"
// processExporterOTLP selects the OTLP exporter for a signal.
processExporterOTLP = "otlp"
// processProtocolHTTPProtobuf selects OTLP over HTTP. normalizeProtocol also
// falls back to it for every value other than `grpc`.
processProtocolHTTPProtobuf = "http/protobuf"
// processProtocolGRPC selects OTLP over gRPC.
processProtocolGRPC = "grpc"
)
// ProcessConfig configures the process-wide OpenTelemetry runtime built by
// NewProcess. Call Validate to check the exporter and protocol values.
type ProcessConfig struct {
// ServiceName overrides the default OpenTelemetry service name. Surrounding
// whitespace is trimmed by NewProcess; an empty result falls back to
// `galaxy-mail`.
ServiceName string
// TracesExporter selects the external traces exporter. Supported values are
// `none` and `otlp`. Validate rejects every other value, including the empty
// string.
TracesExporter string
// MetricsExporter selects the external metrics exporter. Supported values
// are `none` and `otlp`. Validate rejects every other value, including the
// empty string.
MetricsExporter string
// TracesProtocol selects the OTLP traces protocol when TracesExporter is
// `otlp`. Supported values are `http/protobuf` and `grpc`; the empty string
// is accepted and resolves to `http/protobuf`.
TracesProtocol string
// MetricsProtocol selects the OTLP metrics protocol when MetricsExporter is
// `otlp`. Supported values are `http/protobuf` and `grpc`; the empty string
// is accepted and resolves to `http/protobuf`.
MetricsProtocol string
// StdoutTracesEnabled enables the additional stdout trace exporter used for
// local development and debugging. It is independent of TracesExporter.
StdoutTracesEnabled bool
// StdoutMetricsEnabled enables the additional stdout metric exporter used
// for local development and debugging. It is independent of MetricsExporter.
StdoutMetricsEnabled bool
}
// Validate checks that cfg names only supported exporters and OTLP protocols.
//
// Both exporters must be exactly `none` or `otlp`. Each protocol must be
// empty, `http/protobuf`, or `grpc`. The first offending field, checked in the
// order traces exporter, metrics exporter, traces protocol, metrics protocol,
// is reported in the returned error; nil means cfg is acceptable.
func (cfg ProcessConfig) Validate() error {
	if cfg.TracesExporter != processExporterNone && cfg.TracesExporter != processExporterOTLP {
		return fmt.Errorf("unsupported traces exporter %q", cfg.TracesExporter)
	}
	if cfg.MetricsExporter != processExporterNone && cfg.MetricsExporter != processExporterOTLP {
		return fmt.Errorf("unsupported metrics exporter %q", cfg.MetricsExporter)
	}

	switch cfg.TracesProtocol {
	case "", processProtocolHTTPProtobuf, processProtocolGRPC:
		// Supported, or left unset so the default applies.
	default:
		return fmt.Errorf("unsupported OTLP traces protocol %q", cfg.TracesProtocol)
	}

	switch cfg.MetricsProtocol {
	case "", processProtocolHTTPProtobuf, processProtocolGRPC:
		// Supported, or left unset so the default applies.
	default:
		return fmt.Errorf("unsupported OTLP metrics protocol %q", cfg.MetricsProtocol)
	}

	return nil
}
// Runtime owns the Mail Service OpenTelemetry providers and low-cardinality
// custom instruments. It contains mutexes, so it must be used through a
// pointer and never copied. The exported methods that check for a nil
// receiver are safe to call on a nil *Runtime.
type Runtime struct {
// tracerProvider and meterProvider are the providers handed out by the
// TracerProvider and MeterProvider accessors.
tracerProvider oteltrace.TracerProvider
meterProvider metric.MeterProvider
// shutdownMu guards shutdownDone and shutdownErr.
shutdownMu sync.Mutex
// shutdownDone is set once Shutdown has been entered.
shutdownDone bool
// shutdownErr stores the joined result of the shutdown functions.
shutdownErr error
// shutdownFns are invoked by Shutdown in reverse order.
shutdownFns []func(context.Context) error
// attemptScheduleReaderMu guards attemptScheduleReader.
attemptScheduleReaderMu sync.RWMutex
// attemptScheduleReader feeds the observable attempt-schedule gauges; it may
// be nil until SetAttemptScheduleSnapshotReader is called.
attemptScheduleReader AttemptScheduleSnapshotReader
// internalHTTPRequests is the `mail.internal_http.requests` counter.
internalHTTPRequests metric.Int64Counter
// internalHTTPDuration is the `mail.internal_http.duration` histogram (ms).
internalHTTPDuration metric.Float64Histogram
// authDeliveryOutcomes is the `mail.auth_delivery.outcomes` counter.
authDeliveryOutcomes metric.Int64Counter
// genericDeliveryOutcomes is the `mail.generic_delivery.outcomes` counter.
genericDeliveryOutcomes metric.Int64Counter
// malformedCommands is the `mail.stream_commands.malformed` counter.
malformedCommands metric.Int64Counter
// acceptedAuthDeliveries is the `mail.delivery.accepted_auth` counter.
acceptedAuthDeliveries metric.Int64Counter
// acceptedGenericDeliveries is the `mail.delivery.accepted_generic` counter.
acceptedGenericDeliveries metric.Int64Counter
// suppressedDeliveries is the `mail.delivery.suppressed` counter.
suppressedDeliveries metric.Int64Counter
// deliveryStatusTransitions is the `mail.delivery.status_transitions` counter.
deliveryStatusTransitions metric.Int64Counter
// attemptOutcomes is the `mail.attempt.outcomes` counter.
attemptOutcomes metric.Int64Counter
// deadLetters is the `mail.delivery.dead_letters` counter.
deadLetters metric.Int64Counter
// localeFallbacks is the `mail.template.locale_fallback` counter.
localeFallbacks metric.Int64Counter
// providerSendDuration is the `mail.provider.send.duration_ms` histogram (ms).
providerSendDuration metric.Float64Histogram
}
// AttemptScheduleSnapshot stores the current observable state of the durable
// attempt schedule. It is the value type returned by
// AttemptScheduleSnapshotReader and consumed by the schedule gauges.
type AttemptScheduleSnapshot struct {
// Depth stores how many delivery ids are currently present in the attempt
// schedule. Values that are not positive are reported as zero by the depth
// gauge.
Depth int64
// OldestScheduledFor stores the oldest currently scheduled due time when
// one exists. A nil value, or a time in the future, is reported as an age of
// zero milliseconds by the oldest-age gauge.
OldestScheduledFor *time.Time
}
// AttemptScheduleSnapshotReader loads one current schedule snapshot for
// observable gauge reporting. The runtime calls it from the metric callback
// registered for the attempt-schedule gauges, once per collection.
type AttemptScheduleSnapshotReader interface {
// ReadAttemptScheduleSnapshot returns the current attempt schedule depth and
// its oldest scheduled timestamp when one exists. When it returns an error,
// the runtime forwards the error to the global OpenTelemetry error handler
// and reports zero for both gauges.
ReadAttemptScheduleSnapshot(context.Context) (AttemptScheduleSnapshot, error)
}
// New builds a lightweight telemetry runtime on top of meterProvider. It is
// intended for tests and embedded use cases that do not need process-level
// exporter wiring. No tracer provider is supplied, so NewWithProviders falls
// back to the globally installed one. A nil meterProvider likewise falls back
// to the global meter provider.
func New(meterProvider metric.MeterProvider) (*Runtime, error) {
	var globalTracerProvider oteltrace.TracerProvider // nil selects the global provider
	return NewWithProviders(meterProvider, globalTracerProvider)
}
// NewWithProviders builds a telemetry runtime around the explicitly supplied
// meterProvider and tracerProvider.
//
// A nil argument is replaced by the corresponding globally installed
// OpenTelemetry provider. An error is returned when a provider is still nil
// after that fallback, or when creating the instruments fails. The returned
// runtime does not own the providers: its Shutdown only unregisters the
// attempt-schedule callback.
func NewWithProviders(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider) (*Runtime, error) {
	if meterProvider == nil {
		meterProvider = otel.GetMeterProvider()
	}
	if tracerProvider == nil {
		tracerProvider = otel.GetTracerProvider()
	}

	switch {
	case meterProvider == nil:
		return nil, errors.New("new mail telemetry runtime: nil meter provider")
	case tracerProvider == nil:
		return nil, errors.New("new mail telemetry runtime: nil tracer provider")
	}

	return buildRuntime(meterProvider, tracerProvider, nil)
}
// NewProcess constructs the process-wide Mail Service OpenTelemetry runtime
// from cfg, installs the resulting providers and a TraceContext+Baggage
// propagator globally, and returns the runtime.
//
// ctx is passed to the exporter constructors and must not be nil. cfg is
// validated first. A nil logger falls back to slog.Default. The service name
// is cfg.ServiceName with surrounding whitespace trimmed, or `galaxy-mail`
// when that is empty.
//
// On failure nothing is installed globally and every provider that was
// already constructed is shut down, so exporters and their background
// workers do not leak. Any shutdown error is joined into the returned error.
// On success the caller owns the runtime and must call Shutdown to flush and
// stop the providers.
func NewProcess(ctx context.Context, cfg ProcessConfig, logger *slog.Logger) (*Runtime, error) {
	if ctx == nil {
		return nil, errors.New("new mail telemetry process: nil context")
	}
	if err := cfg.Validate(); err != nil {
		return nil, fmt.Errorf("new mail telemetry process: %w", err)
	}
	if logger == nil {
		logger = slog.Default()
	}

	serviceName := strings.TrimSpace(cfg.ServiceName)
	if serviceName == "" {
		serviceName = defaultServiceName
	}
	res := resource.NewSchemaless(attribute.String("service.name", serviceName))

	tracerProvider, err := newTracerProvider(ctx, res, cfg)
	if err != nil {
		return nil, fmt.Errorf("new mail telemetry process: tracer provider: %w", err)
	}

	meterProvider, err := newMeterProvider(ctx, res, cfg)
	if err != nil {
		// The tracer provider is already running; release it before bailing out.
		return nil, fmt.Errorf("new mail telemetry process: meter provider: %w",
			errors.Join(err, tracerProvider.Shutdown(ctx)))
	}

	runtime, err := buildRuntime(meterProvider, tracerProvider, []func(context.Context) error{
		meterProvider.Shutdown,
		tracerProvider.Shutdown,
	})
	if err != nil {
		// No runtime exists to own the providers, so stop them here.
		return nil, fmt.Errorf("new mail telemetry process: runtime: %w",
			errors.Join(err, meterProvider.Shutdown(ctx), tracerProvider.Shutdown(ctx)))
	}

	// Install globals only after construction succeeded, so a failed call never
	// leaves the process pointing at providers that were shut down.
	otel.SetTracerProvider(tracerProvider)
	otel.SetMeterProvider(meterProvider)
	otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
		propagation.TraceContext{},
		propagation.Baggage{},
	))

	logger.Info("mail telemetry configured",
		"service_name", serviceName,
		"traces_exporter", cfg.TracesExporter,
		"metrics_exporter", cfg.MetricsExporter,
	)
	return runtime, nil
}
// TracerProvider returns the tracer provider held by the runtime. When the
// receiver is nil or holds no provider, the globally installed OpenTelemetry
// tracer provider is returned instead.
func (runtime *Runtime) TracerProvider() oteltrace.TracerProvider {
	if runtime != nil && runtime.tracerProvider != nil {
		return runtime.tracerProvider
	}
	return otel.GetTracerProvider()
}
// MeterProvider returns the meter provider held by the runtime. When the
// receiver is nil or holds no provider, the globally installed OpenTelemetry
// meter provider is returned instead.
func (runtime *Runtime) MeterProvider() metric.MeterProvider {
	if runtime != nil && runtime.meterProvider != nil {
		return runtime.meterProvider
	}
	return otel.GetMeterProvider()
}
// Shutdown flushes and stops the configured telemetry providers by running
// the registered shutdown functions in reverse registration order and joining
// their errors.
//
// Shutdown is idempotent and safe for concurrent use. The first call performs
// the work; every later or concurrent call waits for it to finish and returns
// the same result. Calling Shutdown on a nil runtime is a no-op. A nil ctx is
// replaced by context.Background.
func (runtime *Runtime) Shutdown(ctx context.Context) error {
	if runtime == nil {
		return nil
	}

	// Hold the lock for the whole shutdown. Releasing it while the shutdown
	// functions run would let a concurrent caller observe shutdownDone with a
	// not-yet-populated shutdownErr and report success prematurely.
	runtime.shutdownMu.Lock()
	defer runtime.shutdownMu.Unlock()

	if runtime.shutdownDone {
		return runtime.shutdownErr
	}
	runtime.shutdownDone = true

	ctx = normalizeContext(ctx)
	for index := len(runtime.shutdownFns) - 1; index >= 0; index-- {
		runtime.shutdownErr = errors.Join(runtime.shutdownErr, runtime.shutdownFns[index](ctx))
	}
	return runtime.shutdownErr
}
// RecordInternalHTTPRequest counts one internal HTTP request and records its
// duration in milliseconds. attrs is attached to both measurements. The call
// is a no-op on a nil runtime, and a nil ctx is replaced by
// context.Background.
func (runtime *Runtime) RecordInternalHTTPRequest(ctx context.Context, attrs []attribute.KeyValue, duration time.Duration) {
	if runtime == nil {
		return
	}

	ctx = normalizeContext(ctx)
	withAttrs := metric.WithAttributes(attrs...)

	runtime.internalHTTPRequests.Add(ctx, 1, withAttrs)
	runtime.internalHTTPDuration.Record(ctx, duration.Seconds()*1000, withAttrs)
}
// RecordAuthDeliveryOutcome counts one auth-delivery acceptance outcome,
// labelled with the whitespace-trimmed outcome. The call is a no-op on a nil
// runtime.
func (runtime *Runtime) RecordAuthDeliveryOutcome(ctx context.Context, outcome string) {
	if runtime == nil {
		return
	}

	outcomeAttr := attribute.String("outcome", strings.TrimSpace(outcome))
	runtime.authDeliveryOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(outcomeAttr))
}
// RecordGenericDeliveryOutcome counts one generic-delivery acceptance
// outcome, labelled with the whitespace-trimmed outcome. The call is a no-op
// on a nil runtime.
func (runtime *Runtime) RecordGenericDeliveryOutcome(ctx context.Context, outcome string) {
	if runtime == nil {
		return
	}

	outcomeAttr := attribute.String("outcome", strings.TrimSpace(outcome))
	runtime.genericDeliveryOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(outcomeAttr))
}
// RecordMalformedCommand counts one malformed or rejected async stream
// command, labelled with the whitespace-trimmed failureCode. The call is a
// no-op on a nil runtime.
func (runtime *Runtime) RecordMalformedCommand(ctx context.Context, failureCode string) {
	if runtime == nil {
		return
	}

	failureAttr := attribute.String("failure_code", strings.TrimSpace(failureCode))
	runtime.malformedCommands.Add(normalizeContext(ctx), 1, metric.WithAttributes(failureAttr))
}
// RecordAcceptedAuthDelivery counts one newly accepted auth delivery. The
// measurement carries no attributes. The call is a no-op on a nil runtime.
func (runtime *Runtime) RecordAcceptedAuthDelivery(ctx context.Context) {
	if runtime != nil {
		runtime.acceptedAuthDeliveries.Add(normalizeContext(ctx), 1)
	}
}
// RecordAcceptedGenericDelivery counts one newly accepted generic delivery.
// The measurement carries no attributes. The call is a no-op on a nil
// runtime.
func (runtime *Runtime) RecordAcceptedGenericDelivery(ctx context.Context) {
	if runtime != nil {
		runtime.acceptedGenericDeliveries.Add(normalizeContext(ctx), 1)
	}
}
// RecordDeliveryStatusTransition counts one durable delivery status
// transition, labelled with the whitespace-trimmed status and source.
//
// Two statuses feed an additional dedicated counter: `suppressed` increments
// the suppressed-deliveries counter without attributes, and `dead_letter`
// increments the dead-letters counter labelled with source. The call is a
// no-op on a nil runtime.
func (runtime *Runtime) RecordDeliveryStatusTransition(ctx context.Context, status string, source string) {
	if runtime == nil {
		return
	}

	ctx = normalizeContext(ctx)
	status = strings.TrimSpace(status)
	sourceAttr := attribute.String("source", strings.TrimSpace(source))

	runtime.deliveryStatusTransitions.Add(ctx, 1, metric.WithAttributes(
		attribute.String("status", status),
		sourceAttr,
	))

	if status == "suppressed" {
		runtime.suppressedDeliveries.Add(ctx, 1)
		return
	}
	if status == "dead_letter" {
		runtime.deadLetters.Add(ctx, 1, metric.WithAttributes(sourceAttr))
	}
}
// RecordAttemptOutcome counts one durable terminal attempt outcome, labelled
// with the whitespace-trimmed status and source. The call is a no-op on a nil
// runtime.
func (runtime *Runtime) RecordAttemptOutcome(ctx context.Context, status string, source string) {
	if runtime == nil {
		return
	}

	labels := metric.WithAttributes(
		attribute.String("status", strings.TrimSpace(status)),
		attribute.String("source", strings.TrimSpace(source)),
	)
	runtime.attemptOutcomes.Add(normalizeContext(ctx), 1, labels)
}
// RecordLocaleFallback counts one template locale fallback event, labelled
// with the whitespace-trimmed templateID, requestedLocale, and
// resolvedLocale. The call is a no-op on a nil runtime.
func (runtime *Runtime) RecordLocaleFallback(ctx context.Context, templateID string, requestedLocale string, resolvedLocale string) {
	if runtime == nil {
		return
	}

	labels := metric.WithAttributes(
		attribute.String("template_id", strings.TrimSpace(templateID)),
		attribute.String("requested_locale", strings.TrimSpace(requestedLocale)),
		attribute.String("resolved_locale", strings.TrimSpace(resolvedLocale)),
	)
	runtime.localeFallbacks.Add(normalizeContext(ctx), 1, labels)
}
// RecordProviderSendDuration records one provider send duration sample in
// milliseconds, labelled with the whitespace-trimmed provider and outcome.
// The call is a no-op on a nil runtime.
func (runtime *Runtime) RecordProviderSendDuration(ctx context.Context, provider string, outcome string, duration time.Duration) {
	if runtime == nil {
		return
	}

	labels := metric.WithAttributes(
		attribute.String("provider", strings.TrimSpace(provider)),
		attribute.String("outcome", strings.TrimSpace(outcome)),
	)
	runtime.providerSendDuration.Record(normalizeContext(ctx), duration.Seconds()*1000, labels)
}
// SetAttemptScheduleSnapshotReader installs reader as the source consulted by
// the observable attempt-schedule gauges, replacing any previous reader.
// Passing nil removes the reader, after which the gauges report zero. The
// call is a no-op on a nil runtime.
func (runtime *Runtime) SetAttemptScheduleSnapshotReader(reader AttemptScheduleSnapshotReader) {
	if runtime == nil {
		return
	}

	runtime.attemptScheduleReaderMu.Lock()
	defer runtime.attemptScheduleReaderMu.Unlock()
	runtime.attemptScheduleReader = reader
}
// buildRuntime creates every Mail Service instrument on a meter obtained from
// meterProvider and returns a Runtime holding them together with both
// providers.
//
// shutdownFns is copied, and a function that unregisters the attempt-schedule
// gauge callback is appended to the copy. Because Shutdown runs the functions
// in reverse order, that callback is unregistered before any caller-supplied
// function runs. The first instrument or callback registration failure aborts
// construction and is returned wrapped; no runtime is returned in that case.
func buildRuntime(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider, shutdownFns []func(context.Context) error) (*Runtime, error) {
	meter := meterProvider.Meter(meterName)

	// Copy the caller's slice so the append below never writes into its
	// backing array.
	ownedShutdownFns := make([]func(context.Context) error, 0, len(shutdownFns)+1)
	ownedShutdownFns = append(ownedShutdownFns, shutdownFns...)

	rt := &Runtime{
		tracerProvider: tracerProvider,
		meterProvider:  meterProvider,
		shutdownFns:    ownedShutdownFns,
	}

	var err error

	if rt.internalHTTPRequests, err = meter.Int64Counter("mail.internal_http.requests"); err != nil {
		return nil, fmt.Errorf("build mail telemetry runtime: internal_http.requests: %w", err)
	}
	if rt.internalHTTPDuration, err = meter.Float64Histogram("mail.internal_http.duration", metric.WithUnit("ms")); err != nil {
		return nil, fmt.Errorf("build mail telemetry runtime: internal_http.duration: %w", err)
	}
	if rt.authDeliveryOutcomes, err = meter.Int64Counter("mail.auth_delivery.outcomes"); err != nil {
		return nil, fmt.Errorf("build mail telemetry runtime: auth_delivery.outcomes: %w", err)
	}
	if rt.genericDeliveryOutcomes, err = meter.Int64Counter("mail.generic_delivery.outcomes"); err != nil {
		return nil, fmt.Errorf("build mail telemetry runtime: generic_delivery.outcomes: %w", err)
	}
	if rt.malformedCommands, err = meter.Int64Counter("mail.stream_commands.malformed"); err != nil {
		return nil, fmt.Errorf("build mail telemetry runtime: stream_commands.malformed: %w", err)
	}
	if rt.acceptedAuthDeliveries, err = meter.Int64Counter("mail.delivery.accepted_auth"); err != nil {
		return nil, fmt.Errorf("build mail telemetry runtime: delivery.accepted_auth: %w", err)
	}
	if rt.acceptedGenericDeliveries, err = meter.Int64Counter("mail.delivery.accepted_generic"); err != nil {
		return nil, fmt.Errorf("build mail telemetry runtime: delivery.accepted_generic: %w", err)
	}
	if rt.suppressedDeliveries, err = meter.Int64Counter("mail.delivery.suppressed"); err != nil {
		return nil, fmt.Errorf("build mail telemetry runtime: delivery.suppressed: %w", err)
	}
	if rt.deliveryStatusTransitions, err = meter.Int64Counter("mail.delivery.status_transitions"); err != nil {
		return nil, fmt.Errorf("build mail telemetry runtime: delivery.status_transitions: %w", err)
	}
	if rt.attemptOutcomes, err = meter.Int64Counter("mail.attempt.outcomes"); err != nil {
		return nil, fmt.Errorf("build mail telemetry runtime: attempt.outcomes: %w", err)
	}
	if rt.deadLetters, err = meter.Int64Counter("mail.delivery.dead_letters"); err != nil {
		return nil, fmt.Errorf("build mail telemetry runtime: delivery.dead_letters: %w", err)
	}
	if rt.localeFallbacks, err = meter.Int64Counter("mail.template.locale_fallback"); err != nil {
		return nil, fmt.Errorf("build mail telemetry runtime: template.locale_fallback: %w", err)
	}
	if rt.providerSendDuration, err = meter.Float64Histogram("mail.provider.send.duration_ms", metric.WithUnit("ms")); err != nil {
		return nil, fmt.Errorf("build mail telemetry runtime: provider.send.duration_ms: %w", err)
	}

	// The two schedule gauges are observable: their values are pulled from the
	// installed snapshot reader at collection time rather than pushed.
	depthGauge, err := meter.Int64ObservableGauge("mail.attempt_schedule.depth")
	if err != nil {
		return nil, fmt.Errorf("build mail telemetry runtime: attempt_schedule.depth: %w", err)
	}
	oldestAgeGauge, err := meter.Int64ObservableGauge("mail.attempt_schedule.oldest_age_ms", metric.WithUnit("ms"))
	if err != nil {
		return nil, fmt.Errorf("build mail telemetry runtime: attempt_schedule.oldest_age_ms: %w", err)
	}

	observe := func(ctx context.Context, observer metric.Observer) error {
		rt.observeAttemptSchedule(ctx, observer, depthGauge, oldestAgeGauge)
		return nil
	}
	registration, err := meter.RegisterCallback(observe, depthGauge, oldestAgeGauge)
	if err != nil {
		return nil, fmt.Errorf("build mail telemetry runtime: attempt schedule callback: %w", err)
	}
	rt.shutdownFns = append(rt.shutdownFns, func(context.Context) error {
		return registration.Unregister()
	})

	return rt, nil
}
// newTracerProvider builds an SDK tracer provider tagged with res. A batching
// span processor is attached for the exporter selected by cfg, when there is
// one, and another for a stdout exporter when cfg.StdoutTracesEnabled is set.
// Exporter construction errors are returned to the caller.
func newTracerProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig) (*sdktrace.TracerProvider, error) {
	providerOpts := []sdktrace.TracerProviderOption{sdktrace.WithResource(res)}

	external, err := traceExporter(ctx, cfg)
	if err != nil {
		return nil, err
	}
	if external != nil {
		providerOpts = append(providerOpts, sdktrace.WithBatcher(external))
	}

	if cfg.StdoutTracesEnabled {
		stdoutExporter, err := stdouttrace.New(stdouttrace.WithWriter(os.Stdout))
		if err != nil {
			return nil, fmt.Errorf("stdout traces exporter: %w", err)
		}
		providerOpts = append(providerOpts, sdktrace.WithBatcher(stdoutExporter))
	}

	return sdktrace.NewTracerProvider(providerOpts...), nil
}
// newMeterProvider builds an SDK meter provider tagged with res. A periodic
// reader is attached for the exporter selected by cfg, when there is one, and
// another for a stdout exporter when cfg.StdoutMetricsEnabled is set.
// Exporter construction errors are returned to the caller.
func newMeterProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig) (*sdkmetric.MeterProvider, error) {
	providerOpts := []sdkmetric.Option{sdkmetric.WithResource(res)}

	external, err := metricExporter(ctx, cfg)
	if err != nil {
		return nil, err
	}
	if external != nil {
		providerOpts = append(providerOpts, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(external)))
	}

	if cfg.StdoutMetricsEnabled {
		stdoutExporter, err := stdoutmetric.New(stdoutmetric.WithWriter(os.Stdout))
		if err != nil {
			return nil, fmt.Errorf("stdout metrics exporter: %w", err)
		}
		providerOpts = append(providerOpts, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(stdoutExporter)))
	}

	return sdkmetric.NewMeterProvider(providerOpts...), nil
}
// traceExporter returns the external span exporter selected by cfg. It
// returns (nil, nil) unless cfg.TracesExporter is `otlp`. Otherwise it builds
// the gRPC exporter when the normalized protocol is `grpc` and the HTTP
// exporter in every other case. No explicit exporter options are passed.
func traceExporter(ctx context.Context, cfg ProcessConfig) (sdktrace.SpanExporter, error) {
	if cfg.TracesExporter != processExporterOTLP {
		return nil, nil
	}

	if normalizeProtocol(cfg.TracesProtocol) == processProtocolGRPC {
		grpcExporter, err := otlptracegrpc.New(ctx)
		if err != nil {
			return nil, fmt.Errorf("otlp grpc traces exporter: %w", err)
		}
		return grpcExporter, nil
	}

	httpExporter, err := otlptracehttp.New(ctx)
	if err != nil {
		return nil, fmt.Errorf("otlp http traces exporter: %w", err)
	}
	return httpExporter, nil
}
// metricExporter returns the external metric exporter selected by cfg. It
// returns (nil, nil) unless cfg.MetricsExporter is `otlp`. Otherwise it builds
// the gRPC exporter when the normalized protocol is `grpc` and the HTTP
// exporter in every other case. No explicit exporter options are passed.
func metricExporter(ctx context.Context, cfg ProcessConfig) (sdkmetric.Exporter, error) {
	if cfg.MetricsExporter != processExporterOTLP {
		return nil, nil
	}

	if normalizeProtocol(cfg.MetricsProtocol) == processProtocolGRPC {
		grpcExporter, err := otlpmetricgrpc.New(ctx)
		if err != nil {
			return nil, fmt.Errorf("otlp grpc metrics exporter: %w", err)
		}
		return grpcExporter, nil
	}

	httpExporter, err := otlpmetrichttp.New(ctx)
	if err != nil {
		return nil, fmt.Errorf("otlp http metrics exporter: %w", err)
	}
	return httpExporter, nil
}
// normalizeProtocol maps value to one of the two supported OTLP protocols.
// Only `grpc`, ignoring surrounding whitespace, selects gRPC. Every other
// value, including the empty string, resolves to `http/protobuf`.
func normalizeProtocol(value string) string {
	if strings.TrimSpace(value) == processProtocolGRPC {
		return processProtocolGRPC
	}
	return processProtocolHTTPProtobuf
}
// normalizeContext returns ctx unchanged when it is non-nil and
// context.Background otherwise, so instruments never receive a nil context.
func normalizeContext(ctx context.Context) context.Context {
	if ctx != nil {
		return ctx
	}
	return context.Background()
}
// observeAttemptSchedule reports the attempt-schedule depth and the age in
// milliseconds of its oldest entry to observer.
//
// Both values default to zero. They stay zero when no reader is installed or
// when the reader fails; a failure is also forwarded to the global
// OpenTelemetry error handler. A depth that is not positive and an oldest
// timestamp in the future are both clamped to zero.
func (runtime *Runtime) observeAttemptSchedule(
	ctx context.Context,
	observer metric.Observer,
	depthGauge metric.Int64ObservableGauge,
	oldestAgeGauge metric.Int64ObservableGauge,
) {
	var depth, oldestAge int64

	if reader := runtime.currentAttemptScheduleReader(); reader != nil {
		snapshot, err := reader.ReadAttemptScheduleSnapshot(ctx)
		switch {
		case err != nil:
			otel.Handle(fmt.Errorf("observe mail attempt schedule: %w", err))
		default:
			depth = max(snapshot.Depth, 0)
			if oldest := snapshot.OldestScheduledFor; oldest != nil {
				oldestAge = max(time.Since(oldest.UTC()).Milliseconds(), 0)
			}
		}
	}

	observer.ObserveInt64(depthGauge, depth)
	observer.ObserveInt64(oldestAgeGauge, oldestAge)
}
// currentAttemptScheduleReader returns the currently installed snapshot
// reader, read under the read lock. The result is nil when no reader has been
// installed.
func (runtime *Runtime) currentAttemptScheduleReader() AttemptScheduleSnapshotReader {
	runtime.attemptScheduleReaderMu.RLock()
	reader := runtime.attemptScheduleReader
	runtime.attemptScheduleReaderMu.RUnlock()
	return reader
}