// Package telemetry provides lightweight OpenTelemetry helpers and
// low-cardinality Runtime Manager instruments used by the runnable
// skeleton. Later stages emit into the instruments declared here without
// touching this package.
package telemetry

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"os"
	"strings"
	"sync"
	"time"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
	"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp"
	"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
	"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp"
	"go.opentelemetry.io/otel/exporters/stdout/stdoutmetric"
	"go.opentelemetry.io/otel/exporters/stdout/stdouttrace"
	"go.opentelemetry.io/otel/metric"
	"go.opentelemetry.io/otel/propagation"
	sdkmetric "go.opentelemetry.io/otel/sdk/metric"
	"go.opentelemetry.io/otel/sdk/resource"
	sdktrace "go.opentelemetry.io/otel/sdk/trace"
	oteltrace "go.opentelemetry.io/otel/trace"
)

const meterName = "galaxy/rtmanager"

const (
	defaultServiceName = "galaxy-rtmanager"

	processExporterNone         = "none"
	processExporterOTLP         = "otlp"
	processProtocolHTTPProtobuf = "http/protobuf"
	processProtocolGRPC         = "grpc"
)

// ProcessConfig configures the process-wide OpenTelemetry runtime.
type ProcessConfig struct {
	// ServiceName overrides the default OpenTelemetry service name.
	ServiceName string

	// TracesExporter selects the external traces exporter. Supported values
	// are `none` and `otlp`.
	TracesExporter string

	// MetricsExporter selects the external metrics exporter. Supported
	// values are `none` and `otlp`.
	MetricsExporter string

	// TracesProtocol selects the OTLP traces protocol when TracesExporter is
	// `otlp`.
	TracesProtocol string

	// MetricsProtocol selects the OTLP metrics protocol when
	// MetricsExporter is `otlp`.
	MetricsProtocol string

	// StdoutTracesEnabled enables the additional stdout trace exporter used
	// for local development and debugging.
	StdoutTracesEnabled bool

	// StdoutMetricsEnabled enables the additional stdout metric exporter
	// used for local development and debugging.
	StdoutMetricsEnabled bool
}

// Validate reports whether cfg contains a supported OpenTelemetry exporter
// configuration.
func (cfg ProcessConfig) Validate() error {
	switch cfg.TracesExporter {
	case processExporterNone, processExporterOTLP:
	default:
		return fmt.Errorf("unsupported traces exporter %q", cfg.TracesExporter)
	}

	switch cfg.MetricsExporter {
	case processExporterNone, processExporterOTLP:
	default:
		return fmt.Errorf("unsupported metrics exporter %q", cfg.MetricsExporter)
	}

	if cfg.TracesProtocol != "" && cfg.TracesProtocol != processProtocolHTTPProtobuf && cfg.TracesProtocol != processProtocolGRPC {
		return fmt.Errorf("unsupported OTLP traces protocol %q", cfg.TracesProtocol)
	}
	if cfg.MetricsProtocol != "" && cfg.MetricsProtocol != processProtocolHTTPProtobuf && cfg.MetricsProtocol != processProtocolGRPC {
		return fmt.Errorf("unsupported OTLP metrics protocol %q", cfg.MetricsProtocol)
	}

	return nil
}

// Runtime owns the Runtime Manager OpenTelemetry providers and the
// low-cardinality custom instruments listed in `rtmanager/README.md`
// §Observability.
type Runtime struct {
	tracerProvider oteltrace.TracerProvider
	meterProvider  metric.MeterProvider
	meter          metric.Meter

	shutdownMu   sync.Mutex
	shutdownDone bool
	shutdownErr  error
	shutdownFns  []func(context.Context) error

	internalHTTPRequests metric.Int64Counter
	internalHTTPDuration metric.Float64Histogram

	startOutcomes        metric.Int64Counter
	stopOutcomes         metric.Int64Counter
	restartOutcomes      metric.Int64Counter
	patchOutcomes        metric.Int64Counter
	cleanupOutcomes      metric.Int64Counter
	healthEvents         metric.Int64Counter
	reconcileDrift       metric.Int64Counter
	notificationIntents  metric.Int64Counter
	dockerOpLatency      metric.Float64Histogram
	leaseAcquireLatency  metric.Float64Histogram

	runtimeRecordsByStatus metric.Int64ObservableGauge

	gaugeMu           sync.Mutex
	gaugeRegistration metric.Registration
}

// NewWithProviders constructs a telemetry runtime around explicitly supplied
// meterProvider and tracerProvider values.
func NewWithProviders(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider) (*Runtime, error) {
	if meterProvider == nil {
		meterProvider = otel.GetMeterProvider()
	}
	if tracerProvider == nil {
		tracerProvider = otel.GetTracerProvider()
	}
	if meterProvider == nil {
		return nil, errors.New("new rtmanager telemetry runtime: nil meter provider")
	}
	if tracerProvider == nil {
		return nil, errors.New("new rtmanager telemetry runtime: nil tracer provider")
	}

	return buildRuntime(meterProvider, tracerProvider, nil)
}

// NewProcess constructs the process-wide Runtime Manager OpenTelemetry
// runtime from cfg, installs the resulting providers globally, and
// returns the runtime.
func NewProcess(ctx context.Context, cfg ProcessConfig, logger *slog.Logger) (*Runtime, error) {
	if ctx == nil {
		return nil, errors.New("new rtmanager telemetry process: nil context")
	}
	if err := cfg.Validate(); err != nil {
		return nil, fmt.Errorf("new rtmanager telemetry process: %w", err)
	}
	if logger == nil {
		logger = slog.Default()
	}

	serviceName := strings.TrimSpace(cfg.ServiceName)
	if serviceName == "" {
		serviceName = defaultServiceName
	}

	res := resource.NewSchemaless(attribute.String("service.name", serviceName))

	tracerProvider, err := newTracerProvider(ctx, res, cfg)
	if err != nil {
		return nil, fmt.Errorf("new rtmanager telemetry process: tracer provider: %w", err)
	}
	meterProvider, err := newMeterProvider(ctx, res, cfg)
	if err != nil {
		return nil, fmt.Errorf("new rtmanager telemetry process: meter provider: %w", err)
	}

	otel.SetTracerProvider(tracerProvider)
	otel.SetMeterProvider(meterProvider)
	otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
		propagation.TraceContext{},
		propagation.Baggage{},
	))

	runtime, err := buildRuntime(meterProvider, tracerProvider, []func(context.Context) error{
		meterProvider.Shutdown,
		tracerProvider.Shutdown,
	})
	if err != nil {
		return nil, fmt.Errorf("new rtmanager telemetry process: runtime: %w", err)
	}

	logger.Info("rtmanager telemetry configured",
		"service_name", serviceName,
		"traces_exporter", cfg.TracesExporter,
		"metrics_exporter", cfg.MetricsExporter,
	)

	return runtime, nil
}

// TracerProvider returns the runtime tracer provider.
func (runtime *Runtime) TracerProvider() oteltrace.TracerProvider {
	if runtime == nil || runtime.tracerProvider == nil {
		return otel.GetTracerProvider()
	}

	return runtime.tracerProvider
}

// MeterProvider returns the runtime meter provider.
func (runtime *Runtime) MeterProvider() metric.MeterProvider {
	if runtime == nil || runtime.meterProvider == nil {
		return otel.GetMeterProvider()
	}

	return runtime.meterProvider
}

// Shutdown flushes and stops the configured telemetry providers. Shutdown
// is idempotent.
func (runtime *Runtime) Shutdown(ctx context.Context) error {
	if runtime == nil {
		return nil
	}

	runtime.shutdownMu.Lock()
	if runtime.shutdownDone {
		err := runtime.shutdownErr
		runtime.shutdownMu.Unlock()
		return err
	}
	runtime.shutdownDone = true
	runtime.shutdownMu.Unlock()

	runtime.gaugeMu.Lock()
	if runtime.gaugeRegistration != nil {
		_ = runtime.gaugeRegistration.Unregister()
		runtime.gaugeRegistration = nil
	}
	runtime.gaugeMu.Unlock()

	var shutdownErr error
	for index := len(runtime.shutdownFns) - 1; index >= 0; index-- {
		shutdownErr = errors.Join(shutdownErr, runtime.shutdownFns[index](ctx))
	}

	runtime.shutdownMu.Lock()
	runtime.shutdownErr = shutdownErr
	runtime.shutdownMu.Unlock()

	return shutdownErr
}

// RecordInternalHTTPRequest records one internal HTTP request outcome.
func (runtime *Runtime) RecordInternalHTTPRequest(ctx context.Context, attrs []attribute.KeyValue, duration time.Duration) {
	if runtime == nil {
		return
	}

	options := metric.WithAttributes(attrs...)
	runtime.internalHTTPRequests.Add(normalizeContext(ctx), 1, options)
	runtime.internalHTTPDuration.Record(normalizeContext(ctx), duration.Seconds()*1000, options)
}

// RecordStartOutcome records one terminal outcome of the start operation.
// outcome is `success` or `failure`; errorCode is `replay_no_op` or one of
// the stable failure codes from `rtmanager/README.md` §Error Model;
// opSource is `lobby_stream`, `gm_rest`, or `admin_rest`.
func (runtime *Runtime) RecordStartOutcome(ctx context.Context, outcome, errorCode, opSource string) {
	if runtime == nil || runtime.startOutcomes == nil {
		return
	}
	runtime.startOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
		attribute.String("outcome", outcome),
		attribute.String("error_code", errorCode),
		attribute.String("op_source", opSource),
	))
}

// RecordStopOutcome records one terminal outcome of the stop operation.
// reason is the value carried on `runtime:stop_jobs` or the matching REST
// reason; opSource is `lobby_stream`, `gm_rest`, or `admin_rest`.
func (runtime *Runtime) RecordStopOutcome(ctx context.Context, outcome, reason, opSource string) {
	if runtime == nil || runtime.stopOutcomes == nil {
		return
	}
	runtime.stopOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
		attribute.String("outcome", outcome),
		attribute.String("reason", reason),
		attribute.String("op_source", opSource),
	))
}

// RecordRestartOutcome records one terminal outcome of the restart
// operation.
func (runtime *Runtime) RecordRestartOutcome(ctx context.Context, outcome, errorCode string) {
	if runtime == nil || runtime.restartOutcomes == nil {
		return
	}
	runtime.restartOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
		attribute.String("outcome", outcome),
		attribute.String("error_code", errorCode),
	))
}

// RecordPatchOutcome records one terminal outcome of the patch operation.
func (runtime *Runtime) RecordPatchOutcome(ctx context.Context, outcome, errorCode string) {
	if runtime == nil || runtime.patchOutcomes == nil {
		return
	}
	runtime.patchOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
		attribute.String("outcome", outcome),
		attribute.String("error_code", errorCode),
	))
}

// RecordCleanupOutcome records one terminal outcome of the cleanup
// operation. opSource is `auto_ttl` for the periodic cleanup worker and
// `admin_rest` for explicit administrative removal.
func (runtime *Runtime) RecordCleanupOutcome(ctx context.Context, outcome, opSource string) {
	if runtime == nil || runtime.cleanupOutcomes == nil {
		return
	}
	runtime.cleanupOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
		attribute.String("outcome", outcome),
		attribute.String("op_source", opSource),
	))
}

// RecordHealthEvent records one technical runtime event published on
// `runtime:health_events`. eventType comes from the frozen vocabulary in
// `rtmanager/README.md` §Async Stream Contracts.
func (runtime *Runtime) RecordHealthEvent(ctx context.Context, eventType string) {
	if runtime == nil || runtime.healthEvents == nil {
		return
	}
	runtime.healthEvents.Add(normalizeContext(ctx), 1, metric.WithAttributes(
		attribute.String("event_type", eventType),
	))
}

// RecordReconcileDrift records one drift outcome from the reconciler. kind
// is `adopt`, `dispose`, or `observed_exited`.
func (runtime *Runtime) RecordReconcileDrift(ctx context.Context, kind string) {
	if runtime == nil || runtime.reconcileDrift == nil {
		return
	}
	runtime.reconcileDrift.Add(normalizeContext(ctx), 1, metric.WithAttributes(
		attribute.String("kind", kind),
	))
}

// RecordNotificationIntent records one admin-only notification intent
// publish attempt. notificationType is `runtime.image_pull_failed`,
// `runtime.container_start_failed`, or `runtime.start_config_invalid`.
func (runtime *Runtime) RecordNotificationIntent(ctx context.Context, notificationType string) {
	if runtime == nil || runtime.notificationIntents == nil {
		return
	}
	runtime.notificationIntents.Add(normalizeContext(ctx), 1, metric.WithAttributes(
		attribute.String("notification_type", notificationType),
	))
}

// RecordDockerOpLatency records the wall-clock duration of one Docker SDK
// call. op is one of `pull`, `create`, `start`, `stop`, `rm`, `inspect`,
// `events`.
func (runtime *Runtime) RecordDockerOpLatency(ctx context.Context, op string, duration time.Duration) {
	if runtime == nil || runtime.dockerOpLatency == nil {
		return
	}
	runtime.dockerOpLatency.Record(normalizeContext(ctx), duration.Seconds()*1000, metric.WithAttributes(
		attribute.String("op", op),
	))
}

// RecordLeaseAcquireLatency records the wall-clock latency of one
// per-game Redis lease acquisition.
func (runtime *Runtime) RecordLeaseAcquireLatency(ctx context.Context, duration time.Duration) {
	if runtime == nil || runtime.leaseAcquireLatency == nil {
		return
	}
	runtime.leaseAcquireLatency.Record(normalizeContext(ctx), duration.Seconds()*1000)
}

// RuntimeRecordsByStatusProbe reports the number of runtime_records rows
// per status. The production probe wraps the runtime record store; tests
// may pass a stub.
type RuntimeRecordsByStatusProbe interface {
	CountByStatus(ctx context.Context) (map[string]int, error)
}

// GaugeDependencies groups the collaborators required by RegisterGauges.
type GaugeDependencies struct {
	// RuntimeRecordsByStatus probes the per-status row count for
	// `rtmanager.runtime_records_by_status`.
	RuntimeRecordsByStatus RuntimeRecordsByStatusProbe

	// Logger records non-fatal probe errors. Defaults to slog.Default
	// when nil.
	Logger *slog.Logger
}

// RegisterGauges installs the observable-gauge callback that reports
// `rtmanager.runtime_records_by_status`. It is safe to call once per
// Runtime; a second call replaces the previous registration. The runtime
// keeps no strong reference to deps beyond the callback closure.
//
// The wiring layer registers the gauge once the persistence adapters
// are constructed.
func (runtime *Runtime) RegisterGauges(deps GaugeDependencies) error {
	if runtime == nil {
		return errors.New("register rtmanager gauges: nil runtime")
	}
	if deps.RuntimeRecordsByStatus == nil {
		return errors.New("register rtmanager gauges: nil runtime records probe")
	}

	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}

	runtime.gaugeMu.Lock()
	defer runtime.gaugeMu.Unlock()

	if runtime.gaugeRegistration != nil {
		_ = runtime.gaugeRegistration.Unregister()
		runtime.gaugeRegistration = nil
	}

	callback := func(ctx context.Context, observer metric.Observer) error {
		counts, err := deps.RuntimeRecordsByStatus.CountByStatus(ctx)
		if err != nil {
			logger.WarnContext(ctx, "runtime records probe failed",
				"err", err.Error(),
			)
			return nil
		}
		for status, count := range counts {
			observer.ObserveInt64(runtime.runtimeRecordsByStatus, int64(count), metric.WithAttributes(
				attribute.String("status", status),
			))
		}
		return nil
	}

	registration, err := runtime.meter.RegisterCallback(callback, runtime.runtimeRecordsByStatus)
	if err != nil {
		return fmt.Errorf("register rtmanager gauges: %w", err)
	}
	runtime.gaugeRegistration = registration

	return nil
}

func buildRuntime(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider, shutdownFns []func(context.Context) error) (*Runtime, error) {
	meter := meterProvider.Meter(meterName)
	runtime := &Runtime{
		tracerProvider: tracerProvider,
		meterProvider:  meterProvider,
		meter:          meter,
		shutdownFns:    append([]func(context.Context) error(nil), shutdownFns...),
	}

	internalHTTPRequests, err := meter.Int64Counter("rtmanager.internal_http.requests")
	if err != nil {
		return nil, fmt.Errorf("build rtmanager telemetry runtime: internal_http.requests: %w", err)
	}
	internalHTTPDuration, err := meter.Float64Histogram("rtmanager.internal_http.duration", metric.WithUnit("ms"))
	if err != nil {
		return nil, fmt.Errorf("build rtmanager telemetry runtime: internal_http.duration: %w", err)
	}
	runtime.internalHTTPRequests = internalHTTPRequests
	runtime.internalHTTPDuration = internalHTTPDuration

	if err := registerCounters(meter, runtime); err != nil {
		return nil, err
	}
	if err := registerHistograms(meter, runtime); err != nil {
		return nil, err
	}
	if err := registerObservableGauges(meter, runtime); err != nil {
		return nil, err
	}

	return runtime, nil
}

func registerCounters(meter metric.Meter, runtime *Runtime) error {
	specs := []struct {
		name   string
		target *metric.Int64Counter
	}{
		{"rtmanager.start_outcomes", &runtime.startOutcomes},
		{"rtmanager.stop_outcomes", &runtime.stopOutcomes},
		{"rtmanager.restart_outcomes", &runtime.restartOutcomes},
		{"rtmanager.patch_outcomes", &runtime.patchOutcomes},
		{"rtmanager.cleanup_outcomes", &runtime.cleanupOutcomes},
		{"rtmanager.health_events", &runtime.healthEvents},
		{"rtmanager.reconcile_drift", &runtime.reconcileDrift},
		{"rtmanager.notification_intents", &runtime.notificationIntents},
	}
	for _, spec := range specs {
		counter, err := meter.Int64Counter(spec.name)
		if err != nil {
			return fmt.Errorf("build rtmanager telemetry runtime: %s: %w", spec.name, err)
		}
		*spec.target = counter
	}
	return nil
}

func registerHistograms(meter metric.Meter, runtime *Runtime) error {
	specs := []struct {
		name   string
		unit   string
		target *metric.Float64Histogram
	}{
		{"rtmanager.docker_op_latency", "ms", &runtime.dockerOpLatency},
		{"rtmanager.lease_acquire_latency", "ms", &runtime.leaseAcquireLatency},
	}
	for _, spec := range specs {
		options := []metric.Float64HistogramOption{}
		if spec.unit != "" {
			options = append(options, metric.WithUnit(spec.unit))
		}
		histogram, err := meter.Float64Histogram(spec.name, options...)
		if err != nil {
			return fmt.Errorf("build rtmanager telemetry runtime: %s: %w", spec.name, err)
		}
		*spec.target = histogram
	}
	return nil
}

func registerObservableGauges(meter metric.Meter, runtime *Runtime) error {
	gauge, err := meter.Int64ObservableGauge("rtmanager.runtime_records_by_status")
	if err != nil {
		return fmt.Errorf("build rtmanager telemetry runtime: runtime_records_by_status: %w", err)
	}
	runtime.runtimeRecordsByStatus = gauge
	return nil
}

func newTracerProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig) (*sdktrace.TracerProvider, error) {
	options := []sdktrace.TracerProviderOption{
		sdktrace.WithResource(res),
	}

	if exporter, err := traceExporter(ctx, cfg); err != nil {
		return nil, err
	} else if exporter != nil {
		options = append(options, sdktrace.WithBatcher(exporter))
	}

	if cfg.StdoutTracesEnabled {
		exporter, err := stdouttrace.New(stdouttrace.WithWriter(os.Stdout))
		if err != nil {
			return nil, fmt.Errorf("stdout traces exporter: %w", err)
		}
		options = append(options, sdktrace.WithBatcher(exporter))
	}

	return sdktrace.NewTracerProvider(options...), nil
}

func newMeterProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig) (*sdkmetric.MeterProvider, error) {
	options := []sdkmetric.Option{
		sdkmetric.WithResource(res),
	}

	if exporter, err := metricExporter(ctx, cfg); err != nil {
		return nil, err
	} else if exporter != nil {
		options = append(options, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter)))
	}

	if cfg.StdoutMetricsEnabled {
		exporter, err := stdoutmetric.New(stdoutmetric.WithWriter(os.Stdout))
		if err != nil {
			return nil, fmt.Errorf("stdout metrics exporter: %w", err)
		}
		options = append(options, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter)))
	}

	return sdkmetric.NewMeterProvider(options...), nil
}

func traceExporter(ctx context.Context, cfg ProcessConfig) (sdktrace.SpanExporter, error) {
	if cfg.TracesExporter != processExporterOTLP {
		return nil, nil
	}

	switch normalizeProtocol(cfg.TracesProtocol) {
	case processProtocolGRPC:
		exporter, err := otlptracegrpc.New(ctx)
		if err != nil {
			return nil, fmt.Errorf("otlp grpc traces exporter: %w", err)
		}
		return exporter, nil
	default:
		exporter, err := otlptracehttp.New(ctx)
		if err != nil {
			return nil, fmt.Errorf("otlp http traces exporter: %w", err)
		}
		return exporter, nil
	}
}

func metricExporter(ctx context.Context, cfg ProcessConfig) (sdkmetric.Exporter, error) {
	if cfg.MetricsExporter != processExporterOTLP {
		return nil, nil
	}

	switch normalizeProtocol(cfg.MetricsProtocol) {
	case processProtocolGRPC:
		exporter, err := otlpmetricgrpc.New(ctx)
		if err != nil {
			return nil, fmt.Errorf("otlp grpc metrics exporter: %w", err)
		}
		return exporter, nil
	default:
		exporter, err := otlpmetrichttp.New(ctx)
		if err != nil {
			return nil, fmt.Errorf("otlp http metrics exporter: %w", err)
		}
		return exporter, nil
	}
}

func normalizeProtocol(value string) string {
	switch strings.TrimSpace(value) {
	case processProtocolGRPC:
		return processProtocolGRPC
	default:
		return processProtocolHTTPProtobuf
	}
}

func normalizeContext(ctx context.Context) context.Context {
	if ctx == nil {
		return context.Background()
	}

	return ctx
}