feat: runtime manager

Ilia Denisov
2026-04-28 20:39:18 +02:00
committed by GitHub
parent e0a99b346b
commit a7cee15115
289 changed files with 45660 additions and 2207 deletions
@@ -0,0 +1,411 @@
// Package healthprobe runs the active HTTP `/healthz` probe described in
// `rtmanager/README.md §Health Monitoring`.
//
// On every tick the worker lists `runtime_records.status=running`,
// probes each engine endpoint in parallel (capped at
// defaultMaxConcurrency), and applies the
// RTMANAGER_PROBE_FAILURES_THRESHOLD hysteresis to emit `probe_failed`
// (after N consecutive failures) and `probe_recovered` (on the first
// success after a `probe_failed` was published). In-memory state is
// pruned at the start of every tick against the freshly read running
// list, so a game that stops between ticks never accumulates stale
// failure counters.
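//
// For example, with RTMANAGER_PROBE_FAILURES_THRESHOLD=3 the per-game
// sequence fail, fail, fail, fail, ok emits exactly one `probe_failed`
// (on the third consecutive failure) and exactly one `probe_recovered`
// (on the first subsequent success).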
//
// Design rationale is captured in
// `rtmanager/docs/workers.md`.
package healthprobe

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"log/slog"
	"net/http"
	"strings"
	"sync"
	"time"

	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/logging"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/telemetry"
)

// defaultMaxConcurrency caps the number of in-flight `/healthz`
// requests inside a single tick. RTM v1 is single-instance with a
// modest active-game count; the cap keeps a slow engine from delaying
// the rest of the cohort while preventing pathological fan-out if the
// running list grows.
const defaultMaxConcurrency = 16

// healthzPath is the engine probe path. Stable per
// `game/README.md §/healthz`.
const healthzPath = "/healthz"

// Dependencies groups the collaborators required by Worker.
type Dependencies struct {
// RuntimeRecords lists running games on every tick.
RuntimeRecords ports.RuntimeRecordStore
// HealthEvents emits `probe_failed` and `probe_recovered`.
HealthEvents ports.HealthEventPublisher
// HTTPClient performs the engine `/healthz` request. Required.
// Production wiring supplies an `otelhttp`-instrumented client.
HTTPClient *http.Client
// Telemetry records one health-event counter per emission.
Telemetry *telemetry.Runtime
// Interval bounds the tick period.
Interval time.Duration
// ProbeTimeout bounds one engine `/healthz` call.
ProbeTimeout time.Duration
// FailuresThreshold is the consecutive-failure count that promotes
// the in-memory counter to a `probe_failed` emission.
FailuresThreshold int
// MaxConcurrency caps the number of in-flight probes per tick.
// Defaults to defaultMaxConcurrency when zero or negative.
MaxConcurrency int
// Clock supplies the wall-clock used for emission timestamps.
// Defaults to `time.Now` when nil.
Clock func() time.Time
// Logger receives structured worker-level events. Defaults to
// `slog.Default()` when nil.
Logger *slog.Logger
}

// Worker drives the periodic active-probe loop.
type Worker struct {
runtimeRecords ports.RuntimeRecordStore
healthEvents ports.HealthEventPublisher
httpClient *http.Client
telemetry *telemetry.Runtime
interval time.Duration
probeTimeout time.Duration
failuresThreshold int
maxConcurrency int
clock func() time.Time
logger *slog.Logger
mu sync.Mutex
states map[string]*probeState
}

// probeState stores the per-game hysteresis counters. Owned by Worker
// and protected by Worker.mu.
type probeState struct {
consecutiveFailures int
failurePublished bool
}

// NewWorker constructs one Worker from deps.
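//
// A minimal wiring sketch (recordStore, eventPublisher, httpClient, and
// telemetryRuntime are illustrative stand-ins supplied by the caller's
// composition root, not part of this package):
//
//	worker, err := healthprobe.NewWorker(healthprobe.Dependencies{
//		RuntimeRecords:    recordStore,
//		HealthEvents:      eventPublisher,
//		HTTPClient:        httpClient, // otelhttp-instrumented in production
//		Telemetry:         telemetryRuntime,
//		Interval:          15 * time.Second,
//		ProbeTimeout:      2 * time.Second,
//		FailuresThreshold: 3,
//	})
//	if err != nil {
//		return err
//	}
//	go func() { _ = worker.Run(ctx) }()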
func NewWorker(deps Dependencies) (*Worker, error) {
switch {
case deps.RuntimeRecords == nil:
return nil, errors.New("new health probe worker: nil runtime records store")
case deps.HealthEvents == nil:
return nil, errors.New("new health probe worker: nil health events publisher")
case deps.HTTPClient == nil:
return nil, errors.New("new health probe worker: nil http client")
case deps.Telemetry == nil:
return nil, errors.New("new health probe worker: nil telemetry runtime")
case deps.Interval <= 0:
return nil, errors.New("new health probe worker: interval must be positive")
case deps.ProbeTimeout <= 0:
return nil, errors.New("new health probe worker: probe timeout must be positive")
case deps.FailuresThreshold <= 0:
return nil, errors.New("new health probe worker: failures threshold must be positive")
}
clock := deps.Clock
if clock == nil {
clock = time.Now
}
logger := deps.Logger
if logger == nil {
logger = slog.Default()
}
maxConcurrency := deps.MaxConcurrency
if maxConcurrency <= 0 {
maxConcurrency = defaultMaxConcurrency
}
return &Worker{
runtimeRecords: deps.RuntimeRecords,
healthEvents: deps.HealthEvents,
httpClient: deps.HTTPClient,
telemetry: deps.Telemetry,
interval: deps.Interval,
probeTimeout: deps.ProbeTimeout,
failuresThreshold: deps.FailuresThreshold,
maxConcurrency: maxConcurrency,
clock: clock,
logger: logger.With("worker", "rtmanager.healthprobe"),
states: map[string]*probeState{},
}, nil
}

// Run drives the probe loop until ctx is cancelled. Per-tick errors are
// absorbed; the loop only exits on context cancellation.
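//
// Typical use (a sketch; the signal wiring shown here is assumed to
// live in the caller, not in this package):
//
//	ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt)
//	defer stop()
//	if err := worker.Run(ctx); err != nil && !errors.Is(err, context.Canceled) {
//		log.Fatal(err)
//	}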
func (worker *Worker) Run(ctx context.Context) error {
if worker == nil {
return errors.New("run health probe worker: nil worker")
}
if ctx == nil {
return errors.New("run health probe worker: nil context")
}
if err := ctx.Err(); err != nil {
return err
}
worker.logger.Info("health probe worker started",
"interval", worker.interval.String(),
"probe_timeout", worker.probeTimeout.String(),
"failures_threshold", worker.failuresThreshold,
"max_concurrency", worker.maxConcurrency,
)
defer worker.logger.Info("health probe worker stopped")
ticker := time.NewTicker(worker.interval)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return ctx.Err()
case <-ticker.C:
worker.tick(ctx)
}
}
}

// Shutdown is a no-op; Run terminates on context cancellation.
func (worker *Worker) Shutdown(ctx context.Context) error {
if ctx == nil {
return errors.New("shutdown health probe worker: nil context")
}
return nil
}

// Tick performs one probe pass. Exported so tests can drive the worker
// deterministically without spinning a real ticker.
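//
// Test sketch (the failing engine stub and publisher fake are assumed
// test doubles, not part of this package):
//
//	for i := 0; i < failuresThreshold; i++ {
//		worker.Tick(ctx) // each pass probes the stubbed engine once
//	}
//	// the fake publisher should now hold exactly one probe_failed envelope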
func (worker *Worker) Tick(ctx context.Context) {
worker.tick(ctx)
}

// tick performs one full pass: list running records, prune state for
// stopped games, then probe every running game in parallel.
func (worker *Worker) tick(ctx context.Context) {
if err := ctx.Err(); err != nil {
return
}
records, err := worker.runtimeRecords.ListByStatus(ctx, runtime.StatusRunning)
if err != nil {
worker.logger.WarnContext(ctx, "list running records",
"err", err.Error(),
)
return
}
worker.pruneStates(records)
if len(records) == 0 {
return
}
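	// Bounded fan-out: each in-flight probe holds one semaphore slot,
	// and acquisition below races ctx.Done so cancellation is never
	// stuck behind a full semaphore.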
semaphore := make(chan struct{}, worker.maxConcurrency)
var waitGroup sync.WaitGroup
for _, record := range records {
select {
case <-ctx.Done():
waitGroup.Wait()
return
case semaphore <- struct{}{}:
}
waitGroup.Add(1)
go func(record runtime.RuntimeRecord) {
defer waitGroup.Done()
defer func() { <-semaphore }()
worker.probeOne(ctx, record)
}(record)
}
waitGroup.Wait()
}

// pruneStates removes per-game state for games no longer in the running
// list. Stopped or removed games therefore start with a clean counter
// the next time they re-enter `running`.
func (worker *Worker) pruneStates(records []runtime.RuntimeRecord) {
worker.mu.Lock()
defer worker.mu.Unlock()
if len(worker.states) == 0 {
return
}
running := make(map[string]struct{}, len(records))
for _, record := range records {
running[record.GameID] = struct{}{}
}
for gameID := range worker.states {
if _, ok := running[gameID]; !ok {
delete(worker.states, gameID)
}
}
}

// probeOne issues one `/healthz` request and updates hysteresis state.
func (worker *Worker) probeOne(ctx context.Context, record runtime.RuntimeRecord) {
probeCtx, cancel := context.WithTimeout(ctx, worker.probeTimeout)
defer cancel()
endpoint := strings.TrimRight(record.EngineEndpoint, "/") + healthzPath
request, err := http.NewRequestWithContext(probeCtx, http.MethodGet, endpoint, nil)
if err != nil {
worker.recordFailure(ctx, record, 0, fmt.Errorf("build request: %w", err))
return
}
response, err := worker.httpClient.Do(request)
if err != nil {
worker.recordFailure(ctx, record, 0, err)
return
}
	defer func() {
		// Drain before closing so the keep-alive connection can be reused.
		_, _ = io.Copy(io.Discard, response.Body)
		_ = response.Body.Close()
	}()
if response.StatusCode == http.StatusOK {
worker.recordSuccess(ctx, record)
return
}
worker.recordFailure(ctx, record, response.StatusCode, fmt.Errorf("unexpected status %d", response.StatusCode))
}

// recordSuccess updates state on a successful probe and emits
// `probe_recovered` when the prior tick had crossed the failure
// threshold.
func (worker *Worker) recordSuccess(ctx context.Context, record runtime.RuntimeRecord) {
worker.mu.Lock()
state, ok := worker.states[record.GameID]
if !ok {
worker.mu.Unlock()
return
}
if !state.failurePublished {
state.consecutiveFailures = 0
worker.mu.Unlock()
return
}
priorFailureCount := state.consecutiveFailures
state.consecutiveFailures = 0
state.failurePublished = false
worker.mu.Unlock()
worker.publish(ctx, ports.HealthEventEnvelope{
GameID: record.GameID,
ContainerID: record.CurrentContainerID,
EventType: health.EventTypeProbeRecovered,
OccurredAt: worker.clock().UTC(),
Details: probeRecoveredDetails(priorFailureCount),
})
}

// recordFailure updates state on a failed probe and emits
// `probe_failed` once the threshold is crossed.
func (worker *Worker) recordFailure(ctx context.Context, record runtime.RuntimeRecord, lastStatus int, lastErr error) {
worker.mu.Lock()
state, ok := worker.states[record.GameID]
if !ok {
state = &probeState{}
worker.states[record.GameID] = state
}
state.consecutiveFailures++
if state.failurePublished || state.consecutiveFailures < worker.failuresThreshold {
count := state.consecutiveFailures
worker.mu.Unlock()
worker.logger.DebugContext(ctx, "probe failure",
"game_id", record.GameID,
"consecutive_failures", count,
"threshold", worker.failuresThreshold,
"err", errString(lastErr),
)
return
}
state.failurePublished = true
count := state.consecutiveFailures
worker.mu.Unlock()
worker.publish(ctx, ports.HealthEventEnvelope{
GameID: record.GameID,
ContainerID: record.CurrentContainerID,
EventType: health.EventTypeProbeFailed,
OccurredAt: worker.clock().UTC(),
Details: probeFailedDetails(count, lastStatus, errString(lastErr)),
})
}

// publish emits one envelope through the configured publisher, updates
// the telemetry counter, and logs the outcome. Publish failures are
// absorbed rather than retried: the envelope is dropped and the failure
// is logged, per `rtmanager/README.md §Notification Contracts`.
func (worker *Worker) publish(ctx context.Context, envelope ports.HealthEventEnvelope) {
if err := worker.healthEvents.Publish(ctx, envelope); err != nil {
worker.logger.ErrorContext(ctx, "publish health event",
"game_id", envelope.GameID,
"container_id", envelope.ContainerID,
"event_type", string(envelope.EventType),
"err", err.Error(),
)
return
}
worker.telemetry.RecordHealthEvent(ctx, string(envelope.EventType))
logArgs := []any{
"game_id", envelope.GameID,
"container_id", envelope.ContainerID,
"event_type", string(envelope.EventType),
}
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
worker.logger.InfoContext(ctx, "probe event published", logArgs...)
}

// probeFailedDetails builds the JSON payload required by the
// `probe_failed` AsyncAPI variant.
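//
// An illustrative emission (values are examples only):
//
//	{"consecutive_failures":3,"last_status":503,"last_error":"unexpected status 503"}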
func probeFailedDetails(consecutiveFailures, lastStatus int, lastError string) json.RawMessage {
payload := struct {
ConsecutiveFailures int `json:"consecutive_failures"`
LastStatus int `json:"last_status"`
LastError string `json:"last_error"`
}{
ConsecutiveFailures: consecutiveFailures,
LastStatus: lastStatus,
LastError: lastError,
}
encoded, _ := json.Marshal(payload)
return encoded
}

// probeRecoveredDetails builds the JSON payload required by the
// `probe_recovered` AsyncAPI variant.
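//
// An illustrative emission:
//
//	{"prior_failure_count":3}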
func probeRecoveredDetails(priorFailureCount int) json.RawMessage {
payload := struct {
PriorFailureCount int `json:"prior_failure_count"`
}{PriorFailureCount: priorFailureCount}
encoded, _ := json.Marshal(payload)
return encoded
}

// errString renders err as a log field value, mapping nil to the empty
// string.
func errString(err error) string {
if err == nil {
return ""
}
return err.Error()
}