// Package healthprobe runs the active HTTP `/healthz` probe described in
// `rtmanager/README.md §Health Monitoring`.
//
// On every tick the worker lists `runtime_records.status=running`,
// probes each engine endpoint in parallel (capped at
// defaultMaxConcurrency), and applies the
// RTMANAGER_PROBE_FAILURES_THRESHOLD hysteresis to emit `probe_failed`
// (after N consecutive failures) and `probe_recovered` (on the first
// success after a `probe_failed` was published). In-memory state is
// pruned at the start of every tick against the freshly-read running
// list, so a game that stops between ticks never accumulates stale
// failure counters.
//
// Design rationale is captured in
// `rtmanager/docs/workers.md`.
package healthprobe

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"log/slog"
	"net/http"
	"strings"
	"sync"
	"time"

	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/logging"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/telemetry"
)

// defaultMaxConcurrency caps the number of in-flight `/healthz`
// requests inside a single tick. RTM v1 is single-instance with a
// modest active-game count; the cap keeps a slow engine from delaying
// the rest of the cohort while preventing pathological fan-out if the
// running list grows.
const defaultMaxConcurrency = 16

// healthzPath is the engine probe path. Stable per
// `game/README.md §/healthz`.
const healthzPath = "/healthz"

// Dependencies groups the collaborators required by Worker.
type Dependencies struct {
	// RuntimeRecords lists running games on every tick.
	RuntimeRecords ports.RuntimeRecordStore

	// HealthEvents emits `probe_failed` and `probe_recovered`.
	HealthEvents ports.HealthEventPublisher

	// HTTPClient performs the engine `/healthz` request. Required.
	// Production wiring supplies an `otelhttp`-instrumented client.
	HTTPClient *http.Client

	// Telemetry records one health-event counter per emission.
	Telemetry *telemetry.Runtime

	// Interval bounds the tick period.
	Interval time.Duration

	// ProbeTimeout bounds one engine `/healthz` call.
	ProbeTimeout time.Duration

	// FailuresThreshold is the consecutive-failure count that promotes
	// the in-memory counter to a `probe_failed` emission.
	FailuresThreshold int

	// MaxConcurrency caps the number of in-flight probes per tick.
	// Defaults to defaultMaxConcurrency when zero or negative.
	MaxConcurrency int

	// Clock supplies the wall-clock used for emission timestamps.
	// Defaults to `time.Now` when nil.
	Clock func() time.Time

	// Logger receives structured worker-level events. Defaults to
	// `slog.Default()` when nil.
	Logger *slog.Logger
}

// Worker drives the periodic active-probe loop.
type Worker struct {
	runtimeRecords    ports.RuntimeRecordStore
	healthEvents      ports.HealthEventPublisher
	httpClient        *http.Client
	telemetry         *telemetry.Runtime
	interval          time.Duration
	probeTimeout      time.Duration
	failuresThreshold int
	maxConcurrency    int
	clock             func() time.Time
	logger            *slog.Logger

	mu     sync.Mutex
	states map[string]*probeState
}

// probeState stores the per-game hysteresis counters. Owned by Worker
// and protected by Worker.mu.
type probeState struct {
	consecutiveFailures int
	failurePublished    bool
}
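
// A minimal wiring sketch for callers of this package. The values and
// the `recordStore`, `eventPublisher`, `telemetryRuntime`, and `ctx`
// names are illustrative assumptions, not identifiers shipped by this
// package:
//
//	worker, err := healthprobe.NewWorker(healthprobe.Dependencies{
//		RuntimeRecords:    recordStore,
//		HealthEvents:      eventPublisher,
//		HTTPClient:        &http.Client{},
//		Telemetry:         telemetryRuntime,
//		Interval:          15 * time.Second,
//		ProbeTimeout:      2 * time.Second,
//		FailuresThreshold: 3,
//	})
//	if err != nil {
//		return err
//	}
//	go func() { _ = worker.Run(ctx) }()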

// NewWorker constructs one Worker from deps.
func NewWorker(deps Dependencies) (*Worker, error) {
	switch {
	case deps.RuntimeRecords == nil:
		return nil, errors.New("new health probe worker: nil runtime records store")
	case deps.HealthEvents == nil:
		return nil, errors.New("new health probe worker: nil health events publisher")
	case deps.HTTPClient == nil:
		return nil, errors.New("new health probe worker: nil http client")
	case deps.Telemetry == nil:
		return nil, errors.New("new health probe worker: nil telemetry runtime")
	case deps.Interval <= 0:
		return nil, errors.New("new health probe worker: interval must be positive")
	case deps.ProbeTimeout <= 0:
		return nil, errors.New("new health probe worker: probe timeout must be positive")
	case deps.FailuresThreshold <= 0:
		return nil, errors.New("new health probe worker: failures threshold must be positive")
	}

	clock := deps.Clock
	if clock == nil {
		clock = time.Now
	}
	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}
	maxConcurrency := deps.MaxConcurrency
	if maxConcurrency <= 0 {
		maxConcurrency = defaultMaxConcurrency
	}

	return &Worker{
		runtimeRecords:    deps.RuntimeRecords,
		healthEvents:      deps.HealthEvents,
		httpClient:        deps.HTTPClient,
		telemetry:         deps.Telemetry,
		interval:          deps.Interval,
		probeTimeout:      deps.ProbeTimeout,
		failuresThreshold: deps.FailuresThreshold,
		maxConcurrency:    maxConcurrency,
		clock:             clock,
		logger:            logger.With("worker", "rtmanager.healthprobe"),
		states:            map[string]*probeState{},
	}, nil
}

// Run drives the probe loop until ctx is cancelled. Per-tick errors are
// absorbed; the loop only exits on context cancellation.
func (worker *Worker) Run(ctx context.Context) error {
	if worker == nil {
		return errors.New("run health probe worker: nil worker")
	}
	if ctx == nil {
		return errors.New("run health probe worker: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}

	worker.logger.Info("health probe worker started",
		"interval", worker.interval.String(),
		"probe_timeout", worker.probeTimeout.String(),
		"failures_threshold", worker.failuresThreshold,
		"max_concurrency", worker.maxConcurrency,
	)
	defer worker.logger.Info("health probe worker stopped")

	ticker := time.NewTicker(worker.interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			worker.tick(ctx)
		}
	}
}

// Shutdown is a no-op; Run terminates on context cancellation.
func (worker *Worker) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown health probe worker: nil context")
	}
	return nil
}

// Tick performs one probe pass. Exported so tests can drive the worker
// deterministically without spinning a real ticker.
func (worker *Worker) Tick(ctx context.Context) {
	worker.tick(ctx)
}
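
// A deterministic test sketch built on the exported Tick. The stub
// store, publisher, failing HTTP client, telemetry value, and ctx used
// here are assumptions supplied by the test, not part of this package:
//
//	worker, _ := NewWorker(Dependencies{
//		RuntimeRecords:    stubStore,     // returns one running record
//		HealthEvents:      stubPublisher, // captures emitted envelopes
//		HTTPClient:        failingClient, // every probe errors
//		Telemetry:         telemetryRuntime,
//		Interval:          time.Second,
//		ProbeTimeout:      time.Second,
//		FailuresThreshold: 2,
//	})
//	worker.Tick(ctx) // failure 1: below threshold, nothing emitted
//	worker.Tick(ctx) // failure 2: threshold crossed, probe_failed emitted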

// tick performs one full pass: list running records, prune state for
// stopped games, then probe every running game in parallel.
func (worker *Worker) tick(ctx context.Context) {
	if err := ctx.Err(); err != nil {
		return
	}

	records, err := worker.runtimeRecords.ListByStatus(ctx, runtime.StatusRunning)
	if err != nil {
		worker.logger.WarnContext(ctx, "list running records",
			"err", err.Error(),
		)
		return
	}

	worker.pruneStates(records)
	if len(records) == 0 {
		return
	}

	semaphore := make(chan struct{}, worker.maxConcurrency)
	var waitGroup sync.WaitGroup
	for _, record := range records {
		select {
		case <-ctx.Done():
			waitGroup.Wait()
			return
		case semaphore <- struct{}{}:
		}

		waitGroup.Add(1)
		go func(record runtime.RuntimeRecord) {
			defer waitGroup.Done()
			defer func() { <-semaphore }()
			worker.probeOne(ctx, record)
		}(record)
	}
	waitGroup.Wait()
}

// pruneStates removes per-game state for games no longer in the running
// list. Stopped or removed games therefore start with a clean counter
// the next time they re-enter `running`.
func (worker *Worker) pruneStates(records []runtime.RuntimeRecord) {
	worker.mu.Lock()
	defer worker.mu.Unlock()

	if len(worker.states) == 0 {
		return
	}

	running := make(map[string]struct{}, len(records))
	for _, record := range records {
		running[record.GameID] = struct{}{}
	}
	for gameID := range worker.states {
		if _, ok := running[gameID]; !ok {
			delete(worker.states, gameID)
		}
	}
}

// probeOne issues one `/healthz` request and updates hysteresis state.
func (worker *Worker) probeOne(ctx context.Context, record runtime.RuntimeRecord) {
	probeCtx, cancel := context.WithTimeout(ctx, worker.probeTimeout)
	defer cancel()

	endpoint := strings.TrimRight(record.EngineEndpoint, "/") + healthzPath
	request, err := http.NewRequestWithContext(probeCtx, http.MethodGet, endpoint, nil)
	if err != nil {
		worker.recordFailure(ctx, record, 0, fmt.Errorf("build request: %w", err))
		return
	}

	response, err := worker.httpClient.Do(request)
	if err != nil {
		worker.recordFailure(ctx, record, 0, err)
		return
	}
	defer response.Body.Close()

	if response.StatusCode == http.StatusOK {
		worker.recordSuccess(ctx, record)
		return
	}
	worker.recordFailure(ctx, record, response.StatusCode, fmt.Errorf("unexpected status %d", response.StatusCode))
}

// recordSuccess updates state on a successful probe and emits
// `probe_recovered` when the prior tick had crossed the failure
// threshold.
func (worker *Worker) recordSuccess(ctx context.Context, record runtime.RuntimeRecord) {
	worker.mu.Lock()
	state, ok := worker.states[record.GameID]
	if !ok {
		worker.mu.Unlock()
		return
	}
	if !state.failurePublished {
		state.consecutiveFailures = 0
		worker.mu.Unlock()
		return
	}

	priorFailureCount := state.consecutiveFailures
	state.consecutiveFailures = 0
	state.failurePublished = false
	worker.mu.Unlock()

	worker.publish(ctx, ports.HealthEventEnvelope{
		GameID:      record.GameID,
		ContainerID: record.CurrentContainerID,
		EventType:   health.EventTypeProbeRecovered,
		OccurredAt:  worker.clock().UTC(),
		Details:     probeRecoveredDetails(priorFailureCount),
	})
}
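
// Hysteresis walk-through for one game with FailuresThreshold = 3
// (illustrative tick sequence, not a test fixture):
//
//	tick 1: probe fails    -> consecutiveFailures=1, nothing emitted
//	tick 2: probe fails    -> consecutiveFailures=2, nothing emitted
//	tick 3: probe fails    -> consecutiveFailures=3, probe_failed emitted
//	tick 4: probe fails    -> counter keeps climbing, no re-emission
//	tick 5: probe succeeds -> probe_recovered emitted with
//	                          prior_failure_count=4, counters reset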

// recordFailure updates state on a failed probe and emits
// `probe_failed` once the threshold is crossed.
func (worker *Worker) recordFailure(ctx context.Context, record runtime.RuntimeRecord, lastStatus int, lastErr error) {
	worker.mu.Lock()
	state, ok := worker.states[record.GameID]
	if !ok {
		state = &probeState{}
		worker.states[record.GameID] = state
	}
	state.consecutiveFailures++

	if state.failurePublished || state.consecutiveFailures < worker.failuresThreshold {
		count := state.consecutiveFailures
		worker.mu.Unlock()
		worker.logger.DebugContext(ctx, "probe failure",
			"game_id", record.GameID,
			"consecutive_failures", count,
			"threshold", worker.failuresThreshold,
			"err", errString(lastErr),
		)
		return
	}

	state.failurePublished = true
	count := state.consecutiveFailures
	worker.mu.Unlock()

	worker.publish(ctx, ports.HealthEventEnvelope{
		GameID:      record.GameID,
		ContainerID: record.CurrentContainerID,
		EventType:   health.EventTypeProbeFailed,
		OccurredAt:  worker.clock().UTC(),
		Details:     probeFailedDetails(count, lastStatus, errString(lastErr)),
	})
}

// publish emits one envelope through the configured publisher, updates
// the telemetry counter, and logs the outcome. Failures degrade to a
// warning log per `rtmanager/README.md §Notification Contracts`.
func (worker *Worker) publish(ctx context.Context, envelope ports.HealthEventEnvelope) {
	if err := worker.healthEvents.Publish(ctx, envelope); err != nil {
		worker.logger.WarnContext(ctx, "publish health event",
			"game_id", envelope.GameID,
			"container_id", envelope.ContainerID,
			"event_type", string(envelope.EventType),
			"err", err.Error(),
		)
		return
	}

	worker.telemetry.RecordHealthEvent(ctx, string(envelope.EventType))

	logArgs := []any{
		"game_id", envelope.GameID,
		"container_id", envelope.ContainerID,
		"event_type", string(envelope.EventType),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	worker.logger.InfoContext(ctx, "probe event published", logArgs...)
}

// probeFailedDetails builds the JSON payload required by the
// `probe_failed` AsyncAPI variant.
func probeFailedDetails(consecutiveFailures, lastStatus int, lastError string) json.RawMessage {
	payload := struct {
		ConsecutiveFailures int    `json:"consecutive_failures"`
		LastStatus          int    `json:"last_status"`
		LastError           string `json:"last_error"`
	}{
		ConsecutiveFailures: consecutiveFailures,
		LastStatus:          lastStatus,
		LastError:           lastError,
	}
	encoded, _ := json.Marshal(payload)
	return encoded
}

// probeRecoveredDetails builds the JSON payload required by the
// `probe_recovered` AsyncAPI variant.
func probeRecoveredDetails(priorFailureCount int) json.RawMessage {
	payload := struct {
		PriorFailureCount int `json:"prior_failure_count"`
	}{PriorFailureCount: priorFailureCount}
	encoded, _ := json.Marshal(payload)
	return encoded
}

func errString(err error) string {
	if err == nil {
		return ""
	}
	return err.Error()
}
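
// Example payloads produced by the detail builders (illustrative input
// values):
//
//	probeFailedDetails(3, 503, "unexpected status 503")
//	  -> {"consecutive_failures":3,"last_status":503,"last_error":"unexpected status 503"}
//	probeRecoveredDetails(4)
//	  -> {"prior_failure_count":4}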