// Package healthprobe runs the active HTTP `/healthz` probe described in
// `rtmanager/README.md §Health Monitoring`.
//
// On every tick the worker lists `runtime_records.status=running`,
// probes each engine endpoint in parallel (capped at
// defaultMaxConcurrency), and applies the
// RTMANAGER_PROBE_FAILURES_THRESHOLD hysteresis to emit `probe_failed`
// (after N consecutive failures) and `probe_recovered` (on the first
// success after a `probe_failed` was published). In-memory state is
// pruned at the start of every tick against the freshly-read running
// list, so a game that stops between ticks never accumulates stale
// failure counters.
//
// Design rationale is captured in
// `rtmanager/docs/workers.md`.
package healthprobe

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"log/slog"
	"net/http"
	"strings"
	"sync"
	"time"

	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/logging"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/telemetry"
)

// defaultMaxConcurrency caps the number of in-flight `/healthz`
// requests inside a single tick. RTM v1 is single-instance with a
// modest active-game count; the cap keeps a slow engine from delaying
// the rest of the cohort while preventing pathological fan-out if the
// running list grows.
const defaultMaxConcurrency = 16

// healthzPath is the engine probe path. Stable per
// `game/README.md §/healthz`.
const healthzPath = "/healthz"

// Dependencies groups the collaborators required by Worker.
type Dependencies struct {
	// RuntimeRecords lists running games on every tick.
	RuntimeRecords ports.RuntimeRecordStore

	// HealthEvents emits `probe_failed` and `probe_recovered`.
	HealthEvents ports.HealthEventPublisher

	// HTTPClient performs the engine `/healthz` request. Required.
	// Production wiring supplies an `otelhttp`-instrumented client.
	HTTPClient *http.Client

	// Telemetry records one health-event counter per emission.
	Telemetry *telemetry.Runtime

	// Interval bounds the tick period.
	Interval time.Duration

	// ProbeTimeout bounds one engine `/healthz` call.
	ProbeTimeout time.Duration

	// FailuresThreshold is the consecutive-failure count that promotes
	// the in-memory counter to a `probe_failed` emission.
	FailuresThreshold int

	// MaxConcurrency caps the number of in-flight probes per tick.
	// Defaults to defaultMaxConcurrency when zero or negative.
	MaxConcurrency int

	// Clock supplies the wall-clock used for emission timestamps.
	// Defaults to `time.Now` when nil.
	Clock func() time.Time

	// Logger receives structured worker-level events. Defaults to
	// `slog.Default()` when nil.
	Logger *slog.Logger
}

// Worker drives the periodic active-probe loop.
type Worker struct {
	runtimeRecords ports.RuntimeRecordStore
	healthEvents   ports.HealthEventPublisher
	httpClient     *http.Client
	telemetry      *telemetry.Runtime

	interval          time.Duration
	probeTimeout      time.Duration
	failuresThreshold int
	maxConcurrency    int

	clock  func() time.Time
	logger *slog.Logger

	mu     sync.Mutex
	states map[string]*probeState
}

// probeState stores the per-game hysteresis counters. Owned by Worker
// and protected by Worker.mu.
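//
// With Dependencies.FailuresThreshold set to 3 (an illustrative value, not a
// default), the per-game sequence plays out as:
//
//	fail, fail, fail  -> probe_failed emitted, failurePublished set
//	fail              -> counter keeps rising, nothing re-emitted
//	success           -> probe_recovered emitted, counters reset
//	success           -> steady state, no event
//
// A success before the threshold is reached simply resets the counter
// without emitting anything.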
type probeState struct {
	consecutiveFailures int
	failurePublished    bool
}

// NewWorker constructs one Worker from deps.
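//
// A minimal wiring sketch (illustrative only; the store, publisher, telemetry
// value, and durations below are placeholders, not the production wiring):
//
//	worker, err := healthprobe.NewWorker(healthprobe.Dependencies{
//		RuntimeRecords:    recordStore,
//		HealthEvents:      eventPublisher,
//		HTTPClient:        http.DefaultClient,
//		Telemetry:         telemetryRuntime,
//		Interval:          30 * time.Second,
//		ProbeTimeout:      2 * time.Second,
//		FailuresThreshold: 3,
//	})
//	if err != nil {
//		return err
//	}
//	go func() { _ = worker.Run(ctx) }()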
func NewWorker(deps Dependencies) (*Worker, error) {
	switch {
	case deps.RuntimeRecords == nil:
		return nil, errors.New("new health probe worker: nil runtime records store")
	case deps.HealthEvents == nil:
		return nil, errors.New("new health probe worker: nil health events publisher")
	case deps.HTTPClient == nil:
		return nil, errors.New("new health probe worker: nil http client")
	case deps.Telemetry == nil:
		return nil, errors.New("new health probe worker: nil telemetry runtime")
	case deps.Interval <= 0:
		return nil, errors.New("new health probe worker: interval must be positive")
	case deps.ProbeTimeout <= 0:
		return nil, errors.New("new health probe worker: probe timeout must be positive")
	case deps.FailuresThreshold <= 0:
		return nil, errors.New("new health probe worker: failures threshold must be positive")
	}

	clock := deps.Clock
	if clock == nil {
		clock = time.Now
	}
	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}
	maxConcurrency := deps.MaxConcurrency
	if maxConcurrency <= 0 {
		maxConcurrency = defaultMaxConcurrency
	}

	return &Worker{
		runtimeRecords:    deps.RuntimeRecords,
		healthEvents:      deps.HealthEvents,
		httpClient:        deps.HTTPClient,
		telemetry:         deps.Telemetry,
		interval:          deps.Interval,
		probeTimeout:      deps.ProbeTimeout,
		failuresThreshold: deps.FailuresThreshold,
		maxConcurrency:    maxConcurrency,
		clock:             clock,
		logger:            logger.With("worker", "rtmanager.healthprobe"),
		states:            map[string]*probeState{},
	}, nil
}

// Run drives the probe loop until ctx is cancelled. Per-tick errors are
// absorbed; the loop only exits on context cancellation.
func (worker *Worker) Run(ctx context.Context) error {
	if worker == nil {
		return errors.New("run health probe worker: nil worker")
	}
	if ctx == nil {
		return errors.New("run health probe worker: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}

	worker.logger.Info("health probe worker started",
		"interval", worker.interval.String(),
		"probe_timeout", worker.probeTimeout.String(),
		"failures_threshold", worker.failuresThreshold,
		"max_concurrency", worker.maxConcurrency,
	)
	defer worker.logger.Info("health probe worker stopped")

	ticker := time.NewTicker(worker.interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			worker.tick(ctx)
		}
	}
}

// Shutdown is a no-op; Run terminates on context cancellation.
func (worker *Worker) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown health probe worker: nil context")
	}
	return nil
}

// Tick performs one probe pass. Exported so tests can drive the worker
// deterministically without spinning a real ticker.
func (worker *Worker) Tick(ctx context.Context) {
	worker.tick(ctx)
}

// tick performs one full pass: list running records, prune state for
// stopped games, then probe every running game in parallel.
func (worker *Worker) tick(ctx context.Context) {
	if err := ctx.Err(); err != nil {
		return
	}

	records, err := worker.runtimeRecords.ListByStatus(ctx, runtime.StatusRunning)
	if err != nil {
		worker.logger.WarnContext(ctx, "list running records",
			"err", err.Error(),
		)
		return
	}

	worker.pruneStates(records)

	if len(records) == 0 {
		return
	}

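	// Bounded fan-out: the buffered channel acts as a counting semaphore, so
	// at most maxConcurrency probes are in flight at once, and ctx.Done() is
	// also checked while waiting for a slot so cancellation is never stalled
	// by a full semaphore.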
	semaphore := make(chan struct{}, worker.maxConcurrency)
	var waitGroup sync.WaitGroup
	for _, record := range records {
		select {
		case <-ctx.Done():
			waitGroup.Wait()
			return
		case semaphore <- struct{}{}:
		}
		waitGroup.Add(1)
		go func(record runtime.RuntimeRecord) {
			defer waitGroup.Done()
			defer func() { <-semaphore }()
			worker.probeOne(ctx, record)
		}(record)
	}
	waitGroup.Wait()
}

// pruneStates removes per-game state for games no longer in the running
// list. Stopped or removed games therefore start with a clean counter
// the next time they re-enter `running`.
func (worker *Worker) pruneStates(records []runtime.RuntimeRecord) {
	worker.mu.Lock()
	defer worker.mu.Unlock()
	if len(worker.states) == 0 {
		return
	}
	running := make(map[string]struct{}, len(records))
	for _, record := range records {
		running[record.GameID] = struct{}{}
	}
	for gameID := range worker.states {
		if _, ok := running[gameID]; !ok {
			delete(worker.states, gameID)
		}
	}
}

// probeOne issues one `/healthz` request and updates hysteresis state.
func (worker *Worker) probeOne(ctx context.Context, record runtime.RuntimeRecord) {
	probeCtx, cancel := context.WithTimeout(ctx, worker.probeTimeout)
	defer cancel()

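	// EngineEndpoint is assumed to be a full base URL (for example
	// "http://10.0.0.7:8080", an illustrative address); trimming any trailing
	// slash keeps the join from producing "//healthz".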
	endpoint := strings.TrimRight(record.EngineEndpoint, "/") + healthzPath
	request, err := http.NewRequestWithContext(probeCtx, http.MethodGet, endpoint, nil)
	if err != nil {
		worker.recordFailure(ctx, record, 0, fmt.Errorf("build request: %w", err))
		return
	}

	response, err := worker.httpClient.Do(request)
	if err != nil {
		worker.recordFailure(ctx, record, 0, err)
		return
	}
	defer response.Body.Close()

	if response.StatusCode == http.StatusOK {
		worker.recordSuccess(ctx, record)
		return
	}
	worker.recordFailure(ctx, record, response.StatusCode, fmt.Errorf("unexpected status %d", response.StatusCode))
}

// recordSuccess updates state on a successful probe and emits
// `probe_recovered` when a `probe_failed` was previously published for
// the game.
func (worker *Worker) recordSuccess(ctx context.Context, record runtime.RuntimeRecord) {
	worker.mu.Lock()
	state, ok := worker.states[record.GameID]
	if !ok {
		worker.mu.Unlock()
		return
	}
	if !state.failurePublished {
		state.consecutiveFailures = 0
		worker.mu.Unlock()
		return
	}
	priorFailureCount := state.consecutiveFailures
	state.consecutiveFailures = 0
	state.failurePublished = false
	worker.mu.Unlock()

	worker.publish(ctx, ports.HealthEventEnvelope{
		GameID:      record.GameID,
		ContainerID: record.CurrentContainerID,
		EventType:   health.EventTypeProbeRecovered,
		OccurredAt:  worker.clock().UTC(),
		Details:     probeRecoveredDetails(priorFailureCount),
	})
}

// recordFailure updates state on a failed probe and emits
// `probe_failed` once the threshold is crossed.
func (worker *Worker) recordFailure(ctx context.Context, record runtime.RuntimeRecord, lastStatus int, lastErr error) {
	worker.mu.Lock()
	state, ok := worker.states[record.GameID]
	if !ok {
		state = &probeState{}
		worker.states[record.GameID] = state
	}
	state.consecutiveFailures++
	if state.failurePublished || state.consecutiveFailures < worker.failuresThreshold {
		count := state.consecutiveFailures
		worker.mu.Unlock()
		worker.logger.DebugContext(ctx, "probe failure",
			"game_id", record.GameID,
			"consecutive_failures", count,
			"threshold", worker.failuresThreshold,
			"err", errString(lastErr),
		)
		return
	}
	state.failurePublished = true
	count := state.consecutiveFailures
	worker.mu.Unlock()

	worker.publish(ctx, ports.HealthEventEnvelope{
		GameID:      record.GameID,
		ContainerID: record.CurrentContainerID,
		EventType:   health.EventTypeProbeFailed,
		OccurredAt:  worker.clock().UTC(),
		Details:     probeFailedDetails(count, lastStatus, errString(lastErr)),
	})
}

// publish emits one envelope through the configured publisher, updates
// the telemetry counter, and logs the outcome. Publish failures are
// logged and otherwise absorbed per
// `rtmanager/README.md §Notification Contracts`.
func (worker *Worker) publish(ctx context.Context, envelope ports.HealthEventEnvelope) {
	if err := worker.healthEvents.Publish(ctx, envelope); err != nil {
		worker.logger.ErrorContext(ctx, "publish health event",
			"game_id", envelope.GameID,
			"container_id", envelope.ContainerID,
			"event_type", string(envelope.EventType),
			"err", err.Error(),
		)
		return
	}

	worker.telemetry.RecordHealthEvent(ctx, string(envelope.EventType))

	logArgs := []any{
		"game_id", envelope.GameID,
		"container_id", envelope.ContainerID,
		"event_type", string(envelope.EventType),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	worker.logger.InfoContext(ctx, "probe event published", logArgs...)
}

// probeFailedDetails builds the JSON payload required by the
// `probe_failed` AsyncAPI variant.
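// For example (with illustrative values):
//
//	{"consecutive_failures":3,"last_status":503,"last_error":"unexpected status 503"}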
func probeFailedDetails(consecutiveFailures, lastStatus int, lastError string) json.RawMessage {
	payload := struct {
		ConsecutiveFailures int    `json:"consecutive_failures"`
		LastStatus          int    `json:"last_status"`
		LastError           string `json:"last_error"`
	}{
		ConsecutiveFailures: consecutiveFailures,
		LastStatus:          lastStatus,
		LastError:           lastError,
	}
	encoded, _ := json.Marshal(payload)
	return encoded
}

// probeRecoveredDetails builds the JSON payload required by the
// `probe_recovered` AsyncAPI variant.
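// For example (with an illustrative value): {"prior_failure_count":3}.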
func probeRecoveredDetails(priorFailureCount int) json.RawMessage {
	payload := struct {
		PriorFailureCount int `json:"prior_failure_count"`
	}{PriorFailureCount: priorFailureCount}
	encoded, _ := json.Marshal(payload)
	return encoded
}

// errString returns err.Error(), or the empty string for a nil error,
// so log and payload fields stay stable when no error is available.
func errString(err error) string {
	if err == nil {
		return ""
	}
	return err.Error()
}