// Package healthprobe runs the active HTTP `/healthz` probe described in
// `rtmanager/README.md §Health Monitoring`.
//
// On every tick the worker lists `runtime_records.status=running`,
// probes each engine endpoint in parallel (capped at
// defaultMaxConcurrency), and applies the
// RTMANAGER_PROBE_FAILURES_THRESHOLD hysteresis to emit `probe_failed`
// (after N consecutive failures) and `probe_recovered` (on the first
// success after a `probe_failed` was published). In-memory state is
// pruned at the start of every tick against the freshly-read running
// list, so a game that stops between ticks never accumulates stale
// failure counters.
//
// Design rationale is captured in
// `rtmanager/docs/workers.md`.
package healthprobe

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"log/slog"
	"net/http"
	"strings"
	"sync"
	"time"

	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/logging"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/telemetry"
)

// defaultMaxConcurrency caps the number of in-flight `/healthz`
// requests inside a single tick. RTM v1 is single-instance with a
// modest active-game count; the cap keeps a slow engine from delaying
// the rest of the cohort while preventing pathological fan-out if the
// running list grows.
const defaultMaxConcurrency = 16

// healthzPath is the engine probe path. Stable per
// `game/README.md §/healthz`.
const healthzPath = "/healthz"

// Dependencies groups the collaborators required by Worker.
type Dependencies struct {
	// RuntimeRecords lists running games on every tick.
	RuntimeRecords ports.RuntimeRecordStore

	// HealthEvents emits `probe_failed` and `probe_recovered`.
	HealthEvents ports.HealthEventPublisher

	// HTTPClient performs the engine `/healthz` request. Required.
	// Production wiring supplies an `otelhttp`-instrumented client.
	HTTPClient *http.Client

	// Telemetry records one health-event counter per emission.
	Telemetry *telemetry.Runtime

	// Interval bounds the tick period.
	Interval time.Duration

	// ProbeTimeout bounds one engine `/healthz` call.
	ProbeTimeout time.Duration

	// FailuresThreshold is the consecutive-failure count that promotes
	// the in-memory counter to a `probe_failed` emission.
	FailuresThreshold int

	// MaxConcurrency caps the number of in-flight probes per tick.
	// Defaults to defaultMaxConcurrency when zero or negative.
	MaxConcurrency int

	// Clock supplies the wall-clock used for emission timestamps.
	// Defaults to `time.Now` when nil.
	Clock func() time.Time

	// Logger receives structured worker-level events. Defaults to
	// `slog.Default()` when nil.
	Logger *slog.Logger
}

// Worker drives the periodic active-probe loop.
type Worker struct {
	runtimeRecords ports.RuntimeRecordStore
	healthEvents   ports.HealthEventPublisher
	httpClient     *http.Client
	telemetry      *telemetry.Runtime

	interval          time.Duration
	probeTimeout      time.Duration
	failuresThreshold int
	maxConcurrency    int

	clock  func() time.Time
	logger *slog.Logger

	mu     sync.Mutex
	states map[string]*probeState
}

// probeState stores the per-game hysteresis counters. Owned by Worker
// and protected by Worker.mu.
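//
// With Dependencies.FailuresThreshold set to 3 (an illustrative value, not a
// default), the per-game sequence plays out as:
//
//	fail, fail, fail  -> probe_failed emitted, failurePublished set
//	fail              -> counter keeps rising, nothing re-emitted
//	success           -> probe_recovered emitted, counters reset
//	success           -> steady state, no event
//
// A success before the threshold is reached simply resets the counter
// without emitting anything.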
type probeState struct {
	consecutiveFailures int
	failurePublished    bool
}

// NewWorker constructs one Worker from deps.
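//
// A minimal wiring sketch (illustrative only; the store, publisher, telemetry
// value, and durations below are placeholders, not the production wiring):
//
//	worker, err := healthprobe.NewWorker(healthprobe.Dependencies{
//		RuntimeRecords:    recordStore,
//		HealthEvents:      eventPublisher,
//		HTTPClient:        http.DefaultClient,
//		Telemetry:         telemetryRuntime,
//		Interval:          30 * time.Second,
//		ProbeTimeout:      2 * time.Second,
//		FailuresThreshold: 3,
//	})
//	if err != nil {
//		return err
//	}
//	go func() { _ = worker.Run(ctx) }()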
func NewWorker(deps Dependencies) (*Worker, error) {
	switch {
	case deps.RuntimeRecords == nil:
		return nil, errors.New("new health probe worker: nil runtime records store")
	case deps.HealthEvents == nil:
		return nil, errors.New("new health probe worker: nil health events publisher")
	case deps.HTTPClient == nil:
		return nil, errors.New("new health probe worker: nil http client")
	case deps.Telemetry == nil:
		return nil, errors.New("new health probe worker: nil telemetry runtime")
	case deps.Interval <= 0:
		return nil, errors.New("new health probe worker: interval must be positive")
	case deps.ProbeTimeout <= 0:
		return nil, errors.New("new health probe worker: probe timeout must be positive")
	case deps.FailuresThreshold <= 0:
		return nil, errors.New("new health probe worker: failures threshold must be positive")
	}

	clock := deps.Clock
	if clock == nil {
		clock = time.Now
	}
	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}
	maxConcurrency := deps.MaxConcurrency
	if maxConcurrency <= 0 {
		maxConcurrency = defaultMaxConcurrency
	}

	return &Worker{
		runtimeRecords:    deps.RuntimeRecords,
		healthEvents:      deps.HealthEvents,
		httpClient:        deps.HTTPClient,
		telemetry:         deps.Telemetry,
		interval:          deps.Interval,
		probeTimeout:      deps.ProbeTimeout,
		failuresThreshold: deps.FailuresThreshold,
		maxConcurrency:    maxConcurrency,
		clock:             clock,
		logger:            logger.With("worker", "rtmanager.healthprobe"),
		states:            map[string]*probeState{},
	}, nil
}

// Run drives the probe loop until ctx is cancelled. Per-tick errors are
// absorbed; the loop only exits on context cancellation.
func (worker *Worker) Run(ctx context.Context) error {
	if worker == nil {
		return errors.New("run health probe worker: nil worker")
	}
	if ctx == nil {
		return errors.New("run health probe worker: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}

	worker.logger.Info("health probe worker started",
		"interval", worker.interval.String(),
		"probe_timeout", worker.probeTimeout.String(),
		"failures_threshold", worker.failuresThreshold,
		"max_concurrency", worker.maxConcurrency,
	)
	defer worker.logger.Info("health probe worker stopped")

	ticker := time.NewTicker(worker.interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			worker.tick(ctx)
		}
	}
}

// Shutdown is a no-op; Run terminates on context cancellation.
func (worker *Worker) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown health probe worker: nil context")
	}
	return nil
}

// Tick performs one probe pass. Exported so tests can drive the worker
// deterministically without spinning a real ticker.
func (worker *Worker) Tick(ctx context.Context) {
	worker.tick(ctx)
}

// tick performs one full pass: list running records, prune state for
// stopped games, then probe every running game in parallel.
func (worker *Worker) tick(ctx context.Context) {
	if err := ctx.Err(); err != nil {
		return
	}

	records, err := worker.runtimeRecords.ListByStatus(ctx, runtime.StatusRunning)
	if err != nil {
		worker.logger.WarnContext(ctx, "list running records",
			"err", err.Error(),
		)
		return
	}

	worker.pruneStates(records)

	if len(records) == 0 {
		return
	}

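	// Bounded fan-out: the buffered channel acts as a counting semaphore, so
	// at most maxConcurrency probes are in flight at once, and ctx.Done() is
	// also checked while waiting for a slot so cancellation is never stalled
	// by a full semaphore.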
	semaphore := make(chan struct{}, worker.maxConcurrency)
	var waitGroup sync.WaitGroup
	for _, record := range records {
		select {
		case <-ctx.Done():
			waitGroup.Wait()
			return
		case semaphore <- struct{}{}:
		}
		waitGroup.Add(1)
		go func(record runtime.RuntimeRecord) {
			defer waitGroup.Done()
			defer func() { <-semaphore }()
			worker.probeOne(ctx, record)
		}(record)
	}
	waitGroup.Wait()
}

// pruneStates removes per-game state for games no longer in the running
// list. Stopped or removed games therefore start with a clean counter
// the next time they re-enter `running`.
func (worker *Worker) pruneStates(records []runtime.RuntimeRecord) {
	worker.mu.Lock()
	defer worker.mu.Unlock()
	if len(worker.states) == 0 {
		return
	}
	running := make(map[string]struct{}, len(records))
	for _, record := range records {
		running[record.GameID] = struct{}{}
	}
	for gameID := range worker.states {
		if _, ok := running[gameID]; !ok {
			delete(worker.states, gameID)
		}
	}
}

// probeOne issues one `/healthz` request and updates hysteresis state.
func (worker *Worker) probeOne(ctx context.Context, record runtime.RuntimeRecord) {
	probeCtx, cancel := context.WithTimeout(ctx, worker.probeTimeout)
	defer cancel()

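	// EngineEndpoint is assumed to be a full base URL (for example
	// "http://10.0.0.7:8080", an illustrative address); trimming any trailing
	// slash keeps the join from producing "//healthz".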
	endpoint := strings.TrimRight(record.EngineEndpoint, "/") + healthzPath
	request, err := http.NewRequestWithContext(probeCtx, http.MethodGet, endpoint, nil)
	if err != nil {
		worker.recordFailure(ctx, record, 0, fmt.Errorf("build request: %w", err))
		return
	}

	response, err := worker.httpClient.Do(request)
	if err != nil {
		worker.recordFailure(ctx, record, 0, err)
		return
	}
	defer response.Body.Close()

	if response.StatusCode == http.StatusOK {
		worker.recordSuccess(ctx, record)
		return
	}
	worker.recordFailure(ctx, record, response.StatusCode, fmt.Errorf("unexpected status %d", response.StatusCode))
}

// recordSuccess updates state on a successful probe and emits
// `probe_recovered` when a `probe_failed` was previously published for
// the game.
func (worker *Worker) recordSuccess(ctx context.Context, record runtime.RuntimeRecord) {
	worker.mu.Lock()
	state, ok := worker.states[record.GameID]
	if !ok {
		worker.mu.Unlock()
		return
	}
	if !state.failurePublished {
		state.consecutiveFailures = 0
		worker.mu.Unlock()
		return
	}
	priorFailureCount := state.consecutiveFailures
	state.consecutiveFailures = 0
	state.failurePublished = false
	worker.mu.Unlock()

	worker.publish(ctx, ports.HealthEventEnvelope{
		GameID:      record.GameID,
		ContainerID: record.CurrentContainerID,
		EventType:   health.EventTypeProbeRecovered,
		OccurredAt:  worker.clock().UTC(),
		Details:     probeRecoveredDetails(priorFailureCount),
	})
}

// recordFailure updates state on a failed probe and emits
// `probe_failed` once the threshold is crossed.
func (worker *Worker) recordFailure(ctx context.Context, record runtime.RuntimeRecord, lastStatus int, lastErr error) {
	worker.mu.Lock()
	state, ok := worker.states[record.GameID]
	if !ok {
		state = &probeState{}
		worker.states[record.GameID] = state
	}
	state.consecutiveFailures++
	if state.failurePublished || state.consecutiveFailures < worker.failuresThreshold {
		count := state.consecutiveFailures
		worker.mu.Unlock()
		worker.logger.DebugContext(ctx, "probe failure",
			"game_id", record.GameID,
			"consecutive_failures", count,
			"threshold", worker.failuresThreshold,
			"err", errString(lastErr),
		)
		return
	}
	state.failurePublished = true
	count := state.consecutiveFailures
	worker.mu.Unlock()

	worker.publish(ctx, ports.HealthEventEnvelope{
		GameID:      record.GameID,
		ContainerID: record.CurrentContainerID,
		EventType:   health.EventTypeProbeFailed,
		OccurredAt:  worker.clock().UTC(),
		Details:     probeFailedDetails(count, lastStatus, errString(lastErr)),
	})
}

// publish emits one envelope through the configured publisher, updates
// the telemetry counter, and logs the outcome. Publish failures are
// logged and otherwise absorbed per
// `rtmanager/README.md §Notification Contracts`.
func (worker *Worker) publish(ctx context.Context, envelope ports.HealthEventEnvelope) {
	if err := worker.healthEvents.Publish(ctx, envelope); err != nil {
		worker.logger.ErrorContext(ctx, "publish health event",
			"game_id", envelope.GameID,
			"container_id", envelope.ContainerID,
			"event_type", string(envelope.EventType),
			"err", err.Error(),
		)
		return
	}

	worker.telemetry.RecordHealthEvent(ctx, string(envelope.EventType))

	logArgs := []any{
		"game_id", envelope.GameID,
		"container_id", envelope.ContainerID,
		"event_type", string(envelope.EventType),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	worker.logger.InfoContext(ctx, "probe event published", logArgs...)
}

// probeFailedDetails builds the JSON payload required by the
// `probe_failed` AsyncAPI variant.
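// For example (with illustrative values):
//
//	{"consecutive_failures":3,"last_status":503,"last_error":"unexpected status 503"}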
func probeFailedDetails(consecutiveFailures, lastStatus int, lastError string) json.RawMessage {
	payload := struct {
		ConsecutiveFailures int    `json:"consecutive_failures"`
		LastStatus          int    `json:"last_status"`
		LastError           string `json:"last_error"`
	}{
		ConsecutiveFailures: consecutiveFailures,
		LastStatus:          lastStatus,
		LastError:           lastError,
	}
	encoded, _ := json.Marshal(payload)
	return encoded
}

// probeRecoveredDetails builds the JSON payload required by the
// `probe_recovered` AsyncAPI variant.
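// For example (with an illustrative value): {"prior_failure_count":3}.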
func probeRecoveredDetails(priorFailureCount int) json.RawMessage {
	payload := struct {
		PriorFailureCount int `json:"prior_failure_count"`
	}{PriorFailureCount: priorFailureCount}
	encoded, _ := json.Marshal(payload)
	return encoded
}

// errString returns err.Error(), or the empty string for a nil error,
// so log and payload fields stay stable when no error is available.
func errString(err error) string {
	if err == nil {
		return ""
	}
	return err.Error()
}