feat: runtime manager
@@ -0,0 +1,411 @@
// Package healthprobe runs the active HTTP `/healthz` probe described in
// `rtmanager/README.md §Health Monitoring`.
//
// On every tick the worker lists `runtime_records.status=running`,
// probes each engine endpoint in parallel (capped at
// defaultMaxConcurrency), and applies the
// RTMANAGER_PROBE_FAILURES_THRESHOLD hysteresis to emit `probe_failed`
// (after N consecutive failures) and `probe_recovered` (on the first
// success after a `probe_failed` was published). In-memory state is
// pruned at the start of every tick against the freshly read running
// list, so a game that stops between ticks never accumulates stale
// failure counters.
//
// Design rationale is captured in
// `rtmanager/docs/workers.md`.
package healthprobe

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"log/slog"
	"net/http"
	"strings"
	"sync"
	"time"

	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/logging"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/telemetry"
)

// defaultMaxConcurrency caps the number of in-flight `/healthz`
// requests inside a single tick. RTM v1 is single-instance with a
// modest active-game count; parallel probing keeps a slow engine from
// delaying the rest of the cohort, while the cap prevents pathological
// fan-out if the running list grows.
const defaultMaxConcurrency = 16

// healthzPath is the engine probe path. Stable per
// `game/README.md §/healthz`.
const healthzPath = "/healthz"

// Dependencies groups the collaborators required by Worker.
type Dependencies struct {
	// RuntimeRecords lists running games on every tick.
	RuntimeRecords ports.RuntimeRecordStore

	// HealthEvents emits `probe_failed` and `probe_recovered`.
	HealthEvents ports.HealthEventPublisher

	// HTTPClient performs the engine `/healthz` request. Required.
	// Production wiring supplies an `otelhttp`-instrumented client.
	HTTPClient *http.Client

	// Telemetry records one health-event counter per emission.
	Telemetry *telemetry.Runtime

	// Interval bounds the tick period.
	Interval time.Duration

	// ProbeTimeout bounds one engine `/healthz` call.
	ProbeTimeout time.Duration

	// FailuresThreshold is the consecutive-failure count that promotes
	// the in-memory counter to a `probe_failed` emission.
	FailuresThreshold int

	// MaxConcurrency caps the number of in-flight probes per tick.
	// Defaults to defaultMaxConcurrency when zero or negative.
	MaxConcurrency int

	// Clock supplies the wall clock used for emission timestamps.
	// Defaults to `time.Now` when nil.
	Clock func() time.Time

	// Logger receives structured worker-level events. Defaults to
	// `slog.Default()` when nil.
	Logger *slog.Logger
}

// Worker drives the periodic active-probe loop.
type Worker struct {
	runtimeRecords ports.RuntimeRecordStore
	healthEvents   ports.HealthEventPublisher
	httpClient     *http.Client
	telemetry      *telemetry.Runtime

	interval          time.Duration
	probeTimeout      time.Duration
	failuresThreshold int
	maxConcurrency    int

	clock  func() time.Time
	logger *slog.Logger

	mu     sync.Mutex
	states map[string]*probeState
}

// probeState stores the per-game hysteresis counters. Owned by Worker
// and protected by Worker.mu.
type probeState struct {
	consecutiveFailures int
	failurePublished    bool
}
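
// As an illustration of the hysteresis (a FailuresThreshold of 3 is an
// example value here, not a default), one game's state evolves as:
//
//	fail, fail, fail -> probe_failed emitted, failurePublished = true
//	fail, ...        -> suppressed while failurePublished holds
//	ok               -> probe_recovered emitted, counters reset
//	ok               -> no further event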

// NewWorker constructs one Worker from deps.
func NewWorker(deps Dependencies) (*Worker, error) {
	switch {
	case deps.RuntimeRecords == nil:
		return nil, errors.New("new health probe worker: nil runtime records store")
	case deps.HealthEvents == nil:
		return nil, errors.New("new health probe worker: nil health events publisher")
	case deps.HTTPClient == nil:
		return nil, errors.New("new health probe worker: nil http client")
	case deps.Telemetry == nil:
		return nil, errors.New("new health probe worker: nil telemetry runtime")
	case deps.Interval <= 0:
		return nil, errors.New("new health probe worker: interval must be positive")
	case deps.ProbeTimeout <= 0:
		return nil, errors.New("new health probe worker: probe timeout must be positive")
	case deps.FailuresThreshold <= 0:
		return nil, errors.New("new health probe worker: failures threshold must be positive")
	}

	clock := deps.Clock
	if clock == nil {
		clock = time.Now
	}
	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}
	maxConcurrency := deps.MaxConcurrency
	if maxConcurrency <= 0 {
		maxConcurrency = defaultMaxConcurrency
	}

	return &Worker{
		runtimeRecords:    deps.RuntimeRecords,
		healthEvents:      deps.HealthEvents,
		httpClient:        deps.HTTPClient,
		telemetry:         deps.Telemetry,
		interval:          deps.Interval,
		probeTimeout:      deps.ProbeTimeout,
		failuresThreshold: deps.FailuresThreshold,
		maxConcurrency:    maxConcurrency,
		clock:             clock,
		logger:            logger.With("worker", "rtmanager.healthprobe"),
		states:            map[string]*probeState{},
	}, nil
}
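
// A minimal wiring sketch (the collaborator names and durations below
// are illustrative, not prescribed defaults):
//
//	worker, err := healthprobe.NewWorker(healthprobe.Dependencies{
//		RuntimeRecords:    recordStore,
//		HealthEvents:      eventPublisher,
//		HTTPClient:        &http.Client{},
//		Telemetry:         telemetryRuntime,
//		Interval:          15 * time.Second,
//		ProbeTimeout:      2 * time.Second,
//		FailuresThreshold: 3,
//	})
//	if err != nil {
//		return err
//	}
//	go func() { _ = worker.Run(ctx) }()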

// Run drives the probe loop until ctx is cancelled. Per-tick errors are
// absorbed; the loop only exits on context cancellation.
func (worker *Worker) Run(ctx context.Context) error {
	if worker == nil {
		return errors.New("run health probe worker: nil worker")
	}
	if ctx == nil {
		return errors.New("run health probe worker: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}

	worker.logger.Info("health probe worker started",
		"interval", worker.interval.String(),
		"probe_timeout", worker.probeTimeout.String(),
		"failures_threshold", worker.failuresThreshold,
		"max_concurrency", worker.maxConcurrency,
	)
	defer worker.logger.Info("health probe worker stopped")

	ticker := time.NewTicker(worker.interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			worker.tick(ctx)
		}
	}
}

// Shutdown is a no-op; Run terminates on context cancellation.
func (worker *Worker) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown health probe worker: nil context")
	}
	return nil
}

// Tick performs one probe pass. Exported so tests can drive the worker
// deterministically without spinning a real ticker.
func (worker *Worker) Tick(ctx context.Context) {
	worker.tick(ctx)
}

// tick performs one full pass: list running records, prune state for
// stopped games, then probe every running game in parallel.
func (worker *Worker) tick(ctx context.Context) {
	if err := ctx.Err(); err != nil {
		return
	}

	records, err := worker.runtimeRecords.ListByStatus(ctx, runtime.StatusRunning)
	if err != nil {
		worker.logger.WarnContext(ctx, "list running records",
			"err", err.Error(),
		)
		return
	}

	worker.pruneStates(records)

	if len(records) == 0 {
		return
	}
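
	// A slot must be acquired from the semaphore before each goroutine is
	// spawned, so at most maxConcurrency probes are ever in flight; on
	// cancellation mid-acquisition, wait for the probes already started,
	// then bail out.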
	semaphore := make(chan struct{}, worker.maxConcurrency)
	var waitGroup sync.WaitGroup
	for _, record := range records {
		select {
		case <-ctx.Done():
			waitGroup.Wait()
			return
		case semaphore <- struct{}{}:
		}
		waitGroup.Add(1)
		go func(record runtime.RuntimeRecord) {
			defer waitGroup.Done()
			defer func() { <-semaphore }()
			worker.probeOne(ctx, record)
		}(record)
	}
	waitGroup.Wait()
}

// pruneStates removes per-game state for games no longer in the running
// list. Stopped or removed games therefore start with a clean counter
// the next time they re-enter `running`.
func (worker *Worker) pruneStates(records []runtime.RuntimeRecord) {
	worker.mu.Lock()
	defer worker.mu.Unlock()
	if len(worker.states) == 0 {
		return
	}
	running := make(map[string]struct{}, len(records))
	for _, record := range records {
		running[record.GameID] = struct{}{}
	}
	for gameID := range worker.states {
		if _, ok := running[gameID]; !ok {
			delete(worker.states, gameID)
		}
	}
}

// probeOne issues one `/healthz` request and updates hysteresis state.
func (worker *Worker) probeOne(ctx context.Context, record runtime.RuntimeRecord) {
	probeCtx, cancel := context.WithTimeout(ctx, worker.probeTimeout)
	defer cancel()
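
	// TrimRight tolerates endpoints stored with a trailing slash, so the
	// joined URL never contains "//healthz".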
	endpoint := strings.TrimRight(record.EngineEndpoint, "/") + healthzPath
	request, err := http.NewRequestWithContext(probeCtx, http.MethodGet, endpoint, nil)
	if err != nil {
		worker.recordFailure(ctx, record, 0, fmt.Errorf("build request: %w", err))
		return
	}

	response, err := worker.httpClient.Do(request)
	if err != nil {
		worker.recordFailure(ctx, record, 0, err)
		return
	}
	defer response.Body.Close()

	if response.StatusCode == http.StatusOK {
		worker.recordSuccess(ctx, record)
		return
	}
	worker.recordFailure(ctx, record, response.StatusCode, fmt.Errorf("unexpected status %d", response.StatusCode))
}

// recordSuccess updates state on a successful probe and emits
// `probe_recovered` when the prior tick had crossed the failure
// threshold.
func (worker *Worker) recordSuccess(ctx context.Context, record runtime.RuntimeRecord) {
	worker.mu.Lock()
	state, ok := worker.states[record.GameID]
	if !ok {
		worker.mu.Unlock()
		return
	}
	if !state.failurePublished {
		state.consecutiveFailures = 0
		worker.mu.Unlock()
		return
	}
	priorFailureCount := state.consecutiveFailures
	state.consecutiveFailures = 0
	state.failurePublished = false
	worker.mu.Unlock()
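
	// Publish outside the lock: the publisher may block on network I/O
	// and must not serialize state updates for other games.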
	worker.publish(ctx, ports.HealthEventEnvelope{
		GameID:      record.GameID,
		ContainerID: record.CurrentContainerID,
		EventType:   health.EventTypeProbeRecovered,
		OccurredAt:  worker.clock().UTC(),
		Details:     probeRecoveredDetails(priorFailureCount),
	})
}

// recordFailure updates state on a failed probe and emits
// `probe_failed` once the threshold is crossed.
func (worker *Worker) recordFailure(ctx context.Context, record runtime.RuntimeRecord, lastStatus int, lastErr error) {
	worker.mu.Lock()
	state, ok := worker.states[record.GameID]
	if !ok {
		state = &probeState{}
		worker.states[record.GameID] = state
	}
	state.consecutiveFailures++
	if state.failurePublished || state.consecutiveFailures < worker.failuresThreshold {
		count := state.consecutiveFailures
		worker.mu.Unlock()
		worker.logger.DebugContext(ctx, "probe failure",
			"game_id", record.GameID,
			"consecutive_failures", count,
			"threshold", worker.failuresThreshold,
			"err", errString(lastErr),
		)
		return
	}
	state.failurePublished = true
	count := state.consecutiveFailures
	worker.mu.Unlock()

	worker.publish(ctx, ports.HealthEventEnvelope{
		GameID:      record.GameID,
		ContainerID: record.CurrentContainerID,
		EventType:   health.EventTypeProbeFailed,
		OccurredAt:  worker.clock().UTC(),
		Details:     probeFailedDetails(count, lastStatus, errString(lastErr)),
	})
}

// publish emits one envelope through the configured publisher, updates
// the telemetry counter, and logs the outcome. Publish failures are
// logged and absorbed per `rtmanager/README.md §Notification Contracts`.
func (worker *Worker) publish(ctx context.Context, envelope ports.HealthEventEnvelope) {
	if err := worker.healthEvents.Publish(ctx, envelope); err != nil {
		worker.logger.ErrorContext(ctx, "publish health event",
			"game_id", envelope.GameID,
			"container_id", envelope.ContainerID,
			"event_type", string(envelope.EventType),
			"err", err.Error(),
		)
		return
	}

	worker.telemetry.RecordHealthEvent(ctx, string(envelope.EventType))

	logArgs := []any{
		"game_id", envelope.GameID,
		"container_id", envelope.ContainerID,
		"event_type", string(envelope.EventType),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	worker.logger.InfoContext(ctx, "probe event published", logArgs...)
}

// probeFailedDetails builds the JSON payload required by the
// `probe_failed` AsyncAPI variant.
func probeFailedDetails(consecutiveFailures, lastStatus int, lastError string) json.RawMessage {
	payload := struct {
		ConsecutiveFailures int    `json:"consecutive_failures"`
		LastStatus          int    `json:"last_status"`
		LastError           string `json:"last_error"`
	}{
		ConsecutiveFailures: consecutiveFailures,
		LastStatus:          lastStatus,
		LastError:           lastError,
	}
	encoded, _ := json.Marshal(payload)
	return encoded
}
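
// As a worked example, the first emission with a threshold of 3 against
// an engine replying 503 encodes as:
//
//	{"consecutive_failures":3,"last_status":503,"last_error":"unexpected status 503"}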

// probeRecoveredDetails builds the JSON payload required by the
// `probe_recovered` AsyncAPI variant.
func probeRecoveredDetails(priorFailureCount int) json.RawMessage {
	payload := struct {
		PriorFailureCount int `json:"prior_failure_count"`
	}{PriorFailureCount: priorFailureCount}
	encoded, _ := json.Marshal(payload)
	return encoded
}
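
// A recovery after three consecutive failures, for instance, encodes as:
//
//	{"prior_failure_count":3}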

func errString(err error) string {
	if err == nil {
		return ""
	}
	return err.Error()
}
@@ -0,0 +1,417 @@
package healthprobe_test

import (
	"context"
	"encoding/json"
	"errors"
	"io"
	"log/slog"
	"net/http"
	"net/http/httptest"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/telemetry"
	"galaxy/rtmanager/internal/worker/healthprobe"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func silentLogger() *slog.Logger {
	return slog.New(slog.NewTextHandler(io.Discard, nil))
}

// fakeRuntimeRecords supports List/ListByStatus only; the worker does
// not call other methods.
type fakeRuntimeRecords struct {
	mu      sync.Mutex
	running []runtime.RuntimeRecord
	listErr error
}

func newFakeRuntimeRecords() *fakeRuntimeRecords { return &fakeRuntimeRecords{} }

func (s *fakeRuntimeRecords) Set(records ...runtime.RuntimeRecord) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.running = append([]runtime.RuntimeRecord(nil), records...)
}

func (s *fakeRuntimeRecords) Clear() {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.running = nil
}

func (s *fakeRuntimeRecords) Get(_ context.Context, _ string) (runtime.RuntimeRecord, error) {
	return runtime.RuntimeRecord{}, runtime.ErrNotFound
}
func (s *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error { return nil }
func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error {
	return nil
}
func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
	return nil, nil
}

func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, status runtime.Status) ([]runtime.RuntimeRecord, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.listErr != nil {
		return nil, s.listErr
	}
	if status != runtime.StatusRunning {
		return nil, nil
	}
	out := make([]runtime.RuntimeRecord, len(s.running))
	copy(out, s.running)
	return out, nil
}

// fakeHealthEvents captures every Publish call.
type fakeHealthEvents struct {
	mu         sync.Mutex
	published  []ports.HealthEventEnvelope
	publishErr error
}

func (s *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.publishErr != nil {
		return s.publishErr
	}
	s.published = append(s.published, envelope)
	return nil
}

func (s *fakeHealthEvents) Published() []ports.HealthEventEnvelope {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]ports.HealthEventEnvelope, len(s.published))
	copy(out, s.published)
	return out
}

// engineServer is a per-game HTTP fake controlled by tests.
type engineServer struct {
	server   *httptest.Server
	status   atomic.Int32
	requests atomic.Int32
}

func newEngineServer(t *testing.T) *engineServer {
	t.Helper()
	es := &engineServer{}
	es.status.Store(http.StatusOK)
	es.server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		es.requests.Add(1)
		w.WriteHeader(int(es.status.Load()))
	}))
	t.Cleanup(es.server.Close)
	return es
}

func (e *engineServer) URL() string { return e.server.URL }

func (e *engineServer) SetStatus(code int) { e.status.Store(int32(code)) }

func (e *engineServer) Stop() { e.server.Close() }

// --- harness ----------------------------------------------------------

type harness struct {
	records *fakeRuntimeRecords
	health  *fakeHealthEvents
	worker  *healthprobe.Worker
	now     time.Time
}

func newHarness(t *testing.T) *harness {
	t.Helper()
	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)

	records := newFakeRuntimeRecords()
	healthEvents := &fakeHealthEvents{}

	worker, err := healthprobe.NewWorker(healthprobe.Dependencies{
		RuntimeRecords:    records,
		HealthEvents:      healthEvents,
		HTTPClient:        &http.Client{},
		Telemetry:         telemetryRuntime,
		Interval:          50 * time.Millisecond,
		ProbeTimeout:      100 * time.Millisecond,
		FailuresThreshold: 3,
		MaxConcurrency:    4,
		Clock:             func() time.Time { return time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC) },
		Logger:            silentLogger(),
	})
	require.NoError(t, err)

	return &harness{
		records: records,
		health:  healthEvents,
		worker:  worker,
		now:     time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC),
	}
}

func runningRecord(gameID, endpoint string) runtime.RuntimeRecord {
	startedAt := time.Date(2026, 4, 27, 11, 0, 0, 0, time.UTC)
	return runtime.RuntimeRecord{
		GameID:             gameID,
		Status:             runtime.StatusRunning,
		CurrentContainerID: "ctr-" + gameID,
		CurrentImageRef:    "galaxy/game:1.0.0",
		EngineEndpoint:     endpoint,
		StatePath:          "/var/lib/galaxy/games/" + gameID,
		DockerNetwork:      "galaxy-net",
		StartedAt:          &startedAt,
		LastOpAt:           startedAt,
		CreatedAt:          startedAt,
	}
}

// --- constructor -------------------------------------------------------

func TestNewWorkerRejectsMissingDeps(t *testing.T) {
	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)

	base := healthprobe.Dependencies{
		RuntimeRecords:    newFakeRuntimeRecords(),
		HealthEvents:      &fakeHealthEvents{},
		HTTPClient:        &http.Client{},
		Telemetry:         telemetryRuntime,
		Interval:          time.Second,
		ProbeTimeout:      time.Second,
		FailuresThreshold: 1,
	}

	defectives := []healthprobe.Dependencies{
		{},
		{RuntimeRecords: base.RuntimeRecords},
		{RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents},
		{RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents, HTTPClient: base.HTTPClient},
		{RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents, HTTPClient: base.HTTPClient, Telemetry: base.Telemetry},
		{RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents, HTTPClient: base.HTTPClient, Telemetry: base.Telemetry, Interval: time.Second},
		{RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents, HTTPClient: base.HTTPClient, Telemetry: base.Telemetry, Interval: time.Second, ProbeTimeout: time.Second},
	}
	for index, deps := range defectives {
		_, err := healthprobe.NewWorker(deps)
		require.Errorf(t, err, "case %d should fail", index)
	}

	_, err = healthprobe.NewWorker(base)
	require.NoError(t, err)
}

// --- behaviour --------------------------------------------------------

func TestTickHealthyDoesNotEmit(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)

	h.records.Set(runningRecord("game-a", engine.URL()))
	h.worker.Tick(context.Background())

	assert.Empty(t, h.health.Published(), "successful probe must not emit events")
	assert.Equal(t, int32(1), engine.requests.Load(), "exactly one probe request")
}

func TestTickFailureBelowThresholdDoesNotEmit(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusServiceUnavailable)

	h.records.Set(runningRecord("game-a", engine.URL()))
	h.worker.Tick(context.Background())
	h.worker.Tick(context.Background())

	assert.Empty(t, h.health.Published(), "two failures below threshold must not emit")
}

func TestTickFailuresCrossingThresholdEmitProbeFailedOnce(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)

	h.records.Set(runningRecord("game-a", engine.URL()))

	for range 5 {
		h.worker.Tick(context.Background())
	}

	envelopes := h.health.Published()
	require.Len(t, envelopes, 1, "probe_failed must publish exactly once across many failures")
	envelope := envelopes[0]
	assert.Equal(t, health.EventTypeProbeFailed, envelope.EventType)
	assert.Equal(t, "game-a", envelope.GameID)
	assert.Equal(t, "ctr-game-a", envelope.ContainerID)

	var details struct {
		ConsecutiveFailures int    `json:"consecutive_failures"`
		LastStatus          int    `json:"last_status"`
		LastError           string `json:"last_error"`
	}
	require.NoError(t, json.Unmarshal(envelope.Details, &details))
	assert.Equal(t, 3, details.ConsecutiveFailures, "consecutive_failures equals threshold at first emission")
	assert.Equal(t, http.StatusInternalServerError, details.LastStatus)
	assert.NotEmpty(t, details.LastError)
}

func TestTickRecoveryEmitsProbeRecoveredWithPriorFailureCount(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)

	h.records.Set(runningRecord("game-a", engine.URL()))

	for range 3 {
		h.worker.Tick(context.Background())
	}
	require.Len(t, h.health.Published(), 1, "expect probe_failed after threshold")

	engine.SetStatus(http.StatusOK)
	h.worker.Tick(context.Background())

	envelopes := h.health.Published()
	require.Len(t, envelopes, 2, "recovery must emit exactly one probe_recovered")
	envelope := envelopes[1]
	assert.Equal(t, health.EventTypeProbeRecovered, envelope.EventType)

	var details struct {
		PriorFailureCount int `json:"prior_failure_count"`
	}
	require.NoError(t, json.Unmarshal(envelope.Details, &details))
	assert.Equal(t, 3, details.PriorFailureCount)
}

func TestTickFlappingDoesNotDoublePublishProbeFailed(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)

	h.records.Set(runningRecord("game-a", engine.URL()))
	for range 5 {
		h.worker.Tick(context.Background())
	}
	require.Len(t, h.health.Published(), 1)

	// New failure after probe_failed has been published: must not emit again.
	h.worker.Tick(context.Background())
	assert.Len(t, h.health.Published(), 1, "no new probe_failed while already in failed state")
}

func TestTickPrunesStateForGamesNoLongerRunning(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)

	h.records.Set(runningRecord("game-a", engine.URL()))
	for range 3 {
		h.worker.Tick(context.Background())
	}
	require.Len(t, h.health.Published(), 1, "probe_failed published before stop")

	// Game leaves running; state must be pruned.
	h.records.Clear()
	h.worker.Tick(context.Background())

	// Re-introduce the same game: the counter starts fresh, so new
	// failures must accumulate from zero before another probe_failed fires.
	h.records.Set(runningRecord("game-a", engine.URL()))
	h.worker.Tick(context.Background())
	h.worker.Tick(context.Background())
	assert.Len(t, h.health.Published(), 1, "fresh state must require threshold failures again")

	h.worker.Tick(context.Background())
	assert.Len(t, h.health.Published(), 2, "third fresh failure crosses threshold")
}

func TestTickProbesMultipleGamesConcurrently(t *testing.T) {
	h := newHarness(t)

	// Two slow engines that simulate noticeable latency. Sequential
	// execution would take 2*latency; parallel finishes near 1*latency.
	const latency = 80 * time.Millisecond
	makeSlowEngine := func() *httptest.Server {
		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
			time.Sleep(latency)
			w.WriteHeader(http.StatusOK)
		}))
		t.Cleanup(server.Close)
		return server
	}
	a := makeSlowEngine()
	b := makeSlowEngine()

	h.records.Set(
		runningRecord("game-a", a.URL),
		runningRecord("game-b", b.URL),
	)

	start := time.Now()
	h.worker.Tick(context.Background())
	elapsed := time.Since(start)

	assert.Less(t, elapsed, 2*latency, "probes must run concurrently, not sequentially")
}

func TestTickAbsorbsListError(t *testing.T) {
	h := newHarness(t)
	h.records.listErr = errors.New("pg down")

	require.NotPanics(t, func() { h.worker.Tick(context.Background()) })
	assert.Empty(t, h.health.Published())
}

func TestTickAbsorbsPublishError(t *testing.T) {
	h := newHarness(t)
	h.health.publishErr = errors.New("redis down")
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)

	h.records.Set(runningRecord("game-a", engine.URL()))
	for range 3 {
		h.worker.Tick(context.Background())
	}
	// publishErr means nothing accumulated; the worker must not panic
	// or change state in surprising ways.
	assert.Empty(t, h.health.Published())
}

func TestRunRespectsContextCancel(t *testing.T) {
	h := newHarness(t)

	ctx, cancel := context.WithCancel(context.Background())
	done := make(chan error, 1)
	go func() { done <- h.worker.Run(ctx) }()

	cancel()
	select {
	case err := <-done:
		assert.ErrorIs(t, err, context.Canceled)
	case <-time.After(time.Second):
		t.Fatalf("Run did not exit after cancel")
	}
}

func TestShutdownIsNoOp(t *testing.T) {
	h := newHarness(t)
	require.NoError(t, h.worker.Shutdown(context.Background()))
}

// --- compile-time safety ----------------------------------------------

var (
	_ ports.RuntimeRecordStore   = (*fakeRuntimeRecords)(nil)
	_ ports.HealthEventPublisher = (*fakeHealthEvents)(nil)
)