feat: runtime manager

This commit is contained in:
Ilia Denisov
2026-04-28 20:39:18 +02:00
committed by GitHub
parent e0a99b346b
commit a7cee15115
289 changed files with 45660 additions and 2207 deletions
@@ -0,0 +1,411 @@
// Package healthprobe runs the active HTTP `/healthz` probe described in
// `rtmanager/README.md §Health Monitoring`.
//
// On every tick the worker lists `runtime_records.status=running`,
// probes each engine endpoint in parallel (capped at
// defaultMaxConcurrency), and applies the
// RTMANAGER_PROBE_FAILURES_THRESHOLD hysteresis to emit `probe_failed`
// (after N consecutive failures) and `probe_recovered` (on the first
// success after a `probe_failed` was published). In-memory state is
// pruned at the start of every tick against the freshly-read running
// list, so a game that stops between ticks never accumulates stale
// failure counters.
//
// Design rationale is captured in
// `rtmanager/docs/workers.md`.
package healthprobe
import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"log/slog"
	"net/http"
	"strings"
	"sync"
	"time"

	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/logging"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/telemetry"
)
// defaultMaxConcurrency caps the number of in-flight `/healthz`
// requests inside a single tick. RTM v1 is single-instance with a
// modest active-game count; the cap keeps a slow engine from delaying
// the rest of the cohort while preventing pathological fan-out if the
// running list grows.
//
// Used as the fallback when Dependencies.MaxConcurrency is zero or
// negative.
const defaultMaxConcurrency = 16

// healthzPath is the engine probe path. Stable per
// `game/README.md §/healthz`. Appended to the record's engine endpoint
// (trailing slashes trimmed) to form the probe URL.
const healthzPath = "/healthz"
// Dependencies groups the collaborators required by Worker. Required
// fields and the positivity of the durations/threshold are validated
// by NewWorker; optional fields get defaults there.
type Dependencies struct {
	// RuntimeRecords lists running games on every tick. Required.
	RuntimeRecords ports.RuntimeRecordStore
	// HealthEvents emits `probe_failed` and `probe_recovered`. Required.
	HealthEvents ports.HealthEventPublisher
	// HTTPClient performs the engine `/healthz` request. Required.
	// Production wiring supplies an `otelhttp`-instrumented client.
	HTTPClient *http.Client
	// Telemetry records one health-event counter per emission. Required.
	Telemetry *telemetry.Runtime
	// Interval bounds the tick period. Must be positive.
	Interval time.Duration
	// ProbeTimeout bounds one engine `/healthz` call. Must be positive.
	ProbeTimeout time.Duration
	// FailuresThreshold is the consecutive-failure count that promotes
	// the in-memory counter to a `probe_failed` emission. Must be
	// positive.
	FailuresThreshold int
	// MaxConcurrency caps the number of in-flight probes per tick.
	// Defaults to defaultMaxConcurrency when zero or negative.
	MaxConcurrency int
	// Clock supplies the wall-clock used for emission timestamps.
	// Defaults to `time.Now` when nil.
	Clock func() time.Time
	// Logger receives structured worker-level events. Defaults to
	// `slog.Default()` when nil.
	Logger *slog.Logger
}
// Worker drives the periodic active-probe loop. Construct with
// NewWorker; the zero value is not usable.
type Worker struct {
	runtimeRecords    ports.RuntimeRecordStore
	healthEvents      ports.HealthEventPublisher
	httpClient        *http.Client
	telemetry         *telemetry.Runtime
	interval          time.Duration
	probeTimeout      time.Duration
	failuresThreshold int
	maxConcurrency    int
	clock             func() time.Time
	logger            *slog.Logger

	// mu guards states: probe goroutines launched by one tick update
	// it concurrently via recordSuccess/recordFailure.
	mu     sync.Mutex
	states map[string]*probeState
}

// probeState stores the per-game hysteresis counters. Owned by Worker
// and protected by Worker.mu.
type probeState struct {
	// consecutiveFailures counts back-to-back failed probes; reset to
	// zero by recordSuccess.
	consecutiveFailures int
	// failurePublished is true once `probe_failed` has been emitted
	// for the current failure streak; cleared on recovery.
	failurePublished bool
}
// NewWorker validates deps and builds a ready-to-run Worker. Optional
// fields (Clock, Logger, MaxConcurrency) receive their documented
// defaults here so the rest of the worker never re-checks them.
func NewWorker(deps Dependencies) (*Worker, error) {
	if deps.RuntimeRecords == nil {
		return nil, errors.New("new health probe worker: nil runtime records store")
	}
	if deps.HealthEvents == nil {
		return nil, errors.New("new health probe worker: nil health events publisher")
	}
	if deps.HTTPClient == nil {
		return nil, errors.New("new health probe worker: nil http client")
	}
	if deps.Telemetry == nil {
		return nil, errors.New("new health probe worker: nil telemetry runtime")
	}
	if deps.Interval <= 0 {
		return nil, errors.New("new health probe worker: interval must be positive")
	}
	if deps.ProbeTimeout <= 0 {
		return nil, errors.New("new health probe worker: probe timeout must be positive")
	}
	if deps.FailuresThreshold <= 0 {
		return nil, errors.New("new health probe worker: failures threshold must be positive")
	}
	wallClock := deps.Clock
	if wallClock == nil {
		wallClock = time.Now
	}
	baseLogger := deps.Logger
	if baseLogger == nil {
		baseLogger = slog.Default()
	}
	concurrency := deps.MaxConcurrency
	if concurrency <= 0 {
		concurrency = defaultMaxConcurrency
	}
	return &Worker{
		runtimeRecords:    deps.RuntimeRecords,
		healthEvents:      deps.HealthEvents,
		httpClient:        deps.HTTPClient,
		telemetry:         deps.Telemetry,
		interval:          deps.Interval,
		probeTimeout:      deps.ProbeTimeout,
		failuresThreshold: deps.FailuresThreshold,
		maxConcurrency:    concurrency,
		clock:             wallClock,
		logger:            baseLogger.With("worker", "rtmanager.healthprobe"),
		states:            map[string]*probeState{},
	}, nil
}
// Run drives the probe loop until ctx is cancelled. Per-tick errors
// are absorbed inside tick; the only exits are the guard failures
// below and context cancellation.
func (worker *Worker) Run(ctx context.Context) error {
	switch {
	case worker == nil:
		return errors.New("run health probe worker: nil worker")
	case ctx == nil:
		return errors.New("run health probe worker: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}
	worker.logger.Info("health probe worker started",
		"interval", worker.interval.String(),
		"probe_timeout", worker.probeTimeout.String(),
		"failures_threshold", worker.failuresThreshold,
		"max_concurrency", worker.maxConcurrency,
	)
	defer worker.logger.Info("health probe worker stopped")
	ticks := time.NewTicker(worker.interval)
	defer ticks.Stop()
	for {
		select {
		case <-ticks.C:
			worker.tick(ctx)
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}
// Shutdown is a no-op; Run terminates on context cancellation. The
// nil-context guard mirrors Run's validation for interface symmetry.
func (worker *Worker) Shutdown(ctx context.Context) error {
	if ctx != nil {
		return nil
	}
	return errors.New("shutdown health probe worker: nil context")
}
// Tick performs one probe pass. Exported so tests can drive the worker
// deterministically without spinning a real ticker.
//
// Mirrors Run's defensive guards: a nil receiver or nil context is a
// no-op here instead of a nil-dereference panic inside tick (tick
// immediately calls ctx.Err() and reads worker fields).
func (worker *Worker) Tick(ctx context.Context) {
	if worker == nil || ctx == nil {
		return
	}
	worker.tick(ctx)
}
// tick performs one full pass: list running records, prune state for
// stopped games, then probe every running game in parallel.
//
// Ordering matters here:
//   - pruneStates runs against the freshly-listed running set before
//     any probe fires, so stale counters never survive a stop that
//     happened between ticks.
//   - a semaphore slot is acquired before waitGroup.Add, so a context
//     cancellation during acquisition only has to wait for goroutines
//     that were actually launched.
func (worker *Worker) tick(ctx context.Context) {
	if err := ctx.Err(); err != nil {
		return
	}
	records, err := worker.runtimeRecords.ListByStatus(ctx, runtime.StatusRunning)
	if err != nil {
		// Per-tick errors are absorbed; the next tick retries.
		worker.logger.WarnContext(ctx, "list running records",
			"err", err.Error(),
		)
		return
	}
	worker.pruneStates(records)
	if len(records) == 0 {
		return
	}
	// Bounded fan-out: at most maxConcurrency probes in flight at once.
	semaphore := make(chan struct{}, worker.maxConcurrency)
	var waitGroup sync.WaitGroup
	for _, record := range records {
		select {
		case <-ctx.Done():
			// Drain in-flight probes before returning so no goroutine
			// outlives the tick.
			waitGroup.Wait()
			return
		case semaphore <- struct{}{}:
		}
		waitGroup.Add(1)
		go func(record runtime.RuntimeRecord) {
			defer waitGroup.Done()
			defer func() { <-semaphore }()
			worker.probeOne(ctx, record)
		}(record)
	}
	waitGroup.Wait()
}
// pruneStates drops hysteresis state for any game absent from the
// current running list, so a game that later re-enters `running`
// starts from a zero failure counter.
func (worker *Worker) pruneStates(records []runtime.RuntimeRecord) {
	worker.mu.Lock()
	defer worker.mu.Unlock()
	if len(worker.states) == 0 {
		return
	}
	alive := make(map[string]struct{}, len(records))
	for index := range records {
		alive[records[index].GameID] = struct{}{}
	}
	for gameID := range worker.states {
		if _, stillRunning := alive[gameID]; stillRunning {
			continue
		}
		delete(worker.states, gameID)
	}
}
// probeOne issues one `/healthz` request and updates hysteresis state.
//
// The probe shares the tick context but is additionally bounded by
// probeTimeout, so one stuck engine cannot hold its semaphore slot
// past the budget. A request-build error, transport error, or any
// non-200 status counts as a failure.
func (worker *Worker) probeOne(ctx context.Context, record runtime.RuntimeRecord) {
	probeCtx, cancel := context.WithTimeout(ctx, worker.probeTimeout)
	defer cancel()
	endpoint := strings.TrimRight(record.EngineEndpoint, "/") + healthzPath
	request, err := http.NewRequestWithContext(probeCtx, http.MethodGet, endpoint, nil)
	if err != nil {
		worker.recordFailure(ctx, record, 0, fmt.Errorf("build request: %w", err))
		return
	}
	response, err := worker.httpClient.Do(request)
	if err != nil {
		worker.recordFailure(ctx, record, 0, err)
		return
	}
	defer response.Body.Close()
	// Drain the body before close so the transport can reuse the
	// keep-alive connection; cap the read in case an engine misbehaves.
	_, _ = io.Copy(io.Discard, io.LimitReader(response.Body, 4096))
	if response.StatusCode == http.StatusOK {
		worker.recordSuccess(ctx, record)
		return
	}
	worker.recordFailure(ctx, record, response.StatusCode, fmt.Errorf("unexpected status %d", response.StatusCode))
}
// recordSuccess resets the failure counter on a healthy probe and, if
// a `probe_failed` had been published for the streak, emits
// `probe_recovered` carrying the failure count that preceded recovery.
func (worker *Worker) recordSuccess(ctx context.Context, record runtime.RuntimeRecord) {
	worker.mu.Lock()
	state, tracked := worker.states[record.GameID]
	if !tracked {
		// Never failed since the last prune: nothing to reset or emit.
		worker.mu.Unlock()
		return
	}
	priorFailureCount := state.consecutiveFailures
	mustEmit := state.failurePublished
	state.consecutiveFailures = 0
	state.failurePublished = false
	// Publish outside the lock: the publisher may block on I/O.
	worker.mu.Unlock()
	if !mustEmit {
		return
	}
	worker.publish(ctx, ports.HealthEventEnvelope{
		GameID:      record.GameID,
		ContainerID: record.CurrentContainerID,
		EventType:   health.EventTypeProbeRecovered,
		OccurredAt:  worker.clock().UTC(),
		Details:     probeRecoveredDetails(priorFailureCount),
	})
}
// recordFailure updates state on a failed probe and emits
// `probe_failed` once the threshold is crossed.
//
// Hysteresis: below the threshold, or after a probe_failed has already
// been published for this streak, the failure is only debug-logged;
// exactly one probe_failed fires at the first crossing. Note that
// failurePublished flips before publish runs, so a publish error
// (absorbed inside publish) is not retried on subsequent failures.
func (worker *Worker) recordFailure(ctx context.Context, record runtime.RuntimeRecord, lastStatus int, lastErr error) {
	worker.mu.Lock()
	state, ok := worker.states[record.GameID]
	if !ok {
		// First observed failure for this game since the last prune.
		state = &probeState{}
		worker.states[record.GameID] = state
	}
	state.consecutiveFailures++
	if state.failurePublished || state.consecutiveFailures < worker.failuresThreshold {
		// Snapshot under the lock; logging happens outside it.
		count := state.consecutiveFailures
		worker.mu.Unlock()
		worker.logger.DebugContext(ctx, "probe failure",
			"game_id", record.GameID,
			"consecutive_failures", count,
			"threshold", worker.failuresThreshold,
			"err", errString(lastErr),
		)
		return
	}
	state.failurePublished = true
	count := state.consecutiveFailures
	// Publish outside the lock: the publisher may block on I/O.
	worker.mu.Unlock()
	worker.publish(ctx, ports.HealthEventEnvelope{
		GameID:      record.GameID,
		ContainerID: record.CurrentContainerID,
		EventType:   health.EventTypeProbeFailed,
		OccurredAt:  worker.clock().UTC(),
		Details:     probeFailedDetails(count, lastStatus, errString(lastErr)),
	})
}
// publish emits one envelope through the configured publisher, updates
// the telemetry counter, and logs the outcome. Publish failures are
// absorbed: they produce an error-level log instead of propagating,
// per `rtmanager/README.md §Notification Contracts`.
func (worker *Worker) publish(ctx context.Context, envelope ports.HealthEventEnvelope) {
	if err := worker.healthEvents.Publish(ctx, envelope); err != nil {
		// NOTE(review): unlike the success path below, this log omits
		// logging.ContextAttrs(ctx) — confirm the asymmetry is
		// intentional.
		worker.logger.ErrorContext(ctx, "publish health event",
			"game_id", envelope.GameID,
			"container_id", envelope.ContainerID,
			"event_type", string(envelope.EventType),
			"err", err.Error(),
		)
		return
	}
	// Telemetry counts successful emissions only.
	worker.telemetry.RecordHealthEvent(ctx, string(envelope.EventType))
	logArgs := []any{
		"game_id", envelope.GameID,
		"container_id", envelope.ContainerID,
		"event_type", string(envelope.EventType),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	worker.logger.InfoContext(ctx, "probe event published", logArgs...)
}
// probeFailedDetails builds the JSON payload required by the
// `probe_failed` AsyncAPI variant.
func probeFailedDetails(consecutiveFailures, lastStatus int, lastError string) json.RawMessage {
	type details struct {
		ConsecutiveFailures int    `json:"consecutive_failures"`
		LastStatus          int    `json:"last_status"`
		LastError           string `json:"last_error"`
	}
	// Marshal cannot fail for a struct of ints and a string.
	encoded, _ := json.Marshal(details{
		ConsecutiveFailures: consecutiveFailures,
		LastStatus:          lastStatus,
		LastError:           lastError,
	})
	return encoded
}
// probeRecoveredDetails builds the JSON payload required by the
// `probe_recovered` AsyncAPI variant.
func probeRecoveredDetails(priorFailureCount int) json.RawMessage {
	type details struct {
		PriorFailureCount int `json:"prior_failure_count"`
	}
	// Marshal cannot fail for a struct holding a single int.
	encoded, _ := json.Marshal(details{PriorFailureCount: priorFailureCount})
	return encoded
}
// errString renders err for log and payload fields, mapping nil to "".
func errString(err error) string {
	if err != nil {
		return err.Error()
	}
	return ""
}
@@ -0,0 +1,417 @@
package healthprobe_test
import (
"context"
"encoding/json"
"errors"
"io"
"log/slog"
"net/http"
"net/http/httptest"
"sync"
"sync/atomic"
"testing"
"time"
"galaxy/rtmanager/internal/domain/health"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/telemetry"
"galaxy/rtmanager/internal/worker/healthprobe"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// silentLogger returns a logger that discards every record, keeping
// test output clean.
func silentLogger() *slog.Logger {
	handler := slog.NewTextHandler(io.Discard, nil)
	return slog.New(handler)
}
// fakeRuntimeRecords supports List/ListByStatus only; the worker does
// not call other methods.
type fakeRuntimeRecords struct {
	mu      sync.Mutex
	running []runtime.RuntimeRecord
	listErr error
}

func newFakeRuntimeRecords() *fakeRuntimeRecords { return &fakeRuntimeRecords{} }

// Set replaces the running list with a defensive copy of records.
func (f *fakeRuntimeRecords) Set(records ...runtime.RuntimeRecord) {
	f.mu.Lock()
	defer f.mu.Unlock()
	f.running = append(f.running[:0:0], records...)
}

// Clear empties the running list.
func (f *fakeRuntimeRecords) Clear() {
	f.mu.Lock()
	defer f.mu.Unlock()
	f.running = nil
}

func (f *fakeRuntimeRecords) Get(_ context.Context, _ string) (runtime.RuntimeRecord, error) {
	return runtime.RuntimeRecord{}, runtime.ErrNotFound
}

func (f *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error { return nil }

func (f *fakeRuntimeRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error {
	return nil
}

func (f *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
	return nil, nil
}

// ListByStatus returns a snapshot of the running list for
// StatusRunning, or the configured error.
func (f *fakeRuntimeRecords) ListByStatus(_ context.Context, status runtime.Status) ([]runtime.RuntimeRecord, error) {
	f.mu.Lock()
	defer f.mu.Unlock()
	switch {
	case f.listErr != nil:
		return nil, f.listErr
	case status != runtime.StatusRunning:
		return nil, nil
	}
	snapshot := make([]runtime.RuntimeRecord, len(f.running))
	copy(snapshot, f.running)
	return snapshot, nil
}
// fakeHealthEvents captures every Publish call.
type fakeHealthEvents struct {
	mu         sync.Mutex
	published  []ports.HealthEventEnvelope
	publishErr error
}

// Publish records the envelope unless a publish error is configured.
func (f *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error {
	f.mu.Lock()
	defer f.mu.Unlock()
	if err := f.publishErr; err != nil {
		return err
	}
	f.published = append(f.published, envelope)
	return nil
}

// Published returns a snapshot of every captured envelope.
func (f *fakeHealthEvents) Published() []ports.HealthEventEnvelope {
	f.mu.Lock()
	defer f.mu.Unlock()
	snapshot := make([]ports.HealthEventEnvelope, len(f.published))
	copy(snapshot, f.published)
	return snapshot
}
// engineServer is a per-game HTTP fake controlled by tests.
type engineServer struct {
server *httptest.Server
status atomic.Int32
requests atomic.Int32
}
func newEngineServer(t *testing.T) *engineServer {
t.Helper()
es := &engineServer{}
es.status.Store(http.StatusOK)
es.server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
es.requests.Add(1)
w.WriteHeader(int(es.status.Load()))
}))
t.Cleanup(es.server.Close)
return es
}
func (e *engineServer) URL() string { return e.server.URL }
func (e *engineServer) SetStatus(code int) { e.status.Store(int32(code)) }
func (e *engineServer) Stop() { e.server.Close() }
// --- harness ----------------------------------------------------------
// harness bundles the worker under test with its fakes so each test
// can manipulate the record store and inspect emitted events directly.
type harness struct {
	records *fakeRuntimeRecords // backing store read on every tick
	health  *fakeHealthEvents   // captures every emitted envelope
	worker  *healthprobe.Worker
	now     time.Time // fixed instant returned by the injected Clock
}
// newHarness wires a Worker against in-memory fakes, a fixed clock,
// and a threshold of 3 failures.
func newHarness(t *testing.T) *harness {
	t.Helper()
	fixedNow := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)
	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)
	store := newFakeRuntimeRecords()
	events := &fakeHealthEvents{}
	worker, err := healthprobe.NewWorker(healthprobe.Dependencies{
		RuntimeRecords:    store,
		HealthEvents:      events,
		HTTPClient:        &http.Client{},
		Telemetry:         telemetryRuntime,
		Interval:          50 * time.Millisecond,
		ProbeTimeout:      100 * time.Millisecond,
		FailuresThreshold: 3,
		MaxConcurrency:    4,
		Clock:             func() time.Time { return fixedNow },
		Logger:            silentLogger(),
	})
	require.NoError(t, err)
	return &harness{
		records: store,
		health:  events,
		worker:  worker,
		now:     fixedNow,
	}
}
// runningRecord builds a minimal `running` record whose engine
// endpoint points at the supplied URL.
func runningRecord(gameID, endpoint string) runtime.RuntimeRecord {
	started := time.Date(2026, 4, 27, 11, 0, 0, 0, time.UTC)
	record := runtime.RuntimeRecord{
		GameID:             gameID,
		Status:             runtime.StatusRunning,
		CurrentContainerID: "ctr-" + gameID,
		CurrentImageRef:    "galaxy/game:1.0.0",
		EngineEndpoint:     endpoint,
		StatePath:          "/var/lib/galaxy/games/" + gameID,
		DockerNetwork:      "galaxy-net",
		StartedAt:          &started,
		LastOpAt:           started,
		CreatedAt:          started,
	}
	return record
}
// --- constructor -------------------------------------------------------
// TestNewWorkerRejectsMissingDeps checks every required field in turn
// by cumulatively filling dependencies: each defective case is the
// previous one plus the next required field.
func TestNewWorkerRejectsMissingDeps(t *testing.T) {
	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)
	base := healthprobe.Dependencies{
		RuntimeRecords:    newFakeRuntimeRecords(),
		HealthEvents:      &fakeHealthEvents{},
		HTTPClient:        &http.Client{},
		Telemetry:         telemetryRuntime,
		Interval:          time.Second,
		ProbeTimeout:      time.Second,
		FailuresThreshold: 1,
	}
	fillers := []func(*healthprobe.Dependencies){
		func(d *healthprobe.Dependencies) { d.RuntimeRecords = base.RuntimeRecords },
		func(d *healthprobe.Dependencies) { d.HealthEvents = base.HealthEvents },
		func(d *healthprobe.Dependencies) { d.HTTPClient = base.HTTPClient },
		func(d *healthprobe.Dependencies) { d.Telemetry = base.Telemetry },
		func(d *healthprobe.Dependencies) { d.Interval = time.Second },
		func(d *healthprobe.Dependencies) { d.ProbeTimeout = time.Second },
	}
	defectives := make([]healthprobe.Dependencies, 0, len(fillers)+1)
	var partial healthprobe.Dependencies
	defectives = append(defectives, partial)
	for _, fill := range fillers {
		fill(&partial)
		defectives = append(defectives, partial)
	}
	for index, deps := range defectives {
		_, err := healthprobe.NewWorker(deps)
		require.Errorf(t, err, "case %d should fail", index)
	}
	_, err = healthprobe.NewWorker(base)
	require.NoError(t, err)
}
// --- behaviour --------------------------------------------------------
// TestTickHealthyDoesNotEmit checks a healthy engine produces exactly
// one probe request and zero events.
func TestTickHealthyDoesNotEmit(t *testing.T) {
	env := newHarness(t)
	engine := newEngineServer(t)
	env.records.Set(runningRecord("game-a", engine.URL()))
	env.worker.Tick(context.Background())
	published := env.health.Published()
	assert.Empty(t, published, "successful probe must not emit events")
	assert.Equal(t, int32(1), engine.requests.Load(), "exactly one probe request")
}
// TestTickFailureBelowThresholdDoesNotEmit keeps the failure count
// under the harness threshold of 3 and expects silence.
func TestTickFailureBelowThresholdDoesNotEmit(t *testing.T) {
	env := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusServiceUnavailable)
	env.records.Set(runningRecord("game-a", engine.URL()))
	for i := 0; i < 2; i++ {
		env.worker.Tick(context.Background())
	}
	assert.Empty(t, env.health.Published(), "two failures below threshold must not emit")
}
// TestTickFailuresCrossingThresholdEmitProbeFailedOnce drives five
// failing ticks (threshold is 3) and checks probe_failed fires exactly
// once, with the details payload pinned to the AsyncAPI contract.
func TestTickFailuresCrossingThresholdEmitProbeFailedOnce(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)
	h.records.Set(runningRecord("game-a", engine.URL()))
	for range 5 {
		h.worker.Tick(context.Background())
	}
	envelopes := h.health.Published()
	require.Len(t, envelopes, 1, "probe_failed must publish exactly once across many failures")
	envelope := envelopes[0]
	assert.Equal(t, health.EventTypeProbeFailed, envelope.EventType)
	assert.Equal(t, "game-a", envelope.GameID)
	assert.Equal(t, "ctr-game-a", envelope.ContainerID)
	// Decode the details payload via a local mirror of the schema.
	var details struct {
		ConsecutiveFailures int    `json:"consecutive_failures"`
		LastStatus          int    `json:"last_status"`
		LastError           string `json:"last_error"`
	}
	require.NoError(t, json.Unmarshal(envelope.Details, &details))
	assert.Equal(t, 3, details.ConsecutiveFailures, "consecutive_failures equals threshold at first emission")
	assert.Equal(t, http.StatusInternalServerError, details.LastStatus)
	assert.NotEmpty(t, details.LastError)
}
// TestTickRecoveryEmitsProbeRecoveredWithPriorFailureCount verifies
// the first healthy tick after a probe_failed emits probe_recovered
// carrying the failure count that preceded it.
func TestTickRecoveryEmitsProbeRecoveredWithPriorFailureCount(t *testing.T) {
	env := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)
	env.records.Set(runningRecord("game-a", engine.URL()))
	for i := 0; i < 3; i++ {
		env.worker.Tick(context.Background())
	}
	require.Len(t, env.health.Published(), 1, "expect probe_failed after threshold")
	// Engine heals: the very next tick must emit the recovery event.
	engine.SetStatus(http.StatusOK)
	env.worker.Tick(context.Background())
	envelopes := env.health.Published()
	require.Len(t, envelopes, 2, "recovery must emit exactly one probe_recovered")
	recovered := envelopes[1]
	assert.Equal(t, health.EventTypeProbeRecovered, recovered.EventType)
	var details struct {
		PriorFailureCount int `json:"prior_failure_count"`
	}
	require.NoError(t, json.Unmarshal(recovered.Details, &details))
	assert.Equal(t, 3, details.PriorFailureCount)
}
// TestTickFlappingDoesNotDoublePublishProbeFailed verifies continued
// failures after a published probe_failed stay silent.
func TestTickFlappingDoesNotDoublePublishProbeFailed(t *testing.T) {
	env := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)
	env.records.Set(runningRecord("game-a", engine.URL()))
	for i := 0; i < 5; i++ {
		env.worker.Tick(context.Background())
	}
	require.Len(t, env.health.Published(), 1)
	// New failure after probe_failed has been published: must not emit again.
	env.worker.Tick(context.Background())
	assert.Len(t, env.health.Published(), 1, "no new probe_failed while already in failed state")
}
// TestTickPrunesStateForGamesNoLongerRunning checks that per-game
// hysteresis state does not survive a stop/start cycle: a game that
// leaves `running` and comes back must accumulate threshold failures
// again before a second probe_failed fires.
func TestTickPrunesStateForGamesNoLongerRunning(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)
	h.records.Set(runningRecord("game-a", engine.URL()))
	for range 3 {
		h.worker.Tick(context.Background())
	}
	require.Len(t, h.health.Published(), 1, "probe_failed published before stop")
	// Game leaves running; state must be pruned.
	h.records.Clear()
	h.worker.Tick(context.Background())
	// Re-introduce the same game: counter starts fresh, new failures
	// must accumulate from zero before another probe_failed fires.
	h.records.Set(runningRecord("game-a", engine.URL()))
	h.worker.Tick(context.Background())
	h.worker.Tick(context.Background())
	assert.Len(t, h.health.Published(), 1, "fresh state must require threshold failures again")
	h.worker.Tick(context.Background())
	assert.Len(t, h.health.Published(), 2, "third fresh failure crosses threshold")
}
// TestTickProbesMultipleGamesConcurrently proves parallel probing
// deterministically: each fake engine records how many requests are in
// flight at once. Sequential probing can never observe two
// simultaneous requests; parallel probing must. This avoids the flaky
// wall-clock "elapsed < 2*latency" style of assertion that breaks on
// loaded CI machines.
func TestTickProbesMultipleGamesConcurrently(t *testing.T) {
	h := newHarness(t)
	// Each handler holds its request open briefly so the two probes
	// overlap; the hold must stay under the harness ProbeTimeout (100ms).
	const hold = 60 * time.Millisecond
	var inFlight, peak atomic.Int32
	makeEngine := func() *httptest.Server {
		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
			current := inFlight.Add(1)
			// CAS loop: record the highest simultaneous in-flight count.
			for {
				observed := peak.Load()
				if current <= observed || peak.CompareAndSwap(observed, current) {
					break
				}
			}
			time.Sleep(hold)
			inFlight.Add(-1)
			w.WriteHeader(http.StatusOK)
		}))
		t.Cleanup(server.Close)
		return server
	}
	a := makeEngine()
	b := makeEngine()
	h.records.Set(
		runningRecord("game-a", a.URL),
		runningRecord("game-b", b.URL),
	)
	h.worker.Tick(context.Background())
	assert.Equal(t, int32(2), peak.Load(), "probes must run concurrently, not sequentially")
}
// TestTickAbsorbsListError checks a failing record-store list degrades
// to a log line rather than a panic or an event.
func TestTickAbsorbsListError(t *testing.T) {
	env := newHarness(t)
	env.records.listErr = errors.New("pg down")
	require.NotPanics(t, func() { env.worker.Tick(context.Background()) })
	assert.Empty(t, env.health.Published())
}
// TestTickAbsorbsPublishError drives a threshold crossing while the
// publisher is broken and checks the worker stays stable.
func TestTickAbsorbsPublishError(t *testing.T) {
	env := newHarness(t)
	env.health.publishErr = errors.New("redis down")
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)
	env.records.Set(runningRecord("game-a", engine.URL()))
	for i := 0; i < 3; i++ {
		env.worker.Tick(context.Background())
	}
	// publishErr means nothing accumulated; the worker must not panic
	// or change state in surprising ways.
	assert.Empty(t, env.health.Published())
}
// TestRunRespectsContextCancel checks Run exits promptly with
// context.Canceled once its context is cancelled.
func TestRunRespectsContextCancel(t *testing.T) {
	env := newHarness(t)
	ctx, cancel := context.WithCancel(context.Background())
	exited := make(chan error, 1)
	go func() { exited <- env.worker.Run(ctx) }()
	cancel()
	select {
	case <-time.After(time.Second):
		t.Fatalf("Run did not exit after cancel")
	case err := <-exited:
		assert.ErrorIs(t, err, context.Canceled)
	}
}
// TestShutdownIsNoOp checks Shutdown with a live context returns nil.
func TestShutdownIsNoOp(t *testing.T) {
	env := newHarness(t)
	require.NoError(t, env.worker.Shutdown(context.Background()))
}
// --- compile-time safety ----------------------------------------------

// Keep the fakes aligned with the ports interfaces: a signature drift
// fails compilation here instead of deep inside a test.
var (
	_ ports.RuntimeRecordStore   = (*fakeRuntimeRecords)(nil)
	_ ports.HealthEventPublisher = (*fakeHealthEvents)(nil)
)