package healthprobe_test

import (
	"context"
	"encoding/json"
	"errors"
	"io"
	"log/slog"
	"net/http"
	"net/http/httptest"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/telemetry"
	"galaxy/rtmanager/internal/worker/healthprobe"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func silentLogger() *slog.Logger {
	return slog.New(slog.NewTextHandler(io.Discard, nil))
}

// fakeRuntimeRecords implements ports.RuntimeRecordStore. Only List and
// ListByStatus are meaningful; the worker does not call the other
// methods, which exist as stubs to satisfy the interface.
type fakeRuntimeRecords struct {
	mu      sync.Mutex
	running []runtime.RuntimeRecord
	listErr error
}

func newFakeRuntimeRecords() *fakeRuntimeRecords { return &fakeRuntimeRecords{} }

func (s *fakeRuntimeRecords) Set(records ...runtime.RuntimeRecord) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.running = append([]runtime.RuntimeRecord(nil), records...)
}

func (s *fakeRuntimeRecords) Clear() {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.running = nil
}

func (s *fakeRuntimeRecords) Get(_ context.Context, _ string) (runtime.RuntimeRecord, error) {
	return runtime.RuntimeRecord{}, runtime.ErrNotFound
}

func (s *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error { return nil }

func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error {
	return nil
}

func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
	return nil, nil
}

func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, status runtime.Status) ([]runtime.RuntimeRecord, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.listErr != nil {
		return nil, s.listErr
	}
	if status != runtime.StatusRunning {
		return nil, nil
	}
	out := make([]runtime.RuntimeRecord, len(s.running))
	copy(out, s.running)
	return out, nil
}

// fakeHealthEvents captures every Publish call.
type fakeHealthEvents struct {
	mu         sync.Mutex
	published  []ports.HealthEventEnvelope
	publishErr error
}

func (s *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.publishErr != nil {
		return s.publishErr
	}
	s.published = append(s.published, envelope)
	return nil
}

func (s *fakeHealthEvents) Published() []ports.HealthEventEnvelope {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]ports.HealthEventEnvelope, len(s.published))
	copy(out, s.published)
	return out
}
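// Both fakes guard their state with a mutex because the worker probes
// games in parallel (newHarness sets MaxConcurrency to 4); the engine
// fake below relies on atomics for the same reason.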
// engineServer is a per-game HTTP fake controlled by tests.
type engineServer struct {
	server   *httptest.Server
	status   atomic.Int32
	requests atomic.Int32
}

func newEngineServer(t *testing.T) *engineServer {
	t.Helper()
	es := &engineServer{}
	es.status.Store(http.StatusOK)
	es.server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		es.requests.Add(1)
		w.WriteHeader(int(es.status.Load()))
	}))
	t.Cleanup(es.server.Close)
	return es
}

func (e *engineServer) URL() string { return e.server.URL }

func (e *engineServer) SetStatus(code int) { e.status.Store(int32(code)) }

func (e *engineServer) Stop() { e.server.Close() }

// --- harness ----------------------------------------------------------

type harness struct {
	records *fakeRuntimeRecords
	health  *fakeHealthEvents
	worker  *healthprobe.Worker
	now     time.Time
}

func newHarness(t *testing.T) *harness {
	t.Helper()
	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)
	records := newFakeRuntimeRecords()
	healthEvents := &fakeHealthEvents{}
	worker, err := healthprobe.NewWorker(healthprobe.Dependencies{
		RuntimeRecords:    records,
		HealthEvents:      healthEvents,
		HTTPClient:        &http.Client{},
		Telemetry:         telemetryRuntime,
		Interval:          50 * time.Millisecond,
		ProbeTimeout:      100 * time.Millisecond,
		FailuresThreshold: 3,
		MaxConcurrency:    4,
		Clock:             func() time.Time { return time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC) },
		Logger:            silentLogger(),
	})
	require.NoError(t, err)
	return &harness{
		records: records,
		health:  healthEvents,
		worker:  worker,
		now:     time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC),
	}
}

func runningRecord(gameID, endpoint string) runtime.RuntimeRecord {
	startedAt := time.Date(2026, 4, 27, 11, 0, 0, 0, time.UTC)
	return runtime.RuntimeRecord{
		GameID:             gameID,
		Status:             runtime.StatusRunning,
		CurrentContainerID: "ctr-" + gameID,
		CurrentImageRef:    "galaxy/game:1.0.0",
		EngineEndpoint:     endpoint,
		StatePath:          "/var/lib/galaxy/games/" + gameID,
		DockerNetwork:      "galaxy-net",
		StartedAt:          &startedAt,
		LastOpAt:           startedAt,
		CreatedAt:          startedAt,
	}
}

// --- constructor -------------------------------------------------------

func TestNewWorkerRejectsMissingDeps(t *testing.T) {
	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)
	base := healthprobe.Dependencies{
		RuntimeRecords:    newFakeRuntimeRecords(),
		HealthEvents:      &fakeHealthEvents{},
		HTTPClient:        &http.Client{},
		Telemetry:         telemetryRuntime,
		Interval:          time.Second,
		ProbeTimeout:      time.Second,
		FailuresThreshold: 1,
	}
	defectives := []healthprobe.Dependencies{
		{},
		{RuntimeRecords: base.RuntimeRecords},
		{RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents},
		{RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents, HTTPClient: base.HTTPClient},
		{RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents, HTTPClient: base.HTTPClient, Telemetry: base.Telemetry},
		{RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents, HTTPClient: base.HTTPClient, Telemetry: base.Telemetry, Interval: time.Second},
		{RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents, HTTPClient: base.HTTPClient, Telemetry: base.Telemetry, Interval: time.Second, ProbeTimeout: time.Second},
	}
	for index, deps := range defectives {
		_, err := healthprobe.NewWorker(deps)
		require.Errorf(t, err, "case %d should fail", index)
	}
	_, err = healthprobe.NewWorker(base)
	require.NoError(t, err)
}

// --- behaviour --------------------------------------------------------
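// The tests below drive the worker by calling Tick directly instead of
// running the Interval loop, and newHarness pins the Clock, so every
// scenario is deterministic and independent of wall-clock timing.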
func TestTickHealthyDoesNotEmit(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)
	h.records.Set(runningRecord("game-a", engine.URL()))

	h.worker.Tick(context.Background())

	assert.Empty(t, h.health.Published(), "successful probe must not emit events")
	assert.Equal(t, int32(1), engine.requests.Load(), "exactly one probe request")
}

func TestTickFailureBelowThresholdDoesNotEmit(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusServiceUnavailable)
	h.records.Set(runningRecord("game-a", engine.URL()))

	h.worker.Tick(context.Background())
	h.worker.Tick(context.Background())

	assert.Empty(t, h.health.Published(), "two failures below threshold must not emit")
}

func TestTickFailuresCrossingThresholdEmitProbeFailedOnce(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)
	h.records.Set(runningRecord("game-a", engine.URL()))

	for range 5 {
		h.worker.Tick(context.Background())
	}

	envelopes := h.health.Published()
	require.Len(t, envelopes, 1, "probe_failed must publish exactly once across many failures")
	envelope := envelopes[0]
	assert.Equal(t, health.EventTypeProbeFailed, envelope.EventType)
	assert.Equal(t, "game-a", envelope.GameID)
	assert.Equal(t, "ctr-game-a", envelope.ContainerID)

	var details struct {
		ConsecutiveFailures int    `json:"consecutive_failures"`
		LastStatus          int    `json:"last_status"`
		LastError           string `json:"last_error"`
	}
	require.NoError(t, json.Unmarshal(envelope.Details, &details))
	assert.Equal(t, 3, details.ConsecutiveFailures, "consecutive_failures equals threshold at first emission")
	assert.Equal(t, http.StatusInternalServerError, details.LastStatus)
	assert.NotEmpty(t, details.LastError)
}

func TestTickRecoveryEmitsProbeRecoveredWithPriorFailureCount(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)
	h.records.Set(runningRecord("game-a", engine.URL()))

	for range 3 {
		h.worker.Tick(context.Background())
	}
	require.Len(t, h.health.Published(), 1, "expect probe_failed after threshold")

	engine.SetStatus(http.StatusOK)
	h.worker.Tick(context.Background())

	envelopes := h.health.Published()
	require.Len(t, envelopes, 2, "recovery must emit exactly one probe_recovered")
	envelope := envelopes[1]
	assert.Equal(t, health.EventTypeProbeRecovered, envelope.EventType)

	var details struct {
		PriorFailureCount int `json:"prior_failure_count"`
	}
	require.NoError(t, json.Unmarshal(envelope.Details, &details))
	assert.Equal(t, 3, details.PriorFailureCount)
}

func TestTickFlappingDoesNotDoublePublishProbeFailed(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)
	h.records.Set(runningRecord("game-a", engine.URL()))

	for range 5 {
		h.worker.Tick(context.Background())
	}
	require.Len(t, h.health.Published(), 1)

	// New failure after probe_failed has been published: must not emit again.
	h.worker.Tick(context.Background())

	assert.Len(t, h.health.Published(), 1, "no new probe_failed while already in failed state")
}
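// Note: with FailuresThreshold set to 3 in newHarness, a game whose
// per-game state has been pruned must fail three more times after
// re-entering running before another probe_failed fires.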
func TestTickPrunesStateForGamesNoLongerRunning(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)
	h.records.Set(runningRecord("game-a", engine.URL()))

	for range 3 {
		h.worker.Tick(context.Background())
	}
	require.Len(t, h.health.Published(), 1, "probe_failed published before stop")

	// Game leaves running; state must be pruned.
	h.records.Clear()
	h.worker.Tick(context.Background())

	// Re-introduce the same game: counter starts fresh, new failures
	// must accumulate from zero before another probe_failed fires.
	h.records.Set(runningRecord("game-a", engine.URL()))
	h.worker.Tick(context.Background())
	h.worker.Tick(context.Background())
	assert.Len(t, h.health.Published(), 1, "fresh state must require threshold failures again")

	h.worker.Tick(context.Background())
	assert.Len(t, h.health.Published(), 2, "third fresh failure crosses threshold")
}

func TestTickProbesMultipleGamesConcurrently(t *testing.T) {
	h := newHarness(t)

	// Two slow engines that simulate noticeable latency. Sequential
	// execution would take 2*latency; parallel finishes near 1*latency.
	const latency = 80 * time.Millisecond
	makeSlowEngine := func() *httptest.Server {
		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
			time.Sleep(latency)
			w.WriteHeader(http.StatusOK)
		}))
		t.Cleanup(server.Close)
		return server
	}
	a := makeSlowEngine()
	b := makeSlowEngine()
	h.records.Set(
		runningRecord("game-a", a.URL),
		runningRecord("game-b", b.URL),
	)

	start := time.Now()
	h.worker.Tick(context.Background())
	elapsed := time.Since(start)

	assert.Less(t, elapsed, 2*latency, "probes must run concurrently, not sequentially")
}

func TestTickAbsorbsListError(t *testing.T) {
	h := newHarness(t)
	h.records.listErr = errors.New("pg down")

	require.NotPanics(t, func() { h.worker.Tick(context.Background()) })
	assert.Empty(t, h.health.Published())
}

func TestTickAbsorbsPublishError(t *testing.T) {
	h := newHarness(t)
	h.health.publishErr = errors.New("redis down")
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)
	h.records.Set(runningRecord("game-a", engine.URL()))

	for range 3 {
		h.worker.Tick(context.Background())
	}

	// publishErr means nothing accumulated; the worker must not panic
	// or change state in surprising ways.
	assert.Empty(t, h.health.Published())
}

func TestRunRespectsContextCancel(t *testing.T) {
	h := newHarness(t)
	ctx, cancel := context.WithCancel(context.Background())
	done := make(chan error, 1)
	go func() { done <- h.worker.Run(ctx) }()

	cancel()

	select {
	case err := <-done:
		assert.ErrorIs(t, err, context.Canceled)
	case <-time.After(time.Second):
		t.Fatalf("Run did not exit after cancel")
	}
}

func TestShutdownIsNoOp(t *testing.T) {
	h := newHarness(t)
	require.NoError(t, h.worker.Shutdown(context.Background()))
}

// --- compile-time safety ----------------------------------------------

var (
	_ ports.RuntimeRecordStore   = (*fakeRuntimeRecords)(nil)
	_ ports.HealthEventPublisher = (*fakeHealthEvents)(nil)
)
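// A hedged addition, not part of the original suite: a quick self-check
// of the engine fake's contract (status switching and request counting),
// using only helpers and imports defined above.
func TestEngineServerFakeContract(t *testing.T) {
	es := newEngineServer(t)
	es.SetStatus(http.StatusServiceUnavailable)

	resp, err := http.Get(es.URL())
	require.NoError(t, err)
	require.NoError(t, resp.Body.Close())

	assert.Equal(t, http.StatusServiceUnavailable, resp.StatusCode)
	assert.Equal(t, int32(1), es.requests.Load(), "every request is counted")
}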