package healthprobe_test

import (
	"context"
	"encoding/json"
	"errors"
	"io"
	"log/slog"
	"net/http"
	"net/http/httptest"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/telemetry"
	"galaxy/rtmanager/internal/worker/healthprobe"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

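// silentLogger returns a logger that discards all output, keeping worker
// logging out of test output.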
func silentLogger() *slog.Logger {
	return slog.New(slog.NewTextHandler(io.Discard, nil))
}

// fakeRuntimeRecords is an in-memory ports.RuntimeRecordStore. Only List and
// ListByStatus matter here; the worker never calls the other methods, which
// are inert stubs.
type fakeRuntimeRecords struct {
	mu      sync.Mutex
	running []runtime.RuntimeRecord
	listErr error
}

func newFakeRuntimeRecords() *fakeRuntimeRecords { return &fakeRuntimeRecords{} }

func (s *fakeRuntimeRecords) Set(records ...runtime.RuntimeRecord) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.running = append([]runtime.RuntimeRecord(nil), records...)
}

func (s *fakeRuntimeRecords) Clear() {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.running = nil
}

func (s *fakeRuntimeRecords) Get(_ context.Context, _ string) (runtime.RuntimeRecord, error) {
	return runtime.RuntimeRecord{}, runtime.ErrNotFound
}
func (s *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error { return nil }
func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error {
	return nil
}
func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
	return nil, nil
}

func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, status runtime.Status) ([]runtime.RuntimeRecord, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.listErr != nil {
		return nil, s.listErr
	}
	if status != runtime.StatusRunning {
		return nil, nil
	}
	out := make([]runtime.RuntimeRecord, len(s.running))
	copy(out, s.running)
	return out, nil
}

// fakeHealthEvents records every successful Publish call; setting publishErr
// makes Publish fail without recording anything.
type fakeHealthEvents struct {
	mu         sync.Mutex
	published  []ports.HealthEventEnvelope
	publishErr error
}

func (s *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.publishErr != nil {
		return s.publishErr
	}
	s.published = append(s.published, envelope)
	return nil
}

func (s *fakeHealthEvents) Published() []ports.HealthEventEnvelope {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]ports.HealthEventEnvelope, len(s.published))
	copy(out, s.published)
	return out
}

// engineServer is a per-game HTTP fake controlled by tests.
type engineServer struct {
	server   *httptest.Server
	status   atomic.Int32
	requests atomic.Int32
}

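// newEngineServer starts a test HTTP server that answers every probe with the
// currently configured status code (200 by default) and counts requests; the
// server is shut down via t.Cleanup.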
func newEngineServer(t *testing.T) *engineServer {
	t.Helper()
	es := &engineServer{}
	es.status.Store(http.StatusOK)
	es.server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		es.requests.Add(1)
		w.WriteHeader(int(es.status.Load()))
	}))
	t.Cleanup(es.server.Close)
	return es
}

func (e *engineServer) URL() string { return e.server.URL }

func (e *engineServer) SetStatus(code int) { e.status.Store(int32(code)) }

func (e *engineServer) Stop() { e.server.Close() }

// --- harness ----------------------------------------------------------

type harness struct {
	records *fakeRuntimeRecords
	health  *fakeHealthEvents
	worker  *healthprobe.Worker
	now     time.Time
}

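// newHarness wires a worker against the fakes with deterministic settings:
// a 50ms interval, 100ms probe timeout, probe_failed after three consecutive
// failures, at most four concurrent probes, and a fixed clock.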
func newHarness(t *testing.T) *harness {
	t.Helper()
	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)

	records := newFakeRuntimeRecords()
	healthEvents := &fakeHealthEvents{}

	worker, err := healthprobe.NewWorker(healthprobe.Dependencies{
		RuntimeRecords:    records,
		HealthEvents:      healthEvents,
		HTTPClient:        &http.Client{},
		Telemetry:         telemetryRuntime,
		Interval:          50 * time.Millisecond,
		ProbeTimeout:      100 * time.Millisecond,
		FailuresThreshold: 3,
		MaxConcurrency:    4,
		Clock:             func() time.Time { return time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC) },
		Logger:            silentLogger(),
	})
	require.NoError(t, err)

	return &harness{
		records: records,
		health:  healthEvents,
		worker:  worker,
		now:     time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC),
	}
}

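// runningRecord builds a minimal StatusRunning record whose engine endpoint
// points at the given test server URL.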
func runningRecord(gameID, endpoint string) runtime.RuntimeRecord {
	startedAt := time.Date(2026, 4, 27, 11, 0, 0, 0, time.UTC)
	return runtime.RuntimeRecord{
		GameID:             gameID,
		Status:             runtime.StatusRunning,
		CurrentContainerID: "ctr-" + gameID,
		CurrentImageRef:    "galaxy/game:1.0.0",
		EngineEndpoint:     endpoint,
		StatePath:          "/var/lib/galaxy/games/" + gameID,
		DockerNetwork:      "galaxy-net",
		StartedAt:          &startedAt,
		LastOpAt:           startedAt,
		CreatedAt:          startedAt,
	}
}

// --- constructor -------------------------------------------------------

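// TestNewWorkerRejectsMissingDeps builds progressively larger dependency
// sets, expecting NewWorker to reject each incomplete one and accept the
// complete base configuration.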
func TestNewWorkerRejectsMissingDeps(t *testing.T) {
	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)

	base := healthprobe.Dependencies{
		RuntimeRecords:    newFakeRuntimeRecords(),
		HealthEvents:      &fakeHealthEvents{},
		HTTPClient:        &http.Client{},
		Telemetry:         telemetryRuntime,
		Interval:          time.Second,
		ProbeTimeout:      time.Second,
		FailuresThreshold: 1,
	}

	defectives := []healthprobe.Dependencies{
		{},
		{RuntimeRecords: base.RuntimeRecords},
		{RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents},
		{RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents, HTTPClient: base.HTTPClient},
		{RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents, HTTPClient: base.HTTPClient, Telemetry: base.Telemetry},
		{RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents, HTTPClient: base.HTTPClient, Telemetry: base.Telemetry, Interval: time.Second},
		{RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents, HTTPClient: base.HTTPClient, Telemetry: base.Telemetry, Interval: time.Second, ProbeTimeout: time.Second},
	}
	for index, deps := range defectives {
		_, err := healthprobe.NewWorker(deps)
		require.Errorf(t, err, "case %d should fail", index)
	}

	_, err = healthprobe.NewWorker(base)
	require.NoError(t, err)
}

// --- behaviour --------------------------------------------------------

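// TestTickHealthyDoesNotEmit checks that a single healthy probe emits no
// health events and hits the engine exactly once.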
func TestTickHealthyDoesNotEmit(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)

	h.records.Set(runningRecord("game-a", engine.URL()))
	h.worker.Tick(context.Background())

	assert.Empty(t, h.health.Published(), "successful probe must not emit events")
	assert.Equal(t, int32(1), engine.requests.Load(), "exactly one probe request")
}

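// TestTickFailureBelowThresholdDoesNotEmit verifies that two failing ticks
// stay below the threshold of three, so nothing is published.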
func TestTickFailureBelowThresholdDoesNotEmit(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusServiceUnavailable)

	h.records.Set(runningRecord("game-a", engine.URL()))
	h.worker.Tick(context.Background())
	h.worker.Tick(context.Background())

	assert.Empty(t, h.health.Published(), "two failures below threshold must not emit")
}

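// TestTickFailuresCrossingThresholdEmitProbeFailedOnce drives five failing
// ticks and expects a single probe_failed event whose details carry the
// consecutive failure count, last HTTP status, and last error.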
func TestTickFailuresCrossingThresholdEmitProbeFailedOnce(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)

	h.records.Set(runningRecord("game-a", engine.URL()))

	for range 5 {
		h.worker.Tick(context.Background())
	}

	envelopes := h.health.Published()
	require.Len(t, envelopes, 1, "probe_failed must publish exactly once across many failures")
	envelope := envelopes[0]
	assert.Equal(t, health.EventTypeProbeFailed, envelope.EventType)
	assert.Equal(t, "game-a", envelope.GameID)
	assert.Equal(t, "ctr-game-a", envelope.ContainerID)

	var details struct {
		ConsecutiveFailures int    `json:"consecutive_failures"`
		LastStatus          int    `json:"last_status"`
		LastError           string `json:"last_error"`
	}
	require.NoError(t, json.Unmarshal(envelope.Details, &details))
	assert.Equal(t, 3, details.ConsecutiveFailures, "consecutive_failures equals threshold at first emission")
	assert.Equal(t, http.StatusInternalServerError, details.LastStatus)
	assert.NotEmpty(t, details.LastError)
}

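// TestTickRecoveryEmitsProbeRecoveredWithPriorFailureCount pushes the game
// past the failure threshold, then lets it succeed and expects exactly one
// probe_recovered event reporting how many failures preceded the recovery.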
func TestTickRecoveryEmitsProbeRecoveredWithPriorFailureCount(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)

	h.records.Set(runningRecord("game-a", engine.URL()))

	for range 3 {
		h.worker.Tick(context.Background())
	}
	require.Len(t, h.health.Published(), 1, "expect probe_failed after threshold")

	engine.SetStatus(http.StatusOK)
	h.worker.Tick(context.Background())

	envelopes := h.health.Published()
	require.Len(t, envelopes, 2, "recovery must emit exactly one probe_recovered")
	envelope := envelopes[1]
	assert.Equal(t, health.EventTypeProbeRecovered, envelope.EventType)

	var details struct {
		PriorFailureCount int `json:"prior_failure_count"`
	}
	require.NoError(t, json.Unmarshal(envelope.Details, &details))
	assert.Equal(t, 3, details.PriorFailureCount)
}

func TestTickFlappingDoesNotDoublePublishProbeFailed(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)

	h.records.Set(runningRecord("game-a", engine.URL()))
	for range 5 {
		h.worker.Tick(context.Background())
	}
	require.Len(t, h.health.Published(), 1)

	// New failure after probe_failed has been published: must not emit again.
	h.worker.Tick(context.Background())
	assert.Len(t, h.health.Published(), 1, "no new probe_failed while already in failed state")
}

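// TestTickPrunesStateForGamesNoLongerRunning confirms that once a game leaves
// the running set its failure counter is dropped, so re-adding the same game
// must accumulate threshold failures again before another probe_failed fires.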
func TestTickPrunesStateForGamesNoLongerRunning(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)

	h.records.Set(runningRecord("game-a", engine.URL()))
	for range 3 {
		h.worker.Tick(context.Background())
	}
	require.Len(t, h.health.Published(), 1, "probe_failed published before stop")

	// Game leaves running; state must be pruned.
	h.records.Clear()
	h.worker.Tick(context.Background())

	// Re-introduce the same game: counter starts fresh, new failures
	// must accumulate from zero before another probe_failed fires.
	h.records.Set(runningRecord("game-a", engine.URL()))
	h.worker.Tick(context.Background())
	h.worker.Tick(context.Background())
	assert.Len(t, h.health.Published(), 1, "fresh state must require threshold failures again")

	h.worker.Tick(context.Background())
	assert.Len(t, h.health.Published(), 2, "third fresh failure crosses threshold")
}

func TestTickProbesMultipleGamesConcurrently(t *testing.T) {
	h := newHarness(t)

	// Two slow engines that simulate noticeable latency. Sequential
	// execution would take 2*latency; parallel finishes near 1*latency.
	const latency = 80 * time.Millisecond
	makeSlowEngine := func() *httptest.Server {
		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
			time.Sleep(latency)
			w.WriteHeader(http.StatusOK)
		}))
		t.Cleanup(server.Close)
		return server
	}
	a := makeSlowEngine()
	b := makeSlowEngine()

	h.records.Set(
		runningRecord("game-a", a.URL),
		runningRecord("game-b", b.URL),
	)

	start := time.Now()
	h.worker.Tick(context.Background())
	elapsed := time.Since(start)

	assert.Less(t, elapsed, 2*latency, "probes must run concurrently, not sequentially")
}

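// TestTickAbsorbsListError makes the record store fail and expects Tick to
// swallow the error without panicking or publishing anything.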
func TestTickAbsorbsListError(t *testing.T) {
	h := newHarness(t)
	h.records.listErr = errors.New("pg down")

	require.NotPanics(t, func() { h.worker.Tick(context.Background()) })
	assert.Empty(t, h.health.Published())
}

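// TestTickAbsorbsPublishError makes the event publisher fail and checks the
// worker keeps ticking with nothing recorded.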
func TestTickAbsorbsPublishError(t *testing.T) {
	h := newHarness(t)
	h.health.publishErr = errors.New("redis down")
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)

	h.records.Set(runningRecord("game-a", engine.URL()))
	for range 3 {
		h.worker.Tick(context.Background())
	}
	// publishErr means nothing accumulated; the worker must not panic
	// or change state in surprising ways.
	assert.Empty(t, h.health.Published())
}

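// TestRunRespectsContextCancel starts the Run loop, cancels its context, and
// expects it to return context.Canceled within a second.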
func TestRunRespectsContextCancel(t *testing.T) {
	h := newHarness(t)

	ctx, cancel := context.WithCancel(context.Background())
	done := make(chan error, 1)
	go func() { done <- h.worker.Run(ctx) }()

	cancel()
	select {
	case err := <-done:
		assert.ErrorIs(t, err, context.Canceled)
	case <-time.After(time.Second):
		t.Fatalf("Run did not exit after cancel")
	}
}

func TestShutdownIsNoOp(t *testing.T) {
	h := newHarness(t)
	require.NoError(t, h.worker.Shutdown(context.Background()))
}

// --- compile-time safety ----------------------------------------------

var (
	_ ports.RuntimeRecordStore    = (*fakeRuntimeRecords)(nil)
	_ ports.HealthEventPublisher = (*fakeHealthEvents)(nil)
)