package healthprobe_test

import (
	"context"
	"encoding/json"
	"errors"
	"io"
	"log/slog"
	"net/http"
	"net/http/httptest"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/telemetry"
	"galaxy/rtmanager/internal/worker/healthprobe"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

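// silentLogger returns a logger that discards all output, keeping worker
// logging out of test output.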
func silentLogger() *slog.Logger {
	return slog.New(slog.NewTextHandler(io.Discard, nil))
}

// fakeRuntimeRecords is an in-memory ports.RuntimeRecordStore. Only List and
// ListByStatus matter here; the worker never calls the other methods, which
// are inert stubs.
type fakeRuntimeRecords struct {
	mu      sync.Mutex
	running []runtime.RuntimeRecord
	listErr error
}

func newFakeRuntimeRecords() *fakeRuntimeRecords { return &fakeRuntimeRecords{} }

func (s *fakeRuntimeRecords) Set(records ...runtime.RuntimeRecord) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.running = append([]runtime.RuntimeRecord(nil), records...)
}

func (s *fakeRuntimeRecords) Clear() {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.running = nil
}

func (s *fakeRuntimeRecords) Get(_ context.Context, _ string) (runtime.RuntimeRecord, error) {
	return runtime.RuntimeRecord{}, runtime.ErrNotFound
}
func (s *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error { return nil }
func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error {
	return nil
}
func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
	return nil, nil
}

func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, status runtime.Status) ([]runtime.RuntimeRecord, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.listErr != nil {
		return nil, s.listErr
	}
	if status != runtime.StatusRunning {
		return nil, nil
	}
	out := make([]runtime.RuntimeRecord, len(s.running))
	copy(out, s.running)
	return out, nil
}

// fakeHealthEvents records every successful Publish call; setting publishErr
// makes Publish fail without recording anything.
type fakeHealthEvents struct {
	mu         sync.Mutex
	published  []ports.HealthEventEnvelope
	publishErr error
}

func (s *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.publishErr != nil {
		return s.publishErr
	}
	s.published = append(s.published, envelope)
	return nil
}

func (s *fakeHealthEvents) Published() []ports.HealthEventEnvelope {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]ports.HealthEventEnvelope, len(s.published))
	copy(out, s.published)
	return out
}

// engineServer is a per-game HTTP fake controlled by tests.
type engineServer struct {
	server   *httptest.Server
	status   atomic.Int32
	requests atomic.Int32
}

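// newEngineServer starts a test HTTP server that answers every probe with the
// currently configured status code (200 by default) and counts requests; the
// server is shut down via t.Cleanup.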
func newEngineServer(t *testing.T) *engineServer {
	t.Helper()
	es := &engineServer{}
	es.status.Store(http.StatusOK)
	es.server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		es.requests.Add(1)
		w.WriteHeader(int(es.status.Load()))
	}))
	t.Cleanup(es.server.Close)
	return es
}

func (e *engineServer) URL() string { return e.server.URL }

func (e *engineServer) SetStatus(code int) { e.status.Store(int32(code)) }

func (e *engineServer) Stop() { e.server.Close() }

// --- harness ----------------------------------------------------------

type harness struct {
	records *fakeRuntimeRecords
	health  *fakeHealthEvents
	worker  *healthprobe.Worker
	now     time.Time
}

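// newHarness wires a worker against the fakes with deterministic settings:
// a 50ms interval, 100ms probe timeout, probe_failed after three consecutive
// failures, at most four concurrent probes, and a fixed clock.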
func newHarness(t *testing.T) *harness {
	t.Helper()
	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)

	records := newFakeRuntimeRecords()
	healthEvents := &fakeHealthEvents{}

	worker, err := healthprobe.NewWorker(healthprobe.Dependencies{
		RuntimeRecords:    records,
		HealthEvents:      healthEvents,
		HTTPClient:        &http.Client{},
		Telemetry:         telemetryRuntime,
		Interval:          50 * time.Millisecond,
		ProbeTimeout:      100 * time.Millisecond,
		FailuresThreshold: 3,
		MaxConcurrency:    4,
		Clock:             func() time.Time { return time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC) },
		Logger:            silentLogger(),
	})
	require.NoError(t, err)

	return &harness{
		records: records,
		health:  healthEvents,
		worker:  worker,
		now:     time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC),
	}
}

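// runningRecord builds a minimal StatusRunning record whose engine endpoint
// points at the given test server URL.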
func runningRecord(gameID, endpoint string) runtime.RuntimeRecord {
	startedAt := time.Date(2026, 4, 27, 11, 0, 0, 0, time.UTC)
	return runtime.RuntimeRecord{
		GameID:             gameID,
		Status:             runtime.StatusRunning,
		CurrentContainerID: "ctr-" + gameID,
		CurrentImageRef:    "galaxy/game:1.0.0",
		EngineEndpoint:     endpoint,
		StatePath:          "/var/lib/galaxy/games/" + gameID,
		DockerNetwork:      "galaxy-net",
		StartedAt:          &startedAt,
		LastOpAt:           startedAt,
		CreatedAt:          startedAt,
	}
}

// --- constructor -------------------------------------------------------

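// TestNewWorkerRejectsMissingDeps builds progressively larger dependency
// sets, expecting NewWorker to reject each incomplete one and accept the
// complete base configuration.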
func TestNewWorkerRejectsMissingDeps(t *testing.T) {
	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)

	base := healthprobe.Dependencies{
		RuntimeRecords:    newFakeRuntimeRecords(),
		HealthEvents:      &fakeHealthEvents{},
		HTTPClient:        &http.Client{},
		Telemetry:         telemetryRuntime,
		Interval:          time.Second,
		ProbeTimeout:      time.Second,
		FailuresThreshold: 1,
	}

	defectives := []healthprobe.Dependencies{
		{},
		{RuntimeRecords: base.RuntimeRecords},
		{RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents},
		{RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents, HTTPClient: base.HTTPClient},
		{RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents, HTTPClient: base.HTTPClient, Telemetry: base.Telemetry},
		{RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents, HTTPClient: base.HTTPClient, Telemetry: base.Telemetry, Interval: time.Second},
		{RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents, HTTPClient: base.HTTPClient, Telemetry: base.Telemetry, Interval: time.Second, ProbeTimeout: time.Second},
	}
	for index, deps := range defectives {
		_, err := healthprobe.NewWorker(deps)
		require.Errorf(t, err, "case %d should fail", index)
	}

	_, err = healthprobe.NewWorker(base)
	require.NoError(t, err)
}

// --- behaviour --------------------------------------------------------

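// TestTickHealthyDoesNotEmit checks that a single healthy probe emits no
// health events and hits the engine exactly once.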
func TestTickHealthyDoesNotEmit(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)

	h.records.Set(runningRecord("game-a", engine.URL()))
	h.worker.Tick(context.Background())

	assert.Empty(t, h.health.Published(), "successful probe must not emit events")
	assert.Equal(t, int32(1), engine.requests.Load(), "exactly one probe request")
}

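// TestTickFailureBelowThresholdDoesNotEmit verifies that two failing ticks
// stay below the threshold of three, so nothing is published.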
func TestTickFailureBelowThresholdDoesNotEmit(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusServiceUnavailable)

	h.records.Set(runningRecord("game-a", engine.URL()))
	h.worker.Tick(context.Background())
	h.worker.Tick(context.Background())

	assert.Empty(t, h.health.Published(), "two failures below threshold must not emit")
}

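// TestTickFailuresCrossingThresholdEmitProbeFailedOnce drives five failing
// ticks and expects a single probe_failed event whose details carry the
// consecutive failure count, last HTTP status, and last error.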
func TestTickFailuresCrossingThresholdEmitProbeFailedOnce(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)

	h.records.Set(runningRecord("game-a", engine.URL()))

	for range 5 {
		h.worker.Tick(context.Background())
	}

	envelopes := h.health.Published()
	require.Len(t, envelopes, 1, "probe_failed must publish exactly once across many failures")
	envelope := envelopes[0]
	assert.Equal(t, health.EventTypeProbeFailed, envelope.EventType)
	assert.Equal(t, "game-a", envelope.GameID)
	assert.Equal(t, "ctr-game-a", envelope.ContainerID)

	var details struct {
		ConsecutiveFailures int    `json:"consecutive_failures"`
		LastStatus          int    `json:"last_status"`
		LastError           string `json:"last_error"`
	}
	require.NoError(t, json.Unmarshal(envelope.Details, &details))
	assert.Equal(t, 3, details.ConsecutiveFailures, "consecutive_failures equals threshold at first emission")
	assert.Equal(t, http.StatusInternalServerError, details.LastStatus)
	assert.NotEmpty(t, details.LastError)
}

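// TestTickRecoveryEmitsProbeRecoveredWithPriorFailureCount pushes the game
// past the failure threshold, then lets it succeed and expects exactly one
// probe_recovered event reporting how many failures preceded the recovery.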
func TestTickRecoveryEmitsProbeRecoveredWithPriorFailureCount(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)

	h.records.Set(runningRecord("game-a", engine.URL()))

	for range 3 {
		h.worker.Tick(context.Background())
	}
	require.Len(t, h.health.Published(), 1, "expect probe_failed after threshold")

	engine.SetStatus(http.StatusOK)
	h.worker.Tick(context.Background())

	envelopes := h.health.Published()
	require.Len(t, envelopes, 2, "recovery must emit exactly one probe_recovered")
	envelope := envelopes[1]
	assert.Equal(t, health.EventTypeProbeRecovered, envelope.EventType)

	var details struct {
		PriorFailureCount int `json:"prior_failure_count"`
	}
	require.NoError(t, json.Unmarshal(envelope.Details, &details))
	assert.Equal(t, 3, details.PriorFailureCount)
}

func TestTickFlappingDoesNotDoublePublishProbeFailed(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)

	h.records.Set(runningRecord("game-a", engine.URL()))
	for range 5 {
		h.worker.Tick(context.Background())
	}
	require.Len(t, h.health.Published(), 1)

	// New failure after probe_failed has been published: must not emit again.
	h.worker.Tick(context.Background())
	assert.Len(t, h.health.Published(), 1, "no new probe_failed while already in failed state")
}

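// TestTickPrunesStateForGamesNoLongerRunning confirms that once a game leaves
// the running set its failure counter is dropped, so re-adding the same game
// must accumulate threshold failures again before another probe_failed fires.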
func TestTickPrunesStateForGamesNoLongerRunning(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)

	h.records.Set(runningRecord("game-a", engine.URL()))
	for range 3 {
		h.worker.Tick(context.Background())
	}
	require.Len(t, h.health.Published(), 1, "probe_failed published before stop")

	// Game leaves running; state must be pruned.
	h.records.Clear()
	h.worker.Tick(context.Background())

	// Re-introduce the same game: counter starts fresh, new failures
	// must accumulate from zero before another probe_failed fires.
	h.records.Set(runningRecord("game-a", engine.URL()))
	h.worker.Tick(context.Background())
	h.worker.Tick(context.Background())
	assert.Len(t, h.health.Published(), 1, "fresh state must require threshold failures again")

	h.worker.Tick(context.Background())
	assert.Len(t, h.health.Published(), 2, "third fresh failure crosses threshold")
}

func TestTickProbesMultipleGamesConcurrently(t *testing.T) {
	h := newHarness(t)

	// Two slow engines that simulate noticeable latency. Sequential
	// execution would take 2*latency; parallel finishes near 1*latency.
	const latency = 80 * time.Millisecond
	makeSlowEngine := func() *httptest.Server {
		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
			time.Sleep(latency)
			w.WriteHeader(http.StatusOK)
		}))
		t.Cleanup(server.Close)
		return server
	}
	a := makeSlowEngine()
	b := makeSlowEngine()

	h.records.Set(
		runningRecord("game-a", a.URL),
		runningRecord("game-b", b.URL),
	)

	start := time.Now()
	h.worker.Tick(context.Background())
	elapsed := time.Since(start)

	assert.Less(t, elapsed, 2*latency, "probes must run concurrently, not sequentially")
}

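// TestTickAbsorbsListError makes the record store fail and expects Tick to
// swallow the error without panicking or publishing anything.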
func TestTickAbsorbsListError(t *testing.T) {
	h := newHarness(t)
	h.records.listErr = errors.New("pg down")

	require.NotPanics(t, func() { h.worker.Tick(context.Background()) })
	assert.Empty(t, h.health.Published())
}

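// TestTickAbsorbsPublishError makes the event publisher fail and checks the
// worker keeps ticking with nothing recorded.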
func TestTickAbsorbsPublishError(t *testing.T) {
	h := newHarness(t)
	h.health.publishErr = errors.New("redis down")
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)

	h.records.Set(runningRecord("game-a", engine.URL()))
	for range 3 {
		h.worker.Tick(context.Background())
	}
	// publishErr means nothing accumulated; the worker must not panic
	// or change state in surprising ways.
	assert.Empty(t, h.health.Published())
}

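// TestRunRespectsContextCancel starts the Run loop, cancels its context, and
// expects it to return context.Canceled within a second.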
func TestRunRespectsContextCancel(t *testing.T) {
	h := newHarness(t)

	ctx, cancel := context.WithCancel(context.Background())
	done := make(chan error, 1)
	go func() { done <- h.worker.Run(ctx) }()

	cancel()
	select {
	case err := <-done:
		assert.ErrorIs(t, err, context.Canceled)
	case <-time.After(time.Second):
		t.Fatalf("Run did not exit after cancel")
	}
}

func TestShutdownIsNoOp(t *testing.T) {
	h := newHarness(t)
	require.NoError(t, h.worker.Shutdown(context.Background()))
}

// --- compile-time safety ----------------------------------------------

var (
	_ ports.RuntimeRecordStore    = (*fakeRuntimeRecords)(nil)
	_ ports.HealthEventPublisher = (*fakeHealthEvents)(nil)
)