package reconcile_test

import (
	"context"
	"encoding/json"
	"errors"
	"io"
	"log/slog"
	"strconv"
	"sync"
	"testing"
	"time"

	"galaxy/rtmanager/internal/adapters/docker/mocks"
	"galaxy/rtmanager/internal/config"
	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/startruntime"
	"galaxy/rtmanager/internal/telemetry"
	"galaxy/rtmanager/internal/worker/reconcile"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"go.uber.org/mock/gomock"
)

// silentLogger returns a logger that discards all output, keeping test runs quiet.
func silentLogger() *slog.Logger { return slog.New(slog.NewTextHandler(io.Discard, nil)) }

// --- fake doubles -----------------------------------------------------

// fakeRuntimeRecords is an in-memory, mutex-guarded stand-in for the
// runtime record store. The *Err fields, when non-nil, force the
// corresponding method to fail; upserts and updates capture every
// attempted mutation so tests can assert on them afterwards.
type fakeRuntimeRecords struct {
	mu              sync.Mutex
	stored          map[string]runtime.RuntimeRecord
	getErr          error
	upsertErr       error
	updateStatusErr error
	listErr         error
	upserts         []runtime.RuntimeRecord
	updates         []ports.UpdateStatusInput
}

// newFakeRuntimeRecords returns an empty store ready for use.
func newFakeRuntimeRecords() *fakeRuntimeRecords {
	return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}}
}

// Set seeds the store with records, keyed by GameID (test setup helper).
func (s *fakeRuntimeRecords) Set(records ...runtime.RuntimeRecord) {
	s.mu.Lock()
	defer s.mu.Unlock()
	for _, record := range records {
		s.stored[record.GameID] = record
	}
}

// Get returns the record for gameID, getErr if injected, or
// runtime.ErrNotFound when the game is absent.
func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.getErr != nil {
		return runtime.RuntimeRecord{}, s.getErr
	}
	record, ok := s.stored[gameID]
	if !ok {
		return runtime.RuntimeRecord{}, runtime.ErrNotFound
	}
	return record, nil
}

// Upsert stores the record and remembers it in upserts for assertions.
func (s *fakeRuntimeRecords) Upsert(_ context.Context, record runtime.RuntimeRecord) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.upsertErr != nil {
		return s.upsertErr
	}
	s.upserts = append(s.upserts, record)
	s.stored[record.GameID] = record
	return nil
}

// UpdateStatus performs a compare-and-swap status transition, mimicking
// the real store: it checks ExpectedFrom and (when set)
// ExpectedContainerID, returning runtime.ErrConflict on mismatch.
// NOTE: the attempt is appended to updates *before* the injected-error
// check, so tests observe attempted CAS calls even when the store is
// configured to fail.
func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, input ports.UpdateStatusInput) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.updates = append(s.updates, input)
	if s.updateStatusErr != nil {
		return s.updateStatusErr
	}
	record, ok := s.stored[input.GameID]
	if !ok {
		return runtime.ErrNotFound
	}
	if record.Status != input.ExpectedFrom {
		return runtime.ErrConflict
	}
	if input.ExpectedContainerID != "" && record.CurrentContainerID != input.ExpectedContainerID {
		return runtime.ErrConflict
	}
	record.Status = input.To
	record.LastOpAt = input.Now
	// Side effects of terminal transitions mirror the real store:
	// stopped stamps StoppedAt; removed stamps RemovedAt and clears
	// the container id.
	switch input.To {
	case runtime.StatusStopped:
		t := input.Now
		record.StoppedAt = &t
	case runtime.StatusRemoved:
		t := input.Now
		record.RemovedAt = &t
		record.CurrentContainerID = ""
	}
	s.stored[input.GameID] = record
	return nil
}

// List is not exercised by the reconciler; it fails loudly if called.
func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
	return nil, errors.New("not used in reconciler tests")
}

// ListByStatus filters stored records by status, or fails with listErr
// when injected.
func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, status runtime.Status) ([]runtime.RuntimeRecord, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.listErr != nil {
		return nil, s.listErr
	}
	var out []runtime.RuntimeRecord
	for _, record := range s.stored {
		if record.Status == status {
			out = append(out, record)
		}
	}
	return out, nil
}

// Upserts returns a snapshot copy of recorded upserts.
func (s *fakeRuntimeRecords) Upserts() []runtime.RuntimeRecord {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]runtime.RuntimeRecord, len(s.upserts))
	copy(out, s.upserts)
	return out
}

// Updates returns a snapshot copy of recorded UpdateStatus attempts.
func (s *fakeRuntimeRecords) Updates() []ports.UpdateStatusInput {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]ports.UpdateStatusInput, len(s.updates))
	copy(out, s.updates)
	return out
}

// fakeOperationLogs records appended operation_log entries; appendErr,
// when set, makes Append fail without recording.
type fakeOperationLogs struct {
	mu        sync.Mutex
	appendErr error
	appends   []operation.OperationEntry
}

// Append stores the entry and returns its 1-based sequence number.
func (s *fakeOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.appendErr != nil {
		return 0, s.appendErr
	}
	s.appends = append(s.appends, entry)
	return int64(len(s.appends)), nil
}

// ListByGame is not exercised by the reconciler; it fails loudly if called.
func (s *fakeOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) {
	return nil, errors.New("not used in reconciler tests")
}

// Appends returns a snapshot copy of recorded entries.
func (s *fakeOperationLogs) Appends() []operation.OperationEntry {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]operation.OperationEntry, len(s.appends))
	copy(out, s.appends)
	return out
}

// fakeHealthEvents records published health-event envelopes; publishErr,
// when set, makes Publish fail without recording.
type fakeHealthEvents struct {
	mu         sync.Mutex
	publishErr error
	published  []ports.HealthEventEnvelope
}

// Publish stores the envelope for later assertions.
func (s *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.publishErr != nil {
		return s.publishErr
	}
	s.published = append(s.published, envelope)
	return nil
}

// Published returns a snapshot copy of recorded envelopes.
func (s *fakeHealthEvents) Published() []ports.HealthEventEnvelope {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]ports.HealthEventEnvelope, len(s.published))
	copy(out, s.published)
	return out
}

// fakeLeases is a scripted per-game lease store: acquired controls
// whether TryAcquire succeeds, and every acquire/release attempt is
// recorded as "gameID:token" strings.
type fakeLeases struct {
	mu         sync.Mutex
	acquired   bool
	acquireErr error
	releaseErr error
	acquires   []string
	releases   []string
}

// TryAcquire records the attempt (even when erroring) and returns the
// scripted result.
func (l *fakeLeases) TryAcquire(_ context.Context, gameID, token string, _ time.Duration) (bool, error) {
	l.mu.Lock()
	defer l.mu.Unlock()
	l.acquires = append(l.acquires, gameID+":"+token)
	if l.acquireErr != nil {
		return false, l.acquireErr
	}
	return l.acquired, nil
}

// Release records the attempt and returns the scripted error (nil by default).
func (l *fakeLeases) Release(_ context.Context, gameID, token string) error {
	l.mu.Lock()
	defer l.mu.Unlock()
	l.releases = append(l.releases, gameID+":"+token)
	return l.releaseErr
}

// Acquires returns a snapshot copy of recorded acquire attempts.
func (l *fakeLeases) Acquires() []string {
	l.mu.Lock()
	defer l.mu.Unlock()
	out := make([]string, len(l.acquires))
	copy(out, l.acquires)
	return out
}

// Releases returns a snapshot copy of recorded release attempts.
func (l *fakeLeases) Releases() []string {
	l.mu.Lock()
	defer l.mu.Unlock()
	out := make([]string, len(l.releases))
	copy(out, l.releases)
	return out
}

// --- harness ----------------------------------------------------------

// harness bundles the mock Docker client, all fake stores, and a frozen
// clock value so each test can wire a Reconciler with one call.
type harness struct {
	docker        *mocks.MockDockerClient
	records       *fakeRuntimeRecords
	operationLogs *fakeOperationLogs
	healthEvents  *fakeHealthEvents
	leases        *fakeLeases
	telemetry     *telemetry.Runtime
	now           time.Time
}

// newHarness constructs the default harness: lease acquisition succeeds
// and "now" is pinned to a fixed UTC instant for deterministic timestamps.
func newHarness(t *testing.T) *harness {
	t.Helper()
	ctrl := gomock.NewController(t)
	t.Cleanup(ctrl.Finish)
	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)
	return &harness{
		docker:        mocks.NewMockDockerClient(ctrl),
		records:       newFakeRuntimeRecords(),
		operationLogs: &fakeOperationLogs{},
		healthEvents:  &fakeHealthEvents{},
		leases:        &fakeLeases{acquired: true},
		telemetry:     telemetryRuntime,
		now:           time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC),
	}
}

// build wires a Reconciler from the harness with fixed config, a frozen
// clock reading h.now, a silent logger, and a constant lease token
// ("token-A") so lease assertions are deterministic.
func (h *harness) build(t *testing.T) *reconcile.Reconciler {
	t.Helper()
	r, err := reconcile.NewReconciler(reconcile.Dependencies{
		Docker:         h.docker,
		RuntimeRecords: h.records,
		OperationLogs:  h.operationLogs,
		HealthEvents:   h.healthEvents,
		Leases:         h.leases,
		Telemetry:      h.telemetry,
		DockerCfg: config.DockerConfig{
			Host:       "unix:///var/run/docker.sock",
			Network:    "galaxy-net",
			LogDriver:  "json-file",
			PullPolicy: config.ImagePullPolicyIfMissing,
		},
		ContainerCfg: config.ContainerConfig{
			DefaultCPUQuota:      1.0,
			DefaultMemory:        "512m",
			DefaultPIDsLimit:     512,
			StopTimeout:          30 * time.Second,
			Retention:            30 * 24 * time.Hour,
			EngineStateMountPath: "/var/lib/galaxy-game",
			EngineStateEnvName:   "GAME_STATE_PATH",
			GameStateDirMode:     0o750,
			GameStateRoot:        "/var/lib/galaxy/games",
		},
		Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
		Interval:     50 * time.Millisecond,
		Clock:        func() time.Time { return h.now },
		Logger:       silentLogger(),
		NewToken:     func() string { return "token-A" },
	})
	require.NoError(t, err)
	return r
}

// runningRecord builds a baseline runtime record in `running` state.
func runningRecord(gameID, containerID string, startedAt time.Time) runtime.RuntimeRecord {
	return runtime.RuntimeRecord{
		GameID:             gameID,
		Status:             runtime.StatusRunning,
		CurrentContainerID: containerID,
		CurrentImageRef:    "galaxy/game:1.0.0",
		EngineEndpoint:     "http://galaxy-game-" + gameID + ":8080",
		StatePath:          "/var/lib/galaxy/games/" + gameID,
		DockerNetwork:      "galaxy-net",
		StartedAt:          &startedAt,
		LastOpAt:           startedAt,
		CreatedAt:          startedAt,
	}
}

// ownedSummary builds a Docker container summary carrying the ownership
// labels the reconciler filters on. A startedAtMs of 0 omits the
// started-at label, exercising the inspect fallback path.
func ownedSummary(gameID, containerID, imageRef, status string, startedAtMs int64) ports.ContainerSummary {
	labels := map[string]string{
		startruntime.LabelOwner:          startruntime.LabelOwnerValue,
		startruntime.LabelKind:           startruntime.LabelKindValue,
		startruntime.LabelGameID:         gameID,
		startruntime.LabelEngineImageRef: imageRef,
	}
	if startedAtMs > 0 {
		labels[startruntime.LabelStartedAtMs] = strconv.FormatInt(startedAtMs, 10)
	}
	return ports.ContainerSummary{
		ID:        containerID,
		ImageRef:  imageRef,
		Hostname:  "galaxy-game-" + gameID,
		Labels:    labels,
		Status:    status,
		StartedAt: time.UnixMilli(startedAtMs).UTC(),
	}
}

// --- constructor ------------------------------------------------------

// TestNewReconcilerRejectsMissingDeps walks a sequence of Dependencies
// values, each missing one more required field, and checks that every
// incomplete set is rejected while the complete set is accepted.
func TestNewReconcilerRejectsMissingDeps(t *testing.T) {
	ctrl := gomock.NewController(t)
	t.Cleanup(ctrl.Finish)
	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)
	dockerCfg := config.DockerConfig{
		Host:       "unix:///var/run/docker.sock",
		Network:    "galaxy-net",
		LogDriver:  "json-file",
		PullPolicy: config.ImagePullPolicyIfMissing,
	}
	containerCfg := config.ContainerConfig{
		DefaultCPUQuota:      1.0,
		DefaultMemory:        "512m",
		DefaultPIDsLimit:     512,
		StopTimeout:          30 * time.Second,
		Retention:            30 * 24 * time.Hour,
		EngineStateMountPath: "/var/lib/galaxy-game",
		EngineStateEnvName:   "GAME_STATE_PATH",
		GameStateDirMode:     0o750,
		GameStateRoot:        "/var/lib/galaxy/games",
	}
	coord := config.CoordinationConfig{GameLeaseTTL: time.Minute}
	base := reconcile.Dependencies{
		Docker:         mocks.NewMockDockerClient(ctrl),
		RuntimeRecords: newFakeRuntimeRecords(),
		OperationLogs:  &fakeOperationLogs{},
		HealthEvents:   &fakeHealthEvents{},
		Leases:         &fakeLeases{acquired: true},
		Telemetry:      telemetryRuntime,
		DockerCfg:      dockerCfg,
		ContainerCfg:   containerCfg,
		Coordination:   coord,
		Interval:       time.Second,
	}
	// Each case adds back one dependency; all remain incomplete.
	defectives := []reconcile.Dependencies{
		{},
		{Docker: base.Docker},
		{Docker: base.Docker, RuntimeRecords: base.RuntimeRecords},
		{Docker: base.Docker, RuntimeRecords: base.RuntimeRecords, OperationLogs: base.OperationLogs},
		{Docker: base.Docker, RuntimeRecords: base.RuntimeRecords, OperationLogs: base.OperationLogs, HealthEvents: base.HealthEvents},
		{Docker: base.Docker, RuntimeRecords: base.RuntimeRecords, OperationLogs: base.OperationLogs, HealthEvents: base.HealthEvents, Leases: base.Leases},
		{Docker: base.Docker, RuntimeRecords: base.RuntimeRecords, OperationLogs: base.OperationLogs, HealthEvents: base.HealthEvents, Leases: base.Leases, Telemetry: base.Telemetry},
	}
	for index, deps := range defectives {
		_, err := reconcile.NewReconciler(deps)
		require.Errorf(t, err, "case %d should fail", index)
	}
	_, err = reconcile.NewReconciler(base)
	require.NoError(t, err)
}

// --- adopt ------------------------------------------------------------

// TestReconcileAdoptInsertsRecord: an owned running container with no
// runtime record is adopted — record upserted from the container labels,
// an adopt operation logged, and the lease acquired then released.
func TestReconcileAdoptInsertsRecord(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)
	startedAt := time.Date(2026, 4, 28, 11, 30, 0, 0, time.UTC)
	summary := ownedSummary("game-a", "ctr-game-a", "galaxy/game:1.2.3", "running", startedAt.UnixMilli())
	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return([]ports.ContainerSummary{summary}, nil)
	r.Tick(context.Background())
	upserts := h.records.Upserts()
	require.Len(t, upserts, 1)
	got := upserts[0]
	assert.Equal(t, "game-a", got.GameID)
	assert.Equal(t, runtime.StatusRunning, got.Status)
	assert.Equal(t, "ctr-game-a", got.CurrentContainerID)
	assert.Equal(t, "galaxy/game:1.2.3", got.CurrentImageRef)
	assert.Equal(t, "http://galaxy-game-game-a:8080", got.EngineEndpoint)
	assert.Equal(t, "/var/lib/galaxy/games/game-a", got.StatePath)
	assert.Equal(t, "galaxy-net", got.DockerNetwork)
	require.NotNil(t, got.StartedAt)
	assert.True(t, got.StartedAt.Equal(startedAt))
	appends := h.operationLogs.Appends()
	require.Len(t, appends, 1)
	assert.Equal(t, operation.OpKindReconcileAdopt, appends[0].OpKind)
	assert.Equal(t, operation.OpSourceAutoReconcile, appends[0].OpSource)
	assert.Equal(t, operation.OutcomeSuccess, appends[0].Outcome)
	assert.Equal(t, "ctr-game-a", appends[0].ContainerID)
	assert.Equal(t, []string{"game-a:token-A"}, h.leases.Acquires())
	assert.Equal(t, []string{"game-a:token-A"}, h.leases.Releases())
	assert.Empty(t, h.healthEvents.Published(), "adopt does not publish health events")
}

// TestReconcileAdoptFallsBackToInspectStartedAtWhenLabelMissing: with no
// started-at label on the container, the adopted record's StartedAt is
// taken from the InspectContainer result instead.
func TestReconcileAdoptFallsBackToInspectStartedAtWhenLabelMissing(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)
	// startedAtMs == 0 -> ownedSummary omits the started-at label.
	summary := ownedSummary("game-b", "ctr-game-b", "galaxy/game:1.0.0", "running", 0)
	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return([]ports.ContainerSummary{summary}, nil)
	inspectStarted := time.Date(2026, 4, 28, 10, 0, 0, 0, time.UTC)
	h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-b").Return(ports.ContainerInspect{
		ID:        "ctr-game-b",
		StartedAt: inspectStarted,
		Status:    "running",
	}, nil)
	r.Tick(context.Background())
	upserts := h.records.Upserts()
	require.Len(t, upserts, 1)
	require.NotNil(t, upserts[0].StartedAt)
	assert.True(t, upserts[0].StartedAt.Equal(inspectStarted))
}

// TestReconcileAdoptSkipsWhenRecordAppearsConcurrently: if a record
// already exists for the game, the adopt path is never entered and no
// lease is taken.
func TestReconcileAdoptSkipsWhenRecordAppearsConcurrently(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)
	startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC)
	h.records.Set(runningRecord("game-c", "ctr-game-c", startedAt))
	// Docker reports the same game running, but the record already
	// exists (start service won the race). The list pass sees the
	// record, so adopt path is never entered.
	summary := ownedSummary("game-c", "ctr-game-c", "galaxy/game:1.0.0", "running", startedAt.UnixMilli())
	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return([]ports.ContainerSummary{summary}, nil)
	r.Tick(context.Background())
	assert.Empty(t, h.records.Upserts())
	assert.Empty(t, h.operationLogs.Appends())
	assert.Empty(t, h.leases.Acquires(), "no mutation -> no lease acquired")
}

// TestReconcileAdoptSkipsNonRunningContainer: an owned but exited
// container without a record is left alone.
func TestReconcileAdoptSkipsNonRunningContainer(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)
	summary := ownedSummary("game-d", "ctr-game-d", "galaxy/game:1.0.0", "exited", time.Now().UnixMilli())
	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return([]ports.ContainerSummary{summary}, nil)
	r.Tick(context.Background())
	assert.Empty(t, h.records.Upserts(), "exited container without record is not adopted")
	assert.Empty(t, h.leases.Acquires())
}

// --- dispose ----------------------------------------------------------

// TestReconcileDisposeMarksRemoved: a `running` record whose container
// has vanished from Docker is CAS-transitioned to `removed`, a
// container-disappeared health event is published, and a dispose
// operation is logged.
func TestReconcileDisposeMarksRemoved(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)
	startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC)
	h.records.Set(runningRecord("game-e", "ctr-game-e", startedAt))
	// Docker reports no containers at all.
	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return(nil, nil)
	r.Tick(context.Background())
	updates := h.records.Updates()
	require.Len(t, updates, 1)
	assert.Equal(t, "game-e", updates[0].GameID)
	assert.Equal(t, runtime.StatusRunning, updates[0].ExpectedFrom)
	assert.Equal(t, "ctr-game-e", updates[0].ExpectedContainerID)
	assert.Equal(t, runtime.StatusRemoved, updates[0].To)
	published := h.healthEvents.Published()
	require.Len(t, published, 1)
	assert.Equal(t, health.EventTypeContainerDisappeared, published[0].EventType)
	assert.Equal(t, "game-e", published[0].GameID)
	assert.Equal(t, "ctr-game-e", published[0].ContainerID)
	assert.JSONEq(t, `{}`, string(published[0].Details))
	appends := h.operationLogs.Appends()
	require.Len(t, appends, 1)
	assert.Equal(t, operation.OpKindReconcileDispose, appends[0].OpKind)
	assert.Equal(t, operation.OpSourceAutoReconcile, appends[0].OpSource)
}

// TestReconcileDisposeSkipsOnCASConflict: when the status CAS loses, no
// health event and no operation_log entry are produced.
func TestReconcileDisposeSkipsOnCASConflict(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)
	startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC)
	h.records.Set(runningRecord("game-f", "ctr-game-f", startedAt))
	h.records.updateStatusErr = runtime.ErrConflict
	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return(nil, nil)
	r.Tick(context.Background())
	assert.Empty(t, h.healthEvents.Published(), "no health event when CAS lost")
	assert.Empty(t, h.operationLogs.Appends(), "no operation_log entry when CAS lost")
}

// TestReconcileDisposeSkipsWhenStateChangedAfterReread: the record
// observed as `running` in the list pass has been concurrently moved to
// `stopped` by the time the reconciler re-reads it under the lease, so
// the dispose is skipped entirely.
func TestReconcileDisposeSkipsWhenStateChangedAfterReread(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)
	// Running record observed by ListByStatus, but Get under the lease
	// returns a record whose status has changed.
	startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC)
	listed := runningRecord("game-g", "ctr-game-g", startedAt)
	h.records.Set(listed)
	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return(nil, nil)
	// Mutate the stored record to simulate concurrent stop completing
	// between the list pass and the lease re-read. The fake's Get
	// observes the mutated state.
	h.records.mu.Lock()
	stoppedAt := startedAt.Add(time.Minute)
	listed.Status = runtime.StatusStopped
	listed.StoppedAt = &stoppedAt
	h.records.stored["game-g"] = listed
	h.records.mu.Unlock()
	r.Tick(context.Background())
	assert.Empty(t, h.records.Updates(), "re-read sees status != running -> skip")
	assert.Empty(t, h.healthEvents.Published())
	assert.Empty(t, h.operationLogs.Appends())
}

// --- observed_exited --------------------------------------------------

// TestReconcileObservedExitedMarksStopped: a `running` record whose
// container Docker reports as exited is CAS-transitioned to `stopped`
// and a container-exited health event carrying exit code and OOM flag is
// published — but no operation_log entry is written.
func TestReconcileObservedExitedMarksStopped(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)
	startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC)
	h.records.Set(runningRecord("game-h", "ctr-game-h", startedAt))
	summary := ownedSummary("game-h", "ctr-game-h", "galaxy/game:1.0.0", "exited", startedAt.UnixMilli())
	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return([]ports.ContainerSummary{summary}, nil)
	h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-h").Return(ports.ContainerInspect{
		ID:        "ctr-game-h",
		Status:    "exited",
		ExitCode:  137,
		OOMKilled: false,
	}, nil)
	r.Tick(context.Background())
	updates := h.records.Updates()
	require.Len(t, updates, 1)
	assert.Equal(t, runtime.StatusRunning, updates[0].ExpectedFrom)
	assert.Equal(t, "ctr-game-h", updates[0].ExpectedContainerID)
	assert.Equal(t, runtime.StatusStopped, updates[0].To)
	published := h.healthEvents.Published()
	require.Len(t, published, 1)
	assert.Equal(t, health.EventTypeContainerExited, published[0].EventType)
	var details struct {
		ExitCode int  `json:"exit_code"`
		OOM      bool `json:"oom"`
	}
	require.NoError(t, json.Unmarshal(published[0].Details, &details))
	assert.Equal(t, 137, details.ExitCode)
	assert.False(t, details.OOM)
	assert.Empty(t, h.operationLogs.Appends(), "observed_exited writes no operation_log entry")
}

// --- no-op paths ------------------------------------------------------

// TestReconcileNoDriftIsNoop: record and container agree, so the tick
// produces no mutations, no events, and no lease activity.
func TestReconcileNoDriftIsNoop(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)
	startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC)
	h.records.Set(runningRecord("game-i", "ctr-game-i", startedAt))
	summary := ownedSummary("game-i", "ctr-game-i", "galaxy/game:1.0.0", "running", startedAt.UnixMilli())
	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return([]ports.ContainerSummary{summary}, nil)
	r.Tick(context.Background())
	assert.Empty(t, h.records.Upserts())
	assert.Empty(t, h.records.Updates())
	assert.Empty(t, h.healthEvents.Published())
	assert.Empty(t, h.operationLogs.Appends())
	assert.Empty(t, h.leases.Acquires())
}

// TestReconcileSkipsWhenContainerIDMismatch: record points at the old
// container while Docker reports a new one (restart in flight); the
// reconciler does not interfere.
func TestReconcileSkipsWhenContainerIDMismatch(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)
	startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC)
	h.records.Set(runningRecord("game-j", "ctr-old", startedAt))
	// Docker reports the new container id; restart is in flight.
	summary := ownedSummary("game-j", "ctr-new", "galaxy/game:1.0.0", "running", startedAt.UnixMilli())
	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return([]ports.ContainerSummary{summary}, nil)
	r.Tick(context.Background())
	assert.Empty(t, h.records.Updates(), "id mismatch -> reconciler stays out of the way")
	assert.Empty(t, h.healthEvents.Published())
}

// --- lease busy / errors ----------------------------------------------

// TestReconcileLeaseConflictSkipsGame: when the per-game lease cannot be
// acquired, the dispose is skipped and Release is never called.
func TestReconcileLeaseConflictSkipsGame(t *testing.T) {
	h := newHarness(t)
	h.leases.acquired = false
	r := h.build(t)
	startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC)
	h.records.Set(runningRecord("game-k", "ctr-game-k", startedAt))
	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return(nil, nil)
	r.Tick(context.Background())
	assert.Empty(t, h.records.Updates(), "lease busy -> dispose skipped")
	assert.Empty(t, h.healthEvents.Published())
	assert.Empty(t, h.leases.Releases(), "release not called when acquire returned false")
}

// TestReconcileNowAbsorbsListError: a Docker list failure is swallowed
// (ReconcileNow still returns nil) and nothing is mutated.
func TestReconcileNowAbsorbsListError(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)
	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return(nil, errors.New("docker daemon down"))
	require.NoError(t, r.ReconcileNow(context.Background()))
	assert.Empty(t, h.records.Updates())
	assert.Empty(t, h.records.Upserts())
}

// TestReconcileNowAbsorbsRecordsListError: a record-store listing failure
// is likewise absorbed rather than surfaced.
func TestReconcileNowAbsorbsRecordsListError(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)
	h.records.listErr = errors.New("pg down")
	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return(nil, nil)
	require.NoError(t, r.ReconcileNow(context.Background()))
}

// TestReconcileNowReturnsContextError: context cancellation IS surfaced,
// unlike downstream store/daemon errors.
func TestReconcileNowReturnsContextError(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)
	ctx, cancel := context.WithCancel(context.Background())
	cancel()
	require.ErrorIs(t, r.ReconcileNow(ctx), context.Canceled)
}

// --- Run lifecycle ----------------------------------------------------

// TestRunRespectsContextCancel: Run exits with context.Canceled shortly
// after the context is cancelled.
func TestRunRespectsContextCancel(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)
	ctx, cancel := context.WithCancel(context.Background())
	done := make(chan error, 1)
	go func() { done <- r.Run(ctx) }()
	cancel()
	select {
	case err := <-done:
		assert.ErrorIs(t, err, context.Canceled)
	case <-time.After(time.Second):
		t.Fatalf("Run did not exit after cancel")
	}
}

// TestShutdownIsNoOp: Shutdown succeeds without any prior Run.
func TestShutdownIsNoOp(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)
	require.NoError(t, r.Shutdown(context.Background()))
}

// --- compile-time safety ----------------------------------------------

// Compile-time checks that the fakes satisfy the ports interfaces the
// reconciler depends on.
var (
	_ ports.RuntimeRecordStore   = (*fakeRuntimeRecords)(nil)
	_ ports.OperationLogStore    = (*fakeOperationLogs)(nil)
	_ ports.HealthEventPublisher = (*fakeHealthEvents)(nil)
	_ ports.GameLeaseStore       = (*fakeLeases)(nil)
)