package startjobsconsumer_test

import (
	"context"
	"errors"
	"io"
	"log/slog"
	"strconv"
	"sync"
	"testing"
	"time"

	"galaxy/notificationintent"
	"galaxy/rtmanager/internal/adapters/docker/mocks"
	"galaxy/rtmanager/internal/adapters/jobresultspublisher"
	"galaxy/rtmanager/internal/adapters/redisstate/streamoffsets"
	"galaxy/rtmanager/internal/config"
	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/startruntime"
	"galaxy/rtmanager/internal/telemetry"
	"galaxy/rtmanager/internal/worker/startjobsconsumer"

	"github.com/alicebob/miniredis/v2"
	"github.com/redis/go-redis/v9"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"go.uber.org/mock/gomock"
)

// silentLogger returns a logger that discards all output, keeping test logs quiet.
func silentLogger() *slog.Logger {
	return slog.New(slog.NewTextHandler(io.Discard, nil))
}

// fakeStartService is a thread-safe test double for the start service port.
// It records every input it receives and answers with a fixed result/error,
// or delegates to hook when one is set.
type fakeStartService struct {
	mu     sync.Mutex
	inputs []startruntime.Input
	result startruntime.Result
	err    error
	hook   func(input startruntime.Input) (startruntime.Result, error)
}

// Handle records the input and returns the configured (or hook-provided) result.
func (s *fakeStartService) Handle(_ context.Context, input startruntime.Input) (startruntime.Result, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.inputs = append(s.inputs, input)
	if s.hook != nil {
		return s.hook(input)
	}
	return s.result, s.err
}

// Inputs returns a copy of all inputs seen so far.
func (s *fakeStartService) Inputs() []startruntime.Input {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]startruntime.Input, len(s.inputs))
	copy(out, s.inputs)
	return out
}

// fakeJobResults is a thread-safe test double for the job-results publisher port.
type fakeJobResults struct {
	mu         sync.Mutex
	published  []ports.JobResult
	publishErr error
}

// Publish records the result, or fails with publishErr when one is configured.
func (s *fakeJobResults) Publish(_ context.Context, result ports.JobResult) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.publishErr != nil {
		return s.publishErr
	}
	s.published = append(s.published, result)
	return nil
}

// Published returns a copy of all results published so far.
func (s *fakeJobResults) Published() []ports.JobResult {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]ports.JobResult, len(s.published))
	copy(out, s.published)
	return out
}

// fakeOffsetStore is an in-memory, thread-safe stream-offset store with
// injectable load/save failures.
type fakeOffsetStore struct {
	mu      sync.Mutex
	offsets map[string]string
	loadErr error
	saveErr error
}

func newFakeOffsetStore() *fakeOffsetStore {
	return &fakeOffsetStore{offsets: map[string]string{}}
}

// Load returns the stored entry id for label, or ok=false when absent.
func (s *fakeOffsetStore) Load(_ context.Context, label string) (string, bool, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.loadErr != nil {
		return "", false, s.loadErr
	}
	value, ok := s.offsets[label]
	return value, ok, nil
}

// Save persists entryID under label, or fails with saveErr when configured.
func (s *fakeOffsetStore) Save(_ context.Context, label, entryID string) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.saveErr != nil {
		return s.saveErr
	}
	s.offsets[label] = entryID
	return nil
}

// Get is a test-side accessor for the persisted offset under label.
func (s *fakeOffsetStore) Get(label string) (string, bool) {
	s.mu.Lock()
	defer s.mu.Unlock()
	value, ok := s.offsets[label]
	return value, ok
}

// harness bundles a consumer wired to fakes plus the miniredis backing it.
type harness struct {
	consumer *startjobsconsumer.Consumer
	starts   *fakeStartService
	results  *fakeJobResults
	offsets  *fakeOffsetStore
	stream   string
	server   *miniredis.Miniredis
	client   *redis.Client
}

// newHarness builds a consumer against a fresh miniredis and fake collaborators.
func newHarness(t *testing.T) *harness {
	t.Helper()
	server := miniredis.RunT(t)
	client := redis.NewClient(&redis.Options{Addr: server.Addr()})
	t.Cleanup(func() { _ = client.Close() })

	starts := &fakeStartService{}
	results := &fakeJobResults{}
	offsets := newFakeOffsetStore()
	stream := "runtime:start_jobs"

	consumer, err := startjobsconsumer.NewConsumer(startjobsconsumer.Config{
		Client:       client,
		Stream:       stream,
		BlockTimeout: 50 * time.Millisecond,
		StartService: starts,
		JobResults:   results,
		OffsetStore:  offsets,
		Logger:       silentLogger(),
	})
	require.NoError(t, err)

	return &harness{
		consumer: consumer,
		starts:   starts,
		results:  results,
		offsets:  offsets,
		stream:   stream,
		server:   server,
		client:   client,
	}
}

// startMessage builds a well-formed start-job stream entry in the documented wire shape.
func startMessage(id, gameID, imageRef string, requestedAtMS int64) redis.XMessage {
	return redis.XMessage{
		ID: id,
		Values: map[string]any{
			"game_id":         gameID,
			"image_ref":       imageRef,
			"requested_at_ms": strconv.FormatInt(requestedAtMS, 10),
		},
	}
}

func TestNewConsumerRejectsMissingDeps(t *testing.T) {
	server := miniredis.RunT(t)
	client := redis.NewClient(&redis.Options{Addr: server.Addr()})
	t.Cleanup(func() { _ = client.Close() })

	// Each case omits the next required dependency in turn.
	cases := []startjobsconsumer.Config{
		{},
		{Client: client},
		{Client: client, Stream: "runtime:start_jobs"},
		{Client: client, Stream: "runtime:start_jobs", BlockTimeout: time.Second},
		{Client: client, Stream: "runtime:start_jobs", BlockTimeout: time.Second, StartService: &fakeStartService{}},
		{Client: client, Stream: "runtime:start_jobs", BlockTimeout: time.Second, StartService: &fakeStartService{}, JobResults: &fakeJobResults{}},
	}
	for index, cfg := range cases {
		_, err := startjobsconsumer.NewConsumer(cfg)
		require.Errorf(t, err, "case %d should fail", index)
	}
}

func TestHandleMessageSuccessPublishesSuccessResult(t *testing.T) {
	h := newHarness(t)
	h.starts.result = startruntime.Result{
		Record: runtime.RuntimeRecord{
			GameID:             "game-1",
			Status:             runtime.StatusRunning,
			CurrentContainerID: "c-1",
			EngineEndpoint:     "http://galaxy-game-game-1:8080",
		},
		Outcome: operation.OutcomeSuccess,
	}

	h.consumer.HandleMessage(context.Background(), startMessage("100-0", "game-1", "galaxy/game:1.0.0", 1700))

	inputs := h.starts.Inputs()
	require.Len(t, inputs, 1)
	assert.Equal(t, "game-1", inputs[0].GameID)
	assert.Equal(t, "galaxy/game:1.0.0", inputs[0].ImageRef)
	assert.Equal(t, operation.OpSourceLobbyStream, inputs[0].OpSource)
	assert.Equal(t, "100-0", inputs[0].SourceRef)

	published := h.results.Published()
	require.Len(t, published, 1)
	assert.Equal(t, ports.JobResult{
		GameID:         "game-1",
		Outcome:        ports.JobOutcomeSuccess,
		ContainerID:    "c-1",
		EngineEndpoint: "http://galaxy-game-game-1:8080",
	}, published[0])
}

func TestHandleMessageFailurePublishesFailureResult(t *testing.T) {
	h := newHarness(t)
	h.starts.result = startruntime.Result{
		Outcome:      operation.OutcomeFailure,
		ErrorCode:    startruntime.ErrorCodeImagePullFailed,
		ErrorMessage: "manifest unknown",
	}

	h.consumer.HandleMessage(context.Background(), startMessage("101-0", "game-2", "galaxy/game:bad", 1700))

	published := h.results.Published()
	require.Len(t, published, 1)
	assert.Equal(t, ports.JobResult{
		GameID:       "game-2",
		Outcome:      ports.JobOutcomeFailure,
		ErrorCode:    "image_pull_failed",
		ErrorMessage: "manifest unknown",
	}, published[0])
}

func TestHandleMessageReplayNoOpKeepsContainerAndEndpoint(t *testing.T) {
	h := newHarness(t)
	h.starts.result = startruntime.Result{
		Record: runtime.RuntimeRecord{
			GameID:             "game-3",
			Status:             runtime.StatusRunning,
			CurrentContainerID: "c-3",
			EngineEndpoint:     "http://galaxy-game-game-3:8080",
		},
		Outcome:   operation.OutcomeSuccess,
		ErrorCode: startruntime.ErrorCodeReplayNoOp,
	}

	h.consumer.HandleMessage(context.Background(), startMessage("102-0", "game-3", "galaxy/game:1.0.0", 1700))

	published := h.results.Published()
	require.Len(t, published, 1)
	assert.Equal(t, ports.JobResult{
		GameID:         "game-3",
		Outcome:        ports.JobOutcomeSuccess,
		ContainerID:    "c-3",
		EngineEndpoint: "http://galaxy-game-game-3:8080",
		ErrorCode:      "replay_no_op",
	}, published[0])
}

func TestHandleMessageMalformedEnvelopesAreAbsorbed(t *testing.T) {
	h := newHarness(t)

	// Missing, blank, or non-numeric fields must all be dropped silently.
	cases := []redis.XMessage{
		{ID: "200-0", Values: map[string]any{"image_ref": "galaxy/game:1.0.0", "requested_at_ms": "1"}},
		{ID: "200-1", Values: map[string]any{"game_id": " ", "image_ref": "galaxy/game:1.0.0", "requested_at_ms": "1"}},
		{ID: "200-2", Values: map[string]any{"game_id": "game-x", "requested_at_ms": "1"}},
		{ID: "200-3", Values: map[string]any{"game_id": "game-x", "image_ref": " ", "requested_at_ms": "1"}},
		{ID: "200-4", Values: map[string]any{"game_id": "game-x", "image_ref": "galaxy/game:1.0.0", "requested_at_ms": "not-a-number"}},
	}
	for _, msg := range cases {
		h.consumer.HandleMessage(context.Background(), msg)
	}

	assert.Empty(t, h.starts.Inputs(), "malformed envelopes must not reach the start service")
	assert.Empty(t, h.results.Published(), "malformed envelopes must not produce job results")
}

func TestHandleMessagePublishFailureIsAbsorbed(t *testing.T) {
	h := newHarness(t)
	h.starts.result = startruntime.Result{Outcome: operation.OutcomeFailure, ErrorCode: "internal_error"}
	h.results.publishErr = errors.New("redis transient")

	h.consumer.HandleMessage(context.Background(), startMessage("300-0", "game-x", "galaxy/game:1.0.0", 1700))

	require.Len(t, h.starts.Inputs(), 1, "service still runs even when publish fails")
}

func TestHandleMessageGoLevelErrorIsAbsorbed(t *testing.T) {
	h := newHarness(t)
	h.starts.err = errors.New("nil ctx")

	h.consumer.HandleMessage(context.Background(), startMessage("400-0", "game-y", "galaxy/game:1.0.0", 1700))

	assert.Empty(t, h.results.Published(), "go-level service errors must not surface as job results")
}

func TestRunAdvancesOffsetPerMessage(t *testing.T) {
	h := newHarness(t)
	h.starts.result = startruntime.Result{
		Record: runtime.RuntimeRecord{
			GameID:             "game-5",
			Status:             runtime.StatusRunning,
			CurrentContainerID: "c-5",
			EngineEndpoint:     "http://galaxy-game-game-5:8080",
		},
		Outcome: operation.OutcomeSuccess,
	}

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	done := make(chan error, 1)
	go func() { done <- h.consumer.Run(ctx) }()

	mustXAdd(t, h.client, h.stream, "game-5", "galaxy/game:1.0.0", 1)
	mustXAdd(t, h.client, h.stream, "game-5", "galaxy/game:1.0.0", 2)

	require.Eventually(t, func() bool {
		return len(h.results.Published()) == 2
	}, time.Second, 10*time.Millisecond, "consumer must produce one job result per envelope")

	cancel()
	require.Eventually(t, func() bool {
		select {
		case <-done:
			return true
		default:
			return false
		}
	}, time.Second, 10*time.Millisecond, "Run must exit after context cancel")

	id, ok := h.offsets.Get("startjobs")
	require.True(t, ok, "offset must be persisted after the run loop processed messages")
	assert.NotEmpty(t, id, "offset entry id must not be empty")
}

func TestRunResumesFromPersistedOffset(t *testing.T) {
	h := newHarness(t)
	h.starts.result = startruntime.Result{
		Record: runtime.RuntimeRecord{
			GameID:             "game-6",
			Status:             runtime.StatusRunning,
			CurrentContainerID: "c-6",
			EngineEndpoint:     "http://galaxy-game-game-6:8080",
		},
		Outcome: operation.OutcomeSuccess,
	}

	// Seed the stream and persist its entry id as the consumer's offset
	// before Run starts, so the consumer must resume strictly after it.
	preID := mustXAdd(t, h.client, h.stream, "game-6", "galaxy/game:1.0.0", 1)
	require.NoError(t, h.offsets.Save(context.Background(), "startjobs", preID))

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	done := make(chan error, 1)
	go func() { done <- h.consumer.Run(ctx) }()

	mustXAdd(t, h.client, h.stream, "game-6", "galaxy/game:1.0.0", 2)

	require.Eventually(t, func() bool {
		return len(h.results.Published()) == 1
	}, time.Second, 10*time.Millisecond, "consumer must skip the pre-existing entry and process only the new one")

	cancel()
	<-done
}

func TestRunExitsImmediatelyOnAlreadyCancelledContext(t *testing.T) {
	h := newHarness(t)
	ctx, cancel := context.WithCancel(context.Background())
	cancel()

	err := h.consumer.Run(ctx)

	require.ErrorIs(t, err, context.Canceled)
	assert.Empty(t, h.starts.Inputs())
	assert.Empty(t, h.results.Published())
}

// mustXAdd appends a well-formed start-job envelope to stream and returns its entry id.
func mustXAdd(t *testing.T, client *redis.Client, stream, gameID, imageRef string, requestedAtMS int64) string {
	t.Helper()
	id, err := client.XAdd(context.Background(), &redis.XAddArgs{
		Stream: stream,
		Values: map[string]any{
			"game_id":         gameID,
			"image_ref":       imageRef,
			"requested_at_ms": strconv.FormatInt(requestedAtMS, 10),
		},
	}).Result()
	require.NoError(t, err)
	return id
}

// --- in-memory fakes for the roundtrip integration test ----------------------

// memoryRecords is a map-backed runtime-record store; only Get/Upsert are
// exercised by the start flow, the remaining methods fail loudly if reached.
type memoryRecords struct {
	mu    sync.Mutex
	store map[string]runtime.RuntimeRecord
}

func newMemoryRecords() *memoryRecords {
	return &memoryRecords{store: map[string]runtime.RuntimeRecord{}}
}

func (s *memoryRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	record, ok := s.store[gameID]
	if !ok {
		return runtime.RuntimeRecord{}, runtime.ErrNotFound
	}
	return record, nil
}

func (s *memoryRecords) Upsert(_ context.Context, record runtime.RuntimeRecord) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.store[record.GameID] = record
	return nil
}

func (s *memoryRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error {
	return errors.New("not used in start integration test")
}

func (s *memoryRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) {
	return nil, errors.New("not used in start integration test")
}

func (s *memoryRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
	return nil, errors.New("not used in start integration test")
}

// memoryOperationLogs appends entries in memory and returns their 1-based sequence.
type memoryOperationLogs struct {
	mu      sync.Mutex
	entries []operation.OperationEntry
}

func (s *memoryOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.entries = append(s.entries, entry)
	return int64(len(s.entries)), nil
}

func (s *memoryOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) {
	return nil, errors.New("not used in start integration test")
}

// memoryLeases always grants the lease; coordination is not under test here.
type memoryLeases struct{}

func (l *memoryLeases) TryAcquire(_ context.Context, _, _ string, _ time.Duration) (bool, error) {
	return true, nil
}

func (l *memoryLeases) Release(_ context.Context, _, _ string) error { return nil }

// memoryHealthEvents discards all health events.
type memoryHealthEvents struct{}

func (h *memoryHealthEvents) Publish(_ context.Context, _ ports.HealthEventEnvelope) error {
	return nil
}

// memoryNotifications discards all notification intents.
type memoryNotifications struct{}

func (n *memoryNotifications) Publish(_ context.Context, _ notificationintent.Intent) error {
	return nil
}

// TestRoundTripStartJobThroughRealServiceAndPublisher exercises the
// Lobby → RTM → Lobby contract end-to-end inside one process: an XADD
// in the documented `runtime:start_jobs` shape is consumed, the real
// `startruntime.Service` runs against an in-memory fake stack and a
// gomock-backed Docker port, the real `jobresultspublisher` writes to
// `runtime:job_results`, and the test asserts the symmetric wire shape.
//
// A second XADD of the same envelope must surface as
// `error_code=replay_no_op` per the AsyncAPI replay-safety rule.
func TestRoundTripStartJobThroughRealServiceAndPublisher(t *testing.T) {
	ctrl := gomock.NewController(t)
	t.Cleanup(ctrl.Finish)

	server := miniredis.RunT(t)
	client := redis.NewClient(&redis.Options{Addr: server.Addr()})
	t.Cleanup(func() { _ = client.Close() })

	now := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)
	records := newMemoryRecords()

	// Exactly one network/pull/inspect/run cycle is allowed; the replay
	// path below must not trigger a second round against Docker.
	dockerMock := mocks.NewMockDockerClient(ctrl)
	dockerMock.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil).Times(1)
	dockerMock.EXPECT().PullImage(gomock.Any(), "galaxy/game:1.0.0", ports.PullPolicy(config.ImagePullPolicyIfMissing)).Return(nil).Times(1)
	dockerMock.EXPECT().InspectImage(gomock.Any(), "galaxy/game:1.0.0").Return(ports.ImageInspect{
		Ref:    "galaxy/game:1.0.0",
		Labels: map[string]string{},
	}, nil).Times(1)
	dockerMock.EXPECT().Run(gomock.Any(), gomock.Any()).Return(ports.RunResult{
		ContainerID:    "ctr-roundtrip",
		EngineEndpoint: "http://galaxy-game-game-1:8080",
		StartedAt:      now,
	}, nil).Times(1)

	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)

	containerCfg := config.ContainerConfig{
		DefaultCPUQuota:      1.0,
		DefaultMemory:        "512m",
		DefaultPIDsLimit:     512,
		StopTimeout:          30 * time.Second,
		Retention:            30 * 24 * time.Hour,
		EngineStateMountPath: "/var/lib/galaxy-game",
		EngineStateEnvName:   "GAME_STATE_PATH",
		GameStateDirMode:     0o750,
		GameStateRoot:        "/var/lib/galaxy/games",
	}
	dockerCfg := config.DockerConfig{
		Host:       "unix:///var/run/docker.sock",
		Network:    "galaxy-net",
		LogDriver:  "json-file",
		PullPolicy: config.ImagePullPolicyIfMissing,
	}
	coordinationCfg := config.CoordinationConfig{GameLeaseTTL: time.Minute}

	startService, err := startruntime.NewService(startruntime.Dependencies{
		RuntimeRecords:  records,
		OperationLogs:   &memoryOperationLogs{},
		Docker:          dockerMock,
		Leases:          &memoryLeases{},
		HealthEvents:    &memoryHealthEvents{},
		Notifications:   &memoryNotifications{},
		Container:       containerCfg,
		DockerCfg:       dockerCfg,
		Coordination:    coordinationCfg,
		Telemetry:       telemetryRuntime,
		Logger:          silentLogger(),
		Clock:           func() time.Time { return now },
		NewToken:        func() string { return "token-roundtrip" },
		PrepareStateDir: func(_ string) (string, error) { return "/var/lib/galaxy/games/game-1", nil },
	})
	require.NoError(t, err)

	publisher, err := jobresultspublisher.NewPublisher(jobresultspublisher.Config{
		Client: client,
		Stream: "runtime:job_results",
	})
	require.NoError(t, err)

	offsetStore, err := streamoffsets.New(streamoffsets.Config{Client: client})
	require.NoError(t, err)

	consumer, err := startjobsconsumer.NewConsumer(startjobsconsumer.Config{
		Client:       client,
		Stream:       "runtime:start_jobs",
		BlockTimeout: 50 * time.Millisecond,
		StartService: startService,
		JobResults:   publisher,
		OffsetStore:  offsetStore,
		Logger:       silentLogger(),
	})
	require.NoError(t, err)

	ctx, cancel := context.WithCancel(context.Background())
	t.Cleanup(cancel)
	done := make(chan error, 1)
	go func() { done <- consumer.Run(ctx) }()

	mustXAdd(t, client, "runtime:start_jobs", "game-1", "galaxy/game:1.0.0", 1700)

	require.Eventually(t, func() bool {
		entries, err := client.XRange(ctx, "runtime:job_results", "-", "+").Result()
		return err == nil && len(entries) == 1
	}, 2*time.Second, 20*time.Millisecond, "first XADD must produce one job result entry")

	entries, err := client.XRange(ctx, "runtime:job_results", "-", "+").Result()
	require.NoError(t, err)
	require.Len(t, entries, 1)
	values := entries[0].Values
	assert.Equal(t, "game-1", values["game_id"])
	assert.Equal(t, "success", values["outcome"])
	assert.Equal(t, "ctr-roundtrip", values["container_id"])
	assert.Equal(t, "http://galaxy-game-game-1:8080", values["engine_endpoint"])
	assert.Equal(t, "", values["error_code"], "fresh start must publish empty error_code")
	assert.Equal(t, "", values["error_message"])

	// Replay: the same envelope must surface as success/replay_no_op
	// because the runtime record now reports `running` with the same
	// image_ref. The Docker mock has no further expectations, so a
	// second pull/run would fail the test.
	mustXAdd(t, client, "runtime:start_jobs", "game-1", "galaxy/game:1.0.0", 1701)

	require.Eventually(t, func() bool {
		entries, err := client.XRange(ctx, "runtime:job_results", "-", "+").Result()
		return err == nil && len(entries) == 2
	}, 2*time.Second, 20*time.Millisecond, "second XADD must produce a replay_no_op job result")

	entries, err = client.XRange(ctx, "runtime:job_results", "-", "+").Result()
	require.NoError(t, err)
	require.Len(t, entries, 2)
	replay := entries[1].Values
	assert.Equal(t, "game-1", replay["game_id"])
	assert.Equal(t, "success", replay["outcome"])
	assert.Equal(t, "ctr-roundtrip", replay["container_id"])
	assert.Equal(t, "http://galaxy-game-game-1:8080", replay["engine_endpoint"])
	assert.Equal(t, "replay_no_op", replay["error_code"])
	assert.Equal(t, "", replay["error_message"])

	cancel()
	select {
	case <-done:
	case <-time.After(time.Second):
		t.Fatal("consumer Run did not exit after context cancel")
	}
}