Files
galaxy-game/rtmanager/internal/worker/startjobsconsumer/consumer_test.go
T
2026-04-28 20:39:18 +02:00

632 lines
19 KiB
Go

package startjobsconsumer_test
import (
"context"
"errors"
"io"
"log/slog"
"strconv"
"sync"
"testing"
"time"
"galaxy/notificationintent"
"galaxy/rtmanager/internal/adapters/docker/mocks"
"galaxy/rtmanager/internal/adapters/jobresultspublisher"
"galaxy/rtmanager/internal/adapters/redisstate/streamoffsets"
"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/startruntime"
"galaxy/rtmanager/internal/telemetry"
"galaxy/rtmanager/internal/worker/startjobsconsumer"
"github.com/alicebob/miniredis/v2"
"github.com/redis/go-redis/v9"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/mock/gomock"
)
// silentLogger returns a logger that discards all output, keeping test runs
// quiet while still satisfying components that require a *slog.Logger.
func silentLogger() *slog.Logger {
	discard := slog.NewTextHandler(io.Discard, nil)
	return slog.New(discard)
}
// fakeStartService records every Input passed to Handle and answers with a
// canned (result, err) pair, or delegates to hook when one is installed.
type fakeStartService struct {
	mu     sync.Mutex
	inputs []startruntime.Input
	result startruntime.Result
	err    error
	hook   func(input startruntime.Input) (startruntime.Result, error)
}

// Handle captures the input under the mutex, then returns either the hook's
// answer or the configured static result/error.
func (s *fakeStartService) Handle(_ context.Context, input startruntime.Input) (startruntime.Result, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.inputs = append(s.inputs, input)
	if fn := s.hook; fn != nil {
		return fn(input)
	}
	return s.result, s.err
}

// Inputs returns a defensive copy of everything Handle has seen so far.
func (s *fakeStartService) Inputs() []startruntime.Input {
	s.mu.Lock()
	defer s.mu.Unlock()
	return append([]startruntime.Input(nil), s.inputs...)
}
// fakeJobResults collects published job results and can be primed with a
// publishErr to simulate a failing results stream.
type fakeJobResults struct {
	mu         sync.Mutex
	published  []ports.JobResult
	publishErr error
}

// Publish appends the result, unless a publish error has been configured.
func (s *fakeJobResults) Publish(_ context.Context, result ports.JobResult) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if err := s.publishErr; err != nil {
		return err
	}
	s.published = append(s.published, result)
	return nil
}

// Published returns a defensive copy of every result accepted so far.
func (s *fakeJobResults) Published() []ports.JobResult {
	s.mu.Lock()
	defer s.mu.Unlock()
	return append([]ports.JobResult(nil), s.published...)
}
// fakeOffsetStore is an in-memory offset store keyed by consumer label.
// loadErr / saveErr can be set to make the respective operation fail.
type fakeOffsetStore struct {
	mu      sync.Mutex
	offsets map[string]string
	loadErr error
	saveErr error
}

// newFakeOffsetStore returns a ready-to-use store with an empty offset map.
func newFakeOffsetStore() *fakeOffsetStore {
	return &fakeOffsetStore{offsets: make(map[string]string)}
}

// Load reports the stored entry id for label and whether one exists.
func (s *fakeOffsetStore) Load(_ context.Context, label string) (string, bool, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if err := s.loadErr; err != nil {
		return "", false, err
	}
	entryID, ok := s.offsets[label]
	return entryID, ok, nil
}

// Save records entryID as the latest offset for label.
func (s *fakeOffsetStore) Save(_ context.Context, label, entryID string) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if err := s.saveErr; err != nil {
		return err
	}
	s.offsets[label] = entryID
	return nil
}

// Get is a test-side accessor that reads an offset without error plumbing.
func (s *fakeOffsetStore) Get(label string) (string, bool) {
	s.mu.Lock()
	defer s.mu.Unlock()
	entryID, ok := s.offsets[label]
	return entryID, ok
}
// harness bundles the consumer under test with its fake collaborators and
// the miniredis-backed Redis client used to drive the input stream.
type harness struct {
	consumer *startjobsconsumer.Consumer // unit under test
	starts   *fakeStartService           // records Inputs, replies with a canned Result
	results  *fakeJobResults             // captures published job results
	offsets  *fakeOffsetStore            // in-memory offset persistence
	stream   string                      // Redis stream the consumer reads from
	server   *miniredis.Miniredis        // embedded Redis test server
	client   *redis.Client               // client connected to server
}
// newHarness spins up miniredis, wires a consumer to fake collaborators,
// and registers client cleanup on t. Construction errors fail the test.
func newHarness(t *testing.T) *harness {
	t.Helper()

	srv := miniredis.RunT(t)
	rdb := redis.NewClient(&redis.Options{Addr: srv.Addr()})
	t.Cleanup(func() { _ = rdb.Close() })

	h := &harness{
		starts:  &fakeStartService{},
		results: &fakeJobResults{},
		offsets: newFakeOffsetStore(),
		stream:  "runtime:start_jobs",
		server:  srv,
		client:  rdb,
	}

	var err error
	h.consumer, err = startjobsconsumer.NewConsumer(startjobsconsumer.Config{
		Client:       rdb,
		Stream:       h.stream,
		BlockTimeout: 50 * time.Millisecond,
		StartService: h.starts,
		JobResults:   h.results,
		OffsetStore:  h.offsets,
		Logger:       silentLogger(),
	})
	require.NoError(t, err)
	return h
}
// startMessage builds a runtime:start_jobs envelope in the documented wire
// shape; requested_at_ms travels as a decimal string, as on the real stream.
func startMessage(id, gameID, imageRef string, requestedAtMS int64) redis.XMessage {
	values := map[string]any{
		"game_id":         gameID,
		"image_ref":       imageRef,
		"requested_at_ms": strconv.FormatInt(requestedAtMS, 10),
	}
	return redis.XMessage{ID: id, Values: values}
}
// TestNewConsumerRejectsMissingDeps checks that NewConsumer fails fast for
// every incomplete prefix of the required dependency list.
func TestNewConsumerRejectsMissingDeps(t *testing.T) {
	srv := miniredis.RunT(t)
	rdb := redis.NewClient(&redis.Options{Addr: srv.Addr()})
	t.Cleanup(func() { _ = rdb.Close() })

	// Each config supplies one more dependency than the previous, but all
	// of them are still missing something.
	incomplete := []startjobsconsumer.Config{
		{},
		{Client: rdb},
		{Client: rdb, Stream: "runtime:start_jobs"},
		{Client: rdb, Stream: "runtime:start_jobs", BlockTimeout: time.Second},
		{Client: rdb, Stream: "runtime:start_jobs", BlockTimeout: time.Second, StartService: &fakeStartService{}},
		{Client: rdb, Stream: "runtime:start_jobs", BlockTimeout: time.Second, StartService: &fakeStartService{}, JobResults: &fakeJobResults{}},
	}
	for index, cfg := range incomplete {
		_, err := startjobsconsumer.NewConsumer(cfg)
		require.Errorf(t, err, "case %d should fail", index)
	}
}
// TestHandleMessageSuccessPublishesSuccessResult verifies the happy path:
// a well-formed envelope reaches the start service with lobby-stream
// provenance and yields exactly one success job result.
func TestHandleMessageSuccessPublishesSuccessResult(t *testing.T) {
	h := newHarness(t)
	h.starts.result = startruntime.Result{
		Record: runtime.RuntimeRecord{
			GameID:             "game-1",
			Status:             runtime.StatusRunning,
			CurrentContainerID: "c-1",
			EngineEndpoint:     "http://galaxy-game-game-1:8080",
		},
		Outcome: operation.OutcomeSuccess,
	}

	h.consumer.HandleMessage(context.Background(), startMessage("100-0", "game-1", "galaxy/game:1.0.0", 1700))

	seen := h.starts.Inputs()
	require.Len(t, seen, 1)
	input := seen[0]
	assert.Equal(t, "game-1", input.GameID)
	assert.Equal(t, "galaxy/game:1.0.0", input.ImageRef)
	assert.Equal(t, operation.OpSourceLobbyStream, input.OpSource)
	assert.Equal(t, "100-0", input.SourceRef)

	want := ports.JobResult{
		GameID:         "game-1",
		Outcome:        ports.JobOutcomeSuccess,
		ContainerID:    "c-1",
		EngineEndpoint: "http://galaxy-game-game-1:8080",
	}
	results := h.results.Published()
	require.Len(t, results, 1)
	assert.Equal(t, want, results[0])
}
// TestHandleMessageFailurePublishesFailureResult verifies that a domain
// failure from the start service is mirrored onto the results stream with
// its error code and message intact.
func TestHandleMessageFailurePublishesFailureResult(t *testing.T) {
	h := newHarness(t)
	h.starts.result = startruntime.Result{
		Outcome:      operation.OutcomeFailure,
		ErrorCode:    startruntime.ErrorCodeImagePullFailed,
		ErrorMessage: "manifest unknown",
	}

	h.consumer.HandleMessage(context.Background(), startMessage("101-0", "game-2", "galaxy/game:bad", 1700))

	want := ports.JobResult{
		GameID:       "game-2",
		Outcome:      ports.JobOutcomeFailure,
		ErrorCode:    "image_pull_failed",
		ErrorMessage: "manifest unknown",
	}
	results := h.results.Published()
	require.Len(t, results, 1)
	assert.Equal(t, want, results[0])
}
// TestHandleMessageReplayNoOpKeepsContainerAndEndpoint verifies that a
// replayed start (success + replay_no_op) still carries the live container
// id and engine endpoint alongside the replay marker.
func TestHandleMessageReplayNoOpKeepsContainerAndEndpoint(t *testing.T) {
	h := newHarness(t)
	h.starts.result = startruntime.Result{
		Record: runtime.RuntimeRecord{
			GameID:             "game-3",
			Status:             runtime.StatusRunning,
			CurrentContainerID: "c-3",
			EngineEndpoint:     "http://galaxy-game-game-3:8080",
		},
		Outcome:   operation.OutcomeSuccess,
		ErrorCode: startruntime.ErrorCodeReplayNoOp,
	}

	h.consumer.HandleMessage(context.Background(), startMessage("102-0", "game-3", "galaxy/game:1.0.0", 1700))

	want := ports.JobResult{
		GameID:         "game-3",
		Outcome:        ports.JobOutcomeSuccess,
		ContainerID:    "c-3",
		EngineEndpoint: "http://galaxy-game-game-3:8080",
		ErrorCode:      "replay_no_op",
	}
	results := h.results.Published()
	require.Len(t, results, 1)
	assert.Equal(t, want, results[0])
}
// TestHandleMessageMalformedEnvelopesAreAbsorbed feeds envelopes that each
// violate the wire contract in one way (missing or blank game_id/image_ref,
// non-numeric timestamp) and requires that none of them leak downstream.
func TestHandleMessageMalformedEnvelopesAreAbsorbed(t *testing.T) {
	h := newHarness(t)
	malformed := []redis.XMessage{
		{ID: "200-0", Values: map[string]any{"image_ref": "galaxy/game:1.0.0", "requested_at_ms": "1"}},
		{ID: "200-1", Values: map[string]any{"game_id": " ", "image_ref": "galaxy/game:1.0.0", "requested_at_ms": "1"}},
		{ID: "200-2", Values: map[string]any{"game_id": "game-x", "requested_at_ms": "1"}},
		{ID: "200-3", Values: map[string]any{"game_id": "game-x", "image_ref": " ", "requested_at_ms": "1"}},
		{ID: "200-4", Values: map[string]any{"game_id": "game-x", "image_ref": "galaxy/game:1.0.0", "requested_at_ms": "not-a-number"}},
	}

	ctx := context.Background()
	for _, envelope := range malformed {
		h.consumer.HandleMessage(ctx, envelope)
	}

	assert.Empty(t, h.starts.Inputs(), "malformed envelopes must not reach the start service")
	assert.Empty(t, h.results.Published(), "malformed envelopes must not produce job results")
}
// TestHandleMessagePublishFailureIsAbsorbed verifies that a failing results
// stream neither panics nor prevents the start service from running.
func TestHandleMessagePublishFailureIsAbsorbed(t *testing.T) {
	h := newHarness(t)
	h.starts.result = startruntime.Result{Outcome: operation.OutcomeFailure, ErrorCode: "internal_error"}
	h.results.publishErr = errors.New("redis transient")

	h.consumer.HandleMessage(context.Background(), startMessage("300-0", "game-x", "galaxy/game:1.0.0", 1700))

	require.Len(t, h.starts.Inputs(), 1, "service still runs even when publish fails")
}
// TestHandleMessageGoLevelErrorIsAbsorbed verifies that a Go-level error
// from the start service (as opposed to a domain failure Result) is not
// translated into a job result.
func TestHandleMessageGoLevelErrorIsAbsorbed(t *testing.T) {
	h := newHarness(t)
	h.starts.err = errors.New("nil ctx")

	h.consumer.HandleMessage(context.Background(), startMessage("400-0", "game-y", "galaxy/game:1.0.0", 1700))

	assert.Empty(t, h.results.Published(), "go-level service errors must not surface as job results")
}
// TestRunAdvancesOffsetPerMessage drives the real Run loop against
// miniredis: two envelopes XADDed to the stream must each produce a job
// result, and an offset must be persisted under the "startjobs" label.
func TestRunAdvancesOffsetPerMessage(t *testing.T) {
	h := newHarness(t)
	h.starts.result = startruntime.Result{
		Record: runtime.RuntimeRecord{
			GameID:             "game-5",
			Status:             runtime.StatusRunning,
			CurrentContainerID: "c-5",
			EngineEndpoint:     "http://galaxy-game-game-5:8080",
		},
		Outcome: operation.OutcomeSuccess,
	}
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	// Run the consume loop concurrently; done carries its exit error so the
	// goroutine never leaks past the test.
	done := make(chan error, 1)
	go func() { done <- h.consumer.Run(ctx) }()
	mustXAdd(t, h.client, h.stream, "game-5", "galaxy/game:1.0.0", 1)
	mustXAdd(t, h.client, h.stream, "game-5", "galaxy/game:1.0.0", 2)
	require.Eventually(t, func() bool {
		return len(h.results.Published()) == 2
	}, time.Second, 10*time.Millisecond, "consumer must produce one job result per envelope")
	cancel()
	// Poll done non-blockingly until Run has returned, bounded by Eventually's
	// one-second deadline.
	require.Eventually(t, func() bool {
		select {
		case <-done:
			return true
		default:
			return false
		}
	}, time.Second, 10*time.Millisecond, "Run must exit after context cancel")
	// The exact entry id depends on miniredis-generated stream ids, so only
	// presence and non-emptiness of the persisted offset are asserted.
	id, ok := h.offsets.Get("startjobs")
	require.True(t, ok, "offset must be persisted after the run loop processed messages")
	assert.NotEmpty(t, id, "offset entry id must not be empty")
}
// TestRunResumesFromPersistedOffset seeds the stream with one entry,
// persists that entry's id as the "startjobs" offset, and then starts the
// run loop: only an envelope added after the offset may be processed.
func TestRunResumesFromPersistedOffset(t *testing.T) {
	h := newHarness(t)
	h.starts.result = startruntime.Result{
		Record: runtime.RuntimeRecord{
			GameID:             "game-6",
			Status:             runtime.StatusRunning,
			CurrentContainerID: "c-6",
			EngineEndpoint:     "http://galaxy-game-game-6:8080",
		},
		Outcome: operation.OutcomeSuccess,
	}

	// Pre-existing entry whose id is already recorded as consumed.
	preID := mustXAdd(t, h.client, h.stream, "game-6", "galaxy/game:1.0.0", 1)
	require.NoError(t, h.offsets.Save(context.Background(), "startjobs", preID))

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	done := make(chan error, 1)
	go func() { done <- h.consumer.Run(ctx) }()

	mustXAdd(t, h.client, h.stream, "game-6", "galaxy/game:1.0.0", 2)
	require.Eventually(t, func() bool {
		return len(h.results.Published()) == 1
	}, time.Second, 10*time.Millisecond, "consumer must skip the pre-existing entry and process only the new one")

	cancel()
	// Bound the wait so a stuck Run loop fails the test instead of hanging it.
	select {
	case <-done:
	case <-time.After(time.Second):
		t.Fatal("consumer Run did not exit after context cancel")
	}

	// The Eventually above can observe len==1 before an (incorrectly)
	// replayed pre-offset entry is handled, which would make a replay
	// regression pass by luck. Re-check after the loop has fully stopped.
	require.Len(t, h.results.Published(), 1, "pre-offset entry must never be reprocessed")
	require.Len(t, h.starts.Inputs(), 1, "start service must only see the post-offset envelope")
}
// TestRunExitsImmediatelyOnAlreadyCancelledContext verifies Run returns
// context.Canceled without touching any collaborator when handed a context
// that is already dead.
func TestRunExitsImmediatelyOnAlreadyCancelledContext(t *testing.T) {
	h := newHarness(t)
	ctx, cancel := context.WithCancel(context.Background())
	cancel()

	err := h.consumer.Run(ctx)

	require.ErrorIs(t, err, context.Canceled)
	assert.Empty(t, h.starts.Inputs())
	assert.Empty(t, h.results.Published())
}
// mustXAdd appends a start-job envelope to stream in the documented wire
// shape and returns the generated entry id, failing the test on error.
func mustXAdd(t *testing.T, client *redis.Client, stream, gameID, imageRef string, requestedAtMS int64) string {
	t.Helper()
	args := &redis.XAddArgs{
		Stream: stream,
		Values: map[string]any{
			"game_id":         gameID,
			"image_ref":       imageRef,
			"requested_at_ms": strconv.FormatInt(requestedAtMS, 10),
		},
	}
	entryID, err := client.XAdd(context.Background(), args).Result()
	require.NoError(t, err)
	return entryID
}
// --- in-memory fakes for the roundtrip integration test ----------------------
// memoryRecords is a map-backed runtime record store; only Get and Upsert
// are exercised by the start integration test, the rest are stubbed out.
type memoryRecords struct {
	mu    sync.Mutex
	store map[string]runtime.RuntimeRecord
}

// newMemoryRecords returns an empty, ready-to-use record store.
func newMemoryRecords() *memoryRecords {
	return &memoryRecords{store: make(map[string]runtime.RuntimeRecord)}
}

// Get returns the record for gameID, or runtime.ErrNotFound when absent.
func (s *memoryRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if record, ok := s.store[gameID]; ok {
		return record, nil
	}
	return runtime.RuntimeRecord{}, runtime.ErrNotFound
}

// Upsert stores record keyed by its GameID, replacing any previous value.
func (s *memoryRecords) Upsert(_ context.Context, record runtime.RuntimeRecord) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.store[record.GameID] = record
	return nil
}

// UpdateStatus is not needed by the start integration test.
func (s *memoryRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error {
	return errors.New("not used in start integration test")
}

// ListByStatus is not needed by the start integration test.
func (s *memoryRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) {
	return nil, errors.New("not used in start integration test")
}

// List is not needed by the start integration test.
func (s *memoryRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
	return nil, errors.New("not used in start integration test")
}
// memoryOperationLogs appends operation entries in memory and reports the
// running count as the entry's sequence number.
type memoryOperationLogs struct {
	mu      sync.Mutex
	entries []operation.OperationEntry
}

// Append records entry and returns its 1-based sequence number.
func (s *memoryOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.entries = append(s.entries, entry)
	return int64(len(s.entries)), nil
}

// ListByGame is not needed by the start integration test.
func (s *memoryOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) {
	return nil, errors.New("not used in start integration test")
}
// memoryLeases always grants the lease: the integration test runs a single
// uncontended worker, so no real coordination is needed.
type memoryLeases struct{}

// TryAcquire always succeeds.
func (m *memoryLeases) TryAcquire(_ context.Context, _, _ string, _ time.Duration) (bool, error) {
	return true, nil
}

// Release always succeeds.
func (m *memoryLeases) Release(_ context.Context, _, _ string) error {
	return nil
}
// memoryHealthEvents discards health events; the test does not observe them.
type memoryHealthEvents struct{}

// Publish drops the envelope and reports success.
func (m *memoryHealthEvents) Publish(_ context.Context, _ ports.HealthEventEnvelope) error {
	return nil
}
// memoryNotifications discards notification intents; not observed by the test.
type memoryNotifications struct{}

// Publish drops the intent and reports success.
func (m *memoryNotifications) Publish(_ context.Context, _ notificationintent.Intent) error {
	return nil
}
// TestRoundTripStartJobThroughRealServiceAndPublisher exercises the
// Lobby → RTM → Lobby contract end-to-end inside one process: an XADD
// in the documented `runtime:start_jobs` shape is consumed, the real
// `startruntime.Service` runs against an in-memory fake stack and a
// gomock-backed Docker port, the real `jobresultspublisher` writes to
// `runtime:job_results`, and the test asserts the symmetric wire shape.
//
// A second XADD of the same envelope must surface as
// `error_code=replay_no_op` per the AsyncAPI replay-safety rule.
func TestRoundTripStartJobThroughRealServiceAndPublisher(t *testing.T) {
	ctrl := gomock.NewController(t)
	t.Cleanup(ctrl.Finish)
	server := miniredis.RunT(t)
	client := redis.NewClient(&redis.Options{Addr: server.Addr()})
	t.Cleanup(func() { _ = client.Close() })
	// Fixed clock so StartedAt and any derived timestamps are deterministic.
	now := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)
	records := newMemoryRecords()
	// Docker mock: exactly one ensure/pull/inspect/run sequence is allowed
	// (Times(1) each), so any replay that reaches Docker fails the test.
	dockerMock := mocks.NewMockDockerClient(ctrl)
	dockerMock.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil).Times(1)
	dockerMock.EXPECT().PullImage(gomock.Any(), "galaxy/game:1.0.0", ports.PullPolicy(config.ImagePullPolicyIfMissing)).Return(nil).Times(1)
	dockerMock.EXPECT().InspectImage(gomock.Any(), "galaxy/game:1.0.0").Return(ports.ImageInspect{
		Ref:    "galaxy/game:1.0.0",
		Labels: map[string]string{},
	}, nil).Times(1)
	dockerMock.EXPECT().Run(gomock.Any(), gomock.Any()).Return(ports.RunResult{
		ContainerID:    "ctr-roundtrip",
		EngineEndpoint: "http://galaxy-game-game-1:8080",
		StartedAt:      now,
	}, nil).Times(1)
	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)
	containerCfg := config.ContainerConfig{
		DefaultCPUQuota:      1.0,
		DefaultMemory:        "512m",
		DefaultPIDsLimit:     512,
		StopTimeout:          30 * time.Second,
		Retention:            30 * 24 * time.Hour,
		EngineStateMountPath: "/var/lib/galaxy-game",
		EngineStateEnvName:   "GAME_STATE_PATH",
		GameStateDirMode:     0o750,
		GameStateRoot:        "/var/lib/galaxy/games",
	}
	dockerCfg := config.DockerConfig{
		Host:       "unix:///var/run/docker.sock",
		Network:    "galaxy-net",
		LogDriver:  "json-file",
		PullPolicy: config.ImagePullPolicyIfMissing,
	}
	coordinationCfg := config.CoordinationConfig{GameLeaseTTL: time.Minute}
	// Real startruntime.Service wired to the in-memory fakes above; Clock,
	// NewToken and PrepareStateDir are stubbed for determinism.
	startService, err := startruntime.NewService(startruntime.Dependencies{
		RuntimeRecords: records,
		OperationLogs:  &memoryOperationLogs{},
		Docker:         dockerMock,
		Leases:         &memoryLeases{},
		HealthEvents:   &memoryHealthEvents{},
		Notifications:  &memoryNotifications{},
		Container:      containerCfg,
		DockerCfg:      dockerCfg,
		Coordination:   coordinationCfg,
		Telemetry:      telemetryRuntime,
		Logger:         silentLogger(),
		Clock:          func() time.Time { return now },
		NewToken:       func() string { return "token-roundtrip" },
		PrepareStateDir: func(_ string) (string, error) {
			return "/var/lib/galaxy/games/game-1", nil
		},
	})
	require.NoError(t, err)
	// Real publisher writing to the documented runtime:job_results stream.
	publisher, err := jobresultspublisher.NewPublisher(jobresultspublisher.Config{
		Client: client,
		Stream: "runtime:job_results",
	})
	require.NoError(t, err)
	offsetStore, err := streamoffsets.New(streamoffsets.Config{Client: client})
	require.NoError(t, err)
	consumer, err := startjobsconsumer.NewConsumer(startjobsconsumer.Config{
		Client:       client,
		Stream:       "runtime:start_jobs",
		BlockTimeout: 50 * time.Millisecond,
		StartService: startService,
		JobResults:   publisher,
		OffsetStore:  offsetStore,
		Logger:       silentLogger(),
	})
	require.NoError(t, err)
	ctx, cancel := context.WithCancel(context.Background())
	t.Cleanup(cancel)
	done := make(chan error, 1)
	go func() { done <- consumer.Run(ctx) }()
	// First XADD: a fresh start must surface as success with empty error fields.
	mustXAdd(t, client, "runtime:start_jobs", "game-1", "galaxy/game:1.0.0", 1700)
	require.Eventually(t, func() bool {
		entries, err := client.XRange(ctx, "runtime:job_results", "-", "+").Result()
		return err == nil && len(entries) == 1
	}, 2*time.Second, 20*time.Millisecond, "first XADD must produce one job result entry")
	entries, err := client.XRange(ctx, "runtime:job_results", "-", "+").Result()
	require.NoError(t, err)
	require.Len(t, entries, 1)
	values := entries[0].Values
	assert.Equal(t, "game-1", values["game_id"])
	assert.Equal(t, "success", values["outcome"])
	assert.Equal(t, "ctr-roundtrip", values["container_id"])
	assert.Equal(t, "http://galaxy-game-game-1:8080", values["engine_endpoint"])
	assert.Equal(t, "", values["error_code"], "fresh start must publish empty error_code")
	assert.Equal(t, "", values["error_message"])
	// Replay: the same envelope must surface as success/replay_no_op
	// because the runtime record now reports `running` with the same
	// image_ref. The Docker mock has no further expectations, so a
	// second pull/run would fail the test.
	mustXAdd(t, client, "runtime:start_jobs", "game-1", "galaxy/game:1.0.0", 1701)
	require.Eventually(t, func() bool {
		entries, err := client.XRange(ctx, "runtime:job_results", "-", "+").Result()
		return err == nil && len(entries) == 2
	}, 2*time.Second, 20*time.Millisecond, "second XADD must produce a replay_no_op job result")
	entries, err = client.XRange(ctx, "runtime:job_results", "-", "+").Result()
	require.NoError(t, err)
	require.Len(t, entries, 2)
	replay := entries[1].Values
	assert.Equal(t, "game-1", replay["game_id"])
	assert.Equal(t, "success", replay["outcome"])
	assert.Equal(t, "ctr-roundtrip", replay["container_id"])
	assert.Equal(t, "http://galaxy-game-game-1:8080", replay["engine_endpoint"])
	assert.Equal(t, "replay_no_op", replay["error_code"])
	assert.Equal(t, "", replay["error_message"])
	cancel()
	// Bounded wait so a stuck Run loop fails the test instead of hanging it.
	select {
	case <-done:
	case <-time.After(time.Second):
		t.Fatal("consumer Run did not exit after context cancel")
	}
}