feat: runtime manager

@@ -0,0 +1,337 @@
// Package startjobsconsumer drives the asynchronous half of the
// Lobby ↔ Runtime Manager start contract. The consumer XREADs from
// `runtime:start_jobs` (produced by Lobby), decodes the envelope frozen
// in `rtmanager/api/runtime-jobs-asyncapi.yaml`, calls the production
// start orchestrator, and publishes one `runtime:job_results` outcome
// per consumed envelope.
//
// Replay safety is provided by the start service: an idempotent re-run
// surfaces as `Outcome=success` with `error_code=replay_no_op`. The
// consumer copies the service Result fields into the `RuntimeJobResult`
// payload verbatim. Per-message decode and publish errors are logged
// and absorbed; the offset advances unconditionally so a single poison
// message cannot pin the loop. Design rationale is captured in
// `rtmanager/docs/workers.md`.
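//
// A minimal wiring sketch (illustrative only; the collaborator values,
// the block timeout, and ctx are assumptions supplied by the caller):
//
//	consumer, err := NewConsumer(Config{
//		Client:       redisClient,
//		Stream:       "runtime:start_jobs",
//		BlockTimeout: 5 * time.Second,
//		StartService: startService,
//		JobResults:   jobResults,
//		OffsetStore:  offsetStore,
//	})
//	if err != nil {
//		// handle configuration error
//	}
//	go func() { _ = consumer.Run(ctx) }()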
package startjobsconsumer

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"strconv"
	"strings"
	"time"

	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/logging"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/startruntime"

	"github.com/redis/go-redis/v9"
)

// streamOffsetLabel identifies the start-jobs consumer in the stream
// offset store. The label stays stable when the underlying stream key
// is renamed via configuration. Matches the convention from
// `rtmanager/README.md §Persistence Layout > Redis runtime-coordination state`.
const streamOffsetLabel = "startjobs"

// Wire field names of the `RuntimeStartJob` payload. Frozen by
// `rtmanager/api/runtime-jobs-asyncapi.yaml`; renaming any of them
// requires a coordinated contract change with Lobby.
const (
	fieldGameID        = "game_id"
	fieldImageRef      = "image_ref"
	fieldRequestedAtMS = "requested_at_ms"
)
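
// An envelope as Lobby would XADD it, shown in redis-cli form (the
// values are illustrative; the field set mirrors the constants above):
//
//	XADD runtime:start_jobs * \
//		game_id game-1 \
//		image_ref galaxy/game:1.0.0 \
//		requested_at_ms 1700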

// StartService is the narrow surface the consumer needs from the start
// orchestrator. The concrete `*startruntime.Service` satisfies this
// interface and is wired in production.
type StartService interface {
	Handle(ctx context.Context, input startruntime.Input) (startruntime.Result, error)
}

// Config groups the dependencies required to construct a Consumer.
type Config struct {
	// Client provides XREAD access to the start-jobs stream.
	Client *redis.Client

	// Stream stores the Redis Streams key consumed by the worker.
	Stream string

	// BlockTimeout bounds the blocking XREAD window.
	BlockTimeout time.Duration

	// StartService executes the start lifecycle for each decoded
	// envelope.
	StartService StartService

	// JobResults publishes one outcome entry per processed envelope.
	JobResults ports.JobResultPublisher

	// OffsetStore persists the last successfully processed entry id so
	// the consumer survives restarts without replaying processed
	// envelopes.
	OffsetStore ports.StreamOffsetStore

	// Logger receives structured worker-level events. Defaults to
	// `slog.Default` when nil.
	Logger *slog.Logger
}

// Consumer drives the start-jobs processing loop.
type Consumer struct {
	client       *redis.Client
	stream       string
	blockTimeout time.Duration
	startService StartService
	jobResults   ports.JobResultPublisher
	offsetStore  ports.StreamOffsetStore
	logger       *slog.Logger
}

// NewConsumer constructs one Consumer from cfg. Each validation error
// names the missing or invalid collaborator.
func NewConsumer(cfg Config) (*Consumer, error) {
	switch {
	case cfg.Client == nil:
		return nil, errors.New("new start jobs consumer: nil redis client")
	case strings.TrimSpace(cfg.Stream) == "":
		return nil, errors.New("new start jobs consumer: stream must not be empty")
	case cfg.BlockTimeout <= 0:
		return nil, errors.New("new start jobs consumer: block timeout must be positive")
	case cfg.StartService == nil:
		return nil, errors.New("new start jobs consumer: nil start service")
	case cfg.JobResults == nil:
		return nil, errors.New("new start jobs consumer: nil job results publisher")
	case cfg.OffsetStore == nil:
		return nil, errors.New("new start jobs consumer: nil offset store")
	}

	logger := cfg.Logger
	if logger == nil {
		logger = slog.Default()
	}
	return &Consumer{
		client:       cfg.Client,
		stream:       cfg.Stream,
		blockTimeout: cfg.BlockTimeout,
		startService: cfg.StartService,
		jobResults:   cfg.JobResults,
		offsetStore:  cfg.OffsetStore,
		logger:       logger.With("worker", "rtmanager.startjobs", "stream", cfg.Stream),
	}, nil
}

// Run drives the XREAD loop until ctx is cancelled. Per-message
// outcomes are absorbed by HandleMessage; the loop only exits on
// context cancellation or a fatal Redis / offset-store error.
func (consumer *Consumer) Run(ctx context.Context) error {
	if consumer == nil || consumer.client == nil {
		return errors.New("run start jobs consumer: nil consumer")
	}
	if ctx == nil {
		return errors.New("run start jobs consumer: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}

	lastID, found, err := consumer.offsetStore.Load(ctx, streamOffsetLabel)
	if err != nil {
		return fmt.Errorf("run start jobs consumer: load offset: %w", err)
	}
	if !found {
		lastID = "0-0"
	}

	consumer.logger.Info("start jobs consumer started",
		"block_timeout", consumer.blockTimeout.String(),
		"start_entry_id", lastID,
	)
	defer consumer.logger.Info("start jobs consumer stopped")

	for {
		streams, err := consumer.client.XRead(ctx, &redis.XReadArgs{
			Streams: []string{consumer.stream, lastID},
			Count:   1,
			Block:   consumer.blockTimeout,
		}).Result()
		switch {
		case err == nil:
			for _, stream := range streams {
				for _, message := range stream.Messages {
					consumer.HandleMessage(ctx, message)
					if err := consumer.offsetStore.Save(ctx, streamOffsetLabel, message.ID); err != nil {
						return fmt.Errorf("run start jobs consumer: save offset: %w", err)
					}
					lastID = message.ID
				}
			}
		case errors.Is(err, redis.Nil):
			continue
		case ctx.Err() != nil && (errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) || errors.Is(err, redis.ErrClosed)):
			return ctx.Err()
		default:
			return fmt.Errorf("run start jobs consumer: %w", err)
		}
	}
}

// Shutdown is a no-op; the consumer relies on context cancellation.
func (consumer *Consumer) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown start jobs consumer: nil context")
	}
	return nil
}

// HandleMessage processes one Redis Stream message. Exported so tests
// can drive the consumer deterministically without spinning up a real
// XREAD loop.
//
// Per-message errors are logged and absorbed: the worker keeps running
// and the offset is allowed to advance.
func (consumer *Consumer) HandleMessage(ctx context.Context, message redis.XMessage) {
	if consumer == nil {
		return
	}

	envelope, err := decodeStartJob(message)
	if err != nil {
		consumer.logger.WarnContext(ctx, "decode start job",
			"stream_entry_id", message.ID,
			"err", err.Error(),
		)
		return
	}

	input := startruntime.Input{
		GameID:    envelope.GameID,
		ImageRef:  envelope.ImageRef,
		OpSource:  operation.OpSourceLobbyStream,
		SourceRef: message.ID,
	}
	result, err := consumer.startService.Handle(ctx, input)
	if err != nil {
		consumer.logger.ErrorContext(ctx, "start service returned go-level error",
			"stream_entry_id", message.ID,
			"game_id", envelope.GameID,
			"err", err.Error(),
		)
		return
	}

	jobResult := buildJobResult(envelope.GameID, result)
	if err := consumer.jobResults.Publish(ctx, jobResult); err != nil {
		consumer.logger.ErrorContext(ctx, "publish job result",
			"stream_entry_id", message.ID,
			"game_id", envelope.GameID,
			"outcome", jobResult.Outcome,
			"error_code", jobResult.ErrorCode,
			"err", err.Error(),
		)
		return
	}

	logArgs := []any{
		"stream_entry_id", message.ID,
		"game_id", envelope.GameID,
		"outcome", jobResult.Outcome,
		"error_code", jobResult.ErrorCode,
		"requested_at_ms", envelope.RequestedAtMS,
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	consumer.logger.InfoContext(ctx, "start job processed", logArgs...)
}

// startJobEnvelope stores the decoded shape of one `runtime:start_jobs`
// stream entry.
type startJobEnvelope struct {
	GameID        string
	ImageRef      string
	RequestedAtMS int64
}
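
// decodeStartJob validates one stream entry: game_id and image_ref are
// required and whitespace-trimmed; requested_at_ms is optional and
// defaults to zero when absent or blank.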
func decodeStartJob(message redis.XMessage) (startJobEnvelope, error) {
	gameID := strings.TrimSpace(optionalString(message.Values, fieldGameID))
	if gameID == "" {
		return startJobEnvelope{}, errors.New("missing game_id")
	}
	imageRef := strings.TrimSpace(optionalString(message.Values, fieldImageRef))
	if imageRef == "" {
		return startJobEnvelope{}, errors.New("missing image_ref")
	}
	requestedAtMS, err := optionalInt64(message.Values, fieldRequestedAtMS)
	if err != nil {
		return startJobEnvelope{}, fmt.Errorf("invalid requested_at_ms: %w", err)
	}
	return startJobEnvelope{
		GameID:        gameID,
		ImageRef:      imageRef,
		RequestedAtMS: requestedAtMS,
	}, nil
}

// buildJobResult translates a startruntime.Result into the wire payload
// published on `runtime:job_results`. ContainerID and EngineEndpoint are
// taken from the service's Record on success / replay; on failure the
// service returns a zero Record and both fields stay empty per the
// AsyncAPI contract (required field, empty string is a valid value).
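//
// For example (illustrative values), a failed start publishes:
//
//	game_id=game-2 outcome=failure error_code=image_pull_failed
//	error_message="manifest unknown" container_id="" engine_endpoint=""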
func buildJobResult(gameID string, result startruntime.Result) ports.JobResult {
	jobResult := ports.JobResult{
		GameID:       gameID,
		Outcome:      string(result.Outcome),
		ErrorCode:    result.ErrorCode,
		ErrorMessage: result.ErrorMessage,
	}
	if result.Outcome == operation.OutcomeSuccess {
		jobResult.ContainerID = result.Record.CurrentContainerID
		jobResult.EngineEndpoint = result.Record.EngineEndpoint
	}
	return jobResult
}
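
// optionalString returns the stream value stored under key, accepting
// the string and []byte shapes go-redis hands back; missing keys and
// other types yield "".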
func optionalString(values map[string]any, key string) string {
	raw, ok := values[key]
	if !ok {
		return ""
	}
	switch typed := raw.(type) {
	case string:
		return typed
	case []byte:
		return string(typed)
	default:
		return ""
	}
}
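
// optionalInt64 parses the base-10 integer stored under key; a missing
// key or blank value yields zero, while an unsupported type or a
// malformed number is reported as an error.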
func optionalInt64(values map[string]any, key string) (int64, error) {
	raw, ok := values[key]
	if !ok {
		return 0, nil
	}
	var stringValue string
	switch typed := raw.(type) {
	case string:
		stringValue = typed
	case []byte:
		stringValue = string(typed)
	default:
		return 0, fmt.Errorf("unsupported type %T", raw)
	}
	stringValue = strings.TrimSpace(stringValue)
	if stringValue == "" {
		return 0, nil
	}
	parsed, err := strconv.ParseInt(stringValue, 10, 64)
	if err != nil {
		return 0, err
	}
	return parsed, nil
}

@@ -0,0 +1,631 @@
package startjobsconsumer_test

import (
	"context"
	"errors"
	"io"
	"log/slog"
	"strconv"
	"sync"
	"testing"
	"time"

	"galaxy/notificationintent"
	"galaxy/rtmanager/internal/adapters/docker/mocks"
	"galaxy/rtmanager/internal/adapters/jobresultspublisher"
	"galaxy/rtmanager/internal/adapters/redisstate/streamoffsets"
	"galaxy/rtmanager/internal/config"
	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/startruntime"
	"galaxy/rtmanager/internal/telemetry"
	"galaxy/rtmanager/internal/worker/startjobsconsumer"

	"github.com/alicebob/miniredis/v2"
	"github.com/redis/go-redis/v9"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"go.uber.org/mock/gomock"
)

func silentLogger() *slog.Logger {
	return slog.New(slog.NewTextHandler(io.Discard, nil))
}

type fakeStartService struct {
	mu     sync.Mutex
	inputs []startruntime.Input
	result startruntime.Result
	err    error
	hook   func(input startruntime.Input) (startruntime.Result, error)
}

func (s *fakeStartService) Handle(_ context.Context, input startruntime.Input) (startruntime.Result, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.inputs = append(s.inputs, input)
	if s.hook != nil {
		return s.hook(input)
	}
	return s.result, s.err
}

func (s *fakeStartService) Inputs() []startruntime.Input {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]startruntime.Input, len(s.inputs))
	copy(out, s.inputs)
	return out
}

type fakeJobResults struct {
	mu         sync.Mutex
	published  []ports.JobResult
	publishErr error
}

func (s *fakeJobResults) Publish(_ context.Context, result ports.JobResult) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.publishErr != nil {
		return s.publishErr
	}
	s.published = append(s.published, result)
	return nil
}

func (s *fakeJobResults) Published() []ports.JobResult {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]ports.JobResult, len(s.published))
	copy(out, s.published)
	return out
}

type fakeOffsetStore struct {
	mu      sync.Mutex
	offsets map[string]string
	loadErr error
	saveErr error
}

func newFakeOffsetStore() *fakeOffsetStore {
	return &fakeOffsetStore{offsets: map[string]string{}}
}

func (s *fakeOffsetStore) Load(_ context.Context, label string) (string, bool, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.loadErr != nil {
		return "", false, s.loadErr
	}
	value, ok := s.offsets[label]
	return value, ok, nil
}

func (s *fakeOffsetStore) Save(_ context.Context, label, entryID string) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.saveErr != nil {
		return s.saveErr
	}
	s.offsets[label] = entryID
	return nil
}

func (s *fakeOffsetStore) Get(label string) (string, bool) {
	s.mu.Lock()
	defer s.mu.Unlock()
	value, ok := s.offsets[label]
	return value, ok
}

type harness struct {
	consumer *startjobsconsumer.Consumer
	starts   *fakeStartService
	results  *fakeJobResults
	offsets  *fakeOffsetStore
	stream   string
	server   *miniredis.Miniredis
	client   *redis.Client
}

func newHarness(t *testing.T) *harness {
	t.Helper()
	server := miniredis.RunT(t)
	client := redis.NewClient(&redis.Options{Addr: server.Addr()})
	t.Cleanup(func() { _ = client.Close() })

	starts := &fakeStartService{}
	results := &fakeJobResults{}
	offsets := newFakeOffsetStore()
	stream := "runtime:start_jobs"

	consumer, err := startjobsconsumer.NewConsumer(startjobsconsumer.Config{
		Client:       client,
		Stream:       stream,
		BlockTimeout: 50 * time.Millisecond,
		StartService: starts,
		JobResults:   results,
		OffsetStore:  offsets,
		Logger:       silentLogger(),
	})
	require.NoError(t, err)

	return &harness{
		consumer: consumer,
		starts:   starts,
		results:  results,
		offsets:  offsets,
		stream:   stream,
		server:   server,
		client:   client,
	}
}

func startMessage(id, gameID, imageRef string, requestedAtMS int64) redis.XMessage {
	return redis.XMessage{
		ID: id,
		Values: map[string]any{
			"game_id":         gameID,
			"image_ref":       imageRef,
			"requested_at_ms": strconv.FormatInt(requestedAtMS, 10),
		},
	}
}

func TestNewConsumerRejectsMissingDeps(t *testing.T) {
	server := miniredis.RunT(t)
	client := redis.NewClient(&redis.Options{Addr: server.Addr()})
	t.Cleanup(func() { _ = client.Close() })

	cases := []startjobsconsumer.Config{
		{},
		{Client: client},
		{Client: client, Stream: "runtime:start_jobs"},
		{Client: client, Stream: "runtime:start_jobs", BlockTimeout: time.Second},
		{Client: client, Stream: "runtime:start_jobs", BlockTimeout: time.Second, StartService: &fakeStartService{}},
		{Client: client, Stream: "runtime:start_jobs", BlockTimeout: time.Second, StartService: &fakeStartService{}, JobResults: &fakeJobResults{}},
	}
	for index, cfg := range cases {
		_, err := startjobsconsumer.NewConsumer(cfg)
		require.Errorf(t, err, "case %d should fail", index)
	}
}

func TestHandleMessageSuccessPublishesSuccessResult(t *testing.T) {
	h := newHarness(t)
	h.starts.result = startruntime.Result{
		Record: runtime.RuntimeRecord{
			GameID:             "game-1",
			Status:             runtime.StatusRunning,
			CurrentContainerID: "c-1",
			EngineEndpoint:     "http://galaxy-game-game-1:8080",
		},
		Outcome: operation.OutcomeSuccess,
	}

	h.consumer.HandleMessage(context.Background(), startMessage("100-0", "game-1", "galaxy/game:1.0.0", 1700))

	inputs := h.starts.Inputs()
	require.Len(t, inputs, 1)
	assert.Equal(t, "game-1", inputs[0].GameID)
	assert.Equal(t, "galaxy/game:1.0.0", inputs[0].ImageRef)
	assert.Equal(t, operation.OpSourceLobbyStream, inputs[0].OpSource)
	assert.Equal(t, "100-0", inputs[0].SourceRef)

	published := h.results.Published()
	require.Len(t, published, 1)
	assert.Equal(t, ports.JobResult{
		GameID:         "game-1",
		Outcome:        ports.JobOutcomeSuccess,
		ContainerID:    "c-1",
		EngineEndpoint: "http://galaxy-game-game-1:8080",
	}, published[0])
}

func TestHandleMessageFailurePublishesFailureResult(t *testing.T) {
	h := newHarness(t)
	h.starts.result = startruntime.Result{
		Outcome:      operation.OutcomeFailure,
		ErrorCode:    startruntime.ErrorCodeImagePullFailed,
		ErrorMessage: "manifest unknown",
	}

	h.consumer.HandleMessage(context.Background(), startMessage("101-0", "game-2", "galaxy/game:bad", 1700))

	published := h.results.Published()
	require.Len(t, published, 1)
	assert.Equal(t, ports.JobResult{
		GameID:       "game-2",
		Outcome:      ports.JobOutcomeFailure,
		ErrorCode:    "image_pull_failed",
		ErrorMessage: "manifest unknown",
	}, published[0])
}

func TestHandleMessageReplayNoOpKeepsContainerAndEndpoint(t *testing.T) {
	h := newHarness(t)
	h.starts.result = startruntime.Result{
		Record: runtime.RuntimeRecord{
			GameID:             "game-3",
			Status:             runtime.StatusRunning,
			CurrentContainerID: "c-3",
			EngineEndpoint:     "http://galaxy-game-game-3:8080",
		},
		Outcome:   operation.OutcomeSuccess,
		ErrorCode: startruntime.ErrorCodeReplayNoOp,
	}

	h.consumer.HandleMessage(context.Background(), startMessage("102-0", "game-3", "galaxy/game:1.0.0", 1700))

	published := h.results.Published()
	require.Len(t, published, 1)
	assert.Equal(t, ports.JobResult{
		GameID:         "game-3",
		Outcome:        ports.JobOutcomeSuccess,
		ContainerID:    "c-3",
		EngineEndpoint: "http://galaxy-game-game-3:8080",
		ErrorCode:      "replay_no_op",
	}, published[0])
}

func TestHandleMessageMalformedEnvelopesAreAbsorbed(t *testing.T) {
	h := newHarness(t)

	cases := []redis.XMessage{
		{ID: "200-0", Values: map[string]any{"image_ref": "galaxy/game:1.0.0", "requested_at_ms": "1"}},
		{ID: "200-1", Values: map[string]any{"game_id": " ", "image_ref": "galaxy/game:1.0.0", "requested_at_ms": "1"}},
		{ID: "200-2", Values: map[string]any{"game_id": "game-x", "requested_at_ms": "1"}},
		{ID: "200-3", Values: map[string]any{"game_id": "game-x", "image_ref": " ", "requested_at_ms": "1"}},
		{ID: "200-4", Values: map[string]any{"game_id": "game-x", "image_ref": "galaxy/game:1.0.0", "requested_at_ms": "not-a-number"}},
	}
	for _, msg := range cases {
		h.consumer.HandleMessage(context.Background(), msg)
	}

	assert.Empty(t, h.starts.Inputs(), "malformed envelopes must not reach the start service")
	assert.Empty(t, h.results.Published(), "malformed envelopes must not produce job results")
}

func TestHandleMessagePublishFailureIsAbsorbed(t *testing.T) {
	h := newHarness(t)
	h.starts.result = startruntime.Result{Outcome: operation.OutcomeFailure, ErrorCode: "internal_error"}
	h.results.publishErr = errors.New("redis transient")

	h.consumer.HandleMessage(context.Background(), startMessage("300-0", "game-x", "galaxy/game:1.0.0", 1700))

	require.Len(t, h.starts.Inputs(), 1, "service still runs even when publish fails")
}

func TestHandleMessageGoLevelErrorIsAbsorbed(t *testing.T) {
	h := newHarness(t)
	h.starts.err = errors.New("nil ctx")

	h.consumer.HandleMessage(context.Background(), startMessage("400-0", "game-y", "galaxy/game:1.0.0", 1700))

	assert.Empty(t, h.results.Published(), "go-level service errors must not surface as job results")
}

func TestRunAdvancesOffsetPerMessage(t *testing.T) {
	h := newHarness(t)
	h.starts.result = startruntime.Result{
		Record: runtime.RuntimeRecord{
			GameID:             "game-5",
			Status:             runtime.StatusRunning,
			CurrentContainerID: "c-5",
			EngineEndpoint:     "http://galaxy-game-game-5:8080",
		},
		Outcome: operation.OutcomeSuccess,
	}

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	done := make(chan error, 1)
	go func() { done <- h.consumer.Run(ctx) }()

	mustXAdd(t, h.client, h.stream, "game-5", "galaxy/game:1.0.0", 1)
	mustXAdd(t, h.client, h.stream, "game-5", "galaxy/game:1.0.0", 2)

	require.Eventually(t, func() bool {
		return len(h.results.Published()) == 2
	}, time.Second, 10*time.Millisecond, "consumer must produce one job result per envelope")

	cancel()
	require.Eventually(t, func() bool {
		select {
		case <-done:
			return true
		default:
			return false
		}
	}, time.Second, 10*time.Millisecond, "Run must exit after context cancel")

	id, ok := h.offsets.Get("startjobs")
	require.True(t, ok, "offset must be persisted after the run loop processed messages")
	assert.NotEmpty(t, id, "offset entry id must not be empty")
}

func TestRunResumesFromPersistedOffset(t *testing.T) {
	h := newHarness(t)
	h.starts.result = startruntime.Result{
		Record: runtime.RuntimeRecord{
			GameID:             "game-6",
			Status:             runtime.StatusRunning,
			CurrentContainerID: "c-6",
			EngineEndpoint:     "http://galaxy-game-game-6:8080",
		},
		Outcome: operation.OutcomeSuccess,
	}

	preID := mustXAdd(t, h.client, h.stream, "game-6", "galaxy/game:1.0.0", 1)
	require.NoError(t, h.offsets.Save(context.Background(), "startjobs", preID))

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	done := make(chan error, 1)
	go func() { done <- h.consumer.Run(ctx) }()

	mustXAdd(t, h.client, h.stream, "game-6", "galaxy/game:1.0.0", 2)

	require.Eventually(t, func() bool {
		return len(h.results.Published()) == 1
	}, time.Second, 10*time.Millisecond, "consumer must skip the pre-existing entry and process only the new one")

	cancel()
	<-done
}

func TestRunExitsImmediatelyOnAlreadyCancelledContext(t *testing.T) {
	h := newHarness(t)
	ctx, cancel := context.WithCancel(context.Background())
	cancel()

	err := h.consumer.Run(ctx)
	require.ErrorIs(t, err, context.Canceled)
	assert.Empty(t, h.starts.Inputs())
	assert.Empty(t, h.results.Published())
}

func mustXAdd(t *testing.T, client *redis.Client, stream, gameID, imageRef string, requestedAtMS int64) string {
	t.Helper()
	id, err := client.XAdd(context.Background(), &redis.XAddArgs{
		Stream: stream,
		Values: map[string]any{
			"game_id":         gameID,
			"image_ref":       imageRef,
			"requested_at_ms": strconv.FormatInt(requestedAtMS, 10),
		},
	}).Result()
	require.NoError(t, err)
	return id
}

// --- in-memory fakes for the roundtrip integration test ----------------------

type memoryRecords struct {
	mu    sync.Mutex
	store map[string]runtime.RuntimeRecord
}

func newMemoryRecords() *memoryRecords {
	return &memoryRecords{store: map[string]runtime.RuntimeRecord{}}
}

func (s *memoryRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	record, ok := s.store[gameID]
	if !ok {
		return runtime.RuntimeRecord{}, runtime.ErrNotFound
	}
	return record, nil
}

func (s *memoryRecords) Upsert(_ context.Context, record runtime.RuntimeRecord) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.store[record.GameID] = record
	return nil
}

func (s *memoryRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error {
	return errors.New("not used in start integration test")
}

func (s *memoryRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) {
	return nil, errors.New("not used in start integration test")
}

func (s *memoryRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
	return nil, errors.New("not used in start integration test")
}

type memoryOperationLogs struct {
	mu      sync.Mutex
	entries []operation.OperationEntry
}

func (s *memoryOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.entries = append(s.entries, entry)
	return int64(len(s.entries)), nil
}

func (s *memoryOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) {
	return nil, errors.New("not used in start integration test")
}

type memoryLeases struct{}

func (l *memoryLeases) TryAcquire(_ context.Context, _, _ string, _ time.Duration) (bool, error) {
	return true, nil
}

func (l *memoryLeases) Release(_ context.Context, _, _ string) error {
	return nil
}

type memoryHealthEvents struct{}

func (h *memoryHealthEvents) Publish(_ context.Context, _ ports.HealthEventEnvelope) error {
	return nil
}

type memoryNotifications struct{}

func (n *memoryNotifications) Publish(_ context.Context, _ notificationintent.Intent) error {
	return nil
}

// TestRoundTripStartJobThroughRealServiceAndPublisher exercises the
// Lobby → RTM → Lobby contract end-to-end inside one process: an XADD
// in the documented `runtime:start_jobs` shape is consumed, the real
// `startruntime.Service` runs against an in-memory fake stack and a
// gomock-backed Docker port, the real `jobresultspublisher` writes to
// `runtime:job_results`, and the test asserts the symmetric wire shape.
//
// A second XADD of the same envelope must surface as
// `error_code=replay_no_op` per the AsyncAPI replay-safety rule.
func TestRoundTripStartJobThroughRealServiceAndPublisher(t *testing.T) {
	ctrl := gomock.NewController(t)
	t.Cleanup(ctrl.Finish)

	server := miniredis.RunT(t)
	client := redis.NewClient(&redis.Options{Addr: server.Addr()})
	t.Cleanup(func() { _ = client.Close() })

	now := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)
	records := newMemoryRecords()
	dockerMock := mocks.NewMockDockerClient(ctrl)

	dockerMock.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil).Times(1)
	dockerMock.EXPECT().PullImage(gomock.Any(), "galaxy/game:1.0.0", ports.PullPolicy(config.ImagePullPolicyIfMissing)).Return(nil).Times(1)
	dockerMock.EXPECT().InspectImage(gomock.Any(), "galaxy/game:1.0.0").Return(ports.ImageInspect{
		Ref:    "galaxy/game:1.0.0",
		Labels: map[string]string{},
	}, nil).Times(1)
	dockerMock.EXPECT().Run(gomock.Any(), gomock.Any()).Return(ports.RunResult{
		ContainerID:    "ctr-roundtrip",
		EngineEndpoint: "http://galaxy-game-game-1:8080",
		StartedAt:      now,
	}, nil).Times(1)

	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)

	containerCfg := config.ContainerConfig{
		DefaultCPUQuota:      1.0,
		DefaultMemory:        "512m",
		DefaultPIDsLimit:     512,
		StopTimeout:          30 * time.Second,
		Retention:            30 * 24 * time.Hour,
		EngineStateMountPath: "/var/lib/galaxy-game",
		EngineStateEnvName:   "GAME_STATE_PATH",
		GameStateDirMode:     0o750,
		GameStateRoot:        "/var/lib/galaxy/games",
	}
	dockerCfg := config.DockerConfig{
		Host:       "unix:///var/run/docker.sock",
		Network:    "galaxy-net",
		LogDriver:  "json-file",
		PullPolicy: config.ImagePullPolicyIfMissing,
	}
	coordinationCfg := config.CoordinationConfig{GameLeaseTTL: time.Minute}

	startService, err := startruntime.NewService(startruntime.Dependencies{
		RuntimeRecords: records,
		OperationLogs:  &memoryOperationLogs{},
		Docker:         dockerMock,
		Leases:         &memoryLeases{},
		HealthEvents:   &memoryHealthEvents{},
		Notifications:  &memoryNotifications{},
		Container:      containerCfg,
		DockerCfg:      dockerCfg,
		Coordination:   coordinationCfg,
		Telemetry:      telemetryRuntime,
		Logger:         silentLogger(),
		Clock:          func() time.Time { return now },
		NewToken:       func() string { return "token-roundtrip" },
		PrepareStateDir: func(_ string) (string, error) {
			return "/var/lib/galaxy/games/game-1", nil
		},
	})
	require.NoError(t, err)

	publisher, err := jobresultspublisher.NewPublisher(jobresultspublisher.Config{
		Client: client,
		Stream: "runtime:job_results",
	})
	require.NoError(t, err)

	offsetStore, err := streamoffsets.New(streamoffsets.Config{Client: client})
	require.NoError(t, err)

	consumer, err := startjobsconsumer.NewConsumer(startjobsconsumer.Config{
		Client:       client,
		Stream:       "runtime:start_jobs",
		BlockTimeout: 50 * time.Millisecond,
		StartService: startService,
		JobResults:   publisher,
		OffsetStore:  offsetStore,
		Logger:       silentLogger(),
	})
	require.NoError(t, err)

	ctx, cancel := context.WithCancel(context.Background())
	t.Cleanup(cancel)

	done := make(chan error, 1)
	go func() { done <- consumer.Run(ctx) }()

	mustXAdd(t, client, "runtime:start_jobs", "game-1", "galaxy/game:1.0.0", 1700)

	require.Eventually(t, func() bool {
		entries, err := client.XRange(ctx, "runtime:job_results", "-", "+").Result()
		return err == nil && len(entries) == 1
	}, 2*time.Second, 20*time.Millisecond, "first XADD must produce one job result entry")

	entries, err := client.XRange(ctx, "runtime:job_results", "-", "+").Result()
	require.NoError(t, err)
	require.Len(t, entries, 1)
	values := entries[0].Values
	assert.Equal(t, "game-1", values["game_id"])
	assert.Equal(t, "success", values["outcome"])
	assert.Equal(t, "ctr-roundtrip", values["container_id"])
	assert.Equal(t, "http://galaxy-game-game-1:8080", values["engine_endpoint"])
	assert.Equal(t, "", values["error_code"], "fresh start must publish empty error_code")
	assert.Equal(t, "", values["error_message"])

	// Replay: the same envelope must surface as success/replay_no_op
	// because the runtime record now reports `running` with the same
	// image_ref. The Docker mock has no further expectations, so a
	// second pull/run would fail the test.
	mustXAdd(t, client, "runtime:start_jobs", "game-1", "galaxy/game:1.0.0", 1701)

	require.Eventually(t, func() bool {
		entries, err := client.XRange(ctx, "runtime:job_results", "-", "+").Result()
		return err == nil && len(entries) == 2
	}, 2*time.Second, 20*time.Millisecond, "second XADD must produce a replay_no_op job result")

	entries, err = client.XRange(ctx, "runtime:job_results", "-", "+").Result()
	require.NoError(t, err)
	require.Len(t, entries, 2)
	replay := entries[1].Values
	assert.Equal(t, "game-1", replay["game_id"])
	assert.Equal(t, "success", replay["outcome"])
	assert.Equal(t, "ctr-roundtrip", replay["container_id"])
	assert.Equal(t, "http://galaxy-game-game-1:8080", replay["engine_endpoint"])
	assert.Equal(t, "replay_no_op", replay["error_code"])
	assert.Equal(t, "", replay["error_message"])

	cancel()
	select {
	case <-done:
	case <-time.After(time.Second):
		t.Fatal("consumer Run did not exit after context cancel")
	}
}