feat: game lobby service
This commit is contained in:
@@ -0,0 +1,564 @@
|
||||
// Package runtimejobresult implements the worker that consumes runtime
|
||||
// job results published by Runtime Manager and drives the second half of
|
||||
// the game start flow: persisting the runtime binding, calling
|
||||
// Game Master to register the running game, and transitioning the game
|
||||
// status to `running`, `paused`, or `start_failed` accordingly.
|
||||
//
|
||||
// Replay protection relies on the CAS-based UpdateStatus semantics: a
|
||||
// duplicate result event finds the game in a non-`starting` status and
|
||||
// the second pass becomes a no-op without any extra side effects. The
|
||||
// stream offset advances after each message so the consumer survives
|
||||
// restarts without re-emitting state changes.
|
||||
package runtimejobresult
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"galaxy/lobby/internal/domain/common"
|
||||
"galaxy/lobby/internal/domain/game"
|
||||
"galaxy/lobby/internal/logging"
|
||||
"galaxy/lobby/internal/ports"
|
||||
"galaxy/lobby/internal/telemetry"
|
||||
"galaxy/notificationintent"
|
||||
|
||||
"github.com/redis/go-redis/v9"
|
||||
)
|
||||
|
||||
// streamOffsetLabel identifies the runtime-job-results consumer in the
// stream offset store. The label stays stable when the underlying
// stream key is renamed via configuration, so persisted offsets survive
// a stream rename.
const streamOffsetLabel = "runtime_results"
|
||||
|
||||
// IntentPublisher publishes notification intents.
type IntentPublisher interface {
	// Publish emits one intent and returns its publisher-assigned id.
	Publish(ctx context.Context, intent notificationintent.Intent) (string, error)
}
|
||||
|
||||
// Config groups the dependencies used by Consumer. All fields except
// Clock, Logger, and Telemetry are required; NewConsumer rejects a
// Config with any required field missing.
type Config struct {
	// Client provides XREAD access to the runtime job results stream.
	Client *redis.Client

	// Stream stores the Redis Streams key consumed by the worker.
	Stream string

	// BlockTimeout bounds the blocking XREAD window. Must be positive.
	BlockTimeout time.Duration

	// Games persists the post-start game record updates.
	Games ports.GameStore

	// RuntimeManager publishes stop jobs in the orphan-container path.
	RuntimeManager ports.RuntimeManager

	// GMClient registers the running game with Game Master after a
	// successful binding persistence.
	GMClient ports.GMClient

	// Intents publishes the lobby.runtime_paused_after_start
	// notification when GM is unavailable.
	Intents IntentPublisher

	// OffsetStore persists the last successfully processed entry id.
	OffsetStore ports.StreamOffsetStore

	// Clock supplies the wall-clock used for status transition
	// timestamps. Defaults to time.Now when nil.
	Clock func() time.Time

	// Logger receives structured worker-level events. Defaults to
	// slog.Default when nil.
	Logger *slog.Logger

	// Telemetry records the `lobby.start_flow.outcomes` and
	// `lobby.game.transitions` counters per processed result. Optional;
	// nil disables metric emission.
	Telemetry *telemetry.Runtime
}
|
||||
|
||||
// Consumer drives the runtime-job-results processing loop.
//
// The fields mirror Config one-for-one; see Config for per-dependency
// documentation. Construct with NewConsumer — the zero value is not
// usable.
type Consumer struct {
	client         *redis.Client
	stream         string
	blockTimeout   time.Duration
	games          ports.GameStore
	runtimeManager ports.RuntimeManager
	gmClient       ports.GMClient
	intents        IntentPublisher
	offsetStore    ports.StreamOffsetStore
	clock          func() time.Time
	logger         *slog.Logger
	telemetry      *telemetry.Runtime // may be nil: metrics disabled
}
|
||||
|
||||
// NewConsumer constructs one Consumer from cfg.
|
||||
func NewConsumer(cfg Config) (*Consumer, error) {
|
||||
switch {
|
||||
case cfg.Client == nil:
|
||||
return nil, errors.New("new runtime job result consumer: nil redis client")
|
||||
case strings.TrimSpace(cfg.Stream) == "":
|
||||
return nil, errors.New("new runtime job result consumer: stream must not be empty")
|
||||
case cfg.BlockTimeout <= 0:
|
||||
return nil, errors.New("new runtime job result consumer: block timeout must be positive")
|
||||
case cfg.Games == nil:
|
||||
return nil, errors.New("new runtime job result consumer: nil game store")
|
||||
case cfg.RuntimeManager == nil:
|
||||
return nil, errors.New("new runtime job result consumer: nil runtime manager")
|
||||
case cfg.GMClient == nil:
|
||||
return nil, errors.New("new runtime job result consumer: nil gm client")
|
||||
case cfg.Intents == nil:
|
||||
return nil, errors.New("new runtime job result consumer: nil intent publisher")
|
||||
case cfg.OffsetStore == nil:
|
||||
return nil, errors.New("new runtime job result consumer: nil offset store")
|
||||
}
|
||||
clock := cfg.Clock
|
||||
if clock == nil {
|
||||
clock = time.Now
|
||||
}
|
||||
logger := cfg.Logger
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
return &Consumer{
|
||||
client: cfg.Client,
|
||||
stream: cfg.Stream,
|
||||
blockTimeout: cfg.BlockTimeout,
|
||||
games: cfg.Games,
|
||||
runtimeManager: cfg.RuntimeManager,
|
||||
gmClient: cfg.GMClient,
|
||||
intents: cfg.Intents,
|
||||
offsetStore: cfg.OffsetStore,
|
||||
clock: clock,
|
||||
logger: logger.With("worker", "lobby.runtimejobresult", "stream", cfg.Stream),
|
||||
telemetry: cfg.Telemetry,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Run drives the XREAD loop until ctx is cancelled. Per-message
|
||||
// outcomes are absorbed by HandleMessage; the loop only exits on
|
||||
// context cancellation or a fatal Redis error.
|
||||
func (consumer *Consumer) Run(ctx context.Context) error {
|
||||
if consumer == nil || consumer.client == nil {
|
||||
return errors.New("run runtime job result consumer: nil consumer")
|
||||
}
|
||||
if ctx == nil {
|
||||
return errors.New("run runtime job result consumer: nil context")
|
||||
}
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
lastID, found, err := consumer.offsetStore.Load(ctx, streamOffsetLabel)
|
||||
if err != nil {
|
||||
return fmt.Errorf("run runtime job result consumer: load offset: %w", err)
|
||||
}
|
||||
if !found {
|
||||
lastID = "0-0"
|
||||
}
|
||||
|
||||
consumer.logger.Info("runtime job result consumer started", "block_timeout", consumer.blockTimeout.String(), "start_entry_id", lastID)
|
||||
defer consumer.logger.Info("runtime job result consumer stopped")
|
||||
|
||||
for {
|
||||
streams, err := consumer.client.XRead(ctx, &redis.XReadArgs{
|
||||
Streams: []string{consumer.stream, lastID},
|
||||
Count: 1,
|
||||
Block: consumer.blockTimeout,
|
||||
}).Result()
|
||||
switch {
|
||||
case err == nil:
|
||||
for _, stream := range streams {
|
||||
for _, message := range stream.Messages {
|
||||
consumer.HandleMessage(ctx, message)
|
||||
if err := consumer.offsetStore.Save(ctx, streamOffsetLabel, message.ID); err != nil {
|
||||
return fmt.Errorf("run runtime job result consumer: save offset: %w", err)
|
||||
}
|
||||
lastID = message.ID
|
||||
}
|
||||
}
|
||||
case errors.Is(err, redis.Nil):
|
||||
continue
|
||||
case ctx.Err() != nil && (errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) || errors.Is(err, redis.ErrClosed)):
|
||||
return ctx.Err()
|
||||
case errors.Is(err, context.Canceled), errors.Is(err, context.DeadlineExceeded), errors.Is(err, redis.ErrClosed):
|
||||
return fmt.Errorf("run runtime job result consumer: %w", err)
|
||||
default:
|
||||
return fmt.Errorf("run runtime job result consumer: %w", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Shutdown is a no-op; the consumer relies on context cancellation.
|
||||
func (consumer *Consumer) Shutdown(ctx context.Context) error {
|
||||
if ctx == nil {
|
||||
return errors.New("shutdown runtime job result consumer: nil context")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// HandleMessage processes one Redis Stream message. Exported so tests
|
||||
// can drive the consumer deterministically without spinning up a real
|
||||
// XREAD loop.
|
||||
//
|
||||
// Per-message errors are logged and absorbed: the worker keeps running
|
||||
// and the offset is allowed to advance. CAS-status conflicts (typical
|
||||
// for replayed events) are also absorbed.
|
||||
func (consumer *Consumer) HandleMessage(ctx context.Context, message redis.XMessage) {
|
||||
if consumer == nil {
|
||||
return
|
||||
}
|
||||
|
||||
event, err := decodeJobResult(message)
|
||||
if err != nil {
|
||||
consumer.logger.WarnContext(ctx, "decode runtime job result",
|
||||
"stream_entry_id", message.ID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
switch event.Outcome {
|
||||
case outcomeFailure:
|
||||
consumer.handleFailure(ctx, message.ID, event)
|
||||
case outcomeSuccess:
|
||||
consumer.handleSuccess(ctx, message.ID, event)
|
||||
default:
|
||||
consumer.logger.WarnContext(ctx, "unknown runtime job outcome",
|
||||
"stream_entry_id", message.ID,
|
||||
"outcome", event.Outcome,
|
||||
"game_id", event.GameID.String(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// handleFailure transitions the game from `starting` to `start_failed`.
|
||||
// The CAS-status update absorbs replays naturally: if the game is no
|
||||
// longer in `starting`, the second call returns ErrConflict /
|
||||
// ErrInvalidTransition and the worker treats it as an already-handled
|
||||
// duplicate.
|
||||
func (consumer *Consumer) handleFailure(ctx context.Context, entryID string, event jobResultEvent) {
|
||||
at := consumer.clock().UTC()
|
||||
err := consumer.games.UpdateStatus(ctx, ports.UpdateStatusInput{
|
||||
GameID: event.GameID,
|
||||
ExpectedFrom: game.StatusStarting,
|
||||
To: game.StatusStartFailed,
|
||||
Trigger: game.TriggerRuntimeEvent,
|
||||
At: at,
|
||||
})
|
||||
switch {
|
||||
case err == nil:
|
||||
consumer.telemetry.RecordGameTransition(ctx,
|
||||
string(game.StatusStarting),
|
||||
string(game.StatusStartFailed),
|
||||
string(game.TriggerRuntimeEvent),
|
||||
)
|
||||
consumer.telemetry.RecordStartFlowOutcome(ctx, "start_failed")
|
||||
logArgs := []any{
|
||||
"stream_entry_id", entryID,
|
||||
"game_id", event.GameID.String(),
|
||||
"from_status", string(game.StatusStarting),
|
||||
"to_status", string(game.StatusStartFailed),
|
||||
"trigger", string(game.TriggerRuntimeEvent),
|
||||
"error_code", event.ErrorCode,
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
consumer.logger.InfoContext(ctx, "game start failed", logArgs...)
|
||||
case errors.Is(err, game.ErrConflict), errors.Is(err, game.ErrInvalidTransition):
|
||||
consumer.logger.InfoContext(ctx, "ignored runtime failure for game not in starting",
|
||||
"stream_entry_id", entryID,
|
||||
"game_id", event.GameID.String(),
|
||||
)
|
||||
default:
|
||||
consumer.logger.WarnContext(ctx, "transition game to start_failed",
|
||||
"stream_entry_id", entryID,
|
||||
"game_id", event.GameID.String(),
|
||||
"err", err.Error(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// handleSuccess applies the success-path steps in strict order:
// (1) validate the event payload, (2) re-read the game record,
// (3) persist the runtime binding, (4) register the game with Game
// Master, (5) transition the status to `running`. Any failure branches
// into the orphan-container or paused-after-start paths.
//
// Replay protection: the worker re-reads the game record up front. If
// the record is no longer in `starting`, the event is treated as an
// already-handled replay and processing exits without further side
// effects (no binding overwrite, no GM call, no status transition).
func (consumer *Consumer) handleSuccess(ctx context.Context, entryID string, event jobResultEvent) {
	// One timestamp for the whole pass so binding and transition agree.
	at := consumer.clock().UTC()

	// Step 1: a success event must carry container_id and engine_endpoint.
	if err := event.validateSuccess(); err != nil {
		consumer.logger.WarnContext(ctx, "invalid runtime job success event",
			"stream_entry_id", entryID,
			"game_id", event.GameID.String(),
			"err", err.Error(),
		)
		return
	}

	// Step 2: load the record and short-circuit replays.
	record, err := consumer.games.Get(ctx, event.GameID)
	if err != nil {
		consumer.logger.WarnContext(ctx, "load game for runtime success",
			"stream_entry_id", entryID,
			"game_id", event.GameID.String(),
			"err", err.Error(),
		)
		return
	}
	if record.Status != game.StatusStarting {
		consumer.logger.InfoContext(ctx, "ignored runtime success for game not in starting",
			"stream_entry_id", entryID,
			"game_id", record.GameID.String(),
			"current_status", string(record.Status),
		)
		return
	}

	// Step 3: persist the binding before anything else; failure here
	// means the container runs without lobby-side metadata (orphan).
	binding := game.RuntimeBinding{
		ContainerID:    event.ContainerID,
		EngineEndpoint: event.EngineEndpoint,
		RuntimeJobID:   entryID,
		BoundAt:        at,
	}
	if err := consumer.games.UpdateRuntimeBinding(ctx, ports.UpdateRuntimeBindingInput{
		GameID:  event.GameID,
		Binding: binding,
		At:      at,
	}); err != nil {
		consumer.handleOrphan(ctx, entryID, event, at, err)
		return
	}

	// Step 4: register with Game Master; failure parks the game in
	// `paused` via the paused-after-start path.
	gmErr := consumer.gmClient.RegisterGame(ctx, ports.RegisterGameRequest{
		GameID:              record.GameID,
		ContainerID:         binding.ContainerID,
		EngineEndpoint:      binding.EngineEndpoint,
		TargetEngineVersion: record.TargetEngineVersion,
		TurnSchedule:        record.TurnSchedule,
	})
	if gmErr != nil {
		consumer.handleGMUnavailable(ctx, entryID, record, at, gmErr)
		return
	}

	// Step 5: CAS transition to `running`; conflicts mean another pass
	// already completed the transition (replay) and are absorbed.
	if err := consumer.games.UpdateStatus(ctx, ports.UpdateStatusInput{
		GameID:       record.GameID,
		ExpectedFrom: game.StatusStarting,
		To:           game.StatusRunning,
		Trigger:      game.TriggerRuntimeEvent,
		At:           at,
	}); err != nil {
		switch {
		case errors.Is(err, game.ErrConflict), errors.Is(err, game.ErrInvalidTransition):
			consumer.logger.InfoContext(ctx, "ignored running transition for game not in starting",
				"stream_entry_id", entryID,
				"game_id", record.GameID.String(),
			)
		default:
			consumer.logger.WarnContext(ctx, "transition game to running",
				"stream_entry_id", entryID,
				"game_id", record.GameID.String(),
				"err", err.Error(),
			)
		}
		return
	}

	// Metrics and the success log fire only after the full pipeline.
	consumer.telemetry.RecordGameTransition(ctx,
		string(game.StatusStarting),
		string(game.StatusRunning),
		string(game.TriggerRuntimeEvent),
	)
	consumer.telemetry.RecordStartFlowOutcome(ctx, "running")

	logArgs := []any{
		"stream_entry_id", entryID,
		"game_id", record.GameID.String(),
		"from_status", string(game.StatusStarting),
		"to_status", string(game.StatusRunning),
		"trigger", string(game.TriggerRuntimeEvent),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	consumer.logger.InfoContext(ctx, "game running after runtime registration", logArgs...)
}
|
||||
|
||||
// handleOrphan implements the orphan-container path: the container
|
||||
// started but Lobby could not persist the binding metadata. We publish
|
||||
// a stop job to Runtime Manager and transition the game to
|
||||
// `start_failed`. Stop-job dispatch is attempted before the status
|
||||
// transition so a process crash between the two leaves the stop-job
|
||||
// safely re-published on replay (Runtime Manager idempotency is
|
||||
// required).
|
||||
func (consumer *Consumer) handleOrphan(ctx context.Context, entryID string, event jobResultEvent, at time.Time, cause error) {
|
||||
consumer.logger.WarnContext(ctx, "persist runtime binding failed; orphan container path",
|
||||
"stream_entry_id", entryID,
|
||||
"game_id", event.GameID.String(),
|
||||
"err", cause.Error(),
|
||||
)
|
||||
if err := consumer.runtimeManager.PublishStopJob(ctx, event.GameID.String()); err != nil {
|
||||
consumer.logger.WarnContext(ctx, "publish stop job for orphan container",
|
||||
"stream_entry_id", entryID,
|
||||
"game_id", event.GameID.String(),
|
||||
"err", err.Error(),
|
||||
)
|
||||
}
|
||||
if err := consumer.games.UpdateStatus(ctx, ports.UpdateStatusInput{
|
||||
GameID: event.GameID,
|
||||
ExpectedFrom: game.StatusStarting,
|
||||
To: game.StatusStartFailed,
|
||||
Trigger: game.TriggerRuntimeEvent,
|
||||
At: at,
|
||||
}); err != nil && !errors.Is(err, game.ErrConflict) && !errors.Is(err, game.ErrInvalidTransition) {
|
||||
consumer.logger.WarnContext(ctx, "transition orphan game to start_failed",
|
||||
"stream_entry_id", entryID,
|
||||
"game_id", event.GameID.String(),
|
||||
"err", err.Error(),
|
||||
)
|
||||
return
|
||||
}
|
||||
consumer.telemetry.RecordGameTransition(ctx,
|
||||
string(game.StatusStarting),
|
||||
string(game.StatusStartFailed),
|
||||
string(game.TriggerRuntimeEvent),
|
||||
)
|
||||
consumer.telemetry.RecordStartFlowOutcome(ctx, "start_failed")
|
||||
}
|
||||
|
||||
// handleGMUnavailable implements the paused-after-start path: the
// container is alive but Game Master could not be registered. The game
// is moved to `paused` and an admin notification is published.
//
// Order matters: the CAS transition to `paused` runs first; metrics and
// the intent are emitted only when the transition succeeds, so a
// replayed event (conflict) produces neither duplicate counters nor a
// duplicate notification.
func (consumer *Consumer) handleGMUnavailable(ctx context.Context, entryID string, record game.Game, at time.Time, cause error) {
	consumer.logger.WarnContext(ctx, "gm registration failed; pause-after-start path",
		"stream_entry_id", entryID,
		"game_id", record.GameID.String(),
		"err", cause.Error(),
	)
	if err := consumer.games.UpdateStatus(ctx, ports.UpdateStatusInput{
		GameID:       record.GameID,
		ExpectedFrom: game.StatusStarting,
		To:           game.StatusPaused,
		Trigger:      game.TriggerRuntimeEvent,
		At:           at,
	}); err != nil {
		switch {
		case errors.Is(err, game.ErrConflict), errors.Is(err, game.ErrInvalidTransition):
			// Replay: the game already left `starting`.
			consumer.logger.InfoContext(ctx, "ignored paused transition for game not in starting",
				"stream_entry_id", entryID,
				"game_id", record.GameID.String(),
			)
		default:
			consumer.logger.WarnContext(ctx, "transition game to paused",
				"stream_entry_id", entryID,
				"game_id", record.GameID.String(),
				"err", err.Error(),
			)
		}
		return
	}

	consumer.telemetry.RecordGameTransition(ctx,
		string(game.StatusStarting),
		string(game.StatusPaused),
		string(game.TriggerRuntimeEvent),
	)
	consumer.telemetry.RecordStartFlowOutcome(ctx, "paused")

	// The idempotency key is derived from the stream entry id, so a
	// re-published intent for the same entry deduplicates downstream.
	intent, err := notificationintent.NewLobbyRuntimePausedAfterStartIntent(
		notificationintent.Metadata{
			IdempotencyKey: "lobby.runtime_paused_after_start:" + entryID,
			OccurredAt:     at,
		},
		notificationintent.LobbyRuntimePausedAfterStartPayload{
			GameID:   record.GameID.String(),
			GameName: record.GameName,
		},
	)
	if err != nil {
		consumer.logger.ErrorContext(ctx, "build runtime paused intent",
			"stream_entry_id", entryID,
			"game_id", record.GameID.String(),
			"err", err.Error(),
		)
		return
	}
	// Publish failure is logged only: the game is already paused and
	// the status change is the authoritative outcome.
	if _, err := consumer.intents.Publish(ctx, intent); err != nil {
		consumer.logger.WarnContext(ctx, "publish runtime paused intent",
			"stream_entry_id", entryID,
			"game_id", record.GameID.String(),
			"err", err.Error(),
		)
	}
}
|
||||
|
||||
// outcomeSuccess and outcomeFailure are the two accepted values of the
// runtime job result `outcome` field; decodeJobResult rejects anything
// else.
const (
	outcomeSuccess = "success"
	outcomeFailure = "failure"
)
|
||||
|
||||
// jobResultEvent stores the decoded shape of one runtime:job_results
// stream entry.
type jobResultEvent struct {
	GameID         common.GameID // target game; validated during decode
	Outcome        string        // outcomeSuccess or outcomeFailure
	ContainerID    string        // required on success (validateSuccess)
	EngineEndpoint string        // required on success (validateSuccess)
	ErrorCode      string        // optional; logged on the failure path
	ErrorMessage   string        // optional; decoded but not logged — presumably for future use
}
|
||||
|
||||
func (event jobResultEvent) validateSuccess() error {
|
||||
if strings.TrimSpace(event.ContainerID) == "" {
|
||||
return errors.New("success event missing container_id")
|
||||
}
|
||||
if strings.TrimSpace(event.EngineEndpoint) == "" {
|
||||
return errors.New("success event missing engine_endpoint")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func decodeJobResult(message redis.XMessage) (jobResultEvent, error) {
|
||||
gameIDRaw := optionalString(message.Values, "game_id")
|
||||
if strings.TrimSpace(gameIDRaw) == "" {
|
||||
return jobResultEvent{}, errors.New("missing game_id")
|
||||
}
|
||||
gameID := common.GameID(gameIDRaw)
|
||||
if err := gameID.Validate(); err != nil {
|
||||
return jobResultEvent{}, fmt.Errorf("invalid game_id: %w", err)
|
||||
}
|
||||
outcome := optionalString(message.Values, "outcome")
|
||||
if outcome != outcomeSuccess && outcome != outcomeFailure {
|
||||
return jobResultEvent{}, fmt.Errorf("unsupported outcome %q", outcome)
|
||||
}
|
||||
|
||||
return jobResultEvent{
|
||||
GameID: gameID,
|
||||
Outcome: outcome,
|
||||
ContainerID: optionalString(message.Values, "container_id"),
|
||||
EngineEndpoint: optionalString(message.Values, "engine_endpoint"),
|
||||
ErrorCode: optionalString(message.Values, "error_code"),
|
||||
ErrorMessage: optionalString(message.Values, "error_message"),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// optionalString returns the value stored under key as a string.
// Missing keys and values that are neither string nor []byte yield "".
func optionalString(values map[string]any, key string) string {
	if text, ok := values[key].(string); ok {
		return text
	}
	if data, ok := values[key].([]byte); ok {
		return string(data)
	}
	return ""
}
|
||||
@@ -0,0 +1,372 @@
|
||||
package runtimejobresult_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"io"
|
||||
"log/slog"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"galaxy/lobby/internal/adapters/gamestub"
|
||||
"galaxy/lobby/internal/adapters/gmclientstub"
|
||||
"galaxy/lobby/internal/adapters/intentpubstub"
|
||||
"galaxy/lobby/internal/adapters/runtimemanagerstub"
|
||||
"galaxy/lobby/internal/adapters/streamoffsetstub"
|
||||
"galaxy/lobby/internal/domain/common"
|
||||
"galaxy/lobby/internal/domain/game"
|
||||
"galaxy/lobby/internal/ports"
|
||||
"galaxy/lobby/internal/worker/runtimejobresult"
|
||||
"galaxy/notificationintent"
|
||||
|
||||
"github.com/alicebob/miniredis/v2"
|
||||
"github.com/redis/go-redis/v9"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// silentLogger returns a logger whose output is discarded, keeping
// test output free of worker log lines.
func silentLogger() *slog.Logger {
	handler := slog.NewTextHandler(io.Discard, nil)
	return slog.New(handler)
}
|
||||
|
||||
// harness bundles the stubbed dependencies, the miniredis-backed
// client, and the pre-seeded game record shared by the worker tests.
type harness struct {
	games       *gamestub.Store
	runtime     *runtimemanagerstub.Publisher
	gm          *gmclientstub.Client
	intents     *intentpubstub.Publisher
	offsets     *streamoffsetstub.Store
	consumer    *runtimejobresult.Consumer
	server      *miniredis.Miniredis
	clientRedis *redis.Client
	stream      string
	at          time.Time // fixed clock value injected into the consumer
	gameRecord  game.Game // seeded in `starting` status by newHarness
}
|
||||
|
||||
// newHarness builds a fully wired harness: miniredis + go-redis client,
// stub dependencies, one game record saved in `starting` status, and a
// consumer whose clock is pinned to h.at.
func newHarness(t *testing.T) *harness {
	t.Helper()
	server := miniredis.RunT(t)
	clientRedis := redis.NewClient(&redis.Options{Addr: server.Addr()})
	t.Cleanup(func() { _ = clientRedis.Close() })

	games := gamestub.NewStore()
	runtime := runtimemanagerstub.NewPublisher()
	gm := gmclientstub.NewClient()
	intents := intentpubstub.NewPublisher()
	offsets := streamoffsetstub.NewStore()
	// Fixed "now" for deterministic BoundAt/StartedAt assertions.
	at := time.Date(2026, 4, 25, 13, 0, 0, 0, time.UTC)

	h := &harness{
		games:       games,
		runtime:     runtime,
		gm:          gm,
		intents:     intents,
		offsets:     offsets,
		server:      server,
		clientRedis: clientRedis,
		stream:      "runtime:job_results",
		at:          at,
	}

	// Seed one valid game an hour "before" the pinned clock, then force
	// it into `starting` — the status the worker expects to find.
	now := at.Add(-time.Hour)
	record, err := game.New(game.NewGameInput{
		GameID:              common.GameID("game-w"),
		GameName:            "test worker game",
		GameType:            game.GameTypePublic,
		MinPlayers:          4,
		MaxPlayers:          8,
		StartGapHours:       12,
		StartGapPlayers:     2,
		EnrollmentEndsAt:    now.Add(24 * time.Hour),
		TurnSchedule:        "0 18 * * *",
		TargetEngineVersion: "v1.0.0",
		Now:                 now,
	})
	require.NoError(t, err)
	record.Status = game.StatusStarting
	require.NoError(t, games.Save(context.Background(), record))
	h.gameRecord = record

	consumer, err := runtimejobresult.NewConsumer(runtimejobresult.Config{
		Client:         clientRedis,
		Stream:         h.stream,
		BlockTimeout:   100 * time.Millisecond,
		Games:          games,
		RuntimeManager: runtime,
		GMClient:       gm,
		Intents:        intents,
		OffsetStore:    offsets,
		Clock:          func() time.Time { return at },
		Logger:         silentLogger(),
	})
	require.NoError(t, err)
	h.consumer = consumer

	return h
}
|
||||
|
||||
func successMessage(t *testing.T, h *harness, id string) redis.XMessage {
|
||||
t.Helper()
|
||||
return redis.XMessage{
|
||||
ID: id,
|
||||
Values: map[string]any{
|
||||
"game_id": h.gameRecord.GameID.String(),
|
||||
"outcome": "success",
|
||||
"container_id": "container-1",
|
||||
"engine_endpoint": "engine.local:9000",
|
||||
"completed_at_ms": "1745581200000",
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func failureMessage(t *testing.T, h *harness, id string) redis.XMessage {
|
||||
t.Helper()
|
||||
return redis.XMessage{
|
||||
ID: id,
|
||||
Values: map[string]any{
|
||||
"game_id": h.gameRecord.GameID.String(),
|
||||
"outcome": "failure",
|
||||
"error_code": "image_pull_failed",
|
||||
"error_message": "registry unreachable",
|
||||
"completed_at_ms": "1745581200000",
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// TestNewConsumerRejectsMissingDeps verifies that NewConsumer fails
// fast when the redis client or the stream key is missing.
func TestNewConsumerRejectsMissingDeps(t *testing.T) {
	server := miniredis.RunT(t)
	client := redis.NewClient(&redis.Options{Addr: server.Addr()})
	t.Cleanup(func() { _ = client.Close() })

	// Missing Client.
	_, err := runtimejobresult.NewConsumer(runtimejobresult.Config{
		Stream:       "runtime:job_results",
		BlockTimeout: time.Second,
	})
	require.Error(t, err)

	// Missing Stream.
	_, err = runtimejobresult.NewConsumer(runtimejobresult.Config{
		Client:       client,
		BlockTimeout: time.Second,
	})
	require.Error(t, err)
}
|
||||
|
||||
// TestHandleSuccessTransitionsToRunning covers the happy path: binding
// persisted, GM register-game called once with the binding data, and
// the game transitioned to `running` with StartedAt at the pinned clock.
func TestHandleSuccessTransitionsToRunning(t *testing.T) {
	h := newHarness(t)
	h.consumer.HandleMessage(context.Background(), successMessage(t, h, "1700000000000-0"))

	got, err := h.games.Get(context.Background(), h.gameRecord.GameID)
	require.NoError(t, err)
	assert.Equal(t, game.StatusRunning, got.Status)
	require.NotNil(t, got.RuntimeBinding)
	assert.Equal(t, "container-1", got.RuntimeBinding.ContainerID)
	assert.Equal(t, "engine.local:9000", got.RuntimeBinding.EngineEndpoint)
	// RuntimeJobID records the stream entry id of the success event.
	assert.Equal(t, "1700000000000-0", got.RuntimeBinding.RuntimeJobID)
	require.NotNil(t, got.StartedAt)
	assert.True(t, got.StartedAt.Equal(h.at))

	require.Len(t, h.gm.Requests(), 1)
	req := h.gm.Requests()[0]
	assert.Equal(t, h.gameRecord.GameID, req.GameID)
	assert.Equal(t, "container-1", req.ContainerID)
	assert.Equal(t, "engine.local:9000", req.EngineEndpoint)
	assert.Equal(t, h.gameRecord.TargetEngineVersion, req.TargetEngineVersion)
	assert.Equal(t, h.gameRecord.TurnSchedule, req.TurnSchedule)

	// No failure-path side effects on the happy path.
	assert.Empty(t, h.runtime.StopJobs())
	assert.Empty(t, h.intents.Published())
}
|
||||
|
||||
// TestHandleSuccessGMUnavailableMovesToPausedAndPublishesIntent covers
// the paused-after-start path: GM registration fails, the game is
// paused (binding kept), and exactly one admin intent is published.
func TestHandleSuccessGMUnavailableMovesToPausedAndPublishesIntent(t *testing.T) {
	h := newHarness(t)
	h.gm.SetError(ports.ErrGMUnavailable)

	h.consumer.HandleMessage(context.Background(), successMessage(t, h, "1700000000001-0"))

	got, err := h.games.Get(context.Background(), h.gameRecord.GameID)
	require.NoError(t, err)
	assert.Equal(t, game.StatusPaused, got.Status)
	require.NotNil(t, got.RuntimeBinding, "binding still persisted before paused")

	published := h.intents.Published()
	require.Len(t, published, 1)
	assert.Equal(t, notificationintent.NotificationTypeLobbyRuntimePausedAfterStart, published[0].NotificationType)
	// GM unavailability does not stop the container.
	assert.Empty(t, h.runtime.StopJobs())
}
|
||||
|
||||
// TestHandleFailureTransitionsToStartFailed covers the failure outcome:
// the game moves to `start_failed` with no binding, no stop job, no GM
// call, and no notification.
func TestHandleFailureTransitionsToStartFailed(t *testing.T) {
	h := newHarness(t)
	h.consumer.HandleMessage(context.Background(), failureMessage(t, h, "1700000000002-0"))

	got, err := h.games.Get(context.Background(), h.gameRecord.GameID)
	require.NoError(t, err)
	assert.Equal(t, game.StatusStartFailed, got.Status)
	assert.Nil(t, got.RuntimeBinding)
	assert.Empty(t, h.runtime.StopJobs())
	assert.Empty(t, h.gm.Requests())
	assert.Empty(t, h.intents.Published())
}
|
||||
|
||||
// TestHandleSuccessOrphanContainerWhenBindingFails covers the
// orphan-container path: binding persistence fails, so a stop job is
// published and the game moves to `start_failed` without a binding.
func TestHandleSuccessOrphanContainerWhenBindingFails(t *testing.T) {
	h := newHarness(t)
	// Force binding update to fail by removing the game record from
	// the store before the message lands.
	require.NoError(t, h.games.Save(context.Background(), h.gameRecord))
	failingGames := &fakeBindingFailer{Store: h.games, err: errors.New("redis tx failed")}

	// Rebuild the consumer around the failing store; all other stubs
	// are shared with the harness so assertions below still apply.
	consumer, err := runtimejobresult.NewConsumer(runtimejobresult.Config{
		Client:         h.clientRedis,
		Stream:         h.stream,
		BlockTimeout:   100 * time.Millisecond,
		Games:          failingGames,
		RuntimeManager: h.runtime,
		GMClient:       h.gm,
		Intents:        h.intents,
		OffsetStore:    h.offsets,
		Clock:          func() time.Time { return h.at },
		Logger:         silentLogger(),
	})
	require.NoError(t, err)

	consumer.HandleMessage(context.Background(), successMessage(t, h, "1700000000003-0"))

	got, err := h.games.Get(context.Background(), h.gameRecord.GameID)
	require.NoError(t, err)
	assert.Equal(t, game.StatusStartFailed, got.Status,
		"orphan path must move game to start_failed")
	assert.Nil(t, got.RuntimeBinding, "binding never persisted")

	// Exactly one stop job for the orphaned container; GM and intents
	// are never reached on this path.
	assert.Equal(t, []string{h.gameRecord.GameID.String()}, h.runtime.StopJobs())
	assert.Empty(t, h.gm.Requests())
	assert.Empty(t, h.intents.Published())
}
|
||||
|
||||
// TestHandleSuccessReplayIsNoOp verifies the success-path replay
// guard: a second delivery of the same entry performs no mutation and
// no GM call once the game has left `starting`.
func TestHandleSuccessReplayIsNoOp(t *testing.T) {
	h := newHarness(t)
	h.consumer.HandleMessage(context.Background(), successMessage(t, h, "1700000000004-0"))
	require.Len(t, h.gm.Requests(), 1)

	got, err := h.games.Get(context.Background(), h.gameRecord.GameID)
	require.NoError(t, err)
	originalUpdatedAt := got.UpdatedAt

	// Replay the same event: status is already running, so the early
	// status check exits before any side-effect call (no binding
	// overwrite, no GM call, no transition).
	h.gm.SetError(errors.New("must not be called again"))
	h.consumer.HandleMessage(context.Background(), successMessage(t, h, "1700000000004-0"))

	require.Len(t, h.gm.Requests(), 1, "GM register-game is invoked once across replays")

	got, err = h.games.Get(context.Background(), h.gameRecord.GameID)
	require.NoError(t, err)
	assert.Equal(t, game.StatusRunning, got.Status)
	assert.True(t, got.UpdatedAt.Equal(originalUpdatedAt), "no further mutations on replay")
	assert.Empty(t, h.intents.Published())
}
|
||||
|
||||
// TestHandleFailureReplayIsNoOp verifies that replaying a failure
// event is absorbed by the CAS transition: the record is not touched a
// second time.
func TestHandleFailureReplayIsNoOp(t *testing.T) {
	h := newHarness(t)
	h.consumer.HandleMessage(context.Background(), failureMessage(t, h, "1700000000005-0"))
	got, err := h.games.Get(context.Background(), h.gameRecord.GameID)
	require.NoError(t, err)
	assert.Equal(t, game.StatusStartFailed, got.Status)
	originalUpdatedAt := got.UpdatedAt

	// Second delivery of the same entry.
	h.consumer.HandleMessage(context.Background(), failureMessage(t, h, "1700000000005-0"))

	got, err = h.games.Get(context.Background(), h.gameRecord.GameID)
	require.NoError(t, err)
	assert.Equal(t, game.StatusStartFailed, got.Status)
	assert.True(t, got.UpdatedAt.Equal(originalUpdatedAt), "no further mutations on replay")
}
|
||||
|
||||
// TestHandleMalformedEvents feeds a range of invalid entries and
// asserts that none of them mutate the game or trigger side effects.
func TestHandleMalformedEvents(t *testing.T) {
	h := newHarness(t)

	cases := []redis.XMessage{
		{ID: "1-0", Values: map[string]any{"outcome": "success"}},                                                                // missing game_id
		{ID: "1-1", Values: map[string]any{"game_id": "bogus", "outcome": "success"}},                                            // invalid game_id format
		{ID: "1-2", Values: map[string]any{"game_id": h.gameRecord.GameID.String(), "outcome": "weird"}},                         // bad outcome
		{ID: "1-3", Values: map[string]any{"game_id": h.gameRecord.GameID.String(), "outcome": "success"}},                       // missing container_id
		{ID: "1-4", Values: map[string]any{"game_id": h.gameRecord.GameID.String(), "outcome": "success", "container_id": "c"}}, // missing engine_endpoint
	}
	for _, msg := range cases {
		h.consumer.HandleMessage(context.Background(), msg)
	}

	got, err := h.games.Get(context.Background(), h.gameRecord.GameID)
	require.NoError(t, err)
	assert.Equal(t, game.StatusStarting, got.Status, "malformed events leave game untouched")
	assert.Empty(t, h.runtime.StopJobs())
	assert.Empty(t, h.gm.Requests())
}
|
||||
|
||||
// fakeBindingFailer wraps gamestub.Store and forces UpdateRuntimeBinding
// to fail; everything else delegates to the embedded store.
type fakeBindingFailer struct {
	*gamestub.Store
	err error // returned by every UpdateRuntimeBinding call
}

// UpdateRuntimeBinding always fails with the configured error, driving
// the orphan-container path in tests.
func (f *fakeBindingFailer) UpdateRuntimeBinding(_ context.Context, _ ports.UpdateRuntimeBindingInput) error {
	return f.err
}

// Compile-time check that the wrapper still satisfies ports.GameStore.
var _ ports.GameStore = (*fakeBindingFailer)(nil)
|
||||
|
||||
// TestRunDrainsStreamUntilCancelled exercises the real XREAD loop
// against miniredis: a pre-published success entry is consumed, the
// game transitions to running, the loop exits on cancellation, and the
// stream offset has been persisted.
func TestRunDrainsStreamUntilCancelled(t *testing.T) {
	h := newHarness(t)

	// Pre-publish a success message into the real miniredis stream
	// before Run starts.
	_, err := h.clientRedis.XAdd(context.Background(), &redis.XAddArgs{
		Stream: h.stream,
		Values: map[string]any{
			"game_id":         h.gameRecord.GameID.String(),
			"outcome":         "success",
			"container_id":    "container-2",
			"engine_endpoint": "engine.local:9001",
			"completed_at_ms": "1745581200000",
		},
	}).Result()
	require.NoError(t, err)

	// Overall test budget; the loop should finish far earlier.
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()

	done := make(chan error, 1)
	go func() {
		done <- h.consumer.Run(ctx)
	}()

	// Poll for the running transition; once observed cancel context.
	deadline := time.Now().Add(1500 * time.Millisecond)
	for time.Now().Before(deadline) {
		got, err := h.games.Get(context.Background(), h.gameRecord.GameID)
		require.NoError(t, err)
		if got.Status == game.StatusRunning {
			break
		}
		time.Sleep(20 * time.Millisecond)
	}

	cancel()
	select {
	case <-done:
	case <-time.After(2 * time.Second):
		t.Fatalf("consumer did not stop")
	}

	got, err := h.games.Get(context.Background(), h.gameRecord.GameID)
	require.NoError(t, err)
	assert.Equal(t, game.StatusRunning, got.Status)
	require.NotNil(t, got.RuntimeBinding)
	assert.Equal(t, "container-2", got.RuntimeBinding.ContainerID)

	// Offset must have been persisted at least once.
	id, found, err := h.offsets.Load(context.Background(), "runtime_results")
	require.NoError(t, err)
	assert.True(t, found)
	assert.NotEmpty(t, id)
}
|
||||
Reference in New Issue
Block a user