// Package runtimejobresult implements the worker that consumes runtime
// job results published by Runtime Manager and drives the second half of
// the game start flow: persisting the runtime binding, calling
// Game Master to register the running game, and transitioning the game
// status to `running`, `paused`, or `start_failed` accordingly.
//
// Replay protection relies on the CAS-based UpdateStatus semantics: a
// duplicate result event finds the game in a non-`starting` status and
// the second pass becomes a no-op without any extra side effects. The
// stream offset advances after each message so the consumer survives
// restarts without re-emitting state changes.
package runtimejobresult

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"strings"
	"time"

	"galaxy/lobby/internal/domain/common"
	"galaxy/lobby/internal/domain/game"
	"galaxy/lobby/internal/logging"
	"galaxy/lobby/internal/ports"
	"galaxy/lobby/internal/telemetry"
	"galaxy/notificationintent"
	"github.com/redis/go-redis/v9"
)

// streamOffsetLabel identifies the runtime-job-results consumer in the
// stream offset store. The label stays stable when the underlying
// stream key is renamed via configuration.
const streamOffsetLabel = "runtime_results"

// IntentPublisher publishes notification intents.
type IntentPublisher interface {
	Publish(ctx context.Context, intent notificationintent.Intent) (string, error)
}

// Config groups the dependencies used by Consumer.
type Config struct {
	// Client provides XREAD access to the runtime job results stream.
	Client *redis.Client

	// Stream stores the Redis Streams key consumed by the worker.
	Stream string

	// BlockTimeout bounds the blocking XREAD window.
	BlockTimeout time.Duration

	// Games persists the post-start game record updates.
	Games ports.GameStore

	// RuntimeManager publishes stop jobs in the orphan-container path.
	RuntimeManager ports.RuntimeManager

	// GMClient registers the running game with Game Master after a
	// successful binding persistence.
	GMClient ports.GMClient

	// Intents publishes the lobby.runtime_paused_after_start
	// notification when GM is unavailable.
	Intents IntentPublisher

	// OffsetStore persists the last successfully processed entry id.
	OffsetStore ports.StreamOffsetStore

	// Clock supplies the wall-clock used for status transition
	// timestamps. Defaults to time.Now when nil.
	Clock func() time.Time

	// Logger receives structured worker-level events. Defaults to
	// slog.Default when nil.
	Logger *slog.Logger

	// Telemetry records the `lobby.start_flow.outcomes` and
	// `lobby.game.transitions` counters per processed result. Optional;
	// nil disables metric emission.
	Telemetry *telemetry.Runtime
}

// Consumer drives the runtime-job-results processing loop.
type Consumer struct {
	client         *redis.Client
	stream         string
	blockTimeout   time.Duration
	games          ports.GameStore
	runtimeManager ports.RuntimeManager
	gmClient       ports.GMClient
	intents        IntentPublisher
	offsetStore    ports.StreamOffsetStore
	clock          func() time.Time
	logger         *slog.Logger
	telemetry      *telemetry.Runtime
}

// NewConsumer constructs one Consumer from cfg.
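//
// A minimal wiring sketch; the dependency values (redisClient, gameStore,
// runtimeMgr, gmClient, intentBus, offsetStore), the surrounding ctx, and
// the literal values are hypothetical placeholders, not identifiers from
// this repository:
//
//	consumer, err := NewConsumer(Config{
//		Client:         redisClient,
//		Stream:         "runtime:job_results",
//		BlockTimeout:   5 * time.Second,
//		Games:          gameStore,
//		RuntimeManager: runtimeMgr,
//		GMClient:       gmClient,
//		Intents:        intentBus,
//		OffsetStore:    offsetStore,
//	})
//	if err != nil {
//		return err
//	}
//	go func() {
//		// Run blocks until ctx is cancelled or Redis fails fatally.
//		if err := consumer.Run(ctx); err != nil && !errors.Is(err, context.Canceled) {
//			slog.Error("runtime job result consumer exited", "err", err)
//		}
//	}()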
func NewConsumer(cfg Config) (*Consumer, error) {
	switch {
	case cfg.Client == nil:
		return nil, errors.New("new runtime job result consumer: nil redis client")
	case strings.TrimSpace(cfg.Stream) == "":
		return nil, errors.New("new runtime job result consumer: stream must not be empty")
	case cfg.BlockTimeout <= 0:
		return nil, errors.New("new runtime job result consumer: block timeout must be positive")
	case cfg.Games == nil:
		return nil, errors.New("new runtime job result consumer: nil game store")
	case cfg.RuntimeManager == nil:
		return nil, errors.New("new runtime job result consumer: nil runtime manager")
	case cfg.GMClient == nil:
		return nil, errors.New("new runtime job result consumer: nil gm client")
	case cfg.Intents == nil:
		return nil, errors.New("new runtime job result consumer: nil intent publisher")
	case cfg.OffsetStore == nil:
		return nil, errors.New("new runtime job result consumer: nil offset store")
	}

	clock := cfg.Clock
	if clock == nil {
		clock = time.Now
	}
	logger := cfg.Logger
	if logger == nil {
		logger = slog.Default()
	}

	return &Consumer{
		client:         cfg.Client,
		stream:         cfg.Stream,
		blockTimeout:   cfg.BlockTimeout,
		games:          cfg.Games,
		runtimeManager: cfg.RuntimeManager,
		gmClient:       cfg.GMClient,
		intents:        cfg.Intents,
		offsetStore:    cfg.OffsetStore,
		clock:          clock,
		logger:         logger.With("worker", "lobby.runtimejobresult", "stream", cfg.Stream),
		telemetry:      cfg.Telemetry,
	}, nil
}

// Run drives the XREAD loop until ctx is cancelled. Per-message
// outcomes are absorbed by HandleMessage; the loop only exits on
// context cancellation or a fatal Redis error.
func (consumer *Consumer) Run(ctx context.Context) error {
	if consumer == nil || consumer.client == nil {
		return errors.New("run runtime job result consumer: nil consumer")
	}
	if ctx == nil {
		return errors.New("run runtime job result consumer: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}

	lastID, found, err := consumer.offsetStore.Load(ctx, streamOffsetLabel)
	if err != nil {
		return fmt.Errorf("run runtime job result consumer: load offset: %w", err)
	}
	if !found {
		lastID = "0-0"
	}

	consumer.logger.Info("runtime job result consumer started",
		"block_timeout", consumer.blockTimeout.String(),
		"start_entry_id", lastID,
	)
	defer consumer.logger.Info("runtime job result consumer stopped")

	for {
		streams, err := consumer.client.XRead(ctx, &redis.XReadArgs{
			Streams: []string{consumer.stream, lastID},
			Count:   1,
			Block:   consumer.blockTimeout,
		}).Result()
		switch {
		case err == nil:
			for _, stream := range streams {
				for _, message := range stream.Messages {
					consumer.HandleMessage(ctx, message)
					if err := consumer.offsetStore.Save(ctx, streamOffsetLabel, message.ID); err != nil {
						return fmt.Errorf("run runtime job result consumer: save offset: %w", err)
					}
					lastID = message.ID
				}
			}
		case errors.Is(err, redis.Nil):
			// The blocking window elapsed without new entries; poll again.
			continue
		case ctx.Err() != nil && (errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) || errors.Is(err, redis.ErrClosed)):
			// Shutdown was requested: surface the context error rather
			// than the transport error it manifests as.
			return ctx.Err()
		default:
			return fmt.Errorf("run runtime job result consumer: %w", err)
		}
	}
}

// Shutdown is a no-op; the consumer relies on context cancellation.
func (consumer *Consumer) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown runtime job result consumer: nil context")
	}
	return nil
}
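
// HandleMessage (below) can be driven directly in tests without the XREAD
// loop. A minimal sketch, assuming the consumer was built over fake port
// implementations; the entry ID and field values are illustrative only:
//
//	consumer.HandleMessage(ctx, redis.XMessage{
//		ID: "1700000000000-0",
//		Values: map[string]any{
//			"game_id":         "g-42",
//			"outcome":         "success",
//			"container_id":    "c-9f3",
//			"engine_endpoint": "10.0.0.5:7000",
//		},
//	})
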
// HandleMessage processes one Redis Stream message. Exported so tests
// can drive the consumer deterministically without spinning up a real
// XREAD loop.
//
// Per-message errors are logged and absorbed: the worker keeps running
// and the offset is allowed to advance. CAS-status conflicts (typical
// for replayed events) are also absorbed.
func (consumer *Consumer) HandleMessage(ctx context.Context, message redis.XMessage) {
	if consumer == nil {
		return
	}
	event, err := decodeJobResult(message)
	if err != nil {
		consumer.logger.WarnContext(ctx, "decode runtime job result",
			"stream_entry_id", message.ID,
			"err", err.Error(),
		)
		return
	}
	switch event.Outcome {
	case outcomeFailure:
		consumer.handleFailure(ctx, message.ID, event)
	case outcomeSuccess:
		consumer.handleSuccess(ctx, message.ID, event)
	default:
		consumer.logger.WarnContext(ctx, "unknown runtime job outcome",
			"stream_entry_id", message.ID,
			"outcome", event.Outcome,
			"game_id", event.GameID.String(),
		)
	}
}

// handleFailure transitions the game from `starting` to `start_failed`.
// The CAS-status update absorbs replays naturally: if the game is no
// longer in `starting`, the second call returns ErrConflict /
// ErrInvalidTransition and the worker treats it as an already-handled
// duplicate.
func (consumer *Consumer) handleFailure(ctx context.Context, entryID string, event jobResultEvent) {
	at := consumer.clock().UTC()
	err := consumer.games.UpdateStatus(ctx, ports.UpdateStatusInput{
		GameID:       event.GameID,
		ExpectedFrom: game.StatusStarting,
		To:           game.StatusStartFailed,
		Trigger:      game.TriggerRuntimeEvent,
		At:           at,
	})
	switch {
	case err == nil:
		consumer.telemetry.RecordGameTransition(ctx,
			string(game.StatusStarting),
			string(game.StatusStartFailed),
			string(game.TriggerRuntimeEvent),
		)
		consumer.telemetry.RecordStartFlowOutcome(ctx, "start_failed")
		logArgs := []any{
			"stream_entry_id", entryID,
			"game_id", event.GameID.String(),
			"from_status", string(game.StatusStarting),
			"to_status", string(game.StatusStartFailed),
			"trigger", string(game.TriggerRuntimeEvent),
			"error_code", event.ErrorCode,
		}
		logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
		consumer.logger.InfoContext(ctx, "game start failed", logArgs...)
	case errors.Is(err, game.ErrConflict), errors.Is(err, game.ErrInvalidTransition):
		consumer.logger.InfoContext(ctx, "ignored runtime failure for game not in starting",
			"stream_entry_id", entryID,
			"game_id", event.GameID.String(),
		)
	default:
		consumer.logger.WarnContext(ctx, "transition game to start_failed",
			"stream_entry_id", entryID,
			"game_id", event.GameID.String(),
			"err", err.Error(),
		)
	}
}
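
// A failure result reaches handleFailure above carrying only diagnostic
// fields next to game_id and outcome. A hypothetical producer-side
// publish (the stream key is configurable; the key and field values
// shown here are illustrative):
//
//	redis-cli XADD runtime:job_results '*' \
//		game_id g-42 outcome failure \
//		error_code image_pull_failed error_message "registry unreachable"
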
// handleSuccess applies the success-path branches: persist binding,
// call GM, transition status. Any failure branches into the
// orphan-container or paused-after-start paths defined by the README.
//
// Replay protection: the worker re-reads the game record up front. If
// the record is no longer in `starting`, the event is treated as an
// already-handled replay and processing exits without further side
// effects (no binding overwrite, no GM call, no status transition). A
// crash after the binding write but before the status transition leaves
// the record in `starting`, so a replayed event re-applies the binding
// write and the GM call.
func (consumer *Consumer) handleSuccess(ctx context.Context, entryID string, event jobResultEvent) {
	at := consumer.clock().UTC()
	if err := event.validateSuccess(); err != nil {
		consumer.logger.WarnContext(ctx, "invalid runtime job success event",
			"stream_entry_id", entryID,
			"game_id", event.GameID.String(),
			"err", err.Error(),
		)
		return
	}

	record, err := consumer.games.Get(ctx, event.GameID)
	if err != nil {
		consumer.logger.WarnContext(ctx, "load game for runtime success",
			"stream_entry_id", entryID,
			"game_id", event.GameID.String(),
			"err", err.Error(),
		)
		return
	}
	if record.Status != game.StatusStarting {
		consumer.logger.InfoContext(ctx, "ignored runtime success for game not in starting",
			"stream_entry_id", entryID,
			"game_id", record.GameID.String(),
			"current_status", string(record.Status),
		)
		return
	}

	binding := game.RuntimeBinding{
		ContainerID:    event.ContainerID,
		EngineEndpoint: event.EngineEndpoint,
		RuntimeJobID:   entryID,
		BoundAt:        at,
	}
	if err := consumer.games.UpdateRuntimeBinding(ctx, ports.UpdateRuntimeBindingInput{
		GameID:  event.GameID,
		Binding: binding,
		At:      at,
	}); err != nil {
		consumer.handleOrphan(ctx, entryID, event, at, err)
		return
	}

	gmErr := consumer.gmClient.RegisterGame(ctx, ports.RegisterGameRequest{
		GameID:              record.GameID,
		ContainerID:         binding.ContainerID,
		EngineEndpoint:      binding.EngineEndpoint,
		TargetEngineVersion: record.TargetEngineVersion,
		TurnSchedule:        record.TurnSchedule,
	})
	if gmErr != nil {
		consumer.handleGMUnavailable(ctx, entryID, record, at, gmErr)
		return
	}

	if err := consumer.games.UpdateStatus(ctx, ports.UpdateStatusInput{
		GameID:       record.GameID,
		ExpectedFrom: game.StatusStarting,
		To:           game.StatusRunning,
		Trigger:      game.TriggerRuntimeEvent,
		At:           at,
	}); err != nil {
		switch {
		case errors.Is(err, game.ErrConflict), errors.Is(err, game.ErrInvalidTransition):
			consumer.logger.InfoContext(ctx, "ignored running transition for game not in starting",
				"stream_entry_id", entryID,
				"game_id", record.GameID.String(),
			)
		default:
			consumer.logger.WarnContext(ctx, "transition game to running",
				"stream_entry_id", entryID,
				"game_id", record.GameID.String(),
				"err", err.Error(),
			)
		}
		return
	}

	consumer.telemetry.RecordGameTransition(ctx,
		string(game.StatusStarting),
		string(game.StatusRunning),
		string(game.TriggerRuntimeEvent),
	)
	consumer.telemetry.RecordStartFlowOutcome(ctx, "running")
	logArgs := []any{
		"stream_entry_id", entryID,
		"game_id", record.GameID.String(),
		"from_status", string(game.StatusStarting),
		"to_status", string(game.StatusRunning),
		"trigger", string(game.TriggerRuntimeEvent),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	consumer.logger.InfoContext(ctx, "game running after runtime registration", logArgs...)
}
// handleOrphan implements the orphan-container path: the container
// started but Lobby could not persist the binding metadata. We publish
// a stop job to Runtime Manager and transition the game to
// `start_failed`. Stop-job dispatch is attempted before the status
// transition so that, if the process crashes between the two, the
// replay safely re-publishes the stop job (Runtime Manager idempotency
// is required).
func (consumer *Consumer) handleOrphan(ctx context.Context, entryID string, event jobResultEvent, at time.Time, cause error) {
	consumer.logger.WarnContext(ctx, "persist runtime binding failed; orphan container path",
		"stream_entry_id", entryID,
		"game_id", event.GameID.String(),
		"err", cause.Error(),
	)
	if err := consumer.runtimeManager.PublishStopJob(ctx, event.GameID.String()); err != nil {
		consumer.logger.WarnContext(ctx, "publish stop job for orphan container",
			"stream_entry_id", entryID,
			"game_id", event.GameID.String(),
			"err", err.Error(),
		)
	}

	err := consumer.games.UpdateStatus(ctx, ports.UpdateStatusInput{
		GameID:       event.GameID,
		ExpectedFrom: game.StatusStarting,
		To:           game.StatusStartFailed,
		Trigger:      game.TriggerRuntimeEvent,
		At:           at,
	})
	switch {
	case err == nil:
		consumer.telemetry.RecordGameTransition(ctx,
			string(game.StatusStarting),
			string(game.StatusStartFailed),
			string(game.TriggerRuntimeEvent),
		)
		consumer.telemetry.RecordStartFlowOutcome(ctx, "start_failed")
	case errors.Is(err, game.ErrConflict), errors.Is(err, game.ErrInvalidTransition):
		// Replayed event: the transition already happened, so the
		// telemetry counters are skipped as well to avoid
		// double-counting.
		consumer.logger.InfoContext(ctx, "ignored orphan transition for game not in starting",
			"stream_entry_id", entryID,
			"game_id", event.GameID.String(),
		)
	default:
		consumer.logger.WarnContext(ctx, "transition orphan game to start_failed",
			"stream_entry_id", entryID,
			"game_id", event.GameID.String(),
			"err", err.Error(),
		)
	}
}

// handleGMUnavailable implements the paused-after-start path: the
// container is alive but Game Master could not be registered. The game
// is moved to `paused` and an admin notification is published.
func (consumer *Consumer) handleGMUnavailable(ctx context.Context, entryID string, record game.Game, at time.Time, cause error) {
	consumer.logger.WarnContext(ctx, "gm registration failed; pause-after-start path",
		"stream_entry_id", entryID,
		"game_id", record.GameID.String(),
		"err", cause.Error(),
	)
	if err := consumer.games.UpdateStatus(ctx, ports.UpdateStatusInput{
		GameID:       record.GameID,
		ExpectedFrom: game.StatusStarting,
		To:           game.StatusPaused,
		Trigger:      game.TriggerRuntimeEvent,
		At:           at,
	}); err != nil {
		switch {
		case errors.Is(err, game.ErrConflict), errors.Is(err, game.ErrInvalidTransition):
			consumer.logger.InfoContext(ctx, "ignored paused transition for game not in starting",
				"stream_entry_id", entryID,
				"game_id", record.GameID.String(),
			)
		default:
			consumer.logger.WarnContext(ctx, "transition game to paused",
				"stream_entry_id", entryID,
				"game_id", record.GameID.String(),
				"err", err.Error(),
			)
		}
		return
	}

	consumer.telemetry.RecordGameTransition(ctx,
		string(game.StatusStarting),
		string(game.StatusPaused),
		string(game.TriggerRuntimeEvent),
	)
	consumer.telemetry.RecordStartFlowOutcome(ctx, "paused")

	intent, err := notificationintent.NewLobbyRuntimePausedAfterStartIntent(
		notificationintent.Metadata{
			IdempotencyKey: "lobby.runtime_paused_after_start:" + entryID,
			OccurredAt:     at,
		},
		notificationintent.LobbyRuntimePausedAfterStartPayload{
			GameID:   record.GameID.String(),
			GameName: record.GameName,
		},
	)
	if err != nil {
		consumer.logger.ErrorContext(ctx, "build runtime paused intent",
			"stream_entry_id", entryID,
			"game_id", record.GameID.String(),
			"err", err.Error(),
		)
		return
	}
	if _, err := consumer.intents.Publish(ctx, intent); err != nil {
		consumer.logger.WarnContext(ctx, "publish runtime paused intent",
			"stream_entry_id", entryID,
			"game_id", record.GameID.String(),
			"err", err.Error(),
		)
	}
}

// outcomeSuccess and outcomeFailure are the two accepted values of the
// runtime job result `outcome` field.
const (
	outcomeSuccess = "success"
	outcomeFailure = "failure"
)
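
// For orientation: the success entry from the HandleMessage sketch
// earlier decodes via decodeJobResult below into (same illustrative
// values carried through):
//
//	jobResultEvent{
//		GameID:         common.GameID("g-42"),
//		Outcome:        outcomeSuccess,
//		ContainerID:    "c-9f3",
//		EngineEndpoint: "10.0.0.5:7000",
//	}
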
// jobResultEvent stores the decoded shape of one runtime:job_results
// stream entry.
type jobResultEvent struct {
	GameID         common.GameID
	Outcome        string
	ContainerID    string
	EngineEndpoint string
	ErrorCode      string
	ErrorMessage   string
}

func (event jobResultEvent) validateSuccess() error {
	if strings.TrimSpace(event.ContainerID) == "" {
		return errors.New("success event missing container_id")
	}
	if strings.TrimSpace(event.EngineEndpoint) == "" {
		return errors.New("success event missing engine_endpoint")
	}
	return nil
}

func decodeJobResult(message redis.XMessage) (jobResultEvent, error) {
	gameIDRaw := optionalString(message.Values, "game_id")
	if strings.TrimSpace(gameIDRaw) == "" {
		return jobResultEvent{}, errors.New("missing game_id")
	}
	gameID := common.GameID(gameIDRaw)
	if err := gameID.Validate(); err != nil {
		return jobResultEvent{}, fmt.Errorf("invalid game_id: %w", err)
	}
	outcome := optionalString(message.Values, "outcome")
	if outcome != outcomeSuccess && outcome != outcomeFailure {
		return jobResultEvent{}, fmt.Errorf("unsupported outcome %q", outcome)
	}
	return jobResultEvent{
		GameID:         gameID,
		Outcome:        outcome,
		ContainerID:    optionalString(message.Values, "container_id"),
		EngineEndpoint: optionalString(message.Values, "engine_endpoint"),
		ErrorCode:      optionalString(message.Values, "error_code"),
		ErrorMessage:   optionalString(message.Values, "error_message"),
	}, nil
}

// optionalString extracts a string-valued field from a stream entry,
// tolerating both string and []byte representations; missing keys and
// unexpected types yield "".
func optionalString(values map[string]any, key string) string {
	raw, ok := values[key]
	if !ok {
		return ""
	}
	switch typed := raw.(type) {
	case string:
		return typed
	case []byte:
		return string(typed)
	default:
		return ""
	}
}