Files
galaxy-game/lobby/internal/worker/runtimejobresult/consumer.go
T
2026-04-28 20:39:18 +02:00

565 lines
18 KiB
Go

// Package runtimejobresult implements the worker that consumes runtime
// job results published by Runtime Manager and drives the second half of
// the game start flow: persisting the runtime binding, calling
// Game Master to register the running game, and transitioning the game
// status to `running`, `paused`, or `start_failed` accordingly.
//
// Replay protection relies on the CAS-based UpdateStatus semantics: a
// duplicate result event finds the game in a non-`starting` status and
// the second pass becomes a no-op without any extra side effects. The
// stream offset advances after each message so the consumer survives
// restarts without re-emitting state changes.
package runtimejobresult
import (
"context"
"errors"
"fmt"
"log/slog"
"strings"
"time"
"galaxy/lobby/internal/domain/common"
"galaxy/lobby/internal/domain/game"
"galaxy/lobby/internal/logging"
"galaxy/lobby/internal/ports"
"galaxy/lobby/internal/telemetry"
"galaxy/notificationintent"
"github.com/redis/go-redis/v9"
)
// streamOffsetLabel identifies the runtime-job-results consumer in the
// stream offset store. The label stays stable when the underlying
// stream key is renamed via configuration, so a rename does not reset
// the consumer back to the start of the stream.
const streamOffsetLabel = "runtime_results"
// IntentPublisher publishes notification intents. The returned string
// is discarded by this worker; only the error is inspected (see
// handleGMUnavailable).
type IntentPublisher interface {
	Publish(ctx context.Context, intent notificationintent.Intent) (string, error)
}
// Config groups the dependencies used by Consumer. All fields except
// Clock, Logger, and Telemetry are required; NewConsumer rejects a
// Config with any required field missing.
type Config struct {
	// Client provides XREAD access to the runtime job results stream.
	Client *redis.Client
	// Stream stores the Redis Streams key consumed by the worker. Must
	// be non-blank after trimming whitespace.
	Stream string
	// BlockTimeout bounds the blocking XREAD window. Must be positive.
	BlockTimeout time.Duration
	// Games persists the post-start game record updates.
	Games ports.GameStore
	// RuntimeManager publishes stop jobs in the orphan-container path.
	RuntimeManager ports.RuntimeManager
	// GMClient registers the running game with Game Master after a
	// successful binding persistence.
	GMClient ports.GMClient
	// Intents publishes the lobby.runtime_paused_after_start
	// notification when GM is unavailable.
	Intents IntentPublisher
	// OffsetStore persists the last successfully processed entry id.
	OffsetStore ports.StreamOffsetStore
	// Clock supplies the wall-clock used for status transition
	// timestamps. Defaults to time.Now when nil.
	Clock func() time.Time
	// Logger receives structured worker-level events. Defaults to
	// slog.Default when nil.
	Logger *slog.Logger
	// Telemetry records the `lobby.start_flow.outcomes` and
	// `lobby.game.transitions` counters per processed result. Optional;
	// nil disables metric emission.
	Telemetry *telemetry.Runtime
}
// Consumer drives the runtime-job-results processing loop. Fields hold
// the validated/defaulted copies of Config; always construct instances
// via NewConsumer.
type Consumer struct {
	client         *redis.Client           // XREAD source for result entries
	stream         string                  // Redis Streams key being consumed
	blockTimeout   time.Duration           // upper bound on one blocking XREAD
	games          ports.GameStore         // game record persistence
	runtimeManager ports.RuntimeManager    // stop-job publisher (orphan path)
	gmClient       ports.GMClient          // Game Master registration client
	intents        IntentPublisher         // notification intent publisher
	offsetStore    ports.StreamOffsetStore // last-processed entry id storage
	clock          func() time.Time        // wall-clock source; never nil after NewConsumer
	logger         *slog.Logger            // pre-tagged logger; never nil after NewConsumer
	telemetry      *telemetry.Runtime      // metric sink; may be nil — methods are called on it unguarded, so telemetry.Runtime is presumably nil-safe (NOTE(review): confirm)
}
// NewConsumer constructs one Consumer from cfg. Every dependency except
// Clock, Logger, and Telemetry is mandatory; a missing required field
// yields a descriptive error. Clock defaults to time.Now and Logger to
// slog.Default when left nil.
func NewConsumer(cfg Config) (*Consumer, error) {
	// Validate required dependencies in a fixed order so the error
	// text for any given misconfiguration stays deterministic.
	checks := []struct {
		missing bool
		reason  string
	}{
		{cfg.Client == nil, "nil redis client"},
		{strings.TrimSpace(cfg.Stream) == "", "stream must not be empty"},
		{cfg.BlockTimeout <= 0, "block timeout must be positive"},
		{cfg.Games == nil, "nil game store"},
		{cfg.RuntimeManager == nil, "nil runtime manager"},
		{cfg.GMClient == nil, "nil gm client"},
		{cfg.Intents == nil, "nil intent publisher"},
		{cfg.OffsetStore == nil, "nil offset store"},
	}
	for _, check := range checks {
		if check.missing {
			return nil, errors.New("new runtime job result consumer: " + check.reason)
		}
	}
	// Apply defaults for the optional clock and logger.
	if cfg.Clock == nil {
		cfg.Clock = time.Now
	}
	if cfg.Logger == nil {
		cfg.Logger = slog.Default()
	}
	return &Consumer{
		client:         cfg.Client,
		stream:         cfg.Stream,
		blockTimeout:   cfg.BlockTimeout,
		games:          cfg.Games,
		runtimeManager: cfg.RuntimeManager,
		gmClient:       cfg.GMClient,
		intents:        cfg.Intents,
		offsetStore:    cfg.OffsetStore,
		clock:          cfg.Clock,
		logger:         cfg.Logger.With("worker", "lobby.runtimejobresult", "stream", cfg.Stream),
		telemetry:      cfg.Telemetry,
	}, nil
}
// Run drives the XREAD loop until ctx is cancelled. Per-message
// outcomes are absorbed by HandleMessage; the loop only exits on
// context cancellation or a fatal Redis error.
//
// The last processed entry id is persisted after every message, so a
// restart resumes exactly after the newest handled entry.
func (consumer *Consumer) Run(ctx context.Context) error {
	if consumer == nil || consumer.client == nil {
		return errors.New("run runtime job result consumer: nil consumer")
	}
	if ctx == nil {
		return errors.New("run runtime job result consumer: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}
	// Resume from the persisted offset; "0-0" reads the stream from the
	// beginning on first start.
	lastID, found, err := consumer.offsetStore.Load(ctx, streamOffsetLabel)
	if err != nil {
		return fmt.Errorf("run runtime job result consumer: load offset: %w", err)
	}
	if !found {
		lastID = "0-0"
	}
	consumer.logger.Info("runtime job result consumer started", "block_timeout", consumer.blockTimeout.String(), "start_entry_id", lastID)
	defer consumer.logger.Info("runtime job result consumer stopped")
	for {
		streams, err := consumer.client.XRead(ctx, &redis.XReadArgs{
			Streams: []string{consumer.stream, lastID},
			Count:   1,
			Block:   consumer.blockTimeout,
		}).Result()
		switch {
		case err == nil:
			for _, stream := range streams {
				for _, message := range stream.Messages {
					// HandleMessage absorbs per-message failures; the
					// offset advances regardless so a poison message
					// cannot wedge the stream.
					consumer.HandleMessage(ctx, message)
					if err := consumer.offsetStore.Save(ctx, streamOffsetLabel, message.ID); err != nil {
						return fmt.Errorf("run runtime job result consumer: save offset: %w", err)
					}
					lastID = message.ID
				}
			}
		case errors.Is(err, redis.Nil):
			// Blocking read timed out with no new entries; poll again.
			continue
		case ctx.Err() != nil && (errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) || errors.Is(err, redis.ErrClosed)):
			// Cancellation-driven shutdown: surface the context error
			// unwrapped so callers can match it directly.
			return ctx.Err()
		default:
			// Fatal Redis error. Cancellation-shaped errors with a
			// still-live ctx land here too; a previous dedicated case
			// for them was byte-identical to this branch and has been
			// removed as redundant.
			return fmt.Errorf("run runtime job result consumer: %w", err)
		}
	}
}
// Shutdown is a no-op; the consumer relies on context cancellation.
// It only rejects a nil context, mirroring the guard style of Run.
func (consumer *Consumer) Shutdown(ctx context.Context) error {
	var err error
	if ctx == nil {
		err = errors.New("shutdown runtime job result consumer: nil context")
	}
	return err
}
// HandleMessage processes one Redis Stream message. Exported so tests
// can drive the consumer deterministically without spinning up a real
// XREAD loop.
//
// Per-message errors are logged and absorbed: the worker keeps running
// and the offset is allowed to advance. CAS-status conflicts (typical
// for replayed events) are also absorbed.
func (consumer *Consumer) HandleMessage(ctx context.Context, message redis.XMessage) {
	if consumer == nil {
		return
	}
	event, decodeErr := decodeJobResult(message)
	if decodeErr != nil {
		consumer.logger.WarnContext(ctx, "decode runtime job result",
			"stream_entry_id", message.ID,
			"err", decodeErr.Error(),
		)
		return
	}
	// Dispatch on the declared outcome; anything else is logged and
	// dropped so a misbehaving producer cannot stall the stream.
	if event.Outcome == outcomeFailure {
		consumer.handleFailure(ctx, message.ID, event)
		return
	}
	if event.Outcome == outcomeSuccess {
		consumer.handleSuccess(ctx, message.ID, event)
		return
	}
	consumer.logger.WarnContext(ctx, "unknown runtime job outcome",
		"stream_entry_id", message.ID,
		"outcome", event.Outcome,
		"game_id", event.GameID.String(),
	)
}
// handleFailure transitions the game from `starting` to `start_failed`.
// The CAS-status update absorbs replays naturally: if the game is no
// longer in `starting`, the second call returns ErrConflict /
// ErrInvalidTransition and the worker treats it as an already-handled
// duplicate.
func (consumer *Consumer) handleFailure(ctx context.Context, entryID string, event jobResultEvent) {
	now := consumer.clock().UTC()
	updateErr := consumer.games.UpdateStatus(ctx, ports.UpdateStatusInput{
		GameID:       event.GameID,
		ExpectedFrom: game.StatusStarting,
		To:           game.StatusStartFailed,
		Trigger:      game.TriggerRuntimeEvent,
		At:           now,
	})
	if updateErr == nil {
		// First delivery: record metrics and the structured log line.
		consumer.telemetry.RecordGameTransition(ctx,
			string(game.StatusStarting),
			string(game.StatusStartFailed),
			string(game.TriggerRuntimeEvent),
		)
		consumer.telemetry.RecordStartFlowOutcome(ctx, "start_failed")
		attrs := append([]any{
			"stream_entry_id", entryID,
			"game_id", event.GameID.String(),
			"from_status", string(game.StatusStarting),
			"to_status", string(game.StatusStartFailed),
			"trigger", string(game.TriggerRuntimeEvent),
			"error_code", event.ErrorCode,
		}, logging.ContextAttrs(ctx)...)
		consumer.logger.InfoContext(ctx, "game start failed", attrs...)
		return
	}
	if errors.Is(updateErr, game.ErrConflict) || errors.Is(updateErr, game.ErrInvalidTransition) {
		// Replay or concurrent transition: already handled elsewhere.
		consumer.logger.InfoContext(ctx, "ignored runtime failure for game not in starting",
			"stream_entry_id", entryID,
			"game_id", event.GameID.String(),
		)
		return
	}
	consumer.logger.WarnContext(ctx, "transition game to start_failed",
		"stream_entry_id", entryID,
		"game_id", event.GameID.String(),
		"err", updateErr.Error(),
	)
}
// handleSuccess applies the success-path branches: persist binding,
// call GM, transition status. Any failure branches into the
// orphan-container or paused-after-start paths defined by the README.
//
// Replay protection: the worker re-reads the game record up front. If
// the record is no longer in `starting`, the event is treated as an
// already-handled replay and processing exits without further side
// effects (no binding overwrite, no GM call, no status transition).
func (consumer *Consumer) handleSuccess(ctx context.Context, entryID string, event jobResultEvent) {
	// One timestamp for the binding and the status transition keeps the
	// record internally consistent.
	at := consumer.clock().UTC()
	// Success events must carry container_id and engine_endpoint; a
	// malformed event is logged and dropped.
	if err := event.validateSuccess(); err != nil {
		consumer.logger.WarnContext(ctx, "invalid runtime job success event",
			"stream_entry_id", entryID,
			"game_id", event.GameID.String(),
			"err", err.Error(),
		)
		return
	}
	record, err := consumer.games.Get(ctx, event.GameID)
	if err != nil {
		consumer.logger.WarnContext(ctx, "load game for runtime success",
			"stream_entry_id", entryID,
			"game_id", event.GameID.String(),
			"err", err.Error(),
		)
		return
	}
	// Replay guard: any status other than `starting` means this event
	// was already handled (or superseded); exit with no side effects.
	if record.Status != game.StatusStarting {
		consumer.logger.InfoContext(ctx, "ignored runtime success for game not in starting",
			"stream_entry_id", entryID,
			"game_id", record.GameID.String(),
			"current_status", string(record.Status),
		)
		return
	}
	// The stream entry id doubles as the binding's runtime job id.
	binding := game.RuntimeBinding{
		ContainerID:    event.ContainerID,
		EngineEndpoint: event.EngineEndpoint,
		RuntimeJobID:   entryID,
		BoundAt:        at,
	}
	// A binding-persistence failure leaves a live container with no
	// stored metadata: divert to the orphan-container cleanup path.
	if err := consumer.games.UpdateRuntimeBinding(ctx, ports.UpdateRuntimeBindingInput{
		GameID:  event.GameID,
		Binding: binding,
		At:      at,
	}); err != nil {
		consumer.handleOrphan(ctx, entryID, event, at, err)
		return
	}
	gmErr := consumer.gmClient.RegisterGame(ctx, ports.RegisterGameRequest{
		GameID:              record.GameID,
		ContainerID:         binding.ContainerID,
		EngineEndpoint:      binding.EngineEndpoint,
		TargetEngineVersion: record.TargetEngineVersion,
		TurnSchedule:        record.TurnSchedule,
	})
	// GM unavailability pauses the game instead of failing it; the
	// container stays bound.
	if gmErr != nil {
		consumer.handleGMUnavailable(ctx, entryID, record, at, gmErr)
		return
	}
	if err := consumer.games.UpdateStatus(ctx, ports.UpdateStatusInput{
		GameID:       record.GameID,
		ExpectedFrom: game.StatusStarting,
		To:           game.StatusRunning,
		Trigger:      game.TriggerRuntimeEvent,
		At:           at,
	}); err != nil {
		switch {
		case errors.Is(err, game.ErrConflict), errors.Is(err, game.ErrInvalidTransition):
			// CAS lost: another writer moved the game out of
			// `starting` concurrently; treat as already handled.
			consumer.logger.InfoContext(ctx, "ignored running transition for game not in starting",
				"stream_entry_id", entryID,
				"game_id", record.GameID.String(),
			)
		default:
			consumer.logger.WarnContext(ctx, "transition game to running",
				"stream_entry_id", entryID,
				"game_id", record.GameID.String(),
				"err", err.Error(),
			)
		}
		return
	}
	// Metrics and the success log line fire only after the full
	// persist -> register -> transition chain succeeded.
	consumer.telemetry.RecordGameTransition(ctx,
		string(game.StatusStarting),
		string(game.StatusRunning),
		string(game.TriggerRuntimeEvent),
	)
	consumer.telemetry.RecordStartFlowOutcome(ctx, "running")
	logArgs := []any{
		"stream_entry_id", entryID,
		"game_id", record.GameID.String(),
		"from_status", string(game.StatusStarting),
		"to_status", string(game.StatusRunning),
		"trigger", string(game.TriggerRuntimeEvent),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	consumer.logger.InfoContext(ctx, "game running after runtime registration", logArgs...)
}
// handleOrphan implements the orphan-container path: the container
// started but Lobby could not persist the binding metadata. We publish
// a stop job to Runtime Manager and transition the game to
// `start_failed`. Stop-job dispatch is attempted before the status
// transition so a process crash between the two leaves the stop-job
// safely re-published on replay (Runtime Manager idempotency is
// required).
func (consumer *Consumer) handleOrphan(ctx context.Context, entryID string, event jobResultEvent, at time.Time, cause error) {
	consumer.logger.WarnContext(ctx, "persist runtime binding failed; orphan container path",
		"stream_entry_id", entryID,
		"game_id", event.GameID.String(),
		"err", cause.Error(),
	)
	// Best-effort stop job: a publish failure is logged but does not
	// block the status transition; a replayed event re-publishes it.
	if err := consumer.runtimeManager.PublishStopJob(ctx, event.GameID.String(), ports.StopReasonOrphanCleanup); err != nil {
		consumer.logger.WarnContext(ctx, "publish stop job for orphan container",
			"stream_entry_id", entryID,
			"game_id", event.GameID.String(),
			"err", err.Error(),
		)
	}
	err := consumer.games.UpdateStatus(ctx, ports.UpdateStatusInput{
		GameID:       event.GameID,
		ExpectedFrom: game.StatusStarting,
		To:           game.StatusStartFailed,
		Trigger:      game.TriggerRuntimeEvent,
		At:           at,
	})
	switch {
	case err == nil:
		// Record metrics only on the first successful transition.
		// Previously a CAS conflict (replayed event) fell through to
		// this telemetry block and double-counted the outcome; now it
		// is absorbed like in handleFailure/handleGMUnavailable.
		consumer.telemetry.RecordGameTransition(ctx,
			string(game.StatusStarting),
			string(game.StatusStartFailed),
			string(game.TriggerRuntimeEvent),
		)
		consumer.telemetry.RecordStartFlowOutcome(ctx, "start_failed")
	case errors.Is(err, game.ErrConflict), errors.Is(err, game.ErrInvalidTransition):
		// Replay: the game already left `starting`; nothing more to do.
		consumer.logger.InfoContext(ctx, "ignored orphan transition for game not in starting",
			"stream_entry_id", entryID,
			"game_id", event.GameID.String(),
		)
	default:
		consumer.logger.WarnContext(ctx, "transition orphan game to start_failed",
			"stream_entry_id", entryID,
			"game_id", event.GameID.String(),
			"err", err.Error(),
		)
	}
}
// handleGMUnavailable implements the paused-after-start path: the
// container is alive but Game Master could not be registered. The game
// is moved to `paused` and an admin notification is published.
func (consumer *Consumer) handleGMUnavailable(ctx context.Context, entryID string, record game.Game, at time.Time, cause error) {
	consumer.logger.WarnContext(ctx, "gm registration failed; pause-after-start path",
		"stream_entry_id", entryID,
		"game_id", record.GameID.String(),
		"err", cause.Error(),
	)
	// CAS transition starting -> paused; a conflict means another
	// writer already moved the game, i.e. this event is a replay.
	if err := consumer.games.UpdateStatus(ctx, ports.UpdateStatusInput{
		GameID:       record.GameID,
		ExpectedFrom: game.StatusStarting,
		To:           game.StatusPaused,
		Trigger:      game.TriggerRuntimeEvent,
		At:           at,
	}); err != nil {
		switch {
		case errors.Is(err, game.ErrConflict), errors.Is(err, game.ErrInvalidTransition):
			consumer.logger.InfoContext(ctx, "ignored paused transition for game not in starting",
				"stream_entry_id", entryID,
				"game_id", record.GameID.String(),
			)
		default:
			consumer.logger.WarnContext(ctx, "transition game to paused",
				"stream_entry_id", entryID,
				"game_id", record.GameID.String(),
				"err", err.Error(),
			)
		}
		return
	}
	// Telemetry fires only after the transition succeeded, so replays
	// cannot double-count the paused outcome.
	consumer.telemetry.RecordGameTransition(ctx,
		string(game.StatusStarting),
		string(game.StatusPaused),
		string(game.TriggerRuntimeEvent),
	)
	consumer.telemetry.RecordStartFlowOutcome(ctx, "paused")
	// The entry id keys the intent's idempotency so a replayed stream
	// entry yields the same idempotency key downstream.
	intent, err := notificationintent.NewLobbyRuntimePausedAfterStartIntent(
		notificationintent.Metadata{
			IdempotencyKey: "lobby.runtime_paused_after_start:" + entryID,
			OccurredAt:     at,
		},
		notificationintent.LobbyRuntimePausedAfterStartPayload{
			GameID:   record.GameID.String(),
			GameName: record.GameName,
		},
	)
	if err != nil {
		consumer.logger.ErrorContext(ctx, "build runtime paused intent",
			"stream_entry_id", entryID,
			"game_id", record.GameID.String(),
			"err", err.Error(),
		)
		return
	}
	// Publish failures are logged and absorbed: the pause already took
	// effect and the worker must keep consuming.
	if _, err := consumer.intents.Publish(ctx, intent); err != nil {
		consumer.logger.WarnContext(ctx, "publish runtime paused intent",
			"stream_entry_id", entryID,
			"game_id", record.GameID.String(),
			"err", err.Error(),
		)
	}
}
// The runtime job result `outcome` field accepts exactly these two
// values; decodeJobResult rejects every other string.
const (
	// outcomeSuccess routes the event into handleSuccess.
	outcomeSuccess = "success"
	// outcomeFailure routes the event into handleFailure.
	outcomeFailure = "failure"
)
// jobResultEvent stores the decoded shape of one runtime:job_results
// stream entry.
type jobResultEvent struct {
	// GameID is the validated game identifier from the `game_id` field.
	GameID common.GameID
	// Outcome is either outcomeSuccess or outcomeFailure;
	// decodeJobResult rejects anything else.
	Outcome string
	// ContainerID identifies the started container. Must be non-blank
	// for success events (see validateSuccess).
	ContainerID string
	// EngineEndpoint is the engine address bound to the game. Must be
	// non-blank for success events.
	EngineEndpoint string
	// ErrorCode is logged on the start_failed path (handleFailure).
	ErrorCode string
	// ErrorMessage is decoded from `error_message` but not otherwise
	// read in this file.
	ErrorMessage string
}
// validateSuccess checks that the event carries the fields a success
// outcome requires: a non-blank container id and engine endpoint.
func (event jobResultEvent) validateSuccess() error {
	switch {
	case strings.TrimSpace(event.ContainerID) == "":
		return errors.New("success event missing container_id")
	case strings.TrimSpace(event.EngineEndpoint) == "":
		return errors.New("success event missing engine_endpoint")
	default:
		return nil
	}
}
// decodeJobResult extracts one jobResultEvent from a raw stream entry.
// It fails when game_id is blank or invalid, or when the outcome is
// neither "success" nor "failure"; all remaining fields are copied
// as-is with missing keys defaulting to "".
func decodeJobResult(message redis.XMessage) (jobResultEvent, error) {
	rawID := optionalString(message.Values, "game_id")
	if strings.TrimSpace(rawID) == "" {
		return jobResultEvent{}, errors.New("missing game_id")
	}
	id := common.GameID(rawID)
	if err := id.Validate(); err != nil {
		return jobResultEvent{}, fmt.Errorf("invalid game_id: %w", err)
	}
	outcome := optionalString(message.Values, "outcome")
	switch outcome {
	case outcomeSuccess, outcomeFailure:
		// accepted outcome values
	default:
		return jobResultEvent{}, fmt.Errorf("unsupported outcome %q", outcome)
	}
	event := jobResultEvent{
		GameID:  id,
		Outcome: outcome,
	}
	event.ContainerID = optionalString(message.Values, "container_id")
	event.EngineEndpoint = optionalString(message.Values, "engine_endpoint")
	event.ErrorCode = optionalString(message.Values, "error_code")
	event.ErrorMessage = optionalString(message.Values, "error_message")
	return event, nil
}
// optionalString reads values[key] as a string. Missing keys and
// values that are neither string nor []byte yield "".
func optionalString(values map[string]any, key string) string {
	value, present := values[key]
	if !present {
		return ""
	}
	if text, ok := value.(string); ok {
		return text
	}
	if data, ok := value.([]byte); ok {
		return string(data)
	}
	return ""
}