// Package startjobsconsumer drives the asynchronous half of the
// Lobby ↔ Runtime Manager start contract. The consumer XREADs from
// `runtime:start_jobs` (produced by Lobby), decodes the envelope frozen
// in `rtmanager/api/runtime-jobs-asyncapi.yaml`, calls the production
// start orchestrator, and publishes one `runtime:job_results` outcome
// per consumed envelope.
//
// Replay safety is provided by the start service: an idempotent re-run
// surfaces as `Outcome=success` with `error_code=replay_no_op`. The
// consumer copies the service Result fields into the `RuntimeJobResult`
// payload verbatim. Per-message decode and publish errors are logged
// and absorbed; the offset advances unconditionally so a single poison
// message cannot pin the loop. Design rationale is captured in
// `rtmanager/docs/workers.md`.
package startjobsconsumer

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"strconv"
	"strings"
	"time"

	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/logging"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/startruntime"

	"github.com/redis/go-redis/v9"
)

// streamOffsetLabel identifies the start-jobs consumer in the stream
// offset store. The label stays stable when the underlying stream key
// is renamed via configuration. Matches the convention from
// `rtmanager/README.md §Persistence Layout > Redis runtime-coordination state`.
const streamOffsetLabel = "startjobs"

// Wire field names of the `RuntimeStartJob` payload. Frozen by
// `rtmanager/api/runtime-jobs-asyncapi.yaml`; renaming any of them
// requires a coordinated contract change with Lobby.
const (
	fieldGameID        = "game_id"
	fieldImageRef      = "image_ref"
	fieldRequestedAtMS = "requested_at_ms"
)

// StartService is the narrow surface the consumer needs from the start
// orchestrator. The concrete `*startruntime.Service` satisfies this
// interface and is wired in production.
type StartService interface {
	Handle(ctx context.Context, input startruntime.Input) (startruntime.Result, error)
}

// Config groups the dependencies required to construct a Consumer.
type Config struct {
	// Client provides XREAD access to the start-jobs stream.
	Client *redis.Client

	// Stream stores the Redis Streams key consumed by the worker.
	Stream string

	// BlockTimeout bounds the blocking XREAD window.
	BlockTimeout time.Duration

	// StartService executes the start lifecycle for each decoded
	// envelope.
	StartService StartService

	// JobResults publishes one outcome entry per processed envelope.
	JobResults ports.JobResultPublisher

	// OffsetStore persists the last successfully processed entry id so
	// the consumer survives restarts without replaying processed
	// envelopes.
	OffsetStore ports.StreamOffsetStore

	// Logger receives structured worker-level events. Defaults to
	// `slog.Default` when nil.
	Logger *slog.Logger
}

// Consumer drives the start-jobs processing loop.
type Consumer struct {
	client       *redis.Client
	stream       string
	blockTimeout time.Duration
	startService StartService
	jobResults   ports.JobResultPublisher
	offsetStore  ports.StreamOffsetStore
	logger       *slog.Logger
}

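// A minimal wiring sketch, assuming concrete implementations of the
// ports interfaces and a production *startruntime.Service are available
// from the caller. The names startSvc, jobResults, and offsetStore below
// are illustrative placeholders, not part of this package:
//
//	client := redis.NewClient(&redis.Options{Addr: "localhost:6379"})
//	consumer, err := startjobsconsumer.NewConsumer(startjobsconsumer.Config{
//		Client:       client,
//		Stream:       "runtime:start_jobs",
//		BlockTimeout: 5 * time.Second,
//		StartService: startSvc,    // *startruntime.Service in production
//		JobResults:   jobResults,  // ports.JobResultPublisher
//		OffsetStore:  offsetStore, // ports.StreamOffsetStore
//	})
//	if err != nil {
//		return err
//	}
//	return consumer.Run(ctx) // blocks until ctx is cancelled
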
// NewConsumer constructs one Consumer from cfg. Validation errors name
// the missing collaborator.
func NewConsumer(cfg Config) (*Consumer, error) {
	switch {
	case cfg.Client == nil:
		return nil, errors.New("new start jobs consumer: nil redis client")
	case strings.TrimSpace(cfg.Stream) == "":
		return nil, errors.New("new start jobs consumer: stream must not be empty")
	case cfg.BlockTimeout <= 0:
		return nil, errors.New("new start jobs consumer: block timeout must be positive")
	case cfg.StartService == nil:
		return nil, errors.New("new start jobs consumer: nil start service")
	case cfg.JobResults == nil:
		return nil, errors.New("new start jobs consumer: nil job results publisher")
	case cfg.OffsetStore == nil:
		return nil, errors.New("new start jobs consumer: nil offset store")
	}

	logger := cfg.Logger
	if logger == nil {
		logger = slog.Default()
	}

	return &Consumer{
		client:       cfg.Client,
		stream:       cfg.Stream,
		blockTimeout: cfg.BlockTimeout,
		startService: cfg.StartService,
		jobResults:   cfg.JobResults,
		offsetStore:  cfg.OffsetStore,
		logger:       logger.With("worker", "rtmanager.startjobs", "stream", cfg.Stream),
	}, nil
}

// Run drives the XREAD loop until ctx is cancelled. Per-message
// outcomes are absorbed by HandleMessage; the loop only exits on
// context cancellation or a fatal Redis / offset-store error.
func (consumer *Consumer) Run(ctx context.Context) error {
	if consumer == nil || consumer.client == nil {
		return errors.New("run start jobs consumer: nil consumer")
	}
	if ctx == nil {
		return errors.New("run start jobs consumer: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}

	lastID, found, err := consumer.offsetStore.Load(ctx, streamOffsetLabel)
	if err != nil {
		return fmt.Errorf("run start jobs consumer: load offset: %w", err)
	}
	if !found {
		lastID = "0-0"
	}

	consumer.logger.Info("start jobs consumer started",
		"block_timeout", consumer.blockTimeout.String(),
		"start_entry_id", lastID,
	)
	defer consumer.logger.Info("start jobs consumer stopped")

	for {
		streams, err := consumer.client.XRead(ctx, &redis.XReadArgs{
			Streams: []string{consumer.stream, lastID},
			Count:   1,
			Block:   consumer.blockTimeout,
		}).Result()
		switch {
		case err == nil:
			for _, stream := range streams {
				for _, message := range stream.Messages {
					consumer.HandleMessage(ctx, message)
					if err := consumer.offsetStore.Save(ctx, streamOffsetLabel, message.ID); err != nil {
						return fmt.Errorf("run start jobs consumer: save offset: %w", err)
					}
					lastID = message.ID
				}
			}
		case errors.Is(err, redis.Nil):
			// Block window elapsed with no new entries; poll again.
			continue
		case ctx.Err() != nil && (errors.Is(err, context.Canceled) ||
			errors.Is(err, context.DeadlineExceeded) ||
			errors.Is(err, redis.ErrClosed)):
			// Graceful exit: the error is a symptom of our own context
			// being cancelled, so report the cancellation itself.
			return ctx.Err()
		default:
			return fmt.Errorf("run start jobs consumer: %w", err)
		}
	}
}

// Shutdown is a no-op; the consumer relies on context cancellation.
func (consumer *Consumer) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown start jobs consumer: nil context")
	}
	return nil
}

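// A sketch of driving HandleMessage (below) directly from a test,
// assuming the consumer was built with in-memory fakes for the start
// service, publisher, and offset store. The field names match the
// frozen wire contract; the entry id and values are made up:
//
//	consumer.HandleMessage(ctx, redis.XMessage{
//		ID: "1700000000000-0",
//		Values: map[string]any{
//			"game_id":         "g-42",
//			"image_ref":       "registry.example/engine:1.2.3",
//			"requested_at_ms": "1700000000000",
//		},
//	})
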
// HandleMessage processes one Redis Stream message. Exported so tests
// can drive the consumer deterministically without spinning up a real
// XREAD loop.
//
// Per-message errors are logged and absorbed: the worker keeps running
// and the offset is allowed to advance.
func (consumer *Consumer) HandleMessage(ctx context.Context, message redis.XMessage) {
	if consumer == nil {
		return
	}

	envelope, err := decodeStartJob(message)
	if err != nil {
		consumer.logger.WarnContext(ctx, "decode start job",
			"stream_entry_id", message.ID,
			"err", err.Error(),
		)
		return
	}

	input := startruntime.Input{
		GameID:    envelope.GameID,
		ImageRef:  envelope.ImageRef,
		OpSource:  operation.OpSourceLobbyStream,
		SourceRef: message.ID,
	}

	result, err := consumer.startService.Handle(ctx, input)
	if err != nil {
		consumer.logger.ErrorContext(ctx, "start service returned go-level error",
			"stream_entry_id", message.ID,
			"game_id", envelope.GameID,
			"err", err.Error(),
		)
		return
	}

	jobResult := buildJobResult(envelope.GameID, result)
	if err := consumer.jobResults.Publish(ctx, jobResult); err != nil {
		consumer.logger.ErrorContext(ctx, "publish job result",
			"stream_entry_id", message.ID,
			"game_id", envelope.GameID,
			"outcome", jobResult.Outcome,
			"error_code", jobResult.ErrorCode,
			"err", err.Error(),
		)
		return
	}

	logArgs := []any{
		"stream_entry_id", message.ID,
		"game_id", envelope.GameID,
		"outcome", jobResult.Outcome,
		"error_code", jobResult.ErrorCode,
		"requested_at_ms", envelope.RequestedAtMS,
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	consumer.logger.InfoContext(ctx, "start job processed", logArgs...)
}

// startJobEnvelope stores the decoded shape of one `runtime:start_jobs`
// stream entry.
type startJobEnvelope struct {
	GameID        string
	ImageRef      string
	RequestedAtMS int64
}

func decodeStartJob(message redis.XMessage) (startJobEnvelope, error) {
	gameID := strings.TrimSpace(optionalString(message.Values, fieldGameID))
	if gameID == "" {
		return startJobEnvelope{}, errors.New("missing game_id")
	}

	imageRef := strings.TrimSpace(optionalString(message.Values, fieldImageRef))
	if imageRef == "" {
		return startJobEnvelope{}, errors.New("missing image_ref")
	}

	requestedAtMS, err := optionalInt64(message.Values, fieldRequestedAtMS)
	if err != nil {
		return startJobEnvelope{}, fmt.Errorf("invalid requested_at_ms: %w", err)
	}

	return startJobEnvelope{
		GameID:        gameID,
		ImageRef:      imageRef,
		RequestedAtMS: requestedAtMS,
	}, nil
}

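// For illustration, an entry shaped like the test sketch above decodes to
//
//	startJobEnvelope{
//		GameID:        "g-42",
//		ImageRef:      "registry.example/engine:1.2.3",
//		RequestedAtMS: 1700000000000,
//	}
//
// while a missing or blank game_id / image_ref fails the decode, and an
// absent requested_at_ms decodes to zero.
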
// buildJobResult translates a startruntime.Result into the wire payload
// published on `runtime:job_results`. ContainerID and EngineEndpoint are
// taken from the service's Record on success / replay; on failure the
// service returns a zero Record and both fields stay empty per the
// AsyncAPI contract (required field, empty string is a valid value).
func buildJobResult(gameID string, result startruntime.Result) ports.JobResult {
	jobResult := ports.JobResult{
		GameID:       gameID,
		Outcome:      string(result.Outcome),
		ErrorCode:    result.ErrorCode,
		ErrorMessage: result.ErrorMessage,
	}
	if result.Outcome == operation.OutcomeSuccess {
		jobResult.ContainerID = result.Record.CurrentContainerID
		jobResult.EngineEndpoint = result.Record.EngineEndpoint
	}
	return jobResult
}

// optionalString reads values[key] as a string, tolerating the []byte
// form stream values may arrive in. Absent keys and unexpected types
// yield "".
func optionalString(values map[string]any, key string) string {
	raw, ok := values[key]
	if !ok {
		return ""
	}
	switch typed := raw.(type) {
	case string:
		return typed
	case []byte:
		return string(typed)
	default:
		return ""
	}
}

// optionalInt64 parses values[key] as a base-10 int64. Absent keys and
// blank values decode to zero; non-numeric strings and unexpected types
// return an error.
func optionalInt64(values map[string]any, key string) (int64, error) {
	raw, ok := values[key]
	if !ok {
		return 0, nil
	}

	var stringValue string
	switch typed := raw.(type) {
	case string:
		stringValue = typed
	case []byte:
		stringValue = string(typed)
	default:
		return 0, fmt.Errorf("unsupported type %T", raw)
	}

	stringValue = strings.TrimSpace(stringValue)
	if stringValue == "" {
		return 0, nil
	}

	parsed, err := strconv.ParseInt(stringValue, 10, 64)
	if err != nil {
		return 0, err
	}
	return parsed, nil
}
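
// For illustration, an idempotent replay surfaces from the start service
// roughly as (field values assumed; shape per the package doc)
//
//	startruntime.Result{
//		Outcome:   operation.OutcomeSuccess,
//		ErrorCode: "replay_no_op",
//	}
//
// and, because the outcome is success, buildJobResult copies
// Record.CurrentContainerID and Record.EngineEndpoint into the published
// ports.JobResult alongside the verbatim error fields.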