// Package startjobsconsumer drives the asynchronous half of the
// Lobby ↔ Runtime Manager start contract. The consumer XREADs from
// `runtime:start_jobs` (produced by Lobby), decodes the envelope frozen
// in `rtmanager/api/runtime-jobs-asyncapi.yaml`, calls the production
// start orchestrator, and publishes one `runtime:job_results` outcome
// per consumed envelope.
//
// Replay safety is provided by the start service: an idempotent re-run
// surfaces as `Outcome=success` with `error_code=replay_no_op`. The
// consumer copies the service Result fields into the `RuntimeJobResult`
// payload verbatim. Per-message decode and publish errors are logged
// and absorbed; the offset advances unconditionally so a single poison
// message cannot pin the loop. Design rationale is captured in
// `rtmanager/docs/workers.md`.
package startjobsconsumer

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"strconv"
	"strings"
	"time"

	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/logging"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/startruntime"

	"github.com/redis/go-redis/v9"
)

// streamOffsetLabel identifies the start-jobs consumer in the stream
// offset store. The label stays stable when the underlying stream key
// is renamed via configuration. Matches the convention from
// `rtmanager/README.md §Persistence Layout > Redis runtime-coordination state`.
const streamOffsetLabel = "startjobs"

// Wire field names of the `RuntimeStartJob` payload. Frozen by
// `rtmanager/api/runtime-jobs-asyncapi.yaml`; renaming any of them
// requires a coordinated contract change with Lobby.
const (
	fieldGameID        = "game_id"
	fieldImageRef      = "image_ref"
	fieldRequestedAtMS = "requested_at_ms"
)
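
// For orientation, a producer-side entry carrying these fields would be
// added roughly like this (stream key per configuration, all values
// illustrative, not taken from the contract):
//
//	XADD runtime:start_jobs * game_id g-123 image_ref registry.example/engine:1.2 requested_at_ms 1714060800000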

// StartService is the narrow surface the consumer needs from the start
// orchestrator. The concrete `*startruntime.Service` satisfies this
// interface and is wired in production.
type StartService interface {
	Handle(ctx context.Context, input startruntime.Input) (startruntime.Result, error)
}
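
// The single-method surface keeps test doubles trivial; a minimal stub
// (illustrative sketch, not part of the production wiring) is just:
//
//	type stubStart struct {
//		result startruntime.Result
//		err    error
//	}
//
//	func (s stubStart) Handle(context.Context, startruntime.Input) (startruntime.Result, error) {
//		return s.result, s.err
//	}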

// Config groups the dependencies required to construct a Consumer.
type Config struct {
	// Client provides XREAD access to the start-jobs stream.
	Client *redis.Client

	// Stream stores the Redis Streams key consumed by the worker.
	Stream string

	// BlockTimeout bounds the blocking XREAD window.
	BlockTimeout time.Duration

	// StartService executes the start lifecycle for each decoded
	// envelope.
	StartService StartService

	// JobResults publishes one outcome entry per processed envelope.
	JobResults ports.JobResultPublisher

	// OffsetStore persists the last successfully processed entry id so
	// the consumer survives restarts without replaying processed
	// envelopes.
	OffsetStore ports.StreamOffsetStore

	// Logger receives structured worker-level events. Defaults to
	// `slog.Default` when nil.
	Logger *slog.Logger
}

// Consumer drives the start-jobs processing loop.
type Consumer struct {
	client       *redis.Client
	stream       string
	blockTimeout time.Duration
	startService StartService
	jobResults   ports.JobResultPublisher
	offsetStore  ports.StreamOffsetStore
	logger       *slog.Logger
}

// NewConsumer constructs one Consumer from cfg. Validation errors name
// the missing collaborator.
func NewConsumer(cfg Config) (*Consumer, error) {
	switch {
	case cfg.Client == nil:
		return nil, errors.New("new start jobs consumer: nil redis client")
	case strings.TrimSpace(cfg.Stream) == "":
		return nil, errors.New("new start jobs consumer: stream must not be empty")
	case cfg.BlockTimeout <= 0:
		return nil, errors.New("new start jobs consumer: block timeout must be positive")
	case cfg.StartService == nil:
		return nil, errors.New("new start jobs consumer: nil start service")
	case cfg.JobResults == nil:
		return nil, errors.New("new start jobs consumer: nil job results publisher")
	case cfg.OffsetStore == nil:
		return nil, errors.New("new start jobs consumer: nil offset store")
	}

	logger := cfg.Logger
	if logger == nil {
		logger = slog.Default()
	}
	return &Consumer{
		client:       cfg.Client,
		stream:       cfg.Stream,
		blockTimeout: cfg.BlockTimeout,
		startService: cfg.StartService,
		jobResults:   cfg.JobResults,
		offsetStore:  cfg.OffsetStore,
		logger:       logger.With("worker", "rtmanager.startjobs", "stream", cfg.Stream),
	}, nil
}
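
// A minimal wiring sketch (redisClient, startSvc, resultsPublisher, and
// offsetStore are assumed to exist; their construction is elided and
// their names are illustrative):
//
//	consumer, err := startjobsconsumer.NewConsumer(startjobsconsumer.Config{
//		Client:       redisClient,
//		Stream:       "runtime:start_jobs",
//		BlockTimeout: 5 * time.Second,
//		StartService: startSvc,
//		JobResults:   resultsPublisher,
//		OffsetStore:  offsetStore,
//	})
//	if err != nil {
//		// handle configuration error
//	}
//	go func() { _ = consumer.Run(ctx) }()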

// Run drives the XREAD loop until ctx is cancelled. Per-message
// outcomes are absorbed by HandleMessage; the loop only exits on
// context cancellation or a fatal Redis / offset-store error.
func (consumer *Consumer) Run(ctx context.Context) error {
	if consumer == nil {
		return errors.New("run start jobs consumer: nil consumer")
	}
	if consumer.client == nil {
		return errors.New("run start jobs consumer: nil redis client")
	}
	if ctx == nil {
		return errors.New("run start jobs consumer: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}

	lastID, found, err := consumer.offsetStore.Load(ctx, streamOffsetLabel)
	if err != nil {
		return fmt.Errorf("run start jobs consumer: load offset: %w", err)
	}
	if !found {
		lastID = "0-0"
	}

	consumer.logger.Info("start jobs consumer started",
		"block_timeout", consumer.blockTimeout.String(),
		"start_entry_id", lastID,
	)
	defer consumer.logger.Info("start jobs consumer stopped")

	for {
		streams, err := consumer.client.XRead(ctx, &redis.XReadArgs{
			Streams: []string{consumer.stream, lastID},
			Count:   1,
			Block:   consumer.blockTimeout,
		}).Result()
		switch {
		case err == nil:
			for _, stream := range streams {
				for _, message := range stream.Messages {
					consumer.HandleMessage(ctx, message)
					if err := consumer.offsetStore.Save(ctx, streamOffsetLabel, message.ID); err != nil {
						return fmt.Errorf("run start jobs consumer: save offset: %w", err)
					}
					lastID = message.ID
				}
			}
		case errors.Is(err, redis.Nil):
			// Blocking window expired with no new entries; poll again.
			continue
		case ctx.Err() != nil && (errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) || errors.Is(err, redis.ErrClosed)):
			// Cancellation-shaped error during shutdown: report the
			// context's own error so callers observe a clean stop.
			return ctx.Err()
		default:
			return fmt.Errorf("run start jobs consumer: %w", err)
		}
	}
}
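
// Offset semantics: XREAD returns entries strictly after the supplied
// id, so resuming from the persisted offset never re-reads a processed
// envelope. The raw command issued per iteration is equivalent to
// (placeholders illustrative):
//
//	XREAD COUNT 1 BLOCK <block-timeout-ms> STREAMS <stream> <last-id>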

// Shutdown is a no-op; the consumer relies on context cancellation.
func (consumer *Consumer) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown start jobs consumer: nil context")
	}
	return nil
}

// HandleMessage processes one Redis Stream message. Exported so tests
// can drive the consumer deterministically without spinning up a real
// XREAD loop.
//
// Per-message errors are logged and absorbed: the worker keeps running
// and the offset is allowed to advance.
func (consumer *Consumer) HandleMessage(ctx context.Context, message redis.XMessage) {
	if consumer == nil {
		return
	}

	envelope, err := decodeStartJob(message)
	if err != nil {
		consumer.logger.WarnContext(ctx, "decode start job",
			"stream_entry_id", message.ID,
			"err", err.Error(),
		)
		return
	}

	input := startruntime.Input{
		GameID:    envelope.GameID,
		ImageRef:  envelope.ImageRef,
		OpSource:  operation.OpSourceLobbyStream,
		SourceRef: message.ID,
	}
	result, err := consumer.startService.Handle(ctx, input)
	if err != nil {
		consumer.logger.ErrorContext(ctx, "start service returned go-level error",
			"stream_entry_id", message.ID,
			"game_id", envelope.GameID,
			"err", err.Error(),
		)
		return
	}

	jobResult := buildJobResult(envelope.GameID, result)
	if err := consumer.jobResults.Publish(ctx, jobResult); err != nil {
		consumer.logger.ErrorContext(ctx, "publish job result",
			"stream_entry_id", message.ID,
			"game_id", envelope.GameID,
			"outcome", jobResult.Outcome,
			"error_code", jobResult.ErrorCode,
			"err", err.Error(),
		)
		return
	}

	logArgs := []any{
		"stream_entry_id", message.ID,
		"game_id", envelope.GameID,
		"outcome", jobResult.Outcome,
		"error_code", jobResult.ErrorCode,
		"requested_at_ms", envelope.RequestedAtMS,
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	consumer.logger.InfoContext(ctx, "start job processed", logArgs...)
}
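
// In tests, HandleMessage can be fed synthetic entries directly; a
// sketch (field values illustrative):
//
//	consumer.HandleMessage(ctx, redis.XMessage{
//		ID: "1714060800000-0",
//		Values: map[string]any{
//			"game_id":   "g-123",
//			"image_ref": "registry.example/engine:1.2",
//		},
//	})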

// startJobEnvelope stores the decoded shape of one `runtime:start_jobs`
// stream entry.
type startJobEnvelope struct {
	GameID        string
	ImageRef      string
	RequestedAtMS int64
}

func decodeStartJob(message redis.XMessage) (startJobEnvelope, error) {
	gameID := strings.TrimSpace(optionalString(message.Values, fieldGameID))
	if gameID == "" {
		return startJobEnvelope{}, errors.New("missing game_id")
	}
	imageRef := strings.TrimSpace(optionalString(message.Values, fieldImageRef))
	if imageRef == "" {
		return startJobEnvelope{}, errors.New("missing image_ref")
	}
	requestedAtMS, err := optionalInt64(message.Values, fieldRequestedAtMS)
	if err != nil {
		return startJobEnvelope{}, fmt.Errorf("invalid requested_at_ms: %w", err)
	}
	return startJobEnvelope{
		GameID:        gameID,
		ImageRef:      imageRef,
		RequestedAtMS: requestedAtMS,
	}, nil
}
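
// Decode contract in brief: game_id and image_ref are required and
// whitespace-trimmed; requested_at_ms is optional and defaults to 0 when
// absent or blank. For example (illustrative values):
//
//	env, err := decodeStartJob(redis.XMessage{Values: map[string]any{
//		"game_id":         " g-123 ",
//		"image_ref":       "registry.example/engine:1.2",
//		"requested_at_ms": "1714060800000",
//	}})
//	// env.GameID == "g-123", env.RequestedAtMS == 1714060800000, err == nil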

// buildJobResult translates a startruntime.Result into the wire payload
// published on `runtime:job_results`. ContainerID and EngineEndpoint are
// taken from the service's Record on success / replay; on failure the
// service returns a zero Record and both fields stay empty per the
// AsyncAPI contract (required field, empty string is a valid value).
func buildJobResult(gameID string, result startruntime.Result) ports.JobResult {
	jobResult := ports.JobResult{
		GameID:       gameID,
		Outcome:      string(result.Outcome),
		ErrorCode:    result.ErrorCode,
		ErrorMessage: result.ErrorMessage,
	}
	if result.Outcome == operation.OutcomeSuccess {
		jobResult.ContainerID = result.Record.CurrentContainerID
		jobResult.EngineEndpoint = result.Record.EngineEndpoint
	}
	return jobResult
}
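
// Outcome mapping at a glance (replay semantics per the package doc;
// the empty fresh-success ErrorCode is an assumption of this sketch):
//
//	fresh start    -> Outcome=success, ErrorCode="",           Record fields copied
//	replayed start -> Outcome=success, ErrorCode=replay_no_op, Record fields copied
//	failed start   -> non-success Outcome, service error fields, ContainerID/EngineEndpoint empty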

// optionalString returns the string form of values[key], tolerating both
// string and []byte payloads; missing keys and unsupported types yield "".
func optionalString(values map[string]any, key string) string {
	raw, ok := values[key]
	if !ok {
		return ""
	}
	switch typed := raw.(type) {
	case string:
		return typed
	case []byte:
		return string(typed)
	default:
		return ""
	}
}

// optionalInt64 parses values[key] as a base-10 int64. Missing keys and
// blank values are treated as absent (0, nil); unparseable or
// unsupported-typed values are errors.
func optionalInt64(values map[string]any, key string) (int64, error) {
	raw, ok := values[key]
	if !ok {
		return 0, nil
	}
	var stringValue string
	switch typed := raw.(type) {
	case string:
		stringValue = typed
	case []byte:
		stringValue = string(typed)
	default:
		return 0, fmt.Errorf("unsupported type %T", raw)
	}
	stringValue = strings.TrimSpace(stringValue)
	if stringValue == "" {
		return 0, nil
	}
	parsed, err := strconv.ParseInt(stringValue, 10, 64)
	if err != nil {
		return 0, err
	}
	return parsed, nil
}