feat: runtime manager

Ilia Denisov
2026-04-28 20:39:18 +02:00
committed by GitHub
parent e0a99b346b
commit a7cee15115
289 changed files with 45660 additions and 2207 deletions
@@ -0,0 +1,332 @@
// Package stopjobsconsumer drives the asynchronous half of the
// Lobby ↔ Runtime Manager stop contract. The consumer XREADs from
// `runtime:stop_jobs` (produced by Lobby), decodes the envelope frozen
// in `rtmanager/api/runtime-jobs-asyncapi.yaml`, calls the production
// stop orchestrator, and publishes one `runtime:job_results` outcome
// per consumed envelope.
//
// Replay safety: the stop service surfaces an already-stopped or
// already-removed record as `Outcome=success` with
// `error_code=replay_no_op`. The consumer copies the result fields
// into the wire payload verbatim. Per-message decode and publish
// errors are logged and absorbed; the offset advances unconditionally
// so a single poison message cannot pin the loop. Design rationale is
// captured in `rtmanager/docs/workers.md`.
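//
// An illustrative exchange, with hypothetical values (the field names
// are the ones frozen by the AsyncAPI document; `requested_at_ms`
// travels as a decimal string because Redis stream values are strings):
//
//	XADD runtime:stop_jobs * game_id game-1 reason cancelled requested_at_ms 1700000000000
//
// Processing that entry yields exactly one `runtime:job_results` entry
// whose outcome fields mirror the stop service's result.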
package stopjobsconsumer
import (
"context"
"errors"
"fmt"
"log/slog"
"strconv"
"strings"
"time"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/logging"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/stopruntime"
"github.com/redis/go-redis/v9"
)
// streamOffsetLabel identifies the stop-jobs consumer in the stream
// offset store. Matches the convention from
// `rtmanager/README.md §Persistence Layout > Redis runtime-coordination state`.
const streamOffsetLabel = "stopjobs"
// Wire field names of the `RuntimeStopJob` payload. Frozen by
// `rtmanager/api/runtime-jobs-asyncapi.yaml`.
const (
fieldGameID = "game_id"
fieldReason = "reason"
fieldRequestedAtMS = "requested_at_ms"
)
// StopService is the narrow surface the consumer needs from the stop
// orchestrator. The concrete `*stopruntime.Service` satisfies this
// interface and is wired in production.
type StopService interface {
Handle(ctx context.Context, input stopruntime.Input) (stopruntime.Result, error)
}
// Config groups the dependencies required to construct a Consumer.
type Config struct {
// Client provides XREAD access to the stop-jobs stream.
Client *redis.Client
// Stream stores the Redis Streams key consumed by the worker.
Stream string
// BlockTimeout bounds the blocking XREAD window.
BlockTimeout time.Duration
// StopService executes the stop lifecycle for each decoded envelope.
StopService StopService
// JobResults publishes one outcome entry per processed envelope.
JobResults ports.JobResultPublisher
// OffsetStore persists the last successfully processed entry id so
// the consumer survives restarts without replaying processed
// envelopes.
OffsetStore ports.StreamOffsetStore
// Logger receives structured worker-level events. Defaults to
// `slog.Default` when nil.
Logger *slog.Logger
}
// Consumer drives the stop-jobs processing loop.
type Consumer struct {
client *redis.Client
stream string
blockTimeout time.Duration
stopService StopService
jobResults ports.JobResultPublisher
offsetStore ports.StreamOffsetStore
logger *slog.Logger
}
// NewConsumer constructs one Consumer from cfg.
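//
// Illustrative wiring; the dependency values shown are assumptions, not
// the production composition root:
//
//	consumer, err := stopjobsconsumer.NewConsumer(stopjobsconsumer.Config{
//	    Client:       client,      // *redis.Client
//	    Stream:       "runtime:stop_jobs",
//	    BlockTimeout: 5 * time.Second,
//	    StopService:  stopService, // e.g. *stopruntime.Service
//	    JobResults:   jobResults,  // a ports.JobResultPublisher
//	    OffsetStore:  offsetStore, // a ports.StreamOffsetStore
//	})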
func NewConsumer(cfg Config) (*Consumer, error) {
switch {
case cfg.Client == nil:
return nil, errors.New("new stop jobs consumer: nil redis client")
case strings.TrimSpace(cfg.Stream) == "":
return nil, errors.New("new stop jobs consumer: stream must not be empty")
case cfg.BlockTimeout <= 0:
return nil, errors.New("new stop jobs consumer: block timeout must be positive")
case cfg.StopService == nil:
return nil, errors.New("new stop jobs consumer: nil stop service")
case cfg.JobResults == nil:
return nil, errors.New("new stop jobs consumer: nil job results publisher")
case cfg.OffsetStore == nil:
return nil, errors.New("new stop jobs consumer: nil offset store")
}
logger := cfg.Logger
if logger == nil {
logger = slog.Default()
}
return &Consumer{
client: cfg.Client,
stream: cfg.Stream,
blockTimeout: cfg.BlockTimeout,
stopService: cfg.StopService,
jobResults: cfg.JobResults,
offsetStore: cfg.OffsetStore,
logger: logger.With("worker", "rtmanager.stopjobs", "stream", cfg.Stream),
}, nil
}
// Run drives the XREAD loop until ctx is cancelled.
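//
// Callers typically launch Run on its own goroutine and stop it by
// cancelling ctx, as the worker tests do (sketch):
//
//	ctx, cancel := context.WithCancel(context.Background())
//	go func() { _ = consumer.Run(ctx) }()
//	// ... later:
//	cancel()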
func (consumer *Consumer) Run(ctx context.Context) error {
if consumer == nil || consumer.client == nil {
return errors.New("run stop jobs consumer: nil consumer or redis client")
}
if ctx == nil {
return errors.New("run stop jobs consumer: nil context")
}
if err := ctx.Err(); err != nil {
return err
}
lastID, found, err := consumer.offsetStore.Load(ctx, streamOffsetLabel)
if err != nil {
return fmt.Errorf("run stop jobs consumer: load offset: %w", err)
}
if !found {
lastID = "0-0"
}
consumer.logger.Info("stop jobs consumer started",
"block_timeout", consumer.blockTimeout.String(),
"start_entry_id", lastID,
)
defer consumer.logger.Info("stop jobs consumer stopped")
for {
streams, err := consumer.client.XRead(ctx, &redis.XReadArgs{
Streams: []string{consumer.stream, lastID},
Count: 1,
Block: consumer.blockTimeout,
}).Result()
switch {
case err == nil:
for _, stream := range streams {
for _, message := range stream.Messages {
consumer.HandleMessage(ctx, message)
if err := consumer.offsetStore.Save(ctx, streamOffsetLabel, message.ID); err != nil {
return fmt.Errorf("run stop jobs consumer: save offset: %w", err)
}
lastID = message.ID
}
}
case errors.Is(err, redis.Nil):
continue
case errors.Is(err, context.Canceled), errors.Is(err, context.DeadlineExceeded), errors.Is(err, redis.ErrClosed):
// Prefer the caller's cancellation error when ctx is done; otherwise
// surface the transport error from XREAD.
if ctx.Err() != nil {
return ctx.Err()
}
return fmt.Errorf("run stop jobs consumer: %w", err)
default:
return fmt.Errorf("run stop jobs consumer: %w", err)
}
}
}
// Shutdown is a no-op; the consumer relies on context cancellation.
func (consumer *Consumer) Shutdown(ctx context.Context) error {
if ctx == nil {
return errors.New("shutdown stop jobs consumer: nil context")
}
return nil
}
// HandleMessage processes one Redis Stream message. Exported so tests
// can drive the consumer deterministically without spinning up a real
// XREAD loop.
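//
// A direct invocation with a hand-built entry looks like this (the
// values are hypothetical):
//
//	consumer.HandleMessage(ctx, redis.XMessage{
//	    ID: "100-0",
//	    Values: map[string]any{
//	        "game_id":         "game-1",
//	        "reason":          "cancelled",
//	        "requested_at_ms": "1700",
//	    },
//	})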
func (consumer *Consumer) HandleMessage(ctx context.Context, message redis.XMessage) {
if consumer == nil {
return
}
envelope, err := decodeStopJob(message)
if err != nil {
consumer.logger.WarnContext(ctx, "decode stop job",
"stream_entry_id", message.ID,
"err", err.Error(),
)
return
}
input := stopruntime.Input{
GameID: envelope.GameID,
Reason: envelope.Reason,
OpSource: operation.OpSourceLobbyStream,
SourceRef: message.ID,
}
result, err := consumer.stopService.Handle(ctx, input)
if err != nil {
consumer.logger.ErrorContext(ctx, "stop service returned go-level error",
"stream_entry_id", message.ID,
"game_id", envelope.GameID,
"err", err.Error(),
)
return
}
jobResult := buildJobResult(envelope.GameID, result)
if err := consumer.jobResults.Publish(ctx, jobResult); err != nil {
consumer.logger.ErrorContext(ctx, "publish job result",
"stream_entry_id", message.ID,
"game_id", envelope.GameID,
"outcome", jobResult.Outcome,
"error_code", jobResult.ErrorCode,
"err", err.Error(),
)
return
}
logArgs := []any{
"stream_entry_id", message.ID,
"game_id", envelope.GameID,
"reason", string(envelope.Reason),
"outcome", jobResult.Outcome,
"error_code", jobResult.ErrorCode,
"requested_at_ms", envelope.RequestedAtMS,
}
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
consumer.logger.InfoContext(ctx, "stop job processed", logArgs...)
}
// stopJobEnvelope stores the decoded shape of one `runtime:stop_jobs`
// stream entry.
type stopJobEnvelope struct {
GameID string
Reason stopruntime.StopReason
RequestedAtMS int64
}
// decodeStopJob validates one stream entry and decodes it into a
// stopJobEnvelope: game_id and reason are required, reason must be a
// known stop reason, and requested_at_ms must parse as a base-10 int64.
func decodeStopJob(message redis.XMessage) (stopJobEnvelope, error) {
gameID := strings.TrimSpace(optionalString(message.Values, fieldGameID))
if gameID == "" {
return stopJobEnvelope{}, errors.New("missing game_id")
}
reasonRaw := strings.TrimSpace(optionalString(message.Values, fieldReason))
if reasonRaw == "" {
return stopJobEnvelope{}, errors.New("missing reason")
}
reason := stopruntime.StopReason(reasonRaw)
if !reason.IsKnown() {
return stopJobEnvelope{}, fmt.Errorf("unsupported reason %q", reasonRaw)
}
requestedAtMS, err := optionalInt64(message.Values, fieldRequestedAtMS)
if err != nil {
return stopJobEnvelope{}, fmt.Errorf("invalid requested_at_ms: %w", err)
}
return stopJobEnvelope{
GameID: gameID,
Reason: reason,
RequestedAtMS: requestedAtMS,
}, nil
}
// buildJobResult translates a stopruntime.Result into the wire payload
// published on `runtime:job_results`. Stop replays for `status=removed`
// records carry an empty `CurrentContainerID`; the consumer publishes
// the empty fields verbatim, which the AsyncAPI contract permits.
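//
// Sketch of the replay mapping for a removed record (values
// illustrative):
//
//	in:  stopruntime.Result{Outcome: success, ErrorCode: "replay_no_op",
//	         Record: {Status: removed, CurrentContainerID: ""}}
//	out: ports.JobResult{Outcome: "success", ErrorCode: "replay_no_op",
//	         ContainerID: ""}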
func buildJobResult(gameID string, result stopruntime.Result) ports.JobResult {
jobResult := ports.JobResult{
GameID: gameID,
Outcome: string(result.Outcome),
ErrorCode: result.ErrorCode,
ErrorMessage: result.ErrorMessage,
}
if result.Outcome == operation.OutcomeSuccess {
jobResult.ContainerID = result.Record.CurrentContainerID
jobResult.EngineEndpoint = result.Record.EngineEndpoint
}
return jobResult
}
// optionalString returns the string form of values[key], or "" when the
// key is absent or holds a non-string type.
func optionalString(values map[string]any, key string) string {
raw, ok := values[key]
if !ok {
return ""
}
switch typed := raw.(type) {
case string:
return typed
case []byte:
return string(typed)
default:
return ""
}
}
// optionalInt64 parses values[key] as a base-10 int64. Absent or blank
// values decode to zero; non-string types and unparsable digits are
// reported as errors.
func optionalInt64(values map[string]any, key string) (int64, error) {
raw, ok := values[key]
if !ok {
return 0, nil
}
var stringValue string
switch typed := raw.(type) {
case string:
stringValue = typed
case []byte:
stringValue = string(typed)
default:
return 0, fmt.Errorf("unsupported type %T", raw)
}
stringValue = strings.TrimSpace(stringValue)
if stringValue == "" {
return 0, nil
}
parsed, err := strconv.ParseInt(stringValue, 10, 64)
if err != nil {
return 0, err
}
return parsed, nil
}
@@ -0,0 +1,357 @@
package stopjobsconsumer_test
import (
"context"
"errors"
"io"
"log/slog"
"strconv"
"sync"
"testing"
"time"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/startruntime"
"galaxy/rtmanager/internal/service/stopruntime"
"galaxy/rtmanager/internal/worker/stopjobsconsumer"
"github.com/alicebob/miniredis/v2"
"github.com/redis/go-redis/v9"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// silentLogger returns a logger that discards all output.
func silentLogger() *slog.Logger {
return slog.New(slog.NewTextHandler(io.Discard, nil))
}
// fakeStopService records every input it receives and returns a canned
// result/error pair.
type fakeStopService struct {
mu sync.Mutex
inputs []stopruntime.Input
result stopruntime.Result
err error
}
func (s *fakeStopService) Handle(_ context.Context, input stopruntime.Input) (stopruntime.Result, error) {
s.mu.Lock()
defer s.mu.Unlock()
s.inputs = append(s.inputs, input)
return s.result, s.err
}
func (s *fakeStopService) Inputs() []stopruntime.Input {
s.mu.Lock()
defer s.mu.Unlock()
out := make([]stopruntime.Input, len(s.inputs))
copy(out, s.inputs)
return out
}
// fakeJobResults captures published job results and can simulate a
// failing publisher.
type fakeJobResults struct {
mu sync.Mutex
published []ports.JobResult
publishErr error
}
func (s *fakeJobResults) Publish(_ context.Context, result ports.JobResult) error {
s.mu.Lock()
defer s.mu.Unlock()
if s.publishErr != nil {
return s.publishErr
}
s.published = append(s.published, result)
return nil
}
func (s *fakeJobResults) Published() []ports.JobResult {
s.mu.Lock()
defer s.mu.Unlock()
out := make([]ports.JobResult, len(s.published))
copy(out, s.published)
return out
}
// fakeOffsetStore is an in-memory ports.StreamOffsetStore.
type fakeOffsetStore struct {
mu sync.Mutex
offsets map[string]string
}
func newFakeOffsetStore() *fakeOffsetStore {
return &fakeOffsetStore{offsets: map[string]string{}}
}
func (s *fakeOffsetStore) Load(_ context.Context, label string) (string, bool, error) {
s.mu.Lock()
defer s.mu.Unlock()
value, ok := s.offsets[label]
return value, ok, nil
}
func (s *fakeOffsetStore) Save(_ context.Context, label, entryID string) error {
s.mu.Lock()
defer s.mu.Unlock()
s.offsets[label] = entryID
return nil
}
func (s *fakeOffsetStore) Get(label string) (string, bool) {
s.mu.Lock()
defer s.mu.Unlock()
value, ok := s.offsets[label]
return value, ok
}
// harness bundles a consumer with its fake dependencies and a
// miniredis-backed client.
type harness struct {
consumer *stopjobsconsumer.Consumer
stops *fakeStopService
results *fakeJobResults
offsets *fakeOffsetStore
stream string
server *miniredis.Miniredis
client *redis.Client
}
func newHarness(t *testing.T) *harness {
t.Helper()
server := miniredis.RunT(t)
client := redis.NewClient(&redis.Options{Addr: server.Addr()})
t.Cleanup(func() { _ = client.Close() })
stops := &fakeStopService{}
results := &fakeJobResults{}
offsets := newFakeOffsetStore()
stream := "runtime:stop_jobs"
consumer, err := stopjobsconsumer.NewConsumer(stopjobsconsumer.Config{
Client: client,
Stream: stream,
BlockTimeout: 50 * time.Millisecond,
StopService: stops,
JobResults: results,
OffsetStore: offsets,
Logger: silentLogger(),
})
require.NoError(t, err)
return &harness{
consumer: consumer,
stops: stops,
results: results,
offsets: offsets,
stream: stream,
server: server,
client: client,
}
}
// stopMessage builds a well-formed stop-job stream entry.
func stopMessage(id, gameID, reason string, requestedAtMS int64) redis.XMessage {
return redis.XMessage{
ID: id,
Values: map[string]any{
"game_id": gameID,
"reason": reason,
"requested_at_ms": strconv.FormatInt(requestedAtMS, 10),
},
}
}
func TestNewConsumerRejectsMissingDeps(t *testing.T) {
server := miniredis.RunT(t)
client := redis.NewClient(&redis.Options{Addr: server.Addr()})
t.Cleanup(func() { _ = client.Close() })
// Each case omits the next dependency in NewConsumer's validation order.
cases := []stopjobsconsumer.Config{
{},
{Client: client},
{Client: client, Stream: "runtime:stop_jobs"},
{Client: client, Stream: "runtime:stop_jobs", BlockTimeout: time.Second},
{Client: client, Stream: "runtime:stop_jobs", BlockTimeout: time.Second, StopService: &fakeStopService{}},
{Client: client, Stream: "runtime:stop_jobs", BlockTimeout: time.Second, StopService: &fakeStopService{}, JobResults: &fakeJobResults{}},
}
for index, cfg := range cases {
_, err := stopjobsconsumer.NewConsumer(cfg)
require.Errorf(t, err, "case %d should fail", index)
}
}
func TestHandleMessageSuccessPublishesSuccessResult(t *testing.T) {
h := newHarness(t)
h.stops.result = stopruntime.Result{
Record: runtime.RuntimeRecord{
GameID: "game-1",
Status: runtime.StatusStopped,
CurrentContainerID: "c-1",
CurrentImageRef: "galaxy/game:1.0.0",
EngineEndpoint: "http://galaxy-game-game-1:8080",
},
Outcome: operation.OutcomeSuccess,
}
h.consumer.HandleMessage(context.Background(), stopMessage("100-0", "game-1", "cancelled", 1700))
inputs := h.stops.Inputs()
require.Len(t, inputs, 1)
assert.Equal(t, "game-1", inputs[0].GameID)
assert.Equal(t, stopruntime.StopReasonCancelled, inputs[0].Reason)
assert.Equal(t, operation.OpSourceLobbyStream, inputs[0].OpSource)
assert.Equal(t, "100-0", inputs[0].SourceRef)
published := h.results.Published()
require.Len(t, published, 1)
assert.Equal(t, ports.JobResult{
GameID: "game-1",
Outcome: ports.JobOutcomeSuccess,
ContainerID: "c-1",
EngineEndpoint: "http://galaxy-game-game-1:8080",
}, published[0])
}
func TestHandleMessageFailureNotFoundPublishesFailureResult(t *testing.T) {
h := newHarness(t)
h.stops.result = stopruntime.Result{
Outcome: operation.OutcomeFailure,
ErrorCode: startruntime.ErrorCodeNotFound,
ErrorMessage: "runtime record for game \"game-2\" does not exist",
}
h.consumer.HandleMessage(context.Background(), stopMessage("101-0", "game-2", "admin_request", 1700))
published := h.results.Published()
require.Len(t, published, 1)
assert.Equal(t, ports.JobResult{
GameID: "game-2",
Outcome: ports.JobOutcomeFailure,
ErrorCode: "not_found",
ErrorMessage: "runtime record for game \"game-2\" does not exist",
}, published[0])
}
func TestHandleMessageReplayNoOpForRemovedRecordHasEmptyContainerAndEndpoint(t *testing.T) {
h := newHarness(t)
h.stops.result = stopruntime.Result{
Record: runtime.RuntimeRecord{
GameID: "game-3",
Status: runtime.StatusRemoved,
CurrentContainerID: "",
EngineEndpoint: "http://galaxy-game-game-3:8080",
},
Outcome: operation.OutcomeSuccess,
ErrorCode: startruntime.ErrorCodeReplayNoOp,
}
h.consumer.HandleMessage(context.Background(), stopMessage("102-0", "game-3", "finished", 1700))
published := h.results.Published()
require.Len(t, published, 1)
assert.Equal(t, ports.JobResult{
GameID: "game-3",
Outcome: ports.JobOutcomeSuccess,
ContainerID: "",
EngineEndpoint: "http://galaxy-game-game-3:8080",
ErrorCode: "replay_no_op",
}, published[0])
}
func TestHandleMessageMalformedEnvelopesAreAbsorbed(t *testing.T) {
h := newHarness(t)
cases := []redis.XMessage{
{ID: "200-0", Values: map[string]any{"reason": "cancelled", "requested_at_ms": "1"}},
{ID: "200-1", Values: map[string]any{"game_id": "game-x", "requested_at_ms": "1"}},
{ID: "200-2", Values: map[string]any{"game_id": "game-x", "reason": " ", "requested_at_ms": "1"}},
{ID: "200-3", Values: map[string]any{"game_id": "game-x", "reason": "not_a_known_reason", "requested_at_ms": "1"}},
{ID: "200-4", Values: map[string]any{"game_id": "game-x", "reason": "cancelled", "requested_at_ms": "abc"}},
}
for _, msg := range cases {
h.consumer.HandleMessage(context.Background(), msg)
}
assert.Empty(t, h.stops.Inputs(), "malformed envelopes must not reach the stop service")
assert.Empty(t, h.results.Published(), "malformed envelopes must not produce job results")
}
func TestHandleMessagePublishFailureIsAbsorbed(t *testing.T) {
h := newHarness(t)
h.stops.result = stopruntime.Result{Outcome: operation.OutcomeFailure, ErrorCode: "internal_error"}
h.results.publishErr = errors.New("redis transient")
h.consumer.HandleMessage(context.Background(), stopMessage("300-0", "game-x", "cancelled", 1700))
require.Len(t, h.stops.Inputs(), 1, "service still runs even when publish fails")
}
func TestHandleMessageGoLevelErrorIsAbsorbed(t *testing.T) {
h := newHarness(t)
h.stops.err = errors.New("nil ctx")
h.consumer.HandleMessage(context.Background(), stopMessage("400-0", "game-y", "cancelled", 1700))
assert.Empty(t, h.results.Published(), "go-level service errors must not surface as job results")
}
func TestRunAdvancesOffsetPerMessage(t *testing.T) {
h := newHarness(t)
h.stops.result = stopruntime.Result{
Record: runtime.RuntimeRecord{
GameID: "game-5",
Status: runtime.StatusStopped,
CurrentContainerID: "c-5",
EngineEndpoint: "http://galaxy-game-game-5:8080",
},
Outcome: operation.OutcomeSuccess,
}
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
done := make(chan error, 1)
go func() { done <- h.consumer.Run(ctx) }()
mustXAdd(t, h.client, h.stream, "game-5", "cancelled", 1)
mustXAdd(t, h.client, h.stream, "game-5", "finished", 2)
require.Eventually(t, func() bool {
return len(h.results.Published()) == 2
}, time.Second, 10*time.Millisecond, "consumer must produce one job result per envelope")
cancel()
require.Eventually(t, func() bool {
select {
case <-done:
return true
default:
return false
}
}, time.Second, 10*time.Millisecond, "Run must exit after context cancel")
id, ok := h.offsets.Get("stopjobs")
require.True(t, ok, "offset must be persisted after the run loop processed messages")
assert.NotEmpty(t, id, "offset entry id must not be empty")
}
func TestRunExitsImmediatelyOnAlreadyCancelledContext(t *testing.T) {
h := newHarness(t)
ctx, cancel := context.WithCancel(context.Background())
cancel()
err := h.consumer.Run(ctx)
require.ErrorIs(t, err, context.Canceled)
assert.Empty(t, h.stops.Inputs())
assert.Empty(t, h.results.Published())
}
// mustXAdd appends one stop-job envelope to the stream and returns its
// entry id.
func mustXAdd(t *testing.T, client *redis.Client, stream, gameID, reason string, requestedAtMS int64) string {
t.Helper()
id, err := client.XAdd(context.Background(), &redis.XAddArgs{
Stream: stream,
Values: map[string]any{
"game_id": gameID,
"reason": reason,
"requested_at_ms": strconv.FormatInt(requestedAtMS, 10),
},
}).Result()
require.NoError(t, err)
return id
}