feat: runtime manager

This commit is contained in:
Ilia Denisov
2026-04-28 20:39:18 +02:00
committed by GitHub
parent e0a99b346b
commit a7cee15115
289 changed files with 45660 additions and 2207 deletions
@@ -0,0 +1,68 @@
package startruntime
// Stable error codes surfaced through `Result.ErrorCode`. The vocabulary
// is frozen by `rtmanager/README.md §Error Model`,
// `rtmanager/api/internal-openapi.yaml`, and
// `rtmanager/api/runtime-jobs-asyncapi.yaml`. The constants are declared
// in the start-service package but serve as the canonical home for every
// lifecycle service under `internal/service/`: stop, restart, patch,
// cleanup, the REST handlers, and the stream consumers all import these
// names instead of redeclaring them. Renaming any constant is therefore
// a contract change.
const (
	// ErrorCodeReplayNoOp marks an idempotent replay: a running record
	// with the identical image_ref already exists, so the call was a no-op.
	ErrorCodeReplayNoOp = "replay_no_op"
	// ErrorCodeStartConfigInvalid marks a start request rejected before
	// any Docker work by a validation failure (malformed image_ref,
	// missing Docker network, unwritable state directory).
	ErrorCodeStartConfigInvalid = "start_config_invalid"
	// ErrorCodeImagePullFailed marks a failure in the image pull stage.
	ErrorCodeImagePullFailed = "image_pull_failed"
	// ErrorCodeContainerStartFailed marks a failed `docker create` /
	// `docker start`, or a runtime record that could not be installed
	// after a successful Run.
	ErrorCodeContainerStartFailed = "container_start_failed"
	// ErrorCodeConflict marks an operation incompatible with the current
	// runtime state: lease held elsewhere, running record with a
	// different image_ref, cleanup of a running runtime, or restart /
	// patch of a removed record.
	ErrorCodeConflict = "conflict"
	// ErrorCodeServiceUnavailable marks an unreachable steady-state
	// dependency (Docker daemon, PostgreSQL, Redis) during this call.
	ErrorCodeServiceUnavailable = "service_unavailable"
	// ErrorCodeInternal marks an unexpected error that none of the other
	// codes classify.
	ErrorCodeInternal = "internal_error"
	// ErrorCodeInvalidRequest marks a structurally invalid request
	// (empty required fields, unknown enum values). Used by the stop /
	// restart / patch / cleanup services for malformed Input; the start
	// service instead uses the stricter `start_config_invalid`, because
	// every start validation failure also raises an admin notification
	// intent.
	ErrorCodeInvalidRequest = "invalid_request"
	// ErrorCodeNotFound marks a stop / restart / patch / cleanup request
	// against a runtime record that does not exist. The start service
	// never raises it — start installs the record on first call.
	ErrorCodeNotFound = "not_found"
	// ErrorCodeImageRefNotSemver marks a patch rejected because the
	// current or new image reference does not parse as a semver tag.
	ErrorCodeImageRefNotSemver = "image_ref_not_semver"
	// ErrorCodeSemverPatchOnly marks a patch rejected because the major
	// or minor component differs between current and new references.
	ErrorCodeSemverPatchOnly = "semver_patch_only"
)
@@ -0,0 +1,940 @@
// Package startruntime implements the `start` lifecycle operation owned
// by Runtime Manager. The service is the single orchestrator behind
// both the asynchronous `runtime:start_jobs` consumer and the
// synchronous `POST /api/v1/internal/runtimes/{game_id}/start` REST
// handler; both callers obtain a deterministic Result with a stable
// `Outcome` / `ErrorCode` pair.
//
// Lifecycle and failure-mode semantics follow `rtmanager/README.md
// §Lifecycles → Start`. Design rationale is captured in
// `rtmanager/docs/services.md`.
package startruntime
import (
"context"
"crypto/rand"
"encoding/base64"
"encoding/json"
"errors"
"fmt"
"log/slog"
"os"
"path/filepath"
"strconv"
"strings"
"time"
"galaxy/notificationintent"
"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/domain/health"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/logging"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/telemetry"
"github.com/distribution/reference"
)
// Container labels applied to every engine container created by the
// start service. Frozen by `rtmanager/README.md §Container Model`.
const (
LabelOwner = "com.galaxy.owner"
LabelOwnerValue = "rtmanager"
LabelKind = "com.galaxy.kind"
LabelKindValue = "game-engine"
LabelGameID = "com.galaxy.game_id"
LabelEngineImageRef = "com.galaxy.engine_image_ref"
LabelStartedAtMs = "com.galaxy.started_at_ms"
// Image labels read at start time to derive resource limits.
imageLabelCPUQuota = "com.galaxy.cpu_quota"
imageLabelMemory = "com.galaxy.memory"
imageLabelPIDsLimit = "com.galaxy.pids_limit"
// HostnamePrefix is the constant prefix used to build the per-game
// container hostname (`galaxy-game-{game_id}`). The full hostname
// also forms the container name; restart and patch keep the same
// value so the engine endpoint stays stable across container
// recreates.
HostnamePrefix = "galaxy-game-"
// EngineStateBackCompatEnvName is the secondary env var name v1
// engines accept for the bind-mounted state directory. Always set
// alongside the configured primary name to honour the v1 backward
// compatibility commitment in `rtmanager/README.md §Container Model`.
EngineStateBackCompatEnvName = "STORAGE_PATH"
// leaseReleaseTimeout bounds the deferred lease-release call. A
// fresh background context is used so the release runs even when
// the request context was already canceled.
leaseReleaseTimeout = 5 * time.Second
)
// Input stores the per-call arguments for one start operation. The
// structural invariants below are enforced by Validate before any side
// effect runs.
type Input struct {
	// GameID identifies the platform game to start. Required:
	// Validate rejects a blank value.
	GameID string
	// ImageRef stores the producer-resolved Docker reference of the
	// engine image. Required. Validated against `distribution/reference`
	// before any Docker work.
	ImageRef string
	// OpSource classifies how the request entered Runtime Manager.
	// Required: every operation_log entry carries an op_source, and
	// Validate rejects unknown values.
	OpSource operation.OpSource
	// SourceRef stores the optional opaque per-source reference
	// (Redis Stream entry id, REST request id, admin user id). Empty
	// when the caller does not provide one.
	SourceRef string
}
// Validate reports whether input carries the structural invariants the
// service requires: a non-blank game id, a non-blank image ref, and a
// known op source.
func (input Input) Validate() error {
	switch {
	case strings.TrimSpace(input.GameID) == "":
		return fmt.Errorf("game id must not be empty")
	case strings.TrimSpace(input.ImageRef) == "":
		return fmt.Errorf("image ref must not be empty")
	case !input.OpSource.IsKnown():
		return fmt.Errorf("op source %q is unsupported", input.OpSource)
	default:
		return nil
	}
}
// Result stores the deterministic outcome of one Handle (or Run) call.
// Callers branch on Outcome first, then on ErrorCode.
type Result struct {
	// Record carries the runtime record installed by the operation.
	// Populated on success and on idempotent replay (`replay_no_op`);
	// zero on failure.
	Record runtime.RuntimeRecord
	// Outcome reports whether the operation completed (success) or
	// produced a stable failure code.
	Outcome operation.Outcome
	// ErrorCode stores the stable error code (one of the ErrorCode*
	// constants) on failure, or `replay_no_op` on idempotent replay.
	// Empty for fresh successes.
	ErrorCode string
	// ErrorMessage stores the operator-readable detail on failure.
	// Empty for successes.
	ErrorMessage string
}
// Dependencies groups the collaborators required by Service. All fields
// except Lobby, Logger, Clock, NewToken, and PrepareStateDir are
// required; NewService rejects missing required fields.
type Dependencies struct {
	// RuntimeRecords reads and installs the durable runtime record.
	RuntimeRecords ports.RuntimeRecordStore
	// OperationLogs records the success / failure audit entry.
	OperationLogs ports.OperationLogStore
	// Docker drives the Docker daemon (network check, pull, inspect,
	// run, remove).
	Docker ports.DockerClient
	// Leases serialises operations against the same game id.
	Leases ports.GameLeaseStore
	// HealthEvents publishes `runtime:health_events` and upserts the
	// matching `health_snapshots` row.
	HealthEvents ports.HealthEventPublisher
	// Notifications publishes admin-only failure intents.
	Notifications ports.NotificationIntentPublisher
	// Lobby provides best-effort diagnostic context for the started
	// game. May be nil; the start operation does not depend on it.
	Lobby ports.LobbyInternalClient
	// Container groups the per-container defaults and state-directory
	// settings consumed at start time.
	Container config.ContainerConfig
	// DockerCfg groups the Docker daemon settings (network, log driver,
	// pull policy) consumed at start time.
	DockerCfg config.DockerConfig
	// Coordination supplies the per-game lease TTL.
	Coordination config.CoordinationConfig
	// Telemetry records start outcomes, lease latency, and health
	// event counters. Required.
	Telemetry *telemetry.Runtime
	// Logger records structured service-level events. Defaults to
	// `slog.Default()` when nil.
	Logger *slog.Logger
	// Clock supplies the wall-clock used for operation timestamps.
	// Defaults to `time.Now` when nil.
	Clock func() time.Time
	// NewToken supplies a unique opaque lease token. Defaults to a
	// 32-byte random base64url string when nil. Tests may override.
	NewToken func() string
	// PrepareStateDir creates the per-game state directory and
	// returns its absolute host path. Defaults to a real-filesystem
	// implementation that honours Container.GameStateRoot,
	// Container.GameStateDirMode, and Container.GameStateOwner{UID,GID}.
	// Tests override to point at a temporary directory.
	PrepareStateDir func(gameID string) (string, error)
}
// Service executes the start lifecycle operation. Fields mirror
// Dependencies after NewService has applied defaults (logger, clock,
// token generator, state-dir preparer); always construct via NewService.
type Service struct {
	runtimeRecords ports.RuntimeRecordStore
	operationLogs  ports.OperationLogStore
	docker         ports.DockerClient
	leases         ports.GameLeaseStore
	healthEvents   ports.HealthEventPublisher
	notifications  ports.NotificationIntentPublisher
	// lobby may be nil; used only for best-effort diagnostics.
	lobby        ports.LobbyInternalClient
	containerCfg config.ContainerConfig
	dockerCfg    config.DockerConfig
	// leaseTTL copies Dependencies.Coordination.GameLeaseTTL.
	leaseTTL  time.Duration
	telemetry *telemetry.Runtime
	logger    *slog.Logger
	// clock, newToken, and prepareStateDir are always non-nil after
	// NewService (defaults applied there); tests may inject fakes.
	clock           func() time.Time
	newToken        func() string
	prepareStateDir func(gameID string) (string, error)
}
// NewService constructs one Service from deps. It rejects nil required
// collaborators, validates the three config sections, and applies
// defaults for the optional logger, clock, token generator, and
// state-directory preparer.
func NewService(deps Dependencies) (*Service, error) {
	// Required collaborators, checked in declaration order so the
	// first missing one is reported.
	required := []struct {
		missing bool
		message string
	}{
		{deps.RuntimeRecords == nil, "new start runtime service: nil runtime records"},
		{deps.OperationLogs == nil, "new start runtime service: nil operation logs"},
		{deps.Docker == nil, "new start runtime service: nil docker client"},
		{deps.Leases == nil, "new start runtime service: nil lease store"},
		{deps.HealthEvents == nil, "new start runtime service: nil health events publisher"},
		{deps.Notifications == nil, "new start runtime service: nil notification publisher"},
		{deps.Telemetry == nil, "new start runtime service: nil telemetry runtime"},
	}
	for _, requirement := range required {
		if requirement.missing {
			return nil, errors.New(requirement.message)
		}
	}
	if err := deps.Container.Validate(); err != nil {
		return nil, fmt.Errorf("new start runtime service: container config: %w", err)
	}
	if err := deps.DockerCfg.Validate(); err != nil {
		return nil, fmt.Errorf("new start runtime service: docker config: %w", err)
	}
	if err := deps.Coordination.Validate(); err != nil {
		return nil, fmt.Errorf("new start runtime service: coordination config: %w", err)
	}
	// Optional collaborators fall back to sensible defaults.
	clock := deps.Clock
	if clock == nil {
		clock = time.Now
	}
	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}
	logger = logger.With("service", "rtmanager.startruntime")
	newToken := deps.NewToken
	if newToken == nil {
		newToken = defaultTokenGenerator()
	}
	prepareStateDir := deps.PrepareStateDir
	if prepareStateDir == nil {
		prepareStateDir = newDefaultStateDirPreparer(deps.Container)
	}
	return &Service{
		runtimeRecords:  deps.RuntimeRecords,
		operationLogs:   deps.OperationLogs,
		docker:          deps.Docker,
		leases:          deps.Leases,
		healthEvents:    deps.HealthEvents,
		notifications:   deps.Notifications,
		lobby:           deps.Lobby,
		containerCfg:    deps.Container,
		dockerCfg:       deps.DockerCfg,
		leaseTTL:        deps.Coordination.GameLeaseTTL,
		telemetry:       deps.Telemetry,
		logger:          logger,
		clock:           clock,
		newToken:        newToken,
		prepareStateDir: prepareStateDir,
	}, nil
}
// Handle executes one start operation end-to-end: validate the input,
// acquire the per-game lease, run the lease-protected steps, release
// the lease. The Go-level error return is reserved for non-business
// failures (nil context, nil receiver); every business outcome — fresh
// success, idempotent replay, or any stable failure mode — flows
// through Result.
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
	switch {
	case service == nil:
		return Result{}, errors.New("start runtime: nil service")
	case ctx == nil:
		return Result{}, errors.New("start runtime: nil context")
	}
	startedAt := service.clock().UTC()
	if validationErr := input.Validate(); validationErr != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:      startedAt,
			input:            input,
			errorCode:        ErrorCodeStartConfigInvalid,
			errorMessage:     validationErr.Error(),
			notificationType: notificationintent.NotificationTypeRuntimeStartConfigInvalid,
		}), nil
	}
	// Serialise against other lifecycle operations on the same game.
	// Lease-acquire latency is recorded regardless of the outcome.
	leaseToken := service.newToken()
	acquireBegan := service.clock()
	acquired, acquireErr := service.leases.TryAcquire(ctx, input.GameID, leaseToken, service.leaseTTL)
	service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(acquireBegan))
	if acquireErr != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  startedAt,
			input:        input,
			errorCode:    ErrorCodeServiceUnavailable,
			errorMessage: fmt.Sprintf("acquire game lease: %s", acquireErr.Error()),
		}), nil
	}
	if !acquired {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  startedAt,
			input:        input,
			errorCode:    ErrorCodeConflict,
			errorMessage: "another lifecycle operation is in progress for this game",
		}), nil
	}
	defer service.releaseLease(ctx, input.GameID, leaseToken)
	return service.runUnderLease(ctx, input, startedAt)
}
// Run executes the start lifecycle assuming the per-game lease is
// already held by the caller. It exists for orchestrator services in
// `internal/service/` that compose start with another operation under a
// single outer lease (restart and patch); external callers must use
// Handle, which acquires and releases the lease itself.
//
// Run still validates input and reports business outcomes through
// Result; the Go-level error return is reserved for non-business
// failures (nil context, nil receiver). Operation log entries,
// telemetry counters, health events and admin-only notification
// intents fire identically to Handle.
func (service *Service) Run(ctx context.Context, input Input) (Result, error) {
	switch {
	case service == nil:
		return Result{}, errors.New("start runtime: nil service")
	case ctx == nil:
		return Result{}, errors.New("start runtime: nil context")
	}
	startedAt := service.clock().UTC()
	validationErr := input.Validate()
	if validationErr == nil {
		return service.runUnderLease(ctx, input, startedAt)
	}
	return service.recordFailure(ctx, failureCtx{
		opStartedAt:      startedAt,
		input:            input,
		errorCode:        ErrorCodeStartConfigInvalid,
		errorMessage:     validationErr.Error(),
		notificationType: notificationintent.NotificationTypeRuntimeStartConfigInvalid,
	}), nil
}
// runUnderLease executes the post-validation, lease-protected start
// steps shared by Handle and Run. Callers must validate input and
// acquire the lease (when applicable) before invocation.
//
// Stage order: load the existing record (replay / conflict detection),
// best-effort Lobby diagnostics, image-ref validation, Docker network
// check, image pull + inspect, state-dir preparation, container run,
// runtime-record upsert, then the best-effort audit / health /
// telemetry side effects. A failed upsert rolls back the container.
func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) {
	// Stage 1: replay / conflict detection against the durable record.
	existing, hasExisting, err := service.loadExisting(ctx, input.GameID)
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    ErrorCodeInternal,
			errorMessage: fmt.Sprintf("load runtime record: %s", err.Error()),
		}), nil
	}
	if hasExisting && existing.Status == runtime.StatusRunning {
		// Same image on a running record is an idempotent replay; a
		// different image is a conflict the caller must resolve first.
		if existing.CurrentImageRef == input.ImageRef {
			return service.recordReplayNoOp(ctx, opStartedAt, input, existing), nil
		}
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    ErrorCodeConflict,
			errorMessage: fmt.Sprintf("runtime already running with image_ref %q", existing.CurrentImageRef),
		}), nil
	}
	// Stage 2: best-effort diagnostic context; never blocks the start.
	service.fetchLobbyDiagnostic(ctx, input.GameID)
	// Stage 3: reject malformed references before any daemon work.
	if err := validateImageRef(input.ImageRef); err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:      opStartedAt,
			input:            input,
			errorCode:        ErrorCodeStartConfigInvalid,
			errorMessage:     fmt.Sprintf("invalid image_ref: %s", err.Error()),
			notificationType: notificationintent.NotificationTypeRuntimeStartConfigInvalid,
		}), nil
	}
	// Stage 4: a missing network is a config error; any other failure
	// is treated as the daemon being unavailable.
	if err := service.docker.EnsureNetwork(ctx, service.dockerCfg.Network); err != nil {
		if errors.Is(err, ports.ErrNetworkMissing) {
			return service.recordFailure(ctx, failureCtx{
				opStartedAt:      opStartedAt,
				input:            input,
				errorCode:        ErrorCodeStartConfigInvalid,
				errorMessage:     fmt.Sprintf("docker network %q is missing", service.dockerCfg.Network),
				notificationType: notificationintent.NotificationTypeRuntimeStartConfigInvalid,
			}), nil
		}
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    ErrorCodeServiceUnavailable,
			errorMessage: fmt.Sprintf("ensure docker network: %s", err.Error()),
		}), nil
	}
	// Stage 5: pull then inspect; inspect failures share the pull
	// error code because both mean the image is unusable.
	if err := service.docker.PullImage(ctx, input.ImageRef, ports.PullPolicy(service.dockerCfg.PullPolicy)); err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:      opStartedAt,
			input:            input,
			errorCode:        ErrorCodeImagePullFailed,
			errorMessage:     err.Error(),
			notificationType: notificationintent.NotificationTypeRuntimeImagePullFailed,
		}), nil
	}
	imageInspect, err := service.docker.InspectImage(ctx, input.ImageRef)
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:      opStartedAt,
			input:            input,
			errorCode:        ErrorCodeImagePullFailed,
			errorMessage:     fmt.Sprintf("inspect image: %s", err.Error()),
			notificationType: notificationintent.NotificationTypeRuntimeImagePullFailed,
		}), nil
	}
	// Stage 6: resource limits from image labels (with config
	// fallbacks) and the per-game bind-mounted state directory.
	cpuQuota, memory, pidsLimit := service.resolveLimits(imageInspect.Labels)
	statePath, err := service.prepareStateDir(input.GameID)
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:      opStartedAt,
			input:            input,
			errorCode:        ErrorCodeStartConfigInvalid,
			errorMessage:     fmt.Sprintf("prepare state directory: %s", err.Error()),
			notificationType: notificationintent.NotificationTypeRuntimeStartConfigInvalid,
		}), nil
	}
	// Stage 7: create and start the container. Name == hostname so the
	// engine endpoint stays stable across recreates.
	hostname := containerHostname(input.GameID)
	spec := ports.RunSpec{
		Name:     hostname,
		Image:    input.ImageRef,
		Hostname: hostname,
		Network:  service.dockerCfg.Network,
		Env:      service.buildEnv(),
		Labels:   service.buildLabels(input.GameID, input.ImageRef, opStartedAt),
		BindMounts: []ports.BindMount{{
			HostPath:  statePath,
			MountPath: service.containerCfg.EngineStateMountPath,
			ReadOnly:  false,
		}},
		LogDriver: service.dockerCfg.LogDriver,
		LogOpts:   parseLogOpts(service.dockerCfg.LogOpts),
		CPUQuota:  cpuQuota,
		Memory:    memory,
		PIDsLimit: pidsLimit,
	}
	runResult, err := service.docker.Run(ctx, spec)
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:      opStartedAt,
			input:            input,
			errorCode:        ErrorCodeContainerStartFailed,
			errorMessage:     err.Error(),
			notificationType: notificationintent.NotificationTypeRuntimeContainerStartFailed,
		}), nil
	}
	// Stage 8: persist the runtime record. CreatedAt is preserved from
	// a prior (non-running) record so it reflects first installation.
	createdAt := opStartedAt
	if hasExisting && !existing.CreatedAt.IsZero() {
		createdAt = existing.CreatedAt
	}
	startedAt := runResult.StartedAt
	record := runtime.RuntimeRecord{
		GameID:             input.GameID,
		Status:             runtime.StatusRunning,
		CurrentContainerID: runResult.ContainerID,
		CurrentImageRef:    input.ImageRef,
		EngineEndpoint:     runResult.EngineEndpoint,
		StatePath:          statePath,
		DockerNetwork:      service.dockerCfg.Network,
		StartedAt:          &startedAt,
		LastOpAt:           startedAt,
		CreatedAt:          createdAt,
	}
	if err := service.runtimeRecords.Upsert(ctx, record); err != nil {
		// The container is up but unrecorded: remove it best-effort so
		// it does not run unowned until the reconciler adopts it.
		service.bestEffortRemove(input.GameID, runResult.ContainerID)
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:      opStartedAt,
			input:            input,
			errorCode:        ErrorCodeContainerStartFailed,
			errorMessage:     fmt.Sprintf("upsert runtime record: %s", err.Error()),
			containerID:      runResult.ContainerID,
			notificationType: notificationintent.NotificationTypeRuntimeContainerStartFailed,
		}), nil
	}
	// Stage 9: best-effort audit, health event, telemetry, and logging.
	finishedAt := service.clock().UTC()
	service.bestEffortAppend(ctx, operation.OperationEntry{
		GameID:      input.GameID,
		OpKind:      operation.OpKindStart,
		OpSource:    input.OpSource,
		SourceRef:   input.SourceRef,
		ImageRef:    input.ImageRef,
		ContainerID: runResult.ContainerID,
		Outcome:     operation.OutcomeSuccess,
		StartedAt:   opStartedAt,
		FinishedAt:  &finishedAt,
	})
	service.bestEffortPublishHealth(ctx, ports.HealthEventEnvelope{
		GameID:      input.GameID,
		ContainerID: runResult.ContainerID,
		EventType:   health.EventTypeContainerStarted,
		OccurredAt:  startedAt,
		Details:     containerStartedDetails(input.ImageRef),
	})
	service.telemetry.RecordStartOutcome(ctx, string(operation.OutcomeSuccess), "", string(input.OpSource))
	service.telemetry.RecordHealthEvent(ctx, string(health.EventTypeContainerStarted))
	logArgs := []any{
		"game_id", input.GameID,
		"container_id", runResult.ContainerID,
		"image_ref", input.ImageRef,
		"op_source", string(input.OpSource),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	service.logger.InfoContext(ctx, "runtime started", logArgs...)
	return Result{
		Record:  record,
		Outcome: operation.OutcomeSuccess,
	}, nil
}
// failureCtx groups the inputs to recordFailure so the Handle method
// stays readable.
type failureCtx struct {
	opStartedAt  time.Time // UTC instant the failed operation began
	input        Input     // the original request being reported
	errorCode    string    // one of the stable ErrorCode* constants
	errorMessage string    // operator-readable failure detail
	// containerID is non-empty only when a container was created
	// before the failure (upsert-failure rollback path).
	containerID string
	// notificationType, when non-empty, selects the admin-only intent
	// published by bestEffortNotify; empty disables notification.
	notificationType notificationintent.NotificationType
}
// recordFailure records the failure operation_log entry, publishes the
// matching admin-only notification intent (when applicable), and emits
// telemetry. All side effects are best-effort; a downstream failure is
// logged but does not change the returned Result.
func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result {
	finishedAt := service.clock().UTC()
	entry := operation.OperationEntry{
		GameID:       fc.input.GameID,
		OpKind:       operation.OpKindStart,
		OpSource:     fc.input.OpSource,
		SourceRef:    fc.input.SourceRef,
		ImageRef:     fc.input.ImageRef,
		ContainerID:  fc.containerID,
		Outcome:      operation.OutcomeFailure,
		ErrorCode:    fc.errorCode,
		ErrorMessage: fc.errorMessage,
		StartedAt:    fc.opStartedAt,
		FinishedAt:   &finishedAt,
	}
	service.bestEffortAppend(ctx, entry)
	if fc.notificationType != "" {
		service.bestEffortNotify(ctx, fc)
	}
	service.telemetry.RecordStartOutcome(ctx, string(operation.OutcomeFailure), fc.errorCode, string(fc.input.OpSource))
	attrs := append([]any{
		"game_id", fc.input.GameID,
		"image_ref", fc.input.ImageRef,
		"op_source", string(fc.input.OpSource),
		"error_code", fc.errorCode,
		"error_message", fc.errorMessage,
	}, logging.ContextAttrs(ctx)...)
	service.logger.WarnContext(ctx, "runtime start failed", attrs...)
	return Result{
		Outcome:      operation.OutcomeFailure,
		ErrorCode:    fc.errorCode,
		ErrorMessage: fc.errorMessage,
	}
}
// recordReplayNoOp records the idempotent replay outcome and returns
// the existing record. The operation_log entry is appended best-effort
// so audit history captures the replay; telemetry counts the call as a
// successful start with `error_code=replay_no_op`.
func (service *Service) recordReplayNoOp(ctx context.Context, opStartedAt time.Time, input Input, existing runtime.RuntimeRecord) Result {
	finishedAt := service.clock().UTC()
	entry := operation.OperationEntry{
		GameID:      input.GameID,
		OpKind:      operation.OpKindStart,
		OpSource:    input.OpSource,
		SourceRef:   input.SourceRef,
		ImageRef:    input.ImageRef,
		ContainerID: existing.CurrentContainerID,
		Outcome:     operation.OutcomeSuccess,
		ErrorCode:   ErrorCodeReplayNoOp,
		StartedAt:   opStartedAt,
		FinishedAt:  &finishedAt,
	}
	service.bestEffortAppend(ctx, entry)
	service.telemetry.RecordStartOutcome(ctx, string(operation.OutcomeSuccess), ErrorCodeReplayNoOp, string(input.OpSource))
	attrs := append([]any{
		"game_id", input.GameID,
		"container_id", existing.CurrentContainerID,
		"image_ref", input.ImageRef,
		"op_source", string(input.OpSource),
	}, logging.ContextAttrs(ctx)...)
	service.logger.InfoContext(ctx, "runtime start replay no-op", attrs...)
	return Result{
		Record:    existing,
		Outcome:   operation.OutcomeSuccess,
		ErrorCode: ErrorCodeReplayNoOp,
	}
}
// loadExisting reads the runtime record for gameID. The boolean return
// reports whether a record exists; ErrNotFound is translated to
// (zero, false, nil) so callers never branch on the sentinel
// themselves.
func (service *Service) loadExisting(ctx context.Context, gameID string) (runtime.RuntimeRecord, bool, error) {
	record, err := service.runtimeRecords.Get(ctx, gameID)
	if errors.Is(err, runtime.ErrNotFound) {
		return runtime.RuntimeRecord{}, false, nil
	}
	if err != nil {
		return runtime.RuntimeRecord{}, false, err
	}
	return record, true, nil
}
// fetchLobbyDiagnostic best-effort enriches the request log with the
// Lobby-side game record. A nil Lobby client or any transport failure
// is logged at debug level and the start operation continues.
func (service *Service) fetchLobbyDiagnostic(ctx context.Context, gameID string) {
	lobby := service.lobby
	if lobby == nil {
		return
	}
	game, fetchErr := lobby.GetGame(ctx, gameID)
	if fetchErr != nil {
		service.logger.DebugContext(ctx, "lobby diagnostic fetch failed",
			"game_id", gameID,
			"err", fetchErr.Error(),
		)
		return
	}
	service.logger.DebugContext(ctx, "lobby diagnostic fetched",
		"game_id", gameID,
		"lobby_status", game.Status,
		"lobby_target_engine_version", game.TargetEngineVersion,
	)
}
// resolveLimits derives the per-container resource limits from the
// resolved image's labels, falling back to the configured defaults.
// Unparseable or non-positive label values silently keep the default;
// operators observe the chosen values through
// `rtmanager.docker_op_latency` and start logs.
func (service *Service) resolveLimits(labels map[string]string) (cpuQuota float64, memory string, pidsLimit int) {
	defaults := service.containerCfg
	cpuQuota = defaults.DefaultCPUQuota
	memory = defaults.DefaultMemory
	pidsLimit = defaults.DefaultPIDsLimit
	if label, present := labels[imageLabelCPUQuota]; present {
		if parsed, parseErr := strconv.ParseFloat(label, 64); parseErr == nil && parsed > 0 {
			cpuQuota = parsed
		}
	}
	if label, present := labels[imageLabelMemory]; present && strings.TrimSpace(label) != "" {
		memory = label
	}
	if label, present := labels[imageLabelPIDsLimit]; present {
		if parsed, parseErr := strconv.Atoi(label); parseErr == nil && parsed > 0 {
			pidsLimit = parsed
		}
	}
	return cpuQuota, memory, pidsLimit
}
// buildEnv assembles the env-var map handed to the engine. Both the
// configured primary name and the legacy `STORAGE_PATH` point at the
// same mount path, per `rtmanager/README.md §Container Model` v1
// backward compatibility.
func (service *Service) buildEnv() map[string]string {
	mountPath := service.containerCfg.EngineStateMountPath
	return map[string]string{
		service.containerCfg.EngineStateEnvName: mountPath,
		EngineStateBackCompatEnvName:            mountPath,
	}
}
// buildLabels assembles the container labels per
// `rtmanager/README.md §Container Model`.
func (service *Service) buildLabels(gameID, imageRef string, startedAt time.Time) map[string]string {
	startedAtMs := strconv.FormatInt(startedAt.UTC().UnixMilli(), 10)
	labels := map[string]string{
		LabelOwner:          LabelOwnerValue,
		LabelKind:           LabelKindValue,
		LabelGameID:         gameID,
		LabelEngineImageRef: imageRef,
		LabelStartedAtMs:    startedAtMs,
	}
	return labels
}
// releaseLease releases the per-game lease using a fresh background
// context so a canceled request context does not leave the lease
// pinned for its TTL. The request ctx is kept only for log correlation.
func (service *Service) releaseLease(ctx context.Context, gameID, token string) {
	releaseCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
	defer cancel()
	releaseErr := service.leases.Release(releaseCtx, gameID, token)
	if releaseErr == nil {
		return
	}
	service.logger.WarnContext(ctx, "release game lease",
		"game_id", gameID,
		"err", releaseErr.Error(),
	)
}
// bestEffortAppend writes one operation_log entry. A failure is logged
// and discarded; the durable runtime record (or its absence) remains
// the source of truth.
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
	_, appendErr := service.operationLogs.Append(ctx, entry)
	if appendErr == nil {
		return
	}
	service.logger.ErrorContext(ctx, "append operation log",
		"game_id", entry.GameID,
		"op_kind", string(entry.OpKind),
		"outcome", string(entry.Outcome),
		"error_code", entry.ErrorCode,
		"err", appendErr.Error(),
	)
}
// bestEffortPublishHealth emits one health event + snapshot upsert.
// Failures degrade silently per `rtmanager/README.md §Notification
// Contracts`; the runtime record remains the source of truth.
func (service *Service) bestEffortPublishHealth(ctx context.Context, envelope ports.HealthEventEnvelope) {
	publishErr := service.healthEvents.Publish(ctx, envelope)
	if publishErr == nil {
		return
	}
	service.logger.ErrorContext(ctx, "publish health event",
		"game_id", envelope.GameID,
		"container_id", envelope.ContainerID,
		"event_type", string(envelope.EventType),
		"err", publishErr.Error(),
	)
}
// bestEffortNotify publishes one admin-only failure intent. Build and
// publish failures degrade silently (logged only) because the source
// business state already reflects the outcome.
func (service *Service) bestEffortNotify(ctx context.Context, fc failureCtx) {
	intent, buildErr := buildFailureIntent(fc, service.clock().UTC())
	if buildErr != nil {
		service.logger.ErrorContext(ctx, "build notification intent",
			"game_id", fc.input.GameID,
			"notification_type", string(fc.notificationType),
			"err", buildErr.Error(),
		)
		return
	}
	if publishErr := service.notifications.Publish(ctx, intent); publishErr != nil {
		service.logger.ErrorContext(ctx, "publish notification intent",
			"game_id", fc.input.GameID,
			"notification_type", string(fc.notificationType),
			"err", publishErr.Error(),
		)
		return
	}
	service.telemetry.RecordNotificationIntent(ctx, string(fc.notificationType))
}
// bestEffortRemove forces removal of a container left running by a
// start that progressed past Run but failed to register the runtime
// record. Failures degrade silently — the reconciler adopts any orphan
// its periodic pass observes. A background context with a short
// timeout is used because the request context may already be canceled.
func (service *Service) bestEffortRemove(gameID, containerID string) {
	removeCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
	defer cancel()
	removeErr := service.docker.Remove(removeCtx, containerID)
	if removeErr == nil {
		return
	}
	service.logger.ErrorContext(removeCtx, "rollback container after upsert failure",
		"game_id", gameID,
		"container_id", containerID,
		"err", removeErr.Error(),
	)
}
// containerHostname builds the per-game hostname — which also serves
// as the Docker container name — by prefixing the game id.
func containerHostname(gameID string) string {
	hostname := HostnamePrefix + gameID
	return hostname
}
// containerStartedDetails builds the `details` payload required by the
// `container_started` AsyncAPI variant: a JSON object with the single
// `image_ref` key.
func containerStartedDetails(imageRef string) json.RawMessage {
	type payload struct {
		ImageRef string `json:"image_ref"`
	}
	// Marshal cannot fail for a flat struct of strings; the error is
	// intentionally discarded.
	encoded, _ := json.Marshal(payload{ImageRef: imageRef})
	return encoded
}
// validateImageRef rejects malformed Docker references before any
// daemon round-trip. Callers surface a parse failure as
// `start_config_invalid`; daemon-side rejections after a valid parse
// are reported as `image_pull_failed` instead.
func validateImageRef(ref string) error {
	if strings.TrimSpace(ref) == "" {
		return fmt.Errorf("image ref must not be empty")
	}
	_, parseErr := reference.ParseNormalizedNamed(ref)
	return parseErr
}
// parseLogOpts turns the `key=value,key2=value2` shape of the
// `RTMANAGER_DOCKER_LOG_OPTS` config into a map suitable for the
// Docker SDK. Blank input, blank entries, and entries without a
// non-empty key are skipped; when nothing valid remains the function
// returns nil so the SDK falls back to driver defaults.
func parseLogOpts(raw string) map[string]string {
	if strings.TrimSpace(raw) == "" {
		return nil
	}
	opts := make(map[string]string)
	for _, piece := range strings.Split(raw, ",") {
		pair := strings.TrimSpace(piece)
		if pair == "" {
			continue
		}
		// Only the first '=' separates key from value; values may
		// themselves contain '='.
		key, value, found := strings.Cut(pair, "=")
		if !found || key == "" {
			continue
		}
		opts[key] = value
	}
	if len(opts) == 0 {
		return nil
	}
	return opts
}
// buildFailureIntent constructs the admin-only notification intent for
// fc. The idempotency key is scoped per (notification_type, game_id,
// attempted_at_ms), so Notification Service recognises the same
// failure observed twice within the same millisecond as a duplicate.
// NOTE(review): image_ref is carried in every payload but is NOT part
// of the idempotency key, although an earlier comment claimed it was —
// confirm against the Notification Service contract whether per-image
// deduplication is required.
func buildFailureIntent(fc failureCtx, attemptedAt time.Time) (notificationintent.Intent, error) {
	attemptedAtMs := attemptedAt.UnixMilli()
	idempotencyKey := fmt.Sprintf("%s.%s.%d", fc.notificationType, fc.input.GameID, attemptedAtMs)
	metadata := notificationintent.Metadata{
		IdempotencyKey: idempotencyKey,
		OccurredAt:     attemptedAt,
	}
	// Each supported type maps onto its dedicated payload constructor;
	// all three payloads carry the same fields.
	switch fc.notificationType {
	case notificationintent.NotificationTypeRuntimeImagePullFailed:
		return notificationintent.NewRuntimeImagePullFailedIntent(metadata, notificationintent.RuntimeImagePullFailedPayload{
			GameID:        fc.input.GameID,
			ImageRef:      fc.input.ImageRef,
			ErrorCode:     fc.errorCode,
			ErrorMessage:  fc.errorMessage,
			AttemptedAtMs: attemptedAtMs,
		})
	case notificationintent.NotificationTypeRuntimeContainerStartFailed:
		return notificationintent.NewRuntimeContainerStartFailedIntent(metadata, notificationintent.RuntimeContainerStartFailedPayload{
			GameID:        fc.input.GameID,
			ImageRef:      fc.input.ImageRef,
			ErrorCode:     fc.errorCode,
			ErrorMessage:  fc.errorMessage,
			AttemptedAtMs: attemptedAtMs,
		})
	case notificationintent.NotificationTypeRuntimeStartConfigInvalid:
		return notificationintent.NewRuntimeStartConfigInvalidIntent(metadata, notificationintent.RuntimeStartConfigInvalidPayload{
			GameID:        fc.input.GameID,
			ImageRef:      fc.input.ImageRef,
			ErrorCode:     fc.errorCode,
			ErrorMessage:  fc.errorMessage,
			AttemptedAtMs: attemptedAtMs,
		})
	default:
		return notificationintent.Intent{}, fmt.Errorf("unsupported notification type %q", fc.notificationType)
	}
}
// defaultTokenGenerator returns a function that produces 32-byte
// base64url-encoded tokens. The randomness source is `crypto/rand`;
// failures fall back to a deterministic-looking but invalid token so
// the caller observes a TryAcquire collision rather than a panic on a
// degraded entropy source.
func defaultTokenGenerator() func() string {
	return func() string {
		raw := make([]byte, 32)
		_, err := rand.Read(raw)
		if err != nil {
			// Constant sentinel: two callers generating it will collide
			// on the lease, which is the desired failure mode.
			return "rtmanager-fallback-token"
		}
		return base64.RawURLEncoding.EncodeToString(raw)
	}
}
// newDefaultStateDirPreparer returns a function that creates the
// per-game state directory under cfg.GameStateRoot with the configured
// permissions and ownership. The function is overridable through
// Dependencies.PrepareStateDir; tests inject a temporary-dir fake.
func newDefaultStateDirPreparer(cfg config.ContainerConfig) func(gameID string) (string, error) {
	// Capture the config values once so the returned closure does not
	// retain the whole ContainerConfig.
	dirMode := os.FileMode(cfg.GameStateDirMode)
	ownerUID := cfg.GameStateOwnerUID
	ownerGID := cfg.GameStateOwnerGID
	stateRoot := cfg.GameStateRoot
	return func(gameID string) (string, error) {
		dir := filepath.Join(stateRoot, gameID)
		if err := os.MkdirAll(dir, dirMode); err != nil {
			return "", fmt.Errorf("create state dir %q: %w", dir, err)
		}
		// MkdirAll is subject to the umask and is a no-op for existing
		// directories, so force the mode explicitly.
		if err := os.Chmod(dir, dirMode); err != nil {
			return "", fmt.Errorf("chmod state dir %q: %w", dir, err)
		}
		if err := os.Chown(dir, ownerUID, ownerGID); err != nil {
			return "", fmt.Errorf("chown state dir %q: %w", dir, err)
		}
		return dir, nil
	}
}
@@ -0,0 +1,693 @@
package startruntime_test
import (
"context"
"encoding/json"
"errors"
"sync"
"testing"
"time"
"galaxy/notificationintent"
"galaxy/rtmanager/internal/adapters/docker/mocks"
"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/domain/health"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/startruntime"
"galaxy/rtmanager/internal/telemetry"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/mock/gomock"
)
// --- test doubles -----------------------------------------------------
// fakeRuntimeRecords is an in-memory double for the runtime-records
// port. getErr/upsertErr inject failures; upserts records every
// successful Upsert for later assertions.
type fakeRuntimeRecords struct {
	mu sync.Mutex
	stored map[string]runtime.RuntimeRecord
	getErr error
	upsertErr error
	upserts []runtime.RuntimeRecord
}
// newFakeRuntimeRecords returns an empty, ready-to-use store.
func newFakeRuntimeRecords() *fakeRuntimeRecords {
	return &fakeRuntimeRecords{stored: make(map[string]runtime.RuntimeRecord)}
}
// Get returns the stored record, the injected getErr, or
// runtime.ErrNotFound when the game has no record.
func (r *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.getErr != nil {
		return runtime.RuntimeRecord{}, r.getErr
	}
	if record, ok := r.stored[gameID]; ok {
		return record, nil
	}
	return runtime.RuntimeRecord{}, runtime.ErrNotFound
}
// Upsert stores the record and appends it to upserts, or returns the
// injected upsertErr.
func (r *fakeRuntimeRecords) Upsert(_ context.Context, record runtime.RuntimeRecord) error {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.upsertErr != nil {
		return r.upsertErr
	}
	r.upserts = append(r.upserts, record)
	r.stored[record.GameID] = record
	return nil
}
// UpdateStatus is unused by the start-service tests.
func (r *fakeRuntimeRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error {
	return errors.New("not used in start tests")
}
// ListByStatus is unused by the start-service tests.
func (r *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) {
	return nil, errors.New("not used in start tests")
}
// List is unused by the start-service tests.
func (r *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
	return nil, errors.New("not used in start tests")
}
// fakeOperationLogs captures appended operation entries; appendErr
// injects a write failure.
type fakeOperationLogs struct {
	mu sync.Mutex
	appendErr error
	appends []operation.OperationEntry
}
// Append records the entry and returns its 1-based sequence number, or
// the injected appendErr.
func (o *fakeOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) {
	o.mu.Lock()
	defer o.mu.Unlock()
	if o.appendErr != nil {
		return 0, o.appendErr
	}
	o.appends = append(o.appends, entry)
	return int64(len(o.appends)), nil
}
// ListByGame is unused by the start-service tests.
func (o *fakeOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) {
	return nil, errors.New("not used in start tests")
}
// lastAppend returns the most recently appended entry, with ok=false
// when nothing has been appended yet.
func (o *fakeOperationLogs) lastAppend() (operation.OperationEntry, bool) {
	o.mu.Lock()
	defer o.mu.Unlock()
	count := len(o.appends)
	if count == 0 {
		return operation.OperationEntry{}, false
	}
	return o.appends[count-1], true
}
// fakeLeases simulates the distributed game lease. acquired controls
// whether TryAcquire succeeds; acquireErr/releaseErr inject failures.
// acquires/releases record the tokens passed to each call.
type fakeLeases struct {
	acquired bool
	acquireErr error
	releaseErr error
	mu sync.Mutex
	acquires []string
	releases []string
}
// TryAcquire records the token, then reports the configured outcome.
func (f *fakeLeases) TryAcquire(_ context.Context, _, token string, _ time.Duration) (bool, error) {
	f.mu.Lock()
	defer f.mu.Unlock()
	// The token is recorded even when the call is configured to fail,
	// so tests can assert the attempt happened.
	f.acquires = append(f.acquires, token)
	if f.acquireErr != nil {
		return false, f.acquireErr
	}
	return f.acquired, nil
}
// Release records the token and returns the injected releaseErr.
func (f *fakeLeases) Release(_ context.Context, _, token string) error {
	f.mu.Lock()
	defer f.mu.Unlock()
	f.releases = append(f.releases, token)
	return f.releaseErr
}
// fakeHealthEvents captures published health-event envelopes;
// publishErr injects a publish failure.
type fakeHealthEvents struct {
	mu sync.Mutex
	publishErr error
	envelopes []ports.HealthEventEnvelope
}
// Publish records the envelope, or returns the injected publishErr.
func (e *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error {
	e.mu.Lock()
	defer e.mu.Unlock()
	if e.publishErr != nil {
		return e.publishErr
	}
	e.envelopes = append(e.envelopes, envelope)
	return nil
}
// fakeNotifications captures published notification intents;
// publishErr injects a publish failure.
type fakeNotifications struct {
	mu sync.Mutex
	publishErr error
	intents []notificationintent.Intent
}
// Publish records the intent, or returns the injected publishErr.
func (f *fakeNotifications) Publish(_ context.Context, intent notificationintent.Intent) error {
	f.mu.Lock()
	defer f.mu.Unlock()
	if f.publishErr != nil {
		return f.publishErr
	}
	f.intents = append(f.intents, intent)
	return nil
}
// fakeLobby returns a fixed lobby record (or err) and records every
// game ID it was asked about.
type fakeLobby struct {
	record ports.LobbyGameRecord
	err error
	mu sync.Mutex
	calls []string
}
// GetGame records the lookup and returns the configured record or err.
func (f *fakeLobby) GetGame(_ context.Context, gameID string) (ports.LobbyGameRecord, error) {
	f.mu.Lock()
	defer f.mu.Unlock()
	f.calls = append(f.calls, gameID)
	if f.err != nil {
		return ports.LobbyGameRecord{}, f.err
	}
	return f.record, nil
}
// --- harness ----------------------------------------------------------
// harness bundles every fake dependency plus the frozen clock and the
// canned state-dir path shared by all start-service tests.
type harness struct {
	records *fakeRuntimeRecords
	operationLogs *fakeOperationLogs
	docker *mocks.MockDockerClient
	leases *fakeLeases
	healthEvents *fakeHealthEvents
	notifications *fakeNotifications
	lobby *fakeLobby
	telemetry *telemetry.Runtime
	now time.Time // value the injected Clock always returns
	stateDir string // path the injected PrepareStateDir always reports
}
// newHarness builds the default fixture: lease acquisition succeeds
// (acquired: true) and a no-op telemetry runtime is installed. Tests
// mutate the fake fields afterwards to inject faults.
func newHarness(t *testing.T) *harness {
	t.Helper()
	ctrl := gomock.NewController(t)
	t.Cleanup(ctrl.Finish)
	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)
	return &harness{
		records: newFakeRuntimeRecords(),
		operationLogs: &fakeOperationLogs{},
		docker: mocks.NewMockDockerClient(ctrl),
		leases: &fakeLeases{acquired: true},
		healthEvents: &fakeHealthEvents{},
		notifications: &fakeNotifications{},
		lobby: &fakeLobby{},
		telemetry: telemetryRuntime,
		now: time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC),
		stateDir: "/var/lib/galaxy/games/game-1",
	}
}
// build wires the harness fakes into a startruntime.Service using the
// standard fixture config (galaxy-net network, if-missing pull policy,
// one-minute lease TTL) and deterministic Clock/NewToken/PrepareStateDir
// overrides.
func (h *harness) build(t *testing.T) *startruntime.Service {
	t.Helper()
	containerCfg := config.ContainerConfig{
		DefaultCPUQuota: 1.0,
		DefaultMemory: "512m",
		DefaultPIDsLimit: 512,
		StopTimeout: 30 * time.Second,
		Retention: 30 * 24 * time.Hour,
		EngineStateMountPath: "/var/lib/galaxy-game",
		EngineStateEnvName: "GAME_STATE_PATH",
		GameStateDirMode: 0o750,
		GameStateRoot: "/var/lib/galaxy/games",
	}
	dockerCfg := config.DockerConfig{
		Host: "unix:///var/run/docker.sock",
		Network: "galaxy-net",
		LogDriver: "json-file",
		PullPolicy: config.ImagePullPolicyIfMissing,
	}
	coordinationCfg := config.CoordinationConfig{GameLeaseTTL: time.Minute}
	service, err := startruntime.NewService(startruntime.Dependencies{
		RuntimeRecords: h.records,
		OperationLogs: h.operationLogs,
		Docker: h.docker,
		Leases: h.leases,
		HealthEvents: h.healthEvents,
		Notifications: h.notifications,
		Lobby: h.lobby,
		Container: containerCfg,
		DockerCfg: dockerCfg,
		Coordination: coordinationCfg,
		Telemetry: h.telemetry,
		// Deterministic overrides: fixed clock, fixed lease token, and a
		// state-dir preparer that never touches the filesystem.
		Clock: func() time.Time { return h.now },
		NewToken: func() string { return "token-A" },
		PrepareStateDir: func(_ string) (string, error) {
			return h.stateDir, nil
		},
	})
	require.NoError(t, err)
	return service
}
// basicInput returns the canonical start request used across the
// tests: game-1, a fully-qualified image ref, and a lobby-stream
// source entry.
func basicInput() startruntime.Input {
	var input startruntime.Input
	input.GameID = "game-1"
	input.ImageRef = "registry.example.com/galaxy/game:1.4.7"
	input.OpSource = operation.OpSourceLobbyStream
	input.SourceRef = "1700000000000-0"
	return input
}
// sampleRunResult returns the canonical successful docker.Run result
// for game-1, started at the provided instant.
func sampleRunResult(now time.Time) ports.RunResult {
	var result ports.RunResult
	result.ContainerID = "ctr-123"
	result.EngineEndpoint = "http://galaxy-game-game-1:8080"
	result.StartedAt = now
	return result
}
// --- happy path -------------------------------------------------------
// TestHandleHappyPath drives a complete successful start: network
// ensured, image pulled, resource limits read from image labels
// (overriding config defaults), container run, record upserted,
// operation logged, container_started event published, and the lease
// acquired and released with the same token.
func TestHandleHappyPath(t *testing.T) {
	h := newHarness(t)
	input := basicInput()
	h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
	h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, ports.PullPolicy(config.ImagePullPolicyIfMissing)).Return(nil)
	// Image labels carry tighter limits than the config defaults; the
	// RunSpec assertions below verify the labels win.
	h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{
		Ref: input.ImageRef,
		Labels: map[string]string{
			"com.galaxy.cpu_quota": "0.5",
			"com.galaxy.memory": "256m",
			"com.galaxy.pids_limit": "256",
		},
	}, nil)
	h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).DoAndReturn(func(_ context.Context, spec ports.RunSpec) (ports.RunResult, error) {
		assert.Equal(t, "galaxy-game-game-1", spec.Name)
		assert.Equal(t, "galaxy-game-game-1", spec.Hostname)
		assert.Equal(t, input.ImageRef, spec.Image)
		assert.Equal(t, "galaxy-net", spec.Network)
		assert.Equal(t, "json-file", spec.LogDriver)
		assert.InDelta(t, 0.5, spec.CPUQuota, 0)
		assert.Equal(t, "256m", spec.Memory)
		assert.Equal(t, 256, spec.PIDsLimit)
		assert.Equal(t, h.stateDir, spec.BindMounts[0].HostPath)
		assert.Equal(t, "/var/lib/galaxy-game", spec.BindMounts[0].MountPath)
		assert.Equal(t, "/var/lib/galaxy-game", spec.Env["GAME_STATE_PATH"])
		assert.Equal(t, "/var/lib/galaxy-game", spec.Env["STORAGE_PATH"])
		assert.Equal(t, "rtmanager", spec.Labels[startruntime.LabelOwner])
		assert.Equal(t, "game-engine", spec.Labels[startruntime.LabelKind])
		assert.Equal(t, input.GameID, spec.Labels[startruntime.LabelGameID])
		assert.Equal(t, input.ImageRef, spec.Labels[startruntime.LabelEngineImageRef])
		return sampleRunResult(h.now), nil
	})
	service := h.build(t)
	result, err := service.Handle(context.Background(), input)
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
	assert.Empty(t, result.ErrorCode)
	assert.Equal(t, runtime.StatusRunning, result.Record.Status)
	assert.Equal(t, "ctr-123", result.Record.CurrentContainerID)
	assert.Equal(t, input.ImageRef, result.Record.CurrentImageRef)
	assert.Equal(t, "http://galaxy-game-game-1:8080", result.Record.EngineEndpoint)
	assert.Equal(t, h.stateDir, result.Record.StatePath)
	assert.Equal(t, "galaxy-net", result.Record.DockerNetwork)
	require.NotNil(t, result.Record.StartedAt)
	assert.Equal(t, h.now, *result.Record.StartedAt)
	assert.Equal(t, h.now, result.Record.LastOpAt)
	assert.Equal(t, h.now, result.Record.CreatedAt)
	require.Len(t, h.records.upserts, 1)
	require.Len(t, h.operationLogs.appends, 1)
	last, _ := h.operationLogs.lastAppend()
	assert.Equal(t, operation.OpKindStart, last.OpKind)
	assert.Equal(t, operation.OutcomeSuccess, last.Outcome)
	assert.Empty(t, last.ErrorCode)
	assert.Equal(t, "ctr-123", last.ContainerID)
	require.Len(t, h.healthEvents.envelopes, 1)
	assert.Equal(t, health.EventTypeContainerStarted, h.healthEvents.envelopes[0].EventType)
	// The container_started details payload must carry the image ref.
	var details map[string]string
	require.NoError(t, json.Unmarshal(h.healthEvents.envelopes[0].Details, &details))
	assert.Equal(t, input.ImageRef, details["image_ref"])
	assert.Empty(t, h.notifications.intents, "no notification intent expected on success")
	assert.Equal(t, []string{"token-A"}, h.leases.acquires)
	assert.Equal(t, []string{"token-A"}, h.leases.releases)
	assert.Equal(t, []string{input.GameID}, h.lobby.calls)
}
// --- idempotent replay ------------------------------------------------
// TestHandleReplayNoOpForRunningRecordWithSameImageRef verifies the
// idempotent replay path: a running record with the identical image_ref
// short-circuits into a success outcome carrying replay_no_op, with no
// Docker calls, no new Upsert, and the lease still released.
func TestHandleReplayNoOpForRunningRecordWithSameImageRef(t *testing.T) {
	h := newHarness(t)
	input := basicInput()
	startedAt := h.now.Add(-time.Hour)
	// Pre-seed a running record that matches the incoming image_ref.
	h.records.stored[input.GameID] = runtime.RuntimeRecord{
		GameID: input.GameID,
		Status: runtime.StatusRunning,
		CurrentContainerID: "ctr-prev",
		CurrentImageRef: input.ImageRef,
		EngineEndpoint: "http://galaxy-game-game-1:8080",
		StatePath: h.stateDir,
		DockerNetwork: "galaxy-net",
		StartedAt: &startedAt,
		LastOpAt: startedAt,
		CreatedAt: startedAt,
	}
	service := h.build(t)
	result, err := service.Handle(context.Background(), input)
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
	assert.Equal(t, startruntime.ErrorCodeReplayNoOp, result.ErrorCode)
	assert.Equal(t, "ctr-prev", result.Record.CurrentContainerID)
	assert.Empty(t, h.records.upserts, "replay must not Upsert a fresh record")
	require.Len(t, h.operationLogs.appends, 1)
	last, _ := h.operationLogs.lastAppend()
	assert.Equal(t, operation.OutcomeSuccess, last.Outcome)
	assert.Equal(t, startruntime.ErrorCodeReplayNoOp, last.ErrorCode)
	assert.Equal(t, "ctr-prev", last.ContainerID)
	assert.Empty(t, h.notifications.intents)
	assert.Equal(t, []string{"token-A"}, h.leases.releases, "lease must be released after replay no-op")
}
// --- conflicts --------------------------------------------------------
// TestHandleConflictWhenLeaseBusy verifies that a busy lease yields a
// failure outcome with the conflict error code, is logged, raises no
// admin notification, and does not release a lease that was never held.
func TestHandleConflictWhenLeaseBusy(t *testing.T) {
	h := newHarness(t)
	// TryAcquire returns false: another operation holds the game lease.
	h.leases.acquired = false
	input := basicInput()
	service := h.build(t)
	result, err := service.Handle(context.Background(), input)
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeFailure, result.Outcome)
	assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)
	require.Len(t, h.operationLogs.appends, 1)
	last, _ := h.operationLogs.lastAppend()
	assert.Equal(t, operation.OutcomeFailure, last.Outcome)
	assert.Equal(t, startruntime.ErrorCodeConflict, last.ErrorCode)
	assert.Empty(t, h.notifications.intents, "lease conflicts must not raise admin notifications")
	assert.Empty(t, h.leases.releases, "release must not run when acquire returned false")
}
// TestHandleConflictWhenRunningWithDifferentImageRef verifies that a
// running record whose image_ref differs from the request fails with
// conflict (not a replay no-op) and writes nothing new to the store.
func TestHandleConflictWhenRunningWithDifferentImageRef(t *testing.T) {
	h := newHarness(t)
	input := basicInput()
	startedAt := h.now.Add(-time.Hour)
	// Pre-seed a running record on an OLDER image than the request's.
	h.records.stored[input.GameID] = runtime.RuntimeRecord{
		GameID: input.GameID,
		Status: runtime.StatusRunning,
		CurrentContainerID: "ctr-prev",
		CurrentImageRef: "registry.example.com/galaxy/game:1.4.6",
		EngineEndpoint: "http://galaxy-game-game-1:8080",
		StatePath: h.stateDir,
		DockerNetwork: "galaxy-net",
		StartedAt: &startedAt,
		LastOpAt: startedAt,
		CreatedAt: startedAt,
	}
	service := h.build(t)
	result, err := service.Handle(context.Background(), input)
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeFailure, result.Outcome)
	assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)
	last, _ := h.operationLogs.lastAppend()
	assert.Equal(t, startruntime.ErrorCodeConflict, last.ErrorCode)
	assert.Empty(t, h.notifications.intents)
	assert.Empty(t, h.records.upserts)
}
// --- start_config_invalid ---------------------------------------------
// TestHandleStartConfigInvalidWhenImageRefMalformed verifies that an
// unparseable Docker reference fails with start_config_invalid before
// any Docker call (no mock expectations are set) and raises exactly
// one RuntimeStartConfigInvalid admin notification.
func TestHandleStartConfigInvalidWhenImageRefMalformed(t *testing.T) {
	h := newHarness(t)
	input := basicInput()
	input.ImageRef = "::not a docker reference::"
	service := h.build(t)
	result, err := service.Handle(context.Background(), input)
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeFailure, result.Outcome)
	assert.Equal(t, startruntime.ErrorCodeStartConfigInvalid, result.ErrorCode)
	require.Len(t, h.notifications.intents, 1)
	assert.Equal(t, notificationintent.NotificationTypeRuntimeStartConfigInvalid, h.notifications.intents[0].NotificationType)
	last, _ := h.operationLogs.lastAppend()
	assert.Equal(t, operation.OutcomeFailure, last.Outcome)
}
// TestHandleStartConfigInvalidWhenNetworkMissing verifies that a
// missing Docker network maps to start_config_invalid and raises the
// RuntimeStartConfigInvalid admin notification.
func TestHandleStartConfigInvalidWhenNetworkMissing(t *testing.T) {
	h := newHarness(t)
	input := basicInput()
	h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(ports.ErrNetworkMissing)
	service := h.build(t)
	result, err := service.Handle(context.Background(), input)
	require.NoError(t, err)
	assert.Equal(t, startruntime.ErrorCodeStartConfigInvalid, result.ErrorCode)
	require.Len(t, h.notifications.intents, 1)
	assert.Equal(t, notificationintent.NotificationTypeRuntimeStartConfigInvalid, h.notifications.intents[0].NotificationType)
}
// TestHandleStartConfigInvalidWhenStateDirFails verifies that a
// failing state-dir preparer (after network/pull/inspect already
// succeeded) maps to start_config_invalid with the matching admin
// notification. The service is built by hand here because the harness
// default PrepareStateDir never fails.
func TestHandleStartConfigInvalidWhenStateDirFails(t *testing.T) {
	h := newHarness(t)
	input := basicInput()
	h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
	h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil)
	h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil)
	service, err := startruntime.NewService(startruntime.Dependencies{
		RuntimeRecords: h.records,
		OperationLogs: h.operationLogs,
		Docker: h.docker,
		Leases: h.leases,
		HealthEvents: h.healthEvents,
		Notifications: h.notifications,
		Lobby: h.lobby,
		Container: config.ContainerConfig{
			DefaultCPUQuota: 1.0,
			DefaultMemory: "512m",
			DefaultPIDsLimit: 512,
			StopTimeout: 30 * time.Second,
			Retention: 30 * 24 * time.Hour,
			EngineStateMountPath: "/var/lib/galaxy-game",
			EngineStateEnvName: "GAME_STATE_PATH",
			GameStateDirMode: 0o750,
			GameStateRoot: "/var/lib/galaxy/games",
		},
		DockerCfg: config.DockerConfig{
			Host: "unix:///var/run/docker.sock",
			Network: "galaxy-net",
			LogDriver: "json-file",
			PullPolicy: config.ImagePullPolicyIfMissing,
		},
		Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
		Telemetry: h.telemetry,
		Clock: func() time.Time { return h.now },
		NewToken: func() string { return "token-A" },
		// The fault under test: state-dir preparation fails.
		PrepareStateDir: func(_ string) (string, error) {
			return "", errors.New("disk full")
		},
	})
	require.NoError(t, err)
	result, err := service.Handle(context.Background(), input)
	require.NoError(t, err)
	assert.Equal(t, startruntime.ErrorCodeStartConfigInvalid, result.ErrorCode)
	require.Len(t, h.notifications.intents, 1)
	assert.Equal(t, notificationintent.NotificationTypeRuntimeStartConfigInvalid, h.notifications.intents[0].NotificationType)
}
// --- image_pull_failed ------------------------------------------------
// TestHandleImagePullFailed verifies that a pull failure maps to
// image_pull_failed, raises the RuntimeImagePullFailed admin
// notification, and writes no runtime record.
func TestHandleImagePullFailed(t *testing.T) {
	h := newHarness(t)
	input := basicInput()
	h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
	h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(errors.New("manifest unknown"))
	service := h.build(t)
	result, err := service.Handle(context.Background(), input)
	require.NoError(t, err)
	assert.Equal(t, startruntime.ErrorCodeImagePullFailed, result.ErrorCode)
	require.Len(t, h.notifications.intents, 1)
	assert.Equal(t, notificationintent.NotificationTypeRuntimeImagePullFailed, h.notifications.intents[0].NotificationType)
	assert.Empty(t, h.records.upserts)
}
// --- container_start_failed ------------------------------------------
// TestHandleContainerStartFailedOnRunError verifies that a failing
// docker Run maps to container_start_failed, raises the matching admin
// notification, and writes no runtime record.
func TestHandleContainerStartFailedOnRunError(t *testing.T) {
	h := newHarness(t)
	input := basicInput()
	h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
	h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil)
	h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil)
	h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(ports.RunResult{}, errors.New("container name conflict"))
	service := h.build(t)
	result, err := service.Handle(context.Background(), input)
	require.NoError(t, err)
	assert.Equal(t, startruntime.ErrorCodeContainerStartFailed, result.ErrorCode)
	require.Len(t, h.notifications.intents, 1)
	assert.Equal(t, notificationintent.NotificationTypeRuntimeContainerStartFailed, h.notifications.intents[0].NotificationType)
	assert.Empty(t, h.records.upserts)
}
// TestHandleRollsBackContainerWhenUpsertFails verifies that when the
// container started but the record Upsert fails, the service removes
// the orphaned container (the Remove expectation) and reports
// container_start_failed with the matching admin notification.
func TestHandleRollsBackContainerWhenUpsertFails(t *testing.T) {
	h := newHarness(t)
	h.records.upsertErr = errors.New("connection refused")
	input := basicInput()
	h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
	h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil)
	h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil)
	h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(sampleRunResult(h.now), nil)
	// Rollback: the just-started container must be removed.
	h.docker.EXPECT().Remove(gomock.Any(), "ctr-123").Return(nil)
	service := h.build(t)
	result, err := service.Handle(context.Background(), input)
	require.NoError(t, err)
	assert.Equal(t, startruntime.ErrorCodeContainerStartFailed, result.ErrorCode)
	require.Len(t, h.notifications.intents, 1)
	assert.Equal(t, notificationintent.NotificationTypeRuntimeContainerStartFailed, h.notifications.intents[0].NotificationType)
}
// --- best-effort degradation -----------------------------------------
// TestHandleSuccessSurvivesOperationLogFailure verifies the operation
// log is best-effort: a failing Append does not turn a successful
// start into a failure, and the record is still upserted.
func TestHandleSuccessSurvivesOperationLogFailure(t *testing.T) {
	h := newHarness(t)
	h.operationLogs.appendErr = errors.New("postgres down")
	input := basicInput()
	h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
	h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil)
	h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil)
	h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(sampleRunResult(h.now), nil)
	service := h.build(t)
	result, err := service.Handle(context.Background(), input)
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
	assert.Empty(t, result.ErrorCode)
	assert.Len(t, h.records.upserts, 1)
}
// TestHandleSuccessSurvivesHealthPublishFailure verifies the health
// event stream is best-effort: a failing Publish does not turn a
// successful start into a failure.
func TestHandleSuccessSurvivesHealthPublishFailure(t *testing.T) {
	h := newHarness(t)
	h.healthEvents.publishErr = errors.New("redis down")
	input := basicInput()
	h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
	h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil)
	h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil)
	h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(sampleRunResult(h.now), nil)
	service := h.build(t)
	result, err := service.Handle(context.Background(), input)
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
	assert.Len(t, h.records.upserts, 1)
}
// --- pre-existing stopped record proceeds with fresh start ----------
// TestHandlePreservesCreatedAtForExistingRecord verifies a stopped
// record does not block a fresh start and that the re-start keeps the
// original created_at while updating status and image ref.
func TestHandlePreservesCreatedAtForExistingRecord(t *testing.T) {
	h := newHarness(t)
	input := basicInput()
	originalCreatedAt := h.now.Add(-72 * time.Hour)
	stoppedAt := h.now.Add(-time.Hour)
	// Pre-seed a stopped record on an older image.
	h.records.stored[input.GameID] = runtime.RuntimeRecord{
		GameID: input.GameID,
		Status: runtime.StatusStopped,
		CurrentImageRef: "registry.example.com/galaxy/game:1.4.6",
		EngineEndpoint: "http://galaxy-game-game-1:8080",
		StatePath: h.stateDir,
		DockerNetwork: "galaxy-net",
		StoppedAt: &stoppedAt,
		LastOpAt: stoppedAt,
		CreatedAt: originalCreatedAt,
	}
	h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
	h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil)
	h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil)
	h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(sampleRunResult(h.now), nil)
	service := h.build(t)
	result, err := service.Handle(context.Background(), input)
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
	assert.Equal(t, originalCreatedAt, result.Record.CreatedAt, "created_at must be preserved across re-starts")
	assert.Equal(t, runtime.StatusRunning, result.Record.Status)
	assert.Equal(t, input.ImageRef, result.Record.CurrentImageRef)
}
// --- input validation -----------------------------------------------
// TestHandleRejectsInvalidInput verifies that a missing game ID, a
// missing image ref, or an unknown op source each fail validation with
// start_config_invalid.
func TestHandleRejectsInvalidInput(t *testing.T) {
	h := newHarness(t)
	service := h.build(t)
	invalid := []startruntime.Input{
		{GameID: "", ImageRef: "x", OpSource: operation.OpSourceLobbyStream},
		{GameID: "g", ImageRef: "", OpSource: operation.OpSourceLobbyStream},
		{GameID: "g", ImageRef: "x", OpSource: operation.OpSource("bogus")},
	}
	for _, tc := range invalid {
		result, err := service.Handle(context.Background(), tc)
		require.NoError(t, err)
		assert.Equal(t, startruntime.ErrorCodeStartConfigInvalid, result.ErrorCode)
	}
}
// TestNewServiceRejectsMissingDependencies verifies that NewService
// fails fast when the port dependencies (records, docker, leases, …)
// are absent even though the config sections are fully populated.
func TestNewServiceRejectsMissingDependencies(t *testing.T) {
	h := newHarness(t)
	deps := startruntime.Dependencies{
		Container: config.ContainerConfig{
			DefaultCPUQuota: 1.0,
			DefaultMemory: "512m",
			DefaultPIDsLimit: 512,
			StopTimeout: 30 * time.Second,
			Retention: 30 * 24 * time.Hour,
			EngineStateMountPath: "/var/lib/galaxy-game",
			EngineStateEnvName: "GAME_STATE_PATH",
			GameStateDirMode: 0o750,
			GameStateRoot: "/var/lib/galaxy/games",
		},
		DockerCfg: config.DockerConfig{
			Host: "unix:///var/run/docker.sock",
			Network: "galaxy-net",
			LogDriver: "json-file",
			PullPolicy: config.ImagePullPolicyIfMissing,
		},
		Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
		Telemetry: h.telemetry,
	}
	_, err := startruntime.NewService(deps)
	require.Error(t, err)
}