feat: runtime manager

This commit is contained in:
Ilia Denisov
2026-04-28 20:39:18 +02:00
committed by GitHub
parent e0a99b346b
commit a7cee15115
289 changed files with 45660 additions and 2207 deletions
@@ -0,0 +1,52 @@
package patchruntime
import (
"errors"
"fmt"
"strings"
"github.com/distribution/reference"
"golang.org/x/mod/semver"
)
// errImageRefNoTag reports that an image reference does not declare a
// tag. extractSemverTag wraps it with %w, so callers can detect it with
// errors.Is. The patch service maps it to `image_ref_not_semver` because
// a digest-only or tagless reference cannot carry a semver-comparable
// version.
var errImageRefNoTag = errors.New("image reference is missing a tag")
// extractSemverTag parses imageRef as a Docker reference and returns
// its tag in the "v"-prefixed form golang.org/x/mod/semver expects
// (for example "v1.4.7"). A tag that already starts with "v" is
// returned as is; any other tag gets the prefix prepended.
//
// It fails when imageRef cannot be parsed, when it declares no tag (the
// error wraps errImageRefNoTag), or when the tag is not valid semver.
// The error text is worded so the patch service can embed it directly
// in an `image_ref_not_semver` failure message.
func extractSemverTag(imageRef string) (string, error) {
	named, parseErr := reference.ParseNormalizedNamed(imageRef)
	if parseErr != nil {
		return "", fmt.Errorf("parse image reference %q: %w", imageRef, parseErr)
	}

	// A reference without a tag and a reference with a blank tag are
	// reported the same way.
	rawTag := ""
	if withTag, hasTag := named.(reference.NamedTagged); hasTag {
		rawTag = strings.TrimSpace(withTag.Tag())
	}
	if rawTag == "" {
		return "", fmt.Errorf("%w: %q", errImageRefNoTag, imageRef)
	}

	version := rawTag
	if !strings.HasPrefix(version, "v") {
		version = "v" + rawTag
	}
	if semver.IsValid(version) {
		return version, nil
	}
	return "", fmt.Errorf("tag %q on image reference %q is not a valid semver", rawTag, imageRef)
}
// samePatchSeries reports whether currentSemver and newSemver belong to
// the same major.minor series. Both arguments are "v"-prefixed semver
// strings as produced by extractSemverTag; only the major.minor prefix
// of each is compared, so the patch component and any pre-release or
// build suffix do not influence the result.
func samePatchSeries(currentSemver, newSemver string) bool {
	currentSeries := semver.MajorMinor(currentSemver)
	newSeries := semver.MajorMinor(newSemver)
	return currentSeries == newSeries
}
@@ -0,0 +1,483 @@
// Package patchruntime implements the `patch` lifecycle operation owned
// by Runtime Manager. Patch is restart with a new `image_ref`: under
// one outer per-game lease the service runs the stop service, removes
// the container, and runs the start service with the new image. The
// engine reads its state from the bind-mount on startup, so any data
// written before the patch survives.
//
// The new and current image references must both parse as semver tags
// and share their major and minor components. A new tag that bumps the
// major or the minor surfaces as `semver_patch_only`; a tag that is
// not parseable as semver surfaces as `image_ref_not_semver`. These
// pre-checks run before any Docker work so a rejected patch never
// disturbs the running runtime.
//
// Lifecycle and failure-mode semantics follow `rtmanager/README.md
// §Lifecycles → Patch`. Design rationale is captured in
// `rtmanager/docs/services.md`.
package patchruntime
import (
"context"
"crypto/rand"
"encoding/base64"
"errors"
"fmt"
"log/slog"
"strings"
"time"
"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/logging"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/startruntime"
"galaxy/rtmanager/internal/service/stopruntime"
"galaxy/rtmanager/internal/telemetry"
)
// leaseReleaseTimeout bounds the background context releaseLease uses
// when it gives the per-game lease back.
const leaseReleaseTimeout = 5 * time.Second
// Input stores the per-call arguments for one patch operation.
// Validate checks the structural invariants; image-reference shape and
// semver rules are enforced later, once the runtime record is loaded.
type Input struct {
// GameID identifies the platform game to patch. Must not be blank.
GameID string
// NewImageRef stores the new Docker reference the patch installs.
// Must be a valid Docker reference whose tag parses as semver and
// keeps the major.minor of the currently installed image.
NewImageRef string
// OpSource classifies how the request entered Runtime Manager. It
// must be a source for which OpSource.IsKnown reports true.
OpSource operation.OpSource
// SourceRef stores the optional opaque per-source reference. When
// non-empty it is reused as the correlation id linking the outer
// patch entry to the inner stop and start log entries. When empty,
// the service generates a correlation id for the inner entries.
SourceRef string
}
// Validate checks the structural invariants of input: GameID and
// NewImageRef must be non-blank and OpSource must be a known source.
// It returns nil when all three hold and a descriptive error for the
// first violation otherwise. The shape of the image reference and its
// semver tag are deliberately not checked here; Handle verifies them
// after the runtime record has been loaded.
func (input Input) Validate() error {
	switch {
	case strings.TrimSpace(input.GameID) == "":
		return errors.New("game id must not be empty")
	case strings.TrimSpace(input.NewImageRef) == "":
		return errors.New("new image ref must not be empty")
	case !input.OpSource.IsKnown():
		return fmt.Errorf("op source %q is unsupported", input.OpSource)
	}
	return nil
}
// Result stores the deterministic outcome of one Handle call. On
// success only Record and Outcome are set; on failure Record stays
// zero and ErrorCode / ErrorMessage describe the failure.
type Result struct {
// Record carries the runtime record installed by the inner start on
// success; zero on failure.
Record runtime.RuntimeRecord
// Outcome reports whether the operation completed (success) or
// produced a stable failure code.
Outcome operation.Outcome
// ErrorCode stores the stable error code on failure; empty on
// success.
ErrorCode string
// ErrorMessage stores the operator-readable detail on failure;
// empty on success.
ErrorMessage string
}
// Dependencies groups the collaborators required by Service.
// NewService rejects a nil RuntimeRecords, OperationLogs, Docker,
// Leases, StopService, StartService or Telemetry. Logger, Clock and
// NewToken are optional and fall back to defaults.
type Dependencies struct {
// RuntimeRecords loads the runtime record of the game being patched.
RuntimeRecords ports.RuntimeRecordStore
// OperationLogs receives the outer patch operation entries.
OperationLogs ports.OperationLogStore
// Docker removes the previous container between the inner stop and
// the inner start.
Docker ports.DockerClient
// Leases provides the per-game lease held for the whole patch.
Leases ports.GameLeaseStore
// StopService runs the inner stop step.
StopService *stopruntime.Service
// StartService runs the inner start step with the new image_ref.
StartService *startruntime.Service
// Coordination supplies the game lease TTL; it must pass Validate.
Coordination config.CoordinationConfig
// Telemetry records lease-acquire latency and patch outcomes.
Telemetry *telemetry.Runtime
// Logger is optional; slog.Default() is used when nil.
Logger *slog.Logger
// Clock is optional; time.Now is used when nil.
Clock func() time.Time
// NewToken is optional; a crypto/rand based generator is used when
// nil.
NewToken func() string
}
// Service executes the patch lifecycle operation. Build it with
// NewService, which validates the dependencies and fills in defaults.
type Service struct {
// Stores and clients copied from Dependencies.
runtimeRecords ports.RuntimeRecordStore
operationLogs ports.OperationLogStore
docker ports.DockerClient
leases ports.GameLeaseStore
// Inner lifecycle services invoked while the outer lease is held.
stopService *stopruntime.Service
startService *startruntime.Service
// leaseTTL is taken from Coordination.GameLeaseTTL.
leaseTTL time.Duration
telemetry *telemetry.Runtime
// logger already carries the "service" attribute added by NewService.
logger *slog.Logger
clock func() time.Time
// newToken produces lease tokens and, when the caller supplies no
// SourceRef, the correlation id for the inner operations.
newToken func() string
}
// NewService builds a Service from deps. It returns an error when a
// mandatory collaborator is nil or when the coordination config does
// not validate. A nil Clock defaults to time.Now, a nil Logger to
// slog.Default(), and a nil NewToken to the package's random token
// generator. The logger is tagged with the service name.
func NewService(deps Dependencies) (*Service, error) {
	// Checked in declaration order; the first missing dependency wins.
	required := []struct {
		missing bool
		problem string
	}{
		{deps.RuntimeRecords == nil, "new patch runtime service: nil runtime records"},
		{deps.OperationLogs == nil, "new patch runtime service: nil operation logs"},
		{deps.Docker == nil, "new patch runtime service: nil docker client"},
		{deps.Leases == nil, "new patch runtime service: nil lease store"},
		{deps.StopService == nil, "new patch runtime service: nil stop service"},
		{deps.StartService == nil, "new patch runtime service: nil start service"},
		{deps.Telemetry == nil, "new patch runtime service: nil telemetry runtime"},
	}
	for _, dep := range required {
		if dep.missing {
			return nil, errors.New(dep.problem)
		}
	}

	if cfgErr := deps.Coordination.Validate(); cfgErr != nil {
		return nil, fmt.Errorf("new patch runtime service: coordination config: %w", cfgErr)
	}

	svc := &Service{
		runtimeRecords: deps.RuntimeRecords,
		operationLogs:  deps.OperationLogs,
		docker:         deps.Docker,
		leases:         deps.Leases,
		stopService:    deps.StopService,
		startService:   deps.StartService,
		leaseTTL:       deps.Coordination.GameLeaseTTL,
		telemetry:      deps.Telemetry,
		logger:         deps.Logger,
		clock:          deps.Clock,
		newToken:       deps.NewToken,
	}

	// Fill in the optional collaborators.
	if svc.clock == nil {
		svc.clock = time.Now
	}
	if svc.logger == nil {
		svc.logger = slog.Default()
	}
	svc.logger = svc.logger.With("service", "rtmanager.patchruntime")
	if svc.newToken == nil {
		svc.newToken = defaultTokenGenerator()
	}
	return svc, nil
}
// Handle runs one patch operation from start to finish. It validates
// input, acquires the per-game lease, delegates to runUnderLease and
// releases the lease on the way out.
//
// A non-nil Go error is returned only for programming mistakes (nil
// receiver or nil context). Every business outcome, success as well as
// each stable failure code, is reported through Result with a nil
// error.
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
	switch {
	case service == nil:
		return Result{}, errors.New("patch runtime: nil service")
	case ctx == nil:
		return Result{}, errors.New("patch runtime: nil context")
	}

	opStartedAt := service.clock().UTC()

	// reject records a failure that happened before the lease was held.
	reject := func(code, message string) (Result, error) {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    code,
			errorMessage: message,
		}), nil
	}

	if validationErr := input.Validate(); validationErr != nil {
		return reject(startruntime.ErrorCodeInvalidRequest, validationErr.Error())
	}

	leaseToken := service.newToken()
	acquireStartedAt := service.clock()
	held, acquireErr := service.leases.TryAcquire(ctx, input.GameID, leaseToken, service.leaseTTL)
	// Latency is recorded whether or not the lease was obtained.
	service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(acquireStartedAt))

	switch {
	case acquireErr != nil:
		return reject(
			startruntime.ErrorCodeServiceUnavailable,
			fmt.Sprintf("acquire game lease: %s", acquireErr.Error()),
		)
	case !held:
		return reject(
			startruntime.ErrorCodeConflict,
			"another lifecycle operation is in progress for this game",
		)
	}
	defer service.releaseLease(ctx, input.GameID, leaseToken)

	return service.runUnderLease(ctx, input, opStartedAt)
}
// runUnderLease executes the lease-protected patch sequence: load the
// runtime record, validate semver compatibility, run inner stop,
// remove the container, run inner start with the new image.
//
// The caller must already hold the per-game lease. Every business
// failure is written to the operation log through recordFailure and
// reported inside Result; the error return is always nil.
//
// When input.SourceRef is empty a fresh token is generated and used as
// the correlation id of the inner stop and start entries. The same id
// is stamped on the outer patch entry for success and for failure, so
// the entries of one patch attempt can always be linked. Failures
// detected before the id is resolved (record lookup and semver
// pre-checks) keep the caller-supplied SourceRef, because no inner
// entry exists at that point.
func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) {
	// fail writes the outer failure entry. It reads input at call time,
	// so it observes the correlation id assigned to input.SourceRef
	// further down.
	fail := func(code, message, imageRef, containerID string) (Result, error) {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    code,
			errorMessage: message,
			imageRef:     imageRef,
			containerID:  containerID,
		}), nil
	}

	// Record state checks.
	existing, err := service.runtimeRecords.Get(ctx, input.GameID)
	switch {
	case errors.Is(err, runtime.ErrNotFound):
		return fail(
			startruntime.ErrorCodeNotFound,
			fmt.Sprintf("runtime record for game %q does not exist", input.GameID),
			"", "",
		)
	case err != nil:
		return fail(
			startruntime.ErrorCodeInternal,
			fmt.Sprintf("load runtime record: %s", err.Error()),
			"", "",
		)
	case existing.Status == runtime.StatusRemoved:
		return fail(
			startruntime.ErrorCodeConflict,
			fmt.Sprintf("runtime for game %q is removed; cannot patch", input.GameID),
			"", "",
		)
	case strings.TrimSpace(existing.CurrentImageRef) == "":
		return fail(
			startruntime.ErrorCodeInternal,
			fmt.Sprintf("runtime record for game %q has no current image_ref", input.GameID),
			"", "",
		)
	}

	// Semver pre-checks. They run before any Docker work so a rejected
	// patch never disturbs the running runtime.
	currentSemver, err := extractSemverTag(existing.CurrentImageRef)
	if err != nil {
		return fail(
			startruntime.ErrorCodeImageRefNotSemver,
			fmt.Sprintf("current image_ref: %s", err.Error()),
			existing.CurrentImageRef, "",
		)
	}
	newSemver, err := extractSemverTag(input.NewImageRef)
	if err != nil {
		return fail(
			startruntime.ErrorCodeImageRefNotSemver,
			fmt.Sprintf("new image_ref: %s", err.Error()),
			input.NewImageRef, "",
		)
	}
	if !samePatchSeries(currentSemver, newSemver) {
		return fail(
			startruntime.ErrorCodeSemverPatchOnly,
			fmt.Sprintf(
				"patch must keep major.minor; current=%s new=%s",
				currentSemver, newSemver,
			),
			input.NewImageRef, "",
		)
	}

	// Resolve the correlation id and stamp it on the local copy of
	// input, so outer failure entries recorded from here on carry the
	// same id as the inner stop / start entries.
	correlationRef := input.SourceRef
	if correlationRef == "" {
		correlationRef = service.newToken()
	}
	input.SourceRef = correlationRef

	// Captured before the inner stop so it is still known afterwards.
	containerID := existing.CurrentContainerID

	// Inner stop.
	stopResult, err := service.stopService.Run(ctx, stopruntime.Input{
		GameID:    input.GameID,
		Reason:    stopruntime.StopReasonAdminRequest,
		OpSource:  input.OpSource,
		SourceRef: correlationRef,
	})
	if err != nil {
		return fail(
			startruntime.ErrorCodeInternal,
			fmt.Sprintf("inner stop: %s", err.Error()),
			input.NewImageRef, containerID,
		)
	}
	if stopResult.Outcome == operation.OutcomeFailure {
		return fail(
			stopResult.ErrorCode,
			fmt.Sprintf("inner stop failed: %s", stopResult.ErrorMessage),
			input.NewImageRef, containerID,
		)
	}

	// Remove the previous container, if the record names one.
	if containerID != "" {
		if err := service.docker.Remove(ctx, containerID); err != nil {
			return fail(
				startruntime.ErrorCodeServiceUnavailable,
				fmt.Sprintf("docker remove: %s", err.Error()),
				input.NewImageRef, containerID,
			)
		}
	}

	// Inner start with the new image. The old container no longer
	// exists, so failures from here on carry no container id.
	startResult, err := service.startService.Run(ctx, startruntime.Input{
		GameID:    input.GameID,
		ImageRef:  input.NewImageRef,
		OpSource:  input.OpSource,
		SourceRef: correlationRef,
	})
	if err != nil {
		return fail(
			startruntime.ErrorCodeInternal,
			fmt.Sprintf("inner start: %s", err.Error()),
			input.NewImageRef, "",
		)
	}
	if startResult.Outcome == operation.OutcomeFailure {
		return fail(
			startResult.ErrorCode,
			fmt.Sprintf("inner start failed: %s", startResult.ErrorMessage),
			input.NewImageRef, "",
		)
	}

	// Outer success entry, telemetry and log line.
	finishedAt := service.clock().UTC()
	service.bestEffortAppend(ctx, operation.OperationEntry{
		GameID:      input.GameID,
		OpKind:      operation.OpKindPatch,
		OpSource:    input.OpSource,
		SourceRef:   correlationRef,
		ImageRef:    input.NewImageRef,
		ContainerID: startResult.Record.CurrentContainerID,
		Outcome:     operation.OutcomeSuccess,
		StartedAt:   opStartedAt,
		FinishedAt:  &finishedAt,
	})
	service.telemetry.RecordPatchOutcome(ctx, string(operation.OutcomeSuccess), "")

	logArgs := []any{
		"game_id", input.GameID,
		"prev_image_ref", existing.CurrentImageRef,
		"new_image_ref", input.NewImageRef,
		"prev_container_id", containerID,
		"new_container_id", startResult.Record.CurrentContainerID,
		"op_source", string(input.OpSource),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	service.logger.InfoContext(ctx, "runtime patched", logArgs...)

	return Result{
		Record:  startResult.Record,
		Outcome: operation.OutcomeSuccess,
	}, nil
}
// failureCtx groups the inputs to recordFailure.
type failureCtx struct {
// opStartedAt is the UTC instant at which Handle began the operation.
opStartedAt time.Time
// input is the request being failed; its GameID, OpSource and
// SourceRef are copied into the operation log entry.
input Input
// errorCode and errorMessage are returned in Result and logged.
errorCode string
errorMessage string
// imageRef and containerID are optional context for the log entry;
// they stay empty when the failure happened before they were known.
imageRef string
containerID string
}
// recordFailure persists the outer failure entry of a patch attempt,
// emits the failure telemetry, logs a warning and returns the Result
// handed back to the caller. The inner stop and start services write
// their own operation_log entries; the entry written here is the outer
// summary only. Appending the entry is best effort and never changes
// the returned Result.
func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result {
	finishedAt := service.clock().UTC()
	code, message := fc.errorCode, fc.errorMessage

	entry := operation.OperationEntry{
		GameID:       fc.input.GameID,
		OpKind:       operation.OpKindPatch,
		OpSource:     fc.input.OpSource,
		SourceRef:    fc.input.SourceRef,
		ImageRef:     fc.imageRef,
		ContainerID:  fc.containerID,
		Outcome:      operation.OutcomeFailure,
		ErrorCode:    code,
		ErrorMessage: message,
		StartedAt:    fc.opStartedAt,
		FinishedAt:   &finishedAt,
	}
	service.bestEffortAppend(ctx, entry)
	service.telemetry.RecordPatchOutcome(ctx, string(operation.OutcomeFailure), code)

	attrs := []any{
		"game_id", fc.input.GameID,
		"image_ref", fc.imageRef,
		"op_source", string(fc.input.OpSource),
		"error_code", code,
		"error_message", message,
	}
	attrs = append(attrs, logging.ContextAttrs(ctx)...)
	service.logger.WarnContext(ctx, "runtime patch failed", attrs...)

	var failed Result
	failed.Outcome = operation.OutcomeFailure
	failed.ErrorCode = code
	failed.ErrorMessage = message
	return failed
}
// releaseLease gives the per-game lease identified by token back. The
// release runs on a fresh background context bounded by
// leaseReleaseTimeout rather than on ctx, so it does not inherit ctx's
// cancellation. A failed release is logged as a warning and otherwise
// ignored.
func (service *Service) releaseLease(ctx context.Context, gameID, token string) {
	releaseCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
	defer cancel()

	releaseErr := service.leases.Release(releaseCtx, gameID, token)
	if releaseErr == nil {
		return
	}
	service.logger.WarnContext(ctx, "release game lease",
		"game_id", gameID,
		"err", releaseErr.Error(),
	)
}
// bestEffortAppend writes entry to the operation log. A failed append
// is logged at error level and swallowed, so it never changes the
// outcome of the patch itself.
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
	_, appendErr := service.operationLogs.Append(ctx, entry)
	if appendErr == nil {
		return
	}
	service.logger.ErrorContext(ctx, "append operation log",
		"game_id", entry.GameID,
		"op_kind", string(entry.OpKind),
		"outcome", string(entry.Outcome),
		"error_code", entry.ErrorCode,
		"err", appendErr.Error(),
	)
}
// defaultTokenGenerator returns the token source used when
// Dependencies.NewToken is nil. Each call of the returned function
// yields 32 bytes from crypto/rand encoded as unpadded URL-safe base64
// (43 characters).
//
// If the random source reports an error, the fallback token is suffixed
// with the current wall-clock time in nanoseconds instead of being a
// fixed string. Tokens identify the holder of a game lease, so a
// constant fallback would let unrelated operations present the same
// token.
func defaultTokenGenerator() func() string {
	return func() string {
		var buf [32]byte
		if _, err := rand.Read(buf[:]); err != nil {
			// Best effort only: not unpredictable, but it keeps
			// tokens from separate calls apart.
			return fmt.Sprintf("rtmanager-fallback-token-%d", time.Now().UnixNano())
		}
		return base64.RawURLEncoding.EncodeToString(buf[:])
	}
}
@@ -0,0 +1,597 @@
package patchruntime_test
import (
"context"
"errors"
"sync"
"testing"
"time"
"galaxy/notificationintent"
"galaxy/rtmanager/internal/adapters/docker/mocks"
"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/patchruntime"
"galaxy/rtmanager/internal/service/startruntime"
"galaxy/rtmanager/internal/service/stopruntime"
"galaxy/rtmanager/internal/telemetry"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/mock/gomock"
)
// --- shared fake doubles (mirror the restartruntime test pattern) ---
// fakeRuntimeRecords is an in-memory runtime record store for tests.
// All access is guarded by mu.
type fakeRuntimeRecords struct {
mu sync.Mutex
// stored maps game id to its current record.
stored map[string]runtime.RuntimeRecord
// getErr, upsertErr and updateStatusErr, when set, are returned by
// the corresponding method instead of performing the operation.
getErr error
upsertErr error
updateStatusErr error
// upserts and updates record the calls received, in order.
upserts []runtime.RuntimeRecord
updates []ports.UpdateStatusInput
}
// newFakeRuntimeRecords returns an empty store whose record map is
// ready for writes.
func newFakeRuntimeRecords() *fakeRuntimeRecords {
	store := new(fakeRuntimeRecords)
	store.stored = make(map[string]runtime.RuntimeRecord)
	return store
}
// Get returns the stored record for gameID. A configured getErr takes
// precedence; an unknown game yields runtime.ErrNotFound.
func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	var none runtime.RuntimeRecord
	if injected := s.getErr; injected != nil {
		return none, injected
	}
	if found, exists := s.stored[gameID]; exists {
		return found, nil
	}
	return none, runtime.ErrNotFound
}
// Upsert stores record under its GameID and remembers the call. When
// upsertErr is configured it is returned and nothing is stored or
// remembered.
func (s *fakeRuntimeRecords) Upsert(_ context.Context, record runtime.RuntimeRecord) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	if injected := s.upsertErr; injected != nil {
		return injected
	}
	s.stored[record.GameID] = record
	s.upserts = append(s.upserts, record)
	return nil
}
// UpdateStatus applies a compare-and-set status transition to the
// stored record. The call is always remembered, even when it fails. It
// returns updateStatusErr when configured, runtime.ErrNotFound for an
// unknown game, and runtime.ErrConflict when the stored status (or the
// container id, if one is expected) does not match. Moving to stopped
// sets StoppedAt; moving to removed sets RemovedAt and clears the
// container id.
func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, input ports.UpdateStatusInput) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	s.updates = append(s.updates, input)
	if injected := s.updateStatusErr; injected != nil {
		return injected
	}

	current, found := s.stored[input.GameID]
	if !found {
		return runtime.ErrNotFound
	}
	statusMismatch := current.Status != input.ExpectedFrom
	containerMismatch := input.ExpectedContainerID != "" &&
		current.CurrentContainerID != input.ExpectedContainerID
	if statusMismatch || containerMismatch {
		return runtime.ErrConflict
	}

	at := input.Now
	current.Status = input.To
	current.LastOpAt = at
	if input.To == runtime.StatusStopped {
		current.StoppedAt = &at
	}
	if input.To == runtime.StatusRemoved {
		current.RemovedAt = &at
		current.CurrentContainerID = ""
	}
	s.stored[input.GameID] = current
	return nil
}
// ListByStatus is not exercised by the patch tests; it always returns
// an error.
func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) {
return nil, errors.New("not used in patch tests")
}
// List is not exercised by the patch tests; it always returns an error.
func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
return nil, errors.New("not used in patch tests")
}
// fakeOperationLogs is an in-memory operation log for tests. All access
// is guarded by mu.
type fakeOperationLogs struct {
mu sync.Mutex
// appendErr, when set, makes Append fail without storing the entry.
appendErr error
// appends holds every entry accepted by Append, in call order.
appends []operation.OperationEntry
}
// Append stores entry and returns its 1-based position in the log as
// the id. When appendErr is configured it returns 0 and that error and
// stores nothing.
func (s *fakeOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	if injected := s.appendErr; injected != nil {
		return 0, injected
	}
	s.appends = append(s.appends, entry)
	id := int64(len(s.appends))
	return id, nil
}
// ListByGame is not exercised by the patch tests; it always returns an
// error.
func (s *fakeOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) {
return nil, errors.New("not used in patch tests")
}
// byKind returns the appended entries whose OpKind equals kind, in the
// order they were appended. The result is an empty, non-nil slice when
// nothing matches.
func (s *fakeOperationLogs) byKind(kind operation.OpKind) []operation.OperationEntry {
	s.mu.Lock()
	defer s.mu.Unlock()

	matching := make([]operation.OperationEntry, 0)
	for i := range s.appends {
		if s.appends[i].OpKind != kind {
			continue
		}
		matching = append(matching, s.appends[i])
	}
	return matching
}
// fakeLeases is a scripted game lease store for tests. All access is
// guarded by mu.
type fakeLeases struct {
mu sync.Mutex
// acquired is the answer TryAcquire gives when acquireErr is nil.
acquired bool
// acquireErr and releaseErr, when set, are returned by TryAcquire
// and Release respectively.
acquireErr error
releaseErr error
// acquires and releases record the tokens passed to each call.
acquires []string
releases []string
}
// TryAcquire remembers token and then answers with the scripted
// outcome: (false, acquireErr) when an error is configured, otherwise
// (acquired, nil). The game id and TTL are ignored.
func (l *fakeLeases) TryAcquire(_ context.Context, _, token string, _ time.Duration) (bool, error) {
	l.mu.Lock()
	defer l.mu.Unlock()

	l.acquires = append(l.acquires, token)
	if injected := l.acquireErr; injected != nil {
		return false, injected
	}
	granted := l.acquired
	return granted, nil
}
// Release remembers token and returns the configured releaseErr (nil by
// default). The game id is ignored.
func (l *fakeLeases) Release(_ context.Context, _, token string) error {
l.mu.Lock()
defer l.mu.Unlock()
l.releases = append(l.releases, token)
return l.releaseErr
}
// fakeHealthEvents collects published health event envelopes for tests.
type fakeHealthEvents struct {
mu sync.Mutex
// envelopes holds every envelope passed to Publish, in call order.
envelopes []ports.HealthEventEnvelope
}
// Publish stores envelope under the mutex and always succeeds.
func (h *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error {
h.mu.Lock()
defer h.mu.Unlock()
h.envelopes = append(h.envelopes, envelope)
return nil
}
// fakeNotifications collects published notification intents for tests.
type fakeNotifications struct {
mu sync.Mutex
// intents holds every intent passed to Publish, in call order.
intents []notificationintent.Intent
}
// Publish stores intent under the mutex and always succeeds.
func (n *fakeNotifications) Publish(_ context.Context, intent notificationintent.Intent) error {
n.mu.Lock()
defer n.mu.Unlock()
n.intents = append(n.intents, intent)
return nil
}
// fakeLobby is a stateless lobby stub for tests.
type fakeLobby struct{}
// GetGame returns a zero LobbyGameRecord and no error for every game.
func (l *fakeLobby) GetGame(_ context.Context, _ string) (ports.LobbyGameRecord, error) {
return ports.LobbyGameRecord{}, nil
}
// --- harness ---------------------------------------------------------
// harness bundles the fakes, the Docker mock and the real inner start
// and stop services that the patch service under test is wired to.
type harness struct {
// Shared doubles; the inner services and the patch service all use
// the same instances.
records *fakeRuntimeRecords
operationLogs *fakeOperationLogs
docker *mocks.MockDockerClient
leases *fakeLeases
healthEvents *fakeHealthEvents
notifications *fakeNotifications
lobby *fakeLobby
telemetry *telemetry.Runtime
// now is the fixed instant returned by every injected clock.
now time.Time
// stateDir is the path returned by the injected PrepareStateDir.
stateDir string
// Real inner services built by newHarness on top of the doubles.
startService *startruntime.Service
stopService *stopruntime.Service
}
// newHarness builds the shared test fixture: fresh fakes, a gomock
// Docker client whose controller is finished on test cleanup, a fixed
// clock, and real start and stop services wired to those doubles. The
// lease fake grants every acquire by default.
func newHarness(t *testing.T) *harness {
	t.Helper()

	mockCtrl := gomock.NewController(t)
	t.Cleanup(mockCtrl.Finish)

	tel, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)

	h := &harness{
		records:       newFakeRuntimeRecords(),
		operationLogs: &fakeOperationLogs{},
		docker:        mocks.NewMockDockerClient(mockCtrl),
		leases:        &fakeLeases{acquired: true},
		healthEvents:  &fakeHealthEvents{},
		notifications: &fakeNotifications{},
		lobby:         &fakeLobby{},
		telemetry:     tel,
		now:           time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC),
		stateDir:      "/var/lib/galaxy/games/game-1",
	}

	// fixedClock reads h.now on every call, so tests may move the time.
	fixedClock := func() time.Time { return h.now }

	containerCfg := config.ContainerConfig{
		DefaultCPUQuota:      1.0,
		DefaultMemory:        "512m",
		DefaultPIDsLimit:     512,
		StopTimeout:          30 * time.Second,
		Retention:            30 * 24 * time.Hour,
		EngineStateMountPath: "/var/lib/galaxy-game",
		EngineStateEnvName:   "GAME_STATE_PATH",
		GameStateDirMode:     0o750,
		GameStateRoot:        "/var/lib/galaxy/games",
	}
	dockerCfg := config.DockerConfig{
		Host:       "unix:///var/run/docker.sock",
		Network:    "galaxy-net",
		LogDriver:  "json-file",
		PullPolicy: config.ImagePullPolicyIfMissing,
	}
	coordinationCfg := config.CoordinationConfig{GameLeaseTTL: time.Minute}

	h.startService, err = startruntime.NewService(startruntime.Dependencies{
		RuntimeRecords:  h.records,
		OperationLogs:   h.operationLogs,
		Docker:          h.docker,
		Leases:          h.leases,
		HealthEvents:    h.healthEvents,
		Notifications:   h.notifications,
		Lobby:           h.lobby,
		Container:       containerCfg,
		DockerCfg:       dockerCfg,
		Coordination:    coordinationCfg,
		Telemetry:       h.telemetry,
		Clock:           fixedClock,
		NewToken:        func() string { return "inner-start-token" },
		PrepareStateDir: func(_ string) (string, error) { return h.stateDir, nil },
	})
	require.NoError(t, err)

	h.stopService, err = stopruntime.NewService(stopruntime.Dependencies{
		RuntimeRecords: h.records,
		OperationLogs:  h.operationLogs,
		Docker:         h.docker,
		Leases:         h.leases,
		HealthEvents:   h.healthEvents,
		Container:      containerCfg,
		Coordination:   coordinationCfg,
		Telemetry:      h.telemetry,
		Clock:          fixedClock,
		NewToken:       func() string { return "inner-stop-token" },
	})
	require.NoError(t, err)

	return h
}
// build constructs the patch service under test on top of the harness
// doubles. The service's token generator hands out tokens in order, one
// per call, and returns "outer-fallback" once they are used up.
func (h *harness) build(t *testing.T, tokens ...string) *patchruntime.Service {
	t.Helper()

	remaining := tokens
	nextToken := func() string {
		if len(remaining) == 0 {
			return "outer-fallback"
		}
		head := remaining[0]
		remaining = remaining[1:]
		return head
	}

	deps := patchruntime.Dependencies{
		RuntimeRecords: h.records,
		OperationLogs:  h.operationLogs,
		Docker:         h.docker,
		Leases:         h.leases,
		StopService:    h.stopService,
		StartService:   h.startService,
		Coordination:   config.CoordinationConfig{GameLeaseTTL: time.Minute},
		Telemetry:      h.telemetry,
		Clock:          func() time.Time { return h.now },
		NewToken:       nextToken,
	}
	svc, err := patchruntime.NewService(deps)
	require.NoError(t, err)
	return svc
}
// Image references shared by the tests.
const (
// currentImage is the image of the runtime returned by runningRecord.
currentImage = "registry.example.com/galaxy/game:1.4.7"
// patchImage stays within the 1.4 series, so it is an allowed patch.
patchImage = "registry.example.com/galaxy/game:1.4.8"
// majorBump changes the major version and must be rejected.
majorBump = "registry.example.com/galaxy/game:2.0.0"
// tagless carries no tag at all.
tagless = "registry.example.com/galaxy/game"
// notSemver carries a tag that is not a semantic version.
notSemver = "registry.example.com/galaxy/game:latest"
)
// runningRecord returns the record of game "game-1" running container
// "ctr-old" from currentImage. Its start, last-operation and creation
// times are all one hour before now.
func runningRecord(now time.Time) runtime.RuntimeRecord {
	anHourAgo := now.Add(-time.Hour)

	var rec runtime.RuntimeRecord
	rec.GameID = "game-1"
	rec.Status = runtime.StatusRunning
	rec.CurrentContainerID = "ctr-old"
	rec.CurrentImageRef = currentImage
	rec.EngineEndpoint = "http://galaxy-game-game-1:8080"
	rec.StatePath = "/var/lib/galaxy/games/game-1"
	rec.DockerNetwork = "galaxy-net"
	rec.StartedAt = &anHourAgo
	rec.LastOpAt = anHourAgo
	rec.CreatedAt = anHourAgo
	return rec
}
// basicInput returns a valid patch request for "game-1" that installs
// patchImage on behalf of the GM REST source with source ref
// "rest-req-99". Tests copy and tweak it.
func basicInput() patchruntime.Input {
	var in patchruntime.Input
	in.GameID = "game-1"
	in.NewImageRef = patchImage
	in.OpSource = operation.OpSourceGMRest
	in.SourceRef = "rest-req-99"
	return in
}
// sampleRunResult returns the Docker run result used for a successful
// inner start: container "ctr-new" started at now.
func sampleRunResult(now time.Time) ports.RunResult {
	var res ports.RunResult
	res.ContainerID = "ctr-new"
	res.EngineEndpoint = "http://galaxy-game-game-1:8080"
	res.StartedAt = now
	return res
}
// expectInnerStart registers the Docker calls of one successful inner
// start for image: ensure the network, pull and inspect the image, then
// run the container.
func expectInnerStart(h *harness, image string) {
	expect := h.docker.EXPECT()
	inspected := ports.ImageInspect{Ref: image}
	started := sampleRunResult(h.now)

	expect.EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
	expect.PullImage(gomock.Any(), image, gomock.Any()).Return(nil)
	expect.InspectImage(gomock.Any(), image).Return(inspected, nil)
	expect.Run(gomock.Any(), gomock.Any()).Return(started, nil)
}
// --- happy path -----------------------------------------------------
// TestHandlePatchHappyPath patches a running 1.4.7 runtime to 1.4.8.
// It expects one stop, one remove and one start, plus a single outer
// patch entry that carries the caller's source ref, the new image and
// the new container id.
func TestHandlePatchHappyPath(t *testing.T) {
	h := newHarness(t)
	h.records.stored["game-1"] = runningRecord(h.now)

	const stopTimeout = 30 * time.Second
	h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", stopTimeout).Return(nil)
	h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
	expectInnerStart(h, patchImage)

	svc := h.build(t, "outer-token")
	got, err := svc.Handle(context.Background(), basicInput())
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeSuccess, got.Outcome)
	assert.Equal(t, patchImage, got.Record.CurrentImageRef)

	// Outer entry.
	patchEntries := h.operationLogs.byKind(operation.OpKindPatch)
	require.Len(t, patchEntries, 1)
	outer := patchEntries[0]
	assert.Equal(t, "rest-req-99", outer.SourceRef)
	assert.Equal(t, patchImage, outer.ImageRef)
	assert.Equal(t, "ctr-new", outer.ContainerID)

	// Inner entries.
	assert.Len(t, h.operationLogs.byKind(operation.OpKindStop), 1)
	assert.Len(t, h.operationLogs.byKind(operation.OpKindStart), 1)
}
// TestHandlePatchSameImageProceedsAsRecreate patches to the image that
// is already installed. The container is still stopped, removed and
// started again, and a patch entry is recorded.
func TestHandlePatchSameImageProceedsAsRecreate(t *testing.T) {
	h := newHarness(t)
	h.records.stored["game-1"] = runningRecord(h.now)

	const stopTimeout = 30 * time.Second
	h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", stopTimeout).Return(nil)
	h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
	expectInnerStart(h, currentImage)

	req := basicInput()
	req.NewImageRef = currentImage

	svc := h.build(t, "outer-token")
	got, err := svc.Handle(context.Background(), req)
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeSuccess, got.Outcome)

	patchEntries := h.operationLogs.byKind(operation.OpKindPatch)
	require.Len(t, patchEntries, 1, "patch entry recorded even when image is unchanged")
}
// --- semver pre-checks ---------------------------------------------
// TestHandleImageRefNotSemverWhenNewIsTagless rejects a new image
// reference without a tag and checks that neither inner operation ran.
func TestHandleImageRefNotSemverWhenNewIsTagless(t *testing.T) {
	h := newHarness(t)
	h.records.stored["game-1"] = runningRecord(h.now)

	req := basicInput()
	req.NewImageRef = tagless

	svc := h.build(t, "outer-token")
	got, err := svc.Handle(context.Background(), req)
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeFailure, got.Outcome)
	assert.Equal(t, startruntime.ErrorCodeImageRefNotSemver, got.ErrorCode)

	stops := h.operationLogs.byKind(operation.OpKindStop)
	starts := h.operationLogs.byKind(operation.OpKindStart)
	assert.Empty(t, stops, "no inner stop on pre-check failure")
	assert.Empty(t, starts)
}
// TestHandleImageRefNotSemverWhenNewIsNonSemver rejects a new image
// reference whose tag ("latest") is not a semantic version.
func TestHandleImageRefNotSemverWhenNewIsNonSemver(t *testing.T) {
	h := newHarness(t)
	h.records.stored["game-1"] = runningRecord(h.now)

	req := basicInput()
	req.NewImageRef = notSemver

	svc := h.build(t, "outer-token")
	got, err := svc.Handle(context.Background(), req)
	require.NoError(t, err)
	assert.Equal(t, startruntime.ErrorCodeImageRefNotSemver, got.ErrorCode)
}
// TestHandleImageRefNotSemverWhenCurrentIsTagless rejects the patch
// when the image reference already stored on the runtime has no tag.
func TestHandleImageRefNotSemverWhenCurrentIsTagless(t *testing.T) {
	h := newHarness(t)

	stored := runningRecord(h.now)
	stored.CurrentImageRef = tagless
	h.records.stored["game-1"] = stored

	svc := h.build(t, "outer-token")
	got, err := svc.Handle(context.Background(), basicInput())
	require.NoError(t, err)
	assert.Equal(t, startruntime.ErrorCodeImageRefNotSemver, got.ErrorCode)
}
// TestHandleSemverPatchOnlyOnMajorBump rejects 1.4.7 -> 2.0.0 with
// semver_patch_only and checks that neither inner operation ran.
func TestHandleSemverPatchOnlyOnMajorBump(t *testing.T) {
	h := newHarness(t)
	h.records.stored["game-1"] = runningRecord(h.now)

	req := basicInput()
	req.NewImageRef = majorBump

	svc := h.build(t, "outer-token")
	got, err := svc.Handle(context.Background(), req)
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeFailure, got.Outcome)
	assert.Equal(t, startruntime.ErrorCodeSemverPatchOnly, got.ErrorCode)

	stops := h.operationLogs.byKind(operation.OpKindStop)
	starts := h.operationLogs.byKind(operation.OpKindStart)
	assert.Empty(t, stops)
	assert.Empty(t, starts)
}
// TestHandleSemverPatchOnlyOnMinorBump rejects 1.4.7 -> 1.5.0 with
// semver_patch_only.
func TestHandleSemverPatchOnlyOnMinorBump(t *testing.T) {
	h := newHarness(t)
	h.records.stored["game-1"] = runningRecord(h.now)

	req := basicInput()
	req.NewImageRef = "registry.example.com/galaxy/game:1.5.0"

	svc := h.build(t, "outer-token")
	got, err := svc.Handle(context.Background(), req)
	require.NoError(t, err)
	assert.Equal(t, startruntime.ErrorCodeSemverPatchOnly, got.ErrorCode)
}
// --- record state checks -------------------------------------------
// TestHandleNotFoundForMissingRecord patches a game that has no runtime
// record and expects the not-found error code.
func TestHandleNotFoundForMissingRecord(t *testing.T) {
	h := newHarness(t) // the record store starts empty
	svc := h.build(t, "outer-token")

	got, err := svc.Handle(context.Background(), basicInput())

	require.NoError(t, err)
	assert.Equal(t, startruntime.ErrorCodeNotFound, got.ErrorCode)
}
// TestHandleConflictForRemovedRecord patches a runtime whose record is
// already in the removed state and expects a conflict.
func TestHandleConflictForRemovedRecord(t *testing.T) {
	h := newHarness(t)

	removedAt := h.now.Add(-time.Hour)
	stored := runningRecord(h.now)
	stored.Status = runtime.StatusRemoved
	stored.CurrentContainerID = ""
	stored.RemovedAt = &removedAt
	h.records.stored["game-1"] = stored

	svc := h.build(t, "outer-token")
	got, err := svc.Handle(context.Background(), basicInput())
	require.NoError(t, err)
	assert.Equal(t, startruntime.ErrorCodeConflict, got.ErrorCode)
}
// --- failures from inner ops ---------------------------------------
// TestHandlePropagatesInnerStopFailure makes Docker Stop fail and
// expects the patch to fail with the service-unavailable code.
func TestHandlePropagatesInnerStopFailure(t *testing.T) {
	h := newHarness(t)
	h.records.stored["game-1"] = runningRecord(h.now)

	stopErr := errors.New("daemon unreachable")
	h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(stopErr)

	svc := h.build(t, "outer-token")
	got, err := svc.Handle(context.Background(), basicInput())
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeFailure, got.Outcome)
	assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, got.ErrorCode)
}
// TestHandleServiceUnavailableOnDockerRemoveFailure lets the inner stop
// succeed, makes the container removal fail and expects the
// service-unavailable code.
func TestHandleServiceUnavailableOnDockerRemoveFailure(t *testing.T) {
	h := newHarness(t)
	h.records.stored["game-1"] = runningRecord(h.now)

	removeErr := errors.New("disk i/o")
	h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
	h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(removeErr)

	svc := h.build(t, "outer-token")
	got, err := svc.Handle(context.Background(), basicInput())
	require.NoError(t, err)
	assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, got.ErrorCode)
}
// TestHandlePropagatesInnerStartFailure lets stop and remove succeed,
// makes the image pull of the inner start fail and expects the
// image-pull-failed code to surface on the patch result.
func TestHandlePropagatesInnerStartFailure(t *testing.T) {
	h := newHarness(t)
	h.records.stored["game-1"] = runningRecord(h.now)

	pullErr := errors.New("manifest unknown")
	expect := h.docker.EXPECT()
	expect.Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
	expect.Remove(gomock.Any(), "ctr-old").Return(nil)
	expect.EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
	expect.PullImage(gomock.Any(), patchImage, gomock.Any()).Return(pullErr)

	svc := h.build(t, "outer-token")
	got, err := svc.Handle(context.Background(), basicInput())
	require.NoError(t, err)
	assert.Equal(t, startruntime.ErrorCodeImagePullFailed, got.ErrorCode)
}
// --- conflicts ------------------------------------------------------
// TestHandleConflictWhenLeaseBusy makes the lease store refuse the
// acquire and expects a conflict.
func TestHandleConflictWhenLeaseBusy(t *testing.T) {
	h := newHarness(t)
	h.leases.acquired = false // somebody else holds the lease
	svc := h.build(t, "outer-token")

	got, err := svc.Handle(context.Background(), basicInput())

	require.NoError(t, err)
	assert.Equal(t, startruntime.ErrorCodeConflict, got.ErrorCode)
}
// --- input validation ----------------------------------------------
// TestHandleRejectsInvalidInput feeds Handle one request per structural
// violation (empty game id, empty image ref, unknown op source) and
// expects the invalid-request code each time.
func TestHandleRejectsInvalidInput(t *testing.T) {
	h := newHarness(t)
	svc := h.build(t, "outer-token")

	invalid := []patchruntime.Input{
		{GameID: "", NewImageRef: patchImage, OpSource: operation.OpSourceGMRest},
		{GameID: "g", NewImageRef: "", OpSource: operation.OpSourceGMRest},
		{GameID: "g", NewImageRef: patchImage, OpSource: operation.OpSource("bogus")},
	}
	for i := range invalid {
		got, err := svc.Handle(context.Background(), invalid[i])
		require.NoError(t, err)
		assert.Equal(t, startruntime.ErrorCodeInvalidRequest, got.ErrorCode)
	}
}
// --- constructor ---------------------------------------------------
// TestNewServiceRejectsMissingDependencies builds the service with only
// the coordination config and telemetry set and expects the
// constructor to fail.
func TestNewServiceRejectsMissingDependencies(t *testing.T) {
	h := newHarness(t)

	var incomplete patchruntime.Dependencies
	incomplete.Coordination = config.CoordinationConfig{GameLeaseTTL: time.Minute}
	incomplete.Telemetry = h.telemetry

	_, err := patchruntime.NewService(incomplete)
	require.Error(t, err)
}