feat: runtime manager

This commit is contained in:
Ilia Denisov
2026-04-28 20:39:18 +02:00
committed by GitHub
parent e0a99b346b
commit a7cee15115
289 changed files with 45660 additions and 2207 deletions
@@ -0,0 +1,612 @@
// Package stopruntime implements the `stop` lifecycle operation owned by
// Runtime Manager. The service is the single orchestrator behind both
// the asynchronous `runtime:stop_jobs` consumer and the synchronous
// `POST /api/v1/internal/runtimes/{game_id}/stop` REST handler. It is
// also the inner stop step of the restart and patch services, which
// call Run while holding the outer per-game lease.
//
// Lifecycle and failure-mode semantics follow `rtmanager/README.md
// §Lifecycles → Stop`. Design rationale is captured in
// `rtmanager/docs/services.md`.
package stopruntime
import (
"context"
"crypto/rand"
"encoding/base64"
"encoding/json"
"errors"
"fmt"
"log/slog"
"strings"
"time"
"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/domain/health"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/logging"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/startruntime"
"galaxy/rtmanager/internal/telemetry"
)
// leaseReleaseTimeout bounds the deferred lease-release call. A fresh
// background context is used so the release runs even when the request
// context was already canceled (see releaseLease).
const leaseReleaseTimeout = 5 * time.Second
// Input stores the per-call arguments for one stop operation. All
// fields except SourceRef are required; see Validate.
type Input struct {
	// GameID identifies the platform game to stop. Must be non-blank
	// after whitespace trimming.
	GameID string
	// Reason classifies the trigger of the stop. Required; checked via
	// StopReason.Validate.
	Reason StopReason
	// OpSource classifies how the request entered Runtime Manager.
	// Required: every operation_log entry carries an op_source.
	OpSource operation.OpSource
	// SourceRef stores the optional opaque per-source reference (Redis
	// Stream entry id, REST request id, admin user id). Empty when the
	// caller does not provide one. For inner calls invoked by the
	// restart and patch orchestrators it carries the outer correlation
	// id so the three operation_log entries share it.
	SourceRef string
}
// Validate reports whether input carries the structural invariants the
// service requires: a non-blank game id, a known op source, and a valid
// stop reason. The first violation found is returned; nil means the
// input is well formed.
func (input Input) Validate() error {
	switch {
	case strings.TrimSpace(input.GameID) == "":
		return fmt.Errorf("game id must not be empty")
	case !input.OpSource.IsKnown():
		return fmt.Errorf("op source %q is unsupported", input.OpSource)
	}
	return input.Reason.Validate()
}
// Result stores the deterministic outcome of one Handle / Run call.
// Business failures are reported here, not via the Go error return.
type Result struct {
	// Record carries the runtime record installed by the operation.
	// Populated on success and on idempotent replay; zero on failure.
	Record runtime.RuntimeRecord
	// Outcome reports whether the operation completed (success) or
	// produced a stable failure code.
	Outcome operation.Outcome
	// ErrorCode stores the stable error code on failure, or
	// `replay_no_op` on idempotent replay. Empty for fresh successes.
	ErrorCode string
	// ErrorMessage stores the operator-readable detail on failure.
	// Empty for successes (including replays).
	ErrorMessage string
}
// Dependencies groups the collaborators required by Service. All
// fields without a documented default are required; NewService rejects
// nil values for them.
type Dependencies struct {
	// RuntimeRecords reads and updates the durable runtime record.
	RuntimeRecords ports.RuntimeRecordStore
	// OperationLogs records the success / failure audit entry.
	OperationLogs ports.OperationLogStore
	// Docker drives the Docker daemon (container stop).
	Docker ports.DockerClient
	// Leases serialises operations against the same game id.
	Leases ports.GameLeaseStore
	// HealthEvents publishes `runtime:health_events` and upserts the
	// matching `health_snapshots` row. Used on the vanished-container
	// path to emit `container_disappeared`.
	HealthEvents ports.HealthEventPublisher
	// Container groups the per-container settings consumed at stop time
	// (the graceful stop timeout). Validated by NewService.
	Container config.ContainerConfig
	// Coordination supplies the per-game lease TTL. Validated by
	// NewService.
	Coordination config.CoordinationConfig
	// Telemetry records stop outcomes and lease latency. Required.
	Telemetry *telemetry.Runtime
	// Logger records structured service-level events. Defaults to
	// `slog.Default()` when nil.
	Logger *slog.Logger
	// Clock supplies the wall-clock used for operation timestamps.
	// Defaults to `time.Now` when nil.
	Clock func() time.Time
	// NewToken supplies a unique opaque lease token. Defaults to a
	// 32-byte random base64url string when nil. Tests may override.
	NewToken func() string
}
// Service executes the stop lifecycle operation. Construct via
// NewService; the zero value is not usable.
type Service struct {
	runtimeRecords ports.RuntimeRecordStore   // durable runtime record store
	operationLogs  ports.OperationLogStore    // audit-log sink
	docker         ports.DockerClient         // container stop driver
	leases         ports.GameLeaseStore       // per-game serialisation
	healthEvents   ports.HealthEventPublisher // vanished-container events
	stopTimeout    time.Duration              // graceful docker stop timeout
	leaseTTL       time.Duration              // per-game lease TTL
	telemetry      *telemetry.Runtime
	logger         *slog.Logger
	clock          func() time.Time // wall clock; injectable for tests
	newToken       func() string    // lease-token generator
}
// NewService constructs one Service from deps. It rejects nil required
// collaborators, validates the container and coordination configs, and
// installs defaults for the optional logger, clock, and token source.
func NewService(deps Dependencies) (*Service, error) {
	if deps.RuntimeRecords == nil {
		return nil, errors.New("new stop runtime service: nil runtime records")
	}
	if deps.OperationLogs == nil {
		return nil, errors.New("new stop runtime service: nil operation logs")
	}
	if deps.Docker == nil {
		return nil, errors.New("new stop runtime service: nil docker client")
	}
	if deps.Leases == nil {
		return nil, errors.New("new stop runtime service: nil lease store")
	}
	if deps.HealthEvents == nil {
		return nil, errors.New("new stop runtime service: nil health events publisher")
	}
	if deps.Telemetry == nil {
		return nil, errors.New("new stop runtime service: nil telemetry runtime")
	}
	if err := deps.Container.Validate(); err != nil {
		return nil, fmt.Errorf("new stop runtime service: container config: %w", err)
	}
	if err := deps.Coordination.Validate(); err != nil {
		return nil, fmt.Errorf("new stop runtime service: coordination config: %w", err)
	}

	// Optional collaborators fall back to their documented defaults.
	wallClock := deps.Clock
	if wallClock == nil {
		wallClock = time.Now
	}
	baseLogger := deps.Logger
	if baseLogger == nil {
		baseLogger = slog.Default()
	}
	tokenFn := deps.NewToken
	if tokenFn == nil {
		tokenFn = defaultTokenGenerator()
	}

	return &Service{
		runtimeRecords: deps.RuntimeRecords,
		operationLogs:  deps.OperationLogs,
		docker:         deps.Docker,
		leases:         deps.Leases,
		healthEvents:   deps.HealthEvents,
		stopTimeout:    deps.Container.StopTimeout,
		leaseTTL:       deps.Coordination.GameLeaseTTL,
		telemetry:      deps.Telemetry,
		logger:         baseLogger.With("service", "rtmanager.stopruntime"),
		clock:          wallClock,
		newToken:       tokenFn,
	}, nil
}
// Handle executes one stop operation end-to-end: validate, acquire the
// per-game lease, run the stop under it, and release the lease. The
// Go-level error return is reserved for non-business failures (nil
// context, nil receiver). Every business outcome — success, idempotent
// replay, or any of the stable failure modes — flows through Result.
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
	switch {
	case service == nil:
		return Result{}, errors.New("stop runtime: nil service")
	case ctx == nil:
		return Result{}, errors.New("stop runtime: nil context")
	}
	startedAt := service.clock().UTC()
	if err := input.Validate(); err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  startedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInvalidRequest,
			errorMessage: err.Error(),
		}), nil
	}

	// Serialise against concurrent lifecycle operations on the same game.
	// Lease-acquire latency is recorded even when the attempt errors.
	leaseToken := service.newToken()
	acquireStart := service.clock()
	acquired, err := service.leases.TryAcquire(ctx, input.GameID, leaseToken, service.leaseTTL)
	service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(acquireStart))
	switch {
	case err != nil:
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  startedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeServiceUnavailable,
			errorMessage: fmt.Sprintf("acquire game lease: %s", err.Error()),
		}), nil
	case !acquired:
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  startedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeConflict,
			errorMessage: "another lifecycle operation is in progress for this game",
		}), nil
	}
	defer service.releaseLease(ctx, input.GameID, leaseToken)
	return service.runUnderLease(ctx, input, startedAt)
}
// Run executes the stop lifecycle assuming the per-game lease is
// already held by the caller. The method is reserved for orchestrator
// services in `internal/service/` that compose stop with another
// operation under a single outer lease (restart and patch). External
// callers must use Handle, which manages the lease itself.
func (service *Service) Run(ctx context.Context, input Input) (Result, error) {
	switch {
	case service == nil:
		return Result{}, errors.New("stop runtime: nil service")
	case ctx == nil:
		return Result{}, errors.New("stop runtime: nil context")
	}
	startedAt := service.clock().UTC()
	err := input.Validate()
	if err == nil {
		return service.runUnderLease(ctx, input, startedAt)
	}
	return service.recordFailure(ctx, failureCtx{
		opStartedAt:  startedAt,
		input:        input,
		errorCode:    startruntime.ErrorCodeInvalidRequest,
		errorMessage: err.Error(),
	}), nil
}
// runUnderLease executes the post-validation, lease-protected stop
// steps shared by Handle and Run: load the record, short-circuit on
// terminal states, drive the docker stop, CAS the record to stopped,
// then emit the audit entry, telemetry, and log line.
func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) {
	// Load the durable record; absence is a stable not_found failure.
	existing, err := service.runtimeRecords.Get(ctx, input.GameID)
	if errors.Is(err, runtime.ErrNotFound) {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeNotFound,
			errorMessage: fmt.Sprintf("runtime record for game %q does not exist", input.GameID),
		}), nil
	}
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInternal,
			errorMessage: fmt.Sprintf("load runtime record: %s", err.Error()),
		}), nil
	}
	// Terminal states replay as no-ops; only running records proceed.
	switch existing.Status {
	case runtime.StatusStopped, runtime.StatusRemoved:
		return service.recordReplayNoOp(ctx, opStartedAt, input, existing), nil
	case runtime.StatusRunning:
		// proceed
	default:
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInternal,
			errorMessage: fmt.Sprintf("runtime record has unsupported status %q", existing.Status),
		}), nil
	}
	// Graceful docker stop. A container that already vanished is treated
	// as success via the dedicated handleVanished path.
	if err := service.docker.Stop(ctx, existing.CurrentContainerID, service.stopTimeout); err != nil {
		if errors.Is(err, ports.ErrContainerNotFound) {
			return service.handleVanished(ctx, input, opStartedAt, existing), nil
		}
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeServiceUnavailable,
			errorMessage: fmt.Sprintf("docker stop: %s", err.Error()),
			containerID:  existing.CurrentContainerID,
			imageRef:     existing.CurrentImageRef,
		}), nil
	}
	// CAS the record running → stopped, guarded by the observed
	// container id so a concurrent restart cannot be clobbered.
	updateNow := service.clock().UTC()
	err = service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
		GameID:              input.GameID,
		ExpectedFrom:        runtime.StatusRunning,
		ExpectedContainerID: existing.CurrentContainerID,
		To:                  runtime.StatusStopped,
		Now:                 updateNow,
	})
	if errors.Is(err, runtime.ErrConflict) {
		// CAS race: a concurrent reconciler / restart already moved the
		// record. The desired terminal state was reached by another path.
		// NOTE(review): the replayed Result.Record still shows the stale
		// pre-CAS snapshot — confirm callers tolerate this.
		return service.recordReplayNoOp(ctx, opStartedAt, input, existing), nil
	}
	if errors.Is(err, runtime.ErrNotFound) {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeNotFound,
			errorMessage: fmt.Sprintf("runtime record for game %q vanished mid-stop", input.GameID),
			containerID:  existing.CurrentContainerID,
			imageRef:     existing.CurrentImageRef,
		}), nil
	}
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInternal,
			errorMessage: fmt.Sprintf("update runtime status: %s", err.Error()),
			containerID:  existing.CurrentContainerID,
			imageRef:     existing.CurrentImageRef,
		}), nil
	}
	// Success bookkeeping: best-effort audit entry + metrics, then a
	// local copy of the record reflecting the installed state.
	finishedAt := service.clock().UTC()
	service.bestEffortAppend(ctx, operation.OperationEntry{
		GameID:      input.GameID,
		OpKind:      operation.OpKindStop,
		OpSource:    input.OpSource,
		SourceRef:   input.SourceRef,
		ImageRef:    existing.CurrentImageRef,
		ContainerID: existing.CurrentContainerID,
		Outcome:     operation.OutcomeSuccess,
		StartedAt:   opStartedAt,
		FinishedAt:  &finishedAt,
	})
	service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeSuccess), string(input.Reason), string(input.OpSource))
	record := existing
	record.Status = runtime.StatusStopped
	stoppedAt := updateNow
	record.StoppedAt = &stoppedAt
	record.LastOpAt = updateNow
	logArgs := []any{
		"game_id", input.GameID,
		"container_id", existing.CurrentContainerID,
		"reason", string(input.Reason),
		"op_source", string(input.OpSource),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	service.logger.InfoContext(ctx, "runtime stopped", logArgs...)
	return Result{
		Record:  record,
		Outcome: operation.OutcomeSuccess,
	}, nil
}
// handleVanished records the success outcome for the case where docker
// stop reports the container as already gone. It updates the record to
// removed, publishes container_disappeared, and returns success. The
// per-game lease is still held by the caller.
func (service *Service) handleVanished(ctx context.Context, input Input, opStartedAt time.Time, existing runtime.RuntimeRecord) Result {
	// CAS running → removed; a conflict means another path already
	// moved the record, which replays as a no-op.
	updateNow := service.clock().UTC()
	err := service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
		GameID:              input.GameID,
		ExpectedFrom:        runtime.StatusRunning,
		ExpectedContainerID: existing.CurrentContainerID,
		To:                  runtime.StatusRemoved,
		Now:                 updateNow,
	})
	if errors.Is(err, runtime.ErrConflict) {
		return service.recordReplayNoOp(ctx, opStartedAt, input, existing)
	}
	// ErrNotFound falls through: the record is gone, which matches the
	// removed terminal state this path is installing anyway.
	if err != nil && !errors.Is(err, runtime.ErrNotFound) {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInternal,
			errorMessage: fmt.Sprintf("update runtime status to removed: %s", err.Error()),
			containerID:  existing.CurrentContainerID,
			imageRef:     existing.CurrentImageRef,
		})
	}
	// Best-effort health event so observers learn the container vanished.
	service.bestEffortPublishHealth(ctx, ports.HealthEventEnvelope{
		GameID:      input.GameID,
		ContainerID: existing.CurrentContainerID,
		EventType:   health.EventTypeContainerDisappeared,
		OccurredAt:  updateNow,
		Details:     emptyHealthDetails(),
	})
	finishedAt := service.clock().UTC()
	service.bestEffortAppend(ctx, operation.OperationEntry{
		GameID:      input.GameID,
		OpKind:      operation.OpKindStop,
		OpSource:    input.OpSource,
		SourceRef:   input.SourceRef,
		ImageRef:    existing.CurrentImageRef,
		ContainerID: existing.CurrentContainerID,
		Outcome:     operation.OutcomeSuccess,
		StartedAt:   opStartedAt,
		FinishedAt:  &finishedAt,
	})
	service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeSuccess), string(input.Reason), string(input.OpSource))
	service.telemetry.RecordHealthEvent(ctx, string(health.EventTypeContainerDisappeared))
	// Local copy reflecting the removed terminal state returned to callers.
	record := existing
	record.Status = runtime.StatusRemoved
	record.CurrentContainerID = ""
	removedAt := updateNow
	record.RemovedAt = &removedAt
	record.LastOpAt = updateNow
	logArgs := []any{
		"game_id", input.GameID,
		"container_id", existing.CurrentContainerID,
		"reason", string(input.Reason),
		"op_source", string(input.OpSource),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	service.logger.InfoContext(ctx, "runtime stop on vanished container", logArgs...)
	return Result{
		Record:  record,
		Outcome: operation.OutcomeSuccess,
	}
}
// recordReplayNoOp records the idempotent replay outcome — a success
// audit entry tagged with the replay_no_op error code — and returns the
// existing record unchanged.
func (service *Service) recordReplayNoOp(ctx context.Context, opStartedAt time.Time, input Input, existing runtime.RuntimeRecord) Result {
	endedAt := service.clock().UTC()
	entry := operation.OperationEntry{
		GameID:      input.GameID,
		OpKind:      operation.OpKindStop,
		OpSource:    input.OpSource,
		SourceRef:   input.SourceRef,
		ImageRef:    existing.CurrentImageRef,
		ContainerID: existing.CurrentContainerID,
		Outcome:     operation.OutcomeSuccess,
		ErrorCode:   startruntime.ErrorCodeReplayNoOp,
		StartedAt:   opStartedAt,
		FinishedAt:  &endedAt,
	}
	service.bestEffortAppend(ctx, entry)
	service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeSuccess), string(input.Reason), string(input.OpSource))
	attrs := append([]any{
		"game_id", input.GameID,
		"container_id", existing.CurrentContainerID,
		"reason", string(input.Reason),
		"op_source", string(input.OpSource),
	}, logging.ContextAttrs(ctx)...)
	service.logger.InfoContext(ctx, "runtime stop replay no-op", attrs...)
	return Result{
		Record:    existing,
		Outcome:   operation.OutcomeSuccess,
		ErrorCode: startruntime.ErrorCodeReplayNoOp,
	}
}
// failureCtx groups the inputs to recordFailure so the runUnderLease
// method stays readable.
type failureCtx struct {
	opStartedAt  time.Time // wall-clock start of the whole operation
	input        Input     // original caller arguments
	errorCode    string    // stable error code for the audit entry
	errorMessage string    // operator-readable detail
	containerID  string    // observed container id, if any was loaded
	imageRef     string    // observed image ref, if any was loaded
}
// recordFailure records the failure operation_log entry, emits
// telemetry, and logs a warning. The runtime record stays untouched;
// the returned Result carries only the stable code and message.
func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result {
	endedAt := service.clock().UTC()
	entry := operation.OperationEntry{
		GameID:       fc.input.GameID,
		OpKind:       operation.OpKindStop,
		OpSource:     fc.input.OpSource,
		SourceRef:    fc.input.SourceRef,
		ImageRef:     fc.imageRef,
		ContainerID:  fc.containerID,
		Outcome:      operation.OutcomeFailure,
		ErrorCode:    fc.errorCode,
		ErrorMessage: fc.errorMessage,
		StartedAt:    fc.opStartedAt,
		FinishedAt:   &endedAt,
	}
	service.bestEffortAppend(ctx, entry)
	service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeFailure), string(fc.input.Reason), string(fc.input.OpSource))
	attrs := append([]any{
		"game_id", fc.input.GameID,
		"reason", string(fc.input.Reason),
		"op_source", string(fc.input.OpSource),
		"error_code", fc.errorCode,
		"error_message", fc.errorMessage,
	}, logging.ContextAttrs(ctx)...)
	service.logger.WarnContext(ctx, "runtime stop failed", attrs...)
	return Result{
		Outcome:      operation.OutcomeFailure,
		ErrorCode:    fc.errorCode,
		ErrorMessage: fc.errorMessage,
	}
}
// releaseLease releases the per-game lease in a fresh background
// context so a canceled request context does not leave the lease pinned
// for its TTL. Release failures are logged (against the original ctx
// for correlation) and otherwise ignored.
func (service *Service) releaseLease(ctx context.Context, gameID, token string) {
	cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
	defer cancel()
	err := service.leases.Release(cleanupCtx, gameID, token)
	if err == nil {
		return
	}
	service.logger.WarnContext(ctx, "release game lease",
		"game_id", gameID,
		"err", err.Error(),
	)
}
// bestEffortAppend writes one operation_log entry. A failure is logged
// and discarded; the durable runtime record (or its absence) remains
// the source of truth.
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
	_, err := service.operationLogs.Append(ctx, entry)
	if err == nil {
		return
	}
	service.logger.ErrorContext(ctx, "append operation log",
		"game_id", entry.GameID,
		"op_kind", string(entry.OpKind),
		"outcome", string(entry.Outcome),
		"error_code", entry.ErrorCode,
		"err", err.Error(),
	)
}
// bestEffortPublishHealth emits one health event + snapshot upsert.
// Failures degrade silently per `rtmanager/README.md §Notification
// Contracts`; the runtime record remains the source of truth.
func (service *Service) bestEffortPublishHealth(ctx context.Context, envelope ports.HealthEventEnvelope) {
	err := service.healthEvents.Publish(ctx, envelope)
	if err == nil {
		return
	}
	service.logger.ErrorContext(ctx, "publish health event",
		"game_id", envelope.GameID,
		"container_id", envelope.ContainerID,
		"event_type", string(envelope.EventType),
		"err", err.Error(),
	)
}
// defaultTokenGenerator returns a function that produces 32-byte
// base64url-encoded tokens. Mirrors the start service: a degraded
// entropy source falls back to a sentinel token so the next TryAcquire
// observes a collision rather than a panic.
func defaultTokenGenerator() func() string {
	const fallback = "rtmanager-fallback-token"
	return func() string {
		raw := make([]byte, 32)
		if _, err := rand.Read(raw); err != nil {
			return fallback
		}
		return base64.RawURLEncoding.EncodeToString(raw)
	}
}
// emptyHealthDetails returns the canonical empty-object payload required
// by the `container_disappeared` AsyncAPI variant.
func emptyHealthDetails() json.RawMessage {
	return json.RawMessage(`{}`)
}
@@ -0,0 +1,537 @@
package stopruntime_test
import (
"context"
"errors"
"sync"
"testing"
"time"
"galaxy/rtmanager/internal/adapters/docker/mocks"
"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/domain/health"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/startruntime"
"galaxy/rtmanager/internal/service/stopruntime"
"galaxy/rtmanager/internal/telemetry"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/mock/gomock"
)
// --- test doubles -----------------------------------------------------
// fakeRuntimeRecords is an in-memory RuntimeRecordStore double. Error
// fields, when set, override the normal behavior of the matching
// method; updates records every UpdateStatus call for assertions.
type fakeRuntimeRecords struct {
	mu              sync.Mutex
	stored          map[string]runtime.RuntimeRecord // records keyed by game id
	getErr          error                            // forced Get failure
	updateStatusErr error                            // forced UpdateStatus failure
	updates         []ports.UpdateStatusInput        // every UpdateStatus input seen
}
// newFakeRuntimeRecords returns an empty store ready for use.
func newFakeRuntimeRecords() *fakeRuntimeRecords {
	return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}}
}
// Get returns the stored record for gameID, the configured getErr when
// set, or runtime.ErrNotFound when no record exists.
func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.getErr != nil {
		return runtime.RuntimeRecord{}, s.getErr
	}
	if record, ok := s.stored[gameID]; ok {
		return record, nil
	}
	return runtime.RuntimeRecord{}, runtime.ErrNotFound
}
// Upsert is unused by the stop service; it fails loudly if called.
func (s *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error {
	return errors.New("not used in stop tests")
}
// UpdateStatus mimics the store's compare-and-swap semantics: the input
// is always recorded for assertions; a forced error short-circuits;
// otherwise the transition is applied only when the stored status (and,
// when given, container id) match the expectations.
func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, input ports.UpdateStatusInput) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.updates = append(s.updates, input)
	if s.updateStatusErr != nil {
		return s.updateStatusErr
	}
	record, ok := s.stored[input.GameID]
	if !ok {
		return runtime.ErrNotFound
	}
	// CAS guards: status and (optional) container id must match.
	if record.Status != input.ExpectedFrom {
		return runtime.ErrConflict
	}
	if input.ExpectedContainerID != "" && record.CurrentContainerID != input.ExpectedContainerID {
		return runtime.ErrConflict
	}
	record.Status = input.To
	record.LastOpAt = input.Now
	// Terminal-state bookkeeping mirrors the real store.
	switch input.To {
	case runtime.StatusStopped:
		stoppedAt := input.Now
		record.StoppedAt = &stoppedAt
	case runtime.StatusRemoved:
		removedAt := input.Now
		record.RemovedAt = &removedAt
		record.CurrentContainerID = ""
	}
	s.stored[input.GameID] = record
	return nil
}
// ListByStatus is unused by the stop service; it fails loudly if called.
func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) {
	return nil, errors.New("not used in stop tests")
}

// List is unused by the stop service; it fails loudly if called.
func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
	return nil, errors.New("not used in stop tests")
}
// fakeOperationLogs is an in-memory OperationLogStore double that
// records every appended entry; appendErr forces Append to fail.
type fakeOperationLogs struct {
	mu        sync.Mutex
	appendErr error                      // forced Append failure
	appends   []operation.OperationEntry // entries in append order
}
// Append stores entry and returns a 1-based sequence number, or the
// configured appendErr when set.
func (s *fakeOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if err := s.appendErr; err != nil {
		return 0, err
	}
	s.appends = append(s.appends, entry)
	return int64(len(s.appends)), nil
}
// ListByGame is unused by the stop service; it fails loudly if called.
func (s *fakeOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) {
	return nil, errors.New("not used in stop tests")
}
// lastAppend returns the most recently appended entry, or false when
// nothing has been appended yet.
func (s *fakeOperationLogs) lastAppend() (operation.OperationEntry, bool) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if n := len(s.appends); n > 0 {
		return s.appends[n-1], true
	}
	return operation.OperationEntry{}, false
}
// fakeLeases is a GameLeaseStore double. acquired controls TryAcquire's
// answer; the error fields force failures; acquires/releases record the
// tokens seen, in order.
type fakeLeases struct {
	acquired   bool  // result TryAcquire reports when acquireErr is nil
	acquireErr error // forced TryAcquire failure
	releaseErr error // forced Release failure
	mu         sync.Mutex
	acquires   []string // tokens passed to TryAcquire
	releases   []string // tokens passed to Release
}
// TryAcquire records the token, then reports the configured error or
// the configured acquired flag.
func (l *fakeLeases) TryAcquire(_ context.Context, _, token string, _ time.Duration) (bool, error) {
	l.mu.Lock()
	defer l.mu.Unlock()
	l.acquires = append(l.acquires, token)
	switch {
	case l.acquireErr != nil:
		return false, l.acquireErr
	default:
		return l.acquired, nil
	}
}
// Release records the token and reports the configured releaseErr
// (nil by default).
func (l *fakeLeases) Release(_ context.Context, _, token string) error {
	l.mu.Lock()
	l.releases = append(l.releases, token)
	l.mu.Unlock()
	return l.releaseErr
}
// fakeHealthEvents is a HealthEventPublisher double that captures
// published envelopes; publishErr forces Publish to fail.
type fakeHealthEvents struct {
	mu         sync.Mutex
	publishErr error                      // forced Publish failure
	envelopes  []ports.HealthEventEnvelope // envelopes in publish order
}
// Publish captures envelope, or returns the configured publishErr
// without capturing when set.
func (h *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error {
	h.mu.Lock()
	defer h.mu.Unlock()
	if err := h.publishErr; err != nil {
		return err
	}
	h.envelopes = append(h.envelopes, envelope)
	return nil
}
// --- harness ----------------------------------------------------------
// harness bundles all service collaborators (fakes + gomock docker) and
// a fixed clock value so each test can tweak state before build.
type harness struct {
	records       *fakeRuntimeRecords
	operationLogs *fakeOperationLogs
	docker        *mocks.MockDockerClient
	leases        *fakeLeases
	healthEvents  *fakeHealthEvents
	telemetry     *telemetry.Runtime
	now           time.Time // frozen wall clock injected via Clock
}
// newHarness returns a harness with fresh fakes, a gomock-backed docker
// client, a no-op telemetry runtime, a lease store that grants by
// default, and a frozen clock.
func newHarness(t *testing.T) *harness {
	t.Helper()
	ctrl := gomock.NewController(t)
	t.Cleanup(ctrl.Finish)
	// Nil providers yield a telemetry runtime that records nothing.
	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)
	return &harness{
		records:       newFakeRuntimeRecords(),
		operationLogs: &fakeOperationLogs{},
		docker:        mocks.NewMockDockerClient(ctrl),
		leases:        &fakeLeases{acquired: true},
		healthEvents:  &fakeHealthEvents{},
		telemetry:     telemetryRuntime,
		now:           time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC),
	}
}
// build constructs the Service under test from the harness state, with
// a deterministic clock and lease token ("token-A") so assertions can
// match exact values. Config values are valid but otherwise arbitrary.
func (h *harness) build(t *testing.T) *stopruntime.Service {
	t.Helper()
	containerCfg := config.ContainerConfig{
		DefaultCPUQuota:      1.0,
		DefaultMemory:        "512m",
		DefaultPIDsLimit:     512,
		StopTimeout:          30 * time.Second, // asserted in docker.Stop expectations
		Retention:            30 * 24 * time.Hour,
		EngineStateMountPath: "/var/lib/galaxy-game",
		EngineStateEnvName:   "GAME_STATE_PATH",
		GameStateDirMode:     0o750,
		GameStateRoot:        "/var/lib/galaxy/games",
	}
	coordinationCfg := config.CoordinationConfig{GameLeaseTTL: time.Minute}
	service, err := stopruntime.NewService(stopruntime.Dependencies{
		RuntimeRecords: h.records,
		OperationLogs:  h.operationLogs,
		Docker:         h.docker,
		Leases:         h.leases,
		HealthEvents:   h.healthEvents,
		Container:      containerCfg,
		Coordination:   coordinationCfg,
		Telemetry:      h.telemetry,
		Clock:          func() time.Time { return h.now },
		NewToken:       func() string { return "token-A" },
	})
	require.NoError(t, err)
	return service
}
// basicInput returns a valid stop input for game-1 as it would arrive
// from the lobby stream (SourceRef shaped like a Redis Stream entry id).
func basicInput() stopruntime.Input {
	return stopruntime.Input{
		GameID:    "game-1",
		Reason:    stopruntime.StopReasonCancelled,
		OpSource:  operation.OpSourceLobbyStream,
		SourceRef: "1700000000000-0",
	}
}
// runningRecord returns a running game-1 record started one hour before
// now, with container ctr-123 — the fixture most tests install.
func runningRecord(now time.Time) runtime.RuntimeRecord {
	startedAt := now.Add(-time.Hour)
	return runtime.RuntimeRecord{
		GameID:             "game-1",
		Status:             runtime.StatusRunning,
		CurrentContainerID: "ctr-123",
		CurrentImageRef:    "registry.example.com/galaxy/game:1.4.7",
		EngineEndpoint:     "http://galaxy-game-game-1:8080",
		StatePath:          "/var/lib/galaxy/games/game-1",
		DockerNetwork:      "galaxy-net",
		StartedAt:          &startedAt,
		LastOpAt:           startedAt,
		CreatedAt:          startedAt,
	}
}
// --- happy path -------------------------------------------------------
// TestHandleHappyPath covers the fresh-stop success path: docker stop
// succeeds, the record CAS-moves running → stopped, one success audit
// entry is written, no health event fires, and the lease token is
// acquired and released exactly once.
func TestHandleHappyPath(t *testing.T) {
	h := newHarness(t)
	h.records.stored["game-1"] = runningRecord(h.now)
	h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(nil)
	service := h.build(t)
	result, err := service.Handle(context.Background(), basicInput())
	require.NoError(t, err)
	// Result reflects the installed stopped state with frozen timestamps.
	assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
	assert.Empty(t, result.ErrorCode)
	assert.Equal(t, runtime.StatusStopped, result.Record.Status)
	require.NotNil(t, result.Record.StoppedAt)
	assert.Equal(t, h.now, *result.Record.StoppedAt)
	assert.Equal(t, h.now, result.Record.LastOpAt)
	// Exactly one CAS update with the expected guards.
	require.Len(t, h.records.updates, 1)
	assert.Equal(t, runtime.StatusRunning, h.records.updates[0].ExpectedFrom)
	assert.Equal(t, runtime.StatusStopped, h.records.updates[0].To)
	assert.Equal(t, "ctr-123", h.records.updates[0].ExpectedContainerID)
	// One success audit entry, no error code.
	require.Len(t, h.operationLogs.appends, 1)
	last, _ := h.operationLogs.lastAppend()
	assert.Equal(t, operation.OpKindStop, last.OpKind)
	assert.Equal(t, operation.OutcomeSuccess, last.Outcome)
	assert.Empty(t, last.ErrorCode)
	assert.Equal(t, "ctr-123", last.ContainerID)
	assert.Empty(t, h.healthEvents.envelopes)
	// Lease acquired and released with the deterministic token.
	assert.Equal(t, []string{"token-A"}, h.leases.acquires)
	assert.Equal(t, []string{"token-A"}, h.leases.releases)
}
// --- replay ----------------------------------------------------------
// TestHandleReplayNoOpForStoppedRecord verifies an already-stopped
// record replays as a no-op: no CAS update, one replay-tagged success
// audit entry, and the lease still released.
func TestHandleReplayNoOpForStoppedRecord(t *testing.T) {
	h := newHarness(t)
	stoppedRecord := runningRecord(h.now)
	stoppedRecord.Status = runtime.StatusStopped
	stoppedAt := h.now.Add(-time.Minute)
	stoppedRecord.StoppedAt = &stoppedAt
	h.records.stored["game-1"] = stoppedRecord
	service := h.build(t)
	result, err := service.Handle(context.Background(), basicInput())
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
	assert.Equal(t, startruntime.ErrorCodeReplayNoOp, result.ErrorCode)
	assert.Equal(t, runtime.StatusStopped, result.Record.Status)
	// Terminal state short-circuits before any record mutation.
	assert.Empty(t, h.records.updates)
	require.Len(t, h.operationLogs.appends, 1)
	last, _ := h.operationLogs.lastAppend()
	assert.Equal(t, startruntime.ErrorCodeReplayNoOp, last.ErrorCode)
	assert.Equal(t, []string{"token-A"}, h.leases.releases)
}
// TestHandleReplayNoOpForRemovedRecord verifies a removed record also
// replays as a no-op success carrying the replay_no_op code.
func TestHandleReplayNoOpForRemovedRecord(t *testing.T) {
	h := newHarness(t)
	removed := runningRecord(h.now)
	removed.Status = runtime.StatusRemoved
	removed.CurrentContainerID = ""
	removedAt := h.now.Add(-time.Minute)
	removed.RemovedAt = &removedAt
	h.records.stored["game-1"] = removed
	service := h.build(t)
	result, err := service.Handle(context.Background(), basicInput())
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
	assert.Equal(t, startruntime.ErrorCodeReplayNoOp, result.ErrorCode)
}
// --- vanished container ----------------------------------------------
// TestHandleVanishedContainerMarksRemoved covers the vanished-container
// path: docker reports ErrContainerNotFound, the record moves to
// removed, a container_disappeared health event is published, and the
// operation still counts as a clean success.
func TestHandleVanishedContainerMarksRemoved(t *testing.T) {
	h := newHarness(t)
	h.records.stored["game-1"] = runningRecord(h.now)
	h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(ports.ErrContainerNotFound)
	service := h.build(t)
	result, err := service.Handle(context.Background(), basicInput())
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
	assert.Empty(t, result.ErrorCode)
	// The returned record reflects removal, container id cleared.
	assert.Equal(t, runtime.StatusRemoved, result.Record.Status)
	assert.Empty(t, result.Record.CurrentContainerID)
	require.Len(t, h.records.updates, 1)
	assert.Equal(t, runtime.StatusRemoved, h.records.updates[0].To)
	require.Len(t, h.healthEvents.envelopes, 1)
	assert.Equal(t, health.EventTypeContainerDisappeared, h.healthEvents.envelopes[0].EventType)
	require.Len(t, h.operationLogs.appends, 1)
	last, _ := h.operationLogs.lastAppend()
	assert.Equal(t, operation.OutcomeSuccess, last.Outcome)
	assert.Empty(t, last.ErrorCode)
}
// --- failure paths ---------------------------------------------------
// TestHandleNotFoundForMissingRecord verifies a missing runtime record
// yields the stable not_found failure with no side effects.
func TestHandleNotFoundForMissingRecord(t *testing.T) {
	h := newHarness(t)
	service := h.build(t)
	result, err := service.Handle(context.Background(), basicInput())
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeFailure, result.Outcome)
	assert.Equal(t, startruntime.ErrorCodeNotFound, result.ErrorCode)
	assert.Empty(t, h.healthEvents.envelopes)
	assert.Empty(t, h.records.updates)
}
// TestHandleServiceUnavailableOnDockerError verifies a generic docker
// stop error maps to service_unavailable, records the container id in
// the audit entry, and leaves the runtime record untouched.
func TestHandleServiceUnavailableOnDockerError(t *testing.T) {
	h := newHarness(t)
	h.records.stored["game-1"] = runningRecord(h.now)
	h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(errors.New("docker daemon timeout"))
	service := h.build(t)
	result, err := service.Handle(context.Background(), basicInput())
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeFailure, result.Outcome)
	assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode)
	last, _ := h.operationLogs.lastAppend()
	assert.Equal(t, operation.OutcomeFailure, last.Outcome)
	assert.Equal(t, "ctr-123", last.ContainerID)
	assert.Empty(t, h.records.updates, "no record mutation on docker stop failure")
}
// TestHandleReplayNoOpOnUpdateStatusConflict verifies a CAS conflict
// after a successful docker stop is treated as an idempotent replay
// (another path reached the terminal state first).
func TestHandleReplayNoOpOnUpdateStatusConflict(t *testing.T) {
	h := newHarness(t)
	h.records.stored["game-1"] = runningRecord(h.now)
	h.records.updateStatusErr = runtime.ErrConflict
	h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(nil)
	service := h.build(t)
	result, err := service.Handle(context.Background(), basicInput())
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
	assert.Equal(t, startruntime.ErrorCodeReplayNoOp, result.ErrorCode)
}
// TestHandleInternalErrorOnUpdateStatusGenericError verifies that a
// non-conflict status-update failure after a successful docker stop maps
// to the internal error code with a failure outcome.
func TestHandleInternalErrorOnUpdateStatusGenericError(t *testing.T) {
	harness := newHarness(t)
	harness.records.stored["game-1"] = runningRecord(harness.now)
	harness.records.updateStatusErr = errors.New("postgres down")
	harness.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(nil)
	svc := harness.build(t)

	res, err := svc.Handle(context.Background(), basicInput())

	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeFailure, res.Outcome)
	assert.Equal(t, startruntime.ErrorCodeInternal, res.ErrorCode)
}
// --- conflicts -------------------------------------------------------
// TestHandleConflictWhenLeaseBusy verifies that a busy per-game lease maps
// to the conflict error code, and that no release is attempted for a lease
// that was never acquired.
func TestHandleConflictWhenLeaseBusy(t *testing.T) {
	harness := newHarness(t)
	harness.leases.acquired = false
	svc := harness.build(t)

	res, err := svc.Handle(context.Background(), basicInput())

	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeFailure, res.Outcome)
	assert.Equal(t, startruntime.ErrorCodeConflict, res.ErrorCode)
	assert.Empty(t, harness.leases.releases, "release must not run when acquire returned false")
}
// TestHandleServiceUnavailableOnLeaseError verifies that an infrastructure
// error while acquiring the per-game lease maps to the service-unavailable
// error code rather than a conflict.
func TestHandleServiceUnavailableOnLeaseError(t *testing.T) {
	harness := newHarness(t)
	harness.leases.acquireErr = errors.New("redis timeout")
	svc := harness.build(t)

	res, err := svc.Handle(context.Background(), basicInput())

	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeFailure, res.Outcome)
	assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, res.ErrorCode)
}
// --- input validation ------------------------------------------------
// TestHandleRejectsInvalidInput verifies that each structurally invalid
// input — missing game id, missing reason, unknown reason, unknown op
// source — is rejected with the invalid-request error code.
func TestHandleRejectsInvalidInput(t *testing.T) {
	harness := newHarness(t)
	svc := harness.build(t)

	invalidInputs := []stopruntime.Input{
		{GameID: "", Reason: stopruntime.StopReasonCancelled, OpSource: operation.OpSourceLobbyStream},
		{GameID: "g", Reason: "", OpSource: operation.OpSourceLobbyStream},
		{GameID: "g", Reason: stopruntime.StopReason("bogus"), OpSource: operation.OpSourceLobbyStream},
		{GameID: "g", Reason: stopruntime.StopReasonCancelled, OpSource: operation.OpSource("bogus")},
	}
	for _, in := range invalidInputs {
		res, err := svc.Handle(context.Background(), in)
		require.NoError(t, err)
		assert.Equal(t, startruntime.ErrorCodeInvalidRequest, res.ErrorCode)
	}
}
// --- Run path (no-lease) ---------------------------------------------
// TestRunSkipsLease verifies that the inner Run entry point never touches
// the lease store: a lease state that would block Handle does not block
// Run, and no acquire or release calls are recorded.
func TestRunSkipsLease(t *testing.T) {
	harness := newHarness(t)
	harness.records.stored["game-1"] = runningRecord(harness.now)
	harness.leases.acquired = false // would block Handle; Run must ignore
	harness.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(nil)
	svc := harness.build(t)

	res, err := svc.Run(context.Background(), basicInput())

	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeSuccess, res.Outcome)
	assert.Empty(t, harness.leases.acquires, "Run must not touch the lease store")
	assert.Empty(t, harness.leases.releases)
}
// --- best-effort degradation ----------------------------------------
// TestHandleSurvivesOperationLogFailure verifies that operation-log
// persistence is best-effort: an append error does not demote an otherwise
// successful stop.
func TestHandleSurvivesOperationLogFailure(t *testing.T) {
	harness := newHarness(t)
	harness.records.stored["game-1"] = runningRecord(harness.now)
	harness.operationLogs.appendErr = errors.New("postgres down")
	harness.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(nil)
	svc := harness.build(t)

	res, err := svc.Handle(context.Background(), basicInput())

	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeSuccess, res.Outcome)
}
// TestHandleSurvivesHealthPublishFailureOnVanished verifies the
// vanished-container path: when docker reports the container missing and
// the health-event publish also fails, the stop still succeeds and the
// record reaches the removed status.
func TestHandleSurvivesHealthPublishFailureOnVanished(t *testing.T) {
	harness := newHarness(t)
	harness.records.stored["game-1"] = runningRecord(harness.now)
	harness.healthEvents.publishErr = errors.New("redis down")
	harness.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(ports.ErrContainerNotFound)
	svc := harness.build(t)

	res, err := svc.Handle(context.Background(), basicInput())

	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeSuccess, res.Outcome)
	assert.Equal(t, runtime.StatusRemoved, res.Record.Status)
}
// --- constructor -----------------------------------------------------
// TestNewServiceRejectsMissingDependencies verifies that the constructor
// fails when collaborator ports are absent even though the config sections
// (container limits, lease TTL) and telemetry are fully populated.
func TestNewServiceRejectsMissingDependencies(t *testing.T) {
	harness := newHarness(t)
	incomplete := stopruntime.Dependencies{
		Container: config.ContainerConfig{
			DefaultCPUQuota:      1.0,
			DefaultMemory:        "512m",
			DefaultPIDsLimit:     512,
			StopTimeout:          30 * time.Second,
			Retention:            30 * 24 * time.Hour,
			EngineStateMountPath: "/var/lib/galaxy-game",
			EngineStateEnvName:   "GAME_STATE_PATH",
			GameStateDirMode:     0o750,
			GameStateRoot:        "/var/lib/galaxy/games",
		},
		Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
		Telemetry:    harness.telemetry,
	}

	_, err := stopruntime.NewService(incomplete)
	require.Error(t, err)
}
@@ -0,0 +1,82 @@
package stopruntime
import "fmt"
// StopReason classifies why a caller is asking Runtime Manager to stop a
// game container. The enum is part of the `runtime:stop_jobs` envelope
// produced by Game Lobby and the body of the `POST
// /api/v1/internal/runtimes/{game_id}/stop` REST endpoint, and mirrors
// the AsyncAPI contract frozen in
// `rtmanager/api/runtime-jobs-asyncapi.yaml`.
//
// The vocabulary is shared with `lobby/internal/ports/runtimemanager.go`;
// the two declarations stay byte-identical and adding a new value
// requires a coordinated contract bump on both sides.
//
// The zero value ("") is not part of the vocabulary: IsKnown reports
// false for it and Validate rejects it with a dedicated message.
type StopReason string

// StopReason enum values. Adding a new value is a contract change that
// touches the AsyncAPI spec, the Lobby producer, and every Runtime
// Manager consumer.
const (
	// StopReasonOrphanCleanup releases a container whose post-start
	// metadata persistence failed in Lobby.
	StopReasonOrphanCleanup StopReason = "orphan_cleanup"

	// StopReasonCancelled covers user-lifecycle cascade and explicit
	// cancel paths for in-flight games.
	StopReasonCancelled StopReason = "cancelled"

	// StopReasonFinished is reserved for engine-driven game finish flows.
	StopReasonFinished StopReason = "finished"

	// StopReasonAdminRequest is reserved for admin-initiated stop paths.
	StopReasonAdminRequest StopReason = "admin_request"

	// StopReasonTimeout is reserved for timeout-driven stop paths.
	StopReasonTimeout StopReason = "timeout"
)
// IsKnown reports whether reason belongs to the frozen stop-reason
// vocabulary. The empty string and any value outside the five enum
// constants report false.
func (reason StopReason) IsKnown() bool {
	switch reason {
	case StopReasonOrphanCleanup,
		StopReasonCancelled,
		StopReasonFinished,
		StopReasonAdminRequest,
		StopReasonTimeout:
		return true
	default:
		return false
	}
}
// AllStopReasons returns the frozen list of every stop-reason value. The
// slice order is stable across calls and matches the AsyncAPI enum order.
// Each call builds a fresh slice, so callers may mutate the result
// without affecting subsequent calls.
func AllStopReasons() []StopReason {
	return []StopReason{
		StopReasonOrphanCleanup,
		StopReasonCancelled,
		StopReasonFinished,
		StopReasonAdminRequest,
		StopReasonTimeout,
	}
}
// String returns reason as its stored enum value. Useful in log fields
// and telemetry attributes; it also makes StopReason satisfy fmt.Stringer.
func (reason StopReason) String() string {
	return string(reason)
}
// Validate reports whether reason carries one of the five values fixed
// by the AsyncAPI contract. It returns nil for a known value and a
// descriptive error otherwise.
func (reason StopReason) Validate() error {
	// Empty is checked separately from IsKnown so callers get a clearer
	// message for an omitted field than for a typo'd value.
	if reason == "" {
		return fmt.Errorf("stop reason must not be empty")
	}
	if !reason.IsKnown() {
		return fmt.Errorf("stop reason %q is unsupported", reason)
	}
	return nil
}