feat: runtime manager
@@ -0,0 +1,442 @@
// Package cleanupcontainer implements the `cleanup_container` lifecycle
// operation owned by Runtime Manager. The service removes the Docker
// container of an already-stopped runtime and transitions the record
// to `removed`. It refuses to operate on a still-running runtime —
// callers must stop first.
//
// Two callers exercise this surface: the administrative
// `DELETE /api/v1/internal/runtimes/{game_id}/container` endpoint, and
// the periodic container-cleanup worker that walks
// `runtime_records.status='stopped'` rows older than
// `RTMANAGER_CONTAINER_RETENTION_DAYS`. Both paths flow through Handle.
//
// Lifecycle and failure-mode semantics follow `rtmanager/README.md
// §Lifecycles → Cleanup`. Design rationale is captured in
// `rtmanager/docs/services.md`.
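//
// A minimal invocation sketch (illustrative only; dependency wiring is
// assumed to live in the composition root, and the game id is a sample
// value):
//
//	service, err := cleanupcontainer.NewService(deps)
//	if err != nil {
//		// handle wiring error
//	}
//	result, err := service.Handle(ctx, cleanupcontainer.Input{
//		GameID:   "game-1",
//		OpSource: operation.OpSourceAdminRest,
//	})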
package cleanupcontainer

import (
	"context"
	"crypto/rand"
	"encoding/base64"
	"errors"
	"fmt"
	"log/slog"
	"strings"
	"time"

	"galaxy/rtmanager/internal/config"
	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/logging"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/startruntime"
	"galaxy/rtmanager/internal/telemetry"
)

const leaseReleaseTimeout = 5 * time.Second

// Input stores the per-call arguments for one cleanup operation.
type Input struct {
	// GameID identifies the platform game whose container is removed.
	GameID string

	// OpSource classifies how the request entered Runtime Manager.
	// Required: every operation_log entry carries an op_source.
	OpSource operation.OpSource

	// SourceRef stores the optional opaque per-source reference (REST
	// request id, admin user id). Empty for the periodic auto-TTL
	// caller.
	SourceRef string
}

// Validate reports whether input carries the structural invariants the
// service requires.
func (input Input) Validate() error {
	if strings.TrimSpace(input.GameID) == "" {
		return fmt.Errorf("game id must not be empty")
	}
	if !input.OpSource.IsKnown() {
		return fmt.Errorf("op source %q is unsupported", input.OpSource)
	}
	return nil
}

// Result stores the deterministic outcome of one Handle call.
type Result struct {
	// Record carries the updated runtime record on success and on
	// idempotent replay; zero on failure.
	Record runtime.RuntimeRecord

	// Outcome reports whether the operation completed (success) or
	// produced a stable failure code.
	Outcome operation.Outcome

	// ErrorCode stores the stable error code on failure, or
	// `replay_no_op` on idempotent replay. Empty for fresh successes.
	ErrorCode string

	// ErrorMessage stores the operator-readable detail on failure.
	ErrorMessage string
}

// Dependencies groups the collaborators required by Service.
type Dependencies struct {
	RuntimeRecords ports.RuntimeRecordStore
	OperationLogs  ports.OperationLogStore
	Docker         ports.DockerClient
	Leases         ports.GameLeaseStore

	Coordination config.CoordinationConfig

	Telemetry *telemetry.Runtime
	Logger    *slog.Logger
	Clock     func() time.Time
	NewToken  func() string
}

// Service executes the cleanup_container lifecycle operation.
type Service struct {
	runtimeRecords ports.RuntimeRecordStore
	operationLogs  ports.OperationLogStore
	docker         ports.DockerClient
	leases         ports.GameLeaseStore

	leaseTTL time.Duration

	telemetry *telemetry.Runtime
	logger    *slog.Logger

	clock    func() time.Time
	newToken func() string
}

// NewService constructs one Service from deps.
func NewService(deps Dependencies) (*Service, error) {
	switch {
	case deps.RuntimeRecords == nil:
		return nil, errors.New("new cleanup container service: nil runtime records")
	case deps.OperationLogs == nil:
		return nil, errors.New("new cleanup container service: nil operation logs")
	case deps.Docker == nil:
		return nil, errors.New("new cleanup container service: nil docker client")
	case deps.Leases == nil:
		return nil, errors.New("new cleanup container service: nil lease store")
	case deps.Telemetry == nil:
		return nil, errors.New("new cleanup container service: nil telemetry runtime")
	}
	if err := deps.Coordination.Validate(); err != nil {
		return nil, fmt.Errorf("new cleanup container service: coordination config: %w", err)
	}

	clock := deps.Clock
	if clock == nil {
		clock = time.Now
	}
	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}
	logger = logger.With("service", "rtmanager.cleanupcontainer")

	newToken := deps.NewToken
	if newToken == nil {
		newToken = defaultTokenGenerator()
	}

	return &Service{
		runtimeRecords: deps.RuntimeRecords,
		operationLogs:  deps.OperationLogs,
		docker:         deps.Docker,
		leases:         deps.Leases,
		leaseTTL:       deps.Coordination.GameLeaseTTL,
		telemetry:      deps.Telemetry,
		logger:         logger,
		clock:          clock,
		newToken:       newToken,
	}, nil
}

// Handle executes one cleanup operation end-to-end. The Go-level error
// return is reserved for non-business failures (nil context, nil
// receiver). Every business outcome — success, idempotent replay, or
// any of the stable failure modes — flows through Result.
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
	if service == nil {
		return Result{}, errors.New("cleanup container: nil service")
	}
	if ctx == nil {
		return Result{}, errors.New("cleanup container: nil context")
	}

	opStartedAt := service.clock().UTC()

	if err := input.Validate(); err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInvalidRequest,
			errorMessage: err.Error(),
		}), nil
	}

	token := service.newToken()
	leaseStart := service.clock()
	acquired, err := service.leases.TryAcquire(ctx, input.GameID, token, service.leaseTTL)
	service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(leaseStart))
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeServiceUnavailable,
			errorMessage: fmt.Sprintf("acquire game lease: %s", err.Error()),
		}), nil
	}
	if !acquired {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeConflict,
			errorMessage: "another lifecycle operation is in progress for this game",
		}), nil
	}
	defer service.releaseLease(ctx, input.GameID, token)

	return service.runUnderLease(ctx, input, opStartedAt)
}

// runUnderLease executes the lease-protected cleanup steps.
func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) {
	existing, err := service.runtimeRecords.Get(ctx, input.GameID)
	if errors.Is(err, runtime.ErrNotFound) {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeNotFound,
			errorMessage: fmt.Sprintf("runtime record for game %q does not exist", input.GameID),
		}), nil
	}
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInternal,
			errorMessage: fmt.Sprintf("load runtime record: %s", err.Error()),
		}), nil
	}

	switch existing.Status {
	case runtime.StatusRemoved:
		return service.recordReplayNoOp(ctx, opStartedAt, input, existing), nil
	case runtime.StatusRunning:
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeConflict,
			errorMessage: fmt.Sprintf("runtime for game %q is running; stop the runtime first", input.GameID),
			containerID:  existing.CurrentContainerID,
			imageRef:     existing.CurrentImageRef,
		}), nil
	case runtime.StatusStopped:
		// proceed
	default:
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInternal,
			errorMessage: fmt.Sprintf("runtime record has unsupported status %q", existing.Status),
		}), nil
	}

	if existing.CurrentContainerID != "" {
		if err := service.docker.Remove(ctx, existing.CurrentContainerID); err != nil {
			return service.recordFailure(ctx, failureCtx{
				opStartedAt:  opStartedAt,
				input:        input,
				errorCode:    startruntime.ErrorCodeServiceUnavailable,
				errorMessage: fmt.Sprintf("docker remove: %s", err.Error()),
				containerID:  existing.CurrentContainerID,
				imageRef:     existing.CurrentImageRef,
			}), nil
		}
	}

	updateNow := service.clock().UTC()
	err = service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
		GameID:              input.GameID,
		ExpectedFrom:        runtime.StatusStopped,
		ExpectedContainerID: existing.CurrentContainerID,
		To:                  runtime.StatusRemoved,
		Now:                 updateNow,
	})
	if errors.Is(err, runtime.ErrConflict) {
		// CAS race: another caller (reconciler dispose, concurrent admin)
		// already moved the record. The desired terminal state was
		// reached by another path.
		return service.recordReplayNoOp(ctx, opStartedAt, input, existing), nil
	}
	if errors.Is(err, runtime.ErrNotFound) {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeNotFound,
			errorMessage: fmt.Sprintf("runtime record for game %q vanished mid-cleanup", input.GameID),
			containerID:  existing.CurrentContainerID,
			imageRef:     existing.CurrentImageRef,
		}), nil
	}
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInternal,
			errorMessage: fmt.Sprintf("update runtime status: %s", err.Error()),
			containerID:  existing.CurrentContainerID,
			imageRef:     existing.CurrentImageRef,
		}), nil
	}

	finishedAt := service.clock().UTC()
	service.bestEffortAppend(ctx, operation.OperationEntry{
		GameID:      input.GameID,
		OpKind:      operation.OpKindCleanupContainer,
		OpSource:    input.OpSource,
		SourceRef:   input.SourceRef,
		ImageRef:    existing.CurrentImageRef,
		ContainerID: existing.CurrentContainerID,
		Outcome:     operation.OutcomeSuccess,
		StartedAt:   opStartedAt,
		FinishedAt:  &finishedAt,
	})
	service.telemetry.RecordCleanupOutcome(ctx, string(operation.OutcomeSuccess), string(input.OpSource))

	record := existing
	record.Status = runtime.StatusRemoved
	record.CurrentContainerID = ""
	removedAt := updateNow
	record.RemovedAt = &removedAt
	record.LastOpAt = updateNow

	logArgs := []any{
		"game_id", input.GameID,
		"container_id", existing.CurrentContainerID,
		"op_source", string(input.OpSource),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	service.logger.InfoContext(ctx, "runtime container cleaned up", logArgs...)

	return Result{
		Record:  record,
		Outcome: operation.OutcomeSuccess,
	}, nil
}

// recordReplayNoOp records the idempotent replay outcome and returns the
// existing record unchanged.
func (service *Service) recordReplayNoOp(ctx context.Context, opStartedAt time.Time, input Input, existing runtime.RuntimeRecord) Result {
	finishedAt := service.clock().UTC()
	service.bestEffortAppend(ctx, operation.OperationEntry{
		GameID:      input.GameID,
		OpKind:      operation.OpKindCleanupContainer,
		OpSource:    input.OpSource,
		SourceRef:   input.SourceRef,
		ImageRef:    existing.CurrentImageRef,
		ContainerID: existing.CurrentContainerID,
		Outcome:     operation.OutcomeSuccess,
		ErrorCode:   startruntime.ErrorCodeReplayNoOp,
		StartedAt:   opStartedAt,
		FinishedAt:  &finishedAt,
	})
	service.telemetry.RecordCleanupOutcome(ctx, string(operation.OutcomeSuccess), string(input.OpSource))

	logArgs := []any{
		"game_id", input.GameID,
		"container_id", existing.CurrentContainerID,
		"op_source", string(input.OpSource),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	service.logger.InfoContext(ctx, "runtime cleanup replay no-op", logArgs...)

	return Result{
		Record:    existing,
		Outcome:   operation.OutcomeSuccess,
		ErrorCode: startruntime.ErrorCodeReplayNoOp,
	}
}

// failureCtx groups the inputs to recordFailure.
type failureCtx struct {
	opStartedAt  time.Time
	input        Input
	errorCode    string
	errorMessage string
	containerID  string
	imageRef     string
}

// recordFailure appends the failure operation_log entry, emits telemetry,
// logs the failure, and returns the corresponding Result.
func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result {
	finishedAt := service.clock().UTC()
	service.bestEffortAppend(ctx, operation.OperationEntry{
		GameID:       fc.input.GameID,
		OpKind:       operation.OpKindCleanupContainer,
		OpSource:     fc.input.OpSource,
		SourceRef:    fc.input.SourceRef,
		ImageRef:     fc.imageRef,
		ContainerID:  fc.containerID,
		Outcome:      operation.OutcomeFailure,
		ErrorCode:    fc.errorCode,
		ErrorMessage: fc.errorMessage,
		StartedAt:    fc.opStartedAt,
		FinishedAt:   &finishedAt,
	})
	service.telemetry.RecordCleanupOutcome(ctx, string(operation.OutcomeFailure), string(fc.input.OpSource))

	logArgs := []any{
		"game_id", fc.input.GameID,
		"op_source", string(fc.input.OpSource),
		"error_code", fc.errorCode,
		"error_message", fc.errorMessage,
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	service.logger.WarnContext(ctx, "runtime cleanup failed", logArgs...)

	return Result{
		Outcome:      operation.OutcomeFailure,
		ErrorCode:    fc.errorCode,
		ErrorMessage: fc.errorMessage,
	}
}

// releaseLease releases the per-game lease on a fresh timeout-bound
// context so the release still runs when the caller's context is done.
func (service *Service) releaseLease(ctx context.Context, gameID, token string) {
	cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
	defer cancel()
	if err := service.leases.Release(cleanupCtx, gameID, token); err != nil {
		service.logger.WarnContext(ctx, "release game lease",
			"game_id", gameID,
			"err", err.Error(),
		)
	}
}

// bestEffortAppend appends one operation_log entry and logs (rather than
// propagates) any append failure.
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
	if _, err := service.operationLogs.Append(ctx, entry); err != nil {
		service.logger.ErrorContext(ctx, "append operation log",
			"game_id", entry.GameID,
			"op_kind", string(entry.OpKind),
			"outcome", string(entry.Outcome),
			"error_code", entry.ErrorCode,
			"err", err.Error(),
		)
	}
}

// defaultTokenGenerator returns a generator for random lease tokens,
// falling back to a fixed token if the system randomness source fails.
func defaultTokenGenerator() func() string {
	return func() string {
		var buf [32]byte
		if _, err := rand.Read(buf[:]); err != nil {
			return "rtmanager-fallback-token"
		}
		return base64.RawURLEncoding.EncodeToString(buf[:])
	}
}
@@ -0,0 +1,382 @@
package cleanupcontainer_test

import (
	"context"
	"errors"
	"sync"
	"testing"
	"time"

	"galaxy/rtmanager/internal/adapters/docker/mocks"
	"galaxy/rtmanager/internal/config"
	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/cleanupcontainer"
	"galaxy/rtmanager/internal/service/startruntime"
	"galaxy/rtmanager/internal/telemetry"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"go.uber.org/mock/gomock"
)

// --- shared fake doubles ----------------------------------------------

type fakeRuntimeRecords struct {
	mu sync.Mutex

	stored          map[string]runtime.RuntimeRecord
	getErr          error
	updateStatusErr error

	updates []ports.UpdateStatusInput
}

func newFakeRuntimeRecords() *fakeRuntimeRecords {
	return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}}
}

func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.getErr != nil {
		return runtime.RuntimeRecord{}, s.getErr
	}
	record, ok := s.stored[gameID]
	if !ok {
		return runtime.RuntimeRecord{}, runtime.ErrNotFound
	}
	return record, nil
}

func (s *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error {
	return errors.New("not used in cleanup tests")
}

func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, input ports.UpdateStatusInput) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.updates = append(s.updates, input)
	if s.updateStatusErr != nil {
		return s.updateStatusErr
	}
	record, ok := s.stored[input.GameID]
	if !ok {
		return runtime.ErrNotFound
	}
	if record.Status != input.ExpectedFrom {
		return runtime.ErrConflict
	}
	if input.ExpectedContainerID != "" && record.CurrentContainerID != input.ExpectedContainerID {
		return runtime.ErrConflict
	}
	record.Status = input.To
	record.LastOpAt = input.Now
	if input.To == runtime.StatusRemoved {
		removedAt := input.Now
		record.RemovedAt = &removedAt
		record.CurrentContainerID = ""
	}
	s.stored[input.GameID] = record
	return nil
}

func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) {
	return nil, errors.New("not used in cleanup tests")
}

func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
	return nil, errors.New("not used in cleanup tests")
}

type fakeOperationLogs struct {
	mu sync.Mutex

	appendErr error
	appends   []operation.OperationEntry
}

func (s *fakeOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.appendErr != nil {
		return 0, s.appendErr
	}
	s.appends = append(s.appends, entry)
	return int64(len(s.appends)), nil
}

func (s *fakeOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) {
	return nil, errors.New("not used in cleanup tests")
}

func (s *fakeOperationLogs) lastAppend() (operation.OperationEntry, bool) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if len(s.appends) == 0 {
		return operation.OperationEntry{}, false
	}
	return s.appends[len(s.appends)-1], true
}

type fakeLeases struct {
	mu sync.Mutex

	acquired   bool
	acquireErr error
	releaseErr error

	acquires []string
	releases []string
}

func (l *fakeLeases) TryAcquire(_ context.Context, _, token string, _ time.Duration) (bool, error) {
	l.mu.Lock()
	defer l.mu.Unlock()
	l.acquires = append(l.acquires, token)
	if l.acquireErr != nil {
		return false, l.acquireErr
	}
	return l.acquired, nil
}

func (l *fakeLeases) Release(_ context.Context, _, token string) error {
	l.mu.Lock()
	defer l.mu.Unlock()
	l.releases = append(l.releases, token)
	return l.releaseErr
}

// --- harness ----------------------------------------------------------

type harness struct {
	records       *fakeRuntimeRecords
	operationLogs *fakeOperationLogs
	docker        *mocks.MockDockerClient
	leases        *fakeLeases

	telemetry *telemetry.Runtime

	now time.Time
}

func newHarness(t *testing.T) *harness {
	t.Helper()
	ctrl := gomock.NewController(t)
	t.Cleanup(ctrl.Finish)

	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)

	return &harness{
		records:       newFakeRuntimeRecords(),
		operationLogs: &fakeOperationLogs{},
		docker:        mocks.NewMockDockerClient(ctrl),
		leases:        &fakeLeases{acquired: true},
		telemetry:     telemetryRuntime,
		now:           time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC),
	}
}

func (h *harness) build(t *testing.T) *cleanupcontainer.Service {
	t.Helper()
	service, err := cleanupcontainer.NewService(cleanupcontainer.Dependencies{
		RuntimeRecords: h.records,
		OperationLogs:  h.operationLogs,
		Docker:         h.docker,
		Leases:         h.leases,
		Coordination:   config.CoordinationConfig{GameLeaseTTL: time.Minute},
		Telemetry:      h.telemetry,
		Clock:          func() time.Time { return h.now },
		NewToken:       func() string { return "token-A" },
	})
	require.NoError(t, err)
	return service
}

func basicInput() cleanupcontainer.Input {
	return cleanupcontainer.Input{
		GameID:    "game-1",
		OpSource:  operation.OpSourceAdminRest,
		SourceRef: "rest-cleanup-1",
	}
}

func stoppedRecord(now time.Time) runtime.RuntimeRecord {
	startedAt := now.Add(-2 * time.Hour)
	stoppedAt := now.Add(-time.Hour)
	return runtime.RuntimeRecord{
		GameID:             "game-1",
		Status:             runtime.StatusStopped,
		CurrentContainerID: "ctr-old",
		CurrentImageRef:    "registry.example.com/galaxy/game:1.4.7",
		EngineEndpoint:     "http://galaxy-game-game-1:8080",
		StatePath:          "/var/lib/galaxy/games/game-1",
		DockerNetwork:      "galaxy-net",
		StartedAt:          &startedAt,
		StoppedAt:          &stoppedAt,
		LastOpAt:           stoppedAt,
		CreatedAt:          startedAt,
	}
}

// --- happy path -----------------------------------------------------

func TestHandleCleanupHappyPath(t *testing.T) {
	h := newHarness(t)
	h.records.stored["game-1"] = stoppedRecord(h.now)

	h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)

	service := h.build(t)
	result, err := service.Handle(context.Background(), basicInput())
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
	assert.Empty(t, result.ErrorCode)
	assert.Equal(t, runtime.StatusRemoved, result.Record.Status)
	assert.Empty(t, result.Record.CurrentContainerID)

	require.Len(t, h.records.updates, 1)
	assert.Equal(t, runtime.StatusStopped, h.records.updates[0].ExpectedFrom)
	assert.Equal(t, runtime.StatusRemoved, h.records.updates[0].To)

	require.Len(t, h.operationLogs.appends, 1)
	last, _ := h.operationLogs.lastAppend()
	assert.Equal(t, operation.OpKindCleanupContainer, last.OpKind)
	assert.Equal(t, operation.OutcomeSuccess, last.Outcome)
	assert.Empty(t, last.ErrorCode)
}

// --- replay ---------------------------------------------------------

func TestHandleReplayNoOpForRemovedRecord(t *testing.T) {
	h := newHarness(t)
	removed := stoppedRecord(h.now)
	removed.Status = runtime.StatusRemoved
	removed.CurrentContainerID = ""
	removedAt := h.now.Add(-30 * time.Minute)
	removed.RemovedAt = &removedAt
	h.records.stored["game-1"] = removed

	service := h.build(t)
	result, err := service.Handle(context.Background(), basicInput())
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
	assert.Equal(t, startruntime.ErrorCodeReplayNoOp, result.ErrorCode)
	assert.Empty(t, h.records.updates)

	last, _ := h.operationLogs.lastAppend()
	assert.Equal(t, startruntime.ErrorCodeReplayNoOp, last.ErrorCode)
}

func TestHandleReplayNoOpOnUpdateStatusConflict(t *testing.T) {
	h := newHarness(t)
	h.records.stored["game-1"] = stoppedRecord(h.now)
	h.records.updateStatusErr = runtime.ErrConflict

	h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)

	service := h.build(t)
	result, err := service.Handle(context.Background(), basicInput())
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
	assert.Equal(t, startruntime.ErrorCodeReplayNoOp, result.ErrorCode)
}

// --- failure paths --------------------------------------------------

func TestHandleConflictOnRunningRecord(t *testing.T) {
	h := newHarness(t)
	running := stoppedRecord(h.now)
	running.Status = runtime.StatusRunning
	startedAt := h.now.Add(-time.Hour)
	running.StartedAt = &startedAt
	running.StoppedAt = nil
	h.records.stored["game-1"] = running

	service := h.build(t)
	result, err := service.Handle(context.Background(), basicInput())
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeFailure, result.Outcome)
	assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)
	assert.Contains(t, result.ErrorMessage, "stop the runtime first")
}

func TestHandleNotFoundForMissingRecord(t *testing.T) {
	h := newHarness(t)

	service := h.build(t)
	result, err := service.Handle(context.Background(), basicInput())
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeFailure, result.Outcome)
	assert.Equal(t, startruntime.ErrorCodeNotFound, result.ErrorCode)
}

func TestHandleServiceUnavailableOnDockerRemoveFailure(t *testing.T) {
	h := newHarness(t)
	h.records.stored["game-1"] = stoppedRecord(h.now)

	h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(errors.New("disk i/o"))

	service := h.build(t)
	result, err := service.Handle(context.Background(), basicInput())
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeFailure, result.Outcome)
	assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode)
	assert.Empty(t, h.records.updates, "no record mutation on docker remove failure")
}

func TestHandleInternalErrorOnGenericUpdateError(t *testing.T) {
	h := newHarness(t)
	h.records.stored["game-1"] = stoppedRecord(h.now)
	h.records.updateStatusErr = errors.New("postgres down")

	h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)

	service := h.build(t)
	result, err := service.Handle(context.Background(), basicInput())
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeFailure, result.Outcome)
	assert.Equal(t, startruntime.ErrorCodeInternal, result.ErrorCode)
}

func TestHandleConflictWhenLeaseBusy(t *testing.T) {
	h := newHarness(t)
	h.leases.acquired = false

	service := h.build(t)
	result, err := service.Handle(context.Background(), basicInput())
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeFailure, result.Outcome)
	assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)
}

// --- input validation ----------------------------------------------

func TestHandleRejectsInvalidInput(t *testing.T) {
	h := newHarness(t)
	service := h.build(t)

	cases := []cleanupcontainer.Input{
		{GameID: "", OpSource: operation.OpSourceAdminRest},
		{GameID: "g", OpSource: operation.OpSource("bogus")},
	}
	for _, input := range cases {
		result, err := service.Handle(context.Background(), input)
		require.NoError(t, err)
		assert.Equal(t, startruntime.ErrorCodeInvalidRequest, result.ErrorCode)
	}
}

// --- constructor ---------------------------------------------------

func TestNewServiceRejectsMissingDependencies(t *testing.T) {
	h := newHarness(t)
	deps := cleanupcontainer.Dependencies{
		Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
		Telemetry:    h.telemetry,
	}
	_, err := cleanupcontainer.NewService(deps)
	require.Error(t, err)
}
@@ -0,0 +1,52 @@
package patchruntime

import (
	"errors"
	"fmt"
	"strings"

	"github.com/distribution/reference"
	"golang.org/x/mod/semver"
)

// errImageRefNoTag reports that an image reference does not declare a
// tag. The patch service maps it to `image_ref_not_semver` because a
// digest-only or tagless reference cannot carry a semver-comparable
// version.
var errImageRefNoTag = errors.New("image reference is missing a tag")

// extractSemverTag returns the canonical semver string ("v1.4.7") for
// imageRef, ready to feed into golang.org/x/mod/semver. The leading "v"
// is added when the underlying tag omits it.
//
// Errors returned by this function are pre-formatted for inclusion in
// the patch service's `image_ref_not_semver` failure message.
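//
// Illustrative behaviour (sketch; the reference value mirrors the one
// used in the tests):
//
//	extractSemverTag("registry.example.com/galaxy/game:1.4.7") // "v1.4.7", nil
//	extractSemverTag("registry.example.com/galaxy/game")       // "", wraps errImageRefNoTag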
func extractSemverTag(imageRef string) (string, error) {
	parsed, err := reference.ParseNormalizedNamed(imageRef)
	if err != nil {
		return "", fmt.Errorf("parse image reference %q: %w", imageRef, err)
	}
	tagged, ok := parsed.(reference.NamedTagged)
	if !ok {
		return "", fmt.Errorf("%w: %q", errImageRefNoTag, imageRef)
	}
	tag := strings.TrimSpace(tagged.Tag())
	if tag == "" {
		return "", fmt.Errorf("%w: %q", errImageRefNoTag, imageRef)
	}
	candidate := tag
	if !strings.HasPrefix(candidate, "v") {
		candidate = "v" + candidate
	}
	if !semver.IsValid(candidate) {
		return "", fmt.Errorf("tag %q on image reference %q is not a valid semver", tag, imageRef)
	}
	return candidate, nil
}

// samePatchSeries reports whether two canonical semver strings (with
// the leading "v") share their major and minor components. The third
// component (patch) and any pre-release / build metadata are ignored.
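//
// For example (sketch):
//
//	samePatchSeries("v1.4.7", "v1.4.9") // true:  patch-level bump
//	samePatchSeries("v1.4.7", "v1.5.0") // false: minor bump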
func samePatchSeries(currentSemver, newSemver string) bool {
	return semver.MajorMinor(currentSemver) == semver.MajorMinor(newSemver)
}
@@ -0,0 +1,483 @@
// Package patchruntime implements the `patch` lifecycle operation owned
// by Runtime Manager. Patch is restart with a new `image_ref`: under
// one outer per-game lease the service runs the stop service, removes
// the container, and runs the start service with the new image. The
// engine reads its state from the bind-mount on startup, so any data
// written before the patch survives.
//
// The new and current image references must both parse as semver tags
// and share their major and minor components. A new tag that bumps the
// major or the minor surfaces as `semver_patch_only`; a tag that is
// not parseable as semver surfaces as `image_ref_not_semver`. These
// pre-checks run before any Docker work so a rejected patch never
// disturbs the running runtime.
//
// Lifecycle and failure-mode semantics follow `rtmanager/README.md
// §Lifecycles → Patch`. Design rationale is captured in
// `rtmanager/docs/services.md`.
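//
// A minimal invocation sketch (illustrative only; dependency wiring is
// assumed to live in the composition root, and the image tag is a
// sample patch-level bump):
//
//	service, err := patchruntime.NewService(deps)
//	if err != nil {
//		// handle wiring error
//	}
//	result, err := service.Handle(ctx, patchruntime.Input{
//		GameID:      "game-1",
//		NewImageRef: "registry.example.com/galaxy/game:1.4.8",
//		OpSource:    operation.OpSourceAdminRest,
//	})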
package patchruntime

import (
	"context"
	"crypto/rand"
	"encoding/base64"
	"errors"
	"fmt"
	"log/slog"
	"strings"
	"time"

	"galaxy/rtmanager/internal/config"
	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/logging"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/startruntime"
	"galaxy/rtmanager/internal/service/stopruntime"
	"galaxy/rtmanager/internal/telemetry"
)

const leaseReleaseTimeout = 5 * time.Second

// Input stores the per-call arguments for one patch operation.
type Input struct {
	// GameID identifies the platform game to patch.
	GameID string

	// NewImageRef stores the new Docker reference the patch installs.
	// Must be a valid Docker reference whose tag parses as semver.
	NewImageRef string

	// OpSource classifies how the request entered Runtime Manager.
	OpSource operation.OpSource

	// SourceRef stores the optional opaque per-source reference. When
	// non-empty it is reused as the correlation id linking the outer
	// patch entry to the inner stop and start log entries.
	SourceRef string
}

// Validate reports whether input carries the structural invariants the
// service requires. Image-reference shape and semver checks happen
// later inside Handle so that they run after the runtime record has
// been loaded.
func (input Input) Validate() error {
	if strings.TrimSpace(input.GameID) == "" {
		return fmt.Errorf("game id must not be empty")
	}
	if strings.TrimSpace(input.NewImageRef) == "" {
		return fmt.Errorf("new image ref must not be empty")
	}
	if !input.OpSource.IsKnown() {
		return fmt.Errorf("op source %q is unsupported", input.OpSource)
	}
	return nil
}

// Result stores the deterministic outcome of one Handle call.
type Result struct {
	// Record carries the runtime record installed by the inner start on
	// success; zero on failure.
	Record runtime.RuntimeRecord

	// Outcome reports whether the operation completed (success) or
	// produced a stable failure code.
	Outcome operation.Outcome

	// ErrorCode stores the stable error code on failure.
	ErrorCode string

	// ErrorMessage stores the operator-readable detail on failure.
	ErrorMessage string
}

// Dependencies groups the collaborators required by Service.
type Dependencies struct {
	RuntimeRecords ports.RuntimeRecordStore
	OperationLogs  ports.OperationLogStore
	Docker         ports.DockerClient
	Leases         ports.GameLeaseStore

	// StopService runs the inner stop step.
	StopService *stopruntime.Service
	// StartService runs the inner start step with the new image_ref.
	StartService *startruntime.Service

	Coordination config.CoordinationConfig

	Telemetry *telemetry.Runtime
	Logger    *slog.Logger
	Clock     func() time.Time
	NewToken  func() string
}

// Service executes the patch lifecycle operation.
type Service struct {
	runtimeRecords ports.RuntimeRecordStore
	operationLogs  ports.OperationLogStore
	docker         ports.DockerClient
	leases         ports.GameLeaseStore
	stopService    *stopruntime.Service
	startService   *startruntime.Service

	leaseTTL time.Duration

	telemetry *telemetry.Runtime
	logger    *slog.Logger

	clock    func() time.Time
	newToken func() string
}

// NewService constructs one Service from deps.
func NewService(deps Dependencies) (*Service, error) {
	switch {
	case deps.RuntimeRecords == nil:
		return nil, errors.New("new patch runtime service: nil runtime records")
	case deps.OperationLogs == nil:
		return nil, errors.New("new patch runtime service: nil operation logs")
	case deps.Docker == nil:
		return nil, errors.New("new patch runtime service: nil docker client")
	case deps.Leases == nil:
		return nil, errors.New("new patch runtime service: nil lease store")
	case deps.StopService == nil:
		return nil, errors.New("new patch runtime service: nil stop service")
	case deps.StartService == nil:
		return nil, errors.New("new patch runtime service: nil start service")
	case deps.Telemetry == nil:
		return nil, errors.New("new patch runtime service: nil telemetry runtime")
	}
	if err := deps.Coordination.Validate(); err != nil {
		return nil, fmt.Errorf("new patch runtime service: coordination config: %w", err)
	}

	clock := deps.Clock
	if clock == nil {
		clock = time.Now
	}
	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}
	logger = logger.With("service", "rtmanager.patchruntime")

	newToken := deps.NewToken
	if newToken == nil {
		newToken = defaultTokenGenerator()
	}

	return &Service{
		runtimeRecords: deps.RuntimeRecords,
		operationLogs:  deps.OperationLogs,
		docker:         deps.Docker,
		leases:         deps.Leases,
		stopService:    deps.StopService,
		startService:   deps.StartService,
		leaseTTL:       deps.Coordination.GameLeaseTTL,
		telemetry:      deps.Telemetry,
		logger:         logger,
		clock:          clock,
		newToken:       newToken,
	}, nil
}

// Handle executes one patch operation end-to-end. The Go-level error
// return is reserved for non-business failures (nil context, nil
// receiver). Every business outcome — success or any of the stable
// failure codes — flows through Result.
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
	if service == nil {
		return Result{}, errors.New("patch runtime: nil service")
	}
	if ctx == nil {
		return Result{}, errors.New("patch runtime: nil context")
	}

	opStartedAt := service.clock().UTC()

	if err := input.Validate(); err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInvalidRequest,
			errorMessage: err.Error(),
		}), nil
	}

	token := service.newToken()
	leaseStart := service.clock()
	acquired, err := service.leases.TryAcquire(ctx, input.GameID, token, service.leaseTTL)
	service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(leaseStart))
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeServiceUnavailable,
			errorMessage: fmt.Sprintf("acquire game lease: %s", err.Error()),
		}), nil
	}
	if !acquired {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeConflict,
			errorMessage: "another lifecycle operation is in progress for this game",
		}), nil
	}
	defer service.releaseLease(ctx, input.GameID, token)

	return service.runUnderLease(ctx, input, opStartedAt)
}

// runUnderLease executes the lease-protected patch sequence: load the
// runtime record, validate semver compatibility, run inner stop,
// remove the container, run inner start with the new image.
func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) {
	existing, err := service.runtimeRecords.Get(ctx, input.GameID)
	if errors.Is(err, runtime.ErrNotFound) {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeNotFound,
			errorMessage: fmt.Sprintf("runtime record for game %q does not exist", input.GameID),
		}), nil
	}
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInternal,
			errorMessage: fmt.Sprintf("load runtime record: %s", err.Error()),
		}), nil
	}
	if existing.Status == runtime.StatusRemoved {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeConflict,
			errorMessage: fmt.Sprintf("runtime for game %q is removed; cannot patch", input.GameID),
		}), nil
	}
	if strings.TrimSpace(existing.CurrentImageRef) == "" {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInternal,
			errorMessage: fmt.Sprintf("runtime record for game %q has no current image_ref", input.GameID),
		}), nil
	}

	currentSemver, err := extractSemverTag(existing.CurrentImageRef)
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeImageRefNotSemver,
			errorMessage: fmt.Sprintf("current image_ref: %s", err.Error()),
			imageRef:     existing.CurrentImageRef,
		}), nil
	}
	newSemver, err := extractSemverTag(input.NewImageRef)
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeImageRefNotSemver,
			errorMessage: fmt.Sprintf("new image_ref: %s", err.Error()),
			imageRef:     input.NewImageRef,
		}), nil
	}
	if !samePatchSeries(currentSemver, newSemver) {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt: opStartedAt,
			input:       input,
			errorCode:   startruntime.ErrorCodeSemverPatchOnly,
			errorMessage: fmt.Sprintf(
				"patch must keep major.minor; current=%s new=%s",
				currentSemver, newSemver,
			),
			imageRef: input.NewImageRef,
		}), nil
	}

	correlationRef := input.SourceRef
	if correlationRef == "" {
		correlationRef = service.newToken()
	}
	containerID := existing.CurrentContainerID

	stopResult, err := service.stopService.Run(ctx, stopruntime.Input{
		GameID:    input.GameID,
		Reason:    stopruntime.StopReasonAdminRequest,
		OpSource:  input.OpSource,
		SourceRef: correlationRef,
	})
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInternal,
			errorMessage: fmt.Sprintf("inner stop: %s", err.Error()),
			imageRef:     input.NewImageRef,
			containerID:  containerID,
		}), nil
	}
	if stopResult.Outcome == operation.OutcomeFailure {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    stopResult.ErrorCode,
			errorMessage: fmt.Sprintf("inner stop failed: %s", stopResult.ErrorMessage),
			imageRef:     input.NewImageRef,
			containerID:  containerID,
		}), nil
	}

	if containerID != "" {
		if err := service.docker.Remove(ctx, containerID); err != nil {
			return service.recordFailure(ctx, failureCtx{
				opStartedAt:  opStartedAt,
				input:        input,
				errorCode:    startruntime.ErrorCodeServiceUnavailable,
				errorMessage: fmt.Sprintf("docker remove: %s", err.Error()),
				imageRef:     input.NewImageRef,
				containerID:  containerID,
			}), nil
		}
	}

	startResult, err := service.startService.Run(ctx, startruntime.Input{
		GameID:    input.GameID,
		ImageRef:  input.NewImageRef,
		OpSource:  input.OpSource,
		SourceRef: correlationRef,
	})
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInternal,
			errorMessage: fmt.Sprintf("inner start: %s", err.Error()),
			imageRef:     input.NewImageRef,
		}), nil
	}
	if startResult.Outcome == operation.OutcomeFailure {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startResult.ErrorCode,
			errorMessage: fmt.Sprintf("inner start failed: %s", startResult.ErrorMessage),
			imageRef:     input.NewImageRef,
		}), nil
	}

	finishedAt := service.clock().UTC()
	service.bestEffortAppend(ctx, operation.OperationEntry{
		GameID:      input.GameID,
		OpKind:      operation.OpKindPatch,
		OpSource:    input.OpSource,
		SourceRef:   correlationRef,
		ImageRef:    input.NewImageRef,
		ContainerID: startResult.Record.CurrentContainerID,
		Outcome:     operation.OutcomeSuccess,
		StartedAt:   opStartedAt,
		FinishedAt:  &finishedAt,
	})
	service.telemetry.RecordPatchOutcome(ctx, string(operation.OutcomeSuccess), "")

	logArgs := []any{
		"game_id", input.GameID,
		"prev_image_ref", existing.CurrentImageRef,
		"new_image_ref", input.NewImageRef,
		"prev_container_id", containerID,
		"new_container_id", startResult.Record.CurrentContainerID,
		"op_source", string(input.OpSource),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	service.logger.InfoContext(ctx, "runtime patched", logArgs...)

	return Result{
		Record:  startResult.Record,
		Outcome: operation.OutcomeSuccess,
	}, nil
}

// failureCtx groups the inputs to recordFailure.
type failureCtx struct {
	opStartedAt  time.Time
	input        Input
	errorCode    string
	errorMessage string
	imageRef     string
	containerID  string
}

// recordFailure writes the outer failure operation_log entry and emits
// telemetry. Inner stop / start services have already recorded their
// own entries; this is the outer summary.
func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result {
	finishedAt := service.clock().UTC()
	service.bestEffortAppend(ctx, operation.OperationEntry{
		GameID:       fc.input.GameID,
		OpKind:       operation.OpKindPatch,
		OpSource:     fc.input.OpSource,
		SourceRef:    fc.input.SourceRef,
		ImageRef:     fc.imageRef,
		ContainerID:  fc.containerID,
		Outcome:      operation.OutcomeFailure,
		ErrorCode:    fc.errorCode,
		ErrorMessage: fc.errorMessage,
		StartedAt:    fc.opStartedAt,
		FinishedAt:   &finishedAt,
	})
	service.telemetry.RecordPatchOutcome(ctx, string(operation.OutcomeFailure), fc.errorCode)

	logArgs := []any{
		"game_id", fc.input.GameID,
		"image_ref", fc.imageRef,
		"op_source", string(fc.input.OpSource),
		"error_code", fc.errorCode,
		"error_message", fc.errorMessage,
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	service.logger.WarnContext(ctx, "runtime patch failed", logArgs...)

	return Result{
		Outcome:      operation.OutcomeFailure,
		ErrorCode:    fc.errorCode,
		ErrorMessage: fc.errorMessage,
	}
}

// releaseLease releases the per-game lease on a fresh timeout-bound
// context so the release still runs when the caller's context is done.
func (service *Service) releaseLease(ctx context.Context, gameID, token string) {
	cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
	defer cancel()
	if err := service.leases.Release(cleanupCtx, gameID, token); err != nil {
		service.logger.WarnContext(ctx, "release game lease",
			"game_id", gameID,
			"err", err.Error(),
		)
	}
}

// bestEffortAppend appends one operation_log entry and logs (rather than
// propagates) any append failure.
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
	if _, err := service.operationLogs.Append(ctx, entry); err != nil {
		service.logger.ErrorContext(ctx, "append operation log",
			"game_id", entry.GameID,
			"op_kind", string(entry.OpKind),
			"outcome", string(entry.Outcome),
			"error_code", entry.ErrorCode,
			"err", err.Error(),
		)
	}
}

// defaultTokenGenerator returns a generator for random lease tokens,
// falling back to a fixed token if the system randomness source fails.
func defaultTokenGenerator() func() string {
	return func() string {
		var buf [32]byte
		if _, err := rand.Read(buf[:]); err != nil {
			return "rtmanager-fallback-token"
		}
		return base64.RawURLEncoding.EncodeToString(buf[:])
	}
}
@@ -0,0 +1,597 @@
package patchruntime_test

import (
	"context"
	"errors"
	"sync"
	"testing"
	"time"

	"galaxy/notificationintent"
	"galaxy/rtmanager/internal/adapters/docker/mocks"
	"galaxy/rtmanager/internal/config"
	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/patchruntime"
	"galaxy/rtmanager/internal/service/startruntime"
	"galaxy/rtmanager/internal/service/stopruntime"
	"galaxy/rtmanager/internal/telemetry"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"go.uber.org/mock/gomock"
)

// --- shared fake doubles (mirror the restartruntime test pattern) ---

type fakeRuntimeRecords struct {
	mu sync.Mutex

	stored          map[string]runtime.RuntimeRecord
	getErr          error
	upsertErr       error
	updateStatusErr error

	upserts []runtime.RuntimeRecord
	updates []ports.UpdateStatusInput
}

func newFakeRuntimeRecords() *fakeRuntimeRecords {
	return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}}
}

func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.getErr != nil {
		return runtime.RuntimeRecord{}, s.getErr
	}
	record, ok := s.stored[gameID]
	if !ok {
		return runtime.RuntimeRecord{}, runtime.ErrNotFound
	}
	return record, nil
}

func (s *fakeRuntimeRecords) Upsert(_ context.Context, record runtime.RuntimeRecord) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.upsertErr != nil {
		return s.upsertErr
	}
	s.upserts = append(s.upserts, record)
	s.stored[record.GameID] = record
	return nil
}

func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, input ports.UpdateStatusInput) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.updates = append(s.updates, input)
	if s.updateStatusErr != nil {
		return s.updateStatusErr
	}
	record, ok := s.stored[input.GameID]
	if !ok {
		return runtime.ErrNotFound
	}
	if record.Status != input.ExpectedFrom {
		return runtime.ErrConflict
	}
	if input.ExpectedContainerID != "" && record.CurrentContainerID != input.ExpectedContainerID {
		return runtime.ErrConflict
	}
	record.Status = input.To
	record.LastOpAt = input.Now
	switch input.To {
	case runtime.StatusStopped:
		stoppedAt := input.Now
		record.StoppedAt = &stoppedAt
	case runtime.StatusRemoved:
		removedAt := input.Now
		record.RemovedAt = &removedAt
		record.CurrentContainerID = ""
	}
	s.stored[input.GameID] = record
	return nil
}

func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) {
	return nil, errors.New("not used in patch tests")
}

func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
	return nil, errors.New("not used in patch tests")
}

type fakeOperationLogs struct {
	mu sync.Mutex

	appendErr error
	appends   []operation.OperationEntry
}

func (s *fakeOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.appendErr != nil {
		return 0, s.appendErr
	}
	s.appends = append(s.appends, entry)
	return int64(len(s.appends)), nil
}

func (s *fakeOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) {
	return nil, errors.New("not used in patch tests")
}

func (s *fakeOperationLogs) byKind(kind operation.OpKind) []operation.OperationEntry {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := []operation.OperationEntry{}
	for _, entry := range s.appends {
		if entry.OpKind == kind {
			out = append(out, entry)
		}
	}
	return out
}

type fakeLeases struct {
	mu sync.Mutex

	acquired   bool
	acquireErr error
	releaseErr error

	acquires []string
	releases []string
}

func (l *fakeLeases) TryAcquire(_ context.Context, _, token string, _ time.Duration) (bool, error) {
	l.mu.Lock()
	defer l.mu.Unlock()
	l.acquires = append(l.acquires, token)
	if l.acquireErr != nil {
		return false, l.acquireErr
	}
	return l.acquired, nil
}

func (l *fakeLeases) Release(_ context.Context, _, token string) error {
	l.mu.Lock()
	defer l.mu.Unlock()
	l.releases = append(l.releases, token)
	return l.releaseErr
}

type fakeHealthEvents struct {
	mu        sync.Mutex
	envelopes []ports.HealthEventEnvelope
}

func (h *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error {
	h.mu.Lock()
	defer h.mu.Unlock()
	h.envelopes = append(h.envelopes, envelope)
	return nil
}

type fakeNotifications struct {
	mu      sync.Mutex
	intents []notificationintent.Intent
}

func (n *fakeNotifications) Publish(_ context.Context, intent notificationintent.Intent) error {
	n.mu.Lock()
	defer n.mu.Unlock()
	n.intents = append(n.intents, intent)
	return nil
}

type fakeLobby struct{}

func (l *fakeLobby) GetGame(_ context.Context, _ string) (ports.LobbyGameRecord, error) {
	return ports.LobbyGameRecord{}, nil
}

// --- harness ---------------------------------------------------------

type harness struct {
	records       *fakeRuntimeRecords
	operationLogs *fakeOperationLogs
	docker        *mocks.MockDockerClient
	leases        *fakeLeases
	healthEvents  *fakeHealthEvents
	notifications *fakeNotifications
	lobby         *fakeLobby
	telemetry     *telemetry.Runtime

	now      time.Time
	stateDir string

	startService *startruntime.Service
	stopService  *stopruntime.Service
}

func newHarness(t *testing.T) *harness {
	t.Helper()
	ctrl := gomock.NewController(t)
	t.Cleanup(ctrl.Finish)

	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)

	h := &harness{
		records:       newFakeRuntimeRecords(),
		operationLogs: &fakeOperationLogs{},
		docker:        mocks.NewMockDockerClient(ctrl),
		leases:        &fakeLeases{acquired: true},
		healthEvents:  &fakeHealthEvents{},
		notifications: &fakeNotifications{},
		lobby:         &fakeLobby{},
		telemetry:     telemetryRuntime,
		now:           time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC),
		stateDir:      "/var/lib/galaxy/games/game-1",
	}

	containerCfg := config.ContainerConfig{
		DefaultCPUQuota:      1.0,
		DefaultMemory:        "512m",
		DefaultPIDsLimit:     512,
		StopTimeout:          30 * time.Second,
		Retention:            30 * 24 * time.Hour,
		EngineStateMountPath: "/var/lib/galaxy-game",
		EngineStateEnvName:   "GAME_STATE_PATH",
		GameStateDirMode:     0o750,
		GameStateRoot:        "/var/lib/galaxy/games",
	}
	dockerCfg := config.DockerConfig{
		Host:       "unix:///var/run/docker.sock",
		Network:    "galaxy-net",
		LogDriver:  "json-file",
		PullPolicy: config.ImagePullPolicyIfMissing,
	}
	coordinationCfg := config.CoordinationConfig{GameLeaseTTL: time.Minute}

	startService, err := startruntime.NewService(startruntime.Dependencies{
		RuntimeRecords:  h.records,
		OperationLogs:   h.operationLogs,
		Docker:          h.docker,
		Leases:          h.leases,
		HealthEvents:    h.healthEvents,
		Notifications:   h.notifications,
		Lobby:           h.lobby,
		Container:       containerCfg,
		DockerCfg:       dockerCfg,
		Coordination:    coordinationCfg,
		Telemetry:       h.telemetry,
		Clock:           func() time.Time { return h.now },
		NewToken:        func() string { return "inner-start-token" },
		PrepareStateDir: func(_ string) (string, error) { return h.stateDir, nil },
	})
	require.NoError(t, err)
	h.startService = startService

	stopService, err := stopruntime.NewService(stopruntime.Dependencies{
		RuntimeRecords: h.records,
		OperationLogs:  h.operationLogs,
		Docker:         h.docker,
		Leases:         h.leases,
		HealthEvents:   h.healthEvents,
		Container:      containerCfg,
		Coordination:   coordinationCfg,
		Telemetry:      h.telemetry,
		Clock:          func() time.Time { return h.now },
		NewToken:       func() string { return "inner-stop-token" },
	})
	require.NoError(t, err)
	h.stopService = stopService

	return h
}

func (h *harness) build(t *testing.T, tokens ...string) *patchruntime.Service {
	t.Helper()
	tokenIdx := 0
	tokenGen := func() string {
		if tokenIdx >= len(tokens) {
			return "outer-fallback"
		}
		t := tokens[tokenIdx]
		tokenIdx++
		return t
	}
	service, err := patchruntime.NewService(patchruntime.Dependencies{
		RuntimeRecords: h.records,
		OperationLogs:  h.operationLogs,
		Docker:         h.docker,
		Leases:         h.leases,
		StopService:    h.stopService,
		StartService:   h.startService,
		Coordination:   config.CoordinationConfig{GameLeaseTTL: time.Minute},
		Telemetry:      h.telemetry,
		Clock:          func() time.Time { return h.now },
|
||||
NewToken: tokenGen,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
return service
|
||||
}
|
||||
|
||||
const (
|
||||
currentImage = "registry.example.com/galaxy/game:1.4.7"
|
||||
patchImage = "registry.example.com/galaxy/game:1.4.8"
|
||||
majorBump = "registry.example.com/galaxy/game:2.0.0"
|
||||
tagless = "registry.example.com/galaxy/game"
|
||||
notSemver = "registry.example.com/galaxy/game:latest"
|
||||
)
|
||||
|
||||
func runningRecord(now time.Time) runtime.RuntimeRecord {
|
||||
startedAt := now.Add(-time.Hour)
|
||||
return runtime.RuntimeRecord{
|
||||
GameID: "game-1",
|
||||
Status: runtime.StatusRunning,
|
||||
CurrentContainerID: "ctr-old",
|
||||
CurrentImageRef: currentImage,
|
||||
EngineEndpoint: "http://galaxy-game-game-1:8080",
|
||||
StatePath: "/var/lib/galaxy/games/game-1",
|
||||
DockerNetwork: "galaxy-net",
|
||||
StartedAt: &startedAt,
|
||||
LastOpAt: startedAt,
|
||||
CreatedAt: startedAt,
|
||||
}
|
||||
}
|
||||
|
||||
func basicInput() patchruntime.Input {
|
||||
return patchruntime.Input{
|
||||
GameID: "game-1",
|
||||
NewImageRef: patchImage,
|
||||
OpSource: operation.OpSourceGMRest,
|
||||
SourceRef: "rest-req-99",
|
||||
}
|
||||
}
|
||||
|
||||
func sampleRunResult(now time.Time) ports.RunResult {
|
||||
return ports.RunResult{
|
||||
ContainerID: "ctr-new",
|
||||
EngineEndpoint: "http://galaxy-game-game-1:8080",
|
||||
StartedAt: now,
|
||||
}
|
||||
}
|
||||
|
||||
func expectInnerStart(h *harness, image string) {
|
||||
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
|
||||
h.docker.EXPECT().PullImage(gomock.Any(), image, gomock.Any()).Return(nil)
|
||||
h.docker.EXPECT().InspectImage(gomock.Any(), image).Return(ports.ImageInspect{Ref: image}, nil)
|
||||
h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(sampleRunResult(h.now), nil)
|
||||
}
|
||||
|
||||
// --- happy path -----------------------------------------------------
|
||||
|
||||
func TestHandlePatchHappyPath(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
|
||||
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
|
||||
expectInnerStart(h, patchImage)
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
|
||||
assert.Equal(t, patchImage, result.Record.CurrentImageRef)
|
||||
|
||||
patches := h.operationLogs.byKind(operation.OpKindPatch)
|
||||
require.Len(t, patches, 1)
|
||||
assert.Equal(t, "rest-req-99", patches[0].SourceRef)
|
||||
assert.Equal(t, patchImage, patches[0].ImageRef)
|
||||
assert.Equal(t, "ctr-new", patches[0].ContainerID)
|
||||
|
||||
assert.Len(t, h.operationLogs.byKind(operation.OpKindStop), 1)
|
||||
assert.Len(t, h.operationLogs.byKind(operation.OpKindStart), 1)
|
||||
}
|
||||
|
||||
func TestHandlePatchSameImageProceedsAsRecreate(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
|
||||
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
|
||||
expectInnerStart(h, currentImage)
|
||||
|
||||
input := basicInput()
|
||||
input.NewImageRef = currentImage
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
|
||||
require.Len(t, h.operationLogs.byKind(operation.OpKindPatch), 1, "patch entry recorded even when image is unchanged")
|
||||
}
|
||||
|
||||
// --- semver pre-checks ---------------------------------------------
|
||||
|
||||
func TestHandleImageRefNotSemverWhenNewIsTagless(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
input := basicInput()
|
||||
input.NewImageRef = tagless
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeImageRefNotSemver, result.ErrorCode)
|
||||
|
||||
assert.Empty(t, h.operationLogs.byKind(operation.OpKindStop), "no inner stop on pre-check failure")
|
||||
assert.Empty(t, h.operationLogs.byKind(operation.OpKindStart))
|
||||
}
|
||||
|
||||
func TestHandleImageRefNotSemverWhenNewIsNonSemver(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
input := basicInput()
|
||||
input.NewImageRef = notSemver
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, startruntime.ErrorCodeImageRefNotSemver, result.ErrorCode)
|
||||
}
|
||||
|
||||
func TestHandleImageRefNotSemverWhenCurrentIsTagless(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
record := runningRecord(h.now)
|
||||
record.CurrentImageRef = tagless
|
||||
h.records.stored["game-1"] = record
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, startruntime.ErrorCodeImageRefNotSemver, result.ErrorCode)
|
||||
}
|
||||
|
||||
func TestHandleSemverPatchOnlyOnMajorBump(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
input := basicInput()
|
||||
input.NewImageRef = majorBump
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeSemverPatchOnly, result.ErrorCode)
|
||||
|
||||
assert.Empty(t, h.operationLogs.byKind(operation.OpKindStop))
|
||||
assert.Empty(t, h.operationLogs.byKind(operation.OpKindStart))
|
||||
}
|
||||
|
||||
func TestHandleSemverPatchOnlyOnMinorBump(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
input := basicInput()
|
||||
input.NewImageRef = "registry.example.com/galaxy/game:1.5.0"
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, startruntime.ErrorCodeSemverPatchOnly, result.ErrorCode)
|
||||
}
|
||||
|
||||
// --- record state checks -------------------------------------------
|
||||
|
||||
func TestHandleNotFoundForMissingRecord(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
service := h.build(t, "outer-token")
|
||||
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, startruntime.ErrorCodeNotFound, result.ErrorCode)
|
||||
}
|
||||
|
||||
func TestHandleConflictForRemovedRecord(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
removed := runningRecord(h.now)
|
||||
removed.Status = runtime.StatusRemoved
|
||||
removed.CurrentContainerID = ""
|
||||
removedAt := h.now.Add(-time.Hour)
|
||||
removed.RemovedAt = &removedAt
|
||||
h.records.stored["game-1"] = removed
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)
|
||||
}
|
||||
|
||||
// --- failures from inner ops ---------------------------------------
|
||||
|
||||
func TestHandlePropagatesInnerStopFailure(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(errors.New("daemon unreachable"))
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode)
|
||||
}
|
||||
|
||||
func TestHandleServiceUnavailableOnDockerRemoveFailure(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
|
||||
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(errors.New("disk i/o"))
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode)
|
||||
}
|
||||
|
||||
func TestHandlePropagatesInnerStartFailure(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
|
||||
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
|
||||
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
|
||||
h.docker.EXPECT().PullImage(gomock.Any(), patchImage, gomock.Any()).Return(errors.New("manifest unknown"))
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, startruntime.ErrorCodeImagePullFailed, result.ErrorCode)
|
||||
}
|
||||
|
||||
// --- conflicts ------------------------------------------------------
|
||||
|
||||
func TestHandleConflictWhenLeaseBusy(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.leases.acquired = false
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)
|
||||
}
|
||||
|
||||
// --- input validation ----------------------------------------------
|
||||
|
||||
func TestHandleRejectsInvalidInput(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
service := h.build(t, "outer-token")
|
||||
|
||||
cases := []patchruntime.Input{
|
||||
{GameID: "", NewImageRef: patchImage, OpSource: operation.OpSourceGMRest},
|
||||
{GameID: "g", NewImageRef: "", OpSource: operation.OpSourceGMRest},
|
||||
{GameID: "g", NewImageRef: patchImage, OpSource: operation.OpSource("bogus")},
|
||||
}
|
||||
for _, input := range cases {
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, startruntime.ErrorCodeInvalidRequest, result.ErrorCode)
|
||||
}
|
||||
}
|
||||
|
||||
// --- constructor ---------------------------------------------------
|
||||
|
||||
func TestNewServiceRejectsMissingDependencies(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
deps := patchruntime.Dependencies{
|
||||
Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
|
||||
Telemetry: h.telemetry,
|
||||
}
|
||||
_, err := patchruntime.NewService(deps)
|
||||
require.Error(t, err)
|
||||
}
|
||||
@@ -0,0 +1,482 @@
|
||||
// Package restartruntime implements the `restart` lifecycle operation
|
||||
// owned by Runtime Manager. Restart is a recreate: under one outer
|
||||
// per-game lease the service runs the stop service, removes the
|
||||
// container with `docker rm`, and runs the start service with the
|
||||
// runtime's current `image_ref`. The hostname and engine endpoint stay
|
||||
// stable across the recreate; `container_id` changes.
|
||||
//
|
||||
// Lifecycle and failure-mode semantics follow `rtmanager/README.md
|
||||
// §Lifecycles → Restart`. Design rationale is captured in
|
||||
// `rtmanager/docs/services.md`, in particular the lease-sharing
|
||||
// pattern with `startruntime.Service.Run` / `stopruntime.Service.Run`,
|
||||
// the correlation-id reuse on `source_ref`, and the
|
||||
// inner-stop-then-rm-failure recovery rule.
|
||||
package restartruntime
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/rand"
|
||||
"encoding/base64"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/config"
|
||||
"galaxy/rtmanager/internal/domain/operation"
|
||||
"galaxy/rtmanager/internal/domain/runtime"
|
||||
"galaxy/rtmanager/internal/logging"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
"galaxy/rtmanager/internal/service/startruntime"
|
||||
"galaxy/rtmanager/internal/service/stopruntime"
|
||||
"galaxy/rtmanager/internal/telemetry"
|
||||
)
|
||||
|
||||
// leaseReleaseTimeout bounds the deferred lease-release call.
|
||||
const leaseReleaseTimeout = 5 * time.Second
|
||||
|
||||
// Input stores the per-call arguments for one restart operation.
|
||||
type Input struct {
|
||||
// GameID identifies the platform game to restart.
|
||||
GameID string
|
||||
|
||||
// OpSource classifies how the request entered Runtime Manager.
|
||||
// Required: every operation_log entry carries an op_source.
|
||||
OpSource operation.OpSource
|
||||
|
||||
// SourceRef stores the optional opaque per-source reference (REST
|
||||
// request id, admin user id). When non-empty it is reused as the
|
||||
// correlation id linking the outer restart entry to the inner stop
|
||||
// and start log entries.
|
||||
SourceRef string
|
||||
}
|
||||
|
||||
// Validate reports whether input carries the structural invariants the
|
||||
// service requires.
|
||||
func (input Input) Validate() error {
|
||||
if strings.TrimSpace(input.GameID) == "" {
|
||||
return fmt.Errorf("game id must not be empty")
|
||||
}
|
||||
if !input.OpSource.IsKnown() {
|
||||
return fmt.Errorf("op source %q is unsupported", input.OpSource)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Result stores the deterministic outcome of one Handle call.
|
||||
type Result struct {
|
||||
// Record carries the runtime record installed by the inner start on
|
||||
// success; zero on failure.
|
||||
Record runtime.RuntimeRecord
|
||||
|
||||
// Outcome reports whether the operation completed (success) or
|
||||
// produced a stable failure code.
|
||||
Outcome operation.Outcome
|
||||
|
||||
// ErrorCode stores the stable error code on failure. Empty for
|
||||
// success.
|
||||
ErrorCode string
|
||||
|
||||
// ErrorMessage stores the operator-readable detail on failure.
|
||||
// Empty for success.
|
||||
ErrorMessage string
|
||||
}
|
||||
|
||||
// Dependencies groups the collaborators required by Service.
|
||||
type Dependencies struct {
|
||||
// RuntimeRecords reads the runtime record at the start of restart
|
||||
// to capture the current image_ref and container_id.
|
||||
RuntimeRecords ports.RuntimeRecordStore
|
||||
|
||||
// OperationLogs records the outer restart audit entry. Inner stop
|
||||
// and start services append their own entries through their own
|
||||
// stores.
|
||||
OperationLogs ports.OperationLogStore
|
||||
|
||||
// Docker drives the docker rm step between the inner stop and
|
||||
// inner start.
|
||||
Docker ports.DockerClient
|
||||
|
||||
// Leases serialises operations against the same game id. The outer
|
||||
// lease is held for the entire stop + rm + start sequence.
|
||||
Leases ports.GameLeaseStore
|
||||
|
||||
// StopService runs the inner stop step under the outer lease.
|
||||
StopService *stopruntime.Service
|
||||
|
||||
// StartService runs the inner start step under the outer lease.
|
||||
StartService *startruntime.Service
|
||||
|
||||
// Coordination supplies the per-game lease TTL.
|
||||
Coordination config.CoordinationConfig
|
||||
|
||||
// Telemetry records restart outcomes and lease latency. Required.
|
||||
Telemetry *telemetry.Runtime
|
||||
|
||||
// Logger records structured service-level events. Defaults to
|
||||
// `slog.Default()` when nil.
|
||||
Logger *slog.Logger
|
||||
|
||||
// Clock supplies the wall-clock used for operation timestamps.
|
||||
// Defaults to `time.Now` when nil.
|
||||
Clock func() time.Time
|
||||
|
||||
// NewToken supplies a unique opaque token. Used both for the lease
|
||||
// and for the correlation id when Input.SourceRef is empty.
|
||||
// Defaults to a 32-byte random base64url string when nil.
|
||||
NewToken func() string
|
||||
}
|
||||
|
||||
// Service executes the restart lifecycle operation.
|
||||
type Service struct {
|
||||
runtimeRecords ports.RuntimeRecordStore
|
||||
operationLogs ports.OperationLogStore
|
||||
docker ports.DockerClient
|
||||
leases ports.GameLeaseStore
|
||||
stopService *stopruntime.Service
|
||||
startService *startruntime.Service
|
||||
|
||||
leaseTTL time.Duration
|
||||
|
||||
telemetry *telemetry.Runtime
|
||||
logger *slog.Logger
|
||||
|
||||
clock func() time.Time
|
||||
newToken func() string
|
||||
}
|
||||
|
||||
// NewService constructs one Service from deps.
|
||||
func NewService(deps Dependencies) (*Service, error) {
|
||||
switch {
|
||||
case deps.RuntimeRecords == nil:
|
||||
return nil, errors.New("new restart runtime service: nil runtime records")
|
||||
case deps.OperationLogs == nil:
|
||||
return nil, errors.New("new restart runtime service: nil operation logs")
|
||||
case deps.Docker == nil:
|
||||
return nil, errors.New("new restart runtime service: nil docker client")
|
||||
case deps.Leases == nil:
|
||||
return nil, errors.New("new restart runtime service: nil lease store")
|
||||
case deps.StopService == nil:
|
||||
return nil, errors.New("new restart runtime service: nil stop service")
|
||||
case deps.StartService == nil:
|
||||
return nil, errors.New("new restart runtime service: nil start service")
|
||||
case deps.Telemetry == nil:
|
||||
return nil, errors.New("new restart runtime service: nil telemetry runtime")
|
||||
}
|
||||
if err := deps.Coordination.Validate(); err != nil {
|
||||
return nil, fmt.Errorf("new restart runtime service: coordination config: %w", err)
|
||||
}
|
||||
|
||||
clock := deps.Clock
|
||||
if clock == nil {
|
||||
clock = time.Now
|
||||
}
|
||||
logger := deps.Logger
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
logger = logger.With("service", "rtmanager.restartruntime")
|
||||
|
||||
newToken := deps.NewToken
|
||||
if newToken == nil {
|
||||
newToken = defaultTokenGenerator()
|
||||
}
|
||||
|
||||
return &Service{
|
||||
runtimeRecords: deps.RuntimeRecords,
|
||||
operationLogs: deps.OperationLogs,
|
||||
docker: deps.Docker,
|
||||
leases: deps.Leases,
|
||||
stopService: deps.StopService,
|
||||
startService: deps.StartService,
|
||||
leaseTTL: deps.Coordination.GameLeaseTTL,
|
||||
telemetry: deps.Telemetry,
|
||||
logger: logger,
|
||||
clock: clock,
|
||||
newToken: newToken,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Handle executes one restart operation end-to-end. The Go-level error
|
||||
// return is reserved for non-business failures (nil context, nil
|
||||
// receiver). Every business outcome — success or any of the stable
|
||||
// failure codes — flows through Result.
|
||||
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
|
||||
if service == nil {
|
||||
return Result{}, errors.New("restart runtime: nil service")
|
||||
}
|
||||
if ctx == nil {
|
||||
return Result{}, errors.New("restart runtime: nil context")
|
||||
}
|
||||
|
||||
opStartedAt := service.clock().UTC()
|
||||
|
||||
if err := input.Validate(); err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInvalidRequest,
|
||||
errorMessage: err.Error(),
|
||||
}), nil
|
||||
}
|
||||
|
||||
token := service.newToken()
|
||||
leaseStart := service.clock()
|
||||
acquired, err := service.leases.TryAcquire(ctx, input.GameID, token, service.leaseTTL)
|
||||
service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(leaseStart))
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeServiceUnavailable,
|
||||
errorMessage: fmt.Sprintf("acquire game lease: %s", err.Error()),
|
||||
}), nil
|
||||
}
|
||||
if !acquired {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeConflict,
|
||||
errorMessage: "another lifecycle operation is in progress for this game",
|
||||
}), nil
|
||||
}
|
||||
defer service.releaseLease(ctx, input.GameID, token)
|
||||
|
||||
return service.runUnderLease(ctx, input, opStartedAt)
|
||||
}
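// A minimal caller sketch for Handle (illustrative only; the handler shape,
// requestID, and the outcome handling are assumptions, not part of this
// package):
//
//	result, err := svc.Handle(ctx, Input{
//		GameID:    "game-1",
//		OpSource:  operation.OpSourceGMRest,
//		SourceRef: requestID,
//	})
//	if err != nil {
//		// nil context / nil receiver only; treat as an internal error.
//	}
//	switch {
//	case result.Outcome == operation.OutcomeSuccess:
//		// result.Record carries the record installed by the inner start.
//	case result.ErrorCode == startruntime.ErrorCodeConflict:
//		// another lifecycle operation holds the lease, or the runtime is removed.
//	}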
|
||||
|
||||
// runUnderLease executes the lease-protected restart sequence: it loads
|
||||
// the runtime record, runs inner stop, removes the container, runs
|
||||
// inner start.
|
||||
func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) {
|
||||
existing, err := service.runtimeRecords.Get(ctx, input.GameID)
|
||||
if errors.Is(err, runtime.ErrNotFound) {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeNotFound,
|
||||
errorMessage: fmt.Sprintf("runtime record for game %q does not exist", input.GameID),
|
||||
}), nil
|
||||
}
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInternal,
|
||||
errorMessage: fmt.Sprintf("load runtime record: %s", err.Error()),
|
||||
}), nil
|
||||
}
|
||||
if existing.Status == runtime.StatusRemoved {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeConflict,
|
||||
errorMessage: fmt.Sprintf("runtime for game %q is removed; cannot restart", input.GameID),
|
||||
imageRef: existing.CurrentImageRef,
|
||||
}), nil
|
||||
}
|
||||
if strings.TrimSpace(existing.CurrentImageRef) == "" {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInternal,
|
||||
errorMessage: fmt.Sprintf("runtime record for game %q has no image_ref to restart with", input.GameID),
|
||||
}), nil
|
||||
}
|
||||
|
||||
correlationRef := input.SourceRef
|
||||
if correlationRef == "" {
|
||||
correlationRef = service.newToken()
|
||||
}
|
||||
containerID := existing.CurrentContainerID
|
||||
imageRef := existing.CurrentImageRef
|
||||
|
||||
stopResult, err := service.stopService.Run(ctx, stopruntime.Input{
|
||||
GameID: input.GameID,
|
||||
Reason: stopruntime.StopReasonAdminRequest,
|
||||
OpSource: input.OpSource,
|
||||
SourceRef: correlationRef,
|
||||
})
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInternal,
|
||||
errorMessage: fmt.Sprintf("inner stop: %s", err.Error()),
|
||||
imageRef: imageRef,
|
||||
containerID: containerID,
|
||||
}), nil
|
||||
}
|
||||
if stopResult.Outcome == operation.OutcomeFailure {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: stopResult.ErrorCode,
|
||||
errorMessage: fmt.Sprintf("inner stop failed: %s", stopResult.ErrorMessage),
|
||||
imageRef: imageRef,
|
||||
containerID: containerID,
|
||||
}), nil
|
||||
}
|
||||
|
||||
if containerID != "" {
|
||||
if err := service.docker.Remove(ctx, containerID); err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeServiceUnavailable,
|
||||
errorMessage: fmt.Sprintf("docker remove: %s", err.Error()),
|
||||
imageRef: imageRef,
|
||||
containerID: containerID,
|
||||
}), nil
|
||||
}
|
||||
}
|
||||
|
||||
startResult, err := service.startService.Run(ctx, startruntime.Input{
|
||||
GameID: input.GameID,
|
||||
ImageRef: imageRef,
|
||||
OpSource: input.OpSource,
|
||||
SourceRef: correlationRef,
|
||||
})
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInternal,
|
||||
errorMessage: fmt.Sprintf("inner start: %s", err.Error()),
|
||||
imageRef: imageRef,
|
||||
}), nil
|
||||
}
|
||||
if startResult.Outcome == operation.OutcomeFailure {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startResult.ErrorCode,
|
||||
errorMessage: fmt.Sprintf("inner start failed: %s", startResult.ErrorMessage),
|
||||
imageRef: imageRef,
|
||||
}), nil
|
||||
}
|
||||
|
||||
finishedAt := service.clock().UTC()
|
||||
service.bestEffortAppend(ctx, operation.OperationEntry{
|
||||
GameID: input.GameID,
|
||||
OpKind: operation.OpKindRestart,
|
||||
OpSource: input.OpSource,
|
||||
SourceRef: correlationRef,
|
||||
ImageRef: imageRef,
|
||||
ContainerID: startResult.Record.CurrentContainerID,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
StartedAt: opStartedAt,
|
||||
FinishedAt: &finishedAt,
|
||||
})
|
||||
service.telemetry.RecordRestartOutcome(ctx, string(operation.OutcomeSuccess), "")
|
||||
|
||||
logArgs := []any{
|
||||
"game_id", input.GameID,
|
||||
"prev_container_id", containerID,
|
||||
"new_container_id", startResult.Record.CurrentContainerID,
|
||||
"image_ref", imageRef,
|
||||
"op_source", string(input.OpSource),
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
service.logger.InfoContext(ctx, "runtime restarted", logArgs...)
|
||||
|
||||
return Result{
|
||||
Record: startResult.Record,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// failureCtx groups the inputs to recordFailure.
|
||||
type failureCtx struct {
|
||||
opStartedAt time.Time
|
||||
input Input
|
||||
errorCode string
|
||||
errorMessage string
|
||||
imageRef string
|
||||
containerID string
|
||||
}
|
||||
|
||||
// recordFailure records the outer failure operation_log entry and emits
|
||||
// telemetry. Inner stop / start services have already recorded their
|
||||
// own entries; this is the outer summary.
|
||||
func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result {
|
||||
finishedAt := service.clock().UTC()
|
||||
service.bestEffortAppend(ctx, operation.OperationEntry{
|
||||
GameID: fc.input.GameID,
|
||||
OpKind: operation.OpKindRestart,
|
||||
OpSource: fc.input.OpSource,
|
||||
SourceRef: correlationRefOrEmpty(fc.input),
|
||||
ImageRef: fc.imageRef,
|
||||
ContainerID: fc.containerID,
|
||||
Outcome: operation.OutcomeFailure,
|
||||
ErrorCode: fc.errorCode,
|
||||
ErrorMessage: fc.errorMessage,
|
||||
StartedAt: fc.opStartedAt,
|
||||
FinishedAt: &finishedAt,
|
||||
})
|
||||
service.telemetry.RecordRestartOutcome(ctx, string(operation.OutcomeFailure), fc.errorCode)
|
||||
|
||||
logArgs := []any{
|
||||
"game_id", fc.input.GameID,
|
||||
"image_ref", fc.imageRef,
|
||||
"op_source", string(fc.input.OpSource),
|
||||
"error_code", fc.errorCode,
|
||||
"error_message", fc.errorMessage,
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
service.logger.WarnContext(ctx, "runtime restart failed", logArgs...)
|
||||
|
||||
return Result{
|
||||
Outcome: operation.OutcomeFailure,
|
||||
ErrorCode: fc.errorCode,
|
||||
ErrorMessage: fc.errorMessage,
|
||||
}
|
||||
}
|
||||
|
||||
// correlationRefOrEmpty returns the original Input.SourceRef for the
|
||||
// outer entry. Outer-failure paths that did not yet generate a
|
||||
// correlation id (input validation, lease busy) keep the original
|
||||
// `source_ref`, which is the caller-supplied actor reference.
|
||||
func correlationRefOrEmpty(input Input) string {
|
||||
return input.SourceRef
|
||||
}
|
||||
|
||||
// releaseLease releases the per-game lease in a fresh background context so
// the release still runs when the caller's context is already cancelled.
|
||||
func (service *Service) releaseLease(ctx context.Context, gameID, token string) {
|
||||
cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
|
||||
defer cancel()
|
||||
if err := service.leases.Release(cleanupCtx, gameID, token); err != nil {
|
||||
service.logger.WarnContext(ctx, "release game lease",
|
||||
"game_id", gameID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// bestEffortAppend writes one outer operation_log entry. Inner ops have
|
||||
// already appended their own; a failure here only loses the outer
|
||||
// summary, which is acceptable.
|
||||
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
|
||||
if _, err := service.operationLogs.Append(ctx, entry); err != nil {
|
||||
service.logger.ErrorContext(ctx, "append operation log",
|
||||
"game_id", entry.GameID,
|
||||
"op_kind", string(entry.OpKind),
|
||||
"outcome", string(entry.Outcome),
|
||||
"error_code", entry.ErrorCode,
|
||||
"err", err.Error(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// defaultTokenGenerator returns a function that produces 32-byte
|
||||
// base64url-encoded tokens.
|
||||
func defaultTokenGenerator() func() string {
|
||||
return func() string {
|
||||
var buf [32]byte
|
||||
if _, err := rand.Read(buf[:]); err != nil {
|
||||
return "rtmanager-fallback-token"
|
||||
}
|
||||
return base64.RawURLEncoding.EncodeToString(buf[:])
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,584 @@
|
||||
package restartruntime_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"galaxy/notificationintent"
|
||||
"galaxy/rtmanager/internal/adapters/docker/mocks"
|
||||
"galaxy/rtmanager/internal/config"
|
||||
"galaxy/rtmanager/internal/domain/operation"
|
||||
"galaxy/rtmanager/internal/domain/runtime"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
"galaxy/rtmanager/internal/service/restartruntime"
|
||||
"galaxy/rtmanager/internal/service/startruntime"
|
||||
"galaxy/rtmanager/internal/service/stopruntime"
|
||||
"galaxy/rtmanager/internal/telemetry"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"go.uber.org/mock/gomock"
|
||||
)
|
||||
|
||||
// --- shared fake doubles ----------------------------------------------
|
||||
|
||||
type fakeRuntimeRecords struct {
|
||||
mu sync.Mutex
|
||||
|
||||
stored map[string]runtime.RuntimeRecord
|
||||
getErr error
|
||||
upsertErr error
|
||||
updateStatusErr error
|
||||
|
||||
upserts []runtime.RuntimeRecord
|
||||
updates []ports.UpdateStatusInput
|
||||
}
|
||||
|
||||
func newFakeRuntimeRecords() *fakeRuntimeRecords {
|
||||
return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}}
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
if s.getErr != nil {
|
||||
return runtime.RuntimeRecord{}, s.getErr
|
||||
}
|
||||
record, ok := s.stored[gameID]
|
||||
if !ok {
|
||||
return runtime.RuntimeRecord{}, runtime.ErrNotFound
|
||||
}
|
||||
return record, nil
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) Upsert(_ context.Context, record runtime.RuntimeRecord) error {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
if s.upsertErr != nil {
|
||||
return s.upsertErr
|
||||
}
|
||||
s.upserts = append(s.upserts, record)
|
||||
s.stored[record.GameID] = record
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, input ports.UpdateStatusInput) error {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
s.updates = append(s.updates, input)
|
||||
if s.updateStatusErr != nil {
|
||||
return s.updateStatusErr
|
||||
}
|
||||
record, ok := s.stored[input.GameID]
|
||||
if !ok {
|
||||
return runtime.ErrNotFound
|
||||
}
|
||||
if record.Status != input.ExpectedFrom {
|
||||
return runtime.ErrConflict
|
||||
}
|
||||
if input.ExpectedContainerID != "" && record.CurrentContainerID != input.ExpectedContainerID {
|
||||
return runtime.ErrConflict
|
||||
}
|
||||
record.Status = input.To
|
||||
record.LastOpAt = input.Now
|
||||
switch input.To {
|
||||
case runtime.StatusStopped:
|
||||
stoppedAt := input.Now
|
||||
record.StoppedAt = &stoppedAt
|
||||
case runtime.StatusRemoved:
|
||||
removedAt := input.Now
|
||||
record.RemovedAt = &removedAt
|
||||
record.CurrentContainerID = ""
|
||||
}
|
||||
s.stored[input.GameID] = record
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) {
|
||||
return nil, errors.New("not used in restart tests")
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
|
||||
return nil, errors.New("not used in restart tests")
|
||||
}
|
||||
|
||||
type fakeOperationLogs struct {
|
||||
mu sync.Mutex
|
||||
|
||||
appendErr error
|
||||
appends []operation.OperationEntry
|
||||
}
|
||||
|
||||
func (s *fakeOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
if s.appendErr != nil {
|
||||
return 0, s.appendErr
|
||||
}
|
||||
s.appends = append(s.appends, entry)
|
||||
return int64(len(s.appends)), nil
|
||||
}
|
||||
|
||||
func (s *fakeOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) {
|
||||
return nil, errors.New("not used in restart tests")
|
||||
}
|
||||
|
||||
func (s *fakeOperationLogs) byKind(kind operation.OpKind) []operation.OperationEntry {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
out := []operation.OperationEntry{}
|
||||
for _, entry := range s.appends {
|
||||
if entry.OpKind == kind {
|
||||
out = append(out, entry)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
type fakeLeases struct {
|
||||
mu sync.Mutex
|
||||
|
||||
acquired bool
|
||||
acquireErr error
|
||||
releaseErr error
|
||||
|
||||
acquires []string
|
||||
releases []string
|
||||
}
|
||||
|
||||
func (l *fakeLeases) TryAcquire(_ context.Context, _, token string, _ time.Duration) (bool, error) {
|
||||
l.mu.Lock()
|
||||
defer l.mu.Unlock()
|
||||
l.acquires = append(l.acquires, token)
|
||||
if l.acquireErr != nil {
|
||||
return false, l.acquireErr
|
||||
}
|
||||
return l.acquired, nil
|
||||
}
|
||||
|
||||
func (l *fakeLeases) Release(_ context.Context, _, token string) error {
|
||||
l.mu.Lock()
|
||||
defer l.mu.Unlock()
|
||||
l.releases = append(l.releases, token)
|
||||
return l.releaseErr
|
||||
}
|
||||
|
||||
type fakeHealthEvents struct {
|
||||
mu sync.Mutex
|
||||
|
||||
publishErr error
|
||||
envelopes []ports.HealthEventEnvelope
|
||||
}
|
||||
|
||||
func (h *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error {
|
||||
h.mu.Lock()
|
||||
defer h.mu.Unlock()
|
||||
if h.publishErr != nil {
|
||||
return h.publishErr
|
||||
}
|
||||
h.envelopes = append(h.envelopes, envelope)
|
||||
return nil
|
||||
}
|
||||
|
||||
type fakeNotifications struct {
|
||||
mu sync.Mutex
|
||||
|
||||
publishErr error
|
||||
intents []notificationintent.Intent
|
||||
}
|
||||
|
||||
func (n *fakeNotifications) Publish(_ context.Context, intent notificationintent.Intent) error {
|
||||
n.mu.Lock()
|
||||
defer n.mu.Unlock()
|
||||
if n.publishErr != nil {
|
||||
return n.publishErr
|
||||
}
|
||||
n.intents = append(n.intents, intent)
|
||||
return nil
|
||||
}
|
||||
|
||||
type fakeLobby struct {
|
||||
record ports.LobbyGameRecord
|
||||
err error
|
||||
}
|
||||
|
||||
func (l *fakeLobby) GetGame(_ context.Context, _ string) (ports.LobbyGameRecord, error) {
|
||||
if l.err != nil {
|
||||
return ports.LobbyGameRecord{}, l.err
|
||||
}
|
||||
return l.record, nil
|
||||
}
|
||||
|
||||
// --- harness ----------------------------------------------------------
|
||||
|
||||
type harness struct {
|
||||
records *fakeRuntimeRecords
|
||||
operationLogs *fakeOperationLogs
|
||||
docker *mocks.MockDockerClient
|
||||
leases *fakeLeases
|
||||
healthEvents *fakeHealthEvents
|
||||
notifications *fakeNotifications
|
||||
lobby *fakeLobby
|
||||
telemetry *telemetry.Runtime
|
||||
|
||||
now time.Time
|
||||
stateDir string
|
||||
|
||||
startService *startruntime.Service
|
||||
stopService *stopruntime.Service
|
||||
}
|
||||
|
||||
func newHarness(t *testing.T) *harness {
|
||||
t.Helper()
|
||||
ctrl := gomock.NewController(t)
|
||||
t.Cleanup(ctrl.Finish)
|
||||
|
||||
telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
|
||||
require.NoError(t, err)
|
||||
|
||||
h := &harness{
|
||||
records: newFakeRuntimeRecords(),
|
||||
operationLogs: &fakeOperationLogs{},
|
||||
docker: mocks.NewMockDockerClient(ctrl),
|
||||
leases: &fakeLeases{acquired: true},
|
||||
healthEvents: &fakeHealthEvents{},
|
||||
notifications: &fakeNotifications{},
|
||||
lobby: &fakeLobby{},
|
||||
telemetry: telemetryRuntime,
|
||||
now: time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC),
|
||||
stateDir: "/var/lib/galaxy/games/game-1",
|
||||
}
|
||||
|
||||
containerCfg := config.ContainerConfig{
|
||||
DefaultCPUQuota: 1.0,
|
||||
DefaultMemory: "512m",
|
||||
DefaultPIDsLimit: 512,
|
||||
StopTimeout: 30 * time.Second,
|
||||
Retention: 30 * 24 * time.Hour,
|
||||
EngineStateMountPath: "/var/lib/galaxy-game",
|
||||
EngineStateEnvName: "GAME_STATE_PATH",
|
||||
GameStateDirMode: 0o750,
|
||||
GameStateRoot: "/var/lib/galaxy/games",
|
||||
}
|
||||
dockerCfg := config.DockerConfig{
|
||||
Host: "unix:///var/run/docker.sock",
|
||||
Network: "galaxy-net",
|
||||
LogDriver: "json-file",
|
||||
PullPolicy: config.ImagePullPolicyIfMissing,
|
||||
}
|
||||
coordinationCfg := config.CoordinationConfig{GameLeaseTTL: time.Minute}
|
||||
|
||||
startService, err := startruntime.NewService(startruntime.Dependencies{
|
||||
RuntimeRecords: h.records,
|
||||
OperationLogs: h.operationLogs,
|
||||
Docker: h.docker,
|
||||
Leases: h.leases,
|
||||
HealthEvents: h.healthEvents,
|
||||
Notifications: h.notifications,
|
||||
Lobby: h.lobby,
|
||||
Container: containerCfg,
|
||||
DockerCfg: dockerCfg,
|
||||
Coordination: coordinationCfg,
|
||||
Telemetry: h.telemetry,
|
||||
Clock: func() time.Time { return h.now },
|
||||
NewToken: func() string { return "inner-start-token" },
|
||||
PrepareStateDir: func(_ string) (string, error) { return h.stateDir, nil },
|
||||
})
|
||||
require.NoError(t, err)
|
||||
h.startService = startService
|
||||
|
||||
stopService, err := stopruntime.NewService(stopruntime.Dependencies{
|
||||
RuntimeRecords: h.records,
|
||||
OperationLogs: h.operationLogs,
|
||||
Docker: h.docker,
|
||||
Leases: h.leases,
|
||||
HealthEvents: h.healthEvents,
|
||||
Container: containerCfg,
|
||||
Coordination: coordinationCfg,
|
||||
Telemetry: h.telemetry,
|
||||
Clock: func() time.Time { return h.now },
|
||||
NewToken: func() string { return "inner-stop-token" },
|
||||
})
|
||||
require.NoError(t, err)
|
||||
h.stopService = stopService
|
||||
|
||||
return h
|
||||
}
|
||||
|
||||
func (h *harness) build(t *testing.T, tokens ...string) *restartruntime.Service {
|
||||
t.Helper()
|
||||
tokenIdx := 0
|
||||
tokenGen := func() string {
|
||||
if tokenIdx >= len(tokens) {
|
||||
return "outer-fallback"
|
||||
}
|
||||
t := tokens[tokenIdx]
|
||||
tokenIdx++
|
||||
return t
|
||||
}
|
||||
service, err := restartruntime.NewService(restartruntime.Dependencies{
|
||||
RuntimeRecords: h.records,
|
||||
OperationLogs: h.operationLogs,
|
||||
Docker: h.docker,
|
||||
Leases: h.leases,
|
||||
StopService: h.stopService,
|
||||
StartService: h.startService,
|
||||
Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
|
||||
Telemetry: h.telemetry,
|
||||
Clock: func() time.Time { return h.now },
|
||||
NewToken: tokenGen,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
return service
|
||||
}
|
||||
|
||||
const imageRef = "registry.example.com/galaxy/game:1.4.7"
|
||||
|
||||
func runningRecord(now time.Time) runtime.RuntimeRecord {
|
||||
startedAt := now.Add(-time.Hour)
|
||||
return runtime.RuntimeRecord{
|
||||
GameID: "game-1",
|
||||
Status: runtime.StatusRunning,
|
||||
CurrentContainerID: "ctr-old",
|
||||
CurrentImageRef: imageRef,
|
||||
EngineEndpoint: "http://galaxy-game-game-1:8080",
|
||||
StatePath: "/var/lib/galaxy/games/game-1",
|
||||
DockerNetwork: "galaxy-net",
|
||||
StartedAt: &startedAt,
|
||||
LastOpAt: startedAt,
|
||||
CreatedAt: startedAt,
|
||||
}
|
||||
}
|
||||
|
||||
func basicInput() restartruntime.Input {
|
||||
return restartruntime.Input{
|
||||
GameID: "game-1",
|
||||
OpSource: operation.OpSourceGMRest,
|
||||
SourceRef: "rest-req-42",
|
||||
}
|
||||
}
|
||||
|
||||
func sampleRunResult(now time.Time) ports.RunResult {
|
||||
return ports.RunResult{
|
||||
ContainerID: "ctr-new",
|
||||
EngineEndpoint: "http://galaxy-game-game-1:8080",
|
||||
StartedAt: now,
|
||||
}
|
||||
}
|
||||
|
||||
func expectInnerStart(h *harness) {
|
||||
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
|
||||
h.docker.EXPECT().PullImage(gomock.Any(), imageRef, gomock.Any()).Return(nil)
|
||||
h.docker.EXPECT().InspectImage(gomock.Any(), imageRef).Return(ports.ImageInspect{Ref: imageRef}, nil)
|
||||
h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(sampleRunResult(h.now), nil)
|
||||
}
|
||||
|
||||
// --- happy path -------------------------------------------------------
|
||||
|
||||
func TestHandleRestartFromRunning(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
|
||||
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
|
||||
expectInnerStart(h)
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
|
||||
assert.Empty(t, result.ErrorCode)
|
||||
assert.Equal(t, "ctr-new", result.Record.CurrentContainerID)
|
||||
assert.Equal(t, imageRef, result.Record.CurrentImageRef)
|
||||
assert.Equal(t, runtime.StatusRunning, result.Record.Status)
|
||||
|
||||
stops := h.operationLogs.byKind(operation.OpKindStop)
|
||||
starts := h.operationLogs.byKind(operation.OpKindStart)
|
||||
restarts := h.operationLogs.byKind(operation.OpKindRestart)
|
||||
require.Len(t, stops, 1, "inner stop appended its own entry")
|
||||
require.Len(t, starts, 1, "inner start appended its own entry")
|
||||
require.Len(t, restarts, 1, "outer restart appended one summary entry")
|
||||
|
||||
assert.Equal(t, "rest-req-42", stops[0].SourceRef, "correlation id propagated to inner stop")
|
||||
assert.Equal(t, "rest-req-42", starts[0].SourceRef, "correlation id propagated to inner start")
|
||||
assert.Equal(t, "rest-req-42", restarts[0].SourceRef, "correlation id stored on outer restart")
|
||||
assert.Equal(t, "ctr-new", restarts[0].ContainerID)
|
||||
assert.Equal(t, imageRef, restarts[0].ImageRef)
|
||||
|
||||
assert.Equal(t, []string{"outer-token"}, h.leases.acquires)
|
||||
assert.Equal(t, []string{"outer-token"}, h.leases.releases)
|
||||
}
|
||||
|
||||
func TestHandleRestartFromStopped(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
stoppedRecord := runningRecord(h.now)
|
||||
stoppedRecord.Status = runtime.StatusStopped
|
||||
stoppedAt := h.now.Add(-30 * time.Minute)
|
||||
stoppedRecord.StoppedAt = &stoppedAt
|
||||
h.records.stored["game-1"] = stoppedRecord
|
||||
|
||||
// No docker.Stop expected: the inner stop short-circuits as a replay no-op.
|
||||
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
|
||||
expectInnerStart(h)
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
|
||||
assert.Equal(t, "ctr-new", result.Record.CurrentContainerID)
|
||||
}
|
||||
|
||||
// --- correlation id fallback -----------------------------------------
|
||||
|
||||
func TestHandleGeneratesCorrelationWhenSourceRefEmpty(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
|
||||
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
|
||||
expectInnerStart(h)
|
||||
|
||||
input := basicInput()
|
||||
input.SourceRef = ""
|
||||
|
||||
// The first newToken call yields the lease token; the second yields the
|
||||
// correlation id fallback.
|
||||
service := h.build(t, "outer-token", "correlation-fallback")
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
|
||||
|
||||
stops := h.operationLogs.byKind(operation.OpKindStop)
|
||||
starts := h.operationLogs.byKind(operation.OpKindStart)
|
||||
restarts := h.operationLogs.byKind(operation.OpKindRestart)
|
||||
require.Len(t, stops, 1)
|
||||
require.Len(t, starts, 1)
|
||||
require.Len(t, restarts, 1)
|
||||
assert.Equal(t, "correlation-fallback", stops[0].SourceRef)
|
||||
assert.Equal(t, "correlation-fallback", starts[0].SourceRef)
|
||||
assert.Equal(t, "correlation-fallback", restarts[0].SourceRef)
|
||||
}
|
||||
|
||||
// --- failure paths ---------------------------------------------------
|
||||
|
||||
func TestHandleNotFoundForMissingRecord(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
service := h.build(t, "outer-token")
|
||||
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeNotFound, result.ErrorCode)
|
||||
assert.Empty(t, h.operationLogs.byKind(operation.OpKindStop))
|
||||
assert.Empty(t, h.operationLogs.byKind(operation.OpKindStart))
|
||||
require.Len(t, h.operationLogs.byKind(operation.OpKindRestart), 1)
|
||||
}
|
||||
|
||||
func TestHandleConflictForRemovedRecord(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
removed := runningRecord(h.now)
|
||||
removed.Status = runtime.StatusRemoved
|
||||
removed.CurrentContainerID = ""
|
||||
removedAt := h.now.Add(-time.Hour)
|
||||
removed.RemovedAt = &removedAt
|
||||
h.records.stored["game-1"] = removed
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)
|
||||
}
|
||||
|
||||
func TestHandleConflictWhenLeaseBusy(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.leases.acquired = false
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)
|
||||
assert.Empty(t, h.leases.releases, "release must not run when acquire returned false")
|
||||
}
|
||||
|
||||
func TestHandlePropagatesInnerStopFailure(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(errors.New("daemon unreachable"))
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode)
|
||||
assert.Contains(t, result.ErrorMessage, "inner stop failed")
|
||||
}
|
||||
|
||||
func TestHandleServiceUnavailableOnDockerRemoveFailure(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
|
||||
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(errors.New("disk i/o"))
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode)
|
||||
assert.Contains(t, result.ErrorMessage, "docker remove")
|
||||
// Inner stop succeeded and wrote its log entry; the outer restart records the failure.
|
||||
require.Len(t, h.operationLogs.byKind(operation.OpKindStop), 1)
|
||||
require.Len(t, h.operationLogs.byKind(operation.OpKindRestart), 1)
|
||||
}
|
||||
|
||||
func TestHandlePropagatesInnerStartFailure(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
|
||||
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
|
||||
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
|
||||
h.docker.EXPECT().PullImage(gomock.Any(), imageRef, gomock.Any()).Return(errors.New("manifest unknown"))
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeImagePullFailed, result.ErrorCode)
|
||||
assert.Contains(t, result.ErrorMessage, "inner start failed")
|
||||
}
|
||||
|
||||
// --- input validation ------------------------------------------------
|
||||
|
||||
func TestHandleRejectsInvalidInput(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
service := h.build(t, "outer-token")
|
||||
|
||||
cases := []restartruntime.Input{
|
||||
{GameID: "", OpSource: operation.OpSourceGMRest},
|
||||
{GameID: "g", OpSource: operation.OpSource("bogus")},
|
||||
}
|
||||
for _, input := range cases {
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, startruntime.ErrorCodeInvalidRequest, result.ErrorCode)
|
||||
}
|
||||
}
|
||||
|
||||
// --- constructor -----------------------------------------------------
|
||||
|
||||
func TestNewServiceRejectsMissingDependencies(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
deps := restartruntime.Dependencies{
|
||||
Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
|
||||
Telemetry: h.telemetry,
|
||||
}
|
||||
_, err := restartruntime.NewService(deps)
|
||||
require.Error(t, err)
|
||||
}
|
||||
@@ -0,0 +1,68 @@
|
||||
package startruntime
|
||||
|
||||
// Stable error codes returned in `Result.ErrorCode`. The values match the
|
||||
// vocabulary frozen by `rtmanager/README.md §Error Model`,
|
||||
// `rtmanager/api/internal-openapi.yaml`, and
|
||||
// `rtmanager/api/runtime-jobs-asyncapi.yaml`. Although the constants live
|
||||
// in the start-service package, they are the canonical home for every
|
||||
// lifecycle service in `internal/service/`. Stop, restart, patch,
|
||||
// cleanup, the REST handlers, and the stream consumers import these
|
||||
// names rather than redeclare them; renaming any of them is a contract
|
||||
// change.
|
||||
const (
|
||||
// ErrorCodeReplayNoOp reports that the request was an idempotent
|
||||
// replay against an already-running record with the same image_ref.
|
||||
ErrorCodeReplayNoOp = "replay_no_op"
|
||||
|
||||
// ErrorCodeStartConfigInvalid reports that the start request was
|
||||
// rejected before any Docker work because of a validation failure
|
||||
// (invalid image_ref shape, missing Docker network, unwritable state
|
||||
// directory).
|
||||
ErrorCodeStartConfigInvalid = "start_config_invalid"
|
||||
|
||||
// ErrorCodeImagePullFailed reports that the image pull stage failed.
|
||||
ErrorCodeImagePullFailed = "image_pull_failed"
|
||||
|
||||
// ErrorCodeContainerStartFailed reports that `docker create` or
|
||||
// `docker start` failed, or that the runtime record could not be
|
||||
// installed after a successful Run.
|
||||
ErrorCodeContainerStartFailed = "container_start_failed"
|
||||
|
||||
// ErrorCodeConflict reports an operation incompatible with the
|
||||
// current runtime state (lease busy, running record with a different
|
||||
// image_ref, cleanup attempted on a running runtime, restart or
|
||||
// patch attempted on a removed record).
|
||||
ErrorCodeConflict = "conflict"
|
||||
|
||||
// ErrorCodeServiceUnavailable reports that a steady-state dependency
|
||||
// (Docker daemon, PostgreSQL, Redis) was unreachable for this call.
|
||||
ErrorCodeServiceUnavailable = "service_unavailable"
|
||||
|
||||
// ErrorCodeInternal reports an unexpected error not classified by
|
||||
// the other codes.
|
||||
ErrorCodeInternal = "internal_error"
|
||||
|
||||
// ErrorCodeInvalidRequest reports that the request was rejected
|
||||
// because of structural input validation (empty required fields,
|
||||
// unknown enum values). Used by the stop / restart / patch /
|
||||
// cleanup services for malformed Input. The start service uses the
|
||||
// stricter `start_config_invalid` code instead because every start
|
||||
// validation failure also raises an admin notification intent.
|
||||
ErrorCodeInvalidRequest = "invalid_request"
|
||||
|
||||
// ErrorCodeNotFound reports that the runtime record requested by a
|
||||
// stop, restart, patch or cleanup operation does not exist. Those
|
||||
// services raise it; the start service never does (start installs
|
||||
// the record on first call).
|
||||
ErrorCodeNotFound = "not_found"
|
||||
|
||||
// ErrorCodeImageRefNotSemver reports that a patch operation was
|
||||
// rejected because either the current or the new image reference
|
||||
// could not be parsed as a semver tag.
|
||||
ErrorCodeImageRefNotSemver = "image_ref_not_semver"
|
||||
|
||||
// ErrorCodeSemverPatchOnly reports that a patch operation was
|
||||
// rejected because the major or minor component differs between the
|
||||
// current and new image references.
|
||||
ErrorCodeSemverPatchOnly = "semver_patch_only"
|
||||
)
|
||||
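// Illustrative only: a REST handler translating these codes into HTTP
// statuses might do so roughly as sketched below. The mapping is an
// assumption for orientation, not the handlers' actual table (that lives
// with the REST layer and `rtmanager/api/internal-openapi.yaml`).
//
//	switch result.ErrorCode {
//	case "", ErrorCodeReplayNoOp:
//		status = http.StatusOK // fresh success or idempotent replay
//	case ErrorCodeInvalidRequest, ErrorCodeStartConfigInvalid,
//		ErrorCodeImageRefNotSemver, ErrorCodeSemverPatchOnly:
//		status = http.StatusBadRequest
//	case ErrorCodeNotFound:
//		status = http.StatusNotFound
//	case ErrorCodeConflict:
//		status = http.StatusConflict
//	case ErrorCodeServiceUnavailable:
//		status = http.StatusServiceUnavailable
//	default: // image_pull_failed, container_start_failed, internal_error
//		status = http.StatusInternalServerError
//	}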
@@ -0,0 +1,940 @@
|
||||
// Package startruntime implements the `start` lifecycle operation owned
|
||||
// by Runtime Manager. The service is the single orchestrator behind
|
||||
// both the asynchronous `runtime:start_jobs` consumer and the
|
||||
// synchronous `POST /api/v1/internal/runtimes/{game_id}/start` REST
|
||||
// handler; both callers obtain a deterministic Result with a stable
|
||||
// `Outcome` / `ErrorCode` pair.
|
||||
//
|
||||
// Lifecycle and failure-mode semantics follow `rtmanager/README.md
|
||||
// §Lifecycles → Start`. Design rationale is captured in
|
||||
// `rtmanager/docs/services.md`.
|
||||
package startruntime
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/rand"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"galaxy/notificationintent"
|
||||
"galaxy/rtmanager/internal/config"
|
||||
"galaxy/rtmanager/internal/domain/health"
|
||||
"galaxy/rtmanager/internal/domain/operation"
|
||||
"galaxy/rtmanager/internal/domain/runtime"
|
||||
"galaxy/rtmanager/internal/logging"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
"galaxy/rtmanager/internal/telemetry"
|
||||
|
||||
"github.com/distribution/reference"
|
||||
)
|
||||
|
||||
// Container labels applied to every engine container created by the
|
||||
// start service. Frozen by `rtmanager/README.md §Container Model`.
|
||||
const (
|
||||
LabelOwner = "com.galaxy.owner"
|
||||
LabelOwnerValue = "rtmanager"
|
||||
LabelKind = "com.galaxy.kind"
|
||||
LabelKindValue = "game-engine"
|
||||
LabelGameID = "com.galaxy.game_id"
|
||||
LabelEngineImageRef = "com.galaxy.engine_image_ref"
|
||||
LabelStartedAtMs = "com.galaxy.started_at_ms"
|
||||
|
||||
// Image labels read at start time to derive resource limits.
|
||||
imageLabelCPUQuota = "com.galaxy.cpu_quota"
|
||||
imageLabelMemory = "com.galaxy.memory"
|
||||
imageLabelPIDsLimit = "com.galaxy.pids_limit"
|
||||
|
||||
// HostnamePrefix is the constant prefix used to build the per-game
|
||||
// container hostname (`galaxy-game-{game_id}`). The full hostname
|
||||
// also forms the container name; restart and patch keep the same
|
||||
// value so the engine endpoint stays stable across container
|
||||
// recreates.
|
||||
HostnamePrefix = "galaxy-game-"
|
||||
|
||||
// EngineStateBackCompatEnvName is the secondary env var name v1
|
||||
// engines accept for the bind-mounted state directory. Always set
|
||||
// alongside the configured primary name to honour the v1 backward
|
||||
// compatibility commitment in `rtmanager/README.md §Container Model`.
|
||||
EngineStateBackCompatEnvName = "STORAGE_PATH"
|
||||
|
||||
// leaseReleaseTimeout bounds the deferred lease-release call. A
|
||||
// fresh background context is used so the release runs even when
|
||||
// the request context was already canceled.
|
||||
leaseReleaseTimeout = 5 * time.Second
|
||||
)
|
||||
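// For orientation, the concrete values produced for the test fixtures
// (game id "game-1", image "registry.example.com/galaxy/game:1.4.7",
// operation started 2026-04-27T12:00:00Z) would be:
//
//	hostname / container name: galaxy-game-game-1
//	com.galaxy.owner            = "rtmanager"
//	com.galaxy.kind             = "game-engine"
//	com.galaxy.game_id          = "game-1"
//	com.galaxy.engine_image_ref = "registry.example.com/galaxy/game:1.4.7"
//	com.galaxy.started_at_ms    = "1777291200000"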
|
||||
// Input stores the per-call arguments for one start operation.
|
||||
type Input struct {
|
||||
// GameID identifies the platform game to start.
|
||||
GameID string
|
||||
|
||||
// ImageRef stores the producer-resolved Docker reference of the
|
||||
// engine image. Validated against `distribution/reference` before
|
||||
// any Docker work.
|
||||
ImageRef string
|
||||
|
||||
// OpSource classifies how the request entered Runtime Manager.
|
||||
// Required: every operation_log entry carries an op_source.
|
||||
OpSource operation.OpSource
|
||||
|
||||
// SourceRef stores the optional opaque per-source reference
|
||||
// (Redis Stream entry id, REST request id, admin user id). Empty
|
||||
// when the caller does not provide one.
|
||||
SourceRef string
|
||||
}
|
||||
|
||||
// Validate reports whether input carries the structural invariants the
|
||||
// service requires.
|
||||
func (input Input) Validate() error {
|
||||
if strings.TrimSpace(input.GameID) == "" {
|
||||
return fmt.Errorf("game id must not be empty")
|
||||
}
|
||||
if strings.TrimSpace(input.ImageRef) == "" {
|
||||
return fmt.Errorf("image ref must not be empty")
|
||||
}
|
||||
if !input.OpSource.IsKnown() {
|
||||
return fmt.Errorf("op source %q is unsupported", input.OpSource)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Result stores the deterministic outcome of one Handle call.
|
||||
type Result struct {
|
||||
// Record carries the runtime record installed by the operation.
|
||||
// Populated on success and on idempotent replay (`replay_no_op`);
|
||||
// zero on failure.
|
||||
Record runtime.RuntimeRecord
|
||||
|
||||
// Outcome reports whether the operation completed (success) or
|
||||
// produced a stable failure code.
|
||||
Outcome operation.Outcome
|
||||
|
||||
// ErrorCode stores the stable error code on failure, or
|
||||
// `replay_no_op` on idempotent replay. Empty for fresh successes.
|
||||
ErrorCode string
|
||||
|
||||
// ErrorMessage stores the operator-readable detail on failure.
|
||||
// Empty for successes.
|
||||
ErrorMessage string
|
||||
}
|
||||
|
||||
// Dependencies groups the collaborators required by Service.
|
||||
type Dependencies struct {
|
||||
// RuntimeRecords reads and installs the durable runtime record.
|
||||
RuntimeRecords ports.RuntimeRecordStore
|
||||
|
||||
// OperationLogs records the success / failure audit entry.
|
||||
OperationLogs ports.OperationLogStore
|
||||
|
||||
// Docker drives the Docker daemon (network check, pull, inspect,
|
||||
// run, remove).
|
||||
Docker ports.DockerClient
|
||||
|
||||
// Leases serialises operations against the same game id.
|
||||
Leases ports.GameLeaseStore
|
||||
|
||||
// HealthEvents publishes `runtime:health_events` and upserts the
|
||||
// matching `health_snapshots` row.
|
||||
HealthEvents ports.HealthEventPublisher
|
||||
|
||||
// Notifications publishes admin-only failure intents.
|
||||
Notifications ports.NotificationIntentPublisher
|
||||
|
||||
// Lobby provides best-effort diagnostic context for the started
|
||||
// game. May be nil; the start operation does not depend on it.
|
||||
Lobby ports.LobbyInternalClient
|
||||
|
||||
// Container groups the per-container defaults and state-directory
|
||||
// settings consumed at start time.
|
||||
Container config.ContainerConfig
|
||||
|
||||
	// DockerCfg groups the Docker daemon settings (network, log driver,
	// pull policy) consumed at start time.
|
||||
DockerCfg config.DockerConfig
|
||||
|
||||
// Coordination supplies the per-game lease TTL.
|
||||
Coordination config.CoordinationConfig
|
||||
|
||||
// Telemetry records start outcomes, lease latency, and health
|
||||
// event counters. Required.
|
||||
Telemetry *telemetry.Runtime
|
||||
|
||||
// Logger records structured service-level events. Defaults to
|
||||
// `slog.Default()` when nil.
|
||||
Logger *slog.Logger
|
||||
|
||||
// Clock supplies the wall-clock used for operation timestamps.
|
||||
// Defaults to `time.Now` when nil.
|
||||
Clock func() time.Time
|
||||
|
||||
// NewToken supplies a unique opaque lease token. Defaults to a
|
||||
// 32-byte random base64url string when nil. Tests may override.
|
||||
NewToken func() string
|
||||
|
||||
// PrepareStateDir creates the per-game state directory and
|
||||
// returns its absolute host path. Defaults to a real-filesystem
|
||||
// implementation that honours Container.GameStateRoot,
|
||||
// Container.GameStateDirMode, and Container.GameStateOwner{UID,GID}.
|
||||
// Tests override to point at a temporary directory.
|
||||
PrepareStateDir func(gameID string) (string, error)
|
||||
}
|
||||
|
||||
// Service executes the start lifecycle operation.
|
||||
type Service struct {
|
||||
runtimeRecords ports.RuntimeRecordStore
|
||||
operationLogs ports.OperationLogStore
|
||||
docker ports.DockerClient
|
||||
leases ports.GameLeaseStore
|
||||
healthEvents ports.HealthEventPublisher
|
||||
notifications ports.NotificationIntentPublisher
|
||||
lobby ports.LobbyInternalClient
|
||||
|
||||
containerCfg config.ContainerConfig
|
||||
dockerCfg config.DockerConfig
|
||||
leaseTTL time.Duration
|
||||
|
||||
telemetry *telemetry.Runtime
|
||||
logger *slog.Logger
|
||||
|
||||
clock func() time.Time
|
||||
newToken func() string
|
||||
prepareStateDir func(gameID string) (string, error)
|
||||
}
|
||||
|
||||
// NewService constructs one Service from deps.
|
||||
func NewService(deps Dependencies) (*Service, error) {
|
||||
switch {
|
||||
case deps.RuntimeRecords == nil:
|
||||
return nil, errors.New("new start runtime service: nil runtime records")
|
||||
case deps.OperationLogs == nil:
|
||||
return nil, errors.New("new start runtime service: nil operation logs")
|
||||
case deps.Docker == nil:
|
||||
return nil, errors.New("new start runtime service: nil docker client")
|
||||
case deps.Leases == nil:
|
||||
return nil, errors.New("new start runtime service: nil lease store")
|
||||
case deps.HealthEvents == nil:
|
||||
return nil, errors.New("new start runtime service: nil health events publisher")
|
||||
case deps.Notifications == nil:
|
||||
return nil, errors.New("new start runtime service: nil notification publisher")
|
||||
case deps.Telemetry == nil:
|
||||
return nil, errors.New("new start runtime service: nil telemetry runtime")
|
||||
}
|
||||
if err := deps.Container.Validate(); err != nil {
|
||||
return nil, fmt.Errorf("new start runtime service: container config: %w", err)
|
||||
}
|
||||
if err := deps.DockerCfg.Validate(); err != nil {
|
||||
return nil, fmt.Errorf("new start runtime service: docker config: %w", err)
|
||||
}
|
||||
if err := deps.Coordination.Validate(); err != nil {
|
||||
return nil, fmt.Errorf("new start runtime service: coordination config: %w", err)
|
||||
}
|
||||
|
||||
clock := deps.Clock
|
||||
if clock == nil {
|
||||
clock = time.Now
|
||||
}
|
||||
logger := deps.Logger
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
logger = logger.With("service", "rtmanager.startruntime")
|
||||
|
||||
newToken := deps.NewToken
|
||||
if newToken == nil {
|
||||
newToken = defaultTokenGenerator()
|
||||
}
|
||||
prepareStateDir := deps.PrepareStateDir
|
||||
if prepareStateDir == nil {
|
||||
prepareStateDir = newDefaultStateDirPreparer(deps.Container)
|
||||
}
|
||||
|
||||
return &Service{
|
||||
runtimeRecords: deps.RuntimeRecords,
|
||||
operationLogs: deps.OperationLogs,
|
||||
docker: deps.Docker,
|
||||
leases: deps.Leases,
|
||||
healthEvents: deps.HealthEvents,
|
||||
notifications: deps.Notifications,
|
||||
lobby: deps.Lobby,
|
||||
containerCfg: deps.Container,
|
||||
dockerCfg: deps.DockerCfg,
|
||||
leaseTTL: deps.Coordination.GameLeaseTTL,
|
||||
telemetry: deps.Telemetry,
|
||||
logger: logger,
|
||||
clock: clock,
|
||||
newToken: newToken,
|
||||
prepareStateDir: prepareStateDir,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Handle executes one start operation end-to-end. The Go-level error
|
||||
// return is reserved for non-business failures (nil context, nil
|
||||
// receiver). Every business outcome — fresh success, idempotent
|
||||
// replay, or any of the stable failure modes — flows through Result.
|
||||
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
|
||||
if service == nil {
|
||||
return Result{}, errors.New("start runtime: nil service")
|
||||
}
|
||||
if ctx == nil {
|
||||
return Result{}, errors.New("start runtime: nil context")
|
||||
}
|
||||
|
||||
opStartedAt := service.clock().UTC()
|
||||
|
||||
if err := input.Validate(); err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: ErrorCodeStartConfigInvalid,
|
||||
errorMessage: err.Error(),
|
||||
notificationType: notificationintent.NotificationTypeRuntimeStartConfigInvalid,
|
||||
}), nil
|
||||
}
|
||||
|
||||
token := service.newToken()
|
||||
leaseStart := service.clock()
|
||||
acquired, err := service.leases.TryAcquire(ctx, input.GameID, token, service.leaseTTL)
|
||||
service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(leaseStart))
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: ErrorCodeServiceUnavailable,
|
||||
errorMessage: fmt.Sprintf("acquire game lease: %s", err.Error()),
|
||||
}), nil
|
||||
}
|
||||
if !acquired {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: ErrorCodeConflict,
|
||||
errorMessage: "another lifecycle operation is in progress for this game",
|
||||
}), nil
|
||||
}
|
||||
defer service.releaseLease(ctx, input.GameID, token)
|
||||
|
||||
return service.runUnderLease(ctx, input, opStartedAt)
|
||||
}
|
||||
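// A minimal caller sketch, written from outside the package. The routing
// and the helpers useRecord / retryLater / reportFailure are hypothetical
// placeholders; the real REST handler and stream consumer live elsewhere
// in rtmanager.
//
//	result, err := svc.Handle(ctx, startruntime.Input{
//		GameID:   "game-1",
//		ImageRef: "registry.example.com/galaxy/game:1.4.7",
//		OpSource: operation.OpSourceGMRest,
//	})
//	if err != nil {
//		return err // nil receiver or nil context only (programming error)
//	}
//	switch {
//	case result.Outcome == operation.OutcomeSuccess:
//		useRecord(result.Record) // fresh start or replay_no_op
//	case result.ErrorCode == startruntime.ErrorCodeConflict:
//		retryLater() // lease busy or incompatible running record
//	default:
//		reportFailure(result.ErrorCode, result.ErrorMessage)
//	}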
|
||||
// Run executes the start lifecycle assuming the per-game lease is
|
||||
// already held by the caller. The method is reserved for orchestrator
|
||||
// services in `internal/service/` that compose start with another
|
||||
// operation under a single outer lease (restart and patch). External
|
||||
// callers must use Handle, which acquires and releases the lease
|
||||
// itself.
|
||||
//
|
||||
// Run still validates input and reports business outcomes through
|
||||
// Result; the Go-level error return is reserved for non-business
|
||||
// failures (nil context, nil receiver). Operation log entries,
|
||||
// telemetry counters, health events and admin-only notification
|
||||
// intents fire identically to Handle.
|
||||
func (service *Service) Run(ctx context.Context, input Input) (Result, error) {
|
||||
if service == nil {
|
||||
return Result{}, errors.New("start runtime: nil service")
|
||||
}
|
||||
if ctx == nil {
|
||||
return Result{}, errors.New("start runtime: nil context")
|
||||
}
|
||||
|
||||
opStartedAt := service.clock().UTC()
|
||||
|
||||
if err := input.Validate(); err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: ErrorCodeStartConfigInvalid,
|
||||
errorMessage: err.Error(),
|
||||
notificationType: notificationintent.NotificationTypeRuntimeStartConfigInvalid,
|
||||
}), nil
|
||||
}
|
||||
|
||||
return service.runUnderLease(ctx, input, opStartedAt)
|
||||
}
|
||||
|
||||
// runUnderLease executes the post-validation, lease-protected start
|
||||
// steps shared by Handle and Run. Callers must validate input and
|
||||
// acquire the lease (when applicable) before invocation.
|
||||
func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) {
|
||||
existing, hasExisting, err := service.loadExisting(ctx, input.GameID)
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: ErrorCodeInternal,
|
||||
errorMessage: fmt.Sprintf("load runtime record: %s", err.Error()),
|
||||
}), nil
|
||||
}
|
||||
if hasExisting && existing.Status == runtime.StatusRunning {
|
||||
if existing.CurrentImageRef == input.ImageRef {
|
||||
return service.recordReplayNoOp(ctx, opStartedAt, input, existing), nil
|
||||
}
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: ErrorCodeConflict,
|
||||
errorMessage: fmt.Sprintf("runtime already running with image_ref %q", existing.CurrentImageRef),
|
||||
}), nil
|
||||
}
|
||||
|
||||
service.fetchLobbyDiagnostic(ctx, input.GameID)
|
||||
|
||||
if err := validateImageRef(input.ImageRef); err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: ErrorCodeStartConfigInvalid,
|
||||
errorMessage: fmt.Sprintf("invalid image_ref: %s", err.Error()),
|
||||
notificationType: notificationintent.NotificationTypeRuntimeStartConfigInvalid,
|
||||
}), nil
|
||||
}
|
||||
|
||||
if err := service.docker.EnsureNetwork(ctx, service.dockerCfg.Network); err != nil {
|
||||
if errors.Is(err, ports.ErrNetworkMissing) {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: ErrorCodeStartConfigInvalid,
|
||||
errorMessage: fmt.Sprintf("docker network %q is missing", service.dockerCfg.Network),
|
||||
notificationType: notificationintent.NotificationTypeRuntimeStartConfigInvalid,
|
||||
}), nil
|
||||
}
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: ErrorCodeServiceUnavailable,
|
||||
errorMessage: fmt.Sprintf("ensure docker network: %s", err.Error()),
|
||||
}), nil
|
||||
}
|
||||
|
||||
if err := service.docker.PullImage(ctx, input.ImageRef, ports.PullPolicy(service.dockerCfg.PullPolicy)); err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: ErrorCodeImagePullFailed,
|
||||
errorMessage: err.Error(),
|
||||
notificationType: notificationintent.NotificationTypeRuntimeImagePullFailed,
|
||||
}), nil
|
||||
}
|
||||
|
||||
imageInspect, err := service.docker.InspectImage(ctx, input.ImageRef)
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: ErrorCodeImagePullFailed,
|
||||
errorMessage: fmt.Sprintf("inspect image: %s", err.Error()),
|
||||
notificationType: notificationintent.NotificationTypeRuntimeImagePullFailed,
|
||||
}), nil
|
||||
}
|
||||
cpuQuota, memory, pidsLimit := service.resolveLimits(imageInspect.Labels)
|
||||
|
||||
statePath, err := service.prepareStateDir(input.GameID)
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: ErrorCodeStartConfigInvalid,
|
||||
errorMessage: fmt.Sprintf("prepare state directory: %s", err.Error()),
|
||||
notificationType: notificationintent.NotificationTypeRuntimeStartConfigInvalid,
|
||||
}), nil
|
||||
}
|
||||
|
||||
hostname := containerHostname(input.GameID)
|
||||
spec := ports.RunSpec{
|
||||
Name: hostname,
|
||||
Image: input.ImageRef,
|
||||
Hostname: hostname,
|
||||
Network: service.dockerCfg.Network,
|
||||
Env: service.buildEnv(),
|
||||
Labels: service.buildLabels(input.GameID, input.ImageRef, opStartedAt),
|
||||
BindMounts: []ports.BindMount{{
|
||||
HostPath: statePath,
|
||||
MountPath: service.containerCfg.EngineStateMountPath,
|
||||
ReadOnly: false,
|
||||
}},
|
||||
LogDriver: service.dockerCfg.LogDriver,
|
||||
LogOpts: parseLogOpts(service.dockerCfg.LogOpts),
|
||||
CPUQuota: cpuQuota,
|
||||
Memory: memory,
|
||||
PIDsLimit: pidsLimit,
|
||||
}
|
||||
runResult, err := service.docker.Run(ctx, spec)
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: ErrorCodeContainerStartFailed,
|
||||
errorMessage: err.Error(),
|
||||
notificationType: notificationintent.NotificationTypeRuntimeContainerStartFailed,
|
||||
}), nil
|
||||
}
|
||||
|
||||
createdAt := opStartedAt
|
||||
if hasExisting && !existing.CreatedAt.IsZero() {
|
||||
createdAt = existing.CreatedAt
|
||||
}
|
||||
startedAt := runResult.StartedAt
|
||||
record := runtime.RuntimeRecord{
|
||||
GameID: input.GameID,
|
||||
Status: runtime.StatusRunning,
|
||||
CurrentContainerID: runResult.ContainerID,
|
||||
CurrentImageRef: input.ImageRef,
|
||||
EngineEndpoint: runResult.EngineEndpoint,
|
||||
StatePath: statePath,
|
||||
DockerNetwork: service.dockerCfg.Network,
|
||||
StartedAt: &startedAt,
|
||||
LastOpAt: startedAt,
|
||||
CreatedAt: createdAt,
|
||||
}
|
||||
if err := service.runtimeRecords.Upsert(ctx, record); err != nil {
|
||||
service.bestEffortRemove(input.GameID, runResult.ContainerID)
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: ErrorCodeContainerStartFailed,
|
||||
errorMessage: fmt.Sprintf("upsert runtime record: %s", err.Error()),
|
||||
containerID: runResult.ContainerID,
|
||||
notificationType: notificationintent.NotificationTypeRuntimeContainerStartFailed,
|
||||
}), nil
|
||||
}
|
||||
|
||||
finishedAt := service.clock().UTC()
|
||||
service.bestEffortAppend(ctx, operation.OperationEntry{
|
||||
GameID: input.GameID,
|
||||
OpKind: operation.OpKindStart,
|
||||
OpSource: input.OpSource,
|
||||
SourceRef: input.SourceRef,
|
||||
ImageRef: input.ImageRef,
|
||||
ContainerID: runResult.ContainerID,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
StartedAt: opStartedAt,
|
||||
FinishedAt: &finishedAt,
|
||||
})
|
||||
service.bestEffortPublishHealth(ctx, ports.HealthEventEnvelope{
|
||||
GameID: input.GameID,
|
||||
ContainerID: runResult.ContainerID,
|
||||
EventType: health.EventTypeContainerStarted,
|
||||
OccurredAt: startedAt,
|
||||
Details: containerStartedDetails(input.ImageRef),
|
||||
})
|
||||
|
||||
service.telemetry.RecordStartOutcome(ctx, string(operation.OutcomeSuccess), "", string(input.OpSource))
|
||||
service.telemetry.RecordHealthEvent(ctx, string(health.EventTypeContainerStarted))
|
||||
|
||||
logArgs := []any{
|
||||
"game_id", input.GameID,
|
||||
"container_id", runResult.ContainerID,
|
||||
"image_ref", input.ImageRef,
|
||||
"op_source", string(input.OpSource),
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
service.logger.InfoContext(ctx, "runtime started", logArgs...)
|
||||
|
||||
return Result{
|
||||
Record: record,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// failureCtx groups the inputs to recordFailure so the Handle method
|
||||
// stays readable.
|
||||
type failureCtx struct {
|
||||
opStartedAt time.Time
|
||||
input Input
|
||||
errorCode string
|
||||
errorMessage string
|
||||
containerID string
|
||||
notificationType notificationintent.NotificationType
|
||||
}
|
||||
|
||||
// recordFailure records the failure operation_log entry, publishes the
|
||||
// matching admin-only notification intent (when applicable), and emits
|
||||
// telemetry. All side effects are best-effort; a downstream failure is
|
||||
// logged but does not change the returned Result.
|
||||
func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result {
|
||||
finishedAt := service.clock().UTC()
|
||||
service.bestEffortAppend(ctx, operation.OperationEntry{
|
||||
GameID: fc.input.GameID,
|
||||
OpKind: operation.OpKindStart,
|
||||
OpSource: fc.input.OpSource,
|
||||
SourceRef: fc.input.SourceRef,
|
||||
ImageRef: fc.input.ImageRef,
|
||||
ContainerID: fc.containerID,
|
||||
Outcome: operation.OutcomeFailure,
|
||||
ErrorCode: fc.errorCode,
|
||||
ErrorMessage: fc.errorMessage,
|
||||
StartedAt: fc.opStartedAt,
|
||||
FinishedAt: &finishedAt,
|
||||
})
|
||||
|
||||
if fc.notificationType != "" {
|
||||
service.bestEffortNotify(ctx, fc)
|
||||
}
|
||||
|
||||
service.telemetry.RecordStartOutcome(ctx, string(operation.OutcomeFailure), fc.errorCode, string(fc.input.OpSource))
|
||||
|
||||
logArgs := []any{
|
||||
"game_id", fc.input.GameID,
|
||||
"image_ref", fc.input.ImageRef,
|
||||
"op_source", string(fc.input.OpSource),
|
||||
"error_code", fc.errorCode,
|
||||
"error_message", fc.errorMessage,
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
service.logger.WarnContext(ctx, "runtime start failed", logArgs...)
|
||||
|
||||
return Result{
|
||||
Outcome: operation.OutcomeFailure,
|
||||
ErrorCode: fc.errorCode,
|
||||
ErrorMessage: fc.errorMessage,
|
||||
}
|
||||
}
|
||||
|
||||
// recordReplayNoOp records the idempotent replay outcome and returns
|
||||
// the existing record. The operation_log entry is appended best-effort
|
||||
// so audit history captures the replay; telemetry counts the call as a
|
||||
// successful start with `error_code=replay_no_op`.
|
||||
func (service *Service) recordReplayNoOp(ctx context.Context, opStartedAt time.Time, input Input, existing runtime.RuntimeRecord) Result {
|
||||
finishedAt := service.clock().UTC()
|
||||
service.bestEffortAppend(ctx, operation.OperationEntry{
|
||||
GameID: input.GameID,
|
||||
OpKind: operation.OpKindStart,
|
||||
OpSource: input.OpSource,
|
||||
SourceRef: input.SourceRef,
|
||||
ImageRef: input.ImageRef,
|
||||
ContainerID: existing.CurrentContainerID,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
ErrorCode: ErrorCodeReplayNoOp,
|
||||
StartedAt: opStartedAt,
|
||||
FinishedAt: &finishedAt,
|
||||
})
|
||||
service.telemetry.RecordStartOutcome(ctx, string(operation.OutcomeSuccess), ErrorCodeReplayNoOp, string(input.OpSource))
|
||||
|
||||
logArgs := []any{
|
||||
"game_id", input.GameID,
|
||||
"container_id", existing.CurrentContainerID,
|
||||
"image_ref", input.ImageRef,
|
||||
"op_source", string(input.OpSource),
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
service.logger.InfoContext(ctx, "runtime start replay no-op", logArgs...)
|
||||
|
||||
return Result{
|
||||
Record: existing,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
ErrorCode: ErrorCodeReplayNoOp,
|
||||
}
|
||||
}
|
||||
|
||||
// loadExisting reads the runtime record for gameID. The boolean return
|
||||
// reports whether a record exists; ErrNotFound is translated to
|
||||
// (zero, false, nil) so the caller does not branch on the sentinel
|
||||
// elsewhere.
|
||||
func (service *Service) loadExisting(ctx context.Context, gameID string) (runtime.RuntimeRecord, bool, error) {
|
||||
record, err := service.runtimeRecords.Get(ctx, gameID)
|
||||
switch {
|
||||
case errors.Is(err, runtime.ErrNotFound):
|
||||
return runtime.RuntimeRecord{}, false, nil
|
||||
case err != nil:
|
||||
return runtime.RuntimeRecord{}, false, err
|
||||
default:
|
||||
return record, true, nil
|
||||
}
|
||||
}
|
||||
|
||||
// fetchLobbyDiagnostic best-effort enriches the request log with the
|
||||
// Lobby-side game record. A nil Lobby client or any transport failure
|
||||
// is logged and the start operation continues.
|
||||
func (service *Service) fetchLobbyDiagnostic(ctx context.Context, gameID string) {
|
||||
if service.lobby == nil {
|
||||
return
|
||||
}
|
||||
record, err := service.lobby.GetGame(ctx, gameID)
|
||||
if err != nil {
|
||||
service.logger.DebugContext(ctx, "lobby diagnostic fetch failed",
|
||||
"game_id", gameID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
return
|
||||
}
|
||||
service.logger.DebugContext(ctx, "lobby diagnostic fetched",
|
||||
"game_id", gameID,
|
||||
"lobby_status", record.Status,
|
||||
"lobby_target_engine_version", record.TargetEngineVersion,
|
||||
)
|
||||
}
|
||||
|
||||
// resolveLimits derives the per-container resource limits from the
|
||||
// resolved image's labels with config-driven fallbacks. Unparseable
|
||||
// label values silently fall back to the configured default; operators
|
||||
// see the chosen value through `rtmanager.docker_op_latency` and start
|
||||
// logs.
|
||||
func (service *Service) resolveLimits(labels map[string]string) (cpuQuota float64, memory string, pidsLimit int) {
|
||||
cpuQuota = service.containerCfg.DefaultCPUQuota
|
||||
memory = service.containerCfg.DefaultMemory
|
||||
pidsLimit = service.containerCfg.DefaultPIDsLimit
|
||||
|
||||
if raw, ok := labels[imageLabelCPUQuota]; ok {
|
||||
if value, err := strconv.ParseFloat(raw, 64); err == nil && value > 0 {
|
||||
cpuQuota = value
|
||||
}
|
||||
}
|
||||
if raw, ok := labels[imageLabelMemory]; ok && strings.TrimSpace(raw) != "" {
|
||||
memory = raw
|
||||
}
|
||||
if raw, ok := labels[imageLabelPIDsLimit]; ok {
|
||||
if value, err := strconv.Atoi(raw); err == nil && value > 0 {
|
||||
pidsLimit = value
|
||||
}
|
||||
}
|
||||
return cpuQuota, memory, pidsLimit
|
||||
}
|
||||
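// Worked example (label keys per the image-label constants above; the
// values and configured defaults are assumptions matching the test
// fixtures):
//
//	labels := map[string]string{
//		"com.galaxy.cpu_quota":  "0.5",
//		"com.galaxy.memory":     "256m",
//		"com.galaxy.pids_limit": "not-a-number",
//	}
//	// With DefaultCPUQuota=1.0, DefaultMemory="512m", DefaultPIDsLimit=512:
//	//   cpuQuota  = 0.5    (parsed, > 0)
//	//   memory    = "256m" (non-empty label wins)
//	//   pidsLimit = 512    (unparseable label falls back to the default)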
|
||||
// buildEnv assembles the env-var map handed to the engine. Both the
|
||||
// configured primary name and `STORAGE_PATH` are set per
|
||||
// `rtmanager/README.md §Container Model` v1 backward compatibility.
|
||||
func (service *Service) buildEnv() map[string]string {
|
||||
mount := service.containerCfg.EngineStateMountPath
|
||||
env := map[string]string{
|
||||
service.containerCfg.EngineStateEnvName: mount,
|
||||
EngineStateBackCompatEnvName: mount,
|
||||
}
|
||||
return env
|
||||
}
|
||||
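// With the test configuration (EngineStateEnvName="GAME_STATE_PATH",
// EngineStateMountPath="/var/lib/galaxy-game"), the engine therefore sees
// both names pointing at the same mount:
//
//	GAME_STATE_PATH=/var/lib/galaxy-game
//	STORAGE_PATH=/var/lib/galaxy-game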
|
||||
// buildLabels assembles the container labels per
|
||||
// `rtmanager/README.md §Container Model`.
|
||||
func (service *Service) buildLabels(gameID, imageRef string, startedAt time.Time) map[string]string {
|
||||
return map[string]string{
|
||||
LabelOwner: LabelOwnerValue,
|
||||
LabelKind: LabelKindValue,
|
||||
LabelGameID: gameID,
|
||||
LabelEngineImageRef: imageRef,
|
||||
LabelStartedAtMs: strconv.FormatInt(startedAt.UTC().UnixMilli(), 10),
|
||||
}
|
||||
}
|
||||
|
||||
// releaseLease releases the per-game lease in a fresh background
|
||||
// context so a canceled request context does not leave the lease
|
||||
// pinned for its TTL.
|
||||
func (service *Service) releaseLease(ctx context.Context, gameID, token string) {
|
||||
cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
|
||||
defer cancel()
|
||||
if err := service.leases.Release(cleanupCtx, gameID, token); err != nil {
|
||||
service.logger.WarnContext(ctx, "release game lease",
|
||||
"game_id", gameID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// bestEffortAppend writes one operation_log entry. A failure is logged
|
||||
// and discarded; the durable runtime record (or its absence) remains
|
||||
// the source of truth.
|
||||
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
|
||||
if _, err := service.operationLogs.Append(ctx, entry); err != nil {
|
||||
service.logger.ErrorContext(ctx, "append operation log",
|
||||
"game_id", entry.GameID,
|
||||
"op_kind", string(entry.OpKind),
|
||||
"outcome", string(entry.Outcome),
|
||||
"error_code", entry.ErrorCode,
|
||||
"err", err.Error(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// bestEffortPublishHealth emits one health event + snapshot upsert.
|
||||
// Failures degrade silently per `rtmanager/README.md §Notification
|
||||
// Contracts`; the runtime record remains the source of truth.
|
||||
func (service *Service) bestEffortPublishHealth(ctx context.Context, envelope ports.HealthEventEnvelope) {
|
||||
if err := service.healthEvents.Publish(ctx, envelope); err != nil {
|
||||
service.logger.ErrorContext(ctx, "publish health event",
|
||||
"game_id", envelope.GameID,
|
||||
"container_id", envelope.ContainerID,
|
||||
"event_type", string(envelope.EventType),
|
||||
"err", err.Error(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// bestEffortNotify publishes one admin-only failure intent. Failures
// degrade silently because the source-of-truth business state already
// reflects the outcome.
|
||||
func (service *Service) bestEffortNotify(ctx context.Context, fc failureCtx) {
|
||||
intent, err := buildFailureIntent(fc, service.clock().UTC())
|
||||
if err != nil {
|
||||
service.logger.ErrorContext(ctx, "build notification intent",
|
||||
"game_id", fc.input.GameID,
|
||||
"notification_type", string(fc.notificationType),
|
||||
"err", err.Error(),
|
||||
)
|
||||
return
|
||||
}
|
||||
if err := service.notifications.Publish(ctx, intent); err != nil {
|
||||
service.logger.ErrorContext(ctx, "publish notification intent",
|
||||
"game_id", fc.input.GameID,
|
||||
"notification_type", string(fc.notificationType),
|
||||
"err", err.Error(),
|
||||
)
|
||||
return
|
||||
}
|
||||
service.telemetry.RecordNotificationIntent(ctx, string(fc.notificationType))
|
||||
}
|
||||
|
||||
// bestEffortRemove forces removal of a container left running by a
// failed start that progressed past Run but could not register the
// runtime record. Failures degrade silently; the periodic reconciler
// adopts any orphaned container it later observes.
|
||||
func (service *Service) bestEffortRemove(gameID, containerID string) {
|
||||
cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
|
||||
defer cancel()
|
||||
if err := service.docker.Remove(cleanupCtx, containerID); err != nil {
|
||||
service.logger.ErrorContext(cleanupCtx, "rollback container after upsert failure",
|
||||
"game_id", gameID,
|
||||
"container_id", containerID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// containerHostname builds the per-game hostname that doubles as the
|
||||
// Docker container name.
|
||||
func containerHostname(gameID string) string {
|
||||
return HostnamePrefix + gameID
|
||||
}
|
||||
|
||||
// containerStartedDetails builds the `details` payload required by the
|
||||
// `container_started` AsyncAPI variant.
|
||||
func containerStartedDetails(imageRef string) json.RawMessage {
|
||||
payload := map[string]string{"image_ref": imageRef}
|
||||
encoded, _ := json.Marshal(payload)
|
||||
return encoded
|
||||
}
|
||||
|
||||
// validateImageRef rejects malformed Docker references before any
|
||||
// daemon round-trip. The validation surfaces as `start_config_invalid`;
|
||||
// daemon-side rejections after a valid parse are reported as
|
||||
// `image_pull_failed`.
|
||||
func validateImageRef(ref string) error {
|
||||
if strings.TrimSpace(ref) == "" {
|
||||
return fmt.Errorf("image ref must not be empty")
|
||||
}
|
||||
if _, err := reference.ParseNormalizedNamed(ref); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
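// Accepted and rejected shapes, for illustration (normalisation follows
// distribution/reference; the concrete refs are examples, not a contract):
//
//	validateImageRef("registry.example.com/galaxy/game:1.4.7") // nil
//	validateImageRef("galaxy/game")                            // nil (normalised against docker.io)
//	validateImageRef("::not a docker reference::")             // error → start_config_invalid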
|
||||
// parseLogOpts turns the `key=value,key2=value2` shape of the
|
||||
// `RTMANAGER_DOCKER_LOG_OPTS` config into a map suitable for the
|
||||
// Docker SDK. Empty input returns nil so the SDK uses driver defaults.
|
||||
func parseLogOpts(raw string) map[string]string {
|
||||
if strings.TrimSpace(raw) == "" {
|
||||
return nil
|
||||
}
|
||||
out := make(map[string]string)
|
||||
for part := range strings.SplitSeq(raw, ",") {
|
||||
entry := strings.TrimSpace(part)
|
||||
if entry == "" {
|
||||
continue
|
||||
}
|
||||
index := strings.IndexByte(entry, '=')
|
||||
if index <= 0 {
|
||||
continue
|
||||
}
|
||||
out[entry[:index]] = entry[index+1:]
|
||||
}
|
||||
if len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
return out
|
||||
}
|
||||
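// Usage sketch for a typical `RTMANAGER_DOCKER_LOG_OPTS` value (the option
// keys are json-file driver examples, not a frozen set):
//
//	parseLogOpts("max-size=10m,max-file=3")
//	// → map[string]string{"max-size": "10m", "max-file": "3"}
//	parseLogOpts("")        // → nil: Docker applies the driver defaults
//	parseLogOpts("garbage") // → nil: entries without '=' are skipped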
|
||||
// buildFailureIntent constructs the admin-only notification intent for
// fc. The idempotency key is scoped per (notification_type, game_id,
// attempted_at_ms) so the same failure observed twice in the same
// millisecond is recognised as a duplicate by Notification Service.
|
||||
func buildFailureIntent(fc failureCtx, attemptedAt time.Time) (notificationintent.Intent, error) {
|
||||
attemptedAtMs := attemptedAt.UnixMilli()
|
||||
idempotencyKey := fmt.Sprintf("%s.%s.%d", fc.notificationType, fc.input.GameID, attemptedAtMs)
|
||||
metadata := notificationintent.Metadata{
|
||||
IdempotencyKey: idempotencyKey,
|
||||
OccurredAt: attemptedAt,
|
||||
}
|
||||
|
||||
switch fc.notificationType {
|
||||
case notificationintent.NotificationTypeRuntimeImagePullFailed:
|
||||
return notificationintent.NewRuntimeImagePullFailedIntent(metadata, notificationintent.RuntimeImagePullFailedPayload{
|
||||
GameID: fc.input.GameID,
|
||||
ImageRef: fc.input.ImageRef,
|
||||
ErrorCode: fc.errorCode,
|
||||
ErrorMessage: fc.errorMessage,
|
||||
AttemptedAtMs: attemptedAtMs,
|
||||
})
|
||||
case notificationintent.NotificationTypeRuntimeContainerStartFailed:
|
||||
return notificationintent.NewRuntimeContainerStartFailedIntent(metadata, notificationintent.RuntimeContainerStartFailedPayload{
|
||||
GameID: fc.input.GameID,
|
||||
ImageRef: fc.input.ImageRef,
|
||||
ErrorCode: fc.errorCode,
|
||||
ErrorMessage: fc.errorMessage,
|
||||
AttemptedAtMs: attemptedAtMs,
|
||||
})
|
||||
case notificationintent.NotificationTypeRuntimeStartConfigInvalid:
|
||||
return notificationintent.NewRuntimeStartConfigInvalidIntent(metadata, notificationintent.RuntimeStartConfigInvalidPayload{
|
||||
GameID: fc.input.GameID,
|
||||
ImageRef: fc.input.ImageRef,
|
||||
ErrorCode: fc.errorCode,
|
||||
ErrorMessage: fc.errorMessage,
|
||||
AttemptedAtMs: attemptedAtMs,
|
||||
})
|
||||
default:
|
||||
return notificationintent.Intent{}, fmt.Errorf("unsupported notification type %q", fc.notificationType)
|
||||
}
|
||||
}
|
||||
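// Key shape, for illustration (the notification type's wire value is
// owned by the notificationintent package and not restated here):
//
//	fmt.Sprintf("%s.%s.%d", notificationType, gameID, attemptedAtMs)
//	// e.g. "<notification_type>.game-1.1777291200000"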
|
||||
// defaultTokenGenerator returns a function that produces tokens from 32
// random bytes, base64url-encoded. The randomness source is
// `crypto/rand`; on failure it returns a fixed fallback token so the
// caller observes a TryAcquire collision rather than a panic on a
// degraded entropy source.
|
||||
func defaultTokenGenerator() func() string {
|
||||
return func() string {
|
||||
var buf [32]byte
|
||||
if _, err := rand.Read(buf[:]); err != nil {
|
||||
return "rtmanager-fallback-token"
|
||||
}
|
||||
return base64.RawURLEncoding.EncodeToString(buf[:])
|
||||
}
|
||||
}
|
||||
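// Shape check, for illustration: 32 random bytes encode to a 43-character
// unpadded base64url string, so
//
//	token := defaultTokenGenerator()()
//	// len(token) == 43, alphabet [A-Za-z0-9_-]; on an entropy failure the
//	// constant fallback string is returned instead.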
|
||||
// newDefaultStateDirPreparer returns a function that creates the
|
||||
// per-game state directory under cfg.GameStateRoot with the configured
|
||||
// permissions and ownership. The function is overridable through
|
||||
// Dependencies.PrepareStateDir; tests inject a temporary-dir fake.
|
||||
func newDefaultStateDirPreparer(cfg config.ContainerConfig) func(gameID string) (string, error) {
|
||||
mode := os.FileMode(cfg.GameStateDirMode)
|
||||
uid := cfg.GameStateOwnerUID
|
||||
gid := cfg.GameStateOwnerGID
|
||||
root := cfg.GameStateRoot
|
||||
return func(gameID string) (string, error) {
|
||||
path := filepath.Join(root, gameID)
|
||||
if err := os.MkdirAll(path, mode); err != nil {
|
||||
return "", fmt.Errorf("create state dir %q: %w", path, err)
|
||||
}
|
||||
if err := os.Chmod(path, mode); err != nil {
|
||||
return "", fmt.Errorf("chmod state dir %q: %w", path, err)
|
||||
}
|
||||
if err := os.Chown(path, uid, gid); err != nil {
|
||||
return "", fmt.Errorf("chown state dir %q: %w", path, err)
|
||||
}
|
||||
return path, nil
|
||||
}
|
||||
}
|
||||
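// For illustration, with GameStateRoot="/var/lib/galaxy/games",
// GameStateDirMode=0o750 and owner UID/GID 1000 (values assumed), a start
// for game "game-1" prepares:
//
//	/var/lib/galaxy/games/game-1   drwxr-x---   1000:1000
//
// The explicit Chmod guards against the process umask narrowing the mode
// that MkdirAll applied.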
@@ -0,0 +1,693 @@
|
||||
package startruntime_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"galaxy/notificationintent"
|
||||
"galaxy/rtmanager/internal/adapters/docker/mocks"
|
||||
"galaxy/rtmanager/internal/config"
|
||||
"galaxy/rtmanager/internal/domain/health"
|
||||
"galaxy/rtmanager/internal/domain/operation"
|
||||
"galaxy/rtmanager/internal/domain/runtime"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
"galaxy/rtmanager/internal/service/startruntime"
|
||||
"galaxy/rtmanager/internal/telemetry"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"go.uber.org/mock/gomock"
|
||||
)
|
||||
|
||||
// --- test doubles -----------------------------------------------------
|
||||
|
||||
type fakeRuntimeRecords struct {
|
||||
mu sync.Mutex
|
||||
stored map[string]runtime.RuntimeRecord
|
||||
getErr error
|
||||
upsertErr error
|
||||
upserts []runtime.RuntimeRecord
|
||||
}
|
||||
|
||||
func newFakeRuntimeRecords() *fakeRuntimeRecords {
|
||||
return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}}
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
if s.getErr != nil {
|
||||
return runtime.RuntimeRecord{}, s.getErr
|
||||
}
|
||||
record, ok := s.stored[gameID]
|
||||
if !ok {
|
||||
return runtime.RuntimeRecord{}, runtime.ErrNotFound
|
||||
}
|
||||
return record, nil
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) Upsert(_ context.Context, record runtime.RuntimeRecord) error {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
if s.upsertErr != nil {
|
||||
return s.upsertErr
|
||||
}
|
||||
s.upserts = append(s.upserts, record)
|
||||
s.stored[record.GameID] = record
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error {
|
||||
return errors.New("not used in start tests")
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) {
|
||||
return nil, errors.New("not used in start tests")
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
|
||||
return nil, errors.New("not used in start tests")
|
||||
}
|
||||
|
||||
type fakeOperationLogs struct {
|
||||
mu sync.Mutex
|
||||
appendErr error
|
||||
appends []operation.OperationEntry
|
||||
}
|
||||
|
||||
func (s *fakeOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
if s.appendErr != nil {
|
||||
return 0, s.appendErr
|
||||
}
|
||||
s.appends = append(s.appends, entry)
|
||||
return int64(len(s.appends)), nil
|
||||
}
|
||||
|
||||
func (s *fakeOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) {
|
||||
return nil, errors.New("not used in start tests")
|
||||
}
|
||||
|
||||
func (s *fakeOperationLogs) lastAppend() (operation.OperationEntry, bool) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
if len(s.appends) == 0 {
|
||||
return operation.OperationEntry{}, false
|
||||
}
|
||||
return s.appends[len(s.appends)-1], true
|
||||
}
|
||||
|
||||
type fakeLeases struct {
|
||||
acquired bool
|
||||
acquireErr error
|
||||
releaseErr error
|
||||
|
||||
mu sync.Mutex
|
||||
acquires []string
|
||||
releases []string
|
||||
}
|
||||
|
||||
func (l *fakeLeases) TryAcquire(_ context.Context, _, token string, _ time.Duration) (bool, error) {
|
||||
l.mu.Lock()
|
||||
defer l.mu.Unlock()
|
||||
l.acquires = append(l.acquires, token)
|
||||
if l.acquireErr != nil {
|
||||
return false, l.acquireErr
|
||||
}
|
||||
return l.acquired, nil
|
||||
}
|
||||
|
||||
func (l *fakeLeases) Release(_ context.Context, _, token string) error {
|
||||
l.mu.Lock()
|
||||
defer l.mu.Unlock()
|
||||
l.releases = append(l.releases, token)
|
||||
return l.releaseErr
|
||||
}
|
||||
|
||||
type fakeHealthEvents struct {
|
||||
mu sync.Mutex
|
||||
publishErr error
|
||||
envelopes []ports.HealthEventEnvelope
|
||||
}
|
||||
|
||||
func (h *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error {
|
||||
h.mu.Lock()
|
||||
defer h.mu.Unlock()
|
||||
if h.publishErr != nil {
|
||||
return h.publishErr
|
||||
}
|
||||
h.envelopes = append(h.envelopes, envelope)
|
||||
return nil
|
||||
}
|
||||
|
||||
type fakeNotifications struct {
|
||||
mu sync.Mutex
|
||||
publishErr error
|
||||
intents []notificationintent.Intent
|
||||
}
|
||||
|
||||
func (n *fakeNotifications) Publish(_ context.Context, intent notificationintent.Intent) error {
|
||||
n.mu.Lock()
|
||||
defer n.mu.Unlock()
|
||||
if n.publishErr != nil {
|
||||
return n.publishErr
|
||||
}
|
||||
n.intents = append(n.intents, intent)
|
||||
return nil
|
||||
}
|
||||
|
||||
type fakeLobby struct {
|
||||
record ports.LobbyGameRecord
|
||||
err error
|
||||
|
||||
mu sync.Mutex
|
||||
calls []string
|
||||
}
|
||||
|
||||
func (l *fakeLobby) GetGame(_ context.Context, gameID string) (ports.LobbyGameRecord, error) {
|
||||
l.mu.Lock()
|
||||
defer l.mu.Unlock()
|
||||
l.calls = append(l.calls, gameID)
|
||||
if l.err != nil {
|
||||
return ports.LobbyGameRecord{}, l.err
|
||||
}
|
||||
return l.record, nil
|
||||
}
|
||||
|
||||
// --- harness ----------------------------------------------------------
|
||||
|
||||
type harness struct {
|
||||
records *fakeRuntimeRecords
|
||||
operationLogs *fakeOperationLogs
|
||||
docker *mocks.MockDockerClient
|
||||
leases *fakeLeases
|
||||
healthEvents *fakeHealthEvents
|
||||
notifications *fakeNotifications
|
||||
lobby *fakeLobby
|
||||
telemetry *telemetry.Runtime
|
||||
|
||||
now time.Time
|
||||
stateDir string
|
||||
}
|
||||
|
||||
func newHarness(t *testing.T) *harness {
|
||||
t.Helper()
|
||||
ctrl := gomock.NewController(t)
|
||||
t.Cleanup(ctrl.Finish)
|
||||
|
||||
telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
|
||||
require.NoError(t, err)
|
||||
|
||||
return &harness{
|
||||
records: newFakeRuntimeRecords(),
|
||||
operationLogs: &fakeOperationLogs{},
|
||||
docker: mocks.NewMockDockerClient(ctrl),
|
||||
leases: &fakeLeases{acquired: true},
|
||||
healthEvents: &fakeHealthEvents{},
|
||||
notifications: &fakeNotifications{},
|
||||
lobby: &fakeLobby{},
|
||||
telemetry: telemetryRuntime,
|
||||
now: time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC),
|
||||
stateDir: "/var/lib/galaxy/games/game-1",
|
||||
}
|
||||
}
|
||||
|
||||
func (h *harness) build(t *testing.T) *startruntime.Service {
|
||||
t.Helper()
|
||||
|
||||
containerCfg := config.ContainerConfig{
|
||||
DefaultCPUQuota: 1.0,
|
||||
DefaultMemory: "512m",
|
||||
DefaultPIDsLimit: 512,
|
||||
StopTimeout: 30 * time.Second,
|
||||
Retention: 30 * 24 * time.Hour,
|
||||
EngineStateMountPath: "/var/lib/galaxy-game",
|
||||
EngineStateEnvName: "GAME_STATE_PATH",
|
||||
GameStateDirMode: 0o750,
|
||||
GameStateRoot: "/var/lib/galaxy/games",
|
||||
}
|
||||
dockerCfg := config.DockerConfig{
|
||||
Host: "unix:///var/run/docker.sock",
|
||||
Network: "galaxy-net",
|
||||
LogDriver: "json-file",
|
||||
PullPolicy: config.ImagePullPolicyIfMissing,
|
||||
}
|
||||
coordinationCfg := config.CoordinationConfig{GameLeaseTTL: time.Minute}
|
||||
|
||||
service, err := startruntime.NewService(startruntime.Dependencies{
|
||||
RuntimeRecords: h.records,
|
||||
OperationLogs: h.operationLogs,
|
||||
Docker: h.docker,
|
||||
Leases: h.leases,
|
||||
HealthEvents: h.healthEvents,
|
||||
Notifications: h.notifications,
|
||||
Lobby: h.lobby,
|
||||
Container: containerCfg,
|
||||
DockerCfg: dockerCfg,
|
||||
Coordination: coordinationCfg,
|
||||
Telemetry: h.telemetry,
|
||||
Clock: func() time.Time { return h.now },
|
||||
NewToken: func() string { return "token-A" },
|
||||
PrepareStateDir: func(_ string) (string, error) {
|
||||
return h.stateDir, nil
|
||||
},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
return service
|
||||
}
|
||||
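// Individual tests drive failure paths by mutating the fakes before
// calling build; a sketch (the conflict test below uses the same pattern,
// the upsert error is shown as a further example):
//
//	h := newHarness(t)
//	h.leases.acquired = false                // lease busy → conflict
//	h.records.upsertErr = errors.New("boom") // record install fails
//	service := h.build(t)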
|
||||
func basicInput() startruntime.Input {
|
||||
return startruntime.Input{
|
||||
GameID: "game-1",
|
||||
ImageRef: "registry.example.com/galaxy/game:1.4.7",
|
||||
OpSource: operation.OpSourceLobbyStream,
|
||||
SourceRef: "1700000000000-0",
|
||||
}
|
||||
}
|
||||
|
||||
func sampleRunResult(now time.Time) ports.RunResult {
|
||||
return ports.RunResult{
|
||||
ContainerID: "ctr-123",
|
||||
EngineEndpoint: "http://galaxy-game-game-1:8080",
|
||||
StartedAt: now,
|
||||
}
|
||||
}
|
||||
|
||||
// --- happy path -------------------------------------------------------
|
||||
|
||||
func TestHandleHappyPath(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
input := basicInput()
|
||||
|
||||
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
|
||||
h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, ports.PullPolicy(config.ImagePullPolicyIfMissing)).Return(nil)
|
||||
h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{
|
||||
Ref: input.ImageRef,
|
||||
Labels: map[string]string{
|
||||
"com.galaxy.cpu_quota": "0.5",
|
||||
"com.galaxy.memory": "256m",
|
||||
"com.galaxy.pids_limit": "256",
|
||||
},
|
||||
}, nil)
|
||||
h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).DoAndReturn(func(_ context.Context, spec ports.RunSpec) (ports.RunResult, error) {
|
||||
assert.Equal(t, "galaxy-game-game-1", spec.Name)
|
||||
assert.Equal(t, "galaxy-game-game-1", spec.Hostname)
|
||||
assert.Equal(t, input.ImageRef, spec.Image)
|
||||
assert.Equal(t, "galaxy-net", spec.Network)
|
||||
assert.Equal(t, "json-file", spec.LogDriver)
|
||||
assert.InDelta(t, 0.5, spec.CPUQuota, 0)
|
||||
assert.Equal(t, "256m", spec.Memory)
|
||||
assert.Equal(t, 256, spec.PIDsLimit)
|
||||
assert.Equal(t, h.stateDir, spec.BindMounts[0].HostPath)
|
||||
assert.Equal(t, "/var/lib/galaxy-game", spec.BindMounts[0].MountPath)
|
||||
assert.Equal(t, "/var/lib/galaxy-game", spec.Env["GAME_STATE_PATH"])
|
||||
assert.Equal(t, "/var/lib/galaxy-game", spec.Env["STORAGE_PATH"])
|
||||
assert.Equal(t, "rtmanager", spec.Labels[startruntime.LabelOwner])
|
||||
assert.Equal(t, "game-engine", spec.Labels[startruntime.LabelKind])
|
||||
assert.Equal(t, input.GameID, spec.Labels[startruntime.LabelGameID])
|
||||
assert.Equal(t, input.ImageRef, spec.Labels[startruntime.LabelEngineImageRef])
|
||||
return sampleRunResult(h.now), nil
|
||||
})
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
|
||||
assert.Empty(t, result.ErrorCode)
|
||||
assert.Equal(t, runtime.StatusRunning, result.Record.Status)
|
||||
assert.Equal(t, "ctr-123", result.Record.CurrentContainerID)
|
||||
assert.Equal(t, input.ImageRef, result.Record.CurrentImageRef)
|
||||
assert.Equal(t, "http://galaxy-game-game-1:8080", result.Record.EngineEndpoint)
|
||||
assert.Equal(t, h.stateDir, result.Record.StatePath)
|
||||
assert.Equal(t, "galaxy-net", result.Record.DockerNetwork)
|
||||
require.NotNil(t, result.Record.StartedAt)
|
||||
assert.Equal(t, h.now, *result.Record.StartedAt)
|
||||
assert.Equal(t, h.now, result.Record.LastOpAt)
|
||||
assert.Equal(t, h.now, result.Record.CreatedAt)
|
||||
|
||||
require.Len(t, h.records.upserts, 1)
|
||||
require.Len(t, h.operationLogs.appends, 1)
|
||||
last, _ := h.operationLogs.lastAppend()
|
||||
assert.Equal(t, operation.OpKindStart, last.OpKind)
|
||||
assert.Equal(t, operation.OutcomeSuccess, last.Outcome)
|
||||
assert.Empty(t, last.ErrorCode)
|
||||
assert.Equal(t, "ctr-123", last.ContainerID)
|
||||
|
||||
require.Len(t, h.healthEvents.envelopes, 1)
|
||||
assert.Equal(t, health.EventTypeContainerStarted, h.healthEvents.envelopes[0].EventType)
|
||||
var details map[string]string
|
||||
require.NoError(t, json.Unmarshal(h.healthEvents.envelopes[0].Details, &details))
|
||||
assert.Equal(t, input.ImageRef, details["image_ref"])
|
||||
|
||||
assert.Empty(t, h.notifications.intents, "no notification intent expected on success")
|
||||
assert.Equal(t, []string{"token-A"}, h.leases.acquires)
|
||||
assert.Equal(t, []string{"token-A"}, h.leases.releases)
|
||||
assert.Equal(t, []string{input.GameID}, h.lobby.calls)
|
||||
}
|
||||
|
||||
// --- idempotent replay ------------------------------------------------
|
||||
|
||||
func TestHandleReplayNoOpForRunningRecordWithSameImageRef(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
input := basicInput()
|
||||
startedAt := h.now.Add(-time.Hour)
|
||||
h.records.stored[input.GameID] = runtime.RuntimeRecord{
|
||||
GameID: input.GameID,
|
||||
Status: runtime.StatusRunning,
|
||||
CurrentContainerID: "ctr-prev",
|
||||
CurrentImageRef: input.ImageRef,
|
||||
EngineEndpoint: "http://galaxy-game-game-1:8080",
|
||||
StatePath: h.stateDir,
|
||||
DockerNetwork: "galaxy-net",
|
||||
StartedAt: &startedAt,
|
||||
LastOpAt: startedAt,
|
||||
CreatedAt: startedAt,
|
||||
}
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeReplayNoOp, result.ErrorCode)
|
||||
assert.Equal(t, "ctr-prev", result.Record.CurrentContainerID)
|
||||
|
||||
assert.Empty(t, h.records.upserts, "replay must not Upsert a fresh record")
|
||||
require.Len(t, h.operationLogs.appends, 1)
|
||||
last, _ := h.operationLogs.lastAppend()
|
||||
assert.Equal(t, operation.OutcomeSuccess, last.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeReplayNoOp, last.ErrorCode)
assert.Equal(t, "ctr-prev", last.ContainerID)
assert.Empty(t, h.notifications.intents)
assert.Equal(t, []string{"token-A"}, h.leases.releases, "lease must be released after replay no-op")
}

// --- conflicts --------------------------------------------------------

func TestHandleConflictWhenLeaseBusy(t *testing.T) {
h := newHarness(t)
h.leases.acquired = false
input := basicInput()

service := h.build(t)
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)

require.Len(t, h.operationLogs.appends, 1)
last, _ := h.operationLogs.lastAppend()
assert.Equal(t, operation.OutcomeFailure, last.Outcome)
assert.Equal(t, startruntime.ErrorCodeConflict, last.ErrorCode)

assert.Empty(t, h.notifications.intents, "lease conflicts must not raise admin notifications")
assert.Empty(t, h.leases.releases, "release must not run when acquire returned false")
}

func TestHandleConflictWhenRunningWithDifferentImageRef(t *testing.T) {
h := newHarness(t)
input := basicInput()
startedAt := h.now.Add(-time.Hour)
h.records.stored[input.GameID] = runtime.RuntimeRecord{
GameID: input.GameID,
Status: runtime.StatusRunning,
CurrentContainerID: "ctr-prev",
CurrentImageRef: "registry.example.com/galaxy/game:1.4.6",
EngineEndpoint: "http://galaxy-game-game-1:8080",
StatePath: h.stateDir,
DockerNetwork: "galaxy-net",
StartedAt: &startedAt,
LastOpAt: startedAt,
CreatedAt: startedAt,
}

service := h.build(t)
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)

last, _ := h.operationLogs.lastAppend()
assert.Equal(t, startruntime.ErrorCodeConflict, last.ErrorCode)
assert.Empty(t, h.notifications.intents)
assert.Empty(t, h.records.upserts)
}

// --- start_config_invalid ---------------------------------------------

func TestHandleStartConfigInvalidWhenImageRefMalformed(t *testing.T) {
h := newHarness(t)
input := basicInput()
input.ImageRef = "::not a docker reference::"

service := h.build(t)
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeStartConfigInvalid, result.ErrorCode)

require.Len(t, h.notifications.intents, 1)
assert.Equal(t, notificationintent.NotificationTypeRuntimeStartConfigInvalid, h.notifications.intents[0].NotificationType)
last, _ := h.operationLogs.lastAppend()
assert.Equal(t, operation.OutcomeFailure, last.Outcome)
}

func TestHandleStartConfigInvalidWhenNetworkMissing(t *testing.T) {
h := newHarness(t)
input := basicInput()

h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(ports.ErrNetworkMissing)

service := h.build(t)
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, startruntime.ErrorCodeStartConfigInvalid, result.ErrorCode)
require.Len(t, h.notifications.intents, 1)
assert.Equal(t, notificationintent.NotificationTypeRuntimeStartConfigInvalid, h.notifications.intents[0].NotificationType)
}

func TestHandleStartConfigInvalidWhenStateDirFails(t *testing.T) {
h := newHarness(t)
input := basicInput()

h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil)
h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil)

service, err := startruntime.NewService(startruntime.Dependencies{
RuntimeRecords: h.records,
OperationLogs: h.operationLogs,
Docker: h.docker,
Leases: h.leases,
HealthEvents: h.healthEvents,
Notifications: h.notifications,
Lobby: h.lobby,
Container: config.ContainerConfig{
DefaultCPUQuota: 1.0,
DefaultMemory: "512m",
DefaultPIDsLimit: 512,
StopTimeout: 30 * time.Second,
Retention: 30 * 24 * time.Hour,
EngineStateMountPath: "/var/lib/galaxy-game",
EngineStateEnvName: "GAME_STATE_PATH",
GameStateDirMode: 0o750,
GameStateRoot: "/var/lib/galaxy/games",
},
DockerCfg: config.DockerConfig{
Host: "unix:///var/run/docker.sock",
Network: "galaxy-net",
LogDriver: "json-file",
PullPolicy: config.ImagePullPolicyIfMissing,
},
Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
Telemetry: h.telemetry,
Clock: func() time.Time { return h.now },
NewToken: func() string { return "token-A" },
PrepareStateDir: func(_ string) (string, error) {
return "", errors.New("disk full")
},
})
require.NoError(t, err)

result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, startruntime.ErrorCodeStartConfigInvalid, result.ErrorCode)
require.Len(t, h.notifications.intents, 1)
assert.Equal(t, notificationintent.NotificationTypeRuntimeStartConfigInvalid, h.notifications.intents[0].NotificationType)
}

// --- image_pull_failed ------------------------------------------------

func TestHandleImagePullFailed(t *testing.T) {
h := newHarness(t)
input := basicInput()

h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(errors.New("manifest unknown"))

service := h.build(t)
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, startruntime.ErrorCodeImagePullFailed, result.ErrorCode)
require.Len(t, h.notifications.intents, 1)
assert.Equal(t, notificationintent.NotificationTypeRuntimeImagePullFailed, h.notifications.intents[0].NotificationType)
assert.Empty(t, h.records.upserts)
}

// --- container_start_failed ------------------------------------------

func TestHandleContainerStartFailedOnRunError(t *testing.T) {
h := newHarness(t)
input := basicInput()

h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil)
h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil)
h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(ports.RunResult{}, errors.New("container name conflict"))

service := h.build(t)
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, startruntime.ErrorCodeContainerStartFailed, result.ErrorCode)
require.Len(t, h.notifications.intents, 1)
assert.Equal(t, notificationintent.NotificationTypeRuntimeContainerStartFailed, h.notifications.intents[0].NotificationType)
assert.Empty(t, h.records.upserts)
}

func TestHandleRollsBackContainerWhenUpsertFails(t *testing.T) {
h := newHarness(t)
h.records.upsertErr = errors.New("connection refused")
input := basicInput()

h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil)
h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil)
h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(sampleRunResult(h.now), nil)
h.docker.EXPECT().Remove(gomock.Any(), "ctr-123").Return(nil)

service := h.build(t)
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, startruntime.ErrorCodeContainerStartFailed, result.ErrorCode)
require.Len(t, h.notifications.intents, 1)
assert.Equal(t, notificationintent.NotificationTypeRuntimeContainerStartFailed, h.notifications.intents[0].NotificationType)
}

// --- best-effort degradation -----------------------------------------

func TestHandleSuccessSurvivesOperationLogFailure(t *testing.T) {
h := newHarness(t)
h.operationLogs.appendErr = errors.New("postgres down")
input := basicInput()

h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil)
h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil)
h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(sampleRunResult(h.now), nil)

service := h.build(t)
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
assert.Empty(t, result.ErrorCode)
assert.Len(t, h.records.upserts, 1)
}

func TestHandleSuccessSurvivesHealthPublishFailure(t *testing.T) {
h := newHarness(t)
h.healthEvents.publishErr = errors.New("redis down")
input := basicInput()

h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil)
h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil)
h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(sampleRunResult(h.now), nil)

service := h.build(t)
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
assert.Len(t, h.records.upserts, 1)
}

// --- pre-existing stopped record proceeds with fresh start ----------

func TestHandlePreservesCreatedAtForExistingRecord(t *testing.T) {
h := newHarness(t)
input := basicInput()
originalCreatedAt := h.now.Add(-72 * time.Hour)
stoppedAt := h.now.Add(-time.Hour)
h.records.stored[input.GameID] = runtime.RuntimeRecord{
GameID: input.GameID,
Status: runtime.StatusStopped,
CurrentImageRef: "registry.example.com/galaxy/game:1.4.6",
EngineEndpoint: "http://galaxy-game-game-1:8080",
StatePath: h.stateDir,
DockerNetwork: "galaxy-net",
StoppedAt: &stoppedAt,
LastOpAt: stoppedAt,
CreatedAt: originalCreatedAt,
}

h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil)
h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil)
h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(sampleRunResult(h.now), nil)

service := h.build(t)
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
assert.Equal(t, originalCreatedAt, result.Record.CreatedAt, "created_at must be preserved across re-starts")
assert.Equal(t, runtime.StatusRunning, result.Record.Status)
assert.Equal(t, input.ImageRef, result.Record.CurrentImageRef)
}

// --- input validation -----------------------------------------------

func TestHandleRejectsInvalidInput(t *testing.T) {
h := newHarness(t)
service := h.build(t)

cases := []startruntime.Input{
{GameID: "", ImageRef: "x", OpSource: operation.OpSourceLobbyStream},
{GameID: "g", ImageRef: "", OpSource: operation.OpSourceLobbyStream},
{GameID: "g", ImageRef: "x", OpSource: operation.OpSource("bogus")},
}
for _, input := range cases {
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, startruntime.ErrorCodeStartConfigInvalid, result.ErrorCode)
}
}

func TestNewServiceRejectsMissingDependencies(t *testing.T) {
h := newHarness(t)
deps := startruntime.Dependencies{
Container: config.ContainerConfig{
DefaultCPUQuota: 1.0,
DefaultMemory: "512m",
DefaultPIDsLimit: 512,
StopTimeout: 30 * time.Second,
Retention: 30 * 24 * time.Hour,
EngineStateMountPath: "/var/lib/galaxy-game",
EngineStateEnvName: "GAME_STATE_PATH",
GameStateDirMode: 0o750,
GameStateRoot: "/var/lib/galaxy/games",
},
DockerCfg: config.DockerConfig{
Host: "unix:///var/run/docker.sock",
Network: "galaxy-net",
LogDriver: "json-file",
PullPolicy: config.ImagePullPolicyIfMissing,
},
Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
Telemetry: h.telemetry,
}
_, err := startruntime.NewService(deps)
require.Error(t, err)
}
@@ -0,0 +1,612 @@
// Package stopruntime implements the `stop` lifecycle operation owned by
// Runtime Manager. The service is the single orchestrator behind both
// the asynchronous `runtime:stop_jobs` consumer and the synchronous
// `POST /api/v1/internal/runtimes/{game_id}/stop` REST handler. It is
// also the inner stop step of the restart and patch services, which
// call Run while holding the outer per-game lease.
//
// Lifecycle and failure-mode semantics follow `rtmanager/README.md
// §Lifecycles → Stop`. Design rationale is captured in
// `rtmanager/docs/services.md`.
package stopruntime

import (
"context"
"crypto/rand"
"encoding/base64"
"encoding/json"
"errors"
"fmt"
"log/slog"
"strings"
"time"

"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/domain/health"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/logging"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/startruntime"
"galaxy/rtmanager/internal/telemetry"
)

// leaseReleaseTimeout bounds the deferred lease-release call. A fresh
// background context is used so the release runs even when the request
// context was already canceled.
const leaseReleaseTimeout = 5 * time.Second

// Input stores the per-call arguments for one stop operation.
type Input struct {
// GameID identifies the platform game to stop.
GameID string

// Reason classifies the trigger of the stop. Required.
Reason StopReason

// OpSource classifies how the request entered Runtime Manager.
// Required: every operation_log entry carries an op_source.
OpSource operation.OpSource

// SourceRef stores the optional opaque per-source reference (Redis
// Stream entry id, REST request id, admin user id). Empty when the
// caller does not provide one. For inner calls invoked by the
// restart and patch orchestrators it carries the outer correlation
// id so the three operation_log entries share it.
SourceRef string
}

// Validate reports whether input carries the structural invariants the
// service requires.
func (input Input) Validate() error {
if strings.TrimSpace(input.GameID) == "" {
return fmt.Errorf("game id must not be empty")
}
if !input.OpSource.IsKnown() {
return fmt.Errorf("op source %q is unsupported", input.OpSource)
}
if err := input.Reason.Validate(); err != nil {
return err
}
return nil
}

// Result stores the deterministic outcome of one Handle / Run call.
type Result struct {
// Record carries the runtime record installed by the operation.
// Populated on success and on idempotent replay; zero on failure.
Record runtime.RuntimeRecord

// Outcome reports whether the operation completed (success) or
// produced a stable failure code.
Outcome operation.Outcome

// ErrorCode stores the stable error code on failure, or
// `replay_no_op` on idempotent replay. Empty for fresh successes.
ErrorCode string

// ErrorMessage stores the operator-readable detail on failure.
// Empty for successes.
ErrorMessage string
}

// Dependencies groups the collaborators required by Service.
type Dependencies struct {
// RuntimeRecords reads and updates the durable runtime record.
RuntimeRecords ports.RuntimeRecordStore

// OperationLogs records the success / failure audit entry.
OperationLogs ports.OperationLogStore

// Docker drives the Docker daemon (container stop).
Docker ports.DockerClient

// Leases serialises operations against the same game id.
Leases ports.GameLeaseStore

// HealthEvents publishes `runtime:health_events` and upserts the
// matching `health_snapshots` row. Used on the vanished-container
// path to emit `container_disappeared`.
HealthEvents ports.HealthEventPublisher

// Container groups the per-container settings consumed at stop time
// (the graceful stop timeout).
Container config.ContainerConfig

// Coordination supplies the per-game lease TTL.
Coordination config.CoordinationConfig

// Telemetry records stop outcomes and lease latency. Required.
Telemetry *telemetry.Runtime

// Logger records structured service-level events. Defaults to
// `slog.Default()` when nil.
Logger *slog.Logger

// Clock supplies the wall-clock used for operation timestamps.
// Defaults to `time.Now` when nil.
Clock func() time.Time

// NewToken supplies a unique opaque lease token. Defaults to a
// 32-byte random base64url string when nil. Tests may override.
NewToken func() string
}

// Service executes the stop lifecycle operation.
type Service struct {
runtimeRecords ports.RuntimeRecordStore
operationLogs ports.OperationLogStore
docker ports.DockerClient
leases ports.GameLeaseStore
healthEvents ports.HealthEventPublisher

stopTimeout time.Duration
leaseTTL time.Duration

telemetry *telemetry.Runtime
logger *slog.Logger

clock func() time.Time
newToken func() string
}

// NewService constructs one Service from deps.
func NewService(deps Dependencies) (*Service, error) {
switch {
case deps.RuntimeRecords == nil:
return nil, errors.New("new stop runtime service: nil runtime records")
case deps.OperationLogs == nil:
return nil, errors.New("new stop runtime service: nil operation logs")
case deps.Docker == nil:
return nil, errors.New("new stop runtime service: nil docker client")
case deps.Leases == nil:
return nil, errors.New("new stop runtime service: nil lease store")
case deps.HealthEvents == nil:
return nil, errors.New("new stop runtime service: nil health events publisher")
case deps.Telemetry == nil:
return nil, errors.New("new stop runtime service: nil telemetry runtime")
}
if err := deps.Container.Validate(); err != nil {
return nil, fmt.Errorf("new stop runtime service: container config: %w", err)
}
if err := deps.Coordination.Validate(); err != nil {
return nil, fmt.Errorf("new stop runtime service: coordination config: %w", err)
}

clock := deps.Clock
if clock == nil {
clock = time.Now
}
logger := deps.Logger
if logger == nil {
logger = slog.Default()
}
logger = logger.With("service", "rtmanager.stopruntime")

newToken := deps.NewToken
if newToken == nil {
newToken = defaultTokenGenerator()
}

return &Service{
runtimeRecords: deps.RuntimeRecords,
operationLogs: deps.OperationLogs,
docker: deps.Docker,
leases: deps.Leases,
healthEvents: deps.HealthEvents,
stopTimeout: deps.Container.StopTimeout,
leaseTTL: deps.Coordination.GameLeaseTTL,
telemetry: deps.Telemetry,
logger: logger,
clock: clock,
newToken: newToken,
}, nil
}

// Handle executes one stop operation end-to-end. The Go-level error
// return is reserved for non-business failures (nil context, nil
// receiver). Every business outcome — success, idempotent replay, or
// any of the stable failure modes — flows through Result.
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
if service == nil {
return Result{}, errors.New("stop runtime: nil service")
}
if ctx == nil {
return Result{}, errors.New("stop runtime: nil context")
}

opStartedAt := service.clock().UTC()

if err := input.Validate(); err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInvalidRequest,
errorMessage: err.Error(),
}), nil
}

token := service.newToken()
leaseStart := service.clock()
acquired, err := service.leases.TryAcquire(ctx, input.GameID, token, service.leaseTTL)
service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(leaseStart))
if err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeServiceUnavailable,
errorMessage: fmt.Sprintf("acquire game lease: %s", err.Error()),
}), nil
}
if !acquired {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeConflict,
errorMessage: "another lifecycle operation is in progress for this game",
}), nil
}
defer service.releaseLease(ctx, input.GameID, token)

return service.runUnderLease(ctx, input, opStartedAt)
}

// Run executes the stop lifecycle assuming the per-game lease is
// already held by the caller. The method is reserved for orchestrator
// services in `internal/service/` that compose stop with another
// operation under a single outer lease (restart and patch). External
// callers must use Handle.
func (service *Service) Run(ctx context.Context, input Input) (Result, error) {
if service == nil {
return Result{}, errors.New("stop runtime: nil service")
}
if ctx == nil {
return Result{}, errors.New("stop runtime: nil context")
}

opStartedAt := service.clock().UTC()

if err := input.Validate(); err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInvalidRequest,
errorMessage: err.Error(),
}), nil
}

return service.runUnderLease(ctx, input, opStartedAt)
}
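
// Illustrative sketch (not part of this commit): how a hypothetical
// restart-style orchestrator is expected to compose Run under its own outer
// lease, per the Run doc comment above. The restartService type and its
// fields are assumptions made purely for illustration; only the Input,
// Result, and Run/Handle contracts shown here come from this package.
//
//	func (r *restartService) stopUnderOuterLease(ctx context.Context, gameID, correlationID string, opSource operation.OpSource) error {
//		// The outer per-game lease is already held by the orchestrator,
//		// so the inner stop must use Run, never Handle.
//		result, err := r.stop.Run(ctx, Input{
//			GameID:    gameID,
//			Reason:    StopReasonAdminRequest,
//			OpSource:  opSource,
//			SourceRef: correlationID, // shared so the operation_log entries correlate
//		})
//		if err != nil {
//			return err // non-business failure (nil context / nil service)
//		}
//		if result.Outcome != operation.OutcomeSuccess {
//			return fmt.Errorf("restart: inner stop failed with %s", result.ErrorCode)
//		}
//		return nil // replay_no_op also lands here: already stopped is fine for restart
//	}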

// runUnderLease executes the post-validation, lease-protected stop
// steps shared by Handle and Run.
func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) {
existing, err := service.runtimeRecords.Get(ctx, input.GameID)
if errors.Is(err, runtime.ErrNotFound) {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeNotFound,
errorMessage: fmt.Sprintf("runtime record for game %q does not exist", input.GameID),
}), nil
}
if err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInternal,
errorMessage: fmt.Sprintf("load runtime record: %s", err.Error()),
}), nil
}

switch existing.Status {
case runtime.StatusStopped, runtime.StatusRemoved:
return service.recordReplayNoOp(ctx, opStartedAt, input, existing), nil
case runtime.StatusRunning:
// proceed
default:
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInternal,
errorMessage: fmt.Sprintf("runtime record has unsupported status %q", existing.Status),
}), nil
}

if err := service.docker.Stop(ctx, existing.CurrentContainerID, service.stopTimeout); err != nil {
if errors.Is(err, ports.ErrContainerNotFound) {
return service.handleVanished(ctx, input, opStartedAt, existing), nil
}
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeServiceUnavailable,
errorMessage: fmt.Sprintf("docker stop: %s", err.Error()),
containerID: existing.CurrentContainerID,
imageRef: existing.CurrentImageRef,
}), nil
}

updateNow := service.clock().UTC()
err = service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
GameID: input.GameID,
ExpectedFrom: runtime.StatusRunning,
ExpectedContainerID: existing.CurrentContainerID,
To: runtime.StatusStopped,
Now: updateNow,
})
if errors.Is(err, runtime.ErrConflict) {
// CAS race: a concurrent reconciler / restart already moved the
// record. The desired terminal state was reached by another path.
return service.recordReplayNoOp(ctx, opStartedAt, input, existing), nil
}
if errors.Is(err, runtime.ErrNotFound) {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeNotFound,
errorMessage: fmt.Sprintf("runtime record for game %q vanished mid-stop", input.GameID),
containerID: existing.CurrentContainerID,
imageRef: existing.CurrentImageRef,
}), nil
}
if err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInternal,
errorMessage: fmt.Sprintf("update runtime status: %s", err.Error()),
containerID: existing.CurrentContainerID,
imageRef: existing.CurrentImageRef,
}), nil
}

finishedAt := service.clock().UTC()
service.bestEffortAppend(ctx, operation.OperationEntry{
GameID: input.GameID,
OpKind: operation.OpKindStop,
OpSource: input.OpSource,
SourceRef: input.SourceRef,
ImageRef: existing.CurrentImageRef,
ContainerID: existing.CurrentContainerID,
Outcome: operation.OutcomeSuccess,
StartedAt: opStartedAt,
FinishedAt: &finishedAt,
})
service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeSuccess), string(input.Reason), string(input.OpSource))

record := existing
record.Status = runtime.StatusStopped
stoppedAt := updateNow
record.StoppedAt = &stoppedAt
record.LastOpAt = updateNow

logArgs := []any{
"game_id", input.GameID,
"container_id", existing.CurrentContainerID,
"reason", string(input.Reason),
"op_source", string(input.OpSource),
}
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
service.logger.InfoContext(ctx, "runtime stopped", logArgs...)

return Result{
Record: record,
Outcome: operation.OutcomeSuccess,
}, nil
}

// handleVanished records the success outcome for the case where docker
// stop reports the container as already gone. It updates the record to
// removed, publishes container_disappeared, and returns success.
func (service *Service) handleVanished(ctx context.Context, input Input, opStartedAt time.Time, existing runtime.RuntimeRecord) Result {
updateNow := service.clock().UTC()
err := service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
GameID: input.GameID,
ExpectedFrom: runtime.StatusRunning,
ExpectedContainerID: existing.CurrentContainerID,
To: runtime.StatusRemoved,
Now: updateNow,
})
if errors.Is(err, runtime.ErrConflict) {
return service.recordReplayNoOp(ctx, opStartedAt, input, existing)
}
if err != nil && !errors.Is(err, runtime.ErrNotFound) {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInternal,
errorMessage: fmt.Sprintf("update runtime status to removed: %s", err.Error()),
containerID: existing.CurrentContainerID,
imageRef: existing.CurrentImageRef,
})
}

service.bestEffortPublishHealth(ctx, ports.HealthEventEnvelope{
GameID: input.GameID,
ContainerID: existing.CurrentContainerID,
EventType: health.EventTypeContainerDisappeared,
OccurredAt: updateNow,
Details: emptyHealthDetails(),
})

finishedAt := service.clock().UTC()
service.bestEffortAppend(ctx, operation.OperationEntry{
GameID: input.GameID,
OpKind: operation.OpKindStop,
OpSource: input.OpSource,
SourceRef: input.SourceRef,
ImageRef: existing.CurrentImageRef,
ContainerID: existing.CurrentContainerID,
Outcome: operation.OutcomeSuccess,
StartedAt: opStartedAt,
FinishedAt: &finishedAt,
})
service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeSuccess), string(input.Reason), string(input.OpSource))
service.telemetry.RecordHealthEvent(ctx, string(health.EventTypeContainerDisappeared))

record := existing
record.Status = runtime.StatusRemoved
record.CurrentContainerID = ""
removedAt := updateNow
record.RemovedAt = &removedAt
record.LastOpAt = updateNow

logArgs := []any{
"game_id", input.GameID,
"container_id", existing.CurrentContainerID,
"reason", string(input.Reason),
"op_source", string(input.OpSource),
}
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
service.logger.InfoContext(ctx, "runtime stop on vanished container", logArgs...)

return Result{
Record: record,
Outcome: operation.OutcomeSuccess,
}
}

// recordReplayNoOp records the idempotent replay outcome and returns the
// existing record unchanged.
func (service *Service) recordReplayNoOp(ctx context.Context, opStartedAt time.Time, input Input, existing runtime.RuntimeRecord) Result {
finishedAt := service.clock().UTC()
service.bestEffortAppend(ctx, operation.OperationEntry{
GameID: input.GameID,
OpKind: operation.OpKindStop,
OpSource: input.OpSource,
SourceRef: input.SourceRef,
ImageRef: existing.CurrentImageRef,
ContainerID: existing.CurrentContainerID,
Outcome: operation.OutcomeSuccess,
ErrorCode: startruntime.ErrorCodeReplayNoOp,
StartedAt: opStartedAt,
FinishedAt: &finishedAt,
})
service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeSuccess), string(input.Reason), string(input.OpSource))

logArgs := []any{
"game_id", input.GameID,
"container_id", existing.CurrentContainerID,
"reason", string(input.Reason),
"op_source", string(input.OpSource),
}
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
service.logger.InfoContext(ctx, "runtime stop replay no-op", logArgs...)

return Result{
Record: existing,
Outcome: operation.OutcomeSuccess,
ErrorCode: startruntime.ErrorCodeReplayNoOp,
}
}

// failureCtx groups the inputs to recordFailure so the runUnderLease
// method stays readable.
type failureCtx struct {
opStartedAt time.Time
input Input
errorCode string
errorMessage string
containerID string
imageRef string
}

// recordFailure records the failure operation_log entry and emits
// telemetry. The runtime record stays untouched.
func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result {
finishedAt := service.clock().UTC()
service.bestEffortAppend(ctx, operation.OperationEntry{
GameID: fc.input.GameID,
OpKind: operation.OpKindStop,
OpSource: fc.input.OpSource,
SourceRef: fc.input.SourceRef,
ImageRef: fc.imageRef,
ContainerID: fc.containerID,
Outcome: operation.OutcomeFailure,
ErrorCode: fc.errorCode,
ErrorMessage: fc.errorMessage,
StartedAt: fc.opStartedAt,
FinishedAt: &finishedAt,
})
service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeFailure), string(fc.input.Reason), string(fc.input.OpSource))

logArgs := []any{
"game_id", fc.input.GameID,
"reason", string(fc.input.Reason),
"op_source", string(fc.input.OpSource),
"error_code", fc.errorCode,
"error_message", fc.errorMessage,
}
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
service.logger.WarnContext(ctx, "runtime stop failed", logArgs...)

return Result{
Outcome: operation.OutcomeFailure,
ErrorCode: fc.errorCode,
ErrorMessage: fc.errorMessage,
}
}

// releaseLease releases the per-game lease in a fresh background context
// so a canceled request context does not leave the lease pinned for its
// TTL.
func (service *Service) releaseLease(ctx context.Context, gameID, token string) {
cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
defer cancel()
if err := service.leases.Release(cleanupCtx, gameID, token); err != nil {
service.logger.WarnContext(ctx, "release game lease",
"game_id", gameID,
"err", err.Error(),
)
}
}

// bestEffortAppend writes one operation_log entry. A failure is logged
// and discarded; the durable runtime record (or its absence) remains
// the source of truth.
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
if _, err := service.operationLogs.Append(ctx, entry); err != nil {
service.logger.ErrorContext(ctx, "append operation log",
"game_id", entry.GameID,
"op_kind", string(entry.OpKind),
"outcome", string(entry.Outcome),
"error_code", entry.ErrorCode,
"err", err.Error(),
)
}
}

// bestEffortPublishHealth emits one health event + snapshot upsert.
// Failures degrade silently per `rtmanager/README.md §Notification
// Contracts`; the runtime record remains the source of truth.
func (service *Service) bestEffortPublishHealth(ctx context.Context, envelope ports.HealthEventEnvelope) {
if err := service.healthEvents.Publish(ctx, envelope); err != nil {
service.logger.ErrorContext(ctx, "publish health event",
"game_id", envelope.GameID,
"container_id", envelope.ContainerID,
"event_type", string(envelope.EventType),
"err", err.Error(),
)
}
}

// defaultTokenGenerator returns a function that produces 32-byte
// base64url-encoded tokens. Mirrors the start service: a degraded
// entropy source falls back to a sentinel token so the next TryAcquire
// observes a collision rather than a panic.
func defaultTokenGenerator() func() string {
return func() string {
var buf [32]byte
if _, err := rand.Read(buf[:]); err != nil {
return "rtmanager-fallback-token"
}
return base64.RawURLEncoding.EncodeToString(buf[:])
}
}

// emptyHealthDetails returns the canonical empty-object payload required
// by the `container_disappeared` AsyncAPI variant.
func emptyHealthDetails() json.RawMessage {
return json.RawMessage("{}")
}
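
// Caller-side sketch (illustrative, not part of this commit): consumers of
// Result are expected to branch on Outcome first and treat ErrorCode ==
// startruntime.ErrorCodeReplayNoOp as an already-satisfied request rather
// than an error. A hypothetical REST handler might discriminate like this;
// the HTTP status mapping is an assumption, not a contract from this file.
//
//	result, err := stopSvc.Handle(ctx, input)
//	if err != nil {
//		// non-business failure (nil context / nil service): respond 500
//	}
//	switch {
//	case result.Outcome == operation.OutcomeSuccess && result.ErrorCode == startruntime.ErrorCodeReplayNoOp:
//		// already stopped or removed: respond 200 with the existing record
//	case result.Outcome == operation.OutcomeSuccess:
//		// fresh stop: respond 200 with result.Record
//	default:
//		// stable failure code: map result.ErrorCode (conflict, not_found, ...) to an HTTP status
//	}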
@@ -0,0 +1,537 @@
package stopruntime_test

import (
"context"
"errors"
"sync"
"testing"
"time"

"galaxy/rtmanager/internal/adapters/docker/mocks"
"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/domain/health"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/startruntime"
"galaxy/rtmanager/internal/service/stopruntime"
"galaxy/rtmanager/internal/telemetry"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/mock/gomock"
)

// --- test doubles -----------------------------------------------------

type fakeRuntimeRecords struct {
mu sync.Mutex

stored map[string]runtime.RuntimeRecord
getErr error
updateStatusErr error

updates []ports.UpdateStatusInput
}

func newFakeRuntimeRecords() *fakeRuntimeRecords {
return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}}
}

func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
s.mu.Lock()
defer s.mu.Unlock()
if s.getErr != nil {
return runtime.RuntimeRecord{}, s.getErr
}
record, ok := s.stored[gameID]
if !ok {
return runtime.RuntimeRecord{}, runtime.ErrNotFound
}
return record, nil
}

func (s *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error {
return errors.New("not used in stop tests")
}

func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, input ports.UpdateStatusInput) error {
s.mu.Lock()
defer s.mu.Unlock()
s.updates = append(s.updates, input)
if s.updateStatusErr != nil {
return s.updateStatusErr
}
record, ok := s.stored[input.GameID]
if !ok {
return runtime.ErrNotFound
}
if record.Status != input.ExpectedFrom {
return runtime.ErrConflict
}
if input.ExpectedContainerID != "" && record.CurrentContainerID != input.ExpectedContainerID {
return runtime.ErrConflict
}
record.Status = input.To
record.LastOpAt = input.Now
switch input.To {
case runtime.StatusStopped:
stoppedAt := input.Now
record.StoppedAt = &stoppedAt
case runtime.StatusRemoved:
removedAt := input.Now
record.RemovedAt = &removedAt
record.CurrentContainerID = ""
}
s.stored[input.GameID] = record
return nil
}

func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) {
return nil, errors.New("not used in stop tests")
}

func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
return nil, errors.New("not used in stop tests")
}

type fakeOperationLogs struct {
mu sync.Mutex

appendErr error
appends []operation.OperationEntry
}

func (s *fakeOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) {
s.mu.Lock()
defer s.mu.Unlock()
if s.appendErr != nil {
return 0, s.appendErr
}
s.appends = append(s.appends, entry)
return int64(len(s.appends)), nil
}

func (s *fakeOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) {
return nil, errors.New("not used in stop tests")
}

func (s *fakeOperationLogs) lastAppend() (operation.OperationEntry, bool) {
s.mu.Lock()
defer s.mu.Unlock()
if len(s.appends) == 0 {
return operation.OperationEntry{}, false
}
return s.appends[len(s.appends)-1], true
}

type fakeLeases struct {
acquired bool
acquireErr error
releaseErr error

mu sync.Mutex
acquires []string
releases []string
}

func (l *fakeLeases) TryAcquire(_ context.Context, _, token string, _ time.Duration) (bool, error) {
l.mu.Lock()
defer l.mu.Unlock()
l.acquires = append(l.acquires, token)
if l.acquireErr != nil {
return false, l.acquireErr
}
return l.acquired, nil
}

func (l *fakeLeases) Release(_ context.Context, _, token string) error {
l.mu.Lock()
defer l.mu.Unlock()
l.releases = append(l.releases, token)
return l.releaseErr
}

type fakeHealthEvents struct {
mu sync.Mutex

publishErr error
envelopes []ports.HealthEventEnvelope
}

func (h *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error {
h.mu.Lock()
defer h.mu.Unlock()
if h.publishErr != nil {
return h.publishErr
}
h.envelopes = append(h.envelopes, envelope)
return nil
}

// --- harness ----------------------------------------------------------

type harness struct {
records *fakeRuntimeRecords
operationLogs *fakeOperationLogs
docker *mocks.MockDockerClient
leases *fakeLeases
healthEvents *fakeHealthEvents

telemetry *telemetry.Runtime

now time.Time
}

func newHarness(t *testing.T) *harness {
t.Helper()
ctrl := gomock.NewController(t)
t.Cleanup(ctrl.Finish)

telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
require.NoError(t, err)

return &harness{
records: newFakeRuntimeRecords(),
operationLogs: &fakeOperationLogs{},
docker: mocks.NewMockDockerClient(ctrl),
leases: &fakeLeases{acquired: true},
healthEvents: &fakeHealthEvents{},
telemetry: telemetryRuntime,
now: time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC),
}
}

func (h *harness) build(t *testing.T) *stopruntime.Service {
t.Helper()

containerCfg := config.ContainerConfig{
DefaultCPUQuota: 1.0,
DefaultMemory: "512m",
DefaultPIDsLimit: 512,
StopTimeout: 30 * time.Second,
Retention: 30 * 24 * time.Hour,
EngineStateMountPath: "/var/lib/galaxy-game",
EngineStateEnvName: "GAME_STATE_PATH",
GameStateDirMode: 0o750,
GameStateRoot: "/var/lib/galaxy/games",
}
coordinationCfg := config.CoordinationConfig{GameLeaseTTL: time.Minute}

service, err := stopruntime.NewService(stopruntime.Dependencies{
RuntimeRecords: h.records,
OperationLogs: h.operationLogs,
Docker: h.docker,
Leases: h.leases,
HealthEvents: h.healthEvents,
Container: containerCfg,
Coordination: coordinationCfg,
Telemetry: h.telemetry,
Clock: func() time.Time { return h.now },
NewToken: func() string { return "token-A" },
})
require.NoError(t, err)
return service
}

func basicInput() stopruntime.Input {
return stopruntime.Input{
GameID: "game-1",
Reason: stopruntime.StopReasonCancelled,
OpSource: operation.OpSourceLobbyStream,
SourceRef: "1700000000000-0",
}
}

func runningRecord(now time.Time) runtime.RuntimeRecord {
startedAt := now.Add(-time.Hour)
return runtime.RuntimeRecord{
GameID: "game-1",
Status: runtime.StatusRunning,
CurrentContainerID: "ctr-123",
CurrentImageRef: "registry.example.com/galaxy/game:1.4.7",
EngineEndpoint: "http://galaxy-game-game-1:8080",
StatePath: "/var/lib/galaxy/games/game-1",
DockerNetwork: "galaxy-net",
StartedAt: &startedAt,
LastOpAt: startedAt,
CreatedAt: startedAt,
}
}

// --- happy path -------------------------------------------------------

func TestHandleHappyPath(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)

h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(nil)

service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
assert.Empty(t, result.ErrorCode)
assert.Equal(t, runtime.StatusStopped, result.Record.Status)
require.NotNil(t, result.Record.StoppedAt)
assert.Equal(t, h.now, *result.Record.StoppedAt)
assert.Equal(t, h.now, result.Record.LastOpAt)

require.Len(t, h.records.updates, 1)
assert.Equal(t, runtime.StatusRunning, h.records.updates[0].ExpectedFrom)
assert.Equal(t, runtime.StatusStopped, h.records.updates[0].To)
assert.Equal(t, "ctr-123", h.records.updates[0].ExpectedContainerID)

require.Len(t, h.operationLogs.appends, 1)
last, _ := h.operationLogs.lastAppend()
assert.Equal(t, operation.OpKindStop, last.OpKind)
assert.Equal(t, operation.OutcomeSuccess, last.Outcome)
assert.Empty(t, last.ErrorCode)
assert.Equal(t, "ctr-123", last.ContainerID)

assert.Empty(t, h.healthEvents.envelopes)
assert.Equal(t, []string{"token-A"}, h.leases.acquires)
assert.Equal(t, []string{"token-A"}, h.leases.releases)
}

// --- replay ----------------------------------------------------------

func TestHandleReplayNoOpForStoppedRecord(t *testing.T) {
h := newHarness(t)
stoppedRecord := runningRecord(h.now)
stoppedRecord.Status = runtime.StatusStopped
stoppedAt := h.now.Add(-time.Minute)
stoppedRecord.StoppedAt = &stoppedAt
h.records.stored["game-1"] = stoppedRecord

service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeReplayNoOp, result.ErrorCode)
assert.Equal(t, runtime.StatusStopped, result.Record.Status)

assert.Empty(t, h.records.updates)
require.Len(t, h.operationLogs.appends, 1)
last, _ := h.operationLogs.lastAppend()
assert.Equal(t, startruntime.ErrorCodeReplayNoOp, last.ErrorCode)
assert.Equal(t, []string{"token-A"}, h.leases.releases)
}

func TestHandleReplayNoOpForRemovedRecord(t *testing.T) {
h := newHarness(t)
removed := runningRecord(h.now)
removed.Status = runtime.StatusRemoved
removed.CurrentContainerID = ""
removedAt := h.now.Add(-time.Minute)
removed.RemovedAt = &removedAt
h.records.stored["game-1"] = removed

service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeReplayNoOp, result.ErrorCode)
}

// --- vanished container ----------------------------------------------

func TestHandleVanishedContainerMarksRemoved(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)

h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(ports.ErrContainerNotFound)

service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
assert.Empty(t, result.ErrorCode)
assert.Equal(t, runtime.StatusRemoved, result.Record.Status)
assert.Empty(t, result.Record.CurrentContainerID)

require.Len(t, h.records.updates, 1)
assert.Equal(t, runtime.StatusRemoved, h.records.updates[0].To)

require.Len(t, h.healthEvents.envelopes, 1)
assert.Equal(t, health.EventTypeContainerDisappeared, h.healthEvents.envelopes[0].EventType)

require.Len(t, h.operationLogs.appends, 1)
last, _ := h.operationLogs.lastAppend()
assert.Equal(t, operation.OutcomeSuccess, last.Outcome)
assert.Empty(t, last.ErrorCode)
}

// --- failure paths ---------------------------------------------------

func TestHandleNotFoundForMissingRecord(t *testing.T) {
h := newHarness(t)

service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeNotFound, result.ErrorCode)
assert.Empty(t, h.healthEvents.envelopes)
assert.Empty(t, h.records.updates)
}

func TestHandleServiceUnavailableOnDockerError(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)

h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(errors.New("docker daemon timeout"))

service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode)

last, _ := h.operationLogs.lastAppend()
assert.Equal(t, operation.OutcomeFailure, last.Outcome)
assert.Equal(t, "ctr-123", last.ContainerID)
assert.Empty(t, h.records.updates, "no record mutation on docker stop failure")
}

func TestHandleReplayNoOpOnUpdateStatusConflict(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)
h.records.updateStatusErr = runtime.ErrConflict

h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(nil)

service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeReplayNoOp, result.ErrorCode)
}

func TestHandleInternalErrorOnUpdateStatusGenericError(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)
h.records.updateStatusErr = errors.New("postgres down")

h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(nil)

service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeInternal, result.ErrorCode)
}

// --- conflicts -------------------------------------------------------

func TestHandleConflictWhenLeaseBusy(t *testing.T) {
h := newHarness(t)
h.leases.acquired = false

service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)

assert.Empty(t, h.leases.releases, "release must not run when acquire returned false")
}

func TestHandleServiceUnavailableOnLeaseError(t *testing.T) {
h := newHarness(t)
h.leases.acquireErr = errors.New("redis timeout")

service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode)
}

// --- input validation ------------------------------------------------

func TestHandleRejectsInvalidInput(t *testing.T) {
h := newHarness(t)
service := h.build(t)

cases := []stopruntime.Input{
{GameID: "", Reason: stopruntime.StopReasonCancelled, OpSource: operation.OpSourceLobbyStream},
{GameID: "g", Reason: "", OpSource: operation.OpSourceLobbyStream},
{GameID: "g", Reason: stopruntime.StopReason("bogus"), OpSource: operation.OpSourceLobbyStream},
{GameID: "g", Reason: stopruntime.StopReasonCancelled, OpSource: operation.OpSource("bogus")},
}
for _, input := range cases {
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, startruntime.ErrorCodeInvalidRequest, result.ErrorCode)
}
}

// --- Run path (no-lease) ---------------------------------------------

func TestRunSkipsLease(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)
h.leases.acquired = false // would block Handle; Run must ignore

h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(nil)

service := h.build(t)
result, err := service.Run(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
assert.Empty(t, h.leases.acquires, "Run must not touch the lease store")
assert.Empty(t, h.leases.releases)
}

// --- best-effort degradation ----------------------------------------

func TestHandleSurvivesOperationLogFailure(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)
h.operationLogs.appendErr = errors.New("postgres down")

h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(nil)

service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
}

func TestHandleSurvivesHealthPublishFailureOnVanished(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)
h.healthEvents.publishErr = errors.New("redis down")

h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(ports.ErrContainerNotFound)

service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
assert.Equal(t, runtime.StatusRemoved, result.Record.Status)
}

// --- constructor -----------------------------------------------------

func TestNewServiceRejectsMissingDependencies(t *testing.T) {
h := newHarness(t)
deps := stopruntime.Dependencies{
Container: config.ContainerConfig{
DefaultCPUQuota: 1.0,
DefaultMemory: "512m",
DefaultPIDsLimit: 512,
StopTimeout: 30 * time.Second,
Retention: 30 * 24 * time.Hour,
EngineStateMountPath: "/var/lib/galaxy-game",
EngineStateEnvName: "GAME_STATE_PATH",
GameStateDirMode: 0o750,
GameStateRoot: "/var/lib/galaxy/games",
},
Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
Telemetry: h.telemetry,
}
_, err := stopruntime.NewService(deps)
require.Error(t, err)
}
@@ -0,0 +1,82 @@
package stopruntime

import "fmt"

// StopReason classifies why a caller is asking Runtime Manager to stop a
// game container. The enum is part of the `runtime:stop_jobs` envelope
// produced by Game Lobby and the body of the `POST
// /api/v1/internal/runtimes/{game_id}/stop` REST endpoint, and mirrors
// the AsyncAPI contract frozen in
// `rtmanager/api/runtime-jobs-asyncapi.yaml`.
//
// The vocabulary is shared with `lobby/internal/ports/runtimemanager.go`;
// the two declarations stay byte-identical and adding a new value
// requires a coordinated contract bump on both sides.
type StopReason string

// StopReason enum values. Adding a new value is a contract change that
// touches the AsyncAPI spec, the Lobby producer, and every Runtime
// Manager consumer.
const (
// StopReasonOrphanCleanup releases a container whose post-start
// metadata persistence failed in Lobby.
StopReasonOrphanCleanup StopReason = "orphan_cleanup"

// StopReasonCancelled covers user-lifecycle cascade and explicit
// cancel paths for in-flight games.
StopReasonCancelled StopReason = "cancelled"

// StopReasonFinished is reserved for engine-driven game finish flows.
StopReasonFinished StopReason = "finished"

// StopReasonAdminRequest is reserved for admin-initiated stop paths.
StopReasonAdminRequest StopReason = "admin_request"

// StopReasonTimeout is reserved for timeout-driven stop paths.
StopReasonTimeout StopReason = "timeout"
)

// IsKnown reports whether reason belongs to the frozen stop-reason
// vocabulary.
func (reason StopReason) IsKnown() bool {
switch reason {
case StopReasonOrphanCleanup,
StopReasonCancelled,
StopReasonFinished,
StopReasonAdminRequest,
StopReasonTimeout:
return true
default:
return false
}
}

// AllStopReasons returns the frozen list of every stop-reason value. The
// slice order is stable across calls and matches the AsyncAPI enum order.
func AllStopReasons() []StopReason {
return []StopReason{
StopReasonOrphanCleanup,
StopReasonCancelled,
StopReasonFinished,
StopReasonAdminRequest,
StopReasonTimeout,
}
}

// String returns reason as its stored enum value. Useful in log fields
// and telemetry attributes.
func (reason StopReason) String() string {
return string(reason)
}

// Validate reports whether reason carries one of the five values fixed
// by the AsyncAPI contract.
func (reason StopReason) Validate() error {
if reason == "" {
return fmt.Errorf("stop reason must not be empty")
}
if !reason.IsKnown() {
return fmt.Errorf("stop reason %q is unsupported", reason)
}
return nil
}
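
// Illustrative decode-and-validate sketch (not part of this commit): how a
// consumer of the `runtime:stop_jobs` envelope might gate an incoming reason
// on the frozen vocabulary before handing the job to the stop service. The
// payload struct and its Reason field are assumptions for illustration only;
// only StopReason and Validate come from this file.
//
//	reason := StopReason(payload.Reason)
//	if err := reason.Validate(); err != nil {
//		// unknown or empty reason: reject the job rather than guessing a default
//		return fmt.Errorf("stop job: %w", err)
//	}
//	input := Input{GameID: payload.GameID, Reason: reason, OpSource: opSource, SourceRef: entryID}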