feat(rtmanager): restart runtime service
@@ -0,0 +1,482 @@
// Package restartruntime implements the `restart` lifecycle operation
// owned by Runtime Manager. Restart is a recreate: under one outer
// per-game lease the service runs the stop service, removes the
// container with `docker rm`, and runs the start service with the
// runtime's current `image_ref`. The hostname / engine endpoint stays
// stable across the recreate; `container_id` changes.
//
// Lifecycle and failure-mode semantics follow `rtmanager/README.md
// §Lifecycles → Restart`. Design rationale is captured in
// `rtmanager/docs/services.md`, in particular the lease-sharing
// pattern with `startruntime.Service.Run` / `stopruntime.Service.Run`,
// the correlation-id reuse on `source_ref`, and the
// inner-stop-then-rm-failure recovery rule.
package restartruntime

import (
	"context"
	"crypto/rand"
	"encoding/base64"
	"errors"
	"fmt"
	"log/slog"
	"strings"
	"time"

	"galaxy/rtmanager/internal/config"
	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/logging"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/startruntime"
	"galaxy/rtmanager/internal/service/stopruntime"
	"galaxy/rtmanager/internal/telemetry"
)

// leaseReleaseTimeout bounds the deferred lease-release call.
const leaseReleaseTimeout = 5 * time.Second

// Input stores the per-call arguments for one restart operation.
type Input struct {
	// GameID identifies the platform game to restart.
	GameID string

	// OpSource classifies how the request entered Runtime Manager.
	// Required: every operation_log entry carries an op_source.
	OpSource operation.OpSource

	// SourceRef stores the optional opaque per-source reference (REST
	// request id, admin user id). When non-empty it is reused as the
	// correlation id linking the outer restart entry to the inner stop
	// and start log entries.
	SourceRef string
}
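
// Illustrative only: an Input literal for a REST-triggered restart,
// mirroring the values used in this package's tests. SourceRef doubles
// as the correlation id stamped on the inner stop and start entries.
//
//	input := restartruntime.Input{
//		GameID:    "game-1",
//		OpSource:  operation.OpSourceGMRest,
//		SourceRef: "rest-req-42",
//	}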

// Validate returns a non-nil error when input violates the structural
// invariants the service requires.
func (input Input) Validate() error {
	if strings.TrimSpace(input.GameID) == "" {
		return fmt.Errorf("game id must not be empty")
	}
	if !input.OpSource.IsKnown() {
		return fmt.Errorf("op source %q is unsupported", input.OpSource)
	}
	return nil
}

// Result stores the deterministic outcome of one Handle call.
type Result struct {
	// Record carries the runtime record installed by the inner start on
	// success; zero on failure.
	Record runtime.RuntimeRecord

	// Outcome reports whether the operation completed (success) or
	// produced a stable failure code.
	Outcome operation.Outcome

	// ErrorCode stores the stable error code on failure. Empty for
	// success.
	ErrorCode string

	// ErrorMessage stores the operator-readable detail on failure.
	// Empty for success.
	ErrorMessage string
}

// Dependencies groups the collaborators required by Service.
type Dependencies struct {
	// RuntimeRecords reads the runtime record at the start of restart
	// to capture the current image_ref and container_id.
	RuntimeRecords ports.RuntimeRecordStore

	// OperationLogs records the outer restart audit entry. Inner stop
	// and start services append their own entries through their own
	// stores.
	OperationLogs ports.OperationLogStore

	// Docker drives the docker rm step between the inner stop and
	// inner start.
	Docker ports.DockerClient

	// Leases serialises operations against the same game id. The outer
	// lease is held for the entire stop + rm + start sequence.
	Leases ports.GameLeaseStore

	// StopService runs the inner stop step under the outer lease.
	StopService *stopruntime.Service

	// StartService runs the inner start step under the outer lease.
	StartService *startruntime.Service

	// Coordination supplies the per-game lease TTL.
	Coordination config.CoordinationConfig

	// Telemetry records restart outcomes and lease latency. Required.
	Telemetry *telemetry.Runtime

	// Logger records structured service-level events. Defaults to
	// `slog.Default()` when nil.
	Logger *slog.Logger

	// Clock supplies the wall-clock used for operation timestamps.
	// Defaults to `time.Now` when nil.
	Clock func() time.Time

	// NewToken supplies a unique opaque token. Used both for the lease
	// and for the correlation id when Input.SourceRef is empty.
	// Defaults to a 32-byte random base64url string when nil.
	NewToken func() string
}

// Service executes the restart lifecycle operation.
type Service struct {
	runtimeRecords ports.RuntimeRecordStore
	operationLogs  ports.OperationLogStore
	docker         ports.DockerClient
	leases         ports.GameLeaseStore
	stopService    *stopruntime.Service
	startService   *startruntime.Service

	leaseTTL time.Duration

	telemetry *telemetry.Runtime
	logger    *slog.Logger

	clock    func() time.Time
	newToken func() string
}

// NewService constructs one Service from deps.
func NewService(deps Dependencies) (*Service, error) {
	switch {
	case deps.RuntimeRecords == nil:
		return nil, errors.New("new restart runtime service: nil runtime records")
	case deps.OperationLogs == nil:
		return nil, errors.New("new restart runtime service: nil operation logs")
	case deps.Docker == nil:
		return nil, errors.New("new restart runtime service: nil docker client")
	case deps.Leases == nil:
		return nil, errors.New("new restart runtime service: nil lease store")
	case deps.StopService == nil:
		return nil, errors.New("new restart runtime service: nil stop service")
	case deps.StartService == nil:
		return nil, errors.New("new restart runtime service: nil start service")
	case deps.Telemetry == nil:
		return nil, errors.New("new restart runtime service: nil telemetry runtime")
	}
	if err := deps.Coordination.Validate(); err != nil {
		return nil, fmt.Errorf("new restart runtime service: coordination config: %w", err)
	}

	clock := deps.Clock
	if clock == nil {
		clock = time.Now
	}
	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}
	logger = logger.With("service", "rtmanager.restartruntime")

	newToken := deps.NewToken
	if newToken == nil {
		newToken = defaultTokenGenerator()
	}

	return &Service{
		runtimeRecords: deps.RuntimeRecords,
		operationLogs:  deps.OperationLogs,
		docker:         deps.Docker,
		leases:         deps.Leases,
		stopService:    deps.StopService,
		startService:   deps.StartService,
		leaseTTL:       deps.Coordination.GameLeaseTTL,
		telemetry:      deps.Telemetry,
		logger:         logger,
		clock:          clock,
		newToken:       newToken,
	}, nil
}
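
// Minimal wiring sketch (illustrative; the variable names are
// assumptions, and any implementations of the ports interfaces work):
//
//	service, err := restartruntime.NewService(restartruntime.Dependencies{
//		RuntimeRecords: records,
//		OperationLogs:  logs,
//		Docker:         dockerClient,
//		Leases:         leases,
//		StopService:    stopSvc,
//		StartService:   startSvc,
//		Coordination:   config.CoordinationConfig{GameLeaseTTL: time.Minute},
//		Telemetry:      tel,
//	})
//
// Logger, Clock, and NewToken may stay nil; NewService falls back to
// slog.Default, time.Now, and the random token generator.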

// Handle executes one restart operation end-to-end. The Go-level error
// return is reserved for non-business failures (nil context, nil
// receiver). Every business outcome — success or any of the stable
// failure codes — flows through Result.
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
	if service == nil {
		return Result{}, errors.New("restart runtime: nil service")
	}
	if ctx == nil {
		return Result{}, errors.New("restart runtime: nil context")
	}

	opStartedAt := service.clock().UTC()

	if err := input.Validate(); err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInvalidRequest,
			errorMessage: err.Error(),
		}), nil
	}

	token := service.newToken()
	leaseStart := service.clock()
	acquired, err := service.leases.TryAcquire(ctx, input.GameID, token, service.leaseTTL)
	service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(leaseStart))
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeServiceUnavailable,
			errorMessage: fmt.Sprintf("acquire game lease: %s", err.Error()),
		}), nil
	}
	if !acquired {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeConflict,
			errorMessage: "another lifecycle operation is in progress for this game",
		}), nil
	}
	defer service.releaseLease(ctx, input.GameID, token)

	return service.runUnderLease(ctx, input, opStartedAt)
}
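
// Caller-side sketch (illustrative, not part of this package): the Go
// error is reserved for programming mistakes, so callers branch on
// Result rather than on err.
//
//	result, err := service.Handle(ctx, input)
//	if err != nil {
//		// nil service or nil ctx; a programming error, not a business failure
//	}
//	switch result.Outcome {
//	case operation.OutcomeSuccess:
//		// result.Record carries the new container_id and the stable endpoint
//	case operation.OutcomeFailure:
//		// result.ErrorCode holds a stable code such as
//		// startruntime.ErrorCodeConflict or startruntime.ErrorCodeNotFound
//	}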

// runUnderLease executes the lease-protected restart sequence. Loads
// the runtime record, runs inner stop, removes the container, runs
// inner start.
func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) {
	existing, err := service.runtimeRecords.Get(ctx, input.GameID)
	if errors.Is(err, runtime.ErrNotFound) {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeNotFound,
			errorMessage: fmt.Sprintf("runtime record for game %q does not exist", input.GameID),
		}), nil
	}
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInternal,
			errorMessage: fmt.Sprintf("load runtime record: %s", err.Error()),
		}), nil
	}
	if existing.Status == runtime.StatusRemoved {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeConflict,
			errorMessage: fmt.Sprintf("runtime for game %q is removed; cannot restart", input.GameID),
			imageRef:     existing.CurrentImageRef,
		}), nil
	}
	if strings.TrimSpace(existing.CurrentImageRef) == "" {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInternal,
			errorMessage: fmt.Sprintf("runtime record for game %q has no image_ref to restart with", input.GameID),
		}), nil
	}

	correlationRef := input.SourceRef
	if correlationRef == "" {
		correlationRef = service.newToken()
	}
	containerID := existing.CurrentContainerID
	imageRef := existing.CurrentImageRef

	stopResult, err := service.stopService.Run(ctx, stopruntime.Input{
		GameID:    input.GameID,
		Reason:    stopruntime.StopReasonAdminRequest,
		OpSource:  input.OpSource,
		SourceRef: correlationRef,
	})
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInternal,
			errorMessage: fmt.Sprintf("inner stop: %s", err.Error()),
			imageRef:     imageRef,
			containerID:  containerID,
		}), nil
	}
	if stopResult.Outcome == operation.OutcomeFailure {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    stopResult.ErrorCode,
			errorMessage: fmt.Sprintf("inner stop failed: %s", stopResult.ErrorMessage),
			imageRef:     imageRef,
			containerID:  containerID,
		}), nil
	}

	if containerID != "" {
		if err := service.docker.Remove(ctx, containerID); err != nil {
			return service.recordFailure(ctx, failureCtx{
				opStartedAt:  opStartedAt,
				input:        input,
				errorCode:    startruntime.ErrorCodeServiceUnavailable,
				errorMessage: fmt.Sprintf("docker remove: %s", err.Error()),
				imageRef:     imageRef,
				containerID:  containerID,
			}), nil
		}
	}

	startResult, err := service.startService.Run(ctx, startruntime.Input{
		GameID:    input.GameID,
		ImageRef:  imageRef,
		OpSource:  input.OpSource,
		SourceRef: correlationRef,
	})
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInternal,
			errorMessage: fmt.Sprintf("inner start: %s", err.Error()),
			imageRef:     imageRef,
		}), nil
	}
	if startResult.Outcome == operation.OutcomeFailure {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startResult.ErrorCode,
			errorMessage: fmt.Sprintf("inner start failed: %s", startResult.ErrorMessage),
			imageRef:     imageRef,
		}), nil
	}

	finishedAt := service.clock().UTC()
	service.bestEffortAppend(ctx, operation.OperationEntry{
		GameID:      input.GameID,
		OpKind:      operation.OpKindRestart,
		OpSource:    input.OpSource,
		SourceRef:   correlationRef,
		ImageRef:    imageRef,
		ContainerID: startResult.Record.CurrentContainerID,
		Outcome:     operation.OutcomeSuccess,
		StartedAt:   opStartedAt,
		FinishedAt:  &finishedAt,
	})
	service.telemetry.RecordRestartOutcome(ctx, string(operation.OutcomeSuccess), "")

	logArgs := []any{
		"game_id", input.GameID,
		"prev_container_id", containerID,
		"new_container_id", startResult.Record.CurrentContainerID,
		"image_ref", imageRef,
		"op_source", string(input.OpSource),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	service.logger.InfoContext(ctx, "runtime restarted", logArgs...)

	return Result{
		Record:  startResult.Record,
		Outcome: operation.OutcomeSuccess,
	}, nil
}
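
// For reference, one successful restart leaves three operation_log
// entries that share one source_ref; the values below mirror this
// package's happy-path test:
//
//	op_kind=stop     source_ref=rest-req-42
//	op_kind=start    source_ref=rest-req-42
//	op_kind=restart  source_ref=rest-req-42  container_id=ctr-new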

// failureCtx groups the inputs to recordFailure.
type failureCtx struct {
	opStartedAt  time.Time
	input        Input
	errorCode    string
	errorMessage string
	imageRef     string
	containerID  string
}

// recordFailure records the outer failure operation_log entry and emits
// telemetry. Inner stop / start services have already recorded their
// own entries; this is the outer summary.
func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result {
	finishedAt := service.clock().UTC()
	service.bestEffortAppend(ctx, operation.OperationEntry{
		GameID:       fc.input.GameID,
		OpKind:       operation.OpKindRestart,
		OpSource:     fc.input.OpSource,
		SourceRef:    correlationRefOrEmpty(fc.input),
		ImageRef:     fc.imageRef,
		ContainerID:  fc.containerID,
		Outcome:      operation.OutcomeFailure,
		ErrorCode:    fc.errorCode,
		ErrorMessage: fc.errorMessage,
		StartedAt:    fc.opStartedAt,
		FinishedAt:   &finishedAt,
	})
	service.telemetry.RecordRestartOutcome(ctx, string(operation.OutcomeFailure), fc.errorCode)

	logArgs := []any{
		"game_id", fc.input.GameID,
		"image_ref", fc.imageRef,
		"op_source", string(fc.input.OpSource),
		"error_code", fc.errorCode,
		"error_message", fc.errorMessage,
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	service.logger.WarnContext(ctx, "runtime restart failed", logArgs...)

	return Result{
		Outcome:      operation.OutcomeFailure,
		ErrorCode:    fc.errorCode,
		ErrorMessage: fc.errorMessage,
	}
}

// correlationRefOrEmpty returns the ref recorded on the outer failure
// entry. Failure paths that never generated a correlation id (input
// validation, lease acquisition) keep the original Input.SourceRef,
// the actor ref, which may be empty.
func correlationRefOrEmpty(input Input) string {
	return input.SourceRef
}

// releaseLease releases the per-game lease in a fresh background
// context, so the release still runs when the caller's ctx is already
// canceled; the incoming ctx is kept only for log correlation.
func (service *Service) releaseLease(ctx context.Context, gameID, token string) {
	cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
	defer cancel()
	if err := service.leases.Release(cleanupCtx, gameID, token); err != nil {
		service.logger.WarnContext(ctx, "release game lease",
			"game_id", gameID,
			"err", err.Error(),
		)
	}
}

// bestEffortAppend writes one outer operation_log entry. Inner ops have
// already appended their own; a failure here only loses the outer
// summary, which is acceptable.
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
	if _, err := service.operationLogs.Append(ctx, entry); err != nil {
		service.logger.ErrorContext(ctx, "append operation log",
			"game_id", entry.GameID,
			"op_kind", string(entry.OpKind),
			"outcome", string(entry.Outcome),
			"error_code", entry.ErrorCode,
			"err", err.Error(),
		)
	}
}

// defaultTokenGenerator returns a function that encodes 32 random
// bytes as a base64url token.
func defaultTokenGenerator() func() string {
	return func() string {
		var buf [32]byte
		if _, err := rand.Read(buf[:]); err != nil {
			return "rtmanager-fallback-token"
		}
		return base64.RawURLEncoding.EncodeToString(buf[:])
	}
}
@@ -0,0 +1,584 @@
package restartruntime_test

import (
	"context"
	"errors"
	"sync"
	"testing"
	"time"

	"galaxy/notificationintent"
	"galaxy/rtmanager/internal/adapters/docker/mocks"
	"galaxy/rtmanager/internal/config"
	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/restartruntime"
	"galaxy/rtmanager/internal/service/startruntime"
	"galaxy/rtmanager/internal/service/stopruntime"
	"galaxy/rtmanager/internal/telemetry"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"go.uber.org/mock/gomock"
)

// --- shared fake doubles ----------------------------------------------

type fakeRuntimeRecords struct {
	mu sync.Mutex

	stored          map[string]runtime.RuntimeRecord
	getErr          error
	upsertErr       error
	updateStatusErr error

	upserts []runtime.RuntimeRecord
	updates []ports.UpdateStatusInput
}

func newFakeRuntimeRecords() *fakeRuntimeRecords {
	return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}}
}

func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.getErr != nil {
		return runtime.RuntimeRecord{}, s.getErr
	}
	record, ok := s.stored[gameID]
	if !ok {
		return runtime.RuntimeRecord{}, runtime.ErrNotFound
	}
	return record, nil
}

func (s *fakeRuntimeRecords) Upsert(_ context.Context, record runtime.RuntimeRecord) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.upsertErr != nil {
		return s.upsertErr
	}
	s.upserts = append(s.upserts, record)
	s.stored[record.GameID] = record
	return nil
}

func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, input ports.UpdateStatusInput) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.updates = append(s.updates, input)
	if s.updateStatusErr != nil {
		return s.updateStatusErr
	}
	record, ok := s.stored[input.GameID]
	if !ok {
		return runtime.ErrNotFound
	}
	if record.Status != input.ExpectedFrom {
		return runtime.ErrConflict
	}
	if input.ExpectedContainerID != "" && record.CurrentContainerID != input.ExpectedContainerID {
		return runtime.ErrConflict
	}
	record.Status = input.To
	record.LastOpAt = input.Now
	switch input.To {
	case runtime.StatusStopped:
		stoppedAt := input.Now
		record.StoppedAt = &stoppedAt
	case runtime.StatusRemoved:
		removedAt := input.Now
		record.RemovedAt = &removedAt
		record.CurrentContainerID = ""
	}
	s.stored[input.GameID] = record
	return nil
}

func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) {
	return nil, errors.New("not used in restart tests")
}

func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
	return nil, errors.New("not used in restart tests")
}

type fakeOperationLogs struct {
	mu sync.Mutex

	appendErr error
	appends   []operation.OperationEntry
}

func (s *fakeOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.appendErr != nil {
		return 0, s.appendErr
	}
	s.appends = append(s.appends, entry)
	return int64(len(s.appends)), nil
}

func (s *fakeOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) {
	return nil, errors.New("not used in restart tests")
}

func (s *fakeOperationLogs) byKind(kind operation.OpKind) []operation.OperationEntry {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := []operation.OperationEntry{}
	for _, entry := range s.appends {
		if entry.OpKind == kind {
			out = append(out, entry)
		}
	}
	return out
}

type fakeLeases struct {
	mu sync.Mutex

	acquired   bool
	acquireErr error
	releaseErr error

	acquires []string
	releases []string
}

func (l *fakeLeases) TryAcquire(_ context.Context, _, token string, _ time.Duration) (bool, error) {
	l.mu.Lock()
	defer l.mu.Unlock()
	l.acquires = append(l.acquires, token)
	if l.acquireErr != nil {
		return false, l.acquireErr
	}
	return l.acquired, nil
}

func (l *fakeLeases) Release(_ context.Context, _, token string) error {
	l.mu.Lock()
	defer l.mu.Unlock()
	l.releases = append(l.releases, token)
	return l.releaseErr
}

type fakeHealthEvents struct {
	mu sync.Mutex

	publishErr error
	envelopes  []ports.HealthEventEnvelope
}

func (h *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error {
	h.mu.Lock()
	defer h.mu.Unlock()
	if h.publishErr != nil {
		return h.publishErr
	}
	h.envelopes = append(h.envelopes, envelope)
	return nil
}

type fakeNotifications struct {
	mu sync.Mutex

	publishErr error
	intents    []notificationintent.Intent
}

func (n *fakeNotifications) Publish(_ context.Context, intent notificationintent.Intent) error {
	n.mu.Lock()
	defer n.mu.Unlock()
	if n.publishErr != nil {
		return n.publishErr
	}
	n.intents = append(n.intents, intent)
	return nil
}

type fakeLobby struct {
	record ports.LobbyGameRecord
	err    error
}

func (l *fakeLobby) GetGame(_ context.Context, _ string) (ports.LobbyGameRecord, error) {
	if l.err != nil {
		return ports.LobbyGameRecord{}, l.err
	}
	return l.record, nil
}

// --- harness ----------------------------------------------------------

type harness struct {
	records       *fakeRuntimeRecords
	operationLogs *fakeOperationLogs
	docker        *mocks.MockDockerClient
	leases        *fakeLeases
	healthEvents  *fakeHealthEvents
	notifications *fakeNotifications
	lobby         *fakeLobby
	telemetry     *telemetry.Runtime

	now      time.Time
	stateDir string

	startService *startruntime.Service
	stopService  *stopruntime.Service
}

func newHarness(t *testing.T) *harness {
	t.Helper()
	ctrl := gomock.NewController(t)
	t.Cleanup(ctrl.Finish)

	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)

	h := &harness{
		records:       newFakeRuntimeRecords(),
		operationLogs: &fakeOperationLogs{},
		docker:        mocks.NewMockDockerClient(ctrl),
		leases:        &fakeLeases{acquired: true},
		healthEvents:  &fakeHealthEvents{},
		notifications: &fakeNotifications{},
		lobby:         &fakeLobby{},
		telemetry:     telemetryRuntime,
		now:           time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC),
		stateDir:      "/var/lib/galaxy/games/game-1",
	}

	containerCfg := config.ContainerConfig{
		DefaultCPUQuota:      1.0,
		DefaultMemory:        "512m",
		DefaultPIDsLimit:     512,
		StopTimeout:          30 * time.Second,
		Retention:            30 * 24 * time.Hour,
		EngineStateMountPath: "/var/lib/galaxy-game",
		EngineStateEnvName:   "GAME_STATE_PATH",
		GameStateDirMode:     0o750,
		GameStateRoot:        "/var/lib/galaxy/games",
	}
	dockerCfg := config.DockerConfig{
		Host:       "unix:///var/run/docker.sock",
		Network:    "galaxy-net",
		LogDriver:  "json-file",
		PullPolicy: config.ImagePullPolicyIfMissing,
	}
	coordinationCfg := config.CoordinationConfig{GameLeaseTTL: time.Minute}

	startService, err := startruntime.NewService(startruntime.Dependencies{
		RuntimeRecords:  h.records,
		OperationLogs:   h.operationLogs,
		Docker:          h.docker,
		Leases:          h.leases,
		HealthEvents:    h.healthEvents,
		Notifications:   h.notifications,
		Lobby:           h.lobby,
		Container:       containerCfg,
		DockerCfg:       dockerCfg,
		Coordination:    coordinationCfg,
		Telemetry:       h.telemetry,
		Clock:           func() time.Time { return h.now },
		NewToken:        func() string { return "inner-start-token" },
		PrepareStateDir: func(_ string) (string, error) { return h.stateDir, nil },
	})
	require.NoError(t, err)
	h.startService = startService

	stopService, err := stopruntime.NewService(stopruntime.Dependencies{
		RuntimeRecords: h.records,
		OperationLogs:  h.operationLogs,
		Docker:         h.docker,
		Leases:         h.leases,
		HealthEvents:   h.healthEvents,
		Container:      containerCfg,
		Coordination:   coordinationCfg,
		Telemetry:      h.telemetry,
		Clock:          func() time.Time { return h.now },
		NewToken:       func() string { return "inner-stop-token" },
	})
	require.NoError(t, err)
	h.stopService = stopService

	return h
}

func (h *harness) build(t *testing.T, tokens ...string) *restartruntime.Service {
	t.Helper()
	tokenIdx := 0
	tokenGen := func() string {
		if tokenIdx >= len(tokens) {
			return "outer-fallback"
		}
		token := tokens[tokenIdx]
		tokenIdx++
		return token
	}
	service, err := restartruntime.NewService(restartruntime.Dependencies{
		RuntimeRecords: h.records,
		OperationLogs:  h.operationLogs,
		Docker:         h.docker,
		Leases:         h.leases,
		StopService:    h.stopService,
		StartService:   h.startService,
		Coordination:   config.CoordinationConfig{GameLeaseTTL: time.Minute},
		Telemetry:      h.telemetry,
		Clock:          func() time.Time { return h.now },
		NewToken:       tokenGen,
	})
	require.NoError(t, err)
	return service
}

const imageRef = "registry.example.com/galaxy/game:1.4.7"

func runningRecord(now time.Time) runtime.RuntimeRecord {
	startedAt := now.Add(-time.Hour)
	return runtime.RuntimeRecord{
		GameID:             "game-1",
		Status:             runtime.StatusRunning,
		CurrentContainerID: "ctr-old",
		CurrentImageRef:    imageRef,
		EngineEndpoint:     "http://galaxy-game-game-1:8080",
		StatePath:          "/var/lib/galaxy/games/game-1",
		DockerNetwork:      "galaxy-net",
		StartedAt:          &startedAt,
		LastOpAt:           startedAt,
		CreatedAt:          startedAt,
	}
}

func basicInput() restartruntime.Input {
	return restartruntime.Input{
		GameID:    "game-1",
		OpSource:  operation.OpSourceGMRest,
		SourceRef: "rest-req-42",
	}
}

func sampleRunResult(now time.Time) ports.RunResult {
	return ports.RunResult{
		ContainerID:    "ctr-new",
		EngineEndpoint: "http://galaxy-game-game-1:8080",
		StartedAt:      now,
	}
}

func expectInnerStart(h *harness) {
	h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
	h.docker.EXPECT().PullImage(gomock.Any(), imageRef, gomock.Any()).Return(nil)
	h.docker.EXPECT().InspectImage(gomock.Any(), imageRef).Return(ports.ImageInspect{Ref: imageRef}, nil)
	h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(sampleRunResult(h.now), nil)
}

// --- happy path -------------------------------------------------------

func TestHandleRestartFromRunning(t *testing.T) {
	h := newHarness(t)
	h.records.stored["game-1"] = runningRecord(h.now)

	h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
	h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
	expectInnerStart(h)

	service := h.build(t, "outer-token")
	result, err := service.Handle(context.Background(), basicInput())
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
	assert.Empty(t, result.ErrorCode)
	assert.Equal(t, "ctr-new", result.Record.CurrentContainerID)
	assert.Equal(t, imageRef, result.Record.CurrentImageRef)
	assert.Equal(t, runtime.StatusRunning, result.Record.Status)

	stops := h.operationLogs.byKind(operation.OpKindStop)
	starts := h.operationLogs.byKind(operation.OpKindStart)
	restarts := h.operationLogs.byKind(operation.OpKindRestart)
	require.Len(t, stops, 1, "inner stop appended its own entry")
	require.Len(t, starts, 1, "inner start appended its own entry")
	require.Len(t, restarts, 1, "outer restart appended one summary entry")

	assert.Equal(t, "rest-req-42", stops[0].SourceRef, "correlation id propagated to inner stop")
	assert.Equal(t, "rest-req-42", starts[0].SourceRef, "correlation id propagated to inner start")
	assert.Equal(t, "rest-req-42", restarts[0].SourceRef, "correlation id stored on outer restart")
	assert.Equal(t, "ctr-new", restarts[0].ContainerID)
	assert.Equal(t, imageRef, restarts[0].ImageRef)

	assert.Equal(t, []string{"outer-token"}, h.leases.acquires)
	assert.Equal(t, []string{"outer-token"}, h.leases.releases)
}

func TestHandleRestartFromStopped(t *testing.T) {
	h := newHarness(t)
	stoppedRecord := runningRecord(h.now)
	stoppedRecord.Status = runtime.StatusStopped
	stoppedAt := h.now.Add(-30 * time.Minute)
	stoppedRecord.StoppedAt = &stoppedAt
	h.records.stored["game-1"] = stoppedRecord

	// No docker.Stop expectation: the runtime is already stopped, so
	// the inner stop short-circuits as a replay no-op.
	h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
	expectInnerStart(h)

	service := h.build(t, "outer-token")
	result, err := service.Handle(context.Background(), basicInput())
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
	assert.Equal(t, "ctr-new", result.Record.CurrentContainerID)
}

// --- correlation id fallback -----------------------------------------

func TestHandleGeneratesCorrelationWhenSourceRefEmpty(t *testing.T) {
	h := newHarness(t)
	h.records.stored["game-1"] = runningRecord(h.now)

	h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
	h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
	expectInnerStart(h)

	input := basicInput()
	input.SourceRef = ""

	// First newToken call yields the lease token, second yields the
	// correlation id fallback.
	service := h.build(t, "outer-token", "correlation-fallback")
	result, err := service.Handle(context.Background(), input)
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeSuccess, result.Outcome)

	stops := h.operationLogs.byKind(operation.OpKindStop)
	starts := h.operationLogs.byKind(operation.OpKindStart)
	restarts := h.operationLogs.byKind(operation.OpKindRestart)
	require.Len(t, stops, 1)
	require.Len(t, starts, 1)
	require.Len(t, restarts, 1)
	assert.Equal(t, "correlation-fallback", stops[0].SourceRef)
	assert.Equal(t, "correlation-fallback", starts[0].SourceRef)
	assert.Equal(t, "correlation-fallback", restarts[0].SourceRef)
}

// --- failure paths ---------------------------------------------------

func TestHandleNotFoundForMissingRecord(t *testing.T) {
	h := newHarness(t)
	service := h.build(t, "outer-token")

	result, err := service.Handle(context.Background(), basicInput())
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeFailure, result.Outcome)
	assert.Equal(t, startruntime.ErrorCodeNotFound, result.ErrorCode)
	assert.Empty(t, h.operationLogs.byKind(operation.OpKindStop))
	assert.Empty(t, h.operationLogs.byKind(operation.OpKindStart))
	require.Len(t, h.operationLogs.byKind(operation.OpKindRestart), 1)
}

func TestHandleConflictForRemovedRecord(t *testing.T) {
	h := newHarness(t)
	removed := runningRecord(h.now)
	removed.Status = runtime.StatusRemoved
	removed.CurrentContainerID = ""
	removedAt := h.now.Add(-time.Hour)
	removed.RemovedAt = &removedAt
	h.records.stored["game-1"] = removed

	service := h.build(t, "outer-token")
	result, err := service.Handle(context.Background(), basicInput())
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeFailure, result.Outcome)
	assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)
}

func TestHandleConflictWhenLeaseBusy(t *testing.T) {
	h := newHarness(t)
	h.leases.acquired = false

	service := h.build(t, "outer-token")
	result, err := service.Handle(context.Background(), basicInput())
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeFailure, result.Outcome)
	assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)
	assert.Empty(t, h.leases.releases, "release must not run when acquire returned false")
}

func TestHandlePropagatesInnerStopFailure(t *testing.T) {
	h := newHarness(t)
	h.records.stored["game-1"] = runningRecord(h.now)

	h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(errors.New("daemon unreachable"))

	service := h.build(t, "outer-token")
	result, err := service.Handle(context.Background(), basicInput())
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeFailure, result.Outcome)
	assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode)
	assert.Contains(t, result.ErrorMessage, "inner stop failed")
}

func TestHandleServiceUnavailableOnDockerRemoveFailure(t *testing.T) {
	h := newHarness(t)
	h.records.stored["game-1"] = runningRecord(h.now)

	h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
	h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(errors.New("disk i/o"))

	service := h.build(t, "outer-token")
	result, err := service.Handle(context.Background(), basicInput())
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeFailure, result.Outcome)
	assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode)
	assert.Contains(t, result.ErrorMessage, "docker remove")
	// The inner stop succeeded and wrote its own log entry; the outer
	// restart entry records the failure.
	require.Len(t, h.operationLogs.byKind(operation.OpKindStop), 1)
	require.Len(t, h.operationLogs.byKind(operation.OpKindRestart), 1)
}

func TestHandlePropagatesInnerStartFailure(t *testing.T) {
	h := newHarness(t)
	h.records.stored["game-1"] = runningRecord(h.now)

	h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
	h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
	h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
	h.docker.EXPECT().PullImage(gomock.Any(), imageRef, gomock.Any()).Return(errors.New("manifest unknown"))

	service := h.build(t, "outer-token")
	result, err := service.Handle(context.Background(), basicInput())
	require.NoError(t, err)
	assert.Equal(t, operation.OutcomeFailure, result.Outcome)
	assert.Equal(t, startruntime.ErrorCodeImagePullFailed, result.ErrorCode)
	assert.Contains(t, result.ErrorMessage, "inner start failed")
}

// --- input validation ------------------------------------------------

func TestHandleRejectsInvalidInput(t *testing.T) {
	h := newHarness(t)
	service := h.build(t, "outer-token")

	cases := []restartruntime.Input{
		{GameID: "", OpSource: operation.OpSourceGMRest},
		{GameID: "g", OpSource: operation.OpSource("bogus")},
	}
	for _, input := range cases {
		result, err := service.Handle(context.Background(), input)
		require.NoError(t, err)
		assert.Equal(t, startruntime.ErrorCodeInvalidRequest, result.ErrorCode)
	}
}

// --- constructor -----------------------------------------------------

func TestNewServiceRejectsMissingDependencies(t *testing.T) {
	h := newHarness(t)
	deps := restartruntime.Dependencies{
		Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
		Telemetry:    h.telemetry,
	}
	_, err := restartruntime.NewService(deps)
	require.Error(t, err)
}