Files
galaxy-game/rtmanager/internal/service/restartruntime/service.go
T
2026-04-28 20:39:18 +02:00

483 lines
16 KiB
Go

// Package restartruntime implements the `restart` lifecycle operation
// owned by Runtime Manager. Restart is a recreate: under one outer
// per-game lease the service runs the stop service, removes the
// container with `docker rm`, and runs the start service with the
// runtime's current `image_ref`. The hostname / engine endpoint stays
// stable across the recreate; `container_id` changes.
//
// Lifecycle and failure-mode semantics follow `rtmanager/README.md
// §Lifecycles → Restart`. Design rationale is captured in
// `rtmanager/docs/services.md`, in particular the lease-sharing
// pattern with `startruntime.Service.Run` / `stopruntime.Service.Run`,
// the correlation-id reuse on `source_ref`, and the
// inner-stop-then-rm-failure recovery rule.
package restartruntime
import (
"context"
"crypto/rand"
"encoding/base64"
"errors"
"fmt"
"log/slog"
"strings"
"time"
"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/logging"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/startruntime"
"galaxy/rtmanager/internal/service/stopruntime"
"galaxy/rtmanager/internal/telemetry"
)
// leaseReleaseTimeout bounds the deferred lease-release call. The
// release runs on a fresh background context, so this timeout is the
// only deadline applied to it.
const leaseReleaseTimeout = 5 * time.Second
// Input stores the per-call arguments for one restart operation.
// Validate checks GameID and OpSource; SourceRef is optional.
type Input struct {
// GameID identifies the platform game to restart. Validate rejects a
// value that is empty after trimming whitespace.
GameID string
// OpSource classifies how the request entered Runtime Manager.
// Required: every operation_log entry carries an op_source. Validate
// rejects a value for which IsKnown reports false.
OpSource operation.OpSource
// SourceRef stores the optional opaque per-source reference (REST
// request id, admin user id). When non-empty it is reused as the
// correlation id linking the outer restart entry to the inner stop
// and start log entries. When empty, the service generates a fresh
// token to correlate the inner stop and start entries.
SourceRef string
}
// Validate reports whether input carries the structural invariants the
// service requires: a GameID that is non-empty after trimming
// whitespace and an OpSource for which IsKnown reports true. It returns
// nil when both hold and a descriptive error otherwise. SourceRef is
// optional and is not inspected.
func (input Input) Validate() error {
	if strings.TrimSpace(input.GameID) == "" {
		// The message is constant, so errors.New is used instead of a
		// verb-less fmt.Errorf.
		return errors.New("game id must not be empty")
	}
	if !input.OpSource.IsKnown() {
		return fmt.Errorf("op source %q is unsupported", input.OpSource)
	}
	return nil
}
// Result stores the deterministic outcome of one Handle call. Business
// failures are reported through Outcome, ErrorCode, and ErrorMessage
// rather than through a Go error.
type Result struct {
// Record carries the runtime record installed by the inner start on
// success; zero on failure.
Record runtime.RuntimeRecord
// Outcome reports whether the operation completed (success) or
// produced a stable failure code.
Outcome operation.Outcome
// ErrorCode stores the stable error code on failure. Empty for
// success. On inner stop or start failure it is the inner service's
// own error code.
ErrorCode string
// ErrorMessage stores the operator-readable detail on failure.
// Empty for success.
ErrorMessage string
}
// Dependencies groups the collaborators required by Service. NewService
// rejects a nil RuntimeRecords, OperationLogs, Docker, Leases,
// StopService, StartService, or Telemetry, and an invalid Coordination;
// Logger, Clock, and NewToken fall back to defaults when nil.
type Dependencies struct {
// RuntimeRecords reads the runtime record at the start of restart
// to capture the current image_ref and container_id.
RuntimeRecords ports.RuntimeRecordStore
// OperationLogs records the outer restart audit entry. Inner stop
// and start services append their own entries through their own
// stores.
OperationLogs ports.OperationLogStore
// Docker drives the docker rm step between the inner stop and
// inner start.
Docker ports.DockerClient
// Leases serialises operations against the same game id. The outer
// lease is held for the entire stop + rm + start sequence.
Leases ports.GameLeaseStore
// StopService runs the inner stop step under the outer lease.
StopService *stopruntime.Service
// StartService runs the inner start step under the outer lease.
StartService *startruntime.Service
// Coordination supplies the per-game lease TTL. NewService calls its
// Validate method and reads GameLeaseTTL.
Coordination config.CoordinationConfig
// Telemetry records restart outcomes and lease latency. Required.
Telemetry *telemetry.Runtime
// Logger records structured service-level events. Defaults to
// `slog.Default()` when nil.
Logger *slog.Logger
// Clock supplies the wall-clock used for operation timestamps.
// Defaults to `time.Now` when nil.
Clock func() time.Time
// NewToken supplies a unique opaque token. Used both for the lease
// and for the correlation id when Input.SourceRef is empty.
// Defaults to a 32-byte random base64url string when nil.
NewToken func() string
}
// Service executes the restart lifecycle operation. Construct it with
// NewService, which validates the dependencies and fills in defaults.
type Service struct {
// runtimeRecords loads the runtime record at the start of a restart.
runtimeRecords ports.RuntimeRecordStore
// operationLogs receives the outer restart audit entry.
operationLogs ports.OperationLogStore
// docker removes the previous container between inner stop and start.
docker ports.DockerClient
// leases provides the per-game lease acquired by Handle.
leases ports.GameLeaseStore
// stopService runs the inner stop step.
stopService *stopruntime.Service
// startService runs the inner start step.
startService *startruntime.Service
// leaseTTL is copied from Dependencies.Coordination.GameLeaseTTL.
leaseTTL time.Duration
// telemetry records lease-acquire latency and restart outcomes.
telemetry *telemetry.Runtime
// logger is tagged with service=rtmanager.restartruntime by NewService.
logger *slog.Logger
// clock supplies timestamps; never nil after NewService.
clock func() time.Time
// newToken produces lease tokens and generated correlation ids;
// never nil after NewService.
newToken func() string
}
// NewService builds a Service from deps. Every required collaborator
// must be non-nil and the coordination config must validate; otherwise
// an error is returned. A nil Clock falls back to time.Now, a nil
// Logger to slog.Default(), and a nil NewToken to the default random
// token generator. The logger is always tagged with the service name.
func NewService(deps Dependencies) (*Service, error) {
	if deps.RuntimeRecords == nil {
		return nil, errors.New("new restart runtime service: nil runtime records")
	}
	if deps.OperationLogs == nil {
		return nil, errors.New("new restart runtime service: nil operation logs")
	}
	if deps.Docker == nil {
		return nil, errors.New("new restart runtime service: nil docker client")
	}
	if deps.Leases == nil {
		return nil, errors.New("new restart runtime service: nil lease store")
	}
	if deps.StopService == nil {
		return nil, errors.New("new restart runtime service: nil stop service")
	}
	if deps.StartService == nil {
		return nil, errors.New("new restart runtime service: nil start service")
	}
	if deps.Telemetry == nil {
		return nil, errors.New("new restart runtime service: nil telemetry runtime")
	}
	if err := deps.Coordination.Validate(); err != nil {
		return nil, fmt.Errorf("new restart runtime service: coordination config: %w", err)
	}

	// Start from the caller-supplied values and patch in defaults below.
	svc := &Service{
		runtimeRecords: deps.RuntimeRecords,
		operationLogs:  deps.OperationLogs,
		docker:         deps.Docker,
		leases:         deps.Leases,
		stopService:    deps.StopService,
		startService:   deps.StartService,
		leaseTTL:       deps.Coordination.GameLeaseTTL,
		telemetry:      deps.Telemetry,
		logger:         deps.Logger,
		clock:          deps.Clock,
		newToken:       deps.NewToken,
	}
	if svc.clock == nil {
		svc.clock = time.Now
	}
	if svc.logger == nil {
		svc.logger = slog.Default()
	}
	svc.logger = svc.logger.With("service", "rtmanager.restartruntime")
	if svc.newToken == nil {
		svc.newToken = defaultTokenGenerator()
	}
	return svc, nil
}
// Handle runs one restart operation from validation through the
// lease-protected stop / remove / start sequence. A non-nil Go error is
// returned only for a nil receiver or a nil context; every business
// outcome, success or failure, is reported through Result.
//
// Handle validates the input, acquires the per-game lease with a fresh
// token, records the lease-acquire latency, and delegates to
// runUnderLease. The lease is released when Handle returns.
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
	switch {
	case service == nil:
		return Result{}, errors.New("restart runtime: nil service")
	case ctx == nil:
		return Result{}, errors.New("restart runtime: nil context")
	}

	opStartedAt := service.clock().UTC()

	// reject records an outer failure entry for the pre-lease paths.
	reject := func(code, message string) (Result, error) {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    code,
			errorMessage: message,
		}), nil
	}

	if err := input.Validate(); err != nil {
		return reject(startruntime.ErrorCodeInvalidRequest, err.Error())
	}

	leaseToken := service.newToken()
	acquireStartedAt := service.clock()
	acquired, err := service.leases.TryAcquire(ctx, input.GameID, leaseToken, service.leaseTTL)
	// Latency is recorded whether or not the acquire succeeded.
	service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(acquireStartedAt))

	switch {
	case err != nil:
		return reject(
			startruntime.ErrorCodeServiceUnavailable,
			fmt.Sprintf("acquire game lease: %s", err.Error()),
		)
	case !acquired:
		return reject(
			startruntime.ErrorCodeConflict,
			"another lifecycle operation is in progress for this game",
		)
	}

	defer service.releaseLease(ctx, input.GameID, leaseToken)
	return service.runUnderLease(ctx, input, opStartedAt)
}
// runUnderLease executes the lease-protected restart sequence: load the
// runtime record, run the inner stop, remove the previous container (if
// the record names one), and run the inner start with the record's
// current image_ref.
//
// Every business failure is written as an outer operation_log entry via
// recordFailure and returned through Result; the error return is always
// nil. On success the outer entry carries the new container id and the
// Result carries the record returned by the inner start.
//
// The correlation id is Input.SourceRef when non-empty, otherwise a
// freshly generated token. Once chosen it is installed on the local
// input copy so that outer failure entries recorded after this point
// carry the same source_ref as the inner stop / start entries and the
// outer success entry.
func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) {
	// fail records the outer failure entry. It reads input at call
	// time, so failures after the correlation id is installed carry it.
	fail := func(code, message, imageRef, containerID string) (Result, error) {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    code,
			errorMessage: message,
			imageRef:     imageRef,
			containerID:  containerID,
		}), nil
	}

	existing, err := service.runtimeRecords.Get(ctx, input.GameID)
	if errors.Is(err, runtime.ErrNotFound) {
		return fail(
			startruntime.ErrorCodeNotFound,
			fmt.Sprintf("runtime record for game %q does not exist", input.GameID),
			"", "",
		)
	}
	if err != nil {
		return fail(
			startruntime.ErrorCodeInternal,
			fmt.Sprintf("load runtime record: %s", err.Error()),
			"", "",
		)
	}
	if existing.Status == runtime.StatusRemoved {
		return fail(
			startruntime.ErrorCodeConflict,
			fmt.Sprintf("runtime for game %q is removed; cannot restart", input.GameID),
			existing.CurrentImageRef, "",
		)
	}
	if strings.TrimSpace(existing.CurrentImageRef) == "" {
		return fail(
			startruntime.ErrorCodeInternal,
			fmt.Sprintf("runtime record for game %q has no image_ref to restart with", input.GameID),
			"", "",
		)
	}

	correlationRef := input.SourceRef
	if correlationRef == "" {
		correlationRef = service.newToken()
	}
	// input is a by-value copy; updating it only affects the outer
	// failure entries recorded below, which previously lost the
	// generated correlation id.
	input.SourceRef = correlationRef

	containerID := existing.CurrentContainerID
	imageRef := existing.CurrentImageRef

	stopResult, err := service.stopService.Run(ctx, stopruntime.Input{
		GameID:    input.GameID,
		Reason:    stopruntime.StopReasonAdminRequest,
		OpSource:  input.OpSource,
		SourceRef: correlationRef,
	})
	if err != nil {
		return fail(
			startruntime.ErrorCodeInternal,
			fmt.Sprintf("inner stop: %s", err.Error()),
			imageRef, containerID,
		)
	}
	if stopResult.Outcome == operation.OutcomeFailure {
		return fail(
			stopResult.ErrorCode,
			fmt.Sprintf("inner stop failed: %s", stopResult.ErrorMessage),
			imageRef, containerID,
		)
	}

	// A record without a container id has nothing to remove.
	if containerID != "" {
		if err := service.docker.Remove(ctx, containerID); err != nil {
			return fail(
				startruntime.ErrorCodeServiceUnavailable,
				fmt.Sprintf("docker remove: %s", err.Error()),
				imageRef, containerID,
			)
		}
	}

	startResult, err := service.startService.Run(ctx, startruntime.Input{
		GameID:    input.GameID,
		ImageRef:  imageRef,
		OpSource:  input.OpSource,
		SourceRef: correlationRef,
	})
	// Start failures omit the container id: the previous container has
	// already been removed at this point.
	if err != nil {
		return fail(
			startruntime.ErrorCodeInternal,
			fmt.Sprintf("inner start: %s", err.Error()),
			imageRef, "",
		)
	}
	if startResult.Outcome == operation.OutcomeFailure {
		return fail(
			startResult.ErrorCode,
			fmt.Sprintf("inner start failed: %s", startResult.ErrorMessage),
			imageRef, "",
		)
	}

	finishedAt := service.clock().UTC()
	service.bestEffortAppend(ctx, operation.OperationEntry{
		GameID:      input.GameID,
		OpKind:      operation.OpKindRestart,
		OpSource:    input.OpSource,
		SourceRef:   correlationRef,
		ImageRef:    imageRef,
		ContainerID: startResult.Record.CurrentContainerID,
		Outcome:     operation.OutcomeSuccess,
		StartedAt:   opStartedAt,
		FinishedAt:  &finishedAt,
	})
	service.telemetry.RecordRestartOutcome(ctx, string(operation.OutcomeSuccess), "")

	logArgs := []any{
		"game_id", input.GameID,
		"prev_container_id", containerID,
		"new_container_id", startResult.Record.CurrentContainerID,
		"image_ref", imageRef,
		"op_source", string(input.OpSource),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	service.logger.InfoContext(ctx, "runtime restarted", logArgs...)

	return Result{
		Record:  startResult.Record,
		Outcome: operation.OutcomeSuccess,
	}, nil
}
// failureCtx groups the inputs to recordFailure.
type failureCtx struct {
// opStartedAt becomes the StartedAt of the outer failure entry.
opStartedAt time.Time
// input supplies the GameID, OpSource, and SourceRef of the entry.
input Input
// errorCode is the stable failure code; also reported to telemetry
// and returned in Result.ErrorCode.
errorCode string
// errorMessage is the operator-readable detail returned in
// Result.ErrorMessage.
errorMessage string
// imageRef is the image reference to record; empty when not known.
imageRef string
// containerID is the container id to record; empty when not known
// or not applicable.
containerID string
}
// recordFailure writes the outer failure entry to the operation log,
// reports the failure to telemetry, emits a warning log line, and
// returns the failure Result built from fc. The operation-log append is
// best effort: an append error is logged and does not change the
// returned Result.
func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result {
	now := service.clock().UTC()

	entry := operation.OperationEntry{
		GameID:       fc.input.GameID,
		OpKind:       operation.OpKindRestart,
		OpSource:     fc.input.OpSource,
		SourceRef:    correlationRefOrEmpty(fc.input),
		ImageRef:     fc.imageRef,
		ContainerID:  fc.containerID,
		Outcome:      operation.OutcomeFailure,
		ErrorCode:    fc.errorCode,
		ErrorMessage: fc.errorMessage,
		StartedAt:    fc.opStartedAt,
		FinishedAt:   &now,
	}
	service.bestEffortAppend(ctx, entry)
	service.telemetry.RecordRestartOutcome(ctx, string(operation.OutcomeFailure), fc.errorCode)

	// Fixed attributes first, then whatever the context contributes.
	attrs := append([]any{
		"game_id", fc.input.GameID,
		"image_ref", fc.imageRef,
		"op_source", string(fc.input.OpSource),
		"error_code", fc.errorCode,
		"error_message", fc.errorMessage,
	}, logging.ContextAttrs(ctx)...)
	service.logger.WarnContext(ctx, "runtime restart failed", attrs...)

	var failed Result
	failed.Outcome = operation.OutcomeFailure
	failed.ErrorCode = fc.errorCode
	failed.ErrorMessage = fc.errorMessage
	return failed
}
// correlationRefOrEmpty returns input.SourceRef verbatim for use as the
// `source_ref` of the outer failure entry. The result is empty when the
// input carries no source ref; this function does not generate a
// correlation id itself.
func correlationRefOrEmpty(input Input) string {
return input.SourceRef
}
// releaseLease gives back the per-game lease identified by gameID and
// token. The release call runs on a new background context bounded by
// leaseReleaseTimeout rather than on ctx; ctx is used only for the
// warning logged when the release fails.
func (service *Service) releaseLease(ctx context.Context, gameID, token string) {
	releaseCtx, stop := context.WithTimeout(context.Background(), leaseReleaseTimeout)
	defer stop()

	err := service.leases.Release(releaseCtx, gameID, token)
	if err == nil {
		return
	}
	service.logger.WarnContext(ctx, "release game lease",
		"game_id", gameID,
		"err", err.Error(),
	)
}
// bestEffortAppend appends entry to the operation log as the outer
// restart summary. An append error is logged at error level and
// otherwise ignored, so callers never see it.
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
	_, err := service.operationLogs.Append(ctx, entry)
	if err == nil {
		return
	}
	service.logger.ErrorContext(ctx, "append operation log",
		"game_id", entry.GameID,
		"op_kind", string(entry.OpKind),
		"outcome", string(entry.Outcome),
		"error_code", entry.ErrorCode,
		"err", err.Error(),
	)
}
// defaultTokenGenerator returns a function that produces tokens made of
// 32 random bytes from crypto/rand, encoded as unpadded base64url (43
// characters).
//
// If the random source reports an error, the returned function falls
// back to "rtmanager-fallback-token-" followed by the current Unix time
// in nanoseconds. The timestamp suffix keeps fallback tokens from being
// one shared constant, which would let unrelated callers hold identical
// lease tokens and correlation ids.
func defaultTokenGenerator() func() string {
	return func() string {
		var buf [32]byte
		if _, err := rand.Read(buf[:]); err != nil {
			return fmt.Sprintf("rtmanager-fallback-token-%d", time.Now().UnixNano())
		}
		return base64.RawURLEncoding.EncodeToString(buf[:])
	}
}