// Package stopruntime implements the `stop` lifecycle operation owned by
// Runtime Manager. The service is the single orchestrator behind both
// the asynchronous `runtime:stop_jobs` consumer and the synchronous
// `POST /api/v1/internal/runtimes/{game_id}/stop` REST handler. It is
// also the inner stop step of the restart and patch services, which
// call Run while holding the outer per-game lease.
//
// Lifecycle and failure-mode semantics follow `rtmanager/README.md
// §Lifecycles → Stop`. Design rationale is captured in
// `rtmanager/docs/services.md`.
package stopruntime

import (
	"context"
	"crypto/rand"
	"encoding/base64"
	"encoding/json"
	"errors"
	"fmt"
	"log/slog"
	"strings"
	"time"

	"galaxy/rtmanager/internal/config"
	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/logging"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/startruntime"
	"galaxy/rtmanager/internal/telemetry"
)

// leaseReleaseTimeout bounds the deferred lease-release call. A fresh
// background context is used so the release runs even when the request
// context was already canceled.
const leaseReleaseTimeout = 5 * time.Second

// Input stores the per-call arguments for one stop operation.
type Input struct {
	// GameID identifies the platform game to stop.
	GameID string

	// Reason classifies the trigger of the stop. Required.
	Reason StopReason

	// OpSource classifies how the request entered Runtime Manager.
	// Required: every operation_log entry carries an op_source.
	OpSource operation.OpSource

	// SourceRef stores the optional opaque per-source reference (Redis
	// Stream entry id, REST request id, admin user id). Empty when the
	// caller does not provide one. For inner calls invoked by the
	// restart and patch orchestrators it carries the outer correlation
	// id so the three operation_log entries share it.
	SourceRef string
}

// Validate checks that input carries the structural invariants the
// service requires and returns a descriptive error when it does not.
func (input Input) Validate() error {
	if strings.TrimSpace(input.GameID) == "" {
		return fmt.Errorf("game id must not be empty")
	}
	if !input.OpSource.IsKnown() {
		return fmt.Errorf("op source %q is unsupported", input.OpSource)
	}
	if err := input.Reason.Validate(); err != nil {
		return err
	}
	return nil
}
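// Illustrative only: a sketch of the Input a transport-level caller might
// build. The literal values are hypothetical, and the StopReason /
// operation.OpSource constants are assumed to be defined elsewhere (they
// are not shown here). For an inner call made by the restart or patch
// orchestrator, SourceRef carries the outer correlation id instead of the
// transport reference.
//
//	input := Input{
//		GameID:    "game-1f2e", // hypothetical platform game id
//		Reason:    stopReason,  // a valid StopReason value (constant not shown)
//		OpSource:  opSource,    // a valid operation.OpSource value (constant not shown)
//		SourceRef: "req-8c41",  // e.g. the REST request id (optional)
//	}
//	if err := input.Validate(); err != nil {
//		// reject before touching the lease or the runtime record
//	}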
// Result stores the deterministic outcome of one Handle / Run call.
type Result struct {
	// Record carries the runtime record installed by the operation.
	// Populated on success and on idempotent replay; zero on failure.
	Record runtime.RuntimeRecord

	// Outcome reports whether the operation completed (success) or
	// produced a stable failure code.
	Outcome operation.Outcome

	// ErrorCode stores the stable error code on failure, or
	// `replay_no_op` on idempotent replay. Empty for fresh successes.
	ErrorCode string

	// ErrorMessage stores the operator-readable detail on failure.
	// Empty for successes.
	ErrorMessage string
}

// Dependencies groups the collaborators required by Service.
type Dependencies struct {
	// RuntimeRecords reads and updates the durable runtime record.
	RuntimeRecords ports.RuntimeRecordStore

	// OperationLogs records the success / failure audit entry.
	OperationLogs ports.OperationLogStore

	// Docker drives the Docker daemon (container stop).
	Docker ports.DockerClient

	// Leases serialises operations against the same game id.
	Leases ports.GameLeaseStore

	// HealthEvents publishes `runtime:health_events` and upserts the
	// matching `health_snapshots` row. Used on the vanished-container
	// path to emit `container_disappeared`.
	HealthEvents ports.HealthEventPublisher

	// Container groups the per-container settings consumed at stop time
	// (the graceful stop timeout).
	Container config.ContainerConfig

	// Coordination supplies the per-game lease TTL.
	Coordination config.CoordinationConfig

	// Telemetry records stop outcomes and lease latency. Required.
	Telemetry *telemetry.Runtime

	// Logger records structured service-level events. Defaults to
	// `slog.Default()` when nil.
	Logger *slog.Logger

	// Clock supplies the wall-clock used for operation timestamps.
	// Defaults to `time.Now` when nil.
	Clock func() time.Time

	// NewToken supplies a unique opaque lease token. Defaults to a
	// 32-byte random base64url string when nil. Tests may override.
	NewToken func() string
}

// Service executes the stop lifecycle operation.
type Service struct {
	runtimeRecords ports.RuntimeRecordStore
	operationLogs  ports.OperationLogStore
	docker         ports.DockerClient
	leases         ports.GameLeaseStore
	healthEvents   ports.HealthEventPublisher
	stopTimeout    time.Duration
	leaseTTL       time.Duration
	telemetry      *telemetry.Runtime
	logger         *slog.Logger
	clock          func() time.Time
	newToken       func() string
}

// NewService constructs one Service from deps.
func NewService(deps Dependencies) (*Service, error) {
	switch {
	case deps.RuntimeRecords == nil:
		return nil, errors.New("new stop runtime service: nil runtime records")
	case deps.OperationLogs == nil:
		return nil, errors.New("new stop runtime service: nil operation logs")
	case deps.Docker == nil:
		return nil, errors.New("new stop runtime service: nil docker client")
	case deps.Leases == nil:
		return nil, errors.New("new stop runtime service: nil lease store")
	case deps.HealthEvents == nil:
		return nil, errors.New("new stop runtime service: nil health events publisher")
	case deps.Telemetry == nil:
		return nil, errors.New("new stop runtime service: nil telemetry runtime")
	}
	if err := deps.Container.Validate(); err != nil {
		return nil, fmt.Errorf("new stop runtime service: container config: %w", err)
	}
	if err := deps.Coordination.Validate(); err != nil {
		return nil, fmt.Errorf("new stop runtime service: coordination config: %w", err)
	}

	clock := deps.Clock
	if clock == nil {
		clock = time.Now
	}
	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}
	logger = logger.With("service", "rtmanager.stopruntime")
	newToken := deps.NewToken
	if newToken == nil {
		newToken = defaultTokenGenerator()
	}

	return &Service{
		runtimeRecords: deps.RuntimeRecords,
		operationLogs:  deps.OperationLogs,
		docker:         deps.Docker,
		leases:         deps.Leases,
		healthEvents:   deps.HealthEvents,
		stopTimeout:    deps.Container.StopTimeout,
		leaseTTL:       deps.Coordination.GameLeaseTTL,
		telemetry:      deps.Telemetry,
		logger:         logger,
		clock:          clock,
		newToken:       newToken,
	}, nil
}
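// Illustrative only: a minimal wiring sketch for NewService. The adapter
// variables (recordStore, opLogStore, dockerClient, leaseStore, healthPub,
// tel, cfg, logger) are hypothetical placeholders for concrete
// implementations of the ports and config types; only the Dependencies
// field names below come from this package.
//
//	service, err := NewService(Dependencies{
//		RuntimeRecords: recordStore,      // ports.RuntimeRecordStore implementation
//		OperationLogs:  opLogStore,       // ports.OperationLogStore implementation
//		Docker:         dockerClient,     // ports.DockerClient implementation
//		Leases:         leaseStore,       // ports.GameLeaseStore implementation
//		HealthEvents:   healthPub,        // ports.HealthEventPublisher implementation
//		Container:      cfg.Container,    // config.ContainerConfig (accessor assumed)
//		Coordination:   cfg.Coordination, // config.CoordinationConfig (accessor assumed)
//		Telemetry:      tel,              // *telemetry.Runtime
//		Logger:         logger,           // optional; nil falls back to slog.Default()
//	})
//	if err != nil {
//		// a nil collaborator or invalid config is a startup error; fail fast
//	}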
// Handle executes one stop operation end-to-end. The Go-level error
// return is reserved for non-business failures (nil context, nil
// receiver). Every business outcome — success, idempotent replay, or
// any of the stable failure modes — flows through Result.
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
	if service == nil {
		return Result{}, errors.New("stop runtime: nil service")
	}
	if ctx == nil {
		return Result{}, errors.New("stop runtime: nil context")
	}

	opStartedAt := service.clock().UTC()
	if err := input.Validate(); err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInvalidRequest,
			errorMessage: err.Error(),
		}), nil
	}

	token := service.newToken()
	leaseStart := service.clock()
	acquired, err := service.leases.TryAcquire(ctx, input.GameID, token, service.leaseTTL)
	service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(leaseStart))
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeServiceUnavailable,
			errorMessage: fmt.Sprintf("acquire game lease: %s", err.Error()),
		}), nil
	}
	if !acquired {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeConflict,
			errorMessage: "another lifecycle operation is in progress for this game",
		}), nil
	}
	defer service.releaseLease(ctx, input.GameID, token)

	return service.runUnderLease(ctx, input, opStartedAt)
}

// Run executes the stop lifecycle assuming the per-game lease is
// already held by the caller. The method is reserved for orchestrator
// services in `internal/service/` that compose stop with another
// operation under a single outer lease (restart and patch). External
// callers must use Handle.
func (service *Service) Run(ctx context.Context, input Input) (Result, error) {
	if service == nil {
		return Result{}, errors.New("stop runtime: nil service")
	}
	if ctx == nil {
		return Result{}, errors.New("stop runtime: nil context")
	}

	opStartedAt := service.clock().UTC()
	if err := input.Validate(); err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInvalidRequest,
			errorMessage: err.Error(),
		}), nil
	}

	return service.runUnderLease(ctx, input, opStartedAt)
}
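// Illustrative only: how a REST handler or stream consumer might call Handle
// and map the Result. The Outcome / ErrorCode checks use identifiers from
// this file; the surrounding control flow and any status-code mapping are
// assumptions, not a contract defined here.
//
//	result, err := service.Handle(ctx, input)
//	if err != nil {
//		return err // programming error (nil service / nil context), not a business outcome
//	}
//	switch {
//	case result.ErrorCode == startruntime.ErrorCodeReplayNoOp:
//		// idempotent replay: the runtime was already stopped or removed
//	case result.Outcome == operation.OutcomeSuccess:
//		// fresh stop: result.Record carries the updated runtime record
//	default:
//		// stable failure: result.ErrorCode / result.ErrorMessage describe it
//	}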
// runUnderLease executes the post-validation, lease-protected stop
// steps shared by Handle and Run.
func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) {
	existing, err := service.runtimeRecords.Get(ctx, input.GameID)
	if errors.Is(err, runtime.ErrNotFound) {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeNotFound,
			errorMessage: fmt.Sprintf("runtime record for game %q does not exist", input.GameID),
		}), nil
	}
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInternal,
			errorMessage: fmt.Sprintf("load runtime record: %s", err.Error()),
		}), nil
	}

	switch existing.Status {
	case runtime.StatusStopped, runtime.StatusRemoved:
		return service.recordReplayNoOp(ctx, opStartedAt, input, existing), nil
	case runtime.StatusRunning:
		// proceed
	default:
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInternal,
			errorMessage: fmt.Sprintf("runtime record has unsupported status %q", existing.Status),
		}), nil
	}

	if err := service.docker.Stop(ctx, existing.CurrentContainerID, service.stopTimeout); err != nil {
		if errors.Is(err, ports.ErrContainerNotFound) {
			return service.handleVanished(ctx, input, opStartedAt, existing), nil
		}
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeServiceUnavailable,
			errorMessage: fmt.Sprintf("docker stop: %s", err.Error()),
			containerID:  existing.CurrentContainerID,
			imageRef:     existing.CurrentImageRef,
		}), nil
	}

	updateNow := service.clock().UTC()
	err = service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
		GameID:              input.GameID,
		ExpectedFrom:        runtime.StatusRunning,
		ExpectedContainerID: existing.CurrentContainerID,
		To:                  runtime.StatusStopped,
		Now:                 updateNow,
	})
	if errors.Is(err, runtime.ErrConflict) {
		// CAS race: a concurrent reconciler / restart already moved the
		// record. The desired terminal state was reached by another path.
		return service.recordReplayNoOp(ctx, opStartedAt, input, existing), nil
	}
	if errors.Is(err, runtime.ErrNotFound) {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeNotFound,
			errorMessage: fmt.Sprintf("runtime record for game %q vanished mid-stop", input.GameID),
			containerID:  existing.CurrentContainerID,
			imageRef:     existing.CurrentImageRef,
		}), nil
	}
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInternal,
			errorMessage: fmt.Sprintf("update runtime status: %s", err.Error()),
			containerID:  existing.CurrentContainerID,
			imageRef:     existing.CurrentImageRef,
		}), nil
	}

	finishedAt := service.clock().UTC()
	service.bestEffortAppend(ctx, operation.OperationEntry{
		GameID:      input.GameID,
		OpKind:      operation.OpKindStop,
		OpSource:    input.OpSource,
		SourceRef:   input.SourceRef,
		ImageRef:    existing.CurrentImageRef,
		ContainerID: existing.CurrentContainerID,
		Outcome:     operation.OutcomeSuccess,
		StartedAt:   opStartedAt,
		FinishedAt:  &finishedAt,
	})
	service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeSuccess), string(input.Reason), string(input.OpSource))

	record := existing
	record.Status = runtime.StatusStopped
	stoppedAt := updateNow
	record.StoppedAt = &stoppedAt
	record.LastOpAt = updateNow

	logArgs := []any{
		"game_id", input.GameID,
		"container_id", existing.CurrentContainerID,
		"reason", string(input.Reason),
		"op_source", string(input.OpSource),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	service.logger.InfoContext(ctx, "runtime stopped", logArgs...)

	return Result{
		Record:  record,
		Outcome: operation.OutcomeSuccess,
	}, nil
}

// handleVanished records the success outcome for the case where docker
// stop reports the container as already gone. It updates the record to
// removed, publishes container_disappeared, and returns success.
func (service *Service) handleVanished(ctx context.Context, input Input, opStartedAt time.Time, existing runtime.RuntimeRecord) Result {
	updateNow := service.clock().UTC()
	err := service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
		GameID:              input.GameID,
		ExpectedFrom:        runtime.StatusRunning,
		ExpectedContainerID: existing.CurrentContainerID,
		To:                  runtime.StatusRemoved,
		Now:                 updateNow,
	})
	if errors.Is(err, runtime.ErrConflict) {
		return service.recordReplayNoOp(ctx, opStartedAt, input, existing)
	}
	if err != nil && !errors.Is(err, runtime.ErrNotFound) {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInternal,
			errorMessage: fmt.Sprintf("update runtime status to removed: %s", err.Error()),
			containerID:  existing.CurrentContainerID,
			imageRef:     existing.CurrentImageRef,
		})
	}

	service.bestEffortPublishHealth(ctx, ports.HealthEventEnvelope{
		GameID:      input.GameID,
		ContainerID: existing.CurrentContainerID,
		EventType:   health.EventTypeContainerDisappeared,
		OccurredAt:  updateNow,
		Details:     emptyHealthDetails(),
	})

	finishedAt := service.clock().UTC()
	service.bestEffortAppend(ctx, operation.OperationEntry{
		GameID:      input.GameID,
		OpKind:      operation.OpKindStop,
		OpSource:    input.OpSource,
		SourceRef:   input.SourceRef,
		ImageRef:    existing.CurrentImageRef,
		ContainerID: existing.CurrentContainerID,
		Outcome:     operation.OutcomeSuccess,
		StartedAt:   opStartedAt,
		FinishedAt:  &finishedAt,
	})
	service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeSuccess), string(input.Reason), string(input.OpSource))
	service.telemetry.RecordHealthEvent(ctx, string(health.EventTypeContainerDisappeared))

	record := existing
	record.Status = runtime.StatusRemoved
	record.CurrentContainerID = ""
	removedAt := updateNow
	record.RemovedAt = &removedAt
	record.LastOpAt = updateNow

	logArgs := []any{
		"game_id", input.GameID,
		"container_id", existing.CurrentContainerID,
		"reason", string(input.Reason),
		"op_source", string(input.OpSource),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	service.logger.InfoContext(ctx, "runtime stop on vanished container", logArgs...)

	return Result{
		Record:  record,
		Outcome: operation.OutcomeSuccess,
	}
}

// recordReplayNoOp records the idempotent replay outcome and returns the
// existing record unchanged.
func (service *Service) recordReplayNoOp(ctx context.Context, opStartedAt time.Time, input Input, existing runtime.RuntimeRecord) Result {
	finishedAt := service.clock().UTC()
	service.bestEffortAppend(ctx, operation.OperationEntry{
		GameID:      input.GameID,
		OpKind:      operation.OpKindStop,
		OpSource:    input.OpSource,
		SourceRef:   input.SourceRef,
		ImageRef:    existing.CurrentImageRef,
		ContainerID: existing.CurrentContainerID,
		Outcome:     operation.OutcomeSuccess,
		ErrorCode:   startruntime.ErrorCodeReplayNoOp,
		StartedAt:   opStartedAt,
		FinishedAt:  &finishedAt,
	})
	service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeSuccess), string(input.Reason), string(input.OpSource))

	logArgs := []any{
		"game_id", input.GameID,
		"container_id", existing.CurrentContainerID,
		"reason", string(input.Reason),
		"op_source", string(input.OpSource),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	service.logger.InfoContext(ctx, "runtime stop replay no-op", logArgs...)

	return Result{
		Record:    existing,
		Outcome:   operation.OutcomeSuccess,
		ErrorCode: startruntime.ErrorCodeReplayNoOp,
	}
}

// failureCtx groups the inputs to recordFailure so the runUnderLease
// method stays readable.
type failureCtx struct {
	opStartedAt  time.Time
	input        Input
	errorCode    string
	errorMessage string
	containerID  string
	imageRef     string
}

// recordFailure records the failure operation_log entry and emits
// telemetry. The runtime record stays untouched.
func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result {
	finishedAt := service.clock().UTC()
	service.bestEffortAppend(ctx, operation.OperationEntry{
		GameID:       fc.input.GameID,
		OpKind:       operation.OpKindStop,
		OpSource:     fc.input.OpSource,
		SourceRef:    fc.input.SourceRef,
		ImageRef:     fc.imageRef,
		ContainerID:  fc.containerID,
		Outcome:      operation.OutcomeFailure,
		ErrorCode:    fc.errorCode,
		ErrorMessage: fc.errorMessage,
		StartedAt:    fc.opStartedAt,
		FinishedAt:   &finishedAt,
	})
	service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeFailure), string(fc.input.Reason), string(fc.input.OpSource))

	logArgs := []any{
		"game_id", fc.input.GameID,
		"reason", string(fc.input.Reason),
		"op_source", string(fc.input.OpSource),
		"error_code", fc.errorCode,
		"error_message", fc.errorMessage,
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	service.logger.WarnContext(ctx, "runtime stop failed", logArgs...)

	return Result{
		Outcome:      operation.OutcomeFailure,
		ErrorCode:    fc.errorCode,
		ErrorMessage: fc.errorMessage,
	}
}

// releaseLease releases the per-game lease in a fresh background context
// so a canceled request context does not leave the lease pinned for its
// TTL.
func (service *Service) releaseLease(ctx context.Context, gameID, token string) {
	cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
	defer cancel()
	if err := service.leases.Release(cleanupCtx, gameID, token); err != nil {
		service.logger.WarnContext(ctx, "release game lease",
			"game_id", gameID,
			"err", err.Error(),
		)
	}
}

// bestEffortAppend writes one operation_log entry. A failure is logged
// and discarded; the durable runtime record (or its absence) remains
// the source of truth.
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
	if _, err := service.operationLogs.Append(ctx, entry); err != nil {
		service.logger.ErrorContext(ctx, "append operation log",
			"game_id", entry.GameID,
			"op_kind", string(entry.OpKind),
			"outcome", string(entry.Outcome),
			"error_code", entry.ErrorCode,
			"err", err.Error(),
		)
	}
}

// bestEffortPublishHealth emits one health event + snapshot upsert.
// Failures degrade silently per `rtmanager/README.md §Notification
// Contracts`; the runtime record remains the source of truth.
func (service *Service) bestEffortPublishHealth(ctx context.Context, envelope ports.HealthEventEnvelope) {
	if err := service.healthEvents.Publish(ctx, envelope); err != nil {
		service.logger.ErrorContext(ctx, "publish health event",
			"game_id", envelope.GameID,
			"container_id", envelope.ContainerID,
			"event_type", string(envelope.EventType),
			"err", err.Error(),
		)
	}
}

// defaultTokenGenerator returns a function that produces 32-byte
// base64url-encoded tokens. Mirrors the start service: a degraded
// entropy source falls back to a sentinel token so the next TryAcquire
// observes a collision rather than a panic.
func defaultTokenGenerator() func() string {
	return func() string {
		var buf [32]byte
		if _, err := rand.Read(buf[:]); err != nil {
			return "rtmanager-fallback-token"
		}
		return base64.RawURLEncoding.EncodeToString(buf[:])
	}
}
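// Illustrative only: tests that need deterministic lease tokens and
// timestamps may pin the optional Clock and NewToken hooks when building
// Dependencies; the fixed values below are arbitrary.
//
//	deps.Clock = func() time.Time { return time.Date(2024, 1, 2, 3, 4, 5, 0, time.UTC) }
//	deps.NewToken = func() string { return "test-lease-token" }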
// emptyHealthDetails returns the canonical empty-object payload required
// by the `container_disappeared` AsyncAPI variant.
func emptyHealthDetails() json.RawMessage {
	return json.RawMessage("{}")
}
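// Illustrative only: how an orchestrator in `internal/service/` (restart or
// patch) might compose the stop step via Run while it already holds the
// per-game lease. The orchestrator-side variables (stopService, gameID,
// reason, opSource, correlationID) are hypothetical; only Run, Input,
// Result, and the referenced constants come from this codebase.
//
//	stopResult, err := stopService.Run(ctx, Input{
//		GameID:    gameID,
//		Reason:    reason,        // a valid StopReason value
//		OpSource:  opSource,      // the orchestrator's op source
//		SourceRef: correlationID, // shared so the operation_log entries correlate
//	})
//	if err != nil {
//		return err // programming error, not a business outcome
//	}
//	if stopResult.Outcome != operation.OutcomeSuccess {
//		// abort the outer operation; the stop failure is already logged and audited
//	}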