// Package adminstop implements the admin stop service-layer // orchestrator owned by Game Master. It is driven by Admin Service or // system administrators through // `POST /api/v1/internal/runtimes/{game_id}/stop` and tells Runtime // Manager to stop the game's container while transitioning the runtime // record to `stopped`. // // Lifecycle and failure-mode semantics follow `gamemaster/README.md // §Lifecycles → Stop`. The idempotent-on-terminal-status and // conflict-on-starting rules are recorded in // `gamemaster/docs/stage17-admin-operations.md`. package adminstop import ( "context" "errors" "fmt" "log/slog" "strings" "time" "galaxy/gamemaster/internal/domain/operation" "galaxy/gamemaster/internal/domain/runtime" "galaxy/gamemaster/internal/logging" "galaxy/gamemaster/internal/ports" "galaxy/gamemaster/internal/telemetry" ) // Input stores the per-call arguments for one admin stop operation. type Input struct { // GameID identifies the runtime to stop. GameID string // Reason classifies the stop. Empty defaults to // `admin_request`. Allowed values: `admin_request`, `finished`, // `timeout`. Reason string // OpSource classifies how the request entered Game Master. Used to // stamp `operation_log.op_source`. Defaults to `admin_rest` when // missing or unrecognised. OpSource operation.OpSource // SourceRef stores the optional opaque per-source reference (REST // request id, admin user id). Empty when the caller does not // provide one. SourceRef string } // Validate reports whether input carries the structural invariants the // service requires before any store is touched. func (input Input) Validate() error { if strings.TrimSpace(input.GameID) == "" { return fmt.Errorf("game id must not be empty") } switch strings.TrimSpace(input.Reason) { case "", ReasonAdminRequest, ReasonFinished, ReasonTimeout: return nil default: return fmt.Errorf("reason %q is unsupported", input.Reason) } } // Result stores the deterministic outcome of one Handle call. 
Business // outcomes flow through Result; the Go-level error return is reserved // for non-business failures (nil context, nil receiver). type Result struct { // Record carries the runtime record observed (and on success // transitioned) by the operation. Populated on success and on the // idempotent no-op branch; zero on early-rejection failures // (invalid_request, runtime_not_found). Record runtime.RuntimeRecord // Outcome reports whether the operation completed (success) or // produced a stable failure code. Outcome operation.Outcome // ErrorCode stores the stable error code on failure. Empty on // success. ErrorCode string // ErrorMessage stores the operator-readable detail on failure. // Empty on success. ErrorMessage string } // IsSuccess reports whether the result represents a successful // operation. func (result Result) IsSuccess() bool { return result.Outcome == operation.OutcomeSuccess } // Dependencies groups the collaborators required by Service. type Dependencies struct { // RuntimeRecords drives the read of the current row plus the CAS // transition to `stopped`. RuntimeRecords ports.RuntimeRecordStore // OperationLogs records the audit entry for the operation. OperationLogs ports.OperationLogStore // RTM drives the Runtime Manager stop call. RTM ports.RTMClient // LobbyEvents publishes the post-success // `runtime_snapshot_update` to `gm:lobby_events`. LobbyEvents ports.LobbyEventsPublisher // Telemetry is required by the lobby-events publication helper. Telemetry *telemetry.Runtime // Logger records structured service-level events. Defaults to // `slog.Default()` when nil. Logger *slog.Logger // Clock supplies the wall-clock used for operation timestamps. // Defaults to `time.Now` when nil. Clock func() time.Time } // Service executes the admin stop lifecycle operation. 
type Service struct { runtimeRecords ports.RuntimeRecordStore operationLogs ports.OperationLogStore rtm ports.RTMClient lobbyEvents ports.LobbyEventsPublisher telemetry *telemetry.Runtime logger *slog.Logger clock func() time.Time } // NewService constructs one Service from deps. func NewService(deps Dependencies) (*Service, error) { switch { case deps.RuntimeRecords == nil: return nil, errors.New("new admin stop service: nil runtime records") case deps.OperationLogs == nil: return nil, errors.New("new admin stop service: nil operation logs") case deps.RTM == nil: return nil, errors.New("new admin stop service: nil rtm client") case deps.LobbyEvents == nil: return nil, errors.New("new admin stop service: nil lobby events publisher") case deps.Telemetry == nil: return nil, errors.New("new admin stop service: nil telemetry runtime") } clock := deps.Clock if clock == nil { clock = time.Now } logger := deps.Logger if logger == nil { logger = slog.Default() } logger = logger.With("service", "gamemaster.adminstop") return &Service{ runtimeRecords: deps.RuntimeRecords, operationLogs: deps.OperationLogs, rtm: deps.RTM, lobbyEvents: deps.LobbyEvents, telemetry: deps.Telemetry, logger: logger, clock: clock, }, nil } // Handle executes one admin stop operation end-to-end. The Go-level // error return is reserved for non-business failures (nil context, nil // receiver). Every business outcome flows through Result. 
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) { if service == nil { return Result{}, errors.New("admin stop: nil service") } if ctx == nil { return Result{}, errors.New("admin stop: nil context") } opStartedAt := service.clock().UTC() if err := input.Validate(); err != nil { return service.recordEarlyFailure(ctx, opStartedAt, input, ErrorCodeInvalidRequest, err.Error()), nil } reason := strings.TrimSpace(input.Reason) if reason == "" { reason = ReasonAdminRequest } record, err := service.runtimeRecords.Get(ctx, input.GameID) switch { case errors.Is(err, runtime.ErrNotFound): return service.recordEarlyFailure(ctx, opStartedAt, input, ErrorCodeRuntimeNotFound, "runtime record does not exist"), nil case err != nil: return service.recordEarlyFailure(ctx, opStartedAt, input, ErrorCodeServiceUnavailable, fmt.Sprintf("get runtime record: %s", err.Error())), nil } switch record.Status { case runtime.StatusStopped, runtime.StatusFinished: return service.completeIdempotent(ctx, opStartedAt, input, record), nil case runtime.StatusStarting: return service.recordEarlyFailureWithRecord(ctx, opStartedAt, input, record, ErrorCodeConflict, fmt.Sprintf("runtime status is %q; stop requires a started runtime", record.Status)), nil } if err := service.rtm.Stop(ctx, input.GameID, reason); err != nil { return service.recordEarlyFailureWithRecord(ctx, opStartedAt, input, record, ErrorCodeServiceUnavailable, fmt.Sprintf("rtm stop: %s", err.Error())), nil } stoppedAt := service.clock().UTC() casErr := service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{ GameID: input.GameID, ExpectedFrom: record.Status, To: runtime.StatusStopped, Now: stoppedAt, }) switch { case casErr == nil: case errors.Is(casErr, runtime.ErrConflict): return service.recordEarlyFailureWithRecord(ctx, opStartedAt, input, record, ErrorCodeConflict, fmt.Sprintf("cas runtime status to stopped: %s", casErr.Error())), nil case errors.Is(casErr, runtime.ErrNotFound): return 
service.recordEarlyFailureWithRecord(ctx, opStartedAt, input, record, ErrorCodeRuntimeNotFound, fmt.Sprintf("cas runtime status to stopped: %s", casErr.Error())), nil default: return service.recordEarlyFailureWithRecord(ctx, opStartedAt, input, record, ErrorCodeServiceUnavailable, fmt.Sprintf("cas runtime status to stopped: %s", casErr.Error())), nil } persisted, reloadErr := service.runtimeRecords.Get(ctx, input.GameID) if reloadErr != nil { // CAS already committed; surface the success outcome but log the // degraded reload so operators know the response carries the // pre-CAS record. service.logger.WarnContext(ctx, "reload runtime record after stop", "game_id", input.GameID, "err", reloadErr.Error(), ) persisted = record persisted.Status = runtime.StatusStopped persisted.UpdatedAt = stoppedAt persisted.StoppedAt = &stoppedAt } service.publishSnapshot(ctx, persisted, stoppedAt) service.appendSuccessLog(ctx, opStartedAt, input) logArgs := []any{ "game_id", input.GameID, "reason", reason, "from_status", string(record.Status), "op_source", string(fallbackOpSource(input.OpSource)), } logArgs = append(logArgs, logging.ContextAttrs(ctx)...) service.logger.InfoContext(ctx, "runtime stopped", logArgs...) return Result{ Record: persisted, Outcome: operation.OutcomeSuccess, }, nil } // completeIdempotent records the no-op success path used when the // runtime is already terminal (stopped or finished). RTM is not // invoked, no snapshot is published, but the audit row is written so // operators can confirm the call landed. func (service *Service) completeIdempotent(ctx context.Context, opStartedAt time.Time, input Input, record runtime.RuntimeRecord) Result { service.appendSuccessLog(ctx, opStartedAt, input) logArgs := []any{ "game_id", input.GameID, "observed_status", string(record.Status), "op_source", string(fallbackOpSource(input.OpSource)), } logArgs = append(logArgs, logging.ContextAttrs(ctx)...) 
service.logger.InfoContext(ctx, "runtime stop already terminal", logArgs...) return Result{ Record: record, Outcome: operation.OutcomeSuccess, } } // recordEarlyFailure records a failure that occurred before the runtime // row was read or in the validation phase. func (service *Service) recordEarlyFailure(ctx context.Context, opStartedAt time.Time, input Input, errorCode string, errorMessage string) Result { return service.recordEarlyFailureWithRecord(ctx, opStartedAt, input, runtime.RuntimeRecord{}, errorCode, errorMessage) } // recordEarlyFailureWithRecord records a failure and propagates the // observed runtime record (when available) to the caller. func (service *Service) recordEarlyFailureWithRecord(ctx context.Context, opStartedAt time.Time, input Input, record runtime.RuntimeRecord, errorCode string, errorMessage string) Result { service.appendFailureLog(ctx, opStartedAt, input, errorCode, errorMessage) logArgs := []any{ "game_id", input.GameID, "op_source", string(input.OpSource), "error_code", errorCode, "error_message", errorMessage, } logArgs = append(logArgs, logging.ContextAttrs(ctx)...) service.logger.WarnContext(ctx, "admin stop rejected", logArgs...) return Result{ Record: record, Outcome: operation.OutcomeFailure, ErrorCode: errorCode, ErrorMessage: errorMessage, } } // publishSnapshot publishes the post-success // `runtime_snapshot_update` per `gamemaster/README.md §Lifecycles → // Stop` step 4. Failure is logged but never rolls back the just-applied // CAS; the snapshot stream is best-effort by contract. 
func (service *Service) publishSnapshot(ctx context.Context, record runtime.RuntimeRecord, occurredAt time.Time) { msg := ports.RuntimeSnapshotUpdate{ GameID: record.GameID, CurrentTurn: record.CurrentTurn, RuntimeStatus: record.Status, EngineHealthSummary: record.EngineHealth, PlayerTurnStats: nil, OccurredAt: occurredAt, } if err := service.lobbyEvents.PublishSnapshotUpdate(ctx, msg); err != nil { service.logger.ErrorContext(ctx, "publish runtime snapshot update", "game_id", record.GameID, "err", err.Error(), ) return } service.telemetry.RecordLobbyEventPublished(ctx, "runtime_snapshot_update") } // appendSuccessLog records the success operation_log entry. func (service *Service) appendSuccessLog(ctx context.Context, opStartedAt time.Time, input Input) { finishedAt := service.clock().UTC() service.bestEffortAppend(ctx, operation.OperationEntry{ GameID: input.GameID, OpKind: operation.OpKindStop, OpSource: fallbackOpSource(input.OpSource), SourceRef: input.SourceRef, Outcome: operation.OutcomeSuccess, StartedAt: opStartedAt, FinishedAt: &finishedAt, }) } // appendFailureLog records the failure operation_log entry. func (service *Service) appendFailureLog(ctx context.Context, opStartedAt time.Time, input Input, errorCode string, errorMessage string) { finishedAt := service.clock().UTC() service.bestEffortAppend(ctx, operation.OperationEntry{ GameID: input.GameID, OpKind: operation.OpKindStop, OpSource: fallbackOpSource(input.OpSource), SourceRef: input.SourceRef, Outcome: operation.OutcomeFailure, ErrorCode: errorCode, ErrorMessage: errorMessage, StartedAt: opStartedAt, FinishedAt: &finishedAt, }) } // bestEffortAppend writes one operation_log entry. A failure is logged // and discarded; the runtime row is the source of truth. 
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
	_, appendErr := service.operationLogs.Append(ctx, entry)
	if appendErr == nil {
		return
	}

	// The append error is only logged; nothing is returned to the
	// caller.
	service.logger.ErrorContext(ctx, "append operation log",
		"game_id", entry.GameID,
		"op_kind", string(entry.OpKind),
		"outcome", string(entry.Outcome),
		"error_code", entry.ErrorCode,
		"err", appendErr.Error(),
	)
}

// fallbackOpSource returns source when it is a known op source and
// `admin_rest` otherwise. Mirrors `gamemaster/README.md §Trusted
// Surfaces`.
func fallbackOpSource(source operation.OpSource) operation.OpSource {
	if !source.IsKnown() {
		return operation.OpSourceAdminRest
	}
	return source
}