Files
galaxy-game/gamemaster/internal/service/adminstop/service.go
T
2026-05-03 07:59:03 +02:00

397 lines
13 KiB
Go

// Package adminstop implements the admin stop service-layer
// orchestrator owned by Game Master. It is driven by Admin Service or
// system administrators through
// `POST /api/v1/internal/runtimes/{game_id}/stop` and tells Runtime
// Manager to stop the game's container while transitioning the runtime
// record to `stopped`.
//
// Lifecycle and failure-mode semantics follow `gamemaster/README.md
// §Lifecycles → Stop`. The idempotent-on-terminal-status and
// conflict-on-starting rules are recorded in
// `gamemaster/docs/stage17-admin-operations.md`.
package adminstop
import (
"context"
"errors"
"fmt"
"log/slog"
"strings"
"time"
"galaxy/gamemaster/internal/domain/operation"
"galaxy/gamemaster/internal/domain/runtime"
"galaxy/gamemaster/internal/logging"
"galaxy/gamemaster/internal/ports"
"galaxy/gamemaster/internal/telemetry"
)
// Input carries the caller-supplied arguments of a single admin stop
// operation.
type Input struct {
	// GameID names the runtime that should be stopped. Validate rejects
	// a value that is empty once surrounding whitespace is trimmed.
	GameID string

	// Reason classifies why the runtime is being stopped. Accepted
	// values are `admin_request`, `finished`, and `timeout`; an empty
	// value is treated as `admin_request`.
	Reason string

	// OpSource describes how the request reached Game Master and is
	// stamped into `operation_log.op_source`. A missing or unrecognised
	// value falls back to `admin_rest`.
	OpSource operation.OpSource

	// SourceRef is an optional opaque reference supplied by the source
	// (for example a REST request id or an admin user id). It stays
	// empty when the caller provides none.
	SourceRef string
}
// Validate checks the structural invariants the service relies on
// before any store is touched.
//
// It returns an error when GameID is empty after trimming whitespace,
// or when Reason (also trimmed) is neither empty nor one of
// ReasonAdminRequest, ReasonFinished, ReasonTimeout. It returns nil
// otherwise.
func (input Input) Validate() error {
	if strings.TrimSpace(input.GameID) == "" {
		// The message is constant, so errors.New is used instead of a
		// verb-less fmt.Errorf.
		return errors.New("game id must not be empty")
	}

	switch strings.TrimSpace(input.Reason) {
	case "", ReasonAdminRequest, ReasonFinished, ReasonTimeout:
		return nil
	default:
		// Quote the value exactly as received so stray whitespace
		// remains visible to the operator.
		return fmt.Errorf("reason %q is unsupported", input.Reason)
	}
}
// Result is the deterministic outcome of one Handle call. Every
// business outcome is reported through it; Handle's Go-level error is
// kept for non-business failures (nil context, nil receiver).
type Result struct {
	// Record is the runtime record the operation observed and, on
	// success, transitioned. It is set on success and on the idempotent
	// no-op branch, and is the zero value for failures raised before a
	// runtime row was read (for example invalid_request or
	// runtime_not_found).
	Record runtime.RuntimeRecord

	// Outcome tells whether the operation completed successfully or
	// ended with a stable failure code.
	Outcome operation.Outcome

	// ErrorCode is the stable error code of a failure; empty on
	// success.
	ErrorCode string

	// ErrorMessage is the operator-readable failure detail; empty on
	// success.
	ErrorMessage string
}
// IsSuccess returns true exactly when the result's Outcome equals
// operation.OutcomeSuccess.
func (result Result) IsSuccess() bool {
	succeeded := result.Outcome == operation.OutcomeSuccess
	return succeeded
}
// Dependencies bundles the collaborators NewService needs to build a
// Service.
type Dependencies struct {
	// RuntimeRecords is used to read the current runtime row and to
	// perform the CAS transition to `stopped`. Required.
	RuntimeRecords ports.RuntimeRecordStore

	// OperationLogs receives the audit entry for each operation.
	// Required.
	OperationLogs ports.OperationLogStore

	// RTM issues the Runtime Manager stop call. Required.
	RTM ports.RTMClient

	// LobbyEvents publishes the post-success
	// `runtime_snapshot_update` to `gm:lobby_events`. Required.
	LobbyEvents ports.LobbyEventsPublisher

	// Telemetry is needed by the lobby-events publication helper.
	// Required.
	Telemetry *telemetry.Runtime

	// Logger receives structured service-level events. When nil,
	// `slog.Default()` is used.
	Logger *slog.Logger

	// Clock provides the wall-clock for operation timestamps. When
	// nil, `time.Now` is used.
	Clock func() time.Time
}
// Service runs the admin stop lifecycle operation. Build it with
// NewService, which checks the required collaborators and fills in the
// logger and clock defaults.
type Service struct {
	// runtimeRecords reads the runtime row and applies the CAS to
	// `stopped`.
	runtimeRecords ports.RuntimeRecordStore
	// operationLogs receives the audit entries.
	operationLogs ports.OperationLogStore
	// rtm issues the Runtime Manager stop call.
	rtm ports.RTMClient
	// lobbyEvents publishes the post-success snapshot update.
	lobbyEvents ports.LobbyEventsPublisher
	// telemetry records successful lobby-event publications.
	telemetry *telemetry.Runtime
	// logger is the structured logger tagged with the service name.
	logger *slog.Logger
	// clock supplies the wall-clock for operation timestamps.
	clock func() time.Time
}
// NewService builds a Service from deps.
//
// RuntimeRecords, OperationLogs, RTM, LobbyEvents, and Telemetry are
// mandatory; the first one found nil (checked in that order) yields an
// error and a nil Service. A nil Clock is replaced by time.Now and a nil
// Logger by slog.Default(); the logger is additionally tagged with
// service=gamemaster.adminstop.
func NewService(deps Dependencies) (*Service, error) {
	if deps.RuntimeRecords == nil {
		return nil, errors.New("new admin stop service: nil runtime records")
	}
	if deps.OperationLogs == nil {
		return nil, errors.New("new admin stop service: nil operation logs")
	}
	if deps.RTM == nil {
		return nil, errors.New("new admin stop service: nil rtm client")
	}
	if deps.LobbyEvents == nil {
		return nil, errors.New("new admin stop service: nil lobby events publisher")
	}
	if deps.Telemetry == nil {
		return nil, errors.New("new admin stop service: nil telemetry runtime")
	}

	built := &Service{
		runtimeRecords: deps.RuntimeRecords,
		operationLogs:  deps.OperationLogs,
		rtm:            deps.RTM,
		lobbyEvents:    deps.LobbyEvents,
		telemetry:      deps.Telemetry,
		clock:          deps.Clock,
	}

	// Optional collaborators fall back to their documented defaults.
	if built.clock == nil {
		built.clock = time.Now
	}
	baseLogger := deps.Logger
	if baseLogger == nil {
		baseLogger = slog.Default()
	}
	built.logger = baseLogger.With("service", "gamemaster.adminstop")

	return built, nil
}
// Handle runs one admin stop operation from validation through audit
// logging.
//
// A non-nil error is returned only for non-business failures (nil
// receiver, nil context); every business outcome, success or failure,
// is reported through the returned Result.
//
// Steps, in order:
//  1. validate input and default the reason to ReasonAdminRequest;
//  2. read the runtime record (missing row -> runtime_not_found, other
//     read error -> service_unavailable);
//  3. short-circuit: `stopped`/`finished` is an idempotent success,
//     `starting` is a conflict;
//  4. ask Runtime Manager to stop the runtime;
//  5. CAS the record from its observed status to `stopped`;
//  6. reload the record, publish the snapshot update, write the success
//     audit entry, and log.
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
	switch {
	case service == nil:
		return Result{}, errors.New("admin stop: nil service")
	case ctx == nil:
		return Result{}, errors.New("admin stop: nil context")
	}

	startedAt := service.clock().UTC()

	if validationErr := input.Validate(); validationErr != nil {
		rejected := service.recordEarlyFailure(ctx, startedAt, input,
			ErrorCodeInvalidRequest, validationErr.Error())
		return rejected, nil
	}

	// Validate accepted the trimmed reason; an empty one selects the
	// default.
	stopReason := strings.TrimSpace(input.Reason)
	if stopReason == "" {
		stopReason = ReasonAdminRequest
	}

	current, getErr := service.runtimeRecords.Get(ctx, input.GameID)
	if getErr != nil {
		if errors.Is(getErr, runtime.ErrNotFound) {
			return service.recordEarlyFailure(ctx, startedAt, input,
				ErrorCodeRuntimeNotFound, "runtime record does not exist"), nil
		}
		return service.recordEarlyFailure(ctx, startedAt, input,
			ErrorCodeServiceUnavailable,
			fmt.Sprintf("get runtime record: %s", getErr.Error())), nil
	}

	// Terminal runtimes need no work; a starting runtime cannot be
	// stopped yet.
	if current.Status == runtime.StatusStopped || current.Status == runtime.StatusFinished {
		return service.completeIdempotent(ctx, startedAt, input, current), nil
	}
	if current.Status == runtime.StatusStarting {
		return service.recordEarlyFailureWithRecord(ctx, startedAt, input, current,
			ErrorCodeConflict,
			fmt.Sprintf("runtime status is %q; stop requires a started runtime", current.Status)), nil
	}

	if stopErr := service.rtm.Stop(ctx, input.GameID, stopReason); stopErr != nil {
		return service.recordEarlyFailureWithRecord(ctx, startedAt, input, current,
			ErrorCodeServiceUnavailable,
			fmt.Sprintf("rtm stop: %s", stopErr.Error())), nil
	}

	stoppedAt := service.clock().UTC()
	transition := ports.UpdateStatusInput{
		GameID:       input.GameID,
		ExpectedFrom: current.Status,
		To:           runtime.StatusStopped,
		Now:          stoppedAt,
	}
	if casErr := service.runtimeRecords.UpdateStatus(ctx, transition); casErr != nil {
		// Map the store error onto the stable failure code; anything
		// that is neither a conflict nor a missing row is reported as
		// service_unavailable.
		failureCode := ErrorCodeServiceUnavailable
		switch {
		case errors.Is(casErr, runtime.ErrConflict):
			failureCode = ErrorCodeConflict
		case errors.Is(casErr, runtime.ErrNotFound):
			failureCode = ErrorCodeRuntimeNotFound
		}
		return service.recordEarlyFailureWithRecord(ctx, startedAt, input, current,
			failureCode,
			fmt.Sprintf("cas runtime status to stopped: %s", casErr.Error())), nil
	}

	final, reloadErr := service.runtimeRecords.Get(ctx, input.GameID)
	if reloadErr != nil {
		// The CAS is already committed, so the operation still counts
		// as a success. Warn operators and synthesise the post-CAS view
		// from the record read before the transition.
		service.logger.WarnContext(ctx, "reload runtime record after stop",
			"game_id", input.GameID,
			"err", reloadErr.Error(),
		)
		final = current
		final.Status = runtime.StatusStopped
		final.UpdatedAt = stoppedAt
		final.StoppedAt = &stoppedAt
	}

	service.publishSnapshot(ctx, final, stoppedAt)
	service.appendSuccessLog(ctx, startedAt, input)

	attrs := append([]any{
		"game_id", input.GameID,
		"reason", stopReason,
		"from_status", string(current.Status),
		"op_source", string(fallbackOpSource(input.OpSource)),
	}, logging.ContextAttrs(ctx)...)
	service.logger.InfoContext(ctx, "runtime stopped", attrs...)

	return Result{Record: final, Outcome: operation.OutcomeSuccess}, nil
}
// completeIdempotent handles the no-op success path taken when the
// runtime is already terminal (stopped or finished). It neither calls
// RTM nor publishes a snapshot; it only writes the success audit entry
// and an info log so operators can confirm the call landed, then
// returns the observed record unchanged.
func (service *Service) completeIdempotent(ctx context.Context, opStartedAt time.Time, input Input, record runtime.RuntimeRecord) Result {
	service.appendSuccessLog(ctx, opStartedAt, input)

	attrs := append([]any{
		"game_id", input.GameID,
		"observed_status", string(record.Status),
		"op_source", string(fallbackOpSource(input.OpSource)),
	}, logging.ContextAttrs(ctx)...)
	service.logger.InfoContext(ctx, "runtime stop already terminal", attrs...)

	return Result{Record: record, Outcome: operation.OutcomeSuccess}
}
// recordEarlyFailure reports a failure for which no runtime record is
// available (validation errors, or a failed read of the runtime row).
// It delegates to recordEarlyFailureWithRecord with the zero record.
func (service *Service) recordEarlyFailure(ctx context.Context, opStartedAt time.Time, input Input, errorCode string, errorMessage string) Result {
	var noRecord runtime.RuntimeRecord
	return service.recordEarlyFailureWithRecord(ctx, opStartedAt, input, noRecord, errorCode, errorMessage)
}
// recordEarlyFailureWithRecord finalises a rejected admin stop: it
// appends the failure entry to the operation log, emits a warning, and
// returns a failure Result carrying record (the zero value when no
// runtime row was observed) together with errorCode and errorMessage.
//
// The warning reports the effective op source, i.e. the caller's value
// passed through fallbackOpSource. This is the same value
// appendFailureLog stamps on the audit entry and the success paths
// log, so a missing or unrecognised source appears as the default
// everywhere instead of as an empty or unknown string in this log only.
func (service *Service) recordEarlyFailureWithRecord(ctx context.Context, opStartedAt time.Time, input Input, record runtime.RuntimeRecord, errorCode string, errorMessage string) Result {
	service.appendFailureLog(ctx, opStartedAt, input, errorCode, errorMessage)

	logArgs := []any{
		"game_id", input.GameID,
		"op_source", string(fallbackOpSource(input.OpSource)),
		"error_code", errorCode,
		"error_message", errorMessage,
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	service.logger.WarnContext(ctx, "admin stop rejected", logArgs...)

	return Result{
		Record:       record,
		Outcome:      operation.OutcomeFailure,
		ErrorCode:    errorCode,
		ErrorMessage: errorMessage,
	}
}
// publishSnapshot emits the post-success `runtime_snapshot_update`
// described in `gamemaster/README.md §Lifecycles → Stop` step 4. The
// snapshot stream is best-effort by contract: a publish failure is
// logged and never rolls back the CAS that was just applied. The
// telemetry counter is recorded only after a successful publish.
func (service *Service) publishSnapshot(ctx context.Context, record runtime.RuntimeRecord, occurredAt time.Time) {
	publishErr := service.lobbyEvents.PublishSnapshotUpdate(ctx, ports.RuntimeSnapshotUpdate{
		GameID:              record.GameID,
		CurrentTurn:         record.CurrentTurn,
		RuntimeStatus:       record.Status,
		EngineHealthSummary: record.EngineHealth,
		PlayerTurnStats:     nil,
		OccurredAt:          occurredAt,
	})
	if publishErr == nil {
		service.telemetry.RecordLobbyEventPublished(ctx, "runtime_snapshot_update")
		return
	}

	service.logger.ErrorContext(ctx, "publish runtime snapshot update",
		"game_id", record.GameID,
		"err", publishErr.Error(),
	)
}
// appendSuccessLog writes the success operation_log entry for a stop
// operation. The finish time is taken from the service clock at call
// time, and the op source is normalised through fallbackOpSource.
func (service *Service) appendSuccessLog(ctx context.Context, opStartedAt time.Time, input Input) {
	completedAt := service.clock().UTC()

	entry := operation.OperationEntry{
		GameID:     input.GameID,
		OpKind:     operation.OpKindStop,
		OpSource:   fallbackOpSource(input.OpSource),
		SourceRef:  input.SourceRef,
		Outcome:    operation.OutcomeSuccess,
		StartedAt:  opStartedAt,
		FinishedAt: &completedAt,
	}
	service.bestEffortAppend(ctx, entry)
}
// appendFailureLog writes the failure operation_log entry for a stop
// operation, including the stable error code and the operator-readable
// message. The finish time is taken from the service clock at call
// time, and the op source is normalised through fallbackOpSource.
func (service *Service) appendFailureLog(ctx context.Context, opStartedAt time.Time, input Input, errorCode string, errorMessage string) {
	completedAt := service.clock().UTC()

	entry := operation.OperationEntry{
		GameID:       input.GameID,
		OpKind:       operation.OpKindStop,
		OpSource:     fallbackOpSource(input.OpSource),
		SourceRef:    input.SourceRef,
		Outcome:      operation.OutcomeFailure,
		ErrorCode:    errorCode,
		ErrorMessage: errorMessage,
		StartedAt:    opStartedAt,
		FinishedAt:   &completedAt,
	}
	service.bestEffortAppend(ctx, entry)
}
// bestEffortAppend stores one operation_log entry. The runtime row is
// the source of truth, so an append failure is only logged and then
// dropped; it is never surfaced to the caller.
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
	_, appendErr := service.operationLogs.Append(ctx, entry)
	if appendErr == nil {
		return
	}

	service.logger.ErrorContext(ctx, "append operation log",
		"game_id", entry.GameID,
		"op_kind", string(entry.OpKind),
		"outcome", string(entry.Outcome),
		"error_code", entry.ErrorCode,
		"err", appendErr.Error(),
	)
}
// fallbackOpSource returns source unchanged when it is a known op
// source and `admin_rest` otherwise, mirroring
// `gamemaster/README.md §Trusted Surfaces`.
func fallbackOpSource(source operation.OpSource) operation.OpSource {
	if !source.IsKnown() {
		return operation.OpSourceAdminRest
	}
	return source
}