feat: gamemaster
This commit is contained in:
@@ -0,0 +1,971 @@
|
||||
// Package turngeneration implements the turn-generation orchestrator
|
||||
// owned by Game Master. It is the single entry point through which the
|
||||
// scheduler ticker (Stage 15 worker) and the admin force-next-turn flow
|
||||
// (Stage 17) drive a turn through the engine container.
|
||||
//
|
||||
// Lifecycle and failure-mode semantics follow `gamemaster/README.md
|
||||
// §Lifecycles → Turn generation` and §Force-next-turn. Design rationale
|
||||
// is captured in
|
||||
// `gamemaster/docs/stage15-scheduler-and-turn-generation.md`.
|
||||
package turngeneration
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"galaxy/gamemaster/internal/domain/operation"
|
||||
"galaxy/gamemaster/internal/domain/playermapping"
|
||||
"galaxy/gamemaster/internal/domain/runtime"
|
||||
"galaxy/gamemaster/internal/logging"
|
||||
"galaxy/gamemaster/internal/ports"
|
||||
"galaxy/gamemaster/internal/service/scheduler"
|
||||
"galaxy/gamemaster/internal/telemetry"
|
||||
"galaxy/notificationintent"
|
||||
)
|
||||
|
||||
// Trigger identifies who asked for one turn-generation operation. The
// value is carried into telemetry and structured logs only; the
// orchestrator's persistence path never branches on it. Skip-tick
// behaviour is governed solely by the runtime record's
// `skip_next_tick` column.
type Trigger string

const (
	// TriggerScheduler marks turn generations started by the
	// `schedulerticker` worker.
	TriggerScheduler Trigger = "scheduler"

	// TriggerForce marks turn generations started by the admin
	// force-next-turn flow (Stage 17 `service/adminforce`).
	TriggerForce Trigger = "force"
)

// IsKnown reports whether trigger is one of the values of the frozen
// trigger vocabulary (TriggerScheduler or TriggerForce). The comparison
// is exact, so the empty string and differently-cased spellings are
// unknown.
func (trigger Trigger) IsKnown() bool {
	return trigger == TriggerScheduler || trigger == TriggerForce
}
|
||||
|
||||
// Input stores the per-call arguments for one turn-generation
|
||||
// operation.
|
||||
type Input struct {
|
||||
// GameID identifies the runtime to drive.
|
||||
GameID string
|
||||
|
||||
// Trigger classifies the caller. Used for telemetry and logs only.
|
||||
Trigger Trigger
|
||||
|
||||
// OpSource classifies how the request entered Game Master. Used to
|
||||
// stamp `operation_log.op_source`. Defaults to `admin_rest` when
|
||||
// missing or unrecognised.
|
||||
OpSource operation.OpSource
|
||||
|
||||
// SourceRef stores the optional opaque per-source reference (REST
|
||||
// request id, scheduler tick id). Empty when the caller does not
|
||||
// provide one.
|
||||
SourceRef string
|
||||
}
|
||||
|
||||
// Validate reports whether input carries the structural invariants the
|
||||
// service requires before any store is touched.
|
||||
func (input Input) Validate() error {
|
||||
if strings.TrimSpace(input.GameID) == "" {
|
||||
return fmt.Errorf("game id must not be empty")
|
||||
}
|
||||
if !input.Trigger.IsKnown() {
|
||||
return fmt.Errorf("trigger %q is unsupported", input.Trigger)
|
||||
}
|
||||
if !input.OpSource.IsKnown() {
|
||||
return fmt.Errorf("op source %q is unsupported", input.OpSource)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Result stores the deterministic outcome of one Handle call.
|
||||
type Result struct {
|
||||
// Record carries the post-mutation runtime record. Populated on
|
||||
// every success outcome and on `engine_*` failures (where the row
|
||||
// was moved to `generation_failed`); zero on early-rejection
|
||||
// outcomes (`invalid_request`, `runtime_not_found`,
|
||||
// `runtime_not_running`, `conflict` on initial CAS,
|
||||
// `service_unavailable` on initial Get).
|
||||
Record runtime.RuntimeRecord
|
||||
|
||||
// Trigger echoes back Input.Trigger for log/telemetry consumers.
|
||||
Trigger Trigger
|
||||
|
||||
// Finished is true when the engine reported `finished=true` on this
|
||||
// turn and the runtime transitioned to `finished`.
|
||||
Finished bool
|
||||
|
||||
// Outcome reports whether the operation completed (success) or
|
||||
// produced a stable failure code.
|
||||
Outcome operation.Outcome
|
||||
|
||||
// ErrorCode stores the stable error code on failure. Empty on
|
||||
// success.
|
||||
ErrorCode string
|
||||
|
||||
// ErrorMessage stores the operator-readable detail on failure.
|
||||
// Empty on success.
|
||||
ErrorMessage string
|
||||
}
|
||||
|
||||
// IsSuccess reports whether the result represents a successful
|
||||
// operation.
|
||||
func (result Result) IsSuccess() bool {
|
||||
return result.Outcome == operation.OutcomeSuccess
|
||||
}
|
||||
|
||||
// Dependencies groups the collaborators required by Service.
|
||||
type Dependencies struct {
|
||||
// RuntimeRecords drives every CAS and scheduling persistence step.
|
||||
RuntimeRecords ports.RuntimeRecordStore
|
||||
|
||||
// PlayerMappings supplies the per-game roster used to project
|
||||
// engine player state to user-facing notification recipients and
|
||||
// `player_turn_stats`.
|
||||
PlayerMappings ports.PlayerMappingStore
|
||||
|
||||
// OperationLogs records the audit entry for the operation.
|
||||
OperationLogs ports.OperationLogStore
|
||||
|
||||
// Engine drives the engine /admin/turn call.
|
||||
Engine ports.EngineClient
|
||||
|
||||
// LobbyEvents publishes `runtime_snapshot_update` and
|
||||
// `game_finished` to `gm:lobby_events`.
|
||||
LobbyEvents ports.LobbyEventsPublisher
|
||||
|
||||
// Notifications publishes `game.turn.ready`, `game.finished`, and
|
||||
// `game.generation_failed` intents to `notification:intents`.
|
||||
Notifications ports.NotificationIntentPublisher
|
||||
|
||||
// Lobby resolves the human-readable `game_name` consumed by
|
||||
// notification payloads. Failure is fail-soft: the orchestrator
|
||||
// falls back to `game_id`.
|
||||
Lobby ports.LobbyClient
|
||||
|
||||
// Scheduler computes the post-success `next_generation_at` value.
|
||||
Scheduler *scheduler.Service
|
||||
|
||||
// Telemetry records the turn-generation outcome counter, lobby
|
||||
// publication counter, and notification publish-attempt counter.
|
||||
Telemetry *telemetry.Runtime
|
||||
|
||||
// Logger records structured service-level events. Defaults to
|
||||
// `slog.Default()` when nil.
|
||||
Logger *slog.Logger
|
||||
|
||||
// Clock supplies the wall-clock used for operation timestamps.
|
||||
// Defaults to `time.Now` when nil.
|
||||
Clock func() time.Time
|
||||
}
|
||||
|
||||
// Service executes the turn-generation lifecycle operation.
|
||||
type Service struct {
|
||||
runtimeRecords ports.RuntimeRecordStore
|
||||
playerMappings ports.PlayerMappingStore
|
||||
operationLogs ports.OperationLogStore
|
||||
engine ports.EngineClient
|
||||
lobbyEvents ports.LobbyEventsPublisher
|
||||
notifications ports.NotificationIntentPublisher
|
||||
lobby ports.LobbyClient
|
||||
scheduler *scheduler.Service
|
||||
|
||||
telemetry *telemetry.Runtime
|
||||
logger *slog.Logger
|
||||
clock func() time.Time
|
||||
}
|
||||
|
||||
// NewService constructs one Service from deps.
|
||||
func NewService(deps Dependencies) (*Service, error) {
|
||||
switch {
|
||||
case deps.RuntimeRecords == nil:
|
||||
return nil, errors.New("new turn generation service: nil runtime records")
|
||||
case deps.PlayerMappings == nil:
|
||||
return nil, errors.New("new turn generation service: nil player mappings")
|
||||
case deps.OperationLogs == nil:
|
||||
return nil, errors.New("new turn generation service: nil operation logs")
|
||||
case deps.Engine == nil:
|
||||
return nil, errors.New("new turn generation service: nil engine client")
|
||||
case deps.LobbyEvents == nil:
|
||||
return nil, errors.New("new turn generation service: nil lobby events publisher")
|
||||
case deps.Notifications == nil:
|
||||
return nil, errors.New("new turn generation service: nil notification publisher")
|
||||
case deps.Lobby == nil:
|
||||
return nil, errors.New("new turn generation service: nil lobby client")
|
||||
case deps.Scheduler == nil:
|
||||
return nil, errors.New("new turn generation service: nil scheduler")
|
||||
case deps.Telemetry == nil:
|
||||
return nil, errors.New("new turn generation service: nil telemetry runtime")
|
||||
}
|
||||
|
||||
clock := deps.Clock
|
||||
if clock == nil {
|
||||
clock = time.Now
|
||||
}
|
||||
logger := deps.Logger
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
logger = logger.With("service", "gamemaster.turngeneration")
|
||||
|
||||
return &Service{
|
||||
runtimeRecords: deps.RuntimeRecords,
|
||||
playerMappings: deps.PlayerMappings,
|
||||
operationLogs: deps.OperationLogs,
|
||||
engine: deps.Engine,
|
||||
lobbyEvents: deps.LobbyEvents,
|
||||
notifications: deps.Notifications,
|
||||
lobby: deps.Lobby,
|
||||
scheduler: deps.Scheduler,
|
||||
telemetry: deps.Telemetry,
|
||||
logger: logger,
|
||||
clock: clock,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Handle executes one turn-generation operation end-to-end. The
|
||||
// Go-level error return is reserved for non-business failures (nil
|
||||
// context, nil receiver). Every business outcome flows through Result.
|
||||
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
|
||||
if service == nil {
|
||||
return Result{}, errors.New("turn generation: nil service")
|
||||
}
|
||||
if ctx == nil {
|
||||
return Result{}, errors.New("turn generation: nil context")
|
||||
}
|
||||
|
||||
opStartedAt := service.clock().UTC()
|
||||
|
||||
if err := input.Validate(); err != nil {
|
||||
return service.recordEarlyFailure(ctx, opStartedAt, input,
|
||||
ErrorCodeInvalidRequest, err.Error()), nil
|
||||
}
|
||||
|
||||
record, outcome, ok := service.loadRecord(ctx, opStartedAt, input)
|
||||
if !ok {
|
||||
return outcome, nil
|
||||
}
|
||||
|
||||
if record.Status != runtime.StatusRunning {
|
||||
return service.recordEarlyFailure(ctx, opStartedAt, input,
|
||||
ErrorCodeRuntimeNotRunning,
|
||||
fmt.Sprintf("runtime status is %q, expected %q",
|
||||
record.Status, runtime.StatusRunning)), nil
|
||||
}
|
||||
|
||||
if outcome, ok := service.casToInProgress(ctx, opStartedAt, input); !ok {
|
||||
return outcome, nil
|
||||
}
|
||||
|
||||
state, engineOK, engineCode, engineMsg := service.callEngineTurn(ctx, record)
|
||||
mappings, listErr := service.playerMappings.ListByGame(ctx, input.GameID)
|
||||
if listErr != nil {
|
||||
// Without mappings we cannot project player_turn_stats; treat
|
||||
// as a service_unavailable failure but still try to roll the
|
||||
// runtime to generation_failed because the engine call may
|
||||
// have already mutated state.
|
||||
return service.failGeneration(ctx, opStartedAt, input, record,
|
||||
ErrorCodeServiceUnavailable,
|
||||
fmt.Sprintf("list player mappings: %s", listErr.Error())), nil
|
||||
}
|
||||
|
||||
if !engineOK {
|
||||
return service.failGeneration(ctx, opStartedAt, input, record,
|
||||
engineCode, engineMsg), nil
|
||||
}
|
||||
|
||||
if outcome, ok := service.validateRoster(ctx, opStartedAt, input, record, state, mappings); !ok {
|
||||
return outcome, nil
|
||||
}
|
||||
|
||||
if state.Finished {
|
||||
return service.completeFinished(ctx, opStartedAt, input, record, state, mappings), nil
|
||||
}
|
||||
return service.completeRunning(ctx, opStartedAt, input, record, state, mappings), nil
|
||||
}
|
||||
|
||||
// loadRecord reads the runtime record and maps store errors to
|
||||
// orchestrator outcomes. ok=false means the flow stops with the
|
||||
// returned Result.
|
||||
func (service *Service) loadRecord(ctx context.Context, opStartedAt time.Time, input Input) (runtime.RuntimeRecord, Result, bool) {
|
||||
record, err := service.runtimeRecords.Get(ctx, input.GameID)
|
||||
switch {
|
||||
case err == nil:
|
||||
return record, Result{}, true
|
||||
case errors.Is(err, runtime.ErrNotFound):
|
||||
return runtime.RuntimeRecord{}, service.recordEarlyFailure(ctx, opStartedAt, input,
|
||||
ErrorCodeRuntimeNotFound, "runtime record does not exist"), false
|
||||
default:
|
||||
return runtime.RuntimeRecord{}, service.recordEarlyFailure(ctx, opStartedAt, input,
|
||||
ErrorCodeServiceUnavailable, fmt.Sprintf("get runtime record: %s", err.Error())), false
|
||||
}
|
||||
}
|
||||
|
||||
// casToInProgress flips the runtime row from `running` to
|
||||
// `generation_in_progress`. ok=false means the flow stops with the
|
||||
// returned Result; the caller has not touched the engine yet.
|
||||
func (service *Service) casToInProgress(ctx context.Context, opStartedAt time.Time, input Input) (Result, bool) {
|
||||
err := service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
|
||||
GameID: input.GameID,
|
||||
ExpectedFrom: runtime.StatusRunning,
|
||||
To: runtime.StatusGenerationInProgress,
|
||||
Now: opStartedAt,
|
||||
})
|
||||
switch {
|
||||
case err == nil:
|
||||
return Result{}, true
|
||||
case errors.Is(err, runtime.ErrConflict):
|
||||
return service.recordEarlyFailure(ctx, opStartedAt, input,
|
||||
ErrorCodeConflict,
|
||||
fmt.Sprintf("cas runtime status to generation_in_progress: %s", err.Error())), false
|
||||
case errors.Is(err, runtime.ErrNotFound):
|
||||
return service.recordEarlyFailure(ctx, opStartedAt, input,
|
||||
ErrorCodeRuntimeNotFound,
|
||||
fmt.Sprintf("cas runtime status to generation_in_progress: %s", err.Error())), false
|
||||
default:
|
||||
return service.recordEarlyFailure(ctx, opStartedAt, input,
|
||||
ErrorCodeServiceUnavailable,
|
||||
fmt.Sprintf("cas runtime status to generation_in_progress: %s", err.Error())), false
|
||||
}
|
||||
}
|
||||
|
||||
// callEngineTurn dispatches the engine /admin/turn call and classifies
|
||||
// the outcome. engineOK=true means the response is well-formed at the
|
||||
// transport level; engineOK=false populates errorCode / errorMessage
|
||||
// with a stable failure shape.
|
||||
func (service *Service) callEngineTurn(ctx context.Context, record runtime.RuntimeRecord) (state ports.StateResponse, engineOK bool, errorCode string, errorMessage string) {
|
||||
state, err := service.engine.Turn(ctx, record.EngineEndpoint)
|
||||
if err == nil {
|
||||
return state, true, "", ""
|
||||
}
|
||||
return ports.StateResponse{}, false, classifyEngineError(err), fmt.Sprintf("engine turn: %s", err.Error())
|
||||
}
|
||||
|
||||
// classifyEngineError maps the engine port sentinels to the
|
||||
// turn-generation stable error codes.
|
||||
func classifyEngineError(err error) string {
|
||||
switch {
|
||||
case errors.Is(err, ports.ErrEngineValidation):
|
||||
return ErrorCodeEngineValidationError
|
||||
case errors.Is(err, ports.ErrEngineProtocolViolation):
|
||||
return ErrorCodeEngineProtocolViolation
|
||||
case errors.Is(err, ports.ErrEngineUnreachable):
|
||||
return ErrorCodeEngineUnreachable
|
||||
default:
|
||||
return ErrorCodeEngineUnreachable
|
||||
}
|
||||
}
|
||||
|
||||
// validateRoster checks that the engine response carries exactly the
|
||||
// race set installed at register-runtime. ok=false means the flow stops
|
||||
// (and the runtime row is moved to `generation_failed`).
|
||||
func (service *Service) validateRoster(ctx context.Context, opStartedAt time.Time, input Input, record runtime.RuntimeRecord, state ports.StateResponse, mappings []playermapping.PlayerMapping) (Result, bool) {
|
||||
if len(state.Players) != len(mappings) {
|
||||
message := fmt.Sprintf("engine player count %d does not match roster size %d",
|
||||
len(state.Players), len(mappings))
|
||||
return service.failGeneration(ctx, opStartedAt, input, record,
|
||||
ErrorCodeEngineProtocolViolation, message), false
|
||||
}
|
||||
expected := make(map[string]struct{}, len(mappings))
|
||||
for _, mapping := range mappings {
|
||||
expected[mapping.RaceName] = struct{}{}
|
||||
}
|
||||
for _, player := range state.Players {
|
||||
if _, ok := expected[player.RaceName]; !ok {
|
||||
message := fmt.Sprintf("engine returned race %q not present in roster", player.RaceName)
|
||||
return service.failGeneration(ctx, opStartedAt, input, record,
|
||||
ErrorCodeEngineProtocolViolation, message), false
|
||||
}
|
||||
}
|
||||
return Result{}, true
|
||||
}
|
||||
|
||||
// completeFinished handles the `finished=true` branch: CAS to finished,
|
||||
// clear scheduling, publish game_finished, publish game.finished
|
||||
// notification, audit success.
|
||||
func (service *Service) completeFinished(ctx context.Context, opStartedAt time.Time, input Input, record runtime.RuntimeRecord, state ports.StateResponse, mappings []playermapping.PlayerMapping) Result {
|
||||
finishedAt := service.clock().UTC()
|
||||
|
||||
err := service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
|
||||
GameID: input.GameID,
|
||||
ExpectedFrom: runtime.StatusGenerationInProgress,
|
||||
To: runtime.StatusFinished,
|
||||
Now: finishedAt,
|
||||
})
|
||||
if err != nil {
|
||||
return service.handlePostEngineCASFailure(ctx, opStartedAt, input, record, err)
|
||||
}
|
||||
|
||||
if err := service.runtimeRecords.UpdateScheduling(ctx, ports.UpdateSchedulingInput{
|
||||
GameID: input.GameID,
|
||||
NextGenerationAt: nil,
|
||||
SkipNextTick: false,
|
||||
CurrentTurn: state.Turn,
|
||||
Now: finishedAt,
|
||||
}); err != nil {
|
||||
// The CAS to finished succeeded; the row is in the terminal
|
||||
// state. Surface a service_unavailable to the caller but keep
|
||||
// the audit and snapshot consistent.
|
||||
return service.recordTerminalFailure(ctx, opStartedAt, input,
|
||||
ErrorCodeServiceUnavailable,
|
||||
fmt.Sprintf("update scheduling on finish: %s", err.Error()))
|
||||
}
|
||||
|
||||
persisted, reloadErr := service.runtimeRecords.Get(ctx, input.GameID)
|
||||
if reloadErr != nil {
|
||||
return service.recordTerminalFailure(ctx, opStartedAt, input,
|
||||
ErrorCodeServiceUnavailable,
|
||||
fmt.Sprintf("reload runtime record: %s", reloadErr.Error()))
|
||||
}
|
||||
|
||||
stats := projectPlayerStats(state, mappings)
|
||||
|
||||
finishedMsg := ports.GameFinished{
|
||||
GameID: input.GameID,
|
||||
FinalTurnNumber: state.Turn,
|
||||
RuntimeStatus: runtime.StatusFinished,
|
||||
PlayerTurnStats: stats,
|
||||
FinishedAt: finishedAt,
|
||||
}
|
||||
if err := service.lobbyEvents.PublishGameFinished(ctx, finishedMsg); err != nil {
|
||||
service.logger.ErrorContext(ctx, "publish game finished",
|
||||
"game_id", input.GameID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
} else {
|
||||
service.telemetry.RecordLobbyEventPublished(ctx, "game_finished")
|
||||
}
|
||||
|
||||
gameName := service.resolveGameName(ctx, input.GameID)
|
||||
recipients := recipientUserIDs(mappings)
|
||||
service.publishGameFinishedIntent(ctx, input, gameName, state.Turn, recipients, finishedAt)
|
||||
|
||||
service.appendSuccessLog(ctx, opStartedAt, input)
|
||||
service.telemetry.RecordTurnGenerationOutcome(ctx,
|
||||
string(operation.OutcomeSuccess), "", string(input.Trigger))
|
||||
|
||||
logArgs := []any{
|
||||
"game_id", input.GameID,
|
||||
"trigger", string(input.Trigger),
|
||||
"final_turn", state.Turn,
|
||||
"finished", true,
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
service.logger.InfoContext(ctx, "turn generation finished game", logArgs...)
|
||||
|
||||
return Result{
|
||||
Record: persisted,
|
||||
Trigger: input.Trigger,
|
||||
Finished: true,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
}
|
||||
}
|
||||
|
||||
// completeRunning handles the `finished=false` branch: recompute next
|
||||
// tick, CAS back to running, publish snapshot, publish
|
||||
// game.turn.ready notification, audit success.
|
||||
func (service *Service) completeRunning(ctx context.Context, opStartedAt time.Time, input Input, record runtime.RuntimeRecord, state ports.StateResponse, mappings []playermapping.PlayerMapping) Result {
|
||||
completedAt := service.clock().UTC()
|
||||
|
||||
next, _, err := service.scheduler.ComputeNext(record.TurnSchedule, completedAt, record.SkipNextTick)
|
||||
if err != nil {
|
||||
return service.failGeneration(ctx, opStartedAt, input, record,
|
||||
ErrorCodeInvalidRequest,
|
||||
fmt.Sprintf("recompute next tick: %s", err.Error()))
|
||||
}
|
||||
|
||||
if err := service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
|
||||
GameID: input.GameID,
|
||||
ExpectedFrom: runtime.StatusGenerationInProgress,
|
||||
To: runtime.StatusRunning,
|
||||
Now: completedAt,
|
||||
}); err != nil {
|
||||
return service.handlePostEngineCASFailure(ctx, opStartedAt, input, record, err)
|
||||
}
|
||||
|
||||
if err := service.runtimeRecords.UpdateScheduling(ctx, ports.UpdateSchedulingInput{
|
||||
GameID: input.GameID,
|
||||
NextGenerationAt: &next,
|
||||
SkipNextTick: false,
|
||||
CurrentTurn: state.Turn,
|
||||
Now: completedAt,
|
||||
}); err != nil {
|
||||
return service.recordTerminalFailure(ctx, opStartedAt, input,
|
||||
ErrorCodeServiceUnavailable,
|
||||
fmt.Sprintf("update scheduling on running: %s", err.Error()))
|
||||
}
|
||||
|
||||
persisted, reloadErr := service.runtimeRecords.Get(ctx, input.GameID)
|
||||
if reloadErr != nil {
|
||||
return service.recordTerminalFailure(ctx, opStartedAt, input,
|
||||
ErrorCodeServiceUnavailable,
|
||||
fmt.Sprintf("reload runtime record: %s", reloadErr.Error()))
|
||||
}
|
||||
|
||||
stats := projectPlayerStats(state, mappings)
|
||||
|
||||
snapshot := ports.RuntimeSnapshotUpdate{
|
||||
GameID: input.GameID,
|
||||
CurrentTurn: state.Turn,
|
||||
RuntimeStatus: runtime.StatusRunning,
|
||||
EngineHealthSummary: persisted.EngineHealth,
|
||||
PlayerTurnStats: stats,
|
||||
OccurredAt: completedAt,
|
||||
}
|
||||
if err := service.lobbyEvents.PublishSnapshotUpdate(ctx, snapshot); err != nil {
|
||||
service.logger.ErrorContext(ctx, "publish runtime snapshot update",
|
||||
"game_id", input.GameID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
} else {
|
||||
service.telemetry.RecordLobbyEventPublished(ctx, "runtime_snapshot_update")
|
||||
}
|
||||
|
||||
gameName := service.resolveGameName(ctx, input.GameID)
|
||||
recipients := recipientUserIDs(mappings)
|
||||
service.publishGameTurnReadyIntent(ctx, input, gameName, state.Turn, recipients, completedAt)
|
||||
|
||||
service.appendSuccessLog(ctx, opStartedAt, input)
|
||||
service.telemetry.RecordTurnGenerationOutcome(ctx,
|
||||
string(operation.OutcomeSuccess), "", string(input.Trigger))
|
||||
|
||||
logArgs := []any{
|
||||
"game_id", input.GameID,
|
||||
"trigger", string(input.Trigger),
|
||||
"current_turn", state.Turn,
|
||||
"next_generation_at", next.Format(time.RFC3339Nano),
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
service.logger.InfoContext(ctx, "turn generation succeeded", logArgs...)
|
||||
|
||||
return Result{
|
||||
Record: persisted,
|
||||
Trigger: input.Trigger,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
}
|
||||
}
|
||||
|
||||
// failGeneration handles every post-CAS failure path: CAS to
|
||||
// generation_failed, publish snapshot, publish game.generation_failed
|
||||
// admin notification, audit failure.
|
||||
func (service *Service) failGeneration(ctx context.Context, opStartedAt time.Time, input Input, _ runtime.RuntimeRecord, errorCode string, errorMessage string) Result {
|
||||
failedAt := service.clock().UTC()
|
||||
|
||||
casErr := service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
|
||||
GameID: input.GameID,
|
||||
ExpectedFrom: runtime.StatusGenerationInProgress,
|
||||
To: runtime.StatusGenerationFailed,
|
||||
Now: failedAt,
|
||||
})
|
||||
if casErr != nil && !errors.Is(casErr, runtime.ErrConflict) {
|
||||
// Best-effort transition. The original error code remains the
|
||||
// caller-visible one; log the secondary failure.
|
||||
service.logger.ErrorContext(ctx, "cas runtime status to generation_failed",
|
||||
"game_id", input.GameID,
|
||||
"err", casErr.Error(),
|
||||
)
|
||||
}
|
||||
|
||||
persisted, reloadErr := service.runtimeRecords.Get(ctx, input.GameID)
|
||||
publishedStatus := runtime.StatusGenerationFailed
|
||||
if reloadErr == nil {
|
||||
publishedStatus = persisted.Status
|
||||
}
|
||||
|
||||
snapshot := ports.RuntimeSnapshotUpdate{
|
||||
GameID: input.GameID,
|
||||
CurrentTurn: persistedTurn(persisted, reloadErr),
|
||||
RuntimeStatus: publishedStatus,
|
||||
EngineHealthSummary: persistedHealth(persisted, reloadErr),
|
||||
PlayerTurnStats: nil,
|
||||
OccurredAt: failedAt,
|
||||
}
|
||||
if err := service.lobbyEvents.PublishSnapshotUpdate(ctx, snapshot); err != nil {
|
||||
service.logger.ErrorContext(ctx, "publish runtime snapshot update on failure",
|
||||
"game_id", input.GameID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
} else {
|
||||
service.telemetry.RecordLobbyEventPublished(ctx, "runtime_snapshot_update")
|
||||
}
|
||||
|
||||
gameName := service.resolveGameName(ctx, input.GameID)
|
||||
service.publishGameGenerationFailedIntent(ctx, input, gameName, errorCode, errorMessage, failedAt)
|
||||
|
||||
service.appendFailureLog(ctx, opStartedAt, input, errorCode, errorMessage)
|
||||
service.telemetry.RecordTurnGenerationOutcome(ctx,
|
||||
string(operation.OutcomeFailure), errorCode, string(input.Trigger))
|
||||
|
||||
logArgs := []any{
|
||||
"game_id", input.GameID,
|
||||
"trigger", string(input.Trigger),
|
||||
"error_code", errorCode,
|
||||
"error_message", errorMessage,
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
service.logger.WarnContext(ctx, "turn generation failed", logArgs...)
|
||||
|
||||
return Result{
|
||||
Record: persisted,
|
||||
Trigger: input.Trigger,
|
||||
Outcome: operation.OutcomeFailure,
|
||||
ErrorCode: errorCode,
|
||||
ErrorMessage: errorMessage,
|
||||
}
|
||||
}
|
||||
|
||||
// handlePostEngineCASFailure maps a CAS error that surfaced after the
|
||||
// engine call already succeeded. Conflict means an external actor (e.g.
|
||||
// admin stop) won the race; other errors are treated as
|
||||
// service_unavailable. No publication is issued — the external mutation
|
||||
// owns its own snapshot.
|
||||
func (service *Service) handlePostEngineCASFailure(ctx context.Context, opStartedAt time.Time, input Input, _ runtime.RuntimeRecord, casErr error) Result {
|
||||
switch {
|
||||
case errors.Is(casErr, runtime.ErrConflict):
|
||||
return service.recordTerminalFailure(ctx, opStartedAt, input,
|
||||
ErrorCodeConflict,
|
||||
fmt.Sprintf("cas runtime status post-engine: %s", casErr.Error()))
|
||||
case errors.Is(casErr, runtime.ErrNotFound):
|
||||
return service.recordTerminalFailure(ctx, opStartedAt, input,
|
||||
ErrorCodeRuntimeNotFound,
|
||||
fmt.Sprintf("cas runtime status post-engine: %s", casErr.Error()))
|
||||
default:
|
||||
return service.recordTerminalFailure(ctx, opStartedAt, input,
|
||||
ErrorCodeServiceUnavailable,
|
||||
fmt.Sprintf("cas runtime status post-engine: %s", casErr.Error()))
|
||||
}
|
||||
}
|
||||
|
||||
// recordEarlyFailure handles failures that occur before the runtime row
|
||||
// is in `generation_in_progress`. No status mutation, no publication;
|
||||
// only audit and telemetry.
|
||||
func (service *Service) recordEarlyFailure(ctx context.Context, opStartedAt time.Time, input Input, errorCode string, errorMessage string) Result {
|
||||
service.appendFailureLog(ctx, opStartedAt, input, errorCode, errorMessage)
|
||||
service.telemetry.RecordTurnGenerationOutcome(ctx,
|
||||
string(operation.OutcomeFailure), errorCode, string(input.Trigger))
|
||||
logArgs := []any{
|
||||
"game_id", input.GameID,
|
||||
"trigger", string(input.Trigger),
|
||||
"error_code", errorCode,
|
||||
"error_message", errorMessage,
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
service.logger.WarnContext(ctx, "turn generation rejected", logArgs...)
|
||||
return Result{
|
||||
Trigger: input.Trigger,
|
||||
Outcome: operation.OutcomeFailure,
|
||||
ErrorCode: errorCode,
|
||||
ErrorMessage: errorMessage,
|
||||
}
|
||||
}
|
||||
|
||||
// recordTerminalFailure handles failures after a post-engine CAS or a
|
||||
// reload failed. The runtime row is in an undetermined state owned by
|
||||
// whatever mutation won; we record the audit and surface the failure
|
||||
// without further publication.
|
||||
func (service *Service) recordTerminalFailure(ctx context.Context, opStartedAt time.Time, input Input, errorCode string, errorMessage string) Result {
|
||||
service.appendFailureLog(ctx, opStartedAt, input, errorCode, errorMessage)
|
||||
service.telemetry.RecordTurnGenerationOutcome(ctx,
|
||||
string(operation.OutcomeFailure), errorCode, string(input.Trigger))
|
||||
logArgs := []any{
|
||||
"game_id", input.GameID,
|
||||
"trigger", string(input.Trigger),
|
||||
"error_code", errorCode,
|
||||
"error_message", errorMessage,
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
service.logger.WarnContext(ctx, "turn generation post-engine failure", logArgs...)
|
||||
return Result{
|
||||
Trigger: input.Trigger,
|
||||
Outcome: operation.OutcomeFailure,
|
||||
ErrorCode: errorCode,
|
||||
ErrorMessage: errorMessage,
|
||||
}
|
||||
}
|
||||
|
||||
// resolveGameName fetches the human-readable game name from Lobby and
|
||||
// falls back to the platform game id on any error per Stage 15 D1.
|
||||
func (service *Service) resolveGameName(ctx context.Context, gameID string) string {
|
||||
summary, err := service.lobby.GetGameSummary(ctx, gameID)
|
||||
if err != nil {
|
||||
logArgs := []any{
|
||||
"game_id", gameID,
|
||||
"error_code", "lobby_unavailable",
|
||||
"err", err.Error(),
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
service.logger.WarnContext(ctx, "resolve game name fell back to game id", logArgs...)
|
||||
return gameID
|
||||
}
|
||||
if strings.TrimSpace(summary.GameName) == "" {
|
||||
return gameID
|
||||
}
|
||||
return summary.GameName
|
||||
}
|
||||
|
||||
// publishGameTurnReadyIntent publishes the user-targeted notification
|
||||
// that announces a freshly generated turn. Empty recipient sets are
|
||||
// dropped silently — the validator inside notificationintent rejects
|
||||
// them outright, but the orchestrator should not break commit.
|
||||
func (service *Service) publishGameTurnReadyIntent(ctx context.Context, input Input, gameName string, turnNumber int, recipients []string, occurredAt time.Time) {
|
||||
if len(recipients) == 0 {
|
||||
service.logger.WarnContext(ctx, "skip game.turn.ready notification: empty recipient set",
|
||||
"game_id", input.GameID,
|
||||
)
|
||||
return
|
||||
}
|
||||
intent, err := notificationintent.NewGameTurnReadyIntent(
|
||||
notificationintent.Metadata{
|
||||
IdempotencyKey: fmt.Sprintf("game.turn.ready:%s:%d", input.GameID, turnNumber),
|
||||
OccurredAt: occurredAt,
|
||||
RequestID: logging.RequestIDFromContext(ctx),
|
||||
},
|
||||
recipients,
|
||||
notificationintent.GameTurnReadyPayload{
|
||||
GameID: input.GameID,
|
||||
GameName: gameName,
|
||||
TurnNumber: int64(turnNumber),
|
||||
},
|
||||
)
|
||||
if err != nil {
|
||||
service.logger.ErrorContext(ctx, "build game.turn.ready intent",
|
||||
"game_id", input.GameID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
service.telemetry.RecordNotificationPublishAttempt(ctx,
|
||||
string(notificationintent.NotificationTypeGameTurnReady), "error")
|
||||
return
|
||||
}
|
||||
if err := service.notifications.Publish(ctx, intent); err != nil {
|
||||
service.logger.ErrorContext(ctx, "publish game.turn.ready intent",
|
||||
"game_id", input.GameID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
service.telemetry.RecordNotificationPublishAttempt(ctx,
|
||||
string(notificationintent.NotificationTypeGameTurnReady), "error")
|
||||
return
|
||||
}
|
||||
service.telemetry.RecordNotificationPublishAttempt(ctx,
|
||||
string(notificationintent.NotificationTypeGameTurnReady), "ok")
|
||||
}
|
||||
|
||||
// publishGameFinishedIntent publishes the user-targeted notification
|
||||
// that announces a finished game.
|
||||
func (service *Service) publishGameFinishedIntent(ctx context.Context, input Input, gameName string, finalTurnNumber int, recipients []string, occurredAt time.Time) {
|
||||
if len(recipients) == 0 {
|
||||
service.logger.WarnContext(ctx, "skip game.finished notification: empty recipient set",
|
||||
"game_id", input.GameID,
|
||||
)
|
||||
return
|
||||
}
|
||||
intent, err := notificationintent.NewGameFinishedIntent(
|
||||
notificationintent.Metadata{
|
||||
IdempotencyKey: fmt.Sprintf("game.finished:%s:%d", input.GameID, finalTurnNumber),
|
||||
OccurredAt: occurredAt,
|
||||
RequestID: logging.RequestIDFromContext(ctx),
|
||||
},
|
||||
recipients,
|
||||
notificationintent.GameFinishedPayload{
|
||||
GameID: input.GameID,
|
||||
GameName: gameName,
|
||||
FinalTurnNumber: int64(finalTurnNumber),
|
||||
},
|
||||
)
|
||||
if err != nil {
|
||||
service.logger.ErrorContext(ctx, "build game.finished intent",
|
||||
"game_id", input.GameID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
service.telemetry.RecordNotificationPublishAttempt(ctx,
|
||||
string(notificationintent.NotificationTypeGameFinished), "error")
|
||||
return
|
||||
}
|
||||
if err := service.notifications.Publish(ctx, intent); err != nil {
|
||||
service.logger.ErrorContext(ctx, "publish game.finished intent",
|
||||
"game_id", input.GameID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
service.telemetry.RecordNotificationPublishAttempt(ctx,
|
||||
string(notificationintent.NotificationTypeGameFinished), "error")
|
||||
return
|
||||
}
|
||||
service.telemetry.RecordNotificationPublishAttempt(ctx,
|
||||
string(notificationintent.NotificationTypeGameFinished), "ok")
|
||||
}
|
||||
|
||||
// publishGameGenerationFailedIntent publishes the admin-email
|
||||
// notification that announces a failed turn generation.
|
||||
func (service *Service) publishGameGenerationFailedIntent(ctx context.Context, input Input, gameName string, errorCode string, errorMessage string, occurredAt time.Time) {
|
||||
failureReason := errorCode
|
||||
if strings.TrimSpace(errorMessage) != "" {
|
||||
failureReason = fmt.Sprintf("%s: %s", errorCode, errorMessage)
|
||||
}
|
||||
intent, err := notificationintent.NewGameGenerationFailedIntent(
|
||||
notificationintent.Metadata{
|
||||
IdempotencyKey: fmt.Sprintf("game.generation_failed:%s:%d",
|
||||
input.GameID, occurredAt.UnixMilli()),
|
||||
OccurredAt: occurredAt,
|
||||
RequestID: logging.RequestIDFromContext(ctx),
|
||||
},
|
||||
notificationintent.GameGenerationFailedPayload{
|
||||
GameID: input.GameID,
|
||||
GameName: gameName,
|
||||
FailureReason: failureReason,
|
||||
},
|
||||
)
|
||||
if err != nil {
|
||||
service.logger.ErrorContext(ctx, "build game.generation_failed intent",
|
||||
"game_id", input.GameID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
service.telemetry.RecordNotificationPublishAttempt(ctx,
|
||||
string(notificationintent.NotificationTypeGameGenerationFailed), "error")
|
||||
return
|
||||
}
|
||||
if err := service.notifications.Publish(ctx, intent); err != nil {
|
||||
service.logger.ErrorContext(ctx, "publish game.generation_failed intent",
|
||||
"game_id", input.GameID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
service.telemetry.RecordNotificationPublishAttempt(ctx,
|
||||
string(notificationintent.NotificationTypeGameGenerationFailed), "error")
|
||||
return
|
||||
}
|
||||
service.telemetry.RecordNotificationPublishAttempt(ctx,
|
||||
string(notificationintent.NotificationTypeGameGenerationFailed), "ok")
|
||||
}
|
||||
|
||||
// projectPlayerStats joins the engine response on RaceName against the
|
||||
// installed roster to build one PlayerTurnStats per active member.
|
||||
// Result is sorted by UserID for a deterministic wire order.
|
||||
func projectPlayerStats(state ports.StateResponse, mappings []playermapping.PlayerMapping) []ports.PlayerTurnStats {
|
||||
if len(state.Players) == 0 || len(mappings) == 0 {
|
||||
return nil
|
||||
}
|
||||
userByRace := make(map[string]string, len(mappings))
|
||||
for _, mapping := range mappings {
|
||||
userByRace[mapping.RaceName] = mapping.UserID
|
||||
}
|
||||
stats := make([]ports.PlayerTurnStats, 0, len(state.Players))
|
||||
for _, player := range state.Players {
|
||||
userID, ok := userByRace[player.RaceName]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
stats = append(stats, ports.PlayerTurnStats{
|
||||
UserID: userID,
|
||||
Planets: player.Planets,
|
||||
Population: player.Population,
|
||||
})
|
||||
}
|
||||
sort.Slice(stats, func(i, j int) bool { return stats[i].UserID < stats[j].UserID })
|
||||
return stats
|
||||
}
|
||||
|
||||
// recipientUserIDs returns the deduplicated, sorted-ascending list of
|
||||
// platform user ids derived from the roster. Mirrors the
|
||||
// notificationintent validator's expectations.
|
||||
func recipientUserIDs(mappings []playermapping.PlayerMapping) []string {
|
||||
if len(mappings) == 0 {
|
||||
return nil
|
||||
}
|
||||
seen := make(map[string]struct{}, len(mappings))
|
||||
result := make([]string, 0, len(mappings))
|
||||
for _, mapping := range mappings {
|
||||
userID := strings.TrimSpace(mapping.UserID)
|
||||
if userID == "" {
|
||||
continue
|
||||
}
|
||||
if _, ok := seen[userID]; ok {
|
||||
continue
|
||||
}
|
||||
seen[userID] = struct{}{}
|
||||
result = append(result, userID)
|
||||
}
|
||||
sort.Strings(result)
|
||||
return result
|
||||
}
|
||||
|
||||
// persistedTurn returns the stored CurrentTurn when reloadErr is nil,
|
||||
// or zero otherwise. Used to populate the failure-side snapshot
|
||||
// without making a second DB read.
|
||||
func persistedTurn(record runtime.RuntimeRecord, reloadErr error) int {
|
||||
if reloadErr != nil {
|
||||
return 0
|
||||
}
|
||||
return record.CurrentTurn
|
||||
}
|
||||
|
||||
// persistedHealth returns the stored EngineHealth when reloadErr is
|
||||
// nil, or empty string otherwise.
|
||||
func persistedHealth(record runtime.RuntimeRecord, reloadErr error) string {
|
||||
if reloadErr != nil {
|
||||
return ""
|
||||
}
|
||||
return record.EngineHealth
|
||||
}
|
||||
|
||||
// appendSuccessLog records the success operation_log entry.
|
||||
func (service *Service) appendSuccessLog(ctx context.Context, opStartedAt time.Time, input Input) {
|
||||
finishedAt := service.clock().UTC()
|
||||
service.bestEffortAppend(ctx, operation.OperationEntry{
|
||||
GameID: input.GameID,
|
||||
OpKind: operation.OpKindTurnGeneration,
|
||||
OpSource: fallbackOpSource(input.OpSource),
|
||||
SourceRef: input.SourceRef,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
StartedAt: opStartedAt,
|
||||
FinishedAt: &finishedAt,
|
||||
})
|
||||
}
|
||||
|
||||
// appendFailureLog records the failure operation_log entry.
|
||||
func (service *Service) appendFailureLog(ctx context.Context, opStartedAt time.Time, input Input, errorCode string, errorMessage string) {
|
||||
finishedAt := service.clock().UTC()
|
||||
service.bestEffortAppend(ctx, operation.OperationEntry{
|
||||
GameID: input.GameID,
|
||||
OpKind: operation.OpKindTurnGeneration,
|
||||
OpSource: fallbackOpSource(input.OpSource),
|
||||
SourceRef: input.SourceRef,
|
||||
Outcome: operation.OutcomeFailure,
|
||||
ErrorCode: errorCode,
|
||||
ErrorMessage: errorMessage,
|
||||
StartedAt: opStartedAt,
|
||||
FinishedAt: &finishedAt,
|
||||
})
|
||||
}
|
||||
|
||||
// bestEffortAppend writes one operation_log entry. A failure is logged
|
||||
// and discarded; the runtime row is the source of truth.
|
||||
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
|
||||
if _, err := service.operationLogs.Append(ctx, entry); err != nil {
|
||||
service.logger.ErrorContext(ctx, "append operation log",
|
||||
"game_id", entry.GameID,
|
||||
"op_kind", string(entry.OpKind),
|
||||
"outcome", string(entry.Outcome),
|
||||
"error_code", entry.ErrorCode,
|
||||
"err", err.Error(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// fallbackOpSource defaults to admin_rest when source is missing or
|
||||
// unrecognised. Mirrors `gamemaster/README.md §Trusted Surfaces`.
|
||||
func fallbackOpSource(source operation.OpSource) operation.OpSource {
|
||||
if source.IsKnown() {
|
||||
return source
|
||||
}
|
||||
return operation.OpSourceAdminRest
|
||||
}
|
||||
Reference in New Issue
Block a user