// Package turngeneration implements the turn-generation orchestrator
// owned by Game Master. It is the single entry point through which the
// scheduler ticker (Stage 15 worker) and the admin force-next-turn flow
// (Stage 17) drive a turn through the engine container.
//
// Lifecycle and failure-mode semantics follow `gamemaster/README.md`
// (§Lifecycles → Turn generation and §Force-next-turn). Design rationale
// is captured in
// `gamemaster/docs/stage15-scheduler-and-turn-generation.md`.
|
|
package turngeneration
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"log/slog"
|
|
"sort"
|
|
"strings"
|
|
"time"
|
|
|
|
"galaxy/gamemaster/internal/domain/operation"
|
|
"galaxy/gamemaster/internal/domain/playermapping"
|
|
"galaxy/gamemaster/internal/domain/runtime"
|
|
"galaxy/gamemaster/internal/logging"
|
|
"galaxy/gamemaster/internal/ports"
|
|
"galaxy/gamemaster/internal/service/scheduler"
|
|
"galaxy/gamemaster/internal/telemetry"
|
|
"galaxy/notificationintent"
|
|
)
|
|
|
|
// Trigger names who asked for one turn-generation operation. The value
// is carried into telemetry and structured logs only; it does not
// branch the orchestrator's persistence path. The skip-tick mechanic is
// driven exclusively by the runtime record's `skip_next_tick` column.
type Trigger string

// Frozen trigger vocabulary.
const (
	// TriggerScheduler marks a generation dispatched by the
	// `schedulerticker` worker.
	TriggerScheduler Trigger = "scheduler"

	// TriggerForce marks a generation dispatched by the admin
	// force-next-turn flow (Stage 17 `service/adminforce`).
	TriggerForce Trigger = "force"
)

// IsKnown reports whether trigger is one of the declared Trigger
// constants. The zero value and any other string yield false.
func (trigger Trigger) IsKnown() bool {
	return trigger == TriggerScheduler || trigger == TriggerForce
}
|
|
|
|
// Input stores the per-call arguments for one turn-generation
|
|
// operation.
|
|
type Input struct {
|
|
// GameID identifies the runtime to drive.
|
|
GameID string
|
|
|
|
// Trigger classifies the caller. Used for telemetry and logs only.
|
|
Trigger Trigger
|
|
|
|
// OpSource classifies how the request entered Game Master. Used to
|
|
// stamp `operation_log.op_source`. Defaults to `admin_rest` when
|
|
// missing or unrecognised.
|
|
OpSource operation.OpSource
|
|
|
|
// SourceRef stores the optional opaque per-source reference (REST
|
|
// request id, scheduler tick id). Empty when the caller does not
|
|
// provide one.
|
|
SourceRef string
|
|
}
|
|
|
|
// Validate reports whether input carries the structural invariants the
|
|
// service requires before any store is touched.
|
|
func (input Input) Validate() error {
|
|
if strings.TrimSpace(input.GameID) == "" {
|
|
return fmt.Errorf("game id must not be empty")
|
|
}
|
|
if !input.Trigger.IsKnown() {
|
|
return fmt.Errorf("trigger %q is unsupported", input.Trigger)
|
|
}
|
|
if !input.OpSource.IsKnown() {
|
|
return fmt.Errorf("op source %q is unsupported", input.OpSource)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Result stores the deterministic outcome of one Handle call.
|
|
type Result struct {
|
|
// Record carries the post-mutation runtime record. Populated on
|
|
// every success outcome and on `engine_*` failures (where the row
|
|
// was moved to `generation_failed`); zero on early-rejection
|
|
// outcomes (`invalid_request`, `runtime_not_found`,
|
|
// `runtime_not_running`, `conflict` on initial CAS,
|
|
// `service_unavailable` on initial Get).
|
|
Record runtime.RuntimeRecord
|
|
|
|
// Trigger echoes back Input.Trigger for log/telemetry consumers.
|
|
Trigger Trigger
|
|
|
|
// Finished is true when the engine reported `finished=true` on this
|
|
// turn and the runtime transitioned to `finished`.
|
|
Finished bool
|
|
|
|
// Outcome reports whether the operation completed (success) or
|
|
// produced a stable failure code.
|
|
Outcome operation.Outcome
|
|
|
|
// ErrorCode stores the stable error code on failure. Empty on
|
|
// success.
|
|
ErrorCode string
|
|
|
|
// ErrorMessage stores the operator-readable detail on failure.
|
|
// Empty on success.
|
|
ErrorMessage string
|
|
}
|
|
|
|
// IsSuccess reports whether the result represents a successful
|
|
// operation.
|
|
func (result Result) IsSuccess() bool {
|
|
return result.Outcome == operation.OutcomeSuccess
|
|
}
|
|
|
|
// Dependencies groups the collaborators required by Service.
|
|
type Dependencies struct {
|
|
// RuntimeRecords drives every CAS and scheduling persistence step.
|
|
RuntimeRecords ports.RuntimeRecordStore
|
|
|
|
// PlayerMappings supplies the per-game roster used to project
|
|
// engine player state to user-facing notification recipients and
|
|
// `player_turn_stats`.
|
|
PlayerMappings ports.PlayerMappingStore
|
|
|
|
// OperationLogs records the audit entry for the operation.
|
|
OperationLogs ports.OperationLogStore
|
|
|
|
// Engine drives the engine /admin/turn call.
|
|
Engine ports.EngineClient
|
|
|
|
// LobbyEvents publishes `runtime_snapshot_update` and
|
|
// `game_finished` to `gm:lobby_events`.
|
|
LobbyEvents ports.LobbyEventsPublisher
|
|
|
|
// Notifications publishes `game.turn.ready`, `game.finished`, and
|
|
// `game.generation_failed` intents to `notification:intents`.
|
|
Notifications ports.NotificationIntentPublisher
|
|
|
|
// Lobby resolves the human-readable `game_name` consumed by
|
|
// notification payloads. Failure is fail-soft: the orchestrator
|
|
// falls back to `game_id`.
|
|
Lobby ports.LobbyClient
|
|
|
|
// Scheduler computes the post-success `next_generation_at` value.
|
|
Scheduler *scheduler.Service
|
|
|
|
// Telemetry records the turn-generation outcome counter, lobby
|
|
// publication counter, and notification publish-attempt counter.
|
|
Telemetry *telemetry.Runtime
|
|
|
|
// Logger records structured service-level events. Defaults to
|
|
// `slog.Default()` when nil.
|
|
Logger *slog.Logger
|
|
|
|
// Clock supplies the wall-clock used for operation timestamps.
|
|
// Defaults to `time.Now` when nil.
|
|
Clock func() time.Time
|
|
}
|
|
|
|
// Service executes the turn-generation lifecycle operation.
|
|
type Service struct {
|
|
runtimeRecords ports.RuntimeRecordStore
|
|
playerMappings ports.PlayerMappingStore
|
|
operationLogs ports.OperationLogStore
|
|
engine ports.EngineClient
|
|
lobbyEvents ports.LobbyEventsPublisher
|
|
notifications ports.NotificationIntentPublisher
|
|
lobby ports.LobbyClient
|
|
scheduler *scheduler.Service
|
|
|
|
telemetry *telemetry.Runtime
|
|
logger *slog.Logger
|
|
clock func() time.Time
|
|
}
|
|
|
|
// NewService constructs one Service from deps.
|
|
func NewService(deps Dependencies) (*Service, error) {
|
|
switch {
|
|
case deps.RuntimeRecords == nil:
|
|
return nil, errors.New("new turn generation service: nil runtime records")
|
|
case deps.PlayerMappings == nil:
|
|
return nil, errors.New("new turn generation service: nil player mappings")
|
|
case deps.OperationLogs == nil:
|
|
return nil, errors.New("new turn generation service: nil operation logs")
|
|
case deps.Engine == nil:
|
|
return nil, errors.New("new turn generation service: nil engine client")
|
|
case deps.LobbyEvents == nil:
|
|
return nil, errors.New("new turn generation service: nil lobby events publisher")
|
|
case deps.Notifications == nil:
|
|
return nil, errors.New("new turn generation service: nil notification publisher")
|
|
case deps.Lobby == nil:
|
|
return nil, errors.New("new turn generation service: nil lobby client")
|
|
case deps.Scheduler == nil:
|
|
return nil, errors.New("new turn generation service: nil scheduler")
|
|
case deps.Telemetry == nil:
|
|
return nil, errors.New("new turn generation service: nil telemetry runtime")
|
|
}
|
|
|
|
clock := deps.Clock
|
|
if clock == nil {
|
|
clock = time.Now
|
|
}
|
|
logger := deps.Logger
|
|
if logger == nil {
|
|
logger = slog.Default()
|
|
}
|
|
logger = logger.With("service", "gamemaster.turngeneration")
|
|
|
|
return &Service{
|
|
runtimeRecords: deps.RuntimeRecords,
|
|
playerMappings: deps.PlayerMappings,
|
|
operationLogs: deps.OperationLogs,
|
|
engine: deps.Engine,
|
|
lobbyEvents: deps.LobbyEvents,
|
|
notifications: deps.Notifications,
|
|
lobby: deps.Lobby,
|
|
scheduler: deps.Scheduler,
|
|
telemetry: deps.Telemetry,
|
|
logger: logger,
|
|
clock: clock,
|
|
}, nil
|
|
}
|
|
|
|
// Handle executes one turn-generation operation end-to-end. The
|
|
// Go-level error return is reserved for non-business failures (nil
|
|
// context, nil receiver). Every business outcome flows through Result.
|
|
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
|
|
if service == nil {
|
|
return Result{}, errors.New("turn generation: nil service")
|
|
}
|
|
if ctx == nil {
|
|
return Result{}, errors.New("turn generation: nil context")
|
|
}
|
|
|
|
opStartedAt := service.clock().UTC()
|
|
|
|
if err := input.Validate(); err != nil {
|
|
return service.recordEarlyFailure(ctx, opStartedAt, input,
|
|
ErrorCodeInvalidRequest, err.Error()), nil
|
|
}
|
|
|
|
record, outcome, ok := service.loadRecord(ctx, opStartedAt, input)
|
|
if !ok {
|
|
return outcome, nil
|
|
}
|
|
|
|
if record.Status != runtime.StatusRunning {
|
|
return service.recordEarlyFailure(ctx, opStartedAt, input,
|
|
ErrorCodeRuntimeNotRunning,
|
|
fmt.Sprintf("runtime status is %q, expected %q",
|
|
record.Status, runtime.StatusRunning)), nil
|
|
}
|
|
|
|
if outcome, ok := service.casToInProgress(ctx, opStartedAt, input); !ok {
|
|
return outcome, nil
|
|
}
|
|
|
|
state, engineOK, engineCode, engineMsg := service.callEngineTurn(ctx, record)
|
|
mappings, listErr := service.playerMappings.ListByGame(ctx, input.GameID)
|
|
if listErr != nil {
|
|
// Without mappings we cannot project player_turn_stats; treat
|
|
// as a service_unavailable failure but still try to roll the
|
|
// runtime to generation_failed because the engine call may
|
|
// have already mutated state.
|
|
return service.failGeneration(ctx, opStartedAt, input, record,
|
|
ErrorCodeServiceUnavailable,
|
|
fmt.Sprintf("list player mappings: %s", listErr.Error())), nil
|
|
}
|
|
|
|
if !engineOK {
|
|
return service.failGeneration(ctx, opStartedAt, input, record,
|
|
engineCode, engineMsg), nil
|
|
}
|
|
|
|
if outcome, ok := service.validateRoster(ctx, opStartedAt, input, record, state, mappings); !ok {
|
|
return outcome, nil
|
|
}
|
|
|
|
if state.Finished {
|
|
return service.completeFinished(ctx, opStartedAt, input, record, state, mappings), nil
|
|
}
|
|
return service.completeRunning(ctx, opStartedAt, input, record, state, mappings), nil
|
|
}
|
|
|
|
// loadRecord reads the runtime record and maps store errors to
|
|
// orchestrator outcomes. ok=false means the flow stops with the
|
|
// returned Result.
|
|
func (service *Service) loadRecord(ctx context.Context, opStartedAt time.Time, input Input) (runtime.RuntimeRecord, Result, bool) {
|
|
record, err := service.runtimeRecords.Get(ctx, input.GameID)
|
|
switch {
|
|
case err == nil:
|
|
return record, Result{}, true
|
|
case errors.Is(err, runtime.ErrNotFound):
|
|
return runtime.RuntimeRecord{}, service.recordEarlyFailure(ctx, opStartedAt, input,
|
|
ErrorCodeRuntimeNotFound, "runtime record does not exist"), false
|
|
default:
|
|
return runtime.RuntimeRecord{}, service.recordEarlyFailure(ctx, opStartedAt, input,
|
|
ErrorCodeServiceUnavailable, fmt.Sprintf("get runtime record: %s", err.Error())), false
|
|
}
|
|
}
|
|
|
|
// casToInProgress flips the runtime row from `running` to
|
|
// `generation_in_progress`. ok=false means the flow stops with the
|
|
// returned Result; the caller has not touched the engine yet.
|
|
func (service *Service) casToInProgress(ctx context.Context, opStartedAt time.Time, input Input) (Result, bool) {
|
|
err := service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
|
|
GameID: input.GameID,
|
|
ExpectedFrom: runtime.StatusRunning,
|
|
To: runtime.StatusGenerationInProgress,
|
|
Now: opStartedAt,
|
|
})
|
|
switch {
|
|
case err == nil:
|
|
return Result{}, true
|
|
case errors.Is(err, runtime.ErrConflict):
|
|
return service.recordEarlyFailure(ctx, opStartedAt, input,
|
|
ErrorCodeConflict,
|
|
fmt.Sprintf("cas runtime status to generation_in_progress: %s", err.Error())), false
|
|
case errors.Is(err, runtime.ErrNotFound):
|
|
return service.recordEarlyFailure(ctx, opStartedAt, input,
|
|
ErrorCodeRuntimeNotFound,
|
|
fmt.Sprintf("cas runtime status to generation_in_progress: %s", err.Error())), false
|
|
default:
|
|
return service.recordEarlyFailure(ctx, opStartedAt, input,
|
|
ErrorCodeServiceUnavailable,
|
|
fmt.Sprintf("cas runtime status to generation_in_progress: %s", err.Error())), false
|
|
}
|
|
}
|
|
|
|
// callEngineTurn dispatches the engine /admin/turn call and classifies
|
|
// the outcome. engineOK=true means the response is well-formed at the
|
|
// transport level; engineOK=false populates errorCode / errorMessage
|
|
// with a stable failure shape.
|
|
func (service *Service) callEngineTurn(ctx context.Context, record runtime.RuntimeRecord) (state ports.StateResponse, engineOK bool, errorCode string, errorMessage string) {
|
|
state, err := service.engine.Turn(ctx, record.EngineEndpoint)
|
|
if err == nil {
|
|
return state, true, "", ""
|
|
}
|
|
return ports.StateResponse{}, false, classifyEngineError(err), fmt.Sprintf("engine turn: %s", err.Error())
|
|
}
|
|
|
|
// classifyEngineError maps the engine port sentinels to the
|
|
// turn-generation stable error codes.
|
|
func classifyEngineError(err error) string {
|
|
switch {
|
|
case errors.Is(err, ports.ErrEngineValidation):
|
|
return ErrorCodeEngineValidationError
|
|
case errors.Is(err, ports.ErrEngineProtocolViolation):
|
|
return ErrorCodeEngineProtocolViolation
|
|
case errors.Is(err, ports.ErrEngineUnreachable):
|
|
return ErrorCodeEngineUnreachable
|
|
default:
|
|
return ErrorCodeEngineUnreachable
|
|
}
|
|
}
|
|
|
|
// validateRoster checks that the engine response carries exactly the
|
|
// race set installed at register-runtime. ok=false means the flow stops
|
|
// (and the runtime row is moved to `generation_failed`).
|
|
func (service *Service) validateRoster(ctx context.Context, opStartedAt time.Time, input Input, record runtime.RuntimeRecord, state ports.StateResponse, mappings []playermapping.PlayerMapping) (Result, bool) {
|
|
if len(state.Players) != len(mappings) {
|
|
message := fmt.Sprintf("engine player count %d does not match roster size %d",
|
|
len(state.Players), len(mappings))
|
|
return service.failGeneration(ctx, opStartedAt, input, record,
|
|
ErrorCodeEngineProtocolViolation, message), false
|
|
}
|
|
expected := make(map[string]struct{}, len(mappings))
|
|
for _, mapping := range mappings {
|
|
expected[mapping.RaceName] = struct{}{}
|
|
}
|
|
for _, player := range state.Players {
|
|
if _, ok := expected[player.RaceName]; !ok {
|
|
message := fmt.Sprintf("engine returned race %q not present in roster", player.RaceName)
|
|
return service.failGeneration(ctx, opStartedAt, input, record,
|
|
ErrorCodeEngineProtocolViolation, message), false
|
|
}
|
|
}
|
|
return Result{}, true
|
|
}
|
|
|
|
// completeFinished handles the `finished=true` branch: CAS to finished,
|
|
// clear scheduling, publish game_finished, publish game.finished
|
|
// notification, audit success.
|
|
func (service *Service) completeFinished(ctx context.Context, opStartedAt time.Time, input Input, record runtime.RuntimeRecord, state ports.StateResponse, mappings []playermapping.PlayerMapping) Result {
|
|
finishedAt := service.clock().UTC()
|
|
|
|
err := service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
|
|
GameID: input.GameID,
|
|
ExpectedFrom: runtime.StatusGenerationInProgress,
|
|
To: runtime.StatusFinished,
|
|
Now: finishedAt,
|
|
})
|
|
if err != nil {
|
|
return service.handlePostEngineCASFailure(ctx, opStartedAt, input, record, err)
|
|
}
|
|
|
|
if err := service.runtimeRecords.UpdateScheduling(ctx, ports.UpdateSchedulingInput{
|
|
GameID: input.GameID,
|
|
NextGenerationAt: nil,
|
|
SkipNextTick: false,
|
|
CurrentTurn: state.Turn,
|
|
Now: finishedAt,
|
|
}); err != nil {
|
|
// The CAS to finished succeeded; the row is in the terminal
|
|
// state. Surface a service_unavailable to the caller but keep
|
|
// the audit and snapshot consistent.
|
|
return service.recordTerminalFailure(ctx, opStartedAt, input,
|
|
ErrorCodeServiceUnavailable,
|
|
fmt.Sprintf("update scheduling on finish: %s", err.Error()))
|
|
}
|
|
|
|
persisted, reloadErr := service.runtimeRecords.Get(ctx, input.GameID)
|
|
if reloadErr != nil {
|
|
return service.recordTerminalFailure(ctx, opStartedAt, input,
|
|
ErrorCodeServiceUnavailable,
|
|
fmt.Sprintf("reload runtime record: %s", reloadErr.Error()))
|
|
}
|
|
|
|
stats := projectPlayerStats(state, mappings)
|
|
|
|
finishedMsg := ports.GameFinished{
|
|
GameID: input.GameID,
|
|
FinalTurnNumber: state.Turn,
|
|
RuntimeStatus: runtime.StatusFinished,
|
|
PlayerTurnStats: stats,
|
|
FinishedAt: finishedAt,
|
|
}
|
|
if err := service.lobbyEvents.PublishGameFinished(ctx, finishedMsg); err != nil {
|
|
service.logger.ErrorContext(ctx, "publish game finished",
|
|
"game_id", input.GameID,
|
|
"err", err.Error(),
|
|
)
|
|
} else {
|
|
service.telemetry.RecordLobbyEventPublished(ctx, "game_finished")
|
|
}
|
|
|
|
gameName := service.resolveGameName(ctx, input.GameID)
|
|
recipients := recipientUserIDs(mappings)
|
|
service.publishGameFinishedIntent(ctx, input, gameName, state.Turn, recipients, finishedAt)
|
|
|
|
service.appendSuccessLog(ctx, opStartedAt, input)
|
|
service.telemetry.RecordTurnGenerationOutcome(ctx,
|
|
string(operation.OutcomeSuccess), "", string(input.Trigger))
|
|
|
|
logArgs := []any{
|
|
"game_id", input.GameID,
|
|
"trigger", string(input.Trigger),
|
|
"final_turn", state.Turn,
|
|
"finished", true,
|
|
}
|
|
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
|
service.logger.InfoContext(ctx, "turn generation finished game", logArgs...)
|
|
|
|
return Result{
|
|
Record: persisted,
|
|
Trigger: input.Trigger,
|
|
Finished: true,
|
|
Outcome: operation.OutcomeSuccess,
|
|
}
|
|
}
|
|
|
|
// completeRunning handles the `finished=false` branch: recompute next
|
|
// tick, CAS back to running, publish snapshot, publish
|
|
// game.turn.ready notification, audit success.
|
|
func (service *Service) completeRunning(ctx context.Context, opStartedAt time.Time, input Input, record runtime.RuntimeRecord, state ports.StateResponse, mappings []playermapping.PlayerMapping) Result {
|
|
completedAt := service.clock().UTC()
|
|
|
|
next, _, err := service.scheduler.ComputeNext(record.TurnSchedule, completedAt, record.SkipNextTick)
|
|
if err != nil {
|
|
return service.failGeneration(ctx, opStartedAt, input, record,
|
|
ErrorCodeInvalidRequest,
|
|
fmt.Sprintf("recompute next tick: %s", err.Error()))
|
|
}
|
|
|
|
if err := service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
|
|
GameID: input.GameID,
|
|
ExpectedFrom: runtime.StatusGenerationInProgress,
|
|
To: runtime.StatusRunning,
|
|
Now: completedAt,
|
|
}); err != nil {
|
|
return service.handlePostEngineCASFailure(ctx, opStartedAt, input, record, err)
|
|
}
|
|
|
|
if err := service.runtimeRecords.UpdateScheduling(ctx, ports.UpdateSchedulingInput{
|
|
GameID: input.GameID,
|
|
NextGenerationAt: &next,
|
|
SkipNextTick: false,
|
|
CurrentTurn: state.Turn,
|
|
Now: completedAt,
|
|
}); err != nil {
|
|
return service.recordTerminalFailure(ctx, opStartedAt, input,
|
|
ErrorCodeServiceUnavailable,
|
|
fmt.Sprintf("update scheduling on running: %s", err.Error()))
|
|
}
|
|
|
|
persisted, reloadErr := service.runtimeRecords.Get(ctx, input.GameID)
|
|
if reloadErr != nil {
|
|
return service.recordTerminalFailure(ctx, opStartedAt, input,
|
|
ErrorCodeServiceUnavailable,
|
|
fmt.Sprintf("reload runtime record: %s", reloadErr.Error()))
|
|
}
|
|
|
|
stats := projectPlayerStats(state, mappings)
|
|
|
|
snapshot := ports.RuntimeSnapshotUpdate{
|
|
GameID: input.GameID,
|
|
CurrentTurn: state.Turn,
|
|
RuntimeStatus: runtime.StatusRunning,
|
|
EngineHealthSummary: persisted.EngineHealth,
|
|
PlayerTurnStats: stats,
|
|
OccurredAt: completedAt,
|
|
}
|
|
if err := service.lobbyEvents.PublishSnapshotUpdate(ctx, snapshot); err != nil {
|
|
service.logger.ErrorContext(ctx, "publish runtime snapshot update",
|
|
"game_id", input.GameID,
|
|
"err", err.Error(),
|
|
)
|
|
} else {
|
|
service.telemetry.RecordLobbyEventPublished(ctx, "runtime_snapshot_update")
|
|
}
|
|
|
|
gameName := service.resolveGameName(ctx, input.GameID)
|
|
recipients := recipientUserIDs(mappings)
|
|
service.publishGameTurnReadyIntent(ctx, input, gameName, state.Turn, recipients, completedAt)
|
|
|
|
service.appendSuccessLog(ctx, opStartedAt, input)
|
|
service.telemetry.RecordTurnGenerationOutcome(ctx,
|
|
string(operation.OutcomeSuccess), "", string(input.Trigger))
|
|
|
|
logArgs := []any{
|
|
"game_id", input.GameID,
|
|
"trigger", string(input.Trigger),
|
|
"current_turn", state.Turn,
|
|
"next_generation_at", next.Format(time.RFC3339Nano),
|
|
}
|
|
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
|
service.logger.InfoContext(ctx, "turn generation succeeded", logArgs...)
|
|
|
|
return Result{
|
|
Record: persisted,
|
|
Trigger: input.Trigger,
|
|
Outcome: operation.OutcomeSuccess,
|
|
}
|
|
}
|
|
|
|
// failGeneration handles every post-CAS failure path: CAS to
|
|
// generation_failed, publish snapshot, publish game.generation_failed
|
|
// admin notification, audit failure.
|
|
func (service *Service) failGeneration(ctx context.Context, opStartedAt time.Time, input Input, _ runtime.RuntimeRecord, errorCode string, errorMessage string) Result {
|
|
failedAt := service.clock().UTC()
|
|
|
|
casErr := service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
|
|
GameID: input.GameID,
|
|
ExpectedFrom: runtime.StatusGenerationInProgress,
|
|
To: runtime.StatusGenerationFailed,
|
|
Now: failedAt,
|
|
})
|
|
if casErr != nil && !errors.Is(casErr, runtime.ErrConflict) {
|
|
// Best-effort transition. The original error code remains the
|
|
// caller-visible one; log the secondary failure.
|
|
service.logger.ErrorContext(ctx, "cas runtime status to generation_failed",
|
|
"game_id", input.GameID,
|
|
"err", casErr.Error(),
|
|
)
|
|
}
|
|
|
|
persisted, reloadErr := service.runtimeRecords.Get(ctx, input.GameID)
|
|
publishedStatus := runtime.StatusGenerationFailed
|
|
if reloadErr == nil {
|
|
publishedStatus = persisted.Status
|
|
}
|
|
|
|
snapshot := ports.RuntimeSnapshotUpdate{
|
|
GameID: input.GameID,
|
|
CurrentTurn: persistedTurn(persisted, reloadErr),
|
|
RuntimeStatus: publishedStatus,
|
|
EngineHealthSummary: persistedHealth(persisted, reloadErr),
|
|
PlayerTurnStats: nil,
|
|
OccurredAt: failedAt,
|
|
}
|
|
if err := service.lobbyEvents.PublishSnapshotUpdate(ctx, snapshot); err != nil {
|
|
service.logger.ErrorContext(ctx, "publish runtime snapshot update on failure",
|
|
"game_id", input.GameID,
|
|
"err", err.Error(),
|
|
)
|
|
} else {
|
|
service.telemetry.RecordLobbyEventPublished(ctx, "runtime_snapshot_update")
|
|
}
|
|
|
|
gameName := service.resolveGameName(ctx, input.GameID)
|
|
service.publishGameGenerationFailedIntent(ctx, input, gameName, errorCode, errorMessage, failedAt)
|
|
|
|
service.appendFailureLog(ctx, opStartedAt, input, errorCode, errorMessage)
|
|
service.telemetry.RecordTurnGenerationOutcome(ctx,
|
|
string(operation.OutcomeFailure), errorCode, string(input.Trigger))
|
|
|
|
logArgs := []any{
|
|
"game_id", input.GameID,
|
|
"trigger", string(input.Trigger),
|
|
"error_code", errorCode,
|
|
"error_message", errorMessage,
|
|
}
|
|
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
|
service.logger.WarnContext(ctx, "turn generation failed", logArgs...)
|
|
|
|
return Result{
|
|
Record: persisted,
|
|
Trigger: input.Trigger,
|
|
Outcome: operation.OutcomeFailure,
|
|
ErrorCode: errorCode,
|
|
ErrorMessage: errorMessage,
|
|
}
|
|
}
|
|
|
|
// handlePostEngineCASFailure maps a CAS error that surfaced after the
|
|
// engine call already succeeded. Conflict means an external actor (e.g.
|
|
// admin stop) won the race; other errors are treated as
|
|
// service_unavailable. No publication is issued — the external mutation
|
|
// owns its own snapshot.
|
|
func (service *Service) handlePostEngineCASFailure(ctx context.Context, opStartedAt time.Time, input Input, _ runtime.RuntimeRecord, casErr error) Result {
|
|
switch {
|
|
case errors.Is(casErr, runtime.ErrConflict):
|
|
return service.recordTerminalFailure(ctx, opStartedAt, input,
|
|
ErrorCodeConflict,
|
|
fmt.Sprintf("cas runtime status post-engine: %s", casErr.Error()))
|
|
case errors.Is(casErr, runtime.ErrNotFound):
|
|
return service.recordTerminalFailure(ctx, opStartedAt, input,
|
|
ErrorCodeRuntimeNotFound,
|
|
fmt.Sprintf("cas runtime status post-engine: %s", casErr.Error()))
|
|
default:
|
|
return service.recordTerminalFailure(ctx, opStartedAt, input,
|
|
ErrorCodeServiceUnavailable,
|
|
fmt.Sprintf("cas runtime status post-engine: %s", casErr.Error()))
|
|
}
|
|
}
|
|
|
|
// recordEarlyFailure handles failures that occur before the runtime row
|
|
// is in `generation_in_progress`. No status mutation, no publication;
|
|
// only audit and telemetry.
|
|
func (service *Service) recordEarlyFailure(ctx context.Context, opStartedAt time.Time, input Input, errorCode string, errorMessage string) Result {
|
|
service.appendFailureLog(ctx, opStartedAt, input, errorCode, errorMessage)
|
|
service.telemetry.RecordTurnGenerationOutcome(ctx,
|
|
string(operation.OutcomeFailure), errorCode, string(input.Trigger))
|
|
logArgs := []any{
|
|
"game_id", input.GameID,
|
|
"trigger", string(input.Trigger),
|
|
"error_code", errorCode,
|
|
"error_message", errorMessage,
|
|
}
|
|
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
|
service.logger.WarnContext(ctx, "turn generation rejected", logArgs...)
|
|
return Result{
|
|
Trigger: input.Trigger,
|
|
Outcome: operation.OutcomeFailure,
|
|
ErrorCode: errorCode,
|
|
ErrorMessage: errorMessage,
|
|
}
|
|
}
|
|
|
|
// recordTerminalFailure handles failures after a post-engine CAS or a
|
|
// reload failed. The runtime row is in an undetermined state owned by
|
|
// whatever mutation won; we record the audit and surface the failure
|
|
// without further publication.
|
|
func (service *Service) recordTerminalFailure(ctx context.Context, opStartedAt time.Time, input Input, errorCode string, errorMessage string) Result {
|
|
service.appendFailureLog(ctx, opStartedAt, input, errorCode, errorMessage)
|
|
service.telemetry.RecordTurnGenerationOutcome(ctx,
|
|
string(operation.OutcomeFailure), errorCode, string(input.Trigger))
|
|
logArgs := []any{
|
|
"game_id", input.GameID,
|
|
"trigger", string(input.Trigger),
|
|
"error_code", errorCode,
|
|
"error_message", errorMessage,
|
|
}
|
|
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
|
service.logger.WarnContext(ctx, "turn generation post-engine failure", logArgs...)
|
|
return Result{
|
|
Trigger: input.Trigger,
|
|
Outcome: operation.OutcomeFailure,
|
|
ErrorCode: errorCode,
|
|
ErrorMessage: errorMessage,
|
|
}
|
|
}
|
|
|
|
// resolveGameName fetches the human-readable game name from Lobby and
|
|
// falls back to the platform game id on any error per Stage 15 D1.
|
|
func (service *Service) resolveGameName(ctx context.Context, gameID string) string {
|
|
summary, err := service.lobby.GetGameSummary(ctx, gameID)
|
|
if err != nil {
|
|
logArgs := []any{
|
|
"game_id", gameID,
|
|
"error_code", "lobby_unavailable",
|
|
"err", err.Error(),
|
|
}
|
|
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
|
service.logger.WarnContext(ctx, "resolve game name fell back to game id", logArgs...)
|
|
return gameID
|
|
}
|
|
if strings.TrimSpace(summary.GameName) == "" {
|
|
return gameID
|
|
}
|
|
return summary.GameName
|
|
}
|
|
|
|
// publishGameTurnReadyIntent publishes the user-targeted notification
|
|
// that announces a freshly generated turn. Empty recipient sets are
|
|
// dropped silently — the validator inside notificationintent rejects
|
|
// them outright, but the orchestrator should not break commit.
|
|
func (service *Service) publishGameTurnReadyIntent(ctx context.Context, input Input, gameName string, turnNumber int, recipients []string, occurredAt time.Time) {
|
|
if len(recipients) == 0 {
|
|
service.logger.WarnContext(ctx, "skip game.turn.ready notification: empty recipient set",
|
|
"game_id", input.GameID,
|
|
)
|
|
return
|
|
}
|
|
intent, err := notificationintent.NewGameTurnReadyIntent(
|
|
notificationintent.Metadata{
|
|
IdempotencyKey: fmt.Sprintf("game.turn.ready:%s:%d", input.GameID, turnNumber),
|
|
OccurredAt: occurredAt,
|
|
RequestID: logging.RequestIDFromContext(ctx),
|
|
},
|
|
recipients,
|
|
notificationintent.GameTurnReadyPayload{
|
|
GameID: input.GameID,
|
|
GameName: gameName,
|
|
TurnNumber: int64(turnNumber),
|
|
},
|
|
)
|
|
if err != nil {
|
|
service.logger.ErrorContext(ctx, "build game.turn.ready intent",
|
|
"game_id", input.GameID,
|
|
"err", err.Error(),
|
|
)
|
|
service.telemetry.RecordNotificationPublishAttempt(ctx,
|
|
string(notificationintent.NotificationTypeGameTurnReady), "error")
|
|
return
|
|
}
|
|
if err := service.notifications.Publish(ctx, intent); err != nil {
|
|
service.logger.ErrorContext(ctx, "publish game.turn.ready intent",
|
|
"game_id", input.GameID,
|
|
"err", err.Error(),
|
|
)
|
|
service.telemetry.RecordNotificationPublishAttempt(ctx,
|
|
string(notificationintent.NotificationTypeGameTurnReady), "error")
|
|
return
|
|
}
|
|
service.telemetry.RecordNotificationPublishAttempt(ctx,
|
|
string(notificationintent.NotificationTypeGameTurnReady), "ok")
|
|
}
|
|
|
|
// publishGameFinishedIntent publishes the user-targeted notification
|
|
// that announces a finished game.
|
|
func (service *Service) publishGameFinishedIntent(ctx context.Context, input Input, gameName string, finalTurnNumber int, recipients []string, occurredAt time.Time) {
|
|
if len(recipients) == 0 {
|
|
service.logger.WarnContext(ctx, "skip game.finished notification: empty recipient set",
|
|
"game_id", input.GameID,
|
|
)
|
|
return
|
|
}
|
|
intent, err := notificationintent.NewGameFinishedIntent(
|
|
notificationintent.Metadata{
|
|
IdempotencyKey: fmt.Sprintf("game.finished:%s:%d", input.GameID, finalTurnNumber),
|
|
OccurredAt: occurredAt,
|
|
RequestID: logging.RequestIDFromContext(ctx),
|
|
},
|
|
recipients,
|
|
notificationintent.GameFinishedPayload{
|
|
GameID: input.GameID,
|
|
GameName: gameName,
|
|
FinalTurnNumber: int64(finalTurnNumber),
|
|
},
|
|
)
|
|
if err != nil {
|
|
service.logger.ErrorContext(ctx, "build game.finished intent",
|
|
"game_id", input.GameID,
|
|
"err", err.Error(),
|
|
)
|
|
service.telemetry.RecordNotificationPublishAttempt(ctx,
|
|
string(notificationintent.NotificationTypeGameFinished), "error")
|
|
return
|
|
}
|
|
if err := service.notifications.Publish(ctx, intent); err != nil {
|
|
service.logger.ErrorContext(ctx, "publish game.finished intent",
|
|
"game_id", input.GameID,
|
|
"err", err.Error(),
|
|
)
|
|
service.telemetry.RecordNotificationPublishAttempt(ctx,
|
|
string(notificationintent.NotificationTypeGameFinished), "error")
|
|
return
|
|
}
|
|
service.telemetry.RecordNotificationPublishAttempt(ctx,
|
|
string(notificationintent.NotificationTypeGameFinished), "ok")
|
|
}
|
|
|
|
// publishGameGenerationFailedIntent publishes the admin-email
|
|
// notification that announces a failed turn generation.
|
|
func (service *Service) publishGameGenerationFailedIntent(ctx context.Context, input Input, gameName string, errorCode string, errorMessage string, occurredAt time.Time) {
|
|
failureReason := errorCode
|
|
if strings.TrimSpace(errorMessage) != "" {
|
|
failureReason = fmt.Sprintf("%s: %s", errorCode, errorMessage)
|
|
}
|
|
intent, err := notificationintent.NewGameGenerationFailedIntent(
|
|
notificationintent.Metadata{
|
|
IdempotencyKey: fmt.Sprintf("game.generation_failed:%s:%d",
|
|
input.GameID, occurredAt.UnixMilli()),
|
|
OccurredAt: occurredAt,
|
|
RequestID: logging.RequestIDFromContext(ctx),
|
|
},
|
|
notificationintent.GameGenerationFailedPayload{
|
|
GameID: input.GameID,
|
|
GameName: gameName,
|
|
FailureReason: failureReason,
|
|
},
|
|
)
|
|
if err != nil {
|
|
service.logger.ErrorContext(ctx, "build game.generation_failed intent",
|
|
"game_id", input.GameID,
|
|
"err", err.Error(),
|
|
)
|
|
service.telemetry.RecordNotificationPublishAttempt(ctx,
|
|
string(notificationintent.NotificationTypeGameGenerationFailed), "error")
|
|
return
|
|
}
|
|
if err := service.notifications.Publish(ctx, intent); err != nil {
|
|
service.logger.ErrorContext(ctx, "publish game.generation_failed intent",
|
|
"game_id", input.GameID,
|
|
"err", err.Error(),
|
|
)
|
|
service.telemetry.RecordNotificationPublishAttempt(ctx,
|
|
string(notificationintent.NotificationTypeGameGenerationFailed), "error")
|
|
return
|
|
}
|
|
service.telemetry.RecordNotificationPublishAttempt(ctx,
|
|
string(notificationintent.NotificationTypeGameGenerationFailed), "ok")
|
|
}
|
|
|
|
// projectPlayerStats joins the engine response on RaceName against the
|
|
// installed roster to build one PlayerTurnStats per active member.
|
|
// Result is sorted by UserID for a deterministic wire order.
|
|
func projectPlayerStats(state ports.StateResponse, mappings []playermapping.PlayerMapping) []ports.PlayerTurnStats {
|
|
if len(state.Players) == 0 || len(mappings) == 0 {
|
|
return nil
|
|
}
|
|
userByRace := make(map[string]string, len(mappings))
|
|
for _, mapping := range mappings {
|
|
userByRace[mapping.RaceName] = mapping.UserID
|
|
}
|
|
stats := make([]ports.PlayerTurnStats, 0, len(state.Players))
|
|
for _, player := range state.Players {
|
|
userID, ok := userByRace[player.RaceName]
|
|
if !ok {
|
|
continue
|
|
}
|
|
stats = append(stats, ports.PlayerTurnStats{
|
|
UserID: userID,
|
|
Planets: player.Planets,
|
|
Population: player.Population,
|
|
})
|
|
}
|
|
sort.Slice(stats, func(i, j int) bool { return stats[i].UserID < stats[j].UserID })
|
|
return stats
|
|
}
|
|
|
|
// recipientUserIDs returns the deduplicated, sorted-ascending list of
|
|
// platform user ids derived from the roster. Mirrors the
|
|
// notificationintent validator's expectations.
|
|
func recipientUserIDs(mappings []playermapping.PlayerMapping) []string {
|
|
if len(mappings) == 0 {
|
|
return nil
|
|
}
|
|
seen := make(map[string]struct{}, len(mappings))
|
|
result := make([]string, 0, len(mappings))
|
|
for _, mapping := range mappings {
|
|
userID := strings.TrimSpace(mapping.UserID)
|
|
if userID == "" {
|
|
continue
|
|
}
|
|
if _, ok := seen[userID]; ok {
|
|
continue
|
|
}
|
|
seen[userID] = struct{}{}
|
|
result = append(result, userID)
|
|
}
|
|
sort.Strings(result)
|
|
return result
|
|
}
|
|
|
|
// persistedTurn returns the stored CurrentTurn when reloadErr is nil,
|
|
// or zero otherwise. Used to populate the failure-side snapshot
|
|
// without making a second DB read.
|
|
func persistedTurn(record runtime.RuntimeRecord, reloadErr error) int {
|
|
if reloadErr != nil {
|
|
return 0
|
|
}
|
|
return record.CurrentTurn
|
|
}
|
|
|
|
// persistedHealth returns the stored EngineHealth when reloadErr is
|
|
// nil, or empty string otherwise.
|
|
func persistedHealth(record runtime.RuntimeRecord, reloadErr error) string {
|
|
if reloadErr != nil {
|
|
return ""
|
|
}
|
|
return record.EngineHealth
|
|
}
|
|
|
|
// appendSuccessLog records the success operation_log entry.
|
|
func (service *Service) appendSuccessLog(ctx context.Context, opStartedAt time.Time, input Input) {
|
|
finishedAt := service.clock().UTC()
|
|
service.bestEffortAppend(ctx, operation.OperationEntry{
|
|
GameID: input.GameID,
|
|
OpKind: operation.OpKindTurnGeneration,
|
|
OpSource: fallbackOpSource(input.OpSource),
|
|
SourceRef: input.SourceRef,
|
|
Outcome: operation.OutcomeSuccess,
|
|
StartedAt: opStartedAt,
|
|
FinishedAt: &finishedAt,
|
|
})
|
|
}
|
|
|
|
// appendFailureLog records the failure operation_log entry.
|
|
func (service *Service) appendFailureLog(ctx context.Context, opStartedAt time.Time, input Input, errorCode string, errorMessage string) {
|
|
finishedAt := service.clock().UTC()
|
|
service.bestEffortAppend(ctx, operation.OperationEntry{
|
|
GameID: input.GameID,
|
|
OpKind: operation.OpKindTurnGeneration,
|
|
OpSource: fallbackOpSource(input.OpSource),
|
|
SourceRef: input.SourceRef,
|
|
Outcome: operation.OutcomeFailure,
|
|
ErrorCode: errorCode,
|
|
ErrorMessage: errorMessage,
|
|
StartedAt: opStartedAt,
|
|
FinishedAt: &finishedAt,
|
|
})
|
|
}
|
|
|
|
// bestEffortAppend writes one operation_log entry. A failure is logged
|
|
// and discarded; the runtime row is the source of truth.
|
|
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
|
|
if _, err := service.operationLogs.Append(ctx, entry); err != nil {
|
|
service.logger.ErrorContext(ctx, "append operation log",
|
|
"game_id", entry.GameID,
|
|
"op_kind", string(entry.OpKind),
|
|
"outcome", string(entry.Outcome),
|
|
"error_code", entry.ErrorCode,
|
|
"err", err.Error(),
|
|
)
|
|
}
|
|
}
|
|
|
|
// fallbackOpSource defaults to admin_rest when source is missing or
|
|
// unrecognised. Mirrors `gamemaster/README.md §Trusted Surfaces`.
|
|
func fallbackOpSource(source operation.OpSource) operation.OpSource {
|
|
if source.IsKnown() {
|
|
return source
|
|
}
|
|
return operation.OpSourceAdminRest
|
|
}
|