feat: gamemaster
This commit is contained in:
@@ -0,0 +1,726 @@
|
||||
// Package registerruntime implements the register-runtime service-layer
|
||||
// orchestrator owned by Game Master. The service is the single entry
|
||||
// point Game Lobby uses (after Runtime Manager has reported a successful
|
||||
// container start) to install a freshly-started game in Game Master.
|
||||
//
|
||||
// Lifecycle and failure-mode semantics follow `gamemaster/README.md
|
||||
// §Lifecycles → Register-runtime`. Design rationale is captured in
|
||||
// `gamemaster/docs/stage13-register-runtime.md`.
|
||||
package registerruntime
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"galaxy/gamemaster/internal/domain/engineversion"
|
||||
"galaxy/gamemaster/internal/domain/operation"
|
||||
"galaxy/gamemaster/internal/domain/playermapping"
|
||||
"galaxy/gamemaster/internal/domain/runtime"
|
||||
"galaxy/gamemaster/internal/domain/schedule"
|
||||
"galaxy/gamemaster/internal/logging"
|
||||
"galaxy/gamemaster/internal/ports"
|
||||
"galaxy/gamemaster/internal/telemetry"
|
||||
)
|
||||
|
||||
// Member is a single roster entry carried in Input.Members. Its shape
// mirrors `RegisterRuntimeMember` in
// `gamemaster/api/internal-openapi.yaml`.
type Member struct {
	// UserID is the platform user id of an active member of the game.
	UserID string

	// RaceName is the race name Game Lobby reserved for the member. It
	// is sent to the engine in the /admin/init roster and is the key
	// used to map the engine response back to UserID.
	RaceName string
}
|
||||
|
||||
// Input stores the per-call arguments for one register-runtime
|
||||
// operation. The shape mirrors `RegisterRuntimeRequest` plus the
|
||||
// audit-only OpSource / SourceRef pair.
|
||||
type Input struct {
|
||||
// GameID identifies the platform game whose runtime is being
|
||||
// registered.
|
||||
GameID string
|
||||
|
||||
// EngineEndpoint stores the engine container URL Game Master uses
|
||||
// for every subsequent call against the runtime
|
||||
// (`http://galaxy-game-{game_id}:8080`).
|
||||
EngineEndpoint string
|
||||
|
||||
// Members stores the per-active-member roster Game Lobby committed
|
||||
// when the platform game opened. Must be non-empty.
|
||||
Members []Member
|
||||
|
||||
// TargetEngineVersion stores the semver under which Runtime Manager
|
||||
// started the container. Resolved against the engine_versions
|
||||
// registry to recover the matching image_ref.
|
||||
TargetEngineVersion string
|
||||
|
||||
// TurnSchedule stores the five-field cron expression governing turn
|
||||
// generation, copied from the platform game record.
|
||||
TurnSchedule string
|
||||
|
||||
// OpSource classifies how the request entered Game Master. Required:
|
||||
// every operation_log entry carries an op_source.
|
||||
OpSource operation.OpSource
|
||||
|
||||
// SourceRef stores the optional opaque per-source reference (request
|
||||
// id, admin user id). Empty when the caller does not provide one.
|
||||
SourceRef string
|
||||
}
|
||||
|
||||
// Validate reports whether input carries the structural invariants the
|
||||
// service requires before any store is touched.
|
||||
func (input Input) Validate() error {
|
||||
if strings.TrimSpace(input.GameID) == "" {
|
||||
return fmt.Errorf("game id must not be empty")
|
||||
}
|
||||
if strings.TrimSpace(input.EngineEndpoint) == "" {
|
||||
return fmt.Errorf("engine endpoint must not be empty")
|
||||
}
|
||||
if len(input.Members) == 0 {
|
||||
return fmt.Errorf("members must not be empty")
|
||||
}
|
||||
for index, member := range input.Members {
|
||||
if strings.TrimSpace(member.UserID) == "" {
|
||||
return fmt.Errorf("members[%d]: user id must not be empty", index)
|
||||
}
|
||||
if strings.TrimSpace(member.RaceName) == "" {
|
||||
return fmt.Errorf("members[%d]: race name must not be empty", index)
|
||||
}
|
||||
}
|
||||
if strings.TrimSpace(input.TargetEngineVersion) == "" {
|
||||
return fmt.Errorf("target engine version must not be empty")
|
||||
}
|
||||
if strings.TrimSpace(input.TurnSchedule) == "" {
|
||||
return fmt.Errorf("turn schedule must not be empty")
|
||||
}
|
||||
if !input.OpSource.IsKnown() {
|
||||
return fmt.Errorf("op source %q is unsupported", input.OpSource)
|
||||
}
|
||||
if duplicate := firstDuplicateMember(input.Members); duplicate != "" {
|
||||
return fmt.Errorf("members carry duplicate entries for %q", duplicate)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// firstDuplicateMember returns the first user_id or race_name that
|
||||
// appears more than once in members. Empty when every entry is unique.
|
||||
func firstDuplicateMember(members []Member) string {
|
||||
seenUsers := make(map[string]struct{}, len(members))
|
||||
seenRaces := make(map[string]struct{}, len(members))
|
||||
for _, member := range members {
|
||||
if _, ok := seenUsers[member.UserID]; ok {
|
||||
return member.UserID
|
||||
}
|
||||
seenUsers[member.UserID] = struct{}{}
|
||||
if _, ok := seenRaces[member.RaceName]; ok {
|
||||
return member.RaceName
|
||||
}
|
||||
seenRaces[member.RaceName] = struct{}{}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// Result stores the deterministic outcome of one Handle call. Business
|
||||
// outcomes flow through Result; the Go-level error return is reserved
|
||||
// for non-business failures (nil context, nil receiver).
|
||||
type Result struct {
|
||||
// Record carries the runtime record installed by the operation.
|
||||
// Populated on success; zero on failure.
|
||||
Record runtime.RuntimeRecord
|
||||
|
||||
// Outcome reports whether the operation completed (success) or
|
||||
// produced a stable failure code.
|
||||
Outcome operation.Outcome
|
||||
|
||||
// ErrorCode stores the stable error code on failure. Empty on
|
||||
// success.
|
||||
ErrorCode string
|
||||
|
||||
// ErrorMessage stores the operator-readable detail on failure.
|
||||
// Empty on success.
|
||||
ErrorMessage string
|
||||
}
|
||||
|
||||
// IsSuccess reports whether the result represents a successful
|
||||
// operation.
|
||||
func (result Result) IsSuccess() bool {
|
||||
return result.Outcome == operation.OutcomeSuccess
|
||||
}
|
||||
|
||||
// Dependencies groups the collaborators required by Service.
|
||||
type Dependencies struct {
|
||||
// RuntimeRecords stores the runtime_records row installed by the
|
||||
// flow.
|
||||
RuntimeRecords ports.RuntimeRecordStore
|
||||
|
||||
// EngineVersions resolves `target_engine_version` to the matching
|
||||
// image_ref and validates the version exists.
|
||||
EngineVersions ports.EngineVersionStore
|
||||
|
||||
// PlayerMappings persists the (game_id, user_id) → race_name
|
||||
// projection derived from the engine /admin/init response.
|
||||
PlayerMappings ports.PlayerMappingStore
|
||||
|
||||
// OperationLogs records the audit entry for the operation.
|
||||
OperationLogs ports.OperationLogStore
|
||||
|
||||
// Engine drives the engine /admin/init call and decodes the
|
||||
// response.
|
||||
Engine ports.EngineClient
|
||||
|
||||
// LobbyEvents publishes the post-success runtime_snapshot_update
|
||||
// to `gm:lobby_events`.
|
||||
LobbyEvents ports.LobbyEventsPublisher
|
||||
|
||||
// Telemetry records register-runtime outcomes plus the snapshot
|
||||
// publication counter. Required.
|
||||
Telemetry *telemetry.Runtime
|
||||
|
||||
// Logger records structured service-level events. Defaults to
|
||||
// `slog.Default()` when nil.
|
||||
Logger *slog.Logger
|
||||
|
||||
// Clock supplies the wall-clock used for operation timestamps.
|
||||
// Defaults to `time.Now` when nil.
|
||||
Clock func() time.Time
|
||||
}
|
||||
|
||||
// Service executes the register-runtime lifecycle operation.
|
||||
type Service struct {
|
||||
runtimeRecords ports.RuntimeRecordStore
|
||||
engineVersions ports.EngineVersionStore
|
||||
playerMappings ports.PlayerMappingStore
|
||||
operationLogs ports.OperationLogStore
|
||||
engine ports.EngineClient
|
||||
lobbyEvents ports.LobbyEventsPublisher
|
||||
|
||||
telemetry *telemetry.Runtime
|
||||
logger *slog.Logger
|
||||
clock func() time.Time
|
||||
}
|
||||
|
||||
// NewService constructs one Service from deps.
|
||||
func NewService(deps Dependencies) (*Service, error) {
|
||||
switch {
|
||||
case deps.RuntimeRecords == nil:
|
||||
return nil, errors.New("new register runtime service: nil runtime records")
|
||||
case deps.EngineVersions == nil:
|
||||
return nil, errors.New("new register runtime service: nil engine versions")
|
||||
case deps.PlayerMappings == nil:
|
||||
return nil, errors.New("new register runtime service: nil player mappings")
|
||||
case deps.OperationLogs == nil:
|
||||
return nil, errors.New("new register runtime service: nil operation logs")
|
||||
case deps.Engine == nil:
|
||||
return nil, errors.New("new register runtime service: nil engine client")
|
||||
case deps.LobbyEvents == nil:
|
||||
return nil, errors.New("new register runtime service: nil lobby events publisher")
|
||||
case deps.Telemetry == nil:
|
||||
return nil, errors.New("new register runtime service: nil telemetry runtime")
|
||||
}
|
||||
|
||||
clock := deps.Clock
|
||||
if clock == nil {
|
||||
clock = time.Now
|
||||
}
|
||||
logger := deps.Logger
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
logger = logger.With("service", "gamemaster.registerruntime")
|
||||
|
||||
return &Service{
|
||||
runtimeRecords: deps.RuntimeRecords,
|
||||
engineVersions: deps.EngineVersions,
|
||||
playerMappings: deps.PlayerMappings,
|
||||
operationLogs: deps.OperationLogs,
|
||||
engine: deps.Engine,
|
||||
lobbyEvents: deps.LobbyEvents,
|
||||
telemetry: deps.Telemetry,
|
||||
logger: logger,
|
||||
clock: clock,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Handle executes one register-runtime operation end-to-end. The
|
||||
// Go-level error return is reserved for non-business failures (nil
|
||||
// context, nil receiver). Every business outcome flows through Result.
|
||||
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
|
||||
if service == nil {
|
||||
return Result{}, errors.New("register runtime: nil service")
|
||||
}
|
||||
if ctx == nil {
|
||||
return Result{}, errors.New("register runtime: nil context")
|
||||
}
|
||||
|
||||
opStartedAt := service.clock().UTC()
|
||||
|
||||
if err := input.Validate(); err != nil {
|
||||
return service.recordFailure(ctx, opStartedAt, input, false, false,
|
||||
ErrorCodeInvalidRequest, err.Error()), nil
|
||||
}
|
||||
|
||||
if outcome, ok := service.rejectExisting(ctx, opStartedAt, input); ok {
|
||||
return outcome, nil
|
||||
}
|
||||
|
||||
imageRef, outcome, ok := service.resolveImageRef(ctx, opStartedAt, input)
|
||||
if !ok {
|
||||
return outcome, nil
|
||||
}
|
||||
|
||||
record := service.buildStartingRecord(input, imageRef, opStartedAt)
|
||||
if err := service.runtimeRecords.Insert(ctx, record); err != nil {
|
||||
switch {
|
||||
case errors.Is(err, runtime.ErrConflict):
|
||||
return service.recordFailure(ctx, opStartedAt, input, false, false,
|
||||
ErrorCodeConflict, "runtime record already exists"), nil
|
||||
default:
|
||||
return service.recordFailure(ctx, opStartedAt, input, false, false,
|
||||
ErrorCodeServiceUnavailable, fmt.Sprintf("insert runtime record: %s", err.Error())), nil
|
||||
}
|
||||
}
|
||||
|
||||
engineState, outcome, ok := service.callEngineInit(ctx, opStartedAt, input)
|
||||
if !ok {
|
||||
return outcome, nil
|
||||
}
|
||||
|
||||
if outcome, ok := service.validateRoster(ctx, opStartedAt, input, engineState); !ok {
|
||||
return outcome, nil
|
||||
}
|
||||
|
||||
if outcome, ok := service.installPlayerMappings(ctx, opStartedAt, input, engineState); !ok {
|
||||
return outcome, nil
|
||||
}
|
||||
|
||||
nextGenerationAt, outcome, ok := service.computeNextGeneration(ctx, opStartedAt, input)
|
||||
if !ok {
|
||||
return outcome, nil
|
||||
}
|
||||
|
||||
if outcome, ok := service.casToRunning(ctx, opStartedAt, input); !ok {
|
||||
return outcome, nil
|
||||
}
|
||||
|
||||
if outcome, ok := service.persistInitialScheduling(ctx, opStartedAt, input, nextGenerationAt); !ok {
|
||||
return outcome, nil
|
||||
}
|
||||
|
||||
persisted, outcome, ok := service.reloadRecord(ctx, opStartedAt, input)
|
||||
if !ok {
|
||||
return outcome, nil
|
||||
}
|
||||
|
||||
stats := projectInitToStats(engineState, input.Members)
|
||||
|
||||
service.appendSuccessLog(ctx, opStartedAt, input)
|
||||
service.publishSnapshot(ctx, persisted, stats, opStartedAt)
|
||||
service.telemetry.RecordRegisterRuntimeOutcome(ctx, string(operation.OutcomeSuccess), "")
|
||||
|
||||
logArgs := []any{
|
||||
"game_id", input.GameID,
|
||||
"engine_version", input.TargetEngineVersion,
|
||||
"members", len(input.Members),
|
||||
"op_source", string(input.OpSource),
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
service.logger.InfoContext(ctx, "runtime registered", logArgs...)
|
||||
|
||||
return Result{
|
||||
Record: persisted,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// rejectExisting returns a Result and ok=true when the runtime record
|
||||
// already exists or the lookup itself failed; ok=false continues the
|
||||
// flow.
|
||||
func (service *Service) rejectExisting(ctx context.Context, opStartedAt time.Time, input Input) (Result, bool) {
|
||||
_, err := service.runtimeRecords.Get(ctx, input.GameID)
|
||||
switch {
|
||||
case errors.Is(err, runtime.ErrNotFound):
|
||||
return Result{}, false
|
||||
case err != nil:
|
||||
return service.recordFailure(ctx, opStartedAt, input, false, false,
|
||||
ErrorCodeServiceUnavailable, fmt.Sprintf("get runtime record: %s", err.Error())), true
|
||||
default:
|
||||
return service.recordFailure(ctx, opStartedAt, input, false, false,
|
||||
ErrorCodeConflict, "runtime record already exists"), true
|
||||
}
|
||||
}
|
||||
|
||||
// resolveImageRef resolves the target engine version against the
|
||||
// engine_versions registry. Returns ok=false on failure with the
|
||||
// matching Result.
|
||||
func (service *Service) resolveImageRef(ctx context.Context, opStartedAt time.Time, input Input) (string, Result, bool) {
|
||||
version, err := service.engineVersions.Get(ctx, input.TargetEngineVersion)
|
||||
switch {
|
||||
case errors.Is(err, engineversion.ErrNotFound):
|
||||
return "", service.recordFailure(ctx, opStartedAt, input, false, false,
|
||||
ErrorCodeEngineVersionNotFound,
|
||||
fmt.Sprintf("engine version %q not found", input.TargetEngineVersion)), false
|
||||
case err != nil:
|
||||
return "", service.recordFailure(ctx, opStartedAt, input, false, false,
|
||||
ErrorCodeServiceUnavailable, fmt.Sprintf("get engine version: %s", err.Error())), false
|
||||
}
|
||||
return version.ImageRef, Result{}, true
|
||||
}
|
||||
|
||||
// buildStartingRecord assembles the initial runtime_records row,
|
||||
// matching `gamemaster/README.md §Lifecycles → Register-runtime` step 4.
|
||||
func (service *Service) buildStartingRecord(input Input, imageRef string, now time.Time) runtime.RuntimeRecord {
|
||||
return runtime.RuntimeRecord{
|
||||
GameID: input.GameID,
|
||||
Status: runtime.StatusStarting,
|
||||
EngineEndpoint: input.EngineEndpoint,
|
||||
CurrentImageRef: imageRef,
|
||||
CurrentEngineVersion: input.TargetEngineVersion,
|
||||
TurnSchedule: input.TurnSchedule,
|
||||
CurrentTurn: 0,
|
||||
NextGenerationAt: nil,
|
||||
SkipNextTick: false,
|
||||
EngineHealth: "",
|
||||
CreatedAt: now,
|
||||
UpdatedAt: now,
|
||||
}
|
||||
}
|
||||
|
||||
// callEngineInit dispatches the engine /admin/init call and maps the
|
||||
// transport-layer error to a stable Result code. ok=false means the
|
||||
// flow stops.
|
||||
func (service *Service) callEngineInit(ctx context.Context, opStartedAt time.Time, input Input) (ports.StateResponse, Result, bool) {
|
||||
races := make([]ports.InitRace, 0, len(input.Members))
|
||||
for _, member := range input.Members {
|
||||
races = append(races, ports.InitRace{RaceName: member.RaceName})
|
||||
}
|
||||
state, err := service.engine.Init(ctx, input.EngineEndpoint, ports.InitRequest{Races: races})
|
||||
if err == nil {
|
||||
return state, Result{}, true
|
||||
}
|
||||
|
||||
code := classifyEngineError(err)
|
||||
message := fmt.Sprintf("engine init: %s", err.Error())
|
||||
return ports.StateResponse{}, service.recordFailure(ctx, opStartedAt, input, true, false, code, message), false
|
||||
}
|
||||
|
||||
// classifyEngineError maps the engine port sentinels to the
|
||||
// register-runtime stable error codes per Stage 13 D1.
|
||||
func classifyEngineError(err error) string {
|
||||
switch {
|
||||
case errors.Is(err, ports.ErrEngineValidation):
|
||||
return ErrorCodeEngineValidationError
|
||||
case errors.Is(err, ports.ErrEngineProtocolViolation):
|
||||
return ErrorCodeEngineProtocolViolation
|
||||
case errors.Is(err, ports.ErrEngineUnreachable):
|
||||
return ErrorCodeEngineUnreachable
|
||||
default:
|
||||
return ErrorCodeEngineUnreachable
|
||||
}
|
||||
}
|
||||
|
||||
// validateRoster checks that the engine response carries exactly the
|
||||
// race set Game Master sent on /admin/init. ok=false means the flow
|
||||
// stops.
|
||||
func (service *Service) validateRoster(ctx context.Context, opStartedAt time.Time, input Input, state ports.StateResponse) (Result, bool) {
|
||||
if len(state.Players) != len(input.Members) {
|
||||
message := fmt.Sprintf("engine player count %d does not match roster size %d", len(state.Players), len(input.Members))
|
||||
return service.recordFailure(ctx, opStartedAt, input, true, false,
|
||||
ErrorCodeEngineProtocolViolation, message), false
|
||||
}
|
||||
expected := make(map[string]struct{}, len(input.Members))
|
||||
for _, member := range input.Members {
|
||||
expected[member.RaceName] = struct{}{}
|
||||
}
|
||||
for _, player := range state.Players {
|
||||
if _, ok := expected[player.RaceName]; !ok {
|
||||
message := fmt.Sprintf("engine returned race %q not present in roster", player.RaceName)
|
||||
return service.recordFailure(ctx, opStartedAt, input, true, false,
|
||||
ErrorCodeEngineProtocolViolation, message), false
|
||||
}
|
||||
}
|
||||
return Result{}, true
|
||||
}
|
||||
|
||||
// installPlayerMappings projects the engine response onto
|
||||
// player_mappings rows and persists them in one batch. ok=false means
|
||||
// the flow stops (and rolls back both stores).
|
||||
func (service *Service) installPlayerMappings(ctx context.Context, opStartedAt time.Time, input Input, state ports.StateResponse) (Result, bool) {
|
||||
userByRace := make(map[string]string, len(input.Members))
|
||||
for _, member := range input.Members {
|
||||
userByRace[member.RaceName] = member.UserID
|
||||
}
|
||||
|
||||
mappings := make([]playermapping.PlayerMapping, 0, len(state.Players))
|
||||
for _, player := range state.Players {
|
||||
userID, ok := userByRace[player.RaceName]
|
||||
if !ok {
|
||||
message := fmt.Sprintf("engine returned race %q not present in roster", player.RaceName)
|
||||
return service.recordFailure(ctx, opStartedAt, input, true, false,
|
||||
ErrorCodeEngineProtocolViolation, message), false
|
||||
}
|
||||
mappings = append(mappings, playermapping.PlayerMapping{
|
||||
GameID: input.GameID,
|
||||
UserID: userID,
|
||||
RaceName: player.RaceName,
|
||||
EnginePlayerUUID: player.EnginePlayerUUID,
|
||||
CreatedAt: opStartedAt,
|
||||
})
|
||||
}
|
||||
|
||||
if err := service.playerMappings.BulkInsert(ctx, mappings); err != nil {
|
||||
// BulkInsert is per-statement atomic (stage 11 D7), so a failure
|
||||
// leaves no mappings to clean up — only the runtime row.
|
||||
switch {
|
||||
case errors.Is(err, playermapping.ErrConflict):
|
||||
return service.recordFailure(ctx, opStartedAt, input, true, false,
|
||||
ErrorCodeConflict, fmt.Sprintf("bulk insert player mappings: %s", err.Error())), false
|
||||
default:
|
||||
return service.recordFailure(ctx, opStartedAt, input, true, false,
|
||||
ErrorCodeServiceUnavailable, fmt.Sprintf("bulk insert player mappings: %s", err.Error())), false
|
||||
}
|
||||
}
|
||||
return Result{}, true
|
||||
}
|
||||
|
||||
// computeNextGeneration parses the cron schedule and computes the first
|
||||
// next-generation timestamp (no skip pending). ok=false means the flow
|
||||
// stops with rollback.
|
||||
func (service *Service) computeNextGeneration(ctx context.Context, opStartedAt time.Time, input Input) (time.Time, Result, bool) {
|
||||
sched, err := schedule.Parse(input.TurnSchedule)
|
||||
if err != nil {
|
||||
return time.Time{}, service.recordFailure(ctx, opStartedAt, input, true, true,
|
||||
ErrorCodeInvalidRequest, fmt.Sprintf("parse turn schedule: %s", err.Error())), false
|
||||
}
|
||||
next, _ := sched.Next(opStartedAt, false)
|
||||
return next.UTC(), Result{}, true
|
||||
}
|
||||
|
||||
// casToRunning flips the runtime record from `starting` to `running`.
|
||||
// On CAS failure or any storage error the flow rolls back both stores.
|
||||
func (service *Service) casToRunning(ctx context.Context, opStartedAt time.Time, input Input) (Result, bool) {
|
||||
err := service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
|
||||
GameID: input.GameID,
|
||||
ExpectedFrom: runtime.StatusStarting,
|
||||
To: runtime.StatusRunning,
|
||||
Now: opStartedAt,
|
||||
})
|
||||
switch {
|
||||
case err == nil:
|
||||
return Result{}, true
|
||||
case errors.Is(err, runtime.ErrConflict):
|
||||
return service.recordFailure(ctx, opStartedAt, input, true, true,
|
||||
ErrorCodeConflict, fmt.Sprintf("cas runtime status to running: %s", err.Error())), false
|
||||
default:
|
||||
return service.recordFailure(ctx, opStartedAt, input, true, true,
|
||||
ErrorCodeServiceUnavailable, fmt.Sprintf("cas runtime status to running: %s", err.Error())), false
|
||||
}
|
||||
}
|
||||
|
||||
// persistInitialScheduling writes the first `next_generation_at` and
|
||||
// the (already false) skip flag plus turn=0 on the runtime row.
|
||||
// Failure rolls back both stores.
|
||||
func (service *Service) persistInitialScheduling(ctx context.Context, opStartedAt time.Time, input Input, next time.Time) (Result, bool) {
|
||||
err := service.runtimeRecords.UpdateScheduling(ctx, ports.UpdateSchedulingInput{
|
||||
GameID: input.GameID,
|
||||
NextGenerationAt: &next,
|
||||
SkipNextTick: false,
|
||||
CurrentTurn: 0,
|
||||
Now: opStartedAt,
|
||||
})
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, opStartedAt, input, true, true,
|
||||
ErrorCodeServiceUnavailable, fmt.Sprintf("update initial scheduling: %s", err.Error())), false
|
||||
}
|
||||
return Result{}, true
|
||||
}
|
||||
|
||||
// reloadRecord re-reads the runtime row so the returned Result.Record
|
||||
// carries the post-CAS, post-scheduling timestamps the adapters set.
|
||||
// On read failure the flow rolls back both stores.
|
||||
func (service *Service) reloadRecord(ctx context.Context, opStartedAt time.Time, input Input) (runtime.RuntimeRecord, Result, bool) {
|
||||
persisted, err := service.runtimeRecords.Get(ctx, input.GameID)
|
||||
if err != nil {
|
||||
return runtime.RuntimeRecord{}, service.recordFailure(ctx, opStartedAt, input, true, true,
|
||||
ErrorCodeServiceUnavailable, fmt.Sprintf("reload runtime record: %s", err.Error())), false
|
||||
}
|
||||
return persisted, Result{}, true
|
||||
}
|
||||
|
||||
// projectInitToStats joins the engine /admin/init response on RaceName
|
||||
// against the input roster to produce one PlayerTurnStats per active
|
||||
// member. The caller has already validated that every player race name
|
||||
// is present in the roster, so the lookup is total.
|
||||
func projectInitToStats(state ports.StateResponse, members []Member) []ports.PlayerTurnStats {
|
||||
if len(state.Players) == 0 {
|
||||
return nil
|
||||
}
|
||||
userByRace := make(map[string]string, len(members))
|
||||
for _, member := range members {
|
||||
userByRace[member.RaceName] = member.UserID
|
||||
}
|
||||
stats := make([]ports.PlayerTurnStats, 0, len(state.Players))
|
||||
for _, player := range state.Players {
|
||||
userID, ok := userByRace[player.RaceName]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
stats = append(stats, ports.PlayerTurnStats{
|
||||
UserID: userID,
|
||||
Planets: player.Planets,
|
||||
Population: player.Population,
|
||||
})
|
||||
}
|
||||
sort.Slice(stats, func(i, j int) bool { return stats[i].UserID < stats[j].UserID })
|
||||
return stats
|
||||
}
|
||||
|
||||
// recordFailure assembles the failure Result, rolls back any installed
|
||||
// state, appends the operation_log failure entry, and emits telemetry.
|
||||
// runtimeInserted reports whether the runtime row was already
|
||||
// installed; playerMappingsInstalled reports whether the player_mappings
|
||||
// rows were installed too. The two booleans gate the rollback so a
|
||||
// race-induced ErrConflict from Insert does not delete a row owned by
|
||||
// another caller.
|
||||
func (service *Service) recordFailure(
|
||||
ctx context.Context,
|
||||
opStartedAt time.Time,
|
||||
input Input,
|
||||
runtimeInserted bool,
|
||||
playerMappingsInstalled bool,
|
||||
errorCode string,
|
||||
errorMessage string,
|
||||
) Result {
|
||||
if runtimeInserted {
|
||||
service.rollback(ctx, input.GameID, playerMappingsInstalled)
|
||||
}
|
||||
|
||||
finishedAt := service.clock().UTC()
|
||||
service.bestEffortAppend(ctx, operation.OperationEntry{
|
||||
GameID: input.GameID,
|
||||
OpKind: operation.OpKindRegisterRuntime,
|
||||
OpSource: fallbackOpSource(input.OpSource),
|
||||
SourceRef: input.SourceRef,
|
||||
Outcome: operation.OutcomeFailure,
|
||||
ErrorCode: errorCode,
|
||||
ErrorMessage: errorMessage,
|
||||
StartedAt: opStartedAt,
|
||||
FinishedAt: &finishedAt,
|
||||
})
|
||||
|
||||
service.telemetry.RecordRegisterRuntimeOutcome(ctx, string(operation.OutcomeFailure), errorCode)
|
||||
|
||||
logArgs := []any{
|
||||
"game_id", input.GameID,
|
||||
"engine_version", input.TargetEngineVersion,
|
||||
"op_source", string(input.OpSource),
|
||||
"error_code", errorCode,
|
||||
"error_message", errorMessage,
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
service.logger.WarnContext(ctx, "register runtime failed", logArgs...)
|
||||
|
||||
return Result{
|
||||
Outcome: operation.OutcomeFailure,
|
||||
ErrorCode: errorCode,
|
||||
ErrorMessage: errorMessage,
|
||||
}
|
||||
}
|
||||
|
||||
// rollback removes any installed state. Both store calls are
|
||||
// idempotent; failures are logged but never overwrite the original
|
||||
// failure reason. A fresh background context is used so a cancelled
|
||||
// request context does not strand the row.
|
||||
func (service *Service) rollback(ctx context.Context, gameID string, playerMappingsInstalled bool) {
|
||||
cleanupCtx, cancel := context.WithTimeout(context.Background(), rollbackTimeout)
|
||||
defer cancel()
|
||||
if playerMappingsInstalled {
|
||||
if err := service.playerMappings.DeleteByGame(cleanupCtx, gameID); err != nil {
|
||||
service.logger.ErrorContext(ctx, "rollback player mappings",
|
||||
"game_id", gameID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
}
|
||||
}
|
||||
if err := service.runtimeRecords.Delete(cleanupCtx, gameID); err != nil {
|
||||
service.logger.ErrorContext(ctx, "rollback runtime record",
|
||||
"game_id", gameID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// rollbackTimeout is the deadline applied to the fresh background
// context that rollback creates. That single context is shared by the
// rollback storage calls, so the timeout bounds the rollback as a
// whole rather than each call separately. A background context is used
// so a canceled request context does not block the cleanup; the timeout
// matches the shape used by
// `rtmanager/internal/service/startruntime.Service.releaseLease`.
const rollbackTimeout time.Duration = 5 * time.Second
|
||||
|
||||
// appendSuccessLog records the success operation_log entry for the
|
||||
// completed register-runtime operation.
|
||||
func (service *Service) appendSuccessLog(ctx context.Context, opStartedAt time.Time, input Input) {
|
||||
finishedAt := service.clock().UTC()
|
||||
service.bestEffortAppend(ctx, operation.OperationEntry{
|
||||
GameID: input.GameID,
|
||||
OpKind: operation.OpKindRegisterRuntime,
|
||||
OpSource: fallbackOpSource(input.OpSource),
|
||||
SourceRef: input.SourceRef,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
StartedAt: opStartedAt,
|
||||
FinishedAt: &finishedAt,
|
||||
})
|
||||
}
|
||||
|
||||
// publishSnapshot publishes the post-success runtime_snapshot_update
|
||||
// per `gamemaster/README.md §Lifecycles → Register-runtime` step 9.
|
||||
// Failures are logged but do not roll back the just-installed runtime
|
||||
// record; the snapshot stream is best-effort by contract.
|
||||
func (service *Service) publishSnapshot(ctx context.Context, record runtime.RuntimeRecord, stats []ports.PlayerTurnStats, occurredAt time.Time) {
|
||||
msg := ports.RuntimeSnapshotUpdate{
|
||||
GameID: record.GameID,
|
||||
CurrentTurn: record.CurrentTurn,
|
||||
RuntimeStatus: record.Status,
|
||||
EngineHealthSummary: record.EngineHealth,
|
||||
PlayerTurnStats: stats,
|
||||
OccurredAt: occurredAt,
|
||||
}
|
||||
if err := service.lobbyEvents.PublishSnapshotUpdate(ctx, msg); err != nil {
|
||||
service.logger.ErrorContext(ctx, "publish runtime snapshot update",
|
||||
"game_id", record.GameID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
return
|
||||
}
|
||||
service.telemetry.RecordLobbyEventPublished(ctx, "runtime_snapshot_update")
|
||||
}
|
||||
|
||||
// bestEffortAppend writes one operation_log entry. A failure is logged
|
||||
// and discarded; the runtime record (or its absence after rollback) is
|
||||
// the source of truth.
|
||||
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
|
||||
if _, err := service.operationLogs.Append(ctx, entry); err != nil {
|
||||
service.logger.ErrorContext(ctx, "append operation log",
|
||||
"game_id", entry.GameID,
|
||||
"op_kind", string(entry.OpKind),
|
||||
"outcome", string(entry.Outcome),
|
||||
"error_code", entry.ErrorCode,
|
||||
"err", err.Error(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// fallbackOpSource defaults to `admin_rest` when the caller did not
|
||||
// supply a known op source. Mirrors the README §Trusted Surfaces rule
|
||||
// "when missing or unrecognised, GM defaults to `op_source=admin_rest`".
|
||||
func fallbackOpSource(source operation.OpSource) operation.OpSource {
|
||||
if source.IsKnown() {
|
||||
return source
|
||||
}
|
||||
return operation.OpSourceAdminRest
|
||||
}
|
||||
Reference in New Issue
Block a user