// Package registerruntime implements the register-runtime service-layer // orchestrator owned by Game Master. The service is the single entry // point Game Lobby uses (after Runtime Manager has reported a successful // container start) to install a freshly-started game in Game Master. // // Lifecycle and failure-mode semantics follow `gamemaster/README.md // §Lifecycles → Register-runtime`. Design rationale is captured in // `gamemaster/docs/stage13-register-runtime.md`. package registerruntime import ( "context" "errors" "fmt" "log/slog" "sort" "strings" "time" "galaxy/gamemaster/internal/domain/engineversion" "galaxy/gamemaster/internal/domain/operation" "galaxy/gamemaster/internal/domain/playermapping" "galaxy/gamemaster/internal/domain/runtime" "galaxy/gamemaster/internal/domain/schedule" "galaxy/gamemaster/internal/logging" "galaxy/gamemaster/internal/ports" "galaxy/gamemaster/internal/telemetry" ) // Member stores one entry of Input.Members. The shape mirrors // `RegisterRuntimeMember` in `gamemaster/api/internal-openapi.yaml`. type Member struct { // UserID identifies an active platform member of the game. UserID string // RaceName stores the race name reserved for the member by Game // Lobby. Used both to build the engine /admin/init roster and to // resolve the engine response back to user_id. RaceName string } // Input stores the per-call arguments for one register-runtime // operation. The shape mirrors `RegisterRuntimeRequest` plus the // audit-only OpSource / SourceRef pair. type Input struct { // GameID identifies the platform game whose runtime is being // registered. GameID string // EngineEndpoint stores the engine container URL Game Master uses // for every subsequent call against the runtime // (`http://galaxy-game-{game_id}:8080`). EngineEndpoint string // Members stores the per-active-member roster Game Lobby committed // when the platform game opened. Must be non-empty. 
Members []Member // TargetEngineVersion stores the semver under which Runtime Manager // started the container. Resolved against the engine_versions // registry to recover the matching image_ref. TargetEngineVersion string // TurnSchedule stores the five-field cron expression governing turn // generation, copied from the platform game record. TurnSchedule string // OpSource classifies how the request entered Game Master. Required: // every operation_log entry carries an op_source. OpSource operation.OpSource // SourceRef stores the optional opaque per-source reference (request // id, admin user id). Empty when the caller does not provide one. SourceRef string } // Validate reports whether input carries the structural invariants the // service requires before any store is touched. func (input Input) Validate() error { if strings.TrimSpace(input.GameID) == "" { return fmt.Errorf("game id must not be empty") } if strings.TrimSpace(input.EngineEndpoint) == "" { return fmt.Errorf("engine endpoint must not be empty") } if len(input.Members) == 0 { return fmt.Errorf("members must not be empty") } for index, member := range input.Members { if strings.TrimSpace(member.UserID) == "" { return fmt.Errorf("members[%d]: user id must not be empty", index) } if strings.TrimSpace(member.RaceName) == "" { return fmt.Errorf("members[%d]: race name must not be empty", index) } } if strings.TrimSpace(input.TargetEngineVersion) == "" { return fmt.Errorf("target engine version must not be empty") } if strings.TrimSpace(input.TurnSchedule) == "" { return fmt.Errorf("turn schedule must not be empty") } if !input.OpSource.IsKnown() { return fmt.Errorf("op source %q is unsupported", input.OpSource) } if duplicate := firstDuplicateMember(input.Members); duplicate != "" { return fmt.Errorf("members carry duplicate entries for %q", duplicate) } return nil } // firstDuplicateMember returns the first user_id or race_name that // appears more than once in members. Empty when every entry is unique. 
func firstDuplicateMember(members []Member) string {
	seenUsers := make(map[string]struct{}, len(members))
	seenRaces := make(map[string]struct{}, len(members))
	for _, member := range members {
		if _, ok := seenUsers[member.UserID]; ok {
			return member.UserID
		}
		seenUsers[member.UserID] = struct{}{}
		if _, ok := seenRaces[member.RaceName]; ok {
			return member.RaceName
		}
		seenRaces[member.RaceName] = struct{}{}
	}
	return ""
}

// Result stores the deterministic outcome of one Handle call. Business
// outcomes flow through Result; the Go-level error return is reserved
// for non-business failures (nil context, nil receiver).
type Result struct {
	// Record carries the runtime record installed by the operation.
	// Populated on success; zero on failure.
	Record runtime.RuntimeRecord

	// Outcome reports whether the operation completed (success) or
	// produced a stable failure code.
	Outcome operation.Outcome

	// ErrorCode stores the stable error code on failure. Empty on
	// success.
	ErrorCode string

	// ErrorMessage stores the operator-readable detail on failure.
	// Empty on success.
	ErrorMessage string
}

// IsSuccess reports whether the result represents a successful
// operation.
func (result Result) IsSuccess() bool {
	return result.Outcome == operation.OutcomeSuccess
}

// Dependencies groups the collaborators required by Service.
type Dependencies struct {
	// RuntimeRecords stores the runtime_records row installed by the
	// flow.
	RuntimeRecords ports.RuntimeRecordStore

	// EngineVersions resolves `target_engine_version` to the matching
	// image_ref and validates the version exists.
	EngineVersions ports.EngineVersionStore

	// PlayerMappings persists the (game_id, user_id) → race_name
	// projection derived from the engine /admin/init response.
	PlayerMappings ports.PlayerMappingStore

	// OperationLogs records the audit entry for the operation.
	OperationLogs ports.OperationLogStore

	// Engine drives the engine /admin/init call and decodes the
	// response.
	Engine ports.EngineClient

	// LobbyEvents publishes the post-success runtime_snapshot_update
	// to `gm:lobby_events`.
	LobbyEvents ports.LobbyEventsPublisher

	// Telemetry records register-runtime outcomes plus the snapshot
	// publication counter. Required.
	Telemetry *telemetry.Runtime

	// Logger records structured service-level events. Defaults to
	// `slog.Default()` when nil.
	Logger *slog.Logger

	// Clock supplies the wall-clock used for operation timestamps.
	// Defaults to `time.Now` when nil.
	Clock func() time.Time
}

// Service executes the register-runtime lifecycle operation.
type Service struct {
	runtimeRecords ports.RuntimeRecordStore
	engineVersions ports.EngineVersionStore
	playerMappings ports.PlayerMappingStore
	operationLogs  ports.OperationLogStore
	engine         ports.EngineClient
	lobbyEvents    ports.LobbyEventsPublisher
	telemetry      *telemetry.Runtime
	logger         *slog.Logger
	clock          func() time.Time
}

// NewService constructs one Service from deps. Every store/client
// dependency is required; Logger and Clock fall back to defaults.
func NewService(deps Dependencies) (*Service, error) {
	switch {
	case deps.RuntimeRecords == nil:
		return nil, errors.New("new register runtime service: nil runtime records")
	case deps.EngineVersions == nil:
		return nil, errors.New("new register runtime service: nil engine versions")
	case deps.PlayerMappings == nil:
		return nil, errors.New("new register runtime service: nil player mappings")
	case deps.OperationLogs == nil:
		return nil, errors.New("new register runtime service: nil operation logs")
	case deps.Engine == nil:
		return nil, errors.New("new register runtime service: nil engine client")
	case deps.LobbyEvents == nil:
		return nil, errors.New("new register runtime service: nil lobby events publisher")
	case deps.Telemetry == nil:
		return nil, errors.New("new register runtime service: nil telemetry runtime")
	}
	clock := deps.Clock
	if clock == nil {
		clock = time.Now
	}
	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}
	// Every log line carries the service tag for filtering.
	logger = logger.With("service", "gamemaster.registerruntime")
	return &Service{
		runtimeRecords: deps.RuntimeRecords,
		engineVersions: deps.EngineVersions,
		playerMappings: deps.PlayerMappings,
		operationLogs:  deps.OperationLogs,
		engine:         deps.Engine,
		lobbyEvents:    deps.LobbyEvents,
		telemetry:      deps.Telemetry,
		logger:         logger,
		clock:          clock,
	}, nil
}

// Handle executes one register-runtime operation end-to-end. The
// Go-level error return is reserved for non-business failures (nil
// context, nil receiver). Every business outcome flows through Result.
//
// Each helper returns ok=false when the flow must stop; the helper has
// already rolled back installed state (gated by the booleans it passes
// to recordFailure) and assembled the failure Result.
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
	if service == nil {
		return Result{}, errors.New("register runtime: nil service")
	}
	if ctx == nil {
		return Result{}, errors.New("register runtime: nil context")
	}
	opStartedAt := service.clock().UTC()
	if err := input.Validate(); err != nil {
		return service.recordFailure(ctx, opStartedAt, input, false, false, ErrorCodeInvalidRequest, err.Error()), nil
	}
	if outcome, ok := service.rejectExisting(ctx, opStartedAt, input); ok {
		return outcome, nil
	}
	imageRef, outcome, ok := service.resolveImageRef(ctx, opStartedAt, input)
	if !ok {
		return outcome, nil
	}
	record := service.buildStartingRecord(input, imageRef, opStartedAt)
	if err := service.runtimeRecords.Insert(ctx, record); err != nil {
		switch {
		case errors.Is(err, runtime.ErrConflict):
			return service.recordFailure(ctx, opStartedAt, input, false, false, ErrorCodeConflict, "runtime record already exists"), nil
		default:
			return service.recordFailure(ctx, opStartedAt, input, false, false, ErrorCodeServiceUnavailable, fmt.Sprintf("insert runtime record: %s", err.Error())), nil
		}
	}
	engineState, outcome, ok := service.callEngineInit(ctx, opStartedAt, input)
	if !ok {
		return outcome, nil
	}
	if outcome, ok := service.validateRoster(ctx, opStartedAt, input, engineState); !ok {
		return outcome, nil
	}
	if outcome, ok := service.installPlayerMappings(ctx, opStartedAt, input, engineState); !ok {
		return outcome, nil
	}
	nextGenerationAt, outcome, ok := service.computeNextGeneration(ctx, opStartedAt, input)
	if !ok {
		return outcome, nil
	}
	if outcome, ok := service.casToRunning(ctx, opStartedAt, input); !ok {
		return outcome, nil
	}
	if outcome, ok := service.persistInitialScheduling(ctx, opStartedAt, input, nextGenerationAt); !ok {
		return outcome, nil
	}
	persisted, outcome, ok := service.reloadRecord(ctx, opStartedAt, input)
	if !ok {
		return outcome, nil
	}
	// Success tail: audit log, lobby snapshot, telemetry, service log.
	// All best-effort; none of these can fail the operation any more.
	stats := projectInitToStats(engineState, input.Members)
	service.appendSuccessLog(ctx, opStartedAt, input)
	service.publishSnapshot(ctx, persisted, stats, opStartedAt)
	service.telemetry.RecordRegisterRuntimeOutcome(ctx, string(operation.OutcomeSuccess), "")
	logArgs := []any{
		"game_id", input.GameID,
		"engine_version", input.TargetEngineVersion,
		"members", len(input.Members),
		"op_source", string(input.OpSource),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	service.logger.InfoContext(ctx, "runtime registered", logArgs...)
	return Result{
		Record:  persisted,
		Outcome: operation.OutcomeSuccess,
	}, nil
}

// rejectExisting returns a Result and ok=true when the runtime record
// already exists or the lookup itself failed; ok=false continues the
// flow.
func (service *Service) rejectExisting(ctx context.Context, opStartedAt time.Time, input Input) (Result, bool) {
	_, err := service.runtimeRecords.Get(ctx, input.GameID)
	switch {
	case errors.Is(err, runtime.ErrNotFound):
		// No record yet: the expected case, keep going.
		return Result{}, false
	case err != nil:
		return service.recordFailure(ctx, opStartedAt, input, false, false, ErrorCodeServiceUnavailable, fmt.Sprintf("get runtime record: %s", err.Error())), true
	default:
		return service.recordFailure(ctx, opStartedAt, input, false, false, ErrorCodeConflict, "runtime record already exists"), true
	}
}

// resolveImageRef resolves the target engine version against the
// engine_versions registry. Returns ok=false on failure with the
// matching Result.
func (service *Service) resolveImageRef(ctx context.Context, opStartedAt time.Time, input Input) (string, Result, bool) { version, err := service.engineVersions.Get(ctx, input.TargetEngineVersion) switch { case errors.Is(err, engineversion.ErrNotFound): return "", service.recordFailure(ctx, opStartedAt, input, false, false, ErrorCodeEngineVersionNotFound, fmt.Sprintf("engine version %q not found", input.TargetEngineVersion)), false case err != nil: return "", service.recordFailure(ctx, opStartedAt, input, false, false, ErrorCodeServiceUnavailable, fmt.Sprintf("get engine version: %s", err.Error())), false } return version.ImageRef, Result{}, true } // buildStartingRecord assembles the initial runtime_records row, // matching `gamemaster/README.md §Lifecycles → Register-runtime` step 4. func (service *Service) buildStartingRecord(input Input, imageRef string, now time.Time) runtime.RuntimeRecord { return runtime.RuntimeRecord{ GameID: input.GameID, Status: runtime.StatusStarting, EngineEndpoint: input.EngineEndpoint, CurrentImageRef: imageRef, CurrentEngineVersion: input.TargetEngineVersion, TurnSchedule: input.TurnSchedule, CurrentTurn: 0, NextGenerationAt: nil, SkipNextTick: false, EngineHealth: "", CreatedAt: now, UpdatedAt: now, } } // callEngineInit dispatches the engine /admin/init call and maps the // transport-layer error to a stable Result code. ok=false means the // flow stops. 
func (service *Service) callEngineInit(ctx context.Context, opStartedAt time.Time, input Input) (ports.StateResponse, Result, bool) { races := make([]ports.InitRace, 0, len(input.Members)) for _, member := range input.Members { races = append(races, ports.InitRace{RaceName: member.RaceName}) } state, err := service.engine.Init(ctx, input.EngineEndpoint, ports.InitRequest{Races: races}) if err == nil { return state, Result{}, true } code := classifyEngineError(err) message := fmt.Sprintf("engine init: %s", err.Error()) return ports.StateResponse{}, service.recordFailure(ctx, opStartedAt, input, true, false, code, message), false } // classifyEngineError maps the engine port sentinels to the // register-runtime stable error codes per Stage 13 D1. func classifyEngineError(err error) string { switch { case errors.Is(err, ports.ErrEngineValidation): return ErrorCodeEngineValidationError case errors.Is(err, ports.ErrEngineProtocolViolation): return ErrorCodeEngineProtocolViolation case errors.Is(err, ports.ErrEngineUnreachable): return ErrorCodeEngineUnreachable default: return ErrorCodeEngineUnreachable } } // validateRoster checks that the engine response carries exactly the // race set Game Master sent on /admin/init. ok=false means the flow // stops. 
func (service *Service) validateRoster(ctx context.Context, opStartedAt time.Time, input Input, state ports.StateResponse) (Result, bool) { if len(state.Players) != len(input.Members) { message := fmt.Sprintf("engine player count %d does not match roster size %d", len(state.Players), len(input.Members)) return service.recordFailure(ctx, opStartedAt, input, true, false, ErrorCodeEngineProtocolViolation, message), false } expected := make(map[string]struct{}, len(input.Members)) for _, member := range input.Members { expected[member.RaceName] = struct{}{} } for _, player := range state.Players { if _, ok := expected[player.RaceName]; !ok { message := fmt.Sprintf("engine returned race %q not present in roster", player.RaceName) return service.recordFailure(ctx, opStartedAt, input, true, false, ErrorCodeEngineProtocolViolation, message), false } } return Result{}, true } // installPlayerMappings projects the engine response onto // player_mappings rows and persists them in one batch. ok=false means // the flow stops (and rolls back both stores). 
func (service *Service) installPlayerMappings(ctx context.Context, opStartedAt time.Time, input Input, state ports.StateResponse) (Result, bool) { userByRace := make(map[string]string, len(input.Members)) for _, member := range input.Members { userByRace[member.RaceName] = member.UserID } mappings := make([]playermapping.PlayerMapping, 0, len(state.Players)) for _, player := range state.Players { userID, ok := userByRace[player.RaceName] if !ok { message := fmt.Sprintf("engine returned race %q not present in roster", player.RaceName) return service.recordFailure(ctx, opStartedAt, input, true, false, ErrorCodeEngineProtocolViolation, message), false } mappings = append(mappings, playermapping.PlayerMapping{ GameID: input.GameID, UserID: userID, RaceName: player.RaceName, EnginePlayerUUID: player.EnginePlayerUUID, CreatedAt: opStartedAt, }) } if err := service.playerMappings.BulkInsert(ctx, mappings); err != nil { // BulkInsert is per-statement atomic (stage 11 D7), so a failure // leaves no mappings to clean up — only the runtime row. switch { case errors.Is(err, playermapping.ErrConflict): return service.recordFailure(ctx, opStartedAt, input, true, false, ErrorCodeConflict, fmt.Sprintf("bulk insert player mappings: %s", err.Error())), false default: return service.recordFailure(ctx, opStartedAt, input, true, false, ErrorCodeServiceUnavailable, fmt.Sprintf("bulk insert player mappings: %s", err.Error())), false } } return Result{}, true } // computeNextGeneration parses the cron schedule and computes the first // next-generation timestamp (no skip pending). ok=false means the flow // stops with rollback. 
func (service *Service) computeNextGeneration(ctx context.Context, opStartedAt time.Time, input Input) (time.Time, Result, bool) { sched, err := schedule.Parse(input.TurnSchedule) if err != nil { return time.Time{}, service.recordFailure(ctx, opStartedAt, input, true, true, ErrorCodeInvalidRequest, fmt.Sprintf("parse turn schedule: %s", err.Error())), false } next, _ := sched.Next(opStartedAt, false) return next.UTC(), Result{}, true } // casToRunning flips the runtime record from `starting` to `running`. // On CAS failure or any storage error the flow rolls back both stores. func (service *Service) casToRunning(ctx context.Context, opStartedAt time.Time, input Input) (Result, bool) { err := service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{ GameID: input.GameID, ExpectedFrom: runtime.StatusStarting, To: runtime.StatusRunning, Now: opStartedAt, }) switch { case err == nil: return Result{}, true case errors.Is(err, runtime.ErrConflict): return service.recordFailure(ctx, opStartedAt, input, true, true, ErrorCodeConflict, fmt.Sprintf("cas runtime status to running: %s", err.Error())), false default: return service.recordFailure(ctx, opStartedAt, input, true, true, ErrorCodeServiceUnavailable, fmt.Sprintf("cas runtime status to running: %s", err.Error())), false } } // persistInitialScheduling writes the first `next_generation_at` and // the (already false) skip flag plus turn=0 on the runtime row. // Failure rolls back both stores. 
func (service *Service) persistInitialScheduling(ctx context.Context, opStartedAt time.Time, input Input, next time.Time) (Result, bool) { err := service.runtimeRecords.UpdateScheduling(ctx, ports.UpdateSchedulingInput{ GameID: input.GameID, NextGenerationAt: &next, SkipNextTick: false, CurrentTurn: 0, Now: opStartedAt, }) if err != nil { return service.recordFailure(ctx, opStartedAt, input, true, true, ErrorCodeServiceUnavailable, fmt.Sprintf("update initial scheduling: %s", err.Error())), false } return Result{}, true } // reloadRecord re-reads the runtime row so the returned Result.Record // carries the post-CAS, post-scheduling timestamps the adapters set. // On read failure the flow rolls back both stores. func (service *Service) reloadRecord(ctx context.Context, opStartedAt time.Time, input Input) (runtime.RuntimeRecord, Result, bool) { persisted, err := service.runtimeRecords.Get(ctx, input.GameID) if err != nil { return runtime.RuntimeRecord{}, service.recordFailure(ctx, opStartedAt, input, true, true, ErrorCodeServiceUnavailable, fmt.Sprintf("reload runtime record: %s", err.Error())), false } return persisted, Result{}, true } // projectInitToStats joins the engine /admin/init response on RaceName // against the input roster to produce one PlayerTurnStats per active // member. The caller has already validated that every player race name // is present in the roster, so the lookup is total. 
func projectInitToStats(state ports.StateResponse, members []Member) []ports.PlayerTurnStats { if len(state.Players) == 0 { return nil } userByRace := make(map[string]string, len(members)) for _, member := range members { userByRace[member.RaceName] = member.UserID } stats := make([]ports.PlayerTurnStats, 0, len(state.Players)) for _, player := range state.Players { userID, ok := userByRace[player.RaceName] if !ok { continue } stats = append(stats, ports.PlayerTurnStats{ UserID: userID, Planets: player.Planets, Population: player.Population, }) } sort.Slice(stats, func(i, j int) bool { return stats[i].UserID < stats[j].UserID }) return stats } // recordFailure assembles the failure Result, rolls back any installed // state, appends the operation_log failure entry, and emits telemetry. // runtimeInserted reports whether the runtime row was already // installed; playerMappingsInstalled reports whether the player_mappings // rows were installed too. The two booleans gate the rollback so a // race-induced ErrConflict from Insert does not delete a row owned by // another caller. 
func (service *Service) recordFailure( ctx context.Context, opStartedAt time.Time, input Input, runtimeInserted bool, playerMappingsInstalled bool, errorCode string, errorMessage string, ) Result { if runtimeInserted { service.rollback(ctx, input.GameID, playerMappingsInstalled) } finishedAt := service.clock().UTC() service.bestEffortAppend(ctx, operation.OperationEntry{ GameID: input.GameID, OpKind: operation.OpKindRegisterRuntime, OpSource: fallbackOpSource(input.OpSource), SourceRef: input.SourceRef, Outcome: operation.OutcomeFailure, ErrorCode: errorCode, ErrorMessage: errorMessage, StartedAt: opStartedAt, FinishedAt: &finishedAt, }) service.telemetry.RecordRegisterRuntimeOutcome(ctx, string(operation.OutcomeFailure), errorCode) logArgs := []any{ "game_id", input.GameID, "engine_version", input.TargetEngineVersion, "op_source", string(input.OpSource), "error_code", errorCode, "error_message", errorMessage, } logArgs = append(logArgs, logging.ContextAttrs(ctx)...) service.logger.WarnContext(ctx, "register runtime failed", logArgs...) return Result{ Outcome: operation.OutcomeFailure, ErrorCode: errorCode, ErrorMessage: errorMessage, } } // rollback removes any installed state. Both store calls are // idempotent; failures are logged but never overwrite the original // failure reason. A fresh background context is used so a cancelled // request context does not strand the row. 
func (service *Service) rollback(ctx context.Context, gameID string, playerMappingsInstalled bool) { cleanupCtx, cancel := context.WithTimeout(context.Background(), rollbackTimeout) defer cancel() if playerMappingsInstalled { if err := service.playerMappings.DeleteByGame(cleanupCtx, gameID); err != nil { service.logger.ErrorContext(ctx, "rollback player mappings", "game_id", gameID, "err", err.Error(), ) } } if err := service.runtimeRecords.Delete(cleanupCtx, gameID); err != nil { service.logger.ErrorContext(ctx, "rollback runtime record", "game_id", gameID, "err", err.Error(), ) } } // rollbackTimeout bounds each rollback storage call. A fresh background // context is used so a canceled request context does not block the // cleanup; the timeout matches the shape used by // `rtmanager/internal/service/startruntime.Service.releaseLease`. const rollbackTimeout = 5 * time.Second // appendSuccessLog records the success operation_log entry for the // completed register-runtime operation. func (service *Service) appendSuccessLog(ctx context.Context, opStartedAt time.Time, input Input) { finishedAt := service.clock().UTC() service.bestEffortAppend(ctx, operation.OperationEntry{ GameID: input.GameID, OpKind: operation.OpKindRegisterRuntime, OpSource: fallbackOpSource(input.OpSource), SourceRef: input.SourceRef, Outcome: operation.OutcomeSuccess, StartedAt: opStartedAt, FinishedAt: &finishedAt, }) } // publishSnapshot publishes the post-success runtime_snapshot_update // per `gamemaster/README.md §Lifecycles → Register-runtime` step 9. // Failures are logged but do not roll back the just-installed runtime // record; the snapshot stream is best-effort by contract. 
func (service *Service) publishSnapshot(ctx context.Context, record runtime.RuntimeRecord, stats []ports.PlayerTurnStats, occurredAt time.Time) { msg := ports.RuntimeSnapshotUpdate{ GameID: record.GameID, CurrentTurn: record.CurrentTurn, RuntimeStatus: record.Status, EngineHealthSummary: record.EngineHealth, PlayerTurnStats: stats, OccurredAt: occurredAt, } if err := service.lobbyEvents.PublishSnapshotUpdate(ctx, msg); err != nil { service.logger.ErrorContext(ctx, "publish runtime snapshot update", "game_id", record.GameID, "err", err.Error(), ) return } service.telemetry.RecordLobbyEventPublished(ctx, "runtime_snapshot_update") } // bestEffortAppend writes one operation_log entry. A failure is logged // and discarded; the runtime record (or its absence after rollback) is // the source of truth. func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) { if _, err := service.operationLogs.Append(ctx, entry); err != nil { service.logger.ErrorContext(ctx, "append operation log", "game_id", entry.GameID, "op_kind", string(entry.OpKind), "outcome", string(entry.Outcome), "error_code", entry.ErrorCode, "err", err.Error(), ) } } // fallbackOpSource defaults to `admin_rest` when the caller did not // supply a known op source. Mirrors the README §Trusted Surfaces rule // "when missing or unrecognised, GM defaults to `op_source=admin_rest`". func fallbackOpSource(source operation.OpSource) operation.OpSource { if source.IsKnown() { return source } return operation.OpSourceAdminRest }