feat: gamemaster

This commit is contained in:
Ilia Denisov
2026-05-03 07:59:03 +02:00
committed by GitHub
parent a7cee15115
commit 3e2622757e
229 changed files with 41521 additions and 1098 deletions
@@ -0,0 +1,43 @@
package runtime
import (
"errors"
"fmt"
)
var (
	// ErrNotFound is returned when a requested runtime record does not
	// exist in the store.
	ErrNotFound = errors.New("runtime record not found")

	// ErrConflict is returned when a runtime mutation cannot be applied
	// because the record changed concurrently or a compare-and-swap guard
	// failed.
	ErrConflict = errors.New("runtime record conflict")

	// ErrInvalidTransition is the sentinel reported when Transition
	// rejects a `(from, to)` pair.
	ErrInvalidTransition = errors.New("invalid runtime status transition")
)
// InvalidTransitionError stores the rejected `(from, to)` pair and wraps
// ErrInvalidTransition so callers can match it with errors.Is. Its
// methods are declared on the pointer receiver, so an errors.As target
// must be of type *InvalidTransitionError.
type InvalidTransitionError struct {
// From is the status the rejected transition attempted to leave.
From Status
// To is the status the rejected transition attempted to enter.
To Status
}
// Error renders the rejected pair as a human-readable message, quoting
// both the source and the destination status.
func (err *InvalidTransitionError) Error() string {
	const format = "invalid runtime status transition from %q to %q"
	return fmt.Sprintf(format, err.From, err.To)
}
// Unwrap always returns ErrInvalidTransition, regardless of the stored
// pair, so errors.Is(err, ErrInvalidTransition) recognizes every
// *InvalidTransitionError.
func (err *InvalidTransitionError) Unwrap() error {
return ErrInvalidTransition
}
+254
View File
@@ -0,0 +1,254 @@
// Package runtime defines the runtime-record domain model, status
// machine, and sentinel errors owned by Game Master.
//
// The package mirrors the durable shape of the `runtime_records`
// PostgreSQL table (see
// `galaxy/gamemaster/internal/adapters/postgres/migrations/00001_init.sql`).
// Every status / transition / required-field rule already documented in
// `galaxy/gamemaster/README.md` lives here as code so adapter and service
// layers do not re-derive it.
package runtime
import (
"fmt"
"strings"
"time"
)
// Status identifies one runtime-record lifecycle state. IsKnown and
// AllStatuses enumerate the same seven constants declared below, so a new
// status has to be added in all three places.
type Status string
const (
// StatusStarting reports that register-runtime has persisted the row
// but the engine /admin/init call has not yet succeeded.
StatusStarting Status = "starting"
// StatusRunning reports that the runtime is healthy and accepting
// player commands and turn generation.
StatusRunning Status = "running"
// StatusGenerationInProgress reports that the scheduler or admin
// force-next-turn flow has CAS'd the row to drive turn generation.
StatusGenerationInProgress Status = "generation_in_progress"
// StatusGenerationFailed reports that turn generation surfaced an
// engine error and the runtime is awaiting manual recovery.
StatusGenerationFailed Status = "generation_failed"
// StatusStopped reports that an admin stop has completed; the row
// stays in PostgreSQL for audit.
StatusStopped Status = "stopped"
// StatusEngineUnreachable reports that runtime:health_events observed
// an engine container failure (exited, OOM, disappeared, or repeated
// probe failures).
StatusEngineUnreachable Status = "engine_unreachable"
// StatusFinished reports that the engine returned `finished:true` on
// a turn-generation response. The state is terminal (it is the only
// status for which IsTerminal returns true): the row stays here
// indefinitely; operator cleanup is the only path out.
StatusFinished Status = "finished"
)
// IsKnown reports whether status is one of the values in the frozen
// runtime status vocabulary. Every other value, including the empty
// string, yields false.
func (status Status) IsKnown() bool {
	switch status {
	case StatusStarting, StatusRunning, StatusStopped, StatusFinished:
		return true
	case StatusGenerationInProgress, StatusGenerationFailed:
		return true
	case StatusEngineUnreachable:
		return true
	}
	return false
}
// IsTerminal reports whether status can no longer accept lifecycle
// transitions. Per `gamemaster/README.md §Game Master status model`, only
// `finished` is terminal. `stopped` is deliberately reported as
// non-terminal: v1 wires no transitions out of it, but it is kept as an
// end-state that is not architecturally closed, for admin replay purposes.
func (status Status) IsTerminal() bool {
	switch status {
	case StatusFinished:
		return true
	default:
		return false
	}
}
// AllStatuses returns every runtime status value in the frozen
// vocabulary. Each call yields a freshly allocated slice whose order is
// stable across calls and matches the README §Persistence Layout listing.
func AllStatuses() []Status {
	statuses := [...]Status{
		StatusStarting,
		StatusRunning,
		StatusGenerationInProgress,
		StatusGenerationFailed,
		StatusStopped,
		StatusEngineUnreachable,
		StatusFinished,
	}
	return statuses[:]
}
// RuntimeRecord stores one durable runtime record owned by Game Master.
// It mirrors one row of the `runtime_records` table.
//
// NextGenerationAt is *time.Time so a missing tick (e.g., a row that has
// just entered with status=starting) is unambiguous. StartedAt, StoppedAt,
// and FinishedAt are *time.Time for the same reason and align with the
// jet-generated model.
//
// Validate enforces the invariants on these fields; the field comments
// below note what it checks for each of them.
type RuntimeRecord struct {
// GameID identifies the platform game owning this runtime record.
// Validate rejects a blank value.
GameID string
// Status stores the current lifecycle state. Validate rejects values
// for which IsKnown is false.
Status Status
// EngineEndpoint stores the stable URL Game Master uses to reach the
// engine container, in `http://galaxy-game-{game_id}:8080` form.
// Validate only rejects a blank value; it does not parse the URL.
EngineEndpoint string
// CurrentImageRef stores the Docker reference of the running engine
// image (or the most recent one for stopped/finished records).
// Validate rejects a blank value.
CurrentImageRef string
// CurrentEngineVersion stores the semver of the currently-bound
// engine version (registered in `engine_versions`). Validate only
// rejects a blank value; it does not parse the version.
CurrentEngineVersion string
// TurnSchedule stores the five-field cron expression governing turn
// generation, copied from the platform game record at
// register-runtime time. Validate only rejects a blank value; it does
// not parse the expression.
TurnSchedule string
// CurrentTurn stores the last completed turn number; zero until the
// first turn generates. Validate rejects negative values.
CurrentTurn int
// NextGenerationAt stores the next due tick. Nil when no tick is
// scheduled (e.g., status=starting, finished, stopped). Validate only
// rejects a non-nil pointer to the zero time; it does not tie
// nil-ness to the status.
NextGenerationAt *time.Time
// SkipNextTick is true when force-next-turn has set the skip flag
// for the next regular tick. Cleared by the scheduler after the
// first scheduled step is skipped.
SkipNextTick bool
// EngineHealth stores the short text summary derived from
// runtime:health_events; empty until the first health observation.
EngineHealth string
// CreatedAt stores the wall-clock at which the record was created.
// Validate rejects the zero time.
CreatedAt time.Time
// UpdatedAt stores the wall-clock of the most recent mutation.
// Validate rejects the zero time and any value before CreatedAt.
UpdatedAt time.Time
// StartedAt stores the wall-clock at which the runtime first
// transitioned to running. Non-nil once the status leaves starting:
// Validate requires nil for starting records, non-nil for every other
// status, and a value not before CreatedAt whenever it is present.
StartedAt *time.Time
// StoppedAt stores the wall-clock at which the runtime was stopped.
// Non-nil when status is stopped, where Validate also requires it to
// be non-zero and not before StartedAt; Validate does not inspect it
// for other statuses.
StoppedAt *time.Time
// FinishedAt stores the wall-clock at which the engine reported
// finish. Non-nil when status is finished, where Validate also
// requires it to be non-zero and not before StartedAt; Validate does
// not inspect it for other statuses.
FinishedAt *time.Time
}
// Validate checks record against the runtime-record invariants implied by
// README §Lifecycles and the SQL CHECK on `runtime_records`. It returns
// nil for a consistent record, otherwise an error describing the first
// violated rule; checks run in a fixed order, so one call reports at most
// one problem.
//
// Rules, in evaluation order:
//   - GameID must contain non-whitespace text and Status must be known.
//   - EngineEndpoint, CurrentImageRef, CurrentEngineVersion, and
//     TurnSchedule must contain non-whitespace text.
//   - CurrentTurn must not be negative.
//   - CreatedAt and UpdatedAt must be set, UpdatedAt not before CreatedAt.
//   - NextGenerationAt, when present, must not be the zero time.
//   - StartedAt must be nil for starting records and non-nil for every
//     other status; stopped and finished records additionally need a
//     non-zero StoppedAt / FinishedAt that is not before StartedAt.
//   - StartedAt, when present, must not be before CreatedAt.
func (record RuntimeRecord) Validate() error {
	blank := func(value string) bool {
		return strings.TrimSpace(value) == ""
	}

	// Status-independent invariants. A bare switch evaluates its cases top
	// to bottom, so the first violated rule wins.
	switch {
	case blank(record.GameID):
		return fmt.Errorf("game id must not be empty")
	case !record.Status.IsKnown():
		return fmt.Errorf("status %q is unsupported", record.Status)
	case blank(record.EngineEndpoint):
		return fmt.Errorf("engine endpoint must not be empty")
	case blank(record.CurrentImageRef):
		return fmt.Errorf("current image ref must not be empty")
	case blank(record.CurrentEngineVersion):
		return fmt.Errorf("current engine version must not be empty")
	case blank(record.TurnSchedule):
		return fmt.Errorf("turn schedule must not be empty")
	case record.CurrentTurn < 0:
		return fmt.Errorf("current turn must not be negative")
	case record.CreatedAt.IsZero():
		return fmt.Errorf("created at must not be zero")
	case record.UpdatedAt.IsZero():
		return fmt.Errorf("updated at must not be zero")
	case record.UpdatedAt.Before(record.CreatedAt):
		return fmt.Errorf("updated at must not be before created at")
	case record.NextGenerationAt != nil && record.NextGenerationAt.IsZero():
		return fmt.Errorf("next generation at must not be zero when present")
	}

	// Status-dependent rules for the optional lifecycle timestamps.
	started := record.StartedAt
	switch record.Status {
	case StatusStarting:
		if started != nil {
			return fmt.Errorf("started at must be nil for starting records")
		}

	case StatusRunning,
		StatusGenerationInProgress,
		StatusGenerationFailed,
		StatusEngineUnreachable:
		switch {
		case started == nil:
			return fmt.Errorf(
				"started at must not be nil for %s records",
				record.Status,
			)
		case started.IsZero():
			return fmt.Errorf("started at must not be zero when present")
		}

	case StatusStopped:
		stopped := record.StoppedAt
		switch {
		case started == nil:
			return fmt.Errorf("started at must not be nil for stopped records")
		case stopped == nil:
			return fmt.Errorf("stopped at must not be nil for stopped records")
		case stopped.IsZero():
			return fmt.Errorf("stopped at must not be zero when present")
		case stopped.Before(*started):
			return fmt.Errorf("stopped at must not be before started at")
		}

	case StatusFinished:
		finished := record.FinishedAt
		switch {
		case started == nil:
			return fmt.Errorf("started at must not be nil for finished records")
		case finished == nil:
			return fmt.Errorf("finished at must not be nil for finished records")
		case finished.IsZero():
			return fmt.Errorf("finished at must not be zero when present")
		case finished.Before(*started):
			return fmt.Errorf("finished at must not be before started at")
		}
	}

	// Applies to every status that carries a start time.
	if started != nil && started.Before(record.CreatedAt) {
		return fmt.Errorf("started at must not be before created at")
	}
	return nil
}
@@ -0,0 +1,130 @@
package runtime
import (
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// validRunningRecord builds a running record that passes Validate. Its
// timestamps are strictly increasing: created, started one minute later,
// updated one minute after that, and the next tick one hour after the
// update. Tests mutate the returned copy to provoke individual failures.
func validRunningRecord() RuntimeRecord {
	var (
		createdAt = time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)
		startedAt = createdAt.Add(time.Minute)
		updatedAt = startedAt.Add(time.Minute)
		nextTick  = updatedAt.Add(time.Hour)
	)

	record := RuntimeRecord{
		GameID:               "game-1",
		Status:               StatusRunning,
		EngineEndpoint:       "http://galaxy-game-1:8080",
		CurrentImageRef:      "ghcr.io/galaxy/game:v1.2.3",
		CurrentEngineVersion: "v1.2.3",
		TurnSchedule:         "0 18 * * *",
		CurrentTurn:          0,
		CreatedAt:            createdAt,
		UpdatedAt:            updatedAt,
	}
	record.NextGenerationAt = &nextTick
	record.StartedAt = &startedAt
	return record
}
// TestStatusIsKnown checks that every value listed by AllStatuses is
// known and that an arbitrary string and the empty string are not.
func TestStatusIsKnown(t *testing.T) {
	known := AllStatuses()
	for i := range known {
		assert.True(t, known[i].IsKnown(), "want known: %q", known[i])
	}

	for _, unknown := range []Status{"exotic", ""} {
		assert.False(t, unknown.IsKnown())
	}
}
// TestStatusIsTerminal checks that finished is terminal and that no other
// status listed by AllStatuses is.
func TestStatusIsTerminal(t *testing.T) {
	assert.True(t, StatusFinished.IsTerminal())

	for _, status := range AllStatuses() {
		if status != StatusFinished {
			assert.False(t, status.IsTerminal(), "%q must not be terminal", status)
		}
	}
}
// TestAllStatusesStable checks that two consecutive AllStatuses calls
// return equal slices and that the vocabulary holds seven values.
func TestAllStatusesStable(t *testing.T) {
	firstCall, secondCall := AllStatuses(), AllStatuses()

	assert.Equal(t, firstCall, secondCall)
	assert.Len(t, firstCall, 7)
}
func TestRuntimeRecordValidateHappy(t *testing.T) {
require.NoError(t, validRunningRecord().Validate())
}
// TestRuntimeRecordValidateAcceptsStarting checks that a starting record
// with no start time and no scheduled tick is valid.
func TestRuntimeRecordValidateAcceptsStarting(t *testing.T) {
	starting := validRunningRecord()
	starting.Status = StatusStarting
	starting.StartedAt, starting.NextGenerationAt = nil, nil

	err := starting.Validate()
	assert.NoError(t, err)
}
// TestRuntimeRecordValidateRequiresFinishedAt checks that a finished
// record is rejected without FinishedAt and accepted once it is set.
func TestRuntimeRecordValidateRequiresFinishedAt(t *testing.T) {
	finishedRecord := validRunningRecord()
	finishedRecord.Status = StatusFinished
	finishedRecord.FinishedAt = nil
	assert.Error(t, finishedRecord.Validate())

	finishedAt := finishedRecord.UpdatedAt.Add(time.Minute)
	finishedRecord.FinishedAt = &finishedAt
	assert.NoError(t, finishedRecord.Validate())
}
// TestRuntimeRecordValidateRequiresStoppedAtForStopped checks that a
// stopped record is rejected without StoppedAt and accepted once it is
// set.
func TestRuntimeRecordValidateRequiresStoppedAtForStopped(t *testing.T) {
	stoppedRecord := validRunningRecord()
	stoppedRecord.Status = StatusStopped
	assert.Error(t, stoppedRecord.Validate())

	stoppedAt := stoppedRecord.UpdatedAt.Add(time.Minute)
	stoppedRecord.StoppedAt = &stoppedAt
	assert.NoError(t, stoppedRecord.Validate())
}
// TestRuntimeRecordValidateRejects applies one invalidating mutation at a
// time to the baseline fixture and checks that Validate reports an error.
// It asserts only that an error occurs, not which rule produced it.
func TestRuntimeRecordValidateRejects(t *testing.T) {
	type rejectCase struct {
		name   string
		mutate func(*RuntimeRecord)
	}

	cases := []rejectCase{
		{name: "empty game id", mutate: func(r *RuntimeRecord) { r.GameID = "" }},
		{name: "unknown status", mutate: func(r *RuntimeRecord) { r.Status = "exotic" }},
		{name: "empty engine endpoint", mutate: func(r *RuntimeRecord) { r.EngineEndpoint = "" }},
		{name: "empty image ref", mutate: func(r *RuntimeRecord) { r.CurrentImageRef = "" }},
		{name: "empty engine version", mutate: func(r *RuntimeRecord) { r.CurrentEngineVersion = "" }},
		{name: "empty turn schedule", mutate: func(r *RuntimeRecord) { r.TurnSchedule = "" }},
		{name: "negative turn", mutate: func(r *RuntimeRecord) { r.CurrentTurn = -1 }},
		{name: "zero created at", mutate: func(r *RuntimeRecord) { r.CreatedAt = time.Time{} }},
		{name: "zero updated at", mutate: func(r *RuntimeRecord) { r.UpdatedAt = time.Time{} }},
		{name: "updated before created", mutate: func(r *RuntimeRecord) {
			r.UpdatedAt = r.CreatedAt.Add(-time.Minute)
		}},
		{name: "started before created", mutate: func(r *RuntimeRecord) {
			earlier := r.CreatedAt.Add(-time.Minute)
			r.StartedAt = &earlier
		}},
		{name: "running missing started at", mutate: func(r *RuntimeRecord) { r.StartedAt = nil }},
		{name: "starting with started at", mutate: func(r *RuntimeRecord) {
			// StartedAt stays set on purpose; that is the violation.
			r.Status = StatusStarting
		}},
		{name: "zero next generation at", mutate: func(r *RuntimeRecord) {
			var zero time.Time
			r.NextGenerationAt = &zero
		}},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			record := validRunningRecord()
			tc.mutate(&record)
			assert.Error(t, record.Validate())
		})
	}
}
@@ -0,0 +1,77 @@
package runtime
// transitionKey stores one `(from, to)` pair in the allowed-transitions
// table. Both fields are comparable strings, so the struct can be used
// directly as a map key.
type transitionKey struct {
// from is the status the transition leaves.
from Status
// to is the status the transition enters.
to Status
}
// allowedTransitions is the set of runtime-status transitions Game Master
// may apply; Transition consults it for every pair. The set mirrors the
// lifecycle flows frozen in `galaxy/gamemaster/README.md §Lifecycles`:
//
//   - starting → running: register-runtime CAS after a successful
//     engine /admin/init.
//   - running → generation_in_progress: scheduler ticker or admin
//     force-next-turn enters turn generation.
//   - generation_in_progress → running: turn generation succeeded with
//     `finished=false`.
//   - generation_in_progress → generation_failed: engine timeout or
//     5xx during turn generation.
//   - generation_in_progress → finished: engine returned
//     `finished=true`; the state is terminal.
//   - generation_failed → generation_in_progress: admin force-next-turn
//     after manual recovery.
//   - running → engine_unreachable: runtime:health_events observed an
//     engine container failure (Stage 18 consumer).
//   - engine_unreachable → running: runtime:health_events observed a
//     recovery; reserved for the Stage 18 consumer and declared here so
//     Stage 18 needs no transitions edit.
//   - running → stopped, generation_in_progress → stopped,
//     generation_failed → stopped, engine_unreachable → stopped: admin
//     stop (README §Stop: «CAS `runtime_records.status: * → stopped`»).
//
// No transition leaves `stopped` or `finished`.
//
// NOTE(review): `starting → stopped` is not in the table even though the
// README wording quoted above is `* → stopped`; confirm that the omission
// is intentional.
var allowedTransitions = map[transitionKey]struct{}{
	// Start-up.
	{from: StatusStarting, to: StatusRunning}: {},

	// Turn generation cycle.
	{from: StatusRunning, to: StatusGenerationInProgress}: {},
	{from: StatusGenerationInProgress, to: StatusRunning}: {},
	{from: StatusGenerationInProgress, to: StatusGenerationFailed}: {},
	{from: StatusGenerationInProgress, to: StatusFinished}: {},
	{from: StatusGenerationFailed, to: StatusGenerationInProgress}: {},

	// Engine health.
	{from: StatusRunning, to: StatusEngineUnreachable}: {},
	{from: StatusEngineUnreachable, to: StatusRunning}: {},

	// Admin stop.
	{from: StatusRunning, to: StatusStopped}: {},
	{from: StatusGenerationInProgress, to: StatusStopped}: {},
	{from: StatusGenerationFailed, to: StatusStopped}: {},
	{from: StatusEngineUnreachable, to: StatusStopped}: {},
}
// AllowedTransitions returns a copy of the allowed transitions table used
// by Transition, grouped by source status: each key maps to the statuses
// it may move to. Source statuses with no outgoing transition have no key.
//
// The result is rebuilt on every call, so callers may mutate it freely.
// Destination slices follow AllStatuses order, which makes the output
// identical from call to call; ranging over the returned map itself is
// still unordered, as for any Go map.
//
// Only pairs whose statuses both appear in AllStatuses are reported.
// Transition rejects unknown statuses before consulting the table, so
// such a pair could never be applied anyway.
func AllowedTransitions() map[Status][]Status {
	statuses := AllStatuses()
	result := make(map[Status][]Status, len(statuses))
	// Probe the table in vocabulary order instead of ranging over it:
	// map iteration order is unspecified and would shuffle the slices.
	for _, from := range statuses {
		for _, to := range statuses {
			if _, ok := allowedTransitions[transitionKey{from: from, to: to}]; ok {
				result[from] = append(result[from], to)
			}
		}
	}
	return result
}
// Transition reports whether a record in status from may move to status
// next. It returns nil when both statuses are known and the pair is in
// the allowed-transitions table; in every other case it returns an
// *InvalidTransitionError carrying the pair, which wraps
// ErrInvalidTransition. It does not touch any store and is safe to call
// from any layer.
func Transition(from Status, next Status) error {
	if from.IsKnown() && next.IsKnown() {
		if _, allowed := allowedTransitions[transitionKey{from: from, to: next}]; allowed {
			return nil
		}
	}
	return &InvalidTransitionError{From: from, To: next}
}
@@ -0,0 +1,90 @@
package runtime
import (
"errors"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestTransitionAcceptsAllAllowedPairs runs one subtest per pair reported
// by AllowedTransitions and checks that Transition accepts it.
func TestTransitionAcceptsAllAllowedPairs(t *testing.T) {
	table := AllowedTransitions()
	for source, targets := range table {
		for _, target := range targets {
			name := string(source) + "->" + string(target)
			t.Run(name, func(t *testing.T) {
				err := Transition(source, target)
				assert.NoError(t, err)
			})
		}
	}
}
func TestTransitionRejectsForbiddenPairs(t *testing.T) {
allowed := AllowedTransitions()
allowedSet := make(map[transitionKey]struct{})
for from, tos := range allowed {
for _, to := range tos {
allowedSet[transitionKey{from: from, to: to}] = struct{}{}
}
}
for _, from := range AllStatuses() {
for _, to := range AllStatuses() {
if _, ok := allowedSet[transitionKey{from: from, to: to}]; ok {
continue
}
t.Run(string(from)+"->"+string(to), func(t *testing.T) {
err := Transition(from, to)
require.Error(t, err)
var typed *InvalidTransitionError
assert.True(t, errors.As(err, &typed))
assert.Equal(t, from, typed.From)
assert.Equal(t, to, typed.To)
assert.True(t, errors.Is(err, ErrInvalidTransition))
})
}
}
}
// TestTransitionRejectsUnknownStatus checks that Transition rejects a
// pair when either side, or both, is outside the status vocabulary, and
// that the error matches ErrInvalidTransition.
func TestTransitionRejectsUnknownStatus(t *testing.T) {
	type unknownCase struct {
		name     string
		from, to Status
	}

	cases := []unknownCase{
		{name: "unknown from", from: "exotic", to: StatusRunning},
		{name: "unknown to", from: StatusRunning, to: "exotic"},
		{name: "both unknown", from: "from-x", to: "to-y"},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			err := Transition(tc.from, tc.to)
			require.Error(t, err)
			assert.True(t, errors.Is(err, ErrInvalidTransition))
		})
	}
}
// TestAllowedTransitionsIncludesExpectedFlows pins the twelve lifecycle
// flows that AllowedTransitions must report, independently of how the
// table is stored.
func TestAllowedTransitionsIncludesExpectedFlows(t *testing.T) {
	table := AllowedTransitions()

	expectedFlows := []struct {
		from Status
		to   Status
	}{
		{StatusStarting, StatusRunning},
		{StatusRunning, StatusGenerationInProgress},
		{StatusGenerationInProgress, StatusRunning},
		{StatusGenerationInProgress, StatusGenerationFailed},
		{StatusGenerationInProgress, StatusFinished},
		{StatusGenerationFailed, StatusGenerationInProgress},
		{StatusRunning, StatusEngineUnreachable},
		{StatusEngineUnreachable, StatusRunning},
		{StatusRunning, StatusStopped},
		{StatusGenerationInProgress, StatusStopped},
		{StatusGenerationFailed, StatusStopped},
		{StatusEngineUnreachable, StatusStopped},
	}

	for _, flow := range expectedFlows {
		reachable := table[flow.from]
		assert.Containsf(t, reachable, flow.to,
			"expected %q in transitions from %q, got %v",
			flow.to, flow.from, reachable)
	}
}