feat: runtime manager

This commit is contained in:
Ilia Denisov
2026-04-28 20:39:18 +02:00
committed by GitHub
parent e0a99b346b
commit a7cee15115
289 changed files with 45660 additions and 2207 deletions
@@ -0,0 +1,43 @@
package runtime
import (
"errors"
"fmt"
)
// Sentinel errors exposed by the runtime package. Callers are expected to
// match them with errors.Is rather than by comparing messages.
var (
	// ErrNotFound signals that the requested runtime record is absent from
	// the store.
	ErrNotFound = errors.New("runtime record not found")

	// ErrConflict signals that a runtime mutation was not applied because
	// the record changed concurrently or a compare-and-swap guard failed.
	ErrConflict = errors.New("runtime record conflict")

	// ErrInvalidTransition is the sentinel reported when Transition
	// rejects a `(from, to)` pair.
	ErrInvalidTransition = errors.New("invalid runtime status transition")
)
// InvalidTransitionError is the concrete error returned by Transition when
// a `(from, to)` status pair is rejected. It carries the rejected pair and
// unwraps to ErrInvalidTransition, so callers can either match the sentinel
// with errors.Is or extract the pair with errors.As.
type InvalidTransitionError struct {
// From is the status the transition was attempted from.
From Status
// To is the status the transition was attempted to.
To Status
}
// Error renders the rejected pair as a human-readable message, quoting
// both statuses so that empty or unusual values remain visible.
func (err *InvalidTransitionError) Error() string {
	const format = "invalid runtime status transition from %q to %q"
	return fmt.Sprintf(format, err.From, err.To)
}
// Unwrap returns the ErrInvalidTransition sentinel so that
// errors.Is(err, ErrInvalidTransition) succeeds for every
// *InvalidTransitionError, whatever pair it carries.
func (err *InvalidTransitionError) Unwrap() error {
return ErrInvalidTransition
}
+197
View File
@@ -0,0 +1,197 @@
// Package runtime defines the runtime-record domain model, status machine,
// and sentinel errors owned by Runtime Manager.
//
// The package mirrors the durable shape of the `runtime_records`
// PostgreSQL table (see
// `galaxy/rtmanager/internal/adapters/postgres/migrations/00001_init.sql`).
// Every status / transition / required-field rule already documented in
// `galaxy/rtmanager/README.md` lives here as code so adapter and service
// layers do not re-derive it.
package runtime
import (
"fmt"
"strings"
"time"
)
// Status identifies one runtime-record lifecycle state. The set of valid
// values is closed; use IsKnown to test membership and AllStatuses to
// enumerate it.
type Status string
const (
// StatusRunning reports that an engine container is live and bound to
// the record. The associated container id and image ref are non-empty
// and StartedAt is set.
StatusRunning Status = "running"
// StatusStopped reports that the engine container has exited (graceful
// stop, observed Docker exit, or reconciled exit). The container is
// still present in Docker until the cleanup worker removes it.
StatusStopped Status = "stopped"
// StatusRemoved reports that the container has been removed from
// Docker (admin cleanup or reconcile_dispose). The record stays in
// PostgreSQL for audit; there is no transition out of this state, which
// is why it is the only status for which IsTerminal returns true.
StatusRemoved Status = "removed"
)
// IsKnown reports whether status is one of the three values of the frozen
// runtime status vocabulary (running, stopped, removed). Any other value,
// including the empty string, yields false.
func (status Status) IsKnown() bool {
	return status == StatusRunning ||
		status == StatusStopped ||
		status == StatusRemoved
}
// IsTerminal reports whether status is a dead end of the lifecycle, i.e.
// no further transition can leave it. Only StatusRemoved qualifies.
func (status Status) IsTerminal() bool {
	switch status {
	case StatusRemoved:
		return true
	default:
		return false
	}
}
// AllStatuses returns every runtime status value in a fixed order
// (running, stopped, removed) that matches the README §Persistence Layout
// listing. A fresh slice is built on each call, so callers may modify the
// result freely.
func AllStatuses() []Status {
	statuses := make([]Status, 0, 3)
	statuses = append(statuses, StatusRunning, StatusStopped, StatusRemoved)
	return statuses
}
// RuntimeRecord stores one durable runtime record owned by Runtime
// Manager. It mirrors one row of the `runtime_records` table.
//
// CurrentContainerID and CurrentImageRef are stored as plain strings; an
// empty value represents SQL NULL and is bridged at the adapter layer.
// StartedAt, StoppedAt, and RemovedAt are *time.Time so a missing value
// is unambiguous and aligns with the jet-generated model.
//
// Call Validate to check a record against the per-status requirements
// noted on the fields below.
type RuntimeRecord struct {
// GameID identifies the platform game owning this runtime record.
// Validate rejects blank values.
GameID string
// Status stores the current lifecycle state. Validate rejects values
// outside the known status vocabulary.
Status Status
// CurrentContainerID identifies the bound Docker container. Empty
// when status is removed and after a reconciler observes
// disappearance. Validate requires it only for running records.
CurrentContainerID string
// CurrentImageRef stores the Docker reference of the currently-bound
// engine image. Non-empty when status is running or stopped; Validate
// enforces this for both statuses.
CurrentImageRef string
// EngineEndpoint stores the stable URL Game Master uses to reach the
// engine container, in `http://galaxy-game-{game_id}:8080` form.
// Validate only checks that it is non-blank, not its shape.
EngineEndpoint string
// StatePath stores the absolute host path of the bind-mounted engine
// state directory. Validate only checks that it is non-blank.
StatePath string
// DockerNetwork stores the Docker network the container was attached
// to at create time. Validate rejects blank values.
DockerNetwork string
// StartedAt stores the wall-clock at which the container became
// running. Non-nil when status is running or stopped.
// NOTE(review): Validate requires a non-nil StartedAt only for running
// records; stopped records with a nil StartedAt currently pass —
// confirm which of the two is intended.
StartedAt *time.Time
// StoppedAt stores the wall-clock at which the container exited.
// Non-nil when status is stopped or removed (when the record passed
// through stopped before removal). Validate requires it only for
// stopped records.
StoppedAt *time.Time
// RemovedAt stores the wall-clock at which the container was removed
// from Docker. Non-nil when status is removed.
RemovedAt *time.Time
// LastOpAt stores the wall-clock of the most recent operation
// affecting this record. Drives the cleanup TTL. Validate requires it
// to be non-zero and not before CreatedAt.
LastOpAt time.Time
// CreatedAt stores the wall-clock at which Runtime Manager first saw
// this game. Validate requires it to be non-zero.
CreatedAt time.Time
}
// Validate reports whether record satisfies the runtime-record invariants
// implied by README §Lifecycles and the SQL CHECK on `runtime_records`.
//
// Checks run in a fixed order and the first violation is returned:
//
//  1. GameID, EngineEndpoint, StatePath and DockerNetwork must be
//     non-blank and Status must be a known value.
//  2. LastOpAt and CreatedAt must be non-zero, with LastOpAt not before
//     CreatedAt.
//  3. Status-specific required fields must be present: running needs a
//     container id, an image ref and StartedAt; stopped needs an image ref
//     and StoppedAt; removed needs RemovedAt.
//  4. Every optional timestamp that is present must be non-zero,
//     whatever the status.
//  5. StartedAt and RemovedAt must not precede CreatedAt, and StoppedAt
//     must not precede StartedAt when both are present.
//
// It returns nil when every check passes and a descriptive, non-sentinel
// error otherwise.
//
// NOTE(review): the StartedAt field doc says it is non-nil for stopped
// records too, but that is not enforced here — confirm against the SQL
// CHECK before tightening.
func (record RuntimeRecord) Validate() error {
	if strings.TrimSpace(record.GameID) == "" {
		return fmt.Errorf("game id must not be empty")
	}
	if !record.Status.IsKnown() {
		return fmt.Errorf("status %q is unsupported", record.Status)
	}
	if strings.TrimSpace(record.EngineEndpoint) == "" {
		return fmt.Errorf("engine endpoint must not be empty")
	}
	if strings.TrimSpace(record.StatePath) == "" {
		return fmt.Errorf("state path must not be empty")
	}
	if strings.TrimSpace(record.DockerNetwork) == "" {
		return fmt.Errorf("docker network must not be empty")
	}
	if record.LastOpAt.IsZero() {
		return fmt.Errorf("last op at must not be zero")
	}
	if record.CreatedAt.IsZero() {
		return fmt.Errorf("created at must not be zero")
	}
	if record.LastOpAt.Before(record.CreatedAt) {
		return fmt.Errorf("last op at must not be before created at")
	}

	// Presence requirements that depend on the lifecycle state.
	switch record.Status {
	case StatusRunning:
		if strings.TrimSpace(record.CurrentContainerID) == "" {
			return fmt.Errorf("current container id must not be empty for running records")
		}
		if strings.TrimSpace(record.CurrentImageRef) == "" {
			return fmt.Errorf("current image ref must not be empty for running records")
		}
		if record.StartedAt == nil {
			return fmt.Errorf("started at must not be nil for running records")
		}
	case StatusStopped:
		if strings.TrimSpace(record.CurrentImageRef) == "" {
			return fmt.Errorf("current image ref must not be empty for stopped records")
		}
		if record.StoppedAt == nil {
			return fmt.Errorf("stopped at must not be nil for stopped records")
		}
	case StatusRemoved:
		if record.RemovedAt == nil {
			return fmt.Errorf("removed at must not be nil for removed records")
		}
	}

	// A present timestamp must carry a real instant no matter which status
	// the record is in. Checking this only for the timestamp the status
	// requires would let, for example, a zero StoppedAt slip through on a
	// removed record whose StartedAt is nil.
	if record.StartedAt != nil && record.StartedAt.IsZero() {
		return fmt.Errorf("started at must not be zero when present")
	}
	if record.StoppedAt != nil && record.StoppedAt.IsZero() {
		return fmt.Errorf("stopped at must not be zero when present")
	}
	if record.RemovedAt != nil && record.RemovedAt.IsZero() {
		return fmt.Errorf("removed at must not be zero when present")
	}

	// Chronological ordering between the timestamps that are present.
	if record.StartedAt != nil && record.StartedAt.Before(record.CreatedAt) {
		return fmt.Errorf("started at must not be before created at")
	}
	if record.StoppedAt != nil && record.StartedAt != nil && record.StoppedAt.Before(*record.StartedAt) {
		return fmt.Errorf("stopped at must not be before started at")
	}
	if record.RemovedAt != nil && record.RemovedAt.Before(record.CreatedAt) {
		return fmt.Errorf("removed at must not be before created at")
	}
	return nil
}
@@ -0,0 +1,156 @@
package runtime
import (
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestStatusIsKnown checks that every value from AllStatuses is known and
// that the empty and an arbitrary string are not.
func TestStatusIsKnown(t *testing.T) {
	for _, known := range AllStatuses() {
		assert.Truef(t, known.IsKnown(), "expected %q known", known)
	}

	for _, unknown := range []Status{"", "unknown"} {
		assert.False(t, unknown.IsKnown())
	}
}
func TestStatusIsTerminal(t *testing.T) {
assert.True(t, StatusRemoved.IsTerminal())
for _, status := range []Status{StatusRunning, StatusStopped} {
assert.Falsef(t, status.IsTerminal(), "expected %q non-terminal", status)
}
}
// TestAllStatuses pins both the content and the order of AllStatuses, and
// verifies that each call hands out an independent slice.
func TestAllStatuses(t *testing.T) {
	statuses := AllStatuses()
	// AllStatuses documents a stable order, so compare with Equal rather
	// than the order-insensitive ElementsMatch.
	assert.Equal(t,
		[]Status{StatusRunning, StatusStopped, StatusRemoved},
		statuses,
	)

	// Tampering with one result must not leak into the next call.
	statuses[0] = "tampered"
	assert.Equal(t, StatusRunning, AllStatuses()[0],
		"AllStatuses must return an independent slice")
}
// runningRecord builds the baseline fixture used by the Validate tests: a
// running record created at a fixed UTC instant and started one second
// later, with LastOpAt equal to the start time.
func runningRecord() RuntimeRecord {
	createdAt := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)
	startedAt := createdAt.Add(time.Second)

	var record RuntimeRecord
	record.GameID = "game-test"
	record.Status = StatusRunning
	record.CurrentContainerID = "container-1"
	record.CurrentImageRef = "galaxy/game:1.0.0"
	record.EngineEndpoint = "http://galaxy-game-game-test:8080"
	record.StatePath = "/var/lib/galaxy/games/game-test"
	record.DockerNetwork = "galaxy-net"
	record.StartedAt = &startedAt
	record.LastOpAt = startedAt
	record.CreatedAt = createdAt
	return record
}
func TestRuntimeRecordValidateRunningHappy(t *testing.T) {
require.NoError(t, runningRecord().Validate())
}
// TestRuntimeRecordValidateStoppedHappy checks that a record stopped one
// minute after it started validates cleanly.
func TestRuntimeRecordValidateStoppedHappy(t *testing.T) {
	record := runningRecord()
	stoppedAt := record.StartedAt.Add(time.Minute)

	record.Status, record.StoppedAt, record.LastOpAt = StatusStopped, &stoppedAt, stoppedAt

	err := record.Validate()
	require.NoError(t, err)
}
// TestRuntimeRecordValidateRemovedHappy checks that a record which was
// stopped, then removed a minute later with its container id cleared,
// validates cleanly.
func TestRuntimeRecordValidateRemovedHappy(t *testing.T) {
	record := runningRecord()
	stoppedAt := record.StartedAt.Add(time.Minute)
	removedAt := stoppedAt.Add(time.Minute)

	record.Status = StatusRemoved
	record.CurrentContainerID = ""
	record.StoppedAt, record.RemovedAt = &stoppedAt, &removedAt
	record.LastOpAt = removedAt

	err := record.Validate()
	require.NoError(t, err)
}
// TestRuntimeRecordValidateRejects breaks the running fixture in one way
// per subtest and expects Validate to report an error each time.
func TestRuntimeRecordValidateRejects(t *testing.T) {
	type rejection struct {
		label   string
		breakIt func(record *RuntimeRecord)
	}

	rejections := []rejection{
		{label: "empty game id", breakIt: func(record *RuntimeRecord) { record.GameID = "" }},
		{label: "unknown status", breakIt: func(record *RuntimeRecord) { record.Status = "exotic" }},
		{label: "empty engine endpoint", breakIt: func(record *RuntimeRecord) { record.EngineEndpoint = "" }},
		{label: "empty state path", breakIt: func(record *RuntimeRecord) { record.StatePath = "" }},
		{label: "empty docker network", breakIt: func(record *RuntimeRecord) { record.DockerNetwork = "" }},
		{label: "zero last op at", breakIt: func(record *RuntimeRecord) { record.LastOpAt = time.Time{} }},
		{label: "zero created at", breakIt: func(record *RuntimeRecord) { record.CreatedAt = time.Time{} }},
		{label: "last op at before created at", breakIt: func(record *RuntimeRecord) {
			record.LastOpAt = record.CreatedAt.Add(-time.Second)
		}},
		{label: "running without container id", breakIt: func(record *RuntimeRecord) {
			record.CurrentContainerID = ""
		}},
		{label: "running without image ref", breakIt: func(record *RuntimeRecord) {
			record.CurrentImageRef = ""
		}},
		{label: "running without started at", breakIt: func(record *RuntimeRecord) {
			record.StartedAt = nil
		}},
		{label: "started at before created at", breakIt: func(record *RuntimeRecord) {
			tooEarly := record.CreatedAt.Add(-time.Second)
			record.StartedAt = &tooEarly
		}},
	}

	for _, rejected := range rejections {
		t.Run(rejected.label, func(t *testing.T) {
			broken := runningRecord()
			rejected.breakIt(&broken)
			assert.Error(t, broken.Validate())
		})
	}
}
// TestRuntimeRecordValidateRejectsStoppedWithoutStoppedAt checks that a
// stopped record with a nil StoppedAt is rejected.
func TestRuntimeRecordValidateRejectsStoppedWithoutStoppedAt(t *testing.T) {
	stoppedRecord := runningRecord()
	stoppedRecord.Status, stoppedRecord.StoppedAt = StatusStopped, nil

	err := stoppedRecord.Validate()
	assert.Error(t, err)
}
// TestRuntimeRecordValidateRejectsStoppedBeforeStarted checks that a
// StoppedAt one second earlier than StartedAt is rejected.
func TestRuntimeRecordValidateRejectsStoppedBeforeStarted(t *testing.T) {
	stoppedRecord := runningRecord()
	tooEarly := stoppedRecord.StartedAt.Add(-time.Second)
	stoppedRecord.Status, stoppedRecord.StoppedAt = StatusStopped, &tooEarly

	err := stoppedRecord.Validate()
	assert.Error(t, err)
}
// TestRuntimeRecordValidateRejectsRemovedWithoutRemovedAt checks that a
// removed record with a nil RemovedAt is rejected.
func TestRuntimeRecordValidateRejectsRemovedWithoutRemovedAt(t *testing.T) {
	removedRecord := runningRecord()
	removedRecord.Status, removedRecord.RemovedAt = StatusRemoved, nil

	err := removedRecord.Validate()
	assert.Error(t, err)
}
// TestRuntimeRecordValidateRejectsRemovedBeforeCreated checks that a
// RemovedAt one second earlier than CreatedAt is rejected.
func TestRuntimeRecordValidateRejectsRemovedBeforeCreated(t *testing.T) {
	removedRecord := runningRecord()
	tooEarly := removedRecord.CreatedAt.Add(-time.Second)
	removedRecord.Status, removedRecord.RemovedAt = StatusRemoved, &tooEarly

	err := removedRecord.Validate()
	assert.Error(t, err)
}
@@ -0,0 +1,51 @@
package runtime
// transitionKey identifies one directed `(from, to)` status pair. It is
// the key type of the allowedTransitions table, so a pair is permitted
// exactly when its key is present there.
type transitionKey struct {
// from is the status being left.
from Status
// to is the status being entered.
to Status
}
// allowedTransitions is the set of permitted `(from, to)` status pairs;
// any pair missing from it is rejected by Transition. Its four entries
// mirror the lifecycle flows frozen in
// `galaxy/rtmanager/README.md §Lifecycles`:
//
//   - running → stopped: graceful stop, observed Docker exit, or
//     reconcile observing an exited container.
//   - running → removed: reconcile_dispose when Docker no longer reports
//     the container at all.
//   - stopped → running: restart and patch inner start steps.
//   - stopped → removed: cleanup_container, both the periodic TTL worker
//     and the admin DELETE endpoint.
var allowedTransitions = map[transitionKey]struct{}{
	{from: StatusRunning, to: StatusStopped}: {},
	{from: StatusRunning, to: StatusRemoved}: {},
	{from: StatusStopped, to: StatusRunning}: {},
	{from: StatusStopped, to: StatusRemoved}: {},
}
// AllowedTransitions returns a copy of the allowed-transitions table used
// by Transition, grouped by source status: each key maps to the statuses
// it may transition to. Statuses with no outgoing transition have no key.
//
// The map and its slices are freshly built on every call, so the result
// is safe to mutate. Destination slices follow the AllStatuses order and
// are therefore identical across calls; iteration order over the map
// itself remains unspecified.
func AllowedTransitions() map[Status][]Status {
	statuses := AllStatuses()
	result := make(map[Status][]Status, len(statuses))
	// Walk the status vocabulary instead of ranging over the table: map
	// iteration order is random and would shuffle the destination slices
	// from one call to the next.
	for _, from := range statuses {
		for _, to := range statuses {
			if _, ok := allowedTransitions[transitionKey{from: from, to: to}]; ok {
				result[from] = append(result[from], to)
			}
		}
	}
	return result
}
// Transition checks whether a record may move from status from to status
// next. It returns nil for a permitted pair; for anything else, including
// pairs that involve an unknown status, it returns an
// *InvalidTransitionError carrying the pair and wrapping
// ErrInvalidTransition. The check only consults the in-memory transition
// table, so it touches no store and may be called from any layer.
func Transition(from Status, next Status) error {
	pair := transitionKey{from: from, to: next}
	_, permitted := allowedTransitions[pair]
	if permitted && from.IsKnown() && next.IsKnown() {
		return nil
	}
	return &InvalidTransitionError{From: from, To: next}
}
@@ -0,0 +1,88 @@
package runtime
import (
"errors"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestTransitionAllowed checks that each of the four permitted pairs is
// accepted by Transition.
func TestTransitionAllowed(t *testing.T) {
	permitted := [][2]Status{
		{StatusRunning, StatusStopped},
		{StatusRunning, StatusRemoved},
		{StatusStopped, StatusRunning},
		{StatusStopped, StatusRemoved},
	}

	for _, pair := range permitted {
		from, to := pair[0], pair[1]
		err := Transition(from, to)
		assert.NoErrorf(t, err, "expected %q -> %q allowed", from, to)
	}
}
// TestTransitionRejected checks that terminal, self, and unknown-status
// pairs are rejected, and that each error both matches
// ErrInvalidTransition and exposes the rejected pair through
// *InvalidTransitionError.
func TestTransitionRejected(t *testing.T) {
	rejected := [][2]Status{
		{StatusRemoved, StatusRunning},
		{StatusRemoved, StatusStopped},
		{StatusRemoved, StatusRemoved},
		{StatusRunning, StatusRunning},
		{StatusStopped, StatusStopped},
		{"unknown", StatusRunning},
		{StatusRunning, "unknown"},
		{"", ""},
	}

	for _, pair := range rejected {
		from, to := pair[0], pair[1]

		err := Transition(from, to)
		require.Errorf(t, err, "expected %q -> %q rejected", from, to)
		assert.ErrorIs(t, err, ErrInvalidTransition)

		var typed *InvalidTransitionError
		require.True(t, errors.As(err, &typed),
			"expected *InvalidTransitionError for %q -> %q", from, to)
		assert.Equal(t, from, typed.From)
		assert.Equal(t, to, typed.To)
	}
}
// TestAllowedTransitionsReturnsCopy wipes every entry of one result and
// checks that a second call is unaffected.
func TestAllowedTransitionsReturnsCopy(t *testing.T) {
	tampered := AllowedTransitions()
	require.NotEmpty(t, tampered)
	for source := range tampered {
		tampered[source] = nil
	}

	fresh := AllowedTransitions()
	assert.NotEmpty(t, fresh[StatusRunning],
		"AllowedTransitions must return an independent map per call")
}
// TestAllowedTransitionsCoversFourPairs checks the destinations reported
// for running and stopped, and that removed has none.
func TestAllowedTransitionsCoversFourPairs(t *testing.T) {
	table := AllowedTransitions()

	fromRunning := []Status{StatusStopped, StatusRemoved}
	assert.ElementsMatch(t, fromRunning, table[StatusRunning])

	fromStopped := []Status{StatusRunning, StatusRemoved}
	assert.ElementsMatch(t, fromStopped, table[StatusStopped])

	assert.Empty(t, table[StatusRemoved],
		"removed has no outgoing transitions")
}
// TestInvalidTransitionErrorMessage checks that the error text mentions
// both statuses of the rejected pair.
func TestInvalidTransitionErrorMessage(t *testing.T) {
	rejection := &InvalidTransitionError{From: StatusRunning, To: Status("bogus")}
	message := rejection.Error()

	for _, fragment := range []string{"running", "bogus"} {
		assert.Contains(t, message, fragment)
	}
}