feat: runtime manager

2026-04-28 20:39:18 +02:00
parent e0a99b346b
commit a7cee15115
289 changed files with 45660 additions and 2207 deletions
@@ -0,0 +1,43 @@
+package runtime
+
+import (
+	"errors"
+	"fmt"
+)
+
+// Sentinel errors exposed by the runtime package. Callers compare against
+// them with errors.Is.
+var (
+	// ErrNotFound reports that a runtime record was requested but does
+	// not exist in the store.
+	ErrNotFound = errors.New("runtime record not found")
+
+	// ErrConflict reports that a runtime mutation could not be applied
+	// because the record changed concurrently or failed a
+	// compare-and-swap guard.
+	ErrConflict = errors.New("runtime record conflict")
+
+	// ErrInvalidTransition is the sentinel returned when Transition
+	// rejects a `(from, to)` pair.
+	ErrInvalidTransition = errors.New("invalid runtime status transition")
+)
+
+// InvalidTransitionError carries the rejected `(from, to)` pair. Its Unwrap
+// method returns ErrInvalidTransition so callers can match it with errors.Is.
+type InvalidTransitionError struct {
+	// From is the status the rejected transition tried to leave.
+	From Status
+
+	// To is the status the rejected transition tried to enter.
+	To Status
+}
+
+// Error reports a human-readable summary of the rejected pair.
+func (err *InvalidTransitionError) Error() string {
+	return fmt.Sprintf(
+		"invalid runtime status transition from %q to %q",
+		err.From, err.To,
+	)
+}
+
+// Unwrap always returns the ErrInvalidTransition sentinel, which is what
+// lets errors.Is(err, ErrInvalidTransition) succeed for this error type.
+func (err *InvalidTransitionError) Unwrap() error {
+	return ErrInvalidTransition
+}
@@ -0,0 +1,197 @@
+// Package runtime defines the runtime-record domain model, status machine,
+// and sentinel errors owned by Runtime Manager.
+//
+// The package mirrors the durable shape of the `runtime_records`
+// PostgreSQL table (see
+// `galaxy/rtmanager/internal/adapters/postgres/migrations/00001_init.sql`).
+// Every status / transition / required-field rule already documented in
+// `galaxy/rtmanager/README.md` lives here as code so adapter and service
+// layers do not re-derive it.
+package runtime
+
+import (
+	"fmt"
+	"strings"
+	"time"
+)
+
+// Status names one lifecycle state of a runtime record as a plain string.
+type Status string
+
+const (
+	// StatusRunning reports that an engine container is live and bound to
+	// the record. Validate requires a non-empty container id and image
+	// ref and a non-nil, non-zero StartedAt for such records.
+	StatusRunning Status = "running"
+
+	// StatusStopped reports that the engine container has exited (graceful
+	// stop, observed Docker exit, or reconciled exit). The container is
+	// still present in Docker until the cleanup worker removes it.
+	StatusStopped Status = "stopped"
+
+	// StatusRemoved reports that the container has been removed from
+	// Docker (admin cleanup or reconcile_dispose). The record stays in
+	// PostgreSQL for audit; IsTerminal reports true for this status.
+	StatusRemoved Status = "removed"
+)
+
+// IsKnown reports whether status is one of the three values of the frozen
+// runtime status vocabulary: running, stopped, or removed.
+func (status Status) IsKnown() bool {
+	return status == StatusRunning ||
+		status == StatusStopped ||
+		status == StatusRemoved
+}
+
+// IsTerminal reports whether status can no longer accept lifecycle
+// transitions. Only StatusRemoved is terminal.
+func (status Status) IsTerminal() bool {
+	switch status {
+	case StatusRemoved:
+		return true
+	default:
+		return false
+	}
+}
+
+// AllStatuses returns every runtime status value as a new slice on each
+// call, so callers may modify the result freely. The order is fixed
+// (running, stopped, removed) and matches the README §Persistence Layout
+// listing.
+func AllStatuses() []Status {
+	// A local array gives every call its own backing storage.
+	ordered := [...]Status{StatusRunning, StatusStopped, StatusRemoved}
+	return ordered[:]
+}
+
+// RuntimeRecord stores one durable runtime record owned by Runtime
+// Manager. It mirrors one row of the `runtime_records` table.
+//
+// CurrentContainerID and CurrentImageRef are stored as plain strings; an
+// empty value represents SQL NULL and is bridged at the adapter layer.
+// StartedAt, StoppedAt, and RemovedAt are *time.Time so a missing value
+// is unambiguous and aligns with the jet-generated model.
+type RuntimeRecord struct {
+	// GameID identifies the platform game owning this runtime record.
+	GameID string
+
+	// Status stores the current lifecycle state.
+	Status Status
+
+	// CurrentContainerID identifies the bound Docker container. Empty
+	// when status is removed and after a reconciler observes
+	// disappearance. Validate requires it only for running records.
+	CurrentContainerID string
+
+	// CurrentImageRef stores the Docker reference of the currently-bound
+	// engine image. Non-empty when status is running or stopped.
+	CurrentImageRef string
+
+	// EngineEndpoint stores the stable URL Game Master uses to reach the
+	// engine container, in `http://galaxy-game-{game_id}:8080` form.
+	EngineEndpoint string
+
+	// StatePath stores the absolute host path of the bind-mounted engine
+	// state directory.
+	StatePath string
+
+	// DockerNetwork stores the Docker network the container was attached
+	// to at create time.
+	DockerNetwork string
+
+	// StartedAt stores when the container became running. Expected non-nil
+	// for running or stopped records; Validate enforces it for running only.
+	StartedAt *time.Time
+
+	// StoppedAt stores the wall-clock at which the container exited.
+	// Non-nil when status is stopped or removed (when the record passed
+	// through stopped before removal).
+	StoppedAt *time.Time
+
+	// RemovedAt stores the wall-clock at which the container was removed
+	// from Docker. Non-nil when status is removed.
+	RemovedAt *time.Time
+
+	// LastOpAt stores the wall-clock of the most recent operation
+	// affecting this record. Drives the cleanup TTL.
+	LastOpAt time.Time
+
+	// CreatedAt stores the wall-clock at which Runtime Manager first saw
+	// this game.
+	CreatedAt time.Time
+}
+
+// Validate checks record against the runtime-record invariants implied by
+// README §Lifecycles and the SQL CHECK on `runtime_records`. It returns nil
+// when every rule holds and otherwise an error describing the first
+// violated rule.
+//
+// The rules run in three ordered groups: status-independent required
+// fields, the fields required by the specific status, and the relative
+// ordering of the timestamps.
+func (record RuntimeRecord) Validate() error {
+	blank := func(value string) bool {
+		return strings.TrimSpace(value) == ""
+	}
+
+	// Required regardless of status. Cases are evaluated top to bottom,
+	// so the first violated rule is the one reported.
+	switch {
+	case blank(record.GameID):
+		return fmt.Errorf("game id must not be empty")
+	case !record.Status.IsKnown():
+		return fmt.Errorf("status %q is unsupported", record.Status)
+	case blank(record.EngineEndpoint):
+		return fmt.Errorf("engine endpoint must not be empty")
+	case blank(record.StatePath):
+		return fmt.Errorf("state path must not be empty")
+	case blank(record.DockerNetwork):
+		return fmt.Errorf("docker network must not be empty")
+	case record.LastOpAt.IsZero():
+		return fmt.Errorf("last op at must not be zero")
+	case record.CreatedAt.IsZero():
+		return fmt.Errorf("created at must not be zero")
+	case record.LastOpAt.Before(record.CreatedAt):
+		return fmt.Errorf("last op at must not be before created at")
+	}
+
+	// Required by the specific status. Each nil check precedes the
+	// matching IsZero call so the pointer is never dereferenced while nil.
+	switch record.Status {
+	case StatusRunning:
+		switch {
+		case blank(record.CurrentContainerID):
+			return fmt.Errorf("current container id must not be empty for running records")
+		case blank(record.CurrentImageRef):
+			return fmt.Errorf("current image ref must not be empty for running records")
+		case record.StartedAt == nil:
+			return fmt.Errorf("started at must not be nil for running records")
+		case record.StartedAt.IsZero():
+			return fmt.Errorf("started at must not be zero when present")
+		}
+
+	case StatusStopped:
+		switch {
+		case blank(record.CurrentImageRef):
+			return fmt.Errorf("current image ref must not be empty for stopped records")
+		case record.StoppedAt == nil:
+			return fmt.Errorf("stopped at must not be nil for stopped records")
+		case record.StoppedAt.IsZero():
+			return fmt.Errorf("stopped at must not be zero when present")
+		}
+
+	case StatusRemoved:
+		switch {
+		case record.RemovedAt == nil:
+			return fmt.Errorf("removed at must not be nil for removed records")
+		case record.RemovedAt.IsZero():
+			return fmt.Errorf("removed at must not be zero when present")
+		}
+	}
+
+	// Relative ordering of whichever optional timestamps are present.
+	switch {
+	case record.StartedAt != nil && record.StartedAt.Before(record.CreatedAt):
+		return fmt.Errorf("started at must not be before created at")
+	case record.StoppedAt != nil && record.StartedAt != nil && record.StoppedAt.Before(*record.StartedAt):
+		return fmt.Errorf("stopped at must not be before started at")
+	case record.RemovedAt != nil && record.RemovedAt.Before(record.CreatedAt):
+		return fmt.Errorf("removed at must not be before created at")
+	}
+
+	return nil
+}
@@ -0,0 +1,156 @@
+package runtime
+
+import (
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestStatusIsKnown checks that every value from AllStatuses is known and
+// that the empty and an arbitrary string are not.
+func TestStatusIsKnown(t *testing.T) {
+	for _, known := range AllStatuses() {
+		assert.Truef(t, known.IsKnown(), "expected %q known", known)
+	}
+
+	for _, unknown := range []Status{"", "unknown"} {
+		assert.False(t, unknown.IsKnown())
+	}
+}
+
+// TestStatusIsTerminal checks that removed is the only terminal status.
+func TestStatusIsTerminal(t *testing.T) {
+	nonTerminal := []Status{StatusRunning, StatusStopped}
+	for _, candidate := range nonTerminal {
+		assert.Falsef(t, candidate.IsTerminal(), "expected %q non-terminal", candidate)
+	}
+
+	assert.True(t, StatusRemoved.IsTerminal())
+}
+
+// TestAllStatuses pins both halves of the AllStatuses contract: the exact,
+// stable order of the returned values and the independence of the slice
+// returned by each call.
+func TestAllStatuses(t *testing.T) {
+	statuses := AllStatuses()
+
+	// AllStatuses documents a stable order, so compare order-sensitively
+	// rather than as an unordered set.
+	assert.Equal(t,
+		[]Status{StatusRunning, StatusStopped, StatusRemoved},
+		statuses,
+	)
+
+	// Mutating one result must not leak into a later call.
+	statuses[0] = "tampered"
+	assert.Equal(t, StatusRunning, AllStatuses()[0],
+		"AllStatuses must return an independent slice")
+}
+
+func runningRecord() RuntimeRecord {
+	created := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)
+	started := created.Add(time.Second)
+	return RuntimeRecord{
+		GameID:             "game-test",
+		Status:             StatusRunning,
+		CurrentContainerID: "container-1",
+		CurrentImageRef:    "galaxy/game:1.0.0",
+		EngineEndpoint:     "http://galaxy-game-game-test:8080",
+		StatePath:          "/var/lib/galaxy/games/game-test",
+		DockerNetwork:      "galaxy-net",
+		StartedAt:          &started,
+		LastOpAt:           started,
+		CreatedAt:          created,
+	}
+}
+
+// TestRuntimeRecordValidateRunningHappy checks that the running fixture
+// passes validation unchanged.
+func TestRuntimeRecordValidateRunningHappy(t *testing.T) {
+	fixture := runningRecord()
+
+	require.NoError(t, fixture.Validate())
+}
+
+// TestRuntimeRecordValidateStoppedHappy checks that a record stopped one
+// minute after it started passes validation.
+func TestRuntimeRecordValidateStoppedHappy(t *testing.T) {
+	stoppedRecord := runningRecord()
+	stoppedAt := stoppedRecord.StartedAt.Add(time.Minute)
+
+	stoppedRecord.Status = StatusStopped
+	stoppedRecord.StoppedAt = &stoppedAt
+	stoppedRecord.LastOpAt = stoppedAt
+
+	err := stoppedRecord.Validate()
+	require.NoError(t, err)
+}
+
+// TestRuntimeRecordValidateRemovedHappy checks that a record which was
+// stopped, then removed a minute later, with its container id cleared,
+// passes validation.
+func TestRuntimeRecordValidateRemovedHappy(t *testing.T) {
+	removedRecord := runningRecord()
+	stoppedAt := removedRecord.StartedAt.Add(time.Minute)
+	removedAt := stoppedAt.Add(time.Minute)
+
+	removedRecord.Status = StatusRemoved
+	removedRecord.CurrentContainerID = ""
+	removedRecord.StoppedAt = &stoppedAt
+	removedRecord.RemovedAt = &removedAt
+	removedRecord.LastOpAt = removedAt
+
+	err := removedRecord.Validate()
+	require.NoError(t, err)
+}
+
+func TestRuntimeRecordValidateRejects(t *testing.T) {
+	tests := []struct {
+		name   string
+		mutate func(*RuntimeRecord)
+	}{
+		{"empty game id", func(r *RuntimeRecord) { r.GameID = "" }},
+		{"unknown status", func(r *RuntimeRecord) { r.Status = "exotic" }},
+		{"empty engine endpoint", func(r *RuntimeRecord) { r.EngineEndpoint = "" }},
+		{"empty state path", func(r *RuntimeRecord) { r.StatePath = "" }},
+		{"empty docker network", func(r *RuntimeRecord) { r.DockerNetwork = "" }},
+		{"zero last op at", func(r *RuntimeRecord) { r.LastOpAt = time.Time{} }},
+		{"zero created at", func(r *RuntimeRecord) { r.CreatedAt = time.Time{} }},
+		{"last op at before created at", func(r *RuntimeRecord) {
+			r.LastOpAt = r.CreatedAt.Add(-time.Second)
+		}},
+		{"running without container id", func(r *RuntimeRecord) {
+			r.CurrentContainerID = ""
+		}},
+		{"running without image ref", func(r *RuntimeRecord) {
+			r.CurrentImageRef = ""
+		}},
+		{"running without started at", func(r *RuntimeRecord) {
+			r.StartedAt = nil
+		}},
+		{"started at before created at", func(r *RuntimeRecord) {
+			before := r.CreatedAt.Add(-time.Second)
+			r.StartedAt = &before
+		}},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			record := runningRecord()
+			tt.mutate(&record)
+			assert.Error(t, record.Validate())
+		})
+	}
+}
+
+// TestRuntimeRecordValidateRejectsStoppedWithoutStoppedAt checks that a
+// stopped record with a nil StoppedAt fails validation.
+func TestRuntimeRecordValidateRejectsStoppedWithoutStoppedAt(t *testing.T) {
+	stoppedRecord := runningRecord()
+	stoppedRecord.StoppedAt = nil
+	stoppedRecord.Status = StatusStopped
+
+	err := stoppedRecord.Validate()
+	assert.Error(t, err)
+}
+
+// TestRuntimeRecordValidateRejectsStoppedBeforeStarted checks that a
+// StoppedAt earlier than StartedAt fails validation.
+func TestRuntimeRecordValidateRejectsStoppedBeforeStarted(t *testing.T) {
+	stoppedRecord := runningRecord()
+	tooEarly := stoppedRecord.StartedAt.Add(-time.Second)
+
+	stoppedRecord.StoppedAt = &tooEarly
+	stoppedRecord.Status = StatusStopped
+
+	err := stoppedRecord.Validate()
+	assert.Error(t, err)
+}
+
+// TestRuntimeRecordValidateRejectsRemovedWithoutRemovedAt checks that a
+// removed record with a nil RemovedAt fails validation.
+func TestRuntimeRecordValidateRejectsRemovedWithoutRemovedAt(t *testing.T) {
+	removedRecord := runningRecord()
+	removedRecord.RemovedAt = nil
+	removedRecord.Status = StatusRemoved
+
+	err := removedRecord.Validate()
+	assert.Error(t, err)
+}
+
+// TestRuntimeRecordValidateRejectsRemovedBeforeCreated checks that a
+// RemovedAt earlier than CreatedAt fails validation.
+func TestRuntimeRecordValidateRejectsRemovedBeforeCreated(t *testing.T) {
+	removedRecord := runningRecord()
+	tooEarly := removedRecord.CreatedAt.Add(-time.Second)
+
+	removedRecord.RemovedAt = &tooEarly
+	removedRecord.Status = StatusRemoved
+
+	err := removedRecord.Validate()
+	assert.Error(t, err)
+}
@@ -0,0 +1,51 @@
+package runtime
+
+// transitionKey is the comparable `(from, to)` pair used as the map key
+// of the allowed-transitions table.
+type transitionKey struct {
+	from Status
+	to   Status
+}
+
+// allowedTransitions is the set of permitted `(from, to)` status pairs;
+// a pair is allowed exactly when it is present as a key. The four pairs
+// mirror the lifecycle flows frozen in
+// `galaxy/rtmanager/README.md §Lifecycles`:
+//
+//   - running → stopped: graceful stop, observed Docker exit, or
+//     reconcile observing an exited container.
+//   - running → removed: reconcile_dispose when Docker no longer reports
+//     the container at all.
+//   - stopped → running: restart and patch inner start steps.
+//   - stopped → removed: cleanup_container, both the periodic TTL worker
+//     and the admin DELETE endpoint.
+var allowedTransitions = map[transitionKey]struct{}{
+	{from: StatusRunning, to: StatusStopped}: {},
+	{from: StatusRunning, to: StatusRemoved}: {},
+	{from: StatusStopped, to: StatusRunning}: {},
+	{from: StatusStopped, to: StatusRemoved}: {},
+}
+
+// AllowedTransitions returns a freshly built copy of the allowed
+// `(from, to)` transitions table used by Transition, grouped by source
+// status. The returned map and its slices are safe to mutate.
+//
+// Sources and destinations are visited in AllStatuses order, so the
+// destinations of each source come back in the same order on every call
+// rather than following Go's unspecified map iteration order. Callers
+// should still not rely on the iteration order of the returned map itself.
+// A status with no outgoing transition has no entry. A pair involving a
+// status outside AllStatuses would be omitted.
+func AllowedTransitions() map[Status][]Status {
+	statuses := AllStatuses()
+	result := make(map[Status][]Status, len(statuses))
+	for _, from := range statuses {
+		for _, to := range statuses {
+			if _, ok := allowedTransitions[transitionKey{from: from, to: to}]; ok {
+				result[from] = append(result[from], to)
+			}
+		}
+	}
+	return result
+}
+
+// Transition reports whether a record in status from may move to status
+// next. It returns nil when both statuses are known and the pair is in the
+// allowed-transitions table; otherwise it returns an
+// *InvalidTransitionError carrying the pair, which wraps
+// ErrInvalidTransition. It only reads the package-level table and does not
+// touch any store.
+func Transition(from Status, next Status) error {
+	pair := transitionKey{from: from, to: next}
+	_, allowed := allowedTransitions[pair]
+
+	if allowed && from.IsKnown() && next.IsKnown() {
+		return nil
+	}
+	return &InvalidTransitionError{From: from, To: next}
+}
@@ -0,0 +1,88 @@
+package runtime
+
+import (
+	"errors"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestTransitionAllowed(t *testing.T) {
+	cases := []struct {
+		from Status
+		to   Status
+	}{
+		{StatusRunning, StatusStopped},
+		{StatusRunning, StatusRemoved},
+		{StatusStopped, StatusRunning},
+		{StatusStopped, StatusRemoved},
+	}
+
+	for _, tc := range cases {
+		assert.NoErrorf(t, Transition(tc.from, tc.to),
+			"expected %q -> %q allowed", tc.from, tc.to)
+	}
+}
+
+// TestTransitionRejected checks that forbidden, self, and unknown-status
+// pairs are rejected with an *InvalidTransitionError that wraps
+// ErrInvalidTransition and carries the offending pair.
+//
+// Each pair runs as its own subtest so a failing require aborts only that
+// pair and the remaining pairs are still exercised and reported.
+func TestTransitionRejected(t *testing.T) {
+	cases := []struct {
+		from Status
+		to   Status
+	}{
+		{StatusRemoved, StatusRunning},
+		{StatusRemoved, StatusStopped},
+		{StatusRemoved, StatusRemoved},
+		{StatusRunning, StatusRunning},
+		{StatusStopped, StatusStopped},
+		{Status("unknown"), StatusRunning},
+		{StatusRunning, Status("unknown")},
+		{Status(""), Status("")},
+	}
+
+	for _, tc := range cases {
+		name := string(tc.from) + "->" + string(tc.to)
+		t.Run(name, func(t *testing.T) {
+			err := Transition(tc.from, tc.to)
+			require.Errorf(t, err, "expected %q -> %q rejected", tc.from, tc.to)
+			assert.ErrorIs(t, err, ErrInvalidTransition)
+
+			var transitionErr *InvalidTransitionError
+			require.True(t, errors.As(err, &transitionErr),
+				"expected *InvalidTransitionError for %q -> %q", tc.from, tc.to)
+			assert.Equal(t, tc.from, transitionErr.From)
+			assert.Equal(t, tc.to, transitionErr.To)
+		})
+	}
+}
+
+// TestAllowedTransitionsReturnsCopy checks that wiping the entries of one
+// returned map does not affect the map returned by a later call.
+func TestAllowedTransitionsReturnsCopy(t *testing.T) {
+	tampered := AllowedTransitions()
+	require.NotEmpty(t, tampered)
+
+	for source := range tampered {
+		tampered[source] = nil
+	}
+
+	fresh := AllowedTransitions()
+	assert.NotEmpty(t, fresh[StatusRunning],
+		"AllowedTransitions must return an independent map per call")
+}
+
+// TestAllowedTransitionsCoversFourPairs checks the destinations listed for
+// running and stopped, ignoring order, and that removed has none.
+func TestAllowedTransitionsCoversFourPairs(t *testing.T) {
+	table := AllowedTransitions()
+
+	fromRunning := []Status{StatusStopped, StatusRemoved}
+	fromStopped := []Status{StatusRunning, StatusRemoved}
+
+	assert.ElementsMatch(t, fromRunning, table[StatusRunning])
+	assert.ElementsMatch(t, fromStopped, table[StatusStopped])
+	assert.Empty(t, table[StatusRemoved],
+		"removed has no outgoing transitions")
+}
+
+// TestInvalidTransitionErrorMessage checks that the rendered message
+// mentions both statuses of the rejected pair.
+func TestInvalidTransitionErrorMessage(t *testing.T) {
+	rejected := &InvalidTransitionError{From: StatusRunning, To: Status("bogus")}
+	message := rejected.Error()
+
+	for _, fragment := range []string{"running", "bogus"} {
+		assert.Contains(t, message, fragment)
+	}
+}