feat: runtime manager

This commit is contained in:
Ilia Denisov
2026-04-28 20:39:18 +02:00
committed by GitHub
parent e0a99b346b
commit a7cee15115
289 changed files with 45660 additions and 2207 deletions
+245
View File
@@ -0,0 +1,245 @@
// Package operation defines the runtime-operation audit-log domain types
// owned by Runtime Manager.
//
// One OperationEntry maps to one row of the `operation_log` PostgreSQL
// table (see
// `galaxy/rtmanager/internal/adapters/postgres/migrations/00001_init.sql`).
// The OpKind / OpSource / Outcome enums match the SQL CHECK constraints
// verbatim and feed the telemetry counters declared in
// `galaxy/rtmanager/README.md §Observability`.
package operation
import (
"fmt"
"strings"
"time"
)
// OpKind names the category of work Runtime Manager carried out for one
// audit-log entry.
type OpKind string

// The frozen op-kind vocabulary.
const (
	// OpKindStart marks a start lifecycle operation.
	OpKindStart OpKind = "start"

	// OpKindStop marks a stop lifecycle operation.
	OpKindStop OpKind = "stop"

	// OpKindRestart marks a restart lifecycle operation, i.e. a recreate
	// that keeps the same image_ref.
	OpKindRestart OpKind = "restart"

	// OpKindPatch marks a semver-patch lifecycle operation, i.e. a
	// recreate that switches to a new image_ref.
	OpKindPatch OpKind = "patch"

	// OpKindCleanupContainer marks a container removal carried out by
	// either the cleanup TTL worker or the admin DELETE endpoint.
	OpKindCleanupContainer OpKind = "cleanup_container"

	// OpKindReconcileAdopt marks the reconciler finding a container
	// labelled `com.galaxy.owner=rtmanager` that had no runtime record
	// and inserting one for it.
	OpKindReconcileAdopt OpKind = "reconcile_adopt"

	// OpKindReconcileDispose marks the reconciler finding a running
	// record whose container no longer exists in Docker and flagging the
	// record as removed.
	OpKindReconcileDispose OpKind = "reconcile_dispose"
)

// IsKnown returns true only when k equals one of the OpKind constants
// declared above; the empty string and every other value yield false.
func (k OpKind) IsKnown() bool {
	switch k {
	case OpKindStart, OpKindStop, OpKindRestart, OpKindPatch:
		// Lifecycle operations.
		return true
	case OpKindCleanupContainer, OpKindReconcileAdopt, OpKindReconcileDispose:
		// Housekeeping operations.
		return true
	}
	return false
}
// AllOpKinds lists every value of the frozen op-kind vocabulary. Each
// call builds a new slice, and the element order is the same every time.
func AllOpKinds() []OpKind {
	everyKind := []OpKind{
		OpKindStart, OpKindStop, OpKindRestart, OpKindPatch,
		OpKindCleanupContainer, OpKindReconcileAdopt, OpKindReconcileDispose,
	}
	return everyKind
}
// OpSource names the entry point through which one operation reached
// Runtime Manager.
type OpSource string

// The frozen op-source vocabulary.
const (
	// OpSourceLobbyStream marks entries triggered by the Redis Stream
	// consumer of `runtime:start_jobs` or `runtime:stop_jobs`.
	OpSourceLobbyStream OpSource = "lobby_stream"

	// OpSourceGMRest marks entries triggered by Game Master via the
	// internal REST surface.
	OpSourceGMRest OpSource = "gm_rest"

	// OpSourceAdminRest marks entries triggered by Admin Service via the
	// internal REST surface.
	OpSourceAdminRest OpSource = "admin_rest"

	// OpSourceAutoTTL marks entries triggered by the periodic
	// container-cleanup worker.
	OpSourceAutoTTL OpSource = "auto_ttl"

	// OpSourceAutoReconcile marks entries triggered by the reconciler,
	// either at startup or on its periodic interval.
	OpSourceAutoReconcile OpSource = "auto_reconcile"
)

// IsKnown returns true only when s equals one of the OpSource constants
// declared above; the empty string and every other value yield false.
func (s OpSource) IsKnown() bool {
	switch s {
	case OpSourceLobbyStream:
		// Stream-driven.
	case OpSourceGMRest, OpSourceAdminRest:
		// REST-driven.
	case OpSourceAutoTTL, OpSourceAutoReconcile:
		// Triggered by Runtime Manager's own workers.
	default:
		return false
	}
	return true
}
// AllOpSources lists every value of the frozen op-source vocabulary.
// Each call builds a new slice, and the element order is the same every
// time.
func AllOpSources() []OpSource {
	everySource := []OpSource{
		OpSourceLobbyStream,
		OpSourceGMRest, OpSourceAdminRest,
		OpSourceAutoTTL, OpSourceAutoReconcile,
	}
	return everySource
}
// Outcome is the coarse result recorded for one operation.
type Outcome string

// The frozen outcome vocabulary.
const (
	// OutcomeSuccess means the operation finished without surfacing an
	// error.
	OutcomeSuccess Outcome = "success"

	// OutcomeFailure means the operation surfaced a stable error code,
	// which is recorded in OperationEntry.ErrorCode.
	OutcomeFailure Outcome = "failure"
)

// IsKnown returns true only when o is OutcomeSuccess or OutcomeFailure;
// the empty string and every other value yield false.
func (o Outcome) IsKnown() bool {
	return o == OutcomeSuccess || o == OutcomeFailure
}
// AllOutcomes lists every value of the frozen outcome vocabulary: success
// first, then failure. Each call builds a new slice.
func AllOutcomes() []Outcome {
	everyOutcome := []Outcome{
		OutcomeSuccess,
		OutcomeFailure,
	}
	return everyOutcome
}
// OperationEntry stores one append-only audit row of the `operation_log`
// table. ID is zero on records that have not been persisted yet; the
// store assigns it from the table's bigserial column. FinishedAt is a
// pointer because the column is nullable for in-flight rows even though
// the lifecycle services finalise the row in the same transaction.
//
// Validate inspects GameID, OpKind, OpSource, Outcome, StartedAt,
// FinishedAt and ErrorCode. It does not look at ID, SourceRef, ImageRef,
// ContainerID or ErrorMessage.
type OperationEntry struct {
// ID identifies the persisted row. Zero before persistence.
ID int64
// GameID identifies the platform game this operation acted on.
// Validate rejects a value that is empty or only whitespace.
GameID string
// OpKind classifies what the operation did. Validate requires one of
// the declared OpKind constants.
OpKind OpKind
// OpSource classifies how the operation entered Runtime Manager.
// Validate requires one of the declared OpSource constants.
OpSource OpSource
// SourceRef stores an opaque per-source reference such as a Redis
// Stream entry id, a REST request id, or an admin user id. Empty
// when the source does not provide one.
SourceRef string
// ImageRef stores the engine image reference associated with the
// operation, when applicable. Empty for operations that do not
// touch an image (e.g., cleanup_container).
ImageRef string
// ContainerID stores the Docker container id observed at the time
// of the operation, when applicable.
ContainerID string
// Outcome reports whether the operation succeeded or failed.
// Validate requires OutcomeSuccess or OutcomeFailure.
Outcome Outcome
// ErrorCode stores the stable error code. Validate requires it to be
// non-blank when Outcome is OutcomeFailure and does not constrain it
// on success; the package tests accept a success entry carrying
// `replay_no_op`.
ErrorCode string
// ErrorMessage stores the operator-readable detail on failure.
// Empty on success.
ErrorMessage string
// StartedAt stores the wall-clock at which the operation began.
// Validate rejects the zero time.
StartedAt time.Time
// FinishedAt stores the wall-clock at which the operation
// finalised. Nil for in-flight rows. When non-nil, Validate requires
// a non-zero time that is not before StartedAt.
FinishedAt *time.Time
}
// Validate checks e against the operation-log invariants implied by the
// SQL CHECK constraints and the README §Persistence Layout. It returns
// nil when every check passes and otherwise the error for the first
// violated rule, evaluated in this order:
//
//  1. GameID is not blank.
//  2. OpKind, OpSource and Outcome each belong to their vocabulary.
//  3. StartedAt is not the zero time.
//  4. FinishedAt, when set, is non-zero and not before StartedAt.
//  5. A failure entry carries a non-blank ErrorCode.
func (e OperationEntry) Validate() error {
	switch {
	case strings.TrimSpace(e.GameID) == "":
		return fmt.Errorf("game id must not be empty")
	case !e.OpKind.IsKnown():
		return fmt.Errorf("op kind %q is unsupported", e.OpKind)
	case !e.OpSource.IsKnown():
		return fmt.Errorf("op source %q is unsupported", e.OpSource)
	case !e.Outcome.IsKnown():
		return fmt.Errorf("outcome %q is unsupported", e.Outcome)
	case e.StartedAt.IsZero():
		return fmt.Errorf("started at must not be zero")
	}

	// A nil FinishedAt is a legal in-flight row, so the time checks only
	// apply once a value is present.
	if finished := e.FinishedAt; finished != nil {
		switch {
		case finished.IsZero():
			return fmt.Errorf("finished at must not be zero when present")
		case finished.Before(e.StartedAt):
			return fmt.Errorf("finished at must not be before started at")
		}
	}

	failed := e.Outcome == OutcomeFailure
	if failed && strings.TrimSpace(e.ErrorCode) == "" {
		return fmt.Errorf("error code must not be empty for failure entries")
	}
	return nil
}
@@ -0,0 +1,130 @@
package operation
import (
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestOpKindIsKnown checks that IsKnown accepts every value listed by
// AllOpKinds and rejects both the empty string and an undeclared value.
func TestOpKindIsKnown(t *testing.T) {
	for _, declared := range AllOpKinds() {
		assert.Truef(t, declared.IsKnown(), "expected %q known", declared)
	}

	for _, undeclared := range []OpKind{"", "rollback"} {
		assert.False(t, undeclared.IsKnown())
	}
}
func TestAllOpKindsCoverFrozenSet(t *testing.T) {
assert.ElementsMatch(t,
[]OpKind{
OpKindStart, OpKindStop, OpKindRestart, OpKindPatch,
OpKindCleanupContainer, OpKindReconcileAdopt, OpKindReconcileDispose,
},
AllOpKinds(),
)
}
// TestOpSourceIsKnown checks that IsKnown accepts every value listed by
// AllOpSources and rejects both the empty string and an undeclared
// value.
func TestOpSourceIsKnown(t *testing.T) {
	for _, declared := range AllOpSources() {
		assert.Truef(t, declared.IsKnown(), "expected %q known", declared)
	}

	for _, undeclared := range []OpSource{"", "manual"} {
		assert.False(t, undeclared.IsKnown())
	}
}
func TestAllOpSourcesCoverFrozenSet(t *testing.T) {
assert.ElementsMatch(t,
[]OpSource{
OpSourceLobbyStream, OpSourceGMRest, OpSourceAdminRest,
OpSourceAutoTTL, OpSourceAutoReconcile,
},
AllOpSources(),
)
}
// TestOutcomeIsKnown checks that IsKnown accepts every value listed by
// AllOutcomes and rejects both the empty string and an undeclared value.
func TestOutcomeIsKnown(t *testing.T) {
	for _, declared := range AllOutcomes() {
		assert.Truef(t, declared.IsKnown(), "expected %q known", declared)
	}

	for _, undeclared := range []Outcome{"", "partial"} {
		assert.False(t, undeclared.IsKnown())
	}
}
func TestAllOutcomesCoverFrozenSet(t *testing.T) {
assert.ElementsMatch(t,
[]Outcome{OutcomeSuccess, OutcomeFailure},
AllOutcomes(),
)
}
// successEntry builds the baseline fixture the Validate tests start
// from: a successful lobby-stream start that finished one second after
// it began. Every call returns an independent value with its own
// FinishedAt pointer, so callers may mutate the result freely.
func successEntry() OperationEntry {
	startedAt := time.Date(2026, time.April, 27, 12, 0, 0, 0, time.UTC)
	finishedAt := startedAt.Add(time.Second)

	entry := OperationEntry{
		GameID:      "game-test",
		OpKind:      OpKindStart,
		OpSource:    OpSourceLobbyStream,
		SourceRef:   "1700000000000-0",
		ImageRef:    "galaxy/game:1.0.0",
		ContainerID: "container-1",
		Outcome:     OutcomeSuccess,
		StartedAt:   startedAt,
	}
	entry.FinishedAt = &finishedAt
	return entry
}
func TestOperationEntryValidateHappy(t *testing.T) {
require.NoError(t, successEntry().Validate())
}
// TestOperationEntryValidateAcceptsReplayNoOp checks that a success
// entry may still carry an error code: the baseline fixture with
// ErrorCode set to "replay_no_op" must validate.
func TestOperationEntryValidateAcceptsReplayNoOp(t *testing.T) {
	replayed := successEntry()
	replayed.ErrorCode = "replay_no_op"

	err := replayed.Validate()
	assert.NoError(t, err)
}
// TestOperationEntryValidateAcceptsInFlight checks that an entry with a
// nil FinishedAt, i.e. one still in flight, passes validation.
func TestOperationEntryValidateAcceptsInFlight(t *testing.T) {
	inFlight := successEntry()
	inFlight.FinishedAt = nil

	err := inFlight.Validate()
	assert.NoError(t, err)
}
// TestOperationEntryValidateRejects runs one subtest per invariant.
// Each subtest takes a fresh baseline fixture, breaks exactly one rule
// through its mutate hook, and expects Validate to return an error.
func TestOperationEntryValidateRejects(t *testing.T) {
	type rejectCase struct {
		name   string
		mutate func(*OperationEntry)
	}

	cases := []rejectCase{
		{
			name:   "empty game id",
			mutate: func(e *OperationEntry) { e.GameID = "" },
		},
		{
			name:   "unknown op kind",
			mutate: func(e *OperationEntry) { e.OpKind = "exotic" },
		},
		{
			name:   "unknown op source",
			mutate: func(e *OperationEntry) { e.OpSource = "exotic" },
		},
		{
			name:   "unknown outcome",
			mutate: func(e *OperationEntry) { e.Outcome = "partial" },
		},
		{
			name:   "zero started at",
			mutate: func(e *OperationEntry) { e.StartedAt = time.Time{} },
		},
		{
			name: "zero finished at",
			mutate: func(e *OperationEntry) {
				zero := time.Time{}
				e.FinishedAt = &zero
			},
		},
		{
			name: "finished before started",
			mutate: func(e *OperationEntry) {
				before := e.StartedAt.Add(-time.Second)
				e.FinishedAt = &before
			},
		},
		{
			name: "failure without error code",
			mutate: func(e *OperationEntry) {
				e.Outcome = OutcomeFailure
				e.ErrorCode = ""
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			candidate := successEntry()
			tc.mutate(&candidate)

			err := candidate.Validate()
			assert.Error(t, err)
		})
	}
}