feat: backend service

This commit is contained in:
Ilia Denisov
2026-05-06 10:14:55 +03:00
committed by GitHub
parent 3e2622757e
commit f446c6a2ac
1486 changed files with 49720 additions and 266401 deletions
+174
View File
@@ -0,0 +1,174 @@
package runtime
import (
"context"
"fmt"
"sync"
"sync/atomic"
"github.com/google/uuid"
)
// Cache is the in-memory write-through projection of the runtime
// records and engine version registry. Mirrors the lobby/auth/admin
// cache idiom: Postgres is the source of truth, the cache is updated
// only after a successful commit.
//
// Reads (Get*) take RLocks; writes (Put*, Remove*) take Locks. The
// cache only retains non-terminal runtime records so the active set
// stays small and warm.
type Cache struct {
mu sync.RWMutex
runtimes map[uuid.UUID]RuntimeRecord
engineVersions map[string]EngineVersion
ready atomic.Bool
}
// NewCache returns an empty Cache.
func NewCache() *Cache {
return &Cache{
runtimes: make(map[uuid.UUID]RuntimeRecord),
engineVersions: make(map[string]EngineVersion),
}
}
// Warm populates the cache from store. Must be called once at process
// boot before the HTTP listener accepts traffic.
func (c *Cache) Warm(ctx context.Context, store *Store) error {
if c == nil {
return nil
}
versions, err := store.ListEngineVersions(ctx)
if err != nil {
return fmt.Errorf("runtime cache warm: engine versions: %w", err)
}
records, err := store.ListAllRuntimeRecords(ctx)
if err != nil {
return fmt.Errorf("runtime cache warm: runtime records: %w", err)
}
c.mu.Lock()
defer c.mu.Unlock()
c.engineVersions = make(map[string]EngineVersion, len(versions))
for _, v := range versions {
c.engineVersions[v.Version] = v
}
c.runtimes = make(map[uuid.UUID]RuntimeRecord)
for _, r := range records {
if r.IsTerminal() {
continue
}
c.runtimes[r.GameID] = r
}
c.ready.Store(true)
return nil
}
// Ready reports whether Warm completed at least once.
func (c *Cache) Ready() bool {
if c == nil {
return false
}
return c.ready.Load()
}
// Sizes returns the cardinalities of the two projections; used by the
// startup log line and tests.
func (c *Cache) Sizes() (runtimes int, engineVersions int) {
if c == nil {
return 0, 0
}
c.mu.RLock()
defer c.mu.RUnlock()
return len(c.runtimes), len(c.engineVersions)
}
// GetRuntime returns the cached runtime record for gameID together
// with a presence flag.
func (c *Cache) GetRuntime(gameID uuid.UUID) (RuntimeRecord, bool) {
if c == nil {
return RuntimeRecord{}, false
}
c.mu.RLock()
defer c.mu.RUnlock()
r, ok := c.runtimes[gameID]
return r, ok
}
// PutRuntime stores or updates the runtime record. Terminal statuses
// cause the entry to be evicted.
func (c *Cache) PutRuntime(rec RuntimeRecord) {
if c == nil {
return
}
c.mu.Lock()
defer c.mu.Unlock()
if rec.IsTerminal() {
delete(c.runtimes, rec.GameID)
return
}
c.runtimes[rec.GameID] = rec
}
// RemoveRuntime evicts the entry for gameID.
func (c *Cache) RemoveRuntime(gameID uuid.UUID) {
if c == nil {
return
}
c.mu.Lock()
defer c.mu.Unlock()
delete(c.runtimes, gameID)
}
// ActiveRuntimes returns a snapshot copy of every cached runtime
// record. The reconciler and the scheduler both iterate this list.
func (c *Cache) ActiveRuntimes() []RuntimeRecord {
if c == nil {
return nil
}
c.mu.RLock()
defer c.mu.RUnlock()
out := make([]RuntimeRecord, 0, len(c.runtimes))
for _, r := range c.runtimes {
out = append(out, r)
}
return out
}
// GetEngineVersion returns the cached engine_versions row keyed by
// version label, together with a presence flag.
func (c *Cache) GetEngineVersion(version string) (EngineVersion, bool) {
if c == nil {
return EngineVersion{}, false
}
c.mu.RLock()
defer c.mu.RUnlock()
v, ok := c.engineVersions[version]
return v, ok
}
// PutEngineVersion stores or updates the engine_versions cache entry.
func (c *Cache) PutEngineVersion(v EngineVersion) {
if c == nil {
return
}
c.mu.Lock()
defer c.mu.Unlock()
c.engineVersions[v.Version] = v
}
// ListEngineVersions returns a snapshot of the cached engine_versions
// rows. The snapshot is in map-iteration order; EngineVersionService.List
// sorts it by created_at DESC, falling back to the version label on ties.
func (c *Cache) ListEngineVersions() []EngineVersion {
if c == nil {
return nil
}
c.mu.RLock()
defer c.mu.RUnlock()
out := make([]EngineVersion, 0, len(c.engineVersions))
for _, v := range c.engineVersions {
out = append(out, v)
}
return out
}
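For reference, a minimal sketch of the write-through idiom described in the Cache doc comment, assuming the Store API (`UpdateRuntimeRecord` / `runtimeRecordUpdate`) introduced later in this commit; the function name and its standalone placement are illustrative, not part of the change.
func exampleWriteThrough(ctx context.Context, store *Store, cache *Cache, gameID uuid.UUID, now time.Time) error {
	paused := true
	// Postgres commit first: the database is the source of truth.
	rec, err := store.UpdateRuntimeRecord(ctx, gameID, runtimeRecordUpdate{Paused: &paused}, now)
	if err != nil {
		return err // cache left untouched on failure
	}
	// Only a successful write reaches the cache.
	cache.PutRuntime(rec)
	return nil
}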
+54
View File
@@ -0,0 +1,54 @@
package runtime
import (
"testing"
"github.com/google/uuid"
)
func TestCacheRuntimeRoundTrip(t *testing.T) {
c := NewCache()
gameID := uuid.New()
rec := RuntimeRecord{GameID: gameID, Status: RuntimeStatusRunning}
c.PutRuntime(rec)
got, ok := c.GetRuntime(gameID)
if !ok {
t.Fatal("expected cache hit")
}
if got.Status != RuntimeStatusRunning {
t.Fatalf("status = %s, want running", got.Status)
}
rec.Status = RuntimeStatusFinished
c.PutRuntime(rec)
if _, ok := c.GetRuntime(gameID); ok {
t.Fatal("terminal status must evict")
}
}
func TestCacheEngineVersionRoundTrip(t *testing.T) {
c := NewCache()
v := EngineVersion{Version: "0.1.0", ImageRef: "img", Enabled: true}
c.PutEngineVersion(v)
got, ok := c.GetEngineVersion("0.1.0")
if !ok {
t.Fatal("expected hit")
}
if got.ImageRef != "img" {
t.Fatalf("image_ref = %s, want img", got.ImageRef)
}
if list := c.ListEngineVersions(); len(list) != 1 {
t.Fatalf("list size = %d, want 1", len(list))
}
}
func TestCacheActiveRuntimes(t *testing.T) {
c := NewCache()
c.PutRuntime(RuntimeRecord{GameID: uuid.New(), Status: RuntimeStatusRunning})
c.PutRuntime(RuntimeRecord{GameID: uuid.New(), Status: RuntimeStatusStarting})
c.PutRuntime(RuntimeRecord{GameID: uuid.New(), Status: RuntimeStatusFinished}) // evicted
if got := c.ActiveRuntimes(); len(got) != 2 {
t.Fatalf("active = %d, want 2", len(got))
}
}
+138
View File
@@ -0,0 +1,138 @@
package runtime
import (
"context"
"time"
"galaxy/backend/internal/config"
"galaxy/backend/internal/dockerclient"
"galaxy/backend/internal/engineclient"
"github.com/google/uuid"
"go.uber.org/zap"
)
// LobbyConsumer is the inbound surface the runtime uses to publish
// snapshots and adoption / removal events back into lobby. The
// canonical implementation is `*lobby.Service`; tests substitute a
// hand-rolled fake that records the calls.
//
// The interface is intentionally narrow: runtime only forwards
// data-plane events. Lobby owns every status transition that follows
// from the snapshot.
type LobbyConsumer interface {
// OnRuntimeSnapshot is invoked synchronously after every successful
// engine read or health-probe transition. Lobby maps the snapshot
// into its `games.runtime_snapshot` projection and may transition
// the game's lifecycle status.
OnRuntimeSnapshot(ctx context.Context, gameID uuid.UUID, snapshot LobbySnapshot) error
// OnRuntimeJobResult is invoked by the reconciler when a labelled
// container that lobby believes is alive has disappeared. Lobby
// reacts by cancelling the game (the engine container is gone).
OnRuntimeJobResult(ctx context.Context, gameID uuid.UUID, result JobResult) error
}
// LobbySnapshot is the runtime → lobby DTO. It is the runtime's view
// of the engine status response, plus the per-player observations
// lobby needs for capable-finish promotion.
//
// The structure intentionally mirrors `lobby.RuntimeSnapshot` in
// shape; runtime keeps its own version so the two packages do not
// import each other directly. The cmd/backend wiring layer adapts
// between them.
type LobbySnapshot struct {
CurrentTurn int32
RuntimeStatus string
EngineHealth string
ObservedAt time.Time
PlayerStats []LobbyPlayerStats
}
// LobbyPlayerStats is the per-player observation read from a runtime
// snapshot. `MaxPlanets` / `MaxPopulation` are the per-snapshot
// running maxima; lobby aggregates across the game lifetime.
type LobbyPlayerStats struct {
UserID uuid.UUID
InitialPlanets int32
InitialPopulation int32
CurrentPlanets int32
CurrentPopulation int32
MaxPlanets int32
MaxPopulation int32
}
// JobResult is the outcome envelope passed to
// `LobbyConsumer.OnRuntimeJobResult`. The reconciler produces it on
// adoption / removal events; future job paths (start, stop, restart)
// may reuse the same envelope.
type JobResult struct {
Op string
Status string
Message string
}
// NotificationPublisher is the outbound surface runtime uses to emit
// admin-channel notifications enumerated under `runtime.*` in
// `backend/README.md` §10. The real implementation lives in
// `backend/internal/notification`; until it is wired in,
// `NewNoopNotificationPublisher` ships a logger-only stub so the
// runtime path stays callable end-to-end during tests.
//
// Kind must be one of `runtime.image_pull_failed`,
// `runtime.container_start_failed`, or `runtime.start_config_invalid`.
// Payload carries the kind-specific fields documented in the catalog.
// The IdempotencyKey is supplied by the caller and feeds the
// notification UNIQUE(kind, idempotency_key) constraint.
type NotificationPublisher interface {
PublishRuntimeEvent(ctx context.Context, kind, idempotencyKey string, payload map[string]any) error
}
// NewNoopNotificationPublisher returns a NotificationPublisher that
// logs every event at info level and returns nil. The wiring layer
// swaps in the real `*notification.Service` adapter once it is available.
func NewNoopNotificationPublisher(logger *zap.Logger) NotificationPublisher {
if logger == nil {
logger = zap.NewNop()
}
return &noopNotificationPublisher{logger: logger.Named("runtime.notify.noop")}
}
type noopNotificationPublisher struct {
logger *zap.Logger
}
func (p *noopNotificationPublisher) PublishRuntimeEvent(_ context.Context, kind, idempotencyKey string, payload map[string]any) error {
p.logger.Info("runtime event (noop publisher)",
zap.String("kind", kind),
zap.String("idempotency_key", idempotencyKey),
zap.Int("payload_keys", len(payload)),
)
return nil
}
// Deps aggregates every collaborator the runtime Service depends on.
// Constructing the Service through Deps (rather than positional args)
// keeps the wiring patches small as new dependencies are added.
type Deps struct {
Store *Store
Cache *Cache
EngineVersions *EngineVersionService
Docker dockerclient.Client
Engine *engineclient.Client
Lobby LobbyConsumer
Notification NotificationPublisher
// DockerNetwork is the user-defined Docker network name engine
// containers attach to. Wired from `cfg.Docker.Network`.
DockerNetwork string
// HostStateRoot is the host-side directory that holds per-game
// state subdirectories. Wired from `cfg.Game.StateRoot`.
HostStateRoot string
Config config.RuntimeConfig
Logger *zap.Logger
Now func() time.Time
}
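A hedged wiring sketch for cmd/backend showing how Deps might be assembled; the store, cache, versions, docker, engine, lobbySvc, cfg, and logger variables are assumptions about the surrounding wiring, not part of this file, and `cfg.Runtime` is an assumed accessor for `config.RuntimeConfig`.
deps := runtime.Deps{
	Store:          store,
	Cache:          cache,
	EngineVersions: versions,
	Docker:         docker,
	Engine:         engine,
	Lobby:          lobbySvc,
	Notification:   runtime.NewNoopNotificationPublisher(logger),
	DockerNetwork:  cfg.Docker.Network, // per the field comment above
	HostStateRoot:  cfg.Game.StateRoot, // per the field comment above
	Config:         cfg.Runtime,        // assumed accessor
	Logger:         logger,
	Now:            time.Now,
}
svc, err := runtime.NewService(deps)
if err != nil {
	logger.Fatal("runtime service init failed", zap.Error(err))
}
_ = svc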
+189
View File
@@ -0,0 +1,189 @@
package runtime
import (
"context"
"errors"
"fmt"
"sort"
"strings"
"time"
"galaxy/util"
)
// EngineVersionService implements the engine-version registry CRUD
// surface consumed by the admin endpoints under
// `/api/v1/admin/engine-versions/*`. Mutations are write-through: a
// successful Postgres write is followed by a cache update so warm
// reads observe the new state immediately.
type EngineVersionService struct {
store *Store
cache *Cache
now func() time.Time
}
// NewEngineVersionService constructs the service. now defaults to
// time.Now when nil.
func NewEngineVersionService(store *Store, cache *Cache, now func() time.Time) *EngineVersionService {
if now == nil {
now = time.Now
}
return &EngineVersionService{store: store, cache: cache, now: now}
}
// List returns every engine_versions row ordered by created_at DESC.
// Cache-first when warm; falls back to a Postgres read otherwise.
func (s *EngineVersionService) List(ctx context.Context) ([]EngineVersion, error) {
if s.cache != nil && s.cache.Ready() {
out := s.cache.ListEngineVersions()
sort.SliceStable(out, func(i, j int) bool {
if !out[i].CreatedAt.Equal(out[j].CreatedAt) {
return out[i].CreatedAt.After(out[j].CreatedAt)
}
return out[i].Version > out[j].Version
})
return out, nil
}
return s.store.ListEngineVersions(ctx)
}
// Get returns the row for version. Returns ErrNotFound on miss.
func (s *EngineVersionService) Get(ctx context.Context, version string) (EngineVersion, error) {
version = strings.TrimSpace(version)
if version == "" {
return EngineVersion{}, fmt.Errorf("%w: version must not be empty", ErrInvalidInput)
}
if s.cache != nil {
if v, ok := s.cache.GetEngineVersion(version); ok {
return v, nil
}
}
v, err := s.store.GetEngineVersion(ctx, version)
if err != nil {
return EngineVersion{}, err
}
if s.cache != nil {
s.cache.PutEngineVersion(v)
}
return v, nil
}
// RegisterInput is the parameter struct for Register.
type RegisterInput struct {
Version string
ImageRef string
Enabled *bool
}
// Validate normalises the request and rejects empty / malformed
// fields. Semver is enforced via `pkg/util.ParseSemver`.
func (in *RegisterInput) Validate() error {
in.Version = strings.TrimSpace(in.Version)
in.ImageRef = strings.TrimSpace(in.ImageRef)
if in.Version == "" {
return fmt.Errorf("%w: version must not be empty", ErrInvalidInput)
}
if _, err := util.ParseSemver(in.Version); err != nil {
return fmt.Errorf("%w: version %q is not a valid semver: %v", ErrInvalidInput, in.Version, err)
}
if in.ImageRef == "" {
return fmt.Errorf("%w: image_ref must not be empty", ErrInvalidInput)
}
return nil
}
// Register persists a fresh engine_versions row. Returns
// ErrEngineVersionTaken on duplicate version.
func (s *EngineVersionService) Register(ctx context.Context, in RegisterInput) (EngineVersion, error) {
if err := (&in).Validate(); err != nil {
return EngineVersion{}, err
}
enabled := true
if in.Enabled != nil {
enabled = *in.Enabled
}
now := s.now().UTC()
v, err := s.store.InsertEngineVersion(ctx, in.Version, in.ImageRef, enabled, now)
if err != nil {
return EngineVersion{}, err
}
if s.cache != nil {
s.cache.PutEngineVersion(v)
}
return v, nil
}
// UpdateInput is the parameter struct for Update. Nil pointers leave
// the corresponding column alone.
type UpdateInput struct {
ImageRef *string
Enabled *bool
}
// Update patches mutable fields on an existing row.
func (s *EngineVersionService) Update(ctx context.Context, version string, in UpdateInput) (EngineVersion, error) {
version = strings.TrimSpace(version)
if version == "" {
return EngineVersion{}, fmt.Errorf("%w: version must not be empty", ErrInvalidInput)
}
patch := engineVersionUpdate{Enabled: in.Enabled}
if in.ImageRef != nil {
trimmed := strings.TrimSpace(*in.ImageRef)
if trimmed == "" {
return EngineVersion{}, fmt.Errorf("%w: image_ref must not be empty", ErrInvalidInput)
}
patch.ImageRef = &trimmed
}
now := s.now().UTC()
v, err := s.store.UpdateEngineVersion(ctx, version, patch, now)
if err != nil {
return EngineVersion{}, err
}
if s.cache != nil {
s.cache.PutEngineVersion(v)
}
return v, nil
}
// Disable flips the enabled flag to false. Idempotent.
func (s *EngineVersionService) Disable(ctx context.Context, version string) (EngineVersion, error) {
disabled := false
return s.Update(ctx, version, UpdateInput{Enabled: &disabled})
}
// Resolve returns the row for version, rejecting disabled rows with
// ErrEngineVersionDisabled. Used by `Service.StartGame` /
// `AdminPatch` / `AdminRestart` before the docker pull.
func (s *EngineVersionService) Resolve(ctx context.Context, version string) (EngineVersion, error) {
v, err := s.Get(ctx, version)
if err != nil {
return EngineVersion{}, err
}
if !v.Enabled {
return EngineVersion{}, fmt.Errorf("%w: %s", ErrEngineVersionDisabled, v.Version)
}
return v, nil
}
// CheckPatchCompatible verifies the requested target version stays
// inside the same major+minor line as `currentVersion`. Returns
// ErrPatchSemverIncompatible otherwise.
func CheckPatchCompatible(currentVersion, targetVersion string) error {
current, err := util.ParseSemver(currentVersion)
if err != nil {
return fmt.Errorf("%w: current version %q: %v", ErrInvalidInput, currentVersion, err)
}
target, err := util.ParseSemver(targetVersion)
if err != nil {
return fmt.Errorf("%w: target version %q: %v", ErrInvalidInput, targetVersion, err)
}
if current.Major != target.Major || current.Minor != target.Minor {
return fmt.Errorf("%w: %s -> %s", ErrPatchSemverIncompatible, currentVersion, targetVersion)
}
return nil
}
// IsKnownEngineVersion reports whether err is one of the engine-version
// domain errors (disabled version or semver-incompatible patch); a small
// helper used by tests and handlers.
func IsKnownEngineVersion(err error) bool {
return errors.Is(err, ErrEngineVersionDisabled) || errors.Is(err, ErrPatchSemverIncompatible)
}
@@ -0,0 +1,76 @@
package runtime
import (
"errors"
"testing"
)
func TestEngineVersionRegisterValidate(t *testing.T) {
cases := []struct {
name string
input RegisterInput
wantErr error
}{
{
name: "empty version",
input: RegisterInput{Version: "", ImageRef: "img"},
wantErr: ErrInvalidInput,
},
{
name: "non-semver",
input: RegisterInput{Version: "abc", ImageRef: "img"},
wantErr: ErrInvalidInput,
},
{
name: "empty image",
input: RegisterInput{Version: "0.1.0", ImageRef: ""},
wantErr: ErrInvalidInput,
},
{
name: "valid",
input: RegisterInput{Version: "0.1.0", ImageRef: "img"},
wantErr: nil,
},
}
for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
err := (&c.input).Validate()
if c.wantErr == nil {
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
return
}
if !errors.Is(err, c.wantErr) {
t.Fatalf("got %v, want %v", err, c.wantErr)
}
})
}
}
func TestCheckPatchCompatible(t *testing.T) {
cases := []struct {
name string
current string
target string
wantErr error
}{
{"same patch", "0.1.0", "0.1.0", nil},
{"compatible patch", "0.1.0", "0.1.4", nil},
{"different minor", "0.1.0", "0.2.0", ErrPatchSemverIncompatible},
{"different major", "1.0.0", "2.0.0", ErrPatchSemverIncompatible},
{"invalid current", "abc", "0.1.0", ErrInvalidInput},
{"invalid target", "0.1.0", "abc", ErrInvalidInput},
}
for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
err := CheckPatchCompatible(c.current, c.target)
if c.wantErr == nil && err != nil {
t.Fatalf("unexpected error: %v", err)
}
if c.wantErr != nil && !errors.Is(err, c.wantErr) {
t.Fatalf("got %v, want %v", err, c.wantErr)
}
})
}
}
+45
View File
@@ -0,0 +1,45 @@
package runtime
import "errors"
// Sentinel errors. Handlers map them to the standard JSON envelope at
// the wire boundary; lobby and admin packages observe them through
// errors.Is when they need to branch on the domain reason.
var (
// ErrNotFound is returned when no row matches the requested
// primary key (engine version, runtime record, player mapping).
ErrNotFound = errors.New("runtime: not found")
// ErrInvalidInput reports request-level validation failures
// (empty fields, malformed semver, unknown enum values).
ErrInvalidInput = errors.New("runtime: invalid input")
// ErrConflict reports that the requested action conflicts with
// the current persisted state (illegal status transition, retry
// while a job is still in-flight, race against the reconciler).
ErrConflict = errors.New("runtime: conflict")
// ErrEngineVersionTaken means a duplicate primary key was
// observed when registering a new engine version row.
ErrEngineVersionTaken = errors.New("runtime: engine version already registered")
// ErrEngineVersionDisabled reports that a referenced engine
// version row exists but is marked disabled.
ErrEngineVersionDisabled = errors.New("runtime: engine version disabled")
// ErrPatchSemverIncompatible reports that an admin-requested
// version patch crosses major or minor boundary, which Galaxy
// disallows for in-place patching (per ARCHITECTURE.md §9).
ErrPatchSemverIncompatible = errors.New("runtime: patch must stay inside the same major/minor line")
// ErrJobQueueFull reports that the worker pool's buffered job
// channel is at capacity. Surfaced as 503 service_unavailable at
// the wire boundary; in practice the pool size and queue depth
// are budgeted in `BACKEND_RUNTIME_*` env vars so the operator
// can absorb peaks.
ErrJobQueueFull = errors.New("runtime: job queue full")
// ErrShutdown means the runtime service has stopped accepting
// work because the parent context was cancelled.
ErrShutdown = errors.New("runtime: shutting down")
)
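A hypothetical handler-side mapping of these sentinels to HTTP status codes via errors.Is; the helper name and most status choices are assumptions (only the 503 for ErrJobQueueFull is stated above), and the snippet assumes `net/http` is imported in the handler package.
func httpStatusFor(err error) int {
	switch {
	case errors.Is(err, ErrNotFound):
		return http.StatusNotFound
	case errors.Is(err, ErrInvalidInput), errors.Is(err, ErrPatchSemverIncompatible):
		return http.StatusBadRequest
	case errors.Is(err, ErrConflict), errors.Is(err, ErrEngineVersionTaken), errors.Is(err, ErrEngineVersionDisabled):
		return http.StatusConflict
	case errors.Is(err, ErrJobQueueFull), errors.Is(err, ErrShutdown):
		return http.StatusServiceUnavailable
	default:
		return http.StatusInternalServerError
	}
}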
+55
View File
@@ -0,0 +1,55 @@
package runtime
import (
"context"
"errors"
"galaxy/backend/internal/dockerclient"
"go.uber.org/zap"
)
// publishStartConfigInvalid emits the `runtime.start_config_invalid`
// admin notification for a pre-Run validation failure on the start /
// patch path. The OperationLog supplies the idempotency key so the
// catalog UNIQUE(kind, idempotency_key) constraint deduplicates a
// repeated retry on the same operation row.
func (s *Service) publishStartConfigInvalid(ctx context.Context, op OperationLog, reason string) {
s.publishRuntimeEvent(ctx, "runtime.start_config_invalid", op, map[string]any{
"game_id": op.GameID.String(),
"reason": reason,
})
}
// publishStartFailure emits either `runtime.image_pull_failed` or
// `runtime.container_start_failed` depending on whether the Docker
// daemon reported a pull-stage error. The two kinds carry the catalog
// payload from `backend/README.md` §10.
func (s *Service) publishStartFailure(ctx context.Context, op OperationLog, imageRef string, runErr error) {
if errors.Is(runErr, dockerclient.ErrImagePullFailed) {
s.publishRuntimeEvent(ctx, "runtime.image_pull_failed", op, map[string]any{
"game_id": op.GameID.String(),
"image_ref": imageRef,
})
return
}
s.publishRuntimeEvent(ctx, "runtime.container_start_failed", op, map[string]any{
"game_id": op.GameID.String(),
})
}
// publishRuntimeEvent wraps the publisher call and logs any publish
// failure through the package logger so a misconfigured publisher
// cannot silently drop events.
func (s *Service) publishRuntimeEvent(ctx context.Context, kind string, op OperationLog, payload map[string]any) {
if s.deps.Notification == nil {
return
}
idempotencyKey := kind + ":" + op.GameID.String() + ":" + op.OperationID.String()
if err := s.deps.Notification.PublishRuntimeEvent(ctx, kind, idempotencyKey, payload); err != nil {
s.deps.Logger.Warn("runtime notification publish failed",
zap.String("kind", kind),
zap.String("idempotency_key", idempotencyKey),
zap.Error(err),
)
}
}
+203
View File
@@ -0,0 +1,203 @@
package runtime
import (
"context"
"errors"
"fmt"
"strings"
"time"
"galaxy/backend/internal/dockerclient"
"github.com/google/uuid"
"go.uber.org/zap"
)
// Reconciler runs an immediate startup pass plus a periodic ticker
// (`BACKEND_RUNTIME_RECONCILE_INTERVAL`). On every pass it diffs
// labelled containers reported by Docker against
// `runtime_records`, adopts unrecorded labelled containers, marks
// recorded-but-missing as `removed`, and publishes a fresh snapshot
// for matched pairs.
//
// Implements `internal/app.Component`.
type Reconciler struct {
svc *Service
}
// NewReconciler builds a Reconciler bound to svc.
func NewReconciler(svc *Service) *Reconciler { return &Reconciler{svc: svc} }
// Run drives the reconciliation loop until ctx is cancelled.
func (r *Reconciler) Run(ctx context.Context) error {
if r == nil {
return nil
}
logger := r.svc.deps.Logger.Named("reconciler")
if err := r.tick(ctx); err != nil {
logger.Warn("initial reconcile tick failed", zap.Error(err))
}
ticker := time.NewTicker(r.svc.deps.Config.ReconcileInterval)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return nil
case <-ticker.C:
if err := r.tick(ctx); err != nil {
logger.Warn("reconcile tick failed", zap.Error(err))
}
}
}
}
// Shutdown is a no-op: each tick is synchronous inside Run.
func (r *Reconciler) Shutdown(_ context.Context) error { return nil }
// Tick runs a single reconciliation pass. Exposed for tests so they
// can drive the reconciler without timing dependencies.
func (r *Reconciler) Tick(ctx context.Context) error { return r.tick(ctx) }
func (r *Reconciler) tick(ctx context.Context) error {
containers, err := r.svc.deps.Docker.List(ctx, dockerclient.ListFilter{
Labels: map[string]string{dockerclient.ManagedLabel: dockerclient.ManagedLabelValue},
})
if err != nil {
return fmt.Errorf("list managed containers: %w", err)
}
byContainerID := make(map[string]dockerclient.ContainerSummary, len(containers))
byGameID := make(map[uuid.UUID]dockerclient.ContainerSummary, len(containers))
for _, c := range containers {
byContainerID[c.ID] = c
gameID, ok := parseGameIDFromContainerName(c.Name)
if ok {
byGameID[gameID] = c
}
}
records, err := r.svc.deps.Store.ListAllRuntimeRecords(ctx)
if err != nil {
return fmt.Errorf("list runtime records: %w", err)
}
knownGames := make(map[uuid.UUID]struct{}, len(records))
var errs []error
for _, rec := range records {
knownGames[rec.GameID] = struct{}{}
if rec.IsTerminal() {
continue
}
c, matched := matchContainer(rec, byContainerID, byGameID)
if !matched {
if err := r.markRemoved(ctx, rec); err != nil {
errs = append(errs, fmt.Errorf("mark removed %s: %w", rec.GameID, err))
}
continue
}
if err := r.refreshSnapshot(ctx, rec, c); err != nil {
errs = append(errs, fmt.Errorf("refresh snapshot %s: %w", rec.GameID, err))
}
}
for gameID, c := range byGameID {
if _, ok := knownGames[gameID]; ok {
continue
}
if err := r.adopt(ctx, gameID, c); err != nil {
errs = append(errs, fmt.Errorf("adopt %s: %w", gameID, err))
}
}
return errors.Join(errs...)
}
func matchContainer(rec RuntimeRecord, byContainerID map[string]dockerclient.ContainerSummary, byGameID map[uuid.UUID]dockerclient.ContainerSummary) (dockerclient.ContainerSummary, bool) {
if rec.CurrentContainerID != "" {
if c, ok := byContainerID[rec.CurrentContainerID]; ok {
return c, true
}
}
if c, ok := byGameID[rec.GameID]; ok {
return c, true
}
return dockerclient.ContainerSummary{}, false
}
func (r *Reconciler) markRemoved(ctx context.Context, rec RuntimeRecord) error {
updated, err := r.svc.transitionRuntimeStatus(ctx, rec.GameID, RuntimeStatusRemoved, "")
if err != nil {
return err
}
r.svc.deps.Cache.PutRuntime(updated)
if r.svc.deps.Lobby != nil {
err = r.svc.deps.Lobby.OnRuntimeJobResult(ctx, rec.GameID, JobResult{
Op: OpReconcile,
Status: RuntimeStatusRemoved,
Message: "container disappeared",
})
if err != nil {
r.svc.deps.Logger.Warn("lobby OnRuntimeJobResult failed",
zap.String("game_id", rec.GameID.String()),
zap.Error(err))
}
}
return nil
}
func (r *Reconciler) adopt(ctx context.Context, gameID uuid.UUID, c dockerclient.ContainerSummary) error {
endpoint := fmt.Sprintf("http://%s:%d", HostName(gameID.String()), 8080)
game, err := r.svc.deps.Store.LoadGameProjection(ctx, gameID)
if err != nil {
if errors.Is(err, ErrNotFound) {
r.svc.deps.Logger.Warn("orphan container, no matching game",
zap.String("game_id", gameID.String()),
zap.String("container_id", c.ID))
return nil
}
return err
}
rec, err := r.svc.upsertRuntimeRecord(ctx, runtimeRecordInsert{
GameID: gameID,
Status: RuntimeStatusRunning,
CurrentContainerID: c.ID,
CurrentImageRef: c.ImageRef,
CurrentEngineVersion: c.Labels["galaxy.engine_version"],
EngineEndpoint: endpoint,
DockerNetwork: r.svc.dockerNetwork(),
TurnSchedule: game.TurnSchedule,
}, runtimeRecordUpdate{
Status: strPtr(RuntimeStatusRunning),
CurrentContainerID: strPtr(c.ID),
CurrentImageRef: strPtr(c.ImageRef),
CurrentEngineVersion: strPtr(c.Labels["galaxy.engine_version"]),
EngineEndpoint: strPtr(endpoint),
})
if err != nil {
return err
}
r.svc.deps.Cache.PutRuntime(rec)
r.svc.scheduler.startGame(rec)
return nil
}
func (r *Reconciler) refreshSnapshot(ctx context.Context, rec RuntimeRecord, _ dockerclient.ContainerSummary) error {
state, err := r.svc.deps.Engine.Status(ctx, rec.EngineEndpoint)
if err != nil {
_, _ = r.svc.transitionRuntimeStatus(ctx, rec.GameID, RuntimeStatusEngineUnreachable, "")
return nil
}
return r.svc.publishSnapshot(ctx, rec.GameID, state)
}
func parseGameIDFromContainerName(name string) (uuid.UUID, bool) {
const prefix = "galaxy-game-"
suffix := strings.TrimPrefix(name, prefix)
if suffix == name {
return uuid.Nil, false
}
parsed, err := uuid.Parse(suffix)
if err != nil {
return uuid.Nil, false
}
return parsed, true
}
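A small round-trip sketch of the naming convention the reconciler relies on, using ContainerName (defined later in this commit) and parseGameIDFromContainerName; the function itself is illustrative only.
func exampleNameRoundTrip() bool {
	gameID := uuid.New()
	name := ContainerName(gameID.String()) // "galaxy-game-<uuid>"
	parsed, ok := parseGameIDFromContainerName(name)
	return ok && parsed == gameID // true: the reconciler can recover the id
}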
+101
View File
@@ -0,0 +1,101 @@
// Package runtime owns the lifecycle of game-engine containers and the
// engine-version registry on the platform side. It is the single
// component permitted to talk to the Docker daemon
// (`internal/dockerclient`) and to running engine HTTP listeners
// (`internal/engineclient`); cross-cutting concerns such as the lobby
// state machine, notification fan-out, or player-mapping persistence
// live in their domain packages and reach into runtime through a
// narrow interface set documented in `deps.go`.
//
// The package introduces the runtime implementation on top of the existing lobby. The
// lobby `RuntimeGateway` shifts from a logger-only no-op to a real
// adapter backed by `*runtime.Service`; runtime publishes snapshots
// back into lobby through `LobbyConsumer.OnRuntimeSnapshot`. The
// engine-version registry CRUD endpoints under
// `/api/v1/admin/engine-versions/*` and the runtime admin/user proxy
// endpoints flip from 501 placeholders to real responses.
package runtime
import (
"errors"
"github.com/jackc/pgx/v5/pgconn"
)
// Runtime status vocabulary mirrors `runtime_records_status_chk` in
// `backend/internal/postgres/migrations/00001_init.sql`.
const (
RuntimeStatusStarting = "starting"
RuntimeStatusRunning = "running"
RuntimeStatusGenerationInProgress = "generation_in_progress"
RuntimeStatusGenerationFailed = "generation_failed"
RuntimeStatusStopped = "stopped"
RuntimeStatusEngineUnreachable = "engine_unreachable"
RuntimeStatusFinished = "finished"
RuntimeStatusRemoved = "removed"
)
// Operation log vocabulary recorded into `runtime_operation_log.op` and
// `runtime_operation_log.status`. Kept as exported constants so
// runtime, admin handlers, and tests share the same wire values.
const (
OpStart = "start"
OpStop = "stop"
OpPause = "pause"
OpResume = "resume"
OpRestart = "restart"
OpPatch = "patch"
OpForceNextTurn = "force_next_turn"
OpReconcile = "reconcile"
OpTurn = "turn"
OpSourceLobby = "lobby"
OpSourceAdmin = "admin"
OpSourceScheduler = "scheduler"
OpSourceReconciler = "reconciler"
OpStatusQueued = "queued"
OpStatusRunning = "running"
OpStatusSucceeded = "succeeded"
OpStatusFailed = "failed"
)
// Container naming convention. The hostname is the primary alias on
// the user-defined Docker network; the engine endpoint URL is
// synthesised by `dockerclient.Adapter.Run` as `http://{hostname}:8080`.
const (
containerNamePrefix = "galaxy-game-"
containerHostPrefix = "galaxy-game-"
)
// pgErrCodeUniqueViolation is the SQLSTATE Postgres emits on a UNIQUE
// constraint violation. Kept locally so the runtime package does not
// import `internal/admin` or `internal/lobby` for the constant.
const pgErrCodeUniqueViolation = "23505"
// isUniqueViolation reports whether err is a Postgres UNIQUE
// constraint violation, optionally restricted to a specific constraint
// name. Empty constraintName matches any UNIQUE violation.
func isUniqueViolation(err error, constraintName string) bool {
var pgErr *pgconn.PgError
if !errors.As(err, &pgErr) {
return false
}
if pgErr.Code != pgErrCodeUniqueViolation {
return false
}
if constraintName == "" {
return true
}
return pgErr.ConstraintName == constraintName
}
// ContainerName synthesises the Docker container / hostname for the
// supplied game id. Exported so tests and the reconciler can resolve
// the inverse mapping without duplicating the format string.
func ContainerName(gameID string) string { return containerNamePrefix + gameID }
// HostName synthesises the in-network hostname for the supplied game
// id. Mirrors ContainerName so the engine endpoint URL `http://{host}:8080`
// resolves through Docker DNS on the user-defined network.
func HostName(gameID string) string { return containerHostPrefix + gameID }
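A hypothetical use of isUniqueViolation inside the error path of Store.InsertEngineVersion; the constraint name is an assumption about the migration, not verified by this commit.
// Inside a hypothetical Store.InsertEngineVersion error path:
if isUniqueViolation(err, "engine_versions_pkey") { // constraint name assumed
	return EngineVersion{}, ErrEngineVersionTaken
}
return EngineVersion{}, err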
+266
View File
@@ -0,0 +1,266 @@
package runtime
import (
"context"
"errors"
"sync"
"time"
"galaxy/backend/internal/dockerclient"
"galaxy/cronutil"
"github.com/google/uuid"
"go.uber.org/zap"
)
// Scheduler runs one goroutine per running game. Each goroutine holds
// a `cronutil.Schedule` parsed from `runtime_records.turn_schedule`
// and invokes `engineclient.Turn` on every tick (or when
// `skip_next_tick=true` short-circuits the timer).
//
// Implements `app.Component` so main.go can register the bookkeeper
// component alongside the worker pool and reconciler. Run blocks on
// ctx; per-game goroutines tear down when their game leaves the cache
// (stopGame is called) or when ctx is cancelled.
type Scheduler struct {
svc *Service
mu sync.Mutex
tickers map[uuid.UUID]*scheduledGame
parent context.Context
stopping bool
}
type scheduledGame struct {
cancel context.CancelFunc
done chan struct{}
}
// NewScheduler builds a Scheduler. The svc reference is held for the
// life of the Scheduler.
func NewScheduler(svc *Service) *Scheduler {
return &Scheduler{
svc: svc,
tickers: make(map[uuid.UUID]*scheduledGame),
}
}
// Run installs ctx as the parent context and re-attaches scheduler
// goroutines for every active runtime record at startup. Blocks on
// ctx.
func (sch *Scheduler) Run(ctx context.Context) error {
if sch == nil {
return nil
}
sch.mu.Lock()
sch.parent = ctx
sch.stopping = false
sch.mu.Unlock()
// Re-attach schedulers for every running record.
for _, rec := range sch.svc.deps.Cache.ActiveRuntimes() {
if rec.Status != RuntimeStatusRunning {
continue
}
sch.startGame(rec)
}
<-ctx.Done()
return nil
}
// Shutdown cancels every per-game goroutine and waits for them to
// drain. The provided context bounds the wait.
func (sch *Scheduler) Shutdown(ctx context.Context) error {
if sch == nil {
return nil
}
sch.mu.Lock()
sch.stopping = true
games := make([]*scheduledGame, 0, len(sch.tickers))
for _, g := range sch.tickers {
games = append(games, g)
}
sch.tickers = make(map[uuid.UUID]*scheduledGame)
sch.mu.Unlock()
for _, g := range games {
g.cancel()
}
for _, g := range games {
select {
case <-g.done:
case <-ctx.Done():
return ctx.Err()
}
}
return nil
}
// startGame attaches a per-game scheduler goroutine. Idempotent: a
// repeated call replaces the old goroutine with a fresh one bound to
// the supplied record.
func (sch *Scheduler) startGame(rec RuntimeRecord) {
if sch == nil {
return
}
sch.mu.Lock()
if sch.stopping || sch.parent == nil {
sch.mu.Unlock()
return
}
if existing, ok := sch.tickers[rec.GameID]; ok {
existing.cancel()
sch.mu.Unlock()
<-existing.done
sch.mu.Lock()
}
parent := sch.parent
if parent == nil {
sch.mu.Unlock()
return
}
gameCtx, cancel := context.WithCancel(parent)
g := &scheduledGame{cancel: cancel, done: make(chan struct{})}
sch.tickers[rec.GameID] = g
sch.mu.Unlock()
go sch.loop(gameCtx, rec, g.done)
}
// stopGame cancels the goroutine tied to gameID. Idempotent.
func (sch *Scheduler) stopGame(gameID uuid.UUID) {
if sch == nil {
return
}
sch.mu.Lock()
g, ok := sch.tickers[gameID]
if ok {
delete(sch.tickers, gameID)
}
sch.mu.Unlock()
if !ok {
return
}
g.cancel()
<-g.done
}
// activeCount reports how many games currently have a scheduler
// goroutine. Used by tests.
func (sch *Scheduler) activeCount() int {
sch.mu.Lock()
defer sch.mu.Unlock()
return len(sch.tickers)
}
// loop drives the per-game scheduler goroutine: it computes the wait
// until the next firing from the cron schedule and ticks the engine on
// each firing. When the cron schedule fails to parse, the loop logs a
// warning and stops so operators notice.
func (sch *Scheduler) loop(ctx context.Context, rec RuntimeRecord, done chan struct{}) {
defer close(done)
logger := sch.svc.deps.Logger.With(zap.String("game_id", rec.GameID.String()))
schedule, err := cronutil.Parse(rec.TurnSchedule)
if err != nil {
logger.Warn("invalid turn_schedule, scheduler stopping",
zap.String("turn_schedule", rec.TurnSchedule),
zap.Error(err))
return
}
for {
latest, ok := sch.svc.deps.Cache.GetRuntime(rec.GameID)
if !ok {
return
}
if latest.Status != RuntimeStatusRunning {
return
}
now := sch.svc.deps.Now().UTC()
next := schedule.Next(now)
wait := next.Sub(now)
if latest.SkipNextTick {
wait = 0
}
if wait < 0 {
wait = 0
}
timer := time.NewTimer(wait)
select {
case <-ctx.Done():
timer.Stop()
return
case <-timer.C:
}
// Fresh fetch in case of pause / status change while waiting.
current, ok := sch.svc.deps.Cache.GetRuntime(rec.GameID)
if !ok {
return
}
if current.Status != RuntimeStatusRunning {
return
}
if current.Paused {
continue
}
if err := sch.tick(ctx, current); err != nil {
logger.Warn("scheduler tick failed", zap.Error(err))
}
}
}
// tick runs one engine /admin/turn call under the per-game mutex,
// publishes the resulting snapshot, and clears `skip_next_tick`.
func (sch *Scheduler) tick(ctx context.Context, rec RuntimeRecord) error {
mu := sch.svc.gameLock(rec.GameID)
if !mu.TryLock() {
return nil // another op is in flight; skip this tick
}
defer mu.Unlock()
op, err := sch.svc.beginOperation(ctx, rec.GameID, OpTurn, OpSourceScheduler)
if err != nil {
return err
}
state, err := sch.svc.deps.Engine.Turn(ctx, rec.EngineEndpoint)
if err != nil {
sch.svc.completeOperation(ctx, op, err)
_, _ = sch.svc.transitionRuntimeStatus(ctx, rec.GameID, RuntimeStatusEngineUnreachable, "")
// On engine unreachable, also clear skip_next_tick so the next
// real tick can start fresh.
_ = sch.clearSkipFlag(ctx, rec.GameID)
// Best-effort: ask Docker whether the container is still
// alive; if it's gone we mark the runtime row as removed.
if rec.CurrentContainerID != "" {
if _, inspErr := sch.svc.deps.Docker.InspectContainer(ctx, rec.CurrentContainerID); errors.Is(inspErr, dockerclient.ErrContainerNotFound) {
_, _ = sch.svc.transitionRuntimeStatus(ctx, rec.GameID, RuntimeStatusRemoved, "")
}
}
return err
}
if err := sch.svc.publishSnapshot(ctx, rec.GameID, state); err != nil {
sch.svc.completeOperation(ctx, op, err)
return err
}
sch.svc.completeOperation(ctx, op, nil)
_ = sch.clearSkipFlag(ctx, rec.GameID)
return nil
}
func (sch *Scheduler) clearSkipFlag(ctx context.Context, gameID uuid.UUID) error {
rec, ok := sch.svc.deps.Cache.GetRuntime(gameID)
if !ok || !rec.SkipNextTick {
return nil
}
skip := false
now := sch.svc.deps.Now().UTC()
updated, err := sch.svc.deps.Store.UpdateRuntimeRecord(ctx, gameID, runtimeRecordUpdate{SkipNextTick: &skip}, now)
if err != nil {
return err
}
sch.svc.deps.Cache.PutRuntime(updated)
return nil
}
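A minimal sketch of the wait computation performed inside loop, assuming galaxy/cronutil accepts the turn_schedule expression as stored; only cronutil.Parse and Schedule.Next are taken from the code above, the helper name is hypothetical.
func exampleNextWait(turnSchedule string, now time.Time) (time.Duration, error) {
	schedule, err := cronutil.Parse(turnSchedule)
	if err != nil {
		return 0, err // the real loop logs a warning and stops here
	}
	// skip_next_tick forces the effective wait to zero in the real loop.
	return schedule.Next(now).Sub(now), nil
}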
+908
View File
@@ -0,0 +1,908 @@
package runtime
import (
"context"
"encoding/json"
"errors"
"fmt"
"os"
"path/filepath"
"strings"
"sync"
"time"
"galaxy/backend/internal/dockerclient"
"galaxy/model/rest"
"github.com/google/uuid"
"go.uber.org/zap"
)
// Service is the runtime-domain entry point. It owns the per-game
// lifecycle (start, stop, pause, resume, restart, patch,
// force-next-turn), the runtime cache, the player-mapping projection,
// and the operation log; it coordinates with the worker pool and the
// per-game scheduler goroutines.
type Service struct {
deps Deps
gameMu sync.Map // uuid.UUID -> *sync.Mutex
scheduler *Scheduler
workers *WorkerPool
}
// NewService constructs a Service. Logger and Now default sensibly. The
// `Service` participates in the `app.Component` lifecycle through the
// WorkerPool / Scheduler / Reconciler components that callers register
// separately.
func NewService(deps Deps) (*Service, error) {
if deps.Store == nil {
return nil, errors.New("runtime: store must not be nil")
}
if deps.Cache == nil {
return nil, errors.New("runtime: cache must not be nil")
}
if deps.EngineVersions == nil {
return nil, errors.New("runtime: engine version service must not be nil")
}
if deps.Docker == nil {
return nil, errors.New("runtime: docker client must not be nil")
}
if deps.Engine == nil {
return nil, errors.New("runtime: engine client must not be nil")
}
if deps.Logger == nil {
deps.Logger = zap.NewNop()
}
deps.Logger = deps.Logger.Named("runtime")
if deps.Notification == nil {
deps.Notification = NewNoopNotificationPublisher(deps.Logger)
}
if deps.Now == nil {
deps.Now = time.Now
}
if deps.Config.WorkerPoolSize <= 0 {
deps.Config.WorkerPoolSize = 1
}
if deps.Config.JobQueueSize <= 0 {
deps.Config.JobQueueSize = 1
}
if deps.Config.StopGracePeriod <= 0 {
deps.Config.StopGracePeriod = 10 * time.Second
}
if deps.Config.ReconcileInterval <= 0 {
deps.Config.ReconcileInterval = 60 * time.Second
}
if strings.TrimSpace(deps.Config.ContainerStateMount) == "" {
deps.Config.ContainerStateMount = "/var/lib/galaxy-game"
}
if !dockerclient.PullPolicy(deps.Config.ImagePullPolicy).IsKnown() {
return nil, fmt.Errorf("runtime: invalid image pull policy %q", deps.Config.ImagePullPolicy)
}
svc := &Service{deps: deps}
svc.scheduler = NewScheduler(svc)
svc.workers = NewWorkerPool(svc)
return svc, nil
}
// Logger exposes the named logger used by the service.
func (s *Service) Logger() *zap.Logger { return s.deps.Logger }
// Cache returns the in-memory projection.
func (s *Service) Cache() *Cache { return s.deps.Cache }
// EngineVersions returns the engine-version registry service.
func (s *Service) EngineVersions() *EngineVersionService { return s.deps.EngineVersions }
// Workers returns the runtime worker pool component.
func (s *Service) Workers() *WorkerPool { return s.workers }
// Reconciler builds an `app.Component` driving the periodic
// reconciliation loop documented in PLAN.md §5.5.
func (s *Service) Reconciler() *Reconciler { return NewReconciler(s) }
// SchedulerComponent returns the per-game scheduler bookkeeper. It
// implements `app.Component` so main.go can register it alongside the
// worker pool.
func (s *Service) SchedulerComponent() *Scheduler { return s.scheduler }
// gameLock returns a sync.Mutex unique to gameID. Used to serialise
// per-game runtime operations across goroutines.
func (s *Service) gameLock(gameID uuid.UUID) *sync.Mutex {
if v, ok := s.gameMu.Load(gameID); ok {
return v.(*sync.Mutex)
}
v, _ := s.gameMu.LoadOrStore(gameID, &sync.Mutex{})
return v.(*sync.Mutex)
}
// =====================================================================
// Lifecycle entry points (consumed by lobby.RuntimeGateway adapter)
// =====================================================================
// StartGame queues a start job for gameID. Returns once the operation
// is durably recorded; the actual pull / create / start runs on a
// worker goroutine.
func (s *Service) StartGame(ctx context.Context, gameID uuid.UUID) error {
op, err := s.beginOperation(ctx, gameID, OpStart, OpSourceLobby)
if err != nil {
return err
}
return s.enqueue(ctx, jobStart{operation: op})
}
// StopGame queues a stop job for gameID.
func (s *Service) StopGame(ctx context.Context, gameID uuid.UUID) error {
op, err := s.beginOperation(ctx, gameID, OpStop, OpSourceLobby)
if err != nil {
return err
}
return s.enqueue(ctx, jobStop{operation: op})
}
// PauseGame flips the runtime row's `paused` flag. The container
// keeps running; the scheduler short-circuits ticks while paused.
// Synchronous because no Docker call is involved.
func (s *Service) PauseGame(ctx context.Context, gameID uuid.UUID) error {
mu := s.gameLock(gameID)
mu.Lock()
defer mu.Unlock()
now := s.deps.Now().UTC()
paused := true
pausedAtPtr := &now
patch := runtimeRecordUpdate{Paused: &paused, PausedAt: &pausedAtPtr}
rec, err := s.deps.Store.UpdateRuntimeRecord(ctx, gameID, patch, now)
if err != nil {
return err
}
s.deps.Cache.PutRuntime(rec)
s.recordSyncOperation(ctx, gameID, OpPause, OpSourceLobby, rec.CurrentImageRef, rec.CurrentContainerID, nil)
return nil
}
// ResumeGame clears the `paused` flag. Synchronous.
func (s *Service) ResumeGame(ctx context.Context, gameID uuid.UUID) error {
mu := s.gameLock(gameID)
mu.Lock()
defer mu.Unlock()
now := s.deps.Now().UTC()
paused := false
var nilTime *time.Time
cleared := &nilTime
patch := runtimeRecordUpdate{Paused: &paused, PausedAt: cleared}
rec, err := s.deps.Store.UpdateRuntimeRecord(ctx, gameID, patch, now)
if err != nil {
return err
}
s.deps.Cache.PutRuntime(rec)
s.recordSyncOperation(ctx, gameID, OpResume, OpSourceLobby, rec.CurrentImageRef, rec.CurrentContainerID, nil)
return nil
}
// AdminRestart queues a restart job. Stop + remove + run with the
// same image_ref.
func (s *Service) AdminRestart(ctx context.Context, gameID uuid.UUID) (OperationLog, error) {
op, err := s.beginOperation(ctx, gameID, OpRestart, OpSourceAdmin)
if err != nil {
return OperationLog{}, err
}
if err := s.enqueue(ctx, jobRestart{operation: op}); err != nil {
return OperationLog{}, err
}
return op, nil
}
// AdminPatch validates the target version against the registry, then
// queues a stop + remove + run with the new image. Returns
// ErrPatchSemverIncompatible when the target crosses major/minor.
func (s *Service) AdminPatch(ctx context.Context, gameID uuid.UUID, targetVersion string) (OperationLog, error) {
rec, err := s.GetRuntime(ctx, gameID)
if err != nil {
return OperationLog{}, err
}
if rec.CurrentEngineVersion == "" {
return OperationLog{}, fmt.Errorf("%w: runtime has no current engine version", ErrConflict)
}
if err := CheckPatchCompatible(rec.CurrentEngineVersion, targetVersion); err != nil {
return OperationLog{}, err
}
target, err := s.deps.EngineVersions.Resolve(ctx, targetVersion)
if err != nil {
return OperationLog{}, err
}
op, err := s.beginOperation(ctx, gameID, OpPatch, OpSourceAdmin)
if err != nil {
return OperationLog{}, err
}
if err := s.enqueue(ctx, jobPatch{operation: op, target: target}); err != nil {
return OperationLog{}, err
}
return op, nil
}
// AdminForceNextTurn sets the skip_next_tick flag so the next
// scheduler tick fires immediately. Synchronous.
func (s *Service) AdminForceNextTurn(ctx context.Context, gameID uuid.UUID) (OperationLog, error) {
mu := s.gameLock(gameID)
mu.Lock()
defer mu.Unlock()
now := s.deps.Now().UTC()
skip := true
rec, err := s.deps.Store.UpdateRuntimeRecord(ctx, gameID, runtimeRecordUpdate{SkipNextTick: &skip}, now)
if err != nil {
return OperationLog{}, err
}
s.deps.Cache.PutRuntime(rec)
op := s.recordSyncOperation(ctx, gameID, OpForceNextTurn, OpSourceAdmin, rec.CurrentImageRef, rec.CurrentContainerID, nil)
return op, nil
}
// GetRuntime returns the runtime record for gameID, cache-first.
func (s *Service) GetRuntime(ctx context.Context, gameID uuid.UUID) (RuntimeRecord, error) {
if rec, ok := s.deps.Cache.GetRuntime(gameID); ok {
return rec, nil
}
rec, err := s.deps.Store.LoadRuntimeRecord(ctx, gameID)
if err != nil {
return RuntimeRecord{}, err
}
s.deps.Cache.PutRuntime(rec)
return rec, nil
}
// ResolvePlayerMapping returns the (race_name, engine_player_uuid)
// projection for the supplied (game_id, user_id). Used by the user
// game-proxy handlers to populate the engine `actor` field.
func (s *Service) ResolvePlayerMapping(ctx context.Context, gameID, userID uuid.UUID) (PlayerMapping, error) {
return s.deps.Store.LoadPlayerMapping(ctx, gameID, userID)
}
// EngineEndpoint returns the engine endpoint URL for gameID. Used by
// the user game-proxy handlers.
func (s *Service) EngineEndpoint(ctx context.Context, gameID uuid.UUID) (string, error) {
rec, err := s.GetRuntime(ctx, gameID)
if err != nil {
return "", err
}
if rec.EngineEndpoint == "" {
return "", fmt.Errorf("%w: runtime has no engine endpoint", ErrConflict)
}
return rec.EngineEndpoint, nil
}
// =====================================================================
// Worker / job execution
// =====================================================================
// job is the internal interface implemented by every long-running
// runtime task. The worker pool dispatches them in order.
type job interface {
GameID() uuid.UUID
Run(ctx context.Context, s *Service) error
Operation() OperationLog
}
type jobStart struct{ operation OperationLog }
type jobStop struct{ operation OperationLog }
type jobRestart struct{ operation OperationLog }
type jobPatch struct {
operation OperationLog
target EngineVersion
}
func (j jobStart) GameID() uuid.UUID { return j.operation.GameID }
func (j jobStop) GameID() uuid.UUID { return j.operation.GameID }
func (j jobRestart) GameID() uuid.UUID { return j.operation.GameID }
func (j jobPatch) GameID() uuid.UUID { return j.operation.GameID }
func (j jobStart) Operation() OperationLog { return j.operation }
func (j jobStop) Operation() OperationLog { return j.operation }
func (j jobRestart) Operation() OperationLog { return j.operation }
func (j jobPatch) Operation() OperationLog { return j.operation }
func (j jobStart) Run(ctx context.Context, s *Service) error { return s.runStart(ctx, j.operation) }
func (j jobStop) Run(ctx context.Context, s *Service) error { return s.runStop(ctx, j.operation) }
func (j jobRestart) Run(ctx context.Context, s *Service) error {
return s.runRestart(ctx, j.operation)
}
func (j jobPatch) Run(ctx context.Context, s *Service) error {
return s.runPatch(ctx, j.operation, j.target)
}
// enqueue places job onto the worker channel. Returns ErrJobQueueFull
// when the channel is at capacity; ErrShutdown when the pool is
// stopped.
func (s *Service) enqueue(ctx context.Context, j job) error {
if s.workers == nil {
return ErrShutdown
}
return s.workers.submit(ctx, j)
}
// beginOperation persists a queued operation log row. Caller is
// responsible for transitioning it to running/succeeded/failed via
// completeOperation.
func (s *Service) beginOperation(ctx context.Context, gameID uuid.UUID, op, source string) (OperationLog, error) {
in := operationLogInsert{
OperationID: uuid.New(),
GameID: gameID,
Op: op,
Source: source,
Status: OpStatusQueued,
StartedAt: s.deps.Now().UTC(),
}
return s.deps.Store.InsertOperationLog(ctx, in)
}
// recordSyncOperation logs an operation that completed synchronously
// (pause / resume / force-next-turn). It writes both the queued and
// the terminal row to keep the audit trail consistent with worker
// jobs.
func (s *Service) recordSyncOperation(ctx context.Context, gameID uuid.UUID, op, source, imageRef, containerID string, runErr error) OperationLog {
in := operationLogInsert{
OperationID: uuid.New(),
GameID: gameID,
Op: op,
Source: source,
Status: OpStatusRunning,
ImageRef: imageRef,
ContainerID: containerID,
StartedAt: s.deps.Now().UTC(),
}
rec, err := s.deps.Store.InsertOperationLog(ctx, in)
if err != nil {
s.deps.Logger.Warn("operation log insert failed",
zap.String("game_id", gameID.String()),
zap.String("op", op),
zap.Error(err))
return OperationLog{}
}
status := OpStatusSucceeded
errCode := ""
errMsg := ""
if runErr != nil {
status = OpStatusFailed
errCode = "internal_error"
errMsg = runErr.Error()
}
completed, err := s.deps.Store.CompleteOperationLog(ctx, rec.OperationID, status, errCode, errMsg, s.deps.Now().UTC())
if err != nil {
s.deps.Logger.Warn("operation log complete failed",
zap.String("game_id", gameID.String()),
zap.String("op", op),
zap.Error(err))
return rec
}
return completed
}
// completeOperation flips the row to a terminal status. runErr is nil
// on success.
func (s *Service) completeOperation(ctx context.Context, op OperationLog, runErr error) {
status := OpStatusSucceeded
errCode := ""
errMsg := ""
if runErr != nil {
status = OpStatusFailed
errCode = "internal_error"
errMsg = runErr.Error()
}
if _, err := s.deps.Store.CompleteOperationLog(ctx, op.OperationID, status, errCode, errMsg, s.deps.Now().UTC()); err != nil {
s.deps.Logger.Warn("operation log complete failed",
zap.String("game_id", op.GameID.String()),
zap.String("op", op.Op),
zap.String("operation_id", op.OperationID.String()),
zap.Error(err))
}
}
// =====================================================================
// runStart — the heart of the package
// =====================================================================
func (s *Service) runStart(ctx context.Context, op OperationLog) error {
gameID := op.GameID
mu := s.gameLock(gameID)
mu.Lock()
defer mu.Unlock()
game, err := s.deps.Store.LoadGameProjection(ctx, gameID)
if err != nil {
s.completeOperation(ctx, op, err)
return err
}
if strings.TrimSpace(game.TargetEngineVersion) == "" {
err := fmt.Errorf("%w: game has no target_engine_version", ErrInvalidInput)
s.publishStartConfigInvalid(ctx, op, "target_engine_version is empty")
s.completeOperation(ctx, op, err)
return err
}
memberships, err := s.deps.Store.ListActiveMemberships(ctx, gameID)
if err != nil {
s.completeOperation(ctx, op, err)
return err
}
if len(memberships) == 0 {
err := fmt.Errorf("%w: game has no active memberships", ErrConflict)
s.publishStartConfigInvalid(ctx, op, "no active memberships")
s.completeOperation(ctx, op, err)
return err
}
version, err := s.deps.EngineVersions.Resolve(ctx, game.TargetEngineVersion)
if err != nil {
s.publishStartConfigInvalid(ctx, op, fmt.Sprintf("engine version %q: %v", game.TargetEngineVersion, err))
s.completeOperation(ctx, op, err)
return err
}
mappings := make([]PlayerMapping, 0, len(memberships))
races := make([]rest.InitRace, 0, len(memberships))
for _, m := range memberships {
mappings = append(mappings, PlayerMapping{
GameID: gameID,
UserID: m.UserID,
RaceName: m.RaceName,
EnginePlayerUUID: uuid.New(),
})
races = append(races, rest.InitRace{RaceName: m.RaceName})
}
if err := s.deps.Store.InsertPlayerMappings(ctx, mappings); err != nil {
s.completeOperation(ctx, op, err)
return err
}
statePath := filepath.Join(filepath.Clean(s.deps.Config.ContainerStateMount), gameID.String())
hostStatePath := filepath.Join(filepath.Clean(s.hostStateRoot()), gameID.String())
// Bind-mount sources are resolved by the Docker daemon against
// the host filesystem, not against the backend process namespace.
// Production deploys mount the same `BACKEND_GAME_STATE_ROOT`
// path into the backend container at the same path, so creating
// the per-game subdirectory inside backend makes it visible to
// the daemon at the same absolute path.
//
// The directory is created with mode 0o777 (and explicitly
// chmod-ed to override umask) because the engine container may
// run as a different uid than backend. Both processes need
// read-write access to the bind-mounted state path; backend has
// no way to know the engine container's uid ahead of time, so
// world-writable is the conservative default. Production
// deployments that pin both containers to the same user can
// tighten the mode through a future configuration knob.
if err := os.MkdirAll(hostStatePath, 0o777); err != nil {
s.completeOperation(ctx, op, fmt.Errorf("create host state path %q: %w", hostStatePath, err))
return err
}
if err := os.Chmod(hostStatePath, 0o777); err != nil {
s.completeOperation(ctx, op, fmt.Errorf("chmod host state path %q: %w", hostStatePath, err))
return err
}
spec := dockerclient.RunSpec{
Name: ContainerName(gameID.String()),
Image: version.ImageRef,
Hostname: HostName(gameID.String()),
Network: s.dockerNetwork(),
Env: map[string]string{
"GAME_STATE_PATH": statePath,
},
Labels: map[string]string{
"galaxy.game_id": gameID.String(),
"galaxy.engine_version": version.Version,
},
BindMounts: []dockerclient.BindMount{
{
HostPath: hostStatePath,
MountPath: s.deps.Config.ContainerStateMount,
ReadOnly: false,
},
},
LogDriver: s.deps.Config.ContainerLogDriver,
LogOpts: s.deps.Config.ContainerLogOpts,
CPUQuota: s.deps.Config.ContainerCPUQuota,
Memory: s.deps.Config.ContainerMemory,
PIDsLimit: s.deps.Config.ContainerPIDsLimit,
PullPolicy: dockerclient.PullPolicy(s.deps.Config.ImagePullPolicy),
}
runResult, err := s.deps.Docker.Run(ctx, spec)
if err != nil {
s.publishStartFailure(ctx, op, version.ImageRef, err)
s.completeOperation(ctx, op, err)
return err
}
now := s.deps.Now().UTC()
startedAt := runResult.StartedAt
if startedAt.IsZero() {
startedAt = now
}
startedAtPtr := &startedAt
rec, err := s.upsertRuntimeRecord(ctx, runtimeRecordInsert{
GameID: gameID,
Status: RuntimeStatusStarting,
CurrentContainerID: runResult.ContainerID,
CurrentImageRef: version.ImageRef,
CurrentEngineVersion: version.Version,
EngineEndpoint: runResult.EngineEndpoint,
StatePath: statePath,
DockerNetwork: s.dockerNetwork(),
TurnSchedule: game.TurnSchedule,
StartedAt: &startedAt,
}, runtimeRecordUpdate{
Status: strPtr(RuntimeStatusStarting),
CurrentContainerID: strPtr(runResult.ContainerID),
CurrentImageRef: strPtr(version.ImageRef),
CurrentEngineVersion: strPtr(version.Version),
EngineEndpoint: strPtr(runResult.EngineEndpoint),
StatePath: strPtr(statePath),
DockerNetwork: strPtr(s.dockerNetwork()),
TurnSchedule: strPtr(game.TurnSchedule),
StartedAt: &startedAtPtr,
})
if err != nil {
s.completeOperation(ctx, op, err)
return err
}
// Wait for the engine HTTP listener before issuing init. Docker
// reports the container as running as soon as the entrypoint
// starts, but the Go binary inside may take a moment to bind
// the port; without this loop, Init races the listener and
// fails with `connection refused`.
if err := s.waitForEngineHealthz(ctx, runResult.EngineEndpoint, 30*time.Second); err != nil {
s.deps.Logger.Warn("engine healthz never succeeded",
zap.String("game_id", gameID.String()),
zap.Error(err))
s.transitionRuntimeStatus(ctx, gameID, RuntimeStatusEngineUnreachable, "")
s.completeOperation(ctx, op, err)
return err
}
initResp, err := s.deps.Engine.Init(ctx, runResult.EngineEndpoint, rest.InitRequest{Races: races})
if err != nil {
s.deps.Logger.Warn("engine init failed",
zap.String("game_id", gameID.String()),
zap.Error(err))
s.transitionRuntimeStatus(ctx, gameID, RuntimeStatusEngineUnreachable, "")
s.completeOperation(ctx, op, err)
return err
}
// Engine is up. Transition the runtime row to running and publish
// the snapshot into lobby.
rec, err = s.transitionRuntimeStatus(ctx, gameID, RuntimeStatusRunning, "ok")
if err != nil {
s.completeOperation(ctx, op, err)
return err
}
s.scheduler.startGame(rec)
if err := s.publishSnapshot(ctx, gameID, initResp); err != nil {
s.deps.Logger.Warn("publish init snapshot failed",
zap.String("game_id", gameID.String()),
zap.Error(err))
}
s.completeOperation(ctx, op, nil)
return nil
}
// runStop stops + removes the engine container and transitions the
// runtime row to `stopped`.
func (s *Service) runStop(ctx context.Context, op OperationLog) error {
gameID := op.GameID
mu := s.gameLock(gameID)
mu.Lock()
defer mu.Unlock()
rec, err := s.GetRuntime(ctx, gameID)
if err != nil {
s.completeOperation(ctx, op, err)
return err
}
s.scheduler.stopGame(gameID)
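// Stop and remove the recorded container. A container Docker no longer
// knows about (ErrContainerNotFound) is treated as already stopped.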
if rec.CurrentContainerID != "" {
if err := s.deps.Docker.Stop(ctx, rec.CurrentContainerID, int(s.deps.Config.StopGracePeriod/time.Second)); err != nil && !errors.Is(err, dockerclient.ErrContainerNotFound) {
s.completeOperation(ctx, op, err)
return err
}
if err := s.deps.Docker.Remove(ctx, rec.CurrentContainerID); err != nil {
s.completeOperation(ctx, op, err)
return err
}
}
now := s.deps.Now().UTC()
stoppedAtPtr := &now
updated, err := s.deps.Store.UpdateRuntimeRecord(ctx, gameID, runtimeRecordUpdate{
Status: strPtr(RuntimeStatusStopped),
StoppedAt: &stoppedAtPtr,
}, now)
if err != nil {
s.completeOperation(ctx, op, err)
return err
}
s.deps.Cache.PutRuntime(updated)
if err := s.deps.Store.DeletePlayerMappingsForGame(ctx, gameID); err != nil {
s.deps.Logger.Warn("delete player_mappings on stop failed",
zap.String("game_id", gameID.String()),
zap.Error(err))
}
s.completeOperation(ctx, op, nil)
return nil
}
// runRestart stops + removes the current container and then starts a
// fresh one with the same image_ref by delegating to runStart, which
// re-loads the lobby projection.
func (s *Service) runRestart(ctx context.Context, op OperationLog) error {
if err := s.runStop(ctx, op); err != nil {
return err
}
// Reuse runStart with a freshly minted operation row so the audit
// trail remains consistent.
startOp, err := s.beginOperation(ctx, op.GameID, OpStart, op.Source)
if err != nil {
return err
}
return s.runStart(ctx, startOp)
}
// runPatch stops + removes the current container, updates the engine
// version reference, and starts a fresh container.
func (s *Service) runPatch(ctx context.Context, op OperationLog, target EngineVersion) error {
mu := s.gameLock(op.GameID)
mu.Lock()
defer mu.Unlock()
rec, err := s.GetRuntime(ctx, op.GameID)
if err != nil {
s.completeOperation(ctx, op, err)
return err
}
s.scheduler.stopGame(op.GameID)
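// As in runStop, a container Docker no longer knows about is not an error.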
if rec.CurrentContainerID != "" {
if err := s.deps.Docker.Stop(ctx, rec.CurrentContainerID, int(s.deps.Config.StopGracePeriod/time.Second)); err != nil && !errors.Is(err, dockerclient.ErrContainerNotFound) {
s.completeOperation(ctx, op, err)
return err
}
if err := s.deps.Docker.Remove(ctx, rec.CurrentContainerID); err != nil {
s.completeOperation(ctx, op, err)
return err
}
}
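// Older runtime rows may lack state_path; rebuild the in-container path
// from configuration and recompute the host-side path so the patched
// container reuses the same state directory.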
statePath := rec.StatePath
if statePath == "" {
statePath = filepath.Join(filepath.Clean(s.deps.Config.ContainerStateMount), op.GameID.String())
}
hostStatePath := filepath.Join(filepath.Clean(s.hostStateRoot()), op.GameID.String())
spec := dockerclient.RunSpec{
Name: ContainerName(op.GameID.String()),
Image: target.ImageRef,
Hostname: HostName(op.GameID.String()),
Network: s.dockerNetwork(),
Env: map[string]string{
"GAME_STATE_PATH": statePath,
},
Labels: map[string]string{
"galaxy.game_id": op.GameID.String(),
"galaxy.engine_version": target.Version,
},
BindMounts: []dockerclient.BindMount{
{HostPath: hostStatePath, MountPath: s.deps.Config.ContainerStateMount},
},
LogDriver: s.deps.Config.ContainerLogDriver,
LogOpts: s.deps.Config.ContainerLogOpts,
CPUQuota: s.deps.Config.ContainerCPUQuota,
Memory: s.deps.Config.ContainerMemory,
PIDsLimit: s.deps.Config.ContainerPIDsLimit,
PullPolicy: dockerclient.PullPolicy(s.deps.Config.ImagePullPolicy),
}
runResult, err := s.deps.Docker.Run(ctx, spec)
if err != nil {
s.publishStartFailure(ctx, op, target.ImageRef, err)
s.completeOperation(ctx, op, err)
return err
}
now := s.deps.Now().UTC()
startedAt := runResult.StartedAt
if startedAt.IsZero() {
startedAt = now
}
startedAtPtr := &startedAt
updated, err := s.deps.Store.UpdateRuntimeRecord(ctx, op.GameID, runtimeRecordUpdate{
Status: strPtr(RuntimeStatusRunning),
CurrentContainerID: strPtr(runResult.ContainerID),
CurrentImageRef: strPtr(target.ImageRef),
CurrentEngineVersion: strPtr(target.Version),
EngineEndpoint: strPtr(runResult.EngineEndpoint),
StartedAt: &startedAtPtr,
EngineHealth: strPtr("ok"),
}, now)
if err != nil {
s.completeOperation(ctx, op, err)
return err
}
s.deps.Cache.PutRuntime(updated)
s.scheduler.startGame(updated)
s.completeOperation(ctx, op, nil)
return nil
}
// =====================================================================
// Snapshot / status helpers
// =====================================================================
// publishSnapshot writes a runtime_health_snapshots row, refreshes the
// runtime cache from `current_turn` / `engine_health`, and forwards
// the snapshot to lobby.
func (s *Service) publishSnapshot(ctx context.Context, gameID uuid.UUID, state rest.StateResponse) error {
now := s.deps.Now().UTC()
payload, err := json.Marshal(state)
if err != nil {
return fmt.Errorf("marshal snapshot: %w", err)
}
if err := s.deps.Store.InsertHealthSnapshot(ctx, uuid.New(), gameID, now, payload); err != nil {
return err
}
currentTurn := int32(state.Turn)
patch := runtimeRecordUpdate{
CurrentTurn: &currentTurn,
EngineHealth: strPtr("ok"),
LastObservedAt: dblTime(now),
}
if state.Finished {
patch.Status = strPtr(RuntimeStatusFinished)
finishedAtPtr := &now
patch.FinishedAt = &finishedAtPtr
}
rec, err := s.deps.Store.UpdateRuntimeRecord(ctx, gameID, patch, now)
if err != nil {
return err
}
s.deps.Cache.PutRuntime(rec)
if s.deps.Lobby != nil {
mappings, err := s.deps.Store.ListPlayerMappingsForGame(ctx, gameID)
if err != nil {
s.deps.Logger.Warn("list player_mappings on snapshot failed",
zap.String("game_id", gameID.String()),
zap.Error(err))
}
userByEngine := make(map[uuid.UUID]uuid.UUID, len(mappings))
userByRace := make(map[string]uuid.UUID, len(mappings))
for _, m := range mappings {
userByEngine[m.EnginePlayerUUID] = m.UserID
userByRace[m.RaceName] = m.UserID
}
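// Resolve each engine player back to a backend user: prefer the
// engine-assigned player UUID, fall back to the race name, and skip
// players that map to no known user.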
stats := make([]LobbyPlayerStats, 0, len(state.Players))
for _, p := range state.Players {
userID, ok := userByEngine[p.ID]
if !ok {
userID = userByRace[p.RaceName]
}
if userID == uuid.Nil {
continue
}
stats = append(stats, LobbyPlayerStats{
UserID: userID,
CurrentPlanets: int32(p.Planets),
CurrentPopulation: int32(p.Population),
MaxPlanets: int32(p.Planets),
MaxPopulation: int32(p.Population),
})
}
runtimeStatus := RuntimeStatusRunning
if state.Finished {
runtimeStatus = RuntimeStatusFinished
}
err = s.deps.Lobby.OnRuntimeSnapshot(ctx, gameID, LobbySnapshot{
CurrentTurn: currentTurn,
RuntimeStatus: runtimeStatus,
EngineHealth: "ok",
ObservedAt: now,
PlayerStats: stats,
})
if err != nil {
s.deps.Logger.Warn("lobby snapshot consumer failed",
zap.String("game_id", gameID.String()),
zap.Error(err))
}
}
return nil
}
// transitionRuntimeStatus updates the status / engine_health columns
// and refreshes the cache.
func (s *Service) transitionRuntimeStatus(ctx context.Context, gameID uuid.UUID, status, health string) (RuntimeRecord, error) {
now := s.deps.Now().UTC()
patch := runtimeRecordUpdate{Status: &status}
if health != "" {
patch.EngineHealth = &health
}
if status == RuntimeStatusFinished {
finishedAtPtr := &now
patch.FinishedAt = &finishedAtPtr
}
if status == RuntimeStatusStopped {
stoppedAtPtr := &now
patch.StoppedAt = &stoppedAtPtr
}
rec, err := s.deps.Store.UpdateRuntimeRecord(ctx, gameID, patch, now)
if err != nil {
return RuntimeRecord{}, err
}
s.deps.Cache.PutRuntime(rec)
return rec, nil
}
// upsertRuntimeRecord inserts the record when no row exists; updates
// it otherwise. Used by runStart so a re-attempt after a worker crash
// stays idempotent.
func (s *Service) upsertRuntimeRecord(ctx context.Context, in runtimeRecordInsert, patch runtimeRecordUpdate) (RuntimeRecord, error) {
rec, err := s.deps.Store.InsertRuntimeRecord(ctx, in)
if err == nil {
s.deps.Cache.PutRuntime(rec)
return rec, nil
}
if !errors.Is(err, ErrConflict) {
return RuntimeRecord{}, err
}
updated, err := s.deps.Store.UpdateRuntimeRecord(ctx, in.GameID, patch, s.deps.Now().UTC())
if err != nil {
return RuntimeRecord{}, err
}
s.deps.Cache.PutRuntime(updated)
return updated, nil
}
// dockerNetwork returns the user-defined Docker network name engine
// containers attach to. Wired from cfg.Docker.Network through Deps.
func (s *Service) dockerNetwork() string { return s.deps.DockerNetwork }
// waitForEngineHealthz polls the engine `/healthz` endpoint until it
// responds 2xx or until the timeout elapses. The Docker daemon
// reports a container as `running` as soon as the entrypoint starts,
// but the engine binary may need a moment to bind its TCP port; the
// retry loop bridges that gap so the immediately-following Init call
// does not race the listener.
func (s *Service) waitForEngineHealthz(ctx context.Context, baseURL string, timeout time.Duration) error {
deadline := time.Now().Add(timeout)
var lastErr error
for {
probeCtx, cancel := context.WithTimeout(ctx, time.Second)
err := s.deps.Engine.Healthz(probeCtx, baseURL)
cancel()
if err == nil {
return nil
}
lastErr = err
if time.Now().After(deadline) {
return fmt.Errorf("engine healthz never succeeded within %s: %w", timeout, lastErr)
}
select {
case <-ctx.Done():
return ctx.Err()
case <-time.After(200 * time.Millisecond):
}
}
}
// hostStateRoot returns the host-side root directory under which the
// per-game state directory is created. Wired from cfg.Game.StateRoot
// through Deps; falls back to the in-container mount path when unset.
func (s *Service) hostStateRoot() string {
if s.deps.HostStateRoot != "" {
return s.deps.HostStateRoot
}
return s.deps.Config.ContainerStateMount
}
// strPtr returns a pointer to s. Helps assemble runtimeRecordUpdate
// values inline.
func strPtr(s string) *string { return &s }
// dblTime returns a `**time.Time` pointing at t. Used to set the
// nullable timestamp columns of `runtime_records` through
// runtimeRecordUpdate; clearing a column uses a pointer to a nil
// *time.Time instead.
func dblTime(t time.Time) **time.Time { p := &t; return &p }
@@ -0,0 +1,298 @@
package runtime_test
import (
"context"
"database/sql"
"encoding/json"
"net/http"
"net/http/httptest"
"net/url"
"sync"
"testing"
"time"
"galaxy/backend/internal/config"
"galaxy/backend/internal/dockerclient"
"galaxy/backend/internal/engineclient"
backendpg "galaxy/backend/internal/postgres"
"galaxy/backend/internal/runtime"
"galaxy/model/rest"
pgshared "galaxy/postgres"
"github.com/google/uuid"
testcontainers "github.com/testcontainers/testcontainers-go"
tcpostgres "github.com/testcontainers/testcontainers-go/modules/postgres"
"github.com/testcontainers/testcontainers-go/wait"
"go.uber.org/zap/zaptest"
)
const (
pgImage = "postgres:16-alpine"
pgUser = "galaxy"
pgPassword = "galaxy"
pgDatabase = "galaxy_backend"
pgSchema = "backend"
pgStartup = 90 * time.Second
pgOpTO = 10 * time.Second
)
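// dsnWithSearchPath appends a search_path query parameter so every
// connection resolves unqualified table names against the backend schema.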
func dsnWithSearchPath(raw, schema string) (string, error) {
parsed, err := url.Parse(raw)
if err != nil {
return "", err
}
q := parsed.Query()
q.Set("search_path", schema)
parsed.RawQuery = q.Encode()
return parsed.String(), nil
}
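// startPostgres boots a disposable Postgres testcontainer, scopes the
// DSN to the backend schema, applies the backend migrations, and returns
// the open handle. The test is skipped when no container runtime is
// available.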
func startPostgres(t *testing.T) *sql.DB {
t.Helper()
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
t.Cleanup(cancel)
container, err := tcpostgres.Run(ctx, pgImage,
tcpostgres.WithDatabase(pgDatabase),
tcpostgres.WithUsername(pgUser),
tcpostgres.WithPassword(pgPassword),
testcontainers.WithWaitStrategy(
wait.ForLog("database system is ready to accept connections").
WithOccurrence(2).
WithStartupTimeout(pgStartup),
),
)
if err != nil {
t.Skipf("postgres testcontainer unavailable, skipping: %v", err)
}
t.Cleanup(func() {
if termErr := testcontainers.TerminateContainer(container); termErr != nil {
t.Errorf("terminate postgres container: %v", termErr)
}
})
baseDSN, err := container.ConnectionString(ctx, "sslmode=disable")
if err != nil {
t.Fatalf("connection string: %v", err)
}
scopedDSN, err := dsnWithSearchPath(baseDSN, pgSchema)
if err != nil {
t.Fatalf("scope dsn: %v", err)
}
cfg := pgshared.DefaultConfig()
cfg.PrimaryDSN = scopedDSN
cfg.OperationTimeout = pgOpTO
db, err := pgshared.OpenPrimary(ctx, cfg)
if err != nil {
t.Fatalf("open primary: %v", err)
}
t.Cleanup(func() { _ = db.Close() })
if err := backendpg.ApplyMigrations(ctx, db); err != nil {
t.Fatalf("apply migrations: %v", err)
}
return db
}
// fakeDocker implements dockerclient.Client for tests.
type fakeDocker struct {
mu sync.Mutex
runs []dockerclient.RunSpec
stoppedIDs []string
removedIDs []string
listResult []dockerclient.ContainerSummary
endpointFor func(spec dockerclient.RunSpec) string
}
func (f *fakeDocker) EnsureNetwork(_ context.Context, _ string) error { return nil }
func (f *fakeDocker) PullImage(_ context.Context, _ string, _ dockerclient.PullPolicy) error {
return nil
}
func (f *fakeDocker) InspectImage(_ context.Context, ref string) (dockerclient.ImageInspect, error) {
return dockerclient.ImageInspect{Ref: ref}, nil
}
func (f *fakeDocker) InspectContainer(_ context.Context, _ string) (dockerclient.ContainerInspect, error) {
return dockerclient.ContainerInspect{}, nil
}
func (f *fakeDocker) Run(_ context.Context, spec dockerclient.RunSpec) (dockerclient.RunResult, error) {
f.mu.Lock()
defer f.mu.Unlock()
f.runs = append(f.runs, spec)
endpoint := "http://" + spec.Hostname + ":8080"
if f.endpointFor != nil {
endpoint = f.endpointFor(spec)
}
return dockerclient.RunResult{
ContainerID: "container-" + spec.Name,
EngineEndpoint: endpoint,
StartedAt: time.Now().UTC(),
}, nil
}
func (f *fakeDocker) Stop(_ context.Context, id string, _ int) error {
f.mu.Lock()
f.stoppedIDs = append(f.stoppedIDs, id)
f.mu.Unlock()
return nil
}
func (f *fakeDocker) Remove(_ context.Context, id string) error {
f.mu.Lock()
f.removedIDs = append(f.removedIDs, id)
f.mu.Unlock()
return nil
}
func (f *fakeDocker) List(_ context.Context, _ dockerclient.ListFilter) ([]dockerclient.ContainerSummary, error) {
return f.listResult, nil
}
// fakeLobbyConsumer captures runtime → lobby callbacks.
type fakeLobbyConsumer struct {
mu sync.Mutex
snapshots []runtime.LobbySnapshot
jobs []runtime.JobResult
}
func (f *fakeLobbyConsumer) OnRuntimeSnapshot(_ context.Context, _ uuid.UUID, snapshot runtime.LobbySnapshot) error {
f.mu.Lock()
defer f.mu.Unlock()
f.snapshots = append(f.snapshots, snapshot)
return nil
}
func (f *fakeLobbyConsumer) OnRuntimeJobResult(_ context.Context, _ uuid.UUID, result runtime.JobResult) error {
f.mu.Lock()
defer f.mu.Unlock()
f.jobs = append(f.jobs, result)
return nil
}
func TestServiceStartGameEndToEnd(t *testing.T) {
if testing.Short() {
t.Skip("postgres-backed test skipped in -short")
}
ctx := context.Background()
db := startPostgres(t)
gameID := uuid.New()
userID := uuid.New()
if _, err := db.ExecContext(ctx, `
INSERT INTO backend.games (
game_id, owner_user_id, visibility, status, game_name, description,
min_players, max_players, start_gap_hours, start_gap_players,
enrollment_ends_at, turn_schedule, target_engine_version,
runtime_snapshot
) VALUES ($1, NULL, 'public', 'starting', 'test-game', '',
1, 4, 0, 0, $2, '*/5 * * * *', '0.1.0', '{}'::jsonb)
`, gameID, time.Now().Add(time.Hour)); err != nil {
t.Fatalf("insert game: %v", err)
}
if _, err := db.ExecContext(ctx, `
INSERT INTO backend.memberships (membership_id, game_id, user_id, race_name, canonical_key, status)
VALUES ($1, $2, $3, 'Alpha', 'alpha', 'active')
`, uuid.New(), gameID, userID); err != nil {
t.Fatalf("insert membership: %v", err)
}
if _, err := db.ExecContext(ctx, `
INSERT INTO backend.engine_versions (version, image_ref, enabled)
VALUES ('0.1.0', 'galaxy-game:0.1.0', true)
`); err != nil {
t.Fatalf("insert engine version: %v", err)
}
engineSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
switch r.URL.Path {
case "/api/v1/admin/init":
_ = json.NewEncoder(w).Encode(rest.StateResponse{ID: gameID, Turn: 0, Players: []rest.PlayerState{{RaceName: "Alpha", Planets: 3, Population: 10}}})
case "/api/v1/admin/status":
_ = json.NewEncoder(w).Encode(rest.StateResponse{ID: gameID, Turn: 1, Players: []rest.PlayerState{{RaceName: "Alpha", Planets: 5, Population: 12}}})
case "/api/v1/admin/turn":
_ = json.NewEncoder(w).Encode(rest.StateResponse{ID: gameID, Turn: 2, Players: []rest.PlayerState{{RaceName: "Alpha", Planets: 6, Population: 14}}, Finished: true})
default:
http.NotFound(w, r)
}
}))
t.Cleanup(engineSrv.Close)
docker := &fakeDocker{endpointFor: func(_ dockerclient.RunSpec) string { return engineSrv.URL }}
engineCli, err := engineclient.NewClientWithHTTP(engineclient.Config{CallTimeout: time.Second, ProbeTimeout: time.Second}, engineSrv.Client())
if err != nil {
t.Fatalf("engineclient: %v", err)
}
store := runtime.NewStore(db)
cache := runtime.NewCache()
if err := cache.Warm(ctx, store); err != nil {
t.Fatalf("warm cache: %v", err)
}
versions := runtime.NewEngineVersionService(store, cache, nil)
consumer := &fakeLobbyConsumer{}
svc, err := runtime.NewService(runtime.Deps{
Store: store,
Cache: cache,
EngineVersions: versions,
Docker: docker,
Engine: engineCli,
Lobby: consumer,
DockerNetwork: "galaxy",
HostStateRoot: t.TempDir(),
Config: config.RuntimeConfig{
WorkerPoolSize: 1,
JobQueueSize: 4,
ReconcileInterval: time.Hour,
ImagePullPolicy: "if_missing",
ContainerLogDriver: "json-file",
ContainerCPUQuota: 1.0,
ContainerMemory: "128m",
ContainerPIDsLimit: 64,
ContainerStateMount: "/var/lib/galaxy-game",
StopGracePeriod: time.Second,
},
Logger: zaptest.NewLogger(t),
})
if err != nil {
t.Fatalf("NewService: %v", err)
}
// Drive StartGame through the public API. The App lifecycle does not
// run in this test, so the worker pool is started manually: StartGame
// enqueues the job, a goroutine runs Workers().Run to drain it, and
// the run context is cancelled via t.Cleanup when the test ends.
pool := svc.Workers()
runCtx, runCancel := context.WithCancel(ctx)
t.Cleanup(runCancel)
go func() { _ = pool.Run(runCtx) }()
if err := svc.StartGame(ctx, gameID); err != nil {
t.Fatalf("StartGame: %v", err)
}
deadline := time.Now().Add(5 * time.Second)
for time.Now().Before(deadline) {
rec, err := svc.GetRuntime(ctx, gameID)
if err == nil && rec.Status == runtime.RuntimeStatusRunning {
break
}
time.Sleep(50 * time.Millisecond)
}
rec, err := svc.GetRuntime(ctx, gameID)
if err != nil {
t.Fatalf("GetRuntime: %v", err)
}
if rec.Status != runtime.RuntimeStatusRunning {
t.Fatalf("runtime status = %s, want running", rec.Status)
}
if rec.CurrentImageRef != "galaxy-game:0.1.0" {
t.Fatalf("image_ref = %s", rec.CurrentImageRef)
}
consumer.mu.Lock()
snapshotCount := len(consumer.snapshots)
consumer.mu.Unlock()
if snapshotCount == 0 {
t.Fatalf("expected runtime snapshot")
}
mappings, err := store.ListPlayerMappingsForGame(ctx, gameID)
if err != nil {
t.Fatalf("ListPlayerMappingsForGame: %v", err)
}
if len(mappings) != 1 || mappings[0].UserID != userID {
t.Fatalf("unexpected mappings: %+v", mappings)
}
}
+714
@@ -0,0 +1,714 @@
package runtime
import (
"context"
"database/sql"
"errors"
"fmt"
"time"
"galaxy/backend/internal/postgres/jet/backend/model"
"galaxy/backend/internal/postgres/jet/backend/table"
"github.com/go-jet/jet/v2/postgres"
"github.com/go-jet/jet/v2/qrm"
"github.com/google/uuid"
)
// engineVersionsPK is the constraint name surfaced when a duplicate
// `version` is inserted. Postgres synthesises `<table>_pkey` for the
// primary-key constraint, matching the migration in
// `backend/internal/postgres/migrations/00001_init.sql:407`.
const engineVersionsPK = "engine_versions_pkey"
// runtimeRecordsPK is the constraint name surfaced when a duplicate
// `runtime_records.game_id` insert hits the primary key.
const runtimeRecordsPK = "runtime_records_pkey"
// playerMappingsRaceUnique mirrors
// `player_mappings_game_race_uidx`, the partial UNIQUE that enforces
// the one-race-per-game invariant.
const playerMappingsRaceUnique = "player_mappings_game_race_uidx"
// Store is the Postgres-backed query surface for the runtime package.
// All queries are built through go-jet against the generated table
// bindings under `backend/internal/postgres/jet/backend/table`.
type Store struct {
db *sql.DB
}
// NewStore constructs a Store wrapping db.
func NewStore(db *sql.DB) *Store { return &Store{db: db} }
// engineVersionColumns is the canonical projection used by every
// engine-version read path.
func engineVersionColumns() postgres.ColumnList {
v := table.EngineVersions
return postgres.ColumnList{v.Version, v.ImageRef, v.Enabled, v.CreatedAt, v.UpdatedAt}
}
// runtimeRecordColumns is the canonical projection used by every
// runtime-record read path.
func runtimeRecordColumns() postgres.ColumnList {
r := table.RuntimeRecords
return postgres.ColumnList{
r.GameID, r.Status, r.CurrentContainerID, r.CurrentImageRef,
r.CurrentEngineVersion, r.EngineEndpoint, r.StatePath, r.DockerNetwork,
r.TurnSchedule, r.CurrentTurn, r.NextGenerationAt, r.SkipNextTick,
r.Paused, r.PausedAt, r.EngineHealth,
r.CreatedAt, r.UpdatedAt, r.StartedAt, r.StoppedAt, r.FinishedAt,
r.RemovedAt, r.LastObservedAt,
}
}
// operationLogColumns is the canonical projection used by every read
// of `backend.runtime_operation_log`.
func operationLogColumns() postgres.ColumnList {
o := table.RuntimeOperationLog
return postgres.ColumnList{
o.OperationID, o.GameID, o.Op, o.Source, o.Status, o.ImageRef,
o.ContainerID, o.ErrorCode, o.ErrorMessage, o.StartedAt, o.FinishedAt,
}
}
// =====================================================================
// Engine version registry
// =====================================================================
// ListEngineVersions returns every engine_versions row ordered by
// created_at DESC.
func (s *Store) ListEngineVersions(ctx context.Context) ([]EngineVersion, error) {
v := table.EngineVersions
stmt := postgres.SELECT(engineVersionColumns()).
FROM(v).
ORDER_BY(v.CreatedAt.DESC(), v.Version.DESC())
var rows []model.EngineVersions
if err := stmt.QueryContext(ctx, s.db, &rows); err != nil {
return nil, fmt.Errorf("runtime store: list engine versions: %w", err)
}
out := make([]EngineVersion, 0, len(rows))
for _, row := range rows {
out = append(out, modelToEngineVersion(row))
}
return out, nil
}
// GetEngineVersion returns the row for version. Returns ErrNotFound
// when no row matches.
func (s *Store) GetEngineVersion(ctx context.Context, version string) (EngineVersion, error) {
v := table.EngineVersions
stmt := postgres.SELECT(engineVersionColumns()).
FROM(v).
WHERE(v.Version.EQ(postgres.String(version))).
LIMIT(1)
var row model.EngineVersions
if err := stmt.QueryContext(ctx, s.db, &row); err != nil {
if errors.Is(err, qrm.ErrNoRows) {
return EngineVersion{}, ErrNotFound
}
return EngineVersion{}, fmt.Errorf("runtime store: load engine version %q: %w", version, err)
}
return modelToEngineVersion(row), nil
}
// InsertEngineVersion persists a fresh engine version row. Returns
// ErrEngineVersionTaken when the primary key collides.
func (s *Store) InsertEngineVersion(ctx context.Context, version, imageRef string, enabled bool, now time.Time) (EngineVersion, error) {
v := table.EngineVersions
stmt := v.INSERT(v.Version, v.ImageRef, v.Enabled, v.CreatedAt, v.UpdatedAt).
VALUES(version, imageRef, enabled, now, now).
RETURNING(engineVersionColumns())
var row model.EngineVersions
if err := stmt.QueryContext(ctx, s.db, &row); err != nil {
if isUniqueViolation(err, engineVersionsPK) {
return EngineVersion{}, ErrEngineVersionTaken
}
return EngineVersion{}, fmt.Errorf("runtime store: insert engine version %q: %w", version, err)
}
return modelToEngineVersion(row), nil
}
// engineVersionUpdate carries the parameters for UpdateEngineVersion.
// Nil pointers leave the corresponding column alone.
type engineVersionUpdate struct {
ImageRef *string
Enabled *bool
}
// UpdateEngineVersion patches the supplied columns and bumps
// updated_at. Returns ErrNotFound when no row matches.
func (s *Store) UpdateEngineVersion(ctx context.Context, version string, patch engineVersionUpdate, now time.Time) (EngineVersion, error) {
v := table.EngineVersions
rest := []any{}
if patch.ImageRef != nil {
rest = append(rest, v.ImageRef.SET(postgres.String(*patch.ImageRef)))
}
if patch.Enabled != nil {
rest = append(rest, v.Enabled.SET(postgres.Bool(*patch.Enabled)))
}
stmt := v.UPDATE().
SET(v.UpdatedAt.SET(postgres.TimestampzT(now)), rest...).
WHERE(v.Version.EQ(postgres.String(version))).
RETURNING(engineVersionColumns())
var row model.EngineVersions
if err := stmt.QueryContext(ctx, s.db, &row); err != nil {
if errors.Is(err, qrm.ErrNoRows) {
return EngineVersion{}, ErrNotFound
}
return EngineVersion{}, fmt.Errorf("runtime store: update engine version %q: %w", version, err)
}
return modelToEngineVersion(row), nil
}
// =====================================================================
// Runtime records
// =====================================================================
// runtimeRecordInsert carries the parameters for InsertRuntimeRecord.
type runtimeRecordInsert struct {
GameID uuid.UUID
Status string
CurrentContainerID string
CurrentImageRef string
CurrentEngineVersion string
EngineEndpoint string
StatePath string
DockerNetwork string
TurnSchedule string
StartedAt *time.Time
}
// InsertRuntimeRecord creates a fresh row.
func (s *Store) InsertRuntimeRecord(ctx context.Context, in runtimeRecordInsert) (RuntimeRecord, error) {
r := table.RuntimeRecords
stmt := r.INSERT(
r.GameID, r.Status, r.CurrentContainerID, r.CurrentImageRef,
r.CurrentEngineVersion, r.EngineEndpoint, r.StatePath,
r.DockerNetwork, r.TurnSchedule, r.StartedAt,
).VALUES(
in.GameID, in.Status,
nullableString(in.CurrentContainerID),
nullableString(in.CurrentImageRef),
nullableString(in.CurrentEngineVersion),
in.EngineEndpoint,
nullableString(in.StatePath),
nullableString(in.DockerNetwork),
in.TurnSchedule,
nullableTime(in.StartedAt),
).RETURNING(runtimeRecordColumns())
var row model.RuntimeRecords
if err := stmt.QueryContext(ctx, s.db, &row); err != nil {
if isUniqueViolation(err, runtimeRecordsPK) {
return RuntimeRecord{}, ErrConflict
}
return RuntimeRecord{}, fmt.Errorf("runtime store: insert runtime_record %s: %w", in.GameID, err)
}
return modelToRuntimeRecord(row), nil
}
// LoadRuntimeRecord returns the row for gameID. Returns ErrNotFound
// when no row matches.
func (s *Store) LoadRuntimeRecord(ctx context.Context, gameID uuid.UUID) (RuntimeRecord, error) {
r := table.RuntimeRecords
stmt := postgres.SELECT(runtimeRecordColumns()).
FROM(r).
WHERE(r.GameID.EQ(postgres.UUID(gameID))).
LIMIT(1)
var row model.RuntimeRecords
if err := stmt.QueryContext(ctx, s.db, &row); err != nil {
if errors.Is(err, qrm.ErrNoRows) {
return RuntimeRecord{}, ErrNotFound
}
return RuntimeRecord{}, fmt.Errorf("runtime store: load runtime_record %s: %w", gameID, err)
}
return modelToRuntimeRecord(row), nil
}
// ListAllRuntimeRecords returns every row, used by Cache.Warm.
func (s *Store) ListAllRuntimeRecords(ctx context.Context) ([]RuntimeRecord, error) {
stmt := postgres.SELECT(runtimeRecordColumns()).FROM(table.RuntimeRecords)
var rows []model.RuntimeRecords
if err := stmt.QueryContext(ctx, s.db, &rows); err != nil {
return nil, fmt.Errorf("runtime store: list runtime_records: %w", err)
}
out := make([]RuntimeRecord, 0, len(rows))
for _, row := range rows {
out = append(out, modelToRuntimeRecord(row))
}
return out, nil
}
// runtimeRecordUpdate carries the parameters for UpdateRuntimeRecord.
// Pointer fields default to "leave alone" when nil.
type runtimeRecordUpdate struct {
Status *string
CurrentContainerID *string
CurrentImageRef *string
CurrentEngineVersion *string
EngineEndpoint *string
StatePath *string
DockerNetwork *string
TurnSchedule *string
CurrentTurn *int32
NextGenerationAt **time.Time
SkipNextTick *bool
Paused *bool
PausedAt **time.Time
EngineHealth *string
StartedAt **time.Time
StoppedAt **time.Time
FinishedAt **time.Time
RemovedAt **time.Time
LastObservedAt **time.Time
}
// UpdateRuntimeRecord patches the supplied columns. Pointer fields are
// translated into a dynamic SET list — only the fields the caller
// supplies are emitted in the UPDATE. Nullable timestamps use a
// `**time.Time` so callers can distinguish "leave alone" (outer nil)
// from "clear to NULL" (inner nil).
func (s *Store) UpdateRuntimeRecord(ctx context.Context, gameID uuid.UUID, patch runtimeRecordUpdate, now time.Time) (RuntimeRecord, error) {
r := table.RuntimeRecords
rest := []any{}
if patch.Status != nil {
rest = append(rest, r.Status.SET(postgres.String(*patch.Status)))
}
if patch.CurrentContainerID != nil {
rest = append(rest, r.CurrentContainerID.SET(nullableStringSetExpr(*patch.CurrentContainerID)))
}
if patch.CurrentImageRef != nil {
rest = append(rest, r.CurrentImageRef.SET(nullableStringSetExpr(*patch.CurrentImageRef)))
}
if patch.CurrentEngineVersion != nil {
rest = append(rest, r.CurrentEngineVersion.SET(nullableStringSetExpr(*patch.CurrentEngineVersion)))
}
if patch.EngineEndpoint != nil {
rest = append(rest, r.EngineEndpoint.SET(postgres.String(*patch.EngineEndpoint)))
}
if patch.StatePath != nil {
rest = append(rest, r.StatePath.SET(nullableStringSetExpr(*patch.StatePath)))
}
if patch.DockerNetwork != nil {
rest = append(rest, r.DockerNetwork.SET(nullableStringSetExpr(*patch.DockerNetwork)))
}
if patch.TurnSchedule != nil {
rest = append(rest, r.TurnSchedule.SET(postgres.String(*patch.TurnSchedule)))
}
if patch.CurrentTurn != nil {
rest = append(rest, r.CurrentTurn.SET(postgres.Int(int64(*patch.CurrentTurn))))
}
if patch.NextGenerationAt != nil {
rest = append(rest, r.NextGenerationAt.SET(timePtrSetExpr(*patch.NextGenerationAt)))
}
if patch.SkipNextTick != nil {
rest = append(rest, r.SkipNextTick.SET(postgres.Bool(*patch.SkipNextTick)))
}
if patch.Paused != nil {
rest = append(rest, r.Paused.SET(postgres.Bool(*patch.Paused)))
}
if patch.PausedAt != nil {
rest = append(rest, r.PausedAt.SET(timePtrSetExpr(*patch.PausedAt)))
}
if patch.EngineHealth != nil {
rest = append(rest, r.EngineHealth.SET(postgres.String(*patch.EngineHealth)))
}
if patch.StartedAt != nil {
rest = append(rest, r.StartedAt.SET(timePtrSetExpr(*patch.StartedAt)))
}
if patch.StoppedAt != nil {
rest = append(rest, r.StoppedAt.SET(timePtrSetExpr(*patch.StoppedAt)))
}
if patch.FinishedAt != nil {
rest = append(rest, r.FinishedAt.SET(timePtrSetExpr(*patch.FinishedAt)))
}
if patch.RemovedAt != nil {
rest = append(rest, r.RemovedAt.SET(timePtrSetExpr(*patch.RemovedAt)))
}
if patch.LastObservedAt != nil {
rest = append(rest, r.LastObservedAt.SET(timePtrSetExpr(*patch.LastObservedAt)))
}
stmt := r.UPDATE().
SET(r.UpdatedAt.SET(postgres.TimestampzT(now)), rest...).
WHERE(r.GameID.EQ(postgres.UUID(gameID))).
RETURNING(runtimeRecordColumns())
var row model.RuntimeRecords
if err := stmt.QueryContext(ctx, s.db, &row); err != nil {
if errors.Is(err, qrm.ErrNoRows) {
return RuntimeRecord{}, ErrNotFound
}
return RuntimeRecord{}, fmt.Errorf("runtime store: update runtime_record %s: %w", gameID, err)
}
return modelToRuntimeRecord(row), nil
}
// DeleteRuntimeRecord removes the row at gameID. Idempotent: nil when
// no row matched.
func (s *Store) DeleteRuntimeRecord(ctx context.Context, gameID uuid.UUID) error {
stmt := table.RuntimeRecords.DELETE().
WHERE(table.RuntimeRecords.GameID.EQ(postgres.UUID(gameID)))
if _, err := stmt.ExecContext(ctx, s.db); err != nil {
return fmt.Errorf("runtime store: delete runtime_record %s: %w", gameID, err)
}
return nil
}
// =====================================================================
// Player mappings
// =====================================================================
// InsertPlayerMappings persists a slice of mappings in a single
// transaction. Existing rows for the (game_id, user_id) pair are
// replaced (ON CONFLICT) so re-runs of StartGame after a transient
// failure stay idempotent.
func (s *Store) InsertPlayerMappings(ctx context.Context, mappings []PlayerMapping) error {
if len(mappings) == 0 {
return nil
}
tx, err := s.db.BeginTx(ctx, nil)
if err != nil {
return fmt.Errorf("runtime store: begin player_mappings tx: %w", err)
}
defer func() { _ = tx.Rollback() }()
pm := table.PlayerMappings
for _, m := range mappings {
stmt := pm.INSERT(pm.GameID, pm.UserID, pm.RaceName, pm.EnginePlayerUUID).
VALUES(m.GameID, m.UserID, m.RaceName, m.EnginePlayerUUID).
ON_CONFLICT(pm.GameID, pm.UserID).
DO_UPDATE(postgres.SET(
pm.RaceName.SET(pm.EXCLUDED.RaceName),
pm.EnginePlayerUUID.SET(pm.EXCLUDED.EnginePlayerUUID),
))
if _, err := stmt.ExecContext(ctx, tx); err != nil {
if isUniqueViolation(err, playerMappingsRaceUnique) {
return fmt.Errorf("%w: race name %q duplicated within game", ErrConflict, m.RaceName)
}
return fmt.Errorf("runtime store: insert player_mapping %s/%s: %w", m.GameID, m.UserID, err)
}
}
if err := tx.Commit(); err != nil {
return fmt.Errorf("runtime store: commit player_mappings: %w", err)
}
return nil
}
// LoadPlayerMapping returns the mapping for (gameID, userID). Returns
// ErrNotFound when no row matches.
func (s *Store) LoadPlayerMapping(ctx context.Context, gameID, userID uuid.UUID) (PlayerMapping, error) {
pm := table.PlayerMappings
stmt := postgres.SELECT(pm.GameID, pm.UserID, pm.RaceName, pm.EnginePlayerUUID, pm.CreatedAt).
FROM(pm).
WHERE(
pm.GameID.EQ(postgres.UUID(gameID)).
AND(pm.UserID.EQ(postgres.UUID(userID))),
).
LIMIT(1)
var row model.PlayerMappings
if err := stmt.QueryContext(ctx, s.db, &row); err != nil {
if errors.Is(err, qrm.ErrNoRows) {
return PlayerMapping{}, ErrNotFound
}
return PlayerMapping{}, fmt.Errorf("runtime store: load player_mapping: %w", err)
}
return modelToPlayerMapping(row), nil
}
// ListPlayerMappingsForGame returns every mapping for gameID.
func (s *Store) ListPlayerMappingsForGame(ctx context.Context, gameID uuid.UUID) ([]PlayerMapping, error) {
pm := table.PlayerMappings
stmt := postgres.SELECT(pm.GameID, pm.UserID, pm.RaceName, pm.EnginePlayerUUID, pm.CreatedAt).
FROM(pm).
WHERE(pm.GameID.EQ(postgres.UUID(gameID))).
ORDER_BY(pm.RaceName.ASC())
var rows []model.PlayerMappings
if err := stmt.QueryContext(ctx, s.db, &rows); err != nil {
return nil, fmt.Errorf("runtime store: list player_mappings: %w", err)
}
out := make([]PlayerMapping, 0, len(rows))
for _, row := range rows {
out = append(out, modelToPlayerMapping(row))
}
return out, nil
}
// DeletePlayerMappingsForGame removes every mapping for gameID. Used
// on stop / cancel / reconciler-removal so a future StartGame can
// repopulate the projection without violating the per-game UNIQUE.
func (s *Store) DeletePlayerMappingsForGame(ctx context.Context, gameID uuid.UUID) error {
stmt := table.PlayerMappings.DELETE().
WHERE(table.PlayerMappings.GameID.EQ(postgres.UUID(gameID)))
if _, err := stmt.ExecContext(ctx, s.db); err != nil {
return fmt.Errorf("runtime store: delete player_mappings %s: %w", gameID, err)
}
return nil
}
// =====================================================================
// Operation log
// =====================================================================
// operationLogInsert carries the parameters for InsertOperationLog.
type operationLogInsert struct {
OperationID uuid.UUID
GameID uuid.UUID
Op string
Source string
Status string
ImageRef string
ContainerID string
StartedAt time.Time
}
// InsertOperationLog persists a queued / running operation row.
func (s *Store) InsertOperationLog(ctx context.Context, in operationLogInsert) (OperationLog, error) {
o := table.RuntimeOperationLog
stmt := o.INSERT(
o.OperationID, o.GameID, o.Op, o.Source, o.Status, o.ImageRef,
o.ContainerID, o.StartedAt,
).VALUES(
in.OperationID, in.GameID, in.Op, in.Source, in.Status, in.ImageRef,
in.ContainerID, in.StartedAt,
).RETURNING(operationLogColumns())
var row model.RuntimeOperationLog
if err := stmt.QueryContext(ctx, s.db, &row); err != nil {
return OperationLog{}, err
}
return modelToOperationLog(row), nil
}
// CompleteOperationLog updates the status / error fields on
// operationID. Returns the refreshed row.
func (s *Store) CompleteOperationLog(ctx context.Context, operationID uuid.UUID, status, errCode, errMsg string, finishedAt time.Time) (OperationLog, error) {
o := table.RuntimeOperationLog
stmt := o.UPDATE().
SET(
o.Status.SET(postgres.String(status)),
o.ErrorCode.SET(postgres.String(errCode)),
o.ErrorMessage.SET(postgres.String(errMsg)),
o.FinishedAt.SET(postgres.TimestampzT(finishedAt)),
).
WHERE(o.OperationID.EQ(postgres.UUID(operationID))).
RETURNING(operationLogColumns())
var row model.RuntimeOperationLog
if err := stmt.QueryContext(ctx, s.db, &row); err != nil {
if errors.Is(err, qrm.ErrNoRows) {
return OperationLog{}, ErrNotFound
}
return OperationLog{}, fmt.Errorf("runtime store: complete operation_log %s: %w", operationID, err)
}
return modelToOperationLog(row), nil
}
// =====================================================================
// Health snapshots
// =====================================================================
// InsertHealthSnapshot persists a JSON-encoded engine status snapshot.
func (s *Store) InsertHealthSnapshot(ctx context.Context, snapshotID, gameID uuid.UUID, observedAt time.Time, payload []byte) error {
hs := table.RuntimeHealthSnapshots
stmt := hs.INSERT(hs.SnapshotID, hs.GameID, hs.ObservedAt, hs.Payload).
VALUES(snapshotID, gameID, observedAt, string(payload))
if _, err := stmt.ExecContext(ctx, s.db); err != nil {
return fmt.Errorf("runtime store: insert health_snapshot %s: %w", gameID, err)
}
return nil
}
// =====================================================================
// Read-only lobby projection (per the implementation plan, D2)
// =====================================================================
// LoadGameProjection reads `backend.games` for runtime's start/stop
// flow. Lobby remains the only writer of the table; runtime is a
// read-only consumer. Returns ErrNotFound on miss.
func (s *Store) LoadGameProjection(ctx context.Context, gameID uuid.UUID) (Game, error) {
g := table.Games
stmt := postgres.SELECT(
g.GameID, g.OwnerUserID, g.Visibility, g.Status, g.GameName,
g.TurnSchedule, g.TargetEngineVersion,
g.MinPlayers, g.MaxPlayers, g.StartGapHours, g.StartGapPlayers,
).
FROM(g).
WHERE(g.GameID.EQ(postgres.UUID(gameID))).
LIMIT(1)
var row model.Games
if err := stmt.QueryContext(ctx, s.db, &row); err != nil {
if errors.Is(err, qrm.ErrNoRows) {
return Game{}, ErrNotFound
}
return Game{}, fmt.Errorf("runtime store: load game %s: %w", gameID, err)
}
out := Game{
GameID: row.GameID,
Visibility: row.Visibility,
Status: row.Status,
GameName: row.GameName,
TurnSchedule: row.TurnSchedule,
TargetEngineVersion: row.TargetEngineVersion,
MinPlayers: row.MinPlayers,
MaxPlayers: row.MaxPlayers,
StartGapHours: row.StartGapHours,
StartGapPlayers: row.StartGapPlayers,
}
if row.OwnerUserID != nil {
owner := *row.OwnerUserID
out.OwnerUserID = &owner
}
return out, nil
}
// ListActiveMemberships reads active rows from `backend.memberships`
// for gameID.
func (s *Store) ListActiveMemberships(ctx context.Context, gameID uuid.UUID) ([]MembershipRow, error) {
m := table.Memberships
stmt := postgres.SELECT(m.MembershipID, m.GameID, m.UserID, m.RaceName).
FROM(m).
WHERE(
m.GameID.EQ(postgres.UUID(gameID)).
AND(m.Status.EQ(postgres.String("active"))),
).
ORDER_BY(m.JoinedAt.ASC())
var rows []model.Memberships
if err := stmt.QueryContext(ctx, s.db, &rows); err != nil {
return nil, fmt.Errorf("runtime store: list memberships %s: %w", gameID, err)
}
out := make([]MembershipRow, 0, len(rows))
for _, row := range rows {
out = append(out, MembershipRow{
MembershipID: row.MembershipID,
GameID: row.GameID,
UserID: row.UserID,
RaceName: row.RaceName,
})
}
return out, nil
}
// =====================================================================
// Model → domain converters
// =====================================================================
func modelToEngineVersion(row model.EngineVersions) EngineVersion {
return EngineVersion{
Version: row.Version,
ImageRef: row.ImageRef,
Enabled: row.Enabled,
CreatedAt: row.CreatedAt,
UpdatedAt: row.UpdatedAt,
}
}
func modelToRuntimeRecord(row model.RuntimeRecords) RuntimeRecord {
rec := RuntimeRecord{
GameID: row.GameID,
Status: row.Status,
EngineEndpoint: row.EngineEndpoint,
TurnSchedule: row.TurnSchedule,
CurrentTurn: row.CurrentTurn,
SkipNextTick: row.SkipNextTick,
Paused: row.Paused,
EngineHealth: row.EngineHealth,
CreatedAt: row.CreatedAt,
UpdatedAt: row.UpdatedAt,
CurrentContainerID: derefString(row.CurrentContainerID),
CurrentImageRef: derefString(row.CurrentImageRef),
CurrentEngineVersion: derefString(row.CurrentEngineVersion),
StatePath: derefString(row.StatePath),
DockerNetwork: derefString(row.DockerNetwork),
}
rec.NextGenerationAt = copyTimePtr(row.NextGenerationAt)
rec.PausedAt = copyTimePtr(row.PausedAt)
rec.StartedAt = copyTimePtr(row.StartedAt)
rec.StoppedAt = copyTimePtr(row.StoppedAt)
rec.FinishedAt = copyTimePtr(row.FinishedAt)
rec.RemovedAt = copyTimePtr(row.RemovedAt)
rec.LastObservedAt = copyTimePtr(row.LastObservedAt)
return rec
}
func modelToOperationLog(row model.RuntimeOperationLog) OperationLog {
op := OperationLog{
OperationID: row.OperationID,
GameID: row.GameID,
Op: row.Op,
Source: row.Source,
Status: row.Status,
ImageRef: row.ImageRef,
ContainerID: row.ContainerID,
ErrorCode: row.ErrorCode,
ErrorMessage: row.ErrorMessage,
StartedAt: row.StartedAt,
}
op.FinishedAt = copyTimePtr(row.FinishedAt)
return op
}
func modelToPlayerMapping(row model.PlayerMappings) PlayerMapping {
return PlayerMapping{
GameID: row.GameID,
UserID: row.UserID,
RaceName: row.RaceName,
EnginePlayerUUID: row.EnginePlayerUUID,
CreatedAt: row.CreatedAt,
}
}
// =====================================================================
// Scalar helpers
// =====================================================================
// nullableString converts a Go string to the `any` form expected by
// jet INSERT VALUES bindings: an empty string becomes nil so the
// column receives NULL.
func nullableString(s string) any {
if s == "" {
return nil
}
return s
}
// nullableTime mirrors nullableString for *time.Time.
func nullableTime(t *time.Time) any {
if t == nil {
return nil
}
return *t
}
// nullableStringSetExpr returns a typed jet expression suitable for
// UPDATE SET on a nullable text column. The empty string is mapped to
// SQL NULL, mirroring the INSERT-side semantics so a "" patch clears
// the column.
func nullableStringSetExpr(v string) postgres.StringExpression {
if v == "" {
return postgres.StringExp(postgres.NULL)
}
return postgres.String(v)
}
// timePtrSetExpr mirrors nullableStringSetExpr for *time.Time. nil
// clears the column; non-nil sets it.
func timePtrSetExpr(t *time.Time) postgres.TimestampzExpression {
if t == nil {
return postgres.TimestampzExp(postgres.NULL)
}
return postgres.TimestampzT(*t)
}
func derefString(p *string) string {
if p == nil {
return ""
}
return *p
}
func copyTimePtr(p *time.Time) *time.Time {
if p == nil {
return nil
}
t := *p
return &t
}
+122
@@ -0,0 +1,122 @@
package runtime
import (
"time"
"github.com/google/uuid"
)
// EngineVersion mirrors a row in `backend.engine_versions`. The version
// label is the primary key and is also the value lobby stores on
// `games.target_engine_version`.
type EngineVersion struct {
Version string
ImageRef string
Enabled bool
CreatedAt time.Time
UpdatedAt time.Time
}
// RuntimeRecord mirrors a row in `backend.runtime_records`. Pointer
// fields are nullable in the schema; primitives default to zero.
type RuntimeRecord struct {
GameID uuid.UUID
Status string
CurrentContainerID string
CurrentImageRef string
CurrentEngineVersion string
EngineEndpoint string
StatePath string
DockerNetwork string
TurnSchedule string
CurrentTurn int32
NextGenerationAt *time.Time
SkipNextTick bool
Paused bool
PausedAt *time.Time
EngineHealth string
CreatedAt time.Time
UpdatedAt time.Time
StartedAt *time.Time
StoppedAt *time.Time
FinishedAt *time.Time
RemovedAt *time.Time
LastObservedAt *time.Time
}
// IsTerminal reports whether the record sits in a status that the
// cache should evict.
func (r RuntimeRecord) IsTerminal() bool {
switch r.Status {
case RuntimeStatusFinished, RuntimeStatusRemoved, RuntimeStatusStopped:
return true
default:
return false
}
}
// PlayerMapping mirrors a row in `backend.player_mappings`. The
// composite primary key is `(game_id, user_id)`; `engine_player_uuid`
// is the engine-assigned race id used by the engine's `actor` field.
type PlayerMapping struct {
GameID uuid.UUID
UserID uuid.UUID
RaceName string
EnginePlayerUUID uuid.UUID
CreatedAt time.Time
}
// OperationLog mirrors a row in `backend.runtime_operation_log`. Used
// by admin endpoints that surface a per-operation status envelope and
// by the worker pool for completion telemetry.
type OperationLog struct {
OperationID uuid.UUID
GameID uuid.UUID
Op string
Source string
Status string
ImageRef string
ContainerID string
ErrorCode string
ErrorMessage string
StartedAt time.Time
FinishedAt *time.Time
}
// HealthSnapshot mirrors a row in `backend.runtime_health_snapshots`.
// The `Payload` field carries the JSON-encoded engine status response
// or a synthesised summary when the engine is unreachable.
type HealthSnapshot struct {
SnapshotID uuid.UUID
GameID uuid.UUID
ObservedAt time.Time
Payload []byte
}
// Game is the read-only projection of a `backend.games` row that the
// runtime needs at start time. It is the runtime's view of a lobby
// row; lobby remains the only writer.
type Game struct {
GameID uuid.UUID
OwnerUserID *uuid.UUID
Visibility string
Status string
GameName string
TurnSchedule string
TargetEngineVersion string
MinPlayers int32
MaxPlayers int32
StartGapHours int32
StartGapPlayers int32
}
// MembershipRow is the read-only projection of an active
// `backend.memberships` row that the runtime needs at start time. It
// carries enough data to populate the engine `Init` request and the
// `player_mappings` projection.
type MembershipRow struct {
MembershipID uuid.UUID
GameID uuid.UUID
UserID uuid.UUID
RaceName string
}
+124
@@ -0,0 +1,124 @@
package runtime
import (
"context"
"errors"
"sync"
"sync/atomic"
"go.uber.org/zap"
)
// WorkerPool drains long-running runtime jobs (start, stop, restart,
// patch). Implements `internal/app.Component` so the App lifecycle
// drives Run/Shutdown.
type WorkerPool struct {
svc *Service
jobs chan job
stopping atomic.Bool
wg sync.WaitGroup
}
// NewWorkerPool builds a worker pool sized by `cfg.WorkerPoolSize`
// with a buffered channel of depth `cfg.JobQueueSize`.
func NewWorkerPool(svc *Service) *WorkerPool {
return &WorkerPool{
svc: svc,
jobs: make(chan job, svc.deps.Config.JobQueueSize),
}
}
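//
// A minimal wiring sketch (tests drive the pool this way when the app
// lifecycle is not running; shutdownCtx is a hypothetical bounded context):
//
//	pool := NewWorkerPool(svc)
//	go func() { _ = pool.Run(ctx) }() // blocks until ctx is cancelled
//	defer func() { _ = pool.Shutdown(shutdownCtx) }()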
// submit places j on the worker channel. Returns ErrShutdown when the
// pool is stopping; when the queue is full it blocks on the caller's
// context and returns ctx.Err() if the context is cancelled first.
func (w *WorkerPool) submit(ctx context.Context, j job) error {
if w == nil || w.stopping.Load() {
return ErrShutdown
}
select {
case <-ctx.Done():
return ctx.Err()
case w.jobs <- j:
return nil
default:
}
// Queue is full: block on the caller's context until a worker frees
// a slot or the context is cancelled.
select {
case <-ctx.Done():
return ctx.Err()
case w.jobs <- j:
return nil
}
}
// Run starts the configured number of worker goroutines and blocks
// until ctx is cancelled.
func (w *WorkerPool) Run(ctx context.Context) error {
if w == nil {
return nil
}
count := w.svc.deps.Config.WorkerPoolSize
if count <= 0 {
count = 1
}
for i := 0; i < count; i++ {
w.wg.Add(1)
go w.loop(ctx, i)
}
<-ctx.Done()
return nil
}
// Shutdown signals the pool to stop accepting new work and waits for
// in-flight workers to drain. The provided context bounds the wait;
// any worker still running when ctx expires is left to finish on its
// own and the pool returns.
func (w *WorkerPool) Shutdown(ctx context.Context) error {
if w == nil {
return nil
}
if !w.stopping.CompareAndSwap(false, true) {
return nil
}
close(w.jobs)
done := make(chan struct{})
go func() {
w.wg.Wait()
close(done)
}()
select {
case <-done:
return nil
case <-ctx.Done():
return ctx.Err()
}
}
func (w *WorkerPool) loop(ctx context.Context, idx int) {
defer w.wg.Done()
logger := w.svc.deps.Logger.With(zap.Int("worker", idx))
for {
select {
case <-ctx.Done():
return
case j, ok := <-w.jobs:
if !ok {
return
}
logger.Debug("runtime job picked",
zap.String("game_id", j.GameID().String()),
zap.String("op", j.Operation().Op),
)
if err := j.Run(ctx, w.svc); err != nil {
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
return
}
logger.Warn("runtime job failed",
zap.String("game_id", j.GameID().String()),
zap.String("op", j.Operation().Op),
zap.Error(err),
)
}
}
}
}