feat: backend service
@@ -0,0 +1,174 @@
package runtime

import (
	"context"
	"fmt"
	"sync"
	"sync/atomic"

	"github.com/google/uuid"
)

// Cache is the in-memory write-through projection of the runtime
// records and engine version registry. Mirrors the lobby/auth/admin
// cache idiom: Postgres is the source of truth, the cache is updated
// only after a successful commit.
//
// Reads (Get*) take RLocks; writes (Put*, Remove*) take Locks. The
// cache only retains non-terminal runtime records so the active set
// stays small and warm.
type Cache struct {
	mu             sync.RWMutex
	runtimes       map[uuid.UUID]RuntimeRecord
	engineVersions map[string]EngineVersion
	ready          atomic.Bool
}

// NewCache returns an empty Cache.
func NewCache() *Cache {
	return &Cache{
		runtimes:       make(map[uuid.UUID]RuntimeRecord),
		engineVersions: make(map[string]EngineVersion),
	}
}

// Warm populates the cache from store. Must be called once at process
// boot before the HTTP listener accepts traffic.
func (c *Cache) Warm(ctx context.Context, store *Store) error {
	if c == nil {
		return nil
	}
	versions, err := store.ListEngineVersions(ctx)
	if err != nil {
		return fmt.Errorf("runtime cache warm: engine versions: %w", err)
	}
	records, err := store.ListAllRuntimeRecords(ctx)
	if err != nil {
		return fmt.Errorf("runtime cache warm: runtime records: %w", err)
	}

	c.mu.Lock()
	defer c.mu.Unlock()
	c.engineVersions = make(map[string]EngineVersion, len(versions))
	for _, v := range versions {
		c.engineVersions[v.Version] = v
	}
	c.runtimes = make(map[uuid.UUID]RuntimeRecord)
	for _, r := range records {
		if r.IsTerminal() {
			continue
		}
		c.runtimes[r.GameID] = r
	}
	c.ready.Store(true)
	return nil
}

// Ready reports whether Warm completed at least once.
func (c *Cache) Ready() bool {
	if c == nil {
		return false
	}
	return c.ready.Load()
}

// Sizes returns the cardinalities of the two projections; used by the
// startup log line and tests.
func (c *Cache) Sizes() (runtimes int, engineVersions int) {
	if c == nil {
		return 0, 0
	}
	c.mu.RLock()
	defer c.mu.RUnlock()
	return len(c.runtimes), len(c.engineVersions)
}

// GetRuntime returns the cached runtime record for gameID together
// with a presence flag.
func (c *Cache) GetRuntime(gameID uuid.UUID) (RuntimeRecord, bool) {
	if c == nil {
		return RuntimeRecord{}, false
	}
	c.mu.RLock()
	defer c.mu.RUnlock()
	r, ok := c.runtimes[gameID]
	return r, ok
}

// PutRuntime stores or updates the runtime record. Terminal statuses
// cause the entry to be evicted.
func (c *Cache) PutRuntime(rec RuntimeRecord) {
	if c == nil {
		return
	}
	c.mu.Lock()
	defer c.mu.Unlock()
	if rec.IsTerminal() {
		delete(c.runtimes, rec.GameID)
		return
	}
	c.runtimes[rec.GameID] = rec
}

// RemoveRuntime evicts the entry for gameID.
func (c *Cache) RemoveRuntime(gameID uuid.UUID) {
	if c == nil {
		return
	}
	c.mu.Lock()
	defer c.mu.Unlock()
	delete(c.runtimes, gameID)
}

// ActiveRuntimes returns a snapshot copy of every cached runtime
// record. The reconciler and the scheduler both iterate this list.
func (c *Cache) ActiveRuntimes() []RuntimeRecord {
	if c == nil {
		return nil
	}
	c.mu.RLock()
	defer c.mu.RUnlock()
	out := make([]RuntimeRecord, 0, len(c.runtimes))
	for _, r := range c.runtimes {
		out = append(out, r)
	}
	return out
}

// GetEngineVersion returns the cached engine_versions row keyed by
// version label, together with a presence flag.
func (c *Cache) GetEngineVersion(version string) (EngineVersion, bool) {
	if c == nil {
		return EngineVersion{}, false
	}
	c.mu.RLock()
	defer c.mu.RUnlock()
	v, ok := c.engineVersions[version]
	return v, ok
}

// PutEngineVersion stores or updates the engine_versions cache entry.
func (c *Cache) PutEngineVersion(v EngineVersion) {
	if c == nil {
		return
	}
	c.mu.Lock()
	defer c.mu.Unlock()
	c.engineVersions[v.Version] = v
}

// ListEngineVersions returns a snapshot of the cached engine_versions
// rows in unspecified (map-iteration) order; callers that need the
// created_at DESC ordering sort the returned slice themselves.
func (c *Cache) ListEngineVersions() []EngineVersion {
	if c == nil {
		return nil
	}
	c.mu.RLock()
	defer c.mu.RUnlock()
	out := make([]EngineVersion, 0, len(c.engineVersions))
	for _, v := range c.engineVersions {
		out = append(out, v)
	}
	return out
}
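
A minimal boot-time usage sketch for the cache above (illustrative only, not part of the commit): warm once before the HTTP listener starts, log the sizes, then serve cache-first reads. The wiring names (`store`, `logger`) are assumptions.

// warmRuntimeCache is a wiring sketch: Warm before serving traffic, then
// hot-path reads never touch Postgres.
func warmRuntimeCache(ctx context.Context, store *Store, logger *zap.Logger) (*Cache, error) {
	cache := NewCache()
	if err := cache.Warm(ctx, store); err != nil {
		return nil, fmt.Errorf("warm runtime cache: %w", err)
	}
	runtimes, versions := cache.Sizes()
	logger.Info("runtime cache warmed",
		zap.Int("runtimes", runtimes),
		zap.Int("engine_versions", versions))
	return cache, nil
}
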
@@ -0,0 +1,54 @@
package runtime

import (
	"testing"

	"github.com/google/uuid"
)

func TestCacheRuntimeRoundTrip(t *testing.T) {
	c := NewCache()
	gameID := uuid.New()
	rec := RuntimeRecord{GameID: gameID, Status: RuntimeStatusRunning}

	c.PutRuntime(rec)
	got, ok := c.GetRuntime(gameID)
	if !ok {
		t.Fatal("expected cache hit")
	}
	if got.Status != RuntimeStatusRunning {
		t.Fatalf("status = %s, want running", got.Status)
	}

	rec.Status = RuntimeStatusFinished
	c.PutRuntime(rec)
	if _, ok := c.GetRuntime(gameID); ok {
		t.Fatal("terminal status must evict")
	}
}

func TestCacheEngineVersionRoundTrip(t *testing.T) {
	c := NewCache()
	v := EngineVersion{Version: "0.1.0", ImageRef: "img", Enabled: true}
	c.PutEngineVersion(v)
	got, ok := c.GetEngineVersion("0.1.0")
	if !ok {
		t.Fatal("expected hit")
	}
	if got.ImageRef != "img" {
		t.Fatalf("image_ref = %s, want img", got.ImageRef)
	}
	if list := c.ListEngineVersions(); len(list) != 1 {
		t.Fatalf("list size = %d, want 1", len(list))
	}
}

func TestCacheActiveRuntimes(t *testing.T) {
	c := NewCache()
	c.PutRuntime(RuntimeRecord{GameID: uuid.New(), Status: RuntimeStatusRunning})
	c.PutRuntime(RuntimeRecord{GameID: uuid.New(), Status: RuntimeStatusStarting})
	c.PutRuntime(RuntimeRecord{GameID: uuid.New(), Status: RuntimeStatusFinished}) // evicted
	if got := c.ActiveRuntimes(); len(got) != 2 {
		t.Fatalf("active = %d, want 2", len(got))
	}
}
@@ -0,0 +1,138 @@
package runtime

import (
	"context"
	"time"

	"galaxy/backend/internal/config"
	"galaxy/backend/internal/dockerclient"
	"galaxy/backend/internal/engineclient"

	"github.com/google/uuid"
	"go.uber.org/zap"
)

// LobbyConsumer is the surface through which the runtime publishes
// snapshots and adoption / removal events back into lobby. The
// canonical implementation is `*lobby.Service`; tests substitute a
// hand-rolled fake that records the calls.
//
// The interface is intentionally narrow: runtime only forwards
// data-plane events. Lobby owns every status transition that follows
// from the snapshot.
type LobbyConsumer interface {
	// OnRuntimeSnapshot is invoked synchronously after every successful
	// engine read or health-probe transition. Lobby maps the snapshot
	// into its `games.runtime_snapshot` projection and may transition
	// the game's lifecycle status.
	OnRuntimeSnapshot(ctx context.Context, gameID uuid.UUID, snapshot LobbySnapshot) error

	// OnRuntimeJobResult is invoked by the reconciler when a labelled
	// container that lobby believes is alive has disappeared. Lobby
	// reacts by cancelling the game (the engine container is gone).
	OnRuntimeJobResult(ctx context.Context, gameID uuid.UUID, result JobResult) error
}

// LobbySnapshot is the runtime → lobby DTO. It is the runtime's view
// of the engine status response, plus the per-player observations
// lobby needs for capable-finish promotion.
//
// The structure intentionally mirrors `lobby.RuntimeSnapshot` in
// shape; runtime keeps its own version so the two packages do not
// import each other directly. The cmd/backend wiring layer adapts
// between them.
type LobbySnapshot struct {
	CurrentTurn   int32
	RuntimeStatus string
	EngineHealth  string
	ObservedAt    time.Time
	PlayerStats   []LobbyPlayerStats
}

// LobbyPlayerStats is the per-player observation read from a runtime
// snapshot. `MaxPlanets` / `MaxPopulation` are the per-snapshot
// running maxima; lobby aggregates across the game lifetime.
type LobbyPlayerStats struct {
	UserID            uuid.UUID
	InitialPlanets    int32
	InitialPopulation int32
	CurrentPlanets    int32
	CurrentPopulation int32
	MaxPlanets        int32
	MaxPopulation     int32
}

// JobResult is the outcome envelope passed to
// `LobbyConsumer.OnRuntimeJobResult`. The reconciler produces it on
// adoption / removal events; future job paths (start, stop, restart)
// may reuse the same envelope.
type JobResult struct {
	Op      string
	Status  string
	Message string
}

// NotificationPublisher is the outbound surface runtime uses to emit
// admin-channel notifications enumerated under `runtime.*` in
// `backend/README.md` §10. The real implementation belongs in
// `backend/internal/notification`; until it is wired in,
// `NewNoopNotificationPublisher` ships a logger-only stub so the
// runtime path stays callable end-to-end during tests.
//
// Kind must be one of `runtime.image_pull_failed`,
// `runtime.container_start_failed`, or `runtime.start_config_invalid`.
// Payload carries the kind-specific fields documented in the catalog.
// The IdempotencyKey is supplied by the caller and feeds the
// notification UNIQUE(kind, idempotency_key) constraint.
type NotificationPublisher interface {
	PublishRuntimeEvent(ctx context.Context, kind, idempotencyKey string, payload map[string]any) error
}

// NewNoopNotificationPublisher returns a NotificationPublisher that
// logs every event at info level and returns nil. A later wiring
// change swaps in the real `*notification.Service` adapter.
func NewNoopNotificationPublisher(logger *zap.Logger) NotificationPublisher {
	if logger == nil {
		logger = zap.NewNop()
	}
	return &noopNotificationPublisher{logger: logger.Named("runtime.notify.noop")}
}

type noopNotificationPublisher struct {
	logger *zap.Logger
}

func (p *noopNotificationPublisher) PublishRuntimeEvent(_ context.Context, kind, idempotencyKey string, payload map[string]any) error {
	p.logger.Info("runtime event (noop publisher)",
		zap.String("kind", kind),
		zap.String("idempotency_key", idempotencyKey),
		zap.Int("payload_keys", len(payload)),
	)
	return nil
}

// Deps aggregates every collaborator the runtime Service depends on.
// Constructing the Service through Deps (rather than positional args)
// keeps the wiring patches small as new dependencies are added.
type Deps struct {
	Store          *Store
	Cache          *Cache
	EngineVersions *EngineVersionService

	Docker       dockerclient.Client
	Engine       *engineclient.Client
	Lobby        LobbyConsumer
	Notification NotificationPublisher

	// DockerNetwork is the user-defined Docker network name engine
	// containers attach to. Wired from `cfg.Docker.Network`.
	DockerNetwork string

	// HostStateRoot is the host-side directory that holds per-game
	// state subdirectories. Wired from `cfg.Game.StateRoot`.
	HostStateRoot string

	Config config.RuntimeConfig
	Logger *zap.Logger
	Now    func() time.Time
}
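
The LobbyConsumer comment above mentions hand-rolled test fakes; a sketch of what such a fake could look like (illustrative, not part of this commit), using only the interface signatures defined above:

// fakeLobby records every call so tests can assert on forwarded events.
type fakeLobby struct {
	snapshots []LobbySnapshot
	results   []JobResult
}

func (f *fakeLobby) OnRuntimeSnapshot(_ context.Context, _ uuid.UUID, s LobbySnapshot) error {
	f.snapshots = append(f.snapshots, s)
	return nil
}

func (f *fakeLobby) OnRuntimeJobResult(_ context.Context, _ uuid.UUID, r JobResult) error {
	f.results = append(f.results, r)
	return nil
}
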
@@ -0,0 +1,189 @@
package runtime

import (
	"context"
	"errors"
	"fmt"
	"sort"
	"strings"
	"time"

	"galaxy/util"
)

// EngineVersionService implements the engine-version registry CRUD
// surface consumed by the admin endpoints under
// `/api/v1/admin/engine-versions/*`. Mutations are write-through: a
// successful Postgres write is followed by a cache update so warm
// reads observe the new state immediately.
type EngineVersionService struct {
	store *Store
	cache *Cache
	now   func() time.Time
}

// NewEngineVersionService constructs the service. now defaults to
// time.Now when nil.
func NewEngineVersionService(store *Store, cache *Cache, now func() time.Time) *EngineVersionService {
	if now == nil {
		now = time.Now
	}
	return &EngineVersionService{store: store, cache: cache, now: now}
}

// List returns every engine_versions row ordered by created_at DESC.
// Cache-first when warm; falls back to a Postgres read otherwise.
func (s *EngineVersionService) List(ctx context.Context) ([]EngineVersion, error) {
	if s.cache != nil && s.cache.Ready() {
		out := s.cache.ListEngineVersions()
		sort.SliceStable(out, func(i, j int) bool {
			if !out[i].CreatedAt.Equal(out[j].CreatedAt) {
				return out[i].CreatedAt.After(out[j].CreatedAt)
			}
			return out[i].Version > out[j].Version
		})
		return out, nil
	}
	return s.store.ListEngineVersions(ctx)
}

// Get returns the row for version. Returns ErrNotFound on miss.
func (s *EngineVersionService) Get(ctx context.Context, version string) (EngineVersion, error) {
	version = strings.TrimSpace(version)
	if version == "" {
		return EngineVersion{}, fmt.Errorf("%w: version must not be empty", ErrInvalidInput)
	}
	if s.cache != nil {
		if v, ok := s.cache.GetEngineVersion(version); ok {
			return v, nil
		}
	}
	v, err := s.store.GetEngineVersion(ctx, version)
	if err != nil {
		return EngineVersion{}, err
	}
	if s.cache != nil {
		s.cache.PutEngineVersion(v)
	}
	return v, nil
}

// RegisterInput is the parameter struct for Register.
type RegisterInput struct {
	Version  string
	ImageRef string
	Enabled  *bool
}

// Validate normalises the request and rejects empty / malformed
// fields. Semver is enforced via `pkg/util.ParseSemver`.
func (in *RegisterInput) Validate() error {
	in.Version = strings.TrimSpace(in.Version)
	in.ImageRef = strings.TrimSpace(in.ImageRef)
	if in.Version == "" {
		return fmt.Errorf("%w: version must not be empty", ErrInvalidInput)
	}
	if _, err := util.ParseSemver(in.Version); err != nil {
		return fmt.Errorf("%w: version %q is not a valid semver: %v", ErrInvalidInput, in.Version, err)
	}
	if in.ImageRef == "" {
		return fmt.Errorf("%w: image_ref must not be empty", ErrInvalidInput)
	}
	return nil
}

// Register persists a fresh engine_versions row. Returns
// ErrEngineVersionTaken on duplicate version.
func (s *EngineVersionService) Register(ctx context.Context, in RegisterInput) (EngineVersion, error) {
	if err := (&in).Validate(); err != nil {
		return EngineVersion{}, err
	}
	enabled := true
	if in.Enabled != nil {
		enabled = *in.Enabled
	}
	now := s.now().UTC()
	v, err := s.store.InsertEngineVersion(ctx, in.Version, in.ImageRef, enabled, now)
	if err != nil {
		return EngineVersion{}, err
	}
	if s.cache != nil {
		s.cache.PutEngineVersion(v)
	}
	return v, nil
}

// UpdateInput is the parameter struct for Update. Nil pointers leave
// the corresponding column alone.
type UpdateInput struct {
	ImageRef *string
	Enabled  *bool
}

// Update patches mutable fields on an existing row.
func (s *EngineVersionService) Update(ctx context.Context, version string, in UpdateInput) (EngineVersion, error) {
	version = strings.TrimSpace(version)
	if version == "" {
		return EngineVersion{}, fmt.Errorf("%w: version must not be empty", ErrInvalidInput)
	}
	patch := engineVersionUpdate{Enabled: in.Enabled}
	if in.ImageRef != nil {
		trimmed := strings.TrimSpace(*in.ImageRef)
		if trimmed == "" {
			return EngineVersion{}, fmt.Errorf("%w: image_ref must not be empty", ErrInvalidInput)
		}
		patch.ImageRef = &trimmed
	}
	now := s.now().UTC()
	v, err := s.store.UpdateEngineVersion(ctx, version, patch, now)
	if err != nil {
		return EngineVersion{}, err
	}
	if s.cache != nil {
		s.cache.PutEngineVersion(v)
	}
	return v, nil
}

// Disable flips the enabled flag to false. Idempotent.
func (s *EngineVersionService) Disable(ctx context.Context, version string) (EngineVersion, error) {
	disabled := false
	return s.Update(ctx, version, UpdateInput{Enabled: &disabled})
}

// Resolve returns the row for version, rejecting disabled rows with
// ErrEngineVersionDisabled. Used by `Service.StartGame` /
// `AdminPatch` / `AdminRestart` before the docker pull.
func (s *EngineVersionService) Resolve(ctx context.Context, version string) (EngineVersion, error) {
	v, err := s.Get(ctx, version)
	if err != nil {
		return EngineVersion{}, err
	}
	if !v.Enabled {
		return EngineVersion{}, fmt.Errorf("%w: %s", ErrEngineVersionDisabled, v.Version)
	}
	return v, nil
}

// CheckPatchCompatible verifies the requested target version stays
// inside the same major+minor line as `currentVersion`. Returns
// ErrPatchSemverIncompatible otherwise.
func CheckPatchCompatible(currentVersion, targetVersion string) error {
	current, err := util.ParseSemver(currentVersion)
	if err != nil {
		return fmt.Errorf("%w: current version %q: %v", ErrInvalidInput, currentVersion, err)
	}
	target, err := util.ParseSemver(targetVersion)
	if err != nil {
		return fmt.Errorf("%w: target version %q: %v", ErrInvalidInput, targetVersion, err)
	}
	if current.Major != target.Major || current.Minor != target.Minor {
		return fmt.Errorf("%w: %s -> %s", ErrPatchSemverIncompatible, currentVersion, targetVersion)
	}
	return nil
}

// IsKnownEngineVersion reports whether err is one of the engine-version
// domain errors (disabled version or semver-incompatible patch). Small
// helper used by tests and handlers.
func IsKnownEngineVersion(err error) bool {
	return errors.Is(err, ErrEngineVersionDisabled) || errors.Is(err, ErrPatchSemverIncompatible)
}
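
Worked examples of the patch-compatibility rule above, taken from the cases in the test file that follows (only the patch component may differ):

// CheckPatchCompatible("0.1.0", "0.1.4") // nil: patch bump inside the 0.1.x line
// CheckPatchCompatible("0.1.0", "0.2.0") // ErrPatchSemverIncompatible (minor changes)
// CheckPatchCompatible("1.0.0", "2.0.0") // ErrPatchSemverIncompatible (major changes)
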
@@ -0,0 +1,76 @@
package runtime

import (
	"errors"
	"testing"
)

func TestEngineVersionRegisterValidate(t *testing.T) {
	cases := []struct {
		name    string
		input   RegisterInput
		wantErr error
	}{
		{
			name:    "empty version",
			input:   RegisterInput{Version: "", ImageRef: "img"},
			wantErr: ErrInvalidInput,
		},
		{
			name:    "non-semver",
			input:   RegisterInput{Version: "abc", ImageRef: "img"},
			wantErr: ErrInvalidInput,
		},
		{
			name:    "empty image",
			input:   RegisterInput{Version: "0.1.0", ImageRef: ""},
			wantErr: ErrInvalidInput,
		},
		{
			name:    "valid",
			input:   RegisterInput{Version: "0.1.0", ImageRef: "img"},
			wantErr: nil,
		},
	}
	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			err := (&c.input).Validate()
			if c.wantErr == nil {
				if err != nil {
					t.Fatalf("unexpected error: %v", err)
				}
				return
			}
			if !errors.Is(err, c.wantErr) {
				t.Fatalf("got %v, want %v", err, c.wantErr)
			}
		})
	}
}

func TestCheckPatchCompatible(t *testing.T) {
	cases := []struct {
		name    string
		current string
		target  string
		wantErr error
	}{
		{"same patch", "0.1.0", "0.1.0", nil},
		{"compatible patch", "0.1.0", "0.1.4", nil},
		{"different minor", "0.1.0", "0.2.0", ErrPatchSemverIncompatible},
		{"different major", "1.0.0", "2.0.0", ErrPatchSemverIncompatible},
		{"invalid current", "abc", "0.1.0", ErrInvalidInput},
		{"invalid target", "0.1.0", "abc", ErrInvalidInput},
	}
	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			err := CheckPatchCompatible(c.current, c.target)
			if c.wantErr == nil && err != nil {
				t.Fatalf("unexpected error: %v", err)
			}
			if c.wantErr != nil && !errors.Is(err, c.wantErr) {
				t.Fatalf("got %v, want %v", err, c.wantErr)
			}
		})
	}
}
@@ -0,0 +1,45 @@
package runtime

import "errors"

// Sentinel errors. Handlers map them to the standard JSON envelope at
// the wire boundary; lobby and admin packages observe them through
// errors.Is when they need to branch on the domain reason.
var (
	// ErrNotFound is returned when no row matches the requested
	// primary key (engine version, runtime record, player mapping).
	ErrNotFound = errors.New("runtime: not found")

	// ErrInvalidInput reports request-level validation failures
	// (empty fields, malformed semver, unknown enum values).
	ErrInvalidInput = errors.New("runtime: invalid input")

	// ErrConflict reports that the requested action conflicts with
	// the current persisted state (illegal status transition, retry
	// while a job is still in-flight, race against the reconciler).
	ErrConflict = errors.New("runtime: conflict")

	// ErrEngineVersionTaken means a duplicate primary key was
	// observed when registering a new engine version row.
	ErrEngineVersionTaken = errors.New("runtime: engine version already registered")

	// ErrEngineVersionDisabled reports that a referenced engine
	// version row exists but is marked disabled.
	ErrEngineVersionDisabled = errors.New("runtime: engine version disabled")

	// ErrPatchSemverIncompatible reports that an admin-requested
	// version patch crosses a major or minor boundary, which Galaxy
	// disallows for in-place patching (per ARCHITECTURE.md §9).
	ErrPatchSemverIncompatible = errors.New("runtime: patch must stay inside the same major/minor line")

	// ErrJobQueueFull reports that the worker pool's buffered job
	// channel is at capacity. Surfaced as 503 service_unavailable at
	// the wire boundary; in practice the pool size and queue depth
	// are budgeted in `BACKEND_RUNTIME_*` env vars so the operator
	// can absorb peaks.
	ErrJobQueueFull = errors.New("runtime: job queue full")

	// ErrShutdown means the runtime service has stopped accepting
	// work because the parent context was cancelled.
	ErrShutdown = errors.New("runtime: shutting down")
)
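
A sketch of the wire-boundary mapping the comment block above refers to (illustrative only; apart from the 503 for ErrJobQueueFull, which its comment states, the status codes here are assumptions, not mandated by this file):

// httpStatusFor is a hypothetical handler-side helper; only errors.Is
// branching is implied by the package, the codes are an assumed mapping.
func httpStatusFor(err error) int {
	switch {
	case errors.Is(err, ErrNotFound):
		return 404
	case errors.Is(err, ErrInvalidInput), errors.Is(err, ErrPatchSemverIncompatible):
		return 400
	case errors.Is(err, ErrConflict), errors.Is(err, ErrEngineVersionTaken), errors.Is(err, ErrEngineVersionDisabled):
		return 409
	case errors.Is(err, ErrJobQueueFull), errors.Is(err, ErrShutdown):
		return 503 // service_unavailable, per the ErrJobQueueFull comment
	default:
		return 500
	}
}
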
@@ -0,0 +1,55 @@
package runtime

import (
	"context"
	"errors"

	"galaxy/backend/internal/dockerclient"

	"go.uber.org/zap"
)

// publishStartConfigInvalid emits the `runtime.start_config_invalid`
// admin notification for a pre-Run validation failure on the start /
// patch path. The OperationLog supplies the idempotency key so the
// catalog UNIQUE(kind, idempotency_key) constraint deduplicates a
// repeated retry on the same operation row.
func (s *Service) publishStartConfigInvalid(ctx context.Context, op OperationLog, reason string) {
	s.publishRuntimeEvent(ctx, "runtime.start_config_invalid", op, map[string]any{
		"game_id": op.GameID.String(),
		"reason":  reason,
	})
}

// publishStartFailure emits either `runtime.image_pull_failed` or
// `runtime.container_start_failed` depending on whether the Docker
// daemon reported a pull-stage error. The two kinds carry the catalog
// payload from `backend/README.md` §10.
func (s *Service) publishStartFailure(ctx context.Context, op OperationLog, imageRef string, runErr error) {
	if errors.Is(runErr, dockerclient.ErrImagePullFailed) {
		s.publishRuntimeEvent(ctx, "runtime.image_pull_failed", op, map[string]any{
			"game_id":   op.GameID.String(),
			"image_ref": imageRef,
		})
		return
	}
	s.publishRuntimeEvent(ctx, "runtime.container_start_failed", op, map[string]any{
		"game_id": op.GameID.String(),
	})
}

// publishRuntimeEvent wraps the publisher call and logs publish
// failures through the package logger so a misconfigured publisher
// cannot silently drop events.
func (s *Service) publishRuntimeEvent(ctx context.Context, kind string, op OperationLog, payload map[string]any) {
	if s.deps.Notification == nil {
		return
	}
	idempotencyKey := kind + ":" + op.GameID.String() + ":" + op.OperationID.String()
	if err := s.deps.Notification.PublishRuntimeEvent(ctx, kind, idempotencyKey, payload); err != nil {
		s.deps.Logger.Warn("runtime notification publish failed",
			zap.String("kind", kind),
			zap.String("idempotency_key", idempotencyKey),
			zap.Error(err),
		)
	}
}
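
A worked example of the idempotency-key shape built above (UUIDs are placeholders):

// For op.GameID = 2f0c9a4e-… and op.OperationID = 7b3d1c2a-… (placeholders),
// a failed image pull publishes with
//
//	idempotencyKey = "runtime.image_pull_failed:2f0c9a4e-…:7b3d1c2a-…"
//
// A retried publish for the same operation row rebuilds the identical key, so
// the UNIQUE(kind, idempotency_key) constraint reduces it to a no-op insert.
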
@@ -0,0 +1,203 @@
package runtime

import (
	"context"
	"errors"
	"fmt"
	"strings"
	"time"

	"galaxy/backend/internal/dockerclient"

	"github.com/google/uuid"
	"go.uber.org/zap"
)

// Reconciler runs an immediate startup pass plus a periodic ticker
// (`BACKEND_RUNTIME_RECONCILE_INTERVAL`). On every pass it diffs
// labelled containers reported by Docker against
// `runtime_records`, adopts unrecorded labelled containers, marks
// recorded-but-missing as `removed`, and publishes a fresh snapshot
// for matched pairs.
//
// Implements `internal/app.Component`.
type Reconciler struct {
	svc *Service
}

// NewReconciler builds a Reconciler bound to svc.
func NewReconciler(svc *Service) *Reconciler { return &Reconciler{svc: svc} }

// Run drives the reconciliation loop until ctx is cancelled.
func (r *Reconciler) Run(ctx context.Context) error {
	if r == nil {
		return nil
	}
	logger := r.svc.deps.Logger.Named("reconciler")
	if err := r.tick(ctx); err != nil {
		logger.Warn("initial reconcile tick failed", zap.Error(err))
	}
	ticker := time.NewTicker(r.svc.deps.Config.ReconcileInterval)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return nil
		case <-ticker.C:
			if err := r.tick(ctx); err != nil {
				logger.Warn("reconcile tick failed", zap.Error(err))
			}
		}
	}
}

// Shutdown is a no-op: each tick is synchronous inside Run.
func (r *Reconciler) Shutdown(_ context.Context) error { return nil }

// Tick runs a single reconciliation pass. Exposed for tests so they
// can drive the reconciler without timing dependencies.
func (r *Reconciler) Tick(ctx context.Context) error { return r.tick(ctx) }

func (r *Reconciler) tick(ctx context.Context) error {
	containers, err := r.svc.deps.Docker.List(ctx, dockerclient.ListFilter{
		Labels: map[string]string{dockerclient.ManagedLabel: dockerclient.ManagedLabelValue},
	})
	if err != nil {
		return fmt.Errorf("list managed containers: %w", err)
	}

	byContainerID := make(map[string]dockerclient.ContainerSummary, len(containers))
	byGameID := make(map[uuid.UUID]dockerclient.ContainerSummary, len(containers))
	for _, c := range containers {
		byContainerID[c.ID] = c
		gameID, ok := parseGameIDFromContainerName(c.Name)
		if ok {
			byGameID[gameID] = c
		}
	}

	records, err := r.svc.deps.Store.ListAllRuntimeRecords(ctx)
	if err != nil {
		return fmt.Errorf("list runtime records: %w", err)
	}
	knownGames := make(map[uuid.UUID]struct{}, len(records))

	var errs []error
	for _, rec := range records {
		knownGames[rec.GameID] = struct{}{}
		if rec.IsTerminal() {
			continue
		}
		c, matched := matchContainer(rec, byContainerID, byGameID)
		if !matched {
			if err := r.markRemoved(ctx, rec); err != nil {
				errs = append(errs, fmt.Errorf("mark removed %s: %w", rec.GameID, err))
			}
			continue
		}
		if err := r.refreshSnapshot(ctx, rec, c); err != nil {
			errs = append(errs, fmt.Errorf("refresh snapshot %s: %w", rec.GameID, err))
		}
	}

	for gameID, c := range byGameID {
		if _, ok := knownGames[gameID]; ok {
			continue
		}
		if err := r.adopt(ctx, gameID, c); err != nil {
			errs = append(errs, fmt.Errorf("adopt %s: %w", gameID, err))
		}
	}
	return errors.Join(errs...)
}

func matchContainer(rec RuntimeRecord, byContainerID map[string]dockerclient.ContainerSummary, byGameID map[uuid.UUID]dockerclient.ContainerSummary) (dockerclient.ContainerSummary, bool) {
	if rec.CurrentContainerID != "" {
		if c, ok := byContainerID[rec.CurrentContainerID]; ok {
			return c, true
		}
	}
	if c, ok := byGameID[rec.GameID]; ok {
		return c, true
	}
	return dockerclient.ContainerSummary{}, false
}

func (r *Reconciler) markRemoved(ctx context.Context, rec RuntimeRecord) error {
	updated, err := r.svc.transitionRuntimeStatus(ctx, rec.GameID, RuntimeStatusRemoved, "")
	if err != nil {
		return err
	}
	r.svc.deps.Cache.PutRuntime(updated)
	if r.svc.deps.Lobby != nil {
		err = r.svc.deps.Lobby.OnRuntimeJobResult(ctx, rec.GameID, JobResult{
			Op:      OpReconcile,
			Status:  RuntimeStatusRemoved,
			Message: "container disappeared",
		})
		if err != nil {
			r.svc.deps.Logger.Warn("lobby OnRuntimeJobResult failed",
				zap.String("game_id", rec.GameID.String()),
				zap.Error(err))
		}
	}
	return nil
}

func (r *Reconciler) adopt(ctx context.Context, gameID uuid.UUID, c dockerclient.ContainerSummary) error {
	endpoint := fmt.Sprintf("http://%s:%d", HostName(gameID.String()), 8080)
	game, err := r.svc.deps.Store.LoadGameProjection(ctx, gameID)
	if err != nil {
		if errors.Is(err, ErrNotFound) {
			r.svc.deps.Logger.Warn("orphan container, no matching game",
				zap.String("game_id", gameID.String()),
				zap.String("container_id", c.ID))
			return nil
		}
		return err
	}
	rec, err := r.svc.upsertRuntimeRecord(ctx, runtimeRecordInsert{
		GameID:               gameID,
		Status:               RuntimeStatusRunning,
		CurrentContainerID:   c.ID,
		CurrentImageRef:      c.ImageRef,
		CurrentEngineVersion: c.Labels["galaxy.engine_version"],
		EngineEndpoint:       endpoint,
		DockerNetwork:        r.svc.dockerNetwork(),
		TurnSchedule:         game.TurnSchedule,
	}, runtimeRecordUpdate{
		Status:               strPtr(RuntimeStatusRunning),
		CurrentContainerID:   strPtr(c.ID),
		CurrentImageRef:      strPtr(c.ImageRef),
		CurrentEngineVersion: strPtr(c.Labels["galaxy.engine_version"]),
		EngineEndpoint:       strPtr(endpoint),
	})
	if err != nil {
		return err
	}
	r.svc.deps.Cache.PutRuntime(rec)
	r.svc.scheduler.startGame(rec)
	return nil
}

func (r *Reconciler) refreshSnapshot(ctx context.Context, rec RuntimeRecord, _ dockerclient.ContainerSummary) error {
	state, err := r.svc.deps.Engine.Status(ctx, rec.EngineEndpoint)
	if err != nil {
		_, _ = r.svc.transitionRuntimeStatus(ctx, rec.GameID, RuntimeStatusEngineUnreachable, "")
		return nil
	}
	return r.svc.publishSnapshot(ctx, rec.GameID, state)
}

func parseGameIDFromContainerName(name string) (uuid.UUID, bool) {
	const prefix = "galaxy-game-"
	suffix := strings.TrimPrefix(name, prefix)
	if suffix == name {
		return uuid.Nil, false
	}
	parsed, err := uuid.Parse(suffix)
	if err != nil {
		return uuid.Nil, false
	}
	return parsed, true
}
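
A hypothetical test sketch (not part of this commit, assuming the usual `testing` and `uuid` imports) for the naming round trip the adoption path relies on; `ContainerName` is defined in the constants file later in this diff:

func TestContainerNameRoundTrip(t *testing.T) {
	gameID := uuid.New()
	// ContainerName produces "galaxy-game-<uuid>", which parses back to the id.
	parsed, ok := parseGameIDFromContainerName(ContainerName(gameID.String()))
	if !ok || parsed != gameID {
		t.Fatalf("round trip failed: ok=%v parsed=%s", ok, parsed)
	}
	// Unlabelled / unrelated names are ignored by the reconciler.
	if _, ok := parseGameIDFromContainerName("unrelated-container"); ok {
		t.Fatal("expected non-galaxy names to be rejected")
	}
}
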
@@ -0,0 +1,101 @@
// Package runtime owns the lifecycle of game-engine containers and the
// engine-version registry on the platform side. It is the single
// component permitted to talk to the Docker daemon
// (`internal/dockerclient`) and to running engine HTTP listeners
// (`internal/engineclient`); cross-cutting concerns such as the lobby
// state machine, notification fan-out, or player-mapping persistence
// live in their domain packages and reach into runtime through a
// narrow interface set documented in `deps.go`.
//
// This package is introduced on top of the existing lobby
// implementation: the lobby `RuntimeGateway` shifts from a logger-only
// no-op to a real adapter backed by `*runtime.Service`; runtime
// publishes snapshots back into lobby through
// `LobbyConsumer.OnRuntimeSnapshot`. The engine-version registry CRUD
// endpoints under `/api/v1/admin/engine-versions/*` and the runtime
// admin/user proxy endpoints flip from 501 placeholders to real
// responses.
package runtime

import (
	"errors"

	"github.com/jackc/pgx/v5/pgconn"
)

// Runtime status vocabulary mirrors `runtime_records_status_chk` in
// `backend/internal/postgres/migrations/00001_init.sql`.
const (
	RuntimeStatusStarting             = "starting"
	RuntimeStatusRunning              = "running"
	RuntimeStatusGenerationInProgress = "generation_in_progress"
	RuntimeStatusGenerationFailed     = "generation_failed"
	RuntimeStatusStopped              = "stopped"
	RuntimeStatusEngineUnreachable    = "engine_unreachable"
	RuntimeStatusFinished             = "finished"
	RuntimeStatusRemoved              = "removed"
)

// Operation log vocabulary recorded into `runtime_operation_log.op` and
// `runtime_operation_log.status`. Kept as exported constants so
// runtime, admin handlers, and tests share the same wire values.
const (
	OpStart         = "start"
	OpStop          = "stop"
	OpPause         = "pause"
	OpResume        = "resume"
	OpRestart       = "restart"
	OpPatch         = "patch"
	OpForceNextTurn = "force_next_turn"
	OpReconcile     = "reconcile"
	OpTurn          = "turn"

	OpSourceLobby      = "lobby"
	OpSourceAdmin      = "admin"
	OpSourceScheduler  = "scheduler"
	OpSourceReconciler = "reconciler"

	OpStatusQueued    = "queued"
	OpStatusRunning   = "running"
	OpStatusSucceeded = "succeeded"
	OpStatusFailed    = "failed"
)

// Container naming convention. The hostname is the primary alias on
// the user-defined Docker network; the engine endpoint URL is
// synthesised by `dockerclient.Adapter.Run` as `http://{hostname}:8080`.
const (
	containerNamePrefix = "galaxy-game-"
	containerHostPrefix = "galaxy-game-"
)

// pgErrCodeUniqueViolation is the SQLSTATE Postgres emits on a UNIQUE
// constraint violation. Kept locally so the runtime package does not
// import `internal/admin` or `internal/lobby` for the constant.
const pgErrCodeUniqueViolation = "23505"

// isUniqueViolation reports whether err is a Postgres UNIQUE
// constraint violation, optionally restricted to a specific constraint
// name. Empty constraintName matches any UNIQUE violation.
func isUniqueViolation(err error, constraintName string) bool {
	var pgErr *pgconn.PgError
	if !errors.As(err, &pgErr) {
		return false
	}
	if pgErr.Code != pgErrCodeUniqueViolation {
		return false
	}
	if constraintName == "" {
		return true
	}
	return pgErr.ConstraintName == constraintName
}

// ContainerName synthesises the Docker container / hostname for the
// supplied game id. Exported so tests and the reconciler can resolve
// the inverse mapping without duplicating the format string.
func ContainerName(gameID string) string { return containerNamePrefix + gameID }

// HostName synthesises the in-network hostname for the supplied game
// id. Mirrors ContainerName so the engine endpoint URL `http://{host}:8080`
// resolves through Docker DNS on the user-defined network.
func HostName(gameID string) string { return containerHostPrefix + gameID }
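
A sketch of how a store-side insert could use isUniqueViolation to surface ErrEngineVersionTaken (illustrative only; the constraint name "engine_versions_pkey" is an assumption, not defined in this diff):

// mapEngineVersionInsertErr is a hypothetical helper showing the intended
// translation from a Postgres unique violation to the package sentinel.
func mapEngineVersionInsertErr(err error) error {
	if err == nil {
		return nil
	}
	if isUniqueViolation(err, "engine_versions_pkey") { // assumed constraint name
		return ErrEngineVersionTaken
	}
	return err
}
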
@@ -0,0 +1,266 @@
package runtime

import (
	"context"
	"errors"
	"sync"
	"time"

	"galaxy/backend/internal/dockerclient"
	"galaxy/cronutil"

	"github.com/google/uuid"
	"go.uber.org/zap"
)

// Scheduler runs one goroutine per running game. Each goroutine holds
// a `cronutil.Schedule` parsed from `runtime_records.turn_schedule`
// and invokes `engineclient.Turn` on every tick (or when
// `skip_next_tick=true` short-circuits the timer).
//
// Implements `app.Component` so main.go can register the bookkeeper
// component alongside the worker pool and reconciler. Run blocks on
// ctx; per-game goroutines tear down when their game leaves the cache
// (stopGame is called) or when ctx is cancelled.
type Scheduler struct {
	svc *Service

	mu       sync.Mutex
	tickers  map[uuid.UUID]*scheduledGame
	parent   context.Context
	stopping bool
}

type scheduledGame struct {
	cancel context.CancelFunc
	done   chan struct{}
}

// NewScheduler builds a Scheduler. The svc reference is held for the
// life of the Scheduler.
func NewScheduler(svc *Service) *Scheduler {
	return &Scheduler{
		svc:     svc,
		tickers: make(map[uuid.UUID]*scheduledGame),
	}
}

// Run installs ctx as the parent context and re-attaches scheduler
// goroutines for every active runtime record at startup. Blocks on
// ctx.
func (sch *Scheduler) Run(ctx context.Context) error {
	if sch == nil {
		return nil
	}
	sch.mu.Lock()
	sch.parent = ctx
	sch.stopping = false
	sch.mu.Unlock()

	// Re-attach schedulers for every running record.
	for _, rec := range sch.svc.deps.Cache.ActiveRuntimes() {
		if rec.Status != RuntimeStatusRunning {
			continue
		}
		sch.startGame(rec)
	}

	<-ctx.Done()
	return nil
}

// Shutdown cancels every per-game goroutine and waits for them to
// drain. The provided context bounds the wait.
func (sch *Scheduler) Shutdown(ctx context.Context) error {
	if sch == nil {
		return nil
	}
	sch.mu.Lock()
	sch.stopping = true
	games := make([]*scheduledGame, 0, len(sch.tickers))
	for _, g := range sch.tickers {
		games = append(games, g)
	}
	sch.tickers = make(map[uuid.UUID]*scheduledGame)
	sch.mu.Unlock()

	for _, g := range games {
		g.cancel()
	}
	for _, g := range games {
		select {
		case <-g.done:
		case <-ctx.Done():
			return ctx.Err()
		}
	}
	return nil
}

// startGame attaches a per-game scheduler goroutine. Idempotent: a
// repeated call replaces the old goroutine with a fresh one bound to
// the supplied record.
func (sch *Scheduler) startGame(rec RuntimeRecord) {
	if sch == nil {
		return
	}
	sch.mu.Lock()
	if sch.stopping || sch.parent == nil {
		sch.mu.Unlock()
		return
	}
	if existing, ok := sch.tickers[rec.GameID]; ok {
		existing.cancel()
		sch.mu.Unlock()
		<-existing.done
		sch.mu.Lock()
	}
	parent := sch.parent
	if parent == nil {
		sch.mu.Unlock()
		return
	}
	gameCtx, cancel := context.WithCancel(parent)
	g := &scheduledGame{cancel: cancel, done: make(chan struct{})}
	sch.tickers[rec.GameID] = g
	sch.mu.Unlock()

	go sch.loop(gameCtx, rec, g.done)
}

// stopGame cancels the goroutine tied to gameID. Idempotent.
func (sch *Scheduler) stopGame(gameID uuid.UUID) {
	if sch == nil {
		return
	}
	sch.mu.Lock()
	g, ok := sch.tickers[gameID]
	if ok {
		delete(sch.tickers, gameID)
	}
	sch.mu.Unlock()
	if !ok {
		return
	}
	g.cancel()
	<-g.done
}

// activeCount reports how many games currently have a scheduler
// goroutine. Used by tests.
func (sch *Scheduler) activeCount() int {
	sch.mu.Lock()
	defer sch.mu.Unlock()
	return len(sch.tickers)
}

// loop drives the per-game turn scheduler. Each iteration computes the
// wait until the next cron firing and sleeps for it; when the cron
// schedule fails to parse, the goroutine logs a warning so operators
// notice and stops.
func (sch *Scheduler) loop(ctx context.Context, rec RuntimeRecord, done chan struct{}) {
	defer close(done)
	logger := sch.svc.deps.Logger.With(zap.String("game_id", rec.GameID.String()))

	schedule, err := cronutil.Parse(rec.TurnSchedule)
	if err != nil {
		logger.Warn("invalid turn_schedule, scheduler stopping",
			zap.String("turn_schedule", rec.TurnSchedule),
			zap.Error(err))
		return
	}

	for {
		latest, ok := sch.svc.deps.Cache.GetRuntime(rec.GameID)
		if !ok {
			return
		}
		if latest.Status != RuntimeStatusRunning {
			return
		}
		now := sch.svc.deps.Now().UTC()
		next := schedule.Next(now)
		wait := next.Sub(now)
		if latest.SkipNextTick {
			wait = 0
		}
		if wait < 0 {
			wait = 0
		}

		timer := time.NewTimer(wait)
		select {
		case <-ctx.Done():
			timer.Stop()
			return
		case <-timer.C:
		}
		// Fresh fetch in case of pause / status change while waiting.
		current, ok := sch.svc.deps.Cache.GetRuntime(rec.GameID)
		if !ok {
			return
		}
		if current.Status != RuntimeStatusRunning {
			return
		}
		if current.Paused {
			continue
		}
		if err := sch.tick(ctx, current); err != nil {
			logger.Warn("scheduler tick failed", zap.Error(err))
		}
	}
}

// tick runs one engine /admin/turn call under the per-game mutex,
// publishes the resulting snapshot, and clears `skip_next_tick`.
func (sch *Scheduler) tick(ctx context.Context, rec RuntimeRecord) error {
	mu := sch.svc.gameLock(rec.GameID)
	if !mu.TryLock() {
		return nil // another op is in flight; skip this tick
	}
	defer mu.Unlock()

	op, err := sch.svc.beginOperation(ctx, rec.GameID, OpTurn, OpSourceScheduler)
	if err != nil {
		return err
	}
	state, err := sch.svc.deps.Engine.Turn(ctx, rec.EngineEndpoint)
	if err != nil {
		sch.svc.completeOperation(ctx, op, err)
		_, _ = sch.svc.transitionRuntimeStatus(ctx, rec.GameID, RuntimeStatusEngineUnreachable, "")
		// On engine unreachable, also clear skip_next_tick so the next
		// real tick can start fresh.
		_ = sch.clearSkipFlag(ctx, rec.GameID)
		// Best-effort: ask Docker whether the container is still
		// alive; if it's gone we mark the runtime row as removed.
		if rec.CurrentContainerID != "" {
			if _, inspErr := sch.svc.deps.Docker.InspectContainer(ctx, rec.CurrentContainerID); errors.Is(inspErr, dockerclient.ErrContainerNotFound) {
				_, _ = sch.svc.transitionRuntimeStatus(ctx, rec.GameID, RuntimeStatusRemoved, "")
			}
		}
		return err
	}
	if err := sch.svc.publishSnapshot(ctx, rec.GameID, state); err != nil {
		sch.svc.completeOperation(ctx, op, err)
		return err
	}
	sch.svc.completeOperation(ctx, op, nil)
	_ = sch.clearSkipFlag(ctx, rec.GameID)
	return nil
}

func (sch *Scheduler) clearSkipFlag(ctx context.Context, gameID uuid.UUID) error {
	rec, ok := sch.svc.deps.Cache.GetRuntime(gameID)
	if !ok || !rec.SkipNextTick {
		return nil
	}
	skip := false
	now := sch.svc.deps.Now().UTC()
	updated, err := sch.svc.deps.Store.UpdateRuntimeRecord(ctx, gameID, runtimeRecordUpdate{SkipNextTick: &skip}, now)
	if err != nil {
		return err
	}
	sch.svc.deps.Cache.PutRuntime(updated)
	return nil
}
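
The wait computation inside the loop, isolated as a sketch (illustrative only; the `cronutil.Schedule` type name comes from the doc comment above, so adapt if the parsed type differs): `skip_next_tick` collapses the cron wait to an immediate firing and negative waits clamp to zero.

// nextTickWait is a hypothetical extraction of the loop's timer math.
func nextTickWait(schedule cronutil.Schedule, now time.Time, skipNextTick bool) time.Duration {
	if skipNextTick {
		return 0 // force-next-turn: fire immediately
	}
	wait := schedule.Next(now).Sub(now)
	if wait < 0 {
		wait = 0
	}
	return wait
}
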
@@ -0,0 +1,908 @@
package runtime

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"galaxy/backend/internal/dockerclient"
	"galaxy/model/rest"

	"github.com/google/uuid"
	"go.uber.org/zap"
)

// Service is the runtime-domain entry point. It owns the per-game
// lifecycle (start, stop, pause, resume, restart, patch,
// force-next-turn), the runtime cache, the player-mapping projection,
// and the operation log; it coordinates with the worker pool and the
// per-game scheduler goroutines.
type Service struct {
	deps Deps

	gameMu sync.Map // uuid.UUID -> *sync.Mutex

	scheduler *Scheduler
	workers   *WorkerPool
}

// NewService constructs a Service. Logger and Now default sensibly.
// Lifecycle integration happens through the WorkerPool, Scheduler, and
// Reconciler components, which callers register separately.
func NewService(deps Deps) (*Service, error) {
|
||||
if deps.Store == nil {
|
||||
return nil, errors.New("runtime: store must not be nil")
|
||||
}
|
||||
if deps.Cache == nil {
|
||||
return nil, errors.New("runtime: cache must not be nil")
|
||||
}
|
||||
if deps.EngineVersions == nil {
|
||||
return nil, errors.New("runtime: engine version service must not be nil")
|
||||
}
|
||||
if deps.Docker == nil {
|
||||
return nil, errors.New("runtime: docker client must not be nil")
|
||||
}
|
||||
if deps.Engine == nil {
|
||||
return nil, errors.New("runtime: engine client must not be nil")
|
||||
}
|
||||
if deps.Logger == nil {
|
||||
deps.Logger = zap.NewNop()
|
||||
}
|
||||
deps.Logger = deps.Logger.Named("runtime")
|
||||
if deps.Notification == nil {
|
||||
deps.Notification = NewNoopNotificationPublisher(deps.Logger)
|
||||
}
|
||||
if deps.Now == nil {
|
||||
deps.Now = time.Now
|
||||
}
|
||||
if deps.Config.WorkerPoolSize <= 0 {
|
||||
deps.Config.WorkerPoolSize = 1
|
||||
}
|
||||
if deps.Config.JobQueueSize <= 0 {
|
||||
deps.Config.JobQueueSize = 1
|
||||
}
|
||||
if deps.Config.StopGracePeriod <= 0 {
|
||||
deps.Config.StopGracePeriod = 10 * time.Second
|
||||
}
|
||||
if deps.Config.ReconcileInterval <= 0 {
|
||||
deps.Config.ReconcileInterval = 60 * time.Second
|
||||
}
|
||||
if strings.TrimSpace(deps.Config.ContainerStateMount) == "" {
|
||||
deps.Config.ContainerStateMount = "/var/lib/galaxy-game"
|
||||
}
|
||||
if !dockerclient.PullPolicy(deps.Config.ImagePullPolicy).IsKnown() {
|
||||
return nil, fmt.Errorf("runtime: invalid image pull policy %q", deps.Config.ImagePullPolicy)
|
||||
}
|
||||
svc := &Service{deps: deps}
|
||||
svc.scheduler = NewScheduler(svc)
|
||||
svc.workers = NewWorkerPool(svc)
|
||||
return svc, nil
|
||||
}
|
||||
|
||||
// Logger exposes the named logger used by the service.
|
||||
func (s *Service) Logger() *zap.Logger { return s.deps.Logger }
|
||||
|
||||
// Cache returns the in-memory projection.
|
||||
func (s *Service) Cache() *Cache { return s.deps.Cache }
|
||||
|
||||
// EngineVersions returns the engine-version registry service.
|
||||
func (s *Service) EngineVersions() *EngineVersionService { return s.deps.EngineVersions }
|
||||
|
||||
// Workers returns the runtime worker pool component.
|
||||
func (s *Service) Workers() *WorkerPool { return s.workers }
|
||||
|
||||
// Reconciler builds an `app.Component` driving the periodic
|
||||
// reconciliation loop documented in PLAN.md §5.5.
|
||||
func (s *Service) Reconciler() *Reconciler { return NewReconciler(s) }
|
||||
|
||||
// SchedulerComponent returns the per-game scheduler bookkeeper. It
|
||||
// implements `app.Component` so main.go can register it alongside the
|
||||
// worker pool.
|
||||
func (s *Service) SchedulerComponent() *Scheduler { return s.scheduler }
|
||||
|
||||
// gameLock returns a sync.Mutex unique to gameID. Used to serialise
|
||||
// per-game runtime operations across goroutines.
|
||||
func (s *Service) gameLock(gameID uuid.UUID) *sync.Mutex {
|
||||
if v, ok := s.gameMu.Load(gameID); ok {
|
||||
return v.(*sync.Mutex)
|
||||
}
|
||||
v, _ := s.gameMu.LoadOrStore(gameID, &sync.Mutex{})
|
||||
return v.(*sync.Mutex)
|
||||
}
|
||||
|
||||
// =====================================================================
|
||||
// Lifecycle entry points (consumed by lobby.RuntimeGateway adapter)
|
||||
// =====================================================================
|
||||
|
||||
// StartGame queues a start job for gameID. Returns once the operation
|
||||
// is durably recorded; the actual pull / create / start runs on a
|
||||
// worker goroutine.
|
||||
func (s *Service) StartGame(ctx context.Context, gameID uuid.UUID) error {
|
||||
op, err := s.beginOperation(ctx, gameID, OpStart, OpSourceLobby)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return s.enqueue(ctx, jobStart{operation: op})
|
||||
}
|
||||
|
||||
// StopGame queues a stop job for gameID.
|
||||
func (s *Service) StopGame(ctx context.Context, gameID uuid.UUID) error {
|
||||
op, err := s.beginOperation(ctx, gameID, OpStop, OpSourceLobby)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return s.enqueue(ctx, jobStop{operation: op})
|
||||
}
|
||||
|
||||
// PauseGame flips the runtime row's `paused` flag. The container
|
||||
// keeps running; the scheduler short-circuits ticks while paused.
|
||||
// Synchronous because no Docker call is involved.
|
||||
func (s *Service) PauseGame(ctx context.Context, gameID uuid.UUID) error {
|
||||
mu := s.gameLock(gameID)
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
now := s.deps.Now().UTC()
|
||||
paused := true
|
||||
pausedAtPtr := &now
|
||||
patch := runtimeRecordUpdate{Paused: &paused, PausedAt: &pausedAtPtr}
|
||||
rec, err := s.deps.Store.UpdateRuntimeRecord(ctx, gameID, patch, now)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
s.deps.Cache.PutRuntime(rec)
|
||||
s.recordSyncOperation(ctx, gameID, OpPause, OpSourceLobby, rec.CurrentImageRef, rec.CurrentContainerID, nil)
|
||||
return nil
|
||||
}
|
||||
|
||||
// ResumeGame clears the `paused` flag. Synchronous.
|
||||
func (s *Service) ResumeGame(ctx context.Context, gameID uuid.UUID) error {
|
||||
mu := s.gameLock(gameID)
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
now := s.deps.Now().UTC()
|
||||
paused := false
|
||||
var nilTime *time.Time
|
||||
cleared := &nilTime
|
||||
patch := runtimeRecordUpdate{Paused: &paused, PausedAt: cleared}
|
||||
rec, err := s.deps.Store.UpdateRuntimeRecord(ctx, gameID, patch, now)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
s.deps.Cache.PutRuntime(rec)
|
||||
s.recordSyncOperation(ctx, gameID, OpResume, OpSourceLobby, rec.CurrentImageRef, rec.CurrentContainerID, nil)
|
||||
return nil
|
||||
}
|
||||
|
||||
// AdminRestart queues a restart job. Stop + remove + run with the
|
||||
// same image_ref.
|
||||
func (s *Service) AdminRestart(ctx context.Context, gameID uuid.UUID) (OperationLog, error) {
|
||||
op, err := s.beginOperation(ctx, gameID, OpRestart, OpSourceAdmin)
|
||||
if err != nil {
|
||||
return OperationLog{}, err
|
||||
}
|
||||
if err := s.enqueue(ctx, jobRestart{operation: op}); err != nil {
|
||||
return OperationLog{}, err
|
||||
}
|
||||
return op, nil
|
||||
}
|
||||
|
||||
// AdminPatch validates the target version against the registry, then
|
||||
// queues a stop + remove + run with the new image. Returns
|
||||
// ErrPatchSemverIncompatible when the target crosses major/minor.
|
||||
func (s *Service) AdminPatch(ctx context.Context, gameID uuid.UUID, targetVersion string) (OperationLog, error) {
|
||||
rec, err := s.GetRuntime(ctx, gameID)
|
||||
if err != nil {
|
||||
return OperationLog{}, err
|
||||
}
|
||||
if rec.CurrentEngineVersion == "" {
|
||||
return OperationLog{}, fmt.Errorf("%w: runtime has no current engine version", ErrConflict)
|
||||
}
|
||||
if err := CheckPatchCompatible(rec.CurrentEngineVersion, targetVersion); err != nil {
|
||||
return OperationLog{}, err
|
||||
}
|
||||
target, err := s.deps.EngineVersions.Resolve(ctx, targetVersion)
|
||||
if err != nil {
|
||||
return OperationLog{}, err
|
||||
}
|
||||
op, err := s.beginOperation(ctx, gameID, OpPatch, OpSourceAdmin)
|
||||
if err != nil {
|
||||
return OperationLog{}, err
|
||||
}
|
||||
if err := s.enqueue(ctx, jobPatch{operation: op, target: target}); err != nil {
|
||||
return OperationLog{}, err
|
||||
}
|
||||
return op, nil
|
||||
}
|
||||
|
||||
// AdminForceNextTurn sets the skip_next_tick flag so the next
|
||||
// scheduler tick fires immediately. Synchronous.
|
||||
func (s *Service) AdminForceNextTurn(ctx context.Context, gameID uuid.UUID) (OperationLog, error) {
|
||||
mu := s.gameLock(gameID)
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
now := s.deps.Now().UTC()
|
||||
skip := true
|
||||
rec, err := s.deps.Store.UpdateRuntimeRecord(ctx, gameID, runtimeRecordUpdate{SkipNextTick: &skip}, now)
|
||||
if err != nil {
|
||||
return OperationLog{}, err
|
||||
}
|
||||
s.deps.Cache.PutRuntime(rec)
|
||||
op := s.recordSyncOperation(ctx, gameID, OpForceNextTurn, OpSourceAdmin, rec.CurrentImageRef, rec.CurrentContainerID, nil)
|
||||
return op, nil
|
||||
}
|
||||
|
||||
// GetRuntime returns the runtime record for gameID, cache-first.
|
||||
func (s *Service) GetRuntime(ctx context.Context, gameID uuid.UUID) (RuntimeRecord, error) {
|
||||
if rec, ok := s.deps.Cache.GetRuntime(gameID); ok {
|
||||
return rec, nil
|
||||
}
|
||||
rec, err := s.deps.Store.LoadRuntimeRecord(ctx, gameID)
|
||||
if err != nil {
|
||||
return RuntimeRecord{}, err
|
||||
}
|
||||
s.deps.Cache.PutRuntime(rec)
|
||||
return rec, nil
|
||||
}
|
||||
|
||||
// ResolvePlayerMapping returns the (race_name, engine_player_uuid)
|
||||
// projection for the supplied (game_id, user_id). Used by the user
|
||||
// game-proxy handlers to populate the engine `actor` field.
|
||||
func (s *Service) ResolvePlayerMapping(ctx context.Context, gameID, userID uuid.UUID) (PlayerMapping, error) {
|
||||
return s.deps.Store.LoadPlayerMapping(ctx, gameID, userID)
|
||||
}
|
||||
|
||||
// EngineEndpoint returns the engine endpoint URL for gameID. Used by
|
||||
// the user game-proxy handlers.
|
||||
func (s *Service) EngineEndpoint(ctx context.Context, gameID uuid.UUID) (string, error) {
|
||||
rec, err := s.GetRuntime(ctx, gameID)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if rec.EngineEndpoint == "" {
|
||||
return "", fmt.Errorf("%w: runtime has no engine endpoint", ErrConflict)
|
||||
}
|
||||
return rec.EngineEndpoint, nil
|
||||
}
|
||||
|
||||
// =====================================================================
|
||||
// Worker / job execution
|
||||
// =====================================================================
|
||||
|
||||
// job is the internal interface implemented by every long-running
// runtime task. The worker pool dispatches them in order.
type job interface {
GameID() uuid.UUID
Run(ctx context.Context, s *Service) error
Operation() OperationLog
}

type jobStart struct{ operation OperationLog }
type jobStop struct{ operation OperationLog }
type jobRestart struct{ operation OperationLog }
type jobPatch struct {
operation OperationLog
target EngineVersion
}

func (j jobStart) GameID() uuid.UUID { return j.operation.GameID }
func (j jobStop) GameID() uuid.UUID { return j.operation.GameID }
func (j jobRestart) GameID() uuid.UUID { return j.operation.GameID }
func (j jobPatch) GameID() uuid.UUID { return j.operation.GameID }
func (j jobStart) Operation() OperationLog { return j.operation }
func (j jobStop) Operation() OperationLog { return j.operation }
func (j jobRestart) Operation() OperationLog { return j.operation }
func (j jobPatch) Operation() OperationLog { return j.operation }

func (j jobStart) Run(ctx context.Context, s *Service) error { return s.runStart(ctx, j.operation) }
func (j jobStop) Run(ctx context.Context, s *Service) error { return s.runStop(ctx, j.operation) }
func (j jobRestart) Run(ctx context.Context, s *Service) error {
return s.runRestart(ctx, j.operation)
}
func (j jobPatch) Run(ctx context.Context, s *Service) error {
return s.runPatch(ctx, j.operation, j.target)
}
|
||||
|
||||
// enqueue places job onto the worker channel. Returns ErrJobQueueFull
|
||||
// when the channel is at capacity; ErrShutdown when the pool is
|
||||
// stopped.
|
||||
func (s *Service) enqueue(ctx context.Context, j job) error {
|
||||
if s.workers == nil {
|
||||
return ErrShutdown
|
||||
}
|
||||
return s.workers.submit(ctx, j)
|
||||
}
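// The worker pool type is defined elsewhere in this package; purely as an
// assumption-laden sketch of the contract enqueue relies on (ErrJobQueueFull
// on a full channel, ErrShutdown after stop, sequential dispatch), a minimal
// pool could look like the following. Names are illustrative, not the real
// implementation:
//
//	type workerPoolSketch struct {
//		svc     *Service
//		jobs    chan job
//		stopped atomic.Bool
//	}
//
//	func (p *workerPoolSketch) submit(ctx context.Context, j job) error {
//		if p.stopped.Load() {
//			return ErrShutdown
//		}
//		select {
//		case p.jobs <- j:
//			return nil
//		case <-ctx.Done():
//			return ctx.Err()
//		default:
//			// Channel is at capacity.
//			return ErrJobQueueFull
//		}
//	}
//
//	func (p *workerPoolSketch) Run(ctx context.Context) error {
//		for {
//			select {
//			case <-ctx.Done():
//				p.stopped.Store(true)
//				return ctx.Err()
//			case j := <-p.jobs:
//				// Jobs record their own outcome on the operation log.
//				_ = j.Run(ctx, p.svc)
//			}
//		}
//	}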
|
||||
|
||||
// beginOperation persists a queued operation log row. Caller is
|
||||
// responsible for transitioning it to running/succeeded/failed via
|
||||
// completeOperation.
|
||||
func (s *Service) beginOperation(ctx context.Context, gameID uuid.UUID, op, source string) (OperationLog, error) {
|
||||
in := operationLogInsert{
|
||||
OperationID: uuid.New(),
|
||||
GameID: gameID,
|
||||
Op: op,
|
||||
Source: source,
|
||||
Status: OpStatusQueued,
|
||||
StartedAt: s.deps.Now().UTC(),
|
||||
}
|
||||
return s.deps.Store.InsertOperationLog(ctx, in)
|
||||
}
|
||||
|
||||
// recordSyncOperation logs an operation that completed synchronously
// (pause / resume / force-next-turn). It inserts a running row and then
// immediately flips it to the terminal status, keeping the audit trail
// consistent with worker jobs.
|
||||
func (s *Service) recordSyncOperation(ctx context.Context, gameID uuid.UUID, op, source, imageRef, containerID string, runErr error) OperationLog {
|
||||
in := operationLogInsert{
|
||||
OperationID: uuid.New(),
|
||||
GameID: gameID,
|
||||
Op: op,
|
||||
Source: source,
|
||||
Status: OpStatusRunning,
|
||||
ImageRef: imageRef,
|
||||
ContainerID: containerID,
|
||||
StartedAt: s.deps.Now().UTC(),
|
||||
}
|
||||
rec, err := s.deps.Store.InsertOperationLog(ctx, in)
|
||||
if err != nil {
|
||||
s.deps.Logger.Warn("operation log insert failed",
|
||||
zap.String("game_id", gameID.String()),
|
||||
zap.String("op", op),
|
||||
zap.Error(err))
|
||||
return OperationLog{}
|
||||
}
|
||||
status := OpStatusSucceeded
|
||||
errCode := ""
|
||||
errMsg := ""
|
||||
if runErr != nil {
|
||||
status = OpStatusFailed
|
||||
errCode = "internal_error"
|
||||
errMsg = runErr.Error()
|
||||
}
|
||||
completed, err := s.deps.Store.CompleteOperationLog(ctx, rec.OperationID, status, errCode, errMsg, s.deps.Now().UTC())
|
||||
if err != nil {
|
||||
s.deps.Logger.Warn("operation log complete failed",
|
||||
zap.String("game_id", gameID.String()),
|
||||
zap.String("op", op),
|
||||
zap.Error(err))
|
||||
return rec
|
||||
}
|
||||
return completed
|
||||
}
|
||||
|
||||
// completeOperation flips the row to a terminal status. runErr is nil
|
||||
// on success.
|
||||
func (s *Service) completeOperation(ctx context.Context, op OperationLog, runErr error) {
|
||||
status := OpStatusSucceeded
|
||||
errCode := ""
|
||||
errMsg := ""
|
||||
if runErr != nil {
|
||||
status = OpStatusFailed
|
||||
errCode = "internal_error"
|
||||
errMsg = runErr.Error()
|
||||
}
|
||||
if _, err := s.deps.Store.CompleteOperationLog(ctx, op.OperationID, status, errCode, errMsg, s.deps.Now().UTC()); err != nil {
|
||||
s.deps.Logger.Warn("operation log complete failed",
|
||||
zap.String("game_id", op.GameID.String()),
|
||||
zap.String("op", op.Op),
|
||||
zap.String("operation_id", op.OperationID.String()),
|
||||
zap.Error(err))
|
||||
}
|
||||
}
|
||||
|
||||
// =====================================================================
|
||||
// runStart — the heart of the package
|
||||
// =====================================================================
|
||||
|
||||
func (s *Service) runStart(ctx context.Context, op OperationLog) error {
|
||||
gameID := op.GameID
|
||||
mu := s.gameLock(gameID)
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
|
||||
game, err := s.deps.Store.LoadGameProjection(ctx, gameID)
|
||||
if err != nil {
|
||||
s.completeOperation(ctx, op, err)
|
||||
return err
|
||||
}
|
||||
if strings.TrimSpace(game.TargetEngineVersion) == "" {
|
||||
err := fmt.Errorf("%w: game has no target_engine_version", ErrInvalidInput)
|
||||
s.publishStartConfigInvalid(ctx, op, "target_engine_version is empty")
|
||||
s.completeOperation(ctx, op, err)
|
||||
return err
|
||||
}
|
||||
memberships, err := s.deps.Store.ListActiveMemberships(ctx, gameID)
|
||||
if err != nil {
|
||||
s.completeOperation(ctx, op, err)
|
||||
return err
|
||||
}
|
||||
if len(memberships) == 0 {
|
||||
err := fmt.Errorf("%w: game has no active memberships", ErrConflict)
|
||||
s.publishStartConfigInvalid(ctx, op, "no active memberships")
|
||||
s.completeOperation(ctx, op, err)
|
||||
return err
|
||||
}
|
||||
|
||||
version, err := s.deps.EngineVersions.Resolve(ctx, game.TargetEngineVersion)
|
||||
if err != nil {
|
||||
s.publishStartConfigInvalid(ctx, op, fmt.Sprintf("engine version %q: %v", game.TargetEngineVersion, err))
|
||||
s.completeOperation(ctx, op, err)
|
||||
return err
|
||||
}
|
||||
|
||||
mappings := make([]PlayerMapping, 0, len(memberships))
|
||||
races := make([]rest.InitRace, 0, len(memberships))
|
||||
for _, m := range memberships {
|
||||
mappings = append(mappings, PlayerMapping{
|
||||
GameID: gameID,
|
||||
UserID: m.UserID,
|
||||
RaceName: m.RaceName,
|
||||
EnginePlayerUUID: uuid.New(),
|
||||
})
|
||||
races = append(races, rest.InitRace{RaceName: m.RaceName})
|
||||
}
|
||||
if err := s.deps.Store.InsertPlayerMappings(ctx, mappings); err != nil {
|
||||
s.completeOperation(ctx, op, err)
|
||||
return err
|
||||
}
|
||||
|
||||
statePath := filepath.Join(filepath.Clean(s.deps.Config.ContainerStateMount), gameID.String())
|
||||
hostStatePath := filepath.Join(filepath.Clean(s.hostStateRoot()), gameID.String())
|
||||
|
||||
// Bind-mount sources are resolved by the Docker daemon against
|
||||
// the host filesystem, not against the backend process namespace.
|
||||
// Production deploys mount the same `BACKEND_GAME_STATE_ROOT`
|
||||
// path into the backend container at the same path, so creating
|
||||
// the per-game subdirectory inside backend makes it visible to
|
||||
// the daemon at the same absolute path.
|
||||
//
|
||||
// The directory is created with mode 0o777 (and explicitly
|
||||
// chmod-ed to override umask) because the engine container may
|
||||
// run as a different uid than backend. Both processes need
|
||||
// read-write access to the bind-mounted state path; backend has
|
||||
// no way to know the engine container's uid ahead of time, so
|
||||
// world-writable is the conservative default. Production
|
||||
// deployments that pin both containers to the same user can
|
||||
// tighten the mode through a future configuration knob.
|
||||
if err := os.MkdirAll(hostStatePath, 0o777); err != nil {
|
||||
s.completeOperation(ctx, op, fmt.Errorf("create host state path %q: %w", hostStatePath, err))
|
||||
return err
|
||||
}
|
||||
if err := os.Chmod(hostStatePath, 0o777); err != nil {
|
||||
s.completeOperation(ctx, op, fmt.Errorf("chmod host state path %q: %w", hostStatePath, err))
|
||||
return err
|
||||
}
|
||||
|
||||
spec := dockerclient.RunSpec{
|
||||
Name: ContainerName(gameID.String()),
|
||||
Image: version.ImageRef,
|
||||
Hostname: HostName(gameID.String()),
|
||||
Network: s.dockerNetwork(),
|
||||
Env: map[string]string{
|
||||
"GAME_STATE_PATH": statePath,
|
||||
},
|
||||
Labels: map[string]string{
|
||||
"galaxy.game_id": gameID.String(),
|
||||
"galaxy.engine_version": version.Version,
|
||||
},
|
||||
BindMounts: []dockerclient.BindMount{
|
||||
{
|
||||
HostPath: hostStatePath,
|
||||
MountPath: s.deps.Config.ContainerStateMount,
|
||||
ReadOnly: false,
|
||||
},
|
||||
},
|
||||
LogDriver: s.deps.Config.ContainerLogDriver,
|
||||
LogOpts: s.deps.Config.ContainerLogOpts,
|
||||
CPUQuota: s.deps.Config.ContainerCPUQuota,
|
||||
Memory: s.deps.Config.ContainerMemory,
|
||||
PIDsLimit: s.deps.Config.ContainerPIDsLimit,
|
||||
PullPolicy: dockerclient.PullPolicy(s.deps.Config.ImagePullPolicy),
|
||||
}
|
||||
|
||||
runResult, err := s.deps.Docker.Run(ctx, spec)
|
||||
if err != nil {
|
||||
s.publishStartFailure(ctx, op, version.ImageRef, err)
|
||||
s.completeOperation(ctx, op, err)
|
||||
return err
|
||||
}
|
||||
|
||||
now := s.deps.Now().UTC()
|
||||
startedAt := runResult.StartedAt
|
||||
if startedAt.IsZero() {
|
||||
startedAt = now
|
||||
}
|
||||
startedAtPtr := &startedAt
|
||||
rec, err := s.upsertRuntimeRecord(ctx, runtimeRecordInsert{
|
||||
GameID: gameID,
|
||||
Status: RuntimeStatusStarting,
|
||||
CurrentContainerID: runResult.ContainerID,
|
||||
CurrentImageRef: version.ImageRef,
|
||||
CurrentEngineVersion: version.Version,
|
||||
EngineEndpoint: runResult.EngineEndpoint,
|
||||
StatePath: statePath,
|
||||
DockerNetwork: s.dockerNetwork(),
|
||||
TurnSchedule: game.TurnSchedule,
|
||||
StartedAt: &startedAt,
|
||||
}, runtimeRecordUpdate{
|
||||
Status: strPtr(RuntimeStatusStarting),
|
||||
CurrentContainerID: strPtr(runResult.ContainerID),
|
||||
CurrentImageRef: strPtr(version.ImageRef),
|
||||
CurrentEngineVersion: strPtr(version.Version),
|
||||
EngineEndpoint: strPtr(runResult.EngineEndpoint),
|
||||
StatePath: strPtr(statePath),
|
||||
DockerNetwork: strPtr(s.dockerNetwork()),
|
||||
TurnSchedule: strPtr(game.TurnSchedule),
|
||||
StartedAt: &startedAtPtr,
|
||||
})
|
||||
if err != nil {
|
||||
s.completeOperation(ctx, op, err)
|
||||
return err
|
||||
}
|
||||
|
||||
// Wait for the engine HTTP listener before issuing init. Docker
|
||||
// reports the container as running as soon as the entrypoint
|
||||
// starts, but the Go binary inside may take a moment to bind
|
||||
// the port; without this loop, Init races the listener and
|
||||
// fails with `connection refused`.
|
||||
if err := s.waitForEngineHealthz(ctx, runResult.EngineEndpoint, 30*time.Second); err != nil {
|
||||
s.deps.Logger.Warn("engine healthz never succeeded",
|
||||
zap.String("game_id", gameID.String()),
|
||||
zap.Error(err))
|
||||
s.transitionRuntimeStatus(ctx, gameID, RuntimeStatusEngineUnreachable, "")
|
||||
s.completeOperation(ctx, op, err)
|
||||
return err
|
||||
}
|
||||
|
||||
initResp, err := s.deps.Engine.Init(ctx, runResult.EngineEndpoint, rest.InitRequest{Races: races})
|
||||
if err != nil {
|
||||
s.deps.Logger.Warn("engine init failed",
|
||||
zap.String("game_id", gameID.String()),
|
||||
zap.Error(err))
|
||||
s.transitionRuntimeStatus(ctx, gameID, RuntimeStatusEngineUnreachable, "")
|
||||
s.completeOperation(ctx, op, err)
|
||||
return err
|
||||
}
|
||||
|
||||
// Engine is up. Transition the runtime row to running and publish
|
||||
// the snapshot into lobby.
|
||||
rec, err = s.transitionRuntimeStatus(ctx, gameID, RuntimeStatusRunning, "ok")
|
||||
if err != nil {
|
||||
s.completeOperation(ctx, op, err)
|
||||
return err
|
||||
}
|
||||
s.scheduler.startGame(rec)
|
||||
if err := s.publishSnapshot(ctx, gameID, initResp); err != nil {
|
||||
s.deps.Logger.Warn("publish init snapshot failed",
|
||||
zap.String("game_id", gameID.String()),
|
||||
zap.Error(err))
|
||||
}
|
||||
s.completeOperation(ctx, op, nil)
|
||||
return nil
|
||||
}
|
||||
|
||||
// runStop stops + removes the engine container and transitions the
|
||||
// runtime row to `stopped`.
|
||||
func (s *Service) runStop(ctx context.Context, op OperationLog) error {
|
||||
gameID := op.GameID
|
||||
mu := s.gameLock(gameID)
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
|
||||
rec, err := s.GetRuntime(ctx, gameID)
|
||||
if err != nil {
|
||||
s.completeOperation(ctx, op, err)
|
||||
return err
|
||||
}
|
||||
s.scheduler.stopGame(gameID)
|
||||
if rec.CurrentContainerID != "" {
|
||||
if err := s.deps.Docker.Stop(ctx, rec.CurrentContainerID, int(s.deps.Config.StopGracePeriod/time.Second)); err != nil && !errors.Is(err, dockerclient.ErrContainerNotFound) {
|
||||
s.completeOperation(ctx, op, err)
|
||||
return err
|
||||
}
|
||||
if err := s.deps.Docker.Remove(ctx, rec.CurrentContainerID); err != nil {
|
||||
s.completeOperation(ctx, op, err)
|
||||
return err
|
||||
}
|
||||
}
|
||||
now := s.deps.Now().UTC()
|
||||
stoppedAtPtr := &now
|
||||
updated, err := s.deps.Store.UpdateRuntimeRecord(ctx, gameID, runtimeRecordUpdate{
|
||||
Status: strPtr(RuntimeStatusStopped),
|
||||
StoppedAt: &stoppedAtPtr,
|
||||
}, now)
|
||||
if err != nil {
|
||||
s.completeOperation(ctx, op, err)
|
||||
return err
|
||||
}
|
||||
s.deps.Cache.PutRuntime(updated)
|
||||
if err := s.deps.Store.DeletePlayerMappingsForGame(ctx, gameID); err != nil {
|
||||
s.deps.Logger.Warn("delete player_mappings on stop failed",
|
||||
zap.String("game_id", gameID.String()),
|
||||
zap.Error(err))
|
||||
}
|
||||
s.completeOperation(ctx, op, nil)
|
||||
return nil
|
||||
}
|
||||
|
||||
// runRestart stops + removes + runs a fresh container with the same
// image_ref. It delegates to runStop and then runStart, which re-loads
// the lobby projection.
|
||||
func (s *Service) runRestart(ctx context.Context, op OperationLog) error {
|
||||
if err := s.runStop(ctx, op); err != nil {
|
||||
return err
|
||||
}
|
||||
// Reuse runStart with a freshly minted operation row so the audit
|
||||
// trail remains consistent.
|
||||
startOp, err := s.beginOperation(ctx, op.GameID, OpStart, op.Source)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return s.runStart(ctx, startOp)
|
||||
}
|
||||
|
||||
// runPatch stops + removes the current container, updates the engine
|
||||
// version reference, and starts a fresh container.
|
||||
func (s *Service) runPatch(ctx context.Context, op OperationLog, target EngineVersion) error {
|
||||
mu := s.gameLock(op.GameID)
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
|
||||
rec, err := s.GetRuntime(ctx, op.GameID)
|
||||
if err != nil {
|
||||
s.completeOperation(ctx, op, err)
|
||||
return err
|
||||
}
|
||||
s.scheduler.stopGame(op.GameID)
|
||||
if rec.CurrentContainerID != "" {
|
||||
if err := s.deps.Docker.Stop(ctx, rec.CurrentContainerID, int(s.deps.Config.StopGracePeriod/time.Second)); err != nil && !errors.Is(err, dockerclient.ErrContainerNotFound) {
|
||||
s.completeOperation(ctx, op, err)
|
||||
return err
|
||||
}
|
||||
if err := s.deps.Docker.Remove(ctx, rec.CurrentContainerID); err != nil {
|
||||
s.completeOperation(ctx, op, err)
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
statePath := rec.StatePath
|
||||
if statePath == "" {
|
||||
statePath = filepath.Join(filepath.Clean(s.deps.Config.ContainerStateMount), op.GameID.String())
|
||||
}
|
||||
hostStatePath := filepath.Join(filepath.Clean(s.hostStateRoot()), op.GameID.String())
|
||||
|
||||
spec := dockerclient.RunSpec{
|
||||
Name: ContainerName(op.GameID.String()),
|
||||
Image: target.ImageRef,
|
||||
Hostname: HostName(op.GameID.String()),
|
||||
Network: s.dockerNetwork(),
|
||||
Env: map[string]string{
|
||||
"GAME_STATE_PATH": statePath,
|
||||
},
|
||||
Labels: map[string]string{
|
||||
"galaxy.game_id": op.GameID.String(),
|
||||
"galaxy.engine_version": target.Version,
|
||||
},
|
||||
BindMounts: []dockerclient.BindMount{
|
||||
{HostPath: hostStatePath, MountPath: s.deps.Config.ContainerStateMount},
|
||||
},
|
||||
LogDriver: s.deps.Config.ContainerLogDriver,
|
||||
LogOpts: s.deps.Config.ContainerLogOpts,
|
||||
CPUQuota: s.deps.Config.ContainerCPUQuota,
|
||||
Memory: s.deps.Config.ContainerMemory,
|
||||
PIDsLimit: s.deps.Config.ContainerPIDsLimit,
|
||||
PullPolicy: dockerclient.PullPolicy(s.deps.Config.ImagePullPolicy),
|
||||
}
|
||||
runResult, err := s.deps.Docker.Run(ctx, spec)
|
||||
if err != nil {
|
||||
s.publishStartFailure(ctx, op, target.ImageRef, err)
|
||||
s.completeOperation(ctx, op, err)
|
||||
return err
|
||||
}
|
||||
now := s.deps.Now().UTC()
|
||||
startedAt := runResult.StartedAt
|
||||
if startedAt.IsZero() {
|
||||
startedAt = now
|
||||
}
|
||||
startedAtPtr := &startedAt
|
||||
updated, err := s.deps.Store.UpdateRuntimeRecord(ctx, op.GameID, runtimeRecordUpdate{
|
||||
Status: strPtr(RuntimeStatusRunning),
|
||||
CurrentContainerID: strPtr(runResult.ContainerID),
|
||||
CurrentImageRef: strPtr(target.ImageRef),
|
||||
CurrentEngineVersion: strPtr(target.Version),
|
||||
EngineEndpoint: strPtr(runResult.EngineEndpoint),
|
||||
StartedAt: &startedAtPtr,
|
||||
EngineHealth: strPtr("ok"),
|
||||
}, now)
|
||||
if err != nil {
|
||||
s.completeOperation(ctx, op, err)
|
||||
return err
|
||||
}
|
||||
s.deps.Cache.PutRuntime(updated)
|
||||
s.scheduler.startGame(updated)
|
||||
s.completeOperation(ctx, op, nil)
|
||||
return nil
|
||||
}
|
||||
|
||||
// =====================================================================
|
||||
// Snapshot / status helpers
|
||||
// =====================================================================
|
||||
|
||||
// publishSnapshot writes a runtime_health_snapshots row, refreshes the
|
||||
// runtime cache from `current_turn` / `engine_health`, and forwards
|
||||
// the snapshot to lobby.
|
||||
func (s *Service) publishSnapshot(ctx context.Context, gameID uuid.UUID, state rest.StateResponse) error {
|
||||
now := s.deps.Now().UTC()
|
||||
payload, err := json.Marshal(state)
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal snapshot: %w", err)
|
||||
}
|
||||
if err := s.deps.Store.InsertHealthSnapshot(ctx, uuid.New(), gameID, now, payload); err != nil {
|
||||
return err
|
||||
}
|
||||
currentTurn := int32(state.Turn)
|
||||
patch := runtimeRecordUpdate{
|
||||
CurrentTurn:    &currentTurn,
|
||||
EngineHealth: strPtr("ok"),
|
||||
LastObservedAt: dblTime(now),
|
||||
}
|
||||
if state.Finished {
|
||||
patch.Status = strPtr(RuntimeStatusFinished)
|
||||
finishedAtPtr := &now
|
||||
patch.FinishedAt = &finishedAtPtr
|
||||
}
|
||||
rec, err := s.deps.Store.UpdateRuntimeRecord(ctx, gameID, patch, now)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
s.deps.Cache.PutRuntime(rec)
|
||||
|
||||
if s.deps.Lobby != nil {
|
||||
mappings, err := s.deps.Store.ListPlayerMappingsForGame(ctx, gameID)
|
||||
if err != nil {
|
||||
s.deps.Logger.Warn("list player_mappings on snapshot failed",
|
||||
zap.String("game_id", gameID.String()),
|
||||
zap.Error(err))
|
||||
}
|
||||
userByEngine := make(map[uuid.UUID]uuid.UUID, len(mappings))
|
||||
userByRace := make(map[string]uuid.UUID, len(mappings))
|
||||
for _, m := range mappings {
|
||||
userByEngine[m.EnginePlayerUUID] = m.UserID
|
||||
userByRace[m.RaceName] = m.UserID
|
||||
}
|
||||
stats := make([]LobbyPlayerStats, 0, len(state.Players))
|
||||
for _, p := range state.Players {
|
||||
userID, ok := userByEngine[p.ID]
|
||||
if !ok {
|
||||
userID = userByRace[p.RaceName]
|
||||
}
|
||||
if userID == uuid.Nil {
|
||||
continue
|
||||
}
|
||||
stats = append(stats, LobbyPlayerStats{
|
||||
UserID: userID,
|
||||
CurrentPlanets: int32(p.Planets),
|
||||
CurrentPopulation: int32(p.Population),
|
||||
MaxPlanets: int32(p.Planets),
|
||||
MaxPopulation: int32(p.Population),
|
||||
})
|
||||
}
|
||||
runtimeStatus := RuntimeStatusRunning
|
||||
if state.Finished {
|
||||
runtimeStatus = RuntimeStatusFinished
|
||||
}
|
||||
err = s.deps.Lobby.OnRuntimeSnapshot(ctx, gameID, LobbySnapshot{
|
||||
CurrentTurn: currentTurn,
|
||||
RuntimeStatus: runtimeStatus,
|
||||
EngineHealth: "ok",
|
||||
ObservedAt: now,
|
||||
PlayerStats: stats,
|
||||
})
|
||||
if err != nil {
|
||||
s.deps.Logger.Warn("lobby snapshot consumer failed",
|
||||
zap.String("game_id", gameID.String()),
|
||||
zap.Error(err))
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// transitionRuntimeStatus updates the status / engine_health columns
|
||||
// and refreshes the cache.
|
||||
func (s *Service) transitionRuntimeStatus(ctx context.Context, gameID uuid.UUID, status, health string) (RuntimeRecord, error) {
|
||||
now := s.deps.Now().UTC()
|
||||
patch := runtimeRecordUpdate{Status: &status}
|
||||
if health != "" {
|
||||
patch.EngineHealth = &health
|
||||
}
|
||||
if status == RuntimeStatusFinished {
|
||||
finishedAtPtr := &now
|
||||
patch.FinishedAt = &finishedAtPtr
|
||||
}
|
||||
if status == RuntimeStatusStopped {
|
||||
stoppedAtPtr := &now
|
||||
patch.StoppedAt = &stoppedAtPtr
|
||||
}
|
||||
rec, err := s.deps.Store.UpdateRuntimeRecord(ctx, gameID, patch, now)
|
||||
if err != nil {
|
||||
return RuntimeRecord{}, err
|
||||
}
|
||||
s.deps.Cache.PutRuntime(rec)
|
||||
return rec, nil
|
||||
}
|
||||
|
||||
// upsertRuntimeRecord inserts the record when no row exists; updates
|
||||
// it otherwise. Used by runStart so a re-attempt after a worker crash
|
||||
// stays idempotent.
|
||||
func (s *Service) upsertRuntimeRecord(ctx context.Context, in runtimeRecordInsert, patch runtimeRecordUpdate) (RuntimeRecord, error) {
|
||||
rec, err := s.deps.Store.InsertRuntimeRecord(ctx, in)
|
||||
if err == nil {
|
||||
s.deps.Cache.PutRuntime(rec)
|
||||
return rec, nil
|
||||
}
|
||||
if !errors.Is(err, ErrConflict) {
|
||||
return RuntimeRecord{}, err
|
||||
}
|
||||
updated, err := s.deps.Store.UpdateRuntimeRecord(ctx, in.GameID, patch, s.deps.Now().UTC())
|
||||
if err != nil {
|
||||
return RuntimeRecord{}, err
|
||||
}
|
||||
s.deps.Cache.PutRuntime(updated)
|
||||
return updated, nil
|
||||
}
|
||||
|
||||
// dockerNetwork returns the user-defined Docker network name engine
|
||||
// containers attach to. Wired from cfg.Docker.Network through Deps.
|
||||
func (s *Service) dockerNetwork() string { return s.deps.DockerNetwork }
|
||||
|
||||
// waitForEngineHealthz polls the engine `/healthz` endpoint until it
|
||||
// responds 2xx or until the timeout elapses. The Docker daemon
|
||||
// reports a container as `running` as soon as the entrypoint starts,
|
||||
// but the engine binary may need a moment to bind its TCP port; the
|
||||
// retry loop bridges that gap so the immediately-following Init call
|
||||
// does not race the listener.
|
||||
func (s *Service) waitForEngineHealthz(ctx context.Context, baseURL string, timeout time.Duration) error {
|
||||
deadline := time.Now().Add(timeout)
|
||||
var lastErr error
|
||||
for {
|
||||
probeCtx, cancel := context.WithTimeout(ctx, time.Second)
|
||||
err := s.deps.Engine.Healthz(probeCtx, baseURL)
|
||||
cancel()
|
||||
if err == nil {
|
||||
return nil
|
||||
}
|
||||
lastErr = err
|
||||
if time.Now().After(deadline) {
|
||||
return fmt.Errorf("engine healthz never succeeded within %s: %w", timeout, lastErr)
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-time.After(200 * time.Millisecond):
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// hostStateRoot returns the host-side root directory under which the
|
||||
// per-game state directory is created. Wired from cfg.Game.StateRoot
|
||||
// through Deps.
|
||||
func (s *Service) hostStateRoot() string {
|
||||
if s.deps.HostStateRoot != "" {
|
||||
return s.deps.HostStateRoot
|
||||
}
|
||||
return s.deps.Config.ContainerStateMount
|
||||
}
|
||||
|
||||
// strPtr returns a pointer to s. Helps assemble runtimeRecordUpdate
|
||||
// values inline.
|
||||
func strPtr(s string) *string { return &s }
|
||||
|
||||
// dblTime returns a `**time.Time` pointing at t. Used to set the
// nullable timestamp columns of `runtime_records` through
// runtimeRecordUpdate; clearing a column to NULL instead passes a
// pointer to a nil *time.Time.
|
||||
func dblTime(t time.Time) **time.Time { p := &t; return &p }
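// Usage sketch (illustrative): the double pointer lets a caller express all
// three intents for a nullable timestamp column of runtime_records.
//
//	var cleared *time.Time                                  // inner nil
//	_ = runtimeRecordUpdate{}                               // outer nil -> leave paused_at alone
//	_ = runtimeRecordUpdate{PausedAt: &cleared}             // inner nil -> clear paused_at to NULL
//	_ = runtimeRecordUpdate{PausedAt: dblTime(time.Now())}  // both set  -> write the timestamp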
|
||||
@@ -0,0 +1,298 @@
|
||||
package runtime_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"net/url"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"galaxy/backend/internal/config"
|
||||
"galaxy/backend/internal/dockerclient"
|
||||
"galaxy/backend/internal/engineclient"
|
||||
backendpg "galaxy/backend/internal/postgres"
|
||||
"galaxy/backend/internal/runtime"
|
||||
"galaxy/model/rest"
|
||||
pgshared "galaxy/postgres"
|
||||
|
||||
"github.com/google/uuid"
|
||||
testcontainers "github.com/testcontainers/testcontainers-go"
|
||||
tcpostgres "github.com/testcontainers/testcontainers-go/modules/postgres"
|
||||
"github.com/testcontainers/testcontainers-go/wait"
|
||||
"go.uber.org/zap/zaptest"
|
||||
)
|
||||
|
||||
const (
pgImage = "postgres:16-alpine"
pgUser = "galaxy"
pgPassword = "galaxy"
pgDatabase = "galaxy_backend"
pgSchema = "backend"
pgStartup = 90 * time.Second
pgOpTO = 10 * time.Second
)
|
||||
|
||||
func dsnWithSearchPath(raw, schema string) (string, error) {
|
||||
parsed, err := url.Parse(raw)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
q := parsed.Query()
|
||||
q.Set("search_path", schema)
|
||||
parsed.RawQuery = q.Encode()
|
||||
return parsed.String(), nil
|
||||
}
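// For example (illustrative values), a base DSN of
// "postgres://galaxy:galaxy@localhost:5432/galaxy_backend?sslmode=disable"
// scoped to the "backend" schema becomes
// "postgres://galaxy:galaxy@localhost:5432/galaxy_backend?search_path=backend&sslmode=disable"
// (url.Values.Encode sorts the query keys alphabetically).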
|
||||
|
||||
func startPostgres(t *testing.T) *sql.DB {
|
||||
t.Helper()
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
|
||||
t.Cleanup(cancel)
|
||||
|
||||
container, err := tcpostgres.Run(ctx, pgImage,
|
||||
tcpostgres.WithDatabase(pgDatabase),
|
||||
tcpostgres.WithUsername(pgUser),
|
||||
tcpostgres.WithPassword(pgPassword),
|
||||
testcontainers.WithWaitStrategy(
|
||||
wait.ForLog("database system is ready to accept connections").
|
||||
WithOccurrence(2).
|
||||
WithStartupTimeout(pgStartup),
|
||||
),
|
||||
)
|
||||
if err != nil {
|
||||
t.Skipf("postgres testcontainer unavailable, skipping: %v", err)
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
if termErr := testcontainers.TerminateContainer(container); termErr != nil {
|
||||
t.Errorf("terminate postgres container: %v", termErr)
|
||||
}
|
||||
})
|
||||
|
||||
baseDSN, err := container.ConnectionString(ctx, "sslmode=disable")
|
||||
if err != nil {
|
||||
t.Fatalf("connection string: %v", err)
|
||||
}
|
||||
scopedDSN, err := dsnWithSearchPath(baseDSN, pgSchema)
|
||||
if err != nil {
|
||||
t.Fatalf("scope dsn: %v", err)
|
||||
}
|
||||
cfg := pgshared.DefaultConfig()
|
||||
cfg.PrimaryDSN = scopedDSN
|
||||
cfg.OperationTimeout = pgOpTO
|
||||
db, err := pgshared.OpenPrimary(ctx, cfg)
|
||||
if err != nil {
|
||||
t.Fatalf("open primary: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { _ = db.Close() })
|
||||
if err := backendpg.ApplyMigrations(ctx, db); err != nil {
|
||||
t.Fatalf("apply migrations: %v", err)
|
||||
}
|
||||
return db
|
||||
}
|
||||
|
||||
// fakeDocker implements dockerclient.Client for tests.
|
||||
type fakeDocker struct {
|
||||
mu sync.Mutex
|
||||
runs []dockerclient.RunSpec
|
||||
stoppedIDs []string
|
||||
removedIDs []string
|
||||
listResult []dockerclient.ContainerSummary
|
||||
endpointFor func(spec dockerclient.RunSpec) string
|
||||
}
|
||||
|
||||
func (f *fakeDocker) EnsureNetwork(_ context.Context, _ string) error { return nil }
|
||||
func (f *fakeDocker) PullImage(_ context.Context, _ string, _ dockerclient.PullPolicy) error {
|
||||
return nil
|
||||
}
|
||||
func (f *fakeDocker) InspectImage(_ context.Context, ref string) (dockerclient.ImageInspect, error) {
|
||||
return dockerclient.ImageInspect{Ref: ref}, nil
|
||||
}
|
||||
func (f *fakeDocker) InspectContainer(_ context.Context, _ string) (dockerclient.ContainerInspect, error) {
|
||||
return dockerclient.ContainerInspect{}, nil
|
||||
}
|
||||
func (f *fakeDocker) Run(_ context.Context, spec dockerclient.RunSpec) (dockerclient.RunResult, error) {
|
||||
f.mu.Lock()
|
||||
defer f.mu.Unlock()
|
||||
f.runs = append(f.runs, spec)
|
||||
endpoint := "http://" + spec.Hostname + ":8080"
|
||||
if f.endpointFor != nil {
|
||||
endpoint = f.endpointFor(spec)
|
||||
}
|
||||
return dockerclient.RunResult{
|
||||
ContainerID: "container-" + spec.Name,
|
||||
EngineEndpoint: endpoint,
|
||||
StartedAt: time.Now().UTC(),
|
||||
}, nil
|
||||
}
|
||||
func (f *fakeDocker) Stop(_ context.Context, id string, _ int) error {
|
||||
f.mu.Lock()
|
||||
f.stoppedIDs = append(f.stoppedIDs, id)
|
||||
f.mu.Unlock()
|
||||
return nil
|
||||
}
|
||||
func (f *fakeDocker) Remove(_ context.Context, id string) error {
|
||||
f.mu.Lock()
|
||||
f.removedIDs = append(f.removedIDs, id)
|
||||
f.mu.Unlock()
|
||||
return nil
|
||||
}
|
||||
func (f *fakeDocker) List(_ context.Context, _ dockerclient.ListFilter) ([]dockerclient.ContainerSummary, error) {
|
||||
return f.listResult, nil
|
||||
}
|
||||
|
||||
// fakeLobbyConsumer captures runtime → lobby callbacks.
|
||||
type fakeLobbyConsumer struct {
|
||||
mu sync.Mutex
|
||||
snapshots []runtime.LobbySnapshot
|
||||
jobs []runtime.JobResult
|
||||
}
|
||||
|
||||
func (f *fakeLobbyConsumer) OnRuntimeSnapshot(_ context.Context, _ uuid.UUID, snapshot runtime.LobbySnapshot) error {
|
||||
f.mu.Lock()
|
||||
defer f.mu.Unlock()
|
||||
f.snapshots = append(f.snapshots, snapshot)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (f *fakeLobbyConsumer) OnRuntimeJobResult(_ context.Context, _ uuid.UUID, result runtime.JobResult) error {
|
||||
f.mu.Lock()
|
||||
defer f.mu.Unlock()
|
||||
f.jobs = append(f.jobs, result)
|
||||
return nil
|
||||
}
|
||||
|
||||
func TestServiceStartGameEndToEnd(t *testing.T) {
|
||||
if testing.Short() {
|
||||
t.Skip("postgres-backed test skipped in -short")
|
||||
}
|
||||
ctx := context.Background()
|
||||
db := startPostgres(t)
|
||||
|
||||
gameID := uuid.New()
|
||||
userID := uuid.New()
|
||||
if _, err := db.ExecContext(ctx, `
|
||||
INSERT INTO backend.games (
|
||||
game_id, owner_user_id, visibility, status, game_name, description,
|
||||
min_players, max_players, start_gap_hours, start_gap_players,
|
||||
enrollment_ends_at, turn_schedule, target_engine_version,
|
||||
runtime_snapshot
|
||||
) VALUES ($1, NULL, 'public', 'starting', 'test-game', '',
|
||||
1, 4, 0, 0, $2, '*/5 * * * *', '0.1.0', '{}'::jsonb)
|
||||
`, gameID, time.Now().Add(time.Hour)); err != nil {
|
||||
t.Fatalf("insert game: %v", err)
|
||||
}
|
||||
if _, err := db.ExecContext(ctx, `
|
||||
INSERT INTO backend.memberships (membership_id, game_id, user_id, race_name, canonical_key, status)
|
||||
VALUES ($1, $2, $3, 'Alpha', 'alpha', 'active')
|
||||
`, uuid.New(), gameID, userID); err != nil {
|
||||
t.Fatalf("insert membership: %v", err)
|
||||
}
|
||||
if _, err := db.ExecContext(ctx, `
|
||||
INSERT INTO backend.engine_versions (version, image_ref, enabled)
|
||||
VALUES ('0.1.0', 'galaxy-game:0.1.0', true)
|
||||
`); err != nil {
|
||||
t.Fatalf("insert engine version: %v", err)
|
||||
}
|
||||
|
||||
engineSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
switch r.URL.Path {
|
||||
case "/api/v1/admin/init":
|
||||
_ = json.NewEncoder(w).Encode(rest.StateResponse{ID: gameID, Turn: 0, Players: []rest.PlayerState{{RaceName: "Alpha", Planets: 3, Population: 10}}})
|
||||
case "/api/v1/admin/status":
|
||||
_ = json.NewEncoder(w).Encode(rest.StateResponse{ID: gameID, Turn: 1, Players: []rest.PlayerState{{RaceName: "Alpha", Planets: 5, Population: 12}}})
|
||||
case "/api/v1/admin/turn":
|
||||
_ = json.NewEncoder(w).Encode(rest.StateResponse{ID: gameID, Turn: 2, Players: []rest.PlayerState{{RaceName: "Alpha", Planets: 6, Population: 14}}, Finished: true})
|
||||
default:
|
||||
http.NotFound(w, r)
|
||||
}
|
||||
}))
|
||||
t.Cleanup(engineSrv.Close)
|
||||
|
||||
docker := &fakeDocker{endpointFor: func(_ dockerclient.RunSpec) string { return engineSrv.URL }}
|
||||
engineCli, err := engineclient.NewClientWithHTTP(engineclient.Config{CallTimeout: time.Second, ProbeTimeout: time.Second}, engineSrv.Client())
|
||||
if err != nil {
|
||||
t.Fatalf("engineclient: %v", err)
|
||||
}
|
||||
store := runtime.NewStore(db)
|
||||
cache := runtime.NewCache()
|
||||
if err := cache.Warm(ctx, store); err != nil {
|
||||
t.Fatalf("warm cache: %v", err)
|
||||
}
|
||||
versions := runtime.NewEngineVersionService(store, cache, nil)
|
||||
consumer := &fakeLobbyConsumer{}
|
||||
|
||||
svc, err := runtime.NewService(runtime.Deps{
|
||||
Store: store,
|
||||
Cache: cache,
|
||||
EngineVersions: versions,
|
||||
Docker: docker,
|
||||
Engine: engineCli,
|
||||
Lobby: consumer,
|
||||
DockerNetwork: "galaxy",
|
||||
HostStateRoot: t.TempDir(),
|
||||
Config: config.RuntimeConfig{
|
||||
WorkerPoolSize: 1,
|
||||
JobQueueSize: 4,
|
||||
ReconcileInterval: time.Hour,
|
||||
ImagePullPolicy: "if_missing",
|
||||
ContainerLogDriver: "json-file",
|
||||
ContainerCPUQuota: 1.0,
|
||||
ContainerMemory: "128m",
|
||||
ContainerPIDsLimit: 64,
|
||||
ContainerStateMount: "/var/lib/galaxy-game",
|
||||
StopGracePeriod: time.Second,
|
||||
},
|
||||
Logger: zaptest.NewLogger(t),
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("NewService: %v", err)
|
||||
}
|
||||
|
||||
// Drive StartGame through the public API. StartGame only enqueues the
// job; the worker pool is not started by NewService, so we run it here
// in a goroutine and cancel it once the side effects are observed.
|
||||
pool := svc.Workers()
|
||||
runCtx, runCancel := context.WithCancel(ctx)
|
||||
t.Cleanup(runCancel)
|
||||
go func() { _ = pool.Run(runCtx) }()
|
||||
|
||||
if err := svc.StartGame(ctx, gameID); err != nil {
|
||||
t.Fatalf("StartGame: %v", err)
|
||||
}
|
||||
deadline := time.Now().Add(5 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
rec, err := svc.GetRuntime(ctx, gameID)
|
||||
if err == nil && rec.Status == runtime.RuntimeStatusRunning {
|
||||
break
|
||||
}
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
}
|
||||
rec, err := svc.GetRuntime(ctx, gameID)
|
||||
if err != nil {
|
||||
t.Fatalf("GetRuntime: %v", err)
|
||||
}
|
||||
if rec.Status != runtime.RuntimeStatusRunning {
|
||||
t.Fatalf("runtime status = %s, want running", rec.Status)
|
||||
}
|
||||
if rec.CurrentImageRef != "galaxy-game:0.1.0" {
|
||||
t.Fatalf("image_ref = %s", rec.CurrentImageRef)
|
||||
}
|
||||
consumer.mu.Lock()
|
||||
snapshotCount := len(consumer.snapshots)
|
||||
consumer.mu.Unlock()
|
||||
if snapshotCount == 0 {
|
||||
t.Fatalf("expected runtime snapshot")
|
||||
}
|
||||
mappings, err := store.ListPlayerMappingsForGame(ctx, gameID)
|
||||
if err != nil {
|
||||
t.Fatalf("ListPlayerMappingsForGame: %v", err)
|
||||
}
|
||||
if len(mappings) != 1 || mappings[0].UserID != userID {
|
||||
t.Fatalf("unexpected mappings: %+v", mappings)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,714 @@
|
||||
package runtime
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"galaxy/backend/internal/postgres/jet/backend/model"
|
||||
"galaxy/backend/internal/postgres/jet/backend/table"
|
||||
|
||||
"github.com/go-jet/jet/v2/postgres"
|
||||
"github.com/go-jet/jet/v2/qrm"
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
|
||||
// engineVersionsPK is the constraint name surfaced when a duplicate
|
||||
// `version` is inserted. Postgres synthesises `<table>_pkey` for the
|
||||
// primary-key constraint, matching the migration in
|
||||
// `backend/internal/postgres/migrations/00001_init.sql:407`.
|
||||
const engineVersionsPK = "engine_versions_pkey"
|
||||
|
||||
// runtimeRecordsPK is the constraint name surfaced when a duplicate
|
||||
// `runtime_records.game_id` insert hits the primary key.
|
||||
const runtimeRecordsPK = "runtime_records_pkey"
|
||||
|
||||
// playerMappingsRaceUnique mirrors
|
||||
// `player_mappings_game_race_uidx`, the partial UNIQUE that enforces
|
||||
// the one-race-per-game invariant.
|
||||
const playerMappingsRaceUnique = "player_mappings_game_race_uidx"
|
||||
|
||||
// Store is the Postgres-backed query surface for the runtime package.
|
||||
// All queries are built through go-jet against the generated table
|
||||
// bindings under `backend/internal/postgres/jet/backend/table`.
|
||||
type Store struct {
|
||||
db *sql.DB
|
||||
}
|
||||
|
||||
// NewStore constructs a Store wrapping db.
|
||||
func NewStore(db *sql.DB) *Store { return &Store{db: db} }
|
||||
|
||||
// engineVersionColumns is the canonical projection used by every
|
||||
// engine-version read path.
|
||||
func engineVersionColumns() postgres.ColumnList {
|
||||
v := table.EngineVersions
|
||||
return postgres.ColumnList{v.Version, v.ImageRef, v.Enabled, v.CreatedAt, v.UpdatedAt}
|
||||
}
|
||||
|
||||
// runtimeRecordColumns is the canonical projection used by every
|
||||
// runtime-record read path.
|
||||
func runtimeRecordColumns() postgres.ColumnList {
|
||||
r := table.RuntimeRecords
|
||||
return postgres.ColumnList{
|
||||
r.GameID, r.Status, r.CurrentContainerID, r.CurrentImageRef,
|
||||
r.CurrentEngineVersion, r.EngineEndpoint, r.StatePath, r.DockerNetwork,
|
||||
r.TurnSchedule, r.CurrentTurn, r.NextGenerationAt, r.SkipNextTick,
|
||||
r.Paused, r.PausedAt, r.EngineHealth,
|
||||
r.CreatedAt, r.UpdatedAt, r.StartedAt, r.StoppedAt, r.FinishedAt,
|
||||
r.RemovedAt, r.LastObservedAt,
|
||||
}
|
||||
}
|
||||
|
||||
// operationLogColumns is the canonical projection used by every read
|
||||
// of `backend.runtime_operation_log`.
|
||||
func operationLogColumns() postgres.ColumnList {
|
||||
o := table.RuntimeOperationLog
|
||||
return postgres.ColumnList{
|
||||
o.OperationID, o.GameID, o.Op, o.Source, o.Status, o.ImageRef,
|
||||
o.ContainerID, o.ErrorCode, o.ErrorMessage, o.StartedAt, o.FinishedAt,
|
||||
}
|
||||
}
|
||||
|
||||
// =====================================================================
|
||||
// Engine version registry
|
||||
// =====================================================================
|
||||
|
||||
// ListEngineVersions returns every engine_versions row ordered by
|
||||
// created_at DESC.
|
||||
func (s *Store) ListEngineVersions(ctx context.Context) ([]EngineVersion, error) {
|
||||
v := table.EngineVersions
|
||||
stmt := postgres.SELECT(engineVersionColumns()).
|
||||
FROM(v).
|
||||
ORDER_BY(v.CreatedAt.DESC(), v.Version.DESC())
|
||||
var rows []model.EngineVersions
|
||||
if err := stmt.QueryContext(ctx, s.db, &rows); err != nil {
|
||||
return nil, fmt.Errorf("runtime store: list engine versions: %w", err)
|
||||
}
|
||||
out := make([]EngineVersion, 0, len(rows))
|
||||
for _, row := range rows {
|
||||
out = append(out, modelToEngineVersion(row))
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// GetEngineVersion returns the row for version. Returns ErrNotFound
|
||||
// when no row matches.
|
||||
func (s *Store) GetEngineVersion(ctx context.Context, version string) (EngineVersion, error) {
|
||||
v := table.EngineVersions
|
||||
stmt := postgres.SELECT(engineVersionColumns()).
|
||||
FROM(v).
|
||||
WHERE(v.Version.EQ(postgres.String(version))).
|
||||
LIMIT(1)
|
||||
var row model.EngineVersions
|
||||
if err := stmt.QueryContext(ctx, s.db, &row); err != nil {
|
||||
if errors.Is(err, qrm.ErrNoRows) {
|
||||
return EngineVersion{}, ErrNotFound
|
||||
}
|
||||
return EngineVersion{}, fmt.Errorf("runtime store: load engine version %q: %w", version, err)
|
||||
}
|
||||
return modelToEngineVersion(row), nil
|
||||
}
|
||||
|
||||
// InsertEngineVersion persists a fresh engine version row. Returns
|
||||
// ErrEngineVersionTaken when the primary key collides.
|
||||
func (s *Store) InsertEngineVersion(ctx context.Context, version, imageRef string, enabled bool, now time.Time) (EngineVersion, error) {
|
||||
v := table.EngineVersions
|
||||
stmt := v.INSERT(v.Version, v.ImageRef, v.Enabled, v.CreatedAt, v.UpdatedAt).
|
||||
VALUES(version, imageRef, enabled, now, now).
|
||||
RETURNING(engineVersionColumns())
|
||||
var row model.EngineVersions
|
||||
if err := stmt.QueryContext(ctx, s.db, &row); err != nil {
|
||||
if isUniqueViolation(err, engineVersionsPK) {
|
||||
return EngineVersion{}, ErrEngineVersionTaken
|
||||
}
|
||||
return EngineVersion{}, fmt.Errorf("runtime store: insert engine version %q: %w", version, err)
|
||||
}
|
||||
return modelToEngineVersion(row), nil
|
||||
}
|
||||
|
||||
// engineVersionUpdate carries the parameters for UpdateEngineVersion.
|
||||
// Nil pointers leave the corresponding column alone.
|
||||
type engineVersionUpdate struct {
|
||||
ImageRef *string
|
||||
Enabled *bool
|
||||
}
|
||||
|
||||
// UpdateEngineVersion patches the supplied columns and bumps
|
||||
// updated_at. Returns ErrNotFound when no row matches.
|
||||
func (s *Store) UpdateEngineVersion(ctx context.Context, version string, patch engineVersionUpdate, now time.Time) (EngineVersion, error) {
|
||||
v := table.EngineVersions
|
||||
rest := []any{}
|
||||
if patch.ImageRef != nil {
|
||||
rest = append(rest, v.ImageRef.SET(postgres.String(*patch.ImageRef)))
|
||||
}
|
||||
if patch.Enabled != nil {
|
||||
rest = append(rest, v.Enabled.SET(postgres.Bool(*patch.Enabled)))
|
||||
}
|
||||
stmt := v.UPDATE().
|
||||
SET(v.UpdatedAt.SET(postgres.TimestampzT(now)), rest...).
|
||||
WHERE(v.Version.EQ(postgres.String(version))).
|
||||
RETURNING(engineVersionColumns())
|
||||
|
||||
var row model.EngineVersions
|
||||
if err := stmt.QueryContext(ctx, s.db, &row); err != nil {
|
||||
if errors.Is(err, qrm.ErrNoRows) {
|
||||
return EngineVersion{}, ErrNotFound
|
||||
}
|
||||
return EngineVersion{}, fmt.Errorf("runtime store: update engine version %q: %w", version, err)
|
||||
}
|
||||
return modelToEngineVersion(row), nil
|
||||
}
|
||||
|
||||
// =====================================================================
|
||||
// Runtime records
|
||||
// =====================================================================
|
||||
|
||||
// runtimeRecordInsert carries the parameters for InsertRuntimeRecord.
|
||||
type runtimeRecordInsert struct {
|
||||
GameID uuid.UUID
|
||||
Status string
|
||||
CurrentContainerID string
|
||||
CurrentImageRef string
|
||||
CurrentEngineVersion string
|
||||
EngineEndpoint string
|
||||
StatePath string
|
||||
DockerNetwork string
|
||||
TurnSchedule string
|
||||
StartedAt *time.Time
|
||||
}
|
||||
|
||||
// InsertRuntimeRecord creates a fresh row.
|
||||
func (s *Store) InsertRuntimeRecord(ctx context.Context, in runtimeRecordInsert) (RuntimeRecord, error) {
|
||||
r := table.RuntimeRecords
|
||||
stmt := r.INSERT(
|
||||
r.GameID, r.Status, r.CurrentContainerID, r.CurrentImageRef,
|
||||
r.CurrentEngineVersion, r.EngineEndpoint, r.StatePath,
|
||||
r.DockerNetwork, r.TurnSchedule, r.StartedAt,
|
||||
).VALUES(
|
||||
in.GameID, in.Status,
|
||||
nullableString(in.CurrentContainerID),
|
||||
nullableString(in.CurrentImageRef),
|
||||
nullableString(in.CurrentEngineVersion),
|
||||
in.EngineEndpoint,
|
||||
nullableString(in.StatePath),
|
||||
nullableString(in.DockerNetwork),
|
||||
in.TurnSchedule,
|
||||
nullableTime(in.StartedAt),
|
||||
).RETURNING(runtimeRecordColumns())
|
||||
|
||||
var row model.RuntimeRecords
|
||||
if err := stmt.QueryContext(ctx, s.db, &row); err != nil {
|
||||
if isUniqueViolation(err, runtimeRecordsPK) {
|
||||
return RuntimeRecord{}, ErrConflict
|
||||
}
|
||||
return RuntimeRecord{}, fmt.Errorf("runtime store: insert runtime_record %s: %w", in.GameID, err)
|
||||
}
|
||||
return modelToRuntimeRecord(row), nil
|
||||
}
|
||||
|
||||
// LoadRuntimeRecord returns the row for gameID. Returns ErrNotFound
|
||||
// when no row matches.
|
||||
func (s *Store) LoadRuntimeRecord(ctx context.Context, gameID uuid.UUID) (RuntimeRecord, error) {
|
||||
r := table.RuntimeRecords
|
||||
stmt := postgres.SELECT(runtimeRecordColumns()).
|
||||
FROM(r).
|
||||
WHERE(r.GameID.EQ(postgres.UUID(gameID))).
|
||||
LIMIT(1)
|
||||
var row model.RuntimeRecords
|
||||
if err := stmt.QueryContext(ctx, s.db, &row); err != nil {
|
||||
if errors.Is(err, qrm.ErrNoRows) {
|
||||
return RuntimeRecord{}, ErrNotFound
|
||||
}
|
||||
return RuntimeRecord{}, fmt.Errorf("runtime store: load runtime_record %s: %w", gameID, err)
|
||||
}
|
||||
return modelToRuntimeRecord(row), nil
|
||||
}
|
||||
|
||||
// ListAllRuntimeRecords returns every row, used by Cache.Warm.
|
||||
func (s *Store) ListAllRuntimeRecords(ctx context.Context) ([]RuntimeRecord, error) {
|
||||
stmt := postgres.SELECT(runtimeRecordColumns()).FROM(table.RuntimeRecords)
|
||||
var rows []model.RuntimeRecords
|
||||
if err := stmt.QueryContext(ctx, s.db, &rows); err != nil {
|
||||
return nil, fmt.Errorf("runtime store: list runtime_records: %w", err)
|
||||
}
|
||||
out := make([]RuntimeRecord, 0, len(rows))
|
||||
for _, row := range rows {
|
||||
out = append(out, modelToRuntimeRecord(row))
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// runtimeRecordUpdate carries the parameters for UpdateRuntimeRecord.
|
||||
// Pointer fields default to "leave alone" when nil.
|
||||
type runtimeRecordUpdate struct {
|
||||
Status *string
|
||||
CurrentContainerID *string
|
||||
CurrentImageRef *string
|
||||
CurrentEngineVersion *string
|
||||
EngineEndpoint *string
|
||||
StatePath *string
|
||||
DockerNetwork *string
|
||||
TurnSchedule *string
|
||||
CurrentTurn *int32
|
||||
NextGenerationAt **time.Time
|
||||
SkipNextTick *bool
|
||||
Paused *bool
|
||||
PausedAt **time.Time
|
||||
EngineHealth *string
|
||||
StartedAt **time.Time
|
||||
StoppedAt **time.Time
|
||||
FinishedAt **time.Time
|
||||
RemovedAt **time.Time
|
||||
LastObservedAt **time.Time
|
||||
}
|
||||
|
||||
// UpdateRuntimeRecord patches the supplied columns. Pointer fields are
|
||||
// translated into a dynamic SET list — only the fields the caller
|
||||
// supplies are emitted in the UPDATE. Nullable timestamps use a
|
||||
// `**time.Time` so callers can distinguish "leave alone" (outer nil)
|
||||
// from "clear to NULL" (inner nil).
|
||||
func (s *Store) UpdateRuntimeRecord(ctx context.Context, gameID uuid.UUID, patch runtimeRecordUpdate, now time.Time) (RuntimeRecord, error) {
|
||||
r := table.RuntimeRecords
|
||||
rest := []any{}
|
||||
if patch.Status != nil {
|
||||
rest = append(rest, r.Status.SET(postgres.String(*patch.Status)))
|
||||
}
|
||||
if patch.CurrentContainerID != nil {
|
||||
rest = append(rest, r.CurrentContainerID.SET(nullableStringSetExpr(*patch.CurrentContainerID)))
|
||||
}
|
||||
if patch.CurrentImageRef != nil {
|
||||
rest = append(rest, r.CurrentImageRef.SET(nullableStringSetExpr(*patch.CurrentImageRef)))
|
||||
}
|
||||
if patch.CurrentEngineVersion != nil {
|
||||
rest = append(rest, r.CurrentEngineVersion.SET(nullableStringSetExpr(*patch.CurrentEngineVersion)))
|
||||
}
|
||||
if patch.EngineEndpoint != nil {
|
||||
rest = append(rest, r.EngineEndpoint.SET(postgres.String(*patch.EngineEndpoint)))
|
||||
}
|
||||
if patch.StatePath != nil {
|
||||
rest = append(rest, r.StatePath.SET(nullableStringSetExpr(*patch.StatePath)))
|
||||
}
|
||||
if patch.DockerNetwork != nil {
|
||||
rest = append(rest, r.DockerNetwork.SET(nullableStringSetExpr(*patch.DockerNetwork)))
|
||||
}
|
||||
if patch.TurnSchedule != nil {
|
||||
rest = append(rest, r.TurnSchedule.SET(postgres.String(*patch.TurnSchedule)))
|
||||
}
|
||||
if patch.CurrentTurn != nil {
|
||||
rest = append(rest, r.CurrentTurn.SET(postgres.Int(int64(*patch.CurrentTurn))))
|
||||
}
|
||||
if patch.NextGenerationAt != nil {
|
||||
rest = append(rest, r.NextGenerationAt.SET(timePtrSetExpr(*patch.NextGenerationAt)))
|
||||
}
|
||||
if patch.SkipNextTick != nil {
|
||||
rest = append(rest, r.SkipNextTick.SET(postgres.Bool(*patch.SkipNextTick)))
|
||||
}
|
||||
if patch.Paused != nil {
|
||||
rest = append(rest, r.Paused.SET(postgres.Bool(*patch.Paused)))
|
||||
}
|
||||
if patch.PausedAt != nil {
|
||||
rest = append(rest, r.PausedAt.SET(timePtrSetExpr(*patch.PausedAt)))
|
||||
}
|
||||
if patch.EngineHealth != nil {
|
||||
rest = append(rest, r.EngineHealth.SET(postgres.String(*patch.EngineHealth)))
|
||||
}
|
||||
if patch.StartedAt != nil {
|
||||
rest = append(rest, r.StartedAt.SET(timePtrSetExpr(*patch.StartedAt)))
|
||||
}
|
||||
if patch.StoppedAt != nil {
|
||||
rest = append(rest, r.StoppedAt.SET(timePtrSetExpr(*patch.StoppedAt)))
|
||||
}
|
||||
if patch.FinishedAt != nil {
|
||||
rest = append(rest, r.FinishedAt.SET(timePtrSetExpr(*patch.FinishedAt)))
|
||||
}
|
||||
if patch.RemovedAt != nil {
|
||||
rest = append(rest, r.RemovedAt.SET(timePtrSetExpr(*patch.RemovedAt)))
|
||||
}
|
||||
if patch.LastObservedAt != nil {
|
||||
rest = append(rest, r.LastObservedAt.SET(timePtrSetExpr(*patch.LastObservedAt)))
|
||||
}
|
||||
|
||||
stmt := r.UPDATE().
|
||||
SET(r.UpdatedAt.SET(postgres.TimestampzT(now)), rest...).
|
||||
WHERE(r.GameID.EQ(postgres.UUID(gameID))).
|
||||
RETURNING(runtimeRecordColumns())
|
||||
|
||||
var row model.RuntimeRecords
|
||||
if err := stmt.QueryContext(ctx, s.db, &row); err != nil {
|
||||
if errors.Is(err, qrm.ErrNoRows) {
|
||||
return RuntimeRecord{}, ErrNotFound
|
||||
}
|
||||
return RuntimeRecord{}, fmt.Errorf("runtime store: update runtime_record %s: %w", gameID, err)
|
||||
}
|
||||
return modelToRuntimeRecord(row), nil
|
||||
}
|
||||
|
||||
// DeleteRuntimeRecord removes the row at gameID. Idempotent: nil when
|
||||
// no row matched.
|
||||
func (s *Store) DeleteRuntimeRecord(ctx context.Context, gameID uuid.UUID) error {
|
||||
stmt := table.RuntimeRecords.DELETE().
|
||||
WHERE(table.RuntimeRecords.GameID.EQ(postgres.UUID(gameID)))
|
||||
if _, err := stmt.ExecContext(ctx, s.db); err != nil {
|
||||
return fmt.Errorf("runtime store: delete runtime_record %s: %w", gameID, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// =====================================================================
|
||||
// Player mappings
|
||||
// =====================================================================
|
||||
|
||||
// InsertPlayerMappings persists a slice of mappings in a single
|
||||
// transaction. Existing rows for the (game_id, user_id) pair are
|
||||
// replaced (ON CONFLICT) so re-runs of StartGame after a transient
|
||||
// failure stay idempotent.
|
||||
func (s *Store) InsertPlayerMappings(ctx context.Context, mappings []PlayerMapping) error {
|
||||
if len(mappings) == 0 {
|
||||
return nil
|
||||
}
|
||||
tx, err := s.db.BeginTx(ctx, nil)
|
||||
if err != nil {
|
||||
return fmt.Errorf("runtime store: begin player_mappings tx: %w", err)
|
||||
}
|
||||
defer func() { _ = tx.Rollback() }()
|
||||
pm := table.PlayerMappings
|
||||
for _, m := range mappings {
|
||||
stmt := pm.INSERT(pm.GameID, pm.UserID, pm.RaceName, pm.EnginePlayerUUID).
|
||||
VALUES(m.GameID, m.UserID, m.RaceName, m.EnginePlayerUUID).
|
||||
ON_CONFLICT(pm.GameID, pm.UserID).
|
||||
DO_UPDATE(postgres.SET(
|
||||
pm.RaceName.SET(pm.EXCLUDED.RaceName),
|
||||
pm.EnginePlayerUUID.SET(pm.EXCLUDED.EnginePlayerUUID),
|
||||
))
|
||||
if _, err := stmt.ExecContext(ctx, tx); err != nil {
|
||||
if isUniqueViolation(err, playerMappingsRaceUnique) {
|
||||
return fmt.Errorf("%w: race name %q duplicated within game", ErrConflict, m.RaceName)
|
||||
}
|
||||
return fmt.Errorf("runtime store: insert player_mapping %s/%s: %w", m.GameID, m.UserID, err)
|
||||
}
|
||||
}
|
||||
if err := tx.Commit(); err != nil {
|
||||
return fmt.Errorf("runtime store: commit player_mappings: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// LoadPlayerMapping returns the mapping for (gameID, userID). Returns
|
||||
// ErrNotFound when no row matches.
|
||||
func (s *Store) LoadPlayerMapping(ctx context.Context, gameID, userID uuid.UUID) (PlayerMapping, error) {
|
||||
pm := table.PlayerMappings
|
||||
stmt := postgres.SELECT(pm.GameID, pm.UserID, pm.RaceName, pm.EnginePlayerUUID, pm.CreatedAt).
|
||||
FROM(pm).
|
||||
WHERE(
|
||||
pm.GameID.EQ(postgres.UUID(gameID)).
|
||||
AND(pm.UserID.EQ(postgres.UUID(userID))),
|
||||
).
|
||||
LIMIT(1)
|
||||
var row model.PlayerMappings
|
||||
if err := stmt.QueryContext(ctx, s.db, &row); err != nil {
|
||||
if errors.Is(err, qrm.ErrNoRows) {
|
||||
return PlayerMapping{}, ErrNotFound
|
||||
}
|
||||
return PlayerMapping{}, fmt.Errorf("runtime store: load player_mapping: %w", err)
|
||||
}
|
||||
return modelToPlayerMapping(row), nil
|
||||
}
|
||||
|
||||
// ListPlayerMappingsForGame returns every mapping for gameID.
|
||||
func (s *Store) ListPlayerMappingsForGame(ctx context.Context, gameID uuid.UUID) ([]PlayerMapping, error) {
|
||||
pm := table.PlayerMappings
|
||||
stmt := postgres.SELECT(pm.GameID, pm.UserID, pm.RaceName, pm.EnginePlayerUUID, pm.CreatedAt).
|
||||
FROM(pm).
|
||||
WHERE(pm.GameID.EQ(postgres.UUID(gameID))).
|
||||
ORDER_BY(pm.RaceName.ASC())
|
||||
var rows []model.PlayerMappings
|
||||
if err := stmt.QueryContext(ctx, s.db, &rows); err != nil {
|
||||
return nil, fmt.Errorf("runtime store: list player_mappings: %w", err)
|
||||
}
|
||||
out := make([]PlayerMapping, 0, len(rows))
|
||||
for _, row := range rows {
|
||||
out = append(out, modelToPlayerMapping(row))
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// DeletePlayerMappingsForGame removes every mapping for gameID. Used
|
||||
// on stop / cancel / reconciler-removal so a future StartGame can
|
||||
// repopulate the projection without violating the per-game UNIQUE.
|
||||
func (s *Store) DeletePlayerMappingsForGame(ctx context.Context, gameID uuid.UUID) error {
|
||||
stmt := table.PlayerMappings.DELETE().
|
||||
WHERE(table.PlayerMappings.GameID.EQ(postgres.UUID(gameID)))
|
||||
if _, err := stmt.ExecContext(ctx, s.db); err != nil {
|
||||
return fmt.Errorf("runtime store: delete player_mappings %s: %w", gameID, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// =====================================================================
|
||||
// Operation log
|
||||
// =====================================================================
|
||||
|
||||
// operationLogInsert carries the parameters for InsertOperationLog.
|
||||
type operationLogInsert struct {
|
||||
OperationID uuid.UUID
|
||||
GameID uuid.UUID
|
||||
Op string
|
||||
Source string
|
||||
Status string
|
||||
ImageRef string
|
||||
ContainerID string
|
||||
StartedAt time.Time
|
||||
}
|
||||
|
||||
// InsertOperationLog persists a queued / running operation row.
|
||||
func (s *Store) InsertOperationLog(ctx context.Context, in operationLogInsert) (OperationLog, error) {
|
||||
o := table.RuntimeOperationLog
|
||||
stmt := o.INSERT(
|
||||
o.OperationID, o.GameID, o.Op, o.Source, o.Status, o.ImageRef,
|
||||
o.ContainerID, o.StartedAt,
|
||||
).VALUES(
|
||||
in.OperationID, in.GameID, in.Op, in.Source, in.Status, in.ImageRef,
|
||||
in.ContainerID, in.StartedAt,
|
||||
).RETURNING(operationLogColumns())
|
||||
var row model.RuntimeOperationLog
|
||||
if err := stmt.QueryContext(ctx, s.db, &row); err != nil {
|
||||
return OperationLog{}, err
|
||||
}
|
||||
return modelToOperationLog(row), nil
|
||||
}
|
||||
|
||||

// CompleteOperationLog updates the status / error fields on
// operationID. Returns the refreshed row.
func (s *Store) CompleteOperationLog(ctx context.Context, operationID uuid.UUID, status, errCode, errMsg string, finishedAt time.Time) (OperationLog, error) {
	o := table.RuntimeOperationLog
	stmt := o.UPDATE().
		SET(
			o.Status.SET(postgres.String(status)),
			o.ErrorCode.SET(postgres.String(errCode)),
			o.ErrorMessage.SET(postgres.String(errMsg)),
			o.FinishedAt.SET(postgres.TimestampzT(finishedAt)),
		).
		WHERE(o.OperationID.EQ(postgres.UUID(operationID))).
		RETURNING(operationLogColumns())
	var row model.RuntimeOperationLog
	if err := stmt.QueryContext(ctx, s.db, &row); err != nil {
		if errors.Is(err, qrm.ErrNoRows) {
			return OperationLog{}, ErrNotFound
		}
		return OperationLog{}, fmt.Errorf("runtime store: complete operation_log %s: %w", operationID, err)
	}
	return modelToOperationLog(row), nil
}
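
// Illustrative sketch (not part of this change): the intended
// insert-then-complete lifecycle as a worker might drive it. The
// operation/source/status literals below are placeholders; the real
// vocabulary is defined elsewhere in the package.
func exampleOperationLogLifecycle(ctx context.Context, s *Store, gameID uuid.UUID) error {
	op, err := s.InsertOperationLog(ctx, operationLogInsert{
		OperationID: uuid.New(),
		GameID:      gameID,
		Op:          "start",   // assumed operation label
		Source:      "admin",   // assumed source label
		Status:      "running", // assumed non-terminal status
		StartedAt:   time.Now().UTC(),
	})
	if err != nil {
		return err
	}
	// ... perform the long-running work, then record the outcome.
	_, err = s.CompleteOperationLog(ctx, op.OperationID, "succeeded", "", "", time.Now().UTC())
	return err
}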

// =====================================================================
// Health snapshots
// =====================================================================

// InsertHealthSnapshot persists a JSON-encoded engine status snapshot.
func (s *Store) InsertHealthSnapshot(ctx context.Context, snapshotID, gameID uuid.UUID, observedAt time.Time, payload []byte) error {
	hs := table.RuntimeHealthSnapshots
	stmt := hs.INSERT(hs.SnapshotID, hs.GameID, hs.ObservedAt, hs.Payload).
		VALUES(snapshotID, gameID, observedAt, string(payload))
	if _, err := stmt.ExecContext(ctx, s.db); err != nil {
		return fmt.Errorf("runtime store: insert health_snapshot %s: %w", gameID, err)
	}
	return nil
}
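
// Illustrative sketch (not part of this change): the snapshot payload is
// stored as an opaque JSON document, so the caller encodes it (e.g. with
// encoding/json) before handing the bytes to the store.
func exampleInsertHealthSnapshot(ctx context.Context, s *Store, gameID uuid.UUID) error {
	payload := []byte(`{"reachable":true,"turn":3}`) // example engine status document
	return s.InsertHealthSnapshot(ctx, uuid.New(), gameID, time.Now().UTC(), payload)
}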

// =====================================================================
// Read-only lobby projection (per the implementation doc, D2)
// =====================================================================

// LoadGameProjection reads `backend.games` for runtime's start/stop
// flow. Lobby remains the only writer of the table; runtime is a
// read-only consumer. Returns ErrNotFound on miss.
func (s *Store) LoadGameProjection(ctx context.Context, gameID uuid.UUID) (Game, error) {
	g := table.Games
	stmt := postgres.SELECT(
		g.GameID, g.OwnerUserID, g.Visibility, g.Status, g.GameName,
		g.TurnSchedule, g.TargetEngineVersion,
		g.MinPlayers, g.MaxPlayers, g.StartGapHours, g.StartGapPlayers,
	).
		FROM(g).
		WHERE(g.GameID.EQ(postgres.UUID(gameID))).
		LIMIT(1)
	var row model.Games
	if err := stmt.QueryContext(ctx, s.db, &row); err != nil {
		if errors.Is(err, qrm.ErrNoRows) {
			return Game{}, ErrNotFound
		}
		return Game{}, fmt.Errorf("runtime store: load game %s: %w", gameID, err)
	}
	out := Game{
		GameID:              row.GameID,
		Visibility:          row.Visibility,
		Status:              row.Status,
		GameName:            row.GameName,
		TurnSchedule:        row.TurnSchedule,
		TargetEngineVersion: row.TargetEngineVersion,
		MinPlayers:          row.MinPlayers,
		MaxPlayers:          row.MaxPlayers,
		StartGapHours:       row.StartGapHours,
		StartGapPlayers:     row.StartGapPlayers,
	}
	if row.OwnerUserID != nil {
		owner := *row.OwnerUserID
		out.OwnerUserID = &owner
	}
	return out, nil
}

// ListActiveMemberships reads active rows from `backend.memberships`
// for gameID.
func (s *Store) ListActiveMemberships(ctx context.Context, gameID uuid.UUID) ([]MembershipRow, error) {
	m := table.Memberships
	stmt := postgres.SELECT(m.MembershipID, m.GameID, m.UserID, m.RaceName).
		FROM(m).
		WHERE(
			m.GameID.EQ(postgres.UUID(gameID)).
				AND(m.Status.EQ(postgres.String("active"))),
		).
		ORDER_BY(m.JoinedAt.ASC())
	var rows []model.Memberships
	if err := stmt.QueryContext(ctx, s.db, &rows); err != nil {
		return nil, fmt.Errorf("runtime store: list memberships %s: %w", gameID, err)
	}
	out := make([]MembershipRow, 0, len(rows))
	for _, row := range rows {
		out = append(out, MembershipRow{
			MembershipID: row.MembershipID,
			GameID:       row.GameID,
			UserID:       row.UserID,
			RaceName:     row.RaceName,
		})
	}
	return out, nil
}
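
// Illustrative sketch (not part of this change): how a start flow might
// combine the two read-only projections. The MinPlayers guard is an
// assumption about how a caller would use the data, not behaviour of the
// store itself.
func exampleLoadStartInputs(ctx context.Context, s *Store, gameID uuid.UUID) (Game, []MembershipRow, error) {
	game, err := s.LoadGameProjection(ctx, gameID)
	if err != nil {
		return Game{}, nil, err // ErrNotFound when lobby has no such game
	}
	members, err := s.ListActiveMemberships(ctx, gameID)
	if err != nil {
		return Game{}, nil, err
	}
	if int32(len(members)) < game.MinPlayers {
		return Game{}, nil, fmt.Errorf("game %s has %d active members, needs %d", gameID, len(members), game.MinPlayers)
	}
	return game, members, nil
}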

// =====================================================================
// Model → domain converters
// =====================================================================

func modelToEngineVersion(row model.EngineVersions) EngineVersion {
	return EngineVersion{
		Version:   row.Version,
		ImageRef:  row.ImageRef,
		Enabled:   row.Enabled,
		CreatedAt: row.CreatedAt,
		UpdatedAt: row.UpdatedAt,
	}
}

func modelToRuntimeRecord(row model.RuntimeRecords) RuntimeRecord {
	rec := RuntimeRecord{
		GameID:               row.GameID,
		Status:               row.Status,
		EngineEndpoint:       row.EngineEndpoint,
		TurnSchedule:         row.TurnSchedule,
		CurrentTurn:          row.CurrentTurn,
		SkipNextTick:         row.SkipNextTick,
		Paused:               row.Paused,
		EngineHealth:         row.EngineHealth,
		CreatedAt:            row.CreatedAt,
		UpdatedAt:            row.UpdatedAt,
		CurrentContainerID:   derefString(row.CurrentContainerID),
		CurrentImageRef:      derefString(row.CurrentImageRef),
		CurrentEngineVersion: derefString(row.CurrentEngineVersion),
		StatePath:            derefString(row.StatePath),
		DockerNetwork:        derefString(row.DockerNetwork),
	}
	rec.NextGenerationAt = copyTimePtr(row.NextGenerationAt)
	rec.PausedAt = copyTimePtr(row.PausedAt)
	rec.StartedAt = copyTimePtr(row.StartedAt)
	rec.StoppedAt = copyTimePtr(row.StoppedAt)
	rec.FinishedAt = copyTimePtr(row.FinishedAt)
	rec.RemovedAt = copyTimePtr(row.RemovedAt)
	rec.LastObservedAt = copyTimePtr(row.LastObservedAt)
	return rec
}

func modelToOperationLog(row model.RuntimeOperationLog) OperationLog {
	op := OperationLog{
		OperationID:  row.OperationID,
		GameID:       row.GameID,
		Op:           row.Op,
		Source:       row.Source,
		Status:       row.Status,
		ImageRef:     row.ImageRef,
		ContainerID:  row.ContainerID,
		ErrorCode:    row.ErrorCode,
		ErrorMessage: row.ErrorMessage,
		StartedAt:    row.StartedAt,
	}
	op.FinishedAt = copyTimePtr(row.FinishedAt)
	return op
}

func modelToPlayerMapping(row model.PlayerMappings) PlayerMapping {
	return PlayerMapping{
		GameID:           row.GameID,
		UserID:           row.UserID,
		RaceName:         row.RaceName,
		EnginePlayerUUID: row.EnginePlayerUUID,
		CreatedAt:        row.CreatedAt,
	}
}

// =====================================================================
// Scalar helpers
// =====================================================================

// nullableString converts a Go string to the `any` form expected by
// jet INSERT VALUES bindings: an empty string becomes nil so the
// column receives NULL.
func nullableString(s string) any {
	if s == "" {
		return nil
	}
	return s
}

// nullableTime mirrors nullableString for *time.Time.
func nullableTime(t *time.Time) any {
	if t == nil {
		return nil
	}
	return *t
}

// nullableStringSetExpr returns a typed jet expression suitable for
// UPDATE SET on a nullable text column. The empty string is mapped to
// SQL NULL, mirroring the INSERT-side semantics so a "" patch clears
// the column.
func nullableStringSetExpr(v string) postgres.StringExpression {
	if v == "" {
		return postgres.StringExp(postgres.NULL)
	}
	return postgres.String(v)
}

// timePtrSetExpr mirrors nullableStringSetExpr for *time.Time. nil
// clears the column; non-nil sets it.
func timePtrSetExpr(t *time.Time) postgres.TimestampzExpression {
	if t == nil {
		return postgres.TimestampzExp(postgres.NULL)
	}
	return postgres.TimestampzT(*t)
}

func derefString(p *string) string {
	if p == nil {
		return ""
	}
	return *p
}

func copyTimePtr(p *time.Time) *time.Time {
	if p == nil {
		return nil
	}
	t := *p
	return &t
}
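
// Illustrative sketch (not part of this change): the "" / nil → NULL
// mapping applied in an UPDATE, assuming the generated runtime_records
// table exposes columns matching the model fields. Clearing the
// container id and stop timestamp this way writes SQL NULL rather than
// an empty string or zero time.
func exampleClearStoppedColumns(ctx context.Context, s *Store, gameID uuid.UUID) error {
	rr := table.RuntimeRecords
	stmt := rr.UPDATE().
		SET(
			rr.CurrentContainerID.SET(nullableStringSetExpr("")), // "" → NULL
			rr.StoppedAt.SET(timePtrSetExpr(nil)),                // nil → NULL
		).
		WHERE(rr.GameID.EQ(postgres.UUID(gameID)))
	_, err := stmt.ExecContext(ctx, s.db)
	return err
}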
@@ -0,0 +1,122 @@
package runtime

import (
	"time"

	"github.com/google/uuid"
)

// EngineVersion mirrors a row in `backend.engine_versions`. The version
// label is the primary key and is also the value lobby stores on
// `games.target_engine_version`.
type EngineVersion struct {
	Version   string
	ImageRef  string
	Enabled   bool
	CreatedAt time.Time
	UpdatedAt time.Time
}

// RuntimeRecord mirrors a row in `backend.runtime_records`. Pointer
// fields are nullable in the schema; primitives default to zero.
type RuntimeRecord struct {
	GameID               uuid.UUID
	Status               string
	CurrentContainerID   string
	CurrentImageRef      string
	CurrentEngineVersion string
	EngineEndpoint       string
	StatePath            string
	DockerNetwork        string
	TurnSchedule         string
	CurrentTurn          int32
	NextGenerationAt     *time.Time
	SkipNextTick         bool
	Paused               bool
	PausedAt             *time.Time
	EngineHealth         string
	CreatedAt            time.Time
	UpdatedAt            time.Time
	StartedAt            *time.Time
	StoppedAt            *time.Time
	FinishedAt           *time.Time
	RemovedAt            *time.Time
	LastObservedAt       *time.Time
}

// IsTerminal reports whether the record sits in a status that the
// cache should evict.
func (r RuntimeRecord) IsTerminal() bool {
	switch r.Status {
	case RuntimeStatusFinished, RuntimeStatusRemoved, RuntimeStatusStopped:
		return true
	default:
		return false
	}
}
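
// Illustrative sketch (not part of this change): only the three terminal
// statuses above report true; any other status keeps the record in the
// warm cache.
func exampleIsTerminal() bool {
	stopped := RuntimeRecord{Status: RuntimeStatusStopped}
	return stopped.IsTerminal() // true; Finished and Removed behave the same way
}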

// PlayerMapping mirrors a row in `backend.player_mappings`. The
// composite primary key is `(game_id, user_id)`; `engine_player_uuid`
// is the engine-assigned race id used by the engine's `actor` field.
type PlayerMapping struct {
	GameID           uuid.UUID
	UserID           uuid.UUID
	RaceName         string
	EnginePlayerUUID uuid.UUID
	CreatedAt        time.Time
}

// OperationLog mirrors a row in `backend.runtime_operation_log`. Used
// by admin endpoints that surface a per-operation status envelope and
// by the worker pool for completion telemetry.
type OperationLog struct {
	OperationID  uuid.UUID
	GameID       uuid.UUID
	Op           string
	Source       string
	Status       string
	ImageRef     string
	ContainerID  string
	ErrorCode    string
	ErrorMessage string
	StartedAt    time.Time
	FinishedAt   *time.Time
}

// HealthSnapshot mirrors a row in `backend.runtime_health_snapshots`.
// The `Payload` field carries the JSON-encoded engine status response
// or a synthesised summary when the engine is unreachable.
type HealthSnapshot struct {
	SnapshotID uuid.UUID
	GameID     uuid.UUID
	ObservedAt time.Time
	Payload    []byte
}

// Game is the read-only projection of a `backend.games` row that the
// runtime needs at start time. It is the runtime's view of a lobby
// row; lobby remains the only writer.
type Game struct {
	GameID              uuid.UUID
	OwnerUserID         *uuid.UUID
	Visibility          string
	Status              string
	GameName            string
	TurnSchedule        string
	TargetEngineVersion string
	MinPlayers          int32
	MaxPlayers          int32
	StartGapHours       int32
	StartGapPlayers     int32
}

// MembershipRow is the read-only projection of an active
// `backend.memberships` row that the runtime needs at start time. It
// carries enough data to populate the engine `Init` request and the
// `player_mappings` projection.
type MembershipRow struct {
	MembershipID uuid.UUID
	GameID       uuid.UUID
	UserID       uuid.UUID
	RaceName     string
}
@@ -0,0 +1,124 @@
package runtime

import (
	"context"
	"errors"
	"sync"
	"sync/atomic"

	"go.uber.org/zap"
)

// WorkerPool drains long-running runtime jobs (start, stop, restart,
// patch). Implements `internal/app.Component` so the App lifecycle
// drives Run/Shutdown.
type WorkerPool struct {
	svc      *Service
	jobs     chan job
	stopping atomic.Bool
	wg       sync.WaitGroup
}

// NewWorkerPool builds a worker pool sized by `cfg.WorkerPoolSize`
// with a buffered channel of depth `cfg.JobQueueSize`.
func NewWorkerPool(svc *Service) *WorkerPool {
	return &WorkerPool{
		svc:  svc,
		jobs: make(chan job, svc.deps.Config.JobQueueSize),
	}
}

// submit places j on the worker channel. Returns ErrShutdown when the
// pool is stopping; when the queue is full it blocks on the caller's
// context for one more attempt and returns ctx.Err() if that context
// expires first.
func (w *WorkerPool) submit(ctx context.Context, j job) error {
	if w == nil || w.stopping.Load() {
		return ErrShutdown
	}
	select {
	case <-ctx.Done():
		return ctx.Err()
	case w.jobs <- j:
		return nil
	default:
	}
	// One last attempt with the caller's context; lets a fast worker
	// pick it up while we wait briefly.
	select {
	case <-ctx.Done():
		return ctx.Err()
	case w.jobs <- j:
		return nil
	}
}
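
// Illustrative sketch (not part of this change): a caller hands submit
// its request context and distinguishes the pool draining from the
// queue staying full until the context expired.
func exampleSubmit(ctx context.Context, w *WorkerPool, j job) error {
	if err := w.submit(ctx, j); err != nil {
		if errors.Is(err, ErrShutdown) {
			return err // pool is draining; surface as unavailable
		}
		return err // context cancelled or deadline exceeded while the queue was full
	}
	return nil
}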

// Run starts the configured number of worker goroutines and blocks
// until ctx is cancelled.
func (w *WorkerPool) Run(ctx context.Context) error {
	if w == nil {
		return nil
	}
	count := w.svc.deps.Config.WorkerPoolSize
	if count <= 0 {
		count = 1
	}
	for i := 0; i < count; i++ {
		w.wg.Add(1)
		go w.loop(ctx, i)
	}
	<-ctx.Done()
	return nil
}

// Shutdown signals the pool to stop accepting new work and waits for
// in-flight workers to drain. The provided context bounds the wait;
// any worker still running when ctx expires is left to finish on its
// own and the pool returns.
func (w *WorkerPool) Shutdown(ctx context.Context) error {
	if w == nil {
		return nil
	}
	if !w.stopping.CompareAndSwap(false, true) {
		return nil
	}
	close(w.jobs)
	done := make(chan struct{})
	go func() {
		w.wg.Wait()
		close(done)
	}()
	select {
	case <-done:
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}
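
// Illustrative sketch (not part of this change): how a host could drive
// the pool if it were not wired through the app lifecycle. Run blocks
// until the run context is cancelled; Shutdown then bounds the drain
// with a second context.
func exampleRunAndShutdown(runCtx, drainCtx context.Context, w *WorkerPool) error {
	runErr := make(chan error, 1)
	go func() { runErr <- w.Run(runCtx) }()
	<-runCtx.Done() // external stop signal
	if err := w.Shutdown(drainCtx); err != nil {
		return err // drain deadline expired; remaining workers finish on their own
	}
	return <-runErr
}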

func (w *WorkerPool) loop(ctx context.Context, idx int) {
	defer w.wg.Done()
	logger := w.svc.deps.Logger.With(zap.Int("worker", idx))
	for {
		select {
		case <-ctx.Done():
			return
		case j, ok := <-w.jobs:
			if !ok {
				return
			}
			logger.Debug("runtime job picked",
				zap.String("game_id", j.GameID().String()),
				zap.String("op", j.Operation().Op),
			)
			if err := j.Run(ctx, w.svc); err != nil {
				if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
					return
				}
				logger.Warn("runtime job failed",
					zap.String("game_id", j.GameID().String()),
					zap.String("op", j.Operation().Op),
					zap.Error(err),
				)
			}
		}
	}
}