feat: runtime manager

Ilia Denisov
2026-04-28 20:39:18 +02:00
committed by GitHub
parent e0a99b346b
commit a7cee15115
289 changed files with 45660 additions and 2207 deletions
@@ -0,0 +1,236 @@
package harness
import (
"context"
"crypto/rand"
"encoding/hex"
"errors"
"fmt"
"os"
"os/exec"
"path/filepath"
"runtime"
"strings"
"sync"
"testing"
"time"
cerrdefs "github.com/containerd/errdefs"
"github.com/docker/docker/api/types/network"
dockerclient "github.com/docker/docker/client"
)
// Engine image tags used by the integration suite. `EngineImageRef` is
// the image we actually build from `galaxy/game/Dockerfile`;
// `PatchedEngineImageRef` is the same image content tagged at a higher
// semver patch so the patch lifecycle test exercises the
// `semver_patch_only` validation against a real image. Keeping both at
// the same digest avoids a redundant build.
const (
EngineImageRef = "galaxy/game:1.0.0-rtm-it"
PatchedEngineImageRef = "galaxy/game:1.0.1-rtm-it"
dockerNetworkPrefix = "rtmanager-it-"
dockerPingTimeout = 5 * time.Second
dockerNetworkTimeout = 30 * time.Second
imageBuildTimeout = 10 * time.Minute
)
// DockerEnv carries the per-package Docker client plus the workspace
// root used by image builds. The client is opened lazily on the first
// EnsureDocker call and closed by ShutdownDocker at TestMain exit.
type DockerEnv struct {
client *dockerclient.Client
workspaceRoot string
}
// Client returns the harness-owned Docker SDK client. Tests use it
// directly for "external actions" the harness does not wrap (e.g.,
// removing a running container behind RTM's back in `health_test`).
func (env *DockerEnv) Client() *dockerclient.Client { return env.client }
// WorkspaceRoot returns the absolute path of the galaxy/ workspace
// root. It is exported so the runtime helper can resolve the host
// game-state root relative to it if a test needs a deterministic
// location, though the default places state under `t.ArtifactDir()`.
func (env *DockerEnv) WorkspaceRoot() string { return env.workspaceRoot }
var (
dockerOnce sync.Once
dockerEnv *DockerEnv
dockerErr error
imageOnce sync.Once
imageErr error
)
// EnsureDocker opens the shared Docker SDK client and verifies the
// daemon is reachable. When the daemon is unavailable the helper calls
// `t.Skip` so suites stay green on hosts without `/var/run/docker.sock`
// or `DOCKER_HOST`.
func EnsureDocker(t testing.TB) *DockerEnv {
t.Helper()
dockerOnce.Do(func() {
dockerEnv, dockerErr = openDocker()
})
if dockerErr != nil {
t.Skipf("rtmanager integration: docker daemon unavailable: %v", dockerErr)
}
return dockerEnv
}
// EnsureEngineImage builds the `galaxy/game` engine image from the
// workspace root once per package run via `sync.Once`, then tags the
// resulting image at both `EngineImageRef` and `PatchedEngineImageRef`
// so the patch lifecycle has a second semver-valid tag to point at.
// Subsequent calls re-use the cached image. Any test that asks for the
// engine image must invoke this helper first; it is intentionally
// separate from `EnsureDocker` so suites that only need the daemon
// (e.g., a future "Docker network missing" negative test) do not pay
// the build cost.
func EnsureEngineImage(t testing.TB) string {
t.Helper()
env := EnsureDocker(t)
imageOnce.Do(func() {
imageErr = buildAndTagEngineImage(env)
})
if imageErr != nil {
t.Skipf("rtmanager integration: build galaxy/game image: %v", imageErr)
}
return EngineImageRef
}
// EnsureNetwork creates a uniquely-named Docker bridge network for the
// caller's test and registers cleanup. Each test gets its own network
// so concurrent scenarios cannot collide on the per-game DNS hostname.
func EnsureNetwork(t testing.TB) string {
t.Helper()
env := EnsureDocker(t)
name := dockerNetworkPrefix + uniqueSuffix(t)
createCtx, cancel := context.WithTimeout(context.Background(), dockerNetworkTimeout)
defer cancel()
if _, err := env.client.NetworkCreate(createCtx, name, network.CreateOptions{Driver: "bridge"}); err != nil {
t.Fatalf("rtmanager integration: create docker network %q: %v", name, err)
}
t.Cleanup(func() {
removeCtx, removeCancel := context.WithTimeout(context.Background(), dockerNetworkTimeout)
defer removeCancel()
if err := env.client.NetworkRemove(removeCtx, name); err != nil && !cerrdefs.IsNotFound(err) {
t.Logf("rtmanager integration: remove docker network %q: %v", name, err)
}
})
return name
}
// ShutdownDocker closes the shared Docker SDK client. `TestMain`
// invokes it after `m.Run`. The harness deliberately leaves the engine
// image in the local Docker cache so the next package run benefits
// from the layer cache; operators can `docker image rm` the
// `*-rtm-it` tags by hand if a stale image gets in the way.
func ShutdownDocker() {
if dockerEnv == nil {
return
}
if dockerEnv.client != nil {
_ = dockerEnv.client.Close()
}
dockerEnv = nil
}
// uniqueSuffix returns 8 hex characters of randomness suitable for a
// per-test resource name. The same helper is used in
// `internal/adapters/docker/smoke_test.go`; we duplicate it instead of
// importing because `_test.go`-only helpers cannot be exported.
func uniqueSuffix(t testing.TB) string {
t.Helper()
buf := make([]byte, 4)
if _, err := rand.Read(buf); err != nil {
t.Fatalf("rtmanager integration: read random suffix: %v", err)
}
return hex.EncodeToString(buf)
}
func openDocker() (*DockerEnv, error) {
if os.Getenv("DOCKER_HOST") == "" {
if _, err := os.Stat("/var/run/docker.sock"); err != nil {
return nil, fmt.Errorf("set DOCKER_HOST or expose /var/run/docker.sock: %w", err)
}
}
client, err := dockerclient.NewClientWithOpts(
dockerclient.FromEnv,
dockerclient.WithAPIVersionNegotiation(),
)
if err != nil {
return nil, fmt.Errorf("new docker client: %w", err)
}
pingCtx, cancel := context.WithTimeout(context.Background(), dockerPingTimeout)
defer cancel()
if _, err := client.Ping(pingCtx); err != nil {
_ = client.Close()
return nil, fmt.Errorf("ping docker daemon: %w", err)
}
root, err := workspaceRoot()
if err != nil {
_ = client.Close()
return nil, fmt.Errorf("resolve workspace root: %w", err)
}
return &DockerEnv{
client: client,
workspaceRoot: root,
}, nil
}
// buildAndTagEngineImage invokes `docker build` against the workspace
// root context to materialise the `galaxy/game` image, then tags the
// resulting image at the patch tag. Shelling out to the CLI keeps the
// implementation tiny — using the SDK would require streaming a tar
// of the workspace root, which is heavy and duplicates what the CLI
// already optimises. The workspace-root build context is required by
// `galaxy/game` (see `galaxy/game/README.md` §Build).
func buildAndTagEngineImage(env *DockerEnv) error {
if env == nil {
return errors.New("nil docker env")
}
ctx, cancel := context.WithTimeout(context.Background(), imageBuildTimeout)
defer cancel()
dockerfilePath := filepath.Join("game", "Dockerfile")
cmd := exec.CommandContext(ctx, "docker", "build",
"-f", dockerfilePath,
"-t", EngineImageRef,
".",
)
cmd.Dir = env.workspaceRoot
cmd.Env = append(os.Environ(), "DOCKER_BUILDKIT=1")
output, err := cmd.CombinedOutput()
if err != nil {
return fmt.Errorf("docker build (-f %s) in %s: %w; output:\n%s",
dockerfilePath, env.workspaceRoot, err, strings.TrimSpace(string(output)))
}
if err := env.client.ImageTag(ctx, EngineImageRef, PatchedEngineImageRef); err != nil {
return fmt.Errorf("tag %s as %s: %w", EngineImageRef, PatchedEngineImageRef, err)
}
return nil
}
// workspaceRoot resolves the absolute path of the galaxy/ workspace
// root by anchoring on this file's location. The harness lives at
// `galaxy/rtmanager/integration/harness/docker.go`, so the workspace
// root is three directories up. Mirrors the `cmd/jetgen` strategy.
func workspaceRoot() (string, error) {
_, file, _, ok := runtime.Caller(0)
if !ok {
return "", errors.New("resolve runtime caller for workspace root")
}
dir := filepath.Dir(file)
// dir = .../galaxy/rtmanager/integration/harness
root := filepath.Clean(filepath.Join(dir, "..", "..", ".."))
return root, nil
}
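A minimal sketch of the "external action" pattern `Client` exists for, assuming the usual `integration_test` wiring; the test name and container id are illustrative, and a real scenario reads the id from the runtime record before removing the container behind RTM's back:
package integration_test

import (
	"context"
	"testing"
	"time"

	"galaxy/rtmanager/integration/harness"
	"github.com/docker/docker/api/types/container"
	"github.com/stretchr/testify/require"
)

func TestExternalRemovalSketch(t *testing.T) {
	env := harness.EnsureDocker(t) // skips when no daemon is reachable
	_ = harness.EnsureNetwork(t)   // per-test bridge network, removed via t.Cleanup
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	// Force-remove the engine container behind RTM's back so the
	// reconcile/health workers observe the loss on their next tick.
	err := env.Client().ContainerRemove(ctx, "illustrative-container-id",
		container.RemoveOptions{Force: true})
	require.NoError(t, err)
}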
@@ -0,0 +1,59 @@
package harness
import (
"encoding/json"
"net/http"
"net/http/httptest"
"strings"
"testing"
)
// LobbyStub answers the single Lobby internal request the start
// service performs ([`internal/adapters/lobbyclient`]). The start
// service treats this response as ancillary diagnostics — the start
// envelope already carries `image_ref` — so the stub returns a
// deterministic 200 OK and lets the runtime ignore the payload.
//
// The stub only validates that the runtime configuration treats the
// Lobby URL as required (so the ancillary fetch cannot silently
// regress into a no-op); the response body itself is unused by the
// integration assertions.
type LobbyStub struct {
Server *httptest.Server
}
// NewLobbyStub returns a started httptest.Server and registers
// `t.Cleanup(server.Close)` so the stub follows the same lifecycle as
// the rest of the per-test wiring.
func NewLobbyStub(t testing.TB) *LobbyStub {
t.Helper()
mux := http.NewServeMux()
mux.HandleFunc("GET /api/v1/internal/games/{game_id}", func(w http.ResponseWriter, r *http.Request) {
gameID := strings.TrimSpace(r.PathValue("game_id"))
if gameID == "" {
writeStubError(w, http.StatusBadRequest, "invalid_request", "game_id is required")
return
}
w.Header().Set("Content-Type", "application/json; charset=utf-8")
w.WriteHeader(http.StatusOK)
_ = json.NewEncoder(w).Encode(map[string]string{
"game_id": gameID,
"status": "running",
"target_engine_version": "1.0.0",
})
})
server := httptest.NewServer(mux)
t.Cleanup(server.Close)
return &LobbyStub{Server: server}
}
// URL returns the base URL of the running stub.
func (stub *LobbyStub) URL() string { return stub.Server.URL }
func writeStubError(w http.ResponseWriter, status int, code, message string) {
w.Header().Set("Content-Type", "application/json; charset=utf-8")
w.WriteHeader(status)
_ = json.NewEncoder(w).Encode(map[string]any{
"error": map[string]string{"code": code, "message": message},
})
}
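For reference, a successful lookup against the stub returns a body shaped like this (game id illustrative):
{"game_id": "g-42", "status": "running", "target_engine_version": "1.0.0"}
and a blank game_id yields writeStubError's envelope with status 400:
{"error": {"code": "invalid_request", "message": "game_id is required"}}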
@@ -0,0 +1,224 @@
// Package harness exposes the testcontainers / Docker / image-build
// scaffolding shared by the Runtime Manager service-local integration
// suite under [`galaxy/rtmanager/integration`](..).
//
// Only `_test.go` files (and the harness itself) reference this
// package; production code paths in `cmd/rtmanager` never import it.
// The package therefore stays out of the production binary's import
// graph, identical to the in-package `pgtest` and `integration/internal/harness`
// patterns it mirrors.
package harness
import (
"context"
"database/sql"
"net/url"
"os"
"sync"
"testing"
"time"
"galaxy/postgres"
"galaxy/rtmanager/internal/adapters/postgres/migrations"
testcontainers "github.com/testcontainers/testcontainers-go"
tcpostgres "github.com/testcontainers/testcontainers-go/modules/postgres"
"github.com/testcontainers/testcontainers-go/wait"
)
const (
pgImage = "postgres:16-alpine"
pgSuperUser = "galaxy"
pgSuperPassword = "galaxy"
pgSuperDatabase = "galaxy_rtmanager_it"
pgServiceRole = "rtmanagerservice"
pgServicePassword = "rtmanagerservice"
pgServiceSchema = "rtmanager"
pgStartupTimeout = 90 * time.Second
// pgOperationTimeout bounds the per-statement deadline used by every
// pool the harness opens. Short enough to surface a runaway
// integration test promptly, long enough to absorb laptop-grade I/O.
pgOperationTimeout = 10 * time.Second
)
// PostgresEnv carries the per-package PostgreSQL fixture. The container
// is started lazily on the first EnsurePostgres call and torn down by
// ShutdownPostgres at TestMain exit.
type PostgresEnv struct {
container *tcpostgres.PostgresContainer
pool *sql.DB
scopedDSN string
}
// Pool returns the harness-owned `*sql.DB` scoped to the rtmanager
// schema. Tests use it to read durable state directly through the
// existing store adapters.
func (env *PostgresEnv) Pool() *sql.DB { return env.pool }
// DSN returns the rtmanager-role-scoped DSN suitable for
// `RTMANAGER_POSTGRES_PRIMARY_DSN`. Both this DSN and Pool address the
// same database; the pool is reused across tests, while the runtime
// under test opens its own pool through this DSN.
func (env *PostgresEnv) DSN() string { return env.scopedDSN }
var (
pgOnce sync.Once
pgEnv *PostgresEnv
pgErr error
)
// EnsurePostgres starts the per-package PostgreSQL container on first
// invocation and applies the embedded goose migrations. Subsequent
// invocations reuse the same container. When Docker is unavailable the
// helper calls `t.Skip` so the suite stays green on hosts without a
// daemon (mirrors the contract from `internal/adapters/postgres/internal/pgtest`).
func EnsurePostgres(t testing.TB) *PostgresEnv {
t.Helper()
pgOnce.Do(func() {
pgEnv, pgErr = startPostgres()
})
if pgErr != nil {
t.Skipf("rtmanager integration: postgres container start failed (Docker unavailable?): %v", pgErr)
}
return pgEnv
}
// TruncatePostgres wipes every Runtime Manager table inside the shared
// pool, leaving the schema and indexes intact. Tests call this from
// their setup so each scenario starts on an empty state.
func TruncatePostgres(t testing.TB) {
t.Helper()
env := EnsurePostgres(t)
const stmt = `TRUNCATE TABLE runtime_records, operation_log, health_snapshots RESTART IDENTITY CASCADE`
if _, err := env.pool.ExecContext(context.Background(), stmt); err != nil {
t.Fatalf("truncate rtmanager tables: %v", err)
}
}
// ShutdownPostgres terminates the shared container and closes the pool.
// `TestMain` invokes it after `m.Run` so the container is released even
// if individual tests panic.
func ShutdownPostgres() {
if pgEnv == nil {
return
}
if pgEnv.pool != nil {
_ = pgEnv.pool.Close()
}
if pgEnv.container != nil {
_ = testcontainers.TerminateContainer(pgEnv.container)
}
pgEnv = nil
}
// RunMain is a convenience helper for the integration package
// `TestMain`: it runs the suite, captures the exit code, tears every
// shared container down, and exits. Wiring it through one helper keeps
// `TestMain` to two lines and centralises ordering.
func RunMain(m *testing.M) {
code := m.Run()
ShutdownRedis()
ShutdownPostgres()
ShutdownDocker()
os.Exit(code)
}
func startPostgres() (*PostgresEnv, error) {
ctx := context.Background()
container, err := tcpostgres.Run(ctx, pgImage,
tcpostgres.WithDatabase(pgSuperDatabase),
tcpostgres.WithUsername(pgSuperUser),
tcpostgres.WithPassword(pgSuperPassword),
testcontainers.WithWaitStrategy(
wait.ForLog("database system is ready to accept connections").
WithOccurrence(2).
WithStartupTimeout(pgStartupTimeout),
),
)
if err != nil {
return nil, err
}
baseDSN, err := container.ConnectionString(ctx, "sslmode=disable")
if err != nil {
_ = testcontainers.TerminateContainer(container)
return nil, err
}
if err := provisionRoleAndSchema(ctx, baseDSN); err != nil {
_ = testcontainers.TerminateContainer(container)
return nil, err
}
scopedDSN, err := scopedDSNForRole(baseDSN)
if err != nil {
_ = testcontainers.TerminateContainer(container)
return nil, err
}
cfg := postgres.DefaultConfig()
cfg.PrimaryDSN = scopedDSN
cfg.OperationTimeout = pgOperationTimeout
pool, err := postgres.OpenPrimary(ctx, cfg)
if err != nil {
_ = testcontainers.TerminateContainer(container)
return nil, err
}
if err := postgres.Ping(ctx, pool, pgOperationTimeout); err != nil {
_ = pool.Close()
_ = testcontainers.TerminateContainer(container)
return nil, err
}
if err := postgres.RunMigrations(ctx, pool, migrations.FS(), "."); err != nil {
_ = pool.Close()
_ = testcontainers.TerminateContainer(container)
return nil, err
}
return &PostgresEnv{
container: container,
pool: pool,
scopedDSN: scopedDSN,
}, nil
}
func provisionRoleAndSchema(ctx context.Context, baseDSN string) error {
cfg := postgres.DefaultConfig()
cfg.PrimaryDSN = baseDSN
cfg.OperationTimeout = pgOperationTimeout
db, err := postgres.OpenPrimary(ctx, cfg)
if err != nil {
return err
}
defer func() { _ = db.Close() }()
statements := []string{
`DO $$ BEGIN
IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'rtmanagerservice') THEN
CREATE ROLE rtmanagerservice LOGIN PASSWORD 'rtmanagerservice';
END IF;
END $$;`,
`CREATE SCHEMA IF NOT EXISTS rtmanager AUTHORIZATION rtmanagerservice;`,
`GRANT USAGE ON SCHEMA rtmanager TO rtmanagerservice;`,
}
for _, statement := range statements {
if _, err := db.ExecContext(ctx, statement); err != nil {
return err
}
}
return nil
}
func scopedDSNForRole(baseDSN string) (string, error) {
parsed, err := url.Parse(baseDSN)
if err != nil {
return "", err
}
values := url.Values{}
values.Set("search_path", pgServiceSchema)
values.Set("sslmode", "disable")
scoped := url.URL{
Scheme: parsed.Scheme,
User: url.UserPassword(pgServiceRole, pgServicePassword),
Host: parsed.Host,
Path: parsed.Path,
RawQuery: values.Encode(),
}
return scoped.String(), nil
}
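For reference, `scopedDSNForRole` turns the container's superuser DSN into a role-scoped DSN shaped roughly like this (host and port illustrative; query keys come out sorted because `url.Values.Encode` sorts them):
postgres://rtmanagerservice:rtmanagerservice@127.0.0.1:55432/galaxy_rtmanager_it?search_path=rtmanager&sslmode=disable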
@@ -0,0 +1,102 @@
package harness
import (
"context"
"sync"
"testing"
"github.com/redis/go-redis/v9"
testcontainers "github.com/testcontainers/testcontainers-go"
rediscontainer "github.com/testcontainers/testcontainers-go/modules/redis"
)
const redisImage = "redis:7"
// RedisEnv carries the per-package Redis fixture. The container is
// started lazily on the first EnsureRedis call and torn down by
// ShutdownRedis at TestMain exit. Both stream consumers and the
// per-game lease store hit this real Redis (miniredis would suffice
// for streams alone, but the lease semantics and eviction-by-TTL we
// rely on in `health_test` are easier to verify against a real
// daemon).
type RedisEnv struct {
container *rediscontainer.RedisContainer
addr string
}
// Addr returns the externally reachable host:port of the Redis
// container. Both the runtime under test and the harness-owned client
// connect through the same endpoint.
func (env *RedisEnv) Addr() string { return env.addr }
// NewClient opens a fresh `*redis.Client` against the harness Redis.
// Tests close their client through `t.Cleanup`; the harness keeps no
// shared client to avoid cross-test connection-pool surprises.
func (env *RedisEnv) NewClient(t testing.TB) *redis.Client {
t.Helper()
client := redis.NewClient(&redis.Options{Addr: env.addr})
t.Cleanup(func() { _ = client.Close() })
return client
}
var (
redisOnce sync.Once
redisEnv *RedisEnv
redisErr error
)
// EnsureRedis starts the per-package Redis container on first
// invocation and returns it. When Docker is unavailable the helper
// calls `t.Skip` so the suite stays green on hosts without a daemon.
func EnsureRedis(t testing.TB) *RedisEnv {
t.Helper()
redisOnce.Do(func() {
redisEnv, redisErr = startRedis()
})
if redisErr != nil {
t.Skipf("rtmanager integration: redis container start failed (Docker unavailable?): %v", redisErr)
}
return redisEnv
}
// FlushRedis drops every key on the harness Redis. Tests call it from
// their setup so streams, offset records, and leases from previous
// scenarios do not leak.
func FlushRedis(t testing.TB) {
t.Helper()
env := EnsureRedis(t)
client := redis.NewClient(&redis.Options{Addr: env.addr})
defer func() { _ = client.Close() }()
if _, err := client.FlushAll(context.Background()).Result(); err != nil {
t.Fatalf("flush rtmanager redis: %v", err)
}
}
// ShutdownRedis terminates the shared container. `TestMain` invokes it
// after `m.Run`.
func ShutdownRedis() {
if redisEnv == nil {
return
}
if redisEnv.container != nil {
_ = testcontainers.TerminateContainer(redisEnv.container)
}
redisEnv = nil
}
func startRedis() (*RedisEnv, error) {
ctx := context.Background()
container, err := rediscontainer.Run(ctx, redisImage)
if err != nil {
return nil, err
}
addr, err := container.Endpoint(ctx, "")
if err != nil {
_ = testcontainers.TerminateContainer(container)
return nil, err
}
return &RedisEnv{
container: container,
addr: addr,
}, nil
}
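A minimal sketch of the per-test client pattern described above, assuming the usual `integration_test` wiring; the assertion is illustrative:
package integration_test

import (
	"context"
	"testing"

	"galaxy/rtmanager/integration/harness"
	"github.com/stretchr/testify/require"
)

func TestRedisWiringSketch(t *testing.T) {
	env := harness.EnsureRedis(t) // skips when Docker is unavailable
	harness.FlushRedis(t)
	client := env.NewClient(t) // closed automatically via t.Cleanup
	// XLEN on a missing key returns 0, so a flushed keyspace reads as empty.
	n, err := client.XLen(context.Background(), harness.JobResultsStream).Result()
	require.NoError(t, err)
	require.Zero(t, n)
}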
@@ -0,0 +1,195 @@
package harness
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"strings"
"testing"
"time"
)
// defaultHTTPClient backs the runtime-readiness poll and the REST
// helpers below. A short timeout is enough — every internal endpoint
// runs against an in-process listener.
var defaultHTTPClient = &http.Client{Timeout: 5 * time.Second}
// newRequest is a thin shim over `http.NewRequestWithContext` so the
// readiness poll and the REST client share one constructor.
func newRequest(ctx context.Context, method, fullURL string, body io.Reader) (*http.Request, error) {
req, err := http.NewRequestWithContext(ctx, method, fullURL, body)
if err != nil {
return nil, err
}
if body != nil {
req.Header.Set("Content-Type", "application/json; charset=utf-8")
}
req.Header.Set("Accept", "application/json")
req.Header.Set("X-Galaxy-Caller", "admin")
return req, nil
}
// REST is a tiny client for the trusted internal HTTP surface RTM
// exposes to Game Master and Admin Service. It always identifies the
// caller as `admin` (the operation_log records `admin_rest`); tests
// that need GM semantics should add an option later. v1 keeps the
// helper minimal because the integration scenarios only need
// admin-driven flows.
type REST struct {
baseURL string
httpc *http.Client
}
// NewREST builds a REST client targeting env.InternalAddr.
func NewREST(env *Env) *REST {
return &REST{
baseURL: "http://" + env.InternalAddr,
httpc: defaultHTTPClient,
}
}
// Get issues GET path and returns the response body and status code.
func (r *REST) Get(t testing.TB, path string) ([]byte, int) {
t.Helper()
return r.do(t, http.MethodGet, path, nil)
}
// Post issues POST path with body (a Go value JSON-marshaled).
func (r *REST) Post(t testing.TB, path string, body any) ([]byte, int) {
t.Helper()
return r.do(t, http.MethodPost, path, body)
}
// Delete issues DELETE path with no body.
func (r *REST) Delete(t testing.TB, path string) ([]byte, int) {
t.Helper()
return r.do(t, http.MethodDelete, path, nil)
}
// GetRuntime fetches a runtime record by game id and returns the
// decoded payload and the status code; on non-200 responses the
// payload is left zero-valued.
func (r *REST) GetRuntime(t testing.TB, gameID string) (RuntimeRecordResponse, int) {
t.Helper()
body, status := r.Get(t, fmt.Sprintf("/api/v1/internal/runtimes/%s", url.PathEscape(gameID)))
var resp RuntimeRecordResponse
if status == http.StatusOK {
if err := json.Unmarshal(body, &resp); err != nil {
t.Fatalf("decode get-runtime response: %v; body=%s", err, string(body))
}
}
return resp, status
}
// StartRuntime invokes the start endpoint with imageRef.
func (r *REST) StartRuntime(t testing.TB, gameID, imageRef string) (RuntimeRecordResponse, int) {
t.Helper()
body, status := r.Post(t,
fmt.Sprintf("/api/v1/internal/runtimes/%s/start", url.PathEscape(gameID)),
map[string]string{"image_ref": imageRef},
)
return decodeRecord(t, body, status, "start")
}
// StopRuntime invokes the stop endpoint with reason.
func (r *REST) StopRuntime(t testing.TB, gameID, reason string) (RuntimeRecordResponse, int) {
t.Helper()
body, status := r.Post(t,
fmt.Sprintf("/api/v1/internal/runtimes/%s/stop", url.PathEscape(gameID)),
map[string]string{"reason": reason},
)
return decodeRecord(t, body, status, "stop")
}
// RestartRuntime invokes the restart endpoint.
func (r *REST) RestartRuntime(t testing.TB, gameID string) (RuntimeRecordResponse, int) {
t.Helper()
body, status := r.Post(t,
fmt.Sprintf("/api/v1/internal/runtimes/%s/restart", url.PathEscape(gameID)),
struct{}{},
)
return decodeRecord(t, body, status, "restart")
}
// PatchRuntime invokes the patch endpoint with imageRef.
func (r *REST) PatchRuntime(t testing.TB, gameID, imageRef string) (RuntimeRecordResponse, int) {
t.Helper()
body, status := r.Post(t,
fmt.Sprintf("/api/v1/internal/runtimes/%s/patch", url.PathEscape(gameID)),
map[string]string{"image_ref": imageRef},
)
return decodeRecord(t, body, status, "patch")
}
// CleanupRuntime invokes the DELETE container endpoint.
func (r *REST) CleanupRuntime(t testing.TB, gameID string) (RuntimeRecordResponse, int) {
t.Helper()
body, status := r.Delete(t,
fmt.Sprintf("/api/v1/internal/runtimes/%s/container", url.PathEscape(gameID)),
)
return decodeRecord(t, body, status, "cleanup")
}
// RuntimeRecordResponse mirrors the OpenAPI RuntimeRecord schema. Only
// the fields integration scenarios assert against live here; the
// listener encodes everything else.
type RuntimeRecordResponse struct {
GameID string `json:"game_id"`
Status string `json:"status"`
CurrentContainerID *string `json:"current_container_id"`
CurrentImageRef *string `json:"current_image_ref"`
EngineEndpoint *string `json:"engine_endpoint"`
StatePath string `json:"state_path"`
DockerNetwork string `json:"docker_network"`
StartedAt *string `json:"started_at"`
StoppedAt *string `json:"stopped_at"`
RemovedAt *string `json:"removed_at"`
LastOpAt string `json:"last_op_at"`
CreatedAt string `json:"created_at"`
}
func (r *REST) do(t testing.TB, method, path string, body any) ([]byte, int) {
t.Helper()
var reader io.Reader
if body != nil {
raw, err := json.Marshal(body)
if err != nil {
t.Fatalf("marshal request body: %v", err)
}
reader = bytes.NewReader(raw)
}
req, err := newRequest(context.Background(), method, r.baseURL+path, reader)
if err != nil {
t.Fatalf("build %s %s request: %v", method, path, err)
}
resp, err := r.httpc.Do(req)
if err != nil {
t.Fatalf("execute %s %s: %v", method, path, err)
}
defer resp.Body.Close()
raw, err := io.ReadAll(resp.Body)
if err != nil {
t.Fatalf("read %s %s response: %v", method, path, err)
}
return raw, resp.StatusCode
}
func decodeRecord(t testing.TB, body []byte, status int, op string) (RuntimeRecordResponse, int) {
t.Helper()
if status != http.StatusOK {
return RuntimeRecordResponse{}, status
}
var resp RuntimeRecordResponse
if err := json.Unmarshal(body, &resp); err != nil {
t.Fatalf("decode %s response: %v; body=%s", op, err, string(body))
}
return resp, status
}
// PathEscape is a re-export so test files can call it without
// importing `net/url` directly. Keeps the test source focused on
// scenarios.
func PathEscape(value string) string { return url.PathEscape(strings.TrimSpace(value)) }
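A minimal sketch of the non-200 contract `decodeRecord` implies, assuming the usual `integration_test` wiring; the 404 mapping for an unknown game is an assumption about the service, not something this file defines:
package integration_test

import (
	"net/http"
	"testing"

	"galaxy/rtmanager/integration/harness"
	"github.com/stretchr/testify/require"
)

func TestGetUnknownRuntimeSketch(t *testing.T) {
	env := harness.NewEnv(t, harness.EnvOptions{})
	rest := harness.NewREST(env)
	record, status := rest.GetRuntime(t, "no-such-game")
	require.Equal(t, http.StatusNotFound, status) // assumed not-found mapping
	require.Empty(t, record.GameID, "non-200 responses leave the record zero-valued")
}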
@@ -0,0 +1,398 @@
package harness
import (
"context"
"errors"
"io"
"log/slog"
"net/url"
"os"
"strconv"
"strings"
"sync"
"testing"
"time"
"galaxy/postgres"
"galaxy/redisconn"
"galaxy/rtmanager/internal/app"
"galaxy/rtmanager/internal/config"
"github.com/redis/go-redis/v9"
)
// Default stream key shapes used by the integration suite. They match
// the production defaults so the wire shapes asserted in `streams.go`
// are identical to what Game Lobby sees in `integration/lobbyrtm`.
const (
StartJobsStream = "runtime:start_jobs"
StopJobsStream = "runtime:stop_jobs"
JobResultsStream = "runtime:job_results"
HealthEventsStream = "runtime:health_events"
NotificationIntentsKey = "notification:intents"
gameStateRootSubdir = "game-state"
listenAddr = "127.0.0.1:0"
listenerWaitTimeout = 10 * time.Second
readyzPollInterval = 25 * time.Millisecond
cleanupShutdownTimeout = 30 * time.Second
)
// Env carries everything one integration scenario needs to drive the
// Runtime Manager process. The struct is value-typed so tests reach
// fields without intermediate getters.
type Env struct {
// Cfg is the resolved Runtime Manager configuration handed to
// `app.NewRuntime`. Tests inspect it for stream key shapes,
// container defaults, and timeout knobs.
Cfg config.Config
// Runtime is the in-process Runtime Manager exposed for tests that
// need to peek at internal state (`runtime.InternalServer().Addr()`).
Runtime *app.Runtime
// Postgres holds the per-package PostgreSQL fixture.
Postgres *PostgresEnv
// Redis holds the per-package Redis fixture plus a fresh client the
// test owns.
Redis *RedisEnv
RedisClient *redis.Client
// Docker holds the per-package Docker daemon handle.
Docker *DockerEnv
// Lobby is the per-test stub HTTP server.
Lobby *LobbyStub
// Network is the unique Docker network name created for this test.
Network string
// EngineImageRef and PatchedImageRef are the two semver-compatible
// engine image tags the harness builds once per package. Patch
// scenarios point at the second tag.
EngineImageRef string
PatchedImageRef string
// GameStateRoot is the host filesystem path RTM writes per-game
// state directories under. It lives inside `t.ArtifactDir()` so
// failed scenarios leave the engine state behind for inspection.
GameStateRoot string
// InternalAddr is the bound address of RTM's internal HTTP listener
// (resolved after Run binds the port).
InternalAddr string
}
// EnvOptions carry per-test overrides to the harness defaults. Zero
// fields fall back to the defaults NewEnv applies when building the
// configuration.
type EnvOptions struct {
// ReconcileInterval overrides the periodic reconciler interval.
// Default 500ms (so reconcile drift is observable inside a single
// scenario timeout).
ReconcileInterval time.Duration
// CleanupInterval overrides the container-cleanup interval.
CleanupInterval time.Duration
// InspectInterval overrides the Docker inspect worker interval.
InspectInterval time.Duration
// ProbeInterval / ProbeTimeout / ProbeFailuresThreshold override
// the active engine probe knobs.
ProbeInterval time.Duration
ProbeTimeout time.Duration
ProbeFailuresThreshold int
// GameLeaseTTL overrides the per-game Redis lease TTL.
GameLeaseTTL time.Duration
// StreamBlockTimeout overrides the consumer XREAD block window.
StreamBlockTimeout time.Duration
// LogToStderr makes the harness write the runtime's structured
// logs to stderr; the default discards them so test output stays
// focused on assertions.
LogToStderr bool
}
// NewEnv stands up a fresh Runtime Manager process for the calling
// test. It blocks until the internal HTTP listener is bound; tests can
// issue REST and stream requests immediately after the call returns.
//
// `t.Cleanup` runs registrations in reverse order: close the per-test
// redis client, stop and close the runtime, terminate the lobby stub,
// then remove the docker network. Containers RTM created during the
// test are removed by the test's own cleanup paths or by the
// integration `health_test` external-action helpers.
func NewEnv(t *testing.T, opts EnvOptions) *Env {
t.Helper()
pg := EnsurePostgres(t)
rd := EnsureRedis(t)
dk := EnsureDocker(t)
imageRef := EnsureEngineImage(t)
TruncatePostgres(t)
FlushRedis(t)
network := EnsureNetwork(t)
lobby := NewLobbyStub(t)
stateRoot := stateRoot(t)
cfg := buildConfig(buildConfigInput{
PostgresDSN: pg.DSN(),
RedisAddr: rd.Addr(),
DockerHost: resolveDockerHost(),
Network: network,
LobbyURL: lobby.URL(),
GameStateRoot: stateRoot,
ReconcileInterval: pickDuration(opts.ReconcileInterval, 500*time.Millisecond),
CleanupInterval: pickDuration(opts.CleanupInterval, 500*time.Millisecond),
InspectInterval: pickDuration(opts.InspectInterval, 500*time.Millisecond),
ProbeInterval: pickDuration(opts.ProbeInterval, 500*time.Millisecond),
ProbeTimeout: pickDuration(opts.ProbeTimeout, time.Second),
ProbeFailures: pickInt(opts.ProbeFailuresThreshold, 2),
GameLeaseTTL: pickDuration(opts.GameLeaseTTL, 5*time.Second),
StreamBlockTimeout: pickDuration(opts.StreamBlockTimeout, 200*time.Millisecond),
})
logger := newLogger(opts.LogToStderr)
ctx, cancel := context.WithCancel(context.Background())
runtime, err := app.NewRuntime(ctx, cfg, logger)
if err != nil {
cancel()
t.Fatalf("rtmanager integration: new runtime: %v", err)
}
runDone := make(chan error, 1)
go func() {
runDone <- runtime.Run(ctx)
}()
internalAddr := waitForListener(t, runtime)
waitForReady(t, runtime, listenerWaitTimeout)
var cleanupOnce sync.Once
t.Cleanup(func() {
cleanupOnce.Do(func() {
cancel()
waitCtx, waitCancel := context.WithTimeout(context.Background(), cleanupShutdownTimeout)
defer waitCancel()
select {
case err := <-runDone:
if err != nil && !isCleanShutdownErr(err) {
t.Logf("rtmanager integration: runtime.Run returned: %v", err)
}
case <-waitCtx.Done():
t.Logf("rtmanager integration: runtime did not stop within %s", cleanupShutdownTimeout)
}
if err := runtime.Close(); err != nil {
t.Logf("rtmanager integration: runtime.Close: %v", err)
}
})
})
return &Env{
Cfg: cfg,
Runtime: runtime,
Postgres: pg,
Redis: rd,
RedisClient: rd.NewClient(t),
Docker: dk,
Lobby: lobby,
Network: network,
EngineImageRef: imageRef,
PatchedImageRef: PatchedEngineImageRef,
GameStateRoot: stateRoot,
InternalAddr: internalAddr,
}
}
type buildConfigInput struct {
PostgresDSN string
RedisAddr string
DockerHost string
Network string
LobbyURL string
GameStateRoot string
ReconcileInterval time.Duration
CleanupInterval time.Duration
InspectInterval time.Duration
ProbeInterval time.Duration
ProbeTimeout time.Duration
ProbeFailures int
GameLeaseTTL time.Duration
StreamBlockTimeout time.Duration
}
func buildConfig(in buildConfigInput) config.Config {
cfg := config.DefaultConfig()
cfg.InternalHTTP.Addr = listenAddr
cfg.Docker.Host = in.DockerHost
cfg.Docker.Network = in.Network
cfg.Docker.PullPolicy = config.ImagePullPolicyIfMissing
cfg.Postgres = config.PostgresConfig{
Conn: postgres.Config{
PrimaryDSN: in.PostgresDSN,
OperationTimeout: pgOperationTimeout,
MaxOpenConns: 5,
MaxIdleConns: 2,
ConnMaxLifetime: 30 * time.Minute,
},
}
cfg.Redis = config.RedisConfig{
Conn: redisconn.Config{
MasterAddr: in.RedisAddr,
Password: "integration",
OperationTimeout: 2 * time.Second,
},
}
cfg.Streams.StartJobs = StartJobsStream
cfg.Streams.StopJobs = StopJobsStream
cfg.Streams.JobResults = JobResultsStream
cfg.Streams.HealthEvents = HealthEventsStream
cfg.Streams.NotificationIntents = NotificationIntentsKey
cfg.Streams.BlockTimeout = in.StreamBlockTimeout
cfg.Container.GameStateRoot = in.GameStateRoot
// Pin chown target to the current process uid/gid; the dev sandbox
// (and unprivileged dev machines) cannot chown to root.
cfg.Container.GameStateOwnerUID = os.Getuid()
cfg.Container.GameStateOwnerGID = os.Getgid()
cfg.Health.InspectInterval = in.InspectInterval
cfg.Health.ProbeInterval = in.ProbeInterval
cfg.Health.ProbeTimeout = in.ProbeTimeout
cfg.Health.ProbeFailuresThreshold = in.ProbeFailures
cfg.Cleanup.ReconcileInterval = in.ReconcileInterval
cfg.Cleanup.CleanupInterval = in.CleanupInterval
cfg.Coordination.GameLeaseTTL = in.GameLeaseTTL
cfg.Lobby = config.LobbyConfig{
BaseURL: in.LobbyURL,
Timeout: 2 * time.Second,
}
cfg.Telemetry.TracesExporter = "none"
cfg.Telemetry.MetricsExporter = "none"
return cfg
}
func newLogger(toStderr bool) *slog.Logger {
if toStderr {
return slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelDebug}))
}
return slog.New(slog.NewTextHandler(io.Discard, &slog.HandlerOptions{Level: slog.LevelError}))
}
func stateRoot(t *testing.T) string {
t.Helper()
dir := t.ArtifactDir()
root := dir + string(os.PathSeparator) + gameStateRootSubdir
if err := os.MkdirAll(root, 0o755); err != nil {
t.Fatalf("rtmanager integration: create game-state root %q: %v", root, err)
}
return root
}
func resolveDockerHost() string {
if host := strings.TrimSpace(os.Getenv("DOCKER_HOST")); host != "" {
return host
}
return "unix:///var/run/docker.sock"
}
func pickDuration(value, fallback time.Duration) time.Duration {
if value > 0 {
return value
}
return fallback
}
func pickInt(value, fallback int) int {
if value > 0 {
return value
}
return fallback
}
// waitForListener spins until `runtime.InternalServer().Addr()` returns
// a non-empty value or the deadline fires. The internal listener binds
// during `runtime.Run`, which runs in its own goroutine; this helper
// is the bridge between "Run started" and "tests can use REST".
func waitForListener(t *testing.T, runtime *app.Runtime) string {
t.Helper()
deadline := time.Now().Add(listenerWaitTimeout)
for {
if runtime != nil && runtime.InternalServer() != nil {
if addr := runtime.InternalServer().Addr(); addr != "" {
return addr
}
}
if time.Now().After(deadline) {
t.Fatalf("rtmanager integration: internal HTTP listener did not bind within %s", listenerWaitTimeout)
}
time.Sleep(readyzPollInterval)
}
}
// waitForReady polls `/readyz` until it returns 200 or the deadline
// fires. RTM's readyz pings PG, Redis, and Docker; a successful
// response means every dependency is reachable through the runtime
// process.
func waitForReady(t *testing.T, runtime *app.Runtime, timeout time.Duration) {
t.Helper()
deadline := time.Now().Add(timeout)
addr := runtime.InternalServer().Addr()
probeURL := (&url.URL{Scheme: "http", Host: addr, Path: "/readyz"}).String()
for {
req, err := newRequest(context.Background(), "GET", probeURL, nil)
if err == nil {
resp, err := defaultHTTPClient.Do(req)
if err == nil {
_, _ = io.Copy(io.Discard, resp.Body)
_ = resp.Body.Close()
if resp.StatusCode == 200 {
return
}
}
}
if time.Now().After(deadline) {
t.Fatalf("rtmanager integration: /readyz did not return 200 within %s", timeout)
}
time.Sleep(readyzPollInterval)
}
}
func isCleanShutdownErr(err error) bool {
return err == nil || errors.Is(err, context.Canceled)
}
// IDFromTestName builds a readable, unique game id from the caller's
// test name. Two tests with the same name running back-to-back would
// otherwise collide on PG state across the per-test
// `TruncatePostgres` window; deriving the suffix from
// `Now().UnixNano()` rules that out.
func IDFromTestName(t *testing.T) string {
t.Helper()
// The container hostname is `galaxy-game-{game_id}` and must fit
// HOST_NAME_MAX=64 chars; runc rejects longer values with
// "sethostname: invalid argument". Cap the lowercased test-name
// component at 35 chars and append the ~12-char base36 nanosecond
// suffix so the total stays comfortably under the limit
// (12 + 35 + 1 + 12 = 60).
const maxNameLen = 35
suffix := strconv.FormatInt(time.Now().UnixNano(), 36)
prefix := strings.ToLower(strings.NewReplacer("/", "-", " ", "-").Replace(t.Name()))
if len(prefix) > maxNameLen {
prefix = prefix[:maxNameLen]
}
return prefix + "-" + suffix
}
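A minimal sketch of the per-test override pattern `EnvOptions` enables; the interval and threshold values are illustrative, not recommendations:
package integration_test

import (
	"testing"
	"time"

	"galaxy/rtmanager/integration/harness"
)

func TestProbeTuningSketch(t *testing.T) {
	env := harness.NewEnv(t, harness.EnvOptions{
		ProbeInterval:          250 * time.Millisecond,
		ProbeTimeout:           500 * time.Millisecond,
		ProbeFailuresThreshold: 3,
		LogToStderr:            true, // surface runtime logs while debugging locally
	})
	_ = env.InternalAddr // REST and stream helpers are usable as soon as NewEnv returns
}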
@@ -0,0 +1,128 @@
package harness
import (
"context"
"errors"
"testing"
"time"
"galaxy/rtmanager/internal/adapters/postgres/healthsnapshotstore"
"galaxy/rtmanager/internal/adapters/postgres/operationlogstore"
"galaxy/rtmanager/internal/adapters/postgres/runtimerecordstore"
"galaxy/rtmanager/internal/domain/health"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"github.com/stretchr/testify/require"
)
// RuntimeRecord returns the persisted runtime record for gameID. The
// helper opens the store on every call (cheap; the harness `*sql.DB`
// is shared) so individual scenarios stay isolated even if a previous
// test mutated store state.
func RuntimeRecord(t testing.TB, env *Env, gameID string) (runtime.RuntimeRecord, error) {
t.Helper()
store, err := runtimerecordstore.New(runtimerecordstore.Config{
DB: env.Postgres.Pool(),
OperationTimeout: pgOperationTimeout,
})
require.NoError(t, err)
return store.Get(context.Background(), gameID)
}
// MustRuntimeRecord asserts that the record exists and returns it.
func MustRuntimeRecord(t testing.TB, env *Env, gameID string) runtime.RuntimeRecord {
t.Helper()
record, err := RuntimeRecord(t, env, gameID)
require.NoErrorf(t, err, "load runtime record for %s", gameID)
return record
}
// EventuallyRuntimeRecord polls until predicate matches the runtime
// record for gameID, or the deadline fires. Returns the matching
// record. Used by lifecycle assertions that depend on async state
// transitions (start consumer → record).
func EventuallyRuntimeRecord(t testing.TB, env *Env, gameID string, predicate func(runtime.RuntimeRecord) bool, timeout time.Duration) runtime.RuntimeRecord {
t.Helper()
if timeout <= 0 {
timeout = defaultStreamTimeout
}
deadline := time.Now().Add(timeout)
for {
record, err := RuntimeRecord(t, env, gameID)
if err == nil && predicate(record) {
return record
}
if err != nil && !errors.Is(err, runtime.ErrNotFound) {
t.Fatalf("rtmanager integration: load runtime record: %v", err)
}
if time.Now().After(deadline) {
if err != nil {
t.Fatalf("rtmanager integration: runtime record predicate not met within %s; last err=%v",
timeout, err)
}
t.Fatalf("rtmanager integration: runtime record predicate not met within %s; last record=%+v",
timeout, record)
}
time.Sleep(defaultStreamPoll)
}
}
// OperationEntries returns up to `limit` most-recent operation_log
// entries for gameID, ordered descending by started_at.
func OperationEntries(t testing.TB, env *Env, gameID string, limit int) []operation.OperationEntry {
t.Helper()
store, err := operationlogstore.New(operationlogstore.Config{
DB: env.Postgres.Pool(),
OperationTimeout: pgOperationTimeout,
})
require.NoError(t, err)
entries, err := store.ListByGame(context.Background(), gameID, limit)
require.NoErrorf(t, err, "list operation log entries for %s", gameID)
return entries
}
// EventuallyOperationKind polls operation_log until at least one entry
// for gameID has the requested kind, or the deadline fires. Returns
// the matching entry.
func EventuallyOperationKind(t testing.TB, env *Env, gameID string, kind operation.OpKind, timeout time.Duration) operation.OperationEntry {
t.Helper()
if timeout <= 0 {
timeout = defaultStreamTimeout
}
deadline := time.Now().Add(timeout)
for {
entries := OperationEntries(t, env, gameID, 50)
for _, entry := range entries {
if entry.OpKind == kind {
return entry
}
}
if time.Now().After(deadline) {
t.Fatalf("rtmanager integration: operation_log entry with op_kind=%s not seen within %s; observed=%v",
kind, timeout, opKindSummary(entries))
}
time.Sleep(defaultStreamPoll)
}
}
// HealthSnapshot returns the latest persisted health snapshot for
// gameID, or the underlying not-found sentinel when nothing has been
// recorded yet.
func HealthSnapshot(t testing.TB, env *Env, gameID string) (health.HealthSnapshot, error) {
t.Helper()
store, err := healthsnapshotstore.New(healthsnapshotstore.Config{
DB: env.Postgres.Pool(),
OperationTimeout: pgOperationTimeout,
})
require.NoError(t, err)
return store.Get(context.Background(), gameID)
}
func opKindSummary(entries []operation.OperationEntry) []string {
out := make([]string, 0, len(entries))
for _, entry := range entries {
out = append(out, string(entry.OpKind)+"/"+string(entry.Outcome))
}
return out
}
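A short predicate sketch for the polling helpers above; `env` and `gameID` come from the calling scenario and the timeout is illustrative:
stopped := harness.EventuallyRuntimeRecord(t, env, gameID,
	func(r runtime.RuntimeRecord) bool { return r.Status == runtime.StatusStopped },
	15*time.Second,
)
_ = stopped // follow-up assertions on the record go here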
@@ -0,0 +1,334 @@
package harness
import (
"context"
"encoding/json"
"fmt"
"strconv"
"strings"
"testing"
"time"
"galaxy/rtmanager/internal/ports"
"github.com/redis/go-redis/v9"
"github.com/stretchr/testify/require"
)
// Default scenario timeouts. Stream-driven assertions sit on top of
// the runtime's worker tickers (defaults of 200-500ms in
// `EnvOptions`); 30s gives every reconcile / probe / event tick more
// than enough headroom even on a slow CI runner.
const (
defaultStreamTimeout = 30 * time.Second
defaultStreamPoll = 25 * time.Millisecond
)
// XAddStartJob appends one start-job entry in the
// `runtime:start_jobs` AsyncAPI shape and returns the assigned entry
// id. Mirrors the wire shape produced by Lobby's
// `runtimemanager.Publisher` so the consumer treats the entry exactly
// like a real Lobby-published job.
func XAddStartJob(t testing.TB, env *Env, gameID, imageRef string) string {
t.Helper()
id, err := env.RedisClient.XAdd(context.Background(), &redis.XAddArgs{
Stream: env.Cfg.Streams.StartJobs,
Values: map[string]any{
"game_id": gameID,
"image_ref": imageRef,
"requested_at_ms": time.Now().UTC().UnixMilli(),
},
}).Result()
require.NoErrorf(t, err, "xadd start_jobs for game %s", gameID)
return id
}
// XAddStopJob appends one stop-job entry classified by reason. The
// reason enum is documented at `ports.StopReason`.
func XAddStopJob(t testing.TB, env *Env, gameID, reason string) string {
t.Helper()
id, err := env.RedisClient.XAdd(context.Background(), &redis.XAddArgs{
Stream: env.Cfg.Streams.StopJobs,
Values: map[string]any{
"game_id": gameID,
"reason": reason,
"requested_at_ms": time.Now().UTC().UnixMilli(),
},
}).Result()
require.NoErrorf(t, err, "xadd stop_jobs for game %s", gameID)
return id
}
// JobResultEntry is the decoded shape of one `runtime:job_results`
// stream entry. Mirrors `ports.JobResult` plus the entry id surfaced
// by Redis so tests can correlate XADD ids with results.
type JobResultEntry struct {
StreamID string
GameID string
Outcome string
ContainerID string
EngineEndpoint string
ErrorCode string
ErrorMessage string
}
// HealthEventEntry mirrors the `runtime:health_events` AsyncAPI shape
// in decoded form.
type HealthEventEntry struct {
StreamID string
GameID string
ContainerID string
EventType string
OccurredAtMs int64
Details map[string]any
}
// NotificationIntentEntry decodes one `notification:intents` entry
// that RTM publishes for first-touch start failures.
type NotificationIntentEntry struct {
StreamID string
NotificationType string
IdempotencyKey string
Payload map[string]any
}
// WaitForJobResult polls `runtime:job_results` until predicate
// matches, or the timeout fires. Returns the matching entry. The
// helper does not consume the stream (every call rescans the whole
// stream with `XRANGE - +`) because RTM's writes are append-only and
// the per-test cardinality is small.
func WaitForJobResult(t testing.TB, env *Env, predicate func(JobResultEntry) bool, timeout time.Duration) JobResultEntry {
t.Helper()
if timeout <= 0 {
timeout = defaultStreamTimeout
}
deadline := time.Now().Add(timeout)
for {
entries, err := env.RedisClient.XRange(context.Background(), env.Cfg.Streams.JobResults, "-", "+").Result()
require.NoErrorf(t, err, "xrange %s", env.Cfg.Streams.JobResults)
for _, entry := range entries {
decoded := decodeJobResult(entry)
if predicate(decoded) {
return decoded
}
}
if time.Now().After(deadline) {
t.Fatalf("rtmanager integration: no job_result matched within %s; observed=%v",
timeout, jobResultStreamSummary(entries))
}
time.Sleep(defaultStreamPoll)
}
}
// AllJobResults returns every entry on `runtime:job_results` in stream
// order. Useful for assertions that depend on cardinality (replay
// tests).
func AllJobResults(t testing.TB, env *Env) []JobResultEntry {
t.Helper()
entries, err := env.RedisClient.XRange(context.Background(), env.Cfg.Streams.JobResults, "-", "+").Result()
require.NoErrorf(t, err, "xrange %s", env.Cfg.Streams.JobResults)
out := make([]JobResultEntry, 0, len(entries))
for _, entry := range entries {
out = append(out, decodeJobResult(entry))
}
return out
}
// WaitForHealthEvent polls `runtime:health_events` until predicate
// matches, or the timeout fires.
func WaitForHealthEvent(t testing.TB, env *Env, predicate func(HealthEventEntry) bool, timeout time.Duration) HealthEventEntry {
t.Helper()
if timeout <= 0 {
timeout = defaultStreamTimeout
}
deadline := time.Now().Add(timeout)
for {
entries, err := env.RedisClient.XRange(context.Background(), env.Cfg.Streams.HealthEvents, "-", "+").Result()
require.NoErrorf(t, err, "xrange %s", env.Cfg.Streams.HealthEvents)
for _, entry := range entries {
decoded := decodeHealthEvent(t, entry)
if predicate(decoded) {
return decoded
}
}
if time.Now().After(deadline) {
t.Fatalf("rtmanager integration: no health_event matched within %s; observed=%v",
timeout, healthEventStreamSummary(entries))
}
time.Sleep(defaultStreamPoll)
}
}
// WaitForNotificationIntent polls `notification:intents` until
// predicate matches.
func WaitForNotificationIntent(t testing.TB, env *Env, predicate func(NotificationIntentEntry) bool, timeout time.Duration) NotificationIntentEntry {
t.Helper()
if timeout <= 0 {
timeout = defaultStreamTimeout
}
deadline := time.Now().Add(timeout)
for {
entries, err := env.RedisClient.XRange(context.Background(), env.Cfg.Streams.NotificationIntents, "-", "+").Result()
require.NoErrorf(t, err, "xrange %s", env.Cfg.Streams.NotificationIntents)
for _, entry := range entries {
decoded := decodeNotificationIntent(t, entry)
if predicate(decoded) {
return decoded
}
}
if time.Now().After(deadline) {
t.Fatalf("rtmanager integration: no notification_intent matched within %s; observed=%v",
timeout, notificationStreamSummary(entries))
}
time.Sleep(defaultStreamPoll)
}
}
// JobOutcomeIs returns a predicate matching a job result whose game id
// and outcome equal the inputs.
func JobOutcomeIs(gameID, outcome string) func(JobResultEntry) bool {
return func(entry JobResultEntry) bool {
return entry.GameID == gameID && entry.Outcome == outcome
}
}
// JobOutcomeWithErrorCode matches a job result whose game id, outcome,
// and error_code all equal the inputs. Used by replay-no-op
// assertions.
func JobOutcomeWithErrorCode(gameID, outcome, errorCode string) func(JobResultEntry) bool {
return func(entry JobResultEntry) bool {
return entry.GameID == gameID && entry.Outcome == outcome && entry.ErrorCode == errorCode
}
}
// HealthEventTypeIs returns a predicate matching a health event whose
// game id and event_type equal the inputs.
func HealthEventTypeIs(gameID, eventType string) func(HealthEventEntry) bool {
return func(entry HealthEventEntry) bool {
return entry.GameID == gameID && entry.EventType == eventType
}
}
func decodeJobResult(message redis.XMessage) JobResultEntry {
return JobResultEntry{
StreamID: message.ID,
GameID: streamString(message.Values, "game_id"),
Outcome: streamString(message.Values, "outcome"),
ContainerID: streamString(message.Values, "container_id"),
EngineEndpoint: streamString(message.Values, "engine_endpoint"),
ErrorCode: streamString(message.Values, "error_code"),
ErrorMessage: streamString(message.Values, "error_message"),
}
}
func decodeHealthEvent(t testing.TB, message redis.XMessage) HealthEventEntry {
t.Helper()
occurredAt, _ := strconv.ParseInt(streamString(message.Values, "occurred_at_ms"), 10, 64)
entry := HealthEventEntry{
StreamID: message.ID,
GameID: streamString(message.Values, "game_id"),
ContainerID: streamString(message.Values, "container_id"),
EventType: streamString(message.Values, "event_type"),
OccurredAtMs: occurredAt,
}
rawDetails := streamString(message.Values, "details")
if rawDetails != "" {
var parsed map[string]any
if err := json.Unmarshal([]byte(rawDetails), &parsed); err == nil {
entry.Details = parsed
}
}
return entry
}
func decodeNotificationIntent(t testing.TB, message redis.XMessage) NotificationIntentEntry {
t.Helper()
entry := NotificationIntentEntry{
StreamID: message.ID,
NotificationType: streamString(message.Values, "notification_type"),
IdempotencyKey: streamString(message.Values, "idempotency_key"),
}
rawPayload := streamString(message.Values, "payload_json")
if rawPayload == "" {
rawPayload = streamString(message.Values, "payload")
}
if rawPayload != "" {
var parsed map[string]any
if err := json.Unmarshal([]byte(rawPayload), &parsed); err == nil {
entry.Payload = parsed
}
}
return entry
}
func streamString(values map[string]any, key string) string {
raw, ok := values[key]
if !ok {
return ""
}
switch typed := raw.(type) {
case string:
return typed
case []byte:
return string(typed)
default:
return fmt.Sprintf("%v", typed)
}
}
func jobResultStreamSummary(entries []redis.XMessage) []string {
out := make([]string, 0, len(entries))
for _, entry := range entries {
decoded := decodeJobResult(entry)
out = append(out, fmt.Sprintf("%s game=%s outcome=%s err=%s",
decoded.StreamID, decoded.GameID, decoded.Outcome, decoded.ErrorCode))
}
return out
}
func healthEventStreamSummary(entries []redis.XMessage) []string {
out := make([]string, 0, len(entries))
for _, entry := range entries {
out = append(out, fmt.Sprintf("%s %s %s",
entry.ID, streamString(entry.Values, "game_id"), streamString(entry.Values, "event_type")))
}
return out
}
func notificationStreamSummary(entries []redis.XMessage) []string {
out := make([]string, 0, len(entries))
for _, entry := range entries {
out = append(out, fmt.Sprintf("%s %s",
entry.ID, streamString(entry.Values, "notification_type")))
}
return out
}
// JobOutcomeSuccess and JobOutcomeFailure re-export the job-outcome
// constants from `ports` so suite authors can build predicates without
// importing `ports` themselves; kept here to keep test source focused.
var (
JobOutcomeSuccess = ports.JobOutcomeSuccess
JobOutcomeFailure = ports.JobOutcomeFailure
)
// AssertNoJobResultBeyond fails the test if the count of entries on
// `runtime:job_results` exceeds `expectedCount`. Used by the replay
// tests to prove the second envelope was no-op.
func AssertNoJobResultBeyond(t testing.TB, env *Env, expectedCount int) {
t.Helper()
entries, err := env.RedisClient.XLen(context.Background(), env.Cfg.Streams.JobResults).Result()
require.NoError(t, err)
require.LessOrEqualf(t, entries, int64(expectedCount),
"job_results stream has more entries than expected; got=%d expected<=%d", entries, expectedCount)
}
// SanitizeContainerSummaryFor returns a human-readable diagnostic
// string for a container summary keyed by game id (map iteration order
// is not fixed, so key order may vary). Used in test failures.
func SanitizeContainerSummaryFor(values map[string]string, gameID string) string {
parts := make([]string, 0, len(values))
for key, value := range values {
parts = append(parts, key+"="+value)
}
return fmt.Sprintf("game=%s {%s}", gameID, strings.Join(parts, ", "))
}
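A short sketch of the replay cardinality check `AssertNoJobResultBeyond` exists for; `env` and `gameID` come from the calling scenario, and the fresh start is assumed to have already produced exactly one job_result:
harness.XAddStartJob(t, env, gameID, env.EngineImageRef) // replayed envelope
_ = harness.WaitForJobResult(t, env,
	harness.JobOutcomeWithErrorCode(gameID, harness.JobOutcomeSuccess, "replay_no_op"),
	0, // zero falls back to defaultStreamTimeout
)
harness.AssertNoJobResultBeyond(t, env, 2) // original + replay result, nothing more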
@@ -0,0 +1,303 @@
//go:build integration
// Package integration_test owns the service-local end-to-end scenarios
// for Runtime Manager. The build tag keeps the suite out of the
// default `go test ./...` run; CI invokes the suite explicitly with
// `go test -tags=integration ./rtmanager/integration/...`.
//
// Design rationale for the suite — build tag, in-process harness,
// per-test isolation, two-tag engine image — lives in
// `rtmanager/docs/integration-tests.md`. Each test stands up its own
// Runtime Manager process via `harness.NewEnv`, drives the same
// streams Game Lobby uses in `integration/lobbyrtm`, and asserts the
// resulting PostgreSQL, Redis-stream, and Docker side-effects.
package integration_test
import (
"context"
"net/http"
"testing"
"time"
"galaxy/rtmanager/integration/harness"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/ports"
"github.com/docker/docker/api/types/container"
"github.com/docker/docker/api/types/filters"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestMain centralises shared-container teardown so individual
// failing tests do not leak the testcontainers postgres / redis pair.
func TestMain(m *testing.M) {
harness.RunMain(m)
}
// TestLifecycle_StartInspectStopRestartPatchCleanup drives one game
// through every supported lifecycle operation against the real engine
// image and asserts each step's PG, Redis-stream, and Docker
// side-effects.
func TestLifecycle_StartInspectStopRestartPatchCleanup(t *testing.T) {
env := harness.NewEnv(t, harness.EnvOptions{LogToStderr: true})
rest := harness.NewREST(env)
gameID := harness.IDFromTestName(t)
// Step 1 — start through the Lobby async stream contract.
startEntryID := harness.XAddStartJob(t, env, gameID, env.EngineImageRef)
t.Logf("start_jobs xadd id=%s", startEntryID)
startResult := harness.WaitForJobResult(t, env,
harness.JobOutcomeIs(gameID, ports.JobOutcomeSuccess),
30*time.Second,
)
require.Equal(t, "", startResult.ErrorCode, "fresh start must publish empty error_code")
require.NotEmpty(t, startResult.ContainerID, "fresh start job result must carry container_id")
require.NotEmpty(t, startResult.EngineEndpoint, "fresh start job result must carry engine_endpoint")
// PG record reflects the start.
startedRecord := harness.EventuallyRuntimeRecord(t, env, gameID,
func(r runtime.RuntimeRecord) bool { return r.Status == runtime.StatusRunning },
15*time.Second,
)
assert.Equal(t, env.EngineImageRef, startedRecord.CurrentImageRef)
assert.Equal(t, env.Network, startedRecord.DockerNetwork)
assert.Equal(t, startResult.ContainerID, startedRecord.CurrentContainerID)
assert.Equal(t, startResult.EngineEndpoint, startedRecord.EngineEndpoint)
// operation_log captures the start.
startEntry := harness.EventuallyOperationKind(t, env, gameID, operation.OpKindStart, 5*time.Second)
assert.Equal(t, operation.OutcomeSuccess, startEntry.Outcome)
assert.Equal(t, operation.OpSourceLobbyStream, startEntry.OpSource)
// Step 2 — inspect via the GM/Admin REST surface.
getResp, status := rest.GetRuntime(t, gameID)
require.Equal(t, http.StatusOK, status)
require.Equal(t, "running", getResp.Status)
require.NotNil(t, getResp.CurrentContainerID)
require.Equal(t, startResult.ContainerID, *getResp.CurrentContainerID)
require.NotNil(t, getResp.CurrentImageRef)
require.Equal(t, env.EngineImageRef, *getResp.CurrentImageRef)
require.NotNil(t, getResp.EngineEndpoint)
require.Equal(t, startResult.EngineEndpoint, *getResp.EngineEndpoint)
// Step 3 — stop through the Lobby async stream contract.
harness.XAddStopJob(t, env, gameID, "cancelled")
stopResult := waitForLatestStopOrStartResult(t, env, gameID)
require.Equal(t, ports.JobOutcomeSuccess, stopResult.Outcome)
require.Equal(t, "", stopResult.ErrorCode, "fresh stop must publish empty error_code")
stoppedRecord := harness.EventuallyRuntimeRecord(t, env, gameID,
func(r runtime.RuntimeRecord) bool { return r.Status == runtime.StatusStopped },
15*time.Second,
)
assert.Equal(t, startResult.ContainerID, stoppedRecord.CurrentContainerID,
"stop preserves the current container id until cleanup")
// Step 4 — restart via REST. Container id changes; engine endpoint
// stays stable.
restartResp, status := rest.RestartRuntime(t, gameID)
require.Equal(t, http.StatusOK, status)
require.Equal(t, "running", restartResp.Status)
require.NotNil(t, restartResp.CurrentContainerID)
require.NotEqual(t, startResult.ContainerID, *restartResp.CurrentContainerID,
"restart must produce a new container id")
require.NotNil(t, restartResp.EngineEndpoint)
require.Equal(t, startResult.EngineEndpoint, *restartResp.EngineEndpoint,
"restart must keep the engine endpoint stable")
restartContainerID := *restartResp.CurrentContainerID
restartEntry := harness.EventuallyOperationKind(t, env, gameID, operation.OpKindRestart, 5*time.Second)
assert.Equal(t, operation.OutcomeSuccess, restartEntry.Outcome)
assert.Equal(t, operation.OpSourceAdminRest, restartEntry.OpSource)
// Step 5 — patch to the second semver-compatible tag. Same image
// content, but the runtime should still record the new tag and
// recreate the container.
patchResp, status := rest.PatchRuntime(t, gameID, env.PatchedImageRef)
require.Equal(t, http.StatusOK, status)
require.Equal(t, "running", patchResp.Status)
require.NotNil(t, patchResp.CurrentImageRef)
assert.Equal(t, env.PatchedImageRef, *patchResp.CurrentImageRef)
require.NotNil(t, patchResp.CurrentContainerID)
assert.NotEqual(t, restartContainerID, *patchResp.CurrentContainerID,
"patch must recreate the container")
patchEntry := harness.EventuallyOperationKind(t, env, gameID, operation.OpKindPatch, 5*time.Second)
assert.Equal(t, operation.OutcomeSuccess, patchEntry.Outcome)
// Step 6 — quiesce via REST stop so cleanup is allowed (cleanup
// refuses to remove a running container per
// `rtmanager/README.md §Lifecycles → Cleanup`).
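// A minimal sketch of that guard inside the cleanup service (error
// surface and naming are assumptions, not the documented contract):
//
//	if rec.Status == runtime.StatusRunning {
//	    return errCleanupRefusedRunning // hypothetical sentinel error
//	}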
stopResp, status := rest.StopRuntime(t, gameID, "admin_request")
require.Equal(t, http.StatusOK, status)
require.Equal(t, "stopped", stopResp.Status)
// Step 7 — cleanup the container. PG record flips to removed and
// current_container_id becomes nil.
cleanupResp, status := rest.CleanupRuntime(t, gameID)
require.Equal(t, http.StatusOK, status)
require.Equal(t, "removed", cleanupResp.Status)
require.Nil(t, cleanupResp.CurrentContainerID)
cleanupEntry := harness.EventuallyOperationKind(t, env, gameID, operation.OpKindCleanupContainer, 5*time.Second)
assert.Equal(t, operation.OutcomeSuccess, cleanupEntry.Outcome)
assert.Equal(t, operation.OpSourceAdminRest, cleanupEntry.OpSource)
}
// TestReplay_StartJobIsNoop publishes the same start envelope twice
// and asserts that Runtime Manager produces a fresh job_result for
// the first XADD and a success result carrying the `replay_no_op`
// error code for the second, without recreating the engine container.
func TestReplay_StartJobIsNoop(t *testing.T) {
env := harness.NewEnv(t, harness.EnvOptions{})
gameID := harness.IDFromTestName(t)
// First XADD: fresh start.
harness.XAddStartJob(t, env, gameID, env.EngineImageRef)
first := harness.WaitForJobResult(t, env,
harness.JobOutcomeIs(gameID, ports.JobOutcomeSuccess),
30*time.Second,
)
require.Equal(t, "", first.ErrorCode)
// Second XADD: same envelope; the start service must short-circuit
// at the `runtime_records.status=running && image_ref` check.
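// A minimal sketch of that guard, assuming the field names on the job
// envelope (the record fields are the ones asserted in this file, and
// publishJobResult is a hypothetical helper):
//
//	if rec.Status == runtime.StatusRunning && rec.CurrentImageRef == job.ImageRef {
//	    // republish the existing ids instead of creating a new container
//	    publishJobResult(job.GameID, ports.JobOutcomeSuccess, "replay_no_op",
//	        rec.CurrentContainerID, rec.EngineEndpoint)
//	}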
harness.XAddStartJob(t, env, gameID, env.EngineImageRef)
replay := harness.WaitForJobResult(t, env,
harness.JobOutcomeWithErrorCode(gameID, ports.JobOutcomeSuccess, "replay_no_op"),
15*time.Second,
)
assert.Equal(t, first.ContainerID, replay.ContainerID,
"replay must surface the same container id as the original start")
assert.Equal(t, first.EngineEndpoint, replay.EngineEndpoint)
// Docker view: exactly one engine container exists for this game.
assertSingleEngineContainer(t, env, gameID)
// Lifecycle stream produced exactly two entries: fresh + replay.
entries := harness.AllJobResults(t, env)
require.Len(t, entries, 2)
assert.Equal(t, "", entries[0].ErrorCode)
assert.Equal(t, "replay_no_op", entries[1].ErrorCode)
}
// TestReplay_StopJobIsNoop publishes a stop envelope twice after a
// successful start and asserts the second stop surfaces as
// `replay_no_op` without altering the runtime record's `stopped_at`.
func TestReplay_StopJobIsNoop(t *testing.T) {
env := harness.NewEnv(t, harness.EnvOptions{})
gameID := harness.IDFromTestName(t)
// Bring the game to `running`. The start path publishes one entry
// to `runtime:job_results`; the stops below publish two more, so
// per-game stream order is [start, first-stop, replay-stop].
harness.XAddStartJob(t, env, gameID, env.EngineImageRef)
harness.WaitForJobResult(t, env,
harness.JobOutcomeIs(gameID, ports.JobOutcomeSuccess),
30*time.Second,
)
// First stop: fresh. The expectedCount accounts for the start
// entry that is already on the stream.
harness.XAddStopJob(t, env, gameID, "cancelled")
first := waitForJobResultByIndex(t, env, gameID, 2)
require.Equal(t, ports.JobOutcomeSuccess, first.Outcome)
require.Equal(t, "", first.ErrorCode)
stoppedRecord := harness.EventuallyRuntimeRecord(t, env, gameID,
func(r runtime.RuntimeRecord) bool { return r.Status == runtime.StatusStopped },
15*time.Second,
)
require.NotNil(t, stoppedRecord.StoppedAt, "stopped record must carry stopped_at")
originalStoppedAt := *stoppedRecord.StoppedAt
// Second stop: replay (third entry on the per-game stream).
harness.XAddStopJob(t, env, gameID, "cancelled")
replay := waitForJobResultByIndex(t, env, gameID, 3)
require.Equal(t, ports.JobOutcomeSuccess, replay.Outcome)
assert.Equal(t, "replay_no_op", replay.ErrorCode)
// stopped_at stays anchored to the first stop.
postReplay := harness.MustRuntimeRecord(t, env, gameID)
require.Equal(t, runtime.StatusStopped, postReplay.Status)
require.NotNil(t, postReplay.StoppedAt)
assert.True(t, originalStoppedAt.Equal(*postReplay.StoppedAt),
"stopped_at must not move on a replay stop; was %s, now %s",
originalStoppedAt, *postReplay.StoppedAt)
}
// waitForLatestStopOrStartResult polls `runtime:job_results` until two
// `outcome=success` entries exist for gameID and returns the later
// one. The lifecycle scenario emits two consecutive successes (start
// then stop), so the second match is the stop result.
func waitForLatestStopOrStartResult(t *testing.T, env *harness.Env, gameID string) harness.JobResultEntry {
t.Helper()
deadline := time.Now().Add(30 * time.Second)
for {
entries := harness.AllJobResults(t, env)
// Two entries means we've observed both the start and stop
// outcomes for this game.
matched := 0
var last harness.JobResultEntry
for _, entry := range entries {
if entry.GameID == gameID && entry.Outcome == ports.JobOutcomeSuccess {
matched++
last = entry
}
}
if matched >= 2 {
return last
}
if time.Now().After(deadline) {
t.Fatalf("expected two job_results for %s, got %d", gameID, matched)
}
time.Sleep(50 * time.Millisecond)
}
}
// waitForJobResultByIndex polls the job_results stream until it has
// at least `expectedCount` entries for gameID and returns the
// expectedCount-th. Used by the replay tests to deterministically
// pick the second / nth result.
func waitForJobResultByIndex(t *testing.T, env *harness.Env, gameID string, expectedCount int) harness.JobResultEntry {
t.Helper()
deadline := time.Now().Add(30 * time.Second)
for {
entries := harness.AllJobResults(t, env)
matches := make([]harness.JobResultEntry, 0, len(entries))
for _, entry := range entries {
if entry.GameID == gameID {
matches = append(matches, entry)
}
}
if len(matches) >= expectedCount {
return matches[expectedCount-1]
}
if time.Now().After(deadline) {
t.Fatalf("expected at least %d job_results for %s, got %d",
expectedCount, gameID, len(matches))
}
time.Sleep(50 * time.Millisecond)
}
}
// assertSingleEngineContainer queries Docker by the per-game label and
// asserts exactly one matching container exists. Catches replay
// regressions that would let RTM start two containers for the same
// game id.
func assertSingleEngineContainer(t *testing.T, env *harness.Env, gameID string) {
t.Helper()
args := filters.NewArgs(
filters.Arg("label", "com.galaxy.owner=rtmanager"),
filters.Arg("label", "com.galaxy.game_id="+gameID),
)
containers, err := env.Docker.Client().ContainerList(
context.Background(),
container.ListOptions{All: true, Filters: args},
)
require.NoError(t, err)
require.Lenf(t, containers, 1, "expected one engine container for game %s, got %d", gameID, len(containers))
}
+200
View File
@@ -0,0 +1,200 @@
//go:build integration
package integration_test
import (
"context"
"fmt"
"strconv"
"testing"
"time"
"galaxy/notificationintent"
"galaxy/rtmanager/integration/harness"
"galaxy/rtmanager/internal/domain/health"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/startruntime"
dockercontainer "github.com/docker/docker/api/types/container"
"github.com/docker/docker/api/types/network"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestHealth_ContainerDisappearedAndAdopt verifies the two
// drift-detection paths. The Docker events listener emits
// `container_disappeared` when a tracked container is destroyed
// outside RTM, and the reconciler adopts a fresh container labelled
// `com.galaxy.owner=rtmanager` that has no PG row.
//
// `runtime_records.status=removed` is terminal per
// `runtime.AllowedTransitions`; the adoption path therefore uses a
// **fresh** game_id rather than re-adopting the disposed one. That
// matches the documented contract: reconciler adopts containers
// labelled `com.galaxy.owner=rtmanager` for which no PG row exists.
func TestHealth_ContainerDisappearedAndAdopt(t *testing.T) {
env := harness.NewEnv(t, harness.EnvOptions{
ReconcileInterval: 500 * time.Millisecond,
})
// Step 1 — bring a game to running through the start consumer.
disposalGameID := harness.IDFromTestName(t) + "-d"
harness.XAddStartJob(t, env, disposalGameID, env.EngineImageRef)
startResult := harness.WaitForJobResult(t, env,
harness.JobOutcomeIs(disposalGameID, ports.JobOutcomeSuccess),
30*time.Second,
)
originalContainerID := startResult.ContainerID
require.NotEmpty(t, originalContainerID)
// Step 2 — externally remove the container; the events listener
// should observe the destroy and publish `container_disappeared`.
removeContainer(t, env, originalContainerID)
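// Roughly how such a listener subscribes via the Docker SDK events API
// (a sketch; the filter values beyond the owner label are assumptions):
//
//	msgs, errs := cli.Events(ctx, events.ListOptions{
//	    Filters: filters.NewArgs(
//	        filters.Arg("type", "container"),
//	        filters.Arg("event", "destroy"),
//	        filters.Arg("label", "com.galaxy.owner=rtmanager"),
//	    ),
//	})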
disappeared := harness.WaitForHealthEvent(t, env,
harness.HealthEventTypeIs(disposalGameID, string(health.EventTypeContainerDisappeared)),
20*time.Second,
)
assert.Equal(t, originalContainerID, disappeared.ContainerID)
// The reconciler also marks the runtime record as removed within
// one or two ticks (`reconcile_dispose`).
harness.EventuallyRuntimeRecord(t, env, disposalGameID,
func(r runtime.RuntimeRecord) bool { return r.Status == runtime.StatusRemoved },
15*time.Second,
)
harness.EventuallyOperationKind(t, env, disposalGameID, operation.OpKindReconcileDispose, 5*time.Second)
// Step 3 — bring up an adoption candidate for an unseen game id
// by hand. The reconciler must label-match it, find no record,
// and insert one with status=running.
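// A rough sketch of the adopt branch; records.Get, records.InsertRunning
// and isNotFound are hypothetical store helpers, while the labels are
// the ones written by runManualEngineContainer below:
//
//	for _, c := range ownedContainers { // listed by com.galaxy.owner=rtmanager
//	    gameID := c.Labels["com.galaxy.game_id"]
//	    if _, err := records.Get(ctx, gameID); isNotFound(err) {
//	        records.InsertRunning(ctx, gameID, c.ID, c.Labels["com.galaxy.engine_image_ref"])
//	    }
//	}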
adoptionGameID := harness.IDFromTestName(t) + "-a"
manualContainerID := runManualEngineContainer(t, env, adoptionGameID)
t.Logf("manual container id=%s", manualContainerID)
adopted := harness.EventuallyRuntimeRecord(t, env, adoptionGameID,
func(r runtime.RuntimeRecord) bool {
return r.Status == runtime.StatusRunning && r.CurrentContainerID == manualContainerID
},
20*time.Second,
)
assert.Equal(t, env.EngineImageRef, adopted.CurrentImageRef)
adoptEntry := harness.EventuallyOperationKind(t, env, adoptionGameID, operation.OpKindReconcileAdopt, 5*time.Second)
assert.Equal(t, operation.OutcomeSuccess, adoptEntry.Outcome)
assert.Equal(t, operation.OpSourceAutoReconcile, adoptEntry.OpSource)
assert.Equal(t, manualContainerID, adoptEntry.ContainerID)
}
// TestNotification_ImagePullFailed drives Runtime Manager with a
// start envelope pointing at an unresolvable image reference. The
// start service must surface the failure on `runtime:job_results` and
// publish a `runtime.image_pull_failed` admin notification on
// `notification:intents`.
func TestNotification_ImagePullFailed(t *testing.T) {
env := harness.NewEnv(t, harness.EnvOptions{})
gameID := harness.IDFromTestName(t)
const missingImage = "galaxy/integration-missing:0.0.0"
harness.XAddStartJob(t, env, gameID, missingImage)
// The job_result surfaces the failure with the stable
// image_pull_failed error code.
jobResult := harness.WaitForJobResult(t, env,
harness.JobOutcomeIs(gameID, ports.JobOutcomeFailure),
60*time.Second,
)
assert.Equal(t, startruntime.ErrorCodeImagePullFailed, jobResult.ErrorCode)
assert.Empty(t, jobResult.ContainerID, "failure must not surface a container id")
assert.Empty(t, jobResult.EngineEndpoint, "failure must not surface an engine endpoint")
assert.NotEmpty(t, jobResult.ErrorMessage, "failure must carry an operator-readable message")
// Notification stream carries the matching admin-only intent.
intent := harness.WaitForNotificationIntent(t, env,
func(entry harness.NotificationIntentEntry) bool {
if entry.NotificationType != string(notificationintent.NotificationTypeRuntimeImagePullFailed) {
return false
}
payloadGameID, _ := entry.Payload["game_id"].(string)
return payloadGameID == gameID
},
30*time.Second,
)
require.NotNil(t, intent.Payload, "notification intent must carry a payload")
assert.Equal(t, gameID, intent.Payload["game_id"])
assert.Equal(t, missingImage, intent.Payload["image_ref"])
assert.Equal(t, startruntime.ErrorCodeImagePullFailed, intent.Payload["error_code"])
// PG state: no running record was installed; operation_log
// captures one failed start with the stable error code.
record, err := harness.RuntimeRecord(t, env, gameID)
if err == nil {
// If an entry was upserted (rollback gap), it must not be
// running.
assert.NotEqual(t, runtime.StatusRunning, record.Status,
"failed image pull must not leave a running record behind")
}
failureEntry := harness.EventuallyOperationKind(t, env, gameID, operation.OpKindStart, 5*time.Second)
assert.Equal(t, operation.OutcomeFailure, failureEntry.Outcome)
assert.Equal(t, startruntime.ErrorCodeImagePullFailed, failureEntry.ErrorCode)
}
// removeContainer removes the container behind RTM's back.
// Force=true is required because the container is still running: the
// engine never received a SIGTERM, and stop-signal handling is
// engine-internal.
func removeContainer(t *testing.T, env *harness.Env, containerID string) {
t.Helper()
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
require.NoError(t, env.Docker.Client().ContainerRemove(ctx, containerID, dockercontainer.RemoveOptions{Force: true}))
}
// runManualEngineContainer bypasses RTM and starts an engine container
// directly through the Docker SDK. The container carries every label
// the reconciler reads at adopt time (`com.galaxy.owner`,
// `com.galaxy.kind`, `com.galaxy.game_id`, `com.galaxy.engine_image_ref`,
// `com.galaxy.started_at_ms`) plus the per-game hostname so the
// computed `engine_endpoint` matches what `rtmanager` would have
// written.
func runManualEngineContainer(t *testing.T, env *harness.Env, gameID string) string {
t.Helper()
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
defer cancel()
hostname := "galaxy-game-" + gameID
cfg := &dockercontainer.Config{
Image: env.EngineImageRef,
Hostname: hostname,
Labels: map[string]string{
"com.galaxy.owner": "rtmanager",
"com.galaxy.kind": "game-engine",
"com.galaxy.game_id": gameID,
"com.galaxy.engine_image_ref": env.EngineImageRef,
"com.galaxy.started_at_ms": strconv.FormatInt(time.Now().UnixMilli(), 10),
},
Env: []string{
"GAME_STATE_PATH=/var/lib/galaxy-game",
"STORAGE_PATH=/var/lib/galaxy-game",
},
}
hostCfg := &dockercontainer.HostConfig{}
netCfg := &network.NetworkingConfig{
EndpointsConfig: map[string]*network.EndpointSettings{
env.Network: {Aliases: []string{hostname}},
},
}
containerName := fmt.Sprintf("galaxy-game-%s-manual", gameID)
created, err := env.Docker.Client().ContainerCreate(ctx, cfg, hostCfg, netCfg, nil, containerName)
require.NoError(t, err)
t.Cleanup(func() {
removeCtx, removeCancel := context.WithTimeout(context.Background(), 30*time.Second)
defer removeCancel()
_ = env.Docker.Client().ContainerRemove(removeCtx, created.ID, dockercontainer.RemoveOptions{Force: true})
})
require.NoError(t, env.Docker.Client().ContainerStart(ctx, created.ID, dockercontainer.StartOptions{}))
return created.ID
}