feat: runtime manager
@@ -0,0 +1,236 @@
package harness

import (
	"context"
	"crypto/rand"
	"encoding/hex"
	"errors"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"runtime"
	"strings"
	"sync"
	"testing"
	"time"

	cerrdefs "github.com/containerd/errdefs"
	"github.com/docker/docker/api/types/network"
	dockerclient "github.com/docker/docker/client"
)

// Engine image tags used by the integration suite. `EngineImageRef` is
// the image we actually build from `galaxy/game/Dockerfile`;
// `PatchedEngineImageRef` is the same image content tagged at a higher
// semver patch so the patch lifecycle test exercises the
// `semver_patch_only` validation against a real image. Keeping both at
// the same digest avoids a redundant build.
const (
	EngineImageRef        = "galaxy/game:1.0.0-rtm-it"
	PatchedEngineImageRef = "galaxy/game:1.0.1-rtm-it"

	dockerNetworkPrefix = "rtmanager-it-"

	dockerPingTimeout    = 5 * time.Second
	dockerNetworkTimeout = 30 * time.Second
	imageBuildTimeout    = 10 * time.Minute
)

// DockerEnv carries the per-package Docker client plus the workspace
// root used by image builds. The client is opened lazily on the first
// EnsureDocker call and closed by ShutdownDocker at TestMain exit.
type DockerEnv struct {
	client        *dockerclient.Client
	workspaceRoot string
}

// Client returns the harness-owned Docker SDK client. Tests use it
// directly for "external actions" the harness does not wrap (e.g.,
// removing a running container behind RTM's back in `health_test`).
func (env *DockerEnv) Client() *dockerclient.Client { return env.client }

// WorkspaceRoot returns the absolute path of the galaxy/ workspace
// root. It is exported so the runtime helper can resolve the host
// game-state root relative to it if a test needs a deterministic
// location, though the default places state under `t.ArtifactDir()`.
func (env *DockerEnv) WorkspaceRoot() string { return env.workspaceRoot }

var (
	dockerOnce sync.Once
	dockerEnv  *DockerEnv
	dockerErr  error

	imageOnce sync.Once
	imageErr  error
)

// EnsureDocker opens the shared Docker SDK client and verifies the
// daemon is reachable. When the daemon is unavailable the helper calls
// `t.Skip` so suites stay green on hosts without `/var/run/docker.sock`
// or `DOCKER_HOST`.
func EnsureDocker(t testing.TB) *DockerEnv {
	t.Helper()
	dockerOnce.Do(func() {
		dockerEnv, dockerErr = openDocker()
	})
	if dockerErr != nil {
		t.Skipf("rtmanager integration: docker daemon unavailable: %v", dockerErr)
	}
	return dockerEnv
}

// EnsureEngineImage builds the `galaxy/game` engine image from the
// workspace root once per package run via `sync.Once`, then tags the
// resulting image at both `EngineImageRef` and `PatchedEngineImageRef`
// so the patch lifecycle has a second semver-valid tag to point at.
// Subsequent calls re-use the cached image. Any test that asks for the
// engine image must invoke this helper first; it is intentionally
// separate from `EnsureDocker` so suites that only need the daemon
// (e.g., a future "Docker network missing" negative test) do not pay
// the build cost.
func EnsureEngineImage(t testing.TB) string {
	t.Helper()
	env := EnsureDocker(t)
	imageOnce.Do(func() {
		imageErr = buildAndTagEngineImage(env)
	})
	if imageErr != nil {
		t.Skipf("rtmanager integration: build galaxy/game image: %v", imageErr)
	}
	return EngineImageRef
}

// EnsureNetwork creates a uniquely-named Docker bridge network for the
// caller's test and registers cleanup. Each test gets its own network
// so concurrent scenarios cannot collide on the per-game DNS hostname.
func EnsureNetwork(t testing.TB) string {
	t.Helper()
	env := EnsureDocker(t)
	name := dockerNetworkPrefix + uniqueSuffix(t)

	createCtx, cancel := context.WithTimeout(context.Background(), dockerNetworkTimeout)
	defer cancel()
	if _, err := env.client.NetworkCreate(createCtx, name, network.CreateOptions{Driver: "bridge"}); err != nil {
		t.Fatalf("rtmanager integration: create docker network %q: %v", name, err)
	}
	t.Cleanup(func() {
		removeCtx, removeCancel := context.WithTimeout(context.Background(), dockerNetworkTimeout)
		defer removeCancel()
		if err := env.client.NetworkRemove(removeCtx, name); err != nil && !cerrdefs.IsNotFound(err) {
			t.Logf("rtmanager integration: remove docker network %q: %v", name, err)
		}
	})
	return name
}

// ShutdownDocker closes the shared Docker SDK client. `TestMain`
// invokes it after `m.Run`. The harness deliberately leaves the engine
// image in the local Docker cache so the next package run benefits
// from the layer cache; operators can `docker image rm` the
// `*-rtm-it` tags by hand if a stale image gets in the way.
func ShutdownDocker() {
	if dockerEnv == nil {
		return
	}
	if dockerEnv.client != nil {
		_ = dockerEnv.client.Close()
	}
	dockerEnv = nil
}

// uniqueSuffix returns 8 hex characters of randomness suitable for a
// per-test resource name. The same helper is used in
// `internal/adapters/docker/smoke_test.go`; we duplicate it instead of
// importing because `_test.go`-only helpers cannot be exported.
func uniqueSuffix(t testing.TB) string {
	t.Helper()
	buf := make([]byte, 4)
	if _, err := rand.Read(buf); err != nil {
		t.Fatalf("rtmanager integration: read random suffix: %v", err)
	}
	return hex.EncodeToString(buf)
}

func openDocker() (*DockerEnv, error) {
	if os.Getenv("DOCKER_HOST") == "" {
		if _, err := os.Stat("/var/run/docker.sock"); err != nil {
			return nil, fmt.Errorf("set DOCKER_HOST or expose /var/run/docker.sock: %w", err)
		}
	}

	client, err := dockerclient.NewClientWithOpts(
		dockerclient.FromEnv,
		dockerclient.WithAPIVersionNegotiation(),
	)
	if err != nil {
		return nil, fmt.Errorf("new docker client: %w", err)
	}

	pingCtx, cancel := context.WithTimeout(context.Background(), dockerPingTimeout)
	defer cancel()
	if _, err := client.Ping(pingCtx); err != nil {
		_ = client.Close()
		return nil, fmt.Errorf("ping docker daemon: %w", err)
	}

	root, err := workspaceRoot()
	if err != nil {
		_ = client.Close()
		return nil, fmt.Errorf("resolve workspace root: %w", err)
	}

	return &DockerEnv{
		client:        client,
		workspaceRoot: root,
	}, nil
}

// buildAndTagEngineImage invokes `docker build` against the workspace
// root context to materialise the `galaxy/game` image, then tags the
// resulting image at the patch tag. Shelling out to the CLI keeps the
// implementation tiny — using the SDK would require streaming a tar
// of the workspace root, which is heavy and duplicates what the CLI
// already optimises. The workspace-root build context is required by
// `galaxy/game` (see `galaxy/game/README.md` §Build).
func buildAndTagEngineImage(env *DockerEnv) error {
	if env == nil {
		return errors.New("nil docker env")
	}
	ctx, cancel := context.WithTimeout(context.Background(), imageBuildTimeout)
	defer cancel()

	dockerfilePath := filepath.Join("game", "Dockerfile")
	cmd := exec.CommandContext(ctx, "docker", "build",
		"-f", dockerfilePath,
		"-t", EngineImageRef,
		".",
	)
	cmd.Dir = env.workspaceRoot
	cmd.Env = append(os.Environ(), "DOCKER_BUILDKIT=1")
	output, err := cmd.CombinedOutput()
	if err != nil {
		return fmt.Errorf("docker build (-f %s) in %s: %w; output:\n%s",
			dockerfilePath, env.workspaceRoot, err, strings.TrimSpace(string(output)))
	}

	if err := env.client.ImageTag(ctx, EngineImageRef, PatchedEngineImageRef); err != nil {
		return fmt.Errorf("tag %s as %s: %w", EngineImageRef, PatchedEngineImageRef, err)
	}
	return nil
}

// workspaceRoot resolves the absolute path of the galaxy/ workspace
// root by anchoring on this file's location. The harness lives at
// `galaxy/rtmanager/integration/harness/docker.go`, so the workspace
// root is three directories up. Mirrors the `cmd/jetgen` strategy.
func workspaceRoot() (string, error) {
	_, file, _, ok := runtime.Caller(0)
	if !ok {
		return "", errors.New("resolve runtime caller for workspace root")
	}
	dir := filepath.Dir(file)
	// dir = .../galaxy/rtmanager/integration/harness
	root := filepath.Clean(filepath.Join(dir, "..", "..", ".."))
	return root, nil
}
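
// Illustrative sketch (not part of the harness): how a test that only
// needs the Docker-level fixtures would wire them. The test name and
// the discarded variables are placeholders.
//
//	func TestNetworkIsolation(t *testing.T) {
//		harness.EnsureDocker(t)                  // skips when no daemon is reachable
//		imageRef := harness.EnsureEngineImage(t) // built once per package run
//		network := harness.EnsureNetwork(t)      // removed again via t.Cleanup
//		_, _ = imageRef, network
//	}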
@@ -0,0 +1,59 @@
package harness

import (
	"encoding/json"
	"net/http"
	"net/http/httptest"
	"strings"
	"testing"
)

// LobbyStub answers the single Lobby internal request the start
// service performs ([`internal/adapters/lobbyclient`]). The start
// service treats this response as ancillary diagnostics — the start
// envelope already carries `image_ref` — so the stub returns a
// deterministic 200 OK and lets the runtime ignore the payload.
//
// The stub only validates that the runtime configuration treats the
// Lobby URL as required (so the ancillary fetch cannot quietly regress
// into being skipped); the response body itself is unused by the
// integration assertions.
type LobbyStub struct {
	Server *httptest.Server
}

// NewLobbyStub returns a started httptest.Server and registers its
// shutdown via `t.Cleanup`, so the stub follows the same lifecycle as
// the rest of the per-test wiring.
func NewLobbyStub(t testing.TB) *LobbyStub {
	t.Helper()
	mux := http.NewServeMux()
	mux.HandleFunc("GET /api/v1/internal/games/{game_id}", func(w http.ResponseWriter, r *http.Request) {
		gameID := strings.TrimSpace(r.PathValue("game_id"))
		if gameID == "" {
			writeStubError(w, http.StatusBadRequest, "invalid_request", "game_id is required")
			return
		}
		w.Header().Set("Content-Type", "application/json; charset=utf-8")
		w.WriteHeader(http.StatusOK)
		_ = json.NewEncoder(w).Encode(map[string]string{
			"game_id":               gameID,
			"status":                "running",
			"target_engine_version": "1.0.0",
		})
	})
	server := httptest.NewServer(mux)
	t.Cleanup(server.Close)
	return &LobbyStub{Server: server}
}

// URL returns the base URL of the running stub.
func (stub *LobbyStub) URL() string { return stub.Server.URL }

func writeStubError(w http.ResponseWriter, status int, code, message string) {
	w.Header().Set("Content-Type", "application/json; charset=utf-8")
	w.WriteHeader(status)
	_ = json.NewEncoder(w).Encode(map[string]any{
		"error": map[string]string{"code": code, "message": message},
	})
}
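
// Illustrative sketch (not part of the harness): the exchange the stub
// answers for a well-formed lookup. The game id is a placeholder.
//
//	GET {stub.URL()}/api/v1/internal/games/g-123
//	200 {"game_id":"g-123","status":"running","target_engine_version":"1.0.0"}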
@@ -0,0 +1,224 @@
// Package harness exposes the testcontainers / Docker / image-build
// scaffolding shared by the Runtime Manager service-local integration
// suite under [`galaxy/rtmanager/integration`](..).
//
// Only `_test.go` files (and the harness itself) reference this
// package; production code paths in `cmd/rtmanager` never import it.
// The package therefore stays out of the production binary's import
// graph, identical to the in-package `pgtest` and `integration/internal/harness`
// patterns it mirrors.
package harness

import (
	"context"
	"database/sql"
	"net/url"
	"os"
	"sync"
	"testing"
	"time"

	"galaxy/postgres"
	"galaxy/rtmanager/internal/adapters/postgres/migrations"

	testcontainers "github.com/testcontainers/testcontainers-go"
	tcpostgres "github.com/testcontainers/testcontainers-go/modules/postgres"
	"github.com/testcontainers/testcontainers-go/wait"
)

const (
	pgImage           = "postgres:16-alpine"
	pgSuperUser       = "galaxy"
	pgSuperPassword   = "galaxy"
	pgSuperDatabase   = "galaxy_rtmanager_it"
	pgServiceRole     = "rtmanagerservice"
	pgServicePassword = "rtmanagerservice"
	pgServiceSchema   = "rtmanager"
	pgStartupTimeout  = 90 * time.Second

	// pgOperationTimeout bounds the per-statement deadline used by every
	// pool the harness opens. Short enough to surface a runaway
	// integration test promptly, long enough to absorb laptop-grade I/O.
	pgOperationTimeout = 10 * time.Second
)

// PostgresEnv carries the per-package PostgreSQL fixture. The container
// is started lazily on the first EnsurePostgres call and torn down by
// ShutdownPostgres at TestMain exit.
type PostgresEnv struct {
	container *tcpostgres.PostgresContainer
	pool      *sql.DB
	scopedDSN string
}

// Pool returns the harness-owned `*sql.DB` scoped to the rtmanager
// schema. Tests use it to read durable state directly through the
// existing store adapters.
func (env *PostgresEnv) Pool() *sql.DB { return env.pool }

// DSN returns the rtmanager-role-scoped DSN suitable for
// `RTMANAGER_POSTGRES_PRIMARY_DSN`. Both this DSN and Pool address the
// same database; the pool is reused across tests, while the runtime
// under test opens its own pool through this DSN.
func (env *PostgresEnv) DSN() string { return env.scopedDSN }

var (
	pgOnce sync.Once
	pgEnv  *PostgresEnv
	pgErr  error
)

// EnsurePostgres starts the per-package PostgreSQL container on first
// invocation and applies the embedded goose migrations. Subsequent
// invocations reuse the same container. When Docker is unavailable the
// helper calls `t.Skip` so the suite stays green on hosts without a
// daemon (mirrors the contract from `internal/adapters/postgres/internal/pgtest`).
func EnsurePostgres(t testing.TB) *PostgresEnv {
	t.Helper()
	pgOnce.Do(func() {
		pgEnv, pgErr = startPostgres()
	})
	if pgErr != nil {
		t.Skipf("rtmanager integration: postgres container start failed (Docker unavailable?): %v", pgErr)
	}
	return pgEnv
}

// TruncatePostgres wipes every Runtime Manager table inside the shared
// pool, leaving the schema and indexes intact. Tests call this from
// their setup so each scenario starts from an empty state.
func TruncatePostgres(t testing.TB) {
	t.Helper()
	env := EnsurePostgres(t)
	const stmt = `TRUNCATE TABLE runtime_records, operation_log, health_snapshots RESTART IDENTITY CASCADE`
	if _, err := env.pool.ExecContext(context.Background(), stmt); err != nil {
		t.Fatalf("truncate rtmanager tables: %v", err)
	}
}

// ShutdownPostgres terminates the shared container and closes the pool.
// `TestMain` invokes it after `m.Run` so the container is released even
// if individual tests panic.
func ShutdownPostgres() {
	if pgEnv == nil {
		return
	}
	if pgEnv.pool != nil {
		_ = pgEnv.pool.Close()
	}
	if pgEnv.container != nil {
		_ = testcontainers.TerminateContainer(pgEnv.container)
	}
	pgEnv = nil
}

// RunMain is a convenience helper for the integration package
// `TestMain`: it runs the suite, captures the exit code, tears every
// shared container down, and exits. Wiring it through one helper keeps
// `TestMain` to two lines and centralises ordering.
func RunMain(m *testing.M) {
	code := m.Run()
	ShutdownRedis()
	ShutdownPostgres()
	ShutdownDocker()
	os.Exit(code)
}
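
// Illustrative sketch (not part of the harness): the two-line TestMain
// the comment above refers to, as it would appear in the integration
// test package (package and import path assumed).
//
//	func TestMain(m *testing.M) {
//		harness.RunMain(m)
//	}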

func startPostgres() (*PostgresEnv, error) {
	ctx := context.Background()
	container, err := tcpostgres.Run(ctx, pgImage,
		tcpostgres.WithDatabase(pgSuperDatabase),
		tcpostgres.WithUsername(pgSuperUser),
		tcpostgres.WithPassword(pgSuperPassword),
		testcontainers.WithWaitStrategy(
			wait.ForLog("database system is ready to accept connections").
				WithOccurrence(2).
				WithStartupTimeout(pgStartupTimeout),
		),
	)
	if err != nil {
		return nil, err
	}
	baseDSN, err := container.ConnectionString(ctx, "sslmode=disable")
	if err != nil {
		_ = testcontainers.TerminateContainer(container)
		return nil, err
	}
	if err := provisionRoleAndSchema(ctx, baseDSN); err != nil {
		_ = testcontainers.TerminateContainer(container)
		return nil, err
	}
	scopedDSN, err := scopedDSNForRole(baseDSN)
	if err != nil {
		_ = testcontainers.TerminateContainer(container)
		return nil, err
	}
	cfg := postgres.DefaultConfig()
	cfg.PrimaryDSN = scopedDSN
	cfg.OperationTimeout = pgOperationTimeout
	pool, err := postgres.OpenPrimary(ctx, cfg)
	if err != nil {
		_ = testcontainers.TerminateContainer(container)
		return nil, err
	}
	if err := postgres.Ping(ctx, pool, pgOperationTimeout); err != nil {
		_ = pool.Close()
		_ = testcontainers.TerminateContainer(container)
		return nil, err
	}
	if err := postgres.RunMigrations(ctx, pool, migrations.FS(), "."); err != nil {
		_ = pool.Close()
		_ = testcontainers.TerminateContainer(container)
		return nil, err
	}
	return &PostgresEnv{
		container: container,
		pool:      pool,
		scopedDSN: scopedDSN,
	}, nil
}

func provisionRoleAndSchema(ctx context.Context, baseDSN string) error {
	cfg := postgres.DefaultConfig()
	cfg.PrimaryDSN = baseDSN
	cfg.OperationTimeout = pgOperationTimeout
	db, err := postgres.OpenPrimary(ctx, cfg)
	if err != nil {
		return err
	}
	defer func() { _ = db.Close() }()

	statements := []string{
		`DO $$ BEGIN
			IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'rtmanagerservice') THEN
				CREATE ROLE rtmanagerservice LOGIN PASSWORD 'rtmanagerservice';
			END IF;
		END $$;`,
		`CREATE SCHEMA IF NOT EXISTS rtmanager AUTHORIZATION rtmanagerservice;`,
		`GRANT USAGE ON SCHEMA rtmanager TO rtmanagerservice;`,
	}
	for _, statement := range statements {
		if _, err := db.ExecContext(ctx, statement); err != nil {
			return err
		}
	}
	return nil
}

func scopedDSNForRole(baseDSN string) (string, error) {
	parsed, err := url.Parse(baseDSN)
	if err != nil {
		return "", err
	}
	values := url.Values{}
	values.Set("search_path", pgServiceSchema)
	values.Set("sslmode", "disable")
	scoped := url.URL{
		Scheme:   parsed.Scheme,
		User:     url.UserPassword(pgServiceRole, pgServicePassword),
		Host:     parsed.Host,
		Path:     parsed.Path,
		RawQuery: values.Encode(),
	}
	return scoped.String(), nil
}
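
// Illustrative sketch (not part of the harness): the shape of the DSN
// scopedDSNForRole produces. Host and port come from the container at
// runtime; the 127.0.0.1:55432 value below is a placeholder.
//
//	postgres://rtmanagerservice:rtmanagerservice@127.0.0.1:55432/galaxy_rtmanager_it?search_path=rtmanager&sslmode=disable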
@@ -0,0 +1,102 @@
package harness

import (
	"context"
	"sync"
	"testing"

	"github.com/redis/go-redis/v9"
	testcontainers "github.com/testcontainers/testcontainers-go"
	rediscontainer "github.com/testcontainers/testcontainers-go/modules/redis"
)

const redisImage = "redis:7"

// RedisEnv carries the per-package Redis fixture. The container is
// started lazily on the first EnsureRedis call and torn down by
// ShutdownRedis at TestMain exit. Both stream consumers and the
// per-game lease store hit this real Redis (miniredis would suffice
// for streams alone, but the lease semantics and eviction-by-TTL we
// rely on in `health_test` are easier to verify against a real
// daemon).
type RedisEnv struct {
	container *rediscontainer.RedisContainer
	addr      string
}

// Addr returns the externally reachable host:port of the Redis
// container. Both the runtime under test and the harness-owned client
// connect through the same endpoint.
func (env *RedisEnv) Addr() string { return env.addr }

// NewClient opens a fresh `*redis.Client` against the harness Redis.
// Tests close their client through `t.Cleanup`; the harness keeps no
// shared client to avoid cross-test connection-pool surprises.
func (env *RedisEnv) NewClient(t testing.TB) *redis.Client {
	t.Helper()
	client := redis.NewClient(&redis.Options{Addr: env.addr})
	t.Cleanup(func() { _ = client.Close() })
	return client
}

var (
	redisOnce sync.Once
	redisEnv  *RedisEnv
	redisErr  error
)

// EnsureRedis starts the per-package Redis container on first
// invocation and returns it. When Docker is unavailable the helper
// calls `t.Skip` so the suite stays green on hosts without a daemon.
func EnsureRedis(t testing.TB) *RedisEnv {
	t.Helper()
	redisOnce.Do(func() {
		redisEnv, redisErr = startRedis()
	})
	if redisErr != nil {
		t.Skipf("rtmanager integration: redis container start failed (Docker unavailable?): %v", redisErr)
	}
	return redisEnv
}

// FlushRedis drops every key on the harness Redis. Tests call it from
// their setup so streams, offset records, and leases from previous
// scenarios do not leak.
func FlushRedis(t testing.TB) {
	t.Helper()
	env := EnsureRedis(t)
	client := redis.NewClient(&redis.Options{Addr: env.addr})
	defer func() { _ = client.Close() }()
	if _, err := client.FlushAll(context.Background()).Result(); err != nil {
		t.Fatalf("flush rtmanager redis: %v", err)
	}
}

// ShutdownRedis terminates the shared container. `TestMain` invokes it
// after `m.Run`.
func ShutdownRedis() {
	if redisEnv == nil {
		return
	}
	if redisEnv.container != nil {
		_ = testcontainers.TerminateContainer(redisEnv.container)
	}
	redisEnv = nil
}

func startRedis() (*RedisEnv, error) {
	ctx := context.Background()
	container, err := rediscontainer.Run(ctx, redisImage)
	if err != nil {
		return nil, err
	}
	addr, err := container.Endpoint(ctx, "")
	if err != nil {
		_ = testcontainers.TerminateContainer(container)
		return nil, err
	}
	return &RedisEnv{
		container: container,
		addr:      addr,
	}, nil
}
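
// Illustrative sketch (not part of the harness): per-test Redis wiring.
// FlushRedis clears leftover streams and leases before the scenario
// publishes anything; the discarded client is a placeholder.
//
//	rd := harness.EnsureRedis(t)
//	harness.FlushRedis(t)
//	client := rd.NewClient(t) // closed via t.Cleanup
//	_ = client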
@@ -0,0 +1,195 @@
package harness

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"strings"
	"testing"
	"time"
)

// defaultHTTPClient backs the runtime-readiness poll and the REST
// helpers below. A short timeout is enough — every internal endpoint
// runs against an in-process listener.
var defaultHTTPClient = &http.Client{Timeout: 5 * time.Second}

// newRequest is a thin shim over `http.NewRequestWithContext` so the
// readiness poll and the REST client share one constructor.
func newRequest(ctx context.Context, method, fullURL string, body io.Reader) (*http.Request, error) {
	req, err := http.NewRequestWithContext(ctx, method, fullURL, body)
	if err != nil {
		return nil, err
	}
	if body != nil {
		req.Header.Set("Content-Type", "application/json; charset=utf-8")
	}
	req.Header.Set("Accept", "application/json")
	req.Header.Set("X-Galaxy-Caller", "admin")
	return req, nil
}

// REST is a tiny client for the trusted internal HTTP surface RTM
// exposes to Game Master and Admin Service. It always identifies the
// caller as `admin` (the operation_log records `admin_rest`); tests
// that need GM semantics should add an option later. v1 keeps the
// helper minimal because the integration scenarios only need
// admin-driven flows.
type REST struct {
	baseURL string
	httpc   *http.Client
}

// NewREST builds a REST client targeting env.InternalAddr.
func NewREST(env *Env) *REST {
	return &REST{
		baseURL: "http://" + env.InternalAddr,
		httpc:   defaultHTTPClient,
	}
}

// Get issues GET path and returns the response body and status code.
func (r *REST) Get(t testing.TB, path string) ([]byte, int) {
	t.Helper()
	return r.do(t, http.MethodGet, path, nil)
}

// Post issues POST path with body (a Go value JSON-marshaled).
func (r *REST) Post(t testing.TB, path string, body any) ([]byte, int) {
	t.Helper()
	return r.do(t, http.MethodPost, path, body)
}

// Delete issues DELETE path with no body.
func (r *REST) Delete(t testing.TB, path string) ([]byte, int) {
	t.Helper()
	return r.do(t, http.MethodDelete, path, nil)
}

// GetRuntime fetches a runtime record by game id and returns the
// decoded payload and the status code.
func (r *REST) GetRuntime(t testing.TB, gameID string) (RuntimeRecordResponse, int) {
	t.Helper()
	body, status := r.Get(t, fmt.Sprintf("/api/v1/internal/runtimes/%s", url.PathEscape(gameID)))
	var resp RuntimeRecordResponse
	if status == http.StatusOK {
		if err := json.Unmarshal(body, &resp); err != nil {
			t.Fatalf("decode get-runtime response: %v; body=%s", err, string(body))
		}
	}
	return resp, status
}

// StartRuntime invokes the start endpoint with imageRef.
func (r *REST) StartRuntime(t testing.TB, gameID, imageRef string) (RuntimeRecordResponse, int) {
	t.Helper()
	body, status := r.Post(t,
		fmt.Sprintf("/api/v1/internal/runtimes/%s/start", url.PathEscape(gameID)),
		map[string]string{"image_ref": imageRef},
	)
	return decodeRecord(t, body, status, "start")
}

// StopRuntime invokes the stop endpoint with reason.
func (r *REST) StopRuntime(t testing.TB, gameID, reason string) (RuntimeRecordResponse, int) {
	t.Helper()
	body, status := r.Post(t,
		fmt.Sprintf("/api/v1/internal/runtimes/%s/stop", url.PathEscape(gameID)),
		map[string]string{"reason": reason},
	)
	return decodeRecord(t, body, status, "stop")
}

// RestartRuntime invokes the restart endpoint.
func (r *REST) RestartRuntime(t testing.TB, gameID string) (RuntimeRecordResponse, int) {
	t.Helper()
	body, status := r.Post(t,
		fmt.Sprintf("/api/v1/internal/runtimes/%s/restart", url.PathEscape(gameID)),
		struct{}{},
	)
	return decodeRecord(t, body, status, "restart")
}

// PatchRuntime invokes the patch endpoint with imageRef.
func (r *REST) PatchRuntime(t testing.TB, gameID, imageRef string) (RuntimeRecordResponse, int) {
	t.Helper()
	body, status := r.Post(t,
		fmt.Sprintf("/api/v1/internal/runtimes/%s/patch", url.PathEscape(gameID)),
		map[string]string{"image_ref": imageRef},
	)
	return decodeRecord(t, body, status, "patch")
}

// CleanupRuntime invokes the DELETE container endpoint.
func (r *REST) CleanupRuntime(t testing.TB, gameID string) (RuntimeRecordResponse, int) {
	t.Helper()
	body, status := r.Delete(t,
		fmt.Sprintf("/api/v1/internal/runtimes/%s/container", url.PathEscape(gameID)),
	)
	return decodeRecord(t, body, status, "cleanup")
}

// RuntimeRecordResponse mirrors the OpenAPI RuntimeRecord schema. Only
// the fields integration scenarios assert against live here; the
// listener encodes everything else.
type RuntimeRecordResponse struct {
	GameID             string  `json:"game_id"`
	Status             string  `json:"status"`
	CurrentContainerID *string `json:"current_container_id"`
	CurrentImageRef    *string `json:"current_image_ref"`
	EngineEndpoint     *string `json:"engine_endpoint"`
	StatePath          string  `json:"state_path"`
	DockerNetwork      string  `json:"docker_network"`
	StartedAt          *string `json:"started_at"`
	StoppedAt          *string `json:"stopped_at"`
	RemovedAt          *string `json:"removed_at"`
	LastOpAt           string  `json:"last_op_at"`
	CreatedAt          string  `json:"created_at"`
}

func (r *REST) do(t testing.TB, method, path string, body any) ([]byte, int) {
	t.Helper()
	var reader io.Reader
	if body != nil {
		raw, err := json.Marshal(body)
		if err != nil {
			t.Fatalf("marshal request body: %v", err)
		}
		reader = bytes.NewReader(raw)
	}
	req, err := newRequest(context.Background(), method, r.baseURL+path, reader)
	if err != nil {
		t.Fatalf("build %s %s request: %v", method, path, err)
	}
	resp, err := r.httpc.Do(req)
	if err != nil {
		t.Fatalf("execute %s %s: %v", method, path, err)
	}
	defer resp.Body.Close()
	raw, err := io.ReadAll(resp.Body)
	if err != nil {
		t.Fatalf("read %s %s response: %v", method, path, err)
	}
	return raw, resp.StatusCode
}

func decodeRecord(t testing.TB, body []byte, status int, op string) (RuntimeRecordResponse, int) {
	t.Helper()
	if status != http.StatusOK {
		return RuntimeRecordResponse{}, status
	}
	var resp RuntimeRecordResponse
	if err := json.Unmarshal(body, &resp); err != nil {
		t.Fatalf("decode %s response: %v; body=%s", op, err, string(body))
	}
	return resp, status
}

// PathEscape is a re-export so test files can call it without
// importing `net/url` directly. Keeps the test source focused on
// scenarios.
func PathEscape(value string) string { return url.PathEscape(strings.TrimSpace(value)) }
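
// Illustrative sketch (not part of the harness): an admin-driven start
// followed by a lookup, as a scenario would issue it. The game id is a
// placeholder and the expected status is only an example assertion.
//
//	rest := harness.NewREST(env)
//	record, status := rest.StartRuntime(t, gameID, env.EngineImageRef)
//	require.Equal(t, http.StatusOK, status)
//	got, _ := rest.GetRuntime(t, gameID)
//	_, _ = record, got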
@@ -0,0 +1,398 @@
package harness

import (
	"context"
	"errors"
	"io"
	"log/slog"
	"net/url"
	"os"
	"strconv"
	"strings"
	"sync"
	"testing"
	"time"

	"galaxy/postgres"
	"galaxy/redisconn"
	"galaxy/rtmanager/internal/app"
	"galaxy/rtmanager/internal/config"

	"github.com/redis/go-redis/v9"
)

// Default stream key shapes used by the integration suite. They match
// the production defaults so the wire shapes asserted in `streams.go`
// are identical to what Game Lobby sees in `integration/lobbyrtm`.
const (
	StartJobsStream        = "runtime:start_jobs"
	StopJobsStream         = "runtime:stop_jobs"
	JobResultsStream       = "runtime:job_results"
	HealthEventsStream     = "runtime:health_events"
	NotificationIntentsKey = "notification:intents"
	gameStateRootSubdir    = "game-state"
	listenAddr             = "127.0.0.1:0"
	listenerWaitTimeout    = 10 * time.Second
	readyzPollInterval     = 25 * time.Millisecond
	cleanupShutdownTimeout = 30 * time.Second
)

// Env carries everything one integration scenario needs to drive the
// Runtime Manager process. Its fields are exported so tests reach them
// without intermediate getters.
type Env struct {
	// Cfg is the resolved Runtime Manager configuration handed to
	// `app.NewRuntime`. Tests inspect it for stream key shapes,
	// container defaults, and timeout knobs.
	Cfg config.Config

	// Runtime is the in-process Runtime Manager exposed for tests that
	// need to peek at internal state (`runtime.InternalServer().Addr()`).
	Runtime *app.Runtime

	// Postgres holds the per-package PostgreSQL fixture.
	Postgres *PostgresEnv

	// Redis holds the per-package Redis fixture plus a fresh client the
	// test owns.
	Redis       *RedisEnv
	RedisClient *redis.Client

	// Docker holds the per-package Docker daemon handle.
	Docker *DockerEnv

	// Lobby is the per-test stub HTTP server.
	Lobby *LobbyStub

	// Network is the unique Docker network name created for this test.
	Network string

	// EngineImageRef and PatchedImageRef are the two semver-compatible
	// engine image tags the harness builds once per package. Patch
	// scenarios point at the second tag.
	EngineImageRef  string
	PatchedImageRef string

	// GameStateRoot is the host filesystem path RTM writes per-game
	// state directories under. It lives inside `t.ArtifactDir()` so
	// failed scenarios leave the engine state behind for inspection.
	GameStateRoot string

	// InternalAddr is the bound address of RTM's internal HTTP listener
	// (resolved after Run binds the port).
	InternalAddr string
}

// EnvOptions carry per-test overrides to the harness defaults. Zero
// fields fall back to the defaults NewEnv applies.
type EnvOptions struct {
	// ReconcileInterval overrides the periodic reconciler interval.
	// Default 500ms (so reconcile drift is observable inside a single
	// scenario timeout).
	ReconcileInterval time.Duration

	// CleanupInterval overrides the container-cleanup interval.
	CleanupInterval time.Duration

	// InspectInterval overrides the Docker inspect worker interval.
	InspectInterval time.Duration

	// ProbeInterval / ProbeTimeout / ProbeFailuresThreshold override
	// the active engine probe knobs.
	ProbeInterval          time.Duration
	ProbeTimeout           time.Duration
	ProbeFailuresThreshold int

	// GameLeaseTTL overrides the per-game Redis lease TTL.
	GameLeaseTTL time.Duration

	// StreamBlockTimeout overrides the consumer XREAD block window.
	StreamBlockTimeout time.Duration

	// LogToStderr makes the harness write the runtime's structured
	// logs to stderr; the default discards them so test output stays
	// focused on assertions.
	LogToStderr bool
}
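
// Illustrative sketch (not part of the harness): a scenario that wants
// faster probe-failure detection overrides only the knobs it cares
// about and keeps every other default. Values are placeholders.
//
//	env := harness.NewEnv(t, harness.EnvOptions{
//		ProbeInterval:          100 * time.Millisecond,
//		ProbeFailuresThreshold: 1,
//		LogToStderr:            testing.Verbose(),
//	})
//	_ = env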

// NewEnv stands up a fresh Runtime Manager process for the calling
// test. It blocks until the internal HTTP listener is bound; tests can
// issue REST and stream requests immediately after the call returns.
//
// `t.Cleanup` runs in reverse order: stop the runtime, close the
// runtime, close the per-test redis client, remove the docker network,
// terminate the lobby stub. Containers RTM created during the test are
// removed by the test's own cleanup paths or by the integration
// `health_test` external-action helpers.
func NewEnv(t *testing.T, opts EnvOptions) *Env {
	t.Helper()

	pg := EnsurePostgres(t)
	rd := EnsureRedis(t)
	dk := EnsureDocker(t)
	imageRef := EnsureEngineImage(t)
	TruncatePostgres(t)
	FlushRedis(t)
	network := EnsureNetwork(t)
	lobby := NewLobbyStub(t)
	stateRoot := stateRoot(t)

	cfg := buildConfig(buildConfigInput{
		PostgresDSN:        pg.DSN(),
		RedisAddr:          rd.Addr(),
		DockerHost:         resolveDockerHost(),
		Network:            network,
		LobbyURL:           lobby.URL(),
		GameStateRoot:      stateRoot,
		ReconcileInterval:  pickDuration(opts.ReconcileInterval, 500*time.Millisecond),
		CleanupInterval:    pickDuration(opts.CleanupInterval, 500*time.Millisecond),
		InspectInterval:    pickDuration(opts.InspectInterval, 500*time.Millisecond),
		ProbeInterval:      pickDuration(opts.ProbeInterval, 500*time.Millisecond),
		ProbeTimeout:       pickDuration(opts.ProbeTimeout, time.Second),
		ProbeFailures:      pickInt(opts.ProbeFailuresThreshold, 2),
		GameLeaseTTL:       pickDuration(opts.GameLeaseTTL, 5*time.Second),
		StreamBlockTimeout: pickDuration(opts.StreamBlockTimeout, 200*time.Millisecond),
	})

	logger := newLogger(opts.LogToStderr)

	ctx, cancel := context.WithCancel(context.Background())

	runtime, err := app.NewRuntime(ctx, cfg, logger)
	if err != nil {
		cancel()
		t.Fatalf("rtmanager integration: new runtime: %v", err)
	}

	runDone := make(chan error, 1)
	go func() {
		runDone <- runtime.Run(ctx)
	}()

	internalAddr := waitForListener(t, runtime)
	waitForReady(t, runtime, listenerWaitTimeout)

	var cleanupOnce sync.Once
	t.Cleanup(func() {
		cleanupOnce.Do(func() {
			cancel()
			waitCtx, waitCancel := context.WithTimeout(context.Background(), cleanupShutdownTimeout)
			defer waitCancel()
			select {
			case err := <-runDone:
				if err != nil && !isCleanShutdownErr(err) {
					t.Logf("rtmanager integration: runtime.Run returned: %v", err)
				}
			case <-waitCtx.Done():
				t.Logf("rtmanager integration: runtime did not stop within %s", cleanupShutdownTimeout)
			}
			if err := runtime.Close(); err != nil {
				t.Logf("rtmanager integration: runtime.Close: %v", err)
			}
		})
	})

	return &Env{
		Cfg:             cfg,
		Runtime:         runtime,
		Postgres:        pg,
		Redis:           rd,
		RedisClient:     rd.NewClient(t),
		Docker:          dk,
		Lobby:           lobby,
		Network:         network,
		EngineImageRef:  imageRef,
		PatchedImageRef: PatchedEngineImageRef,
		GameStateRoot:   stateRoot,
		InternalAddr:    internalAddr,
	}
}

type buildConfigInput struct {
	PostgresDSN        string
	RedisAddr          string
	DockerHost         string
	Network            string
	LobbyURL           string
	GameStateRoot      string
	ReconcileInterval  time.Duration
	CleanupInterval    time.Duration
	InspectInterval    time.Duration
	ProbeInterval      time.Duration
	ProbeTimeout       time.Duration
	ProbeFailures      int
	GameLeaseTTL       time.Duration
	StreamBlockTimeout time.Duration
}

func buildConfig(in buildConfigInput) config.Config {
	cfg := config.DefaultConfig()
	cfg.InternalHTTP.Addr = listenAddr

	cfg.Docker.Host = in.DockerHost
	cfg.Docker.Network = in.Network
	cfg.Docker.PullPolicy = config.ImagePullPolicyIfMissing

	cfg.Postgres = config.PostgresConfig{
		Conn: postgres.Config{
			PrimaryDSN:       in.PostgresDSN,
			OperationTimeout: pgOperationTimeout,
			MaxOpenConns:     5,
			MaxIdleConns:     2,
			ConnMaxLifetime:  30 * time.Minute,
		},
	}

	cfg.Redis = config.RedisConfig{
		Conn: redisconn.Config{
			MasterAddr:       in.RedisAddr,
			Password:         "integration",
			OperationTimeout: 2 * time.Second,
		},
	}

	cfg.Streams.StartJobs = StartJobsStream
	cfg.Streams.StopJobs = StopJobsStream
	cfg.Streams.JobResults = JobResultsStream
	cfg.Streams.HealthEvents = HealthEventsStream
	cfg.Streams.NotificationIntents = NotificationIntentsKey
	cfg.Streams.BlockTimeout = in.StreamBlockTimeout

	cfg.Container.GameStateRoot = in.GameStateRoot
	// Pin chown target to the current process uid/gid; the dev sandbox
	// (and unprivileged dev machines) cannot chown to root.
	cfg.Container.GameStateOwnerUID = os.Getuid()
	cfg.Container.GameStateOwnerGID = os.Getgid()

	cfg.Health.InspectInterval = in.InspectInterval
	cfg.Health.ProbeInterval = in.ProbeInterval
	cfg.Health.ProbeTimeout = in.ProbeTimeout
	cfg.Health.ProbeFailuresThreshold = in.ProbeFailures

	cfg.Cleanup.ReconcileInterval = in.ReconcileInterval
	cfg.Cleanup.CleanupInterval = in.CleanupInterval

	cfg.Coordination.GameLeaseTTL = in.GameLeaseTTL

	cfg.Lobby = config.LobbyConfig{
		BaseURL: in.LobbyURL,
		Timeout: 2 * time.Second,
	}

	cfg.Telemetry.TracesExporter = "none"
	cfg.Telemetry.MetricsExporter = "none"

	return cfg
}

func newLogger(toStderr bool) *slog.Logger {
	if toStderr {
		return slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelDebug}))
	}
	return slog.New(slog.NewTextHandler(io.Discard, &slog.HandlerOptions{Level: slog.LevelError}))
}

func stateRoot(t *testing.T) string {
	t.Helper()
	dir := t.ArtifactDir()
	root := dir + string(os.PathSeparator) + gameStateRootSubdir
	if err := os.MkdirAll(root, 0o755); err != nil {
		t.Fatalf("rtmanager integration: create game-state root %q: %v", root, err)
	}
	return root
}

func resolveDockerHost() string {
	if host := strings.TrimSpace(os.Getenv("DOCKER_HOST")); host != "" {
		return host
	}
	return "unix:///var/run/docker.sock"
}

func pickDuration(value, fallback time.Duration) time.Duration {
	if value > 0 {
		return value
	}
	return fallback
}

func pickInt(value, fallback int) int {
	if value > 0 {
		return value
	}
	return fallback
}

// waitForListener spins until `runtime.InternalServer().Addr()` returns
// a non-empty value or the deadline fires. The internal listener binds
// during `runtime.Run`, which runs in its own goroutine; this helper
// is the bridge between "Run started" and "tests can use REST".
func waitForListener(t *testing.T, runtime *app.Runtime) string {
	t.Helper()
	deadline := time.Now().Add(listenerWaitTimeout)
	for {
		if runtime != nil && runtime.InternalServer() != nil {
			if addr := runtime.InternalServer().Addr(); addr != "" {
				return addr
			}
		}
		if time.Now().After(deadline) {
			t.Fatalf("rtmanager integration: internal HTTP listener did not bind within %s", listenerWaitTimeout)
		}
		time.Sleep(readyzPollInterval)
	}
}

// waitForReady polls `/readyz` until it returns 200 or the deadline
// fires. RTM's readyz pings PG, Redis, and Docker; a successful
// response means every dependency is reachable through the runtime
// process.
func waitForReady(t *testing.T, runtime *app.Runtime, timeout time.Duration) {
	t.Helper()
	deadline := time.Now().Add(timeout)
	addr := runtime.InternalServer().Addr()
	probeURL := (&url.URL{Scheme: "http", Host: addr, Path: "/readyz"}).String()
	for {
		req, err := newRequest(context.Background(), "GET", probeURL, nil)
		if err == nil {
			resp, err := defaultHTTPClient.Do(req)
			if err == nil {
				_, _ = io.Copy(io.Discard, resp.Body)
				_ = resp.Body.Close()
				if resp.StatusCode == 200 {
					return
				}
			}
		}
		if time.Now().After(deadline) {
			t.Fatalf("rtmanager integration: /readyz did not return 200 within %s", timeout)
		}
		time.Sleep(readyzPollInterval)
	}
}

func isCleanShutdownErr(err error) bool {
	return err == nil || errors.Is(err, context.Canceled)
}

// IDFromTestName builds a deterministic-but-unique game id from the
// caller's test name. Two tests with the same name running back-to-back
// would otherwise collide on PG state through the per-test
// `TruncatePostgres` window; pinning the suffix to `Now().UnixNano()`
// rules that out.
func IDFromTestName(t *testing.T) string {
	t.Helper()
	// The container hostname is `galaxy-game-{game_id}` and must fit
	// HOST_NAME_MAX=64 chars; runc rejects longer values with
	// "sethostname: invalid argument". Cap the lowercased test-name
	// component at 35 chars and append a base36 timestamp suffix
	// (roughly 13 chars today) so the total stays comfortably under
	// the limit (12 + 35 + 1 + 13 = 61).
	const maxNameLen = 35
	suffix := strconv.FormatInt(time.Now().UnixNano(), 36)
	prefix := strings.ToLower(strings.NewReplacer("/", "-", " ", "-").Replace(t.Name()))
	if len(prefix) > maxNameLen {
		prefix = prefix[:maxNameLen]
	}
	return prefix + "-" + suffix
}
@@ -0,0 +1,128 @@
package harness

import (
	"context"
	"errors"
	"testing"
	"time"

	"galaxy/rtmanager/internal/adapters/postgres/healthsnapshotstore"
	"galaxy/rtmanager/internal/adapters/postgres/operationlogstore"
	"galaxy/rtmanager/internal/adapters/postgres/runtimerecordstore"
	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/domain/runtime"

	"github.com/stretchr/testify/require"
)

// RuntimeRecord returns the persisted runtime record for gameID. The
// helper opens the store on every call (cheap; the harness `*sql.DB`
// is shared) so individual scenarios stay isolated even if a previous
// test mutated store state.
func RuntimeRecord(t testing.TB, env *Env, gameID string) (runtime.RuntimeRecord, error) {
	t.Helper()
	store, err := runtimerecordstore.New(runtimerecordstore.Config{
		DB:               env.Postgres.Pool(),
		OperationTimeout: pgOperationTimeout,
	})
	require.NoError(t, err)
	return store.Get(context.Background(), gameID)
}

// MustRuntimeRecord asserts that the record exists and returns it.
func MustRuntimeRecord(t testing.TB, env *Env, gameID string) runtime.RuntimeRecord {
	t.Helper()
	record, err := RuntimeRecord(t, env, gameID)
	require.NoErrorf(t, err, "load runtime record for %s", gameID)
	return record
}

// EventuallyRuntimeRecord polls until predicate matches the runtime
// record for gameID, or the deadline fires. Returns the matching
// record. Used by lifecycle assertions that depend on async state
// transitions (start consumer → record).
func EventuallyRuntimeRecord(t testing.TB, env *Env, gameID string, predicate func(runtime.RuntimeRecord) bool, timeout time.Duration) runtime.RuntimeRecord {
	t.Helper()
	if timeout <= 0 {
		timeout = defaultStreamTimeout
	}
	deadline := time.Now().Add(timeout)
	for {
		record, err := RuntimeRecord(t, env, gameID)
		if err == nil && predicate(record) {
			return record
		}
		if err != nil && !errors.Is(err, runtime.ErrNotFound) {
			t.Fatalf("rtmanager integration: load runtime record: %v", err)
		}
		if time.Now().After(deadline) {
			if err != nil {
				t.Fatalf("rtmanager integration: runtime record predicate not met within %s; last err=%v",
					timeout, err)
			}
			t.Fatalf("rtmanager integration: runtime record predicate not met within %s; last record=%+v",
				timeout, record)
		}
		time.Sleep(defaultStreamPoll)
	}
}

// OperationEntries returns up to `limit` most-recent operation_log
// entries for gameID, ordered descending by started_at.
func OperationEntries(t testing.TB, env *Env, gameID string, limit int) []operation.OperationEntry {
	t.Helper()
	store, err := operationlogstore.New(operationlogstore.Config{
		DB:               env.Postgres.Pool(),
		OperationTimeout: pgOperationTimeout,
	})
	require.NoError(t, err)
	entries, err := store.ListByGame(context.Background(), gameID, limit)
	require.NoErrorf(t, err, "list operation log entries for %s", gameID)
	return entries
}

// EventuallyOperationKind polls operation_log until at least one entry
// for gameID has the requested kind, or the deadline fires. Returns
// the matching entry.
func EventuallyOperationKind(t testing.TB, env *Env, gameID string, kind operation.OpKind, timeout time.Duration) operation.OperationEntry {
	t.Helper()
	if timeout <= 0 {
		timeout = defaultStreamTimeout
	}
	deadline := time.Now().Add(timeout)
	for {
		entries := OperationEntries(t, env, gameID, 50)
		for _, entry := range entries {
			if entry.OpKind == kind {
				return entry
			}
		}
		if time.Now().After(deadline) {
			t.Fatalf("rtmanager integration: operation_log entry with op_kind=%s not seen within %s; observed=%v",
				kind, timeout, opKindSummary(entries))
		}
		time.Sleep(defaultStreamPoll)
	}
}

// HealthSnapshot returns the latest persisted health snapshot for
// gameID, or the underlying not-found sentinel when nothing has been
// recorded yet.
func HealthSnapshot(t testing.TB, env *Env, gameID string) (health.HealthSnapshot, error) {
	t.Helper()
	store, err := healthsnapshotstore.New(healthsnapshotstore.Config{
		DB:               env.Postgres.Pool(),
		OperationTimeout: pgOperationTimeout,
	})
	require.NoError(t, err)
	return store.Get(context.Background(), gameID)
}

func opKindSummary(entries []operation.OperationEntry) []string {
	out := make([]string, 0, len(entries))
	for _, entry := range entries {
		out = append(out, string(entry.OpKind)+"/"+string(entry.Outcome))
	}
	return out
}
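
// Illustrative sketch (not part of the harness): waiting for the async
// start consumer to persist a record. The field access in the predicate
// is an assumption about the domain record type, not a contract.
//
//	record := harness.EventuallyRuntimeRecord(t, env, gameID, func(r runtime.RuntimeRecord) bool {
//		return r.CurrentContainerID != nil // hypothetical field; adjust to the domain type
//	}, 0) // 0 falls back to the 30s default
//	_ = record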
@@ -0,0 +1,334 @@
package harness

import (
	"context"
	"encoding/json"
	"fmt"
	"strconv"
	"strings"
	"testing"
	"time"

	"galaxy/rtmanager/internal/ports"

	"github.com/redis/go-redis/v9"
	"github.com/stretchr/testify/require"
)

// Default scenario timeouts. Stream-driven assertions sit on top of
// the runtime's worker tickers (defaults of 200-500ms in
// `EnvOptions`); 30s gives every reconcile / probe / event tick more
// than enough headroom even on a slow CI runner.
const (
	defaultStreamTimeout = 30 * time.Second
	defaultStreamPoll    = 25 * time.Millisecond
)

// XAddStartJob appends one start-job entry in the
// `runtime:start_jobs` AsyncAPI shape and returns the assigned entry
// id. Mirrors the wire shape produced by Lobby's
// `runtimemanager.Publisher` so the consumer treats the entry exactly
// like a real Lobby-published job.
func XAddStartJob(t testing.TB, env *Env, gameID, imageRef string) string {
	t.Helper()
	id, err := env.RedisClient.XAdd(context.Background(), &redis.XAddArgs{
		Stream: env.Cfg.Streams.StartJobs,
		Values: map[string]any{
			"game_id":         gameID,
			"image_ref":       imageRef,
			"requested_at_ms": time.Now().UTC().UnixMilli(),
		},
	}).Result()
	require.NoErrorf(t, err, "xadd start_jobs for game %s", gameID)
	return id
}

// XAddStopJob appends one stop-job entry classified by reason. The
// reason enum is documented at `ports.StopReason`.
func XAddStopJob(t testing.TB, env *Env, gameID, reason string) string {
	t.Helper()
	id, err := env.RedisClient.XAdd(context.Background(), &redis.XAddArgs{
		Stream: env.Cfg.Streams.StopJobs,
		Values: map[string]any{
			"game_id":         gameID,
			"reason":          reason,
			"requested_at_ms": time.Now().UTC().UnixMilli(),
		},
	}).Result()
	require.NoErrorf(t, err, "xadd stop_jobs for game %s", gameID)
	return id
}

// JobResultEntry is the decoded shape of one `runtime:job_results`
// stream entry. Mirrors `ports.JobResult` plus the entry id surfaced
// by Redis so tests can correlate XADD ids with results.
type JobResultEntry struct {
	StreamID       string
	GameID         string
	Outcome        string
	ContainerID    string
	EngineEndpoint string
	ErrorCode      string
	ErrorMessage   string
}

// HealthEventEntry mirrors the `runtime:health_events` AsyncAPI shape
// in decoded form.
type HealthEventEntry struct {
	StreamID     string
	GameID       string
	ContainerID  string
	EventType    string
	OccurredAtMs int64
	Details      map[string]any
}

// NotificationIntentEntry decodes one `notification:intents` entry
// that RTM publishes for first-touch start failures.
type NotificationIntentEntry struct {
	StreamID         string
	NotificationType string
	IdempotencyKey   string
	Payload          map[string]any
}

// WaitForJobResult polls `runtime:job_results` until predicate
// matches, or the timeout fires. Returns the matching entry. The
// helper does not consume the stream — every call rescans from `0-0`
// — because RTM's writes are append-only and the cardinality per test
// is small.
func WaitForJobResult(t testing.TB, env *Env, predicate func(JobResultEntry) bool, timeout time.Duration) JobResultEntry {
	t.Helper()
	if timeout <= 0 {
		timeout = defaultStreamTimeout
	}
	deadline := time.Now().Add(timeout)
	for {
		entries, err := env.RedisClient.XRange(context.Background(), env.Cfg.Streams.JobResults, "-", "+").Result()
		require.NoErrorf(t, err, "xrange %s", env.Cfg.Streams.JobResults)
		for _, entry := range entries {
			decoded := decodeJobResult(entry)
			if predicate(decoded) {
				return decoded
			}
		}
		if time.Now().After(deadline) {
			t.Fatalf("rtmanager integration: no job_result matched within %s; observed=%v",
				timeout, jobResultStreamSummary(entries))
		}
		time.Sleep(defaultStreamPoll)
	}
}

// AllJobResults returns every entry on `runtime:job_results` in stream
// order. Useful for assertions that depend on cardinality (replay
// tests).
func AllJobResults(t testing.TB, env *Env) []JobResultEntry {
	t.Helper()
	entries, err := env.RedisClient.XRange(context.Background(), env.Cfg.Streams.JobResults, "-", "+").Result()
	require.NoErrorf(t, err, "xrange %s", env.Cfg.Streams.JobResults)
	out := make([]JobResultEntry, 0, len(entries))
	for _, entry := range entries {
		out = append(out, decodeJobResult(entry))
	}
	return out
}

// WaitForHealthEvent polls `runtime:health_events` until predicate
// matches, or the timeout fires.
func WaitForHealthEvent(t testing.TB, env *Env, predicate func(HealthEventEntry) bool, timeout time.Duration) HealthEventEntry {
	t.Helper()
	if timeout <= 0 {
		timeout = defaultStreamTimeout
	}
	deadline := time.Now().Add(timeout)
	for {
		entries, err := env.RedisClient.XRange(context.Background(), env.Cfg.Streams.HealthEvents, "-", "+").Result()
		require.NoErrorf(t, err, "xrange %s", env.Cfg.Streams.HealthEvents)
		for _, entry := range entries {
			decoded := decodeHealthEvent(t, entry)
			if predicate(decoded) {
				return decoded
			}
		}
		if time.Now().After(deadline) {
			t.Fatalf("rtmanager integration: no health_event matched within %s; observed=%v",
				timeout, healthEventStreamSummary(entries))
		}
		time.Sleep(defaultStreamPoll)
	}
}

// WaitForNotificationIntent polls `notification:intents` until
// predicate matches.
func WaitForNotificationIntent(t testing.TB, env *Env, predicate func(NotificationIntentEntry) bool, timeout time.Duration) NotificationIntentEntry {
	t.Helper()
	if timeout <= 0 {
		timeout = defaultStreamTimeout
	}
	deadline := time.Now().Add(timeout)
	for {
		entries, err := env.RedisClient.XRange(context.Background(), env.Cfg.Streams.NotificationIntents, "-", "+").Result()
		require.NoErrorf(t, err, "xrange %s", env.Cfg.Streams.NotificationIntents)
		for _, entry := range entries {
			decoded := decodeNotificationIntent(t, entry)
			if predicate(decoded) {
				return decoded
			}
		}
		if time.Now().After(deadline) {
			t.Fatalf("rtmanager integration: no notification_intent matched within %s; observed=%v",
				timeout, notificationStreamSummary(entries))
		}
		time.Sleep(defaultStreamPoll)
	}
}

// JobOutcomeIs returns a predicate matching a job result whose game id
// and outcome equal the inputs.
func JobOutcomeIs(gameID, outcome string) func(JobResultEntry) bool {
	return func(entry JobResultEntry) bool {
		return entry.GameID == gameID && entry.Outcome == outcome
	}
}

// JobOutcomeWithErrorCode matches a job result whose game id, outcome,
// and error_code all equal the inputs. Used by replay-no-op
// assertions.
func JobOutcomeWithErrorCode(gameID, outcome, errorCode string) func(JobResultEntry) bool {
	return func(entry JobResultEntry) bool {
		return entry.GameID == gameID && entry.Outcome == outcome && entry.ErrorCode == errorCode
	}
}

// HealthEventTypeIs returns a predicate matching a health event whose
// game id and event_type equal the inputs.
func HealthEventTypeIs(gameID, eventType string) func(HealthEventEntry) bool {
	return func(entry HealthEventEntry) bool {
		return entry.GameID == gameID && entry.EventType == eventType
	}
}
|
||||
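// decodeJobResult maps one raw Redis stream message onto a
// JobResultEntry; fields absent from the entry decode as empty strings.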
func decodeJobResult(message redis.XMessage) JobResultEntry {
    return JobResultEntry{
        StreamID:       message.ID,
        GameID:         streamString(message.Values, "game_id"),
        Outcome:        streamString(message.Values, "outcome"),
        ContainerID:    streamString(message.Values, "container_id"),
        EngineEndpoint: streamString(message.Values, "engine_endpoint"),
        ErrorCode:      streamString(message.Values, "error_code"),
        ErrorMessage:   streamString(message.Values, "error_message"),
    }
}

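// decodeHealthEvent maps one raw Redis stream message onto a
// HealthEventEntry. A malformed occurred_at_ms or details payload is
// tolerated: the timestamp decodes as zero and Details stays nil.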
func decodeHealthEvent(t testing.TB, message redis.XMessage) HealthEventEntry {
    t.Helper()
    occurredAt, _ := strconv.ParseInt(streamString(message.Values, "occurred_at_ms"), 10, 64)
    entry := HealthEventEntry{
        StreamID:     message.ID,
        GameID:       streamString(message.Values, "game_id"),
        ContainerID:  streamString(message.Values, "container_id"),
        EventType:    streamString(message.Values, "event_type"),
        OccurredAtMs: occurredAt,
    }
    rawDetails := streamString(message.Values, "details")
    if rawDetails != "" {
        var parsed map[string]any
        if err := json.Unmarshal([]byte(rawDetails), &parsed); err == nil {
            entry.Details = parsed
        }
    }
    return entry
}

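// decodeNotificationIntent maps one raw Redis stream message onto a
// NotificationIntentEntry. The payload is read from `payload_json`,
// falling back to `payload`; unparseable JSON leaves Payload nil.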
func decodeNotificationIntent(t testing.TB, message redis.XMessage) NotificationIntentEntry {
    t.Helper()
    entry := NotificationIntentEntry{
        StreamID:         message.ID,
        NotificationType: streamString(message.Values, "notification_type"),
        IdempotencyKey:   streamString(message.Values, "idempotency_key"),
    }
    rawPayload := streamString(message.Values, "payload_json")
    if rawPayload == "" {
        rawPayload = streamString(message.Values, "payload")
    }
    if rawPayload != "" {
        var parsed map[string]any
        if err := json.Unmarshal([]byte(rawPayload), &parsed); err == nil {
            entry.Payload = parsed
        }
    }
    return entry
}

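// streamString extracts one stream field as a string, tolerating the
// string, []byte, and other representations the Redis client may hand
// back; absent keys yield "".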
func streamString(values map[string]any, key string) string {
    raw, ok := values[key]
    if !ok {
        return ""
    }
    switch typed := raw.(type) {
    case string:
        return typed
    case []byte:
        return string(typed)
    default:
        return fmt.Sprintf("%v", typed)
    }
}

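// jobResultStreamSummary renders the observed job_results entries as
// one line each for timeout diagnostics.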
func jobResultStreamSummary(entries []redis.XMessage) []string {
    out := make([]string, 0, len(entries))
    for _, entry := range entries {
        decoded := decodeJobResult(entry)
        out = append(out, fmt.Sprintf("%s game=%s outcome=%s err=%s",
            decoded.StreamID, decoded.GameID, decoded.Outcome, decoded.ErrorCode))
    }
    return out
}

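// healthEventStreamSummary renders the observed health_events entries
// as one line each for timeout diagnostics.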
func healthEventStreamSummary(entries []redis.XMessage) []string {
    out := make([]string, 0, len(entries))
    for _, entry := range entries {
        out = append(out, fmt.Sprintf("%s %s %s",
            entry.ID, streamString(entry.Values, "game_id"), streamString(entry.Values, "event_type")))
    }
    return out
}

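// notificationStreamSummary renders the observed notification:intents
// entries as one line each for timeout diagnostics.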
func notificationStreamSummary(entries []redis.XMessage) []string {
    out := make([]string, 0, len(entries))
    for _, entry := range entries {
        out = append(out, fmt.Sprintf("%s %s",
            entry.ID, streamString(entry.Values, "notification_type")))
    }
    return out
}

// JobOutcomeSuccess and JobOutcomeFailure pin the outcome constants
// from `ports` so suite authors can build predicates without importing
// `ports` themselves. Re-exported here to keep test source focused.
var (
    JobOutcomeSuccess = ports.JobOutcomeSuccess
    JobOutcomeFailure = ports.JobOutcomeFailure
)

// AssertNoJobResultBeyond fails the test if the count of entries on
// `runtime:job_results` exceeds `expectedCount`. Used by the replay
// tests to prove the second envelope was a no-op.
func AssertNoJobResultBeyond(t testing.TB, env *Env, expectedCount int) {
    t.Helper()
    entries, err := env.RedisClient.XLen(context.Background(), env.Cfg.Streams.JobResults).Result()
    require.NoError(t, err)
    require.LessOrEqualf(t, entries, int64(expectedCount),
        "job_results stream has more entries than expected; got=%d expected<=%d", entries, expectedCount)
}

// SanitizeContainerSummaryFor returns a diagnostic string for a
// container summary keyed by game id, used in test failure messages.
// Key order follows map iteration and is therefore not stable.
func SanitizeContainerSummaryFor(values map[string]string, gameID string) string {
    parts := make([]string, 0, len(values))
    for key, value := range values {
        parts = append(parts, key+"="+value)
    }
    return fmt.Sprintf("game=%s {%s}", gameID, strings.Join(parts, ", "))
}
@@ -0,0 +1,303 @@
//go:build integration

// Package integration_test owns the service-local end-to-end scenarios
// for Runtime Manager. The build tag keeps the suite out of the
// default `go test ./...` run; CI invokes the suite explicitly with
// `go test -tags=integration ./rtmanager/integration/...`.
//
// Design rationale for the suite — build tag, in-process harness,
// per-test isolation, two-tag engine image — lives in
// `rtmanager/docs/integration-tests.md`. Each test stands up its own
// Runtime Manager process via `harness.NewEnv`, drives the same
// streams Game Lobby uses in `integration/lobbyrtm`, and asserts the
// resulting PostgreSQL, Redis-stream, and Docker side-effects.
package integration_test

import (
    "context"
    "net/http"
    "testing"
    "time"

    "galaxy/rtmanager/integration/harness"
    "galaxy/rtmanager/internal/domain/operation"
    "galaxy/rtmanager/internal/domain/runtime"
    "galaxy/rtmanager/internal/ports"

    "github.com/docker/docker/api/types/container"
    "github.com/docker/docker/api/types/filters"
    "github.com/stretchr/testify/assert"
    "github.com/stretchr/testify/require"
)

// TestMain centralises shared-container teardown so individual
// failing tests do not leak the testcontainers postgres / redis pair.
func TestMain(m *testing.M) {
    harness.RunMain(m)
}

// TestLifecycle_StartInspectStopRestartPatchCleanup drives one game
// through every supported lifecycle operation against the real engine
// image and asserts each step's PG, Redis-stream, and Docker
// side-effects.
func TestLifecycle_StartInspectStopRestartPatchCleanup(t *testing.T) {
    env := harness.NewEnv(t, harness.EnvOptions{LogToStderr: true})
    rest := harness.NewREST(env)
    gameID := harness.IDFromTestName(t)

    // Step 1 — start through the Lobby async stream contract.
    startEntryID := harness.XAddStartJob(t, env, gameID, env.EngineImageRef)
    t.Logf("start_jobs xadd id=%s", startEntryID)

    startResult := harness.WaitForJobResult(t, env,
        harness.JobOutcomeIs(gameID, ports.JobOutcomeSuccess),
        30*time.Second,
    )
    require.Equal(t, "", startResult.ErrorCode, "fresh start must publish empty error_code")
    require.NotEmpty(t, startResult.ContainerID, "fresh start job result must carry container_id")
    require.NotEmpty(t, startResult.EngineEndpoint, "fresh start job result must carry engine_endpoint")

    // PG record reflects the start.
    startedRecord := harness.EventuallyRuntimeRecord(t, env, gameID,
        func(r runtime.RuntimeRecord) bool { return r.Status == runtime.StatusRunning },
        15*time.Second,
    )
    assert.Equal(t, env.EngineImageRef, startedRecord.CurrentImageRef)
    assert.Equal(t, env.Network, startedRecord.DockerNetwork)
    assert.Equal(t, startResult.ContainerID, startedRecord.CurrentContainerID)
    assert.Equal(t, startResult.EngineEndpoint, startedRecord.EngineEndpoint)

    // operation_log captures the start.
    startEntry := harness.EventuallyOperationKind(t, env, gameID, operation.OpKindStart, 5*time.Second)
    assert.Equal(t, operation.OutcomeSuccess, startEntry.Outcome)
    assert.Equal(t, operation.OpSourceLobbyStream, startEntry.OpSource)

    // Step 2 — inspect via the GM/Admin REST surface.
    getResp, status := rest.GetRuntime(t, gameID)
    require.Equal(t, http.StatusOK, status)
    require.Equal(t, "running", getResp.Status)
    require.NotNil(t, getResp.CurrentContainerID)
    require.Equal(t, startResult.ContainerID, *getResp.CurrentContainerID)
    require.NotNil(t, getResp.CurrentImageRef)
    require.Equal(t, env.EngineImageRef, *getResp.CurrentImageRef)
    require.NotNil(t, getResp.EngineEndpoint)
    require.Equal(t, startResult.EngineEndpoint, *getResp.EngineEndpoint)

    // Step 3 — stop through the Lobby async stream contract.
    harness.XAddStopJob(t, env, gameID, "cancelled")
    stopResult := waitForLatestStopOrStartResult(t, env, gameID)
    require.Equal(t, ports.JobOutcomeSuccess, stopResult.Outcome)
    require.Equal(t, "", stopResult.ErrorCode, "fresh stop must publish empty error_code")

    stoppedRecord := harness.EventuallyRuntimeRecord(t, env, gameID,
        func(r runtime.RuntimeRecord) bool { return r.Status == runtime.StatusStopped },
        15*time.Second,
    )
    assert.Equal(t, startResult.ContainerID, stoppedRecord.CurrentContainerID,
        "stop preserves the current container id until cleanup")

    // Step 4 — restart via REST. Container id changes; engine endpoint
    // stays stable.
    restartResp, status := rest.RestartRuntime(t, gameID)
    require.Equal(t, http.StatusOK, status)
    require.Equal(t, "running", restartResp.Status)
    require.NotNil(t, restartResp.CurrentContainerID)
    require.NotEqual(t, startResult.ContainerID, *restartResp.CurrentContainerID,
        "restart must produce a new container id")
    require.NotNil(t, restartResp.EngineEndpoint)
    require.Equal(t, startResult.EngineEndpoint, *restartResp.EngineEndpoint,
        "restart must keep the engine endpoint stable")

    restartContainerID := *restartResp.CurrentContainerID
    restartEntry := harness.EventuallyOperationKind(t, env, gameID, operation.OpKindRestart, 5*time.Second)
    assert.Equal(t, operation.OutcomeSuccess, restartEntry.Outcome)
    assert.Equal(t, operation.OpSourceAdminRest, restartEntry.OpSource)

    // Step 5 — patch to the second semver-compatible tag. Same image
    // content, but the runtime should still record the new tag and
    // recreate the container.
    patchResp, status := rest.PatchRuntime(t, gameID, env.PatchedImageRef)
    require.Equal(t, http.StatusOK, status)
    require.Equal(t, "running", patchResp.Status)
    require.NotNil(t, patchResp.CurrentImageRef)
    assert.Equal(t, env.PatchedImageRef, *patchResp.CurrentImageRef)
    require.NotNil(t, patchResp.CurrentContainerID)
    assert.NotEqual(t, restartContainerID, *patchResp.CurrentContainerID,
        "patch must recreate the container")

    patchEntry := harness.EventuallyOperationKind(t, env, gameID, operation.OpKindPatch, 5*time.Second)
    assert.Equal(t, operation.OutcomeSuccess, patchEntry.Outcome)

    // Step 6 — quiesce via REST stop so cleanup is allowed (cleanup
    // refuses to remove a running container per
    // `rtmanager/README.md §Lifecycles → Cleanup`).
    stopResp, status := rest.StopRuntime(t, gameID, "admin_request")
    require.Equal(t, http.StatusOK, status)
    require.Equal(t, "stopped", stopResp.Status)

    // Step 7 — cleanup the container. PG record flips to removed and
    // current_container_id becomes nil.
    cleanupResp, status := rest.CleanupRuntime(t, gameID)
    require.Equal(t, http.StatusOK, status)
    require.Equal(t, "removed", cleanupResp.Status)
    require.Nil(t, cleanupResp.CurrentContainerID)

    cleanupEntry := harness.EventuallyOperationKind(t, env, gameID, operation.OpKindCleanupContainer, 5*time.Second)
    assert.Equal(t, operation.OutcomeSuccess, cleanupEntry.Outcome)
    assert.Equal(t, operation.OpSourceAdminRest, cleanupEntry.OpSource)
}

// TestReplay_StartJobIsNoop publishes the same start envelope twice
// and asserts that Runtime Manager produces a fresh job_result for
// the first XADD and a `replay_no_op` outcome for the second, without
// recreating the engine container.
func TestReplay_StartJobIsNoop(t *testing.T) {
    env := harness.NewEnv(t, harness.EnvOptions{})
    gameID := harness.IDFromTestName(t)

    // First XADD: fresh start.
    harness.XAddStartJob(t, env, gameID, env.EngineImageRef)
    first := harness.WaitForJobResult(t, env,
        harness.JobOutcomeIs(gameID, ports.JobOutcomeSuccess),
        30*time.Second,
    )
    require.Equal(t, "", first.ErrorCode)

    // Second XADD: same envelope; the start service must short-circuit
    // at the `runtime_records.status=running && image_ref` check.
    harness.XAddStartJob(t, env, gameID, env.EngineImageRef)
    replay := harness.WaitForJobResult(t, env,
        harness.JobOutcomeWithErrorCode(gameID, ports.JobOutcomeSuccess, "replay_no_op"),
        15*time.Second,
    )
    assert.Equal(t, first.ContainerID, replay.ContainerID,
        "replay must surface the same container id as the original start")
    assert.Equal(t, first.EngineEndpoint, replay.EngineEndpoint)

    // Docker view: exactly one engine container exists for this game.
    assertSingleEngineContainer(t, env, gameID)

    // Lifecycle stream produced exactly two entries: fresh + replay.
    entries := harness.AllJobResults(t, env)
    require.Len(t, entries, 2)
    assert.Equal(t, "", entries[0].ErrorCode)
    assert.Equal(t, "replay_no_op", entries[1].ErrorCode)
}

// TestReplay_StopJobIsNoop publishes a stop envelope twice after a
// successful start and asserts the second stop surfaces as
// `replay_no_op` without altering the runtime record's `stopped_at`.
func TestReplay_StopJobIsNoop(t *testing.T) {
    env := harness.NewEnv(t, harness.EnvOptions{})
    gameID := harness.IDFromTestName(t)

    // Bring the game to `running`. The start path publishes one entry
    // to `runtime:job_results`; the stops below publish two more, so
    // per-game stream order is [start, first-stop, replay-stop].
    harness.XAddStartJob(t, env, gameID, env.EngineImageRef)
    harness.WaitForJobResult(t, env,
        harness.JobOutcomeIs(gameID, ports.JobOutcomeSuccess),
        30*time.Second,
    )

    // First stop: fresh. The expectedCount accounts for the start
    // entry that is already on the stream.
    harness.XAddStopJob(t, env, gameID, "cancelled")
    first := waitForJobResultByIndex(t, env, gameID, 2)
    require.Equal(t, ports.JobOutcomeSuccess, first.Outcome)
    require.Equal(t, "", first.ErrorCode)

    stoppedRecord := harness.EventuallyRuntimeRecord(t, env, gameID,
        func(r runtime.RuntimeRecord) bool { return r.Status == runtime.StatusStopped },
        15*time.Second,
    )
    require.NotNil(t, stoppedRecord.StoppedAt, "stopped record must carry stopped_at")
    originalStoppedAt := *stoppedRecord.StoppedAt

    // Second stop: replay (third entry on the per-game stream).
    harness.XAddStopJob(t, env, gameID, "cancelled")
    replay := waitForJobResultByIndex(t, env, gameID, 3)
    require.Equal(t, ports.JobOutcomeSuccess, replay.Outcome)
    assert.Equal(t, "replay_no_op", replay.ErrorCode)

    // stopped_at stays anchored to the first stop.
    postReplay := harness.MustRuntimeRecord(t, env, gameID)
    require.Equal(t, runtime.StatusStopped, postReplay.Status)
    require.NotNil(t, postReplay.StoppedAt)
    assert.True(t, originalStoppedAt.Equal(*postReplay.StoppedAt),
        "stopped_at must not move on a replay stop; was %s, now %s",
        originalStoppedAt, *postReplay.StoppedAt)
}

// waitForLatestStopOrStartResult returns the most recent
// `outcome=success` entry on `runtime:job_results` for gameID. The
// lifecycle scenario emits two consecutive successes (start then
// stop); the helper rescans the stream until both are visible and
// returns the later one.
func waitForLatestStopOrStartResult(t *testing.T, env *harness.Env, gameID string) harness.JobResultEntry {
    t.Helper()
    deadline := time.Now().Add(30 * time.Second)
    for {
        entries := harness.AllJobResults(t, env)
        // Two matching successes mean we have observed both the start
        // and stop outcomes for this game.
        matched := 0
        var last harness.JobResultEntry
        for _, entry := range entries {
            if entry.GameID == gameID && entry.Outcome == ports.JobOutcomeSuccess {
                matched++
                last = entry
            }
        }
        if matched >= 2 {
            return last
        }
        if time.Now().After(deadline) {
            t.Fatalf("expected two job_results for %s, got %d", gameID, matched)
        }
        time.Sleep(50 * time.Millisecond)
    }
}

// waitForJobResultByIndex polls the job_results stream until it has
// at least `expectedCount` entries for gameID and returns the
// expectedCount-th. Used by the replay tests to deterministically
// pick the second / nth result.
func waitForJobResultByIndex(t *testing.T, env *harness.Env, gameID string, expectedCount int) harness.JobResultEntry {
    t.Helper()
    deadline := time.Now().Add(30 * time.Second)
    for {
        entries := harness.AllJobResults(t, env)
        matches := make([]harness.JobResultEntry, 0, len(entries))
        for _, entry := range entries {
            if entry.GameID == gameID {
                matches = append(matches, entry)
            }
        }
        if len(matches) >= expectedCount {
            return matches[expectedCount-1]
        }
        if time.Now().After(deadline) {
            t.Fatalf("expected at least %d job_results for %s, got %d",
                expectedCount, gameID, len(matches))
        }
        time.Sleep(50 * time.Millisecond)
    }
}

// assertSingleEngineContainer queries Docker by the per-game label and
// asserts exactly one matching container exists. Catches replay
// regressions that would let RTM start two containers for the same
// game id.
func assertSingleEngineContainer(t *testing.T, env *harness.Env, gameID string) {
    t.Helper()
    args := filters.NewArgs(
        filters.Arg("label", "com.galaxy.owner=rtmanager"),
        filters.Arg("label", "com.galaxy.game_id="+gameID),
    )
    containers, err := env.Docker.Client().ContainerList(
        context.Background(),
        container.ListOptions{All: true, Filters: args},
    )
    require.NoError(t, err)
    require.Lenf(t, containers, 1, "expected one engine container for game %s, got %d", gameID, len(containers))
}
@@ -0,0 +1,200 @@
//go:build integration

package integration_test

import (
    "context"
    "fmt"
    "strconv"
    "testing"
    "time"

    "galaxy/notificationintent"
    "galaxy/rtmanager/integration/harness"
    "galaxy/rtmanager/internal/domain/health"
    "galaxy/rtmanager/internal/domain/operation"
    "galaxy/rtmanager/internal/domain/runtime"
    "galaxy/rtmanager/internal/ports"
    "galaxy/rtmanager/internal/service/startruntime"

    dockercontainer "github.com/docker/docker/api/types/container"
    "github.com/docker/docker/api/types/network"
    "github.com/stretchr/testify/assert"
    "github.com/stretchr/testify/require"
)

// TestHealth_ContainerDisappearedAndAdopt verifies the two
// drift-detection paths. The Docker events listener emits
// `container_disappeared` when a tracked container is destroyed
// outside RTM, and the reconciler adopts a fresh container labelled
// `com.galaxy.owner=rtmanager` that has no PG row.
//
// `runtime_records.status=removed` is terminal per
// `runtime.AllowedTransitions`; the adoption path therefore uses a
// **fresh** game_id rather than re-adopting the disposed one. That
// matches the documented contract: reconciler adopts containers
// labelled `com.galaxy.owner=rtmanager` for which no PG row exists.
func TestHealth_ContainerDisappearedAndAdopt(t *testing.T) {
    env := harness.NewEnv(t, harness.EnvOptions{
        ReconcileInterval: 500 * time.Millisecond,
    })

    // Step 1 — bring a game to running through the start consumer.
    disposalGameID := harness.IDFromTestName(t) + "-d"
    harness.XAddStartJob(t, env, disposalGameID, env.EngineImageRef)
    startResult := harness.WaitForJobResult(t, env,
        harness.JobOutcomeIs(disposalGameID, ports.JobOutcomeSuccess),
        30*time.Second,
    )
    originalContainerID := startResult.ContainerID
    require.NotEmpty(t, originalContainerID)

    // Step 2 — externally remove the container; the events listener
    // should observe the destroy and publish `container_disappeared`.
    removeContainer(t, env, originalContainerID)
    disappeared := harness.WaitForHealthEvent(t, env,
        harness.HealthEventTypeIs(disposalGameID, string(health.EventTypeContainerDisappeared)),
        20*time.Second,
    )
    assert.Equal(t, originalContainerID, disappeared.ContainerID)

    // The reconciler also marks the runtime record as removed within
    // one or two ticks (`reconcile_dispose`).
    harness.EventuallyRuntimeRecord(t, env, disposalGameID,
        func(r runtime.RuntimeRecord) bool { return r.Status == runtime.StatusRemoved },
        15*time.Second,
    )
    harness.EventuallyOperationKind(t, env, disposalGameID, operation.OpKindReconcileDispose, 5*time.Second)

    // Step 3 — bring up an adoption candidate for an unseen game id
    // by hand. The reconciler must label-match it, find no record,
    // and insert one with status=running.
    adoptionGameID := harness.IDFromTestName(t) + "-a"
    manualContainerID := runManualEngineContainer(t, env, adoptionGameID)
    t.Logf("manual container id=%s", manualContainerID)

    adopted := harness.EventuallyRuntimeRecord(t, env, adoptionGameID,
        func(r runtime.RuntimeRecord) bool {
            return r.Status == runtime.StatusRunning && r.CurrentContainerID == manualContainerID
        },
        20*time.Second,
    )
    assert.Equal(t, env.EngineImageRef, adopted.CurrentImageRef)

    adoptEntry := harness.EventuallyOperationKind(t, env, adoptionGameID, operation.OpKindReconcileAdopt, 5*time.Second)
    assert.Equal(t, operation.OutcomeSuccess, adoptEntry.Outcome)
    assert.Equal(t, operation.OpSourceAutoReconcile, adoptEntry.OpSource)
    assert.Equal(t, manualContainerID, adoptEntry.ContainerID)
}

// TestNotification_ImagePullFailed drives Runtime Manager with a
// start envelope pointing at an unresolvable image reference. The
// start service must surface the failure on `runtime:job_results` and
// publish a `runtime.image_pull_failed` admin notification on
// `notification:intents`.
func TestNotification_ImagePullFailed(t *testing.T) {
    env := harness.NewEnv(t, harness.EnvOptions{})
    gameID := harness.IDFromTestName(t)

    const missingImage = "galaxy/integration-missing:0.0.0"
    harness.XAddStartJob(t, env, gameID, missingImage)

    // Job result publishes a failure with the stable image_pull_failed
    // code.
    jobResult := harness.WaitForJobResult(t, env,
        harness.JobOutcomeIs(gameID, ports.JobOutcomeFailure),
        60*time.Second,
    )
    assert.Equal(t, startruntime.ErrorCodeImagePullFailed, jobResult.ErrorCode)
    assert.Empty(t, jobResult.ContainerID, "failure must not surface a container id")
    assert.Empty(t, jobResult.EngineEndpoint, "failure must not surface an engine endpoint")
    assert.NotEmpty(t, jobResult.ErrorMessage, "failure must carry an operator-readable message")

    // Notification stream carries the matching admin-only intent.
    intent := harness.WaitForNotificationIntent(t, env,
        func(entry harness.NotificationIntentEntry) bool {
            if entry.NotificationType != string(notificationintent.NotificationTypeRuntimeImagePullFailed) {
                return false
            }
            payloadGameID, _ := entry.Payload["game_id"].(string)
            return payloadGameID == gameID
        },
        30*time.Second,
    )
    require.NotNil(t, intent.Payload, "notification intent must carry a payload")
    assert.Equal(t, gameID, intent.Payload["game_id"])
    assert.Equal(t, missingImage, intent.Payload["image_ref"])
    assert.Equal(t, startruntime.ErrorCodeImagePullFailed, intent.Payload["error_code"])

    // PG state: no running record was installed; operation_log
    // captures one failed start with the stable error code.
    _, err := harness.RuntimeRecord(t, env, gameID)
    if err == nil {
        // If an entry was upserted (rollback gap), it must not be
        // running.
        record := harness.MustRuntimeRecord(t, env, gameID)
        assert.NotEqual(t, runtime.StatusRunning, record.Status,
            "failed image pull must not leave a running record behind")
    }

    failureEntry := harness.EventuallyOperationKind(t, env, gameID, operation.OpKindStart, 5*time.Second)
    assert.Equal(t, operation.OutcomeFailure, failureEntry.Outcome)
    assert.Equal(t, startruntime.ErrorCodeImagePullFailed, failureEntry.ErrorCode)
}

// removeContainer terminates and removes the container behind RTM's
// back. Force=true is required because the engine has not been sent
// SIGTERM; stop-signal handling is engine-internal.
func removeContainer(t *testing.T, env *harness.Env, containerID string) {
    t.Helper()
    ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
    defer cancel()
    require.NoError(t, env.Docker.Client().ContainerRemove(ctx, containerID, dockercontainer.RemoveOptions{Force: true}))
}

// runManualEngineContainer bypasses RTM and starts an engine container
// directly through the Docker SDK. The container carries every label
// the reconciler reads at adopt time (`com.galaxy.owner`,
// `com.galaxy.kind`, `com.galaxy.game_id`, `com.galaxy.engine_image_ref`,
// `com.galaxy.started_at_ms`) plus the per-game hostname so the
// computed `engine_endpoint` matches what `rtmanager` would have
// written.
func runManualEngineContainer(t *testing.T, env *harness.Env, gameID string) string {
    t.Helper()
    ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
    defer cancel()

    hostname := "galaxy-game-" + gameID
    cfg := &dockercontainer.Config{
        Image:    env.EngineImageRef,
        Hostname: hostname,
        Labels: map[string]string{
            "com.galaxy.owner":            "rtmanager",
            "com.galaxy.kind":             "game-engine",
            "com.galaxy.game_id":          gameID,
            "com.galaxy.engine_image_ref": env.EngineImageRef,
            "com.galaxy.started_at_ms":    strconv.FormatInt(time.Now().UnixMilli(), 10),
        },
        Env: []string{
            "GAME_STATE_PATH=/var/lib/galaxy-game",
            "STORAGE_PATH=/var/lib/galaxy-game",
        },
    }
    hostCfg := &dockercontainer.HostConfig{}
    netCfg := &network.NetworkingConfig{
        EndpointsConfig: map[string]*network.EndpointSettings{
            env.Network: {Aliases: []string{hostname}},
        },
    }
    containerName := fmt.Sprintf("galaxy-game-%s-manual", gameID)
    created, err := env.Docker.Client().ContainerCreate(ctx, cfg, hostCfg, netCfg, nil, containerName)
    require.NoError(t, err)
    t.Cleanup(func() {
        removeCtx, removeCancel := context.WithTimeout(context.Background(), 30*time.Second)
        defer removeCancel()
        _ = env.Docker.Client().ContainerRemove(removeCtx, created.ID, dockercontainer.RemoveOptions{Force: true})
    })

    require.NoError(t, env.Docker.Client().ContainerStart(ctx, created.ID, dockercontainer.StartOptions{}))
    return created.ID
}