package harness

import (
	"context"
	"errors"
	"io"
	"log/slog"
	"net/url"
	"os"
	"strconv"
	"strings"
	"sync"
	"testing"
	"time"

	"galaxy/postgres"
	"galaxy/redisconn"
	"galaxy/rtmanager/internal/app"
	"galaxy/rtmanager/internal/config"

	"github.com/redis/go-redis/v9"
)

// Default stream key shapes used by the integration suite. They match
// the production defaults so the wire shapes asserted in `streams.go`
// are identical to what Game Lobby sees in `integration/lobbyrtm`.
const (
	StartJobsStream        = "runtime:start_jobs"
	StopJobsStream         = "runtime:stop_jobs"
	JobResultsStream       = "runtime:job_results"
	HealthEventsStream     = "runtime:health_events"
	NotificationIntentsKey = "notification:intents"

	gameStateRootSubdir = "game-state"

	listenAddr             = "127.0.0.1:0"
	listenerWaitTimeout    = 10 * time.Second
	readyzPollInterval     = 25 * time.Millisecond
	cleanupShutdownTimeout = 30 * time.Second
)

// Env carries everything one integration scenario needs to drive the
// Runtime Manager process. It is a plain data struct: all fields are
// exported so tests reach them without intermediate getters.
type Env struct {
	// Cfg is the resolved Runtime Manager configuration handed to
	// `app.NewRuntime`. Tests inspect it for stream key shapes,
	// container defaults, and timeout knobs.
	Cfg config.Config

	// Runtime is the in-process Runtime Manager exposed for tests that
	// need to peek at internal state (`runtime.InternalServer().Addr()`).
	Runtime *app.Runtime

	// Postgres holds the per-package PostgreSQL fixture.
	Postgres *PostgresEnv

	// Redis holds the per-package Redis fixture plus a fresh client the
	// test owns.
	Redis       *RedisEnv
	RedisClient *redis.Client

	// Docker holds the per-package Docker daemon handle.
	Docker *DockerEnv

	// Lobby is the per-test stub HTTP server.
	Lobby *LobbyStub

	// Network is the unique Docker network name created for this test.
	Network string

	// EngineImageRef and PatchedImageRef are the two semver-compatible
	// engine image tags the harness builds once per package. Patch
	// scenarios point at the second tag.
	EngineImageRef  string
	PatchedImageRef string

	// GameStateRoot is the host filesystem path RTM writes per-game
	// state directories under. It lives inside `t.ArtifactDir()` so
	// failed scenarios leave the engine state behind for inspection.
	GameStateRoot string

	// InternalAddr is the bound address of RTM's internal HTTP listener
	// (resolved after Run binds the port).
	InternalAddr string
}

// EnvOptions carries per-test overrides to the harness defaults. Empty
// fields fall back to the defaults applied in NewEnv.
type EnvOptions struct {
	// ReconcileInterval overrides the periodic reconciler interval.
	// Default 500ms (so reconcile drift is observable inside a single
	// scenario timeout).
	ReconcileInterval time.Duration

	// CleanupInterval overrides the container-cleanup interval.
	CleanupInterval time.Duration

	// InspectInterval overrides the Docker inspect worker interval.
	InspectInterval time.Duration

	// ProbeInterval / ProbeTimeout / ProbeFailuresThreshold override
	// the active engine probe knobs.
	ProbeInterval          time.Duration
	ProbeTimeout           time.Duration
	ProbeFailuresThreshold int

	// GameLeaseTTL overrides the per-game Redis lease TTL.
	GameLeaseTTL time.Duration

	// StreamBlockTimeout overrides the consumer XREAD block window.
	StreamBlockTimeout time.Duration

	// LogToStderr makes the harness write the runtime's structured
	// logs to stderr; the default discards them so test output stays
	// focused on assertions.
	LogToStderr bool
}
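
// A minimal usage sketch (hypothetical test body, not part of the harness):
// a scenario builds an Env, optionally tightening the probe knobs, then
// drives RTM through the exported fields above.
//
//	func TestProbeRecovery(t *testing.T) {
//		env := harness.NewEnv(t, harness.EnvOptions{
//			ProbeInterval:          100 * time.Millisecond,
//			ProbeFailuresThreshold: 3,
//			LogToStderr:            testing.Verbose(),
//		})
//		gameID := harness.IDFromTestName(t)
//		_ = gameID // enqueue a start job, then assert via env.RedisClient / env.InternalAddr
//	}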

// NewEnv stands up a fresh Runtime Manager process for the calling
// test. It blocks until the internal HTTP listener is bound and
// `/readyz` reports healthy; tests can issue REST and stream requests
// immediately after the call returns.
//
// `t.Cleanup` handlers run in reverse registration order: stop the
// runtime, close the runtime, close the per-test redis client,
// terminate the lobby stub, remove the docker network. Containers RTM
// created during the test are removed by the test's own cleanup paths
// or by the integration `health_test` external-action helpers.
func NewEnv(t *testing.T, opts EnvOptions) *Env {
	t.Helper()

	pg := EnsurePostgres(t)
	rd := EnsureRedis(t)
	dk := EnsureDocker(t)
	imageRef := EnsureEngineImage(t)

	TruncatePostgres(t)
	FlushRedis(t)

	network := EnsureNetwork(t)
	lobby := NewLobbyStub(t)
	stateRoot := stateRoot(t)

	cfg := buildConfig(buildConfigInput{
		PostgresDSN:        pg.DSN(),
		RedisAddr:          rd.Addr(),
		DockerHost:         resolveDockerHost(),
		Network:            network,
		LobbyURL:           lobby.URL(),
		GameStateRoot:      stateRoot,
		ReconcileInterval:  pickDuration(opts.ReconcileInterval, 500*time.Millisecond),
		CleanupInterval:    pickDuration(opts.CleanupInterval, 500*time.Millisecond),
		InspectInterval:    pickDuration(opts.InspectInterval, 500*time.Millisecond),
		ProbeInterval:      pickDuration(opts.ProbeInterval, 500*time.Millisecond),
		ProbeTimeout:       pickDuration(opts.ProbeTimeout, time.Second),
		ProbeFailures:      pickInt(opts.ProbeFailuresThreshold, 2),
		GameLeaseTTL:       pickDuration(opts.GameLeaseTTL, 5*time.Second),
		StreamBlockTimeout: pickDuration(opts.StreamBlockTimeout, 200*time.Millisecond),
	})

	logger := newLogger(opts.LogToStderr)

	ctx, cancel := context.WithCancel(context.Background())
	runtime, err := app.NewRuntime(ctx, cfg, logger)
	if err != nil {
		cancel()
		t.Fatalf("rtmanager integration: new runtime: %v", err)
	}

	runDone := make(chan error, 1)
	go func() {
		runDone <- runtime.Run(ctx)
	}()

	internalAddr := waitForListener(t, runtime)
	waitForReady(t, runtime, listenerWaitTimeout)

	var cleanupOnce sync.Once
	t.Cleanup(func() {
		cleanupOnce.Do(func() {
			cancel()
			waitCtx, waitCancel := context.WithTimeout(context.Background(), cleanupShutdownTimeout)
			defer waitCancel()
			select {
			case err := <-runDone:
				if err != nil && !isCleanShutdownErr(err) {
					t.Logf("rtmanager integration: runtime.Run returned: %v", err)
				}
			case <-waitCtx.Done():
				t.Logf("rtmanager integration: runtime did not stop within %s", cleanupShutdownTimeout)
			}
			if err := runtime.Close(); err != nil {
				t.Logf("rtmanager integration: runtime.Close: %v", err)
			}
		})
	})

	return &Env{
		Cfg:             cfg,
		Runtime:         runtime,
		Postgres:        pg,
		Redis:           rd,
		RedisClient:     rd.NewClient(t),
		Docker:          dk,
		Lobby:           lobby,
		Network:         network,
		EngineImageRef:  imageRef,
		PatchedImageRef: PatchedEngineImageRef,
		GameStateRoot:   stateRoot,
		InternalAddr:    internalAddr,
	}
}
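
// Once NewEnv returns, scenarios typically feed work to RTM over the streams
// declared at the top of this file. A sketch using go-redis XAdd; the payload
// field names here are illustrative only, the authoritative wire shape is
// what `streams.go` asserts.
//
//	_, err := env.RedisClient.XAdd(ctx, &redis.XAddArgs{
//		Stream: harness.StartJobsStream,
//		Values: map[string]interface{}{
//			"game_id": gameID,             // assumed field name
//			"image":   env.EngineImageRef, // assumed field name
//		},
//	}).Result()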

type buildConfigInput struct {
	PostgresDSN   string
	RedisAddr     string
	DockerHost    string
	Network       string
	LobbyURL      string
	GameStateRoot string

	ReconcileInterval  time.Duration
	CleanupInterval    time.Duration
	InspectInterval    time.Duration
	ProbeInterval      time.Duration
	ProbeTimeout       time.Duration
	ProbeFailures      int
	GameLeaseTTL       time.Duration
	StreamBlockTimeout time.Duration
}

func buildConfig(in buildConfigInput) config.Config {
	cfg := config.DefaultConfig()

	cfg.InternalHTTP.Addr = listenAddr

	cfg.Docker.Host = in.DockerHost
	cfg.Docker.Network = in.Network
	cfg.Docker.PullPolicy = config.ImagePullPolicyIfMissing

	cfg.Postgres = config.PostgresConfig{
		Conn: postgres.Config{
			PrimaryDSN:       in.PostgresDSN,
			OperationTimeout: pgOperationTimeout,
			MaxOpenConns:     5,
			MaxIdleConns:     2,
			ConnMaxLifetime:  30 * time.Minute,
		},
	}

	cfg.Redis = config.RedisConfig{
		Conn: redisconn.Config{
			MasterAddr:       in.RedisAddr,
			Password:         "integration",
			OperationTimeout: 2 * time.Second,
		},
	}

	cfg.Streams.StartJobs = StartJobsStream
	cfg.Streams.StopJobs = StopJobsStream
	cfg.Streams.JobResults = JobResultsStream
	cfg.Streams.HealthEvents = HealthEventsStream
	cfg.Streams.NotificationIntents = NotificationIntentsKey
	cfg.Streams.BlockTimeout = in.StreamBlockTimeout

	cfg.Container.GameStateRoot = in.GameStateRoot
	// Pin chown target to the current process uid/gid; the dev sandbox
	// (and unprivileged dev machines) cannot chown to root.
	cfg.Container.GameStateOwnerUID = os.Getuid()
	cfg.Container.GameStateOwnerGID = os.Getgid()

	cfg.Health.InspectInterval = in.InspectInterval
	cfg.Health.ProbeInterval = in.ProbeInterval
	cfg.Health.ProbeTimeout = in.ProbeTimeout
	cfg.Health.ProbeFailuresThreshold = in.ProbeFailures

	cfg.Cleanup.ReconcileInterval = in.ReconcileInterval
	cfg.Cleanup.CleanupInterval = in.CleanupInterval

	cfg.Coordination.GameLeaseTTL = in.GameLeaseTTL

	cfg.Lobby = config.LobbyConfig{
		BaseURL: in.LobbyURL,
		Timeout: 2 * time.Second,
	}

	cfg.Telemetry.TracesExporter = "none"
	cfg.Telemetry.MetricsExporter = "none"

	return cfg
}

func newLogger(toStderr bool) *slog.Logger {
	if toStderr {
		return slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelDebug}))
	}
	return slog.New(slog.NewTextHandler(io.Discard, &slog.HandlerOptions{Level: slog.LevelError}))
}

func stateRoot(t *testing.T) string {
	t.Helper()
	dir := t.ArtifactDir()
	root := dir + string(os.PathSeparator) + gameStateRootSubdir
	if err := os.MkdirAll(root, 0o755); err != nil {
		t.Fatalf("rtmanager integration: create game-state root %q: %v", root, err)
	}
	return root
}

func resolveDockerHost() string {
	if host := strings.TrimSpace(os.Getenv("DOCKER_HOST")); host != "" {
		return host
	}
	return "unix:///var/run/docker.sock"
}

func pickDuration(value, fallback time.Duration) time.Duration {
	if value > 0 {
		return value
	}
	return fallback
}

func pickInt(value, fallback int) int {
	if value > 0 {
		return value
	}
	return fallback
}

// waitForListener spins until `runtime.InternalServer().Addr()` returns
// a non-empty value or the deadline fires. The internal listener binds
// during `runtime.Run`, which runs in its own goroutine; this helper
// is the bridge between "Run started" and "tests can use REST".
func waitForListener(t *testing.T, runtime *app.Runtime) string {
	t.Helper()
	deadline := time.Now().Add(listenerWaitTimeout)
	for {
		if runtime != nil && runtime.InternalServer() != nil {
			if addr := runtime.InternalServer().Addr(); addr != "" {
				return addr
			}
		}
		if time.Now().After(deadline) {
			t.Fatalf("rtmanager integration: internal HTTP listener did not bind within %s", listenerWaitTimeout)
		}
		time.Sleep(readyzPollInterval)
	}
}

// waitForReady polls `/readyz` until it returns 200 or the deadline
// fires. RTM's readyz pings PG, Redis, and Docker; a successful
// response means every dependency is reachable through the runtime
// process.
func waitForReady(t *testing.T, runtime *app.Runtime, timeout time.Duration) {
	t.Helper()
	deadline := time.Now().Add(timeout)
	addr := runtime.InternalServer().Addr()
	probeURL := (&url.URL{Scheme: "http", Host: addr, Path: "/readyz"}).String()
	for {
		req, err := newRequest(context.Background(), "GET", probeURL, nil)
		if err == nil {
			resp, err := defaultHTTPClient.Do(req)
			if err == nil {
				_, _ = io.Copy(io.Discard, resp.Body)
				_ = resp.Body.Close()
				if resp.StatusCode == 200 {
					return
				}
			}
		}
		if time.Now().After(deadline) {
			t.Fatalf("rtmanager integration: /readyz did not return 200 within %s", timeout)
		}
		time.Sleep(readyzPollInterval)
	}
}
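
// Because NewEnv runs both wait helpers above before returning, scenario code
// can talk to the internal listener immediately. A minimal sketch using the
// same /readyz path this file polls; any other internal route works the same
// way through env.InternalAddr.
//
//	resp, err := http.Get("http://" + env.InternalAddr + "/readyz")
//	if err == nil {
//		resp.Body.Close()
//	}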

func isCleanShutdownErr(err error) bool {
	return err == nil || errors.Is(err, context.Canceled)
}

// IDFromTestName builds a readable, unique game id from the caller's
// test name. Two tests with the same name running back-to-back would
// otherwise collide on PG state across the per-test `TruncatePostgres`
// window; pinning the suffix to `Now().UnixNano()` rules that out.
func IDFromTestName(t *testing.T) string {
	t.Helper()
	// The container hostname is `galaxy-game-{game_id}` and must fit
	// HOST_NAME_MAX=64 chars; runc rejects longer values with
	// "sethostname: invalid argument". Cap the lowercased test-name
	// component at 35 chars and append a base36 UnixNano suffix
	// (~12 chars for current timestamps) so the total
	// (12 + 35 + 1 + 12 = 60) stays comfortably under the limit.
	const maxNameLen = 35
	suffix := strconv.FormatInt(time.Now().UnixNano(), 36)
	prefix := strings.ToLower(strings.NewReplacer("/", "-", " ", "-").Replace(t.Name()))
	if len(prefix) > maxNameLen {
		prefix = prefix[:maxNameLen]
	}
	return prefix + "-" + suffix
}
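
// A hypothetical guard a scenario could add (the harness does not require it)
// to make the HOST_NAME_MAX budget explicit: RTM derives the container
// hostname as `galaxy-game-` plus the id returned above.
//
//	id := harness.IDFromTestName(t)
//	if hostname := "galaxy-game-" + id; len(hostname) > 64 {
//		t.Fatalf("hostname %q would exceed HOST_NAME_MAX", hostname)
//	}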