// galaxy-game/rtmanager/integration/harness/runtime.go

package harness

import (
	"context"
	"errors"
	"io"
	"log/slog"
	"net/url"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"sync"
	"testing"
	"time"

	"galaxy/postgres"
	"galaxy/redisconn"
	"galaxy/rtmanager/internal/app"
	"galaxy/rtmanager/internal/config"

	"github.com/redis/go-redis/v9"
)

// Default stream key shapes used by the integration suite. They match
// the production defaults so the wire shapes asserted in `streams.go`
// are identical to what Game Lobby sees in `integration/lobbyrtm`.
const (
	StartJobsStream        = "runtime:start_jobs"
	StopJobsStream         = "runtime:stop_jobs"
	JobResultsStream       = "runtime:job_results"
	HealthEventsStream     = "runtime:health_events"
	NotificationIntentsKey = "notification:intents"

	gameStateRootSubdir    = "game-state"
	listenAddr             = "127.0.0.1:0"
	listenerWaitTimeout    = 10 * time.Second
	readyzPollInterval     = 25 * time.Millisecond
	cleanupShutdownTimeout = 30 * time.Second
)
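
// For example, a scenario can assert that a job result landed on the
// results stream with the per-test client (a sketch; the real
// assertions on entry contents live in `streams.go` and are elided):
//
//	entries, err := env.RedisClient.XRange(ctx, harness.JobResultsStream, "-", "+").Result()
//	if err != nil {
//		t.Fatalf("read job results: %v", err)
//	}
//	if len(entries) == 0 {
//		t.Fatal("expected at least one job result entry")
//	}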

// Env carries everything one integration scenario needs to drive the
// Runtime Manager process. All fields are exported so tests reach them
// directly, without intermediate getters.
type Env struct {
	// Cfg is the resolved Runtime Manager configuration handed to
	// `app.NewRuntime`. Tests inspect it for stream key shapes,
	// container defaults, and timeout knobs.
	Cfg config.Config

	// Runtime is the in-process Runtime Manager, exposed for tests that
	// need to peek at internal state (`runtime.InternalServer().Addr()`).
	Runtime *app.Runtime

	// Postgres holds the per-package PostgreSQL fixture.
	Postgres *PostgresEnv

	// Redis holds the per-package Redis fixture plus a fresh client the
	// test owns.
	Redis       *RedisEnv
	RedisClient *redis.Client

	// Docker holds the per-package Docker daemon handle.
	Docker *DockerEnv

	// Lobby is the per-test stub HTTP server.
	Lobby *LobbyStub

	// Network is the unique Docker network name created for this test.
	Network string

	// EngineImageRef and PatchedImageRef are the two semver-compatible
	// engine image tags the harness builds once per package. Patch
	// scenarios point at the second tag.
	EngineImageRef  string
	PatchedImageRef string

	// GameStateRoot is the host filesystem path RTM writes per-game
	// state directories under. It lives inside `t.ArtifactDir()` so
	// failed scenarios leave the engine state behind for inspection.
	GameStateRoot string

	// InternalAddr is the bound address of RTM's internal HTTP listener
	// (resolved after Run binds the port).
	InternalAddr string
}
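
// As a quick illustration, InternalAddr lets a test talk to RTM's
// internal HTTP listener directly (a sketch; `/readyz` is the endpoint
// waitForReady below polls):
//
//	resp, err := http.Get("http://" + env.InternalAddr + "/readyz")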

// EnvOptions carries per-test overrides of the harness defaults. Empty
// fields fall back to the defaults declared at the top of this file.
type EnvOptions struct {
	// ReconcileInterval overrides the periodic reconciler interval.
	// It defaults to 500ms so reconcile drift is observable inside a
	// single scenario timeout.
	ReconcileInterval time.Duration

	// CleanupInterval overrides the container-cleanup interval.
	CleanupInterval time.Duration

	// InspectInterval overrides the Docker inspect worker interval.
	InspectInterval time.Duration

	// ProbeInterval / ProbeTimeout / ProbeFailuresThreshold override
	// the active engine probe knobs.
	ProbeInterval          time.Duration
	ProbeTimeout           time.Duration
	ProbeFailuresThreshold int

	// GameLeaseTTL overrides the per-game Redis lease TTL.
	GameLeaseTTL time.Duration

	// StreamBlockTimeout overrides the consumer XREAD block window.
	StreamBlockTimeout time.Duration

	// LogToStderr makes the harness write the runtime's structured
	// logs to stderr; the default discards them so test output stays
	// focused on assertions.
	LogToStderr bool
}

// NewEnv stands up a fresh Runtime Manager process for the calling
// test. It blocks until the internal HTTP listener is bound and
// `/readyz` reports healthy; tests can issue REST and stream requests
// immediately after the call returns.
//
// `t.Cleanup` runs in reverse registration order: stop the runtime,
// close the runtime, close the per-test redis client, remove the docker
// network, terminate the lobby stub. Containers RTM created during the
// test are removed by the test's own cleanup paths or by the
// integration `health_test` external-action helpers.
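//
// A minimal call site looks like this (a sketch; what the test does
// with the id and the streams is elided):
//
//	func TestStartGame(t *testing.T) {
//		env := harness.NewEnv(t, harness.EnvOptions{
//			ProbeInterval: 100 * time.Millisecond,
//		})
//		id := harness.IDFromTestName(t)
//		_ = id // enqueue a start job, assert on env.RedisClient streams, ...
//	}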
func NewEnv(t *testing.T, opts EnvOptions) *Env {
	t.Helper()

	pg := EnsurePostgres(t)
	rd := EnsureRedis(t)
	dk := EnsureDocker(t)
	imageRef := EnsureEngineImage(t)
	TruncatePostgres(t)
	FlushRedis(t)
	network := EnsureNetwork(t)
	lobby := NewLobbyStub(t)
	root := stateRoot(t)

	cfg := buildConfig(buildConfigInput{
		PostgresDSN:        pg.DSN(),
		RedisAddr:          rd.Addr(),
		DockerHost:         resolveDockerHost(),
		Network:            network,
		LobbyURL:           lobby.URL(),
		GameStateRoot:      root,
		ReconcileInterval:  pickDuration(opts.ReconcileInterval, 500*time.Millisecond),
		CleanupInterval:    pickDuration(opts.CleanupInterval, 500*time.Millisecond),
		InspectInterval:    pickDuration(opts.InspectInterval, 500*time.Millisecond),
		ProbeInterval:      pickDuration(opts.ProbeInterval, 500*time.Millisecond),
		ProbeTimeout:       pickDuration(opts.ProbeTimeout, time.Second),
		ProbeFailures:      pickInt(opts.ProbeFailuresThreshold, 2),
		GameLeaseTTL:       pickDuration(opts.GameLeaseTTL, 5*time.Second),
		StreamBlockTimeout: pickDuration(opts.StreamBlockTimeout, 200*time.Millisecond),
	})

	logger := newLogger(opts.LogToStderr)
	ctx, cancel := context.WithCancel(context.Background())
	runtime, err := app.NewRuntime(ctx, cfg, logger)
	if err != nil {
		cancel()
		t.Fatalf("rtmanager integration: new runtime: %v", err)
	}

	runDone := make(chan error, 1)
	go func() {
		runDone <- runtime.Run(ctx)
	}()

	internalAddr := waitForListener(t, runtime)
	waitForReady(t, runtime, listenerWaitTimeout)

	var cleanupOnce sync.Once
	t.Cleanup(func() {
		cleanupOnce.Do(func() {
			cancel()
			waitCtx, waitCancel := context.WithTimeout(context.Background(), cleanupShutdownTimeout)
			defer waitCancel()
			select {
			case err := <-runDone:
				if err != nil && !isCleanShutdownErr(err) {
					t.Logf("rtmanager integration: runtime.Run returned: %v", err)
				}
			case <-waitCtx.Done():
				t.Logf("rtmanager integration: runtime did not stop within %s", cleanupShutdownTimeout)
			}
			if err := runtime.Close(); err != nil {
				t.Logf("rtmanager integration: runtime.Close: %v", err)
			}
		})
	})

	return &Env{
		Cfg:             cfg,
		Runtime:         runtime,
		Postgres:        pg,
		Redis:           rd,
		RedisClient:     rd.NewClient(t),
		Docker:          dk,
		Lobby:           lobby,
		Network:         network,
		EngineImageRef:  imageRef,
		PatchedImageRef: PatchedEngineImageRef,
		GameStateRoot:   root,
		InternalAddr:    internalAddr,
	}
}

// buildConfigInput collects everything buildConfig needs to assemble
// the integration configuration.
type buildConfigInput struct {
	PostgresDSN        string
	RedisAddr          string
	DockerHost         string
	Network            string
	LobbyURL           string
	GameStateRoot      string
	ReconcileInterval  time.Duration
	CleanupInterval    time.Duration
	InspectInterval    time.Duration
	ProbeInterval      time.Duration
	ProbeTimeout       time.Duration
	ProbeFailures      int
	GameLeaseTTL       time.Duration
	StreamBlockTimeout time.Duration
}

// buildConfig derives the harness configuration from the production
// defaults, overriding only what the fixtures and per-test options
// control.
func buildConfig(in buildConfigInput) config.Config {
	cfg := config.DefaultConfig()
	cfg.InternalHTTP.Addr = listenAddr

	cfg.Docker.Host = in.DockerHost
	cfg.Docker.Network = in.Network
	cfg.Docker.PullPolicy = config.ImagePullPolicyIfMissing

	cfg.Postgres = config.PostgresConfig{
		Conn: postgres.Config{
			PrimaryDSN:       in.PostgresDSN,
			OperationTimeout: pgOperationTimeout,
			MaxOpenConns:     5,
			MaxIdleConns:     2,
			ConnMaxLifetime:  30 * time.Minute,
		},
	}
	cfg.Redis = config.RedisConfig{
		Conn: redisconn.Config{
			MasterAddr:       in.RedisAddr,
			Password:         "integration",
			OperationTimeout: 2 * time.Second,
		},
	}

	cfg.Streams.StartJobs = StartJobsStream
	cfg.Streams.StopJobs = StopJobsStream
	cfg.Streams.JobResults = JobResultsStream
	cfg.Streams.HealthEvents = HealthEventsStream
	cfg.Streams.NotificationIntents = NotificationIntentsKey
	cfg.Streams.BlockTimeout = in.StreamBlockTimeout

	cfg.Container.GameStateRoot = in.GameStateRoot
	// Pin the chown target to the current process uid/gid; the dev
	// sandbox (and unprivileged dev machines) cannot chown to root.
	cfg.Container.GameStateOwnerUID = os.Getuid()
	cfg.Container.GameStateOwnerGID = os.Getgid()

	cfg.Health.InspectInterval = in.InspectInterval
	cfg.Health.ProbeInterval = in.ProbeInterval
	cfg.Health.ProbeTimeout = in.ProbeTimeout
	cfg.Health.ProbeFailuresThreshold = in.ProbeFailures

	cfg.Cleanup.ReconcileInterval = in.ReconcileInterval
	cfg.Cleanup.CleanupInterval = in.CleanupInterval
	cfg.Coordination.GameLeaseTTL = in.GameLeaseTTL

	cfg.Lobby = config.LobbyConfig{
		BaseURL: in.LobbyURL,
		Timeout: 2 * time.Second,
	}

	cfg.Telemetry.TracesExporter = "none"
	cfg.Telemetry.MetricsExporter = "none"
	return cfg
}

// newLogger returns a debug-level stderr logger when toStderr is set
// and otherwise discards the runtime's logs.
func newLogger(toStderr bool) *slog.Logger {
	if toStderr {
		return slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelDebug}))
	}
	return slog.New(slog.NewTextHandler(io.Discard, &slog.HandlerOptions{Level: slog.LevelError}))
}

// stateRoot creates the per-test game-state directory under the test's
// artifact directory and returns its path.
func stateRoot(t *testing.T) string {
	t.Helper()
	root := filepath.Join(t.ArtifactDir(), gameStateRootSubdir)
	if err := os.MkdirAll(root, 0o755); err != nil {
		t.Fatalf("rtmanager integration: create game-state root %q: %v", root, err)
	}
	return root
}

// resolveDockerHost honors DOCKER_HOST when set and falls back to the
// default Unix socket.
func resolveDockerHost() string {
	if host := strings.TrimSpace(os.Getenv("DOCKER_HOST")); host != "" {
		return host
	}
	return "unix:///var/run/docker.sock"
}

// pickDuration and pickInt return value when it is positive and
// fallback otherwise, so zero-valued EnvOptions fields select the
// harness defaults.
func pickDuration(value, fallback time.Duration) time.Duration {
	if value > 0 {
		return value
	}
	return fallback
}

func pickInt(value, fallback int) int {
	if value > 0 {
		return value
	}
	return fallback
}

// waitForListener spins until `runtime.InternalServer().Addr()` returns
// a non-empty value or the deadline fires. The internal listener binds
// during `runtime.Run`, which runs in its own goroutine; this helper
// is the bridge between "Run started" and "tests can use REST".
func waitForListener(t *testing.T, runtime *app.Runtime) string {
	t.Helper()
	deadline := time.Now().Add(listenerWaitTimeout)
	for {
		if runtime != nil && runtime.InternalServer() != nil {
			if addr := runtime.InternalServer().Addr(); addr != "" {
				return addr
			}
		}
		if time.Now().After(deadline) {
			t.Fatalf("rtmanager integration: internal HTTP listener did not bind within %s", listenerWaitTimeout)
		}
		time.Sleep(readyzPollInterval)
	}
}

// waitForReady polls `/readyz` until it returns 200 or the deadline
// fires. RTM's readyz pings PG, Redis, and Docker; a successful
// response means every dependency is reachable through the runtime
// process.
func waitForReady(t *testing.T, runtime *app.Runtime, timeout time.Duration) {
	t.Helper()
	deadline := time.Now().Add(timeout)
	addr := runtime.InternalServer().Addr()
	probeURL := (&url.URL{Scheme: "http", Host: addr, Path: "/readyz"}).String()
	for {
		req, err := newRequest(context.Background(), "GET", probeURL, nil)
		if err == nil {
			resp, err := defaultHTTPClient.Do(req)
			if err == nil {
				_, _ = io.Copy(io.Discard, resp.Body)
				_ = resp.Body.Close()
				if resp.StatusCode == 200 {
					return
				}
			}
		}
		if time.Now().After(deadline) {
			t.Fatalf("rtmanager integration: /readyz did not return 200 within %s", timeout)
		}
		time.Sleep(readyzPollInterval)
	}
}

// isCleanShutdownErr reports whether runtime.Run exited the way a
// cancelled context is expected to make it exit.
func isCleanShutdownErr(err error) bool {
	return err == nil || errors.Is(err, context.Canceled)
}

// IDFromTestName builds a readable, per-run-unique game id from the
// caller's test name. Two runs of a same-named test back-to-back would
// otherwise reuse the same id and collide on PostgreSQL state despite
// the per-test `TruncatePostgres`; pinning the suffix to
// `Now().UnixNano()` rules that out.
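//
// For example (the suffix shown is illustrative):
//
//	// t.Name() == "TestStartGame/happy_path"
//	id := IDFromTestName(t) // "teststartgame-happy_path-" + base36(UnixNano)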
func IDFromTestName(t *testing.T) string {
	t.Helper()
	// The container hostname is `galaxy-game-{game_id}` and must fit
	// HOST_NAME_MAX=64 chars; runc rejects longer values with
	// "sethostname: invalid argument". Cap the lowercased test-name
	// component at 35 chars; the base36 UnixNano suffix is 12-13
	// chars, so the total stays under the limit
	// (12 + 35 + 1 + 13 = 61).
	const maxNameLen = 35
	suffix := strconv.FormatInt(time.Now().UnixNano(), 36)
	prefix := strings.ToLower(strings.NewReplacer("/", "-", " ", "-").Replace(t.Name()))
	if len(prefix) > maxNameLen {
		prefix = prefix[:maxNameLen]
	}
	return prefix + "-" + suffix
}