feat: runtime manager
This commit is contained in:
@@ -0,0 +1,398 @@
|
||||
package harness
|
||||
|
||||
import (
	"context"
	"errors"
	"io"
	"log/slog"
	"net/url"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"sync"
	"testing"
	"time"

	"galaxy/postgres"
	"galaxy/redisconn"
	"galaxy/rtmanager/internal/app"
	"galaxy/rtmanager/internal/config"

	"github.com/redis/go-redis/v9"
)
|
||||
|
||||
// Default stream key shapes used by the integration suite. They match
// the production defaults so the wire shapes asserted in `streams.go`
// are identical to what Game Lobby sees in `integration/lobbyrtm`.
const (
	StartJobsStream        = "runtime:start_jobs"
	StopJobsStream         = "runtime:stop_jobs"
	JobResultsStream       = "runtime:job_results"
	HealthEventsStream     = "runtime:health_events"
	NotificationIntentsKey = "notification:intents"

	// gameStateRootSubdir is the directory name created under the
	// test's artifact dir to hold per-game engine state (see stateRoot).
	gameStateRootSubdir = "game-state"

	// listenAddr binds the internal HTTP server to an ephemeral
	// loopback port (port 0 lets the kernel pick a free one).
	listenAddr = "127.0.0.1:0"

	// listenerWaitTimeout bounds how long NewEnv waits for the internal
	// listener to bind and for /readyz to report healthy.
	listenerWaitTimeout = 10 * time.Second

	// readyzPollInterval is the poll cadence shared by the listener and
	// /readyz wait loops.
	readyzPollInterval = 25 * time.Millisecond

	// cleanupShutdownTimeout bounds how long test cleanup waits for
	// runtime.Run to return after context cancellation.
	cleanupShutdownTimeout = 30 * time.Second
)
|
||||
|
||||
// Env carries everything one integration scenario needs to drive the
// Runtime Manager process. The struct is value-typed so tests reach
// fields without intermediate getters.
type Env struct {
	// Cfg is the resolved Runtime Manager configuration handed to
	// `app.NewRuntime`. Tests inspect it for stream key shapes,
	// container defaults, and timeout knobs.
	Cfg config.Config

	// Runtime is the in-process Runtime Manager exposed for tests that
	// need to peek at internal state (`runtime.InternalServer().Addr()`).
	Runtime *app.Runtime

	// Postgres holds the per-package PostgreSQL fixture.
	Postgres *PostgresEnv

	// Redis holds the per-package Redis fixture plus a fresh client the
	// test owns.
	Redis *RedisEnv
	// RedisClient is a client dedicated to this test, created through
	// Redis.NewClient(t); NewEnv's cleanup contract closes it when the
	// test ends.
	RedisClient *redis.Client

	// Docker holds the per-package Docker daemon handle.
	Docker *DockerEnv

	// Lobby is the per-test stub HTTP server.
	Lobby *LobbyStub

	// Network is the unique Docker network name created for this test.
	Network string

	// EngineImageRef and PatchedImageRef are the two semver-compatible
	// engine image tags the harness builds once per package. Patch
	// scenarios point at the second tag.
	EngineImageRef  string
	PatchedImageRef string

	// GameStateRoot is the host filesystem path RTM writes per-game
	// state directories under. It lives inside `t.ArtifactDir()` so
	// failed scenarios leave the engine state behind for inspection.
	GameStateRoot string

	// InternalAddr is the bound address of RTM's internal HTTP listener
	// (resolved after Run binds the port).
	InternalAddr string
}
|
||||
|
||||
// EnvOptions carry per-test overrides to the harness defaults. Empty
// (zero or negative) duration fields and a zero ProbeFailuresThreshold
// fall back to the defaults NewEnv applies via pickDuration/pickInt, so
// the zero value is a valid "all defaults" configuration.
type EnvOptions struct {
	// ReconcileInterval overrides the periodic reconciler interval.
	// Default 500ms (so reconcile drift is observable inside a single
	// scenario timeout).
	ReconcileInterval time.Duration

	// CleanupInterval overrides the container-cleanup interval.
	// Default 500ms.
	CleanupInterval time.Duration

	// InspectInterval overrides the Docker inspect worker interval.
	// Default 500ms.
	InspectInterval time.Duration

	// ProbeInterval / ProbeTimeout / ProbeFailuresThreshold override
	// the active engine probe knobs. Defaults: 500ms / 1s / 2.
	ProbeInterval          time.Duration
	ProbeTimeout           time.Duration
	ProbeFailuresThreshold int

	// GameLeaseTTL overrides the per-game Redis lease TTL. Default 5s.
	GameLeaseTTL time.Duration

	// StreamBlockTimeout overrides the consumer XREAD block window.
	// Default 200ms.
	StreamBlockTimeout time.Duration

	// LogToStderr makes the harness write the runtime's structured
	// logs to stderr; the default discards them so test output stays
	// focused on assertions.
	LogToStderr bool
}
|
||||
|
||||
// NewEnv stands up a fresh Runtime Manager process for the calling
// test. It blocks until the internal HTTP listener is bound; tests can
// issue REST and stream requests immediately after the call returns.
//
// `t.Cleanup` runs in reverse order: stop the runtime, close the
// runtime, close the per-test redis client, remove the docker network,
// terminate the lobby stub. Containers RTM created during the test are
// removed by the test's own cleanup paths or by the integration
// `health_test` external-action helpers.
func NewEnv(t *testing.T, opts EnvOptions) *Env {
	t.Helper()

	// Per-package fixtures first (postgres/redis/docker/engine image),
	// then the per-test pieces (network, lobby stub, state dir).
	// Truncate/flush run AFTER the fixtures exist so each test starts
	// from empty storage.
	pg := EnsurePostgres(t)
	rd := EnsureRedis(t)
	dk := EnsureDocker(t)
	imageRef := EnsureEngineImage(t)
	TruncatePostgres(t)
	FlushRedis(t)
	network := EnsureNetwork(t)
	lobby := NewLobbyStub(t)
	stateRoot := stateRoot(t) // shadows the helper; only the string is needed below

	// Resolve every timing knob through pickDuration/pickInt so zero
	// fields in opts fall back to the harness defaults.
	cfg := buildConfig(buildConfigInput{
		PostgresDSN:        pg.DSN(),
		RedisAddr:          rd.Addr(),
		DockerHost:         resolveDockerHost(),
		Network:            network,
		LobbyURL:           lobby.URL(),
		GameStateRoot:      stateRoot,
		ReconcileInterval:  pickDuration(opts.ReconcileInterval, 500*time.Millisecond),
		CleanupInterval:    pickDuration(opts.CleanupInterval, 500*time.Millisecond),
		InspectInterval:    pickDuration(opts.InspectInterval, 500*time.Millisecond),
		ProbeInterval:      pickDuration(opts.ProbeInterval, 500*time.Millisecond),
		ProbeTimeout:       pickDuration(opts.ProbeTimeout, time.Second),
		ProbeFailures:      pickInt(opts.ProbeFailuresThreshold, 2),
		GameLeaseTTL:       pickDuration(opts.GameLeaseTTL, 5*time.Second),
		StreamBlockTimeout: pickDuration(opts.StreamBlockTimeout, 200*time.Millisecond),
	})

	logger := newLogger(opts.LogToStderr)

	ctx, cancel := context.WithCancel(context.Background())

	runtime, err := app.NewRuntime(ctx, cfg, logger)
	if err != nil {
		cancel() // release the context before failing; nothing else has started yet
		t.Fatalf("rtmanager integration: new runtime: %v", err)
	}

	// runDone carries Run's exit error to the cleanup below. Buffer of 1
	// so the goroutine can always send even if cleanup times out and
	// abandons the channel.
	runDone := make(chan error, 1)
	go func() {
		runDone <- runtime.Run(ctx)
	}()

	// Run binds the listener on its own goroutine; block here until REST
	// is usable and every dependency answers /readyz.
	internalAddr := waitForListener(t, runtime)
	waitForReady(t, runtime, listenerWaitTimeout)

	// sync.Once guards against cancel/Close ever executing twice.
	var cleanupOnce sync.Once
	t.Cleanup(func() {
		cleanupOnce.Do(func() {
			cancel()
			waitCtx, waitCancel := context.WithTimeout(context.Background(), cleanupShutdownTimeout)
			defer waitCancel()
			select {
			case err := <-runDone:
				// context.Canceled is the expected shutdown result;
				// only surface anything else.
				if err != nil && !isCleanShutdownErr(err) {
					t.Logf("rtmanager integration: runtime.Run returned: %v", err)
				}
			case <-waitCtx.Done():
				t.Logf("rtmanager integration: runtime did not stop within %s", cleanupShutdownTimeout)
			}
			if err := runtime.Close(); err != nil {
				t.Logf("rtmanager integration: runtime.Close: %v", err)
			}
		})
	})

	return &Env{
		Cfg:             cfg,
		Runtime:         runtime,
		Postgres:        pg,
		Redis:           rd,
		RedisClient:     rd.NewClient(t),
		Docker:          dk,
		Lobby:           lobby,
		Network:         network,
		EngineImageRef:  imageRef,
		PatchedImageRef: PatchedEngineImageRef, // package-level tag; built once per package
		GameStateRoot:   stateRoot,
		InternalAddr:    internalAddr,
	}
}
|
||||
|
||||
// buildConfigInput aggregates everything buildConfig needs to assemble a
// config.Config: endpoints of the per-package fixtures plus the timing
// knobs already resolved (via pickDuration/pickInt) from EnvOptions.
type buildConfigInput struct {
	PostgresDSN   string
	RedisAddr     string
	DockerHost    string
	Network       string
	LobbyURL      string
	GameStateRoot string

	// Timing knobs; NewEnv guarantees these carry positive values.
	ReconcileInterval  time.Duration
	CleanupInterval    time.Duration
	InspectInterval    time.Duration
	ProbeInterval      time.Duration
	ProbeTimeout       time.Duration
	ProbeFailures      int
	GameLeaseTTL       time.Duration
	StreamBlockTimeout time.Duration
}
|
||||
|
||||
// buildConfig assembles the full Runtime Manager configuration for one
// test: library defaults from config.DefaultConfig() with every
// environment-dependent field overridden from in.
func buildConfig(in buildConfigInput) config.Config {
	cfg := config.DefaultConfig()
	// Ephemeral loopback port; the bound address is recovered later via
	// runtime.InternalServer().Addr().
	cfg.InternalHTTP.Addr = listenAddr

	cfg.Docker.Host = in.DockerHost
	cfg.Docker.Network = in.Network
	cfg.Docker.PullPolicy = config.ImagePullPolicyIfMissing

	cfg.Postgres = config.PostgresConfig{
		Conn: postgres.Config{
			PrimaryDSN:       in.PostgresDSN,
			OperationTimeout: pgOperationTimeout, // package-level constant defined elsewhere in this package
			MaxOpenConns:     5,
			MaxIdleConns:     2,
			ConnMaxLifetime:  30 * time.Minute,
		},
	}

	cfg.Redis = config.RedisConfig{
		Conn: redisconn.Config{
			MasterAddr:       in.RedisAddr,
			Password:         "integration",
			OperationTimeout: 2 * time.Second,
		},
	}

	// Stream keys mirror the production shapes (see the constants at
	// the top of this file).
	cfg.Streams.StartJobs = StartJobsStream
	cfg.Streams.StopJobs = StopJobsStream
	cfg.Streams.JobResults = JobResultsStream
	cfg.Streams.HealthEvents = HealthEventsStream
	cfg.Streams.NotificationIntents = NotificationIntentsKey
	cfg.Streams.BlockTimeout = in.StreamBlockTimeout

	cfg.Container.GameStateRoot = in.GameStateRoot
	// Pin chown target to the current process uid/gid; the dev sandbox
	// (and unprivileged dev machines) cannot chown to root.
	cfg.Container.GameStateOwnerUID = os.Getuid()
	cfg.Container.GameStateOwnerGID = os.Getgid()

	cfg.Health.InspectInterval = in.InspectInterval
	cfg.Health.ProbeInterval = in.ProbeInterval
	cfg.Health.ProbeTimeout = in.ProbeTimeout
	cfg.Health.ProbeFailuresThreshold = in.ProbeFailures

	cfg.Cleanup.ReconcileInterval = in.ReconcileInterval
	cfg.Cleanup.CleanupInterval = in.CleanupInterval

	cfg.Coordination.GameLeaseTTL = in.GameLeaseTTL

	cfg.Lobby = config.LobbyConfig{
		BaseURL: in.LobbyURL,
		Timeout: 2 * time.Second,
	}

	// Tests assert behavior, not telemetry; disable both exporters.
	cfg.Telemetry.TracesExporter = "none"
	cfg.Telemetry.MetricsExporter = "none"

	return cfg
}
|
||||
|
||||
// newLogger builds the runtime's structured logger. With toStderr set it
// emits debug-level text to stderr; otherwise error-level output goes to
// io.Discard so test output stays focused on assertions.
func newLogger(toStderr bool) *slog.Logger {
	var (
		sink  io.Writer  = io.Discard
		level slog.Level = slog.LevelError
	)
	if toStderr {
		sink = os.Stderr
		level = slog.LevelDebug
	}
	return slog.New(slog.NewTextHandler(sink, &slog.HandlerOptions{Level: level}))
}
|
||||
|
||||
func stateRoot(t *testing.T) string {
|
||||
t.Helper()
|
||||
dir := t.ArtifactDir()
|
||||
root := dir + string(os.PathSeparator) + gameStateRootSubdir
|
||||
if err := os.MkdirAll(root, 0o755); err != nil {
|
||||
t.Fatalf("rtmanager integration: create game-state root %q: %v", root, err)
|
||||
}
|
||||
return root
|
||||
}
|
||||
|
||||
// resolveDockerHost returns the Docker daemon endpoint: the DOCKER_HOST
// environment variable (whitespace-trimmed) when non-empty, otherwise
// the default local Unix socket.
func resolveDockerHost() string {
	host := strings.TrimSpace(os.Getenv("DOCKER_HOST"))
	if host == "" {
		host = "unix:///var/run/docker.sock"
	}
	return host
}
|
||||
|
||||
// pickDuration returns value when it is positive; zero or negative
// overrides are treated as "not set" and yield fallback.
func pickDuration(value, fallback time.Duration) time.Duration {
	if value <= 0 {
		return fallback
	}
	return value
}
|
||||
|
||||
// pickInt returns value when it is positive; zero or negative overrides
// are treated as "not set" and yield fallback.
func pickInt(value, fallback int) int {
	if value <= 0 {
		return fallback
	}
	return value
}
|
||||
|
||||
// waitForListener spins until `runtime.InternalServer().Addr()` returns
|
||||
// a non-empty value or the deadline fires. The internal listener binds
|
||||
// during `runtime.Run`, which runs in its own goroutine; this helper
|
||||
// is the bridge between "Run started" and "tests can use REST".
|
||||
func waitForListener(t *testing.T, runtime *app.Runtime) string {
|
||||
t.Helper()
|
||||
deadline := time.Now().Add(listenerWaitTimeout)
|
||||
for {
|
||||
if runtime != nil && runtime.InternalServer() != nil {
|
||||
if addr := runtime.InternalServer().Addr(); addr != "" {
|
||||
return addr
|
||||
}
|
||||
}
|
||||
if time.Now().After(deadline) {
|
||||
t.Fatalf("rtmanager integration: internal HTTP listener did not bind within %s", listenerWaitTimeout)
|
||||
}
|
||||
time.Sleep(readyzPollInterval)
|
||||
}
|
||||
}
|
||||
|
||||
// waitForReady polls `/readyz` until it returns 200 or the deadline
|
||||
// fires. RTM's readyz pings PG, Redis, and Docker; a successful
|
||||
// response means every dependency is reachable through the runtime
|
||||
// process.
|
||||
func waitForReady(t *testing.T, runtime *app.Runtime, timeout time.Duration) {
|
||||
t.Helper()
|
||||
deadline := time.Now().Add(timeout)
|
||||
addr := runtime.InternalServer().Addr()
|
||||
probeURL := (&url.URL{Scheme: "http", Host: addr, Path: "/readyz"}).String()
|
||||
for {
|
||||
req, err := newRequest(context.Background(), "GET", probeURL, nil)
|
||||
if err == nil {
|
||||
resp, err := defaultHTTPClient.Do(req)
|
||||
if err == nil {
|
||||
_, _ = io.Copy(io.Discard, resp.Body)
|
||||
_ = resp.Body.Close()
|
||||
if resp.StatusCode == 200 {
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
if time.Now().After(deadline) {
|
||||
t.Fatalf("rtmanager integration: /readyz did not return 200 within %s", timeout)
|
||||
}
|
||||
time.Sleep(readyzPollInterval)
|
||||
}
|
||||
}
|
||||
|
||||
// isCleanShutdownErr reports whether err is an expected outcome of a
// cancelled runtime: nil, or context.Canceled anywhere in its chain.
func isCleanShutdownErr(err error) bool {
	if err == nil {
		return true
	}
	return errors.Is(err, context.Canceled)
}
|
||||
|
||||
// IDFromTestName builds a deterministic-but-unique game id from the
// caller's test name. Two tests with the same name running back-to-back
// would otherwise collide on PG state through the per-test
// `TruncatePostgres` window; pinning the suffix to `Now().UnixNano()`
// rules that out.
func IDFromTestName(t *testing.T) string {
	t.Helper()
	// The container hostname is `galaxy-game-{game_id}` and must fit
	// HOST_NAME_MAX=64 chars; runc rejects longer values with
	// "sethostname: invalid argument". Cap the lowercased test-name
	// component at 35 chars; the base36 UnixNano suffix is ~13 chars
	// at present timestamps, so the total (12-char hostname prefix +
	// 35 + 1 + ~13 ≈ 61) stays comfortably under the limit.
	const maxNameLen = 35
	suffix := strconv.FormatInt(time.Now().UnixNano(), 36)
	// Subtest separators ("/") and spaces are folded to hyphens.
	// NOTE(review): other characters (e.g. "_") pass through unchanged —
	// confirm test names stay hostname-safe.
	prefix := strings.ToLower(strings.NewReplacer("/", "-", " ", "-").Replace(t.Name()))
	if len(prefix) > maxNameLen {
		prefix = prefix[:maxNameLen]
	}
	return prefix + "-" + suffix
}
|
||||
Reference in New Issue
Block a user