feat: runtime manager

Ilia Denisov
2026-04-28 20:39:18 +02:00
committed by GitHub
parent e0a99b346b
commit a7cee15115
289 changed files with 45660 additions and 2207 deletions
@@ -0,0 +1,289 @@
package harness
import (
"context"
"crypto/rand"
"encoding/hex"
"encoding/json"
"fmt"
"net"
"net/http"
"os/exec"
"strings"
"testing"
"time"
)
const (
dockerNetworkPrefix = "lobbyrtm-it-"
dockerNetworkTimeout = 30 * time.Second
dockerCLITimeout = 30 * time.Second
containerHealthzPort = 8080
containerHealthzTimeout = 5 * time.Second
containerHealthzPoll = 100 * time.Millisecond
)
// EnsureDockerNetwork creates a uniquely-named Docker bridge network
// for the caller's test and registers cleanup. Each test gets its own
// network so concurrent scenarios cannot collide on the per-game DNS
// hostname (`galaxy-game-{game_id}`). The helper skips the test when
// no Docker daemon is reachable.
func EnsureDockerNetwork(t testing.TB) string {
t.Helper()
requireDockerDaemon(t)
name := dockerNetworkPrefix + uniqueSuffix(t)
ctx, cancel := context.WithTimeout(context.Background(), dockerNetworkTimeout)
defer cancel()
cmd := exec.CommandContext(ctx, "docker", "network", "create", "--driver", "bridge", name)
output, err := cmd.CombinedOutput()
if err != nil {
t.Fatalf("integration harness: create docker network %q: %v; output:\n%s",
name, err, strings.TrimSpace(string(output)))
}
t.Cleanup(func() {
cleanupCtx, cleanupCancel := context.WithTimeout(context.Background(), dockerNetworkTimeout)
defer cleanupCancel()
removeCmd := exec.CommandContext(cleanupCtx, "docker", "network", "rm", name)
if rmErr := removeCmd.Run(); rmErr != nil {
t.Logf("integration harness: remove docker network %q: %v", name, rmErr)
}
})
return name
}
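// Illustrative usage only (not part of this commit): a caller test package
// that imports this harness could acquire its private network before starting
// any engine container. The test name is hypothetical.
//
//	func TestEngineStartsOnIsolatedNetwork(t *testing.T) {
//		network := harness.EnsureDockerNetwork(t) // skips when no daemon is reachable
//		t.Logf("engine containers for this test join %s", network)
//		// ... start the engine container attached to `network` ...
//	}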
// FindContainerIDByLabel returns the id of the single rtmanager-owned
// container (running or exited) labelled with the given game id, or an
// empty string when no match is found. The label keys are the ones
// rtmanager attaches at start time (`com.galaxy.owner=rtmanager`,
// `com.galaxy.game_id=<gameID>`).
func FindContainerIDByLabel(t testing.TB, gameID string) string {
t.Helper()
requireDockerDaemon(t)
ctx, cancel := context.WithTimeout(context.Background(), dockerCLITimeout)
defer cancel()
cmd := exec.CommandContext(ctx, "docker", "ps", "-aq", "--no-trunc",
"--filter", "label=com.galaxy.owner=rtmanager",
"--filter", "label=com.galaxy.game_id="+gameID,
)
output, err := cmd.CombinedOutput()
if err != nil {
t.Fatalf("integration harness: docker ps for game %s: %v; output:\n%s",
gameID, err, strings.TrimSpace(string(output)))
}
id := strings.TrimSpace(string(output))
if id == "" {
return ""
}
if strings.Contains(id, "\n") {
t.Fatalf("integration harness: multiple containers for game %s:\n%s", gameID, id)
}
return id
}
// ContainerState returns the runtime state string (e.g. `running`,
// `exited`) of the container with the given id, looked up via
// `docker inspect`.
func ContainerState(t testing.TB, containerID string) string {
t.Helper()
requireDockerDaemon(t)
ctx, cancel := context.WithTimeout(context.Background(), dockerCLITimeout)
defer cancel()
cmd := exec.CommandContext(ctx, "docker", "inspect", "--format", "{{.State.Status}}", containerID)
output, err := cmd.CombinedOutput()
if err != nil {
t.Fatalf("integration harness: docker inspect %s: %v; output:\n%s",
containerID, err, strings.TrimSpace(string(output)))
}
return strings.TrimSpace(string(output))
}
// ContainerNetworkIP returns the IPv4 address of the named container
// inside the named bridge network. Returns an empty string when the
// container has no endpoint on that network.
func ContainerNetworkIP(t testing.TB, containerID, networkName string) string {
t.Helper()
requireDockerDaemon(t)
ctx, cancel := context.WithTimeout(context.Background(), dockerCLITimeout)
defer cancel()
cmd := exec.CommandContext(ctx, "docker", "inspect", "--format", "{{json .NetworkSettings.Networks}}", containerID)
output, err := cmd.CombinedOutput()
if err != nil {
t.Fatalf("integration harness: docker inspect networks %s: %v; output:\n%s",
containerID, err, strings.TrimSpace(string(output)))
}
var networks map[string]struct {
IPAddress string `json:"IPAddress"`
}
if err := json.Unmarshal(output, &networks); err != nil {
t.Fatalf("integration harness: parse network json for %s: %v; payload=%s",
containerID, err, strings.TrimSpace(string(output)))
}
if entry, ok := networks[networkName]; ok {
return entry.IPAddress
}
return ""
}
// WaitForEngineHealthz polls the engine `/healthz` on port 8080 until
// it returns 200 or the timeout fires. On macOS the docker bridge IP is
// not routable from the host, so the helper falls back to a transient
// `busybox` probe container on the same docker network. On Linux it
// dials the bridge IP directly.
func WaitForEngineHealthz(t testing.TB, ip string, timeout time.Duration) {
t.Helper()
if ip == "" {
t.Fatalf("integration harness: empty engine ip")
}
if timeout <= 0 {
timeout = containerHealthzTimeout
}
if dialFromHost(ip, containerHealthzPort, 500*time.Millisecond) {
waitForHealthzFromHost(t, ip, timeout)
return
}
network, hostname := containerNetworkAndHostname(t, ip)
if network == "" || hostname == "" {
t.Fatalf("integration harness: cannot resolve docker network/hostname for engine ip %s", ip)
}
waitForHealthzViaProbe(t, network, hostname, timeout)
}
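// Sketch of how these lookups might compose in a caller test (hypothetical,
// not part of this commit): the helper name, game id handling, and 30-second
// timeout are assumptions.
//
//	func waitForGameEngine(t *testing.T, gameID, network string) {
//		id := harness.FindContainerIDByLabel(t, gameID)
//		if id == "" {
//			t.Fatalf("no engine container for game %s", gameID)
//		}
//		if state := harness.ContainerState(t, id); state != "running" {
//			t.Fatalf("engine container %s is %s, want running", id, state)
//		}
//		ip := harness.ContainerNetworkIP(t, id, network)
//		harness.WaitForEngineHealthz(t, ip, 30*time.Second)
//	}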
// dialFromHost reports whether tcp connect to ip:port succeeds within
// timeout. Used to detect the macOS routing limitation cheaply.
func dialFromHost(ip string, port int, timeout time.Duration) bool {
conn, err := net.DialTimeout("tcp", net.JoinHostPort(ip, fmt.Sprintf("%d", port)), timeout)
if err != nil {
return false
}
_ = conn.Close()
return true
}
func waitForHealthzFromHost(t testing.TB, ip string, timeout time.Duration) {
t.Helper()
url := fmt.Sprintf("http://%s/healthz", net.JoinHostPort(ip, fmt.Sprintf("%d", containerHealthzPort)))
client := &http.Client{
Timeout: 500 * time.Millisecond,
Transport: &http.Transport{DisableKeepAlives: true},
}
t.Cleanup(client.CloseIdleConnections)
deadline := time.Now().Add(timeout)
for time.Now().Before(deadline) {
req, err := http.NewRequest(http.MethodGet, url, nil)
if err != nil {
t.Fatalf("integration harness: build healthz request for %s: %v", url, err)
}
resp, err := client.Do(req)
if err == nil {
resp.Body.Close()
if resp.StatusCode == http.StatusOK {
return
}
}
time.Sleep(containerHealthzPoll)
}
t.Fatalf("integration harness: engine /healthz on %s did not return 200 within %s", url, timeout)
}
// containerNetworkAndHostname locates the bridge network and engine
// container hostname behind the given IP so the busybox probe can use
// the docker DNS name rather than rely on host routing. The lookup is
// scoped to RTM-owned containers (`com.galaxy.owner=rtmanager`).
func containerNetworkAndHostname(t testing.TB, ip string) (string, string) {
t.Helper()
requireDockerDaemon(t)
ctx, cancel := context.WithTimeout(context.Background(), dockerCLITimeout)
defer cancel()
cmd := exec.CommandContext(ctx, "docker", "ps", "-aq", "--no-trunc",
"--filter", "label=com.galaxy.owner=rtmanager",
)
output, err := cmd.CombinedOutput()
if err != nil {
t.Fatalf("integration harness: docker ps for engine probe: %v; output:\n%s", err, strings.TrimSpace(string(output)))
}
for _, id := range strings.Split(strings.TrimSpace(string(output)), "\n") {
id = strings.TrimSpace(id)
if id == "" {
continue
}
ipsByNetwork, hostname, ok := inspectIPAndHostname(t, id)
if !ok {
continue
}
for networkName, networkIP := range ipsByNetwork {
if networkIP == ip {
return networkName, hostname
}
}
}
return "", ""
}
func inspectIPAndHostname(t testing.TB, containerID string) (map[string]string, string, bool) {
t.Helper()
ctx, cancel := context.WithTimeout(context.Background(), dockerCLITimeout)
defer cancel()
cmd := exec.CommandContext(ctx, "docker", "inspect", "--format",
"{{json .NetworkSettings.Networks}}|{{.Config.Hostname}}", containerID)
output, err := cmd.CombinedOutput()
if err != nil {
return nil, "", false
}
parts := strings.SplitN(strings.TrimSpace(string(output)), "|", 2)
if len(parts) != 2 {
return nil, "", false
}
var networks map[string]struct {
IPAddress string `json:"IPAddress"`
}
if err := json.Unmarshal([]byte(parts[0]), &networks); err != nil {
return nil, "", false
}
ipsByNetwork := make(map[string]string, len(networks))
for name, entry := range networks {
ipsByNetwork[name] = entry.IPAddress
}
return ipsByNetwork, parts[1], true
}
// waitForHealthzViaProbe runs `wget -qO- http://<hostname>:8080/healthz`
// inside a transient busybox container on networkName until the probe
// exits 0 with a body containing "ok", or the timeout fires.
func waitForHealthzViaProbe(t testing.TB, networkName, hostname string, timeout time.Duration) {
t.Helper()
deadline := time.Now().Add(timeout)
url := fmt.Sprintf("http://%s:%d/healthz", hostname, containerHealthzPort)
for time.Now().Before(deadline) {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
cmd := exec.CommandContext(ctx, "docker", "run", "--rm",
"--network", networkName,
"busybox:stable",
"wget", "-qO-", url,
)
out, err := cmd.CombinedOutput()
cancel()
if err == nil && strings.Contains(string(out), "ok") {
return
}
time.Sleep(containerHealthzPoll)
}
t.Fatalf("integration harness: engine /healthz on %s did not return 200 via probe within %s", url, timeout)
}
func uniqueSuffix(t testing.TB) string {
t.Helper()
buf := make([]byte, 4)
if _, err := rand.Read(buf); err != nil {
t.Fatalf("integration harness: read random suffix: %v", err)
}
return hex.EncodeToString(buf)
}
@@ -0,0 +1,139 @@
package harness
import (
"context"
"errors"
"fmt"
"os"
"os/exec"
"path/filepath"
"runtime"
"strings"
"sync"
"testing"
"time"
)
// EngineImageRef is the canonical tag the lobbyrtm boundary suite (and
// any future suite that needs the galaxy/game engine binary) builds and
// runs against. The `-lobbyrtm-it` suffix differs from the
// `-rtm-it` tag the service-local rtmanager/integration harness uses, so
// an operator running both suites locally cannot accidentally consume
// the wrong image, and `docker image rm` of one suite's leftovers does
// not remove the other suite's tag.
const EngineImageRef = "galaxy/game:1.0.0-lobbyrtm-it"
const (
imageBuildTimeout = 10 * time.Minute
dockerDaemonPingTimeout = 5 * time.Second
)
var (
engineImageOnce sync.Once
engineImageErr error
dockerAvailableOnce sync.Once
dockerAvailableErr error
)
// RequireDockerDaemon skips the calling test when no Docker daemon is
// reachable from this process. Suites that need Docker but stand up
// testcontainers (Postgres/Redis) before any RTM-specific helper
// should call this helper first so the skip path runs *before* the
// testcontainer client probes the daemon and fails hard.
func RequireDockerDaemon(t testing.TB) {
t.Helper()
requireDockerDaemon(t)
}
// EnsureGalaxyGameImage builds the galaxy/game engine image from the
// workspace root once per test process and returns the canonical tag.
// On hosts without a reachable Docker daemon the helper calls `t.Skip`
// so suites stay green when `/var/run/docker.sock` is missing and
// `DOCKER_HOST` is unset.
//
// The build is wrapped in `sync.Once`; concurrent suite invocations
// share the same image. The Dockerfile path and build context match
// `rtmanager/integration/harness/docker.go::buildAndTagEngineImage` —
// galaxy's `go.work` resolves `galaxy/{model,error,...}` only when the
// workspace root is the build context.
func EnsureGalaxyGameImage(t testing.TB) string {
t.Helper()
requireDockerDaemon(t)
engineImageOnce.Do(func() {
engineImageErr = buildEngineImage()
})
if engineImageErr != nil {
t.Fatalf("integration harness: build galaxy/game image: %v", engineImageErr)
}
return EngineImageRef
}
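// Illustrative call order for a boundary suite (hypothetical, not part of
// this commit): gate on the daemon before any testcontainer starts, then
// build the engine image once per process.
//
//	func TestLobbyRTMBoundary(t *testing.T) {
//		harness.RequireDockerDaemon(t)            // skip cleanly when docker is absent
//		image := harness.EnsureGalaxyGameImage(t) // galaxy/game:1.0.0-lobbyrtm-it, built once
//		// ... start Postgres/Redis testcontainers, then run the engine from `image` ...
//		_ = image
//	}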
func buildEngineImage() error {
root, err := workspaceRoot()
if err != nil {
return fmt.Errorf("resolve workspace root: %w", err)
}
ctx, cancel := context.WithTimeout(context.Background(), imageBuildTimeout)
defer cancel()
dockerfilePath := filepath.Join("game", "Dockerfile")
cmd := exec.CommandContext(ctx, "docker", "build",
"-f", dockerfilePath,
"-t", EngineImageRef,
".",
)
cmd.Dir = root
cmd.Env = append(os.Environ(), "DOCKER_BUILDKIT=1")
output, err := cmd.CombinedOutput()
if err != nil {
return fmt.Errorf("docker build (-f %s) in %s: %w; output:\n%s",
dockerfilePath, root, err, strings.TrimSpace(string(output)))
}
return nil
}
// requireDockerDaemon skips the calling test when no Docker daemon is
// reachable from this process. The check runs once per process and
// caches the verdict so successive callers do not pay the ping cost.
func requireDockerDaemon(t testing.TB) {
t.Helper()
dockerAvailableOnce.Do(func() {
dockerAvailableErr = pingDockerDaemon()
})
if dockerAvailableErr != nil {
t.Skipf("integration harness: docker daemon unavailable: %v", dockerAvailableErr)
}
}
func pingDockerDaemon() error {
if os.Getenv("DOCKER_HOST") == "" {
if _, err := os.Stat("/var/run/docker.sock"); err != nil {
return fmt.Errorf("set DOCKER_HOST or expose /var/run/docker.sock: %w", err)
}
}
ctx, cancel := context.WithTimeout(context.Background(), dockerDaemonPingTimeout)
defer cancel()
cmd := exec.CommandContext(ctx, "docker", "version", "--format", "{{.Server.Version}}")
output, err := cmd.CombinedOutput()
if err != nil {
return fmt.Errorf("docker version: %w; output:\n%s", err, strings.TrimSpace(string(output)))
}
return nil
}
// workspaceRoot resolves the absolute path of the galaxy/ workspace
// root by anchoring on this file's location. The harness lives at
// `galaxy/integration/internal/harness/engineimage.go`; the workspace
// root is three directories up.
func workspaceRoot() (string, error) {
_, file, _, ok := runtime.Caller(0)
if !ok {
return "", errors.New("resolve runtime caller for workspace root")
}
dir := filepath.Dir(file)
root := filepath.Clean(filepath.Join(dir, "..", "..", ".."))
return root, nil
}
@@ -0,0 +1,54 @@
package harness
import (
"context"
"testing"
)
// RTManagerServicePersistence captures the per-test persistence
// dependencies of the Runtime Manager binary: a PostgreSQL container
// hosting the `rtmanager` schema owned by the `rtmanagerservice` role,
// plus the Redis credentials that point the service at the
// caller-supplied master address.
type RTManagerServicePersistence struct {
// Postgres exposes the started container so tests that need direct
// SQL access to the rtmanager schema can read or write through it.
Postgres *PostgresRuntime
// Env carries the environment entries that must be passed to the
// rtmanager process. It is safe to merge into the caller's existing
// env map, or to use as-is and append further RTMANAGER_* knobs in
// place. RTMANAGER_GAME_STATE_ROOT is intentionally omitted; the
// caller supplies a per-test directory.
Env map[string]string
}
// StartRTManagerServicePersistence brings up one isolated PostgreSQL
// container, provisions the `rtmanager` schema with the
// `rtmanagerservice` role, and returns the environment entries that
// point the rtmanager binary at that container and at the supplied
// Redis master address.
//
// The Redis password value matches the architectural rule that Redis
// traffic is password-protected; miniredis accepts arbitrary password
// values when its own RequireAuth is not engaged, and the same value
// works against the real testcontainers Redis runtime.
//
// Cleanup of the container is handled by StartPostgresContainer through
// `t.Cleanup`; callers do not need to defer anything.
func StartRTManagerServicePersistence(t testing.TB, redisMasterAddr string) RTManagerServicePersistence {
t.Helper()
rt := StartPostgresContainer(t)
if err := rt.EnsureRoleAndSchema(context.Background(), "rtmanager", "rtmanagerservice", "rtmanagerservice"); err != nil {
t.Fatalf("ensure rtmanager schema/role: %v", err)
}
env := WithPostgres(rt, "RTMANAGER", "rtmanager", "rtmanagerservice")
env["RTMANAGER_REDIS_MASTER_ADDR"] = redisMasterAddr
env["RTMANAGER_REDIS_PASSWORD"] = "integration"
return RTManagerServicePersistence{
Postgres: rt,
Env: env,
}
}
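// Hypothetical wiring sketch (not part of this commit): the Redis address
// argument and the per-test state root are assumptions for illustration.
//
//	func rtmanagerEnvForTest(t *testing.T, redisAddr string) map[string]string {
//		persistence := harness.StartRTManagerServicePersistence(t, redisAddr)
//		env := persistence.Env
//		env["RTMANAGER_GAME_STATE_ROOT"] = t.TempDir() // caller-supplied per-test directory
//		return env
//	}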