feat: runtime manager

This commit is contained in:
Ilia Denisov
2026-04-28 20:39:18 +02:00
committed by GitHub
parent e0a99b346b
commit a7cee15115
289 changed files with 45660 additions and 2207 deletions
+303
View File
@@ -0,0 +1,303 @@
//go:build integration
// Package integration_test owns the service-local end-to-end scenarios
// for Runtime Manager. The build tag keeps the suite out of the
// default `go test ./...` run; CI invokes the suite explicitly with
// `go test -tags=integration ./rtmanager/integration/...`.
//
// Design rationale for the suite — build tag, in-process harness,
// per-test isolation, two-tag engine image — lives in
// `rtmanager/docs/integration-tests.md`. Each test stands up its own
// Runtime Manager process via `harness.NewEnv`, drives the same
// streams Game Lobby uses in `integration/lobbyrtm`, and asserts the
// resulting PostgreSQL, Redis-stream, and Docker side-effects.
package integration_test
import (
"context"
"net/http"
"testing"
"time"
"galaxy/rtmanager/integration/harness"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/ports"
"github.com/docker/docker/api/types/container"
"github.com/docker/docker/api/types/filters"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestMain centralises shared-container teardown so individual
// failing tests do not leak the testcontainers postgres / redis pair.
// All setup and teardown policy lives in harness.RunMain; this
// wrapper exists only to hand it the package's *testing.M.
func TestMain(m *testing.M) {
	harness.RunMain(m)
}
// TestLifecycle_StartInspectStopRestartPatchCleanup drives one game
// through every supported lifecycle operation against the real engine
// image and asserts each step's PG, Redis-stream, and Docker
// side-effects.
//
// The seven steps are strictly ordered: each one depends on the state
// the previous step left behind (e.g. cleanup requires the preceding
// stop). Do not reorder or parallelise them.
func TestLifecycle_StartInspectStopRestartPatchCleanup(t *testing.T) {
	env := harness.NewEnv(t, harness.EnvOptions{LogToStderr: true})
	rest := harness.NewREST(env)
	gameID := harness.IDFromTestName(t)
	// Step 1 — start through the Lobby async stream contract.
	startEntryID := harness.XAddStartJob(t, env, gameID, env.EngineImageRef)
	t.Logf("start_jobs xadd id=%s", startEntryID)
	startResult := harness.WaitForJobResult(t, env,
		harness.JobOutcomeIs(gameID, ports.JobOutcomeSuccess),
		30*time.Second,
	)
	// A fresh (non-replay) start carries no error code and must tell
	// the Lobby where the engine container lives.
	require.Equal(t, "", startResult.ErrorCode, "fresh start must publish empty error_code")
	require.NotEmpty(t, startResult.ContainerID, "fresh start job result must carry container_id")
	require.NotEmpty(t, startResult.EngineEndpoint, "fresh start job result must carry engine_endpoint")
	// PG record reflects the start.
	startedRecord := harness.EventuallyRuntimeRecord(t, env, gameID,
		func(r runtime.RuntimeRecord) bool { return r.Status == runtime.StatusRunning },
		15*time.Second,
	)
	assert.Equal(t, env.EngineImageRef, startedRecord.CurrentImageRef)
	assert.Equal(t, env.Network, startedRecord.DockerNetwork)
	assert.Equal(t, startResult.ContainerID, startedRecord.CurrentContainerID)
	assert.Equal(t, startResult.EngineEndpoint, startedRecord.EngineEndpoint)
	// operation_log captures the start.
	startEntry := harness.EventuallyOperationKind(t, env, gameID, operation.OpKindStart, 5*time.Second)
	assert.Equal(t, operation.OutcomeSuccess, startEntry.Outcome)
	assert.Equal(t, operation.OpSourceLobbyStream, startEntry.OpSource)
	// Step 2 — inspect via the GM/Admin REST surface. The REST view
	// must agree with what the async job result reported in step 1.
	getResp, status := rest.GetRuntime(t, gameID)
	require.Equal(t, http.StatusOK, status)
	require.Equal(t, "running", getResp.Status)
	require.NotNil(t, getResp.CurrentContainerID)
	require.Equal(t, startResult.ContainerID, *getResp.CurrentContainerID)
	require.NotNil(t, getResp.CurrentImageRef)
	require.Equal(t, env.EngineImageRef, *getResp.CurrentImageRef)
	require.NotNil(t, getResp.EngineEndpoint)
	require.Equal(t, startResult.EngineEndpoint, *getResp.EngineEndpoint)
	// Step 3 — stop through the Lobby async stream contract. A plain
	// JobOutcomeIs matcher would re-match the start success already on
	// the stream, so use the helper that waits for the second per-game
	// success entry.
	harness.XAddStopJob(t, env, gameID, "cancelled")
	stopResult := waitForLatestStopOrStartResult(t, env, gameID)
	require.Equal(t, ports.JobOutcomeSuccess, stopResult.Outcome)
	require.Equal(t, "", stopResult.ErrorCode, "fresh stop must publish empty error_code")
	stoppedRecord := harness.EventuallyRuntimeRecord(t, env, gameID,
		func(r runtime.RuntimeRecord) bool { return r.Status == runtime.StatusStopped },
		15*time.Second,
	)
	assert.Equal(t, startResult.ContainerID, stoppedRecord.CurrentContainerID,
		"stop preserves the current container id until cleanup")
	// Step 4 — restart via REST. Container id changes; engine endpoint
	// stays stable.
	restartResp, status := rest.RestartRuntime(t, gameID)
	require.Equal(t, http.StatusOK, status)
	require.Equal(t, "running", restartResp.Status)
	require.NotNil(t, restartResp.CurrentContainerID)
	require.NotEqual(t, startResult.ContainerID, *restartResp.CurrentContainerID,
		"restart must produce a new container id")
	require.NotNil(t, restartResp.EngineEndpoint)
	require.Equal(t, startResult.EngineEndpoint, *restartResp.EngineEndpoint,
		"restart must keep the engine endpoint stable")
	restartContainerID := *restartResp.CurrentContainerID
	restartEntry := harness.EventuallyOperationKind(t, env, gameID, operation.OpKindRestart, 5*time.Second)
	assert.Equal(t, operation.OutcomeSuccess, restartEntry.Outcome)
	assert.Equal(t, operation.OpSourceAdminRest, restartEntry.OpSource)
	// Step 5 — patch to the second semver-compatible tag. Same image
	// content, but the runtime should still record the new tag and
	// recreate the container.
	patchResp, status := rest.PatchRuntime(t, gameID, env.PatchedImageRef)
	require.Equal(t, http.StatusOK, status)
	require.Equal(t, "running", patchResp.Status)
	require.NotNil(t, patchResp.CurrentImageRef)
	assert.Equal(t, env.PatchedImageRef, *patchResp.CurrentImageRef)
	require.NotNil(t, patchResp.CurrentContainerID)
	assert.NotEqual(t, restartContainerID, *patchResp.CurrentContainerID,
		"patch must recreate the container")
	patchEntry := harness.EventuallyOperationKind(t, env, gameID, operation.OpKindPatch, 5*time.Second)
	assert.Equal(t, operation.OutcomeSuccess, patchEntry.Outcome)
	// Step 6 — quiesce via REST stop so cleanup is allowed (cleanup
	// refuses to remove a running container per
	// `rtmanager/README.md §Lifecycles → Cleanup`).
	stopResp, status := rest.StopRuntime(t, gameID, "admin_request")
	require.Equal(t, http.StatusOK, status)
	require.Equal(t, "stopped", stopResp.Status)
	// Step 7 — cleanup the container. PG record flips to removed and
	// current_container_id becomes nil.
	cleanupResp, status := rest.CleanupRuntime(t, gameID)
	require.Equal(t, http.StatusOK, status)
	require.Equal(t, "removed", cleanupResp.Status)
	require.Nil(t, cleanupResp.CurrentContainerID)
	cleanupEntry := harness.EventuallyOperationKind(t, env, gameID, operation.OpKindCleanupContainer, 5*time.Second)
	assert.Equal(t, operation.OutcomeSuccess, cleanupEntry.Outcome)
	assert.Equal(t, operation.OpSourceAdminRest, cleanupEntry.OpSource)
}
// TestReplay_StartJobIsNoop publishes the same start envelope twice
// and asserts that Runtime Manager produces a fresh job_result for
// the first XADD and a `replay_no_op` outcome for the second, without
// recreating the engine container.
func TestReplay_StartJobIsNoop(t *testing.T) {
	env := harness.NewEnv(t, harness.EnvOptions{})
	gameID := harness.IDFromTestName(t)
	// First XADD: fresh start.
	harness.XAddStartJob(t, env, gameID, env.EngineImageRef)
	first := harness.WaitForJobResult(t, env,
		harness.JobOutcomeIs(gameID, ports.JobOutcomeSuccess),
		30*time.Second,
	)
	require.Equal(t, "", first.ErrorCode)
	// Second XADD: same envelope; the start service must short-circuit
	// at the `runtime_records.status=running && image_ref` check.
	harness.XAddStartJob(t, env, gameID, env.EngineImageRef)
	replay := harness.WaitForJobResult(t, env,
		harness.JobOutcomeWithErrorCode(gameID, ports.JobOutcomeSuccess, "replay_no_op"),
		15*time.Second,
	)
	assert.Equal(t, first.ContainerID, replay.ContainerID,
		"replay must surface the same container id as the original start")
	assert.Equal(t, first.EngineEndpoint, replay.EngineEndpoint)
	// Docker view: exactly one engine container exists for this game.
	assertSingleEngineContainer(t, env, gameID)
	// Lifecycle stream: exactly two entries for THIS game — fresh then
	// replay. Filter by gameID before asserting the length; the other
	// helpers in this file (waitForJobResultByIndex and friends) also
	// scope to the game, and the redis pair is shared across the
	// package run, so an unfiltered length check could flake on stray
	// entries from other tests.
	var mine []harness.JobResultEntry
	for _, entry := range harness.AllJobResults(t, env) {
		if entry.GameID == gameID {
			mine = append(mine, entry)
		}
	}
	require.Len(t, mine, 2)
	assert.Equal(t, "", mine[0].ErrorCode)
	assert.Equal(t, "replay_no_op", mine[1].ErrorCode)
}
// TestReplay_StopJobIsNoop publishes a stop envelope twice after a
// successful start and asserts the second stop surfaces as
// `replay_no_op` without altering the runtime record's `stopped_at`.
func TestReplay_StopJobIsNoop(t *testing.T) {
	env := harness.NewEnv(t, harness.EnvOptions{})
	gameID := harness.IDFromTestName(t)
	// Drive the game to `running` first. That publishes entry #1 on
	// `runtime:job_results` for this game; the two stops below become
	// entries #2 and #3, so the per-game order is
	// [start, fresh stop, replay stop].
	harness.XAddStartJob(t, env, gameID, env.EngineImageRef)
	harness.WaitForJobResult(t, env,
		harness.JobOutcomeIs(gameID, ports.JobOutcomeSuccess),
		30*time.Second,
	)
	// Fresh stop — the second per-game stream entry overall.
	harness.XAddStopJob(t, env, gameID, "cancelled")
	freshStop := waitForJobResultByIndex(t, env, gameID, 2)
	require.Equal(t, ports.JobOutcomeSuccess, freshStop.Outcome)
	require.Equal(t, "", freshStop.ErrorCode)
	afterStop := harness.EventuallyRuntimeRecord(t, env, gameID,
		func(r runtime.RuntimeRecord) bool { return r.Status == runtime.StatusStopped },
		15*time.Second,
	)
	require.NotNil(t, afterStop.StoppedAt, "stopped record must carry stopped_at")
	anchorStoppedAt := *afterStop.StoppedAt
	// Replay stop — the third per-game stream entry.
	harness.XAddStopJob(t, env, gameID, "cancelled")
	replayed := waitForJobResultByIndex(t, env, gameID, 3)
	require.Equal(t, ports.JobOutcomeSuccess, replayed.Outcome)
	assert.Equal(t, "replay_no_op", replayed.ErrorCode)
	// The runtime record keeps the first stop's timestamp.
	afterReplay := harness.MustRuntimeRecord(t, env, gameID)
	require.Equal(t, runtime.StatusStopped, afterReplay.Status)
	require.NotNil(t, afterReplay.StoppedAt)
	assert.True(t, anchorStoppedAt.Equal(*afterReplay.StoppedAt),
		"stopped_at must not move on a replay stop; was %s, now %s",
		anchorStoppedAt, *afterReplay.StoppedAt)
}
// waitForLatestStopOrStartResult polls `runtime:job_results` until at
// least two `outcome=success` entries exist for gameID and returns
// the most recent one. The lifecycle scenario emits two consecutive
// successes (start then stop), so the second success is the stop
// result. The stream is re-read on each 50ms tick until the 30s
// deadline expires.
func waitForLatestStopOrStartResult(t *testing.T, env *harness.Env, gameID string) harness.JobResultEntry {
	t.Helper()
	deadline := time.Now().Add(30 * time.Second)
	for {
		var (
			successCount int
			latest       harness.JobResultEntry
		)
		for _, e := range harness.AllJobResults(t, env) {
			if e.GameID != gameID || e.Outcome != ports.JobOutcomeSuccess {
				continue
			}
			successCount++
			latest = e
		}
		// Two per-game successes means both the start and the stop
		// outcomes have landed.
		if successCount >= 2 {
			return latest
		}
		if time.Now().After(deadline) {
			t.Fatalf("expected two job_results for %s, got %d", gameID, successCount)
		}
		time.Sleep(50 * time.Millisecond)
	}
}
// waitForJobResultByIndex polls the job_results stream until at least
// expectedCount entries exist for gameID, then returns the
// expectedCount-th (1-based) entry. The replay tests use it to pick
// the second / nth per-game result deterministically. Fails the test
// if the count is not reached within 30 seconds.
func waitForJobResultByIndex(t *testing.T, env *harness.Env, gameID string, expectedCount int) harness.JobResultEntry {
	t.Helper()
	deadline := time.Now().Add(30 * time.Second)
	for {
		var forGame []harness.JobResultEntry
		for _, e := range harness.AllJobResults(t, env) {
			if e.GameID == gameID {
				forGame = append(forGame, e)
			}
		}
		if len(forGame) >= expectedCount {
			return forGame[expectedCount-1]
		}
		if time.Now().After(deadline) {
			t.Fatalf("expected at least %d job_results for %s, got %d",
				expectedCount, gameID, len(forGame))
		}
		time.Sleep(50 * time.Millisecond)
	}
}
// assertSingleEngineContainer lists Docker containers carrying this
// game's labels (including stopped ones, via All:true) and fails
// unless exactly one exists. Catches replay regressions that would
// let RTM start two containers for the same game id.
func assertSingleEngineContainer(t *testing.T, env *harness.Env, gameID string) {
	t.Helper()
	labelFilters := filters.NewArgs()
	labelFilters.Add("label", "com.galaxy.owner=rtmanager")
	labelFilters.Add("label", "com.galaxy.game_id="+gameID)
	listed, err := env.Docker.Client().ContainerList(
		context.Background(),
		container.ListOptions{
			All:     true,
			Filters: labelFilters,
		},
	)
	require.NoError(t, err)
	require.Lenf(t, listed, 1, "expected one engine container for game %s, got %d", gameID, len(listed))
}