//go:build integration

// Package integration_test owns the service-local end-to-end scenarios
// for Runtime Manager. The build tag keeps the suite out of the
// default `go test ./...` run; CI invokes the suite explicitly with
// `go test -tags=integration ./rtmanager/integration/...`.
//
// Design rationale for the suite — build tag, in-process harness,
// per-test isolation, two-tag engine image — lives in
// `rtmanager/docs/integration-tests.md`. Each test stands up its own
// Runtime Manager process via `harness.NewEnv`, drives the same
// streams Game Lobby uses in `integration/lobbyrtm`, and asserts the
// resulting PostgreSQL, Redis-stream, and Docker side-effects.
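//
// To run a single scenario locally, the standard `-run` filter can be
// combined with the same tag (a reachable Docker daemon is assumed, since
// the tests assert real container side-effects), for example:
//
//	go test -tags=integration -run TestLifecycle ./rtmanager/integration/...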
package integration_test

import (
	"context"
	"net/http"
	"testing"
	"time"

	"galaxy/rtmanager/integration/harness"
	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/ports"

	"github.com/docker/docker/api/types/container"
	"github.com/docker/docker/api/types/filters"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

// TestMain centralises shared-container teardown so individual
// failing tests do not leak the testcontainers Postgres/Redis pair.
func TestMain(m *testing.M) {
	harness.RunMain(m)
}

// TestLifecycle_StartInspectStopRestartPatchCleanup drives one game
// through every supported lifecycle operation against the real engine
// image and asserts each step's PG, Redis-stream, and Docker
// side-effects.
func TestLifecycle_StartInspectStopRestartPatchCleanup(t *testing.T) {
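	// Each scenario gets its own harness environment (a dedicated Runtime
	// Manager process against the shared test containers) plus a game id
	// derived from the test name, which keeps scenarios isolated from each
	// other.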
	env := harness.NewEnv(t, harness.EnvOptions{LogToStderr: true})
	rest := harness.NewREST(env)
	gameID := harness.IDFromTestName(t)

	// Step 1 — start through the Lobby async stream contract.
	startEntryID := harness.XAddStartJob(t, env, gameID, env.EngineImageRef)
	t.Logf("start_jobs xadd id=%s", startEntryID)

	startResult := harness.WaitForJobResult(t, env,
		harness.JobOutcomeIs(gameID, ports.JobOutcomeSuccess),
		30*time.Second,
	)
	require.Equal(t, "", startResult.ErrorCode, "fresh start must publish empty error_code")
	require.NotEmpty(t, startResult.ContainerID, "fresh start job result must carry container_id")
	require.NotEmpty(t, startResult.EngineEndpoint, "fresh start job result must carry engine_endpoint")

	// PG record reflects the start.
	startedRecord := harness.EventuallyRuntimeRecord(t, env, gameID,
		func(r runtime.RuntimeRecord) bool { return r.Status == runtime.StatusRunning },
		15*time.Second,
	)
	assert.Equal(t, env.EngineImageRef, startedRecord.CurrentImageRef)
	assert.Equal(t, env.Network, startedRecord.DockerNetwork)
	assert.Equal(t, startResult.ContainerID, startedRecord.CurrentContainerID)
	assert.Equal(t, startResult.EngineEndpoint, startedRecord.EngineEndpoint)

	// operation_log captures the start.
	startEntry := harness.EventuallyOperationKind(t, env, gameID, operation.OpKindStart, 5*time.Second)
	assert.Equal(t, operation.OutcomeSuccess, startEntry.Outcome)
	assert.Equal(t, operation.OpSourceLobbyStream, startEntry.OpSource)

	// Step 2 — inspect via the GM/Admin REST surface.
	getResp, status := rest.GetRuntime(t, gameID)
	require.Equal(t, http.StatusOK, status)
	require.Equal(t, "running", getResp.Status)
	require.NotNil(t, getResp.CurrentContainerID)
	require.Equal(t, startResult.ContainerID, *getResp.CurrentContainerID)
	require.NotNil(t, getResp.CurrentImageRef)
	require.Equal(t, env.EngineImageRef, *getResp.CurrentImageRef)
	require.NotNil(t, getResp.EngineEndpoint)
	require.Equal(t, startResult.EngineEndpoint, *getResp.EngineEndpoint)

	// Step 3 — stop through the Lobby async stream contract.
	harness.XAddStopJob(t, env, gameID, "cancelled")
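	// The per-game stream now carries two success entries (start, then
	// stop); the helper below polls until both are visible and returns
	// the latest one.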
	stopResult := waitForLatestStopOrStartResult(t, env, gameID)
	require.Equal(t, ports.JobOutcomeSuccess, stopResult.Outcome)
	require.Equal(t, "", stopResult.ErrorCode, "fresh stop must publish empty error_code")

	stoppedRecord := harness.EventuallyRuntimeRecord(t, env, gameID,
		func(r runtime.RuntimeRecord) bool { return r.Status == runtime.StatusStopped },
		15*time.Second,
	)
	assert.Equal(t, startResult.ContainerID, stoppedRecord.CurrentContainerID,
		"stop preserves the current container id until cleanup")

	// Step 4 — restart via REST. Container id changes; engine endpoint
	// stays stable.
	restartResp, status := rest.RestartRuntime(t, gameID)
	require.Equal(t, http.StatusOK, status)
	require.Equal(t, "running", restartResp.Status)
	require.NotNil(t, restartResp.CurrentContainerID)
	require.NotEqual(t, startResult.ContainerID, *restartResp.CurrentContainerID,
		"restart must produce a new container id")
	require.NotNil(t, restartResp.EngineEndpoint)
	require.Equal(t, startResult.EngineEndpoint, *restartResp.EngineEndpoint,
		"restart must keep the engine endpoint stable")

	restartContainerID := *restartResp.CurrentContainerID
	restartEntry := harness.EventuallyOperationKind(t, env, gameID, operation.OpKindRestart, 5*time.Second)
	assert.Equal(t, operation.OutcomeSuccess, restartEntry.Outcome)
	assert.Equal(t, operation.OpSourceAdminRest, restartEntry.OpSource)

	// Step 5 — patch to the second semver-compatible tag. Same image
	// content, but the runtime should still record the new tag and
	// recreate the container.
	patchResp, status := rest.PatchRuntime(t, gameID, env.PatchedImageRef)
	require.Equal(t, http.StatusOK, status)
	require.Equal(t, "running", patchResp.Status)
	require.NotNil(t, patchResp.CurrentImageRef)
	assert.Equal(t, env.PatchedImageRef, *patchResp.CurrentImageRef)
	require.NotNil(t, patchResp.CurrentContainerID)
	assert.NotEqual(t, restartContainerID, *patchResp.CurrentContainerID,
		"patch must recreate the container")

	patchEntry := harness.EventuallyOperationKind(t, env, gameID, operation.OpKindPatch, 5*time.Second)
	assert.Equal(t, operation.OutcomeSuccess, patchEntry.Outcome)

	// Step 6 — quiesce via REST stop so cleanup is allowed (cleanup
	// refuses to remove a running container per
	// `rtmanager/README.md §Lifecycles → Cleanup`).
	stopResp, status := rest.StopRuntime(t, gameID, "admin_request")
	require.Equal(t, http.StatusOK, status)
	require.Equal(t, "stopped", stopResp.Status)

	// Step 7 — cleanup the container. PG record flips to removed and
	// current_container_id becomes nil.
	cleanupResp, status := rest.CleanupRuntime(t, gameID)
	require.Equal(t, http.StatusOK, status)
	require.Equal(t, "removed", cleanupResp.Status)
	require.Nil(t, cleanupResp.CurrentContainerID)

	cleanupEntry := harness.EventuallyOperationKind(t, env, gameID, operation.OpKindCleanupContainer, 5*time.Second)
	assert.Equal(t, operation.OutcomeSuccess, cleanupEntry.Outcome)
	assert.Equal(t, operation.OpSourceAdminRest, cleanupEntry.OpSource)
}

// TestReplay_StartJobIsNoop publishes the same start envelope twice
// and asserts that Runtime Manager produces a fresh job_result for
// the first XADD and a `replay_no_op` outcome for the second, without
// recreating the engine container.
func TestReplay_StartJobIsNoop(t *testing.T) {
	env := harness.NewEnv(t, harness.EnvOptions{})
	gameID := harness.IDFromTestName(t)

	// First XADD: fresh start.
	harness.XAddStartJob(t, env, gameID, env.EngineImageRef)
	first := harness.WaitForJobResult(t, env,
		harness.JobOutcomeIs(gameID, ports.JobOutcomeSuccess),
		30*time.Second,
	)
	require.Equal(t, "", first.ErrorCode)

	// Second XADD: same envelope; the start service must short-circuit
	// at the `runtime_records.status=running && image_ref` check.
	harness.XAddStartJob(t, env, gameID, env.EngineImageRef)
	replay := harness.WaitForJobResult(t, env,
		harness.JobOutcomeWithErrorCode(gameID, ports.JobOutcomeSuccess, "replay_no_op"),
		15*time.Second,
	)
	assert.Equal(t, first.ContainerID, replay.ContainerID,
		"replay must surface the same container id as the original start")
	assert.Equal(t, first.EngineEndpoint, replay.EngineEndpoint)

	// Docker view: exactly one engine container exists for this game.
	assertSingleEngineContainer(t, env, gameID)

	// Lifecycle stream produced exactly two entries: fresh + replay.
	entries := harness.AllJobResults(t, env)
	require.Len(t, entries, 2)
	assert.Equal(t, "", entries[0].ErrorCode)
	assert.Equal(t, "replay_no_op", entries[1].ErrorCode)
}

// TestReplay_StopJobIsNoop publishes a stop envelope twice after a
// successful start and asserts the second stop surfaces as
// `replay_no_op` without altering the runtime record's `stopped_at`.
func TestReplay_StopJobIsNoop(t *testing.T) {
	env := harness.NewEnv(t, harness.EnvOptions{})
	gameID := harness.IDFromTestName(t)

	// Bring the game to `running`. The start path publishes one entry
	// to `runtime:job_results`; the stops below publish two more, so
	// per-game stream order is [start, first-stop, replay-stop].
	harness.XAddStartJob(t, env, gameID, env.EngineImageRef)
	harness.WaitForJobResult(t, env,
		harness.JobOutcomeIs(gameID, ports.JobOutcomeSuccess),
		30*time.Second,
	)

	// First stop: fresh. The expectedCount accounts for the start
	// entry that is already on the stream.
	harness.XAddStopJob(t, env, gameID, "cancelled")
	first := waitForJobResultByIndex(t, env, gameID, 2)
	require.Equal(t, ports.JobOutcomeSuccess, first.Outcome)
	require.Equal(t, "", first.ErrorCode)

	stoppedRecord := harness.EventuallyRuntimeRecord(t, env, gameID,
		func(r runtime.RuntimeRecord) bool { return r.Status == runtime.StatusStopped },
		15*time.Second,
	)
	require.NotNil(t, stoppedRecord.StoppedAt, "stopped record must carry stopped_at")
	originalStoppedAt := *stoppedRecord.StoppedAt

	// Second stop: replay (third entry on the per-game stream).
	harness.XAddStopJob(t, env, gameID, "cancelled")
	replay := waitForJobResultByIndex(t, env, gameID, 3)
	require.Equal(t, ports.JobOutcomeSuccess, replay.Outcome)
	assert.Equal(t, "replay_no_op", replay.ErrorCode)

	// stopped_at stays anchored to the first stop.
	postReplay := harness.MustRuntimeRecord(t, env, gameID)
	require.Equal(t, runtime.StatusStopped, postReplay.Status)
	require.NotNil(t, postReplay.StoppedAt)
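	// Compare with time.Time.Equal rather than ==: the two values come
	// from separate reads, so their wall-clock representation (e.g.
	// location) may differ even when they denote the same instant.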
	assert.True(t, originalStoppedAt.Equal(*postReplay.StoppedAt),
		"stopped_at must not move on a replay stop; was %s, now %s",
		originalStoppedAt, *postReplay.StoppedAt)
}

// waitForLatestStopOrStartResult polls `runtime:job_results` until it
// has seen two `outcome=success` entries for gameID and returns the
// most recent one. The lifecycle scenario emits two consecutive
// successes (start then stop), so the returned entry is the stop.
func waitForLatestStopOrStartResult(t *testing.T, env *harness.Env, gameID string) harness.JobResultEntry {
	t.Helper()
	deadline := time.Now().Add(30 * time.Second)
	for {
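		// Re-read the whole stream on every pass; the per-test streams
		// carry only a handful of entries, so a full scan every 50ms
		// keeps the helper simple and stateless.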
		entries := harness.AllJobResults(t, env)
		// Two entries means we've observed both the start and stop
		// outcomes for this game.
		matched := 0
		var last harness.JobResultEntry
		for _, entry := range entries {
			if entry.GameID == gameID && entry.Outcome == ports.JobOutcomeSuccess {
				matched++
				last = entry
			}
		}
		if matched >= 2 {
			return last
		}
		if time.Now().After(deadline) {
			t.Fatalf("expected two job_results for %s, got %d", gameID, matched)
		}
		time.Sleep(50 * time.Millisecond)
	}
}

// waitForJobResultByIndex polls the job_results stream until it has
// at least `expectedCount` entries for gameID and returns the
// expectedCount-th. Used by the replay tests to deterministically
// pick the second / nth result.
func waitForJobResultByIndex(t *testing.T, env *harness.Env, gameID string, expectedCount int) harness.JobResultEntry {
	t.Helper()
	deadline := time.Now().Add(30 * time.Second)
	for {
		entries := harness.AllJobResults(t, env)
		matches := make([]harness.JobResultEntry, 0, len(entries))
		for _, entry := range entries {
			if entry.GameID == gameID {
				matches = append(matches, entry)
			}
		}
		if len(matches) >= expectedCount {
			return matches[expectedCount-1]
		}
		if time.Now().After(deadline) {
			t.Fatalf("expected at least %d job_results for %s, got %d",
				expectedCount, gameID, len(matches))
		}
		time.Sleep(50 * time.Millisecond)
	}
}

// assertSingleEngineContainer queries Docker by the per-game label and
// asserts exactly one matching container exists. Catches replay
// regressions that would let RTM start two containers for the same
// game id.
func assertSingleEngineContainer(t *testing.T, env *harness.Env, gameID string) {
	t.Helper()
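	// Filter on the owner and per-game labels that Runtime Manager is
	// expected to stamp on the engine containers it creates; together
	// they should match exactly one container for this game.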
	args := filters.NewArgs(
		filters.Arg("label", "com.galaxy.owner=rtmanager"),
		filters.Arg("label", "com.galaxy.game_id="+gameID),
	)
	containers, err := env.Docker.Client().ContainerList(
		context.Background(),
		container.ListOptions{All: true, Filters: args},
	)
	require.NoError(t, err)
	require.Lenf(t, containers, 1, "expected one engine container for game %s, got %d", gameID, len(containers))
}