// Package lobbyrtm_test holds Lobby↔RTM boundary integration tests: each
// test drives a real game through Lobby, observes the runtime job envelopes
// on the shared stream, and asserts against live engine containers.
package lobbyrtm_test

import (
	"net/http"
	"strings"
	"testing"
	"time"

	"galaxy/integration/internal/harness"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

// Stable wire values exchanged on the runtime job streams. These mirror the
// producer-side constants; the tests assert on the literal strings so a
// drift in either service is caught here.
const (
	jobOutcomeSuccess        = "success"
	jobOutcomeFailure        = "failure"
	stopReasonCancelled      = "cancelled"
	errorCodeImagePullFailed = "image_pull_failed"
)

// TestStartFlowSucceedsWithRealEngine drives the happy path:
// Lobby creates a private game, the owner walks it through enrollment
// to start, Lobby publishes a `runtime:start_jobs` envelope with the
// resolved `image_ref`, RTM starts a real `galaxy/game` engine
// container, publishes a success `runtime:job_results` entry, and
// Lobby's runtimejobresult worker transitions the game to `running`.
// The test then hits the engine's `/healthz` endpoint directly via
// the bridge network IP, proving the container is alive end-to-end.
func TestStartFlowSucceedsWithRealEngine(t *testing.T) {
	h := newLobbyRTMHarness(t)

	owner, _, gameID := h.prepareInflightGame(t,
		"start-owner@example.com", "start-invitee@example.com",
		"Start Galaxy", defaultEngineVersion,
	)
	t.Logf("owner=%s game=%s", owner.UserID, gameID)

	// RTM publishes a success job_result for the start envelope.
	startResult := h.waitJobResult(t, func(entry jobResultEntry) bool {
		return entry.GameID == gameID && entry.Outcome == jobOutcomeSuccess
	}, 90*time.Second)
	require.Empty(t, startResult.ErrorCode, "happy path must publish empty error_code")
	require.NotEmpty(t, startResult.ContainerID, "happy path must carry a container id")
	require.NotEmpty(t, startResult.EngineEndpoint, "happy path must carry an engine endpoint")

	// Lobby's runtime-job-result worker drives the game to `running`.
	h.waitGameStatus(t, gameID, "running", 30*time.Second)

	// RTM persists the runtime record and exposes it through REST.
	h.waitRTMRuntimeStatus(t, gameID, "running", 15*time.Second)

	// A real engine container exists with the expected labels.
	containerID := harness.FindContainerIDByLabel(t, gameID)
	require.NotEmptyf(t, containerID, "no engine container found for game %s", gameID)
	require.Equal(t, startResult.ContainerID, containerID,
		"job_result container_id must match the live container")
	require.Equal(t, "running", harness.ContainerState(t, containerID))

	// The engine answers /healthz on the bridge network IP.
	ip := harness.ContainerNetworkIP(t, containerID, h.dockerNetwork)
	require.NotEmptyf(t, ip, "engine container %s has no IP on network %s", containerID, h.dockerNetwork)
	harness.WaitForEngineHealthz(t, ip, 15*time.Second)
}

// TestRunningGameStopsWhenOwnerCascadeBlocked drives the stop path:
// drive the same game to `running`, publish a
// `user.lifecycle.permanent_blocked` event for the owner, the Lobby
// userlifecycle worker cascades to the inflight game, publishes a
// `runtime:stop_jobs` envelope with `reason=cancelled`, and RTM stops
// the engine. The test asserts on the public boundary surfaces only.
func TestRunningGameStopsWhenOwnerCascadeBlocked(t *testing.T) {
	h := newLobbyRTMHarness(t)

	owner, _, gameID := h.prepareInflightGame(t,
		"stop-owner@example.com", "stop-invitee@example.com",
		"Stop Galaxy", defaultEngineVersion,
	)
	t.Logf("owner=%s game=%s", owner.UserID, gameID)

	// Wait for the start outcome so we know RTM is fully running
	// before we trigger the cascade.
	h.waitJobResult(t, func(entry jobResultEntry) bool {
		return entry.GameID == gameID && entry.Outcome == jobOutcomeSuccess
	}, 90*time.Second)
	h.waitGameStatus(t, gameID, "running", 30*time.Second)

	// Capture the container id while the engine is still up, so we can
	// assert on its state after the stop below.
	containerID := harness.FindContainerIDByLabel(t, gameID)
	require.NotEmpty(t, containerID)

	// Trigger the cascade: permanent block on the game owner causes
	// Lobby's userlifecycle worker to publish stop_job(cancelled) and
	// transition the owned game to `cancelled`.
	h.publishUserLifecycleEvent(t, "user.lifecycle.permanent_blocked", owner.UserID)

	// Lobby observably publishes the right stop envelope on the boundary.
	stop := h.waitStopJobReason(t, gameID, stopReasonCancelled, 30*time.Second)
	assert.Equal(t, gameID, stop.GameID)

	// Lobby moves the game to cancelled.
	h.waitGameStatus(t, gameID, "cancelled", 30*time.Second)

	// RTM consumes stop_job, stops the engine, and persists status=stopped.
	h.waitRTMRuntimeStatus(t, gameID, "stopped", 30*time.Second)

	// The container is no longer running. Docker reports `exited`
	// (or `created`/`removing` during teardown); none of those match
	// `running`, which is the only state that contradicts a successful
	// stop.
	require.Eventuallyf(t, func() bool {
		state := harness.ContainerState(t, containerID)
		return state != "running"
	}, 30*time.Second, 250*time.Millisecond,
		"engine container %s did not leave running state", containerID)

	// RTM emitted at least two job_results for this game: one success
	// for the start, one success for the stop.
	successCount := 0
	for _, entry := range h.allJobResults(t) {
		if entry.GameID == gameID && entry.Outcome == jobOutcomeSuccess {
			successCount++
		}
	}
	assert.GreaterOrEqualf(t, successCount, 2,
		"expected at least two success job_results (start + stop) for game %s", gameID)
}

// TestStartFailsWhenImageMissing drives the failure path: the game's
// `target_engine_version` resolves to a non-existent image tag, RTM
// fails to pull, publishes a failure `runtime:job_results` plus a
// `runtime.image_pull_failed` notification intent, and Lobby's
// runtimejobresult worker transitions the game to `start_failed`.
func TestStartFailsWhenImageMissing(t *testing.T) {
	h := newLobbyRTMHarness(t)

	owner, _, gameID := h.prepareInflightGame(t,
		"fail-owner@example.com", "fail-invitee@example.com",
		"Fail Galaxy", missingEngineVersion,
	)
	t.Logf("owner=%s game=%s", owner.UserID, gameID)

	// The image_ref RTM should have attempted to pull for this version.
	// NOTE(review): the "-lobbyrtm-it" suffix presumably comes from the
	// suite's image-tag convention — confirm it matches the harness setup.
	expectedImageRef := "galaxy/game:" + missingEngineVersion + "-lobbyrtm-it"

	// RTM publishes a failure job_result with the stable code.
	failure := h.waitJobResult(t, func(entry jobResultEntry) bool {
		return entry.GameID == gameID && entry.Outcome == jobOutcomeFailure
	}, 120*time.Second)
	assert.Equal(t, errorCodeImagePullFailed, failure.ErrorCode)
	assert.Empty(t, failure.ContainerID)
	assert.Empty(t, failure.EngineEndpoint)
	assert.NotEmpty(t, failure.ErrorMessage)

	// RTM also publishes an admin notification intent on the shared stream.
	// NOTE(review): `notificationImagePulled` (defined elsewhere) is matched
	// for the pull-FAILURE intent despite its success-sounding name — confirm
	// its value is the image_pull_failed notification type.
	intent := h.waitNotificationIntent(t, func(entry notificationIntentEntry) bool {
		if entry.NotificationType != notificationImagePulled {
			return false
		}
		payloadGameID, _ := entry.Payload["game_id"].(string)
		return payloadGameID == gameID
	}, 30*time.Second)
	require.NotNil(t, intent.Payload)
	assert.Equal(t, gameID, intent.Payload["game_id"])
	assert.Equal(t, expectedImageRef, intent.Payload["image_ref"])
	assert.Equal(t, errorCodeImagePullFailed, intent.Payload["error_code"])

	// Lobby flips the game to start_failed.
	h.waitGameStatus(t, gameID, "start_failed", 60*time.Second)

	// No engine container should exist for this game.
	containerID := harness.FindContainerIDByLabel(t, gameID)
	if containerID != "" {
		state := harness.ContainerState(t, containerID)
		assert.NotEqual(t, "running", state,
			"failed image pull must not leave a running container behind (state=%s)", state)
	}

	// RTM either has no record (clean rollback) or has one not in
	// `running`. Either is acceptable per the start service contract.
	status, code := h.rtmRuntimeStatus(t, gameID)
	switch code {
	case http.StatusNotFound:
		// nothing persisted — clean rollback path
	case http.StatusOK:
		assert.NotEqual(t, "running", status, "failed image pull must not persist a running record")
	default:
		t.Fatalf("unexpected RTM runtime response: status=%q code=%d", status, code)
	}

	// Sanity check the notification carried RTM's producer marker
	// rather than Lobby's, so we know the suite truly observed RTM
	// publishing on the shared stream.
	assert.Truef(t,
		strings.Contains(intent.Producer, "rtm") || strings.Contains(intent.Producer, "runtime"),
		"image_pull_failed intent producer should be RTM-flavoured, got %q", intent.Producer)
}