Files
galaxy-game/integration/lobbyrtm/lobby_rtm_test.go
T
2026-04-28 20:39:18 +02:00

205 lines
7.9 KiB
Go

package lobbyrtm_test
import (
"net/http"
"strings"
"testing"
"time"
"galaxy/integration/internal/harness"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
const (
	// jobOutcomeSuccess / jobOutcomeFailure are the outcome values RTM
	// publishes in `runtime:job_results` entries.
	jobOutcomeSuccess = "success"
	jobOutcomeFailure = "failure"
	// stopReasonCancelled is the reason Lobby attaches to `runtime:stop_jobs`
	// envelopes when an inflight game is cancelled (e.g. owner-block cascade).
	stopReasonCancelled = "cancelled"
	// errorCodeImagePullFailed is the stable error code RTM reports when the
	// engine image cannot be pulled.
	errorCodeImagePullFailed = "image_pull_failed"
)
// TestStartFlowSucceedsWithRealEngine covers the happy path end-to-end:
// Lobby creates a private game and walks it through enrollment to start,
// then publishes a `runtime:start_jobs` envelope carrying the resolved
// `image_ref`. RTM launches a real `galaxy/game` engine container and
// reports success via `runtime:job_results`, after which Lobby's
// runtimejobresult worker moves the game to `running`. Finally the test
// calls the engine's `/healthz` directly over the bridge network IP to
// prove a live container exists.
func TestStartFlowSucceedsWithRealEngine(t *testing.T) {
	env := newLobbyRTMHarness(t)
	owner, _, gameID := env.prepareInflightGame(t,
		"start-owner@example.com",
		"start-invitee@example.com",
		"Start Galaxy",
		defaultEngineVersion,
	)
	t.Logf("owner=%s game=%s", owner.UserID, gameID)

	// The start envelope must produce a success job_result from RTM.
	isStartSuccess := func(entry jobResultEntry) bool {
		return entry.GameID == gameID && entry.Outcome == jobOutcomeSuccess
	}
	result := env.waitJobResult(t, isStartSuccess, 90*time.Second)
	require.Empty(t, result.ErrorCode, "happy path must publish empty error_code")
	require.NotEmpty(t, result.ContainerID, "happy path must carry a container id")
	require.NotEmpty(t, result.EngineEndpoint, "happy path must carry an engine endpoint")

	// Lobby's runtime-job-result worker flips the game to `running`,
	// and RTM persists a matching runtime record behind its REST API.
	env.waitGameStatus(t, gameID, "running", 30*time.Second)
	env.waitRTMRuntimeStatus(t, gameID, "running", 15*time.Second)

	// A labelled engine container must exist, match the job_result,
	// and be in the `running` Docker state.
	engineCID := harness.FindContainerIDByLabel(t, gameID)
	require.NotEmptyf(t, engineCID, "no engine container found for game %s", gameID)
	require.Equal(t, result.ContainerID, engineCID,
		"job_result container_id must match the live container")
	require.Equal(t, "running", harness.ContainerState(t, engineCID))

	// End-to-end proof of life: /healthz answers on the bridge network IP.
	engineIP := harness.ContainerNetworkIP(t, engineCID, env.dockerNetwork)
	require.NotEmptyf(t, engineIP, "engine container %s has no IP on network %s", engineCID, env.dockerNetwork)
	harness.WaitForEngineHealthz(t, engineIP, 15*time.Second)
}
// TestRunningGameStopsWhenOwnerCascadeBlocked covers the stop path:
// a game is driven to `running`, then a `user.lifecycle.permanent_blocked`
// event for the owner makes Lobby's userlifecycle worker cascade to the
// inflight game and publish a `runtime:stop_jobs` envelope with
// `reason=cancelled`; RTM consumes it and stops the engine. Only public
// boundary surfaces (streams, REST, Docker state) are asserted.
func TestRunningGameStopsWhenOwnerCascadeBlocked(t *testing.T) {
	env := newLobbyRTMHarness(t)
	owner, _, gameID := env.prepareInflightGame(t,
		"stop-owner@example.com",
		"stop-invitee@example.com",
		"Stop Galaxy",
		defaultEngineVersion,
	)
	t.Logf("owner=%s game=%s", owner.UserID, gameID)

	// Block until the start outcome lands, so RTM is fully up before
	// the cascade is triggered.
	isSuccessForGame := func(entry jobResultEntry) bool {
		return entry.GameID == gameID && entry.Outcome == jobOutcomeSuccess
	}
	env.waitJobResult(t, isSuccessForGame, 90*time.Second)
	env.waitGameStatus(t, gameID, "running", 30*time.Second)
	engineCID := harness.FindContainerIDByLabel(t, gameID)
	require.NotEmpty(t, engineCID)

	// Permanent block on the owner: Lobby's userlifecycle worker must
	// publish stop_job(cancelled) and cancel the owned game.
	env.publishUserLifecycleEvent(t, "user.lifecycle.permanent_blocked", owner.UserID)

	// The stop envelope is observable on the boundary with the right reason.
	stopJob := env.waitStopJobReason(t, gameID, stopReasonCancelled, 30*time.Second)
	assert.Equal(t, gameID, stopJob.GameID)

	// Lobby cancels the game; RTM stops the engine and persists `stopped`.
	env.waitGameStatus(t, gameID, "cancelled", 30*time.Second)
	env.waitRTMRuntimeStatus(t, gameID, "stopped", 30*time.Second)

	// Docker may report `exited`, `created`, or `removing` during
	// teardown; `running` is the only state that contradicts a
	// successful stop, so that is all we rule out.
	// NOTE(review): the polled condition passes t into harness.ContainerState;
	// if that helper can call t.Fatal, doing so from Eventually's goroutine
	// is unsafe — confirm it only reports via return value here.
	leftRunning := func() bool {
		return harness.ContainerState(t, engineCID) != "running"
	}
	require.Eventuallyf(t, leftRunning, 30*time.Second, 250*time.Millisecond,
		"engine container %s did not leave running state", engineCID)

	// At least two success job_results must exist for this game:
	// one for the start and one for the stop.
	successes := 0
	for _, entry := range env.allJobResults(t) {
		if entry.GameID != gameID {
			continue
		}
		if entry.Outcome == jobOutcomeSuccess {
			successes++
		}
	}
	assert.GreaterOrEqualf(t, successes, 2,
		"expected at least two success job_results (start + stop) for game %s", gameID)
}
// TestStartFailsWhenImageMissing drives the failure path: the game's
// `target_engine_version` resolves to a non-existent image tag, RTM
// fails to pull, publishes a failure `runtime:job_results` plus a
// `runtime.image_pull_failed` notification intent, and Lobby's
// runtimejobresult worker transitions the game to `start_failed`.
func TestStartFailsWhenImageMissing(t *testing.T) {
	h := newLobbyRTMHarness(t)
	owner, _, gameID := h.prepareInflightGame(t,
		"fail-owner@example.com",
		"fail-invitee@example.com",
		"Fail Galaxy",
		missingEngineVersion,
	)
	t.Logf("owner=%s game=%s", owner.UserID, gameID)
	// The image_ref RTM should have attempted to pull for this game.
	expectedImageRef := "galaxy/game:" + missingEngineVersion + "-lobbyrtm-it"

	// RTM publishes a failure job_result with the stable error code and
	// no container/endpoint (nothing was started).
	failure := h.waitJobResult(t, func(entry jobResultEntry) bool {
		return entry.GameID == gameID && entry.Outcome == jobOutcomeFailure
	}, 120*time.Second)
	assert.Equal(t, errorCodeImagePullFailed, failure.ErrorCode)
	assert.Empty(t, failure.ContainerID)
	assert.Empty(t, failure.EngineEndpoint)
	assert.NotEmpty(t, failure.ErrorMessage)

	// RTM also publishes an admin notification intent on the shared stream.
	// NOTE(review): `notificationImagePulled` reads as "pulled" but is
	// matched against the image_pull_failed intent here — confirm the
	// constant's declared value matches the failure notification type.
	intent := h.waitNotificationIntent(t, func(entry notificationIntentEntry) bool {
		if entry.NotificationType != notificationImagePulled {
			return false
		}
		payloadGameID, _ := entry.Payload["game_id"].(string)
		return payloadGameID == gameID
	}, 30*time.Second)
	require.NotNil(t, intent.Payload)
	assert.Equal(t, gameID, intent.Payload["game_id"])
	assert.Equal(t, expectedImageRef, intent.Payload["image_ref"])
	assert.Equal(t, errorCodeImagePullFailed, intent.Payload["error_code"])

	// Lobby flips the game to start_failed.
	h.waitGameStatus(t, gameID, "start_failed", 60*time.Second)

	// No running engine container may exist for this game. A stopped
	// leftover (created/exited) is tolerated; only `running` fails.
	containerID := harness.FindContainerIDByLabel(t, gameID)
	if containerID != "" {
		state := harness.ContainerState(t, containerID)
		// Use the `f` variant: format args through the non-f form trip
		// go vet's printf check and break the file's own convention.
		assert.NotEqualf(t, "running", state,
			"failed image pull must not leave a running container behind (state=%s)", state)
	}

	// RTM either has no record (clean rollback) or has one not in
	// `running`. Either is acceptable per the start service contract.
	status, code := h.rtmRuntimeStatus(t, gameID)
	switch code {
	case http.StatusNotFound:
		// nothing persisted — clean rollback path
	case http.StatusOK:
		assert.NotEqual(t, "running", status,
			"failed image pull must not persist a running record")
	default:
		t.Fatalf("unexpected RTM runtime response: status=%q code=%d", status, code)
	}

	// Sanity check the notification carried RTM's producer marker
	// rather than Lobby's, so we know the suite truly observed RTM
	// publishing on the shared stream.
	assert.Truef(t,
		strings.Contains(intent.Producer, "rtm") ||
			strings.Contains(intent.Producer, "runtime"),
		"image_pull_failed intent producer should be RTM-flavoured, got %q", intent.Producer)
}