package lobbyrtm_test
|
|
|
|
import (
|
|
"net/http"
|
|
"strings"
|
|
"testing"
|
|
"time"
|
|
|
|
"galaxy/integration/internal/harness"
|
|
|
|
"github.com/stretchr/testify/assert"
|
|
"github.com/stretchr/testify/require"
|
|
)
|
|
|
|
const (
	// jobOutcomeSuccess and jobOutcomeFailure are the outcome values the
	// tests match against `runtime:job_results` entries published by RTM.
	jobOutcomeSuccess = "success"
	jobOutcomeFailure = "failure"

	// stopReasonCancelled is the reason the tests expect on the
	// `runtime:stop_jobs` envelope when Lobby cancels a game.
	stopReasonCancelled = "cancelled"

	// errorCodeImagePullFailed is the stable error code asserted on both
	// the failure job_result and the notification intent payload when RTM
	// cannot pull the engine image.
	errorCodeImagePullFailed = "image_pull_failed"
)
|
|
|
|
// TestStartFlowSucceedsWithRealEngine drives the happy path:
|
|
// Lobby creates a private game, the owner walks it through enrollment
|
|
// to start, Lobby publishes a `runtime:start_jobs` envelope with the
|
|
// resolved `image_ref`, RTM starts a real `galaxy/game` engine
|
|
// container, publishes a success `runtime:job_results` entry, and
|
|
// Lobby's runtimejobresult worker transitions the game to `running`.
|
|
// The test then hits the engine's `/healthz` endpoint directly via
|
|
// the bridge network IP, proving the container is alive end-to-end.
|
|
func TestStartFlowSucceedsWithRealEngine(t *testing.T) {
|
|
h := newLobbyRTMHarness(t)
|
|
|
|
owner, _, gameID := h.prepareInflightGame(t,
|
|
"start-owner@example.com",
|
|
"start-invitee@example.com",
|
|
"Start Galaxy",
|
|
defaultEngineVersion,
|
|
)
|
|
t.Logf("owner=%s game=%s", owner.UserID, gameID)
|
|
|
|
// RTM publishes a success job_result for the start envelope.
|
|
startResult := h.waitJobResult(t, func(entry jobResultEntry) bool {
|
|
return entry.GameID == gameID && entry.Outcome == jobOutcomeSuccess
|
|
}, 90*time.Second)
|
|
require.Empty(t, startResult.ErrorCode, "happy path must publish empty error_code")
|
|
require.NotEmpty(t, startResult.ContainerID, "happy path must carry a container id")
|
|
require.NotEmpty(t, startResult.EngineEndpoint, "happy path must carry an engine endpoint")
|
|
|
|
// Lobby's runtime-job-result worker drives the game to `running`.
|
|
h.waitGameStatus(t, gameID, "running", 30*time.Second)
|
|
|
|
// RTM persists the runtime record and exposes it through REST.
|
|
h.waitRTMRuntimeStatus(t, gameID, "running", 15*time.Second)
|
|
|
|
// A real engine container exists with the expected labels.
|
|
containerID := harness.FindContainerIDByLabel(t, gameID)
|
|
require.NotEmptyf(t, containerID, "no engine container found for game %s", gameID)
|
|
require.Equal(t, startResult.ContainerID, containerID,
|
|
"job_result container_id must match the live container")
|
|
require.Equal(t, "running", harness.ContainerState(t, containerID))
|
|
|
|
// The engine answers /healthz on the bridge network IP.
|
|
ip := harness.ContainerNetworkIP(t, containerID, h.dockerNetwork)
|
|
require.NotEmptyf(t, ip, "engine container %s has no IP on network %s", containerID, h.dockerNetwork)
|
|
harness.WaitForEngineHealthz(t, ip, 15*time.Second)
|
|
}
|
|
|
|
// TestRunningGameStopsWhenOwnerCascadeBlocked drives the stop path:
|
|
// drive the same game to `running`, publish a
|
|
// `user.lifecycle.permanent_blocked` event for the owner, the Lobby
|
|
// userlifecycle worker cascades to the inflight game, publishes a
|
|
// `runtime:stop_jobs` envelope with `reason=cancelled`, and RTM stops
|
|
// the engine. The test asserts on the public boundary surfaces only.
|
|
func TestRunningGameStopsWhenOwnerCascadeBlocked(t *testing.T) {
|
|
h := newLobbyRTMHarness(t)
|
|
|
|
owner, _, gameID := h.prepareInflightGame(t,
|
|
"stop-owner@example.com",
|
|
"stop-invitee@example.com",
|
|
"Stop Galaxy",
|
|
defaultEngineVersion,
|
|
)
|
|
t.Logf("owner=%s game=%s", owner.UserID, gameID)
|
|
|
|
// Wait for the start outcome so we know RTM is fully running
|
|
// before we trigger the cascade.
|
|
h.waitJobResult(t, func(entry jobResultEntry) bool {
|
|
return entry.GameID == gameID && entry.Outcome == jobOutcomeSuccess
|
|
}, 90*time.Second)
|
|
h.waitGameStatus(t, gameID, "running", 30*time.Second)
|
|
containerID := harness.FindContainerIDByLabel(t, gameID)
|
|
require.NotEmpty(t, containerID)
|
|
|
|
// Trigger the cascade: permanent block on the game owner causes
|
|
// Lobby's userlifecycle worker to publish stop_job(cancelled) and
|
|
// transition the owned game to `cancelled`.
|
|
h.publishUserLifecycleEvent(t, "user.lifecycle.permanent_blocked", owner.UserID)
|
|
|
|
// Lobby observably publishes the right stop envelope on the boundary.
|
|
stop := h.waitStopJobReason(t, gameID, stopReasonCancelled, 30*time.Second)
|
|
assert.Equal(t, gameID, stop.GameID)
|
|
|
|
// Lobby moves the game to cancelled.
|
|
h.waitGameStatus(t, gameID, "cancelled", 30*time.Second)
|
|
|
|
// RTM consumes stop_job, stops the engine, and persists status=stopped.
|
|
h.waitRTMRuntimeStatus(t, gameID, "stopped", 30*time.Second)
|
|
|
|
// The container is no longer running. Docker reports `exited`
|
|
// (or `created`/`removing` during teardown); none of those match
|
|
// `running`, which is the only state that contradicts a successful
|
|
// stop.
|
|
require.Eventuallyf(t, func() bool {
|
|
state := harness.ContainerState(t, containerID)
|
|
return state != "running"
|
|
}, 30*time.Second, 250*time.Millisecond,
|
|
"engine container %s did not leave running state", containerID)
|
|
|
|
// RTM emitted at least two job_results for this game: one success
|
|
// for the start, one success for the stop.
|
|
successCount := 0
|
|
for _, entry := range h.allJobResults(t) {
|
|
if entry.GameID == gameID && entry.Outcome == jobOutcomeSuccess {
|
|
successCount++
|
|
}
|
|
}
|
|
assert.GreaterOrEqualf(t, successCount, 2,
|
|
"expected at least two success job_results (start + stop) for game %s", gameID)
|
|
}
|
|
|
|
// TestStartFailsWhenImageMissing drives the failure path: the game's
|
|
// `target_engine_version` resolves to a non-existent image tag, RTM
|
|
// fails to pull, publishes a failure `runtime:job_results` plus a
|
|
// `runtime.image_pull_failed` notification intent, and Lobby's
|
|
// runtimejobresult worker transitions the game to `start_failed`.
|
|
func TestStartFailsWhenImageMissing(t *testing.T) {
|
|
h := newLobbyRTMHarness(t)
|
|
|
|
owner, _, gameID := h.prepareInflightGame(t,
|
|
"fail-owner@example.com",
|
|
"fail-invitee@example.com",
|
|
"Fail Galaxy",
|
|
missingEngineVersion,
|
|
)
|
|
t.Logf("owner=%s game=%s", owner.UserID, gameID)
|
|
|
|
expectedImageRef := "galaxy/game:" + missingEngineVersion + "-lobbyrtm-it"
|
|
|
|
// RTM publishes a failure job_result with the stable code.
|
|
failure := h.waitJobResult(t, func(entry jobResultEntry) bool {
|
|
return entry.GameID == gameID && entry.Outcome == jobOutcomeFailure
|
|
}, 120*time.Second)
|
|
assert.Equal(t, errorCodeImagePullFailed, failure.ErrorCode)
|
|
assert.Empty(t, failure.ContainerID)
|
|
assert.Empty(t, failure.EngineEndpoint)
|
|
assert.NotEmpty(t, failure.ErrorMessage)
|
|
|
|
// RTM also publishes an admin notification intent on the shared stream.
|
|
intent := h.waitNotificationIntent(t, func(entry notificationIntentEntry) bool {
|
|
if entry.NotificationType != notificationImagePulled {
|
|
return false
|
|
}
|
|
payloadGameID, _ := entry.Payload["game_id"].(string)
|
|
return payloadGameID == gameID
|
|
}, 30*time.Second)
|
|
require.NotNil(t, intent.Payload)
|
|
assert.Equal(t, gameID, intent.Payload["game_id"])
|
|
assert.Equal(t, expectedImageRef, intent.Payload["image_ref"])
|
|
assert.Equal(t, errorCodeImagePullFailed, intent.Payload["error_code"])
|
|
|
|
// Lobby flips the game to start_failed.
|
|
h.waitGameStatus(t, gameID, "start_failed", 60*time.Second)
|
|
|
|
// No engine container should exist for this game.
|
|
containerID := harness.FindContainerIDByLabel(t, gameID)
|
|
if containerID != "" {
|
|
state := harness.ContainerState(t, containerID)
|
|
assert.NotEqual(t, "running", state,
|
|
"failed image pull must not leave a running container behind (state=%s)", state)
|
|
}
|
|
|
|
// RTM either has no record (clean rollback) or has one not in
|
|
// `running`. Either is acceptable per the start service contract.
|
|
status, code := h.rtmRuntimeStatus(t, gameID)
|
|
switch code {
|
|
case http.StatusNotFound:
|
|
// nothing persisted — clean rollback path
|
|
case http.StatusOK:
|
|
assert.NotEqual(t, "running", status,
|
|
"failed image pull must not persist a running record")
|
|
default:
|
|
t.Fatalf("unexpected RTM runtime response: status=%q code=%d", status, code)
|
|
}
|
|
|
|
// Sanity check the notification carried RTM's producer marker
|
|
// rather than Lobby's, so we know the suite truly observed RTM
|
|
// publishing on the shared stream.
|
|
assert.Truef(t,
|
|
strings.Contains(intent.Producer, "rtm") ||
|
|
strings.Contains(intent.Producer, "runtime"),
|
|
"image_pull_failed intent producer should be RTM-flavoured, got %q", intent.Producer)
|
|
}
|