feat: runtime manager
@@ -0,0 +1,204 @@
package lobbyrtm_test

import (
	"net/http"
	"strings"
	"testing"
	"time"

	"galaxy/integration/internal/harness"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

const (
	jobOutcomeSuccess = "success"
	jobOutcomeFailure = "failure"

	stopReasonCancelled = "cancelled"

	errorCodeImagePullFailed = "image_pull_failed"
)
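
// For orientation: the predicates below read a handful of fields from
// jobResultEntry, which is declared elsewhere in this package. Judging only
// from those call sites, its shape is roughly the following; this is an
// illustrative sketch, not the actual declaration:
//
//	type jobResultEntry struct {
//		GameID         string // game the job ran for
//		Outcome        string // jobOutcomeSuccess or jobOutcomeFailure
//		ErrorCode      string // stable machine-readable code on failure
//		ErrorMessage   string // human-readable detail on failure
//		ContainerID    string // engine container id on success
//		EngineEndpoint string // engine address on success
//	}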

// TestStartFlowSucceedsWithRealEngine drives the happy path:
// Lobby creates a private game, the owner walks it through enrollment
// to start, Lobby publishes a `runtime:start_jobs` envelope with the
// resolved `image_ref`, RTM starts a real `galaxy/game` engine
// container, publishes a success `runtime:job_results` entry, and
// Lobby's runtimejobresult worker transitions the game to `running`.
// The test then hits the engine's `/healthz` endpoint directly via
// the bridge network IP, proving the container is alive end-to-end.
func TestStartFlowSucceedsWithRealEngine(t *testing.T) {
	h := newLobbyRTMHarness(t)

	owner, _, gameID := h.prepareInflightGame(t,
		"start-owner@example.com",
		"start-invitee@example.com",
		"Start Galaxy",
		defaultEngineVersion,
	)
	t.Logf("owner=%s game=%s", owner.UserID, gameID)

	// RTM publishes a success job_result for the start envelope.
	startResult := h.waitJobResult(t, func(entry jobResultEntry) bool {
		return entry.GameID == gameID && entry.Outcome == jobOutcomeSuccess
	}, 90*time.Second)
	require.Empty(t, startResult.ErrorCode, "happy path must publish empty error_code")
	require.NotEmpty(t, startResult.ContainerID, "happy path must carry a container id")
	require.NotEmpty(t, startResult.EngineEndpoint, "happy path must carry an engine endpoint")

	// Lobby's runtime-job-result worker drives the game to `running`.
	h.waitGameStatus(t, gameID, "running", 30*time.Second)

	// RTM persists the runtime record and exposes it through REST.
	h.waitRTMRuntimeStatus(t, gameID, "running", 15*time.Second)

	// A real engine container exists with the expected labels.
	containerID := harness.FindContainerIDByLabel(t, gameID)
	require.NotEmptyf(t, containerID, "no engine container found for game %s", gameID)
	require.Equal(t, startResult.ContainerID, containerID,
		"job_result container_id must match the live container")
	require.Equal(t, "running", harness.ContainerState(t, containerID))

	// The engine answers /healthz on the bridge network IP.
	ip := harness.ContainerNetworkIP(t, containerID, h.dockerNetwork)
	require.NotEmptyf(t, ip, "engine container %s has no IP on network %s", containerID, h.dockerNetwork)
	harness.WaitForEngineHealthz(t, ip, 15*time.Second)
}
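
// harness.WaitForEngineHealthz is provided by the shared harness package and
// is not defined in this file. A minimal probe of the same flavour, assuming
// the engine serves plain HTTP and that 8080 is its port (both assumptions
// made purely for illustration), could look like:
//
//	deadline := time.Now().Add(15 * time.Second)
//	for {
//		resp, err := http.Get("http://" + ip + ":8080/healthz")
//		if err == nil {
//			resp.Body.Close()
//			if resp.StatusCode == http.StatusOK {
//				break // the engine is answering
//			}
//		}
//		if time.Now().After(deadline) {
//			t.Fatalf("engine at %s never became healthy", ip)
//		}
//		time.Sleep(250 * time.Millisecond)
//	}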

// TestRunningGameStopsWhenOwnerCascadeBlocked drives the stop path:
// the test drives a game to `running` via the same happy path, then
// publishes a `user.lifecycle.permanent_blocked` event for the owner;
// Lobby's userlifecycle worker cascades to the inflight game and
// publishes a `runtime:stop_jobs` envelope with `reason=cancelled`,
// and RTM stops the engine. The test asserts on the public boundary
// surfaces only.
func TestRunningGameStopsWhenOwnerCascadeBlocked(t *testing.T) {
	h := newLobbyRTMHarness(t)

	owner, _, gameID := h.prepareInflightGame(t,
		"stop-owner@example.com",
		"stop-invitee@example.com",
		"Stop Galaxy",
		defaultEngineVersion,
	)
	t.Logf("owner=%s game=%s", owner.UserID, gameID)

	// Wait for the start outcome so we know RTM is fully running
	// before we trigger the cascade.
	h.waitJobResult(t, func(entry jobResultEntry) bool {
		return entry.GameID == gameID && entry.Outcome == jobOutcomeSuccess
	}, 90*time.Second)
	h.waitGameStatus(t, gameID, "running", 30*time.Second)
	containerID := harness.FindContainerIDByLabel(t, gameID)
	require.NotEmpty(t, containerID)

	// Trigger the cascade: a permanent block on the game owner causes
	// Lobby's userlifecycle worker to publish stop_job(cancelled) and
	// transition the owned game to `cancelled`.
	h.publishUserLifecycleEvent(t, "user.lifecycle.permanent_blocked", owner.UserID)

	// Lobby observably publishes the right stop envelope on the boundary.
	stop := h.waitStopJobReason(t, gameID, stopReasonCancelled, 30*time.Second)
	assert.Equal(t, gameID, stop.GameID)

	// Lobby moves the game to cancelled.
	h.waitGameStatus(t, gameID, "cancelled", 30*time.Second)

	// RTM consumes stop_job, stops the engine, and persists status=stopped.
	h.waitRTMRuntimeStatus(t, gameID, "stopped", 30*time.Second)

	// The container is no longer running. Docker reports `exited`
	// (or `created`/`removing` during teardown); none of those match
	// `running`, which is the only state that contradicts a successful
	// stop.
	require.Eventuallyf(t, func() bool {
		state := harness.ContainerState(t, containerID)
		return state != "running"
	}, 30*time.Second, 250*time.Millisecond,
		"engine container %s did not leave running state", containerID)

	// RTM emitted at least two job_results for this game: one success
	// for the start, one success for the stop.
	successCount := 0
	for _, entry := range h.allJobResults(t) {
		if entry.GameID == gameID && entry.Outcome == jobOutcomeSuccess {
			successCount++
		}
	}
	assert.GreaterOrEqualf(t, successCount, 2,
		"expected at least two success job_results (start + stop) for game %s", gameID)
}
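
// harness.ContainerState and harness.FindContainerIDByLabel presumably wrap
// the Docker CLI or API; their internals live in the harness package. The
// state string asserted on above corresponds to what Docker itself reports,
// which a sketch using os/exec (not imported in this file) could fetch like
// this:
//
//	out, err := exec.Command("docker", "inspect",
//		"--format", "{{.State.Status}}", containerID).Output()
//	if err != nil {
//		t.Fatalf("docker inspect failed: %v", err)
//	}
//	// one of: created, running, paused, restarting, removing, exited, dead
//	state := strings.TrimSpace(string(out))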

// TestStartFailsWhenImageMissing drives the failure path: the game's
// `target_engine_version` resolves to a non-existent image tag, RTM
// fails to pull, publishes a failure `runtime:job_results` entry plus
// a `runtime.image_pull_failed` notification intent, and Lobby's
// runtimejobresult worker transitions the game to `start_failed`.
func TestStartFailsWhenImageMissing(t *testing.T) {
	h := newLobbyRTMHarness(t)

	owner, _, gameID := h.prepareInflightGame(t,
		"fail-owner@example.com",
		"fail-invitee@example.com",
		"Fail Galaxy",
		missingEngineVersion,
	)
	t.Logf("owner=%s game=%s", owner.UserID, gameID)

	expectedImageRef := "galaxy/game:" + missingEngineVersion + "-lobbyrtm-it"

	// RTM publishes a failure job_result with the stable code.
	failure := h.waitJobResult(t, func(entry jobResultEntry) bool {
		return entry.GameID == gameID && entry.Outcome == jobOutcomeFailure
	}, 120*time.Second)
	assert.Equal(t, errorCodeImagePullFailed, failure.ErrorCode)
	assert.Empty(t, failure.ContainerID)
	assert.Empty(t, failure.EngineEndpoint)
	assert.NotEmpty(t, failure.ErrorMessage)

	// RTM also publishes an admin notification intent on the shared stream.
	intent := h.waitNotificationIntent(t, func(entry notificationIntentEntry) bool {
		if entry.NotificationType != notificationImagePulled {
			return false
		}
		payloadGameID, _ := entry.Payload["game_id"].(string)
		return payloadGameID == gameID
	}, 30*time.Second)
	require.NotNil(t, intent.Payload)
	assert.Equal(t, gameID, intent.Payload["game_id"])
	assert.Equal(t, expectedImageRef, intent.Payload["image_ref"])
	assert.Equal(t, errorCodeImagePullFailed, intent.Payload["error_code"])

	// Lobby flips the game to start_failed.
	h.waitGameStatus(t, gameID, "start_failed", 60*time.Second)

	// No engine container should exist for this game.
	containerID := harness.FindContainerIDByLabel(t, gameID)
	if containerID != "" {
		state := harness.ContainerState(t, containerID)
		assert.NotEqualf(t, "running", state,
			"failed image pull must not leave a running container behind (state=%s)", state)
	}

	// RTM either has no record (clean rollback) or has one not in
	// `running`. Either is acceptable per the start service contract.
	status, code := h.rtmRuntimeStatus(t, gameID)
	switch code {
	case http.StatusNotFound:
		// nothing persisted: clean rollback path
	case http.StatusOK:
		assert.NotEqual(t, "running", status,
			"failed image pull must not persist a running record")
	default:
		t.Fatalf("unexpected RTM runtime response: status=%q code=%d", status, code)
	}

	// Sanity-check that the notification carried RTM's producer marker
	// rather than Lobby's, so we know the suite truly observed RTM
	// publishing on the shared stream.
	assert.Truef(t,
		strings.Contains(intent.Producer, "rtm") ||
			strings.Contains(intent.Producer, "runtime"),
		"image_pull_failed intent producer should be RTM-flavoured, got %q", intent.Producer)
}
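
// notificationIntentEntry, like jobResultEntry, is declared elsewhere in the
// package. From the fields read in the test above its shape is roughly the
// following; again an illustrative sketch rather than the real declaration:
//
//	type notificationIntentEntry struct {
//		NotificationType string                 // e.g. the image-pull-failed type
//		Producer         string                 // marker for the publishing service
//		Payload          map[string]interface{} // game_id, image_ref, error_code, ...
//	}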