feat: runtime manager

Ilia Denisov
2026-04-28 20:39:18 +02:00
committed by GitHub
parent e0a99b346b
commit a7cee15115
289 changed files with 45660 additions and 2207 deletions
@@ -0,0 +1,747 @@
// Package lobbyrtm_test exercises the Lobby ↔ Runtime Manager
// boundary against real Lobby + real Runtime Manager + real
// PostgreSQL + real Redis + real Docker daemon running the
// galaxy/game test engine container. It satisfies the inter-service
// requirement spelled out in `TESTING.md §7` and PLAN.md Stage 20.
//
// The boundary contract is: Lobby publishes `runtime:start_jobs` and
// `runtime:stop_jobs` envelopes, RTM consumes them and runs/stops
// engine containers, RTM publishes `runtime:job_results`, Lobby
// transitions the game accordingly. The suite asserts only on those
// public surfaces (Lobby/RTM REST, Redis Streams, Docker container
// state); it never imports `*/internal/...` packages of either
// service.
package lobbyrtm_test
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"maps"
"net/http"
"net/http/httptest"
"os"
"strconv"
"strings"
"sync/atomic"
"testing"
"time"
"galaxy/integration/internal/harness"
"github.com/redis/go-redis/v9"
"github.com/stretchr/testify/require"
)
const (
defaultEngineVersion = "1.0.0"
missingEngineVersion = "0.0.0-missing"
startJobsStream = "runtime:start_jobs"
stopJobsStream = "runtime:stop_jobs"
jobResultsStream = "runtime:job_results"
healthEventsStream = "runtime:health_events"
notificationIntentsKey = "notification:intents"
userLifecycleStream = "user:lifecycle_events"
gmEventsStream = "gm:lobby_events"
expectedLobbyProducer = "game_lobby"
notificationImagePullFailed = "runtime.image_pull_failed"
)
// suiteSeq generates a per-test suffix for stream names so concurrent
// test invocations cannot bleed events into each other.
var suiteSeq atomic.Int64
// lobbyRTMHarness owns the per-test infrastructure: containers,
// processes, stream keys, and helper clients. One harness per test
// keeps each scenario fully isolated.
type lobbyRTMHarness struct {
redis *redis.Client
userServiceURL string
lobbyPublicURL string
lobbyAdminURL string
rtmInternalURL string
intentsStream string
lifecycleStream string
jobResultsStream string
startJobsStream string
stopJobsStream string
healthEvents string
gmStub *httptest.Server
dockerNetwork string
engineImage string
userServiceProcess *harness.Process
lobbyProcess *harness.Process
rtmProcess *harness.Process
}
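// ensureUserResponse mirrors the User Service ensure-by-email
// response body consumed by the tests.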
type ensureUserResponse struct {
Outcome string `json:"outcome"`
UserID string `json:"user_id"`
}
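// httpResponse captures the pieces of an HTTP response the tests
// assert on: status code, body, and headers.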
type httpResponse struct {
StatusCode int
Body string
Header http.Header
}
// newLobbyRTMHarness brings up one independent test environment:
// Postgres containers per service (mirrors `lobbynotification`), one
// Redis container, real binaries for User Service / Lobby / RTM, a
// GM stub that returns 200, a per-test Docker bridge network, and
// the freshly-built `galaxy/game` test image.
func newLobbyRTMHarness(t *testing.T) *lobbyRTMHarness {
t.Helper()
// Skip the whole suite when Docker is unreachable. This check runs
// before any testcontainer is started, so the skip path kicks in
// before testcontainers-go tries (and fails) to probe the daemon.
harness.RequireDockerDaemon(t)
redisRuntime := harness.StartRedisContainer(t)
redisClient := redis.NewClient(&redis.Options{
Addr: redisRuntime.Addr,
Protocol: 2,
DisableIdentity: true,
})
t.Cleanup(func() {
require.NoError(t, redisClient.Close())
})
gmStub := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
w.WriteHeader(http.StatusOK)
_, _ = w.Write([]byte(`{}`))
}))
t.Cleanup(gmStub.Close)
engineImage := harness.EnsureGalaxyGameImage(t)
dockerNetwork := harness.EnsureDockerNetwork(t)
userServiceAddr := harness.FreeTCPAddress(t)
lobbyPublicAddr := harness.FreeTCPAddress(t)
lobbyInternalAddr := harness.FreeTCPAddress(t)
rtmInternalAddr := harness.FreeTCPAddress(t)
userServiceBinary := harness.BuildBinary(t, "userservice", "./user/cmd/userservice")
lobbyBinary := harness.BuildBinary(t, "lobby", "./lobby/cmd/lobby")
rtmBinary := harness.BuildBinary(t, "rtmanager", "./rtmanager/cmd/rtmanager")
userServiceEnv := harness.StartUserServicePersistence(t, redisRuntime.Addr).Env
userServiceEnv["USERSERVICE_LOG_LEVEL"] = "info"
userServiceEnv["USERSERVICE_INTERNAL_HTTP_ADDR"] = userServiceAddr
userServiceEnv["OTEL_TRACES_EXPORTER"] = "none"
userServiceEnv["OTEL_METRICS_EXPORTER"] = "none"
userServiceProcess := harness.StartProcess(t, "userservice", userServiceBinary, userServiceEnv)
waitForUserServiceReady(t, userServiceProcess, "http://"+userServiceAddr)
suffix := strconv.FormatInt(suiteSeq.Add(1), 10)
intentsStream := notificationIntentsKey + ":" + suffix
lifecycleStream := userLifecycleStream + ":" + suffix
jobResultsStreamKey := jobResultsStream + ":" + suffix
startJobsStreamKey := startJobsStream + ":" + suffix
stopJobsStreamKey := stopJobsStream + ":" + suffix
healthEventsStreamKey := healthEventsStream + ":" + suffix
gmEventsStreamKey := gmEventsStream + ":" + suffix
lobbyEnv := harness.StartLobbyServicePersistence(t, redisRuntime.Addr).Env
lobbyEnv["LOBBY_LOG_LEVEL"] = "info"
lobbyEnv["LOBBY_PUBLIC_HTTP_ADDR"] = lobbyPublicAddr
lobbyEnv["LOBBY_INTERNAL_HTTP_ADDR"] = lobbyInternalAddr
lobbyEnv["LOBBY_USER_SERVICE_BASE_URL"] = "http://" + userServiceAddr
lobbyEnv["LOBBY_GM_BASE_URL"] = gmStub.URL
lobbyEnv["LOBBY_NOTIFICATION_INTENTS_STREAM"] = intentsStream
lobbyEnv["LOBBY_USER_LIFECYCLE_STREAM"] = lifecycleStream
lobbyEnv["LOBBY_RUNTIME_JOB_RESULTS_STREAM"] = jobResultsStreamKey
lobbyEnv["LOBBY_RUNTIME_START_JOBS_STREAM"] = startJobsStreamKey
lobbyEnv["LOBBY_RUNTIME_STOP_JOBS_STREAM"] = stopJobsStreamKey
lobbyEnv["LOBBY_GM_EVENTS_STREAM"] = gmEventsStreamKey
lobbyEnv["LOBBY_RUNTIME_JOB_RESULTS_READ_BLOCK_TIMEOUT"] = "200ms"
lobbyEnv["LOBBY_USER_LIFECYCLE_READ_BLOCK_TIMEOUT"] = "200ms"
lobbyEnv["LOBBY_GM_EVENTS_READ_BLOCK_TIMEOUT"] = "200ms"
lobbyEnv["LOBBY_ENGINE_IMAGE_TEMPLATE"] = "galaxy/game:{engine_version}-lobbyrtm-it"
lobbyEnv["OTEL_TRACES_EXPORTER"] = "none"
lobbyEnv["OTEL_METRICS_EXPORTER"] = "none"
lobbyProcess := harness.StartProcess(t, "lobby", lobbyBinary, lobbyEnv)
harness.WaitForHTTPStatus(t, lobbyProcess, "http://"+lobbyInternalAddr+"/readyz", http.StatusOK)
rtmEnv := harness.StartRTManagerServicePersistence(t, redisRuntime.Addr).Env
rtmEnv["RTMANAGER_LOG_LEVEL"] = "info"
rtmEnv["RTMANAGER_INTERNAL_HTTP_ADDR"] = rtmInternalAddr
rtmEnv["RTMANAGER_LOBBY_INTERNAL_BASE_URL"] = "http://" + lobbyInternalAddr
rtmEnv["RTMANAGER_DOCKER_HOST"] = resolveDockerHost()
rtmEnv["RTMANAGER_DOCKER_NETWORK"] = dockerNetwork
// On dev machines and in sandboxes the rtmanager process cannot
// chown the per-game state dir to root (uid 0). Pin the owner to
// the current process uid/gid so `chown` is a no-op.
rtmEnv["RTMANAGER_GAME_STATE_OWNER_UID"] = strconv.Itoa(os.Getuid())
rtmEnv["RTMANAGER_GAME_STATE_OWNER_GID"] = strconv.Itoa(os.Getgid())
rtmEnv["RTMANAGER_GAME_STATE_ROOT"] = t.TempDir()
rtmEnv["RTMANAGER_REDIS_START_JOBS_STREAM"] = startJobsStreamKey
rtmEnv["RTMANAGER_REDIS_STOP_JOBS_STREAM"] = stopJobsStreamKey
rtmEnv["RTMANAGER_REDIS_JOB_RESULTS_STREAM"] = jobResultsStreamKey
rtmEnv["RTMANAGER_REDIS_HEALTH_EVENTS_STREAM"] = healthEventsStreamKey
rtmEnv["RTMANAGER_NOTIFICATION_INTENTS_STREAM"] = intentsStream
rtmEnv["RTMANAGER_STREAM_BLOCK_TIMEOUT"] = "200ms"
rtmEnv["RTMANAGER_RECONCILE_INTERVAL"] = "1s"
rtmEnv["RTMANAGER_CLEANUP_INTERVAL"] = "1s"
rtmEnv["RTMANAGER_INSPECT_INTERVAL"] = "1s"
rtmEnv["RTMANAGER_PROBE_INTERVAL"] = "1s"
rtmEnv["RTMANAGER_PROBE_TIMEOUT"] = "1s"
rtmEnv["RTMANAGER_PROBE_FAILURES_THRESHOLD"] = "3"
rtmEnv["RTMANAGER_GAME_LEASE_TTL_SECONDS"] = "10"
rtmEnv["OTEL_TRACES_EXPORTER"] = "none"
rtmEnv["OTEL_METRICS_EXPORTER"] = "none"
rtmProcess := harness.StartProcess(t, "rtmanager", rtmBinary, rtmEnv)
harness.WaitForHTTPStatus(t, rtmProcess, "http://"+rtmInternalAddr+"/readyz", http.StatusOK)
return &lobbyRTMHarness{
redis: redisClient,
userServiceURL: "http://" + userServiceAddr,
lobbyPublicURL: "http://" + lobbyPublicAddr,
lobbyAdminURL: "http://" + lobbyInternalAddr,
rtmInternalURL: "http://" + rtmInternalAddr,
intentsStream: intentsStream,
lifecycleStream: lifecycleStream,
jobResultsStream: jobResultsStreamKey,
startJobsStream: startJobsStreamKey,
stopJobsStream: stopJobsStreamKey,
healthEvents: healthEventsStreamKey,
gmStub: gmStub,
dockerNetwork: dockerNetwork,
engineImage: engineImage,
userServiceProcess: userServiceProcess,
lobbyProcess: lobbyProcess,
rtmProcess: rtmProcess,
}
}
// ensureUser provisions a fresh User Service account by email and
// returns the assigned user_id. Callers pass scenario-specific
// emails so accounts never collide across tests.
func (h *lobbyRTMHarness) ensureUser(t *testing.T, email string) ensureUserResponse {
t.Helper()
resp := postJSON(t, h.userServiceURL+"/api/v1/internal/users/ensure-by-email", map[string]any{
"email": email,
"registration_context": map[string]string{
"preferred_language": "en",
"time_zone": "Europe/Kaliningrad",
},
}, nil)
var out ensureUserResponse
requireJSONStatus(t, resp, http.StatusOK, &out)
require.Equal(t, "created", out.Outcome)
require.NotEmpty(t, out.UserID)
return out
}
// userCreatePrivateGame creates a private game owned by ownerUserID
// with the supplied target engine version. Returns the assigned
// game_id.
func (h *lobbyRTMHarness) userCreatePrivateGame(
t *testing.T,
ownerUserID, name, targetEngineVersion string,
enrollmentEndsAt int64,
) string {
t.Helper()
resp := postJSON(t, h.lobbyPublicURL+"/api/v1/lobby/games", map[string]any{
"game_name": name,
"game_type": "private",
"min_players": 1,
"max_players": 4,
"start_gap_hours": 6,
"start_gap_players": 1,
"enrollment_ends_at": enrollmentEndsAt,
"turn_schedule": "0 18 * * *",
"target_engine_version": targetEngineVersion,
}, http.Header{"X-User-Id": []string{ownerUserID}})
require.Equalf(t, http.StatusCreated, resp.StatusCode, "create private game: %s", resp.Body)
var record map[string]any
require.NoError(t, json.Unmarshal([]byte(resp.Body), &record))
gameID, ok := record["game_id"].(string)
require.Truef(t, ok, "game_id missing: %s", resp.Body)
return gameID
}
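// userOpenEnrollment opens enrollment on gameID through the public
// Lobby API, acting as the game owner.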
func (h *lobbyRTMHarness) userOpenEnrollment(t *testing.T, ownerUserID, gameID string) {
t.Helper()
resp := postJSON(t,
h.lobbyPublicURL+"/api/v1/lobby/games/"+gameID+"/open-enrollment",
nil,
http.Header{"X-User-Id": []string{ownerUserID}},
)
require.Equalf(t, http.StatusOK, resp.StatusCode, "user open enrollment: %s", resp.Body)
}
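// userCreateInvite issues a private-game invite from the owner to
// inviteeUserID.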
func (h *lobbyRTMHarness) userCreateInvite(t *testing.T, ownerUserID, gameID, inviteeUserID string) {
t.Helper()
resp := postJSON(t,
h.lobbyPublicURL+"/api/v1/lobby/games/"+gameID+"/invites",
map[string]any{"invitee_user_id": inviteeUserID},
http.Header{"X-User-Id": []string{ownerUserID}},
)
require.Equalf(t, http.StatusCreated, resp.StatusCode, "create invite: %s", resp.Body)
}
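// firstCreatedInviteID lists the invitee's invites in `created`
// status and returns the one targeting gameID, failing the test when
// none matches.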
func (h *lobbyRTMHarness) firstCreatedInviteID(t *testing.T, inviteeUserID, gameID string) string {
t.Helper()
req, err := http.NewRequest(http.MethodGet,
h.lobbyPublicURL+"/api/v1/lobby/my/invites?status=created", nil)
require.NoError(t, err)
req.Header.Set("X-User-Id", inviteeUserID)
resp := doRequest(t, req)
require.Equalf(t, http.StatusOK, resp.StatusCode, "list my invites: %s", resp.Body)
var body struct {
Items []struct {
InviteID string `json:"invite_id"`
GameID string `json:"game_id"`
} `json:"items"`
}
require.NoError(t, json.Unmarshal([]byte(resp.Body), &body))
for _, item := range body.Items {
if item.GameID == gameID {
return item.InviteID
}
}
t.Fatalf("no invite found for invitee %s on game %s; body=%s", inviteeUserID, gameID, resp.Body)
return ""
}
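// userRedeemInvite redeems inviteID for gameID as the invitee,
// claiming the given race name.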
func (h *lobbyRTMHarness) userRedeemInvite(t *testing.T, inviteeUserID, gameID, inviteID, raceName string) {
t.Helper()
resp := postJSON(t,
h.lobbyPublicURL+"/api/v1/lobby/games/"+gameID+"/invites/"+inviteID+"/redeem",
map[string]any{"race_name": raceName},
http.Header{"X-User-Id": []string{inviteeUserID}},
)
require.Equalf(t, http.StatusOK, resp.StatusCode, "redeem invite: %s", resp.Body)
}
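// userReadyToStart marks gameID ready-to-start on behalf of the owner.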
func (h *lobbyRTMHarness) userReadyToStart(t *testing.T, ownerUserID, gameID string) {
t.Helper()
resp := postJSON(t,
h.lobbyPublicURL+"/api/v1/lobby/games/"+gameID+"/ready-to-start",
nil,
http.Header{"X-User-Id": []string{ownerUserID}},
)
require.Equalf(t, http.StatusOK, resp.StatusCode, "ready-to-start: %s", resp.Body)
}
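// userStartGame asks Lobby to start gameID, which triggers the
// `runtime:start_jobs` publish under test.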
func (h *lobbyRTMHarness) userStartGame(t *testing.T, ownerUserID, gameID string) {
t.Helper()
resp := postJSON(t,
h.lobbyPublicURL+"/api/v1/lobby/games/"+gameID+"/start",
nil,
http.Header{"X-User-Id": []string{ownerUserID}},
)
require.Equalf(t, http.StatusOK, resp.StatusCode, "user start: %s", resp.Body)
}
// prepareInflightGame walks one private game from creation through
// `start`. For the happy and cancel scenarios the game subsequently
// reaches `running` once RTM publishes the success job_result; for
// the failure scenario it ends in `start_failed`.
//
// Returns owner and invitee user records plus the game id.
func (h *lobbyRTMHarness) prepareInflightGame(
t *testing.T,
ownerEmail, inviteeEmail, gameName, targetEngineVersion string,
) (owner, invitee ensureUserResponse, gameID string) {
t.Helper()
owner = h.ensureUser(t, ownerEmail)
invitee = h.ensureUser(t, inviteeEmail)
gameID = h.userCreatePrivateGame(t, owner.UserID, gameName, targetEngineVersion,
time.Now().Add(48*time.Hour).Unix())
h.userOpenEnrollment(t, owner.UserID, gameID)
h.userCreateInvite(t, owner.UserID, gameID, invitee.UserID)
inviteID := h.firstCreatedInviteID(t, invitee.UserID, gameID)
h.userRedeemInvite(t, invitee.UserID, gameID, inviteID, "PilotInvitee")
h.userReadyToStart(t, owner.UserID, gameID)
h.userStartGame(t, owner.UserID, gameID)
return owner, invitee, gameID
}
// gameStatus reads one game record off Lobby's internal API and
// returns its status field. Used by waitGameStatus and direct
// assertions.
func (h *lobbyRTMHarness) gameStatus(t *testing.T, gameID string) string {
t.Helper()
req, err := http.NewRequest(http.MethodGet,
h.lobbyAdminURL+"/api/v1/internal/games/"+gameID, nil)
require.NoError(t, err)
resp := doRequest(t, req)
if resp.StatusCode != http.StatusOK {
t.Fatalf("get game internal: status=%d body=%s", resp.StatusCode, resp.Body)
}
var record struct {
Status string `json:"status"`
}
require.NoError(t, json.Unmarshal([]byte(resp.Body), &record))
return record.Status
}
// waitGameStatus polls `GET /api/v1/internal/games/{gameID}` until
// the record reports the expected status or the timeout fires.
func (h *lobbyRTMHarness) waitGameStatus(t *testing.T, gameID, want string, timeout time.Duration) {
t.Helper()
deadline := time.Now().Add(timeout)
for {
got := h.gameStatus(t, gameID)
if got == want {
return
}
if time.Now().After(deadline) {
t.Fatalf("game %s status: want %q got %q (after %s)", gameID, want, got, timeout)
}
time.Sleep(150 * time.Millisecond)
}
}
// publishUserLifecycleEvent appends one event to the per-test
// `user:lifecycle_events` stream. The Lobby userlifecycle worker
// consumes the same stream.
func (h *lobbyRTMHarness) publishUserLifecycleEvent(t *testing.T, eventType, userID string) {
t.Helper()
_, err := h.redis.XAdd(context.Background(), &redis.XAddArgs{
Stream: h.lifecycleStream,
Values: map[string]any{
"event_type": eventType,
"user_id": userID,
"occurred_at_ms": strconv.FormatInt(time.Now().UnixMilli(), 10),
"source": "user_admin",
"actor_type": "admin",
"actor_id": "admin-1",
"reason_code": "terminal_policy_violation",
},
}).Result()
require.NoError(t, err)
}
// jobResultEntry decodes one `runtime:job_results` Redis Stream entry.
type jobResultEntry struct {
StreamID string
GameID string
Outcome string
ContainerID string
EngineEndpoint string
ErrorCode string
ErrorMessage string
}
// stopJobEntry decodes one `runtime:stop_jobs` Redis Stream entry as
// published by Lobby.
type stopJobEntry struct {
StreamID string
GameID string
Reason string
}
// notificationIntentEntry decodes one `notification:intents` entry.
type notificationIntentEntry struct {
StreamID string
NotificationType string
Producer string
Payload map[string]any
}
// allJobResults returns every entry on the per-test job_results
// stream in stream order.
func (h *lobbyRTMHarness) allJobResults(t *testing.T) []jobResultEntry {
t.Helper()
entries, err := h.redis.XRange(context.Background(), h.jobResultsStream, "-", "+").Result()
require.NoError(t, err)
out := make([]jobResultEntry, 0, len(entries))
for _, entry := range entries {
out = append(out, jobResultEntry{
StreamID: entry.ID,
GameID: streamString(entry.Values, "game_id"),
Outcome: streamString(entry.Values, "outcome"),
ContainerID: streamString(entry.Values, "container_id"),
EngineEndpoint: streamString(entry.Values, "engine_endpoint"),
ErrorCode: streamString(entry.Values, "error_code"),
ErrorMessage: streamString(entry.Values, "error_message"),
})
}
return out
}
// waitJobResult polls the per-test job_results stream until predicate
// matches one entry, or the timeout fires.
func (h *lobbyRTMHarness) waitJobResult(
t *testing.T,
predicate func(jobResultEntry) bool,
timeout time.Duration,
) jobResultEntry {
t.Helper()
deadline := time.Now().Add(timeout)
for {
entries := h.allJobResults(t)
for _, entry := range entries {
if predicate(entry) {
return entry
}
}
if time.Now().After(deadline) {
t.Fatalf("no job_result matched within %s; observed=%+v", timeout, entries)
}
time.Sleep(150 * time.Millisecond)
}
}
// allStopJobs returns every entry on the per-test stop_jobs stream.
func (h *lobbyRTMHarness) allStopJobs(t *testing.T) []stopJobEntry {
t.Helper()
entries, err := h.redis.XRange(context.Background(), h.stopJobsStream, "-", "+").Result()
require.NoError(t, err)
out := make([]stopJobEntry, 0, len(entries))
for _, entry := range entries {
out = append(out, stopJobEntry{
StreamID: entry.ID,
GameID: streamString(entry.Values, "game_id"),
Reason: streamString(entry.Values, "reason"),
})
}
return out
}
// waitStopJobReason polls the stop_jobs stream until an entry for
// gameID with the expected reason appears.
func (h *lobbyRTMHarness) waitStopJobReason(t *testing.T, gameID, reason string, timeout time.Duration) stopJobEntry {
t.Helper()
deadline := time.Now().Add(timeout)
for {
for _, entry := range h.allStopJobs(t) {
if entry.GameID == gameID && entry.Reason == reason {
return entry
}
}
if time.Now().After(deadline) {
t.Fatalf("no stop_job for game %s with reason %q within %s", gameID, reason, timeout)
}
time.Sleep(150 * time.Millisecond)
}
}
// allNotificationIntents returns every entry on the per-test
// notification:intents stream.
func (h *lobbyRTMHarness) allNotificationIntents(t *testing.T) []notificationIntentEntry {
t.Helper()
entries, err := h.redis.XRange(context.Background(), h.intentsStream, "-", "+").Result()
require.NoError(t, err)
out := make([]notificationIntentEntry, 0, len(entries))
for _, entry := range entries {
decoded := notificationIntentEntry{
StreamID: entry.ID,
NotificationType: streamString(entry.Values, "notification_type"),
Producer: streamString(entry.Values, "producer"),
}
// `pkg/notificationintent` publishes the payload under the
// field name `payload_json`. Older versions of this harness
// looked for `payload` and silently produced an empty Payload
// map, which made every predicate that checks `Payload["…"]`
// fall through. Read both field names for forward compat.
raw := streamString(entry.Values, "payload_json")
if raw == "" {
raw = streamString(entry.Values, "payload")
}
if raw != "" {
var parsed map[string]any
if err := json.Unmarshal([]byte(raw), &parsed); err == nil {
decoded.Payload = parsed
}
}
out = append(out, decoded)
}
return out
}
// waitNotificationIntent polls the intents stream until the
// predicate matches.
func (h *lobbyRTMHarness) waitNotificationIntent(
t *testing.T,
predicate func(notificationIntentEntry) bool,
timeout time.Duration,
) notificationIntentEntry {
t.Helper()
deadline := time.Now().Add(timeout)
for {
entries := h.allNotificationIntents(t)
for _, entry := range entries {
if predicate(entry) {
return entry
}
}
if time.Now().After(deadline) {
summary := make([]string, 0, len(entries))
for _, entry := range entries {
summary = append(summary, entry.NotificationType+":"+entry.Producer)
}
t.Fatalf("no notification_intent matched within %s; observed=%v", timeout, summary)
}
time.Sleep(150 * time.Millisecond)
}
}
// rtmRuntimeStatus issues `GET /api/v1/internal/runtimes/{gameID}`
// against RTM and returns the persisted runtime record's status, or
// the empty string when RTM responds 404.
func (h *lobbyRTMHarness) rtmRuntimeStatus(t *testing.T, gameID string) (string, int) {
t.Helper()
req, err := http.NewRequest(http.MethodGet,
h.rtmInternalURL+"/api/v1/internal/runtimes/"+gameID, nil)
require.NoError(t, err)
resp := doRequest(t, req)
if resp.StatusCode == http.StatusNotFound {
return "", resp.StatusCode
}
if resp.StatusCode != http.StatusOK {
t.Fatalf("rtm get runtime: status=%d body=%s", resp.StatusCode, resp.Body)
}
var record struct {
Status string `json:"status"`
}
require.NoError(t, json.Unmarshal([]byte(resp.Body), &record))
return record.Status, resp.StatusCode
}
// waitRTMRuntimeStatus polls RTM until the runtime record reports
// the expected status or the timeout fires.
func (h *lobbyRTMHarness) waitRTMRuntimeStatus(t *testing.T, gameID, want string, timeout time.Duration) {
t.Helper()
deadline := time.Now().Add(timeout)
for {
status, code := h.rtmRuntimeStatus(t, gameID)
if status == want {
return
}
if time.Now().After(deadline) {
t.Fatalf("rtm runtime status for %s: want %q got %q (http %d) within %s",
gameID, want, status, code, timeout)
}
time.Sleep(150 * time.Millisecond)
}
}
// streamString reads a Redis Streams field as a string regardless of
// the underlying go-redis decoded type.
func streamString(values map[string]any, key string) string {
raw, ok := values[key]
if !ok {
return ""
}
switch typed := raw.(type) {
case string:
return typed
case []byte:
return string(typed)
default:
return fmt.Sprintf("%v", typed)
}
}
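// waitForUserServiceReady polls a User Service internal endpoint
// until it answers 200 OK, failing with the captured process logs on
// timeout.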
func waitForUserServiceReady(t *testing.T, process *harness.Process, baseURL string) {
t.Helper()
client := &http.Client{Timeout: 250 * time.Millisecond}
t.Cleanup(client.CloseIdleConnections)
deadline := time.Now().Add(10 * time.Second)
for time.Now().Before(deadline) {
req, err := http.NewRequest(http.MethodGet,
baseURL+"/api/v1/internal/users/user-readiness-probe/exists", nil)
require.NoError(t, err)
response, err := client.Do(req)
if err == nil {
_, _ = io.Copy(io.Discard, response.Body)
response.Body.Close()
if response.StatusCode == http.StatusOK {
return
}
}
time.Sleep(25 * time.Millisecond)
}
t.Fatalf("wait for userservice readiness: timeout\n%s", process.Logs())
}
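// postJSON marshals body (when non-nil) and POSTs it to url with the
// supplied headers merged in.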
func postJSON(t *testing.T, url string, body any, header http.Header) httpResponse {
t.Helper()
var reader io.Reader
if body != nil {
payload, err := json.Marshal(body)
require.NoError(t, err)
reader = bytes.NewReader(payload)
}
req, err := http.NewRequest(http.MethodPost, url, reader)
require.NoError(t, err)
if body != nil {
req.Header.Set("Content-Type", "application/json")
}
maps.Copy(req.Header, header)
return doRequest(t, req)
}
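// doRequest executes the request with a short-lived client that
// disables keep-alives and returns the fully read response.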
func doRequest(t *testing.T, request *http.Request) httpResponse {
t.Helper()
client := &http.Client{
Timeout: 5 * time.Second,
Transport: &http.Transport{DisableKeepAlives: true},
}
t.Cleanup(client.CloseIdleConnections)
response, err := client.Do(request)
require.NoError(t, err)
defer response.Body.Close()
payload, err := io.ReadAll(response.Body)
require.NoError(t, err)
return httpResponse{
StatusCode: response.StatusCode,
Body: string(payload),
Header: response.Header.Clone(),
}
}
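// requireJSONStatus asserts the response status and, when target is
// non-nil, strictly decodes the JSON body into it.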
func requireJSONStatus(t *testing.T, response httpResponse, wantStatus int, target any) {
t.Helper()
require.Equalf(t, wantStatus, response.StatusCode, "unexpected status, body=%s", response.Body)
if target != nil {
require.NoError(t, decodeStrictJSON([]byte(response.Body), target))
}
}
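// decodeStrictJSON decodes payload into target, rejecting unknown
// fields and trailing input.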
func decodeStrictJSON(payload []byte, target any) error {
decoder := json.NewDecoder(bytes.NewReader(payload))
decoder.DisallowUnknownFields()
if err := decoder.Decode(target); err != nil {
return err
}
if err := decoder.Decode(&struct{}{}); err != io.EOF {
if err == nil {
return errors.New("unexpected trailing JSON input")
}
return err
}
return nil
}
// resolveDockerHost honours DOCKER_HOST when the developer machine
// routes through colima or a remote daemon, falling back to the
// standard unix path otherwise.
func resolveDockerHost() string {
if host := strings.TrimSpace(os.Getenv("DOCKER_HOST")); host != "" {
return host
}
return "unix:///var/run/docker.sock"
}
@@ -0,0 +1,204 @@
package lobbyrtm_test
import (
"net/http"
"strings"
"testing"
"time"
"galaxy/integration/internal/harness"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
const (
jobOutcomeSuccess = "success"
jobOutcomeFailure = "failure"
stopReasonCancelled = "cancelled"
errorCodeImagePullFailed = "image_pull_failed"
)
// TestStartFlowSucceedsWithRealEngine drives the happy path:
// Lobby creates a private game, the owner walks it through enrollment
// to start, Lobby publishes a `runtime:start_jobs` envelope with the
// resolved `image_ref`, RTM starts a real `galaxy/game` engine
// container, publishes a success `runtime:job_results` entry, and
// Lobby's runtimejobresult worker transitions the game to `running`.
// The test then hits the engine's `/healthz` endpoint directly via
// the bridge network IP, proving the container is alive end-to-end.
func TestStartFlowSucceedsWithRealEngine(t *testing.T) {
h := newLobbyRTMHarness(t)
owner, _, gameID := h.prepareInflightGame(t,
"start-owner@example.com",
"start-invitee@example.com",
"Start Galaxy",
defaultEngineVersion,
)
t.Logf("owner=%s game=%s", owner.UserID, gameID)
// RTM publishes a success job_result for the start envelope.
startResult := h.waitJobResult(t, func(entry jobResultEntry) bool {
return entry.GameID == gameID && entry.Outcome == jobOutcomeSuccess
}, 90*time.Second)
require.Empty(t, startResult.ErrorCode, "happy path must publish empty error_code")
require.NotEmpty(t, startResult.ContainerID, "happy path must carry a container id")
require.NotEmpty(t, startResult.EngineEndpoint, "happy path must carry an engine endpoint")
// Lobby's runtime-job-result worker drives the game to `running`.
h.waitGameStatus(t, gameID, "running", 30*time.Second)
// RTM persists the runtime record and exposes it through REST.
h.waitRTMRuntimeStatus(t, gameID, "running", 15*time.Second)
// A real engine container exists with the expected labels.
containerID := harness.FindContainerIDByLabel(t, gameID)
require.NotEmptyf(t, containerID, "no engine container found for game %s", gameID)
require.Equal(t, startResult.ContainerID, containerID,
"job_result container_id must match the live container")
require.Equal(t, "running", harness.ContainerState(t, containerID))
// The engine answers /healthz on the bridge network IP.
ip := harness.ContainerNetworkIP(t, containerID, h.dockerNetwork)
require.NotEmptyf(t, ip, "engine container %s has no IP on network %s", containerID, h.dockerNetwork)
harness.WaitForEngineHealthz(t, ip, 15*time.Second)
}
// TestRunningGameStopsWhenOwnerCascadeBlocked drives the stop path:
// a game is driven to `running`, a `user.lifecycle.permanent_blocked`
// event is published for the owner, the Lobby userlifecycle worker
// cascades to the inflight game and publishes a `runtime:stop_jobs`
// envelope with `reason=cancelled`, and RTM stops the engine. The
// test asserts on the public boundary surfaces only.
func TestRunningGameStopsWhenOwnerCascadeBlocked(t *testing.T) {
h := newLobbyRTMHarness(t)
owner, _, gameID := h.prepareInflightGame(t,
"stop-owner@example.com",
"stop-invitee@example.com",
"Stop Galaxy",
defaultEngineVersion,
)
t.Logf("owner=%s game=%s", owner.UserID, gameID)
// Wait for the start outcome so we know RTM is fully running
// before we trigger the cascade.
h.waitJobResult(t, func(entry jobResultEntry) bool {
return entry.GameID == gameID && entry.Outcome == jobOutcomeSuccess
}, 90*time.Second)
h.waitGameStatus(t, gameID, "running", 30*time.Second)
containerID := harness.FindContainerIDByLabel(t, gameID)
require.NotEmpty(t, containerID)
// Trigger the cascade: permanent block on the game owner causes
// Lobby's userlifecycle worker to publish stop_job(cancelled) and
// transition the owned game to `cancelled`.
h.publishUserLifecycleEvent(t, "user.lifecycle.permanent_blocked", owner.UserID)
// Lobby observably publishes the right stop envelope on the boundary.
stop := h.waitStopJobReason(t, gameID, stopReasonCancelled, 30*time.Second)
assert.Equal(t, gameID, stop.GameID)
// Lobby moves the game to cancelled.
h.waitGameStatus(t, gameID, "cancelled", 30*time.Second)
// RTM consumes stop_job, stops the engine, and persists status=stopped.
h.waitRTMRuntimeStatus(t, gameID, "stopped", 30*time.Second)
// The container is no longer running. Docker reports `exited`
// (or `created`/`removing` during teardown); none of those match
// `running`, which is the only state that contradicts a successful
// stop.
require.Eventuallyf(t, func() bool {
state := harness.ContainerState(t, containerID)
return state != "running"
}, 30*time.Second, 250*time.Millisecond,
"engine container %s did not leave running state", containerID)
// RTM emitted at least two job_results for this game: one success
// for the start, one success for the stop.
successCount := 0
for _, entry := range h.allJobResults(t) {
if entry.GameID == gameID && entry.Outcome == jobOutcomeSuccess {
successCount++
}
}
assert.GreaterOrEqualf(t, successCount, 2,
"expected at least two success job_results (start + stop) for game %s", gameID)
}
// TestStartFailsWhenImageMissing drives the failure path: the game's
// `target_engine_version` resolves to a non-existent image tag, RTM
// fails to pull, publishes a failure `runtime:job_results` plus a
// `runtime.image_pull_failed` notification intent, and Lobby's
// runtimejobresult worker transitions the game to `start_failed`.
func TestStartFailsWhenImageMissing(t *testing.T) {
h := newLobbyRTMHarness(t)
owner, _, gameID := h.prepareInflightGame(t,
"fail-owner@example.com",
"fail-invitee@example.com",
"Fail Galaxy",
missingEngineVersion,
)
t.Logf("owner=%s game=%s", owner.UserID, gameID)
expectedImageRef := "galaxy/game:" + missingEngineVersion + "-lobbyrtm-it"
// RTM publishes a failure job_result with the stable code.
failure := h.waitJobResult(t, func(entry jobResultEntry) bool {
return entry.GameID == gameID && entry.Outcome == jobOutcomeFailure
}, 120*time.Second)
assert.Equal(t, errorCodeImagePullFailed, failure.ErrorCode)
assert.Empty(t, failure.ContainerID)
assert.Empty(t, failure.EngineEndpoint)
assert.NotEmpty(t, failure.ErrorMessage)
// RTM also publishes an admin notification intent on the shared stream.
intent := h.waitNotificationIntent(t, func(entry notificationIntentEntry) bool {
if entry.NotificationType != notificationImagePullFailed {
return false
}
payloadGameID, _ := entry.Payload["game_id"].(string)
return payloadGameID == gameID
}, 30*time.Second)
require.NotNil(t, intent.Payload)
assert.Equal(t, gameID, intent.Payload["game_id"])
assert.Equal(t, expectedImageRef, intent.Payload["image_ref"])
assert.Equal(t, errorCodeImagePullFailed, intent.Payload["error_code"])
// Lobby flips the game to start_failed.
h.waitGameStatus(t, gameID, "start_failed", 60*time.Second)
// No engine container should exist for this game.
containerID := harness.FindContainerIDByLabel(t, gameID)
if containerID != "" {
state := harness.ContainerState(t, containerID)
assert.NotEqual(t, "running", state,
"failed image pull must not leave a running container behind (state=%s)", state)
}
// RTM either has no record (clean rollback) or has one not in
// `running`. Either is acceptable per the start service contract.
status, code := h.rtmRuntimeStatus(t, gameID)
switch code {
case http.StatusNotFound:
// nothing persisted — clean rollback path
case http.StatusOK:
assert.NotEqual(t, "running", status,
"failed image pull must not persist a running record")
default:
t.Fatalf("unexpected RTM runtime response: status=%q code=%d", status, code)
}
// Sanity check the notification carried RTM's producer marker
// rather than Lobby's, so we know the suite truly observed RTM
// publishing on the shared stream.
assert.Truef(t,
strings.Contains(intent.Producer, "rtm") ||
strings.Contains(intent.Producer, "runtime"),
"image_pull_failed intent producer should be RTM-flavoured, got %q", intent.Producer)
}