feat: runtime manager
This commit is contained in:
@@ -0,0 +1,747 @@
|
||||
// Package lobbyrtm_test exercises the Lobby ↔ Runtime Manager
|
||||
// boundary against real Lobby + real Runtime Manager + real
|
||||
// PostgreSQL + real Redis + real Docker daemon running the
|
||||
// galaxy/game test engine container. It satisfies the inter-service
|
||||
// requirement spelled out in `TESTING.md §7` and PLAN.md Stage 20.
|
||||
//
|
||||
// The boundary contract is: Lobby publishes `runtime:start_jobs` and
|
||||
// `runtime:stop_jobs` envelopes, RTM consumes them and runs/stops
|
||||
// engine containers, RTM publishes `runtime:job_results`, Lobby
|
||||
// transitions the game accordingly. The suite asserts only on those
|
||||
// public surfaces (Lobby/RTM REST, Redis Streams, Docker container
|
||||
// state); it never imports `*/internal/...` packages of either
|
||||
// service.
|
||||
package lobbyrtm_test
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"maps"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"galaxy/integration/internal/harness"
|
||||
|
||||
"github.com/redis/go-redis/v9"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
const (
	// Engine versions used by the scenarios: defaultEngineVersion maps
	// to the locally-built test image; missingEngineVersion resolves to
	// an image ref that cannot be pulled, driving the failure path.
	defaultEngineVersion = "1.0.0"
	missingEngineVersion = "0.0.0-missing"

	// Base Redis Stream names of the Lobby ↔ RTM boundary. Each test
	// appends a per-suite suffix (see suiteSeq) before use.
	startJobsStream        = "runtime:start_jobs"
	stopJobsStream         = "runtime:stop_jobs"
	jobResultsStream       = "runtime:job_results"
	healthEventsStream     = "runtime:health_events"
	notificationIntentsKey = "notification:intents"
	userLifecycleStream    = "user:lifecycle_events"
	gmEventsStream         = "gm:lobby_events"

	// Producer tag Lobby stamps on notification intents.
	expectedLobbyProducer = "game_lobby"

	// NOTE(review): the constant name says "Pulled" but the value is
	// the pull-FAILED notification type — presumably a naming slip;
	// confirm against usages before renaming.
	notificationImagePulled = "runtime.image_pull_failed"
)

// suiteSeq scopes per-test stream prefixes so concurrent test
// invocations cannot bleed events into each other.
var suiteSeq atomic.Int64
|
||||
|
||||
// lobbyRTMHarness owns the per-test infrastructure: containers,
// processes, stream keys, and helper clients. One harness per test
// keeps each scenario fully isolated.
type lobbyRTMHarness struct {
	// redis is a direct client onto the per-test Redis container, used
	// to publish and inspect stream entries out-of-band.
	redis *redis.Client

	// Base URLs of the three real service processes under test.
	userServiceURL string
	lobbyPublicURL string
	lobbyAdminURL  string
	rtmInternalURL string

	// Per-test Redis Stream keys (base name + ":" + suite suffix).
	intentsStream    string
	lifecycleStream  string
	jobResultsStream string
	startJobsStream  string
	stopJobsStream   string
	healthEvents     string

	// gmStub stands in for the GM service and always answers 200 with
	// an empty JSON object.
	gmStub *httptest.Server

	// Docker resources the RTM-managed engine containers run on.
	dockerNetwork string
	engineImage   string

	// Handles to the spawned service binaries (logs / cleanup).
	userServiceProcess *harness.Process
	lobbyProcess       *harness.Process
	rtmProcess         *harness.Process
}
|
||||
|
||||
// ensureUserResponse mirrors the User Service ensure-by-email reply.
type ensureUserResponse struct {
	Outcome string `json:"outcome"` // harness asserts this is "created"
	UserID  string `json:"user_id"`
}

// httpResponse is a fully-drained snapshot of an HTTP response so
// helpers can assert on status/body after the connection is closed.
type httpResponse struct {
	StatusCode int
	Body       string
	Header     http.Header
}
|
||||
|
||||
// newLobbyRTMHarness brings up one independent test environment:
// Postgres containers per service (mirrors `lobbynotification`), one
// Redis container, real binaries for User Service / Lobby / RTM, a
// GM stub that returns 200, a per-test Docker bridge network, and
// the freshly-built `galaxy/game` test image.
func newLobbyRTMHarness(t *testing.T) *lobbyRTMHarness {
	t.Helper()

	// Skip the whole suite when Docker is unreachable. The ensure-only
	// check runs before any testcontainer is started so the skip path
	// kicks in before testcontainers-go tries (and fails) to probe the
	// daemon.
	harness.RequireDockerDaemon(t)

	redisRuntime := harness.StartRedisContainer(t)
	redisClient := redis.NewClient(&redis.Options{
		Addr:            redisRuntime.Addr,
		Protocol:        2,
		DisableIdentity: true,
	})
	t.Cleanup(func() {
		require.NoError(t, redisClient.Close())
	})

	// GM stub: always 200 with an empty JSON object so Lobby's GM
	// calls never block a scenario.
	gmStub := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusOK)
		_, _ = w.Write([]byte(`{}`))
	}))
	t.Cleanup(gmStub.Close)

	engineImage := harness.EnsureGalaxyGameImage(t)
	dockerNetwork := harness.EnsureDockerNetwork(t)

	userServiceAddr := harness.FreeTCPAddress(t)
	lobbyPublicAddr := harness.FreeTCPAddress(t)
	lobbyInternalAddr := harness.FreeTCPAddress(t)
	rtmInternalAddr := harness.FreeTCPAddress(t)

	userServiceBinary := harness.BuildBinary(t, "userservice", "./user/cmd/userservice")
	lobbyBinary := harness.BuildBinary(t, "lobby", "./lobby/cmd/lobby")
	rtmBinary := harness.BuildBinary(t, "rtmanager", "./rtmanager/cmd/rtmanager")

	userServiceEnv := harness.StartUserServicePersistence(t, redisRuntime.Addr).Env
	userServiceEnv["USERSERVICE_LOG_LEVEL"] = "info"
	userServiceEnv["USERSERVICE_INTERNAL_HTTP_ADDR"] = userServiceAddr
	userServiceEnv["OTEL_TRACES_EXPORTER"] = "none"
	userServiceEnv["OTEL_METRICS_EXPORTER"] = "none"
	userServiceProcess := harness.StartProcess(t, "userservice", userServiceBinary, userServiceEnv)
	waitForUserServiceReady(t, userServiceProcess, "http://"+userServiceAddr)

	// Per-test stream keys: every boundary stream gets a unique suffix
	// so concurrent test runs cannot observe each other's events.
	suffix := strconv.FormatInt(suiteSeq.Add(1), 10)
	intentsStream := notificationIntentsKey + ":" + suffix
	lifecycleStream := userLifecycleStream + ":" + suffix
	jobResultsStreamKey := jobResultsStream + ":" + suffix
	startJobsStreamKey := startJobsStream + ":" + suffix
	stopJobsStreamKey := stopJobsStream + ":" + suffix
	healthEventsStreamKey := healthEventsStream + ":" + suffix
	gmEventsStreamKey := gmEventsStream + ":" + suffix

	// Lobby: short read-block timeouts (200ms) keep the worker loops
	// responsive so the polling helpers below converge quickly.
	lobbyEnv := harness.StartLobbyServicePersistence(t, redisRuntime.Addr).Env
	lobbyEnv["LOBBY_LOG_LEVEL"] = "info"
	lobbyEnv["LOBBY_PUBLIC_HTTP_ADDR"] = lobbyPublicAddr
	lobbyEnv["LOBBY_INTERNAL_HTTP_ADDR"] = lobbyInternalAddr
	lobbyEnv["LOBBY_USER_SERVICE_BASE_URL"] = "http://" + userServiceAddr
	lobbyEnv["LOBBY_GM_BASE_URL"] = gmStub.URL
	lobbyEnv["LOBBY_NOTIFICATION_INTENTS_STREAM"] = intentsStream
	lobbyEnv["LOBBY_USER_LIFECYCLE_STREAM"] = lifecycleStream
	lobbyEnv["LOBBY_RUNTIME_JOB_RESULTS_STREAM"] = jobResultsStreamKey
	lobbyEnv["LOBBY_RUNTIME_START_JOBS_STREAM"] = startJobsStreamKey
	lobbyEnv["LOBBY_RUNTIME_STOP_JOBS_STREAM"] = stopJobsStreamKey
	lobbyEnv["LOBBY_GM_EVENTS_STREAM"] = gmEventsStreamKey
	lobbyEnv["LOBBY_RUNTIME_JOB_RESULTS_READ_BLOCK_TIMEOUT"] = "200ms"
	lobbyEnv["LOBBY_USER_LIFECYCLE_READ_BLOCK_TIMEOUT"] = "200ms"
	lobbyEnv["LOBBY_GM_EVENTS_READ_BLOCK_TIMEOUT"] = "200ms"
	lobbyEnv["LOBBY_ENGINE_IMAGE_TEMPLATE"] = "galaxy/game:{engine_version}-lobbyrtm-it"
	lobbyEnv["OTEL_TRACES_EXPORTER"] = "none"
	lobbyEnv["OTEL_METRICS_EXPORTER"] = "none"
	lobbyProcess := harness.StartProcess(t, "lobby", lobbyBinary, lobbyEnv)
	harness.WaitForHTTPStatus(t, lobbyProcess, "http://"+lobbyInternalAddr+"/readyz", http.StatusOK)

	rtmEnv := harness.StartRTManagerServicePersistence(t, redisRuntime.Addr).Env
	rtmEnv["RTMANAGER_LOG_LEVEL"] = "info"
	rtmEnv["RTMANAGER_INTERNAL_HTTP_ADDR"] = rtmInternalAddr
	rtmEnv["RTMANAGER_LOBBY_INTERNAL_BASE_URL"] = "http://" + lobbyInternalAddr
	rtmEnv["RTMANAGER_DOCKER_HOST"] = resolveDockerHost()
	rtmEnv["RTMANAGER_DOCKER_NETWORK"] = dockerNetwork
	// On dev machines and in sandboxes the rtmanager process cannot
	// chown the per-game state dir to root (uid 0). Pin the owner to
	// the current process uid/gid so `chown` is a no-op.
	rtmEnv["RTMANAGER_GAME_STATE_OWNER_UID"] = strconv.Itoa(os.Getuid())
	rtmEnv["RTMANAGER_GAME_STATE_OWNER_GID"] = strconv.Itoa(os.Getgid())
	rtmEnv["RTMANAGER_GAME_STATE_ROOT"] = t.TempDir()
	rtmEnv["RTMANAGER_REDIS_START_JOBS_STREAM"] = startJobsStreamKey
	rtmEnv["RTMANAGER_REDIS_STOP_JOBS_STREAM"] = stopJobsStreamKey
	rtmEnv["RTMANAGER_REDIS_JOB_RESULTS_STREAM"] = jobResultsStreamKey
	rtmEnv["RTMANAGER_REDIS_HEALTH_EVENTS_STREAM"] = healthEventsStreamKey
	rtmEnv["RTMANAGER_NOTIFICATION_INTENTS_STREAM"] = intentsStream
	rtmEnv["RTMANAGER_STREAM_BLOCK_TIMEOUT"] = "200ms"
	rtmEnv["RTMANAGER_RECONCILE_INTERVAL"] = "1s"
	rtmEnv["RTMANAGER_CLEANUP_INTERVAL"] = "1s"
	rtmEnv["RTMANAGER_INSPECT_INTERVAL"] = "1s"
	rtmEnv["RTMANAGER_PROBE_INTERVAL"] = "1s"
	rtmEnv["RTMANAGER_PROBE_TIMEOUT"] = "1s"
	rtmEnv["RTMANAGER_PROBE_FAILURES_THRESHOLD"] = "3"
	rtmEnv["RTMANAGER_GAME_LEASE_TTL_SECONDS"] = "10"
	rtmEnv["OTEL_TRACES_EXPORTER"] = "none"
	rtmEnv["OTEL_METRICS_EXPORTER"] = "none"
	rtmProcess := harness.StartProcess(t, "rtmanager", rtmBinary, rtmEnv)
	harness.WaitForHTTPStatus(t, rtmProcess, "http://"+rtmInternalAddr+"/readyz", http.StatusOK)

	return &lobbyRTMHarness{
		redis:              redisClient,
		userServiceURL:     "http://" + userServiceAddr,
		lobbyPublicURL:     "http://" + lobbyPublicAddr,
		lobbyAdminURL:      "http://" + lobbyInternalAddr,
		rtmInternalURL:     "http://" + rtmInternalAddr,
		intentsStream:      intentsStream,
		lifecycleStream:    lifecycleStream,
		jobResultsStream:   jobResultsStreamKey,
		startJobsStream:    startJobsStreamKey,
		stopJobsStream:     stopJobsStreamKey,
		healthEvents:       healthEventsStreamKey,
		gmStub:             gmStub,
		dockerNetwork:      dockerNetwork,
		engineImage:        engineImage,
		userServiceProcess: userServiceProcess,
		lobbyProcess:       lobbyProcess,
		rtmProcess:         rtmProcess,
	}
}
|
||||
|
||||
// ensureUser provisions a fresh User Service account by email and
|
||||
// returns the assigned user_id. The email pattern includes the test
|
||||
// name to avoid collisions across concurrent tests sharing the
|
||||
// container.
|
||||
func (h *lobbyRTMHarness) ensureUser(t *testing.T, email string) ensureUserResponse {
|
||||
t.Helper()
|
||||
resp := postJSON(t, h.userServiceURL+"/api/v1/internal/users/ensure-by-email", map[string]any{
|
||||
"email": email,
|
||||
"registration_context": map[string]string{
|
||||
"preferred_language": "en",
|
||||
"time_zone": "Europe/Kaliningrad",
|
||||
},
|
||||
}, nil)
|
||||
var out ensureUserResponse
|
||||
requireJSONStatus(t, resp, http.StatusOK, &out)
|
||||
require.Equal(t, "created", out.Outcome)
|
||||
require.NotEmpty(t, out.UserID)
|
||||
return out
|
||||
}
|
||||
|
||||
// userCreatePrivateGame creates a private game owned by ownerUserID
|
||||
// with the supplied target engine version. Returns the assigned
|
||||
// game_id.
|
||||
func (h *lobbyRTMHarness) userCreatePrivateGame(
|
||||
t *testing.T,
|
||||
ownerUserID, name, targetEngineVersion string,
|
||||
enrollmentEndsAt int64,
|
||||
) string {
|
||||
t.Helper()
|
||||
resp := postJSON(t, h.lobbyPublicURL+"/api/v1/lobby/games", map[string]any{
|
||||
"game_name": name,
|
||||
"game_type": "private",
|
||||
"min_players": 1,
|
||||
"max_players": 4,
|
||||
"start_gap_hours": 6,
|
||||
"start_gap_players": 1,
|
||||
"enrollment_ends_at": enrollmentEndsAt,
|
||||
"turn_schedule": "0 18 * * *",
|
||||
"target_engine_version": targetEngineVersion,
|
||||
}, http.Header{"X-User-Id": []string{ownerUserID}})
|
||||
require.Equalf(t, http.StatusCreated, resp.StatusCode, "create private game: %s", resp.Body)
|
||||
var record map[string]any
|
||||
require.NoError(t, json.Unmarshal([]byte(resp.Body), &record))
|
||||
gameID, ok := record["game_id"].(string)
|
||||
require.Truef(t, ok, "game_id missing: %s", resp.Body)
|
||||
return gameID
|
||||
}
|
||||
|
||||
func (h *lobbyRTMHarness) userOpenEnrollment(t *testing.T, ownerUserID, gameID string) {
|
||||
t.Helper()
|
||||
resp := postJSON(t,
|
||||
h.lobbyPublicURL+"/api/v1/lobby/games/"+gameID+"/open-enrollment",
|
||||
nil,
|
||||
http.Header{"X-User-Id": []string{ownerUserID}},
|
||||
)
|
||||
require.Equalf(t, http.StatusOK, resp.StatusCode, "user open enrollment: %s", resp.Body)
|
||||
}
|
||||
|
||||
func (h *lobbyRTMHarness) userCreateInvite(t *testing.T, ownerUserID, gameID, inviteeUserID string) {
|
||||
t.Helper()
|
||||
resp := postJSON(t,
|
||||
h.lobbyPublicURL+"/api/v1/lobby/games/"+gameID+"/invites",
|
||||
map[string]any{"invitee_user_id": inviteeUserID},
|
||||
http.Header{"X-User-Id": []string{ownerUserID}},
|
||||
)
|
||||
require.Equalf(t, http.StatusCreated, resp.StatusCode, "create invite: %s", resp.Body)
|
||||
}
|
||||
|
||||
func (h *lobbyRTMHarness) firstCreatedInviteID(t *testing.T, inviteeUserID, gameID string) string {
|
||||
t.Helper()
|
||||
req, err := http.NewRequest(http.MethodGet,
|
||||
h.lobbyPublicURL+"/api/v1/lobby/my/invites?status=created", nil)
|
||||
require.NoError(t, err)
|
||||
req.Header.Set("X-User-Id", inviteeUserID)
|
||||
resp := doRequest(t, req)
|
||||
require.Equalf(t, http.StatusOK, resp.StatusCode, "list my invites: %s", resp.Body)
|
||||
|
||||
var body struct {
|
||||
Items []struct {
|
||||
InviteID string `json:"invite_id"`
|
||||
GameID string `json:"game_id"`
|
||||
} `json:"items"`
|
||||
}
|
||||
require.NoError(t, json.Unmarshal([]byte(resp.Body), &body))
|
||||
for _, item := range body.Items {
|
||||
if item.GameID == gameID {
|
||||
return item.InviteID
|
||||
}
|
||||
}
|
||||
t.Fatalf("no invite found for invitee %s on game %s; body=%s", inviteeUserID, gameID, resp.Body)
|
||||
return ""
|
||||
}
|
||||
|
||||
func (h *lobbyRTMHarness) userRedeemInvite(t *testing.T, inviteeUserID, gameID, inviteID, raceName string) {
|
||||
t.Helper()
|
||||
resp := postJSON(t,
|
||||
h.lobbyPublicURL+"/api/v1/lobby/games/"+gameID+"/invites/"+inviteID+"/redeem",
|
||||
map[string]any{"race_name": raceName},
|
||||
http.Header{"X-User-Id": []string{inviteeUserID}},
|
||||
)
|
||||
require.Equalf(t, http.StatusOK, resp.StatusCode, "redeem invite: %s", resp.Body)
|
||||
}
|
||||
|
||||
func (h *lobbyRTMHarness) userReadyToStart(t *testing.T, ownerUserID, gameID string) {
|
||||
t.Helper()
|
||||
resp := postJSON(t,
|
||||
h.lobbyPublicURL+"/api/v1/lobby/games/"+gameID+"/ready-to-start",
|
||||
nil,
|
||||
http.Header{"X-User-Id": []string{ownerUserID}},
|
||||
)
|
||||
require.Equalf(t, http.StatusOK, resp.StatusCode, "ready-to-start: %s", resp.Body)
|
||||
}
|
||||
|
||||
func (h *lobbyRTMHarness) userStartGame(t *testing.T, ownerUserID, gameID string) {
|
||||
t.Helper()
|
||||
resp := postJSON(t,
|
||||
h.lobbyPublicURL+"/api/v1/lobby/games/"+gameID+"/start",
|
||||
nil,
|
||||
http.Header{"X-User-Id": []string{ownerUserID}},
|
||||
)
|
||||
require.Equalf(t, http.StatusOK, resp.StatusCode, "user start: %s", resp.Body)
|
||||
}
|
||||
|
||||
// prepareInflightGame walks one private game from creation through
// `start`. For the happy and cancel scenarios the game subsequently
// reaches `running` once RTM publishes the success job_result; for
// the failure scenario it ends in `start_failed`.
//
// Returns owner and invitee user records plus the game id.
func (h *lobbyRTMHarness) prepareInflightGame(
	t *testing.T,
	ownerEmail, inviteeEmail, gameName, targetEngineVersion string,
) (owner, invitee ensureUserResponse, gameID string) {
	t.Helper()
	// Two real accounts: the game owner and one invitee.
	owner = h.ensureUser(t, ownerEmail)
	invitee = h.ensureUser(t, inviteeEmail)

	// Enrollment window ends well after any plausible test duration.
	gameID = h.userCreatePrivateGame(t, owner.UserID, gameName, targetEngineVersion,
		time.Now().Add(48*time.Hour).Unix())
	// Lifecycle steps must run in this exact order: open enrollment,
	// invite, redeem, ready-to-start, start.
	h.userOpenEnrollment(t, owner.UserID, gameID)
	h.userCreateInvite(t, owner.UserID, gameID, invitee.UserID)
	inviteID := h.firstCreatedInviteID(t, invitee.UserID, gameID)
	h.userRedeemInvite(t, invitee.UserID, gameID, inviteID, "PilotInvitee")
	h.userReadyToStart(t, owner.UserID, gameID)
	h.userStartGame(t, owner.UserID, gameID)
	return owner, invitee, gameID
}
|
||||
|
||||
// gameStatus reads one game record off Lobby's internal API and
|
||||
// returns its status field. Used by waitGameStatus and direct
|
||||
// assertions.
|
||||
func (h *lobbyRTMHarness) gameStatus(t *testing.T, gameID string) string {
|
||||
t.Helper()
|
||||
req, err := http.NewRequest(http.MethodGet,
|
||||
h.lobbyAdminURL+"/api/v1/internal/games/"+gameID, nil)
|
||||
require.NoError(t, err)
|
||||
resp := doRequest(t, req)
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
t.Fatalf("get game internal: status=%d body=%s", resp.StatusCode, resp.Body)
|
||||
}
|
||||
var record struct {
|
||||
Status string `json:"status"`
|
||||
}
|
||||
require.NoError(t, json.Unmarshal([]byte(resp.Body), &record))
|
||||
return record.Status
|
||||
}
|
||||
|
||||
// waitGameStatus polls `GET /api/v1/internal/games/{gameID}` until
|
||||
// the record reports the expected status or the timeout fires.
|
||||
func (h *lobbyRTMHarness) waitGameStatus(t *testing.T, gameID, want string, timeout time.Duration) {
|
||||
t.Helper()
|
||||
deadline := time.Now().Add(timeout)
|
||||
for {
|
||||
got := h.gameStatus(t, gameID)
|
||||
if got == want {
|
||||
return
|
||||
}
|
||||
if time.Now().After(deadline) {
|
||||
t.Fatalf("game %s status: want %q got %q (after %s)", gameID, want, got, timeout)
|
||||
}
|
||||
time.Sleep(150 * time.Millisecond)
|
||||
}
|
||||
}
|
||||
|
||||
// publishUserLifecycleEvent appends one event to the per-test
// `user:lifecycle_events` stream. The Lobby userlifecycle worker
// consumes the same stream.
func (h *lobbyRTMHarness) publishUserLifecycleEvent(t *testing.T, eventType, userID string) {
	t.Helper()
	// Field names/values mirror what the real user-admin producer
	// emits; Lobby's consumer keys off event_type + user_id.
	_, err := h.redis.XAdd(context.Background(), &redis.XAddArgs{
		Stream: h.lifecycleStream,
		Values: map[string]any{
			"event_type":     eventType,
			"user_id":        userID,
			"occurred_at_ms": strconv.FormatInt(time.Now().UnixMilli(), 10),
			"source":         "user_admin",
			"actor_type":     "admin",
			"actor_id":       "admin-1",
			"reason_code":    "terminal_policy_violation",
		},
	}).Result()
	require.NoError(t, err)
}
|
||||
|
||||
// jobResultEntry decodes one `runtime:job_results` Redis Stream entry.
type jobResultEntry struct {
	StreamID       string // Redis stream entry ID
	GameID         string
	Outcome        string // "success" or "failure"
	ContainerID    string // populated on success
	EngineEndpoint string // populated on success
	ErrorCode      string // populated on failure
	ErrorMessage   string // populated on failure
}

// stopJobEntry decodes one `runtime:stop_jobs` Redis Stream entry as
// published by Lobby.
type stopJobEntry struct {
	StreamID string
	GameID   string
	Reason   string // e.g. "cancelled"
}

// notificationIntentEntry decodes one `notification:intents` entry.
type notificationIntentEntry struct {
	StreamID         string
	NotificationType string
	Producer         string
	// Payload is the decoded payload_json/payload field; nil when the
	// entry carried none or it failed to parse.
	Payload map[string]any
}
|
||||
|
||||
// allJobResults returns every entry on the per-test job_results
|
||||
// stream in stream order.
|
||||
func (h *lobbyRTMHarness) allJobResults(t *testing.T) []jobResultEntry {
|
||||
t.Helper()
|
||||
entries, err := h.redis.XRange(context.Background(), h.jobResultsStream, "-", "+").Result()
|
||||
require.NoError(t, err)
|
||||
out := make([]jobResultEntry, 0, len(entries))
|
||||
for _, entry := range entries {
|
||||
out = append(out, jobResultEntry{
|
||||
StreamID: entry.ID,
|
||||
GameID: streamString(entry.Values, "game_id"),
|
||||
Outcome: streamString(entry.Values, "outcome"),
|
||||
ContainerID: streamString(entry.Values, "container_id"),
|
||||
EngineEndpoint: streamString(entry.Values, "engine_endpoint"),
|
||||
ErrorCode: streamString(entry.Values, "error_code"),
|
||||
ErrorMessage: streamString(entry.Values, "error_message"),
|
||||
})
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// waitJobResult polls the per-test job_results stream until predicate
|
||||
// matches one entry, or the timeout fires.
|
||||
func (h *lobbyRTMHarness) waitJobResult(
|
||||
t *testing.T,
|
||||
predicate func(jobResultEntry) bool,
|
||||
timeout time.Duration,
|
||||
) jobResultEntry {
|
||||
t.Helper()
|
||||
deadline := time.Now().Add(timeout)
|
||||
for {
|
||||
entries := h.allJobResults(t)
|
||||
for _, entry := range entries {
|
||||
if predicate(entry) {
|
||||
return entry
|
||||
}
|
||||
}
|
||||
if time.Now().After(deadline) {
|
||||
t.Fatalf("no job_result matched within %s; observed=%+v", timeout, entries)
|
||||
}
|
||||
time.Sleep(150 * time.Millisecond)
|
||||
}
|
||||
}
|
||||
|
||||
// allStopJobs returns every entry on the per-test stop_jobs stream.
|
||||
func (h *lobbyRTMHarness) allStopJobs(t *testing.T) []stopJobEntry {
|
||||
t.Helper()
|
||||
entries, err := h.redis.XRange(context.Background(), h.stopJobsStream, "-", "+").Result()
|
||||
require.NoError(t, err)
|
||||
out := make([]stopJobEntry, 0, len(entries))
|
||||
for _, entry := range entries {
|
||||
out = append(out, stopJobEntry{
|
||||
StreamID: entry.ID,
|
||||
GameID: streamString(entry.Values, "game_id"),
|
||||
Reason: streamString(entry.Values, "reason"),
|
||||
})
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// waitStopJobReason polls the stop_jobs stream until an entry for
|
||||
// gameID with the expected reason appears.
|
||||
func (h *lobbyRTMHarness) waitStopJobReason(t *testing.T, gameID, reason string, timeout time.Duration) stopJobEntry {
|
||||
t.Helper()
|
||||
deadline := time.Now().Add(timeout)
|
||||
for {
|
||||
for _, entry := range h.allStopJobs(t) {
|
||||
if entry.GameID == gameID && entry.Reason == reason {
|
||||
return entry
|
||||
}
|
||||
}
|
||||
if time.Now().After(deadline) {
|
||||
t.Fatalf("no stop_job for game %s with reason %q within %s", gameID, reason, timeout)
|
||||
}
|
||||
time.Sleep(150 * time.Millisecond)
|
||||
}
|
||||
}
|
||||
|
||||
// allNotificationIntents returns every entry on the per-test
// notification:intents stream.
//
// Entries whose payload field is absent or not valid JSON get a nil
// Payload; predicates must tolerate that.
func (h *lobbyRTMHarness) allNotificationIntents(t *testing.T) []notificationIntentEntry {
	t.Helper()
	entries, err := h.redis.XRange(context.Background(), h.intentsStream, "-", "+").Result()
	require.NoError(t, err)
	out := make([]notificationIntentEntry, 0, len(entries))
	for _, entry := range entries {
		decoded := notificationIntentEntry{
			StreamID:         entry.ID,
			NotificationType: streamString(entry.Values, "notification_type"),
			Producer:         streamString(entry.Values, "producer"),
		}
		// `pkg/notificationintent` publishes the payload under the
		// field name `payload_json`. Older versions of this harness
		// looked for `payload` and silently produced an empty Payload
		// map, which made every predicate that checks `Payload["…"]`
		// fall through. Read both field names for forward compat.
		raw := streamString(entry.Values, "payload_json")
		if raw == "" {
			raw = streamString(entry.Values, "payload")
		}
		if raw != "" {
			var parsed map[string]any
			// Parse failures are deliberately tolerated: the entry is
			// still returned with a nil Payload.
			if err := json.Unmarshal([]byte(raw), &parsed); err == nil {
				decoded.Payload = parsed
			}
		}
		out = append(out, decoded)
	}
	return out
}
|
||||
|
||||
// waitNotificationIntent polls the intents stream until the
|
||||
// predicate matches.
|
||||
func (h *lobbyRTMHarness) waitNotificationIntent(
|
||||
t *testing.T,
|
||||
predicate func(notificationIntentEntry) bool,
|
||||
timeout time.Duration,
|
||||
) notificationIntentEntry {
|
||||
t.Helper()
|
||||
deadline := time.Now().Add(timeout)
|
||||
for {
|
||||
entries := h.allNotificationIntents(t)
|
||||
for _, entry := range entries {
|
||||
if predicate(entry) {
|
||||
return entry
|
||||
}
|
||||
}
|
||||
if time.Now().After(deadline) {
|
||||
summary := make([]string, 0, len(entries))
|
||||
for _, entry := range entries {
|
||||
summary = append(summary, entry.NotificationType+":"+entry.Producer)
|
||||
}
|
||||
t.Fatalf("no notification_intent matched within %s; observed=%v", timeout, summary)
|
||||
}
|
||||
time.Sleep(150 * time.Millisecond)
|
||||
}
|
||||
}
|
||||
|
||||
// rtmRuntimeStatus issues `GET /api/v1/internal/runtimes/{gameID}`
|
||||
// against RTM and returns the persisted runtime record's status, or
|
||||
// the empty string when RTM responds 404.
|
||||
func (h *lobbyRTMHarness) rtmRuntimeStatus(t *testing.T, gameID string) (string, int) {
|
||||
t.Helper()
|
||||
req, err := http.NewRequest(http.MethodGet,
|
||||
h.rtmInternalURL+"/api/v1/internal/runtimes/"+gameID, nil)
|
||||
require.NoError(t, err)
|
||||
resp := doRequest(t, req)
|
||||
if resp.StatusCode == http.StatusNotFound {
|
||||
return "", resp.StatusCode
|
||||
}
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
t.Fatalf("rtm get runtime: status=%d body=%s", resp.StatusCode, resp.Body)
|
||||
}
|
||||
var record struct {
|
||||
Status string `json:"status"`
|
||||
}
|
||||
require.NoError(t, json.Unmarshal([]byte(resp.Body), &record))
|
||||
return record.Status, resp.StatusCode
|
||||
}
|
||||
|
||||
// waitRTMRuntimeStatus polls RTM until the runtime record reports
|
||||
// the expected status or the timeout fires.
|
||||
func (h *lobbyRTMHarness) waitRTMRuntimeStatus(t *testing.T, gameID, want string, timeout time.Duration) {
|
||||
t.Helper()
|
||||
deadline := time.Now().Add(timeout)
|
||||
for {
|
||||
status, code := h.rtmRuntimeStatus(t, gameID)
|
||||
if status == want {
|
||||
return
|
||||
}
|
||||
if time.Now().After(deadline) {
|
||||
t.Fatalf("rtm runtime status for %s: want %q got %q (http %d) within %s",
|
||||
gameID, want, status, code, timeout)
|
||||
}
|
||||
time.Sleep(150 * time.Millisecond)
|
||||
}
|
||||
}
|
||||
|
||||
// streamString reads a Redis Streams field as a string regardless of
// the underlying go-redis decoded type. Missing keys yield "".
func streamString(values map[string]any, key string) string {
	value, present := values[key]
	if !present {
		return ""
	}
	if s, ok := value.(string); ok {
		return s
	}
	if b, ok := value.([]byte); ok {
		return string(b)
	}
	// Fall back to fmt's default rendering for any other type.
	return fmt.Sprintf("%v", value)
}
|
||||
|
||||
func waitForUserServiceReady(t *testing.T, process *harness.Process, baseURL string) {
|
||||
t.Helper()
|
||||
client := &http.Client{Timeout: 250 * time.Millisecond}
|
||||
t.Cleanup(client.CloseIdleConnections)
|
||||
|
||||
deadline := time.Now().Add(10 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
req, err := http.NewRequest(http.MethodGet,
|
||||
baseURL+"/api/v1/internal/users/user-readiness-probe/exists", nil)
|
||||
require.NoError(t, err)
|
||||
response, err := client.Do(req)
|
||||
if err == nil {
|
||||
_, _ = io.Copy(io.Discard, response.Body)
|
||||
response.Body.Close()
|
||||
if response.StatusCode == http.StatusOK {
|
||||
return
|
||||
}
|
||||
}
|
||||
time.Sleep(25 * time.Millisecond)
|
||||
}
|
||||
t.Fatalf("wait for userservice readiness: timeout\n%s", process.Logs())
|
||||
}
|
||||
|
||||
func postJSON(t *testing.T, url string, body any, header http.Header) httpResponse {
|
||||
t.Helper()
|
||||
var reader io.Reader
|
||||
if body != nil {
|
||||
payload, err := json.Marshal(body)
|
||||
require.NoError(t, err)
|
||||
reader = bytes.NewReader(payload)
|
||||
}
|
||||
req, err := http.NewRequest(http.MethodPost, url, reader)
|
||||
require.NoError(t, err)
|
||||
if body != nil {
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
}
|
||||
maps.Copy(req.Header, header)
|
||||
return doRequest(t, req)
|
||||
}
|
||||
|
||||
func doRequest(t *testing.T, request *http.Request) httpResponse {
|
||||
t.Helper()
|
||||
client := &http.Client{
|
||||
Timeout: 5 * time.Second,
|
||||
Transport: &http.Transport{DisableKeepAlives: true},
|
||||
}
|
||||
t.Cleanup(client.CloseIdleConnections)
|
||||
|
||||
response, err := client.Do(request)
|
||||
require.NoError(t, err)
|
||||
defer response.Body.Close()
|
||||
|
||||
payload, err := io.ReadAll(response.Body)
|
||||
require.NoError(t, err)
|
||||
return httpResponse{
|
||||
StatusCode: response.StatusCode,
|
||||
Body: string(payload),
|
||||
Header: response.Header.Clone(),
|
||||
}
|
||||
}
|
||||
|
||||
func requireJSONStatus(t *testing.T, response httpResponse, wantStatus int, target any) {
|
||||
t.Helper()
|
||||
require.Equalf(t, wantStatus, response.StatusCode, "unexpected status, body=%s", response.Body)
|
||||
if target != nil {
|
||||
require.NoError(t, decodeStrictJSON([]byte(response.Body), target))
|
||||
}
|
||||
}
|
||||
|
||||
// decodeStrictJSON unmarshals payload into target, rejecting unknown
// fields and any trailing JSON after the first value.
//
// It returns the decoder's error verbatim, or a synthetic error when
// extra JSON input follows the decoded value.
func decodeStrictJSON(payload []byte, target any) error {
	decoder := json.NewDecoder(bytes.NewReader(payload))
	decoder.DisallowUnknownFields()
	if err := decoder.Decode(target); err != nil {
		return err
	}
	// A second decode must hit clean EOF; anything else means the
	// payload carried more than one JSON value. Use errors.Is rather
	// than a direct comparison so a wrapped io.EOF is still accepted.
	switch err := decoder.Decode(&struct{}{}); {
	case errors.Is(err, io.EOF):
		return nil
	case err == nil:
		return errors.New("unexpected trailing JSON input")
	default:
		return err
	}
}
|
||||
|
||||
// resolveDockerHost honours DOCKER_HOST when the developer machine
// routes through colima or a remote daemon, falling back to the
// standard unix path otherwise.
func resolveDockerHost() string {
	host := strings.TrimSpace(os.Getenv("DOCKER_HOST"))
	if host == "" {
		return "unix:///var/run/docker.sock"
	}
	return host
}
|
||||
|
||||
@@ -0,0 +1,204 @@
|
||||
package lobbyrtm_test
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"galaxy/integration/internal/harness"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
const (
	// Outcomes RTM writes into runtime:job_results entries.
	jobOutcomeSuccess = "success"
	jobOutcomeFailure = "failure"

	// Reason Lobby stamps on runtime:stop_jobs when a game is
	// cancelled (e.g. owner cascade-blocked).
	stopReasonCancelled = "cancelled"

	// Error code RTM publishes when the engine image cannot be pulled.
	errorCodeImagePullFailed = "image_pull_failed"
)
|
||||
|
||||
// TestStartFlowSucceedsWithRealEngine drives the happy path:
// Lobby creates a private game, the owner walks it through enrollment
// to start, Lobby publishes a `runtime:start_jobs` envelope with the
// resolved `image_ref`, RTM starts a real `galaxy/game` engine
// container, publishes a success `runtime:job_results` entry, and
// Lobby's runtimejobresult worker transitions the game to `running`.
// The test then hits the engine's `/healthz` endpoint directly via
// the bridge network IP, proving the container is alive end-to-end.
func TestStartFlowSucceedsWithRealEngine(t *testing.T) {
	h := newLobbyRTMHarness(t)

	owner, _, gameID := h.prepareInflightGame(t,
		"start-owner@example.com",
		"start-invitee@example.com",
		"Start Galaxy",
		defaultEngineVersion,
	)
	t.Logf("owner=%s game=%s", owner.UserID, gameID)

	// RTM publishes a success job_result for the start envelope.
	// 90s allows for a cold image start on slow CI machines.
	startResult := h.waitJobResult(t, func(entry jobResultEntry) bool {
		return entry.GameID == gameID && entry.Outcome == jobOutcomeSuccess
	}, 90*time.Second)
	require.Empty(t, startResult.ErrorCode, "happy path must publish empty error_code")
	require.NotEmpty(t, startResult.ContainerID, "happy path must carry a container id")
	require.NotEmpty(t, startResult.EngineEndpoint, "happy path must carry an engine endpoint")

	// Lobby's runtime-job-result worker drives the game to `running`.
	h.waitGameStatus(t, gameID, "running", 30*time.Second)

	// RTM persists the runtime record and exposes it through REST.
	h.waitRTMRuntimeStatus(t, gameID, "running", 15*time.Second)

	// A real engine container exists with the expected labels.
	containerID := harness.FindContainerIDByLabel(t, gameID)
	require.NotEmptyf(t, containerID, "no engine container found for game %s", gameID)
	require.Equal(t, startResult.ContainerID, containerID,
		"job_result container_id must match the live container")
	require.Equal(t, "running", harness.ContainerState(t, containerID))

	// The engine answers /healthz on the bridge network IP.
	ip := harness.ContainerNetworkIP(t, containerID, h.dockerNetwork)
	require.NotEmptyf(t, ip, "engine container %s has no IP on network %s", containerID, h.dockerNetwork)
	harness.WaitForEngineHealthz(t, ip, 15*time.Second)
}
|
||||
|
||||
// TestRunningGameStopsWhenOwnerCascadeBlocked drives the stop path:
|
||||
// drive the same game to `running`, publish a
|
||||
// `user.lifecycle.permanent_blocked` event for the owner, the Lobby
|
||||
// userlifecycle worker cascades to the inflight game, publishes a
|
||||
// `runtime:stop_jobs` envelope with `reason=cancelled`, and RTM stops
|
||||
// the engine. The test asserts on the public boundary surfaces only.
|
||||
func TestRunningGameStopsWhenOwnerCascadeBlocked(t *testing.T) {
|
||||
h := newLobbyRTMHarness(t)
|
||||
|
||||
owner, _, gameID := h.prepareInflightGame(t,
|
||||
"stop-owner@example.com",
|
||||
"stop-invitee@example.com",
|
||||
"Stop Galaxy",
|
||||
defaultEngineVersion,
|
||||
)
|
||||
t.Logf("owner=%s game=%s", owner.UserID, gameID)
|
||||
|
||||
// Wait for the start outcome so we know RTM is fully running
|
||||
// before we trigger the cascade.
|
||||
h.waitJobResult(t, func(entry jobResultEntry) bool {
|
||||
return entry.GameID == gameID && entry.Outcome == jobOutcomeSuccess
|
||||
}, 90*time.Second)
|
||||
h.waitGameStatus(t, gameID, "running", 30*time.Second)
|
||||
containerID := harness.FindContainerIDByLabel(t, gameID)
|
||||
require.NotEmpty(t, containerID)
|
||||
|
||||
// Trigger the cascade: permanent block on the game owner causes
|
||||
// Lobby's userlifecycle worker to publish stop_job(cancelled) and
|
||||
// transition the owned game to `cancelled`.
|
||||
h.publishUserLifecycleEvent(t, "user.lifecycle.permanent_blocked", owner.UserID)
|
||||
|
||||
// Lobby observably publishes the right stop envelope on the boundary.
|
||||
stop := h.waitStopJobReason(t, gameID, stopReasonCancelled, 30*time.Second)
|
||||
assert.Equal(t, gameID, stop.GameID)
|
||||
|
||||
// Lobby moves the game to cancelled.
|
||||
h.waitGameStatus(t, gameID, "cancelled", 30*time.Second)
|
||||
|
||||
// RTM consumes stop_job, stops the engine, and persists status=stopped.
|
||||
h.waitRTMRuntimeStatus(t, gameID, "stopped", 30*time.Second)
|
||||
|
||||
// The container is no longer running. Docker reports `exited`
|
||||
// (or `created`/`removing` during teardown); none of those match
|
||||
// `running`, which is the only state that contradicts a successful
|
||||
// stop.
|
||||
require.Eventuallyf(t, func() bool {
|
||||
state := harness.ContainerState(t, containerID)
|
||||
return state != "running"
|
||||
}, 30*time.Second, 250*time.Millisecond,
|
||||
"engine container %s did not leave running state", containerID)
|
||||
|
||||
// RTM emitted at least two job_results for this game: one success
|
||||
// for the start, one success for the stop.
|
||||
successCount := 0
|
||||
for _, entry := range h.allJobResults(t) {
|
||||
if entry.GameID == gameID && entry.Outcome == jobOutcomeSuccess {
|
||||
successCount++
|
||||
}
|
||||
}
|
||||
assert.GreaterOrEqualf(t, successCount, 2,
|
||||
"expected at least two success job_results (start + stop) for game %s", gameID)
|
||||
}
|
||||
|
||||
// TestStartFailsWhenImageMissing drives the failure path: the game's
|
||||
// `target_engine_version` resolves to a non-existent image tag, RTM
|
||||
// fails to pull, publishes a failure `runtime:job_results` plus a
|
||||
// `runtime.image_pull_failed` notification intent, and Lobby's
|
||||
// runtimejobresult worker transitions the game to `start_failed`.
|
||||
func TestStartFailsWhenImageMissing(t *testing.T) {
|
||||
h := newLobbyRTMHarness(t)
|
||||
|
||||
owner, _, gameID := h.prepareInflightGame(t,
|
||||
"fail-owner@example.com",
|
||||
"fail-invitee@example.com",
|
||||
"Fail Galaxy",
|
||||
missingEngineVersion,
|
||||
)
|
||||
t.Logf("owner=%s game=%s", owner.UserID, gameID)
|
||||
|
||||
expectedImageRef := "galaxy/game:" + missingEngineVersion + "-lobbyrtm-it"
|
||||
|
||||
// RTM publishes a failure job_result with the stable code.
|
||||
failure := h.waitJobResult(t, func(entry jobResultEntry) bool {
|
||||
return entry.GameID == gameID && entry.Outcome == jobOutcomeFailure
|
||||
}, 120*time.Second)
|
||||
assert.Equal(t, errorCodeImagePullFailed, failure.ErrorCode)
|
||||
assert.Empty(t, failure.ContainerID)
|
||||
assert.Empty(t, failure.EngineEndpoint)
|
||||
assert.NotEmpty(t, failure.ErrorMessage)
|
||||
|
||||
// RTM also publishes an admin notification intent on the shared stream.
|
||||
intent := h.waitNotificationIntent(t, func(entry notificationIntentEntry) bool {
|
||||
if entry.NotificationType != notificationImagePulled {
|
||||
return false
|
||||
}
|
||||
payloadGameID, _ := entry.Payload["game_id"].(string)
|
||||
return payloadGameID == gameID
|
||||
}, 30*time.Second)
|
||||
require.NotNil(t, intent.Payload)
|
||||
assert.Equal(t, gameID, intent.Payload["game_id"])
|
||||
assert.Equal(t, expectedImageRef, intent.Payload["image_ref"])
|
||||
assert.Equal(t, errorCodeImagePullFailed, intent.Payload["error_code"])
|
||||
|
||||
// Lobby flips the game to start_failed.
|
||||
h.waitGameStatus(t, gameID, "start_failed", 60*time.Second)
|
||||
|
||||
// No engine container should exist for this game.
|
||||
containerID := harness.FindContainerIDByLabel(t, gameID)
|
||||
if containerID != "" {
|
||||
state := harness.ContainerState(t, containerID)
|
||||
assert.NotEqual(t, "running", state,
|
||||
"failed image pull must not leave a running container behind (state=%s)", state)
|
||||
}
|
||||
|
||||
// RTM either has no record (clean rollback) or has one not in
|
||||
// `running`. Either is acceptable per the start service contract.
|
||||
status, code := h.rtmRuntimeStatus(t, gameID)
|
||||
switch code {
|
||||
case http.StatusNotFound:
|
||||
// nothing persisted — clean rollback path
|
||||
case http.StatusOK:
|
||||
assert.NotEqual(t, "running", status,
|
||||
"failed image pull must not persist a running record")
|
||||
default:
|
||||
t.Fatalf("unexpected RTM runtime response: status=%q code=%d", status, code)
|
||||
}
|
||||
|
||||
// Sanity check the notification carried RTM's producer marker
|
||||
// rather than Lobby's, so we know the suite truly observed RTM
|
||||
// publishing on the shared stream.
|
||||
assert.Truef(t,
|
||||
strings.Contains(intent.Producer, "rtm") ||
|
||||
strings.Contains(intent.Producer, "runtime"),
|
||||
"image_pull_failed intent producer should be RTM-flavoured, got %q", intent.Producer)
|
||||
}
|
||||
Reference in New Issue
Block a user