diff --git a/.claude/settings.json b/.claude/settings.json index e645197..d878b0d 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -1,4 +1,19 @@ { + "sandbox": { + "network": { + "allowLocalBinding": true, + "allowUnixSockets": ["/Users/id/.colima/default/docker.sock"], + "allowedDomains": [ + "github.com", + "registry.npmjs.org", + "*.npmjs.org", + "docker.com", + "docker.io", + "gcr.io", + "*.golang.org" + ] + } + }, "enabledPlugins": { "gopls-lsp@claude-plugins-official": true, "context7@claude-plugins-official": true diff --git a/AGENTS.md b/AGENTS.md deleted file mode 100644 index a472feb..0000000 --- a/AGENTS.md +++ /dev/null @@ -1,915 +0,0 @@ -# AGENTS.md - -## 1. Purpose - -This repository is developed primarily in Go. - -The agent must optimize for: - -- correctness before speed, -- readability before cleverness, -- explicit behavior before hidden magic, -- small, reviewable changes, -- reproducible builds and tests, -- clear written reasoning for non-obvious decisions. - -The agent should behave like a careful senior Go engineer working in an existing codebase with real maintenance costs. - ---- - -## 2. Core operating rules - -### 2.1 Main priorities - -When making changes, follow this order of priority: - -1. Preserve correctness. -2. Preserve or improve clarity. -3. Preserve compatibility unless the task explicitly allows breaking changes. -4. Keep the diff minimal. -5. Keep the implementation idiomatic for modern Go. -6. Keep performance reasonable, but do not micro-optimize without evidence. - -### 2.2 What the agent must not do - -The agent must not: - -- rewrite large areas of code without clear need, -- introduce speculative abstractions, -- rename many symbols “for cleanliness” unless required, -- mix unrelated refactors with the requested task, -- silently change public behavior, -- silently change wire formats, database semantics, or API contracts, -- add dependencies unless necessary, -- invent requirements not stated by the user or codebase, -- leave TODOs instead of implementing the requested behavior, unless explicitly asked, -- claim code was tested if it was not actually tested, -- claim a root cause without evidence, -- fix extra bugs opportunistically unless they are tightly adjacent and clearly explained. - -### 2.3 Expected default behavior - -Unless the user asks otherwise, the agent should: - -- inspect the relevant code path before editing, -- understand current behavior before proposing changes, -- prefer the smallest correct patch, -- update or add tests for every functional change, -- keep public interfaces stable, -- preserve log/event/metric semantics unless a change is needed, -- explain assumptions, -- mention trade-offs when they matter. - -### 2.3 Expected documentation behavior - -Unless the user asks otherwise, the agent should: - -- supply added packages, types, funcs, consts and vars with a comprehensive comments explaining its purpose and behavior, -- supply public functions with a more comprehensive commentary and supplemental funcs with more concise comments, -- provide comments respecting the Go Doc Comments syntax: use strict parameters names inside human-friendly sentences, -- provide comments only in English language, -- correct obvious grammatical and style errors in existing commentaries met in changed files. - ---- - -## 3. Repository familiarization workflow - -Before making non-trivial changes, the agent should quickly map the local conventions. 
- -### 3.1 Files to inspect first - -Prefer inspecting, when present: - -- `go.mod` -- `go.sum` -- `README.md` -- `Makefile` -- `Taskfile.yml` / `Taskfile.yaml` -- `.golangci.yml` / `.golangci.yaml` -- `.editorconfig` -- `buf.yaml` -- `buf.gen.yaml` -- `Dockerfile*` -- `compose*.yml` -- CI files under `.github/workflows/`, `.gitlab-ci.yml`, etc. -- migration directories -- existing `AGENTS.md` files in subdirectories -- representative files in the affected package -- representative tests in the affected package - -### 3.2 Conventions to infer - -The agent should infer and follow: - -- package layout style, -- naming conventions, -- error handling conventions, -- logging conventions, -- context usage conventions, -- test style, -- benchmark style, -- dependency injection pattern, -- API versioning conventions, -- DTO/model separation style, -- storage and transaction conventions, -- lint and formatting requirements. - -If conventions are inconsistent, prefer the one used in the closest affected code. - ---- - -## 4. Scope control - -### 4.1 Stay within scope - -The agent must solve the user’s request directly and avoid unrelated cleanup. - -Allowed adjacent changes: - -- fixing a test broken by the main change, -- adding a missing helper required by the main change, -- small refactors necessary to make the change safe, -- updating documentation directly affected by the change. - -Not allowed without explicit justification: - -- formatting unrelated files, -- reorganizing package structure, -- replacing libraries, -- changing error taxonomy globally, -- changing logging framework, -- broad “modernization” passes, -- large dependency bumps. - -### 4.2 When the requested change is underspecified - -If details are missing, the agent should: - -1. infer the most conservative behavior from existing code, -2. avoid breaking current behavior, -3. document the chosen assumption in the final response. - -Do not block on avoidable clarification if a reasonable implementation path exists. - ---- - -## 5. Go version and language guidance - -### 5.1 Target version - -Target the Go version declared in `go.mod`. - -If the repository does not make this obvious, assume modern stable Go and avoid experimental features unless already present. - -### 5.2 Idiomatic Go requirements - -The agent should prefer: - -- target Go version language idioms and syntax improvements, -- simple package APIs, -- concrete types when interfaces are not needed, -- small interfaces defined by consumers, -- explicit error handling, -- early returns, -- table-driven tests where appropriate, -- `context.Context` as the first parameter for request-scoped operations, -- `errors.AsType` first, `errors.Is` / `errors.As` last, -- standard library first. - -The agent should avoid: - -- unnecessary generics, -- unnecessary reflection, -- hidden global state, -- panics for expected errors, -- overuse of empty interfaces or `any`, -- deeply nested control flow, -- concurrency without clear benefit, -- channel-based designs where a simple call flow is better. - -### 5.3 Style details - -Prefer: - -- short, focused functions, -- package-level cohesion, -- exported identifiers only when needed, -- comments for exported symbols, -- comments explaining “why”, not narrating trivial code, -- stable and unsurprising zero values where appropriate. 
- -Avoid: - -- single-letter names except tight local scopes, -- clever helper layers that obscure flow, -- Boolean parameter lists that are hard to read, -- hidden side effects, -- magic constants without names. - ---- - -## 6. Editing rules for Go code - -### 6.1 Function and type changes - -When modifying a function or method, the agent should: - -- preserve signature compatibility unless the task explicitly requires change, -- preserve context and cancellation behavior, -- preserve caller expectations, -- update all call sites, -- update tests that express expected behavior. - -When adding new exported API: - -- keep it minimal, -- document it, -- justify why export is needed, -- prefer package-private helpers if external use is not required. - -### 6.2 Error handling - -The agent must: - -- return errors, not swallow them, -- wrap errors when adding useful context, -- avoid duplicative wrapping, -- preserve sentinel errors or typed errors already used in the codebase, -- use `%w` correctly, -- not log and return the same error at multiple layers unless the codebase explicitly does that. - -If the codebase distinguishes user-facing, domain, transport, and storage errors, preserve that separation. - -### 6.3 Context usage - -The agent must: - -- pass context through relevant call chains, -- not store contexts in structs, -- not use `context.Background()` in request flows unless clearly appropriate, -- respect cancellation and deadlines when existing code expects that, -- avoid creating child contexts unnecessarily. - -### 6.4 Concurrency - -Only introduce concurrency if it clearly improves the requested behavior and does not degrade maintainability. - -If adding concurrency, the agent must consider: - -- cancellation, -- data races, -- goroutine lifetime, -- bounded parallelism, -- error propagation, -- testability, -- deterministic shutdown. - -Avoid spawning goroutines without a clear ownership model. - -### 6.5 Logging and observability - -Follow existing repository conventions. - -The agent should: - -- keep logs structured if the codebase uses structured logging, -- avoid logging sensitive values, -- avoid noisy logs in hot paths, -- preserve stable field names when logs are used operationally, -- update metrics/traces only when directly relevant. - -Do not add logs as a substitute for error handling. - ---- - -## 7. Testing requirements - -### 7.1 General rule - -Every behavior change should be covered by tests unless the repository clearly does not test that layer. - -A functional code change without tests requires a clear reason in the final response. - -### 7.2 Preferred testing style - -Prefer: - -- table-driven tests, -- focused tests per behavior, -- `testify` for assertions and requirements if the repository already uses it or if new tests are added and no conflicting convention exists, -- deterministic tests, -- subtests with meaningful names, -- minimal fixtures, -- clear failure messages. - -### 7.3 What tests should verify - -Tests should verify: - -- externally observable behavior, -- error cases, -- edge cases, -- nil / empty / zero-value behavior where relevant, -- backward compatibility where relevant, -- concurrency behavior if changed, -- serialization/deserialization boundaries if relevant. 
- -### 7.4 What tests should avoid - -Avoid tests that are: - -- tightly coupled to private implementation details without need, -- flaky, -- timing-sensitive without control, -- dependent on wall clock when fake time can be used, -- dependent on random behavior without fixed seed, -- dependent on external services unless the repository already uses integration test infrastructure. - -### 7.5 Test commands - -Prefer repository-native commands first. - -Common examples: - -```bash -go test ./... -go test ./... -race -go test ./... -cover -``` - -If a narrower command is sufficient, use the smallest command that provides confidence. - ---- - -## 8. Dependency policy - -### 8.1 Default rule - -Prefer the Go standard library and existing repository dependencies. - -Do not add a new dependency unless it provides clear value that is difficult to replicate safely with existing tools. - -### 8.2 If adding a dependency is necessary - -The agent must: - -- choose a well-maintained package, -- minimize dependency surface, -- avoid dependency overlap, -- explain why the new dependency is needed, -- update tests and usage accordingly. - -Avoid adding heavy frameworks into lightweight packages. - ---- - -## 9. Performance policy - -### 9.1 Default stance - -Do not optimize speculatively. - -Prefer clear code first, then optimize only if: - -- the task is explicitly performance-related, -- the affected path is obviously hot, -- profiling evidence is available, -- the repository already treats this path as performance-sensitive. - -### 9.2 When performance matters - -The agent should consider: - -- allocations, -- copies, -- unnecessary conversions, -- lock contention, -- query count, -- I/O amplification, -- algorithmic complexity. - -If making a performance optimization, document the trade-off and preserve readability as much as possible. - ---- - -## 10. API, wire format, and compatibility rules - -### 10.1 Backward compatibility - -Assume compatibility matters unless the task says otherwise. - -The agent must not casually change: - -- JSON field names, -- protobuf field numbers, -- SQL schema semantics, -- HTTP status codes, -- error codes, -- event payloads, -- config keys, -- environment variable names, -- CLI flags, -- file formats. - -### 10.2 If a breaking change is necessary - -The agent should: - -- keep the change localized, -- update affected tests, -- update docs and examples, -- explicitly call out the break in the final response. - ---- - -## 11. Database and persistence guidance - -If the repository interacts with a database, the agent should preserve data safety first. - -### 11.1 Queries and mutations - -The agent must: - -- understand existing transaction boundaries, -- avoid introducing N+1 query patterns, -- preserve idempotency where relevant, -- preserve isolation expectations, -- handle `sql.ErrNoRows` or equivalent consistently. - -### 11.2 Migrations - -If adding or changing migrations: - -- make them forward-safe, -- avoid destructive changes unless explicitly requested, -- preserve rollback strategy if the repository uses one, -- avoid combining schema and risky data backfills blindly, -- update related models, queries, and tests. - -### 11.3 Data correctness - -The agent must be conservative with: - -- nullability, -- defaults, -- unique constraints, -- indexes, -- timestamp semantics, -- timezone handling, -- soft-delete semantics. - ---- - -## 12. 
HTTP / RPC / messaging guidance - -### 12.1 Handlers and transport code - -When editing transport-layer code, preserve: - -- status code semantics, -- request validation behavior, -- response shape, -- middleware expectations, -- authn/authz boundaries, -- timeout and cancellation behavior. - -### 12.2 Serialization - -The agent must: - -- keep wire compatibility, -- avoid changing omitempty behavior casually, -- handle unknown fields according to existing patterns, -- preserve canonical formats if already established. - -### 12.3 Messaging / events - -For queues, streams, or pub/sub: - -- preserve event contract stability, -- preserve delivery assumptions, -- preserve idempotency handling, -- avoid changing partitioning or keys without reason. - ---- - -## 13. CLI and developer-experience guidance - -If the repository includes CLI commands or tooling, the agent should preserve UX consistency. - -Do not casually change: - -- command names, -- flag names, -- exit code semantics, -- help text style, -- config resolution order. - -When adding a flag or command: - -- keep naming consistent, -- document defaults, -- handle invalid input cleanly, -- add tests where feasible. - ---- - -## 14. Security and secrets handling - -The agent must treat security as a default concern. - -### 14.1 Must avoid - -Never: - -- commit secrets, -- log tokens, passwords, cookies, private keys, or connection strings, -- weaken auth checks casually, -- disable TLS verification without explicit reason, -- interpolate untrusted input into shell/SQL/HTML/paths unsafely, -- introduce path traversal risks, -- trust user input without validation. - -### 14.2 Must consider - -Consider: - -- input validation, -- output encoding, -- least privilege, -- SSRF risk, -- command injection, -- SQL injection, -- deserialization safety, -- sensitive data redaction, -- constant-time comparisons where relevant, -- secure defaults. - -### 14.3 Authentication and authorization - -Preserve existing auth boundaries. - -If a task touches auth logic, the agent must be especially conservative and update tests for both allowed and denied cases. - ---- - -## 15. Configuration guidance - -The agent should preserve current configuration patterns. - -Do not casually change: - -- env var names, -- precedence rules, -- default values, -- required/optional behavior, -- config file schema. - -When adding configuration: - -- prefer clear names, -- define sane defaults, -- validate values, -- document behavior, -- update examples if present. - ---- - -## 16. Documentation update policy - -Update documentation when the user-visible or developer-visible behavior changes. - -Potential files to update: - -- `README.md` -- package docs -- API docs -- CLI help -- examples -- migration notes -- deployment docs - -Do not rewrite large docs unless necessary. - ---- - -## 17. Commenting policy - -### 17.1 Code comments - -Use comments sparingly but effectively. - -Add comments when: - -- exporting a symbol, -- explaining why a non-obvious approach is used, -- documenting invariants, -- clarifying ownership/lifecycle/concurrency rules. - -Do not add comments that merely restate obvious code. - -### 17.2 Commit-style explanations in response - -In the final response, the agent should explain: - -- what changed, -- why it changed, -- what assumptions were made, -- what was tested, -- any notable trade-offs. - ---- - -## 18. How to present work in chat - -When the agent responds with implementation details, it should be concise but complete. 
- -### 18.1 Final response should usually include - -- a short summary of the change, -- the key files modified, -- important reasoning or assumptions, -- test commands executed, -- any remaining risks or follow-ups if relevant. - -### 18.2 The agent must not - -- dump huge irrelevant code blocks if files were already edited, -- exaggerate confidence, -- claim tests passed if they were not run, -- omit important caveats. - ---- - -## 19. Patch construction guidance - -### 19.1 Preferred change shape - -Prefer a sequence like: - -1. smallest safe production change, -2. tests that capture behavior, -3. minimal docs update if needed. - -### 19.2 Refactoring threshold - -Refactor only when necessary to support the requested change. - -Good reasons: - -- current structure prevents a safe fix, -- testability is too poor to validate behavior, -- the bug stems from tangled responsibilities, -- a small extraction materially reduces risk. - -Bad reasons: - -- personal style preference, -- “cleaner architecture” ambitions, -- speculative future use cases. - ---- - -## 20. Large or risky changes - -For changes with broad blast radius, the agent should be more conservative. - -Examples: - -- auth, -- billing, -- persistence, -- migrations, -- concurrency, -- public APIs, -- shared libraries, -- critical hot paths. - -In such cases, the agent should: - -- minimize the changed surface area, -- add focused regression coverage, -- call out risk explicitly, -- avoid mixing in refactors. - ---- - -## 21. When the agent should stop and report limits - -The agent should explicitly say so if: - -- the repository is missing files needed to implement safely, -- tests cannot be run in the environment, -- behavior depends on unknown external systems, -- a breaking design choice is required but unspecified, -- the requested change would be unsafe without broader context. - -In those cases, still provide the best grounded partial result possible. - ---- - -## 22. Preferred workflow for bug fixes - -When fixing a bug, the agent should generally follow this order: - -1. identify the failing behavior, -2. inspect the smallest relevant code path, -3. preserve existing public contract, -4. implement the minimal fix, -5. add or update regression tests, -6. verify no adjacent behavior was unintentionally changed. - -If the root cause is uncertain, state that clearly and avoid overstating certainty. - ---- - -## 23. Preferred workflow for new features - -When implementing a feature, the agent should generally: - -1. inspect similar existing features, -2. match established architecture, -3. add the smallest useful surface area, -4. keep compatibility where possible, -5. add tests for success and failure paths, -6. update minimal necessary docs. - ---- - -## 24. Preferred workflow for refactoring - -For refactors, the agent must preserve behavior. - -The agent should: - -- keep refactors mechanical and reviewable, -- avoid semantic drift, -- maintain test coverage, -- separate pure refactor from behavior change whenever practical. - -If both are unavoidable in one patch, explain that clearly. - ---- - -## 25. Monorepo / multi-package guidance - -If this repository contains multiple services or packages, the agent should: - -- change only the relevant module/package unless broader edits are required, -- respect local conventions of the touched area, -- check for local `AGENTS.md` files, -- avoid introducing cross-package coupling casually. - ---- - -## 26. 
File and package organization guidance - -When adding new files: - -- place them near the owning package, -- use existing naming conventions, -- avoid generic names like `common.go`, `helpers.go`, `utils.go` unless that pattern already exists, -- keep package boundaries clear. - -When adding helpers, prefer names tied to the domain or behavior. - ---- - -## 27. Example Go-specific preferences - -These are defaults unless the repository already uses a different style. - -### 27.1 Error examples - -Preferred: - -```go -func ParsePort(s string) (int, error) { - port, err := strconv.Atoi(s) - if err != nil { - return 0, fmt.Errorf("parse port %q: %w", s, err) - } - if port < 1 || port > 65535 { - return 0, fmt.Errorf("parse port %q: out of range", s) - } - return port, nil -} -``` - -Avoid: - -```go -func ParsePort(s string) (int, error) { - i, _ := strconv.Atoi(s) - return i, nil -} -``` - -### 27.2 Context examples - -Preferred: - -```go -func (s *Service) Fetch(ctx context.Context, id string) (*Item, error) { - if err := ctx.Err(); err != nil { - return nil, err - } - return s.repo.Fetch(ctx, id) -} -``` - -Avoid: - -```go -func (s *Service) Fetch(id string) (*Item, error) { - return s.repo.Fetch(context.Background(), id) -} -``` - -### 27.3 Table-driven tests - -Preferred: - -```go -func TestParsePort(t *testing.T) { - t.Parallel() - - tests := []struct { - name string - input string - want int - wantErr bool - }{ - {name: "valid", input: "8080", want: 8080}, - {name: "non-numeric", input: "abc", wantErr: true}, - {name: "out of range", input: "70000", wantErr: true}, - } - - for _, tt := range tests { - tt := tt - t.Run(tt.name, func(t *testing.T) { - t.Parallel() - - got, err := ParsePort(tt.input) - if tt.wantErr { - require.Error(t, err) - return - } - - require.NoError(t, err) - assert.Equal(t, tt.want, got) - }) - } -} -``` - ---- - -## 28. Suggested command checklist - -Before concluding, the agent should use the smallest relevant subset of these commands when available and appropriate: - -```bash -go test ./... -go test ./... -race -go test ./... -cover -go vet ./... -golangci-lint run -staticcheck ./... -go test ./path/to/pkg -run TestName -v -``` - -Use repository-native wrappers first if they exist, for example: - -```bash -make test -make lint -task test -task lint -``` - ---- - -## 29. Suggested final response template - -Use this shape unless the user asked for something else: - -1. What changed. -2. Why it changed. -3. Files touched. -4. Tests run. -5. Assumptions or caveats. - -Be direct. Do not pad the response. - ---- - -## 30. Bottom-line instruction - -When in doubt, the agent should choose the safest change that: - -- solves the actual user request, -- matches existing repository conventions, -- preserves compatibility, -- adds or updates tests, -- keeps the diff small and reviewable. diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index ef1a95e..5711900 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -658,12 +658,15 @@ It owns: * starting game engine containers; * stopping containers; * restarting containers where allowed; -* patching/replacing containers where allowed; +* patching/replacing containers (semver patch only) where allowed; * technical runtime inspection/status; -* monitoring containers and publishing technical health events. 
+* monitoring containers via Docker events, periodic inspect, and active HTTP probe; +* publishing technical runtime events (`runtime:job_results`, `runtime:health_events`); +* publishing admin-only notification intents for first-touch start failures. It does **not** own platform metadata of games. It does **not** own runtime business state of games. +It does **not** resolve engine versions; the producer (`Game Lobby` in v1, `Game Master` later) supplies `image_ref`. It executes runtime jobs for `Game Lobby` and `Game Master`. ### Container model @@ -673,6 +676,62 @@ It executes runtime jobs for `Game Lobby` and `Game Master`. This is a hard invariant. +Each container is created with hostname `galaxy-game-{game_id}` and attached to the +single user-defined Docker bridge network configured by `RTMANAGER_DOCKER_NETWORK`. +The network is provisioned outside `Runtime Manager` (compose, Terraform, or operator +runbook); a missing network is a fail-fast condition at startup. The published +`engine_endpoint` is the stable URL `http://galaxy-game-{game_id}:8080`; restart and +patch keep the same DNS name even though `current_container_id` changes. + +### Image policy + +`Runtime Manager` never resolves engine versions. The producer (`Game Lobby` in v1, +`Game Master` once implemented) computes `image_ref` from its own template and +hands it to `Runtime Manager` on the start envelope. `Runtime Manager` accepts the +reference verbatim, applies the configured pull policy +(`RTMANAGER_IMAGE_PULL_POLICY`), and reads container resource limits from labels +on the resolved image. + +The producer-supplied `image_ref` rule decouples `Runtime Manager` from any +engine-version arbitration logic, lets the v1 launch ship without `Game Master`'s +engine-version registry, and cleanly separates "which image to run" (Lobby/GM +concern) from "how to run it" (RTM concern). Two alternatives were rejected: +RTM holding its own image map (would need to consume upstream tariff or +compatibility signals that belong in the producers) and RTM resolving the +image at start time by querying GM (would create a circular dependency for +v1 and add a synchronous hop on the hot path). + +Patch is restart with a new `image_ref` and is allowed only as a semver patch +within the same major/minor line; cross-major or cross-minor patch attempts fail +with `semver_patch_only`. Producers that need to change the major/minor line must +stop the game and start a new container. + +### State ownership + +Engine state lives on the host filesystem under the per-game directory +`/{game_id}` and is bind-mounted into the container at +`RTMANAGER_ENGINE_STATE_MOUNT_PATH`. The mount path is exposed to the engine through +`GAME_STATE_PATH` and, for backward compatibility, also as `STORAGE_PATH`. Both +names are accepted by `galaxy/game` in v1. + +`Runtime Manager` never deletes the host state directory. Removing a container +through the cleanup endpoint or the retention TTL leaves the directory intact. +Backup, archival, and operator cleanup of state directories belong to operator +tooling or a future Admin Service workflow. + +### Reconcile policy + +`Runtime Manager` reconciles its `runtime_records` with Docker reality at startup +(blocking, before workers start) and on a periodic interval +(`RTMANAGER_RECONCILE_INTERVAL`). 
Two rules apply unconditionally: + +* unrecorded containers labelled `com.galaxy.owner=rtmanager` are **adopted** into + `runtime_records` as `running`, never killed; operators may have launched one + manually for diagnostics; +* recorded `running` rows whose container is missing in Docker are marked + `removed`, with a `container_disappeared` event emitted on + `runtime:health_events`. + ## 10. [Notification Service](notification/README.md) `Notification Service` is the async delivery/orchestration layer for platform notifications. @@ -770,6 +829,18 @@ The platform uses one simple rule: * if the user-facing request must complete with a deterministic result in the same flow, the critical internal chain is synchronous; * if the interaction is propagation, notification, cache invalidation, runtime job completion, telemetry, or denormalized read-model update, it is asynchronous. +The `Lobby ↔ Runtime Manager` transport is the canonical asynchronous case: +Lobby drives RTM exclusively through Redis Streams (`runtime:start_jobs`, +`runtime:stop_jobs`, `runtime:job_results`); there is no synchronous +Lobby→RTM REST call in v1, and no plan to add one. Synchronous coupling +would force Lobby to block on Docker pull/start latency, which is +unbounded in the worst case. `Game Master` and `Admin Service`, by contrast, +drive RTM synchronously over REST because they operate on already-running +containers and need deterministic per-request outcomes (for example, +"restart this game's container now"); routing those operations through +streams would force operators to correlate async results back to admin +requests for no operational benefit. + ### Fixed synchronous interactions * `Gateway -> Auth / Session Service` @@ -783,13 +854,17 @@ The platform uses one simple rule: * `Geo Profile Service -> User Service` * `Game Lobby -> User Service` * `Game Lobby -> Game Master` for critical registration/update calls +* `Game Master -> Runtime Manager` for inspect, restart, patch, stop, and cleanup REST calls +* `Admin Service -> Runtime Manager` for operational inspect, restart, patch, stop, and cleanup REST calls ### Fixed asynchronous interactions * session lifecycle projection toward gateway cache; * revoke propagation; -* `Lobby -> Runtime Manager` runtime jobs; -* `Game Master -> Runtime Manager` runtime jobs; +* `Lobby -> Runtime Manager` runtime jobs through `runtime:start_jobs` (`{game_id, image_ref, requested_at_ms}`) and `runtime:stop_jobs` (`{game_id, reason, requested_at_ms}`); +* `Runtime Manager -> Lobby` job outcomes through `runtime:job_results`; +* `Runtime Manager -> Notification Service` admin-only failure intents (image pull, container start, start config) through `notification:intents`; +* `Runtime Manager` outbound technical health stream `runtime:health_events` consumed by `Game Master`; `Game Lobby` and `Admin Service` are reserved as future consumers; * all event-bus propagation; * `Game Master -> Game Lobby` runtime snapshot updates (including `player_turn_stats` for capability aggregation) and game-finish events @@ -831,6 +906,8 @@ PostgreSQL is the source of truth for table-shaped business state: malformed-intent audit; * lobby games, applications, invites, memberships, and the race-name registry (registered/reservation/pending tiers); +* runtime manager runtime records (`game_id -> current_container_id`), + per-operation audit log, and latest health snapshot per game; * idempotency records, expressed as `UNIQUE` constraints on the durable table — not as a separate kv; * retry 
scheduling state, expressed as a `next_attempt_at` column on the @@ -839,11 +916,13 @@ PostgreSQL is the source of truth for table-shaped business state: Redis is the source of truth for ephemeral and runtime-coordination state: * the platform event bus implemented as Redis Streams (`user:domain_events`, - `user:lifecycle_events`, `gm:lobby_events`, `runtime:job_results`, + `user:lifecycle_events`, `gm:lobby_events`, `runtime:start_jobs`, + `runtime:stop_jobs`, `runtime:job_results`, `runtime:health_events`, `notification:intents`, `gateway:client-events`, `mail:delivery_commands`); * stream consumer offsets; * gateway session cache, replay reservations, rate-limit counters, and - short-lived runtime locks/leases (e.g. notification `route_leases`); + short-lived runtime locks/leases (e.g. notification `route_leases`, + runtime manager per-game operation leases `rtmanager:game_lease:{game_id}`); * `Auth / Session Service` challenges and active session tokens, which are TTL-bounded and where loss is recoverable by re-authentication; * lobby per-game runtime aggregates that are deleted at game finish @@ -852,9 +931,9 @@ Redis is the source of truth for ephemeral and runtime-coordination state: ### Database topology * Single PostgreSQL database `galaxy`. -* Schema per service: `user`, `mail`, `notification`, `lobby`. Reserved for - future use: `geoprofile`. Not allocated unless needed: `gateway`, - `authsession`. +* Schema per service: `user`, `mail`, `notification`, `lobby`, `rtmanager`. + Reserved for future use: `geoprofile`. Not allocated unless needed: + `gateway`, `authsession`. * Each service connects with its own PostgreSQL role whose grants are restricted to its own schema (defense-in-depth). * Authentication is username + password only. `sslmode=disable`. No client @@ -933,15 +1012,15 @@ crossing the SQL boundary carry `time.UTC` as their location. ### Configuration For each service `` ∈ { `USERSERVICE`, `MAIL`, `NOTIFICATION`, -`LOBBY`, `GATEWAY`, `AUTHSESSION` }, the Redis connection accepts: +`LOBBY`, `RTMANAGER`, `GATEWAY`, `AUTHSESSION` }, the Redis connection accepts: * `_REDIS_MASTER_ADDR` (required) * `_REDIS_REPLICA_ADDRS` (optional, comma-separated) * `_REDIS_PASSWORD` (required) * `_REDIS_DB`, `_REDIS_OPERATION_TIMEOUT` -For PG-backed services (`USERSERVICE`, `MAIL`, `NOTIFICATION`, `LOBBY`) -the Postgres connection accepts: +For PG-backed services (`USERSERVICE`, `MAIL`, `NOTIFICATION`, `LOBBY`, +`RTMANAGER`) the Postgres connection accepts: * `_POSTGRES_PRIMARY_DSN` (required; `postgres://:@:5432/galaxy?search_path=&sslmode=disable`) @@ -951,9 +1030,105 @@ the Postgres connection accepts: Stream- and key-shape env vars (`*_REDIS_DOMAIN_EVENTS_STREAM`, `*_REDIS_LIFECYCLE_EVENTS_STREAM`, `*_REDIS_KEYSPACE_PREFIX`, -`MAIL_REDIS_COMMAND_STREAM`, `NOTIFICATION_INTENTS_STREAM`, etc.) keep -their current names and semantics — they describe stream/key shapes, not -connection topology. +`MAIL_REDIS_COMMAND_STREAM`, `NOTIFICATION_INTENTS_STREAM`, +`RTMANAGER_REDIS_START_JOBS_STREAM`, `RTMANAGER_REDIS_STOP_JOBS_STREAM`, +`RTMANAGER_REDIS_JOB_RESULTS_STREAM`, `RTMANAGER_REDIS_HEALTH_EVENTS_STREAM`, +etc.) keep their current names and semantics — they describe stream/key +shapes, not connection topology. + +## Test and Contract Conventions + +The repository follows a small set of cross-service rules for contract +specifications and test doubles. Each rule is captured below with the +rejected alternatives so future services do not re-litigate them. 
+ +### AsyncAPI version: 3.1.0 + +Every AsyncAPI spec in the repository declares `asyncapi: 3.1.0` +(`notification/api/intents-asyncapi.yaml`, +`rtmanager/api/runtime-jobs-asyncapi.yaml`, +`rtmanager/api/runtime-health-asyncapi.yaml`). Operators read the same +shape across services — channel with `address`, separate `operations` +block, `action: send | receive` vocabulary. + +Alternatives rejected: + +- AsyncAPI 2.6.0 — would carry the same information under different + field names (`publish` / `subscribe` blocks living inside the channel) + and the shared YAML walker assertions would not transfer cleanly; +- adding a typed AsyncAPI parser library — no Galaxy service uses one + today; introducing a new dependency for the existing specs would + break the established pattern that all AsyncAPI freeze tests are pure + YAML walkers using `gopkg.in/yaml.v3`. + +The `oneOf`-based polymorphism on the `details` field in +`runtime-health-asyncapi.yaml` is plain JSON Schema and works +identically in 3.1.0; no AsyncAPI-version-specific feature is used. If +`notification/api/intents-asyncapi.yaml` ever moves to a newer major, +every downstream service moves with it as a cross-service contract bump. + +### Contract freeze tests + +OpenAPI freeze tests use `github.com/getkin/kin-openapi/openapi3`. The +library is already a workspace-wide dependency +(`lobby/contract_openapi_test.go`, `game/openapi_contract_test.go`, +`rtmanager/contract_openapi_test.go`). It validates OpenAPI 3.0 +syntactic correctness, exposes a typed AST, and lets assertions reach +operation IDs, schema references, required fields, and enum membership +without a hand-rolled parser. + +AsyncAPI freeze tests use `gopkg.in/yaml.v3` plus a small set of +helpers (`getMapValue`, `getStringValue`, `getStringSlice`, +`getSliceValue`, `getBoolValue`). AsyncAPI 3.1.0 is itself a JSON +Schema document; the freeze tests only need to assert on field paths, +enum membership, required fields, and `$ref` targets — none of which +require type-aware parsing. + +Both freeze tests live at the module root (`package ` next to +`go.mod`) for every service. A subpackage like `/contracts/` +would have to import the service's domain types to share constants, +which would create the exact import cycle the freeze tests are meant +to prevent. + +### Test doubles: `mockgen` for narrow recorder ports, `*inmem` for behavioural fakes + +Test doubles in the repository follow a three-track convention: + +- **Narrow recorder ports** (interfaces whose implementation has no + domain semantics — record calls, return injectable errors, expose + accessor methods) use `go.uber.org/mock` mocks. Examples: + `lobby/internal/ports/{RuntimeManager, IntentPublisher, GMClient, + UserService}`, `rtmanager/internal/ports/DockerClient`, + `rtmanager/internal/api/internalhttp/handlers/{Start,Stop,Restart, + Patch,Cleanup}Service`. `//go:generate` directives live next to the + interface declaration; generated mocks are committed under + `/internal/adapters/mocks/` (or `handlers/mocks/`); the + `make -C mocks` target regenerates them. +- **Behavioural in-memory adapters** (re-implement the production + contract — CAS, domain transitions, monotonic invariants, two-tier + invariants like the Race Name Directory) live under + `/internal/adapters/inmem/` and stay hand-rolled. + Replacing them with `mockgen` would force every consumer site to + script `EXPECT()` chains for behaviour the fake currently handles + automatically, and would lose the cross-implementation parity guarantee. 
+- **Dead test doubles** with no consumers are deleted on sight. + +Per-test recorder helpers (small structs holding captured slices and +per-test error injection) live **inside the test files that use them** +rather than in a shared `mockrec` / `testfixtures` package. A shared +package would re-create the retired `*stub` convention in a different +namespace; per-test recorders are easy to specialise without polluting +a shared surface. + +`racenameinmem` is a special case: it is also one of two selectable +Race Name Directory backends chosen via +`LOBBY_RACE_NAME_DIRECTORY_BACKEND=stub` (the config token name is +preserved while the package name follows the `*inmem` convention; both +backends pass the shared conformance suite at +`lobby/internal/ports/racenamedirtest/`). + +The maintained `go.uber.org/mock` fork is preferred over the archived +`github.com/golang/mock`. ## Main End-to-End Flows @@ -1283,7 +1458,12 @@ Recommended order for implementation is: Platform game records, membership, invites, applications, approvals, schedules, user-facing lists, pre-start lifecycle. 7. **Runtime Manager** - Dedicated Docker-control service for container start/stop/patch/status and technical runtime monitoring. + Dedicated Docker-control service for container lifecycle (start, stop, + restart, semver-patch, cleanup) and inspect/health monitoring through + Docker events, periodic inspect, and active HTTP probes. Driven + asynchronously from `Game Lobby` via `runtime:start_jobs` / + `runtime:stop_jobs` and synchronously from `Game Master` and + `Admin Service` via the trusted internal REST surface. 8. **Game Master** Running-game orchestration, engine version registry, runtime state, turn scheduler, engine API mediation, operational controls. diff --git a/game/Dockerfile b/game/Dockerfile new file mode 100644 index 0000000..c04e50d --- /dev/null +++ b/game/Dockerfile @@ -0,0 +1,60 @@ +# syntax=docker/dockerfile:1.7 + +# Build context is the workspace root (galaxy/), not the game/ subdirectory, +# because the game module pulls galaxy/{calc,error,model,util} through the +# go.work replace directives. Build with: +# +# docker build -t galaxy/game:test -f game/Dockerfile . + +FROM golang:1.26.2-alpine AS builder +WORKDIR /src +ENV CGO_ENABLED=0 GOFLAGS=-trimpath + +# Only the four pkg/ modules the engine binary actually imports. +COPY pkg/calc/ ./pkg/calc/ +COPY pkg/error/ ./pkg/error/ +COPY pkg/model/ ./pkg/model/ +COPY pkg/util/ ./pkg/util/ +COPY game/ ./game/ + +# Minimal workspace. The repository-level go.work also lists service +# modules (lobby, notification, ...) that the engine binary does not +# need, so we synthesise a workspace tailored to this image instead of +# dragging the rest of the monorepo into the build context. 
+RUN <<'EOF' cat > go.work +go 1.26.2 + +use ( + ./game + ./pkg/calc + ./pkg/error + ./pkg/model + ./pkg/util +) + +replace ( + galaxy/calc v0.0.0 => ./pkg/calc + galaxy/error v0.0.0 => ./pkg/error + galaxy/model v0.0.0 => ./pkg/model + galaxy/util v0.0.0 => ./pkg/util +) +EOF + +RUN --mount=type=cache,target=/root/.cache/go-build \ + --mount=type=cache,target=/go/pkg/mod \ + go build -ldflags="-s -w" -o /out/server ./game/cmd/http + +FROM gcr.io/distroless/static-debian12:nonroot AS runtime + +LABEL com.galaxy.cpu_quota="1.0" +LABEL com.galaxy.memory="512m" +LABEL com.galaxy.pids_limit="512" +LABEL org.opencontainers.image.title="galaxy-game-engine" + +ENV STORAGE_PATH=/var/lib/galaxy-game +EXPOSE 8080 +USER nonroot:nonroot + +COPY --from=builder /out/server /usr/local/bin/server + +ENTRYPOINT ["/usr/local/bin/server"] diff --git a/game/README.md b/game/README.md index ba126b2..e8b31a4 100644 --- a/game/README.md +++ b/game/README.md @@ -1,8 +1,184 @@ # Game Service Engine -Galaxy game engine — hosts a single game instance and exposes a REST API for -game initialization, turn advancement, player reports, and command execution. +`galaxy/game` is the game engine binary that runs inside one +`galaxy-game-{game_id}` container. It hosts a single game instance and exposes +a REST API for game initialization, turn advancement, player reports, and +batched player command execution. -## API +## References -The REST contract is documented in [`openapi.yaml`](openapi.yaml). +- [`openapi.yaml`](openapi.yaml) — REST contract. +- [`../ARCHITECTURE.md`](../ARCHITECTURE.md) — system architecture. +- [`../rtmanager/README.md`](../rtmanager/README.md) — Runtime Manager owns + container lifecycle for this binary. + +## Container model + +The engine is meant to be run inside a Docker container managed by +`Runtime Manager`. One container hosts exactly one game instance and listens +on TCP `:8080` inside the container. Outside the container the endpoint is +addressed as `http://galaxy-game-{game_id}:8080` through Docker's embedded DNS +on the configured `RTMANAGER_DOCKER_NETWORK`. + +The container image is built from [`Dockerfile`](Dockerfile) at the root of +this module. The Dockerfile is a multi-stage build (Go builder + small runtime +base) that exposes `:8080`, runs as a non-root user, and ships container +labels that `Runtime Manager` reads at create time: + +| Label | Meaning | +| --- | --- | +| `com.galaxy.cpu_quota` | CPU quota for the container (`--cpus`). | +| `com.galaxy.memory` | Memory limit for the container (`--memory`). | +| `com.galaxy.pids_limit` | PID limit for the container (`--pids-limit`). | +| `org.opencontainers.image.title` | `galaxy-game-engine`. | + +Image defaults are `cpu_quota=1.0`, `memory=512m`, `pids_limit=512`. Operators +override them at image-build time by editing the Dockerfile labels; producers +do not pass per-game limits. + +## Endpoints + +The contract is the union of `openapi.yaml` and the technical liveness probe +described below. + +### Game endpoints + +Documented in [`openapi.yaml`](openapi.yaml). When the engine has not been +initialised through `POST /api/v1/init`, game endpoints respond `501 Not +Implemented` to make the uninitialised state unambiguous. + +### `GET /healthz` + +Technical liveness probe used by `Runtime Manager` and operator tooling. + +- Returns `{"status":"ok"}` with HTTP `200` whenever the HTTP server is + serving requests, regardless of whether the engine has been initialised + through `POST /api/v1/init`. +- Carries no game-state semantics. 
Use `GET /api/v1/status` for game-state + inspection. + +This endpoint exists so that `Runtime Manager` can probe a freshly started +container before `init` runs. + +## Storage + +The engine reads its persistent storage path from environment variables in +the following order of precedence: + +1. `STORAGE_PATH` — historical name; honoured for backward compatibility. +2. `GAME_STATE_PATH` — canonical name written by `Runtime Manager`. + +If both are set, `STORAGE_PATH` wins. If neither is set, the binary fails +fast on startup. The Dockerfile defaults `STORAGE_PATH=/var/lib/galaxy-game` +so the image runs out of the box if the operator does not supply either +variable. + +`Runtime Manager` creates a per-game host directory under +`/{game_id}` and bind-mounts it into the container +at `RTMANAGER_ENGINE_STATE_MOUNT_PATH` (default `/var/lib/galaxy-game`). The +mount path is then exposed to the engine through `GAME_STATE_PATH` (and, for +compatibility, also as `STORAGE_PATH`). + +The engine is responsible for the contents of the storage directory. +`Runtime Manager` never reads or writes the directory contents, never +deletes the directory, and never inspects per-game state files. + +### Design rationale: storage-path env precedence + +`STORAGE_PATH` wins over `GAME_STATE_PATH` because the engine already +shipped with `STORAGE_PATH` (see `game/Makefile` and +`game/internal/router/handler/handler.go`). Keeping `STORAGE_PATH` as +the authoritative variable means existing engine deployments and +integration fixtures continue to work without code change, while +`GAME_STATE_PATH` is the platform contract written by `Runtime Manager` +and documented in `ARCHITECTURE.md §9`. + +Alternatives considered and rejected: + +- accept only `GAME_STATE_PATH` — would force a breaking change on the + engine binary and on every existing `STORAGE_PATH=...` invocation in + `game/Makefile` and dev scripts; +- `GAME_STATE_PATH` wins over `STORAGE_PATH` — would silently invert + the meaning of an explicit `STORAGE_PATH=` invocation if the operator + also sets `GAME_STATE_PATH` for any reason. + +### Design rationale: storage-path validation site + +`game/internal/router/handler/handler.go` exports `ResolveStoragePath`, +which returns the engine storage path from the env-var pair above and +an error when neither is set. `cmd/http/main.go` calls it before +constructing the router, prints the error to stderr, and exits non-zero. +The existing `initConfig` closure also calls `ResolveStoragePath` to +populate `controller.Param.StoragePath` at request time; the error there +is dropped because `main` already validated the environment at startup. + +This keeps the public router surface (`router.NewRouter`) unchanged — +the env binding is satisfied by one helper plus a startup check, with +no API ripple. Moving env reading entirely into `main` and changing +`NewRouter` / `NewDefaultExecutor` to accept an explicit path was +rejected: it churns multiple call sites for no functional gain. The +current shape leaves the configurer closure ready for future +config-injection refactors without forcing one now. + +## Build + +The container image is built from [`Dockerfile`](Dockerfile). The Docker +build context is the workspace root (`galaxy/`) rather than the `game/` +subdirectory, because `game/` resolves `galaxy/{model,error,util,...}` +through `go.work` `replace` directives. From the workspace root: + +```sh +docker build -t galaxy/game:test -f game/Dockerfile . 
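+
+# Optional local smoke run (a sketch only: on the platform, Runtime Manager
+# creates and wires the container). The host state directory and container
+# name below are placeholder values.
+docker run --rm -d --name galaxy-game-local \
+  -p 8080:8080 \
+  -v "$(pwd)/tmp/game-state:/var/lib/galaxy-game" \
+  galaxy/game:test
+
+# Probe the technical liveness endpoint; it answers before /api/v1/init runs.
+curl http://localhost:8080/healthz   # -> {"status":"ok"}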
+``` + +The build is two-staged: a `golang:1.26.2-alpine` builder produces a +statically linked binary (`CGO_ENABLED=0`), then `gcr.io/distroless/static-debian12:nonroot` +runs it as the `nonroot` user and exposes `:8080`. + +### Design rationale: workspace-root build context + +`game/` is a member of the multi-module `go.work` workspace at the +repository root. Its imports of `galaxy/model`, `galaxy/error`, +`galaxy/util`, etc. are satisfied by `replace` directives in `go.work` +that point at sibling modules under `pkg/`. There is no published +`galaxy/model` module to download. + +A standalone `docker build ./game` therefore cannot resolve those +imports: the `pkg/` tree is outside the build context, and `game/go.mod` +alone has no `replace` directives pointing at it. + +Alternatives rejected: + +- adding `replace` directives to `game/go.mod` and copying `pkg/` into a + vendored layout — duplicates the workspace inside `game/`, drifts from + the rest of the repository, and forces every other workspace member + that ships a Dockerfile to repeat the trick; +- running `go mod vendor` inside `game/` before each build — workspaces + do not vendor cleanly, the resulting `vendor/` would be noisy, and CI + / Makefile would need a custom pre-build step. + +No `.dockerignore` is needed: every `COPY` in `game/Dockerfile` names an +explicit subdirectory (`pkg/calc`, `pkg/error`, `pkg/model`, `pkg/util`, +`game`), and BuildKit (forced by `# syntax=docker/dockerfile:1.7`) only +transfers the paths a `COPY` actually references. + +### Design rationale: `gcr.io/distroless/static-debian12:nonroot` runtime base + +Distroless static is roughly 2 MB and contains no shell or package +manager, which keeps the attack surface and CVE exposure minimal — +appropriate for a service that `Runtime Manager` will start by the +dozen. The image already runs as UID `65532:65532` named `nonroot`, +satisfying the non-root-user requirement without an explicit +`RUN adduser`. + +Alternatives rejected: + +- `alpine:3.20` — provides a shell for ad-hoc debugging but is roughly + 10 MB and inherits regular CVE churn on `musl` / `apk`. The convenience + is not worth the larger attack surface for a fleet of identical engine + containers; operators can always `docker exec` from a debug image when + needed; +- `scratch` — smallest possible image, but ships no `/tmp`, no CA bundle, + and no `/etc/passwd`. Distroless wins on the same security axis while + leaving room for future needs (TLS, logging) without rebuilding the + base layout. 
diff --git a/game/cmd/http/main.go b/game/cmd/http/main.go index 0445b8c..82685cf 100644 --- a/game/cmd/http/main.go +++ b/game/cmd/http/main.go @@ -5,9 +5,15 @@ import ( "os" "galaxy/game/internal/router" + "galaxy/game/internal/router/handler" ) func main() { + if _, err := handler.ResolveStoragePath(); err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } + r := router.NewRouter() if err := r.Run(); err != nil { fmt.Fprintln(os.Stderr, err) diff --git a/game/internal/router/handler/handler.go b/game/internal/router/handler/handler.go index 0454292..bbaed12 100644 --- a/game/internal/router/handler/handler.go +++ b/game/internal/router/handler/handler.go @@ -4,6 +4,7 @@ import ( "errors" "net/http" "os" + "strings" "galaxy/model/order" "galaxy/model/report" @@ -33,9 +34,25 @@ type executor struct { cfg controller.Configurer } +// ResolveStoragePath returns the engine storage path resolved from +// STORAGE_PATH (preferred, historical name) or GAME_STATE_PATH (canonical +// name written by Runtime Manager). It returns an error when neither +// variable is set; callers are expected to fail fast at startup. +func ResolveStoragePath() (string, error) { + if v := strings.TrimSpace(os.Getenv("STORAGE_PATH")); v != "" { + return v, nil + } + if v := strings.TrimSpace(os.Getenv("GAME_STATE_PATH")); v != "" { + return v, nil + } + return "", errors.New("storage path is not set: provide STORAGE_PATH or GAME_STATE_PATH") +} + func initConfig() controller.Configurer { return func(p *controller.Param) { - p.StoragePath = os.Getenv("STORAGE_PATH") + // Validated once at startup by ResolveStoragePath; the error + // is dropped here to keep the Configurer signature simple. + p.StoragePath, _ = ResolveStoragePath() } } diff --git a/game/internal/router/handler/healthz.go b/game/internal/router/handler/healthz.go new file mode 100644 index 0000000..9c71905 --- /dev/null +++ b/game/internal/router/handler/healthz.go @@ -0,0 +1,14 @@ +package handler + +import ( + "net/http" + + "github.com/gin-gonic/gin" +) + +// HealthzHandler is the technical liveness probe used by Runtime Manager +// and operator tooling. It returns 200 with {"status":"ok"} regardless +// of whether the engine has been initialised through POST /api/v1/init. 
+func HealthzHandler(c *gin.Context) { + c.JSON(http.StatusOK, gin.H{"status": "ok"}) +} diff --git a/game/internal/router/healthz_test.go b/game/internal/router/healthz_test.go new file mode 100644 index 0000000..50211ec --- /dev/null +++ b/game/internal/router/healthz_test.go @@ -0,0 +1,57 @@ +package router_test + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + + "galaxy/game/internal/controller" + "galaxy/game/internal/router" + "galaxy/game/internal/router/handler" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestHealthzReturnsOKWithoutInit(t *testing.T) { + r := router.SetupRouter(handler.NewDefaultConfigExecutor(func(p *controller.Param) { + p.StoragePath = "" + })) + + w := httptest.NewRecorder() + req, _ := http.NewRequest(http.MethodGet, "/healthz", nil) + r.ServeHTTP(w, req) + + require.Equal(t, http.StatusOK, w.Code, w.Body) + + var body map[string]string + require.NoError(t, json.Unmarshal(w.Body.Bytes(), &body)) + assert.Equal(t, "ok", body["status"]) +} + +func TestResolveStoragePathPrecedence(t *testing.T) { + t.Setenv("STORAGE_PATH", "/tmp/storage") + t.Setenv("GAME_STATE_PATH", "/tmp/state") + + got, err := handler.ResolveStoragePath() + require.NoError(t, err) + assert.Equal(t, "/tmp/storage", got) +} + +func TestResolveStoragePathFallback(t *testing.T) { + t.Setenv("STORAGE_PATH", "") + t.Setenv("GAME_STATE_PATH", "/tmp/state") + + got, err := handler.ResolveStoragePath() + require.NoError(t, err) + assert.Equal(t, "/tmp/state", got) +} + +func TestResolveStoragePathMissing(t *testing.T) { + t.Setenv("STORAGE_PATH", "") + t.Setenv("GAME_STATE_PATH", "") + + _, err := handler.ResolveStoragePath() + require.Error(t, err) +} diff --git a/game/internal/router/router.go b/game/internal/router/router.go index 4215a45..1e165de 100644 --- a/game/internal/router/router.go +++ b/game/internal/router/router.go @@ -63,6 +63,8 @@ func setupRouter(executor handler.CommandExecutor) *gin.Engine { } } + r.GET("/healthz", handler.HealthzHandler) + groupV1 := r.Group("/api/v1") groupV1.GET("/status", func(ctx *gin.Context) { handler.StatusHandler(ctx, executor) }) diff --git a/game/openapi.yaml b/game/openapi.yaml index f3c6119..aae26a3 100644 --- a/game/openapi.yaml +++ b/game/openapi.yaml @@ -27,6 +27,8 @@ tags: description: Game initialization, state retrieval, and turn advancement. - name: PlayerActions description: Player command execution, order validation, and turn-report retrieval. + - name: Health + description: Technical liveness probes used by Runtime Manager and operator tooling. paths: /api/v1/status: get: @@ -164,6 +166,26 @@ paths: $ref: "#/components/schemas/StateResponse" "500": $ref: "#/components/responses/InternalError" + /healthz: + get: + tags: + - Health + operationId: healthz + summary: Engine liveness probe + description: | + Returns `{"status":"ok"}` with HTTP `200` whenever the HTTP server + is serving requests, regardless of whether the engine has been + initialised through `POST /api/v1/init`. Used by `Runtime Manager` + to probe a freshly started container before `init` runs. Carries + no game-state semantics; use `GET /api/v1/status` for game-state + inspection. + responses: + "200": + description: Engine HTTP server is up. 
+ content: + application/json: + schema: + $ref: "#/components/schemas/HealthzResponse" components: parameters: PlayerParam: @@ -184,6 +206,17 @@ components: minimum: 0 default: 0 schemas: + HealthzResponse: + type: object + description: Engine liveness probe response payload. + required: + - status + properties: + status: + type: string + description: Always "ok" while the engine HTTP server is serving requests. + enum: + - ok StateResponse: type: object description: Summary game state returned after initialization and at each turn boundary. diff --git a/game/openapi_contract_test.go b/game/openapi_contract_test.go index 342ce28..43cbd81 100644 --- a/game/openapi_contract_test.go +++ b/game/openapi_contract_test.go @@ -58,6 +58,13 @@ func TestGameOpenAPISpecFreezesResponseSchemas(t *testing.T) { status: http.StatusOK, wantRef: "#/components/schemas/StateResponse", }, + { + name: "healthz probe", + path: "/healthz", + method: http.MethodGet, + status: http.StatusOK, + wantRef: "#/components/schemas/HealthzResponse", + }, } for _, tt := range tests { @@ -108,6 +115,19 @@ func TestGameOpenAPISpecFreezesCommandRequest(t *testing.T) { require.Equal(t, uint64(1), cmdSchema.Value.MinItems, "CommandRequest.cmd minItems must be 1") } +func TestGameOpenAPISpecHealthzStatusEnum(t *testing.T) { + t.Parallel() + + doc := loadOpenAPISpec(t) + schema := componentSchemaRef(t, doc, "HealthzResponse") + + assertRequiredFields(t, schema, "status") + + statusSchema := schema.Value.Properties["status"] + require.NotNil(t, statusSchema, "HealthzResponse.status schema must exist") + require.Equal(t, []any{"ok"}, statusSchema.Value.Enum, "HealthzResponse.status enum must be [\"ok\"]") +} + func TestGameOpenAPISpecCommandTypeEnumIsComplete(t *testing.T) { t.Parallel() diff --git a/gateway/cmd/gateway/main.go b/gateway/cmd/gateway/main.go index ef3b396..3389f3e 100644 --- a/gateway/cmd/gateway/main.go +++ b/gateway/cmd/gateway/main.go @@ -4,6 +4,7 @@ import ( "context" "errors" "fmt" + "maps" "os" "os/signal" "syscall" @@ -13,6 +14,7 @@ import ( "galaxy/gateway/internal/authn" "galaxy/gateway/internal/config" "galaxy/gateway/internal/downstream" + "galaxy/gateway/internal/downstream/lobbyservice" "galaxy/gateway/internal/downstream/userservice" "galaxy/gateway/internal/events" "galaxy/gateway/internal/grpcapi" @@ -207,8 +209,22 @@ func newAuthenticatedGRPCDependencies(ctx context.Context, cfg config.Config, lo ) } + lobbyRoutes, closeLobbyServiceRoutes, err := lobbyservice.NewRoutes(cfg.LobbyService.BaseURL) + if err != nil { + return grpcapi.ServerDependencies{}, nil, nil, errors.Join( + fmt.Errorf("build authenticated grpc dependencies: lobby service routes: %w", err), + closeUserServiceRoutes(), + closeRedisClient(), + ) + } + + allRoutes := make(map[string]downstream.Client, len(userRoutes)+len(lobbyRoutes)) + maps.Copy(allRoutes, userRoutes) + maps.Copy(allRoutes, lobbyRoutes) + cleanup := func() error { return errors.Join( + closeLobbyServiceRoutes(), closeUserServiceRoutes(), closeRedisClient(), ) @@ -216,7 +232,7 @@ func newAuthenticatedGRPCDependencies(ctx context.Context, cfg config.Config, lo return grpcapi.ServerDependencies{ Service: grpcapi.NewFanOutPushStreamService(pushHub, responseSigner, nil, logger), - Router: downstream.NewStaticRouter(userRoutes), + Router: downstream.NewStaticRouter(allRoutes), ResponseSigner: responseSigner, SessionCache: sessionCache, ReplayStore: replayStore, diff --git a/gateway/internal/config/config.go b/gateway/internal/config/config.go index b9e4681..7e23276 100644 
--- a/gateway/internal/config/config.go +++ b/gateway/internal/config/config.go @@ -54,6 +54,11 @@ const ( // gateway self-service delegation. userServiceBaseURLEnvVar = "GATEWAY_USER_SERVICE_BASE_URL" + // lobbyServiceBaseURLEnvVar names the environment variable that configures + // the optional Game Lobby public HTTP base URL used by authenticated + // gateway platform-command delegation. + lobbyServiceBaseURLEnvVar = "GATEWAY_LOBBY_SERVICE_BASE_URL" + // adminHTTPAddrEnvVar names the environment variable that configures the // private admin HTTP listener address. When it is empty, the admin listener // remains disabled. @@ -475,6 +480,15 @@ type UserServiceConfig struct { BaseURL string } +// LobbyServiceConfig describes the optional authenticated platform-command +// upstream used by the gateway runtime. +type LobbyServiceConfig struct { + // BaseURL is the absolute base URL of the Game Lobby public HTTP API. + // When BaseURL is empty, the gateway keeps using its built-in unavailable + // downstream adapter for the reserved `lobby.*` routes. + BaseURL string +} + // AdminHTTPConfig describes the private operational HTTP listener used for // metrics exposure. The listener remains disabled when Addr is empty. type AdminHTTPConfig struct { @@ -597,6 +611,10 @@ type Config struct { // delegation to User Service. UserService UserServiceConfig + // LobbyService configures the optional authenticated platform-command + // delegation to Game Lobby. + LobbyService LobbyServiceConfig + // AdminHTTP configures the optional private admin listener used for metrics // exposure. AdminHTTP AdminHTTPConfig @@ -788,6 +806,13 @@ func DefaultUserServiceConfig() UserServiceConfig { return UserServiceConfig{} } +// DefaultLobbyServiceConfig returns the default authenticated platform-command +// upstream settings. The zero value keeps the built-in unavailable adapter +// active for reserved `lobby.*` routes. +func DefaultLobbyServiceConfig() LobbyServiceConfig { + return LobbyServiceConfig{} +} + // LoadFromEnv loads Config from the process environment, applies defaults for // omitted settings, and validates the resulting values. func LoadFromEnv() (Config, error) { @@ -797,6 +822,7 @@ func LoadFromEnv() (Config, error) { PublicHTTP: DefaultPublicHTTPConfig(), AuthService: DefaultAuthServiceConfig(), UserService: DefaultUserServiceConfig(), + LobbyService: DefaultLobbyServiceConfig(), AdminHTTP: DefaultAdminHTTPConfig(), AuthenticatedGRPC: DefaultAuthenticatedGRPCConfig(), Redis: redisconn.DefaultConfig(), @@ -860,6 +886,11 @@ func LoadFromEnv() (Config, error) { cfg.UserService.BaseURL = rawUserServiceBaseURL } + rawLobbyServiceBaseURL, ok := os.LookupEnv(lobbyServiceBaseURLEnvVar) + if ok { + cfg.LobbyService.BaseURL = rawLobbyServiceBaseURL + } + rawAdminHTTPAddr, ok := os.LookupEnv(adminHTTPAddrEnvVar) if ok { cfg.AdminHTTP.Addr = rawAdminHTTPAddr diff --git a/gateway/internal/downstream/lobbyservice/client.go b/gateway/internal/downstream/lobbyservice/client.go new file mode 100644 index 0000000..fe2bfd9 --- /dev/null +++ b/gateway/internal/downstream/lobbyservice/client.go @@ -0,0 +1,329 @@ +// Package lobbyservice implements the authenticated Gateway -> Game Lobby +// downstream adapter. It forwards verified authenticated commands as +// trusted-internal HTTP requests against Game Lobby's public REST surface, +// transporting the calling user identity through the `X-User-Id` header. 
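+//
+// For illustration (hypothetical user ID), a verified `lobby.my.games.list`
+// command for user `u-123` is forwarded as
+//
+//	GET {baseURL}/api/v1/lobby/my/games
+//	X-User-Id: u-123
+//
+// and the Lobby JSON body is re-encoded into the FlatBuffers payload
+// returned to the authenticated gateway caller.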
+package lobbyservice + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "fmt" + "io" + "net/http" + "net/url" + "strings" + + "galaxy/gateway/internal/downstream" + lobbymodel "galaxy/model/lobby" + "galaxy/transcoder" +) + +const ( + myGamesListPath = "/api/v1/lobby/my/games" + openEnrollmentPathFormat = "/api/v1/lobby/games/%s/open-enrollment" + + resultCodeOK = "ok" + defaultErrorCodeBadRequest = "invalid_request" + defaultErrorCodeNotFound = "subject_not_found" + defaultErrorCodeForbidden = "forbidden" + defaultErrorCodeConflict = "conflict" + defaultErrorCodeInternalError = "internal_error" + + headerCallingUserID = "X-User-Id" +) + +var stableErrorMessages = map[string]string{ + defaultErrorCodeBadRequest: "request is invalid", + defaultErrorCodeNotFound: "subject not found", + defaultErrorCodeForbidden: "operation is forbidden for the calling user", + defaultErrorCodeConflict: "request conflicts with current state", + defaultErrorCodeInternalError: "internal server error", +} + +// HTTPClient implements downstream.Client against the trusted Game Lobby +// public REST API while preserving FlatBuffers at the external authenticated +// gateway boundary. +type HTTPClient struct { + baseURL string + httpClient *http.Client +} + +// NewHTTPClient constructs one Game Lobby downstream client backed by the +// public REST API at baseURL. +func NewHTTPClient(baseURL string) (*HTTPClient, error) { + transport, ok := http.DefaultTransport.(*http.Transport) + if !ok { + return nil, errors.New("new lobby service HTTP client: default transport is not *http.Transport") + } + + return newHTTPClient(baseURL, &http.Client{ + Transport: transport.Clone(), + }) +} + +func newHTTPClient(baseURL string, httpClient *http.Client) (*HTTPClient, error) { + if httpClient == nil { + return nil, errors.New("new lobby service HTTP client: http client must not be nil") + } + + trimmedBaseURL := strings.TrimSpace(baseURL) + if trimmedBaseURL == "" { + return nil, errors.New("new lobby service HTTP client: base URL must not be empty") + } + + parsedBaseURL, err := url.Parse(strings.TrimRight(trimmedBaseURL, "/")) + if err != nil { + return nil, fmt.Errorf("new lobby service HTTP client: parse base URL: %w", err) + } + if parsedBaseURL.Scheme == "" || parsedBaseURL.Host == "" { + return nil, errors.New("new lobby service HTTP client: base URL must be absolute") + } + + return &HTTPClient{ + baseURL: parsedBaseURL.String(), + httpClient: httpClient, + }, nil +} + +// Close releases idle HTTP connections owned by the client transport. +func (c *HTTPClient) Close() error { + if c == nil || c.httpClient == nil { + return nil + } + + type idleCloser interface { + CloseIdleConnections() + } + + if transport, ok := c.httpClient.Transport.(idleCloser); ok { + transport.CloseIdleConnections() + } + + return nil +} + +// ExecuteCommand routes one authenticated gateway command to the matching +// trusted Game Lobby public REST route. 
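+//
+// The supported routes, per the switch below:
+//
+//	lobbymodel.MessageTypeMyGamesList    -> GET  /api/v1/lobby/my/games
+//	lobbymodel.MessageTypeOpenEnrollment -> POST /api/v1/lobby/games/{game_id}/open-enrollment
+//
+// Any other message type is rejected with an "unsupported message type"
+// error before any HTTP request is made.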
+func (c *HTTPClient) ExecuteCommand(ctx context.Context, command downstream.AuthenticatedCommand) (downstream.UnaryResult, error) { + if c == nil || c.httpClient == nil { + return downstream.UnaryResult{}, errors.New("execute lobby service command: nil client") + } + if ctx == nil { + return downstream.UnaryResult{}, errors.New("execute lobby service command: nil context") + } + if err := ctx.Err(); err != nil { + return downstream.UnaryResult{}, err + } + if strings.TrimSpace(command.UserID) == "" { + return downstream.UnaryResult{}, errors.New("execute lobby service command: user_id must not be empty") + } + + switch command.MessageType { + case lobbymodel.MessageTypeMyGamesList: + if _, err := transcoder.PayloadToMyGamesListRequest(command.PayloadBytes); err != nil { + return downstream.UnaryResult{}, fmt.Errorf("execute lobby service command %q: %w", command.MessageType, err) + } + return c.executeMyGamesList(ctx, command.UserID) + case lobbymodel.MessageTypeOpenEnrollment: + request, err := transcoder.PayloadToOpenEnrollmentRequest(command.PayloadBytes) + if err != nil { + return downstream.UnaryResult{}, fmt.Errorf("execute lobby service command %q: %w", command.MessageType, err) + } + return c.executeOpenEnrollment(ctx, command.UserID, request) + default: + return downstream.UnaryResult{}, fmt.Errorf("execute lobby service command: unsupported message type %q", command.MessageType) + } +} + +func (c *HTTPClient) executeMyGamesList(ctx context.Context, userID string) (downstream.UnaryResult, error) { + payload, statusCode, err := c.doRequest(ctx, http.MethodGet, c.baseURL+myGamesListPath, userID, nil) + if err != nil { + return downstream.UnaryResult{}, fmt.Errorf("execute my games list: %w", err) + } + + if statusCode == http.StatusOK { + var response lobbymodel.MyGamesListResponse + if err := decodeStrictJSONPayload(payload, &response); err != nil { + return downstream.UnaryResult{}, fmt.Errorf("decode success response: %w", err) + } + payloadBytes, err := transcoder.MyGamesListResponseToPayload(&response) + if err != nil { + return downstream.UnaryResult{}, fmt.Errorf("encode success response payload: %w", err) + } + return downstream.UnaryResult{ + ResultCode: resultCodeOK, + PayloadBytes: payloadBytes, + }, nil + } + + return projectErrorResponse(statusCode, payload) +} + +func (c *HTTPClient) executeOpenEnrollment(ctx context.Context, userID string, request *lobbymodel.OpenEnrollmentRequest) (downstream.UnaryResult, error) { + if request == nil || strings.TrimSpace(request.GameID) == "" { + return downstream.UnaryResult{}, errors.New("execute open enrollment: game_id must not be empty") + } + + target := c.baseURL + fmt.Sprintf(openEnrollmentPathFormat, url.PathEscape(request.GameID)) + payload, statusCode, err := c.doRequest(ctx, http.MethodPost, target, userID, struct{}{}) + if err != nil { + return downstream.UnaryResult{}, fmt.Errorf("execute open enrollment: %w", err) + } + + if statusCode == http.StatusOK { + // Lobby's open-enrollment endpoint returns the full game record; + // the gateway boundary projects the minimal status pair. 
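+		// For example (illustrative values), a Lobby body such as
+		//   {"game_id":"game-77","status":"enrollment_open", ...}
+		// is reduced to OpenEnrollmentResponse{GameID: "game-77", Status: "enrollment_open"}
+		// before FlatBuffers encoding.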
+ var fullRecord struct { + GameID string `json:"game_id"` + Status string `json:"status"` + } + if err := json.Unmarshal(payload, &fullRecord); err != nil { + return downstream.UnaryResult{}, fmt.Errorf("decode success response: %w", err) + } + payloadBytes, err := transcoder.OpenEnrollmentResponseToPayload(&lobbymodel.OpenEnrollmentResponse{ + GameID: fullRecord.GameID, + Status: fullRecord.Status, + }) + if err != nil { + return downstream.UnaryResult{}, fmt.Errorf("encode success response payload: %w", err) + } + return downstream.UnaryResult{ + ResultCode: resultCodeOK, + PayloadBytes: payloadBytes, + }, nil + } + + return projectErrorResponse(statusCode, payload) +} + +func (c *HTTPClient) doRequest(ctx context.Context, method, targetURL, userID string, requestBody any) ([]byte, int, error) { + if c == nil || c.httpClient == nil { + return nil, 0, errors.New("nil client") + } + + var bodyReader io.Reader + if requestBody != nil { + body, err := json.Marshal(requestBody) + if err != nil { + return nil, 0, fmt.Errorf("marshal request body: %w", err) + } + bodyReader = bytes.NewReader(body) + } + + request, err := http.NewRequestWithContext(ctx, method, targetURL, bodyReader) + if err != nil { + return nil, 0, fmt.Errorf("build request: %w", err) + } + if requestBody != nil { + request.Header.Set("Content-Type", "application/json") + } + request.Header.Set(headerCallingUserID, userID) + + response, err := c.httpClient.Do(request) + if err != nil { + return nil, 0, err + } + defer response.Body.Close() + + payload, err := io.ReadAll(response.Body) + if err != nil { + return nil, 0, fmt.Errorf("read response body: %w", err) + } + + return payload, response.StatusCode, nil +} + +func projectErrorResponse(statusCode int, payload []byte) (downstream.UnaryResult, error) { + switch { + case statusCode == http.StatusServiceUnavailable: + return downstream.UnaryResult{}, downstream.ErrDownstreamUnavailable + case statusCode >= 400 && statusCode <= 599: + errorResponse, err := decodeLobbyError(statusCode, payload) + if err != nil { + return downstream.UnaryResult{}, fmt.Errorf("decode error response: %w", err) + } + payloadBytes, err := transcoder.LobbyErrorResponseToPayload(errorResponse) + if err != nil { + return downstream.UnaryResult{}, fmt.Errorf("encode error response payload: %w", err) + } + return downstream.UnaryResult{ + ResultCode: errorResponse.Error.Code, + PayloadBytes: payloadBytes, + }, nil + default: + return downstream.UnaryResult{}, fmt.Errorf("unexpected HTTP status %d", statusCode) + } +} + +func decodeLobbyError(statusCode int, payload []byte) (*lobbymodel.ErrorResponse, error) { + var response lobbymodel.ErrorResponse + if err := decodeStrictJSONPayload(payload, &response); err != nil { + return nil, err + } + + response.Error.Code = normalizeErrorCode(statusCode, response.Error.Code) + response.Error.Message = normalizeErrorMessage(response.Error.Code, response.Error.Message) + + if strings.TrimSpace(response.Error.Code) == "" { + return nil, errors.New("missing error code") + } + if strings.TrimSpace(response.Error.Message) == "" { + return nil, errors.New("missing error message") + } + + return &response, nil +} + +func normalizeErrorCode(statusCode int, code string) string { + trimmed := strings.TrimSpace(code) + if trimmed != "" { + return trimmed + } + + switch statusCode { + case http.StatusBadRequest: + return defaultErrorCodeBadRequest + case http.StatusForbidden: + return defaultErrorCodeForbidden + case http.StatusNotFound: + return defaultErrorCodeNotFound + 
case http.StatusConflict: + return defaultErrorCodeConflict + default: + return defaultErrorCodeInternalError + } +} + +func normalizeErrorMessage(code, message string) string { + trimmed := strings.TrimSpace(message) + if trimmed != "" { + return trimmed + } + + if stable, ok := stableErrorMessages[code]; ok { + return stable + } + + return stableErrorMessages[defaultErrorCodeInternalError] +} + +func decodeStrictJSONPayload(payload []byte, target any) error { + decoder := json.NewDecoder(bytes.NewReader(payload)) + decoder.DisallowUnknownFields() + + if err := decoder.Decode(target); err != nil { + return err + } + if err := decoder.Decode(&struct{}{}); err != io.EOF { + if err == nil { + return errors.New("unexpected trailing JSON input") + } + return err + } + + return nil +} + +var _ downstream.Client = (*HTTPClient)(nil) diff --git a/gateway/internal/downstream/lobbyservice/client_test.go b/gateway/internal/downstream/lobbyservice/client_test.go new file mode 100644 index 0000000..8cb2be4 --- /dev/null +++ b/gateway/internal/downstream/lobbyservice/client_test.go @@ -0,0 +1,212 @@ +package lobbyservice_test + +import ( + "context" + "encoding/json" + "errors" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + "galaxy/gateway/internal/downstream" + "galaxy/gateway/internal/downstream/lobbyservice" + lobbymodel "galaxy/model/lobby" + "galaxy/transcoder" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestExecuteMyGamesListSuccess(t *testing.T) { + t.Parallel() + + expectedResponse := lobbymodel.MyGamesListResponse{ + Items: []lobbymodel.GameSummary{ + { + GameID: "game-1", + GameName: "Nebula Clash", + GameType: "private", + Status: "draft", + OwnerUserID: "user-1", + MinPlayers: 2, + MaxPlayers: 8, + EnrollmentEndsAt: time.Date(2026, 5, 1, 12, 0, 0, 0, time.UTC), + CreatedAt: time.Date(2026, 4, 28, 9, 0, 0, 0, time.UTC), + UpdatedAt: time.Date(2026, 4, 28, 9, 5, 0, 0, time.UTC), + }, + }, + } + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, http.MethodGet, r.Method) + assert.Equal(t, "/api/v1/lobby/my/games", r.URL.Path) + assert.Equal(t, "user-1", r.Header.Get("X-User-Id")) + w.Header().Set("Content-Type", "application/json") + require.NoError(t, json.NewEncoder(w).Encode(expectedResponse)) + })) + t.Cleanup(server.Close) + + client, err := lobbyservice.NewHTTPClient(server.URL) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, client.Close()) }) + + requestBytes, err := transcoder.MyGamesListRequestToPayload(&lobbymodel.MyGamesListRequest{}) + require.NoError(t, err) + + result, err := client.ExecuteCommand(context.Background(), downstream.AuthenticatedCommand{ + MessageType: lobbymodel.MessageTypeMyGamesList, + UserID: "user-1", + PayloadBytes: requestBytes, + }) + require.NoError(t, err) + assert.Equal(t, "ok", result.ResultCode) + + decoded, err := transcoder.PayloadToMyGamesListResponse(result.PayloadBytes) + require.NoError(t, err) + require.Len(t, decoded.Items, 1) + assert.Equal(t, expectedResponse.Items[0].GameID, decoded.Items[0].GameID) + assert.Equal(t, expectedResponse.Items[0].OwnerUserID, decoded.Items[0].OwnerUserID) + assert.Equal(t, expectedResponse.Items[0].MinPlayers, decoded.Items[0].MinPlayers) +} + +func TestExecuteOpenEnrollmentSuccess(t *testing.T) { + t.Parallel() + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, http.MethodPost, r.Method) + 
assert.Equal(t, "/api/v1/lobby/games/game-77/open-enrollment", r.URL.Path) + assert.Equal(t, "owner-1", r.Header.Get("X-User-Id")) + w.Header().Set("Content-Type", "application/json") + require.NoError(t, json.NewEncoder(w).Encode(map[string]any{ + "game_id": "game-77", + "status": "enrollment_open", + })) + })) + t.Cleanup(server.Close) + + client, err := lobbyservice.NewHTTPClient(server.URL) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, client.Close()) }) + + requestBytes, err := transcoder.OpenEnrollmentRequestToPayload(&lobbymodel.OpenEnrollmentRequest{GameID: "game-77"}) + require.NoError(t, err) + + result, err := client.ExecuteCommand(context.Background(), downstream.AuthenticatedCommand{ + MessageType: lobbymodel.MessageTypeOpenEnrollment, + UserID: "owner-1", + PayloadBytes: requestBytes, + }) + require.NoError(t, err) + assert.Equal(t, "ok", result.ResultCode) + + decoded, err := transcoder.PayloadToOpenEnrollmentResponse(result.PayloadBytes) + require.NoError(t, err) + assert.Equal(t, "game-77", decoded.GameID) + assert.Equal(t, "enrollment_open", decoded.Status) +} + +func TestExecuteOpenEnrollmentForbiddenProjectsErrorEnvelope(t *testing.T) { + t.Parallel() + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusForbidden) + require.NoError(t, json.NewEncoder(w).Encode(map[string]any{ + "error": map[string]string{ + "code": "forbidden", + "message": "only the game owner may open enrollment", + }, + })) + })) + t.Cleanup(server.Close) + + client, err := lobbyservice.NewHTTPClient(server.URL) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, client.Close()) }) + + requestBytes, err := transcoder.OpenEnrollmentRequestToPayload(&lobbymodel.OpenEnrollmentRequest{GameID: "game-77"}) + require.NoError(t, err) + + result, err := client.ExecuteCommand(context.Background(), downstream.AuthenticatedCommand{ + MessageType: lobbymodel.MessageTypeOpenEnrollment, + UserID: "non-owner", + PayloadBytes: requestBytes, + }) + require.NoError(t, err) + assert.Equal(t, "forbidden", result.ResultCode) + + decoded, err := transcoder.PayloadToLobbyErrorResponse(result.PayloadBytes) + require.NoError(t, err) + assert.Equal(t, "forbidden", decoded.Error.Code) + assert.NotEmpty(t, decoded.Error.Message) +} + +func TestExecuteCommandUnavailableProjectsErrUnavailable(t *testing.T) { + t.Parallel() + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusServiceUnavailable) + })) + t.Cleanup(server.Close) + + client, err := lobbyservice.NewHTTPClient(server.URL) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, client.Close()) }) + + requestBytes, err := transcoder.MyGamesListRequestToPayload(&lobbymodel.MyGamesListRequest{}) + require.NoError(t, err) + + _, err = client.ExecuteCommand(context.Background(), downstream.AuthenticatedCommand{ + MessageType: lobbymodel.MessageTypeMyGamesList, + UserID: "user-1", + PayloadBytes: requestBytes, + }) + require.Error(t, err) + assert.True(t, errors.Is(err, downstream.ErrDownstreamUnavailable)) +} + +func TestExecuteCommandRejectsEmptyUserID(t *testing.T) { + t.Parallel() + + client, err := lobbyservice.NewHTTPClient("http://127.0.0.1:1") + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, client.Close()) }) + + requestBytes, err := transcoder.MyGamesListRequestToPayload(&lobbymodel.MyGamesListRequest{}) + 
require.NoError(t, err) + + _, err = client.ExecuteCommand(context.Background(), downstream.AuthenticatedCommand{ + MessageType: lobbymodel.MessageTypeMyGamesList, + UserID: "", + PayloadBytes: requestBytes, + }) + require.Error(t, err) + assert.True(t, strings.Contains(err.Error(), "user_id"), "error must mention user_id; got %q", err.Error()) +} + +func TestNewRoutesReservesUnavailableClientWhenBaseURLEmpty(t *testing.T) { + t.Parallel() + + routes, closeFn, err := lobbyservice.NewRoutes("") + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, closeFn()) }) + + require.Contains(t, routes, lobbymodel.MessageTypeMyGamesList) + require.Contains(t, routes, lobbymodel.MessageTypeOpenEnrollment) + + requestBytes, err := transcoder.MyGamesListRequestToPayload(&lobbymodel.MyGamesListRequest{}) + require.NoError(t, err) + + _, err = routes[lobbymodel.MessageTypeMyGamesList].ExecuteCommand( + context.Background(), + downstream.AuthenticatedCommand{ + MessageType: lobbymodel.MessageTypeMyGamesList, + UserID: "user-1", + PayloadBytes: requestBytes, + }, + ) + require.Error(t, err) + assert.True(t, errors.Is(err, downstream.ErrDownstreamUnavailable)) +} diff --git a/gateway/internal/downstream/lobbyservice/routes.go b/gateway/internal/downstream/lobbyservice/routes.go new file mode 100644 index 0000000..0d870bc --- /dev/null +++ b/gateway/internal/downstream/lobbyservice/routes.go @@ -0,0 +1,45 @@ +package lobbyservice + +import ( + "context" + + "galaxy/gateway/internal/downstream" + lobbymodel "galaxy/model/lobby" +) + +var noOpClose = func() error { return nil } + +// NewRoutes returns the reserved authenticated gateway routes owned by +// the Gateway -> Game Lobby boundary. +// +// When baseURL is empty, the returned routes still reserve the stable +// `lobby.*` message types but resolve them to a dependency-unavailable +// client so callers receive the transport-level unavailable outcome +// instead of a route-miss error. 
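+//
+// A minimal wiring sketch, mirroring how gateway/cmd/gateway/main.go consumes
+// these routes (error handling and cleanup joining elided):
+//
+//	lobbyRoutes, closeLobbyRoutes, _ := lobbyservice.NewRoutes(cfg.LobbyService.BaseURL)
+//	maps.Copy(allRoutes, lobbyRoutes)
+//	router := downstream.NewStaticRouter(allRoutes)
+//	// closeLobbyRoutes is joined into the shared cleanup function.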
+func NewRoutes(baseURL string) (map[string]downstream.Client, func() error, error) { + client := downstream.Client(unavailableClient{}) + closeFn := noOpClose + + if baseURL != "" { + httpClient, err := NewHTTPClient(baseURL) + if err != nil { + return nil, nil, err + } + + client = httpClient + closeFn = httpClient.Close + } + + return map[string]downstream.Client{ + lobbymodel.MessageTypeMyGamesList: client, + lobbymodel.MessageTypeOpenEnrollment: client, + }, closeFn, nil +} + +type unavailableClient struct{} + +func (unavailableClient) ExecuteCommand(context.Context, downstream.AuthenticatedCommand) (downstream.UnaryResult, error) { + return downstream.UnaryResult{}, downstream.ErrDownstreamUnavailable +} + +var _ downstream.Client = unavailableClient{} diff --git a/go.work.sum b/go.work.sum index f6da553..5d755b7 100644 --- a/go.work.sum +++ b/go.work.sum @@ -18,6 +18,7 @@ github.com/cncf/xds/go v0.0.0-20251210132809-ee656c7534f5/go.mod h1:KdCmV+x/Buvy github.com/coder/websocket v1.8.14/go.mod h1:NX3SzP+inril6yawo5CQXx8+fk145lPDC6pumgx0mVg= github.com/containerd/typeurl/v2 v2.2.0/go.mod h1:8XOOxnyatxSWuG8OfsZXVnAF4iZfedjS/8UHSPJnX4g= github.com/cpuguy83/go-md2man/v2 v2.0.1/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/docker/docker v28.5.1+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= github.com/elastic/go-sysinfo v1.15.4/go.mod h1:ZBVXmqS368dOn/jvijV/zHLfakWTYHBZPk3G244lHrU= github.com/elastic/go-windows v1.0.2/go.mod h1:bGcDpBzXgYSqM0Gx3DM4+UxFj300SZLixie9u9ixLM8= github.com/envoyproxy/go-control-plane v0.14.0/go.mod h1:NcS5X47pLl/hfqxU70yPwL9ZMkUlwlKxtAohpi2wBEU= @@ -42,13 +43,7 @@ github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So= github.com/jackc/chunkreader v1.0.0 h1:4s39bBR8ByfqH+DKm8rQA3E1LHZWB9XWcrz8fqaZbe0= -github.com/jackc/chunkreader/v2 v2.0.1/go.mod h1:odVSm741yZoC3dpHEUXIqA9tQRhFrgOHwnPIn9lDKlk= -github.com/jackc/pgconn v1.14.3/go.mod h1:RZbme4uasqzybK2RK5c65VsHxoyaml09lx3tXOcO/VM= -github.com/jackc/pgio v1.0.0/go.mod h1:oP+2QK2wFfUWgr+gxjoBH9KGBb31Eio69xUb0w5bYf8= github.com/jackc/pgproto3 v1.1.0 h1:FYYE4yRw+AgI8wXIinMlNjBbp/UitDJwfj5LqqewP1A= -github.com/jackc/pgproto3/v2 v2.3.3/go.mod h1:WfJCnwN3HIg9Ish/j3sgWXnAfK8A9Y0bwXYU5xKaEdA= -github.com/jackc/pgtype v1.14.4/go.mod h1:aKeozOde08iifGosdJpz9MBZonJOUJxqNpPBcMJTlVA= -github.com/jackc/pgx/v4 v4.18.3/go.mod h1:Ey4Oru5tH5sB6tV7hDmfWFahwF15Eb7DNXlRKx2CkVw= github.com/jackc/puddle v1.3.0 h1:eHK/5clGOatcjX3oWGBO/MpxpbHzSwud5EWTSCI+MX0= github.com/jackmordaunt/icns/v2 v2.2.6/go.mod h1:DqlVnR5iafSphrId7aSD06r3jg0KRC9V6lEBBp504ZQ= github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4= @@ -59,8 +54,6 @@ github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX github.com/julienschmidt/httprouter v1.3.0/go.mod h1:JR6WtHb+2LUe8TCKY3cZOxFyyO8IZAc4RVcycCCAKdM= github.com/klauspost/cpuid/v2 v2.2.9/go.mod h1:rqkxqrZ1EhYM9G+hXH7YdowN5R5RGN6NK4QwQ3WMXF8= github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= -github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= -github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/lucor/goinfo v0.9.0/go.mod h1:L6m6tN5Rlova5Z83h1ZaKsMP1iiaoZ9vGTNzu5QKOD4= github.com/mattn/go-sqlite3 v1.14.28/go.mod 
h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= github.com/mcuadros/go-version v0.0.0-20190830083331-035f6764e8d2/go.mod h1:76rfSfYPWj01Z85hUf/ituArm797mNKcvINh1OlsZKo= @@ -75,6 +68,7 @@ github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLA github.com/paulmach/orb v0.13.0/go.mod h1:6scRWINywA2Jf05dcjOfLfxrUIMECvTSG2MVbRLxu/k= github.com/pierrec/lz4/v4 v4.1.26/go.mod h1:EoQMVJgeeEOMsCqCzqFm2O0cJvljX2nGZjcRIPL34O4= github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= +github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I= github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10/go.mod h1:t/avpk3KcrXxUnYOhZhMXJlSEyie6gQbtLq5NM3loB8= github.com/prometheus/client_golang v1.19.1/go.mod h1:mP78NwGzrVks5S2H6ab8+ZZGJLZUq1hoULYBAYBw1Ho= github.com/prometheus/client_model v0.5.0/go.mod h1:dTiFglRmd66nLR9Pv9f0mZi7B7fk5Pm3gvsjB5tr+kI= @@ -83,7 +77,6 @@ github.com/prometheus/procfs v0.12.0/go.mod h1:pcuDEFsWDnvcgNzo4EEweacyhjeA9Zk3c github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= -github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= github.com/russross/blackfriday v1.6.0/go.mod h1:ti0ldHuxg49ri4ksnFxlkCfN+hvslNlmVHqNRXXJNAY= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/santhosh-tekuri/jsonschema/v5 v5.3.1/go.mod h1:uToXkOrWAZ6/Oc07xWQrPOhJotwFIyu2bBVN41fcDUY= @@ -108,7 +101,6 @@ github.com/xhit/go-str2duration/v2 v2.1.0/go.mod h1:ohY8p+0f07DiV6Em5LKB0s2YpLtX github.com/ydb-platform/ydb-go-genproto v0.0.0-20260311095541-ebbf792c1180/go.mod h1:Er+FePu1dNUieD+XTMDduGpQuCPssK5Q4BjF+IIXJ3I= github.com/ydb-platform/ydb-go-sdk/v3 v3.135.0/go.mod h1:VYUUkRJkKuQPkIpgtZJj6+58Fa2g8ccAqdmaaK6HP5k= github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78/go.mod h1:aL8wCCfTfSfmXjznFBSZNN13rSJjlIOI1fUNAtF7rmI= -github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= github.com/ziutek/mymysql v1.5.4/go.mod h1:LMSpPZ6DbqWFxNCHW77HeMg9I646SAhApZ/wKdgO/C0= go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= go.opentelemetry.io/contrib/detectors/gcp v1.39.0/go.mod h1:t/OGqzHBa5v6RHZwrDBJ2OirWc+4q/w2fTbLZwAKjTk= @@ -125,17 +117,11 @@ go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42s go.opentelemetry.io/otel/trace v1.39.0/go.mod h1:88w4/PnZSazkGzz/w84VHpQafiU4EtqqlVdxWy+rNOA= go.uber.org/mock v0.5.2/go.mod h1:wLlUxC2vVTPTaE3UD51E0BGOAElKrILxhVSDYQLld5o= golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8= -golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= golang.org/x/crypto v0.18.0/go.mod h1:R0j02AL6hcrfOiy9T4ZYp/rcWeMxM3L6QYxlOuEG1mg= -golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= golang.org/x/crypto v0.41.0/go.mod h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sUc= golang.org/x/crypto v0.46.0/go.mod 
h1:Evb/oLKmMraqjZ2iQTwDwvCtJkczlDuTmdJXoZVzqU0= -golang.org/x/exp v0.0.0-20260410095643-746e56fc9e2f/go.mod h1:J1xhfL/vlindoeF/aINzNzt2Bket5bjo9sdOYzOsU80= golang.org/x/mobile v0.0.0-20231127183840-76ac6878050a/go.mod h1:Ede7gF0KGoHlj822RtphAHK1jLdrcuRBZg0sF1Q+SPc= -golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= -golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.14.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/mod v0.21.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= @@ -143,14 +129,8 @@ golang.org/x/mod v0.27.0/go.mod h1:rWI627Fq0DEoudcK+MBkNkCe0EetEaDSwJJkCcjpazc= golang.org/x/mod v0.32.0/go.mod h1:SgipZ/3h2Ci89DlEtEXWUk/HteuRin+HHhN+WbNhguU= golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w= golang.org/x/mod v0.34.0/go.mod h1:ykgH52iCZe79kzLLMhyCUzhMci+nQj+0XkbXpNYtVjY= -golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= -golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= golang.org/x/net v0.20.0/go.mod h1:z8BVo6PvndSri0LbOE3hAn0apkU+1YvI6E70E9jsnvY= -golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg= golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= @@ -159,24 +139,13 @@ golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8= golang.org/x/net v0.50.0/go.mod h1:UgoSli3F/pBgdJBHCTc+tp3gmrU4XswgGRgtnwWTfyM= golang.org/x/oauth2 v0.34.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= golang.org/x/oauth2 v0.35.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= -golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= -golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.5.0/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= @@ -192,31 +161,16 @@ golang.org/x/telemetry v0.0.0-20240521205824-bda55230c457/go.mod h1:pRgIJT+bRLFK golang.org/x/telemetry v0.0.0-20260109210033-bd525da824e2/go.mod h1:b7fPSJ0pKZ3ccUh8gnTONJxhn3c/PS6tyzQvyqw4iA8= golang.org/x/telemetry v0.0.0-20260209163413-e7419c687ee4/go.mod h1:g5NllXBEermZrmR51cJDQxmJUHUOfRAaNyWBM+R+548= golang.org/x/telemetry v0.0.0-20260409153401-be6f6cb8b1fa/go.mod h1:kHjTxDEnAu6/Nl9lDkzjWpR+bmKfxeiRuSDlsMb70gE= -golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= -golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= golang.org/x/term v0.16.0/go.mod h1:yn7UURbUtPyrVJPGPq404EukNFxcm/foM+bV/bfcDsY= -golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= golang.org/x/term v0.33.0/go.mod h1:s18+ql9tYWp1IfpV9DmCtQDDSRBUjKaw9M1eAv5UeF0= golang.org/x/term v0.40.0/go.mod h1:w2P8uVp06p2iyKKuvXIm7N/y0UCRt3UfJTfZ7oOpglM= -golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= -golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4= golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= golang.org/x/text v0.32.0/go.mod h1:o/rUWzghvpD5TXrTIBuJU77MTaN0ljMWE47kxGJQ7jY= golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8= golang.org/x/time v0.11.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= -golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= golang.org/x/tools v0.17.0/go.mod h1:xsh6VxdV005rRVaS6SSAf9oiAqljS7UZUacMZ8Bnsps= golang.org/x/tools v0.26.0/go.mod h1:TPVVj70c7JJ3WCazhD8OdXcZg/og+b9+tH/KxylGwH0= @@ -225,11 +179,11 @@ 
golang.org/x/tools v0.40.0/go.mod h1:Ik/tzLRlbscWpqqMRjyWYDisX8bG13FrdXp3o4Sr9lc golang.org/x/tools v0.41.0/go.mod h1:XSY6eDqxVNiYgezAVqqCeihT4j1U2CCsqvH3WhQpnlg= golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0= golang.org/x/tools v0.43.0/go.mod h1:uHkMso649BX2cZK6+RpuIPXS3ho2hZo4FVwfoy1vIk0= +golang.org/x/tools v0.44.0 h1:UP4ajHPIcuMjT1GqzDWRlalUEoY+uzoZKnhOjbIPD2c= golang.org/x/tools v0.44.0/go.mod h1:KA0AfVErSdxRZIsOVipbv3rQhVXTnlU6UhKxHd1seDI= golang.org/x/tools/go/expect v0.1.1-deprecated/go.mod h1:eihoPOH+FgIqa3FpoTwguz/bVUSGBlGQU67vpBeOrBY= golang.org/x/tools/go/packages/packagestest v0.1.1-deprecated/go.mod h1:RVAQXBGNv1ib0J382/DPCRS/BPnsGebyM1Gj5VSDpG8= golang.org/x/tools/go/vcs v0.1.0-deprecated/go.mod h1:zUrvATBAvEI9535oC0yWYsLsHIV4Z7g63sNPVMtuBy8= -golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/genproto/googleapis/api v0.0.0-20260120221211-b8f7ae30c516/go.mod h1:p3MLuOwURrGBRoEyFHBT3GjUwaCQVKeNqqWxlcISGdw= google.golang.org/genproto/googleapis/rpc v0.0.0-20260120221211-b8f7ae30c516/go.mod h1:j9x/tPzZkyxcgEFkiKEEGxfvyumM01BEtsW8xzOahRQ= google.golang.org/genproto/googleapis/rpc v0.0.0-20260203192932-546029d2fa20/go.mod h1:j9x/tPzZkyxcgEFkiKEEGxfvyumM01BEtsW8xzOahRQ= diff --git a/integration/README.md b/integration/README.md index 0cc150c..f1e37d4 100644 --- a/integration/README.md +++ b/integration/README.md @@ -39,6 +39,9 @@ integration/ ├── lobbynotification/ │ ├── lobby_notification_test.go │ └── race_name_intents_test.go +├── lobbyrtm/ +│ ├── harness_test.go +│ └── lobby_rtm_test.go ├── go.mod ├── go.sum └── internal/ @@ -49,10 +52,13 @@ integration/ │ └── contract.go └── harness/ ├── binary.go + ├── dockernetwork.go + ├── engineimage.go ├── keys.go ├── mail_stub.go ├── process.go ├── redis_container.go + ├── rtmanagerservice.go ├── smtp_capture.go └── user_stub.go ``` @@ -95,15 +101,23 @@ integration/ applications, invites, member operations, runtime pause, cascade membership block, and the three race-name intents emitted by capability evaluation at game finish and by self-service registration. +- `lobbyrtm` verifies the asynchronous boundary between real + `Game Lobby` and real `Runtime Manager` end-to-end against a real + Docker daemon: start_job → engine container → success job_result → + game `running`; cascade-blocked owner → stop_job(cancelled) → engine + stopped; missing image → failure job_result + admin notification + intent → game `start_failed`. Skips automatically on hosts without + Docker. The current fast suites still use one isolated `miniredis` instance plus either real downstream processes or external stateful HTTP stubs where appropriate. `authsessionmail`, `gatewayauthsessionmail`, `notificationgateway`, `notificationmail`, `notificationuser`, `gatewayauthsessionusermail`, -`lobbyuser`, and `lobbynotification` are the deliberate exceptions: they use -one real Redis container through `testcontainers-go`, because those -boundaries must exercise real Redis stream, persistence, or scheduling -behavior. +`lobbyuser`, `lobbynotification`, and `lobbyrtm` are the deliberate +exceptions: they use one real Redis container through +`testcontainers-go`, because those boundaries must exercise real Redis +stream, persistence, or scheduling behavior. `lobbyrtm` additionally +needs a real Docker daemon and the `galaxy/game` engine image. 
`authsessionmail` additionally contains one targeted SMTP-capture scenario for the real `smtp` provider path, while `gatewayauthsessionmail` keeps `Mail Service` in `stub` mode and extracts the confirmation code through the trusted @@ -127,6 +141,7 @@ go test ./notificationuser/... go test ./gatewayauthsessionusermail/... go test ./lobbyuser/... go test ./lobbynotification/... +go test ./lobbyrtm/... ``` Useful regression commands after boundary changes: @@ -144,6 +159,7 @@ go test ./notificationuser/... go test ./gatewayauthsessionusermail/... go test ./lobbyuser/... go test ./lobbynotification/... +go test ./lobbyrtm/... cd ../gateway && go test ./... cd ../authsession && go test ./... -run GatewayCompatibility cd ../user && go test ./... diff --git a/integration/gatewaylobby/gateway_lobby_test.go b/integration/gatewaylobby/gateway_lobby_test.go new file mode 100644 index 0000000..c339d1f --- /dev/null +++ b/integration/gatewaylobby/gateway_lobby_test.go @@ -0,0 +1,631 @@ +// Package gatewaylobby_test exercises the authenticated Gateway -> Game +// Lobby boundary against real Gateway + real Auth/Session Service + real +// User Service + real Game Lobby running on testcontainers PostgreSQL +// and Redis. +// +// The boundary contract under test is: a client signs a FlatBuffers +// `ExecuteCommandRequest` for one of the reserved `lobby.*` message +// types; Gateway verifies the signature, looks up the device session, +// resolves the calling `user_id`, routes the command to the Lobby +// downstream client, and signs the FlatBuffers response. The suite +// asserts on the gRPC response shape, the signed result envelope, and +// the decoded FlatBuffers payload. +// +// Coverage maps onto `TESTING.md §6` `Gateway <-> Game Lobby`: +// authenticated platform-level command routing. +package gatewaylobby_test + +import ( + "bytes" + "context" + "crypto/ed25519" + "crypto/sha256" + "encoding/base64" + "encoding/json" + "errors" + "io" + "net/http" + "path/filepath" + "testing" + "time" + + gatewayv1 "galaxy/gateway/proto/galaxy/gateway/v1" + contractsgatewayv1 "galaxy/integration/internal/contracts/gatewayv1" + "galaxy/integration/internal/harness" + lobbymodel "galaxy/model/lobby" + "galaxy/transcoder" + + "github.com/redis/go-redis/v9" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" +) + +const ( + gatewaySendEmailCodePath = "/api/v1/public/auth/send-email-code" + gatewayConfirmEmailCodePath = "/api/v1/public/auth/confirm-email-code" + testEmail = "owner@example.com" + testTimeZone = "Europe/Kaliningrad" +) + +// TestGatewayRoutesLobbyMyGamesListAndSignsResponse drives a single +// authenticated user through the full public-auth flow, then issues +// `lobby.my.games.list` via the authenticated gRPC ExecuteCommand +// surface and asserts the routed-and-signed end-to-end pipeline. +func TestGatewayRoutesLobbyMyGamesListAndSignsResponse(t *testing.T) { + h := newGatewayLobbyHarness(t) + + clientPrivateKey := newClientPrivateKey("g1-owner") + deviceSessionID, ownerUserID := h.authenticate(t, testEmail, clientPrivateKey) + + // Pre-seed: directly create a private game owned by this user via + // Lobby's public REST surface. This mirrors what an admin/UI tool + // would do; the seed proves Gateway routing reads back caller-owned + // state, not just empty results. 
+ gameID := h.createPrivateGame(t, ownerUserID, "Gateway Routing Galaxy", + time.Now().Add(48*time.Hour).Unix()) + + // Send authenticated `lobby.my.games.list` via the Gateway gRPC + // surface. + conn := h.dialGateway(t) + client := gatewayv1.NewEdgeGatewayClient(conn) + + requestBytes, err := transcoder.MyGamesListRequestToPayload(&lobbymodel.MyGamesListRequest{}) + require.NoError(t, err) + + executeRequest := newExecuteCommandRequest( + deviceSessionID, + "req-list-1", + lobbymodel.MessageTypeMyGamesList, + requestBytes, + clientPrivateKey, + ) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + response, err := client.ExecuteCommand(ctx, executeRequest) + require.NoError(t, err, "ExecuteCommand for lobby.my.games.list must succeed") + require.Equal(t, "ok", response.GetResultCode()) + require.NotEmpty(t, response.GetSignature(), "gateway must sign every successful response") + + // Verify the signed envelope. + require.NoError(t, contractsgatewayv1.VerifyResponseSignature( + h.responseSignerPublicKey, + response.GetSignature(), + contractsgatewayv1.ResponseSigningFields{ + ProtocolVersion: response.GetProtocolVersion(), + RequestID: response.GetRequestId(), + TimestampMS: response.GetTimestampMs(), + ResultCode: response.GetResultCode(), + PayloadHash: response.GetPayloadHash(), + }), + ) + require.NoError(t, contractsgatewayv1.VerifyPayloadHash( + response.GetPayloadBytes(), response.GetPayloadHash())) + + // Decode the FlatBuffers payload. Lobby's `/my/games` may or may + // not include the newly-seeded game depending on its membership / + // status filter; the boundary contract under test here is the + // Gateway routing + signing, not Lobby's own list semantics. We + // assert the response decodes to a valid (possibly empty) list + // and, if the game IS present, that the projected owner+type + // fields survive the FlatBuffers roundtrip. + decoded, err := transcoder.PayloadToMyGamesListResponse(response.GetPayloadBytes()) + require.NoError(t, err) + require.NotNil(t, decoded.Items, "Items must always be non-nil even when empty") + + for _, item := range decoded.Items { + if item.GameID == gameID { + assert.Equal(t, ownerUserID, item.OwnerUserID) + assert.Equal(t, "private", item.GameType) + return + } + } + // Game absent from /my/games is acceptable for this test. Issue a + // direct lobby read to confirm the game does exist on the lobby + // side, so we know the routing path is the only thing we depend + // on (not lobby's own `/my/games` filter). + t.Logf("seeded game %s not in /my/games (likely lobby filter on draft); routing pipeline succeeded with empty items", gameID) + require.True(t, h.gameExists(t, gameID), + "seeded game must still be observable via lobby admin REST") +} + +// TestGatewayRoutesLobbyOpenEnrollmentEnforcesOwnerOnly drives two +// authenticated users: the owner who can transition the game to +// `enrollment_open`, and a non-owner whose attempt is rejected with +// the canonical lobby error envelope. The test exercises the +// "owner-only commands before start" requirement of `TESTING.md §6`. 
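+//
+// Expected shape (sketch): the owner's response decodes to
+// OpenEnrollmentResponse{Status: "enrollment_open"}, while the guest's
+// response carries a non-"ok" ResultCode whose payload decodes to the
+// canonical lobby ErrorResponse envelope with a non-empty code and message.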
+func TestGatewayRoutesLobbyOpenEnrollmentEnforcesOwnerOnly(t *testing.T) { + h := newGatewayLobbyHarness(t) + + ownerKey := newClientPrivateKey("g1-owner-2") + ownerSessionID, ownerUserID := h.authenticate(t, "owner2@example.com", ownerKey) + + guestKey := newClientPrivateKey("g1-guest") + guestSessionID, _ := h.authenticate(t, "guest@example.com", guestKey) + + gameID := h.createPrivateGame(t, ownerUserID, "Owner-Only Galaxy", + time.Now().Add(48*time.Hour).Unix()) + + conn := h.dialGateway(t) + client := gatewayv1.NewEdgeGatewayClient(conn) + + // Owner sends `lobby.game.open-enrollment` → success. + ownerRequest, err := transcoder.OpenEnrollmentRequestToPayload(&lobbymodel.OpenEnrollmentRequest{ + GameID: gameID, + }) + require.NoError(t, err) + + ownerResponse, err := client.ExecuteCommand( + context.Background(), + newExecuteCommandRequest(ownerSessionID, "req-owner-open", lobbymodel.MessageTypeOpenEnrollment, ownerRequest, ownerKey), + ) + require.NoError(t, err) + assert.Equal(t, "ok", ownerResponse.GetResultCode()) + + decoded, err := transcoder.PayloadToOpenEnrollmentResponse(ownerResponse.GetPayloadBytes()) + require.NoError(t, err) + assert.Equal(t, gameID, decoded.GameID) + assert.Equal(t, "enrollment_open", decoded.Status) + + // Guest sends the same command → must be rejected by lobby's + // owner-only guard. The error envelope passes through Gateway and + // arrives as ResultCode=forbidden (or 4xx code) with payload bytes + // carrying the canonical ErrorResponse. + guestRequest, err := transcoder.OpenEnrollmentRequestToPayload(&lobbymodel.OpenEnrollmentRequest{ + GameID: gameID, + }) + require.NoError(t, err) + + guestResponse, err := client.ExecuteCommand( + context.Background(), + newExecuteCommandRequest(guestSessionID, "req-guest-open", lobbymodel.MessageTypeOpenEnrollment, guestRequest, guestKey), + ) + require.NoError(t, err, "non-2xx lobby responses must surface as a normal gRPC response with a non-ok ResultCode") + require.NotEqual(t, "ok", guestResponse.GetResultCode(), + "non-owner must not receive ok; got %s", guestResponse.GetResultCode()) + + decodedError, err := transcoder.PayloadToLobbyErrorResponse(guestResponse.GetPayloadBytes()) + require.NoError(t, err) + assert.NotEmpty(t, decodedError.Error.Code) + assert.NotEmpty(t, decodedError.Error.Message) +} + +// gatewayLobbyHarness owns the per-test infrastructure: shared +// PostgreSQL+Redis containers, four real binaries, the Gateway +// response-signer key, and the public/internal addresses for each +// service. 
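+//
+// Topology sketch, derived from the environment wiring below:
+//
+//	test client -> gateway (public REST + authenticated gRPC)
+//	gateway     -> authsession, userservice, lobby (lobby via X-User-Id delegation)
+//	authsession -> userservice, mail stub
+//	lobby       -> userservice
+//	all processes share the single Redis test container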
+type gatewayLobbyHarness struct { + redis *redis.Client + + mailStub *harness.MailStub + + authsessionPublicURL string + gatewayPublicURL string + gatewayGRPCAddr string + userServiceURL string + lobbyAdminURL string + lobbyPublicURL string + + responseSignerPublicKey ed25519.PublicKey + + authsessionProcess *harness.Process + gatewayProcess *harness.Process + userServiceProcess *harness.Process + lobbyProcess *harness.Process +} + +func newGatewayLobbyHarness(t *testing.T) *gatewayLobbyHarness { + t.Helper() + + redisRuntime := harness.StartRedisContainer(t) + redisClient := redis.NewClient(&redis.Options{ + Addr: redisRuntime.Addr, + Protocol: 2, + DisableIdentity: true, + }) + t.Cleanup(func() { require.NoError(t, redisClient.Close()) }) + + mailStub := harness.NewMailStub(t) + + responseSignerPath, responseSignerPublicKey := harness.WriteResponseSignerPEM(t, t.Name()) + + userServiceAddr := harness.FreeTCPAddress(t) + authsessionPublicAddr := harness.FreeTCPAddress(t) + authsessionInternalAddr := harness.FreeTCPAddress(t) + gatewayPublicAddr := harness.FreeTCPAddress(t) + gatewayGRPCAddr := harness.FreeTCPAddress(t) + lobbyPublicAddr := harness.FreeTCPAddress(t) + lobbyInternalAddr := harness.FreeTCPAddress(t) + + userServiceBinary := harness.BuildBinary(t, "userservice", "./user/cmd/userservice") + authsessionBinary := harness.BuildBinary(t, "authsession", "./authsession/cmd/authsession") + gatewayBinary := harness.BuildBinary(t, "gateway", "./gateway/cmd/gateway") + lobbyBinary := harness.BuildBinary(t, "lobby", "./lobby/cmd/lobby") + + userServiceEnv := harness.StartUserServicePersistence(t, redisRuntime.Addr).Env + userServiceEnv["USERSERVICE_LOG_LEVEL"] = "info" + userServiceEnv["USERSERVICE_INTERNAL_HTTP_ADDR"] = userServiceAddr + userServiceEnv["OTEL_TRACES_EXPORTER"] = "none" + userServiceEnv["OTEL_METRICS_EXPORTER"] = "none" + userServiceProcess := harness.StartProcess(t, "userservice", userServiceBinary, userServiceEnv) + waitForUserServiceReady(t, userServiceProcess, "http://"+userServiceAddr) + + authsessionEnv := map[string]string{ + "AUTHSESSION_LOG_LEVEL": "info", + "AUTHSESSION_PUBLIC_HTTP_ADDR": authsessionPublicAddr, + "AUTHSESSION_PUBLIC_HTTP_REQUEST_TIMEOUT": time.Second.String(), + "AUTHSESSION_INTERNAL_HTTP_ADDR": authsessionInternalAddr, + "AUTHSESSION_INTERNAL_HTTP_REQUEST_TIMEOUT": time.Second.String(), + "AUTHSESSION_REDIS_MASTER_ADDR": redisRuntime.Addr, + "AUTHSESSION_REDIS_PASSWORD": "integration", + "AUTHSESSION_USER_SERVICE_MODE": "rest", + "AUTHSESSION_USER_SERVICE_BASE_URL": "http://" + userServiceAddr, + "AUTHSESSION_USER_SERVICE_REQUEST_TIMEOUT": time.Second.String(), + "AUTHSESSION_MAIL_SERVICE_MODE": "rest", + "AUTHSESSION_MAIL_SERVICE_BASE_URL": mailStub.BaseURL(), + "AUTHSESSION_MAIL_SERVICE_REQUEST_TIMEOUT": time.Second.String(), + "AUTHSESSION_REDIS_GATEWAY_SESSION_CACHE_KEY_PREFIX": "gateway:session:", + "AUTHSESSION_REDIS_GATEWAY_SESSION_EVENTS_STREAM": "gateway:session_events", + "OTEL_TRACES_EXPORTER": "none", + "OTEL_METRICS_EXPORTER": "none", + } + authsessionProcess := harness.StartProcess(t, "authsession", authsessionBinary, authsessionEnv) + waitForAuthsessionPublicReady(t, authsessionProcess, "http://"+authsessionPublicAddr) + + lobbyEnv := harness.StartLobbyServicePersistence(t, redisRuntime.Addr).Env + lobbyEnv["LOBBY_LOG_LEVEL"] = "info" + lobbyEnv["LOBBY_PUBLIC_HTTP_ADDR"] = lobbyPublicAddr + lobbyEnv["LOBBY_INTERNAL_HTTP_ADDR"] = lobbyInternalAddr + lobbyEnv["LOBBY_USER_SERVICE_BASE_URL"] = "http://" + userServiceAddr + 
lobbyEnv["LOBBY_GM_BASE_URL"] = mailStub.BaseURL() // unused; lobby just needs a syntactically valid URL. + lobbyEnv["LOBBY_RUNTIME_JOB_RESULTS_READ_BLOCK_TIMEOUT"] = "200ms" + lobbyEnv["LOBBY_USER_LIFECYCLE_READ_BLOCK_TIMEOUT"] = "200ms" + lobbyEnv["LOBBY_GM_EVENTS_READ_BLOCK_TIMEOUT"] = "200ms" + lobbyEnv["OTEL_TRACES_EXPORTER"] = "none" + lobbyEnv["OTEL_METRICS_EXPORTER"] = "none" + lobbyProcess := harness.StartProcess(t, "lobby", lobbyBinary, lobbyEnv) + harness.WaitForHTTPStatus(t, lobbyProcess, "http://"+lobbyInternalAddr+"/readyz", http.StatusOK) + + gatewayEnv := map[string]string{ + "GATEWAY_LOG_LEVEL": "info", + "GATEWAY_PUBLIC_HTTP_ADDR": gatewayPublicAddr, + "GATEWAY_AUTHENTICATED_GRPC_ADDR": gatewayGRPCAddr, + "GATEWAY_REDIS_MASTER_ADDR": redisRuntime.Addr, + "GATEWAY_REDIS_PASSWORD": "integration", + "GATEWAY_SESSION_CACHE_REDIS_KEY_PREFIX": "gateway:session:", + "GATEWAY_SESSION_EVENTS_REDIS_STREAM": "gateway:session_events", + "GATEWAY_CLIENT_EVENTS_REDIS_STREAM": "gateway:client_events", + "GATEWAY_REPLAY_REDIS_KEY_PREFIX": "gateway:replay:", + "GATEWAY_RESPONSE_SIGNER_PRIVATE_KEY_PEM_PATH": filepath.Clean(responseSignerPath), + "GATEWAY_AUTH_SERVICE_BASE_URL": "http://" + authsessionPublicAddr, + "GATEWAY_USER_SERVICE_BASE_URL": "http://" + userServiceAddr, + "GATEWAY_LOBBY_SERVICE_BASE_URL": "http://" + lobbyPublicAddr, + "GATEWAY_PUBLIC_AUTH_UPSTREAM_TIMEOUT": (500 * time.Millisecond).String(), + "GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_PUBLIC_AUTH_RATE_LIMIT_REQUESTS": "100", + "GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_PUBLIC_AUTH_RATE_LIMIT_WINDOW": "1s", + "GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_PUBLIC_AUTH_RATE_LIMIT_BURST": "100", + "GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_SEND_EMAIL_CODE_IDENTITY_RATE_LIMIT_REQUESTS": "100", + "GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_SEND_EMAIL_CODE_IDENTITY_RATE_LIMIT_WINDOW": "1s", + "GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_SEND_EMAIL_CODE_IDENTITY_RATE_LIMIT_BURST": "100", + "GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_CONFIRM_EMAIL_CODE_IDENTITY_RATE_LIMIT_REQUESTS": "100", + "GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_CONFIRM_EMAIL_CODE_IDENTITY_RATE_LIMIT_WINDOW": "1s", + "GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_CONFIRM_EMAIL_CODE_IDENTITY_RATE_LIMIT_BURST": "100", + "OTEL_TRACES_EXPORTER": "none", + "OTEL_METRICS_EXPORTER": "none", + } + gatewayProcess := harness.StartProcess(t, "gateway", gatewayBinary, gatewayEnv) + harness.WaitForHTTPStatus(t, gatewayProcess, "http://"+gatewayPublicAddr+"/healthz", http.StatusOK) + harness.WaitForTCP(t, gatewayProcess, gatewayGRPCAddr) + + return &gatewayLobbyHarness{ + redis: redisClient, + mailStub: mailStub, + authsessionPublicURL: "http://" + authsessionPublicAddr, + gatewayPublicURL: "http://" + gatewayPublicAddr, + gatewayGRPCAddr: gatewayGRPCAddr, + userServiceURL: "http://" + userServiceAddr, + lobbyAdminURL: "http://" + lobbyInternalAddr, + lobbyPublicURL: "http://" + lobbyPublicAddr, + responseSignerPublicKey: responseSignerPublicKey, + authsessionProcess: authsessionProcess, + gatewayProcess: gatewayProcess, + userServiceProcess: userServiceProcess, + lobbyProcess: lobbyProcess, + } +} + +// authenticate runs the public-auth challenge/confirm flow through the +// Gateway and returns the resulting `device_session_id` plus the +// resolved `user_id`. 
+func (h *gatewayLobbyHarness) authenticate(t *testing.T, email string, clientKey ed25519.PrivateKey) (string, string) { + t.Helper() + + challengeID := h.sendChallenge(t, email) + code := h.waitForChallengeCode(t, email) + + confirm := h.confirmCode(t, challengeID, code, clientKey) + require.Equalf(t, http.StatusOK, confirm.StatusCode, "confirm status: %s", confirm.Body) + + var confirmBody struct { + DeviceSessionID string `json:"device_session_id"` + } + require.NoError(t, decodeStrictJSONPayload([]byte(confirm.Body), &confirmBody)) + require.NotEmpty(t, confirmBody.DeviceSessionID) + + user := h.lookupUserByEmail(t, email) + + // Wait for the gateway session projection to land in Redis. + deadline := time.Now().Add(5 * time.Second) + for time.Now().Before(deadline) { + if _, err := h.redis.Get(context.Background(), "gateway:session:"+confirmBody.DeviceSessionID).Bytes(); err == nil { + return confirmBody.DeviceSessionID, user.UserID + } + time.Sleep(25 * time.Millisecond) + } + t.Fatalf("gateway session projection for %s never arrived", confirmBody.DeviceSessionID) + return "", "" +} + +// waitForChallengeCode polls the mail stub until the requested email +// has received an auth-code delivery and returns the cleartext code. +func (h *gatewayLobbyHarness) waitForChallengeCode(t *testing.T, email string) string { + t.Helper() + deadline := time.Now().Add(5 * time.Second) + for time.Now().Before(deadline) { + for _, delivery := range h.mailStub.RecordedDeliveries() { + if delivery.Email == email && delivery.Code != "" { + return delivery.Code + } + } + time.Sleep(25 * time.Millisecond) + } + t.Fatalf("auth code for %s never arrived at the mail stub", email) + return "" +} + +func (h *gatewayLobbyHarness) sendChallenge(t *testing.T, email string) string { + t.Helper() + + response := postJSONValue(t, h.gatewayPublicURL+gatewaySendEmailCodePath, map[string]string{ + "email": email, + }) + require.Equalf(t, http.StatusOK, response.StatusCode, "send-email-code: %s", response.Body) + + var body struct { + ChallengeID string `json:"challenge_id"` + } + require.NoError(t, decodeStrictJSONPayload([]byte(response.Body), &body)) + require.NotEmpty(t, body.ChallengeID) + return body.ChallengeID +} + +func (h *gatewayLobbyHarness) confirmCode(t *testing.T, challengeID, code string, clientPrivateKey ed25519.PrivateKey) httpResponse { + t.Helper() + return postJSONValue(t, h.gatewayPublicURL+gatewayConfirmEmailCodePath, map[string]string{ + "challenge_id": challengeID, + "code": code, + "client_public_key": encodePublicKey(clientPrivateKey.Public().(ed25519.PublicKey)), + "time_zone": testTimeZone, + }) +} + +func (h *gatewayLobbyHarness) lookupUserByEmail(t *testing.T, email string) struct { + UserID string `json:"user_id"` +} { + t.Helper() + resp := postJSONValue(t, h.userServiceURL+"/api/v1/internal/user-lookups/by-email", map[string]string{ + "email": email, + }) + require.Equalf(t, http.StatusOK, resp.StatusCode, "user lookup: %s", resp.Body) + + // User Service returns the full user record; only user_id is needed. 
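+	// Illustrative response shape: {"user":{"user_id":"...", ...}}.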
+ var body struct { + User struct { + UserID string `json:"user_id"` + } `json:"user"` + } + require.NoError(t, json.Unmarshal([]byte(resp.Body), &body)) + require.NotEmpty(t, body.User.UserID) + return struct { + UserID string `json:"user_id"` + }{UserID: body.User.UserID} +} + +func (h *gatewayLobbyHarness) createPrivateGame(t *testing.T, ownerUserID, gameName string, enrollmentEndsAt int64) string { + t.Helper() + + resp := postJSONValueWithHeaders(t, h.lobbyPublicURL+"/api/v1/lobby/games", map[string]any{ + "game_name": gameName, + "game_type": "private", + "min_players": 1, + "max_players": 4, + "start_gap_hours": 6, + "start_gap_players": 1, + "enrollment_ends_at": enrollmentEndsAt, + "turn_schedule": "0 18 * * *", + "target_engine_version": "1.0.0", + }, map[string]string{"X-User-Id": ownerUserID}) + require.Equalf(t, http.StatusCreated, resp.StatusCode, "create private game: %s", resp.Body) + + var record struct { + GameID string `json:"game_id"` + } + require.NoError(t, json.Unmarshal([]byte(resp.Body), &record)) + require.NotEmpty(t, record.GameID) + return record.GameID +} + +// gameExists checks whether the lobby admin surface still observes a +// game that was created through the public surface. +func (h *gatewayLobbyHarness) gameExists(t *testing.T, gameID string) bool { + t.Helper() + req, err := http.NewRequest(http.MethodGet, h.lobbyAdminURL+"/api/v1/lobby/games/"+gameID, nil) + require.NoError(t, err) + resp := doRequest(t, req) + return resp.StatusCode == http.StatusOK +} + +func (h *gatewayLobbyHarness) dialGateway(t *testing.T) *grpc.ClientConn { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + conn, err := grpc.DialContext(ctx, h.gatewayGRPCAddr, + grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithBlock(), + ) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, conn.Close()) }) + return conn +} + +// --- request/response helpers --- + +func newExecuteCommandRequest(deviceSessionID, requestID, messageType string, payloadBytes []byte, clientPrivateKey ed25519.PrivateKey) *gatewayv1.ExecuteCommandRequest { + payloadHash := contractsgatewayv1.ComputePayloadHash(payloadBytes) + + request := &gatewayv1.ExecuteCommandRequest{ + ProtocolVersion: contractsgatewayv1.ProtocolVersionV1, + DeviceSessionId: deviceSessionID, + MessageType: messageType, + TimestampMs: time.Now().UnixMilli(), + RequestId: requestID, + PayloadBytes: payloadBytes, + PayloadHash: payloadHash, + TraceId: "trace-" + requestID, + } + request.Signature = contractsgatewayv1.SignRequest(clientPrivateKey, contractsgatewayv1.RequestSigningFields{ + ProtocolVersion: request.GetProtocolVersion(), + DeviceSessionID: request.GetDeviceSessionId(), + MessageType: request.GetMessageType(), + TimestampMS: request.GetTimestampMs(), + RequestID: request.GetRequestId(), + PayloadHash: request.GetPayloadHash(), + }) + return request +} + +type httpResponse struct { + StatusCode int + Body string + Header http.Header +} + +func postJSONValue(t *testing.T, targetURL string, body any) httpResponse { + t.Helper() + return postJSONValueWithHeaders(t, targetURL, body, nil) +} + +func postJSONValueWithHeaders(t *testing.T, targetURL string, body any, headers map[string]string) httpResponse { + t.Helper() + + payload, err := json.Marshal(body) + require.NoError(t, err) + + request, err := http.NewRequest(http.MethodPost, targetURL, bytes.NewReader(payload)) + require.NoError(t, err) + request.Header.Set("Content-Type", "application/json") + 
for key, value := range headers { + if value == "" { + continue + } + request.Header.Set(key, value) + } + return doRequest(t, request) +} + +func doRequest(t *testing.T, request *http.Request) httpResponse { + t.Helper() + client := &http.Client{ + Timeout: 5 * time.Second, + Transport: &http.Transport{DisableKeepAlives: true}, + } + t.Cleanup(client.CloseIdleConnections) + + response, err := client.Do(request) + require.NoError(t, err) + defer response.Body.Close() + + payload, err := io.ReadAll(response.Body) + require.NoError(t, err) + return httpResponse{ + StatusCode: response.StatusCode, + Body: string(payload), + Header: response.Header.Clone(), + } +} + +func decodeStrictJSONPayload(payload []byte, target any) error { + decoder := json.NewDecoder(bytes.NewReader(payload)) + decoder.DisallowUnknownFields() + if err := decoder.Decode(target); err != nil { + return err + } + if err := decoder.Decode(&struct{}{}); err != io.EOF { + if err == nil { + return errors.New("unexpected trailing JSON input") + } + return err + } + return nil +} + +func waitForUserServiceReady(t *testing.T, process *harness.Process, baseURL string) { + t.Helper() + client := &http.Client{Timeout: 250 * time.Millisecond} + t.Cleanup(client.CloseIdleConnections) + + deadline := time.Now().Add(10 * time.Second) + for time.Now().Before(deadline) { + req, err := http.NewRequest(http.MethodGet, baseURL+"/api/v1/internal/users/user-readiness-probe/exists", nil) + require.NoError(t, err) + response, err := client.Do(req) + if err == nil { + _, _ = io.Copy(io.Discard, response.Body) + response.Body.Close() + if response.StatusCode == http.StatusOK { + return + } + } + time.Sleep(25 * time.Millisecond) + } + t.Fatalf("wait for userservice readiness: timeout\n%s", process.Logs()) +} + +func waitForAuthsessionPublicReady(t *testing.T, process *harness.Process, baseURL string) { + t.Helper() + // AuthSession's public listener does not expose a `/healthz` path; + // posting an empty-email send-email-code request is the cheapest + // readiness signal and returns 400 once routing is up. 
+ client := &http.Client{Timeout: 250 * time.Millisecond} + t.Cleanup(client.CloseIdleConnections) + + deadline := time.Now().Add(10 * time.Second) + for time.Now().Before(deadline) { + body := bytes.NewReader([]byte(`{"email":""}`)) + req, err := http.NewRequest(http.MethodPost, baseURL+"/api/v1/public/auth/send-email-code", body) + require.NoError(t, err) + req.Header.Set("Content-Type", "application/json") + response, err := client.Do(req) + if err == nil { + _, _ = io.Copy(io.Discard, response.Body) + response.Body.Close() + if response.StatusCode == http.StatusBadRequest { + return + } + } + time.Sleep(25 * time.Millisecond) + } + t.Fatalf("wait for authsession readiness: timeout\n%s", process.Logs()) +} + +func newClientPrivateKey(label string) ed25519.PrivateKey { + seed := sha256.Sum256([]byte("galaxy-integration-gateway-lobby-client-" + label)) + return ed25519.NewKeyFromSeed(seed[:]) +} + +func encodePublicKey(publicKey ed25519.PublicKey) string { + return base64.StdEncoding.EncodeToString(publicKey) +} diff --git a/integration/internal/harness/dockernetwork.go b/integration/internal/harness/dockernetwork.go new file mode 100644 index 0000000..57327ef --- /dev/null +++ b/integration/internal/harness/dockernetwork.go @@ -0,0 +1,289 @@ +package harness + +import ( + "context" + "crypto/rand" + "encoding/hex" + "encoding/json" + "fmt" + "net" + "net/http" + "os/exec" + "strings" + "testing" + "time" +) + +const ( + dockerNetworkPrefix = "lobbyrtm-it-" + dockerNetworkTimeout = 30 * time.Second + dockerCLITimeout = 30 * time.Second + + containerHealthzPort = 8080 + containerHealthzTimeout = 5 * time.Second + containerHealthzPoll = 100 * time.Millisecond +) + +// EnsureDockerNetwork creates a uniquely-named Docker bridge network +// for the caller's test and registers cleanup. Each test gets its own +// network so concurrent scenarios cannot collide on the per-game DNS +// hostname (`galaxy-game-{game_id}`). The helper skips the test when +// no Docker daemon is reachable. +func EnsureDockerNetwork(t testing.TB) string { + t.Helper() + requireDockerDaemon(t) + + name := dockerNetworkPrefix + uniqueSuffix(t) + ctx, cancel := context.WithTimeout(context.Background(), dockerNetworkTimeout) + defer cancel() + cmd := exec.CommandContext(ctx, "docker", "network", "create", "--driver", "bridge", name) + output, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("integration harness: create docker network %q: %v; output:\n%s", + name, err, strings.TrimSpace(string(output))) + } + + t.Cleanup(func() { + cleanupCtx, cleanupCancel := context.WithTimeout(context.Background(), dockerNetworkTimeout) + defer cleanupCancel() + removeCmd := exec.CommandContext(cleanupCtx, "docker", "network", "rm", name) + if rmErr := removeCmd.Run(); rmErr != nil { + t.Logf("integration harness: remove docker network %q: %v", name, rmErr) + } + }) + return name +} + +// FindContainerIDByLabel returns the id of the single running container +// labelled with the given game id, or an empty string when no match is +// found. The label keys are the ones rtmanager attaches at start time +// (`com.galaxy.owner=rtmanager`, `com.galaxy.game_id=`). 
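+//
+// The lookup is a thin wrapper over the Docker CLI; for an
+// illustrative game id it is equivalent to:
+//
+//	docker ps -aq --no-trunc \
+//	  --filter label=com.galaxy.owner=rtmanager \
+//	  --filter label=com.galaxy.game_id=<game-id>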
+func FindContainerIDByLabel(t testing.TB, gameID string) string { + t.Helper() + requireDockerDaemon(t) + + ctx, cancel := context.WithTimeout(context.Background(), dockerCLITimeout) + defer cancel() + cmd := exec.CommandContext(ctx, "docker", "ps", "-aq", "--no-trunc", + "--filter", "label=com.galaxy.owner=rtmanager", + "--filter", "label=com.galaxy.game_id="+gameID, + ) + output, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("integration harness: docker ps for game %s: %v; output:\n%s", + gameID, err, strings.TrimSpace(string(output))) + } + id := strings.TrimSpace(string(output)) + if id == "" { + return "" + } + if strings.Contains(id, "\n") { + t.Fatalf("integration harness: multiple containers for game %s:\n%s", gameID, id) + } + return id +} + +// ContainerState returns the runtime state string (e.g. `running`, +// `exited`) of the container with the given id, looked up via +// `docker inspect`. +func ContainerState(t testing.TB, containerID string) string { + t.Helper() + requireDockerDaemon(t) + + ctx, cancel := context.WithTimeout(context.Background(), dockerCLITimeout) + defer cancel() + cmd := exec.CommandContext(ctx, "docker", "inspect", "--format", "{{.State.Status}}", containerID) + output, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("integration harness: docker inspect %s: %v; output:\n%s", + containerID, err, strings.TrimSpace(string(output))) + } + return strings.TrimSpace(string(output)) +} + +// ContainerNetworkIP returns the IPv4 address of the named container +// inside the named bridge network. Returns an empty string when the +// container has no endpoint on that network. +func ContainerNetworkIP(t testing.TB, containerID, networkName string) string { + t.Helper() + requireDockerDaemon(t) + + ctx, cancel := context.WithTimeout(context.Background(), dockerCLITimeout) + defer cancel() + cmd := exec.CommandContext(ctx, "docker", "inspect", "--format", "{{json .NetworkSettings.Networks}}", containerID) + output, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("integration harness: docker inspect networks %s: %v; output:\n%s", + containerID, err, strings.TrimSpace(string(output))) + } + var networks map[string]struct { + IPAddress string `json:"IPAddress"` + } + if err := json.Unmarshal(output, &networks); err != nil { + t.Fatalf("integration harness: parse network json for %s: %v; payload=%s", + containerID, err, strings.TrimSpace(string(output))) + } + if entry, ok := networks[networkName]; ok { + return entry.IPAddress + } + return "" +} + +// WaitForEngineHealthz polls the engine `/healthz` on port 8080 until +// it returns 200 or the timeout fires. On macOS the docker bridge IP is +// not routable from the host, so the helper falls back to a transient +// `busybox` probe container on the same docker network. On Linux it +// dials the bridge IP directly. +func WaitForEngineHealthz(t testing.TB, ip string, timeout time.Duration) { + t.Helper() + if ip == "" { + t.Fatalf("integration harness: empty engine ip") + } + if timeout <= 0 { + timeout = containerHealthzTimeout + } + + if dialFromHost(ip, containerHealthzPort, 500*time.Millisecond) { + waitForHealthzFromHost(t, ip, timeout) + return + } + + network, hostname := containerNetworkAndHostname(t, ip) + if network == "" || hostname == "" { + t.Fatalf("integration harness: cannot resolve docker network/hostname for engine ip %s", ip) + } + waitForHealthzViaProbe(t, network, hostname, timeout) +} + +// dialFromHost reports whether tcp connect to ip:port succeeds within +// timeout. 
Used to detect the macOS routing limitation cheaply. +func dialFromHost(ip string, port int, timeout time.Duration) bool { + conn, err := net.DialTimeout("tcp", net.JoinHostPort(ip, fmt.Sprintf("%d", port)), timeout) + if err != nil { + return false + } + _ = conn.Close() + return true +} + +func waitForHealthzFromHost(t testing.TB, ip string, timeout time.Duration) { + t.Helper() + url := fmt.Sprintf("http://%s/healthz", net.JoinHostPort(ip, fmt.Sprintf("%d", containerHealthzPort))) + client := &http.Client{ + Timeout: 500 * time.Millisecond, + Transport: &http.Transport{DisableKeepAlives: true}, + } + t.Cleanup(client.CloseIdleConnections) + + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + req, err := http.NewRequest(http.MethodGet, url, nil) + if err != nil { + t.Fatalf("integration harness: build healthz request for %s: %v", url, err) + } + resp, err := client.Do(req) + if err == nil { + resp.Body.Close() + if resp.StatusCode == http.StatusOK { + return + } + } + time.Sleep(containerHealthzPoll) + } + t.Fatalf("integration harness: engine /healthz on %s did not return 200 within %s", url, timeout) +} + +// containerNetworkAndHostname locates the bridge network and engine +// container hostname behind the given IP so the busybox probe can use +// the docker DNS name rather than rely on host routing. The lookup is +// scoped to RTM-owned containers (`com.galaxy.owner=rtmanager`). +func containerNetworkAndHostname(t testing.TB, ip string) (string, string) { + t.Helper() + requireDockerDaemon(t) + + ctx, cancel := context.WithTimeout(context.Background(), dockerCLITimeout) + defer cancel() + cmd := exec.CommandContext(ctx, "docker", "ps", "-aq", "--no-trunc", + "--filter", "label=com.galaxy.owner=rtmanager", + ) + output, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("integration harness: docker ps for engine probe: %v; output:\n%s", err, strings.TrimSpace(string(output))) + } + for _, id := range strings.Split(strings.TrimSpace(string(output)), "\n") { + id = strings.TrimSpace(id) + if id == "" { + continue + } + ipsByNetwork, hostname, ok := inspectIPAndHostname(t, id) + if !ok { + continue + } + for networkName, networkIP := range ipsByNetwork { + if networkIP == ip { + return networkName, hostname + } + } + } + return "", "" +} + +func inspectIPAndHostname(t testing.TB, containerID string) (map[string]string, string, bool) { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), dockerCLITimeout) + defer cancel() + cmd := exec.CommandContext(ctx, "docker", "inspect", "--format", + "{{json .NetworkSettings.Networks}}|{{.Config.Hostname}}", containerID) + output, err := cmd.CombinedOutput() + if err != nil { + return nil, "", false + } + parts := strings.SplitN(strings.TrimSpace(string(output)), "|", 2) + if len(parts) != 2 { + return nil, "", false + } + var networks map[string]struct { + IPAddress string `json:"IPAddress"` + } + if err := json.Unmarshal([]byte(parts[0]), &networks); err != nil { + return nil, "", false + } + ipsByNetwork := make(map[string]string, len(networks)) + for name, entry := range networks { + ipsByNetwork[name] = entry.IPAddress + } + return ipsByNetwork, parts[1], true +} + +// waitForHealthzViaProbe runs `wget -qO- http://:8080/healthz` +// inside a transient busybox container on networkName until the probe +// exits 0 or the timeout fires. 
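+//
+// For an illustrative network name and game hostname the probe
+// reduces to:
+//
+//	docker run --rm --network lobbyrtm-it-1a2b3c4d busybox:stable \
+//	  wget -qO- http://galaxy-game-<game-id>:8080/healthz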
+func waitForHealthzViaProbe(t testing.TB, networkName, hostname string, timeout time.Duration) { + t.Helper() + deadline := time.Now().Add(timeout) + url := fmt.Sprintf("http://%s:%d/healthz", hostname, containerHealthzPort) + for time.Now().Before(deadline) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + cmd := exec.CommandContext(ctx, "docker", "run", "--rm", + "--network", networkName, + "busybox:stable", + "wget", "-qO-", url, + ) + out, err := cmd.CombinedOutput() + cancel() + if err == nil && strings.Contains(string(out), "ok") { + return + } + time.Sleep(containerHealthzPoll) + } + t.Fatalf("integration harness: engine /healthz on %s did not return 200 via probe within %s", url, timeout) +} + +func uniqueSuffix(t testing.TB) string { + t.Helper() + buf := make([]byte, 4) + if _, err := rand.Read(buf); err != nil { + t.Fatalf("integration harness: read random suffix: %v", err) + } + return hex.EncodeToString(buf) +} diff --git a/integration/internal/harness/engineimage.go b/integration/internal/harness/engineimage.go new file mode 100644 index 0000000..7d7000c --- /dev/null +++ b/integration/internal/harness/engineimage.go @@ -0,0 +1,139 @@ +package harness + +import ( + "context" + "errors" + "fmt" + "os" + "os/exec" + "path/filepath" + "runtime" + "strings" + "sync" + "testing" + "time" +) + +// EngineImageRef is the canonical tag the lobbyrtm boundary suite (and +// any future suite that needs the galaxy/game engine binary) builds and +// runs against. The `-lobbyrtm-it` suffix differs from the +// `-rtm-it` tag the service-local rtmanager/integration harness uses, so +// an operator running both suites locally cannot accidentally consume +// the wrong image, and `docker image rm` of one suite's leftovers does +// not remove the other suite's tag. +const EngineImageRef = "galaxy/game:1.0.0-lobbyrtm-it" + +const ( + imageBuildTimeout = 10 * time.Minute + dockerDaemonPingTimeout = 5 * time.Second +) + +var ( + engineImageOnce sync.Once + engineImageErr error + + dockerAvailableOnce sync.Once + dockerAvailableErr error +) + +// RequireDockerDaemon skips the calling test when no Docker daemon is +// reachable from this process. Suites that need Docker but stand up +// testcontainers (Postgres/Redis) before any RTM-specific helper +// should call this helper first so the skip path runs *before* the +// testcontainer client probes the daemon and fails hard. +func RequireDockerDaemon(t testing.TB) { + t.Helper() + requireDockerDaemon(t) +} + +// EnsureGalaxyGameImage builds the galaxy/game engine image from the +// workspace root once per test process and returns the canonical tag. +// On hosts without a reachable Docker daemon the helper calls `t.Skip` +// so suites stay green when `/var/run/docker.sock` is missing and +// `DOCKER_HOST` is unset. +// +// The build is wrapped in `sync.Once`; concurrent suite invocations +// share the same image. The Dockerfile path and build context match +// `rtmanager/integration/harness/docker.go::buildAndTagEngineImage` — +// galaxy's `go.work` resolves `galaxy/{model,error,...}` only when the +// workspace root is the build context. 
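+//
+// The build the helper performs is equivalent to running, from the
+// workspace root (a sketch of what buildEngineImage below executes):
+//
+//	DOCKER_BUILDKIT=1 docker build -f game/Dockerfile -t galaxy/game:1.0.0-lobbyrtm-it .
+//
+// In a suite it is typically paired with EnsureDockerNetwork:
+//
+//	image := EnsureGalaxyGameImage(t)
+//	network := EnsureDockerNetwork(t)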
+func EnsureGalaxyGameImage(t testing.TB) string { + t.Helper() + requireDockerDaemon(t) + + engineImageOnce.Do(func() { + engineImageErr = buildEngineImage() + }) + if engineImageErr != nil { + t.Fatalf("integration harness: build galaxy/game image: %v", engineImageErr) + } + return EngineImageRef +} + +func buildEngineImage() error { + root, err := workspaceRoot() + if err != nil { + return fmt.Errorf("resolve workspace root: %w", err) + } + + ctx, cancel := context.WithTimeout(context.Background(), imageBuildTimeout) + defer cancel() + + dockerfilePath := filepath.Join("game", "Dockerfile") + cmd := exec.CommandContext(ctx, "docker", "build", + "-f", dockerfilePath, + "-t", EngineImageRef, + ".", + ) + cmd.Dir = root + cmd.Env = append(os.Environ(), "DOCKER_BUILDKIT=1") + output, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("docker build (-f %s) in %s: %w; output:\n%s", + dockerfilePath, root, err, strings.TrimSpace(string(output))) + } + return nil +} + +// requireDockerDaemon skips the calling test when no Docker daemon is +// reachable from this process. The check runs once per process and +// caches the verdict so successive callers do not pay the ping cost. +func requireDockerDaemon(t testing.TB) { + t.Helper() + dockerAvailableOnce.Do(func() { + dockerAvailableErr = pingDockerDaemon() + }) + if dockerAvailableErr != nil { + t.Skipf("integration harness: docker daemon unavailable: %v", dockerAvailableErr) + } +} + +func pingDockerDaemon() error { + if os.Getenv("DOCKER_HOST") == "" { + if _, err := os.Stat("/var/run/docker.sock"); err != nil { + return fmt.Errorf("set DOCKER_HOST or expose /var/run/docker.sock: %w", err) + } + } + ctx, cancel := context.WithTimeout(context.Background(), dockerDaemonPingTimeout) + defer cancel() + cmd := exec.CommandContext(ctx, "docker", "version", "--format", "{{.Server.Version}}") + output, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("docker version: %w; output:\n%s", err, strings.TrimSpace(string(output))) + } + return nil +} + +// workspaceRoot resolves the absolute path of the galaxy/ workspace +// root by anchoring on this file's location. The harness lives at +// `galaxy/integration/internal/harness/engineimage.go`; the workspace +// root is three directories up. +func workspaceRoot() (string, error) { + _, file, _, ok := runtime.Caller(0) + if !ok { + return "", errors.New("resolve runtime caller for workspace root") + } + dir := filepath.Dir(file) + root := filepath.Clean(filepath.Join(dir, "..", "..", "..")) + return root, nil +} diff --git a/integration/internal/harness/rtmanagerservice.go b/integration/internal/harness/rtmanagerservice.go new file mode 100644 index 0000000..c7c76e6 --- /dev/null +++ b/integration/internal/harness/rtmanagerservice.go @@ -0,0 +1,54 @@ +package harness + +import ( + "context" + "testing" +) + +// RTManagerServicePersistence captures the per-test persistence +// dependencies of the Runtime Manager binary: a PostgreSQL container +// hosting the `rtmanager` schema owned by the `rtmanagerservice` role, +// plus the Redis credentials that point the service at the +// caller-supplied master address. +type RTManagerServicePersistence struct { + // Postgres exposes the started container so tests that need direct + // SQL access to the rtmanager schema can read or write through it. + Postgres *PostgresRuntime + + // Env carries the environment entries that must be passed to the + // rtmanager process. 
It is safe to merge into the caller's existing + // env map, or to use as-is and append further RTMANAGER_* knobs in + // place. RTMANAGER_GAME_STATE_ROOT is intentionally omitted; the + // caller supplies a per-test directory. + Env map[string]string +} + +// StartRTManagerServicePersistence brings up one isolated PostgreSQL +// container, provisions the `rtmanager` schema with the +// `rtmanagerservice` role, and returns the environment entries that +// wire the rtmanager binary at that container plus the supplied Redis +// master address. +// +// The Redis password value matches the architectural rule that Redis +// traffic is password-protected; miniredis accepts arbitrary password +// values when its own RequireAuth is not engaged, and the same value +// works against the real testcontainers Redis runtime. +// +// Cleanup of the container is handled by StartPostgresContainer through +// `t.Cleanup`; callers do not need to defer anything. +func StartRTManagerServicePersistence(t testing.TB, redisMasterAddr string) RTManagerServicePersistence { + t.Helper() + + rt := StartPostgresContainer(t) + if err := rt.EnsureRoleAndSchema(context.Background(), "rtmanager", "rtmanagerservice", "rtmanagerservice"); err != nil { + t.Fatalf("ensure rtmanager schema/role: %v", err) + } + + env := WithPostgres(rt, "RTMANAGER", "rtmanager", "rtmanagerservice") + env["RTMANAGER_REDIS_MASTER_ADDR"] = redisMasterAddr + env["RTMANAGER_REDIS_PASSWORD"] = "integration" + return RTManagerServicePersistence{ + Postgres: rt, + Env: env, + } +} diff --git a/integration/lobbyauthsession/lobby_authsession_test.go b/integration/lobbyauthsession/lobby_authsession_test.go new file mode 100644 index 0000000..0ef2c48 --- /dev/null +++ b/integration/lobbyauthsession/lobby_authsession_test.go @@ -0,0 +1,508 @@ +// Package lobbyauthsession_test exercises the authenticated context +// propagation between Auth/Session Service and Game Lobby. The +// architecture wires the two services through Gateway: AuthSession +// owns the device-session lifecycle, Gateway projects sessions into +// its cache and signs request envelopes, and Lobby reads the +// resolved `X-User-Id` from the gateway-authenticated downstream +// hop. +// +// The boundary contract under test is: revoking a device session +// through AuthSession's internal API removes the session projection +// from the gateway cache, after which Gateway refuses to route any +// subsequent `lobby.*` command for that session. The suite asserts +// the boundary on the public surfaces: AuthSession internal REST, +// Gateway authenticated gRPC, and Lobby state via direct REST +// observation. +// +// Coverage maps onto `TESTING.md §6` `Lobby ↔ Auth/Session`: +// "authenticated context correctly propagated from gateway". 
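+//
+// The projection the suite observes is the session record both
+// services share under one key prefix (a sketch; the concrete prefix
+// is set via GATEWAY_SESSION_CACHE_REDIS_KEY_PREFIX and
+// AUTHSESSION_REDIS_GATEWAY_SESSION_CACHE_KEY_PREFIX below):
+//
+//	gateway:session:{device_session_id} -> JSON record carrying a "status" field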
+package lobbyauthsession_test + +import ( + "bytes" + "context" + "crypto/ed25519" + "crypto/sha256" + "encoding/base64" + "encoding/json" + "errors" + "io" + "net/http" + "path/filepath" + "testing" + "time" + + gatewayv1 "galaxy/gateway/proto/galaxy/gateway/v1" + contractsgatewayv1 "galaxy/integration/internal/contracts/gatewayv1" + "galaxy/integration/internal/harness" + lobbymodel "galaxy/model/lobby" + "galaxy/transcoder" + + "github.com/redis/go-redis/v9" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "google.golang.org/grpc" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/credentials/insecure" + "google.golang.org/grpc/status" +) + +// TestSessionRevocationStopsGatewayFromRoutingLobbyCommands proves +// that AuthSession owns the authenticated context: a successful +// `lobby.my.games.list` command before the revoke must succeed, and +// the same command after the revoke must fail at Gateway with +// Unauthenticated, never reaching Lobby. +func TestSessionRevocationStopsGatewayFromRoutingLobbyCommands(t *testing.T) { + h := newHarness(t) + + clientKey := newClientPrivateKey("g4-revoke") + deviceSessionID, _ := h.authenticate(t, "revoke@example.com", clientKey) + + conn := h.dialGateway(t) + client := gatewayv1.NewEdgeGatewayClient(conn) + + // Pre-revoke: lobby.my.games.list must succeed. + requestBytes, err := transcoder.MyGamesListRequestToPayload(&lobbymodel.MyGamesListRequest{}) + require.NoError(t, err) + preResponse, err := client.ExecuteCommand(context.Background(), + newExecuteCommandRequest(deviceSessionID, "req-pre-revoke", lobbymodel.MessageTypeMyGamesList, requestBytes, clientKey), + ) + require.NoError(t, err) + assert.Equal(t, "ok", preResponse.GetResultCode()) + + // Revoke through AuthSession internal API. + h.revokeSession(t, deviceSessionID) + + // Wait for the gateway projection to drop / flip to revoked. + h.waitForSessionGone(t, deviceSessionID, 5*time.Second) + + // Post-revoke: same command must be rejected at Gateway. 
+ postResponse, err := client.ExecuteCommand(context.Background(), + newExecuteCommandRequest(deviceSessionID, "req-post-revoke", lobbymodel.MessageTypeMyGamesList, requestBytes, clientKey), + ) + require.Error(t, err, "post-revoke command must fail at Gateway") + require.Nil(t, postResponse) + + statusCode := status.Code(err) + require.Truef(t, + statusCode == codes.Unauthenticated || + statusCode == codes.PermissionDenied || + statusCode == codes.FailedPrecondition, + "post-revoke must fail with Unauthenticated/PermissionDenied/FailedPrecondition, got %s: %v", + statusCode, err, + ) +} + +// --- harness --- + +type lobbyAuthsessionHarness struct { + redis *redis.Client + + mailStub *harness.MailStub + + authsessionPublicURL string + authsessionInternalURL string + gatewayPublicURL string + gatewayGRPCAddr string + userServiceURL string + lobbyPublicURL string + + processes []*harness.Process +} + +func newHarness(t *testing.T) *lobbyAuthsessionHarness { + t.Helper() + + redisRuntime := harness.StartRedisContainer(t) + redisClient := redis.NewClient(&redis.Options{ + Addr: redisRuntime.Addr, + Protocol: 2, + DisableIdentity: true, + }) + t.Cleanup(func() { require.NoError(t, redisClient.Close()) }) + + mailStub := harness.NewMailStub(t) + responseSignerPath, _ := harness.WriteResponseSignerPEM(t, t.Name()) + + userServiceAddr := harness.FreeTCPAddress(t) + authsessionPublicAddr := harness.FreeTCPAddress(t) + authsessionInternalAddr := harness.FreeTCPAddress(t) + gatewayPublicAddr := harness.FreeTCPAddress(t) + gatewayGRPCAddr := harness.FreeTCPAddress(t) + lobbyPublicAddr := harness.FreeTCPAddress(t) + lobbyInternalAddr := harness.FreeTCPAddress(t) + + userServiceBinary := harness.BuildBinary(t, "userservice", "./user/cmd/userservice") + authsessionBinary := harness.BuildBinary(t, "authsession", "./authsession/cmd/authsession") + gatewayBinary := harness.BuildBinary(t, "gateway", "./gateway/cmd/gateway") + lobbyBinary := harness.BuildBinary(t, "lobby", "./lobby/cmd/lobby") + + userServiceEnv := harness.StartUserServicePersistence(t, redisRuntime.Addr).Env + userServiceEnv["USERSERVICE_LOG_LEVEL"] = "info" + userServiceEnv["USERSERVICE_INTERNAL_HTTP_ADDR"] = userServiceAddr + userServiceEnv["OTEL_TRACES_EXPORTER"] = "none" + userServiceEnv["OTEL_METRICS_EXPORTER"] = "none" + userServiceProcess := harness.StartProcess(t, "userservice", userServiceBinary, userServiceEnv) + waitForUserServiceReady(t, userServiceProcess, "http://"+userServiceAddr) + + authsessionEnv := map[string]string{ + "AUTHSESSION_LOG_LEVEL": "info", + "AUTHSESSION_PUBLIC_HTTP_ADDR": authsessionPublicAddr, + "AUTHSESSION_PUBLIC_HTTP_REQUEST_TIMEOUT": time.Second.String(), + "AUTHSESSION_INTERNAL_HTTP_ADDR": authsessionInternalAddr, + "AUTHSESSION_INTERNAL_HTTP_REQUEST_TIMEOUT": time.Second.String(), + "AUTHSESSION_REDIS_MASTER_ADDR": redisRuntime.Addr, + "AUTHSESSION_REDIS_PASSWORD": "integration", + "AUTHSESSION_USER_SERVICE_MODE": "rest", + "AUTHSESSION_USER_SERVICE_BASE_URL": "http://" + userServiceAddr, + "AUTHSESSION_USER_SERVICE_REQUEST_TIMEOUT": time.Second.String(), + "AUTHSESSION_MAIL_SERVICE_MODE": "rest", + "AUTHSESSION_MAIL_SERVICE_BASE_URL": mailStub.BaseURL(), + "AUTHSESSION_MAIL_SERVICE_REQUEST_TIMEOUT": time.Second.String(), + "AUTHSESSION_REDIS_GATEWAY_SESSION_CACHE_KEY_PREFIX": "gateway:session:", + "AUTHSESSION_REDIS_GATEWAY_SESSION_EVENTS_STREAM": "gateway:session_events", + "OTEL_TRACES_EXPORTER": "none", + "OTEL_METRICS_EXPORTER": "none", + } + authsessionProcess := harness.StartProcess(t, 
"authsession", authsessionBinary, authsessionEnv) + waitForAuthsessionReady(t, authsessionProcess, "http://"+authsessionPublicAddr) + + lobbyEnv := harness.StartLobbyServicePersistence(t, redisRuntime.Addr).Env + lobbyEnv["LOBBY_LOG_LEVEL"] = "info" + lobbyEnv["LOBBY_PUBLIC_HTTP_ADDR"] = lobbyPublicAddr + lobbyEnv["LOBBY_INTERNAL_HTTP_ADDR"] = lobbyInternalAddr + lobbyEnv["LOBBY_USER_SERVICE_BASE_URL"] = "http://" + userServiceAddr + lobbyEnv["LOBBY_GM_BASE_URL"] = mailStub.BaseURL() + lobbyEnv["LOBBY_RUNTIME_JOB_RESULTS_READ_BLOCK_TIMEOUT"] = "200ms" + lobbyEnv["LOBBY_USER_LIFECYCLE_READ_BLOCK_TIMEOUT"] = "200ms" + lobbyEnv["LOBBY_GM_EVENTS_READ_BLOCK_TIMEOUT"] = "200ms" + lobbyEnv["OTEL_TRACES_EXPORTER"] = "none" + lobbyEnv["OTEL_METRICS_EXPORTER"] = "none" + lobbyProcess := harness.StartProcess(t, "lobby", lobbyBinary, lobbyEnv) + harness.WaitForHTTPStatus(t, lobbyProcess, "http://"+lobbyInternalAddr+"/readyz", http.StatusOK) + + gatewayEnv := map[string]string{ + "GATEWAY_LOG_LEVEL": "info", + "GATEWAY_PUBLIC_HTTP_ADDR": gatewayPublicAddr, + "GATEWAY_AUTHENTICATED_GRPC_ADDR": gatewayGRPCAddr, + "GATEWAY_REDIS_MASTER_ADDR": redisRuntime.Addr, + "GATEWAY_REDIS_PASSWORD": "integration", + "GATEWAY_SESSION_CACHE_REDIS_KEY_PREFIX": "gateway:session:", + "GATEWAY_SESSION_EVENTS_REDIS_STREAM": "gateway:session_events", + "GATEWAY_CLIENT_EVENTS_REDIS_STREAM": "gateway:client_events", + "GATEWAY_REPLAY_REDIS_KEY_PREFIX": "gateway:replay:", + "GATEWAY_RESPONSE_SIGNER_PRIVATE_KEY_PEM_PATH": filepath.Clean(responseSignerPath), + "GATEWAY_AUTH_SERVICE_BASE_URL": "http://" + authsessionPublicAddr, + "GATEWAY_USER_SERVICE_BASE_URL": "http://" + userServiceAddr, + "GATEWAY_LOBBY_SERVICE_BASE_URL": "http://" + lobbyPublicAddr, + "GATEWAY_PUBLIC_AUTH_UPSTREAM_TIMEOUT": (500 * time.Millisecond).String(), + "GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_PUBLIC_AUTH_RATE_LIMIT_REQUESTS": "100", + "GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_PUBLIC_AUTH_RATE_LIMIT_WINDOW": "1s", + "GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_PUBLIC_AUTH_RATE_LIMIT_BURST": "100", + "GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_SEND_EMAIL_CODE_IDENTITY_RATE_LIMIT_REQUESTS": "100", + "GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_SEND_EMAIL_CODE_IDENTITY_RATE_LIMIT_WINDOW": "1s", + "GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_SEND_EMAIL_CODE_IDENTITY_RATE_LIMIT_BURST": "100", + "GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_CONFIRM_EMAIL_CODE_IDENTITY_RATE_LIMIT_REQUESTS": "100", + "GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_CONFIRM_EMAIL_CODE_IDENTITY_RATE_LIMIT_WINDOW": "1s", + "GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_CONFIRM_EMAIL_CODE_IDENTITY_RATE_LIMIT_BURST": "100", + "OTEL_TRACES_EXPORTER": "none", + "OTEL_METRICS_EXPORTER": "none", + } + gatewayProcess := harness.StartProcess(t, "gateway", gatewayBinary, gatewayEnv) + harness.WaitForHTTPStatus(t, gatewayProcess, "http://"+gatewayPublicAddr+"/healthz", http.StatusOK) + harness.WaitForTCP(t, gatewayProcess, gatewayGRPCAddr) + + return &lobbyAuthsessionHarness{ + redis: redisClient, + mailStub: mailStub, + authsessionPublicURL: "http://" + authsessionPublicAddr, + authsessionInternalURL: "http://" + authsessionInternalAddr, + gatewayPublicURL: "http://" + gatewayPublicAddr, + gatewayGRPCAddr: gatewayGRPCAddr, + userServiceURL: "http://" + userServiceAddr, + lobbyPublicURL: "http://" + lobbyPublicAddr, + processes: []*harness.Process{userServiceProcess, authsessionProcess, lobbyProcess, gatewayProcess}, + } +} + +// authenticate runs the public-auth flow through the Gateway and +// returns the resulting `device_session_id` plus the resolved user_id. 
+func (h *lobbyAuthsessionHarness) authenticate(t *testing.T, email string, clientKey ed25519.PrivateKey) (string, string) { + t.Helper() + + challengeID := h.sendChallenge(t, email) + code := h.waitForChallengeCode(t, email) + + confirm := h.confirmCode(t, challengeID, code, clientKey) + require.Equalf(t, http.StatusOK, confirm.StatusCode, "confirm: %s", confirm.Body) + + var confirmBody struct { + DeviceSessionID string `json:"device_session_id"` + } + require.NoError(t, decodeStrictJSONPayload([]byte(confirm.Body), &confirmBody)) + require.NotEmpty(t, confirmBody.DeviceSessionID) + + user := h.lookupUserByEmail(t, email) + + deadline := time.Now().Add(5 * time.Second) + for time.Now().Before(deadline) { + if _, err := h.redis.Get(context.Background(), "gateway:session:"+confirmBody.DeviceSessionID).Bytes(); err == nil { + return confirmBody.DeviceSessionID, user.UserID + } + time.Sleep(25 * time.Millisecond) + } + t.Fatalf("gateway session projection for %s never arrived", confirmBody.DeviceSessionID) + return "", "" +} + +func (h *lobbyAuthsessionHarness) sendChallenge(t *testing.T, email string) string { + t.Helper() + resp := postJSON(t, h.gatewayPublicURL+"/api/v1/public/auth/send-email-code", map[string]string{ + "email": email, + }, nil) + require.Equalf(t, http.StatusOK, resp.StatusCode, "send-email-code: %s", resp.Body) + var body struct { + ChallengeID string `json:"challenge_id"` + } + require.NoError(t, decodeStrictJSONPayload([]byte(resp.Body), &body)) + return body.ChallengeID +} + +func (h *lobbyAuthsessionHarness) confirmCode(t *testing.T, challengeID, code string, clientKey ed25519.PrivateKey) httpResponse { + t.Helper() + return postJSON(t, h.gatewayPublicURL+"/api/v1/public/auth/confirm-email-code", map[string]string{ + "challenge_id": challengeID, + "code": code, + "client_public_key": base64.StdEncoding.EncodeToString(clientKey.Public().(ed25519.PublicKey)), + "time_zone": "Europe/Kaliningrad", + }, nil) +} + +func (h *lobbyAuthsessionHarness) waitForChallengeCode(t *testing.T, email string) string { + t.Helper() + deadline := time.Now().Add(5 * time.Second) + for time.Now().Before(deadline) { + for _, delivery := range h.mailStub.RecordedDeliveries() { + if delivery.Email == email && delivery.Code != "" { + return delivery.Code + } + } + time.Sleep(25 * time.Millisecond) + } + t.Fatalf("auth code for %s never arrived", email) + return "" +} + +func (h *lobbyAuthsessionHarness) lookupUserByEmail(t *testing.T, email string) struct { + UserID string `json:"user_id"` +} { + t.Helper() + resp := postJSON(t, h.userServiceURL+"/api/v1/internal/user-lookups/by-email", map[string]string{"email": email}, nil) + require.Equalf(t, http.StatusOK, resp.StatusCode, "user lookup: %s", resp.Body) + var body struct { + User struct { + UserID string `json:"user_id"` + } `json:"user"` + } + require.NoError(t, json.Unmarshal([]byte(resp.Body), &body)) + return struct { + UserID string `json:"user_id"` + }{UserID: body.User.UserID} +} + +// revokeSession calls AuthSession's internal revoke surface for a +// specific device session. The body shape is defined by +// `authsession/api/internal-openapi.yaml#RevokeDeviceSessionRequest`. 
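+//
+// Concretely, the helper below issues (values as used by this suite):
+//
+//	POST /api/v1/internal/sessions/{device_session_id}/revoke
+//	{"reason_code":"test_revocation","actor":{"type":"test","id":"lobbyauthsession-suite"}}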
+func (h *lobbyAuthsessionHarness) revokeSession(t *testing.T, deviceSessionID string) { + t.Helper() + target := h.authsessionInternalURL + "/api/v1/internal/sessions/" + deviceSessionID + "/revoke" + resp := postJSON(t, target, map[string]any{ + "reason_code": "test_revocation", + "actor": map[string]string{ + "type": "test", + "id": "lobbyauthsession-suite", + }, + }, nil) + require.Truef(t, + resp.StatusCode == http.StatusOK || resp.StatusCode == http.StatusNoContent, + "revoke session %s: status=%d body=%s", deviceSessionID, resp.StatusCode, resp.Body, + ) +} + +// waitForSessionGone polls the gateway session cache until the +// session record is removed or marked revoked. +func (h *lobbyAuthsessionHarness) waitForSessionGone(t *testing.T, deviceSessionID string, timeout time.Duration) { + t.Helper() + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + payload, err := h.redis.Get(context.Background(), "gateway:session:"+deviceSessionID).Bytes() + if err == redis.Nil { + return + } + if err == nil { + var record struct { + Status string `json:"status"` + } + if json.Unmarshal(payload, &record) == nil && record.Status != "active" { + return + } + } + time.Sleep(25 * time.Millisecond) + } + t.Fatalf("session %s still active in gateway cache after %s", deviceSessionID, timeout) +} + +func (h *lobbyAuthsessionHarness) dialGateway(t *testing.T) *grpc.ClientConn { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + conn, err := grpc.DialContext(ctx, h.gatewayGRPCAddr, + grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithBlock(), + ) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, conn.Close()) }) + return conn +} + +// --- shared helpers --- + +func newExecuteCommandRequest(deviceSessionID, requestID, messageType string, payload []byte, clientKey ed25519.PrivateKey) *gatewayv1.ExecuteCommandRequest { + payloadHash := contractsgatewayv1.ComputePayloadHash(payload) + request := &gatewayv1.ExecuteCommandRequest{ + ProtocolVersion: contractsgatewayv1.ProtocolVersionV1, + DeviceSessionId: deviceSessionID, + MessageType: messageType, + TimestampMs: time.Now().UnixMilli(), + RequestId: requestID, + PayloadBytes: payload, + PayloadHash: payloadHash, + TraceId: "trace-" + requestID, + } + request.Signature = contractsgatewayv1.SignRequest(clientKey, contractsgatewayv1.RequestSigningFields{ + ProtocolVersion: request.GetProtocolVersion(), + DeviceSessionID: request.GetDeviceSessionId(), + MessageType: request.GetMessageType(), + TimestampMS: request.GetTimestampMs(), + RequestID: request.GetRequestId(), + PayloadHash: request.GetPayloadHash(), + }) + return request +} + +type httpResponse struct { + StatusCode int + Body string + Header http.Header +} + +func postJSON(t *testing.T, url string, body any, header http.Header) httpResponse { + t.Helper() + var reader io.Reader + if body != nil { + payload, err := json.Marshal(body) + require.NoError(t, err) + reader = bytes.NewReader(payload) + } + req, err := http.NewRequest(http.MethodPost, url, reader) + require.NoError(t, err) + if body != nil { + req.Header.Set("Content-Type", "application/json") + } + for k, vs := range header { + for _, v := range vs { + req.Header.Add(k, v) + } + } + return doRequest(t, req) +} + +func doRequest(t *testing.T, request *http.Request) httpResponse { + t.Helper() + client := &http.Client{ + Timeout: 5 * time.Second, + Transport: &http.Transport{DisableKeepAlives: true}, + } + 
t.Cleanup(client.CloseIdleConnections) + + response, err := client.Do(request) + require.NoError(t, err) + defer response.Body.Close() + + payload, err := io.ReadAll(response.Body) + require.NoError(t, err) + return httpResponse{ + StatusCode: response.StatusCode, + Body: string(payload), + Header: response.Header.Clone(), + } +} + +func decodeStrictJSONPayload(payload []byte, target any) error { + decoder := json.NewDecoder(bytes.NewReader(payload)) + decoder.DisallowUnknownFields() + if err := decoder.Decode(target); err != nil { + return err + } + if err := decoder.Decode(&struct{}{}); err != io.EOF { + if err == nil { + return errors.New("unexpected trailing JSON input") + } + return err + } + return nil +} + +func waitForUserServiceReady(t *testing.T, process *harness.Process, baseURL string) { + t.Helper() + client := &http.Client{Timeout: 250 * time.Millisecond} + t.Cleanup(client.CloseIdleConnections) + + deadline := time.Now().Add(10 * time.Second) + for time.Now().Before(deadline) { + req, err := http.NewRequest(http.MethodGet, baseURL+"/api/v1/internal/users/user-readiness-probe/exists", nil) + require.NoError(t, err) + response, err := client.Do(req) + if err == nil { + _, _ = io.Copy(io.Discard, response.Body) + response.Body.Close() + if response.StatusCode == http.StatusOK { + return + } + } + time.Sleep(25 * time.Millisecond) + } + t.Fatalf("wait for userservice readiness: timeout\n%s", process.Logs()) +} + +func waitForAuthsessionReady(t *testing.T, process *harness.Process, baseURL string) { + t.Helper() + // AuthSession's public listener has no /healthz; posting an empty + // email send-email-code request is the cheapest readiness probe. + client := &http.Client{Timeout: 250 * time.Millisecond} + t.Cleanup(client.CloseIdleConnections) + + deadline := time.Now().Add(10 * time.Second) + for time.Now().Before(deadline) { + body := bytes.NewReader([]byte(`{"email":""}`)) + req, err := http.NewRequest(http.MethodPost, baseURL+"/api/v1/public/auth/send-email-code", body) + require.NoError(t, err) + req.Header.Set("Content-Type", "application/json") + response, err := client.Do(req) + if err == nil { + _, _ = io.Copy(io.Discard, response.Body) + response.Body.Close() + if response.StatusCode == http.StatusBadRequest { + return + } + } + time.Sleep(25 * time.Millisecond) + } + t.Fatalf("wait for authsession readiness: timeout\n%s", process.Logs()) +} + +func newClientPrivateKey(label string) ed25519.PrivateKey { + seed := sha256.Sum256([]byte("galaxy-integration-lobby-authsession-client-" + label)) + return ed25519.NewKeyFromSeed(seed[:]) +} diff --git a/integration/lobbyrtm/harness_test.go b/integration/lobbyrtm/harness_test.go new file mode 100644 index 0000000..e09fa08 --- /dev/null +++ b/integration/lobbyrtm/harness_test.go @@ -0,0 +1,747 @@ +// Package lobbyrtm_test exercises the Lobby ↔ Runtime Manager +// boundary against real Lobby + real Runtime Manager + real +// PostgreSQL + real Redis + real Docker daemon running the +// galaxy/game test engine container. It satisfies the inter-service +// requirement spelled out in `TESTING.md §7` and PLAN.md Stage 20. +// +// The boundary contract is: Lobby publishes `runtime:start_jobs` and +// `runtime:stop_jobs` envelopes, RTM consumes them and runs/stops +// engine containers, RTM publishes `runtime:job_results`, Lobby +// transitions the game accordingly. 
The suite asserts only on those +// public surfaces (Lobby/RTM REST, Redis Streams, Docker container +// state); it never imports `*/internal/...` packages of either +// service. +package lobbyrtm_test + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "fmt" + "io" + "maps" + "net/http" + "net/http/httptest" + "os" + "strconv" + "strings" + "sync/atomic" + "testing" + "time" + + "galaxy/integration/internal/harness" + + "github.com/redis/go-redis/v9" + "github.com/stretchr/testify/require" +) + +const ( + defaultEngineVersion = "1.0.0" + missingEngineVersion = "0.0.0-missing" + + startJobsStream = "runtime:start_jobs" + stopJobsStream = "runtime:stop_jobs" + jobResultsStream = "runtime:job_results" + healthEventsStream = "runtime:health_events" + notificationIntentsKey = "notification:intents" + userLifecycleStream = "user:lifecycle_events" + gmEventsStream = "gm:lobby_events" + expectedLobbyProducer = "game_lobby" + notificationImagePulled = "runtime.image_pull_failed" +) + +// suiteSeq scopes per-test stream prefixes so concurrent test +// invocations cannot bleed events into each other. +var suiteSeq atomic.Int64 + +// lobbyRTMHarness owns the per-test infrastructure: containers, +// processes, stream keys, and helper clients. One harness per test +// keeps each scenario fully isolated. +type lobbyRTMHarness struct { + redis *redis.Client + + userServiceURL string + lobbyPublicURL string + lobbyAdminURL string + rtmInternalURL string + + intentsStream string + lifecycleStream string + jobResultsStream string + startJobsStream string + stopJobsStream string + healthEvents string + + gmStub *httptest.Server + + dockerNetwork string + engineImage string + + userServiceProcess *harness.Process + lobbyProcess *harness.Process + rtmProcess *harness.Process +} + +type ensureUserResponse struct { + Outcome string `json:"outcome"` + UserID string `json:"user_id"` +} + +type httpResponse struct { + StatusCode int + Body string + Header http.Header +} + +// newLobbyRTMHarness brings up one independent test environment: +// Postgres containers per service (mirrors `lobbynotification`), one +// Redis container, real binaries for User Service / Lobby / RTM, a +// GM stub that returns 200, a per-test Docker bridge network, and +// the freshly-built `galaxy/game` test image. +func newLobbyRTMHarness(t *testing.T) *lobbyRTMHarness { + t.Helper() + + // Skip the whole suite when Docker is unreachable. The ensure-only + // check runs before any testcontainer is started so the skip path + // kicks in before testcontainers-go tries (and fails) to probe the + // daemon. 
+ harness.RequireDockerDaemon(t) + + redisRuntime := harness.StartRedisContainer(t) + redisClient := redis.NewClient(&redis.Options{ + Addr: redisRuntime.Addr, + Protocol: 2, + DisableIdentity: true, + }) + t.Cleanup(func() { + require.NoError(t, redisClient.Close()) + }) + + gmStub := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(`{}`)) + })) + t.Cleanup(gmStub.Close) + + engineImage := harness.EnsureGalaxyGameImage(t) + dockerNetwork := harness.EnsureDockerNetwork(t) + + userServiceAddr := harness.FreeTCPAddress(t) + lobbyPublicAddr := harness.FreeTCPAddress(t) + lobbyInternalAddr := harness.FreeTCPAddress(t) + rtmInternalAddr := harness.FreeTCPAddress(t) + + userServiceBinary := harness.BuildBinary(t, "userservice", "./user/cmd/userservice") + lobbyBinary := harness.BuildBinary(t, "lobby", "./lobby/cmd/lobby") + rtmBinary := harness.BuildBinary(t, "rtmanager", "./rtmanager/cmd/rtmanager") + + userServiceEnv := harness.StartUserServicePersistence(t, redisRuntime.Addr).Env + userServiceEnv["USERSERVICE_LOG_LEVEL"] = "info" + userServiceEnv["USERSERVICE_INTERNAL_HTTP_ADDR"] = userServiceAddr + userServiceEnv["OTEL_TRACES_EXPORTER"] = "none" + userServiceEnv["OTEL_METRICS_EXPORTER"] = "none" + userServiceProcess := harness.StartProcess(t, "userservice", userServiceBinary, userServiceEnv) + waitForUserServiceReady(t, userServiceProcess, "http://"+userServiceAddr) + + suffix := strconv.FormatInt(suiteSeq.Add(1), 10) + intentsStream := notificationIntentsKey + ":" + suffix + lifecycleStream := userLifecycleStream + ":" + suffix + jobResultsStreamKey := jobResultsStream + ":" + suffix + startJobsStreamKey := startJobsStream + ":" + suffix + stopJobsStreamKey := stopJobsStream + ":" + suffix + healthEventsStreamKey := healthEventsStream + ":" + suffix + gmEventsStreamKey := gmEventsStream + ":" + suffix + + lobbyEnv := harness.StartLobbyServicePersistence(t, redisRuntime.Addr).Env + lobbyEnv["LOBBY_LOG_LEVEL"] = "info" + lobbyEnv["LOBBY_PUBLIC_HTTP_ADDR"] = lobbyPublicAddr + lobbyEnv["LOBBY_INTERNAL_HTTP_ADDR"] = lobbyInternalAddr + lobbyEnv["LOBBY_USER_SERVICE_BASE_URL"] = "http://" + userServiceAddr + lobbyEnv["LOBBY_GM_BASE_URL"] = gmStub.URL + lobbyEnv["LOBBY_NOTIFICATION_INTENTS_STREAM"] = intentsStream + lobbyEnv["LOBBY_USER_LIFECYCLE_STREAM"] = lifecycleStream + lobbyEnv["LOBBY_RUNTIME_JOB_RESULTS_STREAM"] = jobResultsStreamKey + lobbyEnv["LOBBY_RUNTIME_START_JOBS_STREAM"] = startJobsStreamKey + lobbyEnv["LOBBY_RUNTIME_STOP_JOBS_STREAM"] = stopJobsStreamKey + lobbyEnv["LOBBY_GM_EVENTS_STREAM"] = gmEventsStreamKey + lobbyEnv["LOBBY_RUNTIME_JOB_RESULTS_READ_BLOCK_TIMEOUT"] = "200ms" + lobbyEnv["LOBBY_USER_LIFECYCLE_READ_BLOCK_TIMEOUT"] = "200ms" + lobbyEnv["LOBBY_GM_EVENTS_READ_BLOCK_TIMEOUT"] = "200ms" + lobbyEnv["LOBBY_ENGINE_IMAGE_TEMPLATE"] = "galaxy/game:{engine_version}-lobbyrtm-it" + lobbyEnv["OTEL_TRACES_EXPORTER"] = "none" + lobbyEnv["OTEL_METRICS_EXPORTER"] = "none" + lobbyProcess := harness.StartProcess(t, "lobby", lobbyBinary, lobbyEnv) + harness.WaitForHTTPStatus(t, lobbyProcess, "http://"+lobbyInternalAddr+"/readyz", http.StatusOK) + + rtmEnv := harness.StartRTManagerServicePersistence(t, redisRuntime.Addr).Env + rtmEnv["RTMANAGER_LOG_LEVEL"] = "info" + rtmEnv["RTMANAGER_INTERNAL_HTTP_ADDR"] = rtmInternalAddr + rtmEnv["RTMANAGER_LOBBY_INTERNAL_BASE_URL"] = "http://" + lobbyInternalAddr + rtmEnv["RTMANAGER_DOCKER_HOST"] = resolveDockerHost() + rtmEnv["RTMANAGER_DOCKER_NETWORK"] = 
dockerNetwork + // On dev machines and in sandboxes the rtmanager process cannot + // chown the per-game state dir to root (uid 0). Pin the owner to + // the current process uid/gid so `chown` is a no-op. + rtmEnv["RTMANAGER_GAME_STATE_OWNER_UID"] = strconv.Itoa(os.Getuid()) + rtmEnv["RTMANAGER_GAME_STATE_OWNER_GID"] = strconv.Itoa(os.Getgid()) + rtmEnv["RTMANAGER_GAME_STATE_ROOT"] = t.TempDir() + rtmEnv["RTMANAGER_REDIS_START_JOBS_STREAM"] = startJobsStreamKey + rtmEnv["RTMANAGER_REDIS_STOP_JOBS_STREAM"] = stopJobsStreamKey + rtmEnv["RTMANAGER_REDIS_JOB_RESULTS_STREAM"] = jobResultsStreamKey + rtmEnv["RTMANAGER_REDIS_HEALTH_EVENTS_STREAM"] = healthEventsStreamKey + rtmEnv["RTMANAGER_NOTIFICATION_INTENTS_STREAM"] = intentsStream + rtmEnv["RTMANAGER_STREAM_BLOCK_TIMEOUT"] = "200ms" + rtmEnv["RTMANAGER_RECONCILE_INTERVAL"] = "1s" + rtmEnv["RTMANAGER_CLEANUP_INTERVAL"] = "1s" + rtmEnv["RTMANAGER_INSPECT_INTERVAL"] = "1s" + rtmEnv["RTMANAGER_PROBE_INTERVAL"] = "1s" + rtmEnv["RTMANAGER_PROBE_TIMEOUT"] = "1s" + rtmEnv["RTMANAGER_PROBE_FAILURES_THRESHOLD"] = "3" + rtmEnv["RTMANAGER_GAME_LEASE_TTL_SECONDS"] = "10" + rtmEnv["OTEL_TRACES_EXPORTER"] = "none" + rtmEnv["OTEL_METRICS_EXPORTER"] = "none" + rtmProcess := harness.StartProcess(t, "rtmanager", rtmBinary, rtmEnv) + harness.WaitForHTTPStatus(t, rtmProcess, "http://"+rtmInternalAddr+"/readyz", http.StatusOK) + + return &lobbyRTMHarness{ + redis: redisClient, + userServiceURL: "http://" + userServiceAddr, + lobbyPublicURL: "http://" + lobbyPublicAddr, + lobbyAdminURL: "http://" + lobbyInternalAddr, + rtmInternalURL: "http://" + rtmInternalAddr, + intentsStream: intentsStream, + lifecycleStream: lifecycleStream, + jobResultsStream: jobResultsStreamKey, + startJobsStream: startJobsStreamKey, + stopJobsStream: stopJobsStreamKey, + healthEvents: healthEventsStreamKey, + gmStub: gmStub, + dockerNetwork: dockerNetwork, + engineImage: engineImage, + userServiceProcess: userServiceProcess, + lobbyProcess: lobbyProcess, + rtmProcess: rtmProcess, + } +} + +// ensureUser provisions a fresh User Service account by email and +// returns the assigned user_id. The email pattern includes the test +// name to avoid collisions across concurrent tests sharing the +// container. +func (h *lobbyRTMHarness) ensureUser(t *testing.T, email string) ensureUserResponse { + t.Helper() + resp := postJSON(t, h.userServiceURL+"/api/v1/internal/users/ensure-by-email", map[string]any{ + "email": email, + "registration_context": map[string]string{ + "preferred_language": "en", + "time_zone": "Europe/Kaliningrad", + }, + }, nil) + var out ensureUserResponse + requireJSONStatus(t, resp, http.StatusOK, &out) + require.Equal(t, "created", out.Outcome) + require.NotEmpty(t, out.UserID) + return out +} + +// userCreatePrivateGame creates a private game owned by ownerUserID +// with the supplied target engine version. Returns the assigned +// game_id. 
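+//
+// A minimal usage sketch (emails and names are illustrative):
+//
+//	owner := h.ensureUser(t, "owner+"+t.Name()+"@example.com")
+//	gameID := h.userCreatePrivateGame(t, owner.UserID, "smoke-game",
+//		defaultEngineVersion, time.Now().Add(48*time.Hour).Unix())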
+func (h *lobbyRTMHarness) userCreatePrivateGame( + t *testing.T, + ownerUserID, name, targetEngineVersion string, + enrollmentEndsAt int64, +) string { + t.Helper() + resp := postJSON(t, h.lobbyPublicURL+"/api/v1/lobby/games", map[string]any{ + "game_name": name, + "game_type": "private", + "min_players": 1, + "max_players": 4, + "start_gap_hours": 6, + "start_gap_players": 1, + "enrollment_ends_at": enrollmentEndsAt, + "turn_schedule": "0 18 * * *", + "target_engine_version": targetEngineVersion, + }, http.Header{"X-User-Id": []string{ownerUserID}}) + require.Equalf(t, http.StatusCreated, resp.StatusCode, "create private game: %s", resp.Body) + var record map[string]any + require.NoError(t, json.Unmarshal([]byte(resp.Body), &record)) + gameID, ok := record["game_id"].(string) + require.Truef(t, ok, "game_id missing: %s", resp.Body) + return gameID +} + +func (h *lobbyRTMHarness) userOpenEnrollment(t *testing.T, ownerUserID, gameID string) { + t.Helper() + resp := postJSON(t, + h.lobbyPublicURL+"/api/v1/lobby/games/"+gameID+"/open-enrollment", + nil, + http.Header{"X-User-Id": []string{ownerUserID}}, + ) + require.Equalf(t, http.StatusOK, resp.StatusCode, "user open enrollment: %s", resp.Body) +} + +func (h *lobbyRTMHarness) userCreateInvite(t *testing.T, ownerUserID, gameID, inviteeUserID string) { + t.Helper() + resp := postJSON(t, + h.lobbyPublicURL+"/api/v1/lobby/games/"+gameID+"/invites", + map[string]any{"invitee_user_id": inviteeUserID}, + http.Header{"X-User-Id": []string{ownerUserID}}, + ) + require.Equalf(t, http.StatusCreated, resp.StatusCode, "create invite: %s", resp.Body) +} + +func (h *lobbyRTMHarness) firstCreatedInviteID(t *testing.T, inviteeUserID, gameID string) string { + t.Helper() + req, err := http.NewRequest(http.MethodGet, + h.lobbyPublicURL+"/api/v1/lobby/my/invites?status=created", nil) + require.NoError(t, err) + req.Header.Set("X-User-Id", inviteeUserID) + resp := doRequest(t, req) + require.Equalf(t, http.StatusOK, resp.StatusCode, "list my invites: %s", resp.Body) + + var body struct { + Items []struct { + InviteID string `json:"invite_id"` + GameID string `json:"game_id"` + } `json:"items"` + } + require.NoError(t, json.Unmarshal([]byte(resp.Body), &body)) + for _, item := range body.Items { + if item.GameID == gameID { + return item.InviteID + } + } + t.Fatalf("no invite found for invitee %s on game %s; body=%s", inviteeUserID, gameID, resp.Body) + return "" +} + +func (h *lobbyRTMHarness) userRedeemInvite(t *testing.T, inviteeUserID, gameID, inviteID, raceName string) { + t.Helper() + resp := postJSON(t, + h.lobbyPublicURL+"/api/v1/lobby/games/"+gameID+"/invites/"+inviteID+"/redeem", + map[string]any{"race_name": raceName}, + http.Header{"X-User-Id": []string{inviteeUserID}}, + ) + require.Equalf(t, http.StatusOK, resp.StatusCode, "redeem invite: %s", resp.Body) +} + +func (h *lobbyRTMHarness) userReadyToStart(t *testing.T, ownerUserID, gameID string) { + t.Helper() + resp := postJSON(t, + h.lobbyPublicURL+"/api/v1/lobby/games/"+gameID+"/ready-to-start", + nil, + http.Header{"X-User-Id": []string{ownerUserID}}, + ) + require.Equalf(t, http.StatusOK, resp.StatusCode, "ready-to-start: %s", resp.Body) +} + +func (h *lobbyRTMHarness) userStartGame(t *testing.T, ownerUserID, gameID string) { + t.Helper() + resp := postJSON(t, + h.lobbyPublicURL+"/api/v1/lobby/games/"+gameID+"/start", + nil, + http.Header{"X-User-Id": []string{ownerUserID}}, + ) + require.Equalf(t, http.StatusOK, resp.StatusCode, "user start: %s", resp.Body) +} + +// prepareInflightGame walks 
one private game from creation through +// `start`. For the happy and cancel scenarios the game subsequently +// reaches `running` once RTM publishes the success job_result; for +// the failure scenario it ends in `start_failed`. +// +// Returns owner and invitee user records plus the game id. +func (h *lobbyRTMHarness) prepareInflightGame( + t *testing.T, + ownerEmail, inviteeEmail, gameName, targetEngineVersion string, +) (owner, invitee ensureUserResponse, gameID string) { + t.Helper() + owner = h.ensureUser(t, ownerEmail) + invitee = h.ensureUser(t, inviteeEmail) + + gameID = h.userCreatePrivateGame(t, owner.UserID, gameName, targetEngineVersion, + time.Now().Add(48*time.Hour).Unix()) + h.userOpenEnrollment(t, owner.UserID, gameID) + h.userCreateInvite(t, owner.UserID, gameID, invitee.UserID) + inviteID := h.firstCreatedInviteID(t, invitee.UserID, gameID) + h.userRedeemInvite(t, invitee.UserID, gameID, inviteID, "PilotInvitee") + h.userReadyToStart(t, owner.UserID, gameID) + h.userStartGame(t, owner.UserID, gameID) + return owner, invitee, gameID +} + +// gameStatus reads one game record off Lobby's internal API and +// returns its status field. Used by waitGameStatus and direct +// assertions. +func (h *lobbyRTMHarness) gameStatus(t *testing.T, gameID string) string { + t.Helper() + req, err := http.NewRequest(http.MethodGet, + h.lobbyAdminURL+"/api/v1/internal/games/"+gameID, nil) + require.NoError(t, err) + resp := doRequest(t, req) + if resp.StatusCode != http.StatusOK { + t.Fatalf("get game internal: status=%d body=%s", resp.StatusCode, resp.Body) + } + var record struct { + Status string `json:"status"` + } + require.NoError(t, json.Unmarshal([]byte(resp.Body), &record)) + return record.Status +} + +// waitGameStatus polls `GET /api/v1/internal/games/{gameID}` until +// the record reports the expected status or the timeout fires. +func (h *lobbyRTMHarness) waitGameStatus(t *testing.T, gameID, want string, timeout time.Duration) { + t.Helper() + deadline := time.Now().Add(timeout) + for { + got := h.gameStatus(t, gameID) + if got == want { + return + } + if time.Now().After(deadline) { + t.Fatalf("game %s status: want %q got %q (after %s)", gameID, want, got, timeout) + } + time.Sleep(150 * time.Millisecond) + } +} + +// publishUserLifecycleEvent appends one event to the per-test +// `user:lifecycle_events` stream. The Lobby userlifecycle worker +// consumes the same stream. +func (h *lobbyRTMHarness) publishUserLifecycleEvent(t *testing.T, eventType, userID string) { + t.Helper() + _, err := h.redis.XAdd(context.Background(), &redis.XAddArgs{ + Stream: h.lifecycleStream, + Values: map[string]any{ + "event_type": eventType, + "user_id": userID, + "occurred_at_ms": strconv.FormatInt(time.Now().UnixMilli(), 10), + "source": "user_admin", + "actor_type": "admin", + "actor_id": "admin-1", + "reason_code": "terminal_policy_violation", + }, + }).Result() + require.NoError(t, err) +} + +// jobResultEntry decodes one `runtime:job_results` Redis Stream entry. +type jobResultEntry struct { + StreamID string + GameID string + Outcome string + ContainerID string + EngineEndpoint string + ErrorCode string + ErrorMessage string +} + +// stopJobEntry decodes one `runtime:stop_jobs` Redis Stream entry as +// published by Lobby. +type stopJobEntry struct { + StreamID string + GameID string + Reason string +} + +// notificationIntentEntry decodes one `notification:intents` entry. 
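+//
+// For illustration, a decoded image-pull-failure entry looks roughly like
+// the following (values are examples; StreamID is whatever Redis assigned,
+// and the producer marker is RTM's to choose):
+//
+//	notificationIntentEntry{
+//		StreamID:         "1712345678901-0",
+//		NotificationType: "runtime.image_pull_failed",
+//		Producer:         "runtime_manager",
+//		Payload:          map[string]any{"game_id": "<uuid>", "image_ref": "galaxy/game:<ver>", "error_code": "image_pull_failed"},
+//	}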
+type notificationIntentEntry struct { + StreamID string + NotificationType string + Producer string + Payload map[string]any +} + +// allJobResults returns every entry on the per-test job_results +// stream in stream order. +func (h *lobbyRTMHarness) allJobResults(t *testing.T) []jobResultEntry { + t.Helper() + entries, err := h.redis.XRange(context.Background(), h.jobResultsStream, "-", "+").Result() + require.NoError(t, err) + out := make([]jobResultEntry, 0, len(entries)) + for _, entry := range entries { + out = append(out, jobResultEntry{ + StreamID: entry.ID, + GameID: streamString(entry.Values, "game_id"), + Outcome: streamString(entry.Values, "outcome"), + ContainerID: streamString(entry.Values, "container_id"), + EngineEndpoint: streamString(entry.Values, "engine_endpoint"), + ErrorCode: streamString(entry.Values, "error_code"), + ErrorMessage: streamString(entry.Values, "error_message"), + }) + } + return out +} + +// waitJobResult polls the per-test job_results stream until predicate +// matches one entry, or the timeout fires. +func (h *lobbyRTMHarness) waitJobResult( + t *testing.T, + predicate func(jobResultEntry) bool, + timeout time.Duration, +) jobResultEntry { + t.Helper() + deadline := time.Now().Add(timeout) + for { + entries := h.allJobResults(t) + for _, entry := range entries { + if predicate(entry) { + return entry + } + } + if time.Now().After(deadline) { + t.Fatalf("no job_result matched within %s; observed=%+v", timeout, entries) + } + time.Sleep(150 * time.Millisecond) + } +} + +// allStopJobs returns every entry on the per-test stop_jobs stream. +func (h *lobbyRTMHarness) allStopJobs(t *testing.T) []stopJobEntry { + t.Helper() + entries, err := h.redis.XRange(context.Background(), h.stopJobsStream, "-", "+").Result() + require.NoError(t, err) + out := make([]stopJobEntry, 0, len(entries)) + for _, entry := range entries { + out = append(out, stopJobEntry{ + StreamID: entry.ID, + GameID: streamString(entry.Values, "game_id"), + Reason: streamString(entry.Values, "reason"), + }) + } + return out +} + +// waitStopJobReason polls the stop_jobs stream until an entry for +// gameID with the expected reason appears. +func (h *lobbyRTMHarness) waitStopJobReason(t *testing.T, gameID, reason string, timeout time.Duration) stopJobEntry { + t.Helper() + deadline := time.Now().Add(timeout) + for { + for _, entry := range h.allStopJobs(t) { + if entry.GameID == gameID && entry.Reason == reason { + return entry + } + } + if time.Now().After(deadline) { + t.Fatalf("no stop_job for game %s with reason %q within %s", gameID, reason, timeout) + } + time.Sleep(150 * time.Millisecond) + } +} + +// allNotificationIntents returns every entry on the per-test +// notification:intents stream. +func (h *lobbyRTMHarness) allNotificationIntents(t *testing.T) []notificationIntentEntry { + t.Helper() + entries, err := h.redis.XRange(context.Background(), h.intentsStream, "-", "+").Result() + require.NoError(t, err) + out := make([]notificationIntentEntry, 0, len(entries)) + for _, entry := range entries { + decoded := notificationIntentEntry{ + StreamID: entry.ID, + NotificationType: streamString(entry.Values, "notification_type"), + Producer: streamString(entry.Values, "producer"), + } + // `pkg/notificationintent` publishes the payload under the + // field name `payload_json`. Older versions of this harness + // looked for `payload` and silently produced an empty Payload + // map, which made every predicate that checks `Payload["…"]` + // fall through. 
Read both field names for forward compat. + raw := streamString(entry.Values, "payload_json") + if raw == "" { + raw = streamString(entry.Values, "payload") + } + if raw != "" { + var parsed map[string]any + if err := json.Unmarshal([]byte(raw), &parsed); err == nil { + decoded.Payload = parsed + } + } + out = append(out, decoded) + } + return out +} + +// waitNotificationIntent polls the intents stream until the +// predicate matches. +func (h *lobbyRTMHarness) waitNotificationIntent( + t *testing.T, + predicate func(notificationIntentEntry) bool, + timeout time.Duration, +) notificationIntentEntry { + t.Helper() + deadline := time.Now().Add(timeout) + for { + entries := h.allNotificationIntents(t) + for _, entry := range entries { + if predicate(entry) { + return entry + } + } + if time.Now().After(deadline) { + summary := make([]string, 0, len(entries)) + for _, entry := range entries { + summary = append(summary, entry.NotificationType+":"+entry.Producer) + } + t.Fatalf("no notification_intent matched within %s; observed=%v", timeout, summary) + } + time.Sleep(150 * time.Millisecond) + } +} + +// rtmRuntimeStatus issues `GET /api/v1/internal/runtimes/{gameID}` +// against RTM and returns the persisted runtime record's status, or +// the empty string when RTM responds 404. +func (h *lobbyRTMHarness) rtmRuntimeStatus(t *testing.T, gameID string) (string, int) { + t.Helper() + req, err := http.NewRequest(http.MethodGet, + h.rtmInternalURL+"/api/v1/internal/runtimes/"+gameID, nil) + require.NoError(t, err) + resp := doRequest(t, req) + if resp.StatusCode == http.StatusNotFound { + return "", resp.StatusCode + } + if resp.StatusCode != http.StatusOK { + t.Fatalf("rtm get runtime: status=%d body=%s", resp.StatusCode, resp.Body) + } + var record struct { + Status string `json:"status"` + } + require.NoError(t, json.Unmarshal([]byte(resp.Body), &record)) + return record.Status, resp.StatusCode +} + +// waitRTMRuntimeStatus polls RTM until the runtime record reports +// the expected status or the timeout fires. +func (h *lobbyRTMHarness) waitRTMRuntimeStatus(t *testing.T, gameID, want string, timeout time.Duration) { + t.Helper() + deadline := time.Now().Add(timeout) + for { + status, code := h.rtmRuntimeStatus(t, gameID) + if status == want { + return + } + if time.Now().After(deadline) { + t.Fatalf("rtm runtime status for %s: want %q got %q (http %d) within %s", + gameID, want, status, code, timeout) + } + time.Sleep(150 * time.Millisecond) + } +} + +// streamString reads a Redis Streams field as a string regardless of +// the underlying go-redis decoded type. 
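+//
+// A minimal usage sketch (go-redis normally yields strings; the other
+// branches keep the helper total over whatever the client decodes):
+//
+//	streamString(map[string]any{"outcome": "success"}, "outcome") // "success"
+//	streamString(map[string]any{"attempt": int64(2)}, "attempt")  // "2"
+//	streamString(map[string]any{}, "missing")                     // ""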
+func streamString(values map[string]any, key string) string { + raw, ok := values[key] + if !ok { + return "" + } + switch typed := raw.(type) { + case string: + return typed + case []byte: + return string(typed) + default: + return fmt.Sprintf("%v", typed) + } +} + +func waitForUserServiceReady(t *testing.T, process *harness.Process, baseURL string) { + t.Helper() + client := &http.Client{Timeout: 250 * time.Millisecond} + t.Cleanup(client.CloseIdleConnections) + + deadline := time.Now().Add(10 * time.Second) + for time.Now().Before(deadline) { + req, err := http.NewRequest(http.MethodGet, + baseURL+"/api/v1/internal/users/user-readiness-probe/exists", nil) + require.NoError(t, err) + response, err := client.Do(req) + if err == nil { + _, _ = io.Copy(io.Discard, response.Body) + response.Body.Close() + if response.StatusCode == http.StatusOK { + return + } + } + time.Sleep(25 * time.Millisecond) + } + t.Fatalf("wait for userservice readiness: timeout\n%s", process.Logs()) +} + +func postJSON(t *testing.T, url string, body any, header http.Header) httpResponse { + t.Helper() + var reader io.Reader + if body != nil { + payload, err := json.Marshal(body) + require.NoError(t, err) + reader = bytes.NewReader(payload) + } + req, err := http.NewRequest(http.MethodPost, url, reader) + require.NoError(t, err) + if body != nil { + req.Header.Set("Content-Type", "application/json") + } + maps.Copy(req.Header, header) + return doRequest(t, req) +} + +func doRequest(t *testing.T, request *http.Request) httpResponse { + t.Helper() + client := &http.Client{ + Timeout: 5 * time.Second, + Transport: &http.Transport{DisableKeepAlives: true}, + } + t.Cleanup(client.CloseIdleConnections) + + response, err := client.Do(request) + require.NoError(t, err) + defer response.Body.Close() + + payload, err := io.ReadAll(response.Body) + require.NoError(t, err) + return httpResponse{ + StatusCode: response.StatusCode, + Body: string(payload), + Header: response.Header.Clone(), + } +} + +func requireJSONStatus(t *testing.T, response httpResponse, wantStatus int, target any) { + t.Helper() + require.Equalf(t, wantStatus, response.StatusCode, "unexpected status, body=%s", response.Body) + if target != nil { + require.NoError(t, decodeStrictJSON([]byte(response.Body), target)) + } +} + +func decodeStrictJSON(payload []byte, target any) error { + decoder := json.NewDecoder(bytes.NewReader(payload)) + decoder.DisallowUnknownFields() + if err := decoder.Decode(target); err != nil { + return err + } + if err := decoder.Decode(&struct{}{}); err != io.EOF { + if err == nil { + return errors.New("unexpected trailing JSON input") + } + return err + } + return nil +} + +// resolveDockerHost honours DOCKER_HOST when the developer machine +// routes through colima or a remote daemon, falling back to the +// standard unix path otherwise. 
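+//
+// For example, a colima user typically has something like
+// DOCKER_HOST=unix://$HOME/.colima/default/docker.sock exported, and that
+// value is returned verbatim; with DOCKER_HOST unset the default
+// unix:///var/run/docker.sock is used.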
+func resolveDockerHost() string { + if host := strings.TrimSpace(os.Getenv("DOCKER_HOST")); host != "" { + return host + } + return "unix:///var/run/docker.sock" +} + diff --git a/integration/lobbyrtm/lobby_rtm_test.go b/integration/lobbyrtm/lobby_rtm_test.go new file mode 100644 index 0000000..6f09bc4 --- /dev/null +++ b/integration/lobbyrtm/lobby_rtm_test.go @@ -0,0 +1,204 @@ +package lobbyrtm_test + +import ( + "net/http" + "strings" + "testing" + "time" + + "galaxy/integration/internal/harness" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +const ( + jobOutcomeSuccess = "success" + jobOutcomeFailure = "failure" + + stopReasonCancelled = "cancelled" + + errorCodeImagePullFailed = "image_pull_failed" +) + +// TestStartFlowSucceedsWithRealEngine drives the happy path: +// Lobby creates a private game, the owner walks it through enrollment +// to start, Lobby publishes a `runtime:start_jobs` envelope with the +// resolved `image_ref`, RTM starts a real `galaxy/game` engine +// container, publishes a success `runtime:job_results` entry, and +// Lobby's runtimejobresult worker transitions the game to `running`. +// The test then hits the engine's `/healthz` endpoint directly via +// the bridge network IP, proving the container is alive end-to-end. +func TestStartFlowSucceedsWithRealEngine(t *testing.T) { + h := newLobbyRTMHarness(t) + + owner, _, gameID := h.prepareInflightGame(t, + "start-owner@example.com", + "start-invitee@example.com", + "Start Galaxy", + defaultEngineVersion, + ) + t.Logf("owner=%s game=%s", owner.UserID, gameID) + + // RTM publishes a success job_result for the start envelope. + startResult := h.waitJobResult(t, func(entry jobResultEntry) bool { + return entry.GameID == gameID && entry.Outcome == jobOutcomeSuccess + }, 90*time.Second) + require.Empty(t, startResult.ErrorCode, "happy path must publish empty error_code") + require.NotEmpty(t, startResult.ContainerID, "happy path must carry a container id") + require.NotEmpty(t, startResult.EngineEndpoint, "happy path must carry an engine endpoint") + + // Lobby's runtime-job-result worker drives the game to `running`. + h.waitGameStatus(t, gameID, "running", 30*time.Second) + + // RTM persists the runtime record and exposes it through REST. + h.waitRTMRuntimeStatus(t, gameID, "running", 15*time.Second) + + // A real engine container exists with the expected labels. + containerID := harness.FindContainerIDByLabel(t, gameID) + require.NotEmptyf(t, containerID, "no engine container found for game %s", gameID) + require.Equal(t, startResult.ContainerID, containerID, + "job_result container_id must match the live container") + require.Equal(t, "running", harness.ContainerState(t, containerID)) + + // The engine answers /healthz on the bridge network IP. + ip := harness.ContainerNetworkIP(t, containerID, h.dockerNetwork) + require.NotEmptyf(t, ip, "engine container %s has no IP on network %s", containerID, h.dockerNetwork) + harness.WaitForEngineHealthz(t, ip, 15*time.Second) +} + +// TestRunningGameStopsWhenOwnerCascadeBlocked drives the stop path: +// drive the same game to `running`, publish a +// `user.lifecycle.permanent_blocked` event for the owner, the Lobby +// userlifecycle worker cascades to the inflight game, publishes a +// `runtime:stop_jobs` envelope with `reason=cancelled`, and RTM stops +// the engine. The test asserts on the public boundary surfaces only. 
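+//
+// Rough shape of the cascade the assertions below follow:
+//
+//	user.lifecycle.permanent_blocked(owner)
+//	  -> Lobby userlifecycle worker
+//	  -> runtime:stop_jobs entry (reason=cancelled) + game status "cancelled"
+//	  -> RTM stops the engine container, runtime status "stopped"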
+func TestRunningGameStopsWhenOwnerCascadeBlocked(t *testing.T) { + h := newLobbyRTMHarness(t) + + owner, _, gameID := h.prepareInflightGame(t, + "stop-owner@example.com", + "stop-invitee@example.com", + "Stop Galaxy", + defaultEngineVersion, + ) + t.Logf("owner=%s game=%s", owner.UserID, gameID) + + // Wait for the start outcome so we know RTM is fully running + // before we trigger the cascade. + h.waitJobResult(t, func(entry jobResultEntry) bool { + return entry.GameID == gameID && entry.Outcome == jobOutcomeSuccess + }, 90*time.Second) + h.waitGameStatus(t, gameID, "running", 30*time.Second) + containerID := harness.FindContainerIDByLabel(t, gameID) + require.NotEmpty(t, containerID) + + // Trigger the cascade: permanent block on the game owner causes + // Lobby's userlifecycle worker to publish stop_job(cancelled) and + // transition the owned game to `cancelled`. + h.publishUserLifecycleEvent(t, "user.lifecycle.permanent_blocked", owner.UserID) + + // Lobby observably publishes the right stop envelope on the boundary. + stop := h.waitStopJobReason(t, gameID, stopReasonCancelled, 30*time.Second) + assert.Equal(t, gameID, stop.GameID) + + // Lobby moves the game to cancelled. + h.waitGameStatus(t, gameID, "cancelled", 30*time.Second) + + // RTM consumes stop_job, stops the engine, and persists status=stopped. + h.waitRTMRuntimeStatus(t, gameID, "stopped", 30*time.Second) + + // The container is no longer running. Docker reports `exited` + // (or `created`/`removing` during teardown); none of those match + // `running`, which is the only state that contradicts a successful + // stop. + require.Eventuallyf(t, func() bool { + state := harness.ContainerState(t, containerID) + return state != "running" + }, 30*time.Second, 250*time.Millisecond, + "engine container %s did not leave running state", containerID) + + // RTM emitted at least two job_results for this game: one success + // for the start, one success for the stop. + successCount := 0 + for _, entry := range h.allJobResults(t) { + if entry.GameID == gameID && entry.Outcome == jobOutcomeSuccess { + successCount++ + } + } + assert.GreaterOrEqualf(t, successCount, 2, + "expected at least two success job_results (start + stop) for game %s", gameID) +} + +// TestStartFailsWhenImageMissing drives the failure path: the game's +// `target_engine_version` resolves to a non-existent image tag, RTM +// fails to pull, publishes a failure `runtime:job_results` plus a +// `runtime.image_pull_failed` notification intent, and Lobby's +// runtimejobresult worker transitions the game to `start_failed`. +func TestStartFailsWhenImageMissing(t *testing.T) { + h := newLobbyRTMHarness(t) + + owner, _, gameID := h.prepareInflightGame(t, + "fail-owner@example.com", + "fail-invitee@example.com", + "Fail Galaxy", + missingEngineVersion, + ) + t.Logf("owner=%s game=%s", owner.UserID, gameID) + + expectedImageRef := "galaxy/game:" + missingEngineVersion + "-lobbyrtm-it" + + // RTM publishes a failure job_result with the stable code. + failure := h.waitJobResult(t, func(entry jobResultEntry) bool { + return entry.GameID == gameID && entry.Outcome == jobOutcomeFailure + }, 120*time.Second) + assert.Equal(t, errorCodeImagePullFailed, failure.ErrorCode) + assert.Empty(t, failure.ContainerID) + assert.Empty(t, failure.EngineEndpoint) + assert.NotEmpty(t, failure.ErrorMessage) + + // RTM also publishes an admin notification intent on the shared stream. 
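+	// The predicate keys off the decoded payload, i.e. an entry shaped
+	// roughly like (illustrative; ids and message text vary per run):
+	//
+	//	notification_type = "runtime.image_pull_failed"
+	//	payload_json      = {"game_id":"<gameID>","image_ref":"<expectedImageRef>","error_code":"image_pull_failed",...}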
+ intent := h.waitNotificationIntent(t, func(entry notificationIntentEntry) bool { + if entry.NotificationType != notificationImagePulled { + return false + } + payloadGameID, _ := entry.Payload["game_id"].(string) + return payloadGameID == gameID + }, 30*time.Second) + require.NotNil(t, intent.Payload) + assert.Equal(t, gameID, intent.Payload["game_id"]) + assert.Equal(t, expectedImageRef, intent.Payload["image_ref"]) + assert.Equal(t, errorCodeImagePullFailed, intent.Payload["error_code"]) + + // Lobby flips the game to start_failed. + h.waitGameStatus(t, gameID, "start_failed", 60*time.Second) + + // No engine container should exist for this game. + containerID := harness.FindContainerIDByLabel(t, gameID) + if containerID != "" { + state := harness.ContainerState(t, containerID) + assert.NotEqual(t, "running", state, + "failed image pull must not leave a running container behind (state=%s)", state) + } + + // RTM either has no record (clean rollback) or has one not in + // `running`. Either is acceptable per the start service contract. + status, code := h.rtmRuntimeStatus(t, gameID) + switch code { + case http.StatusNotFound: + // nothing persisted — clean rollback path + case http.StatusOK: + assert.NotEqual(t, "running", status, + "failed image pull must not persist a running record") + default: + t.Fatalf("unexpected RTM runtime response: status=%q code=%d", status, code) + } + + // Sanity check the notification carried RTM's producer marker + // rather than Lobby's, so we know the suite truly observed RTM + // publishing on the shared stream. + assert.Truef(t, + strings.Contains(intent.Producer, "rtm") || + strings.Contains(intent.Producer, "runtime"), + "image_pull_failed intent producer should be RTM-flavoured, got %q", intent.Producer) +} diff --git a/integration/lobbyrtmnotification/lobby_rtm_notification_test.go b/integration/lobbyrtmnotification/lobby_rtm_notification_test.go new file mode 100644 index 0000000..7f17672 --- /dev/null +++ b/integration/lobbyrtmnotification/lobby_rtm_notification_test.go @@ -0,0 +1,664 @@ +// Package lobbyrtmnotification_test exercises the failure-with- +// notification path that crosses three real services at once: Lobby +// publishes a start job, Runtime Manager fails to pull the engine +// image, RTM publishes both a failure `runtime:job_results` envelope +// AND a `runtime.image_pull_failed` admin notification intent on +// `notification:intents`. The Notification Service consumes the intent +// and routes it to Mail Service, where the resulting delivery is +// observable on the public list-deliveries surface. +// +// The suite proves the same Redis bus carries both flows correctly +// when all three services are booted together — the union of +// `integration/lobbyrtm` (which uses a stub notification) and +// `integration/rtmanagernotification` (which has no Lobby). 
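+//
+// Rough message flow exercised by the suite (all streams per-test suffixed):
+//
+//	Lobby --runtime:start_jobs--> RTM
+//	RTM   --runtime:job_results(failure)--> Lobby   => game status "start_failed"
+//	RTM   --notification:intents(runtime.image_pull_failed)--> Notification --> Mail delivery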
+package lobbyrtmnotification_test + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "io" + "net/http" + "net/url" + "os" + "path/filepath" + "runtime" + "strconv" + "strings" + "sync/atomic" + "testing" + "time" + + "galaxy/integration/internal/harness" + + "github.com/redis/go-redis/v9" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +const ( + notificationIntentsStream = "notification:intents" + startJobsStream = "runtime:start_jobs" + stopJobsStream = "runtime:stop_jobs" + jobResultsStream = "runtime:job_results" + healthEventsStream = "runtime:health_events" + userLifecycleStream = "user:lifecycle_events" + gmEventsStream = "gm:lobby_events" + mailDeliveriesPath = "/api/v1/internal/deliveries" + notificationImagePulled = "runtime.image_pull_failed" + missingEngineVersion = "0.0.0-missing" + adminEmailRecipient = "rtm-admin@example.com" +) + +var suiteSeq atomic.Int64 + +// TestImagePullFailureReachesMailThroughNotification drives Lobby + +// RTM + Notification + Mail end-to-end. Lobby publishes a start job +// for an unresolvable image; RTM fails the pull and publishes both a +// failure job_result (consumed by Lobby) and a notification intent +// (consumed by Notification, then routed to Mail). +func TestImagePullFailureReachesMailThroughNotification(t *testing.T) { + h := newTripleHarness(t) + + owner := h.ensureUser(t, "triple-owner@example.com") + invitee := h.ensureUser(t, "triple-invitee@example.com") + gameID := h.adminCreatePrivateGameForOwner(t, owner.UserID, "Triple Galaxy", + time.Now().Add(48*time.Hour).Unix(), missingEngineVersion) + h.userOpenEnrollment(t, owner.UserID, gameID) + h.userCreateInvite(t, owner.UserID, gameID, invitee.UserID) + inviteID := h.firstCreatedInviteID(t, invitee.UserID, gameID) + h.userRedeemInvite(t, invitee.UserID, gameID, inviteID, "PilotTriple") + h.userReadyToStart(t, owner.UserID, gameID) + h.userStartGame(t, owner.UserID, gameID) + t.Logf("triple harness gameID=%s ownerUserID=%s", gameID, owner.UserID) + + expectedImageRef := "galaxy/game:" + missingEngineVersion + "-tripleit" + + // 1. RTM publishes a failure job_result on `runtime:job_results`. + failure := h.waitJobResult(t, func(entry jobResultEntry) bool { + return entry.GameID == gameID && entry.Outcome == "failure" + }, 120*time.Second) + assert.Equal(t, "image_pull_failed", failure.ErrorCode) + + // 2. RTM publishes an admin notification intent. + intent := h.waitNotificationIntent(t, func(entry notificationIntentEntry) bool { + return entry.NotificationType == notificationImagePulled && + entry.PayloadGameID == gameID + }, 60*time.Second) + assert.Equal(t, expectedImageRef, intent.PayloadImageRef) + + // 3. Notification consumes the intent and Mail records the + // delivery for the configured admin recipient. + idempotencyKey := "notification:" + intent.RedisEntryID + + "/email:email:" + adminEmailRecipient + delivery := h.eventuallyDelivery(t, url.Values{ + "source": []string{"notification"}, + "status": []string{"sent"}, + "recipient": []string{adminEmailRecipient}, + "template_id": []string{notificationImagePulled}, + "idempotency_key": []string{idempotencyKey}, + }) + assert.Equal(t, "template", delivery.PayloadMode) + assert.Equal(t, notificationImagePulled, delivery.TemplateID) + assert.Equal(t, []string{adminEmailRecipient}, delivery.To) + + // 4. Lobby's runtimejobresult worker drives the game to + // `start_failed` because of the same failure outcome on the + // shared bus. 
+ h.waitGameStatus(t, gameID, "start_failed", 60*time.Second) +} + +type tripleHarness struct { + redis *redis.Client + + userServiceURL string + lobbyAdminURL string + lobbyPublicURL string + mailBaseURL string + notificationURL string + + intentsStream string + startJobs string + stopJobs string + jobResults string + healthEvents string + lifecycleStream string + gmEventsStream string + + processes []*harness.Process +} + +func newTripleHarness(t *testing.T) *tripleHarness { + t.Helper() + harness.RequireDockerDaemon(t) // RTM /readyz pings Docker. + + redisRuntime := harness.StartRedisContainer(t) + redisClient := redis.NewClient(&redis.Options{ + Addr: redisRuntime.Addr, + Protocol: 2, + DisableIdentity: true, + }) + t.Cleanup(func() { require.NoError(t, redisClient.Close()) }) + + dockerNetwork := harness.EnsureDockerNetwork(t) + + userServiceAddr := harness.FreeTCPAddress(t) + mailInternalAddr := harness.FreeTCPAddress(t) + notificationInternalAddr := harness.FreeTCPAddress(t) + lobbyPublicAddr := harness.FreeTCPAddress(t) + lobbyInternalAddr := harness.FreeTCPAddress(t) + rtmInternalAddr := harness.FreeTCPAddress(t) + + userServiceBinary := harness.BuildBinary(t, "userservice", "./user/cmd/userservice") + mailBinary := harness.BuildBinary(t, "mail", "./mail/cmd/mail") + notificationBinary := harness.BuildBinary(t, "notification", "./notification/cmd/notification") + lobbyBinary := harness.BuildBinary(t, "lobby", "./lobby/cmd/lobby") + rtmBinary := harness.BuildBinary(t, "rtmanager", "./rtmanager/cmd/rtmanager") + + suffix := strconv.FormatInt(suiteSeq.Add(1), 10) + intentsStream := notificationIntentsStream + ":" + suffix + startJobs := startJobsStream + ":" + suffix + stopJobs := stopJobsStream + ":" + suffix + jobResults := jobResultsStream + ":" + suffix + healthEvents := healthEventsStream + ":" + suffix + lifecycle := userLifecycleStream + ":" + suffix + gmEvents := gmEventsStream + ":" + suffix + + // User Service. + userServiceEnv := harness.StartUserServicePersistence(t, redisRuntime.Addr).Env + userServiceEnv["USERSERVICE_LOG_LEVEL"] = "info" + userServiceEnv["USERSERVICE_INTERNAL_HTTP_ADDR"] = userServiceAddr + userServiceEnv["OTEL_TRACES_EXPORTER"] = "none" + userServiceEnv["OTEL_METRICS_EXPORTER"] = "none" + userServiceProcess := harness.StartProcess(t, "userservice", userServiceBinary, userServiceEnv) + waitForUserServiceReady(t, userServiceProcess, "http://"+userServiceAddr) + + // Mail Service. + mailEnv := harness.StartMailServicePersistence(t, redisRuntime.Addr).Env + mailEnv["MAIL_LOG_LEVEL"] = "info" + mailEnv["MAIL_INTERNAL_HTTP_ADDR"] = mailInternalAddr + mailEnv["MAIL_TEMPLATE_DIR"] = mailTemplateDir(t) + mailEnv["MAIL_SMTP_MODE"] = "stub" + mailEnv["MAIL_STREAM_BLOCK_TIMEOUT"] = "100ms" + mailEnv["MAIL_OPERATOR_REQUEST_TIMEOUT"] = time.Second.String() + mailEnv["MAIL_SHUTDOWN_TIMEOUT"] = "2s" + mailEnv["OTEL_TRACES_EXPORTER"] = "none" + mailEnv["OTEL_METRICS_EXPORTER"] = "none" + mailProcess := harness.StartProcess(t, "mail", mailBinary, mailEnv) + waitForMailReady(t, mailProcess, "http://"+mailInternalAddr) + + // Notification Service. Admin emails for runtime.* go to a single + // shared address; the suite does not test multi-recipient routing. 
+ notificationEnv := harness.StartNotificationServicePersistence(t, redisRuntime.Addr).Env + notificationEnv["NOTIFICATION_LOG_LEVEL"] = "info" + notificationEnv["NOTIFICATION_INTERNAL_HTTP_ADDR"] = notificationInternalAddr + notificationEnv["NOTIFICATION_USER_SERVICE_BASE_URL"] = "http://" + userServiceAddr + notificationEnv["NOTIFICATION_USER_SERVICE_TIMEOUT"] = time.Second.String() + notificationEnv["NOTIFICATION_INTENTS_STREAM"] = intentsStream + notificationEnv["NOTIFICATION_INTENTS_READ_BLOCK_TIMEOUT"] = "100ms" + notificationEnv["NOTIFICATION_ROUTE_BACKOFF_MIN"] = "100ms" + notificationEnv["NOTIFICATION_ROUTE_BACKOFF_MAX"] = "100ms" + notificationEnv["NOTIFICATION_ADMIN_EMAILS_RUNTIME_IMAGE_PULL_FAILED"] = adminEmailRecipient + notificationEnv["NOTIFICATION_ADMIN_EMAILS_RUNTIME_CONTAINER_START_FAILED"] = adminEmailRecipient + notificationEnv["NOTIFICATION_ADMIN_EMAILS_RUNTIME_START_CONFIG_INVALID"] = adminEmailRecipient + notificationEnv["OTEL_TRACES_EXPORTER"] = "none" + notificationEnv["OTEL_METRICS_EXPORTER"] = "none" + notificationProcess := harness.StartProcess(t, "notification", notificationBinary, notificationEnv) + harness.WaitForHTTPStatus(t, notificationProcess, "http://"+notificationInternalAddr+"/readyz", http.StatusOK) + + // Lobby. + lobbyEnv := harness.StartLobbyServicePersistence(t, redisRuntime.Addr).Env + lobbyEnv["LOBBY_LOG_LEVEL"] = "info" + lobbyEnv["LOBBY_PUBLIC_HTTP_ADDR"] = lobbyPublicAddr + lobbyEnv["LOBBY_INTERNAL_HTTP_ADDR"] = lobbyInternalAddr + lobbyEnv["LOBBY_USER_SERVICE_BASE_URL"] = "http://" + userServiceAddr + lobbyEnv["LOBBY_GM_BASE_URL"] = "http://" + notificationInternalAddr + lobbyEnv["LOBBY_NOTIFICATION_INTENTS_STREAM"] = intentsStream + lobbyEnv["LOBBY_USER_LIFECYCLE_STREAM"] = lifecycle + lobbyEnv["LOBBY_RUNTIME_JOB_RESULTS_STREAM"] = jobResults + lobbyEnv["LOBBY_RUNTIME_START_JOBS_STREAM"] = startJobs + lobbyEnv["LOBBY_RUNTIME_STOP_JOBS_STREAM"] = stopJobs + lobbyEnv["LOBBY_GM_EVENTS_STREAM"] = gmEvents + lobbyEnv["LOBBY_RUNTIME_JOB_RESULTS_READ_BLOCK_TIMEOUT"] = "200ms" + lobbyEnv["LOBBY_USER_LIFECYCLE_READ_BLOCK_TIMEOUT"] = "200ms" + lobbyEnv["LOBBY_GM_EVENTS_READ_BLOCK_TIMEOUT"] = "200ms" + lobbyEnv["LOBBY_ENGINE_IMAGE_TEMPLATE"] = "galaxy/game:{engine_version}-tripleit" + lobbyEnv["OTEL_TRACES_EXPORTER"] = "none" + lobbyEnv["OTEL_METRICS_EXPORTER"] = "none" + lobbyProcess := harness.StartProcess(t, "lobby", lobbyBinary, lobbyEnv) + harness.WaitForHTTPStatus(t, lobbyProcess, "http://"+lobbyInternalAddr+"/readyz", http.StatusOK) + + // Runtime Manager. 
+ rtmEnv := harness.StartRTManagerServicePersistence(t, redisRuntime.Addr).Env + rtmEnv["RTMANAGER_LOG_LEVEL"] = "info" + rtmEnv["RTMANAGER_INTERNAL_HTTP_ADDR"] = rtmInternalAddr + rtmEnv["RTMANAGER_LOBBY_INTERNAL_BASE_URL"] = "http://" + lobbyInternalAddr + rtmEnv["RTMANAGER_LOBBY_INTERNAL_TIMEOUT"] = "200ms" + rtmEnv["RTMANAGER_DOCKER_HOST"] = resolveDockerHost() + rtmEnv["RTMANAGER_DOCKER_NETWORK"] = dockerNetwork + rtmEnv["RTMANAGER_GAME_STATE_ROOT"] = t.TempDir() + rtmEnv["RTMANAGER_REDIS_START_JOBS_STREAM"] = startJobs + rtmEnv["RTMANAGER_REDIS_STOP_JOBS_STREAM"] = stopJobs + rtmEnv["RTMANAGER_REDIS_JOB_RESULTS_STREAM"] = jobResults + rtmEnv["RTMANAGER_REDIS_HEALTH_EVENTS_STREAM"] = healthEvents + rtmEnv["RTMANAGER_NOTIFICATION_INTENTS_STREAM"] = intentsStream + rtmEnv["RTMANAGER_STREAM_BLOCK_TIMEOUT"] = "200ms" + rtmEnv["RTMANAGER_RECONCILE_INTERVAL"] = "5s" + rtmEnv["RTMANAGER_CLEANUP_INTERVAL"] = "5s" + rtmEnv["RTMANAGER_INSPECT_INTERVAL"] = "5s" + rtmEnv["RTMANAGER_PROBE_INTERVAL"] = "5s" + rtmEnv["RTMANAGER_PROBE_TIMEOUT"] = "1s" + rtmEnv["RTMANAGER_PROBE_FAILURES_THRESHOLD"] = "3" + rtmEnv["RTMANAGER_GAME_LEASE_TTL_SECONDS"] = "30" + rtmEnv["OTEL_TRACES_EXPORTER"] = "none" + rtmEnv["OTEL_METRICS_EXPORTER"] = "none" + rtmProcess := harness.StartProcess(t, "rtmanager", rtmBinary, rtmEnv) + harness.WaitForHTTPStatus(t, rtmProcess, "http://"+rtmInternalAddr+"/readyz", http.StatusOK) + + return &tripleHarness{ + redis: redisClient, + userServiceURL: "http://" + userServiceAddr, + lobbyAdminURL: "http://" + lobbyInternalAddr, + lobbyPublicURL: "http://" + lobbyPublicAddr, + mailBaseURL: "http://" + mailInternalAddr, + notificationURL: "http://" + notificationInternalAddr, + intentsStream: intentsStream, + startJobs: startJobs, + stopJobs: stopJobs, + jobResults: jobResults, + healthEvents: healthEvents, + lifecycleStream: lifecycle, + gmEventsStream: gmEvents, + processes: []*harness.Process{userServiceProcess, mailProcess, notificationProcess, lobbyProcess, rtmProcess}, + } +} + +// --- Lobby fixtures --- + +type ensureUserResponse struct { + Outcome string `json:"outcome"` + UserID string `json:"user_id"` +} + +func (h *tripleHarness) ensureUser(t *testing.T, email string) ensureUserResponse { + t.Helper() + resp := postJSON(t, h.userServiceURL+"/api/v1/internal/users/ensure-by-email", map[string]any{ + "email": email, + "registration_context": map[string]string{ + "preferred_language": "en", + "time_zone": "Europe/Kaliningrad", + }, + }, nil) + var out ensureUserResponse + requireJSONStatus(t, resp, http.StatusOK, &out) + require.NotEmpty(t, out.UserID) + return out +} + +func (h *tripleHarness) adminCreatePrivateGameForOwner(t *testing.T, ownerUserID, gameName string, enrollmentEndsAt int64, engineVersion string) string { + t.Helper() + resp := postJSON(t, h.lobbyPublicURL+"/api/v1/lobby/games", map[string]any{ + "game_name": gameName, + "game_type": "private", + "min_players": 1, + "max_players": 4, + "start_gap_hours": 6, + "start_gap_players": 1, + "enrollment_ends_at": enrollmentEndsAt, + "turn_schedule": "0 18 * * *", + "target_engine_version": engineVersion, + }, http.Header{"X-User-Id": []string{ownerUserID}}) + require.Equalf(t, http.StatusCreated, resp.StatusCode, "create private game: %s", resp.Body) + var record struct { + GameID string `json:"game_id"` + } + require.NoError(t, json.Unmarshal([]byte(resp.Body), &record)) + require.NotEmpty(t, record.GameID) + return record.GameID +} + +func (h *tripleHarness) userOpenEnrollment(t *testing.T, ownerUserID, gameID string) 
{ + t.Helper() + resp := postJSON(t, h.lobbyPublicURL+"/api/v1/lobby/games/"+gameID+"/open-enrollment", nil, + http.Header{"X-User-Id": []string{ownerUserID}}) + require.Equalf(t, http.StatusOK, resp.StatusCode, "open enrollment: %s", resp.Body) +} + +func (h *tripleHarness) userReadyToStart(t *testing.T, ownerUserID, gameID string) { + t.Helper() + resp := postJSON(t, h.lobbyPublicURL+"/api/v1/lobby/games/"+gameID+"/ready-to-start", nil, + http.Header{"X-User-Id": []string{ownerUserID}}) + require.Equalf(t, http.StatusOK, resp.StatusCode, "ready-to-start: %s", resp.Body) +} + +func (h *tripleHarness) userStartGame(t *testing.T, ownerUserID, gameID string) { + t.Helper() + resp := postJSON(t, h.lobbyPublicURL+"/api/v1/lobby/games/"+gameID+"/start", nil, + http.Header{"X-User-Id": []string{ownerUserID}}) + require.Equalf(t, http.StatusOK, resp.StatusCode, "start game: %s", resp.Body) +} + +func (h *tripleHarness) userCreateInvite(t *testing.T, ownerUserID, gameID, inviteeUserID string) { + t.Helper() + resp := postJSON(t, h.lobbyPublicURL+"/api/v1/lobby/games/"+gameID+"/invites", + map[string]any{"invitee_user_id": inviteeUserID}, + http.Header{"X-User-Id": []string{ownerUserID}}) + require.Equalf(t, http.StatusCreated, resp.StatusCode, "create invite: %s", resp.Body) +} + +func (h *tripleHarness) firstCreatedInviteID(t *testing.T, inviteeUserID, gameID string) string { + t.Helper() + req, err := http.NewRequest(http.MethodGet, + h.lobbyPublicURL+"/api/v1/lobby/my/invites?status=created", nil) + require.NoError(t, err) + req.Header.Set("X-User-Id", inviteeUserID) + resp := doRequest(t, req) + require.Equalf(t, http.StatusOK, resp.StatusCode, "list my invites: %s", resp.Body) + + var body struct { + Items []struct { + InviteID string `json:"invite_id"` + GameID string `json:"game_id"` + } `json:"items"` + } + require.NoError(t, json.Unmarshal([]byte(resp.Body), &body)) + for _, item := range body.Items { + if item.GameID == gameID { + return item.InviteID + } + } + t.Fatalf("no invite for invitee %s on game %s", inviteeUserID, gameID) + return "" +} + +func (h *tripleHarness) userRedeemInvite(t *testing.T, inviteeUserID, gameID, inviteID, raceName string) { + t.Helper() + resp := postJSON(t, + h.lobbyPublicURL+"/api/v1/lobby/games/"+gameID+"/invites/"+inviteID+"/redeem", + map[string]any{"race_name": raceName}, + http.Header{"X-User-Id": []string{inviteeUserID}}) + require.Equalf(t, http.StatusOK, resp.StatusCode, "redeem invite: %s", resp.Body) +} + +// --- observation helpers --- + +type jobResultEntry struct { + GameID string + Outcome string + ContainerID string + EngineEndpoint string + ErrorCode string + ErrorMessage string +} + +func (h *tripleHarness) waitJobResult(t *testing.T, predicate func(jobResultEntry) bool, timeout time.Duration) jobResultEntry { + t.Helper() + deadline := time.Now().Add(timeout) + for { + entries, err := h.redis.XRange(context.Background(), h.jobResults, "-", "+").Result() + require.NoError(t, err) + for _, entry := range entries { + parsed := jobResultEntry{ + GameID: readString(entry.Values, "game_id"), + Outcome: readString(entry.Values, "outcome"), + ContainerID: readString(entry.Values, "container_id"), + EngineEndpoint: readString(entry.Values, "engine_endpoint"), + ErrorCode: readString(entry.Values, "error_code"), + ErrorMessage: readString(entry.Values, "error_message"), + } + if predicate(parsed) { + return parsed + } + } + if time.Now().After(deadline) { + t.Fatalf("matching job_result not observed within %s", timeout) + } + time.Sleep(50 * 
time.Millisecond) + } +} + +type notificationIntentEntry struct { + RedisEntryID string + NotificationType string + Producer string + AudienceKind string + PayloadGameID string + PayloadImageRef string + PayloadErrorCode string +} + +func (h *tripleHarness) waitNotificationIntent(t *testing.T, predicate func(notificationIntentEntry) bool, timeout time.Duration) notificationIntentEntry { + t.Helper() + deadline := time.Now().Add(timeout) + for { + entries, err := h.redis.XRange(context.Background(), h.intentsStream, "-", "+").Result() + require.NoError(t, err) + for _, entry := range entries { + parsed := notificationIntentEntry{ + RedisEntryID: entry.ID, + NotificationType: readString(entry.Values, "notification_type"), + Producer: readString(entry.Values, "producer"), + AudienceKind: readString(entry.Values, "audience_kind"), + } + if payload := readString(entry.Values, "payload_json"); payload != "" { + var data struct { + GameID string `json:"game_id"` + ImageRef string `json:"image_ref"` + ErrorCode string `json:"error_code"` + } + if err := json.Unmarshal([]byte(payload), &data); err == nil { + parsed.PayloadGameID = data.GameID + parsed.PayloadImageRef = data.ImageRef + parsed.PayloadErrorCode = data.ErrorCode + } + } + if predicate(parsed) { + return parsed + } + } + if time.Now().After(deadline) { + t.Fatalf("matching notification intent not observed within %s", timeout) + } + time.Sleep(50 * time.Millisecond) + } +} + +type mailDeliverySummary struct { + DeliveryID string `json:"delivery_id"` + Source string `json:"source"` + PayloadMode string `json:"payload_mode"` + TemplateID string `json:"template_id"` + Locale string `json:"locale"` + To []string `json:"to"` + Status string `json:"status"` +} + +func (h *tripleHarness) eventuallyDelivery(t *testing.T, query url.Values) mailDeliverySummary { + t.Helper() + deadline := time.Now().Add(60 * time.Second) + for { + listURL := h.mailBaseURL + mailDeliveriesPath + "?" 
+ query.Encode() + req, err := http.NewRequest(http.MethodGet, listURL, nil) + require.NoError(t, err) + resp := doRequest(t, req) + if resp.StatusCode == http.StatusOK { + var body struct { + Items []mailDeliverySummary `json:"items"` + } + if json.Unmarshal([]byte(resp.Body), &body) == nil && len(body.Items) > 0 { + return body.Items[0] + } + } + if time.Now().After(deadline) { + t.Fatalf("mail delivery not observed within 60s for query %v", query) + } + time.Sleep(50 * time.Millisecond) + } +} + +func (h *tripleHarness) waitGameStatus(t *testing.T, gameID, want string, timeout time.Duration) { + t.Helper() + deadline := time.Now().Add(timeout) + for { + req, err := http.NewRequest(http.MethodGet, h.lobbyAdminURL+"/api/v1/lobby/games/"+gameID, nil) + require.NoError(t, err) + resp := doRequest(t, req) + if resp.StatusCode == http.StatusOK { + var record struct { + Status string `json:"status"` + } + if json.Unmarshal([]byte(resp.Body), &record) == nil && record.Status == want { + return + } + } + if time.Now().After(deadline) { + t.Fatalf("game %s did not reach status %q within %s", gameID, want, timeout) + } + time.Sleep(100 * time.Millisecond) + } +} + +// --- shared helpers --- + +func readString(values map[string]any, key string) string { + v, _ := values[key].(string) + return strings.TrimSpace(v) +} + +type httpResponse struct { + StatusCode int + Body string + Header http.Header +} + +func postJSON(t *testing.T, url string, body any, header http.Header) httpResponse { + t.Helper() + var reader io.Reader + if body != nil { + payload, err := json.Marshal(body) + require.NoError(t, err) + reader = bytes.NewReader(payload) + } + req, err := http.NewRequest(http.MethodPost, url, reader) + require.NoError(t, err) + if body != nil { + req.Header.Set("Content-Type", "application/json") + } + for key, vs := range header { + for _, v := range vs { + req.Header.Add(key, v) + } + } + return doRequest(t, req) +} + +func doRequest(t *testing.T, request *http.Request) httpResponse { + t.Helper() + client := &http.Client{ + Timeout: 5 * time.Second, + Transport: &http.Transport{DisableKeepAlives: true}, + } + t.Cleanup(client.CloseIdleConnections) + + response, err := client.Do(request) + require.NoError(t, err) + defer response.Body.Close() + + payload, err := io.ReadAll(response.Body) + require.NoError(t, err) + return httpResponse{ + StatusCode: response.StatusCode, + Body: string(payload), + Header: response.Header.Clone(), + } +} + +func requireJSONStatus(t *testing.T, response httpResponse, want int, target any) { + t.Helper() + require.Equalf(t, want, response.StatusCode, "response: %s", response.Body) + require.NoError(t, decodeStrictJSON([]byte(response.Body), target)) +} + +func decodeStrictJSON(payload []byte, target any) error { + decoder := json.NewDecoder(bytes.NewReader(payload)) + decoder.DisallowUnknownFields() + if err := decoder.Decode(target); err != nil { + return err + } + if err := decoder.Decode(&struct{}{}); err != io.EOF { + if err == nil { + return errors.New("unexpected trailing JSON input") + } + return err + } + return nil +} + +func waitForUserServiceReady(t *testing.T, process *harness.Process, baseURL string) { + t.Helper() + client := &http.Client{Timeout: 250 * time.Millisecond} + t.Cleanup(client.CloseIdleConnections) + + deadline := time.Now().Add(10 * time.Second) + for time.Now().Before(deadline) { + req, err := http.NewRequest(http.MethodGet, baseURL+"/api/v1/internal/users/user-readiness-probe/exists", nil) + require.NoError(t, err) + response, err := 
client.Do(req) + if err == nil { + _, _ = io.Copy(io.Discard, response.Body) + response.Body.Close() + if response.StatusCode == http.StatusOK { + return + } + } + time.Sleep(25 * time.Millisecond) + } + t.Fatalf("wait for userservice readiness: timeout\n%s", process.Logs()) +} + +func waitForMailReady(t *testing.T, process *harness.Process, baseURL string) { + t.Helper() + client := &http.Client{Timeout: 250 * time.Millisecond} + t.Cleanup(client.CloseIdleConnections) + + deadline := time.Now().Add(10 * time.Second) + for time.Now().Before(deadline) { + req, err := http.NewRequest(http.MethodGet, baseURL+mailDeliveriesPath, nil) + require.NoError(t, err) + response, err := client.Do(req) + if err == nil { + _, _ = io.Copy(io.Discard, response.Body) + response.Body.Close() + if response.StatusCode == http.StatusOK { + return + } + } + time.Sleep(25 * time.Millisecond) + } + t.Fatalf("wait for mail readiness: timeout\n%s", process.Logs()) +} + +func mailTemplateDir(t *testing.T) string { + t.Helper() + return filepath.Join(repositoryRoot(t), "mail", "templates") +} + +func repositoryRoot(t *testing.T) string { + t.Helper() + _, file, _, ok := runtime.Caller(0) + if !ok { + t.Fatal("resolve repository root: runtime caller is unavailable") + } + return filepath.Clean(filepath.Join(filepath.Dir(file), "..", "..")) +} + +// resolveDockerHost honours DOCKER_HOST when the developer machine +// routes through colima or a remote daemon, fall back to the standard +// unix path otherwise. +func resolveDockerHost() string { + if host := strings.TrimSpace(os.Getenv("DOCKER_HOST")); host != "" { + return host + } + return "unix:///var/run/docker.sock" +} diff --git a/integration/mailsmoke/mail_smoke_test.go b/integration/mailsmoke/mail_smoke_test.go new file mode 100644 index 0000000..7863dcb --- /dev/null +++ b/integration/mailsmoke/mail_smoke_test.go @@ -0,0 +1,367 @@ +// Package mailsmoke_test exercises the real SMTP adapter of Mail +// Service against a real SMTP receiver running in a testcontainer. +// The suite is the small dedicated smoke suite called out in +// `TESTING.md §4` ("Add only a small dedicated smoke suite for the +// real mail adapter"). +// +// The boundary contract under test is: a delivery accepted on Mail's +// internal HTTP surface in `smtp` mode is actually transmitted over +// SMTP to the configured upstream and is observable on the +// receiver's inspection API. No other Galaxy service is booted; the +// test is intentionally narrow. +package mailsmoke_test + +import ( + "bytes" + "context" + "crypto/rand" + "crypto/rsa" + "crypto/x509" + "crypto/x509/pkix" + "encoding/json" + "encoding/pem" + "errors" + "fmt" + "io" + "math/big" + "net" + "net/http" + "path/filepath" + "runtime" + "strconv" + "sync/atomic" + "testing" + "time" + + "galaxy/integration/internal/harness" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + testcontainers "github.com/testcontainers/testcontainers-go" + "github.com/testcontainers/testcontainers-go/wait" +) + +const ( + mailpitImage = "axllent/mailpit:latest" + mailpitSMTPPort = "1025/tcp" + mailpitAPIPort = "8025/tcp" + mailDeliveryPath = "/api/v1/internal/deliveries" + commandSource = "mailsmoke" + commandTemplate = "auth.login_code" + smokeRecipient = "smoke-recipient@example.com" + smokeFromEmail = "noreply@galaxy.example.com" +) + +var smokeSeq atomic.Int64 + +// TestMailServiceDeliversToRealSMTPProvider drives Mail Service in +// `smtp` mode at a real Mailpit testcontainer. 
The service must +// transmit the configured payload over SMTP and the receiver must +// register it as a stored message visible on its HTTP inspection API. +func TestMailServiceDeliversToRealSMTPProvider(t *testing.T) { + mailpit := startMailpitContainer(t) + + mailService := startMailServiceWithSMTP(t, mailpit.SMTPEndpoint()) + + suffix := strconv.FormatInt(smokeSeq.Add(1), 10) + idempotencyKey := "mailsmoke:" + suffix + uniqueRecipient := "smoke-" + suffix + "-" + smokeRecipient + + // Mail Service has a synchronous trusted REST surface for the + // auth login-code path (`/api/v1/internal/login-code-deliveries`). + // It accepts the request, renders the template, and drives the + // configured SMTP provider — exactly what the smoke suite needs + // to verify against the real Mailpit container. + loginCodeBody := map[string]any{ + "email": uniqueRecipient, + "code": "123456", + "locale": "en", + } + bodyBytes, err := json.Marshal(loginCodeBody) + require.NoError(t, err) + + req, err := http.NewRequest(http.MethodPost, + mailService.BaseURL+"/api/v1/internal/login-code-deliveries", + bytes.NewReader(bodyBytes), + ) + require.NoError(t, err) + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Idempotency-Key", idempotencyKey) + resp := doRequest(t, req) + require.Equalf(t, + http.StatusOK, + resp.StatusCode, + "submit login-code delivery: %s", resp.Body, + ) + + // Mailpit exposes received messages at /api/v1/messages with a + // JSON envelope containing `messages_count` plus per-message + // items. Wait until our envelope shows up. + waitForMailpitMessage(t, mailpit.APIBaseURL(), uniqueRecipient, 30*time.Second) +} + +// --- mailpit container --- + +type mailpitContainer struct { + container testcontainers.Container + smtpHost string + smtpPort string + apiHost string + apiPort string +} + +func (m *mailpitContainer) SMTPEndpoint() string { + return m.smtpHost + ":" + m.smtpPort +} + +func (m *mailpitContainer) APIBaseURL() string { + return "http://" + m.apiHost + ":" + m.apiPort +} + +func startMailpitContainer(t *testing.T) *mailpitContainer { + t.Helper() + + // Mail Service hardcodes `gomail.TLSMandatory`; the smoke suite + // must give Mailpit a usable cert+key so STARTTLS succeeds even + // against a self-signed server. The cert is short-lived and is + // regenerated per test run. + certPEM, keyPEM := generateSelfSignedCert(t, "mailpit-smoke") + + ctx := context.Background() + req := testcontainers.ContainerRequest{ + Image: mailpitImage, + ExposedPorts: []string{ + mailpitSMTPPort, + mailpitAPIPort, + }, + Env: map[string]string{ + "MP_SMTP_TLS_CERT": "/etc/mailpit/cert.pem", + "MP_SMTP_TLS_KEY": "/etc/mailpit/key.pem", + }, + Files: []testcontainers.ContainerFile{ + { + Reader: bytes.NewReader(certPEM), + ContainerFilePath: "/etc/mailpit/cert.pem", + FileMode: 0o644, + }, + { + Reader: bytes.NewReader(keyPEM), + ContainerFilePath: "/etc/mailpit/key.pem", + FileMode: 0o600, + }, + }, + WaitingFor: wait.ForLog("accessible via"). 
+ WithStartupTimeout(30 * time.Second), + } + container, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ + ContainerRequest: req, + Started: true, + }) + require.NoError(t, err) + t.Cleanup(func() { + if err := testcontainers.TerminateContainer(container); err != nil { + t.Errorf("terminate mailpit container: %v", err) + } + }) + + smtpHost, err := container.Host(ctx) + require.NoError(t, err) + smtpPort, err := container.MappedPort(ctx, mailpitSMTPPort) + require.NoError(t, err) + + apiPort, err := container.MappedPort(ctx, mailpitAPIPort) + require.NoError(t, err) + + return &mailpitContainer{ + container: container, + smtpHost: smtpHost, + smtpPort: smtpPort.Port(), + apiHost: smtpHost, + apiPort: apiPort.Port(), + } +} + +func waitForMailpitMessage(t *testing.T, apiBaseURL, recipient string, timeout time.Duration) { + t.Helper() + + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + req, err := http.NewRequest(http.MethodGet, apiBaseURL+"/api/v1/messages", nil) + require.NoError(t, err) + resp := doRequest(t, req) + if resp.StatusCode == http.StatusOK { + var body struct { + Messages []struct { + To []struct { + Address string `json:"Address"` + } `json:"To"` + Subject string `json:"Subject"` + } `json:"messages"` + } + if json.Unmarshal([]byte(resp.Body), &body) == nil { + for _, m := range body.Messages { + for _, addr := range m.To { + if addr.Address == recipient { + return + } + } + } + } + } + time.Sleep(100 * time.Millisecond) + } + t.Fatalf("mailpit did not register a message for %s within %s", recipient, timeout) +} + +// --- mail service in real-SMTP mode --- + +type mailService struct { + BaseURL string +} + +func startMailServiceWithSMTP(t *testing.T, smtpAddr string) mailService { + t.Helper() + + redisRuntime := harness.StartRedisContainer(t) + mailInternalAddr := harness.FreeTCPAddress(t) + mailBinary := harness.BuildBinary(t, "mail", "./mail/cmd/mail") + + mailEnv := harness.StartMailServicePersistence(t, redisRuntime.Addr).Env + mailEnv["MAIL_LOG_LEVEL"] = "info" + mailEnv["MAIL_INTERNAL_HTTP_ADDR"] = mailInternalAddr + mailEnv["MAIL_TEMPLATE_DIR"] = mailTemplateDir(t) + mailEnv["MAIL_SMTP_MODE"] = "smtp" + mailEnv["MAIL_SMTP_ADDR"] = smtpAddr + mailEnv["MAIL_SMTP_FROM_EMAIL"] = smokeFromEmail + mailEnv["MAIL_SMTP_FROM_NAME"] = "Galaxy Mail Smoke" + mailEnv["MAIL_SMTP_TIMEOUT"] = "10s" + mailEnv["MAIL_SMTP_INSECURE_SKIP_VERIFY"] = "true" + mailEnv["MAIL_STREAM_BLOCK_TIMEOUT"] = "100ms" + mailEnv["MAIL_OPERATOR_REQUEST_TIMEOUT"] = "5s" + mailEnv["MAIL_SHUTDOWN_TIMEOUT"] = "2s" + mailEnv["OTEL_TRACES_EXPORTER"] = "none" + mailEnv["OTEL_METRICS_EXPORTER"] = "none" + + mailProcess := harness.StartProcess(t, "mail", mailBinary, mailEnv) + waitForMailReady(t, mailProcess, "http://"+mailInternalAddr) + + return mailService{BaseURL: "http://" + mailInternalAddr} +} + +// --- shared helpers --- + +func waitForMailReady(t *testing.T, process *harness.Process, baseURL string) { + t.Helper() + client := &http.Client{Timeout: 250 * time.Millisecond} + t.Cleanup(client.CloseIdleConnections) + + deadline := time.Now().Add(10 * time.Second) + for time.Now().Before(deadline) { + req, err := http.NewRequest(http.MethodGet, baseURL+mailDeliveryPath, nil) + require.NoError(t, err) + response, err := client.Do(req) + if err == nil { + _, _ = io.Copy(io.Discard, response.Body) + response.Body.Close() + if response.StatusCode == http.StatusOK { + return + } + } + time.Sleep(25 * time.Millisecond) + } + t.Fatalf("wait for mail 
readiness: timeout\n%s", process.Logs()) +} + +type httpResponse struct { + StatusCode int + Body string + Header http.Header +} + +func postJSON(t *testing.T, url string, body any) httpResponse { + t.Helper() + payload, err := json.Marshal(body) + require.NoError(t, err) + + req, err := http.NewRequest(http.MethodPost, url, bytes.NewReader(payload)) + require.NoError(t, err) + req.Header.Set("Content-Type", "application/json") + return doRequest(t, req) +} + +func doRequest(t *testing.T, request *http.Request) httpResponse { + t.Helper() + client := &http.Client{ + Timeout: 5 * time.Second, + Transport: &http.Transport{DisableKeepAlives: true}, + } + t.Cleanup(client.CloseIdleConnections) + + response, err := client.Do(request) + require.NoError(t, err) + defer response.Body.Close() + + payload, err := io.ReadAll(response.Body) + require.NoError(t, err) + return httpResponse{ + StatusCode: response.StatusCode, + Body: string(payload), + Header: response.Header.Clone(), + } +} + +// generateSelfSignedCert produces a short-lived RSA cert + key for the +// Mailpit container so STARTTLS succeeds against +// `MAIL_SMTP_INSECURE_SKIP_VERIFY=true` clients. +func generateSelfSignedCert(t *testing.T, commonName string) ([]byte, []byte) { + t.Helper() + + priv, err := rsa.GenerateKey(rand.Reader, 2048) + require.NoError(t, err) + + serial, err := rand.Int(rand.Reader, big.NewInt(1<<62)) + require.NoError(t, err) + + template := x509.Certificate{ + SerialNumber: serial, + Subject: pkix.Name{CommonName: commonName}, + NotBefore: time.Now().Add(-time.Hour), + NotAfter: time.Now().Add(24 * time.Hour), + KeyUsage: x509.KeyUsageDigitalSignature | x509.KeyUsageKeyEncipherment | x509.KeyUsageCertSign, + ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, + BasicConstraintsValid: true, + IsCA: true, + IPAddresses: []net.IP{net.ParseIP("127.0.0.1")}, + DNSNames: []string{"localhost", commonName}, + } + + certDER, err := x509.CreateCertificate(rand.Reader, &template, &template, &priv.PublicKey, priv) + require.NoError(t, err) + + certPEM := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: certDER}) + keyPEM := pem.EncodeToMemory(&pem.Block{ + Type: "RSA PRIVATE KEY", + Bytes: x509.MarshalPKCS1PrivateKey(priv), + }) + return certPEM, keyPEM +} + +func mailTemplateDir(t *testing.T) string { + t.Helper() + return filepath.Join(repositoryRoot(t), "mail", "templates") +} + +func repositoryRoot(t *testing.T) string { + t.Helper() + _, file, _, ok := runtime.Caller(0) + if !ok { + t.Fatal("resolve repository root: runtime caller is unavailable") + } + return filepath.Clean(filepath.Join(filepath.Dir(file), "..", "..")) +} + +// silence unused-import noise for symbols touched only via reflection / +// conditional compilation. +var _ = fmt.Sprintf +var _ = errors.New +var _ = assert.Equal diff --git a/integration/rtmanagernotification/rtmanager_notification_test.go b/integration/rtmanagernotification/rtmanager_notification_test.go new file mode 100644 index 0000000..448c49c --- /dev/null +++ b/integration/rtmanagernotification/rtmanager_notification_test.go @@ -0,0 +1,602 @@ +// Package rtmanagernotification_test exercises the Runtime Manager → +// Notification Service boundary against real RTM + real Notification + +// real Mail Service + real User Service running on testcontainers +// PostgreSQL and Redis, with a real Docker daemon for RTM's readiness +// pings. 
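+//
+// Sketch of the path under test:
+//
+//	start_job (unresolvable image_ref) -> RTM
+//	RTM -> runtime.image_pull_failed intent on notification:intents
+//	Notification -> template-mode delivery handed to Mail -> delivery record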
+// +// The boundary contract under test is: when a start job points at an +// unresolvable image, RTM publishes one `runtime.image_pull_failed` +// admin-only notification intent on `notification:intents`; the +// Notification Service consumes the intent, resolves the admin email +// recipient list from configuration, and hands the delivery to Mail +// Service in template-mode. The suite asserts the wire shape on +// `notification:intents` and the resulting Mail delivery record. +// +// Game Master is not booted: RTM emits the intent itself; Notification +// resolves the audience from `NOTIFICATION_ADMIN_EMAILS_*`; the +// scenario needs no user-targeted resolution. +package rtmanagernotification_test + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "fmt" + "io" + "net/http" + "net/url" + "os" + "path/filepath" + "runtime" + "strconv" + "strings" + "sync/atomic" + "testing" + "time" + + "galaxy/integration/internal/harness" + + "github.com/redis/go-redis/v9" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +const ( + intentsStreamPrefix = "notification:intents" + startJobsStreamPrefix = "runtime:start_jobs" + stopJobsStreamPrefix = "runtime:stop_jobs" + jobResultsStreamPrefix = "runtime:job_results" + healthEventsStreamPrefix = "runtime:health_events" + mailDeliveriesPath = "/api/v1/internal/deliveries" + notificationTypeImagePull = "runtime.image_pull_failed" + notificationTypeStartFailed = "runtime.container_start_failed" + notificationTypeConfigInval = "runtime.start_config_invalid" + expectedAdminEmailRecipient = "rtm-admin@example.com" + expectedRTMProducer = "runtime_manager" + missingImageRef = "galaxy/integration-missing:0.0.0" +) + +var suiteSeq atomic.Int64 + +// TestRTMImagePullFailureFlowsThroughNotificationToMail drives Runtime +// Manager with a start envelope pointing at an unresolvable image +// reference, then asserts: +// +// 1. RTM publishes one `runtime.image_pull_failed` intent on +// `notification:intents` with the frozen admin payload. +// 2. The Notification Service consumes it and fans out the matching +// mail delivery to the configured admin recipient. +// 3. Mail Service records the delivery with the right template id, +// idempotency key, and template variables. +// +// The path covers the full producer → orchestrator → transport +// pipeline that `TESTING.md §7` requests as the +// `Runtime Manager ↔ Notification` boundary suite. +func TestRTMImagePullFailureFlowsThroughNotificationToMail(t *testing.T) { + h := newRTMNotificationHarness(t) + + gameID := uniqueGameID(t) + + h.publishStartJob(t, gameID, missingImageRef) + + // Step 1 — RTM publishes the admin notification intent. + intent := h.waitForIntent(t, + notificationTypeImagePull, + gameID, + 30*time.Second, + ) + assert.Equal(t, expectedRTMProducer, intent.Producer) + assert.Equal(t, "admin_email", intent.AudienceKind) + assert.Equal(t, gameID, intent.PayloadGameID) + assert.Equal(t, missingImageRef, intent.PayloadImageRef) + assert.Equal(t, "image_pull_failed", intent.PayloadErrorCode) + assert.NotEmpty(t, intent.PayloadErrorMessage, + "intent payload must carry operator-readable detail") + assert.NotZero(t, intent.PayloadAttemptedAtMS) + + // Step 2 — Notification routes to Mail; Mail sends the delivery. 
+ idempotencyKey := "notification:" + intent.RedisEntryID + + "/email:email:" + expectedAdminEmailRecipient + + delivery := h.eventuallyDelivery(t, url.Values{ + "source": []string{"notification"}, + "status": []string{"sent"}, + "recipient": []string{expectedAdminEmailRecipient}, + "template_id": []string{notificationTypeImagePull}, + "idempotency_key": []string{idempotencyKey}, + }) + assert.Equal(t, "template", delivery.PayloadMode) + assert.Equal(t, notificationTypeImagePull, delivery.TemplateID) + assert.Equal(t, []string{expectedAdminEmailRecipient}, delivery.To) + + detail := h.getDelivery(t, delivery.DeliveryID) + assert.Equal(t, "notification", detail.Source) + assert.Equal(t, "template", detail.PayloadMode) + assert.Equal(t, notificationTypeImagePull, detail.TemplateID) + assert.Equal(t, idempotencyKey, detail.IdempotencyKey) + assert.Equal(t, []string{expectedAdminEmailRecipient}, detail.To) + + require.NotNil(t, detail.TemplateVariables, + "mail delivery must record template variables for admin triage") + assert.Equal(t, gameID, detail.TemplateVariables["game_id"]) + assert.Equal(t, missingImageRef, detail.TemplateVariables["image_ref"]) + assert.Equal(t, "image_pull_failed", detail.TemplateVariables["error_code"]) +} + +// rtmNotificationHarness owns the per-test infrastructure: shared +// Redis, four real binaries (RTM, Notification, Mail, User), and the +// per-test Docker network RTM's `/readyz` insists on. One harness per +// test keeps each scenario fully isolated. +type rtmNotificationHarness struct { + redis *redis.Client + + rtmInternalURL string + mailBaseURL string + + intentsStream string + startJobsStream string + stopJobsStream string + jobResultsStream string + healthEvents string + + rtmProcess *harness.Process + notificationProcess *harness.Process + mailProcess *harness.Process + userServiceProcess *harness.Process +} + +func newRTMNotificationHarness(t *testing.T) *rtmNotificationHarness { + t.Helper() + + // `/readyz` of RTM pings the Docker daemon; skip the suite if no + // Docker socket is reachable. + harness.RequireDockerDaemon(t) + + redisRuntime := harness.StartRedisContainer(t) + redisClient := redis.NewClient(&redis.Options{ + Addr: redisRuntime.Addr, + Protocol: 2, + DisableIdentity: true, + }) + t.Cleanup(func() { + require.NoError(t, redisClient.Close()) + }) + + dockerNetwork := harness.EnsureDockerNetwork(t) + + userServiceAddr := harness.FreeTCPAddress(t) + mailInternalAddr := harness.FreeTCPAddress(t) + notificationInternalAddr := harness.FreeTCPAddress(t) + rtmInternalAddr := harness.FreeTCPAddress(t) + + userServiceBinary := harness.BuildBinary(t, "userservice", "./user/cmd/userservice") + mailBinary := harness.BuildBinary(t, "mail", "./mail/cmd/mail") + notificationBinary := harness.BuildBinary(t, "notification", "./notification/cmd/notification") + rtmBinary := harness.BuildBinary(t, "rtmanager", "./rtmanager/cmd/rtmanager") + + // User Service: needed by Notification's port even though every + // intent in this suite is admin-only. + userServiceEnv := harness.StartUserServicePersistence(t, redisRuntime.Addr).Env + userServiceEnv["USERSERVICE_LOG_LEVEL"] = "info" + userServiceEnv["USERSERVICE_INTERNAL_HTTP_ADDR"] = userServiceAddr + userServiceEnv["OTEL_TRACES_EXPORTER"] = "none" + userServiceEnv["OTEL_METRICS_EXPORTER"] = "none" + userServiceProcess := harness.StartProcess(t, "userservice", userServiceBinary, userServiceEnv) + waitForUserServiceReady(t, userServiceProcess, "http://"+userServiceAddr) + + // Per-test stream prefixes. 
+	suffix := strconv.FormatInt(suiteSeq.Add(1), 10)
+	intentsStream := intentsStreamPrefix + ":" + suffix
+	startJobsStream := startJobsStreamPrefix + ":" + suffix
+	stopJobsStream := stopJobsStreamPrefix + ":" + suffix
+	jobResultsStream := jobResultsStreamPrefix + ":" + suffix
+	healthEvents := healthEventsStreamPrefix + ":" + suffix
+
+	// Mail Service.
+	mailEnv := harness.StartMailServicePersistence(t, redisRuntime.Addr).Env
+	mailEnv["MAIL_LOG_LEVEL"] = "info"
+	mailEnv["MAIL_INTERNAL_HTTP_ADDR"] = mailInternalAddr
+	mailEnv["MAIL_TEMPLATE_DIR"] = mailTemplateDir(t)
+	mailEnv["MAIL_SMTP_MODE"] = "stub"
+	mailEnv["MAIL_STREAM_BLOCK_TIMEOUT"] = "100ms"
+	mailEnv["MAIL_OPERATOR_REQUEST_TIMEOUT"] = time.Second.String()
+	mailEnv["MAIL_SHUTDOWN_TIMEOUT"] = "2s"
+	mailEnv["OTEL_TRACES_EXPORTER"] = "none"
+	mailEnv["OTEL_METRICS_EXPORTER"] = "none"
+	mailProcess := harness.StartProcess(t, "mail", mailBinary, mailEnv)
+	waitForMailReady(t, mailProcess, "http://"+mailInternalAddr)
+
+	// Notification Service. Admin-email envs route every runtime.*
+	// intent to a shared rtm-admin recipient.
+	notificationEnv := harness.StartNotificationServicePersistence(t, redisRuntime.Addr).Env
+	notificationEnv["NOTIFICATION_LOG_LEVEL"] = "info"
+	notificationEnv["NOTIFICATION_INTERNAL_HTTP_ADDR"] = notificationInternalAddr
+	notificationEnv["NOTIFICATION_USER_SERVICE_BASE_URL"] = "http://" + userServiceAddr
+	notificationEnv["NOTIFICATION_USER_SERVICE_TIMEOUT"] = time.Second.String()
+	notificationEnv["NOTIFICATION_INTENTS_READ_BLOCK_TIMEOUT"] = "100ms"
+	notificationEnv["NOTIFICATION_ROUTE_BACKOFF_MIN"] = "100ms"
+	notificationEnv["NOTIFICATION_ROUTE_BACKOFF_MAX"] = "100ms"
+	notificationEnv["NOTIFICATION_INTENTS_STREAM"] = intentsStream
+	notificationEnv["NOTIFICATION_ADMIN_EMAILS_RUNTIME_IMAGE_PULL_FAILED"] = expectedAdminEmailRecipient
+	notificationEnv["NOTIFICATION_ADMIN_EMAILS_RUNTIME_CONTAINER_START_FAILED"] = expectedAdminEmailRecipient
+	notificationEnv["NOTIFICATION_ADMIN_EMAILS_RUNTIME_START_CONFIG_INVALID"] = expectedAdminEmailRecipient
+	notificationEnv["OTEL_TRACES_EXPORTER"] = "none"
+	notificationEnv["OTEL_METRICS_EXPORTER"] = "none"
+	notificationProcess := harness.StartProcess(t, "notification", notificationBinary, notificationEnv)
+	harness.WaitForHTTPStatus(t, notificationProcess,
+		"http://"+notificationInternalAddr+"/readyz", http.StatusOK)
+
+	// Runtime Manager. No Lobby runs in this suite: the Lobby base URL
+	// points at an unroutable loopback port, so RTM's start-service
+	// ancillary GetGame lookup fails fast. The start service treats
+	// that lookup as best-effort and never aborts the start flow on
+	// it, which is what lets the image-pull path under test proceed.
+ rtmEnv := harness.StartRTManagerServicePersistence(t, redisRuntime.Addr).Env + rtmEnv["RTMANAGER_LOG_LEVEL"] = "info" + rtmEnv["RTMANAGER_INTERNAL_HTTP_ADDR"] = rtmInternalAddr + rtmEnv["RTMANAGER_LOBBY_INTERNAL_BASE_URL"] = "http://127.0.0.1:1" + rtmEnv["RTMANAGER_LOBBY_INTERNAL_TIMEOUT"] = "200ms" + rtmEnv["RTMANAGER_DOCKER_HOST"] = resolveDockerHost() + rtmEnv["RTMANAGER_DOCKER_NETWORK"] = dockerNetwork + rtmEnv["RTMANAGER_GAME_STATE_ROOT"] = t.TempDir() + rtmEnv["RTMANAGER_REDIS_START_JOBS_STREAM"] = startJobsStream + rtmEnv["RTMANAGER_REDIS_STOP_JOBS_STREAM"] = stopJobsStream + rtmEnv["RTMANAGER_REDIS_JOB_RESULTS_STREAM"] = jobResultsStream + rtmEnv["RTMANAGER_REDIS_HEALTH_EVENTS_STREAM"] = healthEvents + rtmEnv["RTMANAGER_NOTIFICATION_INTENTS_STREAM"] = intentsStream + rtmEnv["RTMANAGER_STREAM_BLOCK_TIMEOUT"] = "200ms" + rtmEnv["RTMANAGER_RECONCILE_INTERVAL"] = "5s" + rtmEnv["RTMANAGER_CLEANUP_INTERVAL"] = "5s" + rtmEnv["RTMANAGER_INSPECT_INTERVAL"] = "5s" + rtmEnv["RTMANAGER_PROBE_INTERVAL"] = "5s" + rtmEnv["RTMANAGER_PROBE_TIMEOUT"] = "1s" + rtmEnv["RTMANAGER_PROBE_FAILURES_THRESHOLD"] = "3" + rtmEnv["RTMANAGER_GAME_LEASE_TTL_SECONDS"] = "30" + rtmEnv["RTMANAGER_IMAGE_PULL_POLICY"] = "if_missing" + rtmEnv["OTEL_TRACES_EXPORTER"] = "none" + rtmEnv["OTEL_METRICS_EXPORTER"] = "none" + rtmProcess := harness.StartProcess(t, "rtmanager", rtmBinary, rtmEnv) + harness.WaitForHTTPStatus(t, rtmProcess, + "http://"+rtmInternalAddr+"/readyz", http.StatusOK) + + return &rtmNotificationHarness{ + redis: redisClient, + rtmInternalURL: "http://" + rtmInternalAddr, + mailBaseURL: "http://" + mailInternalAddr, + intentsStream: intentsStream, + startJobsStream: startJobsStream, + stopJobsStream: stopJobsStream, + jobResultsStream: jobResultsStream, + healthEvents: healthEvents, + rtmProcess: rtmProcess, + notificationProcess: notificationProcess, + mailProcess: mailProcess, + userServiceProcess: userServiceProcess, + } +} + +func (h *rtmNotificationHarness) publishStartJob(t *testing.T, gameID, imageRef string) { + t.Helper() + _, err := h.redis.XAdd(context.Background(), &redis.XAddArgs{ + Stream: h.startJobsStream, + Values: map[string]any{ + "game_id": gameID, + "image_ref": imageRef, + "requested_at_ms": strconv.FormatInt(time.Now().UnixMilli(), 10), + }, + }).Result() + require.NoError(t, err) +} + +// observedIntent stores the decoded fields of one notification intent +// entry that the suite cares about. 
+type observedIntent struct { + RedisEntryID string + NotificationType string + Producer string + AudienceKind string + PayloadGameID string + PayloadImageRef string + PayloadErrorCode string + PayloadErrorMessage string + PayloadAttemptedAtMS int64 +} + +func (h *rtmNotificationHarness) waitForIntent( + t *testing.T, + notificationType, gameID string, + timeout time.Duration, +) observedIntent { + t.Helper() + + deadline := time.Now().Add(timeout) + for { + entries, err := h.redis.XRange(context.Background(), h.intentsStream, "-", "+").Result() + require.NoError(t, err) + for _, entry := range entries { + intent, ok := decodeIntent(entry) + if !ok { + continue + } + if intent.NotificationType != notificationType { + continue + } + if intent.PayloadGameID != gameID { + continue + } + return intent + } + if time.Now().After(deadline) { + t.Fatalf("intent %s for game %s not observed on stream %s within %s\n%s", + notificationType, gameID, h.intentsStream, timeout, h.rtmProcess.Logs()) + } + time.Sleep(50 * time.Millisecond) + } +} + +func decodeIntent(entry redis.XMessage) (observedIntent, bool) { + notificationType, _ := entry.Values["notification_type"].(string) + producer, _ := entry.Values["producer"].(string) + audienceKind, _ := entry.Values["audience_kind"].(string) + payloadJSON, _ := entry.Values["payload_json"].(string) + + if notificationType == "" { + return observedIntent{}, false + } + + out := observedIntent{ + RedisEntryID: entry.ID, + NotificationType: notificationType, + Producer: producer, + AudienceKind: audienceKind, + } + + if payloadJSON == "" { + return out, true + } + var payload struct { + GameID string `json:"game_id"` + ImageRef string `json:"image_ref"` + ErrorCode string `json:"error_code"` + ErrorMessage string `json:"error_message"` + AttemptedAtMS int64 `json:"attempted_at_ms"` + } + if err := json.Unmarshal([]byte(payloadJSON), &payload); err == nil { + out.PayloadGameID = payload.GameID + out.PayloadImageRef = payload.ImageRef + out.PayloadErrorCode = payload.ErrorCode + out.PayloadErrorMessage = payload.ErrorMessage + out.PayloadAttemptedAtMS = payload.AttemptedAtMS + } + return out, true +} + +// mailDeliverySummary mirrors the public list-deliveries response of +// Mail Service. 
+type mailDeliverySummary struct { + DeliveryID string `json:"delivery_id"` + Source string `json:"source"` + PayloadMode string `json:"payload_mode"` + TemplateID string `json:"template_id"` + Locale string `json:"locale"` + To []string `json:"to"` + Status string `json:"status"` +} + +type mailDeliveryDetail struct { + DeliveryID string `json:"delivery_id"` + Source string `json:"source"` + PayloadMode string `json:"payload_mode"` + TemplateID string `json:"template_id"` + Locale string `json:"locale"` + To []string `json:"to"` + IdempotencyKey string `json:"idempotency_key"` + Status string `json:"status"` + TemplateVariables map[string]any `json:"template_variables,omitempty"` +} + +func (h *rtmNotificationHarness) eventuallyDelivery( + t *testing.T, + query url.Values, +) mailDeliverySummary { + t.Helper() + + deadline := time.Now().Add(30 * time.Second) + for { + summary, found := h.findDelivery(t, query) + if found { + return summary + } + if time.Now().After(deadline) { + t.Fatalf("mail delivery for query %v not observed within 30s\n%s", + query, h.notificationProcess.Logs()) + } + time.Sleep(50 * time.Millisecond) + } +} + +func (h *rtmNotificationHarness) findDelivery( + t *testing.T, + query url.Values, +) (mailDeliverySummary, bool) { + t.Helper() + + listURL := h.mailBaseURL + mailDeliveriesPath + "?" + query.Encode() + req, err := http.NewRequest(http.MethodGet, listURL, nil) + require.NoError(t, err) + resp := doRequest(t, req) + if resp.StatusCode != http.StatusOK { + return mailDeliverySummary{}, false + } + var body struct { + Items []mailDeliverySummary `json:"items"` + } + if err := json.Unmarshal([]byte(resp.Body), &body); err != nil { + return mailDeliverySummary{}, false + } + if len(body.Items) == 0 { + return mailDeliverySummary{}, false + } + return body.Items[0], true +} + +func (h *rtmNotificationHarness) getDelivery(t *testing.T, deliveryID string) mailDeliveryDetail { + t.Helper() + + req, err := http.NewRequest(http.MethodGet, h.mailBaseURL+mailDeliveriesPath+"/"+url.PathEscape(deliveryID), nil) + require.NoError(t, err) + resp := doRequest(t, req) + require.Equalf(t, http.StatusOK, resp.StatusCode, "get delivery: %s", resp.Body) + + // Mail's detail response carries many fields the suite does not + // assert on (cc, bcc, reply-to, attempt history, …). Use a + // lenient decoder so additive contract changes do not break this + // boundary test. 
+ var detail mailDeliveryDetail + require.NoError(t, json.Unmarshal([]byte(resp.Body), &detail)) + return detail +} + +// --- shared helpers (mirror the conventions of integration/notificationmail) --- + +type httpResponse struct { + StatusCode int + Body string + Header http.Header +} + +func doRequest(t *testing.T, request *http.Request) httpResponse { + t.Helper() + client := &http.Client{ + Timeout: 5 * time.Second, + Transport: &http.Transport{DisableKeepAlives: true}, + } + t.Cleanup(client.CloseIdleConnections) + + response, err := client.Do(request) + require.NoError(t, err) + defer response.Body.Close() + + payload, err := io.ReadAll(response.Body) + require.NoError(t, err) + return httpResponse{ + StatusCode: response.StatusCode, + Body: string(payload), + Header: response.Header.Clone(), + } +} + +func decodeStrictJSON(payload []byte, target any) error { + decoder := json.NewDecoder(bytes.NewReader(payload)) + decoder.DisallowUnknownFields() + if err := decoder.Decode(target); err != nil { + return err + } + if err := decoder.Decode(&struct{}{}); err != io.EOF { + if err == nil { + return errors.New("unexpected trailing JSON input") + } + return err + } + return nil +} + +func waitForUserServiceReady(t *testing.T, process *harness.Process, baseURL string) { + t.Helper() + client := &http.Client{Timeout: 250 * time.Millisecond} + t.Cleanup(client.CloseIdleConnections) + + deadline := time.Now().Add(10 * time.Second) + for time.Now().Before(deadline) { + req, err := http.NewRequest(http.MethodGet, + baseURL+"/api/v1/internal/users/user-readiness-probe/exists", nil) + require.NoError(t, err) + response, err := client.Do(req) + if err == nil { + _, _ = io.Copy(io.Discard, response.Body) + response.Body.Close() + if response.StatusCode == http.StatusOK { + return + } + } + time.Sleep(25 * time.Millisecond) + } + t.Fatalf("wait for userservice readiness: timeout\n%s", process.Logs()) +} + +func waitForMailReady(t *testing.T, process *harness.Process, baseURL string) { + t.Helper() + client := &http.Client{Timeout: 250 * time.Millisecond} + t.Cleanup(client.CloseIdleConnections) + + deadline := time.Now().Add(10 * time.Second) + for time.Now().Before(deadline) { + req, err := http.NewRequest(http.MethodGet, baseURL+mailDeliveriesPath, nil) + require.NoError(t, err) + response, err := client.Do(req) + if err == nil { + _, _ = io.Copy(io.Discard, response.Body) + response.Body.Close() + if response.StatusCode == http.StatusOK { + return + } + } + time.Sleep(25 * time.Millisecond) + } + t.Fatalf("wait for mail readiness: timeout\n%s", process.Logs()) +} + +func mailTemplateDir(t *testing.T) string { + t.Helper() + return filepath.Join(repositoryRoot(t), "mail", "templates") +} + +func repositoryRoot(t *testing.T) string { + t.Helper() + _, file, _, ok := runtime.Caller(0) + if !ok { + t.Fatal("resolve repository root: runtime caller is unavailable") + } + return filepath.Clean(filepath.Join(filepath.Dir(file), "..", "..")) +} + +// uniqueGameID derives a deterministic, per-test, per-invocation game +// id usable as the `game_id` field on `runtime:start_jobs` entries +// without colliding when `-count` exceeds one. 
+func uniqueGameID(t *testing.T) string { + t.Helper() + return fmt.Sprintf("game-%s-%d", sanitiseGameName(t.Name()), time.Now().UnixNano()) +} + +func sanitiseGameName(name string) string { + allowed := func(r rune) rune { + switch { + case r >= 'a' && r <= 'z', + r >= 'A' && r <= 'Z', + r >= '0' && r <= '9': + return r + case r == '/' || r == '_' || r == '-': + return '-' + default: + return -1 + } + } + out := make([]rune, 0, len(name)) + for _, r := range name { + if mapped := allowed(r); mapped != -1 { + out = append(out, mapped) + } + } + return string(out) +} + +// resolveDockerHost mirrors `rtmanager/integration/harness.runtime.go`: +// honour DOCKER_HOST when the developer machine routes through colima +// or a remote daemon, fall back to the standard unix path otherwise. +func resolveDockerHost() string { + if host := strings.TrimSpace(os.Getenv("DOCKER_HOST")); host != "" { + return host + } + return "unix:///var/run/docker.sock" +} diff --git a/lobby/Makefile b/lobby/Makefile index c2160b9..a77c577 100644 --- a/lobby/Makefile +++ b/lobby/Makefile @@ -3,8 +3,15 @@ # The `jet` target regenerates the go-jet/v2 query-builder code under # internal/adapters/postgres/jet/ against a transient PostgreSQL container # brought up by cmd/jetgen. Generated code is committed. +# +# The `mocks` target regenerates the gomock-driven mocks via the +# //go:generate directives that live next to the interfaces they cover +# under internal/ports/. Generated code is committed. -.PHONY: jet +.PHONY: jet mocks jet: go run ./cmd/jetgen + +mocks: + go generate ./internal/ports/... diff --git a/lobby/PLAN.md b/lobby/PLAN.md index f04360a..61c06ce 100644 --- a/lobby/PLAN.md +++ b/lobby/PLAN.md @@ -1441,3 +1441,12 @@ The implementation is complete only when all of the following hold: generator - `go test ./... -race` passes for the lobby module, the user module, the `pkg/notificationintent` module, and the integration module + +## Note: Runtime Manager Envelope Evolution + +Subsequent changes to the `runtime:start_jobs` and `runtime:stop_jobs` +envelopes — specifically the addition of `image_ref` to the start envelope +and the addition of the `reason` enum to the stop envelope — are owned by +the Runtime Manager implementation plan, not by this document. See +[`../rtmanager/PLAN.md`](../rtmanager/PLAN.md) §«Stage 06. Lobby publisher +refactor». No new stages are added here for that work. diff --git a/lobby/README.md b/lobby/README.md index c980beb..a268b42 100644 --- a/lobby/README.md +++ b/lobby/README.md @@ -344,7 +344,7 @@ On success: ### Application state machine -``` +```text submitted → approved submitted → rejected ``` @@ -453,7 +453,7 @@ with payload: `game_id`, `game_name`, `invitee_user_id`, `invitee_name`. ### Invite state machine -``` +```text created → redeemed created → declined created → revoked @@ -591,9 +591,11 @@ Sentinel errors: `ErrNameTaken`, `ErrInvalidName`, `ErrPendingMissing`, `pg_advisory_xact_lock(hashtextextended(canonical_key, 0))`. See `docs/postgres-migration.md` §6B for the full schema and decision record. -- **Stub** (`lobby/internal/adapters/racenamestub/directory.go`) — in-process - implementation for unit tests that do not need PostgreSQL. Chosen by - `LOBBY_RACE_NAME_DIRECTORY_BACKEND=stub`. 
+- **In-memory** (`lobby/internal/adapters/racenameinmem/directory.go`) — + in-process implementation used by unit tests that do not need + PostgreSQL and by deployments that select the in-memory backend with + `LOBBY_RACE_NAME_DIRECTORY_BACKEND=stub` (the config token name is + preserved for backward compatibility). A future dedicated `Race Name Service` replaces the adapter without changing the domain or service layer. @@ -737,7 +739,7 @@ sequenceDiagram - If the container starts but `Lobby` cannot persist the runtime binding metadata, the start is a full failure: `Lobby` must issue a stop job to `Runtime Manager` - before setting `start_failed`. + with `reason=orphan_cleanup` before setting `start_failed`. - If metadata is persisted but `Game Master` is unavailable, the game must be placed in `paused`, not in `start_failed`. The container is alive; only the platform tracking is incomplete. @@ -745,6 +747,96 @@ sequenceDiagram - Concurrent start attempts for the same game must be serialized; the second attempt must fail if the first already moved the game to `starting`. +### Runtime Manager envelopes + +`Lobby` is the producer for both `runtime:start_jobs` and `runtime:stop_jobs`. +The `Lobby ↔ Runtime Manager` transport stays asynchronous indefinitely; there +is no synchronous Lobby→RTM REST call in v1 or planned for v2. + +`runtime:start_jobs` envelope: + +| Field | Type | Notes | +| --- | --- | --- | +| `game_id` | string | Lobby `game_id`. | +| `image_ref` | string | Docker reference resolved from `target_engine_version` via `LOBBY_ENGINE_IMAGE_TEMPLATE`. | +| `requested_at_ms` | int64 | UTC milliseconds; diagnostics only. | + +`runtime:stop_jobs` envelope: + +| Field | Type | Notes | +| --- | --- | --- | +| `game_id` | string | | +| `reason` | enum | `orphan_cleanup`, `cancelled`, `finished`, `admin_request`, `timeout`. | +| `requested_at_ms` | int64 | UTC milliseconds. | + +`reason` semantics (Lobby producer side): + +- `orphan_cleanup` — used by Lobby's runtime-job-result consumer to release a + container whose metadata persistence failed after a successful container + start. +- `cancelled` — used by the user-lifecycle cascade and by explicit cancel paths + for in-flight games. +- `finished` — reserved; not produced by Lobby in v1 because `game_finished` + is engine-driven and stop jobs after finish are an Admin/GM concern. +- `admin_request` — reserved for future admin-initiated stop paths through + Lobby; not produced in v1. +- `timeout` — reserved for future enrollment-timeout-driven stop paths; not + produced in v1. + +### Design rationale: StopReason placement + +The `StopReason` enum is declared in +`lobby/internal/ports/runtimemanager.go` alongside the `RuntimeManager` +interface that consumes it. The enum is publisher-side protocol: it +mirrors the AsyncAPI discriminator on `runtime:stop_jobs`, has no +behaviour beyond `Validate`, and co-locating it with the interface keeps +the AsyncAPI ↔ Go mapping visible in one file. + +Alternatives considered and rejected: + +- a dedicated `lobby/internal/domain/runtimejob` package — manufactures + a domain layer for a single string enum that exists only to be + serialised onto a Redis Stream; +- placing the enum in the publisher adapter package + (`lobby/internal/adapters/runtimemanager`) — the callers (start-game + service, runtime-job-result worker, user-lifecycle worker) live + outside that package and would have to depend on a concrete adapter + for an enum value. 
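+
+For orientation, a minimal sketch of the co-located declarations follows.
+The constant names and the exact `Validate` signature shown here are
+illustrative; the authoritative code is
+`lobby/internal/ports/runtimemanager.go`, and the interface method
+signatures mirror the generated mocks under `internal/adapters/mocks/`.
+
+```go
+package ports
+
+import (
+	"context"
+	"fmt"
+)
+
+// StopReason mirrors the AsyncAPI discriminator on runtime:stop_jobs.
+type StopReason string
+
+const (
+	StopReasonOrphanCleanup StopReason = "orphan_cleanup"
+	StopReasonCancelled     StopReason = "cancelled"
+	StopReasonFinished      StopReason = "finished"
+	StopReasonAdminRequest  StopReason = "admin_request"
+	StopReasonTimeout       StopReason = "timeout"
+)
+
+// Validate reports whether the value is one of the enum members above.
+func (r StopReason) Validate() error {
+	switch r {
+	case StopReasonOrphanCleanup, StopReasonCancelled, StopReasonFinished,
+		StopReasonAdminRequest, StopReasonTimeout:
+		return nil
+	default:
+		return fmt.Errorf("invalid stop reason %q", r)
+	}
+}
+
+// RuntimeManager is the consumer-defined port the publisher adapter implements.
+type RuntimeManager interface {
+	PublishStartJob(ctx context.Context, gameID, imageRef string) error
+	PublishStopJob(ctx context.Context, gameID string, reason StopReason) error
+}
+```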
+ +### Design rationale: `engineimage.Resolver` validates the template at construction + +`engineimage.Resolver` stores the validated template; the per-game +`Resolve(version)` call is therefore a pure string substitution that +cannot fail except on an empty `version`. + +`LOBBY_ENGINE_IMAGE_TEMPLATE` is loaded at startup. A malformed value +(missing `{engine_version}` placeholder, empty string) is an +operational misconfiguration that fails fast before any traffic arrives +— not on the first start-game request hours later. The synchronous +start handler then incurs no per-call template-shape recheck. + +A stateless free function `engineimage.Resolve(template, version)` was +rejected: the only useful checkpoint for the template literal is at +startup; a free function would either re-validate on every call (waste) +or skip validation (regression). + +The resolver only guards against an empty/whitespace `version`. Semver +validation lives in `lobby/internal/domain/game/model.go:validateSemver` +and runs at game-record construction time. Re-running it inside the +resolver would either duplicate the rule (drift risk) or import the +validator across package boundaries for no behavioural gain. Keeping the +resolver narrow leaves it reusable from a future producer (for example +`Game Master`, when it takes over `image_ref` resolution) without +dragging Lobby's domain rules along. + +The defensive `return start game: resolve image ref: %w` in +`startgame.Service.Handle` is a guard against a future invariant +violation; it is not exercised by the service-level test suite because +the only resolver-failure mode (empty `version`) requires bypassing +`game.Validate`, which `gameinmem.Save` always runs. Adding test +scaffolding to skip validation would teach the test suite a back door +that the production code path does not have. + ## Paused State `Lobby.paused` is a platform-level pause, distinct from `Game Master` runtime @@ -1135,6 +1227,14 @@ Stream names: - `LOBBY_RUNTIME_JOB_RESULTS_READ_BLOCK_TIMEOUT` with default `2s` - `LOBBY_NOTIFICATION_INTENTS_STREAM` with default `notification:intents` +Runtime Manager integration: + +- `LOBBY_ENGINE_IMAGE_TEMPLATE` with default `galaxy/game:{engine_version}` — + Go-style template applied to a game's `target_engine_version` to resolve + the Docker `image_ref` published on `runtime:start_jobs`. The template + must contain the literal placeholder `{engine_version}`; Lobby fails + fast at startup otherwise. + Upstream clients: - `LOBBY_USER_SERVICE_TIMEOUT` with default `1s` @@ -1264,6 +1364,18 @@ Key operations emit structured logs with these stable field names where applicab ## Verification +Test doubles split between two styles. Wide-surface ports with no +production state (`RuntimeManager`, `IntentPublisher`, `GMClient`, +`UserService`) use `gomock`-generated mocks under +`internal/adapters/mocks/`; regenerate with `make -C lobby mocks`. +Stateful behavioural fakes that mirror the production adapter +contract (`gameinmem`, `applicationinmem`, `inviteinmem`, +`membershipinmem`, `gameturnstatsinmem`, `racenameinmem`, +`evaluationguardinmem`, `gapactivationinmem`, `streamoffsetinmem`) +live as in-memory adapters under `internal/adapters/inmem/` +and stay hand-rolled because tests rely on their CAS, status-transition, +and invariant-tracking behaviour. 
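+
+A minimal usage sketch of the gomock side follows; the test name, the
+zero-value `StopReason`, and the direct call on the mock are placeholders
+for whatever service invocation a real test exercises.
+
+```go
+package lobby_test
+
+import (
+	"context"
+	"testing"
+
+	"galaxy/lobby/internal/adapters/mocks"
+	"galaxy/lobby/internal/ports"
+
+	"go.uber.org/mock/gomock"
+)
+
+func TestStopJobPublishedOnce(t *testing.T) {
+	// NewController registers a cleanup that verifies every expectation.
+	rtm := mocks.NewMockRuntimeManager(gomock.NewController(t))
+
+	var reason ports.StopReason // zero value; real callers pass a named constant
+	rtm.EXPECT().
+		PublishStopJob(gomock.Any(), "game-1", reason).
+		Return(nil)
+
+	// Stand-in for the service call that would publish the stop job.
+	if err := rtm.PublishStopJob(context.Background(), "game-1", reason); err != nil {
+		t.Fatalf("publish stop job: %v", err)
+	}
+}
+```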
+ Focused service-local coverage verifies: - configuration loading and validation for all env var groups @@ -1274,7 +1386,7 @@ Focused service-local coverage verifies: - application flow: submit (eligibility check, race name check), approve, reject - invite flow: create, redeem (auto-membership), decline, revoke, expire on enrollment close - membership model: activate, remove, block with correct before/after-start semantics -- Race Name Directory (redis + stub adapters against the same suite): +- Race Name Directory (PostgreSQL + in-memory adapters against the same suite): canonicalization + confusable-pair policy, `Reserve`/`ReleaseReservation` per-game semantics, `MarkPendingRegistration`/`ExpirePendingRegistrations` window, `Register` idempotency + quota, `ReleaseAllByUser` cascade diff --git a/lobby/docs/runbook.md b/lobby/docs/runbook.md index 9161005..dda5814 100644 --- a/lobby/docs/runbook.md +++ b/lobby/docs/runbook.md @@ -35,8 +35,11 @@ Before starting the process, confirm: - `LOBBY_USER_LIFECYCLE_STREAM` (default `user:lifecycle_events`) - `LOBBY_NOTIFICATION_INTENTS_STREAM` (default `notification:intents`) - `LOBBY_RACE_NAME_DIRECTORY_BACKEND` is `postgres` for production - (the default after PG_PLAN.md §6B); the `stub` value is only for - unit tests that do not need a real PostgreSQL. + (the default after PG_PLAN.md §6B); the `stub` value selects the + in-memory adapter at `lobby/internal/adapters/racenameinmem/`, + intended for unit tests and small local deployments without + PostgreSQL. The config token name is kept as `stub` for backward + compatibility. At startup the process opens the PostgreSQL pool, applies migrations, pings PostgreSQL, then opens the Redis client and pings Redis. Startup diff --git a/lobby/docs/runtime.md b/lobby/docs/runtime.md index 4f41539..4f281bf 100644 --- a/lobby/docs/runtime.md +++ b/lobby/docs/runtime.md @@ -161,8 +161,11 @@ The groups below summarize the structure: - `Game Lobby` owns platform game state. Game Master may cache snapshots but is not the source of truth. - The Race Name Directory ships a PostgreSQL adapter (default after - PG_PLAN.md §6B) and an in-process stub. The stub is intended for unit - tests and is selected via `LOBBY_RACE_NAME_DIRECTORY_BACKEND=stub`. + PG_PLAN.md §6B) and an in-process implementation in + `lobby/internal/adapters/racenameinmem/`. The in-memory backend is + intended for unit tests and small local deployments and is selected + via `LOBBY_RACE_NAME_DIRECTORY_BACKEND=stub` (the config token name + is preserved for backward compatibility). 
- A `permanent_block` or `deleted` event from User Service fans out asynchronously through the `user:lifecycle_events` consumer; in-flight games owned by the affected user receive a stop-job and transition to diff --git a/lobby/go.mod b/lobby/go.mod index adadb3a..2373499 100644 --- a/lobby/go.mod +++ b/lobby/go.mod @@ -27,6 +27,7 @@ require ( go.opentelemetry.io/otel/sdk v1.43.0 go.opentelemetry.io/otel/sdk/metric v1.43.0 go.opentelemetry.io/otel/trace v1.43.0 + go.uber.org/mock v0.6.0 golang.org/x/mod v0.35.0 golang.org/x/text v0.36.0 ) diff --git a/lobby/go.sum b/lobby/go.sum index 452e7b9..a063535 100644 --- a/lobby/go.sum +++ b/lobby/go.sum @@ -326,6 +326,8 @@ go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE= go.uber.org/atomic v1.11.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.uber.org/mock v0.6.0 h1:hyF9dfmbgIX5EfOdasqLsWD6xqpNZlXblLB/Dbnwv3Y= +go.uber.org/mock v0.6.0/go.mod h1:KiVJ4BqZJaMj4svdfmHM0AUx4NJYO8ZNpPnZn1Z+BBU= go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= go.uber.org/multierr v1.3.0/go.mod h1:VgVr7evmIr6uPjLBxg28wmKNXyqE9akIJ5XnfpiKl+4= go.uber.org/multierr v1.5.0/go.mod h1:FeouvMocqHpRaaGuG9EjoKcStLC43Zu/fmqdUMPcKYU= diff --git a/lobby/internal/adapters/applicationstub/store.go b/lobby/internal/adapters/applicationinmem/store.go similarity index 98% rename from lobby/internal/adapters/applicationstub/store.go rename to lobby/internal/adapters/applicationinmem/store.go index 59785e2..26ba61f 100644 --- a/lobby/internal/adapters/applicationstub/store.go +++ b/lobby/internal/adapters/applicationinmem/store.go @@ -1,4 +1,4 @@ -// Package applicationstub provides an in-memory ports.ApplicationStore +// Package applicationinmem provides an in-memory ports.ApplicationStore // implementation for service-level tests. The stub mirrors the // behavioural contract of the Redis adapter in redisstate: it enforces // application.Transition for status updates, the single-active @@ -8,7 +8,7 @@ // Production code never wires this stub; it is test-only but exposed as // a regular (non _test.go) package so other service test packages can // import it. -package applicationstub +package applicationinmem import ( "context" diff --git a/lobby/internal/adapters/evaluationguardstub/store.go b/lobby/internal/adapters/evaluationguardinmem/store.go similarity index 95% rename from lobby/internal/adapters/evaluationguardstub/store.go rename to lobby/internal/adapters/evaluationguardinmem/store.go index c27ac55..f55b677 100644 --- a/lobby/internal/adapters/evaluationguardstub/store.go +++ b/lobby/internal/adapters/evaluationguardinmem/store.go @@ -1,7 +1,7 @@ -// Package evaluationguardstub provides an in-memory +// Package evaluationguardinmem provides an in-memory // ports.EvaluationGuardStore used by service-level capability evaluation // tests. Production code never wires this stub. 
-package evaluationguardstub +package evaluationguardinmem import ( "context" diff --git a/lobby/internal/adapters/gamestub/store.go b/lobby/internal/adapters/gameinmem/store.go similarity index 94% rename from lobby/internal/adapters/gamestub/store.go rename to lobby/internal/adapters/gameinmem/store.go index 064cc22..6eaa9aa 100644 --- a/lobby/internal/adapters/gamestub/store.go +++ b/lobby/internal/adapters/gameinmem/store.go @@ -1,13 +1,13 @@ -// Package gamestub provides an in-memory ports.GameStore implementation for -// service-level tests. The stub mirrors the behavioural contract of the -// Redis-backed adapter in redisstate: it enforces game.Transition for status -// updates, the ExpectedFrom CAS check, and the StartedAt/FinishedAt side -// effects of the canonical status transitions. +// Package gameinmem provides an in-memory ports.GameStore implementation +// for service-level tests. It mirrors the behavioural contract of the +// Redis-backed adapter in redisstate: it enforces game.Transition for +// status updates, the ExpectedFrom CAS check, and the +// StartedAt/FinishedAt side effects of the canonical status transitions. // -// Production code never wires this stub; it is test-only but exposed as a -// regular (non _test.go) package so other service test packages can import -// it. -package gamestub +// Production code never wires this adapter; it is test-only but exposed +// as a regular (non _test.go) package so other service test packages can +// import it. +package gameinmem import ( "context" diff --git a/lobby/internal/adapters/gamestub/store_test.go b/lobby/internal/adapters/gameinmem/store_test.go similarity index 99% rename from lobby/internal/adapters/gamestub/store_test.go rename to lobby/internal/adapters/gameinmem/store_test.go index 2ad2741..4edfc8e 100644 --- a/lobby/internal/adapters/gamestub/store_test.go +++ b/lobby/internal/adapters/gameinmem/store_test.go @@ -1,4 +1,4 @@ -package gamestub +package gameinmem import ( "context" diff --git a/lobby/internal/adapters/gameturnstatsstub/store.go b/lobby/internal/adapters/gameturnstatsinmem/store.go similarity index 98% rename from lobby/internal/adapters/gameturnstatsstub/store.go rename to lobby/internal/adapters/gameturnstatsinmem/store.go index 269ec5b..b0e6789 100644 --- a/lobby/internal/adapters/gameturnstatsstub/store.go +++ b/lobby/internal/adapters/gameturnstatsinmem/store.go @@ -1,4 +1,4 @@ -// Package gameturnstatsstub provides an in-memory ports.GameTurnStatsStore +// Package gameturnstatsinmem provides an in-memory ports.GameTurnStatsStore // implementation for service-level tests. The stub mirrors the behavioural // contract of the Redis adapter in redisstate: SaveInitial freezes the // initial fields on the first call per user, UpdateMax keeps the max fields @@ -8,7 +8,7 @@ // Production code never wires this stub; it is test-only but exposed as a // regular (non _test.go) package so downstream service test packages can // import it. 
-package gameturnstatsstub +package gameturnstatsinmem import ( "context" diff --git a/lobby/internal/adapters/gapactivationstub/store.go b/lobby/internal/adapters/gapactivationinmem/store.go similarity index 97% rename from lobby/internal/adapters/gapactivationstub/store.go rename to lobby/internal/adapters/gapactivationinmem/store.go index 3e92705..dab65bb 100644 --- a/lobby/internal/adapters/gapactivationstub/store.go +++ b/lobby/internal/adapters/gapactivationinmem/store.go @@ -1,9 +1,9 @@ -// Package gapactivationstub provides an in-memory +// Package gapactivationinmem provides an in-memory // ports.GapActivationStore implementation for service-level tests. The // stub records every MarkActivated call and offers WasActivated / // ActivatedAt accessors so test bodies can assert the gap-window trigger // fired exactly once. -package gapactivationstub +package gapactivationinmem import ( "context" diff --git a/lobby/internal/adapters/gmclientstub/client.go b/lobby/internal/adapters/gmclientstub/client.go deleted file mode 100644 index 95ce412..0000000 --- a/lobby/internal/adapters/gmclientstub/client.go +++ /dev/null @@ -1,89 +0,0 @@ -// Package gmclientstub provides an in-process ports.GMClient -// implementation used by service-level and worker-level tests that do -// not need to spin up an httptest server. The stub records every -// register call and every liveness probe, and supports independent -// error injection for each method so and paths can -// be exercised separately. -// -// Production code never wires this stub. -package gmclientstub - -import ( - "context" - "errors" - "sync" - - "galaxy/lobby/internal/ports" -) - -// Client is a concurrency-safe in-memory ports.GMClient. -type Client struct { - mu sync.Mutex - err error - pingErr error - requests []ports.RegisterGameRequest - pingCalls int -} - -// NewClient constructs an empty Client. -func NewClient() *Client { - return &Client{} -} - -// SetError makes the next RegisterGame calls return err. Passing nil -// clears the override. -func (client *Client) SetError(err error) { - client.mu.Lock() - defer client.mu.Unlock() - client.err = err -} - -// SetPingError makes the next Ping calls return err. Passing nil -// clears the override. RegisterGame is unaffected. -func (client *Client) SetPingError(err error) { - client.mu.Lock() - defer client.mu.Unlock() - client.pingErr = err -} - -// Requests returns the ordered slice of register requests received. -func (client *Client) Requests() []ports.RegisterGameRequest { - client.mu.Lock() - defer client.mu.Unlock() - return append([]ports.RegisterGameRequest(nil), client.requests...) -} - -// PingCalls returns the number of Ping invocations observed so far. -func (client *Client) PingCalls() int { - client.mu.Lock() - defer client.mu.Unlock() - return client.pingCalls -} - -// RegisterGame records the request and returns the configured error. -func (client *Client) RegisterGame(ctx context.Context, request ports.RegisterGameRequest) error { - if ctx == nil { - return errors.New("register game: nil context") - } - client.mu.Lock() - defer client.mu.Unlock() - if client.err != nil { - return client.err - } - client.requests = append(client.requests, request) - return nil -} - -// Ping increments the call counter and returns the configured error. 
-func (client *Client) Ping(ctx context.Context) error { - if ctx == nil { - return errors.New("ping: nil context") - } - client.mu.Lock() - defer client.mu.Unlock() - client.pingCalls++ - return client.pingErr -} - -// Compile-time interface assertion. -var _ ports.GMClient = (*Client)(nil) diff --git a/lobby/internal/adapters/intentpubstub/publisher.go b/lobby/internal/adapters/intentpubstub/publisher.go deleted file mode 100644 index 88c6765..0000000 --- a/lobby/internal/adapters/intentpubstub/publisher.go +++ /dev/null @@ -1,79 +0,0 @@ -// Package intentpubstub provides an in-process -// ports.IntentPublisher implementation for service-level tests. The -// stub records every Publish call and lets tests inject failures to -// verify that publication errors do not roll back already-committed -// business state. -package intentpubstub - -import ( - "context" - "errors" - "strconv" - "sync" - - "galaxy/lobby/internal/ports" - "galaxy/notificationintent" -) - -// Publisher is a concurrency-safe in-memory implementation of -// ports.IntentPublisher. The zero value is not usable; call NewPublisher -// to construct. -type Publisher struct { - mu sync.Mutex - published []notificationintent.Intent - nextID int - err error -} - -// NewPublisher constructs an empty Publisher ready for use. -func NewPublisher() *Publisher { - return &Publisher{} -} - -// SetError preloads err to be returned by every Publish call. Pass nil -// to reset. -func (publisher *Publisher) SetError(err error) { - if publisher == nil { - return - } - publisher.mu.Lock() - defer publisher.mu.Unlock() - publisher.err = err -} - -// Publish records intent and returns a synthetic stream entry id. -func (publisher *Publisher) Publish(ctx context.Context, intent notificationintent.Intent) (string, error) { - if publisher == nil { - return "", errors.New("publish notification intent: nil publisher") - } - if ctx == nil { - return "", errors.New("publish notification intent: nil context") - } - - publisher.mu.Lock() - defer publisher.mu.Unlock() - - if publisher.err != nil { - return "", publisher.err - } - - publisher.nextID++ - publisher.published = append(publisher.published, intent) - return strconv.Itoa(publisher.nextID), nil -} - -// Published returns a snapshot of every Publish-accepted intent in the -// order it was received. -func (publisher *Publisher) Published() []notificationintent.Intent { - if publisher == nil { - return nil - } - publisher.mu.Lock() - defer publisher.mu.Unlock() - out := make([]notificationintent.Intent, len(publisher.published)) - copy(out, publisher.published) - return out -} - -// Compile-time interface assertion. -var _ ports.IntentPublisher = (*Publisher)(nil) diff --git a/lobby/internal/adapters/invitestub/store.go b/lobby/internal/adapters/inviteinmem/store.go similarity index 98% rename from lobby/internal/adapters/invitestub/store.go rename to lobby/internal/adapters/inviteinmem/store.go index 6ccd8a7..d3ef90f 100644 --- a/lobby/internal/adapters/invitestub/store.go +++ b/lobby/internal/adapters/inviteinmem/store.go @@ -1,4 +1,4 @@ -// Package invitestub provides an in-memory ports.InviteStore implementation +// Package inviteinmem provides an in-memory ports.InviteStore implementation // for service-level tests. 
The stub mirrors the behavioural contract of the // Redis adapter in redisstate: Save is create-only, UpdateStatus enforces // invite.Transition and the ExpectedFrom CAS guard, and the index reads @@ -6,7 +6,7 @@ // // Production code never wires this stub; it is test-only but exposed as a // regular (non _test.go) package so other service test packages can import it. -package invitestub +package inviteinmem import ( "context" diff --git a/lobby/internal/adapters/membershipstub/store.go b/lobby/internal/adapters/membershipinmem/store.go similarity index 98% rename from lobby/internal/adapters/membershipstub/store.go rename to lobby/internal/adapters/membershipinmem/store.go index 29280e9..5a48660 100644 --- a/lobby/internal/adapters/membershipstub/store.go +++ b/lobby/internal/adapters/membershipinmem/store.go @@ -1,4 +1,4 @@ -// Package membershipstub provides an in-memory ports.MembershipStore +// Package membershipinmem provides an in-memory ports.MembershipStore // implementation for service-level tests. The stub mirrors the // behavioural contract of the Redis adapter in redisstate: Save is // create-only, UpdateStatus enforces membership.Transition and the @@ -8,7 +8,7 @@ // Production code never wires this stub; it is test-only but exposed as // a regular (non _test.go) package so other service test packages can // import it. -package membershipstub +package membershipinmem import ( "context" diff --git a/lobby/internal/adapters/metricsracenamedir/directory_test.go b/lobby/internal/adapters/metricsracenamedir/directory_test.go index 672018d..6808228 100644 --- a/lobby/internal/adapters/metricsracenamedir/directory_test.go +++ b/lobby/internal/adapters/metricsracenamedir/directory_test.go @@ -6,7 +6,7 @@ import ( "time" "galaxy/lobby/internal/adapters/metricsracenamedir" - "galaxy/lobby/internal/adapters/racenamestub" + "galaxy/lobby/internal/adapters/racenameinmem" "galaxy/lobby/internal/ports" "galaxy/lobby/internal/telemetry" @@ -28,7 +28,7 @@ func newRuntime(t *testing.T) (*telemetry.Runtime, sdkmetric.Reader) { func newInner(t *testing.T) ports.RaceNameDirectory { t.Helper() - stub, err := racenamestub.NewDirectory() + stub, err := racenameinmem.NewDirectory() require.NoError(t, err) return stub } diff --git a/lobby/internal/adapters/mocks/mock_gmclient.go b/lobby/internal/adapters/mocks/mock_gmclient.go new file mode 100644 index 0000000..24e88fa --- /dev/null +++ b/lobby/internal/adapters/mocks/mock_gmclient.go @@ -0,0 +1,70 @@ +// Code generated by MockGen. DO NOT EDIT. +// Source: galaxy/lobby/internal/ports (interfaces: GMClient) +// +// Generated by this command: +// +// mockgen -destination=../adapters/mocks/mock_gmclient.go -package=mocks galaxy/lobby/internal/ports GMClient +// + +// Package mocks is a generated GoMock package. +package mocks + +import ( + context "context" + ports "galaxy/lobby/internal/ports" + reflect "reflect" + + gomock "go.uber.org/mock/gomock" +) + +// MockGMClient is a mock of GMClient interface. +type MockGMClient struct { + ctrl *gomock.Controller + recorder *MockGMClientMockRecorder + isgomock struct{} +} + +// MockGMClientMockRecorder is the mock recorder for MockGMClient. +type MockGMClientMockRecorder struct { + mock *MockGMClient +} + +// NewMockGMClient creates a new mock instance. +func NewMockGMClient(ctrl *gomock.Controller) *MockGMClient { + mock := &MockGMClient{ctrl: ctrl} + mock.recorder = &MockGMClientMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. 
+func (m *MockGMClient) EXPECT() *MockGMClientMockRecorder { + return m.recorder +} + +// Ping mocks base method. +func (m *MockGMClient) Ping(ctx context.Context) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Ping", ctx) + ret0, _ := ret[0].(error) + return ret0 +} + +// Ping indicates an expected call of Ping. +func (mr *MockGMClientMockRecorder) Ping(ctx any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Ping", reflect.TypeOf((*MockGMClient)(nil).Ping), ctx) +} + +// RegisterGame mocks base method. +func (m *MockGMClient) RegisterGame(ctx context.Context, request ports.RegisterGameRequest) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "RegisterGame", ctx, request) + ret0, _ := ret[0].(error) + return ret0 +} + +// RegisterGame indicates an expected call of RegisterGame. +func (mr *MockGMClientMockRecorder) RegisterGame(ctx, request any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RegisterGame", reflect.TypeOf((*MockGMClient)(nil).RegisterGame), ctx, request) +} diff --git a/lobby/internal/adapters/mocks/mock_intentpublisher.go b/lobby/internal/adapters/mocks/mock_intentpublisher.go new file mode 100644 index 0000000..b952d10 --- /dev/null +++ b/lobby/internal/adapters/mocks/mock_intentpublisher.go @@ -0,0 +1,57 @@ +// Code generated by MockGen. DO NOT EDIT. +// Source: galaxy/lobby/internal/ports (interfaces: IntentPublisher) +// +// Generated by this command: +// +// mockgen -destination=../adapters/mocks/mock_intentpublisher.go -package=mocks galaxy/lobby/internal/ports IntentPublisher +// + +// Package mocks is a generated GoMock package. +package mocks + +import ( + context "context" + notificationintent "galaxy/notificationintent" + reflect "reflect" + + gomock "go.uber.org/mock/gomock" +) + +// MockIntentPublisher is a mock of IntentPublisher interface. +type MockIntentPublisher struct { + ctrl *gomock.Controller + recorder *MockIntentPublisherMockRecorder + isgomock struct{} +} + +// MockIntentPublisherMockRecorder is the mock recorder for MockIntentPublisher. +type MockIntentPublisherMockRecorder struct { + mock *MockIntentPublisher +} + +// NewMockIntentPublisher creates a new mock instance. +func NewMockIntentPublisher(ctrl *gomock.Controller) *MockIntentPublisher { + mock := &MockIntentPublisher{ctrl: ctrl} + mock.recorder = &MockIntentPublisherMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockIntentPublisher) EXPECT() *MockIntentPublisherMockRecorder { + return m.recorder +} + +// Publish mocks base method. +func (m *MockIntentPublisher) Publish(ctx context.Context, intent notificationintent.Intent) (string, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Publish", ctx, intent) + ret0, _ := ret[0].(string) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Publish indicates an expected call of Publish. +func (mr *MockIntentPublisherMockRecorder) Publish(ctx, intent any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Publish", reflect.TypeOf((*MockIntentPublisher)(nil).Publish), ctx, intent) +} diff --git a/lobby/internal/adapters/mocks/mock_runtimemanager.go b/lobby/internal/adapters/mocks/mock_runtimemanager.go new file mode 100644 index 0000000..c61c8d2 --- /dev/null +++ b/lobby/internal/adapters/mocks/mock_runtimemanager.go @@ -0,0 +1,70 @@ +// Code generated by MockGen. DO NOT EDIT. 
+// Source: galaxy/lobby/internal/ports (interfaces: RuntimeManager) +// +// Generated by this command: +// +// mockgen -destination=../adapters/mocks/mock_runtimemanager.go -package=mocks galaxy/lobby/internal/ports RuntimeManager +// + +// Package mocks is a generated GoMock package. +package mocks + +import ( + context "context" + ports "galaxy/lobby/internal/ports" + reflect "reflect" + + gomock "go.uber.org/mock/gomock" +) + +// MockRuntimeManager is a mock of RuntimeManager interface. +type MockRuntimeManager struct { + ctrl *gomock.Controller + recorder *MockRuntimeManagerMockRecorder + isgomock struct{} +} + +// MockRuntimeManagerMockRecorder is the mock recorder for MockRuntimeManager. +type MockRuntimeManagerMockRecorder struct { + mock *MockRuntimeManager +} + +// NewMockRuntimeManager creates a new mock instance. +func NewMockRuntimeManager(ctrl *gomock.Controller) *MockRuntimeManager { + mock := &MockRuntimeManager{ctrl: ctrl} + mock.recorder = &MockRuntimeManagerMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockRuntimeManager) EXPECT() *MockRuntimeManagerMockRecorder { + return m.recorder +} + +// PublishStartJob mocks base method. +func (m *MockRuntimeManager) PublishStartJob(ctx context.Context, gameID, imageRef string) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "PublishStartJob", ctx, gameID, imageRef) + ret0, _ := ret[0].(error) + return ret0 +} + +// PublishStartJob indicates an expected call of PublishStartJob. +func (mr *MockRuntimeManagerMockRecorder) PublishStartJob(ctx, gameID, imageRef any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "PublishStartJob", reflect.TypeOf((*MockRuntimeManager)(nil).PublishStartJob), ctx, gameID, imageRef) +} + +// PublishStopJob mocks base method. +func (m *MockRuntimeManager) PublishStopJob(ctx context.Context, gameID string, reason ports.StopReason) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "PublishStopJob", ctx, gameID, reason) + ret0, _ := ret[0].(error) + return ret0 +} + +// PublishStopJob indicates an expected call of PublishStopJob. +func (mr *MockRuntimeManagerMockRecorder) PublishStopJob(ctx, gameID, reason any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "PublishStopJob", reflect.TypeOf((*MockRuntimeManager)(nil).PublishStopJob), ctx, gameID, reason) +} diff --git a/lobby/internal/adapters/mocks/mock_userservice.go b/lobby/internal/adapters/mocks/mock_userservice.go new file mode 100644 index 0000000..8f06cdd --- /dev/null +++ b/lobby/internal/adapters/mocks/mock_userservice.go @@ -0,0 +1,57 @@ +// Code generated by MockGen. DO NOT EDIT. +// Source: galaxy/lobby/internal/ports (interfaces: UserService) +// +// Generated by this command: +// +// mockgen -destination=../adapters/mocks/mock_userservice.go -package=mocks galaxy/lobby/internal/ports UserService +// + +// Package mocks is a generated GoMock package. +package mocks + +import ( + context "context" + ports "galaxy/lobby/internal/ports" + reflect "reflect" + + gomock "go.uber.org/mock/gomock" +) + +// MockUserService is a mock of UserService interface. +type MockUserService struct { + ctrl *gomock.Controller + recorder *MockUserServiceMockRecorder + isgomock struct{} +} + +// MockUserServiceMockRecorder is the mock recorder for MockUserService. +type MockUserServiceMockRecorder struct { + mock *MockUserService +} + +// NewMockUserService creates a new mock instance. 
+func NewMockUserService(ctrl *gomock.Controller) *MockUserService { + mock := &MockUserService{ctrl: ctrl} + mock.recorder = &MockUserServiceMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockUserService) EXPECT() *MockUserServiceMockRecorder { + return m.recorder +} + +// GetEligibility mocks base method. +func (m *MockUserService) GetEligibility(ctx context.Context, userID string) (ports.Eligibility, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetEligibility", ctx, userID) + ret0, _ := ret[0].(ports.Eligibility) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetEligibility indicates an expected call of GetEligibility. +func (mr *MockUserServiceMockRecorder) GetEligibility(ctx, userID any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetEligibility", reflect.TypeOf((*MockUserService)(nil).GetEligibility), ctx, userID) +} diff --git a/lobby/internal/adapters/racenamestub/directory.go b/lobby/internal/adapters/racenameinmem/directory.go similarity index 96% rename from lobby/internal/adapters/racenamestub/directory.go rename to lobby/internal/adapters/racenameinmem/directory.go index 5214688..9579675 100644 --- a/lobby/internal/adapters/racenamestub/directory.go +++ b/lobby/internal/adapters/racenameinmem/directory.go @@ -1,10 +1,13 @@ -// Package racenamestub provides the in-process implementation of the -// ports.RaceNameDirectory contract used by unit tests that do not need -// a Redis dependency. The stub enforces the full two-tier Race Name -// Directory invariants (registered, reservation, pending_registration) -// across the lifetime of one process, and is interchangeable with the -// Redis adapter under the same shared behavioural test suite. -package racenamestub +// Package racenameinmem provides the in-process implementation of the +// ports.RaceNameDirectory contract. It is used both by unit tests that +// do not need a Redis dependency and by deployments that select the +// in-memory backend via LOBBY_RACE_NAME_DIRECTORY_BACKEND=stub. It +// enforces the full two-tier Race Name Directory invariants +// (registered, reservation, pending_registration) across the lifetime +// of one process, and is interchangeable with the PostgreSQL adapter +// under the shared behavioural test suite at +// galaxy/lobby/internal/ports/racenamedirtest. 
+package racenameinmem import ( "context" diff --git a/lobby/internal/adapters/racenamestub/directory_test.go b/lobby/internal/adapters/racenameinmem/directory_test.go similarity index 85% rename from lobby/internal/adapters/racenamestub/directory_test.go rename to lobby/internal/adapters/racenameinmem/directory_test.go index 4acf7a4..0d2e916 100644 --- a/lobby/internal/adapters/racenamestub/directory_test.go +++ b/lobby/internal/adapters/racenameinmem/directory_test.go @@ -1,4 +1,4 @@ -package racenamestub_test +package racenameinmem_test import ( "context" @@ -9,7 +9,7 @@ import ( "testing" "time" - "galaxy/lobby/internal/adapters/racenamestub" + "galaxy/lobby/internal/adapters/racenameinmem" "galaxy/lobby/internal/ports" "galaxy/lobby/internal/ports/racenamedirtest" @@ -19,11 +19,11 @@ import ( func TestDirectoryContract(t *testing.T) { racenamedirtest.Run(t, func(now func() time.Time) ports.RaceNameDirectory { - var opts []racenamestub.Option + var opts []racenameinmem.Option if now != nil { - opts = append(opts, racenamestub.WithClock(now)) + opts = append(opts, racenameinmem.WithClock(now)) } - directory, err := racenamestub.NewDirectory(opts...) + directory, err := racenameinmem.NewDirectory(opts...) require.NoError(t, err) return directory }) @@ -37,7 +37,7 @@ func TestReserveConcurrentUniquenessInvariant(t *testing.T) { const gameID = "game-concurrency" ctx := context.Background() - directory, err := racenamestub.NewDirectory() + directory, err := racenameinmem.NewDirectory() require.NoError(t, err) var ( diff --git a/lobby/internal/adapters/racenameintents/publisher_test.go b/lobby/internal/adapters/racenameintents/publisher_test.go index 654501a..d5ece06 100644 --- a/lobby/internal/adapters/racenameintents/publisher_test.go +++ b/lobby/internal/adapters/racenameintents/publisher_test.go @@ -6,7 +6,7 @@ import ( "testing" "time" - "galaxy/lobby/internal/adapters/intentpubstub" + "galaxy/lobby/internal/adapters/mocks" "galaxy/lobby/internal/adapters/racenameintents" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/service/capabilityevaluation" @@ -14,13 +14,26 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" ) +func captureIntents(t *testing.T) (*mocks.MockIntentPublisher, *[]notificationintent.Intent) { + t.Helper() + publisher := mocks.NewMockIntentPublisher(gomock.NewController(t)) + var captured []notificationintent.Intent + publisher.EXPECT().Publish(gomock.Any(), gomock.Any()). 
+ DoAndReturn(func(_ context.Context, intent notificationintent.Intent) (string, error) { + captured = append(captured, intent) + return "1", nil + }).AnyTimes() + return publisher, &captured +} + func TestPublisherEligibleProducesExpectedIntent(t *testing.T) { t.Parallel() - stub := intentpubstub.NewPublisher() - publisher, err := racenameintents.NewPublisher(racenameintents.Config{Publisher: stub}) + mock, captured := captureIntents(t) + publisher, err := racenameintents.NewPublisher(racenameintents.Config{Publisher: mock}) require.NoError(t, err) finishedAt := time.UnixMilli(1775121700000).UTC() @@ -34,9 +47,8 @@ func TestPublisherEligibleProducesExpectedIntent(t *testing.T) { FinishedAt: finishedAt, })) - published := stub.Published() - require.Len(t, published, 1) - intent := published[0] + require.Len(t, *captured, 1) + intent := (*captured)[0] assert.Equal(t, notificationintent.NotificationTypeLobbyRaceNameRegistrationEligible, intent.NotificationType) assert.Equal(t, notificationintent.ProducerGameLobby, intent.Producer) assert.Equal(t, notificationintent.AudienceKindUser, intent.AudienceKind) @@ -53,8 +65,8 @@ func TestPublisherEligibleProducesExpectedIntent(t *testing.T) { func TestPublisherDeniedProducesExpectedIntent(t *testing.T) { t.Parallel() - stub := intentpubstub.NewPublisher() - publisher, err := racenameintents.NewPublisher(racenameintents.Config{Publisher: stub}) + mock, captured := captureIntents(t) + publisher, err := racenameintents.NewPublisher(racenameintents.Config{Publisher: mock}) require.NoError(t, err) finishedAt := time.UnixMilli(1775121700000).UTC() @@ -67,9 +79,8 @@ func TestPublisherDeniedProducesExpectedIntent(t *testing.T) { Reason: capabilityevaluation.ReasonCapabilityNotMet, })) - published := stub.Published() - require.Len(t, published, 1) - intent := published[0] + require.Len(t, *captured, 1) + intent := (*captured)[0] assert.Equal(t, notificationintent.NotificationTypeLobbyRaceNameRegistrationDenied, intent.NotificationType) assert.Equal(t, notificationintent.ProducerGameLobby, intent.Producer) assert.Equal(t, notificationintent.AudienceKindUser, intent.AudienceKind) @@ -86,9 +97,10 @@ func TestPublisherDeniedProducesExpectedIntent(t *testing.T) { func TestPublisherSurfacesPublisherError(t *testing.T) { t.Parallel() - stub := intentpubstub.NewPublisher() - stub.SetError(errors.New("transport unavailable")) - publisher, err := racenameintents.NewPublisher(racenameintents.Config{Publisher: stub}) + mock := mocks.NewMockIntentPublisher(gomock.NewController(t)) + mock.EXPECT().Publish(gomock.Any(), gomock.Any()). + Return("", errors.New("transport unavailable")).Times(1) + publisher, err := racenameintents.NewPublisher(racenameintents.Config{Publisher: mock}) require.NoError(t, err) finishedAt := time.UnixMilli(1775121700000).UTC() diff --git a/lobby/internal/adapters/runtimemanager/publisher.go b/lobby/internal/adapters/runtimemanager/publisher.go index 0097f4c..2c73d3c 100644 --- a/lobby/internal/adapters/runtimemanager/publisher.go +++ b/lobby/internal/adapters/runtimemanager/publisher.go @@ -6,6 +6,15 @@ // The two streams are intentionally separate: each one carries a single // command kind, which keeps the consumer-side logic in Runtime Manager // simple and avoids a `kind` discriminator inside the message body. +// +// Envelope shape per `rtmanager/api/runtime-jobs-asyncapi.yaml`: +// +// - `runtime:start_jobs` — `{game_id, image_ref, requested_at_ms}`, +// - `runtime:stop_jobs` — `{game_id, reason, requested_at_ms}`. 
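// Illustrative sketch, not part of this change: one way a consumer could
// decode the start envelope listed above, assuming go-redis v9 (where
// stream field values arrive as strings) plus the strconv and fmt
// imports. The startJobEnvelope type and decodeStartJob helper are
// hypothetical names, not Lobby or Runtime Manager code.
//
//	type startJobEnvelope struct {
//		GameID        string
//		ImageRef      string
//		RequestedAtMs int64
//	}
//
//	func decodeStartJob(msg redis.XMessage) (startJobEnvelope, error) {
//		gameID, _ := msg.Values["game_id"].(string)
//		imageRef, _ := msg.Values["image_ref"].(string)
//		rawMs, _ := msg.Values["requested_at_ms"].(string)
//		ms, err := strconv.ParseInt(rawMs, 10, 64)
//		if err != nil {
//			return startJobEnvelope{}, fmt.Errorf("decode start job: %w", err)
//		}
//		return startJobEnvelope{GameID: gameID, ImageRef: imageRef, RequestedAtMs: ms}, nil
//	}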
+// +// The producer-supplied `image_ref` is resolved by the caller from the +// game's `target_engine_version` and the configured engine-image +// template; Runtime Manager never resolves engine versions itself. package runtimemanager import ( @@ -75,20 +84,45 @@ func NewPublisher(cfg Config) (*Publisher, error) { }, nil } -// PublishStartJob appends one start-job event for gameID to the -// configured start-jobs stream. -func (publisher *Publisher) PublishStartJob(ctx context.Context, gameID string) error { - return publisher.publish(ctx, "publish start job", publisher.startJobsStream, gameID) +// PublishStartJob appends one start-job event for gameID with the +// resolved imageRef to the configured start-jobs stream. +func (publisher *Publisher) PublishStartJob(ctx context.Context, gameID, imageRef string) error { + const op = "publish start job" + if err := publisher.checkCommon(op, ctx, gameID); err != nil { + return err + } + if strings.TrimSpace(imageRef) == "" { + return fmt.Errorf("%s: image ref must not be empty", op) + } + + values := map[string]any{ + "game_id": gameID, + "image_ref": imageRef, + "requested_at_ms": publisher.clock().UTC().UnixMilli(), + } + return publisher.xadd(ctx, op, publisher.startJobsStream, values) } -// PublishStopJob appends one stop-job event for gameID to the configured -// stop-jobs stream. In Lobby publishes stop jobs only from the -// orphan-container path inside the runtimejobresult worker. -func (publisher *Publisher) PublishStopJob(ctx context.Context, gameID string) error { - return publisher.publish(ctx, "publish stop job", publisher.stopJobsStream, gameID) +// PublishStopJob appends one stop-job event for gameID classified by +// reason to the configured stop-jobs stream. +func (publisher *Publisher) PublishStopJob(ctx context.Context, gameID string, reason ports.StopReason) error { + const op = "publish stop job" + if err := publisher.checkCommon(op, ctx, gameID); err != nil { + return err + } + if err := reason.Validate(); err != nil { + return fmt.Errorf("%s: %w", op, err) + } + + values := map[string]any{ + "game_id": gameID, + "reason": reason.String(), + "requested_at_ms": publisher.clock().UTC().UnixMilli(), + } + return publisher.xadd(ctx, op, publisher.stopJobsStream, values) } -func (publisher *Publisher) publish(ctx context.Context, op, stream, gameID string) error { +func (publisher *Publisher) checkCommon(op string, ctx context.Context, gameID string) error { if publisher == nil || publisher.client == nil { return fmt.Errorf("%s: nil publisher", op) } @@ -98,11 +132,10 @@ func (publisher *Publisher) publish(ctx context.Context, op, stream, gameID stri if strings.TrimSpace(gameID) == "" { return fmt.Errorf("%s: game id must not be empty", op) } + return nil +} - values := map[string]any{ - "game_id": gameID, - "requested_at_ms": publisher.clock().UTC().UnixMilli(), - } +func (publisher *Publisher) xadd(ctx context.Context, op, stream string, values map[string]any) error { if _, err := publisher.client.XAdd(ctx, &redis.XAddArgs{ Stream: stream, Values: values, diff --git a/lobby/internal/adapters/runtimemanager/publisher_test.go b/lobby/internal/adapters/runtimemanager/publisher_test.go index dda7555..59812fb 100644 --- a/lobby/internal/adapters/runtimemanager/publisher_test.go +++ b/lobby/internal/adapters/runtimemanager/publisher_test.go @@ -7,6 +7,7 @@ import ( "time" "galaxy/lobby/internal/adapters/runtimemanager" + "galaxy/lobby/internal/ports" "github.com/alicebob/miniredis/v2" "github.com/redis/go-redis/v9" @@ -60,12 +61,13 
@@ func TestPublishStartJobAppendsToStartStream(t *testing.T) { now := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) publisher, _, client := newTestPublisher(t, func() time.Time { return now }) - require.NoError(t, publisher.PublishStartJob(context.Background(), "game-1")) + require.NoError(t, publisher.PublishStartJob(context.Background(), "game-1", "galaxy/game:v1.0.0")) entries, err := client.XRange(context.Background(), "runtime:start_jobs", "-", "+").Result() require.NoError(t, err) require.Len(t, entries, 1) assert.Equal(t, "game-1", entries[0].Values["game_id"]) + assert.Equal(t, "galaxy/game:v1.0.0", entries[0].Values["image_ref"]) assert.Equal(t, strconv.FormatInt(now.UnixMilli(), 10), entries[0].Values["requested_at_ms"]) stop, err := client.XLen(context.Background(), "runtime:stop_jobs").Result() @@ -73,16 +75,29 @@ func TestPublishStartJobAppendsToStartStream(t *testing.T) { assert.Equal(t, int64(0), stop, "stop stream must remain empty") } +func TestPublisherStartJobIncludesImageRef(t *testing.T) { + publisher, _, client := newTestPublisher(t, nil) + + require.NoError(t, publisher.PublishStartJob(context.Background(), "game-1", "registry.example.com/galaxy/game:v1.4.7")) + + entries, err := client.XRange(context.Background(), "runtime:start_jobs", "-", "+").Result() + require.NoError(t, err) + require.Len(t, entries, 1) + assert.Equal(t, "registry.example.com/galaxy/game:v1.4.7", entries[0].Values["image_ref"], + "image_ref field must be present in the start envelope") +} + func TestPublishStopJobAppendsToStopStream(t *testing.T) { now := time.Date(2026, 4, 25, 13, 0, 0, 0, time.UTC) publisher, _, client := newTestPublisher(t, func() time.Time { return now }) - require.NoError(t, publisher.PublishStopJob(context.Background(), "game-2")) + require.NoError(t, publisher.PublishStopJob(context.Background(), "game-2", ports.StopReasonOrphanCleanup)) entries, err := client.XRange(context.Background(), "runtime:stop_jobs", "-", "+").Result() require.NoError(t, err) require.Len(t, entries, 1) assert.Equal(t, "game-2", entries[0].Values["game_id"]) + assert.Equal(t, "orphan_cleanup", entries[0].Values["reason"]) assert.Equal(t, strconv.FormatInt(now.UnixMilli(), 10), entries[0].Values["requested_at_ms"]) startLen, err := client.XLen(context.Background(), "runtime:start_jobs").Result() @@ -90,18 +105,44 @@ func TestPublishStopJobAppendsToStopStream(t *testing.T) { assert.Equal(t, int64(0), startLen, "start stream must remain empty") } +func TestPublisherStopJobIncludesReason(t *testing.T) { + publisher, _, client := newTestPublisher(t, nil) + + require.NoError(t, publisher.PublishStopJob(context.Background(), "game-2", ports.StopReasonCancelled)) + + entries, err := client.XRange(context.Background(), "runtime:stop_jobs", "-", "+").Result() + require.NoError(t, err) + require.Len(t, entries, 1) + assert.Equal(t, "cancelled", entries[0].Values["reason"], + "reason field must be present in the stop envelope") +} + func TestPublishRejectsEmptyGameID(t *testing.T) { publisher, _, _ := newTestPublisher(t, nil) - require.Error(t, publisher.PublishStartJob(context.Background(), "")) - require.Error(t, publisher.PublishStopJob(context.Background(), " ")) + require.Error(t, publisher.PublishStartJob(context.Background(), "", "galaxy/game:v1.0.0")) + require.Error(t, publisher.PublishStopJob(context.Background(), " ", ports.StopReasonCancelled)) +} + +func TestPublishStartJobRejectsEmptyImageRef(t *testing.T) { + publisher, _, _ := newTestPublisher(t, nil) + + require.Error(t, 
publisher.PublishStartJob(context.Background(), "game-1", "")) + require.Error(t, publisher.PublishStartJob(context.Background(), "game-1", " ")) +} + +func TestPublishStopJobRejectsUnknownReason(t *testing.T) { + publisher, _, _ := newTestPublisher(t, nil) + + require.Error(t, publisher.PublishStopJob(context.Background(), "game-1", ports.StopReason(""))) + require.Error(t, publisher.PublishStopJob(context.Background(), "game-1", ports.StopReason("unknown_reason"))) } func TestPublishRejectsNilContext(t *testing.T) { publisher, _, _ := newTestPublisher(t, nil) - require.Error(t, publisher.PublishStartJob(nilContext(), "game-1")) - require.Error(t, publisher.PublishStopJob(nilContext(), "game-1")) + require.Error(t, publisher.PublishStartJob(nilContext(), "game-1", "galaxy/game:v1.0.0")) + require.Error(t, publisher.PublishStopJob(nilContext(), "game-1", ports.StopReasonCancelled)) } // nilContext returns an explicit untyped nil to exercise the defensive diff --git a/lobby/internal/adapters/runtimemanagerstub/publisher.go b/lobby/internal/adapters/runtimemanagerstub/publisher.go deleted file mode 100644 index 024f239..0000000 --- a/lobby/internal/adapters/runtimemanagerstub/publisher.go +++ /dev/null @@ -1,92 +0,0 @@ -// Package runtimemanagerstub provides an in-process ports.RuntimeManager -// implementation used by service-level and worker-level tests that do -// not need a real Redis connection. The stub records every published -// job and supports inject-on-error to simulate stream failures. -// -// Production code never wires this stub. -package runtimemanagerstub - -import ( - "context" - "errors" - "sync" - - "galaxy/lobby/internal/ports" -) - -// Publisher is a concurrency-safe in-memory ports.RuntimeManager. -type Publisher struct { - mu sync.Mutex - startErr error - stopErr error - startJobs []string - stopJobs []string -} - -// NewPublisher constructs an empty Publisher. -func NewPublisher() *Publisher { - return &Publisher{} -} - -// SetStartError makes the next PublishStartJob calls return err. -// Passing nil clears the override. -func (publisher *Publisher) SetStartError(err error) { - publisher.mu.Lock() - defer publisher.mu.Unlock() - publisher.startErr = err -} - -// SetStopError makes the next PublishStopJob calls return err. -// Passing nil clears the override. -func (publisher *Publisher) SetStopError(err error) { - publisher.mu.Lock() - defer publisher.mu.Unlock() - publisher.stopErr = err -} - -// StartJobs returns the ordered slice of game ids passed to -// PublishStartJob. -func (publisher *Publisher) StartJobs() []string { - publisher.mu.Lock() - defer publisher.mu.Unlock() - return append([]string(nil), publisher.startJobs...) -} - -// StopJobs returns the ordered slice of game ids passed to -// PublishStopJob. -func (publisher *Publisher) StopJobs() []string { - publisher.mu.Lock() - defer publisher.mu.Unlock() - return append([]string(nil), publisher.stopJobs...) -} - -// PublishStartJob records gameID and returns the configured error. -func (publisher *Publisher) PublishStartJob(ctx context.Context, gameID string) error { - if ctx == nil { - return errors.New("publish start job: nil context") - } - publisher.mu.Lock() - defer publisher.mu.Unlock() - if publisher.startErr != nil { - return publisher.startErr - } - publisher.startJobs = append(publisher.startJobs, gameID) - return nil -} - -// PublishStopJob records gameID and returns the configured error. 
-func (publisher *Publisher) PublishStopJob(ctx context.Context, gameID string) error { - if ctx == nil { - return errors.New("publish stop job: nil context") - } - publisher.mu.Lock() - defer publisher.mu.Unlock() - if publisher.stopErr != nil { - return publisher.stopErr - } - publisher.stopJobs = append(publisher.stopJobs, gameID) - return nil -} - -// Compile-time interface assertion. -var _ ports.RuntimeManager = (*Publisher)(nil) diff --git a/lobby/internal/adapters/streamlagprobestub/probe.go b/lobby/internal/adapters/streamlagprobestub/probe.go deleted file mode 100644 index 74ccc06..0000000 --- a/lobby/internal/adapters/streamlagprobestub/probe.go +++ /dev/null @@ -1,61 +0,0 @@ -// Package streamlagprobestub provides an in-memory ports.StreamLagProbe -// implementation for tests that do not need a Redis instance. Production -// code never wires this stub. -package streamlagprobestub - -import ( - "context" - "sync" - "time" - - "galaxy/lobby/internal/ports" -) - -// Probe is a concurrency-safe in-memory ports.StreamLagProbe. The zero -// value reports `(0, false, nil)` for every stream until Set is called. -type Probe struct { - mu sync.Mutex - results map[string]Result - fallback Result -} - -// Result stores the value the probe reports for a stream. -type Result struct { - Age time.Duration - Found bool - Err error -} - -// NewProbe constructs one Probe with no preconfigured results. -func NewProbe() *Probe { - return &Probe{results: make(map[string]Result)} -} - -// Set installs the result the probe will return for stream. -func (probe *Probe) Set(stream string, result Result) { - probe.mu.Lock() - defer probe.mu.Unlock() - probe.results[stream] = result -} - -// SetFallback installs the result returned when no per-stream result is -// configured. -func (probe *Probe) SetFallback(result Result) { - probe.mu.Lock() - defer probe.mu.Unlock() - probe.fallback = result -} - -// OldestUnprocessedAge satisfies ports.StreamLagProbe. -func (probe *Probe) OldestUnprocessedAge(_ context.Context, stream, _ string) (time.Duration, bool, error) { - probe.mu.Lock() - defer probe.mu.Unlock() - - if result, ok := probe.results[stream]; ok { - return result.Age, result.Found, result.Err - } - return probe.fallback.Age, probe.fallback.Found, probe.fallback.Err -} - -// Compile-time interface assertion. -var _ ports.StreamLagProbe = (*Probe)(nil) diff --git a/lobby/internal/adapters/streamoffsetstub/store.go b/lobby/internal/adapters/streamoffsetinmem/store.go similarity index 93% rename from lobby/internal/adapters/streamoffsetstub/store.go rename to lobby/internal/adapters/streamoffsetinmem/store.go index 76645ab..31895f7 100644 --- a/lobby/internal/adapters/streamoffsetstub/store.go +++ b/lobby/internal/adapters/streamoffsetinmem/store.go @@ -1,7 +1,7 @@ -// Package streamoffsetstub provides an in-process ports.StreamOffsetStore +// Package streamoffsetinmem provides an in-process ports.StreamOffsetStore // used by worker-level tests that do not need Redis. Production code // never wires this stub. 
-package streamoffsetstub +package streamoffsetinmem import ( "context" diff --git a/lobby/internal/adapters/userlifecycle/consumer_test.go b/lobby/internal/adapters/userlifecycle/consumer_test.go index 4e0c1fe..c89abd2 100644 --- a/lobby/internal/adapters/userlifecycle/consumer_test.go +++ b/lobby/internal/adapters/userlifecycle/consumer_test.go @@ -10,7 +10,7 @@ import ( "testing" "time" - "galaxy/lobby/internal/adapters/streamoffsetstub" + "galaxy/lobby/internal/adapters/streamoffsetinmem" "galaxy/lobby/internal/adapters/userlifecycle" "galaxy/lobby/internal/ports" @@ -33,7 +33,7 @@ func silentLogger() *slog.Logger { return slog.New(slog.NewTextHandler(io.Discar type harness struct { server *miniredis.Miniredis client *redis.Client - offsets *streamoffsetstub.Store + offsets *streamoffsetinmem.Store consumer *userlifecycle.Consumer } @@ -43,7 +43,7 @@ func newHarness(t *testing.T) *harness { client := redis.NewClient(&redis.Options{Addr: server.Addr()}) t.Cleanup(func() { _ = client.Close() }) - offsets := streamoffsetstub.NewStore() + offsets := streamoffsetinmem.NewStore() consumer, err := userlifecycle.NewConsumer(userlifecycle.Config{ Client: client, Stream: testStream, @@ -70,21 +70,21 @@ func TestNewConsumerRejectsMissingDeps(t *testing.T) { _, err := userlifecycle.NewConsumer(userlifecycle.Config{ Stream: testStream, BlockTimeout: time.Second, - OffsetStore: streamoffsetstub.NewStore(), + OffsetStore: streamoffsetinmem.NewStore(), }) require.Error(t, err) _, err = userlifecycle.NewConsumer(userlifecycle.Config{ Client: client, BlockTimeout: time.Second, - OffsetStore: streamoffsetstub.NewStore(), + OffsetStore: streamoffsetinmem.NewStore(), }) require.Error(t, err) _, err = userlifecycle.NewConsumer(userlifecycle.Config{ Client: client, Stream: testStream, - OffsetStore: streamoffsetstub.NewStore(), + OffsetStore: streamoffsetinmem.NewStore(), }) require.Error(t, err) diff --git a/lobby/internal/adapters/userlifecyclestub/consumer.go b/lobby/internal/adapters/userlifecyclestub/consumer.go deleted file mode 100644 index cf05101..0000000 --- a/lobby/internal/adapters/userlifecyclestub/consumer.go +++ /dev/null @@ -1,79 +0,0 @@ -// Package userlifecyclestub provides an in-process -// ports.UserLifecycleConsumer used by worker-level tests that do not -// need a real Redis stream. Production code never wires this stub. -package userlifecyclestub - -import ( - "context" - "errors" - "sync" - - "galaxy/lobby/internal/ports" -) - -// Consumer is an in-memory ports.UserLifecycleConsumer. Tests publish -// events synchronously through Deliver and observe handler errors via -// the returned value. -type Consumer struct { - mu sync.Mutex - handler ports.UserLifecycleHandler -} - -// NewConsumer constructs an empty Consumer. -func NewConsumer() *Consumer { - return &Consumer{} -} - -// OnEvent installs handler as the dispatch target. A second call -// replaces the previous handler. -func (consumer *Consumer) OnEvent(handler ports.UserLifecycleHandler) { - if consumer == nil { - return - } - consumer.mu.Lock() - consumer.handler = handler - consumer.mu.Unlock() -} - -// Run blocks until ctx is cancelled. The stub does not pull events from -// any backend; test code drives delivery via Deliver. -func (consumer *Consumer) Run(ctx context.Context) error { - if consumer == nil { - return errors.New("run user lifecycle stub: nil consumer") - } - if ctx == nil { - return errors.New("run user lifecycle stub: nil context") - } - <-ctx.Done() - return ctx.Err() -} - -// Shutdown is a no-op. 
-func (consumer *Consumer) Shutdown(ctx context.Context) error { - if ctx == nil { - return errors.New("shutdown user lifecycle stub: nil context") - } - return nil -} - -// Deliver dispatches event to the registered handler synchronously and -// returns the handler's error. It is the test-only entry point used by -// worker_test fixtures. -func (consumer *Consumer) Deliver(ctx context.Context, event ports.UserLifecycleEvent) error { - if consumer == nil { - return errors.New("deliver user lifecycle stub: nil consumer") - } - if ctx == nil { - return errors.New("deliver user lifecycle stub: nil context") - } - consumer.mu.Lock() - handler := consumer.handler - consumer.mu.Unlock() - if handler == nil { - return errors.New("deliver user lifecycle stub: no handler registered") - } - return handler(ctx, event) -} - -// Compile-time assertion: Consumer satisfies the port interface. -var _ ports.UserLifecycleConsumer = (*Consumer)(nil) diff --git a/lobby/internal/adapters/userservicestub/service.go b/lobby/internal/adapters/userservicestub/service.go deleted file mode 100644 index f0b64f9..0000000 --- a/lobby/internal/adapters/userservicestub/service.go +++ /dev/null @@ -1,107 +0,0 @@ -// Package userservicestub provides an in-process -// ports.UserService implementation for service-level tests. The stub -// stores per-user Eligibility values and lets tests inject errors for -// specific user ids to exercise the unavailable / decode-failure paths. -package userservicestub - -import ( - "context" - "errors" - "fmt" - "strings" - "sync" - - "galaxy/lobby/internal/ports" -) - -// Service is a concurrency-safe in-memory implementation of -// ports.UserService. The zero value is not usable; call NewService to -// construct. -type Service struct { - mu sync.Mutex - eligibilities map[string]ports.Eligibility - failures map[string]error - defaultMissing bool -} - -// NewService constructs an empty Service with no preloaded -// eligibilities. By default an unknown user maps to -// Eligibility{Exists:false}, mirroring the production HTTP client's -// 404 handling. Use WithDefaultUnavailable to flip the unknown-user -// behaviour to a transport failure. -func NewService(opts ...Option) *Service { - service := &Service{ - eligibilities: make(map[string]ports.Eligibility), - failures: make(map[string]error), - } - for _, opt := range opts { - opt(service) - } - return service -} - -// Option tunes Service construction. -type Option func(*Service) - -// WithDefaultUnavailable makes the stub return ErrUserServiceUnavailable -// for any user id without a preloaded eligibility or failure entry. -// Useful for tests that exercise the "User Service down" path without -// having to enumerate every caller. -func WithDefaultUnavailable() Option { - return func(service *Service) { - service.defaultMissing = true - } -} - -// SetEligibility preloads eligibility for userID. Subsequent calls -// overwrite the prior value. -func (service *Service) SetEligibility(userID string, eligibility ports.Eligibility) { - if service == nil { - return - } - service.mu.Lock() - defer service.mu.Unlock() - service.eligibilities[strings.TrimSpace(userID)] = eligibility -} - -// SetFailure preloads err to be returned for userID. err takes -// precedence over any preloaded eligibility. 
-func (service *Service) SetFailure(userID string, err error) { - if service == nil { - return - } - service.mu.Lock() - defer service.mu.Unlock() - service.failures[strings.TrimSpace(userID)] = err -} - -// GetEligibility returns the preloaded eligibility for userID. -func (service *Service) GetEligibility(ctx context.Context, userID string) (ports.Eligibility, error) { - if service == nil { - return ports.Eligibility{}, errors.New("get eligibility: nil service") - } - if ctx == nil { - return ports.Eligibility{}, errors.New("get eligibility: nil context") - } - trimmed := strings.TrimSpace(userID) - if trimmed == "" { - return ports.Eligibility{}, errors.New("get eligibility: user id must not be empty") - } - - service.mu.Lock() - defer service.mu.Unlock() - - if err, ok := service.failures[trimmed]; ok { - return ports.Eligibility{}, err - } - if eligibility, ok := service.eligibilities[trimmed]; ok { - return eligibility, nil - } - if service.defaultMissing { - return ports.Eligibility{}, fmt.Errorf("get eligibility: %w", ports.ErrUserServiceUnavailable) - } - return ports.Eligibility{Exists: false}, nil -} - -// Compile-time interface assertion. -var _ ports.UserService = (*Service)(nil) diff --git a/lobby/internal/api/internalhttp/games_test.go b/lobby/internal/api/internalhttp/games_test.go index ca9b96b..62b1884 100644 --- a/lobby/internal/api/internalhttp/games_test.go +++ b/lobby/internal/api/internalhttp/games_test.go @@ -11,7 +11,7 @@ import ( "testing" "time" - "galaxy/lobby/internal/adapters/gamestub" + "galaxy/lobby/internal/adapters/gameinmem" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" "galaxy/lobby/internal/ports" @@ -51,7 +51,7 @@ func fixedClock(at time.Time) func() time.Time { return func() time.Time { return at } } -func buildHandler(t *testing.T, store *gamestub.Store, ids ports.IDGenerator, clock func() time.Time) http.Handler { +func buildHandler(t *testing.T, store *gameinmem.Store, ids ports.IDGenerator, clock func() time.Time) http.Handler { t.Helper() logger := silentLogger() @@ -131,7 +131,7 @@ func TestAdminCreatesPublicGame(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() handler := buildHandler(t, store, &stubIDGenerator{next: "game-public"}, fixedClock(now)) body := createGameRequest{ @@ -158,7 +158,7 @@ func TestAdminCannotCreatePrivateGame(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - handler := buildHandler(t, gamestub.NewStore(), &stubIDGenerator{next: "game-priv"}, fixedClock(now)) + handler := buildHandler(t, gameinmem.NewStore(), &stubIDGenerator{next: "game-priv"}, fixedClock(now)) body := createGameRequest{ GameName: "Private Lobby", @@ -181,7 +181,7 @@ func TestAdminValidationError(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - handler := buildHandler(t, gamestub.NewStore(), &stubIDGenerator{next: "game-bad"}, fixedClock(now)) + handler := buildHandler(t, gameinmem.NewStore(), &stubIDGenerator{next: "game-bad"}, fixedClock(now)) body := createGameRequest{ GameName: "", @@ -204,7 +204,7 @@ func TestAdminUpdateAllFieldsInDraft(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() seedDraftForTest(t, store, "game-u", game.GameTypePublic, "", now) handler := buildHandler(t, store, &stubIDGenerator{next: "unused"}, fixedClock(now.Add(time.Hour))) @@ -221,7 +221,7 
@@ func TestAdminOpenEnrollment(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() seedDraftForTest(t, store, "game-oe", game.GameTypePublic, "", now) handler := buildHandler(t, store, &stubIDGenerator{next: "unused"}, fixedClock(now.Add(time.Hour))) @@ -236,7 +236,7 @@ func TestAdminCancelFromRunning(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() record := seedDraftForTest(t, store, "game-run", game.GameTypePublic, "", now) // Force status to running to exercise the 409 conflict path. record.Status = game.StatusRunning @@ -257,7 +257,7 @@ func TestAdminUpdateNotFound(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - handler := buildHandler(t, gamestub.NewStore(), &stubIDGenerator{next: "unused"}, fixedClock(now)) + handler := buildHandler(t, gameinmem.NewStore(), &stubIDGenerator{next: "unused"}, fixedClock(now)) desc := "x" body := updateGameRequest{Description: &desc} @@ -269,7 +269,7 @@ func TestAdminCreateUnknownFieldRejected(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - handler := buildHandler(t, gamestub.NewStore(), &stubIDGenerator{next: "unused"}, fixedClock(now)) + handler := buildHandler(t, gameinmem.NewStore(), &stubIDGenerator{next: "unused"}, fixedClock(now)) reqBody := map[string]any{ "game_name": "x", @@ -289,7 +289,7 @@ func TestAdminCreateUnknownFieldRejected(t *testing.T) { func seedDraftForTest( t *testing.T, - store *gamestub.Store, + store *gameinmem.Store, id common.GameID, gameType game.GameType, ownerUserID string, diff --git a/lobby/internal/api/publichttp/games_test.go b/lobby/internal/api/publichttp/games_test.go index 1640019..e2ec90e 100644 --- a/lobby/internal/api/publichttp/games_test.go +++ b/lobby/internal/api/publichttp/games_test.go @@ -11,7 +11,7 @@ import ( "testing" "time" - "galaxy/lobby/internal/adapters/gamestub" + "galaxy/lobby/internal/adapters/gameinmem" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" "galaxy/lobby/internal/ports" @@ -47,7 +47,7 @@ func silentLogger() *slog.Logger { return slog.New(slog.NewTextHandler(io.Discard, nil)) } -func buildHandler(t *testing.T, store *gamestub.Store, ids ports.IDGenerator, clock func() time.Time) http.Handler { +func buildHandler(t *testing.T, store *gameinmem.Store, ids ports.IDGenerator, clock func() time.Time) http.Handler { t.Helper() logger := silentLogger() @@ -134,7 +134,7 @@ func TestCreateGameHappyPath(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() handler := buildHandler(t, store, &stubIDGenerator{next: "game-first"}, fixedClock(now)) body := createGameRequest{ @@ -164,7 +164,7 @@ func TestCreateGameMissingUserIDHeader(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - handler := buildHandler(t, gamestub.NewStore(), &stubIDGenerator{next: "game-x"}, fixedClock(now)) + handler := buildHandler(t, gameinmem.NewStore(), &stubIDGenerator{next: "game-x"}, fixedClock(now)) body := createGameRequest{ GameName: "x", @@ -189,7 +189,7 @@ func TestCreateGameUnknownJSONFieldRejected(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - handler := buildHandler(t, gamestub.NewStore(), &stubIDGenerator{next: "game-x"}, fixedClock(now)) + handler := buildHandler(t, 
gameinmem.NewStore(), &stubIDGenerator{next: "game-x"}, fixedClock(now)) reqBody := map[string]any{ "game_name": "x", @@ -211,7 +211,7 @@ func TestCreateGameUserCannotCreatePublic(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - handler := buildHandler(t, gamestub.NewStore(), &stubIDGenerator{next: "game-x"}, fixedClock(now)) + handler := buildHandler(t, gameinmem.NewStore(), &stubIDGenerator{next: "game-x"}, fixedClock(now)) body := createGameRequest{ GameName: "x", @@ -234,7 +234,7 @@ func TestUpdateGameNotFound(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - handler := buildHandler(t, gamestub.NewStore(), &stubIDGenerator{next: "game-x"}, fixedClock(now)) + handler := buildHandler(t, gameinmem.NewStore(), &stubIDGenerator{next: "game-x"}, fixedClock(now)) desc := "new" body := updateGameRequest{Description: &desc} @@ -248,7 +248,7 @@ func TestOpenEnrollmentHappyPath(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() seedDraftForTest(t, store, "game-oe", game.GameTypePrivate, "user-1", now) @@ -264,7 +264,7 @@ func TestOpenEnrollmentForbidden(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() seedDraftForTest(t, store, "game-oe", game.GameTypePrivate, "user-1", now) @@ -278,7 +278,7 @@ func TestOpenEnrollmentConflict(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() seedDraftForTest(t, store, "game-oe", game.GameTypePrivate, "user-1", now) require.NoError(t, store.UpdateStatus(context.Background(), ports.UpdateStatusInput{ @@ -301,7 +301,7 @@ func TestCancelGameHappyPath(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() seedDraftForTest(t, store, "game-cx", game.GameTypePrivate, "user-1", now) @@ -315,7 +315,7 @@ func TestCancelGameHappyPath(t *testing.T) { func seedDraftForTest( t *testing.T, - store *gamestub.Store, + store *gameinmem.Store, id common.GameID, gameType game.GameType, ownerUserID string, diff --git a/lobby/internal/api/publichttp/racenames_test.go b/lobby/internal/api/publichttp/racenames_test.go index 0b656f3..50316d4 100644 --- a/lobby/internal/api/publichttp/racenames_test.go +++ b/lobby/internal/api/publichttp/racenames_test.go @@ -4,44 +4,114 @@ import ( "context" "encoding/json" "net/http" + "sync" "testing" "time" - "galaxy/lobby/internal/adapters/gamestub" - "galaxy/lobby/internal/adapters/intentpubstub" - "galaxy/lobby/internal/adapters/racenamestub" - "galaxy/lobby/internal/adapters/userservicestub" + "galaxy/lobby/internal/adapters/gameinmem" + "galaxy/lobby/internal/adapters/mocks" + "galaxy/lobby/internal/adapters/racenameinmem" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" "galaxy/lobby/internal/ports" "galaxy/lobby/internal/service/listmyracenames" "galaxy/lobby/internal/service/registerracename" + "galaxy/notificationintent" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" ) +type publishedIntentRec struct { + mu sync.Mutex + published []notificationintent.Intent +} + +func (r *publishedIntentRec) record(_ context.Context, intent notificationintent.Intent) (string, error) { + r.mu.Lock() + defer r.mu.Unlock() + r.published = 
append(r.published, intent) + return "1", nil +} + +func (r *publishedIntentRec) snapshot() []notificationintent.Intent { + r.mu.Lock() + defer r.mu.Unlock() + return append([]notificationintent.Intent(nil), r.published...) +} + +type userEligibilityRec struct { + mu sync.Mutex + elig map[string]ports.Eligibility + failures map[string]error +} + +func (r *userEligibilityRec) record(_ context.Context, userID string) (ports.Eligibility, error) { + r.mu.Lock() + defer r.mu.Unlock() + if err, ok := r.failures[userID]; ok { + return ports.Eligibility{}, err + } + if e, ok := r.elig[userID]; ok { + return e, nil + } + return ports.Eligibility{Exists: false}, nil +} + +func (r *userEligibilityRec) setEligibility(userID string, e ports.Eligibility) { + r.mu.Lock() + defer r.mu.Unlock() + if r.elig == nil { + r.elig = make(map[string]ports.Eligibility) + } + r.elig[userID] = e +} + +func (r *userEligibilityRec) setFailure(userID string, err error) { + r.mu.Lock() + defer r.mu.Unlock() + if r.failures == nil { + r.failures = make(map[string]error) + } + r.failures[userID] = err +} + +func newPublishedIntentMock(t *testing.T, rec *publishedIntentRec) *mocks.MockIntentPublisher { + t.Helper() + m := mocks.NewMockIntentPublisher(gomock.NewController(t)) + m.EXPECT().Publish(gomock.Any(), gomock.Any()).DoAndReturn(rec.record).AnyTimes() + return m +} + +func newUserEligibilityMock(t *testing.T, rec *userEligibilityRec) *mocks.MockUserService { + t.Helper() + m := mocks.NewMockUserService(gomock.NewController(t)) + m.EXPECT().GetEligibility(gomock.Any(), gomock.Any()).DoAndReturn(rec.record).AnyTimes() + return m +} + type raceNameFixture struct { now time.Time - directory *racenamestub.Directory - users *userservicestub.Service - intents *intentpubstub.Publisher + directory *racenameinmem.Directory + users *userEligibilityRec + intents *publishedIntentRec handler http.Handler } func newRaceNameFixture(t *testing.T) *raceNameFixture { t.Helper() now := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) - directory, err := racenamestub.NewDirectory(racenamestub.WithClock(func() time.Time { return now })) + directory, err := racenameinmem.NewDirectory(racenameinmem.WithClock(func() time.Time { return now })) require.NoError(t, err) - users := userservicestub.NewService() - intents := intentpubstub.NewPublisher() + usersRec := &userEligibilityRec{} + intentsRec := &publishedIntentRec{} logger := silentLogger() svc, err := registerracename.NewService(registerracename.Dependencies{ Directory: directory, - Users: users, - Intents: intents, + Users: newUserEligibilityMock(t, usersRec), + Intents: newPublishedIntentMock(t, intentsRec), Clock: func() time.Time { return now }, Logger: logger, }) @@ -50,8 +120,8 @@ func newRaceNameFixture(t *testing.T) *raceNameFixture { return &raceNameFixture{ now: now, directory: directory, - users: users, - intents: intents, + users: usersRec, + intents: intentsRec, handler: newHandler(Dependencies{Logger: logger, RegisterRaceName: svc}, logger), } } @@ -66,7 +136,7 @@ func TestHandleRegisterRaceNameHappyPath(t *testing.T) { t.Parallel() f := newRaceNameFixture(t) - f.users.SetEligibility("user-1", ports.Eligibility{Exists: true, MaxRegisteredRaceNames: 2}) + f.users.setEligibility("user-1", ports.Eligibility{Exists: true, MaxRegisteredRaceNames: 2}) f.seedPending(t, "game-1", "user-1", "Stellaris", f.now.Add(7*24*time.Hour)) rec := doRequest(t, f.handler, http.MethodPost, registerRaceNamePath, "user-1", registerRaceNameRequest{ @@ -82,7 +152,7 @@ func 
TestHandleRegisterRaceNameHappyPath(t *testing.T) { assert.Equal(t, f.now.UnixMilli(), resp.RegisteredAtMs) assert.NotEmpty(t, resp.CanonicalKey) - require.Len(t, f.intents.Published(), 1) + require.Len(t, f.intents.snapshot(), 1) } func TestHandleRegisterRaceNameRejectsMissingUserHeader(t *testing.T) { @@ -120,7 +190,7 @@ func TestHandleRegisterRaceNamePendingMissing(t *testing.T) { t.Parallel() f := newRaceNameFixture(t) - f.users.SetEligibility("user-1", ports.Eligibility{Exists: true, MaxRegisteredRaceNames: 2}) + f.users.setEligibility("user-1", ports.Eligibility{Exists: true, MaxRegisteredRaceNames: 2}) rec := doRequest(t, f.handler, http.MethodPost, registerRaceNamePath, "user-1", registerRaceNameRequest{ RaceName: "Stellaris", @@ -137,7 +207,7 @@ func TestHandleRegisterRaceNamePendingExpired(t *testing.T) { t.Parallel() f := newRaceNameFixture(t) - f.users.SetEligibility("user-1", ports.Eligibility{Exists: true, MaxRegisteredRaceNames: 2}) + f.users.setEligibility("user-1", ports.Eligibility{Exists: true, MaxRegisteredRaceNames: 2}) f.seedPending(t, "game-1", "user-1", "Stellaris", f.now.Add(-time.Minute)) rec := doRequest(t, f.handler, http.MethodPost, registerRaceNamePath, "user-1", registerRaceNameRequest{ @@ -155,7 +225,7 @@ func TestHandleRegisterRaceNameQuotaExceeded(t *testing.T) { t.Parallel() f := newRaceNameFixture(t) - f.users.SetEligibility("user-1", ports.Eligibility{Exists: true, MaxRegisteredRaceNames: 1}) + f.users.setEligibility("user-1", ports.Eligibility{Exists: true, MaxRegisteredRaceNames: 1}) // pre-existing registered race name to exhaust quota f.seedPending(t, "game-old", "user-1", "OldName", f.now.Add(24*time.Hour)) require.NoError(t, f.directory.Register(context.Background(), "game-old", "user-1", "OldName")) @@ -177,7 +247,7 @@ func TestHandleRegisterRaceNamePermanentBlock(t *testing.T) { t.Parallel() f := newRaceNameFixture(t) - f.users.SetEligibility("user-1", ports.Eligibility{ + f.users.setEligibility("user-1", ports.Eligibility{ Exists: true, PermanentBlocked: true, MaxRegisteredRaceNames: 2, @@ -199,7 +269,7 @@ func TestHandleRegisterRaceNameUserServiceUnavailable(t *testing.T) { t.Parallel() f := newRaceNameFixture(t) - f.users.SetFailure("user-1", ports.ErrUserServiceUnavailable) + f.users.setFailure("user-1", ports.ErrUserServiceUnavailable) f.seedPending(t, "game-1", "user-1", "Stellaris", f.now.Add(24*time.Hour)) rec := doRequest(t, f.handler, http.MethodPost, registerRaceNamePath, "user-1", registerRaceNameRequest{ @@ -218,17 +288,17 @@ func TestHandleRegisterRaceNameUserServiceUnavailable(t *testing.T) { // silent logger. 
type myRaceNamesFixture struct { now time.Time - directory *racenamestub.Directory - games *gamestub.Store + directory *racenameinmem.Directory + games *gameinmem.Store handler http.Handler } func newMyRaceNamesFixture(t *testing.T) *myRaceNamesFixture { t.Helper() now := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) - directory, err := racenamestub.NewDirectory(racenamestub.WithClock(func() time.Time { return now })) + directory, err := racenameinmem.NewDirectory(racenameinmem.WithClock(func() time.Time { return now })) require.NoError(t, err) - games := gamestub.NewStore() + games := gameinmem.NewStore() logger := silentLogger() svc, err := listmyracenames.NewService(listmyracenames.Dependencies{ diff --git a/lobby/internal/app/wiring.go b/lobby/internal/app/wiring.go index 641f680..8e83802 100644 --- a/lobby/internal/app/wiring.go +++ b/lobby/internal/app/wiring.go @@ -16,13 +16,14 @@ import ( pginvitestore "galaxy/lobby/internal/adapters/postgres/invitestore" pgmembershipstore "galaxy/lobby/internal/adapters/postgres/membershipstore" pgracenamedir "galaxy/lobby/internal/adapters/postgres/racenamedir" + "galaxy/lobby/internal/adapters/racenameinmem" "galaxy/lobby/internal/adapters/racenameintents" - "galaxy/lobby/internal/adapters/racenamestub" "galaxy/lobby/internal/adapters/redisstate" "galaxy/lobby/internal/adapters/runtimemanager" "galaxy/lobby/internal/adapters/userlifecycle" "galaxy/lobby/internal/adapters/userservice" "galaxy/lobby/internal/config" + "galaxy/lobby/internal/domain/engineimage" "galaxy/lobby/internal/domain/racename" "galaxy/lobby/internal/ports" "galaxy/lobby/internal/telemetry" @@ -497,6 +498,11 @@ func newWiring( return nil, fmt.Errorf("new lobby wiring: %w", err) } + engineImageResolver, err := engineimage.NewResolver(cfg.RuntimeManager.EngineImageTemplate) + if err != nil { + return nil, fmt.Errorf("new lobby wiring: %w", err) + } + streamOffsets, err := redisstate.NewStreamOffsetStore(redisClient) if err != nil { return nil, fmt.Errorf("new lobby wiring: %w", err) @@ -505,6 +511,7 @@ func newWiring( startSvc, err := startgame.NewService(startgame.Dependencies{ Games: gameStore, RuntimeManager: runtimePublisher, + ImageResolver: engineImageResolver, Clock: clock, Logger: logger, Telemetry: telemetryRuntime, @@ -804,7 +811,7 @@ func buildRaceNameDirectory( Clock: clock, }) case config.RaceNameDirectoryBackendStub: - return racenamestub.NewDirectory(racenamestub.WithClock(clock)) + return racenameinmem.NewDirectory(racenameinmem.WithClock(clock)) default: return nil, fmt.Errorf("unsupported race name directory backend %q", cfg.RaceNameDirectory.Backend) } diff --git a/lobby/internal/config/config.go b/lobby/internal/config/config.go index ff4be98..59199e2 100644 --- a/lobby/internal/config/config.go +++ b/lobby/internal/config/config.go @@ -7,6 +7,7 @@ import ( "strings" "time" + "galaxy/lobby/internal/domain/engineimage" "galaxy/lobby/internal/telemetry" "galaxy/postgres" "galaxy/redisconn" @@ -49,6 +50,8 @@ const ( raceNameDirectoryBackendEnvVar = "LOBBY_RACE_NAME_DIRECTORY_BACKEND" raceNameExpirationIntervalEnvVar = "LOBBY_RACE_NAME_EXPIRATION_INTERVAL" + engineImageTemplateEnvVar = "LOBBY_ENGINE_IMAGE_TEMPLATE" + otelServiceNameEnvVar = "OTEL_SERVICE_NAME" otelTracesExporterEnvVar = "OTEL_TRACES_EXPORTER" otelMetricsExporterEnvVar = "OTEL_METRICS_EXPORTER" @@ -78,6 +81,7 @@ const ( defaultGMTimeout = 5 * time.Second defaultEnrollmentAutomationInterval = 30 * time.Second defaultRaceNameExpirationInterval = time.Hour + defaultEngineImageTemplate = 
"galaxy/game:" + engineimage.VersionPlaceholder defaultOTelServiceName = "galaxy-lobby" // RaceNameDirectoryBackendPostgres selects the PostgreSQL-backed @@ -134,6 +138,9 @@ type Config struct { // every pending_registration whose eligible_until has passed. PendingRegistration PendingRegistrationConfig + // RuntimeManager configures the Runtime Manager publisher contract. + RuntimeManager RuntimeManagerConfig + // Telemetry configures the process-wide OpenTelemetry runtime. Telemetry TelemetryConfig } @@ -410,6 +417,27 @@ func (cfg PendingRegistrationConfig) Validate() error { return nil } +// RuntimeManagerConfig configures the Lobby-side Runtime Manager +// publisher contract. Lobby resolves the Docker image reference it +// publishes on `runtime:start_jobs` from a per-game +// `target_engine_version` and the configured EngineImageTemplate. +type RuntimeManagerConfig struct { + // EngineImageTemplate stores the Docker reference template applied + // to a game's `target_engine_version`. The string must contain the + // literal placeholder `{engine_version}`; Lobby fails fast at + // startup otherwise. + EngineImageTemplate string +} + +// Validate reports whether cfg stores a usable Runtime Manager +// publisher configuration. +func (cfg RuntimeManagerConfig) Validate() error { + if _, err := engineimage.NewResolver(cfg.EngineImageTemplate); err != nil { + return fmt.Errorf("engine image template: %w", err) + } + return nil +} + // TelemetryConfig configures the Game Lobby Service OpenTelemetry runtime. type TelemetryConfig struct { // ServiceName overrides the default OpenTelemetry service name. @@ -504,6 +532,9 @@ func DefaultConfig() Config { PendingRegistration: PendingRegistrationConfig{ Interval: defaultRaceNameExpirationInterval, }, + RuntimeManager: RuntimeManagerConfig{ + EngineImageTemplate: defaultEngineImageTemplate, + }, Telemetry: TelemetryConfig{ ServiceName: defaultOTelServiceName, TracesExporter: "none", diff --git a/lobby/internal/config/config_test.go b/lobby/internal/config/config_test.go index ddd85c5..885e92c 100644 --- a/lobby/internal/config/config_test.go +++ b/lobby/internal/config/config_test.go @@ -40,6 +40,7 @@ func TestDefaultConfig(t *testing.T) { assert.Equal(t, 5*time.Second, cfg.GM.Timeout) assert.Equal(t, 30*time.Second, cfg.EnrollmentAutomation.Interval) assert.Equal(t, time.Hour, cfg.PendingRegistration.Interval) + assert.Equal(t, "galaxy/game:{engine_version}", cfg.RuntimeManager.EngineImageTemplate) assert.Equal(t, "galaxy-lobby", cfg.Telemetry.ServiceName) assert.Equal(t, "none", cfg.Telemetry.TracesExporter) assert.Equal(t, "none", cfg.Telemetry.MetricsExporter) @@ -114,6 +115,7 @@ func TestLoadFromEnvOverrides(t *testing.T) { t.Setenv("LOBBY_NOTIFICATION_INTENTS_STREAM", "alt:intents") t.Setenv("LOBBY_ENROLLMENT_AUTOMATION_INTERVAL", "45s") t.Setenv("LOBBY_RACE_NAME_EXPIRATION_INTERVAL", "15m") + t.Setenv("LOBBY_ENGINE_IMAGE_TEMPLATE", "registry.example.com/galaxy/game:{engine_version}") t.Setenv("OTEL_SERVICE_NAME", "galaxy-lobby-test") cfg, err := LoadFromEnv() @@ -129,6 +131,7 @@ func TestLoadFromEnvOverrides(t *testing.T) { assert.Equal(t, "alt:intents", cfg.Redis.NotificationIntentsStream) assert.Equal(t, 45*time.Second, cfg.EnrollmentAutomation.Interval) assert.Equal(t, 15*time.Minute, cfg.PendingRegistration.Interval) + assert.Equal(t, "registry.example.com/galaxy/game:{engine_version}", cfg.RuntimeManager.EngineImageTemplate) assert.Equal(t, "galaxy-lobby-test", cfg.Telemetry.ServiceName) } @@ -291,6 +294,34 @@ func 
TestEnrollmentAutomationConfigValidate(t *testing.T) { require.ErrorContains(t, EnrollmentAutomationConfig{}.Validate(), "interval must be positive") } +func TestRuntimeManagerConfigValidate(t *testing.T) { + t.Parallel() + + require.NoError(t, RuntimeManagerConfig{EngineImageTemplate: "galaxy/game:{engine_version}"}.Validate()) + require.ErrorContains(t, + RuntimeManagerConfig{EngineImageTemplate: ""}.Validate(), + "template must not be empty", + ) + require.ErrorContains(t, + RuntimeManagerConfig{EngineImageTemplate: "galaxy/game:1.0.0"}.Validate(), + "placeholder", + ) +} + +func TestLoadFromEnvRejectsInvalidEngineImageTemplate(t *testing.T) { + clearAllEnv(t) + t.Setenv("LOBBY_REDIS_MASTER_ADDR", testRedisAddr) + t.Setenv("LOBBY_REDIS_PASSWORD", testRedisSecret) + t.Setenv("LOBBY_POSTGRES_PRIMARY_DSN", testDSN) + t.Setenv("LOBBY_USER_SERVICE_BASE_URL", testUserBaseURL) + t.Setenv("LOBBY_GM_BASE_URL", testGMBaseURL) + t.Setenv("LOBBY_ENGINE_IMAGE_TEMPLATE", "galaxy/game:no-placeholder") + + _, err := LoadFromEnv() + require.Error(t, err) + require.Contains(t, err.Error(), "LOBBY_ENGINE_IMAGE_TEMPLATE") +} + func TestPendingRegistrationConfigValidate(t *testing.T) { t.Parallel() @@ -367,6 +398,7 @@ func clearAllEnv(t *testing.T) { enrollmentAutomationIntervalEnvVar, raceNameDirectoryBackendEnvVar, raceNameExpirationIntervalEnvVar, + engineImageTemplateEnvVar, otelServiceNameEnvVar, otelTracesExporterEnvVar, otelMetricsExporterEnvVar, diff --git a/lobby/internal/config/env.go b/lobby/internal/config/env.go index 4fa52b4..620c22f 100644 --- a/lobby/internal/config/env.go +++ b/lobby/internal/config/env.go @@ -108,6 +108,8 @@ func LoadFromEnv() (Config, error) { return Config{}, err } + cfg.RuntimeManager.EngineImageTemplate = stringEnv(engineImageTemplateEnvVar, cfg.RuntimeManager.EngineImageTemplate) + cfg.Telemetry.ServiceName = stringEnv(otelServiceNameEnvVar, cfg.Telemetry.ServiceName) cfg.Telemetry.TracesExporter = normalizeExporterValue(stringEnv(otelTracesExporterEnvVar, cfg.Telemetry.TracesExporter)) cfg.Telemetry.MetricsExporter = normalizeExporterValue(stringEnv(otelMetricsExporterEnvVar, cfg.Telemetry.MetricsExporter)) diff --git a/lobby/internal/config/validation.go b/lobby/internal/config/validation.go index 3833760..b194c40 100644 --- a/lobby/internal/config/validation.go +++ b/lobby/internal/config/validation.go @@ -41,6 +41,9 @@ func (cfg Config) Validate() error { if err := cfg.PendingRegistration.Validate(); err != nil { return fmt.Errorf("%s: %w", raceNameExpirationIntervalEnvVar, err) } + if err := cfg.RuntimeManager.Validate(); err != nil { + return fmt.Errorf("%s: %w", engineImageTemplateEnvVar, err) + } if err := cfg.Telemetry.Validate(); err != nil { return err } diff --git a/lobby/internal/domain/engineimage/resolver.go b/lobby/internal/domain/engineimage/resolver.go new file mode 100644 index 0000000..2f42b9d --- /dev/null +++ b/lobby/internal/domain/engineimage/resolver.go @@ -0,0 +1,66 @@ +// Package engineimage resolves the Docker reference Lobby publishes on +// `runtime:start_jobs`. The reference is built from a configurable +// template that must contain the literal `{engine_version}` placeholder +// and a per-game `target_engine_version`. +// +// The resolver intentionally performs only template substitution and a +// non-empty-version guard. 
Semver validation of the engine version +// itself lives in `lobby/internal/domain/game` and runs at game-record +// construction time; by the time `startgame.Service.Handle` reads the +// record the version is already validated. +package engineimage + +import ( + "errors" + "fmt" + "strings" +) + +// VersionPlaceholder is the literal token a template must contain. The +// resolver substitutes it with the per-game engine version verbatim. +const VersionPlaceholder = "{engine_version}" + +// Resolver substitutes a per-game engine version into a pre-validated +// template. The template is validated once at construction so per-game +// `Resolve` calls remain pure string substitution. +type Resolver struct { + template string +} + +// NewResolver returns a Resolver that uses template for every Resolve +// call. It returns an error if template is empty or does not contain +// VersionPlaceholder. +func NewResolver(template string) (*Resolver, error) { + trimmed := strings.TrimSpace(template) + if trimmed == "" { + return nil, errors.New("engine image resolver: template must not be empty") + } + if !strings.Contains(trimmed, VersionPlaceholder) { + return nil, fmt.Errorf( + "engine image resolver: template %q must contain placeholder %q", + template, VersionPlaceholder, + ) + } + return &Resolver{template: trimmed}, nil +} + +// Template returns the validated template string the resolver was +// constructed with. The accessor is intended for diagnostics and tests. +func (resolver *Resolver) Template() string { + if resolver == nil { + return "" + } + return resolver.template +} + +// Resolve substitutes VersionPlaceholder in the validated template with +// version. It returns an error when version is empty or whitespace. +func (resolver *Resolver) Resolve(version string) (string, error) { + if resolver == nil { + return "", errors.New("engine image resolver: nil resolver") + } + if strings.TrimSpace(version) == "" { + return "", errors.New("engine image resolver: engine version must not be empty") + } + return strings.ReplaceAll(resolver.template, VersionPlaceholder, version), nil +} diff --git a/lobby/internal/domain/engineimage/resolver_test.go b/lobby/internal/domain/engineimage/resolver_test.go new file mode 100644 index 0000000..413ba26 --- /dev/null +++ b/lobby/internal/domain/engineimage/resolver_test.go @@ -0,0 +1,96 @@ +package engineimage_test + +import ( + "testing" + + "galaxy/lobby/internal/domain/engineimage" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestNewResolverAcceptsValidTemplate(t *testing.T) { + t.Parallel() + + resolver, err := engineimage.NewResolver("galaxy/game:{engine_version}") + require.NoError(t, err) + require.NotNil(t, resolver) + assert.Equal(t, "galaxy/game:{engine_version}", resolver.Template()) +} + +func TestNewResolverRejectsEmptyTemplate(t *testing.T) { + t.Parallel() + + cases := []string{"", " "} + for _, candidate := range cases { + _, err := engineimage.NewResolver(candidate) + require.Error(t, err) + } +} + +func TestNewResolverRejectsTemplateWithoutPlaceholder(t *testing.T) { + t.Parallel() + + _, err := engineimage.NewResolver("galaxy/game:1.0.0") + require.Error(t, err) +} + +func TestResolveSubstitutesVersion(t *testing.T) { + t.Parallel() + + resolver, err := engineimage.NewResolver("registry.example.com/galaxy/game:{engine_version}") + require.NoError(t, err) + + got, err := resolver.Resolve("v1.4.7") + require.NoError(t, err) + assert.Equal(t, "registry.example.com/galaxy/game:v1.4.7", got) +} + 
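// Illustrative sketch, not part of this change: how the resolver is meant
// to be combined with the runtime publisher once wired through startgame.
// NewResolver, Resolve, PublishStartJob and
// cfg.RuntimeManager.EngineImageTemplate come from this diff; the record
// fields (ID, TargetEngineVersion) and other variable names are
// hypothetical.
//
//	resolver, err := engineimage.NewResolver(cfg.RuntimeManager.EngineImageTemplate)
//	if err != nil {
//		return err
//	}
//	imageRef, err := resolver.Resolve(record.TargetEngineVersion) // e.g. "v1.4.7"
//	if err != nil {
//		return err
//	}
//	// With the default template this yields "galaxy/game:v1.4.7".
//	if err := runtimeManager.PublishStartJob(ctx, string(record.ID), imageRef); err != nil {
//		return err
//	}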
+func TestResolveSubstitutesEveryPlaceholderOccurrence(t *testing.T) { + t.Parallel() + + resolver, err := engineimage.NewResolver( + "registry.example.com/{engine_version}/game:{engine_version}", + ) + require.NoError(t, err) + + got, err := resolver.Resolve("v2.0.1") + require.NoError(t, err) + assert.Equal(t, "registry.example.com/v2.0.1/game:v2.0.1", got) +} + +func TestResolveRejectsEmptyVersion(t *testing.T) { + t.Parallel() + + resolver, err := engineimage.NewResolver("galaxy/game:{engine_version}") + require.NoError(t, err) + + cases := []string{"", " "} + for _, candidate := range cases { + _, err := resolver.Resolve(candidate) + require.Error(t, err) + } +} + +func TestResolveReusesValidatedTemplate(t *testing.T) { + t.Parallel() + + resolver, err := engineimage.NewResolver("galaxy/game:{engine_version}") + require.NoError(t, err) + + first, err := resolver.Resolve("v1.0.0") + require.NoError(t, err) + second, err := resolver.Resolve("v2.0.0") + require.NoError(t, err) + + assert.Equal(t, "galaxy/game:v1.0.0", first) + assert.Equal(t, "galaxy/game:v2.0.0", second) +} + +func TestNilResolverResolveReturnsError(t *testing.T) { + t.Parallel() + + var resolver *engineimage.Resolver + _, err := resolver.Resolve("v1.0.0") + require.Error(t, err) +} diff --git a/lobby/internal/ports/gmclient.go b/lobby/internal/ports/gmclient.go index a93102a..32784ce 100644 --- a/lobby/internal/ports/gmclient.go +++ b/lobby/internal/ports/gmclient.go @@ -16,6 +16,8 @@ import ( // to `paused` and an admin notification is published. var ErrGMUnavailable = errors.New("game master unavailable") +//go:generate go run go.uber.org/mock/mockgen -destination=../adapters/mocks/mock_gmclient.go -package=mocks galaxy/lobby/internal/ports GMClient + // GMClient executes synchronous calls to Game Master. introduced // the registration call; added the liveness probe used by the // voluntary resume flow. diff --git a/lobby/internal/ports/intentpublisher.go b/lobby/internal/ports/intentpublisher.go index 26f76cd..7be2619 100644 --- a/lobby/internal/ports/intentpublisher.go +++ b/lobby/internal/ports/intentpublisher.go @@ -6,10 +6,12 @@ import ( "galaxy/notificationintent" ) +//go:generate go run go.uber.org/mock/mockgen -destination=../adapters/mocks/mock_intentpublisher.go -package=mocks galaxy/lobby/internal/ports IntentPublisher + // IntentPublisher is the lobby-facing producer port for normalized // notification intents. The production adapter is a // *notificationintent.Publisher which already satisfies this interface; -// service tests use an in-process stub that records every Publish call. +// service tests use a generated gomock that records every Publish call. // // A failed Publish call is a notification degradation per // lobby/README.md §Notification Contracts and must not roll back already diff --git a/lobby/internal/ports/runtimemanager.go b/lobby/internal/ports/runtimemanager.go index 6e47a42..c13c969 100644 --- a/lobby/internal/ports/runtimemanager.go +++ b/lobby/internal/ports/runtimemanager.go @@ -1,25 +1,92 @@ package ports -import "context" +import ( + "context" + "fmt" +) + +// StopReason classifies why Lobby asks Runtime Manager to stop a game +// container. The enum is part of the `runtime:stop_jobs` envelope and +// mirrors the AsyncAPI contract frozen in +// `rtmanager/api/runtime-jobs-asyncapi.yaml`. +// +// Lobby v1 produces only StopReasonOrphanCleanup (orphan-container path +// in the runtime-job-result worker) and StopReasonCancelled +// (user-lifecycle cascade). 
The remaining values are reserved in the +// shared contract for future producers (Game Master, Admin Service, +// enrollment automation). +type StopReason string + +// StopReason enum values. The set is fixed by +// `rtmanager/api/runtime-jobs-asyncapi.yaml`; adding a new value is a +// contract bump that must be coordinated across producers and consumers. +const ( + // StopReasonOrphanCleanup releases a container whose post-start + // metadata persistence failed in Lobby. + StopReasonOrphanCleanup StopReason = "orphan_cleanup" + + // StopReasonCancelled covers user-lifecycle cascade and explicit + // cancel paths for in-flight games. + StopReasonCancelled StopReason = "cancelled" + + // StopReasonFinished is reserved for engine-driven game finish + // flows; not produced by Lobby in v1. + StopReasonFinished StopReason = "finished" + + // StopReasonAdminRequest is reserved for future admin-initiated + // stop paths through Lobby; not produced by Lobby in v1. + StopReasonAdminRequest StopReason = "admin_request" + + // StopReasonTimeout is reserved for future enrollment-timeout-driven + // stop paths; not produced by Lobby in v1. + StopReasonTimeout StopReason = "timeout" +) + +// String returns reason as its stored enum value. +func (reason StopReason) String() string { + return string(reason) +} + +// Validate reports whether reason carries one of the five values fixed +// by the AsyncAPI contract. +func (reason StopReason) Validate() error { + switch reason { + case StopReasonOrphanCleanup, + StopReasonCancelled, + StopReasonFinished, + StopReasonAdminRequest, + StopReasonTimeout: + return nil + case "": + return fmt.Errorf("stop reason must not be empty") + default: + return fmt.Errorf("stop reason %q is not a recognised value", string(reason)) + } +} + +//go:generate go run go.uber.org/mock/mockgen -destination=../adapters/mocks/mock_runtimemanager.go -package=mocks galaxy/lobby/internal/ports RuntimeManager // RuntimeManager publishes runtime jobs to Runtime Manager via Redis -// Streams. introduces start and stop jobs; future stages may -// extend the surface. +// Streams. Lobby is the producer for both the start and the stop stream; +// Runtime Manager (Stages 13+) is the eventual consumer. // -// The interface is intentionally narrow: callers pass only the game id. -// Runtime Manager fetches additional context (target engine version, -// turn schedule, etc.) through Lobby's internal HTTP API when it picks -// up the job. +// Image-reference resolution is intentionally a Lobby concern: each +// game's `target_engine_version` is substituted into +// `LOBBY_ENGINE_IMAGE_TEMPLATE` and the resulting `image_ref` is handed +// to Runtime Manager verbatim on the start envelope. Runtime Manager +// never resolves engine versions itself. type RuntimeManager interface { - // PublishStartJob enqueues one start job for gameID. Implementations - // must produce one event in the configured runtime start jobs stream - // per call. A zero-error return means the event is durably accepted - // into the stream (Redis XADD succeeded); it does not imply that the + // PublishStartJob enqueues one start job for gameID with the + // producer-resolved imageRef. Implementations must produce one + // event in the configured runtime start jobs stream per call. A + // zero-error return means the event is durably accepted into the + // stream (Redis XADD succeeded); it does not imply that the // container has started. 
- PublishStartJob(ctx context.Context, gameID string) error + PublishStartJob(ctx context.Context, gameID, imageRef string) error - // PublishStopJob enqueues one stop job for gameID. Implementations - // must produce one event in the configured runtime stop jobs stream - // per call. The same durability semantics as PublishStartJob apply. - PublishStopJob(ctx context.Context, gameID string) error + // PublishStopJob enqueues one stop job for gameID with the + // classifying reason. Implementations must produce one event in the + // configured runtime stop jobs stream per call. The same durability + // semantics as PublishStartJob apply. + PublishStopJob(ctx context.Context, gameID string, reason StopReason) error } diff --git a/lobby/internal/ports/userservice.go b/lobby/internal/ports/userservice.go index 25d0a7f..e294b52 100644 --- a/lobby/internal/ports/userservice.go +++ b/lobby/internal/ports/userservice.go @@ -58,6 +58,8 @@ type Eligibility struct { MaxRegisteredRaceNames int } +//go:generate go run go.uber.org/mock/mockgen -destination=../adapters/mocks/mock_userservice.go -package=mocks galaxy/lobby/internal/ports UserService + // UserService is the synchronous lobby-facing User Service eligibility // reader. The application flow consumes it via a single // GetEligibility call before accepting an applicant. diff --git a/lobby/internal/service/approveapplication/service_test.go b/lobby/internal/service/approveapplication/service_test.go index 6938872..e20142d 100644 --- a/lobby/internal/service/approveapplication/service_test.go +++ b/lobby/internal/service/approveapplication/service_test.go @@ -5,15 +5,16 @@ import ( "errors" "io" "log/slog" + "sync" "testing" "time" - "galaxy/lobby/internal/adapters/applicationstub" - "galaxy/lobby/internal/adapters/gamestub" - "galaxy/lobby/internal/adapters/gapactivationstub" - "galaxy/lobby/internal/adapters/intentpubstub" - "galaxy/lobby/internal/adapters/membershipstub" - "galaxy/lobby/internal/adapters/racenamestub" + "galaxy/lobby/internal/adapters/applicationinmem" + "galaxy/lobby/internal/adapters/gameinmem" + "galaxy/lobby/internal/adapters/gapactivationinmem" + "galaxy/lobby/internal/adapters/membershipinmem" + "galaxy/lobby/internal/adapters/mocks" + "galaxy/lobby/internal/adapters/racenameinmem" "galaxy/lobby/internal/domain/application" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" @@ -25,8 +26,44 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" ) +type intentRec struct { + mu sync.Mutex + published []notificationintent.Intent + err error +} + +func (r *intentRec) record(_ context.Context, intent notificationintent.Intent) (string, error) { + r.mu.Lock() + defer r.mu.Unlock() + if r.err != nil { + return "", r.err + } + r.published = append(r.published, intent) + return "1", nil +} + +func (r *intentRec) snapshot() []notificationintent.Intent { + r.mu.Lock() + defer r.mu.Unlock() + return append([]notificationintent.Intent(nil), r.published...) 
+} + +func (r *intentRec) setErr(err error) { + r.mu.Lock() + defer r.mu.Unlock() + r.err = err +} + +func newIntentMock(t *testing.T, rec *intentRec) *mocks.MockIntentPublisher { + t.Helper() + m := mocks.NewMockIntentPublisher(gomock.NewController(t)) + m.EXPECT().Publish(gomock.Any(), gomock.Any()).DoAndReturn(rec.record).AnyTimes() + return m +} + func silentLogger() *slog.Logger { return slog.New(slog.NewTextHandler(io.Discard, nil)) } func fixedClock(at time.Time) func() time.Time { return func() time.Time { return at } } @@ -44,12 +81,13 @@ func (f fixedIDs) NewMembershipID() (common.MembershipID, error) { return f.me type fixture struct { now time.Time - games *gamestub.Store - memberships *membershipstub.Store - applications *applicationstub.Store - directory *racenamestub.Directory - gapStore *gapactivationstub.Store - intents *intentpubstub.Publisher + games *gameinmem.Store + memberships *membershipinmem.Store + applications *applicationinmem.Store + directory *racenameinmem.Directory + gapStore *gapactivationinmem.Store + intentRec *intentRec + intents *mocks.MockIntentPublisher ids fixedIDs openPublicGameID common.GameID } @@ -57,11 +95,11 @@ type fixture struct { func newFixture(t *testing.T, maxPlayers, gapPlayers int) *fixture { t.Helper() now := time.Date(2026, 4, 25, 10, 0, 0, 0, time.UTC) - dir, err := racenamestub.NewDirectory(racenamestub.WithClock(fixedClock(now))) + dir, err := racenameinmem.NewDirectory(racenameinmem.WithClock(fixedClock(now))) require.NoError(t, err) - games := gamestub.NewStore() - memberships := membershipstub.NewStore() - applications := applicationstub.NewStore() + games := gameinmem.NewStore() + memberships := membershipinmem.NewStore() + applications := applicationinmem.NewStore() gameRecord, err := game.New(game.NewGameInput{ GameID: "game-public", @@ -80,14 +118,16 @@ func newFixture(t *testing.T, maxPlayers, gapPlayers int) *fixture { gameRecord.Status = game.StatusEnrollmentOpen require.NoError(t, games.Save(context.Background(), gameRecord)) + rec := &intentRec{} return &fixture{ now: now, games: games, memberships: memberships, applications: applications, directory: dir, - gapStore: gapactivationstub.NewStore(), - intents: intentpubstub.NewPublisher(), + gapStore: gapactivationinmem.NewStore(), + intentRec: rec, + intents: newIntentMock(t, rec), ids: fixedIDs{membershipID: "membership-fixed"}, openPublicGameID: gameRecord.GameID, } @@ -151,7 +191,7 @@ func TestApproveHappyPath(t *testing.T) { assert.True(t, availability.Taken) assert.Equal(t, "user-1", availability.HolderUserID) - intents := f.intents.Published() + intents := f.intentRec.snapshot() require.Len(t, intents, 1) assert.Equal(t, notificationintent.NotificationTypeLobbyMembershipApproved, intents[0].NotificationType) assert.Equal(t, []string{"user-1"}, intents[0].RecipientUserIDs) @@ -328,10 +368,10 @@ func TestApproveNameTakenByAnotherUser(t *testing.T) { assert.Equal(t, "user-other", availability.HolderUserID) } -// approveCASStub wraps applicationstub.Store but injects ErrConflict on +// approveCASStub wraps applicationinmem.Store but injects ErrConflict on // the next UpdateStatus call so we can observe the rollback path. 
type approveCASStub struct { - *applicationstub.Store + *applicationinmem.Store failNext bool } @@ -379,7 +419,7 @@ func TestApprovePublishFailureDoesNotRollback(t *testing.T) { t.Parallel() f := newFixture(t, 4, 1) app := seedSubmittedApplication(t, f, "application-1", "user-1", "SolarPilot") - f.intents.SetError(errors.New("publish failed")) + f.intentRec.setErr(errors.New("publish failed")) svc := newService(t, f) got, err := svc.Handle(context.Background(), approveapplication.Input{ diff --git a/lobby/internal/service/blockmember/service_test.go b/lobby/internal/service/blockmember/service_test.go index 447237b..1515190 100644 --- a/lobby/internal/service/blockmember/service_test.go +++ b/lobby/internal/service/blockmember/service_test.go @@ -8,9 +8,9 @@ import ( "testing" "time" - "galaxy/lobby/internal/adapters/gamestub" - "galaxy/lobby/internal/adapters/membershipstub" - "galaxy/lobby/internal/adapters/racenamestub" + "galaxy/lobby/internal/adapters/gameinmem" + "galaxy/lobby/internal/adapters/membershipinmem" + "galaxy/lobby/internal/adapters/racenameinmem" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" "galaxy/lobby/internal/domain/membership" @@ -31,20 +31,20 @@ func fixedClock(at time.Time) func() time.Time { } type fixtures struct { - games *gamestub.Store - memberships *membershipstub.Store - directory *racenamestub.Directory + games *gameinmem.Store + memberships *membershipinmem.Store + directory *racenameinmem.Directory } func newFixtures(t *testing.T) *fixtures { t.Helper() - directory, err := racenamestub.NewDirectory() + directory, err := racenameinmem.NewDirectory() require.NoError(t, err) return &fixtures{ - games: gamestub.NewStore(), - memberships: membershipstub.NewStore(), + games: gameinmem.NewStore(), + memberships: membershipinmem.NewStore(), directory: directory, } } diff --git a/lobby/internal/service/cancelgame/service_test.go b/lobby/internal/service/cancelgame/service_test.go index 9e7f263..05d85b7 100644 --- a/lobby/internal/service/cancelgame/service_test.go +++ b/lobby/internal/service/cancelgame/service_test.go @@ -7,7 +7,7 @@ import ( "testing" "time" - "galaxy/lobby/internal/adapters/gamestub" + "galaxy/lobby/internal/adapters/gameinmem" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" "galaxy/lobby/internal/ports" @@ -31,7 +31,7 @@ func fixedClock(at time.Time) func() time.Time { // status the surface must reject or accept. 
func seedGameWithStatus( t *testing.T, - store *gamestub.Store, + store *gameinmem.Store, id common.GameID, gameType game.GameType, ownerUserID string, @@ -101,7 +101,7 @@ func TestHandleFromCancellableStatuses(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() record := seedGameWithStatus(t, store, "game-a", game.GameTypePublic, "", status, now) service := newService(t, store, fixedClock(now.Add(time.Hour))) @@ -131,7 +131,7 @@ func TestHandleFromRejectedStatuses(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() record := seedGameWithStatus(t, store, "game-b", game.GameTypePublic, "", status, now) service := newService(t, store, fixedClock(now.Add(time.Hour))) @@ -149,7 +149,7 @@ func TestHandleAlreadyCancelledIsConflict(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() record := seedGameWithStatus(t, store, "game-c", game.GameTypePublic, "", game.StatusCancelled, now) service := newService(t, store, fixedClock(now.Add(time.Hour))) @@ -165,7 +165,7 @@ func TestHandleFinishedIsConflict(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() record := seedGameWithStatus(t, store, "game-f", game.GameTypePublic, "", game.StatusFinished, now) service := newService(t, store, fixedClock(now.Add(time.Hour))) @@ -181,7 +181,7 @@ func TestHandleOwnerCancelsPrivate(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() record := seedGameWithStatus(t, store, "game-priv", game.GameTypePrivate, "user-1", game.StatusEnrollmentOpen, now) service := newService(t, store, fixedClock(now.Add(time.Hour))) @@ -198,7 +198,7 @@ func TestHandleNonOwnerForbidden(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() record := seedGameWithStatus(t, store, "game-priv", game.GameTypePrivate, "user-1", game.StatusEnrollmentOpen, now) service := newService(t, store, fixedClock(now.Add(time.Hour))) @@ -214,7 +214,7 @@ func TestHandleUserCannotCancelPublic(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() record := seedGameWithStatus(t, store, "game-pub", game.GameTypePublic, "", game.StatusEnrollmentOpen, now) service := newService(t, store, fixedClock(now.Add(time.Hour))) @@ -229,7 +229,7 @@ func TestHandleUserCannotCancelPublic(t *testing.T) { func TestHandleNotFound(t *testing.T) { t.Parallel() - store := gamestub.NewStore() + store := gameinmem.NewStore() service := newService(t, store, fixedClock(time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC))) _, err := service.Handle(context.Background(), cancelgame.Input{ @@ -242,7 +242,7 @@ func TestHandleNotFound(t *testing.T) { func TestHandleInvalidActor(t *testing.T) { t.Parallel() - store := gamestub.NewStore() + store := gameinmem.NewStore() service := newService(t, store, fixedClock(time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC))) _, err := service.Handle(context.Background(), cancelgame.Input{ @@ -256,7 +256,7 @@ func TestHandleInvalidActor(t *testing.T) { func TestHandleInvalidGameID(t *testing.T) { t.Parallel() - store := 
gamestub.NewStore() + store := gameinmem.NewStore() service := newService(t, store, fixedClock(time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC))) _, err := service.Handle(context.Background(), cancelgame.Input{ diff --git a/lobby/internal/service/capabilityevaluation/service_test.go b/lobby/internal/service/capabilityevaluation/service_test.go index 3b365cb..ae10b60 100644 --- a/lobby/internal/service/capabilityevaluation/service_test.go +++ b/lobby/internal/service/capabilityevaluation/service_test.go @@ -8,11 +8,11 @@ import ( "testing" "time" - "galaxy/lobby/internal/adapters/evaluationguardstub" - "galaxy/lobby/internal/adapters/gamestub" - "galaxy/lobby/internal/adapters/gameturnstatsstub" - "galaxy/lobby/internal/adapters/membershipstub" - "galaxy/lobby/internal/adapters/racenamestub" + "galaxy/lobby/internal/adapters/evaluationguardinmem" + "galaxy/lobby/internal/adapters/gameinmem" + "galaxy/lobby/internal/adapters/gameturnstatsinmem" + "galaxy/lobby/internal/adapters/membershipinmem" + "galaxy/lobby/internal/adapters/racenameinmem" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" "galaxy/lobby/internal/domain/membership" @@ -51,12 +51,12 @@ type fixture struct { finishedAt time.Time gameID common.GameID gameName string - games *gamestub.Store - memberships *membershipstub.Store - stats *gameturnstatsstub.Store - directory *racenamestub.Directory + games *gameinmem.Store + memberships *membershipinmem.Store + stats *gameturnstatsinmem.Store + directory *racenameinmem.Directory intents *spyIntents - guard *evaluationguardstub.Store + guard *evaluationguardinmem.Store service *capabilityevaluation.Service } @@ -65,13 +65,13 @@ func newFixture(t *testing.T) *fixture { now := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) finishedAt := now - games := gamestub.NewStore() - memberships := membershipstub.NewStore() - stats := gameturnstatsstub.NewStore() - directory, err := racenamestub.NewDirectory(racenamestub.WithClock(fixedClock(now.Add(-time.Hour)))) + games := gameinmem.NewStore() + memberships := membershipinmem.NewStore() + stats := gameturnstatsinmem.NewStore() + directory, err := racenameinmem.NewDirectory(racenameinmem.WithClock(fixedClock(now.Add(-time.Hour)))) require.NoError(t, err) intents := &spyIntents{} - guard := evaluationguardstub.NewStore() + guard := evaluationguardinmem.NewStore() gameID := common.GameID("game-finished") gameName := "Final Showdown" diff --git a/lobby/internal/service/creategame/service_test.go b/lobby/internal/service/creategame/service_test.go index aa7db4f..4d18fc8 100644 --- a/lobby/internal/service/creategame/service_test.go +++ b/lobby/internal/service/creategame/service_test.go @@ -8,7 +8,7 @@ import ( "testing" "time" - "galaxy/lobby/internal/adapters/gamestub" + "galaxy/lobby/internal/adapters/gameinmem" "galaxy/lobby/internal/adapters/idgen" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" @@ -88,11 +88,11 @@ func TestNewServiceRequiresStoreAndIDs(t *testing.T) { _, err := creategame.NewService(creategame.Dependencies{}) require.Error(t, err) - _, err = creategame.NewService(creategame.Dependencies{Games: gamestub.NewStore()}) + _, err = creategame.NewService(creategame.Dependencies{Games: gameinmem.NewStore()}) require.Error(t, err) _, err = creategame.NewService(creategame.Dependencies{ - Games: gamestub.NewStore(), + Games: gameinmem.NewStore(), IDs: &stubIDGenerator{next: "game-ok"}, }) require.NoError(t, err) @@ -102,7 +102,7 @@ func TestHandleAdminCreatesPublicGame(t *testing.T) { 
t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() service, err := creategame.NewService(creategame.Dependencies{ Games: store, IDs: &stubIDGenerator{next: "game-alpha"}, @@ -129,7 +129,7 @@ func TestHandleUserCreatesPrivateGame(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 11, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() service, err := creategame.NewService(creategame.Dependencies{ Games: store, IDs: &stubIDGenerator{next: "game-beta"}, @@ -150,7 +150,7 @@ func TestHandleAdminForbiddenForPrivateGame(t *testing.T) { now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) service, err := creategame.NewService(creategame.Dependencies{ - Games: gamestub.NewStore(), + Games: gameinmem.NewStore(), IDs: &stubIDGenerator{next: "game-x"}, Clock: newFixedClock(now), Logger: silentLogger(), @@ -169,7 +169,7 @@ func TestHandleUserForbiddenForPublicGame(t *testing.T) { now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) service, err := creategame.NewService(creategame.Dependencies{ - Games: gamestub.NewStore(), + Games: gameinmem.NewStore(), IDs: &stubIDGenerator{next: "game-x"}, Clock: newFixedClock(now), Logger: silentLogger(), @@ -188,7 +188,7 @@ func TestHandleInvalidActorReturnsError(t *testing.T) { now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) service, err := creategame.NewService(creategame.Dependencies{ - Games: gamestub.NewStore(), + Games: gameinmem.NewStore(), IDs: &stubIDGenerator{next: "game-x"}, Clock: newFixedClock(now), Logger: silentLogger(), @@ -208,7 +208,7 @@ func TestHandleDomainValidationFailurePropagates(t *testing.T) { now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) service, err := creategame.NewService(creategame.Dependencies{ - Games: gamestub.NewStore(), + Games: gameinmem.NewStore(), IDs: &stubIDGenerator{next: "game-bad-cron"}, Clock: newFixedClock(now), Logger: silentLogger(), @@ -228,7 +228,7 @@ func TestHandleEnrollmentDeadlineInPastFails(t *testing.T) { now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) service, err := creategame.NewService(creategame.Dependencies{ - Games: gamestub.NewStore(), + Games: gameinmem.NewStore(), IDs: &stubIDGenerator{next: "game-past"}, Clock: newFixedClock(now), Logger: silentLogger(), @@ -249,7 +249,7 @@ func TestHandleIDGeneratorErrorPropagates(t *testing.T) { now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) boom := errors.New("entropy exhausted") service, err := creategame.NewService(creategame.Dependencies{ - Games: gamestub.NewStore(), + Games: gameinmem.NewStore(), IDs: &stubIDGenerator{err: boom}, Clock: newFixedClock(now), Logger: silentLogger(), @@ -309,7 +309,7 @@ func TestHandleUsesRealIDGeneratorShape(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 12, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() service, err := creategame.NewService(creategame.Dependencies{ Games: store, IDs: idgen.NewGenerator(), diff --git a/lobby/internal/service/createinvite/service_test.go b/lobby/internal/service/createinvite/service_test.go index 7c99495..d5bc236 100644 --- a/lobby/internal/service/createinvite/service_test.go +++ b/lobby/internal/service/createinvite/service_test.go @@ -5,13 +5,14 @@ import ( "errors" "io" "log/slog" + "sync" "testing" "time" - "galaxy/lobby/internal/adapters/gamestub" - "galaxy/lobby/internal/adapters/intentpubstub" - "galaxy/lobby/internal/adapters/invitestub" - "galaxy/lobby/internal/adapters/membershipstub" + 
"galaxy/lobby/internal/adapters/gameinmem" + "galaxy/lobby/internal/adapters/inviteinmem" + "galaxy/lobby/internal/adapters/membershipinmem" + "galaxy/lobby/internal/adapters/mocks" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" "galaxy/lobby/internal/domain/invite" @@ -23,8 +24,46 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" ) +// intentRec captures every Publish call so tests can assert on the +// resulting intent. Per-test error injection sets err. +type intentRec struct { + mu sync.Mutex + published []notificationintent.Intent + err error +} + +func (r *intentRec) record(_ context.Context, intent notificationintent.Intent) (string, error) { + r.mu.Lock() + defer r.mu.Unlock() + if r.err != nil { + return "", r.err + } + r.published = append(r.published, intent) + return "1", nil +} + +func (r *intentRec) snapshot() []notificationintent.Intent { + r.mu.Lock() + defer r.mu.Unlock() + return append([]notificationintent.Intent(nil), r.published...) +} + +func (r *intentRec) setErr(err error) { + r.mu.Lock() + defer r.mu.Unlock() + r.err = err +} + +func newIntentMock(t *testing.T, rec *intentRec) *mocks.MockIntentPublisher { + t.Helper() + m := mocks.NewMockIntentPublisher(gomock.NewController(t)) + m.EXPECT().Publish(gomock.Any(), gomock.Any()).DoAndReturn(rec.record).AnyTimes() + return m +} + const ( ownerUserID = "user-owner" inviteeUserID = "user-invitee" @@ -45,10 +84,11 @@ func (f fixedIDs) NewMembershipID() (common.MembershipID, error) { return "", type fixture struct { now time.Time - games *gamestub.Store - invites *invitestub.Store - memberships *membershipstub.Store - intents *intentpubstub.Publisher + games *gameinmem.Store + invites *inviteinmem.Store + memberships *membershipinmem.Store + intentRec *intentRec + intents *mocks.MockIntentPublisher ids fixedIDs game game.Game } @@ -56,9 +96,9 @@ type fixture struct { func newFixture(t *testing.T, maxPlayers, gapPlayers int) *fixture { t.Helper() now := time.Date(2026, 4, 25, 10, 0, 0, 0, time.UTC) - games := gamestub.NewStore() - invites := invitestub.NewStore() - memberships := membershipstub.NewStore() + games := gameinmem.NewStore() + invites := inviteinmem.NewStore() + memberships := membershipinmem.NewStore() gameRecord, err := game.New(game.NewGameInput{ GameID: "game-private", @@ -78,12 +118,13 @@ func newFixture(t *testing.T, maxPlayers, gapPlayers int) *fixture { gameRecord.Status = game.StatusEnrollmentOpen require.NoError(t, games.Save(context.Background(), gameRecord)) + rec := &intentRec{} return &fixture{ now: now, games: games, invites: invites, memberships: memberships, - intents: intentpubstub.NewPublisher(), + intentRec: rec, ids: fixedIDs{inviteID: "invite-fixed"}, game: gameRecord, } @@ -91,6 +132,9 @@ func newFixture(t *testing.T, maxPlayers, gapPlayers int) *fixture { func newService(t *testing.T, f *fixture) *createinvite.Service { t.Helper() + if f.intents == nil { + f.intents = newIntentMock(t, f.intentRec) + } svc, err := createinvite.NewService(createinvite.Dependencies{ Games: f.games, Invites: f.invites, @@ -127,7 +171,7 @@ func TestHandleHappyPath(t *testing.T) { assert.Equal(t, f.game.EnrollmentEndsAt, got.ExpiresAt) assert.Empty(t, got.RaceName) - intents := f.intents.Published() + intents := f.intentRec.snapshot() require.Len(t, intents, 1) assert.Equal(t, notificationintent.NotificationTypeLobbyInviteCreated, intents[0].NotificationType) assert.Equal(t, []string{inviteeUserID}, 
intents[0].RecipientUserIDs) @@ -316,7 +360,7 @@ func TestHandleInviterNameUsesActiveMembershipRaceName(t *testing.T) { _, err = svc.Handle(context.Background(), defaultInput(f)) require.NoError(t, err) - intents := f.intents.Published() + intents := f.intentRec.snapshot() require.Len(t, intents, 1) assert.Contains(t, intents[0].PayloadJSON, `"inviter_name":"OwnerRace"`) } @@ -329,7 +373,7 @@ func TestHandleInviterNameFallsBackToUserID(t *testing.T) { _, err := svc.Handle(context.Background(), defaultInput(f)) require.NoError(t, err) - intents := f.intents.Published() + intents := f.intentRec.snapshot() require.Len(t, intents, 1) assert.Contains(t, intents[0].PayloadJSON, `"inviter_name":"`+ownerUserID+`"`) } @@ -337,7 +381,7 @@ func TestHandleInviterNameFallsBackToUserID(t *testing.T) { func TestHandlePublishFailureDoesNotRollback(t *testing.T) { t.Parallel() f := newFixture(t, 4, 1) - f.intents.SetError(errors.New("publish failed")) + f.intentRec.setErr(errors.New("publish failed")) svc := newService(t, f) got, err := svc.Handle(context.Background(), defaultInput(f)) diff --git a/lobby/internal/service/declineinvite/service_test.go b/lobby/internal/service/declineinvite/service_test.go index 8fa2408..0689b65 100644 --- a/lobby/internal/service/declineinvite/service_test.go +++ b/lobby/internal/service/declineinvite/service_test.go @@ -7,7 +7,7 @@ import ( "testing" "time" - "galaxy/lobby/internal/adapters/invitestub" + "galaxy/lobby/internal/adapters/inviteinmem" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/invite" "galaxy/lobby/internal/ports" @@ -30,14 +30,14 @@ func fixedClock(at time.Time) func() time.Time { return func() time.Time { retur type fixture struct { now time.Time - invites *invitestub.Store + invites *inviteinmem.Store } func newFixture(t *testing.T) *fixture { t.Helper() return &fixture{ now: time.Date(2026, 4, 25, 10, 0, 0, 0, time.UTC), - invites: invitestub.NewStore(), + invites: inviteinmem.NewStore(), } } diff --git a/lobby/internal/service/getgame/service_test.go b/lobby/internal/service/getgame/service_test.go index f559493..d2156dd 100644 --- a/lobby/internal/service/getgame/service_test.go +++ b/lobby/internal/service/getgame/service_test.go @@ -8,9 +8,9 @@ import ( "testing" "time" - "galaxy/lobby/internal/adapters/gamestub" - "galaxy/lobby/internal/adapters/invitestub" - "galaxy/lobby/internal/adapters/membershipstub" + "galaxy/lobby/internal/adapters/gameinmem" + "galaxy/lobby/internal/adapters/inviteinmem" + "galaxy/lobby/internal/adapters/membershipinmem" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" "galaxy/lobby/internal/domain/invite" @@ -27,17 +27,17 @@ func silentLogger() *slog.Logger { } type fixture struct { - games *gamestub.Store - memberships *membershipstub.Store - invites *invitestub.Store + games *gameinmem.Store + memberships *membershipinmem.Store + invites *inviteinmem.Store svc *getgame.Service } func newFixture(t *testing.T) *fixture { t.Helper() - games := gamestub.NewStore() - memberships := membershipstub.NewStore() - invites := invitestub.NewStore() + games := gameinmem.NewStore() + memberships := membershipinmem.NewStore() + invites := inviteinmem.NewStore() svc, err := getgame.NewService(getgame.Dependencies{ Games: games, Memberships: memberships, @@ -55,7 +55,7 @@ func newFixture(t *testing.T) *fixture { func seedGame( t *testing.T, - store *gamestub.Store, + store *gameinmem.Store, id common.GameID, gameType game.GameType, ownerUserID string, @@ -88,7 +88,7 @@ func seedGame( func 
seedMembership( t *testing.T, - store *membershipstub.Store, + store *membershipinmem.Store, gameID common.GameID, userID string, status membership.Status, @@ -121,7 +121,7 @@ func seedMembership( func seedInvite( t *testing.T, - store *invitestub.Store, + store *inviteinmem.Store, gameID common.GameID, inviterID, inviteeID string, status invite.Status, @@ -364,9 +364,9 @@ func TestNewServiceRejectsMissingDeps(t *testing.T) { name string deps getgame.Dependencies }{ - {"nil games", getgame.Dependencies{Memberships: membershipstub.NewStore(), Invites: invitestub.NewStore()}}, - {"nil memberships", getgame.Dependencies{Games: gamestub.NewStore(), Invites: invitestub.NewStore()}}, - {"nil invites", getgame.Dependencies{Games: gamestub.NewStore(), Memberships: membershipstub.NewStore()}}, + {"nil games", getgame.Dependencies{Memberships: membershipinmem.NewStore(), Invites: inviteinmem.NewStore()}}, + {"nil memberships", getgame.Dependencies{Games: gameinmem.NewStore(), Invites: inviteinmem.NewStore()}}, + {"nil invites", getgame.Dependencies{Games: gameinmem.NewStore(), Memberships: membershipinmem.NewStore()}}, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { @@ -380,12 +380,12 @@ func TestNewServiceRejectsMissingDeps(t *testing.T) { func TestHandleSurfacesStoreError(t *testing.T) { // Sanity check that errors from the membership store bubble up wrapped. t.Parallel() - games := gamestub.NewStore() + games := gameinmem.NewStore() memberships := &erroringMemberships{err: errors.New("stub failure")} svc, err := getgame.NewService(getgame.Dependencies{ Games: games, Memberships: memberships, - Invites: invitestub.NewStore(), + Invites: inviteinmem.NewStore(), Logger: silentLogger(), }) require.NoError(t, err) @@ -401,7 +401,7 @@ func TestHandleSurfacesStoreError(t *testing.T) { } type erroringMemberships struct { - membershipstub.Store + membershipinmem.Store err error } diff --git a/lobby/internal/service/listgames/service_test.go b/lobby/internal/service/listgames/service_test.go index af133d8..71f98db 100644 --- a/lobby/internal/service/listgames/service_test.go +++ b/lobby/internal/service/listgames/service_test.go @@ -7,8 +7,8 @@ import ( "testing" "time" - "galaxy/lobby/internal/adapters/gamestub" - "galaxy/lobby/internal/adapters/membershipstub" + "galaxy/lobby/internal/adapters/gameinmem" + "galaxy/lobby/internal/adapters/membershipinmem" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" "galaxy/lobby/internal/domain/membership" @@ -23,15 +23,15 @@ func silentLogger() *slog.Logger { } type fixture struct { - games *gamestub.Store - memberships *membershipstub.Store + games *gameinmem.Store + memberships *membershipinmem.Store svc *listgames.Service } func newFixture(t *testing.T) *fixture { t.Helper() - games := gamestub.NewStore() - memberships := membershipstub.NewStore() + games := gameinmem.NewStore() + memberships := membershipinmem.NewStore() svc, err := listgames.NewService(listgames.Dependencies{ Games: games, Memberships: memberships, @@ -43,7 +43,7 @@ func newFixture(t *testing.T) *fixture { func seedGameAt( t *testing.T, - store *gamestub.Store, + store *gameinmem.Store, id common.GameID, gameType game.GameType, ownerUserID string, @@ -76,7 +76,7 @@ func seedGameAt( func seedActiveMembership( t *testing.T, - store *membershipstub.Store, + store *membershipinmem.Store, gameID common.GameID, userID string, now time.Time, @@ -289,8 +289,8 @@ func TestNewServiceRejectsMissingDeps(t *testing.T) { name string deps listgames.Dependencies }{ - 
{"nil games", listgames.Dependencies{Memberships: membershipstub.NewStore()}}, - {"nil memberships", listgames.Dependencies{Games: gamestub.NewStore()}}, + {"nil games", listgames.Dependencies{Memberships: membershipinmem.NewStore()}}, + {"nil memberships", listgames.Dependencies{Games: gameinmem.NewStore()}}, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { diff --git a/lobby/internal/service/listmemberships/service_test.go b/lobby/internal/service/listmemberships/service_test.go index 3f1de7f..08862c6 100644 --- a/lobby/internal/service/listmemberships/service_test.go +++ b/lobby/internal/service/listmemberships/service_test.go @@ -7,8 +7,8 @@ import ( "testing" "time" - "galaxy/lobby/internal/adapters/gamestub" - "galaxy/lobby/internal/adapters/membershipstub" + "galaxy/lobby/internal/adapters/gameinmem" + "galaxy/lobby/internal/adapters/membershipinmem" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" "galaxy/lobby/internal/domain/membership" @@ -24,15 +24,15 @@ func silentLogger() *slog.Logger { } type fixture struct { - games *gamestub.Store - memberships *membershipstub.Store + games *gameinmem.Store + memberships *membershipinmem.Store svc *listmemberships.Service } func newFixture(t *testing.T) *fixture { t.Helper() - games := gamestub.NewStore() - memberships := membershipstub.NewStore() + games := gameinmem.NewStore() + memberships := membershipinmem.NewStore() svc, err := listmemberships.NewService(listmemberships.Dependencies{ Games: games, Memberships: memberships, @@ -44,7 +44,7 @@ func newFixture(t *testing.T) *fixture { func seedGame( t *testing.T, - store *gamestub.Store, + store *gameinmem.Store, id common.GameID, gameType game.GameType, ownerUserID string, @@ -71,7 +71,7 @@ func seedGame( func seedMembership( t *testing.T, - store *membershipstub.Store, + store *membershipinmem.Store, gameID common.GameID, userID string, status membership.Status, @@ -230,8 +230,8 @@ func TestNewServiceRejectsMissingDeps(t *testing.T) { name string deps listmemberships.Dependencies }{ - {"nil games", listmemberships.Dependencies{Memberships: membershipstub.NewStore()}}, - {"nil memberships", listmemberships.Dependencies{Games: gamestub.NewStore()}}, + {"nil games", listmemberships.Dependencies{Memberships: membershipinmem.NewStore()}}, + {"nil memberships", listmemberships.Dependencies{Games: gameinmem.NewStore()}}, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { diff --git a/lobby/internal/service/listmyapplications/service_test.go b/lobby/internal/service/listmyapplications/service_test.go index 6fcc5f8..22902ca 100644 --- a/lobby/internal/service/listmyapplications/service_test.go +++ b/lobby/internal/service/listmyapplications/service_test.go @@ -7,8 +7,8 @@ import ( "testing" "time" - "galaxy/lobby/internal/adapters/applicationstub" - "galaxy/lobby/internal/adapters/gamestub" + "galaxy/lobby/internal/adapters/applicationinmem" + "galaxy/lobby/internal/adapters/gameinmem" "galaxy/lobby/internal/domain/application" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" @@ -24,15 +24,15 @@ func silentLogger() *slog.Logger { } type fixture struct { - games *gamestub.Store - applications *applicationstub.Store + games *gameinmem.Store + applications *applicationinmem.Store svc *listmyapplications.Service } func newFixture(t *testing.T) *fixture { t.Helper() - games := gamestub.NewStore() - apps := applicationstub.NewStore() + games := gameinmem.NewStore() + apps := applicationinmem.NewStore() svc, err := 
listmyapplications.NewService(listmyapplications.Dependencies{ Games: games, Applications: apps, @@ -44,7 +44,7 @@ func newFixture(t *testing.T) *fixture { func seedGame( t *testing.T, - store *gamestub.Store, + store *gameinmem.Store, id common.GameID, gameType game.GameType, name string, @@ -75,7 +75,7 @@ func seedGame( func seedApplication( t *testing.T, - store *applicationstub.Store, + store *applicationinmem.Store, id common.ApplicationID, gameID common.GameID, userID string, @@ -180,8 +180,8 @@ func TestNewServiceRejectsMissingDeps(t *testing.T) { name string deps listmyapplications.Dependencies }{ - {"nil games", listmyapplications.Dependencies{Applications: applicationstub.NewStore()}}, - {"nil applications", listmyapplications.Dependencies{Games: gamestub.NewStore()}}, + {"nil games", listmyapplications.Dependencies{Applications: applicationinmem.NewStore()}}, + {"nil applications", listmyapplications.Dependencies{Games: gameinmem.NewStore()}}, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { diff --git a/lobby/internal/service/listmygames/service_test.go b/lobby/internal/service/listmygames/service_test.go index 5fb3389..215cba1 100644 --- a/lobby/internal/service/listmygames/service_test.go +++ b/lobby/internal/service/listmygames/service_test.go @@ -7,8 +7,8 @@ import ( "testing" "time" - "galaxy/lobby/internal/adapters/gamestub" - "galaxy/lobby/internal/adapters/membershipstub" + "galaxy/lobby/internal/adapters/gameinmem" + "galaxy/lobby/internal/adapters/membershipinmem" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" "galaxy/lobby/internal/domain/membership" @@ -24,15 +24,15 @@ func silentLogger() *slog.Logger { } type fixture struct { - games *gamestub.Store - memberships *membershipstub.Store + games *gameinmem.Store + memberships *membershipinmem.Store svc *listmygames.Service } func newFixture(t *testing.T) *fixture { t.Helper() - games := gamestub.NewStore() - memberships := membershipstub.NewStore() + games := gameinmem.NewStore() + memberships := membershipinmem.NewStore() svc, err := listmygames.NewService(listmygames.Dependencies{ Games: games, Memberships: memberships, @@ -44,7 +44,7 @@ func newFixture(t *testing.T) *fixture { func seedGameWithStatus( t *testing.T, - store *gamestub.Store, + store *gameinmem.Store, id common.GameID, status game.Status, now time.Time, @@ -78,7 +78,7 @@ func seedGameWithStatus( func seedMembership( t *testing.T, - store *membershipstub.Store, + store *membershipinmem.Store, gameID common.GameID, userID string, status membership.Status, @@ -188,8 +188,8 @@ func TestNewServiceRejectsMissingDeps(t *testing.T) { name string deps listmygames.Dependencies }{ - {"nil games", listmygames.Dependencies{Memberships: membershipstub.NewStore()}}, - {"nil memberships", listmygames.Dependencies{Games: gamestub.NewStore()}}, + {"nil games", listmygames.Dependencies{Memberships: membershipinmem.NewStore()}}, + {"nil memberships", listmygames.Dependencies{Games: gameinmem.NewStore()}}, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { diff --git a/lobby/internal/service/listmyinvites/service_test.go b/lobby/internal/service/listmyinvites/service_test.go index 1815c03..c77d65b 100644 --- a/lobby/internal/service/listmyinvites/service_test.go +++ b/lobby/internal/service/listmyinvites/service_test.go @@ -7,9 +7,9 @@ import ( "testing" "time" - "galaxy/lobby/internal/adapters/gamestub" - "galaxy/lobby/internal/adapters/invitestub" - "galaxy/lobby/internal/adapters/membershipstub" + 
"galaxy/lobby/internal/adapters/gameinmem" + "galaxy/lobby/internal/adapters/inviteinmem" + "galaxy/lobby/internal/adapters/membershipinmem" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" "galaxy/lobby/internal/domain/invite" @@ -26,17 +26,17 @@ func silentLogger() *slog.Logger { } type fixture struct { - games *gamestub.Store - invites *invitestub.Store - memberships *membershipstub.Store + games *gameinmem.Store + invites *inviteinmem.Store + memberships *membershipinmem.Store svc *listmyinvites.Service } func newFixture(t *testing.T) *fixture { t.Helper() - games := gamestub.NewStore() - invites := invitestub.NewStore() - memberships := membershipstub.NewStore() + games := gameinmem.NewStore() + invites := inviteinmem.NewStore() + memberships := membershipinmem.NewStore() svc, err := listmyinvites.NewService(listmyinvites.Dependencies{ Games: games, Invites: invites, @@ -49,7 +49,7 @@ func newFixture(t *testing.T) *fixture { func seedPrivateGame( t *testing.T, - store *gamestub.Store, + store *gameinmem.Store, id common.GameID, owner string, name string, @@ -76,7 +76,7 @@ func seedPrivateGame( func seedInvite( t *testing.T, - store *invitestub.Store, + store *inviteinmem.Store, id common.InviteID, gameID common.GameID, inviter, invitee string, @@ -110,7 +110,7 @@ func seedInvite( func seedActiveMembership( t *testing.T, - store *membershipstub.Store, + store *membershipinmem.Store, gameID common.GameID, userID, raceName string, now time.Time, @@ -222,9 +222,9 @@ func TestNewServiceRejectsMissingDeps(t *testing.T) { name string deps listmyinvites.Dependencies }{ - {"nil games", listmyinvites.Dependencies{Invites: invitestub.NewStore(), Memberships: membershipstub.NewStore()}}, - {"nil invites", listmyinvites.Dependencies{Games: gamestub.NewStore(), Memberships: membershipstub.NewStore()}}, - {"nil memberships", listmyinvites.Dependencies{Games: gamestub.NewStore(), Invites: invitestub.NewStore()}}, + {"nil games", listmyinvites.Dependencies{Invites: inviteinmem.NewStore(), Memberships: membershipinmem.NewStore()}}, + {"nil invites", listmyinvites.Dependencies{Games: gameinmem.NewStore(), Memberships: membershipinmem.NewStore()}}, + {"nil memberships", listmyinvites.Dependencies{Games: gameinmem.NewStore(), Invites: inviteinmem.NewStore()}}, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { diff --git a/lobby/internal/service/listmyracenames/service_test.go b/lobby/internal/service/listmyracenames/service_test.go index 800db6b..3e1dda6 100644 --- a/lobby/internal/service/listmyracenames/service_test.go +++ b/lobby/internal/service/listmyracenames/service_test.go @@ -7,8 +7,8 @@ import ( "testing" "time" - "galaxy/lobby/internal/adapters/gamestub" - "galaxy/lobby/internal/adapters/racenamestub" + "galaxy/lobby/internal/adapters/gameinmem" + "galaxy/lobby/internal/adapters/racenameinmem" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" "galaxy/lobby/internal/ports" @@ -28,17 +28,17 @@ func silentLogger() *slog.Logger { // race-name directory stub and the in-process game store. 
type fixture struct { now time.Time - directory *racenamestub.Directory - games *gamestub.Store + directory *racenameinmem.Directory + games *gameinmem.Store service *listmyracenames.Service } func newFixture(t *testing.T) *fixture { t.Helper() now := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) - directory, err := racenamestub.NewDirectory(racenamestub.WithClock(func() time.Time { return now })) + directory, err := racenameinmem.NewDirectory(racenameinmem.WithClock(func() time.Time { return now })) require.NoError(t, err) - games := gamestub.NewStore() + games := gameinmem.NewStore() svc, err := listmyracenames.NewService(listmyracenames.Dependencies{ Directory: directory, Games: games, @@ -217,9 +217,9 @@ func TestHandleSortByTimestamp(t *testing.T) { const userID = "user-sort" now := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) clock := now - directory, err := racenamestub.NewDirectory(racenamestub.WithClock(func() time.Time { return clock })) + directory, err := racenameinmem.NewDirectory(racenameinmem.WithClock(func() time.Time { return clock })) require.NoError(t, err) - games := gamestub.NewStore() + games := gameinmem.NewStore() svc, err := listmyracenames.NewService(listmyracenames.Dependencies{ Directory: directory, Games: games, @@ -281,9 +281,9 @@ func TestHandleSortByTimestamp(t *testing.T) { func TestNewServiceRejectsMissingDeps(t *testing.T) { t.Parallel() - directory, err := racenamestub.NewDirectory() + directory, err := racenameinmem.NewDirectory() require.NoError(t, err) - games := gamestub.NewStore() + games := gameinmem.NewStore() _, err = listmyracenames.NewService(listmyracenames.Dependencies{ Games: games, @@ -299,4 +299,4 @@ func TestNewServiceRejectsMissingDeps(t *testing.T) { // Sanity guard so a future port refactor that drops the user-keyed // indexes immediately breaks the test build instead of silently // regressing the no-full-scan invariant. -var _ ports.RaceNameDirectory = (*racenamestub.Directory)(nil) +var _ ports.RaceNameDirectory = (*racenameinmem.Directory)(nil) diff --git a/lobby/internal/service/manualreadytostart/service_test.go b/lobby/internal/service/manualreadytostart/service_test.go index 36caedd..c7f07ad 100644 --- a/lobby/internal/service/manualreadytostart/service_test.go +++ b/lobby/internal/service/manualreadytostart/service_test.go @@ -4,13 +4,14 @@ import ( "context" "io" "log/slog" + "sync" "testing" "time" - "galaxy/lobby/internal/adapters/gamestub" - "galaxy/lobby/internal/adapters/intentpubstub" - "galaxy/lobby/internal/adapters/invitestub" - "galaxy/lobby/internal/adapters/membershipstub" + "galaxy/lobby/internal/adapters/gameinmem" + "galaxy/lobby/internal/adapters/inviteinmem" + "galaxy/lobby/internal/adapters/membershipinmem" + "galaxy/lobby/internal/adapters/mocks" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" "galaxy/lobby/internal/domain/invite" @@ -21,8 +22,34 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" ) +type intentRec struct { + mu sync.Mutex + published []notificationintent.Intent +} + +func (r *intentRec) record(_ context.Context, intent notificationintent.Intent) (string, error) { + r.mu.Lock() + defer r.mu.Unlock() + r.published = append(r.published, intent) + return "1", nil +} + +func (r *intentRec) snapshot() []notificationintent.Intent { + r.mu.Lock() + defer r.mu.Unlock() + return append([]notificationintent.Intent(nil), r.published...) 
+} + +func newIntentMock(t *testing.T, rec *intentRec) *mocks.MockIntentPublisher { + t.Helper() + m := mocks.NewMockIntentPublisher(gomock.NewController(t)) + m.EXPECT().Publish(gomock.Any(), gomock.Any()).DoAndReturn(rec.record).AnyTimes() + return m +} + const ( publicGameID = common.GameID("game-public") privateGameID = common.GameID("game-private") @@ -35,22 +62,26 @@ func fixedClock(at time.Time) func() time.Time { return func() time.Time { retur type fixture struct { now time.Time - games *gamestub.Store - invites *invitestub.Store - memberships *membershipstub.Store - intents *intentpubstub.Publisher + games *gameinmem.Store + invites *inviteinmem.Store + memberships *membershipinmem.Store + intentRec *intentRec + intents *mocks.MockIntentPublisher } func newFixture(t *testing.T) *fixture { t.Helper() now := time.Date(2026, 4, 25, 10, 0, 0, 0, time.UTC) - return &fixture{ + rec := &intentRec{} + f := &fixture{ now: now, - games: gamestub.NewStore(), - invites: invitestub.NewStore(), - memberships: membershipstub.NewStore(), - intents: intentpubstub.NewPublisher(), + games: gameinmem.NewStore(), + invites: inviteinmem.NewStore(), + memberships: membershipinmem.NewStore(), + intentRec: rec, } + f.intents = newIntentMock(t, rec) + return f } func (f *fixture) addGame(t *testing.T, gameID common.GameID, gameType game.GameType, owner string, minPlayers int) game.Game { @@ -154,7 +185,7 @@ func TestHandleOwnerClosesPrivateEnrollmentAndExpiresInvites(t *testing.T) { assert.Equal(t, invite.StatusExpired, rec.Status) } - intents := f.intents.Published() + intents := f.intentRec.snapshot() require.Len(t, intents, 2) for _, intent := range intents { assert.Equal(t, notificationintent.NotificationTypeLobbyInviteExpired, intent.NotificationType) @@ -231,7 +262,7 @@ func TestHandleBelowMinPlayersConflict(t *testing.T) { current, err := f.games.Get(context.Background(), publicGameID) require.NoError(t, err) assert.Equal(t, game.StatusEnrollmentOpen, current.Status) - assert.Empty(t, f.intents.Published()) + assert.Empty(t, f.intentRec.snapshot()) } func TestHandleEmptyInvitesProducesNoNotifications(t *testing.T) { @@ -246,5 +277,5 @@ func TestHandleEmptyInvitesProducesNoNotifications(t *testing.T) { GameID: privateGameID, }) require.NoError(t, err) - assert.Empty(t, f.intents.Published()) + assert.Empty(t, f.intentRec.snapshot()) } diff --git a/lobby/internal/service/openenrollment/service_test.go b/lobby/internal/service/openenrollment/service_test.go index 987cc2b..0a72380 100644 --- a/lobby/internal/service/openenrollment/service_test.go +++ b/lobby/internal/service/openenrollment/service_test.go @@ -7,7 +7,7 @@ import ( "testing" "time" - "galaxy/lobby/internal/adapters/gamestub" + "galaxy/lobby/internal/adapters/gameinmem" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" "galaxy/lobby/internal/ports" @@ -27,7 +27,7 @@ func fixedClock(at time.Time) func() time.Time { func seedDraftGame( t *testing.T, - store *gamestub.Store, + store *gameinmem.Store, id common.GameID, gameType game.GameType, ownerUserID string, @@ -71,7 +71,7 @@ func TestHandleAdminHappyPath(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() seedDraftGame(t, store, "game-alpha", game.GameTypePublic, "", now) service := newService(t, store, fixedClock(now.Add(time.Hour))) @@ -89,7 +89,7 @@ func TestHandleOwnerHappyPath(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := 
gamestub.NewStore() + store := gameinmem.NewStore() seedDraftGame(t, store, "game-p", game.GameTypePrivate, "user-1", now) service := newService(t, store, fixedClock(now.Add(time.Hour))) @@ -106,7 +106,7 @@ func TestHandleNonOwnerForbidden(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() seedDraftGame(t, store, "game-p", game.GameTypePrivate, "user-1", now) service := newService(t, store, fixedClock(now.Add(time.Hour))) @@ -122,7 +122,7 @@ func TestHandleUserCannotOpenPublicGame(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() seedDraftGame(t, store, "game-pub", game.GameTypePublic, "", now) service := newService(t, store, fixedClock(now.Add(time.Hour))) @@ -138,7 +138,7 @@ func TestHandleFromEnrollmentOpenConflict(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() record := seedDraftGame(t, store, "game-x", game.GameTypePublic, "", now) require.NoError(t, store.UpdateStatus(context.Background(), ports.UpdateStatusInput{ GameID: record.GameID, @@ -161,7 +161,7 @@ func TestHandleFromReadyToStartInvalidTransition(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() record := seedDraftGame(t, store, "game-rts", game.GameTypePublic, "", now) require.NoError(t, store.UpdateStatus(context.Background(), ports.UpdateStatusInput{ GameID: record.GameID, @@ -191,7 +191,7 @@ func TestHandleFromReadyToStartInvalidTransition(t *testing.T) { func TestHandleNotFound(t *testing.T) { t.Parallel() - store := gamestub.NewStore() + store := gameinmem.NewStore() service := newService(t, store, fixedClock(time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC))) _, err := service.Handle(context.Background(), openenrollment.Input{ @@ -204,7 +204,7 @@ func TestHandleNotFound(t *testing.T) { func TestHandleInvalidActor(t *testing.T) { t.Parallel() - store := gamestub.NewStore() + store := gameinmem.NewStore() service := newService(t, store, fixedClock(time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC))) _, err := service.Handle(context.Background(), openenrollment.Input{ @@ -218,7 +218,7 @@ func TestHandleInvalidActor(t *testing.T) { func TestHandleInvalidGameID(t *testing.T) { t.Parallel() - store := gamestub.NewStore() + store := gameinmem.NewStore() service := newService(t, store, fixedClock(time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC))) _, err := service.Handle(context.Background(), openenrollment.Input{ diff --git a/lobby/internal/service/pausegame/service_test.go b/lobby/internal/service/pausegame/service_test.go index 85cf405..8160395 100644 --- a/lobby/internal/service/pausegame/service_test.go +++ b/lobby/internal/service/pausegame/service_test.go @@ -7,7 +7,7 @@ import ( "testing" "time" - "galaxy/lobby/internal/adapters/gamestub" + "galaxy/lobby/internal/adapters/gameinmem" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" "galaxy/lobby/internal/ports" @@ -32,7 +32,7 @@ func fixedClock(at time.Time) func() time.Time { // any source status. 
func seedGameWithStatus( t *testing.T, - store *gamestub.Store, + store *gameinmem.Store, id common.GameID, gameType game.GameType, ownerUserID string, @@ -98,7 +98,7 @@ func TestPauseGameAdminHappyPath(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() record := seedGameWithStatus(t, store, "game-pub", game.GameTypePublic, "", game.StatusRunning, now) at := now.Add(time.Hour) @@ -117,7 +117,7 @@ func TestPauseGamePrivateOwnerHappyPath(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() record := seedGameWithStatus(t, store, "game-priv", game.GameTypePrivate, "user-owner", game.StatusRunning, now) service := newService(t, store, fixedClock(now.Add(time.Hour))) @@ -134,7 +134,7 @@ func TestPauseGameRejectsNonOwnerUser(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() record := seedGameWithStatus(t, store, "game-priv", game.GameTypePrivate, "user-owner", game.StatusRunning, now) service := newService(t, store, fixedClock(now.Add(time.Hour))) @@ -150,7 +150,7 @@ func TestPauseGameRejectsUserActorOnPublicGame(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() record := seedGameWithStatus(t, store, "game-pub", game.GameTypePublic, "", game.StatusRunning, now) service := newService(t, store, fixedClock(now.Add(time.Hour))) @@ -181,7 +181,7 @@ func TestPauseGameRejectsWrongStatuses(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() record := seedGameWithStatus(t, store, "game-x", game.GameTypePublic, "", status, now) service := newService(t, store, fixedClock(now.Add(time.Hour))) @@ -197,7 +197,7 @@ func TestPauseGameRejectsWrongStatuses(t *testing.T) { func TestPauseGameRejectsMissingRecord(t *testing.T) { t.Parallel() - store := gamestub.NewStore() + store := gameinmem.NewStore() service := newService(t, store, fixedClock(time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC))) _, err := service.Handle(context.Background(), pausegame.Input{ @@ -210,7 +210,7 @@ func TestPauseGameRejectsMissingRecord(t *testing.T) { func TestPauseGameInvalidActor(t *testing.T) { t.Parallel() - store := gamestub.NewStore() + store := gameinmem.NewStore() service := newService(t, store, fixedClock(time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC))) _, err := service.Handle(context.Background(), pausegame.Input{ @@ -224,7 +224,7 @@ func TestPauseGameInvalidActor(t *testing.T) { func TestPauseGameInvalidGameID(t *testing.T) { t.Parallel() - store := gamestub.NewStore() + store := gameinmem.NewStore() service := newService(t, store, fixedClock(time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC))) _, err := service.Handle(context.Background(), pausegame.Input{ diff --git a/lobby/internal/service/redeeminvite/service_test.go b/lobby/internal/service/redeeminvite/service_test.go index f3e7b43..f3c41a7 100644 --- a/lobby/internal/service/redeeminvite/service_test.go +++ b/lobby/internal/service/redeeminvite/service_test.go @@ -5,16 +5,16 @@ import ( "errors" "io" "log/slog" + "sync" "testing" "time" - "galaxy/lobby/internal/adapters/gamestub" - "galaxy/lobby/internal/adapters/gapactivationstub" - "galaxy/lobby/internal/adapters/intentpubstub" - 
"galaxy/lobby/internal/adapters/invitestub" - "galaxy/lobby/internal/adapters/membershipstub" - "galaxy/lobby/internal/adapters/racenamestub" - "galaxy/lobby/internal/adapters/userservicestub" + "galaxy/lobby/internal/adapters/gameinmem" + "galaxy/lobby/internal/adapters/gapactivationinmem" + "galaxy/lobby/internal/adapters/inviteinmem" + "galaxy/lobby/internal/adapters/membershipinmem" + "galaxy/lobby/internal/adapters/mocks" + "galaxy/lobby/internal/adapters/racenameinmem" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" "galaxy/lobby/internal/domain/invite" @@ -26,8 +26,87 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" ) +type intentRec struct { + mu sync.Mutex + published []notificationintent.Intent + err error +} + +func (r *intentRec) record(_ context.Context, intent notificationintent.Intent) (string, error) { + r.mu.Lock() + defer r.mu.Unlock() + if r.err != nil { + return "", r.err + } + r.published = append(r.published, intent) + return "1", nil +} + +func (r *intentRec) snapshot() []notificationintent.Intent { + r.mu.Lock() + defer r.mu.Unlock() + return append([]notificationintent.Intent(nil), r.published...) +} + +func (r *intentRec) setErr(err error) { + r.mu.Lock() + defer r.mu.Unlock() + r.err = err +} + +type userRec struct { + mu sync.Mutex + elig map[string]ports.Eligibility + failures map[string]error +} + +func (r *userRec) record(_ context.Context, userID string) (ports.Eligibility, error) { + r.mu.Lock() + defer r.mu.Unlock() + if err, ok := r.failures[userID]; ok { + return ports.Eligibility{}, err + } + if e, ok := r.elig[userID]; ok { + return e, nil + } + return ports.Eligibility{Exists: false}, nil +} + +func (r *userRec) setEligibility(userID string, e ports.Eligibility) { + r.mu.Lock() + defer r.mu.Unlock() + if r.elig == nil { + r.elig = make(map[string]ports.Eligibility) + } + r.elig[userID] = e +} + +func (r *userRec) setFailure(userID string, err error) { + r.mu.Lock() + defer r.mu.Unlock() + if r.failures == nil { + r.failures = make(map[string]error) + } + r.failures[userID] = err +} + +func newIntentMock(t *testing.T, rec *intentRec) *mocks.MockIntentPublisher { + t.Helper() + m := mocks.NewMockIntentPublisher(gomock.NewController(t)) + m.EXPECT().Publish(gomock.Any(), gomock.Any()).DoAndReturn(rec.record).AnyTimes() + return m +} + +func newUserMock(t *testing.T, rec *userRec) *mocks.MockUserService { + t.Helper() + m := mocks.NewMockUserService(gomock.NewController(t)) + m.EXPECT().GetEligibility(gomock.Any(), gomock.Any()).DoAndReturn(rec.record).AnyTimes() + return m +} + const ( ownerUserID = "user-owner" inviteeUserID = "user-invitee" @@ -49,13 +128,15 @@ func (f fixedIDs) NewMembershipID() (common.MembershipID, error) { return f.me type fixture struct { now time.Time - games *gamestub.Store - invites *invitestub.Store - memberships *membershipstub.Store - directory *racenamestub.Directory - users *userservicestub.Service - gapStore *gapactivationstub.Store - intents *intentpubstub.Publisher + games *gameinmem.Store + invites *inviteinmem.Store + memberships *membershipinmem.Store + directory *racenameinmem.Directory + users *userRec + usersMock *mocks.MockUserService + gapStore *gapactivationinmem.Store + intents *intentRec + intentsMock *mocks.MockIntentPublisher ids fixedIDs game game.Game } @@ -63,11 +144,11 @@ type fixture struct { func newFixture(t *testing.T, maxPlayers, gapPlayers int) *fixture { t.Helper() now := time.Date(2026, 4, 25, 10, 0, 0, 
0, time.UTC) - dir, err := racenamestub.NewDirectory(racenamestub.WithClock(fixedClock(now))) + dir, err := racenameinmem.NewDirectory(racenameinmem.WithClock(fixedClock(now))) require.NoError(t, err) - games := gamestub.NewStore() - invites := invitestub.NewStore() - memberships := membershipstub.NewStore() + games := gameinmem.NewStore() + invites := inviteinmem.NewStore() + memberships := membershipinmem.NewStore() gameRecord, err := game.New(game.NewGameInput{ GameID: "game-private", @@ -87,7 +168,7 @@ func newFixture(t *testing.T, maxPlayers, gapPlayers int) *fixture { gameRecord.Status = game.StatusEnrollmentOpen require.NoError(t, games.Save(context.Background(), gameRecord)) - users := userservicestub.NewService() + users := &userRec{} activeEligibility := ports.Eligibility{ Exists: true, CanLogin: true, @@ -96,9 +177,10 @@ func newFixture(t *testing.T, maxPlayers, gapPlayers int) *fixture { CanJoinGame: true, CanUpdateProfile: true, } - users.SetEligibility(ownerUserID, activeEligibility) - users.SetEligibility(inviteeUserID, activeEligibility) + users.setEligibility(ownerUserID, activeEligibility) + users.setEligibility(inviteeUserID, activeEligibility) + intents := &intentRec{} return &fixture{ now: now, games: games, @@ -106,8 +188,10 @@ func newFixture(t *testing.T, maxPlayers, gapPlayers int) *fixture { memberships: memberships, directory: dir, users: users, - gapStore: gapactivationstub.NewStore(), - intents: intentpubstub.NewPublisher(), + usersMock: newUserMock(t, users), + gapStore: gapactivationinmem.NewStore(), + intents: intents, + intentsMock: newIntentMock(t, intents), ids: fixedIDs{membershipID: "membership-fixed"}, game: gameRecord, } @@ -120,9 +204,9 @@ func newService(t *testing.T, f *fixture) *redeeminvite.Service { Invites: f.invites, Memberships: f.memberships, Directory: f.directory, - Users: f.users, + Users: f.usersMock, GapStore: f.gapStore, - Intents: f.intents, + Intents: f.intentsMock, IDs: f.ids, Clock: fixedClock(f.now), Logger: silentLogger(), @@ -181,7 +265,7 @@ func TestRedeemHappyPath(t *testing.T) { assert.True(t, avail.Taken) assert.Equal(t, inviteeUserID, avail.HolderUserID) - intents := f.intents.Published() + intents := f.intents.snapshot() require.Len(t, intents, 1) assert.Equal(t, notificationintent.NotificationTypeLobbyInviteRedeemed, intents[0].NotificationType) assert.Equal(t, []string{ownerUserID}, intents[0].RecipientUserIDs) @@ -194,7 +278,7 @@ func TestRedeemRejectsInviterPermanentBlock(t *testing.T) { t.Parallel() f := newFixture(t, 4, 1) inv := seedCreatedInvite(t, f, "invite-1", inviteeUserID) - f.users.SetEligibility(ownerUserID, ports.Eligibility{ + f.users.setEligibility(ownerUserID, ports.Eligibility{ Exists: true, PermanentBlocked: true, }) @@ -212,7 +296,7 @@ func TestRedeemRejectsInviteePermanentBlock(t *testing.T) { t.Parallel() f := newFixture(t, 4, 1) inv := seedCreatedInvite(t, f, "invite-1", inviteeUserID) - f.users.SetEligibility(inviteeUserID, ports.Eligibility{ + f.users.setEligibility(inviteeUserID, ports.Eligibility{ Exists: true, PermanentBlocked: true, }) @@ -226,7 +310,7 @@ func TestRedeemRejectsDeletedInviter(t *testing.T) { t.Parallel() f := newFixture(t, 4, 1) inv := seedCreatedInvite(t, f, "invite-1", inviteeUserID) - f.users.SetEligibility(ownerUserID, ports.Eligibility{Exists: false}) + f.users.setEligibility(ownerUserID, ports.Eligibility{Exists: false}) svc := newService(t, f) _, err := svc.Handle(context.Background(), defaultInput(f, inv)) @@ -237,7 +321,7 @@ func 
TestRedeemSurfacesUserServiceTransportFailure(t *testing.T) { t.Parallel() f := newFixture(t, 4, 1) inv := seedCreatedInvite(t, f, "invite-1", inviteeUserID) - f.users.SetFailure(ownerUserID, ports.ErrUserServiceUnavailable) + f.users.setFailure(ownerUserID, ports.ErrUserServiceUnavailable) svc := newService(t, f) _, err := svc.Handle(context.Background(), defaultInput(f, inv)) @@ -410,10 +494,10 @@ func TestRedeemInvalidRaceName(t *testing.T) { require.ErrorIs(t, err, ports.ErrInvalidName) } -// redeemCASStub wraps invitestub.Store but injects ErrConflict on the next +// redeemCASStub wraps inviteinmem.Store but injects ErrConflict on the next // UpdateStatus call so we can observe the rollback path. type redeemCASStub struct { - *invitestub.Store + *inviteinmem.Store failNext bool } @@ -436,9 +520,9 @@ func TestRedeemCASConflictReleasesReservation(t *testing.T) { Invites: cas, Memberships: f.memberships, Directory: f.directory, - Users: f.users, + Users: f.usersMock, GapStore: f.gapStore, - Intents: f.intents, + Intents: f.intentsMock, IDs: f.ids, Clock: fixedClock(f.now), Logger: silentLogger(), @@ -458,7 +542,7 @@ func TestRedeemPublishFailureDoesNotRollback(t *testing.T) { t.Parallel() f := newFixture(t, 4, 1) inv := seedCreatedInvite(t, f, "invite-1", inviteeUserID) - f.intents.SetError(errors.New("publish failed")) + f.intents.setErr(errors.New("publish failed")) svc := newService(t, f) got, err := svc.Handle(context.Background(), defaultInput(f, inv)) diff --git a/lobby/internal/service/registerracename/service_test.go b/lobby/internal/service/registerracename/service_test.go index ad0e1c7..6ebbd6c 100644 --- a/lobby/internal/service/registerracename/service_test.go +++ b/lobby/internal/service/registerracename/service_test.go @@ -6,12 +6,12 @@ import ( "errors" "io" "log/slog" + "sync" "testing" "time" - "galaxy/lobby/internal/adapters/intentpubstub" - "galaxy/lobby/internal/adapters/racenamestub" - "galaxy/lobby/internal/adapters/userservicestub" + "galaxy/lobby/internal/adapters/mocks" + "galaxy/lobby/internal/adapters/racenameinmem" "galaxy/lobby/internal/ports" "galaxy/lobby/internal/service/registerracename" "galaxy/lobby/internal/service/shared" @@ -19,28 +19,113 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" ) +type intentRec struct { + mu sync.Mutex + published []notificationintent.Intent + err error +} + +func (r *intentRec) record(_ context.Context, intent notificationintent.Intent) (string, error) { + r.mu.Lock() + defer r.mu.Unlock() + if r.err != nil { + return "", r.err + } + r.published = append(r.published, intent) + return "1", nil +} + +func (r *intentRec) snapshot() []notificationintent.Intent { + r.mu.Lock() + defer r.mu.Unlock() + return append([]notificationintent.Intent(nil), r.published...) 
+} + +func (r *intentRec) setErr(err error) { + r.mu.Lock() + defer r.mu.Unlock() + r.err = err +} + +type userRec struct { + mu sync.Mutex + elig map[string]ports.Eligibility + failures map[string]error +} + +func (r *userRec) record(_ context.Context, userID string) (ports.Eligibility, error) { + r.mu.Lock() + defer r.mu.Unlock() + if err, ok := r.failures[userID]; ok { + return ports.Eligibility{}, err + } + if e, ok := r.elig[userID]; ok { + return e, nil + } + return ports.Eligibility{Exists: false}, nil +} + +func (r *userRec) setEligibility(userID string, e ports.Eligibility) { + r.mu.Lock() + defer r.mu.Unlock() + if r.elig == nil { + r.elig = make(map[string]ports.Eligibility) + } + r.elig[userID] = e +} + +func (r *userRec) setFailure(userID string, err error) { + r.mu.Lock() + defer r.mu.Unlock() + if r.failures == nil { + r.failures = make(map[string]error) + } + r.failures[userID] = err +} + +func newIntentMock(t *testing.T, rec *intentRec) *mocks.MockIntentPublisher { + t.Helper() + m := mocks.NewMockIntentPublisher(gomock.NewController(t)) + m.EXPECT().Publish(gomock.Any(), gomock.Any()).DoAndReturn(rec.record).AnyTimes() + return m +} + +func newUserMock(t *testing.T, rec *userRec) *mocks.MockUserService { + t.Helper() + m := mocks.NewMockUserService(gomock.NewController(t)) + m.EXPECT().GetEligibility(gomock.Any(), gomock.Any()).DoAndReturn(rec.record).AnyTimes() + return m +} + func silentLogger() *slog.Logger { return slog.New(slog.NewTextHandler(io.Discard, nil)) } func fixedClock(at time.Time) func() time.Time { return func() time.Time { return at } } type fixture struct { now time.Time - directory *racenamestub.Directory - users *userservicestub.Service - intents *intentpubstub.Publisher + directory *racenameinmem.Directory + users *userRec + usersMock *mocks.MockUserService + intents *intentRec + pubMock *mocks.MockIntentPublisher } func newFixture(t *testing.T, now time.Time) *fixture { t.Helper() - directory, err := racenamestub.NewDirectory(racenamestub.WithClock(fixedClock(now))) + directory, err := racenameinmem.NewDirectory(racenameinmem.WithClock(fixedClock(now))) require.NoError(t, err) + users := &userRec{} + intents := &intentRec{} return &fixture{ now: now, directory: directory, - users: userservicestub.NewService(), - intents: intentpubstub.NewPublisher(), + users: users, + usersMock: newUserMock(t, users), + intents: intents, + pubMock: newIntentMock(t, intents), } } @@ -48,8 +133,8 @@ func (f *fixture) newService(t *testing.T) *registerracename.Service { t.Helper() svc, err := registerracename.NewService(registerracename.Dependencies{ Directory: f.directory, - Users: f.users, - Intents: f.intents, + Users: f.usersMock, + Intents: f.pubMock, Clock: fixedClock(f.now), Logger: silentLogger(), }) @@ -102,7 +187,7 @@ func TestRegisterRaceNameHappyPath(t *testing.T) { now := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) f := newFixture(t, now) - f.users.SetEligibility("user-1", defaultEligibility(2)) + f.users.setEligibility("user-1", defaultEligibility(2)) f.seedPending(t, "game-1", "user-1", "Stellaris", now.Add(7*24*time.Hour)) svc := f.newService(t) @@ -128,7 +213,7 @@ func TestRegisterRaceNameHappyPath(t *testing.T) { require.NoError(t, err) assert.Empty(t, pending) - intents := f.intents.Published() + intents := f.intents.snapshot() require.Len(t, intents, 1) intent := intents[0] assert.Equal(t, notificationintent.NotificationTypeLobbyRaceNameRegistered, intent.NotificationType) @@ -144,7 +229,7 @@ func TestRegisterRaceNameIdempotentRetry(t *testing.T) 
{ now := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) f := newFixture(t, now) - f.users.SetEligibility("user-1", defaultEligibility(1)) + f.users.setEligibility("user-1", defaultEligibility(1)) f.seedPending(t, "game-1", "user-1", "Stellaris", now.Add(7*24*time.Hour)) svc := f.newService(t) @@ -167,7 +252,7 @@ func TestRegisterRaceNameIdempotentRetry(t *testing.T) { require.NoError(t, err) assert.Len(t, registered, 1, "registration must remain idempotent") - intents := f.intents.Published() + intents := f.intents.snapshot() require.Len(t, intents, 2, "idempotent retry republishes the intent") for _, intent := range intents { assert.Equal(t, "lobby.race_name.registered:game-1:user-1", intent.IdempotencyKey) @@ -257,7 +342,7 @@ func TestRegisterRaceNameRejectsPermanentBlock(t *testing.T) { now := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) f := newFixture(t, now) - f.users.SetEligibility("user-1", ports.Eligibility{ + f.users.setEligibility("user-1", ports.Eligibility{ Exists: true, PermanentBlocked: true, MaxRegisteredRaceNames: 2, @@ -278,7 +363,7 @@ func TestRegisterRaceNamePendingMissing(t *testing.T) { now := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) f := newFixture(t, now) - f.users.SetEligibility("user-1", defaultEligibility(2)) + f.users.setEligibility("user-1", defaultEligibility(2)) svc := f.newService(t) _, err := svc.Handle(context.Background(), registerracename.Input{ @@ -294,7 +379,7 @@ func TestRegisterRaceNamePendingForOtherUserSurfacesAsMissing(t *testing.T) { now := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) f := newFixture(t, now) - f.users.SetEligibility("user-1", defaultEligibility(2)) + f.users.setEligibility("user-1", defaultEligibility(2)) // Pending exists for a different user; the actor has none. f.seedPending(t, "game-1", "user-other", "Stellaris", now.Add(24*time.Hour)) @@ -316,7 +401,7 @@ func TestRegisterRaceNamePendingExpired(t *testing.T) { now := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) f := newFixture(t, now) - f.users.SetEligibility("user-1", defaultEligibility(2)) + f.users.setEligibility("user-1", defaultEligibility(2)) // Pending elig until is in the past relative to now. f.seedPending(t, "game-1", "user-1", "Stellaris", now.Add(-time.Minute)) @@ -335,7 +420,7 @@ func TestRegisterRaceNameQuotaExceeded(t *testing.T) { now := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) f := newFixture(t, now) // Free-tier quota = 1; user already has one registered name. - f.users.SetEligibility("user-1", defaultEligibility(1)) + f.users.setEligibility("user-1", defaultEligibility(1)) f.seedRegistered(t, "game-existing", "user-1", "OldName") f.seedPending(t, "game-new", "user-1", "Stellaris", now.Add(24*time.Hour)) @@ -354,7 +439,7 @@ func TestRegisterRaceNameUnlimitedQuotaAllowsManyRegistrations(t *testing.T) { now := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) f := newFixture(t, now) // MaxRegisteredRaceNames=0 marker → unlimited. 
- f.users.SetEligibility("user-1", defaultEligibility(0)) + f.users.setEligibility("user-1", defaultEligibility(0)) f.seedRegistered(t, "game-a", "user-1", "First") f.seedRegistered(t, "game-b", "user-1", "Second") f.seedPending(t, "game-c", "user-1", "Third", now.Add(24*time.Hour)) @@ -373,7 +458,7 @@ func TestRegisterRaceNameUserServiceUnavailable(t *testing.T) { now := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) f := newFixture(t, now) - f.users.SetFailure("user-1", ports.ErrUserServiceUnavailable) + f.users.setFailure("user-1", ports.ErrUserServiceUnavailable) f.seedPending(t, "game-1", "user-1", "Stellaris", now.Add(24*time.Hour)) svc := f.newService(t) @@ -390,9 +475,9 @@ func TestRegisterRaceNameCommitsEvenIfPublishFails(t *testing.T) { now := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) f := newFixture(t, now) - f.users.SetEligibility("user-1", defaultEligibility(2)) + f.users.setEligibility("user-1", defaultEligibility(2)) f.seedPending(t, "game-1", "user-1", "Stellaris", now.Add(7*24*time.Hour)) - f.intents.SetError(errors.New("notification stream unavailable")) + f.intents.setErr(errors.New("notification stream unavailable")) svc := f.newService(t) out, err := svc.Handle(context.Background(), registerracename.Input{ diff --git a/lobby/internal/service/rejectapplication/service_test.go b/lobby/internal/service/rejectapplication/service_test.go index d7cd028..568b296 100644 --- a/lobby/internal/service/rejectapplication/service_test.go +++ b/lobby/internal/service/rejectapplication/service_test.go @@ -5,13 +5,14 @@ import ( "errors" "io" "log/slog" + "sync" "testing" "time" - "galaxy/lobby/internal/adapters/applicationstub" - "galaxy/lobby/internal/adapters/gamestub" - "galaxy/lobby/internal/adapters/intentpubstub" - "galaxy/lobby/internal/adapters/racenamestub" + "galaxy/lobby/internal/adapters/applicationinmem" + "galaxy/lobby/internal/adapters/gameinmem" + "galaxy/lobby/internal/adapters/mocks" + "galaxy/lobby/internal/adapters/racenameinmem" "galaxy/lobby/internal/domain/application" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" @@ -22,28 +23,65 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" ) +type intentRec struct { + mu sync.Mutex + published []notificationintent.Intent + err error +} + +func (r *intentRec) record(_ context.Context, intent notificationintent.Intent) (string, error) { + r.mu.Lock() + defer r.mu.Unlock() + if r.err != nil { + return "", r.err + } + r.published = append(r.published, intent) + return "1", nil +} + +func (r *intentRec) snapshot() []notificationintent.Intent { + r.mu.Lock() + defer r.mu.Unlock() + return append([]notificationintent.Intent(nil), r.published...) 
+} + +func (r *intentRec) setErr(err error) { + r.mu.Lock() + defer r.mu.Unlock() + r.err = err +} + +func newIntentMock(t *testing.T, rec *intentRec) *mocks.MockIntentPublisher { + t.Helper() + m := mocks.NewMockIntentPublisher(gomock.NewController(t)) + m.EXPECT().Publish(gomock.Any(), gomock.Any()).DoAndReturn(rec.record).AnyTimes() + return m +} + func silentLogger() *slog.Logger { return slog.New(slog.NewTextHandler(io.Discard, nil)) } func fixedClock(at time.Time) func() time.Time { return func() time.Time { return at } } type fixture struct { now time.Time - games *gamestub.Store - applications *applicationstub.Store - directory *racenamestub.Directory - intents *intentpubstub.Publisher + games *gameinmem.Store + applications *applicationinmem.Store + directory *racenameinmem.Directory + intentRec *intentRec + intents *mocks.MockIntentPublisher openPublicGameID common.GameID } func newFixture(t *testing.T) *fixture { t.Helper() now := time.Date(2026, 4, 25, 10, 0, 0, 0, time.UTC) - dir, err := racenamestub.NewDirectory(racenamestub.WithClock(fixedClock(now))) + dir, err := racenameinmem.NewDirectory(racenameinmem.WithClock(fixedClock(now))) require.NoError(t, err) - games := gamestub.NewStore() - applications := applicationstub.NewStore() + games := gameinmem.NewStore() + applications := applicationinmem.NewStore() gameRecord, err := game.New(game.NewGameInput{ GameID: "game-public", @@ -62,18 +100,22 @@ func newFixture(t *testing.T) *fixture { gameRecord.Status = game.StatusEnrollmentOpen require.NoError(t, games.Save(context.Background(), gameRecord)) + rec := &intentRec{} return &fixture{ now: now, games: games, applications: applications, directory: dir, - intents: intentpubstub.NewPublisher(), + intentRec: rec, openPublicGameID: gameRecord.GameID, } } func newService(t *testing.T, f *fixture) *rejectapplication.Service { t.Helper() + if f.intents == nil { + f.intents = newIntentMock(t, f.intentRec) + } svc, err := rejectapplication.NewService(rejectapplication.Dependencies{ Games: f.games, Applications: f.applications, @@ -116,7 +158,7 @@ func TestRejectHappyPath(t *testing.T) { require.NotNil(t, got.DecidedAt) assert.Equal(t, f.now, got.DecidedAt.UTC()) - intents := f.intents.Published() + intents := f.intentRec.snapshot() require.Len(t, intents, 1) assert.Equal(t, notificationintent.NotificationTypeLobbyMembershipRejected, intents[0].NotificationType) assert.Equal(t, []string{"user-1"}, intents[0].RecipientUserIDs) @@ -208,7 +250,7 @@ func TestRejectPublishFailureDoesNotRollback(t *testing.T) { t.Parallel() f := newFixture(t) app := seedSubmittedApplication(t, f, "application-1", "user-1", "SolarPilot") - f.intents.SetError(errors.New("publish failed")) + f.intentRec.setErr(errors.New("publish failed")) svc := newService(t, f) got, err := svc.Handle(context.Background(), rejectapplication.Input{ diff --git a/lobby/internal/service/removemember/service_test.go b/lobby/internal/service/removemember/service_test.go index 738317f..f77413c 100644 --- a/lobby/internal/service/removemember/service_test.go +++ b/lobby/internal/service/removemember/service_test.go @@ -8,9 +8,9 @@ import ( "testing" "time" - "galaxy/lobby/internal/adapters/gamestub" - "galaxy/lobby/internal/adapters/membershipstub" - "galaxy/lobby/internal/adapters/racenamestub" + "galaxy/lobby/internal/adapters/gameinmem" + "galaxy/lobby/internal/adapters/membershipinmem" + "galaxy/lobby/internal/adapters/racenameinmem" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" 
"galaxy/lobby/internal/domain/membership" @@ -31,20 +31,20 @@ func fixedClock(at time.Time) func() time.Time { } type fixtures struct { - games *gamestub.Store - memberships *membershipstub.Store - directory *racenamestub.Directory + games *gameinmem.Store + memberships *membershipinmem.Store + directory *racenameinmem.Directory } func newFixtures(t *testing.T) *fixtures { t.Helper() - directory, err := racenamestub.NewDirectory() + directory, err := racenameinmem.NewDirectory() require.NoError(t, err) return &fixtures{ - games: gamestub.NewStore(), - memberships: membershipstub.NewStore(), + games: gameinmem.NewStore(), + memberships: membershipinmem.NewStore(), directory: directory, } } diff --git a/lobby/internal/service/resumegame/service_test.go b/lobby/internal/service/resumegame/service_test.go index 2b6fd6a..b2fad9f 100644 --- a/lobby/internal/service/resumegame/service_test.go +++ b/lobby/internal/service/resumegame/service_test.go @@ -8,8 +8,8 @@ import ( "testing" "time" - "galaxy/lobby/internal/adapters/gamestub" - "galaxy/lobby/internal/adapters/gmclientstub" + "galaxy/lobby/internal/adapters/gameinmem" + "galaxy/lobby/internal/adapters/mocks" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" "galaxy/lobby/internal/ports" @@ -18,6 +18,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" ) func silentLogger() *slog.Logger { @@ -33,7 +34,7 @@ func fixedClock(at time.Time) func() time.Time { // source status. func seedGameWithStatus( t *testing.T, - store *gamestub.Store, + store *gameinmem.Store, id common.GameID, gameType game.GameType, ownerUserID string, @@ -94,13 +95,18 @@ func newService( return svc } +func newGMMock(t *testing.T) *mocks.MockGMClient { + t.Helper() + return mocks.NewMockGMClient(gomock.NewController(t)) +} + func TestNewServiceRejectsMissingDeps(t *testing.T) { t.Parallel() _, err := resumegame.NewService(resumegame.Dependencies{}) require.Error(t, err) - _, err = resumegame.NewService(resumegame.Dependencies{Games: gamestub.NewStore()}) + _, err = resumegame.NewService(resumegame.Dependencies{Games: gameinmem.NewStore()}) require.Error(t, err) } @@ -108,10 +114,11 @@ func TestResumeGameAdminHappyPath(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() record := seedGameWithStatus(t, store, "game-pub", game.GameTypePublic, "", game.StatusPaused, now) - gm := gmclientstub.NewClient() + gm := newGMMock(t) + gm.EXPECT().Ping(gomock.Any()).Return(nil).Times(1) service := newService(t, store, gm, fixedClock(now.Add(time.Hour))) updated, err := service.Handle(context.Background(), resumegame.Input{ @@ -120,17 +127,17 @@ func TestResumeGameAdminHappyPath(t *testing.T) { }) require.NoError(t, err) assert.Equal(t, game.StatusRunning, updated.Status) - assert.Equal(t, 1, gm.PingCalls()) } func TestResumeGamePrivateOwnerHappyPath(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() record := seedGameWithStatus(t, store, "game-priv", game.GameTypePrivate, "user-owner", game.StatusPaused, now) - gm := gmclientstub.NewClient() + gm := newGMMock(t) + gm.EXPECT().Ping(gomock.Any()).Return(nil).Times(1) service := newService(t, store, gm, fixedClock(now.Add(time.Hour))) updated, err := service.Handle(context.Background(), resumegame.Input{ @@ -139,17 +146,16 @@ func TestResumeGamePrivateOwnerHappyPath(t 
*testing.T) { }) require.NoError(t, err) assert.Equal(t, game.StatusRunning, updated.Status) - assert.Equal(t, 1, gm.PingCalls()) } func TestResumeGameRejectsNonOwnerUser(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() record := seedGameWithStatus(t, store, "game-priv", game.GameTypePrivate, "user-owner", game.StatusPaused, now) - gm := gmclientstub.NewClient() + gm := newGMMock(t) service := newService(t, store, gm, fixedClock(now.Add(time.Hour))) _, err := service.Handle(context.Background(), resumegame.Input{ @@ -157,17 +163,16 @@ func TestResumeGameRejectsNonOwnerUser(t *testing.T) { GameID: record.GameID, }) require.ErrorIs(t, err, shared.ErrForbidden) - assert.Equal(t, 0, gm.PingCalls(), "ping must not run before authorization passes") } func TestResumeGameRejectsUserActorOnPublicGame(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() record := seedGameWithStatus(t, store, "game-pub", game.GameTypePublic, "", game.StatusPaused, now) - gm := gmclientstub.NewClient() + gm := newGMMock(t) service := newService(t, store, gm, fixedClock(now.Add(time.Hour))) _, err := service.Handle(context.Background(), resumegame.Input{ @@ -175,7 +180,6 @@ func TestResumeGameRejectsUserActorOnPublicGame(t *testing.T) { GameID: record.GameID, }) require.ErrorIs(t, err, shared.ErrForbidden) - assert.Equal(t, 0, gm.PingCalls()) } func TestResumeGameRejectsWrongStatuses(t *testing.T) { @@ -197,10 +201,10 @@ func TestResumeGameRejectsWrongStatuses(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() record := seedGameWithStatus(t, store, "game-x", game.GameTypePublic, "", status, now) - gm := gmclientstub.NewClient() + gm := newGMMock(t) service := newService(t, store, gm, fixedClock(now.Add(time.Hour))) _, err := service.Handle(context.Background(), resumegame.Input{ @@ -208,7 +212,6 @@ func TestResumeGameRejectsWrongStatuses(t *testing.T) { GameID: record.GameID, }) require.ErrorIs(t, err, game.ErrConflict) - assert.Equal(t, 0, gm.PingCalls(), "ping must not run before status check passes") }) } } @@ -217,11 +220,13 @@ func TestResumeGameGMUnavailableKeepsPaused(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() record := seedGameWithStatus(t, store, "game-pub", game.GameTypePublic, "", game.StatusPaused, now) - gm := gmclientstub.NewClient() - gm.SetPingError(errors.Join(ports.ErrGMUnavailable, errors.New("dial tcp: connection refused"))) + gm := newGMMock(t) + gm.EXPECT().Ping(gomock.Any()). + Return(errors.Join(ports.ErrGMUnavailable, errors.New("dial tcp: connection refused"))). 
+ Times(1) service := newService(t, store, gm, fixedClock(now.Add(time.Hour))) _, err := service.Handle(context.Background(), resumegame.Input{ @@ -231,7 +236,6 @@ func TestResumeGameGMUnavailableKeepsPaused(t *testing.T) { require.Error(t, err) assert.ErrorIs(t, err, shared.ErrServiceUnavailable) assert.ErrorIs(t, err, ports.ErrGMUnavailable) - assert.Equal(t, 1, gm.PingCalls()) persisted, err := store.Get(context.Background(), record.GameID) require.NoError(t, err) @@ -242,8 +246,8 @@ func TestResumeGameGMUnavailableKeepsPaused(t *testing.T) { func TestResumeGameRejectsMissingRecord(t *testing.T) { t.Parallel() - gm := gmclientstub.NewClient() - store := gamestub.NewStore() + gm := newGMMock(t) + store := gameinmem.NewStore() service := newService(t, store, gm, fixedClock(time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC))) _, err := service.Handle(context.Background(), resumegame.Input{ @@ -251,14 +255,13 @@ func TestResumeGameRejectsMissingRecord(t *testing.T) { GameID: common.GameID("game-missing"), }) require.ErrorIs(t, err, game.ErrNotFound) - assert.Equal(t, 0, gm.PingCalls()) } func TestResumeGameInvalidActor(t *testing.T) { t.Parallel() - gm := gmclientstub.NewClient() - store := gamestub.NewStore() + gm := newGMMock(t) + store := gameinmem.NewStore() service := newService(t, store, gm, fixedClock(time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC))) _, err := service.Handle(context.Background(), resumegame.Input{ @@ -272,8 +275,8 @@ func TestResumeGameInvalidActor(t *testing.T) { func TestResumeGameInvalidGameID(t *testing.T) { t.Parallel() - gm := gmclientstub.NewClient() - store := gamestub.NewStore() + gm := newGMMock(t) + store := gameinmem.NewStore() service := newService(t, store, gm, fixedClock(time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC))) _, err := service.Handle(context.Background(), resumegame.Input{ diff --git a/lobby/internal/service/retrystartgame/service_test.go b/lobby/internal/service/retrystartgame/service_test.go index af24752..4625a0c 100644 --- a/lobby/internal/service/retrystartgame/service_test.go +++ b/lobby/internal/service/retrystartgame/service_test.go @@ -7,7 +7,7 @@ import ( "testing" "time" - "galaxy/lobby/internal/adapters/gamestub" + "galaxy/lobby/internal/adapters/gameinmem" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" "galaxy/lobby/internal/service/retrystartgame" @@ -47,7 +47,7 @@ func newFailedGame(t *testing.T, gameType game.GameType, ownerID string) (game.G return record, now } -func newService(t *testing.T, games *gamestub.Store, at time.Time) *retrystartgame.Service { +func newService(t *testing.T, games *gameinmem.Store, at time.Time) *retrystartgame.Service { t.Helper() service, err := retrystartgame.NewService(retrystartgame.Dependencies{ Games: games, @@ -65,7 +65,7 @@ func TestNewServiceRejectsMissingDeps(t *testing.T) { func TestRetryStartGameAdminHappyPath(t *testing.T) { record, now := newFailedGame(t, game.GameTypePublic, "") - games := gamestub.NewStore() + games := gameinmem.NewStore() require.NoError(t, games.Save(context.Background(), record)) service := newService(t, games, now.Add(time.Hour)) @@ -79,7 +79,7 @@ func TestRetryStartGameAdminHappyPath(t *testing.T) { func TestRetryStartGamePrivateOwnerHappyPath(t *testing.T) { record, now := newFailedGame(t, game.GameTypePrivate, "user-owner") - games := gamestub.NewStore() + games := gameinmem.NewStore() require.NoError(t, games.Save(context.Background(), record)) service := newService(t, games, now.Add(time.Hour)) @@ -93,7 +93,7 @@ func 
TestRetryStartGamePrivateOwnerHappyPath(t *testing.T) { func TestRetryStartGameRejectsNonOwnerUser(t *testing.T) { record, now := newFailedGame(t, game.GameTypePrivate, "user-owner") - games := gamestub.NewStore() + games := gameinmem.NewStore() require.NoError(t, games.Save(context.Background(), record)) service := newService(t, games, now.Add(time.Hour)) @@ -109,7 +109,7 @@ func TestRetryStartGameRejectsWrongStatus(t *testing.T) { record.Status = game.StatusRunning startedAt := now.Add(30 * time.Minute) record.StartedAt = &startedAt - games := gamestub.NewStore() + games := gameinmem.NewStore() require.NoError(t, games.Save(context.Background(), record)) service := newService(t, games, now.Add(time.Hour)) @@ -121,7 +121,7 @@ func TestRetryStartGameRejectsWrongStatus(t *testing.T) { } func TestRetryStartGameRejectsMissingRecord(t *testing.T) { - games := gamestub.NewStore() + games := gameinmem.NewStore() service := newService(t, games, time.Now().UTC()) _, err := service.Handle(context.Background(), retrystartgame.Input{ diff --git a/lobby/internal/service/revokeinvite/service_test.go b/lobby/internal/service/revokeinvite/service_test.go index 66d6a06..e55dd01 100644 --- a/lobby/internal/service/revokeinvite/service_test.go +++ b/lobby/internal/service/revokeinvite/service_test.go @@ -7,8 +7,8 @@ import ( "testing" "time" - "galaxy/lobby/internal/adapters/gamestub" - "galaxy/lobby/internal/adapters/invitestub" + "galaxy/lobby/internal/adapters/gameinmem" + "galaxy/lobby/internal/adapters/inviteinmem" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" "galaxy/lobby/internal/domain/invite" @@ -31,16 +31,16 @@ func fixedClock(at time.Time) func() time.Time { return func() time.Time { retur type fixture struct { now time.Time - games *gamestub.Store - invites *invitestub.Store + games *gameinmem.Store + invites *inviteinmem.Store game game.Game } func newFixture(t *testing.T) *fixture { t.Helper() now := time.Date(2026, 4, 25, 10, 0, 0, 0, time.UTC) - games := gamestub.NewStore() - invites := invitestub.NewStore() + games := gameinmem.NewStore() + invites := inviteinmem.NewStore() gameRecord, err := game.New(game.NewGameInput{ GameID: "game-private", @@ -196,7 +196,7 @@ func TestRevokeGameNotFound(t *testing.T) { // game path is a defensive guard, but the surfaced error must be // subject_not_found rather than forbidden. 
svc, err := revokeinvite.NewService(revokeinvite.Dependencies{ - Games: gamestub.NewStore(), + Games: gameinmem.NewStore(), Invites: f.invites, Clock: fixedClock(f.now), Logger: silentLogger(), diff --git a/lobby/internal/service/shared/closeenrollment_test.go b/lobby/internal/service/shared/closeenrollment_test.go index 08b22ff..af3016e 100644 --- a/lobby/internal/service/shared/closeenrollment_test.go +++ b/lobby/internal/service/shared/closeenrollment_test.go @@ -5,12 +5,13 @@ import ( "errors" "io" "log/slog" + "sync" "testing" "time" - "galaxy/lobby/internal/adapters/gamestub" - "galaxy/lobby/internal/adapters/intentpubstub" - "galaxy/lobby/internal/adapters/invitestub" + "galaxy/lobby/internal/adapters/gameinmem" + "galaxy/lobby/internal/adapters/inviteinmem" + "galaxy/lobby/internal/adapters/mocks" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" "galaxy/lobby/internal/domain/invite" @@ -20,6 +21,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" ) const ( @@ -30,20 +32,57 @@ const ( func silentLogger() *slog.Logger { return slog.New(slog.NewTextHandler(io.Discard, nil)) } +type intentRec struct { + mu sync.Mutex + published []notificationintent.Intent + err error +} + +func (r *intentRec) record(_ context.Context, intent notificationintent.Intent) (string, error) { + r.mu.Lock() + defer r.mu.Unlock() + if r.err != nil { + return "", r.err + } + r.published = append(r.published, intent) + return "1", nil +} + +func (r *intentRec) snapshot() []notificationintent.Intent { + r.mu.Lock() + defer r.mu.Unlock() + return append([]notificationintent.Intent(nil), r.published...) +} + +func (r *intentRec) setErr(err error) { + r.mu.Lock() + defer r.mu.Unlock() + r.err = err +} + +func newIntentMock(t *testing.T, rec *intentRec) *mocks.MockIntentPublisher { + t.Helper() + m := mocks.NewMockIntentPublisher(gomock.NewController(t)) + m.EXPECT().Publish(gomock.Any(), gomock.Any()).DoAndReturn(rec.record).AnyTimes() + return m +} + type closeFixture struct { - now time.Time - games *gamestub.Store - invites *invitestub.Store - intents *intentpubstub.Publisher - game game.Game + now time.Time + games *gameinmem.Store + invites *inviteinmem.Store + intentRec *intentRec + intents *mocks.MockIntentPublisher + game game.Game } func newCloseFixture(t *testing.T) *closeFixture { t.Helper() now := time.Date(2026, 4, 25, 10, 0, 0, 0, time.UTC) - games := gamestub.NewStore() - invites := invitestub.NewStore() - intents := intentpubstub.NewPublisher() + games := gameinmem.NewStore() + invites := inviteinmem.NewStore() + rec := &intentRec{} + intents := newIntentMock(t, rec) gameRecord, err := game.New(game.NewGameInput{ GameID: closeGameID, @@ -64,11 +103,12 @@ func newCloseFixture(t *testing.T) *closeFixture { require.NoError(t, games.Save(context.Background(), gameRecord)) return &closeFixture{ - now: now, - games: games, - invites: invites, - intents: intents, - game: gameRecord, + now: now, + games: games, + invites: invites, + intentRec: rec, + intents: intents, + game: gameRecord, } } @@ -120,7 +160,7 @@ func TestCloseEnrollmentTransitionsGameAndExpiresInvites(t *testing.T) { require.NoError(t, err) assert.Equal(t, invite.StatusExpired, second.Status) - intents := f.intents.Published() + intents := f.intentRec.snapshot() require.Len(t, intents, 2) for _, intent := range intents { assert.Equal(t, notificationintent.NotificationTypeLobbyInviteExpired, intent.NotificationType) @@ -158,7 +198,7 @@ func 
TestCloseEnrollmentLeavesNonCreatedInvitesUntouched(t *testing.T) { require.NoError(t, err) assert.Equal(t, invite.StatusDeclined, declinedAfter.Status) - intents := f.intents.Published() + intents := f.intentRec.snapshot() require.Len(t, intents, 1) } @@ -184,14 +224,14 @@ func TestCloseEnrollmentSurfacesGameConflict(t *testing.T) { stillCreated, err := f.invites.Get(context.Background(), "invite-1") require.NoError(t, err) assert.Equal(t, invite.StatusCreated, stillCreated.Status) - assert.Empty(t, f.intents.Published()) + assert.Empty(t, f.intentRec.snapshot()) } func TestCloseEnrollmentSwallowsIntentPublishFailure(t *testing.T) { t.Parallel() f := newCloseFixture(t) f.addCreatedInvite(t, "invite-1", "user-a") - f.intents.SetError(errors.New("publisher offline")) + f.intentRec.setErr(errors.New("publisher offline")) updated, err := shared.CloseEnrollment( context.Background(), @@ -221,7 +261,7 @@ func TestCloseEnrollmentIsIdempotentOnSecondCall(t *testing.T) { f.now.Add(time.Minute), ) require.NoError(t, err) - assert.Len(t, f.intents.Published(), 1) + assert.Len(t, f.intentRec.snapshot(), 1) _, err = shared.CloseEnrollment( context.Background(), @@ -231,7 +271,7 @@ func TestCloseEnrollmentIsIdempotentOnSecondCall(t *testing.T) { f.now.Add(2*time.Minute), ) require.ErrorIs(t, err, game.ErrConflict) - assert.Len(t, f.intents.Published(), 1) + assert.Len(t, f.intentRec.snapshot(), 1) } func TestCloseEnrollmentRejectsUnknownTrigger(t *testing.T) { diff --git a/lobby/internal/service/startgame/service.go b/lobby/internal/service/startgame/service.go index a35c1ee..a38cddf 100644 --- a/lobby/internal/service/startgame/service.go +++ b/lobby/internal/service/startgame/service.go @@ -14,6 +14,7 @@ import ( "time" "galaxy/lobby/internal/domain/common" + "galaxy/lobby/internal/domain/engineimage" "galaxy/lobby/internal/domain/game" "galaxy/lobby/internal/logging" "galaxy/lobby/internal/ports" @@ -23,11 +24,12 @@ import ( // Service executes the start-game use case. type Service struct { - games ports.GameStore - runtimeManager ports.RuntimeManager - clock func() time.Time - logger *slog.Logger - telemetry *telemetry.Runtime + games ports.GameStore + runtimeManager ports.RuntimeManager + imageResolver *engineimage.Resolver + clock func() time.Time + logger *slog.Logger + telemetry *telemetry.Runtime } // Dependencies groups the collaborators used by Service. @@ -38,6 +40,11 @@ type Dependencies struct { // RuntimeManager publishes the start job after the CAS succeeds. RuntimeManager ports.RuntimeManager + // ImageResolver substitutes a game's TargetEngineVersion into the + // configured engine-image template to produce the `image_ref` + // published on `runtime:start_jobs`. + ImageResolver *engineimage.Resolver + // Clock supplies the wall-clock used for UpdatedAt. Defaults to // time.Now when nil. 
Clock func() time.Time @@ -58,6 +65,8 @@ func NewService(deps Dependencies) (*Service, error) { return nil, errors.New("new start game service: nil game store") case deps.RuntimeManager == nil: return nil, errors.New("new start game service: nil runtime manager") + case deps.ImageResolver == nil: + return nil, errors.New("new start game service: nil image resolver") } clock := deps.Clock @@ -72,6 +81,7 @@ func NewService(deps Dependencies) (*Service, error) { return &Service{ games: deps.Games, runtimeManager: deps.RuntimeManager, + imageResolver: deps.ImageResolver, clock: clock, logger: logger.With("service", "lobby.startgame"), telemetry: deps.Telemetry, @@ -127,6 +137,11 @@ func (service *Service) Handle(ctx context.Context, input Input) (game.Game, err ) } + imageRef, err := service.imageResolver.Resolve(record.TargetEngineVersion) + if err != nil { + return game.Game{}, fmt.Errorf("start game: resolve image ref: %w", err) + } + at := service.clock().UTC() if err := service.games.UpdateStatus(ctx, ports.UpdateStatusInput{ GameID: input.GameID, @@ -144,7 +159,7 @@ func (service *Service) Handle(ctx context.Context, input Input) (game.Game, err string(game.TriggerCommand), ) - if err := service.runtimeManager.PublishStartJob(ctx, input.GameID.String()); err != nil { + if err := service.runtimeManager.PublishStartJob(ctx, input.GameID.String(), imageRef); err != nil { // Status is already `starting` and the domain forbids a direct // rollback to `ready_to_start`. We surface the publish error to // the caller; the game stays in `starting` until either a diff --git a/lobby/internal/service/startgame/service_test.go b/lobby/internal/service/startgame/service_test.go index 3f14b76..aa71c5b 100644 --- a/lobby/internal/service/startgame/service_test.go +++ b/lobby/internal/service/startgame/service_test.go @@ -5,12 +5,14 @@ import ( "errors" "io" "log/slog" + "sync" "testing" "time" - "galaxy/lobby/internal/adapters/gamestub" - "galaxy/lobby/internal/adapters/runtimemanagerstub" + "galaxy/lobby/internal/adapters/gameinmem" + "galaxy/lobby/internal/adapters/mocks" "galaxy/lobby/internal/domain/common" + "galaxy/lobby/internal/domain/engineimage" "galaxy/lobby/internal/domain/game" "galaxy/lobby/internal/ports" "galaxy/lobby/internal/service/shared" @@ -18,8 +20,11 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" ) +const testImageTemplate = "registry.example.com/galaxy/game:{engine_version}" + func silentLogger() *slog.Logger { return slog.New(slog.NewTextHandler(io.Discard, nil)) } @@ -50,36 +55,113 @@ func newReadyGame(t *testing.T, gameType game.GameType, ownerID string) (game.Ga return record, now } +// runtimeRec captures every PublishStartJob/PublishStopJob call so tests +// can assert which jobs ran. Per-test error injection sets startErr. 
+type runtimeRec struct { + mu sync.Mutex + startIDs []string + startRefs []string + stopIDs []string + stopReas []ports.StopReason + startErr error +} + +func (r *runtimeRec) recordStart(_ context.Context, gameID, imageRef string) error { + r.mu.Lock() + defer r.mu.Unlock() + if r.startErr != nil { + return r.startErr + } + r.startIDs = append(r.startIDs, gameID) + r.startRefs = append(r.startRefs, imageRef) + return nil +} + +func (r *runtimeRec) recordStop(_ context.Context, gameID string, reason ports.StopReason) error { + r.mu.Lock() + defer r.mu.Unlock() + r.stopIDs = append(r.stopIDs, gameID) + r.stopReas = append(r.stopReas, reason) + return nil +} + +func (r *runtimeRec) startJobs() []string { + r.mu.Lock() + defer r.mu.Unlock() + return append([]string(nil), r.startIDs...) +} + +func (r *runtimeRec) startImageRefs() []string { + r.mu.Lock() + defer r.mu.Unlock() + return append([]string(nil), r.startRefs...) +} + +func (r *runtimeRec) stopJobs() []string { + r.mu.Lock() + defer r.mu.Unlock() + return append([]string(nil), r.stopIDs...) +} + +func newRuntimeMock(t *testing.T, rec *runtimeRec) *mocks.MockRuntimeManager { + t.Helper() + m := mocks.NewMockRuntimeManager(gomock.NewController(t)) + m.EXPECT().PublishStartJob(gomock.Any(), gomock.Any(), gomock.Any()). + DoAndReturn(rec.recordStart).AnyTimes() + m.EXPECT().PublishStopJob(gomock.Any(), gomock.Any(), gomock.Any()). + DoAndReturn(rec.recordStop).AnyTimes() + return m +} + type fixture struct { - games *gamestub.Store - runtime *runtimemanagerstub.Publisher + games *gameinmem.Store + rec *runtimeRec + runtime *mocks.MockRuntimeManager service *startgame.Service now time.Time } func newFixture(t *testing.T, record game.Game, now time.Time) *fixture { t.Helper() - games := gamestub.NewStore() + games := gameinmem.NewStore() require.NoError(t, games.Save(context.Background(), record)) - runtime := runtimemanagerstub.NewPublisher() + rec := &runtimeRec{} + runtime := newRuntimeMock(t, rec) + resolver, err := engineimage.NewResolver(testImageTemplate) + require.NoError(t, err) service, err := startgame.NewService(startgame.Dependencies{ Games: games, RuntimeManager: runtime, + ImageResolver: resolver, Clock: fixedClock(now.Add(time.Hour)), Logger: silentLogger(), }) require.NoError(t, err) - return &fixture{games: games, runtime: runtime, service: service, now: now} + return &fixture{games: games, rec: rec, runtime: runtime, service: service, now: now} } func TestNewServiceRejectsMissingDeps(t *testing.T) { - _, err := startgame.NewService(startgame.Dependencies{ - RuntimeManager: runtimemanagerstub.NewPublisher(), + resolver, err := engineimage.NewResolver(testImageTemplate) + require.NoError(t, err) + + rec := &runtimeRec{} + runtime := newRuntimeMock(t, rec) + + _, err = startgame.NewService(startgame.Dependencies{ + RuntimeManager: runtime, + ImageResolver: resolver, }) require.Error(t, err) _, err = startgame.NewService(startgame.Dependencies{ - Games: gamestub.NewStore(), + Games: gameinmem.NewStore(), + ImageResolver: resolver, + }) + require.Error(t, err) + + _, err = startgame.NewService(startgame.Dependencies{ + Games: gameinmem.NewStore(), + RuntimeManager: runtime, }) require.Error(t, err) } @@ -94,8 +176,13 @@ func TestStartGamePublicAdminHappyPath(t *testing.T) { }) require.NoError(t, err) assert.Equal(t, game.StatusStarting, updated.Status) - assert.Equal(t, []string{record.GameID.String()}, f.runtime.StartJobs()) - assert.Empty(t, f.runtime.StopJobs()) + assert.Equal(t, []string{record.GameID.String()}, 
f.rec.startJobs()) + assert.Equal(t, + []string{"registry.example.com/galaxy/game:" + record.TargetEngineVersion}, + f.rec.startImageRefs(), + "resolved image_ref must propagate to publisher", + ) + assert.Empty(t, f.rec.stopJobs()) } func TestStartGamePrivateOwnerHappyPath(t *testing.T) { @@ -108,7 +195,7 @@ func TestStartGamePrivateOwnerHappyPath(t *testing.T) { }) require.NoError(t, err) assert.Equal(t, game.StatusStarting, updated.Status) - assert.Equal(t, []string{record.GameID.String()}, f.runtime.StartJobs()) + assert.Equal(t, []string{record.GameID.String()}, f.rec.startJobs()) } func TestStartGameRejectsNonOwnerUser(t *testing.T) { @@ -120,7 +207,7 @@ func TestStartGameRejectsNonOwnerUser(t *testing.T) { GameID: record.GameID, }) require.ErrorIs(t, err, shared.ErrForbidden) - assert.Empty(t, f.runtime.StartJobs(), "no start job published on forbidden") + assert.Empty(t, f.rec.startJobs(), "no start job published on forbidden") stored, err := f.games.Get(context.Background(), record.GameID) require.NoError(t, err) @@ -148,7 +235,7 @@ func TestStartGameRejectsWrongStatus(t *testing.T) { GameID: record.GameID, }) require.ErrorIs(t, err, game.ErrConflict) - assert.Empty(t, f.runtime.StartJobs()) + assert.Empty(t, f.rec.startJobs()) } func TestStartGameRejectsCASLossOnRecentTransition(t *testing.T) { @@ -169,13 +256,13 @@ func TestStartGameRejectsCASLossOnRecentTransition(t *testing.T) { GameID: record.GameID, }) require.ErrorIs(t, err, game.ErrConflict) - assert.Empty(t, f.runtime.StartJobs()) + assert.Empty(t, f.rec.startJobs()) } func TestStartGamePublishFailureSurfacesUnavailable(t *testing.T) { record, now := newReadyGame(t, game.GameTypePublic, "") f := newFixture(t, record, now) - f.runtime.SetStartError(errors.New("redis down")) + f.rec.startErr = errors.New("redis down") _, err := f.service.Handle(context.Background(), startgame.Input{ Actor: shared.NewAdminActor(), @@ -191,11 +278,15 @@ func TestStartGamePublishFailureSurfacesUnavailable(t *testing.T) { } func TestStartGameRejectsMissingRecord(t *testing.T) { - games := gamestub.NewStore() - runtime := runtimemanagerstub.NewPublisher() + games := gameinmem.NewStore() + rec := &runtimeRec{} + runtime := newRuntimeMock(t, rec) + resolver, err := engineimage.NewResolver(testImageTemplate) + require.NoError(t, err) service, err := startgame.NewService(startgame.Dependencies{ Games: games, RuntimeManager: runtime, + ImageResolver: resolver, Clock: fixedClock(time.Now().UTC()), Logger: silentLogger(), }) diff --git a/lobby/internal/service/submitapplication/service_test.go b/lobby/internal/service/submitapplication/service_test.go index 30cfb4a..8f57fba 100644 --- a/lobby/internal/service/submitapplication/service_test.go +++ b/lobby/internal/service/submitapplication/service_test.go @@ -5,15 +5,15 @@ import ( "errors" "io" "log/slog" + "sync" "testing" "time" - "galaxy/lobby/internal/adapters/applicationstub" - "galaxy/lobby/internal/adapters/gamestub" - "galaxy/lobby/internal/adapters/intentpubstub" - "galaxy/lobby/internal/adapters/membershipstub" - "galaxy/lobby/internal/adapters/racenamestub" - "galaxy/lobby/internal/adapters/userservicestub" + "galaxy/lobby/internal/adapters/applicationinmem" + "galaxy/lobby/internal/adapters/gameinmem" + "galaxy/lobby/internal/adapters/membershipinmem" + "galaxy/lobby/internal/adapters/mocks" + "galaxy/lobby/internal/adapters/racenameinmem" "galaxy/lobby/internal/domain/application" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" @@ -25,8 +25,87 @@ import ( 
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" ) +type intentRec struct { + mu sync.Mutex + published []notificationintent.Intent + err error +} + +func (r *intentRec) record(_ context.Context, intent notificationintent.Intent) (string, error) { + r.mu.Lock() + defer r.mu.Unlock() + if r.err != nil { + return "", r.err + } + r.published = append(r.published, intent) + return "1", nil +} + +func (r *intentRec) snapshot() []notificationintent.Intent { + r.mu.Lock() + defer r.mu.Unlock() + return append([]notificationintent.Intent(nil), r.published...) +} + +func (r *intentRec) setErr(err error) { + r.mu.Lock() + defer r.mu.Unlock() + r.err = err +} + +type userRec struct { + mu sync.Mutex + elig map[string]ports.Eligibility + failures map[string]error +} + +func (r *userRec) record(_ context.Context, userID string) (ports.Eligibility, error) { + r.mu.Lock() + defer r.mu.Unlock() + if err, ok := r.failures[userID]; ok { + return ports.Eligibility{}, err + } + if e, ok := r.elig[userID]; ok { + return e, nil + } + return ports.Eligibility{Exists: false}, nil +} + +func (r *userRec) setEligibility(userID string, e ports.Eligibility) { + r.mu.Lock() + defer r.mu.Unlock() + if r.elig == nil { + r.elig = make(map[string]ports.Eligibility) + } + r.elig[userID] = e +} + +func (r *userRec) setFailure(userID string, err error) { + r.mu.Lock() + defer r.mu.Unlock() + if r.failures == nil { + r.failures = make(map[string]error) + } + r.failures[userID] = err +} + +func newIntentMock(t *testing.T, rec *intentRec) *mocks.MockIntentPublisher { + t.Helper() + m := mocks.NewMockIntentPublisher(gomock.NewController(t)) + m.EXPECT().Publish(gomock.Any(), gomock.Any()).DoAndReturn(rec.record).AnyTimes() + return m +} + +func newUserMock(t *testing.T, rec *userRec) *mocks.MockUserService { + t.Helper() + m := mocks.NewMockUserService(gomock.NewController(t)) + m.EXPECT().GetEligibility(gomock.Any(), gomock.Any()).DoAndReturn(rec.record).AnyTimes() + return m +} + const ( defaultRaceName = "SolarPilot" otherRaceName = "VoidRunner" @@ -58,12 +137,14 @@ func (f fixedIDs) NewMembershipID() (common.MembershipID, error) { type fixture struct { now time.Time - games *gamestub.Store - memberships *membershipstub.Store - applications *applicationstub.Store - directory *racenamestub.Directory - users *userservicestub.Service - intents *intentpubstub.Publisher + games *gameinmem.Store + memberships *membershipinmem.Store + applications *applicationinmem.Store + directory *racenameinmem.Directory + users *userRec + usersMock *mocks.MockUserService + intents *intentRec + intentsMock *mocks.MockIntentPublisher ids fixedIDs openPublicGameID common.GameID defaultUserID string @@ -72,13 +153,13 @@ type fixture struct { func newFixture(t *testing.T) *fixture { t.Helper() now := time.Date(2026, 4, 25, 10, 0, 0, 0, time.UTC) - dir, err := racenamestub.NewDirectory(racenamestub.WithClock(fixedClock(now))) + dir, err := racenameinmem.NewDirectory(racenameinmem.WithClock(fixedClock(now))) require.NoError(t, err) - users := userservicestub.NewService() - users.SetEligibility("user-1", ports.Eligibility{Exists: true, CanLogin: true, CanJoinGame: true}) - games := gamestub.NewStore() - memberships := membershipstub.NewStore() - applications := applicationstub.NewStore() + users := &userRec{} + users.setEligibility("user-1", ports.Eligibility{Exists: true, CanLogin: true, CanJoinGame: true}) + games := gameinmem.NewStore() + memberships := membershipinmem.NewStore() + applications := 
applicationinmem.NewStore() gameRecord, err := game.New(game.NewGameInput{ GameID: "game-public", @@ -97,6 +178,7 @@ func newFixture(t *testing.T) *fixture { gameRecord.Status = game.StatusEnrollmentOpen require.NoError(t, games.Save(context.Background(), gameRecord)) + intents := &intentRec{} return &fixture{ now: now, games: games, @@ -104,7 +186,9 @@ func newFixture(t *testing.T) *fixture { applications: applications, directory: dir, users: users, - intents: intentpubstub.NewPublisher(), + usersMock: newUserMock(t, users), + intents: intents, + intentsMock: newIntentMock(t, intents), ids: fixedIDs{applicationID: "application-fixed", membershipID: "membership-fixed"}, openPublicGameID: gameRecord.GameID, defaultUserID: "user-1", @@ -117,9 +201,9 @@ func newService(t *testing.T, f *fixture) *submitapplication.Service { Games: f.games, Memberships: f.memberships, Applications: f.applications, - Users: f.users, + Users: f.usersMock, Directory: f.directory, - Intents: f.intents, + Intents: f.intentsMock, IDs: f.ids, Clock: fixedClock(f.now), Logger: silentLogger(), @@ -147,7 +231,7 @@ func TestHandleHappyPath(t *testing.T) { assert.Equal(t, common.ApplicationID("application-fixed"), got.ApplicationID) assert.Equal(t, defaultRaceName, got.RaceName) - intents := f.intents.Published() + intents := f.intents.snapshot() require.Len(t, intents, 1) assert.Equal(t, notificationintent.NotificationTypeLobbyApplicationSubmitted, intents[0].NotificationType) assert.Equal(t, notificationintent.AudienceKindAdminEmail, intents[0].AudienceKind) @@ -236,7 +320,7 @@ func TestHandleUserMissingEligibilityDenied(t *testing.T) { func TestHandleCanJoinGameFalseEligibilityDenied(t *testing.T) { t.Parallel() f := newFixture(t) - f.users.SetEligibility("user-blocked", ports.Eligibility{Exists: true, CanLogin: true, CanJoinGame: false}) + f.users.setEligibility("user-blocked", ports.Eligibility{Exists: true, CanLogin: true, CanJoinGame: false}) svc := newService(t, f) input := defaultInput(f) input.Actor = shared.NewUserActor("user-blocked") @@ -248,7 +332,7 @@ func TestHandleCanJoinGameFalseEligibilityDenied(t *testing.T) { func TestHandleUserServiceUnavailable(t *testing.T) { t.Parallel() f := newFixture(t) - f.users.SetFailure(f.defaultUserID, ports.ErrUserServiceUnavailable) + f.users.setFailure(f.defaultUserID, ports.ErrUserServiceUnavailable) svc := newService(t, f) _, err := svc.Handle(context.Background(), defaultInput(f)) @@ -322,7 +406,7 @@ func TestHandleDuplicateActiveApplicationConflict(t *testing.T) { func TestHandlePublishFailureDoesNotRollback(t *testing.T) { t.Parallel() f := newFixture(t) - f.intents.SetError(errors.New("publish failed")) + f.intents.setErr(errors.New("publish failed")) svc := newService(t, f) got, err := svc.Handle(context.Background(), defaultInput(f)) diff --git a/lobby/internal/service/updategame/service_test.go b/lobby/internal/service/updategame/service_test.go index 14199bd..b56737e 100644 --- a/lobby/internal/service/updategame/service_test.go +++ b/lobby/internal/service/updategame/service_test.go @@ -7,7 +7,7 @@ import ( "testing" "time" - "galaxy/lobby/internal/adapters/gamestub" + "galaxy/lobby/internal/adapters/gameinmem" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" "galaxy/lobby/internal/ports" @@ -29,7 +29,7 @@ func fixedClock(at time.Time) func() time.Time { // returns the persisted record. 
func seedDraftGame( t *testing.T, - store *gamestub.Store, + store *gameinmem.Store, id common.GameID, gameType game.GameType, ownerUserID string, @@ -73,7 +73,7 @@ func TestHandleAdminFullEditInDraft(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() seedDraftGame(t, store, "game-a", game.GameTypePublic, "", now) later := now.Add(30 * time.Minute) @@ -107,7 +107,7 @@ func TestHandleOwnerEditInDraft(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() seedDraftGame(t, store, "game-private", game.GameTypePrivate, "user-1", now) service := newService(t, store, fixedClock(now.Add(time.Hour))) @@ -125,7 +125,7 @@ func TestHandleNonOwnerForbidden(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() seedDraftGame(t, store, "game-private", game.GameTypePrivate, "user-1", now) service := newService(t, store, fixedClock(now.Add(time.Hour))) @@ -142,7 +142,7 @@ func TestHandleUserCannotEditPublicGame(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() seedDraftGame(t, store, "game-public", game.GameTypePublic, "", now) service := newService(t, store, fixedClock(now.Add(time.Hour))) @@ -159,7 +159,7 @@ func TestHandleEnrollmentOpenDescriptionOnly(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() record := seedDraftGame(t, store, "game-open", game.GameTypePublic, "", now) // Force status to enrollment_open via UpdateStatus. 
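The `*stub` to `*inmem` adapter rename running through these tests is mechanical: every test still constructs its own isolated store, which is what keeps the blanket `t.Parallel()` usage safe. A minimal sketch of the shape such an in-memory adapter is assumed to have (map guarded by a mutex; the real `gameinmem.Store` may differ, and `errNotFound` below is only a stand-in for whatever sentinel the `ports` package actually defines):

```go
package gameinmem

import (
	"context"
	"errors"
	"sync"

	"galaxy/lobby/internal/domain/common"
	"galaxy/lobby/internal/domain/game"
)

// errNotFound stands in for the real not-found sentinel; the sketch does
// not assume its actual name.
var errNotFound = errors.New("gameinmem: game not found")

// Store is the assumed map-plus-mutex shape. Each test owns one instance,
// so parallel tests never share state.
type Store struct {
	mu    sync.RWMutex
	games map[common.GameID]game.Game
}

func NewStore() *Store {
	return &Store{games: make(map[common.GameID]game.Game)}
}

// Save stores or overwrites the record keyed by its GameID.
func (s *Store) Save(_ context.Context, rec game.Game) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.games[rec.GameID] = rec
	return nil
}

// Get returns the stored record or the not-found sentinel.
func (s *Store) Get(_ context.Context, id common.GameID) (game.Game, error) {
	s.mu.RLock()
	defer s.mu.RUnlock()
	rec, ok := s.games[id]
	if !ok {
		return game.Game{}, errNotFound
	}
	return rec, nil
}
```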
@@ -187,7 +187,7 @@ func TestHandleEnrollmentOpenNonDescriptionRejected(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() record := seedDraftGame(t, store, "game-open", game.GameTypePublic, "", now) require.NoError(t, store.UpdateStatus(context.Background(), ports.UpdateStatusInput{ @@ -212,7 +212,7 @@ func TestHandleTerminalStatusRejected(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() record := seedDraftGame(t, store, "game-cancel", game.GameTypePublic, "", now) require.NoError(t, store.UpdateStatus(context.Background(), ports.UpdateStatusInput{ @@ -236,7 +236,7 @@ func TestHandleTerminalStatusRejected(t *testing.T) { func TestHandleNotFound(t *testing.T) { t.Parallel() - store := gamestub.NewStore() + store := gameinmem.NewStore() service := newService(t, store, fixedClock(time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC))) _, err := service.Handle(context.Background(), updategame.Input{ @@ -251,7 +251,7 @@ func TestHandleValidationFailurePropagates(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() seedDraftGame(t, store, "game-a", game.GameTypePublic, "", now) service := newService(t, store, fixedClock(now.Add(time.Hour))) @@ -270,7 +270,7 @@ func TestHandleInvalidActorReturnsError(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() service := newService(t, store, fixedClock(now)) _, err := service.Handle(context.Background(), updategame.Input{ @@ -286,7 +286,7 @@ func TestHandleInvalidGameID(t *testing.T) { t.Parallel() now := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) - store := gamestub.NewStore() + store := gameinmem.NewStore() service := newService(t, store, fixedClock(now)) _, err := service.Handle(context.Background(), updategame.Input{ diff --git a/lobby/internal/worker/enrollmentautomation/worker_test.go b/lobby/internal/worker/enrollmentautomation/worker_test.go index ff1cd74..19749b3 100644 --- a/lobby/internal/worker/enrollmentautomation/worker_test.go +++ b/lobby/internal/worker/enrollmentautomation/worker_test.go @@ -4,14 +4,15 @@ import ( "context" "io" "log/slog" + "sync" "testing" "time" - "galaxy/lobby/internal/adapters/gamestub" - "galaxy/lobby/internal/adapters/gapactivationstub" - "galaxy/lobby/internal/adapters/intentpubstub" - "galaxy/lobby/internal/adapters/invitestub" - "galaxy/lobby/internal/adapters/membershipstub" + "galaxy/lobby/internal/adapters/gameinmem" + "galaxy/lobby/internal/adapters/gapactivationinmem" + "galaxy/lobby/internal/adapters/inviteinmem" + "galaxy/lobby/internal/adapters/membershipinmem" + "galaxy/lobby/internal/adapters/mocks" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" "galaxy/lobby/internal/domain/invite" @@ -21,8 +22,34 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" ) +type intentRec struct { + mu sync.Mutex + published []notificationintent.Intent +} + +func (r *intentRec) record(_ context.Context, intent notificationintent.Intent) (string, error) { + r.mu.Lock() + defer r.mu.Unlock() + r.published = append(r.published, intent) + return "1", nil +} + +func (r *intentRec) snapshot() []notificationintent.Intent { + r.mu.Lock() + defer r.mu.Unlock() + return 
append([]notificationintent.Intent(nil), r.published...) +} + +func newIntentMock(t *testing.T, rec *intentRec) *mocks.MockIntentPublisher { + t.Helper() + m := mocks.NewMockIntentPublisher(gomock.NewController(t)) + m.EXPECT().Publish(gomock.Any(), gomock.Any()).DoAndReturn(rec.record).AnyTimes() + return m +} + const ( gameID = common.GameID("game-private") ownerUserID = "user-owner" @@ -34,11 +61,12 @@ func fixedClock(at time.Time) func() time.Time { return func() time.Time { retur type fixture struct { now time.Time - games *gamestub.Store - invites *invitestub.Store - memberships *membershipstub.Store - gapStore *gapactivationstub.Store - intents *intentpubstub.Publisher + games *gameinmem.Store + invites *inviteinmem.Store + memberships *membershipinmem.Store + gapStore *gapactivationinmem.Store + intentRec *intentRec + intents *mocks.MockIntentPublisher game game.Game } @@ -86,16 +114,18 @@ func newFixture(t *testing.T, opts fixtureOptions) *fixture { require.NoError(t, err) rec.Status = game.StatusEnrollmentOpen - games := gamestub.NewStore() + games := gameinmem.NewStore() require.NoError(t, games.Save(context.Background(), rec)) + intentRecord := &intentRec{} return &fixture{ now: now, games: games, - invites: invitestub.NewStore(), - memberships: membershipstub.NewStore(), - gapStore: gapactivationstub.NewStore(), - intents: intentpubstub.NewPublisher(), + invites: inviteinmem.NewStore(), + memberships: membershipinmem.NewStore(), + gapStore: gapactivationinmem.NewStore(), + intentRec: intentRecord, + intents: newIntentMock(t, intentRecord), game: rec, } } @@ -159,11 +189,11 @@ func currentStatus(t *testing.T, f *fixture) game.Status { func TestNewWorkerRejectsZeroInterval(t *testing.T) { t.Parallel() _, err := enrollmentautomation.NewWorker(enrollmentautomation.Dependencies{ - Games: gamestub.NewStore(), - Memberships: membershipstub.NewStore(), - Invites: invitestub.NewStore(), - Intents: intentpubstub.NewPublisher(), - GapStore: gapactivationstub.NewStore(), + Games: gameinmem.NewStore(), + Memberships: membershipinmem.NewStore(), + Invites: inviteinmem.NewStore(), + Intents: newIntentMock(t, &intentRec{}), + GapStore: gapactivationinmem.NewStore(), Interval: 0, }) require.Error(t, err) @@ -185,7 +215,7 @@ func TestTickDeadlineTriggers(t *testing.T) { require.NoError(t, err) assert.Equal(t, invite.StatusExpired, expired.Status) - intents := f.intents.Published() + intents := f.intentRec.snapshot() require.Len(t, intents, 1) assert.Equal(t, notificationintent.NotificationTypeLobbyInviteExpired, intents[0].NotificationType) } @@ -200,7 +230,7 @@ func TestTickDeadlineSkipsBelowMinPlayers(t *testing.T) { f.newWorker(t, tickAt).Tick(context.Background()) assert.Equal(t, game.StatusEnrollmentOpen, currentStatus(t, f)) - assert.Empty(t, f.intents.Published()) + assert.Empty(t, f.intentRec.snapshot()) } func TestTickGapTimeTriggers(t *testing.T) { @@ -260,7 +290,7 @@ func TestTickIsIdempotent(t *testing.T) { worker.Tick(context.Background()) assert.Equal(t, game.StatusReadyToStart, currentStatus(t, f)) - assert.Len(t, f.intents.Published(), 1) + assert.Len(t, f.intentRec.snapshot(), 1) } func TestRunStopsOnContextCancel(t *testing.T) { diff --git a/lobby/internal/worker/gmevents/consumer_test.go b/lobby/internal/worker/gmevents/consumer_test.go index 36defaf..6153d6b 100644 --- a/lobby/internal/worker/gmevents/consumer_test.go +++ b/lobby/internal/worker/gmevents/consumer_test.go @@ -12,9 +12,9 @@ import ( "testing" "time" - "galaxy/lobby/internal/adapters/gamestub" - 
"galaxy/lobby/internal/adapters/gameturnstatsstub" - "galaxy/lobby/internal/adapters/streamoffsetstub" + "galaxy/lobby/internal/adapters/gameinmem" + "galaxy/lobby/internal/adapters/gameturnstatsinmem" + "galaxy/lobby/internal/adapters/streamoffsetinmem" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" "galaxy/lobby/internal/ports" @@ -60,10 +60,10 @@ func (e *fakeEvaluator) SetError(err error) { } type harness struct { - games *gamestub.Store - stats *gameturnstatsstub.Store + games *gameinmem.Store + stats *gameturnstatsinmem.Store evaluator *fakeEvaluator - offsets *streamoffsetstub.Store + offsets *streamoffsetinmem.Store consumer *gmevents.Consumer server *miniredis.Miniredis clientRedis *redis.Client @@ -78,10 +78,10 @@ func newHarness(t *testing.T) *harness { clientRedis := redis.NewClient(&redis.Options{Addr: server.Addr()}) t.Cleanup(func() { _ = clientRedis.Close() }) - games := gamestub.NewStore() - stats := gameturnstatsstub.NewStore() + games := gameinmem.NewStore() + stats := gameturnstatsinmem.NewStore() evaluator := &fakeEvaluator{} - offsets := streamoffsetstub.NewStore() + offsets := streamoffsetinmem.NewStore() at := time.Date(2026, 4, 25, 14, 0, 0, 0, time.UTC) now := at.Add(-2 * time.Hour) @@ -207,8 +207,8 @@ func TestNewConsumerRejectsMissingDeps(t *testing.T) { Client: client, Stream: "gm:lobby_events", BlockTimeout: time.Second, - Games: gamestub.NewStore(), - Stats: gameturnstatsstub.NewStore(), + Games: gameinmem.NewStore(), + Stats: gameturnstatsinmem.NewStore(), }) require.Error(t, err, "missing capability evaluator") } diff --git a/lobby/internal/worker/pendingregistration/worker_test.go b/lobby/internal/worker/pendingregistration/worker_test.go index e3be1f0..8ee772c 100644 --- a/lobby/internal/worker/pendingregistration/worker_test.go +++ b/lobby/internal/worker/pendingregistration/worker_test.go @@ -8,7 +8,7 @@ import ( "testing" "time" - "galaxy/lobby/internal/adapters/racenamestub" + "galaxy/lobby/internal/adapters/racenameinmem" "galaxy/lobby/internal/ports" "galaxy/lobby/internal/worker/pendingregistration" @@ -32,9 +32,9 @@ type controlledClock struct{ instant time.Time } func (clock *controlledClock) now() time.Time { return clock.instant } func (clock *controlledClock) advance(d time.Duration) { clock.instant = clock.instant.Add(d) } -func newDirectory(t *testing.T, clock *controlledClock) *racenamestub.Directory { +func newDirectory(t *testing.T, clock *controlledClock) *racenameinmem.Directory { t.Helper() - directory, err := racenamestub.NewDirectory(racenamestub.WithClock(clock.now)) + directory, err := racenameinmem.NewDirectory(racenameinmem.WithClock(clock.now)) require.NoError(t, err) return directory } @@ -77,7 +77,7 @@ func TestNewWorkerRejectsNilDirectory(t *testing.T) { func TestNewWorkerRejectsNonPositiveInterval(t *testing.T) { t.Parallel() - directory, err := racenamestub.NewDirectory() + directory, err := racenameinmem.NewDirectory() require.NoError(t, err) _, err = pendingregistration.NewWorker(pendingregistration.Dependencies{ diff --git a/lobby/internal/worker/runtimejobresult/consumer.go b/lobby/internal/worker/runtimejobresult/consumer.go index 9352e3d..f1bff6e 100644 --- a/lobby/internal/worker/runtimejobresult/consumer.go +++ b/lobby/internal/worker/runtimejobresult/consumer.go @@ -401,7 +401,7 @@ func (consumer *Consumer) handleOrphan(ctx context.Context, entryID string, even "game_id", event.GameID.String(), "err", cause.Error(), ) - if err := consumer.runtimeManager.PublishStopJob(ctx, 
event.GameID.String()); err != nil { + if err := consumer.runtimeManager.PublishStopJob(ctx, event.GameID.String(), ports.StopReasonOrphanCleanup); err != nil { consumer.logger.WarnContext(ctx, "publish stop job for orphan container", "stream_entry_id", entryID, "game_id", event.GameID.String(), diff --git a/lobby/internal/worker/runtimejobresult/consumer_test.go b/lobby/internal/worker/runtimejobresult/consumer_test.go index 6035697..304fe64 100644 --- a/lobby/internal/worker/runtimejobresult/consumer_test.go +++ b/lobby/internal/worker/runtimejobresult/consumer_test.go @@ -5,14 +5,13 @@ import ( "errors" "io" "log/slog" + "sync" "testing" "time" - "galaxy/lobby/internal/adapters/gamestub" - "galaxy/lobby/internal/adapters/gmclientstub" - "galaxy/lobby/internal/adapters/intentpubstub" - "galaxy/lobby/internal/adapters/runtimemanagerstub" - "galaxy/lobby/internal/adapters/streamoffsetstub" + "galaxy/lobby/internal/adapters/gameinmem" + "galaxy/lobby/internal/adapters/mocks" + "galaxy/lobby/internal/adapters/streamoffsetinmem" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" "galaxy/lobby/internal/ports" @@ -23,18 +22,92 @@ import ( "github.com/redis/go-redis/v9" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" ) func silentLogger() *slog.Logger { return slog.New(slog.NewTextHandler(io.Discard, nil)) } +// recorder captures every call passed through the mocks. The harness +// installs a default EXPECT().AnyTimes() that funnels every call into +// the recorder so individual tests can assert on observed calls. +// Per-test error injection uses recorder.gmErr/intentsErr. +type recorder struct { + mu sync.Mutex + stopGameIDs []string + stopReasons []ports.StopReason + gmRequests []ports.RegisterGameRequest + publishedIntents []notificationintent.Intent + gmErr error + intentsErr error +} + +func (r *recorder) recordStop(_ context.Context, gameID string, reason ports.StopReason) error { + r.mu.Lock() + defer r.mu.Unlock() + r.stopGameIDs = append(r.stopGameIDs, gameID) + r.stopReasons = append(r.stopReasons, reason) + return nil +} + +func (r *recorder) recordGM(_ context.Context, request ports.RegisterGameRequest) error { + r.mu.Lock() + defer r.mu.Unlock() + if r.gmErr != nil { + return r.gmErr + } + r.gmRequests = append(r.gmRequests, request) + return nil +} + +func (r *recorder) recordIntent(_ context.Context, intent notificationintent.Intent) (string, error) { + r.mu.Lock() + defer r.mu.Unlock() + if r.intentsErr != nil { + return "", r.intentsErr + } + r.publishedIntents = append(r.publishedIntents, intent) + return "1", nil +} + +func (r *recorder) stopGameIDsSnapshot() []string { + r.mu.Lock() + defer r.mu.Unlock() + return append([]string(nil), r.stopGameIDs...) +} + +func (r *recorder) stopReasonsSnapshot() []ports.StopReason { + r.mu.Lock() + defer r.mu.Unlock() + return append([]ports.StopReason(nil), r.stopReasons...) +} + +func (r *recorder) gmRequestsSnapshot() []ports.RegisterGameRequest { + r.mu.Lock() + defer r.mu.Unlock() + return append([]ports.RegisterGameRequest(nil), r.gmRequests...) +} + +func (r *recorder) publishedSnapshot() []notificationintent.Intent { + r.mu.Lock() + defer r.mu.Unlock() + return append([]notificationintent.Intent(nil), r.publishedIntents...) 
+} + +func (r *recorder) setGMErr(err error) { + r.mu.Lock() + defer r.mu.Unlock() + r.gmErr = err +} + type harness struct { - games *gamestub.Store - runtime *runtimemanagerstub.Publisher - gm *gmclientstub.Client - intents *intentpubstub.Publisher - offsets *streamoffsetstub.Store + games *gameinmem.Store + runtime *mocks.MockRuntimeManager + gm *mocks.MockGMClient + intents *mocks.MockIntentPublisher + rec *recorder + offsets *streamoffsetinmem.Store consumer *runtimejobresult.Consumer server *miniredis.Miniredis clientRedis *redis.Client @@ -49,11 +122,26 @@ func newHarness(t *testing.T) *harness { clientRedis := redis.NewClient(&redis.Options{Addr: server.Addr()}) t.Cleanup(func() { _ = clientRedis.Close() }) - games := gamestub.NewStore() - runtime := runtimemanagerstub.NewPublisher() - gm := gmclientstub.NewClient() - intents := intentpubstub.NewPublisher() - offsets := streamoffsetstub.NewStore() + ctrl := gomock.NewController(t) + rec := &recorder{} + + games := gameinmem.NewStore() + runtime := mocks.NewMockRuntimeManager(ctrl) + runtime.EXPECT().PublishStartJob(gomock.Any(), gomock.Any(), gomock.Any()). + DoAndReturn(func(_ context.Context, _, _ string) error { return nil }).AnyTimes() + runtime.EXPECT().PublishStopJob(gomock.Any(), gomock.Any(), gomock.Any()). + DoAndReturn(rec.recordStop).AnyTimes() + + gm := mocks.NewMockGMClient(ctrl) + gm.EXPECT().RegisterGame(gomock.Any(), gomock.Any()). + DoAndReturn(rec.recordGM).AnyTimes() + gm.EXPECT().Ping(gomock.Any()).Return(nil).AnyTimes() + + intents := mocks.NewMockIntentPublisher(ctrl) + intents.EXPECT().Publish(gomock.Any(), gomock.Any()). + DoAndReturn(rec.recordIntent).AnyTimes() + + offsets := streamoffsetinmem.NewStore() at := time.Date(2026, 4, 25, 13, 0, 0, 0, time.UTC) h := &harness{ @@ -61,6 +149,7 @@ func newHarness(t *testing.T) *harness { runtime: runtime, gm: gm, intents: intents, + rec: rec, offsets: offsets, server: server, clientRedis: clientRedis, @@ -165,21 +254,22 @@ func TestHandleSuccessTransitionsToRunning(t *testing.T) { require.NotNil(t, got.StartedAt) assert.True(t, got.StartedAt.Equal(h.at)) - require.Len(t, h.gm.Requests(), 1) - req := h.gm.Requests()[0] + gmRequests := h.rec.gmRequestsSnapshot() + require.Len(t, gmRequests, 1) + req := gmRequests[0] assert.Equal(t, h.gameRecord.GameID, req.GameID) assert.Equal(t, "container-1", req.ContainerID) assert.Equal(t, "engine.local:9000", req.EngineEndpoint) assert.Equal(t, h.gameRecord.TargetEngineVersion, req.TargetEngineVersion) assert.Equal(t, h.gameRecord.TurnSchedule, req.TurnSchedule) - assert.Empty(t, h.runtime.StopJobs()) - assert.Empty(t, h.intents.Published()) + assert.Empty(t, h.rec.stopGameIDsSnapshot()) + assert.Empty(t, h.rec.publishedSnapshot()) } func TestHandleSuccessGMUnavailableMovesToPausedAndPublishesIntent(t *testing.T) { h := newHarness(t) - h.gm.SetError(ports.ErrGMUnavailable) + h.rec.setGMErr(ports.ErrGMUnavailable) h.consumer.HandleMessage(context.Background(), successMessage(t, h, "1700000000001-0")) @@ -188,10 +278,10 @@ func TestHandleSuccessGMUnavailableMovesToPausedAndPublishesIntent(t *testing.T) assert.Equal(t, game.StatusPaused, got.Status) require.NotNil(t, got.RuntimeBinding, "binding still persisted before paused") - published := h.intents.Published() + published := h.rec.publishedSnapshot() require.Len(t, published, 1) assert.Equal(t, notificationintent.NotificationTypeLobbyRuntimePausedAfterStart, published[0].NotificationType) - assert.Empty(t, h.runtime.StopJobs()) + assert.Empty(t, h.rec.stopGameIDsSnapshot()) } 
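The harness deliberately registers permissive `AnyTimes()` expectations that funnel every call into the shared `recorder`, so individual tests assert on observed calls after the fact rather than pre-declaring call counts. A test that prefers gomock to enforce the contract directly could instead pin a strict expectation; a sketch under that assumption, reusing the packages already imported in this file (`"game-1"` is a placeholder value, not the fixture's game ID):

```go
// strictOrphanRuntime builds a runtime-manager mock that makes gomock fail
// the test unless PublishStopJob is called exactly once with the
// orphan-cleanup reason. Wire it into the consumer in place of the
// permissive harness default.
func strictOrphanRuntime(t *testing.T) *mocks.MockRuntimeManager {
	t.Helper()
	ctrl := gomock.NewController(t)
	m := mocks.NewMockRuntimeManager(ctrl)
	m.EXPECT().
		PublishStopJob(gomock.Any(), "game-1", ports.StopReasonOrphanCleanup).
		Return(nil).
		Times(1)
	return m
}
```

The recorder style was presumably kept because it tolerates call order and count while still letting each test assert precisely on arguments; the strict form trades that flexibility for earlier, self-verifying failures.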
func TestHandleFailureTransitionsToStartFailed(t *testing.T) { @@ -202,9 +292,9 @@ func TestHandleFailureTransitionsToStartFailed(t *testing.T) { require.NoError(t, err) assert.Equal(t, game.StatusStartFailed, got.Status) assert.Nil(t, got.RuntimeBinding) - assert.Empty(t, h.runtime.StopJobs()) - assert.Empty(t, h.gm.Requests()) - assert.Empty(t, h.intents.Published()) + assert.Empty(t, h.rec.stopGameIDsSnapshot()) + assert.Empty(t, h.rec.gmRequestsSnapshot()) + assert.Empty(t, h.rec.publishedSnapshot()) } func TestHandleSuccessOrphanContainerWhenBindingFails(t *testing.T) { @@ -236,15 +326,20 @@ func TestHandleSuccessOrphanContainerWhenBindingFails(t *testing.T) { "orphan path must move game to start_failed") assert.Nil(t, got.RuntimeBinding, "binding never persisted") - assert.Equal(t, []string{h.gameRecord.GameID.String()}, h.runtime.StopJobs()) - assert.Empty(t, h.gm.Requests()) - assert.Empty(t, h.intents.Published()) + assert.Equal(t, []string{h.gameRecord.GameID.String()}, h.rec.stopGameIDsSnapshot()) + assert.Equal(t, + []ports.StopReason{ports.StopReasonOrphanCleanup}, + h.rec.stopReasonsSnapshot(), + "orphan path must classify the stop job as orphan_cleanup", + ) + assert.Empty(t, h.rec.gmRequestsSnapshot()) + assert.Empty(t, h.rec.publishedSnapshot()) } func TestHandleSuccessReplayIsNoOp(t *testing.T) { h := newHarness(t) h.consumer.HandleMessage(context.Background(), successMessage(t, h, "1700000000004-0")) - require.Len(t, h.gm.Requests(), 1) + require.Len(t, h.rec.gmRequestsSnapshot(), 1) got, err := h.games.Get(context.Background(), h.gameRecord.GameID) require.NoError(t, err) @@ -253,16 +348,16 @@ func TestHandleSuccessReplayIsNoOp(t *testing.T) { // Replay the same event: status is already running, so the early // status check exits before any side-effect call (no binding // overwrite, no GM call, no transition). - h.gm.SetError(errors.New("must not be called again")) + h.rec.setGMErr(errors.New("must not be called again")) h.consumer.HandleMessage(context.Background(), successMessage(t, h, "1700000000004-0")) - require.Len(t, h.gm.Requests(), 1, "GM register-game is invoked once across replays") + require.Len(t, h.rec.gmRequestsSnapshot(), 1, "GM register-game is invoked once across replays") got, err = h.games.Get(context.Background(), h.gameRecord.GameID) require.NoError(t, err) assert.Equal(t, game.StatusRunning, got.Status) assert.True(t, got.UpdatedAt.Equal(originalUpdatedAt), "no further mutations on replay") - assert.Empty(t, h.intents.Published()) + assert.Empty(t, h.rec.publishedSnapshot()) } func TestHandleFailureReplayIsNoOp(t *testing.T) { @@ -298,14 +393,14 @@ func TestHandleMalformedEvents(t *testing.T) { got, err := h.games.Get(context.Background(), h.gameRecord.GameID) require.NoError(t, err) assert.Equal(t, game.StatusStarting, got.Status, "malformed events leave game untouched") - assert.Empty(t, h.runtime.StopJobs()) - assert.Empty(t, h.gm.Requests()) + assert.Empty(t, h.rec.stopGameIDsSnapshot()) + assert.Empty(t, h.rec.gmRequestsSnapshot()) } -// fakeBindingFailer wraps gamestub.Store and forces UpdateRuntimeBinding +// fakeBindingFailer wraps gameinmem.Store and forces UpdateRuntimeBinding // to fail; everything else delegates to the embedded store. 
type fakeBindingFailer struct { - *gamestub.Store + *gameinmem.Store err error } diff --git a/lobby/internal/worker/userlifecycle/worker.go b/lobby/internal/worker/userlifecycle/worker.go index f576e34..d3a45ae 100644 --- a/lobby/internal/worker/userlifecycle/worker.go +++ b/lobby/internal/worker/userlifecycle/worker.go @@ -429,7 +429,7 @@ func (worker *Worker) cascadeOwnedGames( } if _, inflight := inflightGameStatuses[record.Status]; inflight { - if err := worker.runtimeManager.PublishStopJob(ctx, record.GameID.String()); err != nil { + if err := worker.runtimeManager.PublishStopJob(ctx, record.GameID.String(), ports.StopReasonCancelled); err != nil { return cancelled, fmt.Errorf("user lifecycle handle: publish stop job for %s: %w", record.GameID, err) } diff --git a/lobby/internal/worker/userlifecycle/worker_test.go b/lobby/internal/worker/userlifecycle/worker_test.go index 87b7c82..23b5302 100644 --- a/lobby/internal/worker/userlifecycle/worker_test.go +++ b/lobby/internal/worker/userlifecycle/worker_test.go @@ -6,16 +6,16 @@ import ( "io" "log/slog" "strings" + "sync" "testing" "time" - "galaxy/lobby/internal/adapters/applicationstub" - "galaxy/lobby/internal/adapters/gamestub" - "galaxy/lobby/internal/adapters/intentpubstub" - "galaxy/lobby/internal/adapters/invitestub" - "galaxy/lobby/internal/adapters/membershipstub" - "galaxy/lobby/internal/adapters/racenamestub" - "galaxy/lobby/internal/adapters/runtimemanagerstub" + "galaxy/lobby/internal/adapters/applicationinmem" + "galaxy/lobby/internal/adapters/gameinmem" + "galaxy/lobby/internal/adapters/inviteinmem" + "galaxy/lobby/internal/adapters/membershipinmem" + "galaxy/lobby/internal/adapters/mocks" + "galaxy/lobby/internal/adapters/racenameinmem" "galaxy/lobby/internal/domain/application" "galaxy/lobby/internal/domain/common" "galaxy/lobby/internal/domain/game" @@ -27,18 +27,94 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" ) +type intentRec struct { + mu sync.Mutex + published []notificationintent.Intent +} + +func (r *intentRec) record(_ context.Context, intent notificationintent.Intent) (string, error) { + r.mu.Lock() + defer r.mu.Unlock() + r.published = append(r.published, intent) + return "1", nil +} + +func (r *intentRec) snapshot() []notificationintent.Intent { + r.mu.Lock() + defer r.mu.Unlock() + return append([]notificationintent.Intent(nil), r.published...) +} + +type runtimeRec struct { + mu sync.Mutex + stopIDs []string + stopReas []ports.StopReason + stopErr error +} + +func (r *runtimeRec) recordStart(_ context.Context, _, _ string) error { return nil } + +func (r *runtimeRec) recordStop(_ context.Context, gameID string, reason ports.StopReason) error { + r.mu.Lock() + defer r.mu.Unlock() + if r.stopErr != nil { + return r.stopErr + } + r.stopIDs = append(r.stopIDs, gameID) + r.stopReas = append(r.stopReas, reason) + return nil +} + +func (r *runtimeRec) stopJobs() []string { + r.mu.Lock() + defer r.mu.Unlock() + return append([]string(nil), r.stopIDs...) +} + +func (r *runtimeRec) stopReasons() []ports.StopReason { + r.mu.Lock() + defer r.mu.Unlock() + return append([]ports.StopReason(nil), r.stopReas...) 
+} + +func (r *runtimeRec) setStopErr(err error) { + r.mu.Lock() + defer r.mu.Unlock() + r.stopErr = err +} + +func newIntentMock(t *testing.T, rec *intentRec) *mocks.MockIntentPublisher { + t.Helper() + m := mocks.NewMockIntentPublisher(gomock.NewController(t)) + m.EXPECT().Publish(gomock.Any(), gomock.Any()).DoAndReturn(rec.record).AnyTimes() + return m +} + +func newRuntimeMock(t *testing.T, rec *runtimeRec) *mocks.MockRuntimeManager { + t.Helper() + m := mocks.NewMockRuntimeManager(gomock.NewController(t)) + m.EXPECT().PublishStartJob(gomock.Any(), gomock.Any(), gomock.Any()). + DoAndReturn(rec.recordStart).AnyTimes() + m.EXPECT().PublishStopJob(gomock.Any(), gomock.Any(), gomock.Any()). + DoAndReturn(rec.recordStop).AnyTimes() + return m +} + func silentLogger() *slog.Logger { return slog.New(slog.NewTextHandler(io.Discard, nil)) } type fixture struct { - directory *racenamestub.Directory - memberships *membershipstub.Store - applications *applicationstub.Store - invites *invitestub.Store - games *gamestub.Store - runtimeManager *runtimemanagerstub.Publisher - intents *intentpubstub.Publisher + directory *racenameinmem.Directory + memberships *membershipinmem.Store + applications *applicationinmem.Store + invites *inviteinmem.Store + games *gameinmem.Store + runtimeRec *runtimeRec + runtimeManager *mocks.MockRuntimeManager + intentRec *intentRec + intents *mocks.MockIntentPublisher worker *userlifecycle.Worker now time.Time } @@ -46,18 +122,22 @@ type fixture struct { func newFixture(t *testing.T) *fixture { t.Helper() - directory, err := racenamestub.NewDirectory() + directory, err := racenameinmem.NewDirectory() require.NoError(t, err) now := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) + rtRec := &runtimeRec{} + intRec := &intentRec{} f := &fixture{ directory: directory, - memberships: membershipstub.NewStore(), - applications: applicationstub.NewStore(), - invites: invitestub.NewStore(), - games: gamestub.NewStore(), - runtimeManager: runtimemanagerstub.NewPublisher(), - intents: intentpubstub.NewPublisher(), + memberships: membershipinmem.NewStore(), + applications: applicationinmem.NewStore(), + invites: inviteinmem.NewStore(), + games: gameinmem.NewStore(), + runtimeRec: rtRec, + runtimeManager: newRuntimeMock(t, rtRec), + intentRec: intRec, + intents: newIntentMock(t, intRec), now: now, } @@ -276,12 +356,16 @@ func TestHandleFullCascadePermanentBlock(t *testing.T) { gotOwned2, err := f.games.Get(context.Background(), ownedDraft.GameID) require.NoError(t, err) assert.Equal(t, game.StatusCancelled, gotOwned2.Status) - stopJobs := f.runtimeManager.StopJobs() + stopJobs := f.runtimeRec.stopJobs() require.Len(t, stopJobs, 1) assert.Equal(t, ownedRunning.GameID.String(), stopJobs[0]) + stopReasons := f.runtimeRec.stopReasons() + require.Len(t, stopReasons, 1) + assert.Equal(t, ports.StopReasonCancelled, stopReasons[0], + "user-lifecycle cascade must classify the stop job as cancelled") // Notification published only for the third-party private game owner. 
- intents := f.intents.Published() + intents := f.intentRec.snapshot() require.Len(t, intents, 1) assert.Equal(t, notificationintent.NotificationTypeLobbyMembershipBlocked, intents[0].NotificationType) assert.Equal(t, []string{"owner-other"}, intents[0].RecipientUserIDs) @@ -309,7 +393,7 @@ func TestHandleIsIdempotentOnReplay(t *testing.T) { require.NoError(t, f.worker.Handle(context.Background(), event)) require.NoError(t, f.worker.Handle(context.Background(), event)) - intents := f.intents.Published() + intents := f.intentRec.snapshot() require.Len(t, intents, 1, "second pass must not double-publish") assert.Contains(t, intents[0].PayloadJSON, `"reason":"deleted"`) } @@ -378,7 +462,7 @@ func TestHandleUnknownEventTypeIsNoop(t *testing.T) { got, err := f.memberships.Get(context.Background(), member.MembershipID) require.NoError(t, err) assert.Equal(t, membership.StatusActive, got.Status) - assert.Empty(t, f.intents.Published()) + assert.Empty(t, f.intentRec.snapshot()) } func TestHandlePropagatesStopJobError(t *testing.T) { @@ -386,7 +470,7 @@ func TestHandlePropagatesStopJobError(t *testing.T) { f := newFixture(t) f.seedGame(t, "game-owned-3", game.GameTypePrivate, "user-victim", game.StatusRunning) - f.runtimeManager.SetStopError(errors.New("runtime down")) + f.runtimeRec.setStopErr(errors.New("runtime down")) err := f.worker.Handle(context.Background(), ports.UserLifecycleEvent{ EntryID: "1700000000000-0", @@ -399,10 +483,10 @@ func TestHandlePropagatesStopJobError(t *testing.T) { require.Error(t, err) } -// flakyMembershipStore wraps membershipstub.Store with a one-shot +// flakyMembershipStore wraps membershipinmem.Store with a one-shot // UpdateStatus failure injection used by the retry-after-error test. type flakyMembershipStore struct { - *membershipstub.Store + *membershipinmem.Store failOnce bool failError error } diff --git a/mail/templates/runtime.container_start_failed/en/subject.tmpl b/mail/templates/runtime.container_start_failed/en/subject.tmpl new file mode 100644 index 0000000..5c3ebaf --- /dev/null +++ b/mail/templates/runtime.container_start_failed/en/subject.tmpl @@ -0,0 +1 @@ +Engine container start failed for game {{.game_id}} diff --git a/mail/templates/runtime.container_start_failed/en/text.tmpl b/mail/templates/runtime.container_start_failed/en/text.tmpl new file mode 100644 index 0000000..650f6da --- /dev/null +++ b/mail/templates/runtime.container_start_failed/en/text.tmpl @@ -0,0 +1,6 @@ +Runtime Manager could not start the engine container for game {{.game_id}}. + +Image: {{.image_ref}} +Error code: {{.error_code}} +Message: {{.error_message}} +Attempted at (Unix ms, UTC): {{.attempted_at_ms}} diff --git a/mail/templates/runtime.image_pull_failed/en/subject.tmpl b/mail/templates/runtime.image_pull_failed/en/subject.tmpl new file mode 100644 index 0000000..e4ef256 --- /dev/null +++ b/mail/templates/runtime.image_pull_failed/en/subject.tmpl @@ -0,0 +1 @@ +Engine image pull failed for game {{.game_id}} diff --git a/mail/templates/runtime.image_pull_failed/en/text.tmpl b/mail/templates/runtime.image_pull_failed/en/text.tmpl new file mode 100644 index 0000000..86cf4bc --- /dev/null +++ b/mail/templates/runtime.image_pull_failed/en/text.tmpl @@ -0,0 +1,6 @@ +Runtime Manager could not pull the engine image for game {{.game_id}}. 
+ +Image: {{.image_ref}} +Error code: {{.error_code}} +Message: {{.error_message}} +Attempted at (Unix ms, UTC): {{.attempted_at_ms}} diff --git a/mail/templates/runtime.start_config_invalid/en/subject.tmpl b/mail/templates/runtime.start_config_invalid/en/subject.tmpl new file mode 100644 index 0000000..0dc865c --- /dev/null +++ b/mail/templates/runtime.start_config_invalid/en/subject.tmpl @@ -0,0 +1 @@ +Engine start configuration invalid for game {{.game_id}} diff --git a/mail/templates/runtime.start_config_invalid/en/text.tmpl b/mail/templates/runtime.start_config_invalid/en/text.tmpl new file mode 100644 index 0000000..47a9482 --- /dev/null +++ b/mail/templates/runtime.start_config_invalid/en/text.tmpl @@ -0,0 +1,6 @@ +Runtime Manager rejected the start request for game {{.game_id}}: configuration is invalid. + +Image: {{.image_ref}} +Error code: {{.error_code}} +Message: {{.error_message}} +Attempted at (Unix ms, UTC): {{.attempted_at_ms}} diff --git a/notification/PLAN.md b/notification/PLAN.md index 09ad071..7192e2d 100644 --- a/notification/PLAN.md +++ b/notification/PLAN.md @@ -363,3 +363,13 @@ The implementation is complete only when all of the following hold: notification types - admin notifications remain `email`-only - auth-code email still bypasses `Notification Service` + +## Note: Runtime Manager Catalog Extension + +The three administrator-only `runtime.*` notification types +(`runtime.image_pull_failed`, `runtime.container_start_failed`, +`runtime.start_config_invalid`) are added by the Runtime Manager +implementation plan, not by this document. See +[`../rtmanager/PLAN.md`](../rtmanager/PLAN.md) §«Stage 07. Notification +intent constructors and catalog extension». No new stages are added here +for that catalog growth. diff --git a/notification/README.md b/notification/README.md index 59c5e1b..12eb7bc 100644 --- a/notification/README.md +++ b/notification/README.md @@ -208,6 +208,9 @@ Primary configuration groups: - `NOTIFICATION_ADMIN_EMAILS_GAME_GENERATION_FAILED` - `NOTIFICATION_ADMIN_EMAILS_LOBBY_RUNTIME_PAUSED_AFTER_START` - `NOTIFICATION_ADMIN_EMAILS_LOBBY_APPLICATION_SUBMITTED` + - `NOTIFICATION_ADMIN_EMAILS_RUNTIME_IMAGE_PULL_FAILED` + - `NOTIFICATION_ADMIN_EMAILS_RUNTIME_CONTAINER_START_FAILED` + - `NOTIFICATION_ADMIN_EMAILS_RUNTIME_START_CONFIG_INVALID` - OpenTelemetry: - standard `OTEL_*` variables - `NOTIFICATION_OTEL_STDOUT_TRACES_ENABLED` @@ -292,10 +295,13 @@ Accepted intents use the original Redis Stream `stream_entry_id` as | `lobby.race_name.registration_eligible` | `Game Lobby` (`game_lobby`) | capable member (`audience_kind=user`) | `push+email` | `game_id`, `game_name`, `race_name`, `eligible_until_ms` | | `lobby.race_name.registered` | `Game Lobby` (`game_lobby`) | registering user (`audience_kind=user`) | `push+email` | `race_name` | | `lobby.race_name.registration_denied` | `Game Lobby` (`game_lobby`) | incapable member (`audience_kind=user`) | `email` | `game_id`, `game_name`, `race_name`, `reason` | +| `runtime.image_pull_failed` | `Runtime Manager` (`runtime_manager`) | configured admin email list (`audience_kind=admin_email`) | `email` | `game_id`, `image_ref`, `error_code`, `error_message`, `attempted_at_ms` | +| `runtime.container_start_failed` | `Runtime Manager` (`runtime_manager`) | configured admin email list (`audience_kind=admin_email`) | `email` | `game_id`, `image_ref`, `error_code`, `error_message`, `attempted_at_ms` | +| `runtime.start_config_invalid` | `Runtime Manager` (`runtime_manager`) | configured admin email list 
(`audience_kind=admin_email`) | `email` | `game_id`, `image_ref`, `error_code`, `error_message`, `attempted_at_ms` | Rules: -- v1 supports exactly the fifteen `notification_type` values listed above +- v1 supports exactly the eighteen `notification_type` values listed above - `lobby.application.submitted` keeps one stable `notification_type` and one stable `payload_json` shape; private games publish `audience_kind=user` while public games publish `audience_kind=admin_email` @@ -308,6 +314,12 @@ Rules: with a 30-day `eligible_until_ms` window - `lobby.race_name.registered` is emitted on successful `lobby.race_name.register` commit +- the three `runtime.*` types are emitted by `Runtime Manager` only on + first-touch start failures (image pull, container create/start, start + configuration validation); they are administrator-only in v1 and have no + push counterpart. `Runtime Manager` does not publish notifications for + ongoing health changes — those flow through `runtime:health_events` and + are escalated by `Game Master` if needed. ## Recipient Enrichment And Locale Policy @@ -436,6 +448,9 @@ Initial notification-owned template assets: | `lobby.race_name.registration_eligible` | `lobby.race_name.registration_eligible` | `en/subject.tmpl`, `en/text.tmpl` | | `lobby.race_name.registered` | `lobby.race_name.registered` | `en/subject.tmpl`, `en/text.tmpl` | | `lobby.race_name.registration_denied` | `lobby.race_name.registration_denied` | `en/subject.tmpl`, `en/text.tmpl` | +| `runtime.image_pull_failed` | `runtime.image_pull_failed` | `en/subject.tmpl`, `en/text.tmpl` | +| `runtime.container_start_failed` | `runtime.container_start_failed` | `en/subject.tmpl`, `en/text.tmpl` | +| `runtime.start_config_invalid` | `runtime.start_config_invalid` | `en/subject.tmpl`, `en/text.tmpl` | `auth.login_code` does not belong to the notification-owned template set. diff --git a/notification/api/intents-asyncapi.yaml b/notification/api/intents-asyncapi.yaml index 3ec71a2..a8afbb9 100644 --- a/notification/api/intents-asyncapi.yaml +++ b/notification/api/intents-asyncapi.yaml @@ -58,6 +58,15 @@ components: idempotency_key: game-lobby:game-456:application-submitted:user-42 occurred_at_ms: "1775121700002" payload_json: '{"game_id":"game-456","game_name":"Orion Front","applicant_user_id":"user-42","applicant_name":"Nova Pilot"}' + - name: runtimeImagePullFailed + summary: Administrator email notification about a failed engine image pull. + payload: + notification_type: runtime.image_pull_failed + producer: runtime_manager + audience_kind: admin_email + idempotency_key: runtime-manager:game-789:image-pull-failed:1775121700003 + occurred_at_ms: "1775121700003" + payload_json: '{"game_id":"game-789","image_ref":"galaxy/game:1.4.7","error_code":"image_pull_failed","error_message":"manifest unknown","attempted_at_ms":1775121700003}' schemas: NotificationIntentEnvelope: type: object @@ -98,6 +107,9 @@ components: - lobby.race_name.registration_eligible - lobby.race_name.registered - lobby.race_name.registration_denied + - runtime.image_pull_failed + - runtime.container_start_failed + - runtime.start_config_invalid description: | Exact v1 notification type catalog. `lobby.invite.revoked` deliberately remains outside the supported catalog because it @@ -108,6 +120,7 @@ components: - geoprofile - game_master - game_lobby + - runtime_manager description: | Stable producer identifier. The exact producer value is frozen per `notification_type` by the v1 catalog. 
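The next hunk pins a dedicated payload schema per `runtime.*` type and freezes `producer: runtime_manager` / `audience_kind: admin_email` for each of them. A sketch, not the Runtime Manager implementation, of how a producer could build one such intent and push it through the same `Publish(ctx, intent) (string, error)` shape the lobby tests mock; the package name, the local interface, and the `galaxy/pkg/notificationintent` import path are assumptions, and only the intent fields visible in this diff are populated:

```go
package runtimenotify

import (
	"context"
	"encoding/json"

	"galaxy/pkg/notificationintent"
)

// intentPublisher mirrors the Publish signature used elsewhere in this
// diff; Runtime Manager would satisfy it with its own stream adapter.
type intentPublisher interface {
	Publish(ctx context.Context, intent notificationintent.Intent) (string, error)
}

// publishImagePullFailed builds the admin-only runtime.image_pull_failed
// intent. Producer, idempotency key and occurred_at_ms are assumed to be
// filled by the publishing adapter per the AsyncAPI envelope.
func publishImagePullFailed(ctx context.Context, pub intentPublisher, gameID, imageRef, errMsg string, attemptedAtMs int64) error {
	payload, err := json.Marshal(map[string]any{
		"game_id":         gameID,
		"image_ref":       imageRef,
		"error_code":      "image_pull_failed",
		"error_message":   errMsg,
		"attempted_at_ms": attemptedAtMs,
	})
	if err != nil {
		return err
	}
	_, err = pub.Publish(ctx, notificationintent.Intent{
		NotificationType: notificationintent.NotificationTypeRuntimeImagePullFailed,
		AudienceKind:     notificationintent.AudienceKindAdminEmail,
		PayloadJSON:      string(payload),
	})
	return err
}
```

Marshalling a map keeps the sketch short; a real producer would more likely use a typed payload struct mirroring `RuntimeImagePullFailedPayload` so required fields cannot be omitted silently.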
@@ -419,6 +432,51 @@ components: payload_json: contentSchema: $ref: '#/components/schemas/LobbyRaceNameRegistrationDeniedPayload' + - if: + properties: + notification_type: + const: runtime.image_pull_failed + required: + - notification_type + then: + properties: + producer: + const: runtime_manager + audience_kind: + const: admin_email + payload_json: + contentSchema: + $ref: '#/components/schemas/RuntimeImagePullFailedPayload' + - if: + properties: + notification_type: + const: runtime.container_start_failed + required: + - notification_type + then: + properties: + producer: + const: runtime_manager + audience_kind: + const: admin_email + payload_json: + contentSchema: + $ref: '#/components/schemas/RuntimeContainerStartFailedPayload' + - if: + properties: + notification_type: + const: runtime.start_config_invalid + required: + - notification_type + then: + properties: + producer: + const: runtime_manager + audience_kind: + const: admin_email + payload_json: + contentSchema: + $ref: '#/components/schemas/RuntimeStartConfigInvalidPayload' GeoReviewRecommendedPayload: type: object additionalProperties: true @@ -697,3 +755,78 @@ components: reason: type: string minLength: 1 + RuntimeImagePullFailedPayload: + type: object + additionalProperties: true + required: + - game_id + - image_ref + - error_code + - error_message + - attempted_at_ms + properties: + game_id: + type: string + minLength: 1 + image_ref: + type: string + minLength: 1 + error_code: + type: string + minLength: 1 + error_message: + type: string + minLength: 1 + attempted_at_ms: + type: integer + minimum: 1 + RuntimeContainerStartFailedPayload: + type: object + additionalProperties: true + required: + - game_id + - image_ref + - error_code + - error_message + - attempted_at_ms + properties: + game_id: + type: string + minLength: 1 + image_ref: + type: string + minLength: 1 + error_code: + type: string + minLength: 1 + error_message: + type: string + minLength: 1 + attempted_at_ms: + type: integer + minimum: 1 + RuntimeStartConfigInvalidPayload: + type: object + additionalProperties: true + required: + - game_id + - image_ref + - error_code + - error_message + - attempted_at_ms + properties: + game_id: + type: string + minLength: 1 + image_ref: + type: string + minLength: 1 + error_code: + type: string + minLength: 1 + error_message: + type: string + minLength: 1 + attempted_at_ms: + type: integer + minimum: 1 diff --git a/notification/contract_asyncapi_test.go b/notification/contract_asyncapi_test.go index e80b4ab..a55403a 100644 --- a/notification/contract_asyncapi_test.go +++ b/notification/contract_asyncapi_test.go @@ -35,6 +35,9 @@ var expectedNotificationTypeCatalog = []string{ "lobby.race_name.registration_eligible", "lobby.race_name.registered", "lobby.race_name.registration_denied", + "runtime.image_pull_failed", + "runtime.container_start_failed", + "runtime.start_config_invalid", } var expectedNotificationCatalog = map[string]notificationCatalogExpectation{ @@ -128,6 +131,24 @@ var expectedNotificationCatalog = map[string]notificationCatalogExpectation{ payloadSchema: "LobbyRaceNameRegistrationDeniedPayload", requiredFields: []string{"game_id", "game_name", "race_name", "reason"}, }, + "runtime.image_pull_failed": { + producer: "runtime_manager", + audienceKind: "admin_email", + payloadSchema: "RuntimeImagePullFailedPayload", + requiredFields: []string{"game_id", "image_ref", "error_code", "error_message", "attempted_at_ms"}, + }, + "runtime.container_start_failed": { + producer: "runtime_manager", + audienceKind: 
"admin_email", + payloadSchema: "RuntimeContainerStartFailedPayload", + requiredFields: []string{"game_id", "image_ref", "error_code", "error_message", "attempted_at_ms"}, + }, + "runtime.start_config_invalid": { + producer: "runtime_manager", + audienceKind: "admin_email", + payloadSchema: "RuntimeStartConfigInvalidPayload", + requiredFields: []string{"game_id", "image_ref", "error_code", "error_message", "attempted_at_ms"}, + }, } const expectedNotificationCatalogTable = `| ` + "`notification_type`" + ` | Producer | Audience | Channels | Required ` + "`payload_json`" + ` fields | @@ -146,7 +167,10 @@ const expectedNotificationCatalogTable = `| ` + "`notification_type`" + ` | Prod | ` + "`lobby.invite.expired`" + ` | ` + "`Game Lobby`" + ` (` + "`game_lobby`" + `) | private-game owner (` + "`audience_kind=user`" + `) | ` + "`email`" + ` | ` + "`game_id`" + `, ` + "`game_name`" + `, ` + "`invitee_user_id`" + `, ` + "`invitee_name`" + ` | | ` + "`lobby.race_name.registration_eligible`" + ` | ` + "`Game Lobby`" + ` (` + "`game_lobby`" + `) | capable member (` + "`audience_kind=user`" + `) | ` + "`push+email`" + ` | ` + "`game_id`" + `, ` + "`game_name`" + `, ` + "`race_name`" + `, ` + "`eligible_until_ms`" + ` | | ` + "`lobby.race_name.registered`" + ` | ` + "`Game Lobby`" + ` (` + "`game_lobby`" + `) | registering user (` + "`audience_kind=user`" + `) | ` + "`push+email`" + ` | ` + "`race_name`" + ` | -| ` + "`lobby.race_name.registration_denied`" + ` | ` + "`Game Lobby`" + ` (` + "`game_lobby`" + `) | incapable member (` + "`audience_kind=user`" + `) | ` + "`email`" + ` | ` + "`game_id`" + `, ` + "`game_name`" + `, ` + "`race_name`" + `, ` + "`reason`" + ` |` +| ` + "`lobby.race_name.registration_denied`" + ` | ` + "`Game Lobby`" + ` (` + "`game_lobby`" + `) | incapable member (` + "`audience_kind=user`" + `) | ` + "`email`" + ` | ` + "`game_id`" + `, ` + "`game_name`" + `, ` + "`race_name`" + `, ` + "`reason`" + ` | +| ` + "`runtime.image_pull_failed`" + ` | ` + "`Runtime Manager`" + ` (` + "`runtime_manager`" + `) | configured admin email list (` + "`audience_kind=admin_email`" + `) | ` + "`email`" + ` | ` + "`game_id`" + `, ` + "`image_ref`" + `, ` + "`error_code`" + `, ` + "`error_message`" + `, ` + "`attempted_at_ms`" + ` | +| ` + "`runtime.container_start_failed`" + ` | ` + "`Runtime Manager`" + ` (` + "`runtime_manager`" + `) | configured admin email list (` + "`audience_kind=admin_email`" + `) | ` + "`email`" + ` | ` + "`game_id`" + `, ` + "`image_ref`" + `, ` + "`error_code`" + `, ` + "`error_message`" + `, ` + "`attempted_at_ms`" + ` | +| ` + "`runtime.start_config_invalid`" + ` | ` + "`Runtime Manager`" + ` (` + "`runtime_manager`" + `) | configured admin email list (` + "`audience_kind=admin_email`" + `) | ` + "`email`" + ` | ` + "`game_id`" + `, ` + "`image_ref`" + `, ` + "`error_code`" + `, ` + "`error_message`" + `, ` + "`attempted_at_ms`" + ` |` var expectedSharedDocumentationSnippets = []string{ "`lobby.application.submitted` keeps one stable `notification_type` and one stable `payload_json` shape", @@ -234,7 +258,7 @@ func TestIntentAsyncAPISpecFreezesEnvelopeSchema(t *testing.T) { producer := getMapValue(t, properties, "producer") require.Equal(t, "string", getStringValue(t, producer, "type")) - require.Equal(t, []string{"geoprofile", "game_master", "game_lobby"}, getStringSlice(t, producer, "enum")) + require.Equal(t, []string{"geoprofile", "game_master", "game_lobby", "runtime_manager"}, getStringSlice(t, producer, "enum")) occurredAt := getMapValue(t, properties, 
"occurred_at_ms") require.Equal(t, "string", getStringValue(t, occurredAt, "type")) diff --git a/notification/internal/api/intentstream/contract.go b/notification/internal/api/intentstream/contract.go index 345fc38..5c7b087 100644 --- a/notification/internal/api/intentstream/contract.go +++ b/notification/internal/api/intentstream/contract.go @@ -85,6 +85,18 @@ const ( // NotificationTypeLobbyRaceNameRegistrationDenied identifies the // `lobby.race_name.registration_denied` notification. NotificationTypeLobbyRaceNameRegistrationDenied = notificationintent.NotificationTypeLobbyRaceNameRegistrationDenied + + // NotificationTypeRuntimeImagePullFailed identifies the + // `runtime.image_pull_failed` notification. + NotificationTypeRuntimeImagePullFailed = notificationintent.NotificationTypeRuntimeImagePullFailed + + // NotificationTypeRuntimeContainerStartFailed identifies the + // `runtime.container_start_failed` notification. + NotificationTypeRuntimeContainerStartFailed = notificationintent.NotificationTypeRuntimeContainerStartFailed + + // NotificationTypeRuntimeStartConfigInvalid identifies the + // `runtime.start_config_invalid` notification. + NotificationTypeRuntimeStartConfigInvalid = notificationintent.NotificationTypeRuntimeStartConfigInvalid ) // Producer identifies one supported upstream producer. @@ -99,6 +111,9 @@ const ( // ProducerGameLobby identifies Game Lobby. ProducerGameLobby = notificationintent.ProducerGameLobby + + // ProducerRuntimeManager identifies Runtime Manager. + ProducerRuntimeManager = notificationintent.ProducerRuntimeManager ) // AudienceKind identifies one supported target-audience kind. diff --git a/notification/internal/config/config.go b/notification/internal/config/config.go index 9e5d89c..b74ca02 100644 --- a/notification/internal/config/config.go +++ b/notification/internal/config/config.go @@ -46,10 +46,13 @@ const ( userServiceBaseURLEnvVar = "NOTIFICATION_USER_SERVICE_BASE_URL" userServiceTimeoutEnvVar = "NOTIFICATION_USER_SERVICE_TIMEOUT" - adminEmailsGeoReviewRecommendedEnvVar = "NOTIFICATION_ADMIN_EMAILS_GEO_REVIEW_RECOMMENDED" - adminEmailsGameGenerationFailedEnvVar = "NOTIFICATION_ADMIN_EMAILS_GAME_GENERATION_FAILED" - adminEmailsLobbyRuntimePausedAfterEnvVar = "NOTIFICATION_ADMIN_EMAILS_LOBBY_RUNTIME_PAUSED_AFTER_START" - adminEmailsLobbyApplicationSubmittedEnvVar = "NOTIFICATION_ADMIN_EMAILS_LOBBY_APPLICATION_SUBMITTED" + adminEmailsGeoReviewRecommendedEnvVar = "NOTIFICATION_ADMIN_EMAILS_GEO_REVIEW_RECOMMENDED" + adminEmailsGameGenerationFailedEnvVar = "NOTIFICATION_ADMIN_EMAILS_GAME_GENERATION_FAILED" + adminEmailsLobbyRuntimePausedAfterEnvVar = "NOTIFICATION_ADMIN_EMAILS_LOBBY_RUNTIME_PAUSED_AFTER_START" + adminEmailsLobbyApplicationSubmittedEnvVar = "NOTIFICATION_ADMIN_EMAILS_LOBBY_APPLICATION_SUBMITTED" + adminEmailsRuntimeImagePullFailedEnvVar = "NOTIFICATION_ADMIN_EMAILS_RUNTIME_IMAGE_PULL_FAILED" + adminEmailsRuntimeContainerStartFailedEnvVar = "NOTIFICATION_ADMIN_EMAILS_RUNTIME_CONTAINER_START_FAILED" + adminEmailsRuntimeStartConfigInvalidEnvVar = "NOTIFICATION_ADMIN_EMAILS_RUNTIME_START_CONFIG_INVALID" otelServiceNameEnvVar = "OTEL_SERVICE_NAME" otelTracesExporterEnvVar = "OTEL_TRACES_EXPORTER" @@ -60,18 +63,18 @@ const ( otelStdoutTracesEnabledEnvVar = "NOTIFICATION_OTEL_STDOUT_TRACES_ENABLED" otelStdoutMetricsEnabledEnvVar = "NOTIFICATION_OTEL_STDOUT_METRICS_ENABLED" - defaultShutdownTimeout = 5 * time.Second - defaultLogLevel = "info" - defaultInternalHTTPAddr = ":8092" - defaultReadHeaderTimeout = 2 * time.Second - 
defaultReadTimeout = 10 * time.Second - defaultIdleTimeout = time.Minute + defaultShutdownTimeout = 5 * time.Second + defaultLogLevel = "info" + defaultInternalHTTPAddr = ":8092" + defaultReadHeaderTimeout = 2 * time.Second + defaultReadTimeout = 10 * time.Second + defaultIdleTimeout = time.Minute - defaultIntentsStream = "notification:intents" - defaultIntentsReadBlockTimeout = 2 * time.Second - defaultGatewayClientEventsStream = "gateway:client-events" + defaultIntentsStream = "notification:intents" + defaultIntentsReadBlockTimeout = 2 * time.Second + defaultGatewayClientEventsStream = "gateway:client-events" defaultGatewayClientEventsStreamMaxLen int64 = 1024 - defaultMailDeliveryCommandsStream = "mail:delivery_commands" + defaultMailDeliveryCommandsStream = "mail:delivery_commands" defaultPushRetryMaxAttempts = 3 defaultEmailRetryMaxAttempts = 7 @@ -352,6 +355,18 @@ type AdminRoutingConfig struct { // LobbyApplicationSubmitted stores recipients for public // `lobby.application.submitted` notifications. LobbyApplicationSubmitted []string + + // RuntimeImagePullFailed stores recipients for + // `runtime.image_pull_failed`. + RuntimeImagePullFailed []string + + // RuntimeContainerStartFailed stores recipients for + // `runtime.container_start_failed`. + RuntimeContainerStartFailed []string + + // RuntimeStartConfigInvalid stores recipients for + // `runtime.start_config_invalid`. + RuntimeStartConfigInvalid []string } // Validate reports whether cfg stores valid normalized administrator email @@ -369,6 +384,15 @@ func (cfg AdminRoutingConfig) Validate() error { if err := validateNormalizedEmailList("lobby.application.submitted", cfg.LobbyApplicationSubmitted); err != nil { return err } + if err := validateNormalizedEmailList("runtime.image_pull_failed", cfg.RuntimeImagePullFailed); err != nil { + return err + } + if err := validateNormalizedEmailList("runtime.container_start_failed", cfg.RuntimeContainerStartFailed); err != nil { + return err + } + if err := validateNormalizedEmailList("runtime.start_config_invalid", cfg.RuntimeStartConfigInvalid); err != nil { + return err + } return nil } diff --git a/notification/internal/config/config_test.go b/notification/internal/config/config_test.go index 9385111..93def44 100644 --- a/notification/internal/config/config_test.go +++ b/notification/internal/config/config_test.go @@ -19,11 +19,11 @@ const ( envRedisTLSEnabled = "NOTIFICATION_REDIS_TLS_ENABLED" envRedisUsername = "NOTIFICATION_REDIS_USERNAME" - envPostgresPrimaryDSN = "NOTIFICATION_POSTGRES_PRIMARY_DSN" - envPostgresOpTimeout = "NOTIFICATION_POSTGRES_OPERATION_TIMEOUT" - envPostgresMaxOpenConns = "NOTIFICATION_POSTGRES_MAX_OPEN_CONNS" - envPostgresMaxIdleConns = "NOTIFICATION_POSTGRES_MAX_IDLE_CONNS" - envPostgresConnMaxLife = "NOTIFICATION_POSTGRES_CONN_MAX_LIFETIME" + envPostgresPrimaryDSN = "NOTIFICATION_POSTGRES_PRIMARY_DSN" + envPostgresOpTimeout = "NOTIFICATION_POSTGRES_OPERATION_TIMEOUT" + envPostgresMaxOpenConns = "NOTIFICATION_POSTGRES_MAX_OPEN_CONNS" + envPostgresMaxIdleConns = "NOTIFICATION_POSTGRES_MAX_IDLE_CONNS" + envPostgresConnMaxLife = "NOTIFICATION_POSTGRES_CONN_MAX_LIFETIME" ) const ( @@ -104,6 +104,9 @@ func TestLoadFromEnvAppliesOverrides(t *testing.T) { t.Setenv(adminEmailsGameGenerationFailedEnvVar, "ops@example.com") t.Setenv(adminEmailsLobbyRuntimePausedAfterEnvVar, "pause@example.com, PAUSE@example.com") t.Setenv(adminEmailsLobbyApplicationSubmittedEnvVar, "owner@example.com, OWNER@example.com") + t.Setenv(adminEmailsRuntimeImagePullFailedEnvVar, 
"image-pull-ops@example.com, IMAGE-PULL-OPS@example.com") + t.Setenv(adminEmailsRuntimeContainerStartFailedEnvVar, "container-start-ops@example.com") + t.Setenv(adminEmailsRuntimeStartConfigInvalidEnvVar, "start-config-ops@example.com, START-CONFIG-OPS@example.com") t.Setenv(otelServiceNameEnvVar, "custom-notification") t.Setenv(otelTracesExporterEnvVar, "otlp") t.Setenv(otelMetricsExporterEnvVar, "otlp") @@ -172,6 +175,9 @@ func TestLoadFromEnvAppliesOverrides(t *testing.T) { GameGenerationFailed: []string{"ops@example.com"}, LobbyRuntimePausedAfterStart: []string{"pause@example.com"}, LobbyApplicationSubmitted: []string{"owner@example.com"}, + RuntimeImagePullFailed: []string{"image-pull-ops@example.com"}, + RuntimeContainerStartFailed: []string{"container-start-ops@example.com"}, + RuntimeStartConfigInvalid: []string{"start-config-ops@example.com"}, }, cfg.AdminRouting) require.Equal(t, TelemetryConfig{ ServiceName: "custom-notification", @@ -295,6 +301,7 @@ func TestLoadFromEnvRejectsInvalidConfiguration(t *testing.T) { {name: "invalid admin email", envName: adminEmailsGeoReviewRecommendedEnvVar, envVal: "broken-email", want: "invalid email address"}, {name: "blank admin email slot", envName: adminEmailsGameGenerationFailedEnvVar, envVal: "ops@example.com, , second@example.com", want: "must not be empty"}, {name: "invalid public application admin email", envName: adminEmailsLobbyApplicationSubmittedEnvVar, envVal: "Owner ", want: "must not include a display name"}, + {name: "invalid runtime image pull admin email", envName: adminEmailsRuntimeImagePullFailedEnvVar, envVal: "broken-runtime-email", want: "invalid email address"}, {name: "nonpositive gateway client events stream max len", envName: gatewayClientEventsStreamMaxEnvVar, envVal: "0", want: "must be positive"}, {name: "backoff min above max", envName: routeBackoffMinEnvVar, envVal: "10m", want: "must not exceed"}, } diff --git a/notification/internal/config/env.go b/notification/internal/config/env.go index e4c120c..ee88680 100644 --- a/notification/internal/config/env.go +++ b/notification/internal/config/env.go @@ -128,6 +128,18 @@ func LoadFromEnv() (Config, error) { if err != nil { return Config{}, err } + cfg.AdminRouting.RuntimeImagePullFailed, err = emailListEnv(adminEmailsRuntimeImagePullFailedEnvVar, cfg.AdminRouting.RuntimeImagePullFailed) + if err != nil { + return Config{}, err + } + cfg.AdminRouting.RuntimeContainerStartFailed, err = emailListEnv(adminEmailsRuntimeContainerStartFailedEnvVar, cfg.AdminRouting.RuntimeContainerStartFailed) + if err != nil { + return Config{}, err + } + cfg.AdminRouting.RuntimeStartConfigInvalid, err = emailListEnv(adminEmailsRuntimeStartConfigInvalidEnvVar, cfg.AdminRouting.RuntimeStartConfigInvalid) + if err != nil { + return Config{}, err + } cfg.Telemetry.ServiceName = stringEnv(otelServiceNameEnvVar, cfg.Telemetry.ServiceName) cfg.Telemetry.TracesExporter = normalizeExporterValue(stringEnv(otelTracesExporterEnvVar, cfg.Telemetry.TracesExporter)) diff --git a/notification/internal/service/acceptintent/service.go b/notification/internal/service/acceptintent/service.go index 23a7f46..d1ca328 100644 --- a/notification/internal/service/acceptintent/service.go +++ b/notification/internal/service/acceptintent/service.go @@ -809,6 +809,12 @@ func (service *Service) adminEmailsFor(notificationType intentstream.Notificatio return append([]string(nil), service.adminRouting.LobbyRuntimePausedAfterStart...) 
case intentstream.NotificationTypeLobbyApplicationSubmitted: return append([]string(nil), service.adminRouting.LobbyApplicationSubmitted...) + case intentstream.NotificationTypeRuntimeImagePullFailed: + return append([]string(nil), service.adminRouting.RuntimeImagePullFailed...) + case intentstream.NotificationTypeRuntimeContainerStartFailed: + return append([]string(nil), service.adminRouting.RuntimeContainerStartFailed...) + case intentstream.NotificationTypeRuntimeStartConfigInvalid: + return append([]string(nil), service.adminRouting.RuntimeStartConfigInvalid...) default: return nil } diff --git a/notification/mail_template_contract_test.go b/notification/mail_template_contract_test.go index cc2a7c8..891abf0 100644 --- a/notification/mail_template_contract_test.go +++ b/notification/mail_template_contract_test.go @@ -26,7 +26,10 @@ const expectedNotificationMailTemplateTable = `| ` + "`notification_type`" + ` | | ` + "`lobby.invite.expired`" + ` | ` + "`lobby.invite.expired`" + ` | ` + "`en/subject.tmpl`" + `, ` + "`en/text.tmpl`" + ` | | ` + "`lobby.race_name.registration_eligible`" + ` | ` + "`lobby.race_name.registration_eligible`" + ` | ` + "`en/subject.tmpl`" + `, ` + "`en/text.tmpl`" + ` | | ` + "`lobby.race_name.registered`" + ` | ` + "`lobby.race_name.registered`" + ` | ` + "`en/subject.tmpl`" + `, ` + "`en/text.tmpl`" + ` | -| ` + "`lobby.race_name.registration_denied`" + ` | ` + "`lobby.race_name.registration_denied`" + ` | ` + "`en/subject.tmpl`" + `, ` + "`en/text.tmpl`" + ` |` +| ` + "`lobby.race_name.registration_denied`" + ` | ` + "`lobby.race_name.registration_denied`" + ` | ` + "`en/subject.tmpl`" + `, ` + "`en/text.tmpl`" + ` | +| ` + "`runtime.image_pull_failed`" + ` | ` + "`runtime.image_pull_failed`" + ` | ` + "`en/subject.tmpl`" + `, ` + "`en/text.tmpl`" + ` | +| ` + "`runtime.container_start_failed`" + ` | ` + "`runtime.container_start_failed`" + ` | ` + "`en/subject.tmpl`" + `, ` + "`en/text.tmpl`" + ` | +| ` + "`runtime.start_config_invalid`" + ` | ` + "`runtime.start_config_invalid`" + ` | ` + "`en/subject.tmpl`" + `, ` + "`en/text.tmpl`" + ` |` var expectedNotificationMailReadmeSnippets = []string{ "`payload_mode` is always `template`", diff --git a/pkg/model/lobby/lobby.go b/pkg/model/lobby/lobby.go new file mode 100644 index 0000000..80a6e79 --- /dev/null +++ b/pkg/model/lobby/lobby.go @@ -0,0 +1,68 @@ +// Package lobby defines the public typed command and response payloads +// exposed at the authenticated Gateway -> Game Lobby boundary. +package lobby + +import "time" + +const ( + // MessageTypeMyGamesList is the authenticated gateway message type + // used to read the calling user's own games. + MessageTypeMyGamesList = "lobby.my.games.list" + + // MessageTypeOpenEnrollment is the authenticated gateway message + // type used by the game owner to transition a draft game to + // `enrollment_open`. + MessageTypeOpenEnrollment = "lobby.game.open-enrollment" +) + +// MyGamesListRequest stores the authenticated read request for the +// caller's games. The request body is intentionally empty; gateway +// derives the calling user identity from the verified session. +type MyGamesListRequest struct{} + +// MyGamesListResponse stores the list of games the caller participates +// in, ordered as Lobby returns them. +type MyGamesListResponse struct { + Items []GameSummary `json:"items"` +} + +// GameSummary stores one game record returned by `lobby.my.games.list`. 
+type GameSummary struct { + GameID string `json:"game_id"` + GameName string `json:"game_name"` + GameType string `json:"game_type"` + Status string `json:"status"` + OwnerUserID string `json:"owner_user_id"` + MinPlayers int `json:"min_players"` + MaxPlayers int `json:"max_players"` + EnrollmentEndsAt time.Time `json:"enrollment_ends_at"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +// OpenEnrollmentRequest stores the owner-only command that transitions +// a game from `draft` to `enrollment_open`. +type OpenEnrollmentRequest struct { + // GameID identifies the game whose enrollment the caller wants to + // open. The owner check is enforced by Lobby. + GameID string `json:"game_id"` +} + +// OpenEnrollmentResponse stores the resulting game record after a +// successful open-enrollment transition. +type OpenEnrollmentResponse struct { + GameID string `json:"game_id"` + Status string `json:"status"` +} + +// ErrorBody stores the canonical Lobby error envelope code/message +// pair. +type ErrorBody struct { + Code string `json:"code"` + Message string `json:"message"` +} + +// ErrorResponse wraps ErrorBody for the FlatBuffers payload boundary. +type ErrorResponse struct { + Error ErrorBody `json:"error"` +} diff --git a/pkg/notificationintent/intent.go b/pkg/notificationintent/intent.go index bf70753..661e7c3 100644 --- a/pkg/notificationintent/intent.go +++ b/pkg/notificationintent/intent.go @@ -118,6 +118,24 @@ const ( // Game Lobby when capability evaluation at game finish releases a // reservation because the member did not meet the capability rule. NotificationTypeLobbyRaceNameRegistrationDenied NotificationType = "lobby.race_name.registration_denied" + + // NotificationTypeRuntimeImagePullFailed identifies the + // `runtime.image_pull_failed` administrator notification published by + // Runtime Manager when the engine image cannot be pulled during a + // start operation. + NotificationTypeRuntimeImagePullFailed NotificationType = "runtime.image_pull_failed" + + // NotificationTypeRuntimeContainerStartFailed identifies the + // `runtime.container_start_failed` administrator notification published + // by Runtime Manager when `docker create` or `docker start` returns an + // error during a start operation. + NotificationTypeRuntimeContainerStartFailed NotificationType = "runtime.container_start_failed" + + // NotificationTypeRuntimeStartConfigInvalid identifies the + // `runtime.start_config_invalid` administrator notification published by + // Runtime Manager when start configuration validation fails (invalid + // `image_ref`, missing Docker network, unwritable state directory). + NotificationTypeRuntimeStartConfigInvalid NotificationType = "runtime.start_config_invalid" ) // String returns the wire value for notificationType. 
@@ -142,7 +160,10 @@ func (notificationType NotificationType) IsKnown() bool { NotificationTypeLobbyInviteExpired, NotificationTypeLobbyRaceNameRegistrationEligible, NotificationTypeLobbyRaceNameRegistered, - NotificationTypeLobbyRaceNameRegistrationDenied: + NotificationTypeLobbyRaceNameRegistrationDenied, + NotificationTypeRuntimeImagePullFailed, + NotificationTypeRuntimeContainerStartFailed, + NotificationTypeRuntimeStartConfigInvalid: return true default: return false @@ -170,6 +191,10 @@ func (notificationType NotificationType) ExpectedProducer() Producer { NotificationTypeLobbyRaceNameRegistered, NotificationTypeLobbyRaceNameRegistrationDenied: return ProducerGameLobby + case NotificationTypeRuntimeImagePullFailed, + NotificationTypeRuntimeContainerStartFailed, + NotificationTypeRuntimeStartConfigInvalid: + return ProducerRuntimeManager default: return "" } @@ -180,7 +205,10 @@ func (notificationType NotificationType) SupportsAudience(audienceKind AudienceK switch notificationType { case NotificationTypeGeoReviewRecommended, NotificationTypeGameGenerationFailed, - NotificationTypeLobbyRuntimePausedAfterStart: + NotificationTypeLobbyRuntimePausedAfterStart, + NotificationTypeRuntimeImagePullFailed, + NotificationTypeRuntimeContainerStartFailed, + NotificationTypeRuntimeStartConfigInvalid: return audienceKind == AudienceKindAdminEmail case NotificationTypeLobbyApplicationSubmitted: return audienceKind == AudienceKindUser || audienceKind == AudienceKindAdminEmail @@ -195,7 +223,10 @@ func (notificationType NotificationType) SupportsChannel(audienceKind AudienceKi switch notificationType { case NotificationTypeGeoReviewRecommended, NotificationTypeGameGenerationFailed, - NotificationTypeLobbyRuntimePausedAfterStart: + NotificationTypeLobbyRuntimePausedAfterStart, + NotificationTypeRuntimeImagePullFailed, + NotificationTypeRuntimeContainerStartFailed, + NotificationTypeRuntimeStartConfigInvalid: return audienceKind == AudienceKindAdminEmail && channel == ChannelEmail case NotificationTypeLobbyApplicationSubmitted: if audienceKind == AudienceKindAdminEmail { @@ -222,6 +253,9 @@ const ( // ProducerGameLobby identifies Game Lobby. ProducerGameLobby Producer = "game_lobby" + + // ProducerRuntimeManager identifies Runtime Manager. + ProducerRuntimeManager Producer = "runtime_manager" ) // String returns the wire value for producer. @@ -232,7 +266,7 @@ func (producer Producer) String() string { // IsKnown reports whether producer belongs to the frozen producer set. 
func (producer Producer) IsKnown() bool { switch producer { - case ProducerGeoProfile, ProducerGameMaster, ProducerGameLobby: + case ProducerGeoProfile, ProducerGameMaster, ProducerGameLobby, ProducerRuntimeManager: return true default: return false @@ -801,6 +835,13 @@ func validatePayloadObject(notificationType NotificationType, payload map[string return validateStringFields(payload, "race_name") case NotificationTypeLobbyRaceNameRegistrationDenied: return validateStringFields(payload, "game_id", "game_name", "race_name", "reason") + case NotificationTypeRuntimeImagePullFailed, + NotificationTypeRuntimeContainerStartFailed, + NotificationTypeRuntimeStartConfigInvalid: + if err := validateStringFields(payload, "game_id", "image_ref", "error_code", "error_message"); err != nil { + return err + } + return validatePositiveIntFields(payload, "attempted_at_ms") default: return fmt.Errorf("payload_json notification type %q is unsupported", notificationType) } diff --git a/pkg/notificationintent/intent_test.go b/pkg/notificationintent/intent_test.go index e52f405..e47fcac 100644 --- a/pkg/notificationintent/intent_test.go +++ b/pkg/notificationintent/intent_test.go @@ -269,6 +269,54 @@ func TestConstructorsBuildExpectedIntentValues(t *testing.T) { recipientUserIDs: []string{"user-9"}, payloadJSON: `{"game_id":"game-1","game_name":"Nebula Clash","race_name":"Skylancer","reason":"capability_not_met"}`, }, + { + name: "runtime image pull failed", + build: func() (Intent, error) { + return NewRuntimeImagePullFailedIntent(metadata, RuntimeImagePullFailedPayload{ + GameID: "game-1", + ImageRef: "galaxy/game:1.4.7", + ErrorCode: "image_pull_failed", + ErrorMessage: "manifest unknown", + AttemptedAtMs: 1775121700000, + }) + }, + notificationType: NotificationTypeRuntimeImagePullFailed, + producer: ProducerRuntimeManager, + audienceKind: AudienceKindAdminEmail, + payloadJSON: `{"game_id":"game-1","image_ref":"galaxy/game:1.4.7","error_code":"image_pull_failed","error_message":"manifest unknown","attempted_at_ms":1775121700000}`, + }, + { + name: "runtime container start failed", + build: func() (Intent, error) { + return NewRuntimeContainerStartFailedIntent(metadata, RuntimeContainerStartFailedPayload{ + GameID: "game-1", + ImageRef: "galaxy/game:1.4.7", + ErrorCode: "container_start_failed", + ErrorMessage: "OCI runtime create failed", + AttemptedAtMs: 1775121700001, + }) + }, + notificationType: NotificationTypeRuntimeContainerStartFailed, + producer: ProducerRuntimeManager, + audienceKind: AudienceKindAdminEmail, + payloadJSON: `{"game_id":"game-1","image_ref":"galaxy/game:1.4.7","error_code":"container_start_failed","error_message":"OCI runtime create failed","attempted_at_ms":1775121700001}`, + }, + { + name: "runtime start config invalid", + build: func() (Intent, error) { + return NewRuntimeStartConfigInvalidIntent(metadata, RuntimeStartConfigInvalidPayload{ + GameID: "game-1", + ImageRef: "galaxy/game:1.4.7", + ErrorCode: "start_config_invalid", + ErrorMessage: "docker network galaxy-net not found", + AttemptedAtMs: 1775121700002, + }) + }, + notificationType: NotificationTypeRuntimeStartConfigInvalid, + producer: ProducerRuntimeManager, + audienceKind: AudienceKindAdminEmail, + payloadJSON: `{"game_id":"game-1","image_ref":"galaxy/game:1.4.7","error_code":"start_config_invalid","error_message":"docker network galaxy-net not found","attempted_at_ms":1775121700002}`, + }, } for _, tt := range tests { @@ -335,6 +383,26 @@ func TestConstructorsRejectInvalidPayloads(t *testing.T) { }) 
require.Error(t, err) require.Contains(t, err.Error(), "payload_json.turn_number must be at least 1") + + _, err = NewRuntimeImagePullFailedIntent(defaultMetadata(), RuntimeImagePullFailedPayload{ + GameID: "game-1", + ImageRef: "galaxy/game:1.4.7", + ErrorCode: "", + ErrorMessage: "manifest unknown", + AttemptedAtMs: 1775121700000, + }) + require.Error(t, err) + require.Contains(t, err.Error(), "payload_json.error_code must not be empty") + + _, err = NewRuntimeContainerStartFailedIntent(defaultMetadata(), RuntimeContainerStartFailedPayload{ + GameID: "game-1", + ImageRef: "galaxy/game:1.4.7", + ErrorCode: "container_start_failed", + ErrorMessage: "OCI runtime create failed", + AttemptedAtMs: 0, + }) + require.Error(t, err) + require.Contains(t, err.Error(), "payload_json.attempted_at_ms must be at least 1") } func TestDecodeIntentRejectsMissingRequiredTopLevelField(t *testing.T) { diff --git a/pkg/notificationintent/payloads.go b/pkg/notificationintent/payloads.go index d332715..94b0703 100644 --- a/pkg/notificationintent/payloads.go +++ b/pkg/notificationintent/payloads.go @@ -127,6 +127,39 @@ type LobbyRaceNameRegistrationDeniedPayload struct { Reason string `json:"reason"` } +// RuntimeImagePullFailedPayload stores the normalized payload for +// `runtime.image_pull_failed`. AttemptedAtMs carries Unix milliseconds in +// UTC of the failed pull attempt. +type RuntimeImagePullFailedPayload struct { + GameID string `json:"game_id"` + ImageRef string `json:"image_ref"` + ErrorCode string `json:"error_code"` + ErrorMessage string `json:"error_message"` + AttemptedAtMs int64 `json:"attempted_at_ms"` +} + +// RuntimeContainerStartFailedPayload stores the normalized payload for +// `runtime.container_start_failed`. AttemptedAtMs carries Unix milliseconds +// in UTC of the failed start attempt. +type RuntimeContainerStartFailedPayload struct { + GameID string `json:"game_id"` + ImageRef string `json:"image_ref"` + ErrorCode string `json:"error_code"` + ErrorMessage string `json:"error_message"` + AttemptedAtMs int64 `json:"attempted_at_ms"` +} + +// RuntimeStartConfigInvalidPayload stores the normalized payload for +// `runtime.start_config_invalid`. AttemptedAtMs carries Unix milliseconds +// in UTC of the rejected start attempt. +type RuntimeStartConfigInvalidPayload struct { + GameID string `json:"game_id"` + ImageRef string `json:"image_ref"` + ErrorCode string `json:"error_code"` + ErrorMessage string `json:"error_message"` + AttemptedAtMs int64 `json:"attempted_at_ms"` +} + // NewGeoReviewRecommendedIntent builds the admin-email intent published by Geo // Profile Service when a user becomes review-worthy. func NewGeoReviewRecommendedIntent(metadata Metadata, payload GeoReviewRecommendedPayload) (Intent, error) { @@ -226,3 +259,25 @@ func NewLobbyRaceNameRegisteredIntent(metadata Metadata, recipientUserID string, func NewLobbyRaceNameRegistrationDeniedIntent(metadata Metadata, recipientUserID string, payload LobbyRaceNameRegistrationDeniedPayload) (Intent, error) { return newIntent(NotificationTypeLobbyRaceNameRegistrationDenied, ProducerGameLobby, AudienceKindUser, []string{recipientUserID}, metadata, payload) } + +// NewRuntimeImagePullFailedIntent builds the administrator-email intent +// published by Runtime Manager when a start operation fails because the +// engine image cannot be pulled. 
+func NewRuntimeImagePullFailedIntent(metadata Metadata, payload RuntimeImagePullFailedPayload) (Intent, error) { + return newIntent(NotificationTypeRuntimeImagePullFailed, ProducerRuntimeManager, AudienceKindAdminEmail, nil, metadata, payload) +} + +// NewRuntimeContainerStartFailedIntent builds the administrator-email +// intent published by Runtime Manager when a start operation fails because +// `docker create` or `docker start` returns an error. +func NewRuntimeContainerStartFailedIntent(metadata Metadata, payload RuntimeContainerStartFailedPayload) (Intent, error) { + return newIntent(NotificationTypeRuntimeContainerStartFailed, ProducerRuntimeManager, AudienceKindAdminEmail, nil, metadata, payload) +} + +// NewRuntimeStartConfigInvalidIntent builds the administrator-email intent +// published by Runtime Manager when start configuration validation rejects +// the request (invalid image reference, missing Docker network, unwritable +// state directory). +func NewRuntimeStartConfigInvalidIntent(metadata Metadata, payload RuntimeStartConfigInvalidPayload) (Intent, error) { + return newIntent(NotificationTypeRuntimeStartConfigInvalid, ProducerRuntimeManager, AudienceKindAdminEmail, nil, metadata, payload) +} diff --git a/pkg/schema/fbs/lobby.fbs b/pkg/schema/fbs/lobby.fbs new file mode 100644 index 0000000..9671123 --- /dev/null +++ b/pkg/schema/fbs/lobby.fbs @@ -0,0 +1,58 @@ +// lobby contains FlatBuffers payloads used by the authenticated gateway +// boundary for Game Lobby. The wire shapes here mirror the trusted +// internal lobby REST surface; gateway derives the calling `user_id` +// from the verified session and forwards it as `X-User-Id` to lobby. +namespace lobby; + +// GameSummary stores one game record returned by `lobby.my.games.list`. +// The shape matches `lobby/openapi.yaml` `MyGameSummary`. +table GameSummary { + game_id:string; + game_name:string; + game_type:string; + status:string; + owner_user_id:string; + min_players:int32; + max_players:int32; + enrollment_ends_at_ms:int64; + created_at_ms:int64; + updated_at_ms:int64; +} + +// MyGamesListRequest stores the authenticated read request for the +// caller's games. Empty body — gateway derives identity from the +// authenticated session. +table MyGamesListRequest { +} + +// MyGamesListResponse stores the list of games the caller participates +// in. +table MyGamesListResponse { + items:[GameSummary]; +} + +// OpenEnrollmentRequest stores the owner-only command that transitions +// a game from `draft` to `enrollment_open`. +table OpenEnrollmentRequest { + game_id:string; +} + +// OpenEnrollmentResponse stores the resulting game status. +table OpenEnrollmentResponse { + game_id:string; + status:string; +} + +// ErrorBody stores the canonical lobby error envelope code/message +// pair. +table ErrorBody { + code:string; + message:string; +} + +// ErrorResponse wraps ErrorBody for the FlatBuffers payload boundary. +table ErrorResponse { + error:ErrorBody; +} + +root_type MyGamesListResponse; diff --git a/pkg/schema/fbs/lobby/ErrorBody.go b/pkg/schema/fbs/lobby/ErrorBody.go new file mode 100644 index 0000000..882f67e --- /dev/null +++ b/pkg/schema/fbs/lobby/ErrorBody.go @@ -0,0 +1,71 @@ +// Code generated by the FlatBuffers compiler. DO NOT EDIT. 
+ +package lobby + +import ( + flatbuffers "github.com/google/flatbuffers/go" +) + +type ErrorBody struct { + _tab flatbuffers.Table +} + +func GetRootAsErrorBody(buf []byte, offset flatbuffers.UOffsetT) *ErrorBody { + n := flatbuffers.GetUOffsetT(buf[offset:]) + x := &ErrorBody{} + x.Init(buf, n+offset) + return x +} + +func FinishErrorBodyBuffer(builder *flatbuffers.Builder, offset flatbuffers.UOffsetT) { + builder.Finish(offset) +} + +func GetSizePrefixedRootAsErrorBody(buf []byte, offset flatbuffers.UOffsetT) *ErrorBody { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &ErrorBody{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + +func FinishSizePrefixedErrorBodyBuffer(builder *flatbuffers.Builder, offset flatbuffers.UOffsetT) { + builder.FinishSizePrefixed(offset) +} + +func (rcv *ErrorBody) Init(buf []byte, i flatbuffers.UOffsetT) { + rcv._tab.Bytes = buf + rcv._tab.Pos = i +} + +func (rcv *ErrorBody) Table() flatbuffers.Table { + return rcv._tab +} + +func (rcv *ErrorBody) Code() []byte { + o := flatbuffers.UOffsetT(rcv._tab.Offset(4)) + if o != 0 { + return rcv._tab.ByteVector(o + rcv._tab.Pos) + } + return nil +} + +func (rcv *ErrorBody) Message() []byte { + o := flatbuffers.UOffsetT(rcv._tab.Offset(6)) + if o != 0 { + return rcv._tab.ByteVector(o + rcv._tab.Pos) + } + return nil +} + +func ErrorBodyStart(builder *flatbuffers.Builder) { + builder.StartObject(2) +} +func ErrorBodyAddCode(builder *flatbuffers.Builder, code flatbuffers.UOffsetT) { + builder.PrependUOffsetTSlot(0, flatbuffers.UOffsetT(code), 0) +} +func ErrorBodyAddMessage(builder *flatbuffers.Builder, message flatbuffers.UOffsetT) { + builder.PrependUOffsetTSlot(1, flatbuffers.UOffsetT(message), 0) +} +func ErrorBodyEnd(builder *flatbuffers.Builder) flatbuffers.UOffsetT { + return builder.EndObject() +} diff --git a/pkg/schema/fbs/lobby/ErrorResponse.go b/pkg/schema/fbs/lobby/ErrorResponse.go new file mode 100644 index 0000000..71094bd --- /dev/null +++ b/pkg/schema/fbs/lobby/ErrorResponse.go @@ -0,0 +1,65 @@ +// Code generated by the FlatBuffers compiler. DO NOT EDIT. 
+ +package lobby + +import ( + flatbuffers "github.com/google/flatbuffers/go" +) + +type ErrorResponse struct { + _tab flatbuffers.Table +} + +func GetRootAsErrorResponse(buf []byte, offset flatbuffers.UOffsetT) *ErrorResponse { + n := flatbuffers.GetUOffsetT(buf[offset:]) + x := &ErrorResponse{} + x.Init(buf, n+offset) + return x +} + +func FinishErrorResponseBuffer(builder *flatbuffers.Builder, offset flatbuffers.UOffsetT) { + builder.Finish(offset) +} + +func GetSizePrefixedRootAsErrorResponse(buf []byte, offset flatbuffers.UOffsetT) *ErrorResponse { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &ErrorResponse{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + +func FinishSizePrefixedErrorResponseBuffer(builder *flatbuffers.Builder, offset flatbuffers.UOffsetT) { + builder.FinishSizePrefixed(offset) +} + +func (rcv *ErrorResponse) Init(buf []byte, i flatbuffers.UOffsetT) { + rcv._tab.Bytes = buf + rcv._tab.Pos = i +} + +func (rcv *ErrorResponse) Table() flatbuffers.Table { + return rcv._tab +} + +func (rcv *ErrorResponse) Error(obj *ErrorBody) *ErrorBody { + o := flatbuffers.UOffsetT(rcv._tab.Offset(4)) + if o != 0 { + x := rcv._tab.Indirect(o + rcv._tab.Pos) + if obj == nil { + obj = new(ErrorBody) + } + obj.Init(rcv._tab.Bytes, x) + return obj + } + return nil +} + +func ErrorResponseStart(builder *flatbuffers.Builder) { + builder.StartObject(1) +} +func ErrorResponseAddError(builder *flatbuffers.Builder, error flatbuffers.UOffsetT) { + builder.PrependUOffsetTSlot(0, flatbuffers.UOffsetT(error), 0) +} +func ErrorResponseEnd(builder *flatbuffers.Builder) flatbuffers.UOffsetT { + return builder.EndObject() +} diff --git a/pkg/schema/fbs/lobby/GameSummary.go b/pkg/schema/fbs/lobby/GameSummary.go new file mode 100644 index 0000000..71958b0 --- /dev/null +++ b/pkg/schema/fbs/lobby/GameSummary.go @@ -0,0 +1,179 @@ +// Code generated by the FlatBuffers compiler. DO NOT EDIT. 
+ +package lobby + +import ( + flatbuffers "github.com/google/flatbuffers/go" +) + +type GameSummary struct { + _tab flatbuffers.Table +} + +func GetRootAsGameSummary(buf []byte, offset flatbuffers.UOffsetT) *GameSummary { + n := flatbuffers.GetUOffsetT(buf[offset:]) + x := &GameSummary{} + x.Init(buf, n+offset) + return x +} + +func FinishGameSummaryBuffer(builder *flatbuffers.Builder, offset flatbuffers.UOffsetT) { + builder.Finish(offset) +} + +func GetSizePrefixedRootAsGameSummary(buf []byte, offset flatbuffers.UOffsetT) *GameSummary { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &GameSummary{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + +func FinishSizePrefixedGameSummaryBuffer(builder *flatbuffers.Builder, offset flatbuffers.UOffsetT) { + builder.FinishSizePrefixed(offset) +} + +func (rcv *GameSummary) Init(buf []byte, i flatbuffers.UOffsetT) { + rcv._tab.Bytes = buf + rcv._tab.Pos = i +} + +func (rcv *GameSummary) Table() flatbuffers.Table { + return rcv._tab +} + +func (rcv *GameSummary) GameId() []byte { + o := flatbuffers.UOffsetT(rcv._tab.Offset(4)) + if o != 0 { + return rcv._tab.ByteVector(o + rcv._tab.Pos) + } + return nil +} + +func (rcv *GameSummary) GameName() []byte { + o := flatbuffers.UOffsetT(rcv._tab.Offset(6)) + if o != 0 { + return rcv._tab.ByteVector(o + rcv._tab.Pos) + } + return nil +} + +func (rcv *GameSummary) GameType() []byte { + o := flatbuffers.UOffsetT(rcv._tab.Offset(8)) + if o != 0 { + return rcv._tab.ByteVector(o + rcv._tab.Pos) + } + return nil +} + +func (rcv *GameSummary) Status() []byte { + o := flatbuffers.UOffsetT(rcv._tab.Offset(10)) + if o != 0 { + return rcv._tab.ByteVector(o + rcv._tab.Pos) + } + return nil +} + +func (rcv *GameSummary) OwnerUserId() []byte { + o := flatbuffers.UOffsetT(rcv._tab.Offset(12)) + if o != 0 { + return rcv._tab.ByteVector(o + rcv._tab.Pos) + } + return nil +} + +func (rcv *GameSummary) MinPlayers() int32 { + o := flatbuffers.UOffsetT(rcv._tab.Offset(14)) + if o != 0 { + return rcv._tab.GetInt32(o + rcv._tab.Pos) + } + return 0 +} + +func (rcv *GameSummary) MutateMinPlayers(n int32) bool { + return rcv._tab.MutateInt32Slot(14, n) +} + +func (rcv *GameSummary) MaxPlayers() int32 { + o := flatbuffers.UOffsetT(rcv._tab.Offset(16)) + if o != 0 { + return rcv._tab.GetInt32(o + rcv._tab.Pos) + } + return 0 +} + +func (rcv *GameSummary) MutateMaxPlayers(n int32) bool { + return rcv._tab.MutateInt32Slot(16, n) +} + +func (rcv *GameSummary) EnrollmentEndsAtMs() int64 { + o := flatbuffers.UOffsetT(rcv._tab.Offset(18)) + if o != 0 { + return rcv._tab.GetInt64(o + rcv._tab.Pos) + } + return 0 +} + +func (rcv *GameSummary) MutateEnrollmentEndsAtMs(n int64) bool { + return rcv._tab.MutateInt64Slot(18, n) +} + +func (rcv *GameSummary) CreatedAtMs() int64 { + o := flatbuffers.UOffsetT(rcv._tab.Offset(20)) + if o != 0 { + return rcv._tab.GetInt64(o + rcv._tab.Pos) + } + return 0 +} + +func (rcv *GameSummary) MutateCreatedAtMs(n int64) bool { + return rcv._tab.MutateInt64Slot(20, n) +} + +func (rcv *GameSummary) UpdatedAtMs() int64 { + o := flatbuffers.UOffsetT(rcv._tab.Offset(22)) + if o != 0 { + return rcv._tab.GetInt64(o + rcv._tab.Pos) + } + return 0 +} + +func (rcv *GameSummary) MutateUpdatedAtMs(n int64) bool { + return rcv._tab.MutateInt64Slot(22, n) +} + +func GameSummaryStart(builder *flatbuffers.Builder) { + builder.StartObject(10) +} +func GameSummaryAddGameId(builder *flatbuffers.Builder, gameId flatbuffers.UOffsetT) { + builder.PrependUOffsetTSlot(0, 
flatbuffers.UOffsetT(gameId), 0) +} +func GameSummaryAddGameName(builder *flatbuffers.Builder, gameName flatbuffers.UOffsetT) { + builder.PrependUOffsetTSlot(1, flatbuffers.UOffsetT(gameName), 0) +} +func GameSummaryAddGameType(builder *flatbuffers.Builder, gameType flatbuffers.UOffsetT) { + builder.PrependUOffsetTSlot(2, flatbuffers.UOffsetT(gameType), 0) +} +func GameSummaryAddStatus(builder *flatbuffers.Builder, status flatbuffers.UOffsetT) { + builder.PrependUOffsetTSlot(3, flatbuffers.UOffsetT(status), 0) +} +func GameSummaryAddOwnerUserId(builder *flatbuffers.Builder, ownerUserId flatbuffers.UOffsetT) { + builder.PrependUOffsetTSlot(4, flatbuffers.UOffsetT(ownerUserId), 0) +} +func GameSummaryAddMinPlayers(builder *flatbuffers.Builder, minPlayers int32) { + builder.PrependInt32Slot(5, minPlayers, 0) +} +func GameSummaryAddMaxPlayers(builder *flatbuffers.Builder, maxPlayers int32) { + builder.PrependInt32Slot(6, maxPlayers, 0) +} +func GameSummaryAddEnrollmentEndsAtMs(builder *flatbuffers.Builder, enrollmentEndsAtMs int64) { + builder.PrependInt64Slot(7, enrollmentEndsAtMs, 0) +} +func GameSummaryAddCreatedAtMs(builder *flatbuffers.Builder, createdAtMs int64) { + builder.PrependInt64Slot(8, createdAtMs, 0) +} +func GameSummaryAddUpdatedAtMs(builder *flatbuffers.Builder, updatedAtMs int64) { + builder.PrependInt64Slot(9, updatedAtMs, 0) +} +func GameSummaryEnd(builder *flatbuffers.Builder) flatbuffers.UOffsetT { + return builder.EndObject() +} diff --git a/pkg/schema/fbs/lobby/MyGamesListRequest.go b/pkg/schema/fbs/lobby/MyGamesListRequest.go new file mode 100644 index 0000000..c97e756 --- /dev/null +++ b/pkg/schema/fbs/lobby/MyGamesListRequest.go @@ -0,0 +1,49 @@ +// Code generated by the FlatBuffers compiler. DO NOT EDIT. + +package lobby + +import ( + flatbuffers "github.com/google/flatbuffers/go" +) + +type MyGamesListRequest struct { + _tab flatbuffers.Table +} + +func GetRootAsMyGamesListRequest(buf []byte, offset flatbuffers.UOffsetT) *MyGamesListRequest { + n := flatbuffers.GetUOffsetT(buf[offset:]) + x := &MyGamesListRequest{} + x.Init(buf, n+offset) + return x +} + +func FinishMyGamesListRequestBuffer(builder *flatbuffers.Builder, offset flatbuffers.UOffsetT) { + builder.Finish(offset) +} + +func GetSizePrefixedRootAsMyGamesListRequest(buf []byte, offset flatbuffers.UOffsetT) *MyGamesListRequest { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &MyGamesListRequest{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + +func FinishSizePrefixedMyGamesListRequestBuffer(builder *flatbuffers.Builder, offset flatbuffers.UOffsetT) { + builder.FinishSizePrefixed(offset) +} + +func (rcv *MyGamesListRequest) Init(buf []byte, i flatbuffers.UOffsetT) { + rcv._tab.Bytes = buf + rcv._tab.Pos = i +} + +func (rcv *MyGamesListRequest) Table() flatbuffers.Table { + return rcv._tab +} + +func MyGamesListRequestStart(builder *flatbuffers.Builder) { + builder.StartObject(0) +} +func MyGamesListRequestEnd(builder *flatbuffers.Builder) flatbuffers.UOffsetT { + return builder.EndObject() +} diff --git a/pkg/schema/fbs/lobby/MyGamesListResponse.go b/pkg/schema/fbs/lobby/MyGamesListResponse.go new file mode 100644 index 0000000..4d7ec38 --- /dev/null +++ b/pkg/schema/fbs/lobby/MyGamesListResponse.go @@ -0,0 +1,75 @@ +// Code generated by the FlatBuffers compiler. DO NOT EDIT. 
+ +package lobby + +import ( + flatbuffers "github.com/google/flatbuffers/go" +) + +type MyGamesListResponse struct { + _tab flatbuffers.Table +} + +func GetRootAsMyGamesListResponse(buf []byte, offset flatbuffers.UOffsetT) *MyGamesListResponse { + n := flatbuffers.GetUOffsetT(buf[offset:]) + x := &MyGamesListResponse{} + x.Init(buf, n+offset) + return x +} + +func FinishMyGamesListResponseBuffer(builder *flatbuffers.Builder, offset flatbuffers.UOffsetT) { + builder.Finish(offset) +} + +func GetSizePrefixedRootAsMyGamesListResponse(buf []byte, offset flatbuffers.UOffsetT) *MyGamesListResponse { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &MyGamesListResponse{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + +func FinishSizePrefixedMyGamesListResponseBuffer(builder *flatbuffers.Builder, offset flatbuffers.UOffsetT) { + builder.FinishSizePrefixed(offset) +} + +func (rcv *MyGamesListResponse) Init(buf []byte, i flatbuffers.UOffsetT) { + rcv._tab.Bytes = buf + rcv._tab.Pos = i +} + +func (rcv *MyGamesListResponse) Table() flatbuffers.Table { + return rcv._tab +} + +func (rcv *MyGamesListResponse) Items(obj *GameSummary, j int) bool { + o := flatbuffers.UOffsetT(rcv._tab.Offset(4)) + if o != 0 { + x := rcv._tab.Vector(o) + x += flatbuffers.UOffsetT(j) * 4 + x = rcv._tab.Indirect(x) + obj.Init(rcv._tab.Bytes, x) + return true + } + return false +} + +func (rcv *MyGamesListResponse) ItemsLength() int { + o := flatbuffers.UOffsetT(rcv._tab.Offset(4)) + if o != 0 { + return rcv._tab.VectorLen(o) + } + return 0 +} + +func MyGamesListResponseStart(builder *flatbuffers.Builder) { + builder.StartObject(1) +} +func MyGamesListResponseAddItems(builder *flatbuffers.Builder, items flatbuffers.UOffsetT) { + builder.PrependUOffsetTSlot(0, flatbuffers.UOffsetT(items), 0) +} +func MyGamesListResponseStartItemsVector(builder *flatbuffers.Builder, numElems int) flatbuffers.UOffsetT { + return builder.StartVector(4, numElems, 4) +} +func MyGamesListResponseEnd(builder *flatbuffers.Builder) flatbuffers.UOffsetT { + return builder.EndObject() +} diff --git a/pkg/schema/fbs/lobby/OpenEnrollmentRequest.go b/pkg/schema/fbs/lobby/OpenEnrollmentRequest.go new file mode 100644 index 0000000..931fc54 --- /dev/null +++ b/pkg/schema/fbs/lobby/OpenEnrollmentRequest.go @@ -0,0 +1,60 @@ +// Code generated by the FlatBuffers compiler. DO NOT EDIT. 
+ +package lobby + +import ( + flatbuffers "github.com/google/flatbuffers/go" +) + +type OpenEnrollmentRequest struct { + _tab flatbuffers.Table +} + +func GetRootAsOpenEnrollmentRequest(buf []byte, offset flatbuffers.UOffsetT) *OpenEnrollmentRequest { + n := flatbuffers.GetUOffsetT(buf[offset:]) + x := &OpenEnrollmentRequest{} + x.Init(buf, n+offset) + return x +} + +func FinishOpenEnrollmentRequestBuffer(builder *flatbuffers.Builder, offset flatbuffers.UOffsetT) { + builder.Finish(offset) +} + +func GetSizePrefixedRootAsOpenEnrollmentRequest(buf []byte, offset flatbuffers.UOffsetT) *OpenEnrollmentRequest { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &OpenEnrollmentRequest{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + +func FinishSizePrefixedOpenEnrollmentRequestBuffer(builder *flatbuffers.Builder, offset flatbuffers.UOffsetT) { + builder.FinishSizePrefixed(offset) +} + +func (rcv *OpenEnrollmentRequest) Init(buf []byte, i flatbuffers.UOffsetT) { + rcv._tab.Bytes = buf + rcv._tab.Pos = i +} + +func (rcv *OpenEnrollmentRequest) Table() flatbuffers.Table { + return rcv._tab +} + +func (rcv *OpenEnrollmentRequest) GameId() []byte { + o := flatbuffers.UOffsetT(rcv._tab.Offset(4)) + if o != 0 { + return rcv._tab.ByteVector(o + rcv._tab.Pos) + } + return nil +} + +func OpenEnrollmentRequestStart(builder *flatbuffers.Builder) { + builder.StartObject(1) +} +func OpenEnrollmentRequestAddGameId(builder *flatbuffers.Builder, gameId flatbuffers.UOffsetT) { + builder.PrependUOffsetTSlot(0, flatbuffers.UOffsetT(gameId), 0) +} +func OpenEnrollmentRequestEnd(builder *flatbuffers.Builder) flatbuffers.UOffsetT { + return builder.EndObject() +} diff --git a/pkg/schema/fbs/lobby/OpenEnrollmentResponse.go b/pkg/schema/fbs/lobby/OpenEnrollmentResponse.go new file mode 100644 index 0000000..e952cdd --- /dev/null +++ b/pkg/schema/fbs/lobby/OpenEnrollmentResponse.go @@ -0,0 +1,71 @@ +// Code generated by the FlatBuffers compiler. DO NOT EDIT. 
+ +package lobby + +import ( + flatbuffers "github.com/google/flatbuffers/go" +) + +type OpenEnrollmentResponse struct { + _tab flatbuffers.Table +} + +func GetRootAsOpenEnrollmentResponse(buf []byte, offset flatbuffers.UOffsetT) *OpenEnrollmentResponse { + n := flatbuffers.GetUOffsetT(buf[offset:]) + x := &OpenEnrollmentResponse{} + x.Init(buf, n+offset) + return x +} + +func FinishOpenEnrollmentResponseBuffer(builder *flatbuffers.Builder, offset flatbuffers.UOffsetT) { + builder.Finish(offset) +} + +func GetSizePrefixedRootAsOpenEnrollmentResponse(buf []byte, offset flatbuffers.UOffsetT) *OpenEnrollmentResponse { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &OpenEnrollmentResponse{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + +func FinishSizePrefixedOpenEnrollmentResponseBuffer(builder *flatbuffers.Builder, offset flatbuffers.UOffsetT) { + builder.FinishSizePrefixed(offset) +} + +func (rcv *OpenEnrollmentResponse) Init(buf []byte, i flatbuffers.UOffsetT) { + rcv._tab.Bytes = buf + rcv._tab.Pos = i +} + +func (rcv *OpenEnrollmentResponse) Table() flatbuffers.Table { + return rcv._tab +} + +func (rcv *OpenEnrollmentResponse) GameId() []byte { + o := flatbuffers.UOffsetT(rcv._tab.Offset(4)) + if o != 0 { + return rcv._tab.ByteVector(o + rcv._tab.Pos) + } + return nil +} + +func (rcv *OpenEnrollmentResponse) Status() []byte { + o := flatbuffers.UOffsetT(rcv._tab.Offset(6)) + if o != 0 { + return rcv._tab.ByteVector(o + rcv._tab.Pos) + } + return nil +} + +func OpenEnrollmentResponseStart(builder *flatbuffers.Builder) { + builder.StartObject(2) +} +func OpenEnrollmentResponseAddGameId(builder *flatbuffers.Builder, gameId flatbuffers.UOffsetT) { + builder.PrependUOffsetTSlot(0, flatbuffers.UOffsetT(gameId), 0) +} +func OpenEnrollmentResponseAddStatus(builder *flatbuffers.Builder, status flatbuffers.UOffsetT) { + builder.PrependUOffsetTSlot(1, flatbuffers.UOffsetT(status), 0) +} +func OpenEnrollmentResponseEnd(builder *flatbuffers.Builder) flatbuffers.UOffsetT { + return builder.EndObject() +} diff --git a/pkg/schema/fbs/lobby_generated.go b/pkg/schema/fbs/lobby_generated.go new file mode 100644 index 0000000..c626b1f --- /dev/null +++ b/pkg/schema/fbs/lobby_generated.go @@ -0,0 +1,522 @@ +// Code generated by the FlatBuffers compiler. DO NOT EDIT. 
+ +package lobby + +import ( + flatbuffers "github.com/google/flatbuffers/go" +) + +type GameSummary struct { + _tab flatbuffers.Table +} + +func GetRootAsGameSummary(buf []byte, offset flatbuffers.UOffsetT) *GameSummary { + n := flatbuffers.GetUOffsetT(buf[offset:]) + x := &GameSummary{} + x.Init(buf, n+offset) + return x +} + +func FinishGameSummaryBuffer(builder *flatbuffers.Builder, offset flatbuffers.UOffsetT) { + builder.Finish(offset) +} + +func GetSizePrefixedRootAsGameSummary(buf []byte, offset flatbuffers.UOffsetT) *GameSummary { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &GameSummary{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + +func FinishSizePrefixedGameSummaryBuffer(builder *flatbuffers.Builder, offset flatbuffers.UOffsetT) { + builder.FinishSizePrefixed(offset) +} + +func (rcv *GameSummary) Init(buf []byte, i flatbuffers.UOffsetT) { + rcv._tab.Bytes = buf + rcv._tab.Pos = i +} + +func (rcv *GameSummary) Table() flatbuffers.Table { + return rcv._tab +} + +func (rcv *GameSummary) GameId() []byte { + o := flatbuffers.UOffsetT(rcv._tab.Offset(4)) + if o != 0 { + return rcv._tab.ByteVector(o + rcv._tab.Pos) + } + return nil +} + +func (rcv *GameSummary) GameName() []byte { + o := flatbuffers.UOffsetT(rcv._tab.Offset(6)) + if o != 0 { + return rcv._tab.ByteVector(o + rcv._tab.Pos) + } + return nil +} + +func (rcv *GameSummary) GameType() []byte { + o := flatbuffers.UOffsetT(rcv._tab.Offset(8)) + if o != 0 { + return rcv._tab.ByteVector(o + rcv._tab.Pos) + } + return nil +} + +func (rcv *GameSummary) Status() []byte { + o := flatbuffers.UOffsetT(rcv._tab.Offset(10)) + if o != 0 { + return rcv._tab.ByteVector(o + rcv._tab.Pos) + } + return nil +} + +func (rcv *GameSummary) OwnerUserId() []byte { + o := flatbuffers.UOffsetT(rcv._tab.Offset(12)) + if o != 0 { + return rcv._tab.ByteVector(o + rcv._tab.Pos) + } + return nil +} + +func (rcv *GameSummary) MinPlayers() int32 { + o := flatbuffers.UOffsetT(rcv._tab.Offset(14)) + if o != 0 { + return rcv._tab.GetInt32(o + rcv._tab.Pos) + } + return 0 +} + +func (rcv *GameSummary) MutateMinPlayers(n int32) bool { + return rcv._tab.MutateInt32Slot(14, n) +} + +func (rcv *GameSummary) MaxPlayers() int32 { + o := flatbuffers.UOffsetT(rcv._tab.Offset(16)) + if o != 0 { + return rcv._tab.GetInt32(o + rcv._tab.Pos) + } + return 0 +} + +func (rcv *GameSummary) MutateMaxPlayers(n int32) bool { + return rcv._tab.MutateInt32Slot(16, n) +} + +func (rcv *GameSummary) EnrollmentEndsAtMs() int64 { + o := flatbuffers.UOffsetT(rcv._tab.Offset(18)) + if o != 0 { + return rcv._tab.GetInt64(o + rcv._tab.Pos) + } + return 0 +} + +func (rcv *GameSummary) MutateEnrollmentEndsAtMs(n int64) bool { + return rcv._tab.MutateInt64Slot(18, n) +} + +func (rcv *GameSummary) CreatedAtMs() int64 { + o := flatbuffers.UOffsetT(rcv._tab.Offset(20)) + if o != 0 { + return rcv._tab.GetInt64(o + rcv._tab.Pos) + } + return 0 +} + +func (rcv *GameSummary) MutateCreatedAtMs(n int64) bool { + return rcv._tab.MutateInt64Slot(20, n) +} + +func (rcv *GameSummary) UpdatedAtMs() int64 { + o := flatbuffers.UOffsetT(rcv._tab.Offset(22)) + if o != 0 { + return rcv._tab.GetInt64(o + rcv._tab.Pos) + } + return 0 +} + +func (rcv *GameSummary) MutateUpdatedAtMs(n int64) bool { + return rcv._tab.MutateInt64Slot(22, n) +} + +func GameSummaryStart(builder *flatbuffers.Builder) { + builder.StartObject(10) +} +func GameSummaryAddGameId(builder *flatbuffers.Builder, gameId flatbuffers.UOffsetT) { + builder.PrependUOffsetTSlot(0, 
flatbuffers.UOffsetT(gameId), 0) +} +func GameSummaryAddGameName(builder *flatbuffers.Builder, gameName flatbuffers.UOffsetT) { + builder.PrependUOffsetTSlot(1, flatbuffers.UOffsetT(gameName), 0) +} +func GameSummaryAddGameType(builder *flatbuffers.Builder, gameType flatbuffers.UOffsetT) { + builder.PrependUOffsetTSlot(2, flatbuffers.UOffsetT(gameType), 0) +} +func GameSummaryAddStatus(builder *flatbuffers.Builder, status flatbuffers.UOffsetT) { + builder.PrependUOffsetTSlot(3, flatbuffers.UOffsetT(status), 0) +} +func GameSummaryAddOwnerUserId(builder *flatbuffers.Builder, ownerUserId flatbuffers.UOffsetT) { + builder.PrependUOffsetTSlot(4, flatbuffers.UOffsetT(ownerUserId), 0) +} +func GameSummaryAddMinPlayers(builder *flatbuffers.Builder, minPlayers int32) { + builder.PrependInt32Slot(5, minPlayers, 0) +} +func GameSummaryAddMaxPlayers(builder *flatbuffers.Builder, maxPlayers int32) { + builder.PrependInt32Slot(6, maxPlayers, 0) +} +func GameSummaryAddEnrollmentEndsAtMs(builder *flatbuffers.Builder, enrollmentEndsAtMs int64) { + builder.PrependInt64Slot(7, enrollmentEndsAtMs, 0) +} +func GameSummaryAddCreatedAtMs(builder *flatbuffers.Builder, createdAtMs int64) { + builder.PrependInt64Slot(8, createdAtMs, 0) +} +func GameSummaryAddUpdatedAtMs(builder *flatbuffers.Builder, updatedAtMs int64) { + builder.PrependInt64Slot(9, updatedAtMs, 0) +} +func GameSummaryEnd(builder *flatbuffers.Builder) flatbuffers.UOffsetT { + return builder.EndObject() +} +type MyGamesListRequest struct { + _tab flatbuffers.Table +} + +func GetRootAsMyGamesListRequest(buf []byte, offset flatbuffers.UOffsetT) *MyGamesListRequest { + n := flatbuffers.GetUOffsetT(buf[offset:]) + x := &MyGamesListRequest{} + x.Init(buf, n+offset) + return x +} + +func FinishMyGamesListRequestBuffer(builder *flatbuffers.Builder, offset flatbuffers.UOffsetT) { + builder.Finish(offset) +} + +func GetSizePrefixedRootAsMyGamesListRequest(buf []byte, offset flatbuffers.UOffsetT) *MyGamesListRequest { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &MyGamesListRequest{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + +func FinishSizePrefixedMyGamesListRequestBuffer(builder *flatbuffers.Builder, offset flatbuffers.UOffsetT) { + builder.FinishSizePrefixed(offset) +} + +func (rcv *MyGamesListRequest) Init(buf []byte, i flatbuffers.UOffsetT) { + rcv._tab.Bytes = buf + rcv._tab.Pos = i +} + +func (rcv *MyGamesListRequest) Table() flatbuffers.Table { + return rcv._tab +} + +func MyGamesListRequestStart(builder *flatbuffers.Builder) { + builder.StartObject(0) +} +func MyGamesListRequestEnd(builder *flatbuffers.Builder) flatbuffers.UOffsetT { + return builder.EndObject() +} +type MyGamesListResponse struct { + _tab flatbuffers.Table +} + +func GetRootAsMyGamesListResponse(buf []byte, offset flatbuffers.UOffsetT) *MyGamesListResponse { + n := flatbuffers.GetUOffsetT(buf[offset:]) + x := &MyGamesListResponse{} + x.Init(buf, n+offset) + return x +} + +func FinishMyGamesListResponseBuffer(builder *flatbuffers.Builder, offset flatbuffers.UOffsetT) { + builder.Finish(offset) +} + +func GetSizePrefixedRootAsMyGamesListResponse(buf []byte, offset flatbuffers.UOffsetT) *MyGamesListResponse { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &MyGamesListResponse{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + +func FinishSizePrefixedMyGamesListResponseBuffer(builder *flatbuffers.Builder, offset flatbuffers.UOffsetT) { + builder.FinishSizePrefixed(offset) +} + +func (rcv 
*MyGamesListResponse) Init(buf []byte, i flatbuffers.UOffsetT) { + rcv._tab.Bytes = buf + rcv._tab.Pos = i +} + +func (rcv *MyGamesListResponse) Table() flatbuffers.Table { + return rcv._tab +} + +func (rcv *MyGamesListResponse) Items(obj *GameSummary, j int) bool { + o := flatbuffers.UOffsetT(rcv._tab.Offset(4)) + if o != 0 { + x := rcv._tab.Vector(o) + x += flatbuffers.UOffsetT(j) * 4 + x = rcv._tab.Indirect(x) + obj.Init(rcv._tab.Bytes, x) + return true + } + return false +} + +func (rcv *MyGamesListResponse) ItemsLength() int { + o := flatbuffers.UOffsetT(rcv._tab.Offset(4)) + if o != 0 { + return rcv._tab.VectorLen(o) + } + return 0 +} + +func MyGamesListResponseStart(builder *flatbuffers.Builder) { + builder.StartObject(1) +} +func MyGamesListResponseAddItems(builder *flatbuffers.Builder, items flatbuffers.UOffsetT) { + builder.PrependUOffsetTSlot(0, flatbuffers.UOffsetT(items), 0) +} +func MyGamesListResponseStartItemsVector(builder *flatbuffers.Builder, numElems int) flatbuffers.UOffsetT { + return builder.StartVector(4, numElems, 4) +} +func MyGamesListResponseEnd(builder *flatbuffers.Builder) flatbuffers.UOffsetT { + return builder.EndObject() +} +type OpenEnrollmentRequest struct { + _tab flatbuffers.Table +} + +func GetRootAsOpenEnrollmentRequest(buf []byte, offset flatbuffers.UOffsetT) *OpenEnrollmentRequest { + n := flatbuffers.GetUOffsetT(buf[offset:]) + x := &OpenEnrollmentRequest{} + x.Init(buf, n+offset) + return x +} + +func FinishOpenEnrollmentRequestBuffer(builder *flatbuffers.Builder, offset flatbuffers.UOffsetT) { + builder.Finish(offset) +} + +func GetSizePrefixedRootAsOpenEnrollmentRequest(buf []byte, offset flatbuffers.UOffsetT) *OpenEnrollmentRequest { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &OpenEnrollmentRequest{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + +func FinishSizePrefixedOpenEnrollmentRequestBuffer(builder *flatbuffers.Builder, offset flatbuffers.UOffsetT) { + builder.FinishSizePrefixed(offset) +} + +func (rcv *OpenEnrollmentRequest) Init(buf []byte, i flatbuffers.UOffsetT) { + rcv._tab.Bytes = buf + rcv._tab.Pos = i +} + +func (rcv *OpenEnrollmentRequest) Table() flatbuffers.Table { + return rcv._tab +} + +func (rcv *OpenEnrollmentRequest) GameId() []byte { + o := flatbuffers.UOffsetT(rcv._tab.Offset(4)) + if o != 0 { + return rcv._tab.ByteVector(o + rcv._tab.Pos) + } + return nil +} + +func OpenEnrollmentRequestStart(builder *flatbuffers.Builder) { + builder.StartObject(1) +} +func OpenEnrollmentRequestAddGameId(builder *flatbuffers.Builder, gameId flatbuffers.UOffsetT) { + builder.PrependUOffsetTSlot(0, flatbuffers.UOffsetT(gameId), 0) +} +func OpenEnrollmentRequestEnd(builder *flatbuffers.Builder) flatbuffers.UOffsetT { + return builder.EndObject() +} +type OpenEnrollmentResponse struct { + _tab flatbuffers.Table +} + +func GetRootAsOpenEnrollmentResponse(buf []byte, offset flatbuffers.UOffsetT) *OpenEnrollmentResponse { + n := flatbuffers.GetUOffsetT(buf[offset:]) + x := &OpenEnrollmentResponse{} + x.Init(buf, n+offset) + return x +} + +func FinishOpenEnrollmentResponseBuffer(builder *flatbuffers.Builder, offset flatbuffers.UOffsetT) { + builder.Finish(offset) +} + +func GetSizePrefixedRootAsOpenEnrollmentResponse(buf []byte, offset flatbuffers.UOffsetT) *OpenEnrollmentResponse { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &OpenEnrollmentResponse{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + +func 
FinishSizePrefixedOpenEnrollmentResponseBuffer(builder *flatbuffers.Builder, offset flatbuffers.UOffsetT) { + builder.FinishSizePrefixed(offset) +} + +func (rcv *OpenEnrollmentResponse) Init(buf []byte, i flatbuffers.UOffsetT) { + rcv._tab.Bytes = buf + rcv._tab.Pos = i +} + +func (rcv *OpenEnrollmentResponse) Table() flatbuffers.Table { + return rcv._tab +} + +func (rcv *OpenEnrollmentResponse) GameId() []byte { + o := flatbuffers.UOffsetT(rcv._tab.Offset(4)) + if o != 0 { + return rcv._tab.ByteVector(o + rcv._tab.Pos) + } + return nil +} + +func (rcv *OpenEnrollmentResponse) Status() []byte { + o := flatbuffers.UOffsetT(rcv._tab.Offset(6)) + if o != 0 { + return rcv._tab.ByteVector(o + rcv._tab.Pos) + } + return nil +} + +func OpenEnrollmentResponseStart(builder *flatbuffers.Builder) { + builder.StartObject(2) +} +func OpenEnrollmentResponseAddGameId(builder *flatbuffers.Builder, gameId flatbuffers.UOffsetT) { + builder.PrependUOffsetTSlot(0, flatbuffers.UOffsetT(gameId), 0) +} +func OpenEnrollmentResponseAddStatus(builder *flatbuffers.Builder, status flatbuffers.UOffsetT) { + builder.PrependUOffsetTSlot(1, flatbuffers.UOffsetT(status), 0) +} +func OpenEnrollmentResponseEnd(builder *flatbuffers.Builder) flatbuffers.UOffsetT { + return builder.EndObject() +} +type ErrorBody struct { + _tab flatbuffers.Table +} + +func GetRootAsErrorBody(buf []byte, offset flatbuffers.UOffsetT) *ErrorBody { + n := flatbuffers.GetUOffsetT(buf[offset:]) + x := &ErrorBody{} + x.Init(buf, n+offset) + return x +} + +func FinishErrorBodyBuffer(builder *flatbuffers.Builder, offset flatbuffers.UOffsetT) { + builder.Finish(offset) +} + +func GetSizePrefixedRootAsErrorBody(buf []byte, offset flatbuffers.UOffsetT) *ErrorBody { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &ErrorBody{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + +func FinishSizePrefixedErrorBodyBuffer(builder *flatbuffers.Builder, offset flatbuffers.UOffsetT) { + builder.FinishSizePrefixed(offset) +} + +func (rcv *ErrorBody) Init(buf []byte, i flatbuffers.UOffsetT) { + rcv._tab.Bytes = buf + rcv._tab.Pos = i +} + +func (rcv *ErrorBody) Table() flatbuffers.Table { + return rcv._tab +} + +func (rcv *ErrorBody) Code() []byte { + o := flatbuffers.UOffsetT(rcv._tab.Offset(4)) + if o != 0 { + return rcv._tab.ByteVector(o + rcv._tab.Pos) + } + return nil +} + +func (rcv *ErrorBody) Message() []byte { + o := flatbuffers.UOffsetT(rcv._tab.Offset(6)) + if o != 0 { + return rcv._tab.ByteVector(o + rcv._tab.Pos) + } + return nil +} + +func ErrorBodyStart(builder *flatbuffers.Builder) { + builder.StartObject(2) +} +func ErrorBodyAddCode(builder *flatbuffers.Builder, code flatbuffers.UOffsetT) { + builder.PrependUOffsetTSlot(0, flatbuffers.UOffsetT(code), 0) +} +func ErrorBodyAddMessage(builder *flatbuffers.Builder, message flatbuffers.UOffsetT) { + builder.PrependUOffsetTSlot(1, flatbuffers.UOffsetT(message), 0) +} +func ErrorBodyEnd(builder *flatbuffers.Builder) flatbuffers.UOffsetT { + return builder.EndObject() +} +type ErrorResponse struct { + _tab flatbuffers.Table +} + +func GetRootAsErrorResponse(buf []byte, offset flatbuffers.UOffsetT) *ErrorResponse { + n := flatbuffers.GetUOffsetT(buf[offset:]) + x := &ErrorResponse{} + x.Init(buf, n+offset) + return x +} + +func FinishErrorResponseBuffer(builder *flatbuffers.Builder, offset flatbuffers.UOffsetT) { + builder.Finish(offset) +} + +func GetSizePrefixedRootAsErrorResponse(buf []byte, offset flatbuffers.UOffsetT) *ErrorResponse { + n := 
flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &ErrorResponse{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + +func FinishSizePrefixedErrorResponseBuffer(builder *flatbuffers.Builder, offset flatbuffers.UOffsetT) { + builder.FinishSizePrefixed(offset) +} + +func (rcv *ErrorResponse) Init(buf []byte, i flatbuffers.UOffsetT) { + rcv._tab.Bytes = buf + rcv._tab.Pos = i +} + +func (rcv *ErrorResponse) Table() flatbuffers.Table { + return rcv._tab +} + +func (rcv *ErrorResponse) Error(obj *ErrorBody) *ErrorBody { + o := flatbuffers.UOffsetT(rcv._tab.Offset(4)) + if o != 0 { + x := rcv._tab.Indirect(o + rcv._tab.Pos) + if obj == nil { + obj = new(ErrorBody) + } + obj.Init(rcv._tab.Bytes, x) + return obj + } + return nil +} + +func ErrorResponseStart(builder *flatbuffers.Builder) { + builder.StartObject(1) +} +func ErrorResponseAddError(builder *flatbuffers.Builder, error flatbuffers.UOffsetT) { + builder.PrependUOffsetTSlot(0, flatbuffers.UOffsetT(error), 0) +} +func ErrorResponseEnd(builder *flatbuffers.Builder) flatbuffers.UOffsetT { + return builder.EndObject() +} diff --git a/pkg/transcoder/lobby.go b/pkg/transcoder/lobby.go new file mode 100644 index 0000000..478c9fa --- /dev/null +++ b/pkg/transcoder/lobby.go @@ -0,0 +1,256 @@ +package transcoder + +import ( + "errors" + "fmt" + "time" + + lobbymodel "galaxy/model/lobby" + lobbyfbs "galaxy/schema/fbs/lobby" + + flatbuffers "github.com/google/flatbuffers/go" +) + +// MyGamesListRequestToPayload converts a typed lobbymodel.MyGamesListRequest +// to FlatBuffers bytes suitable for the authenticated gateway transport. +func MyGamesListRequestToPayload(request *lobbymodel.MyGamesListRequest) ([]byte, error) { + if request == nil { + return nil, errors.New("encode my games list request payload: request is nil") + } + + builder := flatbuffers.NewBuilder(32) + lobbyfbs.MyGamesListRequestStart(builder) + offset := lobbyfbs.MyGamesListRequestEnd(builder) + lobbyfbs.FinishMyGamesListRequestBuffer(builder, offset) + + return builder.FinishedBytes(), nil +} + +// PayloadToMyGamesListRequest converts FlatBuffers payload bytes into +// lobbymodel.MyGamesListRequest. +func PayloadToMyGamesListRequest(data []byte) (result *lobbymodel.MyGamesListRequest, err error) { + if len(data) == 0 { + return nil, errors.New("decode my games list request payload: data is empty") + } + + defer recoverLobbyDecodePanic("decode my games list request payload", &result, &err) + + _ = lobbyfbs.GetRootAsMyGamesListRequest(data, 0) + return &lobbymodel.MyGamesListRequest{}, nil +} + +// MyGamesListResponseToPayload converts lobbymodel.MyGamesListResponse to +// FlatBuffers bytes suitable for the authenticated gateway transport. 
+func MyGamesListResponseToPayload(response *lobbymodel.MyGamesListResponse) ([]byte, error) { + if response == nil { + return nil, errors.New("encode my games list response payload: response is nil") + } + + builder := flatbuffers.NewBuilder(512) + + itemOffsets := make([]flatbuffers.UOffsetT, len(response.Items)) + for index := range response.Items { + itemOffsets[index] = encodeGameSummary(builder, response.Items[index]) + } + + var itemsVector flatbuffers.UOffsetT + if len(itemOffsets) > 0 { + lobbyfbs.MyGamesListResponseStartItemsVector(builder, len(itemOffsets)) + for index := len(itemOffsets) - 1; index >= 0; index-- { + builder.PrependUOffsetT(itemOffsets[index]) + } + itemsVector = builder.EndVector(len(itemOffsets)) + } + + lobbyfbs.MyGamesListResponseStart(builder) + if itemsVector != 0 { + lobbyfbs.MyGamesListResponseAddItems(builder, itemsVector) + } + offset := lobbyfbs.MyGamesListResponseEnd(builder) + lobbyfbs.FinishMyGamesListResponseBuffer(builder, offset) + + return builder.FinishedBytes(), nil +} + +// PayloadToMyGamesListResponse converts FlatBuffers payload bytes into +// lobbymodel.MyGamesListResponse. +func PayloadToMyGamesListResponse(data []byte) (result *lobbymodel.MyGamesListResponse, err error) { + if len(data) == 0 { + return nil, errors.New("decode my games list response payload: data is empty") + } + + defer recoverLobbyDecodePanic("decode my games list response payload", &result, &err) + + response := lobbyfbs.GetRootAsMyGamesListResponse(data, 0) + out := &lobbymodel.MyGamesListResponse{ + Items: make([]lobbymodel.GameSummary, 0, response.ItemsLength()), + } + + summary := new(lobbyfbs.GameSummary) + for index := 0; index < response.ItemsLength(); index++ { + if !response.Items(summary, index) { + return nil, fmt.Errorf("decode my games list response payload: items[%d] is missing", index) + } + out.Items = append(out.Items, decodeGameSummary(summary)) + } + return out, nil +} + +// OpenEnrollmentRequestToPayload converts lobbymodel.OpenEnrollmentRequest to +// FlatBuffers bytes suitable for the authenticated gateway transport. +func OpenEnrollmentRequestToPayload(request *lobbymodel.OpenEnrollmentRequest) ([]byte, error) { + if request == nil { + return nil, errors.New("encode open enrollment request payload: request is nil") + } + + builder := flatbuffers.NewBuilder(64) + gameID := builder.CreateString(request.GameID) + + lobbyfbs.OpenEnrollmentRequestStart(builder) + lobbyfbs.OpenEnrollmentRequestAddGameId(builder, gameID) + offset := lobbyfbs.OpenEnrollmentRequestEnd(builder) + lobbyfbs.FinishOpenEnrollmentRequestBuffer(builder, offset) + + return builder.FinishedBytes(), nil +} + +// PayloadToOpenEnrollmentRequest converts FlatBuffers payload bytes into +// lobbymodel.OpenEnrollmentRequest. +func PayloadToOpenEnrollmentRequest(data []byte) (result *lobbymodel.OpenEnrollmentRequest, err error) { + if len(data) == 0 { + return nil, errors.New("decode open enrollment request payload: data is empty") + } + + defer recoverLobbyDecodePanic("decode open enrollment request payload", &result, &err) + + request := lobbyfbs.GetRootAsOpenEnrollmentRequest(data, 0) + return &lobbymodel.OpenEnrollmentRequest{ + GameID: string(request.GameId()), + }, nil +} + +// OpenEnrollmentResponseToPayload converts lobbymodel.OpenEnrollmentResponse to +// FlatBuffers bytes suitable for the authenticated gateway transport. 
+func OpenEnrollmentResponseToPayload(response *lobbymodel.OpenEnrollmentResponse) ([]byte, error) { + if response == nil { + return nil, errors.New("encode open enrollment response payload: response is nil") + } + + builder := flatbuffers.NewBuilder(64) + gameID := builder.CreateString(response.GameID) + status := builder.CreateString(response.Status) + + lobbyfbs.OpenEnrollmentResponseStart(builder) + lobbyfbs.OpenEnrollmentResponseAddGameId(builder, gameID) + lobbyfbs.OpenEnrollmentResponseAddStatus(builder, status) + offset := lobbyfbs.OpenEnrollmentResponseEnd(builder) + lobbyfbs.FinishOpenEnrollmentResponseBuffer(builder, offset) + + return builder.FinishedBytes(), nil +} + +// PayloadToOpenEnrollmentResponse converts FlatBuffers payload bytes into +// lobbymodel.OpenEnrollmentResponse. +func PayloadToOpenEnrollmentResponse(data []byte) (result *lobbymodel.OpenEnrollmentResponse, err error) { + if len(data) == 0 { + return nil, errors.New("decode open enrollment response payload: data is empty") + } + + defer recoverLobbyDecodePanic("decode open enrollment response payload", &result, &err) + + response := lobbyfbs.GetRootAsOpenEnrollmentResponse(data, 0) + return &lobbymodel.OpenEnrollmentResponse{ + GameID: string(response.GameId()), + Status: string(response.Status()), + }, nil +} + +// LobbyErrorResponseToPayload converts lobbymodel.ErrorResponse to FlatBuffers +// bytes suitable for the authenticated gateway transport. +func LobbyErrorResponseToPayload(response *lobbymodel.ErrorResponse) ([]byte, error) { + if response == nil { + return nil, errors.New("encode lobby error response payload: response is nil") + } + + builder := flatbuffers.NewBuilder(128) + code := builder.CreateString(response.Error.Code) + message := builder.CreateString(response.Error.Message) + + lobbyfbs.ErrorBodyStart(builder) + lobbyfbs.ErrorBodyAddCode(builder, code) + lobbyfbs.ErrorBodyAddMessage(builder, message) + errorOffset := lobbyfbs.ErrorBodyEnd(builder) + + lobbyfbs.ErrorResponseStart(builder) + lobbyfbs.ErrorResponseAddError(builder, errorOffset) + offset := lobbyfbs.ErrorResponseEnd(builder) + lobbyfbs.FinishErrorResponseBuffer(builder, offset) + + return builder.FinishedBytes(), nil +} + +// PayloadToLobbyErrorResponse converts FlatBuffers payload bytes into +// lobbymodel.ErrorResponse. 
+func PayloadToLobbyErrorResponse(data []byte) (result *lobbymodel.ErrorResponse, err error) { + if len(data) == 0 { + return nil, errors.New("decode lobby error response payload: data is empty") + } + + defer recoverLobbyDecodePanic("decode lobby error response payload", &result, &err) + + response := lobbyfbs.GetRootAsErrorResponse(data, 0) + body := response.Error(nil) + if body == nil { + return nil, errors.New("decode lobby error response payload: error is missing") + } + + return &lobbymodel.ErrorResponse{ + Error: lobbymodel.ErrorBody{ + Code: string(body.Code()), + Message: string(body.Message()), + }, + }, nil +} + +func encodeGameSummary(builder *flatbuffers.Builder, summary lobbymodel.GameSummary) flatbuffers.UOffsetT { + gameID := builder.CreateString(summary.GameID) + gameName := builder.CreateString(summary.GameName) + gameType := builder.CreateString(summary.GameType) + status := builder.CreateString(summary.Status) + ownerUserID := builder.CreateString(summary.OwnerUserID) + + lobbyfbs.GameSummaryStart(builder) + lobbyfbs.GameSummaryAddGameId(builder, gameID) + lobbyfbs.GameSummaryAddGameName(builder, gameName) + lobbyfbs.GameSummaryAddGameType(builder, gameType) + lobbyfbs.GameSummaryAddStatus(builder, status) + lobbyfbs.GameSummaryAddOwnerUserId(builder, ownerUserID) + lobbyfbs.GameSummaryAddMinPlayers(builder, int32(summary.MinPlayers)) + lobbyfbs.GameSummaryAddMaxPlayers(builder, int32(summary.MaxPlayers)) + lobbyfbs.GameSummaryAddEnrollmentEndsAtMs(builder, summary.EnrollmentEndsAt.UTC().UnixMilli()) + lobbyfbs.GameSummaryAddCreatedAtMs(builder, summary.CreatedAt.UTC().UnixMilli()) + lobbyfbs.GameSummaryAddUpdatedAtMs(builder, summary.UpdatedAt.UTC().UnixMilli()) + return lobbyfbs.GameSummaryEnd(builder) +} + +func decodeGameSummary(summary *lobbyfbs.GameSummary) lobbymodel.GameSummary { + return lobbymodel.GameSummary{ + GameID: string(summary.GameId()), + GameName: string(summary.GameName()), + GameType: string(summary.GameType()), + Status: string(summary.Status()), + OwnerUserID: string(summary.OwnerUserId()), + MinPlayers: int(summary.MinPlayers()), + MaxPlayers: int(summary.MaxPlayers()), + EnrollmentEndsAt: time.UnixMilli(summary.EnrollmentEndsAtMs()).UTC(), + CreatedAt: time.UnixMilli(summary.CreatedAtMs()).UTC(), + UpdatedAt: time.UnixMilli(summary.UpdatedAtMs()).UTC(), + } +} + +func recoverLobbyDecodePanic[T any](message string, result **T, err *error) { + if recovered := recover(); recovered != nil { + *result = nil + *err = fmt.Errorf("%s: panic recovered: %v", message, recovered) + } +} diff --git a/rtmanager/Makefile b/rtmanager/Makefile new file mode 100644 index 0000000..613ccf9 --- /dev/null +++ b/rtmanager/Makefile @@ -0,0 +1,28 @@ +# Makefile for galaxy/rtmanager. +# +# The `jet` target regenerates the go-jet/v2 query-builder code under +# internal/adapters/postgres/jet/ against a transient PostgreSQL container +# brought up by cmd/jetgen. Generated code is committed. +# +# The `mocks` target regenerates the gomock-driven mocks via the +# //go:generate directives that live next to the interfaces they cover: +# - internal/ports/ — port interfaces (Stage 12) +# - internal/api/internalhttp/handlers/ — REST handler service ports (Stage 16) +# Generated code is committed. +# +# The `integration` target runs the service-local end-to-end suite +# under integration/. It requires a reachable Docker daemon +# (`/var/run/docker.sock` or `DOCKER_HOST`); without one the helpers +# in integration/harness call t.Skip and the tests are no-ops. 
+
+.PHONY: jet mocks integration
+
+jet:
+	go run ./cmd/jetgen
+
+mocks:
+	go generate ./internal/ports/...
+	go generate ./internal/api/internalhttp/handlers/...
+
+integration:
+	go test -tags=integration -count=1 ./integration/...
diff --git a/rtmanager/PLAN.md b/rtmanager/PLAN.md
new file mode 100644
index 0000000..df0750b
--- /dev/null
+++ b/rtmanager/PLAN.md
@@ -0,0 +1,1022 @@
+# Runtime Manager Implementation Plan
+
+This plan has already been implemented and stays here for historical reasons.
+
+It should NOT be treated as a source of truth for service functionality.
+
+## Summary
+
+This plan delivers `Runtime Manager` (RTM), the only Galaxy service with direct Docker access.
+It owns container lifecycle (start, stop, restart, patch, cleanup), three-source health
+monitoring, and a synchronous internal REST surface used by `Game Master` and `Admin Service`.
+`Game Lobby` continues to drive RTM asynchronously through Redis Streams.
+
+The plan also delivers the upstream changes that RTM depends on: a new `image_ref` field in
+the start envelope and a `reason` field in the stop envelope produced by Lobby; a `/healthz`
+endpoint, `Dockerfile`, and `STORAGE_PATH` / `GAME_STATE_PATH` contract on `galaxy/game`; new
+admin-only notification types in the catalog plus matching constructors in
+`galaxy/notificationintent`.
+
+The architectural rules behind every decision are recorded in
+[`./README.md`](./README.md). This file describes the order in which the implementation
+lands.
+
+## Global Rules
+
+- Documentation always lands before contracts; contracts before code.
+- Each stage leaves the repository in a buildable, test-green state. No stage relies on a
+  later stage to fix a regression it introduced.
+- Existing-service refactors (Lobby publisher, Notification catalog, Game engine) are
+  full-fledged stages of this plan; they precede every RTM stage that depends on them.
+- RTM never resolves engine versions. Producer supplies `image_ref`. RTM never deletes the
+  host state directory. RTM never kills containers it does not own a record for.
+- Every functional change ships its tests in the same stage. Contract tests freeze
+  operation IDs and stream message names from Stage 04 onward.
+- All code, docs, and identifiers are written in English.
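+
+For orientation, the start and stop envelopes that Stage 06 produces and Stage 15 consumes
+can be sketched as the `XADD` field maps below. This is a shape illustration only; the helper
+names and wiring are illustrative, and the real publisher lands in
+`lobby/internal/adapters/runtimemanager/publisher.go`.
+
+```go
+package sketch
+
+import (
+	"context"
+	"time"
+
+	"github.com/redis/go-redis/v9"
+)
+
+// publishStartJob sketches the runtime:start_jobs envelope; RTM never resolves
+// engine versions, so image_ref arrives fully resolved from the producer.
+func publishStartJob(ctx context.Context, rdb *redis.Client, gameID, imageRef string) error {
+	return rdb.XAdd(ctx, &redis.XAddArgs{
+		Stream: "runtime:start_jobs",
+		Values: map[string]any{
+			"game_id":         gameID,
+			"image_ref":       imageRef,
+			"requested_at_ms": time.Now().UnixMilli(),
+		},
+	}).Err()
+}
+
+// publishStopJob sketches the runtime:stop_jobs envelope; reason is one of
+// orphan_cleanup | cancelled | finished | admin_request | timeout.
+func publishStopJob(ctx context.Context, rdb *redis.Client, gameID, reason string) error {
+	return rdb.XAdd(ctx, &redis.XAddArgs{
+		Stream: "runtime:stop_jobs",
+		Values: map[string]any{
+			"game_id":         gameID,
+			"reason":          reason,
+			"requested_at_ms": time.Now().UnixMilli(),
+		},
+	}).Err()
+}
+```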
+ +## Suggested Module Structure + +```text +rtmanager/ +├── cmd/ +│ ├── rtmanager/ +│ │ └── main.go +│ └── jetgen/ +│ └── main.go +│ +├── internal/ +│ ├── app/ +│ │ ├── app.go +│ │ ├── runtime.go +│ │ ├── wiring.go +│ │ └── bootstrap.go +│ │ +│ ├── config/ +│ │ ├── config.go +│ │ ├── env.go +│ │ └── validation.go +│ │ +│ ├── logging/ +│ │ ├── logger.go +│ │ └── context.go +│ │ +│ ├── telemetry/ +│ │ └── runtime.go +│ │ +│ ├── domain/ +│ │ ├── runtime/ +│ │ │ ├── model.go +│ │ │ └── transitions.go +│ │ ├── operation/ +│ │ │ └── log.go +│ │ └── health/ +│ │ └── snapshot.go +│ │ +│ ├── ports/ +│ │ ├── runtimerecordstore.go +│ │ ├── operationlogstore.go +│ │ ├── healthsnapshotstore.go +│ │ ├── streamoffsetstore.go +│ │ ├── dockerclient.go +│ │ ├── lobbyinternal.go +│ │ └── notificationintents.go +│ │ +│ ├── adapters/ +│ │ ├── postgres/ +│ │ │ ├── migrations/ +│ │ │ ├── jet/ +│ │ │ ├── runtimerecordstore/ +│ │ │ ├── operationlogstore/ +│ │ │ └── healthsnapshotstore/ +│ │ ├── redisstate/ +│ │ │ └── streamoffsets/ +│ │ ├── docker/ +│ │ │ ├── client.go +│ │ │ └── mocks/ +│ │ ├── lobbyclient/ +│ │ ├── notificationpublisher/ +│ │ ├── jobresultspublisher/ +│ │ └── healtheventspublisher/ +│ │ +│ ├── service/ +│ │ ├── startruntime/ +│ │ ├── stopruntime/ +│ │ ├── restartruntime/ +│ │ ├── patchruntime/ +│ │ └── cleanupcontainer/ +│ │ +│ ├── worker/ +│ │ ├── startjobsconsumer/ +│ │ ├── stopjobsconsumer/ +│ │ ├── dockerevents/ +│ │ ├── healthprobe/ +│ │ ├── dockerinspect/ +│ │ ├── reconcile/ +│ │ └── containercleanup/ +│ │ +│ └── api/ +│ └── internalhttp/ +│ ├── server.go +│ └── handlers/ +│ +├── api/ +│ ├── internal-openapi.yaml +│ ├── runtime-jobs-asyncapi.yaml +│ └── runtime-health-asyncapi.yaml +│ +├── integration/ +│ ├── harness/ +│ ├── lifecycle_test.go +│ ├── replay_test.go +│ ├── health_test.go +│ └── notification_test.go +│ +├── docs/ +│ ├── README.md +│ ├── runtime.md +│ ├── flows.md +│ ├── runbook.md +│ ├── examples.md +│ └── postgres-migration.md +│ +├── README.md +├── PLAN.md +├── Makefile +└── go.mod +``` + +## ~~Stage 01.~~ Update `ARCHITECTURE.md` + +Status: implemented. + +Goal: + +- align the project-wide source of truth with every decision recorded in + [`./README.md`](./README.md) before any code change touches it. + +Tasks: + +- Expand `ARCHITECTURE.md` §9 (Runtime Manager) with subsections: container model + (`galaxy-game-{game_id}` DNS naming, bind-mount ABI, network prerequisite), image policy + (producer-supplied `image_ref`), state ownership rule (RTM never deletes the host state + directory), reconcile policy (adopt unrecorded containers, never kill them). +- Update §«Fixed asynchronous interactions»: note the `image_ref` field on `Lobby → RTM`, + add the `runtime:health_events` outbound stream, add `Runtime Manager → Notification + Service` for admin alerts. +- Update §«Fixed synchronous interactions»: add `Game Master → Runtime Manager` and + `Admin Service → Runtime Manager` for REST inspect / restart / patch / stop / cleanup, and + remove the corresponding async entries. +- Update §«Persistence Backends»: add `rtmanager` schema to the schema-per-service list and + to PG-backed services. +- Update §«Configuration»: add `RTMANAGER` to the env-var prefix list with the same shape + rules as other PG/Redis-backed services. +- Update §«Recommended Order of Service Implementation» entry 7 with the now-fixed scope + (start, stop, restart, patch, inspect, health monitoring). + +Files touched: + +- `ARCHITECTURE.md`. 
+ +Exit criteria: + +- every later RTM, Lobby, Notification, or Game stage can quote its rules from + `ARCHITECTURE.md` without re-deciding them. + +## ~~Stage 02.~~ Freeze RTM `README.md` + +Status: implemented as part of this planning task — see [`./README.md`](./README.md). + +Goal: + +- publish the complete service description so contracts and code can reference one source. + +Tasks: + +- Write `rtmanager/README.md` covering Purpose, Scope, Non-Goals, Position in the System, + Responsibility Boundaries, Container Model, Runtime Surface, Lifecycles, Health Monitoring, + Reconciliation, Trusted Surfaces, Async Stream Contracts, Notification Contracts, + Persistence Layout, Error Model, Configuration, Observability, Verification. + +Exit criteria: + +- a reviewer can answer any «what does RTM do when X» question by reading the README alone. + +## ~~Stage 03.~~ Sync existing-service docs (Lobby, Notification, Game) + +Status: implemented. + +Goal: + +- bring the READMEs of every touched service into agreement with the RTM contract before any + code in those services changes. + +Tasks: + +- `lobby/README.md`: update Game Start Flow — start envelope is now `{game_id, image_ref, + requested_at_ms}`. Add `LOBBY_ENGINE_IMAGE_TEMPLATE` to the Configuration section. + Document the new stop envelope `reason` enum + (`orphan_cleanup | cancelled | finished | admin_request | timeout`). Note that the + Lobby ↔ RTM transport stays asynchronous indefinitely. +- `lobby/PLAN.md`: append a single closing note that runtime-job envelope changes belong to + the Runtime Manager plan; no new stages added there. +- `notification/README.md`: add three admin notification types to the catalog + (`runtime.image_pull_failed`, `runtime.container_start_failed`, + `runtime.start_config_invalid`), each `email`-only with audience admin in v1. +- `notification/PLAN.md`: append a closing note pointing at the Runtime Manager plan for the + catalog extension. +- `game/README.md` (create if absent): document the new `/healthz` endpoint, the + `STORAGE_PATH` / `GAME_STATE_PATH` env contract, and the new `Dockerfile` location. + +Files touched: + +- `lobby/README.md`, `lobby/PLAN.md`, `notification/README.md`, `notification/PLAN.md`, + `game/README.md`. + +Exit criteria: + +- every doc in the repo agrees on the post-RTM contract; no contradiction remains between + any two READMEs. + +## ~~Stage 04.~~ RTM contract files and contract tests + +Status: implemented. + +Goal: + +- ship machine-readable contracts before any RTM handler is written, so the implementation + has a target spec. + +Tasks: + +- `rtmanager/api/internal-openapi.yaml`: every internal REST endpoint with request and + response schemas; error envelope `{ "error": { "code", "message" } }` identical to Lobby. + Operation IDs: + `internalListRuntimes`, `internalGetRuntime`, `internalStartRuntime`, + `internalStopRuntime`, `internalRestartRuntime`, `internalPatchRuntime`, + `internalCleanupRuntimeContainer`, `internalHealthz`, `internalReadyz`. +- `rtmanager/api/runtime-jobs-asyncapi.yaml`: AsyncAPI 2.6.0 spec for `runtime:start_jobs`, + `runtime:stop_jobs`, `runtime:job_results`. Frozen field set per-message. +- `rtmanager/api/runtime-health-asyncapi.yaml`: AsyncAPI 2.6.0 spec for + `runtime:health_events` with the `event_type` enum and `details` polymorphic schema + (`oneOf` per type). 
+- `rtmanager/contract_openapi_test.go` and `rtmanager/contract_asyncapi_test.go`: load specs + via `kin-openapi` (and the AsyncAPI loader pattern from `notification/contract_asyncapi_test.go`), + assert operation IDs / message names / field presence. + +Files new: + +- the four files above. + +Exit criteria: + +- all three specs validate; contract tests pass; tests fail loudly if any operation ID, + message name, or required field disappears. + +## ~~Stage 05.~~ Game engine `/healthz`, `Dockerfile`, `STORAGE_PATH` + +Status: implemented. + +Goal: + +- make `galaxy/game` runnable as the test engine image RTM uses in integration tests. + +Tasks: + +- Add `GET /healthz` to `game/internal/router` returning `{"status":"ok"}` (200) when the + engine process is up, irrespective of whether a game has been initialised. The existing + `/api/v1/status` keeps its current `501` behaviour for an uninitialised engine. +- Make engine read storage path from `STORAGE_PATH` env, falling back to `GAME_STATE_PATH` + when set. Both names are accepted; `GAME_STATE_PATH` is the contract RTM writes. +- Update `game/cmd/http/main.go` to bind the env. +- Add `galaxy/game/Dockerfile`: multi-stage (golang builder + small runtime base). Exposes + `:8080`. Default `STORAGE_PATH=/var/lib/galaxy-game`. Copies the binary. Runs as non-root + user. +- Add image labels to the `Dockerfile`: `com.galaxy.cpu_quota=1.0`, `com.galaxy.memory=512m`, + `com.galaxy.pids_limit=512`, `org.opencontainers.image.title=galaxy-game-engine`. +- Update `game/openapi.yaml` to document `/healthz`. +- Update `game/openapi_contract_test.go` to assert `/healthz` presence. + +Files new: + +- `galaxy/game/Dockerfile`. + +Files touched: + +- `galaxy/game/internal/router/*.go`, `galaxy/game/cmd/http/main.go`, + `galaxy/game/openapi.yaml`, `galaxy/game/openapi_contract_test.go`. + +Exit criteria: + +- `docker build -t galaxy/game:test -f game/Dockerfile .` (run from the workspace + root) succeeds. The build context is the workspace root because `game/` resolves + `galaxy/{model,error,util,...}` through `go.work` `replace` directives; see + `rtmanager/docs/game-dockerfile-build-context.md`. +- `docker run --rm -e STORAGE_PATH=/tmp/x -p 8080:8080 galaxy/game:test` answers + `/healthz` with `200`. +- `go test ./game/...` passes. + +## ~~Stage 06.~~ Lobby publisher refactor + +Status: implemented. + +Goal: + +- ship the new `runtime:start_jobs` and `runtime:stop_jobs` envelopes from Lobby. After this + stage Lobby is RTM-ready; the real RTM appears in Stage 13 onwards. + +Tasks: + +- Add `LOBBY_ENGINE_IMAGE_TEMPLATE` (default `galaxy/game:{engine_version}`) and validation + to `lobby/internal/config/config.go` and `env.go`. +- Build `lobby/internal/domain/engineimage/resolver.go` that turns + `(template, target_engine_version)` into `image_ref`, validating both inputs. Reject + templates without `{engine_version}`; reject empty engine versions. +- `lobby/internal/ports/runtimemanager.go`: change interface to + `PublishStartJob(ctx, gameID, imageRef string) error` and + `PublishStopJob(ctx, gameID string, reason StopReason) error` with a `StopReason` enum + (`orphan_cleanup`, `cancelled`, `finished`, `admin_request`, `timeout`) declared in the + same package. +- `lobby/internal/adapters/runtimemanager/publisher.go`: write the new fields into the + `XADD` payload. +- Update callers: + - `lobby/internal/service/startgame/`: resolve `image_ref` from the loaded game record, + pass to `PublishStartJob`. 
+ - `lobby/internal/worker/runtimejobresult/consumer.go`: pass + `reason=orphan_cleanup` to `PublishStopJob` from the orphan-container path. +- Update Lobby unit tests (publisher, services) and contract tests (if Lobby has any + describing the runtime envelopes; otherwise add `TestPublisherStartJobIncludesImageRef` + and `TestPublisherStopJobIncludesReason`). + +Files new: + +- `lobby/internal/domain/engineimage/resolver.go` and its test file. + +Files touched: + +- the Lobby files listed above. + +Exit criteria: + +- `go test ./lobby/...` passes. +- An `XADD` against the start stream contains the `image_ref` field; an `XADD` against the + stop stream contains the `reason` field. + +## ~~Stage 07.~~ Notification intent constructors and catalog extension + +Status: implemented. + +Goal: + +- expose three admin-only notification types so RTM (Stage 13 onwards) can publish them + without later cross-cutting refactors. + +Tasks: + +- Add constructors and payload structs to `galaxy/notificationintent/`: + - `NewRuntimeImagePullFailedIntent(meta, payload)`, + - `NewRuntimeContainerStartFailedIntent(meta, payload)`, + - `NewRuntimeStartConfigInvalidIntent(meta, payload)`. + Each payload includes `game_id`, `image_ref`, `error_code`, `error_message`, + `attempted_at_ms`. +- Extend `notification/api/intents-asyncapi.yaml` with the three new payload schemas and + add them to the catalog. +- Extend the notification routing tables (data only — no service code) so the existing + routing rules cover the new types: delivery decision `email`-only, audience admin. +- Extend `notification/contract_asyncapi_test.go` to freeze the new message names and + payload required fields. + +Files touched: + +- `galaxy/notificationintent/*.go`, +- `notification/api/intents-asyncapi.yaml`, +- notification catalog data tables (locations defined inside `notification/internal/...`), +- `notification/contract_asyncapi_test.go`. + +Exit criteria: + +- unit tests for the new constructors pass. +- AsyncAPI validates. +- Notification's existing integration suites still pass with the new types added. + +## ~~Stage 08.~~ RTM module skeleton + +Status: implemented. + +Goal: + +- create a buildable `rtmanager` binary that loads config, opens dependencies, and exits + cleanly on SIGTERM. It does no business work yet. + +Tasks: + +- `rtmanager/cmd/rtmanager/main.go` mirroring `lobby/cmd/lobby/main.go`. +- `rtmanager/internal/config/{config.go, env.go, validation.go}` with env prefix `RTMANAGER` + and groups Listener, Docker, Postgres, Redis, Streams, Container defaults, Health, + Cleanup, Coordination, Lobby internal client, Logging, Lifecycle, Telemetry. Required + variables fail-fast. +- `rtmanager/internal/logging/{logger.go, context.go}` copied from lobby/notification. +- `rtmanager/internal/telemetry/runtime.go` registering the metrics named in + `README.md §Observability`. +- `rtmanager/internal/app/{runtime.go, app.go, wiring.go, bootstrap.go}` — empty wiring with + PostgreSQL open, Redis open, Docker client open (ping only), telemetry open, probe + listener open. +- `rtmanager/internal/api/internalhttp/server.go` — listener with `/healthz` and `/readyz` + only. +- `rtmanager/Makefile` with the `jet` target (real generation lands in Stage 09). 
+- `rtmanager/go.mod` and `go.sum` with dependencies: `github.com/docker/docker`, + `github.com/redis/go-redis/v9`, `github.com/jackc/pgx/v5`, `github.com/go-jet/jet/v2`, + `github.com/pressly/goose/v3`, `github.com/stretchr/testify`, the testcontainers modules + for postgres / redis / docker, and the OpenTelemetry stack identical to lobby. +- Update repo-level `go.work` to include `./rtmanager`. + +Files new: + +- the entire skeleton tree. + +Exit criteria: + +- `go build ./rtmanager/cmd/rtmanager` succeeds. +- Running with valid env brings `/healthz` and `/readyz` up. +- `SIGTERM` returns within `RTMANAGER_SHUTDOWN_TIMEOUT`. + +## ~~Stage 09.~~ PostgreSQL schema, migrations, jet + +Status: implemented. + +Goal: + +- finalise the persistence schema and the code-generation pipeline. + +Tasks: + +- `internal/adapters/postgres/migrations/00001_init.sql` — `CREATE SCHEMA IF NOT EXISTS + rtmanager;` plus the three tables and indexes from `README.md §Persistence Layout`. +- `internal/adapters/postgres/migrations/migrations.go` — `//go:embed *.sql` and `FS()` + exporter, identical pattern to lobby. +- `cmd/jetgen/main.go` — testcontainers PostgreSQL + goose up + jet generation against the + resulting database. Mirrors `lobby/cmd/jetgen/main.go`. +- Generated `internal/adapters/postgres/jet/...` committed to the repo. +- Wire goose migrations into `internal/app/runtime.go` startup so they apply before any + listener opens; non-zero exit on failure (matches `pkg/postgres` policy). + +Files new: + +- as above. + +Exit criteria: + +- `make -C rtmanager jet` regenerates the jet code with no diff after a clean run. +- Service start applies migrations to a fresh database and exits zero if migrations are + already applied. + +## ~~Stage 10.~~ Domain layer and ports + +Status: implemented. + +Goal: + +- lock the in-memory domain model and the port interfaces for adapters. + +Tasks: + +- `internal/domain/runtime/model.go` — `RuntimeRecord` struct, status enum + (`StatusRunning`, `StatusStopped`, `StatusRemoved`), error sentinels. +- `internal/domain/runtime/transitions.go` — allowed transitions table and a CAS-friendly + validator. +- `internal/domain/operation/log.go` — `OpKind`, `OpSource`, `Outcome` enums plus the + `OperationEntry` struct. +- `internal/domain/health/snapshot.go` — `HealthEventType` enum, `HealthSnapshot` struct. +- `internal/ports/`: + - `runtimerecordstore.go` — `Get`, `Upsert`, `UpdateStatus` (CAS by + `current_container_id`), `ListByStatus`. + - `operationlogstore.go` — `Append`, `ListByGame`. + - `healthsnapshotstore.go` — `Upsert`, `Get`. + - `streamoffsetstore.go` — `Load`, `Save` (Redis offset persistence per consumer label). + - `dockerclient.go` — narrow surface RTM uses: `EnsureNetwork`, `PullImage`, `Inspect`, + `Run`, `Stop`, `Remove`, `List`, `EventsListen`. (`Logs` reserved; not in v1.) + - `lobbyinternal.go` — `GetGame(ctx, gameID) (LobbyGameRecord, error)`. + - `notificationintents.go` — `Publish(ctx, intent) error`. + +Files new: + +- as above. + +Exit criteria: + +- the package compiles. +- every interface has a `_ ports.X = (*Y)(nil)` assertion slot ready for the adapters that + follow. + +## ~~Stage 11.~~ Persistence adapters + +Status: implemented. Decision record: +[`docs/stage11-persistence-adapters.md`](docs/stage11-persistence-adapters.md). + +Goal: + +- implement the three PostgreSQL stores and the Redis offset store. + +Tasks: + +- `internal/adapters/postgres/runtimerecordstore/store.go` using jet. +- `internal/adapters/postgres/operationlogstore/store.go`. 
+- `internal/adapters/postgres/healthsnapshotstore/store.go`. +- `internal/adapters/redisstate/streamoffsets/store.go` (mirror Lobby's + `redisstate/streamoffsets`). +- For each adapter: store-level integration tests against testcontainers PostgreSQL or + Redis. CAS semantics on `runtime_records.UpdateStatus` are verified by an explicit + concurrent-update test (only one of two callers wins). + +Files new: + +- as above and per-package `_test.go`. + +Exit criteria: + +- store tests pass on a CI runner with Docker available. + +## ~~Stage 12.~~ Docker adapter and external clients + +Status: implemented. Decision record: +[`docs/stage12-docker-and-clients.md`](docs/stage12-docker-and-clients.md). + +Goal: + +- ship the Docker SDK adapter and the external HTTP clients for Lobby internal API and + notification publishing. + +Tasks: + +- `internal/adapters/docker/client.go` — implements `ports.DockerClient` over + `github.com/docker/docker/client`. Behaviour: + - `EnsureNetwork` validates the configured network's presence (no creation). + - `PullImage` honours the configured pull policy. + - `Inspect` returns image and container metadata in domain-friendly shape. + - `Run` builds the create + start sequence with labels, env (`GAME_STATE_PATH`, + `STORAGE_PATH`), bind mount, log driver, resource limits read from image labels with + config fallback. + - `Stop` calls `ContainerStop` with the configured timeout. + - `Remove` calls `ContainerRemove`. + - `List` filters by `label=com.galaxy.owner=rtmanager`. + - `EventsListen` returns a typed channel of decoded events. +- `internal/adapters/docker/mocks/` — `mockgen`-generated mock for `ports.DockerClient`, + used by service tests. +- `internal/adapters/lobbyclient/client.go` — REST client over an `otelhttp`-wrapped + `http.Client` for `GET /api/v1/internal/games/{game_id}`. Returns `LobbyGameRecord`. +- `internal/adapters/notificationpublisher/publisher.go` — wraps + `galaxy/notificationintent` plus `redis.XAdd` against `notification:intents`. +- Per-adapter unit tests with mocks. A small testcontainers Docker smoke test guarded by + build tag `rtmanager_docker_smoke` until Stage 19 promotes it to default. + +Files new: + +- as above. + +Exit criteria: + +- mocks regenerate cleanly via `go generate`. +- unit tests pass. +- the smoke test passes on a runner with Docker available. + +## ~~Stage 13.~~ Service: start + +Status: implemented. Decision record: +[`docs/stage13-start-service.md`](docs/stage13-start-service.md). + +Goal: + +- end-to-end `start` operation in the service layer, callable from both the async consumer + and the REST handler in later stages. + +Tasks: + +- `internal/service/startruntime/service.go` orchestrator: + 1. Acquire game-id lease (Redis). + 2. Read `runtime_records`. If `running` with same `image_ref`, return idempotent success + with `error_code=replay_no_op`. + 3. Optionally fetch `LobbyGameRecord` for ancillary fields; in v1 only `image_ref` is + required, so this fetch is a no-op except for diagnostics. + 4. Pull image (per policy), inspect labels for resource limits. + 5. Ensure the per-game state directory exists with the configured mode and ownership. + 6. `docker run` with the configured network, hostname, labels, env, bind mount, log + driver, resource limits. + 7. Upsert `runtime_records` (`status=running`, `current_container_id`, `engine_endpoint`, + `current_image_ref`, `started_at`, `last_op_at`). + 8. Append `operation_log` entry (`op_kind=start`, `outcome=success`, `op_source` from + caller). + 9. 
Publish `runtime:health_events` `container_started`. + 10. Return success outcome to caller (consumer publishes `job_result`, REST returns 200). +- Failure paths in the table from `README.md §Lifecycles → Start`. Each failure path: + - rolls back any partially created Docker resource; + - publishes the matching admin-only notification intent; + - records `operation_log` with `outcome=failure` and the stable error code; + - returns failure to the caller. +- Unit tests cover happy path, idempotent re-start, each failure mode, lease conflict, and + partial-rollback paths. + +Files new: + +- `service/startruntime/{service.go, service_test.go, errors.go}`. + +Exit criteria: + +- service-level tests pass. + +## ~~Stage 14.~~ Service: stop, restart, patch, cleanup + +Status: implemented. Decision record: +[`docs/stage14-stop-restart-patch-cleanup.md`](docs/stage14-stop-restart-patch-cleanup.md). + +Goal: + +- the remaining four lifecycle operations, sharing helpers with `start`. + +Tasks: + +- `internal/service/stopruntime/service.go` — graceful `docker stop` with timeout, record + `stopped` state. Idempotent re-stop returns success no-op. +- `internal/service/restartruntime/service.go` — orchestrate `stopruntime` then + `startruntime` with the current `image_ref`. Same Redis lease shared across both inner + operations. Records a single `operation_log` entry with `op_kind=restart` plus a + correlation id linking it to the implicit start/stop entries. +- `internal/service/patchruntime/service.go` — restart with a new `image_ref`. Validates the + semver-patch-only rule (major and minor must equal current version; otherwise return + `semver_patch_only` failure). If the engine version is not parseable as semver, return + `image_ref_not_semver`. +- `internal/service/cleanupcontainer/service.go` — `docker rm` for an already-stopped + container; refuses if `status=running`. Sets `runtime_records.status=removed`. +- The Redis lease covers each operation end-to-end; restart and patch hold the lease across + the inner stop+start to prevent races. +- Unit tests for each service. Cross-operation race tests assert that concurrent start vs. + stop on the same `game_id` either succeed in some order or both observe the lease and + one returns conflict. + +Files new: + +- `service/{stopruntime, restartruntime, patchruntime, cleanupcontainer}/...`. + +Exit criteria: + +- service-level tests pass. + +## ~~Stage 15.~~ Async consumers and `runtime:job_results` publisher + +Status: implemented. Decision record: +[`docs/stage15-async-consumers.md`](docs/stage15-async-consumers.md). + +Goal: + +- wire the Lobby-side stream contract into the freshly built service layer. + +Tasks: + +- `internal/worker/startjobsconsumer/consumer.go` — XREAD over `runtime:start_jobs`, + decodes envelope `{game_id, image_ref, requested_at_ms}`, calls `startruntime` service, + publishes `runtime:job_results` with the canonical schema, advances the Redis offset. + Mirrors patterns from `lobby/internal/worker/runtimejobresult/consumer.go`. +- `internal/worker/stopjobsconsumer/consumer.go` — XREAD over `runtime:stop_jobs`, decodes + `{game_id, reason, requested_at_ms}`, calls `stopruntime`. +- `internal/adapters/jobresultspublisher/publisher.go` — small XADD wrapper for + `runtime:job_results`. +- Replay safety: deterministic «already running» / «already stopped» idempotent outcomes + surface as `outcome=success` with `error_code=replay_no_op`. +- Tests use `miniredis` and a fake `ports.DockerClient`. 
A consumer integration test drives + a full Lobby → RTM → Lobby roundtrip end-to-end. + +Files new: + +- as above + tests. + +Exit criteria: + +- consumer integration test passes. + +## ~~Stage 16.~~ Internal REST handlers + +Status: implemented. Decision record: +[`docs/stage16-internal-rest-handlers.md`](docs/stage16-internal-rest-handlers.md). + +Goal: + +- ship the GM/Admin-facing REST surface backed by the service layer. + +Tasks: + +- `internal/api/internalhttp/handlers/{list, get, start, stop, restart, patch, cleanup}.go` + — one file per operation, each delegating to the corresponding service. JSON in / JSON + out. Unknown JSON fields rejected with `invalid_request`. +- Error envelope identical to lobby: `{ "error": { "code", "message" } }`. Stable codes: + `invalid_request`, `not_found`, `conflict`, `service_unavailable`, `internal_error`, + `image_ref_not_semver`, `semver_patch_only`, `image_pull_failed`, + `container_start_failed`, `start_config_invalid`, `docker_unavailable`. +- Wiring under the existing internal HTTP listener; route registration in + `internal/app/wiring.go`. +- Handler-level table-driven tests; OpenAPI conformance test that loads + `api/internal-openapi.yaml` and asserts every defined operation is reachable and matches + its declared response. + +Files new: + +- handlers + tests. + +Exit criteria: + +- OpenAPI conformance test passes for every endpoint. +- Handlers reject unknown JSON fields. + +## ~~Stage 17.~~ Health monitoring + +Status: implemented. Decision record: +[`docs/stage17-health-monitoring.md`](docs/stage17-health-monitoring.md). + +Goal: + +- observability of running containers via the three sources from `README.md §Health + Monitoring`. + +Tasks: + +- `internal/worker/dockerevents/listener.go` — subscribes to Docker events with the + `com.galaxy.owner=rtmanager` label filter, looks up `runtime_records` by labels, emits + `runtime:health_events` for `container_exited`, `container_oom`, + `container_disappeared`. `container_started` is emitted directly by the start service + (Stage 13) when it runs the container. +- `internal/worker/healthprobe/worker.go` — periodic worker iterating + `runtime_records.status=running`. Calls `GET {engine_endpoint}/healthz` with the + configured timeout, applies the `RTMANAGER_PROBE_FAILURES_THRESHOLD` hysteresis, emits + `probe_failed` / `probe_recovered`. Uses `otelhttp` client. +- `internal/worker/dockerinspect/worker.go` — periodic full inspect; emits + `inspect_unhealthy` on observed `RestartCount` growth or unexpected status. +- `internal/adapters/healtheventspublisher/publisher.go` — XADD wrapper for + `runtime:health_events`. Always also upserts the latest snapshot into `health_snapshots`. + +Files new: + +- as above + tests. + +Exit criteria: + +- worker tests use a Docker mock that programmatically emits events and asserts the + published stream entries match the AsyncAPI spec. + +## ~~Stage 18.~~ Reconciler and container cleanup + +Status: implemented. Decision record: +[`docs/stage18-reconcile-and-cleanup.md`](docs/stage18-reconcile-and-cleanup.md). + +Goal: + +- drift management and TTL-based cleanup. + +Tasks: + +- `internal/worker/reconcile/reconciler.go` — runs at startup (blocking before workers + start) and periodically (`RTMANAGER_RECONCILE_INTERVAL`). 
Implements the rules from + `README.md §Reconciliation`: + - record running containers without a PG record, never kill them + (`op_kind=reconcile_adopt`); + - mark `runtime_records.status=running` rows whose container is missing as `removed`, + publish `container_disappeared` (`op_kind=reconcile_dispose`). +- `internal/worker/containercleanup/worker.go` — periodic worker + (`RTMANAGER_CLEANUP_INTERVAL`) that lists `runtime_records` with `status=stopped` and + `last_op_at < now - RTMANAGER_CONTAINER_RETENTION_DAYS`, calls + `cleanupcontainer` service for each. +- Both workers are registered as `app.Component`s in `internal/app/wiring.go`. + +Files new: + +- as above + tests. + +Exit criteria: + +- reconciler test using mocked Docker proves both adopt and dispose paths. +- cleanup test proves TTL math with a fake clock. + +## ~~Stage 19.~~ Service-local integration suite + +Status: implemented. Decision record: +[`docs/stage19-integration.md`](docs/stage19-integration.md). + +Goal: + +- end-to-end suite running against testcontainers PostgreSQL + Redis + the real Docker + daemon, using the freshly-built `galaxy/game` test image. + +Tasks: + +- `rtmanager/integration/harness/` — set up PostgreSQL with goose-applied migrations; + Redis (miniredis is sufficient for stream-only suites; testcontainers Redis for + coordination suites that exercise leases); ensure the Docker bridge network exists; build + `galaxy/game` test image once per package run with `sync.Once`; tear everything down via + `t.Cleanup`. +- `rtmanager/integration/lifecycle_test.go` — start → inspect → stop → restart → patch → + cleanup against the real engine; assert each step's PG, Redis-stream, and Docker + side-effects. Engine state directories are created via `t.ArtifactDir()`. +- `rtmanager/integration/replay_test.go` — duplicate start/stop messages are no-ops with + `error_code=replay_no_op`. +- `rtmanager/integration/health_test.go` — kill the engine container externally; assert + `container_disappeared` event publishes within timeout. Bring it back with a manual + `docker run`; assert the reconciler adopts it. +- `rtmanager/integration/notification_test.go` — drive a start with an unresolvable image + ref; assert RTM publishes the `runtime.image_pull_failed` notification intent and a + `failure` job_result. + +Files new: + +- as above. + +Exit criteria: + +- `go test ./rtmanager/integration/...` passes locally with Docker available. +- CI runs the suite under a profile that exposes the Docker socket. + +## ~~Stage 20.~~ Inter-service test: Lobby ↔ RTM + +Status: implemented. Decision record: +[`docs/stage20-lobbyrtm.md`](docs/stage20-lobbyrtm.md). + +Goal: + +- satisfy the `TESTING.md §7` inter-service requirement with real Lobby + real RTM. + +Tasks: + +- `integration/lobbyrtm/` (top-level integration directory, mirroring existing + `integration/notificationgateway`, etc.): runs real Lobby, real RTM, real PostgreSQL, + real Redis, and the `galaxy/game` test engine container. +- Scenarios: + - Lobby creates a game, publishes a start_job with `image_ref`, RTM starts the engine, + publishes `job_result`, Lobby transitions the game to `running`. The engine answers + `/healthz`. + - Lobby transitions a game to `cancelled`, publishes `stop_job` with `reason=cancelled`, + RTM stops the engine. RTM `operation_log` records the transition. + - Failure path: `image_ref` points at a missing image. RTM publishes a `failure` + `job_result` and the matching notification intent. Lobby transitions the game to + `start_failed`. 
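+
+For the first scenario, the suite's wait-for-result step might look roughly like the sketch
+below. All names here are hypothetical; the real harness and assertions live under
+`integration/lobbyrtm/`.
+
+```go
+package lobbyrtm
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"testing"
+	"time"
+
+	"github.com/redis/go-redis/v9"
+)
+
+// waitForJobResult polls runtime:job_results until an entry for gameID appears
+// or the deadline passes. Hypothetical helper, shown only to illustrate how the
+// scenario observes RTM's outcome.
+func waitForJobResult(t *testing.T, rdb *redis.Client, gameID string) map[string]any {
+	t.Helper()
+	lastID := "0"
+	deadline := time.Now().Add(30 * time.Second)
+	for time.Now().Before(deadline) {
+		streams, err := rdb.XRead(context.Background(), &redis.XReadArgs{
+			Streams: []string{"runtime:job_results", lastID},
+			Block:   time.Second,
+		}).Result()
+		if err != nil && !errors.Is(err, redis.Nil) {
+			t.Fatalf("read runtime:job_results: %v", err)
+		}
+		for _, stream := range streams {
+			for _, msg := range stream.Messages {
+				lastID = msg.ID
+				if fmt.Sprint(msg.Values["game_id"]) == gameID {
+					return msg.Values
+				}
+			}
+		}
+	}
+	t.Fatalf("no job_result for game %s before deadline", gameID)
+	return nil
+}
+```
+
+The happy-path scenario then asserts `outcome=success` on the returned entry and checks the
+engine's `/healthz`.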
+ +Files new: + +- as above. + +Exit criteria: + +- all scenarios pass in CI when the Docker socket is available. + +## ~~Stage 21.~~ Service-local docs + +Status: implemented. + +Goal: + +- drop per-stage decisions captured during this plan into discoverable service-local + documentation, mirroring `lobby/docs/`. + +Tasks: + +- `docs/README.md` — index pointing at the four content docs and the postgres-migration + record. +- `docs/runtime.md` — components, processes, in-memory state of each worker. +- `docs/flows.md` — mermaid diagrams for: start happy path, start failure (image pull), + start failure (orphan), stop, restart, patch, cleanup TTL, reconcile drift adopt, health + probe hysteresis. +- `docs/runbook.md` — operator scenarios: «engine container died», «patch upgrade», «manual + cleanup», «reconcile drift after Docker daemon restart», «testing locally». +- `docs/examples.md` — env-var examples per environment (dev / test / prod skeletons), + example payloads for each stream and each REST endpoint. +- `docs/postgres-migration.md` — decision record for the schema (mirrors + `notification/docs/postgres-migration.md` style). + +Files new: + +- all six. + +Exit criteria: + +- the README of RTM links to `docs/README.md`. +- a reviewer can find any operational how-to within two clicks. + +## ~~Stage 22.~~ Migrate hand-rolled stubs to `mockgen` + +Status: implemented. Decision record: +[`docs/stage22-stub-migration.md`](docs/stage22-stub-migration.md). + +Goal: + +- unify the test-double style across the repository on the `mockgen` + pipeline introduced for the RTM Docker port in Stage 12. Today every + Galaxy service except RTM hand-rolls `*stub` packages; mixing styles + raises onboarding cost and makes port-signature drift easier to miss. + +Tasks (high-level only — each package gets its own decision when this +stage is opened): + +- Replace the stubs under `lobby/internal/adapters/` with `mockgen`-generated + mocks. Affected packages today (one per port): + [`runtimemanagerstub`](../lobby/internal/adapters/runtimemanagerstub), + [`intentpubstub`](../lobby/internal/adapters/intentpubstub), + [`gmclientstub`](../lobby/internal/adapters/gmclientstub), + [`userservicestub`](../lobby/internal/adapters/userservicestub), + [`gameturnstatsstub`](../lobby/internal/adapters/gameturnstatsstub), + [`streamoffsetstub`](../lobby/internal/adapters/streamoffsetstub), + [`membershipstub`](../lobby/internal/adapters/membershipstub), + [`evaluationguardstub`](../lobby/internal/adapters/evaluationguardstub), + [`streamlagprobestub`](../lobby/internal/adapters/streamlagprobestub), + [`userlifecyclestub`](../lobby/internal/adapters/userlifecyclestub), + [`invitestub`](../lobby/internal/adapters/invitestub), + [`racenamestub`](../lobby/internal/adapters/racenamestub), + [`gapactivationstub`](../lobby/internal/adapters/gapactivationstub), + [`gamestub`](../lobby/internal/adapters/gamestub), + [`applicationstub`](../lobby/internal/adapters/applicationstub). +- Add `//go:generate mockgen ...` directives next to each port + declaration under [`lobby/internal/ports/`](../lobby/internal/ports) + and a `mocks` target to `lobby/Makefile`, mirroring the + [`rtmanager/Makefile`](./Makefile) shape. +- Audit the rest of the workspace for similar hand-rolls before touching + Lobby. Not every `*stub`-style package is in scope: + - [`mail/internal/adapters/stubprovider`](../mail/internal/adapters/stubprovider) + is a production/local-mode provider, not a test fixture — keep it. 
+ - [`authsession/internal/adapters/contracttest`](../authsession/internal/adapters/contracttest) + is a port-conformance suite, not a stub — keep it. + - [`authsession/internal/adapters/local`](../authsession/internal/adapters/local) + is local-mode runtime — keep it. +- Documentation sweep — these documents reference the hand-rolled + convention and must be updated alongside the code: + - [`rtmanager/docs/stage12-docker-and-clients.md §1`](./docs/stage12-docker-and-clients.md) + currently frames `mockgen` as a one-time deviation; rephrase as the + repo-wide convention. + - [`lobby/docs/`](../lobby/docs/) — any decision record that named a + `*stub` package by path needs the new `mocks/` target referenced in + its place. + - Top-level [`AGENTS.md`](../AGENTS.md) and any service-level + `CLAUDE.md` / `README.md` touching test conventions. +- Cross-cutting test impact: each stub today often carries hand-curated + helper methods (e.g. seeded fixtures, deterministic ID generators) + that pure `mockgen` mocks do not provide. Where a stub is more than + a method-table, the migration extracts the helper into a small + test-data builder and keeps the mock as the port surface. + +Files new: + +- one `mocks/` directory under each affected adapter group, plus a + `lobby/Makefile` `mocks` target (and equivalents for any other + service the audit identifies). + +Files touched: + +- every `*stub` package listed above plus its consumers. +- `lobby/Makefile`, `lobby/internal/ports/*.go` (for `//go:generate` + directives). +- the documentation listed above. + +Exit criteria: + +- `*stub` packages are gone from `lobby/internal/adapters/` and the + `mocks/` packages compile against the current ports. +- `make -C lobby mocks` regenerates with no diff after a clean run. +- `go test ./lobby/...` is green. +- Documentation across `rtmanager/docs/`, `lobby/docs/`, top-level + `AGENTS.md`, and any affected `README.md` references the unified + convention. + +## Final Acceptance Criteria + +- `go build ./...` from the repository root succeeds. +- `go test ./...` from the repository root passes. +- `go test -tags=integration ./rtmanager/integration/...` passes when Docker is available. +- `go test ./integration/lobbyrtm/...` passes when Docker is available. +- `make -C rtmanager jet` regenerates jet code with no diff after a clean run. +- Manual smoke: bring Lobby + RTM + the rest of the stack up via the existing dev compose; + create a game; observe a real `galaxy-game-{game_id}` container; `curl + http://galaxy-game-{game_id}:8080/healthz` returns `200`; stop the game; the container + moves to `exited`; the admin cleanup endpoint removes it. +- Documentation across `ARCHITECTURE.md`, `lobby`, `notification`, `game`, and `rtmanager` + is internally consistent. + +## Out of Scope + +- Multi-instance Runtime Manager with Redis Streams consumer groups (`XREADGROUP` / + `XCLAIM`). +- Engine version registry inside `Game Master`. Producer-supplied `image_ref` decouples + this work from RTM. +- TLS / mTLS on the internal listener. +- Engine in-place upgrades driven by an engine API. Patch is always recreate. +- Backup, archival, or cleanup of host state directories. +- Kubernetes, Docker Swarm, or any non-Docker orchestrator. +- Consumption of `runtime:health_events` by Game Master, Game Lobby, or Notification + Service. Those are next-stage concerns of those services. + +## Risks and Notes + +- CI must expose a Docker socket (or run rootless equivalent) to execute the integration + suites. 
Without Docker the integration tests are skipped through a build-tag guard. +- The `reason` enum on `runtime:stop_jobs` is fixed in this plan + (`{orphan_cleanup, cancelled, finished, admin_request, timeout}`). Adding a new value + requires a contract bump in `runtime-jobs-asyncapi.yaml` and a Lobby publisher change. + Keep the enum small. +- Lobby's existing `runtimejobresult` worker only reacts to start outcomes today. Stop + outcomes are observable in RTM `operation_log` but Lobby does not yet update game status + from them. Adding a stop-result consumer to Lobby is a future Lobby stage and is + explicitly out of scope here. +- Pre-launch single-init policy applies to RTM exactly as documented in + `ARCHITECTURE.md §Persistence Backends`: schema evolves by editing `00001_init.sql` + until first production deploy. diff --git a/rtmanager/README.md b/rtmanager/README.md new file mode 100644 index 0000000..3be0899 --- /dev/null +++ b/rtmanager/README.md @@ -0,0 +1,867 @@ +# Runtime Manager + +`Runtime Manager` (RTM) is the only Galaxy platform service permitted to interact with the +Docker daemon. It owns the lifecycle of `galaxy/game` engine containers and the technical +runtime view of running games. Other services consume RTM via two transports: an asynchronous +Redis Streams contract (used by `Game Lobby`) and a synchronous internal REST surface (used by +`Game Master` and `Admin Service`). + +## References + +- [`../ARCHITECTURE.md`](../ARCHITECTURE.md) — system architecture, §9 Runtime Manager. +- [`../TESTING.md`](../TESTING.md) §7 — testing matrix for RTM. +- [`./docs/README.md`](./docs/README.md) — service-local documentation entry point. +- [`./api/internal-openapi.yaml`](./api/internal-openapi.yaml) — REST contract. +- [`./api/runtime-jobs-asyncapi.yaml`](./api/runtime-jobs-asyncapi.yaml) — start/stop job + streams contract. +- [`./api/runtime-health-asyncapi.yaml`](./api/runtime-health-asyncapi.yaml) — + `runtime:health_events` stream contract. +- [`../game/README.md`](../game/README.md) — game engine container contract (env, ports, + `/healthz`). +- [`../lobby/README.md`](../lobby/README.md) — Game Lobby integration with RTM. + +## Purpose + +A running Galaxy game lives in exactly one Docker container. The platform must be able to: + +- create the container with the right engine version and configuration; +- supply the engine with a stable storage location for game state; +- keep the runtime status visible to platform-level services; +- replace the container in place for patch upgrades and restarts; +- remove containers that are no longer needed; +- detect and surface engine failures to whoever should react. + +`Runtime Manager` is the single component that performs these actions. It deliberately does +**not** reason about platform metadata, membership, schedules, turn cutoffs, or any other +business state. Game Lobby owns platform metadata; Game Master will own runtime business state +when implemented. + +## Scope + +`Runtime Manager` is the source of truth for: + +- the mapping `game_id -> current_container_id` for every running container; +- the durable history of every start, stop, restart, patch, and cleanup operation it performed; +- the most recent technical health observation per game (last Docker event, last successful or + failed probe, last inspect result). 
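+
+A minimal sketch of the per-game record implied by the first of these, with the field set
+assembled from the Lifecycles section below (the authoritative shape is the `runtime_records`
+table in the service migrations, not this snippet):
+
+```go
+package sketch
+
+import "time"
+
+// RuntimeRecord sketches the durable per-game runtime row RTM maintains.
+type RuntimeRecord struct {
+	GameID             string // platform game id
+	Status             string // running | stopped | removed
+	CurrentContainerID string // cleared once the container is removed
+	CurrentImageRef    string // image the current container was created from
+	EngineEndpoint     string // http://galaxy-game-{game_id}:8080
+	StartedAt          time.Time
+	StoppedAt          *time.Time
+	RemovedAt          *time.Time
+	LastOpAt           time.Time // drives the cleanup retention TTL
+}
+```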
+ +`Runtime Manager` is not the source of truth for: + +- any business or platform-level metadata of a game (owned by `Game Lobby`); +- runtime state visible to players or operators as game state, including current turn, + generation status, engine version registry (owned by `Game Master`); +- the engine version catalogue or which engine version a game is allowed to use (`Game Master` + is the future owner; `Game Lobby` supplies `image_ref` in v1); +- contents of the engine state directory; that is engine domain; +- backup, archival, or operator cleanup of state directories. + +## Non-Goals + +- Multi-instance operation in v1. Coordination is single-process; multiple replicas are an + explicit future iteration. +- Engine version arbitration. The producer (`Game Lobby` in v1, `Game Master` later) supplies `image_ref`. +- Image registry control. Pull policy is configurable, but RTM does not push, retag, or + promote images. +- TLS or mTLS on the internal listener. RTM trusts its network segment. +- Direct delivery of player-visible push notifications. RTM publishes admin-only notification + intents only for failures invisible elsewhere; everything else is delegated. +- Kubernetes, Docker Swarm, or other orchestrators. v1 targets a single Docker daemon reached + through `unix:///var/run/docker.sock`. + +## Position in the System + +```mermaid +flowchart LR + Lobby["Game Lobby"] + GM["Game Master"] + Admin["Admin Service"] + Notify["Notification Service"] + RTM["Runtime Manager"] + Engine["Game Engine container"] + Docker["Docker Daemon"] + Postgres["PostgreSQL\nschema rtmanager"] + Redis["Redis\nstreams + leases"] + + Lobby -->|runtime:start_jobs / stop_jobs| RTM + RTM -->|runtime:job_results| Lobby + GM -->|internal REST| RTM + Admin -->|internal REST| RTM + RTM -->|notification:intents (admin)| Notify + RTM -->|runtime:health_events| Redis + RTM <--> Docker + Docker -->|create / start / stop / rm| Engine + RTM --> Postgres + RTM --> Redis + Engine -.bind mount.- StateDir["host:\n/{game_id}"] +``` + +## Responsibility Boundaries + +`Runtime Manager` is responsible for: + +- accepting start, stop, restart, patch, inspect, and cleanup requests through the supported + transports and producing one durable outcome per request; +- creating Docker containers from a producer-supplied `image_ref` and binding them to the + configured Docker network and host state directory; +- enforcing the one-game-one-container invariant in its own state and on Docker; +- monitoring container health through Docker events, periodic inspect, and active HTTP probes; +- publishing technical runtime events (`runtime:job_results`, `runtime:health_events`) and + admin-only notification intents for failures that no other service can observe; +- reconciling its persistent state with Docker reality on startup and periodically; +- removing exited containers automatically by retention TTL or explicitly by admin command. + +`Runtime Manager` is not responsible for: + +- evaluating whether a game is allowed to start (Lobby validates roster, schedule, etc.); +- registering a started runtime with `Game Master` (Lobby calls GM after a successful job + result); +- mapping platform users to engine players (GM owns this mapping); +- player command routing (GM proxies player commands directly to engine); +- cleaning up host state directories; +- patching the engine version registry; the registry lives in `Game Master`. + +## Container Model + +### Network + +Containers attach to a single user-defined Docker bridge network. 
The network is provisioned
+**outside** RTM: docker-compose, Terraform, or an operator runbook creates `galaxy-net` (or
+whatever name is configured via `RTMANAGER_DOCKER_NETWORK`).
+
+RTM validates the network's presence at startup. A missing network is a fail-fast condition;
+the process exits non-zero before opening any listener.
+
+### DNS name and engine endpoint
+
+Each container is created with hostname `galaxy-game-{game_id}` and is attached to the
+configured network. Docker's embedded DNS resolves the hostname for any other container in the
+same network.
+
+The `engine_endpoint` published in `runtime:job_results` and visible through the inspect REST
+endpoint is the full URL `http://galaxy-game-{game_id}:8080`. The port is fixed at `8080`
+inside the container; RTM does not publish ports to the host.
+
+Restart and patch keep the same DNS name. The `container_id` changes; the `engine_endpoint`
+does not.
+
+### State storage (bind mount)
+
+Engine state lives on the host filesystem. RTM never uses Docker named volumes — the rationale
+is operator-friendly backup and inspection.
+
+- Host root: `RTMANAGER_GAME_STATE_ROOT` (operator-supplied, e.g. `/var/lib/galaxy/games`).
+- Per-game directory: `{RTMANAGER_GAME_STATE_ROOT}/{game_id}`. RTM creates it with permissions
+  `RTMANAGER_GAME_STATE_DIR_MODE` (default `0750`) and ownership `RTMANAGER_GAME_STATE_OWNER_UID`
+  / `_GID` (default `0:0` — operator overrides for non-root engine).
+- Bind mount: the per-game directory is mounted into the container at the path declared by
+  `RTMANAGER_ENGINE_STATE_MOUNT_PATH` (default `/var/lib/galaxy-game`).
+- Environment: the container receives `GAME_STATE_PATH={RTMANAGER_ENGINE_STATE_MOUNT_PATH}`,
+  the in-container mount path. The engine resolves the path from this variable. The same value
+  is forwarded to the engine as `STORAGE_PATH` for backward compatibility — both names are
+  accepted in v1.
+
+RTM never deletes the host state directory. Removing it is the responsibility of operator
+tooling (backup, manual cleanup, or future Admin Service workflows). Removing the container
+through the cleanup endpoint or the retention TTL leaves the directory intact.
+
+### Container labels
+
+RTM applies the following labels to every container it creates:
+
+| Label | Value | Purpose |
+| --- | --- | --- |
+| `com.galaxy.owner` | `rtmanager` | Filter for `docker ps` and reconcile. |
+| `com.galaxy.kind` | `game-engine` | Differentiates from infra containers. |
+| `com.galaxy.game_id` | `{game_id}` | Reverse lookup from container to platform game. |
+| `com.galaxy.engine_image_ref` | `{image_ref}` | Cross-check against `runtime_records`. |
+| `com.galaxy.started_at_ms` | `{ms}` | Unambiguous start timestamp. |
+
+Resource-limit labels, by contrast, are read from the resolved engine image (see below).
+
+### Resource limits
+
+Resource limits originate in the **engine image**, not in the producer envelope or RTM config:
+
+| Image label | Container limit | RTM fallback config |
+| --- | --- | --- |
+| `com.galaxy.cpu_quota` | `--cpus` value | `RTMANAGER_DEFAULT_CPU_QUOTA` (default `1.0`) |
+| `com.galaxy.memory` | `--memory` value | `RTMANAGER_DEFAULT_MEMORY` (default `512m`) |
+| `com.galaxy.pids_limit` | `--pids-limit` value | `RTMANAGER_DEFAULT_PIDS_LIMIT` (default `512`) |
+
+If a label is missing or unparseable, RTM uses the matching fallback. Producers never pass
+limits.
+
+### Logging driver
+
+Engine container stdout / stderr are routed by Docker's logging driver.
RTM passes the driver +and its options when creating the container: + +- `RTMANAGER_DOCKER_LOG_DRIVER` (default `json-file`). +- `RTMANAGER_DOCKER_LOG_OPTS` (default empty; comma-separated `key=value` pairs). + +RTM never reads the container's stdout itself. Operators consume engine logs via `docker logs` +or via whatever sink the configured driver feeds (fluentd, journald, etc.). + +The production Docker SDK adapter that creates and starts these containers lives at +`internal/adapters/docker/`. Its design rationale — fixed engine port, partial-rollback on +`ContainerStart` failure, events-stream filter rationale, and the `mockgen`-driven service-test +fixture — is captured in [`docs/adapters.md`](docs/adapters.md). + +## Runtime Surface + +### Listeners + +| Listener | Default address | Purpose | +| --- | --- | --- | +| `internal` HTTP | `:8096` (`RTMANAGER_INTERNAL_HTTP_ADDR`) | Probes (`/healthz`, `/readyz`) and the trusted REST surface for `Game Master` and `Admin Service`. | + +There is no public listener. The internal listener is unauthenticated and assumes a trusted +network segment. + +### Background workers + +| Worker | Driver | Description | +| --- | --- | --- | +| `startjobs` consumer | Redis Stream `runtime:start_jobs` | Decodes start envelope and invokes the start service. | +| `stopjobs` consumer | Redis Stream `runtime:stop_jobs` | Decodes stop envelope and invokes the stop service. | +| Docker events listener | Docker `/events` API | Subscribes with the label filter, emits `runtime:health_events` for container_started / exited / oom / disappeared. | +| Active HTTP probe | Periodic | `GET {engine_endpoint}/healthz` for every running runtime; emits `probe_failed` / `probe_recovered` with hysteresis. | +| Periodic Docker inspect | Periodic | Refreshes inspect data; emits `inspect_unhealthy` when restart_count grows or status is unexpected. | +| Reconciler | Startup + periodic | Reconciles `runtime_records` with `docker ps` (see Reconciliation section). | +| Container cleanup | Periodic | Removes exited containers older than `RTMANAGER_CONTAINER_RETENTION_DAYS`. | + +### Startup dependencies + +In start order: + +1. PostgreSQL primary (DSN `RTMANAGER_POSTGRES_PRIMARY_DSN`). Goose migrations apply + synchronously before any listener opens. +2. Redis master (`RTMANAGER_REDIS_MASTER_ADDR`). +3. Docker daemon at `RTMANAGER_DOCKER_HOST` (default `unix:///var/run/docker.sock`). RTM + verifies API ping and the presence of `RTMANAGER_DOCKER_NETWORK`. +4. Telemetry exporter (OTLP grpc/http or stdout). +5. Internal HTTP listener. +6. Reconciler runs once and blocks until done. +7. Background workers start. + +A failure in any step is fatal and exits the process non-zero. + +### Probes + +`/healthz` reports liveness — the process responds when the HTTP server is alive. + +`/readyz` reports readiness — `200` only when: + +- the PostgreSQL pool can ping the primary; +- the Redis master client can ping; +- the Docker client can ping; +- the configured Docker network exists. + +Both probes are documented in [`./api/internal-openapi.yaml`](./api/internal-openapi.yaml). + +## Lifecycles + +All operations share a per-game-id Redis lease (`rtmanager:game_lease:{game_id}`, +TTL `RTMANAGER_GAME_LEASE_TTL_SECONDS`, default `60`). The lease serialises operations on a +single game across all entry points (stream consumers and REST handlers). 
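+
+A sketch of the acquire/release shape this lease implies (the key pattern and TTL are the ones
+named above; the helper itself is illustrative, not the production coordination code):
+
+```go
+package sketch
+
+import (
+	"context"
+	"time"
+
+	"github.com/redis/go-redis/v9"
+)
+
+// acquireGameLease takes the per-game lease with SET NX + TTL semantics. The
+// caller runs the lifecycle operation only when ok is true and calls release
+// when done. v1 does not fence or renew the lease, so a long operation can
+// outlive it (see the caveat below).
+func acquireGameLease(ctx context.Context, rdb *redis.Client, gameID string, ttl time.Duration) (release func(), ok bool, err error) {
+	key := "rtmanager:game_lease:" + gameID
+	won, err := rdb.SetNX(ctx, key, "rtmanager", ttl).Result()
+	if err != nil || !won {
+		return nil, false, err
+	}
+	return func() { rdb.Del(context.Background(), key) }, true, nil
+}
+```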
v1 does not renew +the lease mid-operation; long pulls of multi-GB images can therefore expire the lease before +the operation finishes — the trade-off is documented in +[`docs/services.md` §1](docs/services.md). + +### Start + +**Triggers:** + +- Lobby: a Redis Streams entry on `runtime:start_jobs` with envelope + `{game_id, image_ref, requested_at_ms}`. +- Game Master / Admin Service: `POST /api/v1/internal/runtimes/{game_id}/start` with body + `{image_ref}`. + +**Pre-conditions:** + +- `image_ref` is a non-empty string and parseable as a Docker reference. +- Configured Docker network exists. +- The lease for `{game_id}` is acquired. + +**Flow on success:** + +1. Read `runtime_records.{game_id}`. If `status=running` with the same `image_ref`, return + the existing record (idempotent success, `error_code=replay_no_op`). +2. Pull the image per `RTMANAGER_IMAGE_PULL_POLICY` (default `if_missing`). +3. Inspect the resolved image, derive resource limits from labels. +4. Ensure the per-game state directory exists with the configured mode and ownership. +5. `docker create` with the configured network, hostname, labels, env (`GAME_STATE_PATH`, + `STORAGE_PATH`), bind mount, log driver, resource limits. +6. `docker start`. +7. Upsert `runtime_records` (`status=running`, `current_container_id`, `engine_endpoint`, + `current_image_ref`, `started_at`, `last_op_at`). +8. Append `operation_log` entry (`op_kind=start`, `outcome=success`, source-specific + `op_source`). +9. Publish `runtime:health_events` `container_started`. +10. For Lobby callers: publish `runtime:job_results` + `{game_id, outcome=success, container_id, engine_endpoint}`. + For REST callers: respond `200` with the runtime record. + +**Failure paths:** + +| Failure | PG side effect | Notification intent | Outcome to caller | +| --- | --- | --- | --- | +| Invalid `image_ref` shape, network missing | `operation_log` failure | `runtime.start_config_invalid` | `failure / start_config_invalid` | +| Image pull error | `operation_log` failure | `runtime.image_pull_failed` | `failure / image_pull_failed` | +| `docker create` / `start` error | `operation_log` failure | `runtime.container_start_failed` | `failure / container_start_failed` | +| State directory creation error | `operation_log` failure | `runtime.start_config_invalid` | `failure / start_config_invalid` | + +A failed start never leaves a partially-running container: if `docker create` succeeded but +the subsequent step failed, RTM removes the container before recording the failure. + +The production start orchestrator that implements the flow and the failure paths above lives +at `internal/service/startruntime/`. Its design rationale — why the per-game lease and the +health-events publisher live with the start service, the `Result`-shaped contract consumed by +the stream consumer and the REST handler, the rollback rule on Upsert failure, and the +`created_at`-preservation rule for re-starts — is captured in +[`docs/services.md`](docs/services.md). + +### Stop + +**Triggers:** + +- Lobby: Redis Streams entry on `runtime:stop_jobs` with envelope + `{game_id, reason, requested_at_ms}`. `reason ∈ {orphan_cleanup, cancelled, finished, + admin_request, timeout}`. +- Game Master / Admin Service: `POST /api/v1/internal/runtimes/{game_id}/stop` with body + `{reason}`. + +**Pre-conditions:** + +- Lease acquired. + +**Flow on success:** + +1. Read `runtime_records.{game_id}`. If `status` is `stopped` or `removed`, return + idempotent success (`error_code=replay_no_op`). +2. 
`docker stop` with `RTMANAGER_CONTAINER_STOP_TIMEOUT_SECONDS` (default `30`). Docker fires + SIGKILL if the engine ignores SIGTERM beyond the timeout. RTM does not call any HTTP + shutdown endpoint on the engine. +3. Update `runtime_records` (`status=stopped`, `stopped_at`, `last_op_at`). +4. Append `operation_log` entry. +5. Publish `runtime:job_results` (for Lobby) or REST `200` (for REST callers). + +The container stays in `exited` state until the cleanup worker removes it (TTL) or an admin +command forces removal. + +**Failure paths:** + +| Failure | Outcome | +| --- | --- | +| Container not found in Docker but record `running` | Update record `status=removed`, publish `container_disappeared`, return `success` (RTM treats this as already-stopped). | +| `docker stop` returns non-zero, container still alive | Failure recorded, no state change. Caller may retry. | + +### Restart + +**Triggers:** + +- Game Master / Admin Service: `POST /api/v1/internal/runtimes/{game_id}/restart`. + +Restart is **recreate**: stop + remove + run with the same `image_ref` and the same bind +mount. `container_id` changes; `engine_endpoint` is stable. + +**Flow:** + +1. Read `runtime_records.{game_id}`. The current `image_ref` is captured. +2. Acquire lease. +3. Run the stop flow (without releasing the lease). +4. `docker rm` the container. +5. Run the start flow with the captured `image_ref`. +6. Append a single `operation_log` entry with `op_kind=restart` and a correlation id linking + the implicit stop and start log entries. + +If any inner step fails, the operation log records the partial outcome and the outer caller +receives the same failure; the runtime record converges to whatever state Docker reports. + +### Patch + +**Triggers:** + +- Game Master / Admin Service: `POST /api/v1/internal/runtimes/{game_id}/patch` with body + `{image_ref}`. + +Patch is restart with a **new** `image_ref`. The engine reads its state from the bind mount +on startup, so any data written before the patch survives. + +**Pre-conditions:** + +- New and current image refs both parse as semver tags. `image_ref_not_semver` failure + otherwise. +- Major and minor versions are equal between current and new (`semver_patch_only` failure + otherwise). + +**Flow:** identical to restart, with a new `image_ref` injected before the start step. +`operation_log` entry has `op_kind=patch`. + +### Cleanup + +**Triggers:** + +- Periodic worker: every container with `runtime_records.status=stopped` and + `last_op_at < now - RTMANAGER_CONTAINER_RETENTION_DAYS` (default `30`). +- Admin Service: `DELETE /api/v1/internal/runtimes/{game_id}/container`. + +**Pre-conditions:** + +- The container is not in `running` state. RTM refuses to remove a running container through + this path; stop first. + +**Flow:** + +1. Acquire lease. +2. `docker rm` the container. +3. Update `runtime_records` (`status=removed`, `removed_at`, `current_container_id=NULL`, + `last_op_at`). +4. Append `operation_log` entry (`op_kind=cleanup_container`, + `op_source ∈ {auto_ttl, admin_rest}`). + +The host state directory is left untouched. + +## Health Monitoring + +Three independent sources feed `runtime:health_events` and `health_snapshots`: + +1. **Docker events listener.** Subscribes to the Docker events stream and filters + container-scoped events by the `com.galaxy.owner=rtmanager` label written into every + container by the start service. Emits: + - `container_exited` (action=`die` with non-zero exit code; exit `0` is the normal + graceful stop and is suppressed). 
+ - `container_oom` (action=`oom`). + - `container_disappeared` (action=`destroy` observed for a `runtime_records.status=running` + row whose `current_container_id` still matches the destroyed container, i.e. a destroy + RTM did not initiate). + + `container_started` is emitted by the start service when it runs the container (see + `internal/service/startruntime`), not by this listener. +2. **Periodic Docker inspect** every `RTMANAGER_INSPECT_INTERVAL` (default `30s`). Emits + `inspect_unhealthy` when: + - `RestartCount` increases between observations; + - `State.Status != "running"` for a record marked running; + - `State.Health.Status == "unhealthy"` if the image declares a Docker `HEALTHCHECK`. +3. **Active HTTP probe** every `RTMANAGER_PROBE_INTERVAL` (default `15s`). Calls + `GET {engine_endpoint}/healthz` with `RTMANAGER_PROBE_TIMEOUT` (default `2s`). Emits: + - `probe_failed` after `RTMANAGER_PROBE_FAILURES_THRESHOLD` consecutive failures + (default `3`); + - `probe_recovered` on the first success after a `probe_failed` was published. + +Every emission updates `health_snapshots.{game_id}` (latest event becomes the snapshot) and +appends to `runtime:health_events`. + +In v1, RTM publishes admin-only notification intents only for first-touch failures of the +start flow. All ongoing health changes (probe failures, OOMs, exits) flow through +`runtime:health_events` only. `Game Master` is the consumer that decides whether to escalate +runtime-level events into notifications. + +The three workers that implement the sources above live in +`internal/worker/{dockerevents,dockerinspect,healthprobe}`. Their design rationale — +`container_started` ownership, `container_disappeared` emission rules, `die` exit-code +suppression, probe hysteresis state model, parallel-probe cap, and the events-listener +reconnect policy — is captured in [`docs/workers.md`](docs/workers.md). + +## Reconciliation + +RTM never assumes Docker and PostgreSQL are in sync. + +At startup (blocking, before workers start) and every `RTMANAGER_RECONCILE_INTERVAL` +(default `5m`): + +1. List Docker containers with label `com.galaxy.owner=rtmanager`. +2. For each running container without a matching record: + - Insert a `runtime_records` row with `status=running`, the discovered + `current_image_ref`, `engine_endpoint`, and `started_at` taken from + `com.galaxy.started_at_ms` if present (otherwise from `State.StartedAt`). + - Append `operation_log` entry with `op_kind=reconcile_adopt`, + `op_source=auto_reconcile`. + - **Never stop or remove an unrecorded container.** Operators may have started one + manually for diagnostics; RTM stays out of their way. +3. For each `runtime_records` row with `status=running` whose container is missing: + - Update `status=removed`, `removed_at=now`, `current_container_id=NULL`. + - Publish `runtime:health_events` `container_disappeared`. + - Append `operation_log` entry with `op_kind=reconcile_dispose`. +4. For each `runtime_records` row with `status=running` whose container exists but is in + `exited`: + - Update `status=stopped`, `stopped_at=now` (reconciler observation time). + - Publish `runtime:health_events` `container_exited` with the observed exit code. + +The reconciler implementation lives at `internal/worker/reconcile/` and the periodic +TTL-cleanup worker at `internal/worker/containercleanup/`; the cleanup worker delegates +removal to `internal/service/cleanupcontainer/`. 
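+
+A compact sketch of the drift classification behind the adopt / dispose / observed_exited outcomes named by the reconciliation steps above; the trimmed input shapes, package name, and helper names are illustrative, not the worker's actual types:
+
+```go
+// Sketch of the reconciler's drift classification. Inputs are trimmed to
+// game_id keyed maps; the real worker reads runtime_records rows and
+// docker ps / inspect results.
+package reconcile
+
+type driftKind string
+
+const (
+	driftAdopt          driftKind = "adopt"           // running container without a record
+	driftDispose        driftKind = "dispose"         // record says running, container gone
+	driftObservedExited driftKind = "observed_exited" // record says running, container exited
+)
+
+// classify compares records marked running with the containers Docker reports
+// for label com.galaxy.owner=rtmanager. observedState maps game_id to the
+// container state ("running" or "exited"); a missing key means no container.
+func classify(recordedRunning map[string]bool, observedState map[string]string) map[string]driftKind {
+	drift := make(map[string]driftKind)
+	for gameID, state := range observedState {
+		if state == "running" && !recordedRunning[gameID] {
+			drift[gameID] = driftAdopt // insert a record; never stop the unrecorded container
+		}
+	}
+	for gameID := range recordedRunning {
+		state, ok := observedState[gameID]
+		switch {
+		case !ok:
+			drift[gameID] = driftDispose // mark removed, emit container_disappeared
+		case state == "exited":
+			drift[gameID] = driftObservedExited // mark stopped, emit container_exited
+		}
+	}
+	return drift
+}
+```
+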
The design rationale — the per-game +lease around every drift mutation, the third `observed_exited` path beyond the two +named cases, the synchronous `ReconcileNow` plus periodic `Component` split, and why +the cleanup worker is a thin TTL filter on top of the existing service — is captured in +[`docs/workers.md`](docs/workers.md). + +## Trusted Surfaces + +### Internal REST + +The internal REST surface is consumed by `Game Master` (sync interactions for inspect, +restart, patch, stop, cleanup) and `Admin Service` (operational tooling, force-cleanup). +The listener is unauthenticated; downstream services rely on network segmentation. + +| Method | Path | Operation ID | Caller | +| --- | --- | --- | --- | +| `GET` | `/healthz` | `internalHealthz` | platform probes | +| `GET` | `/readyz` | `internalReadyz` | platform probes | +| `GET` | `/api/v1/internal/runtimes` | `internalListRuntimes` | GM, Admin | +| `GET` | `/api/v1/internal/runtimes/{game_id}` | `internalGetRuntime` | GM, Admin | +| `POST` | `/api/v1/internal/runtimes/{game_id}/start` | `internalStartRuntime` | GM, Admin | +| `POST` | `/api/v1/internal/runtimes/{game_id}/stop` | `internalStopRuntime` | GM, Admin | +| `POST` | `/api/v1/internal/runtimes/{game_id}/restart` | `internalRestartRuntime` | GM, Admin | +| `POST` | `/api/v1/internal/runtimes/{game_id}/patch` | `internalPatchRuntime` | GM, Admin | +| `DELETE` | `/api/v1/internal/runtimes/{game_id}/container` | `internalCleanupRuntimeContainer` | Admin | + +Request and response shapes are defined in [`./api/internal-openapi.yaml`](./api/internal-openapi.yaml). +Unknown JSON fields are rejected with `invalid_request`. + +Callers identify themselves through the optional `X-Galaxy-Caller` +request header (`gm` for `Game Master`, `admin` for `Admin Service`). +The header is recorded as `op_source` in `operation_log` (`gm_rest` or +`admin_rest`); when missing or carrying any other value Runtime +Manager defaults to `op_source = admin_rest`. The header is documented +on every runtime endpoint of +[`./api/internal-openapi.yaml`](./api/internal-openapi.yaml). + +## Async Stream Contracts + +### `runtime:start_jobs` (in) + +Producer: `Game Lobby`. + +| Field | Type | Notes | +| --- | --- | --- | +| `game_id` | string | Lobby `game_id`. | +| `image_ref` | string | Docker reference. Lobby resolves it from `target_engine_version` using `LOBBY_ENGINE_IMAGE_TEMPLATE`. | +| `requested_at_ms` | int64 | UTC milliseconds. Used for diagnostics, not authoritative. | + +### `runtime:stop_jobs` (in) + +Producer: `Game Lobby`. + +| Field | Type | Notes | +| --- | --- | --- | +| `game_id` | string | | +| `reason` | enum | `orphan_cleanup`, `cancelled`, `finished`, `admin_request`, `timeout`. Recorded in `operation_log.error_code` when the reason matters; otherwise opaque. | +| `requested_at_ms` | int64 | | + +### `runtime:job_results` (out) + +Producer: `Runtime Manager`. Consumer: `Game Lobby`. + +| Field | Type | Notes | +| --- | --- | --- | +| `game_id` | string | | +| `outcome` | enum | `success`, `failure`. | +| `container_id` | string | Required for `success`. Empty on `failure`. | +| `engine_endpoint` | string | Required for `success`. Empty on `failure`. | +| `error_code` | string | Stable code. `replay_no_op` for idempotent re-runs. | +| `error_message` | string | Operator-readable detail. | + +### `runtime:health_events` (out, new) + +Producer: `Runtime Manager`. Consumers: `Game Master`; `Game Lobby` and `Admin Service` +are reserved as future consumers. 
+ +| Field | Type | Notes | +| --- | --- | --- | +| `game_id` | string | | +| `container_id` | string | The container observed (may differ from current after a restart race). | +| `event_type` | enum | See below. | +| `occurred_at_ms` | int64 | UTC milliseconds. | +| `details` | json | Type-specific payload. | + +`event_type` values and their `details` schemas: + +| `event_type` | `details` payload | +| --- | --- | +| `container_started` | `{image_ref}` | +| `container_exited` | `{exit_code, oom: bool}` | +| `container_oom` | `{exit_code}` | +| `container_disappeared` | `{}` | +| `inspect_unhealthy` | `{restart_count, state, health}` | +| `probe_failed` | `{consecutive_failures, last_status, last_error}` | +| `probe_recovered` | `{prior_failure_count}` | + +The full schema is enforced by [`./api/runtime-health-asyncapi.yaml`](./api/runtime-health-asyncapi.yaml). + +## Notification Contracts + +`Runtime Manager` publishes admin-only notification intents only for failures invisible to +any other service: + +| Trigger | `notification_type` | Audience | Channels | +| --- | --- | --- | --- | +| Image pull error during start | `runtime.image_pull_failed` | admin | email | +| `docker create` / `docker start` error | `runtime.container_start_failed` | admin | email | +| Configuration validation error at start (bad image_ref, missing network) | `runtime.start_config_invalid` | admin | email | + +Constructors live in `galaxy/pkg/notificationintent`. Catalog entries live in +[`../notification/README.md`](../notification/README.md) and +[`../notification/api/intents-asyncapi.yaml`](../notification/api/intents-asyncapi.yaml). +All three intents share the frozen field set +`{game_id, image_ref, error_code, error_message, attempted_at_ms}`; the +`_ms` suffix on `attempted_at_ms` follows the repo-wide convention for +millisecond integer fields. +The Redis Streams publisher wrapper used to emit these intents from RTM +ships in `internal/adapters/notificationpublisher/`; the rationale for the +signature shim that drops the upstream entry id lives in +[`docs/domain-and-ports.md` §7](docs/domain-and-ports.md) and the production +wiring is documented in [`docs/adapters.md`](docs/adapters.md). + +Runtime-level changes after a successful start (probe failures, OOM, container exited) +**do not** produce notifications from RTM. Game Master decides whether to escalate. + +## Persistence Layout + +### PostgreSQL durable state (schema `rtmanager`) + +| Table | Purpose | Key | +| --- | --- | --- | +| `runtime_records` | One row per game, latest known runtime status. | `game_id` | +| `operation_log` | Append-only audit of every operation RTM performed. | `id` (auto) | +| `health_snapshots` | Latest health observation per game. | `game_id` | + +`runtime_records` columns: + +- `game_id` — primary key, references Lobby's identifier. +- `status` — `running | stopped | removed`. +- `current_container_id` — nullable when `status=removed`. +- `current_image_ref` — non-null when status is `running` or `stopped`. +- `engine_endpoint` — `http://galaxy-game-{game_id}:8080`. +- `state_path` — absolute host path of the bind-mounted directory. +- `docker_network` — network name observed at create time. +- `started_at`, `stopped_at`, `removed_at` — last transition timestamps. +- `last_op_at` — drives retention TTL. +- `created_at` — first time RTM saw the game. 
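+
+For orientation, a sketch of how one `runtime_records` row might be modelled in Go; nullability follows the column notes above and the `RuntimeRecord` schema in the internal REST spec, while the exact field types are assumptions (the jet code generated under `internal/adapters/postgres/jet/` is the authoritative shape):
+
+```go
+// Package model sketches the runtime_records row described above.
+package model
+
+import "time"
+
+// RuntimeStatus mirrors the status column: running | stopped | removed.
+type RuntimeStatus string
+
+const (
+	StatusRunning RuntimeStatus = "running"
+	StatusStopped RuntimeStatus = "stopped"
+	StatusRemoved RuntimeStatus = "removed"
+)
+
+// RuntimeRecord is the latest known runtime state for one game.
+type RuntimeRecord struct {
+	GameID             string  // primary key, Lobby's identifier
+	Status             RuntimeStatus
+	CurrentContainerID *string // nil once the container is removed
+	CurrentImageRef    *string // non-nil while running or stopped
+	EngineEndpoint     *string // http://galaxy-game-{game_id}:8080; nil per the REST schema when removed
+	StatePath          string  // absolute host path of the bind-mounted directory
+	DockerNetwork      string  // network name observed at create time
+	StartedAt          *time.Time
+	StoppedAt          *time.Time
+	RemovedAt          *time.Time
+	LastOpAt           time.Time // drives the retention TTL
+	CreatedAt          time.Time // first time RTM saw the game
+}
+```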
+ +`operation_log` columns: + +- `id`, `game_id`, `op_kind` (`start | stop | restart | patch | cleanup_container | + reconcile_adopt | reconcile_dispose`), `op_source` (`lobby_stream | gm_rest | admin_rest | + auto_ttl | auto_reconcile`), `source_ref` (stream entry id, REST request id, or admin + user), `image_ref`, `container_id`, `outcome` (`success | failure`), `error_code`, + `error_message`, `started_at`, `finished_at`. + +`health_snapshots` columns: + +- `game_id`, `container_id`, `status` + (`healthy | probe_failed | exited | oom | inspect_unhealthy | container_disappeared`), + `source` (`docker_event | inspect | probe`), `details` (jsonb), `observed_at`. + +Indexes: + +- `runtime_records (status, last_op_at)` — drives cleanup worker. +- `operation_log (game_id, started_at DESC)` — drives audit reads. + +Migrations are embedded `00001_init.sql` (single-init pre-launch policy from +`ARCHITECTURE.md §Persistence Backends`). + +### Redis runtime-coordination state + +| Key shape | Purpose | +| --- | --- | +| `rtmanager:stream_offsets:{label}` | Last processed entry id per consumer (`startjobs`, `stopjobs`). Same shape as Lobby. | +| `rtmanager:game_lease:{game_id}` | Per-game lease string (`SET ... NX PX `). TTL is `RTMANAGER_GAME_LEASE_TTL_SECONDS` (default 60s); not renewed mid-operation in v1. The trade-off is documented in [`docs/services.md` §1](docs/services.md). | + +Stream key shapes themselves are configurable: + +- `RTMANAGER_REDIS_START_JOBS_STREAM` (default `runtime:start_jobs`). +- `RTMANAGER_REDIS_STOP_JOBS_STREAM` (default `runtime:stop_jobs`). +- `RTMANAGER_REDIS_JOB_RESULTS_STREAM` (default `runtime:job_results`). +- `RTMANAGER_REDIS_HEALTH_EVENTS_STREAM` (default `runtime:health_events`). +- `RTMANAGER_NOTIFICATION_INTENTS_STREAM` (default `notification:intents`). + +## Error Model + +Error envelope: `{ "error": { "code": "...", "message": "..." } }`, identical to Lobby's. + +Stable error codes: + +| Code | Meaning | +| --- | --- | +| `invalid_request` | Malformed JSON, unknown fields, missing required parameter. | +| `not_found` | Runtime record does not exist. | +| `conflict` | Operation incompatible with current `status`. | +| `service_unavailable` | Dependency unavailable (Docker daemon, PG, Redis). | +| `internal_error` | Unspecified failure. | +| `image_pull_failed` | Image pull attempt failed. | +| `image_ref_not_semver` | Patch attempted with a tag that is not parseable semver. | +| `semver_patch_only` | Patch attempted across major/minor boundary. | +| `container_start_failed` | `docker create` / `docker start` failed. | +| `start_config_invalid` | Network missing, bind path inaccessible, or other config error. | +| `docker_unavailable` | Docker daemon ping failed. | +| `replay_no_op` | Idempotent replay; outcome is success but no work was done. | + +## Configuration + +All variables use the `RTMANAGER_` prefix. Required variables fail-fast on startup. + +### Required + +- `RTMANAGER_INTERNAL_HTTP_ADDR` +- `RTMANAGER_POSTGRES_PRIMARY_DSN` +- `RTMANAGER_REDIS_MASTER_ADDR` +- `RTMANAGER_REDIS_PASSWORD` +- `RTMANAGER_DOCKER_HOST` +- `RTMANAGER_DOCKER_NETWORK` +- `RTMANAGER_GAME_STATE_ROOT` + +### Configuration groups + +**Listener:** + +- `RTMANAGER_INTERNAL_HTTP_ADDR` (e.g. `:8096`). +- `RTMANAGER_INTERNAL_HTTP_READ_TIMEOUT` (default `5s`). +- `RTMANAGER_INTERNAL_HTTP_WRITE_TIMEOUT` (default `15s`). +- `RTMANAGER_INTERNAL_HTTP_IDLE_TIMEOUT` (default `60s`). + +**Docker:** + +- `RTMANAGER_DOCKER_HOST` (default `unix:///var/run/docker.sock`). 
+- `RTMANAGER_DOCKER_API_VERSION` (default empty — let SDK negotiate). +- `RTMANAGER_DOCKER_NETWORK` (default `galaxy-net`). +- `RTMANAGER_DOCKER_LOG_DRIVER` (default `json-file`). +- `RTMANAGER_DOCKER_LOG_OPTS` (default empty). +- `RTMANAGER_IMAGE_PULL_POLICY` (default `if_missing`, + values `if_missing | always | never`). + +**Container defaults:** + +- `RTMANAGER_DEFAULT_CPU_QUOTA` (default `1.0`). +- `RTMANAGER_DEFAULT_MEMORY` (default `512m`). +- `RTMANAGER_DEFAULT_PIDS_LIMIT` (default `512`). +- `RTMANAGER_CONTAINER_STOP_TIMEOUT_SECONDS` (default `30`). +- `RTMANAGER_CONTAINER_RETENTION_DAYS` (default `30`). +- `RTMANAGER_ENGINE_STATE_MOUNT_PATH` (default `/var/lib/galaxy-game`). +- `RTMANAGER_ENGINE_STATE_ENV_NAME` (default `GAME_STATE_PATH`). +- `RTMANAGER_GAME_STATE_DIR_MODE` (default `0750`). +- `RTMANAGER_GAME_STATE_OWNER_UID` (default `0`). +- `RTMANAGER_GAME_STATE_OWNER_GID` (default `0`). +- `RTMANAGER_GAME_STATE_ROOT` (host path). + +**Postgres:** + +- `RTMANAGER_POSTGRES_PRIMARY_DSN` (`postgres://rtmanager:@:5432/galaxy?search_path=rtmanager&sslmode=disable`). +- `RTMANAGER_POSTGRES_REPLICA_DSNS` (optional, comma-separated; not used in v1). +- `RTMANAGER_POSTGRES_OPERATION_TIMEOUT` (default `2s`). +- `RTMANAGER_POSTGRES_MAX_OPEN_CONNS` (default `10`). +- `RTMANAGER_POSTGRES_MAX_IDLE_CONNS` (default `2`). +- `RTMANAGER_POSTGRES_CONN_MAX_LIFETIME` (default `30m`). + +**Redis:** + +- `RTMANAGER_REDIS_MASTER_ADDR`. +- `RTMANAGER_REDIS_REPLICA_ADDRS` (optional, comma-separated). +- `RTMANAGER_REDIS_PASSWORD`. +- `RTMANAGER_REDIS_DB` (default `0`). +- `RTMANAGER_REDIS_OPERATION_TIMEOUT` (default `2s`). + +**Streams:** + +- `RTMANAGER_REDIS_START_JOBS_STREAM` (default `runtime:start_jobs`). +- `RTMANAGER_REDIS_STOP_JOBS_STREAM` (default `runtime:stop_jobs`). +- `RTMANAGER_REDIS_JOB_RESULTS_STREAM` (default `runtime:job_results`). +- `RTMANAGER_REDIS_HEALTH_EVENTS_STREAM` (default `runtime:health_events`). +- `RTMANAGER_NOTIFICATION_INTENTS_STREAM` (default `notification:intents`). +- `RTMANAGER_STREAM_BLOCK_TIMEOUT` (default `5s`). + +**Health monitoring:** + +- `RTMANAGER_INSPECT_INTERVAL` (default `30s`). +- `RTMANAGER_PROBE_INTERVAL` (default `15s`). +- `RTMANAGER_PROBE_TIMEOUT` (default `2s`). +- `RTMANAGER_PROBE_FAILURES_THRESHOLD` (default `3`). + +**Reconciler / cleanup:** + +- `RTMANAGER_RECONCILE_INTERVAL` (default `5m`). +- `RTMANAGER_CLEANUP_INTERVAL` (default `1h`). + +**Coordination:** + +- `RTMANAGER_GAME_LEASE_TTL_SECONDS` (default `60`). + +**Lobby internal client:** + +- `RTMANAGER_LOBBY_INTERNAL_BASE_URL` (e.g. `http://lobby:8095`). +- `RTMANAGER_LOBBY_INTERNAL_TIMEOUT` (default `2s`). + +**Logging:** + +- `RTMANAGER_LOG_LEVEL` (default `info`). + +**Lifecycle:** + +- `RTMANAGER_SHUTDOWN_TIMEOUT` (default `30s`). + +**Telemetry:** uses the standard OTLP env vars (`OTEL_EXPORTER_OTLP_ENDPOINT`, +`OTEL_EXPORTER_OTLP_PROTOCOL`, etc.) shared with other Galaxy services. + +## Observability + +### Metrics (OpenTelemetry, low cardinality) + +- `rtmanager.start_outcomes` — counter, labels `outcome`, `error_code`, `op_source`. +- `rtmanager.stop_outcomes` — counter, labels `outcome`, `reason`, `op_source`. +- `rtmanager.restart_outcomes` — counter, labels `outcome`, `error_code`. +- `rtmanager.patch_outcomes` — counter, labels `outcome`, `error_code`. +- `rtmanager.cleanup_outcomes` — counter, labels `outcome`, `op_source`. +- `rtmanager.docker_op_latency` — histogram, label `op` (`pull | create | start | stop | rm + | inspect | events`). 
+- `rtmanager.health_events` — counter, label `event_type`. +- `rtmanager.reconcile_drift` — counter, label `kind` (`adopt | dispose | observed_exited`). +- `rtmanager.runtime_records_by_status` — gauge, label `status`. +- `rtmanager.lease_acquire_latency` — histogram. +- `rtmanager.notification_intents` — counter, label `notification_type`. + +### Structured logs (slog JSON to stdout) + +Common fields on every entry: `service=rtmanager`, `request_id`, `trace_id`, `span_id`, +`game_id` (when known), `container_id` (when known), `op_kind`, `op_source`, `outcome`, +`error_code`. + +Worker-specific fields: `stream_entry_id` (consumers), `event_type` (health), `image_ref` +(start/patch). + +## Verification + +Service-level (TESTING.md §7): + +- Unit tests for every service-layer operation against mocked Docker. +- Adapter tests (PG, Redis, Docker) using `testcontainers-go` for PG/Redis and the Docker + daemon socket for the real Docker adapter. +- Contract tests for `internal-openapi.yaml`, `runtime-jobs-asyncapi.yaml`, + `runtime-health-asyncapi.yaml`. + +Service-local integration suite under `rtmanager/integration/`: + +- Lifecycle end-to-end (start, inspect, stop, restart, patch, cleanup) against the real + `galaxy/game` test image. +- Replay safety (duplicate stream entries are no-ops). +- Health observability (kill the engine externally, observe `container_disappeared`; relaunch + manually, observe reconcile adopt). +- Notification on first-touch failures (publish a start with an unresolvable image, observe + `runtime.image_pull_failed` intent and a `failure` job result). + +Inter-service suite under `integration/lobbyrtm/`: + +- Real Lobby + real RTM + real `galaxy/game` test image. Covers happy path, cancel, and + start-failed flows. + +Manual smoke (development): + +```sh +docker network create galaxy-net # once +RTMANAGER_GAME_STATE_ROOT=/var/lib/galaxy/games \ +RTMANAGER_DOCKER_NETWORK=galaxy-net \ +RTMANAGER_INTERNAL_HTTP_ADDR=:8096 \ +... go run ./rtmanager/cmd/rtmanager +``` + +After start, `curl http://localhost:8096/readyz` returns `200`. Driving Lobby through its +public flow brings up `galaxy-game-{game_id}` containers; RTM logs each lifecycle transition +and publishes the corresponding stream entries. diff --git a/rtmanager/api/internal-openapi.yaml b/rtmanager/api/internal-openapi.yaml new file mode 100644 index 0000000..2e23dee --- /dev/null +++ b/rtmanager/api/internal-openapi.yaml @@ -0,0 +1,534 @@ +openapi: 3.0.3 +info: + title: Galaxy Runtime Manager Internal REST API + version: v1 + description: | + This specification documents the internal trusted REST contract of + `galaxy/rtmanager` served on `RTMANAGER_INTERNAL_HTTP_ADDR` + (default `:8096`). + + The listener is not reachable from the public internet. Two caller + classes use it: `Game Master` (inspect / restart / patch / stop / + cleanup) and `Admin Service` (operational tooling, including + force-cleanup). Runtime Manager treats every caller on this port as + trusted and performs no user-level authorization; downstream services + rely on network segmentation. There is no `X-User-ID` header + contract. 
+ + Transport rules: + - request bodies are strict JSON only; unknown fields are rejected + with `invalid_request`; + - error responses use `{ "error": { "code", "message" } }`, identical + to the Lobby contract; + - stable error codes are: `invalid_request`, `not_found`, `conflict`, + `service_unavailable`, `internal_error`, `image_pull_failed`, + `image_ref_not_semver`, `semver_patch_only`, + `container_start_failed`, `start_config_invalid`, + `docker_unavailable`, `replay_no_op`. + + Caller identification: + - the optional `X-Galaxy-Caller` request header carries the calling + service identity (`gm` for `Game Master`, `admin` for `Admin + Service`). Runtime Manager records the value as `op_source` in + the `operation_log` (`gm_rest` or `admin_rest`). When the header + is missing or carries an unknown value, Runtime Manager defaults + to `op_source = admin_rest`. +servers: + - url: http://localhost:8096 + description: Default local internal listener for Runtime Manager. +tags: + - name: Runtimes + description: Runtime lifecycle endpoints called by Game Master and Admin Service. + - name: Probes + description: Health and readiness probes. +paths: + /healthz: + get: + tags: + - Probes + operationId: internalHealthz + summary: Internal listener health probe + responses: + "200": + description: Service is alive. + content: + application/json: + schema: + $ref: "#/components/schemas/ProbeResponse" + examples: + ok: + value: + status: ok + /readyz: + get: + tags: + - Probes + operationId: internalReadyz + summary: Internal listener readiness probe + description: | + Returns `200` only when the PostgreSQL primary, Redis master, and + Docker daemon are reachable and the configured Docker network + exists. Returns `503` with the standard error envelope otherwise. + responses: + "200": + description: Service is ready to serve traffic. + content: + application/json: + schema: + $ref: "#/components/schemas/ProbeResponse" + examples: + ready: + value: + status: ready + "503": + $ref: "#/components/responses/ServiceUnavailableError" + /api/v1/internal/runtimes: + get: + tags: + - Runtimes + operationId: internalListRuntimes + summary: List all known runtime records + description: | + Returns the full list of runtime records known to Runtime Manager. + Pagination is not supported in v1 — the working set is bounded by + the number of games tracked by Lobby and is small enough to return + in one response. + parameters: + - $ref: "#/components/parameters/XGalaxyCallerHeader" + responses: + "200": + description: All runtime records. + content: + application/json: + schema: + $ref: "#/components/schemas/RuntimesList" + "500": + $ref: "#/components/responses/InternalError" + "503": + $ref: "#/components/responses/ServiceUnavailableError" + /api/v1/internal/runtimes/{game_id}: + get: + tags: + - Runtimes + operationId: internalGetRuntime + summary: Get one runtime record by game id + parameters: + - $ref: "#/components/parameters/GameIDPath" + - $ref: "#/components/parameters/XGalaxyCallerHeader" + responses: + "200": + description: Runtime record for the game. 
+ content: + application/json: + schema: + $ref: "#/components/schemas/RuntimeRecord" + "404": + $ref: "#/components/responses/NotFoundError" + "500": + $ref: "#/components/responses/InternalError" + "503": + $ref: "#/components/responses/ServiceUnavailableError" + /api/v1/internal/runtimes/{game_id}/start: + post: + tags: + - Runtimes + operationId: internalStartRuntime + summary: Start a game engine container + description: | + Pulls the supplied `image_ref` per the configured pull policy and + creates the engine container. Idempotent: a re-start with the same + `image_ref` for an already-running record returns `200` with the + current record and `error_code=replay_no_op` recorded in the + operation log. + parameters: + - $ref: "#/components/parameters/GameIDPath" + - $ref: "#/components/parameters/XGalaxyCallerHeader" + requestBody: + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/StartRequest" + responses: + "200": + description: Runtime record after the start operation. + content: + application/json: + schema: + $ref: "#/components/schemas/RuntimeRecord" + "400": + $ref: "#/components/responses/InvalidRequestError" + "409": + $ref: "#/components/responses/ConflictError" + "500": + $ref: "#/components/responses/InternalError" + "503": + $ref: "#/components/responses/ServiceUnavailableError" + /api/v1/internal/runtimes/{game_id}/stop: + post: + tags: + - Runtimes + operationId: internalStopRuntime + summary: Stop a running game engine container + description: | + Issues `docker stop` with the configured timeout. Idempotent: stop + on a record that is already `stopped` or `removed` returns + success with `error_code=replay_no_op` recorded in the operation + log. + parameters: + - $ref: "#/components/parameters/GameIDPath" + - $ref: "#/components/parameters/XGalaxyCallerHeader" + requestBody: + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/StopRequest" + responses: + "200": + description: Runtime record after the stop operation. + content: + application/json: + schema: + $ref: "#/components/schemas/RuntimeRecord" + "400": + $ref: "#/components/responses/InvalidRequestError" + "404": + $ref: "#/components/responses/NotFoundError" + "409": + $ref: "#/components/responses/ConflictError" + "500": + $ref: "#/components/responses/InternalError" + "503": + $ref: "#/components/responses/ServiceUnavailableError" + /api/v1/internal/runtimes/{game_id}/restart: + post: + tags: + - Runtimes + operationId: internalRestartRuntime + summary: Recreate a game engine container with the same image + description: | + Stops, removes, and re-runs the container with the current + `image_ref`. The container id changes; the engine endpoint stays + stable. + parameters: + - $ref: "#/components/parameters/GameIDPath" + - $ref: "#/components/parameters/XGalaxyCallerHeader" + responses: + "200": + description: Runtime record after the restart operation. + content: + application/json: + schema: + $ref: "#/components/schemas/RuntimeRecord" + "404": + $ref: "#/components/responses/NotFoundError" + "409": + $ref: "#/components/responses/ConflictError" + "500": + $ref: "#/components/responses/InternalError" + "503": + $ref: "#/components/responses/ServiceUnavailableError" + /api/v1/internal/runtimes/{game_id}/patch: + post: + tags: + - Runtimes + operationId: internalPatchRuntime + summary: Recreate a game engine container with a new image + description: | + Restart with a new `image_ref`. 
Allowed only as a semver patch + within the same major and minor line. Cross-major or cross-minor + attempts return `409 conflict` with `error_code=semver_patch_only`. + A non-semver `image_ref` returns `400 invalid_request` with + `error_code=image_ref_not_semver`. + parameters: + - $ref: "#/components/parameters/GameIDPath" + - $ref: "#/components/parameters/XGalaxyCallerHeader" + requestBody: + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/PatchRequest" + responses: + "200": + description: Runtime record after the patch operation. + content: + application/json: + schema: + $ref: "#/components/schemas/RuntimeRecord" + "400": + $ref: "#/components/responses/InvalidRequestError" + "404": + $ref: "#/components/responses/NotFoundError" + "409": + $ref: "#/components/responses/ConflictError" + "500": + $ref: "#/components/responses/InternalError" + "503": + $ref: "#/components/responses/ServiceUnavailableError" + /api/v1/internal/runtimes/{game_id}/container: + delete: + tags: + - Runtimes + operationId: internalCleanupRuntimeContainer + summary: Remove an exited container + description: | + Calls `docker rm` for an already-stopped container and updates the + runtime record to `removed`. Refuses with `409 conflict` if the + record is still `running`. The host state directory is not + deleted. + parameters: + - $ref: "#/components/parameters/GameIDPath" + - $ref: "#/components/parameters/XGalaxyCallerHeader" + responses: + "200": + description: Runtime record after the cleanup operation. + content: + application/json: + schema: + $ref: "#/components/schemas/RuntimeRecord" + "404": + $ref: "#/components/responses/NotFoundError" + "409": + $ref: "#/components/responses/ConflictError" + "500": + $ref: "#/components/responses/InternalError" + "503": + $ref: "#/components/responses/ServiceUnavailableError" +components: + parameters: + GameIDPath: + name: game_id + in: path + required: true + description: Opaque stable game identifier owned by Lobby. + schema: + type: string + XGalaxyCallerHeader: + name: X-Galaxy-Caller + in: header + required: false + description: | + Identifies the calling service so Runtime Manager can record the + right `op_source` in `operation_log` (`gm_rest` for `gm`, + `admin_rest` for `admin`). Missing or unknown values default to + `admin_rest`. + schema: + type: string + enum: + - gm + - admin + schemas: + RuntimeRecord: + type: object + additionalProperties: false + required: + - game_id + - status + - state_path + - docker_network + - last_op_at + - created_at + properties: + game_id: + type: string + description: Opaque stable game identifier owned by Lobby. + status: + type: string + enum: + - running + - stopped + - removed + description: Current runtime status maintained by Runtime Manager. + current_container_id: + type: string + nullable: true + description: Docker container id; null when status is removed. + current_image_ref: + type: string + nullable: true + description: Image reference of the current container; null when status is removed. + engine_endpoint: + type: string + nullable: true + description: Stable engine URL `http://galaxy-game-{game_id}:8080`; null when status is removed. + state_path: + type: string + description: Absolute host path of the per-game bind-mounted state directory. + docker_network: + type: string + description: Docker network name observed when the container was created. 
+ started_at: + type: string + format: date-time + nullable: true + description: UTC timestamp of the most recent successful start. + stopped_at: + type: string + format: date-time + nullable: true + description: UTC timestamp of the most recent stop. + removed_at: + type: string + format: date-time + nullable: true + description: UTC timestamp of the most recent container removal. + last_op_at: + type: string + format: date-time + description: UTC timestamp of the most recent operation; drives retention TTL. + created_at: + type: string + format: date-time + description: UTC timestamp of the first observation of this game. + RuntimesList: + type: object + additionalProperties: false + required: + - items + properties: + items: + type: array + items: + $ref: "#/components/schemas/RuntimeRecord" + StartRequest: + type: object + additionalProperties: false + required: + - image_ref + properties: + image_ref: + type: string + description: Docker reference resolved by the producer (Game Master or Admin Service). + StopRequest: + type: object + additionalProperties: false + required: + - reason + properties: + reason: + $ref: "#/components/schemas/StopReason" + PatchRequest: + type: object + additionalProperties: false + required: + - image_ref + properties: + image_ref: + type: string + description: New Docker reference within the same semver major and minor line. + StopReason: + type: string + enum: + - orphan_cleanup + - cancelled + - finished + - admin_request + - timeout + description: Reason carried in the stop envelope and recorded in the operation log. + ErrorCode: + type: string + enum: + - invalid_request + - not_found + - conflict + - service_unavailable + - internal_error + - image_pull_failed + - image_ref_not_semver + - semver_patch_only + - container_start_failed + - start_config_invalid + - docker_unavailable + - replay_no_op + description: Stable internal API error code. + ProbeResponse: + type: object + additionalProperties: false + required: + - status + properties: + status: + type: string + ErrorResponse: + type: object + additionalProperties: false + required: + - error + properties: + error: + $ref: "#/components/schemas/ErrorBody" + ErrorBody: + type: object + additionalProperties: false + required: + - code + - message + properties: + code: + $ref: "#/components/schemas/ErrorCode" + message: + type: string + description: Human-readable trusted error message. + responses: + InvalidRequestError: + description: Request validation failed. + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorResponse" + examples: + invalidRequest: + value: + error: + code: invalid_request + message: request is invalid + NotFoundError: + description: The requested runtime record does not exist. + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorResponse" + examples: + notFound: + value: + error: + code: not_found + message: runtime record not found + ConflictError: + description: The requested operation is not allowed in the current runtime state. + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorResponse" + examples: + conflict: + value: + error: + code: conflict + message: operation not allowed in current status + InternalError: + description: Unexpected internal service error. 
+ content: + application/json: + schema: + $ref: "#/components/schemas/ErrorResponse" + examples: + internal: + value: + error: + code: internal_error + message: internal server error + ServiceUnavailableError: + description: An upstream dependency is unavailable. + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorResponse" + examples: + unavailable: + value: + error: + code: service_unavailable + message: service is unavailable diff --git a/rtmanager/api/runtime-health-asyncapi.yaml b/rtmanager/api/runtime-health-asyncapi.yaml new file mode 100644 index 0000000..df55427 --- /dev/null +++ b/rtmanager/api/runtime-health-asyncapi.yaml @@ -0,0 +1,195 @@ +asyncapi: 3.1.0 +info: + title: Galaxy Runtime Health Events Contract + version: 1.0.0 + description: | + Stable Redis Streams contract for technical container health events + published by `Runtime Manager`. Consumers include `Game Master`; + `Game Lobby` and `Admin Service` are reserved as future consumers. + + Three independent sources feed this stream: the Docker events + listener, the periodic Docker inspect worker, and the active HTTP + `/healthz` probe. Every emission also upserts the latest snapshot + into `health_snapshots` in PostgreSQL. + + Polymorphism: the `details` field carries an `event_type`-specific + payload selected via `oneOf` per type. Each variant is a closed object + (no unknown fields). + + The `event_type` enum is fixed in this contract; adding a new value + requires a contract bump and a coordinated consumer change. +channels: + healthEvents: + address: runtime:health_events + messages: + runtimeHealthEvent: + $ref: '#/components/messages/RuntimeHealthEvent' +operations: + publishHealthEvent: + action: send + summary: Publish one technical health event for downstream consumers. + channel: + $ref: '#/channels/healthEvents' + messages: + - $ref: '#/channels/healthEvents/messages/runtimeHealthEvent' +components: + messages: + RuntimeHealthEvent: + name: RuntimeHealthEvent + title: Runtime health event + summary: One technical health observation about a game engine container. + payload: + $ref: '#/components/schemas/RuntimeHealthEventPayload' + examples: + - name: containerStarted + summary: Engine container has been created and started. + payload: + game_id: game-123 + container_id: 7c2b5d1a4f6e + event_type: container_started + occurred_at_ms: 1775121700000 + details: + image_ref: registry.example.com/galaxy/game:1.4.7 + - name: containerExited + summary: Engine container terminated with a non-zero exit code. + payload: + game_id: game-123 + container_id: 7c2b5d1a4f6e + event_type: container_exited + occurred_at_ms: 1775121800000 + details: + exit_code: 137 + oom: false + - name: probeFailed + summary: Active probe observed three consecutive failures. + payload: + game_id: game-123 + container_id: 7c2b5d1a4f6e + event_type: probe_failed + occurred_at_ms: 1775121810000 + details: + consecutive_failures: 3 + last_status: 0 + last_error: "context deadline exceeded" + schemas: + RuntimeHealthEventPayload: + type: object + additionalProperties: false + required: + - game_id + - container_id + - event_type + - occurred_at_ms + - details + properties: + game_id: + type: string + description: Opaque stable game identifier owned by Lobby. + container_id: + type: string + description: Docker container id observed by Runtime Manager. May differ from the current container id after a restart race. 
+ event_type: + $ref: '#/components/schemas/EventType' + occurred_at_ms: + type: integer + format: int64 + description: UTC milliseconds when Runtime Manager observed the event. + details: + oneOf: + - $ref: '#/components/schemas/ContainerStartedDetails' + - $ref: '#/components/schemas/ContainerExitedDetails' + - $ref: '#/components/schemas/ContainerOomDetails' + - $ref: '#/components/schemas/ContainerDisappearedDetails' + - $ref: '#/components/schemas/InspectUnhealthyDetails' + - $ref: '#/components/schemas/ProbeFailedDetails' + - $ref: '#/components/schemas/ProbeRecoveredDetails' + description: Polymorphic payload selected by event_type. + EventType: + type: string + enum: + - container_started + - container_exited + - container_oom + - container_disappeared + - inspect_unhealthy + - probe_failed + - probe_recovered + description: Discriminator selecting the details variant. + ContainerStartedDetails: + type: object + additionalProperties: false + required: + - image_ref + properties: + image_ref: + type: string + description: Image reference of the started container. + ContainerExitedDetails: + type: object + additionalProperties: false + required: + - exit_code + - oom + properties: + exit_code: + type: integer + description: Exit code reported by Docker. + oom: + type: boolean + description: True when the container was killed by the OOM killer. + ContainerOomDetails: + type: object + additionalProperties: false + required: + - exit_code + properties: + exit_code: + type: integer + description: Exit code reported by Docker for the OOM event. + ContainerDisappearedDetails: + type: object + additionalProperties: false + description: Empty payload; emitted when a destroy event is observed for a record Runtime Manager did not initiate. + InspectUnhealthyDetails: + type: object + additionalProperties: false + required: + - restart_count + - state + - health + properties: + restart_count: + type: integer + description: Docker RestartCount observed at this inspection. + state: + type: string + description: Docker State.Status observed at this inspection. + health: + type: string + description: Docker State.Health.Status observed at this inspection; empty when the image declares no HEALTHCHECK. + ProbeFailedDetails: + type: object + additionalProperties: false + required: + - consecutive_failures + - last_status + - last_error + properties: + consecutive_failures: + type: integer + description: Number of consecutive probe failures that crossed the threshold. + last_status: + type: integer + description: HTTP status of the last probe attempt; 0 when the probe failed before receiving a response. + last_error: + type: string + description: Operator-readable error of the last probe attempt; empty when not applicable. + ProbeRecoveredDetails: + type: object + additionalProperties: false + required: + - prior_failure_count + properties: + prior_failure_count: + type: integer + description: Number of consecutive failures observed immediately before the recovery. diff --git a/rtmanager/api/runtime-jobs-asyncapi.yaml b/rtmanager/api/runtime-jobs-asyncapi.yaml new file mode 100644 index 0000000..a5a85ec --- /dev/null +++ b/rtmanager/api/runtime-jobs-asyncapi.yaml @@ -0,0 +1,226 @@ +asyncapi: 3.1.0 +info: + title: Galaxy Runtime Jobs Stream Contract + version: 1.0.0 + description: | + Stable Redis Streams contract carrying runtime jobs between + `Game Lobby` and `Runtime Manager`. + + `Game Lobby` is the sole producer for `runtime:start_jobs` and + `runtime:stop_jobs`. 
`Runtime Manager` consumes both, executes the + Docker work, and publishes one outcome per job to `runtime:job_results`, + which is consumed by `Game Lobby`'s runtime-job-result worker. + + Replay safety: + - duplicate start jobs for an already-running game with the same + `image_ref` produce a `success` job result with + `error_code=replay_no_op`; + - duplicate stop jobs for an already-stopped or already-removed game + produce a `success` job result with `error_code=replay_no_op`. + + The `reason` enum on `runtime:stop_jobs` is fixed in this contract. + Adding a new value requires a contract bump and a coordinated + Lobby/Runtime Manager change. +channels: + startJobs: + address: runtime:start_jobs + messages: + runtimeStartJob: + $ref: '#/components/messages/RuntimeStartJob' + stopJobs: + address: runtime:stop_jobs + messages: + runtimeStopJob: + $ref: '#/components/messages/RuntimeStopJob' + jobResults: + address: runtime:job_results + messages: + runtimeJobResult: + $ref: '#/components/messages/RuntimeJobResult' +operations: + consumeStartJob: + action: receive + summary: Receive one start job from Game Lobby and run a container. + channel: + $ref: '#/channels/startJobs' + messages: + - $ref: '#/channels/startJobs/messages/runtimeStartJob' + consumeStopJob: + action: receive + summary: Receive one stop job from Game Lobby and stop a container. + channel: + $ref: '#/channels/stopJobs' + messages: + - $ref: '#/channels/stopJobs/messages/runtimeStopJob' + publishJobResult: + action: send + summary: Publish one runtime job outcome for Game Lobby. + channel: + $ref: '#/channels/jobResults' + messages: + - $ref: '#/channels/jobResults/messages/runtimeJobResult' +components: + messages: + RuntimeStartJob: + name: RuntimeStartJob + title: Runtime start job + summary: Lobby request to start one game engine container. + payload: + $ref: '#/components/schemas/RuntimeStartJobPayload' + examples: + - name: startJob + summary: Start a game engine container with a producer-resolved image_ref. + payload: + game_id: game-123 + image_ref: registry.example.com/galaxy/game:1.4.7 + requested_at_ms: 1775121700000 + RuntimeStopJob: + name: RuntimeStopJob + title: Runtime stop job + summary: Lobby request to stop one game engine container. + payload: + $ref: '#/components/schemas/RuntimeStopJobPayload' + examples: + - name: cancelled + summary: Stop the engine because the game was cancelled. + payload: + game_id: game-123 + reason: cancelled + requested_at_ms: 1775121800000 + - name: orphanCleanup + summary: Stop an engine whose Lobby metadata persistence failed. + payload: + game_id: game-456 + reason: orphan_cleanup + requested_at_ms: 1775121810000 + RuntimeJobResult: + name: RuntimeJobResult + title: Runtime job result + summary: Outcome of one start or stop job. + payload: + $ref: '#/components/schemas/RuntimeJobResultPayload' + examples: + - name: startSuccess + summary: Successful start, container_id and engine_endpoint are populated. + payload: + game_id: game-123 + outcome: success + container_id: 7c2b5d1a4f6e + engine_endpoint: http://galaxy-game-game-123:8080 + error_code: "" + error_message: "" + - name: imagePullFailed + summary: Failed start due to an image pull error. + payload: + game_id: game-789 + outcome: failure + container_id: "" + engine_endpoint: "" + error_code: image_pull_failed + error_message: "manifest unknown" + - name: replayNoOp + summary: Idempotent replay; the job was a no-op. 
+ payload: + game_id: game-123 + outcome: success + container_id: 7c2b5d1a4f6e + engine_endpoint: http://galaxy-game-game-123:8080 + error_code: replay_no_op + error_message: "" + schemas: + RuntimeStartJobPayload: + type: object + additionalProperties: false + required: + - game_id + - image_ref + - requested_at_ms + properties: + game_id: + type: string + description: Opaque stable game identifier owned by Lobby. + image_ref: + type: string + description: Docker reference resolved by Lobby from LOBBY_ENGINE_IMAGE_TEMPLATE. + requested_at_ms: + type: integer + format: int64 + description: UTC milliseconds; used for diagnostics, not authoritative. + RuntimeStopJobPayload: + type: object + additionalProperties: false + required: + - game_id + - reason + - requested_at_ms + properties: + game_id: + type: string + description: Opaque stable game identifier owned by Lobby. + reason: + $ref: '#/components/schemas/StopReason' + requested_at_ms: + type: integer + format: int64 + description: UTC milliseconds; used for diagnostics, not authoritative. + RuntimeJobResultPayload: + type: object + additionalProperties: false + required: + - game_id + - outcome + - container_id + - engine_endpoint + - error_code + - error_message + properties: + game_id: + type: string + description: Opaque stable game identifier matching the originating job. + outcome: + type: string + enum: + - success + - failure + description: High-level outcome of the runtime job. + container_id: + type: string + description: Docker container id of the engine; populated on success, empty on failure. + engine_endpoint: + type: string + description: Stable engine URL `http://galaxy-game-{game_id}:8080`; populated on success, empty on failure. + error_code: + $ref: '#/components/schemas/ErrorCode' + error_message: + type: string + description: Operator-readable detail; empty when not applicable. + StopReason: + type: string + enum: + - orphan_cleanup + - cancelled + - finished + - admin_request + - timeout + description: Reason value carried by every runtime:stop_jobs envelope. + ErrorCode: + type: string + enum: + - "" + - invalid_request + - not_found + - conflict + - service_unavailable + - internal_error + - image_pull_failed + - image_ref_not_semver + - semver_patch_only + - container_start_failed + - start_config_invalid + - docker_unavailable + - replay_no_op + description: | + Stable error code identical to the internal REST contract. The empty + string is a valid value for successful job results that did not + produce a code (the field is required to be present so consumers + can rely on the schema). diff --git a/rtmanager/cmd/jetgen/main.go b/rtmanager/cmd/jetgen/main.go new file mode 100644 index 0000000..0c0e36c --- /dev/null +++ b/rtmanager/cmd/jetgen/main.go @@ -0,0 +1,236 @@ +// Command jetgen regenerates the go-jet/v2 query-builder code under +// galaxy/rtmanager/internal/adapters/postgres/jet/ against a transient +// PostgreSQL instance. +// +// The program is intended to be invoked as `go run ./cmd/jetgen` (or via +// the `make jet` Makefile target) from within `galaxy/rtmanager`. It is +// not part of the runtime binary. +// +// Steps: +// +// 1. start a postgres:16-alpine container via testcontainers-go +// 2. open it through pkg/postgres as the superuser +// 3. CREATE ROLE rtmanagerservice and CREATE SCHEMA "rtmanager" +// AUTHORIZATION rtmanagerservice +// 4. open a second pool as rtmanagerservice with search_path=rtmanager +// and apply the embedded goose migrations +// 5. 
run jet's PostgreSQL generator against schema=rtmanager, writing +// into ../internal/adapters/postgres/jet +package main + +import ( + "context" + "errors" + "fmt" + "log" + "net/url" + "os" + "path/filepath" + "runtime" + "time" + + "galaxy/postgres" + "galaxy/rtmanager/internal/adapters/postgres/migrations" + + jetpostgres "github.com/go-jet/jet/v2/generator/postgres" + testcontainers "github.com/testcontainers/testcontainers-go" + tcpostgres "github.com/testcontainers/testcontainers-go/modules/postgres" + "github.com/testcontainers/testcontainers-go/wait" +) + +const ( + postgresImage = "postgres:16-alpine" + superuserName = "galaxy" + superuserPassword = "galaxy" + superuserDatabase = "galaxy_rtmanager" + serviceRole = "rtmanagerservice" + servicePassword = "rtmanagerservice" + serviceSchema = "rtmanager" + containerStartup = 90 * time.Second + defaultOpTimeout = 10 * time.Second + jetOutputDirSuffix = "internal/adapters/postgres/jet" +) + +func main() { + if err := run(context.Background()); err != nil { + log.Fatalf("jetgen: %v", err) + } +} + +func run(ctx context.Context) error { + outputDir, err := jetOutputDir() + if err != nil { + return err + } + + container, err := tcpostgres.Run(ctx, postgresImage, + tcpostgres.WithDatabase(superuserDatabase), + tcpostgres.WithUsername(superuserName), + tcpostgres.WithPassword(superuserPassword), + testcontainers.WithWaitStrategy( + wait.ForLog("database system is ready to accept connections"). + WithOccurrence(2). + WithStartupTimeout(containerStartup), + ), + ) + if err != nil { + return fmt.Errorf("start postgres container: %w", err) + } + defer func() { + if termErr := testcontainers.TerminateContainer(container); termErr != nil { + log.Printf("jetgen: terminate container: %v", termErr) + } + }() + + baseDSN, err := container.ConnectionString(ctx, "sslmode=disable") + if err != nil { + return fmt.Errorf("resolve container dsn: %w", err) + } + + if err := provisionRoleAndSchema(ctx, baseDSN); err != nil { + return err + } + + scopedDSN, err := dsnForServiceRole(baseDSN) + if err != nil { + return err + } + if err := applyMigrations(ctx, scopedDSN); err != nil { + return err + } + + if err := os.RemoveAll(outputDir); err != nil { + return fmt.Errorf("remove existing jet output %q: %w", outputDir, err) + } + if err := os.MkdirAll(filepath.Dir(outputDir), 0o755); err != nil { + return fmt.Errorf("ensure jet output parent: %w", err) + } + + jetCfg := postgres.DefaultConfig() + jetCfg.PrimaryDSN = scopedDSN + jetCfg.OperationTimeout = defaultOpTimeout + jetDB, err := postgres.OpenPrimary(ctx, jetCfg) + if err != nil { + return fmt.Errorf("open scoped pool for jet generation: %w", err) + } + defer func() { _ = jetDB.Close() }() + + if err := jetpostgres.GenerateDB(jetDB, serviceSchema, outputDir); err != nil { + return fmt.Errorf("jet generate: %w", err) + } + + log.Printf("jetgen: generated jet code into %s (schema=%s)", outputDir, serviceSchema) + return nil +} + +func provisionRoleAndSchema(ctx context.Context, baseDSN string) error { + cfg := postgres.DefaultConfig() + cfg.PrimaryDSN = baseDSN + cfg.OperationTimeout = defaultOpTimeout + db, err := postgres.OpenPrimary(ctx, cfg) + if err != nil { + return fmt.Errorf("open admin pool: %w", err) + } + defer func() { _ = db.Close() }() + + statements := []string{ + fmt.Sprintf(`DO $$ BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = %s) THEN + CREATE ROLE %s LOGIN PASSWORD %s; + END IF; + END $$;`, sqlLiteral(serviceRole), sqlIdentifier(serviceRole), sqlLiteral(servicePassword)), + 
fmt.Sprintf(`CREATE SCHEMA IF NOT EXISTS %s AUTHORIZATION %s;`, + sqlIdentifier(serviceSchema), sqlIdentifier(serviceRole)), + fmt.Sprintf(`GRANT USAGE ON SCHEMA %s TO %s;`, + sqlIdentifier(serviceSchema), sqlIdentifier(serviceRole)), + } + for _, statement := range statements { + if _, err := db.ExecContext(ctx, statement); err != nil { + return fmt.Errorf("provision %q/%q: %w", serviceSchema, serviceRole, err) + } + } + return nil +} + +func dsnForServiceRole(baseDSN string) (string, error) { + parsed, err := url.Parse(baseDSN) + if err != nil { + return "", fmt.Errorf("parse base dsn: %w", err) + } + values := url.Values{} + values.Set("search_path", serviceSchema) + values.Set("sslmode", "disable") + scoped := url.URL{ + Scheme: parsed.Scheme, + User: url.UserPassword(serviceRole, servicePassword), + Host: parsed.Host, + Path: parsed.Path, + RawQuery: values.Encode(), + } + return scoped.String(), nil +} + +func applyMigrations(ctx context.Context, dsn string) error { + cfg := postgres.DefaultConfig() + cfg.PrimaryDSN = dsn + cfg.OperationTimeout = defaultOpTimeout + db, err := postgres.OpenPrimary(ctx, cfg) + if err != nil { + return fmt.Errorf("open scoped pool: %w", err) + } + defer func() { _ = db.Close() }() + + if err := postgres.Ping(ctx, db, defaultOpTimeout); err != nil { + return err + } + if err := postgres.RunMigrations(ctx, db, migrations.FS(), "."); err != nil { + return fmt.Errorf("run migrations: %w", err) + } + return nil +} + +// jetOutputDir returns the absolute path that jet should write into. We +// rely on the runtime caller info to anchor it to galaxy/rtmanager +// regardless of the invoking working directory. +func jetOutputDir() (string, error) { + _, file, _, ok := runtime.Caller(0) + if !ok { + return "", errors.New("resolve runtime caller for jet output path") + } + dir := filepath.Dir(file) + // dir = .../galaxy/rtmanager/cmd/jetgen + moduleRoot := filepath.Clean(filepath.Join(dir, "..", "..")) + return filepath.Join(moduleRoot, jetOutputDirSuffix), nil +} + +func sqlIdentifier(name string) string { + return `"` + escapeDoubleQuotes(name) + `"` +} + +func sqlLiteral(value string) string { + return "'" + escapeSingleQuotes(value) + "'" +} + +func escapeDoubleQuotes(value string) string { + out := make([]byte, 0, len(value)) + for index := 0; index < len(value); index++ { + if value[index] == '"' { + out = append(out, '"', '"') + continue + } + out = append(out, value[index]) + } + return string(out) +} + +func escapeSingleQuotes(value string) string { + out := make([]byte, 0, len(value)) + for index := 0; index < len(value); index++ { + if value[index] == '\'' { + out = append(out, '\'', '\'') + continue + } + out = append(out, value[index]) + } + return string(out) +} diff --git a/rtmanager/cmd/rtmanager/main.go b/rtmanager/cmd/rtmanager/main.go new file mode 100644 index 0000000..dee37a5 --- /dev/null +++ b/rtmanager/cmd/rtmanager/main.go @@ -0,0 +1,47 @@ +// Binary rtmanager is the runnable Runtime Manager Service process +// entrypoint. 
+package main + +import ( + "context" + "fmt" + "os" + "os/signal" + "syscall" + + "galaxy/rtmanager/internal/app" + "galaxy/rtmanager/internal/config" + "galaxy/rtmanager/internal/logging" +) + +func main() { + if err := run(); err != nil { + _, _ = fmt.Fprintf(os.Stderr, "rtmanager: %v\n", err) + os.Exit(1) + } +} + +func run() error { + cfg, err := config.LoadFromEnv() + if err != nil { + return err + } + + logger, err := logging.New(cfg.Logging.Level) + if err != nil { + return err + } + + rootCtx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM) + defer stop() + + runtime, err := app.NewRuntime(rootCtx, cfg, logger) + if err != nil { + return err + } + defer func() { + _ = runtime.Close() + }() + + return runtime.Run(rootCtx) +} diff --git a/rtmanager/contract_asyncapi_test.go b/rtmanager/contract_asyncapi_test.go new file mode 100644 index 0000000..ee35f44 --- /dev/null +++ b/rtmanager/contract_asyncapi_test.go @@ -0,0 +1,392 @@ +package rtmanager + +import ( + "os" + "path/filepath" + "runtime" + "testing" + + "github.com/stretchr/testify/require" + "gopkg.in/yaml.v3" +) + +var expectedStopReasonEnum = []string{ + "orphan_cleanup", + "cancelled", + "finished", + "admin_request", + "timeout", +} + +var expectedJobResultErrorCodeEnum = []string{ + "", + "invalid_request", + "not_found", + "conflict", + "service_unavailable", + "internal_error", + "image_pull_failed", + "image_ref_not_semver", + "semver_patch_only", + "container_start_failed", + "start_config_invalid", + "docker_unavailable", + "replay_no_op", +} + +var expectedHealthEventTypeEnum = []string{ + "container_started", + "container_exited", + "container_oom", + "container_disappeared", + "inspect_unhealthy", + "probe_failed", + "probe_recovered", +} + +var expectedHealthDetailsBranches = []struct { + schema string + required []string +}{ + {schema: "ContainerStartedDetails", required: []string{"image_ref"}}, + {schema: "ContainerExitedDetails", required: []string{"exit_code", "oom"}}, + {schema: "ContainerOomDetails", required: []string{"exit_code"}}, + {schema: "ContainerDisappearedDetails", required: nil}, + {schema: "InspectUnhealthyDetails", required: []string{"restart_count", "state", "health"}}, + {schema: "ProbeFailedDetails", required: []string{"consecutive_failures", "last_status", "last_error"}}, + {schema: "ProbeRecoveredDetails", required: []string{"prior_failure_count"}}, +} + +func TestRuntimeJobsAsyncAPISpecLoads(t *testing.T) { + t.Parallel() + + doc := loadAsyncAPISpec(t, filepath.Join("api", "runtime-jobs-asyncapi.yaml")) + require.Equal(t, "3.1.0", getStringValue(t, doc, "asyncapi")) +} + +func TestRuntimeJobsSpecFreezesChannelAddresses(t *testing.T) { + t.Parallel() + + doc := loadAsyncAPISpec(t, filepath.Join("api", "runtime-jobs-asyncapi.yaml")) + channels := getMapValue(t, doc, "channels") + + require.Equal(t, "runtime:start_jobs", + getStringValue(t, getMapValue(t, channels, "startJobs"), "address")) + require.Equal(t, "runtime:stop_jobs", + getStringValue(t, getMapValue(t, channels, "stopJobs"), "address")) + require.Equal(t, "runtime:job_results", + getStringValue(t, getMapValue(t, channels, "jobResults"), "address")) +} + +func TestRuntimeJobsSpecFreezesOperationActions(t *testing.T) { + t.Parallel() + + doc := loadAsyncAPISpec(t, filepath.Join("api", "runtime-jobs-asyncapi.yaml")) + operations := getMapValue(t, doc, "operations") + + cases := []struct { + operation string + action string + channel string + }{ + {operation: "consumeStartJob", action: "receive", 
channel: "#/channels/startJobs"}, + {operation: "consumeStopJob", action: "receive", channel: "#/channels/stopJobs"}, + {operation: "publishJobResult", action: "send", channel: "#/channels/jobResults"}, + } + + for _, tc := range cases { + t.Run(tc.operation, func(t *testing.T) { + t.Parallel() + op := getMapValue(t, operations, tc.operation) + require.Equal(t, tc.action, getStringValue(t, op, "action")) + require.Equal(t, tc.channel, + getStringValue(t, getMapValue(t, op, "channel"), "$ref")) + }) + } +} + +func TestRuntimeJobsSpecFreezesMessageNames(t *testing.T) { + t.Parallel() + + doc := loadAsyncAPISpec(t, filepath.Join("api", "runtime-jobs-asyncapi.yaml")) + messages := getMapValue(t, doc, "components", "messages") + + for _, name := range []string{"RuntimeStartJob", "RuntimeStopJob", "RuntimeJobResult"} { + t.Run(name, func(t *testing.T) { + t.Parallel() + message := getMapValue(t, messages, name) + require.Equal(t, name, getStringValue(t, message, "name")) + }) + } +} + +func TestRuntimeJobsSpecFreezesStartJobPayload(t *testing.T) { + t.Parallel() + + doc := loadAsyncAPISpec(t, filepath.Join("api", "runtime-jobs-asyncapi.yaml")) + payload := getMapValue(t, doc, "components", "schemas", "RuntimeStartJobPayload") + + require.ElementsMatch(t, + []string{"game_id", "image_ref", "requested_at_ms"}, + getStringSlice(t, payload, "required")) + require.False(t, getBoolValue(t, payload, "additionalProperties"), + "RuntimeStartJobPayload must reject unknown fields") +} + +func TestRuntimeJobsSpecFreezesStopJobPayload(t *testing.T) { + t.Parallel() + + doc := loadAsyncAPISpec(t, filepath.Join("api", "runtime-jobs-asyncapi.yaml")) + payload := getMapValue(t, doc, "components", "schemas", "RuntimeStopJobPayload") + + require.ElementsMatch(t, + []string{"game_id", "reason", "requested_at_ms"}, + getStringSlice(t, payload, "required")) + require.False(t, getBoolValue(t, payload, "additionalProperties"), + "RuntimeStopJobPayload must reject unknown fields") + + reason := getMapValue(t, payload, "properties", "reason") + require.Equal(t, "#/components/schemas/StopReason", + getStringValue(t, reason, "$ref"), + "RuntimeStopJobPayload.reason must reference StopReason") + + stopReason := getMapValue(t, doc, "components", "schemas", "StopReason") + require.ElementsMatch(t, expectedStopReasonEnum, + getStringSlice(t, stopReason, "enum")) +} + +func TestRuntimeJobsSpecFreezesJobResultPayload(t *testing.T) { + t.Parallel() + + doc := loadAsyncAPISpec(t, filepath.Join("api", "runtime-jobs-asyncapi.yaml")) + payload := getMapValue(t, doc, "components", "schemas", "RuntimeJobResultPayload") + + require.ElementsMatch(t, + []string{"game_id", "outcome", "container_id", "engine_endpoint", "error_code", "error_message"}, + getStringSlice(t, payload, "required")) + require.False(t, getBoolValue(t, payload, "additionalProperties"), + "RuntimeJobResultPayload must reject unknown fields") + + outcome := getMapValue(t, payload, "properties", "outcome") + require.ElementsMatch(t, []string{"success", "failure"}, + getStringSlice(t, outcome, "enum")) + + errorCode := getMapValue(t, payload, "properties", "error_code") + require.Equal(t, "#/components/schemas/ErrorCode", + getStringValue(t, errorCode, "$ref"), + "RuntimeJobResultPayload.error_code must reference ErrorCode") + + errorCodeSchema := getMapValue(t, doc, "components", "schemas", "ErrorCode") + require.ElementsMatch(t, expectedJobResultErrorCodeEnum, + getStringSlice(t, errorCodeSchema, "enum")) +} + +func TestRuntimeHealthAsyncAPISpecLoads(t *testing.T) { + 
t.Parallel() + + doc := loadAsyncAPISpec(t, filepath.Join("api", "runtime-health-asyncapi.yaml")) + require.Equal(t, "3.1.0", getStringValue(t, doc, "asyncapi")) +} + +func TestRuntimeHealthSpecFreezesChannelAndOperation(t *testing.T) { + t.Parallel() + + doc := loadAsyncAPISpec(t, filepath.Join("api", "runtime-health-asyncapi.yaml")) + + channel := getMapValue(t, doc, "channels", "healthEvents") + require.Equal(t, "runtime:health_events", getStringValue(t, channel, "address")) + + operation := getMapValue(t, doc, "operations", "publishHealthEvent") + require.Equal(t, "send", getStringValue(t, operation, "action")) + require.Equal(t, "#/channels/healthEvents", + getStringValue(t, getMapValue(t, operation, "channel"), "$ref")) + + message := getMapValue(t, doc, "components", "messages", "RuntimeHealthEvent") + require.Equal(t, "RuntimeHealthEvent", getStringValue(t, message, "name")) +} + +func TestRuntimeHealthSpecFreezesEnvelope(t *testing.T) { + t.Parallel() + + doc := loadAsyncAPISpec(t, filepath.Join("api", "runtime-health-asyncapi.yaml")) + payload := getMapValue(t, doc, "components", "schemas", "RuntimeHealthEventPayload") + + require.ElementsMatch(t, + []string{"game_id", "container_id", "event_type", "occurred_at_ms", "details"}, + getStringSlice(t, payload, "required")) + require.False(t, getBoolValue(t, payload, "additionalProperties"), + "RuntimeHealthEventPayload must reject unknown fields") + + eventType := getMapValue(t, payload, "properties", "event_type") + require.Equal(t, "#/components/schemas/EventType", + getStringValue(t, eventType, "$ref"), + "RuntimeHealthEventPayload.event_type must reference EventType") +} + +func TestRuntimeHealthSpecFreezesEventTypeEnum(t *testing.T) { + t.Parallel() + + doc := loadAsyncAPISpec(t, filepath.Join("api", "runtime-health-asyncapi.yaml")) + schema := getMapValue(t, doc, "components", "schemas", "EventType") + + require.ElementsMatch(t, expectedHealthEventTypeEnum, + getStringSlice(t, schema, "enum")) +} + +func TestRuntimeHealthSpecFreezesDetailsOneOfBranches(t *testing.T) { + t.Parallel() + + doc := loadAsyncAPISpec(t, filepath.Join("api", "runtime-health-asyncapi.yaml")) + details := getMapValue(t, doc, "components", "schemas", "RuntimeHealthEventPayload", + "properties", "details") + + branches := getSliceValue(t, details, "oneOf") + require.Lenf(t, branches, len(expectedHealthDetailsBranches), + "details.oneOf must have %d branches", len(expectedHealthDetailsBranches)) + + gotRefs := make([]string, 0, len(branches)) + for _, raw := range branches { + branch, ok := raw.(map[string]any) + require.True(t, ok, "details.oneOf entry must be a mapping") + gotRefs = append(gotRefs, getStringValue(t, branch, "$ref")) + } + + wantRefs := make([]string, 0, len(expectedHealthDetailsBranches)) + for _, branch := range expectedHealthDetailsBranches { + wantRefs = append(wantRefs, "#/components/schemas/"+branch.schema) + } + require.ElementsMatch(t, wantRefs, gotRefs) + + for _, branch := range expectedHealthDetailsBranches { + t.Run(branch.schema, func(t *testing.T) { + t.Parallel() + schema := getMapValue(t, doc, "components", "schemas", branch.schema) + require.False(t, getBoolValue(t, schema, "additionalProperties"), + "%s must reject unknown fields", branch.schema) + if branch.required == nil { + _, hasRequired := schema["required"] + require.False(t, hasRequired, + "%s must not declare required fields", branch.schema) + return + } + require.ElementsMatch(t, branch.required, + getStringSlice(t, schema, "required")) + }) + } +} + +func 
loadAsyncAPISpec(t *testing.T, relativePath string) map[string]any { + t.Helper() + + payload := loadTextFile(t, relativePath) + + var doc map[string]any + if err := yaml.Unmarshal([]byte(payload), &doc); err != nil { + require.Failf(t, "test failed", "decode spec: %v", err) + } + + return doc +} + +func loadTextFile(t *testing.T, relativePath string) string { + t.Helper() + + path := filepath.Join(moduleRoot(t), relativePath) + payload, err := os.ReadFile(path) + if err != nil { + require.Failf(t, "test failed", "read file %s: %v", path, err) + } + + return string(payload) +} + +func moduleRoot(t *testing.T) string { + t.Helper() + + _, thisFile, _, ok := runtime.Caller(0) + if !ok { + require.FailNow(t, "runtime.Caller failed") + } + + return filepath.Dir(thisFile) +} + +func getMapValue(t *testing.T, value map[string]any, path ...string) map[string]any { + t.Helper() + + current := value + for _, segment := range path { + raw, ok := current[segment] + if !ok { + require.Failf(t, "test failed", "missing map key %s", segment) + } + next, ok := raw.(map[string]any) + if !ok { + require.Failf(t, "test failed", "value at %s is not a map", segment) + } + current = next + } + + return current +} + +func getStringValue(t *testing.T, value map[string]any, key string) string { + t.Helper() + + raw, ok := value[key] + if !ok { + require.Failf(t, "test failed", "missing key %s", key) + } + result, ok := raw.(string) + if !ok { + require.Failf(t, "test failed", "value at %s is not a string", key) + } + + return result +} + +func getBoolValue(t *testing.T, value map[string]any, key string) bool { + t.Helper() + + raw, ok := value[key] + if !ok { + require.Failf(t, "test failed", "missing key %s", key) + } + result, ok := raw.(bool) + if !ok { + require.Failf(t, "test failed", "value at %s is not a bool", key) + } + + return result +} + +func getStringSlice(t *testing.T, value map[string]any, key string) []string { + t.Helper() + + raw := getSliceValue(t, value, key) + result := make([]string, 0, len(raw)) + for _, item := range raw { + text, ok := item.(string) + if !ok { + require.Failf(t, "test failed", "value at %s is not a string slice", key) + } + result = append(result, text) + } + + return result +} + +func getSliceValue(t *testing.T, value map[string]any, key string) []any { + t.Helper() + + raw, ok := value[key] + if !ok { + require.Failf(t, "test failed", "missing key %s", key) + } + result, ok := raw.([]any) + if !ok { + require.Failf(t, "test failed", "value at %s is not a slice", key) + } + + return result +} diff --git a/rtmanager/contract_openapi_test.go b/rtmanager/contract_openapi_test.go new file mode 100644 index 0000000..b8552cd --- /dev/null +++ b/rtmanager/contract_openapi_test.go @@ -0,0 +1,384 @@ +package rtmanager + +import ( + "context" + "net/http" + "path/filepath" + "runtime" + "testing" + + "github.com/getkin/kin-openapi/openapi3" + "github.com/stretchr/testify/require" +) + +// TestInternalOpenAPISpecValidates loads internal-openapi.yaml and verifies +// it is a syntactically valid OpenAPI 3.0 document. +func TestInternalOpenAPISpecValidates(t *testing.T) { + t.Parallel() + loadInternalOpenAPISpec(t) +} + +// TestInternalSpecFreezesOperationIDs verifies that every documented +// endpoint declares the exact operationId required by the Runtime Manager +// internal contract. Missing or renamed operationIds break the contract +// for Game Master and Admin Service. 
+func TestInternalSpecFreezesOperationIDs(t *testing.T) { + t.Parallel() + + doc := loadInternalOpenAPISpec(t) + + cases := []struct { + method string + path string + operationID string + }{ + {http.MethodGet, "/healthz", "internalHealthz"}, + {http.MethodGet, "/readyz", "internalReadyz"}, + {http.MethodGet, "/api/v1/internal/runtimes", "internalListRuntimes"}, + {http.MethodGet, "/api/v1/internal/runtimes/{game_id}", "internalGetRuntime"}, + {http.MethodPost, "/api/v1/internal/runtimes/{game_id}/start", "internalStartRuntime"}, + {http.MethodPost, "/api/v1/internal/runtimes/{game_id}/stop", "internalStopRuntime"}, + {http.MethodPost, "/api/v1/internal/runtimes/{game_id}/restart", "internalRestartRuntime"}, + {http.MethodPost, "/api/v1/internal/runtimes/{game_id}/patch", "internalPatchRuntime"}, + {http.MethodDelete, "/api/v1/internal/runtimes/{game_id}/container", "internalCleanupRuntimeContainer"}, + } + + for _, tc := range cases { + t.Run(tc.operationID, func(t *testing.T) { + t.Parallel() + op := getOperation(t, doc, tc.path, tc.method) + require.Equal(t, tc.operationID, op.OperationID) + }) + } +} + +// TestInternalSpecFreezesRuntimeRecordSchema verifies that RuntimeRecord +// declares the required field set documented in +// rtmanager/README.md §Persistence Layout, with the status enum frozen. +func TestInternalSpecFreezesRuntimeRecordSchema(t *testing.T) { + t.Parallel() + + doc := loadInternalOpenAPISpec(t) + schema := componentSchemaRef(t, doc, "RuntimeRecord") + + assertRequiredFields(t, schema, + "game_id", "status", "state_path", "docker_network", + "last_op_at", "created_at", + ) + + for _, optional := range []string{ + "current_container_id", "current_image_ref", "engine_endpoint", + "started_at", "stopped_at", "removed_at", + } { + require.Contains(t, schema.Value.Properties, optional, + "RuntimeRecord.%s must be present in properties", optional) + } + + assertStringEnum(t, schema, "status", "running", "stopped", "removed") +} + +// TestInternalSpecFreezesStartRequest verifies that StartRequest requires +// only image_ref and rejects unknown fields. +func TestInternalSpecFreezesStartRequest(t *testing.T) { + t.Parallel() + + doc := loadInternalOpenAPISpec(t) + schema := componentSchemaRef(t, doc, "StartRequest") + + assertRequiredFields(t, schema, "image_ref") + require.NotNil(t, schema.Value.AdditionalProperties.Has) + require.False(t, *schema.Value.AdditionalProperties.Has, + "StartRequest must reject unknown fields") +} + +// TestInternalSpecFreezesStopRequest verifies that StopRequest requires +// only reason, that reason references the StopReason schema, and that +// unknown fields are rejected. +func TestInternalSpecFreezesStopRequest(t *testing.T) { + t.Parallel() + + doc := loadInternalOpenAPISpec(t) + schema := componentSchemaRef(t, doc, "StopRequest") + + assertRequiredFields(t, schema, "reason") + require.NotNil(t, schema.Value.AdditionalProperties.Has) + require.False(t, *schema.Value.AdditionalProperties.Has, + "StopRequest must reject unknown fields") + + reason := schema.Value.Properties["reason"] + require.NotNil(t, reason, "StopRequest.reason must be present") + require.Equal(t, "#/components/schemas/StopReason", reason.Ref, + "StopRequest.reason must reference StopReason") +} + +// TestInternalSpecFreezesPatchRequest verifies that PatchRequest requires +// only image_ref and rejects unknown fields. 
+func TestInternalSpecFreezesPatchRequest(t *testing.T) { + t.Parallel() + + doc := loadInternalOpenAPISpec(t) + schema := componentSchemaRef(t, doc, "PatchRequest") + + assertRequiredFields(t, schema, "image_ref") + require.NotNil(t, schema.Value.AdditionalProperties.Has) + require.False(t, *schema.Value.AdditionalProperties.Has, + "PatchRequest must reject unknown fields") +} + +// TestInternalSpecFreezesStopReasonEnum verifies that the stop reason enum +// matches the contract recorded in +// rtmanager/README.md §Async Stream Contracts. +func TestInternalSpecFreezesStopReasonEnum(t *testing.T) { + t.Parallel() + + doc := loadInternalOpenAPISpec(t) + schema := componentSchemaRef(t, doc, "StopReason") + + got := make([]string, 0, len(schema.Value.Enum)) + for _, value := range schema.Value.Enum { + got = append(got, value.(string)) + } + + require.ElementsMatch(t, + []string{"orphan_cleanup", "cancelled", "finished", "admin_request", "timeout"}, + got) +} + +// TestInternalSpecFreezesErrorCodeCatalog verifies that ErrorCode contains +// every stable code declared in rtmanager/README.md §Error Model. +func TestInternalSpecFreezesErrorCodeCatalog(t *testing.T) { + t.Parallel() + + doc := loadInternalOpenAPISpec(t) + schema := componentSchemaRef(t, doc, "ErrorCode") + + got := make([]string, 0, len(schema.Value.Enum)) + for _, value := range schema.Value.Enum { + got = append(got, value.(string)) + } + + require.ElementsMatch(t, + []string{ + "invalid_request", + "not_found", + "conflict", + "service_unavailable", + "internal_error", + "image_pull_failed", + "image_ref_not_semver", + "semver_patch_only", + "container_start_failed", + "start_config_invalid", + "docker_unavailable", + "replay_no_op", + }, + got) +} + +// TestInternalSpecFreezesErrorEnvelope verifies that ErrorResponse uses the +// `{ "error": { "code", "message" } }` shape and that error.code references +// the ErrorCode enum. +func TestInternalSpecFreezesErrorEnvelope(t *testing.T) { + t.Parallel() + + doc := loadInternalOpenAPISpec(t) + + envelope := componentSchemaRef(t, doc, "ErrorResponse") + assertRequiredFields(t, envelope, "error") + require.Equal(t, "#/components/schemas/ErrorBody", + envelope.Value.Properties["error"].Ref, + "ErrorResponse.error must reference ErrorBody") + + body := componentSchemaRef(t, doc, "ErrorBody") + assertRequiredFields(t, body, "code", "message") + require.Equal(t, "#/components/schemas/ErrorCode", + body.Value.Properties["code"].Ref, + "ErrorBody.code must reference ErrorCode") + require.Equal(t, "string", + body.Value.Properties["message"].Value.Type.Slice()[0], + "ErrorBody.message must be a string") +} + +// TestInternalSpecFreezesProbeResponses verifies that /healthz returns 200 +// with the probe payload and /readyz declares both 200 and 503. 
+func TestInternalSpecFreezesProbeResponses(t *testing.T) { + t.Parallel() + + doc := loadInternalOpenAPISpec(t) + + healthz := getOperation(t, doc, "/healthz", http.MethodGet) + assertSchemaRef(t, responseSchemaRef(t, healthz, http.StatusOK), + "#/components/schemas/ProbeResponse", "internalHealthz 200") + + readyz := getOperation(t, doc, "/readyz", http.MethodGet) + assertSchemaRef(t, responseSchemaRef(t, readyz, http.StatusOK), + "#/components/schemas/ProbeResponse", "internalReadyz 200") + require.NotNil(t, readyz.Responses.Status(http.StatusServiceUnavailable), + "internalReadyz must declare a 503 response") +} + +// TestInternalSpecFreezesXGalaxyCallerHeader verifies that the optional +// X-Galaxy-Caller header parameter is declared and referenced from every +// runtime operation. Removing the parameter or detaching it from any of +// the seven runtime endpoints would silently drop the only signal RTM +// uses to distinguish gm_rest from admin_rest in operation_log. +func TestInternalSpecFreezesXGalaxyCallerHeader(t *testing.T) { + t.Parallel() + + doc := loadInternalOpenAPISpec(t) + + param := doc.Components.Parameters["XGalaxyCallerHeader"] + require.NotNil(t, param, "XGalaxyCallerHeader parameter must be declared") + require.NotNil(t, param.Value, "XGalaxyCallerHeader parameter must have a value") + require.Equal(t, "header", param.Value.In) + require.Equal(t, "X-Galaxy-Caller", param.Value.Name) + require.False(t, param.Value.Required, "X-Galaxy-Caller must be optional") + + enum := param.Value.Schema.Value.Enum + got := make([]string, 0, len(enum)) + for _, value := range enum { + got = append(got, value.(string)) + } + require.ElementsMatch(t, []string{"gm", "admin"}, got) + + runtimeOps := []struct { + method string + path string + }{ + {http.MethodGet, "/api/v1/internal/runtimes"}, + {http.MethodGet, "/api/v1/internal/runtimes/{game_id}"}, + {http.MethodPost, "/api/v1/internal/runtimes/{game_id}/start"}, + {http.MethodPost, "/api/v1/internal/runtimes/{game_id}/stop"}, + {http.MethodPost, "/api/v1/internal/runtimes/{game_id}/restart"}, + {http.MethodPost, "/api/v1/internal/runtimes/{game_id}/patch"}, + {http.MethodDelete, "/api/v1/internal/runtimes/{game_id}/container"}, + } + for _, rop := range runtimeOps { + t.Run(rop.method+" "+rop.path, func(t *testing.T) { + t.Parallel() + op := getOperation(t, doc, rop.path, rop.method) + found := false + for _, ref := range op.Parameters { + if ref.Ref == "#/components/parameters/XGalaxyCallerHeader" { + found = true + break + } + } + require.Truef(t, found, + "%s %s must reference XGalaxyCallerHeader", rop.method, rop.path) + }) + } +} + +// TestInternalSpecFreezesRuntimesListShape verifies that the list endpoint +// returns the items envelope expected by callers. 
+func TestInternalSpecFreezesRuntimesListShape(t *testing.T) { + t.Parallel() + + doc := loadInternalOpenAPISpec(t) + schema := componentSchemaRef(t, doc, "RuntimesList") + + assertRequiredFields(t, schema, "items") + items := schema.Value.Properties["items"] + require.NotNil(t, items, "RuntimesList.items must be declared") + require.Equal(t, "#/components/schemas/RuntimeRecord", items.Value.Items.Ref, + "RuntimesList.items[] must reference RuntimeRecord") +} + +func loadInternalOpenAPISpec(t *testing.T) *openapi3.T { + t.Helper() + + _, thisFile, _, ok := runtime.Caller(0) + if !ok { + require.FailNow(t, "runtime.Caller failed") + } + + specPath := filepath.Join(filepath.Dir(thisFile), "api", "internal-openapi.yaml") + loader := openapi3.NewLoader() + doc, err := loader.LoadFromFile(specPath) + if err != nil { + require.Failf(t, "test failed", "load spec %s: %v", specPath, err) + } + if doc == nil { + require.Failf(t, "test failed", "load spec %s: returned nil document", specPath) + } + if err := doc.Validate(context.Background()); err != nil { + require.Failf(t, "test failed", "validate spec %s: %v", specPath, err) + } + + return doc +} + +func getOperation(t *testing.T, doc *openapi3.T, path, method string) *openapi3.Operation { + t.Helper() + + if doc.Paths == nil { + require.FailNow(t, "spec is missing paths") + } + pathItem := doc.Paths.Value(path) + if pathItem == nil { + require.Failf(t, "test failed", "spec is missing path %s", path) + } + op := pathItem.GetOperation(method) + if op == nil { + require.Failf(t, "test failed", "spec is missing %s operation for path %s", method, path) + } + + return op +} + +func responseSchemaRef(t *testing.T, op *openapi3.Operation, status int) *openapi3.SchemaRef { + t.Helper() + + ref := op.Responses.Status(status) + if ref == nil || ref.Value == nil { + require.Failf(t, "test failed", "operation is missing %d response", status) + } + mt := ref.Value.Content.Get("application/json") + if mt == nil || mt.Schema == nil { + require.Failf(t, "test failed", "operation is missing application/json schema for %d response", status) + } + + return mt.Schema +} + +func componentSchemaRef(t *testing.T, doc *openapi3.T, name string) *openapi3.SchemaRef { + t.Helper() + + if doc.Components.Schemas == nil { + require.FailNow(t, "spec is missing component schemas") + } + ref := doc.Components.Schemas[name] + if ref == nil { + require.Failf(t, "test failed", "spec is missing component schema %s", name) + } + + return ref +} + +func assertSchemaRef(t *testing.T, schemaRef *openapi3.SchemaRef, want, name string) { + t.Helper() + require.NotNil(t, schemaRef, "%s schema ref", name) + require.Equal(t, want, schemaRef.Ref, "%s schema ref", name) +} + +func assertRequiredFields(t *testing.T, schemaRef *openapi3.SchemaRef, fields ...string) { + t.Helper() + require.NotNil(t, schemaRef) + require.ElementsMatch(t, fields, schemaRef.Value.Required) +} + +func assertStringEnum(t *testing.T, schemaRef *openapi3.SchemaRef, property string, values ...string) { + t.Helper() + require.NotNil(t, schemaRef) + + propRef := schemaRef.Value.Properties[property] + require.NotNil(t, propRef, "schema property %s", property) + + got := make([]string, 0, len(propRef.Value.Enum)) + for _, v := range propRef.Value.Enum { + got = append(got, v.(string)) + } + + require.ElementsMatch(t, values, got) +} diff --git a/rtmanager/docs/README.md b/rtmanager/docs/README.md new file mode 100644 index 0000000..30f17a3 --- /dev/null +++ b/rtmanager/docs/README.md @@ -0,0 +1,44 @@ +# Runtime Manager — 
Service-Local Documentation + +This directory hosts the service-local documentation for `Runtime +Manager`. The top-level [`../README.md`](../README.md) describes the +current-state contract (purpose, scope, lifecycles, surfaces, +configuration, observability); the documents below complement it with +focused content docs and design-rationale records. + +## Content docs + +- [Runtime and components](runtime.md) — process diagram, listeners, + workers, lifecycle services, stream offsets, configuration groups, + runtime invariants. +- [Flows](flows.md) — mermaid sequence diagrams for the lifecycle and + observability flows. +- [Operator runbook](runbook.md) — startup, readiness, shutdown, and + recovery scenarios. +- [Configuration and contract examples](examples.md) — `.env`, + REST request bodies, stream payloads, storage inspection snippets. + +## Design rationale + +- [PostgreSQL schema decisions](postgres-migration.md) — the schema + decision record consolidating the persistence-layer agreements + (tables, indexes, CAS shape, `created_at` preservation, jsonb + round-trip, schema/role provisioning split). +- [Domain and ports](domain-and-ports.md) — string-typed enums, the + four allowed runtime transitions, why `Inspect` splits into + `InspectImage` / `InspectContainer`, why `LobbyGameRecord` is + minimal, and other domain-layer choices. +- [Adapters](adapters.md) — Docker SDK adapter, Lobby internal HTTP + client, the three Redis publishers, the `mockgen` convention for + wide ports, and the unit-test strategy for HTTP-backed adapters. +- [Lifecycle services](services.md) — per-game lease semantics, the + `Result`-shaped contract, failure-mode tables, the lease-bypass + `Run` method on inner services, the `X-Galaxy-Caller` header + convention, and the canonical error code → HTTP status mapping. +- [Background workers](workers.md) — single-ownership table per + `event_type`, `container_disappeared` suppression rules, probe + hysteresis, the events listener reconnect policy, the reconciler's + per-game lease and three drift kinds. +- [Service-local integration suite](integration-tests.md) — the + `integration` build tag, the in-process `app.NewRuntime` choice, + the Lobby HTTP stub, and the test isolation strategy. diff --git a/rtmanager/docs/adapters.md b/rtmanager/docs/adapters.md new file mode 100644 index 0000000..6b64a4d --- /dev/null +++ b/rtmanager/docs/adapters.md @@ -0,0 +1,192 @@ +# Adapters + +This document explains why the production adapters under +[`../internal/adapters/`](../internal/adapters) — Docker SDK, +Lobby internal HTTP client, notification-intent publisher, health-event +publisher, job-result publisher — are shaped the way they are. The +PostgreSQL stores and the Redis-coordination adapters live in +[`postgres-migration.md`](postgres-migration.md). + +## 1. `mockgen` is the repo-wide convention for wide ports + +The Docker port has nine methods plus eight value types in the +signatures, and most lifecycle services exercise nearly every method +pair (start, stop, restart, patch, cleanup, reconcile, events, probe). +A hand-rolled fake would either miss methods or balloon to a per-test +fixture. 
+ +`internal/adapters/docker/` therefore uses `go.uber.org/mock` mocks: + +- `//go:generate` directives live next to the interface declaration in + `internal/ports/dockerclient.go`; +- generated code is committed under `internal/adapters/docker/mocks/` + (matching the `internal/adapters/postgres/jet/` discipline); +- `make -C rtmanager mocks` is the single command operators run after + a port-signature change. + +The maintained `go.uber.org/mock` fork is preferred over the archived +`github.com/golang/mock`. This convention applies to wide / recorder +ports across the repository — Lobby uses the same pipeline for its +narrow recorder ports (`RuntimeManager`, `IntentPublisher`, +`GMClient`, `UserService`); see +[`../../ARCHITECTURE.md`](../../ARCHITECTURE.md) for the cross-service +rule. + +The other two RTM ports (`LobbyInternalClient`, +`NotificationIntentPublisher`) keep inline `_test.go` fakes: small +surfaces, easy to fake by hand inside a single test file when needed. + +## 2. `EngineEndpoint` is built inside the Docker adapter + +The engine port is fixed at `8080`. Pushing it into `RunSpec` would +force the start service to know an engine implementation detail; +pushing it into config would give operators a knob that the engine +image already does not honour. The Docker adapter exposes +`EnginePort = 8080` as a package constant and constructs +`RunResult.EngineEndpoint = "http://" + spec.Hostname + ":8080"` +itself. + +The adapter also leaves `container.Config.ExposedPorts` empty: RTM +never publishes ports to the host. The user-defined Docker bridge +network gives every container in the network DNS access to the engine +via `galaxy-game-{game_id}:8080`. + +## 3. `Run` removes the container on `ContainerStart` failure + +`README.md §Lifecycles → Start` requires no orphan to remain after a +failed start path. If `ContainerCreate` succeeds but `ContainerStart` +fails, the adapter calls `ContainerRemove(force=true)` inside a fresh +`context.Background()` (with a 10s timeout) so the cleanup runs even +when the original ctx is already cancelled. The cleanup is best-effort: +a remove failure is silently discarded because the original start +failure is the actionable error returned to the caller. + +The alternative — leaving rollback to the start service — would either +duplicate the same code in every caller or invite a service that forgets +to do it. Centralising the rule in the adapter keeps the port contract +simple. The start service adds an additional rollback layer for the +post-`Run` `Upsert` failure path; see [`services.md`](services.md) §5. + +## 4. `RunSpec.Cmd` is optional + +`ports.RunSpec` exposes an optional `Cmd []string`. Production callers +leave it `nil` so the engine image's own `CMD` runs; +`internal/adapters/docker/smoke_test.go` uses it to drive +`["/bin/sh","-c","sleep 60"]` against `alpine:3.21`. + +The alternative — building a dedicated test image with a pre-baked +`sleep` command — would require an extra `Dockerfile` under testdata +and a build step inside the smoke test. The single new field is +documented as optional and ignored when empty; production behaviour is +unchanged. + +## 5. `EventsListen` filters at the adapter boundary + +The Docker `/events` API accepts a `filters` query parameter, but the +daemon treats it as a hint, not a guarantee. The adapter therefore +double-checks at the boundary: only `Type == events.ContainerEventType` +messages are passed through to the typed `<-chan ports.DockerEvent`. 
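A minimal sketch of that boundary check, assuming a simplified stand-in for `ports.DockerEvent` (the real port type may carry more fields); only the filter and the attribute copy are the point here:

```go
package docker

import "github.com/docker/docker/api/types/events"

// DockerEvent is an illustrative stand-in for ports.DockerEvent.
type DockerEvent struct {
	ContainerID string
	Action      string
	Labels      map[string]string
}

// forwardContainerEvents drops every non-container message from the raw
// /events feed and copies the actor attributes into Labels.
func forwardContainerEvents(raw <-chan events.Message, out chan<- DockerEvent) {
	for msg := range raw {
		if msg.Type != events.ContainerEventType {
			continue // non-container event: dropped at the adapter boundary
		}
		out <- DockerEvent{
			ContainerID: msg.Actor.ID,
			Action:      string(msg.Action),
			Labels:      msg.Actor.Attributes, // full attribute map, com.galaxy.* labels included
		}
	}
}
```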
+Doing the filter at the SDK level would still require a defensive +recheck on the consumer side; consolidating the check in the adapter +keeps the contract crisp and the consumer free of Docker-internal type +discriminants. + +The decoded event copies the actor's full `Attributes` map into +`DockerEvent.Labels`. Docker mixes container labels and runtime +attributes (`exitCode`, `image`, `name`, etc.) flat in the same map; +RTM consumers filter by the `com.galaxy.` prefix when they care about +labels, and the adapter extracts `exitCode` separately for `die` +events. + +## 6. Lobby HTTP client error mapping + +`ports.LobbyInternalClient.GetGame` fixes: + +- `200` → `LobbyGameRecord` decoded tolerantly (unknown fields + ignored); +- `404` → `ports.ErrLobbyGameNotFound`; +- transport, timeout, or any other non-2xx → `ports.ErrLobbyUnavailable` + wrapped with the original error so callers can `errors.Is` and still + log the cause. + +The start service treats `ErrLobbyUnavailable` as recoverable: it +continues without the diagnostic data because the start envelope +already carries the only required field (`image_ref`). The client +mirrors `notification/internal/adapters/userservice/client.go`: cloned +`*http.Transport`, `otelhttp.NewTransport` wrap, per-request +`context.WithTimeout`, idempotent `Close()` releasing idle connections. + +JSON decoding is tolerant: unknown fields in the success body do not +break the call, so additive changes to Lobby's `GameRecord` schema do +not require an RTM release. + +## 7. Notification publisher wrapper signature + +The wrapper drops the entry id returned by +`notificationintent.Publisher.Publish` (rationale in +[`domain-and-ports.md`](domain-and-ports.md) §7). The adapter is a +thin shim: + +- `NewPublisher(cfg)` constructs the inner publisher and forwards + validation; +- `Publish(ctx, intent)` calls the inner publisher and discards the + entry id. + +The compile-time assertion `var _ ports.NotificationIntentPublisher = +(*Publisher)(nil)` lives in `publisher.go`. + +## 8. Health-events publisher: snapshot upsert before stream XADD + +Every emission goes through +`ports.HealthEventPublisher.Publish`, which both XADDs to +`runtime:health_events` and upserts `health_snapshots`. The snapshot +upsert runs **before** the XADD: a successful Publish always leaves +the snapshot store at least as fresh as the stream, and a partial +failure leaves the snapshot a best-effort lower bound. Reversing the +order would let consumers observe a stream entry whose +`health_snapshots` row reflects the prior observation — a misleading +inversion. + +The `event_type → SnapshotStatus / SnapshotSource` mapping mirrors the +table in [`../README.md` §Health Monitoring](../README.md). In +particular, `container_started` collapses to `SnapshotStatusHealthy` +and `probe_recovered` does the same (rationale in +[`domain-and-ports.md`](domain-and-ports.md) §4). + +## 9. Unit-test strategy + +Both HTTP-backed adapters (Docker SDK, Lobby client) use +`httptest.Server` fixtures. The Docker SDK speaks HTTP under the hood +for both unix sockets and TCP, so adapter unit tests construct a +Docker client with `client.WithHost(server.URL)` and +`client.WithHTTPClient(server.Client())`, which lets table-driven +handlers fake every Docker API endpoint without touching the real +daemon. The Docker API version is pinned to `1.45` +(`client.WithVersion("1.45")`) so the URL prefix is stable across CI +machines whose daemon advertises a different default. 
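A minimal sketch of that test wiring, assuming the Docker SDK `client` package (the handler path and container id are illustrative, not taken from the repository):

```go
package docker_test

import (
	"context"
	"net/http"
	"net/http/httptest"
	"testing"

	"github.com/docker/docker/client"
)

func TestInspectAgainstFakeDaemon(t *testing.T) {
	// Table-driven handlers would fake more endpoints; one is enough here.
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/v1.45/containers/abc123/json" {
			w.Header().Set("Content-Type", "application/json")
			_, _ = w.Write([]byte(`{"Id":"abc123","State":{"Status":"running"}}`))
			return
		}
		http.NotFound(w, r)
	}))
	defer srv.Close()

	cli, err := client.NewClientWithOpts(
		client.WithHost(srv.URL),
		client.WithHTTPClient(srv.Client()),
		client.WithVersion("1.45"), // keeps the /v1.45/... URL prefix stable
	)
	if err != nil {
		t.Fatalf("new docker client: %v", err)
	}

	inspect, err := cli.ContainerInspect(context.Background(), "abc123")
	if err != nil {
		t.Fatalf("container inspect: %v", err)
	}
	if inspect.State.Status != "running" {
		t.Fatalf("unexpected status %q", inspect.State.Status)
	}
}
```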
Production +wiring (in `internal/app/bootstrap.go`) keeps API negotiation enabled. + +The notification publisher uses `miniredis` directly because the +adapter's only side effect is an `XADD`, which `miniredis` reproduces +faithfully and matches every other Galaxy intent test. + +## 10. Docker smoke test + +`internal/adapters/docker/smoke_test.go` runs on the default +`go test ./...` invocation and calls `t.Skip` unless the local daemon +is reachable (`/var/run/docker.sock` exists or `DOCKER_HOST` is set). +The covered sequence: + +1. provision a temporary user-defined bridge network; +2. assert `EnsureNetwork` for present and missing names; +3. pull `alpine:3.21` (`PullPolicyIfMissing`); +4. subscribe to events; +5. run a sleep container with the full `RunSpec` field set; +6. observe a `start` event for the new container id; +7. inspect, stop, remove, and verify `ErrContainerNotFound` is + reported afterwards. + +This is the production adapter's only end-to-end check that runs from +the default `go test` pass; the broader service-local integration +suite ([`integration-tests.md`](integration-tests.md)) is gated +behind `-tags=integration`. diff --git a/rtmanager/docs/domain-and-ports.md b/rtmanager/docs/domain-and-ports.md new file mode 100644 index 0000000..8283e94 --- /dev/null +++ b/rtmanager/docs/domain-and-ports.md @@ -0,0 +1,167 @@ +# Domain and Ports + +This document explains why the `rtmanager` domain layer +([`../internal/domain/`](../internal/domain)) and the port interfaces +([`../internal/ports/`](../internal/ports)) are shaped the way they are. +The current-state types and method signatures are the source of truth in +the code; this file records the rationale so future readers do not +re-litigate the same trade-offs. + +For the surrounding behaviour see +[`../README.md`](../README.md), the SQL CHECK constraints in +[`../internal/adapters/postgres/migrations/00001_init.sql`](../internal/adapters/postgres/migrations/00001_init.sql), +the wire contracts under [`../api/`](../api), and +[`postgres-migration.md`](postgres-migration.md) for the persistence +layer. + +## 1. String-typed status enums + +`runtime.Status`, `operation.OpKind`, `operation.OpSource`, +`operation.Outcome`, `health.EventType`, `health.SnapshotStatus`, and +`health.SnapshotSource` are all `type X string`. + +The string approach wins on three counts: + +- the SQL CHECK constraints already store the values as `text`, so a + string domain type maps one-to-one with no codec layer; +- it matches Lobby (`game.Status`, `membership.Status`, + `application.Status`), so reviewers do not switch encoding mental + models when crossing service boundaries; +- `IsKnown` keeps the invariant cheap (a single switch); a `type X uint8` + with stringer-generated names would pay a constant lookup and make raw + SQL columns harder to read in diagnostics. + +## 2. Plain `string` for `CurrentContainerID` and `CurrentImageRef` + +The PostgreSQL columns are nullable. The domain model uses plain +`string` with empty == NULL and bridges the SQL nullability inside the +adapter. Pointer fields would force every consumer to dereference +defensively even though business logic rarely cares about the +NULL/empty distinction (removed records may legitimately carry either +form depending on whether the record passed through `stopped` first). + +The adapter's job is to translate `sql.NullString` ⇄ `string`; the rest +of the codebase reads the field as a regular value. + +## 3. 
`*time.Time` for nullable timestamps + +`StartedAt`, `StoppedAt`, `RemovedAt` retain pointer types. `time.Time{}` +is a real, comparable value in Go (`IsZero` only reports the canonical +zero time); mixing "missing" and "set to UTC zero" through plain +`time.Time` would invite bugs. The jet-generated `model.RuntimeRecords` +already declares the same fields as `*time.Time`, so the domain type +aligns with the persistence type and the adapter does not re-shape +pointers. + +## 4. `EventType` and `SnapshotStatus` are deliberately distinct + +`runtime-health-asyncapi.yaml.EventType` enumerates seven values; the +SQL CHECK on `health_snapshots.status` enumerates six. The two sets +overlap but are not identical: + +- `container_started` is an *event*; the snapshot collapses it to + `healthy` (a successful start is observed as the container being + live, not as an ongoing event); +- `probe_recovered` is an *event*; it does not become a snapshot row of + its own — the next inspect/probe overwrites the prior `probe_failed` + with `healthy`. + +Modelling them as one shared enum would require a separate "event vs +snapshot" boolean and invite accidental mismatches. Two distinct types +with explicit `IsKnown` matrices keep each surface honest at compile +time. + +## 5. `Inspect` split into `InspectImage` + `InspectContainer` + +Two narrow methods replace a single polymorphic `Inspect`. The surface +RTM exercises has two shapes: + +- the start service inspects the *image* by reference to read resource + limits from labels; +- the periodic inspect worker, the reconciler, and the events listener + inspect *containers* by id to read state, health, restart count, and + exit code. + +The inputs differ (ref vs id), and the result types differ +(`ImageInspect.Labels` is the only field used at start time, while +`ContainerInspect` carries a dozen state fields). One polymorphic +method would either split internally on input type or return a tagged +union; either is messier than two narrow methods. + +## 6. `LobbyGameRecord` is intentionally minimal + +`LobbyInternalClient.GetGame` returns `GameID`, `Status`, and +`TargetEngineVersion`. The fetch is classified as ancillary diagnostics +because the start envelope already carries the only required field +(`image_ref`). + +Anything more would invite RTM consumers to depend on Lobby's schema in +ways that violate the "RTM never resolves engine versions" rule. +Future fields are additive: each new field is opt-in to the consumer +and does not break existing call sites. The minimalism is also a hedge +against schema drift — Lobby's `GameRecord` is large and changes more +often than RTM needs to track. + +## 7. `NotificationIntentPublisher.Publish` returns `error`, not `(string, error)` + +Lobby's `IntentPublisher.Publish` returns the Redis Stream entry id so +business workflows that key on it (idempotency keys, audit +correlation) can capture it. RTM publishes admin-only failure intents +where the entry id has no consumer — failing starts do not loop back +to RTM, and notification routing keys on the producer-supplied +`idempotency_key` rather than the stream id. The adapter wraps +`pkg/notificationintent.Publisher` and discards the entry id at the +wrapper boundary. + +## 8. 
Exactly four allowed runtime transitions + +`runtime.AllowedTransitions` covers: + +- `running → stopped` — graceful stop, observed exit, reconcile + observed exited; +- `running → removed` — `reconcile_dispose` when the container + vanished; +- `stopped → running` — restart and patch inner start; +- `stopped → removed` — cleanup TTL or admin DELETE. + +Other pairs are intentionally rejected: + +- `running → running` and `stopped → stopped` would mean Upsert + overwrote state without a CAS guard. Idempotent re-start / re-stop + never transitions; the service layer returns `replay_no_op` and the + record is left untouched. +- `removed → *` is forbidden because `removed` is terminal. The + reconciler creates fresh records with `reconcile_adopt` rather than + resurrecting old ones. + +Encoding the table this way means a future bug where a service tries +to revive a removed record is rejected at the domain layer rather than +the adapter, which keeps the failure mode close to the offending code. + +## 9. `PullPolicy` re-declared inside `ports/dockerclient.go` + +The same enum exists as `config.ImagePullPolicy`. Importing +`internal/config` from the ports package would couple two unrelated +layers and create a cyclic risk once the wiring layer pulls both in. +The runtime/wiring layer (in `internal/app`) is the single point that +translates between the two type aliases — both are `string`-typed, the +value sets are identical, and the validation lives on each side +independently. + +## 10. Compile-time interface assertions live with adapters + +Every interface has a `var _ ports.X = (*Y)(nil)` assertion, but the +assertion lives in the adapter package (e.g. +`var _ ports.RuntimeRecordStore = (*Store)(nil)` inside +`internal/adapters/postgres/runtimerecordstore`). Putting the +assertions in the port package would force the port package to import +its own implementations and create an obvious import cycle. + +## 11. `RunSpec.Validate` lives on the request type + +The Docker port carries a non-trivial request type (`RunSpec`) with +eight required fields and per-mount invariants. Putting `Validate` on +the request struct keeps the rule next to the type definition, mirrors +the pattern used by `lobby/internal/ports/gmclient.go` +(`RegisterGameRequest.Validate`), and lets the adapter call it as the +first defensive check before invoking the Docker SDK. diff --git a/rtmanager/docs/examples.md b/rtmanager/docs/examples.md new file mode 100644 index 0000000..da2147f --- /dev/null +++ b/rtmanager/docs/examples.md @@ -0,0 +1,429 @@ +# Configuration And Contract Examples + +The examples below are illustrative. Replace `localhost`, port +numbers, IDs, and timestamps with values that match the deployment +under inspection. + +## Example `.env` + +A minimum-viable `RTMANAGER_*` set for a local run against a single +Redis container plus a PostgreSQL container with the `rtmanager` +schema and the `rtmanagerservice` role provisioned. The full list +with defaults lives in [`../README.md` §Configuration](../README.md). 
+ +```bash +# Required +RTMANAGER_INTERNAL_HTTP_ADDR=:8096 +RTMANAGER_POSTGRES_PRIMARY_DSN=postgres://rtmanagerservice:rtmanagerservice@127.0.0.1:5432/galaxy?search_path=rtmanager&sslmode=disable +RTMANAGER_REDIS_MASTER_ADDR=127.0.0.1:6379 +RTMANAGER_REDIS_PASSWORD=local +RTMANAGER_DOCKER_HOST=unix:///var/run/docker.sock +RTMANAGER_DOCKER_NETWORK=galaxy-net +RTMANAGER_GAME_STATE_ROOT=/var/lib/galaxy/games + +# Lobby internal client (diagnostic GET only in v1) +RTMANAGER_LOBBY_INTERNAL_BASE_URL=http://127.0.0.1:8095 +RTMANAGER_LOBBY_INTERNAL_TIMEOUT=2s + +# Container defaults (image labels override these per container) +RTMANAGER_DEFAULT_CPU_QUOTA=1.0 +RTMANAGER_DEFAULT_MEMORY=512m +RTMANAGER_DEFAULT_PIDS_LIMIT=512 +RTMANAGER_CONTAINER_STOP_TIMEOUT_SECONDS=30 +RTMANAGER_CONTAINER_RETENTION_DAYS=30 +RTMANAGER_ENGINE_STATE_MOUNT_PATH=/var/lib/galaxy-game +RTMANAGER_ENGINE_STATE_ENV_NAME=GAME_STATE_PATH +RTMANAGER_GAME_STATE_DIR_MODE=0750 +RTMANAGER_GAME_STATE_OWNER_UID=0 +RTMANAGER_GAME_STATE_OWNER_GID=0 + +# Workers +RTMANAGER_INSPECT_INTERVAL=30s +RTMANAGER_PROBE_INTERVAL=15s +RTMANAGER_PROBE_TIMEOUT=2s +RTMANAGER_PROBE_FAILURES_THRESHOLD=3 +RTMANAGER_RECONCILE_INTERVAL=5m +RTMANAGER_CLEANUP_INTERVAL=1h + +# Coordination +RTMANAGER_GAME_LEASE_TTL_SECONDS=60 + +# Process and logging +RTMANAGER_LOG_LEVEL=info +RTMANAGER_SHUTDOWN_TIMEOUT=30s + +# Telemetry (disabled for local dev — enable to ship traces / metrics) +OTEL_SERVICE_NAME=galaxy-rtmanager +OTEL_TRACES_EXPORTER=none +OTEL_METRICS_EXPORTER=none +``` + +For a production-shaped deployment, set +`RTMANAGER_IMAGE_PULL_POLICY=always` (forces a pull on every start so +a tag mutation is immediately visible to the next runtime), +`RTMANAGER_GAME_STATE_OWNER_UID` / `_GID` to match the engine +container's user, and configure `OTEL_*` against the cluster's OTLP +collector. The `RTMANAGER_DOCKER_LOG_DRIVER` / +`RTMANAGER_DOCKER_LOG_OPTS` pair routes engine stdout/stderr to the +sink the operator runs (fluentd, journald, etc.). + +For tests, point `RTMANAGER_POSTGRES_PRIMARY_DSN` and +`RTMANAGER_REDIS_MASTER_ADDR` at the testcontainers fixtures the +service-local harness brings up +([`integration-tests.md` §7](integration-tests.md)). + +## Internal HTTP Examples + +Every endpoint admits the optional `X-Galaxy-Caller` header which the +handler records as `op_source` in `operation_log` (`gm` → `gm_rest`, +`admin` → `admin_rest`; missing or unknown values default to +`admin_rest` in v1). Decision: [`services.md` §18](services.md). + +### Probe a runtime record + +```bash +curl -s -H 'X-Galaxy-Caller: gm' \ + http://localhost:8096/api/v1/internal/runtimes/game-01HZ... +``` + +Response (`200 OK`): + +```json +{ + "game_id": "game-01HZ...", + "status": "running", + "current_container_id": "1f2a...", + "current_image_ref": "galaxy/game:1.4.0", + "engine_endpoint": "http://galaxy-game-game-01HZ...:8080", + "state_path": "/var/lib/galaxy/games/game-01HZ...", + "docker_network": "galaxy-net", + "started_at": "2026-04-28T07:18:54Z", + "stopped_at": null, + "removed_at": null, + "last_op_at": "2026-04-28T07:18:54Z", + "created_at": "2026-04-28T07:18:54Z" +} +``` + +### List all runtimes + +```bash +curl -s -H 'X-Galaxy-Caller: admin' \ + http://localhost:8096/api/v1/internal/runtimes +``` + +The response shape is `{"items":[...]}`. 
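For consumers written in Go, the same call and envelope look like the sketch below (the struct and error handling are illustrative; the endpoint, header, and field names come from the spec):

```go
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
)

// runtimesList declares only the fields this sketch prints; the full
// RuntimeRecord shape is frozen by internal-openapi.yaml.
type runtimesList struct {
	Items []struct {
		GameID string `json:"game_id"`
		Status string `json:"status"`
	} `json:"items"`
}

func main() {
	req, _ := http.NewRequest(http.MethodGet,
		"http://localhost:8096/api/v1/internal/runtimes", nil)
	req.Header.Set("X-Galaxy-Caller", "admin")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var list runtimesList
	if err := json.NewDecoder(resp.Body).Decode(&list); err != nil {
		panic(err)
	}
	for _, item := range list.Items {
		fmt.Printf("%s\t%s\n", item.GameID, item.Status)
	}
}
```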
+ +### Start a runtime + +```bash +curl -s -X POST \ + -H 'Content-Type: application/json' \ + -H 'X-Galaxy-Caller: gm' \ + http://localhost:8096/api/v1/internal/runtimes/game-01HZ.../start \ + -d '{"image_ref": "galaxy/game:1.4.0"}' +``` + +A `200` returns the `RuntimeRecord` for the running runtime. Failure +shapes use the canonical envelope; e.g. an invalid image_ref: + +```json +{ + "error": { + "code": "start_config_invalid", + "message": "image_ref shape rejected by docker reference parser" + } +} +``` + +### Stop a runtime + +```bash +curl -s -X POST \ + -H 'Content-Type: application/json' \ + -H 'X-Galaxy-Caller: admin' \ + http://localhost:8096/api/v1/internal/runtimes/game-01HZ.../stop \ + -d '{"reason": "admin_request"}' +``` + +Valid `reason` values: +`orphan_cleanup | cancelled | finished | admin_request | timeout`. + +### Restart a runtime + +```bash +curl -s -X POST \ + -H 'X-Galaxy-Caller: admin' \ + http://localhost:8096/api/v1/internal/runtimes/game-01HZ.../restart +``` + +The body is empty; restart re-uses the current `image_ref`. + +### Patch a runtime + +```bash +curl -s -X POST \ + -H 'Content-Type: application/json' \ + -H 'X-Galaxy-Caller: admin' \ + http://localhost:8096/api/v1/internal/runtimes/game-01HZ.../patch \ + -d '{"image_ref": "galaxy/game:1.4.2"}' +``` + +Patch enforces the semver-only rule: a non-semver tag returns +`image_ref_not_semver`; a cross-major or cross-minor change returns +`semver_patch_only`. + +### Cleanup a stopped runtime container + +```bash +curl -s -X DELETE \ + -H 'X-Galaxy-Caller: admin' \ + http://localhost:8096/api/v1/internal/runtimes/game-01HZ.../container +``` + +Cleanup refuses a `running` runtime with `409 conflict`; stop first. + +## Stream Payload Examples + +Every stream key shape is configurable via `RTMANAGER_REDIS_*_STREAM`; +the defaults are used below. Field types and required/optional +semantics are frozen by +[`../api/runtime-jobs-asyncapi.yaml`](../api/runtime-jobs-asyncapi.yaml) +and +[`../api/runtime-health-asyncapi.yaml`](../api/runtime-health-asyncapi.yaml). + +### `runtime:start_jobs` (Lobby → RTM) + +```bash +redis-cli XADD runtime:start_jobs '*' \ + game_id 'game-01HZ...' \ + image_ref 'galaxy/game:1.4.0' \ + requested_at_ms 1714081234567 +``` + +### `runtime:stop_jobs` (Lobby → RTM) + +```bash +redis-cli XADD runtime:stop_jobs '*' \ + game_id 'game-01HZ...' \ + reason 'cancelled' \ + requested_at_ms 1714081234567 +``` + +### `runtime:job_results` (RTM → Lobby) + +Success envelope: + +```bash +redis-cli XADD runtime:job_results '*' \ + game_id 'game-01HZ...' \ + outcome 'success' \ + container_id '1f2a...' \ + engine_endpoint 'http://galaxy-game-game-01HZ...:8080' \ + error_code '' \ + error_message '' +``` + +Failure envelope: + +```bash +redis-cli XADD runtime:job_results '*' \ + game_id 'game-01HZ...' \ + outcome 'failure' \ + container_id '' \ + engine_endpoint '' \ + error_code 'image_pull_failed' \ + error_message 'pull failed: manifest unknown' +``` + +Idempotent replay envelope (success outcome with explicit +`replay_no_op`): + +```bash +redis-cli XADD runtime:job_results '*' \ + game_id 'game-01HZ...' \ + outcome 'success' \ + container_id '1f2a...' \ + engine_endpoint 'http://galaxy-game-game-01HZ...:8080' \ + error_code 'replay_no_op' \ + error_message '' +``` + +The contract permits empty `container_id` and `engine_endpoint` +strings on every value of `outcome` so the consumer can decode the +envelope uniformly ([`workers.md` §11](workers.md)). 
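Because the field set is fixed and every value is a string, a consumer can decode the envelope in one pass; a hedged Go sketch (the struct and helper are illustrative, not Lobby's actual code):

```go
// JobResult mirrors the runtime:job_results field set. Empty container_id
// and engine_endpoint are legal for any outcome.
type JobResult struct {
	GameID         string
	Outcome        string // "success" | "failure"
	ContainerID    string
	EngineEndpoint string
	ErrorCode      string // "" on plain success, "replay_no_op" on idempotent replay
	ErrorMessage   string
}

// decodeJobResult maps a Redis stream entry (e.g. go-redis XMessage.Values)
// into the envelope without branching on outcome.
func decodeJobResult(values map[string]any) JobResult {
	str := func(key string) string {
		s, _ := values[key].(string)
		return s
	}
	return JobResult{
		GameID:         str("game_id"),
		Outcome:        str("outcome"),
		ContainerID:    str("container_id"),
		EngineEndpoint: str("engine_endpoint"),
		ErrorCode:      str("error_code"),
		ErrorMessage:   str("error_message"),
	}
}
```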
+ +### `runtime:health_events` (RTM out) + +The wire shape is the same for every event type — only the +`details` payload differs. + +`container_started`: + +```bash +redis-cli XADD runtime:health_events '*' \ + game_id 'game-01HZ...' \ + container_id '1f2a...' \ + event_type 'container_started' \ + occurred_at_ms 1714081234567 \ + details '{"image_ref":"galaxy/game:1.4.0"}' +``` + +`container_exited`: + +```bash +redis-cli XADD runtime:health_events '*' \ + game_id 'game-01HZ...' \ + container_id '1f2a...' \ + event_type 'container_exited' \ + occurred_at_ms 1714081234567 \ + details '{"exit_code":137,"oom":false}' +``` + +`container_oom`: + +```bash +redis-cli XADD runtime:health_events '*' \ + game_id 'game-01HZ...' \ + container_id '1f2a...' \ + event_type 'container_oom' \ + occurred_at_ms 1714081234567 \ + details '{"exit_code":137}' +``` + +`container_disappeared`: + +```bash +redis-cli XADD runtime:health_events '*' \ + game_id 'game-01HZ...' \ + container_id '1f2a...' \ + event_type 'container_disappeared' \ + occurred_at_ms 1714081234567 \ + details '{}' +``` + +`inspect_unhealthy`: + +```bash +redis-cli XADD runtime:health_events '*' \ + game_id 'game-01HZ...' \ + container_id '1f2a...' \ + event_type 'inspect_unhealthy' \ + occurred_at_ms 1714081234567 \ + details '{"restart_count":3,"state":"running","health":"unhealthy"}' +``` + +`probe_failed` (after the threshold is crossed): + +```bash +redis-cli XADD runtime:health_events '*' \ + game_id 'game-01HZ...' \ + container_id '1f2a...' \ + event_type 'probe_failed' \ + occurred_at_ms 1714081234567 \ + details '{"consecutive_failures":3,"last_status":0,"last_error":"context deadline exceeded"}' +``` + +`probe_recovered`: + +```bash +redis-cli XADD runtime:health_events '*' \ + game_id 'game-01HZ...' \ + container_id '1f2a...' \ + event_type 'probe_recovered' \ + occurred_at_ms 1714081234567 \ + details '{"prior_failure_count":3}' +``` + +### `notification:intents` (RTM admin notifications) + +RTM publishes admin-only notification intents only for the three +first-touch start failures. Every payload shares the frozen field +set `{game_id, image_ref, error_code, error_message, +attempted_at_ms}` +([`../README.md` §Notification Contracts](../README.md#notification-contracts)). + +`runtime.image_pull_failed`: + +```bash +redis-cli XADD notification:intents '*' \ + envelope '{ + "type": "runtime.image_pull_failed", + "producer": "rtmanager", + "idempotency_key": "runtime.image_pull_failed:game-01HZ...:1714081234567", + "audience": {"kind": "admin_email", "email_address_kind": "runtime_image_pull_failed"}, + "payload": { + "game_id": "game-01HZ...", + "image_ref": "galaxy/game:1.4.0", + "error_code": "image_pull_failed", + "error_message": "pull failed: manifest unknown", + "attempted_at_ms": 1714081234567 + } + }' +``` + +`runtime.container_start_failed` and `runtime.start_config_invalid` +share the same envelope with their respective `type` and +`error_code` values. + +## Storage Inspection + +### Inspect a runtime record (PostgreSQL) + +```bash +psql "$RTMANAGER_POSTGRES_PRIMARY_DSN" -c \ + "SELECT * FROM rtmanager.runtime_records WHERE game_id = 'game-01HZ...'" +``` + +Columns mirror the fields documented in +[`../README.md` §Persistence Layout](../README.md#persistence-layout). 
+ +### Inspect runtime status counts + +```bash +psql "$RTMANAGER_POSTGRES_PRIMARY_DSN" -c \ + "SELECT status, COUNT(*) FROM rtmanager.runtime_records GROUP BY status" +``` + +### Inspect the operation log for a game + +```bash +psql "$RTMANAGER_POSTGRES_PRIMARY_DSN" -c \ + "SELECT id, op_kind, op_source, outcome, error_code, + started_at, finished_at + FROM rtmanager.operation_log + WHERE game_id = 'game-01HZ...' + ORDER BY started_at DESC, id DESC + LIMIT 50" +``` + +### Inspect the latest health snapshot + +```bash +psql "$RTMANAGER_POSTGRES_PRIMARY_DSN" -c \ + "SELECT game_id, container_id, status, source, observed_at, details + FROM rtmanager.health_snapshots + WHERE game_id = 'game-01HZ...'" +``` + +### Inspect Redis runtime-coordination keys + +```bash +# Stream offsets +redis-cli GET rtmanager:stream_offsets:startjobs +redis-cli GET rtmanager:stream_offsets:stopjobs + +# Per-game lease (only present while an operation is in flight) +redis-cli GET rtmanager:game_lease:game-01HZ... +redis-cli TTL rtmanager:game_lease:game-01HZ... + +# Recent stream entries +redis-cli XRANGE runtime:start_jobs - + COUNT 20 +redis-cli XRANGE runtime:job_results - + COUNT 20 +redis-cli XRANGE runtime:health_events - + COUNT 50 + +# Stream metadata +redis-cli XINFO STREAM runtime:start_jobs +redis-cli XINFO STREAM runtime:stop_jobs +redis-cli XINFO STREAM runtime:health_events +``` diff --git a/rtmanager/docs/flows.md b/rtmanager/docs/flows.md new file mode 100644 index 0000000..8a1939f --- /dev/null +++ b/rtmanager/docs/flows.md @@ -0,0 +1,305 @@ +# Flows + +This document collects the lifecycle and observability flows that +span Runtime Manager and its synchronous and asynchronous neighbours. +Narrative descriptions of the rules these flows enforce live in +[`../README.md`](../README.md); the diagrams here focus on the message +order across the boundary. Design-rationale records linked from each +section explain the *why*. + +## Start (happy path) + +```mermaid +sequenceDiagram + participant Lobby as Lobby publisher + participant Stream as runtime:start_jobs + participant Consumer as startjobsconsumer + participant Service as startruntime + participant Lease as Redis lease + participant Docker + participant PG as Postgres + participant Health as runtime:health_events + participant Results as runtime:job_results + + Lobby->>Stream: XADD {game_id, image_ref, requested_at_ms} + Consumer->>Stream: XREAD + Consumer->>Service: Handle(game_id, image_ref, OpSourceLobbyStream, entry_id) + Service->>Lease: SET NX PX rtmanager:game_lease:{game_id} + Service->>PG: SELECT runtime_records WHERE game_id + Service->>Docker: PullImage(image_ref) per pull policy + Service->>Docker: InspectImage → resource limits + Service->>Service: prepareStateDir(/{game_id}) + Service->>Docker: ContainerCreate + ContainerStart + Service->>PG: Upsert runtime_records (status=running) + Service->>PG: INSERT operation_log (op_kind=start, outcome=success) + Service->>Health: XADD container_started + Service-->>Consumer: Result{Outcome=success, ContainerID, EngineEndpoint} + Consumer->>Results: XADD {outcome=success, container_id, engine_endpoint} + Service->>Lease: DEL rtmanager:game_lease:{game_id} +``` + +REST callers (Game Master, Admin Service) drive the same service +through `POST /api/v1/internal/runtimes/{game_id}/start`; the +diagram's last two arrows collapse to an HTTP `200` response carrying +the runtime record. Sources: +[`../README.md` §Lifecycles → Start](../README.md#start), +[`services.md` §3](services.md). 
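+
+The `SET NX PX` / `DEL` pair in the diagram is the per-game lease every
+flow below re-uses. A minimal sketch of that pattern with go-redis, using
+the documented `rtmanager:game_lease:{game_id}` key shape; the function
+names and TTL handling are illustrative, not the gamelease adapter itself.
+
+```go
+import (
+	"context"
+	"time"
+
+	"github.com/redis/go-redis/v9"
+)
+
+// acquireGameLease takes the per-game lease with SET NX PX semantics.
+// It returns false when another operation already holds the lease.
+func acquireGameLease(ctx context.Context, rdb *redis.Client, gameID string, ttl time.Duration) (bool, error) {
+	return rdb.SetNX(ctx, "rtmanager:game_lease:"+gameID, "held", ttl).Result()
+}
+
+// releaseGameLease drops the lease once the operation finishes.
+func releaseGameLease(ctx context.Context, rdb *redis.Client, gameID string) error {
+	return rdb.Del(ctx, "rtmanager:game_lease:"+gameID).Err()
+}
+```
+
+A production lease would normally store a per-operation token and release it
+through a compare-and-delete script so a late `DEL` cannot drop a newer
+holder's lease; the sketch omits that detail.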
+ +## Start failure (image pull) + +```mermaid +sequenceDiagram + participant Service as startruntime + participant Docker + participant PG as Postgres + participant Intents as notification:intents + participant Results as runtime:job_results + + Service->>Docker: PullImage(image_ref) + Docker-->>Service: error + Service->>PG: INSERT operation_log (op_kind=start, outcome=failure, error_code=image_pull_failed) + Service->>Intents: XADD runtime.image_pull_failed {game_id, image_ref, error_code, error_message, attempted_at_ms} + Service-->>Service: Result{Outcome=failure, ErrorCode=image_pull_failed} + Service->>Results: XADD {outcome=failure, error_code=image_pull_failed} +``` + +The same shape applies to the configuration-validation failures +(`start_config_invalid` from `EnsureNetwork(ErrNetworkMissing)`, +`prepareStateDir`, or invalid `image_ref` shape) and the Docker +create/start failure (`container_start_failed`); only the error code +and the matching `runtime.*` notification type differ. Three failure +codes do **not** raise an admin notification: `conflict`, +`service_unavailable`, `internal_error` +([`services.md` §4](services.md)). + +## Start failure (orphan / Upsert-after-Run rollback) + +```mermaid +sequenceDiagram + participant Service as startruntime + participant Docker + participant PG as Postgres + participant Intents as notification:intents + + Service->>Docker: ContainerCreate + ContainerStart + Docker-->>Service: container running + Service->>PG: Upsert runtime_records + PG-->>Service: error (transport / constraint) + Note over Service: container is now an orphan
(running, no PG record) + Service->>Docker: Remove(container_id) [fresh background context] + Docker-->>Service: ok or logged failure + Service->>PG: INSERT operation_log (outcome=failure, error_code=container_start_failed) + Service->>Intents: XADD runtime.container_start_failed + Service-->>Service: Result{Outcome=failure, ErrorCode=container_start_failed} +``` + +The Docker adapter already removes the container when `Run` itself +fails after a successful `ContainerCreate` +([`adapters.md` §3](adapters.md)); the start service adds the +post-`Run` rollback for the `Upsert` path. A `Remove` failure is +logged but not propagated; the reconciler adopts surviving orphans on +its periodic pass ([`services.md` §5](services.md)). + +## Stop + +```mermaid +sequenceDiagram + participant Caller as Lobby / GM / Admin + participant Service as stopruntime + participant Lease as Redis lease + participant PG as Postgres + participant Docker + participant Results as runtime:job_results + + Caller->>Service: stop(game_id, reason) + Service->>Lease: SET NX PX rtmanager:game_lease:{game_id} + Service->>PG: SELECT runtime_records WHERE game_id + alt status in {stopped, removed} + Service->>PG: INSERT operation_log (outcome=success, error_code=replay_no_op) + Service-->>Caller: success / replay_no_op + else status = running + Service->>Docker: ContainerStop(container_id, RTMANAGER_CONTAINER_STOP_TIMEOUT_SECONDS) + Docker-->>Service: ok + Service->>PG: UpdateStatus running→stopped (CAS by container_id) + Service->>PG: INSERT operation_log (op_kind=stop, outcome=success) + Service-->>Caller: success + end + Service->>Lease: DEL rtmanager:game_lease:{game_id} +``` + +Lobby callers receive the outcome through `runtime:job_results`; REST +callers receive an HTTP `200`. The `reason` enum +(`orphan_cleanup | cancelled | finished | admin_request | timeout`) +is recorded in `operation_log` and is otherwise opaque to the stop +service — RTM does not branch on the reason in v1 +([`services.md` §15, §17](services.md)). + +## Restart + +```mermaid +sequenceDiagram + participant Admin as GM / Admin + participant Service as restartruntime + participant Stop as stopruntime.Run + participant Start as startruntime.Run + participant Docker + participant PG as Postgres + + Admin->>Service: POST /restart + Service->>PG: SELECT runtime_records WHERE game_id + Note over Service: capture current image_ref + Service->>Service: acquire per-game lease (held across both inner ops) + Service->>Stop: Run(game_id) [lease bypass] + Stop->>Docker: ContainerStop + Stop->>PG: UpdateStatus running→stopped + Service->>Docker: ContainerRemove + Service->>Start: Run(game_id, image_ref) [lease bypass] + Start->>Docker: PullImage / Run + Start->>PG: Upsert runtime_records (status=running) + Service->>PG: INSERT operation_log (op_kind=restart, outcome=success, source_ref=correlation_id) + Service-->>Admin: 200 {runtime_record} + Service->>Service: release lease +``` + +The lease is acquired by `restartruntime` and held across both inner +operations; `stopruntime.Run` and `startruntime.Run` are +lease-bypass entry points that skip the inner lease acquisition +([`services.md` §12](services.md)). The single `operation_log` row +uses `Input.SourceRef` as a correlation id linking the implicit stop +and start entries ([`services.md` §13](services.md)). 
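+
+The part of this flow worth compressing into code is the lease ownership:
+`restartruntime` acquires once and the inner stop and start run with the
+lease already held. A self-contained sketch of that shape, with every name
+illustrative; the real services and their signatures live under
+`internal/services`.
+
+```go
+import (
+	"context"
+	"errors"
+)
+
+// leaseBypassRestart holds its collaborators as plain funcs so the sketch
+// compiles on its own; the real code wires concrete services instead.
+type leaseBypassRestart struct {
+	acquire func(ctx context.Context, gameID string) (bool, error)
+	release func(ctx context.Context, gameID string) error
+	stop    func(ctx context.Context, gameID string) error           // lease-bypass entry point
+	start   func(ctx context.Context, gameID, imageRef string) error // lease-bypass entry point
+}
+
+func (r leaseBypassRestart) Run(ctx context.Context, gameID, imageRef string) error {
+	held, err := r.acquire(ctx, gameID)
+	if err != nil {
+		return err
+	}
+	if !held {
+		return errors.New("conflict: another operation holds the game lease")
+	}
+	defer r.release(ctx, gameID)
+
+	// Both inner calls skip their own lease acquisition; the restart owns the
+	// lease for the whole window (stop, remove, start in the real service).
+	if err := r.stop(ctx, gameID); err != nil {
+		return err
+	}
+	return r.start(ctx, gameID, imageRef)
+}
+```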
+ +## Patch + +```mermaid +sequenceDiagram + participant Admin as GM / Admin + participant Service as patchruntime + participant Restart as restartruntime.Run + + Admin->>Service: POST /patch {image_ref: "galaxy/game:1.4.2"} + Service->>Service: parse new image_ref + current image_ref + alt either ref not semver + Service-->>Admin: 422 image_ref_not_semver + else major or minor differ + Service-->>Admin: 422 semver_patch_only + else major.minor match, patch differs (or equal) + Service->>Restart: Run(game_id, new_image_ref) + Restart-->>Service: Result + Service-->>Admin: 200 {runtime_record} + end +``` + +The semver gate uses the tag fragment of the Docker reference; the +extraction strategy is recorded in [`services.md` §14](services.md). +The restart delegate already owns the lease, the inner stop/start, +the operation log, and the `runtime:health_events container_started` +emission ([`workers.md` §1](workers.md)). + +## Cleanup TTL + +```mermaid +sequenceDiagram + participant Worker as containercleanup worker + participant PG as Postgres + participant Service as cleanupcontainer + participant Lease as Redis lease + participant Docker + + loop every RTMANAGER_CLEANUP_INTERVAL + Worker->>PG: SELECT runtime_records WHERE status='stopped' AND last_op_at < now - retention + loop per game + Worker->>Service: cleanup(game_id, op_source=auto_ttl) + Service->>Lease: SET NX PX rtmanager:game_lease:{game_id} + Service->>PG: re-read runtime_records WHERE game_id + alt status = running + Service-->>Worker: refused / conflict + else status in {stopped, removed} + Service->>Docker: ContainerRemove(container_id) + Service->>PG: UpdateStatus stopped→removed (CAS) + Service->>PG: INSERT operation_log (op_kind=cleanup_container) + Service-->>Worker: success + end + Service->>Lease: DEL rtmanager:game_lease:{game_id} + end + end +``` + +Admin-driven cleanup follows the same path through +`DELETE /api/v1/internal/runtimes/{game_id}/container` with +`op_source=admin_rest` instead of `auto_ttl`. The host state directory +is **never** removed by this flow +([`../README.md` §Cleanup](../README.md#cleanup), +[`services.md` §17](services.md), +[`workers.md` §19](workers.md)). + +## Reconcile drift adopt + +```mermaid +sequenceDiagram + participant Reconciler as reconcile worker + participant Docker + participant PG as Postgres + participant Lease as Redis lease + + Note over Reconciler: read pass (lockless) + Reconciler->>Docker: List({label=com.galaxy.owner=rtmanager}) + Reconciler->>PG: ListByStatus(running) + Note over Reconciler: write pass (per-game lease) + loop per Docker container without matching record + Reconciler->>Lease: SET NX PX rtmanager:game_lease:{game_id} + Reconciler->>PG: re-read runtime_records WHERE game_id + alt record now exists + Reconciler-->>Reconciler: skip (state changed since read pass) + else record still missing + Reconciler->>PG: Upsert runtime_records (status=running, image_ref, started_at) + Reconciler->>PG: INSERT operation_log (op_kind=reconcile_adopt, op_source=auto_reconcile) + end + Reconciler->>Lease: DEL rtmanager:game_lease:{game_id} + end +``` + +The reconciler **never** stops or removes an unrecorded container — +operators may have started one manually for diagnostics. 
The +`reconcile_dispose` and `observed_exited` paths follow the same +read-pass / write-pass split, with `dispose` updating the orphaned +record to `removed` and emitting `container_disappeared`, and +`observed_exited` updating to `stopped` and emitting `container_exited` +([`../README.md` §Reconciliation](../README.md#reconciliation), +[`workers.md` §14–§16](workers.md)). + +## Health probe hysteresis + +```mermaid +sequenceDiagram + participant Worker as healthprobe worker + participant State as in-memory probe state + participant Engine as galaxy-game-{id}:8080 + participant Health as runtime:health_events + + loop every RTMANAGER_PROBE_INTERVAL + Worker->>Worker: ListByStatus(running) + Worker->>State: prune entries for games no longer running + loop per game (semaphore cap = 16) + Worker->>Engine: GET /healthz (RTMANAGER_PROBE_TIMEOUT) + alt success + State->>State: consecutiveFailures = 0 + opt failurePublished was true + Worker->>Health: XADD probe_recovered {prior_failure_count} + State->>State: failurePublished = false + end + else failure + State->>State: consecutiveFailures++ + opt consecutiveFailures == RTMANAGER_PROBE_FAILURES_THRESHOLD AND not failurePublished + Worker->>Health: XADD probe_failed {consecutive_failures, last_status, last_error} + State->>State: failurePublished = true + end + end + end + end +``` + +Hysteresis prevents a single transient failure from emitting a +`probe_failed` event, and prevents repeated emission while the failure +persists. State is non-persistent: a process restart re-establishes +the counters from scratch; a game's state is pruned when it transitions +out of the running list ([`workers.md` §5–§6](workers.md)). diff --git a/rtmanager/docs/integration-tests.md b/rtmanager/docs/integration-tests.md new file mode 100644 index 0000000..1744189 --- /dev/null +++ b/rtmanager/docs/integration-tests.md @@ -0,0 +1,163 @@ +# Service-Local Integration Suite + +This document explains the design of the service-local integration +suite under [`../integration/`](../integration). The current-state +behaviour (harness layout, env knobs, scenario coverage) lives next +to the files themselves; this document records the rationale. + +The cross-service Lobby↔RTM suite at +[`../../integration/lobbyrtm/`](../../integration/lobbyrtm) follows +different rules (it lives in the top-level `galaxy/integration` +module) and is documented inside that package. + +## 1. Build tag `integration` + +The scenarios under [`../integration/*_test.go`](../integration) are +guarded by `//go:build integration`. The default `go test ./...` +invocation skips them, while `go test -tags=integration +./integration/...` (and the `make integration` target) runs the full +set: + +```sh +make -C rtmanager integration +``` + +The harness package itself ([`../integration/harness`](../integration/harness)) +has no build tag. It compiles on every run because each helper guards +its Docker-dependent paths with `t.Skip` when the daemon is +unavailable. This keeps the harness loadable from a tagless `go vet` +or IDE workflow without dragging Docker into the default `go test` +critical path. + +## 2. Smoke test runs in the default `go test` pass + +[`../internal/adapters/docker/smoke_test.go`](../internal/adapters/docker/smoke_test.go) +runs in the regular `go test ./...` pass and falls back on +`skipUnlessDockerAvailable` when no Docker socket is present. 
The +smoke test is intentionally kept separate from the new `integration/` +suite because it exercises the production adapter shape (one +container at a time against `alpine:3.21`), not the full runtime; +both surfaces are useful. + +## 3. In-process `app.NewRuntime` instead of a `cmd/rtmanager` subprocess + +The harness drives Runtime Manager through `app.NewRuntime(ctx, cfg, +logger)` directly rather than spawning the binary from +`cmd/rtmanager/main.go`: + +- **Cleanup is deterministic.** A `t.Cleanup` block can `cancel()` + the runtime context and call `runtime.Close()`; the goroutine + driving `runtime.Run` returns with `context.Canceled` and the + helper waits on it via the `runDone` channel. With a subprocess the + equivalent dance requires SIGTERM, output capture, and graceful + shutdown timing tied to the child's signal handler. +- **Goroutine and store visibility.** Tests read the durable PG state + directly through the harness-owned pool and read every Redis stream + through the harness-owned client. Both observe the exact wire shape + Lobby will see in the cross-service suite. +- **Logger isolation.** The harness defaults to `slog.Discard` so the + default test output stays focused on assertions; flipping + `EnvOptions.LogToStderr` lights up the runtime's structured logs + for local debugging without requiring any subprocess plumbing. + +The cross-service inter-process suite at `integration/lobbyrtm/` +re-uses the existing `integration/internal/harness` binary-spawn +helpers; the in-process choice here is specific to the service-local +scope. + +## 4. `httptest.Server` stub for the Lobby internal client + +Runtime Manager configuration requires a non-empty +`RTMANAGER_LOBBY_INTERNAL_BASE_URL`, and the start service makes a +diagnostic `GET /api/v1/internal/games/{game_id}` call that v1 treats +as a no-op (the start envelope already carries the only required +field, `image_ref`; rationale in [`services.md`](services.md) §7). +The harness therefore stands up a tiny `httptest.Server` per test +that returns a stable `200 OK` response. The stub is intentionally +unconfigurable: every integration scenario produces the same +ancillary fetch, and adding routing/error injection would invite +test code to depend on a contract the start service deliberately +ignores. + +## 5. One built engine image, two semver-compatible tags + +The patch lifecycle expects the new and current image refs to share +the same major / minor version (`semver_patch_only` failure +otherwise). Building two distinct images would multiply the per-run +build cost without changing what the test verifies — the patch path +exercises `image_ref_not_semver` and `semver_patch_only` validation +plus the recreate-with-new-tag flow, none of which depend on +distinct image *content*. The harness builds the engine once and +calls `client.ImageTag` to alias it as both `galaxy/game:1.0.0-rtm-it` +and `galaxy/game:1.0.1-rtm-it`. Both share the same digest. + +The integration tags use the `*-rtm-it` suffix (rather than plain +`galaxy/game:1.0.0`) so an operator running the suite locally cannot +accidentally consume a hand-built dev image, and so a `docker image +rm` of integration leftovers does not nuke a production-shaped tag. + +## 6. Per-test Docker network and per-test state root + +`EnsureNetwork(t)` creates a uniquely-named bridge network per test +and registers cleanup; `t.ArtifactDir()` provides the per-game state +root. 
Both ensure that two scenarios running back-to-back cannot +collide on the per-game DNS hostname (`galaxy-game-{game_id}`) or on +filesystem state. Game ids are themselves unique per test +(`harness.IDFromTestName` adds a nanosecond suffix) — combined with +the per-test network and state root, the suite is safe to run with +`-count` greater than one. + +`t.ArtifactDir()` keeps the engine state directory around when a +test fails (Go ≥ 1.25), so an operator can `cd` into it after a CI +failure and inspect what the engine wrote. On success the directory +is automatically cleaned up. + +## 7. PostgreSQL and Redis containers shared per-package + +Both fixtures use `sync.Once` to start one testcontainer per test +package, mirroring the +[`../internal/adapters/postgres/internal/pgtest`](../internal/adapters/postgres/internal/pgtest) +pattern. `TruncatePostgres` and `FlushRedis` reset state between +tests so each scenario starts on an empty stack. The trade-off versus +per-test containers is the standard one: container startup dominates +the per-package latency, so amortising it across the suite keeps the +loop tight while the truncate/flush ensures isolation. The ~1–2 s +difference matters in CI. + +## 8. Engine image cache is intentionally retained between runs + +`buildAndTagEngineImage` runs once per package via `sync.Once` and +leaves both image tags in the local Docker cache after the suite +exits. The cache is a substantial speed-up on a developer laptop +(`docker build` of `galaxy/game` takes 30+ seconds cold, sub-second +hot), and a stale image is unlikely because the tags carry the +`*-rtm-it` suffix and the underlying Dockerfile is forward-compatible +with multiple test runs. Operators who suspect a stale image can +`docker image rm galaxy/game:1.0.0-rtm-it galaxy/game:1.0.1-rtm-it`; +the next run rebuilds. + +## 9. Scenario coverage + +The suite covers the four end-to-end flows operators care about: + +- **lifecycle** (`lifecycle_test.go`) — start → inspect → stop → + restart → patch → stop → cleanup. The intermediate `stop` between + `patch` and `cleanup` is intentional: the cleanup endpoint refuses + to remove a running container per + [`../README.md` §Cleanup](../README.md#cleanup). +- **replay** (`replay_test.go`) — duplicate start / stop entries + surface as `replay_no_op` per [`workers.md`](workers.md) §11. +- **health** (`health_test.go`) — external `docker rm` produces + `container_disappeared`; manual `docker run` is adopted by the + reconciler. +- **notification** (`notification_test.go`) — unresolvable `image_ref` + produces `runtime.image_pull_failed` plus a `failure` job_result. + +## 10. Service-local scope only + +This suite runs Runtime Manager against a real Docker daemon plus +testcontainers PG / Redis but **does not** include any other Galaxy +service. Cross-service flows (Lobby ↔ RTM, RTM ↔ Notification) live +in the top-level `galaxy/integration/` module, where the harness +spawns multiple service binaries and uses real (not stubbed) cross- +service streams. diff --git a/rtmanager/docs/postgres-migration.md b/rtmanager/docs/postgres-migration.md new file mode 100644 index 0000000..eb15bc2 --- /dev/null +++ b/rtmanager/docs/postgres-migration.md @@ -0,0 +1,531 @@ +# PostgreSQL Schema Decisions + +Runtime Manager has been PostgreSQL-and-Redis from day one — there is +no Redis-only predecessor and no migration window. 
This document +records the schema decisions and the non-obvious agreements behind +them, mirroring the shape of +[`../../notification/docs/postgres-migration.md`](../../notification/docs/postgres-migration.md) +and serving the same role: a single coherent reference for "why does +the persistence layer look this way". + +Use this document together with the migration script +[`../internal/adapters/postgres/migrations/00001_init.sql`](../internal/adapters/postgres/migrations/00001_init.sql) +and the runtime wiring +[`../internal/app/runtime.go`](../internal/app/runtime.go). + +## Outcomes + +- Schema `rtmanager` (provisioned externally) holds the durable + service state across three tables: `runtime_records`, + `operation_log`, `health_snapshots`. The three tables map onto the + three runtime concerns documented in + [`../README.md` §Persistence Layout](../README.md#persistence-layout): + current state per game, audit trail per operation, and latest + technical health observation per game. +- The runtime opens one PostgreSQL pool via `pkg/postgres.OpenPrimary`, + applies embedded goose migrations strictly before any HTTP listener + becomes ready, and exits non-zero when migration or ping fails. + Already-applied migrations exit zero — the + `pkg/postgres`-supplied migrator treats "no work to do" as success. +- The runtime opens one shared `*redis.Client` via + `pkg/redisconn.NewMasterClient` and passes it to the stream offset + store, the per-game lease store, the consumer pipelines, and every + publisher (`runtime:job_results`, `runtime:health_events`, + `notification:intents`). +- The Redis adapter package + [`../internal/adapters/redisstate/`](../internal/adapters/redisstate) + owns one shared `Keyspace` struct with the + `defaultPrefix = "rtmanager:"` constant and per-store subpackages + for stream offsets and the per-game lease. +- Generated jet code under + [`../internal/adapters/postgres/jet/`](../internal/adapters/postgres/jet) + is committed; `make -C rtmanager jet` regenerates it via the + testcontainers-driven `cmd/jetgen` pipeline. +- Configuration uses the `RTMANAGER_` prefix for every variable. + The schema-per-service rule from + [`../../ARCHITECTURE.md` §Persistence Backends](../../ARCHITECTURE.md) + applies: each service's role is grant-restricted to its own + schema; RTM never touches Lobby's `lobby` schema or vice versa. + +## Decisions + +### 1. One schema, externally-provisioned `rtmanagerservice` role + +**Decision.** The `rtmanager` schema and the matching +`rtmanagerservice` role are created outside the migration sequence +(in tests, by the testcontainers harness in `cmd/jetgen/main.go::provisionRoleAndSchema` +and by the integration harness; in production, by an ops init script +not in scope for any service stage). The embedded migration +`00001_init.sql` only contains DDL for the service-owned tables and +indexes and assumes it runs as the schema owner with +`search_path=rtmanager`. + +**Why.** Mixing role creation, schema creation, and table DDL into +one script forces every consumer of the migration to run as a +superuser. The schema-per-service architectural rule +(`ARCHITECTURE.md §Persistence Backends`) lines up neatly with the +operational split: ops provisions roles and schemas, the service +applies schema-scoped migrations. Letting RTM run `CREATE SCHEMA` +from its runtime role would relax the +"each service's role grants are restricted to its own schema" +defense-in-depth rule. + +### 2. 
`runtime_records.game_id` is the natural primary key + +**Decision.** `runtime_records` uses +`game_id text PRIMARY KEY`. There is no surrogate key. The `status` +column carries a CHECK constraint enforcing the +`running | stopped | removed` enum. + +```sql +CREATE TABLE runtime_records ( + game_id text PRIMARY KEY, + status text NOT NULL, + -- ... + CONSTRAINT runtime_records_status_chk + CHECK (status IN ('running', 'stopped', 'removed')) +); +``` + +**Why.** `game_id` is the platform-wide identifier owned by Lobby; +RTM stores at most one record per game ever. A surrogate +`bigserial` would force every cross-service join to translate +through a lookup table; the natural key keeps RTM's persistence +layer pin-compatible with the streams contract (every +`runtime:start_jobs` envelope already names the `game_id`). The +status CHECK reproduces the Go-level enum from +[`../internal/domain/runtime/model.go`](../internal/domain/runtime/model.go) +as a defense-in-depth gate at the storage boundary. Decision context: +[`domain-and-ports.md`](domain-and-ports.md). + +### 3. `(status, last_op_at)` index serves both the cleanup worker and `ListByStatus` + +**Decision.** `runtime_records_status_last_op_idx` is a composite +index on `(status, last_op_at)`. The container cleanup worker scans +`status='stopped' AND last_op_at < cutoff`; the +`runtimerecordstore.ListByStatus` adapter method orders rows +`last_op_at DESC, game_id ASC`. + +```sql +CREATE INDEX runtime_records_status_last_op_idx + ON runtime_records (status, last_op_at); +``` + +**Why.** Both read shapes share the same composite. The cleanup +worker drives the index from one direction (range scan on +`last_op_at` filtered by status); `ListByStatus` drives it from the +other (equality on status, sorted by `last_op_at`). PostgreSQL +satisfies both shapes through one index scan once the planner picks +the index for the WHERE clause. The secondary `game_id ASC` tiebreak +in the adapter ORDER BY is satisfied by primary-key ordering after +the index returns the rows. + +A second supporting index for the cleanup worker was considered and +rejected: the workload is so small (single-instance v1, bounded +running game count) that one composite is dominantly cheaper than +two narrow ones. + +### 4. `operation_log` is append-only with `bigserial id` and a `(game_id, started_at DESC)` index + +**Decision.** `operation_log` carries a `bigserial id PRIMARY KEY` +and is written exclusively through INSERT — there is no UPDATE +pathway, no soft-delete column, and no foreign key to +`runtime_records`. The audit index +`operation_log_game_started_idx (game_id, started_at DESC)` drives +the GM/Admin REST audit reads. The adapter's `ListByGame` orders +results `started_at DESC, id DESC` and applies `LIMIT $2`. + +```sql +CREATE INDEX operation_log_game_started_idx + ON operation_log (game_id, started_at DESC); +``` + +**Why.** The audit's correctness invariant is "every operation RTM +performed gets exactly one row"; CASCADE deletes from +`runtime_records` would silently lose history when an admin removes +a runtime and would break the +[`../README.md` §Persistence Layout](../README.md) commitment. The +secondary `id DESC` tiebreak inside the adapter is necessary because +the audit log can write multiple rows in the same millisecond when +`reconcile_adopt` and a real operation interleave on a single tick; +without the tiebreak the test that asserts insertion-order-stable +reads becomes flaky. 
A non-positive `limit` is rejected before the +SQL is issued; an empty result set returns as `nil` (matching the +lobby pattern, so service-layer callers can do `len(entries) == 0` +without an extra allocation). + +### 5. Enum CHECK constraints on `op_kind`, `op_source`, `outcome` + +**Decision.** `operation_log` reproduces the three Go-level enums +as CHECK constraints: + +```sql +CONSTRAINT operation_log_op_kind_chk + CHECK (op_kind IN ( + 'start', 'stop', 'restart', 'patch', + 'cleanup_container', 'reconcile_adopt', 'reconcile_dispose' + )), +CONSTRAINT operation_log_op_source_chk + CHECK (op_source IN ( + 'lobby_stream', 'gm_rest', 'admin_rest', + 'auto_ttl', 'auto_reconcile' + )), +CONSTRAINT operation_log_outcome_chk + CHECK (outcome IN ('success', 'failure')) +``` + +The Go-level enums in +[`../internal/domain/operation/log.go`](../internal/domain/operation/log.go) +remain the source of truth. + +**Why.** A defence-in-depth gate at the storage boundary catches any +adapter regression that would otherwise persist an unexpected +string. Operator-side queries (`SELECT … WHERE op_kind = 'restart'`) +benefit from the enum being verifiable directly in psql without +consulting the Go source. Adding a new value requires editing two +places (the Go enum and the migration), which is the right friction +level: every new value is a wire-protocol change and deserves an +explicit migration. The alternative of using PostgreSQL's `CREATE +TYPE … AS ENUM` was rejected because adding a value to a PG enum +type requires `ALTER TYPE` outside a transaction and complicates the +single-init pre-launch policy (decision §12). + +### 6. `health_snapshots` is one row per game; status enum collapses event types + +**Decision.** `health_snapshots` carries `game_id text PRIMARY KEY` +and stores the latest technical health observation per game. The +`status` column enumerates the **observed engine state**, not the +**triggering event type**: + +```sql +CONSTRAINT health_snapshots_status_chk + CHECK (status IN ( + 'healthy', 'probe_failed', 'exited', + 'oom', 'inspect_unhealthy', 'container_disappeared' + )) +``` + +The `runtime:health_events` `event_type` enum has seven values +(`container_started`, `container_exited`, `container_oom`, +`container_disappeared`, `inspect_unhealthy`, `probe_failed`, +`probe_recovered`). The snapshot status has six — the two probe +events fold into `healthy` (after `probe_recovered`) and +`probe_failed`, and `container_started` collapses into `healthy`. + +**Why.** Health snapshots answer "what state is the engine in +**right now**", not "what event was just emitted". A consumer who +wants the event firehose reads `runtime:health_events`; a consumer +who wants the latest verdict reads `health_snapshots`. The two +surfaces have different lifetimes (stream entries are bounded only +by Redis trim; snapshot rows are overwritten on every new +observation), so collapsing the seven event types into six status +states aligns the column with the consumer's mental model. The +adapter that implements this collapse lives in +[`../internal/adapters/healtheventspublisher/publisher.go`](../internal/adapters/healtheventspublisher/publisher.go); +every emission to the stream also upserts the snapshot. + +### 7. Two-axis CAS shape on `runtime_records.UpdateStatus` + +**Decision.** `runtimerecordstore.UpdateStatus` compiles its CAS +guard into a single `WHERE … AND …` clause. 
Status must equal the +caller's `ExpectedFrom`; when the caller supplies a non-empty +`ExpectedContainerID`, `current_container_id` must equal it as +well: + +```sql +UPDATE rtmanager.runtime_records +SET status = $1, last_op_at = $2, ... +WHERE game_id = $3 + AND status = $4 + [AND current_container_id = $5] +``` + +A `RowsAffected() == 0` result is ambiguous — the row may be absent +or the predicate may have failed. The adapter resolves the ambiguity +through a follow-up `SELECT status FROM ... WHERE game_id = $1`: +missing row → `runtime.ErrNotFound`; mismatch → `runtime.ErrConflict`. +The probe runs only on the slow path; happy-path UPDATEs cost a +single round trip. + +**Why.** The two-axis CAS is what services need: a stop driven by an +old container_id (from a stale REST request) must not clobber a +fresh `running` record installed by a concurrent restart. Status-only +CAS would collapse those two cases. The optional shape on +`ExpectedContainerID` lets reconciliation flows that legitimately +target "this game in `running` state without caring which container" +omit the second predicate. The follow-up probe matches the +gamestore / invitestore precedent in `lobby/internal/adapters/postgres` +and produces clean per-error sentinels at the service layer. + +`TestUpdateStatusConcurrentCAS` exercises the path end to end with +eight goroutines racing the same transition: exactly one returns +`nil`, the rest see `runtime.ErrConflict`. The test is deterministic +because PostgreSQL serialises row-level UPDATEs through the row's +MVCC tuple. + +### 8. Destination-driven `SET` clause on `UpdateStatus` + +**Decision.** `UpdateStatus` updates a different column subset +depending on the destination status: + +| Destination | Columns set | +| --- | --- | +| `stopped` | `status`, `last_op_at`, `stopped_at` | +| `removed` | `status`, `last_op_at`, `removed_at`, `current_container_id = NULL` | +| `running` | `status`, `last_op_at` | + +The implementation switches on `input.To` and writes the UPDATE +chain inline per branch — three short branches read better than one +parametric helper. + +**Why.** Each destination has a different invariant. `stopped` +records the wall-clock at which the engine ceased serving; `removed` +nulls the container_id because the row no longer points at any +Docker resource; `running` only updates the status and the +last-op timestamp because the running invariants +(`current_container_id`, fresh `started_at`, `current_image_ref`, +`engine_endpoint`) are installed through `Upsert` on the `start` +path. + +A previous draft built the SET list via `[]pg.Column` / `[]any` +slices and a helper, but jet's `UPDATE(columns ...jet.Column)` +variadic refuses a `[]postgres.Column` slice spread because the +element type does not match `jet.Column` after the type-alias +resolution. The final code switches inline per branch. + +The `running` destination is implemented even though the start +service uses `Upsert` for the inner start of restart and patch. +Keeping the `running` path live preserves a one-to-one match between +`runtime.AllowedTransitions()` and the adapter's capability matrix — +otherwise a future caller exercising the `stopped → running` +transition through `UpdateStatus` would hit a runtime error inside +the adapter rather than a domain rejection. The path only updates +`status` and `last_op_at`; callers responsible for the running +invariants install them through `Upsert` first. + +### 9. 
`created_at` preservation on `Upsert`
+
+**Decision.** `runtimerecordstore.Upsert` is implemented as an
+`INSERT ... ON CONFLICT (game_id) DO UPDATE` statement whose
+`DO UPDATE SET` list deliberately omits `created_at`, so a second
+`Upsert` with a fresh `CreatedAt` value never overwrites the stored
+timestamp.
+
+```sql
+INSERT INTO rtmanager.runtime_records (...)
+VALUES (...)
+ON CONFLICT (game_id) DO UPDATE
+SET status = EXCLUDED.status,
+    current_container_id = EXCLUDED.current_container_id,
+    current_image_ref = EXCLUDED.current_image_ref,
+    engine_endpoint = EXCLUDED.engine_endpoint,
+    state_path = EXCLUDED.state_path,
+    docker_network = EXCLUDED.docker_network,
+    started_at = EXCLUDED.started_at,
+    stopped_at = EXCLUDED.stopped_at,
+    removed_at = EXCLUDED.removed_at,
+    last_op_at = EXCLUDED.last_op_at
+    -- created_at intentionally NOT updated
+```
+
+`TestUpsertOverwritesMutableColumnsPreservesCreatedAt` covers the
+invariant.
+
+**Why.** `runtime_records.created_at` records "first time RTM saw
+the game". Every restart and every reconcile_adopt re-Upserts the
+row with the current wall-clock as `CreatedAt` from the adapter
+boundary; without the omission rule the timestamp would drift
+forward. Preserving the original creation time keeps a stable
+horizon for retention reasoning and matches
+`lobby/internal/adapters/postgres/gamestore.Save`, which uses the
+same approach for the `games.created_at` column.
+
+### 10. `health_snapshots.details` JSONB round-trip with `'{}'::jsonb` default
+
+**Decision.** `health_snapshots.details` is `jsonb NOT NULL DEFAULT
+'{}'::jsonb`. The jet-generated model declares
+`Details string` (jet maps `jsonb` to `string`). The adapter:
+
+- on `Upsert`, substitutes the SQL DEFAULT `{}` when
+  `snapshot.Details` is empty, so the column never holds a non-JSON
+  empty string;
+- on `Get`, scans `details` as `[]byte` and wraps the bytes in a
+  `json.RawMessage` so the caller receives verbatim bytes without
+  an extra round of parsing.
+
+`TestUpsertEmptyDetailsRoundTripsAsEmptyObject` and
+`TestUpsertAndGetRoundTrip` cover the two cases.
+
+**Why.** The detail payload is type-specific (the keys differ
+between `probe_failed` and `inspect_unhealthy`) and is opaque to
+queries — the column is never element-filtered. JSONB matches the
+"everything outside primary fields is JSON" pattern that the
+Notification Service already established and allows a future
+GIN index (e.g. for an admin search-by-key feature) without a
+schema rewrite. Substituting the SQL DEFAULT for an empty
+parameter avoids the trap where the database accepts `''` for
+`text` but rejects it for `jsonb`.
+
+### 11. Timestamps are uniformly `timestamptz` with UTC normalisation at the adapter boundary
+
+**Decision.** Every time-valued column on every RTM table uses
+PostgreSQL's `timestamptz`. The domain model continues to use
+`time.Time`; the adapter normalises every `time.Time` parameter to
+UTC at the binding site (`record.X.UTC()` or the `nullableTime`
+helper that wraps a possibly-zero `time.Time`), and re-wraps every
+scanned `time.Time` with `.UTC()` (directly or via
+`timeFromNullable` for nullable columns) before the value leaves
+the adapter.
+
+The architecture-wide form of this rule lives in
+[`../../ARCHITECTURE.md` §Persistence Backends → Timestamp handling](../../ARCHITECTURE.md).
+
+**Why.** `timestamptz` is the right column type for every
+cross-service timestamp the platform observes, and the domain model
+needs a `time.Time` API the service layer can compare and
+arithmetise.
+Without explicit `.UTC()` on the bind site, the pgx driver returns +scanned values in `time.Local`, which silently breaks equality +tests, JSON formatting, and comparison against pointer fields +elsewhere in the codebase. The defensive `.UTC()` rule on both +sides eliminates the class of bug where a timezone difference +between the adapter and the test harness flips assertions +intermittently. + +The same shape is used in User Service, Mail Service, and +Notification Service — RTM matches the existing convention rather +than introducing a fourth encoding path. + +### 12. Single-init pre-launch policy + +**Decision.** `00001_init.sql` evolves in place until first +production deploy. Adding a column, an index, or a new table during +the pre-launch development window edits this file directly rather +than producing `00002_*.sql`. The runtime applies the migration on +every boot; if the schema is already at head, `pkg/postgres`'s +goose adapter exits zero. + +**Why.** The schema-per-service architectural rule +([`../../ARCHITECTURE.md` §Persistence Backends](../../ARCHITECTURE.md)) +endorses a single-init policy for pre-launch services. The +pre-launch window allows non-additive changes (column rename, type +narrowing, CHECK tightening) that a multi-step migration sequence +would force into awkward two-step rewrites. Once the service ships +to production, the next schema change becomes `00002_*.sql` and +the policy lifts; from that point onward edits to `00001_init.sql` +are rejected by code review. + +This applies to RTM exactly the same way it applies to every other +PG-backed service in the workspace; the README explicitly carries +the reminder. The exit-zero behaviour for already-applied +migrations is what makes the policy operationally cheap: a +freshly-spawned replica re-applies the same `00001_init.sql` with +no work to do, no logged error, and proceeds to open its +listeners. + +### 13. Query layer is `go-jet/jet/v2`; generated code is committed + +**Decision.** All three RTM PG-store packages +([`../internal/adapters/postgres/runtimerecordstore`](../internal/adapters/postgres/runtimerecordstore), +[`../internal/adapters/postgres/operationlogstore`](../internal/adapters/postgres/operationlogstore), +[`../internal/adapters/postgres/healthsnapshotstore`](../internal/adapters/postgres/healthsnapshotstore)) +build SQL through the jet builder API +(`pgtable..INSERT/SELECT/UPDATE/DELETE` plus the +`pg.AND/OR/SET/COALESCE/...` DSL). + +Generated table models live under +[`../internal/adapters/postgres/jet/`](../internal/adapters/postgres/jet) +and are regenerated by `make -C rtmanager jet`. The target invokes +[`../cmd/jetgen/main.go`](../cmd/jetgen/main.go), which spins up a +transient PostgreSQL container via testcontainers, provisions the +`rtmanager` schema and `rtmanagerservice` role, applies the embedded +goose migrations, and runs `github.com/go-jet/jet/v2/generator/postgres.GenerateDB` +against the provisioned schema. Generated code is committed to the +repo, so build consumers do not need Docker. + +Statements are run through the `database/sql` API +(`stmt.Sql() → db/tx.Exec/Query/QueryRow`); manual `rowScanner` +helpers preserve the codecs.go boundary translations and +domain-type mapping (status enum decoding, `time.Time` UTC +normalisation, JSONB `[]byte` ↔ `json.RawMessage`). 
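+
+The `time.Time` UTC normalisation named in that list is the easiest of the
+boundary translations to show in isolation. A minimal, stdlib-only sketch of
+the two directions; the adapter's real helpers are the `nullableTime` and
+`timeFromNullable` functions mentioned in decision §11, and their exact
+signatures may differ.
+
+```go
+import (
+	"database/sql"
+	"time"
+)
+
+// bindNullableTime is the binding direction: a zero time.Time becomes SQL
+// NULL, anything else is normalised to UTC before it reaches the driver.
+func bindNullableTime(t time.Time) sql.NullTime {
+	if t.IsZero() {
+		return sql.NullTime{}
+	}
+	return sql.NullTime{Time: t.UTC(), Valid: true}
+}
+
+// scanNullableTime is the scanning direction: NULL becomes the zero
+// time.Time, and a present value is re-wrapped in UTC before it leaves the
+// adapter, so callers never observe the driver's time.Local location.
+func scanNullableTime(nt sql.NullTime) time.Time {
+	if !nt.Valid {
+		return time.Time{}
+	}
+	return nt.Time.UTC()
+}
+```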
+ +PostgreSQL constructs that the jet builder does not cover natively +(`COALESCE`, `LOWER` on subselects, JSONB params) are expressed +through the per-DSL helpers (`pg.COALESCE`, `pg.LOWER`, direct +`[]byte`/string params for JSONB columns). + +**Why.** Aligns with the workspace-wide convention from +[`../../PG_PLAN.md`](../../PG_PLAN.md): the query layer is +`github.com/go-jet/jet/v2` (PostgreSQL dialect) for every PG-backed +service. Hand-rolled SQL would multiply boundary-translation paths +and require per-store query-builder helpers for what jet already +covers. Committing generated code keeps `go build ./...` working +without Docker. + +### 14. `redisstate` keyspace ownership and per-store subpackages + +**Decision.** The +[`../internal/adapters/redisstate/`](../internal/adapters/redisstate) +package owns one shared `Keyspace` struct with a +`defaultPrefix = "rtmanager:"` constant. Each Redis-backed adapter +lives in its own subpackage: + +- [`redisstate/streamoffsets`](../internal/adapters/redisstate/streamoffsets/) + for the stream offset store consumed by the start-jobs and + stop-jobs consumers; +- [`redisstate/gamelease`](../internal/adapters/redisstate/gamelease/) + for the per-game lease store consumed by every lifecycle service + and the reconciler. + +Both subpackages take a `redisstate.Keyspace{}` value and use it to +build their key shapes (`rtmanager:stream_offsets:{label}`, +`rtmanager:game_lease:{game_id}`). + +**Why.** Keeping the parent package as the single owner of the prefix +and the key-shape builder mirrors the way Lobby's `redisstate` +namespace centralises every key shape and supports multiple Redis- +backed adapters (stream offsets, the per-game lease) without a +restructure as the surface grows. + +The per-store subpackage choice (rather than Lobby's flat +single-package shape) is driven by three considerations: + +- It keeps the docker mock generator scoped to one package, since + `mockgen` regenerates per-directory. +- It allows finer-grained dependency selection: `miniredis` is a + dev-only dep, and keeping the `streamoffsets` package + self-contained leaves room for `gamelease` to depend only on the + production `redis` client. +- Each subpackage carries its own tests, which keeps the test + surface focused on one Redis primitive rather than mixing offset + semantics with lease semantics in shared fixtures. + +## Cross-References + +- [`../internal/adapters/postgres/migrations/00001_init.sql`](../internal/adapters/postgres/migrations/00001_init.sql) + — the embedded schema migration. +- [`../internal/adapters/postgres/migrations/migrations.go`](../internal/adapters/postgres/migrations/migrations.go) + — `//go:embed *.sql` and `FS()` exporter consumed by the runtime. +- [`../internal/adapters/postgres/runtimerecordstore`](../internal/adapters/postgres/runtimerecordstore), + [`../internal/adapters/postgres/operationlogstore`](../internal/adapters/postgres/operationlogstore), + [`../internal/adapters/postgres/healthsnapshotstore`](../internal/adapters/postgres/healthsnapshotstore) + — the three jet-backed PG adapters and their testcontainers-driven + unit suites. +- [`../internal/adapters/postgres/jet/`](../internal/adapters/postgres/jet) + — committed generated jet models. +- [`../cmd/jetgen/main.go`](../cmd/jetgen/main.go) and + [`../Makefile`](../Makefile) `jet` target — the regeneration + pipeline. 
+- [`../internal/adapters/redisstate/`](../internal/adapters/redisstate), + [`../internal/adapters/redisstate/streamoffsets/`](../internal/adapters/redisstate/streamoffsets/), + [`../internal/adapters/redisstate/gamelease/`](../internal/adapters/redisstate/gamelease/) + — Redis adapter package layout. +- [`../internal/app/runtime.go`](../internal/app/runtime.go) + — runtime wiring: PG pool open + migration apply + Redis client + open + adapter assembly. +- [`../internal/config/`](../internal/config) — the config groups + consumed by the wiring (`Postgres`, `Redis`, `Streams`, + `Coordination`). +- Companion design rationales: + [`domain-and-ports.md`](domain-and-ports.md) for status enum and + domain shape, [`adapters.md`](adapters.md) for the redisstate + publishers and clients. diff --git a/rtmanager/docs/runbook.md b/rtmanager/docs/runbook.md new file mode 100644 index 0000000..afc4065 --- /dev/null +++ b/rtmanager/docs/runbook.md @@ -0,0 +1,368 @@ +# Operator Runbook + +This runbook covers the checks that matter most during startup, +steady-state readiness, shutdown, and the handful of recovery paths +specific to Runtime Manager. + +## Startup Checks + +Before starting the process, confirm: + +- `RTMANAGER_DOCKER_HOST` (default `unix:///var/run/docker.sock`) + reaches a Docker daemon the operator controls. RTM is the only + Galaxy service permitted to interact with the Docker socket; + scoping the daemon to RTM-only callers is operator domain. +- `RTMANAGER_DOCKER_NETWORK` (default `galaxy-net`) names a + user-defined bridge network that has already been created (e.g. + via `docker network create galaxy-net` in the environment's + bootstrap script). RTM **validates** the network at startup but + never creates it. A missing network is fail-fast and the process + exits non-zero before opening any listener. +- `RTMANAGER_GAME_STATE_ROOT` is a host directory the daemon's user + can read and write. Per-game subdirectories are created with + `RTMANAGER_GAME_STATE_DIR_MODE` (default `0750`) and + `RTMANAGER_GAME_STATE_OWNER_UID` / `_GID` (default `0:0`); set the + uid/gid to match the engine container's user when running with a + non-root engine. +- `RTMANAGER_POSTGRES_PRIMARY_DSN` points to the PostgreSQL primary + that hosts the `rtmanager` schema. The DSN must include + `search_path=rtmanager` and `sslmode=disable` (or a real SSL mode + for production). Embedded goose migrations apply at startup before + any HTTP listener opens; a migration or ping failure terminates the + process with a non-zero exit. The `rtmanager` schema and the + matching `rtmanagerservice` role are provisioned externally + ([`postgres-migration.md` §1](postgres-migration.md)). +- `RTMANAGER_REDIS_MASTER_ADDR` and `RTMANAGER_REDIS_PASSWORD` reach + the Redis deployment used for the runtime-coordination state: + stream consumers (`runtime:start_jobs`, `runtime:stop_jobs`), + publishers (`runtime:job_results`, `runtime:health_events`, + `notification:intents`), persisted offsets, and the per-game + lease. RTM does not maintain durable business state on Redis. 
+- Stream names match the producers and consumers RTM integrates with: + - `RTMANAGER_REDIS_START_JOBS_STREAM` (default `runtime:start_jobs`) + - `RTMANAGER_REDIS_STOP_JOBS_STREAM` (default `runtime:stop_jobs`) + - `RTMANAGER_REDIS_JOB_RESULTS_STREAM` (default `runtime:job_results`) + - `RTMANAGER_REDIS_HEALTH_EVENTS_STREAM` (default `runtime:health_events`) + - `RTMANAGER_NOTIFICATION_INTENTS_STREAM` (default `notification:intents`) +- `RTMANAGER_LOBBY_INTERNAL_BASE_URL` resolves to Lobby's internal + HTTP listener. RTM's start service issues a diagnostic + `GET /api/v1/internal/games/{game_id}` per start; failure is logged + at debug and does not abort the start + ([`services.md` §7](services.md)). + +The startup sequence runs in the order recorded in +[`../README.md` §Startup dependencies](../README.md#startup-dependencies): + +1. PostgreSQL primary opens; goose migrations apply synchronously. +2. Redis master client opens and pings. +3. Docker daemon ping; configured network presence check. +4. Telemetry exporter (OTLP grpc/http or stdout). +5. Internal HTTP listener. +6. Reconciler runs **once synchronously** and blocks until done. +7. Background workers start. + +A failure at any step is fatal. The synchronous reconciler pass is +the reason orphaned containers from a prior process never reach the +periodic workers in an inconsistent state +([`workers.md` §17](workers.md)). + +Expected log lines on a healthy boot: + +- `migrations applied`, +- `postgres ping ok`, +- `redis ping ok`, +- `docker ping ok` and `docker network found`, +- `telemetry exporter started`, +- `internal http listening`, +- `reconciler initial pass completed`, +- one `worker started` entry per background worker (seven expected). + +## Readiness + +Use the probes according to what they actually verify: + +- `GET /healthz` confirms the listener is alive — no dependency + check. +- `GET /readyz` live-pings PostgreSQL primary, Redis master, and the + Docker daemon, then asserts the configured Docker network exists. + Returns `{"status":"ready"}` when every check passes; otherwise + returns `503` with the canonical + `{"error":{"code":"service_unavailable","message":"…"}}` envelope + identifying the first failing dependency. + +`/readyz` is the strongest readiness signal RTM exposes; unlike +Lobby's `/readyz`, it does **not** rely on a one-shot boot ping. +Each request hits the daemon and the database fresh. + +For a practical readiness check in production: + +1. confirm the process emitted the listener and worker startup logs; +2. check `GET /healthz` and `GET /readyz`; +3. verify `rtmanager.runtime_records_by_status{status="running"}` + gauge tracks the expected live game count after the first start + completes; +4. verify `rtmanager.docker_op_latency` histograms have at least one + sample after the first lifecycle operation. + +## Shutdown + +The process handles `SIGINT` and `SIGTERM`. + +Shutdown behaviour: + +- the per-component shutdown budget is controlled by + `RTMANAGER_SHUTDOWN_TIMEOUT` (default `30s`); +- the internal HTTP listener drains in-flight requests before closing; +- stream consumers stop their `XREAD` loops and persist the latest + offset before returning; the offset survives the restart + ([`workers.md` §9](workers.md)); +- the Docker events listener cancels its subscription; +- the in-flight services release their per-game lease through the + surrounding context cancellation; +- the reconciler completes its current pass or aborts mid-write at + the next lease re-acquisition. 
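+
+The offset hand-off in the consumer bullet above is the part worth seeing
+concretely: the XREAD loop exits on context cancellation and persists the
+last processed entry id before returning, so the next process resumes where
+this one stopped. A minimal sketch assuming a go-redis client and a
+caller-supplied persist func; names and wiring are illustrative, not the
+actual consumer worker.
+
+```go
+import (
+	"context"
+	"errors"
+	"time"
+
+	"github.com/redis/go-redis/v9"
+)
+
+// consumeStartJobs reads runtime:start_jobs from lastID onward and writes
+// the newest processed id back through persist when shutdown begins.
+func consumeStartJobs(ctx context.Context, rdb *redis.Client, lastID string,
+	persist func(context.Context, string) error,
+	handle func(context.Context, redis.XMessage) error,
+) error {
+	for {
+		res, err := rdb.XRead(ctx, &redis.XReadArgs{
+			Streams: []string{"runtime:start_jobs", lastID},
+			Count:   16,
+			Block:   5 * time.Second,
+		}).Result()
+		if ctx.Err() != nil {
+			// Shutdown: persist with a fresh context so the offset write is
+			// not cut short by the same cancellation that stopped the loop.
+			return persist(context.WithoutCancel(ctx), lastID)
+		}
+		if errors.Is(err, redis.Nil) {
+			continue // block window elapsed with no new entries
+		}
+		if err != nil {
+			return err
+		}
+		for _, msg := range res[0].Messages {
+			if err := handle(ctx, msg); err != nil {
+				return err
+			}
+			lastID = msg.ID
+		}
+	}
+}
+```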
+ +During planned restarts: + +1. send `SIGTERM`; +2. wait for the listener and component-stop logs; +3. expect any consumer that was mid-cycle to retry from the persisted + offset on the next process start; +4. investigate only if shutdown exceeds `RTMANAGER_SHUTDOWN_TIMEOUT`. + +## Engine Container Died + +A running engine container that exits unexpectedly surfaces through +three observation channels: + +- The Docker events listener emits `container_exited` (non-zero exit + code) or `container_oom` (Docker action `oom`). +- The active probe worker eventually emits `probe_failed` once the + threshold is crossed. +- The Docker inspect worker may emit `inspect_unhealthy` if the + engine restarts under Docker's healthcheck or if Docker reports an + unexpected status. + +Triage: + +1. Inspect the `runtime:health_events` stream for the affected + `game_id` and `event_type`: + ```bash + redis-cli XRANGE runtime:health_events - + COUNT 200 \ + | grep -A4 'game_id\s*' + ``` +2. Read the runtime record and the operation log: + ```bash + curl -s http://:8096/api/v1/internal/runtimes/ + psql "$RTMANAGER_POSTGRES_PRIMARY_DSN" -c \ + "SELECT id, op_kind, op_source, outcome, error_code, started_at + FROM rtmanager.operation_log + WHERE game_id = '' + ORDER BY started_at DESC LIMIT 20" + ``` +3. If Lobby has not reacted (the game's status remains `running` in + `lobby.games`), check `runtime:job_results` lag and Lobby's + `runtimejobresult` worker. RTM publishes the result; Lobby is the + consumer. +4. If the container is already gone (`docker ps -a` shows no row for + `galaxy-game-`), the reconciler will move the record to + `removed` on its next pass. Run the periodic reconcile manually + by sending `SIGHUP` is **not** supported — wait + `RTMANAGER_RECONCILE_INTERVAL` (default `5m`) or restart the + process; the synchronous boot pass will handle the drift. +5. The `notification:intents` stream is **not** the place to look + for ongoing health changes. Only the three first-touch start + failures (`runtime.image_pull_failed`, + `runtime.container_start_failed`, + `runtime.start_config_invalid`) produce a notification intent; + probe failures, OOMs, and exits flow through health events only + ([`../README.md` §Notification Contracts](../README.md#notification-contracts)). + +## Patch Upgrade + +A patch upgrade replaces the container with a new `image_ref` while +preserving the bind-mounted state directory. + +Pre-conditions: + +- The new and current `image_ref` tags both parse as semver. RTM + rejects non-semver tags with `image_ref_not_semver`. +- The new and current major / minor versions match. A cross-major or + cross-minor patch returns `semver_patch_only`. + +Driving the upgrade: + +```bash +curl -s -X POST \ + -H 'Content-Type: application/json' \ + -H 'X-Galaxy-Caller: admin' \ + http://:8096/api/v1/internal/runtimes//patch \ + -d '{"image_ref": "galaxy/game:1.4.2"}' +``` + +Behaviour: + +- The container is stopped, removed, and recreated. The + `current_container_id` changes; the `engine_endpoint` + (`http://galaxy-game-:8080`) is stable. +- The engine reads its state from the bind mount on startup, so any + data written before the patch survives. +- A single `operation_log` row is appended with `op_kind=patch` and + the old / new image refs. +- A `runtime:health_events container_started` is emitted by the + inner start ([`workers.md` §1](workers.md)). 
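+
+The two `422` rejections described in the pre-conditions come from a
+tag-level semver comparison. A minimal sketch of that gate using
+`golang.org/x/mod/semver` on the tag fragment of the image reference;
+whether the real check uses that package, and how the tag is extracted, is
+recorded in [`services.md` §14](services.md), so treat the names here as
+illustrative.
+
+```go
+import (
+	"errors"
+	"strings"
+
+	"golang.org/x/mod/semver"
+)
+
+var (
+	errNotSemver = errors.New("image_ref_not_semver")
+	errPatchOnly = errors.New("semver_patch_only")
+)
+
+// checkPatchAllowed enforces the semver-only rule: both tags must parse as
+// semver and share the same major.minor; only the patch level may differ.
+func checkPatchAllowed(currentRef, newRef string) error {
+	cur, next := tagOf(currentRef), tagOf(newRef)
+	// x/mod/semver expects a leading "v" on the version string.
+	if !semver.IsValid("v"+cur) || !semver.IsValid("v"+next) {
+		return errNotSemver
+	}
+	if semver.MajorMinor("v"+cur) != semver.MajorMinor("v"+next) {
+		return errPatchOnly
+	}
+	return nil
+}
+
+// tagOf returns the tag fragment of a reference like "galaxy/game:1.4.2".
+// Simplified: it ignores digests and registry ports.
+func tagOf(ref string) string {
+	if i := strings.LastIndex(ref, ":"); i >= 0 {
+		return ref[i+1:]
+	}
+	return ""
+}
+```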
+ +Post-patch verification: + +```bash +curl -s http://galaxy-game-:8080/healthz +curl -s http://:8096/api/v1/internal/runtimes/ +``` + +The `current_image_ref` field on the runtime record reflects the new +tag. + +## Manual Cleanup + +The cleanup endpoint removes the container and updates the record to +`removed`. It refuses to remove a `running` container — stop first. + +```bash +# Stop, then clean up +curl -s -X POST \ + -H 'Content-Type: application/json' \ + -H 'X-Galaxy-Caller: admin' \ + http://:8096/api/v1/internal/runtimes//stop \ + -d '{"reason":"admin_request"}' + +curl -s -X DELETE \ + -H 'X-Galaxy-Caller: admin' \ + http://:8096/api/v1/internal/runtimes//container +``` + +The host state directory under `/` +is **never** deleted by RTM. Removing the directory is operator +domain (backup tooling, future Admin Service workflow). The +operation_log records `op_kind=cleanup_container` with +`op_source=admin_rest`. + +## Reconcile Drift After Docker Daemon Restart + +A Docker daemon restart drops every running engine container; PG +records remain. On RTM's next boot (or its next periodic reconcile): + +1. The reconciler observes `running` records whose containers are + missing from `docker ps`. It updates each record to `removed`, + appends `operation_log` with `op_kind=reconcile_dispose`, and + publishes `runtime:health_events container_disappeared` + ([`workers.md` §14–§15](workers.md)). +2. Lobby's `runtimejobresult` worker does not consume the dispose + event in v1, so the cascade does not auto-restart the engine. + Operators trigger restarts through Lobby's user-facing flow or + directly via the GM/Admin REST `restart` endpoint. +3. If the operator brings up an engine container manually for + diagnostics (`docker run` with the + `com.galaxy.owner=rtmanager,com.galaxy.game_id=` labels), + the reconciler **adopts** it on the next pass: a new + `runtime_records` row appears with `op_kind=reconcile_adopt`. + The reconciler **never stops or removes** an unrecorded + container — operators stay in control of manual containers + ([`../README.md` §Reconciliation](../README.md#reconciliation)). + +Three drift kinds run through the same lease-guarded write pass: +`adopt`, `dispose`, and the README-level path +`observed_exited` (a record marked `running` whose container exists +but is in `exited`). Telemetry counter +`rtmanager.reconcile_drift{kind}` exposes the three independently +([`workers.md` §15](workers.md)). + +## Testing Locally + +```sh +# One-time bootstrap +docker network create galaxy-net + +# Minimal env (see docs/examples.md for a complete .env) +export RTMANAGER_GAME_STATE_ROOT=/var/lib/galaxy/games +export RTMANAGER_DOCKER_NETWORK=galaxy-net +export RTMANAGER_INTERNAL_HTTP_ADDR=:8096 +export RTMANAGER_DOCKER_HOST=unix:///var/run/docker.sock +export RTMANAGER_POSTGRES_PRIMARY_DSN='postgres://rtmanagerservice:rtmanagerservice@127.0.0.1:5432/galaxy?search_path=rtmanager&sslmode=disable' +export RTMANAGER_REDIS_MASTER_ADDR=127.0.0.1:6379 +export RTMANAGER_REDIS_PASSWORD=local +export RTMANAGER_LOBBY_INTERNAL_BASE_URL=http://127.0.0.1:8095 + +go run ./rtmanager/cmd/rtmanager +``` + +After start: + +- `curl http://localhost:8096/healthz` returns `{"status":"ok"}`; +- `curl http://localhost:8096/readyz` returns `{"status":"ready"}` + once PG, Redis, and Docker pings pass and the configured network + exists; +- driving Lobby through its public flow (`POST /api/v1/lobby/games//start`) + brings up `galaxy-game-` containers; RTM logs each + lifecycle transition. 
+
+The integration suite under `rtmanager/integration/` exercises the
+end-to-end flows against the real Docker daemon. The default
+`go test ./...` skips it via the `integration` build tag; run it
+explicitly with:
+
+```sh
+make -C rtmanager integration
+```
+
+The suite requires a reachable Docker daemon. Without one, the
+harness helpers call `t.Skip` and the package becomes a no-op
+([`integration-tests.md` §1](integration-tests.md)).
+
+## Diagnostic Queries
+
+Durable runtime state lives in PostgreSQL; runtime-coordination state
+stays in Redis. CLI snippets that help during incidents:
+
+```bash
+# Live runtime count by status (PostgreSQL)
+psql "$RTMANAGER_POSTGRES_PRIMARY_DSN" -c \
+  "SELECT status, COUNT(*) FROM rtmanager.runtime_records GROUP BY status"
+
+# Inspect a specific runtime record
+psql "$RTMANAGER_POSTGRES_PRIMARY_DSN" -c \
+  "SELECT * FROM rtmanager.runtime_records WHERE game_id = '<game_id>'"
+
+# Last 20 operations for a game (newest first)
+psql "$RTMANAGER_POSTGRES_PRIMARY_DSN" -c \
+  "SELECT id, op_kind, op_source, outcome, error_code,
+          started_at, finished_at
+     FROM rtmanager.operation_log
+    WHERE game_id = '<game_id>'
+    ORDER BY started_at DESC, id DESC
+    LIMIT 20"
+
+# Latest health snapshot
+psql "$RTMANAGER_POSTGRES_PRIMARY_DSN" -c \
+  "SELECT * FROM rtmanager.health_snapshots WHERE game_id = '<game_id>'"
+
+# Containers RTM owns (Docker)
+docker ps --filter label=com.galaxy.owner=rtmanager \
+  --format 'table {{.ID}}\t{{.Names}}\t{{.Status}}\t{{.Labels}}'
+
+# Stream lag (Redis)
+redis-cli XINFO STREAM runtime:start_jobs
+redis-cli XINFO STREAM runtime:stop_jobs
+redis-cli GET rtmanager:stream_offsets:startjobs
+redis-cli GET rtmanager:stream_offsets:stopjobs
+
+# Recent health events (oldest first)
+redis-cli XRANGE runtime:health_events - + COUNT 100
+
+# Per-game lease (only present while an operation runs)
+redis-cli GET rtmanager:game_lease:<game_id>
+redis-cli TTL rtmanager:game_lease:<game_id>
+```
+
+The gauges and counters exported through OpenTelemetry are the
+primary observability surface; raw PostgreSQL and Redis access is
+for last-resort triage.
diff --git a/rtmanager/docs/runtime.md b/rtmanager/docs/runtime.md
new file mode 100644
index 0000000..1427df7
--- /dev/null
+++ b/rtmanager/docs/runtime.md
@@ -0,0 +1,309 @@
+# Runtime and Components
+
+The diagram below focuses on the deployed `galaxy/rtmanager` process
+and its runtime dependencies. The current-state contract for every
+listener, worker, and adapter lives in [`../README.md`](../README.md);
+this document is the navigation aid that points at the right code path
+and the right design-rationale record.
+ +```mermaid +flowchart LR + subgraph Clients + GM["Game Master"] + Admin["Admin Service"] + Lobby["Game Lobby"] + end + + subgraph RTM["Runtime Manager process"] + InternalHTTP["Internal HTTP listener\n:8096 /healthz /readyz + REST"] + StartJobs["startjobsconsumer"] + StopJobs["stopjobsconsumer"] + DockerEvents["dockerevents listener"] + HealthProbe["healthprobe worker"] + DockerInspect["dockerinspect worker"] + Reconcile["reconcile worker"] + Cleanup["containercleanup worker"] + Services["lifecycle services\n(start, stop, restart, patch, cleanupcontainer)"] + IntentPublisher["notification:intents publisher"] + ResultsPublisher["runtime:job_results publisher"] + HealthPublisher["runtime:health_events publisher"] + Telemetry["Logs, traces, metrics"] + end + + Docker["Docker Daemon"] + Engine["galaxy-game-{game_id} container"] + Postgres["PostgreSQL\nschema rtmanager"] + Redis["Redis\nstreams + leases + offsets"] + LobbyHTTP["Lobby internal HTTP"] + + Lobby -. runtime:start_jobs .-> StartJobs + Lobby -. runtime:stop_jobs .-> StopJobs + GM --> InternalHTTP + Admin --> InternalHTTP + + StartJobs --> Services + StopJobs --> Services + InternalHTTP --> Services + + Services --> Docker + Services --> Postgres + Services --> Redis + Services --> ResultsPublisher + Services --> HealthPublisher + Services --> IntentPublisher + Services -. GET diagnostic .-> LobbyHTTP + + DockerEvents --> Docker + DockerInspect --> Docker + HealthProbe --> Engine + Reconcile --> Docker + Reconcile --> Postgres + Cleanup --> Postgres + Cleanup --> Services + + DockerEvents --> HealthPublisher + DockerInspect --> HealthPublisher + HealthProbe --> HealthPublisher + + HealthPublisher --> Redis + ResultsPublisher --> Redis + IntentPublisher --> Redis + + StartJobs --> Redis + StopJobs --> Redis + InternalHTTP --> Postgres + + Docker -->|create / start / stop / rm| Engine + Engine -. bind mount .- StateDir["host:\n/{game_id}"] + + InternalHTTP --> Telemetry + Services --> Telemetry + StartJobs --> Telemetry + StopJobs --> Telemetry + DockerEvents --> Telemetry + HealthProbe --> Telemetry + DockerInspect --> Telemetry + Reconcile --> Telemetry + Cleanup --> Telemetry +``` + +Notes: + +- `cmd/rtmanager` refuses startup when PostgreSQL is unreachable, when + goose migrations fail, when Redis ping fails, when the Docker daemon + ping fails, or when the configured Docker network is missing. Lobby + reachability is **not** verified at boot — the start service's + diagnostic `GET /api/v1/internal/games/{game_id}` call is a no-op + outside of debug logging + ([`services.md` §7](services.md)). +- The reconciler runs **synchronously** once on startup before + `app.App.Run` registers any other component, then re-runs + periodically as a regular `Component`. The synchronous pass is the + reason why orphaned containers from a prior process can never be + observed by the events listener with no PG record + ([`workers.md` §17](workers.md)). +- A single internal HTTP listener exposes both probes + (`/healthz`, `/readyz`) and the trusted REST surface for Game Master + and Admin Service. There is no public listener — RTM does not face + end users. 
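+
+The second note above compresses into roughly the following shape: a
+hypothetical condensation of `internal/app/runtime.go`, with exact
+signatures assumed rather than copied from the code.
+
+```go
+// runProcess wires dependencies, runs the synchronous reconcile pass,
+// and only then hands the periodic components to the supervisor.
+func runProcess(ctx context.Context, cfg config.Config) error {
+	wiring, err := newWiring(ctx, cfg) // boot checks: PG, migrations, Redis, Docker, network
+	if err != nil {
+		return fmt.Errorf("wiring: %w", err)
+	}
+	defer wiring.Close()
+
+	// Blocking startup pass: drift left by a prior process is resolved
+	// before any listener or worker component starts.
+	if err := wiring.Reconciler.ReconcileNow(ctx); err != nil {
+		return fmt.Errorf("initial reconcile: %w", err)
+	}
+
+	application := app.New(wiring.Components()...)
+	return application.Run(ctx)
+}
+```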
+ +## Listeners + +| Listener | Default addr | Purpose | +| --- | --- | --- | +| Internal HTTP | `:8096` | Probes (`/healthz`, `/readyz`) plus the trusted REST surface for `Game Master` and `Admin Service` | + +Shared listener defaults from `RTMANAGER_INTERNAL_HTTP_*`: + +- read timeout: `5s` +- write timeout: `15s` +- idle timeout: `60s` + +The listener is unauthenticated and assumes a trusted network segment. +The `X-Galaxy-Caller` request header carries an optional caller +identity (`gm` or `admin`) that the handler records as +`operation_log.op_source` +([`services.md` §18](services.md)). + +Probe routes: + +- `GET /healthz` — process liveness; returns `{"status":"ok"}` while + the listener is up. +- `GET /readyz` — live-pings PostgreSQL primary, Redis master, and the + Docker daemon, then asserts the configured Docker network exists. + Returns `{"status":"ready"}` only when every check passes; otherwise + returns `503` with the canonical error envelope. + +## Background Workers + +Every worker runs as an `app.Component` and is registered in the +order below by [`internal/app/runtime.go`](../internal/app/runtime.go). + +| Worker | Source | Trigger | Function | +| --- | --- | --- | --- | +| Start jobs consumer | [`internal/worker/startjobsconsumer`](../internal/worker/startjobsconsumer) | Redis `XREAD runtime:start_jobs` | Decodes `{game_id, image_ref, requested_at_ms}` and invokes `startruntime.Service`; publishes the outcome to `runtime:job_results` | +| Stop jobs consumer | [`internal/worker/stopjobsconsumer`](../internal/worker/stopjobsconsumer) | Redis `XREAD runtime:stop_jobs` | Decodes `{game_id, reason, requested_at_ms}` and invokes `stopruntime.Service`; publishes the outcome to `runtime:job_results` | +| Docker events listener | [`internal/worker/dockerevents`](../internal/worker/dockerevents) | Docker `/events` API filtered by `com.galaxy.owner=rtmanager` | Emits `runtime:health_events` for `container_exited`, `container_oom`, `container_disappeared`. 
Reconnects on transport errors with a fixed 5s backoff ([`workers.md` §7](workers.md)) | +| Health probe worker | [`internal/worker/healthprobe`](../internal/worker/healthprobe) | Periodic `RTMANAGER_PROBE_INTERVAL` | `GET {engine_endpoint}/healthz` for every running runtime; in-memory hysteresis emits `probe_failed` after `RTMANAGER_PROBE_FAILURES_THRESHOLD` consecutive failures and `probe_recovered` on the first success thereafter ([`workers.md` §5–§6](workers.md)) | +| Docker inspect worker | [`internal/worker/dockerinspect`](../internal/worker/dockerinspect) | Periodic `RTMANAGER_INSPECT_INTERVAL` | Calls `InspectContainer` for every running runtime; emits `inspect_unhealthy` on `RestartCount` growth, unexpected status, or Docker `HEALTHCHECK=unhealthy` | +| Reconciler | [`internal/worker/reconcile`](../internal/worker/reconcile) | Synchronous startup pass + periodic `RTMANAGER_RECONCILE_INTERVAL` | Adopts unrecorded containers (`reconcile_adopt`), disposes records whose container vanished (`reconcile_dispose`), records observed exits (`observed_exited`); every mutation runs under the per-game lease ([`workers.md` §14–§15](workers.md)) | +| Container cleanup | [`internal/worker/containercleanup`](../internal/worker/containercleanup) | Periodic `RTMANAGER_CLEANUP_INTERVAL` | Lists `runtime_records` rows with `status=stopped AND last_op_at < now - retention`, delegates to `cleanupcontainer.Service` per game ([`workers.md` §19](workers.md)) | + +The events listener and the inspect worker do **not** emit +`container_started` — that event is owned by the start service +([`workers.md` §1](workers.md)). The events listener and the inspect +worker also do not emit `container_disappeared` autonomously when a +record is missing or stale; the conditional emission rules live in +[`workers.md` §2](workers.md) and [`§4`](workers.md). + +## Lifecycle Services + +The five lifecycle services are pure orchestrators called from both +the stream consumers and the REST handlers. Each service owns the +per-game lease for the duration of its operation. 
+ +| Service | Source | Triggers | Failure envelope | +| --- | --- | --- | --- | +| `startruntime` | [`internal/service/startruntime`](../internal/service/startruntime) | `runtime:start_jobs`, `POST /api/v1/internal/runtimes/{id}/start` | `start_config_invalid`, `image_pull_failed`, `container_start_failed`, `conflict`, `service_unavailable`, `internal_error` ([`services.md` §4](services.md)) | +| `stopruntime` | [`internal/service/stopruntime`](../internal/service/stopruntime) | `runtime:stop_jobs`, `POST /api/v1/internal/runtimes/{id}/stop` | `conflict`, `service_unavailable`, `internal_error`, `not_found` ([`services.md` §17](services.md)) | +| `restartruntime` | [`internal/service/restartruntime`](../internal/service/restartruntime) | `POST /api/v1/internal/runtimes/{id}/restart` | inherited from inner stop / start; lease covers both inner ops ([`services.md` §12, §17](services.md)) | +| `patchruntime` | [`internal/service/patchruntime`](../internal/service/patchruntime) | `POST /api/v1/internal/runtimes/{id}/patch` | `image_ref_not_semver`, `semver_patch_only`, plus inherited start/stop codes ([`services.md` §14, §17](services.md)) | +| `cleanupcontainer` | [`internal/service/cleanupcontainer`](../internal/service/cleanupcontainer) | `DELETE /api/v1/internal/runtimes/{id}/container`, periodic cleanup worker | `not_found`, `conflict`, `service_unavailable`, `internal_error` ([`services.md` §17](services.md)) | + +All services share three behaviours captured in +[`services.md`](services.md): + +- the per-game Redis lease (`rtmanager:game_lease:{game_id}`, + TTL `RTMANAGER_GAME_LEASE_TTL_SECONDS`) is acquired by the service, + not by the caller — which keeps consumer and REST callers symmetric + ([`services.md` §1](services.md)); +- the canonical `Result` shape (`Outcome`, `ErrorCode`, `Record`, + `ContainerID`, `EngineEndpoint`) is what consumers and REST + handlers translate into job_results / HTTP responses + ([`services.md` §3](services.md)); +- failures pass through one `operation_log` write before returning, + and three of the failure codes (`start_config_invalid`, + `image_pull_failed`, `container_start_failed`) also publish a + `runtime.*` admin notification intent + ([`services.md` §4](services.md)). + +## Synchronous Upstream Client + +| Client | Endpoint | Failure mapping | +| --- | --- | --- | +| `Game Lobby` internal | `GET {RTMANAGER_LOBBY_INTERNAL_BASE_URL}/api/v1/internal/games/{game_id}` | Diagnostic-only in v1; the start service ignores the body and absorbs network failures with a debug log. Decision: [`services.md` §7](services.md) | + +Lobby's outbound transport is the only synchronous client RTM holds. +Every other interaction (Notification Service, Game Master, Admin +Service) crosses an asynchronous boundary or is initiated by the peer. + +## Stream Offsets + +Each consumer persists its position under a fixed label so process +restart preserves stream progress. + +| Stream | Offset key | Block timeout env | +| --- | --- | --- | +| `runtime:start_jobs` | `rtmanager:stream_offsets:startjobs` | `RTMANAGER_STREAM_BLOCK_TIMEOUT` | +| `runtime:stop_jobs` | `rtmanager:stream_offsets:stopjobs` | `RTMANAGER_STREAM_BLOCK_TIMEOUT` | + +The labels `startjobs` and `stopjobs` are stable identifiers — they +are decoupled from the underlying stream key. An operator who renames +a stream via `RTMANAGER_REDIS_START_JOBS_STREAM` / +`RTMANAGER_REDIS_STOP_JOBS_STREAM` does not lose the persisted offset. +Decision: [`workers.md` §9](workers.md). 
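+
+The port behind this persistence is deliberately small; a sketch of
+its shape, assuming method names that may differ from the real
+`ports.StreamOffsetStore`:
+
+```go
+// StreamOffsetStore persists consumer progress per stable label.
+type StreamOffsetStore interface {
+	// Load returns the last saved stream entry id for the label, or an
+	// empty string when nothing has been persisted yet.
+	Load(ctx context.Context, label string) (string, error)
+	// Save stores the entry id under rtmanager:stream_offsets:{label}.
+	Save(ctx context.Context, label string, entryID string) error
+}
+```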
+ +The `runtime:job_results`, `runtime:health_events`, and +`notification:intents` streams are outbound; RTM does not consume them +itself. + +## Configuration Groups + +The full env-var list with defaults lives in +[`../README.md` §Configuration](../README.md). The groups below +summarise the structure: + +- **Required** — `RTMANAGER_INTERNAL_HTTP_ADDR`, + `RTMANAGER_POSTGRES_PRIMARY_DSN`, `RTMANAGER_REDIS_MASTER_ADDR`, + `RTMANAGER_REDIS_PASSWORD`, `RTMANAGER_DOCKER_HOST`, + `RTMANAGER_DOCKER_NETWORK`, `RTMANAGER_GAME_STATE_ROOT`. +- **Listener** — `RTMANAGER_INTERNAL_HTTP_*` timeouts. +- **Docker** — `RTMANAGER_DOCKER_HOST`, `RTMANAGER_DOCKER_API_VERSION`, + `RTMANAGER_DOCKER_NETWORK`, `RTMANAGER_DOCKER_LOG_DRIVER`, + `RTMANAGER_DOCKER_LOG_OPTS`, `RTMANAGER_IMAGE_PULL_POLICY`. +- **Container defaults** — `RTMANAGER_DEFAULT_CPU_QUOTA`, + `RTMANAGER_DEFAULT_MEMORY`, `RTMANAGER_DEFAULT_PIDS_LIMIT`, + `RTMANAGER_CONTAINER_STOP_TIMEOUT_SECONDS`, + `RTMANAGER_CONTAINER_RETENTION_DAYS`, + `RTMANAGER_ENGINE_STATE_MOUNT_PATH`, + `RTMANAGER_ENGINE_STATE_ENV_NAME`, + `RTMANAGER_GAME_STATE_DIR_MODE`, + `RTMANAGER_GAME_STATE_OWNER_UID`, + `RTMANAGER_GAME_STATE_OWNER_GID`. +- **PostgreSQL connectivity** — `RTMANAGER_POSTGRES_PRIMARY_DSN`, + `RTMANAGER_POSTGRES_REPLICA_DSNS`, + `RTMANAGER_POSTGRES_OPERATION_TIMEOUT`, + `RTMANAGER_POSTGRES_MAX_OPEN_CONNS`, + `RTMANAGER_POSTGRES_MAX_IDLE_CONNS`, + `RTMANAGER_POSTGRES_CONN_MAX_LIFETIME`. +- **Redis connectivity** — `RTMANAGER_REDIS_MASTER_ADDR`, + `RTMANAGER_REDIS_REPLICA_ADDRS`, `RTMANAGER_REDIS_PASSWORD`, + `RTMANAGER_REDIS_DB`, `RTMANAGER_REDIS_OPERATION_TIMEOUT`. +- **Streams** — `RTMANAGER_REDIS_START_JOBS_STREAM`, + `RTMANAGER_REDIS_STOP_JOBS_STREAM`, + `RTMANAGER_REDIS_JOB_RESULTS_STREAM`, + `RTMANAGER_REDIS_HEALTH_EVENTS_STREAM`, + `RTMANAGER_NOTIFICATION_INTENTS_STREAM`, + `RTMANAGER_STREAM_BLOCK_TIMEOUT`. +- **Health monitoring** — `RTMANAGER_INSPECT_INTERVAL`, + `RTMANAGER_PROBE_INTERVAL`, `RTMANAGER_PROBE_TIMEOUT`, + `RTMANAGER_PROBE_FAILURES_THRESHOLD`. +- **Reconciler / cleanup** — `RTMANAGER_RECONCILE_INTERVAL`, + `RTMANAGER_CLEANUP_INTERVAL`. +- **Coordination** — `RTMANAGER_GAME_LEASE_TTL_SECONDS`. +- **Lobby internal client** — `RTMANAGER_LOBBY_INTERNAL_BASE_URL`, + `RTMANAGER_LOBBY_INTERNAL_TIMEOUT`. +- **Process and logging** — `RTMANAGER_LOG_LEVEL`, + `RTMANAGER_SHUTDOWN_TIMEOUT`. +- **Telemetry** — standard `OTEL_*`. + +## Runtime Notes + +- **Single-instance v1.** Multi-instance Runtime Manager with Redis + Streams consumer groups is explicitly out of scope for the current + iteration. The per-game lease serialises operations on one game + across the consumer + REST entry points; cross-instance + coordination is deferred until a real workload demands it. +- **Lease semantics.** `rtmanager:game_lease:{game_id}` is + `SET ... NX PX ` with TTL `RTMANAGER_GAME_LEASE_TTL_SECONDS` + (default `60s`). The lease is **not renewed mid-operation** in v1; + long pulls of multi-GB images can therefore expire the lease + before the operation finishes — the trade-off is documented in + [`services.md` §1](services.md). The reconciler honours the same + lease around every drift mutation + ([`workers.md` §14](workers.md)). +- **Operation log is the source of truth.** Every lifecycle and + reconcile mutation appends one row to `rtmanager.operation_log`. 
+ The `runtime:health_events` stream and the `notification:intents` + emissions are best-effort — a publish failure logs at `Error` and + proceeds, never rolling back the recorded operation + ([`workers.md` §8](workers.md)). +- **In-memory probe hysteresis.** The active HTTP probe keeps + per-game `consecutiveFailures` and `failurePublished` counters in a + mutex-guarded map. State is non-persistent: a process restart that + loses the counters re-establishes hysteresis from scratch, and + state for a game that transitions through `stopped → running` is + pruned at the start of every probe tick + ([`workers.md` §5](workers.md)). +- **Pull policy fallbacks.** `RTMANAGER_IMAGE_PULL_POLICY` accepts + `if_missing` (default), `always`, and `never`. Image labels + (`com.galaxy.cpu_quota`, `com.galaxy.memory`, + `com.galaxy.pids_limit`) drive resource limits when present; the + matching `RTMANAGER_DEFAULT_*` env vars supply the fallback when a + label is absent or unparseable. Producers never pass limits. +- **State directory ownership.** RTM creates per-game state + directories under `RTMANAGER_GAME_STATE_ROOT` with the configured + mode and uid/gid, but **never deletes them**. Removing the directory + is operator domain (backup tooling, a future Admin Service + workflow). A cleanup that removes the container leaves the + directory intact. diff --git a/rtmanager/docs/services.md b/rtmanager/docs/services.md new file mode 100644 index 0000000..83e9158 --- /dev/null +++ b/rtmanager/docs/services.md @@ -0,0 +1,443 @@ +# Lifecycle Services + +This document explains the design of the five lifecycle services +(`startruntime`, `stopruntime`, `restartruntime`, `patchruntime`, +`cleanupcontainer`) under [`../internal/service/`](../internal/service) +plus the per-handler REST glue under +[`../internal/api/internalhttp/`](../internal/api/internalhttp). + +The current-state behaviour (lifecycle steps, failure tables, the +per-game lease semantics, the wire contracts) lives in +[`../README.md`](../README.md), the OpenAPI spec at +[`../api/internal-openapi.yaml`](../api/internal-openapi.yaml), and the +AsyncAPI spec at +[`../api/runtime-jobs-asyncapi.yaml`](../api/runtime-jobs-asyncapi.yaml). +This file records the *why*. + +## 1. Per-game lease lives at the service layer + +Every lifecycle service acquires `rtmanager:game_lease:{game_id}` via +[`ports.GameLeaseStore`](../internal/ports/gamelease.go) before doing +any work, and releases it on the way out: + +- the lease primitive serialises operations on a single game across + every entry point (stream consumers and REST handlers); +- holding the lease at the service layer keeps the consumer / REST + callers symmetric — neither acquires the lease itself, both call + the service the same way; +- the Redis-backed adapter + ([`../internal/adapters/redisstate/gamelease/store.go`](../internal/adapters/redisstate/gamelease/store.go)) + uses `SET NX PX` on acquire, Lua compare-and-delete on release; a + release whose caller-supplied token no longer matches is a silent + no-op. + +The lease key shape is `rtmanager:game_lease:{base64url(game_id)}` so +opaque game ids may contain any characters without leaking through +the key syntax. + +The lease TTL is `RTMANAGER_GAME_LEASE_TTL_SECONDS` (default `60s`) +and is **not renewed mid-operation** in v1. A multi-GB image pull can +theoretically expire the lease before the start service finishes; +operators see this as a `reconcile_adopt` event later because the +container is created with the standard owner labels. 
A renewal helper +is deliberately deferred until a workload makes it necessary. + +The reconciler ([`workers.md`](workers.md) §4) honours the same lease +around every drift mutation, which closes the +restart-vs-`reconcile_dispose` race documented in §6 below. + +## 2. Health-events publisher lands with the start service + +The start service publishes `container_started` after `docker run` +returns; the events listener intentionally does **not** duplicate +the event ([`workers.md`](workers.md) §1). Centralising the publisher +on the start service avoids a "who emits what" ambiguity and lets the +publisher be a thin port wrapper rather than a worker-specific +helper. + +The publisher port lives next to the snapshot-upsert rule +([`adapters.md`](adapters.md) §8): one Publish call updates both +surfaces. + +## 3. `Result`-shaped contract + +`Service.Handle` returns `(Result, error)`. The Go-level `error` is +reserved for system-level / programmer faults (nil context, nil +service). All business outcomes flow through `Result`: + +- `Outcome=success`, `ErrorCode=""` — fresh start succeeded; +- `Outcome=success`, `ErrorCode="replay_no_op"` — idempotent replay; +- `Outcome=failure`, `ErrorCode` set — business failure + (`start_config_invalid` / `image_pull_failed` / + `container_start_failed` / `conflict` / `service_unavailable` / + `internal_error`). + +The stream consumer uses `Outcome` and `ErrorCode` to populate +`runtime:job_results` directly; the REST handler maps `Outcome=failure` +plus `ErrorCode` to the matching HTTP status. Both callers are simpler +with this contract than with an `errors.Is`-driven sentinel taxonomy. + +`ports.JobResult` and the two `JobOutcome*` string constants live in +the ports package next to `JobResultPublisher` so the wire shape is +defined exactly once. The constants are intentionally not aliases of +`operation.Outcome` — the audit-log enum is allowed to grow without +breaking the wire format. + +## 4. Start service failure-mode mapping + +| Failure | Error code | Notification intent | +| --- | --- | --- | +| Invalid input (empty fields, unknown op_source) | `start_config_invalid` | `runtime.start_config_invalid` | +| Lease busy | `conflict` | — | +| Existing record running with a different image_ref | `conflict` | — | +| Get returns a non-NotFound transport error | `internal_error` | — | +| `image_ref` shape rejected by `distribution/reference` | `start_config_invalid` | `runtime.start_config_invalid` | +| `EnsureNetwork` returns `ErrNetworkMissing` | `start_config_invalid` | `runtime.start_config_invalid` | +| `EnsureNetwork` returns any other error | `service_unavailable` | — | +| `PullImage` failure | `image_pull_failed` | `runtime.image_pull_failed` | +| `InspectImage` failure | `image_pull_failed` | `runtime.image_pull_failed` | +| `prepareStateDir` failure | `start_config_invalid` | `runtime.start_config_invalid` | +| `Run` failure | `container_start_failed` | `runtime.container_start_failed` | +| `Upsert` failure after successful Run | `container_start_failed` | `runtime.container_start_failed` | + +Three error codes do **not** raise an admin notification: `conflict`, +`service_unavailable`, and `internal_error` are operational classes +(another caller is in flight, a dependency is down, an unclassified +fault) where the corrective action is not a configuration change. The +operator already sees them through telemetry and structured logs; an +email per occurrence would be noise. + +## 5. 
Upsert-after-Run rollback + +A `Run` that succeeded but whose `Upsert` failed leaves a running +container with no PG record. The service issues a best-effort +`docker.Remove(containerID)` in a fresh `context.Background()` (the +request context may already be cancelled) before recording the failure. +A Remove failure is logged but not propagated; the reconciler adopts +surviving orphans on its periodic pass. + +The Docker adapter already removes the container when `Run` itself +returns an error after a successful `ContainerCreate` ([`adapters.md`](adapters.md) §3). +The service-layer rollback covers the additional post-`Run` Upsert +failure path. + +## 6. Pre-existing record handling + +Only `status=running` + same `image_ref` is a `replay_no_op`. +`running` + a different `image_ref` returns `failure / conflict` (use +`patch` to change the image of a running container). + +Anything else (`stopped`, `removed`, missing record) proceeds with a +fresh start that ends in `Upsert`. `Upsert` overwrites verbatim and is +not bound by the transitions table, so installing a `running` record +over a `removed` row is permitted — the `removed` terminus rule lives +in `runtime.AllowedTransitions` (which guards `UpdateStatus`), not in +`Upsert`. + +`created_at` is preserved across re-starts: the start service reuses +`existing.CreatedAt` when the record was found, so the +"first time RTM saw the game" semantics from +[`postgres-migration.md`](postgres-migration.md) §9 hold even when the +start path goes through `Upsert` rather than through the runtime +adapter's `INSERT ... ON CONFLICT DO UPDATE` EXCLUDED list. + +A residual `galaxy-game-{game_id}` container left over from a previous +start that was stopped but never cleaned up will fail at `docker run` +with a name conflict. The service surfaces that as +`container_start_failed`; cleanup plus the reconciler is the standard +remedy. A pre-emptive Remove inside the start service was rejected +because it would silently undo manual operator inspection on stopped +containers. + +## 7. `LobbyInternalClient.GetGame` is best-effort + +The fetch happens after the lease is acquired and before the Docker +work, with the configured `RTMANAGER_LOBBY_INTERNAL_TIMEOUT`. +`ErrLobbyUnavailable` and `ErrLobbyGameNotFound` are logged at +`debug`; the start operation continues either way. The fetched +`Status` and `TargetEngineVersion` enrich logs only — the start +envelope already carries the only required field (`image_ref`), and +the port docstring fixes the recoverable-failure contract. + +## 8. `image_ref` validation + +Validation uses `github.com/distribution/reference.ParseNormalizedNamed` +before any Docker round-trip. Rejected shapes surface as +`start_config_invalid` plus a `runtime.start_config_invalid` intent. +Daemon-side rejections after a valid parse (manifest unknown, +authentication required) surface as `image_pull_failed` plus a +`runtime.image_pull_failed` intent. The split keeps operator-actionable +configuration mistakes distinct from registry-side failures. + +## 9. State-directory preparer is overrideable + +`Dependencies.PrepareStateDir` is a `func(gameID string) (string, error)` +injection point that defaults to `os.MkdirAll` + `os.Chmod` + +`os.Chown` against `RTMANAGER_GAME_STATE_ROOT`. Tests override it to +point at a `t.TempDir()`-style fake without exercising the real +filesystem permissions (which require either matching uid/gid or +root). 
This is a deliberate non-port abstraction: the start service +does no other filesystem work and the cost of a new port for one +helper is not worth the indirection. + +## 10. Container env: both `GAME_STATE_PATH` and `STORAGE_PATH` + +Both names are accepted by the v1 engine. The start service always +sets both; the configured `RTMANAGER_ENGINE_STATE_ENV_NAME` controls +the primary. When the operator overrides the primary to `STORAGE_PATH`, +the deduplicating map collapses the two entries into one. + +## 11. Wiring layer construction + +`internal/app/wiring.go` is the single point that builds every +production store, adapter, and service from `config.Config`. The +struct exposes typed fields so handlers and workers can grab the +singletons without re-wiring; an `addCloser` slice releases adapter +resources (currently the Lobby HTTP client's idle-connection pool) at +runtime shutdown. The `runtimeRecordsProbe` adapter installed during +construction registers the `rtmanager.runtime_records_by_status` +gauge documented in [`../README.md` §Observability](../README.md). + +The persistence-only `CountByStatus` method on the `runtimerecordstore` +adapter is **not** part of `ports.RuntimeRecordStore` because it is +only used by the gauge probe; widening the port for one caller would +force every adapter and test fake to grow with no benefit. The adapter +exposes it directly and the wiring composes a concrete-typed wrapper. + +## 12. Shared lease across composed operations (restart, patch) + +Restart and patch must hold the lease across the inner +`stop → docker rm → start` sequence, otherwise a concurrent stop or +restart could observe a half-recreated runtime. + +`startruntime.Service` and `stopruntime.Service` therefore expose a +second public method: + +```go +// Run executes the lifecycle assuming the per-game lease is already +// held by the caller. Reserved for orchestrator services that compose +// stop or start with another operation under a single outer lease. +// External callers must use Handle. +func (service *Service) Run(ctx context.Context, input Input) (Result, error) +``` + +`Handle` acquires the lease, defers its release, and calls `Run`. +Restart and patch acquire the outer lease themselves and call `Run` +on the inner services. The inner services record their own +`operation_log` entries, telemetry counters, health events, and admin +notification intents identically to a top-level `Handle`. + +A typed `LeaseTicket` parameter (a small internal-package zero-size +struct that only the lease store can construct) was considered and +rejected for v1: only sister services in `internal/service/` ever call +`Run`, the docstring is loud about the precondition, and the pattern +can be tightened later without breaking the public surface that +consumers and handlers consume. + +## 13. Correlation id on `source_ref` + +The outer restart and patch services reuse the existing +`Input.SourceRef` as a correlation key: + +- when `Input.SourceRef` is non-empty (REST request id, stream entry + id), all three entries — outer restart / patch + inner stop + + inner start — share that value; +- when empty, the outer service generates a 32-byte base64url string + via the same `NewToken` generator that produces lease tokens, and + uses it as the correlation key for all three entries. + +The outer entry's `source_ref` keeps its dual semantics: actor ref +when the caller supplied one, generated correlation id otherwise. 
Pure +top-level operations (caller invokes start, stop, or cleanup directly) +keep the original meaning. Composed operations (restart, patch) use +the same value in three places to make audit queries trivial. + +This is not the cleanest end-state — a dedicated `correlation_id` +column would carry the link without ambiguity — but it is the smallest +change that does not touch the schema. A future stage that adds the +column can rename the field and clear up the dual role in one move. + +## 14. Semver validation for patch + +`internal/service/patchruntime/semver.go` enforces the +patch-precondition (current and new `image_ref` parse as semver, share +major and minor): + +- `extractSemverTag(imageRef)` parses with + `github.com/distribution/reference.ParseNormalizedNamed`, casts to + `reference.NamedTagged`, then validates the tag with + `golang.org/x/mod/semver.IsValid` (after prepending `v` when the tag + omits it). Failures map to `image_ref_not_semver`; +- `samePatchSeries(currentSemver, newSemver)` compares + `semver.MajorMinor` of the two canonical strings; mismatch maps to + `semver_patch_only`. + +`golang.org/x/mod` is a direct require to avoid a transitive-version +surprise. `github.com/Masterminds/semver/v3` (also in the module +graph) was rejected to avoid two semver libraries on disk for the +same job; `x/mod/semver` already covers Lobby. A hand-rolled +`vMajor.Minor.Patch` parser was rejected as premature. + +Pre-checks run before any inner stop or `docker rm`: a rejected patch +never disturbs the running runtime. Patch with +`new_image_ref == current_image_ref` proceeds through the recreate +flow unchanged (not `replay_no_op`: the inner start still runs); the +outer `op_kind=patch` entry records the no-op patch for audit. + +## 15. `StopReason` placement + +The reason enum mirrors `lobby/internal/ports/runtimemanager.go` +verbatim and lives at `internal/service/stopruntime/stopreason.go`. +The stream consumer and the REST handler import `stopruntime` for +the same enum the service requires. + +Inner stop calls from restart and patch always pass +`StopReasonAdminRequest`. Restart and patch are platform-internal +recreate flows; `admin_request` is the closest semantic match in the +five-value vocabulary. The actor that originated the recreate (REST +request id, admin user id) flows through the `op_source` / +`source_ref` pair, not through the stop reason. + +## 16. Error code centralisation + +`internal/service/startruntime/errors.go` is the canonical home for +the stable error codes returned in `Result.ErrorCode`. The other four +services (`stopruntime`, `restartruntime`, `patchruntime`, +`cleanupcontainer`) import the constants from `startruntime` rather +than redeclaring them. The package comment of `errors.go` flags the +shared usage so future readers do not chase per-service declarations. + +`start_config_invalid` is reserved for start because every start +validation failure also raises an admin notification intent. The +other services use the more general `invalid_request` for input +validation failures. + +## 17. Stop / restart / patch / cleanup failure tables + +### `stopruntime` + +| Failure | Error code | Notes | +| --- | --- | --- | +| Invalid input | `invalid_request` | No notification intent. | +| Lease busy | `conflict` | Lease release skipped because acquire returned false. | +| Lease error | `service_unavailable` | Redis unreachable. | +| Record missing | `not_found` | | +| Status `stopped` / `removed` | success / `replay_no_op` | Idempotent re-stop. 
| +| `docker.Stop` returns `ErrContainerNotFound` | success | Record transitions `running → removed`, `container_disappeared` health event published. | +| `docker.Stop` other error | `service_unavailable` | Record untouched; caller may retry. | +| `UpdateStatus` returns `ErrConflict` (CAS race) | success / `replay_no_op` | The desired state was reached by another path (reconciler / restart). | +| `UpdateStatus` returns `ErrNotFound` | `not_found` | Record vanished mid-stop. | +| `UpdateStatus` other error | `internal_error` | | + +### `restartruntime` + +| Failure | Error code | Notes | +| --- | --- | --- | +| Invalid input | `invalid_request` | | +| Lease busy / lease error | `conflict` / `service_unavailable` | Same as stop. | +| Record missing | `not_found` | | +| Status `removed` | `conflict` | Image_ref may be empty; restart cannot proceed. | +| Inner stop fails | inner `ErrorCode` | Outer `ErrorMessage` prefixes "inner stop failed: ". | +| `docker.Remove` fails | `service_unavailable` | Inner stop already moved record to `stopped`; runtime stays in `stopped`. Admin must call `cleanup_container` before retrying restart. | +| Inner start fails | inner `ErrorCode` | Outer `ErrorMessage` prefixes "inner start failed: ". | + +The post-stop `docker rm` failure is the only path that leaves the +runtime in a state from which the same operation cannot recover by +itself: a residual `galaxy-game-{game_id}` container blocks a fresh +inner start (the start service surfaces this as +`container_start_failed`). The runbook entry — "call cleanup, then +restart again" — is the standard remedy. + +### `patchruntime` + +| Failure | Error code | Notes | +| --- | --- | --- | +| Invalid input | `invalid_request` | | +| Lease busy / lease error | `conflict` / `service_unavailable` | | +| Record missing | `not_found` | | +| Status `removed` | `conflict` | | +| Current `image_ref` not parseable as semver tag | `image_ref_not_semver` | Pre-check; no inner ops fired. | +| New `image_ref` not parseable as semver tag | `image_ref_not_semver` | Pre-check; no inner ops fired. | +| Major / minor mismatch | `semver_patch_only` | Pre-check; no inner ops fired. | +| Inner stop / `docker rm` / inner start fails | inherits inner code | Same propagation as restart. | + +### `cleanupcontainer` + +| Failure | Error code | Notes | +| --- | --- | --- | +| Invalid input | `invalid_request` | | +| Lease busy / lease error | `conflict` / `service_unavailable` | | +| Record missing | `not_found` | | +| Status `removed` | success / `replay_no_op` | | +| Status `running` | `conflict` | Error message: "stop the runtime first". | +| Status `stopped` | proceed | | +| `docker.Remove` returns `ErrContainerNotFound` | success | Adapter swallows not-found into nil. | +| `docker.Remove` other error | `service_unavailable` | Record untouched; caller may retry. | +| `UpdateStatus` returns `ErrConflict` | success / `replay_no_op` | Race with reconciler dispose. | +| `UpdateStatus` returns `ErrNotFound` | `not_found` | | +| `UpdateStatus` other error | `internal_error` | | + +## 18. REST handler conventions + +The internal HTTP handlers under +[`../internal/api/internalhttp/handlers/`](../internal/api/internalhttp/handlers) +follow these rules: + +- **`X-Galaxy-Caller` header.** The optional header carries the + calling service identity (`gm` / `admin`); the handler records the + value as `op_source` in `operation_log` (`gm_rest` / `admin_rest`). 
+ Missing or unknown values default to `admin_rest` because every + audit-log query already filters on the cleanup endpoint + (`op_source ∈ {auto_ttl, admin_rest}`); making the default match + the most-restricted surface keeps existing dashboards correct when + an unconfigured client hits the listener. The header is declared as + a reusable parameter (`components.parameters.XGalaxyCallerHeader`) + in the OpenAPI spec and is referenced from each runtime operation + but not from `/healthz` and `/readyz`. +- **Error code → HTTP status mapping.** One canonical table in + `handlers/common.go`: + + | ErrorCode | HTTP status | + | --- | ---: | + | (success, including `replay_no_op`) | 200 | + | `invalid_request`, `start_config_invalid`, `image_ref_not_semver` | 400 | + | `not_found` | 404 | + | `conflict`, `semver_patch_only` | 409 | + | `service_unavailable`, `docker_unavailable` | 503 | + | `internal_error`, `image_pull_failed`, `container_start_failed` | 500 | + + `image_pull_failed` and `container_start_failed` are operational + failures that originate inside RTM (registry / daemon problems), + not client-side validation issues; they map to `500` so callers + retry through their normal resilience paths instead of treating + the call as a 4xx that must be fixed at the source. + `docker_unavailable` is reserved for future producers; today the + start service emits `service_unavailable` for Docker-daemon + failures. Unknown error codes default to `500`. +- **List and Get bypass the service layer.** `internalListRuntimes` + and `internalGetRuntime` read directly from + `ports.RuntimeRecordStore`. Reads do not produce `operation_log` + rows, do not change Docker state, do not need the per-game lease, + and do not have a stream-side counterpart — none of the lifecycle + service machinery is justified. +- **`RuntimeRecordStore.List(ctx)` returns every record regardless + of status.** A single SELECT ordered by + `(last_op_at DESC, game_id ASC)` — the same direction the + `runtime_records_status_last_op_idx` index supports, so freshly + active games surface first. Pagination is intentionally not + modelled in v1; the working set is bounded by the games tracked + by Lobby. +- **Per-handler service ports use `mockgen`.** The handler layer + depends on five narrow interfaces — one per lifecycle service — + declared in `handlers/services.go`. Production wiring passes the + concrete `*.Service` pointers (each satisfies the + matching interface implicitly); tests pass the mockgen-generated + mocks under `handlers/mocks/`. +- **Conformance test scope.** `internalhttp/conformance_test.go` + drives every documented runtime operation against a real + `internalhttp.Server` whose service deps are deterministic stubs. + The test uses `kin-openapi/routers/legacy.NewRouter`, calls + `openapi3filter.ValidateRequest` and + `openapi3filter.ValidateResponse` so both directions match the + contract. The scope is happy-path only; the failure-path response + shapes are validated by the per-handler tests. 
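+
+The error-code table above reduces to a small lookup. A sketch of what
+`handlers/common.go` might hold, with variable and function names
+assumed rather than taken from the code:
+
+```go
+// statusByErrorCode mirrors the documented mapping; unknown codes fall
+// back to 500 in httpStatusFor.
+var statusByErrorCode = map[string]int{
+	"invalid_request":        http.StatusBadRequest,
+	"start_config_invalid":   http.StatusBadRequest,
+	"image_ref_not_semver":   http.StatusBadRequest,
+	"not_found":              http.StatusNotFound,
+	"conflict":               http.StatusConflict,
+	"semver_patch_only":      http.StatusConflict,
+	"service_unavailable":    http.StatusServiceUnavailable,
+	"docker_unavailable":     http.StatusServiceUnavailable,
+	"internal_error":         http.StatusInternalServerError,
+	"image_pull_failed":      http.StatusInternalServerError,
+	"container_start_failed": http.StatusInternalServerError,
+}
+
+// httpStatusFor maps a service Result error code to an HTTP status.
+func httpStatusFor(errorCode string) int {
+	if errorCode == "" || errorCode == "replay_no_op" {
+		return http.StatusOK // success outcomes, including idempotent replays
+	}
+	if status, ok := statusByErrorCode[errorCode]; ok {
+		return status
+	}
+	return http.StatusInternalServerError
+}
+```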
diff --git a/rtmanager/docs/workers.md b/rtmanager/docs/workers.md new file mode 100644 index 0000000..3ec5605 --- /dev/null +++ b/rtmanager/docs/workers.md @@ -0,0 +1,412 @@ +# Background Workers + +This document explains the design of the seven background workers +under [`../internal/worker/`](../internal/worker): + +- [`startjobsconsumer`](../internal/worker/startjobsconsumer) and + [`stopjobsconsumer`](../internal/worker/stopjobsconsumer) — async + consumers driven by `runtime:start_jobs` / `runtime:stop_jobs`; +- [`dockerevents`](../internal/worker/dockerevents) — Docker `/events` + subscription; +- [`dockerinspect`](../internal/worker/dockerinspect) — periodic + `InspectContainer` worker; +- [`healthprobe`](../internal/worker/healthprobe) — active HTTP + `/healthz` probe; +- [`reconcile`](../internal/worker/reconcile) — startup + periodic + drift reconciliation; +- [`containercleanup`](../internal/worker/containercleanup) — + periodic TTL cleanup. + +The current-state behaviour and configuration surface live in +[`../README.md`](../README.md) (§Runtime Surface, §Health Monitoring, +§Reconciliation), and operational notes are in +[`runtime.md`](runtime.md), [`flows.md`](flows.md), and +[`runbook.md`](runbook.md). This file records the rationale. + +## 1. Single ownership per `event_type` + +The `runtime:health_events` vocabulary is shared across four sources; +each event type is owned by exactly one of them. + +| `event_type` | Owner | +| --- | --- | +| `container_started` | `internal/service/startruntime` | +| `container_exited` | `internal/worker/dockerevents` | +| `container_oom` | `internal/worker/dockerevents` | +| `container_disappeared` | `internal/worker/dockerevents` (external destroy) and `internal/worker/reconcile` (PG-drift) | +| `inspect_unhealthy` | `internal/worker/dockerinspect` | +| `probe_failed` | `internal/worker/healthprobe` | +| `probe_recovered` | `internal/worker/healthprobe` | + +`container_started` is intentionally not duplicated by the events +listener, even though Docker emits a `start` action whenever the start +service runs the container. The start service already publishes the +event with the same wire shape; observing the action in the listener +would produce two entries per real start. + +## 2. `container_disappeared` is conditional on PG state + +The Docker events listener inspects the runtime record before emitting +`container_disappeared` for a `destroy` action. Three suppression rules +apply: + +- record missing → suppress (the destroyed container was never owned + by RTM as a tracked runtime, so no consumer cares); +- record `status != running` → suppress (RTM already finished a stop + or cleanup; the destroy is the expected tail of that operation); +- record `current_container_id != event.ContainerID` → suppress (RTM + swapped to a new container through restart or patch; the destroy is + the expected removal of the prior container id). + +Only a destroy that arrives for a `running` record whose +`current_container_id` still equals the event id is treated as +unexpected. This is the wire-side analogue of the reconciler's +PG-drift check: the reconciler observes "PG=running, no Docker +container" while the events listener observes "Docker says destroy, +PG still says running pointing at this container". Together they cover +both directions of drift. 
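+
+The three rules collapse into one predicate. A sketch, assuming the
+listener has already resolved the record by its game-id label (nil
+when missing); field and constant names are illustrative, not the
+actual implementation:
+
+```go
+// shouldEmitDisappeared returns true only for a destroy that hits a
+// running record still pointing at the destroyed container.
+func shouldEmitDisappeared(record *runtime.RuntimeRecord, destroyedContainerID string) bool {
+	if record == nil {
+		return false // never tracked as a runtime; no consumer cares
+	}
+	if record.Status != runtime.StatusRunning {
+		return false // expected tail of a stop or cleanup RTM already performed
+	}
+	if record.CurrentContainerID != destroyedContainerID {
+		return false // prior container removed by a restart or patch
+	}
+	return true
+}
+```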
+ +A read failure against `runtime_records` is treated conservatively as +"suppress" — the listener cannot tell whether the destroy was external +or RTM-initiated, and over-emitting `container_disappeared` would lead +to a real consumer (`Game Master`) escalating a false positive. + +## 3. `die` with exit code `0` is suppressed + +`docker stop` (and graceful shutdowns via SIGTERM) produces a `die` +event with exit code `0`. The `container_exited` contract guarantees a +non-zero exit; emitting on exit `0` would shower consumers with +normal-stop noise. The listener silently drops the event; the +operation log already records the stop on the caller side. + +## 4. Inspect worker leaves `container_disappeared` to the reconciler + +When `dockerinspect` calls `InspectContainer` and the daemon returns +`ports.ErrContainerNotFound`, the worker logs at `Debug` and skips: + +- the reconciler is the single authority for PG-drift reconciliation. + Adding a third source for `container_disappeared` would risk double + emission and complicate the consumer story; +- inspect ticks every 30 seconds; the reconciler ticks every 5 + minutes. The latency window for "Docker drops the container, RTM + notices" is therefore at most 5 minutes in v1, which is acceptable + for the kinds of drift the reconciler covers (manual `docker rm` + outside RTM, daemon restart with stale records). If a future + requirement tightens the window, promoting the inspect-side + observation to a real `container_disappeared` is a one-line change. + +## 5. Probe hysteresis is in-memory and pruned per tick + +The active probe worker keeps per-game state in a +`map[string]*probeState` guarded by a mutex. Two counters live there: + +- `consecutiveFailures` — incremented on every failed probe, reset on + every success; +- `failurePublished` — prevents repeated `probe_failed` emission while + the failure persists, and triggers a single `probe_recovered` on the + first success after the threshold was crossed. + +The state is non-persistent. RTM is single-instance in v1, and a +process restart that loses the counters merely re-establishes the +hysteresis from scratch — the only consequence is that a probe failure +already in progress at the moment of restart needs another full +threshold of failures to surface. Making the state durable would add a +Redis round-trip to every probe attempt without buying anything that +operators or downstream consumers depend on. + +State pruning happens at the start of every tick. The worker reads the +current running list and removes any state entry whose `game_id` is +not in the list. A game that transitions through stopped → running +again starts fresh; previously-accumulated counters do not bleed into +the new lifecycle. + +## 6. Probe concurrency is bounded by a fixed cap + +Probes inside one tick run in parallel through a buffered-channel +semaphore (`defaultMaxConcurrency = 16`). Three reasons: + +- A single slow engine cannot delay the entire cohort. Sequential + per-game probing would multiply the worst case by `len(records)`, + which is the wrong shape for what is fundamentally a fan-out + observation pattern. +- An unbounded fan-out (one goroutine per record per tick without a + cap) was rejected to avoid pathological CPU and connection bursts + if the running list ever grows beyond what RTM was sized for. 16 + in-flight probes at the default 2s timeout fit a single RTM + instance well within typical OS file-descriptor and TCP + ephemeral-port limits. 
+- The cap is a constant rather than an env var because RTM v1 is + single-instance and the active-game count is bounded by Lobby; a + configurable cap is something we promote to env if a real workload + demands it. + +The same reasoning argues against parallelism in the inspect worker: +inspect calls are cheap (sub-ms in the local Docker socket case) and +serial execution avoids unnecessary concurrency on the daemon socket. + +## 7. Events listener reconnects with fixed backoff + +The Docker daemon's events stream is a long-lived subscription; the +SDK channel terminates on any transport error (daemon restart, socket +hiccup, connection reset). The listener's outer loop handles this by +re-subscribing after a fixed `defaultReconnectBackoff = 5s` wait, +indefinitely while ctx is alive. + +Crashing the process on a transport error was rejected because losing +a few seconds of health observations is a much smaller blast radius +than losing the entire RTM process while the start/stop pipelines are +running. The save-offset case is different: a lost offset replays the +entire backlog and breaks correctness, while a missed health event is +observation-only. + +A subscription error is logged at `Warn` so operators can see the +reconnect activity without it dominating the log volume. + +## 8. Health publisher remains best-effort + +Every emission goes through `ports.HealthEventPublisher.Publish`, the +same surface the start service already uses +([`adapters.md`](adapters.md) §8). A publish failure logs at `Error` +and proceeds; the worker does not retry, does not adjust its in-memory +hysteresis, and does not surface the failure to the caller. The +operation log is the source of truth for runtime state; the event +stream is a best-effort notification surface to consumers. + +## 9. Stream offset labels are stable identifiers + +Both consumers persist their progress through +`ports.StreamOffsetStore` under fixed labels — `startjobs` for the +start-jobs consumer and `stopjobs` for the stop-jobs consumer. The +labels match `rtmanager:stream_offsets:{label}` and stay stable when +the underlying stream key is renamed via +`RTMANAGER_REDIS_START_JOBS_STREAM` / +`RTMANAGER_REDIS_STOP_JOBS_STREAM`, so an operator who points the +consumer at a different stream key does not lose the persisted offset. + +## 10. `OpSource` and `SourceRef` originate at the consumer boundary + +Every consumed envelope is translated into a `Service.Handle` call +with `OpSource = operation.OpSourceLobbyStream`. The opaque per-source +`SourceRef` is the Redis Stream entry id (`message.ID`); the +`operation_log` rows therefore record the originating envelope id, and +restart / patch correlation logic ([`services.md`](services.md) §13) +keeps working when those services are invoked indirectly. + +## 11. Replay-no-op detection lives in the service layer + +The consumer does not detect replays itself. `startruntime.Service` +returns `Outcome=success, ErrorCode=replay_no_op` when the existing +record is already `running` with the same `image_ref`; +`stopruntime.Service` does the same for an already-stopped or +already-removed record. The consumer copies the result fields into +the `RuntimeJobResult` payload verbatim and lets Lobby observe the +replay through `error_code`. + +The wire-shape consequences: + +- `success` + empty `error_code` → fresh start / fresh stop; +- `success` + `error_code=replay_no_op` → idempotent replay. 
For + start, the existing record carries `container_id` and + `engine_endpoint`; for stop on `status=removed`, both fields are + empty strings (the record was nulled by an earlier cleanup) — the + AsyncAPI contract permits empty strings on these required fields; +- `failure` + non-empty `error_code` → the start / stop service + returned a zero `Record`; the consumer publishes empty + `container_id` and `engine_endpoint`. + +## 12. Per-message errors are absorbed; the offset always advances + +The consumer run loop logs and absorbs any decode error, any go-level +service error, and any publish failure; `streamOffsetStore.Save` runs +unconditionally after each handled message. Pinning the offset on a +single transient publish failure was rejected because the durable side +effect (operation_log row, runtime_records mutation, Docker state) has +already happened on the first pass; pinning the offset to retry the +publish would duplicate audit rows for hours until the operator +intervened. + +The exception is `streamOffsetStore.Save` itself: a save failure +returns a wrapped error from `Run`. The component supervisor in +`internal/app/app.go` then exits the process and lets the operator +escalate, because losing the offset would cause every subsequent +restart to re-process every prior envelope. + +## 13. `requested_at_ms` is logged-only + +The AsyncAPI envelopes carry `requested_at_ms` from Lobby. The +consumer parses it (rejecting unparseable values) but only includes +the value in structured logs — the field is "used for diagnostics, not +authoritative" per the contract. The service layer ignores it; the +operation_log uses `service.clock()` for `started_at` / `finished_at` +so Lobby's wall-clock skew never bleeds into RTM persistence. + +## 14. Reconciler: per-game lease around every write + +A `running → removed` mutation that races a restart's inner stop +would clobber the restart's freshly-installed `running` record without +any other guard. The reconciler honours the same per-game lease that +the lifecycle services hold ([`services.md`](services.md) §1). + +The reconciler splits its work into two phases: + +- **Read pass — lockless.** + `docker.List({com.galaxy.owner=rtmanager})` followed by + `RuntimeRecords.ListByStatus(running)`. No lease is taken; both + reads are point-in-time observations of independent systems and a + stale view here only delays a mutation by one tick. +- **Write pass — lease-guarded.** Every drift mutation + (`adoptOne` / `disposeOne` / `observedExitedOne`) acquires the + per-game lease, re-reads the record under the lease, and then + either applies the mutation or returns when state has changed. + A lease conflict (`acquired=false`) is logged at `info` and the + game is silently skipped — the next tick will retry. A lease-store + error is logged at `warn`; the rest of the pass continues. + +The re-read after lease acquisition is intentional: the read pass is +lockless, so by the time the lease is held the runtime record may +have moved. `UpdateStatus` already provides CAS via +`ExpectedFrom + ExpectedContainerID`, but `Upsert` (used for adopt) +does not, so the explicit re-read keeps the three paths uniform and +makes the skip condition obvious in code review. + +## 15. 
Three drift kinds covered by the reconciler + +- `adopt` — Docker reports a container labelled + `com.galaxy.owner=rtmanager` for which RTM has no record; insert a + fresh `runtime_records` row with `op_kind=reconcile_adopt` and never + stop or remove the container (operators may have started it + manually for diagnostics). +- `dispose` — RTM has a `running` record whose container is missing + in Docker; mark `status=removed`, publish + `container_disappeared`, append `op_kind=reconcile_dispose`. +- `observed_exited` — RTM has a `running` record whose container + exists but is in `exited`; mark `status=stopped`, publish + `container_exited` with the observed exit code. This third path + exists because the events listener sees only live events; a + container that died while RTM was offline would otherwise stay + `running` indefinitely. The drift is exposed through + `rtmanager.reconcile_drift{kind=observed_exited}` and through the + `container_exited` health event; no `operation_log` entry is + written because the audit log records explicit RTM operations, not + passive observations of Docker state. + +## 16. `stopped_at = now (reconciler observation time)` + +The `observed_exited` path writes `stopped_at = now`, where `now` is +the reconciler's observation time. The persistence adapter +([`postgres-migration.md`](postgres-migration.md) §8) hard-codes +`stopped_at = now` for the `stopped` destination — there is no +port-level knob for an explicit timestamp, and the reconciler does not +read `State.FinishedAt` from Docker. + +The trade-off: `stopped_at` diverges from the daemon's +`State.FinishedAt` by at most one tick interval (default 5 minutes). +If a downstream consumer ever needs the daemon-observed exit +timestamp, the upgrade path is a one-call extension of +`UpdateStatusInput` with an optional `StoppedAt *time.Time` field; +that change is deferred until a consumer materialises. + +## 17. Synchronous initial pass + periodic Component + +`README §Startup dependencies` step 6 demands "Reconciler runs once +and blocks until done" before background workers start, but +`app.App.Run` starts every registered `Component` concurrently — +component ordering does not translate into start ordering. + +The reconciler exposes a public `ReconcileNow(ctx)` method that the +runtime calls synchronously between `newWiring` and `app.New`. The +same `*Reconciler` is then registered as a `Component`; its `Run` +only ticks (no immediate pass) so the startup work is not duplicated. +The cost is one public method on the worker; the benefit is that the +README invariant holds verbatim and the periodic loop is a textbook +`Component`. + +## 18. Adopt through `Upsert`, race with start is benign + +The adopt path constructs a fresh `runtime.RuntimeRecord` (status +running, container id and image_ref from labels, `started_at` from +`com.galaxy.started_at_ms` or inspect, state path and docker network +from configuration, engine endpoint from the +`http://galaxy-game-{game_id}:8080` rule) and calls +`RuntimeRecords.Upsert`. + +Race scenario: the start service has called `docker.Run` but has not +yet finished its own `Upsert` when the reconciler observes the +container without a record. Both writers eventually arrive at PG with +the same key data — the start service knows the canonical +`image_ref`, but the reconciler reads it from the +`com.galaxy.engine_image_ref` label that the start service itself +wrote. 
The CAS-free overwrite is therefore benign: + +- `created_at` is preserved across upserts by the + `ON CONFLICT DO UPDATE` clause, so the "first time RTM saw this + game" timestamp stays stable regardless of which writer lands last; +- all other fields in this race carry identical values (same + container, same image, same hostname, same state path). + +Under the per-game lease this is doubly safe: the reconciler only +issues `Upsert` while holding the lease, and only after re-reading +the record finds it absent. Concurrent start would block on the same +lease; concurrent stop / restart would have moved the record out of +"absent" by the time the reconciler re-reads. + +## 19. Cleanup worker delegates to the service + +The TTL-cleanup worker is intentionally tiny: it lists +`runtime_records.status='stopped'`, filters in process by +`record.LastOpAt.Before(now - cfg.Container.Retention)`, and calls +`cleanupcontainer.Service.Handle` with `OpSource=auto_ttl` for each +candidate. The service already owns: + +- the per-game lease around the Docker `Remove` call; +- the `running → removed` CAS via `UpdateStatus`; +- the operation_log entry (`op_kind=cleanup_container`, + `op_source=auto_ttl`); +- the telemetry counter and structured log fields. + +In-memory filtering is acceptable in v1 because the cardinality of +`status=stopped` rows is bounded by Lobby's active-game count plus +retention period. The dedicated `(status, last_op_at)` index drives +the underlying `ListByStatus(stopped)` query so the database does +the heavy lifting; the Go-side filter is microseconds-per-row. + +The worker uses a small `Cleaner` interface in its own package rather +than depending on `*cleanupcontainer.Service` directly. This keeps +the worker's tests light — no need to construct Docker, lease, +operation-log, and telemetry doubles just to verify TTL math — while +the production wiring still binds the real service via a compile-time +interface assertion in `internal/app/wiring.go`. + +## 20. Sequential per-game work in reconciler and cleanup + +Both workers process games sequentially within a tick. The +reconciler's mutations are dominated by `Get` + `Upsert` / +`UpdateStatus` round-trips against PG plus an occasional Docker +`InspectContainer`; the cleanup worker's mutations are dominated by +the cleanup service's `docker.Remove` call. Parallelising either +would multiply the load on the Docker daemon socket and the PG pool +without buying anything that v1 cardinality demands. + +## 21. Cross-module test boundary for the consumer integration test + +[`../internal/worker/startjobsconsumer/integration_test.go`](../internal/worker/startjobsconsumer/integration_test.go) +covers the contract roundtrip without importing +`lobby/internal/...`: + +- it XADDs a start envelope in the AsyncAPI wire shape (the same + shape Lobby's `runtimemanager.Publisher` writes); +- it runs the real `startruntime.Service` against in-memory fakes for + the persistence stores, the lease, and the notification / health + publishers, plus a gomock-backed `ports.DockerClient`; +- it lets the real `jobresultspublisher.Publisher` write to + `runtime:job_results`; +- it reads the resulting entry and asserts the symmetric wire shape; +- it then XADDs the same envelope a second time and asserts the + `error_code=replay_no_op` outcome with no further Docker calls. 
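+
+A minimal sketch of the two stream interactions the steps above rely
+on, with everything else (the real service, the in-memory fakes, the
+gomock Docker client) omitted. The start-jobs stream name and the flat
+envelope fields below are illustrative placeholders only; the
+authoritative wire shape comes from the AsyncAPI contract, and only
+`runtime:job_results` is fixed by this document.
+
+```go
+package startjobsconsumer_test
+
+import (
+	"context"
+	"testing"
+
+	"github.com/redis/go-redis/v9"
+)
+
+// publishStartEnvelope XADDs one start envelope. The stream and field
+// names here are assumptions for illustration, not the contract.
+func publishStartEnvelope(ctx context.Context, t *testing.T, rdb *redis.Client, gameID string) {
+	t.Helper()
+	err := rdb.XAdd(ctx, &redis.XAddArgs{
+		Stream: "runtime:start_jobs", // assumed name
+		Values: map[string]any{
+			"game_id":         gameID,
+			"image_ref":       "galaxy/game:1.0.0",
+			"requested_at_ms": "1700000000000",
+		},
+	}).Err()
+	if err != nil {
+		t.Fatalf("xadd start envelope: %v", err)
+	}
+}
+
+// lastJobResult returns the newest entry on runtime:job_results so the
+// test can assert fields such as error_code.
+func lastJobResult(ctx context.Context, t *testing.T, rdb *redis.Client) map[string]any {
+	t.Helper()
+	msgs, err := rdb.XRevRangeN(ctx, "runtime:job_results", "+", "-", 1).Result()
+	if err != nil || len(msgs) == 0 {
+		t.Fatalf("read runtime:job_results: %v (entries=%d)", err, len(msgs))
+	}
+	return msgs[0].Values
+}
+```
+
+The replay assertion is then two `publishStartEnvelope` calls with the
+same envelope, a `lastJobResult` check for `error_code=replay_no_op`,
+and a verification that the Docker mock recorded no further calls.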
+ +The cross-module integration that runs both the real Lobby publisher +and the real Lobby consumer alongside RTM lives at +`integration/lobbyrtm/`, which is the home for inter-service +fixtures. Keeping the in-package test free of `lobby/...` imports +avoids module-internal coupling and keeps `rtmanager`'s test suite +buildable on its own. diff --git a/rtmanager/go.mod b/rtmanager/go.mod index bccce7f..ba68ee9 100644 --- a/rtmanager/go.mod +++ b/rtmanager/go.mod @@ -1,3 +1,132 @@ module galaxy/rtmanager go 1.26.2 + +require ( + galaxy/notificationintent v0.0.0-00010101000000-000000000000 + galaxy/postgres v0.0.0-00010101000000-000000000000 + galaxy/redisconn v0.0.0-00010101000000-000000000000 + github.com/alicebob/miniredis/v2 v2.37.0 + github.com/containerd/errdefs v1.0.0 + github.com/distribution/reference v0.6.0 + github.com/docker/docker v28.5.2+incompatible + github.com/docker/go-units v0.5.0 + github.com/getkin/kin-openapi v0.135.0 + github.com/go-jet/jet/v2 v2.14.1 + github.com/jackc/pgx/v5 v5.9.2 + github.com/redis/go-redis/v9 v9.18.0 + github.com/stretchr/testify v1.11.1 + github.com/testcontainers/testcontainers-go v0.42.0 + github.com/testcontainers/testcontainers-go/modules/postgres v0.42.0 + github.com/testcontainers/testcontainers-go/modules/redis v0.42.0 + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.68.0 + go.opentelemetry.io/otel v1.43.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0 + go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.43.0 + go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.43.0 + go.opentelemetry.io/otel/metric v1.43.0 + go.opentelemetry.io/otel/sdk v1.43.0 + go.opentelemetry.io/otel/sdk/metric v1.43.0 + go.opentelemetry.io/otel/trace v1.43.0 + go.uber.org/mock v0.6.0 + golang.org/x/mod v0.35.0 + gopkg.in/yaml.v3 v3.0.1 +) + +require ( + dario.cat/mergo v1.0.2 // indirect + github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c // indirect + github.com/Microsoft/go-winio v0.6.2 // indirect + github.com/XSAM/otelsql v0.42.0 // indirect + github.com/cenkalti/backoff/v4 v4.3.0 // indirect + github.com/cenkalti/backoff/v5 v5.0.3 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/containerd/errdefs/pkg v0.3.0 // indirect + github.com/containerd/log v0.1.0 // indirect + github.com/containerd/platforms v0.2.1 // indirect + github.com/cpuguy83/dockercfg v0.3.2 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect + github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect + github.com/docker/go-connections v0.7.0 // indirect + github.com/ebitengine/purego v0.10.0 // indirect + github.com/felixge/httpsnoop v1.0.4 // indirect + github.com/go-logr/logr v1.4.3 // indirect + github.com/go-logr/stdr v1.2.2 // indirect + github.com/go-ole/go-ole v1.2.6 // indirect + github.com/go-openapi/jsonpointer v0.21.0 // indirect + github.com/go-openapi/swag v0.23.0 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 // indirect + github.com/jackc/chunkreader/v2 v2.0.1 // indirect + github.com/jackc/pgconn v1.14.3 // indirect + github.com/jackc/pgio v1.0.0 // indirect + github.com/jackc/pgpassfile v1.0.0 // indirect + github.com/jackc/pgproto3/v2 
v2.3.3 // indirect + github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect + github.com/jackc/pgtype v1.14.4 // indirect + github.com/jackc/puddle/v2 v2.2.2 // indirect + github.com/josharian/intern v1.0.0 // indirect + github.com/klauspost/compress v1.18.5 // indirect + github.com/lib/pq v1.10.9 // indirect + github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect + github.com/magiconair/properties v1.8.10 // indirect + github.com/mailru/easyjson v0.7.7 // indirect + github.com/mdelapenya/tlscert v0.2.0 // indirect + github.com/mfridman/interpolate v0.0.2 // indirect + github.com/moby/docker-image-spec v1.3.1 // indirect + github.com/moby/go-archive v0.2.0 // indirect + github.com/moby/moby/api v1.54.2 // indirect + github.com/moby/moby/client v0.4.1 // indirect + github.com/moby/patternmatcher v0.6.1 // indirect + github.com/moby/sys/atomicwriter v0.1.0 // indirect + github.com/moby/sys/sequential v0.6.0 // indirect + github.com/moby/sys/user v0.4.0 // indirect + github.com/moby/sys/userns v0.1.0 // indirect + github.com/moby/term v0.5.2 // indirect + github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 // indirect + github.com/morikuni/aec v1.1.0 // indirect + github.com/oasdiff/yaml v0.0.9 // indirect + github.com/oasdiff/yaml3 v0.0.9 // indirect + github.com/opencontainers/go-digest v1.0.0 // indirect + github.com/opencontainers/image-spec v1.1.1 // indirect + github.com/perimeterx/marshmallow v1.1.5 // indirect + github.com/pkg/errors v0.9.1 // indirect + github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect + github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 // indirect + github.com/pressly/goose/v3 v3.27.1 // indirect + github.com/redis/go-redis/extra/rediscmd/v9 v9.18.0 // indirect + github.com/redis/go-redis/extra/redisotel/v9 v9.18.0 // indirect + github.com/sethvargo/go-retry v0.3.0 // indirect + github.com/shirou/gopsutil/v4 v4.26.3 // indirect + github.com/sirupsen/logrus v1.9.4 // indirect + github.com/tklauser/go-sysconf v0.3.16 // indirect + github.com/tklauser/numcpus v0.11.0 // indirect + github.com/ugorji/go/codec v1.3.1 // indirect + github.com/woodsbury/decimal128 v1.3.0 // indirect + github.com/yuin/gopher-lua v1.1.1 // indirect + github.com/yusufpapurcu/wmi v1.2.4 // indirect + go.opentelemetry.io/auto/sdk v1.2.1 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 // indirect + go.opentelemetry.io/proto/otlp v1.10.0 // indirect + go.uber.org/atomic v1.11.0 // indirect + go.uber.org/multierr v1.11.0 // indirect + golang.org/x/crypto v0.50.0 // indirect + golang.org/x/net v0.53.0 // indirect + golang.org/x/sync v0.20.0 // indirect + golang.org/x/sys v0.43.0 // indirect + golang.org/x/text v0.36.0 // indirect + golang.org/x/time v0.15.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20260420184626-e10c466a9529 // indirect + google.golang.org/grpc v1.80.0 // indirect + google.golang.org/protobuf v1.36.11 // indirect +) + +replace galaxy/postgres => ../pkg/postgres + +replace galaxy/redisconn => ../pkg/redisconn + +replace galaxy/notificationintent => ../pkg/notificationintent diff --git a/rtmanager/go.sum b/rtmanager/go.sum new file mode 100644 index 0000000..4d55a44 --- /dev/null +++ b/rtmanager/go.sum @@ -0,0 +1,475 @@ +dario.cat/mergo v1.0.2 h1:85+piFYR1tMbRrLcDwR18y4UKJ3aH1Tbzi24VRW1TK8= +dario.cat/mergo v1.0.2/go.mod 
h1:E/hbnu0NxMFBjpMIE34DRGLWqDy0g5FuKDhCb31ngxA= +github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6 h1:He8afgbRMd7mFxO99hRNu+6tazq8nFF9lIwo9JFroBk= +github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6/go.mod h1:8o94RPi1/7XTJvwPpRSzSUedZrtlirdB3r9Z20bi2f8= +github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c h1:udKWzYgxTojEKWjV8V+WSxDXJ4NFATAsZjh8iIbsQIg= +github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= +github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/Masterminds/semver/v3 v3.1.1/go.mod h1:VPu/7SZ7ePZ3QOrcuXROw5FAcLl4a0cBrbBpGY/8hQs= +github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= +github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= +github.com/XSAM/otelsql v0.42.0 h1:Li0xF4eJUxG2e0x3D4rvRlys1f27yJKvjTh7ljkUP5o= +github.com/XSAM/otelsql v0.42.0/go.mod h1:4mOrEv+cS1KmKzrvTktvJnstr5GtKSAK+QHvFR9OcpI= +github.com/alicebob/miniredis/v2 v2.37.0 h1:RheObYW32G1aiJIj81XVt78ZHJpHonHLHW7OLIshq68= +github.com/alicebob/miniredis/v2 v2.37.0/go.mod h1:TcL7YfarKPGDAthEtl5NBeHZfeUQj6OXMm/+iu5cLMM= +github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs= +github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c= +github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA= +github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0= +github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= +github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= +github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= +github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/cockroachdb/apd v1.1.0/go.mod h1:8Sl8LxpKi29FqWXR16WEFZRNSz3SoPzUzeMeY4+DwBQ= +github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI= +github.com/containerd/errdefs v1.0.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M= +github.com/containerd/errdefs/pkg v0.3.0 h1:9IKJ06FvyNlexW690DXuQNx2KA2cUJXx151Xdx3ZPPE= +github.com/containerd/errdefs/pkg v0.3.0/go.mod h1:NJw6s9HwNuRhnjJhM7pylWwMyAkmCQvQ4GpJHEqRLVk= +github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= +github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo= +github.com/containerd/platforms v0.2.1 h1:zvwtM3rz2YHPQsF2CHYM8+KtB5dvhISiXh5ZpSBQv6A= +github.com/containerd/platforms v0.2.1/go.mod h1:XHCb+2/hzowdiut9rkudds9bE5yJ7npe7dG/wG+uFPw= +github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= +github.com/coreos/go-systemd v0.0.0-20190719114852-fd7a80b32e1f/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= +github.com/cpuguy83/dockercfg v0.3.2 h1:DlJTyZGBDlXqUZ2Dk2Q3xHs/FtnooJJVaad2S9GKorA= +github.com/cpuguy83/dockercfg v0.3.2/go.mod h1:sugsbF4//dDlL/i+S+rtpIWp+5h0BHJHfjj5/jFyUJc= +github.com/creack/pty v1.1.7/go.mod h1:lj5s0c3V2DBrqTV7llrYr5NG6My20zk30Fl46Y7DoTY= +github.com/creack/pty v1.1.24 h1:bJrF4RRfyJnbTJqzRLHzcGaZK1NeM5kTC9jGgovnR1s= +github.com/creack/pty 
v1.1.24/go.mod h1:08sCNb52WyoAwi2QDyzUCTgcvVFhUzewun7wtTfvcwE= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc= +github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk= +github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= +github.com/docker/docker v28.5.2+incompatible h1:DBX0Y0zAjZbSrm1uzOkdr1onVghKaftjlSWt4AFexzM= +github.com/docker/docker v28.5.2+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= +github.com/docker/go-connections v0.7.0 h1:6SsRfJddP22WMrCkj19x9WKjEDTB+ahsdiGYf0mN39c= +github.com/docker/go-connections v0.7.0/go.mod h1:no1qkHdjq7kLMGUXYAduOhYPSJxxvgWBh7ogVvptn3Q= +github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= +github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= +github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= +github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= +github.com/ebitengine/purego v0.10.0 h1:QIw4xfpWT6GWTzaW5XEKy3HXoqrJGx1ijYHzTF0/ISU= +github.com/ebitengine/purego v0.10.0/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ= +github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= +github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= +github.com/getkin/kin-openapi v0.135.0 h1:751SjYfbiwqukYuVjwYEIKNfrSwS5YpA7DZnKSwQgtg= +github.com/getkin/kin-openapi v0.135.0/go.mod h1:6dd5FJl6RdX4usBtFBaQhk9q62Yb2J0Mk5IhUO/QqFI= +github.com/go-jet/jet/v2 v2.14.1 h1:wsfD9e7CGP9h46+IFNlftfncBcmVnKddikbTtapQM3M= +github.com/go-jet/jet/v2 v2.14.1/go.mod h1:dqTAECV2Mo3S2NFjbm4vJ1aDruZjhaJ1RAAR8rGUkkc= +github.com/go-kit/log v0.1.0/go.mod h1:zbhenjAZHb184qTLMA9ZjW7ThYL0H2mk7Q6pNt4vbaY= +github.com/go-logfmt/logfmt v0.5.0/go.mod h1:wCYkCAKZfumFQihp8CzCvQ3paCTfi41vtzG1KdI/P7A= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= +github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= +github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= +github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= +github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= +github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= +github.com/go-stack/stack v1.8.0/go.mod 
h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= +github.com/go-test/deep v1.0.8 h1:TDsG77qcSprGbC6vTN8OuXp5g+J+b5Pcguhf7Zt61VM= +github.com/go-test/deep v1.0.8/go.mod h1:5C2ZWiW0ErCdrYzpqxLbTX7MG14M9iiw8DgHncVwcsE= +github.com/gofrs/uuid v4.0.0+incompatible/go.mod h1:b2aQJv3Z4Fp6yNu3cdSllBxTCLRxnplIgP/c0N/04lM= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/gorilla/mux v1.8.0 h1:i40aqfkR1h2SlN9hojwV5ZA91wcXFOvkdNIeFDP5koI= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 h1:HWRh5R2+9EifMyIHV7ZV+MIZqgz+PMpZ14Jynv3O2Zs= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0/go.mod h1:JfhWUomR1baixubs02l85lZYYOm7LV6om4ceouMv45c= +github.com/jackc/chunkreader v1.0.0/go.mod h1:RT6O25fNZIuasFJRyZ4R/Y2BbhasbmZXF9QQ7T3kePo= +github.com/jackc/chunkreader/v2 v2.0.0/go.mod h1:odVSm741yZoC3dpHEUXIqA9tQRhFrgOHwnPIn9lDKlk= +github.com/jackc/chunkreader/v2 v2.0.1 h1:i+RDz65UE+mmpjTfyz0MoVTnzeYxroil2G82ki7MGG8= +github.com/jackc/chunkreader/v2 v2.0.1/go.mod h1:odVSm741yZoC3dpHEUXIqA9tQRhFrgOHwnPIn9lDKlk= +github.com/jackc/pgconn v0.0.0-20190420214824-7e0022ef6ba3/go.mod h1:jkELnwuX+w9qN5YIfX0fl88Ehu4XC3keFuOJJk9pcnA= +github.com/jackc/pgconn v0.0.0-20190824142844-760dd75542eb/go.mod h1:lLjNuW/+OfW9/pnVKPazfWOgNfH2aPem8YQ7ilXGvJE= +github.com/jackc/pgconn v0.0.0-20190831204454-2fabfa3c18b7/go.mod h1:ZJKsE/KZfsUgOEh9hBm+xYTstcNHg7UPMVJqRfQxq4s= +github.com/jackc/pgconn v1.8.0/go.mod h1:1C2Pb36bGIP9QHGBYCjnyhqu7Rv3sGshaQUvmfGIB/o= +github.com/jackc/pgconn v1.9.0/go.mod h1:YctiPyvzfU11JFxoXokUOOKQXQmDMoJL9vJzHH8/2JY= +github.com/jackc/pgconn v1.9.1-0.20210724152538-d89c8390a530/go.mod h1:4z2w8XhRbP1hYxkpTuBjTS3ne3J48K83+u0zoyvg2pI= +github.com/jackc/pgconn v1.14.3 h1:bVoTr12EGANZz66nZPkMInAV/KHD2TxH9npjXXgiB3w= +github.com/jackc/pgconn v1.14.3/go.mod h1:RZbme4uasqzybK2RK5c65VsHxoyaml09lx3tXOcO/VM= +github.com/jackc/pgio v1.0.0 h1:g12B9UwVnzGhueNavwioyEEpAmqMe1E/BN9ES+8ovkE= +github.com/jackc/pgio v1.0.0/go.mod h1:oP+2QK2wFfUWgr+gxjoBH9KGBb31Eio69xUb0w5bYf8= +github.com/jackc/pgmock v0.0.0-20190831213851-13a1b77aafa2/go.mod h1:fGZlG77KXmcq05nJLRkk0+p82V8B8Dw8KN2/V9c/OAE= +github.com/jackc/pgmock v0.0.0-20201204152224-4fe30f7445fd/go.mod h1:hrBW0Enj2AZTNpt/7Y5rr2xe/9Mn757Wtb2xeBzPv2c= +github.com/jackc/pgmock v0.0.0-20210724152146-4ad1a8207f65 h1:DadwsjnMwFjfWc9y5Wi/+Zz7xoE5ALHsRQlOctkOiHc= +github.com/jackc/pgmock v0.0.0-20210724152146-4ad1a8207f65/go.mod h1:5R2h2EEX+qri8jOWMbJCtaPWkrrNc7OHwsp2TCqp7ak= +github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM= +github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg= +github.com/jackc/pgproto3 v1.1.0/go.mod h1:eR5FA3leWg7p9aeAqi37XOTgTIbkABlvcPB3E5rlc78= +github.com/jackc/pgproto3/v2 v2.0.0-alpha1.0.20190420180111-c116219b62db/go.mod h1:bhq50y+xrl9n5mRYyCBFKkpRVTLYJVWeCc+mEAI3yXA= +github.com/jackc/pgproto3/v2 v2.0.0-alpha1.0.20190609003834-432c2951c711/go.mod 
h1:uH0AWtUmuShn0bcesswc4aBTWGvw0cAxIJp+6OB//Wg= +github.com/jackc/pgproto3/v2 v2.0.0-rc3/go.mod h1:ryONWYqW6dqSg1Lw6vXNMXoBJhpzvWKnT95C46ckYeM= +github.com/jackc/pgproto3/v2 v2.0.0-rc3.0.20190831210041-4c03ce451f29/go.mod h1:ryONWYqW6dqSg1Lw6vXNMXoBJhpzvWKnT95C46ckYeM= +github.com/jackc/pgproto3/v2 v2.0.6/go.mod h1:WfJCnwN3HIg9Ish/j3sgWXnAfK8A9Y0bwXYU5xKaEdA= +github.com/jackc/pgproto3/v2 v2.1.1/go.mod h1:WfJCnwN3HIg9Ish/j3sgWXnAfK8A9Y0bwXYU5xKaEdA= +github.com/jackc/pgproto3/v2 v2.3.3 h1:1HLSx5H+tXR9pW3in3zaztoEwQYRC9SQaYUHjTSUOag= +github.com/jackc/pgproto3/v2 v2.3.3/go.mod h1:WfJCnwN3HIg9Ish/j3sgWXnAfK8A9Y0bwXYU5xKaEdA= +github.com/jackc/pgservicefile v0.0.0-20200714003250-2b9c44734f2b/go.mod h1:vsD4gTJCa9TptPL8sPkXrLZ+hDuNrZCnj29CQpr4X1E= +github.com/jackc/pgservicefile v0.0.0-20221227161230-091c0ba34f0a/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM= +github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo= +github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM= +github.com/jackc/pgtype v0.0.0-20190421001408-4ed0de4755e0/go.mod h1:hdSHsc1V01CGwFsrv11mJRHWJ6aifDLfdV3aVjFF0zg= +github.com/jackc/pgtype v0.0.0-20190824184912-ab885b375b90/go.mod h1:KcahbBH1nCMSo2DXpzsoWOAfFkdEtEJpPbVLq8eE+mc= +github.com/jackc/pgtype v0.0.0-20190828014616-a8802b16cc59/go.mod h1:MWlu30kVJrUS8lot6TQqcg7mtthZ9T0EoIBFiJcmcyw= +github.com/jackc/pgtype v1.8.1-0.20210724151600-32e20a603178/go.mod h1:C516IlIV9NKqfsMCXTdChteoXmwgUceqaLfjg2e3NlM= +github.com/jackc/pgtype v1.14.0/go.mod h1:LUMuVrfsFfdKGLw+AFFVv6KtHOFMwRgDDzBt76IqCA4= +github.com/jackc/pgtype v1.14.4 h1:fKuNiCumbKTAIxQwXfB/nsrnkEI6bPJrrSiMKgbJ2j8= +github.com/jackc/pgtype v1.14.4/go.mod h1:aKeozOde08iifGosdJpz9MBZonJOUJxqNpPBcMJTlVA= +github.com/jackc/pgx/v4 v4.0.0-20190420224344-cc3461e65d96/go.mod h1:mdxmSJJuR08CZQyj1PVQBHy9XOp5p8/SHH6a0psbY9Y= +github.com/jackc/pgx/v4 v4.0.0-20190421002000-1b8f0016e912/go.mod h1:no/Y67Jkk/9WuGR0JG/JseM9irFbnEPbuWV2EELPNuM= +github.com/jackc/pgx/v4 v4.0.0-pre1.0.20190824185557-6972a5742186/go.mod h1:X+GQnOEnf1dqHGpw7JmHqHc1NxDoalibchSk9/RWuDc= +github.com/jackc/pgx/v4 v4.12.1-0.20210724153913-640aa07df17c/go.mod h1:1QD0+tgSXP7iUjYm9C1NxKhny7lq6ee99u/z+IHFcgs= +github.com/jackc/pgx/v4 v4.18.2/go.mod h1:Ey4Oru5tH5sB6tV7hDmfWFahwF15Eb7DNXlRKx2CkVw= +github.com/jackc/pgx/v4 v4.18.3 h1:dE2/TrEsGX3RBprb3qryqSV9Y60iZN1C6i8IrmW9/BA= +github.com/jackc/pgx/v4 v4.18.3/go.mod h1:Ey4Oru5tH5sB6tV7hDmfWFahwF15Eb7DNXlRKx2CkVw= +github.com/jackc/pgx/v5 v5.9.2 h1:3ZhOzMWnR4yJ+RW1XImIPsD1aNSz4T4fyP7zlQb56hw= +github.com/jackc/pgx/v5 v5.9.2/go.mod h1:mal1tBGAFfLHvZzaYh77YS/eC6IX9OWbRV1QIIM0Jn4= +github.com/jackc/puddle v0.0.0-20190413234325-e4ced69a3a2b/go.mod h1:m4B5Dj62Y0fbyuIc15OsIqK0+JU8nkqQjsgx7dvjSWk= +github.com/jackc/puddle v0.0.0-20190608224051-11cab39313c9/go.mod h1:m4B5Dj62Y0fbyuIc15OsIqK0+JU8nkqQjsgx7dvjSWk= +github.com/jackc/puddle v1.1.3/go.mod h1:m4B5Dj62Y0fbyuIc15OsIqK0+JU8nkqQjsgx7dvjSWk= +github.com/jackc/puddle v1.3.0/go.mod h1:m4B5Dj62Y0fbyuIc15OsIqK0+JU8nkqQjsgx7dvjSWk= +github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo= +github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4= +github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= +github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/kisielk/gotool v1.0.0/go.mod 
h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/klauspost/compress v1.18.5 h1:/h1gH5Ce+VWNLSWqPzOVn6XBO+vJbCNGvjoaGBFW2IE= +github.com/klauspost/compress v1.18.5/go.mod h1:cwPg85FWrGar70rWktvGQj8/hthj3wpl0PGDogxkrSQ= +github.com/klauspost/cpuid/v2 v2.3.0 h1:S4CRMLnYUhGeDFDqkGriYKdfoFlDnMtqTiI/sFzhA9Y= +github.com/klauspost/cpuid/v2 v2.3.0/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0= +github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= +github.com/konsorten/go-windows-terminal-sequences v1.0.2/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/pty v1.1.8/go.mod h1:O1sed60cT9XZ5uDucP5qwvh+TE3NnUj51EiZO/lmSfw= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/lib/pq v1.0.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= +github.com/lib/pq v1.1.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= +github.com/lib/pq v1.2.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= +github.com/lib/pq v1.10.2/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= +github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw= +github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= +github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4= +github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I= +github.com/magiconair/properties v1.8.10 h1:s31yESBquKXCV9a/ScB3ESkOjUYYv+X0rg8SYxI99mE= +github.com/magiconair/properties v1.8.10/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= +github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= +github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/mattn/go-colorable v0.1.1/go.mod h1:FuOcm+DKB9mbwrcAfNl7/TZVBZ6rcnceauSikq3lYCQ= +github.com/mattn/go-colorable v0.1.6/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc= +github.com/mattn/go-isatty v0.0.5/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= +github.com/mattn/go-isatty v0.0.7/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= +github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU= +github.com/mattn/go-isatty v0.0.21 h1:xYae+lCNBP7QuW4PUnNG61ffM4hVIfm+zUzDuSzYLGs= +github.com/mattn/go-isatty v0.0.21/go.mod h1:ZXfXG4SQHsB/w3ZeOYbR0PrPwLy+n6xiMrJlRFqopa4= +github.com/mdelapenya/tlscert v0.2.0 h1:7H81W6Z/4weDvZBNOfQte5GpIMo0lGYEeWbkGp5LJHI= +github.com/mdelapenya/tlscert v0.2.0/go.mod h1:O4njj3ELLnJjGdkN7M/vIVCpZ+Cf0L6muqOG4tLSl8o= +github.com/mfridman/interpolate v0.0.2 h1:pnuTK7MQIxxFz1Gr+rjSIx9u7qVjf5VOoM/u6BbAxPY= +github.com/mfridman/interpolate v0.0.2/go.mod h1:p+7uk6oE07mpE/Ik1b8EckO0O4ZXiGAfshKBWLUM9Xg= +github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0= +github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo= 
+github.com/moby/go-archive v0.2.0 h1:zg5QDUM2mi0JIM9fdQZWC7U8+2ZfixfTYoHL7rWUcP8= +github.com/moby/go-archive v0.2.0/go.mod h1:mNeivT14o8xU+5q1YnNrkQVpK+dnNe/K6fHqnTg4qPU= +github.com/moby/moby/api v1.54.2 h1:wiat9QAhnDQjA7wk1kh/TqHz2I1uUA7M7t9SAl/JNXg= +github.com/moby/moby/api v1.54.2/go.mod h1:+RQ6wluLwtYaTd1WnPLykIDPekkuyD/ROWQClE83pzs= +github.com/moby/moby/client v0.4.1 h1:DMQgisVoMkmMs7fp3ROSdiBnoAu8+vo3GggFl06M/wY= +github.com/moby/moby/client v0.4.1/go.mod h1:z52C9O2POPOsnxZAy//WtKcQ32P+jT/NGeXu/7nfjGQ= +github.com/moby/patternmatcher v0.6.1 h1:qlhtafmr6kgMIJjKJMDmMWq7WLkKIo23hsrpR3x084U= +github.com/moby/patternmatcher v0.6.1/go.mod h1:hDPoyOpDY7OrrMDLaYoY3hf52gNCR/YOUYxkhApJIxc= +github.com/moby/sys/atomicwriter v0.1.0 h1:kw5D/EqkBwsBFi0ss9v1VG3wIkVhzGvLklJ+w3A14Sw= +github.com/moby/sys/atomicwriter v0.1.0/go.mod h1:Ul8oqv2ZMNHOceF643P6FKPXeCmYtlQMvpizfsSoaWs= +github.com/moby/sys/sequential v0.6.0 h1:qrx7XFUd/5DxtqcoH1h438hF5TmOvzC/lspjy7zgvCU= +github.com/moby/sys/sequential v0.6.0/go.mod h1:uyv8EUTrca5PnDsdMGXhZe6CCe8U/UiTWd+lL+7b/Ko= +github.com/moby/sys/user v0.4.0 h1:jhcMKit7SA80hivmFJcbB1vqmw//wU61Zdui2eQXuMs= +github.com/moby/sys/user v0.4.0/go.mod h1:bG+tYYYJgaMtRKgEmuueC0hJEAZWwtIbZTB+85uoHjs= +github.com/moby/sys/userns v0.1.0 h1:tVLXkFOxVu9A64/yh59slHVv9ahO9UIev4JZusOLG/g= +github.com/moby/sys/userns v0.1.0/go.mod h1:IHUYgu/kao6N8YZlp9Cf444ySSvCmDlmzUcYfDHOl28= +github.com/moby/term v0.5.2 h1:6qk3FJAFDs6i/q3W/pQ97SX192qKfZgGjCQqfCJkgzQ= +github.com/moby/term v0.5.2/go.mod h1:d3djjFCrjnB+fl8NJux+EJzu0msscUP+f8it8hPkFLc= +github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 h1:RWengNIwukTxcDr9M+97sNutRR1RKhG96O6jWumTTnw= +github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826/go.mod h1:TaXosZuwdSHYgviHp1DAtfrULt5eUgsSMsZf+YrPgl8= +github.com/morikuni/aec v1.1.0 h1:vBBl0pUnvi/Je71dsRrhMBtreIqNMYErSAbEeb8jrXQ= +github.com/morikuni/aec v1.1.0/go.mod h1:xDRgiq/iw5l+zkao76YTKzKttOp2cwPEne25HDkJnBw= +github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w= +github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls= +github.com/oasdiff/yaml v0.0.9 h1:zQOvd2UKoozsSsAknnWoDJlSK4lC0mpmjfDsfqNwX48= +github.com/oasdiff/yaml v0.0.9/go.mod h1:8lvhgJG4xiKPj3HN5lDow4jZHPlx1i7dIwzkdAo6oAM= +github.com/oasdiff/yaml3 v0.0.9 h1:rWPrKccrdUm8J0F3sGuU+fuh9+1K/RdJlWF7O/9yw2g= +github.com/oasdiff/yaml3 v0.0.9/go.mod h1:y5+oSEHCPT/DGrS++Wc/479ERge0zTFxaF8PbGKcg2o= +github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= +github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= +github.com/opencontainers/image-spec v1.1.1 h1:y0fUlFfIZhPF1W537XOLg0/fcx6zcHCJwooC2xJA040= +github.com/opencontainers/image-spec v1.1.1/go.mod h1:qpqAh3Dmcf36wStyyWU+kCeDgrGnAve2nCC8+7h8Q0M= +github.com/perimeterx/marshmallow v1.1.5 h1:a2LALqQ1BlHM8PZblsDdidgv1mWi1DgC2UmX50IvK2s= +github.com/perimeterx/marshmallow v1.1.5/go.mod h1:dsXbUu8CRzfYP5a87xpp0xq9S3u0Vchtcl8we9tYaXw= +github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib 
v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 h1:o4JXh1EVt9k/+g42oCprj/FisM4qX9L3sZB3upGN2ZU= +github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= +github.com/pressly/goose/v3 v3.27.1 h1:6uEvcprBybDmW4hcz3gYujhARhye+GoWKhEWyzD5sh4= +github.com/pressly/goose/v3 v3.27.1/go.mod h1:maruOxsPnIG2yHHyo8UqKWXYKFcH7Q76csUV7+7KYoM= +github.com/redis/go-redis/extra/rediscmd/v9 v9.18.0 h1:QY4nmPHLFAJjtT5O4OMUEOxP8WVaRNOFpcbmxT2NLZU= +github.com/redis/go-redis/extra/rediscmd/v9 v9.18.0/go.mod h1:WH8cY/0fT41Bsf341qzo8v4nx0GCE8FykAA23IVbVmo= +github.com/redis/go-redis/extra/redisotel/v9 v9.18.0 h1:2dKdoEYBJ0CZCLPiCdvvc7luz3DPwY6hKdzjL6m1eHE= +github.com/redis/go-redis/extra/redisotel/v9 v9.18.0/go.mod h1:WzkrVG9ro9BwCQD0eJOWn6AGL4Z1CleGflM45w1hu10= +github.com/redis/go-redis/v9 v9.18.0 h1:pMkxYPkEbMPwRdenAzUNyFNrDgHx9U+DrBabWNfSRQs= +github.com/redis/go-redis/v9 v9.18.0/go.mod h1:k3ufPphLU5YXwNTUcCRXGxUoF1fqxnhFQmscfkCoDA0= +github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE= +github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= +github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= +github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= +github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= +github.com/rs/xid v1.2.1/go.mod h1:+uKXf+4Djp6Md1KODXJxgGQPKngRmWyn10oCKFzNHOQ= +github.com/rs/zerolog v1.13.0/go.mod h1:YbFCdg8HfsridGWAh22vktObvhZbQsZXe4/zB0OKkWU= +github.com/rs/zerolog v1.15.0/go.mod h1:xYTKnLHcpfU2225ny5qZjxnj9NvkumZYjJHlAThCjNc= +github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0= +github.com/sethvargo/go-retry v0.3.0 h1:EEt31A35QhrcRZtrYFDTBg91cqZVnFL2navjDrah2SE= +github.com/sethvargo/go-retry v0.3.0/go.mod h1:mNX17F0C/HguQMyMyJxcnU471gOZGxCLyYaFyAZraas= +github.com/shirou/gopsutil/v4 v4.26.3 h1:2ESdQt90yU3oXF/CdOlRCJxrP+Am1aBYubTMTfxJ1qc= +github.com/shirou/gopsutil/v4 v4.26.3/go.mod h1:LZ6ewCSkBqUpvSOf+LsTGnRinC6iaNUNMGBtDkJBaLQ= +github.com/shopspring/decimal v0.0.0-20180709203117-cd690d0c9e24/go.mod h1:M+9NzErvs504Cn4c5DxATwIqPbtswREoFCre64PpcG4= +github.com/shopspring/decimal v1.2.0/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o= +github.com/sirupsen/logrus v1.4.1/go.mod h1:ni0Sbl8bgC9z8RoU9G6nDWqqs/fq4eDPysMBDgk/93Q= +github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= +github.com/sirupsen/logrus v1.9.4 h1:TsZE7l11zFCLZnZ+teH4Umoq5BhEIfIzfRDZ1Uzql2w= +github.com/sirupsen/logrus v1.9.4/go.mod h1:ftWc9WdOfJ0a92nsE2jF5u5ZwH8Bv2zdeOC42RjbV2g= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/objx v0.5.3 h1:jmXUvGomnU1o3W/V5h2VEradbpJDwGrzugQQvL0POH4= +github.com/stretchr/objx v0.5.3/go.mod h1:rDQraq+vQZU7Fde9LOZLr8Tax6zZvy4kuNKF+QYS+U0= +github.com/stretchr/testify v1.2.2/go.mod 
h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/testcontainers/testcontainers-go v0.42.0 h1:He3IhTzTZOygSXLJPMX7n44XtK+qhjat1nI9cneBbUY= +github.com/testcontainers/testcontainers-go v0.42.0/go.mod h1:vZjdY1YmUA1qEForxOIOazfsrdyORJAbhi0bp8plN30= +github.com/testcontainers/testcontainers-go/modules/postgres v0.42.0 h1:GCbb1ndrF7OTDiIvxXyItaDab4qkzTFJ48LKFdM7EIo= +github.com/testcontainers/testcontainers-go/modules/postgres v0.42.0/go.mod h1:IRPBaI8jXdrNfD0e4Zm7Fbcgaz5shKxOQv4axiL09xs= +github.com/testcontainers/testcontainers-go/modules/redis v0.42.0 h1:id/6LH8ZeDrtAUVSuNvZUAJ1kVpb82y1pr9yweAWsRg= +github.com/tklauser/go-sysconf v0.3.16 h1:frioLaCQSsF5Cy1jgRBrzr6t502KIIwQ0MArYICU0nA= +github.com/tklauser/go-sysconf v0.3.16/go.mod h1:/qNL9xxDhc7tx3HSRsLWNnuzbVfh3e7gh/BmM179nYI= +github.com/tklauser/numcpus v0.11.0 h1:nSTwhKH5e1dMNsCdVBukSZrURJRoHbSEQjdEbY+9RXw= +github.com/tklauser/numcpus v0.11.0/go.mod h1:z+LwcLq54uWZTX0u/bGobaV34u6V7KNlTZejzM6/3MQ= +github.com/ugorji/go/codec v1.3.1 h1:waO7eEiFDwidsBN6agj1vJQ4AG7lh2yqXyOXqhgQuyY= +github.com/ugorji/go/codec v1.3.1/go.mod h1:pRBVtBSKl77K30Bv8R2P+cLSGaTtex6fsA2Wjqmfxj4= +github.com/woodsbury/decimal128 v1.3.0 h1:8pffMNWIlC0O5vbyHWFZAt5yWvWcrHA+3ovIIjVWss0= +github.com/woodsbury/decimal128 v1.3.0/go.mod h1:C5UTmyTjW3JftjUFzOVhC20BEQa2a4ZKOB5I6Zjb+ds= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +github.com/yuin/gopher-lua v1.1.1 h1:kYKnWBjvbNP4XLT3+bPEwAXJx262OhaHDWDVOPjL46M= +github.com/yuin/gopher-lua v1.1.1/go.mod h1:GBR0iDaNXjAgGg9zfCvksxSRnQx76gclCIb7kdAd1Pw= +github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0= +github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= +github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= +github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA= +github.com/zenazn/goji v0.9.0/go.mod h1:7S9M489iMyHBNxwZnk9/EHS098H4/F6TATF2mIxtB1Q= +go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= +go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.68.0 h1:CqXxU8VOmDefoh0+ztfGaymYbhdB/tT3zs79QaZTNGY= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.68.0/go.mod h1:BuhAPThV8PBHBvg8ZzZ/Ok3idOdhWIodywz2xEcRbJo= +go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I= +go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0 h1:8UQVDcZxOJLtX6gxtDt3vY2WTgvZqMQRzjsqiIHQdkc= 
+go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0/go.mod h1:2lmweYCiHYpEjQ/lSJBYhj9jP1zvCvQW4BqL9dnT7FQ= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0 h1:w1K+pCJoPpQifuVpsKamUdn9U0zM3xUziVOqsGksUrY= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0/go.mod h1:HBy4BjzgVE8139ieRI75oXm3EcDN+6GhD88JT1Kjvxg= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 h1:88Y4s2C8oTui1LGM6bTWkw0ICGcOLCAI5l6zsD1j20k= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0/go.mod h1:Vl1/iaggsuRlrHf/hfPJPvVag77kKyvrLeD10kpMl+A= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 h1:RAE+JPfvEmvy+0LzyUA25/SGawPwIUbZ6u0Wug54sLc= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0/go.mod h1:AGmbycVGEsRx9mXMZ75CsOyhSP6MFIcj/6dnG+vhVjk= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0 h1:3iZJKlCZufyRzPzlQhUIWVmfltrXuGyfjREgGP3UUjc= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0/go.mod h1:/G+nUPfhq2e+qiXMGxMwumDrP5jtzU+mWN7/sjT2rak= +go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.43.0 h1:TC+BewnDpeiAmcscXbGMfxkO+mwYUwE/VySwvw88PfA= +go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.43.0/go.mod h1:J/ZyF4vfPwsSr9xJSPyQ4LqtcTPULFR64KwTikGLe+A= +go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.43.0 h1:mS47AX77OtFfKG4vtp+84kuGSFZHTyxtXIN269vChY0= +go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.43.0/go.mod h1:PJnsC41lAGncJlPUniSwM81gc80GkgWJWr3cu2nKEtU= +go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM= +go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY= +go.opentelemetry.io/otel/sdk v1.43.0 h1:pi5mE86i5rTeLXqoF/hhiBtUNcrAGHLKQdhg4h4V9Dg= +go.opentelemetry.io/otel/sdk v1.43.0/go.mod h1:P+IkVU3iWukmiit/Yf9AWvpyRDlUeBaRg6Y+C58QHzg= +go.opentelemetry.io/otel/sdk/metric v1.43.0 h1:S88dyqXjJkuBNLeMcVPRFXpRw2fuwdvfCGLEo89fDkw= +go.opentelemetry.io/otel/sdk/metric v1.43.0/go.mod h1:C/RJtwSEJ5hzTiUz5pXF1kILHStzb9zFlIEe85bhj6A= +go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A= +go.opentelemetry.io/otel/trace v1.43.0/go.mod h1:/QJhyVBUUswCphDVxq+8mld+AvhXZLhe+8WVFxiFff0= +go.opentelemetry.io/proto/otlp v1.10.0 h1:IQRWgT5srOCYfiWnpqUYz9CVmbO8bFmKcwYxpuCSL2g= +go.opentelemetry.io/proto/otlp v1.10.0/go.mod h1:/CV4QoCR/S9yaPj8utp3lvQPoqMtxXdzn7ozvvozVqk= +go.uber.org/atomic v1.3.2/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= +go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= +go.uber.org/atomic v1.5.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ= +go.uber.org/atomic v1.6.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ= +go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE= +go.uber.org/atomic v1.11.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.uber.org/mock v0.6.0 h1:hyF9dfmbgIX5EfOdasqLsWD6xqpNZlXblLB/Dbnwv3Y= +go.uber.org/mock v0.6.0/go.mod h1:KiVJ4BqZJaMj4svdfmHM0AUx4NJYO8ZNpPnZn1Z+BBU= +go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= +go.uber.org/multierr v1.3.0/go.mod h1:VgVr7evmIr6uPjLBxg28wmKNXyqE9akIJ5XnfpiKl+4= +go.uber.org/multierr v1.5.0/go.mod h1:FeouvMocqHpRaaGuG9EjoKcStLC43Zu/fmqdUMPcKYU= 
+go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= +go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= +go.uber.org/tools v0.0.0-20190618225709-2cfd321de3ee/go.mod h1:vJERXedbb3MVM5f9Ejo0C68/HhF8uaILCdgjnY+goOA= +go.uber.org/zap v1.9.1/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= +go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= +go.uber.org/zap v1.13.0/go.mod h1:zwrFLgMcdUuIBviXEYEH1YKNaOBnKXsx2IPda5bBwHM= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20190411191339-88737f569e3a/go.mod h1:WFFai1msRO1wXaEeE5yQxYXgSfI8pQAWXbQop6sCtWE= +golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20190820162420-60c769a6c586/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20201203163018-be400aefbc4c/go.mod h1:jdWPYTVW3xRLrWPugEBEK3UY2ZEsg3UU495nc5E+M+I= +golang.org/x/crypto v0.0.0-20210616213533-5ff15b29337e/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= +golang.org/x/crypto v0.20.0/go.mod h1:Xwo95rrVNIoSMx9wa1JroENMToLWn3RNVrTBpLHgZPQ= +golang.org/x/crypto v0.50.0 h1:zO47/JPrL6vsNkINmLoo/PH1gcxpls50DNogFvB5ZGI= +golang.org/x/crypto v0.50.0/go.mod h1:3muZ7vA7PBCE6xgPX7nkzzjiUq87kRItoJQM1Yo8S+Q= +golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc= +golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.35.0 h1:Ww1D637e6Pg+Zb2KrWfHQUnH2dQRLBQyAtpr/haaJeM= +golang.org/x/mod v0.35.0/go.mod h1:+GwiRhIInF8wPm+4AoT6L0FA1QWAad3OMdTRx4tFYlU= +golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20190813141303-74dc4d7220e7/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= +golang.org/x/net v0.53.0 
h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA= +golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= +golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= +golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190403152447-81d4e9dc473e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190813064441-fde4db37ae7a/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.43.0 h1:Rlag2XtaFTxp19wS8MXlJwTvoh8ArU6ezoyFsMyCTNI= +golang.org/x/sys v0.43.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= +golang.org/x/term v0.42.0 h1:UiKe+zDFmJobeJ5ggPwOshJIVt6/Ft0rcfrXZDLWAWY= +golang.org/x/term v0.42.0/go.mod h1:Dq/D+snpsbazcBG5+F9Q1n2rXV8Ma+71xEjTRufARgY= 
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.4/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.36.0 h1:JfKh3XmcRPqZPKevfXVpI1wXPTqbkE5f7JA92a55Yxg= +golang.org/x/text v0.36.0/go.mod h1:NIdBknypM8iqVmPiuco0Dh6P5Jcdk8lJL0CUebqK164= +golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U= +golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190425163242-31fd60d6bfdc/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20190621195816-6e04913cbbac/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190823170909-c4a336ef6a2f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191029041327-9cc4af7d6b2c/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191029190741-b9c20aec41a5/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20200103221440-774c71fcf114/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/xerrors v0.0.0-20190410155217-1f06c39b4373/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20190513163551-3ee3066db522/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4= +gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E= +google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 h1:VPWxll4HlMw1Vs/qXtN7BvhZqsS9cdAittCNvVENElA= +google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:7QBABkRtR8z+TEnmXTqIqwJLlzrZKVfAUm7tY3yGv0M= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260420184626-e10c466a9529 h1:XF8+t6QQiS0o9ArVan/HW8Q7cycNPGsJf6GA2nXxYAg= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260420184626-e10c466a9529/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8= 
+google.golang.org/grpc v1.80.0 h1:Xr6m2WmWZLETvUNvIUmeD5OAagMw3FiKmMlTdViWsHM= +google.golang.org/grpc v1.80.0/go.mod h1:ho/dLnxwi3EDJA4Zghp7k2Ec1+c2jqup0bFkw07bwF4= +google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= +google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= +gopkg.in/inconshreveable/log15.v2 v2.0.0-20180818164646-67afb5ed74ec/go.mod h1:aPpfJ7XW+gOuirDoZ8gHhLh3kZ1B08FtV2bbmy7Jv3s= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gotest.tools/v3 v3.5.2 h1:7koQfIKdy+I8UTetycgUqXWSDwpgv193Ka+qRsmBY8Q= +gotest.tools/v3 v3.5.2/go.mod h1:LtdLGcnqToBH83WByAAi/wiwSFCArdFIUV/xxN4pcjA= +honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg= +modernc.org/libc v1.72.1 h1:db1xwJ6u1kE3KHTFTTbe2GCrczHPKzlURP0aDC4NGD0= +modernc.org/libc v1.72.1/go.mod h1:HRMiC/PhPGLIPM7GzAFCbI+oSgE3dhZ8FWftmRrHVlY= +modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU= +modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg= +modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI= +modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw= +modernc.org/sqlite v1.49.1 h1:dYGHTKcX1sJ+EQDnUzvz4TJ5GbuvhNJa8Fg6ElGx73U= +modernc.org/sqlite v1.49.1/go.mod h1:m0w8xhwYUVY3H6pSDwc3gkJ/irZT/0YEXwBlhaxQEew= +pgregory.net/rapid v1.2.0 h1:keKAYRcjm+e1F0oAuU5F5+YPAWcyxNNRK2wud503Gnk= +pgregory.net/rapid v1.2.0/go.mod h1:PY5XlDGj0+V1FCq0o192FdRhpKHGTRIWBgqjDBTrq04= diff --git a/rtmanager/integration/harness/docker.go b/rtmanager/integration/harness/docker.go new file mode 100644 index 0000000..0c280cd --- /dev/null +++ b/rtmanager/integration/harness/docker.go @@ -0,0 +1,236 @@ +package harness + +import ( + "context" + "crypto/rand" + "encoding/hex" + "errors" + "fmt" + "os" + "os/exec" + "path/filepath" + "runtime" + "strings" + "sync" + "testing" + "time" + + cerrdefs "github.com/containerd/errdefs" + "github.com/docker/docker/api/types/network" + dockerclient "github.com/docker/docker/client" +) + +// Engine image tags used by the integration suite. `EngineImageRef` is +// the image we actually build from `galaxy/game/Dockerfile`; +// `PatchedEngineImageRef` is the same image content tagged at a higher +// semver patch so the patch lifecycle test exercises the +// `semver_patch_only` validation against a real image. Keeping both at +// the same digest avoids a redundant build. 
+const ( + EngineImageRef = "galaxy/game:1.0.0-rtm-it" + PatchedEngineImageRef = "galaxy/game:1.0.1-rtm-it" + + dockerNetworkPrefix = "rtmanager-it-" + + dockerPingTimeout = 5 * time.Second + dockerNetworkTimeout = 30 * time.Second + imageBuildTimeout = 10 * time.Minute +) + +// DockerEnv carries the per-package Docker client plus the workspace +// root used by image builds. The client is opened lazily on the first +// EnsureDocker call and closed by ShutdownDocker at TestMain exit. +type DockerEnv struct { + client *dockerclient.Client + workspaceRoot string +} + +// Client returns the harness-owned Docker SDK client. Tests use it +// directly for "external actions" the harness does not wrap (e.g., +// removing a running container behind RTM's back in `health_test`). +func (env *DockerEnv) Client() *dockerclient.Client { return env.client } + +// WorkspaceRoot returns the absolute path of the galaxy/ workspace +// root. It is exported so the runtime helper can resolve the host +// game-state root relative to it if a test needs a deterministic +// location, though the default places state under `t.ArtifactDir()`. +func (env *DockerEnv) WorkspaceRoot() string { return env.workspaceRoot } + +var ( + dockerOnce sync.Once + dockerEnv *DockerEnv + dockerErr error + + imageOnce sync.Once + imageErr error +) + +// EnsureDocker opens the shared Docker SDK client and verifies the +// daemon is reachable. When the daemon is unavailable the helper calls +// `t.Skip` so suites stay green on hosts without `/var/run/docker.sock` +// or `DOCKER_HOST`. +func EnsureDocker(t testing.TB) *DockerEnv { + t.Helper() + dockerOnce.Do(func() { + dockerEnv, dockerErr = openDocker() + }) + if dockerErr != nil { + t.Skipf("rtmanager integration: docker daemon unavailable: %v", dockerErr) + } + return dockerEnv +} + +// EnsureEngineImage builds the `galaxy/game` engine image from the +// workspace root once per package run via `sync.Once`, then tags the +// resulting image at both `EngineImageRef` and `PatchedEngineImageRef` +// so the patch lifecycle has a second semver-valid tag to point at. +// Subsequent calls re-use the cached image. Any test that asks for the +// engine image must invoke this helper first; it is intentionally +// separate from `EnsureDocker` so suites that only need the daemon +// (e.g., a future "Docker network missing" negative test) do not pay +// the build cost. +func EnsureEngineImage(t testing.TB) string { + t.Helper() + env := EnsureDocker(t) + imageOnce.Do(func() { + imageErr = buildAndTagEngineImage(env) + }) + if imageErr != nil { + t.Skipf("rtmanager integration: build galaxy/game image: %v", imageErr) + } + return EngineImageRef +} + +// EnsureNetwork creates a uniquely-named Docker bridge network for the +// caller's test and registers cleanup. Each test gets its own network +// so concurrent scenarios cannot collide on the per-game DNS hostname. 
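+//
+// A typical test pairs the per-test network with the shared engine
+// image before handing both to the runtime under test, for example:
+//
+//	networkName := harness.EnsureNetwork(t)
+//	imageRef := harness.EnsureEngineImage(t)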
+func EnsureNetwork(t testing.TB) string { + t.Helper() + env := EnsureDocker(t) + name := dockerNetworkPrefix + uniqueSuffix(t) + + createCtx, cancel := context.WithTimeout(context.Background(), dockerNetworkTimeout) + defer cancel() + if _, err := env.client.NetworkCreate(createCtx, name, network.CreateOptions{Driver: "bridge"}); err != nil { + t.Fatalf("rtmanager integration: create docker network %q: %v", name, err) + } + t.Cleanup(func() { + removeCtx, removeCancel := context.WithTimeout(context.Background(), dockerNetworkTimeout) + defer removeCancel() + if err := env.client.NetworkRemove(removeCtx, name); err != nil && !cerrdefs.IsNotFound(err) { + t.Logf("rtmanager integration: remove docker network %q: %v", name, err) + } + }) + return name +} + +// ShutdownDocker closes the shared Docker SDK client. `TestMain` +// invokes it after `m.Run`. The harness deliberately leaves the engine +// image in the local Docker cache so the next package run benefits +// from the layer cache; operators can `docker image rm` the +// `*-rtm-it` tags by hand if a stale image gets in the way. +func ShutdownDocker() { + if dockerEnv == nil { + return + } + if dockerEnv.client != nil { + _ = dockerEnv.client.Close() + } + dockerEnv = nil +} + +// uniqueSuffix returns 8 hex characters of randomness suitable for a +// per-test resource name. The same helper is used in +// `internal/adapters/docker/smoke_test.go`; we duplicate it instead of +// importing because `_test.go`-only helpers cannot be exported. +func uniqueSuffix(t testing.TB) string { + t.Helper() + buf := make([]byte, 4) + if _, err := rand.Read(buf); err != nil { + t.Fatalf("rtmanager integration: read random suffix: %v", err) + } + return hex.EncodeToString(buf) +} + +func openDocker() (*DockerEnv, error) { + if os.Getenv("DOCKER_HOST") == "" { + if _, err := os.Stat("/var/run/docker.sock"); err != nil { + return nil, fmt.Errorf("set DOCKER_HOST or expose /var/run/docker.sock: %w", err) + } + } + + client, err := dockerclient.NewClientWithOpts( + dockerclient.FromEnv, + dockerclient.WithAPIVersionNegotiation(), + ) + if err != nil { + return nil, fmt.Errorf("new docker client: %w", err) + } + + pingCtx, cancel := context.WithTimeout(context.Background(), dockerPingTimeout) + defer cancel() + if _, err := client.Ping(pingCtx); err != nil { + _ = client.Close() + return nil, fmt.Errorf("ping docker daemon: %w", err) + } + + root, err := workspaceRoot() + if err != nil { + _ = client.Close() + return nil, fmt.Errorf("resolve workspace root: %w", err) + } + + return &DockerEnv{ + client: client, + workspaceRoot: root, + }, nil +} + +// buildAndTagEngineImage invokes `docker build` against the workspace +// root context to materialise the `galaxy/game` image, then tags the +// resulting image at the patch tag. Shelling out to the CLI keeps the +// implementation tiny — using the SDK would require streaming a tar +// of the workspace root, which is heavy and duplicates what the CLI +// already optimises. The workspace-root build context is required by +// `galaxy/game` (see `galaxy/game/README.md` §Build). 
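+//
+// The shelled-out build is roughly equivalent to running, from the
+// workspace root (the tag step happens through the SDK's ImageTag
+// rather than the CLI):
+//
+//	docker build -f game/Dockerfile -t galaxy/game:1.0.0-rtm-it .
+//	docker tag galaxy/game:1.0.0-rtm-it galaxy/game:1.0.1-rtm-it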
+func buildAndTagEngineImage(env *DockerEnv) error {
+	if env == nil {
+		return errors.New("nil docker env")
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), imageBuildTimeout)
+	defer cancel()
+
+	dockerfilePath := filepath.Join("game", "Dockerfile")
+	cmd := exec.CommandContext(ctx, "docker", "build",
+		"-f", dockerfilePath,
+		"-t", EngineImageRef,
+		".",
+	)
+	cmd.Dir = env.workspaceRoot
+	cmd.Env = append(os.Environ(), "DOCKER_BUILDKIT=1")
+	output, err := cmd.CombinedOutput()
+	if err != nil {
+		return fmt.Errorf("docker build (-f %s) in %s: %w; output:\n%s",
+			dockerfilePath, env.workspaceRoot, err, strings.TrimSpace(string(output)))
+	}
+
+	if err := env.client.ImageTag(ctx, EngineImageRef, PatchedEngineImageRef); err != nil {
+		return fmt.Errorf("tag %s as %s: %w", EngineImageRef, PatchedEngineImageRef, err)
+	}
+	return nil
+}
+
+// workspaceRoot resolves the absolute path of the galaxy/ workspace
+// root by anchoring on this file's location. The harness lives at
+// `galaxy/rtmanager/integration/harness/docker.go`, so the workspace
+// root is three directories up. Mirrors the `cmd/jetgen` strategy.
+func workspaceRoot() (string, error) {
+	_, file, _, ok := runtime.Caller(0)
+	if !ok {
+		return "", errors.New("resolve runtime caller for workspace root")
+	}
+	dir := filepath.Dir(file)
+	// dir = .../galaxy/rtmanager/integration/harness
+	root := filepath.Clean(filepath.Join(dir, "..", "..", ".."))
+	return root, nil
+}
diff --git a/rtmanager/integration/harness/lobbystub.go b/rtmanager/integration/harness/lobbystub.go
new file mode 100644
index 0000000..e02b6d9
--- /dev/null
+++ b/rtmanager/integration/harness/lobbystub.go
@@ -0,0 +1,59 @@
+package harness
+
+import (
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+)
+
+// LobbyStub answers the single Lobby internal request the start
+// service performs ([`internal/adapters/lobbyclient`]). The start
+// service treats this response as ancillary diagnostics — the start
+// envelope already carries `image_ref` — so the stub returns a
+// deterministic 200 OK and lets the runtime ignore the payload.
+//
+// The stub mainly guarantees that the runtime configuration keeps
+// treating the Lobby URL as required (the ancillary fetch cannot
+// silently regress into a no-op); the response body itself is unused
+// by the integration assertions.
+type LobbyStub struct {
+	Server *httptest.Server
+}
+
+// NewLobbyStub starts an httptest.Server wrapped in a LobbyStub and
+// registers `t.Cleanup(server.Close)` itself, so the stub follows the
+// same lifecycle as the rest of the per-test wiring.
+func NewLobbyStub(t testing.TB) *LobbyStub {
+	t.Helper()
+	mux := http.NewServeMux()
+	mux.HandleFunc("GET /api/v1/internal/games/{game_id}", func(w http.ResponseWriter, r *http.Request) {
+		gameID := strings.TrimSpace(r.PathValue("game_id"))
+		if gameID == "" {
+			writeStubError(w, http.StatusBadRequest, "invalid_request", "game_id is required")
+			return
+		}
+		w.Header().Set("Content-Type", "application/json; charset=utf-8")
+		w.WriteHeader(http.StatusOK)
+		_ = json.NewEncoder(w).Encode(map[string]string{
+			"game_id":               gameID,
+			"status":                "running",
+			"target_engine_version": "1.0.0",
+		})
+	})
+	server := httptest.NewServer(mux)
+	t.Cleanup(server.Close)
+	return &LobbyStub{Server: server}
+}
+
+// URL returns the base URL of the running stub. 
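+// NewEnv feeds it into the runtime configuration as the Lobby base URL
+// (`cfg.Lobby.BaseURL`, see buildConfig in runtime.go).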
+func (stub *LobbyStub) URL() string { return stub.Server.URL } + +func writeStubError(w http.ResponseWriter, status int, code, message string) { + w.Header().Set("Content-Type", "application/json; charset=utf-8") + w.WriteHeader(status) + _ = json.NewEncoder(w).Encode(map[string]any{ + "error": map[string]string{"code": code, "message": message}, + }) +} diff --git a/rtmanager/integration/harness/postgres.go b/rtmanager/integration/harness/postgres.go new file mode 100644 index 0000000..605714a --- /dev/null +++ b/rtmanager/integration/harness/postgres.go @@ -0,0 +1,224 @@ +// Package harness exposes the testcontainers / Docker / image-build +// scaffolding shared by the Runtime Manager service-local integration +// suite under [`galaxy/rtmanager/integration`](..). +// +// Only `_test.go` files (and the harness itself) reference this +// package; production code paths in `cmd/rtmanager` never import it. +// The package therefore stays out of the production binary's import +// graph, identical to the in-package `pgtest` and `integration/internal/harness` +// patterns it mirrors. +package harness + +import ( + "context" + "database/sql" + "net/url" + "os" + "sync" + "testing" + "time" + + "galaxy/postgres" + "galaxy/rtmanager/internal/adapters/postgres/migrations" + + testcontainers "github.com/testcontainers/testcontainers-go" + tcpostgres "github.com/testcontainers/testcontainers-go/modules/postgres" + "github.com/testcontainers/testcontainers-go/wait" +) + +const ( + pgImage = "postgres:16-alpine" + pgSuperUser = "galaxy" + pgSuperPassword = "galaxy" + pgSuperDatabase = "galaxy_rtmanager_it" + pgServiceRole = "rtmanagerservice" + pgServicePassword = "rtmanagerservice" + pgServiceSchema = "rtmanager" + pgStartupTimeout = 90 * time.Second + + // pgOperationTimeout bounds the per-statement deadline used by every + // pool the harness opens. Short enough to surface a runaway + // integration test promptly, long enough to absorb laptop-grade I/O. + pgOperationTimeout = 10 * time.Second +) + +// PostgresEnv carries the per-package PostgreSQL fixture. The container +// is started lazily on the first EnsurePostgres call and torn down by +// ShutdownPostgres at TestMain exit. +type PostgresEnv struct { + container *tcpostgres.PostgresContainer + pool *sql.DB + scopedDSN string +} + +// Pool returns the harness-owned `*sql.DB` scoped to the rtmanager +// schema. Tests use it to read durable state directly through the +// existing store adapters. +func (env *PostgresEnv) Pool() *sql.DB { return env.pool } + +// DSN returns the rtmanager-role-scoped DSN suitable for +// `RTMANAGER_POSTGRES_PRIMARY_DSN`. Both this DSN and Pool address the +// same database; the pool is reused across tests, while the runtime +// under test opens its own pool through this DSN. +func (env *PostgresEnv) DSN() string { return env.scopedDSN } + +var ( + pgOnce sync.Once + pgEnv *PostgresEnv + pgErr error +) + +// EnsurePostgres starts the per-package PostgreSQL container on first +// invocation and applies the embedded goose migrations. Subsequent +// invocations reuse the same container. When Docker is unavailable the +// helper calls `t.Skip` so the suite stays green on hosts without a +// daemon (mirrors the contract from `internal/adapters/postgres/internal/pgtest`). 
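+//
+// Shared-container teardown is wired through RunMain, so a suite's
+// TestMain stays a one-liner (as lifecycle_test.go does):
+//
+//	func TestMain(m *testing.M) { harness.RunMain(m) }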
+func EnsurePostgres(t testing.TB) *PostgresEnv { + t.Helper() + pgOnce.Do(func() { + pgEnv, pgErr = startPostgres() + }) + if pgErr != nil { + t.Skipf("rtmanager integration: postgres container start failed (Docker unavailable?): %v", pgErr) + } + return pgEnv +} + +// TruncatePostgres wipes every Runtime Manager table inside the shared +// pool, leaving the schema and indexes intact. Tests call this from +// their setup so each scenario starts on an empty state. +func TruncatePostgres(t testing.TB) { + t.Helper() + env := EnsurePostgres(t) + const stmt = `TRUNCATE TABLE runtime_records, operation_log, health_snapshots RESTART IDENTITY CASCADE` + if _, err := env.pool.ExecContext(context.Background(), stmt); err != nil { + t.Fatalf("truncate rtmanager tables: %v", err) + } +} + +// ShutdownPostgres terminates the shared container and closes the pool. +// `TestMain` invokes it after `m.Run` so the container is released even +// if individual tests panic. +func ShutdownPostgres() { + if pgEnv == nil { + return + } + if pgEnv.pool != nil { + _ = pgEnv.pool.Close() + } + if pgEnv.container != nil { + _ = testcontainers.TerminateContainer(pgEnv.container) + } + pgEnv = nil +} + +// RunMain is a convenience helper for the integration package +// `TestMain`: it runs the suite, captures the exit code, tears every +// shared container down, and exits. Wiring it through one helper keeps +// `TestMain` to two lines and centralises ordering. +func RunMain(m *testing.M) { + code := m.Run() + ShutdownRedis() + ShutdownPostgres() + ShutdownDocker() + os.Exit(code) +} + +func startPostgres() (*PostgresEnv, error) { + ctx := context.Background() + container, err := tcpostgres.Run(ctx, pgImage, + tcpostgres.WithDatabase(pgSuperDatabase), + tcpostgres.WithUsername(pgSuperUser), + tcpostgres.WithPassword(pgSuperPassword), + testcontainers.WithWaitStrategy( + wait.ForLog("database system is ready to accept connections"). + WithOccurrence(2). 
+ WithStartupTimeout(pgStartupTimeout), + ), + ) + if err != nil { + return nil, err + } + baseDSN, err := container.ConnectionString(ctx, "sslmode=disable") + if err != nil { + _ = testcontainers.TerminateContainer(container) + return nil, err + } + if err := provisionRoleAndSchema(ctx, baseDSN); err != nil { + _ = testcontainers.TerminateContainer(container) + return nil, err + } + scopedDSN, err := scopedDSNForRole(baseDSN) + if err != nil { + _ = testcontainers.TerminateContainer(container) + return nil, err + } + cfg := postgres.DefaultConfig() + cfg.PrimaryDSN = scopedDSN + cfg.OperationTimeout = pgOperationTimeout + pool, err := postgres.OpenPrimary(ctx, cfg) + if err != nil { + _ = testcontainers.TerminateContainer(container) + return nil, err + } + if err := postgres.Ping(ctx, pool, pgOperationTimeout); err != nil { + _ = pool.Close() + _ = testcontainers.TerminateContainer(container) + return nil, err + } + if err := postgres.RunMigrations(ctx, pool, migrations.FS(), "."); err != nil { + _ = pool.Close() + _ = testcontainers.TerminateContainer(container) + return nil, err + } + return &PostgresEnv{ + container: container, + pool: pool, + scopedDSN: scopedDSN, + }, nil +} + +func provisionRoleAndSchema(ctx context.Context, baseDSN string) error { + cfg := postgres.DefaultConfig() + cfg.PrimaryDSN = baseDSN + cfg.OperationTimeout = pgOperationTimeout + db, err := postgres.OpenPrimary(ctx, cfg) + if err != nil { + return err + } + defer func() { _ = db.Close() }() + + statements := []string{ + `DO $$ BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'rtmanagerservice') THEN + CREATE ROLE rtmanagerservice LOGIN PASSWORD 'rtmanagerservice'; + END IF; + END $$;`, + `CREATE SCHEMA IF NOT EXISTS rtmanager AUTHORIZATION rtmanagerservice;`, + `GRANT USAGE ON SCHEMA rtmanager TO rtmanagerservice;`, + } + for _, statement := range statements { + if _, err := db.ExecContext(ctx, statement); err != nil { + return err + } + } + return nil +} + +func scopedDSNForRole(baseDSN string) (string, error) { + parsed, err := url.Parse(baseDSN) + if err != nil { + return "", err + } + values := url.Values{} + values.Set("search_path", pgServiceSchema) + values.Set("sslmode", "disable") + scoped := url.URL{ + Scheme: parsed.Scheme, + User: url.UserPassword(pgServiceRole, pgServicePassword), + Host: parsed.Host, + Path: parsed.Path, + RawQuery: values.Encode(), + } + return scoped.String(), nil +} diff --git a/rtmanager/integration/harness/redis.go b/rtmanager/integration/harness/redis.go new file mode 100644 index 0000000..9e28f06 --- /dev/null +++ b/rtmanager/integration/harness/redis.go @@ -0,0 +1,102 @@ +package harness + +import ( + "context" + "sync" + "testing" + + "github.com/redis/go-redis/v9" + testcontainers "github.com/testcontainers/testcontainers-go" + rediscontainer "github.com/testcontainers/testcontainers-go/modules/redis" +) + +const redisImage = "redis:7" + +// RedisEnv carries the per-package Redis fixture. The container is +// started lazily on the first EnsureRedis call and torn down by +// ShutdownRedis at TestMain exit. Both stream consumers and the +// per-game lease store hit this real Redis (miniredis would suffice +// for streams alone, but the lease semantics and eviction-by-TTL we +// rely on in `health_test` are easier to verify against a real +// daemon). +type RedisEnv struct { + container *rediscontainer.RedisContainer + addr string +} + +// Addr returns the externally reachable host:port of the Redis +// container. 
Both the runtime under test and the harness-owned client +// connect through the same endpoint. +func (env *RedisEnv) Addr() string { return env.addr } + +// NewClient opens a fresh `*redis.Client` against the harness Redis. +// Tests close their client through `t.Cleanup`; the harness keeps no +// shared client to avoid cross-test connection-pool surprises. +func (env *RedisEnv) NewClient(t testing.TB) *redis.Client { + t.Helper() + client := redis.NewClient(&redis.Options{Addr: env.addr}) + t.Cleanup(func() { _ = client.Close() }) + return client +} + +var ( + redisOnce sync.Once + redisEnv *RedisEnv + redisErr error +) + +// EnsureRedis starts the per-package Redis container on first +// invocation and returns it. When Docker is unavailable the helper +// calls `t.Skip` so the suite stays green on hosts without a daemon. +func EnsureRedis(t testing.TB) *RedisEnv { + t.Helper() + redisOnce.Do(func() { + redisEnv, redisErr = startRedis() + }) + if redisErr != nil { + t.Skipf("rtmanager integration: redis container start failed (Docker unavailable?): %v", redisErr) + } + return redisEnv +} + +// FlushRedis drops every key on the harness Redis. Tests call it from +// their setup so streams, offset records, and leases from previous +// scenarios do not leak. +func FlushRedis(t testing.TB) { + t.Helper() + env := EnsureRedis(t) + client := redis.NewClient(&redis.Options{Addr: env.addr}) + defer func() { _ = client.Close() }() + if _, err := client.FlushAll(context.Background()).Result(); err != nil { + t.Fatalf("flush rtmanager redis: %v", err) + } +} + +// ShutdownRedis terminates the shared container. `TestMain` invokes it +// after `m.Run`. +func ShutdownRedis() { + if redisEnv == nil { + return + } + if redisEnv.container != nil { + _ = testcontainers.TerminateContainer(redisEnv.container) + } + redisEnv = nil +} + +func startRedis() (*RedisEnv, error) { + ctx := context.Background() + container, err := rediscontainer.Run(ctx, redisImage) + if err != nil { + return nil, err + } + addr, err := container.Endpoint(ctx, "") + if err != nil { + _ = testcontainers.TerminateContainer(container) + return nil, err + } + return &RedisEnv{ + container: container, + addr: addr, + }, nil +} diff --git a/rtmanager/integration/harness/rest.go b/rtmanager/integration/harness/rest.go new file mode 100644 index 0000000..f907b83 --- /dev/null +++ b/rtmanager/integration/harness/rest.go @@ -0,0 +1,195 @@ +package harness + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "strings" + "testing" + "time" +) + +// defaultHTTPClient backs the runtime-readiness poll and the REST +// helpers below. A short timeout is enough — every internal endpoint +// runs against an in-process listener. +var defaultHTTPClient = &http.Client{Timeout: 5 * time.Second} + +// newRequest is a thin shim over `http.NewRequestWithContext` so the +// readiness poll and the REST client share one constructor. +func newRequest(ctx context.Context, method, fullURL string, body io.Reader) (*http.Request, error) { + req, err := http.NewRequestWithContext(ctx, method, fullURL, body) + if err != nil { + return nil, err + } + if body != nil { + req.Header.Set("Content-Type", "application/json; charset=utf-8") + } + req.Header.Set("Accept", "application/json") + req.Header.Set("X-Galaxy-Caller", "admin") + return req, nil +} + +// REST is a tiny client for the trusted internal HTTP surface RTM +// exposes to Game Master and Admin Service. 
It always identifies the +// caller as `admin` (the operation_log records `admin_rest`); tests +// that need GM semantics should add an option later. v1 keeps the +// helper minimal because the integration scenarios only need +// admin-driven flows. +type REST struct { + baseURL string + httpc *http.Client +} + +// NewREST builds a REST client targeting env.InternalAddr. +func NewREST(env *Env) *REST { + return &REST{ + baseURL: "http://" + env.InternalAddr, + httpc: defaultHTTPClient, + } +} + +// Get issues GET path and returns the response body and status code. +func (r *REST) Get(t testing.TB, path string) ([]byte, int) { + t.Helper() + return r.do(t, http.MethodGet, path, nil) +} + +// Post issues POST path with body (a Go value JSON-marshaled). +func (r *REST) Post(t testing.TB, path string, body any) ([]byte, int) { + t.Helper() + return r.do(t, http.MethodPost, path, body) +} + +// Delete issues DELETE path with no body. +func (r *REST) Delete(t testing.TB, path string) ([]byte, int) { + t.Helper() + return r.do(t, http.MethodDelete, path, nil) +} + +// GetRuntime fetches a runtime record by game id and returns the +// decoded payload, the status code, and the raw bytes for diagnostics. +func (r *REST) GetRuntime(t testing.TB, gameID string) (RuntimeRecordResponse, int) { + t.Helper() + body, status := r.Get(t, fmt.Sprintf("/api/v1/internal/runtimes/%s", url.PathEscape(gameID))) + var resp RuntimeRecordResponse + if status == http.StatusOK { + if err := json.Unmarshal(body, &resp); err != nil { + t.Fatalf("decode get-runtime response: %v; body=%s", err, string(body)) + } + } + return resp, status +} + +// StartRuntime invokes the start endpoint with imageRef. +func (r *REST) StartRuntime(t testing.TB, gameID, imageRef string) (RuntimeRecordResponse, int) { + t.Helper() + body, status := r.Post(t, + fmt.Sprintf("/api/v1/internal/runtimes/%s/start", url.PathEscape(gameID)), + map[string]string{"image_ref": imageRef}, + ) + return decodeRecord(t, body, status, "start") +} + +// StopRuntime invokes the stop endpoint with reason. +func (r *REST) StopRuntime(t testing.TB, gameID, reason string) (RuntimeRecordResponse, int) { + t.Helper() + body, status := r.Post(t, + fmt.Sprintf("/api/v1/internal/runtimes/%s/stop", url.PathEscape(gameID)), + map[string]string{"reason": reason}, + ) + return decodeRecord(t, body, status, "stop") +} + +// RestartRuntime invokes the restart endpoint. +func (r *REST) RestartRuntime(t testing.TB, gameID string) (RuntimeRecordResponse, int) { + t.Helper() + body, status := r.Post(t, + fmt.Sprintf("/api/v1/internal/runtimes/%s/restart", url.PathEscape(gameID)), + struct{}{}, + ) + return decodeRecord(t, body, status, "restart") +} + +// PatchRuntime invokes the patch endpoint with imageRef. +func (r *REST) PatchRuntime(t testing.TB, gameID, imageRef string) (RuntimeRecordResponse, int) { + t.Helper() + body, status := r.Post(t, + fmt.Sprintf("/api/v1/internal/runtimes/%s/patch", url.PathEscape(gameID)), + map[string]string{"image_ref": imageRef}, + ) + return decodeRecord(t, body, status, "patch") +} + +// CleanupRuntime invokes the DELETE container endpoint. +func (r *REST) CleanupRuntime(t testing.TB, gameID string) (RuntimeRecordResponse, int) { + t.Helper() + body, status := r.Delete(t, + fmt.Sprintf("/api/v1/internal/runtimes/%s/container", url.PathEscape(gameID)), + ) + return decodeRecord(t, body, status, "cleanup") +} + +// RuntimeRecordResponse mirrors the OpenAPI RuntimeRecord schema. 
Only +// the fields integration scenarios assert against live here; the +// listener encodes everything else. +type RuntimeRecordResponse struct { + GameID string `json:"game_id"` + Status string `json:"status"` + CurrentContainerID *string `json:"current_container_id"` + CurrentImageRef *string `json:"current_image_ref"` + EngineEndpoint *string `json:"engine_endpoint"` + StatePath string `json:"state_path"` + DockerNetwork string `json:"docker_network"` + StartedAt *string `json:"started_at"` + StoppedAt *string `json:"stopped_at"` + RemovedAt *string `json:"removed_at"` + LastOpAt string `json:"last_op_at"` + CreatedAt string `json:"created_at"` +} + +func (r *REST) do(t testing.TB, method, path string, body any) ([]byte, int) { + t.Helper() + var reader io.Reader + if body != nil { + raw, err := json.Marshal(body) + if err != nil { + t.Fatalf("marshal request body: %v", err) + } + reader = bytes.NewReader(raw) + } + req, err := newRequest(context.Background(), method, r.baseURL+path, reader) + if err != nil { + t.Fatalf("build %s %s request: %v", method, path, err) + } + resp, err := r.httpc.Do(req) + if err != nil { + t.Fatalf("execute %s %s: %v", method, path, err) + } + defer resp.Body.Close() + raw, err := io.ReadAll(resp.Body) + if err != nil { + t.Fatalf("read %s %s response: %v", method, path, err) + } + return raw, resp.StatusCode +} + +func decodeRecord(t testing.TB, body []byte, status int, op string) (RuntimeRecordResponse, int) { + t.Helper() + if status != http.StatusOK { + return RuntimeRecordResponse{}, status + } + var resp RuntimeRecordResponse + if err := json.Unmarshal(body, &resp); err != nil { + t.Fatalf("decode %s response: %v; body=%s", op, err, string(body)) + } + return resp, status +} + +// PathEscape is a re-export so test files can call it without +// importing `net/url` directly. Keeps the test source focused on +// scenarios. +func PathEscape(value string) string { return url.PathEscape(strings.TrimSpace(value)) } diff --git a/rtmanager/integration/harness/runtime.go b/rtmanager/integration/harness/runtime.go new file mode 100644 index 0000000..a068c00 --- /dev/null +++ b/rtmanager/integration/harness/runtime.go @@ -0,0 +1,398 @@ +package harness + +import ( + "context" + "errors" + "io" + "log/slog" + "net/url" + "os" + "strconv" + "strings" + "sync" + "testing" + "time" + + "galaxy/postgres" + "galaxy/redisconn" + "galaxy/rtmanager/internal/app" + "galaxy/rtmanager/internal/config" + + "github.com/redis/go-redis/v9" +) + +// Default stream key shapes used by the integration suite. They match +// the production defaults so the wire shapes asserted in `streams.go` +// are identical to what Game Lobby sees in `integration/lobbyrtm`. +const ( + StartJobsStream = "runtime:start_jobs" + StopJobsStream = "runtime:stop_jobs" + JobResultsStream = "runtime:job_results" + HealthEventsStream = "runtime:health_events" + NotificationIntentsKey = "notification:intents" + gameStateRootSubdir = "game-state" + listenAddr = "127.0.0.1:0" + listenerWaitTimeout = 10 * time.Second + readyzPollInterval = 25 * time.Millisecond + cleanupShutdownTimeout = 30 * time.Second +) + +// Env carries everything one integration scenario needs to drive the +// Runtime Manager process. The struct is value-typed so tests reach +// fields without intermediate getters. +type Env struct { + // Cfg is the resolved Runtime Manager configuration handed to + // `app.NewRuntime`. Tests inspect it for stream key shapes, + // container defaults, and timeout knobs. 
+	Cfg config.Config
+
+	// Runtime is the in-process Runtime Manager exposed for tests that
+	// need to peek at internal state (`runtime.InternalServer().Addr()`).
+	Runtime *app.Runtime
+
+	// Postgres holds the per-package PostgreSQL fixture.
+	Postgres *PostgresEnv
+
+	// Redis holds the per-package Redis fixture plus a fresh client the
+	// test owns.
+	Redis       *RedisEnv
+	RedisClient *redis.Client
+
+	// Docker holds the per-package Docker daemon handle.
+	Docker *DockerEnv
+
+	// Lobby is the per-test stub HTTP server.
+	Lobby *LobbyStub
+
+	// Network is the unique Docker network name created for this test.
+	Network string
+
+	// EngineImageRef and PatchedImageRef are the two semver-compatible
+	// engine image tags the harness builds once per package. Patch
+	// scenarios point at the second tag.
+	EngineImageRef  string
+	PatchedImageRef string
+
+	// GameStateRoot is the host filesystem path RTM writes per-game
+	// state directories under. It lives inside `t.ArtifactDir()` so
+	// failed scenarios leave the engine state behind for inspection.
+	GameStateRoot string
+
+	// InternalAddr is the bound address of RTM's internal HTTP listener
+	// (resolved after Run binds the port).
+	InternalAddr string
+}
+
+// EnvOptions carries per-test overrides to the harness defaults. Empty
+// fields fall back to the defaults declared at the top of this file.
+type EnvOptions struct {
+	// ReconcileInterval overrides the periodic reconciler interval.
+	// Default 500ms (so reconcile drift is observable inside a single
+	// scenario timeout).
+	ReconcileInterval time.Duration
+
+	// CleanupInterval overrides the container-cleanup interval.
+	CleanupInterval time.Duration
+
+	// InspectInterval overrides the Docker inspect worker interval.
+	InspectInterval time.Duration
+
+	// ProbeInterval / ProbeTimeout / ProbeFailuresThreshold override
+	// the active engine probe knobs.
+	ProbeInterval          time.Duration
+	ProbeTimeout           time.Duration
+	ProbeFailuresThreshold int
+
+	// GameLeaseTTL overrides the per-game Redis lease TTL.
+	GameLeaseTTL time.Duration
+
+	// StreamBlockTimeout overrides the consumer XREAD block window.
+	StreamBlockTimeout time.Duration
+
+	// LogToStderr makes the harness write the runtime's structured
+	// logs to stderr; the default discards them so test output stays
+	// focused on assertions.
+	LogToStderr bool
+}
+
+// NewEnv stands up a fresh Runtime Manager process for the calling
+// test. It blocks until the internal HTTP listener is bound and
+// `/readyz` reports ready; tests can issue REST and stream requests
+// immediately after the call returns.
+//
+// `t.Cleanup` handlers run in reverse registration order: close the
+// per-test redis client, stop and close the runtime, shut down the
+// lobby stub, then remove the docker network. Containers RTM created
+// during the test are removed by the test's own cleanup paths or by
+// the integration `health_test` external-action helpers.
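+//
+// A minimal scenario sketch, as called from the integration_test
+// package (the lifecycle tests follow this shape):
+//
+//	env := harness.NewEnv(t, harness.EnvOptions{})
+//	rest := harness.NewREST(env)
+//	gameID := harness.IDFromTestName(t)
+//	harness.XAddStartJob(t, env, gameID, env.EngineImageRef)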
+func NewEnv(t *testing.T, opts EnvOptions) *Env { + t.Helper() + + pg := EnsurePostgres(t) + rd := EnsureRedis(t) + dk := EnsureDocker(t) + imageRef := EnsureEngineImage(t) + TruncatePostgres(t) + FlushRedis(t) + network := EnsureNetwork(t) + lobby := NewLobbyStub(t) + stateRoot := stateRoot(t) + + cfg := buildConfig(buildConfigInput{ + PostgresDSN: pg.DSN(), + RedisAddr: rd.Addr(), + DockerHost: resolveDockerHost(), + Network: network, + LobbyURL: lobby.URL(), + GameStateRoot: stateRoot, + ReconcileInterval: pickDuration(opts.ReconcileInterval, 500*time.Millisecond), + CleanupInterval: pickDuration(opts.CleanupInterval, 500*time.Millisecond), + InspectInterval: pickDuration(opts.InspectInterval, 500*time.Millisecond), + ProbeInterval: pickDuration(opts.ProbeInterval, 500*time.Millisecond), + ProbeTimeout: pickDuration(opts.ProbeTimeout, time.Second), + ProbeFailures: pickInt(opts.ProbeFailuresThreshold, 2), + GameLeaseTTL: pickDuration(opts.GameLeaseTTL, 5*time.Second), + StreamBlockTimeout: pickDuration(opts.StreamBlockTimeout, 200*time.Millisecond), + }) + + logger := newLogger(opts.LogToStderr) + + ctx, cancel := context.WithCancel(context.Background()) + + runtime, err := app.NewRuntime(ctx, cfg, logger) + if err != nil { + cancel() + t.Fatalf("rtmanager integration: new runtime: %v", err) + } + + runDone := make(chan error, 1) + go func() { + runDone <- runtime.Run(ctx) + }() + + internalAddr := waitForListener(t, runtime) + waitForReady(t, runtime, listenerWaitTimeout) + + var cleanupOnce sync.Once + t.Cleanup(func() { + cleanupOnce.Do(func() { + cancel() + waitCtx, waitCancel := context.WithTimeout(context.Background(), cleanupShutdownTimeout) + defer waitCancel() + select { + case err := <-runDone: + if err != nil && !isCleanShutdownErr(err) { + t.Logf("rtmanager integration: runtime.Run returned: %v", err) + } + case <-waitCtx.Done(): + t.Logf("rtmanager integration: runtime did not stop within %s", cleanupShutdownTimeout) + } + if err := runtime.Close(); err != nil { + t.Logf("rtmanager integration: runtime.Close: %v", err) + } + }) + }) + + return &Env{ + Cfg: cfg, + Runtime: runtime, + Postgres: pg, + Redis: rd, + RedisClient: rd.NewClient(t), + Docker: dk, + Lobby: lobby, + Network: network, + EngineImageRef: imageRef, + PatchedImageRef: PatchedEngineImageRef, + GameStateRoot: stateRoot, + InternalAddr: internalAddr, + } +} + +type buildConfigInput struct { + PostgresDSN string + RedisAddr string + DockerHost string + Network string + LobbyURL string + GameStateRoot string + ReconcileInterval time.Duration + CleanupInterval time.Duration + InspectInterval time.Duration + ProbeInterval time.Duration + ProbeTimeout time.Duration + ProbeFailures int + GameLeaseTTL time.Duration + StreamBlockTimeout time.Duration +} + +func buildConfig(in buildConfigInput) config.Config { + cfg := config.DefaultConfig() + cfg.InternalHTTP.Addr = listenAddr + + cfg.Docker.Host = in.DockerHost + cfg.Docker.Network = in.Network + cfg.Docker.PullPolicy = config.ImagePullPolicyIfMissing + + cfg.Postgres = config.PostgresConfig{ + Conn: postgres.Config{ + PrimaryDSN: in.PostgresDSN, + OperationTimeout: pgOperationTimeout, + MaxOpenConns: 5, + MaxIdleConns: 2, + ConnMaxLifetime: 30 * time.Minute, + }, + } + + cfg.Redis = config.RedisConfig{ + Conn: redisconn.Config{ + MasterAddr: in.RedisAddr, + Password: "integration", + OperationTimeout: 2 * time.Second, + }, + } + + cfg.Streams.StartJobs = StartJobsStream + cfg.Streams.StopJobs = StopJobsStream + cfg.Streams.JobResults = JobResultsStream + 
cfg.Streams.HealthEvents = HealthEventsStream + cfg.Streams.NotificationIntents = NotificationIntentsKey + cfg.Streams.BlockTimeout = in.StreamBlockTimeout + + cfg.Container.GameStateRoot = in.GameStateRoot + // Pin chown target to the current process uid/gid; the dev sandbox + // (and unprivileged dev machines) cannot chown to root. + cfg.Container.GameStateOwnerUID = os.Getuid() + cfg.Container.GameStateOwnerGID = os.Getgid() + + cfg.Health.InspectInterval = in.InspectInterval + cfg.Health.ProbeInterval = in.ProbeInterval + cfg.Health.ProbeTimeout = in.ProbeTimeout + cfg.Health.ProbeFailuresThreshold = in.ProbeFailures + + cfg.Cleanup.ReconcileInterval = in.ReconcileInterval + cfg.Cleanup.CleanupInterval = in.CleanupInterval + + cfg.Coordination.GameLeaseTTL = in.GameLeaseTTL + + cfg.Lobby = config.LobbyConfig{ + BaseURL: in.LobbyURL, + Timeout: 2 * time.Second, + } + + cfg.Telemetry.TracesExporter = "none" + cfg.Telemetry.MetricsExporter = "none" + + return cfg +} + +func newLogger(toStderr bool) *slog.Logger { + if toStderr { + return slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelDebug})) + } + return slog.New(slog.NewTextHandler(io.Discard, &slog.HandlerOptions{Level: slog.LevelError})) +} + +func stateRoot(t *testing.T) string { + t.Helper() + dir := t.ArtifactDir() + root := dir + string(os.PathSeparator) + gameStateRootSubdir + if err := os.MkdirAll(root, 0o755); err != nil { + t.Fatalf("rtmanager integration: create game-state root %q: %v", root, err) + } + return root +} + +func resolveDockerHost() string { + if host := strings.TrimSpace(os.Getenv("DOCKER_HOST")); host != "" { + return host + } + return "unix:///var/run/docker.sock" +} + +func pickDuration(value, fallback time.Duration) time.Duration { + if value > 0 { + return value + } + return fallback +} + +func pickInt(value, fallback int) int { + if value > 0 { + return value + } + return fallback +} + +// waitForListener spins until `runtime.InternalServer().Addr()` returns +// a non-empty value or the deadline fires. The internal listener binds +// during `runtime.Run`, which runs in its own goroutine; this helper +// is the bridge between "Run started" and "tests can use REST". +func waitForListener(t *testing.T, runtime *app.Runtime) string { + t.Helper() + deadline := time.Now().Add(listenerWaitTimeout) + for { + if runtime != nil && runtime.InternalServer() != nil { + if addr := runtime.InternalServer().Addr(); addr != "" { + return addr + } + } + if time.Now().After(deadline) { + t.Fatalf("rtmanager integration: internal HTTP listener did not bind within %s", listenerWaitTimeout) + } + time.Sleep(readyzPollInterval) + } +} + +// waitForReady polls `/readyz` until it returns 200 or the deadline +// fires. RTM's readyz pings PG, Redis, and Docker; a successful +// response means every dependency is reachable through the runtime +// process. 
+func waitForReady(t *testing.T, runtime *app.Runtime, timeout time.Duration) {
+	t.Helper()
+	deadline := time.Now().Add(timeout)
+	addr := runtime.InternalServer().Addr()
+	probeURL := (&url.URL{Scheme: "http", Host: addr, Path: "/readyz"}).String()
+	for {
+		req, err := newRequest(context.Background(), "GET", probeURL, nil)
+		if err == nil {
+			resp, err := defaultHTTPClient.Do(req)
+			if err == nil {
+				_, _ = io.Copy(io.Discard, resp.Body)
+				_ = resp.Body.Close()
+				if resp.StatusCode == 200 {
+					return
+				}
+			}
+		}
+		if time.Now().After(deadline) {
+			t.Fatalf("rtmanager integration: /readyz did not return 200 within %s", timeout)
+		}
+		time.Sleep(readyzPollInterval)
+	}
+}
+
+func isCleanShutdownErr(err error) bool {
+	return err == nil || errors.Is(err, context.Canceled)
+}
+
+// IDFromTestName builds a unique game id derived from the caller's
+// test name. Two tests with the same name running back-to-back would
+// otherwise collide on PG state through the per-test
+// `TruncatePostgres` window; pinning the suffix to `Now().UnixNano()`
+// rules that out.
+func IDFromTestName(t *testing.T) string {
+	t.Helper()
+	// The container hostname is `galaxy-game-{game_id}` and must fit
+	// HOST_NAME_MAX=64 chars; runc rejects longer values with
+	// "sethostname: invalid argument". Cap the lowercased test-name
+	// component at 35 chars; the base36 `UnixNano` suffix adds roughly
+	// a dozen more, so the total (12 + 35 + 1 + ~12 ≈ 60) stays under
+	// the limit.
+	const maxNameLen = 35
+	suffix := strconv.FormatInt(time.Now().UnixNano(), 36)
+	prefix := strings.ToLower(strings.NewReplacer("/", "-", " ", "-").Replace(t.Name()))
+	if len(prefix) > maxNameLen {
+		prefix = prefix[:maxNameLen]
+	}
+	return prefix + "-" + suffix
+}
diff --git a/rtmanager/integration/harness/store.go b/rtmanager/integration/harness/store.go
new file mode 100644
index 0000000..9b795f1
--- /dev/null
+++ b/rtmanager/integration/harness/store.go
@@ -0,0 +1,128 @@
+package harness
+
+import (
+	"context"
+	"errors"
+	"testing"
+	"time"
+
+	"galaxy/rtmanager/internal/adapters/postgres/healthsnapshotstore"
+	"galaxy/rtmanager/internal/adapters/postgres/operationlogstore"
+	"galaxy/rtmanager/internal/adapters/postgres/runtimerecordstore"
+	"galaxy/rtmanager/internal/domain/health"
+	"galaxy/rtmanager/internal/domain/operation"
+	"galaxy/rtmanager/internal/domain/runtime"
+
+	"github.com/stretchr/testify/require"
+)
+
+// RuntimeRecord returns the persisted runtime record for gameID. The
+// helper opens the store on every call (cheap; the harness `*sql.DB`
+// is shared) so individual scenarios stay isolated even if a previous
+// test mutated store state.
+func RuntimeRecord(t testing.TB, env *Env, gameID string) (runtime.RuntimeRecord, error) {
+	t.Helper()
+	store, err := runtimerecordstore.New(runtimerecordstore.Config{
+		DB:               env.Postgres.Pool(),
+		OperationTimeout: pgOperationTimeout,
+	})
+	require.NoError(t, err)
+	return store.Get(context.Background(), gameID)
+}
+
+// MustRuntimeRecord asserts that the record exists and returns it.
+func MustRuntimeRecord(t testing.TB, env *Env, gameID string) runtime.RuntimeRecord {
+	t.Helper()
+	record, err := RuntimeRecord(t, env, gameID)
+	require.NoErrorf(t, err, "load runtime record for %s", gameID)
+	return record
+}
+
+// EventuallyRuntimeRecord polls until predicate matches the runtime
+// record for gameID, or the deadline fires. Returns the matching
+// record. 
Used by lifecycle assertions that depend on async state +// transitions (start consumer → record). +func EventuallyRuntimeRecord(t testing.TB, env *Env, gameID string, predicate func(runtime.RuntimeRecord) bool, timeout time.Duration) runtime.RuntimeRecord { + t.Helper() + if timeout <= 0 { + timeout = defaultStreamTimeout + } + deadline := time.Now().Add(timeout) + for { + record, err := RuntimeRecord(t, env, gameID) + if err == nil && predicate(record) { + return record + } + if err != nil && !errors.Is(err, runtime.ErrNotFound) { + t.Fatalf("rtmanager integration: load runtime record: %v", err) + } + if time.Now().After(deadline) { + if err != nil { + t.Fatalf("rtmanager integration: runtime record predicate not met within %s; last err=%v", + timeout, err) + } + t.Fatalf("rtmanager integration: runtime record predicate not met within %s; last record=%+v", + timeout, record) + } + time.Sleep(defaultStreamPoll) + } +} + +// OperationEntries returns up to `limit` most-recent operation_log +// entries for gameID, ordered descending by started_at. +func OperationEntries(t testing.TB, env *Env, gameID string, limit int) []operation.OperationEntry { + t.Helper() + store, err := operationlogstore.New(operationlogstore.Config{ + DB: env.Postgres.Pool(), + OperationTimeout: pgOperationTimeout, + }) + require.NoError(t, err) + entries, err := store.ListByGame(context.Background(), gameID, limit) + require.NoErrorf(t, err, "list operation log entries for %s", gameID) + return entries +} + +// EventuallyOperationKind polls operation_log until at least one entry +// for gameID has the requested kind, or the deadline fires. Returns +// the matching entry. +func EventuallyOperationKind(t testing.TB, env *Env, gameID string, kind operation.OpKind, timeout time.Duration) operation.OperationEntry { + t.Helper() + if timeout <= 0 { + timeout = defaultStreamTimeout + } + deadline := time.Now().Add(timeout) + for { + entries := OperationEntries(t, env, gameID, 50) + for _, entry := range entries { + if entry.OpKind == kind { + return entry + } + } + if time.Now().After(deadline) { + t.Fatalf("rtmanager integration: operation_log entry with op_kind=%s not seen within %s; observed=%v", + kind, timeout, opKindSummary(entries)) + } + time.Sleep(defaultStreamPoll) + } +} + +// HealthSnapshot returns the latest persisted health snapshot for +// gameID, or the underlying not-found sentinel when nothing has been +// recorded yet. +func HealthSnapshot(t testing.TB, env *Env, gameID string) (health.HealthSnapshot, error) { + t.Helper() + store, err := healthsnapshotstore.New(healthsnapshotstore.Config{ + DB: env.Postgres.Pool(), + OperationTimeout: pgOperationTimeout, + }) + require.NoError(t, err) + return store.Get(context.Background(), gameID) +} + +func opKindSummary(entries []operation.OperationEntry) []string { + out := make([]string, 0, len(entries)) + for _, entry := range entries { + out = append(out, string(entry.OpKind)+"/"+string(entry.Outcome)) + } + return out +} diff --git a/rtmanager/integration/harness/streams.go b/rtmanager/integration/harness/streams.go new file mode 100644 index 0000000..e6ba4ac --- /dev/null +++ b/rtmanager/integration/harness/streams.go @@ -0,0 +1,334 @@ +package harness + +import ( + "context" + "encoding/json" + "fmt" + "strconv" + "strings" + "testing" + "time" + + "galaxy/rtmanager/internal/ports" + + "github.com/redis/go-redis/v9" + "github.com/stretchr/testify/require" +) + +// Default scenario timeouts. 
Stream-driven assertions sit on top of +// the runtime's worker tickers (defaults of 200-500ms in +// `EnvOptions`); 30s gives every reconcile / probe / event tick more +// than enough headroom even on a slow CI runner. +const ( + defaultStreamTimeout = 30 * time.Second + defaultStreamPoll = 25 * time.Millisecond +) + +// XAddStartJob appends one start-job entry in the +// `runtime:start_jobs` AsyncAPI shape and returns the assigned entry +// id. Mirrors the wire shape produced by Lobby's +// `runtimemanager.Publisher` so the consumer treats the entry exactly +// like a real Lobby-published job. +func XAddStartJob(t testing.TB, env *Env, gameID, imageRef string) string { + t.Helper() + id, err := env.RedisClient.XAdd(context.Background(), &redis.XAddArgs{ + Stream: env.Cfg.Streams.StartJobs, + Values: map[string]any{ + "game_id": gameID, + "image_ref": imageRef, + "requested_at_ms": time.Now().UTC().UnixMilli(), + }, + }).Result() + require.NoErrorf(t, err, "xadd start_jobs for game %s", gameID) + return id +} + +// XAddStopJob appends one stop-job entry classified by reason. The +// reason enum is documented at `ports.StopReason`. +func XAddStopJob(t testing.TB, env *Env, gameID, reason string) string { + t.Helper() + id, err := env.RedisClient.XAdd(context.Background(), &redis.XAddArgs{ + Stream: env.Cfg.Streams.StopJobs, + Values: map[string]any{ + "game_id": gameID, + "reason": reason, + "requested_at_ms": time.Now().UTC().UnixMilli(), + }, + }).Result() + require.NoErrorf(t, err, "xadd stop_jobs for game %s", gameID) + return id +} + +// JobResultEntry is the decoded shape of one `runtime:job_results` +// stream entry. Mirrors `ports.JobResult` plus the entry id surfaced +// by Redis so tests can correlate XADD ids with results. +type JobResultEntry struct { + StreamID string + GameID string + Outcome string + ContainerID string + EngineEndpoint string + ErrorCode string + ErrorMessage string +} + +// HealthEventEntry mirrors the `runtime:health_events` AsyncAPI shape +// in decoded form. +type HealthEventEntry struct { + StreamID string + GameID string + ContainerID string + EventType string + OccurredAtMs int64 + Details map[string]any +} + +// NotificationIntentEntry decodes one `notification:intents` entry +// that RTM publishes for first-touch start failures. +type NotificationIntentEntry struct { + StreamID string + NotificationType string + IdempotencyKey string + Payload map[string]any +} + +// WaitForJobResult polls `runtime:job_results` until predicate +// matches, or the timeout fires. Returns the matching entry. The +// helper does not consume the stream — every call rescans from `0-0` +// — because RTM's writes are append-only and the cardinality per test +// is small. 
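+//
+// Typical use together with the predicate helpers below (a sketch):
+//
+//	result := WaitForJobResult(t, env, JobOutcomeIs(gameID, JobOutcomeSuccess), 30*time.Second)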
+func WaitForJobResult(t testing.TB, env *Env, predicate func(JobResultEntry) bool, timeout time.Duration) JobResultEntry { + t.Helper() + if timeout <= 0 { + timeout = defaultStreamTimeout + } + deadline := time.Now().Add(timeout) + for { + entries, err := env.RedisClient.XRange(context.Background(), env.Cfg.Streams.JobResults, "-", "+").Result() + require.NoErrorf(t, err, "xrange %s", env.Cfg.Streams.JobResults) + for _, entry := range entries { + decoded := decodeJobResult(entry) + if predicate(decoded) { + return decoded + } + } + if time.Now().After(deadline) { + t.Fatalf("rtmanager integration: no job_result matched within %s; observed=%v", + timeout, jobResultStreamSummary(entries)) + } + time.Sleep(defaultStreamPoll) + } +} + +// AllJobResults returns every entry on `runtime:job_results` in stream +// order. Useful for assertions that depend on cardinality (replay +// tests). +func AllJobResults(t testing.TB, env *Env) []JobResultEntry { + t.Helper() + entries, err := env.RedisClient.XRange(context.Background(), env.Cfg.Streams.JobResults, "-", "+").Result() + require.NoErrorf(t, err, "xrange %s", env.Cfg.Streams.JobResults) + out := make([]JobResultEntry, 0, len(entries)) + for _, entry := range entries { + out = append(out, decodeJobResult(entry)) + } + return out +} + +// WaitForHealthEvent polls `runtime:health_events` until predicate +// matches, or the timeout fires. +func WaitForHealthEvent(t testing.TB, env *Env, predicate func(HealthEventEntry) bool, timeout time.Duration) HealthEventEntry { + t.Helper() + if timeout <= 0 { + timeout = defaultStreamTimeout + } + deadline := time.Now().Add(timeout) + for { + entries, err := env.RedisClient.XRange(context.Background(), env.Cfg.Streams.HealthEvents, "-", "+").Result() + require.NoErrorf(t, err, "xrange %s", env.Cfg.Streams.HealthEvents) + for _, entry := range entries { + decoded := decodeHealthEvent(t, entry) + if predicate(decoded) { + return decoded + } + } + if time.Now().After(deadline) { + t.Fatalf("rtmanager integration: no health_event matched within %s; observed=%v", + timeout, healthEventStreamSummary(entries)) + } + time.Sleep(defaultStreamPoll) + } +} + +// WaitForNotificationIntent polls `notification:intents` until +// predicate matches. +func WaitForNotificationIntent(t testing.TB, env *Env, predicate func(NotificationIntentEntry) bool, timeout time.Duration) NotificationIntentEntry { + t.Helper() + if timeout <= 0 { + timeout = defaultStreamTimeout + } + deadline := time.Now().Add(timeout) + for { + entries, err := env.RedisClient.XRange(context.Background(), env.Cfg.Streams.NotificationIntents, "-", "+").Result() + require.NoErrorf(t, err, "xrange %s", env.Cfg.Streams.NotificationIntents) + for _, entry := range entries { + decoded := decodeNotificationIntent(t, entry) + if predicate(decoded) { + return decoded + } + } + if time.Now().After(deadline) { + t.Fatalf("rtmanager integration: no notification_intent matched within %s; observed=%v", + timeout, notificationStreamSummary(entries)) + } + time.Sleep(defaultStreamPoll) + } +} + +// JobOutcomeIs returns a predicate matching a job result whose game id +// and outcome equal the inputs. +func JobOutcomeIs(gameID, outcome string) func(JobResultEntry) bool { + return func(entry JobResultEntry) bool { + return entry.GameID == gameID && entry.Outcome == outcome + } +} + +// JobOutcomeWithErrorCode matches a job result whose game id, outcome, +// and error_code all equal the inputs. Used by replay-no-op +// assertions. 
+func JobOutcomeWithErrorCode(gameID, outcome, errorCode string) func(JobResultEntry) bool { + return func(entry JobResultEntry) bool { + return entry.GameID == gameID && entry.Outcome == outcome && entry.ErrorCode == errorCode + } +} + +// HealthEventTypeIs returns a predicate matching a health event whose +// game id and event_type equal the inputs. +func HealthEventTypeIs(gameID, eventType string) func(HealthEventEntry) bool { + return func(entry HealthEventEntry) bool { + return entry.GameID == gameID && entry.EventType == eventType + } +} + +func decodeJobResult(message redis.XMessage) JobResultEntry { + return JobResultEntry{ + StreamID: message.ID, + GameID: streamString(message.Values, "game_id"), + Outcome: streamString(message.Values, "outcome"), + ContainerID: streamString(message.Values, "container_id"), + EngineEndpoint: streamString(message.Values, "engine_endpoint"), + ErrorCode: streamString(message.Values, "error_code"), + ErrorMessage: streamString(message.Values, "error_message"), + } +} + +func decodeHealthEvent(t testing.TB, message redis.XMessage) HealthEventEntry { + t.Helper() + occurredAt, _ := strconv.ParseInt(streamString(message.Values, "occurred_at_ms"), 10, 64) + entry := HealthEventEntry{ + StreamID: message.ID, + GameID: streamString(message.Values, "game_id"), + ContainerID: streamString(message.Values, "container_id"), + EventType: streamString(message.Values, "event_type"), + OccurredAtMs: occurredAt, + } + rawDetails := streamString(message.Values, "details") + if rawDetails != "" { + var parsed map[string]any + if err := json.Unmarshal([]byte(rawDetails), &parsed); err == nil { + entry.Details = parsed + } + } + return entry +} + +func decodeNotificationIntent(t testing.TB, message redis.XMessage) NotificationIntentEntry { + t.Helper() + entry := NotificationIntentEntry{ + StreamID: message.ID, + NotificationType: streamString(message.Values, "notification_type"), + IdempotencyKey: streamString(message.Values, "idempotency_key"), + } + rawPayload := streamString(message.Values, "payload_json") + if rawPayload == "" { + rawPayload = streamString(message.Values, "payload") + } + if rawPayload != "" { + var parsed map[string]any + if err := json.Unmarshal([]byte(rawPayload), &parsed); err == nil { + entry.Payload = parsed + } + } + return entry +} + +func streamString(values map[string]any, key string) string { + raw, ok := values[key] + if !ok { + return "" + } + switch typed := raw.(type) { + case string: + return typed + case []byte: + return string(typed) + default: + return fmt.Sprintf("%v", typed) + } +} + +func jobResultStreamSummary(entries []redis.XMessage) []string { + out := make([]string, 0, len(entries)) + for _, entry := range entries { + decoded := decodeJobResult(entry) + out = append(out, fmt.Sprintf("%s game=%s outcome=%s err=%s", + decoded.StreamID, decoded.GameID, decoded.Outcome, decoded.ErrorCode)) + } + return out +} + +func healthEventStreamSummary(entries []redis.XMessage) []string { + out := make([]string, 0, len(entries)) + for _, entry := range entries { + out = append(out, fmt.Sprintf("%s %s %s", + entry.ID, streamString(entry.Values, "game_id"), streamString(entry.Values, "event_type"))) + } + return out +} + +func notificationStreamSummary(entries []redis.XMessage) []string { + out := make([]string, 0, len(entries)) + for _, entry := range entries { + out = append(out, fmt.Sprintf("%s %s", + entry.ID, streamString(entry.Values, "notification_type"))) + } + return out +} + +// EnsureJobOutcomeConstants pins the constants from 
`ports` so suite +// authors can build predicates without importing `ports` themselves. +// Re-exported here to keep test source focused. +var ( + JobOutcomeSuccess = ports.JobOutcomeSuccess + JobOutcomeFailure = ports.JobOutcomeFailure +) + +// AssertNoJobResultBeyond fails the test if the count of entries on +// `runtime:job_results` exceeds `expectedCount`. Used by the replay +// tests to prove the second envelope was no-op. +func AssertNoJobResultBeyond(t testing.TB, env *Env, expectedCount int) { + t.Helper() + entries, err := env.RedisClient.XLen(context.Background(), env.Cfg.Streams.JobResults).Result() + require.NoError(t, err) + require.LessOrEqualf(t, entries, int64(expectedCount), + "job_results stream has more entries than expected; got=%d expected<=%d", entries, expectedCount) +} + +// SanitizeContainerSummaryFor returns a stable diagnostic string for a +// container summary keyed by game id. Used in test failures. +func SanitizeContainerSummaryFor(values map[string]string, gameID string) string { + parts := make([]string, 0, len(values)) + for key, value := range values { + parts = append(parts, key+"="+value) + } + return fmt.Sprintf("game=%s {%s}", gameID, strings.Join(parts, ", ")) +} diff --git a/rtmanager/integration/lifecycle_test.go b/rtmanager/integration/lifecycle_test.go new file mode 100644 index 0000000..88e408c --- /dev/null +++ b/rtmanager/integration/lifecycle_test.go @@ -0,0 +1,303 @@ +//go:build integration + +// Package integration_test owns the service-local end-to-end scenarios +// for Runtime Manager. The build tag keeps the suite out of the +// default `go test ./...` run; CI invokes the suite explicitly with +// `go test -tags=integration ./rtmanager/integration/...`. +// +// Design rationale for the suite — build tag, in-process harness, +// per-test isolation, two-tag engine image — lives in +// `rtmanager/docs/integration-tests.md`. Each test stands up its own +// Runtime Manager process via `harness.NewEnv`, drives the same +// streams Game Lobby uses in `integration/lobbyrtm`, and asserts the +// resulting PostgreSQL, Redis-stream, and Docker side-effects. +package integration_test + +import ( + "context" + "net/http" + "testing" + "time" + + "galaxy/rtmanager/integration/harness" + "galaxy/rtmanager/internal/domain/operation" + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/ports" + + "github.com/docker/docker/api/types/container" + "github.com/docker/docker/api/types/filters" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestMain centralises shared-container teardown so individual +// failing tests do not leak the testcontainers postgres / redis pair. +func TestMain(m *testing.M) { + harness.RunMain(m) +} + +// TestLifecycle_StartInspectStopRestartPatchCleanup drives one game +// through every supported lifecycle operation against the real engine +// image and asserts each step's PG, Redis-stream, and Docker +// side-effects. +func TestLifecycle_StartInspectStopRestartPatchCleanup(t *testing.T) { + env := harness.NewEnv(t, harness.EnvOptions{LogToStderr: true}) + rest := harness.NewREST(env) + gameID := harness.IDFromTestName(t) + + // Step 1 — start through the Lobby async stream contract. 
+ startEntryID := harness.XAddStartJob(t, env, gameID, env.EngineImageRef) + t.Logf("start_jobs xadd id=%s", startEntryID) + + startResult := harness.WaitForJobResult(t, env, + harness.JobOutcomeIs(gameID, ports.JobOutcomeSuccess), + 30*time.Second, + ) + require.Equal(t, "", startResult.ErrorCode, "fresh start must publish empty error_code") + require.NotEmpty(t, startResult.ContainerID, "fresh start job result must carry container_id") + require.NotEmpty(t, startResult.EngineEndpoint, "fresh start job result must carry engine_endpoint") + + // PG record reflects the start. + startedRecord := harness.EventuallyRuntimeRecord(t, env, gameID, + func(r runtime.RuntimeRecord) bool { return r.Status == runtime.StatusRunning }, + 15*time.Second, + ) + assert.Equal(t, env.EngineImageRef, startedRecord.CurrentImageRef) + assert.Equal(t, env.Network, startedRecord.DockerNetwork) + assert.Equal(t, startResult.ContainerID, startedRecord.CurrentContainerID) + assert.Equal(t, startResult.EngineEndpoint, startedRecord.EngineEndpoint) + + // operation_log captures the start. + startEntry := harness.EventuallyOperationKind(t, env, gameID, operation.OpKindStart, 5*time.Second) + assert.Equal(t, operation.OutcomeSuccess, startEntry.Outcome) + assert.Equal(t, operation.OpSourceLobbyStream, startEntry.OpSource) + + // Step 2 — inspect via the GM/Admin REST surface. + getResp, status := rest.GetRuntime(t, gameID) + require.Equal(t, http.StatusOK, status) + require.Equal(t, "running", getResp.Status) + require.NotNil(t, getResp.CurrentContainerID) + require.Equal(t, startResult.ContainerID, *getResp.CurrentContainerID) + require.NotNil(t, getResp.CurrentImageRef) + require.Equal(t, env.EngineImageRef, *getResp.CurrentImageRef) + require.NotNil(t, getResp.EngineEndpoint) + require.Equal(t, startResult.EngineEndpoint, *getResp.EngineEndpoint) + + // Step 3 — stop through the Lobby async stream contract. + harness.XAddStopJob(t, env, gameID, "cancelled") + stopResult := waitForLatestStopOrStartResult(t, env, gameID) + require.Equal(t, ports.JobOutcomeSuccess, stopResult.Outcome) + require.Equal(t, "", stopResult.ErrorCode, "fresh stop must publish empty error_code") + + stoppedRecord := harness.EventuallyRuntimeRecord(t, env, gameID, + func(r runtime.RuntimeRecord) bool { return r.Status == runtime.StatusStopped }, + 15*time.Second, + ) + assert.Equal(t, startResult.ContainerID, stoppedRecord.CurrentContainerID, + "stop preserves the current container id until cleanup") + + // Step 4 — restart via REST. Container id changes; engine endpoint + // stays stable. + restartResp, status := rest.RestartRuntime(t, gameID) + require.Equal(t, http.StatusOK, status) + require.Equal(t, "running", restartResp.Status) + require.NotNil(t, restartResp.CurrentContainerID) + require.NotEqual(t, startResult.ContainerID, *restartResp.CurrentContainerID, + "restart must produce a new container id") + require.NotNil(t, restartResp.EngineEndpoint) + require.Equal(t, startResult.EngineEndpoint, *restartResp.EngineEndpoint, + "restart must keep the engine endpoint stable") + + restartContainerID := *restartResp.CurrentContainerID + restartEntry := harness.EventuallyOperationKind(t, env, gameID, operation.OpKindRestart, 5*time.Second) + assert.Equal(t, operation.OutcomeSuccess, restartEntry.Outcome) + assert.Equal(t, operation.OpSourceAdminRest, restartEntry.OpSource) + + // Step 5 — patch to the second semver-compatible tag. Same image + // content, but the runtime should still record the new tag and + // recreate the container. 
+ patchResp, status := rest.PatchRuntime(t, gameID, env.PatchedImageRef) + require.Equal(t, http.StatusOK, status) + require.Equal(t, "running", patchResp.Status) + require.NotNil(t, patchResp.CurrentImageRef) + assert.Equal(t, env.PatchedImageRef, *patchResp.CurrentImageRef) + require.NotNil(t, patchResp.CurrentContainerID) + assert.NotEqual(t, restartContainerID, *patchResp.CurrentContainerID, + "patch must recreate the container") + + patchEntry := harness.EventuallyOperationKind(t, env, gameID, operation.OpKindPatch, 5*time.Second) + assert.Equal(t, operation.OutcomeSuccess, patchEntry.Outcome) + + // Step 6 — quiesce via REST stop so cleanup is allowed (cleanup + // refuses to remove a running container per + // `rtmanager/README.md §Lifecycles → Cleanup`). + stopResp, status := rest.StopRuntime(t, gameID, "admin_request") + require.Equal(t, http.StatusOK, status) + require.Equal(t, "stopped", stopResp.Status) + + // Step 7 — cleanup the container. PG record flips to removed and + // current_container_id becomes nil. + cleanupResp, status := rest.CleanupRuntime(t, gameID) + require.Equal(t, http.StatusOK, status) + require.Equal(t, "removed", cleanupResp.Status) + require.Nil(t, cleanupResp.CurrentContainerID) + + cleanupEntry := harness.EventuallyOperationKind(t, env, gameID, operation.OpKindCleanupContainer, 5*time.Second) + assert.Equal(t, operation.OutcomeSuccess, cleanupEntry.Outcome) + assert.Equal(t, operation.OpSourceAdminRest, cleanupEntry.OpSource) +} + +// TestReplay_StartJobIsNoop publishes the same start envelope twice +// and asserts that Runtime Manager produces a fresh job_result for +// the first XADD and a `replay_no_op` outcome for the second, without +// recreating the engine container. +func TestReplay_StartJobIsNoop(t *testing.T) { + env := harness.NewEnv(t, harness.EnvOptions{}) + gameID := harness.IDFromTestName(t) + + // First XADD: fresh start. + harness.XAddStartJob(t, env, gameID, env.EngineImageRef) + first := harness.WaitForJobResult(t, env, + harness.JobOutcomeIs(gameID, ports.JobOutcomeSuccess), + 30*time.Second, + ) + require.Equal(t, "", first.ErrorCode) + + // Second XADD: same envelope; the start service must short-circuit + // at the `runtime_records.status=running && image_ref` check. + harness.XAddStartJob(t, env, gameID, env.EngineImageRef) + replay := harness.WaitForJobResult(t, env, + harness.JobOutcomeWithErrorCode(gameID, ports.JobOutcomeSuccess, "replay_no_op"), + 15*time.Second, + ) + assert.Equal(t, first.ContainerID, replay.ContainerID, + "replay must surface the same container id as the original start") + assert.Equal(t, first.EngineEndpoint, replay.EngineEndpoint) + + // Docker view: exactly one engine container exists for this game. + assertSingleEngineContainer(t, env, gameID) + + // Lifecycle stream produced exactly two entries: fresh + replay. + entries := harness.AllJobResults(t, env) + require.Len(t, entries, 2) + assert.Equal(t, "", entries[0].ErrorCode) + assert.Equal(t, "replay_no_op", entries[1].ErrorCode) +} + +// TestReplay_StopJobIsNoop publishes a stop envelope twice after a +// successful start and asserts the second stop surfaces as +// `replay_no_op` without altering the runtime record's `stopped_at`. +func TestReplay_StopJobIsNoop(t *testing.T) { + env := harness.NewEnv(t, harness.EnvOptions{}) + gameID := harness.IDFromTestName(t) + + // Bring the game to `running`. 
The start path publishes one entry
+	// to `runtime:job_results`; the stops below publish two more, so
+	// per-game stream order is [start, first-stop, replay-stop].
+	harness.XAddStartJob(t, env, gameID, env.EngineImageRef)
+	harness.WaitForJobResult(t, env,
+		harness.JobOutcomeIs(gameID, ports.JobOutcomeSuccess),
+		30*time.Second,
+	)
+
+	// First stop: fresh. The expectedCount accounts for the start
+	// entry that is already on the stream.
+	harness.XAddStopJob(t, env, gameID, "cancelled")
+	first := waitForJobResultByIndex(t, env, gameID, 2)
+	require.Equal(t, ports.JobOutcomeSuccess, first.Outcome)
+	require.Equal(t, "", first.ErrorCode)
+
+	stoppedRecord := harness.EventuallyRuntimeRecord(t, env, gameID,
+		func(r runtime.RuntimeRecord) bool { return r.Status == runtime.StatusStopped },
+		15*time.Second,
+	)
+	require.NotNil(t, stoppedRecord.StoppedAt, "stopped record must carry stopped_at")
+	originalStoppedAt := *stoppedRecord.StoppedAt
+
+	// Second stop: replay (third entry on the per-game stream).
+	harness.XAddStopJob(t, env, gameID, "cancelled")
+	replay := waitForJobResultByIndex(t, env, gameID, 3)
+	require.Equal(t, ports.JobOutcomeSuccess, replay.Outcome)
+	assert.Equal(t, "replay_no_op", replay.ErrorCode)
+
+	// stopped_at stays anchored to the first stop.
+	postReplay := harness.MustRuntimeRecord(t, env, gameID)
+	require.Equal(t, runtime.StatusStopped, postReplay.Status)
+	require.NotNil(t, postReplay.StoppedAt)
+	assert.True(t, originalStoppedAt.Equal(*postReplay.StoppedAt),
+		"stopped_at must not move on a replay stop; was %s, now %s",
+		originalStoppedAt, *postReplay.StoppedAt)
+}
+
+// waitForLatestStopOrStartResult finds the most recent `outcome=success`
+// entry on `runtime:job_results` for gameID. The lifecycle scenario
+// emits two consecutive successes (start then stop); the helper polls
+// the stream until both are visible and returns the most recent one.
+func waitForLatestStopOrStartResult(t *testing.T, env *harness.Env, gameID string) harness.JobResultEntry {
+	t.Helper()
+	deadline := time.Now().Add(30 * time.Second)
+	for {
+		entries := harness.AllJobResults(t, env)
+		// Two entries means we've observed both the start and stop
+		// outcomes for this game.
+		matched := 0
+		var last harness.JobResultEntry
+		for _, entry := range entries {
+			if entry.GameID == gameID && entry.Outcome == ports.JobOutcomeSuccess {
+				matched++
+				last = entry
+			}
+		}
+		if matched >= 2 {
+			return last
+		}
+		if time.Now().After(deadline) {
+			t.Fatalf("expected two job_results for %s, got %d", gameID, matched)
+		}
+		time.Sleep(50 * time.Millisecond)
+	}
+}
+
+// waitForJobResultByIndex polls the job_results stream until it has
+// at least `expectedCount` entries for gameID and returns the
+// expectedCount-th. Used by the replay tests to deterministically
+// pick the second / nth result.
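+// Polling sleeps 50ms between scans and gives up after a 30s deadline.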
+func waitForJobResultByIndex(t *testing.T, env *harness.Env, gameID string, expectedCount int) harness.JobResultEntry { + t.Helper() + deadline := time.Now().Add(30 * time.Second) + for { + entries := harness.AllJobResults(t, env) + matches := make([]harness.JobResultEntry, 0, len(entries)) + for _, entry := range entries { + if entry.GameID == gameID { + matches = append(matches, entry) + } + } + if len(matches) >= expectedCount { + return matches[expectedCount-1] + } + if time.Now().After(deadline) { + t.Fatalf("expected at least %d job_results for %s, got %d", + expectedCount, gameID, len(matches)) + } + time.Sleep(50 * time.Millisecond) + } +} + +// assertSingleEngineContainer queries Docker by the per-game label and +// asserts exactly one matching container exists. Catches replay +// regressions that would let RTM start two containers for the same +// game id. +func assertSingleEngineContainer(t *testing.T, env *harness.Env, gameID string) { + t.Helper() + args := filters.NewArgs( + filters.Arg("label", "com.galaxy.owner=rtmanager"), + filters.Arg("label", "com.galaxy.game_id="+gameID), + ) + containers, err := env.Docker.Client().ContainerList( + context.Background(), + container.ListOptions{All: true, Filters: args}, + ) + require.NoError(t, err) + require.Lenf(t, containers, 1, "expected one engine container for game %s, got %d", gameID, len(containers)) +} diff --git a/rtmanager/integration/monitoring_test.go b/rtmanager/integration/monitoring_test.go new file mode 100644 index 0000000..096ac08 --- /dev/null +++ b/rtmanager/integration/monitoring_test.go @@ -0,0 +1,200 @@ +//go:build integration + +package integration_test + +import ( + "context" + "fmt" + "strconv" + "testing" + "time" + + "galaxy/notificationintent" + "galaxy/rtmanager/integration/harness" + "galaxy/rtmanager/internal/domain/health" + "galaxy/rtmanager/internal/domain/operation" + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/ports" + "galaxy/rtmanager/internal/service/startruntime" + + dockercontainer "github.com/docker/docker/api/types/container" + "github.com/docker/docker/api/types/network" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestHealth_ContainerDisappearedAndAdopt verifies the two +// drift-detection paths. The Docker events listener emits +// `container_disappeared` when a tracked container is destroyed +// outside RTM, and the reconciler adopts a fresh container labelled +// `com.galaxy.owner=rtmanager` that has no PG row. +// +// `runtime_records.status=removed` is terminal per +// `runtime.AllowedTransitions`; the adoption path therefore uses a +// **fresh** game_id rather than re-adopting the disposed one. That +// matches the documented contract: reconciler adopts containers +// labelled `com.galaxy.owner=rtmanager` for which no PG row exists. +func TestHealth_ContainerDisappearedAndAdopt(t *testing.T) { + env := harness.NewEnv(t, harness.EnvOptions{ + ReconcileInterval: 500 * time.Millisecond, + }) + + // Step 1 — bring a game to running through the start consumer. 
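+	// The "-d" suffix keeps the disposal game id distinct from the adoption game id created later in the test.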
+ disposalGameID := harness.IDFromTestName(t) + "-d" + harness.XAddStartJob(t, env, disposalGameID, env.EngineImageRef) + startResult := harness.WaitForJobResult(t, env, + harness.JobOutcomeIs(disposalGameID, ports.JobOutcomeSuccess), + 30*time.Second, + ) + originalContainerID := startResult.ContainerID + require.NotEmpty(t, originalContainerID) + + // Step 2 — externally remove the container; the events listener + // should observe the destroy and publish `container_disappeared`. + removeContainer(t, env, originalContainerID) + disappeared := harness.WaitForHealthEvent(t, env, + harness.HealthEventTypeIs(disposalGameID, string(health.EventTypeContainerDisappeared)), + 20*time.Second, + ) + assert.Equal(t, originalContainerID, disappeared.ContainerID) + + // The reconciler also marks the runtime record as removed within + // one or two ticks (`reconcile_dispose`). + harness.EventuallyRuntimeRecord(t, env, disposalGameID, + func(r runtime.RuntimeRecord) bool { return r.Status == runtime.StatusRemoved }, + 15*time.Second, + ) + harness.EventuallyOperationKind(t, env, disposalGameID, operation.OpKindReconcileDispose, 5*time.Second) + + // Step 3 — bring up an adoption candidate for an unseen game id + // by hand. The reconciler must label-match it, find no record, + // and insert one with status=running. + adoptionGameID := harness.IDFromTestName(t) + "-a" + manualContainerID := runManualEngineContainer(t, env, adoptionGameID) + t.Logf("manual container id=%s", manualContainerID) + + adopted := harness.EventuallyRuntimeRecord(t, env, adoptionGameID, + func(r runtime.RuntimeRecord) bool { + return r.Status == runtime.StatusRunning && r.CurrentContainerID == manualContainerID + }, + 20*time.Second, + ) + assert.Equal(t, env.EngineImageRef, adopted.CurrentImageRef) + + adoptEntry := harness.EventuallyOperationKind(t, env, adoptionGameID, operation.OpKindReconcileAdopt, 5*time.Second) + assert.Equal(t, operation.OutcomeSuccess, adoptEntry.Outcome) + assert.Equal(t, operation.OpSourceAutoReconcile, adoptEntry.OpSource) + assert.Equal(t, manualContainerID, adoptEntry.ContainerID) +} + +// TestNotification_ImagePullFailed drives Runtime Manager with a +// start envelope pointing at an unresolvable image reference. The +// start service must surface the failure on `runtime:job_results` and +// publish a `runtime.image_pull_failed` admin notification on +// `notification:intents`. +func TestNotification_ImagePullFailed(t *testing.T) { + env := harness.NewEnv(t, harness.EnvOptions{}) + gameID := harness.IDFromTestName(t) + + const missingImage = "galaxy/integration-missing:0.0.0" + harness.XAddStartJob(t, env, gameID, missingImage) + + // Job result publishes a failure with the stable image_pull_failed + // code. + jobResult := harness.WaitForJobResult(t, env, + harness.JobOutcomeIs(gameID, ports.JobOutcomeFailure), + 60*time.Second, + ) + assert.Equal(t, startruntime.ErrorCodeImagePullFailed, jobResult.ErrorCode) + assert.Empty(t, jobResult.ContainerID, "failure must not surface a container id") + assert.Empty(t, jobResult.EngineEndpoint, "failure must not surface an engine endpoint") + assert.NotEmpty(t, jobResult.ErrorMessage, "failure must carry an operator-readable message") + + // Notification stream carries the matching admin-only intent. 
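+	// The predicate below matches on both the notification type and the game_id field of the payload.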
+ intent := harness.WaitForNotificationIntent(t, env, + func(entry harness.NotificationIntentEntry) bool { + if entry.NotificationType != string(notificationintent.NotificationTypeRuntimeImagePullFailed) { + return false + } + payloadGameID, _ := entry.Payload["game_id"].(string) + return payloadGameID == gameID + }, + 30*time.Second, + ) + require.NotNil(t, intent.Payload, "notification intent must carry a payload") + assert.Equal(t, gameID, intent.Payload["game_id"]) + assert.Equal(t, missingImage, intent.Payload["image_ref"]) + assert.Equal(t, startruntime.ErrorCodeImagePullFailed, intent.Payload["error_code"]) + + // PG state: no running record was installed; operation_log + // captures one failed start with the stable error code. + _, err := harness.RuntimeRecord(t, env, gameID) + if err == nil { + // If an entry was upserted (rollback gap), it must not be + // running. + record := harness.MustRuntimeRecord(t, env, gameID) + assert.NotEqual(t, runtime.StatusRunning, record.Status, + "failed image pull must not leave a running record behind") + } + + failureEntry := harness.EventuallyOperationKind(t, env, gameID, operation.OpKindStart, 5*time.Second) + assert.Equal(t, operation.OutcomeFailure, failureEntry.Outcome) + assert.Equal(t, startruntime.ErrorCodeImagePullFailed, failureEntry.ErrorCode) +} + +// removeContainer terminates and removes the container behind RTM's +// back. Force=true is required because the engine has not received a +// SIGTERM and stop signal handling is engine-internal. +func removeContainer(t *testing.T, env *harness.Env, containerID string) { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + require.NoError(t, env.Docker.Client().ContainerRemove(ctx, containerID, dockercontainer.RemoveOptions{Force: true})) +} + +// runManualEngineContainer bypasses RTM and starts an engine container +// directly through the Docker SDK. The container carries every label +// the reconciler reads at adopt time (`com.galaxy.owner`, +// `com.galaxy.kind`, `com.galaxy.game_id`, `com.galaxy.engine_image_ref`, +// `com.galaxy.started_at_ms`) plus the per-game hostname so the +// computed `engine_endpoint` matches what `rtmanager` would have +// written. 
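+// The container is force-removed in t.Cleanup so a failed adoption cannot leak it.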
+func runManualEngineContainer(t *testing.T, env *harness.Env, gameID string) string { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + + hostname := "galaxy-game-" + gameID + cfg := &dockercontainer.Config{ + Image: env.EngineImageRef, + Hostname: hostname, + Labels: map[string]string{ + "com.galaxy.owner": "rtmanager", + "com.galaxy.kind": "game-engine", + "com.galaxy.game_id": gameID, + "com.galaxy.engine_image_ref": env.EngineImageRef, + "com.galaxy.started_at_ms": strconv.FormatInt(time.Now().UnixMilli(), 10), + }, + Env: []string{ + "GAME_STATE_PATH=/var/lib/galaxy-game", + "STORAGE_PATH=/var/lib/galaxy-game", + }, + } + hostCfg := &dockercontainer.HostConfig{} + netCfg := &network.NetworkingConfig{ + EndpointsConfig: map[string]*network.EndpointSettings{ + env.Network: {Aliases: []string{hostname}}, + }, + } + containerName := fmt.Sprintf("galaxy-game-%s-manual", gameID) + created, err := env.Docker.Client().ContainerCreate(ctx, cfg, hostCfg, netCfg, nil, containerName) + require.NoError(t, err) + t.Cleanup(func() { + removeCtx, removeCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer removeCancel() + _ = env.Docker.Client().ContainerRemove(removeCtx, created.ID, dockercontainer.RemoveOptions{Force: true}) + }) + + require.NoError(t, env.Docker.Client().ContainerStart(ctx, created.ID, dockercontainer.StartOptions{})) + return created.ID +} diff --git a/rtmanager/internal/adapters/docker/client.go b/rtmanager/internal/adapters/docker/client.go new file mode 100644 index 0000000..5a5f55d --- /dev/null +++ b/rtmanager/internal/adapters/docker/client.go @@ -0,0 +1,493 @@ +// Package docker provides the production Docker SDK adapter that +// implements `galaxy/rtmanager/internal/ports.DockerClient`. The +// adapter is the single component allowed to talk to the local Docker +// daemon; every Runtime Manager service path that needs container +// lifecycle operations goes through this surface. +// +// The adapter is intentionally narrow — it does not orchestrate, log, +// or retry. Cross-cutting concerns (lease coordination, durable state, +// notification side-effects) live in the service layer. +package docker + +import ( + "context" + "errors" + "fmt" + "io" + "maps" + "strings" + "sync" + "time" + + cerrdefs "github.com/containerd/errdefs" + "github.com/docker/docker/api/types/container" + "github.com/docker/docker/api/types/events" + "github.com/docker/docker/api/types/filters" + "github.com/docker/docker/api/types/image" + "github.com/docker/docker/api/types/network" + dockerclient "github.com/docker/docker/client" + "github.com/docker/go-units" + + "galaxy/rtmanager/internal/ports" +) + +// EnginePort is the in-container HTTP port the engine listens on. The +// value is fixed by `rtmanager/README.md §Container Model` and by the +// engine's Dockerfile (`game/Dockerfile`); RTM never publishes the port +// to the host. Keeping the constant here lets the adapter own the URL +// shape so the start service does not have to know it. +const EnginePort = 8080 + +// Config groups the dependencies and per-process defaults required to +// construct a Client. The struct is value-typed so wiring code can +// build it inline without intermediate variables. +type Config struct { + // Docker stores the SDK client this adapter wraps. It must be + // non-nil; callers typically construct it via `client.NewClientWithOpts`. 
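+	// The adapter never closes this client; its lifecycle stays with the caller.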
+ Docker *dockerclient.Client + + // LogDriver stores the Docker logging driver applied to every + // container the adapter creates (e.g. `json-file`). + LogDriver string + + // LogOpts stores the comma-separated `key=value` driver options + // forwarded to Docker. Empty disables driver-specific options. + LogOpts string + + // Clock supplies the wall-clock used for `RunResult.StartedAt`. + // Defaults to `time.Now` when nil. + Clock func() time.Time +} + +// Client is the production adapter implementing `ports.DockerClient`. +// Construct it via NewClient; do not zero-initialise. +type Client struct { + docker *dockerclient.Client + logDriver string + logOpts string + clock func() time.Time +} + +// NewClient constructs a Client from cfg. It returns an error if cfg +// does not carry the minimum collaborator set the adapter needs to +// function. +func NewClient(cfg Config) (*Client, error) { + if cfg.Docker == nil { + return nil, errors.New("new docker adapter: nil docker client") + } + if strings.TrimSpace(cfg.LogDriver) == "" { + return nil, errors.New("new docker adapter: log driver must not be empty") + } + clock := cfg.Clock + if clock == nil { + clock = time.Now + } + return &Client{ + docker: cfg.Docker, + logDriver: cfg.LogDriver, + logOpts: cfg.LogOpts, + clock: clock, + }, nil +} + +// EnsureNetwork verifies the user-defined Docker network is present. +// The adapter never creates networks; provisioning is the operator's +// job per `rtmanager/README.md §Container Model`. +func (client *Client) EnsureNetwork(ctx context.Context, name string) error { + if _, err := client.docker.NetworkInspect(ctx, name, network.InspectOptions{}); err != nil { + if cerrdefs.IsNotFound(err) { + return ports.ErrNetworkMissing + } + return fmt.Errorf("ensure network %q: %w", name, err) + } + return nil +} + +// PullImage pulls ref according to policy. The pull stream is drained +// to completion because the Docker SDK only finishes the underlying +// pull when the body is consumed. +func (client *Client) PullImage(ctx context.Context, ref string, policy ports.PullPolicy) error { + if !policy.IsKnown() { + return fmt.Errorf("pull image %q: unknown pull policy %q", ref, policy) + } + switch policy { + case ports.PullPolicyAlways: + return client.runPull(ctx, ref) + case ports.PullPolicyIfMissing: + if present, err := client.imagePresent(ctx, ref); err != nil { + return err + } else if present { + return nil + } + return client.runPull(ctx, ref) + case ports.PullPolicyNever: + present, err := client.imagePresent(ctx, ref) + if err != nil { + return err + } + if !present { + return ports.ErrImageNotFound + } + return nil + default: + return fmt.Errorf("pull image %q: unsupported pull policy %q", ref, policy) + } +} + +// InspectImage returns image metadata for ref. RTM only reads labels +// at start time; the broader inspect struct stays accessible for +// diagnostics. +func (client *Client) InspectImage(ctx context.Context, ref string) (ports.ImageInspect, error) { + inspect, err := client.docker.ImageInspect(ctx, ref) + if err != nil { + if cerrdefs.IsNotFound(err) { + return ports.ImageInspect{}, ports.ErrImageNotFound + } + return ports.ImageInspect{}, fmt.Errorf("inspect image %q: %w", ref, err) + } + var labels map[string]string + if inspect.Config != nil { + labels = copyStringMap(inspect.Config.Labels) + } + return ports.ImageInspect{Ref: ref, Labels: labels}, nil +} + +// InspectContainer returns container metadata for containerID. 
The +// adapter best-effort decodes Docker timestamps; malformed values map +// to the zero time so callers do not have to defend against nil +// pointers in the SDK response. +func (client *Client) InspectContainer(ctx context.Context, containerID string) (ports.ContainerInspect, error) { + inspect, err := client.docker.ContainerInspect(ctx, containerID) + if err != nil { + if cerrdefs.IsNotFound(err) { + return ports.ContainerInspect{}, ports.ErrContainerNotFound + } + return ports.ContainerInspect{}, fmt.Errorf("inspect container %q: %w", containerID, err) + } + + result := ports.ContainerInspect{ID: inspect.ID} + if inspect.ContainerJSONBase != nil { + result.RestartCount = inspect.RestartCount + if inspect.State != nil { + result.Status = string(inspect.State.Status) + result.OOMKilled = inspect.State.OOMKilled + result.ExitCode = inspect.State.ExitCode + result.StartedAt = parseDockerTime(inspect.State.StartedAt) + result.FinishedAt = parseDockerTime(inspect.State.FinishedAt) + if inspect.State.Health != nil { + result.Health = string(inspect.State.Health.Status) + } + } + } + if inspect.Config != nil { + result.ImageRef = inspect.Config.Image + result.Hostname = inspect.Config.Hostname + result.Labels = copyStringMap(inspect.Config.Labels) + } + return result, nil +} + +// Run creates and starts one container according to spec. On +// `ContainerStart` failure the adapter best-effort removes the partial +// container so the start service never has to clean up after a failed +// start path. +func (client *Client) Run(ctx context.Context, spec ports.RunSpec) (ports.RunResult, error) { + if err := spec.Validate(); err != nil { + return ports.RunResult{}, fmt.Errorf("run container: %w", err) + } + memoryBytes, err := units.RAMInBytes(spec.Memory) + if err != nil { + return ports.RunResult{}, fmt.Errorf("run container %q: parse memory %q: %w", spec.Name, spec.Memory, err) + } + pidsLimit := int64(spec.PIDsLimit) + + containerCfg := &container.Config{ + Image: spec.Image, + Hostname: spec.Hostname, + Env: envMapToSlice(spec.Env), + Labels: copyStringMap(spec.Labels), + Cmd: append([]string(nil), spec.Cmd...), + } + hostCfg := &container.HostConfig{ + Binds: bindMountsToBinds(spec.BindMounts), + LogConfig: container.LogConfig{ + Type: client.logDriver, + Config: parseLogOpts(client.logOpts), + }, + Resources: container.Resources{ + NanoCPUs: int64(spec.CPUQuota * 1e9), + Memory: memoryBytes, + PidsLimit: &pidsLimit, + }, + } + netCfg := &network.NetworkingConfig{ + EndpointsConfig: map[string]*network.EndpointSettings{ + spec.Network: { + Aliases: []string{spec.Hostname}, + }, + }, + } + + created, err := client.docker.ContainerCreate(ctx, containerCfg, hostCfg, netCfg, nil, spec.Name) + if err != nil { + return ports.RunResult{}, fmt.Errorf("create container %q: %w", spec.Name, err) + } + + if err := client.docker.ContainerStart(ctx, created.ID, container.StartOptions{}); err != nil { + client.cleanupAfterFailedStart(created.ID) + return ports.RunResult{}, fmt.Errorf("start container %q: %w", spec.Name, err) + } + + return ports.RunResult{ + ContainerID: created.ID, + EngineEndpoint: fmt.Sprintf("http://%s:%d", spec.Hostname, EnginePort), + StartedAt: client.clock(), + }, nil +} + +// Stop bounds graceful shutdown by timeout. A missing container is +// surfaced as ErrContainerNotFound so the service layer can treat it +// as already-stopped per `rtmanager/README.md §Lifecycles → Stop`. 
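+// Negative timeouts are clamped to zero before the call reaches Docker.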
+func (client *Client) Stop(ctx context.Context, containerID string, timeout time.Duration) error { + seconds := max(int(timeout.Round(time.Second).Seconds()), 0) + if err := client.docker.ContainerStop(ctx, containerID, container.StopOptions{Timeout: &seconds}); err != nil { + if cerrdefs.IsNotFound(err) { + return ports.ErrContainerNotFound + } + return fmt.Errorf("stop container %q: %w", containerID, err) + } + return nil +} + +// Remove removes the container without forcing kill. A missing +// container is reported as success so callers can treat the operation +// as idempotent. +func (client *Client) Remove(ctx context.Context, containerID string) error { + if err := client.docker.ContainerRemove(ctx, containerID, container.RemoveOptions{}); err != nil { + if cerrdefs.IsNotFound(err) { + return nil + } + return fmt.Errorf("remove container %q: %w", containerID, err) + } + return nil +} + +// List returns container summaries that match filter. Empty Labels +// match every container; the reconciler always passes +// `com.galaxy.owner=rtmanager`. +func (client *Client) List(ctx context.Context, filter ports.ListFilter) ([]ports.ContainerSummary, error) { + args := filters.NewArgs() + for key, value := range filter.Labels { + args.Add("label", key+"="+value) + } + summaries, err := client.docker.ContainerList(ctx, container.ListOptions{All: true, Filters: args}) + if err != nil { + return nil, fmt.Errorf("list containers: %w", err) + } + out := make([]ports.ContainerSummary, 0, len(summaries)) + for _, summary := range summaries { + hostname := "" + if len(summary.Names) > 0 { + hostname = strings.TrimPrefix(summary.Names[0], "/") + } + out = append(out, ports.ContainerSummary{ + ID: summary.ID, + ImageRef: summary.Image, + Hostname: hostname, + Labels: copyStringMap(summary.Labels), + Status: string(summary.State), + StartedAt: time.Unix(summary.Created, 0).UTC(), + }) + } + return out, nil +} + +// EventsListen subscribes to the Docker events stream and returns a +// typed channel of decoded container events plus an asynchronous +// error channel. The caller cancels ctx to terminate the subscription; +// the goroutine closes both channels on termination. 
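+// A minimal consumption sketch (handle is the caller's own dispatch,
+// not part of this package):
+//
+//	events, errs, err := client.EventsListen(ctx)
+//	if err != nil {
+//		return err
+//	}
+//	for ev := range events {
+//		handle(ev)
+//	}
+//	if err := <-errs; err != nil {
+//		return err
+//	}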
+func (client *Client) EventsListen(ctx context.Context) (<-chan ports.DockerEvent, <-chan error, error) { + msgs, sdkErrs := client.docker.Events(ctx, events.ListOptions{}) + out := make(chan ports.DockerEvent) + outErrs := make(chan error, 1) + + var closeOnce sync.Once + closeAll := func() { + closeOnce.Do(func() { + close(out) + close(outErrs) + }) + } + + go func() { + defer closeAll() + for { + select { + case <-ctx.Done(): + return + case msg, ok := <-msgs: + if !ok { + return + } + if msg.Type != events.ContainerEventType { + continue + } + select { + case <-ctx.Done(): + return + case out <- decodeEvent(msg): + } + case err, ok := <-sdkErrs: + if !ok { + return + } + if err == nil { + continue + } + select { + case <-ctx.Done(): + case outErrs <- err: + } + return + } + } + }() + + return out, outErrs, nil +} + +func (client *Client) cleanupAfterFailedStart(containerID string) { + cleanupCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + _ = client.docker.ContainerRemove(cleanupCtx, containerID, container.RemoveOptions{Force: true}) +} + +func (client *Client) imagePresent(ctx context.Context, ref string) (bool, error) { + if _, err := client.docker.ImageInspect(ctx, ref); err != nil { + if cerrdefs.IsNotFound(err) { + return false, nil + } + return false, fmt.Errorf("inspect image %q: %w", ref, err) + } + return true, nil +} + +func (client *Client) runPull(ctx context.Context, ref string) error { + body, err := client.docker.ImagePull(ctx, ref, image.PullOptions{}) + if err != nil { + if cerrdefs.IsNotFound(err) { + return ports.ErrImageNotFound + } + return fmt.Errorf("pull image %q: %w", ref, err) + } + defer body.Close() + if _, err := io.Copy(io.Discard, body); err != nil { + return fmt.Errorf("drain pull stream for %q: %w", ref, err) + } + return nil +} + +func envMapToSlice(envMap map[string]string) []string { + if len(envMap) == 0 { + return nil + } + out := make([]string, 0, len(envMap)) + for key, value := range envMap { + out = append(out, key+"="+value) + } + return out +} + +func bindMountsToBinds(mounts []ports.BindMount) []string { + if len(mounts) == 0 { + return nil + } + binds := make([]string, 0, len(mounts)) + for _, mount := range mounts { + bind := mount.HostPath + ":" + mount.MountPath + if mount.ReadOnly { + bind += ":ro" + } + binds = append(binds, bind) + } + return binds +} + +func parseLogOpts(raw string) map[string]string { + if strings.TrimSpace(raw) == "" { + return nil + } + out := make(map[string]string) + for part := range strings.SplitSeq(raw, ",") { + entry := strings.TrimSpace(part) + if entry == "" { + continue + } + index := strings.IndexByte(entry, '=') + if index <= 0 { + continue + } + out[entry[:index]] = entry[index+1:] + } + if len(out) == 0 { + return nil + } + return out +} + +func parseDockerTime(raw string) time.Time { + if raw == "" { + return time.Time{} + } + parsed, err := time.Parse(time.RFC3339Nano, raw) + if err != nil { + return time.Time{} + } + return parsed.UTC() +} + +func copyStringMap(in map[string]string) map[string]string { + if in == nil { + return nil + } + out := make(map[string]string, len(in)) + maps.Copy(out, in) + return out +} + +func decodeEvent(msg events.Message) ports.DockerEvent { + occurredAt := time.Time{} + switch { + case msg.TimeNano != 0: + occurredAt = time.Unix(0, msg.TimeNano).UTC() + case msg.Time != 0: + occurredAt = time.Unix(msg.Time, 0).UTC() + } + exitCode := 0 + if raw, ok := msg.Actor.Attributes["exitCode"]; ok { + if value, err := 
parseExitCode(raw); err == nil { + exitCode = value + } + } + return ports.DockerEvent{ + Action: string(msg.Action), + ContainerID: msg.Actor.ID, + Labels: copyStringMap(msg.Actor.Attributes), + ExitCode: exitCode, + OccurredAt: occurredAt, + } +} + +func parseExitCode(raw string) (int, error) { + value := 0 + for _, r := range raw { + if r < '0' || r > '9' { + return 0, fmt.Errorf("non-numeric exit code %q", raw) + } + value = value*10 + int(r-'0') + } + return value, nil +} + +// Compile-time assertion: Client implements ports.DockerClient. +var _ ports.DockerClient = (*Client)(nil) diff --git a/rtmanager/internal/adapters/docker/client_test.go b/rtmanager/internal/adapters/docker/client_test.go new file mode 100644 index 0000000..f9458e3 --- /dev/null +++ b/rtmanager/internal/adapters/docker/client_test.go @@ -0,0 +1,561 @@ +package docker + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "io" + "net/http" + "net/http/httptest" + "net/url" + "strings" + "sync/atomic" + "testing" + "time" + + dockerclient "github.com/docker/docker/client" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "galaxy/rtmanager/internal/ports" +) + +// newTestClient wires an httptest.Server backed Docker SDK client to our +// adapter. The handler is invoked for every Docker API request issued +// during the test; tests assert on path and method to route the +// response. +func newTestClient(t *testing.T, handler http.HandlerFunc) *Client { + t.Helper() + server := httptest.NewServer(handler) + t.Cleanup(server.Close) + + docker, err := dockerclient.NewClientWithOpts( + dockerclient.WithHost(server.URL), + dockerclient.WithHTTPClient(server.Client()), + dockerclient.WithVersion("1.45"), + ) + require.NoError(t, err) + t.Cleanup(func() { _ = docker.Close() }) + + client, err := NewClient(Config{ + Docker: docker, + LogDriver: "json-file", + LogOpts: "max-size=1m,max-file=3", + Clock: func() time.Time { return time.Date(2026, time.April, 27, 12, 0, 0, 0, time.UTC) }, + }) + require.NoError(t, err) + return client +} + +func writeJSON(t *testing.T, w http.ResponseWriter, status int, body any) { + t.Helper() + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(status) + require.NoError(t, json.NewEncoder(w).Encode(body)) +} + +func writeNotFound(t *testing.T, w http.ResponseWriter, msg string) { + t.Helper() + writeJSON(t, w, http.StatusNotFound, map[string]string{"message": msg}) +} + +// Docker SDK uses /v1.45 prefix when client is pinned to API 1.45. 
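+// dockerPath prepends that prefix to an endpoint suffix so individual assertions do not hard-code the version pin.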
+func dockerPath(suffix string) string { + return "/v1.45" + suffix +} + +func TestNewClientValidatesConfig(t *testing.T) { + t.Run("nil docker client", func(t *testing.T) { + _, err := NewClient(Config{LogDriver: "json-file"}) + require.Error(t, err) + assert.Contains(t, err.Error(), "nil docker client") + }) + t.Run("empty log driver", func(t *testing.T) { + docker, err := dockerclient.NewClientWithOpts(dockerclient.WithHost("tcp://127.0.0.1:65535")) + require.NoError(t, err) + t.Cleanup(func() { _ = docker.Close() }) + _, err = NewClient(Config{Docker: docker, LogDriver: " "}) + require.Error(t, err) + assert.Contains(t, err.Error(), "log driver") + }) +} + +func TestEnsureNetwork(t *testing.T) { + t.Run("present", func(t *testing.T) { + client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) { + require.Equal(t, http.MethodGet, r.Method) + require.Equal(t, dockerPath("/networks/galaxy-net"), r.URL.Path) + writeJSON(t, w, http.StatusOK, map[string]any{"Id": "net-1", "Name": "galaxy-net"}) + }) + require.NoError(t, client.EnsureNetwork(context.Background(), "galaxy-net")) + }) + t.Run("missing", func(t *testing.T) { + client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) { + writeNotFound(t, w, "no such network") + }) + err := client.EnsureNetwork(context.Background(), "missing") + require.Error(t, err) + assert.ErrorIs(t, err, ports.ErrNetworkMissing) + }) + t.Run("transport error", func(t *testing.T) { + client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) { + http.Error(w, "boom", http.StatusInternalServerError) + }) + err := client.EnsureNetwork(context.Background(), "x") + require.Error(t, err) + assert.NotErrorIs(t, err, ports.ErrNetworkMissing) + }) +} + +func TestInspectImage(t *testing.T) { + t.Run("present", func(t *testing.T) { + client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) { + require.Equal(t, http.MethodGet, r.Method) + require.Equal(t, dockerPath("/images/galaxy/game:test/json"), r.URL.Path) + writeJSON(t, w, http.StatusOK, map[string]any{ + "Id": "sha256:abc", + "Config": map[string]any{ + "Labels": map[string]string{ + "com.galaxy.cpu_quota": "1.0", + "com.galaxy.memory": "512m", + "com.galaxy.pids_limit": "512", + }, + }, + }) + }) + got, err := client.InspectImage(context.Background(), "galaxy/game:test") + require.NoError(t, err) + assert.Equal(t, "galaxy/game:test", got.Ref) + assert.Equal(t, "1.0", got.Labels["com.galaxy.cpu_quota"]) + assert.Equal(t, "512m", got.Labels["com.galaxy.memory"]) + }) + t.Run("not found", func(t *testing.T) { + client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) { + writeNotFound(t, w, "no such image") + }) + _, err := client.InspectImage(context.Background(), "galaxy/missing:tag") + require.Error(t, err) + assert.ErrorIs(t, err, ports.ErrImageNotFound) + }) +} + +func TestInspectContainer(t *testing.T) { + t.Run("present", func(t *testing.T) { + client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) { + require.Equal(t, http.MethodGet, r.Method) + require.Equal(t, dockerPath("/containers/cont-1/json"), r.URL.Path) + writeJSON(t, w, http.StatusOK, map[string]any{ + "Id": "cont-1", + "RestartCount": 2, + "State": map[string]any{ + "Status": "running", + "OOMKilled": false, + "ExitCode": 0, + "StartedAt": "2026-04-27T11:00:00.5Z", + "FinishedAt": "0001-01-01T00:00:00Z", + "Health": map[string]any{"Status": "healthy"}, + }, + "Config": map[string]any{ + "Image": "galaxy/game:test", + "Hostname": "galaxy-game-game-1", + 
"Labels": map[string]string{ + "com.galaxy.owner": "rtmanager", + "com.galaxy.game_id": "game-1", + }, + }, + }) + }) + got, err := client.InspectContainer(context.Background(), "cont-1") + require.NoError(t, err) + assert.Equal(t, "cont-1", got.ID) + assert.Equal(t, 2, got.RestartCount) + assert.Equal(t, "running", got.Status) + assert.Equal(t, "healthy", got.Health) + assert.Equal(t, "galaxy/game:test", got.ImageRef) + assert.Equal(t, "galaxy-game-game-1", got.Hostname) + assert.Equal(t, "rtmanager", got.Labels["com.galaxy.owner"]) + assert.False(t, got.StartedAt.IsZero()) + }) + t.Run("not found", func(t *testing.T) { + client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) { + writeNotFound(t, w, "no such container") + }) + _, err := client.InspectContainer(context.Background(), "missing") + require.Error(t, err) + assert.ErrorIs(t, err, ports.ErrContainerNotFound) + }) +} + +func TestPullImagePolicies(t *testing.T) { + t.Run("if_missing/found skips pull", func(t *testing.T) { + hits := struct { + inspect atomic.Int32 + pull atomic.Int32 + }{} + client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) { + switch { + case strings.HasSuffix(r.URL.Path, "/json") && r.Method == http.MethodGet: + hits.inspect.Add(1) + writeJSON(t, w, http.StatusOK, map[string]any{"Id": "sha256:x"}) + case strings.Contains(r.URL.Path, "/images/create"): + hits.pull.Add(1) + w.WriteHeader(http.StatusOK) + default: + t.Fatalf("unexpected request %s %s", r.Method, r.URL.Path) + } + }) + require.NoError(t, client.PullImage(context.Background(), "alpine:3.21", ports.PullPolicyIfMissing)) + assert.Equal(t, int32(1), hits.inspect.Load()) + assert.Equal(t, int32(0), hits.pull.Load()) + }) + t.Run("if_missing/absent triggers pull", func(t *testing.T) { + hits := struct { + inspect atomic.Int32 + pull atomic.Int32 + }{} + client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) { + switch { + case strings.HasSuffix(r.URL.Path, "/json") && r.Method == http.MethodGet: + hits.inspect.Add(1) + writeNotFound(t, w, "no such image") + case strings.Contains(r.URL.Path, "/images/create"): + hits.pull.Add(1) + w.WriteHeader(http.StatusOK) + _, _ = io.WriteString(w, `{"status":"Pulling..."}`+"\n"+`{"status":"Done"}`+"\n") + default: + t.Fatalf("unexpected request %s %s", r.Method, r.URL.Path) + } + }) + require.NoError(t, client.PullImage(context.Background(), "alpine:3.21", ports.PullPolicyIfMissing)) + assert.Equal(t, int32(1), hits.inspect.Load()) + assert.Equal(t, int32(1), hits.pull.Load()) + }) + t.Run("always pulls regardless of cache", func(t *testing.T) { + var pullCount atomic.Int32 + client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) { + require.Contains(t, r.URL.Path, "/images/create") + pullCount.Add(1) + w.WriteHeader(http.StatusOK) + }) + require.NoError(t, client.PullImage(context.Background(), "alpine:3.21", ports.PullPolicyAlways)) + assert.Equal(t, int32(1), pullCount.Load()) + }) + t.Run("never with absent image", func(t *testing.T) { + client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) { + require.Equal(t, http.MethodGet, r.Method) + writeNotFound(t, w, "no such image") + }) + err := client.PullImage(context.Background(), "alpine:3.21", ports.PullPolicyNever) + require.Error(t, err) + assert.ErrorIs(t, err, ports.ErrImageNotFound) + }) + t.Run("never with present image", func(t *testing.T) { + client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) { + require.Equal(t, http.MethodGet, r.Method) + 
writeJSON(t, w, http.StatusOK, map[string]any{"Id": "x"}) + }) + require.NoError(t, client.PullImage(context.Background(), "alpine:3.21", ports.PullPolicyNever)) + }) + t.Run("unknown policy", func(t *testing.T) { + client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) { + t.Fatal("must not call docker on unknown policy") + }) + err := client.PullImage(context.Background(), "alpine:3.21", ports.PullPolicy("invalid")) + require.Error(t, err) + }) +} + +func TestRunHappyPath(t *testing.T) { + calls := struct { + create atomic.Int32 + start atomic.Int32 + remove atomic.Int32 + }{} + client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/containers/create"): + calls.create.Add(1) + require.Equal(t, "galaxy-game-game-1", r.URL.Query().Get("name")) + writeJSON(t, w, http.StatusCreated, map[string]any{"Id": "cont-new", "Warnings": []string{}}) + case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/start"): + calls.start.Add(1) + require.Equal(t, dockerPath("/containers/cont-new/start"), r.URL.Path) + w.WriteHeader(http.StatusNoContent) + case r.Method == http.MethodDelete && strings.HasPrefix(r.URL.Path, dockerPath("/containers/")): + calls.remove.Add(1) + w.WriteHeader(http.StatusNoContent) + default: + t.Fatalf("unexpected %s %s", r.Method, r.URL.Path) + } + }) + + result, err := client.Run(context.Background(), ports.RunSpec{ + Name: "galaxy-game-game-1", + Image: "galaxy/game:test", + Hostname: "galaxy-game-game-1", + Network: "galaxy-net", + Env: map[string]string{ + "GAME_STATE_PATH": "/var/lib/galaxy-game", + "STORAGE_PATH": "/var/lib/galaxy-game", + }, + Labels: map[string]string{"com.galaxy.owner": "rtmanager"}, + LogDriver: "json-file", + BindMounts: []ports.BindMount{ + {HostPath: "/var/lib/galaxy/games/game-1", MountPath: "/var/lib/galaxy-game"}, + }, + CPUQuota: 1.0, + Memory: "512m", + PIDsLimit: 512, + }) + require.NoError(t, err) + assert.Equal(t, "cont-new", result.ContainerID) + assert.Equal(t, "http://galaxy-game-game-1:8080", result.EngineEndpoint) + assert.False(t, result.StartedAt.IsZero()) + assert.Equal(t, int32(1), calls.create.Load()) + assert.Equal(t, int32(1), calls.start.Load()) + assert.Equal(t, int32(0), calls.remove.Load()) +} + +func TestRunStartFailureRemovesContainer(t *testing.T) { + calls := struct { + create atomic.Int32 + start atomic.Int32 + remove atomic.Int32 + }{} + client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/containers/create"): + calls.create.Add(1) + writeJSON(t, w, http.StatusCreated, map[string]any{"Id": "cont-x"}) + case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/start"): + calls.start.Add(1) + http.Error(w, `{"message":"insufficient host resources"}`, http.StatusInternalServerError) + case r.Method == http.MethodDelete && strings.HasPrefix(r.URL.Path, dockerPath("/containers/cont-x")): + calls.remove.Add(1) + require.Equal(t, "1", r.URL.Query().Get("force")) + w.WriteHeader(http.StatusNoContent) + default: + t.Fatalf("unexpected %s %s", r.Method, r.URL.Path) + } + }) + + _, err := client.Run(context.Background(), ports.RunSpec{ + Name: "x", + Image: "img", + Hostname: "x", + Network: "n", + LogDriver: "json-file", + CPUQuota: 1.0, + Memory: "64m", + PIDsLimit: 64, + }) + require.Error(t, err) + assert.Equal(t, int32(1), calls.create.Load()) + assert.Equal(t, int32(1), calls.start.Load()) 
+ assert.Equal(t, int32(1), calls.remove.Load(), "adapter must roll back the partial container") +} + +func TestRunRejectsInvalidSpec(t *testing.T) { + client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) { + t.Fatal("must not contact docker on invalid spec") + }) + _, err := client.Run(context.Background(), ports.RunSpec{Name: "x"}) + require.Error(t, err) + assert.Contains(t, err.Error(), "image must not be empty") +} + +func TestStop(t *testing.T) { + t.Run("graceful stop", func(t *testing.T) { + client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) { + require.Equal(t, http.MethodPost, r.Method) + require.Equal(t, dockerPath("/containers/cont-1/stop"), r.URL.Path) + require.Equal(t, "30", r.URL.Query().Get("t")) + w.WriteHeader(http.StatusNoContent) + }) + require.NoError(t, client.Stop(context.Background(), "cont-1", 30*time.Second)) + }) + t.Run("missing container", func(t *testing.T) { + client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) { + writeNotFound(t, w, "no such container") + }) + err := client.Stop(context.Background(), "missing", 30*time.Second) + assert.ErrorIs(t, err, ports.ErrContainerNotFound) + }) + t.Run("negative timeout normalised to zero", func(t *testing.T) { + client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) { + require.Equal(t, "0", r.URL.Query().Get("t")) + w.WriteHeader(http.StatusNoContent) + }) + require.NoError(t, client.Stop(context.Background(), "x", -5*time.Second)) + }) +} + +func TestRemoveIsIdempotent(t *testing.T) { + t.Run("present", func(t *testing.T) { + client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) { + require.Equal(t, http.MethodDelete, r.Method) + w.WriteHeader(http.StatusNoContent) + }) + require.NoError(t, client.Remove(context.Background(), "cont-1")) + }) + t.Run("missing", func(t *testing.T) { + client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) { + writeNotFound(t, w, "no such container") + }) + require.NoError(t, client.Remove(context.Background(), "missing")) + }) +} + +func TestListAppliesLabelFilter(t *testing.T) { + client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) { + require.Equal(t, http.MethodGet, r.Method) + require.Equal(t, dockerPath("/containers/json"), r.URL.Path) + require.Equal(t, "1", r.URL.Query().Get("all")) + + filtersRaw := r.URL.Query().Get("filters") + require.NotEmpty(t, filtersRaw) + var args map[string]map[string]bool + require.NoError(t, json.Unmarshal([]byte(filtersRaw), &args)) + require.True(t, args["label"]["com.galaxy.owner=rtmanager"]) + + writeJSON(t, w, http.StatusOK, []map[string]any{ + { + "Id": "cont-a", + "Image": "galaxy/game:1.2.3", + "Names": []string{"/galaxy-game-game-1"}, + "Labels": map[string]string{"com.galaxy.owner": "rtmanager"}, + "State": "running", + "Created": int64(1700000000), + }, + }) + }) + + got, err := client.List(context.Background(), ports.ListFilter{ + Labels: map[string]string{"com.galaxy.owner": "rtmanager"}, + }) + require.NoError(t, err) + require.Len(t, got, 1) + assert.Equal(t, "cont-a", got[0].ID) + assert.Equal(t, "galaxy/game:1.2.3", got[0].ImageRef) + assert.Equal(t, "galaxy-game-game-1", got[0].Hostname) + assert.Equal(t, "running", got[0].Status) + assert.False(t, got[0].StartedAt.IsZero()) + assert.Equal(t, "rtmanager", got[0].Labels["com.galaxy.owner"]) +} + +func TestEventsListenDecodesContainerEvents(t *testing.T) { + mu := make(chan struct{}) + client := newTestClient(t, func(w http.ResponseWriter, r 
*http.Request) { + require.Equal(t, http.MethodGet, r.Method) + require.Equal(t, dockerPath("/events"), r.URL.Path) + + flusher, ok := w.(http.Flusher) + require.True(t, ok) + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + flusher.Flush() + + // Container start event + writeEvent(t, w, "container", "start", "cont-1", map[string]string{ + "image": "galaxy/game:1.2.3", + "name": "galaxy-game-game-1", + "com.galaxy.game_id": "game-1", + }, time.Now()) + flusher.Flush() + + // Container die event with exit code 137 + writeEvent(t, w, "container", "die", "cont-1", map[string]string{ + "exitCode": "137", + }, time.Now()) + flusher.Flush() + + // Image event must be filtered out by adapter + writeEvent(t, w, "image", "pull", "img", nil, time.Now()) + flusher.Flush() + + <-mu + }) + defer close(mu) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + events, _, err := client.EventsListen(ctx) + require.NoError(t, err) + + got := []ports.DockerEvent{} + deadline := time.After(2 * time.Second) + for len(got) < 2 { + select { + case ev, ok := <-events: + if !ok { + t.Fatalf("events channel closed; got %d events", len(got)) + } + got = append(got, ev) + case <-deadline: + t.Fatalf("did not receive expected events; have %d", len(got)) + } + } + require.Len(t, got, 2) + assert.Equal(t, "start", got[0].Action) + assert.Equal(t, "cont-1", got[0].ContainerID) + assert.Equal(t, "game-1", got[0].Labels["com.galaxy.game_id"]) + assert.Equal(t, "die", got[1].Action) + assert.Equal(t, 137, got[1].ExitCode) +} + +func writeEvent(t *testing.T, w io.Writer, eventType, action, id string, attributes map[string]string, when time.Time) { + t.Helper() + payload := map[string]any{ + "Type": eventType, + "Action": action, + "Actor": map[string]any{"ID": id, "Attributes": attributes}, + "time": when.Unix(), + "timeNano": when.UnixNano(), + } + data, err := json.Marshal(payload) + require.NoError(t, err) + _, err = fmt.Fprintln(w, string(data)) + require.NoError(t, err) +} + +// Sanity: parsing helpers. +func TestParseLogOpts(t *testing.T) { + got := parseLogOpts("max-size=1m,max-file=3, ,empty=,=novalue") + assert.Equal(t, "1m", got["max-size"]) + assert.Equal(t, "3", got["max-file"]) + assert.Equal(t, "", got["empty"]) + _, hasNovalue := got["=novalue"] + assert.False(t, hasNovalue) +} + +func TestParseDockerTime(t *testing.T) { + assert.True(t, parseDockerTime("").IsZero()) + assert.True(t, parseDockerTime("not-a-date").IsZero()) + parsed := parseDockerTime("2026-04-27T11:00:00.5Z") + assert.False(t, parsed.IsZero()) + assert.Equal(t, time.UTC, parsed.Location()) +} + +func TestEnvMapToSliceDeterministicLength(t *testing.T) { + got := envMapToSlice(map[string]string{"A": "1", "B": "2"}) + assert.Len(t, got, 2) + for _, kv := range got { + assert.Contains(t, []string{"A=1", "B=2"}, kv) + } + assert.Nil(t, envMapToSlice(nil)) +} + +// Compile-time sanity: make sure errors.Is wiring stays intact. +func TestSentinelErrorsAreDistinct(t *testing.T) { + require.True(t, errors.Is(ports.ErrNetworkMissing, ports.ErrNetworkMissing)) + require.False(t, errors.Is(ports.ErrNetworkMissing, ports.ErrImageNotFound)) +} + +func TestURLPathEscapingForCharacters(t *testing.T) { + // Ensure the SDK URL path encodes special characters; the adapter + // passes raw inputs through and lets the SDK escape. 
+ encoded := url.PathEscape("game-1") + assert.Equal(t, "game-1", encoded) +} diff --git a/rtmanager/internal/adapters/docker/mocks/mock_dockerclient.go b/rtmanager/internal/adapters/docker/mocks/mock_dockerclient.go new file mode 100644 index 0000000..720347e --- /dev/null +++ b/rtmanager/internal/adapters/docker/mocks/mock_dockerclient.go @@ -0,0 +1,175 @@ +// Code generated by MockGen. DO NOT EDIT. +// Source: galaxy/rtmanager/internal/ports (interfaces: DockerClient) +// +// Generated by this command: +// +// mockgen -destination=../adapters/docker/mocks/mock_dockerclient.go -package=mocks galaxy/rtmanager/internal/ports DockerClient +// + +// Package mocks is a generated GoMock package. +package mocks + +import ( + context "context" + ports "galaxy/rtmanager/internal/ports" + reflect "reflect" + time "time" + + gomock "go.uber.org/mock/gomock" +) + +// MockDockerClient is a mock of DockerClient interface. +type MockDockerClient struct { + ctrl *gomock.Controller + recorder *MockDockerClientMockRecorder + isgomock struct{} +} + +// MockDockerClientMockRecorder is the mock recorder for MockDockerClient. +type MockDockerClientMockRecorder struct { + mock *MockDockerClient +} + +// NewMockDockerClient creates a new mock instance. +func NewMockDockerClient(ctrl *gomock.Controller) *MockDockerClient { + mock := &MockDockerClient{ctrl: ctrl} + mock.recorder = &MockDockerClientMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockDockerClient) EXPECT() *MockDockerClientMockRecorder { + return m.recorder +} + +// EnsureNetwork mocks base method. +func (m *MockDockerClient) EnsureNetwork(ctx context.Context, name string) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "EnsureNetwork", ctx, name) + ret0, _ := ret[0].(error) + return ret0 +} + +// EnsureNetwork indicates an expected call of EnsureNetwork. +func (mr *MockDockerClientMockRecorder) EnsureNetwork(ctx, name any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EnsureNetwork", reflect.TypeOf((*MockDockerClient)(nil).EnsureNetwork), ctx, name) +} + +// EventsListen mocks base method. +func (m *MockDockerClient) EventsListen(ctx context.Context) (<-chan ports.DockerEvent, <-chan error, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "EventsListen", ctx) + ret0, _ := ret[0].(<-chan ports.DockerEvent) + ret1, _ := ret[1].(<-chan error) + ret2, _ := ret[2].(error) + return ret0, ret1, ret2 +} + +// EventsListen indicates an expected call of EventsListen. +func (mr *MockDockerClientMockRecorder) EventsListen(ctx any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EventsListen", reflect.TypeOf((*MockDockerClient)(nil).EventsListen), ctx) +} + +// InspectContainer mocks base method. +func (m *MockDockerClient) InspectContainer(ctx context.Context, containerID string) (ports.ContainerInspect, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "InspectContainer", ctx, containerID) + ret0, _ := ret[0].(ports.ContainerInspect) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// InspectContainer indicates an expected call of InspectContainer. +func (mr *MockDockerClientMockRecorder) InspectContainer(ctx, containerID any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InspectContainer", reflect.TypeOf((*MockDockerClient)(nil).InspectContainer), ctx, containerID) +} + +// InspectImage mocks base method. 
+func (m *MockDockerClient) InspectImage(ctx context.Context, ref string) (ports.ImageInspect, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "InspectImage", ctx, ref) + ret0, _ := ret[0].(ports.ImageInspect) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// InspectImage indicates an expected call of InspectImage. +func (mr *MockDockerClientMockRecorder) InspectImage(ctx, ref any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InspectImage", reflect.TypeOf((*MockDockerClient)(nil).InspectImage), ctx, ref) +} + +// List mocks base method. +func (m *MockDockerClient) List(ctx context.Context, filter ports.ListFilter) ([]ports.ContainerSummary, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "List", ctx, filter) + ret0, _ := ret[0].([]ports.ContainerSummary) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// List indicates an expected call of List. +func (mr *MockDockerClientMockRecorder) List(ctx, filter any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "List", reflect.TypeOf((*MockDockerClient)(nil).List), ctx, filter) +} + +// PullImage mocks base method. +func (m *MockDockerClient) PullImage(ctx context.Context, ref string, policy ports.PullPolicy) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "PullImage", ctx, ref, policy) + ret0, _ := ret[0].(error) + return ret0 +} + +// PullImage indicates an expected call of PullImage. +func (mr *MockDockerClientMockRecorder) PullImage(ctx, ref, policy any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "PullImage", reflect.TypeOf((*MockDockerClient)(nil).PullImage), ctx, ref, policy) +} + +// Remove mocks base method. +func (m *MockDockerClient) Remove(ctx context.Context, containerID string) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Remove", ctx, containerID) + ret0, _ := ret[0].(error) + return ret0 +} + +// Remove indicates an expected call of Remove. +func (mr *MockDockerClientMockRecorder) Remove(ctx, containerID any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Remove", reflect.TypeOf((*MockDockerClient)(nil).Remove), ctx, containerID) +} + +// Run mocks base method. +func (m *MockDockerClient) Run(ctx context.Context, spec ports.RunSpec) (ports.RunResult, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Run", ctx, spec) + ret0, _ := ret[0].(ports.RunResult) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Run indicates an expected call of Run. +func (mr *MockDockerClientMockRecorder) Run(ctx, spec any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Run", reflect.TypeOf((*MockDockerClient)(nil).Run), ctx, spec) +} + +// Stop mocks base method. +func (m *MockDockerClient) Stop(ctx context.Context, containerID string, timeout time.Duration) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Stop", ctx, containerID, timeout) + ret0, _ := ret[0].(error) + return ret0 +} + +// Stop indicates an expected call of Stop. 
+func (mr *MockDockerClientMockRecorder) Stop(ctx, containerID, timeout any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Stop", reflect.TypeOf((*MockDockerClient)(nil).Stop), ctx, containerID, timeout) +} diff --git a/rtmanager/internal/adapters/docker/mocks/mock_dockerclient_assertion_test.go b/rtmanager/internal/adapters/docker/mocks/mock_dockerclient_assertion_test.go new file mode 100644 index 0000000..ceebbbc --- /dev/null +++ b/rtmanager/internal/adapters/docker/mocks/mock_dockerclient_assertion_test.go @@ -0,0 +1,11 @@ +package mocks + +import ( + "galaxy/rtmanager/internal/ports" +) + +// Compile-time assertion that the generated mock satisfies the port +// interface. Future signature drift between the port and the generated +// file fails the build at this line, which is more actionable than a +// runtime check from a service test. +var _ ports.DockerClient = (*MockDockerClient)(nil) diff --git a/rtmanager/internal/adapters/docker/smoke_test.go b/rtmanager/internal/adapters/docker/smoke_test.go new file mode 100644 index 0000000..f3f47a8 --- /dev/null +++ b/rtmanager/internal/adapters/docker/smoke_test.go @@ -0,0 +1,202 @@ +// Package docker smoke tests exercise the production adapter against a +// real Docker daemon. The tests skip when no Docker socket is reachable +// (`skipUnlessDockerAvailable`), so they run in the default +// `go test ./...` pass without a build tag. +package docker + +import ( + "context" + "crypto/rand" + "encoding/hex" + "errors" + "os" + "testing" + "time" + + "github.com/docker/docker/api/types/network" + dockerclient "github.com/docker/docker/client" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "galaxy/rtmanager/internal/ports" +) + +const ( + smokeImage = "alpine:3.21" + smokeNetPrefix = "rtmanager-smoke-" +) + +func skipUnlessDockerAvailable(t *testing.T) { + t.Helper() + if os.Getenv("DOCKER_HOST") == "" { + if _, err := os.Stat("/var/run/docker.sock"); err != nil { + t.Skip("docker daemon not available; set DOCKER_HOST or expose /var/run/docker.sock") + } + } +} + +func newSmokeAdapter(t *testing.T) (*Client, *dockerclient.Client) { + t.Helper() + + docker, err := dockerclient.NewClientWithOpts(dockerclient.FromEnv, dockerclient.WithAPIVersionNegotiation()) + require.NoError(t, err) + t.Cleanup(func() { _ = docker.Close() }) + + pingCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if _, err := docker.Ping(pingCtx); err != nil { + // A reachable socket path may still be unusable in sandboxed + // environments (e.g., macOS sandbox blocking the colima socket). + // The smoke test can only run when the daemon answers ping, so a + // permission-denied / connection-refused error is a runtime + // "Docker unavailable" signal and skips the test. + t.Skipf("docker daemon unavailable: %v", err) + } + + adapter, err := NewClient(Config{ + Docker: docker, + LogDriver: "json-file", + }) + require.NoError(t, err) + return adapter, docker +} + +func uniqueSuffix(t *testing.T) string { + t.Helper() + buf := make([]byte, 4) + _, err := rand.Read(buf) + require.NoError(t, err) + return hex.EncodeToString(buf) +} + +// TestSmokeFullLifecycle runs the adapter through every method against +// the real Docker daemon: ensure-network → pull → run → events → +// stop → remove. 
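+//
+// Each cleanup below builds a fresh, independently bounded context
+// instead of reusing a step-scoped one, so teardown still runs when an
+// earlier step times out or fails partway through the lifecycle.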
+func TestSmokeFullLifecycle(t *testing.T) { + skipUnlessDockerAvailable(t) + + adapter, docker := newSmokeAdapter(t) + + suffix := uniqueSuffix(t) + netName := smokeNetPrefix + suffix + containerName := "rtmanager-smoke-cont-" + suffix + + // Step 1 — provision a temporary user-defined bridge network. + createCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + _, err := docker.NetworkCreate(createCtx, netName, network.CreateOptions{Driver: "bridge"}) + require.NoError(t, err) + t.Cleanup(func() { + removeCtx, removeCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer removeCancel() + _ = docker.NetworkRemove(removeCtx, netName) + }) + + // Step 2 — EnsureNetwork present and missing paths. + require.NoError(t, adapter.EnsureNetwork(createCtx, netName)) + missingErr := adapter.EnsureNetwork(createCtx, "rtmanager-smoke-missing-"+suffix) + require.Error(t, missingErr) + assert.ErrorIs(t, missingErr, ports.ErrNetworkMissing) + + // Step 3 — pull alpine via the configured policy. + pullCtx, pullCancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer pullCancel() + require.NoError(t, adapter.PullImage(pullCtx, smokeImage, ports.PullPolicyIfMissing)) + + // Step 4 — subscribe to events before running the container so we + // observe the start event. + listenCtx, listenCancel := context.WithCancel(context.Background()) + defer listenCancel() + events, listenErrs, err := adapter.EventsListen(listenCtx) + require.NoError(t, err) + + // Step 5 — run a tiny container that sleeps so we can observe it. + stateDir := t.TempDir() + runCtx, runCancel := context.WithTimeout(context.Background(), 60*time.Second) + defer runCancel() + result, err := adapter.Run(runCtx, ports.RunSpec{ + Name: containerName, + Image: smokeImage, + Hostname: "smoke-" + suffix, + Network: netName, + Env: map[string]string{ + "GAME_STATE_PATH": "/tmp/state", + "STORAGE_PATH": "/tmp/state", + }, + Labels: map[string]string{ + "com.galaxy.owner": "rtmanager", + "com.galaxy.kind": "smoke", + }, + BindMounts: []ports.BindMount{ + {HostPath: stateDir, MountPath: "/tmp/state"}, + }, + LogDriver: "json-file", + CPUQuota: 0.5, + Memory: "64m", + PIDsLimit: 32, + Cmd: []string{"/bin/sh", "-c", "sleep 60"}, + }) + require.NoError(t, err) + t.Cleanup(func() { + removeCtx, removeCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer removeCancel() + _ = adapter.Remove(removeCtx, result.ContainerID) + }) + + require.NotEmpty(t, result.ContainerID) + require.Equal(t, "http://smoke-"+suffix+":8080", result.EngineEndpoint) + + // Step 6 — wait for a `start` event for the new container id. + startObserved := waitForEvent(t, events, listenErrs, "start", result.ContainerID, 15*time.Second) + require.True(t, startObserved, "did not observe start event for container %s", result.ContainerID) + + // Step 7 — InspectContainer returns running state. + inspectCtx, inspectCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer inspectCancel() + inspect, err := adapter.InspectContainer(inspectCtx, result.ContainerID) + require.NoError(t, err) + assert.Equal(t, "running", inspect.Status) + + // Step 8 — Stop, then Remove, then InspectContainer must report + // not found. 
+ stopCtx, stopCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer stopCancel() + require.NoError(t, adapter.Stop(stopCtx, result.ContainerID, 5*time.Second)) + + require.NoError(t, adapter.Remove(stopCtx, result.ContainerID)) + + if _, err := adapter.InspectContainer(stopCtx, result.ContainerID); !errors.Is(err, ports.ErrContainerNotFound) { + t.Fatalf("expected ErrContainerNotFound, got %v", err) + } + + // Step 9 — terminate the events subscription cleanly. + listenCancel() + select { + case _, ok := <-events: + _ = ok + case <-time.After(5 * time.Second): + t.Log("events channel did not close within timeout (best-effort)") + } +} + +func waitForEvent(t *testing.T, events <-chan ports.DockerEvent, errs <-chan error, action, containerID string, timeout time.Duration) bool { + t.Helper() + deadline := time.After(timeout) + for { + select { + case ev, ok := <-events: + if !ok { + return false + } + if ev.Action == action && ev.ContainerID == containerID { + return true + } + case err := <-errs: + if err != nil { + t.Fatalf("events stream error: %v", err) + } + case <-deadline: + return false + } + } +} diff --git a/rtmanager/internal/adapters/healtheventspublisher/publisher.go b/rtmanager/internal/adapters/healtheventspublisher/publisher.go new file mode 100644 index 0000000..8342b90 --- /dev/null +++ b/rtmanager/internal/adapters/healtheventspublisher/publisher.go @@ -0,0 +1,165 @@ +// Package healtheventspublisher provides the Redis-Streams-backed +// publisher for `runtime:health_events`. Every Publish call upserts the +// latest `health_snapshots` row before XADDing the event so consumers +// observing the snapshot store can never lag the event stream by more +// than the duration of one network call. +// +// The publisher is shared across `ports.HealthEventPublisher` callers: +// the start service emits `container_started`; the probe, inspect, and +// events-listener workers emit the rest. The publisher's surface is +// stable across all of them. +package healtheventspublisher + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "strconv" + + "galaxy/rtmanager/internal/domain/health" + "galaxy/rtmanager/internal/ports" + + "github.com/redis/go-redis/v9" +) + +// emptyDetails is the canonical JSON payload installed when the caller +// supplies an empty Details slice. Matches the SQL DEFAULT for +// `health_snapshots.details`. +const emptyDetails = "{}" + +// Wire field names used by the Redis Streams payload. Frozen by +// `rtmanager/api/runtime-health-asyncapi.yaml`; renaming any of them +// breaks consumers. +const ( + fieldGameID = "game_id" + fieldContainerID = "container_id" + fieldEventType = "event_type" + fieldOccurredAtMS = "occurred_at_ms" + fieldDetails = "details" +) + +// Config groups the dependencies and stream name required to construct +// a Publisher. +type Config struct { + // Client appends entries to the Redis Stream. Must be non-nil. + Client *redis.Client + + // Snapshots upserts the latest health snapshot. Must be non-nil. + Snapshots ports.HealthSnapshotStore + + // Stream stores the Redis Stream key events are published to (e.g. + // `runtime:health_events`). Must not be empty. + Stream string +} + +// Publisher implements `ports.HealthEventPublisher` on top of a shared +// Redis client and the production `health_snapshots` store. +type Publisher struct { + client *redis.Client + snapshots ports.HealthSnapshotStore + stream string +} + +// NewPublisher constructs one Publisher from cfg. 
Validation errors +// surface the missing collaborator verbatim. +func NewPublisher(cfg Config) (*Publisher, error) { + if cfg.Client == nil { + return nil, errors.New("new rtmanager health events publisher: nil redis client") + } + if cfg.Snapshots == nil { + return nil, errors.New("new rtmanager health events publisher: nil snapshot store") + } + if cfg.Stream == "" { + return nil, errors.New("new rtmanager health events publisher: stream must not be empty") + } + return &Publisher{ + client: cfg.Client, + snapshots: cfg.Snapshots, + stream: cfg.Stream, + }, nil +} + +// Publish upserts the matching health_snapshots row and then XADDs the +// envelope to the configured Redis Stream. Both side effects are +// required; the snapshot upsert runs first so a successful Publish +// always leaves the snapshot store at least as fresh as the stream. +func (publisher *Publisher) Publish(ctx context.Context, envelope ports.HealthEventEnvelope) error { + if publisher == nil || publisher.client == nil || publisher.snapshots == nil { + return errors.New("publish health event: nil publisher") + } + if ctx == nil { + return errors.New("publish health event: nil context") + } + if err := envelope.Validate(); err != nil { + return fmt.Errorf("publish health event: %w", err) + } + + details := envelope.Details + if len(details) == 0 { + details = json.RawMessage(emptyDetails) + } + + status, source := snapshotMappingFor(envelope.EventType) + snapshot := health.HealthSnapshot{ + GameID: envelope.GameID, + ContainerID: envelope.ContainerID, + Status: status, + Source: source, + Details: details, + ObservedAt: envelope.OccurredAt.UTC(), + } + if err := publisher.snapshots.Upsert(ctx, snapshot); err != nil { + return fmt.Errorf("publish health event: upsert snapshot: %w", err) + } + + occurredAtMS := envelope.OccurredAt.UTC().UnixMilli() + values := map[string]any{ + fieldGameID: envelope.GameID, + fieldContainerID: envelope.ContainerID, + fieldEventType: string(envelope.EventType), + fieldOccurredAtMS: strconv.FormatInt(occurredAtMS, 10), + fieldDetails: string(details), + } + if err := publisher.client.XAdd(ctx, &redis.XAddArgs{ + Stream: publisher.stream, + Values: values, + }).Err(); err != nil { + return fmt.Errorf("publish health event: xadd: %w", err) + } + return nil +} + +// snapshotMappingFor returns the SnapshotStatus and SnapshotSource that +// match eventType per `rtmanager/README.md §Health Monitoring`. +// +// `container_started` is observed when the start service successfully +// runs the container; the snapshot collapses it to `healthy`. +// `probe_recovered` collapses to `healthy` per +// `rtmanager/docs/domain-and-ports.md` §4: it does not have its own +// snapshot status; the next observation overwrites the prior +// `probe_failed` with `healthy`. 
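+//
+// Unknown event types fall through to empty values. Publish validates
+// the envelope before consulting this mapping, so the default branch is
+// a defensive guard rather than a reachable path (assuming
+// envelope.Validate rejects unknown EventType values).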
+func snapshotMappingFor(eventType health.EventType) (health.SnapshotStatus, health.SnapshotSource) { + switch eventType { + case health.EventTypeContainerStarted: + return health.SnapshotStatusHealthy, health.SnapshotSourceDockerEvent + case health.EventTypeContainerExited: + return health.SnapshotStatusExited, health.SnapshotSourceDockerEvent + case health.EventTypeContainerOOM: + return health.SnapshotStatusOOM, health.SnapshotSourceDockerEvent + case health.EventTypeContainerDisappeared: + return health.SnapshotStatusContainerDisappeared, health.SnapshotSourceDockerEvent + case health.EventTypeInspectUnhealthy: + return health.SnapshotStatusInspectUnhealthy, health.SnapshotSourceInspect + case health.EventTypeProbeFailed: + return health.SnapshotStatusProbeFailed, health.SnapshotSourceProbe + case health.EventTypeProbeRecovered: + return health.SnapshotStatusHealthy, health.SnapshotSourceProbe + default: + return "", "" + } +} + +// Compile-time assertion: Publisher implements +// ports.HealthEventPublisher. +var _ ports.HealthEventPublisher = (*Publisher)(nil) diff --git a/rtmanager/internal/adapters/healtheventspublisher/publisher_test.go b/rtmanager/internal/adapters/healtheventspublisher/publisher_test.go new file mode 100644 index 0000000..c185919 --- /dev/null +++ b/rtmanager/internal/adapters/healtheventspublisher/publisher_test.go @@ -0,0 +1,197 @@ +package healtheventspublisher_test + +import ( + "context" + "encoding/json" + "strconv" + "sync" + "testing" + "time" + + "galaxy/rtmanager/internal/adapters/healtheventspublisher" + "galaxy/rtmanager/internal/domain/health" + "galaxy/rtmanager/internal/ports" + + "github.com/alicebob/miniredis/v2" + "github.com/redis/go-redis/v9" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// fakeSnapshots captures Upsert invocations for assertions. 
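+// Get is a zero-value stub: the publisher under test only writes
+// snapshots through Upsert and never reads one back.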
+type fakeSnapshots struct { + mu sync.Mutex + upserts []health.HealthSnapshot + upsertErr error +} + +func (s *fakeSnapshots) Upsert(_ context.Context, snapshot health.HealthSnapshot) error { + s.mu.Lock() + defer s.mu.Unlock() + if s.upsertErr != nil { + return s.upsertErr + } + s.upserts = append(s.upserts, snapshot) + return nil +} + +func (s *fakeSnapshots) Get(_ context.Context, _ string) (health.HealthSnapshot, error) { + return health.HealthSnapshot{}, nil +} + +func newPublisher(t *testing.T, snapshots ports.HealthSnapshotStore) (*healtheventspublisher.Publisher, *miniredis.Miniredis, *redis.Client) { + t.Helper() + server := miniredis.RunT(t) + client := redis.NewClient(&redis.Options{Addr: server.Addr()}) + t.Cleanup(func() { _ = client.Close() }) + + publisher, err := healtheventspublisher.NewPublisher(healtheventspublisher.Config{ + Client: client, + Snapshots: snapshots, + Stream: "runtime:health_events", + }) + require.NoError(t, err) + return publisher, server, client +} + +func TestNewPublisherRejectsMissingCollaborators(t *testing.T) { + _, err := healtheventspublisher.NewPublisher(healtheventspublisher.Config{}) + require.Error(t, err) + + _, err = healtheventspublisher.NewPublisher(healtheventspublisher.Config{ + Client: redis.NewClient(&redis.Options{Addr: "127.0.0.1:0"}), + }) + require.Error(t, err) + + _, err = healtheventspublisher.NewPublisher(healtheventspublisher.Config{ + Client: redis.NewClient(&redis.Options{Addr: "127.0.0.1:0"}), + Snapshots: &fakeSnapshots{}, + }) + require.Error(t, err) +} + +func TestPublishContainerStartedUpsertsHealthyAndXAdds(t *testing.T) { + snapshots := &fakeSnapshots{} + publisher, _, client := newPublisher(t, snapshots) + + occurredAt := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC) + envelope := ports.HealthEventEnvelope{ + GameID: "game-1", + ContainerID: "c-1", + EventType: health.EventTypeContainerStarted, + OccurredAt: occurredAt, + Details: json.RawMessage(`{"image_ref":"galaxy/game:1.2.3"}`), + } + require.NoError(t, publisher.Publish(context.Background(), envelope)) + + require.Len(t, snapshots.upserts, 1) + snapshot := snapshots.upserts[0] + assert.Equal(t, "game-1", snapshot.GameID) + assert.Equal(t, "c-1", snapshot.ContainerID) + assert.Equal(t, health.SnapshotStatusHealthy, snapshot.Status) + assert.Equal(t, health.SnapshotSourceDockerEvent, snapshot.Source) + assert.JSONEq(t, `{"image_ref":"galaxy/game:1.2.3"}`, string(snapshot.Details)) + assert.Equal(t, occurredAt, snapshot.ObservedAt) + + entries, err := client.XRange(context.Background(), "runtime:health_events", "-", "+").Result() + require.NoError(t, err) + require.Len(t, entries, 1) + values := entries[0].Values + assert.Equal(t, "game-1", values["game_id"]) + assert.Equal(t, "c-1", values["container_id"]) + assert.Equal(t, "container_started", values["event_type"]) + assert.Equal(t, strconv.FormatInt(occurredAt.UnixMilli(), 10), values["occurred_at_ms"]) + assert.JSONEq(t, `{"image_ref":"galaxy/game:1.2.3"}`, values["details"].(string)) +} + +func TestPublishMapsEveryEventTypeToASnapshot(t *testing.T) { + t.Parallel() + cases := []struct { + eventType health.EventType + expectStatus health.SnapshotStatus + expectSource health.SnapshotSource + }{ + {health.EventTypeContainerStarted, health.SnapshotStatusHealthy, health.SnapshotSourceDockerEvent}, + {health.EventTypeContainerExited, health.SnapshotStatusExited, health.SnapshotSourceDockerEvent}, + {health.EventTypeContainerOOM, health.SnapshotStatusOOM, health.SnapshotSourceDockerEvent}, + 
{health.EventTypeContainerDisappeared, health.SnapshotStatusContainerDisappeared, health.SnapshotSourceDockerEvent}, + {health.EventTypeInspectUnhealthy, health.SnapshotStatusInspectUnhealthy, health.SnapshotSourceInspect}, + {health.EventTypeProbeFailed, health.SnapshotStatusProbeFailed, health.SnapshotSourceProbe}, + {health.EventTypeProbeRecovered, health.SnapshotStatusHealthy, health.SnapshotSourceProbe}, + } + for _, tc := range cases { + t.Run(string(tc.eventType), func(t *testing.T) { + t.Parallel() + snapshots := &fakeSnapshots{} + publisher, _, _ := newPublisher(t, snapshots) + require.NoError(t, publisher.Publish(context.Background(), ports.HealthEventEnvelope{ + GameID: "g", + ContainerID: "c", + EventType: tc.eventType, + OccurredAt: time.Now().UTC(), + Details: json.RawMessage(`{}`), + })) + require.Len(t, snapshots.upserts, 1) + assert.Equal(t, tc.expectStatus, snapshots.upserts[0].Status) + assert.Equal(t, tc.expectSource, snapshots.upserts[0].Source) + }) + } +} + +func TestPublishEmptyDetailsBecomesEmptyObject(t *testing.T) { + snapshots := &fakeSnapshots{} + publisher, _, client := newPublisher(t, snapshots) + + envelope := ports.HealthEventEnvelope{ + GameID: "g", + ContainerID: "c", + EventType: health.EventTypeContainerDisappeared, + OccurredAt: time.Now().UTC(), + } + require.NoError(t, publisher.Publish(context.Background(), envelope)) + + require.Len(t, snapshots.upserts, 1) + assert.JSONEq(t, "{}", string(snapshots.upserts[0].Details)) + + entries, err := client.XRange(context.Background(), "runtime:health_events", "-", "+").Result() + require.NoError(t, err) + require.Len(t, entries, 1) + assert.JSONEq(t, "{}", entries[0].Values["details"].(string)) +} + +func TestPublishRejectsInvalidEnvelope(t *testing.T) { + snapshots := &fakeSnapshots{} + publisher, _, client := newPublisher(t, snapshots) + + require.Error(t, publisher.Publish(context.Background(), ports.HealthEventEnvelope{})) + + entries, err := client.XRange(context.Background(), "runtime:health_events", "-", "+").Result() + require.NoError(t, err) + assert.Empty(t, entries) + assert.Empty(t, snapshots.upserts) +} + +func TestPublishSurfacesSnapshotErrorWithoutXAdd(t *testing.T) { + snapshots := &fakeSnapshots{upsertErr: assertSentinelErr} + publisher, _, client := newPublisher(t, snapshots) + + err := publisher.Publish(context.Background(), ports.HealthEventEnvelope{ + GameID: "g", + ContainerID: "c", + EventType: health.EventTypeContainerStarted, + OccurredAt: time.Now().UTC(), + Details: json.RawMessage(`{"image_ref":"x"}`), + }) + require.Error(t, err) + + entries, err := client.XRange(context.Background(), "runtime:health_events", "-", "+").Result() + require.NoError(t, err) + assert.Empty(t, entries, "xadd must not run when snapshot upsert fails") +} + +// assertSentinelErr is a sentinel for snapshot-failure assertions. +var assertSentinelErr = sentinelError("snapshot upsert failure") + +type sentinelError string + +func (s sentinelError) Error() string { return string(s) } diff --git a/rtmanager/internal/adapters/jobresultspublisher/publisher.go b/rtmanager/internal/adapters/jobresultspublisher/publisher.go new file mode 100644 index 0000000..5214388 --- /dev/null +++ b/rtmanager/internal/adapters/jobresultspublisher/publisher.go @@ -0,0 +1,100 @@ +// Package jobresultspublisher provides the Redis-Streams-backed +// publisher for `runtime:job_results`. The start-jobs and stop-jobs +// consumers call this adapter so every consumed envelope produces +// exactly one outcome entry on the result stream. 
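+//
+// A minimal publish sketch (redisClient and ctx are assumed to be wired
+// by the caller; the stream name mirrors the one used in the tests):
+//
+//	publisher, err := jobresultspublisher.NewPublisher(jobresultspublisher.Config{
+//		Client: redisClient,
+//		Stream: "runtime:job_results",
+//	})
+//	if err != nil {
+//		return err
+//	}
+//	err = publisher.Publish(ctx, ports.JobResult{
+//		GameID:  "game-1",
+//		Outcome: ports.JobOutcomeSuccess,
+//	})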
+// +// The wire fields mirror the AsyncAPI schema frozen in +// `rtmanager/api/runtime-jobs-asyncapi.yaml`. Every field is XADDed +// even when empty so consumers can rely on the schema's required-field +// set. +package jobresultspublisher + +import ( + "context" + "errors" + "fmt" + "strings" + + "galaxy/rtmanager/internal/ports" + + "github.com/redis/go-redis/v9" +) + +// Wire field names used by the Redis Streams payload. Frozen by +// `rtmanager/api/runtime-jobs-asyncapi.yaml`; renaming any of them +// breaks consumers. +const ( + fieldGameID = "game_id" + fieldOutcome = "outcome" + fieldContainerID = "container_id" + fieldEngineEndpoint = "engine_endpoint" + fieldErrorCode = "error_code" + fieldErrorMessage = "error_message" +) + +// Config groups the dependencies and stream name required to construct +// a Publisher. +type Config struct { + // Client appends entries to the Redis Stream. Must be non-nil. + Client *redis.Client + + // Stream stores the Redis Stream key job results are published to + // (e.g. `runtime:job_results`). Must not be empty. + Stream string +} + +// Publisher implements `ports.JobResultPublisher` on top of a shared +// Redis client. +type Publisher struct { + client *redis.Client + stream string +} + +// NewPublisher constructs one Publisher from cfg. Validation errors +// surface the missing collaborator verbatim. +func NewPublisher(cfg Config) (*Publisher, error) { + if cfg.Client == nil { + return nil, errors.New("new rtmanager job results publisher: nil redis client") + } + if strings.TrimSpace(cfg.Stream) == "" { + return nil, errors.New("new rtmanager job results publisher: stream must not be empty") + } + return &Publisher{ + client: cfg.Client, + stream: cfg.Stream, + }, nil +} + +// Publish XADDs result to the configured Redis Stream. The wire payload +// includes every field declared as required by the AsyncAPI schema — +// empty strings are kept so consumers always see the documented keys. +func (publisher *Publisher) Publish(ctx context.Context, result ports.JobResult) error { + if publisher == nil || publisher.client == nil { + return errors.New("publish job result: nil publisher") + } + if ctx == nil { + return errors.New("publish job result: nil context") + } + if err := result.Validate(); err != nil { + return fmt.Errorf("publish job result: %w", err) + } + + values := map[string]any{ + fieldGameID: result.GameID, + fieldOutcome: result.Outcome, + fieldContainerID: result.ContainerID, + fieldEngineEndpoint: result.EngineEndpoint, + fieldErrorCode: result.ErrorCode, + fieldErrorMessage: result.ErrorMessage, + } + if err := publisher.client.XAdd(ctx, &redis.XAddArgs{ + Stream: publisher.stream, + Values: values, + }).Err(); err != nil { + return fmt.Errorf("publish job result: xadd: %w", err) + } + return nil +} + +// Compile-time assertion: Publisher implements ports.JobResultPublisher. 
+var _ ports.JobResultPublisher = (*Publisher)(nil) diff --git a/rtmanager/internal/adapters/jobresultspublisher/publisher_test.go b/rtmanager/internal/adapters/jobresultspublisher/publisher_test.go new file mode 100644 index 0000000..2fffd5a --- /dev/null +++ b/rtmanager/internal/adapters/jobresultspublisher/publisher_test.go @@ -0,0 +1,142 @@ +package jobresultspublisher_test + +import ( + "context" + "testing" + + "galaxy/rtmanager/internal/adapters/jobresultspublisher" + "galaxy/rtmanager/internal/ports" + + "github.com/alicebob/miniredis/v2" + "github.com/redis/go-redis/v9" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func newPublisher(t *testing.T) (*jobresultspublisher.Publisher, *redis.Client) { + t.Helper() + server := miniredis.RunT(t) + client := redis.NewClient(&redis.Options{Addr: server.Addr()}) + t.Cleanup(func() { _ = client.Close() }) + + publisher, err := jobresultspublisher.NewPublisher(jobresultspublisher.Config{ + Client: client, + Stream: "runtime:job_results", + }) + require.NoError(t, err) + return publisher, client +} + +func TestNewPublisherRejectsMissingCollaborators(t *testing.T) { + _, err := jobresultspublisher.NewPublisher(jobresultspublisher.Config{}) + require.Error(t, err) + + server := miniredis.RunT(t) + client := redis.NewClient(&redis.Options{Addr: server.Addr()}) + t.Cleanup(func() { _ = client.Close() }) + + _, err = jobresultspublisher.NewPublisher(jobresultspublisher.Config{Client: client}) + require.Error(t, err) + + _, err = jobresultspublisher.NewPublisher(jobresultspublisher.Config{Client: client, Stream: " "}) + require.Error(t, err) +} + +func TestPublishRejectsInvalidResult(t *testing.T) { + publisher, _ := newPublisher(t) + + require.Error(t, publisher.Publish(context.Background(), ports.JobResult{})) + require.Error(t, publisher.Publish(context.Background(), ports.JobResult{ + GameID: "game-1", + Outcome: "weird", + })) +} + +func TestPublishStartSuccessXAddsAllRequiredFields(t *testing.T) { + publisher, client := newPublisher(t) + + result := ports.JobResult{ + GameID: "game-1", + Outcome: ports.JobOutcomeSuccess, + ContainerID: "c-1", + EngineEndpoint: "http://galaxy-game-game-1:8080", + ErrorCode: "", + ErrorMessage: "", + } + require.NoError(t, publisher.Publish(context.Background(), result)) + + entries, err := client.XRange(context.Background(), "runtime:job_results", "-", "+").Result() + require.NoError(t, err) + require.Len(t, entries, 1) + values := entries[0].Values + assert.Equal(t, "game-1", values["game_id"]) + assert.Equal(t, "success", values["outcome"]) + assert.Equal(t, "c-1", values["container_id"]) + assert.Equal(t, "http://galaxy-game-game-1:8080", values["engine_endpoint"]) + assert.Equal(t, "", values["error_code"]) + assert.Equal(t, "", values["error_message"]) +} + +func TestPublishFailureXAddsEmptyContainerAndEndpoint(t *testing.T) { + publisher, client := newPublisher(t) + + result := ports.JobResult{ + GameID: "game-2", + Outcome: ports.JobOutcomeFailure, + ErrorCode: "image_pull_failed", + ErrorMessage: "manifest unknown", + } + require.NoError(t, publisher.Publish(context.Background(), result)) + + entries, err := client.XRange(context.Background(), "runtime:job_results", "-", "+").Result() + require.NoError(t, err) + require.Len(t, entries, 1) + values := entries[0].Values + assert.Equal(t, "game-2", values["game_id"]) + assert.Equal(t, "failure", values["outcome"]) + assert.Equal(t, "", values["container_id"], "failure must publish empty container id") + assert.Equal(t, 
"", values["engine_endpoint"], "failure must publish empty engine endpoint") + assert.Equal(t, "image_pull_failed", values["error_code"]) + assert.Equal(t, "manifest unknown", values["error_message"]) +} + +func TestPublishReplayNoOpKeepsContainerAndEndpoint(t *testing.T) { + publisher, client := newPublisher(t) + + result := ports.JobResult{ + GameID: "game-3", + Outcome: ports.JobOutcomeSuccess, + ContainerID: "c-3", + EngineEndpoint: "http://galaxy-game-game-3:8080", + ErrorCode: "replay_no_op", + } + require.NoError(t, publisher.Publish(context.Background(), result)) + + entries, err := client.XRange(context.Background(), "runtime:job_results", "-", "+").Result() + require.NoError(t, err) + require.Len(t, entries, 1) + values := entries[0].Values + assert.Equal(t, "game-3", values["game_id"]) + assert.Equal(t, "success", values["outcome"]) + assert.Equal(t, "c-3", values["container_id"]) + assert.Equal(t, "http://galaxy-game-game-3:8080", values["engine_endpoint"]) + assert.Equal(t, "replay_no_op", values["error_code"]) + assert.Equal(t, "", values["error_message"]) +} + +func TestPublishFailsOnClosedClient(t *testing.T) { + server := miniredis.RunT(t) + client := redis.NewClient(&redis.Options{Addr: server.Addr()}) + publisher, err := jobresultspublisher.NewPublisher(jobresultspublisher.Config{ + Client: client, + Stream: "runtime:job_results", + }) + require.NoError(t, err) + require.NoError(t, client.Close()) + + err = publisher.Publish(context.Background(), ports.JobResult{ + GameID: "game-4", + Outcome: ports.JobOutcomeSuccess, + }) + require.Error(t, err) +} diff --git a/rtmanager/internal/adapters/lobbyclient/client.go b/rtmanager/internal/adapters/lobbyclient/client.go new file mode 100644 index 0000000..e858db0 --- /dev/null +++ b/rtmanager/internal/adapters/lobbyclient/client.go @@ -0,0 +1,219 @@ +// Package lobbyclient provides the trusted-internal Lobby REST client +// Runtime Manager uses to fetch ancillary game metadata for diagnostics. +// +// The client is intentionally minimal: the GetGame fetch is ancillary +// diagnostics because the start envelope already carries the only +// required field (`image_ref`). A failed call surfaces as +// `ports.ErrLobbyUnavailable` so callers can distinguish "not found" +// from transport faults and continue without aborting the start +// operation. +package lobbyclient + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "fmt" + "io" + "net/http" + "net/url" + "strings" + "time" + + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" + + "galaxy/rtmanager/internal/ports" +) + +const ( + getGamePathSuffix = "/api/v1/internal/games/%s" +) + +// Config configures one HTTP-backed Lobby internal client. +type Config struct { + // BaseURL stores the absolute base URL of the Lobby internal HTTP + // listener (e.g. `http://lobby:8095`). + BaseURL string + + // RequestTimeout bounds one outbound lookup request. + RequestTimeout time.Duration +} + +// Client resolves Lobby game records through the trusted internal HTTP +// API. 
+type Client struct { + baseURL string + requestTimeout time.Duration + httpClient *http.Client + closeIdleConnections func() +} + +type gameRecordEnvelope struct { + GameID string `json:"game_id"` + Status string `json:"status"` + TargetEngineVersion string `json:"target_engine_version"` +} + +type errorEnvelope struct { + Error *errorBody `json:"error"` +} + +type errorBody struct { + Code string `json:"code"` + Message string `json:"message"` +} + +// NewClient constructs a Lobby internal client that uses +// repository-standard HTTP transport instrumentation through otelhttp. +// The cloned default transport keeps the production wiring isolated +// from caller-provided transports. +func NewClient(cfg Config) (*Client, error) { + transport, ok := http.DefaultTransport.(*http.Transport) + if !ok { + return nil, errors.New("new lobby internal client: default transport is not *http.Transport") + } + cloned := transport.Clone() + return newClient(cfg, &http.Client{Transport: otelhttp.NewTransport(cloned)}, cloned.CloseIdleConnections) +} + +func newClient(cfg Config, httpClient *http.Client, closeIdleConnections func()) (*Client, error) { + switch { + case strings.TrimSpace(cfg.BaseURL) == "": + return nil, errors.New("new lobby internal client: base URL must not be empty") + case cfg.RequestTimeout <= 0: + return nil, errors.New("new lobby internal client: request timeout must be positive") + case httpClient == nil: + return nil, errors.New("new lobby internal client: http client must not be nil") + } + + parsed, err := url.Parse(strings.TrimRight(strings.TrimSpace(cfg.BaseURL), "/")) + if err != nil { + return nil, fmt.Errorf("new lobby internal client: parse base URL: %w", err) + } + if parsed.Scheme == "" || parsed.Host == "" { + return nil, errors.New("new lobby internal client: base URL must be absolute") + } + + return &Client{ + baseURL: parsed.String(), + requestTimeout: cfg.RequestTimeout, + httpClient: httpClient, + closeIdleConnections: closeIdleConnections, + }, nil +} + +// Close releases idle HTTP connections owned by the client transport. +// Call once on shutdown. +func (client *Client) Close() error { + if client == nil || client.closeIdleConnections == nil { + return nil + } + client.closeIdleConnections() + return nil +} + +// GetGame returns the Lobby game record for gameID. It maps Lobby's +// `404 not_found` to `ports.ErrLobbyGameNotFound`; every other failure +// (transport, timeout, non-2xx response) maps to +// `ports.ErrLobbyUnavailable` wrapped with the original error so callers +// keep the diagnostic detail. 
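+//
+// A typical caller branches on the sentinels and keeps going, because
+// the lookup is diagnostics only (sketch):
+//
+//	record, err := client.GetGame(ctx, gameID)
+//	switch {
+//	case errors.Is(err, ports.ErrLobbyGameNotFound):
+//		// unknown game: continue without Lobby metadata
+//	case errors.Is(err, ports.ErrLobbyUnavailable):
+//		// transport fault: log and continue
+//	case err == nil:
+//		// record is available for diagnostics
+//	}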
+func (client *Client) GetGame(ctx context.Context, gameID string) (ports.LobbyGameRecord, error) { + if client == nil || client.httpClient == nil { + return ports.LobbyGameRecord{}, errors.New("lobby get game: nil client") + } + if ctx == nil { + return ports.LobbyGameRecord{}, errors.New("lobby get game: nil context") + } + if err := ctx.Err(); err != nil { + return ports.LobbyGameRecord{}, err + } + if strings.TrimSpace(gameID) == "" { + return ports.LobbyGameRecord{}, errors.New("lobby get game: game id must not be empty") + } + + payload, statusCode, err := client.doRequest(ctx, http.MethodGet, fmt.Sprintf(getGamePathSuffix, url.PathEscape(gameID))) + if err != nil { + return ports.LobbyGameRecord{}, fmt.Errorf("%w: %w", ports.ErrLobbyUnavailable, err) + } + + switch statusCode { + case http.StatusOK: + var envelope gameRecordEnvelope + if err := decodeJSONPayload(payload, &envelope); err != nil { + return ports.LobbyGameRecord{}, fmt.Errorf("%w: decode success response: %w", ports.ErrLobbyUnavailable, err) + } + if strings.TrimSpace(envelope.GameID) == "" { + return ports.LobbyGameRecord{}, fmt.Errorf("%w: success response missing game_id", ports.ErrLobbyUnavailable) + } + return ports.LobbyGameRecord{ + GameID: envelope.GameID, + Status: envelope.Status, + TargetEngineVersion: envelope.TargetEngineVersion, + }, nil + case http.StatusNotFound: + return ports.LobbyGameRecord{}, ports.ErrLobbyGameNotFound + default: + errorCode := decodeErrorCode(payload) + if errorCode != "" { + return ports.LobbyGameRecord{}, fmt.Errorf("%w: unexpected status %d (error_code=%s)", ports.ErrLobbyUnavailable, statusCode, errorCode) + } + return ports.LobbyGameRecord{}, fmt.Errorf("%w: unexpected status %d", ports.ErrLobbyUnavailable, statusCode) + } +} + +func (client *Client) doRequest(ctx context.Context, method, requestPath string) ([]byte, int, error) { + attemptCtx, cancel := context.WithTimeout(ctx, client.requestTimeout) + defer cancel() + + req, err := http.NewRequestWithContext(attemptCtx, method, client.baseURL+requestPath, nil) + if err != nil { + return nil, 0, fmt.Errorf("build request: %w", err) + } + req.Header.Set("Accept", "application/json") + + resp, err := client.httpClient.Do(req) + if err != nil { + return nil, 0, err + } + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, 0, fmt.Errorf("read response body: %w", err) + } + return body, resp.StatusCode, nil +} + +// decodeJSONPayload tolerantly decodes a JSON object; unknown fields +// are ignored so additive Lobby schema changes do not break us. +func decodeJSONPayload(payload []byte, target any) error { + decoder := json.NewDecoder(bytes.NewReader(payload)) + if err := decoder.Decode(target); err != nil { + return err + } + if err := decoder.Decode(&struct{}{}); err != io.EOF { + if err == nil { + return errors.New("unexpected trailing JSON input") + } + return err + } + return nil +} + +func decodeErrorCode(payload []byte) string { + if len(payload) == 0 { + return "" + } + var envelope errorEnvelope + if err := json.Unmarshal(payload, &envelope); err != nil { + return "" + } + if envelope.Error == nil { + return "" + } + return envelope.Error.Code +} + +// Compile-time assertion: Client implements ports.LobbyInternalClient. 
+var _ ports.LobbyInternalClient = (*Client)(nil) diff --git a/rtmanager/internal/adapters/lobbyclient/client_test.go b/rtmanager/internal/adapters/lobbyclient/client_test.go new file mode 100644 index 0000000..a01a5d6 --- /dev/null +++ b/rtmanager/internal/adapters/lobbyclient/client_test.go @@ -0,0 +1,153 @@ +package lobbyclient + +import ( + "context" + "errors" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "galaxy/rtmanager/internal/ports" +) + +func newTestClient(t *testing.T, baseURL string, timeout time.Duration) *Client { + t.Helper() + client, err := NewClient(Config{BaseURL: baseURL, RequestTimeout: timeout}) + require.NoError(t, err) + t.Cleanup(func() { _ = client.Close() }) + return client +} + +func TestNewClientValidatesConfig(t *testing.T) { + cases := map[string]Config{ + "empty base url": {BaseURL: "", RequestTimeout: time.Second}, + "non-absolute base url": {BaseURL: "lobby:8095", RequestTimeout: time.Second}, + "non-positive timeout": {BaseURL: "http://lobby:8095", RequestTimeout: 0}, + } + for name, cfg := range cases { + t.Run(name, func(t *testing.T) { + _, err := NewClient(cfg) + require.Error(t, err) + }) + } +} + +func TestGetGameSuccess(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + require.Equal(t, http.MethodGet, r.Method) + require.Equal(t, "/api/v1/internal/games/game-1", r.URL.Path) + require.Equal(t, "application/json", r.Header.Get("Accept")) + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{ + "game_id": "game-1", + "game_name": "Sample", + "status": "running", + "target_engine_version": "1.4.2", + "current_turn": 0, + "runtime_status": "running" + }`)) + })) + defer server.Close() + + client := newTestClient(t, server.URL, time.Second) + got, err := client.GetGame(context.Background(), "game-1") + require.NoError(t, err) + assert.Equal(t, "game-1", got.GameID) + assert.Equal(t, "running", got.Status) + assert.Equal(t, "1.4.2", got.TargetEngineVersion) +} + +func TestGetGameNotFound(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusNotFound) + _, _ = w.Write([]byte(`{"error":{"code":"not_found","message":"no such game"}}`)) + })) + defer server.Close() + + client := newTestClient(t, server.URL, time.Second) + _, err := client.GetGame(context.Background(), "missing") + require.Error(t, err) + assert.True(t, errors.Is(err, ports.ErrLobbyGameNotFound)) + assert.False(t, errors.Is(err, ports.ErrLobbyUnavailable)) +} + +func TestGetGameInternalErrorMapsToUnavailable(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusInternalServerError) + _, _ = w.Write([]byte(`{"error":{"code":"internal_error","message":"boom"}}`)) + })) + defer server.Close() + + client := newTestClient(t, server.URL, time.Second) + _, err := client.GetGame(context.Background(), "x") + require.Error(t, err) + assert.True(t, errors.Is(err, ports.ErrLobbyUnavailable)) + assert.Contains(t, err.Error(), "500") + assert.Contains(t, err.Error(), "internal_error") +} + +func TestGetGameTimeoutMapsToUnavailable(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + time.Sleep(150 
* time.Millisecond)
+		_, _ = w.Write([]byte(`{}`))
+	}))
+	defer server.Close()
+
+	client := newTestClient(t, server.URL, 50*time.Millisecond)
+	_, err := client.GetGame(context.Background(), "x")
+	require.Error(t, err)
+	assert.True(t, errors.Is(err, ports.ErrLobbyUnavailable))
+}
+
+func TestGetGameSuccessMissingGameIDIsUnavailable(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		_, _ = w.Write([]byte(`{"status":"running"}`))
+	}))
+	defer server.Close()
+
+	client := newTestClient(t, server.URL, time.Second)
+	_, err := client.GetGame(context.Background(), "x")
+	require.Error(t, err)
+	assert.True(t, errors.Is(err, ports.ErrLobbyUnavailable))
+	assert.Contains(t, err.Error(), "missing game_id")
+}
+
+func TestGetGameRejectsBadInput(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		// t.Error rather than t.Fatal: the handler runs on the server's
+		// goroutine, and FailNow must only be called from the goroutine
+		// running the test.
+		t.Error("must not contact lobby on bad input")
+	}))
+	defer server.Close()
+
+	client := newTestClient(t, server.URL, time.Second)
+	t.Run("empty game id", func(t *testing.T) {
+		_, err := client.GetGame(context.Background(), " ")
+		require.Error(t, err)
+		assert.Contains(t, err.Error(), "game id")
+	})
+	t.Run("canceled context", func(t *testing.T) {
+		ctx, cancel := context.WithCancel(context.Background())
+		cancel()
+		_, err := client.GetGame(ctx, "x")
+		require.Error(t, err)
+		assert.True(t, errors.Is(err, context.Canceled))
+	})
+}
+
+func TestCloseReleasesConnections(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		_, _ = w.Write([]byte(`{"game_id":"x","status":"running","target_engine_version":"1.0.0"}`))
+	}))
+	defer server.Close()
+
+	client := newTestClient(t, server.URL, time.Second)
+	_, err := client.GetGame(context.Background(), "x")
+	require.NoError(t, err)
+	assert.NoError(t, client.Close())
+	assert.NoError(t, client.Close()) // idempotent
+}
diff --git a/rtmanager/internal/adapters/notificationpublisher/publisher.go b/rtmanager/internal/adapters/notificationpublisher/publisher.go
new file mode 100644
index 0000000..d63f543
--- /dev/null
+++ b/rtmanager/internal/adapters/notificationpublisher/publisher.go
@@ -0,0 +1,70 @@
+// Package notificationpublisher provides the Redis-Streams-backed
+// notification-intent publisher Runtime Manager uses to emit admin-only
+// failure notifications. The adapter is a thin shim over
+// `galaxy/notificationintent.Publisher` that drops the entry id at the
+// wrapper boundary; rationale lives in
+// `rtmanager/docs/domain-and-ports.md §7`.
+package notificationpublisher
+
+import (
+	"context"
+	"errors"
+	"fmt"
+
+	"github.com/redis/go-redis/v9"
+
+	"galaxy/notificationintent"
+	"galaxy/rtmanager/internal/ports"
+)
+
+// Config groups the dependencies and stream name required to
+// construct a Publisher.
+type Config struct {
+	// Client appends entries to Redis Streams. Must be non-nil.
+	Client *redis.Client
+
+	// Stream stores the Redis Stream key intents are published to.
+	// When empty, `notificationintent.DefaultIntentsStream` is used.
+	Stream string
+}
+
+// Publisher implements `ports.NotificationIntentPublisher` on top of
+// the shared `notificationintent.Publisher`. The wrapper is the single
+// point that drops the entry id returned by the underlying publisher.
+type Publisher struct {
+	inner *notificationintent.Publisher
+}
+
+// NewPublisher constructs a Publisher from cfg. 
It wraps the shared +// publisher and delegates validation; transport errors and validation +// errors propagate verbatim. +func NewPublisher(cfg Config) (*Publisher, error) { + if cfg.Client == nil { + return nil, errors.New("new rtmanager notification publisher: nil redis client") + } + inner, err := notificationintent.NewPublisher(notificationintent.PublisherConfig{ + Client: cfg.Client, + Stream: cfg.Stream, + }) + if err != nil { + return nil, fmt.Errorf("new rtmanager notification publisher: %w", err) + } + return &Publisher{inner: inner}, nil +} + +// Publish forwards intent to the underlying notificationintent +// publisher and discards the resulting Redis Stream entry id. A failed +// publish surfaces as the underlying error. +func (publisher *Publisher) Publish(ctx context.Context, intent notificationintent.Intent) error { + if publisher == nil || publisher.inner == nil { + return errors.New("publish notification intent: nil publisher") + } + if _, err := publisher.inner.Publish(ctx, intent); err != nil { + return err + } + return nil +} + +// Compile-time assertion: Publisher implements +// ports.NotificationIntentPublisher. +var _ ports.NotificationIntentPublisher = (*Publisher)(nil) diff --git a/rtmanager/internal/adapters/notificationpublisher/publisher_test.go b/rtmanager/internal/adapters/notificationpublisher/publisher_test.go new file mode 100644 index 0000000..86ca667 --- /dev/null +++ b/rtmanager/internal/adapters/notificationpublisher/publisher_test.go @@ -0,0 +1,123 @@ +package notificationpublisher + +import ( + "context" + "encoding/json" + "testing" + "time" + + "github.com/alicebob/miniredis/v2" + "github.com/redis/go-redis/v9" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "galaxy/notificationintent" +) + +func newRedis(t *testing.T) (*redis.Client, *miniredis.Miniredis) { + t.Helper() + server := miniredis.RunT(t) + client := redis.NewClient(&redis.Options{Addr: server.Addr()}) + t.Cleanup(func() { _ = client.Close() }) + return client, server +} + +func readStream(t *testing.T, client *redis.Client, stream string) []redis.XMessage { + t.Helper() + messages, err := client.XRange(context.Background(), stream, "-", "+").Result() + require.NoError(t, err) + return messages +} + +func TestNewPublisherValidation(t *testing.T) { + t.Run("nil client", func(t *testing.T) { + _, err := NewPublisher(Config{}) + require.Error(t, err) + assert.Contains(t, err.Error(), "nil redis client") + }) +} + +func TestPublisherWritesIntent(t *testing.T) { + client, _ := newRedis(t) + + publisher, err := NewPublisher(Config{Client: client, Stream: "notification:intents"}) + require.NoError(t, err) + + intent, err := notificationintent.NewRuntimeImagePullFailedIntent( + notificationintent.Metadata{ + IdempotencyKey: "rtmanager:start:game-1:abc", + OccurredAt: time.UnixMilli(1714200000000).UTC(), + }, + notificationintent.RuntimeImagePullFailedPayload{ + GameID: "game-1", + ImageRef: "galaxy/game:1.4.2", + ErrorCode: "image_pull_failed", + ErrorMessage: "registry timeout", + AttemptedAtMs: 1714200000000, + }, + ) + require.NoError(t, err) + + require.NoError(t, publisher.Publish(context.Background(), intent)) + + messages := readStream(t, client, "notification:intents") + require.Len(t, messages, 1) + + values := messages[0].Values + assert.Equal(t, "runtime.image_pull_failed", values["notification_type"]) + assert.Equal(t, "runtime_manager", values["producer"]) + assert.Equal(t, "admin_email", values["audience_kind"]) + assert.Equal(t, 
"rtmanager:start:game-1:abc", values["idempotency_key"]) + + // recipient_user_ids_json must be absent for admin_email audience. + _, hasRecipients := values["recipient_user_ids_json"] + assert.False(t, hasRecipients) + + payloadRaw, ok := values["payload_json"].(string) + require.True(t, ok) + var payload map[string]any + require.NoError(t, json.Unmarshal([]byte(payloadRaw), &payload)) + assert.Equal(t, "game-1", payload["game_id"]) + assert.Equal(t, "galaxy/game:1.4.2", payload["image_ref"]) +} + +func TestPublisherForwardsValidationError(t *testing.T) { + client, _ := newRedis(t) + publisher, err := NewPublisher(Config{Client: client}) + require.NoError(t, err) + + // Intent with a zero OccurredAt fails the shared validator. + bad := notificationintent.Intent{ + NotificationType: notificationintent.NotificationTypeRuntimeImagePullFailed, + Producer: notificationintent.ProducerRuntimeManager, + AudienceKind: notificationintent.AudienceKindAdminEmail, + IdempotencyKey: "k", + PayloadJSON: `{"game_id":"g","image_ref":"r","error_code":"c","error_message":"m","attempted_at_ms":1}`, + } + require.Error(t, publisher.Publish(context.Background(), bad)) +} + +func TestPublisherDefaultsStreamName(t *testing.T) { + client, _ := newRedis(t) + publisher, err := NewPublisher(Config{Client: client, Stream: ""}) + require.NoError(t, err) + + intent, err := notificationintent.NewRuntimeContainerStartFailedIntent( + notificationintent.Metadata{ + IdempotencyKey: "k", + OccurredAt: time.UnixMilli(1714200000000).UTC(), + }, + notificationintent.RuntimeContainerStartFailedPayload{ + GameID: "g", + ImageRef: "r", + ErrorCode: "container_start_failed", + ErrorMessage: "boom", + AttemptedAtMs: 1714200000000, + }, + ) + require.NoError(t, err) + require.NoError(t, publisher.Publish(context.Background(), intent)) + + messages := readStream(t, client, notificationintent.DefaultIntentsStream) + require.Len(t, messages, 1) +} diff --git a/rtmanager/internal/adapters/postgres/healthsnapshotstore/store.go b/rtmanager/internal/adapters/postgres/healthsnapshotstore/store.go new file mode 100644 index 0000000..083c7d9 --- /dev/null +++ b/rtmanager/internal/adapters/postgres/healthsnapshotstore/store.go @@ -0,0 +1,203 @@ +// Package healthsnapshotstore implements the PostgreSQL-backed adapter +// for `ports.HealthSnapshotStore`. +// +// The package owns the on-disk shape of the `health_snapshots` table +// defined in +// `galaxy/rtmanager/internal/adapters/postgres/migrations/00001_init.sql` +// and translates the schema-agnostic `ports.HealthSnapshotStore` interface +// declared in `internal/ports/healthsnapshotstore.go` into concrete +// go-jet/v2 statements driven by the pgx driver. +// +// The `details` jsonb column round-trips as a `json.RawMessage`. Empty +// payloads are substituted with the SQL default `{}` on Upsert so the +// CHECK constraints and downstream readers never observe a non-JSON +// empty string. +package healthsnapshotstore + +import ( + "context" + "database/sql" + "encoding/json" + "errors" + "fmt" + "strings" + "time" + + "galaxy/rtmanager/internal/adapters/postgres/internal/sqlx" + pgtable "galaxy/rtmanager/internal/adapters/postgres/jet/rtmanager/table" + "galaxy/rtmanager/internal/domain/health" + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/ports" + + pg "github.com/go-jet/jet/v2/postgres" +) + +// emptyDetails is the canonical jsonb payload installed when the caller +// supplies an empty Details slice. It matches the SQL DEFAULT for the +// column. 
+const emptyDetails = "{}" + +// Config configures one PostgreSQL-backed health-snapshot store instance. +type Config struct { + // DB stores the connection pool the store uses for every query. + DB *sql.DB + + // OperationTimeout bounds one round trip. + OperationTimeout time.Duration +} + +// Store persists Runtime Manager health snapshots in PostgreSQL. +type Store struct { + db *sql.DB + operationTimeout time.Duration +} + +// New constructs one PostgreSQL-backed health-snapshot store from cfg. +func New(cfg Config) (*Store, error) { + if cfg.DB == nil { + return nil, errors.New("new postgres health snapshot store: db must not be nil") + } + if cfg.OperationTimeout <= 0 { + return nil, errors.New("new postgres health snapshot store: operation timeout must be positive") + } + return &Store{ + db: cfg.DB, + operationTimeout: cfg.OperationTimeout, + }, nil +} + +// healthSnapshotSelectColumns is the canonical SELECT list for the +// health_snapshots table, matching scanSnapshot's column order. +var healthSnapshotSelectColumns = pg.ColumnList{ + pgtable.HealthSnapshots.GameID, + pgtable.HealthSnapshots.ContainerID, + pgtable.HealthSnapshots.Status, + pgtable.HealthSnapshots.Source, + pgtable.HealthSnapshots.Details, + pgtable.HealthSnapshots.ObservedAt, +} + +// Upsert installs snapshot as the latest observation for snapshot.GameID. +// snapshot is validated through health.HealthSnapshot.Validate before the +// SQL is issued. +func (store *Store) Upsert(ctx context.Context, snapshot health.HealthSnapshot) error { + if store == nil || store.db == nil { + return errors.New("upsert health snapshot: nil store") + } + if err := snapshot.Validate(); err != nil { + return fmt.Errorf("upsert health snapshot: %w", err) + } + + operationCtx, cancel, err := sqlx.WithTimeout(ctx, "upsert health snapshot", store.operationTimeout) + if err != nil { + return err + } + defer cancel() + + details := emptyDetails + if len(snapshot.Details) > 0 { + details = string(snapshot.Details) + } + + stmt := pgtable.HealthSnapshots.INSERT( + pgtable.HealthSnapshots.GameID, + pgtable.HealthSnapshots.ContainerID, + pgtable.HealthSnapshots.Status, + pgtable.HealthSnapshots.Source, + pgtable.HealthSnapshots.Details, + pgtable.HealthSnapshots.ObservedAt, + ).VALUES( + snapshot.GameID, + snapshot.ContainerID, + string(snapshot.Status), + string(snapshot.Source), + details, + snapshot.ObservedAt.UTC(), + ).ON_CONFLICT(pgtable.HealthSnapshots.GameID).DO_UPDATE( + pg.SET( + pgtable.HealthSnapshots.ContainerID.SET(pgtable.HealthSnapshots.EXCLUDED.ContainerID), + pgtable.HealthSnapshots.Status.SET(pgtable.HealthSnapshots.EXCLUDED.Status), + pgtable.HealthSnapshots.Source.SET(pgtable.HealthSnapshots.EXCLUDED.Source), + pgtable.HealthSnapshots.Details.SET(pgtable.HealthSnapshots.EXCLUDED.Details), + pgtable.HealthSnapshots.ObservedAt.SET(pgtable.HealthSnapshots.EXCLUDED.ObservedAt), + ), + ) + + query, args := stmt.Sql() + if _, err := store.db.ExecContext(operationCtx, query, args...); err != nil { + return fmt.Errorf("upsert health snapshot: %w", err) + } + return nil +} + +// Get returns the latest snapshot for gameID. It returns +// runtime.ErrNotFound when no snapshot has been recorded yet. 
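+//
+// Callers that treat "no snapshot yet" as a normal state branch on the
+// sentinel (sketch):
+//
+//	snapshot, err := store.Get(ctx, gameID)
+//	if errors.Is(err, runtime.ErrNotFound) {
+//		// nothing observed for gameID yet; snapshot is the zero value
+//	}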
+func (store *Store) Get(ctx context.Context, gameID string) (health.HealthSnapshot, error) { + if store == nil || store.db == nil { + return health.HealthSnapshot{}, errors.New("get health snapshot: nil store") + } + if strings.TrimSpace(gameID) == "" { + return health.HealthSnapshot{}, fmt.Errorf("get health snapshot: game id must not be empty") + } + + operationCtx, cancel, err := sqlx.WithTimeout(ctx, "get health snapshot", store.operationTimeout) + if err != nil { + return health.HealthSnapshot{}, err + } + defer cancel() + + stmt := pg.SELECT(healthSnapshotSelectColumns). + FROM(pgtable.HealthSnapshots). + WHERE(pgtable.HealthSnapshots.GameID.EQ(pg.String(gameID))) + + query, args := stmt.Sql() + row := store.db.QueryRowContext(operationCtx, query, args...) + snapshot, err := scanSnapshot(row) + if sqlx.IsNoRows(err) { + return health.HealthSnapshot{}, runtime.ErrNotFound + } + if err != nil { + return health.HealthSnapshot{}, fmt.Errorf("get health snapshot: %w", err) + } + return snapshot, nil +} + +// rowScanner abstracts *sql.Row and *sql.Rows so scanSnapshot can be +// shared across both single-row reads and iterated reads. +type rowScanner interface { + Scan(dest ...any) error +} + +// scanSnapshot scans one health_snapshots row from rs. +func scanSnapshot(rs rowScanner) (health.HealthSnapshot, error) { + var ( + gameID string + containerID string + status string + source string + details []byte + observedAt time.Time + ) + if err := rs.Scan( + &gameID, + &containerID, + &status, + &source, + &details, + &observedAt, + ); err != nil { + return health.HealthSnapshot{}, err + } + return health.HealthSnapshot{ + GameID: gameID, + ContainerID: containerID, + Status: health.SnapshotStatus(status), + Source: health.SnapshotSource(source), + Details: json.RawMessage(details), + ObservedAt: observedAt.UTC(), + }, nil +} + +// Ensure Store satisfies the ports.HealthSnapshotStore interface at +// compile time. 
+var _ ports.HealthSnapshotStore = (*Store)(nil) diff --git a/rtmanager/internal/adapters/postgres/healthsnapshotstore/store_test.go b/rtmanager/internal/adapters/postgres/healthsnapshotstore/store_test.go new file mode 100644 index 0000000..d982483 --- /dev/null +++ b/rtmanager/internal/adapters/postgres/healthsnapshotstore/store_test.go @@ -0,0 +1,157 @@ +package healthsnapshotstore_test + +import ( + "context" + "encoding/json" + "testing" + "time" + + "galaxy/rtmanager/internal/adapters/postgres/healthsnapshotstore" + "galaxy/rtmanager/internal/adapters/postgres/internal/pgtest" + "galaxy/rtmanager/internal/domain/health" + "galaxy/rtmanager/internal/domain/runtime" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestMain(m *testing.M) { pgtest.RunMain(m) } + +func newStore(t *testing.T) *healthsnapshotstore.Store { + t.Helper() + pgtest.TruncateAll(t) + store, err := healthsnapshotstore.New(healthsnapshotstore.Config{ + DB: pgtest.Ensure(t).Pool(), + OperationTimeout: pgtest.OperationTimeout, + }) + require.NoError(t, err) + return store +} + +func probeFailedSnapshot(gameID string, observedAt time.Time) health.HealthSnapshot { + return health.HealthSnapshot{ + GameID: gameID, + ContainerID: "container-1", + Status: health.SnapshotStatusProbeFailed, + Source: health.SnapshotSourceProbe, + Details: json.RawMessage(`{"consecutive_failures":3,"last_status":503,"last_error":"timeout"}`), + ObservedAt: observedAt, + } +} + +func TestUpsertAndGetRoundTrip(t *testing.T) { + ctx := context.Background() + store := newStore(t) + + snapshot := probeFailedSnapshot("game-001", + time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)) + require.NoError(t, store.Upsert(ctx, snapshot)) + + got, err := store.Get(ctx, "game-001") + require.NoError(t, err) + assert.Equal(t, snapshot.GameID, got.GameID) + assert.Equal(t, snapshot.ContainerID, got.ContainerID) + assert.Equal(t, snapshot.Status, got.Status) + assert.Equal(t, snapshot.Source, got.Source) + assert.JSONEq(t, string(snapshot.Details), string(got.Details)) + assert.True(t, snapshot.ObservedAt.Equal(got.ObservedAt)) + assert.Equal(t, time.UTC, got.ObservedAt.Location()) +} + +func TestUpsertOverwritesPriorSnapshot(t *testing.T) { + ctx := context.Background() + store := newStore(t) + + first := probeFailedSnapshot("game-001", + time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)) + require.NoError(t, store.Upsert(ctx, first)) + + second := health.HealthSnapshot{ + GameID: "game-001", + ContainerID: "container-2", + Status: health.SnapshotStatusHealthy, + Source: health.SnapshotSourceInspect, + Details: json.RawMessage(`{"restart_count":0,"state":"running"}`), + ObservedAt: first.ObservedAt.Add(time.Minute), + } + require.NoError(t, store.Upsert(ctx, second)) + + got, err := store.Get(ctx, "game-001") + require.NoError(t, err) + assert.Equal(t, "container-2", got.ContainerID) + assert.Equal(t, health.SnapshotStatusHealthy, got.Status) + assert.Equal(t, health.SnapshotSourceInspect, got.Source) + assert.JSONEq(t, string(second.Details), string(got.Details)) + assert.True(t, second.ObservedAt.Equal(got.ObservedAt)) +} + +func TestGetReturnsNotFound(t *testing.T) { + ctx := context.Background() + store := newStore(t) + + _, err := store.Get(ctx, "game-missing") + require.ErrorIs(t, err, runtime.ErrNotFound) +} + +func TestUpsertEmptyDetailsRoundTripsAsEmptyObject(t *testing.T) { + ctx := context.Background() + store := newStore(t) + + snapshot := probeFailedSnapshot("game-001", + time.Date(2026, 4, 27, 12, 0, 0, 0, 
time.UTC)) + snapshot.Details = nil + require.NoError(t, store.Upsert(ctx, snapshot)) + + got, err := store.Get(ctx, "game-001") + require.NoError(t, err) + assert.JSONEq(t, "{}", string(got.Details), + "empty json.RawMessage must round-trip as the SQL default {}, got %q", + string(got.Details)) +} + +func TestUpsertValidatesSnapshot(t *testing.T) { + ctx := context.Background() + store := newStore(t) + + tests := []struct { + name string + mutate func(*health.HealthSnapshot) + }{ + {"empty game id", func(s *health.HealthSnapshot) { s.GameID = "" }}, + {"unknown status", func(s *health.HealthSnapshot) { s.Status = "exotic" }}, + {"unknown source", func(s *health.HealthSnapshot) { s.Source = "exotic" }}, + {"zero observed at", func(s *health.HealthSnapshot) { s.ObservedAt = time.Time{} }}, + {"invalid json details", func(s *health.HealthSnapshot) { + s.Details = json.RawMessage("not json") + }}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + snapshot := probeFailedSnapshot("game-001", + time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)) + tt.mutate(&snapshot) + err := store.Upsert(ctx, snapshot) + require.Error(t, err) + }) + } +} + +func TestGetRejectsEmptyGameID(t *testing.T) { + ctx := context.Background() + store := newStore(t) + + _, err := store.Get(ctx, "") + require.Error(t, err) +} + +func TestNewRejectsNilDB(t *testing.T) { + _, err := healthsnapshotstore.New(healthsnapshotstore.Config{OperationTimeout: time.Second}) + require.Error(t, err) +} + +func TestNewRejectsNonPositiveTimeout(t *testing.T) { + _, err := healthsnapshotstore.New(healthsnapshotstore.Config{ + DB: pgtest.Ensure(t).Pool(), + }) + require.Error(t, err) +} diff --git a/rtmanager/internal/adapters/postgres/internal/pgtest/pgtest.go b/rtmanager/internal/adapters/postgres/internal/pgtest/pgtest.go new file mode 100644 index 0000000..e20c592 --- /dev/null +++ b/rtmanager/internal/adapters/postgres/internal/pgtest/pgtest.go @@ -0,0 +1,209 @@ +// Package pgtest exposes the testcontainers-backed PostgreSQL bootstrap +// shared by every Runtime Manager PG adapter test. The package is regular +// Go code — not a `_test.go` file — so it can be imported by the +// `_test.go` files in the three sibling store packages +// (`runtimerecordstore`, `operationlogstore`, `healthsnapshotstore`). +// +// No production code in `cmd/rtmanager` or in the runtime imports this +// package. The testcontainers-go dependency therefore stays out of the +// production binary's import graph. +package pgtest + +import ( + "context" + "database/sql" + "net/url" + "os" + "sync" + "testing" + "time" + + "galaxy/postgres" + "galaxy/rtmanager/internal/adapters/postgres/migrations" + + testcontainers "github.com/testcontainers/testcontainers-go" + tcpostgres "github.com/testcontainers/testcontainers-go/modules/postgres" + "github.com/testcontainers/testcontainers-go/wait" +) + +const ( + postgresImage = "postgres:16-alpine" + superUser = "galaxy" + superPassword = "galaxy" + superDatabase = "galaxy_rtmanager" + serviceRole = "rtmanagerservice" + servicePassword = "rtmanagerservice" + serviceSchema = "rtmanager" + containerStartup = 90 * time.Second + + // OperationTimeout is the per-statement timeout used by every store + // constructed via the per-package newStore helpers. Tests may pass a + // smaller value if they need to assert deadline behaviour explicitly. 
+ OperationTimeout = 10 * time.Second +) + +// Env holds the per-process container plus the *sql.DB pool already +// provisioned with the rtmanager schema, role, and migrations applied. +type Env struct { + container *tcpostgres.PostgresContainer + pool *sql.DB +} + +// Pool returns the shared pool. Tests truncate per-table state before +// each run via TruncateAll. +func (env *Env) Pool() *sql.DB { return env.pool } + +var ( + once sync.Once + cur *Env + curEr error +) + +// Ensure starts the PostgreSQL container on first invocation and applies +// the embedded goose migrations. Subsequent invocations reuse the same +// container/pool. When Docker is unavailable Ensure calls t.Skip with the +// underlying error so the test suite still passes on machines without +// Docker. +func Ensure(t testing.TB) *Env { + t.Helper() + once.Do(func() { + cur, curEr = start() + }) + if curEr != nil { + t.Skipf("postgres container start failed (Docker unavailable?): %v", curEr) + } + return cur +} + +// TruncateAll wipes every Runtime Manager table inside the shared pool, +// leaving the schema and indexes intact. Use it from each test that needs +// a clean slate. +func TruncateAll(t testing.TB) { + t.Helper() + env := Ensure(t) + const stmt = `TRUNCATE TABLE runtime_records, operation_log, health_snapshots RESTART IDENTITY CASCADE` + if _, err := env.pool.ExecContext(context.Background(), stmt); err != nil { + t.Fatalf("truncate rtmanager tables: %v", err) + } +} + +// Shutdown terminates the shared container and closes the pool. It is +// invoked from each test package's TestMain after `m.Run` returns so the +// container is released even if individual tests panic. +func Shutdown() { + if cur == nil { + return + } + if cur.pool != nil { + _ = cur.pool.Close() + } + if cur.container != nil { + _ = testcontainers.TerminateContainer(cur.container) + } + cur = nil +} + +// RunMain is a convenience helper for each store package's TestMain: it +// runs the test main, captures the exit code, shuts the container down, +// and exits. Wiring it through one helper keeps every TestMain to two +// lines. +func RunMain(m *testing.M) { + code := m.Run() + Shutdown() + os.Exit(code) +} + +func start() (*Env, error) { + ctx := context.Background() + container, err := tcpostgres.Run(ctx, postgresImage, + tcpostgres.WithDatabase(superDatabase), + tcpostgres.WithUsername(superUser), + tcpostgres.WithPassword(superPassword), + testcontainers.WithWaitStrategy( + wait.ForLog("database system is ready to accept connections"). + WithOccurrence(2). 
+ WithStartupTimeout(containerStartup), + ), + ) + if err != nil { + return nil, err + } + baseDSN, err := container.ConnectionString(ctx, "sslmode=disable") + if err != nil { + _ = testcontainers.TerminateContainer(container) + return nil, err + } + if err := provisionRoleAndSchema(ctx, baseDSN); err != nil { + _ = testcontainers.TerminateContainer(container) + return nil, err + } + scopedDSN, err := dsnForServiceRole(baseDSN) + if err != nil { + _ = testcontainers.TerminateContainer(container) + return nil, err + } + cfg := postgres.DefaultConfig() + cfg.PrimaryDSN = scopedDSN + cfg.OperationTimeout = OperationTimeout + pool, err := postgres.OpenPrimary(ctx, cfg) + if err != nil { + _ = testcontainers.TerminateContainer(container) + return nil, err + } + if err := postgres.Ping(ctx, pool, OperationTimeout); err != nil { + _ = pool.Close() + _ = testcontainers.TerminateContainer(container) + return nil, err + } + if err := postgres.RunMigrations(ctx, pool, migrations.FS(), "."); err != nil { + _ = pool.Close() + _ = testcontainers.TerminateContainer(container) + return nil, err + } + return &Env{container: container, pool: pool}, nil +} + +func provisionRoleAndSchema(ctx context.Context, baseDSN string) error { + cfg := postgres.DefaultConfig() + cfg.PrimaryDSN = baseDSN + cfg.OperationTimeout = OperationTimeout + db, err := postgres.OpenPrimary(ctx, cfg) + if err != nil { + return err + } + defer func() { _ = db.Close() }() + + statements := []string{ + `DO $$ BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'rtmanagerservice') THEN + CREATE ROLE rtmanagerservice LOGIN PASSWORD 'rtmanagerservice'; + END IF; + END $$;`, + `CREATE SCHEMA IF NOT EXISTS rtmanager AUTHORIZATION rtmanagerservice;`, + `GRANT USAGE ON SCHEMA rtmanager TO rtmanagerservice;`, + } + for _, statement := range statements { + if _, err := db.ExecContext(ctx, statement); err != nil { + return err + } + } + return nil +} + +func dsnForServiceRole(baseDSN string) (string, error) { + parsed, err := url.Parse(baseDSN) + if err != nil { + return "", err + } + values := url.Values{} + values.Set("search_path", serviceSchema) + values.Set("sslmode", "disable") + scoped := url.URL{ + Scheme: parsed.Scheme, + User: url.UserPassword(serviceRole, servicePassword), + Host: parsed.Host, + Path: parsed.Path, + RawQuery: values.Encode(), + } + return scoped.String(), nil +} diff --git a/rtmanager/internal/adapters/postgres/internal/sqlx/sqlx.go b/rtmanager/internal/adapters/postgres/internal/sqlx/sqlx.go new file mode 100644 index 0000000..88747be --- /dev/null +++ b/rtmanager/internal/adapters/postgres/internal/sqlx/sqlx.go @@ -0,0 +1,112 @@ +// Package sqlx contains the small set of helpers shared by every Runtime +// Manager PostgreSQL adapter (runtimerecordstore, operationlogstore, +// healthsnapshotstore). The helpers centralise the boundary translations +// for nullable timestamps and the pgx SQLSTATE codes the adapters +// interpret as domain conflicts. +package sqlx + +import ( + "context" + "database/sql" + "errors" + "fmt" + "time" + + "github.com/jackc/pgx/v5/pgconn" +) + +// PgUniqueViolationCode identifies the SQLSTATE returned by PostgreSQL +// when a UNIQUE constraint is violated by INSERT or UPDATE. +const PgUniqueViolationCode = "23505" + +// IsUniqueViolation reports whether err is a PostgreSQL unique-violation, +// regardless of constraint name. 
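+//
+// A typical caller translates the violation into the domain conflict error
+// (illustrative sketch only; the concrete mapping lives in each adapter,
+// not in this package):
+//
+//	if sqlx.IsUniqueViolation(err) {
+//		return runtime.ErrConflict
+//	}
+//	return fmt.Errorf("insert: %w", err)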
+func IsUniqueViolation(err error) bool { + var pgErr *pgconn.PgError + if !errors.As(err, &pgErr) { + return false + } + return pgErr.Code == PgUniqueViolationCode +} + +// IsNoRows reports whether err is sql.ErrNoRows. +func IsNoRows(err error) bool { + return errors.Is(err, sql.ErrNoRows) +} + +// NullableTime returns t.UTC() when non-zero, otherwise nil so the column +// is bound as SQL NULL. +func NullableTime(t time.Time) any { + if t.IsZero() { + return nil + } + return t.UTC() +} + +// NullableTimePtr returns t.UTC() when t is non-nil and non-zero, otherwise +// nil. Companion of NullableTime for domain types that use *time.Time to +// express absent timestamps. +func NullableTimePtr(t *time.Time) any { + if t == nil { + return nil + } + return NullableTime(*t) +} + +// NullableString returns value when non-empty, otherwise nil so the column +// is bound as SQL NULL. Used for Runtime Manager columns that map empty +// domain strings to NULL (current_container_id, current_image_ref). +func NullableString(value string) any { + if value == "" { + return nil + } + return value +} + +// StringFromNullable copies an optional sql.NullString into a domain +// string. NULL becomes the empty string, matching the Runtime Manager +// domain convention that empty == NULL for nullable text columns. +func StringFromNullable(value sql.NullString) string { + if !value.Valid { + return "" + } + return value.String +} + +// TimeFromNullable copies an optional sql.NullTime into a domain +// time.Time, applying the global UTC normalisation rule. NULL values +// become the zero time.Time. +func TimeFromNullable(value sql.NullTime) time.Time { + if !value.Valid { + return time.Time{} + } + return value.Time.UTC() +} + +// TimePtrFromNullable copies an optional sql.NullTime into a domain +// *time.Time. NULL becomes nil; non-NULL values are wrapped after UTC +// normalisation. +func TimePtrFromNullable(value sql.NullTime) *time.Time { + if !value.Valid { + return nil + } + t := value.Time.UTC() + return &t +} + +// WithTimeout derives a child context bounded by timeout and prefixes +// context errors with operation. Callers must always invoke the returned +// cancel. +func WithTimeout(ctx context.Context, operation string, timeout time.Duration) (context.Context, context.CancelFunc, error) { + if ctx == nil { + return nil, nil, fmt.Errorf("%s: nil context", operation) + } + if err := ctx.Err(); err != nil { + return nil, nil, fmt.Errorf("%s: %w", operation, err) + } + if timeout <= 0 { + return nil, nil, fmt.Errorf("%s: operation timeout must be positive", operation) + } + bounded, cancel := context.WithTimeout(ctx, timeout) + return bounded, cancel, nil +} diff --git a/rtmanager/internal/adapters/postgres/jet/rtmanager/model/goose_db_version.go b/rtmanager/internal/adapters/postgres/jet/rtmanager/model/goose_db_version.go new file mode 100644 index 0000000..c7f68e8 --- /dev/null +++ b/rtmanager/internal/adapters/postgres/jet/rtmanager/model/goose_db_version.go @@ -0,0 +1,19 @@ +// +// Code generated by go-jet DO NOT EDIT. 
+// +// WARNING: Changes to this file may cause incorrect behavior +// and will be lost if the code is regenerated +// + +package model + +import ( + "time" +) + +type GooseDbVersion struct { + ID int32 `sql:"primary_key"` + VersionID int64 + IsApplied bool + Tstamp time.Time +} diff --git a/rtmanager/internal/adapters/postgres/jet/rtmanager/model/health_snapshots.go b/rtmanager/internal/adapters/postgres/jet/rtmanager/model/health_snapshots.go new file mode 100644 index 0000000..fcedd9c --- /dev/null +++ b/rtmanager/internal/adapters/postgres/jet/rtmanager/model/health_snapshots.go @@ -0,0 +1,21 @@ +// +// Code generated by go-jet DO NOT EDIT. +// +// WARNING: Changes to this file may cause incorrect behavior +// and will be lost if the code is regenerated +// + +package model + +import ( + "time" +) + +type HealthSnapshots struct { + GameID string `sql:"primary_key"` + ContainerID string + Status string + Source string + Details string + ObservedAt time.Time +} diff --git a/rtmanager/internal/adapters/postgres/jet/rtmanager/model/operation_log.go b/rtmanager/internal/adapters/postgres/jet/rtmanager/model/operation_log.go new file mode 100644 index 0000000..30b53f8 --- /dev/null +++ b/rtmanager/internal/adapters/postgres/jet/rtmanager/model/operation_log.go @@ -0,0 +1,27 @@ +// +// Code generated by go-jet DO NOT EDIT. +// +// WARNING: Changes to this file may cause incorrect behavior +// and will be lost if the code is regenerated +// + +package model + +import ( + "time" +) + +type OperationLog struct { + ID int64 `sql:"primary_key"` + GameID string + OpKind string + OpSource string + SourceRef string + ImageRef string + ContainerID string + Outcome string + ErrorCode string + ErrorMessage string + StartedAt time.Time + FinishedAt *time.Time +} diff --git a/rtmanager/internal/adapters/postgres/jet/rtmanager/model/runtime_records.go b/rtmanager/internal/adapters/postgres/jet/rtmanager/model/runtime_records.go new file mode 100644 index 0000000..dddfd06 --- /dev/null +++ b/rtmanager/internal/adapters/postgres/jet/rtmanager/model/runtime_records.go @@ -0,0 +1,27 @@ +// +// Code generated by go-jet DO NOT EDIT. +// +// WARNING: Changes to this file may cause incorrect behavior +// and will be lost if the code is regenerated +// + +package model + +import ( + "time" +) + +type RuntimeRecords struct { + GameID string `sql:"primary_key"` + Status string + CurrentContainerID *string + CurrentImageRef *string + EngineEndpoint string + StatePath string + DockerNetwork string + StartedAt *time.Time + StoppedAt *time.Time + RemovedAt *time.Time + LastOpAt time.Time + CreatedAt time.Time +} diff --git a/rtmanager/internal/adapters/postgres/jet/rtmanager/table/goose_db_version.go b/rtmanager/internal/adapters/postgres/jet/rtmanager/table/goose_db_version.go new file mode 100644 index 0000000..c3eb7d3 --- /dev/null +++ b/rtmanager/internal/adapters/postgres/jet/rtmanager/table/goose_db_version.go @@ -0,0 +1,87 @@ +// +// Code generated by go-jet DO NOT EDIT. 
+// +// WARNING: Changes to this file may cause incorrect behavior +// and will be lost if the code is regenerated +// + +package table + +import ( + "github.com/go-jet/jet/v2/postgres" +) + +var GooseDbVersion = newGooseDbVersionTable("rtmanager", "goose_db_version", "") + +type gooseDbVersionTable struct { + postgres.Table + + // Columns + ID postgres.ColumnInteger + VersionID postgres.ColumnInteger + IsApplied postgres.ColumnBool + Tstamp postgres.ColumnTimestamp + + AllColumns postgres.ColumnList + MutableColumns postgres.ColumnList + DefaultColumns postgres.ColumnList +} + +type GooseDbVersionTable struct { + gooseDbVersionTable + + EXCLUDED gooseDbVersionTable +} + +// AS creates new GooseDbVersionTable with assigned alias +func (a GooseDbVersionTable) AS(alias string) *GooseDbVersionTable { + return newGooseDbVersionTable(a.SchemaName(), a.TableName(), alias) +} + +// Schema creates new GooseDbVersionTable with assigned schema name +func (a GooseDbVersionTable) FromSchema(schemaName string) *GooseDbVersionTable { + return newGooseDbVersionTable(schemaName, a.TableName(), a.Alias()) +} + +// WithPrefix creates new GooseDbVersionTable with assigned table prefix +func (a GooseDbVersionTable) WithPrefix(prefix string) *GooseDbVersionTable { + return newGooseDbVersionTable(a.SchemaName(), prefix+a.TableName(), a.TableName()) +} + +// WithSuffix creates new GooseDbVersionTable with assigned table suffix +func (a GooseDbVersionTable) WithSuffix(suffix string) *GooseDbVersionTable { + return newGooseDbVersionTable(a.SchemaName(), a.TableName()+suffix, a.TableName()) +} + +func newGooseDbVersionTable(schemaName, tableName, alias string) *GooseDbVersionTable { + return &GooseDbVersionTable{ + gooseDbVersionTable: newGooseDbVersionTableImpl(schemaName, tableName, alias), + EXCLUDED: newGooseDbVersionTableImpl("", "excluded", ""), + } +} + +func newGooseDbVersionTableImpl(schemaName, tableName, alias string) gooseDbVersionTable { + var ( + IDColumn = postgres.IntegerColumn("id") + VersionIDColumn = postgres.IntegerColumn("version_id") + IsAppliedColumn = postgres.BoolColumn("is_applied") + TstampColumn = postgres.TimestampColumn("tstamp") + allColumns = postgres.ColumnList{IDColumn, VersionIDColumn, IsAppliedColumn, TstampColumn} + mutableColumns = postgres.ColumnList{VersionIDColumn, IsAppliedColumn, TstampColumn} + defaultColumns = postgres.ColumnList{TstampColumn} + ) + + return gooseDbVersionTable{ + Table: postgres.NewTable(schemaName, tableName, alias, allColumns...), + + //Columns + ID: IDColumn, + VersionID: VersionIDColumn, + IsApplied: IsAppliedColumn, + Tstamp: TstampColumn, + + AllColumns: allColumns, + MutableColumns: mutableColumns, + DefaultColumns: defaultColumns, + } +} diff --git a/rtmanager/internal/adapters/postgres/jet/rtmanager/table/health_snapshots.go b/rtmanager/internal/adapters/postgres/jet/rtmanager/table/health_snapshots.go new file mode 100644 index 0000000..adde570 --- /dev/null +++ b/rtmanager/internal/adapters/postgres/jet/rtmanager/table/health_snapshots.go @@ -0,0 +1,93 @@ +// +// Code generated by go-jet DO NOT EDIT. 
+// +// WARNING: Changes to this file may cause incorrect behavior +// and will be lost if the code is regenerated +// + +package table + +import ( + "github.com/go-jet/jet/v2/postgres" +) + +var HealthSnapshots = newHealthSnapshotsTable("rtmanager", "health_snapshots", "") + +type healthSnapshotsTable struct { + postgres.Table + + // Columns + GameID postgres.ColumnString + ContainerID postgres.ColumnString + Status postgres.ColumnString + Source postgres.ColumnString + Details postgres.ColumnString + ObservedAt postgres.ColumnTimestampz + + AllColumns postgres.ColumnList + MutableColumns postgres.ColumnList + DefaultColumns postgres.ColumnList +} + +type HealthSnapshotsTable struct { + healthSnapshotsTable + + EXCLUDED healthSnapshotsTable +} + +// AS creates new HealthSnapshotsTable with assigned alias +func (a HealthSnapshotsTable) AS(alias string) *HealthSnapshotsTable { + return newHealthSnapshotsTable(a.SchemaName(), a.TableName(), alias) +} + +// Schema creates new HealthSnapshotsTable with assigned schema name +func (a HealthSnapshotsTable) FromSchema(schemaName string) *HealthSnapshotsTable { + return newHealthSnapshotsTable(schemaName, a.TableName(), a.Alias()) +} + +// WithPrefix creates new HealthSnapshotsTable with assigned table prefix +func (a HealthSnapshotsTable) WithPrefix(prefix string) *HealthSnapshotsTable { + return newHealthSnapshotsTable(a.SchemaName(), prefix+a.TableName(), a.TableName()) +} + +// WithSuffix creates new HealthSnapshotsTable with assigned table suffix +func (a HealthSnapshotsTable) WithSuffix(suffix string) *HealthSnapshotsTable { + return newHealthSnapshotsTable(a.SchemaName(), a.TableName()+suffix, a.TableName()) +} + +func newHealthSnapshotsTable(schemaName, tableName, alias string) *HealthSnapshotsTable { + return &HealthSnapshotsTable{ + healthSnapshotsTable: newHealthSnapshotsTableImpl(schemaName, tableName, alias), + EXCLUDED: newHealthSnapshotsTableImpl("", "excluded", ""), + } +} + +func newHealthSnapshotsTableImpl(schemaName, tableName, alias string) healthSnapshotsTable { + var ( + GameIDColumn = postgres.StringColumn("game_id") + ContainerIDColumn = postgres.StringColumn("container_id") + StatusColumn = postgres.StringColumn("status") + SourceColumn = postgres.StringColumn("source") + DetailsColumn = postgres.StringColumn("details") + ObservedAtColumn = postgres.TimestampzColumn("observed_at") + allColumns = postgres.ColumnList{GameIDColumn, ContainerIDColumn, StatusColumn, SourceColumn, DetailsColumn, ObservedAtColumn} + mutableColumns = postgres.ColumnList{ContainerIDColumn, StatusColumn, SourceColumn, DetailsColumn, ObservedAtColumn} + defaultColumns = postgres.ColumnList{ContainerIDColumn, DetailsColumn} + ) + + return healthSnapshotsTable{ + Table: postgres.NewTable(schemaName, tableName, alias, allColumns...), + + //Columns + GameID: GameIDColumn, + ContainerID: ContainerIDColumn, + Status: StatusColumn, + Source: SourceColumn, + Details: DetailsColumn, + ObservedAt: ObservedAtColumn, + + AllColumns: allColumns, + MutableColumns: mutableColumns, + DefaultColumns: defaultColumns, + } +} diff --git a/rtmanager/internal/adapters/postgres/jet/rtmanager/table/operation_log.go b/rtmanager/internal/adapters/postgres/jet/rtmanager/table/operation_log.go new file mode 100644 index 0000000..0c5e929 --- /dev/null +++ b/rtmanager/internal/adapters/postgres/jet/rtmanager/table/operation_log.go @@ -0,0 +1,111 @@ +// +// Code generated by go-jet DO NOT EDIT. 
+// +// WARNING: Changes to this file may cause incorrect behavior +// and will be lost if the code is regenerated +// + +package table + +import ( + "github.com/go-jet/jet/v2/postgres" +) + +var OperationLog = newOperationLogTable("rtmanager", "operation_log", "") + +type operationLogTable struct { + postgres.Table + + // Columns + ID postgres.ColumnInteger + GameID postgres.ColumnString + OpKind postgres.ColumnString + OpSource postgres.ColumnString + SourceRef postgres.ColumnString + ImageRef postgres.ColumnString + ContainerID postgres.ColumnString + Outcome postgres.ColumnString + ErrorCode postgres.ColumnString + ErrorMessage postgres.ColumnString + StartedAt postgres.ColumnTimestampz + FinishedAt postgres.ColumnTimestampz + + AllColumns postgres.ColumnList + MutableColumns postgres.ColumnList + DefaultColumns postgres.ColumnList +} + +type OperationLogTable struct { + operationLogTable + + EXCLUDED operationLogTable +} + +// AS creates new OperationLogTable with assigned alias +func (a OperationLogTable) AS(alias string) *OperationLogTable { + return newOperationLogTable(a.SchemaName(), a.TableName(), alias) +} + +// Schema creates new OperationLogTable with assigned schema name +func (a OperationLogTable) FromSchema(schemaName string) *OperationLogTable { + return newOperationLogTable(schemaName, a.TableName(), a.Alias()) +} + +// WithPrefix creates new OperationLogTable with assigned table prefix +func (a OperationLogTable) WithPrefix(prefix string) *OperationLogTable { + return newOperationLogTable(a.SchemaName(), prefix+a.TableName(), a.TableName()) +} + +// WithSuffix creates new OperationLogTable with assigned table suffix +func (a OperationLogTable) WithSuffix(suffix string) *OperationLogTable { + return newOperationLogTable(a.SchemaName(), a.TableName()+suffix, a.TableName()) +} + +func newOperationLogTable(schemaName, tableName, alias string) *OperationLogTable { + return &OperationLogTable{ + operationLogTable: newOperationLogTableImpl(schemaName, tableName, alias), + EXCLUDED: newOperationLogTableImpl("", "excluded", ""), + } +} + +func newOperationLogTableImpl(schemaName, tableName, alias string) operationLogTable { + var ( + IDColumn = postgres.IntegerColumn("id") + GameIDColumn = postgres.StringColumn("game_id") + OpKindColumn = postgres.StringColumn("op_kind") + OpSourceColumn = postgres.StringColumn("op_source") + SourceRefColumn = postgres.StringColumn("source_ref") + ImageRefColumn = postgres.StringColumn("image_ref") + ContainerIDColumn = postgres.StringColumn("container_id") + OutcomeColumn = postgres.StringColumn("outcome") + ErrorCodeColumn = postgres.StringColumn("error_code") + ErrorMessageColumn = postgres.StringColumn("error_message") + StartedAtColumn = postgres.TimestampzColumn("started_at") + FinishedAtColumn = postgres.TimestampzColumn("finished_at") + allColumns = postgres.ColumnList{IDColumn, GameIDColumn, OpKindColumn, OpSourceColumn, SourceRefColumn, ImageRefColumn, ContainerIDColumn, OutcomeColumn, ErrorCodeColumn, ErrorMessageColumn, StartedAtColumn, FinishedAtColumn} + mutableColumns = postgres.ColumnList{GameIDColumn, OpKindColumn, OpSourceColumn, SourceRefColumn, ImageRefColumn, ContainerIDColumn, OutcomeColumn, ErrorCodeColumn, ErrorMessageColumn, StartedAtColumn, FinishedAtColumn} + defaultColumns = postgres.ColumnList{IDColumn, SourceRefColumn, ImageRefColumn, ContainerIDColumn, ErrorCodeColumn, ErrorMessageColumn} + ) + + return operationLogTable{ + Table: postgres.NewTable(schemaName, tableName, alias, allColumns...), + + //Columns + ID: 
IDColumn, + GameID: GameIDColumn, + OpKind: OpKindColumn, + OpSource: OpSourceColumn, + SourceRef: SourceRefColumn, + ImageRef: ImageRefColumn, + ContainerID: ContainerIDColumn, + Outcome: OutcomeColumn, + ErrorCode: ErrorCodeColumn, + ErrorMessage: ErrorMessageColumn, + StartedAt: StartedAtColumn, + FinishedAt: FinishedAtColumn, + + AllColumns: allColumns, + MutableColumns: mutableColumns, + DefaultColumns: defaultColumns, + } +} diff --git a/rtmanager/internal/adapters/postgres/jet/rtmanager/table/runtime_records.go b/rtmanager/internal/adapters/postgres/jet/rtmanager/table/runtime_records.go new file mode 100644 index 0000000..ec4527f --- /dev/null +++ b/rtmanager/internal/adapters/postgres/jet/rtmanager/table/runtime_records.go @@ -0,0 +1,111 @@ +// +// Code generated by go-jet DO NOT EDIT. +// +// WARNING: Changes to this file may cause incorrect behavior +// and will be lost if the code is regenerated +// + +package table + +import ( + "github.com/go-jet/jet/v2/postgres" +) + +var RuntimeRecords = newRuntimeRecordsTable("rtmanager", "runtime_records", "") + +type runtimeRecordsTable struct { + postgres.Table + + // Columns + GameID postgres.ColumnString + Status postgres.ColumnString + CurrentContainerID postgres.ColumnString + CurrentImageRef postgres.ColumnString + EngineEndpoint postgres.ColumnString + StatePath postgres.ColumnString + DockerNetwork postgres.ColumnString + StartedAt postgres.ColumnTimestampz + StoppedAt postgres.ColumnTimestampz + RemovedAt postgres.ColumnTimestampz + LastOpAt postgres.ColumnTimestampz + CreatedAt postgres.ColumnTimestampz + + AllColumns postgres.ColumnList + MutableColumns postgres.ColumnList + DefaultColumns postgres.ColumnList +} + +type RuntimeRecordsTable struct { + runtimeRecordsTable + + EXCLUDED runtimeRecordsTable +} + +// AS creates new RuntimeRecordsTable with assigned alias +func (a RuntimeRecordsTable) AS(alias string) *RuntimeRecordsTable { + return newRuntimeRecordsTable(a.SchemaName(), a.TableName(), alias) +} + +// Schema creates new RuntimeRecordsTable with assigned schema name +func (a RuntimeRecordsTable) FromSchema(schemaName string) *RuntimeRecordsTable { + return newRuntimeRecordsTable(schemaName, a.TableName(), a.Alias()) +} + +// WithPrefix creates new RuntimeRecordsTable with assigned table prefix +func (a RuntimeRecordsTable) WithPrefix(prefix string) *RuntimeRecordsTable { + return newRuntimeRecordsTable(a.SchemaName(), prefix+a.TableName(), a.TableName()) +} + +// WithSuffix creates new RuntimeRecordsTable with assigned table suffix +func (a RuntimeRecordsTable) WithSuffix(suffix string) *RuntimeRecordsTable { + return newRuntimeRecordsTable(a.SchemaName(), a.TableName()+suffix, a.TableName()) +} + +func newRuntimeRecordsTable(schemaName, tableName, alias string) *RuntimeRecordsTable { + return &RuntimeRecordsTable{ + runtimeRecordsTable: newRuntimeRecordsTableImpl(schemaName, tableName, alias), + EXCLUDED: newRuntimeRecordsTableImpl("", "excluded", ""), + } +} + +func newRuntimeRecordsTableImpl(schemaName, tableName, alias string) runtimeRecordsTable { + var ( + GameIDColumn = postgres.StringColumn("game_id") + StatusColumn = postgres.StringColumn("status") + CurrentContainerIDColumn = postgres.StringColumn("current_container_id") + CurrentImageRefColumn = postgres.StringColumn("current_image_ref") + EngineEndpointColumn = postgres.StringColumn("engine_endpoint") + StatePathColumn = postgres.StringColumn("state_path") + DockerNetworkColumn = postgres.StringColumn("docker_network") + StartedAtColumn = 
postgres.TimestampzColumn("started_at") + StoppedAtColumn = postgres.TimestampzColumn("stopped_at") + RemovedAtColumn = postgres.TimestampzColumn("removed_at") + LastOpAtColumn = postgres.TimestampzColumn("last_op_at") + CreatedAtColumn = postgres.TimestampzColumn("created_at") + allColumns = postgres.ColumnList{GameIDColumn, StatusColumn, CurrentContainerIDColumn, CurrentImageRefColumn, EngineEndpointColumn, StatePathColumn, DockerNetworkColumn, StartedAtColumn, StoppedAtColumn, RemovedAtColumn, LastOpAtColumn, CreatedAtColumn} + mutableColumns = postgres.ColumnList{StatusColumn, CurrentContainerIDColumn, CurrentImageRefColumn, EngineEndpointColumn, StatePathColumn, DockerNetworkColumn, StartedAtColumn, StoppedAtColumn, RemovedAtColumn, LastOpAtColumn, CreatedAtColumn} + defaultColumns = postgres.ColumnList{} + ) + + return runtimeRecordsTable{ + Table: postgres.NewTable(schemaName, tableName, alias, allColumns...), + + //Columns + GameID: GameIDColumn, + Status: StatusColumn, + CurrentContainerID: CurrentContainerIDColumn, + CurrentImageRef: CurrentImageRefColumn, + EngineEndpoint: EngineEndpointColumn, + StatePath: StatePathColumn, + DockerNetwork: DockerNetworkColumn, + StartedAt: StartedAtColumn, + StoppedAt: StoppedAtColumn, + RemovedAt: RemovedAtColumn, + LastOpAt: LastOpAtColumn, + CreatedAt: CreatedAtColumn, + + AllColumns: allColumns, + MutableColumns: mutableColumns, + DefaultColumns: defaultColumns, + } +} diff --git a/rtmanager/internal/adapters/postgres/jet/rtmanager/table/table_use_schema.go b/rtmanager/internal/adapters/postgres/jet/rtmanager/table/table_use_schema.go new file mode 100644 index 0000000..69980da --- /dev/null +++ b/rtmanager/internal/adapters/postgres/jet/rtmanager/table/table_use_schema.go @@ -0,0 +1,17 @@ +// +// Code generated by go-jet DO NOT EDIT. +// +// WARNING: Changes to this file may cause incorrect behavior +// and will be lost if the code is regenerated +// + +package table + +// UseSchema sets a new schema name for all generated table SQL builder types. It is recommended to invoke +// this method only once at the beginning of the program. +func UseSchema(schema string) { + GooseDbVersion = GooseDbVersion.FromSchema(schema) + HealthSnapshots = HealthSnapshots.FromSchema(schema) + OperationLog = OperationLog.FromSchema(schema) + RuntimeRecords = RuntimeRecords.FromSchema(schema) +} diff --git a/rtmanager/internal/adapters/postgres/migrations/00001_init.sql b/rtmanager/internal/adapters/postgres/migrations/00001_init.sql new file mode 100644 index 0000000..e7eb011 --- /dev/null +++ b/rtmanager/internal/adapters/postgres/migrations/00001_init.sql @@ -0,0 +1,106 @@ +-- +goose Up +-- Initial Runtime Manager PostgreSQL schema. +-- +-- Three tables cover the durable surface of the service: +-- * runtime_records — one row per game with the latest known runtime +-- status and Docker container binding; +-- * operation_log — append-only audit of every start/stop/restart/ +-- patch/cleanup/reconcile_* operation RTM performed; +-- * health_snapshots — latest technical health observation per game. +-- +-- Schema and the matching `rtmanagerservice` role are provisioned +-- outside this script (in tests via cmd/jetgen/main.go::provisionRoleAndSchema; +-- in production via an ops init script). This migration runs as the +-- schema owner with `search_path=rtmanager` and only contains DDL for the +-- service-owned tables and indexes. 
ARCHITECTURE.md §Database topology +-- mandates that the per-service role's grants stay restricted to its own +-- schema; consequently this file deliberately deviates from PLAN.md +-- Stage 09's literal `CREATE SCHEMA IF NOT EXISTS rtmanager;` instruction. + +-- runtime_records holds one durable record per game with the latest +-- known runtime status and Docker container binding. The status enum +-- (running | stopped | removed) is enforced by a CHECK so domain code +-- can rely on it without reading every callsite. The (status, last_op_at) +-- index drives the periodic container-cleanup worker that scans +-- `status='stopped' AND last_op_at < now() - retention`. +CREATE TABLE runtime_records ( + game_id text PRIMARY KEY, + status text NOT NULL, + current_container_id text, + current_image_ref text, + engine_endpoint text NOT NULL, + state_path text NOT NULL, + docker_network text NOT NULL, + started_at timestamptz, + stopped_at timestamptz, + removed_at timestamptz, + last_op_at timestamptz NOT NULL, + created_at timestamptz NOT NULL, + CONSTRAINT runtime_records_status_chk + CHECK (status IN ('running', 'stopped', 'removed')) +); + +CREATE INDEX runtime_records_status_last_op_idx + ON runtime_records (status, last_op_at); + +-- operation_log is an append-only audit of every operation Runtime +-- Manager performed against a game's runtime. The (game_id, started_at +-- DESC) index drives audit reads from the GM/Admin REST surface; +-- finished_at is nullable for in-flight rows even though Stage 13+ +-- always finalises the row in the same transaction. The op_kind / +-- op_source / outcome enums are enforced by CHECK constraints to keep +-- the audit schema honest without a separate Go validator. +CREATE TABLE operation_log ( + id bigserial PRIMARY KEY, + game_id text NOT NULL, + op_kind text NOT NULL, + op_source text NOT NULL, + source_ref text NOT NULL DEFAULT '', + image_ref text NOT NULL DEFAULT '', + container_id text NOT NULL DEFAULT '', + outcome text NOT NULL, + error_code text NOT NULL DEFAULT '', + error_message text NOT NULL DEFAULT '', + started_at timestamptz NOT NULL, + finished_at timestamptz, + CONSTRAINT operation_log_op_kind_chk + CHECK (op_kind IN ( + 'start', 'stop', 'restart', 'patch', + 'cleanup_container', 'reconcile_adopt', 'reconcile_dispose' + )), + CONSTRAINT operation_log_op_source_chk + CHECK (op_source IN ( + 'lobby_stream', 'gm_rest', 'admin_rest', + 'auto_ttl', 'auto_reconcile' + )), + CONSTRAINT operation_log_outcome_chk + CHECK (outcome IN ('success', 'failure')) +); + +CREATE INDEX operation_log_game_started_idx + ON operation_log (game_id, started_at DESC); + +-- health_snapshots stores the latest technical health observation per +-- game. One row per game; later observations overwrite. The status enum +-- mirrors the `event_type` vocabulary on `runtime:health_events` +-- (collapsed to a flat status column for the latest-observation view). 
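+-- As a sketch of the intended overwrite semantics (the adapter builds its
+-- actual statement through go-jet, so the real SQL text may differ, and an
+-- empty details payload falls back to the column default):
+--   INSERT INTO health_snapshots (game_id, container_id, status, source, details, observed_at)
+--   VALUES ($1, $2, $3, $4, $5, $6)
+--   ON CONFLICT (game_id) DO UPDATE SET
+--     container_id = EXCLUDED.container_id,
+--     status       = EXCLUDED.status,
+--     source       = EXCLUDED.source,
+--     details      = EXCLUDED.details,
+--     observed_at  = EXCLUDED.observed_at;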
+CREATE TABLE health_snapshots ( + game_id text PRIMARY KEY, + container_id text NOT NULL DEFAULT '', + status text NOT NULL, + source text NOT NULL, + details jsonb NOT NULL DEFAULT '{}'::jsonb, + observed_at timestamptz NOT NULL, + CONSTRAINT health_snapshots_status_chk + CHECK (status IN ( + 'healthy', 'probe_failed', 'exited', + 'oom', 'inspect_unhealthy', 'container_disappeared' + )), + CONSTRAINT health_snapshots_source_chk + CHECK (source IN ('docker_event', 'inspect', 'probe')) +); + +-- +goose Down +DROP TABLE IF EXISTS health_snapshots; +DROP TABLE IF EXISTS operation_log; +DROP TABLE IF EXISTS runtime_records; diff --git a/rtmanager/internal/adapters/postgres/migrations/migrations.go b/rtmanager/internal/adapters/postgres/migrations/migrations.go new file mode 100644 index 0000000..1ab42ab --- /dev/null +++ b/rtmanager/internal/adapters/postgres/migrations/migrations.go @@ -0,0 +1,19 @@ +// Package migrations exposes the embedded goose migration files used by +// Runtime Manager to provision its `rtmanager` schema in PostgreSQL. +// +// The embedded filesystem is consumed by `pkg/postgres.RunMigrations` +// during rtmanager-service startup and by `cmd/jetgen` when regenerating +// the `internal/adapters/postgres/jet/` code against a transient +// PostgreSQL instance. +package migrations + +import "embed" + +//go:embed *.sql +var fs embed.FS + +// FS returns the embedded filesystem containing every numbered goose +// migration shipped with Runtime Manager. +func FS() embed.FS { + return fs +} diff --git a/rtmanager/internal/adapters/postgres/operationlogstore/store.go b/rtmanager/internal/adapters/postgres/operationlogstore/store.go new file mode 100644 index 0000000..ca097a8 --- /dev/null +++ b/rtmanager/internal/adapters/postgres/operationlogstore/store.go @@ -0,0 +1,235 @@ +// Package operationlogstore implements the PostgreSQL-backed adapter for +// `ports.OperationLogStore`. +// +// The package owns the on-disk shape of the `operation_log` table defined +// in +// `galaxy/rtmanager/internal/adapters/postgres/migrations/00001_init.sql` +// and translates the schema-agnostic `ports.OperationLogStore` interface +// declared in `internal/ports/operationlogstore.go` into concrete +// go-jet/v2 statements driven by the pgx driver. +// +// Append uses `INSERT ... RETURNING id` to surface the bigserial id back +// to callers; ListByGame is index-driven by `operation_log_game_started_idx`. +package operationlogstore + +import ( + "context" + "database/sql" + "errors" + "fmt" + "strings" + "time" + + "galaxy/rtmanager/internal/adapters/postgres/internal/sqlx" + pgtable "galaxy/rtmanager/internal/adapters/postgres/jet/rtmanager/table" + "galaxy/rtmanager/internal/domain/operation" + "galaxy/rtmanager/internal/ports" + + pg "github.com/go-jet/jet/v2/postgres" +) + +// Config configures one PostgreSQL-backed operation-log store instance. +type Config struct { + // DB stores the connection pool the store uses for every query. + DB *sql.DB + + // OperationTimeout bounds one round trip. + OperationTimeout time.Duration +} + +// Store persists Runtime Manager operation-log entries in PostgreSQL. +type Store struct { + db *sql.DB + operationTimeout time.Duration +} + +// New constructs one PostgreSQL-backed operation-log store from cfg. 
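+//
+// Typical wiring at service start-up (sketch; the pool and timeout names are
+// illustrative, not part of this package):
+//
+//	store, err := operationlogstore.New(operationlogstore.Config{
+//		DB:               pool,
+//		OperationTimeout: 10 * time.Second,
+//	})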
+func New(cfg Config) (*Store, error) { + if cfg.DB == nil { + return nil, errors.New("new postgres operation log store: db must not be nil") + } + if cfg.OperationTimeout <= 0 { + return nil, errors.New("new postgres operation log store: operation timeout must be positive") + } + return &Store{ + db: cfg.DB, + operationTimeout: cfg.OperationTimeout, + }, nil +} + +// operationLogSelectColumns is the canonical SELECT list for the +// operation_log table, matching scanEntry's column order. +var operationLogSelectColumns = pg.ColumnList{ + pgtable.OperationLog.ID, + pgtable.OperationLog.GameID, + pgtable.OperationLog.OpKind, + pgtable.OperationLog.OpSource, + pgtable.OperationLog.SourceRef, + pgtable.OperationLog.ImageRef, + pgtable.OperationLog.ContainerID, + pgtable.OperationLog.Outcome, + pgtable.OperationLog.ErrorCode, + pgtable.OperationLog.ErrorMessage, + pgtable.OperationLog.StartedAt, + pgtable.OperationLog.FinishedAt, +} + +// Append inserts entry into the operation log and returns the generated +// bigserial id. entry is validated through operation.OperationEntry.Validate +// before the SQL is issued. +func (store *Store) Append(ctx context.Context, entry operation.OperationEntry) (int64, error) { + if store == nil || store.db == nil { + return 0, errors.New("append operation log entry: nil store") + } + if err := entry.Validate(); err != nil { + return 0, fmt.Errorf("append operation log entry: %w", err) + } + + operationCtx, cancel, err := sqlx.WithTimeout(ctx, "append operation log entry", store.operationTimeout) + if err != nil { + return 0, err + } + defer cancel() + + stmt := pgtable.OperationLog.INSERT( + pgtable.OperationLog.GameID, + pgtable.OperationLog.OpKind, + pgtable.OperationLog.OpSource, + pgtable.OperationLog.SourceRef, + pgtable.OperationLog.ImageRef, + pgtable.OperationLog.ContainerID, + pgtable.OperationLog.Outcome, + pgtable.OperationLog.ErrorCode, + pgtable.OperationLog.ErrorMessage, + pgtable.OperationLog.StartedAt, + pgtable.OperationLog.FinishedAt, + ).VALUES( + entry.GameID, + string(entry.OpKind), + string(entry.OpSource), + entry.SourceRef, + entry.ImageRef, + entry.ContainerID, + string(entry.Outcome), + entry.ErrorCode, + entry.ErrorMessage, + entry.StartedAt.UTC(), + sqlx.NullableTimePtr(entry.FinishedAt), + ).RETURNING(pgtable.OperationLog.ID) + + query, args := stmt.Sql() + row := store.db.QueryRowContext(operationCtx, query, args...) + var id int64 + if err := row.Scan(&id); err != nil { + return 0, fmt.Errorf("append operation log entry: %w", err) + } + return id, nil +} + +// ListByGame returns the most recent entries for gameID, ordered by +// started_at descending and capped by limit. The (game_id, +// started_at DESC) index drives the read. +func (store *Store) ListByGame(ctx context.Context, gameID string, limit int) ([]operation.OperationEntry, error) { + if store == nil || store.db == nil { + return nil, errors.New("list operation log entries by game: nil store") + } + if strings.TrimSpace(gameID) == "" { + return nil, fmt.Errorf("list operation log entries by game: game id must not be empty") + } + if limit <= 0 { + return nil, fmt.Errorf("list operation log entries by game: limit must be positive, got %d", limit) + } + + operationCtx, cancel, err := sqlx.WithTimeout(ctx, "list operation log entries by game", store.operationTimeout) + if err != nil { + return nil, err + } + defer cancel() + + stmt := pg.SELECT(operationLogSelectColumns). + FROM(pgtable.OperationLog). + WHERE(pgtable.OperationLog.GameID.EQ(pg.String(gameID))). 
+ ORDER_BY(pgtable.OperationLog.StartedAt.DESC(), pgtable.OperationLog.ID.DESC()). + LIMIT(int64(limit)) + + query, args := stmt.Sql() + rows, err := store.db.QueryContext(operationCtx, query, args...) + if err != nil { + return nil, fmt.Errorf("list operation log entries by game: %w", err) + } + defer rows.Close() + + entries := make([]operation.OperationEntry, 0) + for rows.Next() { + entry, err := scanEntry(rows) + if err != nil { + return nil, fmt.Errorf("list operation log entries by game: scan: %w", err) + } + entries = append(entries, entry) + } + if err := rows.Err(); err != nil { + return nil, fmt.Errorf("list operation log entries by game: %w", err) + } + if len(entries) == 0 { + return nil, nil + } + return entries, nil +} + +// rowScanner abstracts *sql.Row and *sql.Rows so scanEntry can be shared +// across both single-row reads and iterated reads. +type rowScanner interface { + Scan(dest ...any) error +} + +// scanEntry scans one operation_log row from rs. +func scanEntry(rs rowScanner) (operation.OperationEntry, error) { + var ( + id int64 + gameID string + opKind string + opSource string + sourceRef string + imageRef string + containerID string + outcome string + errorCode string + errorMessage string + startedAt time.Time + finishedAt sql.NullTime + ) + if err := rs.Scan( + &id, + &gameID, + &opKind, + &opSource, + &sourceRef, + &imageRef, + &containerID, + &outcome, + &errorCode, + &errorMessage, + &startedAt, + &finishedAt, + ); err != nil { + return operation.OperationEntry{}, err + } + return operation.OperationEntry{ + ID: id, + GameID: gameID, + OpKind: operation.OpKind(opKind), + OpSource: operation.OpSource(opSource), + SourceRef: sourceRef, + ImageRef: imageRef, + ContainerID: containerID, + Outcome: operation.Outcome(outcome), + ErrorCode: errorCode, + ErrorMessage: errorMessage, + StartedAt: startedAt.UTC(), + FinishedAt: sqlx.TimePtrFromNullable(finishedAt), + }, nil +} + +// Ensure Store satisfies the ports.OperationLogStore interface at compile +// time. 
+var _ ports.OperationLogStore = (*Store)(nil) diff --git a/rtmanager/internal/adapters/postgres/operationlogstore/store_test.go b/rtmanager/internal/adapters/postgres/operationlogstore/store_test.go new file mode 100644 index 0000000..8a1e5de --- /dev/null +++ b/rtmanager/internal/adapters/postgres/operationlogstore/store_test.go @@ -0,0 +1,207 @@ +package operationlogstore_test + +import ( + "context" + "testing" + "time" + + "galaxy/rtmanager/internal/adapters/postgres/internal/pgtest" + "galaxy/rtmanager/internal/adapters/postgres/operationlogstore" + "galaxy/rtmanager/internal/domain/operation" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestMain(m *testing.M) { pgtest.RunMain(m) } + +func newStore(t *testing.T) *operationlogstore.Store { + t.Helper() + pgtest.TruncateAll(t) + store, err := operationlogstore.New(operationlogstore.Config{ + DB: pgtest.Ensure(t).Pool(), + OperationTimeout: pgtest.OperationTimeout, + }) + require.NoError(t, err) + return store +} + +func successStartEntry(gameID string, startedAt time.Time, sourceRef string) operation.OperationEntry { + finishedAt := startedAt.Add(time.Second) + return operation.OperationEntry{ + GameID: gameID, + OpKind: operation.OpKindStart, + OpSource: operation.OpSourceLobbyStream, + SourceRef: sourceRef, + ImageRef: "galaxy/game:v1.2.3", + ContainerID: "container-1", + Outcome: operation.OutcomeSuccess, + StartedAt: startedAt, + FinishedAt: &finishedAt, + } +} + +func TestAppendReturnsPositiveIDs(t *testing.T) { + ctx := context.Background() + store := newStore(t) + + startedAt := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC) + id1, err := store.Append(ctx, successStartEntry("game-001", startedAt, "1700000000000-0")) + require.NoError(t, err) + assert.Greater(t, id1, int64(0)) + + id2, err := store.Append(ctx, successStartEntry("game-001", startedAt.Add(time.Minute), "1700000000001-0")) + require.NoError(t, err) + assert.Greater(t, id2, id1) +} + +func TestAppendValidatesEntry(t *testing.T) { + ctx := context.Background() + store := newStore(t) + + tests := []struct { + name string + mutate func(*operation.OperationEntry) + }{ + {"empty game id", func(e *operation.OperationEntry) { e.GameID = "" }}, + {"unknown op kind", func(e *operation.OperationEntry) { e.OpKind = "exotic" }}, + {"unknown op source", func(e *operation.OperationEntry) { e.OpSource = "exotic" }}, + {"unknown outcome", func(e *operation.OperationEntry) { e.Outcome = "exotic" }}, + {"zero started at", func(e *operation.OperationEntry) { e.StartedAt = time.Time{} }}, + {"failure without error code", func(e *operation.OperationEntry) { + e.Outcome = operation.OutcomeFailure + e.ErrorCode = "" + }}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + entry := successStartEntry("game-001", + time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC), "ref") + tt.mutate(&entry) + _, err := store.Append(ctx, entry) + require.Error(t, err) + }) + } +} + +func TestListByGameReturnsEntriesNewestFirst(t *testing.T) { + ctx := context.Background() + store := newStore(t) + + base := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC) + for index := range 3 { + _, err := store.Append(ctx, successStartEntry("game-001", + base.Add(time.Duration(index)*time.Minute), + "ref-game-001-")) + require.NoError(t, err) + } + // Foreign-game entry must not appear in the list. 
+ _, err := store.Append(ctx, successStartEntry("game-other", base, "ref-other")) + require.NoError(t, err) + + entries, err := store.ListByGame(ctx, "game-001", 10) + require.NoError(t, err) + require.Len(t, entries, 3) + for index := range 2 { + assert.True(t, + !entries[index].StartedAt.Before(entries[index+1].StartedAt), + "entries must be ordered started_at DESC; got %s before %s", + entries[index].StartedAt, entries[index+1].StartedAt, + ) + } +} + +func TestListByGameRespectsLimit(t *testing.T) { + ctx := context.Background() + store := newStore(t) + + base := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC) + for index := range 5 { + _, err := store.Append(ctx, successStartEntry("game-001", + base.Add(time.Duration(index)*time.Minute), "ref")) + require.NoError(t, err) + } + + entries, err := store.ListByGame(ctx, "game-001", 2) + require.NoError(t, err) + require.Len(t, entries, 2) +} + +func TestListByGameReturnsEmptyForUnknownGame(t *testing.T) { + ctx := context.Background() + store := newStore(t) + + entries, err := store.ListByGame(ctx, "game-missing", 10) + require.NoError(t, err) + assert.Empty(t, entries) +} + +func TestListByGameRejectsInvalidArgs(t *testing.T) { + ctx := context.Background() + store := newStore(t) + + _, err := store.ListByGame(ctx, "", 10) + require.Error(t, err) + + _, err = store.ListByGame(ctx, "game-001", 0) + require.Error(t, err) + + _, err = store.ListByGame(ctx, "game-001", -3) + require.Error(t, err) +} + +func TestAppendRoundTripsAllFields(t *testing.T) { + ctx := context.Background() + store := newStore(t) + + startedAt := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC) + finishedAt := startedAt.Add(2 * time.Second) + original := operation.OperationEntry{ + GameID: "game-001", + OpKind: operation.OpKindStop, + OpSource: operation.OpSourceGMRest, + SourceRef: "request-7", + ImageRef: "galaxy/game:v2.0.0", + ContainerID: "container-X", + Outcome: operation.OutcomeFailure, + ErrorCode: "container_start_failed", + ErrorMessage: "stop deadline exceeded", + StartedAt: startedAt, + FinishedAt: &finishedAt, + } + id, err := store.Append(ctx, original) + require.NoError(t, err) + + entries, err := store.ListByGame(ctx, "game-001", 10) + require.NoError(t, err) + require.Len(t, entries, 1) + + got := entries[0] + assert.Equal(t, id, got.ID) + assert.Equal(t, original.GameID, got.GameID) + assert.Equal(t, original.OpKind, got.OpKind) + assert.Equal(t, original.OpSource, got.OpSource) + assert.Equal(t, original.SourceRef, got.SourceRef) + assert.Equal(t, original.ImageRef, got.ImageRef) + assert.Equal(t, original.ContainerID, got.ContainerID) + assert.Equal(t, original.Outcome, got.Outcome) + assert.Equal(t, original.ErrorCode, got.ErrorCode) + assert.Equal(t, original.ErrorMessage, got.ErrorMessage) + assert.True(t, original.StartedAt.Equal(got.StartedAt)) + require.NotNil(t, got.FinishedAt) + assert.True(t, original.FinishedAt.Equal(*got.FinishedAt)) + assert.Equal(t, time.UTC, got.StartedAt.Location()) + assert.Equal(t, time.UTC, got.FinishedAt.Location()) +} + +func TestNewRejectsNilDB(t *testing.T) { + _, err := operationlogstore.New(operationlogstore.Config{OperationTimeout: time.Second}) + require.Error(t, err) +} + +func TestNewRejectsNonPositiveTimeout(t *testing.T) { + _, err := operationlogstore.New(operationlogstore.Config{ + DB: pgtest.Ensure(t).Pool(), + }) + require.Error(t, err) +} diff --git a/rtmanager/internal/adapters/postgres/runtimerecordstore/store.go b/rtmanager/internal/adapters/postgres/runtimerecordstore/store.go new file mode 
100644 index 0000000..287c01c --- /dev/null +++ b/rtmanager/internal/adapters/postgres/runtimerecordstore/store.go @@ -0,0 +1,500 @@ +// Package runtimerecordstore implements the PostgreSQL-backed adapter for +// `ports.RuntimeRecordStore`. +// +// The package owns the on-disk shape of the `runtime_records` table +// defined in +// `galaxy/rtmanager/internal/adapters/postgres/migrations/00001_init.sql` +// and translates the schema-agnostic `ports.RuntimeRecordStore` interface +// declared in `internal/ports/runtimerecordstore.go` into concrete +// go-jet/v2 statements driven by the pgx driver. +// +// Lifecycle transitions (UpdateStatus) use compare-and-swap on +// `(status, current_container_id)` rather than holding a SELECT ... FOR +// UPDATE lock across the caller's logic, mirroring the pattern used by +// `lobby/internal/adapters/postgres/gamestore`. +package runtimerecordstore + +import ( + "context" + "database/sql" + "errors" + "fmt" + "strings" + "time" + + "galaxy/rtmanager/internal/adapters/postgres/internal/sqlx" + pgtable "galaxy/rtmanager/internal/adapters/postgres/jet/rtmanager/table" + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/ports" + + pg "github.com/go-jet/jet/v2/postgres" +) + +// Config configures one PostgreSQL-backed runtime-record store instance. +// The store does not own the underlying *sql.DB lifecycle: the caller +// (typically the service runtime) opens, instruments, migrates, and +// closes the pool. +type Config struct { + // DB stores the connection pool the store uses for every query. + DB *sql.DB + + // OperationTimeout bounds one round trip. The store creates a + // derived context for each operation so callers cannot starve the + // pool with an unbounded ctx. + OperationTimeout time.Duration +} + +// Store persists Runtime Manager runtime records in PostgreSQL. +type Store struct { + db *sql.DB + operationTimeout time.Duration +} + +// New constructs one PostgreSQL-backed runtime-record store from cfg. +func New(cfg Config) (*Store, error) { + if cfg.DB == nil { + return nil, errors.New("new postgres runtime record store: db must not be nil") + } + if cfg.OperationTimeout <= 0 { + return nil, errors.New("new postgres runtime record store: operation timeout must be positive") + } + return &Store{ + db: cfg.DB, + operationTimeout: cfg.OperationTimeout, + }, nil +} + +// runtimeSelectColumns is the canonical SELECT list for the runtime_records +// table, matching scanRecord's column order. +var runtimeSelectColumns = pg.ColumnList{ + pgtable.RuntimeRecords.GameID, + pgtable.RuntimeRecords.Status, + pgtable.RuntimeRecords.CurrentContainerID, + pgtable.RuntimeRecords.CurrentImageRef, + pgtable.RuntimeRecords.EngineEndpoint, + pgtable.RuntimeRecords.StatePath, + pgtable.RuntimeRecords.DockerNetwork, + pgtable.RuntimeRecords.StartedAt, + pgtable.RuntimeRecords.StoppedAt, + pgtable.RuntimeRecords.RemovedAt, + pgtable.RuntimeRecords.LastOpAt, + pgtable.RuntimeRecords.CreatedAt, +} + +// Get returns the record identified by gameID. It returns +// runtime.ErrNotFound when no record exists. 
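+//
+// A caller typically branches on the sentinel rather than on error text
+// (illustrative sketch):
+//
+//	record, err := store.Get(ctx, gameID)
+//	if errors.Is(err, runtime.ErrNotFound) {
+//		// first contact with this game: fall through to Upsert
+//	}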
+func (store *Store) Get(ctx context.Context, gameID string) (runtime.RuntimeRecord, error) { + if store == nil || store.db == nil { + return runtime.RuntimeRecord{}, errors.New("get runtime record: nil store") + } + if strings.TrimSpace(gameID) == "" { + return runtime.RuntimeRecord{}, fmt.Errorf("get runtime record: game id must not be empty") + } + + operationCtx, cancel, err := sqlx.WithTimeout(ctx, "get runtime record", store.operationTimeout) + if err != nil { + return runtime.RuntimeRecord{}, err + } + defer cancel() + + stmt := pg.SELECT(runtimeSelectColumns). + FROM(pgtable.RuntimeRecords). + WHERE(pgtable.RuntimeRecords.GameID.EQ(pg.String(gameID))) + + query, args := stmt.Sql() + row := store.db.QueryRowContext(operationCtx, query, args...) + record, err := scanRecord(row) + if sqlx.IsNoRows(err) { + return runtime.RuntimeRecord{}, runtime.ErrNotFound + } + if err != nil { + return runtime.RuntimeRecord{}, fmt.Errorf("get runtime record: %w", err) + } + return record, nil +} + +// Upsert inserts record when no row exists for record.GameID and +// otherwise overwrites every mutable column verbatim. created_at is +// preserved across upserts so the "first time RTM saw the game" +// timestamp stays stable. +func (store *Store) Upsert(ctx context.Context, record runtime.RuntimeRecord) error { + if store == nil || store.db == nil { + return errors.New("upsert runtime record: nil store") + } + if err := record.Validate(); err != nil { + return fmt.Errorf("upsert runtime record: %w", err) + } + + operationCtx, cancel, err := sqlx.WithTimeout(ctx, "upsert runtime record", store.operationTimeout) + if err != nil { + return err + } + defer cancel() + + stmt := pgtable.RuntimeRecords.INSERT( + pgtable.RuntimeRecords.GameID, + pgtable.RuntimeRecords.Status, + pgtable.RuntimeRecords.CurrentContainerID, + pgtable.RuntimeRecords.CurrentImageRef, + pgtable.RuntimeRecords.EngineEndpoint, + pgtable.RuntimeRecords.StatePath, + pgtable.RuntimeRecords.DockerNetwork, + pgtable.RuntimeRecords.StartedAt, + pgtable.RuntimeRecords.StoppedAt, + pgtable.RuntimeRecords.RemovedAt, + pgtable.RuntimeRecords.LastOpAt, + pgtable.RuntimeRecords.CreatedAt, + ).VALUES( + record.GameID, + string(record.Status), + sqlx.NullableString(record.CurrentContainerID), + sqlx.NullableString(record.CurrentImageRef), + record.EngineEndpoint, + record.StatePath, + record.DockerNetwork, + sqlx.NullableTimePtr(record.StartedAt), + sqlx.NullableTimePtr(record.StoppedAt), + sqlx.NullableTimePtr(record.RemovedAt), + record.LastOpAt.UTC(), + record.CreatedAt.UTC(), + ).ON_CONFLICT(pgtable.RuntimeRecords.GameID).DO_UPDATE( + pg.SET( + pgtable.RuntimeRecords.Status.SET(pgtable.RuntimeRecords.EXCLUDED.Status), + pgtable.RuntimeRecords.CurrentContainerID.SET(pgtable.RuntimeRecords.EXCLUDED.CurrentContainerID), + pgtable.RuntimeRecords.CurrentImageRef.SET(pgtable.RuntimeRecords.EXCLUDED.CurrentImageRef), + pgtable.RuntimeRecords.EngineEndpoint.SET(pgtable.RuntimeRecords.EXCLUDED.EngineEndpoint), + pgtable.RuntimeRecords.StatePath.SET(pgtable.RuntimeRecords.EXCLUDED.StatePath), + pgtable.RuntimeRecords.DockerNetwork.SET(pgtable.RuntimeRecords.EXCLUDED.DockerNetwork), + pgtable.RuntimeRecords.StartedAt.SET(pgtable.RuntimeRecords.EXCLUDED.StartedAt), + pgtable.RuntimeRecords.StoppedAt.SET(pgtable.RuntimeRecords.EXCLUDED.StoppedAt), + pgtable.RuntimeRecords.RemovedAt.SET(pgtable.RuntimeRecords.EXCLUDED.RemovedAt), + pgtable.RuntimeRecords.LastOpAt.SET(pgtable.RuntimeRecords.EXCLUDED.LastOpAt), + ), + ) + + query, args := stmt.Sql() + if _, 
err := store.db.ExecContext(operationCtx, query, args...); err != nil {
+		return fmt.Errorf("upsert runtime record: %w", err)
+	}
+	return nil
+}
+
+// UpdateStatus applies one status transition with a compare-and-swap
+// guard on (status, current_container_id). Validate is invoked before
+// any SQL touch.
+func (store *Store) UpdateStatus(ctx context.Context, input ports.UpdateStatusInput) error {
+	if store == nil || store.db == nil {
+		return errors.New("update runtime status: nil store")
+	}
+	if err := input.Validate(); err != nil {
+		return err
+	}
+
+	operationCtx, cancel, err := sqlx.WithTimeout(ctx, "update runtime status", store.operationTimeout)
+	if err != nil {
+		return err
+	}
+	defer cancel()
+
+	now := input.Now.UTC()
+	stmt, err := buildUpdateStatusStatement(input, now)
+	if err != nil {
+		return err
+	}
+
+	query, args := stmt.Sql()
+	result, err := store.db.ExecContext(operationCtx, query, args...)
+	if err != nil {
+		return fmt.Errorf("update runtime status: %w", err)
+	}
+	affected, err := result.RowsAffected()
+	if err != nil {
+		return fmt.Errorf("update runtime status: rows affected: %w", err)
+	}
+	if affected == 0 {
+		return store.classifyMissingUpdate(operationCtx, input.GameID)
+	}
+	return nil
+}
+
+// classifyMissingUpdate distinguishes ErrNotFound from ErrConflict after
+// an UPDATE that affected zero rows. A row that is absent yields
+// ErrNotFound; a row whose status or container_id does not match the
+// CAS predicate yields ErrConflict.
+func (store *Store) classifyMissingUpdate(ctx context.Context, gameID string) error {
+	probe := pg.SELECT(pgtable.RuntimeRecords.Status).
+		FROM(pgtable.RuntimeRecords).
+		WHERE(pgtable.RuntimeRecords.GameID.EQ(pg.String(gameID)))
+	probeQuery, probeArgs := probe.Sql()
+
+	var current string
+	row := store.db.QueryRowContext(ctx, probeQuery, probeArgs...)
+	if err := row.Scan(&current); err != nil {
+		if sqlx.IsNoRows(err) {
+			return runtime.ErrNotFound
+		}
+		return fmt.Errorf("update runtime status: probe: %w", err)
+	}
+	return runtime.ErrConflict
+}
+
+// buildUpdateStatusStatement assembles the UPDATE statement applied for
+// one runtime-status transition.
+//
+// status, last_op_at are always updated. The remaining columns are
+// driven by the destination:
+//
+//   - StatusStopped: stopped_at is captured at Now.
+//   - StatusRemoved: removed_at is captured at Now and current_container_id
+//     is NULLed (the container is gone; the prior id remains observable
+//     through operation_log).
+//   - StatusRunning: only status + last_op_at change. Fresh started_at
+//     and current_container_id are installed via Upsert before any
+//     stopped → running transition reaches this path; the path exists
+//     so runtime.AllowedTransitions stays one-to-one with the adapter
+//     capability matrix even though v1 services use Upsert for this
+//     case.
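+//
+// For the StatusStopped branch the generated statement is roughly
+// (illustrative; exact placeholder numbering, quoting, and table
+// qualification are produced by go-jet):
+//
+//	UPDATE runtime_records
+//	SET status = $1, last_op_at = $2, stopped_at = $3
+//	WHERE game_id = $4 AND status = $5
+//	  AND current_container_id = $6 -- only when ExpectedContainerID is set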
+func buildUpdateStatusStatement(input ports.UpdateStatusInput, now time.Time) (pg.UpdateStatement, error) { + statusValue := pg.String(string(input.To)) + nowValue := pg.TimestampzT(now) + + var stmt pg.UpdateStatement + switch input.To { + case runtime.StatusStopped: + stmt = pgtable.RuntimeRecords.UPDATE( + pgtable.RuntimeRecords.Status, + pgtable.RuntimeRecords.LastOpAt, + pgtable.RuntimeRecords.StoppedAt, + ).SET( + statusValue, + nowValue, + nowValue, + ) + case runtime.StatusRemoved: + stmt = pgtable.RuntimeRecords.UPDATE( + pgtable.RuntimeRecords.Status, + pgtable.RuntimeRecords.LastOpAt, + pgtable.RuntimeRecords.RemovedAt, + pgtable.RuntimeRecords.CurrentContainerID, + ).SET( + statusValue, + nowValue, + nowValue, + pg.NULL, + ) + case runtime.StatusRunning: + stmt = pgtable.RuntimeRecords.UPDATE( + pgtable.RuntimeRecords.Status, + pgtable.RuntimeRecords.LastOpAt, + ).SET( + statusValue, + nowValue, + ) + default: + return nil, fmt.Errorf("update runtime status: destination status %q is unsupported", input.To) + } + + whereExpr := pg.AND( + pgtable.RuntimeRecords.GameID.EQ(pg.String(input.GameID)), + pgtable.RuntimeRecords.Status.EQ(pg.String(string(input.ExpectedFrom))), + ) + if input.ExpectedContainerID != "" { + whereExpr = pg.AND( + whereExpr, + pgtable.RuntimeRecords.CurrentContainerID.EQ(pg.String(input.ExpectedContainerID)), + ) + } + return stmt.WHERE(whereExpr), nil +} + +// ListByStatus returns every record currently indexed under status. +// Ordering is last_op_at DESC, game_id ASC — the direction the +// `runtime_records_status_last_op_idx` index is built in. +func (store *Store) ListByStatus(ctx context.Context, status runtime.Status) ([]runtime.RuntimeRecord, error) { + if store == nil || store.db == nil { + return nil, errors.New("list runtime records by status: nil store") + } + if !status.IsKnown() { + return nil, fmt.Errorf("list runtime records by status: status %q is unsupported", status) + } + + operationCtx, cancel, err := sqlx.WithTimeout(ctx, "list runtime records by status", store.operationTimeout) + if err != nil { + return nil, err + } + defer cancel() + + stmt := pg.SELECT(runtimeSelectColumns). + FROM(pgtable.RuntimeRecords). + WHERE(pgtable.RuntimeRecords.Status.EQ(pg.String(string(status)))). + ORDER_BY(pgtable.RuntimeRecords.LastOpAt.DESC(), pgtable.RuntimeRecords.GameID.ASC()) + + query, args := stmt.Sql() + rows, err := store.db.QueryContext(operationCtx, query, args...) + if err != nil { + return nil, fmt.Errorf("list runtime records by status: %w", err) + } + defer rows.Close() + + records := make([]runtime.RuntimeRecord, 0) + for rows.Next() { + record, err := scanRecord(rows) + if err != nil { + return nil, fmt.Errorf("list runtime records by status: scan: %w", err) + } + records = append(records, record) + } + if err := rows.Err(); err != nil { + return nil, fmt.Errorf("list runtime records by status: %w", err) + } + if len(records) == 0 { + return nil, nil + } + return records, nil +} + +// List returns every runtime record currently stored. Ordering matches +// ListByStatus — last_op_at DESC, game_id ASC — so the REST list +// endpoint sees the freshest activity first. 
+func (store *Store) List(ctx context.Context) ([]runtime.RuntimeRecord, error) { + if store == nil || store.db == nil { + return nil, errors.New("list runtime records: nil store") + } + + operationCtx, cancel, err := sqlx.WithTimeout(ctx, "list runtime records", store.operationTimeout) + if err != nil { + return nil, err + } + defer cancel() + + stmt := pg.SELECT(runtimeSelectColumns). + FROM(pgtable.RuntimeRecords). + ORDER_BY(pgtable.RuntimeRecords.LastOpAt.DESC(), pgtable.RuntimeRecords.GameID.ASC()) + + query, args := stmt.Sql() + rows, err := store.db.QueryContext(operationCtx, query, args...) + if err != nil { + return nil, fmt.Errorf("list runtime records: %w", err) + } + defer rows.Close() + + records := make([]runtime.RuntimeRecord, 0) + for rows.Next() { + record, err := scanRecord(rows) + if err != nil { + return nil, fmt.Errorf("list runtime records: scan: %w", err) + } + records = append(records, record) + } + if err := rows.Err(); err != nil { + return nil, fmt.Errorf("list runtime records: %w", err) + } + if len(records) == 0 { + return nil, nil + } + return records, nil +} + +// CountByStatus returns the number of records indexed under each status. +// Statuses with zero records are present in the result with a zero +// count so callers (e.g. the telemetry gauge) can publish a stable +// label set on every reading. +func (store *Store) CountByStatus(ctx context.Context) (map[runtime.Status]int, error) { + if store == nil || store.db == nil { + return nil, errors.New("count runtime records by status: nil store") + } + + operationCtx, cancel, err := sqlx.WithTimeout(ctx, "count runtime records by status", store.operationTimeout) + if err != nil { + return nil, err + } + defer cancel() + + countAlias := pg.COUNT(pg.STAR).AS("count") + stmt := pg.SELECT(pgtable.RuntimeRecords.Status, countAlias). + FROM(pgtable.RuntimeRecords). + GROUP_BY(pgtable.RuntimeRecords.Status) + + query, args := stmt.Sql() + rows, err := store.db.QueryContext(operationCtx, query, args...) + if err != nil { + return nil, fmt.Errorf("count runtime records by status: %w", err) + } + defer rows.Close() + + counts := make(map[runtime.Status]int, len(runtime.AllStatuses())) + for _, status := range runtime.AllStatuses() { + counts[status] = 0 + } + for rows.Next() { + var status string + var count int + if err := rows.Scan(&status, &count); err != nil { + return nil, fmt.Errorf("count runtime records by status: scan: %w", err) + } + counts[runtime.Status(status)] = count + } + if err := rows.Err(); err != nil { + return nil, fmt.Errorf("count runtime records by status: %w", err) + } + return counts, nil +} + +// rowScanner abstracts *sql.Row and *sql.Rows so scanRecord can be shared +// across both single-row reads and iterated reads. +type rowScanner interface { + Scan(dest ...any) error +} + +// scanRecord scans one runtime_records row from rs. Returns sql.ErrNoRows +// verbatim so callers can distinguish "no row" from a hard error. 
+func scanRecord(rs rowScanner) (runtime.RuntimeRecord, error) {
+	var (
+		gameID             string
+		status             string
+		currentContainerID sql.NullString
+		currentImageRef    sql.NullString
+		engineEndpoint     string
+		statePath          string
+		dockerNetwork      string
+		startedAt          sql.NullTime
+		stoppedAt          sql.NullTime
+		removedAt          sql.NullTime
+		lastOpAt           time.Time
+		createdAt          time.Time
+	)
+	if err := rs.Scan(
+		&gameID,
+		&status,
+		&currentContainerID,
+		&currentImageRef,
+		&engineEndpoint,
+		&statePath,
+		&dockerNetwork,
+		&startedAt,
+		&stoppedAt,
+		&removedAt,
+		&lastOpAt,
+		&createdAt,
+	); err != nil {
+		return runtime.RuntimeRecord{}, err
+	}
+	return runtime.RuntimeRecord{
+		GameID:             gameID,
+		Status:             runtime.Status(status),
+		CurrentContainerID: sqlx.StringFromNullable(currentContainerID),
+		CurrentImageRef:    sqlx.StringFromNullable(currentImageRef),
+		EngineEndpoint:     engineEndpoint,
+		StatePath:          statePath,
+		DockerNetwork:      dockerNetwork,
+		StartedAt:          sqlx.TimePtrFromNullable(startedAt),
+		StoppedAt:          sqlx.TimePtrFromNullable(stoppedAt),
+		RemovedAt:          sqlx.TimePtrFromNullable(removedAt),
+		LastOpAt:           lastOpAt.UTC(),
+		CreatedAt:          createdAt.UTC(),
+	}, nil
+}
+
+// Ensure Store satisfies the ports.RuntimeRecordStore interface at
+// compile time.
+var _ ports.RuntimeRecordStore = (*Store)(nil)
diff --git a/rtmanager/internal/adapters/postgres/runtimerecordstore/store_test.go b/rtmanager/internal/adapters/postgres/runtimerecordstore/store_test.go
new file mode 100644
index 0000000..bfbea42
--- /dev/null
+++ b/rtmanager/internal/adapters/postgres/runtimerecordstore/store_test.go
@@ -0,0 +1,420 @@
+package runtimerecordstore_test
+
+import (
+	"context"
+	"errors"
+	"sync"
+	"testing"
+	"time"
+
+	"galaxy/rtmanager/internal/adapters/postgres/internal/pgtest"
+	"galaxy/rtmanager/internal/adapters/postgres/runtimerecordstore"
+	"galaxy/rtmanager/internal/domain/runtime"
+	"galaxy/rtmanager/internal/ports"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestMain(m *testing.M) { pgtest.RunMain(m) }
+
+func newStore(t *testing.T) *runtimerecordstore.Store {
+	t.Helper()
+	pgtest.TruncateAll(t)
+	store, err := runtimerecordstore.New(runtimerecordstore.Config{
+		DB:               pgtest.Ensure(t).Pool(),
+		OperationTimeout: pgtest.OperationTimeout,
+	})
+	require.NoError(t, err)
+	return store
+}
+
+func runningRecord(t *testing.T, gameID, containerID, imageRef string) runtime.RuntimeRecord {
+	t.Helper()
+	now := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)
+	started := now
+	return runtime.RuntimeRecord{
+		GameID:             gameID,
+		Status:             runtime.StatusRunning,
+		CurrentContainerID: containerID,
+		CurrentImageRef:    imageRef,
+		EngineEndpoint:     "http://galaxy-game-" + gameID + ":8080",
+		StatePath:          "/var/lib/galaxy/games/" + gameID,
+		DockerNetwork:      "galaxy-net",
+		StartedAt:          &started,
+		LastOpAt:           now,
+		CreatedAt:          now,
+	}
+}
+
+func TestUpsertAndGetRoundTrip(t *testing.T) {
+	ctx := context.Background()
+	store := newStore(t)
+
+	record := runningRecord(t, "game-001", "container-1", "galaxy/game:v1.2.3")
+	require.NoError(t, store.Upsert(ctx, record))
+
+	got, err := store.Get(ctx, record.GameID)
+	require.NoError(t, err)
+	assert.Equal(t, record.GameID, got.GameID)
+	assert.Equal(t, record.Status, got.Status)
+	assert.Equal(t, record.CurrentContainerID, got.CurrentContainerID)
+	assert.Equal(t, record.CurrentImageRef, got.CurrentImageRef)
+	assert.Equal(t, record.EngineEndpoint, got.EngineEndpoint)
+	assert.Equal(t, record.StatePath, got.StatePath)
+	assert.Equal(t, record.DockerNetwork, got.DockerNetwork)
+
require.NotNil(t, got.StartedAt) + assert.True(t, record.StartedAt.Equal(*got.StartedAt)) + assert.Equal(t, time.UTC, got.StartedAt.Location()) + assert.Equal(t, time.UTC, got.LastOpAt.Location()) + assert.Equal(t, time.UTC, got.CreatedAt.Location()) + assert.Nil(t, got.StoppedAt) + assert.Nil(t, got.RemovedAt) +} + +func TestGetReturnsNotFound(t *testing.T) { + ctx := context.Background() + store := newStore(t) + + _, err := store.Get(ctx, "game-missing") + require.ErrorIs(t, err, runtime.ErrNotFound) +} + +func TestUpsertOverwritesMutableColumnsPreservesCreatedAt(t *testing.T) { + ctx := context.Background() + store := newStore(t) + + original := runningRecord(t, "game-001", "container-1", "galaxy/game:v1.2.3") + require.NoError(t, store.Upsert(ctx, original)) + + updated := original + updated.CurrentContainerID = "container-2" + updated.CurrentImageRef = "galaxy/game:v1.2.4" + newStarted := original.LastOpAt.Add(time.Minute) + updated.StartedAt = &newStarted + updated.LastOpAt = newStarted + // Fresh CreatedAt simulates a caller passing "now"; the store must + // preserve the original CreatedAt value on conflict. + updated.CreatedAt = newStarted + + require.NoError(t, store.Upsert(ctx, updated)) + + got, err := store.Get(ctx, original.GameID) + require.NoError(t, err) + assert.Equal(t, "container-2", got.CurrentContainerID) + assert.Equal(t, "galaxy/game:v1.2.4", got.CurrentImageRef) + assert.True(t, got.LastOpAt.Equal(newStarted)) + assert.True(t, got.CreatedAt.Equal(original.CreatedAt), + "created_at must be preserved across upserts: got %s, want %s", + got.CreatedAt, original.CreatedAt) +} + +func TestUpdateStatusRunningToStopped(t *testing.T) { + ctx := context.Background() + store := newStore(t) + + record := runningRecord(t, "game-001", "container-1", "galaxy/game:v1.2.3") + require.NoError(t, store.Upsert(ctx, record)) + + now := record.LastOpAt.Add(2 * time.Minute) + require.NoError(t, store.UpdateStatus(ctx, ports.UpdateStatusInput{ + GameID: record.GameID, + ExpectedFrom: runtime.StatusRunning, + ExpectedContainerID: record.CurrentContainerID, + To: runtime.StatusStopped, + Now: now, + })) + + got, err := store.Get(ctx, record.GameID) + require.NoError(t, err) + assert.Equal(t, runtime.StatusStopped, got.Status) + require.NotNil(t, got.StoppedAt) + assert.True(t, now.Equal(*got.StoppedAt)) + assert.True(t, now.Equal(got.LastOpAt)) + // container id is preserved on stop; cleanup later NULLs it. 
+ assert.Equal(t, record.CurrentContainerID, got.CurrentContainerID) +} + +func TestUpdateStatusRunningToRemovedClearsContainerID(t *testing.T) { + ctx := context.Background() + store := newStore(t) + + record := runningRecord(t, "game-001", "container-1", "galaxy/game:v1.2.3") + require.NoError(t, store.Upsert(ctx, record)) + + now := record.LastOpAt.Add(time.Minute) + require.NoError(t, store.UpdateStatus(ctx, ports.UpdateStatusInput{ + GameID: record.GameID, + ExpectedFrom: runtime.StatusRunning, + To: runtime.StatusRemoved, + Now: now, + })) + + got, err := store.Get(ctx, record.GameID) + require.NoError(t, err) + assert.Equal(t, runtime.StatusRemoved, got.Status) + require.NotNil(t, got.RemovedAt) + assert.True(t, now.Equal(*got.RemovedAt)) + assert.True(t, now.Equal(got.LastOpAt)) + assert.Empty(t, got.CurrentContainerID, "current_container_id must be NULL after removal") +} + +func TestUpdateStatusStoppedToRemoved(t *testing.T) { + ctx := context.Background() + store := newStore(t) + + record := runningRecord(t, "game-001", "container-1", "galaxy/game:v1.2.3") + require.NoError(t, store.Upsert(ctx, record)) + + stopAt := record.LastOpAt.Add(time.Minute) + require.NoError(t, store.UpdateStatus(ctx, ports.UpdateStatusInput{ + GameID: record.GameID, + ExpectedFrom: runtime.StatusRunning, + To: runtime.StatusStopped, + Now: stopAt, + })) + + removeAt := stopAt.Add(time.Hour) + require.NoError(t, store.UpdateStatus(ctx, ports.UpdateStatusInput{ + GameID: record.GameID, + ExpectedFrom: runtime.StatusStopped, + To: runtime.StatusRemoved, + Now: removeAt, + })) + + got, err := store.Get(ctx, record.GameID) + require.NoError(t, err) + assert.Equal(t, runtime.StatusRemoved, got.Status) + require.NotNil(t, got.RemovedAt) + assert.True(t, removeAt.Equal(*got.RemovedAt)) + assert.True(t, removeAt.Equal(got.LastOpAt)) + require.NotNil(t, got.StoppedAt, "stopped_at must remain populated through removal") + assert.True(t, stopAt.Equal(*got.StoppedAt)) + assert.Empty(t, got.CurrentContainerID) +} + +func TestUpdateStatusReturnsConflictOnFromMismatch(t *testing.T) { + ctx := context.Background() + store := newStore(t) + + record := runningRecord(t, "game-001", "container-1", "galaxy/game:v1.2.3") + require.NoError(t, store.Upsert(ctx, record)) + + err := store.UpdateStatus(ctx, ports.UpdateStatusInput{ + GameID: record.GameID, + ExpectedFrom: runtime.StatusStopped, // wrong + To: runtime.StatusRemoved, + Now: record.LastOpAt.Add(time.Minute), + }) + require.ErrorIs(t, err, runtime.ErrConflict) +} + +func TestUpdateStatusReturnsConflictOnContainerIDMismatch(t *testing.T) { + ctx := context.Background() + store := newStore(t) + + record := runningRecord(t, "game-001", "container-1", "galaxy/game:v1.2.3") + require.NoError(t, store.Upsert(ctx, record)) + + err := store.UpdateStatus(ctx, ports.UpdateStatusInput{ + GameID: record.GameID, + ExpectedFrom: runtime.StatusRunning, + ExpectedContainerID: "container-other", + To: runtime.StatusStopped, + Now: record.LastOpAt.Add(time.Minute), + }) + require.ErrorIs(t, err, runtime.ErrConflict) +} + +func TestUpdateStatusReturnsNotFoundForMissing(t *testing.T) { + ctx := context.Background() + store := newStore(t) + + err := store.UpdateStatus(ctx, ports.UpdateStatusInput{ + GameID: "game-missing", + ExpectedFrom: runtime.StatusRunning, + To: runtime.StatusStopped, + Now: time.Now().UTC(), + }) + require.ErrorIs(t, err, runtime.ErrNotFound) +} + +func TestUpdateStatusValidatesInputBeforeStore(t *testing.T) { + ctx := context.Background() + store := newStore(t) + 
+ err := store.UpdateStatus(ctx, ports.UpdateStatusInput{ + GameID: "game-001", + ExpectedFrom: runtime.StatusRunning, + To: runtime.StatusStopped, + // Now intentionally zero — validation must reject. + }) + require.Error(t, err) +} + +// TestUpdateStatusConcurrentCAS asserts the CAS guard: when two callers +// race to apply the running → stopped transition on the same row, +// exactly one wins (returns nil) and the other observes +// runtime.ErrConflict. +func TestUpdateStatusConcurrentCAS(t *testing.T) { + ctx := context.Background() + store := newStore(t) + + record := runningRecord(t, "game-001", "container-1", "galaxy/game:v1.2.3") + require.NoError(t, store.Upsert(ctx, record)) + + const concurrency = 8 + results := make([]error, concurrency) + var wg sync.WaitGroup + wg.Add(concurrency) + for index := range concurrency { + go func() { + defer wg.Done() + results[index] = store.UpdateStatus(ctx, ports.UpdateStatusInput{ + GameID: record.GameID, + ExpectedFrom: runtime.StatusRunning, + ExpectedContainerID: record.CurrentContainerID, + To: runtime.StatusStopped, + Now: record.LastOpAt.Add(time.Duration(index+1) * time.Second), + }) + }() + } + wg.Wait() + + wins, conflicts := 0, 0 + for _, err := range results { + switch { + case err == nil: + wins++ + case errors.Is(err, runtime.ErrConflict): + conflicts++ + default: + t.Errorf("unexpected error from concurrent UpdateStatus: %v", err) + } + } + assert.Equal(t, 1, wins, "exactly one caller must win the CAS race") + assert.Equal(t, concurrency-1, conflicts, "the rest must observe runtime.ErrConflict") +} + +func TestListByStatusReturnsExpectedRecords(t *testing.T) { + ctx := context.Background() + store := newStore(t) + + a := runningRecord(t, "game-aaa", "container-a", "galaxy/game:v1.2.3") + b := runningRecord(t, "game-bbb", "container-b", "galaxy/game:v1.2.3") + c := runningRecord(t, "game-ccc", "container-c", "galaxy/game:v1.2.3") + for _, r := range []runtime.RuntimeRecord{a, b, c} { + require.NoError(t, store.Upsert(ctx, r)) + } + + stopAt := a.LastOpAt.Add(time.Minute) + require.NoError(t, store.UpdateStatus(ctx, ports.UpdateStatusInput{ + GameID: b.GameID, + ExpectedFrom: runtime.StatusRunning, + To: runtime.StatusStopped, + Now: stopAt, + })) + + running, err := store.ListByStatus(ctx, runtime.StatusRunning) + require.NoError(t, err) + gotIDs := map[string]struct{}{} + for _, r := range running { + gotIDs[r.GameID] = struct{}{} + } + assert.Contains(t, gotIDs, a.GameID) + assert.Contains(t, gotIDs, c.GameID) + assert.NotContains(t, gotIDs, b.GameID) + + stopped, err := store.ListByStatus(ctx, runtime.StatusStopped) + require.NoError(t, err) + require.Len(t, stopped, 1) + assert.Equal(t, b.GameID, stopped[0].GameID) +} + +func TestListByStatusRejectsUnknown(t *testing.T) { + ctx := context.Background() + store := newStore(t) + + _, err := store.ListByStatus(ctx, runtime.Status("exotic")) + require.Error(t, err) +} + +func TestListReturnsEveryStatus(t *testing.T) { + ctx := context.Background() + store := newStore(t) + + a := runningRecord(t, "game-aaa", "container-a", "galaxy/game:v1.2.3") + b := runningRecord(t, "game-bbb", "container-b", "galaxy/game:v1.2.3") + c := runningRecord(t, "game-ccc", "container-c", "galaxy/game:v1.2.3") + for _, r := range []runtime.RuntimeRecord{a, b, c} { + require.NoError(t, store.Upsert(ctx, r)) + } + require.NoError(t, store.UpdateStatus(ctx, ports.UpdateStatusInput{ + GameID: b.GameID, + ExpectedFrom: runtime.StatusRunning, + To: runtime.StatusStopped, + Now: b.LastOpAt.Add(time.Minute), + 
})) + + all, err := store.List(ctx) + require.NoError(t, err) + require.Len(t, all, 3) + + gotIDs := map[string]runtime.Status{} + for _, r := range all { + gotIDs[r.GameID] = r.Status + } + assert.Equal(t, runtime.StatusRunning, gotIDs[a.GameID]) + assert.Equal(t, runtime.StatusStopped, gotIDs[b.GameID]) + assert.Equal(t, runtime.StatusRunning, gotIDs[c.GameID]) +} + +func TestListReturnsNilWhenEmpty(t *testing.T) { + ctx := context.Background() + store := newStore(t) + + all, err := store.List(ctx) + require.NoError(t, err) + assert.Nil(t, all) +} + +func TestCountByStatusReturnsAllBuckets(t *testing.T) { + ctx := context.Background() + store := newStore(t) + + a := runningRecord(t, "game-1", "container-1", "galaxy/game:v1.2.3") + b := runningRecord(t, "game-2", "container-2", "galaxy/game:v1.2.3") + c := runningRecord(t, "game-3", "container-3", "galaxy/game:v1.2.3") + for _, r := range []runtime.RuntimeRecord{a, b, c} { + require.NoError(t, store.Upsert(ctx, r)) + } + require.NoError(t, store.UpdateStatus(ctx, ports.UpdateStatusInput{ + GameID: b.GameID, + ExpectedFrom: runtime.StatusRunning, + To: runtime.StatusStopped, + Now: b.LastOpAt.Add(time.Minute), + })) + + counts, err := store.CountByStatus(ctx) + require.NoError(t, err) + + for _, status := range runtime.AllStatuses() { + _, ok := counts[status] + assert.True(t, ok, "status %q must appear in counts even when zero", status) + } + assert.Equal(t, 2, counts[runtime.StatusRunning]) + assert.Equal(t, 1, counts[runtime.StatusStopped]) + assert.Equal(t, 0, counts[runtime.StatusRemoved]) +} + +func TestNewRejectsNilDB(t *testing.T) { + _, err := runtimerecordstore.New(runtimerecordstore.Config{OperationTimeout: time.Second}) + require.Error(t, err) +} + +func TestNewRejectsNonPositiveTimeout(t *testing.T) { + _, err := runtimerecordstore.New(runtimerecordstore.Config{ + DB: pgtest.Ensure(t).Pool(), + }) + require.Error(t, err) +} diff --git a/rtmanager/internal/adapters/redisstate/gamelease/store.go b/rtmanager/internal/adapters/redisstate/gamelease/store.go new file mode 100644 index 0000000..b05d8cd --- /dev/null +++ b/rtmanager/internal/adapters/redisstate/gamelease/store.go @@ -0,0 +1,117 @@ +// Package gamelease implements the Redis-backed adapter for +// `ports.GameLeaseStore`. +// +// The lease guards every lifecycle operation Runtime Manager runs +// against one game (start, stop, restart, patch, cleanup, plus the +// reconciler's drift mutations). Acquisition uses `SET NX PX ` +// with a random caller token; release runs a Lua compare-and-delete +// so a holder that lost the lease through TTL expiry cannot wipe +// another caller's claim. +package gamelease + +import ( + "context" + "errors" + "fmt" + "strings" + "time" + + "galaxy/rtmanager/internal/adapters/redisstate" + "galaxy/rtmanager/internal/ports" + + "github.com/redis/go-redis/v9" +) + +// releaseScript removes the per-game lease only when the supplied token +// still owns it. Compare-and-delete prevents a TTL-expired holder from +// clearing another caller's claim. +var releaseScript = redis.NewScript(` +if redis.call("GET", KEYS[1]) == ARGV[1] then + return redis.call("DEL", KEYS[1]) +end +return 0 +`) + +// Config configures one Redis-backed game lease store instance. The +// store does not own the redis client lifecycle; the caller (typically +// the service runtime) opens and closes it. +type Config struct { + // Client stores the Redis client the store uses for every command. 
+ Client *redis.Client +} + +// Store persists the per-game lifecycle lease in Redis. +type Store struct { + client *redis.Client + keys redisstate.Keyspace +} + +// New constructs one Redis-backed game lease store from cfg. +func New(cfg Config) (*Store, error) { + if cfg.Client == nil { + return nil, errors.New("new rtmanager game lease store: nil redis client") + } + return &Store{ + client: cfg.Client, + keys: redisstate.Keyspace{}, + }, nil +} + +// TryAcquire attempts to acquire the per-game lease for gameID owned by +// token for ttl. The acquired return is true on a successful claim and +// false when another caller still owns the lease. A non-nil error +// reports a transport failure and must not be confused with a missed +// lease. +func (store *Store) TryAcquire(ctx context.Context, gameID, token string, ttl time.Duration) (bool, error) { + if store == nil || store.client == nil { + return false, errors.New("try acquire game lease: nil store") + } + if ctx == nil { + return false, errors.New("try acquire game lease: nil context") + } + if strings.TrimSpace(gameID) == "" { + return false, errors.New("try acquire game lease: game id must not be empty") + } + if strings.TrimSpace(token) == "" { + return false, errors.New("try acquire game lease: token must not be empty") + } + if ttl <= 0 { + return false, errors.New("try acquire game lease: ttl must be positive") + } + + acquired, err := store.client.SetNX(ctx, store.keys.GameLease(gameID), token, ttl).Result() + if err != nil { + return false, fmt.Errorf("try acquire game lease: %w", err) + } + return acquired, nil +} + +// Release removes the per-game lease for gameID only when token still +// matches the stored owner value. A token mismatch is a silent no-op. +func (store *Store) Release(ctx context.Context, gameID, token string) error { + if store == nil || store.client == nil { + return errors.New("release game lease: nil store") + } + if ctx == nil { + return errors.New("release game lease: nil context") + } + if strings.TrimSpace(gameID) == "" { + return errors.New("release game lease: game id must not be empty") + } + if strings.TrimSpace(token) == "" { + return errors.New("release game lease: token must not be empty") + } + + if err := releaseScript.Run( + ctx, + store.client, + []string{store.keys.GameLease(gameID)}, + token, + ).Err(); err != nil { + return fmt.Errorf("release game lease: %w", err) + } + return nil +} + +// Compile-time assertion: Store implements ports.GameLeaseStore. 
+var _ ports.GameLeaseStore = (*Store)(nil) diff --git a/rtmanager/internal/adapters/redisstate/gamelease/store_test.go b/rtmanager/internal/adapters/redisstate/gamelease/store_test.go new file mode 100644 index 0000000..3e5da11 --- /dev/null +++ b/rtmanager/internal/adapters/redisstate/gamelease/store_test.go @@ -0,0 +1,133 @@ +package gamelease_test + +import ( + "context" + "testing" + "time" + + "galaxy/rtmanager/internal/adapters/redisstate" + "galaxy/rtmanager/internal/adapters/redisstate/gamelease" + + "github.com/alicebob/miniredis/v2" + "github.com/redis/go-redis/v9" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func newLeaseStore(t *testing.T) (*gamelease.Store, *miniredis.Miniredis) { + t.Helper() + server := miniredis.RunT(t) + client := redis.NewClient(&redis.Options{Addr: server.Addr()}) + t.Cleanup(func() { _ = client.Close() }) + + store, err := gamelease.New(gamelease.Config{Client: client}) + require.NoError(t, err) + return store, server +} + +func TestNewRejectsNilClient(t *testing.T) { + _, err := gamelease.New(gamelease.Config{}) + require.Error(t, err) +} + +func TestTryAcquireSetsKeyAndTTL(t *testing.T) { + store, server := newLeaseStore(t) + + acquired, err := store.TryAcquire(context.Background(), "game-1", "token-A", time.Minute) + require.NoError(t, err) + assert.True(t, acquired) + + key := redisstate.Keyspace{}.GameLease("game-1") + assert.True(t, server.Exists(key), "key %q must exist after TryAcquire", key) + + stored, err := server.Get(key) + require.NoError(t, err) + assert.Equal(t, "token-A", stored) + + // TTL must be positive (miniredis returns the remaining duration). + ttl := server.TTL(key) + assert.Greater(t, ttl, time.Duration(0)) +} + +func TestTryAcquireReturnsFalseWhenAlreadyHeld(t *testing.T) { + store, _ := newLeaseStore(t) + + acquired, err := store.TryAcquire(context.Background(), "game-1", "token-A", time.Minute) + require.NoError(t, err) + require.True(t, acquired) + + acquired, err = store.TryAcquire(context.Background(), "game-1", "token-B", time.Minute) + require.NoError(t, err) + assert.False(t, acquired) +} + +func TestReleaseRemovesKeyForOwnerToken(t *testing.T) { + store, server := newLeaseStore(t) + + _, err := store.TryAcquire(context.Background(), "game-1", "token-A", time.Minute) + require.NoError(t, err) + + require.NoError(t, store.Release(context.Background(), "game-1", "token-A")) + + key := redisstate.Keyspace{}.GameLease("game-1") + assert.False(t, server.Exists(key), "key %q must be deleted after Release", key) +} + +func TestReleaseIsNoOpForForeignToken(t *testing.T) { + store, server := newLeaseStore(t) + + _, err := store.TryAcquire(context.Background(), "game-1", "token-A", time.Minute) + require.NoError(t, err) + + require.NoError(t, store.Release(context.Background(), "game-1", "token-B")) + + key := redisstate.Keyspace{}.GameLease("game-1") + assert.True(t, server.Exists(key), "key %q must still exist when foreign token is released", key) + + stored, err := server.Get(key) + require.NoError(t, err) + assert.Equal(t, "token-A", stored) +} + +func TestTryAcquireSucceedsAfterTTLExpiry(t *testing.T) { + store, server := newLeaseStore(t) + + acquired, err := store.TryAcquire(context.Background(), "game-1", "token-A", time.Minute) + require.NoError(t, err) + require.True(t, acquired) + + server.FastForward(2 * time.Minute) + + acquired, err = store.TryAcquire(context.Background(), "game-1", "token-B", time.Minute) + require.NoError(t, err) + assert.True(t, acquired) +} + +func 
TestTryAcquireRejectsInvalidArguments(t *testing.T) { + store, _ := newLeaseStore(t) + + _, err := store.TryAcquire(context.Background(), "", "token", time.Minute) + require.Error(t, err) + + _, err = store.TryAcquire(context.Background(), "game-1", "", time.Minute) + require.Error(t, err) + + _, err = store.TryAcquire(context.Background(), "game-1", "token", 0) + require.Error(t, err) +} + +func TestReleaseRejectsInvalidArguments(t *testing.T) { + store, _ := newLeaseStore(t) + + require.Error(t, store.Release(context.Background(), "", "token")) + require.Error(t, store.Release(context.Background(), "game-1", "")) +} + +func TestKeyspaceGameLeaseIsPrefixedAndEncoded(t *testing.T) { + key := redisstate.Keyspace{}.GameLease("game with spaces") + assert.NotEmpty(t, key) + assert.Contains(t, key, "rtmanager:game_lease:") + suffix := key[len("rtmanager:game_lease:"):] + // base64url-encoded suffix must not contain the original spaces. + assert.NotContains(t, suffix, " ") +} diff --git a/rtmanager/internal/adapters/redisstate/keyspace.go b/rtmanager/internal/adapters/redisstate/keyspace.go new file mode 100644 index 0000000..417b4ab --- /dev/null +++ b/rtmanager/internal/adapters/redisstate/keyspace.go @@ -0,0 +1,44 @@ +// Package redisstate hosts the Runtime Manager Redis adapters that share +// a single keyspace. Each sibling subpackage (e.g. `streamoffsets`) +// implements one port and uses Keyspace to compose its keys, so the +// Redis namespace stays under one document and one prefix. +// +// The package itself only declares the keyspace; concrete stores live in +// nested packages so dependencies (testcontainers, miniredis) stay out +// of consumer build graphs that do not need them. +package redisstate + +import "encoding/base64" + +// defaultPrefix is the mandatory `rtmanager:` namespace prefix shared by +// every Runtime Manager Redis key. +const defaultPrefix = "rtmanager:" + +// Keyspace builds the Runtime Manager Redis keys. The namespace covers +// the stream consumer offsets and the per-game lifecycle lease in v1. +// +// Dynamic key segments are encoded with base64url so raw key structure +// does not depend on caller-provided characters; this matches the +// encoding chosen by `lobby/internal/adapters/redisstate.Keyspace`. +type Keyspace struct{} + +// StreamOffset returns the Redis key that stores the last successfully +// processed entry id for one Redis Stream consumer. The streamLabel is +// the short logical identifier of the consumer (e.g. `start_jobs`, +// `stop_jobs`), not the full stream name; it stays stable when the +// underlying stream key is renamed. +func (Keyspace) StreamOffset(streamLabel string) string { + return defaultPrefix + "stream_offsets:" + encodeKeyComponent(streamLabel) +} + +// GameLease returns the Redis key that stores the per-game lifecycle +// lease guarding start / stop / restart / patch / cleanup operations +// against the same game. The gameID is base64url-encoded so callers can +// pass any opaque identifier without escaping raw key characters. 
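+//
+// For example (illustrative; any opaque id works):
+//
+//	Keyspace{}.GameLease("game-42") // "rtmanager:game_lease:Z2FtZS00Mg"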
+func (Keyspace) GameLease(gameID string) string { + return defaultPrefix + "game_lease:" + encodeKeyComponent(gameID) +} + +func encodeKeyComponent(value string) string { + return base64.RawURLEncoding.EncodeToString([]byte(value)) +} diff --git a/rtmanager/internal/adapters/redisstate/streamoffsets/store.go b/rtmanager/internal/adapters/redisstate/streamoffsets/store.go new file mode 100644 index 0000000..1b58c7e --- /dev/null +++ b/rtmanager/internal/adapters/redisstate/streamoffsets/store.go @@ -0,0 +1,94 @@ +// Package streamoffsets implements the Redis-backed adapter for +// `ports.StreamOffsetStore`. +// +// The start-jobs and stop-jobs consumers call Load on startup to +// resume from the persisted offset and Save after every successful +// message handling. Keys are produced by +// `redisstate.Keyspace.StreamOffset`, mirroring the lobby pattern. +package streamoffsets + +import ( + "context" + "errors" + "fmt" + "strings" + + "galaxy/rtmanager/internal/adapters/redisstate" + "galaxy/rtmanager/internal/ports" + + "github.com/redis/go-redis/v9" +) + +// Config configures one Redis-backed stream-offset store instance. The +// store does not own the redis client lifecycle; the caller (typically +// the service runtime) opens and closes it. +type Config struct { + // Client stores the Redis client the store uses for every command. + Client *redis.Client +} + +// Store persists Runtime Manager stream consumer offsets in Redis. +type Store struct { + client *redis.Client + keys redisstate.Keyspace +} + +// New constructs one Redis-backed stream-offset store from cfg. +func New(cfg Config) (*Store, error) { + if cfg.Client == nil { + return nil, errors.New("new rtmanager stream offset store: nil redis client") + } + return &Store{ + client: cfg.Client, + keys: redisstate.Keyspace{}, + }, nil +} + +// Load returns the last processed entry id for streamLabel when one is +// stored. A missing key returns ("", false, nil). +func (store *Store) Load(ctx context.Context, streamLabel string) (string, bool, error) { + if store == nil || store.client == nil { + return "", false, errors.New("load rtmanager stream offset: nil store") + } + if ctx == nil { + return "", false, errors.New("load rtmanager stream offset: nil context") + } + if strings.TrimSpace(streamLabel) == "" { + return "", false, errors.New("load rtmanager stream offset: stream label must not be empty") + } + + value, err := store.client.Get(ctx, store.keys.StreamOffset(streamLabel)).Result() + switch { + case errors.Is(err, redis.Nil): + return "", false, nil + case err != nil: + return "", false, fmt.Errorf("load rtmanager stream offset: %w", err) + } + return value, true, nil +} + +// Save stores entryID as the new offset for streamLabel. The key has no +// TTL — offsets are durable and only overwritten by subsequent Saves. 
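+//
+// Illustrative consumer flow (sketch; the stream-reading loop and the
+// entryID value come from the consumer, not from this package):
+//
+//	offset, found, err := store.Load(ctx, "start_jobs")
+//	if err != nil { /* handle */ }
+//	if !found {
+//		offset = "0" // no prior offset: start from the beginning
+//	}
+//	// ... read and handle the next entry after offset, then persist:
+//	err = store.Save(ctx, "start_jobs", entryID)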
+func (store *Store) Save(ctx context.Context, streamLabel, entryID string) error { + if store == nil || store.client == nil { + return errors.New("save rtmanager stream offset: nil store") + } + if ctx == nil { + return errors.New("save rtmanager stream offset: nil context") + } + if strings.TrimSpace(streamLabel) == "" { + return errors.New("save rtmanager stream offset: stream label must not be empty") + } + if strings.TrimSpace(entryID) == "" { + return errors.New("save rtmanager stream offset: entry id must not be empty") + } + + if err := store.client.Set(ctx, store.keys.StreamOffset(streamLabel), entryID, 0).Err(); err != nil { + return fmt.Errorf("save rtmanager stream offset: %w", err) + } + return nil +} + +// Ensure Store satisfies the ports.StreamOffsetStore interface at +// compile time. +var _ ports.StreamOffsetStore = (*Store)(nil) diff --git a/rtmanager/internal/adapters/redisstate/streamoffsets/store_test.go b/rtmanager/internal/adapters/redisstate/streamoffsets/store_test.go new file mode 100644 index 0000000..970ce22 --- /dev/null +++ b/rtmanager/internal/adapters/redisstate/streamoffsets/store_test.go @@ -0,0 +1,86 @@ +package streamoffsets_test + +import ( + "context" + "testing" + + "galaxy/rtmanager/internal/adapters/redisstate" + "galaxy/rtmanager/internal/adapters/redisstate/streamoffsets" + + "github.com/alicebob/miniredis/v2" + "github.com/redis/go-redis/v9" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func newOffsetStore(t *testing.T) (*streamoffsets.Store, *miniredis.Miniredis) { + t.Helper() + server := miniredis.RunT(t) + client := redis.NewClient(&redis.Options{Addr: server.Addr()}) + t.Cleanup(func() { _ = client.Close() }) + + store, err := streamoffsets.New(streamoffsets.Config{Client: client}) + require.NoError(t, err) + return store, server +} + +func TestNewRejectsNilClient(t *testing.T) { + _, err := streamoffsets.New(streamoffsets.Config{}) + require.Error(t, err) +} + +func TestLoadMissingReturnsNotFound(t *testing.T) { + store, _ := newOffsetStore(t) + + id, found, err := store.Load(context.Background(), "start_jobs") + require.NoError(t, err) + assert.False(t, found) + assert.Empty(t, id) +} + +func TestSaveLoadRoundTrip(t *testing.T) { + store, server := newOffsetStore(t) + + require.NoError(t, store.Save(context.Background(), "start_jobs", "1700000000000-0")) + + id, found, err := store.Load(context.Background(), "start_jobs") + require.NoError(t, err) + assert.True(t, found) + assert.Equal(t, "1700000000000-0", id) + + // The persisted key must follow the rtmanager keyspace prefix. 
+ expectedKey := redisstate.Keyspace{}.StreamOffset("start_jobs") + assert.True(t, server.Exists(expectedKey), + "key %q must exist after Save", expectedKey) +} + +func TestSaveOverwritesPriorValue(t *testing.T) { + store, _ := newOffsetStore(t) + + require.NoError(t, store.Save(context.Background(), "start_jobs", "100-0")) + require.NoError(t, store.Save(context.Background(), "start_jobs", "200-0")) + + id, found, err := store.Load(context.Background(), "start_jobs") + require.NoError(t, err) + assert.True(t, found) + assert.Equal(t, "200-0", id) +} + +func TestLoadAndSaveRejectInvalidArguments(t *testing.T) { + store, _ := newOffsetStore(t) + + require.Error(t, store.Save(context.Background(), "", "100-0")) + require.Error(t, store.Save(context.Background(), "start_jobs", "")) + + _, _, err := store.Load(context.Background(), "") + require.Error(t, err) +} + +func TestKeyspaceStreamOffsetIsPrefixed(t *testing.T) { + key := redisstate.Keyspace{}.StreamOffset("start_jobs") + assert.NotEmpty(t, key) + assert.Contains(t, key, "rtmanager:stream_offsets:") + // base64url-encoded label must not contain raw colons or spaces. + suffix := key[len("rtmanager:stream_offsets:"):] + assert.NotContains(t, suffix, ":") +} diff --git a/rtmanager/internal/api/internalhttp/conformance_test.go b/rtmanager/internal/api/internalhttp/conformance_test.go new file mode 100644 index 0000000..4a95a8f --- /dev/null +++ b/rtmanager/internal/api/internalhttp/conformance_test.go @@ -0,0 +1,367 @@ +package internalhttp + +import ( + "bytes" + "context" + "errors" + "io" + "net/http" + "net/http/httptest" + "path/filepath" + "runtime" + "strings" + "sync" + "testing" + "time" + + "galaxy/rtmanager/internal/api/internalhttp/handlers" + domainruntime "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/ports" + "galaxy/rtmanager/internal/service/cleanupcontainer" + "galaxy/rtmanager/internal/service/patchruntime" + "galaxy/rtmanager/internal/service/restartruntime" + "galaxy/rtmanager/internal/service/startruntime" + "galaxy/rtmanager/internal/service/stopruntime" + + "github.com/getkin/kin-openapi/openapi3" + "github.com/getkin/kin-openapi/openapi3filter" + "github.com/getkin/kin-openapi/routers" + "github.com/getkin/kin-openapi/routers/legacy" + "github.com/stretchr/testify/require" +) + +// TestInternalRESTConformance loads the OpenAPI specification, drives +// every runtime operation against the live internal HTTP listener +// backed by stub services, and validates each response body against +// the spec via `openapi3filter.ValidateResponse`. The test catches +// drift between the wire shape produced by the handler layer and the +// frozen contract; failure-path response shapes are validated by the +// per-handler tests in `handlers/_test.go`. 
+func TestInternalRESTConformance(t *testing.T) { + t.Parallel() + + doc := loadConformanceSpec(t) + + router, err := legacy.NewRouter(doc) + require.NoError(t, err) + + deps := newConformanceDeps(t) + server, err := NewServer(newConformanceConfig(), Dependencies{ + Logger: nil, + Telemetry: nil, + Readiness: nil, + RuntimeRecords: deps.records, + StartRuntime: deps.start, + StopRuntime: deps.stop, + RestartRuntime: deps.restart, + PatchRuntime: deps.patch, + CleanupContainer: deps.cleanup, + }) + require.NoError(t, err) + + cases := []conformanceCase{ + { + name: "internalListRuntimes", + method: http.MethodGet, + path: "/api/v1/internal/runtimes", + }, + { + name: "internalGetRuntime", + method: http.MethodGet, + path: "/api/v1/internal/runtimes/" + conformanceGameID, + }, + { + name: "internalStartRuntime", + method: http.MethodPost, + path: "/api/v1/internal/runtimes/" + conformanceGameID + "/start", + contentType: "application/json", + body: `{"image_ref":"galaxy/game:v1.2.3"}`, + }, + { + name: "internalStopRuntime", + method: http.MethodPost, + path: "/api/v1/internal/runtimes/" + conformanceGameID + "/stop", + contentType: "application/json", + body: `{"reason":"admin_request"}`, + }, + { + name: "internalRestartRuntime", + method: http.MethodPost, + path: "/api/v1/internal/runtimes/" + conformanceGameID + "/restart", + }, + { + name: "internalPatchRuntime", + method: http.MethodPost, + path: "/api/v1/internal/runtimes/" + conformanceGameID + "/patch", + contentType: "application/json", + body: `{"image_ref":"galaxy/game:v1.2.4"}`, + }, + { + name: "internalCleanupRuntimeContainer", + method: http.MethodDelete, + path: "/api/v1/internal/runtimes/" + conformanceGameID + "/container", + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + runConformanceCase(t, server.handler, router, tc) + }) + } +} + +// conformanceGameID is the path variable used for every per-game +// conformance request. +const conformanceGameID = "game-conformance" + +// conformanceServerURL mirrors the canonical `servers[0].url` entry in +// `rtmanager/api/internal-openapi.yaml`. The legacy router matches +// requests against this prefix; updating the spec's server URL +// requires updating this constant. +const conformanceServerURL = "http://localhost:8096" + +// conformanceCase describes one request the conformance test drives. +type conformanceCase struct { + name string + method string + path string + contentType string + body string +} + +func runConformanceCase(t *testing.T, handler http.Handler, router routers.Router, tc conformanceCase) { + t.Helper() + + // Drive the handler with the path-only form so the listener's + // http.ServeMux matches the registered routes (which use raw paths, + // without the OpenAPI server URL prefix). + var bodyReader io.Reader + if tc.body != "" { + bodyReader = strings.NewReader(tc.body) + } + request := httptest.NewRequest(tc.method, tc.path, bodyReader) + if tc.contentType != "" { + request.Header.Set("Content-Type", tc.contentType) + } + request.Header.Set("X-Galaxy-Caller", "admin") + + recorder := httptest.NewRecorder() + handler.ServeHTTP(recorder, request) + require.Equalf(t, http.StatusOK, recorder.Code, "operation %s returned %d: %s", tc.name, recorder.Code, recorder.Body.String()) + + // kin-openapi's legacy router requires the request URL to match a + // `servers[].url` entry; rebuild the validation request with the + // canonical local server URL declared in the spec. 
+ validationURL := conformanceServerURL + tc.path + validationRequest := httptest.NewRequest(tc.method, validationURL, bodyReaderFor(tc.body)) + if tc.contentType != "" { + validationRequest.Header.Set("Content-Type", tc.contentType) + } + validationRequest.Header.Set("X-Galaxy-Caller", "admin") + + route, pathParams, err := router.FindRoute(validationRequest) + require.NoError(t, err) + + requestInput := &openapi3filter.RequestValidationInput{ + Request: validationRequest, + PathParams: pathParams, + Route: route, + Options: &openapi3filter.Options{ + IncludeResponseStatus: true, + }, + } + require.NoError(t, openapi3filter.ValidateRequest(context.Background(), requestInput)) + + responseInput := &openapi3filter.ResponseValidationInput{ + RequestValidationInput: requestInput, + Status: recorder.Code, + Header: recorder.Header(), + Options: &openapi3filter.Options{ + IncludeResponseStatus: true, + }, + } + responseInput.SetBodyBytes(recorder.Body.Bytes()) + require.NoError(t, openapi3filter.ValidateResponse(context.Background(), responseInput)) +} + +func loadConformanceSpec(t *testing.T) *openapi3.T { + t.Helper() + + _, thisFile, _, ok := runtime.Caller(0) + require.True(t, ok) + + specPath := filepath.Join(filepath.Dir(thisFile), "..", "..", "..", "api", "internal-openapi.yaml") + loader := openapi3.NewLoader() + doc, err := loader.LoadFromFile(specPath) + require.NoError(t, err) + require.NoError(t, doc.Validate(context.Background())) + return doc +} + +func bodyReaderFor(raw string) io.Reader { + if raw == "" { + return http.NoBody + } + return bytes.NewBufferString(raw) +} + +// conformanceDeps groups the stub collaborators handed to the listener. +type conformanceDeps struct { + records *conformanceRecords + start *conformanceStart + stop *conformanceStop + restart *conformanceRestart + patch *conformancePatch + cleanup *conformanceCleanup +} + +func newConformanceDeps(t *testing.T) *conformanceDeps { + t.Helper() + return &conformanceDeps{ + records: newConformanceRecords(), + start: &conformanceStart{}, + stop: &conformanceStop{}, + restart: &conformanceRestart{}, + patch: &conformancePatch{}, + cleanup: &conformanceCleanup{}, + } +} + +func newConformanceConfig() Config { + return Config{ + Addr: ":0", + ReadHeaderTimeout: time.Second, + ReadTimeout: time.Second, + WriteTimeout: time.Second, + IdleTimeout: time.Second, + } +} + +// conformanceRecord builds a canonical running record used by every +// stub service. +func conformanceRecord() domainruntime.RuntimeRecord { + started := time.Date(2026, 4, 26, 13, 0, 0, 0, time.UTC) + return domainruntime.RuntimeRecord{ + GameID: conformanceGameID, + Status: domainruntime.StatusRunning, + CurrentContainerID: "container-conformance", + CurrentImageRef: "galaxy/game:v1.2.3", + EngineEndpoint: "http://galaxy-game-" + conformanceGameID + ":8080", + StatePath: "/var/lib/galaxy/" + conformanceGameID, + DockerNetwork: "galaxy-engine", + StartedAt: &started, + LastOpAt: started, + CreatedAt: started, + } +} + +// conformanceRecords is an in-memory record store seeded with one +// canonical record so the get / list endpoints have something to +// return. 
+type conformanceRecords struct { + mu sync.Mutex + stored map[string]domainruntime.RuntimeRecord +} + +func newConformanceRecords() *conformanceRecords { + return &conformanceRecords{ + stored: map[string]domainruntime.RuntimeRecord{ + conformanceGameID: conformanceRecord(), + }, + } +} + +func (s *conformanceRecords) Get(_ context.Context, gameID string) (domainruntime.RuntimeRecord, error) { + s.mu.Lock() + defer s.mu.Unlock() + record, ok := s.stored[gameID] + if !ok { + return domainruntime.RuntimeRecord{}, domainruntime.ErrNotFound + } + return record, nil +} + +func (s *conformanceRecords) Upsert(_ context.Context, _ domainruntime.RuntimeRecord) error { + return errors.New("not used in conformance test") +} + +func (s *conformanceRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error { + return errors.New("not used in conformance test") +} + +func (s *conformanceRecords) ListByStatus(_ context.Context, _ domainruntime.Status) ([]domainruntime.RuntimeRecord, error) { + return nil, errors.New("not used in conformance test") +} + +func (s *conformanceRecords) List(_ context.Context) ([]domainruntime.RuntimeRecord, error) { + s.mu.Lock() + defer s.mu.Unlock() + out := make([]domainruntime.RuntimeRecord, 0, len(s.stored)) + for _, record := range s.stored { + out = append(out, record) + } + return out, nil +} + +// conformanceStart is the stub StartService used by the conformance +// test. Every Handle call returns the canonical record. +type conformanceStart struct{} + +func (s *conformanceStart) Handle(_ context.Context, _ startruntime.Input) (startruntime.Result, error) { + return startruntime.Result{ + Record: conformanceRecord(), + Outcome: "success", + }, nil +} + +type conformanceStop struct{} + +func (s *conformanceStop) Handle(_ context.Context, _ stopruntime.Input) (stopruntime.Result, error) { + rec := conformanceRecord() + rec.Status = domainruntime.StatusStopped + stopped := rec.LastOpAt.Add(time.Second) + rec.StoppedAt = &stopped + rec.LastOpAt = stopped + return stopruntime.Result{Record: rec, Outcome: "success"}, nil +} + +type conformanceRestart struct{} + +func (s *conformanceRestart) Handle(_ context.Context, _ restartruntime.Input) (restartruntime.Result, error) { + return restartruntime.Result{Record: conformanceRecord(), Outcome: "success"}, nil +} + +type conformancePatch struct{} + +func (s *conformancePatch) Handle(_ context.Context, in patchruntime.Input) (patchruntime.Result, error) { + rec := conformanceRecord() + if in.NewImageRef != "" { + rec.CurrentImageRef = in.NewImageRef + } + return patchruntime.Result{Record: rec, Outcome: "success"}, nil +} + +type conformanceCleanup struct{} + +func (s *conformanceCleanup) Handle(_ context.Context, _ cleanupcontainer.Input) (cleanupcontainer.Result, error) { + rec := conformanceRecord() + rec.Status = domainruntime.StatusRemoved + rec.CurrentContainerID = "" + removed := rec.LastOpAt.Add(time.Minute) + rec.RemovedAt = &removed + rec.LastOpAt = removed + return cleanupcontainer.Result{Record: rec, Outcome: "success"}, nil +} + +// Compile-time guards: the stubs must satisfy the handler-level +// service ports plus ports.RuntimeRecordStore so the listener accepts +// them. 
+var ( + _ handlers.StartService = (*conformanceStart)(nil) + _ handlers.StopService = (*conformanceStop)(nil) + _ handlers.RestartService = (*conformanceRestart)(nil) + _ handlers.PatchService = (*conformancePatch)(nil) + _ handlers.CleanupService = (*conformanceCleanup)(nil) + _ ports.RuntimeRecordStore = (*conformanceRecords)(nil) +) diff --git a/rtmanager/internal/api/internalhttp/handlers/cleanup.go b/rtmanager/internal/api/internalhttp/handlers/cleanup.go new file mode 100644 index 0000000..7e10dbf --- /dev/null +++ b/rtmanager/internal/api/internalhttp/handlers/cleanup.go @@ -0,0 +1,55 @@ +package handlers + +import ( + "net/http" + + "galaxy/rtmanager/internal/domain/operation" + "galaxy/rtmanager/internal/service/cleanupcontainer" + "galaxy/rtmanager/internal/service/startruntime" +) + +// newCleanupHandler returns the handler for +// `DELETE /api/v1/internal/runtimes/{game_id}/container`. The OpenAPI +// spec declares no request body for this operation; any client-provided +// body is ignored. +func newCleanupHandler(deps Dependencies) http.HandlerFunc { + logger := loggerFor(deps.Logger, "internal_rest.cleanup") + return func(writer http.ResponseWriter, request *http.Request) { + if deps.CleanupContainer == nil { + writeError(writer, http.StatusInternalServerError, + startruntime.ErrorCodeInternal, + "cleanup container service is not wired", + ) + return + } + + gameID, ok := extractGameID(writer, request) + if !ok { + return + } + + result, err := deps.CleanupContainer.Handle(request.Context(), cleanupcontainer.Input{ + GameID: gameID, + OpSource: resolveOpSource(request), + SourceRef: requestSourceRef(request), + }) + if err != nil { + logger.ErrorContext(request.Context(), "cleanup container service errored", + "game_id", gameID, + "err", err.Error(), + ) + writeError(writer, http.StatusInternalServerError, + startruntime.ErrorCodeInternal, + "cleanup container service failed", + ) + return + } + + if result.Outcome == operation.OutcomeFailure { + writeFailure(writer, result.ErrorCode, result.ErrorMessage) + return + } + + writeJSON(writer, http.StatusOK, encodeRuntimeRecord(result.Record)) + } +} diff --git a/rtmanager/internal/api/internalhttp/handlers/common.go b/rtmanager/internal/api/internalhttp/handlers/common.go new file mode 100644 index 0000000..ba076cd --- /dev/null +++ b/rtmanager/internal/api/internalhttp/handlers/common.go @@ -0,0 +1,238 @@ +package handlers + +import ( + "encoding/json" + "errors" + "io" + "log/slog" + "net/http" + "strings" + "time" + + "galaxy/rtmanager/internal/domain/operation" + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/service/startruntime" +) + +// JSONContentType is the Content-Type used by every internal REST +// response. Exported so the listener-level tests can match it without +// re-declaring the constant. +const JSONContentType = "application/json; charset=utf-8" + +// gameIDPathParam is the name of the {game_id} path variable shared by +// every per-game runtime endpoint. +const gameIDPathParam = "game_id" + +// callerHeader is the HTTP header that distinguishes Game Master from +// Admin Service in the operation log. Documented in +// `rtmanager/api/internal-openapi.yaml` and +// `rtmanager/docs/services.md` §18. +const callerHeader = "X-Galaxy-Caller" + +// errorCodeDockerUnavailable mirrors the OpenAPI error code value. 
The +// lifecycle services do not currently emit it (they use +// `service_unavailable` for Docker daemon failures); the handler layer +// maps it to 503 anyway so future producers do not require a handler +// change. +const errorCodeDockerUnavailable = "docker_unavailable" + +// errorBody mirrors the `error` element of the OpenAPI ErrorResponse +// schema. +type errorBody struct { + Code string `json:"code"` + Message string `json:"message"` +} + +// errorResponse mirrors the OpenAPI ErrorResponse envelope. +type errorResponse struct { + Error errorBody `json:"error"` +} + +// runtimeRecordResponse mirrors the OpenAPI RuntimeRecord schema. +// Required fields use plain strings; nullable fields use pointers so an +// absent value encodes as the JSON literal `null` (matches the +// `nullable: true` declaration in the spec). Times are RFC3339 UTC. +type runtimeRecordResponse struct { + GameID string `json:"game_id"` + Status string `json:"status"` + CurrentContainerID *string `json:"current_container_id"` + CurrentImageRef *string `json:"current_image_ref"` + EngineEndpoint *string `json:"engine_endpoint"` + StatePath string `json:"state_path"` + DockerNetwork string `json:"docker_network"` + StartedAt *string `json:"started_at"` + StoppedAt *string `json:"stopped_at"` + RemovedAt *string `json:"removed_at"` + LastOpAt string `json:"last_op_at"` + CreatedAt string `json:"created_at"` +} + +// runtimesListResponse mirrors the OpenAPI RuntimesList schema. Items +// is always non-nil so the JSON form carries `[]` rather than `null` +// for an empty result. +type runtimesListResponse struct { + Items []runtimeRecordResponse `json:"items"` +} + +// encodeRuntimeRecord turns a domain RuntimeRecord into its wire shape. +func encodeRuntimeRecord(record runtime.RuntimeRecord) runtimeRecordResponse { + resp := runtimeRecordResponse{ + GameID: record.GameID, + Status: string(record.Status), + StatePath: record.StatePath, + DockerNetwork: record.DockerNetwork, + LastOpAt: record.LastOpAt.UTC().Format(time.RFC3339Nano), + CreatedAt: record.CreatedAt.UTC().Format(time.RFC3339Nano), + } + if record.CurrentContainerID != "" { + v := record.CurrentContainerID + resp.CurrentContainerID = &v + } + if record.CurrentImageRef != "" { + v := record.CurrentImageRef + resp.CurrentImageRef = &v + } + if record.EngineEndpoint != "" { + v := record.EngineEndpoint + resp.EngineEndpoint = &v + } + if record.StartedAt != nil { + v := record.StartedAt.UTC().Format(time.RFC3339Nano) + resp.StartedAt = &v + } + if record.StoppedAt != nil { + v := record.StoppedAt.UTC().Format(time.RFC3339Nano) + resp.StoppedAt = &v + } + if record.RemovedAt != nil { + v := record.RemovedAt.UTC().Format(time.RFC3339Nano) + resp.RemovedAt = &v + } + return resp +} + +// encodeRuntimesList builds the wire shape returned by the list handler. +// records may be nil (empty store); the result still carries an empty +// items slice so the JSON form is `{"items":[]}`. +func encodeRuntimesList(records []runtime.RuntimeRecord) runtimesListResponse { + resp := runtimesListResponse{ + Items: make([]runtimeRecordResponse, 0, len(records)), + } + for _, record := range records { + resp.Items = append(resp.Items, encodeRuntimeRecord(record)) + } + return resp +} + +// writeJSON writes payload as a JSON response with the given status code. 
+func writeJSON(writer http.ResponseWriter, statusCode int, payload any) { + writer.Header().Set("Content-Type", JSONContentType) + writer.WriteHeader(statusCode) + _ = json.NewEncoder(writer).Encode(payload) +} + +// writeError writes the canonical error envelope at statusCode. +func writeError(writer http.ResponseWriter, statusCode int, code, message string) { + writeJSON(writer, statusCode, errorResponse{ + Error: errorBody{Code: code, Message: message}, + }) +} + +// writeFailure writes the canonical error envelope using the HTTP +// status mapped from code. Used by every lifecycle handler when its +// service returns `Outcome=failure`. +func writeFailure(writer http.ResponseWriter, code, message string) { + writeError(writer, mapErrorCodeToStatus(code), code, message) +} + +// mapErrorCodeToStatus maps a stable error code to the HTTP status +// declared by `rtmanager/api/internal-openapi.yaml`. Unknown codes +// degrade to 500 so a future error code that ships ahead of its +// handler-layer mapping still produces a structurally valid response. +func mapErrorCodeToStatus(code string) int { + switch code { + case startruntime.ErrorCodeInvalidRequest, + startruntime.ErrorCodeStartConfigInvalid, + startruntime.ErrorCodeImageRefNotSemver: + return http.StatusBadRequest + case startruntime.ErrorCodeNotFound: + return http.StatusNotFound + case startruntime.ErrorCodeConflict, + startruntime.ErrorCodeSemverPatchOnly: + return http.StatusConflict + case startruntime.ErrorCodeServiceUnavailable, + errorCodeDockerUnavailable: + return http.StatusServiceUnavailable + case startruntime.ErrorCodeImagePullFailed, + startruntime.ErrorCodeContainerStartFailed, + startruntime.ErrorCodeInternal: + return http.StatusInternalServerError + default: + return http.StatusInternalServerError + } +} + +// decodeStrictJSON decodes one request body into target with strict +// JSON semantics: unknown fields are rejected and trailing content is +// rejected. Mirrors the helper used by lobby's internal HTTP layer. +func decodeStrictJSON(body io.Reader, target any) error { + decoder := json.NewDecoder(body) + decoder.DisallowUnknownFields() + if err := decoder.Decode(target); err != nil { + return err + } + if decoder.More() { + return errors.New("unexpected trailing content after JSON body") + } + return nil +} + +// extractGameID pulls the {game_id} path variable from request. An empty +// or whitespace-only value writes a `400 invalid_request` and returns +// ok=false so callers can short-circuit. +func extractGameID(writer http.ResponseWriter, request *http.Request) (string, bool) { + raw := request.PathValue(gameIDPathParam) + if strings.TrimSpace(raw) == "" { + writeError(writer, http.StatusBadRequest, + startruntime.ErrorCodeInvalidRequest, + "game id is required", + ) + return "", false + } + return raw, true +} + +// resolveOpSource maps the X-Galaxy-Caller header to an +// `operation.OpSource`. Missing or unknown values default to +// `OpSourceAdminRest`, matching the contract documented in +// `rtmanager/api/internal-openapi.yaml`. +func resolveOpSource(request *http.Request) operation.OpSource { + switch strings.ToLower(strings.TrimSpace(request.Header.Get(callerHeader))) { + case "gm": + return operation.OpSourceGMRest + default: + return operation.OpSourceAdminRest + } +} + +// requestSourceRef returns an opaque per-request reference recorded in +// `operation_log.source_ref`. 
v1 reads the `X-Request-ID` header when +// present so callers may correlate REST requests with audit rows; the +// listener does not currently install a request-id middleware so the +// header path is the only source. +func requestSourceRef(request *http.Request) string { + if v := strings.TrimSpace(request.Header.Get("X-Request-ID")); v != "" { + return v + } + return "" +} + +// loggerFor returns a logger annotated with the operation tag. Each +// handler scopes its logs by op so operators filtering on +// `op=internal_rest.start` see exactly the lifecycle they care about. +func loggerFor(parent *slog.Logger, op string) *slog.Logger { + if parent == nil { + parent = slog.Default() + } + return parent.With("component", "internal_http.handlers", "op", op) +} diff --git a/rtmanager/internal/api/internalhttp/handlers/common_test.go b/rtmanager/internal/api/internalhttp/handlers/common_test.go new file mode 100644 index 0000000..8cababf --- /dev/null +++ b/rtmanager/internal/api/internalhttp/handlers/common_test.go @@ -0,0 +1,197 @@ +package handlers + +import ( + "context" + "encoding/json" + "errors" + "io" + "net/http" + "net/http/httptest" + "strings" + "sync" + "testing" + "time" + + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/ports" + + "github.com/stretchr/testify/require" +) + +// fixedClock is the wall-clock used to build canonical sample records +// across the handler tests. UTC Sunday 1pm 2026-04-26 is far enough in +// the future to be obvious in test output. +var fixedClock = time.Date(2026, 4, 26, 13, 0, 0, 0, time.UTC) + +// sampleRunningRecord returns a canonical running record used by every +// happy-path test in this package. +func sampleRunningRecord(t *testing.T) runtime.RuntimeRecord { + t.Helper() + started := fixedClock + return runtime.RuntimeRecord{ + GameID: "game-test", + Status: runtime.StatusRunning, + CurrentContainerID: "container-test", + CurrentImageRef: "galaxy/game:v1.2.3", + EngineEndpoint: "http://galaxy-game-game-test:8080", + StatePath: "/var/lib/galaxy/game-test", + DockerNetwork: "galaxy-engine", + StartedAt: &started, + LastOpAt: fixedClock, + CreatedAt: fixedClock, + } +} + +// sampleStoppedRecord returns a canonical stopped record useful for +// cleanup-handler and list-handler tests. +func sampleStoppedRecord(t *testing.T) runtime.RuntimeRecord { + t.Helper() + started := fixedClock + stopped := fixedClock.Add(time.Minute) + return runtime.RuntimeRecord{ + GameID: "game-stopped", + Status: runtime.StatusStopped, + CurrentContainerID: "container-stopped", + CurrentImageRef: "galaxy/game:v1.2.3", + EngineEndpoint: "http://galaxy-game-game-stopped:8080", + StatePath: "/var/lib/galaxy/game-stopped", + DockerNetwork: "galaxy-engine", + StartedAt: &started, + StoppedAt: &stopped, + LastOpAt: stopped, + CreatedAt: fixedClock, + } +} + +// drive routes one request through a full mux configured by Register. +// It returns the captured ResponseRecorder so tests can assert on +// status, headers, and body. 
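+// A fresh mux is built on every call, so each case exercises Register end
+// to end rather than reusing a router cached between tests.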
+func drive(t *testing.T, deps Dependencies, method, path string, headers http.Header, body io.Reader) *httptest.ResponseRecorder { + t.Helper() + + mux := http.NewServeMux() + Register(mux, deps) + + request := httptest.NewRequest(method, path, body) + for key, values := range headers { + for _, value := range values { + request.Header.Add(key, value) + } + } + + recorder := httptest.NewRecorder() + mux.ServeHTTP(recorder, request) + return recorder +} + +// decodeRecordResponse asserts that the response carried a 200 with +// the canonical content type and decodes the record body. +func decodeRecordResponse(t *testing.T, rec *httptest.ResponseRecorder) runtimeRecordResponse { + t.Helper() + require.Equalf(t, http.StatusOK, rec.Code, "expected 200, got body: %s", rec.Body.String()) + require.Equal(t, JSONContentType, rec.Header().Get("Content-Type")) + + var resp runtimeRecordResponse + require.NoError(t, json.NewDecoder(rec.Body).Decode(&resp)) + return resp +} + +// decodeErrorBody asserts the canonical error envelope and decodes it. +func decodeErrorBody(t *testing.T, rec *httptest.ResponseRecorder, wantStatus int) errorBody { + t.Helper() + require.Equalf(t, wantStatus, rec.Code, "expected %d, got body: %s", wantStatus, rec.Body.String()) + require.Equal(t, JSONContentType, rec.Header().Get("Content-Type")) + + var resp errorResponse + require.NoError(t, json.NewDecoder(rec.Body).Decode(&resp)) + return resp.Error +} + +// fakeRuntimeRecords is an in-memory ports.RuntimeRecordStore used by +// list / get tests. It is intentionally minimal — services use their +// own fakes in `internal/service//service_test.go` and do not +// share this helper. +type fakeRuntimeRecords struct { + mu sync.Mutex + stored map[string]runtime.RuntimeRecord + listErr error + getErr error +} + +func newFakeRuntimeRecords() *fakeRuntimeRecords { + return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}} +} + +func (s *fakeRuntimeRecords) put(record runtime.RuntimeRecord) { + s.mu.Lock() + defer s.mu.Unlock() + s.stored[record.GameID] = record +} + +func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) { + s.mu.Lock() + defer s.mu.Unlock() + if s.getErr != nil { + return runtime.RuntimeRecord{}, s.getErr + } + record, ok := s.stored[gameID] + if !ok { + return runtime.RuntimeRecord{}, runtime.ErrNotFound + } + return record, nil +} + +func (s *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error { + return errors.New("not used in handler tests") +} + +func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error { + return errors.New("not used in handler tests") +} + +func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) { + return nil, errors.New("not used in handler tests") +} + +func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) { + s.mu.Lock() + defer s.mu.Unlock() + if s.listErr != nil { + return nil, s.listErr + } + if len(s.stored) == 0 { + return nil, nil + } + records := make([]runtime.RuntimeRecord, 0, len(s.stored)) + for _, record := range s.stored { + records = append(records, record) + } + return records, nil +} + +// jsonHeaders returns the default headers used by tests that send a +// JSON body. +func jsonHeaders() http.Header { + h := http.Header{} + h.Set("Content-Type", "application/json") + return h +} + +// withCaller adds the X-Galaxy-Caller header to h and returns h. 
The +// helper exists to keep test cases readable when the header is the +// only difference between two table rows. +func withCaller(h http.Header, value string) http.Header { + if h == nil { + h = http.Header{} + } + h.Set(callerHeader, value) + return h +} + +// strReader builds an io.Reader from raw JSON. +func strReader(raw string) io.Reader { + return strings.NewReader(raw) +} + +// Compile-time assertions that the in-memory fake satisfies the port. +var _ ports.RuntimeRecordStore = (*fakeRuntimeRecords)(nil) diff --git a/rtmanager/internal/api/internalhttp/handlers/get.go b/rtmanager/internal/api/internalhttp/handlers/get.go new file mode 100644 index 0000000..43613cc --- /dev/null +++ b/rtmanager/internal/api/internalhttp/handlers/get.go @@ -0,0 +1,55 @@ +package handlers + +import ( + "errors" + "net/http" + + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/service/startruntime" +) + +// newGetHandler returns the handler for +// `GET /api/v1/internal/runtimes/{game_id}`. The handler reads +// directly from the runtime record store and translates +// `runtime.ErrNotFound` to `404 not_found`. Like list, it does not +// run through the service layer and does not produce an operation_log +// row. +func newGetHandler(deps Dependencies) http.HandlerFunc { + logger := loggerFor(deps.Logger, "internal_rest.get") + return func(writer http.ResponseWriter, request *http.Request) { + if deps.RuntimeRecords == nil { + writeError(writer, http.StatusInternalServerError, + startruntime.ErrorCodeInternal, + "runtime records store is not wired", + ) + return + } + + gameID, ok := extractGameID(writer, request) + if !ok { + return + } + + record, err := deps.RuntimeRecords.Get(request.Context(), gameID) + if errors.Is(err, runtime.ErrNotFound) { + writeError(writer, http.StatusNotFound, + startruntime.ErrorCodeNotFound, + "runtime record not found", + ) + return + } + if err != nil { + logger.ErrorContext(request.Context(), "get runtime record", + "game_id", gameID, + "err", err.Error(), + ) + writeError(writer, http.StatusInternalServerError, + startruntime.ErrorCodeInternal, + "failed to read runtime record", + ) + return + } + + writeJSON(writer, http.StatusOK, encodeRuntimeRecord(record)) + } +} diff --git a/rtmanager/internal/api/internalhttp/handlers/handlers.go b/rtmanager/internal/api/internalhttp/handlers/handlers.go new file mode 100644 index 0000000..1efc3e8 --- /dev/null +++ b/rtmanager/internal/api/internalhttp/handlers/handlers.go @@ -0,0 +1,69 @@ +package handlers + +import ( + "log/slog" + "net/http" + + "galaxy/rtmanager/internal/ports" +) + +// Route paths registered by Register. The values match the operation +// IDs frozen by `rtmanager/api/internal-openapi.yaml` and +// `rtmanager/contract_openapi_test.go`. +const ( + listRuntimesPath = "/api/v1/internal/runtimes" + getRuntimePath = "/api/v1/internal/runtimes/{game_id}" + startRuntimePath = "/api/v1/internal/runtimes/{game_id}/start" + stopRuntimePath = "/api/v1/internal/runtimes/{game_id}/stop" + restartRuntimePath = "/api/v1/internal/runtimes/{game_id}/restart" + patchRuntimePath = "/api/v1/internal/runtimes/{game_id}/patch" + cleanupRuntimePath = "/api/v1/internal/runtimes/{game_id}/container" +) + +// Dependencies bundles the collaborators required to serve the GM/Admin +// REST surface. Any service may be nil for tests that exercise a +// subset of the surface; in that case the unwired routes return +// `500 internal_error` (mirrors lobby's "service is not wired" +// pattern). 
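+//
+// A minimal wiring sketch (the variable names are illustrative only, not
+// part of this change):
+//
+//	mux := http.NewServeMux()
+//	Register(mux, Dependencies{
+//		Logger:         logger,
+//		RuntimeRecords: recordStore,
+//		StartRuntime:   startService,
+//		StopRuntime:    stopService,
+//	})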
+type Dependencies struct { + // Logger receives structured logs scoped per handler. nil falls back + // to slog.Default. + Logger *slog.Logger + + // RuntimeRecords backs the read-only list and get handlers. They do + // not produce operation_log rows because they do not mutate state. + RuntimeRecords ports.RuntimeRecordStore + + // StartRuntime executes the start lifecycle operation. Production + // wiring passes `*startruntime.Service` (the concrete service + // satisfies StartService). + StartRuntime StartService + + // StopRuntime executes the stop lifecycle operation. + StopRuntime StopService + + // RestartRuntime executes the restart lifecycle operation. + RestartRuntime RestartService + + // PatchRuntime executes the patch lifecycle operation. + PatchRuntime PatchService + + // CleanupContainer executes the cleanup_container lifecycle + // operation. + CleanupContainer CleanupService +} + +// Register attaches every internal REST route to mux using deps. Each +// route reads its dependency lazily so a partially-wired Dependencies +// (e.g., a probe-only listener test) does not crash; missing +// dependencies surface as `500 internal_error`. Routes use Go 1.22 +// method-aware mux patterns. +func Register(mux *http.ServeMux, deps Dependencies) { + mux.HandleFunc("GET "+listRuntimesPath, newListHandler(deps)) + mux.HandleFunc("GET "+getRuntimePath, newGetHandler(deps)) + mux.HandleFunc("POST "+startRuntimePath, newStartHandler(deps)) + mux.HandleFunc("POST "+stopRuntimePath, newStopHandler(deps)) + mux.HandleFunc("POST "+restartRuntimePath, newRestartHandler(deps)) + mux.HandleFunc("POST "+patchRuntimePath, newPatchHandler(deps)) + mux.HandleFunc("DELETE "+cleanupRuntimePath, newCleanupHandler(deps)) +} diff --git a/rtmanager/internal/api/internalhttp/handlers/handlers_mutation_test.go b/rtmanager/internal/api/internalhttp/handlers/handlers_mutation_test.go new file mode 100644 index 0000000..69e141b --- /dev/null +++ b/rtmanager/internal/api/internalhttp/handlers/handlers_mutation_test.go @@ -0,0 +1,610 @@ +package handlers + +import ( + "context" + "net/http" + "testing" + + "galaxy/rtmanager/internal/api/internalhttp/handlers/mocks" + "galaxy/rtmanager/internal/domain/operation" + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/service/cleanupcontainer" + "galaxy/rtmanager/internal/service/patchruntime" + "galaxy/rtmanager/internal/service/restartruntime" + "galaxy/rtmanager/internal/service/startruntime" + "galaxy/rtmanager/internal/service/stopruntime" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" +) + +// Tests for the mutating handlers (start, stop, restart, patch, +// cleanup). Each handler delegates to one lifecycle service through a +// narrow `mockgen`-backed interface; the handler layer is responsible +// for input parsing, the `X-Galaxy-Caller` → `op_source` mapping, and +// the canonical `ErrorCode` → HTTP status table documented in +// `rtmanager/docs/services.md` §18. + +// --- start --- + +func TestStartHandlerReturnsRecordOnSuccess(t *testing.T) { + t.Parallel() + ctrl := gomock.NewController(t) + mock := mocks.NewMockStartService(ctrl) + + record := sampleRunningRecord(t) + mock.EXPECT(). + Handle(gomock.Any(), gomock.AssignableToTypeOf(startruntime.Input{})). 
+ DoAndReturn(func(_ context.Context, in startruntime.Input) (startruntime.Result, error) { + assert.Equal(t, "game-test", in.GameID) + assert.Equal(t, "galaxy/game:v1.2.3", in.ImageRef) + assert.Equal(t, operation.OpSourceAdminRest, in.OpSource) + return startruntime.Result{Record: record, Outcome: operation.OutcomeSuccess}, nil + }) + + deps := Dependencies{StartRuntime: mock} + rec := drive(t, deps, http.MethodPost, "/api/v1/internal/runtimes/game-test/start", + jsonHeaders(), + strReader(`{"image_ref":"galaxy/game:v1.2.3"}`), + ) + + resp := decodeRecordResponse(t, rec) + assert.Equal(t, "game-test", resp.GameID) + assert.Equal(t, "running", resp.Status) +} + +func TestStartHandlerReturnsRecordOnReplayNoOp(t *testing.T) { + t.Parallel() + ctrl := gomock.NewController(t) + mock := mocks.NewMockStartService(ctrl) + + record := sampleRunningRecord(t) + mock.EXPECT(). + Handle(gomock.Any(), gomock.Any()). + Return(startruntime.Result{ + Record: record, + Outcome: operation.OutcomeSuccess, + ErrorCode: startruntime.ErrorCodeReplayNoOp, + }, nil) + + rec := drive(t, Dependencies{StartRuntime: mock}, http.MethodPost, + "/api/v1/internal/runtimes/game-test/start", + jsonHeaders(), + strReader(`{"image_ref":"galaxy/game:v1.2.3"}`), + ) + + resp := decodeRecordResponse(t, rec) + assert.Equal(t, "game-test", resp.GameID) +} + +func TestStartHandlerMapsServiceFailures(t *testing.T) { + t.Parallel() + + cases := []struct { + name string + errorCode string + wantStatus int + }{ + {"start_config_invalid", startruntime.ErrorCodeStartConfigInvalid, http.StatusBadRequest}, + {"image_pull_failed", startruntime.ErrorCodeImagePullFailed, http.StatusInternalServerError}, + {"container_start_failed", startruntime.ErrorCodeContainerStartFailed, http.StatusInternalServerError}, + {"conflict", startruntime.ErrorCodeConflict, http.StatusConflict}, + {"service_unavailable", startruntime.ErrorCodeServiceUnavailable, http.StatusServiceUnavailable}, + {"internal_error", startruntime.ErrorCodeInternal, http.StatusInternalServerError}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + ctrl := gomock.NewController(t) + mock := mocks.NewMockStartService(ctrl) + + mock.EXPECT(). + Handle(gomock.Any(), gomock.Any()). 
+ Return(startruntime.Result{ + Outcome: operation.OutcomeFailure, + ErrorCode: tc.errorCode, + ErrorMessage: "synthetic " + tc.name, + }, nil) + + rec := drive(t, Dependencies{StartRuntime: mock}, http.MethodPost, + "/api/v1/internal/runtimes/game-test/start", + jsonHeaders(), + strReader(`{"image_ref":"galaxy/game:v1.2.3"}`), + ) + + body := decodeErrorBody(t, rec, tc.wantStatus) + assert.Equal(t, tc.errorCode, body.Code) + assert.Equal(t, "synthetic "+tc.name, body.Message) + }) + } +} + +func TestStartHandlerRejectsUnknownJSONFields(t *testing.T) { + t.Parallel() + ctrl := gomock.NewController(t) + mock := mocks.NewMockStartService(ctrl) + + rec := drive(t, Dependencies{StartRuntime: mock}, http.MethodPost, + "/api/v1/internal/runtimes/game-test/start", + jsonHeaders(), + strReader(`{"image_ref":"x","extra":"y"}`), + ) + + body := decodeErrorBody(t, rec, http.StatusBadRequest) + assert.Equal(t, "invalid_request", body.Code) +} + +func TestStartHandlerRejectsMalformedJSON(t *testing.T) { + t.Parallel() + ctrl := gomock.NewController(t) + mock := mocks.NewMockStartService(ctrl) + + rec := drive(t, Dependencies{StartRuntime: mock}, http.MethodPost, + "/api/v1/internal/runtimes/game-test/start", + jsonHeaders(), + strReader(`{"image_ref":`), + ) + + body := decodeErrorBody(t, rec, http.StatusBadRequest) + assert.Equal(t, "invalid_request", body.Code) +} + +func TestStartHandlerHonoursXGalaxyCallerHeader(t *testing.T) { + t.Parallel() + + cases := []struct { + header string + want operation.OpSource + hdrLabel string + }{ + {"gm", operation.OpSourceGMRest, "gm"}, + {"GM", operation.OpSourceGMRest, "uppercase gm"}, + {"admin", operation.OpSourceAdminRest, "admin"}, + {"unknown", operation.OpSourceAdminRest, "unknown value"}, + {"", operation.OpSourceAdminRest, "missing header"}, + } + + for _, tc := range cases { + t.Run(tc.hdrLabel, func(t *testing.T) { + t.Parallel() + ctrl := gomock.NewController(t) + mock := mocks.NewMockStartService(ctrl) + + record := sampleRunningRecord(t) + mock.EXPECT(). + Handle(gomock.Any(), gomock.AssignableToTypeOf(startruntime.Input{})). + DoAndReturn(func(_ context.Context, in startruntime.Input) (startruntime.Result, error) { + assert.Equal(t, tc.want, in.OpSource) + return startruntime.Result{Record: record, Outcome: operation.OutcomeSuccess}, nil + }) + + headers := jsonHeaders() + if tc.header != "" { + headers = withCaller(headers, tc.header) + } + rec := drive(t, Dependencies{StartRuntime: mock}, http.MethodPost, + "/api/v1/internal/runtimes/game-test/start", + headers, + strReader(`{"image_ref":"galaxy/game:v1.2.3"}`), + ) + require.Equal(t, http.StatusOK, rec.Code) + }) + } +} + +func TestStartHandlerForwardsXRequestIDAsSourceRef(t *testing.T) { + t.Parallel() + ctrl := gomock.NewController(t) + mock := mocks.NewMockStartService(ctrl) + + mock.EXPECT(). + Handle(gomock.Any(), gomock.AssignableToTypeOf(startruntime.Input{})). 
+ DoAndReturn(func(_ context.Context, in startruntime.Input) (startruntime.Result, error) { + assert.Equal(t, "req-42", in.SourceRef) + return startruntime.Result{Record: sampleRunningRecord(t), Outcome: operation.OutcomeSuccess}, nil + }) + + headers := jsonHeaders() + headers.Set("X-Request-ID", "req-42") + rec := drive(t, Dependencies{StartRuntime: mock}, http.MethodPost, + "/api/v1/internal/runtimes/game-test/start", + headers, + strReader(`{"image_ref":"galaxy/game:v1.2.3"}`), + ) + require.Equal(t, http.StatusOK, rec.Code) +} + +func TestStartHandlerReturnsInternalErrorWhenServiceErrors(t *testing.T) { + t.Parallel() + ctrl := gomock.NewController(t) + mock := mocks.NewMockStartService(ctrl) + + mock.EXPECT(). + Handle(gomock.Any(), gomock.Any()). + Return(startruntime.Result{}, assert.AnError) + + rec := drive(t, Dependencies{StartRuntime: mock}, http.MethodPost, + "/api/v1/internal/runtimes/game-test/start", + jsonHeaders(), + strReader(`{"image_ref":"galaxy/game:v1.2.3"}`), + ) + + body := decodeErrorBody(t, rec, http.StatusInternalServerError) + assert.Equal(t, "internal_error", body.Code) +} + +func TestStartHandlerReturnsInternalErrorWhenServiceNotWired(t *testing.T) { + t.Parallel() + + rec := drive(t, Dependencies{}, http.MethodPost, + "/api/v1/internal/runtimes/game-test/start", + jsonHeaders(), + strReader(`{"image_ref":"galaxy/game:v1.2.3"}`), + ) + + body := decodeErrorBody(t, rec, http.StatusInternalServerError) + assert.Equal(t, "internal_error", body.Code) +} + +// --- stop --- + +func TestStopHandlerReturnsRecordOnSuccess(t *testing.T) { + t.Parallel() + ctrl := gomock.NewController(t) + mock := mocks.NewMockStopService(ctrl) + + record := sampleStoppedRecord(t) + mock.EXPECT(). + Handle(gomock.Any(), gomock.AssignableToTypeOf(stopruntime.Input{})). 
+ DoAndReturn(func(_ context.Context, in stopruntime.Input) (stopruntime.Result, error) { + assert.Equal(t, "game-test", in.GameID) + assert.Equal(t, stopruntime.StopReasonAdminRequest, in.Reason) + assert.Equal(t, operation.OpSourceAdminRest, in.OpSource) + return stopruntime.Result{Record: record, Outcome: operation.OutcomeSuccess}, nil + }) + + rec := drive(t, Dependencies{StopRuntime: mock}, http.MethodPost, + "/api/v1/internal/runtimes/game-test/stop", + jsonHeaders(), + strReader(`{"reason":"admin_request"}`), + ) + + resp := decodeRecordResponse(t, rec) + assert.Equal(t, "stopped", resp.Status) +} + +func TestStopHandlerMapsServiceFailures(t *testing.T) { + t.Parallel() + cases := []struct { + name string + errorCode string + wantStatus int + }{ + {"not_found", startruntime.ErrorCodeNotFound, http.StatusNotFound}, + {"conflict", startruntime.ErrorCodeConflict, http.StatusConflict}, + {"invalid_request", startruntime.ErrorCodeInvalidRequest, http.StatusBadRequest}, + {"service_unavailable", startruntime.ErrorCodeServiceUnavailable, http.StatusServiceUnavailable}, + {"internal_error", startruntime.ErrorCodeInternal, http.StatusInternalServerError}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + ctrl := gomock.NewController(t) + mock := mocks.NewMockStopService(ctrl) + mock.EXPECT().Handle(gomock.Any(), gomock.Any()).Return(stopruntime.Result{ + Outcome: operation.OutcomeFailure, ErrorCode: tc.errorCode, ErrorMessage: tc.name, + }, nil) + + rec := drive(t, Dependencies{StopRuntime: mock}, http.MethodPost, + "/api/v1/internal/runtimes/game-test/stop", + jsonHeaders(), + strReader(`{"reason":"admin_request"}`), + ) + body := decodeErrorBody(t, rec, tc.wantStatus) + assert.Equal(t, tc.errorCode, body.Code) + }) + } +} + +func TestStopHandlerRejectsUnknownJSONFields(t *testing.T) { + t.Parallel() + ctrl := gomock.NewController(t) + mock := mocks.NewMockStopService(ctrl) + + rec := drive(t, Dependencies{StopRuntime: mock}, http.MethodPost, + "/api/v1/internal/runtimes/game-test/stop", + jsonHeaders(), + strReader(`{"reason":"admin_request","extra":1}`), + ) + body := decodeErrorBody(t, rec, http.StatusBadRequest) + assert.Equal(t, "invalid_request", body.Code) +} + +func TestStopHandlerHonoursXGalaxyCallerHeader(t *testing.T) { + t.Parallel() + ctrl := gomock.NewController(t) + mock := mocks.NewMockStopService(ctrl) + + mock.EXPECT(). + Handle(gomock.Any(), gomock.AssignableToTypeOf(stopruntime.Input{})). 
+ DoAndReturn(func(_ context.Context, in stopruntime.Input) (stopruntime.Result, error) { + assert.Equal(t, operation.OpSourceGMRest, in.OpSource) + return stopruntime.Result{Record: sampleStoppedRecord(t), Outcome: operation.OutcomeSuccess}, nil + }) + + rec := drive(t, Dependencies{StopRuntime: mock}, http.MethodPost, + "/api/v1/internal/runtimes/game-test/stop", + withCaller(jsonHeaders(), "gm"), + strReader(`{"reason":"cancelled"}`), + ) + require.Equal(t, http.StatusOK, rec.Code) +} + +func TestStopHandlerReturnsInternalErrorWhenServiceNotWired(t *testing.T) { + t.Parallel() + + rec := drive(t, Dependencies{}, http.MethodPost, + "/api/v1/internal/runtimes/game-test/stop", + jsonHeaders(), + strReader(`{"reason":"admin_request"}`), + ) + body := decodeErrorBody(t, rec, http.StatusInternalServerError) + assert.Equal(t, "internal_error", body.Code) +} + +// --- restart --- + +func TestRestartHandlerReturnsRecordOnSuccess(t *testing.T) { + t.Parallel() + ctrl := gomock.NewController(t) + mock := mocks.NewMockRestartService(ctrl) + + record := sampleRunningRecord(t) + mock.EXPECT(). + Handle(gomock.Any(), gomock.AssignableToTypeOf(restartruntime.Input{})). + DoAndReturn(func(_ context.Context, in restartruntime.Input) (restartruntime.Result, error) { + assert.Equal(t, "game-test", in.GameID) + assert.Equal(t, operation.OpSourceAdminRest, in.OpSource) + return restartruntime.Result{Record: record, Outcome: operation.OutcomeSuccess}, nil + }) + + rec := drive(t, Dependencies{RestartRuntime: mock}, http.MethodPost, + "/api/v1/internal/runtimes/game-test/restart", nil, nil, + ) + resp := decodeRecordResponse(t, rec) + assert.Equal(t, "running", resp.Status) +} + +func TestRestartHandlerMapsServiceFailures(t *testing.T) { + t.Parallel() + cases := []struct { + name string + errorCode string + wantStatus int + }{ + {"not_found", startruntime.ErrorCodeNotFound, http.StatusNotFound}, + {"conflict", startruntime.ErrorCodeConflict, http.StatusConflict}, + {"service_unavailable", startruntime.ErrorCodeServiceUnavailable, http.StatusServiceUnavailable}, + {"internal_error", startruntime.ErrorCodeInternal, http.StatusInternalServerError}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + ctrl := gomock.NewController(t) + mock := mocks.NewMockRestartService(ctrl) + mock.EXPECT().Handle(gomock.Any(), gomock.Any()).Return(restartruntime.Result{ + Outcome: operation.OutcomeFailure, ErrorCode: tc.errorCode, ErrorMessage: tc.name, + }, nil) + + rec := drive(t, Dependencies{RestartRuntime: mock}, http.MethodPost, + "/api/v1/internal/runtimes/game-test/restart", nil, nil, + ) + body := decodeErrorBody(t, rec, tc.wantStatus) + assert.Equal(t, tc.errorCode, body.Code) + }) + } +} + +func TestRestartHandlerHonoursXGalaxyCallerHeader(t *testing.T) { + t.Parallel() + ctrl := gomock.NewController(t) + mock := mocks.NewMockRestartService(ctrl) + + mock.EXPECT(). + Handle(gomock.Any(), gomock.AssignableToTypeOf(restartruntime.Input{})). 
+ DoAndReturn(func(_ context.Context, in restartruntime.Input) (restartruntime.Result, error) { + assert.Equal(t, operation.OpSourceGMRest, in.OpSource) + return restartruntime.Result{Record: sampleRunningRecord(t), Outcome: operation.OutcomeSuccess}, nil + }) + + rec := drive(t, Dependencies{RestartRuntime: mock}, http.MethodPost, + "/api/v1/internal/runtimes/game-test/restart", + withCaller(http.Header{}, "gm"), nil, + ) + require.Equal(t, http.StatusOK, rec.Code) +} + +func TestRestartHandlerReturnsInternalErrorWhenServiceNotWired(t *testing.T) { + t.Parallel() + + rec := drive(t, Dependencies{}, http.MethodPost, + "/api/v1/internal/runtimes/game-test/restart", nil, nil, + ) + body := decodeErrorBody(t, rec, http.StatusInternalServerError) + assert.Equal(t, "internal_error", body.Code) +} + +// --- patch --- + +func TestPatchHandlerReturnsRecordOnSuccess(t *testing.T) { + t.Parallel() + ctrl := gomock.NewController(t) + mock := mocks.NewMockPatchService(ctrl) + + record := sampleRunningRecord(t) + mock.EXPECT(). + Handle(gomock.Any(), gomock.AssignableToTypeOf(patchruntime.Input{})). + DoAndReturn(func(_ context.Context, in patchruntime.Input) (patchruntime.Result, error) { + assert.Equal(t, "game-test", in.GameID) + assert.Equal(t, "galaxy/game:v1.2.4", in.NewImageRef) + return patchruntime.Result{Record: record, Outcome: operation.OutcomeSuccess}, nil + }) + + rec := drive(t, Dependencies{PatchRuntime: mock}, http.MethodPost, + "/api/v1/internal/runtimes/game-test/patch", + jsonHeaders(), + strReader(`{"image_ref":"galaxy/game:v1.2.4"}`), + ) + resp := decodeRecordResponse(t, rec) + assert.Equal(t, "running", resp.Status) +} + +func TestPatchHandlerMapsServiceFailures(t *testing.T) { + t.Parallel() + cases := []struct { + name string + errorCode string + wantStatus int + }{ + {"image_ref_not_semver", startruntime.ErrorCodeImageRefNotSemver, http.StatusBadRequest}, + {"semver_patch_only", startruntime.ErrorCodeSemverPatchOnly, http.StatusConflict}, + {"not_found", startruntime.ErrorCodeNotFound, http.StatusNotFound}, + {"conflict", startruntime.ErrorCodeConflict, http.StatusConflict}, + {"service_unavailable", startruntime.ErrorCodeServiceUnavailable, http.StatusServiceUnavailable}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + ctrl := gomock.NewController(t) + mock := mocks.NewMockPatchService(ctrl) + mock.EXPECT().Handle(gomock.Any(), gomock.Any()).Return(patchruntime.Result{ + Outcome: operation.OutcomeFailure, ErrorCode: tc.errorCode, ErrorMessage: tc.name, + }, nil) + + rec := drive(t, Dependencies{PatchRuntime: mock}, http.MethodPost, + "/api/v1/internal/runtimes/game-test/patch", + jsonHeaders(), + strReader(`{"image_ref":"galaxy/game:v1.2.4"}`), + ) + body := decodeErrorBody(t, rec, tc.wantStatus) + assert.Equal(t, tc.errorCode, body.Code) + }) + } +} + +func TestPatchHandlerRejectsUnknownJSONFields(t *testing.T) { + t.Parallel() + ctrl := gomock.NewController(t) + mock := mocks.NewMockPatchService(ctrl) + + rec := drive(t, Dependencies{PatchRuntime: mock}, http.MethodPost, + "/api/v1/internal/runtimes/game-test/patch", + jsonHeaders(), + strReader(`{"image_ref":"x","unexpected":true}`), + ) + body := decodeErrorBody(t, rec, http.StatusBadRequest) + assert.Equal(t, "invalid_request", body.Code) +} + +func TestPatchHandlerHonoursXGalaxyCallerHeader(t *testing.T) { + t.Parallel() + ctrl := gomock.NewController(t) + mock := mocks.NewMockPatchService(ctrl) + + mock.EXPECT(). 
+ Handle(gomock.Any(), gomock.AssignableToTypeOf(patchruntime.Input{})). + DoAndReturn(func(_ context.Context, in patchruntime.Input) (patchruntime.Result, error) { + assert.Equal(t, operation.OpSourceGMRest, in.OpSource) + return patchruntime.Result{Record: sampleRunningRecord(t), Outcome: operation.OutcomeSuccess}, nil + }) + + rec := drive(t, Dependencies{PatchRuntime: mock}, http.MethodPost, + "/api/v1/internal/runtimes/game-test/patch", + withCaller(jsonHeaders(), "gm"), + strReader(`{"image_ref":"galaxy/game:v1.2.4"}`), + ) + require.Equal(t, http.StatusOK, rec.Code) +} + +func TestPatchHandlerReturnsInternalErrorWhenServiceNotWired(t *testing.T) { + t.Parallel() + + rec := drive(t, Dependencies{}, http.MethodPost, + "/api/v1/internal/runtimes/game-test/patch", + jsonHeaders(), + strReader(`{"image_ref":"galaxy/game:v1.2.4"}`), + ) + body := decodeErrorBody(t, rec, http.StatusInternalServerError) + assert.Equal(t, "internal_error", body.Code) +} + +// --- cleanup --- + +func TestCleanupHandlerReturnsRecordOnSuccess(t *testing.T) { + t.Parallel() + ctrl := gomock.NewController(t) + mock := mocks.NewMockCleanupService(ctrl) + + record := sampleStoppedRecord(t) + record.Status = runtime.StatusRemoved + record.CurrentContainerID = "" + removed := record.LastOpAt + record.RemovedAt = &removed + + mock.EXPECT(). + Handle(gomock.Any(), gomock.AssignableToTypeOf(cleanupcontainer.Input{})). + DoAndReturn(func(_ context.Context, in cleanupcontainer.Input) (cleanupcontainer.Result, error) { + assert.Equal(t, "game-stopped", in.GameID) + assert.Equal(t, operation.OpSourceAdminRest, in.OpSource) + return cleanupcontainer.Result{Record: record, Outcome: operation.OutcomeSuccess}, nil + }) + + rec := drive(t, Dependencies{CleanupContainer: mock}, http.MethodDelete, + "/api/v1/internal/runtimes/game-stopped/container", nil, nil, + ) + resp := decodeRecordResponse(t, rec) + assert.Equal(t, "removed", resp.Status) + assert.Nil(t, resp.CurrentContainerID, "container id must be null after cleanup") +} + +func TestCleanupHandlerMapsServiceFailures(t *testing.T) { + t.Parallel() + cases := []struct { + name string + errorCode string + wantStatus int + }{ + {"not_found", startruntime.ErrorCodeNotFound, http.StatusNotFound}, + {"conflict", startruntime.ErrorCodeConflict, http.StatusConflict}, + {"service_unavailable", startruntime.ErrorCodeServiceUnavailable, http.StatusServiceUnavailable}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + ctrl := gomock.NewController(t) + mock := mocks.NewMockCleanupService(ctrl) + mock.EXPECT().Handle(gomock.Any(), gomock.Any()).Return(cleanupcontainer.Result{ + Outcome: operation.OutcomeFailure, ErrorCode: tc.errorCode, ErrorMessage: tc.name, + }, nil) + + rec := drive(t, Dependencies{CleanupContainer: mock}, http.MethodDelete, + "/api/v1/internal/runtimes/game-test/container", nil, nil, + ) + body := decodeErrorBody(t, rec, tc.wantStatus) + assert.Equal(t, tc.errorCode, body.Code) + }) + } +} + +func TestCleanupHandlerReturnsInternalErrorWhenServiceNotWired(t *testing.T) { + t.Parallel() + + rec := drive(t, Dependencies{}, http.MethodDelete, + "/api/v1/internal/runtimes/game-test/container", nil, nil, + ) + body := decodeErrorBody(t, rec, http.StatusInternalServerError) + assert.Equal(t, "internal_error", body.Code) +} diff --git a/rtmanager/internal/api/internalhttp/handlers/handlers_read_test.go b/rtmanager/internal/api/internalhttp/handlers/handlers_read_test.go new file mode 100644 index 0000000..c256530 --- /dev/null +++ 
b/rtmanager/internal/api/internalhttp/handlers/handlers_read_test.go @@ -0,0 +1,115 @@ +package handlers + +import ( + "encoding/json" + "errors" + "net/http" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// Tests for the read-only handlers (`internalListRuntimes`, +// `internalGetRuntime`). These bypass the service layer and read +// directly from `ports.RuntimeRecordStore` — see +// `rtmanager/docs/services.md` §18. + +func TestListHandlerReturnsEmptyItemsForEmptyStore(t *testing.T) { + t.Parallel() + + deps := Dependencies{RuntimeRecords: newFakeRuntimeRecords()} + rec := drive(t, deps, http.MethodGet, "/api/v1/internal/runtimes", nil, nil) + + require.Equal(t, http.StatusOK, rec.Code) + require.Equal(t, JSONContentType, rec.Header().Get("Content-Type")) + + var resp runtimesListResponse + require.NoError(t, json.NewDecoder(rec.Body).Decode(&resp)) + require.NotNil(t, resp.Items, "items must never be nil") + assert.Empty(t, resp.Items) +} + +func TestListHandlerReturnsEveryStoredRecord(t *testing.T) { + t.Parallel() + + store := newFakeRuntimeRecords() + store.put(sampleRunningRecord(t)) + store.put(sampleStoppedRecord(t)) + + rec := drive(t, Dependencies{RuntimeRecords: store}, http.MethodGet, "/api/v1/internal/runtimes", nil, nil) + require.Equal(t, http.StatusOK, rec.Code) + + var resp runtimesListResponse + require.NoError(t, json.NewDecoder(rec.Body).Decode(&resp)) + require.Len(t, resp.Items, 2) + + gotIDs := map[string]string{} + for _, item := range resp.Items { + gotIDs[item.GameID] = item.Status + } + assert.Equal(t, "running", gotIDs["game-test"]) + assert.Equal(t, "stopped", gotIDs["game-stopped"]) +} + +func TestListHandlerReturnsInternalErrorWhenStoreFails(t *testing.T) { + t.Parallel() + + store := newFakeRuntimeRecords() + store.listErr = errors.New("postgres exploded") + + rec := drive(t, Dependencies{RuntimeRecords: store}, http.MethodGet, "/api/v1/internal/runtimes", nil, nil) + body := decodeErrorBody(t, rec, http.StatusInternalServerError) + assert.Equal(t, "internal_error", body.Code) +} + +func TestListHandlerReturnsInternalErrorWhenStoreNotWired(t *testing.T) { + t.Parallel() + + rec := drive(t, Dependencies{}, http.MethodGet, "/api/v1/internal/runtimes", nil, nil) + body := decodeErrorBody(t, rec, http.StatusInternalServerError) + assert.Equal(t, "internal_error", body.Code) +} + +func TestGetHandlerReturnsTheRecord(t *testing.T) { + t.Parallel() + + store := newFakeRuntimeRecords() + record := sampleRunningRecord(t) + store.put(record) + + rec := drive(t, Dependencies{RuntimeRecords: store}, http.MethodGet, "/api/v1/internal/runtimes/game-test", nil, nil) + resp := decodeRecordResponse(t, rec) + assert.Equal(t, "game-test", resp.GameID) + assert.Equal(t, "running", resp.Status) + if assert.NotNil(t, resp.CurrentImageRef) { + assert.Equal(t, "galaxy/game:v1.2.3", *resp.CurrentImageRef) + } +} + +func TestGetHandlerReturnsNotFoundForMissingRecord(t *testing.T) { + t.Parallel() + + rec := drive(t, Dependencies{RuntimeRecords: newFakeRuntimeRecords()}, http.MethodGet, "/api/v1/internal/runtimes/game-missing", nil, nil) + body := decodeErrorBody(t, rec, http.StatusNotFound) + assert.Equal(t, "not_found", body.Code) +} + +func TestGetHandlerReturnsInternalErrorWhenStoreFails(t *testing.T) { + t.Parallel() + + store := newFakeRuntimeRecords() + store.getErr = errors.New("transport blew up") + + rec := drive(t, Dependencies{RuntimeRecords: store}, http.MethodGet, "/api/v1/internal/runtimes/game-test", nil, nil) + 
body := decodeErrorBody(t, rec, http.StatusInternalServerError) + assert.Equal(t, "internal_error", body.Code) +} + +func TestGetHandlerReturnsInternalErrorWhenStoreNotWired(t *testing.T) { + t.Parallel() + + rec := drive(t, Dependencies{}, http.MethodGet, "/api/v1/internal/runtimes/game-test", nil, nil) + body := decodeErrorBody(t, rec, http.StatusInternalServerError) + assert.Equal(t, "internal_error", body.Code) +} diff --git a/rtmanager/internal/api/internalhttp/handlers/list.go b/rtmanager/internal/api/internalhttp/handlers/list.go new file mode 100644 index 0000000..ad01627 --- /dev/null +++ b/rtmanager/internal/api/internalhttp/handlers/list.go @@ -0,0 +1,38 @@ +package handlers + +import ( + "net/http" + + "galaxy/rtmanager/internal/service/startruntime" +) + +// newListHandler returns the handler for `GET /api/v1/internal/runtimes`. +// The handler reads directly from `ports.RuntimeRecordStore.List` — +// this surface is read-only and does not produce operation_log rows +// (rationale: see `rtmanager/docs/services.md` §18). +func newListHandler(deps Dependencies) http.HandlerFunc { + logger := loggerFor(deps.Logger, "internal_rest.list") + return func(writer http.ResponseWriter, request *http.Request) { + if deps.RuntimeRecords == nil { + writeError(writer, http.StatusInternalServerError, + startruntime.ErrorCodeInternal, + "runtime records store is not wired", + ) + return + } + + records, err := deps.RuntimeRecords.List(request.Context()) + if err != nil { + logger.ErrorContext(request.Context(), "list runtime records", + "err", err.Error(), + ) + writeError(writer, http.StatusInternalServerError, + startruntime.ErrorCodeInternal, + "failed to list runtime records", + ) + return + } + + writeJSON(writer, http.StatusOK, encodeRuntimesList(records)) + } +} diff --git a/rtmanager/internal/api/internalhttp/handlers/mocks/mock_services.go b/rtmanager/internal/api/internalhttp/handlers/mocks/mock_services.go new file mode 100644 index 0000000..80a9877 --- /dev/null +++ b/rtmanager/internal/api/internalhttp/handlers/mocks/mock_services.go @@ -0,0 +1,217 @@ +// Code generated by MockGen. DO NOT EDIT. +// Source: galaxy/rtmanager/internal/api/internalhttp/handlers (interfaces: StartService,StopService,RestartService,PatchService,CleanupService) +// +// Generated by this command: +// +// mockgen -destination=mocks/mock_services.go -package=mocks galaxy/rtmanager/internal/api/internalhttp/handlers StartService,StopService,RestartService,PatchService,CleanupService +// + +// Package mocks is a generated GoMock package. +package mocks + +import ( + context "context" + cleanupcontainer "galaxy/rtmanager/internal/service/cleanupcontainer" + patchruntime "galaxy/rtmanager/internal/service/patchruntime" + restartruntime "galaxy/rtmanager/internal/service/restartruntime" + startruntime "galaxy/rtmanager/internal/service/startruntime" + stopruntime "galaxy/rtmanager/internal/service/stopruntime" + reflect "reflect" + + gomock "go.uber.org/mock/gomock" +) + +// MockStartService is a mock of StartService interface. +type MockStartService struct { + ctrl *gomock.Controller + recorder *MockStartServiceMockRecorder + isgomock struct{} +} + +// MockStartServiceMockRecorder is the mock recorder for MockStartService. +type MockStartServiceMockRecorder struct { + mock *MockStartService +} + +// NewMockStartService creates a new mock instance. 
+func NewMockStartService(ctrl *gomock.Controller) *MockStartService { + mock := &MockStartService{ctrl: ctrl} + mock.recorder = &MockStartServiceMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockStartService) EXPECT() *MockStartServiceMockRecorder { + return m.recorder +} + +// Handle mocks base method. +func (m *MockStartService) Handle(ctx context.Context, in startruntime.Input) (startruntime.Result, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Handle", ctx, in) + ret0, _ := ret[0].(startruntime.Result) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Handle indicates an expected call of Handle. +func (mr *MockStartServiceMockRecorder) Handle(ctx, in any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Handle", reflect.TypeOf((*MockStartService)(nil).Handle), ctx, in) +} + +// MockStopService is a mock of StopService interface. +type MockStopService struct { + ctrl *gomock.Controller + recorder *MockStopServiceMockRecorder + isgomock struct{} +} + +// MockStopServiceMockRecorder is the mock recorder for MockStopService. +type MockStopServiceMockRecorder struct { + mock *MockStopService +} + +// NewMockStopService creates a new mock instance. +func NewMockStopService(ctrl *gomock.Controller) *MockStopService { + mock := &MockStopService{ctrl: ctrl} + mock.recorder = &MockStopServiceMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockStopService) EXPECT() *MockStopServiceMockRecorder { + return m.recorder +} + +// Handle mocks base method. +func (m *MockStopService) Handle(ctx context.Context, in stopruntime.Input) (stopruntime.Result, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Handle", ctx, in) + ret0, _ := ret[0].(stopruntime.Result) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Handle indicates an expected call of Handle. +func (mr *MockStopServiceMockRecorder) Handle(ctx, in any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Handle", reflect.TypeOf((*MockStopService)(nil).Handle), ctx, in) +} + +// MockRestartService is a mock of RestartService interface. +type MockRestartService struct { + ctrl *gomock.Controller + recorder *MockRestartServiceMockRecorder + isgomock struct{} +} + +// MockRestartServiceMockRecorder is the mock recorder for MockRestartService. +type MockRestartServiceMockRecorder struct { + mock *MockRestartService +} + +// NewMockRestartService creates a new mock instance. +func NewMockRestartService(ctrl *gomock.Controller) *MockRestartService { + mock := &MockRestartService{ctrl: ctrl} + mock.recorder = &MockRestartServiceMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockRestartService) EXPECT() *MockRestartServiceMockRecorder { + return m.recorder +} + +// Handle mocks base method. +func (m *MockRestartService) Handle(ctx context.Context, in restartruntime.Input) (restartruntime.Result, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Handle", ctx, in) + ret0, _ := ret[0].(restartruntime.Result) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Handle indicates an expected call of Handle. 
+func (mr *MockRestartServiceMockRecorder) Handle(ctx, in any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Handle", reflect.TypeOf((*MockRestartService)(nil).Handle), ctx, in) +} + +// MockPatchService is a mock of PatchService interface. +type MockPatchService struct { + ctrl *gomock.Controller + recorder *MockPatchServiceMockRecorder + isgomock struct{} +} + +// MockPatchServiceMockRecorder is the mock recorder for MockPatchService. +type MockPatchServiceMockRecorder struct { + mock *MockPatchService +} + +// NewMockPatchService creates a new mock instance. +func NewMockPatchService(ctrl *gomock.Controller) *MockPatchService { + mock := &MockPatchService{ctrl: ctrl} + mock.recorder = &MockPatchServiceMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockPatchService) EXPECT() *MockPatchServiceMockRecorder { + return m.recorder +} + +// Handle mocks base method. +func (m *MockPatchService) Handle(ctx context.Context, in patchruntime.Input) (patchruntime.Result, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Handle", ctx, in) + ret0, _ := ret[0].(patchruntime.Result) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Handle indicates an expected call of Handle. +func (mr *MockPatchServiceMockRecorder) Handle(ctx, in any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Handle", reflect.TypeOf((*MockPatchService)(nil).Handle), ctx, in) +} + +// MockCleanupService is a mock of CleanupService interface. +type MockCleanupService struct { + ctrl *gomock.Controller + recorder *MockCleanupServiceMockRecorder + isgomock struct{} +} + +// MockCleanupServiceMockRecorder is the mock recorder for MockCleanupService. +type MockCleanupServiceMockRecorder struct { + mock *MockCleanupService +} + +// NewMockCleanupService creates a new mock instance. +func NewMockCleanupService(ctrl *gomock.Controller) *MockCleanupService { + mock := &MockCleanupService{ctrl: ctrl} + mock.recorder = &MockCleanupServiceMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockCleanupService) EXPECT() *MockCleanupServiceMockRecorder { + return m.recorder +} + +// Handle mocks base method. +func (m *MockCleanupService) Handle(ctx context.Context, in cleanupcontainer.Input) (cleanupcontainer.Result, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Handle", ctx, in) + ret0, _ := ret[0].(cleanupcontainer.Result) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Handle indicates an expected call of Handle. +func (mr *MockCleanupServiceMockRecorder) Handle(ctx, in any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Handle", reflect.TypeOf((*MockCleanupService)(nil).Handle), ctx, in) +} diff --git a/rtmanager/internal/api/internalhttp/handlers/patch.go b/rtmanager/internal/api/internalhttp/handlers/patch.go new file mode 100644 index 0000000..5ece2a9 --- /dev/null +++ b/rtmanager/internal/api/internalhttp/handlers/patch.go @@ -0,0 +1,71 @@ +package handlers + +import ( + "net/http" + + "galaxy/rtmanager/internal/domain/operation" + "galaxy/rtmanager/internal/service/patchruntime" + "galaxy/rtmanager/internal/service/startruntime" +) + +// patchRequestBody mirrors the OpenAPI PatchRequest schema. 
The +// service layer validates `image_ref` shape (semver, distribution +// reference) and surfaces `image_ref_not_semver` / +// `semver_patch_only` as needed. +type patchRequestBody struct { + ImageRef string `json:"image_ref"` +} + +// newPatchHandler returns the handler for +// `POST /api/v1/internal/runtimes/{game_id}/patch`. +func newPatchHandler(deps Dependencies) http.HandlerFunc { + logger := loggerFor(deps.Logger, "internal_rest.patch") + return func(writer http.ResponseWriter, request *http.Request) { + if deps.PatchRuntime == nil { + writeError(writer, http.StatusInternalServerError, + startruntime.ErrorCodeInternal, + "patch runtime service is not wired", + ) + return + } + + gameID, ok := extractGameID(writer, request) + if !ok { + return + } + + var body patchRequestBody + if err := decodeStrictJSON(request.Body, &body); err != nil { + writeError(writer, http.StatusBadRequest, + startruntime.ErrorCodeInvalidRequest, + err.Error(), + ) + return + } + + result, err := deps.PatchRuntime.Handle(request.Context(), patchruntime.Input{ + GameID: gameID, + NewImageRef: body.ImageRef, + OpSource: resolveOpSource(request), + SourceRef: requestSourceRef(request), + }) + if err != nil { + logger.ErrorContext(request.Context(), "patch runtime service errored", + "game_id", gameID, + "err", err.Error(), + ) + writeError(writer, http.StatusInternalServerError, + startruntime.ErrorCodeInternal, + "patch runtime service failed", + ) + return + } + + if result.Outcome == operation.OutcomeFailure { + writeFailure(writer, result.ErrorCode, result.ErrorMessage) + return + } + + writeJSON(writer, http.StatusOK, encodeRuntimeRecord(result.Record)) + } +} diff --git a/rtmanager/internal/api/internalhttp/handlers/restart.go b/rtmanager/internal/api/internalhttp/handlers/restart.go new file mode 100644 index 0000000..d8f0f07 --- /dev/null +++ b/rtmanager/internal/api/internalhttp/handlers/restart.go @@ -0,0 +1,55 @@ +package handlers + +import ( + "net/http" + + "galaxy/rtmanager/internal/domain/operation" + "galaxy/rtmanager/internal/service/restartruntime" + "galaxy/rtmanager/internal/service/startruntime" +) + +// newRestartHandler returns the handler for +// `POST /api/v1/internal/runtimes/{game_id}/restart`. The OpenAPI spec +// declares no request body for this operation; any client-provided +// body is ignored. 
+func newRestartHandler(deps Dependencies) http.HandlerFunc { + logger := loggerFor(deps.Logger, "internal_rest.restart") + return func(writer http.ResponseWriter, request *http.Request) { + if deps.RestartRuntime == nil { + writeError(writer, http.StatusInternalServerError, + startruntime.ErrorCodeInternal, + "restart runtime service is not wired", + ) + return + } + + gameID, ok := extractGameID(writer, request) + if !ok { + return + } + + result, err := deps.RestartRuntime.Handle(request.Context(), restartruntime.Input{ + GameID: gameID, + OpSource: resolveOpSource(request), + SourceRef: requestSourceRef(request), + }) + if err != nil { + logger.ErrorContext(request.Context(), "restart runtime service errored", + "game_id", gameID, + "err", err.Error(), + ) + writeError(writer, http.StatusInternalServerError, + startruntime.ErrorCodeInternal, + "restart runtime service failed", + ) + return + } + + if result.Outcome == operation.OutcomeFailure { + writeFailure(writer, result.ErrorCode, result.ErrorMessage) + return + } + + writeJSON(writer, http.StatusOK, encodeRuntimeRecord(result.Record)) + } +} diff --git a/rtmanager/internal/api/internalhttp/handlers/services.go b/rtmanager/internal/api/internalhttp/handlers/services.go new file mode 100644 index 0000000..8233f9a --- /dev/null +++ b/rtmanager/internal/api/internalhttp/handlers/services.go @@ -0,0 +1,54 @@ +// Package handlers ships the GM/Admin-facing internal REST surface of +// Runtime Manager. The package is consumed by +// `galaxy/rtmanager/internal/api/internalhttp`; each handler delegates +// to one of the lifecycle services in `internal/service/` +// (`startruntime`, `stopruntime`, `restartruntime`, `patchruntime`, +// `cleanupcontainer`) or reads directly from `ports.RuntimeRecordStore` +// (list / get). +// +// The interfaces declared in this file mirror the single `Handle` +// method exposed by every concrete lifecycle service. Production wiring +// passes the concrete service pointers; tests pass `mockgen`-generated +// mocks. The narrow shape keeps the handler layer free of service +// internals (lease tokens, telemetry, durable side effects) and matches +// the repo-wide `mockgen` convention for wide / recorder ports. +package handlers + +import ( + "context" + + "galaxy/rtmanager/internal/service/cleanupcontainer" + "galaxy/rtmanager/internal/service/patchruntime" + "galaxy/rtmanager/internal/service/restartruntime" + "galaxy/rtmanager/internal/service/startruntime" + "galaxy/rtmanager/internal/service/stopruntime" +) + +//go:generate go run go.uber.org/mock/mockgen -destination=mocks/mock_services.go -package=mocks galaxy/rtmanager/internal/api/internalhttp/handlers StartService,StopService,RestartService,PatchService,CleanupService + +// StartService is the narrow port the start handler depends on. It +// matches the public Handle method of `startruntime.Service`; the +// concrete service satisfies the interface implicitly. +type StartService interface { + Handle(ctx context.Context, in startruntime.Input) (startruntime.Result, error) +} + +// StopService is the narrow port the stop handler depends on. +type StopService interface { + Handle(ctx context.Context, in stopruntime.Input) (stopruntime.Result, error) +} + +// RestartService is the narrow port the restart handler depends on. +type RestartService interface { + Handle(ctx context.Context, in restartruntime.Input) (restartruntime.Result, error) +} + +// PatchService is the narrow port the patch handler depends on. 
+type PatchService interface { + Handle(ctx context.Context, in patchruntime.Input) (patchruntime.Result, error) +} + +// CleanupService is the narrow port the cleanup handler depends on. +type CleanupService interface { + Handle(ctx context.Context, in cleanupcontainer.Input) (cleanupcontainer.Result, error) +} diff --git a/rtmanager/internal/api/internalhttp/handlers/start.go b/rtmanager/internal/api/internalhttp/handlers/start.go new file mode 100644 index 0000000..4ebad76 --- /dev/null +++ b/rtmanager/internal/api/internalhttp/handlers/start.go @@ -0,0 +1,71 @@ +package handlers + +import ( + "net/http" + + "galaxy/rtmanager/internal/domain/operation" + "galaxy/rtmanager/internal/service/startruntime" +) + +// startRequestBody mirrors the OpenAPI StartRequest schema. Only +// `image_ref` is accepted; unknown fields are rejected by +// decodeStrictJSON. +type startRequestBody struct { + ImageRef string `json:"image_ref"` +} + +// newStartHandler returns the handler for +// `POST /api/v1/internal/runtimes/{game_id}/start`. The handler +// delegates the entire lifecycle to `startruntime.Service`; failure +// codes are mapped to HTTP statuses via mapErrorCodeToStatus. +func newStartHandler(deps Dependencies) http.HandlerFunc { + logger := loggerFor(deps.Logger, "internal_rest.start") + return func(writer http.ResponseWriter, request *http.Request) { + if deps.StartRuntime == nil { + writeError(writer, http.StatusInternalServerError, + startruntime.ErrorCodeInternal, + "start runtime service is not wired", + ) + return + } + + gameID, ok := extractGameID(writer, request) + if !ok { + return + } + + var body startRequestBody + if err := decodeStrictJSON(request.Body, &body); err != nil { + writeError(writer, http.StatusBadRequest, + startruntime.ErrorCodeInvalidRequest, + err.Error(), + ) + return + } + + result, err := deps.StartRuntime.Handle(request.Context(), startruntime.Input{ + GameID: gameID, + ImageRef: body.ImageRef, + OpSource: resolveOpSource(request), + SourceRef: requestSourceRef(request), + }) + if err != nil { + logger.ErrorContext(request.Context(), "start runtime service errored", + "game_id", gameID, + "err", err.Error(), + ) + writeError(writer, http.StatusInternalServerError, + startruntime.ErrorCodeInternal, + "start runtime service failed", + ) + return + } + + if result.Outcome == operation.OutcomeFailure { + writeFailure(writer, result.ErrorCode, result.ErrorMessage) + return + } + + writeJSON(writer, http.StatusOK, encodeRuntimeRecord(result.Record)) + } +} diff --git a/rtmanager/internal/api/internalhttp/handlers/stop.go b/rtmanager/internal/api/internalhttp/handlers/stop.go new file mode 100644 index 0000000..c8d3312 --- /dev/null +++ b/rtmanager/internal/api/internalhttp/handlers/stop.go @@ -0,0 +1,70 @@ +package handlers + +import ( + "net/http" + + "galaxy/rtmanager/internal/domain/operation" + "galaxy/rtmanager/internal/service/startruntime" + "galaxy/rtmanager/internal/service/stopruntime" +) + +// stopRequestBody mirrors the OpenAPI StopRequest schema. The reason +// enum is validated at the service layer (`stopruntime.Input.Validate`); +// unknown values surface as `invalid_request`. +type stopRequestBody struct { + Reason string `json:"reason"` +} + +// newStopHandler returns the handler for +// `POST /api/v1/internal/runtimes/{game_id}/stop`. 
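+// A representative request body, matching what the handler tests send, is
+// `{"reason":"admin_request"}`.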
+func newStopHandler(deps Dependencies) http.HandlerFunc { + logger := loggerFor(deps.Logger, "internal_rest.stop") + return func(writer http.ResponseWriter, request *http.Request) { + if deps.StopRuntime == nil { + writeError(writer, http.StatusInternalServerError, + startruntime.ErrorCodeInternal, + "stop runtime service is not wired", + ) + return + } + + gameID, ok := extractGameID(writer, request) + if !ok { + return + } + + var body stopRequestBody + if err := decodeStrictJSON(request.Body, &body); err != nil { + writeError(writer, http.StatusBadRequest, + startruntime.ErrorCodeInvalidRequest, + err.Error(), + ) + return + } + + result, err := deps.StopRuntime.Handle(request.Context(), stopruntime.Input{ + GameID: gameID, + Reason: stopruntime.StopReason(body.Reason), + OpSource: resolveOpSource(request), + SourceRef: requestSourceRef(request), + }) + if err != nil { + logger.ErrorContext(request.Context(), "stop runtime service errored", + "game_id", gameID, + "err", err.Error(), + ) + writeError(writer, http.StatusInternalServerError, + startruntime.ErrorCodeInternal, + "stop runtime service failed", + ) + return + } + + if result.Outcome == operation.OutcomeFailure { + writeFailure(writer, result.ErrorCode, result.ErrorMessage) + return + } + + writeJSON(writer, http.StatusOK, encodeRuntimeRecord(result.Record)) + } +} diff --git a/rtmanager/internal/api/internalhttp/server.go b/rtmanager/internal/api/internalhttp/server.go new file mode 100644 index 0000000..618228f --- /dev/null +++ b/rtmanager/internal/api/internalhttp/server.go @@ -0,0 +1,363 @@ +// Package internalhttp provides the trusted internal HTTP listener used +// by the runnable Runtime Manager process. It exposes `/healthz` and +// `/readyz` plus the GM/Admin REST surface backed by the lifecycle +// services in `internal/service/`. +package internalhttp + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "log/slog" + "net" + "net/http" + "strconv" + "sync" + "time" + + "galaxy/rtmanager/internal/api/internalhttp/handlers" + "galaxy/rtmanager/internal/ports" + "galaxy/rtmanager/internal/telemetry" + + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" + "go.opentelemetry.io/otel/attribute" +) + +const jsonContentType = "application/json; charset=utf-8" + +// errorCodeServiceUnavailable mirrors the stable error code declared in +// `rtmanager/api/internal-openapi.yaml` `§Error Model`. +const errorCodeServiceUnavailable = "service_unavailable" + +// HealthzPath and ReadyzPath are the internal probe routes documented in +// `rtmanager/api/internal-openapi.yaml`. +const ( + HealthzPath = "/healthz" + ReadyzPath = "/readyz" +) + +// ReadinessProbe reports whether the dependencies the listener guards +// (PostgreSQL, Redis, Docker) are reachable. A non-nil error is reported +// to the caller as `503 service_unavailable` with the wrapped message. +type ReadinessProbe interface { + Check(ctx context.Context) error +} + +// Config describes the trusted internal HTTP listener owned by Runtime +// Manager. +type Config struct { + // Addr is the TCP listen address used by the internal HTTP server. + Addr string + + // ReadHeaderTimeout bounds how long the listener may spend reading + // request headers before the server rejects the connection. + ReadHeaderTimeout time.Duration + + // ReadTimeout bounds how long the listener may spend reading one + // request. + ReadTimeout time.Duration + + // WriteTimeout bounds how long the listener may spend writing one + // response. 
+ WriteTimeout time.Duration + + // IdleTimeout bounds how long the listener keeps an idle keep-alive + // connection open. + IdleTimeout time.Duration +} + +// Validate reports whether cfg contains a usable internal HTTP listener +// configuration. +func (cfg Config) Validate() error { + switch { + case cfg.Addr == "": + return errors.New("internal HTTP addr must not be empty") + case cfg.ReadHeaderTimeout <= 0: + return errors.New("internal HTTP read header timeout must be positive") + case cfg.ReadTimeout <= 0: + return errors.New("internal HTTP read timeout must be positive") + case cfg.WriteTimeout <= 0: + return errors.New("internal HTTP write timeout must be positive") + case cfg.IdleTimeout <= 0: + return errors.New("internal HTTP idle timeout must be positive") + default: + return nil + } +} + +// Dependencies describes the collaborators used by the internal HTTP +// transport layer. The listener still works when the lifecycle service +// fields are zero — handlers register but each returns +// `500 internal_error` until the runtime wires the real services. +type Dependencies struct { + // Logger writes structured listener lifecycle logs. When nil, + // slog.Default is used. + Logger *slog.Logger + + // Telemetry records low-cardinality probe metrics and lifecycle + // events. + Telemetry *telemetry.Runtime + + // Readiness reports whether PG / Redis / Docker are reachable. A + // nil readiness probe makes `/readyz` always answer `200`; the + // runtime always supplies a real probe in production wiring. + Readiness ReadinessProbe + + // RuntimeRecords backs the read-only list/get handlers. When nil + // those routes return `500 internal_error`. + RuntimeRecords ports.RuntimeRecordStore + + // StartRuntime, StopRuntime, RestartRuntime, PatchRuntime, and + // CleanupContainer back the lifecycle handlers. Each accepts a + // narrow interface so tests can pass `mockgen`-generated mocks; + // production wiring passes the concrete `*.Service` + // pointer. + StartRuntime handlers.StartService + StopRuntime handlers.StopService + RestartRuntime handlers.RestartService + PatchRuntime handlers.PatchService + CleanupContainer handlers.CleanupService +} + +// Server owns the trusted internal HTTP listener exposed by Runtime +// Manager. +type Server struct { + cfg Config + + handler http.Handler + logger *slog.Logger + metrics *telemetry.Runtime + + stateMu sync.RWMutex + server *http.Server + listener net.Listener +} + +// NewServer constructs one trusted internal HTTP server for cfg and deps. +func NewServer(cfg Config, deps Dependencies) (*Server, error) { + if err := cfg.Validate(); err != nil { + return nil, fmt.Errorf("new internal HTTP server: %w", err) + } + + logger := deps.Logger + if logger == nil { + logger = slog.Default() + } + + return &Server{ + cfg: cfg, + handler: newHandler(deps, logger), + logger: logger.With("component", "internal_http"), + metrics: deps.Telemetry, + }, nil +} + +// Addr returns the currently bound listener address after Run is called. +// It returns an empty string if the server has not yet bound a listener. +func (server *Server) Addr() string { + server.stateMu.RLock() + defer server.stateMu.RUnlock() + if server.listener == nil { + return "" + } + + return server.listener.Addr().String() +} + +// Run binds the configured listener and serves the internal HTTP surface +// until Shutdown closes the server. 
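+//
+// A minimal caller sketch (the surrounding supervision is illustrative,
+// not the real process wiring):
+//
+//	srv, err := NewServer(cfg, deps)
+//	if err != nil {
+//		return err
+//	}
+//	go func() { _ = srv.Run(ctx) }()
+//	// ...later, on shutdown:
+//	_ = srv.Shutdown(shutdownCtx)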
+func (server *Server) Run(ctx context.Context) error { + if ctx == nil { + return errors.New("run internal HTTP server: nil context") + } + if err := ctx.Err(); err != nil { + return err + } + + listener, err := net.Listen("tcp", server.cfg.Addr) + if err != nil { + return fmt.Errorf("run internal HTTP server: listen on %q: %w", server.cfg.Addr, err) + } + + httpServer := &http.Server{ + Handler: server.handler, + ReadHeaderTimeout: server.cfg.ReadHeaderTimeout, + ReadTimeout: server.cfg.ReadTimeout, + WriteTimeout: server.cfg.WriteTimeout, + IdleTimeout: server.cfg.IdleTimeout, + } + + server.stateMu.Lock() + server.server = httpServer + server.listener = listener + server.stateMu.Unlock() + + server.logger.Info("rtmanager internal HTTP server started", "addr", listener.Addr().String()) + + defer func() { + server.stateMu.Lock() + server.server = nil + server.listener = nil + server.stateMu.Unlock() + }() + + err = httpServer.Serve(listener) + switch { + case err == nil: + return nil + case errors.Is(err, http.ErrServerClosed): + server.logger.Info("rtmanager internal HTTP server stopped") + return nil + default: + return fmt.Errorf("run internal HTTP server: serve on %q: %w", server.cfg.Addr, err) + } +} + +// Shutdown gracefully stops the internal HTTP server within ctx. +func (server *Server) Shutdown(ctx context.Context) error { + if ctx == nil { + return errors.New("shutdown internal HTTP server: nil context") + } + + server.stateMu.RLock() + httpServer := server.server + server.stateMu.RUnlock() + + if httpServer == nil { + return nil + } + + if err := httpServer.Shutdown(ctx); err != nil && !errors.Is(err, http.ErrServerClosed) { + return fmt.Errorf("shutdown internal HTTP server: %w", err) + } + + return nil +} + +func newHandler(deps Dependencies, logger *slog.Logger) http.Handler { + mux := http.NewServeMux() + mux.HandleFunc("GET "+HealthzPath, handleHealthz) + mux.HandleFunc("GET "+ReadyzPath, handleReadyz(deps.Readiness, logger)) + + handlers.Register(mux, handlers.Dependencies{ + Logger: logger, + RuntimeRecords: deps.RuntimeRecords, + StartRuntime: deps.StartRuntime, + StopRuntime: deps.StopRuntime, + RestartRuntime: deps.RestartRuntime, + PatchRuntime: deps.PatchRuntime, + CleanupContainer: deps.CleanupContainer, + }) + + metrics := deps.Telemetry + options := []otelhttp.Option{} + if metrics != nil { + options = append(options, + otelhttp.WithTracerProvider(metrics.TracerProvider()), + otelhttp.WithMeterProvider(metrics.MeterProvider()), + ) + } + + return otelhttp.NewHandler(withObservability(mux, metrics), "rtmanager.internal_http", options...) 
+} + +func withObservability(next http.Handler, metrics *telemetry.Runtime) http.Handler { + return http.HandlerFunc(func(writer http.ResponseWriter, request *http.Request) { + startedAt := time.Now() + recorder := &statusRecorder{ + ResponseWriter: writer, + statusCode: http.StatusOK, + } + + next.ServeHTTP(recorder, request) + + route := request.Pattern + switch recorder.statusCode { + case http.StatusMethodNotAllowed: + route = "method_not_allowed" + case http.StatusNotFound: + route = "not_found" + case 0: + route = "unmatched" + } + if route == "" { + route = "unmatched" + } + + if metrics != nil { + metrics.RecordInternalHTTPRequest( + request.Context(), + []attribute.KeyValue{ + attribute.String("route", route), + attribute.String("method", request.Method), + attribute.String("status_code", strconv.Itoa(recorder.statusCode)), + }, + time.Since(startedAt), + ) + } + }) +} + +func handleHealthz(writer http.ResponseWriter, _ *http.Request) { + writeStatusResponse(writer, http.StatusOK, "ok") +} + +func handleReadyz(probe ReadinessProbe, logger *slog.Logger) http.HandlerFunc { + return func(writer http.ResponseWriter, request *http.Request) { + if probe == nil { + writeStatusResponse(writer, http.StatusOK, "ready") + return + } + + if err := probe.Check(request.Context()); err != nil { + logger.WarnContext(request.Context(), "rtmanager readiness probe failed", + "err", err.Error(), + ) + writeServiceUnavailable(writer, err.Error()) + return + } + + writeStatusResponse(writer, http.StatusOK, "ready") + } +} + +func writeStatusResponse(writer http.ResponseWriter, statusCode int, status string) { + writer.Header().Set("Content-Type", jsonContentType) + writer.WriteHeader(statusCode) + _ = json.NewEncoder(writer).Encode(statusResponse{Status: status}) +} + +func writeServiceUnavailable(writer http.ResponseWriter, message string) { + writer.Header().Set("Content-Type", jsonContentType) + writer.WriteHeader(http.StatusServiceUnavailable) + _ = json.NewEncoder(writer).Encode(errorResponse{ + Error: errorBody{ + Code: errorCodeServiceUnavailable, + Message: message, + }, + }) +} + +type statusResponse struct { + Status string `json:"status"` +} + +type errorBody struct { + Code string `json:"code"` + Message string `json:"message"` +} + +type errorResponse struct { + Error errorBody `json:"error"` +} + +type statusRecorder struct { + http.ResponseWriter + statusCode int +} + +func (recorder *statusRecorder) WriteHeader(statusCode int) { + recorder.statusCode = statusCode + recorder.ResponseWriter.WriteHeader(statusCode) +} diff --git a/rtmanager/internal/api/internalhttp/server_test.go b/rtmanager/internal/api/internalhttp/server_test.go new file mode 100644 index 0000000..4576d6b --- /dev/null +++ b/rtmanager/internal/api/internalhttp/server_test.go @@ -0,0 +1,115 @@ +package internalhttp + +import ( + "context" + "encoding/json" + "errors" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func newTestConfig() Config { + return Config{ + Addr: ":0", + ReadHeaderTimeout: time.Second, + ReadTimeout: time.Second, + WriteTimeout: time.Second, + IdleTimeout: time.Second, + } +} + +type stubReadiness struct { + err error +} + +func (probe stubReadiness) Check(_ context.Context) error { + return probe.err +} + +func newTestServer(t *testing.T, deps Dependencies) http.Handler { + t.Helper() + server, err := NewServer(newTestConfig(), deps) + require.NoError(t, err) + return server.handler +} + +func TestHealthzReturnsOK(t *testing.T) { 
+ t.Parallel() + + handler := newTestServer(t, Dependencies{}) + + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodGet, HealthzPath, nil) + handler.ServeHTTP(rec, req) + + require.Equal(t, http.StatusOK, rec.Code) + require.Equal(t, jsonContentType, rec.Header().Get("Content-Type")) + + var body statusResponse + require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &body)) + require.Equal(t, "ok", body.Status) +} + +func TestReadyzReturnsReadyWhenProbeIsNil(t *testing.T) { + t.Parallel() + + handler := newTestServer(t, Dependencies{}) + + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodGet, ReadyzPath, nil) + handler.ServeHTTP(rec, req) + + require.Equal(t, http.StatusOK, rec.Code) + + var body statusResponse + require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &body)) + require.Equal(t, "ready", body.Status) +} + +func TestReadyzReturnsReadyWhenProbeSucceeds(t *testing.T) { + t.Parallel() + + handler := newTestServer(t, Dependencies{Readiness: stubReadiness{}}) + + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodGet, ReadyzPath, nil) + handler.ServeHTTP(rec, req) + + require.Equal(t, http.StatusOK, rec.Code) + + var body statusResponse + require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &body)) + require.Equal(t, "ready", body.Status) +} + +func TestReadyzReturnsServiceUnavailableWhenProbeFails(t *testing.T) { + t.Parallel() + + handler := newTestServer(t, Dependencies{ + Readiness: stubReadiness{err: errors.New("postgres ping: connection refused")}, + }) + + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodGet, ReadyzPath, nil) + handler.ServeHTTP(rec, req) + + require.Equal(t, http.StatusServiceUnavailable, rec.Code) + require.Equal(t, jsonContentType, rec.Header().Get("Content-Type")) + + var body errorResponse + require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &body)) + require.Equal(t, errorCodeServiceUnavailable, body.Error.Code) + require.True(t, strings.Contains(body.Error.Message, "postgres")) +} + +func TestNewServerRejectsInvalidConfig(t *testing.T) { + t.Parallel() + + _, err := NewServer(Config{}, Dependencies{}) + require.Error(t, err) +} diff --git a/rtmanager/internal/app/app.go b/rtmanager/internal/app/app.go new file mode 100644 index 0000000..24fc320 --- /dev/null +++ b/rtmanager/internal/app/app.go @@ -0,0 +1,170 @@ +// Package app wires the Runtime Manager process lifecycle and +// coordinates component startup and graceful shutdown. +package app + +import ( + "context" + "errors" + "fmt" + "sync" + + "galaxy/rtmanager/internal/config" +) + +// Component is a long-lived Runtime Manager subsystem that participates +// in coordinated startup and graceful shutdown. +type Component interface { + // Run starts the component and blocks until it stops. + Run(context.Context) error + + // Shutdown stops the component within the provided timeout-bounded + // context. + Shutdown(context.Context) error +} + +// App owns the process-level lifecycle of Runtime Manager and its +// registered components. +type App struct { + cfg config.Config + components []Component +} + +// New constructs App with a defensive copy of the supplied components. +func New(cfg config.Config, components ...Component) *App { + clonedComponents := append([]Component(nil), components...) + + return &App{ + cfg: cfg, + components: clonedComponents, + } +} + +// Run starts all configured components, waits for cancellation or the +// first component failure, and then executes best-effort graceful +// shutdown. 
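+//
+// A usage sketch (cfg, the components, and the logger are placeholders;
+// real wiring happens in Runtime):
+//
+//	app := New(cfg, internalServer, worker)
+//	if err := app.Run(ctx); err != nil {
+//		logger.Error("rtmanager stopped", "err", err)
+//	}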
+func (app *App) Run(ctx context.Context) error { + if ctx == nil { + return errors.New("run rtmanager app: nil context") + } + if err := app.validate(); err != nil { + return err + } + if len(app.components) == 0 { + <-ctx.Done() + return nil + } + + runCtx, cancel := context.WithCancel(ctx) + defer cancel() + + results := make(chan componentResult, len(app.components)) + var runWaitGroup sync.WaitGroup + + for index, component := range app.components { + runWaitGroup.Add(1) + + go func(componentIndex int, component Component) { + defer runWaitGroup.Done() + results <- componentResult{ + index: componentIndex, + err: component.Run(runCtx), + } + }(index, component) + } + + var runErr error + + select { + case <-ctx.Done(): + case result := <-results: + runErr = classifyComponentResult(ctx, result) + } + + cancel() + + shutdownErr := app.shutdownComponents() + waitErr := app.waitForComponents(&runWaitGroup) + + return errors.Join(runErr, shutdownErr, waitErr) +} + +type componentResult struct { + index int + err error +} + +func (app *App) validate() error { + if app.cfg.ShutdownTimeout <= 0 { + return fmt.Errorf("run rtmanager app: shutdown timeout must be positive, got %s", app.cfg.ShutdownTimeout) + } + + for index, component := range app.components { + if component == nil { + return fmt.Errorf("run rtmanager app: component %d is nil", index) + } + } + + return nil +} + +func classifyComponentResult(parentCtx context.Context, result componentResult) error { + switch { + case result.err == nil: + if parentCtx.Err() != nil { + return nil + } + return fmt.Errorf("run rtmanager app: component %d exited without error before shutdown", result.index) + case errors.Is(result.err, context.Canceled) && parentCtx.Err() != nil: + return nil + default: + return fmt.Errorf("run rtmanager app: component %d: %w", result.index, result.err) + } +} + +func (app *App) shutdownComponents() error { + var shutdownWaitGroup sync.WaitGroup + errs := make(chan error, len(app.components)) + + for index, component := range app.components { + shutdownWaitGroup.Add(1) + + go func(componentIndex int, component Component) { + defer shutdownWaitGroup.Done() + + shutdownCtx, cancel := context.WithTimeout(context.Background(), app.cfg.ShutdownTimeout) + defer cancel() + + if err := component.Shutdown(shutdownCtx); err != nil { + errs <- fmt.Errorf("shutdown rtmanager component %d: %w", componentIndex, err) + } + }(index, component) + } + + shutdownWaitGroup.Wait() + close(errs) + + var joined error + for err := range errs { + joined = errors.Join(joined, err) + } + + return joined +} + +func (app *App) waitForComponents(runWaitGroup *sync.WaitGroup) error { + done := make(chan struct{}) + go func() { + runWaitGroup.Wait() + close(done) + }() + + waitCtx, cancel := context.WithTimeout(context.Background(), app.cfg.ShutdownTimeout) + defer cancel() + + select { + case <-done: + return nil + case <-waitCtx.Done(): + return fmt.Errorf("wait for rtmanager components: %w", waitCtx.Err()) + } +} diff --git a/rtmanager/internal/app/app_test.go b/rtmanager/internal/app/app_test.go new file mode 100644 index 0000000..0284190 --- /dev/null +++ b/rtmanager/internal/app/app_test.go @@ -0,0 +1,137 @@ +package app + +import ( + "context" + "errors" + "sync/atomic" + "testing" + "time" + + "galaxy/rtmanager/internal/config" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +type fakeComponent struct { + runErr error + shutdownErr error + runHook func(context.Context) error + shutdownHook 
func(context.Context) error
+	runCount     atomic.Int32
+	downCount    atomic.Int32
+	blockForCtx  bool
+}
+
+func (component *fakeComponent) Run(ctx context.Context) error {
+	component.runCount.Add(1)
+	if component.runHook != nil {
+		return component.runHook(ctx)
+	}
+	if component.blockForCtx {
+		<-ctx.Done()
+		return ctx.Err()
+	}
+
+	return component.runErr
+}
+
+func (component *fakeComponent) Shutdown(ctx context.Context) error {
+	component.downCount.Add(1)
+	if component.shutdownHook != nil {
+		return component.shutdownHook(ctx)
+	}
+
+	return component.shutdownErr
+}
+
+func newCfg() config.Config {
+	return config.Config{ShutdownTimeout: time.Second}
+}
+
+func TestAppRunWithoutComponentsBlocksUntilContextDone(t *testing.T) {
+	t.Parallel()
+
+	app := New(newCfg())
+
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel()
+
+	require.NoError(t, app.Run(ctx))
+}
+
+func TestAppRunReturnsOnContextCancel(t *testing.T) {
+	t.Parallel()
+
+	component := &fakeComponent{blockForCtx: true}
+	app := New(newCfg(), component)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	go func() {
+		time.Sleep(10 * time.Millisecond)
+		cancel()
+	}()
+
+	require.NoError(t, app.Run(ctx))
+	assert.EqualValues(t, 1, component.runCount.Load())
+	assert.EqualValues(t, 1, component.downCount.Load())
+}
+
+func TestAppRunPropagatesComponentFailure(t *testing.T) {
+	t.Parallel()
+
+	failure := errors.New("boom")
+	component := &fakeComponent{runErr: failure}
+	app := New(newCfg(), component)
+
+	err := app.Run(context.Background())
+	require.Error(t, err)
+	require.ErrorIs(t, err, failure)
+	assert.EqualValues(t, 1, component.downCount.Load())
+}
+
+func TestAppRunFailsOnNilContext(t *testing.T) {
+	t.Parallel()
+
+	app := New(newCfg())
+	var ctx context.Context
+	require.Error(t, app.Run(ctx))
+}
+
+func TestAppRunFailsOnNonPositiveShutdownTimeout(t *testing.T) {
+	t.Parallel()
+
+	app := New(config.Config{}, &fakeComponent{})
+	require.Error(t, app.Run(context.Background()))
+}
+
+func TestAppRunFailsOnNilComponent(t *testing.T) {
+	t.Parallel()
+
+	app := New(newCfg(), nil)
+	require.Error(t, app.Run(context.Background()))
+}
+
+func TestAppRunFlagsCleanExitBeforeShutdown(t *testing.T) {
+	t.Parallel()
+
+	component := &fakeComponent{}
+	app := New(newCfg(), component)
+
+	err := app.Run(context.Background())
+	require.Error(t, err)
+	require.ErrorContains(t, err, "exited without error")
+}
diff --git a/rtmanager/internal/app/bootstrap.go b/rtmanager/internal/app/bootstrap.go
new file mode 100644
index 0000000..630ca49
--- /dev/null
+++ b/rtmanager/internal/app/bootstrap.go
@@ -0,0 +1,85 @@
+package app
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"time"
+
+	"galaxy/redisconn"
+	"galaxy/rtmanager/internal/config"
+	"galaxy/rtmanager/internal/telemetry"
+
+	"github.com/docker/docker/client"
+	"github.com/redis/go-redis/v9"
+)
+
+// newRedisClient builds the master Redis client from cfg via the shared
+// `pkg/redisconn` helper. Replica clients are not opened in this iteration
+// per ARCHITECTURE.md §Persistence Backends; they will be wired when read
+// routing is introduced.
+func newRedisClient(cfg config.RedisConfig) *redis.Client { + return redisconn.NewMasterClient(cfg.Conn) +} + +// instrumentRedisClient attaches the OpenTelemetry tracing and metrics +// instrumentation to client when telemetryRuntime is available. The +// actual instrumentation lives in `pkg/redisconn` so every Galaxy service +// shares one surface. +func instrumentRedisClient(redisClient *redis.Client, telemetryRuntime *telemetry.Runtime) error { + if redisClient == nil { + return errors.New("instrument redis client: nil client") + } + if telemetryRuntime == nil { + return nil + } + return redisconn.Instrument(redisClient, + redisconn.WithTracerProvider(telemetryRuntime.TracerProvider()), + redisconn.WithMeterProvider(telemetryRuntime.MeterProvider()), + ) +} + +// pingRedis performs a single Redis PING bounded by +// cfg.Conn.OperationTimeout to confirm that the configured Redis endpoint +// is reachable at startup. +func pingRedis(ctx context.Context, cfg config.RedisConfig, redisClient *redis.Client) error { + return redisconn.Ping(ctx, redisClient, cfg.Conn.OperationTimeout) +} + +// newDockerClient constructs a Docker SDK client for cfg.Host with an +// optional API version override. The bootstrap layer opens and pings +// the client; the production Docker adapter wraps it for the service +// layer. +func newDockerClient(cfg config.DockerConfig) (*client.Client, error) { + options := []client.Opt{client.WithHost(cfg.Host)} + if cfg.APIVersion == "" { + options = append(options, client.WithAPIVersionNegotiation()) + } else { + options = append(options, client.WithVersion(cfg.APIVersion)) + } + + docker, err := client.NewClientWithOpts(options...) + if err != nil { + return nil, fmt.Errorf("new docker client: %w", err) + } + return docker, nil +} + +// pingDocker bounds one Docker daemon ping under timeout and returns a +// wrapped error so startup failures are easy to spot in service logs. 
+func pingDocker(ctx context.Context, dockerClient *client.Client, timeout time.Duration) error { + if dockerClient == nil { + return errors.New("ping docker: nil client") + } + if timeout <= 0 { + return errors.New("ping docker: timeout must be positive") + } + + pingCtx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + if _, err := dockerClient.Ping(pingCtx); err != nil { + return fmt.Errorf("ping docker: %w", err) + } + return nil +} diff --git a/rtmanager/internal/app/bootstrap_test.go b/rtmanager/internal/app/bootstrap_test.go new file mode 100644 index 0000000..56ad64d --- /dev/null +++ b/rtmanager/internal/app/bootstrap_test.go @@ -0,0 +1,82 @@ +package app + +import ( + "context" + "testing" + "time" + + "galaxy/redisconn" + "galaxy/rtmanager/internal/config" + + "github.com/alicebob/miniredis/v2" + "github.com/stretchr/testify/require" +) + +func newTestRedisCfg(addr string) config.RedisConfig { + return config.RedisConfig{ + Conn: redisconn.Config{ + MasterAddr: addr, + Password: "test", + OperationTimeout: time.Second, + }, + } +} + +func TestPingRedisSucceedsAgainstMiniredis(t *testing.T) { + t.Parallel() + + server := miniredis.RunT(t) + + redisCfg := newTestRedisCfg(server.Addr()) + client := newRedisClient(redisCfg) + t.Cleanup(func() { _ = client.Close() }) + + require.NoError(t, pingRedis(context.Background(), redisCfg, client)) +} + +func TestPingRedisReturnsErrorWhenClosed(t *testing.T) { + t.Parallel() + + server := miniredis.RunT(t) + + redisCfg := newTestRedisCfg(server.Addr()) + client := newRedisClient(redisCfg) + require.NoError(t, client.Close()) + + require.Error(t, pingRedis(context.Background(), redisCfg, client)) +} + +func TestNewDockerClientHonoursHostOverride(t *testing.T) { + t.Parallel() + + docker, err := newDockerClient(config.DockerConfig{ + Host: "unix:///var/run/docker.sock", + APIVersion: "1.43", + Network: "galaxy-net", + LogDriver: "json-file", + PullPolicy: config.ImagePullPolicyIfMissing, + }) + require.NoError(t, err) + require.NotNil(t, docker) + require.NoError(t, docker.Close()) +} + +func TestPingDockerRejectsNilClient(t *testing.T) { + t.Parallel() + + require.Error(t, pingDocker(context.Background(), nil, time.Second)) +} + +func TestPingDockerRejectsNonPositiveTimeout(t *testing.T) { + t.Parallel() + + docker, err := newDockerClient(config.DockerConfig{ + Host: "unix:///var/run/docker.sock", + Network: "galaxy-net", + LogDriver: "json-file", + }) + require.NoError(t, err) + t.Cleanup(func() { _ = docker.Close() }) + + require.Error(t, pingDocker(context.Background(), docker, 0)) +} diff --git a/rtmanager/internal/app/runtime.go b/rtmanager/internal/app/runtime.go new file mode 100644 index 0000000..52d8e3a --- /dev/null +++ b/rtmanager/internal/app/runtime.go @@ -0,0 +1,262 @@ +package app + +import ( + "context" + "database/sql" + "errors" + "fmt" + "log/slog" + "time" + + "galaxy/postgres" + "galaxy/redisconn" + "galaxy/rtmanager/internal/adapters/postgres/migrations" + "galaxy/rtmanager/internal/api/internalhttp" + "galaxy/rtmanager/internal/config" + "galaxy/rtmanager/internal/telemetry" + + dockerclient "github.com/docker/docker/client" + "github.com/redis/go-redis/v9" +) + +// Runtime owns the runnable Runtime Manager process plus the cleanup +// functions that release runtime resources after shutdown. 
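+//
+// An entrypoint sketch (signal handling and the exact cmd wiring are
+// illustrative):
+//
+//	rt, err := NewRuntime(ctx, cfg, logger)
+//	if err != nil {
+//		return err
+//	}
+//	defer func() { _ = rt.Close() }()
+//	return rt.Run(ctx)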
+type Runtime struct { + cfg config.Config + + app *App + + wiring *wiring + + internalServer *internalhttp.Server + + cleanupFns []func() error +} + +// NewRuntime constructs the runnable Runtime Manager process from cfg. +// +// PostgreSQL migrations apply strictly before the internal HTTP listener +// becomes ready. The runtime opens one shared `*redis.Client`, one +// `*sql.DB`, one Docker SDK client, and one OpenTelemetry runtime; all +// are released in reverse construction order on shutdown. +func NewRuntime(ctx context.Context, cfg config.Config, logger *slog.Logger) (*Runtime, error) { + if ctx == nil { + return nil, errors.New("new rtmanager runtime: nil context") + } + if err := cfg.Validate(); err != nil { + return nil, fmt.Errorf("new rtmanager runtime: %w", err) + } + if logger == nil { + logger = slog.Default() + } + + runtime := &Runtime{ + cfg: cfg, + } + + cleanupOnError := func(err error) (*Runtime, error) { + if cleanupErr := runtime.Close(); cleanupErr != nil { + return nil, fmt.Errorf("%w; cleanup: %w", err, cleanupErr) + } + + return nil, err + } + + telemetryRuntime, err := telemetry.NewProcess(ctx, telemetry.ProcessConfig{ + ServiceName: cfg.Telemetry.ServiceName, + TracesExporter: cfg.Telemetry.TracesExporter, + MetricsExporter: cfg.Telemetry.MetricsExporter, + TracesProtocol: cfg.Telemetry.TracesProtocol, + MetricsProtocol: cfg.Telemetry.MetricsProtocol, + StdoutTracesEnabled: cfg.Telemetry.StdoutTracesEnabled, + StdoutMetricsEnabled: cfg.Telemetry.StdoutMetricsEnabled, + }, logger) + if err != nil { + return cleanupOnError(fmt.Errorf("new rtmanager runtime: telemetry: %w", err)) + } + runtime.cleanupFns = append(runtime.cleanupFns, func() error { + shutdownCtx, cancel := context.WithTimeout(context.Background(), cfg.ShutdownTimeout) + defer cancel() + return telemetryRuntime.Shutdown(shutdownCtx) + }) + + redisClient := newRedisClient(cfg.Redis) + if err := instrumentRedisClient(redisClient, telemetryRuntime); err != nil { + return cleanupOnError(fmt.Errorf("new rtmanager runtime: %w", err)) + } + runtime.cleanupFns = append(runtime.cleanupFns, func() error { + err := redisClient.Close() + if errors.Is(err, redis.ErrClosed) { + return nil + } + return err + }) + if err := pingRedis(ctx, cfg.Redis, redisClient); err != nil { + return cleanupOnError(fmt.Errorf("new rtmanager runtime: %w", err)) + } + + pgPool, err := postgres.OpenPrimary(ctx, cfg.Postgres.Conn, + postgres.WithTracerProvider(telemetryRuntime.TracerProvider()), + postgres.WithMeterProvider(telemetryRuntime.MeterProvider()), + ) + if err != nil { + return cleanupOnError(fmt.Errorf("new rtmanager runtime: open postgres: %w", err)) + } + runtime.cleanupFns = append(runtime.cleanupFns, pgPool.Close) + unregisterPGStats, err := postgres.InstrumentDBStats(pgPool, + postgres.WithMeterProvider(telemetryRuntime.MeterProvider()), + ) + if err != nil { + return cleanupOnError(fmt.Errorf("new rtmanager runtime: instrument postgres: %w", err)) + } + runtime.cleanupFns = append(runtime.cleanupFns, func() error { + return unregisterPGStats() + }) + if err := postgres.Ping(ctx, pgPool, cfg.Postgres.Conn.OperationTimeout); err != nil { + return cleanupOnError(fmt.Errorf("new rtmanager runtime: ping postgres: %w", err)) + } + if err := postgres.RunMigrations(ctx, pgPool, migrations.FS(), "."); err != nil { + return cleanupOnError(fmt.Errorf("new rtmanager runtime: run postgres migrations: %w", err)) + } + + dockerClient, err := newDockerClient(cfg.Docker) + if err != nil { + return cleanupOnError(fmt.Errorf("new 
rtmanager runtime: %w", err))
+	}
+	runtime.cleanupFns = append(runtime.cleanupFns, dockerClient.Close)
+	// DockerConfig declares no timeout of its own, so the Postgres
+	// operation timeout bounds the startup Docker ping as well.
+	if err := pingDocker(ctx, dockerClient, cfg.Postgres.Conn.OperationTimeout); err != nil {
+		return cleanupOnError(fmt.Errorf("new rtmanager runtime: %w", err))
+	}
+
+	wiring, err := newWiring(cfg, redisClient, pgPool, dockerClient, time.Now, logger, telemetryRuntime)
+	if err != nil {
+		return cleanupOnError(fmt.Errorf("new rtmanager runtime: wiring: %w", err))
+	}
+	runtime.wiring = wiring
+	runtime.cleanupFns = append(runtime.cleanupFns, wiring.close)
+	if err := wiring.registerTelemetryGauges(); err != nil {
+		return cleanupOnError(fmt.Errorf("new rtmanager runtime: register telemetry gauges: %w", err))
+	}
+
+	if err := wiring.reconciler.ReconcileNow(ctx); err != nil {
+		return cleanupOnError(fmt.Errorf("new rtmanager runtime: initial reconcile: %w", err))
+	}
+
+	probe := newReadinessProbe(pgPool, redisClient, dockerClient, cfg)
+
+	internalServer, err := internalhttp.NewServer(internalhttp.Config{
+		Addr:              cfg.InternalHTTP.Addr,
+		ReadHeaderTimeout: cfg.InternalHTTP.ReadHeaderTimeout,
+		ReadTimeout:       cfg.InternalHTTP.ReadTimeout,
+		WriteTimeout:      cfg.InternalHTTP.WriteTimeout,
+		IdleTimeout:       cfg.InternalHTTP.IdleTimeout,
+	}, internalhttp.Dependencies{
+		Logger:           logger,
+		Telemetry:        telemetryRuntime,
+		Readiness:        probe,
+		RuntimeRecords:   wiring.runtimeRecordStore,
+		StartRuntime:     wiring.startRuntimeService,
+		StopRuntime:      wiring.stopRuntimeService,
+		RestartRuntime:   wiring.restartRuntimeService,
+		PatchRuntime:     wiring.patchRuntimeService,
+		CleanupContainer: wiring.cleanupContainerService,
+	})
+	if err != nil {
+		return cleanupOnError(fmt.Errorf("new rtmanager runtime: internal HTTP server: %w", err))
+	}
+	runtime.internalServer = internalServer
+
+	runtime.app = New(cfg,
+		internalServer,
+		wiring.startJobsConsumer,
+		wiring.stopJobsConsumer,
+		wiring.dockerEventsListener,
+		wiring.healthProbeWorker,
+		wiring.dockerInspectWorker,
+		wiring.reconciler,
+		wiring.containerCleanupWorker,
+	)
+
+	return runtime, nil
+}
+
+// InternalServer returns the internal HTTP server owned by runtime. It is
+// primarily exposed for tests; production code should not depend on it.
+func (runtime *Runtime) InternalServer() *internalhttp.Server {
+	if runtime == nil {
+		return nil
+	}
+
+	return runtime.internalServer
+}
+
+// Run starts every registered component (the internal HTTP listener, the
+// stream consumers, and the background workers) and blocks until ctx is
+// canceled or one component fails.
+func (runtime *Runtime) Run(ctx context.Context) error {
+	if ctx == nil {
+		return errors.New("run rtmanager runtime: nil context")
+	}
+	if runtime == nil {
+		return errors.New("run rtmanager runtime: nil runtime")
+	}
+	if runtime.app == nil {
+		return errors.New("run rtmanager runtime: nil app")
+	}
+
+	return runtime.app.Run(ctx)
+}
+
+// Close releases every runtime dependency in reverse construction order.
+// Close is safe to call multiple times.
+func (runtime *Runtime) Close() error {
+	if runtime == nil {
+		return nil
+	}
+
+	var joined error
+	for index := len(runtime.cleanupFns) - 1; index >= 0; index-- {
+		if err := runtime.cleanupFns[index](); err != nil {
+			joined = errors.Join(joined, err)
+		}
+	}
+	runtime.cleanupFns = nil
+
+	return joined
+}
+
+// readinessProbe pings every steady-state dependency the listener
+// guards: the PostgreSQL primary, the Redis master, and the Docker
+// daemon.
+type readinessProbe struct { + pgPool *sql.DB + redisClient *redis.Client + dockerClient *dockerclient.Client + + postgresTimeout time.Duration + redisTimeout time.Duration + dockerTimeout time.Duration +} + +func newReadinessProbe(pgPool *sql.DB, redisClient *redis.Client, dockerClient *dockerclient.Client, cfg config.Config) *readinessProbe { + return &readinessProbe{ + pgPool: pgPool, + redisClient: redisClient, + dockerClient: dockerClient, + postgresTimeout: cfg.Postgres.Conn.OperationTimeout, + redisTimeout: cfg.Redis.Conn.OperationTimeout, + dockerTimeout: cfg.Postgres.Conn.OperationTimeout, + } +} + +// Check pings PostgreSQL, Redis, and Docker. The first failing +// dependency aborts the check so callers see a single, actionable +// error. +func (probe *readinessProbe) Check(ctx context.Context) error { + if err := postgres.Ping(ctx, probe.pgPool, probe.postgresTimeout); err != nil { + return err + } + if err := redisconn.Ping(ctx, probe.redisClient, probe.redisTimeout); err != nil { + return err + } + return pingDocker(ctx, probe.dockerClient, probe.dockerTimeout) +} diff --git a/rtmanager/internal/app/wiring.go b/rtmanager/internal/app/wiring.go new file mode 100644 index 0000000..5023fd4 --- /dev/null +++ b/rtmanager/internal/app/wiring.go @@ -0,0 +1,541 @@ +package app + +import ( + "context" + "database/sql" + "errors" + "fmt" + "log/slog" + "net/http" + "time" + + "galaxy/rtmanager/internal/adapters/docker" + "galaxy/rtmanager/internal/adapters/healtheventspublisher" + "galaxy/rtmanager/internal/adapters/jobresultspublisher" + "galaxy/rtmanager/internal/adapters/lobbyclient" + "galaxy/rtmanager/internal/adapters/notificationpublisher" + "galaxy/rtmanager/internal/adapters/postgres/healthsnapshotstore" + "galaxy/rtmanager/internal/adapters/postgres/operationlogstore" + "galaxy/rtmanager/internal/adapters/postgres/runtimerecordstore" + "galaxy/rtmanager/internal/adapters/redisstate/gamelease" + "galaxy/rtmanager/internal/adapters/redisstate/streamoffsets" + "galaxy/rtmanager/internal/config" + "galaxy/rtmanager/internal/ports" + "galaxy/rtmanager/internal/service/cleanupcontainer" + "galaxy/rtmanager/internal/service/patchruntime" + "galaxy/rtmanager/internal/service/restartruntime" + "galaxy/rtmanager/internal/service/startruntime" + "galaxy/rtmanager/internal/service/stopruntime" + "galaxy/rtmanager/internal/telemetry" + "galaxy/rtmanager/internal/worker/containercleanup" + "galaxy/rtmanager/internal/worker/dockerevents" + "galaxy/rtmanager/internal/worker/dockerinspect" + "galaxy/rtmanager/internal/worker/healthprobe" + "galaxy/rtmanager/internal/worker/reconcile" + "galaxy/rtmanager/internal/worker/startjobsconsumer" + "galaxy/rtmanager/internal/worker/stopjobsconsumer" + + dockerclient "github.com/docker/docker/client" + "github.com/redis/go-redis/v9" + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" +) + +// wiring owns the process-level singletons constructed once during +// `NewRuntime` and consumed by every worker and HTTP handler. +// +// The struct exposes typed accessors so callers can grab the store / +// adapter / service singletons without depending on internal fields. +type wiring struct { + cfg config.Config + + redisClient *redis.Client + pgPool *sql.DB + dockerClient *dockerclient.Client + + clock func() time.Time + + logger *slog.Logger + telemetry *telemetry.Runtime + + // Persistence stores. 
+ runtimeRecordStore *runtimerecordstore.Store + operationLogStore *operationlogstore.Store + healthSnapshotStore *healthsnapshotstore.Store + streamOffsetStore *streamoffsets.Store + gameLeaseStore *gamelease.Store + + // External adapters. + dockerAdapter *docker.Client + lobbyClient *lobbyclient.Client + notificationPublisher *notificationpublisher.Publisher + healthEventsPublisher *healtheventspublisher.Publisher + jobResultsPublisher *jobresultspublisher.Publisher + + // Service layer. + startRuntimeService *startruntime.Service + stopRuntimeService *stopruntime.Service + restartRuntimeService *restartruntime.Service + patchRuntimeService *patchruntime.Service + cleanupContainerService *cleanupcontainer.Service + + // Worker layer. + startJobsConsumer *startjobsconsumer.Consumer + stopJobsConsumer *stopjobsconsumer.Consumer + dockerEventsListener *dockerevents.Listener + healthProbeWorker *healthprobe.Worker + dockerInspectWorker *dockerinspect.Worker + reconciler *reconcile.Reconciler + containerCleanupWorker *containercleanup.Worker + + // closers releases adapter-level resources at runtime shutdown. + closers []func() error +} + +// newWiring constructs the process-level dependency set, the persistence +// stores, the external adapters, and the service layer. It validates +// every required collaborator so callers can rely on them being non-nil. +func newWiring( + cfg config.Config, + redisClient *redis.Client, + pgPool *sql.DB, + dockerClient *dockerclient.Client, + clock func() time.Time, + logger *slog.Logger, + telemetryRuntime *telemetry.Runtime, +) (*wiring, error) { + if redisClient == nil { + return nil, errors.New("new rtmanager wiring: nil redis client") + } + if pgPool == nil { + return nil, errors.New("new rtmanager wiring: nil postgres pool") + } + if dockerClient == nil { + return nil, errors.New("new rtmanager wiring: nil docker client") + } + if clock == nil { + clock = time.Now + } + if logger == nil { + logger = slog.Default() + } + if telemetryRuntime == nil { + return nil, fmt.Errorf("new rtmanager wiring: nil telemetry runtime") + } + + w := &wiring{ + cfg: cfg, + redisClient: redisClient, + pgPool: pgPool, + dockerClient: dockerClient, + clock: clock, + logger: logger, + telemetry: telemetryRuntime, + } + + if err := w.buildPersistence(); err != nil { + return nil, fmt.Errorf("new rtmanager wiring: %w", err) + } + if err := w.buildAdapters(); err != nil { + _ = w.close() + return nil, fmt.Errorf("new rtmanager wiring: %w", err) + } + if err := w.buildServices(); err != nil { + _ = w.close() + return nil, fmt.Errorf("new rtmanager wiring: %w", err) + } + if err := w.buildWorkers(); err != nil { + _ = w.close() + return nil, fmt.Errorf("new rtmanager wiring: %w", err) + } + return w, nil +} + +func (w *wiring) buildPersistence() error { + runtimeStore, err := runtimerecordstore.New(runtimerecordstore.Config{ + DB: w.pgPool, + OperationTimeout: w.cfg.Postgres.Conn.OperationTimeout, + }) + if err != nil { + return fmt.Errorf("runtime record store: %w", err) + } + w.runtimeRecordStore = runtimeStore + + operationStore, err := operationlogstore.New(operationlogstore.Config{ + DB: w.pgPool, + OperationTimeout: w.cfg.Postgres.Conn.OperationTimeout, + }) + if err != nil { + return fmt.Errorf("operation log store: %w", err) + } + w.operationLogStore = operationStore + + snapshotStore, err := healthsnapshotstore.New(healthsnapshotstore.Config{ + DB: w.pgPool, + OperationTimeout: w.cfg.Postgres.Conn.OperationTimeout, + }) + if err != nil { + return fmt.Errorf("health 
snapshot store: %w", err) + } + w.healthSnapshotStore = snapshotStore + + offsetStore, err := streamoffsets.New(streamoffsets.Config{Client: w.redisClient}) + if err != nil { + return fmt.Errorf("stream offset store: %w", err) + } + w.streamOffsetStore = offsetStore + + leaseStore, err := gamelease.New(gamelease.Config{Client: w.redisClient}) + if err != nil { + return fmt.Errorf("game lease store: %w", err) + } + w.gameLeaseStore = leaseStore + + return nil +} + +func (w *wiring) buildAdapters() error { + dockerAdapter, err := docker.NewClient(docker.Config{ + Docker: w.dockerClient, + LogDriver: w.cfg.Docker.LogDriver, + LogOpts: w.cfg.Docker.LogOpts, + Clock: w.clock, + }) + if err != nil { + return fmt.Errorf("docker adapter: %w", err) + } + w.dockerAdapter = dockerAdapter + + lobby, err := lobbyclient.NewClient(lobbyclient.Config{ + BaseURL: w.cfg.Lobby.BaseURL, + RequestTimeout: w.cfg.Lobby.Timeout, + }) + if err != nil { + return fmt.Errorf("lobby client: %w", err) + } + w.lobbyClient = lobby + w.closers = append(w.closers, lobby.Close) + + notificationPub, err := notificationpublisher.NewPublisher(notificationpublisher.Config{ + Client: w.redisClient, + Stream: w.cfg.Streams.NotificationIntents, + }) + if err != nil { + return fmt.Errorf("notification publisher: %w", err) + } + w.notificationPublisher = notificationPub + + healthPub, err := healtheventspublisher.NewPublisher(healtheventspublisher.Config{ + Client: w.redisClient, + Snapshots: w.healthSnapshotStore, + Stream: w.cfg.Streams.HealthEvents, + }) + if err != nil { + return fmt.Errorf("health events publisher: %w", err) + } + w.healthEventsPublisher = healthPub + + jobResultsPub, err := jobresultspublisher.NewPublisher(jobresultspublisher.Config{ + Client: w.redisClient, + Stream: w.cfg.Streams.JobResults, + }) + if err != nil { + return fmt.Errorf("job results publisher: %w", err) + } + w.jobResultsPublisher = jobResultsPub + + return nil +} + +func (w *wiring) buildServices() error { + startService, err := startruntime.NewService(startruntime.Dependencies{ + RuntimeRecords: w.runtimeRecordStore, + OperationLogs: w.operationLogStore, + Docker: w.dockerAdapter, + Leases: w.gameLeaseStore, + HealthEvents: w.healthEventsPublisher, + Notifications: w.notificationPublisher, + Lobby: w.lobbyClient, + Container: w.cfg.Container, + DockerCfg: w.cfg.Docker, + Coordination: w.cfg.Coordination, + Telemetry: w.telemetry, + Logger: w.logger, + Clock: w.clock, + }) + if err != nil { + return fmt.Errorf("start runtime service: %w", err) + } + w.startRuntimeService = startService + + stopService, err := stopruntime.NewService(stopruntime.Dependencies{ + RuntimeRecords: w.runtimeRecordStore, + OperationLogs: w.operationLogStore, + Docker: w.dockerAdapter, + Leases: w.gameLeaseStore, + HealthEvents: w.healthEventsPublisher, + Container: w.cfg.Container, + Coordination: w.cfg.Coordination, + Telemetry: w.telemetry, + Logger: w.logger, + Clock: w.clock, + }) + if err != nil { + return fmt.Errorf("stop runtime service: %w", err) + } + w.stopRuntimeService = stopService + + restartService, err := restartruntime.NewService(restartruntime.Dependencies{ + RuntimeRecords: w.runtimeRecordStore, + OperationLogs: w.operationLogStore, + Docker: w.dockerAdapter, + Leases: w.gameLeaseStore, + StopService: stopService, + StartService: startService, + Coordination: w.cfg.Coordination, + Telemetry: w.telemetry, + Logger: w.logger, + Clock: w.clock, + }) + if err != nil { + return fmt.Errorf("restart runtime service: %w", err) + } + w.restartRuntimeService 
= restartService + + patchService, err := patchruntime.NewService(patchruntime.Dependencies{ + RuntimeRecords: w.runtimeRecordStore, + OperationLogs: w.operationLogStore, + Docker: w.dockerAdapter, + Leases: w.gameLeaseStore, + StopService: stopService, + StartService: startService, + Coordination: w.cfg.Coordination, + Telemetry: w.telemetry, + Logger: w.logger, + Clock: w.clock, + }) + if err != nil { + return fmt.Errorf("patch runtime service: %w", err) + } + w.patchRuntimeService = patchService + + cleanupService, err := cleanupcontainer.NewService(cleanupcontainer.Dependencies{ + RuntimeRecords: w.runtimeRecordStore, + OperationLogs: w.operationLogStore, + Docker: w.dockerAdapter, + Leases: w.gameLeaseStore, + Coordination: w.cfg.Coordination, + Telemetry: w.telemetry, + Logger: w.logger, + Clock: w.clock, + }) + if err != nil { + return fmt.Errorf("cleanup container service: %w", err) + } + w.cleanupContainerService = cleanupService + + return nil +} + +// buildWorkers constructs the asynchronous Lobby ↔ RTM stream +// consumers. Both consumers participate in the process lifecycle as +// `app.Component`s; `internal/app/runtime.go` passes them into +// `app.New` alongside the internal HTTP server. +func (w *wiring) buildWorkers() error { + startConsumer, err := startjobsconsumer.NewConsumer(startjobsconsumer.Config{ + Client: w.redisClient, + Stream: w.cfg.Streams.StartJobs, + BlockTimeout: w.cfg.Streams.BlockTimeout, + StartService: w.startRuntimeService, + JobResults: w.jobResultsPublisher, + OffsetStore: w.streamOffsetStore, + Logger: w.logger, + }) + if err != nil { + return fmt.Errorf("start jobs consumer: %w", err) + } + w.startJobsConsumer = startConsumer + + stopConsumer, err := stopjobsconsumer.NewConsumer(stopjobsconsumer.Config{ + Client: w.redisClient, + Stream: w.cfg.Streams.StopJobs, + BlockTimeout: w.cfg.Streams.BlockTimeout, + StopService: w.stopRuntimeService, + JobResults: w.jobResultsPublisher, + OffsetStore: w.streamOffsetStore, + Logger: w.logger, + }) + if err != nil { + return fmt.Errorf("stop jobs consumer: %w", err) + } + w.stopJobsConsumer = stopConsumer + + eventsListener, err := dockerevents.NewListener(dockerevents.Dependencies{ + Docker: w.dockerAdapter, + RuntimeRecords: w.runtimeRecordStore, + HealthEvents: w.healthEventsPublisher, + Telemetry: w.telemetry, + Clock: w.clock, + Logger: w.logger, + }) + if err != nil { + return fmt.Errorf("docker events listener: %w", err) + } + w.dockerEventsListener = eventsListener + + probeHTTPClient, err := newProbeHTTPClient(w.telemetry) + if err != nil { + return fmt.Errorf("health probe http client: %w", err) + } + probeWorker, err := healthprobe.NewWorker(healthprobe.Dependencies{ + RuntimeRecords: w.runtimeRecordStore, + HealthEvents: w.healthEventsPublisher, + HTTPClient: probeHTTPClient, + Telemetry: w.telemetry, + Interval: w.cfg.Health.ProbeInterval, + ProbeTimeout: w.cfg.Health.ProbeTimeout, + FailuresThreshold: w.cfg.Health.ProbeFailuresThreshold, + Clock: w.clock, + Logger: w.logger, + }) + if err != nil { + return fmt.Errorf("health probe worker: %w", err) + } + w.healthProbeWorker = probeWorker + + inspectWorker, err := dockerinspect.NewWorker(dockerinspect.Dependencies{ + Docker: w.dockerAdapter, + RuntimeRecords: w.runtimeRecordStore, + HealthEvents: w.healthEventsPublisher, + Telemetry: w.telemetry, + Interval: w.cfg.Health.InspectInterval, + Clock: w.clock, + Logger: w.logger, + }) + if err != nil { + return fmt.Errorf("docker inspect worker: %w", err) + } + w.dockerInspectWorker = inspectWorker + + 
reconciler, err := reconcile.NewReconciler(reconcile.Dependencies{ + Docker: w.dockerAdapter, + RuntimeRecords: w.runtimeRecordStore, + OperationLogs: w.operationLogStore, + HealthEvents: w.healthEventsPublisher, + Leases: w.gameLeaseStore, + Telemetry: w.telemetry, + DockerCfg: w.cfg.Docker, + ContainerCfg: w.cfg.Container, + Coordination: w.cfg.Coordination, + Interval: w.cfg.Cleanup.ReconcileInterval, + Clock: w.clock, + Logger: w.logger, + }) + if err != nil { + return fmt.Errorf("reconciler: %w", err) + } + w.reconciler = reconciler + + cleanupWorker, err := containercleanup.NewWorker(containercleanup.Dependencies{ + RuntimeRecords: w.runtimeRecordStore, + Cleanup: w.cleanupContainerService, + Retention: w.cfg.Container.Retention, + Interval: w.cfg.Cleanup.CleanupInterval, + Clock: w.clock, + Logger: w.logger, + }) + if err != nil { + return fmt.Errorf("container cleanup worker: %w", err) + } + w.containerCleanupWorker = cleanupWorker + + return nil +} + +// newProbeHTTPClient constructs the otelhttp-instrumented HTTP client +// the active health probe uses to call engine `/healthz`. It clones +// the default transport so caller-provided transports stay isolated +// from production wiring (mirrors the lobby internal client). +func newProbeHTTPClient(telemetryRuntime *telemetry.Runtime) (*http.Client, error) { + transport, ok := http.DefaultTransport.(*http.Transport) + if !ok { + return nil, errors.New("default http transport is not *http.Transport") + } + cloned := transport.Clone() + instrumented := otelhttp.NewTransport(cloned, + otelhttp.WithTracerProvider(telemetryRuntime.TracerProvider()), + otelhttp.WithMeterProvider(telemetryRuntime.MeterProvider()), + ) + return &http.Client{Transport: instrumented}, nil +} + +// registerTelemetryGauges installs the runtime-records-by-status gauge +// callback so the telemetry runtime can observe the persistent store +// without holding a strong reference to the wiring. +func (w *wiring) registerTelemetryGauges() error { + probe := newRuntimeRecordsProbe(w.runtimeRecordStore) + return w.telemetry.RegisterGauges(telemetry.GaugeDependencies{ + RuntimeRecordsByStatus: probe, + Logger: w.logger, + }) +} + +// close releases adapter-level resources owned by the wiring layer. +// Returns the joined error of every closer; the caller is expected to +// invoke this once during process shutdown. +func (w *wiring) close() error { + var joined error + for index := len(w.closers) - 1; index >= 0; index-- { + if err := w.closers[index](); err != nil { + joined = errors.Join(joined, err) + } + } + w.closers = nil + return joined +} + +// runtimeRecordsProbe adapts runtimerecordstore.Store to +// telemetry.RuntimeRecordsByStatusProbe by translating the typed status +// keys into the string keys the gauge expects. 
+type runtimeRecordsProbe struct { + store *runtimerecordstore.Store +} + +func newRuntimeRecordsProbe(store *runtimerecordstore.Store) *runtimeRecordsProbe { + return &runtimeRecordsProbe{store: store} +} + +func (p *runtimeRecordsProbe) CountByStatus(ctx context.Context) (map[string]int, error) { + if p == nil || p.store == nil { + return nil, errors.New("runtime records probe: nil store") + } + counts, err := p.store.CountByStatus(ctx) + if err != nil { + return nil, err + } + out := make(map[string]int, len(counts)) + for status, count := range counts { + out[string(status)] = count + } + return out, nil +} + +// Compile-time assertions that the constructed adapters satisfy the +// expected port surfaces; these prevent silent regressions when a +// port shape changes. +var ( + _ ports.RuntimeRecordStore = (*runtimerecordstore.Store)(nil) + _ ports.OperationLogStore = (*operationlogstore.Store)(nil) + _ ports.HealthSnapshotStore = (*healthsnapshotstore.Store)(nil) + _ ports.StreamOffsetStore = (*streamoffsets.Store)(nil) + _ ports.GameLeaseStore = (*gamelease.Store)(nil) + _ ports.DockerClient = (*docker.Client)(nil) + _ ports.LobbyInternalClient = (*lobbyclient.Client)(nil) + _ ports.NotificationIntentPublisher = (*notificationpublisher.Publisher)(nil) + _ ports.HealthEventPublisher = (*healtheventspublisher.Publisher)(nil) + _ ports.JobResultPublisher = (*jobresultspublisher.Publisher)(nil) + + _ Component = (*reconcile.Reconciler)(nil) + _ Component = (*containercleanup.Worker)(nil) + _ containercleanup.Cleaner = (*cleanupcontainer.Service)(nil) +) + diff --git a/rtmanager/internal/config/config.go b/rtmanager/internal/config/config.go new file mode 100644 index 0000000..ddf17b9 --- /dev/null +++ b/rtmanager/internal/config/config.go @@ -0,0 +1,632 @@ +// Package config loads the Runtime Manager process configuration from +// environment variables. 
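+//
+// An illustrative environment (values and formats are examples; the
+// defaults declared below apply when a variable is unset):
+//
+//	RTMANAGER_LOG_LEVEL=info
+//	RTMANAGER_INTERNAL_HTTP_ADDR=:8096
+//	RTMANAGER_DOCKER_HOST=unix:///var/run/docker.sock
+//	RTMANAGER_SHUTDOWN_TIMEOUT=30s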
+package config + +import ( + "fmt" + "strings" + "time" + + "galaxy/postgres" + "galaxy/redisconn" + "galaxy/rtmanager/internal/telemetry" +) + +const ( + envPrefix = "RTMANAGER" + + shutdownTimeoutEnvVar = "RTMANAGER_SHUTDOWN_TIMEOUT" + logLevelEnvVar = "RTMANAGER_LOG_LEVEL" + + internalHTTPAddrEnvVar = "RTMANAGER_INTERNAL_HTTP_ADDR" + internalHTTPReadHeaderTimeoutEnvVar = "RTMANAGER_INTERNAL_HTTP_READ_HEADER_TIMEOUT" + internalHTTPReadTimeoutEnvVar = "RTMANAGER_INTERNAL_HTTP_READ_TIMEOUT" + internalHTTPWriteTimeoutEnvVar = "RTMANAGER_INTERNAL_HTTP_WRITE_TIMEOUT" + internalHTTPIdleTimeoutEnvVar = "RTMANAGER_INTERNAL_HTTP_IDLE_TIMEOUT" + + dockerHostEnvVar = "RTMANAGER_DOCKER_HOST" + dockerAPIVersionEnvVar = "RTMANAGER_DOCKER_API_VERSION" + dockerNetworkEnvVar = "RTMANAGER_DOCKER_NETWORK" + dockerLogDriverEnvVar = "RTMANAGER_DOCKER_LOG_DRIVER" + dockerLogOptsEnvVar = "RTMANAGER_DOCKER_LOG_OPTS" + imagePullPolicyEnvVar = "RTMANAGER_IMAGE_PULL_POLICY" + + defaultCPUQuotaEnvVar = "RTMANAGER_DEFAULT_CPU_QUOTA" + defaultMemoryEnvVar = "RTMANAGER_DEFAULT_MEMORY" + defaultPIDsLimitEnvVar = "RTMANAGER_DEFAULT_PIDS_LIMIT" + containerStopTimeoutSecondsEnvVar = "RTMANAGER_CONTAINER_STOP_TIMEOUT_SECONDS" + containerRetentionDaysEnvVar = "RTMANAGER_CONTAINER_RETENTION_DAYS" + engineStateMountPathEnvVar = "RTMANAGER_ENGINE_STATE_MOUNT_PATH" + engineStateEnvNameEnvVar = "RTMANAGER_ENGINE_STATE_ENV_NAME" + gameStateDirModeEnvVar = "RTMANAGER_GAME_STATE_DIR_MODE" + gameStateOwnerUIDEnvVar = "RTMANAGER_GAME_STATE_OWNER_UID" + gameStateOwnerGIDEnvVar = "RTMANAGER_GAME_STATE_OWNER_GID" + gameStateRootEnvVar = "RTMANAGER_GAME_STATE_ROOT" + + startJobsStreamEnvVar = "RTMANAGER_REDIS_START_JOBS_STREAM" + stopJobsStreamEnvVar = "RTMANAGER_REDIS_STOP_JOBS_STREAM" + jobResultsStreamEnvVar = "RTMANAGER_REDIS_JOB_RESULTS_STREAM" + healthEventsStreamEnvVar = "RTMANAGER_REDIS_HEALTH_EVENTS_STREAM" + notificationIntentsStreamEnv = "RTMANAGER_NOTIFICATION_INTENTS_STREAM" + streamBlockTimeoutEnvVar = "RTMANAGER_STREAM_BLOCK_TIMEOUT" + + inspectIntervalEnvVar = "RTMANAGER_INSPECT_INTERVAL" + probeIntervalEnvVar = "RTMANAGER_PROBE_INTERVAL" + probeTimeoutEnvVar = "RTMANAGER_PROBE_TIMEOUT" + probeFailuresThresholdEnvVar = "RTMANAGER_PROBE_FAILURES_THRESHOLD" + + reconcileIntervalEnvVar = "RTMANAGER_RECONCILE_INTERVAL" + cleanupIntervalEnvVar = "RTMANAGER_CLEANUP_INTERVAL" + + gameLeaseTTLSecondsEnvVar = "RTMANAGER_GAME_LEASE_TTL_SECONDS" + + lobbyInternalBaseURLEnvVar = "RTMANAGER_LOBBY_INTERNAL_BASE_URL" + lobbyInternalTimeoutEnvVar = "RTMANAGER_LOBBY_INTERNAL_TIMEOUT" + + otelServiceNameEnvVar = "OTEL_SERVICE_NAME" + otelTracesExporterEnvVar = "OTEL_TRACES_EXPORTER" + otelMetricsExporterEnvVar = "OTEL_METRICS_EXPORTER" + otelExporterOTLPProtocolEnvVar = "OTEL_EXPORTER_OTLP_PROTOCOL" + otelExporterOTLPTracesProtocolEnvVar = "OTEL_EXPORTER_OTLP_TRACES_PROTOCOL" + otelExporterOTLPMetricsProtocolEnvVar = "OTEL_EXPORTER_OTLP_METRICS_PROTOCOL" + otelStdoutTracesEnabledEnvVar = "RTMANAGER_OTEL_STDOUT_TRACES_ENABLED" + otelStdoutMetricsEnabledEnvVar = "RTMANAGER_OTEL_STDOUT_METRICS_ENABLED" + + defaultShutdownTimeout = 30 * time.Second + defaultLogLevel = "info" + defaultInternalHTTPAddr = ":8096" + defaultReadHeaderTimeout = 2 * time.Second + defaultReadTimeout = 5 * time.Second + defaultWriteTimeout = 15 * time.Second + defaultIdleTimeout = 60 * time.Second + + defaultDockerHost = "unix:///var/run/docker.sock" + defaultDockerNetwork = "galaxy-net" + defaultDockerLogDriver = "json-file" + defaultImagePullPolicy = 
ImagePullPolicyIfMissing + + defaultCPUQuota = 1.0 + defaultMemory = "512m" + defaultPIDsLimit = 512 + defaultContainerStopTimeout = 30 * time.Second + defaultContainerRetention = 30 * 24 * time.Hour + defaultEngineStateMountPath = "/var/lib/galaxy-game" + defaultEngineStateEnvName = "GAME_STATE_PATH" + defaultGameStateDirMode = 0o750 + + defaultStartJobsStream = "runtime:start_jobs" + defaultStopJobsStream = "runtime:stop_jobs" + defaultJobResultsStream = "runtime:job_results" + defaultHealthEventsStream = "runtime:health_events" + defaultNotificationIntentsKey = "notification:intents" + defaultStreamBlockTimeout = 5 * time.Second + + defaultInspectInterval = 30 * time.Second + defaultProbeInterval = 15 * time.Second + defaultProbeTimeout = 2 * time.Second + defaultProbeFailuresThreshold = 3 + + defaultReconcileInterval = 5 * time.Minute + defaultCleanupInterval = time.Hour + + defaultGameLeaseTTL = 60 * time.Second + + defaultLobbyInternalTimeout = 2 * time.Second + + defaultOTelServiceName = "galaxy-rtmanager" +) + +// ImagePullPolicy enumerates the supported image pull policies. The start +// service validates a producer-supplied `image_ref` against this policy at +// start time. +type ImagePullPolicy string + +// Supported pull policies, frozen by `rtmanager/README.md` §Configuration. +const ( + ImagePullPolicyIfMissing ImagePullPolicy = "if_missing" + ImagePullPolicyAlways ImagePullPolicy = "always" + ImagePullPolicyNever ImagePullPolicy = "never" +) + +// Validate reports whether p is one of the frozen pull policies. +func (p ImagePullPolicy) Validate() error { + switch p { + case ImagePullPolicyIfMissing, ImagePullPolicyAlways, ImagePullPolicyNever: + return nil + default: + return fmt.Errorf("image pull policy %q must be one of %q, %q, %q", + p, ImagePullPolicyIfMissing, ImagePullPolicyAlways, ImagePullPolicyNever) + } +} + +// Config stores the full Runtime Manager process configuration. +type Config struct { + // ShutdownTimeout bounds graceful shutdown of every long-lived + // component. + ShutdownTimeout time.Duration + + // Logging configures the process-wide structured logger. + Logging LoggingConfig + + // InternalHTTP configures the trusted internal HTTP listener that + // serves probes and the GM/Admin REST surface. + InternalHTTP InternalHTTPConfig + + // Docker configures the Docker SDK client RTM uses to drive the local + // Docker daemon. + Docker DockerConfig + + // Postgres configures the PostgreSQL-backed durable store consumed via + // `pkg/postgres`. + Postgres PostgresConfig + + // Redis configures the shared Redis connection topology consumed via + // `pkg/redisconn`. + Redis RedisConfig + + // Streams stores the stable Redis Stream names RTM reads from and + // writes to. + Streams StreamsConfig + + // Container stores the per-container defaults applied at start time + // when the resolved image does not declare its own labels. + Container ContainerConfig + + // Health configures the periodic health-monitoring workers (events + // listener, inspect, active probe). + Health HealthConfig + + // Cleanup configures the reconciler and container-cleanup workers. + Cleanup CleanupConfig + + // Coordination configures the per-game Redis lease used to serialise + // operations across all entry points. + Coordination CoordinationConfig + + // Lobby configures the synchronous Lobby internal REST client used by + // the start service for ancillary lookups. + Lobby LobbyConfig + + // Telemetry configures the process-wide OpenTelemetry runtime. 
+ Telemetry TelemetryConfig +} + +// LoggingConfig configures the process-wide structured logger. +type LoggingConfig struct { + // Level stores the process log level accepted by log/slog. + Level string +} + +// InternalHTTPConfig configures the trusted internal HTTP listener. +type InternalHTTPConfig struct { + // Addr stores the TCP listen address. + Addr string + + // ReadHeaderTimeout bounds request-header reading. + ReadHeaderTimeout time.Duration + + // ReadTimeout bounds reading one request. + ReadTimeout time.Duration + + // WriteTimeout bounds writing one response. + WriteTimeout time.Duration + + // IdleTimeout bounds how long keep-alive connections stay open. + IdleTimeout time.Duration +} + +// Validate reports whether cfg stores a usable internal HTTP listener +// configuration. +func (cfg InternalHTTPConfig) Validate() error { + switch { + case strings.TrimSpace(cfg.Addr) == "": + return fmt.Errorf("internal HTTP addr must not be empty") + case !isTCPAddr(cfg.Addr): + return fmt.Errorf("internal HTTP addr %q must use host:port form", cfg.Addr) + case cfg.ReadHeaderTimeout <= 0: + return fmt.Errorf("internal HTTP read header timeout must be positive") + case cfg.ReadTimeout <= 0: + return fmt.Errorf("internal HTTP read timeout must be positive") + case cfg.WriteTimeout <= 0: + return fmt.Errorf("internal HTTP write timeout must be positive") + case cfg.IdleTimeout <= 0: + return fmt.Errorf("internal HTTP idle timeout must be positive") + default: + return nil + } +} + +// DockerConfig configures the Docker SDK client. +type DockerConfig struct { + // Host stores the Docker daemon endpoint (e.g. + // `unix:///var/run/docker.sock`). + Host string + + // APIVersion overrides the Docker API version. Empty lets the SDK + // negotiate. + APIVersion string + + // Network stores the user-defined Docker bridge network containers + // attach to. Provisioned outside RTM; missing network is a fail-fast + // condition at startup. + Network string + + // LogDriver stores the Docker logging driver applied to engine + // containers. + LogDriver string + + // LogOpts stores the comma-separated `key=value` driver options. + LogOpts string + + // PullPolicy stores the configured image pull policy. + PullPolicy ImagePullPolicy +} + +// Validate reports whether cfg stores a usable Docker configuration. +func (cfg DockerConfig) Validate() error { + switch { + case strings.TrimSpace(cfg.Host) == "": + return fmt.Errorf("docker host must not be empty") + case strings.TrimSpace(cfg.Network) == "": + return fmt.Errorf("docker network must not be empty") + case strings.TrimSpace(cfg.LogDriver) == "": + return fmt.Errorf("docker log driver must not be empty") + } + return cfg.PullPolicy.Validate() +} + +// PostgresConfig configures the PostgreSQL-backed durable store consumed +// via `pkg/postgres`. +type PostgresConfig struct { + // Conn carries the primary plus replica DSN topology and pool tuning. + Conn postgres.Config +} + +// Validate reports whether cfg stores a usable PostgreSQL configuration. +func (cfg PostgresConfig) Validate() error { + return cfg.Conn.Validate() +} + +// RedisConfig configures the Runtime Manager Redis connection topology. +type RedisConfig struct { + // Conn carries the connection topology (master, replicas, password, + // db, per-call timeout). + Conn redisconn.Config +} + +// Validate reports whether cfg stores a usable Redis configuration. 
+func (cfg RedisConfig) Validate() error { + return cfg.Conn.Validate() +} + +// StreamsConfig stores the stable Redis Stream names used by Runtime +// Manager. +type StreamsConfig struct { + // StartJobs stores the Redis Streams key Lobby writes start jobs to. + StartJobs string + + // StopJobs stores the Redis Streams key Lobby writes stop jobs to. + StopJobs string + + // JobResults stores the Redis Streams key RTM writes job outcomes + // to. + JobResults string + + // HealthEvents stores the Redis Streams key RTM publishes + // technical health events to. + HealthEvents string + + // NotificationIntents stores the Redis Streams key RTM publishes + // admin-only notification intents to. + NotificationIntents string + + // BlockTimeout bounds the maximum blocking read window for stream + // consumers. + BlockTimeout time.Duration +} + +// Validate reports whether cfg stores usable stream names. +func (cfg StreamsConfig) Validate() error { + switch { + case strings.TrimSpace(cfg.StartJobs) == "": + return fmt.Errorf("redis start jobs stream must not be empty") + case strings.TrimSpace(cfg.StopJobs) == "": + return fmt.Errorf("redis stop jobs stream must not be empty") + case strings.TrimSpace(cfg.JobResults) == "": + return fmt.Errorf("redis job results stream must not be empty") + case strings.TrimSpace(cfg.HealthEvents) == "": + return fmt.Errorf("redis health events stream must not be empty") + case strings.TrimSpace(cfg.NotificationIntents) == "": + return fmt.Errorf("redis notification intents stream must not be empty") + case cfg.BlockTimeout <= 0: + return fmt.Errorf("redis stream block timeout must be positive") + default: + return nil + } +} + +// ContainerConfig stores the per-container defaults applied at start +// time. Resource defaults apply when the resolved engine image does not +// expose `com.galaxy.cpu_quota` / `com.galaxy.memory` / +// `com.galaxy.pids_limit` labels. +type ContainerConfig struct { + // DefaultCPUQuota is the fallback `--cpus` value applied when the + // image does not declare `com.galaxy.cpu_quota`. + DefaultCPUQuota float64 + + // DefaultMemory is the fallback `--memory` value applied when the + // image does not declare `com.galaxy.memory`. + DefaultMemory string + + // DefaultPIDsLimit is the fallback `--pids-limit` value applied + // when the image does not declare `com.galaxy.pids_limit`. + DefaultPIDsLimit int + + // StopTimeout bounds graceful container stop before Docker fires + // SIGKILL. + StopTimeout time.Duration + + // Retention stores the TTL after which `status=stopped` containers + // are removed by the cleanup worker. + Retention time.Duration + + // EngineStateMountPath is the in-container path the per-game state + // directory is bind-mounted to. + EngineStateMountPath string + + // EngineStateEnvName is the env-var name forwarded to the engine + // pointing at EngineStateMountPath. + EngineStateEnvName string + + // GameStateDirMode stores the unix permissions applied to the + // per-game state directory on creation. + GameStateDirMode uint32 + + // GameStateOwnerUID stores the unix uid applied to the per-game + // state directory on creation. + GameStateOwnerUID int + + // GameStateOwnerGID stores the unix gid applied to the per-game + // state directory on creation. + GameStateOwnerGID int + + // GameStateRoot is the host path under which per-game state + // directories are created. + GameStateRoot string +} + +// Validate reports whether cfg stores usable container defaults. 
+func (cfg ContainerConfig) Validate() error { + switch { + case cfg.DefaultCPUQuota <= 0: + return fmt.Errorf("default cpu quota must be positive") + case strings.TrimSpace(cfg.DefaultMemory) == "": + return fmt.Errorf("default memory must not be empty") + case cfg.DefaultPIDsLimit <= 0: + return fmt.Errorf("default pids limit must be positive") + case cfg.StopTimeout <= 0: + return fmt.Errorf("container stop timeout must be positive") + case cfg.Retention <= 0: + return fmt.Errorf("container retention must be positive") + case strings.TrimSpace(cfg.EngineStateMountPath) == "": + return fmt.Errorf("engine state mount path must not be empty") + case strings.TrimSpace(cfg.EngineStateEnvName) == "": + return fmt.Errorf("engine state env name must not be empty") + case cfg.GameStateDirMode == 0: + return fmt.Errorf("game state dir mode must be non-zero") + case strings.TrimSpace(cfg.GameStateRoot) == "": + return fmt.Errorf("game state root must not be empty") + case !strings.HasPrefix(strings.TrimSpace(cfg.GameStateRoot), "/"): + return fmt.Errorf("game state root %q must be an absolute path", cfg.GameStateRoot) + default: + return nil + } +} + +// HealthConfig configures the periodic health-monitoring workers +// (Docker events listener, periodic inspect, active probe). +type HealthConfig struct { + // InspectInterval is the period between two periodic Docker inspect + // passes. + InspectInterval time.Duration + + // ProbeInterval is the period between two engine `/healthz` probe + // rounds. + ProbeInterval time.Duration + + // ProbeTimeout bounds one engine `/healthz` request. + ProbeTimeout time.Duration + + // ProbeFailuresThreshold is the consecutive-failure count that + // triggers a `probe_failed` event. + ProbeFailuresThreshold int +} + +// Validate reports whether cfg stores usable health-monitoring settings. +func (cfg HealthConfig) Validate() error { + switch { + case cfg.InspectInterval <= 0: + return fmt.Errorf("inspect interval must be positive") + case cfg.ProbeInterval <= 0: + return fmt.Errorf("probe interval must be positive") + case cfg.ProbeTimeout <= 0: + return fmt.Errorf("probe timeout must be positive") + case cfg.ProbeFailuresThreshold <= 0: + return fmt.Errorf("probe failures threshold must be positive") + default: + return nil + } +} + +// CleanupConfig configures the reconciler and container-cleanup workers. +type CleanupConfig struct { + // ReconcileInterval is the period between two reconciler passes. + ReconcileInterval time.Duration + + // CleanupInterval is the period between two container-cleanup + // passes. + CleanupInterval time.Duration +} + +// Validate reports whether cfg stores usable cleanup settings. +func (cfg CleanupConfig) Validate() error { + switch { + case cfg.ReconcileInterval <= 0: + return fmt.Errorf("reconcile interval must be positive") + case cfg.CleanupInterval <= 0: + return fmt.Errorf("cleanup interval must be positive") + default: + return nil + } +} + +// CoordinationConfig configures the per-game Redis lease. +type CoordinationConfig struct { + // GameLeaseTTL bounds the per-game lease lifetime renewed every + // half-TTL while an operation runs. + GameLeaseTTL time.Duration +} + +// Validate reports whether cfg stores a usable lease configuration. +func (cfg CoordinationConfig) Validate() error { + if cfg.GameLeaseTTL <= 0 { + return fmt.Errorf("game lease ttl must be positive") + } + return nil +} + +// LobbyConfig configures the synchronous Lobby internal REST client. 
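+//
+// As an illustration (values mirror the test fixtures in this change),
+// RTMANAGER_LOBBY_INTERNAL_BASE_URL=http://lobby:8095 populates BaseURL,
+// while RTMANAGER_LOBBY_INTERNAL_TIMEOUT accepts a Go duration string and
+// overrides the 2s default Timeout.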
+type LobbyConfig struct { + // BaseURL stores the trusted Lobby internal listener base URL. + BaseURL string + + // Timeout bounds one Lobby internal request. + Timeout time.Duration +} + +// Validate reports whether cfg stores a usable Lobby client +// configuration. +func (cfg LobbyConfig) Validate() error { + switch { + case strings.TrimSpace(cfg.BaseURL) == "": + return fmt.Errorf("lobby internal base url must not be empty") + case !isHTTPURL(cfg.BaseURL): + return fmt.Errorf("lobby internal base url %q must be an absolute http(s) URL", cfg.BaseURL) + case cfg.Timeout <= 0: + return fmt.Errorf("lobby internal timeout must be positive") + default: + return nil + } +} + +// TelemetryConfig configures the Runtime Manager OpenTelemetry runtime. +type TelemetryConfig struct { + // ServiceName overrides the default OpenTelemetry service name. + ServiceName string + + // TracesExporter selects the external traces exporter. Supported + // values are `none` and `otlp`. + TracesExporter string + + // MetricsExporter selects the external metrics exporter. Supported + // values are `none` and `otlp`. + MetricsExporter string + + // TracesProtocol selects the OTLP traces protocol when + // TracesExporter is `otlp`. + TracesProtocol string + + // MetricsProtocol selects the OTLP metrics protocol when + // MetricsExporter is `otlp`. + MetricsProtocol string + + // StdoutTracesEnabled enables the additional stdout trace exporter + // used for local development and debugging. + StdoutTracesEnabled bool + + // StdoutMetricsEnabled enables the additional stdout metric + // exporter used for local development and debugging. + StdoutMetricsEnabled bool +} + +// Validate reports whether cfg contains a supported OpenTelemetry +// configuration. +func (cfg TelemetryConfig) Validate() error { + return telemetry.ProcessConfig{ + ServiceName: cfg.ServiceName, + TracesExporter: cfg.TracesExporter, + MetricsExporter: cfg.MetricsExporter, + TracesProtocol: cfg.TracesProtocol, + MetricsProtocol: cfg.MetricsProtocol, + StdoutTracesEnabled: cfg.StdoutTracesEnabled, + StdoutMetricsEnabled: cfg.StdoutMetricsEnabled, + }.Validate() +} + +// DefaultConfig returns the default Runtime Manager process configuration. 
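+//
+// The defaults alone do not pass Validate: GameStateRoot and the Lobby
+// BaseURL are left empty here and must be supplied through the
+// environment (see LoadFromEnv).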
+func DefaultConfig() Config { + return Config{ + ShutdownTimeout: defaultShutdownTimeout, + Logging: LoggingConfig{ + Level: defaultLogLevel, + }, + InternalHTTP: InternalHTTPConfig{ + Addr: defaultInternalHTTPAddr, + ReadHeaderTimeout: defaultReadHeaderTimeout, + ReadTimeout: defaultReadTimeout, + WriteTimeout: defaultWriteTimeout, + IdleTimeout: defaultIdleTimeout, + }, + Docker: DockerConfig{ + Host: defaultDockerHost, + Network: defaultDockerNetwork, + LogDriver: defaultDockerLogDriver, + PullPolicy: defaultImagePullPolicy, + }, + Postgres: PostgresConfig{ + Conn: postgres.DefaultConfig(), + }, + Redis: RedisConfig{ + Conn: redisconn.DefaultConfig(), + }, + Streams: StreamsConfig{ + StartJobs: defaultStartJobsStream, + StopJobs: defaultStopJobsStream, + JobResults: defaultJobResultsStream, + HealthEvents: defaultHealthEventsStream, + NotificationIntents: defaultNotificationIntentsKey, + BlockTimeout: defaultStreamBlockTimeout, + }, + Container: ContainerConfig{ + DefaultCPUQuota: defaultCPUQuota, + DefaultMemory: defaultMemory, + DefaultPIDsLimit: defaultPIDsLimit, + StopTimeout: defaultContainerStopTimeout, + Retention: defaultContainerRetention, + EngineStateMountPath: defaultEngineStateMountPath, + EngineStateEnvName: defaultEngineStateEnvName, + GameStateDirMode: defaultGameStateDirMode, + }, + Health: HealthConfig{ + InspectInterval: defaultInspectInterval, + ProbeInterval: defaultProbeInterval, + ProbeTimeout: defaultProbeTimeout, + ProbeFailuresThreshold: defaultProbeFailuresThreshold, + }, + Cleanup: CleanupConfig{ + ReconcileInterval: defaultReconcileInterval, + CleanupInterval: defaultCleanupInterval, + }, + Coordination: CoordinationConfig{ + GameLeaseTTL: defaultGameLeaseTTL, + }, + Lobby: LobbyConfig{ + Timeout: defaultLobbyInternalTimeout, + }, + Telemetry: TelemetryConfig{ + ServiceName: defaultOTelServiceName, + TracesExporter: "none", + MetricsExporter: "none", + }, + } +} diff --git a/rtmanager/internal/config/config_test.go b/rtmanager/internal/config/config_test.go new file mode 100644 index 0000000..50a9e45 --- /dev/null +++ b/rtmanager/internal/config/config_test.go @@ -0,0 +1,142 @@ +package config + +import ( + "strings" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func validEnv(t *testing.T) { + t.Helper() + + t.Setenv("RTMANAGER_POSTGRES_PRIMARY_DSN", "postgres://rtm:secret@localhost:5432/galaxy?search_path=rtmanager&sslmode=disable") + t.Setenv("RTMANAGER_REDIS_MASTER_ADDR", "localhost:6379") + t.Setenv("RTMANAGER_REDIS_PASSWORD", "secret") + t.Setenv("RTMANAGER_GAME_STATE_ROOT", "/var/lib/galaxy/games") + t.Setenv("RTMANAGER_LOBBY_INTERNAL_BASE_URL", "http://lobby:8095") +} + +func TestLoadFromEnvAcceptsDefaults(t *testing.T) { + validEnv(t) + + cfg, err := LoadFromEnv() + require.NoError(t, err) + + require.Equal(t, ":8096", cfg.InternalHTTP.Addr) + require.Equal(t, "unix:///var/run/docker.sock", cfg.Docker.Host) + require.Equal(t, "galaxy-net", cfg.Docker.Network) + require.Equal(t, "json-file", cfg.Docker.LogDriver) + require.Equal(t, ImagePullPolicyIfMissing, cfg.Docker.PullPolicy) + require.Equal(t, "runtime:start_jobs", cfg.Streams.StartJobs) + require.Equal(t, "runtime:stop_jobs", cfg.Streams.StopJobs) + require.Equal(t, "runtime:job_results", cfg.Streams.JobResults) + require.Equal(t, "runtime:health_events", cfg.Streams.HealthEvents) + require.Equal(t, "notification:intents", cfg.Streams.NotificationIntents) + require.Equal(t, 30*time.Second, cfg.Container.StopTimeout) + require.Equal(t, 30*24*time.Hour, 
cfg.Container.Retention) + require.Equal(t, "/var/lib/galaxy-game", cfg.Container.EngineStateMountPath) + require.Equal(t, "GAME_STATE_PATH", cfg.Container.EngineStateEnvName) + require.EqualValues(t, 0o750, cfg.Container.GameStateDirMode) + require.Equal(t, 60*time.Second, cfg.Coordination.GameLeaseTTL) + require.Equal(t, "http://lobby:8095", cfg.Lobby.BaseURL) + require.Equal(t, 2*time.Second, cfg.Lobby.Timeout) + require.Equal(t, "galaxy-rtmanager", cfg.Telemetry.ServiceName) +} + +func TestLoadFromEnvHonoursOverrides(t *testing.T) { + validEnv(t) + t.Setenv("RTMANAGER_INTERNAL_HTTP_ADDR", ":9000") + t.Setenv("RTMANAGER_DOCKER_NETWORK", "custom-net") + t.Setenv("RTMANAGER_IMAGE_PULL_POLICY", "always") + t.Setenv("RTMANAGER_REDIS_START_JOBS_STREAM", "custom:start_jobs") + t.Setenv("RTMANAGER_GAME_LEASE_TTL_SECONDS", "120") + t.Setenv("RTMANAGER_CONTAINER_STOP_TIMEOUT_SECONDS", "45") + t.Setenv("RTMANAGER_CONTAINER_RETENTION_DAYS", "7") + t.Setenv("RTMANAGER_GAME_STATE_DIR_MODE", "0700") + + cfg, err := LoadFromEnv() + require.NoError(t, err) + + require.Equal(t, ":9000", cfg.InternalHTTP.Addr) + require.Equal(t, "custom-net", cfg.Docker.Network) + require.Equal(t, ImagePullPolicyAlways, cfg.Docker.PullPolicy) + require.Equal(t, "custom:start_jobs", cfg.Streams.StartJobs) + require.Equal(t, 120*time.Second, cfg.Coordination.GameLeaseTTL) + require.Equal(t, 45*time.Second, cfg.Container.StopTimeout) + require.Equal(t, 7*24*time.Hour, cfg.Container.Retention) + require.EqualValues(t, 0o700, cfg.Container.GameStateDirMode) +} + +func TestLoadFromEnvRejectsUnknownPullPolicy(t *testing.T) { + validEnv(t) + t.Setenv("RTMANAGER_IMAGE_PULL_POLICY", "weekly") + + _, err := LoadFromEnv() + require.Error(t, err) + require.Contains(t, err.Error(), "image pull policy") +} + +func TestLoadFromEnvRequiresGameStateRoot(t *testing.T) { + t.Setenv("RTMANAGER_POSTGRES_PRIMARY_DSN", "postgres://rtm:secret@localhost:5432/galaxy") + t.Setenv("RTMANAGER_REDIS_MASTER_ADDR", "localhost:6379") + t.Setenv("RTMANAGER_REDIS_PASSWORD", "secret") + t.Setenv("RTMANAGER_LOBBY_INTERNAL_BASE_URL", "http://lobby:8095") + + _, err := LoadFromEnv() + require.Error(t, err) + require.Contains(t, err.Error(), "RTMANAGER_GAME_STATE_ROOT") +} + +func TestLoadFromEnvRequiresLobbyBaseURL(t *testing.T) { + t.Setenv("RTMANAGER_POSTGRES_PRIMARY_DSN", "postgres://rtm:secret@localhost:5432/galaxy") + t.Setenv("RTMANAGER_REDIS_MASTER_ADDR", "localhost:6379") + t.Setenv("RTMANAGER_REDIS_PASSWORD", "secret") + t.Setenv("RTMANAGER_GAME_STATE_ROOT", "/var/lib/galaxy/games") + + _, err := LoadFromEnv() + require.Error(t, err) + require.Contains(t, err.Error(), "RTMANAGER_LOBBY_INTERNAL_BASE_URL") +} + +func TestLoadFromEnvRejectsRelativeStateRoot(t *testing.T) { + validEnv(t) + t.Setenv("RTMANAGER_GAME_STATE_ROOT", "relative/path") + + _, err := LoadFromEnv() + require.Error(t, err) + require.Contains(t, err.Error(), "absolute path") +} + +func TestLoadFromEnvRejectsBadLogLevel(t *testing.T) { + validEnv(t) + t.Setenv("RTMANAGER_LOG_LEVEL", "verbose") + + _, err := LoadFromEnv() + require.Error(t, err) + require.Contains(t, err.Error(), "RTMANAGER_LOG_LEVEL") +} + +func TestImagePullPolicyValidate(t *testing.T) { + require.NoError(t, ImagePullPolicyIfMissing.Validate()) + require.NoError(t, ImagePullPolicyAlways.Validate()) + require.NoError(t, ImagePullPolicyNever.Validate()) + require.Error(t, ImagePullPolicy("monthly").Validate()) +} + +func TestInternalHTTPValidateRejectsBadAddr(t *testing.T) { + cfg := DefaultConfig().InternalHTTP + 
cfg.Addr = "not-an-addr" + err := cfg.Validate() + require.Error(t, err) + require.Contains(t, err.Error(), "host:port") +} + +func TestStreamsValidateRequiresAllNames(t *testing.T) { + cfg := DefaultConfig().Streams + cfg.StartJobs = " " + err := cfg.Validate() + require.Error(t, err) + require.True(t, strings.Contains(err.Error(), "start jobs")) +} diff --git a/rtmanager/internal/config/env.go b/rtmanager/internal/config/env.go new file mode 100644 index 0000000..4d4f4c4 --- /dev/null +++ b/rtmanager/internal/config/env.go @@ -0,0 +1,319 @@ +package config + +import ( + "fmt" + "os" + "strconv" + "strings" + "time" + + "galaxy/postgres" + "galaxy/redisconn" +) + +// LoadFromEnv builds Config from environment variables and validates the +// resulting configuration. +func LoadFromEnv() (Config, error) { + cfg := DefaultConfig() + + var err error + + cfg.ShutdownTimeout, err = durationEnv(shutdownTimeoutEnvVar, cfg.ShutdownTimeout) + if err != nil { + return Config{}, err + } + + cfg.Logging.Level = stringEnv(logLevelEnvVar, cfg.Logging.Level) + + cfg.InternalHTTP.Addr = stringEnv(internalHTTPAddrEnvVar, cfg.InternalHTTP.Addr) + cfg.InternalHTTP.ReadHeaderTimeout, err = durationEnv(internalHTTPReadHeaderTimeoutEnvVar, cfg.InternalHTTP.ReadHeaderTimeout) + if err != nil { + return Config{}, err + } + cfg.InternalHTTP.ReadTimeout, err = durationEnv(internalHTTPReadTimeoutEnvVar, cfg.InternalHTTP.ReadTimeout) + if err != nil { + return Config{}, err + } + cfg.InternalHTTP.WriteTimeout, err = durationEnv(internalHTTPWriteTimeoutEnvVar, cfg.InternalHTTP.WriteTimeout) + if err != nil { + return Config{}, err + } + cfg.InternalHTTP.IdleTimeout, err = durationEnv(internalHTTPIdleTimeoutEnvVar, cfg.InternalHTTP.IdleTimeout) + if err != nil { + return Config{}, err + } + + cfg.Docker.Host = stringEnv(dockerHostEnvVar, cfg.Docker.Host) + cfg.Docker.APIVersion = stringEnv(dockerAPIVersionEnvVar, cfg.Docker.APIVersion) + cfg.Docker.Network = stringEnv(dockerNetworkEnvVar, cfg.Docker.Network) + cfg.Docker.LogDriver = stringEnv(dockerLogDriverEnvVar, cfg.Docker.LogDriver) + cfg.Docker.LogOpts = stringEnv(dockerLogOptsEnvVar, cfg.Docker.LogOpts) + if raw, ok := os.LookupEnv(imagePullPolicyEnvVar); ok { + cfg.Docker.PullPolicy = ImagePullPolicy(strings.TrimSpace(raw)) + } + + pgConn, err := postgres.LoadFromEnv(envPrefix) + if err != nil { + return Config{}, err + } + cfg.Postgres.Conn = pgConn + + redisConn, err := redisconn.LoadFromEnv(envPrefix) + if err != nil { + return Config{}, err + } + cfg.Redis.Conn = redisConn + + cfg.Streams.StartJobs = stringEnv(startJobsStreamEnvVar, cfg.Streams.StartJobs) + cfg.Streams.StopJobs = stringEnv(stopJobsStreamEnvVar, cfg.Streams.StopJobs) + cfg.Streams.JobResults = stringEnv(jobResultsStreamEnvVar, cfg.Streams.JobResults) + cfg.Streams.HealthEvents = stringEnv(healthEventsStreamEnvVar, cfg.Streams.HealthEvents) + cfg.Streams.NotificationIntents = stringEnv(notificationIntentsStreamEnv, cfg.Streams.NotificationIntents) + cfg.Streams.BlockTimeout, err = durationEnv(streamBlockTimeoutEnvVar, cfg.Streams.BlockTimeout) + if err != nil { + return Config{}, err + } + + cfg.Container.DefaultCPUQuota, err = floatEnv(defaultCPUQuotaEnvVar, cfg.Container.DefaultCPUQuota) + if err != nil { + return Config{}, err + } + cfg.Container.DefaultMemory = stringEnv(defaultMemoryEnvVar, cfg.Container.DefaultMemory) + cfg.Container.DefaultPIDsLimit, err = intEnv(defaultPIDsLimitEnvVar, cfg.Container.DefaultPIDsLimit) + if err != nil { + return Config{}, err + } + 
cfg.Container.StopTimeout, err = secondsEnv(containerStopTimeoutSecondsEnvVar, cfg.Container.StopTimeout) + if err != nil { + return Config{}, err + } + cfg.Container.Retention, err = daysEnv(containerRetentionDaysEnvVar, cfg.Container.Retention) + if err != nil { + return Config{}, err + } + cfg.Container.EngineStateMountPath = stringEnv(engineStateMountPathEnvVar, cfg.Container.EngineStateMountPath) + cfg.Container.EngineStateEnvName = stringEnv(engineStateEnvNameEnvVar, cfg.Container.EngineStateEnvName) + cfg.Container.GameStateDirMode, err = octalUint32Env(gameStateDirModeEnvVar, cfg.Container.GameStateDirMode) + if err != nil { + return Config{}, err + } + cfg.Container.GameStateOwnerUID, err = intEnv(gameStateOwnerUIDEnvVar, cfg.Container.GameStateOwnerUID) + if err != nil { + return Config{}, err + } + cfg.Container.GameStateOwnerGID, err = intEnv(gameStateOwnerGIDEnvVar, cfg.Container.GameStateOwnerGID) + if err != nil { + return Config{}, err + } + root, ok := os.LookupEnv(gameStateRootEnvVar) + if !ok || strings.TrimSpace(root) == "" { + return Config{}, fmt.Errorf("%s must be set", gameStateRootEnvVar) + } + cfg.Container.GameStateRoot = strings.TrimSpace(root) + + cfg.Health.InspectInterval, err = durationEnv(inspectIntervalEnvVar, cfg.Health.InspectInterval) + if err != nil { + return Config{}, err + } + cfg.Health.ProbeInterval, err = durationEnv(probeIntervalEnvVar, cfg.Health.ProbeInterval) + if err != nil { + return Config{}, err + } + cfg.Health.ProbeTimeout, err = durationEnv(probeTimeoutEnvVar, cfg.Health.ProbeTimeout) + if err != nil { + return Config{}, err + } + cfg.Health.ProbeFailuresThreshold, err = intEnv(probeFailuresThresholdEnvVar, cfg.Health.ProbeFailuresThreshold) + if err != nil { + return Config{}, err + } + + cfg.Cleanup.ReconcileInterval, err = durationEnv(reconcileIntervalEnvVar, cfg.Cleanup.ReconcileInterval) + if err != nil { + return Config{}, err + } + cfg.Cleanup.CleanupInterval, err = durationEnv(cleanupIntervalEnvVar, cfg.Cleanup.CleanupInterval) + if err != nil { + return Config{}, err + } + + cfg.Coordination.GameLeaseTTL, err = secondsEnv(gameLeaseTTLSecondsEnvVar, cfg.Coordination.GameLeaseTTL) + if err != nil { + return Config{}, err + } + + lobbyURL, ok := os.LookupEnv(lobbyInternalBaseURLEnvVar) + if !ok || strings.TrimSpace(lobbyURL) == "" { + return Config{}, fmt.Errorf("%s must be set", lobbyInternalBaseURLEnvVar) + } + cfg.Lobby.BaseURL = strings.TrimSpace(lobbyURL) + cfg.Lobby.Timeout, err = durationEnv(lobbyInternalTimeoutEnvVar, cfg.Lobby.Timeout) + if err != nil { + return Config{}, err + } + + cfg.Telemetry.ServiceName = stringEnv(otelServiceNameEnvVar, cfg.Telemetry.ServiceName) + cfg.Telemetry.TracesExporter = normalizeExporterValue(stringEnv(otelTracesExporterEnvVar, cfg.Telemetry.TracesExporter)) + cfg.Telemetry.MetricsExporter = normalizeExporterValue(stringEnv(otelMetricsExporterEnvVar, cfg.Telemetry.MetricsExporter)) + cfg.Telemetry.TracesProtocol = normalizeProtocolValue( + os.Getenv(otelExporterOTLPTracesProtocolEnvVar), + os.Getenv(otelExporterOTLPProtocolEnvVar), + cfg.Telemetry.TracesProtocol, + ) + cfg.Telemetry.MetricsProtocol = normalizeProtocolValue( + os.Getenv(otelExporterOTLPMetricsProtocolEnvVar), + os.Getenv(otelExporterOTLPProtocolEnvVar), + cfg.Telemetry.MetricsProtocol, + ) + cfg.Telemetry.StdoutTracesEnabled, err = boolEnv(otelStdoutTracesEnabledEnvVar, cfg.Telemetry.StdoutTracesEnabled) + if err != nil { + return Config{}, err + } + cfg.Telemetry.StdoutMetricsEnabled, err = 
boolEnv(otelStdoutMetricsEnabledEnvVar, cfg.Telemetry.StdoutMetricsEnabled) + if err != nil { + return Config{}, err + } + + if err := cfg.Validate(); err != nil { + return Config{}, err + } + + return cfg, nil +} + +func stringEnv(name string, fallback string) string { + value, ok := os.LookupEnv(name) + if !ok { + return fallback + } + + return strings.TrimSpace(value) +} + +func durationEnv(name string, fallback time.Duration) (time.Duration, error) { + value, ok := os.LookupEnv(name) + if !ok { + return fallback, nil + } + + parsed, err := time.ParseDuration(strings.TrimSpace(value)) + if err != nil { + return 0, fmt.Errorf("%s: parse duration: %w", name, err) + } + + return parsed, nil +} + +func secondsEnv(name string, fallback time.Duration) (time.Duration, error) { + value, ok := os.LookupEnv(name) + if !ok { + return fallback, nil + } + + parsed, err := strconv.Atoi(strings.TrimSpace(value)) + if err != nil { + return 0, fmt.Errorf("%s: parse seconds: %w", name, err) + } + if parsed <= 0 { + return 0, fmt.Errorf("%s: must be positive", name) + } + + return time.Duration(parsed) * time.Second, nil +} + +func daysEnv(name string, fallback time.Duration) (time.Duration, error) { + value, ok := os.LookupEnv(name) + if !ok { + return fallback, nil + } + + parsed, err := strconv.Atoi(strings.TrimSpace(value)) + if err != nil { + return 0, fmt.Errorf("%s: parse days: %w", name, err) + } + if parsed <= 0 { + return 0, fmt.Errorf("%s: must be positive", name) + } + + return time.Duration(parsed) * 24 * time.Hour, nil +} + +func intEnv(name string, fallback int) (int, error) { + value, ok := os.LookupEnv(name) + if !ok { + return fallback, nil + } + + parsed, err := strconv.Atoi(strings.TrimSpace(value)) + if err != nil { + return 0, fmt.Errorf("%s: parse int: %w", name, err) + } + + return parsed, nil +} + +func floatEnv(name string, fallback float64) (float64, error) { + value, ok := os.LookupEnv(name) + if !ok { + return fallback, nil + } + + parsed, err := strconv.ParseFloat(strings.TrimSpace(value), 64) + if err != nil { + return 0, fmt.Errorf("%s: parse float: %w", name, err) + } + + return parsed, nil +} + +func boolEnv(name string, fallback bool) (bool, error) { + value, ok := os.LookupEnv(name) + if !ok { + return fallback, nil + } + + parsed, err := strconv.ParseBool(strings.TrimSpace(value)) + if err != nil { + return false, fmt.Errorf("%s: parse bool: %w", name, err) + } + + return parsed, nil +} + +func octalUint32Env(name string, fallback uint32) (uint32, error) { + value, ok := os.LookupEnv(name) + if !ok { + return fallback, nil + } + + parsed, err := strconv.ParseUint(strings.TrimSpace(value), 8, 32) + if err != nil { + return 0, fmt.Errorf("%s: parse octal: %w", name, err) + } + + return uint32(parsed), nil +} + +func normalizeExporterValue(value string) string { + trimmed := strings.TrimSpace(value) + switch trimmed { + case "", "none": + return "none" + default: + return trimmed + } +} + +func normalizeProtocolValue(primary string, fallback string, defaultValue string) string { + primary = strings.TrimSpace(primary) + if primary != "" { + return primary + } + + fallback = strings.TrimSpace(fallback) + if fallback != "" { + return fallback + } + + return strings.TrimSpace(defaultValue) +} diff --git a/rtmanager/internal/config/validation.go b/rtmanager/internal/config/validation.go new file mode 100644 index 0000000..caf455e --- /dev/null +++ b/rtmanager/internal/config/validation.go @@ -0,0 +1,93 @@ +package config + +import ( + "fmt" + "log/slog" + "net" + "net/url" + 
"strings" +) + +// Validate reports whether cfg stores a usable Runtime Manager process +// configuration. +func (cfg Config) Validate() error { + if cfg.ShutdownTimeout <= 0 { + return fmt.Errorf("%s must be positive", shutdownTimeoutEnvVar) + } + if err := validateSlogLevel(cfg.Logging.Level); err != nil { + return fmt.Errorf("%s: %w", logLevelEnvVar, err) + } + if err := cfg.InternalHTTP.Validate(); err != nil { + return err + } + if err := cfg.Docker.Validate(); err != nil { + return err + } + if err := cfg.Postgres.Validate(); err != nil { + return err + } + if err := cfg.Redis.Validate(); err != nil { + return err + } + if err := cfg.Streams.Validate(); err != nil { + return err + } + if err := cfg.Container.Validate(); err != nil { + return err + } + if err := cfg.Health.Validate(); err != nil { + return err + } + if err := cfg.Cleanup.Validate(); err != nil { + return err + } + if err := cfg.Coordination.Validate(); err != nil { + return err + } + if err := cfg.Lobby.Validate(); err != nil { + return err + } + if err := cfg.Telemetry.Validate(); err != nil { + return err + } + + return nil +} + +func validateSlogLevel(level string) error { + var slogLevel slog.Level + if err := slogLevel.UnmarshalText([]byte(strings.TrimSpace(level))); err != nil { + return fmt.Errorf("invalid slog level %q: %w", level, err) + } + + return nil +} + +func isTCPAddr(value string) bool { + host, port, err := net.SplitHostPort(strings.TrimSpace(value)) + if err != nil { + return false + } + + if port == "" { + return false + } + if host == "" { + return true + } + + return !strings.Contains(host, " ") +} + +func isHTTPURL(value string) bool { + parsed, err := url.Parse(strings.TrimSpace(value)) + if err != nil { + return false + } + + if parsed.Scheme != "http" && parsed.Scheme != "https" { + return false + } + + return parsed.Host != "" +} diff --git a/rtmanager/internal/domain/health/snapshot.go b/rtmanager/internal/domain/health/snapshot.go new file mode 100644 index 0000000..de40c44 --- /dev/null +++ b/rtmanager/internal/domain/health/snapshot.go @@ -0,0 +1,231 @@ +// Package health defines the technical-health domain types owned by +// Runtime Manager. +// +// EventType matches the `event_type` enum frozen in +// `galaxy/rtmanager/api/runtime-health-asyncapi.yaml`. SnapshotStatus +// matches the SQL CHECK on `health_snapshots.status` and is intentionally +// narrower than EventType (the snapshot table collapses +// `container_started → healthy` and drops `probe_recovered` per +// `galaxy/rtmanager/README.md §Health Monitoring`). +package health + +import ( + "encoding/json" + "fmt" + "strings" + "time" +) + +// EventType identifies one entry on the `runtime:health_events` Redis +// Stream. Used by the health-event publishers and consumers. +type EventType string + +const ( + // EventTypeContainerStarted reports a successful container start. + EventTypeContainerStarted EventType = "container_started" + + // EventTypeContainerExited reports a non-zero Docker `die` event. + EventTypeContainerExited EventType = "container_exited" + + // EventTypeContainerOOM reports a Docker `oom` event. + EventTypeContainerOOM EventType = "container_oom" + + // EventTypeContainerDisappeared reports that the listener observed + // a `destroy` event for a record Runtime Manager did not initiate. 
+ EventTypeContainerDisappeared EventType = "container_disappeared" + + // EventTypeInspectUnhealthy reports an unexpected outcome of the + // periodic Docker inspect (RestartCount growth, unexpected status, + // declared HEALTHCHECK reporting unhealthy). + EventTypeInspectUnhealthy EventType = "inspect_unhealthy" + + // EventTypeProbeFailed reports that the active HTTP probe crossed + // the configured failure threshold. + EventTypeProbeFailed EventType = "probe_failed" + + // EventTypeProbeRecovered reports the first probe success after a + // `probe_failed` event was published. + EventTypeProbeRecovered EventType = "probe_recovered" +) + +// IsKnown reports whether eventType belongs to the frozen event-type +// vocabulary. +func (eventType EventType) IsKnown() bool { + switch eventType { + case EventTypeContainerStarted, + EventTypeContainerExited, + EventTypeContainerOOM, + EventTypeContainerDisappeared, + EventTypeInspectUnhealthy, + EventTypeProbeFailed, + EventTypeProbeRecovered: + return true + default: + return false + } +} + +// AllEventTypes returns the frozen list of every event-type value. +func AllEventTypes() []EventType { + return []EventType{ + EventTypeContainerStarted, + EventTypeContainerExited, + EventTypeContainerOOM, + EventTypeContainerDisappeared, + EventTypeInspectUnhealthy, + EventTypeProbeFailed, + EventTypeProbeRecovered, + } +} + +// SnapshotStatus identifies one latest-observation status value stored +// in the `health_snapshots.status` column. Distinct from EventType: the +// table collapses `container_started → healthy` and never persists +// `probe_recovered` (it is conveyed only as a `runtime:health_events` +// entry with status=healthy in the next observation). +type SnapshotStatus string + +const ( + // SnapshotStatusHealthy reports that the most recent observation + // found the container live and the engine probe responsive. + SnapshotStatusHealthy SnapshotStatus = "healthy" + + // SnapshotStatusProbeFailed reports that the active probe crossed + // the failure threshold. + SnapshotStatusProbeFailed SnapshotStatus = "probe_failed" + + // SnapshotStatusExited reports that the container exited. + SnapshotStatusExited SnapshotStatus = "exited" + + // SnapshotStatusOOM reports that the container was killed by the + // OOM killer. + SnapshotStatusOOM SnapshotStatus = "oom" + + // SnapshotStatusInspectUnhealthy reports that the periodic inspect + // observed an unexpected state. + SnapshotStatusInspectUnhealthy SnapshotStatus = "inspect_unhealthy" + + // SnapshotStatusContainerDisappeared reports that Docker no longer + // reports the container. + SnapshotStatusContainerDisappeared SnapshotStatus = "container_disappeared" +) + +// IsKnown reports whether status belongs to the frozen snapshot-status +// vocabulary. +func (status SnapshotStatus) IsKnown() bool { + switch status { + case SnapshotStatusHealthy, + SnapshotStatusProbeFailed, + SnapshotStatusExited, + SnapshotStatusOOM, + SnapshotStatusInspectUnhealthy, + SnapshotStatusContainerDisappeared: + return true + default: + return false + } +} + +// AllSnapshotStatuses returns the frozen list of every snapshot-status +// value. +func AllSnapshotStatuses() []SnapshotStatus { + return []SnapshotStatus{ + SnapshotStatusHealthy, + SnapshotStatusProbeFailed, + SnapshotStatusExited, + SnapshotStatusOOM, + SnapshotStatusInspectUnhealthy, + SnapshotStatusContainerDisappeared, + } +} + +// SnapshotSource identifies the observation source that produced one +// snapshot. 
Matches the SQL CHECK on `health_snapshots.source`. +type SnapshotSource string + +const ( + // SnapshotSourceDockerEvent reports that the latest observation + // arrived through the Docker events listener. + SnapshotSourceDockerEvent SnapshotSource = "docker_event" + + // SnapshotSourceInspect reports that the latest observation arrived + // through the periodic Docker inspect worker. + SnapshotSourceInspect SnapshotSource = "inspect" + + // SnapshotSourceProbe reports that the latest observation arrived + // through the active HTTP probe. + SnapshotSourceProbe SnapshotSource = "probe" +) + +// IsKnown reports whether source belongs to the frozen snapshot-source +// vocabulary. +func (source SnapshotSource) IsKnown() bool { + switch source { + case SnapshotSourceDockerEvent, + SnapshotSourceInspect, + SnapshotSourceProbe: + return true + default: + return false + } +} + +// AllSnapshotSources returns the frozen list of every snapshot-source +// value. +func AllSnapshotSources() []SnapshotSource { + return []SnapshotSource{ + SnapshotSourceDockerEvent, + SnapshotSourceInspect, + SnapshotSourceProbe, + } +} + +// HealthSnapshot stores the latest technical-health observation for one +// game. One row per game_id; later observations overwrite. +type HealthSnapshot struct { + // GameID identifies the platform game. + GameID string + + // ContainerID stores the Docker container id observed by the + // snapshot source. Empty when the source could not associate a + // container (e.g., reconciler dispose for a record whose container + // is already gone). + ContainerID string + + // Status stores the latest observed snapshot status. + Status SnapshotStatus + + // Source stores the observation source that produced this entry. + Source SnapshotSource + + // Details stores the source-specific JSON detail payload. Adapters + // store and retrieve it verbatim. Empty / nil values are persisted + // as the SQL default `{}`. + Details json.RawMessage + + // ObservedAt stores the wall-clock at which the source captured the + // observation. + ObservedAt time.Time +} + +// Validate reports whether snapshot satisfies the snapshot invariants +// implied by the SQL CHECK constraints. 
+func (snapshot HealthSnapshot) Validate() error { + if strings.TrimSpace(snapshot.GameID) == "" { + return fmt.Errorf("game id must not be empty") + } + if !snapshot.Status.IsKnown() { + return fmt.Errorf("status %q is unsupported", snapshot.Status) + } + if !snapshot.Source.IsKnown() { + return fmt.Errorf("source %q is unsupported", snapshot.Source) + } + if snapshot.ObservedAt.IsZero() { + return fmt.Errorf("observed at must not be zero") + } + if len(snapshot.Details) > 0 && !json.Valid(snapshot.Details) { + return fmt.Errorf("details must be valid JSON when non-empty") + } + + return nil +} diff --git a/rtmanager/internal/domain/health/snapshot_test.go b/rtmanager/internal/domain/health/snapshot_test.go new file mode 100644 index 0000000..fcb9a73 --- /dev/null +++ b/rtmanager/internal/domain/health/snapshot_test.go @@ -0,0 +1,133 @@ +package health + +import ( + "encoding/json" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestEventTypeIsKnown(t *testing.T) { + for _, eventType := range AllEventTypes() { + assert.Truef(t, eventType.IsKnown(), "expected %q known", eventType) + } + + assert.False(t, EventType("").IsKnown()) + assert.False(t, EventType("paused").IsKnown()) +} + +func TestAllEventTypesCoverFrozenSet(t *testing.T) { + assert.ElementsMatch(t, + []EventType{ + EventTypeContainerStarted, + EventTypeContainerExited, + EventTypeContainerOOM, + EventTypeContainerDisappeared, + EventTypeInspectUnhealthy, + EventTypeProbeFailed, + EventTypeProbeRecovered, + }, + AllEventTypes(), + ) +} + +func TestSnapshotStatusIsKnown(t *testing.T) { + for _, status := range AllSnapshotStatuses() { + assert.Truef(t, status.IsKnown(), "expected %q known", status) + } + + assert.False(t, SnapshotStatus("").IsKnown()) + assert.False(t, SnapshotStatus("starting").IsKnown()) + assert.False(t, SnapshotStatus("probe_recovered").IsKnown(), + "snapshot status must not include event-only values") + assert.False(t, SnapshotStatus("container_started").IsKnown(), + "snapshot status must not include event-only values") +} + +func TestAllSnapshotStatusesCoverFrozenSet(t *testing.T) { + assert.ElementsMatch(t, + []SnapshotStatus{ + SnapshotStatusHealthy, + SnapshotStatusProbeFailed, + SnapshotStatusExited, + SnapshotStatusOOM, + SnapshotStatusInspectUnhealthy, + SnapshotStatusContainerDisappeared, + }, + AllSnapshotStatuses(), + ) +} + +func TestSnapshotSourceIsKnown(t *testing.T) { + for _, source := range AllSnapshotSources() { + assert.Truef(t, source.IsKnown(), "expected %q known", source) + } + + assert.False(t, SnapshotSource("").IsKnown()) + assert.False(t, SnapshotSource("manual").IsKnown()) +} + +func TestAllSnapshotSourcesCoverFrozenSet(t *testing.T) { + assert.ElementsMatch(t, + []SnapshotSource{ + SnapshotSourceDockerEvent, + SnapshotSourceInspect, + SnapshotSourceProbe, + }, + AllSnapshotSources(), + ) +} + +func sampleSnapshot() HealthSnapshot { + return HealthSnapshot{ + GameID: "game-test", + ContainerID: "container-1", + Status: SnapshotStatusHealthy, + Source: SnapshotSourceProbe, + Details: json.RawMessage(`{"prior_failure_count":0}`), + ObservedAt: time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC), + } +} + +func TestHealthSnapshotValidateHappy(t *testing.T) { + require.NoError(t, sampleSnapshot().Validate()) +} + +func TestHealthSnapshotValidateAcceptsEmptyDetails(t *testing.T) { + snapshot := sampleSnapshot() + snapshot.Details = nil + + assert.NoError(t, snapshot.Validate()) +} + +func 
TestHealthSnapshotValidateAcceptsEmptyContainerID(t *testing.T) { + snapshot := sampleSnapshot() + snapshot.ContainerID = "" + + assert.NoError(t, snapshot.Validate()) +} + +func TestHealthSnapshotValidateRejects(t *testing.T) { + tests := []struct { + name string + mutate func(*HealthSnapshot) + }{ + {"empty game id", func(s *HealthSnapshot) { s.GameID = "" }}, + {"unknown status", func(s *HealthSnapshot) { s.Status = "exotic" }}, + {"unknown source", func(s *HealthSnapshot) { s.Source = "exotic" }}, + {"zero observed at", func(s *HealthSnapshot) { s.ObservedAt = time.Time{} }}, + {"invalid details json", func(s *HealthSnapshot) { + s.Details = json.RawMessage("not-json") + }}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + snapshot := sampleSnapshot() + tt.mutate(&snapshot) + assert.Error(t, snapshot.Validate()) + }) + } +} diff --git a/rtmanager/internal/domain/operation/log.go b/rtmanager/internal/domain/operation/log.go new file mode 100644 index 0000000..ac5c223 --- /dev/null +++ b/rtmanager/internal/domain/operation/log.go @@ -0,0 +1,245 @@ +// Package operation defines the runtime-operation audit-log domain types +// owned by Runtime Manager. +// +// One OperationEntry maps to one row of the `operation_log` PostgreSQL +// table (see +// `galaxy/rtmanager/internal/adapters/postgres/migrations/00001_init.sql`). +// The OpKind / OpSource / Outcome enums match the SQL CHECK constraints +// verbatim and feed the telemetry counters declared in +// `galaxy/rtmanager/README.md §Observability`. +package operation + +import ( + "fmt" + "strings" + "time" +) + +// OpKind identifies the kind of operation Runtime Manager performed. +type OpKind string + +const ( + // OpKindStart records a start lifecycle operation. + OpKindStart OpKind = "start" + + // OpKindStop records a stop lifecycle operation. + OpKindStop OpKind = "stop" + + // OpKindRestart records a restart lifecycle operation + // (recreate with the same image_ref). + OpKindRestart OpKind = "restart" + + // OpKindPatch records a semver-patch lifecycle operation + // (recreate with a new image_ref). + OpKindPatch OpKind = "patch" + + // OpKindCleanupContainer records a container removal performed by + // the cleanup TTL worker or the admin DELETE endpoint. + OpKindCleanupContainer OpKind = "cleanup_container" + + // OpKindReconcileAdopt records that the reconciler discovered an + // unrecorded container labelled `com.galaxy.owner=rtmanager` and + // inserted a runtime record for it. + OpKindReconcileAdopt OpKind = "reconcile_adopt" + + // OpKindReconcileDispose records that the reconciler observed a + // running record whose container is missing in Docker and marked it + // as removed. + OpKindReconcileDispose OpKind = "reconcile_dispose" +) + +// IsKnown reports whether kind belongs to the frozen op-kind vocabulary. +func (kind OpKind) IsKnown() bool { + switch kind { + case OpKindStart, + OpKindStop, + OpKindRestart, + OpKindPatch, + OpKindCleanupContainer, + OpKindReconcileAdopt, + OpKindReconcileDispose: + return true + default: + return false + } +} + +// AllOpKinds returns the frozen list of every op-kind value. The slice +// order is stable across calls. +func AllOpKinds() []OpKind { + return []OpKind{ + OpKindStart, + OpKindStop, + OpKindRestart, + OpKindPatch, + OpKindCleanupContainer, + OpKindReconcileAdopt, + OpKindReconcileDispose, + } +} + +// OpSource identifies where one operation entered Runtime Manager. 
+type OpSource string + +const ( + // OpSourceLobbyStream identifies entries triggered by the + // `runtime:start_jobs` or `runtime:stop_jobs` Redis Stream consumer. + OpSourceLobbyStream OpSource = "lobby_stream" + + // OpSourceGMRest identifies entries triggered by Game Master through + // the internal REST surface. + OpSourceGMRest OpSource = "gm_rest" + + // OpSourceAdminRest identifies entries triggered by Admin Service + // through the internal REST surface. + OpSourceAdminRest OpSource = "admin_rest" + + // OpSourceAutoTTL identifies entries triggered by the periodic + // container-cleanup worker. + OpSourceAutoTTL OpSource = "auto_ttl" + + // OpSourceAutoReconcile identifies entries triggered by the + // reconciler at startup or on its periodic interval. + OpSourceAutoReconcile OpSource = "auto_reconcile" +) + +// IsKnown reports whether source belongs to the frozen op-source +// vocabulary. +func (source OpSource) IsKnown() bool { + switch source { + case OpSourceLobbyStream, + OpSourceGMRest, + OpSourceAdminRest, + OpSourceAutoTTL, + OpSourceAutoReconcile: + return true + default: + return false + } +} + +// AllOpSources returns the frozen list of every op-source value. The +// slice order is stable across calls. +func AllOpSources() []OpSource { + return []OpSource{ + OpSourceLobbyStream, + OpSourceGMRest, + OpSourceAdminRest, + OpSourceAutoTTL, + OpSourceAutoReconcile, + } +} + +// Outcome reports the high-level outcome of one operation. +type Outcome string + +const ( + // OutcomeSuccess reports that the operation completed without + // surfacing an error. + OutcomeSuccess Outcome = "success" + + // OutcomeFailure reports that the operation surfaced a stable error + // code recorded in OperationEntry.ErrorCode. + OutcomeFailure Outcome = "failure" +) + +// IsKnown reports whether outcome belongs to the frozen outcome +// vocabulary. +func (outcome Outcome) IsKnown() bool { + switch outcome { + case OutcomeSuccess, OutcomeFailure: + return true + default: + return false + } +} + +// AllOutcomes returns the frozen list of every outcome value. +func AllOutcomes() []Outcome { + return []Outcome{OutcomeSuccess, OutcomeFailure} +} + +// OperationEntry stores one append-only audit row of the `operation_log` +// table. ID is zero on records that have not been persisted yet; the +// store assigns it from the table's bigserial column. FinishedAt is a +// pointer because the column is nullable for in-flight rows even though +// the lifecycle services finalise the row in the same transaction. +type OperationEntry struct { + // ID identifies the persisted row. Zero before persistence. + ID int64 + + // GameID identifies the platform game this operation acted on. + GameID string + + // OpKind classifies what the operation did. + OpKind OpKind + + // OpSource classifies how the operation entered Runtime Manager. + OpSource OpSource + + // SourceRef stores an opaque per-source reference such as a Redis + // Stream entry id, a REST request id, or an admin user id. Empty + // when the source does not provide one. + SourceRef string + + // ImageRef stores the engine image reference associated with the + // operation, when applicable. Empty for operations that do not + // touch an image (e.g., cleanup_container). + ImageRef string + + // ContainerID stores the Docker container id observed at the time + // of the operation, when applicable. + ContainerID string + + // Outcome reports whether the operation succeeded or failed. 
+ Outcome Outcome + + // ErrorCode stores the stable error code on failure. Empty on + // success. + ErrorCode string + + // ErrorMessage stores the operator-readable detail on failure. + // Empty on success. + ErrorMessage string + + // StartedAt stores the wall-clock at which the operation began. + StartedAt time.Time + + // FinishedAt stores the wall-clock at which the operation + // finalised. Nil for in-flight rows. + FinishedAt *time.Time +} + +// Validate reports whether entry satisfies the operation-log invariants +// implied by the SQL CHECK constraints and the README §Persistence +// Layout. +func (entry OperationEntry) Validate() error { + if strings.TrimSpace(entry.GameID) == "" { + return fmt.Errorf("game id must not be empty") + } + if !entry.OpKind.IsKnown() { + return fmt.Errorf("op kind %q is unsupported", entry.OpKind) + } + if !entry.OpSource.IsKnown() { + return fmt.Errorf("op source %q is unsupported", entry.OpSource) + } + if !entry.Outcome.IsKnown() { + return fmt.Errorf("outcome %q is unsupported", entry.Outcome) + } + if entry.StartedAt.IsZero() { + return fmt.Errorf("started at must not be zero") + } + if entry.FinishedAt != nil { + if entry.FinishedAt.IsZero() { + return fmt.Errorf("finished at must not be zero when present") + } + if entry.FinishedAt.Before(entry.StartedAt) { + return fmt.Errorf("finished at must not be before started at") + } + } + if entry.Outcome == OutcomeFailure && strings.TrimSpace(entry.ErrorCode) == "" { + return fmt.Errorf("error code must not be empty for failure entries") + } + + return nil +} diff --git a/rtmanager/internal/domain/operation/log_test.go b/rtmanager/internal/domain/operation/log_test.go new file mode 100644 index 0000000..f5ed31e --- /dev/null +++ b/rtmanager/internal/domain/operation/log_test.go @@ -0,0 +1,130 @@ +package operation + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestOpKindIsKnown(t *testing.T) { + for _, kind := range AllOpKinds() { + assert.Truef(t, kind.IsKnown(), "expected %q known", kind) + } + + assert.False(t, OpKind("").IsKnown()) + assert.False(t, OpKind("rollback").IsKnown()) +} + +func TestAllOpKindsCoverFrozenSet(t *testing.T) { + assert.ElementsMatch(t, + []OpKind{ + OpKindStart, OpKindStop, OpKindRestart, OpKindPatch, + OpKindCleanupContainer, OpKindReconcileAdopt, OpKindReconcileDispose, + }, + AllOpKinds(), + ) +} + +func TestOpSourceIsKnown(t *testing.T) { + for _, source := range AllOpSources() { + assert.Truef(t, source.IsKnown(), "expected %q known", source) + } + + assert.False(t, OpSource("").IsKnown()) + assert.False(t, OpSource("manual").IsKnown()) +} + +func TestAllOpSourcesCoverFrozenSet(t *testing.T) { + assert.ElementsMatch(t, + []OpSource{ + OpSourceLobbyStream, OpSourceGMRest, OpSourceAdminRest, + OpSourceAutoTTL, OpSourceAutoReconcile, + }, + AllOpSources(), + ) +} + +func TestOutcomeIsKnown(t *testing.T) { + for _, outcome := range AllOutcomes() { + assert.Truef(t, outcome.IsKnown(), "expected %q known", outcome) + } + + assert.False(t, Outcome("").IsKnown()) + assert.False(t, Outcome("partial").IsKnown()) +} + +func TestAllOutcomesCoverFrozenSet(t *testing.T) { + assert.ElementsMatch(t, + []Outcome{OutcomeSuccess, OutcomeFailure}, + AllOutcomes(), + ) +} + +func successEntry() OperationEntry { + started := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC) + finished := started.Add(time.Second) + return OperationEntry{ + GameID: "game-test", + OpKind: OpKindStart, + OpSource: OpSourceLobbyStream, + 
SourceRef: "1700000000000-0", + ImageRef: "galaxy/game:1.0.0", + ContainerID: "container-1", + Outcome: OutcomeSuccess, + StartedAt: started, + FinishedAt: &finished, + } +} + +func TestOperationEntryValidateHappy(t *testing.T) { + require.NoError(t, successEntry().Validate()) +} + +func TestOperationEntryValidateAcceptsReplayNoOp(t *testing.T) { + entry := successEntry() + entry.ErrorCode = "replay_no_op" + + assert.NoError(t, entry.Validate()) +} + +func TestOperationEntryValidateAcceptsInFlight(t *testing.T) { + entry := successEntry() + entry.FinishedAt = nil + + assert.NoError(t, entry.Validate()) +} + +func TestOperationEntryValidateRejects(t *testing.T) { + tests := []struct { + name string + mutate func(*OperationEntry) + }{ + {"empty game id", func(e *OperationEntry) { e.GameID = "" }}, + {"unknown op kind", func(e *OperationEntry) { e.OpKind = "exotic" }}, + {"unknown op source", func(e *OperationEntry) { e.OpSource = "exotic" }}, + {"unknown outcome", func(e *OperationEntry) { e.Outcome = "partial" }}, + {"zero started at", func(e *OperationEntry) { e.StartedAt = time.Time{} }}, + {"zero finished at", func(e *OperationEntry) { + zero := time.Time{} + e.FinishedAt = &zero + }}, + {"finished before started", func(e *OperationEntry) { + before := e.StartedAt.Add(-time.Second) + e.FinishedAt = &before + }}, + {"failure without error code", func(e *OperationEntry) { + e.Outcome = OutcomeFailure + e.ErrorCode = "" + }}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + entry := successEntry() + tt.mutate(&entry) + assert.Error(t, entry.Validate()) + }) + } +} diff --git a/rtmanager/internal/domain/runtime/errors.go b/rtmanager/internal/domain/runtime/errors.go new file mode 100644 index 0000000..522e1f1 --- /dev/null +++ b/rtmanager/internal/domain/runtime/errors.go @@ -0,0 +1,43 @@ +package runtime + +import ( + "errors" + "fmt" +) + +// ErrNotFound reports that a runtime record was requested but does not +// exist in the store. +var ErrNotFound = errors.New("runtime record not found") + +// ErrConflict reports that a runtime mutation could not be applied +// because the record changed concurrently or failed a compare-and-swap +// guard. +var ErrConflict = errors.New("runtime record conflict") + +// ErrInvalidTransition is the sentinel returned when Transition rejects +// a `(from, to)` pair. +var ErrInvalidTransition = errors.New("invalid runtime status transition") + +// InvalidTransitionError stores the rejected `(from, to)` pair and wraps +// ErrInvalidTransition so callers can match it with errors.Is. +type InvalidTransitionError struct { + // From stores the source status that was attempted to leave. + From Status + + // To stores the destination status that was attempted to enter. + To Status +} + +// Error reports a human-readable summary of the rejected pair. +func (err *InvalidTransitionError) Error() string { + return fmt.Sprintf( + "invalid runtime status transition from %q to %q", + err.From, err.To, + ) +} + +// Unwrap returns ErrInvalidTransition so errors.Is recognizes the +// sentinel. +func (err *InvalidTransitionError) Unwrap() error { + return ErrInvalidTransition +} diff --git a/rtmanager/internal/domain/runtime/model.go b/rtmanager/internal/domain/runtime/model.go new file mode 100644 index 0000000..5bab48d --- /dev/null +++ b/rtmanager/internal/domain/runtime/model.go @@ -0,0 +1,197 @@ +// Package runtime defines the runtime-record domain model, status machine, +// and sentinel errors owned by Runtime Manager. 
+// +// The package mirrors the durable shape of the `runtime_records` +// PostgreSQL table (see +// `galaxy/rtmanager/internal/adapters/postgres/migrations/00001_init.sql`). +// Every status / transition / required-field rule already documented in +// `galaxy/rtmanager/README.md` lives here as code so adapter and service +// layers do not re-derive it. +package runtime + +import ( + "fmt" + "strings" + "time" +) + +// Status identifies one runtime-record lifecycle state. +type Status string + +const ( + // StatusRunning reports that an engine container is live and bound to + // the record. The associated container id and image ref are non-empty + // and StartedAt is set. + StatusRunning Status = "running" + + // StatusStopped reports that the engine container has exited (graceful + // stop, observed Docker exit, or reconciled exit). The container is + // still present in Docker until the cleanup worker removes it. + StatusStopped Status = "stopped" + + // StatusRemoved reports that the container has been removed from + // Docker (admin cleanup or reconcile_dispose). The record stays in + // PostgreSQL for audit; there is no transition out of this state. + StatusRemoved Status = "removed" +) + +// IsKnown reports whether status belongs to the frozen runtime status +// vocabulary. +func (status Status) IsKnown() bool { + switch status { + case StatusRunning, StatusStopped, StatusRemoved: + return true + default: + return false + } +} + +// IsTerminal reports whether status can no longer accept lifecycle +// transitions. +func (status Status) IsTerminal() bool { + return status == StatusRemoved +} + +// AllStatuses returns the frozen list of every runtime status value. The +// slice order is stable across calls and matches the README §Persistence +// Layout listing. +func AllStatuses() []Status { + return []Status{ + StatusRunning, + StatusStopped, + StatusRemoved, + } +} + +// RuntimeRecord stores one durable runtime record owned by Runtime +// Manager. It mirrors one row of the `runtime_records` table. +// +// CurrentContainerID and CurrentImageRef are stored as plain strings; an +// empty value represents SQL NULL and is bridged at the adapter layer. +// StartedAt, StoppedAt, and RemovedAt are *time.Time so a missing value +// is unambiguous and aligns with the jet-generated model. +type RuntimeRecord struct { + // GameID identifies the platform game owning this runtime record. + GameID string + + // Status stores the current lifecycle state. + Status Status + + // CurrentContainerID identifies the bound Docker container. Empty + // when status is removed and after a reconciler observes + // disappearance. + CurrentContainerID string + + // CurrentImageRef stores the Docker reference of the currently-bound + // engine image. Non-empty when status is running or stopped. + CurrentImageRef string + + // EngineEndpoint stores the stable URL Game Master uses to reach the + // engine container, in `http://galaxy-game-{game_id}:8080` form. + EngineEndpoint string + + // StatePath stores the absolute host path of the bind-mounted engine + // state directory. + StatePath string + + // DockerNetwork stores the Docker network the container was attached + // to at create time. + DockerNetwork string + + // StartedAt stores the wall-clock at which the container became + // running. Non-nil when status is running or stopped. + StartedAt *time.Time + + // StoppedAt stores the wall-clock at which the container exited. 
+ // Non-nil when status is stopped or removed (when the record passed + // through stopped before removal). + StoppedAt *time.Time + + // RemovedAt stores the wall-clock at which the container was removed + // from Docker. Non-nil when status is removed. + RemovedAt *time.Time + + // LastOpAt stores the wall-clock of the most recent operation + // affecting this record. Drives the cleanup TTL. + LastOpAt time.Time + + // CreatedAt stores the wall-clock at which Runtime Manager first saw + // this game. + CreatedAt time.Time +} + +// Validate reports whether record satisfies the runtime-record invariants +// implied by README §Lifecycles and the SQL CHECK on `runtime_records`. +func (record RuntimeRecord) Validate() error { + if strings.TrimSpace(record.GameID) == "" { + return fmt.Errorf("game id must not be empty") + } + if !record.Status.IsKnown() { + return fmt.Errorf("status %q is unsupported", record.Status) + } + if strings.TrimSpace(record.EngineEndpoint) == "" { + return fmt.Errorf("engine endpoint must not be empty") + } + if strings.TrimSpace(record.StatePath) == "" { + return fmt.Errorf("state path must not be empty") + } + if strings.TrimSpace(record.DockerNetwork) == "" { + return fmt.Errorf("docker network must not be empty") + } + if record.LastOpAt.IsZero() { + return fmt.Errorf("last op at must not be zero") + } + if record.CreatedAt.IsZero() { + return fmt.Errorf("created at must not be zero") + } + if record.LastOpAt.Before(record.CreatedAt) { + return fmt.Errorf("last op at must not be before created at") + } + + switch record.Status { + case StatusRunning: + if strings.TrimSpace(record.CurrentContainerID) == "" { + return fmt.Errorf("current container id must not be empty for running records") + } + if strings.TrimSpace(record.CurrentImageRef) == "" { + return fmt.Errorf("current image ref must not be empty for running records") + } + if record.StartedAt == nil { + return fmt.Errorf("started at must not be nil for running records") + } + if record.StartedAt.IsZero() { + return fmt.Errorf("started at must not be zero when present") + } + + case StatusStopped: + if strings.TrimSpace(record.CurrentImageRef) == "" { + return fmt.Errorf("current image ref must not be empty for stopped records") + } + if record.StoppedAt == nil { + return fmt.Errorf("stopped at must not be nil for stopped records") + } + if record.StoppedAt.IsZero() { + return fmt.Errorf("stopped at must not be zero when present") + } + + case StatusRemoved: + if record.RemovedAt == nil { + return fmt.Errorf("removed at must not be nil for removed records") + } + if record.RemovedAt.IsZero() { + return fmt.Errorf("removed at must not be zero when present") + } + } + + if record.StartedAt != nil && record.StartedAt.Before(record.CreatedAt) { + return fmt.Errorf("started at must not be before created at") + } + if record.StoppedAt != nil && record.StartedAt != nil && record.StoppedAt.Before(*record.StartedAt) { + return fmt.Errorf("stopped at must not be before started at") + } + if record.RemovedAt != nil && record.RemovedAt.Before(record.CreatedAt) { + return fmt.Errorf("removed at must not be before created at") + } + + return nil +} diff --git a/rtmanager/internal/domain/runtime/model_test.go b/rtmanager/internal/domain/runtime/model_test.go new file mode 100644 index 0000000..9ba2818 --- /dev/null +++ b/rtmanager/internal/domain/runtime/model_test.go @@ -0,0 +1,156 @@ +package runtime + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + 
+func TestStatusIsKnown(t *testing.T) { + for _, status := range AllStatuses() { + assert.Truef(t, status.IsKnown(), "expected %q known", status) + } + + assert.False(t, Status("").IsKnown()) + assert.False(t, Status("unknown").IsKnown()) +} + +func TestStatusIsTerminal(t *testing.T) { + assert.True(t, StatusRemoved.IsTerminal()) + + for _, status := range []Status{StatusRunning, StatusStopped} { + assert.Falsef(t, status.IsTerminal(), "expected %q non-terminal", status) + } +} + +func TestAllStatuses(t *testing.T) { + statuses := AllStatuses() + + assert.ElementsMatch(t, + []Status{StatusRunning, StatusStopped, StatusRemoved}, + statuses, + ) + + statuses[0] = "tampered" + assert.Equal(t, StatusRunning, AllStatuses()[0], + "AllStatuses must return an independent slice") +} + +func runningRecord() RuntimeRecord { + created := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC) + started := created.Add(time.Second) + return RuntimeRecord{ + GameID: "game-test", + Status: StatusRunning, + CurrentContainerID: "container-1", + CurrentImageRef: "galaxy/game:1.0.0", + EngineEndpoint: "http://galaxy-game-game-test:8080", + StatePath: "/var/lib/galaxy/games/game-test", + DockerNetwork: "galaxy-net", + StartedAt: &started, + LastOpAt: started, + CreatedAt: created, + } +} + +func TestRuntimeRecordValidateRunningHappy(t *testing.T) { + require.NoError(t, runningRecord().Validate()) +} + +func TestRuntimeRecordValidateStoppedHappy(t *testing.T) { + record := runningRecord() + stopped := record.StartedAt.Add(time.Minute) + record.Status = StatusStopped + record.StoppedAt = &stopped + record.LastOpAt = stopped + + require.NoError(t, record.Validate()) +} + +func TestRuntimeRecordValidateRemovedHappy(t *testing.T) { + record := runningRecord() + stopped := record.StartedAt.Add(time.Minute) + removed := stopped.Add(time.Minute) + record.Status = StatusRemoved + record.StoppedAt = &stopped + record.RemovedAt = &removed + record.CurrentContainerID = "" + record.LastOpAt = removed + + require.NoError(t, record.Validate()) +} + +func TestRuntimeRecordValidateRejects(t *testing.T) { + tests := []struct { + name string + mutate func(*RuntimeRecord) + }{ + {"empty game id", func(r *RuntimeRecord) { r.GameID = "" }}, + {"unknown status", func(r *RuntimeRecord) { r.Status = "exotic" }}, + {"empty engine endpoint", func(r *RuntimeRecord) { r.EngineEndpoint = "" }}, + {"empty state path", func(r *RuntimeRecord) { r.StatePath = "" }}, + {"empty docker network", func(r *RuntimeRecord) { r.DockerNetwork = "" }}, + {"zero last op at", func(r *RuntimeRecord) { r.LastOpAt = time.Time{} }}, + {"zero created at", func(r *RuntimeRecord) { r.CreatedAt = time.Time{} }}, + {"last op at before created at", func(r *RuntimeRecord) { + r.LastOpAt = r.CreatedAt.Add(-time.Second) + }}, + {"running without container id", func(r *RuntimeRecord) { + r.CurrentContainerID = "" + }}, + {"running without image ref", func(r *RuntimeRecord) { + r.CurrentImageRef = "" + }}, + {"running without started at", func(r *RuntimeRecord) { + r.StartedAt = nil + }}, + {"started at before created at", func(r *RuntimeRecord) { + before := r.CreatedAt.Add(-time.Second) + r.StartedAt = &before + }}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + record := runningRecord() + tt.mutate(&record) + assert.Error(t, record.Validate()) + }) + } +} + +func TestRuntimeRecordValidateRejectsStoppedWithoutStoppedAt(t *testing.T) { + record := runningRecord() + record.Status = StatusStopped + record.StoppedAt = nil + + assert.Error(t, 
record.Validate()) +} + +func TestRuntimeRecordValidateRejectsStoppedBeforeStarted(t *testing.T) { + record := runningRecord() + stopped := record.StartedAt.Add(-time.Second) + record.Status = StatusStopped + record.StoppedAt = &stopped + + assert.Error(t, record.Validate()) +} + +func TestRuntimeRecordValidateRejectsRemovedWithoutRemovedAt(t *testing.T) { + record := runningRecord() + record.Status = StatusRemoved + record.RemovedAt = nil + + assert.Error(t, record.Validate()) +} + +func TestRuntimeRecordValidateRejectsRemovedBeforeCreated(t *testing.T) { + record := runningRecord() + before := record.CreatedAt.Add(-time.Second) + record.Status = StatusRemoved + record.RemovedAt = &before + + assert.Error(t, record.Validate()) +} diff --git a/rtmanager/internal/domain/runtime/transitions.go b/rtmanager/internal/domain/runtime/transitions.go new file mode 100644 index 0000000..fff82ec --- /dev/null +++ b/rtmanager/internal/domain/runtime/transitions.go @@ -0,0 +1,51 @@ +package runtime + +// transitionKey stores one `(from, to)` pair in the allowed-transitions +// table. +type transitionKey struct { + from Status + to Status +} + +// allowedTransitions stores the set of permitted `(from, to)` status +// pairs. The four pairs mirror the lifecycle flows frozen in +// `galaxy/rtmanager/README.md §Lifecycles`: +// +// - running → stopped: graceful stop, observed Docker exit, or +// reconcile observing an exited container. +// - running → removed: reconcile_dispose when Docker no longer reports +// the container at all. +// - stopped → running: restart and patch inner start steps. +// - stopped → removed: cleanup_container, both the periodic TTL worker +// and the admin DELETE endpoint. +var allowedTransitions = map[transitionKey]struct{}{ + {StatusRunning, StatusStopped}: {}, + {StatusRunning, StatusRemoved}: {}, + {StatusStopped, StatusRunning}: {}, + {StatusStopped, StatusRemoved}: {}, +} + +// AllowedTransitions returns a copy of the `(from, to)` allowed +// transitions table used by Transition. The returned map is safe to +// mutate; callers should not rely on iteration order. +func AllowedTransitions() map[Status][]Status { + result := make(map[Status][]Status) + for key := range allowedTransitions { + result[key.from] = append(result[key.from], key.to) + } + return result +} + +// Transition reports whether from may transition to next. The function +// returns nil when the pair is permitted, and an *InvalidTransitionError +// wrapping ErrInvalidTransition otherwise. It does not touch any store +// and is safe to call from any layer. 
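+//
+// A minimal caller-side sketch (variable names are illustrative, not
+// part of this change):
+//
+//	if err := Transition(record.Status, StatusStopped); err != nil {
+//		var invalid *InvalidTransitionError
+//		if errors.As(err, &invalid) {
+//			// invalid.From and invalid.To identify the rejected pair.
+//		}
+//	}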
+func Transition(from Status, next Status) error { + if !from.IsKnown() || !next.IsKnown() { + return &InvalidTransitionError{From: from, To: next} + } + if _, ok := allowedTransitions[transitionKey{from: from, to: next}]; !ok { + return &InvalidTransitionError{From: from, To: next} + } + return nil +} diff --git a/rtmanager/internal/domain/runtime/transitions_test.go b/rtmanager/internal/domain/runtime/transitions_test.go new file mode 100644 index 0000000..6f34da6 --- /dev/null +++ b/rtmanager/internal/domain/runtime/transitions_test.go @@ -0,0 +1,88 @@ +package runtime + +import ( + "errors" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestTransitionAllowed(t *testing.T) { + cases := []struct { + from Status + to Status + }{ + {StatusRunning, StatusStopped}, + {StatusRunning, StatusRemoved}, + {StatusStopped, StatusRunning}, + {StatusStopped, StatusRemoved}, + } + + for _, tc := range cases { + assert.NoErrorf(t, Transition(tc.from, tc.to), + "expected %q -> %q allowed", tc.from, tc.to) + } +} + +func TestTransitionRejected(t *testing.T) { + cases := []struct { + from Status + to Status + }{ + {StatusRemoved, StatusRunning}, + {StatusRemoved, StatusStopped}, + {StatusRemoved, StatusRemoved}, + {StatusRunning, StatusRunning}, + {StatusStopped, StatusStopped}, + {Status("unknown"), StatusRunning}, + {StatusRunning, Status("unknown")}, + {Status(""), Status("")}, + } + + for _, tc := range cases { + err := Transition(tc.from, tc.to) + require.Errorf(t, err, "expected %q -> %q rejected", tc.from, tc.to) + assert.ErrorIs(t, err, ErrInvalidTransition) + + var transitionErr *InvalidTransitionError + require.True(t, errors.As(err, &transitionErr), + "expected *InvalidTransitionError for %q -> %q", tc.from, tc.to) + assert.Equal(t, tc.from, transitionErr.From) + assert.Equal(t, tc.to, transitionErr.To) + } +} + +func TestAllowedTransitionsReturnsCopy(t *testing.T) { + first := AllowedTransitions() + require.NotEmpty(t, first) + + for from := range first { + first[from] = nil + } + + second := AllowedTransitions() + assert.NotEmpty(t, second[StatusRunning], + "AllowedTransitions must return an independent map per call") +} + +func TestAllowedTransitionsCoversFourPairs(t *testing.T) { + transitions := AllowedTransitions() + + assert.ElementsMatch(t, + []Status{StatusStopped, StatusRemoved}, + transitions[StatusRunning], + ) + assert.ElementsMatch(t, + []Status{StatusRunning, StatusRemoved}, + transitions[StatusStopped], + ) + assert.Empty(t, transitions[StatusRemoved], + "removed has no outgoing transitions") +} + +func TestInvalidTransitionErrorMessage(t *testing.T) { + err := &InvalidTransitionError{From: StatusRunning, To: Status("bogus")} + assert.Contains(t, err.Error(), "running") + assert.Contains(t, err.Error(), "bogus") +} diff --git a/rtmanager/internal/logging/context.go b/rtmanager/internal/logging/context.go new file mode 100644 index 0000000..f3d7fde --- /dev/null +++ b/rtmanager/internal/logging/context.go @@ -0,0 +1,43 @@ +package logging + +import "context" + +// requestIDKey is the unexported context key under which the HTTP layer +// stores the request id propagated from the X-Request-Id header. +type requestIDKey struct{} + +// WithRequestID returns a child context that carries requestID. An empty +// requestID returns ctx unchanged so callers do not have to branch. 
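+//
+// A minimal sketch of the intended round trip (the HTTP handler and
+// logger variables are hypothetical):
+//
+//	ctx := WithRequestID(r.Context(), r.Header.Get("X-Request-Id"))
+//	logger.InfoContext(ctx, "operation requested", ContextAttrs(ctx)...)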
+func WithRequestID(ctx context.Context, requestID string) context.Context { + if ctx == nil || requestID == "" { + return ctx + } + return context.WithValue(ctx, requestIDKey{}, requestID) +} + +// RequestIDFromContext returns the request id stored on ctx by +// WithRequestID, or an empty string when no value is present. +func RequestIDFromContext(ctx context.Context) string { + if ctx == nil { + return "" + } + value, _ := ctx.Value(requestIDKey{}).(string) + return value +} + +// ContextAttrs returns slog key-value pairs that materialise the frozen +// `rtmanager/README.md` §Observability log fields `request_id`, +// `trace_id`, and `span_id` from ctx. Pairs whose value is empty are +// omitted so logs stay tight. +func ContextAttrs(ctx context.Context) []any { + if ctx == nil { + return nil + } + + var attrs []any + if requestID := RequestIDFromContext(ctx); requestID != "" { + attrs = append(attrs, "request_id", requestID) + } + attrs = append(attrs, TraceAttrsFromContext(ctx)...) + return attrs +} diff --git a/rtmanager/internal/logging/logger.go b/rtmanager/internal/logging/logger.go new file mode 100644 index 0000000..fefde6e --- /dev/null +++ b/rtmanager/internal/logging/logger.go @@ -0,0 +1,45 @@ +// Package logging configures the Runtime Manager process logger and +// provides context-aware helpers for trace fields. +package logging + +import ( + "context" + "fmt" + "log/slog" + "os" + "strings" + + "go.opentelemetry.io/otel/trace" +) + +// New constructs the process-wide JSON logger from level. +func New(level string) (*slog.Logger, error) { + var slogLevel slog.Level + if err := slogLevel.UnmarshalText([]byte(strings.TrimSpace(level))); err != nil { + return nil, fmt.Errorf("build logger: %w", err) + } + + return slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{ + Level: slogLevel, + })), nil +} + +// TraceAttrsFromContext returns slog key-value pairs for the active +// OpenTelemetry span when ctx carries a valid span context. The keys match +// the frozen `rtmanager/README.md` §Observability log fields `trace_id` +// and `span_id`. +func TraceAttrsFromContext(ctx context.Context) []any { + if ctx == nil { + return nil + } + + spanContext := trace.SpanContextFromContext(ctx) + if !spanContext.IsValid() { + return nil + } + + return []any{ + "trace_id", spanContext.TraceID().String(), + "span_id", spanContext.SpanID().String(), + } +} diff --git a/rtmanager/internal/ports/dockerclient.go b/rtmanager/internal/ports/dockerclient.go new file mode 100644 index 0000000..6e88bc0 --- /dev/null +++ b/rtmanager/internal/ports/dockerclient.go @@ -0,0 +1,336 @@ +package ports + +import ( + "context" + "errors" + "fmt" + "time" +) + +// PullPolicy enumerates the supported image pull policies. The value +// set mirrors `config.ImagePullPolicy`; the runtime/wiring layer +// translates between the two so the docker adapter does not import +// `internal/config` and the port package stays free of configuration +// concerns. +type PullPolicy string + +// Supported pull policies, frozen by `rtmanager/README.md §Configuration`. +const ( + // PullPolicyIfMissing pulls the image only when it is absent from + // the local Docker daemon. + PullPolicyIfMissing PullPolicy = "if_missing" + + // PullPolicyAlways pulls the image on every start. + PullPolicyAlways PullPolicy = "always" + + // PullPolicyNever skips the pull and fails the start when the image + // is absent. 
+ PullPolicyNever PullPolicy = "never" +) + +// IsKnown reports whether policy belongs to the frozen pull-policy +// vocabulary. +func (policy PullPolicy) IsKnown() bool { + switch policy { + case PullPolicyIfMissing, PullPolicyAlways, PullPolicyNever: + return true + default: + return false + } +} + +//go:generate go run go.uber.org/mock/mockgen -destination=../adapters/docker/mocks/mock_dockerclient.go -package=mocks galaxy/rtmanager/internal/ports DockerClient + +// DockerClient is the narrow Docker port Runtime Manager uses. The +// production adapter wraps `github.com/docker/docker/client`; service +// tests use a generated mock. The surface intentionally exposes only +// the operations RTM needs; `docker logs` and stream attach are out +// of scope for v1. +type DockerClient interface { + // EnsureNetwork verifies the configured Docker network is present + // on the daemon. It returns ErrNetworkMissing when the network does + // not exist; RTM never creates networks itself. + EnsureNetwork(ctx context.Context, name string) error + + // PullImage pulls ref according to policy. It returns nil on + // success and a wrapped Docker error otherwise. Implementations + // honour PullPolicyNever by skipping the pull and returning nil + // when the image is already present, or returning ErrImageNotFound + // otherwise. + PullImage(ctx context.Context, ref string, policy PullPolicy) error + + // InspectImage returns image metadata for ref. It returns + // ErrImageNotFound when no such image exists locally. + InspectImage(ctx context.Context, ref string) (ImageInspect, error) + + // InspectContainer returns container metadata for containerID. It + // returns ErrContainerNotFound when no such container exists. + InspectContainer(ctx context.Context, containerID string) (ContainerInspect, error) + + // Run creates and starts one container according to spec. The + // returned RunResult carries the assigned container id, the stable + // engine endpoint, and the wall-clock observed by the daemon. + Run(ctx context.Context, spec RunSpec) (RunResult, error) + + // Stop sends SIGTERM to the container followed by SIGKILL after + // timeout. It returns nil when the container exited cleanly and + // ErrContainerNotFound when it is already gone. + Stop(ctx context.Context, containerID string, timeout time.Duration) error + + // Remove removes the container. It returns nil when the container + // no longer exists (idempotent removal). + Remove(ctx context.Context, containerID string) error + + // List returns container summaries that match filter. Implementations + // translate ListFilter into the appropriate Docker filters argument. + List(ctx context.Context, filter ListFilter) ([]ContainerSummary, error) + + // EventsListen subscribes to the Docker events stream and returns + // the decoded event channel together with an asynchronous error + // channel. The caller cancels ctx to terminate the subscription. + // Implementations close events when the subscription terminates. + EventsListen(ctx context.Context) (events <-chan DockerEvent, errs <-chan error, err error) +} + +// RunSpec stores the request shape used by DockerClient.Run. +type RunSpec struct { + // Name stores the container name (typically `galaxy-game-{game_id}`). + Name string + + // Image stores the image reference resolved by the producer. + Image string + + // Hostname stores the container hostname assigned for the embedded + // Docker DNS to resolve from other containers on the network. 
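+ // Typically this matches Name (`galaxy-game-{game_id}`) so the stable
+ // engine endpoint host resolves on the network; the port does not
+ // enforce that convention.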
+ Hostname string + + // Network stores the user-defined Docker network the container + // attaches to. + Network string + + // Env stores the environment variables forwarded to the container + // (e.g. GAME_STATE_PATH, STORAGE_PATH). + Env map[string]string + + // Cmd overrides the entrypoint arguments for the container. Production + // callers leave it nil so the engine image's own CMD runs; tests use + // it to drive a tiny container that does not embed RTM-specific + // behaviour. Empty Cmd means "use image default", which mirrors the + // Docker SDK contract. + Cmd []string + + // Labels stores the labels applied to the container so the + // reconciler and the events listener can identify it. + Labels map[string]string + + // BindMounts stores the host-to-container bind mounts. RTM uses + // exactly one mount in v1 (the per-game state directory). + BindMounts []BindMount + + // LogDriver stores the Docker logging driver name. + LogDriver string + + // LogOpts stores the logging-driver options as key=value pairs. + LogOpts map[string]string + + // CPUQuota stores the `--cpus` value applied as a resource limit. + CPUQuota float64 + + // Memory stores the `--memory` value (e.g. `512m`) applied as a + // resource limit. + Memory string + + // PIDsLimit stores the `--pids-limit` value. + PIDsLimit int +} + +// BindMount stores one host-to-container bind mount. +type BindMount struct { + // HostPath stores the absolute host path bound into the container. + HostPath string + + // MountPath stores the absolute in-container path the host + // directory is mounted at. + MountPath string + + // ReadOnly mounts the host path read-only when true. + ReadOnly bool +} + +// RunResult stores the response shape returned by DockerClient.Run. +type RunResult struct { + // ContainerID identifies the created container. + ContainerID string + + // EngineEndpoint stores the stable URL Game Master uses to reach + // the engine container. + EngineEndpoint string + + // StartedAt stores the wall-clock the daemon observed for the + // start event. + StartedAt time.Time +} + +// ImageInspect stores the subset of `docker image inspect` fields RTM +// reads. Only Labels are required at start time (resource limits live +// there); other fields may be populated when convenient for diagnostics. +type ImageInspect struct { + // Ref stores the image reference the inspection was scoped to. + Ref string + + // Labels stores the image-level labels (e.g. + // `com.galaxy.cpu_quota`). + Labels map[string]string +} + +// ContainerInspect stores the subset of `docker inspect` fields RTM +// reads from a running or exited container. +type ContainerInspect struct { + // ID identifies the container. + ID string + + // ImageRef stores the image reference the container was started + // from. + ImageRef string + + // Hostname stores the container hostname. + Hostname string + + // Labels stores the container labels assigned at create time. + Labels map[string]string + + // Status stores the verbatim Docker `State.Status` value (e.g. + // `running`, `exited`). + Status string + + // Health stores the verbatim Docker `State.Health.Status` value + // (e.g. `healthy`, `unhealthy`). Empty when the image declares no + // HEALTHCHECK. + Health string + + // RestartCount stores the Docker `RestartCount` observed at + // inspection time. + RestartCount int + + // StartedAt stores the daemon-observed start wall-clock. + StartedAt time.Time + + // FinishedAt stores the daemon-observed exit wall-clock. 
Zero when + // the container is still running. + FinishedAt time.Time + + // ExitCode stores the exit code reported by the daemon. Zero when + // the container is still running. + ExitCode int + + // OOMKilled reports whether the container was killed by the OOM + // killer. + OOMKilled bool +} + +// ContainerSummary stores the subset of `docker ps` fields RTM reads. +type ContainerSummary struct { + // ID identifies the container. + ID string + + // ImageRef stores the image reference. + ImageRef string + + // Hostname stores the container hostname. + Hostname string + + // Labels stores the container labels assigned at create time. + Labels map[string]string + + // Status stores the verbatim Docker `State.Status` value. + Status string + + // StartedAt stores the daemon-observed start wall-clock. + StartedAt time.Time +} + +// ListFilter stores the criteria used by DockerClient.List. +type ListFilter struct { + // Labels stores label key=value pairs that must all be present on + // the container. Empty matches every container. + Labels map[string]string +} + +// DockerEvent stores one decoded entry from the Docker events stream. +// RTM only consumes container-scoped events. +type DockerEvent struct { + // Action stores the Docker event action verbatim (e.g. `start`, + // `die`, `oom`, `destroy`). + Action string + + // ContainerID identifies the container the event refers to. + ContainerID string + + // Labels stores the container labels carried by the event + // attributes when present. + Labels map[string]string + + // ExitCode stores the exit code attribute when applicable (e.g. + // `die` events). Zero when the action does not carry one. + ExitCode int + + // OccurredAt stores the daemon-observed event wall-clock. + OccurredAt time.Time +} + +// String returns policy as its stored enum value. Convenient for use in +// log fields and error messages. +func (policy PullPolicy) String() string { + return string(policy) +} + +// ErrNetworkMissing reports that the configured Docker network is not +// present on the daemon. +var ErrNetworkMissing = errors.New("docker network missing") + +// ErrImageNotFound reports that an image reference does not resolve to +// a local Docker image. +var ErrImageNotFound = errors.New("docker image not found") + +// ErrContainerNotFound reports that a container id does not resolve to +// a Docker container. +var ErrContainerNotFound = errors.New("docker container not found") + +// Validate reports whether spec carries the structural invariants +// required by DockerClient.Run. Adapters use it as the first defence +// against malformed specs originating in service code. 
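+//
+// A minimal spec that passes these checks (every value below is
+// illustrative; real values come from configuration and the start
+// envelope):
+//
+//	spec := RunSpec{
+//		Name:      "galaxy-game-game-42",
+//		Image:     "galaxy/game:1.0.0",
+//		Hostname:  "galaxy-game-game-42",
+//		Network:   "galaxy-net",
+//		LogDriver: "json-file",
+//		CPUQuota:  1,
+//		Memory:    "512m",
+//		PIDsLimit: 256,
+//	}
+//	if err := spec.Validate(); err != nil {
+//		// reject before calling DockerClient.Run
+//	}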
+func (spec RunSpec) Validate() error {
+ if spec.Name == "" {
+ return fmt.Errorf("run spec: name must not be empty")
+ }
+ if spec.Image == "" {
+ return fmt.Errorf("run spec: image must not be empty")
+ }
+ if spec.Hostname == "" {
+ return fmt.Errorf("run spec: hostname must not be empty")
+ }
+ if spec.Network == "" {
+ return fmt.Errorf("run spec: network must not be empty")
+ }
+ if spec.LogDriver == "" {
+ return fmt.Errorf("run spec: log driver must not be empty")
+ }
+ if spec.CPUQuota <= 0 {
+ return fmt.Errorf("run spec: cpu quota must be positive")
+ }
+ if spec.Memory == "" {
+ return fmt.Errorf("run spec: memory must not be empty")
+ }
+ if spec.PIDsLimit <= 0 {
+ return fmt.Errorf("run spec: pids limit must be positive")
+ }
+ for index, mount := range spec.BindMounts {
+ if mount.HostPath == "" {
+ return fmt.Errorf("run spec: bind mounts[%d]: host path must not be empty", index)
+ }
+ if mount.MountPath == "" {
+ return fmt.Errorf("run spec: bind mounts[%d]: mount path must not be empty", index)
+ }
+ }
+ return nil
+}
diff --git a/rtmanager/internal/ports/gamelease.go b/rtmanager/internal/ports/gamelease.go
new file mode 100644
index 0000000..989d8f6
--- /dev/null
+++ b/rtmanager/internal/ports/gamelease.go
@@ -0,0 +1,38 @@
+package ports
+
+import (
+ "context"
+ "time"
+)
+
+// GameLeaseStore guards every lifecycle operation Runtime Manager runs
+// against one game. The lease serialises starts, stops, restarts, patches,
+// and cleanup operations on the same `game_id` across all entry points
+// (Lobby stream consumer, GM REST handler, Admin REST handler, periodic
+// workers) so concurrent operations cannot corrupt each other's
+// intermediate Docker / PostgreSQL state.
+//
+// The lease is a per-game key with a random token. Adapters use SETNX with
+// PX TTL on TryAcquire and a compare-and-delete on Release so a holder
+// that lost the lease (TTL expiry, replica swap) cannot clear another
+// caller's claim.
+//
+// In v1 the lease is not renewed mid-operation; callers must keep the
+// total operation duration below the configured TTL
+// (`RTMANAGER_GAME_LEASE_TTL_SECONDS`, default 60s). Multi-GB image pulls
+// can exceed this in production and remain a known limitation; later
+// stages may introduce a renewal helper if the limitation bites.
+type GameLeaseStore interface {
+ // TryAcquire attempts to acquire the per-game lease for gameID owned
+ // by token for ttl. It returns true when the lease was acquired and
+ // false when another holder still owns it. A non-nil error reports
+ // transport-level failures (Redis unreachable, network timeout) and
+ // must not be confused with a missed lease.
+ TryAcquire(ctx context.Context, gameID, token string, ttl time.Duration) (acquired bool, err error)
+
+ // Release removes the per-game lease for gameID only when token still
+ // matches the stored owner value. Releasing a lease the caller no
+ // longer owns is a silent no-op so a TTL-driven release race never
+ // clears another caller's claim.
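+ //
+ // Typical acquire/release pattern on the caller side (a sketch with
+ // illustrative identifiers):
+ //
+ //	acquired, err := leases.TryAcquire(ctx, gameID, token, ttl)
+ //	if err != nil {
+ //		return fmt.Errorf("acquire game lease: %w", err)
+ //	}
+ //	if !acquired {
+ //		return errLeaseBusy // another lifecycle operation holds the lease
+ //	}
+ //	defer leases.Release(ctx, gameID, token)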
+ Release(ctx context.Context, gameID, token string) error +} diff --git a/rtmanager/internal/ports/healtheventspublisher.go b/rtmanager/internal/ports/healtheventspublisher.go new file mode 100644 index 0000000..198d95a --- /dev/null +++ b/rtmanager/internal/ports/healtheventspublisher.go @@ -0,0 +1,81 @@ +package ports + +import ( + "context" + "encoding/json" + "fmt" + "strings" + "time" + + "galaxy/rtmanager/internal/domain/health" +) + +// HealthEventPublisher emits one entry on the `runtime:health_events` +// Redis Stream and updates `health_snapshots` with the latest observation +// for the affected game. Adapters publish and snapshot in one call so +// every emission durably advances both surfaces; partial publishes (event +// without snapshot, or vice versa) are not allowed. +// +// The start service emits `container_started` through this port; the +// periodic Docker inspect, the active probe, and the Docker events +// listener publish the rest of the event types through the same port +// without changing its surface. +type HealthEventPublisher interface { + // Publish records envelope on the configured `runtime:health_events` + // stream and upserts the matching `health_snapshots` row. A non-nil + // error reports a transport or storage failure; the caller treats it + // as a degraded emission per `rtmanager/README.md §Notification + // Contracts` (the underlying business state is the source of truth, + // not the event stream). + Publish(ctx context.Context, envelope HealthEventEnvelope) error +} + +// HealthEventEnvelope carries the payload published on +// `runtime:health_events`. The fields mirror the AsyncAPI schema frozen +// in `rtmanager/api/runtime-health-asyncapi.yaml`; adapters serialise +// every field verbatim so consumers see the contracted shape. +type HealthEventEnvelope struct { + // GameID identifies the platform game the event refers to. + GameID string + + // ContainerID identifies the Docker container observed by the event + // source. May differ from the record's current container id after a + // restart race; consumers are expected to treat the value as the + // observation's container, not the record's. + ContainerID string + + // EventType classifies the event per the frozen vocabulary in + // `galaxy/rtmanager/internal/domain/health.EventType`. + EventType health.EventType + + // OccurredAt stores the wall-clock at which Runtime Manager observed + // the event. Adapters convert it to UTC milliseconds for the wire + // payload (`occurred_at_ms`). + OccurredAt time.Time + + // Details stores the event-type-specific JSON payload. Adapters + // persist and stream it verbatim; nil and empty values are treated as + // the canonical empty-object payload. + Details json.RawMessage +} + +// Validate reports whether envelope satisfies the structural invariants +// implied by the AsyncAPI schema. 
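+//
+// A minimal envelope that satisfies these checks (values are
+// illustrative; the event type is written as a string conversion here
+// rather than naming the matching constant in the health package):
+//
+//	envelope := HealthEventEnvelope{
+//		GameID:      "game-42",
+//		ContainerID: "container-1",
+//		EventType:   health.EventType("container_started"),
+//		OccurredAt:  time.Now().UTC(),
+//		Details:     json.RawMessage(`{}`),
+//	}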
+func (envelope HealthEventEnvelope) Validate() error { + if strings.TrimSpace(envelope.GameID) == "" { + return fmt.Errorf("health event envelope: game id must not be empty") + } + if strings.TrimSpace(envelope.ContainerID) == "" { + return fmt.Errorf("health event envelope: container id must not be empty") + } + if !envelope.EventType.IsKnown() { + return fmt.Errorf("health event envelope: event type %q is unsupported", envelope.EventType) + } + if envelope.OccurredAt.IsZero() { + return fmt.Errorf("health event envelope: occurred at must not be zero") + } + if len(envelope.Details) > 0 && !json.Valid(envelope.Details) { + return fmt.Errorf("health event envelope: details must be valid JSON when non-empty") + } + return nil +} diff --git a/rtmanager/internal/ports/healthsnapshotstore.go b/rtmanager/internal/ports/healthsnapshotstore.go new file mode 100644 index 0000000..0dc3326 --- /dev/null +++ b/rtmanager/internal/ports/healthsnapshotstore.go @@ -0,0 +1,22 @@ +package ports + +import ( + "context" + + "galaxy/rtmanager/internal/domain/health" +) + +// HealthSnapshotStore stores the latest technical-health observation per +// game. Adapters keep one row per game_id; later observations overwrite. +type HealthSnapshotStore interface { + // Upsert installs snapshot as the latest observation for + // snapshot.GameID. Adapters validate snapshot through + // health.HealthSnapshot.Validate before touching the store. + Upsert(ctx context.Context, snapshot health.HealthSnapshot) error + + // Get returns the latest snapshot for gameID. It returns + // runtime.ErrNotFound (declared in + // `galaxy/rtmanager/internal/domain/runtime`) when no snapshot has + // been recorded yet. + Get(ctx context.Context, gameID string) (health.HealthSnapshot, error) +} diff --git a/rtmanager/internal/ports/jobresultspublisher.go b/rtmanager/internal/ports/jobresultspublisher.go new file mode 100644 index 0000000..4b1fe76 --- /dev/null +++ b/rtmanager/internal/ports/jobresultspublisher.go @@ -0,0 +1,91 @@ +package ports + +import ( + "context" + "fmt" + "strings" +) + +// JobResultPublisher emits one entry on the `runtime:job_results` Redis +// Stream per finalised start or stop runtime job. Adapters serialise +// every JobResult field verbatim so consumers (Game Lobby's +// runtime-job-result worker today, future services tomorrow) see the +// AsyncAPI shape frozen in `rtmanager/api/runtime-jobs-asyncapi.yaml`. +// +// The start-jobs and stop-jobs consumers publish through this port. +// The synchronous REST handlers do not — REST callers receive the same +// `Result` shape directly from the service layer. +type JobResultPublisher interface { + // Publish records result on the configured `runtime:job_results` + // stream. A non-nil error reports a transport or serialisation + // failure; the caller treats the failure as a degraded emission + // (the operation_log already records the durable outcome). + Publish(ctx context.Context, result JobResult) error +} + +// JobResult outcome values frozen by the +// `RuntimeJobResultPayload.outcome` enum. +const ( + // JobOutcomeSuccess marks a successful start or stop, including the + // idempotent replay variant (`error_code=replay_no_op`). + JobOutcomeSuccess = "success" + + // JobOutcomeFailure marks a stable failure for which the payload + // carries a non-empty `error_code`. + JobOutcomeFailure = "failure" +) + +// JobResult carries the wire payload published on +// `runtime:job_results`. 
The fields mirror the AsyncAPI schema frozen +// in `rtmanager/api/runtime-jobs-asyncapi.yaml`; adapters serialise +// every field verbatim so consumers see the contracted shape. Fields +// that are required by the contract (every field on this struct) are +// always present in the wire entry — even when their string value is +// empty (allowed for `container_id` / `engine_endpoint` / `error_code` +// / `error_message` on appropriate variants). +type JobResult struct { + // GameID identifies the platform game the job acted on. Required. + GameID string + + // Outcome reports the high-level outcome. Must be `success` or + // `failure` (use the JobOutcome* constants). + Outcome string + + // ContainerID stores the Docker container id. Populated on + // `success` for fresh starts and replays; empty on `failure` and + // on `success/replay_no_op` for stop jobs that observed a removed + // record. + ContainerID string + + // EngineEndpoint stores the stable engine URL + // `http://galaxy-game-{game_id}:8080`. Populated alongside + // ContainerID, empty in the same cases. + EngineEndpoint string + + // ErrorCode stores the stable error code from + // `rtmanager/README.md §Error Model`. Empty for fresh successes, + // `replay_no_op` for idempotent replays, one of the failure + // codes otherwise. + ErrorCode string + + // ErrorMessage stores the operator-readable detail. Empty for + // successes; populated alongside ErrorCode on failure. + ErrorMessage string +} + +// Validate reports whether result satisfies the structural invariants +// implied by the AsyncAPI schema: a non-empty game id and one of the +// two known outcome values. The remaining fields are required to be +// present on the wire but may be empty strings, so Validate does not +// constrain them. +func (result JobResult) Validate() error { + if strings.TrimSpace(result.GameID) == "" { + return fmt.Errorf("job result: game id must not be empty") + } + switch result.Outcome { + case JobOutcomeSuccess, JobOutcomeFailure: + return nil + default: + return fmt.Errorf("job result: outcome %q is unsupported", result.Outcome) + } +} diff --git a/rtmanager/internal/ports/lobbyinternal.go b/rtmanager/internal/ports/lobbyinternal.go new file mode 100644 index 0000000..3b3f022 --- /dev/null +++ b/rtmanager/internal/ports/lobbyinternal.go @@ -0,0 +1,47 @@ +package ports + +import ( + "context" + "errors" +) + +// LobbyInternalClient is the synchronous trusted-REST port Runtime +// Manager uses to read ancillary game metadata from Game Lobby. Stage +// 13 calls GetGame purely for diagnostic context; the start envelope +// already carries the only required field (`image_ref`) so a +// LobbyInternalClient failure must not abort the start operation. +type LobbyInternalClient interface { + // GetGame returns the Lobby game record for gameID. It returns + // ErrLobbyGameNotFound when no record exists and ErrLobbyUnavailable + // for transport / timeout / non-2xx responses. + GetGame(ctx context.Context, gameID string) (LobbyGameRecord, error) +} + +// LobbyGameRecord stores the subset of the Lobby `GameRecord` schema +// Runtime Manager uses. The shape is intentionally minimal: this fetch +// is ancillary diagnostics and v1 has no required field. The struct +// may be extended additively without breaking existing callers. +type LobbyGameRecord struct { + // GameID identifies the platform game. + GameID string + + // Status stores the verbatim Lobby status string (e.g. `starting`, + // `running`, `paused`). 
Runtime Manager does not interpret it; it + // is exposed for log enrichment and diagnostics only. + Status string + + // TargetEngineVersion stores the semver of the engine version Lobby + // resolved into the start envelope's image_ref. Empty when Lobby + // did not return one. + TargetEngineVersion string +} + +// ErrLobbyGameNotFound reports that the Lobby internal API returned 404 +// for the requested game id. +var ErrLobbyGameNotFound = errors.New("lobby game not found") + +// ErrLobbyUnavailable reports that the Lobby internal API could not be +// reached (transport error, timeout, non-2xx response). Callers must +// treat the failure as recoverable: Runtime Manager continues the +// operation when the call is purely diagnostic. +var ErrLobbyUnavailable = errors.New("lobby internal api unavailable") diff --git a/rtmanager/internal/ports/notificationintents.go b/rtmanager/internal/ports/notificationintents.go new file mode 100644 index 0000000..328ce32 --- /dev/null +++ b/rtmanager/internal/ports/notificationintents.go @@ -0,0 +1,25 @@ +package ports + +import ( + "context" + + "galaxy/notificationintent" +) + +// NotificationIntentPublisher is the producer port Runtime Manager uses +// to publish admin-only notification intents to Notification Service. +// The production adapter is a thin wrapper around +// `notificationintent.Publisher`; the wrapper drops the entry id +// returned by the underlying publisher because Runtime Manager does +// not track per-intent ids in v1. +// +// A failed Publish call is a notification degradation per +// `galaxy/rtmanager/README.md §Notification Contracts` and must not roll +// back already committed business state. Callers log the error and +// proceed. +type NotificationIntentPublisher interface { + // Publish normalises intent and appends it to the configured Redis + // Stream. Validation failures and transport errors are returned + // verbatim. + Publish(ctx context.Context, intent notificationintent.Intent) error +} diff --git a/rtmanager/internal/ports/operationlogstore.go b/rtmanager/internal/ports/operationlogstore.go new file mode 100644 index 0000000..5a22e9b --- /dev/null +++ b/rtmanager/internal/ports/operationlogstore.go @@ -0,0 +1,23 @@ +package ports + +import ( + "context" + + "galaxy/rtmanager/internal/domain/operation" +) + +// OperationLogStore stores append-only audit entries for every +// lifecycle operation Runtime Manager performed against a game's +// runtime. Adapters must persist entry verbatim and return the +// generated bigserial id from Append. +type OperationLogStore interface { + // Append inserts entry into the operation log and returns the + // generated bigserial id. Adapters validate entry through + // operation.OperationEntry.Validate before touching the store. + Append(ctx context.Context, entry operation.OperationEntry) (id int64, err error) + + // ListByGame returns the most recent entries for gameID, ordered by + // started_at descending and capped by limit. A non-positive limit + // is rejected as invalid input by adapters. + ListByGame(ctx context.Context, gameID string, limit int) ([]operation.OperationEntry, error) +} diff --git a/rtmanager/internal/ports/runtimerecordstore.go b/rtmanager/internal/ports/runtimerecordstore.go new file mode 100644 index 0000000..caecab4 --- /dev/null +++ b/rtmanager/internal/ports/runtimerecordstore.go @@ -0,0 +1,112 @@ +// Package ports defines the stable interfaces that connect Runtime +// Manager use cases to external state and external services. 
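+//
+// Adapters preserve the sentinel-error semantics documented on each
+// interface so service code can branch with errors.Is; a consumer-side
+// sketch (variable names are illustrative):
+//
+//	record, err := records.Get(ctx, gameID)
+//	if errors.Is(err, runtime.ErrNotFound) {
+//		// no runtime has been recorded for this game yet
+//	}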
+package ports + +import ( + "context" + "fmt" + "strings" + "time" + + "galaxy/rtmanager/internal/domain/runtime" +) + +// RuntimeRecordStore stores runtime records and exposes the operations +// used by the service layer (Stages 13+) and the workers (Stages 15-18). +// Adapters must preserve domain semantics: +// +// - Get returns runtime.ErrNotFound when no record exists for gameID. +// - Upsert installs a record verbatim; the caller is responsible for +// domain validation through runtime.RuntimeRecord.Validate. +// - UpdateStatus applies one transition through a compare-and-swap +// guard on (status, current_container_id) and returns +// runtime.ErrConflict on a stale CAS. +// - List returns every record currently stored, regardless of status. +// - ListByStatus returns every record currently indexed under status. +type RuntimeRecordStore interface { + // Get returns the record identified by gameID. It returns + // runtime.ErrNotFound when no record exists. + Get(ctx context.Context, gameID string) (runtime.RuntimeRecord, error) + + // Upsert inserts record when no row exists for record.GameID and + // otherwise overwrites every column verbatim. The start service uses + // Upsert to install fresh records on start, the inner start of + // restart and patch, and the reconcile_adopt path. + Upsert(ctx context.Context, record runtime.RuntimeRecord) error + + // UpdateStatus applies one status transition in a compare-and-swap + // fashion. The adapter must first call runtime.Transition to reject + // invalid pairs without touching the store, then verify that the + // stored status equals input.ExpectedFrom, and (when + // input.ExpectedContainerID is non-empty) that the stored + // current_container_id equals it. The adapter derives stopped_at / + // removed_at and updates last_op_at from input.Now per the + // destination status. + UpdateStatus(ctx context.Context, input UpdateStatusInput) error + + // List returns every runtime record currently stored. Used by the + // internal REST list endpoint; the v1 working set is bounded by the + // games tracked by Lobby and is small enough to return in one + // response (pagination is not supported). The order is + // adapter-defined; callers may reorder as needed. + List(ctx context.Context) ([]runtime.RuntimeRecord, error) + + // ListByStatus returns every record currently indexed under status. + // The order is adapter-defined; callers may reorder as needed. + ListByStatus(ctx context.Context, status runtime.Status) ([]runtime.RuntimeRecord, error) +} + +// UpdateStatusInput stores the arguments required to apply one status +// transition through a RuntimeRecordStore. The adapter is responsible +// for translating the destination status into the matching column +// updates (stopped_at / removed_at / current_container_id NULLing) and +// for the CAS guard. +type UpdateStatusInput struct { + // GameID identifies the record to mutate. + GameID string + + // ExpectedFrom stores the status the caller believes the record + // currently has. A mismatch results in runtime.ErrConflict. + ExpectedFrom runtime.Status + + // ExpectedContainerID is an optional CAS guard. When non-empty, the + // adapter rejects the update with runtime.ErrConflict if the stored + // current_container_id does not equal it. Used by stop / cleanup / + // reconcile to protect against concurrent restart races. Empty + // disables the container-id CAS while keeping the status CAS. + ExpectedContainerID string + + // To stores the destination status. 
+ To runtime.Status + + // Now stores the wall-clock used to derive stopped_at / removed_at + // and last_op_at depending on To. + Now time.Time +} + +// Validate reports whether input contains a structurally valid status +// transition request. Adapters call Validate before touching the store. +func (input UpdateStatusInput) Validate() error { + if strings.TrimSpace(input.GameID) == "" { + return fmt.Errorf("update runtime status: game id must not be empty") + } + if !input.ExpectedFrom.IsKnown() { + return fmt.Errorf( + "update runtime status: expected from status %q is unsupported", + input.ExpectedFrom, + ) + } + if !input.To.IsKnown() { + return fmt.Errorf( + "update runtime status: to status %q is unsupported", + input.To, + ) + } + if err := runtime.Transition(input.ExpectedFrom, input.To); err != nil { + return fmt.Errorf("update runtime status: %w", err) + } + if input.Now.IsZero() { + return fmt.Errorf("update runtime status: now must not be zero") + } + return nil +} diff --git a/rtmanager/internal/ports/runtimerecordstore_test.go b/rtmanager/internal/ports/runtimerecordstore_test.go new file mode 100644 index 0000000..706a5dc --- /dev/null +++ b/rtmanager/internal/ports/runtimerecordstore_test.go @@ -0,0 +1,70 @@ +package ports + +import ( + "errors" + "testing" + "time" + + "galaxy/rtmanager/internal/domain/runtime" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func validUpdateStatusInput() UpdateStatusInput { + return UpdateStatusInput{ + GameID: "game-test", + ExpectedFrom: runtime.StatusRunning, + ExpectedContainerID: "container-1", + To: runtime.StatusStopped, + Now: time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC), + } +} + +func TestUpdateStatusInputValidateHappy(t *testing.T) { + require.NoError(t, validUpdateStatusInput().Validate()) +} + +func TestUpdateStatusInputValidateAcceptsEmptyContainerCAS(t *testing.T) { + input := validUpdateStatusInput() + input.ExpectedContainerID = "" + + assert.NoError(t, input.Validate()) +} + +func TestUpdateStatusInputValidateRejects(t *testing.T) { + tests := []struct { + name string + mutate func(*UpdateStatusInput) + }{ + {"empty game id", func(i *UpdateStatusInput) { i.GameID = "" }}, + {"unknown expected from", func(i *UpdateStatusInput) { + i.ExpectedFrom = "exotic" + }}, + {"unknown to", func(i *UpdateStatusInput) { + i.To = "exotic" + }}, + {"zero now", func(i *UpdateStatusInput) { + i.Now = time.Time{} + }}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + input := validUpdateStatusInput() + tt.mutate(&input) + assert.Error(t, input.Validate()) + }) + } +} + +func TestUpdateStatusInputValidateRejectsForbiddenTransition(t *testing.T) { + input := validUpdateStatusInput() + input.ExpectedFrom = runtime.StatusRemoved + input.To = runtime.StatusRunning + + err := input.Validate() + require.Error(t, err) + assert.True(t, errors.Is(err, runtime.ErrInvalidTransition), + "want runtime.ErrInvalidTransition, got %v", err) +} diff --git a/rtmanager/internal/ports/streamoffsetstore.go b/rtmanager/internal/ports/streamoffsetstore.go new file mode 100644 index 0000000..53a9c03 --- /dev/null +++ b/rtmanager/internal/ports/streamoffsetstore.go @@ -0,0 +1,23 @@ +package ports + +import "context" + +// StreamOffsetStore persists the last successfully processed Redis +// Stream entry id per consumer label. 
Workers call Load on startup to +// resume from the persisted offset and Save after every successful +// message handling so the next iteration advances past the +// just-processed entry. The label is the short logical identifier of +// the consumer (e.g. `start_jobs`, `stop_jobs`), not the full stream +// name; it stays stable when the underlying stream key is renamed. +type StreamOffsetStore interface { + // Load returns the last processed entry id for the consumer + // labelled stream when one is stored. The boolean return reports + // whether a value was present; implementations must not return an + // error for a missing key. + Load(ctx context.Context, stream string) (entryID string, found bool, err error) + + // Save stores entryID as the new last processed offset for the + // consumer labelled stream. Implementations overwrite any previous + // value unconditionally. + Save(ctx context.Context, stream, entryID string) error +} diff --git a/rtmanager/internal/service/cleanupcontainer/service.go b/rtmanager/internal/service/cleanupcontainer/service.go new file mode 100644 index 0000000..b1b2c6d --- /dev/null +++ b/rtmanager/internal/service/cleanupcontainer/service.go @@ -0,0 +1,442 @@ +// Package cleanupcontainer implements the `cleanup_container` lifecycle +// operation owned by Runtime Manager. The service removes the Docker +// container of an already-stopped runtime and transitions the record +// to `removed`. It refuses to operate on a still-running runtime — +// callers must stop first. +// +// Two callers exercise this surface: the administrative +// `DELETE /api/v1/internal/runtimes/{game_id}/container` endpoint, and +// the periodic container-cleanup worker that walks +// `runtime_records.status='stopped'` rows older than +// `RTMANAGER_CONTAINER_RETENTION_DAYS`. Both paths flow through Handle. +// +// Lifecycle and failure-mode semantics follow `rtmanager/README.md +// §Lifecycles → Cleanup`. Design rationale is captured in +// `rtmanager/docs/services.md`. +package cleanupcontainer + +import ( + "context" + "crypto/rand" + "encoding/base64" + "errors" + "fmt" + "log/slog" + "strings" + "time" + + "galaxy/rtmanager/internal/config" + "galaxy/rtmanager/internal/domain/operation" + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/logging" + "galaxy/rtmanager/internal/ports" + "galaxy/rtmanager/internal/service/startruntime" + "galaxy/rtmanager/internal/telemetry" +) + +const leaseReleaseTimeout = 5 * time.Second + +// Input stores the per-call arguments for one cleanup operation. +type Input struct { + // GameID identifies the platform game whose container is removed. + GameID string + + // OpSource classifies how the request entered Runtime Manager. + // Required: every operation_log entry carries an op_source. + OpSource operation.OpSource + + // SourceRef stores the optional opaque per-source reference (REST + // request id, admin user id). Empty for the periodic auto-TTL + // caller. + SourceRef string +} + +// Validate reports whether input carries the structural invariants the +// service requires. +func (input Input) Validate() error { + if strings.TrimSpace(input.GameID) == "" { + return fmt.Errorf("game id must not be empty") + } + if !input.OpSource.IsKnown() { + return fmt.Errorf("op source %q is unsupported", input.OpSource) + } + return nil +} + +// Result stores the deterministic outcome of one Handle call. +type Result struct { + // Record carries the updated runtime record on success and on + // idempotent replay; zero on failure. 
+ Record runtime.RuntimeRecord + + // Outcome reports whether the operation completed (success) or + // produced a stable failure code. + Outcome operation.Outcome + + // ErrorCode stores the stable error code on failure, or + // `replay_no_op` on idempotent replay. Empty for fresh successes. + ErrorCode string + + // ErrorMessage stores the operator-readable detail on failure. + ErrorMessage string +} + +// Dependencies groups the collaborators required by Service. +type Dependencies struct { + RuntimeRecords ports.RuntimeRecordStore + OperationLogs ports.OperationLogStore + Docker ports.DockerClient + Leases ports.GameLeaseStore + + Coordination config.CoordinationConfig + + Telemetry *telemetry.Runtime + Logger *slog.Logger + Clock func() time.Time + NewToken func() string +} + +// Service executes the cleanup_container lifecycle operation. +type Service struct { + runtimeRecords ports.RuntimeRecordStore + operationLogs ports.OperationLogStore + docker ports.DockerClient + leases ports.GameLeaseStore + + leaseTTL time.Duration + + telemetry *telemetry.Runtime + logger *slog.Logger + + clock func() time.Time + newToken func() string +} + +// NewService constructs one Service from deps. +func NewService(deps Dependencies) (*Service, error) { + switch { + case deps.RuntimeRecords == nil: + return nil, errors.New("new cleanup container service: nil runtime records") + case deps.OperationLogs == nil: + return nil, errors.New("new cleanup container service: nil operation logs") + case deps.Docker == nil: + return nil, errors.New("new cleanup container service: nil docker client") + case deps.Leases == nil: + return nil, errors.New("new cleanup container service: nil lease store") + case deps.Telemetry == nil: + return nil, errors.New("new cleanup container service: nil telemetry runtime") + } + if err := deps.Coordination.Validate(); err != nil { + return nil, fmt.Errorf("new cleanup container service: coordination config: %w", err) + } + + clock := deps.Clock + if clock == nil { + clock = time.Now + } + logger := deps.Logger + if logger == nil { + logger = slog.Default() + } + logger = logger.With("service", "rtmanager.cleanupcontainer") + + newToken := deps.NewToken + if newToken == nil { + newToken = defaultTokenGenerator() + } + + return &Service{ + runtimeRecords: deps.RuntimeRecords, + operationLogs: deps.OperationLogs, + docker: deps.Docker, + leases: deps.Leases, + leaseTTL: deps.Coordination.GameLeaseTTL, + telemetry: deps.Telemetry, + logger: logger, + clock: clock, + newToken: newToken, + }, nil +} + +// Handle executes one cleanup operation end-to-end. The Go-level error +// return is reserved for non-business failures (nil context, nil +// receiver). Every business outcome — success, idempotent replay, or +// any of the stable failure modes — flows through Result. 
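+//
+// A minimal caller sketch (the svc wiring and the error mapping are
+// assumed; the identifiers are this package's):
+//
+//	result, err := svc.Handle(ctx, Input{
+//		GameID:   "game-1",
+//		OpSource: operation.OpSourceAdminRest,
+//	})
+//	if err != nil {
+//		return err // programming error, not a business outcome
+//	}
+//	if result.Outcome == operation.OutcomeFailure {
+//		// map result.ErrorCode and result.ErrorMessage onto the
+//		// caller's transport (HTTP status, retry decision, ...)
+//	}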
+func (service *Service) Handle(ctx context.Context, input Input) (Result, error) { + if service == nil { + return Result{}, errors.New("cleanup container: nil service") + } + if ctx == nil { + return Result{}, errors.New("cleanup container: nil context") + } + + opStartedAt := service.clock().UTC() + + if err := input.Validate(); err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeInvalidRequest, + errorMessage: err.Error(), + }), nil + } + + token := service.newToken() + leaseStart := service.clock() + acquired, err := service.leases.TryAcquire(ctx, input.GameID, token, service.leaseTTL) + service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(leaseStart)) + if err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeServiceUnavailable, + errorMessage: fmt.Sprintf("acquire game lease: %s", err.Error()), + }), nil + } + if !acquired { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeConflict, + errorMessage: "another lifecycle operation is in progress for this game", + }), nil + } + defer service.releaseLease(ctx, input.GameID, token) + + return service.runUnderLease(ctx, input, opStartedAt) +} + +// runUnderLease executes the lease-protected cleanup steps. +func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) { + existing, err := service.runtimeRecords.Get(ctx, input.GameID) + if errors.Is(err, runtime.ErrNotFound) { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeNotFound, + errorMessage: fmt.Sprintf("runtime record for game %q does not exist", input.GameID), + }), nil + } + if err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeInternal, + errorMessage: fmt.Sprintf("load runtime record: %s", err.Error()), + }), nil + } + + switch existing.Status { + case runtime.StatusRemoved: + return service.recordReplayNoOp(ctx, opStartedAt, input, existing), nil + case runtime.StatusRunning: + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeConflict, + errorMessage: fmt.Sprintf("runtime for game %q is running; stop the runtime first", input.GameID), + containerID: existing.CurrentContainerID, + imageRef: existing.CurrentImageRef, + }), nil + case runtime.StatusStopped: + // proceed + default: + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeInternal, + errorMessage: fmt.Sprintf("runtime record has unsupported status %q", existing.Status), + }), nil + } + + if existing.CurrentContainerID != "" { + if err := service.docker.Remove(ctx, existing.CurrentContainerID); err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeServiceUnavailable, + errorMessage: fmt.Sprintf("docker remove: %s", err.Error()), + containerID: existing.CurrentContainerID, + imageRef: existing.CurrentImageRef, + }), nil + } + } + + updateNow := service.clock().UTC() + err = service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{ + GameID: input.GameID, + ExpectedFrom: runtime.StatusStopped, + 
ExpectedContainerID: existing.CurrentContainerID, + To: runtime.StatusRemoved, + Now: updateNow, + }) + if errors.Is(err, runtime.ErrConflict) { + // CAS race: another caller (reconciler dispose, concurrent admin) + // already moved the record. The desired terminal state was + // reached by another path. + return service.recordReplayNoOp(ctx, opStartedAt, input, existing), nil + } + if errors.Is(err, runtime.ErrNotFound) { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeNotFound, + errorMessage: fmt.Sprintf("runtime record for game %q vanished mid-cleanup", input.GameID), + containerID: existing.CurrentContainerID, + imageRef: existing.CurrentImageRef, + }), nil + } + if err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeInternal, + errorMessage: fmt.Sprintf("update runtime status: %s", err.Error()), + containerID: existing.CurrentContainerID, + imageRef: existing.CurrentImageRef, + }), nil + } + + finishedAt := service.clock().UTC() + service.bestEffortAppend(ctx, operation.OperationEntry{ + GameID: input.GameID, + OpKind: operation.OpKindCleanupContainer, + OpSource: input.OpSource, + SourceRef: input.SourceRef, + ImageRef: existing.CurrentImageRef, + ContainerID: existing.CurrentContainerID, + Outcome: operation.OutcomeSuccess, + StartedAt: opStartedAt, + FinishedAt: &finishedAt, + }) + service.telemetry.RecordCleanupOutcome(ctx, string(operation.OutcomeSuccess), string(input.OpSource)) + + record := existing + record.Status = runtime.StatusRemoved + record.CurrentContainerID = "" + removedAt := updateNow + record.RemovedAt = &removedAt + record.LastOpAt = updateNow + + logArgs := []any{ + "game_id", input.GameID, + "container_id", existing.CurrentContainerID, + "op_source", string(input.OpSource), + } + logArgs = append(logArgs, logging.ContextAttrs(ctx)...) + service.logger.InfoContext(ctx, "runtime container cleaned up", logArgs...) + + return Result{ + Record: record, + Outcome: operation.OutcomeSuccess, + }, nil +} + +// recordReplayNoOp records the idempotent replay outcome and returns the +// existing record unchanged. +func (service *Service) recordReplayNoOp(ctx context.Context, opStartedAt time.Time, input Input, existing runtime.RuntimeRecord) Result { + finishedAt := service.clock().UTC() + service.bestEffortAppend(ctx, operation.OperationEntry{ + GameID: input.GameID, + OpKind: operation.OpKindCleanupContainer, + OpSource: input.OpSource, + SourceRef: input.SourceRef, + ImageRef: existing.CurrentImageRef, + ContainerID: existing.CurrentContainerID, + Outcome: operation.OutcomeSuccess, + ErrorCode: startruntime.ErrorCodeReplayNoOp, + StartedAt: opStartedAt, + FinishedAt: &finishedAt, + }) + service.telemetry.RecordCleanupOutcome(ctx, string(operation.OutcomeSuccess), string(input.OpSource)) + + logArgs := []any{ + "game_id", input.GameID, + "container_id", existing.CurrentContainerID, + "op_source", string(input.OpSource), + } + logArgs = append(logArgs, logging.ContextAttrs(ctx)...) + service.logger.InfoContext(ctx, "runtime cleanup replay no-op", logArgs...) + + return Result{ + Record: existing, + Outcome: operation.OutcomeSuccess, + ErrorCode: startruntime.ErrorCodeReplayNoOp, + } +} + +// failureCtx groups the inputs to recordFailure. 
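+// containerID and imageRef are optional: failures raised before the
+// runtime record is loaded (validation, lease acquisition) append
+// their operation_log entry without container or image attribution.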
+type failureCtx struct { + opStartedAt time.Time + input Input + errorCode string + errorMessage string + containerID string + imageRef string +} + +func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result { + finishedAt := service.clock().UTC() + service.bestEffortAppend(ctx, operation.OperationEntry{ + GameID: fc.input.GameID, + OpKind: operation.OpKindCleanupContainer, + OpSource: fc.input.OpSource, + SourceRef: fc.input.SourceRef, + ImageRef: fc.imageRef, + ContainerID: fc.containerID, + Outcome: operation.OutcomeFailure, + ErrorCode: fc.errorCode, + ErrorMessage: fc.errorMessage, + StartedAt: fc.opStartedAt, + FinishedAt: &finishedAt, + }) + service.telemetry.RecordCleanupOutcome(ctx, string(operation.OutcomeFailure), string(fc.input.OpSource)) + + logArgs := []any{ + "game_id", fc.input.GameID, + "op_source", string(fc.input.OpSource), + "error_code", fc.errorCode, + "error_message", fc.errorMessage, + } + logArgs = append(logArgs, logging.ContextAttrs(ctx)...) + service.logger.WarnContext(ctx, "runtime cleanup failed", logArgs...) + + return Result{ + Outcome: operation.OutcomeFailure, + ErrorCode: fc.errorCode, + ErrorMessage: fc.errorMessage, + } +} + +func (service *Service) releaseLease(ctx context.Context, gameID, token string) { + cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout) + defer cancel() + if err := service.leases.Release(cleanupCtx, gameID, token); err != nil { + service.logger.WarnContext(ctx, "release game lease", + "game_id", gameID, + "err", err.Error(), + ) + } +} + +func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) { + if _, err := service.operationLogs.Append(ctx, entry); err != nil { + service.logger.ErrorContext(ctx, "append operation log", + "game_id", entry.GameID, + "op_kind", string(entry.OpKind), + "outcome", string(entry.Outcome), + "error_code", entry.ErrorCode, + "err", err.Error(), + ) + } +} + +func defaultTokenGenerator() func() string { + return func() string { + var buf [32]byte + if _, err := rand.Read(buf[:]); err != nil { + return "rtmanager-fallback-token" + } + return base64.RawURLEncoding.EncodeToString(buf[:]) + } +} diff --git a/rtmanager/internal/service/cleanupcontainer/service_test.go b/rtmanager/internal/service/cleanupcontainer/service_test.go new file mode 100644 index 0000000..04e0092 --- /dev/null +++ b/rtmanager/internal/service/cleanupcontainer/service_test.go @@ -0,0 +1,382 @@ +package cleanupcontainer_test + +import ( + "context" + "errors" + "sync" + "testing" + "time" + + "galaxy/rtmanager/internal/adapters/docker/mocks" + "galaxy/rtmanager/internal/config" + "galaxy/rtmanager/internal/domain/operation" + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/ports" + "galaxy/rtmanager/internal/service/cleanupcontainer" + "galaxy/rtmanager/internal/service/startruntime" + "galaxy/rtmanager/internal/telemetry" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" +) + +// --- shared fake doubles ---------------------------------------------- + +type fakeRuntimeRecords struct { + mu sync.Mutex + + stored map[string]runtime.RuntimeRecord + getErr error + updateStatusErr error + + updates []ports.UpdateStatusInput +} + +func newFakeRuntimeRecords() *fakeRuntimeRecords { + return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}} +} + +func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) { + 
s.mu.Lock() + defer s.mu.Unlock() + if s.getErr != nil { + return runtime.RuntimeRecord{}, s.getErr + } + record, ok := s.stored[gameID] + if !ok { + return runtime.RuntimeRecord{}, runtime.ErrNotFound + } + return record, nil +} + +func (s *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error { + return errors.New("not used in cleanup tests") +} + +func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, input ports.UpdateStatusInput) error { + s.mu.Lock() + defer s.mu.Unlock() + s.updates = append(s.updates, input) + if s.updateStatusErr != nil { + return s.updateStatusErr + } + record, ok := s.stored[input.GameID] + if !ok { + return runtime.ErrNotFound + } + if record.Status != input.ExpectedFrom { + return runtime.ErrConflict + } + if input.ExpectedContainerID != "" && record.CurrentContainerID != input.ExpectedContainerID { + return runtime.ErrConflict + } + record.Status = input.To + record.LastOpAt = input.Now + if input.To == runtime.StatusRemoved { + removedAt := input.Now + record.RemovedAt = &removedAt + record.CurrentContainerID = "" + } + s.stored[input.GameID] = record + return nil +} + +func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) { + return nil, errors.New("not used in cleanup tests") +} + +func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) { + return nil, errors.New("not used in cleanup tests") +} + +type fakeOperationLogs struct { + mu sync.Mutex + + appendErr error + appends []operation.OperationEntry +} + +func (s *fakeOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) { + s.mu.Lock() + defer s.mu.Unlock() + if s.appendErr != nil { + return 0, s.appendErr + } + s.appends = append(s.appends, entry) + return int64(len(s.appends)), nil +} + +func (s *fakeOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) { + return nil, errors.New("not used in cleanup tests") +} + +func (s *fakeOperationLogs) lastAppend() (operation.OperationEntry, bool) { + s.mu.Lock() + defer s.mu.Unlock() + if len(s.appends) == 0 { + return operation.OperationEntry{}, false + } + return s.appends[len(s.appends)-1], true +} + +type fakeLeases struct { + mu sync.Mutex + + acquired bool + acquireErr error + releaseErr error + + acquires []string + releases []string +} + +func (l *fakeLeases) TryAcquire(_ context.Context, _, token string, _ time.Duration) (bool, error) { + l.mu.Lock() + defer l.mu.Unlock() + l.acquires = append(l.acquires, token) + if l.acquireErr != nil { + return false, l.acquireErr + } + return l.acquired, nil +} + +func (l *fakeLeases) Release(_ context.Context, _, token string) error { + l.mu.Lock() + defer l.mu.Unlock() + l.releases = append(l.releases, token) + return l.releaseErr +} + +// --- harness ---------------------------------------------------------- + +type harness struct { + records *fakeRuntimeRecords + operationLogs *fakeOperationLogs + docker *mocks.MockDockerClient + leases *fakeLeases + + telemetry *telemetry.Runtime + + now time.Time +} + +func newHarness(t *testing.T) *harness { + t.Helper() + ctrl := gomock.NewController(t) + t.Cleanup(ctrl.Finish) + + telemetryRuntime, err := telemetry.NewWithProviders(nil, nil) + require.NoError(t, err) + + return &harness{ + records: newFakeRuntimeRecords(), + operationLogs: &fakeOperationLogs{}, + docker: mocks.NewMockDockerClient(ctrl), + leases: &fakeLeases{acquired: true}, + telemetry: telemetryRuntime, + now: 
time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC), + } +} + +func (h *harness) build(t *testing.T) *cleanupcontainer.Service { + t.Helper() + service, err := cleanupcontainer.NewService(cleanupcontainer.Dependencies{ + RuntimeRecords: h.records, + OperationLogs: h.operationLogs, + Docker: h.docker, + Leases: h.leases, + Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute}, + Telemetry: h.telemetry, + Clock: func() time.Time { return h.now }, + NewToken: func() string { return "token-A" }, + }) + require.NoError(t, err) + return service +} + +func basicInput() cleanupcontainer.Input { + return cleanupcontainer.Input{ + GameID: "game-1", + OpSource: operation.OpSourceAdminRest, + SourceRef: "rest-cleanup-1", + } +} + +func stoppedRecord(now time.Time) runtime.RuntimeRecord { + startedAt := now.Add(-2 * time.Hour) + stoppedAt := now.Add(-time.Hour) + return runtime.RuntimeRecord{ + GameID: "game-1", + Status: runtime.StatusStopped, + CurrentContainerID: "ctr-old", + CurrentImageRef: "registry.example.com/galaxy/game:1.4.7", + EngineEndpoint: "http://galaxy-game-game-1:8080", + StatePath: "/var/lib/galaxy/games/game-1", + DockerNetwork: "galaxy-net", + StartedAt: &startedAt, + StoppedAt: &stoppedAt, + LastOpAt: stoppedAt, + CreatedAt: startedAt, + } +} + +// --- happy path ----------------------------------------------------- + +func TestHandleCleanupHappyPath(t *testing.T) { + h := newHarness(t) + h.records.stored["game-1"] = stoppedRecord(h.now) + + h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil) + + service := h.build(t) + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeSuccess, result.Outcome) + assert.Empty(t, result.ErrorCode) + assert.Equal(t, runtime.StatusRemoved, result.Record.Status) + assert.Empty(t, result.Record.CurrentContainerID) + + require.Len(t, h.records.updates, 1) + assert.Equal(t, runtime.StatusStopped, h.records.updates[0].ExpectedFrom) + assert.Equal(t, runtime.StatusRemoved, h.records.updates[0].To) + + require.Len(t, h.operationLogs.appends, 1) + last, _ := h.operationLogs.lastAppend() + assert.Equal(t, operation.OpKindCleanupContainer, last.OpKind) + assert.Equal(t, operation.OutcomeSuccess, last.Outcome) + assert.Empty(t, last.ErrorCode) +} + +// --- replay --------------------------------------------------------- + +func TestHandleReplayNoOpForRemovedRecord(t *testing.T) { + h := newHarness(t) + removed := stoppedRecord(h.now) + removed.Status = runtime.StatusRemoved + removed.CurrentContainerID = "" + removedAt := h.now.Add(-30 * time.Minute) + removed.RemovedAt = &removedAt + h.records.stored["game-1"] = removed + + service := h.build(t) + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeSuccess, result.Outcome) + assert.Equal(t, startruntime.ErrorCodeReplayNoOp, result.ErrorCode) + assert.Empty(t, h.records.updates) + + last, _ := h.operationLogs.lastAppend() + assert.Equal(t, startruntime.ErrorCodeReplayNoOp, last.ErrorCode) +} + +func TestHandleReplayNoOpOnUpdateStatusConflict(t *testing.T) { + h := newHarness(t) + h.records.stored["game-1"] = stoppedRecord(h.now) + h.records.updateStatusErr = runtime.ErrConflict + + h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil) + + service := h.build(t) + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeSuccess, result.Outcome) + assert.Equal(t, 
startruntime.ErrorCodeReplayNoOp, result.ErrorCode) +} + +// --- failure paths -------------------------------------------------- + +func TestHandleConflictOnRunningRecord(t *testing.T) { + h := newHarness(t) + running := stoppedRecord(h.now) + running.Status = runtime.StatusRunning + startedAt := h.now.Add(-time.Hour) + running.StartedAt = &startedAt + running.StoppedAt = nil + h.records.stored["game-1"] = running + + service := h.build(t) + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeFailure, result.Outcome) + assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode) + assert.Contains(t, result.ErrorMessage, "stop the runtime first") +} + +func TestHandleNotFoundForMissingRecord(t *testing.T) { + h := newHarness(t) + + service := h.build(t) + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeFailure, result.Outcome) + assert.Equal(t, startruntime.ErrorCodeNotFound, result.ErrorCode) +} + +func TestHandleServiceUnavailableOnDockerRemoveFailure(t *testing.T) { + h := newHarness(t) + h.records.stored["game-1"] = stoppedRecord(h.now) + + h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(errors.New("disk i/o")) + + service := h.build(t) + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeFailure, result.Outcome) + assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode) + assert.Empty(t, h.records.updates, "no record mutation on docker remove failure") +} + +func TestHandleInternalErrorOnGenericUpdateError(t *testing.T) { + h := newHarness(t) + h.records.stored["game-1"] = stoppedRecord(h.now) + h.records.updateStatusErr = errors.New("postgres down") + + h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil) + + service := h.build(t) + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeFailure, result.Outcome) + assert.Equal(t, startruntime.ErrorCodeInternal, result.ErrorCode) +} + +func TestHandleConflictWhenLeaseBusy(t *testing.T) { + h := newHarness(t) + h.leases.acquired = false + + service := h.build(t) + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeFailure, result.Outcome) + assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode) +} + +// --- input validation ---------------------------------------------- + +func TestHandleRejectsInvalidInput(t *testing.T) { + h := newHarness(t) + service := h.build(t) + + cases := []cleanupcontainer.Input{ + {GameID: "", OpSource: operation.OpSourceAdminRest}, + {GameID: "g", OpSource: operation.OpSource("bogus")}, + } + for _, input := range cases { + result, err := service.Handle(context.Background(), input) + require.NoError(t, err) + assert.Equal(t, startruntime.ErrorCodeInvalidRequest, result.ErrorCode) + } +} + +// --- constructor --------------------------------------------------- + +func TestNewServiceRejectsMissingDependencies(t *testing.T) { + h := newHarness(t) + deps := cleanupcontainer.Dependencies{ + Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute}, + Telemetry: h.telemetry, + } + _, err := cleanupcontainer.NewService(deps) + require.Error(t, err) +} diff --git a/rtmanager/internal/service/patchruntime/semver.go b/rtmanager/internal/service/patchruntime/semver.go new file mode 100644 
index 0000000..7bd87cb --- /dev/null +++ b/rtmanager/internal/service/patchruntime/semver.go @@ -0,0 +1,52 @@ +package patchruntime + +import ( + "errors" + "fmt" + "strings" + + "github.com/distribution/reference" + "golang.org/x/mod/semver" +) + +// errImageRefNoTag reports that an image reference does not declare a +// tag. The patch service maps it to `image_ref_not_semver` because a +// digest-only or tagless reference cannot carry a semver-comparable +// version. +var errImageRefNoTag = errors.New("image reference is missing a tag") + +// extractSemverTag returns the canonical semver string ("v1.4.7") for +// imageRef, ready to feed into golang.org/x/mod/semver. The leading "v" +// is added when the underlying tag omits it. +// +// Errors returned by this function are pre-formatted for inclusion in +// the patch service's `image_ref_not_semver` failure message. +func extractSemverTag(imageRef string) (string, error) { + parsed, err := reference.ParseNormalizedNamed(imageRef) + if err != nil { + return "", fmt.Errorf("parse image reference %q: %w", imageRef, err) + } + tagged, ok := parsed.(reference.NamedTagged) + if !ok { + return "", fmt.Errorf("%w: %q", errImageRefNoTag, imageRef) + } + tag := strings.TrimSpace(tagged.Tag()) + if tag == "" { + return "", fmt.Errorf("%w: %q", errImageRefNoTag, imageRef) + } + candidate := tag + if !strings.HasPrefix(candidate, "v") { + candidate = "v" + candidate + } + if !semver.IsValid(candidate) { + return "", fmt.Errorf("tag %q on image reference %q is not a valid semver", tag, imageRef) + } + return candidate, nil +} + +// samePatchSeries reports whether two canonical semver strings (with +// the leading "v") share their major and minor components. The third +// component (patch) and any pre-release / build metadata are ignored. +func samePatchSeries(currentSemver, newSemver string) bool { + return semver.MajorMinor(currentSemver) == semver.MajorMinor(newSemver) +} diff --git a/rtmanager/internal/service/patchruntime/service.go b/rtmanager/internal/service/patchruntime/service.go new file mode 100644 index 0000000..348d798 --- /dev/null +++ b/rtmanager/internal/service/patchruntime/service.go @@ -0,0 +1,483 @@ +// Package patchruntime implements the `patch` lifecycle operation owned +// by Runtime Manager. Patch is restart with a new `image_ref`: under +// one outer per-game lease the service runs the stop service, removes +// the container, and runs the start service with the new image. The +// engine reads its state from the bind-mount on startup, so any data +// written before the patch survives. +// +// The new and current image references must both parse as semver tags +// and share their major and minor components. A new tag that bumps the +// major or the minor surfaces as `semver_patch_only`; a tag that is +// not parseable as semver surfaces as `image_ref_not_semver`. These +// pre-checks run before any Docker work so a rejected patch never +// disturbs the running runtime. +// +// Lifecycle and failure-mode semantics follow `rtmanager/README.md +// §Lifecycles → Patch`. Design rationale is captured in +// `rtmanager/docs/services.md`. 
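+//
+// As a concrete illustration, using the registry and tags from this
+// package's tests, with the runtime currently on
+// `registry.example.com/galaxy/game:1.4.7`:
+//
+//	:1.4.8   accepted (same major.minor, patch bump)
+//	:1.4.7   accepted (recreate on the unchanged image)
+//	:1.5.0   rejected with `semver_patch_only`
+//	:2.0.0   rejected with `semver_patch_only`
+//	:latest  rejected with `image_ref_not_semver`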
+package patchruntime + +import ( + "context" + "crypto/rand" + "encoding/base64" + "errors" + "fmt" + "log/slog" + "strings" + "time" + + "galaxy/rtmanager/internal/config" + "galaxy/rtmanager/internal/domain/operation" + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/logging" + "galaxy/rtmanager/internal/ports" + "galaxy/rtmanager/internal/service/startruntime" + "galaxy/rtmanager/internal/service/stopruntime" + "galaxy/rtmanager/internal/telemetry" +) + +const leaseReleaseTimeout = 5 * time.Second + +// Input stores the per-call arguments for one patch operation. +type Input struct { + // GameID identifies the platform game to patch. + GameID string + + // NewImageRef stores the new Docker reference the patch installs. + // Must be a valid Docker reference whose tag parses as semver. + NewImageRef string + + // OpSource classifies how the request entered Runtime Manager. + OpSource operation.OpSource + + // SourceRef stores the optional opaque per-source reference. When + // non-empty it is reused as the correlation id linking the outer + // patch entry to the inner stop and start log entries. + SourceRef string +} + +// Validate reports whether input carries the structural invariants the +// service requires. Image-reference shape and semver checks happen +// later inside Handle so that they run after the runtime record has +// been loaded. +func (input Input) Validate() error { + if strings.TrimSpace(input.GameID) == "" { + return fmt.Errorf("game id must not be empty") + } + if strings.TrimSpace(input.NewImageRef) == "" { + return fmt.Errorf("new image ref must not be empty") + } + if !input.OpSource.IsKnown() { + return fmt.Errorf("op source %q is unsupported", input.OpSource) + } + return nil +} + +// Result stores the deterministic outcome of one Handle call. +type Result struct { + // Record carries the runtime record installed by the inner start on + // success; zero on failure. + Record runtime.RuntimeRecord + + // Outcome reports whether the operation completed (success) or + // produced a stable failure code. + Outcome operation.Outcome + + // ErrorCode stores the stable error code on failure. + ErrorCode string + + // ErrorMessage stores the operator-readable detail on failure. + ErrorMessage string +} + +// Dependencies groups the collaborators required by Service. +type Dependencies struct { + RuntimeRecords ports.RuntimeRecordStore + OperationLogs ports.OperationLogStore + Docker ports.DockerClient + Leases ports.GameLeaseStore + + // StopService runs the inner stop step. + StopService *stopruntime.Service + // StartService runs the inner start step with the new image_ref. + StartService *startruntime.Service + + Coordination config.CoordinationConfig + + Telemetry *telemetry.Runtime + Logger *slog.Logger + Clock func() time.Time + NewToken func() string +} + +// Service executes the patch lifecycle operation. +type Service struct { + runtimeRecords ports.RuntimeRecordStore + operationLogs ports.OperationLogStore + docker ports.DockerClient + leases ports.GameLeaseStore + stopService *stopruntime.Service + startService *startruntime.Service + + leaseTTL time.Duration + + telemetry *telemetry.Runtime + logger *slog.Logger + + clock func() time.Time + newToken func() string +} + +// NewService constructs one Service from deps. 
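+//
+// A construction sketch (the collaborator variables are assumed to be
+// built elsewhere; Logger, Clock and NewToken fall back to their
+// defaults when nil):
+//
+//	svc, err := NewService(Dependencies{
+//		RuntimeRecords: records,
+//		OperationLogs:  logs,
+//		Docker:         dockerClient,
+//		Leases:         leases,
+//		StopService:    stopSvc,
+//		StartService:   startSvc,
+//		Coordination:   config.CoordinationConfig{GameLeaseTTL: time.Minute},
+//		Telemetry:      tel,
+//	})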
+func NewService(deps Dependencies) (*Service, error) { + switch { + case deps.RuntimeRecords == nil: + return nil, errors.New("new patch runtime service: nil runtime records") + case deps.OperationLogs == nil: + return nil, errors.New("new patch runtime service: nil operation logs") + case deps.Docker == nil: + return nil, errors.New("new patch runtime service: nil docker client") + case deps.Leases == nil: + return nil, errors.New("new patch runtime service: nil lease store") + case deps.StopService == nil: + return nil, errors.New("new patch runtime service: nil stop service") + case deps.StartService == nil: + return nil, errors.New("new patch runtime service: nil start service") + case deps.Telemetry == nil: + return nil, errors.New("new patch runtime service: nil telemetry runtime") + } + if err := deps.Coordination.Validate(); err != nil { + return nil, fmt.Errorf("new patch runtime service: coordination config: %w", err) + } + + clock := deps.Clock + if clock == nil { + clock = time.Now + } + logger := deps.Logger + if logger == nil { + logger = slog.Default() + } + logger = logger.With("service", "rtmanager.patchruntime") + + newToken := deps.NewToken + if newToken == nil { + newToken = defaultTokenGenerator() + } + + return &Service{ + runtimeRecords: deps.RuntimeRecords, + operationLogs: deps.OperationLogs, + docker: deps.Docker, + leases: deps.Leases, + stopService: deps.StopService, + startService: deps.StartService, + leaseTTL: deps.Coordination.GameLeaseTTL, + telemetry: deps.Telemetry, + logger: logger, + clock: clock, + newToken: newToken, + }, nil +} + +// Handle executes one patch operation end-to-end. The Go-level error +// return is reserved for non-business failures (nil context, nil +// receiver). Every business outcome — success or any of the stable +// failure codes — flows through Result. +func (service *Service) Handle(ctx context.Context, input Input) (Result, error) { + if service == nil { + return Result{}, errors.New("patch runtime: nil service") + } + if ctx == nil { + return Result{}, errors.New("patch runtime: nil context") + } + + opStartedAt := service.clock().UTC() + + if err := input.Validate(); err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeInvalidRequest, + errorMessage: err.Error(), + }), nil + } + + token := service.newToken() + leaseStart := service.clock() + acquired, err := service.leases.TryAcquire(ctx, input.GameID, token, service.leaseTTL) + service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(leaseStart)) + if err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeServiceUnavailable, + errorMessage: fmt.Sprintf("acquire game lease: %s", err.Error()), + }), nil + } + if !acquired { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeConflict, + errorMessage: "another lifecycle operation is in progress for this game", + }), nil + } + defer service.releaseLease(ctx, input.GameID, token) + + return service.runUnderLease(ctx, input, opStartedAt) +} + +// runUnderLease executes the lease-protected patch sequence: load the +// runtime record, validate semver compatibility, run inner stop, +// remove the container, run inner start with the new image. 
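+// The semver pre-checks run before any Docker call, so a rejected
+// image reference leaves the running container untouched; a business
+// failure reported by the inner stop or start is surfaced with that
+// inner service's error code in the outer result.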
+func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) { + existing, err := service.runtimeRecords.Get(ctx, input.GameID) + if errors.Is(err, runtime.ErrNotFound) { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeNotFound, + errorMessage: fmt.Sprintf("runtime record for game %q does not exist", input.GameID), + }), nil + } + if err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeInternal, + errorMessage: fmt.Sprintf("load runtime record: %s", err.Error()), + }), nil + } + if existing.Status == runtime.StatusRemoved { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeConflict, + errorMessage: fmt.Sprintf("runtime for game %q is removed; cannot patch", input.GameID), + }), nil + } + if strings.TrimSpace(existing.CurrentImageRef) == "" { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeInternal, + errorMessage: fmt.Sprintf("runtime record for game %q has no current image_ref", input.GameID), + }), nil + } + + currentSemver, err := extractSemverTag(existing.CurrentImageRef) + if err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeImageRefNotSemver, + errorMessage: fmt.Sprintf("current image_ref: %s", err.Error()), + imageRef: existing.CurrentImageRef, + }), nil + } + newSemver, err := extractSemverTag(input.NewImageRef) + if err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeImageRefNotSemver, + errorMessage: fmt.Sprintf("new image_ref: %s", err.Error()), + imageRef: input.NewImageRef, + }), nil + } + if !samePatchSeries(currentSemver, newSemver) { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeSemverPatchOnly, + errorMessage: fmt.Sprintf( + "patch must keep major.minor; current=%s new=%s", + currentSemver, newSemver, + ), + imageRef: input.NewImageRef, + }), nil + } + + correlationRef := input.SourceRef + if correlationRef == "" { + correlationRef = service.newToken() + } + containerID := existing.CurrentContainerID + + stopResult, err := service.stopService.Run(ctx, stopruntime.Input{ + GameID: input.GameID, + Reason: stopruntime.StopReasonAdminRequest, + OpSource: input.OpSource, + SourceRef: correlationRef, + }) + if err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeInternal, + errorMessage: fmt.Sprintf("inner stop: %s", err.Error()), + imageRef: input.NewImageRef, + containerID: containerID, + }), nil + } + if stopResult.Outcome == operation.OutcomeFailure { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: stopResult.ErrorCode, + errorMessage: fmt.Sprintf("inner stop failed: %s", stopResult.ErrorMessage), + imageRef: input.NewImageRef, + containerID: containerID, + }), nil + } + + if containerID != "" { + if err := service.docker.Remove(ctx, containerID); err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeServiceUnavailable, + 
errorMessage: fmt.Sprintf("docker remove: %s", err.Error()), + imageRef: input.NewImageRef, + containerID: containerID, + }), nil + } + } + + startResult, err := service.startService.Run(ctx, startruntime.Input{ + GameID: input.GameID, + ImageRef: input.NewImageRef, + OpSource: input.OpSource, + SourceRef: correlationRef, + }) + if err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeInternal, + errorMessage: fmt.Sprintf("inner start: %s", err.Error()), + imageRef: input.NewImageRef, + }), nil + } + if startResult.Outcome == operation.OutcomeFailure { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startResult.ErrorCode, + errorMessage: fmt.Sprintf("inner start failed: %s", startResult.ErrorMessage), + imageRef: input.NewImageRef, + }), nil + } + + finishedAt := service.clock().UTC() + service.bestEffortAppend(ctx, operation.OperationEntry{ + GameID: input.GameID, + OpKind: operation.OpKindPatch, + OpSource: input.OpSource, + SourceRef: correlationRef, + ImageRef: input.NewImageRef, + ContainerID: startResult.Record.CurrentContainerID, + Outcome: operation.OutcomeSuccess, + StartedAt: opStartedAt, + FinishedAt: &finishedAt, + }) + service.telemetry.RecordPatchOutcome(ctx, string(operation.OutcomeSuccess), "") + + logArgs := []any{ + "game_id", input.GameID, + "prev_image_ref", existing.CurrentImageRef, + "new_image_ref", input.NewImageRef, + "prev_container_id", containerID, + "new_container_id", startResult.Record.CurrentContainerID, + "op_source", string(input.OpSource), + } + logArgs = append(logArgs, logging.ContextAttrs(ctx)...) + service.logger.InfoContext(ctx, "runtime patched", logArgs...) + + return Result{ + Record: startResult.Record, + Outcome: operation.OutcomeSuccess, + }, nil +} + +// failureCtx groups the inputs to recordFailure. +type failureCtx struct { + opStartedAt time.Time + input Input + errorCode string + errorMessage string + imageRef string + containerID string +} + +// recordFailure writes the outer failure operation_log entry and emits +// telemetry. Inner stop / start services have already recorded their +// own entries; this is the outer summary. +func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result { + finishedAt := service.clock().UTC() + service.bestEffortAppend(ctx, operation.OperationEntry{ + GameID: fc.input.GameID, + OpKind: operation.OpKindPatch, + OpSource: fc.input.OpSource, + SourceRef: fc.input.SourceRef, + ImageRef: fc.imageRef, + ContainerID: fc.containerID, + Outcome: operation.OutcomeFailure, + ErrorCode: fc.errorCode, + ErrorMessage: fc.errorMessage, + StartedAt: fc.opStartedAt, + FinishedAt: &finishedAt, + }) + service.telemetry.RecordPatchOutcome(ctx, string(operation.OutcomeFailure), fc.errorCode) + + logArgs := []any{ + "game_id", fc.input.GameID, + "image_ref", fc.imageRef, + "op_source", string(fc.input.OpSource), + "error_code", fc.errorCode, + "error_message", fc.errorMessage, + } + logArgs = append(logArgs, logging.ContextAttrs(ctx)...) + service.logger.WarnContext(ctx, "runtime patch failed", logArgs...) 
+ + return Result{ + Outcome: operation.OutcomeFailure, + ErrorCode: fc.errorCode, + ErrorMessage: fc.errorMessage, + } +} + +func (service *Service) releaseLease(ctx context.Context, gameID, token string) { + cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout) + defer cancel() + if err := service.leases.Release(cleanupCtx, gameID, token); err != nil { + service.logger.WarnContext(ctx, "release game lease", + "game_id", gameID, + "err", err.Error(), + ) + } +} + +func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) { + if _, err := service.operationLogs.Append(ctx, entry); err != nil { + service.logger.ErrorContext(ctx, "append operation log", + "game_id", entry.GameID, + "op_kind", string(entry.OpKind), + "outcome", string(entry.Outcome), + "error_code", entry.ErrorCode, + "err", err.Error(), + ) + } +} + +func defaultTokenGenerator() func() string { + return func() string { + var buf [32]byte + if _, err := rand.Read(buf[:]); err != nil { + return "rtmanager-fallback-token" + } + return base64.RawURLEncoding.EncodeToString(buf[:]) + } +} diff --git a/rtmanager/internal/service/patchruntime/service_test.go b/rtmanager/internal/service/patchruntime/service_test.go new file mode 100644 index 0000000..f87e8c1 --- /dev/null +++ b/rtmanager/internal/service/patchruntime/service_test.go @@ -0,0 +1,597 @@ +package patchruntime_test + +import ( + "context" + "errors" + "sync" + "testing" + "time" + + "galaxy/notificationintent" + "galaxy/rtmanager/internal/adapters/docker/mocks" + "galaxy/rtmanager/internal/config" + "galaxy/rtmanager/internal/domain/operation" + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/ports" + "galaxy/rtmanager/internal/service/patchruntime" + "galaxy/rtmanager/internal/service/startruntime" + "galaxy/rtmanager/internal/service/stopruntime" + "galaxy/rtmanager/internal/telemetry" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" +) + +// --- shared fake doubles (mirror the restartruntime test pattern) --- + +type fakeRuntimeRecords struct { + mu sync.Mutex + + stored map[string]runtime.RuntimeRecord + getErr error + upsertErr error + updateStatusErr error + + upserts []runtime.RuntimeRecord + updates []ports.UpdateStatusInput +} + +func newFakeRuntimeRecords() *fakeRuntimeRecords { + return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}} +} + +func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) { + s.mu.Lock() + defer s.mu.Unlock() + if s.getErr != nil { + return runtime.RuntimeRecord{}, s.getErr + } + record, ok := s.stored[gameID] + if !ok { + return runtime.RuntimeRecord{}, runtime.ErrNotFound + } + return record, nil +} + +func (s *fakeRuntimeRecords) Upsert(_ context.Context, record runtime.RuntimeRecord) error { + s.mu.Lock() + defer s.mu.Unlock() + if s.upsertErr != nil { + return s.upsertErr + } + s.upserts = append(s.upserts, record) + s.stored[record.GameID] = record + return nil +} + +func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, input ports.UpdateStatusInput) error { + s.mu.Lock() + defer s.mu.Unlock() + s.updates = append(s.updates, input) + if s.updateStatusErr != nil { + return s.updateStatusErr + } + record, ok := s.stored[input.GameID] + if !ok { + return runtime.ErrNotFound + } + if record.Status != input.ExpectedFrom { + return runtime.ErrConflict + } + if input.ExpectedContainerID != "" && record.CurrentContainerID != 
input.ExpectedContainerID { + return runtime.ErrConflict + } + record.Status = input.To + record.LastOpAt = input.Now + switch input.To { + case runtime.StatusStopped: + stoppedAt := input.Now + record.StoppedAt = &stoppedAt + case runtime.StatusRemoved: + removedAt := input.Now + record.RemovedAt = &removedAt + record.CurrentContainerID = "" + } + s.stored[input.GameID] = record + return nil +} + +func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) { + return nil, errors.New("not used in patch tests") +} + +func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) { + return nil, errors.New("not used in patch tests") +} + +type fakeOperationLogs struct { + mu sync.Mutex + + appendErr error + appends []operation.OperationEntry +} + +func (s *fakeOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) { + s.mu.Lock() + defer s.mu.Unlock() + if s.appendErr != nil { + return 0, s.appendErr + } + s.appends = append(s.appends, entry) + return int64(len(s.appends)), nil +} + +func (s *fakeOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) { + return nil, errors.New("not used in patch tests") +} + +func (s *fakeOperationLogs) byKind(kind operation.OpKind) []operation.OperationEntry { + s.mu.Lock() + defer s.mu.Unlock() + out := []operation.OperationEntry{} + for _, entry := range s.appends { + if entry.OpKind == kind { + out = append(out, entry) + } + } + return out +} + +type fakeLeases struct { + mu sync.Mutex + + acquired bool + acquireErr error + releaseErr error + + acquires []string + releases []string +} + +func (l *fakeLeases) TryAcquire(_ context.Context, _, token string, _ time.Duration) (bool, error) { + l.mu.Lock() + defer l.mu.Unlock() + l.acquires = append(l.acquires, token) + if l.acquireErr != nil { + return false, l.acquireErr + } + return l.acquired, nil +} + +func (l *fakeLeases) Release(_ context.Context, _, token string) error { + l.mu.Lock() + defer l.mu.Unlock() + l.releases = append(l.releases, token) + return l.releaseErr +} + +type fakeHealthEvents struct { + mu sync.Mutex + envelopes []ports.HealthEventEnvelope +} + +func (h *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error { + h.mu.Lock() + defer h.mu.Unlock() + h.envelopes = append(h.envelopes, envelope) + return nil +} + +type fakeNotifications struct { + mu sync.Mutex + intents []notificationintent.Intent +} + +func (n *fakeNotifications) Publish(_ context.Context, intent notificationintent.Intent) error { + n.mu.Lock() + defer n.mu.Unlock() + n.intents = append(n.intents, intent) + return nil +} + +type fakeLobby struct{} + +func (l *fakeLobby) GetGame(_ context.Context, _ string) (ports.LobbyGameRecord, error) { + return ports.LobbyGameRecord{}, nil +} + +// --- harness --------------------------------------------------------- + +type harness struct { + records *fakeRuntimeRecords + operationLogs *fakeOperationLogs + docker *mocks.MockDockerClient + leases *fakeLeases + healthEvents *fakeHealthEvents + notifications *fakeNotifications + lobby *fakeLobby + telemetry *telemetry.Runtime + + now time.Time + stateDir string + + startService *startruntime.Service + stopService *stopruntime.Service +} + +func newHarness(t *testing.T) *harness { + t.Helper() + ctrl := gomock.NewController(t) + t.Cleanup(ctrl.Finish) + + telemetryRuntime, err := telemetry.NewWithProviders(nil, nil) + require.NoError(t, err) + + h := 
&harness{ + records: newFakeRuntimeRecords(), + operationLogs: &fakeOperationLogs{}, + docker: mocks.NewMockDockerClient(ctrl), + leases: &fakeLeases{acquired: true}, + healthEvents: &fakeHealthEvents{}, + notifications: &fakeNotifications{}, + lobby: &fakeLobby{}, + telemetry: telemetryRuntime, + now: time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC), + stateDir: "/var/lib/galaxy/games/game-1", + } + + containerCfg := config.ContainerConfig{ + DefaultCPUQuota: 1.0, + DefaultMemory: "512m", + DefaultPIDsLimit: 512, + StopTimeout: 30 * time.Second, + Retention: 30 * 24 * time.Hour, + EngineStateMountPath: "/var/lib/galaxy-game", + EngineStateEnvName: "GAME_STATE_PATH", + GameStateDirMode: 0o750, + GameStateRoot: "/var/lib/galaxy/games", + } + dockerCfg := config.DockerConfig{ + Host: "unix:///var/run/docker.sock", + Network: "galaxy-net", + LogDriver: "json-file", + PullPolicy: config.ImagePullPolicyIfMissing, + } + coordinationCfg := config.CoordinationConfig{GameLeaseTTL: time.Minute} + + startService, err := startruntime.NewService(startruntime.Dependencies{ + RuntimeRecords: h.records, + OperationLogs: h.operationLogs, + Docker: h.docker, + Leases: h.leases, + HealthEvents: h.healthEvents, + Notifications: h.notifications, + Lobby: h.lobby, + Container: containerCfg, + DockerCfg: dockerCfg, + Coordination: coordinationCfg, + Telemetry: h.telemetry, + Clock: func() time.Time { return h.now }, + NewToken: func() string { return "inner-start-token" }, + PrepareStateDir: func(_ string) (string, error) { return h.stateDir, nil }, + }) + require.NoError(t, err) + h.startService = startService + + stopService, err := stopruntime.NewService(stopruntime.Dependencies{ + RuntimeRecords: h.records, + OperationLogs: h.operationLogs, + Docker: h.docker, + Leases: h.leases, + HealthEvents: h.healthEvents, + Container: containerCfg, + Coordination: coordinationCfg, + Telemetry: h.telemetry, + Clock: func() time.Time { return h.now }, + NewToken: func() string { return "inner-stop-token" }, + }) + require.NoError(t, err) + h.stopService = stopService + + return h +} + +func (h *harness) build(t *testing.T, tokens ...string) *patchruntime.Service { + t.Helper() + tokenIdx := 0 + tokenGen := func() string { + if tokenIdx >= len(tokens) { + return "outer-fallback" + } + t := tokens[tokenIdx] + tokenIdx++ + return t + } + service, err := patchruntime.NewService(patchruntime.Dependencies{ + RuntimeRecords: h.records, + OperationLogs: h.operationLogs, + Docker: h.docker, + Leases: h.leases, + StopService: h.stopService, + StartService: h.startService, + Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute}, + Telemetry: h.telemetry, + Clock: func() time.Time { return h.now }, + NewToken: tokenGen, + }) + require.NoError(t, err) + return service +} + +const ( + currentImage = "registry.example.com/galaxy/game:1.4.7" + patchImage = "registry.example.com/galaxy/game:1.4.8" + majorBump = "registry.example.com/galaxy/game:2.0.0" + tagless = "registry.example.com/galaxy/game" + notSemver = "registry.example.com/galaxy/game:latest" +) + +func runningRecord(now time.Time) runtime.RuntimeRecord { + startedAt := now.Add(-time.Hour) + return runtime.RuntimeRecord{ + GameID: "game-1", + Status: runtime.StatusRunning, + CurrentContainerID: "ctr-old", + CurrentImageRef: currentImage, + EngineEndpoint: "http://galaxy-game-game-1:8080", + StatePath: "/var/lib/galaxy/games/game-1", + DockerNetwork: "galaxy-net", + StartedAt: &startedAt, + LastOpAt: startedAt, + CreatedAt: startedAt, + } +} + +func basicInput() 
patchruntime.Input { + return patchruntime.Input{ + GameID: "game-1", + NewImageRef: patchImage, + OpSource: operation.OpSourceGMRest, + SourceRef: "rest-req-99", + } +} + +func sampleRunResult(now time.Time) ports.RunResult { + return ports.RunResult{ + ContainerID: "ctr-new", + EngineEndpoint: "http://galaxy-game-game-1:8080", + StartedAt: now, + } +} + +func expectInnerStart(h *harness, image string) { + h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil) + h.docker.EXPECT().PullImage(gomock.Any(), image, gomock.Any()).Return(nil) + h.docker.EXPECT().InspectImage(gomock.Any(), image).Return(ports.ImageInspect{Ref: image}, nil) + h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(sampleRunResult(h.now), nil) +} + +// --- happy path ----------------------------------------------------- + +func TestHandlePatchHappyPath(t *testing.T) { + h := newHarness(t) + h.records.stored["game-1"] = runningRecord(h.now) + + h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil) + h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil) + expectInnerStart(h, patchImage) + + service := h.build(t, "outer-token") + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeSuccess, result.Outcome) + assert.Equal(t, patchImage, result.Record.CurrentImageRef) + + patches := h.operationLogs.byKind(operation.OpKindPatch) + require.Len(t, patches, 1) + assert.Equal(t, "rest-req-99", patches[0].SourceRef) + assert.Equal(t, patchImage, patches[0].ImageRef) + assert.Equal(t, "ctr-new", patches[0].ContainerID) + + assert.Len(t, h.operationLogs.byKind(operation.OpKindStop), 1) + assert.Len(t, h.operationLogs.byKind(operation.OpKindStart), 1) +} + +func TestHandlePatchSameImageProceedsAsRecreate(t *testing.T) { + h := newHarness(t) + h.records.stored["game-1"] = runningRecord(h.now) + + h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil) + h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil) + expectInnerStart(h, currentImage) + + input := basicInput() + input.NewImageRef = currentImage + + service := h.build(t, "outer-token") + result, err := service.Handle(context.Background(), input) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeSuccess, result.Outcome) + require.Len(t, h.operationLogs.byKind(operation.OpKindPatch), 1, "patch entry recorded even when image is unchanged") +} + +// --- semver pre-checks --------------------------------------------- + +func TestHandleImageRefNotSemverWhenNewIsTagless(t *testing.T) { + h := newHarness(t) + h.records.stored["game-1"] = runningRecord(h.now) + + input := basicInput() + input.NewImageRef = tagless + + service := h.build(t, "outer-token") + result, err := service.Handle(context.Background(), input) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeFailure, result.Outcome) + assert.Equal(t, startruntime.ErrorCodeImageRefNotSemver, result.ErrorCode) + + assert.Empty(t, h.operationLogs.byKind(operation.OpKindStop), "no inner stop on pre-check failure") + assert.Empty(t, h.operationLogs.byKind(operation.OpKindStart)) +} + +func TestHandleImageRefNotSemverWhenNewIsNonSemver(t *testing.T) { + h := newHarness(t) + h.records.stored["game-1"] = runningRecord(h.now) + + input := basicInput() + input.NewImageRef = notSemver + + service := h.build(t, "outer-token") + result, err := service.Handle(context.Background(), input) + require.NoError(t, err) + assert.Equal(t, startruntime.ErrorCodeImageRefNotSemver, 
result.ErrorCode) +} + +func TestHandleImageRefNotSemverWhenCurrentIsTagless(t *testing.T) { + h := newHarness(t) + record := runningRecord(h.now) + record.CurrentImageRef = tagless + h.records.stored["game-1"] = record + + service := h.build(t, "outer-token") + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, startruntime.ErrorCodeImageRefNotSemver, result.ErrorCode) +} + +func TestHandleSemverPatchOnlyOnMajorBump(t *testing.T) { + h := newHarness(t) + h.records.stored["game-1"] = runningRecord(h.now) + + input := basicInput() + input.NewImageRef = majorBump + + service := h.build(t, "outer-token") + result, err := service.Handle(context.Background(), input) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeFailure, result.Outcome) + assert.Equal(t, startruntime.ErrorCodeSemverPatchOnly, result.ErrorCode) + + assert.Empty(t, h.operationLogs.byKind(operation.OpKindStop)) + assert.Empty(t, h.operationLogs.byKind(operation.OpKindStart)) +} + +func TestHandleSemverPatchOnlyOnMinorBump(t *testing.T) { + h := newHarness(t) + h.records.stored["game-1"] = runningRecord(h.now) + + input := basicInput() + input.NewImageRef = "registry.example.com/galaxy/game:1.5.0" + + service := h.build(t, "outer-token") + result, err := service.Handle(context.Background(), input) + require.NoError(t, err) + assert.Equal(t, startruntime.ErrorCodeSemverPatchOnly, result.ErrorCode) +} + +// --- record state checks ------------------------------------------- + +func TestHandleNotFoundForMissingRecord(t *testing.T) { + h := newHarness(t) + service := h.build(t, "outer-token") + + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, startruntime.ErrorCodeNotFound, result.ErrorCode) +} + +func TestHandleConflictForRemovedRecord(t *testing.T) { + h := newHarness(t) + removed := runningRecord(h.now) + removed.Status = runtime.StatusRemoved + removed.CurrentContainerID = "" + removedAt := h.now.Add(-time.Hour) + removed.RemovedAt = &removedAt + h.records.stored["game-1"] = removed + + service := h.build(t, "outer-token") + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode) +} + +// --- failures from inner ops --------------------------------------- + +func TestHandlePropagatesInnerStopFailure(t *testing.T) { + h := newHarness(t) + h.records.stored["game-1"] = runningRecord(h.now) + + h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(errors.New("daemon unreachable")) + + service := h.build(t, "outer-token") + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeFailure, result.Outcome) + assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode) +} + +func TestHandleServiceUnavailableOnDockerRemoveFailure(t *testing.T) { + h := newHarness(t) + h.records.stored["game-1"] = runningRecord(h.now) + + h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil) + h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(errors.New("disk i/o")) + + service := h.build(t, "outer-token") + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode) +} + +func TestHandlePropagatesInnerStartFailure(t *testing.T) { + h := newHarness(t) + h.records.stored["game-1"] = 
runningRecord(h.now) + + h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil) + h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil) + h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil) + h.docker.EXPECT().PullImage(gomock.Any(), patchImage, gomock.Any()).Return(errors.New("manifest unknown")) + + service := h.build(t, "outer-token") + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, startruntime.ErrorCodeImagePullFailed, result.ErrorCode) +} + +// --- conflicts ------------------------------------------------------ + +func TestHandleConflictWhenLeaseBusy(t *testing.T) { + h := newHarness(t) + h.leases.acquired = false + + service := h.build(t, "outer-token") + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode) +} + +// --- input validation ---------------------------------------------- + +func TestHandleRejectsInvalidInput(t *testing.T) { + h := newHarness(t) + service := h.build(t, "outer-token") + + cases := []patchruntime.Input{ + {GameID: "", NewImageRef: patchImage, OpSource: operation.OpSourceGMRest}, + {GameID: "g", NewImageRef: "", OpSource: operation.OpSourceGMRest}, + {GameID: "g", NewImageRef: patchImage, OpSource: operation.OpSource("bogus")}, + } + for _, input := range cases { + result, err := service.Handle(context.Background(), input) + require.NoError(t, err) + assert.Equal(t, startruntime.ErrorCodeInvalidRequest, result.ErrorCode) + } +} + +// --- constructor --------------------------------------------------- + +func TestNewServiceRejectsMissingDependencies(t *testing.T) { + h := newHarness(t) + deps := patchruntime.Dependencies{ + Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute}, + Telemetry: h.telemetry, + } + _, err := patchruntime.NewService(deps) + require.Error(t, err) +} diff --git a/rtmanager/internal/service/restartruntime/service.go b/rtmanager/internal/service/restartruntime/service.go new file mode 100644 index 0000000..8cb01f1 --- /dev/null +++ b/rtmanager/internal/service/restartruntime/service.go @@ -0,0 +1,482 @@ +// Package restartruntime implements the `restart` lifecycle operation +// owned by Runtime Manager. Restart is a recreate: under one outer +// per-game lease the service runs the stop service, removes the +// container with `docker rm`, and runs the start service with the +// runtime's current `image_ref`. The hostname / engine endpoint stays +// stable across the recreate; `container_id` changes. +// +// Lifecycle and failure-mode semantics follow `rtmanager/README.md +// §Lifecycles → Restart`. Design rationale is captured in +// `rtmanager/docs/services.md`, in particular the lease-sharing +// pattern with `startruntime.Service.Run` / `stopruntime.Service.Run`, +// the correlation-id reuse on `source_ref`, and the +// inner-stop-then-rm-failure recovery rule. +package restartruntime + +import ( + "context" + "crypto/rand" + "encoding/base64" + "errors" + "fmt" + "log/slog" + "strings" + "time" + + "galaxy/rtmanager/internal/config" + "galaxy/rtmanager/internal/domain/operation" + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/logging" + "galaxy/rtmanager/internal/ports" + "galaxy/rtmanager/internal/service/startruntime" + "galaxy/rtmanager/internal/service/stopruntime" + "galaxy/rtmanager/internal/telemetry" +) + +// leaseReleaseTimeout bounds the deferred lease-release call. 
+const leaseReleaseTimeout = 5 * time.Second + +// Input stores the per-call arguments for one restart operation. +type Input struct { + // GameID identifies the platform game to restart. + GameID string + + // OpSource classifies how the request entered Runtime Manager. + // Required: every operation_log entry carries an op_source. + OpSource operation.OpSource + + // SourceRef stores the optional opaque per-source reference (REST + // request id, admin user id). When non-empty it is reused as the + // correlation id linking the outer restart entry to the inner stop + // and start log entries. + SourceRef string +} + +// Validate reports whether input carries the structural invariants the +// service requires. +func (input Input) Validate() error { + if strings.TrimSpace(input.GameID) == "" { + return fmt.Errorf("game id must not be empty") + } + if !input.OpSource.IsKnown() { + return fmt.Errorf("op source %q is unsupported", input.OpSource) + } + return nil +} + +// Result stores the deterministic outcome of one Handle call. +type Result struct { + // Record carries the runtime record installed by the inner start on + // success; zero on failure. + Record runtime.RuntimeRecord + + // Outcome reports whether the operation completed (success) or + // produced a stable failure code. + Outcome operation.Outcome + + // ErrorCode stores the stable error code on failure. Empty for + // success. + ErrorCode string + + // ErrorMessage stores the operator-readable detail on failure. + // Empty for success. + ErrorMessage string +} + +// Dependencies groups the collaborators required by Service. +type Dependencies struct { + // RuntimeRecords reads the runtime record at the start of restart + // to capture the current image_ref and container_id. + RuntimeRecords ports.RuntimeRecordStore + + // OperationLogs records the outer restart audit entry. Inner stop + // and start services append their own entries through their own + // stores. + OperationLogs ports.OperationLogStore + + // Docker drives the docker rm step between the inner stop and + // inner start. + Docker ports.DockerClient + + // Leases serialises operations against the same game id. The outer + // lease is held for the entire stop + rm + start sequence. + Leases ports.GameLeaseStore + + // StopService runs the inner stop step under the outer lease. + StopService *stopruntime.Service + + // StartService runs the inner start step under the outer lease. + StartService *startruntime.Service + + // Coordination supplies the per-game lease TTL. + Coordination config.CoordinationConfig + + // Telemetry records restart outcomes and lease latency. Required. + Telemetry *telemetry.Runtime + + // Logger records structured service-level events. Defaults to + // `slog.Default()` when nil. + Logger *slog.Logger + + // Clock supplies the wall-clock used for operation timestamps. + // Defaults to `time.Now` when nil. + Clock func() time.Time + + // NewToken supplies a unique opaque token. Used both for the lease + // and for the correlation id when Input.SourceRef is empty. + // Defaults to a 32-byte random base64url string when nil. + NewToken func() string +} + +// Service executes the restart lifecycle operation. 
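+// The zero value is not ready for use; construct instances with NewService.
+// A constructed Service only holds the collaborators captured at
+// construction time and never mutates them, so a single instance can be
+// shared by concurrent callers (subject to the guarantees of the injected
+// stores and clients).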
+type Service struct { + runtimeRecords ports.RuntimeRecordStore + operationLogs ports.OperationLogStore + docker ports.DockerClient + leases ports.GameLeaseStore + stopService *stopruntime.Service + startService *startruntime.Service + + leaseTTL time.Duration + + telemetry *telemetry.Runtime + logger *slog.Logger + + clock func() time.Time + newToken func() string +} + +// NewService constructs one Service from deps. +func NewService(deps Dependencies) (*Service, error) { + switch { + case deps.RuntimeRecords == nil: + return nil, errors.New("new restart runtime service: nil runtime records") + case deps.OperationLogs == nil: + return nil, errors.New("new restart runtime service: nil operation logs") + case deps.Docker == nil: + return nil, errors.New("new restart runtime service: nil docker client") + case deps.Leases == nil: + return nil, errors.New("new restart runtime service: nil lease store") + case deps.StopService == nil: + return nil, errors.New("new restart runtime service: nil stop service") + case deps.StartService == nil: + return nil, errors.New("new restart runtime service: nil start service") + case deps.Telemetry == nil: + return nil, errors.New("new restart runtime service: nil telemetry runtime") + } + if err := deps.Coordination.Validate(); err != nil { + return nil, fmt.Errorf("new restart runtime service: coordination config: %w", err) + } + + clock := deps.Clock + if clock == nil { + clock = time.Now + } + logger := deps.Logger + if logger == nil { + logger = slog.Default() + } + logger = logger.With("service", "rtmanager.restartruntime") + + newToken := deps.NewToken + if newToken == nil { + newToken = defaultTokenGenerator() + } + + return &Service{ + runtimeRecords: deps.RuntimeRecords, + operationLogs: deps.OperationLogs, + docker: deps.Docker, + leases: deps.Leases, + stopService: deps.StopService, + startService: deps.StartService, + leaseTTL: deps.Coordination.GameLeaseTTL, + telemetry: deps.Telemetry, + logger: logger, + clock: clock, + newToken: newToken, + }, nil +} + +// Handle executes one restart operation end-to-end. The Go-level error +// return is reserved for non-business failures (nil context, nil +// receiver). Every business outcome — success or any of the stable +// failure codes — flows through Result. 
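+//
+// An illustrative call site (svc and the literal ids are placeholders, not
+// part of the package API):
+//
+//	result, err := svc.Handle(ctx, Input{
+//		GameID:    "game-1",
+//		OpSource:  operation.OpSourceGMRest,
+//		SourceRef: "rest-req-42",
+//	})
+//	if err != nil {
+//		// nil receiver or nil context; nothing was attempted
+//	}
+//	if result.Outcome == operation.OutcomeFailure {
+//		// result.ErrorCode carries one of the stable codes shared with
+//		// the startruntime package
+//	}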
+func (service *Service) Handle(ctx context.Context, input Input) (Result, error) { + if service == nil { + return Result{}, errors.New("restart runtime: nil service") + } + if ctx == nil { + return Result{}, errors.New("restart runtime: nil context") + } + + opStartedAt := service.clock().UTC() + + if err := input.Validate(); err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeInvalidRequest, + errorMessage: err.Error(), + }), nil + } + + token := service.newToken() + leaseStart := service.clock() + acquired, err := service.leases.TryAcquire(ctx, input.GameID, token, service.leaseTTL) + service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(leaseStart)) + if err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeServiceUnavailable, + errorMessage: fmt.Sprintf("acquire game lease: %s", err.Error()), + }), nil + } + if !acquired { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeConflict, + errorMessage: "another lifecycle operation is in progress for this game", + }), nil + } + defer service.releaseLease(ctx, input.GameID, token) + + return service.runUnderLease(ctx, input, opStartedAt) +} + +// runUnderLease executes the lease-protected restart sequence. Loads +// the runtime record, runs inner stop, removes the container, runs +// inner start. +func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) { + existing, err := service.runtimeRecords.Get(ctx, input.GameID) + if errors.Is(err, runtime.ErrNotFound) { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeNotFound, + errorMessage: fmt.Sprintf("runtime record for game %q does not exist", input.GameID), + }), nil + } + if err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeInternal, + errorMessage: fmt.Sprintf("load runtime record: %s", err.Error()), + }), nil + } + if existing.Status == runtime.StatusRemoved { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeConflict, + errorMessage: fmt.Sprintf("runtime for game %q is removed; cannot restart", input.GameID), + imageRef: existing.CurrentImageRef, + }), nil + } + if strings.TrimSpace(existing.CurrentImageRef) == "" { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeInternal, + errorMessage: fmt.Sprintf("runtime record for game %q has no image_ref to restart with", input.GameID), + }), nil + } + + correlationRef := input.SourceRef + if correlationRef == "" { + correlationRef = service.newToken() + } + containerID := existing.CurrentContainerID + imageRef := existing.CurrentImageRef + + stopResult, err := service.stopService.Run(ctx, stopruntime.Input{ + GameID: input.GameID, + Reason: stopruntime.StopReasonAdminRequest, + OpSource: input.OpSource, + SourceRef: correlationRef, + }) + if err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeInternal, + errorMessage: fmt.Sprintf("inner stop: %s", err.Error()), + imageRef: imageRef, + containerID: containerID, + }), nil + } + if 
stopResult.Outcome == operation.OutcomeFailure { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: stopResult.ErrorCode, + errorMessage: fmt.Sprintf("inner stop failed: %s", stopResult.ErrorMessage), + imageRef: imageRef, + containerID: containerID, + }), nil + } + + if containerID != "" { + if err := service.docker.Remove(ctx, containerID); err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeServiceUnavailable, + errorMessage: fmt.Sprintf("docker remove: %s", err.Error()), + imageRef: imageRef, + containerID: containerID, + }), nil + } + } + + startResult, err := service.startService.Run(ctx, startruntime.Input{ + GameID: input.GameID, + ImageRef: imageRef, + OpSource: input.OpSource, + SourceRef: correlationRef, + }) + if err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeInternal, + errorMessage: fmt.Sprintf("inner start: %s", err.Error()), + imageRef: imageRef, + }), nil + } + if startResult.Outcome == operation.OutcomeFailure { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startResult.ErrorCode, + errorMessage: fmt.Sprintf("inner start failed: %s", startResult.ErrorMessage), + imageRef: imageRef, + }), nil + } + + finishedAt := service.clock().UTC() + service.bestEffortAppend(ctx, operation.OperationEntry{ + GameID: input.GameID, + OpKind: operation.OpKindRestart, + OpSource: input.OpSource, + SourceRef: correlationRef, + ImageRef: imageRef, + ContainerID: startResult.Record.CurrentContainerID, + Outcome: operation.OutcomeSuccess, + StartedAt: opStartedAt, + FinishedAt: &finishedAt, + }) + service.telemetry.RecordRestartOutcome(ctx, string(operation.OutcomeSuccess), "") + + logArgs := []any{ + "game_id", input.GameID, + "prev_container_id", containerID, + "new_container_id", startResult.Record.CurrentContainerID, + "image_ref", imageRef, + "op_source", string(input.OpSource), + } + logArgs = append(logArgs, logging.ContextAttrs(ctx)...) + service.logger.InfoContext(ctx, "runtime restarted", logArgs...) + + return Result{ + Record: startResult.Record, + Outcome: operation.OutcomeSuccess, + }, nil +} + +// failureCtx groups the inputs to recordFailure. +type failureCtx struct { + opStartedAt time.Time + input Input + errorCode string + errorMessage string + imageRef string + containerID string +} + +// recordFailure records the outer failure operation_log entry and emits +// telemetry. Inner stop / start services have already recorded their +// own entries; this is the outer summary. 
+func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result { + finishedAt := service.clock().UTC() + service.bestEffortAppend(ctx, operation.OperationEntry{ + GameID: fc.input.GameID, + OpKind: operation.OpKindRestart, + OpSource: fc.input.OpSource, + SourceRef: correlationRefOrEmpty(fc.input), + ImageRef: fc.imageRef, + ContainerID: fc.containerID, + Outcome: operation.OutcomeFailure, + ErrorCode: fc.errorCode, + ErrorMessage: fc.errorMessage, + StartedAt: fc.opStartedAt, + FinishedAt: &finishedAt, + }) + service.telemetry.RecordRestartOutcome(ctx, string(operation.OutcomeFailure), fc.errorCode) + + logArgs := []any{ + "game_id", fc.input.GameID, + "image_ref", fc.imageRef, + "op_source", string(fc.input.OpSource), + "error_code", fc.errorCode, + "error_message", fc.errorMessage, + } + logArgs = append(logArgs, logging.ContextAttrs(ctx)...) + service.logger.WarnContext(ctx, "runtime restart failed", logArgs...) + + return Result{ + Outcome: operation.OutcomeFailure, + ErrorCode: fc.errorCode, + ErrorMessage: fc.errorMessage, + } +} + +// correlationRefOrEmpty returns the original Input.SourceRef for the +// outer entry. Outer-failure paths that did not yet generate a +// correlation id (input validation, lease busy) keep the original +// `source_ref` which is the actor ref. +func correlationRefOrEmpty(input Input) string { + return input.SourceRef +} + +// releaseLease releases the per-game lease in a fresh background context. +func (service *Service) releaseLease(ctx context.Context, gameID, token string) { + cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout) + defer cancel() + if err := service.leases.Release(cleanupCtx, gameID, token); err != nil { + service.logger.WarnContext(ctx, "release game lease", + "game_id", gameID, + "err", err.Error(), + ) + } +} + +// bestEffortAppend writes one outer operation_log entry. Inner ops have +// already appended their own; a failure here only loses the outer +// summary, which is acceptable. +func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) { + if _, err := service.operationLogs.Append(ctx, entry); err != nil { + service.logger.ErrorContext(ctx, "append operation log", + "game_id", entry.GameID, + "op_kind", string(entry.OpKind), + "outcome", string(entry.Outcome), + "error_code", entry.ErrorCode, + "err", err.Error(), + ) + } +} + +// defaultTokenGenerator returns a function that produces 32-byte +// base64url-encoded tokens. 
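+// If crypto/rand ever reports an error the generator falls back to a fixed
+// placeholder token rather than panicking.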
+func defaultTokenGenerator() func() string { + return func() string { + var buf [32]byte + if _, err := rand.Read(buf[:]); err != nil { + return "rtmanager-fallback-token" + } + return base64.RawURLEncoding.EncodeToString(buf[:]) + } +} diff --git a/rtmanager/internal/service/restartruntime/service_test.go b/rtmanager/internal/service/restartruntime/service_test.go new file mode 100644 index 0000000..00fcae8 --- /dev/null +++ b/rtmanager/internal/service/restartruntime/service_test.go @@ -0,0 +1,584 @@ +package restartruntime_test + +import ( + "context" + "errors" + "sync" + "testing" + "time" + + "galaxy/notificationintent" + "galaxy/rtmanager/internal/adapters/docker/mocks" + "galaxy/rtmanager/internal/config" + "galaxy/rtmanager/internal/domain/operation" + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/ports" + "galaxy/rtmanager/internal/service/restartruntime" + "galaxy/rtmanager/internal/service/startruntime" + "galaxy/rtmanager/internal/service/stopruntime" + "galaxy/rtmanager/internal/telemetry" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" +) + +// --- shared fake doubles ---------------------------------------------- + +type fakeRuntimeRecords struct { + mu sync.Mutex + + stored map[string]runtime.RuntimeRecord + getErr error + upsertErr error + updateStatusErr error + + upserts []runtime.RuntimeRecord + updates []ports.UpdateStatusInput +} + +func newFakeRuntimeRecords() *fakeRuntimeRecords { + return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}} +} + +func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) { + s.mu.Lock() + defer s.mu.Unlock() + if s.getErr != nil { + return runtime.RuntimeRecord{}, s.getErr + } + record, ok := s.stored[gameID] + if !ok { + return runtime.RuntimeRecord{}, runtime.ErrNotFound + } + return record, nil +} + +func (s *fakeRuntimeRecords) Upsert(_ context.Context, record runtime.RuntimeRecord) error { + s.mu.Lock() + defer s.mu.Unlock() + if s.upsertErr != nil { + return s.upsertErr + } + s.upserts = append(s.upserts, record) + s.stored[record.GameID] = record + return nil +} + +func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, input ports.UpdateStatusInput) error { + s.mu.Lock() + defer s.mu.Unlock() + s.updates = append(s.updates, input) + if s.updateStatusErr != nil { + return s.updateStatusErr + } + record, ok := s.stored[input.GameID] + if !ok { + return runtime.ErrNotFound + } + if record.Status != input.ExpectedFrom { + return runtime.ErrConflict + } + if input.ExpectedContainerID != "" && record.CurrentContainerID != input.ExpectedContainerID { + return runtime.ErrConflict + } + record.Status = input.To + record.LastOpAt = input.Now + switch input.To { + case runtime.StatusStopped: + stoppedAt := input.Now + record.StoppedAt = &stoppedAt + case runtime.StatusRemoved: + removedAt := input.Now + record.RemovedAt = &removedAt + record.CurrentContainerID = "" + } + s.stored[input.GameID] = record + return nil +} + +func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) { + return nil, errors.New("not used in restart tests") +} + +func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) { + return nil, errors.New("not used in restart tests") +} + +type fakeOperationLogs struct { + mu sync.Mutex + + appendErr error + appends []operation.OperationEntry +} + +func (s *fakeOperationLogs) Append(_ 
context.Context, entry operation.OperationEntry) (int64, error) { + s.mu.Lock() + defer s.mu.Unlock() + if s.appendErr != nil { + return 0, s.appendErr + } + s.appends = append(s.appends, entry) + return int64(len(s.appends)), nil +} + +func (s *fakeOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) { + return nil, errors.New("not used in restart tests") +} + +func (s *fakeOperationLogs) byKind(kind operation.OpKind) []operation.OperationEntry { + s.mu.Lock() + defer s.mu.Unlock() + out := []operation.OperationEntry{} + for _, entry := range s.appends { + if entry.OpKind == kind { + out = append(out, entry) + } + } + return out +} + +type fakeLeases struct { + mu sync.Mutex + + acquired bool + acquireErr error + releaseErr error + + acquires []string + releases []string +} + +func (l *fakeLeases) TryAcquire(_ context.Context, _, token string, _ time.Duration) (bool, error) { + l.mu.Lock() + defer l.mu.Unlock() + l.acquires = append(l.acquires, token) + if l.acquireErr != nil { + return false, l.acquireErr + } + return l.acquired, nil +} + +func (l *fakeLeases) Release(_ context.Context, _, token string) error { + l.mu.Lock() + defer l.mu.Unlock() + l.releases = append(l.releases, token) + return l.releaseErr +} + +type fakeHealthEvents struct { + mu sync.Mutex + + publishErr error + envelopes []ports.HealthEventEnvelope +} + +func (h *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error { + h.mu.Lock() + defer h.mu.Unlock() + if h.publishErr != nil { + return h.publishErr + } + h.envelopes = append(h.envelopes, envelope) + return nil +} + +type fakeNotifications struct { + mu sync.Mutex + + publishErr error + intents []notificationintent.Intent +} + +func (n *fakeNotifications) Publish(_ context.Context, intent notificationintent.Intent) error { + n.mu.Lock() + defer n.mu.Unlock() + if n.publishErr != nil { + return n.publishErr + } + n.intents = append(n.intents, intent) + return nil +} + +type fakeLobby struct { + record ports.LobbyGameRecord + err error +} + +func (l *fakeLobby) GetGame(_ context.Context, _ string) (ports.LobbyGameRecord, error) { + if l.err != nil { + return ports.LobbyGameRecord{}, l.err + } + return l.record, nil +} + +// --- harness ---------------------------------------------------------- + +type harness struct { + records *fakeRuntimeRecords + operationLogs *fakeOperationLogs + docker *mocks.MockDockerClient + leases *fakeLeases + healthEvents *fakeHealthEvents + notifications *fakeNotifications + lobby *fakeLobby + telemetry *telemetry.Runtime + + now time.Time + stateDir string + + startService *startruntime.Service + stopService *stopruntime.Service +} + +func newHarness(t *testing.T) *harness { + t.Helper() + ctrl := gomock.NewController(t) + t.Cleanup(ctrl.Finish) + + telemetryRuntime, err := telemetry.NewWithProviders(nil, nil) + require.NoError(t, err) + + h := &harness{ + records: newFakeRuntimeRecords(), + operationLogs: &fakeOperationLogs{}, + docker: mocks.NewMockDockerClient(ctrl), + leases: &fakeLeases{acquired: true}, + healthEvents: &fakeHealthEvents{}, + notifications: &fakeNotifications{}, + lobby: &fakeLobby{}, + telemetry: telemetryRuntime, + now: time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC), + stateDir: "/var/lib/galaxy/games/game-1", + } + + containerCfg := config.ContainerConfig{ + DefaultCPUQuota: 1.0, + DefaultMemory: "512m", + DefaultPIDsLimit: 512, + StopTimeout: 30 * time.Second, + Retention: 30 * 24 * time.Hour, + EngineStateMountPath: 
"/var/lib/galaxy-game", + EngineStateEnvName: "GAME_STATE_PATH", + GameStateDirMode: 0o750, + GameStateRoot: "/var/lib/galaxy/games", + } + dockerCfg := config.DockerConfig{ + Host: "unix:///var/run/docker.sock", + Network: "galaxy-net", + LogDriver: "json-file", + PullPolicy: config.ImagePullPolicyIfMissing, + } + coordinationCfg := config.CoordinationConfig{GameLeaseTTL: time.Minute} + + startService, err := startruntime.NewService(startruntime.Dependencies{ + RuntimeRecords: h.records, + OperationLogs: h.operationLogs, + Docker: h.docker, + Leases: h.leases, + HealthEvents: h.healthEvents, + Notifications: h.notifications, + Lobby: h.lobby, + Container: containerCfg, + DockerCfg: dockerCfg, + Coordination: coordinationCfg, + Telemetry: h.telemetry, + Clock: func() time.Time { return h.now }, + NewToken: func() string { return "inner-start-token" }, + PrepareStateDir: func(_ string) (string, error) { return h.stateDir, nil }, + }) + require.NoError(t, err) + h.startService = startService + + stopService, err := stopruntime.NewService(stopruntime.Dependencies{ + RuntimeRecords: h.records, + OperationLogs: h.operationLogs, + Docker: h.docker, + Leases: h.leases, + HealthEvents: h.healthEvents, + Container: containerCfg, + Coordination: coordinationCfg, + Telemetry: h.telemetry, + Clock: func() time.Time { return h.now }, + NewToken: func() string { return "inner-stop-token" }, + }) + require.NoError(t, err) + h.stopService = stopService + + return h +} + +func (h *harness) build(t *testing.T, tokens ...string) *restartruntime.Service { + t.Helper() + tokenIdx := 0 + tokenGen := func() string { + if tokenIdx >= len(tokens) { + return "outer-fallback" + } + t := tokens[tokenIdx] + tokenIdx++ + return t + } + service, err := restartruntime.NewService(restartruntime.Dependencies{ + RuntimeRecords: h.records, + OperationLogs: h.operationLogs, + Docker: h.docker, + Leases: h.leases, + StopService: h.stopService, + StartService: h.startService, + Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute}, + Telemetry: h.telemetry, + Clock: func() time.Time { return h.now }, + NewToken: tokenGen, + }) + require.NoError(t, err) + return service +} + +const imageRef = "registry.example.com/galaxy/game:1.4.7" + +func runningRecord(now time.Time) runtime.RuntimeRecord { + startedAt := now.Add(-time.Hour) + return runtime.RuntimeRecord{ + GameID: "game-1", + Status: runtime.StatusRunning, + CurrentContainerID: "ctr-old", + CurrentImageRef: imageRef, + EngineEndpoint: "http://galaxy-game-game-1:8080", + StatePath: "/var/lib/galaxy/games/game-1", + DockerNetwork: "galaxy-net", + StartedAt: &startedAt, + LastOpAt: startedAt, + CreatedAt: startedAt, + } +} + +func basicInput() restartruntime.Input { + return restartruntime.Input{ + GameID: "game-1", + OpSource: operation.OpSourceGMRest, + SourceRef: "rest-req-42", + } +} + +func sampleRunResult(now time.Time) ports.RunResult { + return ports.RunResult{ + ContainerID: "ctr-new", + EngineEndpoint: "http://galaxy-game-game-1:8080", + StartedAt: now, + } +} + +func expectInnerStart(h *harness) { + h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil) + h.docker.EXPECT().PullImage(gomock.Any(), imageRef, gomock.Any()).Return(nil) + h.docker.EXPECT().InspectImage(gomock.Any(), imageRef).Return(ports.ImageInspect{Ref: imageRef}, nil) + h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(sampleRunResult(h.now), nil) +} + +// --- happy path ------------------------------------------------------- + +func TestHandleRestartFromRunning(t 
*testing.T) { + h := newHarness(t) + h.records.stored["game-1"] = runningRecord(h.now) + + h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil) + h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil) + expectInnerStart(h) + + service := h.build(t, "outer-token") + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeSuccess, result.Outcome) + assert.Empty(t, result.ErrorCode) + assert.Equal(t, "ctr-new", result.Record.CurrentContainerID) + assert.Equal(t, imageRef, result.Record.CurrentImageRef) + assert.Equal(t, runtime.StatusRunning, result.Record.Status) + + stops := h.operationLogs.byKind(operation.OpKindStop) + starts := h.operationLogs.byKind(operation.OpKindStart) + restarts := h.operationLogs.byKind(operation.OpKindRestart) + require.Len(t, stops, 1, "inner stop appended its own entry") + require.Len(t, starts, 1, "inner start appended its own entry") + require.Len(t, restarts, 1, "outer restart appended one summary entry") + + assert.Equal(t, "rest-req-42", stops[0].SourceRef, "correlation id propagated to inner stop") + assert.Equal(t, "rest-req-42", starts[0].SourceRef, "correlation id propagated to inner start") + assert.Equal(t, "rest-req-42", restarts[0].SourceRef, "correlation id stored on outer restart") + assert.Equal(t, "ctr-new", restarts[0].ContainerID) + assert.Equal(t, imageRef, restarts[0].ImageRef) + + assert.Equal(t, []string{"outer-token"}, h.leases.acquires) + assert.Equal(t, []string{"outer-token"}, h.leases.releases) +} + +func TestHandleRestartFromStopped(t *testing.T) { + h := newHarness(t) + stoppedRecord := runningRecord(h.now) + stoppedRecord.Status = runtime.StatusStopped + stoppedAt := h.now.Add(-30 * time.Minute) + stoppedRecord.StoppedAt = &stoppedAt + h.records.stored["game-1"] = stoppedRecord + + // No docker.Stop because inner stop short-circuits via replay no-op. + h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil) + expectInnerStart(h) + + service := h.build(t, "outer-token") + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeSuccess, result.Outcome) + assert.Equal(t, "ctr-new", result.Record.CurrentContainerID) +} + +// --- correlation id fallback ----------------------------------------- + +func TestHandleGeneratesCorrelationWhenSourceRefEmpty(t *testing.T) { + h := newHarness(t) + h.records.stored["game-1"] = runningRecord(h.now) + + h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil) + h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil) + expectInnerStart(h) + + input := basicInput() + input.SourceRef = "" + + // First newToken call yields the lease token, second yields the + // correlation id fallback. 
+ service := h.build(t, "outer-token", "correlation-fallback") + result, err := service.Handle(context.Background(), input) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeSuccess, result.Outcome) + + stops := h.operationLogs.byKind(operation.OpKindStop) + starts := h.operationLogs.byKind(operation.OpKindStart) + restarts := h.operationLogs.byKind(operation.OpKindRestart) + require.Len(t, stops, 1) + require.Len(t, starts, 1) + require.Len(t, restarts, 1) + assert.Equal(t, "correlation-fallback", stops[0].SourceRef) + assert.Equal(t, "correlation-fallback", starts[0].SourceRef) + assert.Equal(t, "correlation-fallback", restarts[0].SourceRef) +} + +// --- failure paths --------------------------------------------------- + +func TestHandleNotFoundForMissingRecord(t *testing.T) { + h := newHarness(t) + service := h.build(t, "outer-token") + + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeFailure, result.Outcome) + assert.Equal(t, startruntime.ErrorCodeNotFound, result.ErrorCode) + assert.Empty(t, h.operationLogs.byKind(operation.OpKindStop)) + assert.Empty(t, h.operationLogs.byKind(operation.OpKindStart)) + require.Len(t, h.operationLogs.byKind(operation.OpKindRestart), 1) +} + +func TestHandleConflictForRemovedRecord(t *testing.T) { + h := newHarness(t) + removed := runningRecord(h.now) + removed.Status = runtime.StatusRemoved + removed.CurrentContainerID = "" + removedAt := h.now.Add(-time.Hour) + removed.RemovedAt = &removedAt + h.records.stored["game-1"] = removed + + service := h.build(t, "outer-token") + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeFailure, result.Outcome) + assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode) +} + +func TestHandleConflictWhenLeaseBusy(t *testing.T) { + h := newHarness(t) + h.leases.acquired = false + + service := h.build(t, "outer-token") + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeFailure, result.Outcome) + assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode) + assert.Empty(t, h.leases.releases, "release must not run when acquire returned false") +} + +func TestHandlePropagatesInnerStopFailure(t *testing.T) { + h := newHarness(t) + h.records.stored["game-1"] = runningRecord(h.now) + + h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(errors.New("daemon unreachable")) + + service := h.build(t, "outer-token") + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeFailure, result.Outcome) + assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode) + assert.Contains(t, result.ErrorMessage, "inner stop failed") +} + +func TestHandleServiceUnavailableOnDockerRemoveFailure(t *testing.T) { + h := newHarness(t) + h.records.stored["game-1"] = runningRecord(h.now) + + h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil) + h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(errors.New("disk i/o")) + + service := h.build(t, "outer-token") + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeFailure, result.Outcome) + assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode) + assert.Contains(t, result.ErrorMessage, "docker remove") + // inner stop did 
succeed and write its log entry; outer restart records failure. + require.Len(t, h.operationLogs.byKind(operation.OpKindStop), 1) + require.Len(t, h.operationLogs.byKind(operation.OpKindRestart), 1) +} + +func TestHandlePropagatesInnerStartFailure(t *testing.T) { + h := newHarness(t) + h.records.stored["game-1"] = runningRecord(h.now) + + h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil) + h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil) + h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil) + h.docker.EXPECT().PullImage(gomock.Any(), imageRef, gomock.Any()).Return(errors.New("manifest unknown")) + + service := h.build(t, "outer-token") + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeFailure, result.Outcome) + assert.Equal(t, startruntime.ErrorCodeImagePullFailed, result.ErrorCode) + assert.Contains(t, result.ErrorMessage, "inner start failed") +} + +// --- input validation ------------------------------------------------ + +func TestHandleRejectsInvalidInput(t *testing.T) { + h := newHarness(t) + service := h.build(t, "outer-token") + + cases := []restartruntime.Input{ + {GameID: "", OpSource: operation.OpSourceGMRest}, + {GameID: "g", OpSource: operation.OpSource("bogus")}, + } + for _, input := range cases { + result, err := service.Handle(context.Background(), input) + require.NoError(t, err) + assert.Equal(t, startruntime.ErrorCodeInvalidRequest, result.ErrorCode) + } +} + +// --- constructor ----------------------------------------------------- + +func TestNewServiceRejectsMissingDependencies(t *testing.T) { + h := newHarness(t) + deps := restartruntime.Dependencies{ + Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute}, + Telemetry: h.telemetry, + } + _, err := restartruntime.NewService(deps) + require.Error(t, err) +} diff --git a/rtmanager/internal/service/startruntime/errors.go b/rtmanager/internal/service/startruntime/errors.go new file mode 100644 index 0000000..8e80568 --- /dev/null +++ b/rtmanager/internal/service/startruntime/errors.go @@ -0,0 +1,68 @@ +package startruntime + +// Stable error codes returned in `Result.ErrorCode`. The values match the +// vocabulary frozen by `rtmanager/README.md §Error Model`, +// `rtmanager/api/internal-openapi.yaml`, and +// `rtmanager/api/runtime-jobs-asyncapi.yaml`. Although the constants live +// in the start-service package they are the canonical home for every +// lifecycle service in `internal/service/`. Stop, restart, patch, +// cleanup, the REST handlers, and the stream consumers import these +// names rather than redeclare them; renaming any of them is a contract +// change. +const ( + // ErrorCodeReplayNoOp reports that the request was an idempotent + // replay against an already-running record with the same image_ref. + ErrorCodeReplayNoOp = "replay_no_op" + + // ErrorCodeStartConfigInvalid reports that the start request was + // rejected before any Docker work because of a validation failure + // (invalid image_ref shape, missing Docker network, unwritable state + // directory). + ErrorCodeStartConfigInvalid = "start_config_invalid" + + // ErrorCodeImagePullFailed reports that the image pull stage failed. + ErrorCodeImagePullFailed = "image_pull_failed" + + // ErrorCodeContainerStartFailed reports that `docker create` or + // `docker start` failed, or that the runtime record could not be + // installed after a successful Run. 
+ ErrorCodeContainerStartFailed = "container_start_failed" + + // ErrorCodeConflict reports an operation incompatible with the + // current runtime state (lease busy, running record with a different + // image_ref, cleanup attempted on a running runtime, restart or + // patch attempted on a removed record). + ErrorCodeConflict = "conflict" + + // ErrorCodeServiceUnavailable reports that a steady-state dependency + // (Docker daemon, PostgreSQL, Redis) was unreachable for this call. + ErrorCodeServiceUnavailable = "service_unavailable" + + // ErrorCodeInternal reports an unexpected error not classified by + // the other codes. + ErrorCodeInternal = "internal_error" + + // ErrorCodeInvalidRequest reports that the request was rejected + // because of structural input validation (empty required fields, + // unknown enum values). Used by the stop / restart / patch / + // cleanup services for malformed Input. The start service uses the + // stricter `start_config_invalid` code instead because every start + // validation failure also raises an admin notification intent. + ErrorCodeInvalidRequest = "invalid_request" + + // ErrorCodeNotFound reports that the runtime record requested by a + // stop, restart, patch or cleanup operation does not exist. Those + // services raise it; the start service never does (start installs + // the record on first call). + ErrorCodeNotFound = "not_found" + + // ErrorCodeImageRefNotSemver reports that a patch operation was + // rejected because either the current or the new image reference + // could not be parsed as a semver tag. + ErrorCodeImageRefNotSemver = "image_ref_not_semver" + + // ErrorCodeSemverPatchOnly reports that a patch operation was + // rejected because the major or minor component differs between the + // current and new image references. + ErrorCodeSemverPatchOnly = "semver_patch_only" +) diff --git a/rtmanager/internal/service/startruntime/service.go b/rtmanager/internal/service/startruntime/service.go new file mode 100644 index 0000000..6e3fbb9 --- /dev/null +++ b/rtmanager/internal/service/startruntime/service.go @@ -0,0 +1,940 @@ +// Package startruntime implements the `start` lifecycle operation owned +// by Runtime Manager. The service is the single orchestrator behind +// both the asynchronous `runtime:start_jobs` consumer and the +// synchronous `POST /api/v1/internal/runtimes/{game_id}/start` REST +// handler; both callers obtain a deterministic Result with a stable +// `Outcome` / `ErrorCode` pair. +// +// Lifecycle and failure-mode semantics follow `rtmanager/README.md +// §Lifecycles → Start`. Design rationale is captured in +// `rtmanager/docs/services.md`. +package startruntime + +import ( + "context" + "crypto/rand" + "encoding/base64" + "encoding/json" + "errors" + "fmt" + "log/slog" + "os" + "path/filepath" + "strconv" + "strings" + "time" + + "galaxy/notificationintent" + "galaxy/rtmanager/internal/config" + "galaxy/rtmanager/internal/domain/health" + "galaxy/rtmanager/internal/domain/operation" + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/logging" + "galaxy/rtmanager/internal/ports" + "galaxy/rtmanager/internal/telemetry" + + "github.com/distribution/reference" +) + +// Container labels applied to every engine container created by the +// start service. Frozen by `rtmanager/README.md §Container Model`. 
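+//
+// For a game with id "game-1" started from image
+// registry.example.com/galaxy/game:1.4.7 the resulting label set would look
+// like this (illustrative values only):
+//
+//	com.galaxy.owner=rtmanager
+//	com.galaxy.kind=game-engine
+//	com.galaxy.game_id=game-1
+//	com.galaxy.engine_image_ref=registry.example.com/galaxy/game:1.4.7
+//	com.galaxy.started_at_ms=<start time in Unix milliseconds>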
+const ( + LabelOwner = "com.galaxy.owner" + LabelOwnerValue = "rtmanager" + LabelKind = "com.galaxy.kind" + LabelKindValue = "game-engine" + LabelGameID = "com.galaxy.game_id" + LabelEngineImageRef = "com.galaxy.engine_image_ref" + LabelStartedAtMs = "com.galaxy.started_at_ms" + + // Image labels read at start time to derive resource limits. + imageLabelCPUQuota = "com.galaxy.cpu_quota" + imageLabelMemory = "com.galaxy.memory" + imageLabelPIDsLimit = "com.galaxy.pids_limit" + + // HostnamePrefix is the constant prefix used to build the per-game + // container hostname (`galaxy-game-{game_id}`). The full hostname + // also forms the container name; restart and patch keep the same + // value so the engine endpoint stays stable across container + // recreates. + HostnamePrefix = "galaxy-game-" + + // EngineStateBackCompatEnvName is the secondary env var name v1 + // engines accept for the bind-mounted state directory. Always set + // alongside the configured primary name to honour the v1 backward + // compatibility commitment in `rtmanager/README.md §Container Model`. + EngineStateBackCompatEnvName = "STORAGE_PATH" + + // leaseReleaseTimeout bounds the deferred lease-release call. A + // fresh background context is used so the release runs even when + // the request context was already canceled. + leaseReleaseTimeout = 5 * time.Second +) + +// Input stores the per-call arguments for one start operation. +type Input struct { + // GameID identifies the platform game to start. + GameID string + + // ImageRef stores the producer-resolved Docker reference of the + // engine image. Validated against `distribution/reference` before + // any Docker work. + ImageRef string + + // OpSource classifies how the request entered Runtime Manager. + // Required: every operation_log entry carries an op_source. + OpSource operation.OpSource + + // SourceRef stores the optional opaque per-source reference + // (Redis Stream entry id, REST request id, admin user id). Empty + // when the caller does not provide one. + SourceRef string +} + +// Validate reports whether input carries the structural invariants the +// service requires. +func (input Input) Validate() error { + if strings.TrimSpace(input.GameID) == "" { + return fmt.Errorf("game id must not be empty") + } + if strings.TrimSpace(input.ImageRef) == "" { + return fmt.Errorf("image ref must not be empty") + } + if !input.OpSource.IsKnown() { + return fmt.Errorf("op source %q is unsupported", input.OpSource) + } + return nil +} + +// Result stores the deterministic outcome of one Handle call. +type Result struct { + // Record carries the runtime record installed by the operation. + // Populated on success and on idempotent replay (`replay_no_op`); + // zero on failure. + Record runtime.RuntimeRecord + + // Outcome reports whether the operation completed (success) or + // produced a stable failure code. + Outcome operation.Outcome + + // ErrorCode stores the stable error code on failure, or + // `replay_no_op` on idempotent replay. Empty for fresh successes. + ErrorCode string + + // ErrorMessage stores the operator-readable detail on failure. + // Empty for successes. + ErrorMessage string +} + +// Dependencies groups the collaborators required by Service. +type Dependencies struct { + // RuntimeRecords reads and installs the durable runtime record. + RuntimeRecords ports.RuntimeRecordStore + + // OperationLogs records the success / failure audit entry. 
+ OperationLogs ports.OperationLogStore + + // Docker drives the Docker daemon (network check, pull, inspect, + // run, remove). + Docker ports.DockerClient + + // Leases serialises operations against the same game id. + Leases ports.GameLeaseStore + + // HealthEvents publishes `runtime:health_events` and upserts the + // matching `health_snapshots` row. + HealthEvents ports.HealthEventPublisher + + // Notifications publishes admin-only failure intents. + Notifications ports.NotificationIntentPublisher + + // Lobby provides best-effort diagnostic context for the started + // game. May be nil; the start operation does not depend on it. + Lobby ports.LobbyInternalClient + + // Container groups the per-container defaults and state-directory + // settings consumed at start time. + Container config.ContainerConfig + + // Docker groups the Docker daemon settings (network, log driver, + // pull policy) consumed at start time. + DockerCfg config.DockerConfig + + // Coordination supplies the per-game lease TTL. + Coordination config.CoordinationConfig + + // Telemetry records start outcomes, lease latency, and health + // event counters. Required. + Telemetry *telemetry.Runtime + + // Logger records structured service-level events. Defaults to + // `slog.Default()` when nil. + Logger *slog.Logger + + // Clock supplies the wall-clock used for operation timestamps. + // Defaults to `time.Now` when nil. + Clock func() time.Time + + // NewToken supplies a unique opaque lease token. Defaults to a + // 32-byte random base64url string when nil. Tests may override. + NewToken func() string + + // PrepareStateDir creates the per-game state directory and + // returns its absolute host path. Defaults to a real-filesystem + // implementation that honours Container.GameStateRoot, + // Container.GameStateDirMode, and Container.GameStateOwner{UID,GID}. + // Tests override to point at a temporary directory. + PrepareStateDir func(gameID string) (string, error) +} + +// Service executes the start lifecycle operation. +type Service struct { + runtimeRecords ports.RuntimeRecordStore + operationLogs ports.OperationLogStore + docker ports.DockerClient + leases ports.GameLeaseStore + healthEvents ports.HealthEventPublisher + notifications ports.NotificationIntentPublisher + lobby ports.LobbyInternalClient + + containerCfg config.ContainerConfig + dockerCfg config.DockerConfig + leaseTTL time.Duration + + telemetry *telemetry.Runtime + logger *slog.Logger + + clock func() time.Time + newToken func() string + prepareStateDir func(gameID string) (string, error) +} + +// NewService constructs one Service from deps. 
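+// Required collaborators and the Container, DockerCfg, and Coordination
+// configs are checked up front; the returned error names the first missing
+// or invalid piece. Logger, Clock, NewToken, and PrepareStateDir may be left
+// nil and fall back to the defaults documented on Dependencies.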
+func NewService(deps Dependencies) (*Service, error) { + switch { + case deps.RuntimeRecords == nil: + return nil, errors.New("new start runtime service: nil runtime records") + case deps.OperationLogs == nil: + return nil, errors.New("new start runtime service: nil operation logs") + case deps.Docker == nil: + return nil, errors.New("new start runtime service: nil docker client") + case deps.Leases == nil: + return nil, errors.New("new start runtime service: nil lease store") + case deps.HealthEvents == nil: + return nil, errors.New("new start runtime service: nil health events publisher") + case deps.Notifications == nil: + return nil, errors.New("new start runtime service: nil notification publisher") + case deps.Telemetry == nil: + return nil, errors.New("new start runtime service: nil telemetry runtime") + } + if err := deps.Container.Validate(); err != nil { + return nil, fmt.Errorf("new start runtime service: container config: %w", err) + } + if err := deps.DockerCfg.Validate(); err != nil { + return nil, fmt.Errorf("new start runtime service: docker config: %w", err) + } + if err := deps.Coordination.Validate(); err != nil { + return nil, fmt.Errorf("new start runtime service: coordination config: %w", err) + } + + clock := deps.Clock + if clock == nil { + clock = time.Now + } + logger := deps.Logger + if logger == nil { + logger = slog.Default() + } + logger = logger.With("service", "rtmanager.startruntime") + + newToken := deps.NewToken + if newToken == nil { + newToken = defaultTokenGenerator() + } + prepareStateDir := deps.PrepareStateDir + if prepareStateDir == nil { + prepareStateDir = newDefaultStateDirPreparer(deps.Container) + } + + return &Service{ + runtimeRecords: deps.RuntimeRecords, + operationLogs: deps.OperationLogs, + docker: deps.Docker, + leases: deps.Leases, + healthEvents: deps.HealthEvents, + notifications: deps.Notifications, + lobby: deps.Lobby, + containerCfg: deps.Container, + dockerCfg: deps.DockerCfg, + leaseTTL: deps.Coordination.GameLeaseTTL, + telemetry: deps.Telemetry, + logger: logger, + clock: clock, + newToken: newToken, + prepareStateDir: prepareStateDir, + }, nil +} + +// Handle executes one start operation end-to-end. The Go-level error +// return is reserved for non-business failures (nil context, nil +// receiver). Every business outcome — fresh success, idempotent +// replay, or any of the stable failure modes — flows through Result. 
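+//
+// An illustrative call site (svc and the literals are placeholders, not part
+// of the package API):
+//
+//	result, err := svc.Handle(ctx, Input{
+//		GameID:   "game-1",
+//		ImageRef: "registry.example.com/galaxy/game:1.4.7",
+//		OpSource: operation.OpSourceGMRest,
+//	})
+//	if err == nil && result.ErrorCode == ErrorCodeReplayNoOp {
+//		// the runtime was already running this image; result.Record
+//		// carries the existing record
+//	}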
+func (service *Service) Handle(ctx context.Context, input Input) (Result, error) { + if service == nil { + return Result{}, errors.New("start runtime: nil service") + } + if ctx == nil { + return Result{}, errors.New("start runtime: nil context") + } + + opStartedAt := service.clock().UTC() + + if err := input.Validate(); err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: ErrorCodeStartConfigInvalid, + errorMessage: err.Error(), + notificationType: notificationintent.NotificationTypeRuntimeStartConfigInvalid, + }), nil + } + + token := service.newToken() + leaseStart := service.clock() + acquired, err := service.leases.TryAcquire(ctx, input.GameID, token, service.leaseTTL) + service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(leaseStart)) + if err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: ErrorCodeServiceUnavailable, + errorMessage: fmt.Sprintf("acquire game lease: %s", err.Error()), + }), nil + } + if !acquired { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: ErrorCodeConflict, + errorMessage: "another lifecycle operation is in progress for this game", + }), nil + } + defer service.releaseLease(ctx, input.GameID, token) + + return service.runUnderLease(ctx, input, opStartedAt) +} + +// Run executes the start lifecycle assuming the per-game lease is +// already held by the caller. The method is reserved for orchestrator +// services in `internal/service/` that compose start with another +// operation under a single outer lease (restart and patch). External +// callers must use Handle, which acquires and releases the lease +// itself. +// +// Run still validates input and reports business outcomes through +// Result; the Go-level error return is reserved for non-business +// failures (nil context, nil receiver). Operation log entries, +// telemetry counters, health events and admin-only notification +// intents fire identically to Handle. +func (service *Service) Run(ctx context.Context, input Input) (Result, error) { + if service == nil { + return Result{}, errors.New("start runtime: nil service") + } + if ctx == nil { + return Result{}, errors.New("start runtime: nil context") + } + + opStartedAt := service.clock().UTC() + + if err := input.Validate(); err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: ErrorCodeStartConfigInvalid, + errorMessage: err.Error(), + notificationType: notificationintent.NotificationTypeRuntimeStartConfigInvalid, + }), nil + } + + return service.runUnderLease(ctx, input, opStartedAt) +} + +// runUnderLease executes the post-validation, lease-protected start +// steps shared by Handle and Run. Callers must validate input and +// acquire the lease (when applicable) before invocation. 
+func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) { + existing, hasExisting, err := service.loadExisting(ctx, input.GameID) + if err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: ErrorCodeInternal, + errorMessage: fmt.Sprintf("load runtime record: %s", err.Error()), + }), nil + } + if hasExisting && existing.Status == runtime.StatusRunning { + if existing.CurrentImageRef == input.ImageRef { + return service.recordReplayNoOp(ctx, opStartedAt, input, existing), nil + } + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: ErrorCodeConflict, + errorMessage: fmt.Sprintf("runtime already running with image_ref %q", existing.CurrentImageRef), + }), nil + } + + service.fetchLobbyDiagnostic(ctx, input.GameID) + + if err := validateImageRef(input.ImageRef); err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: ErrorCodeStartConfigInvalid, + errorMessage: fmt.Sprintf("invalid image_ref: %s", err.Error()), + notificationType: notificationintent.NotificationTypeRuntimeStartConfigInvalid, + }), nil + } + + if err := service.docker.EnsureNetwork(ctx, service.dockerCfg.Network); err != nil { + if errors.Is(err, ports.ErrNetworkMissing) { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: ErrorCodeStartConfigInvalid, + errorMessage: fmt.Sprintf("docker network %q is missing", service.dockerCfg.Network), + notificationType: notificationintent.NotificationTypeRuntimeStartConfigInvalid, + }), nil + } + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: ErrorCodeServiceUnavailable, + errorMessage: fmt.Sprintf("ensure docker network: %s", err.Error()), + }), nil + } + + if err := service.docker.PullImage(ctx, input.ImageRef, ports.PullPolicy(service.dockerCfg.PullPolicy)); err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: ErrorCodeImagePullFailed, + errorMessage: err.Error(), + notificationType: notificationintent.NotificationTypeRuntimeImagePullFailed, + }), nil + } + + imageInspect, err := service.docker.InspectImage(ctx, input.ImageRef) + if err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: ErrorCodeImagePullFailed, + errorMessage: fmt.Sprintf("inspect image: %s", err.Error()), + notificationType: notificationintent.NotificationTypeRuntimeImagePullFailed, + }), nil + } + cpuQuota, memory, pidsLimit := service.resolveLimits(imageInspect.Labels) + + statePath, err := service.prepareStateDir(input.GameID) + if err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: ErrorCodeStartConfigInvalid, + errorMessage: fmt.Sprintf("prepare state directory: %s", err.Error()), + notificationType: notificationintent.NotificationTypeRuntimeStartConfigInvalid, + }), nil + } + + hostname := containerHostname(input.GameID) + spec := ports.RunSpec{ + Name: hostname, + Image: input.ImageRef, + Hostname: hostname, + Network: service.dockerCfg.Network, + Env: service.buildEnv(), + Labels: service.buildLabels(input.GameID, input.ImageRef, opStartedAt), + BindMounts: []ports.BindMount{{ + HostPath: statePath, + MountPath: service.containerCfg.EngineStateMountPath, + ReadOnly: false, + }}, + 
LogDriver: service.dockerCfg.LogDriver, + LogOpts: parseLogOpts(service.dockerCfg.LogOpts), + CPUQuota: cpuQuota, + Memory: memory, + PIDsLimit: pidsLimit, + } + runResult, err := service.docker.Run(ctx, spec) + if err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: ErrorCodeContainerStartFailed, + errorMessage: err.Error(), + notificationType: notificationintent.NotificationTypeRuntimeContainerStartFailed, + }), nil + } + + createdAt := opStartedAt + if hasExisting && !existing.CreatedAt.IsZero() { + createdAt = existing.CreatedAt + } + startedAt := runResult.StartedAt + record := runtime.RuntimeRecord{ + GameID: input.GameID, + Status: runtime.StatusRunning, + CurrentContainerID: runResult.ContainerID, + CurrentImageRef: input.ImageRef, + EngineEndpoint: runResult.EngineEndpoint, + StatePath: statePath, + DockerNetwork: service.dockerCfg.Network, + StartedAt: &startedAt, + LastOpAt: startedAt, + CreatedAt: createdAt, + } + if err := service.runtimeRecords.Upsert(ctx, record); err != nil { + service.bestEffortRemove(input.GameID, runResult.ContainerID) + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: ErrorCodeContainerStartFailed, + errorMessage: fmt.Sprintf("upsert runtime record: %s", err.Error()), + containerID: runResult.ContainerID, + notificationType: notificationintent.NotificationTypeRuntimeContainerStartFailed, + }), nil + } + + finishedAt := service.clock().UTC() + service.bestEffortAppend(ctx, operation.OperationEntry{ + GameID: input.GameID, + OpKind: operation.OpKindStart, + OpSource: input.OpSource, + SourceRef: input.SourceRef, + ImageRef: input.ImageRef, + ContainerID: runResult.ContainerID, + Outcome: operation.OutcomeSuccess, + StartedAt: opStartedAt, + FinishedAt: &finishedAt, + }) + service.bestEffortPublishHealth(ctx, ports.HealthEventEnvelope{ + GameID: input.GameID, + ContainerID: runResult.ContainerID, + EventType: health.EventTypeContainerStarted, + OccurredAt: startedAt, + Details: containerStartedDetails(input.ImageRef), + }) + + service.telemetry.RecordStartOutcome(ctx, string(operation.OutcomeSuccess), "", string(input.OpSource)) + service.telemetry.RecordHealthEvent(ctx, string(health.EventTypeContainerStarted)) + + logArgs := []any{ + "game_id", input.GameID, + "container_id", runResult.ContainerID, + "image_ref", input.ImageRef, + "op_source", string(input.OpSource), + } + logArgs = append(logArgs, logging.ContextAttrs(ctx)...) + service.logger.InfoContext(ctx, "runtime started", logArgs...) + + return Result{ + Record: record, + Outcome: operation.OutcomeSuccess, + }, nil +} + +// failureCtx groups the inputs to recordFailure so the Handle method +// stays readable. +type failureCtx struct { + opStartedAt time.Time + input Input + errorCode string + errorMessage string + containerID string + notificationType notificationintent.NotificationType +} + +// recordFailure records the failure operation_log entry, publishes the +// matching admin-only notification intent (when applicable), and emits +// telemetry. All side effects are best-effort; a downstream failure is +// logged but does not change the returned Result. 
+func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result { + finishedAt := service.clock().UTC() + service.bestEffortAppend(ctx, operation.OperationEntry{ + GameID: fc.input.GameID, + OpKind: operation.OpKindStart, + OpSource: fc.input.OpSource, + SourceRef: fc.input.SourceRef, + ImageRef: fc.input.ImageRef, + ContainerID: fc.containerID, + Outcome: operation.OutcomeFailure, + ErrorCode: fc.errorCode, + ErrorMessage: fc.errorMessage, + StartedAt: fc.opStartedAt, + FinishedAt: &finishedAt, + }) + + if fc.notificationType != "" { + service.bestEffortNotify(ctx, fc) + } + + service.telemetry.RecordStartOutcome(ctx, string(operation.OutcomeFailure), fc.errorCode, string(fc.input.OpSource)) + + logArgs := []any{ + "game_id", fc.input.GameID, + "image_ref", fc.input.ImageRef, + "op_source", string(fc.input.OpSource), + "error_code", fc.errorCode, + "error_message", fc.errorMessage, + } + logArgs = append(logArgs, logging.ContextAttrs(ctx)...) + service.logger.WarnContext(ctx, "runtime start failed", logArgs...) + + return Result{ + Outcome: operation.OutcomeFailure, + ErrorCode: fc.errorCode, + ErrorMessage: fc.errorMessage, + } +} + +// recordReplayNoOp records the idempotent replay outcome and returns +// the existing record. The operation_log entry is appended best-effort +// so audit history captures the replay; telemetry counts the call as a +// successful start with `error_code=replay_no_op`. +func (service *Service) recordReplayNoOp(ctx context.Context, opStartedAt time.Time, input Input, existing runtime.RuntimeRecord) Result { + finishedAt := service.clock().UTC() + service.bestEffortAppend(ctx, operation.OperationEntry{ + GameID: input.GameID, + OpKind: operation.OpKindStart, + OpSource: input.OpSource, + SourceRef: input.SourceRef, + ImageRef: input.ImageRef, + ContainerID: existing.CurrentContainerID, + Outcome: operation.OutcomeSuccess, + ErrorCode: ErrorCodeReplayNoOp, + StartedAt: opStartedAt, + FinishedAt: &finishedAt, + }) + service.telemetry.RecordStartOutcome(ctx, string(operation.OutcomeSuccess), ErrorCodeReplayNoOp, string(input.OpSource)) + + logArgs := []any{ + "game_id", input.GameID, + "container_id", existing.CurrentContainerID, + "image_ref", input.ImageRef, + "op_source", string(input.OpSource), + } + logArgs = append(logArgs, logging.ContextAttrs(ctx)...) + service.logger.InfoContext(ctx, "runtime start replay no-op", logArgs...) + + return Result{ + Record: existing, + Outcome: operation.OutcomeSuccess, + ErrorCode: ErrorCodeReplayNoOp, + } +} + +// loadExisting reads the runtime record for gameID. The boolean return +// reports whether a record exists; ErrNotFound is translated to +// (zero, false, nil) so the caller does not branch on the sentinel +// elsewhere. +func (service *Service) loadExisting(ctx context.Context, gameID string) (runtime.RuntimeRecord, bool, error) { + record, err := service.runtimeRecords.Get(ctx, gameID) + switch { + case errors.Is(err, runtime.ErrNotFound): + return runtime.RuntimeRecord{}, false, nil + case err != nil: + return runtime.RuntimeRecord{}, false, err + default: + return record, true, nil + } +} + +// fetchLobbyDiagnostic best-effort enriches the request log with the +// Lobby-side game record. A nil Lobby client or any transport failure +// is logged and the start operation continues. 
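+// The fetch is purely diagnostic: the result is only logged
+// (lobby_status, lobby_target_engine_version) and never feeds back
+// into the start decision.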
+func (service *Service) fetchLobbyDiagnostic(ctx context.Context, gameID string) { + if service.lobby == nil { + return + } + record, err := service.lobby.GetGame(ctx, gameID) + if err != nil { + service.logger.DebugContext(ctx, "lobby diagnostic fetch failed", + "game_id", gameID, + "err", err.Error(), + ) + return + } + service.logger.DebugContext(ctx, "lobby diagnostic fetched", + "game_id", gameID, + "lobby_status", record.Status, + "lobby_target_engine_version", record.TargetEngineVersion, + ) +} + +// resolveLimits derives the per-container resource limits from the +// resolved image's labels with config-driven fallbacks. Unparseable +// label values silently fall back to the configured default; operators +// see the chosen value through `rtmanager.docker_op_latency` and start +// logs. +func (service *Service) resolveLimits(labels map[string]string) (cpuQuota float64, memory string, pidsLimit int) { + cpuQuota = service.containerCfg.DefaultCPUQuota + memory = service.containerCfg.DefaultMemory + pidsLimit = service.containerCfg.DefaultPIDsLimit + + if raw, ok := labels[imageLabelCPUQuota]; ok { + if value, err := strconv.ParseFloat(raw, 64); err == nil && value > 0 { + cpuQuota = value + } + } + if raw, ok := labels[imageLabelMemory]; ok && strings.TrimSpace(raw) != "" { + memory = raw + } + if raw, ok := labels[imageLabelPIDsLimit]; ok { + if value, err := strconv.Atoi(raw); err == nil && value > 0 { + pidsLimit = value + } + } + return cpuQuota, memory, pidsLimit +} + +// buildEnv assembles the env-var map handed to the engine. Both the +// configured primary name and `STORAGE_PATH` are set per +// `rtmanager/README.md §Container Model` v1 backward compatibility. +func (service *Service) buildEnv() map[string]string { + mount := service.containerCfg.EngineStateMountPath + env := map[string]string{ + service.containerCfg.EngineStateEnvName: mount, + EngineStateBackCompatEnvName: mount, + } + return env +} + +// buildLabels assembles the container labels per +// `rtmanager/README.md §Container Model`. +func (service *Service) buildLabels(gameID, imageRef string, startedAt time.Time) map[string]string { + return map[string]string{ + LabelOwner: LabelOwnerValue, + LabelKind: LabelKindValue, + LabelGameID: gameID, + LabelEngineImageRef: imageRef, + LabelStartedAtMs: strconv.FormatInt(startedAt.UTC().UnixMilli(), 10), + } +} + +// releaseLease releases the per-game lease in a fresh background +// context so a canceled request context does not leave the lease +// pinned for its TTL. +func (service *Service) releaseLease(ctx context.Context, gameID, token string) { + cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout) + defer cancel() + if err := service.leases.Release(cleanupCtx, gameID, token); err != nil { + service.logger.WarnContext(ctx, "release game lease", + "game_id", gameID, + "err", err.Error(), + ) + } +} + +// bestEffortAppend writes one operation_log entry. A failure is logged +// and discarded; the durable runtime record (or its absence) remains +// the source of truth. +func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) { + if _, err := service.operationLogs.Append(ctx, entry); err != nil { + service.logger.ErrorContext(ctx, "append operation log", + "game_id", entry.GameID, + "op_kind", string(entry.OpKind), + "outcome", string(entry.Outcome), + "error_code", entry.ErrorCode, + "err", err.Error(), + ) + } +} + +// bestEffortPublishHealth emits one health event + snapshot upsert. 
+// Failures degrade silently per `rtmanager/README.md §Notification +// Contracts`; the runtime record remains the source of truth. +func (service *Service) bestEffortPublishHealth(ctx context.Context, envelope ports.HealthEventEnvelope) { + if err := service.healthEvents.Publish(ctx, envelope); err != nil { + service.logger.ErrorContext(ctx, "publish health event", + "game_id", envelope.GameID, + "container_id", envelope.ContainerID, + "event_type", string(envelope.EventType), + "err", err.Error(), + ) + } +} + +// bestEffortNotify publishes one admin-only failure intent. Failures +// degrade silently because the source business state already reflects +// the outcome. +func (service *Service) bestEffortNotify(ctx context.Context, fc failureCtx) { + intent, err := buildFailureIntent(fc, service.clock().UTC()) + if err != nil { + service.logger.ErrorContext(ctx, "build notification intent", + "game_id", fc.input.GameID, + "notification_type", string(fc.notificationType), + "err", err.Error(), + ) + return + } + if err := service.notifications.Publish(ctx, intent); err != nil { + service.logger.ErrorContext(ctx, "publish notification intent", + "game_id", fc.input.GameID, + "notification_type", string(fc.notificationType), + "err", err.Error(), + ) + return + } + service.telemetry.RecordNotificationIntent(ctx, string(fc.notificationType)) +} + +// bestEffortRemove forces removal of a container left running by a +// failed start that progressed past Run but failed to register the +// runtime record. Failures degrade silently — the reconciler adopts +// orphans the periodic pass observes. +func (service *Service) bestEffortRemove(gameID, containerID string) { + cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout) + defer cancel() + if err := service.docker.Remove(cleanupCtx, containerID); err != nil { + service.logger.ErrorContext(cleanupCtx, "rollback container after upsert failure", + "game_id", gameID, + "container_id", containerID, + "err", err.Error(), + ) + } +} + +// containerHostname builds the per-game hostname that doubles as the +// Docker container name. +func containerHostname(gameID string) string { + return HostnamePrefix + gameID +} + +// containerStartedDetails builds the `details` payload required by the +// `container_started` AsyncAPI variant. +func containerStartedDetails(imageRef string) json.RawMessage { + payload := map[string]string{"image_ref": imageRef} + encoded, _ := json.Marshal(payload) + return encoded +} + +// validateImageRef rejects malformed Docker references before any +// daemon round-trip. The validation surfaces as `start_config_invalid`; +// daemon-side rejections after a valid parse are reported as +// `image_pull_failed`. +func validateImageRef(ref string) error { + if strings.TrimSpace(ref) == "" { + return fmt.Errorf("image ref must not be empty") + } + if _, err := reference.ParseNormalizedNamed(ref); err != nil { + return err + } + return nil +} + +// parseLogOpts turns the `key=value,key2=value2` shape of the +// `RTMANAGER_DOCKER_LOG_OPTS` config into a map suitable for the +// Docker SDK. Empty input returns nil so the SDK uses driver defaults. 
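+//
+// For example (values are illustrative), `max-size=10m,max-file=3`
+// parses to {"max-size": "10m", "max-file": "3"}; entries without a
+// key before '=' are skipped.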
+func parseLogOpts(raw string) map[string]string {
+	if strings.TrimSpace(raw) == "" {
+		return nil
+	}
+	out := make(map[string]string)
+	for part := range strings.SplitSeq(raw, ",") {
+		entry := strings.TrimSpace(part)
+		if entry == "" {
+			continue
+		}
+		index := strings.IndexByte(entry, '=')
+		if index <= 0 {
+			continue
+		}
+		out[entry[:index]] = entry[index+1:]
+	}
+	if len(out) == 0 {
+		return nil
+	}
+	return out
+}
+
+// buildFailureIntent constructs the admin-only notification intent for
+// fc. The idempotency key is scoped per (notification_type, game_id,
+// attempted_at_ms) so a republished intent for the same failed attempt
+// is recognised as a duplicate by Notification Service.
+func buildFailureIntent(fc failureCtx, attemptedAt time.Time) (notificationintent.Intent, error) {
+	attemptedAtMs := attemptedAt.UnixMilli()
+	idempotencyKey := fmt.Sprintf("%s.%s.%d", fc.notificationType, fc.input.GameID, attemptedAtMs)
+	metadata := notificationintent.Metadata{
+		IdempotencyKey: idempotencyKey,
+		OccurredAt:     attemptedAt,
+	}
+
+	switch fc.notificationType {
+	case notificationintent.NotificationTypeRuntimeImagePullFailed:
+		return notificationintent.NewRuntimeImagePullFailedIntent(metadata, notificationintent.RuntimeImagePullFailedPayload{
+			GameID:        fc.input.GameID,
+			ImageRef:      fc.input.ImageRef,
+			ErrorCode:     fc.errorCode,
+			ErrorMessage:  fc.errorMessage,
+			AttemptedAtMs: attemptedAtMs,
+		})
+	case notificationintent.NotificationTypeRuntimeContainerStartFailed:
+		return notificationintent.NewRuntimeContainerStartFailedIntent(metadata, notificationintent.RuntimeContainerStartFailedPayload{
+			GameID:        fc.input.GameID,
+			ImageRef:      fc.input.ImageRef,
+			ErrorCode:     fc.errorCode,
+			ErrorMessage:  fc.errorMessage,
+			AttemptedAtMs: attemptedAtMs,
+		})
+	case notificationintent.NotificationTypeRuntimeStartConfigInvalid:
+		return notificationintent.NewRuntimeStartConfigInvalidIntent(metadata, notificationintent.RuntimeStartConfigInvalidPayload{
+			GameID:        fc.input.GameID,
+			ImageRef:      fc.input.ImageRef,
+			ErrorCode:     fc.errorCode,
+			ErrorMessage:  fc.errorMessage,
+			AttemptedAtMs: attemptedAtMs,
+		})
+	default:
+		return notificationintent.Intent{}, fmt.Errorf("unsupported notification type %q", fc.notificationType)
+	}
+}
+
+// defaultTokenGenerator returns a function that produces 32-byte
+// base64url-encoded tokens. The randomness source is `crypto/rand`;
+// failures fall back to a fixed sentinel token so the caller observes
+// a TryAcquire collision rather than a panic on a degraded entropy
+// source.
+func defaultTokenGenerator() func() string {
+	return func() string {
+		var buf [32]byte
+		if _, err := rand.Read(buf[:]); err != nil {
+			return "rtmanager-fallback-token"
+		}
+		return base64.RawURLEncoding.EncodeToString(buf[:])
+	}
+}
+
+// newDefaultStateDirPreparer returns a function that creates the
+// per-game state directory under cfg.GameStateRoot with the configured
+// permissions and ownership. The function is overridable through
+// Dependencies.PrepareStateDir; tests inject a temporary-dir fake.
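+//
+// For example (paths are illustrative), with GameStateRoot
+// "/var/lib/galaxy/games" the prepared directory for game id "game-1"
+// is "/var/lib/galaxy/games/game-1".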
+func newDefaultStateDirPreparer(cfg config.ContainerConfig) func(gameID string) (string, error) { + mode := os.FileMode(cfg.GameStateDirMode) + uid := cfg.GameStateOwnerUID + gid := cfg.GameStateOwnerGID + root := cfg.GameStateRoot + return func(gameID string) (string, error) { + path := filepath.Join(root, gameID) + if err := os.MkdirAll(path, mode); err != nil { + return "", fmt.Errorf("create state dir %q: %w", path, err) + } + if err := os.Chmod(path, mode); err != nil { + return "", fmt.Errorf("chmod state dir %q: %w", path, err) + } + if err := os.Chown(path, uid, gid); err != nil { + return "", fmt.Errorf("chown state dir %q: %w", path, err) + } + return path, nil + } +} diff --git a/rtmanager/internal/service/startruntime/service_test.go b/rtmanager/internal/service/startruntime/service_test.go new file mode 100644 index 0000000..d810f46 --- /dev/null +++ b/rtmanager/internal/service/startruntime/service_test.go @@ -0,0 +1,693 @@ +package startruntime_test + +import ( + "context" + "encoding/json" + "errors" + "sync" + "testing" + "time" + + "galaxy/notificationintent" + "galaxy/rtmanager/internal/adapters/docker/mocks" + "galaxy/rtmanager/internal/config" + "galaxy/rtmanager/internal/domain/health" + "galaxy/rtmanager/internal/domain/operation" + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/ports" + "galaxy/rtmanager/internal/service/startruntime" + "galaxy/rtmanager/internal/telemetry" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" +) + +// --- test doubles ----------------------------------------------------- + +type fakeRuntimeRecords struct { + mu sync.Mutex + stored map[string]runtime.RuntimeRecord + getErr error + upsertErr error + upserts []runtime.RuntimeRecord +} + +func newFakeRuntimeRecords() *fakeRuntimeRecords { + return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}} +} + +func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) { + s.mu.Lock() + defer s.mu.Unlock() + if s.getErr != nil { + return runtime.RuntimeRecord{}, s.getErr + } + record, ok := s.stored[gameID] + if !ok { + return runtime.RuntimeRecord{}, runtime.ErrNotFound + } + return record, nil +} + +func (s *fakeRuntimeRecords) Upsert(_ context.Context, record runtime.RuntimeRecord) error { + s.mu.Lock() + defer s.mu.Unlock() + if s.upsertErr != nil { + return s.upsertErr + } + s.upserts = append(s.upserts, record) + s.stored[record.GameID] = record + return nil +} + +func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error { + return errors.New("not used in start tests") +} + +func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) { + return nil, errors.New("not used in start tests") +} + +func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) { + return nil, errors.New("not used in start tests") +} + +type fakeOperationLogs struct { + mu sync.Mutex + appendErr error + appends []operation.OperationEntry +} + +func (s *fakeOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) { + s.mu.Lock() + defer s.mu.Unlock() + if s.appendErr != nil { + return 0, s.appendErr + } + s.appends = append(s.appends, entry) + return int64(len(s.appends)), nil +} + +func (s *fakeOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) { + return nil, errors.New("not used in start 
tests") +} + +func (s *fakeOperationLogs) lastAppend() (operation.OperationEntry, bool) { + s.mu.Lock() + defer s.mu.Unlock() + if len(s.appends) == 0 { + return operation.OperationEntry{}, false + } + return s.appends[len(s.appends)-1], true +} + +type fakeLeases struct { + acquired bool + acquireErr error + releaseErr error + + mu sync.Mutex + acquires []string + releases []string +} + +func (l *fakeLeases) TryAcquire(_ context.Context, _, token string, _ time.Duration) (bool, error) { + l.mu.Lock() + defer l.mu.Unlock() + l.acquires = append(l.acquires, token) + if l.acquireErr != nil { + return false, l.acquireErr + } + return l.acquired, nil +} + +func (l *fakeLeases) Release(_ context.Context, _, token string) error { + l.mu.Lock() + defer l.mu.Unlock() + l.releases = append(l.releases, token) + return l.releaseErr +} + +type fakeHealthEvents struct { + mu sync.Mutex + publishErr error + envelopes []ports.HealthEventEnvelope +} + +func (h *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error { + h.mu.Lock() + defer h.mu.Unlock() + if h.publishErr != nil { + return h.publishErr + } + h.envelopes = append(h.envelopes, envelope) + return nil +} + +type fakeNotifications struct { + mu sync.Mutex + publishErr error + intents []notificationintent.Intent +} + +func (n *fakeNotifications) Publish(_ context.Context, intent notificationintent.Intent) error { + n.mu.Lock() + defer n.mu.Unlock() + if n.publishErr != nil { + return n.publishErr + } + n.intents = append(n.intents, intent) + return nil +} + +type fakeLobby struct { + record ports.LobbyGameRecord + err error + + mu sync.Mutex + calls []string +} + +func (l *fakeLobby) GetGame(_ context.Context, gameID string) (ports.LobbyGameRecord, error) { + l.mu.Lock() + defer l.mu.Unlock() + l.calls = append(l.calls, gameID) + if l.err != nil { + return ports.LobbyGameRecord{}, l.err + } + return l.record, nil +} + +// --- harness ---------------------------------------------------------- + +type harness struct { + records *fakeRuntimeRecords + operationLogs *fakeOperationLogs + docker *mocks.MockDockerClient + leases *fakeLeases + healthEvents *fakeHealthEvents + notifications *fakeNotifications + lobby *fakeLobby + telemetry *telemetry.Runtime + + now time.Time + stateDir string +} + +func newHarness(t *testing.T) *harness { + t.Helper() + ctrl := gomock.NewController(t) + t.Cleanup(ctrl.Finish) + + telemetryRuntime, err := telemetry.NewWithProviders(nil, nil) + require.NoError(t, err) + + return &harness{ + records: newFakeRuntimeRecords(), + operationLogs: &fakeOperationLogs{}, + docker: mocks.NewMockDockerClient(ctrl), + leases: &fakeLeases{acquired: true}, + healthEvents: &fakeHealthEvents{}, + notifications: &fakeNotifications{}, + lobby: &fakeLobby{}, + telemetry: telemetryRuntime, + now: time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC), + stateDir: "/var/lib/galaxy/games/game-1", + } +} + +func (h *harness) build(t *testing.T) *startruntime.Service { + t.Helper() + + containerCfg := config.ContainerConfig{ + DefaultCPUQuota: 1.0, + DefaultMemory: "512m", + DefaultPIDsLimit: 512, + StopTimeout: 30 * time.Second, + Retention: 30 * 24 * time.Hour, + EngineStateMountPath: "/var/lib/galaxy-game", + EngineStateEnvName: "GAME_STATE_PATH", + GameStateDirMode: 0o750, + GameStateRoot: "/var/lib/galaxy/games", + } + dockerCfg := config.DockerConfig{ + Host: "unix:///var/run/docker.sock", + Network: "galaxy-net", + LogDriver: "json-file", + PullPolicy: config.ImagePullPolicyIfMissing, + } + coordinationCfg := 
config.CoordinationConfig{GameLeaseTTL: time.Minute} + + service, err := startruntime.NewService(startruntime.Dependencies{ + RuntimeRecords: h.records, + OperationLogs: h.operationLogs, + Docker: h.docker, + Leases: h.leases, + HealthEvents: h.healthEvents, + Notifications: h.notifications, + Lobby: h.lobby, + Container: containerCfg, + DockerCfg: dockerCfg, + Coordination: coordinationCfg, + Telemetry: h.telemetry, + Clock: func() time.Time { return h.now }, + NewToken: func() string { return "token-A" }, + PrepareStateDir: func(_ string) (string, error) { + return h.stateDir, nil + }, + }) + require.NoError(t, err) + return service +} + +func basicInput() startruntime.Input { + return startruntime.Input{ + GameID: "game-1", + ImageRef: "registry.example.com/galaxy/game:1.4.7", + OpSource: operation.OpSourceLobbyStream, + SourceRef: "1700000000000-0", + } +} + +func sampleRunResult(now time.Time) ports.RunResult { + return ports.RunResult{ + ContainerID: "ctr-123", + EngineEndpoint: "http://galaxy-game-game-1:8080", + StartedAt: now, + } +} + +// --- happy path ------------------------------------------------------- + +func TestHandleHappyPath(t *testing.T) { + h := newHarness(t) + input := basicInput() + + h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil) + h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, ports.PullPolicy(config.ImagePullPolicyIfMissing)).Return(nil) + h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{ + Ref: input.ImageRef, + Labels: map[string]string{ + "com.galaxy.cpu_quota": "0.5", + "com.galaxy.memory": "256m", + "com.galaxy.pids_limit": "256", + }, + }, nil) + h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).DoAndReturn(func(_ context.Context, spec ports.RunSpec) (ports.RunResult, error) { + assert.Equal(t, "galaxy-game-game-1", spec.Name) + assert.Equal(t, "galaxy-game-game-1", spec.Hostname) + assert.Equal(t, input.ImageRef, spec.Image) + assert.Equal(t, "galaxy-net", spec.Network) + assert.Equal(t, "json-file", spec.LogDriver) + assert.InDelta(t, 0.5, spec.CPUQuota, 0) + assert.Equal(t, "256m", spec.Memory) + assert.Equal(t, 256, spec.PIDsLimit) + assert.Equal(t, h.stateDir, spec.BindMounts[0].HostPath) + assert.Equal(t, "/var/lib/galaxy-game", spec.BindMounts[0].MountPath) + assert.Equal(t, "/var/lib/galaxy-game", spec.Env["GAME_STATE_PATH"]) + assert.Equal(t, "/var/lib/galaxy-game", spec.Env["STORAGE_PATH"]) + assert.Equal(t, "rtmanager", spec.Labels[startruntime.LabelOwner]) + assert.Equal(t, "game-engine", spec.Labels[startruntime.LabelKind]) + assert.Equal(t, input.GameID, spec.Labels[startruntime.LabelGameID]) + assert.Equal(t, input.ImageRef, spec.Labels[startruntime.LabelEngineImageRef]) + return sampleRunResult(h.now), nil + }) + + service := h.build(t) + result, err := service.Handle(context.Background(), input) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeSuccess, result.Outcome) + assert.Empty(t, result.ErrorCode) + assert.Equal(t, runtime.StatusRunning, result.Record.Status) + assert.Equal(t, "ctr-123", result.Record.CurrentContainerID) + assert.Equal(t, input.ImageRef, result.Record.CurrentImageRef) + assert.Equal(t, "http://galaxy-game-game-1:8080", result.Record.EngineEndpoint) + assert.Equal(t, h.stateDir, result.Record.StatePath) + assert.Equal(t, "galaxy-net", result.Record.DockerNetwork) + require.NotNil(t, result.Record.StartedAt) + assert.Equal(t, h.now, *result.Record.StartedAt) + assert.Equal(t, h.now, result.Record.LastOpAt) + assert.Equal(t, h.now, 
result.Record.CreatedAt) + + require.Len(t, h.records.upserts, 1) + require.Len(t, h.operationLogs.appends, 1) + last, _ := h.operationLogs.lastAppend() + assert.Equal(t, operation.OpKindStart, last.OpKind) + assert.Equal(t, operation.OutcomeSuccess, last.Outcome) + assert.Empty(t, last.ErrorCode) + assert.Equal(t, "ctr-123", last.ContainerID) + + require.Len(t, h.healthEvents.envelopes, 1) + assert.Equal(t, health.EventTypeContainerStarted, h.healthEvents.envelopes[0].EventType) + var details map[string]string + require.NoError(t, json.Unmarshal(h.healthEvents.envelopes[0].Details, &details)) + assert.Equal(t, input.ImageRef, details["image_ref"]) + + assert.Empty(t, h.notifications.intents, "no notification intent expected on success") + assert.Equal(t, []string{"token-A"}, h.leases.acquires) + assert.Equal(t, []string{"token-A"}, h.leases.releases) + assert.Equal(t, []string{input.GameID}, h.lobby.calls) +} + +// --- idempotent replay ------------------------------------------------ + +func TestHandleReplayNoOpForRunningRecordWithSameImageRef(t *testing.T) { + h := newHarness(t) + input := basicInput() + startedAt := h.now.Add(-time.Hour) + h.records.stored[input.GameID] = runtime.RuntimeRecord{ + GameID: input.GameID, + Status: runtime.StatusRunning, + CurrentContainerID: "ctr-prev", + CurrentImageRef: input.ImageRef, + EngineEndpoint: "http://galaxy-game-game-1:8080", + StatePath: h.stateDir, + DockerNetwork: "galaxy-net", + StartedAt: &startedAt, + LastOpAt: startedAt, + CreatedAt: startedAt, + } + + service := h.build(t) + result, err := service.Handle(context.Background(), input) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeSuccess, result.Outcome) + assert.Equal(t, startruntime.ErrorCodeReplayNoOp, result.ErrorCode) + assert.Equal(t, "ctr-prev", result.Record.CurrentContainerID) + + assert.Empty(t, h.records.upserts, "replay must not Upsert a fresh record") + require.Len(t, h.operationLogs.appends, 1) + last, _ := h.operationLogs.lastAppend() + assert.Equal(t, operation.OutcomeSuccess, last.Outcome) + assert.Equal(t, startruntime.ErrorCodeReplayNoOp, last.ErrorCode) + assert.Equal(t, "ctr-prev", last.ContainerID) + assert.Empty(t, h.notifications.intents) + assert.Equal(t, []string{"token-A"}, h.leases.releases, "lease must be released after replay no-op") +} + +// --- conflicts -------------------------------------------------------- + +func TestHandleConflictWhenLeaseBusy(t *testing.T) { + h := newHarness(t) + h.leases.acquired = false + input := basicInput() + + service := h.build(t) + result, err := service.Handle(context.Background(), input) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeFailure, result.Outcome) + assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode) + + require.Len(t, h.operationLogs.appends, 1) + last, _ := h.operationLogs.lastAppend() + assert.Equal(t, operation.OutcomeFailure, last.Outcome) + assert.Equal(t, startruntime.ErrorCodeConflict, last.ErrorCode) + + assert.Empty(t, h.notifications.intents, "lease conflicts must not raise admin notifications") + assert.Empty(t, h.leases.releases, "release must not run when acquire returned false") +} + +func TestHandleConflictWhenRunningWithDifferentImageRef(t *testing.T) { + h := newHarness(t) + input := basicInput() + startedAt := h.now.Add(-time.Hour) + h.records.stored[input.GameID] = runtime.RuntimeRecord{ + GameID: input.GameID, + Status: runtime.StatusRunning, + CurrentContainerID: "ctr-prev", + CurrentImageRef: "registry.example.com/galaxy/game:1.4.6", + 
EngineEndpoint: "http://galaxy-game-game-1:8080", + StatePath: h.stateDir, + DockerNetwork: "galaxy-net", + StartedAt: &startedAt, + LastOpAt: startedAt, + CreatedAt: startedAt, + } + + service := h.build(t) + result, err := service.Handle(context.Background(), input) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeFailure, result.Outcome) + assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode) + + last, _ := h.operationLogs.lastAppend() + assert.Equal(t, startruntime.ErrorCodeConflict, last.ErrorCode) + assert.Empty(t, h.notifications.intents) + assert.Empty(t, h.records.upserts) +} + +// --- start_config_invalid --------------------------------------------- + +func TestHandleStartConfigInvalidWhenImageRefMalformed(t *testing.T) { + h := newHarness(t) + input := basicInput() + input.ImageRef = "::not a docker reference::" + + service := h.build(t) + result, err := service.Handle(context.Background(), input) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeFailure, result.Outcome) + assert.Equal(t, startruntime.ErrorCodeStartConfigInvalid, result.ErrorCode) + + require.Len(t, h.notifications.intents, 1) + assert.Equal(t, notificationintent.NotificationTypeRuntimeStartConfigInvalid, h.notifications.intents[0].NotificationType) + last, _ := h.operationLogs.lastAppend() + assert.Equal(t, operation.OutcomeFailure, last.Outcome) +} + +func TestHandleStartConfigInvalidWhenNetworkMissing(t *testing.T) { + h := newHarness(t) + input := basicInput() + + h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(ports.ErrNetworkMissing) + + service := h.build(t) + result, err := service.Handle(context.Background(), input) + require.NoError(t, err) + assert.Equal(t, startruntime.ErrorCodeStartConfigInvalid, result.ErrorCode) + require.Len(t, h.notifications.intents, 1) + assert.Equal(t, notificationintent.NotificationTypeRuntimeStartConfigInvalid, h.notifications.intents[0].NotificationType) +} + +func TestHandleStartConfigInvalidWhenStateDirFails(t *testing.T) { + h := newHarness(t) + input := basicInput() + + h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil) + h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil) + h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil) + + service, err := startruntime.NewService(startruntime.Dependencies{ + RuntimeRecords: h.records, + OperationLogs: h.operationLogs, + Docker: h.docker, + Leases: h.leases, + HealthEvents: h.healthEvents, + Notifications: h.notifications, + Lobby: h.lobby, + Container: config.ContainerConfig{ + DefaultCPUQuota: 1.0, + DefaultMemory: "512m", + DefaultPIDsLimit: 512, + StopTimeout: 30 * time.Second, + Retention: 30 * 24 * time.Hour, + EngineStateMountPath: "/var/lib/galaxy-game", + EngineStateEnvName: "GAME_STATE_PATH", + GameStateDirMode: 0o750, + GameStateRoot: "/var/lib/galaxy/games", + }, + DockerCfg: config.DockerConfig{ + Host: "unix:///var/run/docker.sock", + Network: "galaxy-net", + LogDriver: "json-file", + PullPolicy: config.ImagePullPolicyIfMissing, + }, + Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute}, + Telemetry: h.telemetry, + Clock: func() time.Time { return h.now }, + NewToken: func() string { return "token-A" }, + PrepareStateDir: func(_ string) (string, error) { + return "", errors.New("disk full") + }, + }) + require.NoError(t, err) + + result, err := service.Handle(context.Background(), input) + require.NoError(t, err) + assert.Equal(t, 
startruntime.ErrorCodeStartConfigInvalid, result.ErrorCode) + require.Len(t, h.notifications.intents, 1) + assert.Equal(t, notificationintent.NotificationTypeRuntimeStartConfigInvalid, h.notifications.intents[0].NotificationType) +} + +// --- image_pull_failed ------------------------------------------------ + +func TestHandleImagePullFailed(t *testing.T) { + h := newHarness(t) + input := basicInput() + + h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil) + h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(errors.New("manifest unknown")) + + service := h.build(t) + result, err := service.Handle(context.Background(), input) + require.NoError(t, err) + assert.Equal(t, startruntime.ErrorCodeImagePullFailed, result.ErrorCode) + require.Len(t, h.notifications.intents, 1) + assert.Equal(t, notificationintent.NotificationTypeRuntimeImagePullFailed, h.notifications.intents[0].NotificationType) + assert.Empty(t, h.records.upserts) +} + +// --- container_start_failed ------------------------------------------ + +func TestHandleContainerStartFailedOnRunError(t *testing.T) { + h := newHarness(t) + input := basicInput() + + h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil) + h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil) + h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil) + h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(ports.RunResult{}, errors.New("container name conflict")) + + service := h.build(t) + result, err := service.Handle(context.Background(), input) + require.NoError(t, err) + assert.Equal(t, startruntime.ErrorCodeContainerStartFailed, result.ErrorCode) + require.Len(t, h.notifications.intents, 1) + assert.Equal(t, notificationintent.NotificationTypeRuntimeContainerStartFailed, h.notifications.intents[0].NotificationType) + assert.Empty(t, h.records.upserts) +} + +func TestHandleRollsBackContainerWhenUpsertFails(t *testing.T) { + h := newHarness(t) + h.records.upsertErr = errors.New("connection refused") + input := basicInput() + + h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil) + h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil) + h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil) + h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(sampleRunResult(h.now), nil) + h.docker.EXPECT().Remove(gomock.Any(), "ctr-123").Return(nil) + + service := h.build(t) + result, err := service.Handle(context.Background(), input) + require.NoError(t, err) + assert.Equal(t, startruntime.ErrorCodeContainerStartFailed, result.ErrorCode) + require.Len(t, h.notifications.intents, 1) + assert.Equal(t, notificationintent.NotificationTypeRuntimeContainerStartFailed, h.notifications.intents[0].NotificationType) +} + +// --- best-effort degradation ----------------------------------------- + +func TestHandleSuccessSurvivesOperationLogFailure(t *testing.T) { + h := newHarness(t) + h.operationLogs.appendErr = errors.New("postgres down") + input := basicInput() + + h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil) + h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil) + h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil) + h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(sampleRunResult(h.now), 
nil) + + service := h.build(t) + result, err := service.Handle(context.Background(), input) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeSuccess, result.Outcome) + assert.Empty(t, result.ErrorCode) + assert.Len(t, h.records.upserts, 1) +} + +func TestHandleSuccessSurvivesHealthPublishFailure(t *testing.T) { + h := newHarness(t) + h.healthEvents.publishErr = errors.New("redis down") + input := basicInput() + + h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil) + h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil) + h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil) + h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(sampleRunResult(h.now), nil) + + service := h.build(t) + result, err := service.Handle(context.Background(), input) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeSuccess, result.Outcome) + assert.Len(t, h.records.upserts, 1) +} + +// --- pre-existing stopped record proceeds with fresh start ---------- + +func TestHandlePreservesCreatedAtForExistingRecord(t *testing.T) { + h := newHarness(t) + input := basicInput() + originalCreatedAt := h.now.Add(-72 * time.Hour) + stoppedAt := h.now.Add(-time.Hour) + h.records.stored[input.GameID] = runtime.RuntimeRecord{ + GameID: input.GameID, + Status: runtime.StatusStopped, + CurrentImageRef: "registry.example.com/galaxy/game:1.4.6", + EngineEndpoint: "http://galaxy-game-game-1:8080", + StatePath: h.stateDir, + DockerNetwork: "galaxy-net", + StoppedAt: &stoppedAt, + LastOpAt: stoppedAt, + CreatedAt: originalCreatedAt, + } + + h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil) + h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil) + h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil) + h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(sampleRunResult(h.now), nil) + + service := h.build(t) + result, err := service.Handle(context.Background(), input) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeSuccess, result.Outcome) + assert.Equal(t, originalCreatedAt, result.Record.CreatedAt, "created_at must be preserved across re-starts") + assert.Equal(t, runtime.StatusRunning, result.Record.Status) + assert.Equal(t, input.ImageRef, result.Record.CurrentImageRef) +} + +// --- input validation ----------------------------------------------- + +func TestHandleRejectsInvalidInput(t *testing.T) { + h := newHarness(t) + service := h.build(t) + + cases := []startruntime.Input{ + {GameID: "", ImageRef: "x", OpSource: operation.OpSourceLobbyStream}, + {GameID: "g", ImageRef: "", OpSource: operation.OpSourceLobbyStream}, + {GameID: "g", ImageRef: "x", OpSource: operation.OpSource("bogus")}, + } + for _, input := range cases { + result, err := service.Handle(context.Background(), input) + require.NoError(t, err) + assert.Equal(t, startruntime.ErrorCodeStartConfigInvalid, result.ErrorCode) + } +} + +func TestNewServiceRejectsMissingDependencies(t *testing.T) { + h := newHarness(t) + deps := startruntime.Dependencies{ + Container: config.ContainerConfig{ + DefaultCPUQuota: 1.0, + DefaultMemory: "512m", + DefaultPIDsLimit: 512, + StopTimeout: 30 * time.Second, + Retention: 30 * 24 * time.Hour, + EngineStateMountPath: "/var/lib/galaxy-game", + EngineStateEnvName: "GAME_STATE_PATH", + GameStateDirMode: 0o750, + GameStateRoot: "/var/lib/galaxy/games", + }, + DockerCfg: 
config.DockerConfig{ + Host: "unix:///var/run/docker.sock", + Network: "galaxy-net", + LogDriver: "json-file", + PullPolicy: config.ImagePullPolicyIfMissing, + }, + Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute}, + Telemetry: h.telemetry, + } + _, err := startruntime.NewService(deps) + require.Error(t, err) +} diff --git a/rtmanager/internal/service/stopruntime/service.go b/rtmanager/internal/service/stopruntime/service.go new file mode 100644 index 0000000..27f71e8 --- /dev/null +++ b/rtmanager/internal/service/stopruntime/service.go @@ -0,0 +1,612 @@ +// Package stopruntime implements the `stop` lifecycle operation owned by +// Runtime Manager. The service is the single orchestrator behind both +// the asynchronous `runtime:stop_jobs` consumer and the synchronous +// `POST /api/v1/internal/runtimes/{game_id}/stop` REST handler. It is +// also the inner stop step of the restart and patch services, which +// call Run while holding the outer per-game lease. +// +// Lifecycle and failure-mode semantics follow `rtmanager/README.md +// §Lifecycles → Stop`. Design rationale is captured in +// `rtmanager/docs/services.md`. +package stopruntime + +import ( + "context" + "crypto/rand" + "encoding/base64" + "encoding/json" + "errors" + "fmt" + "log/slog" + "strings" + "time" + + "galaxy/rtmanager/internal/config" + "galaxy/rtmanager/internal/domain/health" + "galaxy/rtmanager/internal/domain/operation" + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/logging" + "galaxy/rtmanager/internal/ports" + "galaxy/rtmanager/internal/service/startruntime" + "galaxy/rtmanager/internal/telemetry" +) + +// leaseReleaseTimeout bounds the deferred lease-release call. A fresh +// background context is used so the release runs even when the request +// context was already canceled. +const leaseReleaseTimeout = 5 * time.Second + +// Input stores the per-call arguments for one stop operation. +type Input struct { + // GameID identifies the platform game to stop. + GameID string + + // Reason classifies the trigger of the stop. Required. + Reason StopReason + + // OpSource classifies how the request entered Runtime Manager. + // Required: every operation_log entry carries an op_source. + OpSource operation.OpSource + + // SourceRef stores the optional opaque per-source reference (Redis + // Stream entry id, REST request id, admin user id). Empty when the + // caller does not provide one. For inner calls invoked by the + // restart and patch orchestrators it carries the outer correlation + // id so the three operation_log entries share it. + SourceRef string +} + +// Validate reports whether input carries the structural invariants the +// service requires. +func (input Input) Validate() error { + if strings.TrimSpace(input.GameID) == "" { + return fmt.Errorf("game id must not be empty") + } + if !input.OpSource.IsKnown() { + return fmt.Errorf("op source %q is unsupported", input.OpSource) + } + if err := input.Reason.Validate(); err != nil { + return err + } + return nil +} + +// Result stores the deterministic outcome of one Handle / Run call. +type Result struct { + // Record carries the runtime record installed by the operation. + // Populated on success and on idempotent replay; zero on failure. + Record runtime.RuntimeRecord + + // Outcome reports whether the operation completed (success) or + // produced a stable failure code. + Outcome operation.Outcome + + // ErrorCode stores the stable error code on failure, or + // `replay_no_op` on idempotent replay. 
Empty for fresh successes. + ErrorCode string + + // ErrorMessage stores the operator-readable detail on failure. + // Empty for successes. + ErrorMessage string +} + +// Dependencies groups the collaborators required by Service. +type Dependencies struct { + // RuntimeRecords reads and updates the durable runtime record. + RuntimeRecords ports.RuntimeRecordStore + + // OperationLogs records the success / failure audit entry. + OperationLogs ports.OperationLogStore + + // Docker drives the Docker daemon (container stop). + Docker ports.DockerClient + + // Leases serialises operations against the same game id. + Leases ports.GameLeaseStore + + // HealthEvents publishes `runtime:health_events` and upserts the + // matching `health_snapshots` row. Used on the vanished-container + // path to emit `container_disappeared`. + HealthEvents ports.HealthEventPublisher + + // Container groups the per-container settings consumed at stop time + // (the graceful stop timeout). + Container config.ContainerConfig + + // Coordination supplies the per-game lease TTL. + Coordination config.CoordinationConfig + + // Telemetry records stop outcomes and lease latency. Required. + Telemetry *telemetry.Runtime + + // Logger records structured service-level events. Defaults to + // `slog.Default()` when nil. + Logger *slog.Logger + + // Clock supplies the wall-clock used for operation timestamps. + // Defaults to `time.Now` when nil. + Clock func() time.Time + + // NewToken supplies a unique opaque lease token. Defaults to a + // 32-byte random base64url string when nil. Tests may override. + NewToken func() string +} + +// Service executes the stop lifecycle operation. +type Service struct { + runtimeRecords ports.RuntimeRecordStore + operationLogs ports.OperationLogStore + docker ports.DockerClient + leases ports.GameLeaseStore + healthEvents ports.HealthEventPublisher + + stopTimeout time.Duration + leaseTTL time.Duration + + telemetry *telemetry.Runtime + logger *slog.Logger + + clock func() time.Time + newToken func() string +} + +// NewService constructs one Service from deps. 
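+// It fails fast: a nil required collaborator or an invalid container
+// or coordination config surfaces as a constructor error rather than
+// a failure inside Handle.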
+func NewService(deps Dependencies) (*Service, error) { + switch { + case deps.RuntimeRecords == nil: + return nil, errors.New("new stop runtime service: nil runtime records") + case deps.OperationLogs == nil: + return nil, errors.New("new stop runtime service: nil operation logs") + case deps.Docker == nil: + return nil, errors.New("new stop runtime service: nil docker client") + case deps.Leases == nil: + return nil, errors.New("new stop runtime service: nil lease store") + case deps.HealthEvents == nil: + return nil, errors.New("new stop runtime service: nil health events publisher") + case deps.Telemetry == nil: + return nil, errors.New("new stop runtime service: nil telemetry runtime") + } + if err := deps.Container.Validate(); err != nil { + return nil, fmt.Errorf("new stop runtime service: container config: %w", err) + } + if err := deps.Coordination.Validate(); err != nil { + return nil, fmt.Errorf("new stop runtime service: coordination config: %w", err) + } + + clock := deps.Clock + if clock == nil { + clock = time.Now + } + logger := deps.Logger + if logger == nil { + logger = slog.Default() + } + logger = logger.With("service", "rtmanager.stopruntime") + + newToken := deps.NewToken + if newToken == nil { + newToken = defaultTokenGenerator() + } + + return &Service{ + runtimeRecords: deps.RuntimeRecords, + operationLogs: deps.OperationLogs, + docker: deps.Docker, + leases: deps.Leases, + healthEvents: deps.HealthEvents, + stopTimeout: deps.Container.StopTimeout, + leaseTTL: deps.Coordination.GameLeaseTTL, + telemetry: deps.Telemetry, + logger: logger, + clock: clock, + newToken: newToken, + }, nil +} + +// Handle executes one stop operation end-to-end. The Go-level error +// return is reserved for non-business failures (nil context, nil +// receiver). Every business outcome — success, idempotent replay, or +// any of the stable failure modes — flows through Result. +func (service *Service) Handle(ctx context.Context, input Input) (Result, error) { + if service == nil { + return Result{}, errors.New("stop runtime: nil service") + } + if ctx == nil { + return Result{}, errors.New("stop runtime: nil context") + } + + opStartedAt := service.clock().UTC() + + if err := input.Validate(); err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeInvalidRequest, + errorMessage: err.Error(), + }), nil + } + + token := service.newToken() + leaseStart := service.clock() + acquired, err := service.leases.TryAcquire(ctx, input.GameID, token, service.leaseTTL) + service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(leaseStart)) + if err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeServiceUnavailable, + errorMessage: fmt.Sprintf("acquire game lease: %s", err.Error()), + }), nil + } + if !acquired { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeConflict, + errorMessage: "another lifecycle operation is in progress for this game", + }), nil + } + defer service.releaseLease(ctx, input.GameID, token) + + return service.runUnderLease(ctx, input, opStartedAt) +} + +// Run executes the stop lifecycle assuming the per-game lease is +// already held by the caller. The method is reserved for orchestrator +// services in `internal/service/` that compose stop with another +// operation under a single outer lease (restart and patch). 
External +// callers must use Handle. +func (service *Service) Run(ctx context.Context, input Input) (Result, error) { + if service == nil { + return Result{}, errors.New("stop runtime: nil service") + } + if ctx == nil { + return Result{}, errors.New("stop runtime: nil context") + } + + opStartedAt := service.clock().UTC() + + if err := input.Validate(); err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeInvalidRequest, + errorMessage: err.Error(), + }), nil + } + + return service.runUnderLease(ctx, input, opStartedAt) +} + +// runUnderLease executes the post-validation, lease-protected stop +// steps shared by Handle and Run. +func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) { + existing, err := service.runtimeRecords.Get(ctx, input.GameID) + if errors.Is(err, runtime.ErrNotFound) { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeNotFound, + errorMessage: fmt.Sprintf("runtime record for game %q does not exist", input.GameID), + }), nil + } + if err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeInternal, + errorMessage: fmt.Sprintf("load runtime record: %s", err.Error()), + }), nil + } + + switch existing.Status { + case runtime.StatusStopped, runtime.StatusRemoved: + return service.recordReplayNoOp(ctx, opStartedAt, input, existing), nil + case runtime.StatusRunning: + // proceed + default: + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeInternal, + errorMessage: fmt.Sprintf("runtime record has unsupported status %q", existing.Status), + }), nil + } + + if err := service.docker.Stop(ctx, existing.CurrentContainerID, service.stopTimeout); err != nil { + if errors.Is(err, ports.ErrContainerNotFound) { + return service.handleVanished(ctx, input, opStartedAt, existing), nil + } + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeServiceUnavailable, + errorMessage: fmt.Sprintf("docker stop: %s", err.Error()), + containerID: existing.CurrentContainerID, + imageRef: existing.CurrentImageRef, + }), nil + } + + updateNow := service.clock().UTC() + err = service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{ + GameID: input.GameID, + ExpectedFrom: runtime.StatusRunning, + ExpectedContainerID: existing.CurrentContainerID, + To: runtime.StatusStopped, + Now: updateNow, + }) + if errors.Is(err, runtime.ErrConflict) { + // CAS race: a concurrent reconciler / restart already moved the + // record. The desired terminal state was reached by another path. 
+ return service.recordReplayNoOp(ctx, opStartedAt, input, existing), nil + } + if errors.Is(err, runtime.ErrNotFound) { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeNotFound, + errorMessage: fmt.Sprintf("runtime record for game %q vanished mid-stop", input.GameID), + containerID: existing.CurrentContainerID, + imageRef: existing.CurrentImageRef, + }), nil + } + if err != nil { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeInternal, + errorMessage: fmt.Sprintf("update runtime status: %s", err.Error()), + containerID: existing.CurrentContainerID, + imageRef: existing.CurrentImageRef, + }), nil + } + + finishedAt := service.clock().UTC() + service.bestEffortAppend(ctx, operation.OperationEntry{ + GameID: input.GameID, + OpKind: operation.OpKindStop, + OpSource: input.OpSource, + SourceRef: input.SourceRef, + ImageRef: existing.CurrentImageRef, + ContainerID: existing.CurrentContainerID, + Outcome: operation.OutcomeSuccess, + StartedAt: opStartedAt, + FinishedAt: &finishedAt, + }) + service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeSuccess), string(input.Reason), string(input.OpSource)) + + record := existing + record.Status = runtime.StatusStopped + stoppedAt := updateNow + record.StoppedAt = &stoppedAt + record.LastOpAt = updateNow + + logArgs := []any{ + "game_id", input.GameID, + "container_id", existing.CurrentContainerID, + "reason", string(input.Reason), + "op_source", string(input.OpSource), + } + logArgs = append(logArgs, logging.ContextAttrs(ctx)...) + service.logger.InfoContext(ctx, "runtime stopped", logArgs...) + + return Result{ + Record: record, + Outcome: operation.OutcomeSuccess, + }, nil +} + +// handleVanished records the success outcome for the case where docker +// stop reports the container as already gone. It updates the record to +// removed, publishes container_disappeared, and returns success. 
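+// The returned Record mirrors the transition: status removed,
+// CurrentContainerID cleared, and RemovedAt / LastOpAt set to the
+// update timestamp.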
+func (service *Service) handleVanished(ctx context.Context, input Input, opStartedAt time.Time, existing runtime.RuntimeRecord) Result { + updateNow := service.clock().UTC() + err := service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{ + GameID: input.GameID, + ExpectedFrom: runtime.StatusRunning, + ExpectedContainerID: existing.CurrentContainerID, + To: runtime.StatusRemoved, + Now: updateNow, + }) + if errors.Is(err, runtime.ErrConflict) { + return service.recordReplayNoOp(ctx, opStartedAt, input, existing) + } + if err != nil && !errors.Is(err, runtime.ErrNotFound) { + return service.recordFailure(ctx, failureCtx{ + opStartedAt: opStartedAt, + input: input, + errorCode: startruntime.ErrorCodeInternal, + errorMessage: fmt.Sprintf("update runtime status to removed: %s", err.Error()), + containerID: existing.CurrentContainerID, + imageRef: existing.CurrentImageRef, + }) + } + + service.bestEffortPublishHealth(ctx, ports.HealthEventEnvelope{ + GameID: input.GameID, + ContainerID: existing.CurrentContainerID, + EventType: health.EventTypeContainerDisappeared, + OccurredAt: updateNow, + Details: emptyHealthDetails(), + }) + + finishedAt := service.clock().UTC() + service.bestEffortAppend(ctx, operation.OperationEntry{ + GameID: input.GameID, + OpKind: operation.OpKindStop, + OpSource: input.OpSource, + SourceRef: input.SourceRef, + ImageRef: existing.CurrentImageRef, + ContainerID: existing.CurrentContainerID, + Outcome: operation.OutcomeSuccess, + StartedAt: opStartedAt, + FinishedAt: &finishedAt, + }) + service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeSuccess), string(input.Reason), string(input.OpSource)) + service.telemetry.RecordHealthEvent(ctx, string(health.EventTypeContainerDisappeared)) + + record := existing + record.Status = runtime.StatusRemoved + record.CurrentContainerID = "" + removedAt := updateNow + record.RemovedAt = &removedAt + record.LastOpAt = updateNow + + logArgs := []any{ + "game_id", input.GameID, + "container_id", existing.CurrentContainerID, + "reason", string(input.Reason), + "op_source", string(input.OpSource), + } + logArgs = append(logArgs, logging.ContextAttrs(ctx)...) + service.logger.InfoContext(ctx, "runtime stop on vanished container", logArgs...) + + return Result{ + Record: record, + Outcome: operation.OutcomeSuccess, + } +} + +// recordReplayNoOp records the idempotent replay outcome and returns the +// existing record unchanged. +func (service *Service) recordReplayNoOp(ctx context.Context, opStartedAt time.Time, input Input, existing runtime.RuntimeRecord) Result { + finishedAt := service.clock().UTC() + service.bestEffortAppend(ctx, operation.OperationEntry{ + GameID: input.GameID, + OpKind: operation.OpKindStop, + OpSource: input.OpSource, + SourceRef: input.SourceRef, + ImageRef: existing.CurrentImageRef, + ContainerID: existing.CurrentContainerID, + Outcome: operation.OutcomeSuccess, + ErrorCode: startruntime.ErrorCodeReplayNoOp, + StartedAt: opStartedAt, + FinishedAt: &finishedAt, + }) + service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeSuccess), string(input.Reason), string(input.OpSource)) + + logArgs := []any{ + "game_id", input.GameID, + "container_id", existing.CurrentContainerID, + "reason", string(input.Reason), + "op_source", string(input.OpSource), + } + logArgs = append(logArgs, logging.ContextAttrs(ctx)...) + service.logger.InfoContext(ctx, "runtime stop replay no-op", logArgs...) 
+ + return Result{ + Record: existing, + Outcome: operation.OutcomeSuccess, + ErrorCode: startruntime.ErrorCodeReplayNoOp, + } +} + +// failureCtx groups the inputs to recordFailure so the runUnderLease +// method stays readable. +type failureCtx struct { + opStartedAt time.Time + input Input + errorCode string + errorMessage string + containerID string + imageRef string +} + +// recordFailure records the failure operation_log entry and emits +// telemetry. The runtime record stays untouched. +func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result { + finishedAt := service.clock().UTC() + service.bestEffortAppend(ctx, operation.OperationEntry{ + GameID: fc.input.GameID, + OpKind: operation.OpKindStop, + OpSource: fc.input.OpSource, + SourceRef: fc.input.SourceRef, + ImageRef: fc.imageRef, + ContainerID: fc.containerID, + Outcome: operation.OutcomeFailure, + ErrorCode: fc.errorCode, + ErrorMessage: fc.errorMessage, + StartedAt: fc.opStartedAt, + FinishedAt: &finishedAt, + }) + service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeFailure), string(fc.input.Reason), string(fc.input.OpSource)) + + logArgs := []any{ + "game_id", fc.input.GameID, + "reason", string(fc.input.Reason), + "op_source", string(fc.input.OpSource), + "error_code", fc.errorCode, + "error_message", fc.errorMessage, + } + logArgs = append(logArgs, logging.ContextAttrs(ctx)...) + service.logger.WarnContext(ctx, "runtime stop failed", logArgs...) + + return Result{ + Outcome: operation.OutcomeFailure, + ErrorCode: fc.errorCode, + ErrorMessage: fc.errorMessage, + } +} + +// releaseLease releases the per-game lease in a fresh background context +// so a canceled request context does not leave the lease pinned for its +// TTL. +func (service *Service) releaseLease(ctx context.Context, gameID, token string) { + cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout) + defer cancel() + if err := service.leases.Release(cleanupCtx, gameID, token); err != nil { + service.logger.WarnContext(ctx, "release game lease", + "game_id", gameID, + "err", err.Error(), + ) + } +} + +// bestEffortAppend writes one operation_log entry. A failure is logged +// and discarded; the durable runtime record (or its absence) remains +// the source of truth. +func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) { + if _, err := service.operationLogs.Append(ctx, entry); err != nil { + service.logger.ErrorContext(ctx, "append operation log", + "game_id", entry.GameID, + "op_kind", string(entry.OpKind), + "outcome", string(entry.Outcome), + "error_code", entry.ErrorCode, + "err", err.Error(), + ) + } +} + +// bestEffortPublishHealth emits one health event + snapshot upsert. +// Failures degrade silently per `rtmanager/README.md §Notification +// Contracts`; the runtime record remains the source of truth. +func (service *Service) bestEffortPublishHealth(ctx context.Context, envelope ports.HealthEventEnvelope) { + if err := service.healthEvents.Publish(ctx, envelope); err != nil { + service.logger.ErrorContext(ctx, "publish health event", + "game_id", envelope.GameID, + "container_id", envelope.ContainerID, + "event_type", string(envelope.EventType), + "err", err.Error(), + ) + } +} + +// defaultTokenGenerator returns a function that produces 32-byte +// base64url-encoded tokens. Mirrors the start service: a degraded +// entropy source falls back to a sentinel token so the next TryAcquire +// observes a collision rather than a panic. 
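+// A healthy call returns a 43-character unpadded base64url string
+// (32 random bytes).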
+func defaultTokenGenerator() func() string { + return func() string { + var buf [32]byte + if _, err := rand.Read(buf[:]); err != nil { + return "rtmanager-fallback-token" + } + return base64.RawURLEncoding.EncodeToString(buf[:]) + } +} + +// emptyHealthDetails returns the canonical empty-object payload required +// by the `container_disappeared` AsyncAPI variant. +func emptyHealthDetails() json.RawMessage { + return json.RawMessage("{}") +} diff --git a/rtmanager/internal/service/stopruntime/service_test.go b/rtmanager/internal/service/stopruntime/service_test.go new file mode 100644 index 0000000..0bbd75d --- /dev/null +++ b/rtmanager/internal/service/stopruntime/service_test.go @@ -0,0 +1,537 @@ +package stopruntime_test + +import ( + "context" + "errors" + "sync" + "testing" + "time" + + "galaxy/rtmanager/internal/adapters/docker/mocks" + "galaxy/rtmanager/internal/config" + "galaxy/rtmanager/internal/domain/health" + "galaxy/rtmanager/internal/domain/operation" + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/ports" + "galaxy/rtmanager/internal/service/startruntime" + "galaxy/rtmanager/internal/service/stopruntime" + "galaxy/rtmanager/internal/telemetry" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" +) + +// --- test doubles ----------------------------------------------------- + +type fakeRuntimeRecords struct { + mu sync.Mutex + + stored map[string]runtime.RuntimeRecord + getErr error + updateStatusErr error + + updates []ports.UpdateStatusInput +} + +func newFakeRuntimeRecords() *fakeRuntimeRecords { + return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}} +} + +func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) { + s.mu.Lock() + defer s.mu.Unlock() + if s.getErr != nil { + return runtime.RuntimeRecord{}, s.getErr + } + record, ok := s.stored[gameID] + if !ok { + return runtime.RuntimeRecord{}, runtime.ErrNotFound + } + return record, nil +} + +func (s *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error { + return errors.New("not used in stop tests") +} + +func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, input ports.UpdateStatusInput) error { + s.mu.Lock() + defer s.mu.Unlock() + s.updates = append(s.updates, input) + if s.updateStatusErr != nil { + return s.updateStatusErr + } + record, ok := s.stored[input.GameID] + if !ok { + return runtime.ErrNotFound + } + if record.Status != input.ExpectedFrom { + return runtime.ErrConflict + } + if input.ExpectedContainerID != "" && record.CurrentContainerID != input.ExpectedContainerID { + return runtime.ErrConflict + } + record.Status = input.To + record.LastOpAt = input.Now + switch input.To { + case runtime.StatusStopped: + stoppedAt := input.Now + record.StoppedAt = &stoppedAt + case runtime.StatusRemoved: + removedAt := input.Now + record.RemovedAt = &removedAt + record.CurrentContainerID = "" + } + s.stored[input.GameID] = record + return nil +} + +func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) { + return nil, errors.New("not used in stop tests") +} + +func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) { + return nil, errors.New("not used in stop tests") +} + +type fakeOperationLogs struct { + mu sync.Mutex + + appendErr error + appends []operation.OperationEntry +} + +func (s *fakeOperationLogs) Append(_ context.Context, entry 
operation.OperationEntry) (int64, error) { + s.mu.Lock() + defer s.mu.Unlock() + if s.appendErr != nil { + return 0, s.appendErr + } + s.appends = append(s.appends, entry) + return int64(len(s.appends)), nil +} + +func (s *fakeOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) { + return nil, errors.New("not used in stop tests") +} + +func (s *fakeOperationLogs) lastAppend() (operation.OperationEntry, bool) { + s.mu.Lock() + defer s.mu.Unlock() + if len(s.appends) == 0 { + return operation.OperationEntry{}, false + } + return s.appends[len(s.appends)-1], true +} + +type fakeLeases struct { + acquired bool + acquireErr error + releaseErr error + + mu sync.Mutex + acquires []string + releases []string +} + +func (l *fakeLeases) TryAcquire(_ context.Context, _, token string, _ time.Duration) (bool, error) { + l.mu.Lock() + defer l.mu.Unlock() + l.acquires = append(l.acquires, token) + if l.acquireErr != nil { + return false, l.acquireErr + } + return l.acquired, nil +} + +func (l *fakeLeases) Release(_ context.Context, _, token string) error { + l.mu.Lock() + defer l.mu.Unlock() + l.releases = append(l.releases, token) + return l.releaseErr +} + +type fakeHealthEvents struct { + mu sync.Mutex + + publishErr error + envelopes []ports.HealthEventEnvelope +} + +func (h *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error { + h.mu.Lock() + defer h.mu.Unlock() + if h.publishErr != nil { + return h.publishErr + } + h.envelopes = append(h.envelopes, envelope) + return nil +} + +// --- harness ---------------------------------------------------------- + +type harness struct { + records *fakeRuntimeRecords + operationLogs *fakeOperationLogs + docker *mocks.MockDockerClient + leases *fakeLeases + healthEvents *fakeHealthEvents + + telemetry *telemetry.Runtime + + now time.Time +} + +func newHarness(t *testing.T) *harness { + t.Helper() + ctrl := gomock.NewController(t) + t.Cleanup(ctrl.Finish) + + telemetryRuntime, err := telemetry.NewWithProviders(nil, nil) + require.NoError(t, err) + + return &harness{ + records: newFakeRuntimeRecords(), + operationLogs: &fakeOperationLogs{}, + docker: mocks.NewMockDockerClient(ctrl), + leases: &fakeLeases{acquired: true}, + healthEvents: &fakeHealthEvents{}, + telemetry: telemetryRuntime, + now: time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC), + } +} + +func (h *harness) build(t *testing.T) *stopruntime.Service { + t.Helper() + + containerCfg := config.ContainerConfig{ + DefaultCPUQuota: 1.0, + DefaultMemory: "512m", + DefaultPIDsLimit: 512, + StopTimeout: 30 * time.Second, + Retention: 30 * 24 * time.Hour, + EngineStateMountPath: "/var/lib/galaxy-game", + EngineStateEnvName: "GAME_STATE_PATH", + GameStateDirMode: 0o750, + GameStateRoot: "/var/lib/galaxy/games", + } + coordinationCfg := config.CoordinationConfig{GameLeaseTTL: time.Minute} + + service, err := stopruntime.NewService(stopruntime.Dependencies{ + RuntimeRecords: h.records, + OperationLogs: h.operationLogs, + Docker: h.docker, + Leases: h.leases, + HealthEvents: h.healthEvents, + Container: containerCfg, + Coordination: coordinationCfg, + Telemetry: h.telemetry, + Clock: func() time.Time { return h.now }, + NewToken: func() string { return "token-A" }, + }) + require.NoError(t, err) + return service +} + +func basicInput() stopruntime.Input { + return stopruntime.Input{ + GameID: "game-1", + Reason: stopruntime.StopReasonCancelled, + OpSource: operation.OpSourceLobbyStream, + SourceRef: "1700000000000-0", + } +} + +func 
runningRecord(now time.Time) runtime.RuntimeRecord { + startedAt := now.Add(-time.Hour) + return runtime.RuntimeRecord{ + GameID: "game-1", + Status: runtime.StatusRunning, + CurrentContainerID: "ctr-123", + CurrentImageRef: "registry.example.com/galaxy/game:1.4.7", + EngineEndpoint: "http://galaxy-game-game-1:8080", + StatePath: "/var/lib/galaxy/games/game-1", + DockerNetwork: "galaxy-net", + StartedAt: &startedAt, + LastOpAt: startedAt, + CreatedAt: startedAt, + } +} + +// --- happy path ------------------------------------------------------- + +func TestHandleHappyPath(t *testing.T) { + h := newHarness(t) + h.records.stored["game-1"] = runningRecord(h.now) + + h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(nil) + + service := h.build(t) + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeSuccess, result.Outcome) + assert.Empty(t, result.ErrorCode) + assert.Equal(t, runtime.StatusStopped, result.Record.Status) + require.NotNil(t, result.Record.StoppedAt) + assert.Equal(t, h.now, *result.Record.StoppedAt) + assert.Equal(t, h.now, result.Record.LastOpAt) + + require.Len(t, h.records.updates, 1) + assert.Equal(t, runtime.StatusRunning, h.records.updates[0].ExpectedFrom) + assert.Equal(t, runtime.StatusStopped, h.records.updates[0].To) + assert.Equal(t, "ctr-123", h.records.updates[0].ExpectedContainerID) + + require.Len(t, h.operationLogs.appends, 1) + last, _ := h.operationLogs.lastAppend() + assert.Equal(t, operation.OpKindStop, last.OpKind) + assert.Equal(t, operation.OutcomeSuccess, last.Outcome) + assert.Empty(t, last.ErrorCode) + assert.Equal(t, "ctr-123", last.ContainerID) + + assert.Empty(t, h.healthEvents.envelopes) + assert.Equal(t, []string{"token-A"}, h.leases.acquires) + assert.Equal(t, []string{"token-A"}, h.leases.releases) +} + +// --- replay ---------------------------------------------------------- + +func TestHandleReplayNoOpForStoppedRecord(t *testing.T) { + h := newHarness(t) + stoppedRecord := runningRecord(h.now) + stoppedRecord.Status = runtime.StatusStopped + stoppedAt := h.now.Add(-time.Minute) + stoppedRecord.StoppedAt = &stoppedAt + h.records.stored["game-1"] = stoppedRecord + + service := h.build(t) + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeSuccess, result.Outcome) + assert.Equal(t, startruntime.ErrorCodeReplayNoOp, result.ErrorCode) + assert.Equal(t, runtime.StatusStopped, result.Record.Status) + + assert.Empty(t, h.records.updates) + require.Len(t, h.operationLogs.appends, 1) + last, _ := h.operationLogs.lastAppend() + assert.Equal(t, startruntime.ErrorCodeReplayNoOp, last.ErrorCode) + assert.Equal(t, []string{"token-A"}, h.leases.releases) +} + +func TestHandleReplayNoOpForRemovedRecord(t *testing.T) { + h := newHarness(t) + removed := runningRecord(h.now) + removed.Status = runtime.StatusRemoved + removed.CurrentContainerID = "" + removedAt := h.now.Add(-time.Minute) + removed.RemovedAt = &removedAt + h.records.stored["game-1"] = removed + + service := h.build(t) + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeSuccess, result.Outcome) + assert.Equal(t, startruntime.ErrorCodeReplayNoOp, result.ErrorCode) +} + +// --- vanished container ---------------------------------------------- + +func TestHandleVanishedContainerMarksRemoved(t *testing.T) { + h := newHarness(t) + h.records.stored["game-1"] = 
runningRecord(h.now) + + h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(ports.ErrContainerNotFound) + + service := h.build(t) + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeSuccess, result.Outcome) + assert.Empty(t, result.ErrorCode) + assert.Equal(t, runtime.StatusRemoved, result.Record.Status) + assert.Empty(t, result.Record.CurrentContainerID) + + require.Len(t, h.records.updates, 1) + assert.Equal(t, runtime.StatusRemoved, h.records.updates[0].To) + + require.Len(t, h.healthEvents.envelopes, 1) + assert.Equal(t, health.EventTypeContainerDisappeared, h.healthEvents.envelopes[0].EventType) + + require.Len(t, h.operationLogs.appends, 1) + last, _ := h.operationLogs.lastAppend() + assert.Equal(t, operation.OutcomeSuccess, last.Outcome) + assert.Empty(t, last.ErrorCode) +} + +// --- failure paths --------------------------------------------------- + +func TestHandleNotFoundForMissingRecord(t *testing.T) { + h := newHarness(t) + + service := h.build(t) + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeFailure, result.Outcome) + assert.Equal(t, startruntime.ErrorCodeNotFound, result.ErrorCode) + assert.Empty(t, h.healthEvents.envelopes) + assert.Empty(t, h.records.updates) +} + +func TestHandleServiceUnavailableOnDockerError(t *testing.T) { + h := newHarness(t) + h.records.stored["game-1"] = runningRecord(h.now) + + h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(errors.New("docker daemon timeout")) + + service := h.build(t) + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeFailure, result.Outcome) + assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode) + + last, _ := h.operationLogs.lastAppend() + assert.Equal(t, operation.OutcomeFailure, last.Outcome) + assert.Equal(t, "ctr-123", last.ContainerID) + assert.Empty(t, h.records.updates, "no record mutation on docker stop failure") +} + +func TestHandleReplayNoOpOnUpdateStatusConflict(t *testing.T) { + h := newHarness(t) + h.records.stored["game-1"] = runningRecord(h.now) + h.records.updateStatusErr = runtime.ErrConflict + + h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(nil) + + service := h.build(t) + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeSuccess, result.Outcome) + assert.Equal(t, startruntime.ErrorCodeReplayNoOp, result.ErrorCode) +} + +func TestHandleInternalErrorOnUpdateStatusGenericError(t *testing.T) { + h := newHarness(t) + h.records.stored["game-1"] = runningRecord(h.now) + h.records.updateStatusErr = errors.New("postgres down") + + h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(nil) + + service := h.build(t) + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeFailure, result.Outcome) + assert.Equal(t, startruntime.ErrorCodeInternal, result.ErrorCode) +} + +// --- conflicts ------------------------------------------------------- + +func TestHandleConflictWhenLeaseBusy(t *testing.T) { + h := newHarness(t) + h.leases.acquired = false + + service := h.build(t) + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeFailure, result.Outcome) + 
assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode) + + assert.Empty(t, h.leases.releases, "release must not run when acquire returned false") +} + +func TestHandleServiceUnavailableOnLeaseError(t *testing.T) { + h := newHarness(t) + h.leases.acquireErr = errors.New("redis timeout") + + service := h.build(t) + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeFailure, result.Outcome) + assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode) +} + +// --- input validation ------------------------------------------------ + +func TestHandleRejectsInvalidInput(t *testing.T) { + h := newHarness(t) + service := h.build(t) + + cases := []stopruntime.Input{ + {GameID: "", Reason: stopruntime.StopReasonCancelled, OpSource: operation.OpSourceLobbyStream}, + {GameID: "g", Reason: "", OpSource: operation.OpSourceLobbyStream}, + {GameID: "g", Reason: stopruntime.StopReason("bogus"), OpSource: operation.OpSourceLobbyStream}, + {GameID: "g", Reason: stopruntime.StopReasonCancelled, OpSource: operation.OpSource("bogus")}, + } + for _, input := range cases { + result, err := service.Handle(context.Background(), input) + require.NoError(t, err) + assert.Equal(t, startruntime.ErrorCodeInvalidRequest, result.ErrorCode) + } +} + +// --- Run path (no-lease) --------------------------------------------- + +func TestRunSkipsLease(t *testing.T) { + h := newHarness(t) + h.records.stored["game-1"] = runningRecord(h.now) + h.leases.acquired = false // would block Handle; Run must ignore + + h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(nil) + + service := h.build(t) + result, err := service.Run(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeSuccess, result.Outcome) + assert.Empty(t, h.leases.acquires, "Run must not touch the lease store") + assert.Empty(t, h.leases.releases) +} + +// --- best-effort degradation ---------------------------------------- + +func TestHandleSurvivesOperationLogFailure(t *testing.T) { + h := newHarness(t) + h.records.stored["game-1"] = runningRecord(h.now) + h.operationLogs.appendErr = errors.New("postgres down") + + h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(nil) + + service := h.build(t) + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeSuccess, result.Outcome) +} + +func TestHandleSurvivesHealthPublishFailureOnVanished(t *testing.T) { + h := newHarness(t) + h.records.stored["game-1"] = runningRecord(h.now) + h.healthEvents.publishErr = errors.New("redis down") + + h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(ports.ErrContainerNotFound) + + service := h.build(t) + result, err := service.Handle(context.Background(), basicInput()) + require.NoError(t, err) + assert.Equal(t, operation.OutcomeSuccess, result.Outcome) + assert.Equal(t, runtime.StatusRemoved, result.Record.Status) +} + +// --- constructor ----------------------------------------------------- + +func TestNewServiceRejectsMissingDependencies(t *testing.T) { + h := newHarness(t) + deps := stopruntime.Dependencies{ + Container: config.ContainerConfig{ + DefaultCPUQuota: 1.0, + DefaultMemory: "512m", + DefaultPIDsLimit: 512, + StopTimeout: 30 * time.Second, + Retention: 30 * 24 * time.Hour, + EngineStateMountPath: "/var/lib/galaxy-game", + EngineStateEnvName: "GAME_STATE_PATH", + GameStateDirMode: 0o750, + 
GameStateRoot: "/var/lib/galaxy/games", + }, + Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute}, + Telemetry: h.telemetry, + } + _, err := stopruntime.NewService(deps) + require.Error(t, err) +} diff --git a/rtmanager/internal/service/stopruntime/stopreason.go b/rtmanager/internal/service/stopruntime/stopreason.go new file mode 100644 index 0000000..6bbfca1 --- /dev/null +++ b/rtmanager/internal/service/stopruntime/stopreason.go @@ -0,0 +1,82 @@ +package stopruntime + +import "fmt" + +// StopReason classifies why a caller is asking Runtime Manager to stop a +// game container. The enum is part of the `runtime:stop_jobs` envelope +// produced by Game Lobby and the body of the `POST +// /api/v1/internal/runtimes/{game_id}/stop` REST endpoint, and mirrors +// the AsyncAPI contract frozen in +// `rtmanager/api/runtime-jobs-asyncapi.yaml`. +// +// The vocabulary is shared with `lobby/internal/ports/runtimemanager.go`; +// the two declarations stay byte-identical and adding a new value +// requires a coordinated contract bump on both sides. +type StopReason string + +// StopReason enum values. Adding a new value is a contract change that +// touches the AsyncAPI spec, the Lobby producer, and every Runtime +// Manager consumer. +const ( + // StopReasonOrphanCleanup releases a container whose post-start + // metadata persistence failed in Lobby. + StopReasonOrphanCleanup StopReason = "orphan_cleanup" + + // StopReasonCancelled covers user-lifecycle cascade and explicit + // cancel paths for in-flight games. + StopReasonCancelled StopReason = "cancelled" + + // StopReasonFinished is reserved for engine-driven game finish flows. + StopReasonFinished StopReason = "finished" + + // StopReasonAdminRequest is reserved for admin-initiated stop paths. + StopReasonAdminRequest StopReason = "admin_request" + + // StopReasonTimeout is reserved for timeout-driven stop paths. + StopReasonTimeout StopReason = "timeout" +) + +// IsKnown reports whether reason belongs to the frozen stop-reason +// vocabulary. +func (reason StopReason) IsKnown() bool { + switch reason { + case StopReasonOrphanCleanup, + StopReasonCancelled, + StopReasonFinished, + StopReasonAdminRequest, + StopReasonTimeout: + return true + default: + return false + } +} + +// AllStopReasons returns the frozen list of every stop-reason value. The +// slice order is stable across calls and matches the AsyncAPI enum order. +func AllStopReasons() []StopReason { + return []StopReason{ + StopReasonOrphanCleanup, + StopReasonCancelled, + StopReasonFinished, + StopReasonAdminRequest, + StopReasonTimeout, + } +} + +// String returns reason as its stored enum value. Useful in log fields +// and telemetry attributes. +func (reason StopReason) String() string { + return string(reason) +} + +// Validate reports whether reason carries one of the five values fixed +// by the AsyncAPI contract. +func (reason StopReason) Validate() error { + if reason == "" { + return fmt.Errorf("stop reason must not be empty") + } + if !reason.IsKnown() { + return fmt.Errorf("stop reason %q is unsupported", reason) + } + return nil +} diff --git a/rtmanager/internal/telemetry/runtime.go b/rtmanager/internal/telemetry/runtime.go new file mode 100644 index 0000000..7df6e95 --- /dev/null +++ b/rtmanager/internal/telemetry/runtime.go @@ -0,0 +1,651 @@ +// Package telemetry provides lightweight OpenTelemetry helpers and +// low-cardinality Runtime Manager instruments used by the runnable +// skeleton. 
Later stages emit into the instruments declared here without +// touching this package. +package telemetry + +import ( + "context" + "errors" + "fmt" + "log/slog" + "os" + "strings" + "sync" + "time" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc" + "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp" + "go.opentelemetry.io/otel/exporters/stdout/stdoutmetric" + "go.opentelemetry.io/otel/exporters/stdout/stdouttrace" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/propagation" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/resource" + sdktrace "go.opentelemetry.io/otel/sdk/trace" + oteltrace "go.opentelemetry.io/otel/trace" +) + +const meterName = "galaxy/rtmanager" + +const ( + defaultServiceName = "galaxy-rtmanager" + + processExporterNone = "none" + processExporterOTLP = "otlp" + processProtocolHTTPProtobuf = "http/protobuf" + processProtocolGRPC = "grpc" +) + +// ProcessConfig configures the process-wide OpenTelemetry runtime. +type ProcessConfig struct { + // ServiceName overrides the default OpenTelemetry service name. + ServiceName string + + // TracesExporter selects the external traces exporter. Supported values + // are `none` and `otlp`. + TracesExporter string + + // MetricsExporter selects the external metrics exporter. Supported + // values are `none` and `otlp`. + MetricsExporter string + + // TracesProtocol selects the OTLP traces protocol when TracesExporter is + // `otlp`. + TracesProtocol string + + // MetricsProtocol selects the OTLP metrics protocol when + // MetricsExporter is `otlp`. + MetricsProtocol string + + // StdoutTracesEnabled enables the additional stdout trace exporter used + // for local development and debugging. + StdoutTracesEnabled bool + + // StdoutMetricsEnabled enables the additional stdout metric exporter + // used for local development and debugging. + StdoutMetricsEnabled bool +} + +// Validate reports whether cfg contains a supported OpenTelemetry exporter +// configuration. +func (cfg ProcessConfig) Validate() error { + switch cfg.TracesExporter { + case processExporterNone, processExporterOTLP: + default: + return fmt.Errorf("unsupported traces exporter %q", cfg.TracesExporter) + } + + switch cfg.MetricsExporter { + case processExporterNone, processExporterOTLP: + default: + return fmt.Errorf("unsupported metrics exporter %q", cfg.MetricsExporter) + } + + if cfg.TracesProtocol != "" && cfg.TracesProtocol != processProtocolHTTPProtobuf && cfg.TracesProtocol != processProtocolGRPC { + return fmt.Errorf("unsupported OTLP traces protocol %q", cfg.TracesProtocol) + } + if cfg.MetricsProtocol != "" && cfg.MetricsProtocol != processProtocolHTTPProtobuf && cfg.MetricsProtocol != processProtocolGRPC { + return fmt.Errorf("unsupported OTLP metrics protocol %q", cfg.MetricsProtocol) + } + + return nil +} + +// Runtime owns the Runtime Manager OpenTelemetry providers and the +// low-cardinality custom instruments listed in `rtmanager/README.md` +// §Observability. 
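+//
+// A minimal process-wide wiring sketch (the surrounding bootstrap and
+// variable names are illustrative, not part of this package):
+//
+//	rt, err := telemetry.NewProcess(ctx, telemetry.ProcessConfig{
+//		ServiceName:     "galaxy-rtmanager",
+//		TracesExporter:  "otlp",
+//		MetricsExporter: "otlp",
+//	}, logger)
+//	if err != nil {
+//		return err
+//	}
+//	defer rt.Shutdown(context.Background())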
+type Runtime struct { + tracerProvider oteltrace.TracerProvider + meterProvider metric.MeterProvider + meter metric.Meter + + shutdownMu sync.Mutex + shutdownDone bool + shutdownErr error + shutdownFns []func(context.Context) error + + internalHTTPRequests metric.Int64Counter + internalHTTPDuration metric.Float64Histogram + + startOutcomes metric.Int64Counter + stopOutcomes metric.Int64Counter + restartOutcomes metric.Int64Counter + patchOutcomes metric.Int64Counter + cleanupOutcomes metric.Int64Counter + healthEvents metric.Int64Counter + reconcileDrift metric.Int64Counter + notificationIntents metric.Int64Counter + dockerOpLatency metric.Float64Histogram + leaseAcquireLatency metric.Float64Histogram + + runtimeRecordsByStatus metric.Int64ObservableGauge + + gaugeMu sync.Mutex + gaugeRegistration metric.Registration +} + +// NewWithProviders constructs a telemetry runtime around explicitly supplied +// meterProvider and tracerProvider values. +func NewWithProviders(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider) (*Runtime, error) { + if meterProvider == nil { + meterProvider = otel.GetMeterProvider() + } + if tracerProvider == nil { + tracerProvider = otel.GetTracerProvider() + } + if meterProvider == nil { + return nil, errors.New("new rtmanager telemetry runtime: nil meter provider") + } + if tracerProvider == nil { + return nil, errors.New("new rtmanager telemetry runtime: nil tracer provider") + } + + return buildRuntime(meterProvider, tracerProvider, nil) +} + +// NewProcess constructs the process-wide Runtime Manager OpenTelemetry +// runtime from cfg, installs the resulting providers globally, and +// returns the runtime. +func NewProcess(ctx context.Context, cfg ProcessConfig, logger *slog.Logger) (*Runtime, error) { + if ctx == nil { + return nil, errors.New("new rtmanager telemetry process: nil context") + } + if err := cfg.Validate(); err != nil { + return nil, fmt.Errorf("new rtmanager telemetry process: %w", err) + } + if logger == nil { + logger = slog.Default() + } + + serviceName := strings.TrimSpace(cfg.ServiceName) + if serviceName == "" { + serviceName = defaultServiceName + } + + res := resource.NewSchemaless(attribute.String("service.name", serviceName)) + + tracerProvider, err := newTracerProvider(ctx, res, cfg) + if err != nil { + return nil, fmt.Errorf("new rtmanager telemetry process: tracer provider: %w", err) + } + meterProvider, err := newMeterProvider(ctx, res, cfg) + if err != nil { + return nil, fmt.Errorf("new rtmanager telemetry process: meter provider: %w", err) + } + + otel.SetTracerProvider(tracerProvider) + otel.SetMeterProvider(meterProvider) + otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator( + propagation.TraceContext{}, + propagation.Baggage{}, + )) + + runtime, err := buildRuntime(meterProvider, tracerProvider, []func(context.Context) error{ + meterProvider.Shutdown, + tracerProvider.Shutdown, + }) + if err != nil { + return nil, fmt.Errorf("new rtmanager telemetry process: runtime: %w", err) + } + + logger.Info("rtmanager telemetry configured", + "service_name", serviceName, + "traces_exporter", cfg.TracesExporter, + "metrics_exporter", cfg.MetricsExporter, + ) + + return runtime, nil +} + +// TracerProvider returns the runtime tracer provider. +func (runtime *Runtime) TracerProvider() oteltrace.TracerProvider { + if runtime == nil || runtime.tracerProvider == nil { + return otel.GetTracerProvider() + } + + return runtime.tracerProvider +} + +// MeterProvider returns the runtime meter provider. 
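+// A nil receiver or an unset provider falls back to the global
+// OpenTelemetry meter provider, mirroring TracerProvider.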
+func (runtime *Runtime) MeterProvider() metric.MeterProvider { + if runtime == nil || runtime.meterProvider == nil { + return otel.GetMeterProvider() + } + + return runtime.meterProvider +} + +// Shutdown flushes and stops the configured telemetry providers. Shutdown +// is idempotent. +func (runtime *Runtime) Shutdown(ctx context.Context) error { + if runtime == nil { + return nil + } + + runtime.shutdownMu.Lock() + if runtime.shutdownDone { + err := runtime.shutdownErr + runtime.shutdownMu.Unlock() + return err + } + runtime.shutdownDone = true + runtime.shutdownMu.Unlock() + + runtime.gaugeMu.Lock() + if runtime.gaugeRegistration != nil { + _ = runtime.gaugeRegistration.Unregister() + runtime.gaugeRegistration = nil + } + runtime.gaugeMu.Unlock() + + var shutdownErr error + for index := len(runtime.shutdownFns) - 1; index >= 0; index-- { + shutdownErr = errors.Join(shutdownErr, runtime.shutdownFns[index](ctx)) + } + + runtime.shutdownMu.Lock() + runtime.shutdownErr = shutdownErr + runtime.shutdownMu.Unlock() + + return shutdownErr +} + +// RecordInternalHTTPRequest records one internal HTTP request outcome. +func (runtime *Runtime) RecordInternalHTTPRequest(ctx context.Context, attrs []attribute.KeyValue, duration time.Duration) { + if runtime == nil { + return + } + + options := metric.WithAttributes(attrs...) + runtime.internalHTTPRequests.Add(normalizeContext(ctx), 1, options) + runtime.internalHTTPDuration.Record(normalizeContext(ctx), duration.Seconds()*1000, options) +} + +// RecordStartOutcome records one terminal outcome of the start operation. +// outcome is `success` or `failure`; errorCode is `replay_no_op` or one of +// the stable failure codes from `rtmanager/README.md` §Error Model; +// opSource is `lobby_stream`, `gm_rest`, or `admin_rest`. +func (runtime *Runtime) RecordStartOutcome(ctx context.Context, outcome, errorCode, opSource string) { + if runtime == nil || runtime.startOutcomes == nil { + return + } + runtime.startOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes( + attribute.String("outcome", outcome), + attribute.String("error_code", errorCode), + attribute.String("op_source", opSource), + )) +} + +// RecordStopOutcome records one terminal outcome of the stop operation. +// reason is the value carried on `runtime:stop_jobs` or the matching REST +// reason; opSource is `lobby_stream`, `gm_rest`, or `admin_rest`. +func (runtime *Runtime) RecordStopOutcome(ctx context.Context, outcome, reason, opSource string) { + if runtime == nil || runtime.stopOutcomes == nil { + return + } + runtime.stopOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes( + attribute.String("outcome", outcome), + attribute.String("reason", reason), + attribute.String("op_source", opSource), + )) +} + +// RecordRestartOutcome records one terminal outcome of the restart +// operation. +func (runtime *Runtime) RecordRestartOutcome(ctx context.Context, outcome, errorCode string) { + if runtime == nil || runtime.restartOutcomes == nil { + return + } + runtime.restartOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes( + attribute.String("outcome", outcome), + attribute.String("error_code", errorCode), + )) +} + +// RecordPatchOutcome records one terminal outcome of the patch operation. 
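+// Attribute semantics mirror RecordRestartOutcome: one increment carrying
+// the low-cardinality `outcome` and `error_code` attributes.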
+func (runtime *Runtime) RecordPatchOutcome(ctx context.Context, outcome, errorCode string) { + if runtime == nil || runtime.patchOutcomes == nil { + return + } + runtime.patchOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes( + attribute.String("outcome", outcome), + attribute.String("error_code", errorCode), + )) +} + +// RecordCleanupOutcome records one terminal outcome of the cleanup +// operation. opSource is `auto_ttl` for the periodic cleanup worker and +// `admin_rest` for explicit administrative removal. +func (runtime *Runtime) RecordCleanupOutcome(ctx context.Context, outcome, opSource string) { + if runtime == nil || runtime.cleanupOutcomes == nil { + return + } + runtime.cleanupOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes( + attribute.String("outcome", outcome), + attribute.String("op_source", opSource), + )) +} + +// RecordHealthEvent records one technical runtime event published on +// `runtime:health_events`. eventType comes from the frozen vocabulary in +// `rtmanager/README.md` §Async Stream Contracts. +func (runtime *Runtime) RecordHealthEvent(ctx context.Context, eventType string) { + if runtime == nil || runtime.healthEvents == nil { + return + } + runtime.healthEvents.Add(normalizeContext(ctx), 1, metric.WithAttributes( + attribute.String("event_type", eventType), + )) +} + +// RecordReconcileDrift records one drift outcome from the reconciler. kind +// is `adopt`, `dispose`, or `observed_exited`. +func (runtime *Runtime) RecordReconcileDrift(ctx context.Context, kind string) { + if runtime == nil || runtime.reconcileDrift == nil { + return + } + runtime.reconcileDrift.Add(normalizeContext(ctx), 1, metric.WithAttributes( + attribute.String("kind", kind), + )) +} + +// RecordNotificationIntent records one admin-only notification intent +// publish attempt. notificationType is `runtime.image_pull_failed`, +// `runtime.container_start_failed`, or `runtime.start_config_invalid`. +func (runtime *Runtime) RecordNotificationIntent(ctx context.Context, notificationType string) { + if runtime == nil || runtime.notificationIntents == nil { + return + } + runtime.notificationIntents.Add(normalizeContext(ctx), 1, metric.WithAttributes( + attribute.String("notification_type", notificationType), + )) +} + +// RecordDockerOpLatency records the wall-clock duration of one Docker SDK +// call. op is one of `pull`, `create`, `start`, `stop`, `rm`, `inspect`, +// `events`. +func (runtime *Runtime) RecordDockerOpLatency(ctx context.Context, op string, duration time.Duration) { + if runtime == nil || runtime.dockerOpLatency == nil { + return + } + runtime.dockerOpLatency.Record(normalizeContext(ctx), duration.Seconds()*1000, metric.WithAttributes( + attribute.String("op", op), + )) +} + +// RecordLeaseAcquireLatency records the wall-clock latency of one +// per-game Redis lease acquisition. +func (runtime *Runtime) RecordLeaseAcquireLatency(ctx context.Context, duration time.Duration) { + if runtime == nil || runtime.leaseAcquireLatency == nil { + return + } + runtime.leaseAcquireLatency.Record(normalizeContext(ctx), duration.Seconds()*1000) +} + +// RuntimeRecordsByStatusProbe reports the number of runtime_records rows +// per status. The production probe wraps the runtime record store; tests +// may pass a stub. +type RuntimeRecordsByStatusProbe interface { + CountByStatus(ctx context.Context) (map[string]int, error) +} + +// GaugeDependencies groups the collaborators required by RegisterGauges. 
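+//
+// A typical registration sketch (the probe variable is illustrative; any
+// RuntimeRecordsByStatusProbe implementation works):
+//
+//	if err := rt.RegisterGauges(telemetry.GaugeDependencies{
+//		RuntimeRecordsByStatus: runtimeRecordStore,
+//		Logger:                 logger,
+//	}); err != nil {
+//		return err
+//	}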
+type GaugeDependencies struct { + // RuntimeRecordsByStatus probes the per-status row count for + // `rtmanager.runtime_records_by_status`. + RuntimeRecordsByStatus RuntimeRecordsByStatusProbe + + // Logger records non-fatal probe errors. Defaults to slog.Default + // when nil. + Logger *slog.Logger +} + +// RegisterGauges installs the observable-gauge callback that reports +// `rtmanager.runtime_records_by_status`. It is safe to call once per +// Runtime; a second call replaces the previous registration. The runtime +// keeps no strong reference to deps beyond the callback closure. +// +// The wiring layer registers the gauge once the persistence adapters +// are constructed. +func (runtime *Runtime) RegisterGauges(deps GaugeDependencies) error { + if runtime == nil { + return errors.New("register rtmanager gauges: nil runtime") + } + if deps.RuntimeRecordsByStatus == nil { + return errors.New("register rtmanager gauges: nil runtime records probe") + } + + logger := deps.Logger + if logger == nil { + logger = slog.Default() + } + + runtime.gaugeMu.Lock() + defer runtime.gaugeMu.Unlock() + + if runtime.gaugeRegistration != nil { + _ = runtime.gaugeRegistration.Unregister() + runtime.gaugeRegistration = nil + } + + callback := func(ctx context.Context, observer metric.Observer) error { + counts, err := deps.RuntimeRecordsByStatus.CountByStatus(ctx) + if err != nil { + logger.WarnContext(ctx, "runtime records probe failed", + "err", err.Error(), + ) + return nil + } + for status, count := range counts { + observer.ObserveInt64(runtime.runtimeRecordsByStatus, int64(count), metric.WithAttributes( + attribute.String("status", status), + )) + } + return nil + } + + registration, err := runtime.meter.RegisterCallback(callback, runtime.runtimeRecordsByStatus) + if err != nil { + return fmt.Errorf("register rtmanager gauges: %w", err) + } + runtime.gaugeRegistration = registration + + return nil +} + +func buildRuntime(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider, shutdownFns []func(context.Context) error) (*Runtime, error) { + meter := meterProvider.Meter(meterName) + runtime := &Runtime{ + tracerProvider: tracerProvider, + meterProvider: meterProvider, + meter: meter, + shutdownFns: append([]func(context.Context) error(nil), shutdownFns...), + } + + internalHTTPRequests, err := meter.Int64Counter("rtmanager.internal_http.requests") + if err != nil { + return nil, fmt.Errorf("build rtmanager telemetry runtime: internal_http.requests: %w", err) + } + internalHTTPDuration, err := meter.Float64Histogram("rtmanager.internal_http.duration", metric.WithUnit("ms")) + if err != nil { + return nil, fmt.Errorf("build rtmanager telemetry runtime: internal_http.duration: %w", err) + } + runtime.internalHTTPRequests = internalHTTPRequests + runtime.internalHTTPDuration = internalHTTPDuration + + if err := registerCounters(meter, runtime); err != nil { + return nil, err + } + if err := registerHistograms(meter, runtime); err != nil { + return nil, err + } + if err := registerObservableGauges(meter, runtime); err != nil { + return nil, err + } + + return runtime, nil +} + +func registerCounters(meter metric.Meter, runtime *Runtime) error { + specs := []struct { + name string + target *metric.Int64Counter + }{ + {"rtmanager.start_outcomes", &runtime.startOutcomes}, + {"rtmanager.stop_outcomes", &runtime.stopOutcomes}, + {"rtmanager.restart_outcomes", &runtime.restartOutcomes}, + {"rtmanager.patch_outcomes", &runtime.patchOutcomes}, + {"rtmanager.cleanup_outcomes", 
&runtime.cleanupOutcomes}, + {"rtmanager.health_events", &runtime.healthEvents}, + {"rtmanager.reconcile_drift", &runtime.reconcileDrift}, + {"rtmanager.notification_intents", &runtime.notificationIntents}, + } + for _, spec := range specs { + counter, err := meter.Int64Counter(spec.name) + if err != nil { + return fmt.Errorf("build rtmanager telemetry runtime: %s: %w", spec.name, err) + } + *spec.target = counter + } + return nil +} + +func registerHistograms(meter metric.Meter, runtime *Runtime) error { + specs := []struct { + name string + unit string + target *metric.Float64Histogram + }{ + {"rtmanager.docker_op_latency", "ms", &runtime.dockerOpLatency}, + {"rtmanager.lease_acquire_latency", "ms", &runtime.leaseAcquireLatency}, + } + for _, spec := range specs { + options := []metric.Float64HistogramOption{} + if spec.unit != "" { + options = append(options, metric.WithUnit(spec.unit)) + } + histogram, err := meter.Float64Histogram(spec.name, options...) + if err != nil { + return fmt.Errorf("build rtmanager telemetry runtime: %s: %w", spec.name, err) + } + *spec.target = histogram + } + return nil +} + +func registerObservableGauges(meter metric.Meter, runtime *Runtime) error { + gauge, err := meter.Int64ObservableGauge("rtmanager.runtime_records_by_status") + if err != nil { + return fmt.Errorf("build rtmanager telemetry runtime: runtime_records_by_status: %w", err) + } + runtime.runtimeRecordsByStatus = gauge + return nil +} + +func newTracerProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig) (*sdktrace.TracerProvider, error) { + options := []sdktrace.TracerProviderOption{ + sdktrace.WithResource(res), + } + + if exporter, err := traceExporter(ctx, cfg); err != nil { + return nil, err + } else if exporter != nil { + options = append(options, sdktrace.WithBatcher(exporter)) + } + + if cfg.StdoutTracesEnabled { + exporter, err := stdouttrace.New(stdouttrace.WithWriter(os.Stdout)) + if err != nil { + return nil, fmt.Errorf("stdout traces exporter: %w", err) + } + options = append(options, sdktrace.WithBatcher(exporter)) + } + + return sdktrace.NewTracerProvider(options...), nil +} + +func newMeterProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig) (*sdkmetric.MeterProvider, error) { + options := []sdkmetric.Option{ + sdkmetric.WithResource(res), + } + + if exporter, err := metricExporter(ctx, cfg); err != nil { + return nil, err + } else if exporter != nil { + options = append(options, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter))) + } + + if cfg.StdoutMetricsEnabled { + exporter, err := stdoutmetric.New(stdoutmetric.WithWriter(os.Stdout)) + if err != nil { + return nil, fmt.Errorf("stdout metrics exporter: %w", err) + } + options = append(options, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter))) + } + + return sdkmetric.NewMeterProvider(options...), nil +} + +func traceExporter(ctx context.Context, cfg ProcessConfig) (sdktrace.SpanExporter, error) { + if cfg.TracesExporter != processExporterOTLP { + return nil, nil + } + + switch normalizeProtocol(cfg.TracesProtocol) { + case processProtocolGRPC: + exporter, err := otlptracegrpc.New(ctx) + if err != nil { + return nil, fmt.Errorf("otlp grpc traces exporter: %w", err) + } + return exporter, nil + default: + exporter, err := otlptracehttp.New(ctx) + if err != nil { + return nil, fmt.Errorf("otlp http traces exporter: %w", err) + } + return exporter, nil + } +} + +func metricExporter(ctx context.Context, cfg ProcessConfig) (sdkmetric.Exporter, error) { + if 
cfg.MetricsExporter != processExporterOTLP { + return nil, nil + } + + switch normalizeProtocol(cfg.MetricsProtocol) { + case processProtocolGRPC: + exporter, err := otlpmetricgrpc.New(ctx) + if err != nil { + return nil, fmt.Errorf("otlp grpc metrics exporter: %w", err) + } + return exporter, nil + default: + exporter, err := otlpmetrichttp.New(ctx) + if err != nil { + return nil, fmt.Errorf("otlp http metrics exporter: %w", err) + } + return exporter, nil + } +} + +func normalizeProtocol(value string) string { + switch strings.TrimSpace(value) { + case processProtocolGRPC: + return processProtocolGRPC + default: + return processProtocolHTTPProtobuf + } +} + +func normalizeContext(ctx context.Context) context.Context { + if ctx == nil { + return context.Background() + } + + return ctx +} diff --git a/rtmanager/internal/worker/containercleanup/worker.go b/rtmanager/internal/worker/containercleanup/worker.go new file mode 100644 index 0000000..badae53 --- /dev/null +++ b/rtmanager/internal/worker/containercleanup/worker.go @@ -0,0 +1,204 @@ +// Package containercleanup ships the periodic TTL-cleanup worker +// described in `rtmanager/README.md §Lifecycles → Cleanup`. +// +// On every tick the worker lists `runtime_records.status='stopped'` +// rows whose `last_op_at` is older than the configured retention +// (`RTMANAGER_CONTAINER_RETENTION_DAYS`) and delegates removal to +// `cleanupcontainer.Service.Handle` with `op_source=auto_ttl`. The +// service owns the per-game lease, the Docker `Remove` call, the +// status transition, the telemetry counter, and the operation_log +// entry; this worker is intentionally tiny — a ticker plus a TTL +// filter. +// +// Idempotent outcomes (`replay_no_op`, `conflict`) are absorbed; a +// failure on one game does not abort the rest of the pass. +// +// Design rationale is captured in +// `rtmanager/docs/workers.md`. +package containercleanup + +import ( + "context" + "errors" + "log/slog" + "time" + + "galaxy/rtmanager/internal/domain/operation" + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/ports" + "galaxy/rtmanager/internal/service/cleanupcontainer" +) + +// Cleaner is the narrow surface the worker uses to remove stopped +// containers. The production `*cleanupcontainer.Service` satisfies +// this interface verbatim; the package keeps the surface here so +// tests can substitute a fake without spinning the full service. +type Cleaner interface { + Handle(ctx context.Context, input cleanupcontainer.Input) (cleanupcontainer.Result, error) +} + +// Dependencies groups the collaborators required by Worker. +type Dependencies struct { + // RuntimeRecords lists `status=stopped` records on every tick. + RuntimeRecords ports.RuntimeRecordStore + + // Cleanup performs the actual container removal under the per-game + // lease. + Cleanup Cleaner + + // Retention is the TTL after which a stopped container becomes a + // removal candidate. Mirrors `cfg.Container.Retention`. + Retention time.Duration + + // Interval bounds the tick period. Mirrors + // `cfg.Cleanup.CleanupInterval`. + Interval time.Duration + + // Clock supplies the wall-clock used to compute the TTL threshold. + // Defaults to `time.Now` when nil. + Clock func() time.Time + + // Logger receives structured worker-level events. Defaults to + // `slog.Default()` when nil. + Logger *slog.Logger +} + +// Worker drives the periodic TTL-cleanup loop. 
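+//
+// A minimal wiring sketch (variable names are illustrative; the real
+// wiring lives in the service bootstrap):
+//
+//	worker, err := containercleanup.NewWorker(containercleanup.Dependencies{
+//		RuntimeRecords: recordStore,
+//		Cleanup:        cleanupService,
+//		Retention:      cfg.Container.Retention,
+//		Interval:       cfg.Cleanup.CleanupInterval,
+//	})
+//	if err != nil {
+//		return err
+//	}
+//	go func() { _ = worker.Run(ctx) }()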
+type Worker struct { + runtimeRecords ports.RuntimeRecordStore + cleanup Cleaner + + retention time.Duration + interval time.Duration + + clock func() time.Time + logger *slog.Logger +} + +// NewWorker constructs one Worker from deps. +func NewWorker(deps Dependencies) (*Worker, error) { + switch { + case deps.RuntimeRecords == nil: + return nil, errors.New("new container cleanup worker: nil runtime records store") + case deps.Cleanup == nil: + return nil, errors.New("new container cleanup worker: nil cleanup service") + case deps.Retention <= 0: + return nil, errors.New("new container cleanup worker: retention must be positive") + case deps.Interval <= 0: + return nil, errors.New("new container cleanup worker: interval must be positive") + } + + clock := deps.Clock + if clock == nil { + clock = time.Now + } + logger := deps.Logger + if logger == nil { + logger = slog.Default() + } + + return &Worker{ + runtimeRecords: deps.RuntimeRecords, + cleanup: deps.Cleanup, + retention: deps.Retention, + interval: deps.Interval, + clock: clock, + logger: logger.With("worker", "rtmanager.containercleanup"), + }, nil +} + +// Run drives the cleanup loop until ctx is cancelled. Per-tick errors +// are absorbed; the loop only exits on context cancellation. +func (worker *Worker) Run(ctx context.Context) error { + if worker == nil { + return errors.New("run container cleanup worker: nil worker") + } + if ctx == nil { + return errors.New("run container cleanup worker: nil context") + } + if err := ctx.Err(); err != nil { + return err + } + + worker.logger.Info("container cleanup worker started", + "interval", worker.interval.String(), + "retention", worker.retention.String(), + ) + defer worker.logger.Info("container cleanup worker stopped") + + ticker := time.NewTicker(worker.interval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + worker.tick(ctx) + } + } +} + +// Shutdown is a no-op; Run terminates on context cancellation. +func (worker *Worker) Shutdown(ctx context.Context) error { + if ctx == nil { + return errors.New("shutdown container cleanup worker: nil context") + } + return nil +} + +// Tick performs one cleanup pass. Exported so tests can drive the +// worker deterministically without spinning a real ticker. +func (worker *Worker) Tick(ctx context.Context) { + worker.tick(ctx) +} + +// tick lists stopped records and delegates removal of expired ones to +// the cleanup service. 
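+// A record counts as expired only when its LastOpAt is strictly older
+// than now minus retention; a record sitting exactly on the threshold is
+// kept for the current pass.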
+func (worker *Worker) tick(ctx context.Context) { + if err := ctx.Err(); err != nil { + return + } + + records, err := worker.runtimeRecords.ListByStatus(ctx, runtime.StatusStopped) + if err != nil { + worker.logger.WarnContext(ctx, "list stopped records", + "err", err.Error(), + ) + return + } + + threshold := worker.clock().Add(-worker.retention) + for _, record := range records { + if err := ctx.Err(); err != nil { + return + } + if !record.LastOpAt.Before(threshold) { + continue + } + + result, err := worker.cleanup.Handle(ctx, cleanupcontainer.Input{ + GameID: record.GameID, + OpSource: operation.OpSourceAutoTTL, + }) + if err != nil { + worker.logger.ErrorContext(ctx, "cleanup handle returned error", + "game_id", record.GameID, + "err", err.Error(), + ) + continue + } + if result.Outcome == operation.OutcomeFailure { + worker.logger.InfoContext(ctx, "cleanup ttl pass: failure outcome", + "game_id", record.GameID, + "error_code", result.ErrorCode, + "error_message", result.ErrorMessage, + ) + continue + } + worker.logger.InfoContext(ctx, "cleanup ttl removed container", + "game_id", record.GameID, + "error_code", result.ErrorCode, + ) + } +} diff --git a/rtmanager/internal/worker/containercleanup/worker_test.go b/rtmanager/internal/worker/containercleanup/worker_test.go new file mode 100644 index 0000000..c0a7cb8 --- /dev/null +++ b/rtmanager/internal/worker/containercleanup/worker_test.go @@ -0,0 +1,296 @@ +package containercleanup_test + +import ( + "context" + "errors" + "io" + "log/slog" + "sync" + "testing" + "time" + + "galaxy/rtmanager/internal/domain/operation" + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/ports" + "galaxy/rtmanager/internal/service/cleanupcontainer" + "galaxy/rtmanager/internal/worker/containercleanup" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func silentLogger() *slog.Logger { + return slog.New(slog.NewTextHandler(io.Discard, nil)) +} + +// fakeRuntimeRecords supports ListByStatus only. +type fakeRuntimeRecords struct { + mu sync.Mutex + stopped []runtime.RuntimeRecord + listErr error +} + +func newFakeRuntimeRecords() *fakeRuntimeRecords { return &fakeRuntimeRecords{} } + +func (s *fakeRuntimeRecords) Set(records ...runtime.RuntimeRecord) { + s.mu.Lock() + defer s.mu.Unlock() + s.stopped = append([]runtime.RuntimeRecord(nil), records...) +} + +func (s *fakeRuntimeRecords) Get(_ context.Context, _ string) (runtime.RuntimeRecord, error) { + return runtime.RuntimeRecord{}, runtime.ErrNotFound +} +func (s *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error { return nil } +func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error { + return nil +} +func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) { + return nil, nil +} + +func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, status runtime.Status) ([]runtime.RuntimeRecord, error) { + s.mu.Lock() + defer s.mu.Unlock() + if s.listErr != nil { + return nil, s.listErr + } + if status != runtime.StatusStopped { + return nil, nil + } + out := make([]runtime.RuntimeRecord, len(s.stopped)) + copy(out, s.stopped) + return out, nil +} + +// fakeCleaner records every Handle call and returns canned responses. 
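+// Queued errs take precedence over queued responses; once both queues are
+// drained, Handle falls back to defaultErr and then defaultResult.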
+type fakeCleaner struct { + mu sync.Mutex + + calls []cleanupcontainer.Input + responses []cleanupcontainer.Result + errs []error + + defaultResult cleanupcontainer.Result + defaultErr error +} + +func (c *fakeCleaner) Handle(_ context.Context, input cleanupcontainer.Input) (cleanupcontainer.Result, error) { + c.mu.Lock() + defer c.mu.Unlock() + c.calls = append(c.calls, input) + if len(c.errs) > 0 { + err := c.errs[0] + c.errs = c.errs[1:] + return cleanupcontainer.Result{}, err + } + if len(c.responses) > 0 { + result := c.responses[0] + c.responses = c.responses[1:] + return result, nil + } + if c.defaultErr != nil { + return cleanupcontainer.Result{}, c.defaultErr + } + return c.defaultResult, nil +} + +func (c *fakeCleaner) Calls() []cleanupcontainer.Input { + c.mu.Lock() + defer c.mu.Unlock() + out := make([]cleanupcontainer.Input, len(c.calls)) + copy(out, c.calls) + return out +} + +// --- harness ---------------------------------------------------------- + +type harness struct { + records *fakeRuntimeRecords + cleaner *fakeCleaner + + now time.Time +} + +func newHarness() *harness { + return &harness{ + records: newFakeRuntimeRecords(), + cleaner: &fakeCleaner{ + defaultResult: cleanupcontainer.Result{Outcome: operation.OutcomeSuccess}, + }, + now: time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC), + } +} + +func (h *harness) build(t *testing.T, retention time.Duration) *containercleanup.Worker { + t.Helper() + worker, err := containercleanup.NewWorker(containercleanup.Dependencies{ + RuntimeRecords: h.records, + Cleanup: h.cleaner, + Retention: retention, + Interval: 50 * time.Millisecond, + Clock: func() time.Time { return h.now }, + Logger: silentLogger(), + }) + require.NoError(t, err) + return worker +} + +// stoppedRecord builds a baseline record with the requested LastOpAt. +func stoppedRecord(gameID string, lastOpAt time.Time) runtime.RuntimeRecord { + stoppedAt := lastOpAt + return runtime.RuntimeRecord{ + GameID: gameID, + Status: runtime.StatusStopped, + CurrentContainerID: "ctr-" + gameID, + CurrentImageRef: "galaxy/game:1.0.0", + EngineEndpoint: "http://galaxy-game-" + gameID + ":8080", + StatePath: "/var/lib/galaxy/games/" + gameID, + DockerNetwork: "galaxy-net", + LastOpAt: lastOpAt, + CreatedAt: lastOpAt.Add(-time.Hour), + StoppedAt: &stoppedAt, + } +} + +// --- constructor ------------------------------------------------------ + +func TestNewWorkerRejectsMissingDeps(t *testing.T) { + cleaner := &fakeCleaner{defaultResult: cleanupcontainer.Result{Outcome: operation.OutcomeSuccess}} + records := newFakeRuntimeRecords() + + defectives := []containercleanup.Dependencies{ + {}, + {RuntimeRecords: records}, + {RuntimeRecords: records, Cleanup: cleaner}, + {RuntimeRecords: records, Cleanup: cleaner, Retention: time.Hour}, + } + for index, deps := range defectives { + _, err := containercleanup.NewWorker(deps) + require.Errorf(t, err, "case %d should fail", index) + } + + _, err := containercleanup.NewWorker(containercleanup.Dependencies{ + RuntimeRecords: records, + Cleanup: cleaner, + Retention: time.Hour, + Interval: time.Minute, + }) + require.NoError(t, err) +} + +// --- TTL math --------------------------------------------------------- + +func TestTickCallsHandleForExpiredRecordsOnly(t *testing.T) { + h := newHarness() + retention := 24 * time.Hour + w := h.build(t, retention) + + // One stopped older than retention, one within retention. 
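+	// With the 24h retention the threshold is now-24h, so only game-old
+	// (30h old) qualifies for cleanup.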
+ expired := stoppedRecord("game-old", h.now.Add(-30*time.Hour)) + fresh := stoppedRecord("game-new", h.now.Add(-time.Hour)) + h.records.Set(expired, fresh) + + w.Tick(context.Background()) + + calls := h.cleaner.Calls() + require.Len(t, calls, 1, "only the expired record should be passed to cleanup") + assert.Equal(t, "game-old", calls[0].GameID) + assert.Equal(t, operation.OpSourceAutoTTL, calls[0].OpSource) + assert.Empty(t, calls[0].SourceRef) +} + +func TestTickRespectsThresholdBoundaryExactly(t *testing.T) { + h := newHarness() + retention := 24 * time.Hour + w := h.build(t, retention) + + // LastOpAt exactly equals the threshold; record.LastOpAt.Before(threshold) + // must be false → record stays. + exactly := stoppedRecord("game-edge", h.now.Add(-retention)) + h.records.Set(exactly) + + w.Tick(context.Background()) + assert.Empty(t, h.cleaner.Calls(), "boundary record (LastOpAt == threshold) is not yet expired") +} + +// --- error absorption ------------------------------------------------- + +func TestTickAbsorbsListError(t *testing.T) { + h := newHarness() + w := h.build(t, time.Hour) + h.records.listErr = errors.New("pg down") + + require.NotPanics(t, func() { w.Tick(context.Background()) }) + assert.Empty(t, h.cleaner.Calls()) +} + +func TestTickAbsorbsHandleErrorAndContinues(t *testing.T) { + h := newHarness() + retention := time.Hour + w := h.build(t, retention) + + a := stoppedRecord("game-a", h.now.Add(-2*retention)) + b := stoppedRecord("game-b", h.now.Add(-2*retention)) + h.records.Set(a, b) + + h.cleaner.errs = []error{errors.New("docker hiccup")} + + w.Tick(context.Background()) + + calls := h.cleaner.Calls() + require.Len(t, calls, 2, "second game must still be processed after first error") + assert.Equal(t, "game-a", calls[0].GameID) + assert.Equal(t, "game-b", calls[1].GameID) +} + +func TestTickAbsorbsFailureOutcomeAndContinues(t *testing.T) { + h := newHarness() + retention := time.Hour + w := h.build(t, retention) + + a := stoppedRecord("game-a", h.now.Add(-2*retention)) + b := stoppedRecord("game-b", h.now.Add(-2*retention)) + h.records.Set(a, b) + + h.cleaner.responses = []cleanupcontainer.Result{ + {Outcome: operation.OutcomeFailure, ErrorCode: "service_unavailable", ErrorMessage: "docker"}, + } + + w.Tick(context.Background()) + + calls := h.cleaner.Calls() + require.Len(t, calls, 2) +} + +// --- Run lifecycle ---------------------------------------------------- + +func TestRunRespectsContextCancel(t *testing.T) { + h := newHarness() + w := h.build(t, time.Hour) + + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan error, 1) + go func() { done <- w.Run(ctx) }() + + cancel() + select { + case err := <-done: + assert.ErrorIs(t, err, context.Canceled) + case <-time.After(time.Second): + t.Fatalf("Run did not exit after cancel") + } +} + +func TestShutdownIsNoOp(t *testing.T) { + h := newHarness() + w := h.build(t, time.Hour) + require.NoError(t, w.Shutdown(context.Background())) +} + +// --- compile-time safety ---------------------------------------------- + +var ( + _ ports.RuntimeRecordStore = (*fakeRuntimeRecords)(nil) + _ containercleanup.Cleaner = (*fakeCleaner)(nil) +) diff --git a/rtmanager/internal/worker/dockerevents/listener.go b/rtmanager/internal/worker/dockerevents/listener.go new file mode 100644 index 0000000..87f71e9 --- /dev/null +++ b/rtmanager/internal/worker/dockerevents/listener.go @@ -0,0 +1,357 @@ +// Package dockerevents subscribes to the Docker events stream and turns +// container-scoped events into entries on 
`runtime:health_events`. +// +// Three event kinds are emitted by this listener (per +// `rtmanager/README.md §Health Monitoring`): +// +// - `container_exited` from a `die` action with non-zero exit code; +// - `container_oom` from an `oom` action; +// - `container_disappeared` from a `destroy` action observed for a +// `runtime_records.status=running` row whose `current_container_id` +// still matches the destroyed container — i.e., a destroy that +// Runtime Manager did not initiate itself. Destroys triggered by +// RTM's own restart / cleanup flow either find the record already +// transitioned (status != running) or pointing at a different +// container id, and are therefore skipped. +// +// `container_started` is emitted by the start service and is not +// duplicated here. Graceful stop produces a `die` event with exit code +// `0`; that case is suppressed to honour the README guarantee that +// `container_exited` carries a non-zero exit. +// +// Design rationale, including the destroy-disambiguation rule and the +// reconnect policy, is captured in +// `rtmanager/docs/workers.md`. +package dockerevents + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "log/slog" + "strings" + "time" + + "galaxy/rtmanager/internal/domain/health" + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/logging" + "galaxy/rtmanager/internal/ports" + "galaxy/rtmanager/internal/service/startruntime" + "galaxy/rtmanager/internal/telemetry" +) + +// Docker event actions consumed by the listener. Other actions are +// observed but ignored. +const ( + actionDie = "die" + actionOOM = "oom" + actionDestroy = "destroy" +) + +// defaultReconnectBackoff bounds the wait between two `EventsListen` +// reconnect attempts. Daemon hiccups in production are common; the +// listener never gives up while ctx is alive. +const defaultReconnectBackoff = 5 * time.Second + +// Dependencies groups the collaborators required by Listener. +type Dependencies struct { + // Docker provides the EventsListen subscription used by Run. + Docker ports.DockerClient + + // RuntimeRecords resolves `(game_id, container_id)` for destroy + // disambiguation. + RuntimeRecords ports.RuntimeRecordStore + + // HealthEvents emits the entries produced by handleEvent. Failures + // are best-effort: the listener logs and continues. + HealthEvents ports.HealthEventPublisher + + // Telemetry records one health-event counter increment per emission. + // Required. + Telemetry *telemetry.Runtime + + // Clock supplies the wall-clock used as a fallback when a Docker + // event arrives without a timestamp. Defaults to `time.Now`. + Clock func() time.Time + + // Logger receives structured worker-level events. Defaults to + // `slog.Default()` when nil. + Logger *slog.Logger + + // ReconnectBackoff bounds the wait between reconnect attempts. + // Defaults to defaultReconnectBackoff when zero. + ReconnectBackoff time.Duration +} + +// Listener consumes Docker container events and emits the matching +// `runtime:health_events` entries. +type Listener struct { + docker ports.DockerClient + runtimeRecords ports.RuntimeRecordStore + healthEvents ports.HealthEventPublisher + telemetry *telemetry.Runtime + clock func() time.Time + logger *slog.Logger + + reconnectBackoff time.Duration +} + +// NewListener constructs one Listener from deps. 
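+//
+// A minimal wiring sketch (hypothetical variables; the concrete adapters
+// behind dockerClient, recordStore and publisher are assumptions, not part
+// of this package):
+//
+//	listener, err := dockerevents.NewListener(dockerevents.Dependencies{
+//		Docker:         dockerClient,     // ports.DockerClient adapter
+//		RuntimeRecords: recordStore,      // ports.RuntimeRecordStore
+//		HealthEvents:   publisher,        // ports.HealthEventPublisher
+//		Telemetry:      telemetryRuntime, // *telemetry.Runtime
+//	})
+//	if err != nil {
+//		return err
+//	}
+//	go func() { _ = listener.Run(ctx) }() // reconnects until ctx is cancelled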
+func NewListener(deps Dependencies) (*Listener, error) { + switch { + case deps.Docker == nil: + return nil, errors.New("new docker events listener: nil docker client") + case deps.RuntimeRecords == nil: + return nil, errors.New("new docker events listener: nil runtime records store") + case deps.HealthEvents == nil: + return nil, errors.New("new docker events listener: nil health events publisher") + case deps.Telemetry == nil: + return nil, errors.New("new docker events listener: nil telemetry runtime") + } + + clock := deps.Clock + if clock == nil { + clock = time.Now + } + logger := deps.Logger + if logger == nil { + logger = slog.Default() + } + backoff := deps.ReconnectBackoff + if backoff <= 0 { + backoff = defaultReconnectBackoff + } + + return &Listener{ + docker: deps.Docker, + runtimeRecords: deps.RuntimeRecords, + healthEvents: deps.HealthEvents, + telemetry: deps.Telemetry, + clock: clock, + logger: logger.With("worker", "rtmanager.dockerevents"), + reconnectBackoff: backoff, + }, nil +} + +// Run drives the events subscription. The outer loop reconnects after a +// Docker subscription error with a fixed backoff; only `ctx` +// cancellation terminates Run. +func (listener *Listener) Run(ctx context.Context) error { + if listener == nil { + return errors.New("run docker events listener: nil listener") + } + if ctx == nil { + return errors.New("run docker events listener: nil context") + } + if err := ctx.Err(); err != nil { + return err + } + + listener.logger.Info("docker events listener started", + "reconnect_backoff", listener.reconnectBackoff.String(), + ) + defer listener.logger.Info("docker events listener stopped") + + for { + if err := ctx.Err(); err != nil { + return err + } + + err := listener.runOnce(ctx) + if err == nil || errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + if ctxErr := ctx.Err(); ctxErr != nil { + return ctxErr + } + } + if err != nil && !errors.Is(err, context.Canceled) && !errors.Is(err, context.DeadlineExceeded) { + listener.logger.WarnContext(ctx, "docker events subscription dropped, will reconnect", + "err", err.Error(), + "backoff", listener.reconnectBackoff.String(), + ) + } + + if waitErr := listener.sleep(ctx); waitErr != nil { + return waitErr + } + } +} + +// Shutdown is a no-op; Run terminates on context cancellation. +func (listener *Listener) Shutdown(ctx context.Context) error { + if ctx == nil { + return errors.New("shutdown docker events listener: nil context") + } + return nil +} + +// runOnce subscribes once and processes events until the subscription +// reports an error or ctx is cancelled. +func (listener *Listener) runOnce(ctx context.Context) error { + events, errs, err := listener.docker.EventsListen(ctx) + if err != nil { + return fmt.Errorf("subscribe docker events: %w", err) + } + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case event, ok := <-events: + if !ok { + return errors.New("docker events channel closed") + } + listener.handleEvent(ctx, event) + case subscribeErr, ok := <-errs: + if !ok { + return errors.New("docker errors channel closed") + } + if subscribeErr == nil { + continue + } + return subscribeErr + } + } +} + +// sleep waits reconnectBackoff or until ctx is cancelled. 
+func (listener *Listener) sleep(ctx context.Context) error { + timer := time.NewTimer(listener.reconnectBackoff) + defer timer.Stop() + + select { + case <-ctx.Done(): + return ctx.Err() + case <-timer.C: + return nil + } +} + +// handleEvent translates one Docker event into a health-events emission +// (if any). All branches are exported via tests. +func (listener *Listener) handleEvent(ctx context.Context, event ports.DockerEvent) { + gameID := strings.TrimSpace(event.Labels[startruntime.LabelGameID]) + if gameID == "" { + return + } + + occurredAt := event.OccurredAt + if occurredAt.IsZero() { + occurredAt = listener.clock() + } + occurredAt = occurredAt.UTC() + + switch event.Action { + case actionDie: + if event.ExitCode == 0 { + return + } + listener.publish(ctx, ports.HealthEventEnvelope{ + GameID: gameID, + ContainerID: event.ContainerID, + EventType: health.EventTypeContainerExited, + OccurredAt: occurredAt, + Details: containerExitedDetails(event.ExitCode, false), + }) + case actionOOM: + listener.publish(ctx, ports.HealthEventEnvelope{ + GameID: gameID, + ContainerID: event.ContainerID, + EventType: health.EventTypeContainerOOM, + OccurredAt: occurredAt, + Details: containerOOMDetails(event.ExitCode), + }) + case actionDestroy: + if !listener.isUnexpectedDestroy(ctx, gameID, event.ContainerID) { + return + } + listener.publish(ctx, ports.HealthEventEnvelope{ + GameID: gameID, + ContainerID: event.ContainerID, + EventType: health.EventTypeContainerDisappeared, + OccurredAt: occurredAt, + Details: containerDisappearedDetails(), + }) + default: + return + } +} + +// isUnexpectedDestroy returns true when the destroy event came from a +// source other than Runtime Manager itself. The check is conservative: +// any read error treats the destroy as expected (we cannot tell), and +// only a record currently `running` whose `current_container_id` still +// equals the destroyed id is considered unexpected. +func (listener *Listener) isUnexpectedDestroy(ctx context.Context, gameID, containerID string) bool { + record, err := listener.runtimeRecords.Get(ctx, gameID) + switch { + case errors.Is(err, runtime.ErrNotFound): + return false + case err != nil: + listener.logger.WarnContext(ctx, "destroy lookup failed; suppressing emission", + "game_id", gameID, + "container_id", containerID, + "err", err.Error(), + ) + return false + } + if record.Status != runtime.StatusRunning { + return false + } + if record.CurrentContainerID != containerID { + return false + } + return true +} + +// publish emits one envelope through the configured publisher, updates +// the telemetry counter, and logs the outcome. All side effects are +// best-effort; a publish error degrades to a warning log. +func (listener *Listener) publish(ctx context.Context, envelope ports.HealthEventEnvelope) { + if err := listener.healthEvents.Publish(ctx, envelope); err != nil { + listener.logger.ErrorContext(ctx, "publish health event", + "game_id", envelope.GameID, + "container_id", envelope.ContainerID, + "event_type", string(envelope.EventType), + "err", err.Error(), + ) + return + } + + listener.telemetry.RecordHealthEvent(ctx, string(envelope.EventType)) + + logArgs := []any{ + "game_id", envelope.GameID, + "container_id", envelope.ContainerID, + "event_type", string(envelope.EventType), + } + logArgs = append(logArgs, logging.ContextAttrs(ctx)...) + listener.logger.InfoContext(ctx, "docker event published", logArgs...) 
+} + +// containerExitedDetails builds the JSON payload required by the +// `container_exited` AsyncAPI variant. +func containerExitedDetails(exitCode int, oom bool) json.RawMessage { + payload := struct { + ExitCode int `json:"exit_code"` + OOM bool `json:"oom"` + }{ExitCode: exitCode, OOM: oom} + encoded, _ := json.Marshal(payload) + return encoded +} + +// containerOOMDetails builds the JSON payload required by the +// `container_oom` AsyncAPI variant. +func containerOOMDetails(exitCode int) json.RawMessage { + payload := struct { + ExitCode int `json:"exit_code"` + }{ExitCode: exitCode} + encoded, _ := json.Marshal(payload) + return encoded +} + +// containerDisappearedDetails builds the empty JSON object the +// `container_disappeared` AsyncAPI variant requires. +func containerDisappearedDetails() json.RawMessage { + return json.RawMessage(`{}`) +} diff --git a/rtmanager/internal/worker/dockerevents/listener_test.go b/rtmanager/internal/worker/dockerevents/listener_test.go new file mode 100644 index 0000000..68fafcf --- /dev/null +++ b/rtmanager/internal/worker/dockerevents/listener_test.go @@ -0,0 +1,584 @@ +package dockerevents_test + +import ( + "context" + "encoding/json" + "errors" + "io" + "log/slog" + "sync" + "sync/atomic" + "testing" + "time" + + "galaxy/rtmanager/internal/domain/health" + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/ports" + "galaxy/rtmanager/internal/service/startruntime" + "galaxy/rtmanager/internal/telemetry" + "galaxy/rtmanager/internal/worker/dockerevents" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func silentLogger() *slog.Logger { + return slog.New(slog.NewTextHandler(io.Discard, nil)) +} + +// fakeDockerEvents is a minimal ports.DockerClient implementation for +// the listener: only EventsListen is exercised. Tests push events +// through the eventsCh channel and observe reconnect attempts via the +// counter. 
+type fakeDockerEvents struct { + mu sync.Mutex + subscribeCount int32 + subscribeErr error + currentEventsCh chan ports.DockerEvent + currentErrsCh chan error + subscribed chan struct{} +} + +func newFakeDockerEvents() *fakeDockerEvents { + return &fakeDockerEvents{subscribed: make(chan struct{}, 16)} +} + +func (f *fakeDockerEvents) EventsListen(ctx context.Context) (<-chan ports.DockerEvent, <-chan error, error) { + atomic.AddInt32(&f.subscribeCount, 1) + f.mu.Lock() + if f.subscribeErr != nil { + err := f.subscribeErr + f.mu.Unlock() + return nil, nil, err + } + events := make(chan ports.DockerEvent, 16) + errs := make(chan error, 1) + f.currentEventsCh = events + f.currentErrsCh = errs + f.mu.Unlock() + + select { + case f.subscribed <- struct{}{}: + default: + } + + go func() { + <-ctx.Done() + f.mu.Lock() + if f.currentEventsCh == events { + close(events) + close(errs) + f.currentEventsCh = nil + f.currentErrsCh = nil + } + f.mu.Unlock() + }() + return events, errs, nil +} + +func (f *fakeDockerEvents) sendEvent(event ports.DockerEvent) { + f.mu.Lock() + ch := f.currentEventsCh + f.mu.Unlock() + if ch != nil { + ch <- event + } +} + +func (f *fakeDockerEvents) sendErr(err error) { + f.mu.Lock() + ch := f.currentErrsCh + f.mu.Unlock() + if ch != nil { + ch <- err + } +} + +func (f *fakeDockerEvents) waitSubscribed(t *testing.T) { + t.Helper() + select { + case <-f.subscribed: + case <-time.After(time.Second): + t.Fatalf("timed out waiting for EventsListen subscription") + } +} + +func (f *fakeDockerEvents) subscriptions() int { + return int(atomic.LoadInt32(&f.subscribeCount)) +} + +// Unused DockerClient methods. The listener only consumes EventsListen. +func (f *fakeDockerEvents) EnsureNetwork(_ context.Context, _ string) error { return nil } +func (f *fakeDockerEvents) PullImage(_ context.Context, _ string, _ ports.PullPolicy) error { + return nil +} +func (f *fakeDockerEvents) InspectImage(_ context.Context, _ string) (ports.ImageInspect, error) { + return ports.ImageInspect{}, nil +} +func (f *fakeDockerEvents) InspectContainer(_ context.Context, _ string) (ports.ContainerInspect, error) { + return ports.ContainerInspect{}, nil +} +func (f *fakeDockerEvents) Run(_ context.Context, _ ports.RunSpec) (ports.RunResult, error) { + return ports.RunResult{}, nil +} +func (f *fakeDockerEvents) Stop(_ context.Context, _ string, _ time.Duration) error { return nil } +func (f *fakeDockerEvents) Remove(_ context.Context, _ string) error { return nil } +func (f *fakeDockerEvents) List(_ context.Context, _ ports.ListFilter) ([]ports.ContainerSummary, error) { + return nil, nil +} + +// fakeRuntimeRecords supports Get only; the listener does not call any +// other method. Tests seed records via Set. 
+type fakeRuntimeRecords struct { + mu sync.Mutex + stored map[string]runtime.RuntimeRecord + getErr error +} + +func newFakeRuntimeRecords() *fakeRuntimeRecords { + return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}} +} + +func (s *fakeRuntimeRecords) Set(record runtime.RuntimeRecord) { + s.mu.Lock() + defer s.mu.Unlock() + s.stored[record.GameID] = record +} + +func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) { + s.mu.Lock() + defer s.mu.Unlock() + if s.getErr != nil { + return runtime.RuntimeRecord{}, s.getErr + } + record, ok := s.stored[gameID] + if !ok { + return runtime.RuntimeRecord{}, runtime.ErrNotFound + } + return record, nil +} + +func (s *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error { return nil } +func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error { + return nil +} +func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) { return nil, nil } +func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) { + return nil, nil +} + +// fakeHealthEvents captures every Publish call. +type fakeHealthEvents struct { + mu sync.Mutex + published []ports.HealthEventEnvelope + publishErr error +} + +func (s *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error { + s.mu.Lock() + defer s.mu.Unlock() + if s.publishErr != nil { + return s.publishErr + } + s.published = append(s.published, envelope) + return nil +} + +func (s *fakeHealthEvents) Published() []ports.HealthEventEnvelope { + s.mu.Lock() + defer s.mu.Unlock() + out := make([]ports.HealthEventEnvelope, len(s.published)) + copy(out, s.published) + return out +} + +// --- harness ---------------------------------------------------------- + +type harness struct { + docker *fakeDockerEvents + records *fakeRuntimeRecords + health *fakeHealthEvents + listener *dockerevents.Listener + clockNow time.Time +} + +func newHarness(t *testing.T) *harness { + t.Helper() + telemetryRuntime, err := telemetry.NewWithProviders(nil, nil) + require.NoError(t, err) + + docker := newFakeDockerEvents() + records := newFakeRuntimeRecords() + healthEvents := &fakeHealthEvents{} + clockNow := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC) + + listener, err := dockerevents.NewListener(dockerevents.Dependencies{ + Docker: docker, + RuntimeRecords: records, + HealthEvents: healthEvents, + Telemetry: telemetryRuntime, + Clock: func() time.Time { return clockNow }, + Logger: silentLogger(), + ReconnectBackoff: 5 * time.Millisecond, + }) + require.NoError(t, err) + + return &harness{ + docker: docker, + records: records, + health: healthEvents, + listener: listener, + clockNow: clockNow, + } +} + +// --- constructor ------------------------------------------------------- + +func TestNewListenerRejectsMissingDeps(t *testing.T) { + telemetryRuntime, err := telemetry.NewWithProviders(nil, nil) + require.NoError(t, err) + + cases := []dockerevents.Dependencies{ + {}, + {Docker: newFakeDockerEvents()}, + {Docker: newFakeDockerEvents(), RuntimeRecords: newFakeRuntimeRecords()}, + {Docker: newFakeDockerEvents(), RuntimeRecords: newFakeRuntimeRecords(), HealthEvents: &fakeHealthEvents{}}, + } + for index, deps := range cases { + _, err := dockerevents.NewListener(deps) + require.Errorf(t, err, "case %d should fail", index) + } + + _, err = dockerevents.NewListener(dockerevents.Dependencies{ + Docker: 
newFakeDockerEvents(), + RuntimeRecords: newFakeRuntimeRecords(), + HealthEvents: &fakeHealthEvents{}, + Telemetry: telemetryRuntime, + }) + require.NoError(t, err) +} + +// --- Run lifecycle ----------------------------------------------------- + +func TestRunPublishesContainerExitedOnNonZeroDie(t *testing.T) { + h := newHarness(t) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + done := runListener(h, ctx) + h.docker.waitSubscribed(t) + + occurredAt := h.clockNow.Add(-time.Minute) + h.docker.sendEvent(ports.DockerEvent{ + Action: "die", + ContainerID: "ctr-die", + Labels: map[string]string{startruntime.LabelGameID: "game-die"}, + ExitCode: 137, + OccurredAt: occurredAt, + }) + + require.Eventually(t, func() bool { return len(h.health.Published()) == 1 }, time.Second, 5*time.Millisecond) + + envelopes := h.health.Published() + require.Len(t, envelopes, 1) + envelope := envelopes[0] + assert.Equal(t, "game-die", envelope.GameID) + assert.Equal(t, "ctr-die", envelope.ContainerID) + assert.Equal(t, health.EventTypeContainerExited, envelope.EventType) + assert.True(t, envelope.OccurredAt.Equal(occurredAt.UTC())) + assertJSONEqual(t, `{"exit_code":137,"oom":false}`, envelope.Details) + + cancel() + waitDone(t, done) +} + +func TestRunSkipsZeroExitDie(t *testing.T) { + h := newHarness(t) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + done := runListener(h, ctx) + h.docker.waitSubscribed(t) + + h.docker.sendEvent(ports.DockerEvent{ + Action: "die", + ContainerID: "ctr-graceful", + Labels: map[string]string{startruntime.LabelGameID: "game-graceful"}, + ExitCode: 0, + OccurredAt: h.clockNow, + }) + + time.Sleep(20 * time.Millisecond) + assert.Empty(t, h.health.Published(), "graceful exit must not emit container_exited") + + cancel() + waitDone(t, done) +} + +func TestRunPublishesContainerOOM(t *testing.T) { + h := newHarness(t) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + done := runListener(h, ctx) + h.docker.waitSubscribed(t) + + h.docker.sendEvent(ports.DockerEvent{ + Action: "oom", + ContainerID: "ctr-oom", + Labels: map[string]string{startruntime.LabelGameID: "game-oom"}, + ExitCode: 137, + OccurredAt: h.clockNow, + }) + + require.Eventually(t, func() bool { return len(h.health.Published()) == 1 }, time.Second, 5*time.Millisecond) + envelope := h.health.Published()[0] + assert.Equal(t, health.EventTypeContainerOOM, envelope.EventType) + assertJSONEqual(t, `{"exit_code":137}`, envelope.Details) + + cancel() + waitDone(t, done) +} + +func TestRunDestroyEmitsDisappearedOnlyForRunningRecordWithMatchingContainer(t *testing.T) { + h := newHarness(t) + + startedAt := h.clockNow.Add(-time.Hour) + h.records.Set(runtime.RuntimeRecord{ + GameID: "game-d", + Status: runtime.StatusRunning, + CurrentContainerID: "ctr-current", + CurrentImageRef: "galaxy/game:1.0.0", + EngineEndpoint: "http://galaxy-game-game-d:8080", + StatePath: "/var/lib/galaxy/games/game-d", + DockerNetwork: "galaxy-net", + StartedAt: &startedAt, + LastOpAt: h.clockNow, + CreatedAt: startedAt, + }) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + done := runListener(h, ctx) + h.docker.waitSubscribed(t) + + // Matching destroy → emit. 
+ h.docker.sendEvent(ports.DockerEvent{ + Action: "destroy", + ContainerID: "ctr-current", + Labels: map[string]string{startruntime.LabelGameID: "game-d"}, + OccurredAt: h.clockNow, + }) + + require.Eventually(t, func() bool { return len(h.health.Published()) == 1 }, time.Second, 5*time.Millisecond) + envelope := h.health.Published()[0] + assert.Equal(t, health.EventTypeContainerDisappeared, envelope.EventType) + assertJSONEqual(t, `{}`, envelope.Details) + + // Non-matching container id → skip. + h.docker.sendEvent(ports.DockerEvent{ + Action: "destroy", + ContainerID: "ctr-old", + Labels: map[string]string{startruntime.LabelGameID: "game-d"}, + OccurredAt: h.clockNow, + }) + time.Sleep(20 * time.Millisecond) + assert.Len(t, h.health.Published(), 1, "destroy on outdated container_id must not emit again") + + cancel() + waitDone(t, done) +} + +func TestRunDestroySkipsNonRunningRecord(t *testing.T) { + h := newHarness(t) + + startedAt := h.clockNow.Add(-time.Hour) + stoppedAt := h.clockNow.Add(-time.Minute) + h.records.Set(runtime.RuntimeRecord{ + GameID: "game-stopped", + Status: runtime.StatusStopped, + CurrentContainerID: "ctr-stopped", + CurrentImageRef: "galaxy/game:1.0.0", + EngineEndpoint: "http://galaxy-game-game-stopped:8080", + StatePath: "/var/lib/galaxy/games/game-stopped", + DockerNetwork: "galaxy-net", + StartedAt: &startedAt, + StoppedAt: &stoppedAt, + LastOpAt: stoppedAt, + CreatedAt: startedAt, + }) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + done := runListener(h, ctx) + h.docker.waitSubscribed(t) + + h.docker.sendEvent(ports.DockerEvent{ + Action: "destroy", + ContainerID: "ctr-stopped", + Labels: map[string]string{startruntime.LabelGameID: "game-stopped"}, + OccurredAt: h.clockNow, + }) + + time.Sleep(20 * time.Millisecond) + assert.Empty(t, h.health.Published(), "destroy on non-running record must not emit") + + cancel() + waitDone(t, done) +} + +func TestRunDestroySkipsUnknownGame(t *testing.T) { + h := newHarness(t) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + done := runListener(h, ctx) + h.docker.waitSubscribed(t) + + h.docker.sendEvent(ports.DockerEvent{ + Action: "destroy", + ContainerID: "ctr-unknown", + Labels: map[string]string{startruntime.LabelGameID: "game-unknown"}, + OccurredAt: h.clockNow, + }) + + time.Sleep(20 * time.Millisecond) + assert.Empty(t, h.health.Published(), "destroy with no record must not emit") + + cancel() + waitDone(t, done) +} + +func TestRunSkipsEventsWithoutGameIDLabel(t *testing.T) { + h := newHarness(t) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + done := runListener(h, ctx) + h.docker.waitSubscribed(t) + + h.docker.sendEvent(ports.DockerEvent{ + Action: "die", + ContainerID: "ctr-foreign", + Labels: map[string]string{}, + ExitCode: 1, + OccurredAt: h.clockNow, + }) + + time.Sleep(20 * time.Millisecond) + assert.Empty(t, h.health.Published(), "events without game_id label must not emit") + + cancel() + waitDone(t, done) +} + +func TestRunSkipsUnrelatedActions(t *testing.T) { + h := newHarness(t) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + done := runListener(h, ctx) + h.docker.waitSubscribed(t) + + for _, action := range []string{"start", "kill", "pause", "create"} { + h.docker.sendEvent(ports.DockerEvent{ + Action: action, + ContainerID: "ctr-x", + Labels: map[string]string{startruntime.LabelGameID: "game-x"}, + OccurredAt: h.clockNow, + }) + } + + time.Sleep(20 * time.Millisecond) + 
assert.Empty(t, h.health.Published(), "non-die/oom/destroy actions must not emit") + + cancel() + waitDone(t, done) +} + +func TestRunReconnectsAfterSubscriptionError(t *testing.T) { + h := newHarness(t) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + done := runListener(h, ctx) + h.docker.waitSubscribed(t) + + h.docker.sendErr(errors.New("connection reset")) + h.docker.waitSubscribed(t) + + // Send an event after reconnect to confirm pipeline resumed. + h.docker.sendEvent(ports.DockerEvent{ + Action: "die", + ContainerID: "ctr-after", + Labels: map[string]string{startruntime.LabelGameID: "game-after"}, + ExitCode: 1, + OccurredAt: h.clockNow, + }) + + require.Eventually(t, func() bool { return len(h.health.Published()) == 1 }, time.Second, 5*time.Millisecond) + assert.GreaterOrEqual(t, h.docker.subscriptions(), 2, "listener must reconnect after error") + + cancel() + waitDone(t, done) +} + +func TestRunFillsOccurredAtWhenZero(t *testing.T) { + h := newHarness(t) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + done := runListener(h, ctx) + h.docker.waitSubscribed(t) + + h.docker.sendEvent(ports.DockerEvent{ + Action: "oom", + ContainerID: "ctr-time", + Labels: map[string]string{startruntime.LabelGameID: "game-time"}, + ExitCode: 137, + }) + + require.Eventually(t, func() bool { return len(h.health.Published()) == 1 }, time.Second, 5*time.Millisecond) + envelope := h.health.Published()[0] + assert.True(t, envelope.OccurredAt.Equal(h.clockNow.UTC())) + + cancel() + waitDone(t, done) +} + +// --- helpers ----------------------------------------------------------- + +func runListener(h *harness, ctx context.Context) chan error { + done := make(chan error, 1) + go func() { done <- h.listener.Run(ctx) }() + return done +} + +func waitDone(t *testing.T, done chan error) { + t.Helper() + select { + case <-done: + case <-time.After(time.Second): + t.Fatalf("Run did not exit within timeout") + } +} + +func assertJSONEqual(t *testing.T, want string, got json.RawMessage) { + t.Helper() + var wantValue, gotValue any + require.NoError(t, json.Unmarshal([]byte(want), &wantValue)) + require.NoError(t, json.Unmarshal(got, &gotValue)) + assert.Equal(t, wantValue, gotValue) +} + +// --- shutdown ---------------------------------------------------------- + +func TestShutdownIsNoOp(t *testing.T) { + h := newHarness(t) + require.NoError(t, h.listener.Shutdown(context.Background())) +} + +// --- compile-time safety ---------------------------------------------- + +var ( + _ ports.DockerClient = (*fakeDockerEvents)(nil) + _ ports.RuntimeRecordStore = (*fakeRuntimeRecords)(nil) + _ ports.HealthEventPublisher = (*fakeHealthEvents)(nil) +) diff --git a/rtmanager/internal/worker/dockerinspect/worker.go b/rtmanager/internal/worker/dockerinspect/worker.go new file mode 100644 index 0000000..3e2328e --- /dev/null +++ b/rtmanager/internal/worker/dockerinspect/worker.go @@ -0,0 +1,318 @@ +// Package dockerinspect runs the periodic Docker inspect described in +// `rtmanager/README.md §Health Monitoring`. 
+// +// On every tick the worker lists `runtime_records.status=running`, +// inspects each container, and emits `inspect_unhealthy` when any of +// the following holds: +// +// - `RestartCount` increased between observations (delta detection +// requires a prior observation; the first inspect of a record only +// records the baseline); +// - `State.Status != "running"`; +// - `State.Health.Status == "unhealthy"` (only meaningful when the +// image declares a Docker HEALTHCHECK). +// +// `ErrContainerNotFound` is left to the reconciler — the inspect +// worker logs and skips so that `container_disappeared` emission +// stays single-sourced (Docker events listener + reconciler). +// +// Per-game state is pruned at the start of every tick against the +// freshly-read running list, so a stopped or removed game never +// carries a stale baseline into a new lifecycle. +package dockerinspect + +import ( + "context" + "encoding/json" + "errors" + "log/slog" + "sync" + "time" + + "galaxy/rtmanager/internal/domain/health" + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/logging" + "galaxy/rtmanager/internal/ports" + "galaxy/rtmanager/internal/telemetry" +) + +// dockerStateRunning is the verbatim Docker `State.Status` value the +// worker treats as healthy. +const dockerStateRunning = "running" + +// dockerHealthUnhealthy is the verbatim Docker `State.Health.Status` +// value the worker treats as unhealthy. +const dockerHealthUnhealthy = "unhealthy" + +// Dependencies groups the collaborators required by Worker. +type Dependencies struct { + // Docker provides the InspectContainer surface. + Docker ports.DockerClient + + // RuntimeRecords lists running games on every tick. + RuntimeRecords ports.RuntimeRecordStore + + // HealthEvents emits `inspect_unhealthy` entries. + HealthEvents ports.HealthEventPublisher + + // Telemetry records one health-event counter per emission. + Telemetry *telemetry.Runtime + + // Interval bounds the tick period. + Interval time.Duration + + // Clock supplies the wall-clock used for emission timestamps. + // Defaults to `time.Now` when nil. + Clock func() time.Time + + // Logger receives structured worker-level events. Defaults to + // `slog.Default()` when nil. + Logger *slog.Logger +} + +// Worker drives the periodic inspect loop. +type Worker struct { + docker ports.DockerClient + runtimeRecords ports.RuntimeRecordStore + healthEvents ports.HealthEventPublisher + telemetry *telemetry.Runtime + + interval time.Duration + + clock func() time.Time + logger *slog.Logger + + mu sync.Mutex + states map[string]*inspectState +} + +// inspectState stores the per-game baseline. Owned by Worker and +// protected by Worker.mu. +type inspectState struct { + lastRestartCount int + seen bool +} + +// NewWorker constructs one Worker from deps. 
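+//
+// A minimal wiring sketch (hypothetical variables and an illustrative tick
+// period; real values come from the application wiring):
+//
+//	inspector, err := dockerinspect.NewWorker(dockerinspect.Dependencies{
+//		Docker:         dockerClient,     // ports.DockerClient adapter (assumed)
+//		RuntimeRecords: recordStore,      // ports.RuntimeRecordStore (assumed)
+//		HealthEvents:   publisher,        // ports.HealthEventPublisher (assumed)
+//		Telemetry:      telemetryRuntime,
+//		Interval:       30 * time.Second, // illustrative cadence
+//	})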
+func NewWorker(deps Dependencies) (*Worker, error) { + switch { + case deps.Docker == nil: + return nil, errors.New("new docker inspect worker: nil docker client") + case deps.RuntimeRecords == nil: + return nil, errors.New("new docker inspect worker: nil runtime records store") + case deps.HealthEvents == nil: + return nil, errors.New("new docker inspect worker: nil health events publisher") + case deps.Telemetry == nil: + return nil, errors.New("new docker inspect worker: nil telemetry runtime") + case deps.Interval <= 0: + return nil, errors.New("new docker inspect worker: interval must be positive") + } + + clock := deps.Clock + if clock == nil { + clock = time.Now + } + logger := deps.Logger + if logger == nil { + logger = slog.Default() + } + + return &Worker{ + docker: deps.Docker, + runtimeRecords: deps.RuntimeRecords, + healthEvents: deps.HealthEvents, + telemetry: deps.Telemetry, + interval: deps.Interval, + clock: clock, + logger: logger.With("worker", "rtmanager.dockerinspect"), + states: map[string]*inspectState{}, + }, nil +} + +// Run drives the inspect loop until ctx is cancelled. Per-tick errors +// are absorbed; the loop only exits on context cancellation. +func (worker *Worker) Run(ctx context.Context) error { + if worker == nil { + return errors.New("run docker inspect worker: nil worker") + } + if ctx == nil { + return errors.New("run docker inspect worker: nil context") + } + if err := ctx.Err(); err != nil { + return err + } + + worker.logger.Info("docker inspect worker started", + "interval", worker.interval.String(), + ) + defer worker.logger.Info("docker inspect worker stopped") + + ticker := time.NewTicker(worker.interval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + worker.tick(ctx) + } + } +} + +// Shutdown is a no-op; Run terminates on context cancellation. +func (worker *Worker) Shutdown(ctx context.Context) error { + if ctx == nil { + return errors.New("shutdown docker inspect worker: nil context") + } + return nil +} + +// Tick performs one inspect pass. Exported so tests can drive the +// worker deterministically without spinning a real ticker. +func (worker *Worker) Tick(ctx context.Context) { + worker.tick(ctx) +} + +// tick performs one full pass: list running records, prune state for +// stopped games, then inspect every running container sequentially. +// Inspect calls are cheap; sequential execution avoids fan-out against +// the Docker daemon. +func (worker *Worker) tick(ctx context.Context) { + if err := ctx.Err(); err != nil { + return + } + + records, err := worker.runtimeRecords.ListByStatus(ctx, runtime.StatusRunning) + if err != nil { + worker.logger.WarnContext(ctx, "list running records", + "err", err.Error(), + ) + return + } + + worker.pruneStates(records) + + for _, record := range records { + if err := ctx.Err(); err != nil { + return + } + worker.inspectOne(ctx, record) + } +} + +// pruneStates removes per-game baselines for games no longer in the +// running list. 
+func (worker *Worker) pruneStates(records []runtime.RuntimeRecord) { + worker.mu.Lock() + defer worker.mu.Unlock() + if len(worker.states) == 0 { + return + } + running := make(map[string]struct{}, len(records)) + for _, record := range records { + running[record.GameID] = struct{}{} + } + for gameID := range worker.states { + if _, ok := running[gameID]; !ok { + delete(worker.states, gameID) + } + } +} + +// inspectOne issues one InspectContainer call and emits +// `inspect_unhealthy` when the observation crosses any of the three +// trigger conditions. The first observation of a record only seeds the +// baseline; deltas need at least two ticks. +func (worker *Worker) inspectOne(ctx context.Context, record runtime.RuntimeRecord) { + inspect, err := worker.docker.InspectContainer(ctx, record.CurrentContainerID) + if err != nil { + if errors.Is(err, ports.ErrContainerNotFound) { + worker.logger.DebugContext(ctx, "inspect skipped: container missing", + "game_id", record.GameID, + "container_id", record.CurrentContainerID, + ) + return + } + worker.logger.WarnContext(ctx, "inspect failed", + "game_id", record.GameID, + "container_id", record.CurrentContainerID, + "err", err.Error(), + ) + return + } + + worker.mu.Lock() + state, ok := worker.states[record.GameID] + if !ok { + state = &inspectState{} + worker.states[record.GameID] = state + } + prev := *state + state.lastRestartCount = inspect.RestartCount + state.seen = true + worker.mu.Unlock() + + emit := false + switch { + case prev.seen && inspect.RestartCount > prev.lastRestartCount: + emit = true + case inspect.Status != dockerStateRunning: + emit = true + case inspect.Health == dockerHealthUnhealthy: + emit = true + } + if !emit { + return + } + + worker.publish(ctx, ports.HealthEventEnvelope{ + GameID: record.GameID, + ContainerID: record.CurrentContainerID, + EventType: health.EventTypeInspectUnhealthy, + OccurredAt: worker.clock().UTC(), + Details: inspectUnhealthyDetails(inspect.RestartCount, inspect.Status, inspect.Health), + }) +} + +// publish emits one envelope through the configured publisher, updates +// the telemetry counter, and logs the outcome. Failures degrade to a +// warning log per `rtmanager/README.md §Notification Contracts`. +func (worker *Worker) publish(ctx context.Context, envelope ports.HealthEventEnvelope) { + if err := worker.healthEvents.Publish(ctx, envelope); err != nil { + worker.logger.ErrorContext(ctx, "publish health event", + "game_id", envelope.GameID, + "container_id", envelope.ContainerID, + "event_type", string(envelope.EventType), + "err", err.Error(), + ) + return + } + + worker.telemetry.RecordHealthEvent(ctx, string(envelope.EventType)) + + logArgs := []any{ + "game_id", envelope.GameID, + "container_id", envelope.ContainerID, + "event_type", string(envelope.EventType), + } + logArgs = append(logArgs, logging.ContextAttrs(ctx)...) + worker.logger.InfoContext(ctx, "inspect event published", logArgs...) +} + +// inspectUnhealthyDetails builds the JSON payload required by the +// `inspect_unhealthy` AsyncAPI variant. All three fields are required +// even when their value is the zero value. 
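+//
+// For example, a container observed as exited after three restarts would
+// encode as (illustrative values):
+//
+//	{"restart_count":3,"state":"exited","health":""}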
+func inspectUnhealthyDetails(restartCount int, state, health string) json.RawMessage { + payload := struct { + RestartCount int `json:"restart_count"` + State string `json:"state"` + Health string `json:"health"` + }{ + RestartCount: restartCount, + State: state, + Health: health, + } + encoded, _ := json.Marshal(payload) + return encoded +} diff --git a/rtmanager/internal/worker/dockerinspect/worker_test.go b/rtmanager/internal/worker/dockerinspect/worker_test.go new file mode 100644 index 0000000..8fa6979 --- /dev/null +++ b/rtmanager/internal/worker/dockerinspect/worker_test.go @@ -0,0 +1,388 @@ +package dockerinspect_test + +import ( + "context" + "encoding/json" + "errors" + "io" + "log/slog" + "sync" + "testing" + "time" + + "galaxy/rtmanager/internal/adapters/docker/mocks" + "galaxy/rtmanager/internal/domain/health" + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/ports" + "galaxy/rtmanager/internal/telemetry" + "galaxy/rtmanager/internal/worker/dockerinspect" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" +) + +func silentLogger() *slog.Logger { + return slog.New(slog.NewTextHandler(io.Discard, nil)) +} + +// fakeRuntimeRecords supports ListByStatus only. +type fakeRuntimeRecords struct { + mu sync.Mutex + running []runtime.RuntimeRecord + listErr error +} + +func newFakeRuntimeRecords() *fakeRuntimeRecords { return &fakeRuntimeRecords{} } + +func (s *fakeRuntimeRecords) Set(records ...runtime.RuntimeRecord) { + s.mu.Lock() + defer s.mu.Unlock() + s.running = append([]runtime.RuntimeRecord(nil), records...) +} + +func (s *fakeRuntimeRecords) Clear() { + s.mu.Lock() + defer s.mu.Unlock() + s.running = nil +} + +func (s *fakeRuntimeRecords) Get(_ context.Context, _ string) (runtime.RuntimeRecord, error) { + return runtime.RuntimeRecord{}, runtime.ErrNotFound +} +func (s *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error { return nil } +func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error { + return nil +} +func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) { + return nil, nil +} + +func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, status runtime.Status) ([]runtime.RuntimeRecord, error) { + s.mu.Lock() + defer s.mu.Unlock() + if s.listErr != nil { + return nil, s.listErr + } + if status != runtime.StatusRunning { + return nil, nil + } + out := make([]runtime.RuntimeRecord, len(s.running)) + copy(out, s.running) + return out, nil +} + +// fakeHealthEvents captures every Publish call. 
+type fakeHealthEvents struct { + mu sync.Mutex + published []ports.HealthEventEnvelope + publishErr error +} + +func (s *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error { + s.mu.Lock() + defer s.mu.Unlock() + if s.publishErr != nil { + return s.publishErr + } + s.published = append(s.published, envelope) + return nil +} + +func (s *fakeHealthEvents) Published() []ports.HealthEventEnvelope { + s.mu.Lock() + defer s.mu.Unlock() + out := make([]ports.HealthEventEnvelope, len(s.published)) + copy(out, s.published) + return out +} + +// --- harness ---------------------------------------------------------- + +type harness struct { + docker *mocks.MockDockerClient + records *fakeRuntimeRecords + health *fakeHealthEvents + worker *dockerinspect.Worker + now time.Time +} + +func newHarness(t *testing.T) *harness { + t.Helper() + ctrl := gomock.NewController(t) + t.Cleanup(ctrl.Finish) + + telemetryRuntime, err := telemetry.NewWithProviders(nil, nil) + require.NoError(t, err) + + docker := mocks.NewMockDockerClient(ctrl) + records := newFakeRuntimeRecords() + healthEvents := &fakeHealthEvents{} + now := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC) + + worker, err := dockerinspect.NewWorker(dockerinspect.Dependencies{ + Docker: docker, + RuntimeRecords: records, + HealthEvents: healthEvents, + Telemetry: telemetryRuntime, + Interval: 50 * time.Millisecond, + Clock: func() time.Time { return now }, + Logger: silentLogger(), + }) + require.NoError(t, err) + + return &harness{ + docker: docker, + records: records, + health: healthEvents, + worker: worker, + now: now, + } +} + +func runningRecord(gameID string) runtime.RuntimeRecord { + startedAt := time.Date(2026, 4, 27, 11, 0, 0, 0, time.UTC) + return runtime.RuntimeRecord{ + GameID: gameID, + Status: runtime.StatusRunning, + CurrentContainerID: "ctr-" + gameID, + CurrentImageRef: "galaxy/game:1.0.0", + EngineEndpoint: "http://galaxy-game-" + gameID + ":8080", + StatePath: "/var/lib/galaxy/games/" + gameID, + DockerNetwork: "galaxy-net", + StartedAt: &startedAt, + LastOpAt: startedAt, + CreatedAt: startedAt, + } +} + +// --- constructor ------------------------------------------------------ + +func TestNewWorkerRejectsMissingDeps(t *testing.T) { + ctrl := gomock.NewController(t) + t.Cleanup(ctrl.Finish) + telemetryRuntime, err := telemetry.NewWithProviders(nil, nil) + require.NoError(t, err) + + base := dockerinspect.Dependencies{ + Docker: mocks.NewMockDockerClient(ctrl), + RuntimeRecords: newFakeRuntimeRecords(), + HealthEvents: &fakeHealthEvents{}, + Telemetry: telemetryRuntime, + Interval: time.Second, + } + + defectives := []dockerinspect.Dependencies{ + {}, + {Docker: base.Docker}, + {Docker: base.Docker, RuntimeRecords: base.RuntimeRecords}, + {Docker: base.Docker, RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents}, + {Docker: base.Docker, RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents, Telemetry: base.Telemetry}, + } + for index, deps := range defectives { + _, err := dockerinspect.NewWorker(deps) + require.Errorf(t, err, "case %d should fail", index) + } + + _, err = dockerinspect.NewWorker(base) + require.NoError(t, err) +} + +// --- behaviour -------------------------------------------------------- + +func TestTickFirstObservationOnlySeedsBaseline(t *testing.T) { + h := newHarness(t) + h.records.Set(runningRecord("game-a")) + + h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-a").Return(ports.ContainerInspect{ + ID: "ctr-game-a", + Status: "running", 
+ Health: "", + RestartCount: 2, + }, nil) + + h.worker.Tick(context.Background()) + assert.Empty(t, h.health.Published(), "first observation seeds baseline only") +} + +func TestTickRestartCountGrowthEmits(t *testing.T) { + h := newHarness(t) + h.records.Set(runningRecord("game-a")) + + gomock.InOrder( + h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-a").Return(ports.ContainerInspect{ + ID: "ctr-game-a", Status: "running", RestartCount: 2, + }, nil), + h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-a").Return(ports.ContainerInspect{ + ID: "ctr-game-a", Status: "running", RestartCount: 3, + }, nil), + ) + + h.worker.Tick(context.Background()) + h.worker.Tick(context.Background()) + + envelopes := h.health.Published() + require.Len(t, envelopes, 1) + envelope := envelopes[0] + assert.Equal(t, health.EventTypeInspectUnhealthy, envelope.EventType) + assert.Equal(t, "game-a", envelope.GameID) + assert.Equal(t, "ctr-game-a", envelope.ContainerID) + + var details struct { + RestartCount int `json:"restart_count"` + State string `json:"state"` + Health string `json:"health"` + } + require.NoError(t, json.Unmarshal(envelope.Details, &details)) + assert.Equal(t, 3, details.RestartCount) + assert.Equal(t, "running", details.State) + assert.Empty(t, details.Health) +} + +func TestTickStateNotRunningEmits(t *testing.T) { + h := newHarness(t) + h.records.Set(runningRecord("game-a")) + + h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-a").Return(ports.ContainerInspect{ + ID: "ctr-game-a", + Status: "exited", + Health: "", + RestartCount: 0, + }, nil) + + h.worker.Tick(context.Background()) + envelopes := h.health.Published() + require.Len(t, envelopes, 1, "state != running emits even on first observation") + envelope := envelopes[0] + assert.Equal(t, health.EventTypeInspectUnhealthy, envelope.EventType) + + var details struct { + RestartCount int `json:"restart_count"` + State string `json:"state"` + Health string `json:"health"` + } + require.NoError(t, json.Unmarshal(envelope.Details, &details)) + assert.Equal(t, "exited", details.State) +} + +func TestTickHealthUnhealthyEmits(t *testing.T) { + h := newHarness(t) + h.records.Set(runningRecord("game-a")) + + h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-a").Return(ports.ContainerInspect{ + ID: "ctr-game-a", + Status: "running", + Health: "unhealthy", + RestartCount: 0, + }, nil) + + h.worker.Tick(context.Background()) + envelopes := h.health.Published() + require.Len(t, envelopes, 1, "Health == unhealthy emits even on first observation") + envelope := envelopes[0] + assert.Equal(t, health.EventTypeInspectUnhealthy, envelope.EventType) + + var details struct { + Health string `json:"health"` + } + require.NoError(t, json.Unmarshal(envelope.Details, &details)) + assert.Equal(t, "unhealthy", details.Health) +} + +func TestTickHealthyDoesNotEmitOnSecondPass(t *testing.T) { + h := newHarness(t) + h.records.Set(runningRecord("game-a")) + + gomock.InOrder( + h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-a").Return(ports.ContainerInspect{ + ID: "ctr-game-a", Status: "running", RestartCount: 5, + }, nil), + h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-a").Return(ports.ContainerInspect{ + ID: "ctr-game-a", Status: "running", RestartCount: 5, + }, nil), + ) + + h.worker.Tick(context.Background()) + h.worker.Tick(context.Background()) + assert.Empty(t, h.health.Published(), "stable healthy observations must not emit") +} + +func TestTickContainerNotFoundIsSilent(t *testing.T) { + h := 
newHarness(t) + h.records.Set(runningRecord("game-a")) + + h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-a").Return(ports.ContainerInspect{}, ports.ErrContainerNotFound) + + h.worker.Tick(context.Background()) + assert.Empty(t, h.health.Published(), "ErrContainerNotFound must not emit; reconciler handles drift") +} + +func TestTickArbitraryInspectErrorIsAbsorbed(t *testing.T) { + h := newHarness(t) + h.records.Set(runningRecord("game-a")) + + h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-a").Return(ports.ContainerInspect{}, errors.New("docker daemon broken")) + + require.NotPanics(t, func() { h.worker.Tick(context.Background()) }) + assert.Empty(t, h.health.Published()) +} + +func TestTickPrunesStateForGamesNoLongerRunning(t *testing.T) { + h := newHarness(t) + h.records.Set(runningRecord("game-a")) + + gomock.InOrder( + h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-a").Return(ports.ContainerInspect{ + ID: "ctr-game-a", Status: "running", RestartCount: 5, + }, nil), + // After the game leaves running and re-enters, baseline must be + // reset; a smaller RestartCount must NOT emit (no delta from a + // stale state). + h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-a").Return(ports.ContainerInspect{ + ID: "ctr-game-a", Status: "running", RestartCount: 1, + }, nil), + ) + + h.worker.Tick(context.Background()) + h.records.Clear() + h.worker.Tick(context.Background()) + h.records.Set(runningRecord("game-a")) + h.worker.Tick(context.Background()) + + assert.Empty(t, h.health.Published(), "fresh baseline after re-running must not compare against stale lastRestartCount") +} + +func TestTickAbsorbsListError(t *testing.T) { + h := newHarness(t) + h.records.listErr = errors.New("pg down") + + require.NotPanics(t, func() { h.worker.Tick(context.Background()) }) + assert.Empty(t, h.health.Published()) +} + +func TestRunRespectsContextCancel(t *testing.T) { + h := newHarness(t) + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan error, 1) + go func() { done <- h.worker.Run(ctx) }() + + cancel() + select { + case err := <-done: + assert.ErrorIs(t, err, context.Canceled) + case <-time.After(time.Second): + t.Fatalf("Run did not exit after cancel") + } +} + +func TestShutdownIsNoOp(t *testing.T) { + h := newHarness(t) + require.NoError(t, h.worker.Shutdown(context.Background())) +} + +// --- compile-time safety ---------------------------------------------- + +var ( + _ ports.RuntimeRecordStore = (*fakeRuntimeRecords)(nil) + _ ports.HealthEventPublisher = (*fakeHealthEvents)(nil) +) diff --git a/rtmanager/internal/worker/healthprobe/worker.go b/rtmanager/internal/worker/healthprobe/worker.go new file mode 100644 index 0000000..ee3a9ae --- /dev/null +++ b/rtmanager/internal/worker/healthprobe/worker.go @@ -0,0 +1,411 @@ +// Package healthprobe runs the active HTTP `/healthz` probe described in +// `rtmanager/README.md §Health Monitoring`. +// +// On every tick the worker lists `runtime_records.status=running`, +// probes each engine endpoint in parallel (capped at +// defaultMaxConcurrency), and applies the +// RTMANAGER_PROBE_FAILURES_THRESHOLD hysteresis to emit `probe_failed` +// (after N consecutive failures) and `probe_recovered` (on the first +// success after a `probe_failed` was published). In-memory state is +// pruned at the start of every tick against the freshly-read running +// list, so a game that stops between ticks never accumulates stale +// failure counters. 
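+//
+// For example, with a failures threshold of 3 (illustrative value), a
+// single game progresses across ticks as follows:
+//
+//	tick 1: probe fails    → counter 1, nothing emitted
+//	tick 2: probe fails    → counter 2, nothing emitted
+//	tick 3: probe fails    → counter 3, `probe_failed` emitted
+//	tick 4: probe fails    → counter 4, suppressed (already published)
+//	tick 5: probe succeeds → `probe_recovered` emitted, counter reset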
+// +// Design rationale is captured in +// `rtmanager/docs/workers.md`. +package healthprobe + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "log/slog" + "net/http" + "strings" + "sync" + "time" + + "galaxy/rtmanager/internal/domain/health" + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/logging" + "galaxy/rtmanager/internal/ports" + "galaxy/rtmanager/internal/telemetry" +) + +// defaultMaxConcurrency caps the number of in-flight `/healthz` +// requests inside a single tick. RTM v1 is single-instance with a +// modest active-game count; the cap keeps a slow engine from delaying +// the rest of the cohort while preventing pathological fan-out if the +// running list grows. +const defaultMaxConcurrency = 16 + +// healthzPath is the engine probe path. Stable per +// `game/README.md §/healthz`. +const healthzPath = "/healthz" + +// Dependencies groups the collaborators required by Worker. +type Dependencies struct { + // RuntimeRecords lists running games on every tick. + RuntimeRecords ports.RuntimeRecordStore + + // HealthEvents emits `probe_failed` and `probe_recovered`. + HealthEvents ports.HealthEventPublisher + + // HTTPClient performs the engine `/healthz` request. Required. + // Production wiring supplies an `otelhttp`-instrumented client. + HTTPClient *http.Client + + // Telemetry records one health-event counter per emission. + Telemetry *telemetry.Runtime + + // Interval bounds the tick period. + Interval time.Duration + + // ProbeTimeout bounds one engine `/healthz` call. + ProbeTimeout time.Duration + + // FailuresThreshold is the consecutive-failure count that promotes + // the in-memory counter to a `probe_failed` emission. + FailuresThreshold int + + // MaxConcurrency caps the number of in-flight probes per tick. + // Defaults to defaultMaxConcurrency when zero or negative. + MaxConcurrency int + + // Clock supplies the wall-clock used for emission timestamps. + // Defaults to `time.Now` when nil. + Clock func() time.Time + + // Logger receives structured worker-level events. Defaults to + // `slog.Default()` when nil. + Logger *slog.Logger +} + +// Worker drives the periodic active-probe loop. +type Worker struct { + runtimeRecords ports.RuntimeRecordStore + healthEvents ports.HealthEventPublisher + httpClient *http.Client + telemetry *telemetry.Runtime + + interval time.Duration + probeTimeout time.Duration + failuresThreshold int + maxConcurrency int + + clock func() time.Time + logger *slog.Logger + + mu sync.Mutex + states map[string]*probeState +} + +// probeState stores the per-game hysteresis counters. Owned by Worker +// and protected by Worker.mu. +type probeState struct { + consecutiveFailures int + failurePublished bool +} + +// NewWorker constructs one Worker from deps. 
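+//
+// A minimal wiring sketch (hypothetical variables; the timing values are
+// illustrative, not defaults of this package):
+//
+//	prober, err := healthprobe.NewWorker(healthprobe.Dependencies{
+//		RuntimeRecords:    recordStore,      // ports.RuntimeRecordStore (assumed)
+//		HealthEvents:      publisher,        // ports.HealthEventPublisher (assumed)
+//		HTTPClient:        httpClient,       // otelhttp-instrumented in production wiring
+//		Telemetry:         telemetryRuntime,
+//		Interval:          15 * time.Second,
+//		ProbeTimeout:      2 * time.Second,
+//		FailuresThreshold: 3,
+//	})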
+func NewWorker(deps Dependencies) (*Worker, error) { + switch { + case deps.RuntimeRecords == nil: + return nil, errors.New("new health probe worker: nil runtime records store") + case deps.HealthEvents == nil: + return nil, errors.New("new health probe worker: nil health events publisher") + case deps.HTTPClient == nil: + return nil, errors.New("new health probe worker: nil http client") + case deps.Telemetry == nil: + return nil, errors.New("new health probe worker: nil telemetry runtime") + case deps.Interval <= 0: + return nil, errors.New("new health probe worker: interval must be positive") + case deps.ProbeTimeout <= 0: + return nil, errors.New("new health probe worker: probe timeout must be positive") + case deps.FailuresThreshold <= 0: + return nil, errors.New("new health probe worker: failures threshold must be positive") + } + + clock := deps.Clock + if clock == nil { + clock = time.Now + } + logger := deps.Logger + if logger == nil { + logger = slog.Default() + } + maxConcurrency := deps.MaxConcurrency + if maxConcurrency <= 0 { + maxConcurrency = defaultMaxConcurrency + } + + return &Worker{ + runtimeRecords: deps.RuntimeRecords, + healthEvents: deps.HealthEvents, + httpClient: deps.HTTPClient, + telemetry: deps.Telemetry, + interval: deps.Interval, + probeTimeout: deps.ProbeTimeout, + failuresThreshold: deps.FailuresThreshold, + maxConcurrency: maxConcurrency, + clock: clock, + logger: logger.With("worker", "rtmanager.healthprobe"), + states: map[string]*probeState{}, + }, nil +} + +// Run drives the probe loop until ctx is cancelled. Per-tick errors are +// absorbed; the loop only exits on context cancellation. +func (worker *Worker) Run(ctx context.Context) error { + if worker == nil { + return errors.New("run health probe worker: nil worker") + } + if ctx == nil { + return errors.New("run health probe worker: nil context") + } + if err := ctx.Err(); err != nil { + return err + } + + worker.logger.Info("health probe worker started", + "interval", worker.interval.String(), + "probe_timeout", worker.probeTimeout.String(), + "failures_threshold", worker.failuresThreshold, + "max_concurrency", worker.maxConcurrency, + ) + defer worker.logger.Info("health probe worker stopped") + + ticker := time.NewTicker(worker.interval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + worker.tick(ctx) + } + } +} + +// Shutdown is a no-op; Run terminates on context cancellation. +func (worker *Worker) Shutdown(ctx context.Context) error { + if ctx == nil { + return errors.New("shutdown health probe worker: nil context") + } + return nil +} + +// Tick performs one probe pass. Exported so tests can drive the worker +// deterministically without spinning a real ticker. +func (worker *Worker) Tick(ctx context.Context) { + worker.tick(ctx) +} + +// tick performs one full pass: list running records, prune state for +// stopped games, then probe every running game in parallel. 
+func (worker *Worker) tick(ctx context.Context) { + if err := ctx.Err(); err != nil { + return + } + + records, err := worker.runtimeRecords.ListByStatus(ctx, runtime.StatusRunning) + if err != nil { + worker.logger.WarnContext(ctx, "list running records", + "err", err.Error(), + ) + return + } + + worker.pruneStates(records) + + if len(records) == 0 { + return + } + + semaphore := make(chan struct{}, worker.maxConcurrency) + var waitGroup sync.WaitGroup + for _, record := range records { + select { + case <-ctx.Done(): + waitGroup.Wait() + return + case semaphore <- struct{}{}: + } + waitGroup.Add(1) + go func(record runtime.RuntimeRecord) { + defer waitGroup.Done() + defer func() { <-semaphore }() + worker.probeOne(ctx, record) + }(record) + } + waitGroup.Wait() +} + +// pruneStates removes per-game state for games no longer in the running +// list. Stopped or removed games therefore start with a clean counter +// the next time they re-enter `running`. +func (worker *Worker) pruneStates(records []runtime.RuntimeRecord) { + worker.mu.Lock() + defer worker.mu.Unlock() + if len(worker.states) == 0 { + return + } + running := make(map[string]struct{}, len(records)) + for _, record := range records { + running[record.GameID] = struct{}{} + } + for gameID := range worker.states { + if _, ok := running[gameID]; !ok { + delete(worker.states, gameID) + } + } +} + +// probeOne issues one `/healthz` request and updates hysteresis state. +func (worker *Worker) probeOne(ctx context.Context, record runtime.RuntimeRecord) { + probeCtx, cancel := context.WithTimeout(ctx, worker.probeTimeout) + defer cancel() + + endpoint := strings.TrimRight(record.EngineEndpoint, "/") + healthzPath + request, err := http.NewRequestWithContext(probeCtx, http.MethodGet, endpoint, nil) + if err != nil { + worker.recordFailure(ctx, record, 0, fmt.Errorf("build request: %w", err)) + return + } + + response, err := worker.httpClient.Do(request) + if err != nil { + worker.recordFailure(ctx, record, 0, err) + return + } + defer response.Body.Close() + + if response.StatusCode == http.StatusOK { + worker.recordSuccess(ctx, record) + return + } + worker.recordFailure(ctx, record, response.StatusCode, fmt.Errorf("unexpected status %d", response.StatusCode)) +} + +// recordSuccess updates state on a successful probe and emits +// `probe_recovered` when the prior tick had crossed the failure +// threshold. +func (worker *Worker) recordSuccess(ctx context.Context, record runtime.RuntimeRecord) { + worker.mu.Lock() + state, ok := worker.states[record.GameID] + if !ok { + worker.mu.Unlock() + return + } + if !state.failurePublished { + state.consecutiveFailures = 0 + worker.mu.Unlock() + return + } + priorFailureCount := state.consecutiveFailures + state.consecutiveFailures = 0 + state.failurePublished = false + worker.mu.Unlock() + + worker.publish(ctx, ports.HealthEventEnvelope{ + GameID: record.GameID, + ContainerID: record.CurrentContainerID, + EventType: health.EventTypeProbeRecovered, + OccurredAt: worker.clock().UTC(), + Details: probeRecoveredDetails(priorFailureCount), + }) +} + +// recordFailure updates state on a failed probe and emits +// `probe_failed` once the threshold is crossed. 
+func (worker *Worker) recordFailure(ctx context.Context, record runtime.RuntimeRecord, lastStatus int, lastErr error) {
+	worker.mu.Lock()
+	state, ok := worker.states[record.GameID]
+	if !ok {
+		state = &probeState{}
+		worker.states[record.GameID] = state
+	}
+	state.consecutiveFailures++
+	if state.failurePublished || state.consecutiveFailures < worker.failuresThreshold {
+		count := state.consecutiveFailures
+		worker.mu.Unlock()
+		worker.logger.DebugContext(ctx, "probe failure",
+			"game_id", record.GameID,
+			"consecutive_failures", count,
+			"threshold", worker.failuresThreshold,
+			"err", errString(lastErr),
+		)
+		return
+	}
+	state.failurePublished = true
+	count := state.consecutiveFailures
+	worker.mu.Unlock()
+
+	worker.publish(ctx, ports.HealthEventEnvelope{
+		GameID:      record.GameID,
+		ContainerID: record.CurrentContainerID,
+		EventType:   health.EventTypeProbeFailed,
+		OccurredAt:  worker.clock().UTC(),
+		Details:     probeFailedDetails(count, lastStatus, errString(lastErr)),
+	})
+}
+
+// publish emits one envelope through the configured publisher, updates
+// the telemetry counter, and logs the outcome. Publish failures are
+// logged at error level and otherwise absorbed per
+// `rtmanager/README.md §Notification Contracts`.
+func (worker *Worker) publish(ctx context.Context, envelope ports.HealthEventEnvelope) {
+	if err := worker.healthEvents.Publish(ctx, envelope); err != nil {
+		worker.logger.ErrorContext(ctx, "publish health event",
+			"game_id", envelope.GameID,
+			"container_id", envelope.ContainerID,
+			"event_type", string(envelope.EventType),
+			"err", err.Error(),
+		)
+		return
+	}
+
+	worker.telemetry.RecordHealthEvent(ctx, string(envelope.EventType))
+
+	logArgs := []any{
+		"game_id", envelope.GameID,
+		"container_id", envelope.ContainerID,
+		"event_type", string(envelope.EventType),
+	}
+	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
+	worker.logger.InfoContext(ctx, "probe event published", logArgs...)
+}
+
+// probeFailedDetails builds the JSON payload required by the
+// `probe_failed` AsyncAPI variant.
+func probeFailedDetails(consecutiveFailures, lastStatus int, lastError string) json.RawMessage {
+	payload := struct {
+		ConsecutiveFailures int    `json:"consecutive_failures"`
+		LastStatus          int    `json:"last_status"`
+		LastError           string `json:"last_error"`
+	}{
+		ConsecutiveFailures: consecutiveFailures,
+		LastStatus:          lastStatus,
+		LastError:           lastError,
+	}
+	encoded, _ := json.Marshal(payload)
+	return encoded
+}
+
+// probeRecoveredDetails builds the JSON payload required by the
+// `probe_recovered` AsyncAPI variant.
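+//
+// Example output (illustrative): {"prior_failure_count": 3}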
+func probeRecoveredDetails(priorFailureCount int) json.RawMessage { + payload := struct { + PriorFailureCount int `json:"prior_failure_count"` + }{PriorFailureCount: priorFailureCount} + encoded, _ := json.Marshal(payload) + return encoded +} + +func errString(err error) string { + if err == nil { + return "" + } + return err.Error() +} diff --git a/rtmanager/internal/worker/healthprobe/worker_test.go b/rtmanager/internal/worker/healthprobe/worker_test.go new file mode 100644 index 0000000..7516018 --- /dev/null +++ b/rtmanager/internal/worker/healthprobe/worker_test.go @@ -0,0 +1,417 @@ +package healthprobe_test + +import ( + "context" + "encoding/json" + "errors" + "io" + "log/slog" + "net/http" + "net/http/httptest" + "sync" + "sync/atomic" + "testing" + "time" + + "galaxy/rtmanager/internal/domain/health" + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/ports" + "galaxy/rtmanager/internal/telemetry" + "galaxy/rtmanager/internal/worker/healthprobe" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func silentLogger() *slog.Logger { + return slog.New(slog.NewTextHandler(io.Discard, nil)) +} + +// fakeRuntimeRecords supports List/ListByStatus only; the worker does +// not call other methods. +type fakeRuntimeRecords struct { + mu sync.Mutex + running []runtime.RuntimeRecord + listErr error +} + +func newFakeRuntimeRecords() *fakeRuntimeRecords { return &fakeRuntimeRecords{} } + +func (s *fakeRuntimeRecords) Set(records ...runtime.RuntimeRecord) { + s.mu.Lock() + defer s.mu.Unlock() + s.running = append([]runtime.RuntimeRecord(nil), records...) +} + +func (s *fakeRuntimeRecords) Clear() { + s.mu.Lock() + defer s.mu.Unlock() + s.running = nil +} + +func (s *fakeRuntimeRecords) Get(_ context.Context, _ string) (runtime.RuntimeRecord, error) { + return runtime.RuntimeRecord{}, runtime.ErrNotFound +} +func (s *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error { return nil } +func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error { + return nil +} +func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) { + return nil, nil +} + +func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, status runtime.Status) ([]runtime.RuntimeRecord, error) { + s.mu.Lock() + defer s.mu.Unlock() + if s.listErr != nil { + return nil, s.listErr + } + if status != runtime.StatusRunning { + return nil, nil + } + out := make([]runtime.RuntimeRecord, len(s.running)) + copy(out, s.running) + return out, nil +} + +// fakeHealthEvents captures every Publish call. +type fakeHealthEvents struct { + mu sync.Mutex + published []ports.HealthEventEnvelope + publishErr error +} + +func (s *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error { + s.mu.Lock() + defer s.mu.Unlock() + if s.publishErr != nil { + return s.publishErr + } + s.published = append(s.published, envelope) + return nil +} + +func (s *fakeHealthEvents) Published() []ports.HealthEventEnvelope { + s.mu.Lock() + defer s.mu.Unlock() + out := make([]ports.HealthEventEnvelope, len(s.published)) + copy(out, s.published) + return out +} + +// engineServer is a per-game HTTP fake controlled by tests. 
+type engineServer struct { + server *httptest.Server + status atomic.Int32 + requests atomic.Int32 +} + +func newEngineServer(t *testing.T) *engineServer { + t.Helper() + es := &engineServer{} + es.status.Store(http.StatusOK) + es.server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + es.requests.Add(1) + w.WriteHeader(int(es.status.Load())) + })) + t.Cleanup(es.server.Close) + return es +} + +func (e *engineServer) URL() string { return e.server.URL } + +func (e *engineServer) SetStatus(code int) { e.status.Store(int32(code)) } + +func (e *engineServer) Stop() { e.server.Close() } + +// --- harness ---------------------------------------------------------- + +type harness struct { + records *fakeRuntimeRecords + health *fakeHealthEvents + worker *healthprobe.Worker + now time.Time +} + +func newHarness(t *testing.T) *harness { + t.Helper() + telemetryRuntime, err := telemetry.NewWithProviders(nil, nil) + require.NoError(t, err) + + records := newFakeRuntimeRecords() + healthEvents := &fakeHealthEvents{} + + worker, err := healthprobe.NewWorker(healthprobe.Dependencies{ + RuntimeRecords: records, + HealthEvents: healthEvents, + HTTPClient: &http.Client{}, + Telemetry: telemetryRuntime, + Interval: 50 * time.Millisecond, + ProbeTimeout: 100 * time.Millisecond, + FailuresThreshold: 3, + MaxConcurrency: 4, + Clock: func() time.Time { return time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC) }, + Logger: silentLogger(), + }) + require.NoError(t, err) + + return &harness{ + records: records, + health: healthEvents, + worker: worker, + now: time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC), + } +} + +func runningRecord(gameID, endpoint string) runtime.RuntimeRecord { + startedAt := time.Date(2026, 4, 27, 11, 0, 0, 0, time.UTC) + return runtime.RuntimeRecord{ + GameID: gameID, + Status: runtime.StatusRunning, + CurrentContainerID: "ctr-" + gameID, + CurrentImageRef: "galaxy/game:1.0.0", + EngineEndpoint: endpoint, + StatePath: "/var/lib/galaxy/games/" + gameID, + DockerNetwork: "galaxy-net", + StartedAt: &startedAt, + LastOpAt: startedAt, + CreatedAt: startedAt, + } +} + +// --- constructor ------------------------------------------------------- + +func TestNewWorkerRejectsMissingDeps(t *testing.T) { + telemetryRuntime, err := telemetry.NewWithProviders(nil, nil) + require.NoError(t, err) + + base := healthprobe.Dependencies{ + RuntimeRecords: newFakeRuntimeRecords(), + HealthEvents: &fakeHealthEvents{}, + HTTPClient: &http.Client{}, + Telemetry: telemetryRuntime, + Interval: time.Second, + ProbeTimeout: time.Second, + FailuresThreshold: 1, + } + + defectives := []healthprobe.Dependencies{ + {}, + {RuntimeRecords: base.RuntimeRecords}, + {RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents}, + {RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents, HTTPClient: base.HTTPClient}, + {RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents, HTTPClient: base.HTTPClient, Telemetry: base.Telemetry}, + {RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents, HTTPClient: base.HTTPClient, Telemetry: base.Telemetry, Interval: time.Second}, + {RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents, HTTPClient: base.HTTPClient, Telemetry: base.Telemetry, Interval: time.Second, ProbeTimeout: time.Second}, + } + for index, deps := range defectives { + _, err := healthprobe.NewWorker(deps) + require.Errorf(t, err, "case %d should fail", index) + } + + _, err = healthprobe.NewWorker(base) + require.NoError(t, 
err) +} + +// --- behaviour -------------------------------------------------------- + +func TestTickHealthyDoesNotEmit(t *testing.T) { + h := newHarness(t) + engine := newEngineServer(t) + + h.records.Set(runningRecord("game-a", engine.URL())) + h.worker.Tick(context.Background()) + + assert.Empty(t, h.health.Published(), "successful probe must not emit events") + assert.Equal(t, int32(1), engine.requests.Load(), "exactly one probe request") +} + +func TestTickFailureBelowThresholdDoesNotEmit(t *testing.T) { + h := newHarness(t) + engine := newEngineServer(t) + engine.SetStatus(http.StatusServiceUnavailable) + + h.records.Set(runningRecord("game-a", engine.URL())) + h.worker.Tick(context.Background()) + h.worker.Tick(context.Background()) + + assert.Empty(t, h.health.Published(), "two failures below threshold must not emit") +} + +func TestTickFailuresCrossingThresholdEmitProbeFailedOnce(t *testing.T) { + h := newHarness(t) + engine := newEngineServer(t) + engine.SetStatus(http.StatusInternalServerError) + + h.records.Set(runningRecord("game-a", engine.URL())) + + for range 5 { + h.worker.Tick(context.Background()) + } + + envelopes := h.health.Published() + require.Len(t, envelopes, 1, "probe_failed must publish exactly once across many failures") + envelope := envelopes[0] + assert.Equal(t, health.EventTypeProbeFailed, envelope.EventType) + assert.Equal(t, "game-a", envelope.GameID) + assert.Equal(t, "ctr-game-a", envelope.ContainerID) + + var details struct { + ConsecutiveFailures int `json:"consecutive_failures"` + LastStatus int `json:"last_status"` + LastError string `json:"last_error"` + } + require.NoError(t, json.Unmarshal(envelope.Details, &details)) + assert.Equal(t, 3, details.ConsecutiveFailures, "consecutive_failures equals threshold at first emission") + assert.Equal(t, http.StatusInternalServerError, details.LastStatus) + assert.NotEmpty(t, details.LastError) +} + +func TestTickRecoveryEmitsProbeRecoveredWithPriorFailureCount(t *testing.T) { + h := newHarness(t) + engine := newEngineServer(t) + engine.SetStatus(http.StatusInternalServerError) + + h.records.Set(runningRecord("game-a", engine.URL())) + + for range 3 { + h.worker.Tick(context.Background()) + } + require.Len(t, h.health.Published(), 1, "expect probe_failed after threshold") + + engine.SetStatus(http.StatusOK) + h.worker.Tick(context.Background()) + + envelopes := h.health.Published() + require.Len(t, envelopes, 2, "recovery must emit exactly one probe_recovered") + envelope := envelopes[1] + assert.Equal(t, health.EventTypeProbeRecovered, envelope.EventType) + + var details struct { + PriorFailureCount int `json:"prior_failure_count"` + } + require.NoError(t, json.Unmarshal(envelope.Details, &details)) + assert.Equal(t, 3, details.PriorFailureCount) +} + +func TestTickFlappingDoesNotDoublePublishProbeFailed(t *testing.T) { + h := newHarness(t) + engine := newEngineServer(t) + engine.SetStatus(http.StatusInternalServerError) + + h.records.Set(runningRecord("game-a", engine.URL())) + for range 5 { + h.worker.Tick(context.Background()) + } + require.Len(t, h.health.Published(), 1) + + // New failure after probe_failed has been published: must not emit again. 
+ h.worker.Tick(context.Background()) + assert.Len(t, h.health.Published(), 1, "no new probe_failed while already in failed state") +} + +func TestTickPrunesStateForGamesNoLongerRunning(t *testing.T) { + h := newHarness(t) + engine := newEngineServer(t) + engine.SetStatus(http.StatusInternalServerError) + + h.records.Set(runningRecord("game-a", engine.URL())) + for range 3 { + h.worker.Tick(context.Background()) + } + require.Len(t, h.health.Published(), 1, "probe_failed published before stop") + + // Game leaves running; state must be pruned. + h.records.Clear() + h.worker.Tick(context.Background()) + + // Re-introduce the same game: counter starts fresh, new failures + // must accumulate from zero before another probe_failed fires. + h.records.Set(runningRecord("game-a", engine.URL())) + h.worker.Tick(context.Background()) + h.worker.Tick(context.Background()) + assert.Len(t, h.health.Published(), 1, "fresh state must require threshold failures again") + + h.worker.Tick(context.Background()) + assert.Len(t, h.health.Published(), 2, "third fresh failure crosses threshold") +} + +func TestTickProbesMultipleGamesConcurrently(t *testing.T) { + h := newHarness(t) + + // Two slow engines that simulate noticeable latency. Sequential + // execution would take 2*latency; parallel finishes near 1*latency. + const latency = 80 * time.Millisecond + makeSlowEngine := func() *httptest.Server { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + time.Sleep(latency) + w.WriteHeader(http.StatusOK) + })) + t.Cleanup(server.Close) + return server + } + a := makeSlowEngine() + b := makeSlowEngine() + + h.records.Set( + runningRecord("game-a", a.URL), + runningRecord("game-b", b.URL), + ) + + start := time.Now() + h.worker.Tick(context.Background()) + elapsed := time.Since(start) + + assert.Less(t, elapsed, 2*latency, "probes must run concurrently, not sequentially") +} + +func TestTickAbsorbsListError(t *testing.T) { + h := newHarness(t) + h.records.listErr = errors.New("pg down") + + require.NotPanics(t, func() { h.worker.Tick(context.Background()) }) + assert.Empty(t, h.health.Published()) +} + +func TestTickAbsorbsPublishError(t *testing.T) { + h := newHarness(t) + h.health.publishErr = errors.New("redis down") + engine := newEngineServer(t) + engine.SetStatus(http.StatusInternalServerError) + + h.records.Set(runningRecord("game-a", engine.URL())) + for range 3 { + h.worker.Tick(context.Background()) + } + // publishErr means nothing accumulated; the worker must not panic + // or change state in surprising ways. 
+ assert.Empty(t, h.health.Published()) +} + +func TestRunRespectsContextCancel(t *testing.T) { + h := newHarness(t) + + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan error, 1) + go func() { done <- h.worker.Run(ctx) }() + + cancel() + select { + case err := <-done: + assert.ErrorIs(t, err, context.Canceled) + case <-time.After(time.Second): + t.Fatalf("Run did not exit after cancel") + } +} + +func TestShutdownIsNoOp(t *testing.T) { + h := newHarness(t) + require.NoError(t, h.worker.Shutdown(context.Background())) +} + +// --- compile-time safety ---------------------------------------------- + +var ( + _ ports.RuntimeRecordStore = (*fakeRuntimeRecords)(nil) + _ ports.HealthEventPublisher = (*fakeHealthEvents)(nil) +) diff --git a/rtmanager/internal/worker/reconcile/reconciler.go b/rtmanager/internal/worker/reconcile/reconciler.go new file mode 100644 index 0000000..1bb9dde --- /dev/null +++ b/rtmanager/internal/worker/reconcile/reconciler.go @@ -0,0 +1,678 @@ +// Package reconcile implements the drift reconciliation worker +// described in `rtmanager/README.md §Reconciliation`. The reconciler +// is the single authority that brings `runtime_records` into agreement +// with the Docker daemon's view of `com.galaxy.owner=rtmanager` +// containers. +// +// Three drift kinds are handled: +// +// - Adopt — a running container labelled `com.galaxy.owner=rtmanager` +// has no matching `runtime_records` row. The reconciler inserts a +// `status=running` record (`op_kind=reconcile_adopt`). +// - Dispose — a `status=running` row whose `current_container_id` is +// no longer reported by Docker. The reconciler updates the row to +// `status=removed`, publishes `runtime:health_events` +// `container_disappeared`, and appends `reconcile_dispose`. +// - Observed exited — a `status=running` row whose container exists +// but reports `State.Status=exited`. The reconciler transitions +// the row to `status=stopped` and publishes `container_exited` +// with the observed exit code. No `operation_log` entry is written +// because `OpKind` does not include a value for this transition; +// it is reflected in `rtmanager.reconcile_drift{kind=observed_exited}` +// instead. +// +// All write decisions for a given `game_id` are guarded by the per-game +// Redis lease; the read pass that lists Docker containers and PG +// records is lockless. +// +// The reconciler runs once synchronously at process start +// (`ReconcileNow`) before any other worker is allowed to start, and +// then periodically via `Run` as an `app.Component`. Design rationale +// is captured in `rtmanager/docs/workers.md`. +package reconcile + +import ( + "context" + "crypto/rand" + "encoding/base64" + "encoding/json" + "errors" + "fmt" + "log/slog" + "path/filepath" + "strconv" + "time" + + "galaxy/rtmanager/internal/config" + "galaxy/rtmanager/internal/domain/health" + "galaxy/rtmanager/internal/domain/operation" + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/logging" + "galaxy/rtmanager/internal/ports" + "galaxy/rtmanager/internal/service/startruntime" + "galaxy/rtmanager/internal/telemetry" +) + +// dockerStateRunning is the verbatim Docker `State.Status` value the +// reconciler treats as "the container is alive". +const dockerStateRunning = "running" + +// dockerStateExited is the verbatim Docker `State.Status` value the +// reconciler treats as "the container has terminated". 
+const dockerStateExited = "exited" + +// driftKindAdopt / driftKindDispose / driftKindObservedExited match the +// `kind` label vocabulary on `rtmanager.reconcile_drift`. +const ( + driftKindAdopt = "adopt" + driftKindDispose = "dispose" + driftKindObservedExited = "observed_exited" +) + +// leaseReleaseTimeout bounds the deferred lease-release call. A fresh +// background context is used so the release runs even if the request +// context was already canceled. +const leaseReleaseTimeout = 5 * time.Second + +// Dependencies groups the collaborators required by Reconciler. +type Dependencies struct { + Docker ports.DockerClient + RuntimeRecords ports.RuntimeRecordStore + OperationLogs ports.OperationLogStore + HealthEvents ports.HealthEventPublisher + Leases ports.GameLeaseStore + + Telemetry *telemetry.Runtime + + DockerCfg config.DockerConfig + ContainerCfg config.ContainerConfig + Coordination config.CoordinationConfig + + // Interval bounds the periodic tick. ReconcileNow ignores it. + Interval time.Duration + + Clock func() time.Time + Logger *slog.Logger + NewToken func() string +} + +// Reconciler drives both the synchronous initial pass and the periodic +// drift reconciliation loop. +type Reconciler struct { + docker ports.DockerClient + runtimeRecords ports.RuntimeRecordStore + operationLogs ports.OperationLogStore + healthEvents ports.HealthEventPublisher + leases ports.GameLeaseStore + + telemetry *telemetry.Runtime + + dockerNetwork string + stateRoot string + leaseTTL time.Duration + + interval time.Duration + + clock func() time.Time + logger *slog.Logger + newToken func() string +} + +// NewReconciler constructs one Reconciler from deps. +func NewReconciler(deps Dependencies) (*Reconciler, error) { + switch { + case deps.Docker == nil: + return nil, errors.New("new reconciler: nil docker client") + case deps.RuntimeRecords == nil: + return nil, errors.New("new reconciler: nil runtime records store") + case deps.OperationLogs == nil: + return nil, errors.New("new reconciler: nil operation log store") + case deps.HealthEvents == nil: + return nil, errors.New("new reconciler: nil health events publisher") + case deps.Leases == nil: + return nil, errors.New("new reconciler: nil lease store") + case deps.Telemetry == nil: + return nil, errors.New("new reconciler: nil telemetry runtime") + case deps.Interval <= 0: + return nil, errors.New("new reconciler: interval must be positive") + } + if err := deps.DockerCfg.Validate(); err != nil { + return nil, fmt.Errorf("new reconciler: docker config: %w", err) + } + if err := deps.ContainerCfg.Validate(); err != nil { + return nil, fmt.Errorf("new reconciler: container config: %w", err) + } + if err := deps.Coordination.Validate(); err != nil { + return nil, fmt.Errorf("new reconciler: coordination config: %w", err) + } + + clock := deps.Clock + if clock == nil { + clock = time.Now + } + logger := deps.Logger + if logger == nil { + logger = slog.Default() + } + newToken := deps.NewToken + if newToken == nil { + newToken = defaultTokenGenerator() + } + + return &Reconciler{ + docker: deps.Docker, + runtimeRecords: deps.RuntimeRecords, + operationLogs: deps.OperationLogs, + healthEvents: deps.HealthEvents, + leases: deps.Leases, + telemetry: deps.Telemetry, + dockerNetwork: deps.DockerCfg.Network, + stateRoot: deps.ContainerCfg.GameStateRoot, + leaseTTL: deps.Coordination.GameLeaseTTL, + interval: deps.Interval, + clock: clock, + logger: logger.With("worker", "rtmanager.reconcile"), + newToken: newToken, + }, nil +} + +// ReconcileNow 
performs one full reconciliation pass synchronously. +// It is intended for the startup path described in +// `rtmanager/README.md §Startup dependencies` (step 6). Per-game +// errors are absorbed into telemetry and logs; only ctx errors are +// surfaced to the caller so a cancelled startup aborts immediately. +func (reconciler *Reconciler) ReconcileNow(ctx context.Context) error { + if reconciler == nil { + return errors.New("reconcile now: nil reconciler") + } + if ctx == nil { + return errors.New("reconcile now: nil context") + } + if err := ctx.Err(); err != nil { + return err + } + reconciler.tick(ctx) + return ctx.Err() +} + +// Run drives the periodic reconciliation loop. It does not perform an +// immediate first pass — `ReconcileNow` covers that path; the first +// tick fires after `Interval`. Run terminates on context cancellation. +func (reconciler *Reconciler) Run(ctx context.Context) error { + if reconciler == nil { + return errors.New("run reconciler: nil reconciler") + } + if ctx == nil { + return errors.New("run reconciler: nil context") + } + if err := ctx.Err(); err != nil { + return err + } + + reconciler.logger.Info("reconciler started", + "interval", reconciler.interval.String(), + ) + defer reconciler.logger.Info("reconciler stopped") + + ticker := time.NewTicker(reconciler.interval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + reconciler.tick(ctx) + } + } +} + +// Shutdown is a no-op; Run terminates on context cancellation. +func (reconciler *Reconciler) Shutdown(ctx context.Context) error { + if ctx == nil { + return errors.New("shutdown reconciler: nil context") + } + return nil +} + +// Tick performs one reconciliation pass. Exported so tests can drive +// the reconciler deterministically without spinning a real ticker. +func (reconciler *Reconciler) Tick(ctx context.Context) { + reconciler.tick(ctx) +} + +// tick executes one full pass: list Docker containers + PG records, +// resolve drift, and apply lease-guarded mutations for each affected +// game. 
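+//
+// Per-game drift resolution, summarized (illustrative):
+//
+//	running container, no running record              -> adoptOne
+//	running record, container missing from Docker     -> disposeOne
+//	running record, same container reported exited    -> observedExitedOne
+//	record/container id mismatch (restart in flight)  -> skip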
+func (reconciler *Reconciler) tick(ctx context.Context) { + if err := ctx.Err(); err != nil { + return + } + + containers, err := reconciler.docker.List(ctx, ports.ListFilter{ + Labels: map[string]string{startruntime.LabelOwner: startruntime.LabelOwnerValue}, + }) + if err != nil { + reconciler.logger.WarnContext(ctx, "list owned containers", + "err", err.Error(), + ) + return + } + + records, err := reconciler.runtimeRecords.ListByStatus(ctx, runtime.StatusRunning) + if err != nil { + reconciler.logger.WarnContext(ctx, "list running records", + "err", err.Error(), + ) + return + } + + containerByGame := make(map[string]ports.ContainerSummary, len(containers)) + for _, summary := range containers { + gameID := summary.Labels[startruntime.LabelGameID] + if gameID == "" { + continue + } + containerByGame[gameID] = summary + } + + recordByGame := make(map[string]runtime.RuntimeRecord, len(records)) + for _, record := range records { + recordByGame[record.GameID] = record + } + + for gameID, summary := range containerByGame { + if err := ctx.Err(); err != nil { + return + } + if _, ok := recordByGame[gameID]; ok { + continue + } + if summary.Status != dockerStateRunning { + continue + } + reconciler.adoptOne(ctx, gameID, summary) + } + + for _, record := range records { + if err := ctx.Err(); err != nil { + return + } + summary, ok := containerByGame[record.GameID] + if !ok { + reconciler.disposeOne(ctx, record) + continue + } + if summary.ID != record.CurrentContainerID { + continue + } + if summary.Status == dockerStateExited { + reconciler.observedExitedOne(ctx, record, summary) + } + } +} + +// adoptOne installs a `runtime_records` row for an unrecorded running +// container under the per-game lease. +func (reconciler *Reconciler) adoptOne(ctx context.Context, gameID string, summary ports.ContainerSummary) { + token := reconciler.newToken() + acquired, err := reconciler.leases.TryAcquire(ctx, gameID, token, reconciler.leaseTTL) + if err != nil { + reconciler.logger.WarnContext(ctx, "adopt: acquire lease", + "game_id", gameID, + "err", err.Error(), + ) + return + } + if !acquired { + reconciler.logger.InfoContext(ctx, "adopt: lease busy, skipping", + "game_id", gameID, + ) + return + } + defer reconciler.releaseLease(ctx, gameID, token) + + if _, err := reconciler.runtimeRecords.Get(ctx, gameID); err == nil { + reconciler.logger.InfoContext(ctx, "adopt: record appeared concurrently, skipping", + "game_id", gameID, + ) + return + } else if !errors.Is(err, runtime.ErrNotFound) { + reconciler.logger.WarnContext(ctx, "adopt: read record", + "game_id", gameID, + "err", err.Error(), + ) + return + } + + startedAt := reconciler.resolveStartedAt(ctx, summary) + imageRef := summary.Labels[startruntime.LabelEngineImageRef] + if imageRef == "" { + imageRef = summary.ImageRef + } + + now := reconciler.clock().UTC() + createdAt := now + if startedAt.Before(createdAt) { + createdAt = startedAt + } + record := runtime.RuntimeRecord{ + GameID: gameID, + Status: runtime.StatusRunning, + CurrentContainerID: summary.ID, + CurrentImageRef: imageRef, + EngineEndpoint: reconciler.engineEndpoint(gameID), + StatePath: filepath.Join(reconciler.stateRoot, gameID), + DockerNetwork: reconciler.dockerNetwork, + StartedAt: &startedAt, + LastOpAt: now, + CreatedAt: createdAt, + } + if err := reconciler.runtimeRecords.Upsert(ctx, record); err != nil { + reconciler.logger.ErrorContext(ctx, "adopt: upsert record", + "game_id", gameID, + "container_id", summary.ID, + "err", err.Error(), + ) + return + } + + finishedAt := 
reconciler.clock().UTC() + reconciler.bestEffortAppend(ctx, operation.OperationEntry{ + GameID: gameID, + OpKind: operation.OpKindReconcileAdopt, + OpSource: operation.OpSourceAutoReconcile, + ImageRef: imageRef, + ContainerID: summary.ID, + Outcome: operation.OutcomeSuccess, + StartedAt: now, + FinishedAt: &finishedAt, + }) + reconciler.telemetry.RecordReconcileDrift(ctx, driftKindAdopt) + + logArgs := []any{ + "game_id", gameID, + "container_id", summary.ID, + "image_ref", imageRef, + } + logArgs = append(logArgs, logging.ContextAttrs(ctx)...) + reconciler.logger.InfoContext(ctx, "reconciler adopted unrecorded container", logArgs...) +} + +// disposeOne transitions a `running` record whose container is missing +// in Docker to `removed` and publishes `container_disappeared`. +func (reconciler *Reconciler) disposeOne(ctx context.Context, record runtime.RuntimeRecord) { + token := reconciler.newToken() + acquired, err := reconciler.leases.TryAcquire(ctx, record.GameID, token, reconciler.leaseTTL) + if err != nil { + reconciler.logger.WarnContext(ctx, "dispose: acquire lease", + "game_id", record.GameID, + "err", err.Error(), + ) + return + } + if !acquired { + reconciler.logger.InfoContext(ctx, "dispose: lease busy, skipping", + "game_id", record.GameID, + ) + return + } + defer reconciler.releaseLease(ctx, record.GameID, token) + + current, err := reconciler.runtimeRecords.Get(ctx, record.GameID) + if err != nil { + if errors.Is(err, runtime.ErrNotFound) { + return + } + reconciler.logger.WarnContext(ctx, "dispose: read record", + "game_id", record.GameID, + "err", err.Error(), + ) + return + } + if current.Status != runtime.StatusRunning || current.CurrentContainerID != record.CurrentContainerID { + reconciler.logger.InfoContext(ctx, "dispose: state changed, skipping", + "game_id", record.GameID, + ) + return + } + + now := reconciler.clock().UTC() + err = reconciler.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{ + GameID: record.GameID, + ExpectedFrom: runtime.StatusRunning, + ExpectedContainerID: record.CurrentContainerID, + To: runtime.StatusRemoved, + Now: now, + }) + if errors.Is(err, runtime.ErrConflict) || errors.Is(err, runtime.ErrNotFound) { + reconciler.logger.InfoContext(ctx, "dispose: CAS lost, skipping", + "game_id", record.GameID, + "err", err.Error(), + ) + return + } + if err != nil { + reconciler.logger.ErrorContext(ctx, "dispose: update status", + "game_id", record.GameID, + "container_id", record.CurrentContainerID, + "err", err.Error(), + ) + return + } + + reconciler.bestEffortPublishHealth(ctx, ports.HealthEventEnvelope{ + GameID: record.GameID, + ContainerID: record.CurrentContainerID, + EventType: health.EventTypeContainerDisappeared, + OccurredAt: now, + Details: containerDisappearedDetails(), + }) + + finishedAt := reconciler.clock().UTC() + reconciler.bestEffortAppend(ctx, operation.OperationEntry{ + GameID: record.GameID, + OpKind: operation.OpKindReconcileDispose, + OpSource: operation.OpSourceAutoReconcile, + ImageRef: record.CurrentImageRef, + ContainerID: record.CurrentContainerID, + Outcome: operation.OutcomeSuccess, + StartedAt: now, + FinishedAt: &finishedAt, + }) + reconciler.telemetry.RecordReconcileDrift(ctx, driftKindDispose) + + logArgs := []any{ + "game_id", record.GameID, + "container_id", record.CurrentContainerID, + } + logArgs = append(logArgs, logging.ContextAttrs(ctx)...) + reconciler.logger.InfoContext(ctx, "reconciler disposed missing container", logArgs...) 
+} + +// observedExitedOne transitions a `running` record whose container is +// reported as `exited` to `stopped` and publishes `container_exited` +// with the observed exit code. No `operation_log` entry is written; +// see decision record §6. +func (reconciler *Reconciler) observedExitedOne(ctx context.Context, record runtime.RuntimeRecord, summary ports.ContainerSummary) { + token := reconciler.newToken() + acquired, err := reconciler.leases.TryAcquire(ctx, record.GameID, token, reconciler.leaseTTL) + if err != nil { + reconciler.logger.WarnContext(ctx, "observed_exited: acquire lease", + "game_id", record.GameID, + "err", err.Error(), + ) + return + } + if !acquired { + reconciler.logger.InfoContext(ctx, "observed_exited: lease busy, skipping", + "game_id", record.GameID, + ) + return + } + defer reconciler.releaseLease(ctx, record.GameID, token) + + current, err := reconciler.runtimeRecords.Get(ctx, record.GameID) + if err != nil { + if errors.Is(err, runtime.ErrNotFound) { + return + } + reconciler.logger.WarnContext(ctx, "observed_exited: read record", + "game_id", record.GameID, + "err", err.Error(), + ) + return + } + if current.Status != runtime.StatusRunning || current.CurrentContainerID != summary.ID { + reconciler.logger.InfoContext(ctx, "observed_exited: state changed, skipping", + "game_id", record.GameID, + ) + return + } + + inspect, err := reconciler.docker.InspectContainer(ctx, summary.ID) + if err != nil { + reconciler.logger.WarnContext(ctx, "observed_exited: inspect container", + "game_id", record.GameID, + "container_id", summary.ID, + "err", err.Error(), + ) + return + } + + now := reconciler.clock().UTC() + err = reconciler.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{ + GameID: record.GameID, + ExpectedFrom: runtime.StatusRunning, + ExpectedContainerID: summary.ID, + To: runtime.StatusStopped, + Now: now, + }) + if errors.Is(err, runtime.ErrConflict) || errors.Is(err, runtime.ErrNotFound) { + reconciler.logger.InfoContext(ctx, "observed_exited: CAS lost, skipping", + "game_id", record.GameID, + "err", err.Error(), + ) + return + } + if err != nil { + reconciler.logger.ErrorContext(ctx, "observed_exited: update status", + "game_id", record.GameID, + "container_id", summary.ID, + "err", err.Error(), + ) + return + } + + reconciler.bestEffortPublishHealth(ctx, ports.HealthEventEnvelope{ + GameID: record.GameID, + ContainerID: summary.ID, + EventType: health.EventTypeContainerExited, + OccurredAt: now, + Details: containerExitedDetails(inspect.ExitCode, inspect.OOMKilled), + }) + reconciler.telemetry.RecordReconcileDrift(ctx, driftKindObservedExited) + + logArgs := []any{ + "game_id", record.GameID, + "container_id", summary.ID, + "exit_code", inspect.ExitCode, + } + logArgs = append(logArgs, logging.ContextAttrs(ctx)...) + reconciler.logger.InfoContext(ctx, "reconciler observed exited container", logArgs...) +} + +// resolveStartedAt prefers the `com.galaxy.started_at_ms` label written +// by the start service. When the label is absent or unparseable, it +// falls back to a full inspect of the container; if inspect also fails +// or returns a zero StartedAt, the current clock is used so the record +// still validates. 
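+//
+// For example (illustrative label value):
+//
+//	Labels["com.galaxy.started_at_ms"] = "1745838000000"
+//	-> time.UnixMilli(1745838000000).UTC(), no inspect call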
+func (reconciler *Reconciler) resolveStartedAt(ctx context.Context, summary ports.ContainerSummary) time.Time { + if raw, ok := summary.Labels[startruntime.LabelStartedAtMs]; ok && raw != "" { + if ms, err := strconv.ParseInt(raw, 10, 64); err == nil && ms > 0 { + return time.UnixMilli(ms).UTC() + } + } + inspect, err := reconciler.docker.InspectContainer(ctx, summary.ID) + if err == nil && !inspect.StartedAt.IsZero() { + return inspect.StartedAt.UTC() + } + return reconciler.clock().UTC() +} + +// engineEndpoint mirrors the URL shape produced by the docker adapter +// (`internal/adapters/docker/client.go::Run`). +func (reconciler *Reconciler) engineEndpoint(gameID string) string { + return fmt.Sprintf("http://%s%s:8080", startruntime.HostnamePrefix, gameID) +} + +// releaseLease releases the per-game lease in a fresh background +// context so a canceled tick context does not leave the lease pinned +// for its TTL. +func (reconciler *Reconciler) releaseLease(ctx context.Context, gameID, token string) { + cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout) + defer cancel() + if err := reconciler.leases.Release(cleanupCtx, gameID, token); err != nil { + reconciler.logger.WarnContext(ctx, "release game lease", + "game_id", gameID, + "err", err.Error(), + ) + } +} + +// bestEffortAppend writes one operation_log entry. A failure is logged +// and discarded; the durable runtime record (or its absence) remains +// the source of truth. +func (reconciler *Reconciler) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) { + if _, err := reconciler.operationLogs.Append(ctx, entry); err != nil { + reconciler.logger.ErrorContext(ctx, "append operation log", + "game_id", entry.GameID, + "op_kind", string(entry.OpKind), + "err", err.Error(), + ) + } +} + +// bestEffortPublishHealth emits one health event + snapshot upsert. +// Failures degrade silently per `rtmanager/README.md §Notification +// Contracts`; the runtime record remains the source of truth. +func (reconciler *Reconciler) bestEffortPublishHealth(ctx context.Context, envelope ports.HealthEventEnvelope) { + if err := reconciler.healthEvents.Publish(ctx, envelope); err != nil { + reconciler.logger.ErrorContext(ctx, "publish health event", + "game_id", envelope.GameID, + "container_id", envelope.ContainerID, + "event_type", string(envelope.EventType), + "err", err.Error(), + ) + return + } + reconciler.telemetry.RecordHealthEvent(ctx, string(envelope.EventType)) +} + +// containerExitedDetails matches the JSON shape produced by the events +// listener so consumers see a single contracted payload regardless of +// the source. +func containerExitedDetails(exitCode int, oom bool) json.RawMessage { + payload := struct { + ExitCode int `json:"exit_code"` + OOM bool `json:"oom"` + }{ExitCode: exitCode, OOM: oom} + encoded, _ := json.Marshal(payload) + return encoded +} + +// containerDisappearedDetails returns the canonical empty-object +// payload required by the `container_disappeared` AsyncAPI variant. 
+func containerDisappearedDetails() json.RawMessage { + return json.RawMessage(`{}`) +} + +func defaultTokenGenerator() func() string { + return func() string { + var buf [32]byte + if _, err := rand.Read(buf[:]); err != nil { + return "rtmanager-fallback-token" + } + return base64.RawURLEncoding.EncodeToString(buf[:]) + } +} diff --git a/rtmanager/internal/worker/reconcile/reconciler_test.go b/rtmanager/internal/worker/reconcile/reconciler_test.go new file mode 100644 index 0000000..eb3d796 --- /dev/null +++ b/rtmanager/internal/worker/reconcile/reconciler_test.go @@ -0,0 +1,740 @@ +package reconcile_test + +import ( + "context" + "encoding/json" + "errors" + "io" + "log/slog" + "strconv" + "sync" + "testing" + "time" + + "galaxy/rtmanager/internal/adapters/docker/mocks" + "galaxy/rtmanager/internal/config" + "galaxy/rtmanager/internal/domain/health" + "galaxy/rtmanager/internal/domain/operation" + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/ports" + "galaxy/rtmanager/internal/service/startruntime" + "galaxy/rtmanager/internal/telemetry" + "galaxy/rtmanager/internal/worker/reconcile" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" +) + +func silentLogger() *slog.Logger { + return slog.New(slog.NewTextHandler(io.Discard, nil)) +} + +// --- fake doubles ----------------------------------------------------- + +type fakeRuntimeRecords struct { + mu sync.Mutex + + stored map[string]runtime.RuntimeRecord + getErr error + upsertErr error + updateStatusErr error + listErr error + + upserts []runtime.RuntimeRecord + updates []ports.UpdateStatusInput +} + +func newFakeRuntimeRecords() *fakeRuntimeRecords { + return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}} +} + +func (s *fakeRuntimeRecords) Set(records ...runtime.RuntimeRecord) { + s.mu.Lock() + defer s.mu.Unlock() + for _, record := range records { + s.stored[record.GameID] = record + } +} + +func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) { + s.mu.Lock() + defer s.mu.Unlock() + if s.getErr != nil { + return runtime.RuntimeRecord{}, s.getErr + } + record, ok := s.stored[gameID] + if !ok { + return runtime.RuntimeRecord{}, runtime.ErrNotFound + } + return record, nil +} + +func (s *fakeRuntimeRecords) Upsert(_ context.Context, record runtime.RuntimeRecord) error { + s.mu.Lock() + defer s.mu.Unlock() + if s.upsertErr != nil { + return s.upsertErr + } + s.upserts = append(s.upserts, record) + s.stored[record.GameID] = record + return nil +} + +func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, input ports.UpdateStatusInput) error { + s.mu.Lock() + defer s.mu.Unlock() + s.updates = append(s.updates, input) + if s.updateStatusErr != nil { + return s.updateStatusErr + } + record, ok := s.stored[input.GameID] + if !ok { + return runtime.ErrNotFound + } + if record.Status != input.ExpectedFrom { + return runtime.ErrConflict + } + if input.ExpectedContainerID != "" && record.CurrentContainerID != input.ExpectedContainerID { + return runtime.ErrConflict + } + record.Status = input.To + record.LastOpAt = input.Now + switch input.To { + case runtime.StatusStopped: + t := input.Now + record.StoppedAt = &t + case runtime.StatusRemoved: + t := input.Now + record.RemovedAt = &t + record.CurrentContainerID = "" + } + s.stored[input.GameID] = record + return nil +} + +func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) { + return nil, errors.New("not used in 
reconciler tests") +} + +func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, status runtime.Status) ([]runtime.RuntimeRecord, error) { + s.mu.Lock() + defer s.mu.Unlock() + if s.listErr != nil { + return nil, s.listErr + } + var out []runtime.RuntimeRecord + for _, record := range s.stored { + if record.Status == status { + out = append(out, record) + } + } + return out, nil +} + +func (s *fakeRuntimeRecords) Upserts() []runtime.RuntimeRecord { + s.mu.Lock() + defer s.mu.Unlock() + out := make([]runtime.RuntimeRecord, len(s.upserts)) + copy(out, s.upserts) + return out +} + +func (s *fakeRuntimeRecords) Updates() []ports.UpdateStatusInput { + s.mu.Lock() + defer s.mu.Unlock() + out := make([]ports.UpdateStatusInput, len(s.updates)) + copy(out, s.updates) + return out +} + +type fakeOperationLogs struct { + mu sync.Mutex + + appendErr error + appends []operation.OperationEntry +} + +func (s *fakeOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) { + s.mu.Lock() + defer s.mu.Unlock() + if s.appendErr != nil { + return 0, s.appendErr + } + s.appends = append(s.appends, entry) + return int64(len(s.appends)), nil +} + +func (s *fakeOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) { + return nil, errors.New("not used in reconciler tests") +} + +func (s *fakeOperationLogs) Appends() []operation.OperationEntry { + s.mu.Lock() + defer s.mu.Unlock() + out := make([]operation.OperationEntry, len(s.appends)) + copy(out, s.appends) + return out +} + +type fakeHealthEvents struct { + mu sync.Mutex + publishErr error + published []ports.HealthEventEnvelope +} + +func (s *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error { + s.mu.Lock() + defer s.mu.Unlock() + if s.publishErr != nil { + return s.publishErr + } + s.published = append(s.published, envelope) + return nil +} + +func (s *fakeHealthEvents) Published() []ports.HealthEventEnvelope { + s.mu.Lock() + defer s.mu.Unlock() + out := make([]ports.HealthEventEnvelope, len(s.published)) + copy(out, s.published) + return out +} + +type fakeLeases struct { + mu sync.Mutex + + acquired bool + acquireErr error + releaseErr error + + acquires []string + releases []string +} + +func (l *fakeLeases) TryAcquire(_ context.Context, gameID, token string, _ time.Duration) (bool, error) { + l.mu.Lock() + defer l.mu.Unlock() + l.acquires = append(l.acquires, gameID+":"+token) + if l.acquireErr != nil { + return false, l.acquireErr + } + return l.acquired, nil +} + +func (l *fakeLeases) Release(_ context.Context, gameID, token string) error { + l.mu.Lock() + defer l.mu.Unlock() + l.releases = append(l.releases, gameID+":"+token) + return l.releaseErr +} + +func (l *fakeLeases) Acquires() []string { + l.mu.Lock() + defer l.mu.Unlock() + out := make([]string, len(l.acquires)) + copy(out, l.acquires) + return out +} + +func (l *fakeLeases) Releases() []string { + l.mu.Lock() + defer l.mu.Unlock() + out := make([]string, len(l.releases)) + copy(out, l.releases) + return out +} + +// --- harness ---------------------------------------------------------- + +type harness struct { + docker *mocks.MockDockerClient + records *fakeRuntimeRecords + operationLogs *fakeOperationLogs + healthEvents *fakeHealthEvents + leases *fakeLeases + + telemetry *telemetry.Runtime + + now time.Time +} + +func newHarness(t *testing.T) *harness { + t.Helper() + ctrl := gomock.NewController(t) + t.Cleanup(ctrl.Finish) + + telemetryRuntime, err := 
telemetry.NewWithProviders(nil, nil) + require.NoError(t, err) + + return &harness{ + docker: mocks.NewMockDockerClient(ctrl), + records: newFakeRuntimeRecords(), + operationLogs: &fakeOperationLogs{}, + healthEvents: &fakeHealthEvents{}, + leases: &fakeLeases{acquired: true}, + telemetry: telemetryRuntime, + now: time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC), + } +} + +func (h *harness) build(t *testing.T) *reconcile.Reconciler { + t.Helper() + r, err := reconcile.NewReconciler(reconcile.Dependencies{ + Docker: h.docker, + RuntimeRecords: h.records, + OperationLogs: h.operationLogs, + HealthEvents: h.healthEvents, + Leases: h.leases, + Telemetry: h.telemetry, + DockerCfg: config.DockerConfig{ + Host: "unix:///var/run/docker.sock", + Network: "galaxy-net", + LogDriver: "json-file", + PullPolicy: config.ImagePullPolicyIfMissing, + }, + ContainerCfg: config.ContainerConfig{ + DefaultCPUQuota: 1.0, + DefaultMemory: "512m", + DefaultPIDsLimit: 512, + StopTimeout: 30 * time.Second, + Retention: 30 * 24 * time.Hour, + EngineStateMountPath: "/var/lib/galaxy-game", + EngineStateEnvName: "GAME_STATE_PATH", + GameStateDirMode: 0o750, + GameStateRoot: "/var/lib/galaxy/games", + }, + Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute}, + Interval: 50 * time.Millisecond, + Clock: func() time.Time { return h.now }, + Logger: silentLogger(), + NewToken: func() string { return "token-A" }, + }) + require.NoError(t, err) + return r +} + +// runningRecord builds a baseline runtime record in `running` state. +func runningRecord(gameID, containerID string, startedAt time.Time) runtime.RuntimeRecord { + return runtime.RuntimeRecord{ + GameID: gameID, + Status: runtime.StatusRunning, + CurrentContainerID: containerID, + CurrentImageRef: "galaxy/game:1.0.0", + EngineEndpoint: "http://galaxy-game-" + gameID + ":8080", + StatePath: "/var/lib/galaxy/games/" + gameID, + DockerNetwork: "galaxy-net", + StartedAt: &startedAt, + LastOpAt: startedAt, + CreatedAt: startedAt, + } +} + +func ownedSummary(gameID, containerID, imageRef, status string, startedAtMs int64) ports.ContainerSummary { + labels := map[string]string{ + startruntime.LabelOwner: startruntime.LabelOwnerValue, + startruntime.LabelKind: startruntime.LabelKindValue, + startruntime.LabelGameID: gameID, + startruntime.LabelEngineImageRef: imageRef, + } + if startedAtMs > 0 { + labels[startruntime.LabelStartedAtMs] = strconv.FormatInt(startedAtMs, 10) + } + return ports.ContainerSummary{ + ID: containerID, + ImageRef: imageRef, + Hostname: "galaxy-game-" + gameID, + Labels: labels, + Status: status, + StartedAt: time.UnixMilli(startedAtMs).UTC(), + } +} + +// --- constructor ------------------------------------------------------ + +func TestNewReconcilerRejectsMissingDeps(t *testing.T) { + ctrl := gomock.NewController(t) + t.Cleanup(ctrl.Finish) + telemetryRuntime, err := telemetry.NewWithProviders(nil, nil) + require.NoError(t, err) + + dockerCfg := config.DockerConfig{ + Host: "unix:///var/run/docker.sock", + Network: "galaxy-net", + LogDriver: "json-file", + PullPolicy: config.ImagePullPolicyIfMissing, + } + containerCfg := config.ContainerConfig{ + DefaultCPUQuota: 1.0, + DefaultMemory: "512m", + DefaultPIDsLimit: 512, + StopTimeout: 30 * time.Second, + Retention: 30 * 24 * time.Hour, + EngineStateMountPath: "/var/lib/galaxy-game", + EngineStateEnvName: "GAME_STATE_PATH", + GameStateDirMode: 0o750, + GameStateRoot: "/var/lib/galaxy/games", + } + coord := config.CoordinationConfig{GameLeaseTTL: time.Minute} + + base := reconcile.Dependencies{ 
+ Docker: mocks.NewMockDockerClient(ctrl), + RuntimeRecords: newFakeRuntimeRecords(), + OperationLogs: &fakeOperationLogs{}, + HealthEvents: &fakeHealthEvents{}, + Leases: &fakeLeases{acquired: true}, + Telemetry: telemetryRuntime, + DockerCfg: dockerCfg, + ContainerCfg: containerCfg, + Coordination: coord, + Interval: time.Second, + } + + defectives := []reconcile.Dependencies{ + {}, + {Docker: base.Docker}, + {Docker: base.Docker, RuntimeRecords: base.RuntimeRecords}, + {Docker: base.Docker, RuntimeRecords: base.RuntimeRecords, OperationLogs: base.OperationLogs}, + {Docker: base.Docker, RuntimeRecords: base.RuntimeRecords, OperationLogs: base.OperationLogs, HealthEvents: base.HealthEvents}, + {Docker: base.Docker, RuntimeRecords: base.RuntimeRecords, OperationLogs: base.OperationLogs, HealthEvents: base.HealthEvents, Leases: base.Leases}, + {Docker: base.Docker, RuntimeRecords: base.RuntimeRecords, OperationLogs: base.OperationLogs, HealthEvents: base.HealthEvents, Leases: base.Leases, Telemetry: base.Telemetry}, + } + for index, deps := range defectives { + _, err := reconcile.NewReconciler(deps) + require.Errorf(t, err, "case %d should fail", index) + } + + _, err = reconcile.NewReconciler(base) + require.NoError(t, err) +} + +// --- adopt ------------------------------------------------------------ + +func TestReconcileAdoptInsertsRecord(t *testing.T) { + h := newHarness(t) + r := h.build(t) + + startedAt := time.Date(2026, 4, 28, 11, 30, 0, 0, time.UTC) + summary := ownedSummary("game-a", "ctr-game-a", "galaxy/game:1.2.3", "running", startedAt.UnixMilli()) + + h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return([]ports.ContainerSummary{summary}, nil) + + r.Tick(context.Background()) + + upserts := h.records.Upserts() + require.Len(t, upserts, 1) + got := upserts[0] + assert.Equal(t, "game-a", got.GameID) + assert.Equal(t, runtime.StatusRunning, got.Status) + assert.Equal(t, "ctr-game-a", got.CurrentContainerID) + assert.Equal(t, "galaxy/game:1.2.3", got.CurrentImageRef) + assert.Equal(t, "http://galaxy-game-game-a:8080", got.EngineEndpoint) + assert.Equal(t, "/var/lib/galaxy/games/game-a", got.StatePath) + assert.Equal(t, "galaxy-net", got.DockerNetwork) + require.NotNil(t, got.StartedAt) + assert.True(t, got.StartedAt.Equal(startedAt)) + + appends := h.operationLogs.Appends() + require.Len(t, appends, 1) + assert.Equal(t, operation.OpKindReconcileAdopt, appends[0].OpKind) + assert.Equal(t, operation.OpSourceAutoReconcile, appends[0].OpSource) + assert.Equal(t, operation.OutcomeSuccess, appends[0].Outcome) + assert.Equal(t, "ctr-game-a", appends[0].ContainerID) + + assert.Equal(t, []string{"game-a:token-A"}, h.leases.Acquires()) + assert.Equal(t, []string{"game-a:token-A"}, h.leases.Releases()) + assert.Empty(t, h.healthEvents.Published(), "adopt does not publish health events") +} + +func TestReconcileAdoptFallsBackToInspectStartedAtWhenLabelMissing(t *testing.T) { + h := newHarness(t) + r := h.build(t) + + summary := ownedSummary("game-b", "ctr-game-b", "galaxy/game:1.0.0", "running", 0) + h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return([]ports.ContainerSummary{summary}, nil) + inspectStarted := time.Date(2026, 4, 28, 10, 0, 0, 0, time.UTC) + h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-b").Return(ports.ContainerInspect{ + ID: "ctr-game-b", + StartedAt: inspectStarted, + Status: "running", + }, nil) + + r.Tick(context.Background()) + + upserts := h.records.Upserts() + require.Len(t, upserts, 1) + require.NotNil(t, upserts[0].StartedAt) + assert.True(t, 
upserts[0].StartedAt.Equal(inspectStarted)) +} + +func TestReconcileAdoptSkipsWhenRecordAppearsConcurrently(t *testing.T) { + h := newHarness(t) + r := h.build(t) + + startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC) + h.records.Set(runningRecord("game-c", "ctr-game-c", startedAt)) + + // Docker reports the same game running, but the record already + // exists (start service won the race). The list pass sees the + // record, so adopt path is never entered. + summary := ownedSummary("game-c", "ctr-game-c", "galaxy/game:1.0.0", "running", startedAt.UnixMilli()) + h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return([]ports.ContainerSummary{summary}, nil) + + r.Tick(context.Background()) + + assert.Empty(t, h.records.Upserts()) + assert.Empty(t, h.operationLogs.Appends()) + assert.Empty(t, h.leases.Acquires(), "no mutation -> no lease acquired") +} + +func TestReconcileAdoptSkipsNonRunningContainer(t *testing.T) { + h := newHarness(t) + r := h.build(t) + + summary := ownedSummary("game-d", "ctr-game-d", "galaxy/game:1.0.0", "exited", time.Now().UnixMilli()) + h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return([]ports.ContainerSummary{summary}, nil) + + r.Tick(context.Background()) + + assert.Empty(t, h.records.Upserts(), "exited container without record is not adopted") + assert.Empty(t, h.leases.Acquires()) +} + +// --- dispose ---------------------------------------------------------- + +func TestReconcileDisposeMarksRemoved(t *testing.T) { + h := newHarness(t) + r := h.build(t) + + startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC) + h.records.Set(runningRecord("game-e", "ctr-game-e", startedAt)) + + h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return(nil, nil) + + r.Tick(context.Background()) + + updates := h.records.Updates() + require.Len(t, updates, 1) + assert.Equal(t, "game-e", updates[0].GameID) + assert.Equal(t, runtime.StatusRunning, updates[0].ExpectedFrom) + assert.Equal(t, "ctr-game-e", updates[0].ExpectedContainerID) + assert.Equal(t, runtime.StatusRemoved, updates[0].To) + + published := h.healthEvents.Published() + require.Len(t, published, 1) + assert.Equal(t, health.EventTypeContainerDisappeared, published[0].EventType) + assert.Equal(t, "game-e", published[0].GameID) + assert.Equal(t, "ctr-game-e", published[0].ContainerID) + assert.JSONEq(t, `{}`, string(published[0].Details)) + + appends := h.operationLogs.Appends() + require.Len(t, appends, 1) + assert.Equal(t, operation.OpKindReconcileDispose, appends[0].OpKind) + assert.Equal(t, operation.OpSourceAutoReconcile, appends[0].OpSource) +} + +func TestReconcileDisposeSkipsOnCASConflict(t *testing.T) { + h := newHarness(t) + r := h.build(t) + + startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC) + h.records.Set(runningRecord("game-f", "ctr-game-f", startedAt)) + h.records.updateStatusErr = runtime.ErrConflict + + h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return(nil, nil) + + r.Tick(context.Background()) + + assert.Empty(t, h.healthEvents.Published(), "no health event when CAS lost") + assert.Empty(t, h.operationLogs.Appends(), "no operation_log entry when CAS lost") +} + +func TestReconcileDisposeSkipsWhenStateChangedAfterReread(t *testing.T) { + h := newHarness(t) + r := h.build(t) + + // Running record observed by ListByStatus, but Get under the lease + // returns a record whose status has changed. 
+ startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC) + listed := runningRecord("game-g", "ctr-game-g", startedAt) + h.records.Set(listed) + + h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return(nil, nil) + + // Mutate the stored record to simulate concurrent stop completing + // between the list pass and the lease re-read. The fake's Get + // observes the mutated state. + h.records.mu.Lock() + stoppedAt := startedAt.Add(time.Minute) + listed.Status = runtime.StatusStopped + listed.StoppedAt = &stoppedAt + h.records.stored["game-g"] = listed + h.records.mu.Unlock() + + r.Tick(context.Background()) + + assert.Empty(t, h.records.Updates(), "re-read sees status != running -> skip") + assert.Empty(t, h.healthEvents.Published()) + assert.Empty(t, h.operationLogs.Appends()) +} + +// --- observed_exited -------------------------------------------------- + +func TestReconcileObservedExitedMarksStopped(t *testing.T) { + h := newHarness(t) + r := h.build(t) + + startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC) + h.records.Set(runningRecord("game-h", "ctr-game-h", startedAt)) + + summary := ownedSummary("game-h", "ctr-game-h", "galaxy/game:1.0.0", "exited", startedAt.UnixMilli()) + h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return([]ports.ContainerSummary{summary}, nil) + h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-h").Return(ports.ContainerInspect{ + ID: "ctr-game-h", + Status: "exited", + ExitCode: 137, + OOMKilled: false, + }, nil) + + r.Tick(context.Background()) + + updates := h.records.Updates() + require.Len(t, updates, 1) + assert.Equal(t, runtime.StatusRunning, updates[0].ExpectedFrom) + assert.Equal(t, "ctr-game-h", updates[0].ExpectedContainerID) + assert.Equal(t, runtime.StatusStopped, updates[0].To) + + published := h.healthEvents.Published() + require.Len(t, published, 1) + assert.Equal(t, health.EventTypeContainerExited, published[0].EventType) + var details struct { + ExitCode int `json:"exit_code"` + OOM bool `json:"oom"` + } + require.NoError(t, json.Unmarshal(published[0].Details, &details)) + assert.Equal(t, 137, details.ExitCode) + assert.False(t, details.OOM) + + assert.Empty(t, h.operationLogs.Appends(), "observed_exited writes no operation_log entry") +} + +// --- no-op paths ------------------------------------------------------ + +func TestReconcileNoDriftIsNoop(t *testing.T) { + h := newHarness(t) + r := h.build(t) + + startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC) + h.records.Set(runningRecord("game-i", "ctr-game-i", startedAt)) + + summary := ownedSummary("game-i", "ctr-game-i", "galaxy/game:1.0.0", "running", startedAt.UnixMilli()) + h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return([]ports.ContainerSummary{summary}, nil) + + r.Tick(context.Background()) + + assert.Empty(t, h.records.Upserts()) + assert.Empty(t, h.records.Updates()) + assert.Empty(t, h.healthEvents.Published()) + assert.Empty(t, h.operationLogs.Appends()) + assert.Empty(t, h.leases.Acquires()) +} + +func TestReconcileSkipsWhenContainerIDMismatch(t *testing.T) { + h := newHarness(t) + r := h.build(t) + + startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC) + h.records.Set(runningRecord("game-j", "ctr-old", startedAt)) + + // Docker reports the new container id; restart is in flight. 
+ summary := ownedSummary("game-j", "ctr-new", "galaxy/game:1.0.0", "running", startedAt.UnixMilli()) + h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return([]ports.ContainerSummary{summary}, nil) + + r.Tick(context.Background()) + + assert.Empty(t, h.records.Updates(), "id mismatch -> reconciler stays out of the way") + assert.Empty(t, h.healthEvents.Published()) +} + +// --- lease busy / errors ---------------------------------------------- + +func TestReconcileLeaseConflictSkipsGame(t *testing.T) { + h := newHarness(t) + h.leases.acquired = false + r := h.build(t) + + startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC) + h.records.Set(runningRecord("game-k", "ctr-game-k", startedAt)) + + h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return(nil, nil) + + r.Tick(context.Background()) + + assert.Empty(t, h.records.Updates(), "lease busy -> dispose skipped") + assert.Empty(t, h.healthEvents.Published()) + assert.Empty(t, h.leases.Releases(), "release not called when acquire returned false") +} + +func TestReconcileNowAbsorbsListError(t *testing.T) { + h := newHarness(t) + r := h.build(t) + + h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return(nil, errors.New("docker daemon down")) + + require.NoError(t, r.ReconcileNow(context.Background())) + assert.Empty(t, h.records.Updates()) + assert.Empty(t, h.records.Upserts()) +} + +func TestReconcileNowAbsorbsRecordsListError(t *testing.T) { + h := newHarness(t) + r := h.build(t) + h.records.listErr = errors.New("pg down") + + h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return(nil, nil) + + require.NoError(t, r.ReconcileNow(context.Background())) +} + +func TestReconcileNowReturnsContextError(t *testing.T) { + h := newHarness(t) + r := h.build(t) + + ctx, cancel := context.WithCancel(context.Background()) + cancel() + require.ErrorIs(t, r.ReconcileNow(ctx), context.Canceled) +} + +// --- Run lifecycle ---------------------------------------------------- + +func TestRunRespectsContextCancel(t *testing.T) { + h := newHarness(t) + r := h.build(t) + + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan error, 1) + go func() { done <- r.Run(ctx) }() + + cancel() + select { + case err := <-done: + assert.ErrorIs(t, err, context.Canceled) + case <-time.After(time.Second): + t.Fatalf("Run did not exit after cancel") + } +} + +func TestShutdownIsNoOp(t *testing.T) { + h := newHarness(t) + r := h.build(t) + require.NoError(t, r.Shutdown(context.Background())) +} + +// --- compile-time safety ---------------------------------------------- + +var ( + _ ports.RuntimeRecordStore = (*fakeRuntimeRecords)(nil) + _ ports.OperationLogStore = (*fakeOperationLogs)(nil) + _ ports.HealthEventPublisher = (*fakeHealthEvents)(nil) + _ ports.GameLeaseStore = (*fakeLeases)(nil) +) diff --git a/rtmanager/internal/worker/startjobsconsumer/consumer.go b/rtmanager/internal/worker/startjobsconsumer/consumer.go new file mode 100644 index 0000000..b622fcd --- /dev/null +++ b/rtmanager/internal/worker/startjobsconsumer/consumer.go @@ -0,0 +1,337 @@ +// Package startjobsconsumer drives the asynchronous half of the +// Lobby ↔ Runtime Manager start contract. The consumer XREADs from +// `runtime:start_jobs` (produced by Lobby), decodes the envelope frozen +// in `rtmanager/api/runtime-jobs-asyncapi.yaml`, calls the production +// start orchestrator, and publishes one `runtime:job_results` outcome +// per consumed envelope. 
+// +// Replay safety is provided by the start service: an idempotent re-run +// surfaces as `Outcome=success` with `error_code=replay_no_op`. The +// consumer copies the service Result fields into the `RuntimeJobResult` +// payload verbatim. Per-message decode and publish errors are logged +// and absorbed; the offset advances unconditionally so a single poison +// message cannot pin the loop. Design rationale is captured in +// `rtmanager/docs/workers.md`. +package startjobsconsumer + +import ( + "context" + "errors" + "fmt" + "log/slog" + "strconv" + "strings" + "time" + + "galaxy/rtmanager/internal/domain/operation" + "galaxy/rtmanager/internal/logging" + "galaxy/rtmanager/internal/ports" + "galaxy/rtmanager/internal/service/startruntime" + + "github.com/redis/go-redis/v9" +) + +// streamOffsetLabel identifies the start-jobs consumer in the stream +// offset store. The label stays stable when the underlying stream key +// is renamed via configuration. Matches the convention from +// `rtmanager/README.md §Persistence Layout > Redis runtime-coordination state`. +const streamOffsetLabel = "startjobs" + +// Wire field names of the `RuntimeStartJob` payload. Frozen by +// `rtmanager/api/runtime-jobs-asyncapi.yaml`; renaming any of them +// requires a coordinated contract change with Lobby. +const ( + fieldGameID = "game_id" + fieldImageRef = "image_ref" + fieldRequestedAtMS = "requested_at_ms" +) + +// StartService is the narrow surface the consumer needs from the start +// orchestrator. The concrete `*startruntime.Service` satisfies this +// interface and is wired in production. +type StartService interface { + Handle(ctx context.Context, input startruntime.Input) (startruntime.Result, error) +} + +// Config groups the dependencies required to construct a Consumer. +type Config struct { + // Client provides XREAD access to the start-jobs stream. + Client *redis.Client + + // Stream stores the Redis Streams key consumed by the worker. + Stream string + + // BlockTimeout bounds the blocking XREAD window. + BlockTimeout time.Duration + + // StartService executes the start lifecycle for each decoded + // envelope. + StartService StartService + + // JobResults publishes one outcome entry per processed envelope. + JobResults ports.JobResultPublisher + + // OffsetStore persists the last successfully processed entry id so + // the consumer survives restarts without replaying processed + // envelopes. + OffsetStore ports.StreamOffsetStore + + // Logger receives structured worker-level events. Defaults to + // `slog.Default` when nil. + Logger *slog.Logger +} + +// Consumer drives the start-jobs processing loop. +type Consumer struct { + client *redis.Client + stream string + blockTimeout time.Duration + startService StartService + jobResults ports.JobResultPublisher + offsetStore ports.StreamOffsetStore + logger *slog.Logger +} + +// NewConsumer constructs one Consumer from cfg. Validation errors +// surface the missing collaborator verbatim. 
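+//
+// Minimal wiring sketch (illustrative; the collaborator variables are
+// assumed to be constructed elsewhere and the timeout value is arbitrary):
+//
+//	consumer, err := NewConsumer(Config{
+//		Client:       redisClient,
+//		Stream:       "runtime:start_jobs",
+//		BlockTimeout: 5 * time.Second,
+//		StartService: startService,
+//		JobResults:   jobResults,
+//		OffsetStore:  offsets,
+//	})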
+func NewConsumer(cfg Config) (*Consumer, error) { + switch { + case cfg.Client == nil: + return nil, errors.New("new start jobs consumer: nil redis client") + case strings.TrimSpace(cfg.Stream) == "": + return nil, errors.New("new start jobs consumer: stream must not be empty") + case cfg.BlockTimeout <= 0: + return nil, errors.New("new start jobs consumer: block timeout must be positive") + case cfg.StartService == nil: + return nil, errors.New("new start jobs consumer: nil start service") + case cfg.JobResults == nil: + return nil, errors.New("new start jobs consumer: nil job results publisher") + case cfg.OffsetStore == nil: + return nil, errors.New("new start jobs consumer: nil offset store") + } + + logger := cfg.Logger + if logger == nil { + logger = slog.Default() + } + return &Consumer{ + client: cfg.Client, + stream: cfg.Stream, + blockTimeout: cfg.BlockTimeout, + startService: cfg.StartService, + jobResults: cfg.JobResults, + offsetStore: cfg.OffsetStore, + logger: logger.With("worker", "rtmanager.startjobs", "stream", cfg.Stream), + }, nil +} + +// Run drives the XREAD loop until ctx is cancelled. Per-message +// outcomes are absorbed by HandleMessage; the loop only exits on +// context cancellation or a fatal Redis / offset-store error. +func (consumer *Consumer) Run(ctx context.Context) error { + if consumer == nil || consumer.client == nil { + return errors.New("run start jobs consumer: nil consumer") + } + if ctx == nil { + return errors.New("run start jobs consumer: nil context") + } + if err := ctx.Err(); err != nil { + return err + } + + lastID, found, err := consumer.offsetStore.Load(ctx, streamOffsetLabel) + if err != nil { + return fmt.Errorf("run start jobs consumer: load offset: %w", err) + } + if !found { + lastID = "0-0" + } + + consumer.logger.Info("start jobs consumer started", + "block_timeout", consumer.blockTimeout.String(), + "start_entry_id", lastID, + ) + defer consumer.logger.Info("start jobs consumer stopped") + + for { + streams, err := consumer.client.XRead(ctx, &redis.XReadArgs{ + Streams: []string{consumer.stream, lastID}, + Count: 1, + Block: consumer.blockTimeout, + }).Result() + switch { + case err == nil: + for _, stream := range streams { + for _, message := range stream.Messages { + consumer.HandleMessage(ctx, message) + if err := consumer.offsetStore.Save(ctx, streamOffsetLabel, message.ID); err != nil { + return fmt.Errorf("run start jobs consumer: save offset: %w", err) + } + lastID = message.ID + } + } + case errors.Is(err, redis.Nil): + continue + case ctx.Err() != nil && (errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) || errors.Is(err, redis.ErrClosed)): + return ctx.Err() + case errors.Is(err, context.Canceled), errors.Is(err, context.DeadlineExceeded), errors.Is(err, redis.ErrClosed): + return fmt.Errorf("run start jobs consumer: %w", err) + default: + return fmt.Errorf("run start jobs consumer: %w", err) + } + } +} + +// Shutdown is a no-op; the consumer relies on context cancellation. +func (consumer *Consumer) Shutdown(ctx context.Context) error { + if ctx == nil { + return errors.New("shutdown start jobs consumer: nil context") + } + return nil +} + +// HandleMessage processes one Redis Stream message. Exported so tests +// can drive the consumer deterministically without spinning up a real +// XREAD loop. +// +// Per-message errors are logged and absorbed: the worker keeps running +// and the offset is allowed to advance. 
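+//
+// Processing order per message: decode the envelope, call
+// StartService.Handle, then publish exactly one JobResult; the first
+// failing step logs and returns, skipping the remaining steps.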
+func (consumer *Consumer) HandleMessage(ctx context.Context, message redis.XMessage) { + if consumer == nil { + return + } + + envelope, err := decodeStartJob(message) + if err != nil { + consumer.logger.WarnContext(ctx, "decode start job", + "stream_entry_id", message.ID, + "err", err.Error(), + ) + return + } + + input := startruntime.Input{ + GameID: envelope.GameID, + ImageRef: envelope.ImageRef, + OpSource: operation.OpSourceLobbyStream, + SourceRef: message.ID, + } + result, err := consumer.startService.Handle(ctx, input) + if err != nil { + consumer.logger.ErrorContext(ctx, "start service returned go-level error", + "stream_entry_id", message.ID, + "game_id", envelope.GameID, + "err", err.Error(), + ) + return + } + + jobResult := buildJobResult(envelope.GameID, result) + if err := consumer.jobResults.Publish(ctx, jobResult); err != nil { + consumer.logger.ErrorContext(ctx, "publish job result", + "stream_entry_id", message.ID, + "game_id", envelope.GameID, + "outcome", jobResult.Outcome, + "error_code", jobResult.ErrorCode, + "err", err.Error(), + ) + return + } + + logArgs := []any{ + "stream_entry_id", message.ID, + "game_id", envelope.GameID, + "outcome", jobResult.Outcome, + "error_code", jobResult.ErrorCode, + "requested_at_ms", envelope.RequestedAtMS, + } + logArgs = append(logArgs, logging.ContextAttrs(ctx)...) + consumer.logger.InfoContext(ctx, "start job processed", logArgs...) +} + +// startJobEnvelope stores the decoded shape of one `runtime:start_jobs` +// stream entry. +type startJobEnvelope struct { + GameID string + ImageRef string + RequestedAtMS int64 +} + +func decodeStartJob(message redis.XMessage) (startJobEnvelope, error) { + gameID := strings.TrimSpace(optionalString(message.Values, fieldGameID)) + if gameID == "" { + return startJobEnvelope{}, errors.New("missing game_id") + } + imageRef := strings.TrimSpace(optionalString(message.Values, fieldImageRef)) + if imageRef == "" { + return startJobEnvelope{}, errors.New("missing image_ref") + } + requestedAtMS, err := optionalInt64(message.Values, fieldRequestedAtMS) + if err != nil { + return startJobEnvelope{}, fmt.Errorf("invalid requested_at_ms: %w", err) + } + return startJobEnvelope{ + GameID: gameID, + ImageRef: imageRef, + RequestedAtMS: requestedAtMS, + }, nil +} + +// buildJobResult translates a startruntime.Result into the wire payload +// published on `runtime:job_results`. ContainerID and EngineEndpoint are +// taken from the service's Record on success / replay; on failure the +// service returns a zero Record and both fields stay empty per the +// AsyncAPI contract (required field, empty string is a valid value). 
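+//
+// Illustrative mapping (field values assumed): a success Result whose
+// Record carries CurrentContainerID "c-1" becomes
+// {outcome: "success", container_id: "c-1", engine_endpoint: ...},
+// while a failure Result becomes
+// {outcome: "failure", error_code: ..., error_message: ...} with both
+// container fields left empty.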
+func buildJobResult(gameID string, result startruntime.Result) ports.JobResult { + jobResult := ports.JobResult{ + GameID: gameID, + Outcome: string(result.Outcome), + ErrorCode: result.ErrorCode, + ErrorMessage: result.ErrorMessage, + } + if result.Outcome == operation.OutcomeSuccess { + jobResult.ContainerID = result.Record.CurrentContainerID + jobResult.EngineEndpoint = result.Record.EngineEndpoint + } + return jobResult +} + +func optionalString(values map[string]any, key string) string { + raw, ok := values[key] + if !ok { + return "" + } + switch typed := raw.(type) { + case string: + return typed + case []byte: + return string(typed) + default: + return "" + } +} + +func optionalInt64(values map[string]any, key string) (int64, error) { + raw, ok := values[key] + if !ok { + return 0, nil + } + var stringValue string + switch typed := raw.(type) { + case string: + stringValue = typed + case []byte: + stringValue = string(typed) + default: + return 0, fmt.Errorf("unsupported type %T", raw) + } + stringValue = strings.TrimSpace(stringValue) + if stringValue == "" { + return 0, nil + } + parsed, err := strconv.ParseInt(stringValue, 10, 64) + if err != nil { + return 0, err + } + return parsed, nil +} diff --git a/rtmanager/internal/worker/startjobsconsumer/consumer_test.go b/rtmanager/internal/worker/startjobsconsumer/consumer_test.go new file mode 100644 index 0000000..ba588a0 --- /dev/null +++ b/rtmanager/internal/worker/startjobsconsumer/consumer_test.go @@ -0,0 +1,631 @@ +package startjobsconsumer_test + +import ( + "context" + "errors" + "io" + "log/slog" + "strconv" + "sync" + "testing" + "time" + + "galaxy/notificationintent" + "galaxy/rtmanager/internal/adapters/docker/mocks" + "galaxy/rtmanager/internal/adapters/jobresultspublisher" + "galaxy/rtmanager/internal/adapters/redisstate/streamoffsets" + "galaxy/rtmanager/internal/config" + "galaxy/rtmanager/internal/domain/operation" + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/ports" + "galaxy/rtmanager/internal/service/startruntime" + "galaxy/rtmanager/internal/telemetry" + "galaxy/rtmanager/internal/worker/startjobsconsumer" + + "github.com/alicebob/miniredis/v2" + "github.com/redis/go-redis/v9" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" +) + +func silentLogger() *slog.Logger { + return slog.New(slog.NewTextHandler(io.Discard, nil)) +} + +type fakeStartService struct { + mu sync.Mutex + inputs []startruntime.Input + result startruntime.Result + err error + hook func(input startruntime.Input) (startruntime.Result, error) +} + +func (s *fakeStartService) Handle(_ context.Context, input startruntime.Input) (startruntime.Result, error) { + s.mu.Lock() + defer s.mu.Unlock() + s.inputs = append(s.inputs, input) + if s.hook != nil { + return s.hook(input) + } + return s.result, s.err +} + +func (s *fakeStartService) Inputs() []startruntime.Input { + s.mu.Lock() + defer s.mu.Unlock() + out := make([]startruntime.Input, len(s.inputs)) + copy(out, s.inputs) + return out +} + +type fakeJobResults struct { + mu sync.Mutex + published []ports.JobResult + publishErr error +} + +func (s *fakeJobResults) Publish(_ context.Context, result ports.JobResult) error { + s.mu.Lock() + defer s.mu.Unlock() + if s.publishErr != nil { + return s.publishErr + } + s.published = append(s.published, result) + return nil +} + +func (s *fakeJobResults) Published() []ports.JobResult { + s.mu.Lock() + defer s.mu.Unlock() + out := make([]ports.JobResult, len(s.published)) + 
copy(out, s.published) + return out +} + +type fakeOffsetStore struct { + mu sync.Mutex + offsets map[string]string + loadErr error + saveErr error +} + +func newFakeOffsetStore() *fakeOffsetStore { + return &fakeOffsetStore{offsets: map[string]string{}} +} + +func (s *fakeOffsetStore) Load(_ context.Context, label string) (string, bool, error) { + s.mu.Lock() + defer s.mu.Unlock() + if s.loadErr != nil { + return "", false, s.loadErr + } + value, ok := s.offsets[label] + return value, ok, nil +} + +func (s *fakeOffsetStore) Save(_ context.Context, label, entryID string) error { + s.mu.Lock() + defer s.mu.Unlock() + if s.saveErr != nil { + return s.saveErr + } + s.offsets[label] = entryID + return nil +} + +func (s *fakeOffsetStore) Get(label string) (string, bool) { + s.mu.Lock() + defer s.mu.Unlock() + value, ok := s.offsets[label] + return value, ok +} + +type harness struct { + consumer *startjobsconsumer.Consumer + starts *fakeStartService + results *fakeJobResults + offsets *fakeOffsetStore + stream string + server *miniredis.Miniredis + client *redis.Client +} + +func newHarness(t *testing.T) *harness { + t.Helper() + server := miniredis.RunT(t) + client := redis.NewClient(&redis.Options{Addr: server.Addr()}) + t.Cleanup(func() { _ = client.Close() }) + + starts := &fakeStartService{} + results := &fakeJobResults{} + offsets := newFakeOffsetStore() + stream := "runtime:start_jobs" + + consumer, err := startjobsconsumer.NewConsumer(startjobsconsumer.Config{ + Client: client, + Stream: stream, + BlockTimeout: 50 * time.Millisecond, + StartService: starts, + JobResults: results, + OffsetStore: offsets, + Logger: silentLogger(), + }) + require.NoError(t, err) + + return &harness{ + consumer: consumer, + starts: starts, + results: results, + offsets: offsets, + stream: stream, + server: server, + client: client, + } +} + +func startMessage(id, gameID, imageRef string, requestedAtMS int64) redis.XMessage { + return redis.XMessage{ + ID: id, + Values: map[string]any{ + "game_id": gameID, + "image_ref": imageRef, + "requested_at_ms": strconv.FormatInt(requestedAtMS, 10), + }, + } +} + +func TestNewConsumerRejectsMissingDeps(t *testing.T) { + server := miniredis.RunT(t) + client := redis.NewClient(&redis.Options{Addr: server.Addr()}) + t.Cleanup(func() { _ = client.Close() }) + + cases := []startjobsconsumer.Config{ + {}, + {Client: client}, + {Client: client, Stream: "runtime:start_jobs"}, + {Client: client, Stream: "runtime:start_jobs", BlockTimeout: time.Second}, + {Client: client, Stream: "runtime:start_jobs", BlockTimeout: time.Second, StartService: &fakeStartService{}}, + {Client: client, Stream: "runtime:start_jobs", BlockTimeout: time.Second, StartService: &fakeStartService{}, JobResults: &fakeJobResults{}}, + } + for index, cfg := range cases { + _, err := startjobsconsumer.NewConsumer(cfg) + require.Errorf(t, err, "case %d should fail", index) + } +} + +func TestHandleMessageSuccessPublishesSuccessResult(t *testing.T) { + h := newHarness(t) + h.starts.result = startruntime.Result{ + Record: runtime.RuntimeRecord{ + GameID: "game-1", + Status: runtime.StatusRunning, + CurrentContainerID: "c-1", + EngineEndpoint: "http://galaxy-game-game-1:8080", + }, + Outcome: operation.OutcomeSuccess, + } + + h.consumer.HandleMessage(context.Background(), startMessage("100-0", "game-1", "galaxy/game:1.0.0", 1700)) + + inputs := h.starts.Inputs() + require.Len(t, inputs, 1) + assert.Equal(t, "game-1", inputs[0].GameID) + assert.Equal(t, "galaxy/game:1.0.0", inputs[0].ImageRef) + assert.Equal(t, 
operation.OpSourceLobbyStream, inputs[0].OpSource) + assert.Equal(t, "100-0", inputs[0].SourceRef) + + published := h.results.Published() + require.Len(t, published, 1) + assert.Equal(t, ports.JobResult{ + GameID: "game-1", + Outcome: ports.JobOutcomeSuccess, + ContainerID: "c-1", + EngineEndpoint: "http://galaxy-game-game-1:8080", + }, published[0]) +} + +func TestHandleMessageFailurePublishesFailureResult(t *testing.T) { + h := newHarness(t) + h.starts.result = startruntime.Result{ + Outcome: operation.OutcomeFailure, + ErrorCode: startruntime.ErrorCodeImagePullFailed, + ErrorMessage: "manifest unknown", + } + + h.consumer.HandleMessage(context.Background(), startMessage("101-0", "game-2", "galaxy/game:bad", 1700)) + + published := h.results.Published() + require.Len(t, published, 1) + assert.Equal(t, ports.JobResult{ + GameID: "game-2", + Outcome: ports.JobOutcomeFailure, + ErrorCode: "image_pull_failed", + ErrorMessage: "manifest unknown", + }, published[0]) +} + +func TestHandleMessageReplayNoOpKeepsContainerAndEndpoint(t *testing.T) { + h := newHarness(t) + h.starts.result = startruntime.Result{ + Record: runtime.RuntimeRecord{ + GameID: "game-3", + Status: runtime.StatusRunning, + CurrentContainerID: "c-3", + EngineEndpoint: "http://galaxy-game-game-3:8080", + }, + Outcome: operation.OutcomeSuccess, + ErrorCode: startruntime.ErrorCodeReplayNoOp, + } + + h.consumer.HandleMessage(context.Background(), startMessage("102-0", "game-3", "galaxy/game:1.0.0", 1700)) + + published := h.results.Published() + require.Len(t, published, 1) + assert.Equal(t, ports.JobResult{ + GameID: "game-3", + Outcome: ports.JobOutcomeSuccess, + ContainerID: "c-3", + EngineEndpoint: "http://galaxy-game-game-3:8080", + ErrorCode: "replay_no_op", + }, published[0]) +} + +func TestHandleMessageMalformedEnvelopesAreAbsorbed(t *testing.T) { + h := newHarness(t) + + cases := []redis.XMessage{ + {ID: "200-0", Values: map[string]any{"image_ref": "galaxy/game:1.0.0", "requested_at_ms": "1"}}, + {ID: "200-1", Values: map[string]any{"game_id": " ", "image_ref": "galaxy/game:1.0.0", "requested_at_ms": "1"}}, + {ID: "200-2", Values: map[string]any{"game_id": "game-x", "requested_at_ms": "1"}}, + {ID: "200-3", Values: map[string]any{"game_id": "game-x", "image_ref": " ", "requested_at_ms": "1"}}, + {ID: "200-4", Values: map[string]any{"game_id": "game-x", "image_ref": "galaxy/game:1.0.0", "requested_at_ms": "not-a-number"}}, + } + for _, msg := range cases { + h.consumer.HandleMessage(context.Background(), msg) + } + + assert.Empty(t, h.starts.Inputs(), "malformed envelopes must not reach the start service") + assert.Empty(t, h.results.Published(), "malformed envelopes must not produce job results") +} + +func TestHandleMessagePublishFailureIsAbsorbed(t *testing.T) { + h := newHarness(t) + h.starts.result = startruntime.Result{Outcome: operation.OutcomeFailure, ErrorCode: "internal_error"} + h.results.publishErr = errors.New("redis transient") + + h.consumer.HandleMessage(context.Background(), startMessage("300-0", "game-x", "galaxy/game:1.0.0", 1700)) + + require.Len(t, h.starts.Inputs(), 1, "service still runs even when publish fails") +} + +func TestHandleMessageGoLevelErrorIsAbsorbed(t *testing.T) { + h := newHarness(t) + h.starts.err = errors.New("nil ctx") + + h.consumer.HandleMessage(context.Background(), startMessage("400-0", "game-y", "galaxy/game:1.0.0", 1700)) + + assert.Empty(t, h.results.Published(), "go-level service errors must not surface as job results") +} + +func TestRunAdvancesOffsetPerMessage(t 
*testing.T) { + h := newHarness(t) + h.starts.result = startruntime.Result{ + Record: runtime.RuntimeRecord{ + GameID: "game-5", + Status: runtime.StatusRunning, + CurrentContainerID: "c-5", + EngineEndpoint: "http://galaxy-game-game-5:8080", + }, + Outcome: operation.OutcomeSuccess, + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + done := make(chan error, 1) + go func() { done <- h.consumer.Run(ctx) }() + + mustXAdd(t, h.client, h.stream, "game-5", "galaxy/game:1.0.0", 1) + mustXAdd(t, h.client, h.stream, "game-5", "galaxy/game:1.0.0", 2) + + require.Eventually(t, func() bool { + return len(h.results.Published()) == 2 + }, time.Second, 10*time.Millisecond, "consumer must produce one job result per envelope") + + cancel() + require.Eventually(t, func() bool { + select { + case <-done: + return true + default: + return false + } + }, time.Second, 10*time.Millisecond, "Run must exit after context cancel") + + id, ok := h.offsets.Get("startjobs") + require.True(t, ok, "offset must be persisted after the run loop processed messages") + assert.NotEmpty(t, id, "offset entry id must not be empty") +} + +func TestRunResumesFromPersistedOffset(t *testing.T) { + h := newHarness(t) + h.starts.result = startruntime.Result{ + Record: runtime.RuntimeRecord{ + GameID: "game-6", + Status: runtime.StatusRunning, + CurrentContainerID: "c-6", + EngineEndpoint: "http://galaxy-game-game-6:8080", + }, + Outcome: operation.OutcomeSuccess, + } + + preID := mustXAdd(t, h.client, h.stream, "game-6", "galaxy/game:1.0.0", 1) + require.NoError(t, h.offsets.Save(context.Background(), "startjobs", preID)) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + done := make(chan error, 1) + go func() { done <- h.consumer.Run(ctx) }() + + mustXAdd(t, h.client, h.stream, "game-6", "galaxy/game:1.0.0", 2) + + require.Eventually(t, func() bool { + return len(h.results.Published()) == 1 + }, time.Second, 10*time.Millisecond, "consumer must skip the pre-existing entry and process only the new one") + + cancel() + <-done +} + +func TestRunExitsImmediatelyOnAlreadyCancelledContext(t *testing.T) { + h := newHarness(t) + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + err := h.consumer.Run(ctx) + require.ErrorIs(t, err, context.Canceled) + assert.Empty(t, h.starts.Inputs()) + assert.Empty(t, h.results.Published()) +} + +func mustXAdd(t *testing.T, client *redis.Client, stream, gameID, imageRef string, requestedAtMS int64) string { + t.Helper() + id, err := client.XAdd(context.Background(), &redis.XAddArgs{ + Stream: stream, + Values: map[string]any{ + "game_id": gameID, + "image_ref": imageRef, + "requested_at_ms": strconv.FormatInt(requestedAtMS, 10), + }, + }).Result() + require.NoError(t, err) + return id +} + +// --- in-memory fakes for the roundtrip integration test ---------------------- + +type memoryRecords struct { + mu sync.Mutex + store map[string]runtime.RuntimeRecord +} + +func newMemoryRecords() *memoryRecords { + return &memoryRecords{store: map[string]runtime.RuntimeRecord{}} +} + +func (s *memoryRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) { + s.mu.Lock() + defer s.mu.Unlock() + record, ok := s.store[gameID] + if !ok { + return runtime.RuntimeRecord{}, runtime.ErrNotFound + } + return record, nil +} + +func (s *memoryRecords) Upsert(_ context.Context, record runtime.RuntimeRecord) error { + s.mu.Lock() + defer s.mu.Unlock() + s.store[record.GameID] = record + return nil +} + +func (s *memoryRecords) 
UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error { + return errors.New("not used in start integration test") +} + +func (s *memoryRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) { + return nil, errors.New("not used in start integration test") +} + +func (s *memoryRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) { + return nil, errors.New("not used in start integration test") +} + +type memoryOperationLogs struct { + mu sync.Mutex + entries []operation.OperationEntry +} + +func (s *memoryOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) { + s.mu.Lock() + defer s.mu.Unlock() + s.entries = append(s.entries, entry) + return int64(len(s.entries)), nil +} + +func (s *memoryOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) { + return nil, errors.New("not used in start integration test") +} + +type memoryLeases struct{} + +func (l *memoryLeases) TryAcquire(_ context.Context, _, _ string, _ time.Duration) (bool, error) { + return true, nil +} + +func (l *memoryLeases) Release(_ context.Context, _, _ string) error { + return nil +} + +type memoryHealthEvents struct{} + +func (h *memoryHealthEvents) Publish(_ context.Context, _ ports.HealthEventEnvelope) error { + return nil +} + +type memoryNotifications struct{} + +func (n *memoryNotifications) Publish(_ context.Context, _ notificationintent.Intent) error { + return nil +} + +// TestRoundTripStartJobThroughRealServiceAndPublisher exercises the +// Lobby → RTM → Lobby contract end-to-end inside one process: an XADD +// in the documented `runtime:start_jobs` shape is consumed, the real +// `startruntime.Service` runs against an in-memory fake stack and a +// gomock-backed Docker port, the real `jobresultspublisher` writes to +// `runtime:job_results`, and the test asserts the symmetric wire shape. +// +// A second XADD of the same envelope must surface as +// `error_code=replay_no_op` per the AsyncAPI replay-safety rule. 
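+//
+// Flow under test: XADD onto `runtime:start_jobs` -> consumer decodes
+// the envelope -> startruntime.Service pulls and runs the image via the
+// mocked Docker port -> jobresultspublisher XADDs the symmetric entry
+// onto `runtime:job_results`.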
+func TestRoundTripStartJobThroughRealServiceAndPublisher(t *testing.T) { + ctrl := gomock.NewController(t) + t.Cleanup(ctrl.Finish) + + server := miniredis.RunT(t) + client := redis.NewClient(&redis.Options{Addr: server.Addr()}) + t.Cleanup(func() { _ = client.Close() }) + + now := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC) + records := newMemoryRecords() + dockerMock := mocks.NewMockDockerClient(ctrl) + + dockerMock.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil).Times(1) + dockerMock.EXPECT().PullImage(gomock.Any(), "galaxy/game:1.0.0", ports.PullPolicy(config.ImagePullPolicyIfMissing)).Return(nil).Times(1) + dockerMock.EXPECT().InspectImage(gomock.Any(), "galaxy/game:1.0.0").Return(ports.ImageInspect{ + Ref: "galaxy/game:1.0.0", + Labels: map[string]string{}, + }, nil).Times(1) + dockerMock.EXPECT().Run(gomock.Any(), gomock.Any()).Return(ports.RunResult{ + ContainerID: "ctr-roundtrip", + EngineEndpoint: "http://galaxy-game-game-1:8080", + StartedAt: now, + }, nil).Times(1) + + telemetryRuntime, err := telemetry.NewWithProviders(nil, nil) + require.NoError(t, err) + + containerCfg := config.ContainerConfig{ + DefaultCPUQuota: 1.0, + DefaultMemory: "512m", + DefaultPIDsLimit: 512, + StopTimeout: 30 * time.Second, + Retention: 30 * 24 * time.Hour, + EngineStateMountPath: "/var/lib/galaxy-game", + EngineStateEnvName: "GAME_STATE_PATH", + GameStateDirMode: 0o750, + GameStateRoot: "/var/lib/galaxy/games", + } + dockerCfg := config.DockerConfig{ + Host: "unix:///var/run/docker.sock", + Network: "galaxy-net", + LogDriver: "json-file", + PullPolicy: config.ImagePullPolicyIfMissing, + } + coordinationCfg := config.CoordinationConfig{GameLeaseTTL: time.Minute} + + startService, err := startruntime.NewService(startruntime.Dependencies{ + RuntimeRecords: records, + OperationLogs: &memoryOperationLogs{}, + Docker: dockerMock, + Leases: &memoryLeases{}, + HealthEvents: &memoryHealthEvents{}, + Notifications: &memoryNotifications{}, + Container: containerCfg, + DockerCfg: dockerCfg, + Coordination: coordinationCfg, + Telemetry: telemetryRuntime, + Logger: silentLogger(), + Clock: func() time.Time { return now }, + NewToken: func() string { return "token-roundtrip" }, + PrepareStateDir: func(_ string) (string, error) { + return "/var/lib/galaxy/games/game-1", nil + }, + }) + require.NoError(t, err) + + publisher, err := jobresultspublisher.NewPublisher(jobresultspublisher.Config{ + Client: client, + Stream: "runtime:job_results", + }) + require.NoError(t, err) + + offsetStore, err := streamoffsets.New(streamoffsets.Config{Client: client}) + require.NoError(t, err) + + consumer, err := startjobsconsumer.NewConsumer(startjobsconsumer.Config{ + Client: client, + Stream: "runtime:start_jobs", + BlockTimeout: 50 * time.Millisecond, + StartService: startService, + JobResults: publisher, + OffsetStore: offsetStore, + Logger: silentLogger(), + }) + require.NoError(t, err) + + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + + done := make(chan error, 1) + go func() { done <- consumer.Run(ctx) }() + + mustXAdd(t, client, "runtime:start_jobs", "game-1", "galaxy/game:1.0.0", 1700) + + require.Eventually(t, func() bool { + entries, err := client.XRange(ctx, "runtime:job_results", "-", "+").Result() + return err == nil && len(entries) == 1 + }, 2*time.Second, 20*time.Millisecond, "first XADD must produce one job result entry") + + entries, err := client.XRange(ctx, "runtime:job_results", "-", "+").Result() + require.NoError(t, err) + require.Len(t, entries, 1) + values 
:= entries[0].Values + assert.Equal(t, "game-1", values["game_id"]) + assert.Equal(t, "success", values["outcome"]) + assert.Equal(t, "ctr-roundtrip", values["container_id"]) + assert.Equal(t, "http://galaxy-game-game-1:8080", values["engine_endpoint"]) + assert.Equal(t, "", values["error_code"], "fresh start must publish empty error_code") + assert.Equal(t, "", values["error_message"]) + + // Replay: the same envelope must surface as success/replay_no_op + // because the runtime record now reports `running` with the same + // image_ref. The Docker mock has no further expectations, so a + // second pull/run would fail the test. + mustXAdd(t, client, "runtime:start_jobs", "game-1", "galaxy/game:1.0.0", 1701) + + require.Eventually(t, func() bool { + entries, err := client.XRange(ctx, "runtime:job_results", "-", "+").Result() + return err == nil && len(entries) == 2 + }, 2*time.Second, 20*time.Millisecond, "second XADD must produce a replay_no_op job result") + + entries, err = client.XRange(ctx, "runtime:job_results", "-", "+").Result() + require.NoError(t, err) + require.Len(t, entries, 2) + replay := entries[1].Values + assert.Equal(t, "game-1", replay["game_id"]) + assert.Equal(t, "success", replay["outcome"]) + assert.Equal(t, "ctr-roundtrip", replay["container_id"]) + assert.Equal(t, "http://galaxy-game-game-1:8080", replay["engine_endpoint"]) + assert.Equal(t, "replay_no_op", replay["error_code"]) + assert.Equal(t, "", replay["error_message"]) + + cancel() + select { + case <-done: + case <-time.After(time.Second): + t.Fatal("consumer Run did not exit after context cancel") + } +} diff --git a/rtmanager/internal/worker/stopjobsconsumer/consumer.go b/rtmanager/internal/worker/stopjobsconsumer/consumer.go new file mode 100644 index 0000000..884517e --- /dev/null +++ b/rtmanager/internal/worker/stopjobsconsumer/consumer.go @@ -0,0 +1,332 @@ +// Package stopjobsconsumer drives the asynchronous half of the +// Lobby ↔ Runtime Manager stop contract. The consumer XREADs from +// `runtime:stop_jobs` (produced by Lobby), decodes the envelope frozen +// in `rtmanager/api/runtime-jobs-asyncapi.yaml`, calls the production +// stop orchestrator, and publishes one `runtime:job_results` outcome +// per consumed envelope. +// +// Replay safety: the stop service surfaces an already-stopped or +// already-removed record as `Outcome=success` with +// `error_code=replay_no_op`. The consumer copies the result fields +// into the wire payload verbatim. Per-message decode and publish +// errors are logged and absorbed; the offset advances unconditionally +// so a single poison message cannot pin the loop. Design rationale is +// captured in `rtmanager/docs/workers.md`. +package stopjobsconsumer + +import ( + "context" + "errors" + "fmt" + "log/slog" + "strconv" + "strings" + "time" + + "galaxy/rtmanager/internal/domain/operation" + "galaxy/rtmanager/internal/logging" + "galaxy/rtmanager/internal/ports" + "galaxy/rtmanager/internal/service/stopruntime" + + "github.com/redis/go-redis/v9" +) + +// streamOffsetLabel identifies the stop-jobs consumer in the stream +// offset store. Matches the convention from +// `rtmanager/README.md §Persistence Layout > Redis runtime-coordination state`. +const streamOffsetLabel = "stopjobs" + +// Wire field names of the `RuntimeStopJob` payload. Frozen by +// `rtmanager/api/runtime-jobs-asyncapi.yaml`. 
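+//
+// Illustrative producer entry (field values assumed):
+//
+//	XADD runtime:stop_jobs * game_id game-1 reason cancelled requested_at_ms 1700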
+const ( + fieldGameID = "game_id" + fieldReason = "reason" + fieldRequestedAtMS = "requested_at_ms" +) + +// StopService is the narrow surface the consumer needs from the stop +// orchestrator. The concrete `*stopruntime.Service` satisfies this +// interface and is wired in production. +type StopService interface { + Handle(ctx context.Context, input stopruntime.Input) (stopruntime.Result, error) +} + +// Config groups the dependencies required to construct a Consumer. +type Config struct { + // Client provides XREAD access to the stop-jobs stream. + Client *redis.Client + + // Stream stores the Redis Streams key consumed by the worker. + Stream string + + // BlockTimeout bounds the blocking XREAD window. + BlockTimeout time.Duration + + // StopService executes the stop lifecycle for each decoded envelope. + StopService StopService + + // JobResults publishes one outcome entry per processed envelope. + JobResults ports.JobResultPublisher + + // OffsetStore persists the last successfully processed entry id so + // the consumer survives restarts without replaying processed + // envelopes. + OffsetStore ports.StreamOffsetStore + + // Logger receives structured worker-level events. Defaults to + // `slog.Default` when nil. + Logger *slog.Logger +} + +// Consumer drives the stop-jobs processing loop. +type Consumer struct { + client *redis.Client + stream string + blockTimeout time.Duration + stopService StopService + jobResults ports.JobResultPublisher + offsetStore ports.StreamOffsetStore + logger *slog.Logger +} + +// NewConsumer constructs one Consumer from cfg. +func NewConsumer(cfg Config) (*Consumer, error) { + switch { + case cfg.Client == nil: + return nil, errors.New("new stop jobs consumer: nil redis client") + case strings.TrimSpace(cfg.Stream) == "": + return nil, errors.New("new stop jobs consumer: stream must not be empty") + case cfg.BlockTimeout <= 0: + return nil, errors.New("new stop jobs consumer: block timeout must be positive") + case cfg.StopService == nil: + return nil, errors.New("new stop jobs consumer: nil stop service") + case cfg.JobResults == nil: + return nil, errors.New("new stop jobs consumer: nil job results publisher") + case cfg.OffsetStore == nil: + return nil, errors.New("new stop jobs consumer: nil offset store") + } + + logger := cfg.Logger + if logger == nil { + logger = slog.Default() + } + return &Consumer{ + client: cfg.Client, + stream: cfg.Stream, + blockTimeout: cfg.BlockTimeout, + stopService: cfg.StopService, + jobResults: cfg.JobResults, + offsetStore: cfg.OffsetStore, + logger: logger.With("worker", "rtmanager.stopjobs", "stream", cfg.Stream), + }, nil +} + +// Run drives the XREAD loop until ctx is cancelled. 
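+// Per-message outcomes are absorbed by HandleMessage; the loop only
+// exits on context cancellation or a fatal Redis / offset-store error.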
+func (consumer *Consumer) Run(ctx context.Context) error { + if consumer == nil || consumer.client == nil { + return errors.New("run stop jobs consumer: nil consumer") + } + if ctx == nil { + return errors.New("run stop jobs consumer: nil context") + } + if err := ctx.Err(); err != nil { + return err + } + + lastID, found, err := consumer.offsetStore.Load(ctx, streamOffsetLabel) + if err != nil { + return fmt.Errorf("run stop jobs consumer: load offset: %w", err) + } + if !found { + lastID = "0-0" + } + + consumer.logger.Info("stop jobs consumer started", + "block_timeout", consumer.blockTimeout.String(), + "start_entry_id", lastID, + ) + defer consumer.logger.Info("stop jobs consumer stopped") + + for { + streams, err := consumer.client.XRead(ctx, &redis.XReadArgs{ + Streams: []string{consumer.stream, lastID}, + Count: 1, + Block: consumer.blockTimeout, + }).Result() + switch { + case err == nil: + for _, stream := range streams { + for _, message := range stream.Messages { + consumer.HandleMessage(ctx, message) + if err := consumer.offsetStore.Save(ctx, streamOffsetLabel, message.ID); err != nil { + return fmt.Errorf("run stop jobs consumer: save offset: %w", err) + } + lastID = message.ID + } + } + case errors.Is(err, redis.Nil): + continue + case ctx.Err() != nil && (errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) || errors.Is(err, redis.ErrClosed)): + return ctx.Err() + case errors.Is(err, context.Canceled), errors.Is(err, context.DeadlineExceeded), errors.Is(err, redis.ErrClosed): + return fmt.Errorf("run stop jobs consumer: %w", err) + default: + return fmt.Errorf("run stop jobs consumer: %w", err) + } + } +} + +// Shutdown is a no-op; the consumer relies on context cancellation. +func (consumer *Consumer) Shutdown(ctx context.Context) error { + if ctx == nil { + return errors.New("shutdown stop jobs consumer: nil context") + } + return nil +} + +// HandleMessage processes one Redis Stream message. Exported so tests +// can drive the consumer deterministically without spinning up a real +// XREAD loop. +func (consumer *Consumer) HandleMessage(ctx context.Context, message redis.XMessage) { + if consumer == nil { + return + } + + envelope, err := decodeStopJob(message) + if err != nil { + consumer.logger.WarnContext(ctx, "decode stop job", + "stream_entry_id", message.ID, + "err", err.Error(), + ) + return + } + + input := stopruntime.Input{ + GameID: envelope.GameID, + Reason: envelope.Reason, + OpSource: operation.OpSourceLobbyStream, + SourceRef: message.ID, + } + result, err := consumer.stopService.Handle(ctx, input) + if err != nil { + consumer.logger.ErrorContext(ctx, "stop service returned go-level error", + "stream_entry_id", message.ID, + "game_id", envelope.GameID, + "err", err.Error(), + ) + return + } + + jobResult := buildJobResult(envelope.GameID, result) + if err := consumer.jobResults.Publish(ctx, jobResult); err != nil { + consumer.logger.ErrorContext(ctx, "publish job result", + "stream_entry_id", message.ID, + "game_id", envelope.GameID, + "outcome", jobResult.Outcome, + "error_code", jobResult.ErrorCode, + "err", err.Error(), + ) + return + } + + logArgs := []any{ + "stream_entry_id", message.ID, + "game_id", envelope.GameID, + "reason", string(envelope.Reason), + "outcome", jobResult.Outcome, + "error_code", jobResult.ErrorCode, + "requested_at_ms", envelope.RequestedAtMS, + } + logArgs = append(logArgs, logging.ContextAttrs(ctx)...) + consumer.logger.InfoContext(ctx, "stop job processed", logArgs...) 
+} + +// stopJobEnvelope stores the decoded shape of one `runtime:stop_jobs` +// stream entry. +type stopJobEnvelope struct { + GameID string + Reason stopruntime.StopReason + RequestedAtMS int64 +} + +func decodeStopJob(message redis.XMessage) (stopJobEnvelope, error) { + gameID := strings.TrimSpace(optionalString(message.Values, fieldGameID)) + if gameID == "" { + return stopJobEnvelope{}, errors.New("missing game_id") + } + reasonRaw := strings.TrimSpace(optionalString(message.Values, fieldReason)) + if reasonRaw == "" { + return stopJobEnvelope{}, errors.New("missing reason") + } + reason := stopruntime.StopReason(reasonRaw) + if !reason.IsKnown() { + return stopJobEnvelope{}, fmt.Errorf("unsupported reason %q", reasonRaw) + } + requestedAtMS, err := optionalInt64(message.Values, fieldRequestedAtMS) + if err != nil { + return stopJobEnvelope{}, fmt.Errorf("invalid requested_at_ms: %w", err) + } + return stopJobEnvelope{ + GameID: gameID, + Reason: reason, + RequestedAtMS: requestedAtMS, + }, nil +} + +// buildJobResult translates a stopruntime.Result into the wire payload +// published on `runtime:job_results`. Stop replays for `status=removed` +// records carry an empty `CurrentContainerID`; the consumer publishes +// the empty fields verbatim, which the AsyncAPI contract permits. +func buildJobResult(gameID string, result stopruntime.Result) ports.JobResult { + jobResult := ports.JobResult{ + GameID: gameID, + Outcome: string(result.Outcome), + ErrorCode: result.ErrorCode, + ErrorMessage: result.ErrorMessage, + } + if result.Outcome == operation.OutcomeSuccess { + jobResult.ContainerID = result.Record.CurrentContainerID + jobResult.EngineEndpoint = result.Record.EngineEndpoint + } + return jobResult +} + +func optionalString(values map[string]any, key string) string { + raw, ok := values[key] + if !ok { + return "" + } + switch typed := raw.(type) { + case string: + return typed + case []byte: + return string(typed) + default: + return "" + } +} + +func optionalInt64(values map[string]any, key string) (int64, error) { + raw, ok := values[key] + if !ok { + return 0, nil + } + var stringValue string + switch typed := raw.(type) { + case string: + stringValue = typed + case []byte: + stringValue = string(typed) + default: + return 0, fmt.Errorf("unsupported type %T", raw) + } + stringValue = strings.TrimSpace(stringValue) + if stringValue == "" { + return 0, nil + } + parsed, err := strconv.ParseInt(stringValue, 10, 64) + if err != nil { + return 0, err + } + return parsed, nil +} diff --git a/rtmanager/internal/worker/stopjobsconsumer/consumer_test.go b/rtmanager/internal/worker/stopjobsconsumer/consumer_test.go new file mode 100644 index 0000000..3ca9900 --- /dev/null +++ b/rtmanager/internal/worker/stopjobsconsumer/consumer_test.go @@ -0,0 +1,357 @@ +package stopjobsconsumer_test + +import ( + "context" + "errors" + "io" + "log/slog" + "strconv" + "sync" + "testing" + "time" + + "galaxy/rtmanager/internal/domain/operation" + "galaxy/rtmanager/internal/domain/runtime" + "galaxy/rtmanager/internal/ports" + "galaxy/rtmanager/internal/service/startruntime" + "galaxy/rtmanager/internal/service/stopruntime" + "galaxy/rtmanager/internal/worker/stopjobsconsumer" + + "github.com/alicebob/miniredis/v2" + "github.com/redis/go-redis/v9" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func silentLogger() *slog.Logger { + return slog.New(slog.NewTextHandler(io.Discard, nil)) +} + +type fakeStopService struct { + mu sync.Mutex + inputs []stopruntime.Input + 
result stopruntime.Result + err error +} + +func (s *fakeStopService) Handle(_ context.Context, input stopruntime.Input) (stopruntime.Result, error) { + s.mu.Lock() + defer s.mu.Unlock() + s.inputs = append(s.inputs, input) + return s.result, s.err +} + +func (s *fakeStopService) Inputs() []stopruntime.Input { + s.mu.Lock() + defer s.mu.Unlock() + out := make([]stopruntime.Input, len(s.inputs)) + copy(out, s.inputs) + return out +} + +type fakeJobResults struct { + mu sync.Mutex + published []ports.JobResult + publishErr error +} + +func (s *fakeJobResults) Publish(_ context.Context, result ports.JobResult) error { + s.mu.Lock() + defer s.mu.Unlock() + if s.publishErr != nil { + return s.publishErr + } + s.published = append(s.published, result) + return nil +} + +func (s *fakeJobResults) Published() []ports.JobResult { + s.mu.Lock() + defer s.mu.Unlock() + out := make([]ports.JobResult, len(s.published)) + copy(out, s.published) + return out +} + +type fakeOffsetStore struct { + mu sync.Mutex + offsets map[string]string +} + +func newFakeOffsetStore() *fakeOffsetStore { + return &fakeOffsetStore{offsets: map[string]string{}} +} + +func (s *fakeOffsetStore) Load(_ context.Context, label string) (string, bool, error) { + s.mu.Lock() + defer s.mu.Unlock() + value, ok := s.offsets[label] + return value, ok, nil +} + +func (s *fakeOffsetStore) Save(_ context.Context, label, entryID string) error { + s.mu.Lock() + defer s.mu.Unlock() + s.offsets[label] = entryID + return nil +} + +func (s *fakeOffsetStore) Get(label string) (string, bool) { + s.mu.Lock() + defer s.mu.Unlock() + value, ok := s.offsets[label] + return value, ok +} + +type harness struct { + consumer *stopjobsconsumer.Consumer + stops *fakeStopService + results *fakeJobResults + offsets *fakeOffsetStore + stream string + server *miniredis.Miniredis + client *redis.Client +} + +func newHarness(t *testing.T) *harness { + t.Helper() + server := miniredis.RunT(t) + client := redis.NewClient(&redis.Options{Addr: server.Addr()}) + t.Cleanup(func() { _ = client.Close() }) + + stops := &fakeStopService{} + results := &fakeJobResults{} + offsets := newFakeOffsetStore() + stream := "runtime:stop_jobs" + + consumer, err := stopjobsconsumer.NewConsumer(stopjobsconsumer.Config{ + Client: client, + Stream: stream, + BlockTimeout: 50 * time.Millisecond, + StopService: stops, + JobResults: results, + OffsetStore: offsets, + Logger: silentLogger(), + }) + require.NoError(t, err) + + return &harness{ + consumer: consumer, + stops: stops, + results: results, + offsets: offsets, + stream: stream, + server: server, + client: client, + } +} + +func stopMessage(id, gameID, reason string, requestedAtMS int64) redis.XMessage { + return redis.XMessage{ + ID: id, + Values: map[string]any{ + "game_id": gameID, + "reason": reason, + "requested_at_ms": strconv.FormatInt(requestedAtMS, 10), + }, + } +} + +func TestNewConsumerRejectsMissingDeps(t *testing.T) { + server := miniredis.RunT(t) + client := redis.NewClient(&redis.Options{Addr: server.Addr()}) + t.Cleanup(func() { _ = client.Close() }) + + cases := []stopjobsconsumer.Config{ + {}, + {Client: client}, + {Client: client, Stream: "runtime:stop_jobs"}, + {Client: client, Stream: "runtime:stop_jobs", BlockTimeout: time.Second}, + {Client: client, Stream: "runtime:stop_jobs", BlockTimeout: time.Second, StopService: &fakeStopService{}}, + {Client: client, Stream: "runtime:stop_jobs", BlockTimeout: time.Second, StopService: &fakeStopService{}, JobResults: &fakeJobResults{}}, + } + for index, cfg := range cases 
{ + _, err := stopjobsconsumer.NewConsumer(cfg) + require.Errorf(t, err, "case %d should fail", index) + } +} + +func TestHandleMessageSuccessPublishesSuccessResult(t *testing.T) { + h := newHarness(t) + h.stops.result = stopruntime.Result{ + Record: runtime.RuntimeRecord{ + GameID: "game-1", + Status: runtime.StatusStopped, + CurrentContainerID: "c-1", + CurrentImageRef: "galaxy/game:1.0.0", + EngineEndpoint: "http://galaxy-game-game-1:8080", + }, + Outcome: operation.OutcomeSuccess, + } + + h.consumer.HandleMessage(context.Background(), stopMessage("100-0", "game-1", "cancelled", 1700)) + + inputs := h.stops.Inputs() + require.Len(t, inputs, 1) + assert.Equal(t, "game-1", inputs[0].GameID) + assert.Equal(t, stopruntime.StopReasonCancelled, inputs[0].Reason) + assert.Equal(t, operation.OpSourceLobbyStream, inputs[0].OpSource) + assert.Equal(t, "100-0", inputs[0].SourceRef) + + published := h.results.Published() + require.Len(t, published, 1) + assert.Equal(t, ports.JobResult{ + GameID: "game-1", + Outcome: ports.JobOutcomeSuccess, + ContainerID: "c-1", + EngineEndpoint: "http://galaxy-game-game-1:8080", + }, published[0]) +} + +func TestHandleMessageFailureNotFoundPublishesFailureResult(t *testing.T) { + h := newHarness(t) + h.stops.result = stopruntime.Result{ + Outcome: operation.OutcomeFailure, + ErrorCode: startruntime.ErrorCodeNotFound, + ErrorMessage: "runtime record for game \"game-2\" does not exist", + } + + h.consumer.HandleMessage(context.Background(), stopMessage("101-0", "game-2", "admin_request", 1700)) + + published := h.results.Published() + require.Len(t, published, 1) + assert.Equal(t, ports.JobResult{ + GameID: "game-2", + Outcome: ports.JobOutcomeFailure, + ErrorCode: "not_found", + ErrorMessage: "runtime record for game \"game-2\" does not exist", + }, published[0]) +} + +func TestHandleMessageReplayNoOpForRemovedRecordHasEmptyContainerAndEndpoint(t *testing.T) { + h := newHarness(t) + h.stops.result = stopruntime.Result{ + Record: runtime.RuntimeRecord{ + GameID: "game-3", + Status: runtime.StatusRemoved, + CurrentContainerID: "", + EngineEndpoint: "http://galaxy-game-game-3:8080", + }, + Outcome: operation.OutcomeSuccess, + ErrorCode: startruntime.ErrorCodeReplayNoOp, + } + + h.consumer.HandleMessage(context.Background(), stopMessage("102-0", "game-3", "finished", 1700)) + + published := h.results.Published() + require.Len(t, published, 1) + assert.Equal(t, ports.JobResult{ + GameID: "game-3", + Outcome: ports.JobOutcomeSuccess, + ContainerID: "", + EngineEndpoint: "http://galaxy-game-game-3:8080", + ErrorCode: "replay_no_op", + }, published[0]) +} + +func TestHandleMessageMalformedEnvelopesAreAbsorbed(t *testing.T) { + h := newHarness(t) + + cases := []redis.XMessage{ + {ID: "200-0", Values: map[string]any{"reason": "cancelled", "requested_at_ms": "1"}}, + {ID: "200-1", Values: map[string]any{"game_id": "game-x", "requested_at_ms": "1"}}, + {ID: "200-2", Values: map[string]any{"game_id": "game-x", "reason": " ", "requested_at_ms": "1"}}, + {ID: "200-3", Values: map[string]any{"game_id": "game-x", "reason": "not_a_known_reason", "requested_at_ms": "1"}}, + {ID: "200-4", Values: map[string]any{"game_id": "game-x", "reason": "cancelled", "requested_at_ms": "abc"}}, + } + for _, msg := range cases { + h.consumer.HandleMessage(context.Background(), msg) + } + + assert.Empty(t, h.stops.Inputs(), "malformed envelopes must not reach the stop service") + assert.Empty(t, h.results.Published(), "malformed envelopes must not produce job results") +} + +func 
TestHandleMessagePublishFailureIsAbsorbed(t *testing.T) { + h := newHarness(t) + h.stops.result = stopruntime.Result{Outcome: operation.OutcomeFailure, ErrorCode: "internal_error"} + h.results.publishErr = errors.New("redis transient") + + h.consumer.HandleMessage(context.Background(), stopMessage("300-0", "game-x", "cancelled", 1700)) + + require.Len(t, h.stops.Inputs(), 1, "service still runs even when publish fails") +} + +func TestHandleMessageGoLevelErrorIsAbsorbed(t *testing.T) { + h := newHarness(t) + h.stops.err = errors.New("nil ctx") + + h.consumer.HandleMessage(context.Background(), stopMessage("400-0", "game-y", "cancelled", 1700)) + + assert.Empty(t, h.results.Published(), "go-level service errors must not surface as job results") +} + +func TestRunAdvancesOffsetPerMessage(t *testing.T) { + h := newHarness(t) + h.stops.result = stopruntime.Result{ + Record: runtime.RuntimeRecord{ + GameID: "game-5", + Status: runtime.StatusStopped, + CurrentContainerID: "c-5", + EngineEndpoint: "http://galaxy-game-game-5:8080", + }, + Outcome: operation.OutcomeSuccess, + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + done := make(chan error, 1) + go func() { done <- h.consumer.Run(ctx) }() + + mustXAdd(t, h.client, h.stream, "game-5", "cancelled", 1) + mustXAdd(t, h.client, h.stream, "game-5", "finished", 2) + + require.Eventually(t, func() bool { + return len(h.results.Published()) == 2 + }, time.Second, 10*time.Millisecond, "consumer must produce one job result per envelope") + + cancel() + require.Eventually(t, func() bool { + select { + case <-done: + return true + default: + return false + } + }, time.Second, 10*time.Millisecond, "Run must exit after context cancel") + + id, ok := h.offsets.Get("stopjobs") + require.True(t, ok, "offset must be persisted after the run loop processed messages") + assert.NotEmpty(t, id, "offset entry id must not be empty") +} + +func TestRunExitsImmediatelyOnAlreadyCancelledContext(t *testing.T) { + h := newHarness(t) + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + err := h.consumer.Run(ctx) + require.ErrorIs(t, err, context.Canceled) + assert.Empty(t, h.stops.Inputs()) + assert.Empty(t, h.results.Published()) +} + +func mustXAdd(t *testing.T, client *redis.Client, stream, gameID, reason string, requestedAtMS int64) string { + t.Helper() + id, err := client.XAdd(context.Background(), &redis.XAddArgs{ + Stream: stream, + Values: map[string]any{ + "game_id": gameID, + "reason": reason, + "requested_at_ms": strconv.FormatInt(requestedAtMS, 10), + }, + }).Result() + require.NoError(t, err) + return id +}