feat: runtime manager
This commit is contained in:
@@ -0,0 +1,534 @@
|
||||
openapi: 3.0.3
|
||||
info:
|
||||
title: Galaxy Runtime Manager Internal REST API
|
||||
version: v1
|
||||
description: |
|
||||
This specification documents the internal trusted REST contract of
|
||||
`galaxy/rtmanager` served on `RTMANAGER_INTERNAL_HTTP_ADDR`
|
||||
(default `:8096`).
|
||||
|
||||
The listener is not reachable from the public internet. Two caller
|
||||
classes use it: `Game Master` (inspect / restart / patch / stop /
|
||||
cleanup) and `Admin Service` (operational tooling, including
|
||||
force-cleanup). Runtime Manager treats every caller on this port as
|
||||
trusted and performs no user-level authorization; downstream services
|
||||
rely on network segmentation. There is no `X-User-ID` header
|
||||
contract.
|
||||
|
||||
Transport rules:
|
||||
- request bodies are strict JSON only; unknown fields are rejected
|
||||
with `invalid_request`;
|
||||
- error responses use `{ "error": { "code", "message" } }`, identical
|
||||
to the Lobby contract;
|
||||
- stable error codes are: `invalid_request`, `not_found`, `conflict`,
|
||||
`service_unavailable`, `internal_error`, `image_pull_failed`,
|
||||
`image_ref_not_semver`, `semver_patch_only`,
|
||||
`container_start_failed`, `start_config_invalid`,
|
||||
`docker_unavailable`, `replay_no_op`.
|
||||
|
||||
Caller identification:
|
||||
- the optional `X-Galaxy-Caller` request header carries the calling
|
||||
service identity (`gm` for `Game Master`, `admin` for `Admin
|
||||
Service`). Runtime Manager maps the value to `op_source` in
|
||||
the `operation_log` (`gm_rest` or `admin_rest`). When the header
|
||||
is missing or carries an unknown value, Runtime Manager defaults
|
||||
to `op_source = admin_rest`.
|
||||
servers:
|
||||
- url: http://localhost:8096
|
||||
description: Default local internal listener for Runtime Manager.
|
||||
tags:
|
||||
- name: Runtimes
|
||||
description: Runtime lifecycle endpoints called by Game Master and Admin Service.
|
||||
- name: Probes
|
||||
description: Health and readiness probes.
|
||||
paths:
|
||||
/healthz:
|
||||
get:
|
||||
tags:
|
||||
- Probes
|
||||
operationId: internalHealthz
|
||||
summary: Internal listener health probe
|
||||
responses:
|
||||
"200":
|
||||
description: Service is alive.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ProbeResponse"
|
||||
examples:
|
||||
ok:
|
||||
value:
|
||||
status: ok
|
||||
/readyz:
|
||||
get:
|
||||
tags:
|
||||
- Probes
|
||||
operationId: internalReadyz
|
||||
summary: Internal listener readiness probe
|
||||
description: |
|
||||
Returns `200` only when the PostgreSQL primary, Redis master, and
|
||||
Docker daemon are reachable and the configured Docker network
|
||||
exists. Returns `503` with the standard error envelope otherwise.
|
||||
responses:
|
||||
"200":
|
||||
description: Service is ready to serve traffic.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ProbeResponse"
|
||||
examples:
|
||||
ready:
|
||||
value:
|
||||
status: ready
|
||||
"503":
|
||||
$ref: "#/components/responses/ServiceUnavailableError"
|
||||
/api/v1/internal/runtimes:
|
||||
get:
|
||||
tags:
|
||||
- Runtimes
|
||||
operationId: internalListRuntimes
|
||||
summary: List all known runtime records
|
||||
description: |
|
||||
Returns the full list of runtime records known to Runtime Manager.
|
||||
Pagination is not supported in v1 — the working set is bounded by
|
||||
the number of games tracked by Lobby and is small enough to return
|
||||
in one response.
|
||||
parameters:
|
||||
- $ref: "#/components/parameters/XGalaxyCallerHeader"
|
||||
responses:
|
||||
"200":
|
||||
description: All runtime records.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/RuntimesList"
|
||||
"500":
|
||||
$ref: "#/components/responses/InternalError"
|
||||
"503":
|
||||
$ref: "#/components/responses/ServiceUnavailableError"
|
||||
/api/v1/internal/runtimes/{game_id}:
|
||||
get:
|
||||
tags:
|
||||
- Runtimes
|
||||
operationId: internalGetRuntime
|
||||
summary: Get one runtime record by game id
|
||||
parameters:
|
||||
- $ref: "#/components/parameters/GameIDPath"
|
||||
- $ref: "#/components/parameters/XGalaxyCallerHeader"
|
||||
responses:
|
||||
"200":
|
||||
description: Runtime record for the game.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/RuntimeRecord"
|
||||
"404":
|
||||
$ref: "#/components/responses/NotFoundError"
|
||||
"500":
|
||||
$ref: "#/components/responses/InternalError"
|
||||
"503":
|
||||
$ref: "#/components/responses/ServiceUnavailableError"
|
||||
/api/v1/internal/runtimes/{game_id}/start:
|
||||
post:
|
||||
tags:
|
||||
- Runtimes
|
||||
operationId: internalStartRuntime
|
||||
summary: Start a game engine container
|
||||
description: |
|
||||
Pulls the supplied `image_ref` per the configured pull policy and
|
||||
creates the engine container. Idempotent: a repeated start with the same
|
||||
`image_ref` for an already-running record returns `200` with the
|
||||
current record and `error_code=replay_no_op` recorded in the
|
||||
operation log.
|
||||
parameters:
|
||||
- $ref: "#/components/parameters/GameIDPath"
|
||||
- $ref: "#/components/parameters/XGalaxyCallerHeader"
|
||||
requestBody:
|
||||
required: true
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/StartRequest"
|
||||
responses:
|
||||
"200":
|
||||
description: Runtime record after the start operation.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/RuntimeRecord"
|
||||
"400":
|
||||
$ref: "#/components/responses/InvalidRequestError"
|
||||
"409":
|
||||
$ref: "#/components/responses/ConflictError"
|
||||
"500":
|
||||
$ref: "#/components/responses/InternalError"
|
||||
"503":
|
||||
$ref: "#/components/responses/ServiceUnavailableError"
|
||||
/api/v1/internal/runtimes/{game_id}/stop:
|
||||
post:
|
||||
tags:
|
||||
- Runtimes
|
||||
operationId: internalStopRuntime
|
||||
summary: Stop a running game engine container
|
||||
description: |
|
||||
Issues `docker stop` with the configured timeout. Idempotent: stop
|
||||
on a record that is already `stopped` or `removed` returns
|
||||
success with `error_code=replay_no_op` recorded in the operation
|
||||
log.
|
||||
parameters:
|
||||
- $ref: "#/components/parameters/GameIDPath"
|
||||
- $ref: "#/components/parameters/XGalaxyCallerHeader"
|
||||
requestBody:
|
||||
required: true
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/StopRequest"
|
||||
responses:
|
||||
"200":
|
||||
description: Runtime record after the stop operation.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/RuntimeRecord"
|
||||
"400":
|
||||
$ref: "#/components/responses/InvalidRequestError"
|
||||
"404":
|
||||
$ref: "#/components/responses/NotFoundError"
|
||||
"409":
|
||||
$ref: "#/components/responses/ConflictError"
|
||||
"500":
|
||||
$ref: "#/components/responses/InternalError"
|
||||
"503":
|
||||
$ref: "#/components/responses/ServiceUnavailableError"
|
||||
/api/v1/internal/runtimes/{game_id}/restart:
|
||||
post:
|
||||
tags:
|
||||
- Runtimes
|
||||
operationId: internalRestartRuntime
|
||||
summary: Recreate a game engine container with the same image
|
||||
description: |
|
||||
Stops, removes, and re-runs the container with the current
|
||||
`image_ref`. The container id changes; the engine endpoint stays
|
||||
stable.
|
||||
parameters:
|
||||
- $ref: "#/components/parameters/GameIDPath"
|
||||
- $ref: "#/components/parameters/XGalaxyCallerHeader"
|
||||
responses:
|
||||
"200":
|
||||
description: Runtime record after the restart operation.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/RuntimeRecord"
|
||||
"404":
|
||||
$ref: "#/components/responses/NotFoundError"
|
||||
"409":
|
||||
$ref: "#/components/responses/ConflictError"
|
||||
"500":
|
||||
$ref: "#/components/responses/InternalError"
|
||||
"503":
|
||||
$ref: "#/components/responses/ServiceUnavailableError"
|
||||
/api/v1/internal/runtimes/{game_id}/patch:
|
||||
post:
|
||||
tags:
|
||||
- Runtimes
|
||||
operationId: internalPatchRuntime
|
||||
summary: Recreate a game engine container with a new image
|
||||
description: |
|
||||
Restart with a new `image_ref`. Allowed only as a semver patch
|
||||
within the same major and minor line. Cross-major or cross-minor
|
||||
attempts return `409 conflict` with `error_code=semver_patch_only`.
|
||||
A non-semver `image_ref` returns `400 invalid_request` with
|
||||
`error_code=image_ref_not_semver`.
|
||||
parameters:
|
||||
- $ref: "#/components/parameters/GameIDPath"
|
||||
- $ref: "#/components/parameters/XGalaxyCallerHeader"
|
||||
requestBody:
|
||||
required: true
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/PatchRequest"
|
||||
responses:
|
||||
"200":
|
||||
description: Runtime record after the patch operation.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/RuntimeRecord"
|
||||
"400":
|
||||
$ref: "#/components/responses/InvalidRequestError"
|
||||
"404":
|
||||
$ref: "#/components/responses/NotFoundError"
|
||||
"409":
|
||||
$ref: "#/components/responses/ConflictError"
|
||||
"500":
|
||||
$ref: "#/components/responses/InternalError"
|
||||
"503":
|
||||
$ref: "#/components/responses/ServiceUnavailableError"
|
||||
/api/v1/internal/runtimes/{game_id}/container:
|
||||
delete:
|
||||
tags:
|
||||
- Runtimes
|
||||
operationId: internalCleanupRuntimeContainer
|
||||
summary: Remove an exited container
|
||||
description: |
|
||||
Calls `docker rm` for an already-stopped container and updates the
|
||||
runtime record to `removed`. Refuses with `409 conflict` if the
|
||||
record is still `running`. The host state directory is not
|
||||
deleted.
|
||||
parameters:
|
||||
- $ref: "#/components/parameters/GameIDPath"
|
||||
- $ref: "#/components/parameters/XGalaxyCallerHeader"
|
||||
responses:
|
||||
"200":
|
||||
description: Runtime record after the cleanup operation.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/RuntimeRecord"
|
||||
"404":
|
||||
$ref: "#/components/responses/NotFoundError"
|
||||
"409":
|
||||
$ref: "#/components/responses/ConflictError"
|
||||
"500":
|
||||
$ref: "#/components/responses/InternalError"
|
||||
"503":
|
||||
$ref: "#/components/responses/ServiceUnavailableError"
|
||||
components:
|
||||
parameters:
|
||||
GameIDPath:
|
||||
name: game_id
|
||||
in: path
|
||||
required: true
|
||||
description: Opaque stable game identifier owned by Lobby.
|
||||
schema:
|
||||
type: string
|
||||
XGalaxyCallerHeader:
|
||||
name: X-Galaxy-Caller
|
||||
in: header
|
||||
required: false
|
||||
description: |
|
||||
Identifies the calling service so Runtime Manager can record the
|
||||
right `op_source` in `operation_log` (`gm_rest` for `gm`,
|
||||
`admin_rest` for `admin`). Missing or unknown values default to
|
||||
`admin_rest`.
|
||||
schema:
|
||||
type: string
|
||||
enum:
|
||||
- gm
|
||||
- admin
|
||||
schemas:
|
||||
RuntimeRecord:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
required:
|
||||
- game_id
|
||||
- status
|
||||
- state_path
|
||||
- docker_network
|
||||
- last_op_at
|
||||
- created_at
|
||||
properties:
|
||||
game_id:
|
||||
type: string
|
||||
description: Opaque stable game identifier owned by Lobby.
|
||||
status:
|
||||
type: string
|
||||
enum:
|
||||
- running
|
||||
- stopped
|
||||
- removed
|
||||
description: Current runtime status maintained by Runtime Manager.
|
||||
current_container_id:
|
||||
type: string
|
||||
nullable: true
|
||||
description: Docker container id; null when status is removed.
|
||||
current_image_ref:
|
||||
type: string
|
||||
nullable: true
|
||||
description: Image reference of the current container; null when status is removed.
|
||||
engine_endpoint:
|
||||
type: string
|
||||
nullable: true
|
||||
description: Stable engine URL `http://galaxy-game-{game_id}:8080`; null when status is removed.
|
||||
state_path:
|
||||
type: string
|
||||
description: Absolute host path of the per-game bind-mounted state directory.
|
||||
docker_network:
|
||||
type: string
|
||||
description: Docker network name observed when the container was created.
|
||||
started_at:
|
||||
type: string
|
||||
format: date-time
|
||||
nullable: true
|
||||
description: UTC timestamp of the most recent successful start.
|
||||
stopped_at:
|
||||
type: string
|
||||
format: date-time
|
||||
nullable: true
|
||||
description: UTC timestamp of the most recent stop.
|
||||
removed_at:
|
||||
type: string
|
||||
format: date-time
|
||||
nullable: true
|
||||
description: UTC timestamp of the most recent container removal.
|
||||
last_op_at:
|
||||
type: string
|
||||
format: date-time
|
||||
description: UTC timestamp of the most recent operation; drives retention TTL.
|
||||
created_at:
|
||||
type: string
|
||||
format: date-time
|
||||
description: UTC timestamp of the first observation of this game.
|
||||
RuntimesList:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
required:
|
||||
- items
|
||||
properties:
|
||||
items:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/RuntimeRecord"
|
||||
StartRequest:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
required:
|
||||
- image_ref
|
||||
properties:
|
||||
image_ref:
|
||||
type: string
|
||||
description: Docker reference resolved by the producer (Game Master or Admin Service).
|
||||
StopRequest:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
required:
|
||||
- reason
|
||||
properties:
|
||||
reason:
|
||||
$ref: "#/components/schemas/StopReason"
|
||||
PatchRequest:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
required:
|
||||
- image_ref
|
||||
properties:
|
||||
image_ref:
|
||||
type: string
|
||||
description: New Docker reference within the same semver major and minor line.
|
||||
StopReason:
|
||||
type: string
|
||||
enum:
|
||||
- orphan_cleanup
|
||||
- cancelled
|
||||
- finished
|
||||
- admin_request
|
||||
- timeout
|
||||
description: Reason carried in the stop envelope and recorded in the operation log.
|
||||
ErrorCode:
|
||||
type: string
|
||||
enum:
|
||||
- invalid_request
|
||||
- not_found
|
||||
- conflict
|
||||
- service_unavailable
|
||||
- internal_error
|
||||
- image_pull_failed
|
||||
- image_ref_not_semver
|
||||
- semver_patch_only
|
||||
- container_start_failed
|
||||
- start_config_invalid
|
||||
- docker_unavailable
|
||||
- replay_no_op
|
||||
description: Stable internal API error code.
|
||||
ProbeResponse:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
required:
|
||||
- status
|
||||
properties:
|
||||
status:
|
||||
type: string
|
||||
ErrorResponse:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
required:
|
||||
- error
|
||||
properties:
|
||||
error:
|
||||
$ref: "#/components/schemas/ErrorBody"
|
||||
ErrorBody:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
required:
|
||||
- code
|
||||
- message
|
||||
properties:
|
||||
code:
|
||||
$ref: "#/components/schemas/ErrorCode"
|
||||
message:
|
||||
type: string
|
||||
description: Human-readable trusted error message.
|
||||
responses:
|
||||
InvalidRequestError:
|
||||
description: Request validation failed.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ErrorResponse"
|
||||
examples:
|
||||
invalidRequest:
|
||||
value:
|
||||
error:
|
||||
code: invalid_request
|
||||
message: request is invalid
|
||||
NotFoundError:
|
||||
description: The requested runtime record does not exist.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ErrorResponse"
|
||||
examples:
|
||||
notFound:
|
||||
value:
|
||||
error:
|
||||
code: not_found
|
||||
message: runtime record not found
|
||||
ConflictError:
|
||||
description: The requested operation is not allowed in the current runtime state.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ErrorResponse"
|
||||
examples:
|
||||
conflict:
|
||||
value:
|
||||
error:
|
||||
code: conflict
|
||||
message: operation not allowed in current status
|
||||
InternalError:
|
||||
description: Unexpected internal service error.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ErrorResponse"
|
||||
examples:
|
||||
internal:
|
||||
value:
|
||||
error:
|
||||
code: internal_error
|
||||
message: internal server error
|
||||
ServiceUnavailableError:
|
||||
description: An upstream dependency is unavailable.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ErrorResponse"
|
||||
examples:
|
||||
unavailable:
|
||||
value:
|
||||
error:
|
||||
code: service_unavailable
|
||||
message: service is unavailable
|
||||
@@ -0,0 +1,195 @@
|
||||
asyncapi: 3.1.0
|
||||
info:
|
||||
title: Galaxy Runtime Health Events Contract
|
||||
version: 1.0.0
|
||||
description: |
|
||||
Stable Redis Streams contract for technical container health events
|
||||
published by `Runtime Manager`. Consumers include `Game Master`;
|
||||
`Game Lobby` and `Admin Service` are reserved as future consumers.
|
||||
|
||||
Three independent sources feed this stream: the Docker events
|
||||
listener, the periodic Docker inspect worker, and the active HTTP
|
||||
`/healthz` probe. Every emission also upserts the latest snapshot
|
||||
into `health_snapshots` in PostgreSQL.
|
||||
|
||||
Polymorphism: the `details` field carries an `event_type`-specific
|
||||
payload selected via `oneOf` per type. Each variant is a closed object
|
||||
(no unknown fields).
|
||||
|
||||
The `event_type` enum is fixed in this contract; adding a new value
|
||||
requires a contract bump and a coordinated consumer change.
|
||||
channels:
|
||||
healthEvents:
|
||||
address: runtime:health_events
|
||||
messages:
|
||||
runtimeHealthEvent:
|
||||
$ref: '#/components/messages/RuntimeHealthEvent'
|
||||
operations:
|
||||
publishHealthEvent:
|
||||
action: send
|
||||
summary: Publish one technical health event for downstream consumers.
|
||||
channel:
|
||||
$ref: '#/channels/healthEvents'
|
||||
messages:
|
||||
- $ref: '#/channels/healthEvents/messages/runtimeHealthEvent'
|
||||
components:
|
||||
messages:
|
||||
RuntimeHealthEvent:
|
||||
name: RuntimeHealthEvent
|
||||
title: Runtime health event
|
||||
summary: One technical health observation about a game engine container.
|
||||
payload:
|
||||
$ref: '#/components/schemas/RuntimeHealthEventPayload'
|
||||
examples:
|
||||
- name: containerStarted
|
||||
summary: Engine container has been created and started.
|
||||
payload:
|
||||
game_id: game-123
|
||||
container_id: 7c2b5d1a4f6e
|
||||
event_type: container_started
|
||||
occurred_at_ms: 1775121700000
|
||||
details:
|
||||
image_ref: registry.example.com/galaxy/game:1.4.7
|
||||
- name: containerExited
|
||||
summary: Engine container terminated with a non-zero exit code.
|
||||
payload:
|
||||
game_id: game-123
|
||||
container_id: 7c2b5d1a4f6e
|
||||
event_type: container_exited
|
||||
occurred_at_ms: 1775121800000
|
||||
details:
|
||||
exit_code: 137
|
||||
oom: false
|
||||
- name: probeFailed
|
||||
summary: Active probe observed three consecutive failures.
|
||||
payload:
|
||||
game_id: game-123
|
||||
container_id: 7c2b5d1a4f6e
|
||||
event_type: probe_failed
|
||||
occurred_at_ms: 1775121810000
|
||||
details:
|
||||
consecutive_failures: 3
|
||||
last_status: 0
|
||||
last_error: "context deadline exceeded"
|
||||
schemas:
|
||||
RuntimeHealthEventPayload:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
required:
|
||||
- game_id
|
||||
- container_id
|
||||
- event_type
|
||||
- occurred_at_ms
|
||||
- details
|
||||
properties:
|
||||
game_id:
|
||||
type: string
|
||||
description: Opaque stable game identifier owned by Lobby.
|
||||
container_id:
|
||||
type: string
|
||||
description: Docker container id observed by Runtime Manager. May differ from the current container id after a restart race.
|
||||
event_type:
|
||||
$ref: '#/components/schemas/EventType'
|
||||
occurred_at_ms:
|
||||
type: integer
|
||||
format: int64
|
||||
description: UTC milliseconds when Runtime Manager observed the event.
|
||||
details:
|
||||
oneOf:
|
||||
- $ref: '#/components/schemas/ContainerStartedDetails'
|
||||
- $ref: '#/components/schemas/ContainerExitedDetails'
|
||||
- $ref: '#/components/schemas/ContainerOomDetails'
|
||||
- $ref: '#/components/schemas/ContainerDisappearedDetails'
|
||||
- $ref: '#/components/schemas/InspectUnhealthyDetails'
|
||||
- $ref: '#/components/schemas/ProbeFailedDetails'
|
||||
- $ref: '#/components/schemas/ProbeRecoveredDetails'
|
||||
description: Polymorphic payload selected by event_type.
|
||||
EventType:
|
||||
type: string
|
||||
enum:
|
||||
- container_started
|
||||
- container_exited
|
||||
- container_oom
|
||||
- container_disappeared
|
||||
- inspect_unhealthy
|
||||
- probe_failed
|
||||
- probe_recovered
|
||||
description: Discriminator selecting the details variant.
|
||||
ContainerStartedDetails:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
required:
|
||||
- image_ref
|
||||
properties:
|
||||
image_ref:
|
||||
type: string
|
||||
description: Image reference of the started container.
|
||||
ContainerExitedDetails:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
required:
|
||||
- exit_code
|
||||
- oom
|
||||
properties:
|
||||
exit_code:
|
||||
type: integer
|
||||
description: Exit code reported by Docker.
|
||||
oom:
|
||||
type: boolean
|
||||
description: True when the container was killed by the OOM killer.
|
||||
ContainerOomDetails:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
required:
|
||||
- exit_code
|
||||
properties:
|
||||
exit_code:
|
||||
type: integer
|
||||
description: Exit code reported by Docker for the OOM event.
|
||||
ContainerDisappearedDetails:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
description: Empty payload; emitted when a destroy event that Runtime Manager did not initiate is observed for a record.
|
||||
InspectUnhealthyDetails:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
required:
|
||||
- restart_count
|
||||
- state
|
||||
- health
|
||||
properties:
|
||||
restart_count:
|
||||
type: integer
|
||||
description: Docker RestartCount observed at this inspection.
|
||||
state:
|
||||
type: string
|
||||
description: Docker State.Status observed at this inspection.
|
||||
health:
|
||||
type: string
|
||||
description: Docker State.Health.Status observed at this inspection; empty when the image declares no HEALTHCHECK.
|
||||
ProbeFailedDetails:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
required:
|
||||
- consecutive_failures
|
||||
- last_status
|
||||
- last_error
|
||||
properties:
|
||||
consecutive_failures:
|
||||
type: integer
|
||||
description: Number of consecutive probe failures that crossed the threshold.
|
||||
last_status:
|
||||
type: integer
|
||||
description: HTTP status of the last probe attempt; 0 when the probe failed before receiving a response.
|
||||
last_error:
|
||||
type: string
|
||||
description: Operator-readable error of the last probe attempt; empty when not applicable.
|
||||
ProbeRecoveredDetails:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
required:
|
||||
- prior_failure_count
|
||||
properties:
|
||||
prior_failure_count:
|
||||
type: integer
|
||||
description: Number of consecutive failures observed immediately before the recovery.
|
||||
@@ -0,0 +1,226 @@
|
||||
asyncapi: 3.1.0
|
||||
info:
|
||||
title: Galaxy Runtime Jobs Stream Contract
|
||||
version: 1.0.0
|
||||
description: |
|
||||
Stable Redis Streams contract carrying runtime jobs between
|
||||
`Game Lobby` and `Runtime Manager`.
|
||||
|
||||
`Game Lobby` is the sole producer for `runtime:start_jobs` and
|
||||
`runtime:stop_jobs`. `Runtime Manager` consumes both, executes the
|
||||
Docker work, and publishes one outcome per job to `runtime:job_results`,
|
||||
which is consumed by `Game Lobby`'s runtime-job-result worker.
|
||||
|
||||
Replay safety:
|
||||
- duplicate start jobs for an already-running game with the same
|
||||
`image_ref` produce a `success` job result with
|
||||
`error_code=replay_no_op`;
|
||||
- duplicate stop jobs for an already-stopped or already-removed game
|
||||
produce a `success` job result with `error_code=replay_no_op`.
|
||||
|
||||
The `reason` enum on `runtime:stop_jobs` is fixed in this contract.
|
||||
Adding a new value requires a contract bump and a coordinated
|
||||
Lobby/Runtime Manager change.
|
||||
channels:
|
||||
startJobs:
|
||||
address: runtime:start_jobs
|
||||
messages:
|
||||
runtimeStartJob:
|
||||
$ref: '#/components/messages/RuntimeStartJob'
|
||||
stopJobs:
|
||||
address: runtime:stop_jobs
|
||||
messages:
|
||||
runtimeStopJob:
|
||||
$ref: '#/components/messages/RuntimeStopJob'
|
||||
jobResults:
|
||||
address: runtime:job_results
|
||||
messages:
|
||||
runtimeJobResult:
|
||||
$ref: '#/components/messages/RuntimeJobResult'
|
||||
operations:
|
||||
consumeStartJob:
|
||||
action: receive
|
||||
summary: Receive one start job from Game Lobby and run a container.
|
||||
channel:
|
||||
$ref: '#/channels/startJobs'
|
||||
messages:
|
||||
- $ref: '#/channels/startJobs/messages/runtimeStartJob'
|
||||
consumeStopJob:
|
||||
action: receive
|
||||
summary: Receive one stop job from Game Lobby and stop a container.
|
||||
channel:
|
||||
$ref: '#/channels/stopJobs'
|
||||
messages:
|
||||
- $ref: '#/channels/stopJobs/messages/runtimeStopJob'
|
||||
publishJobResult:
|
||||
action: send
|
||||
summary: Publish one runtime job outcome for Game Lobby.
|
||||
channel:
|
||||
$ref: '#/channels/jobResults'
|
||||
messages:
|
||||
- $ref: '#/channels/jobResults/messages/runtimeJobResult'
|
||||
components:
|
||||
messages:
|
||||
RuntimeStartJob:
|
||||
name: RuntimeStartJob
|
||||
title: Runtime start job
|
||||
summary: Lobby request to start one game engine container.
|
||||
payload:
|
||||
$ref: '#/components/schemas/RuntimeStartJobPayload'
|
||||
examples:
|
||||
- name: startJob
|
||||
summary: Start a game engine container with a producer-resolved image_ref.
|
||||
payload:
|
||||
game_id: game-123
|
||||
image_ref: registry.example.com/galaxy/game:1.4.7
|
||||
requested_at_ms: 1775121700000
|
||||
RuntimeStopJob:
|
||||
name: RuntimeStopJob
|
||||
title: Runtime stop job
|
||||
summary: Lobby request to stop one game engine container.
|
||||
payload:
|
||||
$ref: '#/components/schemas/RuntimeStopJobPayload'
|
||||
examples:
|
||||
- name: cancelled
|
||||
summary: Stop the engine because the game was cancelled.
|
||||
payload:
|
||||
game_id: game-123
|
||||
reason: cancelled
|
||||
requested_at_ms: 1775121800000
|
||||
- name: orphanCleanup
|
||||
summary: Stop an engine whose Lobby metadata persistence failed.
|
||||
payload:
|
||||
game_id: game-456
|
||||
reason: orphan_cleanup
|
||||
requested_at_ms: 1775121810000
|
||||
RuntimeJobResult:
|
||||
name: RuntimeJobResult
|
||||
title: Runtime job result
|
||||
summary: Outcome of one start or stop job.
|
||||
payload:
|
||||
$ref: '#/components/schemas/RuntimeJobResultPayload'
|
||||
examples:
|
||||
- name: startSuccess
|
||||
summary: Successful start; container_id and engine_endpoint are populated.
|
||||
payload:
|
||||
game_id: game-123
|
||||
outcome: success
|
||||
container_id: 7c2b5d1a4f6e
|
||||
engine_endpoint: http://galaxy-game-game-123:8080
|
||||
error_code: ""
|
||||
error_message: ""
|
||||
- name: imagePullFailed
|
||||
summary: Failed start due to an image pull error.
|
||||
payload:
|
||||
game_id: game-789
|
||||
outcome: failure
|
||||
container_id: ""
|
||||
engine_endpoint: ""
|
||||
error_code: image_pull_failed
|
||||
error_message: "manifest unknown"
|
||||
- name: replayNoOp
|
||||
summary: Idempotent replay; the job was a no-op.
|
||||
payload:
|
||||
game_id: game-123
|
||||
outcome: success
|
||||
container_id: 7c2b5d1a4f6e
|
||||
engine_endpoint: http://galaxy-game-game-123:8080
|
||||
error_code: replay_no_op
|
||||
error_message: ""
|
||||
schemas:
|
||||
RuntimeStartJobPayload:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
required:
|
||||
- game_id
|
||||
- image_ref
|
||||
- requested_at_ms
|
||||
properties:
|
||||
game_id:
|
||||
type: string
|
||||
description: Opaque stable game identifier owned by Lobby.
|
||||
image_ref:
|
||||
type: string
|
||||
description: Docker reference resolved by Lobby from LOBBY_ENGINE_IMAGE_TEMPLATE.
|
||||
requested_at_ms:
|
||||
type: integer
|
||||
format: int64
|
||||
description: UTC milliseconds; used for diagnostics, not authoritative.
|
||||
RuntimeStopJobPayload:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
required:
|
||||
- game_id
|
||||
- reason
|
||||
- requested_at_ms
|
||||
properties:
|
||||
game_id:
|
||||
type: string
|
||||
description: Opaque stable game identifier owned by Lobby.
|
||||
reason:
|
||||
$ref: '#/components/schemas/StopReason'
|
||||
requested_at_ms:
|
||||
type: integer
|
||||
format: int64
|
||||
description: UTC milliseconds; used for diagnostics, not authoritative.
|
||||
RuntimeJobResultPayload:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
required:
|
||||
- game_id
|
||||
- outcome
|
||||
- container_id
|
||||
- engine_endpoint
|
||||
- error_code
|
||||
- error_message
|
||||
properties:
|
||||
game_id:
|
||||
type: string
|
||||
description: Opaque stable game identifier matching the originating job.
|
||||
outcome:
|
||||
type: string
|
||||
enum:
|
||||
- success
|
||||
- failure
|
||||
description: High-level outcome of the runtime job.
|
||||
container_id:
|
||||
type: string
|
||||
description: Docker container id of the engine; populated on success, empty on failure.
|
||||
engine_endpoint:
|
||||
type: string
|
||||
description: Stable engine URL `http://galaxy-game-{game_id}:8080`; populated on success, empty on failure.
|
||||
error_code:
|
||||
$ref: '#/components/schemas/ErrorCode'
|
||||
error_message:
|
||||
type: string
|
||||
description: Operator-readable detail; empty when not applicable.
|
||||
StopReason:
|
||||
type: string
|
||||
enum:
|
||||
- orphan_cleanup
|
||||
- cancelled
|
||||
- finished
|
||||
- admin_request
|
||||
- timeout
|
||||
description: Reason value carried by every runtime:stop_jobs envelope.
|
||||
ErrorCode:
|
||||
type: string
|
||||
enum:
|
||||
- ""
|
||||
- invalid_request
|
||||
- not_found
|
||||
- conflict
|
||||
- service_unavailable
|
||||
- internal_error
|
||||
- image_pull_failed
|
||||
- image_ref_not_semver
|
||||
- semver_patch_only
|
||||
- container_start_failed
|
||||
- start_config_invalid
|
||||
- docker_unavailable
|
||||
- replay_no_op
|
||||
description: |
|
||||
Stable error code; values match the internal REST contract. Additionally, the empty
|
||||
string is a valid value for successful job results that did not
|
||||
produce a code (the field is required to be present so consumers
|
||||
can rely on the schema).
|
||||
Reference in New Issue
Block a user