Files
galaxy-game/rtmanager/api/runtime-health-asyncapi.yaml
T
2026-04-28 20:39:18 +02:00

196 lines
6.7 KiB
YAML

# AsyncAPI specification version this document is written against.
# Quoted so it is always read as a string.
asyncapi: '3.1.0'

# Human-facing metadata for the contract.
info:
  title: Galaxy Runtime Health Events Contract
  # Version of this contract (not of the AsyncAPI spec); quoted so it is
  # always read as a string.
  version: '1.0.0'
  # Literal block scalar (`|`): the line breaks below are part of the value,
  # and the body must stay indented deeper than the `description` key.
  description: |
    Stable Redis Streams contract for technical container health events
    published by `Runtime Manager`. Consumers include `Game Master`;
    `Game Lobby` and `Admin Service` are reserved as future consumers.
    Three independent sources feed this stream: the Docker events
    listener, the periodic Docker inspect worker, and the active HTTP
    `/healthz` probe. Every emission also upserts the latest snapshot
    into `health_snapshots` in PostgreSQL.
    Polymorphism: the `details` field carries an `event_type`-specific
    payload selected via `oneOf` per type. Each variant is a closed object
    (no unknown fields).
    The `event_type` enum is fixed in this contract; adding a new value
    requires a contract bump and a coordinated consumer change.
# Channels exposed by this contract: a single health-event stream.
channels:
  healthEvents:
    # Quoted because the value itself contains a colon.
    address: 'runtime:health_events'
    # Messages that may appear on this channel, keyed by a channel-local id
    # that operations reference.
    messages:
      runtimeHealthEvent:
        $ref: '#/components/messages/RuntimeHealthEvent'
# Operations performed by the application this document describes.
operations:
  publishHealthEvent:
    # `send`: the described application is the producer on the channel.
    action: send
    summary: Publish one technical health event for downstream consumers.
    channel:
      $ref: '#/channels/healthEvents'
    # Must point at messages declared on the channel above, not directly
    # at `#/components/messages/...`.
    messages:
      - $ref: '#/channels/healthEvents/messages/runtimeHealthEvent'
# Reusable message and schema definitions referenced from the channel and
# operation sections via `$ref`.
components:
  messages:
    RuntimeHealthEvent:
      name: RuntimeHealthEvent
      title: Runtime health event
      summary: One technical health observation about a game engine container.
      payload:
        $ref: '#/components/schemas/RuntimeHealthEventPayload'
      # Examples cover three of the seven event types (container_started,
      # container_exited, probe_failed).
      examples:
        - name: containerStarted
          summary: Engine container has been created and started.
          payload:
            game_id: game-123
            # Quoted: hex-looking ids can be mis-typed as numbers when they
            # happen to contain only digits (or digits and a single `e`).
            container_id: '7c2b5d1a4f6e'
            event_type: container_started
            occurred_at_ms: 1775121700000
            details:
              image_ref: 'registry.example.com/galaxy/game:1.4.7'
        - name: containerExited
          summary: Engine container terminated with a non-zero exit code.
          payload:
            game_id: game-123
            container_id: '7c2b5d1a4f6e'
            event_type: container_exited
            occurred_at_ms: 1775121800000
            details:
              exit_code: 137
              oom: false
        - name: probeFailed
          summary: Active probe observed three consecutive failures.
          payload:
            game_id: game-123
            container_id: '7c2b5d1a4f6e'
            event_type: probe_failed
            occurred_at_ms: 1775121810000
            details:
              consecutive_failures: 3
              # 0 means no HTTP response was received (see ProbeFailedDetails).
              last_status: 0
              last_error: 'context deadline exceeded'

  schemas:
    # Envelope shared by every event. Closed object: all five fields are
    # required and no other fields are accepted.
    RuntimeHealthEventPayload:
      type: object
      additionalProperties: false
      required:
        - game_id
        - container_id
        - event_type
        - occurred_at_ms
        - details
      properties:
        game_id:
          type: string
          description: Opaque stable game identifier owned by Lobby.
        container_id:
          type: string
          description: >-
            Docker container id observed by Runtime Manager. May differ from the
            current container id after a restart race.
        event_type:
          $ref: '#/components/schemas/EventType'
        occurred_at_ms:
          type: integer
          format: int64
          description: UTC milliseconds when Runtime Manager observed the event.
        # The variants are structurally disjoint (closed objects with
        # different required fields), so a valid `details` object matches
        # exactly one branch of the `oneOf`.
        # NOTE(review): nothing in this schema ties a given `event_type` to
        # its matching variant; validation only checks that `details` fits
        # exactly one of them. Consumers should not assume the pairing is
        # schema-enforced.
        details:
          oneOf:
            - $ref: '#/components/schemas/ContainerStartedDetails'
            - $ref: '#/components/schemas/ContainerExitedDetails'
            - $ref: '#/components/schemas/ContainerOomDetails'
            - $ref: '#/components/schemas/ContainerDisappearedDetails'
            - $ref: '#/components/schemas/InspectUnhealthyDetails'
            - $ref: '#/components/schemas/ProbeFailedDetails'
            - $ref: '#/components/schemas/ProbeRecoveredDetails'
          description: Polymorphic payload selected by event_type.

    # One enum value per `details` variant listed in the payload `oneOf`.
    EventType:
      type: string
      enum:
        - container_started
        - container_exited
        - container_oom
        - container_disappeared
        - inspect_unhealthy
        - probe_failed
        - probe_recovered
      description: Discriminator selecting the details variant.

    ContainerStartedDetails:
      type: object
      additionalProperties: false
      required:
        - image_ref
      properties:
        image_ref:
          type: string
          description: Image reference of the started container.

    # Differs from ContainerOomDetails by the required `oom` flag; because
    # both objects are closed, a payload can match only one of the two.
    ContainerExitedDetails:
      type: object
      additionalProperties: false
      required:
        - exit_code
        - oom
      properties:
        exit_code:
          type: integer
          description: Exit code reported by Docker.
        oom:
          type: boolean
          description: True when the container was killed by the OOM killer.

    ContainerOomDetails:
      type: object
      additionalProperties: false
      required:
        - exit_code
      properties:
        exit_code:
          type: integer
          description: Exit code reported by Docker for the OOM event.

    # No properties and `additionalProperties: false`: only the empty
    # object `{}` validates.
    ContainerDisappearedDetails:
      type: object
      additionalProperties: false
      description: >-
        Empty payload; emitted when a destroy event is observed for a record
        Runtime Manager did not initiate.

    InspectUnhealthyDetails:
      type: object
      additionalProperties: false
      required:
        - restart_count
        - state
        - health
      properties:
        restart_count:
          type: integer
          description: Docker RestartCount observed at this inspection.
        state:
          type: string
          description: Docker State.Status observed at this inspection.
        # Required even without a HEALTHCHECK; the value is then the empty
        # string rather than the field being omitted.
        health:
          type: string
          description: >-
            Docker State.Health.Status observed at this inspection; empty when the
            image declares no HEALTHCHECK.

    ProbeFailedDetails:
      type: object
      additionalProperties: false
      required:
        - consecutive_failures
        - last_status
        - last_error
      properties:
        consecutive_failures:
          type: integer
          description: Number of consecutive probe failures that crossed the threshold.
        last_status:
          type: integer
          description: >-
            HTTP status of the last probe attempt; 0 when the probe failed before
            receiving a response.
        last_error:
          type: string
          description: Operator-readable error of the last probe attempt; empty when not applicable.

    ProbeRecoveredDetails:
      type: object
      additionalProperties: false
      required:
        - prior_failure_count
      properties:
        prior_failure_count:
          type: integer
          description: Number of consecutive failures observed immediately before the recovery.