---
# AsyncAPI specification version this document is written against.
# Quoted so the value is always read as a string.
asyncapi: '3.1.0'

# Contract metadata: title, contract version and a prose overview.
info:
  title: Galaxy Runtime Health Events Contract
  # Version of this contract document; quoted to keep it a string.
  version: '1.0.0'
  # Literal block scalar: line breaks and blank lines are part of the value.
  description: |
    Stable Redis Streams contract for technical container health events
    published by `Runtime Manager`. Consumers include `Game Master`;
    `Game Lobby` and `Admin Service` are reserved as future consumers.

    Three independent sources feed this stream: the Docker events
    listener, the periodic Docker inspect worker, and the active HTTP
    `/healthz` probe. Every emission also upserts the latest snapshot
    into `health_snapshots` in PostgreSQL.

    Polymorphism: the `details` field carries an `event_type`-specific
    payload selected via `oneOf` per type. Each variant is a closed object
    (no unknown fields).

    The `event_type` enum is fixed in this contract; adding a new value
    requires a contract bump and a coordinated consumer change.
# Channels: the single address on which runtime health events are carried.
channels:
  healthEvents:
    # Quoted defensively because the value contains a colon.
    address: 'runtime:health_events'
    # Messages accepted on this channel, keyed by a channel-local id.
    messages:
      runtimeHealthEvent:
        $ref: '#/components/messages/RuntimeHealthEvent'
# Operations: what this application does on the channel.
operations:
  publishHealthEvent:
    # `send` means the application described here emits the message.
    action: send
    summary: Publish one technical health event for downstream consumers.
    channel:
      $ref: '#/channels/healthEvents'
    # References the channel-level message entry for this operation.
    messages:
      - $ref: '#/channels/healthEvents/messages/runtimeHealthEvent'
# Reusable definitions referenced by the channel and operation entries:
# the message envelope, its examples, and the payload schemas.
components:
  messages:
    RuntimeHealthEvent:
      name: RuntimeHealthEvent
      title: Runtime health event
      summary: One technical health observation about a game engine container.
      payload:
        $ref: '#/components/schemas/RuntimeHealthEventPayload'
      # Each example pairs an event_type with the details variant it selects.
      # Id-like values are quoted so they can never be re-typed as numbers.
      examples:
        - name: containerStarted
          summary: Engine container has been created and started.
          payload:
            game_id: game-123
            container_id: '7c2b5d1a4f6e'
            event_type: container_started
            occurred_at_ms: 1775121700000
            details:
              image_ref: 'registry.example.com/galaxy/game:1.4.7'
        - name: containerExited
          summary: Engine container terminated with a non-zero exit code.
          payload:
            game_id: game-123
            container_id: '7c2b5d1a4f6e'
            event_type: container_exited
            occurred_at_ms: 1775121800000
            details:
              exit_code: 137
              oom: false
        - name: probeFailed
          summary: Active probe observed three consecutive failures.
          payload:
            game_id: game-123
            container_id: '7c2b5d1a4f6e'
            event_type: probe_failed
            occurred_at_ms: 1775121810000
            details:
              consecutive_failures: 3
              last_status: 0
              last_error: 'context deadline exceeded'

  schemas:
    # Envelope common to every event. Closed object: unknown top-level
    # fields are rejected.
    RuntimeHealthEventPayload:
      type: object
      additionalProperties: false
      required:
        - game_id
        - container_id
        - event_type
        - occurred_at_ms
        - details
      properties:
        game_id:
          type: string
          description: Opaque stable game identifier owned by Lobby.
        container_id:
          type: string
          description: >-
            Docker container id observed by Runtime Manager. May differ from
            the current container id after a restart race.
        event_type:
          $ref: '#/components/schemas/EventType'
        occurred_at_ms:
          type: integer
          format: int64
          description: UTC milliseconds when Runtime Manager observed the event.
        details:
          # Structural check: details must match exactly one variant.
          oneOf:
            - $ref: '#/components/schemas/ContainerStartedDetails'
            - $ref: '#/components/schemas/ContainerExitedDetails'
            - $ref: '#/components/schemas/ContainerOomDetails'
            - $ref: '#/components/schemas/ContainerDisappearedDetails'
            - $ref: '#/components/schemas/InspectUnhealthyDetails'
            - $ref: '#/components/schemas/ProbeFailedDetails'
            - $ref: '#/components/schemas/ProbeRecoveredDetails'
          description: Polymorphic payload selected by event_type.
      # Bind each event_type to its details variant. The oneOf on `details`
      # only proves that details matches some variant; without these
      # conditionals a payload whose details belong to a different
      # event_type would still validate.
      allOf:
        - if:
            required: [event_type]
            properties:
              event_type:
                const: container_started
          then:
            properties:
              details:
                $ref: '#/components/schemas/ContainerStartedDetails'
        - if:
            required: [event_type]
            properties:
              event_type:
                const: container_exited
          then:
            properties:
              details:
                $ref: '#/components/schemas/ContainerExitedDetails'
        - if:
            required: [event_type]
            properties:
              event_type:
                const: container_oom
          then:
            properties:
              details:
                $ref: '#/components/schemas/ContainerOomDetails'
        - if:
            required: [event_type]
            properties:
              event_type:
                const: container_disappeared
          then:
            properties:
              details:
                $ref: '#/components/schemas/ContainerDisappearedDetails'
        - if:
            required: [event_type]
            properties:
              event_type:
                const: inspect_unhealthy
          then:
            properties:
              details:
                $ref: '#/components/schemas/InspectUnhealthyDetails'
        - if:
            required: [event_type]
            properties:
              event_type:
                const: probe_failed
          then:
            properties:
              details:
                $ref: '#/components/schemas/ProbeFailedDetails'
        - if:
            required: [event_type]
            properties:
              event_type:
                const: probe_recovered
          then:
            properties:
              details:
                $ref: '#/components/schemas/ProbeRecoveredDetails'

    # Closed set of event kinds; one details variant exists per value.
    EventType:
      type: string
      enum:
        - container_started
        - container_exited
        - container_oom
        - container_disappeared
        - inspect_unhealthy
        - probe_failed
        - probe_recovered
      description: Discriminator selecting the details variant.

    # details for event_type container_started.
    ContainerStartedDetails:
      type: object
      additionalProperties: false
      required:
        - image_ref
      properties:
        image_ref:
          type: string
          description: Image reference of the started container.

    # details for event_type container_exited.
    ContainerExitedDetails:
      type: object
      additionalProperties: false
      required:
        - exit_code
        - oom
      properties:
        exit_code:
          type: integer
          description: Exit code reported by Docker.
        oom:
          type: boolean
          description: True when the container was killed by the OOM killer.

    # details for event_type container_oom. Differs from the exited variant
    # only by not carrying the `oom` flag.
    ContainerOomDetails:
      type: object
      additionalProperties: false
      required:
        - exit_code
      properties:
        exit_code:
          type: integer
          description: Exit code reported by Docker for the OOM event.

    # details for event_type container_disappeared. No properties are
    # declared and the object is closed, so only `{}` is valid.
    ContainerDisappearedDetails:
      type: object
      additionalProperties: false
      description: >-
        Empty payload; emitted when a destroy event is observed for a record
        Runtime Manager did not initiate.

    # details for event_type inspect_unhealthy.
    InspectUnhealthyDetails:
      type: object
      additionalProperties: false
      required:
        - restart_count
        - state
        - health
      properties:
        restart_count:
          type: integer
          description: Docker RestartCount observed at this inspection.
        state:
          type: string
          description: Docker State.Status observed at this inspection.
        health:
          type: string
          description: >-
            Docker State.Health.Status observed at this inspection; empty when
            the image declares no HEALTHCHECK.

    # details for event_type probe_failed.
    ProbeFailedDetails:
      type: object
      additionalProperties: false
      required:
        - consecutive_failures
        - last_status
        - last_error
      properties:
        consecutive_failures:
          type: integer
          description: >-
            Number of consecutive probe failures that crossed the threshold.
        last_status:
          type: integer
          description: >-
            HTTP status of the last probe attempt; 0 when the probe failed
            before receiving a response.
        last_error:
          type: string
          description: >-
            Operator-readable error of the last probe attempt; empty when not
            applicable.

    # details for event_type probe_recovered.
    ProbeRecoveredDetails:
      type: object
      additionalProperties: false
      required:
        - prior_failure_count
      properties:
        prior_failure_count:
          type: integer
          description: >-
            Number of consecutive failures observed immediately before the
            recovery.