feat: runtime manager
This commit is contained in:
@@ -0,0 +1,195 @@
|
||||
# AsyncAPI specification version this document is written against.
# Quoted so that no YAML loader can re-type the version string.
asyncapi: '3.1.0'
# Contract metadata: title, contract version, and the narrative overview.
info:
  title: Galaxy Runtime Health Events Contract
  # Version of this contract, independent of the `asyncapi` version above.
  version: '1.0.0'
  # Literal block scalar: paragraph breaks below are part of the value.
  description: |
    Stable Redis Streams contract for technical container health events
    published by `Runtime Manager`. Consumers include `Game Master`;
    `Game Lobby` and `Admin Service` are reserved as future consumers.

    Three independent sources feed this stream: the Docker events
    listener, the periodic Docker inspect worker, and the active HTTP
    `/healthz` probe. Every emission also upserts the latest snapshot
    into `health_snapshots` in PostgreSQL.

    Polymorphism: the `details` field carries an `event_type`-specific
    payload selected via `oneOf` per type. Each variant is a closed object
    (no unknown fields).

    The `event_type` enum is fixed in this contract; adding a new value
    requires a contract bump and a coordinated consumer change.
# Channels: the one stream on which runtime health events travel.
channels:
  healthEvents:
    # Stream address; the contract description names Redis Streams as the
    # transport. Quoted because the value itself contains a colon.
    address: 'runtime:health_events'
    # Messages allowed on this channel, keyed by a channel-local id.
    messages:
      runtimeHealthEvent:
        $ref: '#/components/messages/RuntimeHealthEvent'
# Operations: what the application described by this contract does with
# the channel above.
operations:
  publishHealthEvent:
    # `send`: the described application is the producer of these messages.
    action: send
    summary: Publish one technical health event for downstream consumers.
    channel:
      $ref: '#/channels/healthEvents'
    messages:
      # References the message through the channel entry rather than
      # pointing at the component directly.
      - $ref: '#/channels/healthEvents/messages/runtimeHealthEvent'
# Reusable definitions referenced from `channels` and `operations` by $ref.
components:
  messages:
    RuntimeHealthEvent:
      name: RuntimeHealthEvent
      title: Runtime health event
      summary: One technical health observation about a game engine container.
      payload:
        $ref: '#/components/schemas/RuntimeHealthEventPayload'
      # Sample payloads for three of the seven event types. Id-like values
      # are quoted so they always load as strings.
      examples:
        - name: containerStarted
          summary: Engine container has been created and started.
          payload:
            game_id: game-123
            container_id: '7c2b5d1a4f6e'
            event_type: container_started
            occurred_at_ms: 1775121700000
            details:
              image_ref: 'registry.example.com/galaxy/game:1.4.7'
        - name: containerExited
          summary: Engine container terminated with a non-zero exit code.
          payload:
            game_id: game-123
            container_id: '7c2b5d1a4f6e'
            event_type: container_exited
            occurred_at_ms: 1775121800000
            details:
              exit_code: 137
              oom: false
        - name: probeFailed
          summary: Active probe observed three consecutive failures.
          payload:
            game_id: game-123
            container_id: '7c2b5d1a4f6e'
            event_type: probe_failed
            occurred_at_ms: 1775121810000
            details:
              consecutive_failures: 3
              last_status: 0
              last_error: 'context deadline exceeded'
  schemas:
    # Envelope shared by every health event. Closed object: all five
    # fields are required and no other fields are accepted.
    RuntimeHealthEventPayload:
      type: object
      additionalProperties: false
      required:
        - game_id
        - container_id
        - event_type
        - occurred_at_ms
        - details
      properties:
        game_id:
          type: string
          description: Opaque stable game identifier owned by Lobby.
        container_id:
          type: string
          description: >-
            Docker container id observed by Runtime Manager. May differ from
            the current container id after a restart race.
        event_type:
          $ref: '#/components/schemas/EventType'
        occurred_at_ms:
          type: integer
          format: int64
          description: UTC milliseconds when Runtime Manager observed the event.
        details:
          # The variants are closed objects with pairwise different
          # required sets, so a valid object matches exactly one of them.
          # NOTE(review): no keyword here ties the matched variant to
          # `event_type`; a validator accepts any variant next to any
          # event_type value. Confirm that consumers enforce the pairing.
          oneOf:
            - $ref: '#/components/schemas/ContainerStartedDetails'
            - $ref: '#/components/schemas/ContainerExitedDetails'
            - $ref: '#/components/schemas/ContainerOomDetails'
            - $ref: '#/components/schemas/ContainerDisappearedDetails'
            - $ref: '#/components/schemas/InspectUnhealthyDetails'
            - $ref: '#/components/schemas/ProbeFailedDetails'
            - $ref: '#/components/schemas/ProbeRecoveredDetails'
          description: Polymorphic payload selected by event_type.
    # Closed set of event kinds; the contract description requires a
    # contract bump to add a value.
    EventType:
      type: string
      enum:
        - container_started
        - container_exited
        - container_oom
        - container_disappeared
        - inspect_unhealthy
        - probe_failed
        - probe_recovered
      description: Discriminator selecting the details variant.
    # details for `container_started`.
    ContainerStartedDetails:
      type: object
      additionalProperties: false
      required:
        - image_ref
      properties:
        image_ref:
          type: string
          description: Image reference of the started container.
    # details for `container_exited`.
    ContainerExitedDetails:
      type: object
      additionalProperties: false
      required:
        - exit_code
        - oom
      properties:
        exit_code:
          type: integer
          description: Exit code reported by Docker.
        oom:
          type: boolean
          description: True when the container was killed by the OOM killer.
    # details for `container_oom`. Differs from the exited variant only by
    # having no `oom` field.
    ContainerOomDetails:
      type: object
      additionalProperties: false
      required:
        - exit_code
      properties:
        exit_code:
          type: integer
          description: Exit code reported by Docker for the OOM event.
    # details for `container_disappeared`. No properties are declared, so
    # the only valid value is an empty object.
    ContainerDisappearedDetails:
      type: object
      additionalProperties: false
      description: >-
        Empty payload; emitted when a destroy event is observed for a record
        Runtime Manager did not initiate.
    # details for `inspect_unhealthy`.
    InspectUnhealthyDetails:
      type: object
      additionalProperties: false
      required:
        - restart_count
        - state
        - health
      properties:
        restart_count:
          type: integer
          description: Docker RestartCount observed at this inspection.
        state:
          type: string
          description: Docker State.Status observed at this inspection.
        health:
          type: string
          description: >-
            Docker State.Health.Status observed at this inspection; empty when
            the image declares no HEALTHCHECK.
    # details for `probe_failed`.
    ProbeFailedDetails:
      type: object
      additionalProperties: false
      required:
        - consecutive_failures
        - last_status
        - last_error
      properties:
        consecutive_failures:
          type: integer
          description: Number of consecutive probe failures that crossed the threshold.
        last_status:
          type: integer
          description: >-
            HTTP status of the last probe attempt; 0 when the probe failed
            before receiving a response.
        last_error:
          type: string
          description: Operator-readable error of the last probe attempt; empty when not applicable.
    # details for `probe_recovered`.
    ProbeRecoveredDetails:
      type: object
      additionalProperties: false
      required:
        - prior_failure_count
      properties:
        prior_failure_count:
          type: integer
          description: Number of consecutive failures observed immediately before the recovery.
Reference in New Issue
Block a user