feat: runtime manager

This commit is contained in:
Ilia Denisov
2026-04-28 20:39:18 +02:00
committed by GitHub
parent e0a99b346b
commit a7cee15115
289 changed files with 45660 additions and 2207 deletions
@@ -0,0 +1,231 @@
// Package health defines the technical-health domain types owned by
// Runtime Manager.
//
// EventType matches the `event_type` enum frozen in
// `galaxy/rtmanager/api/runtime-health-asyncapi.yaml`. SnapshotStatus
// matches the SQL CHECK on `health_snapshots.status` and is intentionally
// narrower than EventType (the snapshot table collapses
// `container_started → healthy` and drops `probe_recovered` per
// `galaxy/rtmanager/README.md §Health Monitoring`).
package health
import (
"encoding/json"
"fmt"
"strings"
"time"
)
// EventType names a single kind of entry carried on the
// `runtime:health_events` Redis Stream. Both the health-event publishers
// and the consumers of that stream work with this type.
type EventType string

const (
	// EventTypeContainerStarted signals that a container start
	// succeeded.
	EventTypeContainerStarted EventType = "container_started"

	// EventTypeContainerExited signals a non-zero Docker `die` event.
	EventTypeContainerExited EventType = "container_exited"

	// EventTypeContainerOOM signals a Docker `oom` event.
	EventTypeContainerOOM EventType = "container_oom"

	// EventTypeContainerDisappeared signals that the listener saw a
	// `destroy` event for a record whose removal Runtime Manager did not
	// initiate.
	EventTypeContainerDisappeared EventType = "container_disappeared"

	// EventTypeInspectUnhealthy signals an unexpected result from the
	// periodic Docker inspect: RestartCount growth, an unexpected
	// status, or a declared HEALTHCHECK reporting unhealthy.
	EventTypeInspectUnhealthy EventType = "inspect_unhealthy"

	// EventTypeProbeFailed signals that the active HTTP probe crossed
	// the configured failure threshold.
	EventTypeProbeFailed EventType = "probe_failed"

	// EventTypeProbeRecovered signals the first successful probe after
	// a `probe_failed` event was published.
	EventTypeProbeRecovered EventType = "probe_recovered"
)

// IsKnown tells whether eventType is one of the values of the frozen
// event-type vocabulary declared above. Every other string, the empty
// string included, yields false.
func (eventType EventType) IsKnown() bool {
	switch eventType {
	case EventTypeContainerStarted,
		EventTypeContainerExited,
		EventTypeContainerOOM,
		EventTypeContainerDisappeared:
		// Container lifecycle events.
		return true
	case EventTypeInspectUnhealthy:
		// Periodic inspect outcome.
		return true
	case EventTypeProbeFailed, EventTypeProbeRecovered:
		// Active probe events.
		return true
	}
	return false
}
// AllEventTypes lists every value of the frozen event-type vocabulary.
// Each call builds a new slice, so callers may modify the result freely.
func AllEventTypes() []EventType {
	all := [...]EventType{
		EventTypeContainerStarted,
		EventTypeContainerExited,
		EventTypeContainerOOM,
		EventTypeContainerDisappeared,
		EventTypeInspectUnhealthy,
		EventTypeProbeFailed,
		EventTypeProbeRecovered,
	}
	return all[:]
}
// SnapshotStatus names a latest-observation status value as stored in
// the `health_snapshots.status` column. It is not the same vocabulary as
// EventType: the table folds `container_started` into `healthy` and
// never persists `probe_recovered`, which is conveyed only as a
// `runtime:health_events` entry with status=healthy in the next
// observation.
type SnapshotStatus string

const (
	// SnapshotStatusHealthy means the most recent observation found the
	// container live and the engine probe responsive.
	SnapshotStatusHealthy SnapshotStatus = "healthy"

	// SnapshotStatusProbeFailed means the active probe crossed the
	// failure threshold.
	SnapshotStatusProbeFailed SnapshotStatus = "probe_failed"

	// SnapshotStatusExited means the container exited.
	SnapshotStatusExited SnapshotStatus = "exited"

	// SnapshotStatusOOM means the container was killed by the OOM
	// killer.
	SnapshotStatusOOM SnapshotStatus = "oom"

	// SnapshotStatusInspectUnhealthy means the periodic inspect observed
	// an unexpected state.
	SnapshotStatusInspectUnhealthy SnapshotStatus = "inspect_unhealthy"

	// SnapshotStatusContainerDisappeared means Docker no longer reports
	// the container.
	SnapshotStatusContainerDisappeared SnapshotStatus = "container_disappeared"
)

// IsKnown tells whether status is one of the values of the frozen
// snapshot-status vocabulary declared above. Every other string, the
// empty string included, yields false.
func (status SnapshotStatus) IsKnown() bool {
	switch status {
	case SnapshotStatusHealthy, SnapshotStatusProbeFailed, SnapshotStatusExited,
		SnapshotStatusOOM, SnapshotStatusInspectUnhealthy, SnapshotStatusContainerDisappeared:
		return true
	}
	return false
}
// AllSnapshotStatuses lists every value of the frozen snapshot-status
// vocabulary. Each call builds a new slice, so callers may modify the
// result freely.
func AllSnapshotStatuses() []SnapshotStatus {
	all := [...]SnapshotStatus{
		SnapshotStatusHealthy,
		SnapshotStatusProbeFailed,
		SnapshotStatusExited,
		SnapshotStatusOOM,
		SnapshotStatusInspectUnhealthy,
		SnapshotStatusContainerDisappeared,
	}
	return all[:]
}
// SnapshotSource names the observation source behind one snapshot. Its
// values match the SQL CHECK on `health_snapshots.source`.
type SnapshotSource string

const (
	// SnapshotSourceDockerEvent means the latest observation came from
	// the Docker events listener.
	SnapshotSourceDockerEvent SnapshotSource = "docker_event"

	// SnapshotSourceInspect means the latest observation came from the
	// periodic Docker inspect worker.
	SnapshotSourceInspect SnapshotSource = "inspect"

	// SnapshotSourceProbe means the latest observation came from the
	// active HTTP probe.
	SnapshotSourceProbe SnapshotSource = "probe"
)

// IsKnown tells whether source is one of the values of the frozen
// snapshot-source vocabulary declared above. Every other string, the
// empty string included, yields false.
func (source SnapshotSource) IsKnown() bool {
	return source == SnapshotSourceDockerEvent ||
		source == SnapshotSourceInspect ||
		source == SnapshotSourceProbe
}
// AllSnapshotSources lists every value of the frozen snapshot-source
// vocabulary. Each call builds a new slice, so callers may modify the
// result freely.
func AllSnapshotSources() []SnapshotSource {
	all := [...]SnapshotSource{
		SnapshotSourceDockerEvent,
		SnapshotSourceInspect,
		SnapshotSourceProbe,
	}
	return all[:]
}
// HealthSnapshot is the most recent technical-health observation
// recorded for a single game. There is one row per game_id, and each
// later observation overwrites the previous one.
type HealthSnapshot struct {
	// GameID is the identifier of the platform game.
	GameID string

	// ContainerID is the Docker container id seen by the snapshot
	// source. It is empty when the source had no container to associate
	// (e.g., a reconciler dispose for a record whose container is
	// already gone).
	ContainerID string

	// Status is the latest observed snapshot status.
	Status SnapshotStatus

	// Source is the observation source that produced this entry.
	Source SnapshotSource

	// Details is the source-specific JSON detail payload. Adapters store
	// and retrieve it verbatim; empty or nil values are persisted as the
	// SQL default `{}`.
	Details json.RawMessage

	// ObservedAt is the wall-clock time at which the source captured the
	// observation.
	ObservedAt time.Time
}

// Validate checks snapshot against the invariants implied by the SQL
// CHECK constraints. It returns an error describing the first violation
// found, or nil when every check passes.
//
// The checks run in this order: GameID must contain at least one
// non-whitespace character, Status and Source must be known values,
// ObservedAt must not be the zero time, and Details, when non-empty,
// must be valid JSON. ContainerID is not checked.
func (snapshot HealthSnapshot) Validate() error {
	switch {
	case strings.TrimSpace(snapshot.GameID) == "":
		return fmt.Errorf("game id must not be empty")
	case !snapshot.Status.IsKnown():
		return fmt.Errorf("status %q is unsupported", snapshot.Status)
	case !snapshot.Source.IsKnown():
		return fmt.Errorf("source %q is unsupported", snapshot.Source)
	case snapshot.ObservedAt.IsZero():
		return fmt.Errorf("observed at must not be zero")
	case len(snapshot.Details) > 0 && !json.Valid(snapshot.Details):
		// Empty or nil Details is accepted; only a non-empty payload is
		// required to be valid JSON.
		return fmt.Errorf("details must be valid JSON when non-empty")
	}
	return nil
}