// Package health defines the technical-health domain types owned by
// Runtime Manager.
//
// EventType matches the `event_type` enum frozen in
// `galaxy/rtmanager/api/runtime-health-asyncapi.yaml`. SnapshotStatus
// matches the SQL CHECK on `health_snapshots.status` and is intentionally
// narrower than EventType (the snapshot table collapses
// `container_started → healthy` and drops `probe_recovered` per
// `galaxy/rtmanager/README.md §Health Monitoring`).
package health

import (
	"encoding/json"
	"fmt"
	"strings"
	"time"
)

// EventType identifies one entry on the `runtime:health_events` Redis
// Stream. Used by the health-event publishers and consumers. The
// underlying string is the frozen vocabulary value itself.
type EventType string

// The frozen event-type vocabulary. Any value added here must also be
// listed in EventType.IsKnown and AllEventTypes, which enumerate these
// constants explicitly.
const (
	// EventTypeContainerStarted reports a successful container start.
	EventTypeContainerStarted EventType = "container_started"

	// EventTypeContainerExited reports a non-zero Docker `die` event.
	EventTypeContainerExited EventType = "container_exited"

	// EventTypeContainerOOM reports a Docker `oom` event.
	EventTypeContainerOOM EventType = "container_oom"

	// EventTypeContainerDisappeared reports that the listener observed
	// a `destroy` event for a record Runtime Manager did not initiate.
	EventTypeContainerDisappeared EventType = "container_disappeared"

	// EventTypeInspectUnhealthy reports an unexpected outcome of the
	// periodic Docker inspect (RestartCount growth, unexpected status,
	// declared HEALTHCHECK reporting unhealthy).
	EventTypeInspectUnhealthy EventType = "inspect_unhealthy"

	// EventTypeProbeFailed reports that the active HTTP probe crossed
	// the configured failure threshold.
	EventTypeProbeFailed EventType = "probe_failed"

	// EventTypeProbeRecovered reports the first probe success after a
	// `probe_failed` event was published.
	EventTypeProbeRecovered EventType = "probe_recovered"
)

// IsKnown reports whether the event type belongs to the frozen
// event-type vocabulary.
func (eventType EventType) IsKnown() bool { switch eventType { case EventTypeContainerStarted, EventTypeContainerExited, EventTypeContainerOOM, EventTypeContainerDisappeared, EventTypeInspectUnhealthy, EventTypeProbeFailed, EventTypeProbeRecovered: return true default: return false } } // AllEventTypes returns the frozen list of every event-type value. func AllEventTypes() []EventType { return []EventType{ EventTypeContainerStarted, EventTypeContainerExited, EventTypeContainerOOM, EventTypeContainerDisappeared, EventTypeInspectUnhealthy, EventTypeProbeFailed, EventTypeProbeRecovered, } } // SnapshotStatus identifies one latest-observation status value stored // in the `health_snapshots.status` column. Distinct from EventType: the // table collapses `container_started → healthy` and never persists // `probe_recovered` (it is conveyed only as a `runtime:health_events` // entry with status=healthy in the next observation). type SnapshotStatus string const ( // SnapshotStatusHealthy reports that the most recent observation // found the container live and the engine probe responsive. SnapshotStatusHealthy SnapshotStatus = "healthy" // SnapshotStatusProbeFailed reports that the active probe crossed // the failure threshold. SnapshotStatusProbeFailed SnapshotStatus = "probe_failed" // SnapshotStatusExited reports that the container exited. SnapshotStatusExited SnapshotStatus = "exited" // SnapshotStatusOOM reports that the container was killed by the // OOM killer. SnapshotStatusOOM SnapshotStatus = "oom" // SnapshotStatusInspectUnhealthy reports that the periodic inspect // observed an unexpected state. SnapshotStatusInspectUnhealthy SnapshotStatus = "inspect_unhealthy" // SnapshotStatusContainerDisappeared reports that Docker no longer // reports the container. SnapshotStatusContainerDisappeared SnapshotStatus = "container_disappeared" ) // IsKnown reports whether status belongs to the frozen snapshot-status // vocabulary. 
func (status SnapshotStatus) IsKnown() bool { switch status { case SnapshotStatusHealthy, SnapshotStatusProbeFailed, SnapshotStatusExited, SnapshotStatusOOM, SnapshotStatusInspectUnhealthy, SnapshotStatusContainerDisappeared: return true default: return false } } // AllSnapshotStatuses returns the frozen list of every snapshot-status // value. func AllSnapshotStatuses() []SnapshotStatus { return []SnapshotStatus{ SnapshotStatusHealthy, SnapshotStatusProbeFailed, SnapshotStatusExited, SnapshotStatusOOM, SnapshotStatusInspectUnhealthy, SnapshotStatusContainerDisappeared, } } // SnapshotSource identifies the observation source that produced one // snapshot. Matches the SQL CHECK on `health_snapshots.source`. type SnapshotSource string const ( // SnapshotSourceDockerEvent reports that the latest observation // arrived through the Docker events listener. SnapshotSourceDockerEvent SnapshotSource = "docker_event" // SnapshotSourceInspect reports that the latest observation arrived // through the periodic Docker inspect worker. SnapshotSourceInspect SnapshotSource = "inspect" // SnapshotSourceProbe reports that the latest observation arrived // through the active HTTP probe. SnapshotSourceProbe SnapshotSource = "probe" ) // IsKnown reports whether source belongs to the frozen snapshot-source // vocabulary. func (source SnapshotSource) IsKnown() bool { switch source { case SnapshotSourceDockerEvent, SnapshotSourceInspect, SnapshotSourceProbe: return true default: return false } } // AllSnapshotSources returns the frozen list of every snapshot-source // value. func AllSnapshotSources() []SnapshotSource { return []SnapshotSource{ SnapshotSourceDockerEvent, SnapshotSourceInspect, SnapshotSourceProbe, } } // HealthSnapshot stores the latest technical-health observation for one // game. One row per game_id; later observations overwrite. type HealthSnapshot struct { // GameID identifies the platform game. 
GameID string // ContainerID stores the Docker container id observed by the // snapshot source. Empty when the source could not associate a // container (e.g., reconciler dispose for a record whose container // is already gone). ContainerID string // Status stores the latest observed snapshot status. Status SnapshotStatus // Source stores the observation source that produced this entry. Source SnapshotSource // Details stores the source-specific JSON detail payload. Adapters // store and retrieve it verbatim. Empty / nil values are persisted // as the SQL default `{}`. Details json.RawMessage // ObservedAt stores the wall-clock at which the source captured the // observation. ObservedAt time.Time } // Validate reports whether snapshot satisfies the snapshot invariants // implied by the SQL CHECK constraints. func (snapshot HealthSnapshot) Validate() error { if strings.TrimSpace(snapshot.GameID) == "" { return fmt.Errorf("game id must not be empty") } if !snapshot.Status.IsKnown() { return fmt.Errorf("status %q is unsupported", snapshot.Status) } if !snapshot.Source.IsKnown() { return fmt.Errorf("source %q is unsupported", snapshot.Source) } if snapshot.ObservedAt.IsZero() { return fmt.Errorf("observed at must not be zero") } if len(snapshot.Details) > 0 && !json.Valid(snapshot.Details) { return fmt.Errorf("details must be valid JSON when non-empty") } return nil }