232 lines
7.2 KiB
Go
232 lines
7.2 KiB
Go
// Package health defines the technical-health domain types owned by
// Runtime Manager.
//
// EventType matches the `event_type` enum frozen in
// `galaxy/rtmanager/api/runtime-health-asyncapi.yaml`. SnapshotStatus
// matches the SQL CHECK on `health_snapshots.status` and is intentionally
// narrower than EventType (the snapshot table collapses
// `container_started → healthy` and drops `probe_recovered` per
// `galaxy/rtmanager/README.md §Health Monitoring`).
package health
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// EventType identifies one entry on the `runtime:health_events` Redis
// Stream. Used by the health-event publishers and consumers.
type EventType string

const (
	// EventTypeContainerStarted reports a successful container start.
	EventTypeContainerStarted EventType = "container_started"

	// EventTypeContainerExited reports a non-zero Docker `die` event.
	EventTypeContainerExited EventType = "container_exited"

	// EventTypeContainerOOM reports a Docker `oom` event.
	EventTypeContainerOOM EventType = "container_oom"

	// EventTypeContainerDisappeared reports that the listener observed
	// a `destroy` event for a record Runtime Manager did not initiate.
	EventTypeContainerDisappeared EventType = "container_disappeared"

	// EventTypeInspectUnhealthy reports an unexpected outcome of the
	// periodic Docker inspect (RestartCount growth, unexpected status,
	// declared HEALTHCHECK reporting unhealthy).
	EventTypeInspectUnhealthy EventType = "inspect_unhealthy"

	// EventTypeProbeFailed reports that the active HTTP probe crossed
	// the configured failure threshold.
	EventTypeProbeFailed EventType = "probe_failed"

	// EventTypeProbeRecovered reports the first probe success after a
	// `probe_failed` event was published.
	EventTypeProbeRecovered EventType = "probe_recovered"
)

// IsKnown reports whether eventType belongs to the frozen event-type
// vocabulary. The comparison is exact: the empty string and any value
// that is not one of the EventType constants yield false.
func (eventType EventType) IsKnown() bool {
	switch eventType {
	case EventTypeContainerStarted,
		EventTypeContainerExited,
		EventTypeContainerOOM,
		EventTypeContainerDisappeared,
		EventTypeInspectUnhealthy,
		EventTypeProbeFailed,
		EventTypeProbeRecovered:
		return true
	default:
		return false
	}
}

// AllEventTypes returns the frozen list of every event-type value, in
// declaration order. Every call builds a new slice, so callers may
// modify the result without affecting later calls.
func AllEventTypes() []EventType {
	return []EventType{
		EventTypeContainerStarted,
		EventTypeContainerExited,
		EventTypeContainerOOM,
		EventTypeContainerDisappeared,
		EventTypeInspectUnhealthy,
		EventTypeProbeFailed,
		EventTypeProbeRecovered,
	}
}

// SnapshotStatus identifies one latest-observation status value stored
// in the `health_snapshots.status` column. Distinct from EventType: the
// table collapses `container_started → healthy` and never persists
// `probe_recovered` (it is conveyed only as a `runtime:health_events`
// entry with status=healthy in the next observation).
type SnapshotStatus string

const (
	// SnapshotStatusHealthy reports that the most recent observation
	// found the container live and the engine probe responsive.
	SnapshotStatusHealthy SnapshotStatus = "healthy"

	// SnapshotStatusProbeFailed reports that the active probe crossed
	// the failure threshold.
	SnapshotStatusProbeFailed SnapshotStatus = "probe_failed"

	// SnapshotStatusExited reports that the container exited.
	SnapshotStatusExited SnapshotStatus = "exited"

	// SnapshotStatusOOM reports that the container was killed by the
	// OOM killer.
	SnapshotStatusOOM SnapshotStatus = "oom"

	// SnapshotStatusInspectUnhealthy reports that the periodic inspect
	// observed an unexpected state.
	SnapshotStatusInspectUnhealthy SnapshotStatus = "inspect_unhealthy"

	// SnapshotStatusContainerDisappeared reports that Docker no longer
	// reports the container.
	SnapshotStatusContainerDisappeared SnapshotStatus = "container_disappeared"
)

// IsKnown reports whether status belongs to the frozen snapshot-status
// vocabulary. The comparison is exact: the empty string and any value
// that is not one of the SnapshotStatus constants yield false.
func (status SnapshotStatus) IsKnown() bool {
	switch status {
	case SnapshotStatusHealthy,
		SnapshotStatusProbeFailed,
		SnapshotStatusExited,
		SnapshotStatusOOM,
		SnapshotStatusInspectUnhealthy,
		SnapshotStatusContainerDisappeared:
		return true
	default:
		return false
	}
}

// AllSnapshotStatuses returns the frozen list of every snapshot-status
// value, in declaration order. Every call builds a new slice, so callers
// may modify the result without affecting later calls.
func AllSnapshotStatuses() []SnapshotStatus {
	return []SnapshotStatus{
		SnapshotStatusHealthy,
		SnapshotStatusProbeFailed,
		SnapshotStatusExited,
		SnapshotStatusOOM,
		SnapshotStatusInspectUnhealthy,
		SnapshotStatusContainerDisappeared,
	}
}

// SnapshotSource identifies the observation source that produced one
// snapshot. Matches the SQL CHECK on `health_snapshots.source`.
type SnapshotSource string

const (
	// SnapshotSourceDockerEvent reports that the latest observation
	// arrived through the Docker events listener.
	SnapshotSourceDockerEvent SnapshotSource = "docker_event"

	// SnapshotSourceInspect reports that the latest observation arrived
	// through the periodic Docker inspect worker.
	SnapshotSourceInspect SnapshotSource = "inspect"

	// SnapshotSourceProbe reports that the latest observation arrived
	// through the active HTTP probe.
	SnapshotSourceProbe SnapshotSource = "probe"
)

// IsKnown reports whether source belongs to the frozen snapshot-source
// vocabulary. The comparison is exact: the empty string and any value
// that is not one of the SnapshotSource constants yield false.
func (source SnapshotSource) IsKnown() bool {
	switch source {
	case SnapshotSourceDockerEvent,
		SnapshotSourceInspect,
		SnapshotSourceProbe:
		return true
	default:
		return false
	}
}

// AllSnapshotSources returns the frozen list of every snapshot-source
// value, in declaration order. Every call builds a new slice, so callers
// may modify the result without affecting later calls.
func AllSnapshotSources() []SnapshotSource {
	return []SnapshotSource{
		SnapshotSourceDockerEvent,
		SnapshotSourceInspect,
		SnapshotSourceProbe,
	}
}

// HealthSnapshot stores the latest technical-health observation for one
|
|
// game. One row per game_id; later observations overwrite.
|
|
type HealthSnapshot struct {
|
|
// GameID identifies the platform game.
|
|
GameID string
|
|
|
|
// ContainerID stores the Docker container id observed by the
|
|
// snapshot source. Empty when the source could not associate a
|
|
// container (e.g., reconciler dispose for a record whose container
|
|
// is already gone).
|
|
ContainerID string
|
|
|
|
// Status stores the latest observed snapshot status.
|
|
Status SnapshotStatus
|
|
|
|
// Source stores the observation source that produced this entry.
|
|
Source SnapshotSource
|
|
|
|
// Details stores the source-specific JSON detail payload. Adapters
|
|
// store and retrieve it verbatim. Empty / nil values are persisted
|
|
// as the SQL default `{}`.
|
|
Details json.RawMessage
|
|
|
|
// ObservedAt stores the wall-clock at which the source captured the
|
|
// observation.
|
|
ObservedAt time.Time
|
|
}
|
|
|
|
// Validate reports whether snapshot satisfies the snapshot invariants
|
|
// implied by the SQL CHECK constraints.
|
|
func (snapshot HealthSnapshot) Validate() error {
|
|
if strings.TrimSpace(snapshot.GameID) == "" {
|
|
return fmt.Errorf("game id must not be empty")
|
|
}
|
|
if !snapshot.Status.IsKnown() {
|
|
return fmt.Errorf("status %q is unsupported", snapshot.Status)
|
|
}
|
|
if !snapshot.Source.IsKnown() {
|
|
return fmt.Errorf("source %q is unsupported", snapshot.Source)
|
|
}
|
|
if snapshot.ObservedAt.IsZero() {
|
|
return fmt.Errorf("observed at must not be zero")
|
|
}
|
|
if len(snapshot.Details) > 0 && !json.Valid(snapshot.Details) {
|
|
return fmt.Errorf("details must be valid JSON when non-empty")
|
|
}
|
|
|
|
return nil
|
|
}
|