feat: runtime manager
This commit is contained in:
@@ -0,0 +1,231 @@
|
||||
// Package health defines the technical-health domain types owned by
// Runtime Manager.
//
// EventType matches the `event_type` enum frozen in
// `galaxy/rtmanager/api/runtime-health-asyncapi.yaml`. SnapshotStatus
// matches the SQL CHECK on `health_snapshots.status` and is intentionally
// narrower than EventType (the snapshot table collapses
// `container_started → healthy` and drops `probe_recovered` per
// `galaxy/rtmanager/README.md §Health Monitoring`).
|
||||
package health
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// EventType identifies the kind of one entry on the
// `runtime:health_events` Redis Stream. It is used by the health-event
// publishers and consumers.
type EventType string

// Frozen event-type vocabulary. IsKnown and AllEventTypes enumerate the
// same values; keep all three in sync when the vocabulary changes.
const (
	// EventTypeContainerStarted reports a successful container start.
	EventTypeContainerStarted EventType = "container_started"

	// EventTypeContainerExited reports a non-zero Docker `die` event.
	EventTypeContainerExited EventType = "container_exited"

	// EventTypeContainerOOM reports a Docker `oom` event.
	EventTypeContainerOOM EventType = "container_oom"

	// EventTypeContainerDisappeared reports that the listener observed
	// a `destroy` event for a record Runtime Manager did not initiate.
	EventTypeContainerDisappeared EventType = "container_disappeared"

	// EventTypeInspectUnhealthy reports an unexpected outcome of the
	// periodic Docker inspect (RestartCount growth, unexpected status,
	// declared HEALTHCHECK reporting unhealthy).
	EventTypeInspectUnhealthy EventType = "inspect_unhealthy"

	// EventTypeProbeFailed reports that the active HTTP probe crossed
	// the configured failure threshold.
	EventTypeProbeFailed EventType = "probe_failed"

	// EventTypeProbeRecovered reports the first probe success after a
	// `probe_failed` event was published.
	EventTypeProbeRecovered EventType = "probe_recovered"
)

// IsKnown reports whether eventType belongs to the frozen event-type
// vocabulary. The comparison is exact: the empty string and any value
// that differs in case or surrounding whitespace yield false.
func (eventType EventType) IsKnown() bool {
	switch eventType {
	case EventTypeContainerStarted,
		EventTypeContainerExited,
		EventTypeContainerOOM,
		EventTypeContainerDisappeared,
		EventTypeInspectUnhealthy,
		EventTypeProbeFailed,
		EventTypeProbeRecovered:
		return true
	default:
		return false
	}
}

// AllEventTypes returns the frozen list of every event-type value, in
// declaration order. A new slice is built on every call, so callers may
// modify the result without affecting later calls.
func AllEventTypes() []EventType {
	return []EventType{
		EventTypeContainerStarted,
		EventTypeContainerExited,
		EventTypeContainerOOM,
		EventTypeContainerDisappeared,
		EventTypeInspectUnhealthy,
		EventTypeProbeFailed,
		EventTypeProbeRecovered,
	}
}
|
||||
|
||||
// SnapshotStatus identifies one latest-observation status value stored
// in the `health_snapshots.status` column. It is distinct from
// EventType: the table collapses `container_started → healthy` and never
// persists `probe_recovered` (it is conveyed only as a
// `runtime:health_events` entry with status=healthy in the next
// observation).
type SnapshotStatus string

// Frozen snapshot-status vocabulary. IsKnown and AllSnapshotStatuses
// enumerate the same values; keep all three in sync when the vocabulary
// changes.
const (
	// SnapshotStatusHealthy reports that the most recent observation
	// found the container live and the engine probe responsive.
	SnapshotStatusHealthy SnapshotStatus = "healthy"

	// SnapshotStatusProbeFailed reports that the active probe crossed
	// the failure threshold.
	SnapshotStatusProbeFailed SnapshotStatus = "probe_failed"

	// SnapshotStatusExited reports that the container exited.
	SnapshotStatusExited SnapshotStatus = "exited"

	// SnapshotStatusOOM reports that the container was killed by the
	// OOM killer.
	SnapshotStatusOOM SnapshotStatus = "oom"

	// SnapshotStatusInspectUnhealthy reports that the periodic inspect
	// observed an unexpected state.
	SnapshotStatusInspectUnhealthy SnapshotStatus = "inspect_unhealthy"

	// SnapshotStatusContainerDisappeared reports that Docker no longer
	// reports the container.
	SnapshotStatusContainerDisappeared SnapshotStatus = "container_disappeared"
)

// IsKnown reports whether status belongs to the frozen snapshot-status
// vocabulary. The comparison is exact: the empty string and event-only
// names such as "container_started" yield false.
func (status SnapshotStatus) IsKnown() bool {
	switch status {
	case SnapshotStatusHealthy,
		SnapshotStatusProbeFailed,
		SnapshotStatusExited,
		SnapshotStatusOOM,
		SnapshotStatusInspectUnhealthy,
		SnapshotStatusContainerDisappeared:
		return true
	default:
		return false
	}
}

// AllSnapshotStatuses returns the frozen list of every snapshot-status
// value, in declaration order. A new slice is built on every call, so
// callers may modify the result without affecting later calls.
func AllSnapshotStatuses() []SnapshotStatus {
	return []SnapshotStatus{
		SnapshotStatusHealthy,
		SnapshotStatusProbeFailed,
		SnapshotStatusExited,
		SnapshotStatusOOM,
		SnapshotStatusInspectUnhealthy,
		SnapshotStatusContainerDisappeared,
	}
}
|
||||
|
||||
// SnapshotSource identifies the observation source that produced one
// snapshot. It matches the SQL CHECK on `health_snapshots.source`.
type SnapshotSource string

// Frozen snapshot-source vocabulary. IsKnown and AllSnapshotSources
// enumerate the same values; keep all three in sync when the vocabulary
// changes.
const (
	// SnapshotSourceDockerEvent reports that the latest observation
	// arrived through the Docker events listener.
	SnapshotSourceDockerEvent SnapshotSource = "docker_event"

	// SnapshotSourceInspect reports that the latest observation arrived
	// through the periodic Docker inspect worker.
	SnapshotSourceInspect SnapshotSource = "inspect"

	// SnapshotSourceProbe reports that the latest observation arrived
	// through the active HTTP probe.
	SnapshotSourceProbe SnapshotSource = "probe"
)

// IsKnown reports whether source belongs to the frozen snapshot-source
// vocabulary. The comparison is exact: the empty string and any value
// that differs in case yield false.
func (source SnapshotSource) IsKnown() bool {
	switch source {
	case SnapshotSourceDockerEvent,
		SnapshotSourceInspect,
		SnapshotSourceProbe:
		return true
	default:
		return false
	}
}

// AllSnapshotSources returns the frozen list of every snapshot-source
// value, in declaration order. A new slice is built on every call, so
// callers may modify the result without affecting later calls.
func AllSnapshotSources() []SnapshotSource {
	return []SnapshotSource{
		SnapshotSourceDockerEvent,
		SnapshotSourceInspect,
		SnapshotSourceProbe,
	}
}
|
||||
|
||||
// HealthSnapshot stores the latest technical-health observation for one
|
||||
// game. One row per game_id; later observations overwrite.
|
||||
type HealthSnapshot struct {
|
||||
// GameID identifies the platform game.
|
||||
GameID string
|
||||
|
||||
// ContainerID stores the Docker container id observed by the
|
||||
// snapshot source. Empty when the source could not associate a
|
||||
// container (e.g., reconciler dispose for a record whose container
|
||||
// is already gone).
|
||||
ContainerID string
|
||||
|
||||
// Status stores the latest observed snapshot status.
|
||||
Status SnapshotStatus
|
||||
|
||||
// Source stores the observation source that produced this entry.
|
||||
Source SnapshotSource
|
||||
|
||||
// Details stores the source-specific JSON detail payload. Adapters
|
||||
// store and retrieve it verbatim. Empty / nil values are persisted
|
||||
// as the SQL default `{}`.
|
||||
Details json.RawMessage
|
||||
|
||||
// ObservedAt stores the wall-clock at which the source captured the
|
||||
// observation.
|
||||
ObservedAt time.Time
|
||||
}
|
||||
|
||||
// Validate reports whether snapshot satisfies the snapshot invariants
|
||||
// implied by the SQL CHECK constraints.
|
||||
func (snapshot HealthSnapshot) Validate() error {
|
||||
if strings.TrimSpace(snapshot.GameID) == "" {
|
||||
return fmt.Errorf("game id must not be empty")
|
||||
}
|
||||
if !snapshot.Status.IsKnown() {
|
||||
return fmt.Errorf("status %q is unsupported", snapshot.Status)
|
||||
}
|
||||
if !snapshot.Source.IsKnown() {
|
||||
return fmt.Errorf("source %q is unsupported", snapshot.Source)
|
||||
}
|
||||
if snapshot.ObservedAt.IsZero() {
|
||||
return fmt.Errorf("observed at must not be zero")
|
||||
}
|
||||
if len(snapshot.Details) > 0 && !json.Valid(snapshot.Details) {
|
||||
return fmt.Errorf("details must be valid JSON when non-empty")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
Reference in New Issue
Block a user