feat: runtime manager
This commit is contained in:
@@ -0,0 +1,318 @@
|
||||
// Package dockerinspect runs the periodic Docker inspect described in
|
||||
// `rtmanager/README.md §Health Monitoring`.
|
||||
//
|
||||
// On every tick the worker lists `runtime_records.status=running`,
|
||||
// inspects each container, and emits `inspect_unhealthy` when any of
|
||||
// the following holds:
|
||||
//
|
||||
// - `RestartCount` increased between observations (delta detection
|
||||
// requires a prior observation; the first inspect of a record only
|
||||
// records the baseline);
|
||||
// - `State.Status != "running"`;
|
||||
// - `State.Health.Status == "unhealthy"` (only meaningful when the
|
||||
// image declares a Docker HEALTHCHECK).
|
||||
//
|
||||
// `ErrContainerNotFound` is left to the reconciler — the inspect
|
||||
// worker logs and skips so that `container_disappeared` emission
|
||||
// stays single-sourced (Docker events listener + reconciler).
|
||||
//
|
||||
// Per-game state is pruned at the start of every tick against the
|
||||
// freshly-read running list, so a stopped or removed game never
|
||||
// carries a stale baseline into a new lifecycle.
|
||||
package dockerinspect
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"log/slog"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/domain/health"
|
||||
"galaxy/rtmanager/internal/domain/runtime"
|
||||
"galaxy/rtmanager/internal/logging"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
"galaxy/rtmanager/internal/telemetry"
|
||||
)
|
||||
|
||||
// Verbatim Docker inspect values the worker compares observations
// against.
const (
	// dockerStateRunning is the `State.Status` value treated as
	// healthy; anything else triggers an emission.
	dockerStateRunning = "running"

	// dockerHealthUnhealthy is the `State.Health.Status` value
	// treated as unhealthy (only set when the image declares a
	// HEALTHCHECK).
	dockerHealthUnhealthy = "unhealthy"
)
|
||||
|
||||
// Dependencies groups the collaborators required by Worker. NewWorker
// rejects a nil Docker, RuntimeRecords, HealthEvents, or Telemetry and
// a non-positive Interval; Clock and Logger are optional.
type Dependencies struct {
	// Docker provides the InspectContainer surface, called once per
	// running record per tick.
	Docker ports.DockerClient

	// RuntimeRecords lists running games on every tick.
	RuntimeRecords ports.RuntimeRecordStore

	// HealthEvents emits `inspect_unhealthy` entries.
	HealthEvents ports.HealthEventPublisher

	// Telemetry records one health-event counter per emission.
	Telemetry *telemetry.Runtime

	// Interval bounds the tick period. Must be positive.
	Interval time.Duration

	// Clock supplies the wall-clock used for emission timestamps.
	// Defaults to `time.Now` when nil.
	Clock func() time.Time

	// Logger receives structured worker-level events. Defaults to
	// `slog.Default()` when nil.
	Logger *slog.Logger
}
|
||||
|
||||
// Worker drives the periodic inspect loop. Construct with NewWorker;
// the zero value has nil collaborators and is not usable.
type Worker struct {
	docker         ports.DockerClient
	runtimeRecords ports.RuntimeRecordStore
	healthEvents   ports.HealthEventPublisher
	telemetry      *telemetry.Runtime

	// interval is the ticker period used by Run.
	interval time.Duration

	clock  func() time.Time
	logger *slog.Logger

	// mu guards states; Run's ticker goroutine and the exported Tick
	// helper may otherwise touch the map concurrently.
	mu sync.Mutex
	// states maps game ID to its per-lifecycle inspect baseline.
	states map[string]*inspectState
}
|
||||
|
||||
// inspectState stores the per-game baseline. Owned by Worker and
// protected by Worker.mu.
type inspectState struct {
	// lastRestartCount is the container RestartCount observed on the
	// most recent inspect of this game.
	lastRestartCount int

	// seen is false until the first inspect seeds the baseline;
	// restart-delta detection is suppressed until then.
	seen bool
}
|
||||
|
||||
// NewWorker constructs one Worker from deps.
|
||||
func NewWorker(deps Dependencies) (*Worker, error) {
|
||||
switch {
|
||||
case deps.Docker == nil:
|
||||
return nil, errors.New("new docker inspect worker: nil docker client")
|
||||
case deps.RuntimeRecords == nil:
|
||||
return nil, errors.New("new docker inspect worker: nil runtime records store")
|
||||
case deps.HealthEvents == nil:
|
||||
return nil, errors.New("new docker inspect worker: nil health events publisher")
|
||||
case deps.Telemetry == nil:
|
||||
return nil, errors.New("new docker inspect worker: nil telemetry runtime")
|
||||
case deps.Interval <= 0:
|
||||
return nil, errors.New("new docker inspect worker: interval must be positive")
|
||||
}
|
||||
|
||||
clock := deps.Clock
|
||||
if clock == nil {
|
||||
clock = time.Now
|
||||
}
|
||||
logger := deps.Logger
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
|
||||
return &Worker{
|
||||
docker: deps.Docker,
|
||||
runtimeRecords: deps.RuntimeRecords,
|
||||
healthEvents: deps.HealthEvents,
|
||||
telemetry: deps.Telemetry,
|
||||
interval: deps.Interval,
|
||||
clock: clock,
|
||||
logger: logger.With("worker", "rtmanager.dockerinspect"),
|
||||
states: map[string]*inspectState{},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Run drives the inspect loop until ctx is cancelled. Per-tick errors
|
||||
// are absorbed; the loop only exits on context cancellation.
|
||||
func (worker *Worker) Run(ctx context.Context) error {
|
||||
if worker == nil {
|
||||
return errors.New("run docker inspect worker: nil worker")
|
||||
}
|
||||
if ctx == nil {
|
||||
return errors.New("run docker inspect worker: nil context")
|
||||
}
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
worker.logger.Info("docker inspect worker started",
|
||||
"interval", worker.interval.String(),
|
||||
)
|
||||
defer worker.logger.Info("docker inspect worker stopped")
|
||||
|
||||
ticker := time.NewTicker(worker.interval)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-ticker.C:
|
||||
worker.tick(ctx)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Shutdown is a no-op; Run terminates on context cancellation.
|
||||
func (worker *Worker) Shutdown(ctx context.Context) error {
|
||||
if ctx == nil {
|
||||
return errors.New("shutdown docker inspect worker: nil context")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Tick performs one inspect pass. Exported so tests can drive the
|
||||
// worker deterministically without spinning a real ticker.
|
||||
func (worker *Worker) Tick(ctx context.Context) {
|
||||
worker.tick(ctx)
|
||||
}
|
||||
|
||||
// tick performs one full pass: list running records, prune state for
|
||||
// stopped games, then inspect every running container sequentially.
|
||||
// Inspect calls are cheap; sequential execution avoids fan-out against
|
||||
// the Docker daemon.
|
||||
func (worker *Worker) tick(ctx context.Context) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
records, err := worker.runtimeRecords.ListByStatus(ctx, runtime.StatusRunning)
|
||||
if err != nil {
|
||||
worker.logger.WarnContext(ctx, "list running records",
|
||||
"err", err.Error(),
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
worker.pruneStates(records)
|
||||
|
||||
for _, record := range records {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return
|
||||
}
|
||||
worker.inspectOne(ctx, record)
|
||||
}
|
||||
}
|
||||
|
||||
// pruneStates removes per-game baselines for games no longer in the
|
||||
// running list.
|
||||
func (worker *Worker) pruneStates(records []runtime.RuntimeRecord) {
|
||||
worker.mu.Lock()
|
||||
defer worker.mu.Unlock()
|
||||
if len(worker.states) == 0 {
|
||||
return
|
||||
}
|
||||
running := make(map[string]struct{}, len(records))
|
||||
for _, record := range records {
|
||||
running[record.GameID] = struct{}{}
|
||||
}
|
||||
for gameID := range worker.states {
|
||||
if _, ok := running[gameID]; !ok {
|
||||
delete(worker.states, gameID)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// inspectOne issues one InspectContainer call and emits
|
||||
// `inspect_unhealthy` when the observation crosses any of the three
|
||||
// trigger conditions. The first observation of a record only seeds the
|
||||
// baseline; deltas need at least two ticks.
|
||||
func (worker *Worker) inspectOne(ctx context.Context, record runtime.RuntimeRecord) {
|
||||
inspect, err := worker.docker.InspectContainer(ctx, record.CurrentContainerID)
|
||||
if err != nil {
|
||||
if errors.Is(err, ports.ErrContainerNotFound) {
|
||||
worker.logger.DebugContext(ctx, "inspect skipped: container missing",
|
||||
"game_id", record.GameID,
|
||||
"container_id", record.CurrentContainerID,
|
||||
)
|
||||
return
|
||||
}
|
||||
worker.logger.WarnContext(ctx, "inspect failed",
|
||||
"game_id", record.GameID,
|
||||
"container_id", record.CurrentContainerID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
worker.mu.Lock()
|
||||
state, ok := worker.states[record.GameID]
|
||||
if !ok {
|
||||
state = &inspectState{}
|
||||
worker.states[record.GameID] = state
|
||||
}
|
||||
prev := *state
|
||||
state.lastRestartCount = inspect.RestartCount
|
||||
state.seen = true
|
||||
worker.mu.Unlock()
|
||||
|
||||
emit := false
|
||||
switch {
|
||||
case prev.seen && inspect.RestartCount > prev.lastRestartCount:
|
||||
emit = true
|
||||
case inspect.Status != dockerStateRunning:
|
||||
emit = true
|
||||
case inspect.Health == dockerHealthUnhealthy:
|
||||
emit = true
|
||||
}
|
||||
if !emit {
|
||||
return
|
||||
}
|
||||
|
||||
worker.publish(ctx, ports.HealthEventEnvelope{
|
||||
GameID: record.GameID,
|
||||
ContainerID: record.CurrentContainerID,
|
||||
EventType: health.EventTypeInspectUnhealthy,
|
||||
OccurredAt: worker.clock().UTC(),
|
||||
Details: inspectUnhealthyDetails(inspect.RestartCount, inspect.Status, inspect.Health),
|
||||
})
|
||||
}
|
||||
|
||||
// publish emits one envelope through the configured publisher, updates
|
||||
// the telemetry counter, and logs the outcome. Failures degrade to a
|
||||
// warning log per `rtmanager/README.md §Notification Contracts`.
|
||||
func (worker *Worker) publish(ctx context.Context, envelope ports.HealthEventEnvelope) {
|
||||
if err := worker.healthEvents.Publish(ctx, envelope); err != nil {
|
||||
worker.logger.ErrorContext(ctx, "publish health event",
|
||||
"game_id", envelope.GameID,
|
||||
"container_id", envelope.ContainerID,
|
||||
"event_type", string(envelope.EventType),
|
||||
"err", err.Error(),
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
worker.telemetry.RecordHealthEvent(ctx, string(envelope.EventType))
|
||||
|
||||
logArgs := []any{
|
||||
"game_id", envelope.GameID,
|
||||
"container_id", envelope.ContainerID,
|
||||
"event_type", string(envelope.EventType),
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
worker.logger.InfoContext(ctx, "inspect event published", logArgs...)
|
||||
}
|
||||
|
||||
// inspectUnhealthyDetails builds the JSON payload required by the
// `inspect_unhealthy` AsyncAPI variant. All three fields are required
// even when their value is the zero value.
//
// The string parameters carry the raw Docker `State.Status` and
// `State.Health.Status` values; they are named containerState and
// healthStatus so they do not shadow the imported `health` package.
func inspectUnhealthyDetails(restartCount int, containerState, healthStatus string) json.RawMessage {
	payload := struct {
		RestartCount int    `json:"restart_count"`
		State        string `json:"state"`
		Health       string `json:"health"`
	}{
		RestartCount: restartCount,
		State:        containerState,
		Health:       healthStatus,
	}
	// Marshal cannot fail here: the payload is a fixed struct of int
	// and string fields, which always encode.
	encoded, _ := json.Marshal(payload)
	return encoded
}
|
||||
Reference in New Issue
Block a user