feat: runtime manager

@@ -0,0 +1,357 @@
// Package dockerevents subscribes to the Docker events stream and turns
// container-scoped events into entries on `runtime:health_events`.
//
// Three event kinds are emitted by this listener (per
// `rtmanager/README.md §Health Monitoring`):
//
// - `container_exited` from a `die` action with non-zero exit code;
// - `container_oom` from an `oom` action;
// - `container_disappeared` from a `destroy` action observed for a
// `runtime_records.status=running` row whose `current_container_id`
// still matches the destroyed container — i.e., a destroy that
// Runtime Manager did not initiate itself. Destroys triggered by
// RTM's own restart / cleanup flow either find the record already
// transitioned (status != running) or pointing at a different
// container id, and are therefore skipped.
//
// `container_started` is emitted by the start service and is not
// duplicated here. Graceful stop produces a `die` event with exit code
// `0`; that case is suppressed to honour the README guarantee that
// `container_exited` carries a non-zero exit.
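//
// Illustrative `details` payloads (shapes follow the builder helpers at the
// bottom of this file; the exit code 137 is an example value only):
//
//	container_exited      {"exit_code":137,"oom":false}
//	container_oom         {"exit_code":137}
//	container_disappeared {}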
//
// Design rationale, including the destroy-disambiguation rule and the
// reconnect policy, is captured in
// `rtmanager/docs/workers.md`.
package dockerevents
import (
"context"
"encoding/json"
"errors"
"fmt"
"log/slog"
"strings"
"time"
"galaxy/rtmanager/internal/domain/health"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/logging"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/startruntime"
"galaxy/rtmanager/internal/telemetry"
)
// Docker event actions consumed by the listener. Other actions are
// observed but ignored.
const (
actionDie = "die"
actionOOM = "oom"
actionDestroy = "destroy"
)
// defaultReconnectBackoff bounds the wait between two `EventsListen`
// reconnect attempts. Daemon hiccups in production are common; the
// listener never gives up while ctx is alive.
const defaultReconnectBackoff = 5 * time.Second
// Dependencies groups the collaborators required by Listener.
type Dependencies struct {
// Docker provides the EventsListen subscription used by Run.
Docker ports.DockerClient
// RuntimeRecords resolves `(game_id, container_id)` for destroy
// disambiguation.
RuntimeRecords ports.RuntimeRecordStore
// HealthEvents emits the entries produced by handleEvent. Failures
// are best-effort: the listener logs and continues.
HealthEvents ports.HealthEventPublisher
// Telemetry records one health-event counter increment per emission.
// Required.
Telemetry *telemetry.Runtime
// Clock supplies the wall-clock used as a fallback when a Docker
// event arrives without a timestamp. Defaults to `time.Now`.
Clock func() time.Time
// Logger receives structured worker-level events. Defaults to
// `slog.Default()` when nil.
Logger *slog.Logger
// ReconnectBackoff bounds the wait between reconnect attempts.
// Defaults to defaultReconnectBackoff when zero.
ReconnectBackoff time.Duration
}
// Listener consumes Docker container events and emits the matching
// `runtime:health_events` entries.
type Listener struct {
docker ports.DockerClient
runtimeRecords ports.RuntimeRecordStore
healthEvents ports.HealthEventPublisher
telemetry *telemetry.Runtime
clock func() time.Time
logger *slog.Logger
reconnectBackoff time.Duration
}
// NewListener constructs one Listener from deps.
func NewListener(deps Dependencies) (*Listener, error) {
switch {
case deps.Docker == nil:
return nil, errors.New("new docker events listener: nil docker client")
case deps.RuntimeRecords == nil:
return nil, errors.New("new docker events listener: nil runtime records store")
case deps.HealthEvents == nil:
return nil, errors.New("new docker events listener: nil health events publisher")
case deps.Telemetry == nil:
return nil, errors.New("new docker events listener: nil telemetry runtime")
}
clock := deps.Clock
if clock == nil {
clock = time.Now
}
logger := deps.Logger
if logger == nil {
logger = slog.Default()
}
backoff := deps.ReconnectBackoff
if backoff <= 0 {
backoff = defaultReconnectBackoff
}
return &Listener{
docker: deps.Docker,
runtimeRecords: deps.RuntimeRecords,
healthEvents: deps.HealthEvents,
telemetry: deps.Telemetry,
clock: clock,
logger: logger.With("worker", "rtmanager.dockerevents"),
reconnectBackoff: backoff,
}, nil
}
// Run drives the events subscription. The outer loop reconnects after a
// Docker subscription error with a fixed backoff; only `ctx`
// cancellation terminates Run.
func (listener *Listener) Run(ctx context.Context) error {
if listener == nil {
return errors.New("run docker events listener: nil listener")
}
if ctx == nil {
return errors.New("run docker events listener: nil context")
}
if err := ctx.Err(); err != nil {
return err
}
listener.logger.Info("docker events listener started",
"reconnect_backoff", listener.reconnectBackoff.String(),
)
defer listener.logger.Info("docker events listener stopped")
for {
if err := ctx.Err(); err != nil {
return err
}
err := listener.runOnce(ctx)
if err == nil || errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
if ctxErr := ctx.Err(); ctxErr != nil {
return ctxErr
}
}
if err != nil && !errors.Is(err, context.Canceled) && !errors.Is(err, context.DeadlineExceeded) {
listener.logger.WarnContext(ctx, "docker events subscription dropped, will reconnect",
"err", err.Error(),
"backoff", listener.reconnectBackoff.String(),
)
}
if waitErr := listener.sleep(ctx); waitErr != nil {
return waitErr
}
}
}
// Shutdown is a no-op; Run terminates on context cancellation.
func (listener *Listener) Shutdown(ctx context.Context) error {
if ctx == nil {
return errors.New("shutdown docker events listener: nil context")
}
return nil
}
// runOnce subscribes once and processes events until the subscription
// reports an error or ctx is cancelled.
func (listener *Listener) runOnce(ctx context.Context) error {
events, errs, err := listener.docker.EventsListen(ctx)
if err != nil {
return fmt.Errorf("subscribe docker events: %w", err)
}
for {
select {
case <-ctx.Done():
return ctx.Err()
case event, ok := <-events:
if !ok {
return errors.New("docker events channel closed")
}
listener.handleEvent(ctx, event)
case subscribeErr, ok := <-errs:
if !ok {
return errors.New("docker errors channel closed")
}
if subscribeErr == nil {
continue
}
return subscribeErr
}
}
}
// sleep waits reconnectBackoff or until ctx is cancelled.
func (listener *Listener) sleep(ctx context.Context) error {
timer := time.NewTimer(listener.reconnectBackoff)
defer timer.Stop()
select {
case <-ctx.Done():
return ctx.Err()
case <-timer.C:
return nil
}
}
// handleEvent translates one Docker event into a health-events emission
// (if any). All branches are exercised by tests.
func (listener *Listener) handleEvent(ctx context.Context, event ports.DockerEvent) {
gameID := strings.TrimSpace(event.Labels[startruntime.LabelGameID])
if gameID == "" {
return
}
occurredAt := event.OccurredAt
if occurredAt.IsZero() {
occurredAt = listener.clock()
}
occurredAt = occurredAt.UTC()
switch event.Action {
case actionDie:
if event.ExitCode == 0 {
return
}
listener.publish(ctx, ports.HealthEventEnvelope{
GameID: gameID,
ContainerID: event.ContainerID,
EventType: health.EventTypeContainerExited,
OccurredAt: occurredAt,
Details: containerExitedDetails(event.ExitCode, false),
})
case actionOOM:
listener.publish(ctx, ports.HealthEventEnvelope{
GameID: gameID,
ContainerID: event.ContainerID,
EventType: health.EventTypeContainerOOM,
OccurredAt: occurredAt,
Details: containerOOMDetails(event.ExitCode),
})
case actionDestroy:
if !listener.isUnexpectedDestroy(ctx, gameID, event.ContainerID) {
return
}
listener.publish(ctx, ports.HealthEventEnvelope{
GameID: gameID,
ContainerID: event.ContainerID,
EventType: health.EventTypeContainerDisappeared,
OccurredAt: occurredAt,
Details: containerDisappearedDetails(),
})
default:
return
}
}
// isUnexpectedDestroy returns true when the destroy event came from a
// source other than Runtime Manager itself. The check is conservative:
// any read error treats the destroy as expected (we cannot tell), and
// only a record currently `running` whose `current_container_id` still
// equals the destroyed id is considered unexpected.
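//
// Decision summary (record fetched by game_id):
//
//	no record / lookup error                  -> expected, skip
//	status != running                         -> expected, skip
//	current_container_id != destroyed id      -> expected, skip
//	running and matching current_container_id -> unexpected, emit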
func (listener *Listener) isUnexpectedDestroy(ctx context.Context, gameID, containerID string) bool {
record, err := listener.runtimeRecords.Get(ctx, gameID)
switch {
case errors.Is(err, runtime.ErrNotFound):
return false
case err != nil:
listener.logger.WarnContext(ctx, "destroy lookup failed; suppressing emission",
"game_id", gameID,
"container_id", containerID,
"err", err.Error(),
)
return false
}
if record.Status != runtime.StatusRunning {
return false
}
if record.CurrentContainerID != containerID {
return false
}
return true
}
// publish emits one envelope through the configured publisher, updates
// the telemetry counter, and logs the outcome. Emission is best-effort:
// a publish error is logged at error level and the event is dropped.
func (listener *Listener) publish(ctx context.Context, envelope ports.HealthEventEnvelope) {
if err := listener.healthEvents.Publish(ctx, envelope); err != nil {
listener.logger.ErrorContext(ctx, "publish health event",
"game_id", envelope.GameID,
"container_id", envelope.ContainerID,
"event_type", string(envelope.EventType),
"err", err.Error(),
)
return
}
listener.telemetry.RecordHealthEvent(ctx, string(envelope.EventType))
logArgs := []any{
"game_id", envelope.GameID,
"container_id", envelope.ContainerID,
"event_type", string(envelope.EventType),
}
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
listener.logger.InfoContext(ctx, "docker event published", logArgs...)
}
// containerExitedDetails builds the JSON payload required by the
// `container_exited` AsyncAPI variant.
func containerExitedDetails(exitCode int, oom bool) json.RawMessage {
payload := struct {
ExitCode int `json:"exit_code"`
OOM bool `json:"oom"`
}{ExitCode: exitCode, OOM: oom}
encoded, _ := json.Marshal(payload)
return encoded
}
// containerOOMDetails builds the JSON payload required by the
// `container_oom` AsyncAPI variant.
func containerOOMDetails(exitCode int) json.RawMessage {
payload := struct {
ExitCode int `json:"exit_code"`
}{ExitCode: exitCode}
encoded, _ := json.Marshal(payload)
return encoded
}
// containerDisappearedDetails builds the empty JSON object the
// `container_disappeared` AsyncAPI variant requires.
func containerDisappearedDetails() json.RawMessage {
return json.RawMessage(`{}`)
}
@@ -0,0 +1,584 @@
package dockerevents_test
import (
"context"
"encoding/json"
"errors"
"io"
"log/slog"
"sync"
"sync/atomic"
"testing"
"time"
"galaxy/rtmanager/internal/domain/health"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/startruntime"
"galaxy/rtmanager/internal/telemetry"
"galaxy/rtmanager/internal/worker/dockerevents"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func silentLogger() *slog.Logger {
return slog.New(slog.NewTextHandler(io.Discard, nil))
}
// fakeDockerEvents is a minimal ports.DockerClient implementation for
// the listener: only EventsListen is exercised. Tests push events and
// errors through sendEvent/sendErr and observe reconnect attempts via
// the subscription counter.
type fakeDockerEvents struct {
mu sync.Mutex
subscribeCount int32
subscribeErr error
currentEventsCh chan ports.DockerEvent
currentErrsCh chan error
subscribed chan struct{}
}
func newFakeDockerEvents() *fakeDockerEvents {
return &fakeDockerEvents{subscribed: make(chan struct{}, 16)}
}
func (f *fakeDockerEvents) EventsListen(ctx context.Context) (<-chan ports.DockerEvent, <-chan error, error) {
atomic.AddInt32(&f.subscribeCount, 1)
f.mu.Lock()
if f.subscribeErr != nil {
err := f.subscribeErr
f.mu.Unlock()
return nil, nil, err
}
events := make(chan ports.DockerEvent, 16)
errs := make(chan error, 1)
f.currentEventsCh = events
f.currentErrsCh = errs
f.mu.Unlock()
select {
case f.subscribed <- struct{}{}:
default:
}
go func() {
<-ctx.Done()
f.mu.Lock()
if f.currentEventsCh == events {
close(events)
close(errs)
f.currentEventsCh = nil
f.currentErrsCh = nil
}
f.mu.Unlock()
}()
return events, errs, nil
}
func (f *fakeDockerEvents) sendEvent(event ports.DockerEvent) {
f.mu.Lock()
ch := f.currentEventsCh
f.mu.Unlock()
if ch != nil {
ch <- event
}
}
func (f *fakeDockerEvents) sendErr(err error) {
f.mu.Lock()
ch := f.currentErrsCh
f.mu.Unlock()
if ch != nil {
ch <- err
}
}
func (f *fakeDockerEvents) waitSubscribed(t *testing.T) {
t.Helper()
select {
case <-f.subscribed:
case <-time.After(time.Second):
t.Fatalf("timed out waiting for EventsListen subscription")
}
}
func (f *fakeDockerEvents) subscriptions() int {
return int(atomic.LoadInt32(&f.subscribeCount))
}
// Unused DockerClient methods. The listener only consumes EventsListen.
func (f *fakeDockerEvents) EnsureNetwork(_ context.Context, _ string) error { return nil }
func (f *fakeDockerEvents) PullImage(_ context.Context, _ string, _ ports.PullPolicy) error {
return nil
}
func (f *fakeDockerEvents) InspectImage(_ context.Context, _ string) (ports.ImageInspect, error) {
return ports.ImageInspect{}, nil
}
func (f *fakeDockerEvents) InspectContainer(_ context.Context, _ string) (ports.ContainerInspect, error) {
return ports.ContainerInspect{}, nil
}
func (f *fakeDockerEvents) Run(_ context.Context, _ ports.RunSpec) (ports.RunResult, error) {
return ports.RunResult{}, nil
}
func (f *fakeDockerEvents) Stop(_ context.Context, _ string, _ time.Duration) error { return nil }
func (f *fakeDockerEvents) Remove(_ context.Context, _ string) error { return nil }
func (f *fakeDockerEvents) List(_ context.Context, _ ports.ListFilter) ([]ports.ContainerSummary, error) {
return nil, nil
}
// fakeRuntimeRecords supports Get only; the listener does not call any
// other method. Tests seed records via Set.
type fakeRuntimeRecords struct {
mu sync.Mutex
stored map[string]runtime.RuntimeRecord
getErr error
}
func newFakeRuntimeRecords() *fakeRuntimeRecords {
return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}}
}
func (s *fakeRuntimeRecords) Set(record runtime.RuntimeRecord) {
s.mu.Lock()
defer s.mu.Unlock()
s.stored[record.GameID] = record
}
func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
s.mu.Lock()
defer s.mu.Unlock()
if s.getErr != nil {
return runtime.RuntimeRecord{}, s.getErr
}
record, ok := s.stored[gameID]
if !ok {
return runtime.RuntimeRecord{}, runtime.ErrNotFound
}
return record, nil
}
func (s *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error { return nil }
func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error {
return nil
}
func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) { return nil, nil }
func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) {
return nil, nil
}
// fakeHealthEvents captures every Publish call.
type fakeHealthEvents struct {
mu sync.Mutex
published []ports.HealthEventEnvelope
publishErr error
}
func (s *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error {
s.mu.Lock()
defer s.mu.Unlock()
if s.publishErr != nil {
return s.publishErr
}
s.published = append(s.published, envelope)
return nil
}
func (s *fakeHealthEvents) Published() []ports.HealthEventEnvelope {
s.mu.Lock()
defer s.mu.Unlock()
out := make([]ports.HealthEventEnvelope, len(s.published))
copy(out, s.published)
return out
}
// --- harness ----------------------------------------------------------
type harness struct {
docker *fakeDockerEvents
records *fakeRuntimeRecords
health *fakeHealthEvents
listener *dockerevents.Listener
clockNow time.Time
}
func newHarness(t *testing.T) *harness {
t.Helper()
telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
require.NoError(t, err)
docker := newFakeDockerEvents()
records := newFakeRuntimeRecords()
healthEvents := &fakeHealthEvents{}
clockNow := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)
listener, err := dockerevents.NewListener(dockerevents.Dependencies{
Docker: docker,
RuntimeRecords: records,
HealthEvents: healthEvents,
Telemetry: telemetryRuntime,
Clock: func() time.Time { return clockNow },
Logger: silentLogger(),
ReconnectBackoff: 5 * time.Millisecond,
})
require.NoError(t, err)
return &harness{
docker: docker,
records: records,
health: healthEvents,
listener: listener,
clockNow: clockNow,
}
}
// --- constructor -------------------------------------------------------
func TestNewListenerRejectsMissingDeps(t *testing.T) {
telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
require.NoError(t, err)
cases := []dockerevents.Dependencies{
{},
{Docker: newFakeDockerEvents()},
{Docker: newFakeDockerEvents(), RuntimeRecords: newFakeRuntimeRecords()},
{Docker: newFakeDockerEvents(), RuntimeRecords: newFakeRuntimeRecords(), HealthEvents: &fakeHealthEvents{}},
}
for index, deps := range cases {
_, err := dockerevents.NewListener(deps)
require.Errorf(t, err, "case %d should fail", index)
}
_, err = dockerevents.NewListener(dockerevents.Dependencies{
Docker: newFakeDockerEvents(),
RuntimeRecords: newFakeRuntimeRecords(),
HealthEvents: &fakeHealthEvents{},
Telemetry: telemetryRuntime,
})
require.NoError(t, err)
}
// --- Run lifecycle -----------------------------------------------------
func TestRunPublishesContainerExitedOnNonZeroDie(t *testing.T) {
h := newHarness(t)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
done := runListener(ctx, h)
h.docker.waitSubscribed(t)
occurredAt := h.clockNow.Add(-time.Minute)
h.docker.sendEvent(ports.DockerEvent{
Action: "die",
ContainerID: "ctr-die",
Labels: map[string]string{startruntime.LabelGameID: "game-die"},
ExitCode: 137,
OccurredAt: occurredAt,
})
require.Eventually(t, func() bool { return len(h.health.Published()) == 1 }, time.Second, 5*time.Millisecond)
envelopes := h.health.Published()
require.Len(t, envelopes, 1)
envelope := envelopes[0]
assert.Equal(t, "game-die", envelope.GameID)
assert.Equal(t, "ctr-die", envelope.ContainerID)
assert.Equal(t, health.EventTypeContainerExited, envelope.EventType)
assert.True(t, envelope.OccurredAt.Equal(occurredAt.UTC()))
assertJSONEqual(t, `{"exit_code":137,"oom":false}`, envelope.Details)
cancel()
waitDone(t, done)
}
func TestRunSkipsZeroExitDie(t *testing.T) {
h := newHarness(t)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
done := runListener(ctx, h)
h.docker.waitSubscribed(t)
h.docker.sendEvent(ports.DockerEvent{
Action: "die",
ContainerID: "ctr-graceful",
Labels: map[string]string{startruntime.LabelGameID: "game-graceful"},
ExitCode: 0,
OccurredAt: h.clockNow,
})
time.Sleep(20 * time.Millisecond)
assert.Empty(t, h.health.Published(), "graceful exit must not emit container_exited")
cancel()
waitDone(t, done)
}
func TestRunPublishesContainerOOM(t *testing.T) {
h := newHarness(t)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
done := runListener(ctx, h)
h.docker.waitSubscribed(t)
h.docker.sendEvent(ports.DockerEvent{
Action: "oom",
ContainerID: "ctr-oom",
Labels: map[string]string{startruntime.LabelGameID: "game-oom"},
ExitCode: 137,
OccurredAt: h.clockNow,
})
require.Eventually(t, func() bool { return len(h.health.Published()) == 1 }, time.Second, 5*time.Millisecond)
envelope := h.health.Published()[0]
assert.Equal(t, health.EventTypeContainerOOM, envelope.EventType)
assertJSONEqual(t, `{"exit_code":137}`, envelope.Details)
cancel()
waitDone(t, done)
}
func TestRunDestroyEmitsDisappearedOnlyForRunningRecordWithMatchingContainer(t *testing.T) {
h := newHarness(t)
startedAt := h.clockNow.Add(-time.Hour)
h.records.Set(runtime.RuntimeRecord{
GameID: "game-d",
Status: runtime.StatusRunning,
CurrentContainerID: "ctr-current",
CurrentImageRef: "galaxy/game:1.0.0",
EngineEndpoint: "http://galaxy-game-game-d:8080",
StatePath: "/var/lib/galaxy/games/game-d",
DockerNetwork: "galaxy-net",
StartedAt: &startedAt,
LastOpAt: h.clockNow,
CreatedAt: startedAt,
})
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
done := runListener(ctx, h)
h.docker.waitSubscribed(t)
// Matching destroy → emit.
h.docker.sendEvent(ports.DockerEvent{
Action: "destroy",
ContainerID: "ctr-current",
Labels: map[string]string{startruntime.LabelGameID: "game-d"},
OccurredAt: h.clockNow,
})
require.Eventually(t, func() bool { return len(h.health.Published()) == 1 }, time.Second, 5*time.Millisecond)
envelope := h.health.Published()[0]
assert.Equal(t, health.EventTypeContainerDisappeared, envelope.EventType)
assertJSONEqual(t, `{}`, envelope.Details)
// Non-matching container id → skip.
h.docker.sendEvent(ports.DockerEvent{
Action: "destroy",
ContainerID: "ctr-old",
Labels: map[string]string{startruntime.LabelGameID: "game-d"},
OccurredAt: h.clockNow,
})
time.Sleep(20 * time.Millisecond)
assert.Len(t, h.health.Published(), 1, "destroy on outdated container_id must not emit again")
cancel()
waitDone(t, done)
}
func TestRunDestroySkipsNonRunningRecord(t *testing.T) {
h := newHarness(t)
startedAt := h.clockNow.Add(-time.Hour)
stoppedAt := h.clockNow.Add(-time.Minute)
h.records.Set(runtime.RuntimeRecord{
GameID: "game-stopped",
Status: runtime.StatusStopped,
CurrentContainerID: "ctr-stopped",
CurrentImageRef: "galaxy/game:1.0.0",
EngineEndpoint: "http://galaxy-game-game-stopped:8080",
StatePath: "/var/lib/galaxy/games/game-stopped",
DockerNetwork: "galaxy-net",
StartedAt: &startedAt,
StoppedAt: &stoppedAt,
LastOpAt: stoppedAt,
CreatedAt: startedAt,
})
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
done := runListener(ctx, h)
h.docker.waitSubscribed(t)
h.docker.sendEvent(ports.DockerEvent{
Action: "destroy",
ContainerID: "ctr-stopped",
Labels: map[string]string{startruntime.LabelGameID: "game-stopped"},
OccurredAt: h.clockNow,
})
time.Sleep(20 * time.Millisecond)
assert.Empty(t, h.health.Published(), "destroy on non-running record must not emit")
cancel()
waitDone(t, done)
}
func TestRunDestroySkipsUnknownGame(t *testing.T) {
h := newHarness(t)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
done := runListener(ctx, h)
h.docker.waitSubscribed(t)
h.docker.sendEvent(ports.DockerEvent{
Action: "destroy",
ContainerID: "ctr-unknown",
Labels: map[string]string{startruntime.LabelGameID: "game-unknown"},
OccurredAt: h.clockNow,
})
time.Sleep(20 * time.Millisecond)
assert.Empty(t, h.health.Published(), "destroy with no record must not emit")
cancel()
waitDone(t, done)
}
func TestRunSkipsEventsWithoutGameIDLabel(t *testing.T) {
h := newHarness(t)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
done := runListener(ctx, h)
h.docker.waitSubscribed(t)
h.docker.sendEvent(ports.DockerEvent{
Action: "die",
ContainerID: "ctr-foreign",
Labels: map[string]string{},
ExitCode: 1,
OccurredAt: h.clockNow,
})
time.Sleep(20 * time.Millisecond)
assert.Empty(t, h.health.Published(), "events without game_id label must not emit")
cancel()
waitDone(t, done)
}
func TestRunSkipsUnrelatedActions(t *testing.T) {
h := newHarness(t)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
done := runListener(ctx, h)
h.docker.waitSubscribed(t)
for _, action := range []string{"start", "kill", "pause", "create"} {
h.docker.sendEvent(ports.DockerEvent{
Action: action,
ContainerID: "ctr-x",
Labels: map[string]string{startruntime.LabelGameID: "game-x"},
OccurredAt: h.clockNow,
})
}
time.Sleep(20 * time.Millisecond)
assert.Empty(t, h.health.Published(), "non-die/oom/destroy actions must not emit")
cancel()
waitDone(t, done)
}
func TestRunReconnectsAfterSubscriptionError(t *testing.T) {
h := newHarness(t)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
done := runListener(ctx, h)
h.docker.waitSubscribed(t)
h.docker.sendErr(errors.New("connection reset"))
h.docker.waitSubscribed(t)
// Send an event after reconnect to confirm pipeline resumed.
h.docker.sendEvent(ports.DockerEvent{
Action: "die",
ContainerID: "ctr-after",
Labels: map[string]string{startruntime.LabelGameID: "game-after"},
ExitCode: 1,
OccurredAt: h.clockNow,
})
require.Eventually(t, func() bool { return len(h.health.Published()) == 1 }, time.Second, 5*time.Millisecond)
assert.GreaterOrEqual(t, h.docker.subscriptions(), 2, "listener must reconnect after error")
cancel()
waitDone(t, done)
}
func TestRunFillsOccurredAtWhenZero(t *testing.T) {
h := newHarness(t)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
done := runListener(ctx, h)
h.docker.waitSubscribed(t)
h.docker.sendEvent(ports.DockerEvent{
Action: "oom",
ContainerID: "ctr-time",
Labels: map[string]string{startruntime.LabelGameID: "game-time"},
ExitCode: 137,
})
require.Eventually(t, func() bool { return len(h.health.Published()) == 1 }, time.Second, 5*time.Millisecond)
envelope := h.health.Published()[0]
assert.True(t, envelope.OccurredAt.Equal(h.clockNow.UTC()))
cancel()
waitDone(t, done)
}
// --- helpers -----------------------------------------------------------
func runListener(ctx context.Context, h *harness) chan error {
done := make(chan error, 1)
go func() { done <- h.listener.Run(ctx) }()
return done
}
func waitDone(t *testing.T, done chan error) {
t.Helper()
select {
case <-done:
case <-time.After(time.Second):
t.Fatalf("Run did not exit within timeout")
}
}
func assertJSONEqual(t *testing.T, want string, got json.RawMessage) {
t.Helper()
var wantValue, gotValue any
require.NoError(t, json.Unmarshal([]byte(want), &wantValue))
require.NoError(t, json.Unmarshal(got, &gotValue))
assert.Equal(t, wantValue, gotValue)
}
// --- shutdown ----------------------------------------------------------
func TestShutdownIsNoOp(t *testing.T) {
h := newHarness(t)
require.NoError(t, h.listener.Shutdown(context.Background()))
}
// --- compile-time safety ----------------------------------------------
var (
_ ports.DockerClient = (*fakeDockerEvents)(nil)
_ ports.RuntimeRecordStore = (*fakeRuntimeRecords)(nil)
_ ports.HealthEventPublisher = (*fakeHealthEvents)(nil)
)
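// --- example wiring ----------------------------------------------------
// ExampleNewListener is a minimal wiring sketch built on the fakes above. It
// is illustrative only: production wiring (not part of this diff) would pass
// the concrete Docker client, runtime record store, and health-events
// publisher instead of the fakes.
func ExampleNewListener() {
	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	if err != nil {
		panic(err)
	}
	listener, err := dockerevents.NewListener(dockerevents.Dependencies{
		Docker:         newFakeDockerEvents(),
		RuntimeRecords: newFakeRuntimeRecords(),
		HealthEvents:   &fakeHealthEvents{},
		Telemetry:      telemetryRuntime,
		// Clock, Logger, and ReconnectBackoff are optional; they default to
		// time.Now, slog.Default(), and 5s respectively.
	})
	if err != nil {
		panic(err)
	}
	// Run blocks until ctx is done; cancel immediately so the example returns.
	ctx, cancel := context.WithCancel(context.Background())
	cancel()
	_ = listener.Run(ctx)
}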