feat: runtime manager
@@ -0,0 +1,357 @@
// Package dockerevents subscribes to the Docker events stream and turns
// container-scoped events into entries on `runtime:health_events`.
//
// Three event kinds are emitted by this listener (per
// `rtmanager/README.md §Health Monitoring`):
//
//   - `container_exited` from a `die` action with non-zero exit code;
//   - `container_oom` from an `oom` action;
//   - `container_disappeared` from a `destroy` action observed for a
//     `runtime_records.status=running` row whose `current_container_id`
//     still matches the destroyed container — i.e., a destroy that
//     Runtime Manager did not initiate itself. Destroys triggered by
//     RTM's own restart / cleanup flow either find the record already
//     transitioned (status != running) or pointing at a different
//     container id, and are therefore skipped.
//
// `container_started` is emitted by the start service and is not
// duplicated here. Graceful stop produces a `die` event with exit code
// `0`; that case is suppressed to honour the README guarantee that
// `container_exited` carries a non-zero exit.
//
// Design rationale, including the destroy-disambiguation rule and the
// reconnect policy, is captured in
// `rtmanager/docs/workers.md`.
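//
// For orientation, one emitted entry carries the following fields
// (illustrative shape only; the concrete stream encoding is owned by
// the HealthEvents publisher adapter, not by this package):
//
//	game_id:      "game-1"
//	container_id: "abc123"
//	event_type:   "container_exited"
//	occurred_at:  2026-04-27T12:00:00Z (always UTC)
//	details:      {"exit_code":137,"oom":false}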
package dockerevents

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"log/slog"
	"strings"
	"time"

	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/logging"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/startruntime"
	"galaxy/rtmanager/internal/telemetry"
)

// Docker event actions consumed by the listener. Other actions are
// observed but ignored.
const (
	actionDie     = "die"
	actionOOM     = "oom"
	actionDestroy = "destroy"
)

// defaultReconnectBackoff bounds the wait between two `EventsListen`
// reconnect attempts. Daemon hiccups in production are common; the
// listener never gives up while ctx is alive.
const defaultReconnectBackoff = 5 * time.Second

// Dependencies groups the collaborators required by Listener.
type Dependencies struct {
	// Docker provides the EventsListen subscription used by Run.
	Docker ports.DockerClient

	// RuntimeRecords resolves `(game_id, container_id)` for destroy
	// disambiguation.
	RuntimeRecords ports.RuntimeRecordStore

	// HealthEvents emits the entries produced by handleEvent. Failures
	// are best-effort: the listener logs and continues.
	HealthEvents ports.HealthEventPublisher

	// Telemetry records one health-event counter increment per emission.
	// Required.
	Telemetry *telemetry.Runtime

	// Clock supplies the wall-clock used as a fallback when a Docker
	// event arrives without a timestamp. Defaults to `time.Now`.
	Clock func() time.Time

	// Logger receives structured worker-level events. Defaults to
	// `slog.Default()` when nil.
	Logger *slog.Logger

	// ReconnectBackoff bounds the wait between reconnect attempts.
	// Defaults to defaultReconnectBackoff when zero.
	ReconnectBackoff time.Duration
}

// Listener consumes Docker container events and emits the matching
// `runtime:health_events` entries.
type Listener struct {
	docker         ports.DockerClient
	runtimeRecords ports.RuntimeRecordStore
	healthEvents   ports.HealthEventPublisher
	telemetry      *telemetry.Runtime
	clock          func() time.Time
	logger         *slog.Logger

	reconnectBackoff time.Duration
}

// NewListener constructs one Listener from deps.
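//
// Typical wiring, as a sketch (the adapter names below are
// placeholders; any implementations of the `ports` interfaces work):
//
//	listener, err := dockerevents.NewListener(dockerevents.Dependencies{
//		Docker:         dockerClient,     // ports.DockerClient
//		RuntimeRecords: recordStore,      // ports.RuntimeRecordStore
//		HealthEvents:   healthPublisher,  // ports.HealthEventPublisher
//		Telemetry:      telemetryRuntime, // *telemetry.Runtime
//	})
//	if err != nil {
//		return err
//	}
//	go func() { _ = listener.Run(ctx) }()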
func NewListener(deps Dependencies) (*Listener, error) {
	switch {
	case deps.Docker == nil:
		return nil, errors.New("new docker events listener: nil docker client")
	case deps.RuntimeRecords == nil:
		return nil, errors.New("new docker events listener: nil runtime records store")
	case deps.HealthEvents == nil:
		return nil, errors.New("new docker events listener: nil health events publisher")
	case deps.Telemetry == nil:
		return nil, errors.New("new docker events listener: nil telemetry runtime")
	}

	clock := deps.Clock
	if clock == nil {
		clock = time.Now
	}
	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}
	backoff := deps.ReconnectBackoff
	if backoff <= 0 {
		backoff = defaultReconnectBackoff
	}

	return &Listener{
		docker:           deps.Docker,
		runtimeRecords:   deps.RuntimeRecords,
		healthEvents:     deps.HealthEvents,
		telemetry:        deps.Telemetry,
		clock:            clock,
		logger:           logger.With("worker", "rtmanager.dockerevents"),
		reconnectBackoff: backoff,
	}, nil
}

// Run drives the events subscription. The outer loop reconnects after a
// Docker subscription error with a fixed backoff; only `ctx`
// cancellation terminates Run.
func (listener *Listener) Run(ctx context.Context) error {
	if listener == nil {
		return errors.New("run docker events listener: nil listener")
	}
	if ctx == nil {
		return errors.New("run docker events listener: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}

	listener.logger.Info("docker events listener started",
		"reconnect_backoff", listener.reconnectBackoff.String(),
	)
	defer listener.logger.Info("docker events listener stopped")

	for {
		if err := ctx.Err(); err != nil {
			return err
		}

		err := listener.runOnce(ctx)
		if err == nil || errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
			if ctxErr := ctx.Err(); ctxErr != nil {
				return ctxErr
			}
		}
		if err != nil && !errors.Is(err, context.Canceled) && !errors.Is(err, context.DeadlineExceeded) {
			listener.logger.WarnContext(ctx, "docker events subscription dropped, will reconnect",
				"err", err.Error(),
				"backoff", listener.reconnectBackoff.String(),
			)
		}

		if waitErr := listener.sleep(ctx); waitErr != nil {
			return waitErr
		}
	}
}

// Shutdown is a no-op; Run terminates on context cancellation.
func (listener *Listener) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown docker events listener: nil context")
	}
	return nil
}

// runOnce subscribes once and processes events until the subscription
// reports an error or ctx is cancelled.
func (listener *Listener) runOnce(ctx context.Context) error {
	events, errs, err := listener.docker.EventsListen(ctx)
	if err != nil {
		return fmt.Errorf("subscribe docker events: %w", err)
	}

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case event, ok := <-events:
			if !ok {
				return errors.New("docker events channel closed")
			}
			listener.handleEvent(ctx, event)
		case subscribeErr, ok := <-errs:
			if !ok {
				return errors.New("docker errors channel closed")
			}
			if subscribeErr == nil {
				continue
			}
			return subscribeErr
		}
	}
}

// sleep waits reconnectBackoff or until ctx is cancelled.
func (listener *Listener) sleep(ctx context.Context) error {
	timer := time.NewTimer(listener.reconnectBackoff)
	defer timer.Stop()

	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-timer.C:
		return nil
	}
}

// handleEvent translates one Docker event into a health-events emission
// (if any). All branches are exercised by tests.
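//
// Action → emission, summarising the switch below:
//
//	die, exit != 0 → container_exited      details {"exit_code":N,"oom":false}
//	die, exit == 0 → (suppressed)
//	oom            → container_oom         details {"exit_code":N}
//	destroy        → container_disappeared details {} (only when unexpected)
//	anything else  → (ignored)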
func (listener *Listener) handleEvent(ctx context.Context, event ports.DockerEvent) {
	gameID := strings.TrimSpace(event.Labels[startruntime.LabelGameID])
	if gameID == "" {
		return
	}

	occurredAt := event.OccurredAt
	if occurredAt.IsZero() {
		occurredAt = listener.clock()
	}
	occurredAt = occurredAt.UTC()

	switch event.Action {
	case actionDie:
		if event.ExitCode == 0 {
			return
		}
		listener.publish(ctx, ports.HealthEventEnvelope{
			GameID:      gameID,
			ContainerID: event.ContainerID,
			EventType:   health.EventTypeContainerExited,
			OccurredAt:  occurredAt,
			Details:     containerExitedDetails(event.ExitCode, false),
		})
	case actionOOM:
		listener.publish(ctx, ports.HealthEventEnvelope{
			GameID:      gameID,
			ContainerID: event.ContainerID,
			EventType:   health.EventTypeContainerOOM,
			OccurredAt:  occurredAt,
			Details:     containerOOMDetails(event.ExitCode),
		})
	case actionDestroy:
		if !listener.isUnexpectedDestroy(ctx, gameID, event.ContainerID) {
			return
		}
		listener.publish(ctx, ports.HealthEventEnvelope{
			GameID:      gameID,
			ContainerID: event.ContainerID,
			EventType:   health.EventTypeContainerDisappeared,
			OccurredAt:  occurredAt,
			Details:     containerDisappearedDetails(),
		})
	default:
		return
	}
}

// isUnexpectedDestroy returns true when the destroy event came from a
// source other than Runtime Manager itself. The check is conservative:
// any read error treats the destroy as expected (we cannot tell), and
// only a record currently `running` whose `current_container_id` still
// equals the destroyed id is considered unexpected.
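//
// Outcome by record state (summary of the checks below):
//
//	lookup result        status      current_container_id   verdict
//	runtime.ErrNotFound  -           -                      expected (skip)
//	other error          -           -                      expected (skip)
//	found                != running  -                      expected (skip)
//	found                running     != destroyed id        expected (skip)
//	found                running     == destroyed id        unexpected (emit)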
func (listener *Listener) isUnexpectedDestroy(ctx context.Context, gameID, containerID string) bool {
	record, err := listener.runtimeRecords.Get(ctx, gameID)
	switch {
	case errors.Is(err, runtime.ErrNotFound):
		return false
	case err != nil:
		listener.logger.WarnContext(ctx, "destroy lookup failed; suppressing emission",
			"game_id", gameID,
			"container_id", containerID,
			"err", err.Error(),
		)
		return false
	}
	if record.Status != runtime.StatusRunning {
		return false
	}
	if record.CurrentContainerID != containerID {
		return false
	}
	return true
}

// publish emits one envelope through the configured publisher, updates
// the telemetry counter, and logs the outcome. All side effects are
// best-effort; a publish error degrades to an error log.
func (listener *Listener) publish(ctx context.Context, envelope ports.HealthEventEnvelope) {
	if err := listener.healthEvents.Publish(ctx, envelope); err != nil {
		listener.logger.ErrorContext(ctx, "publish health event",
			"game_id", envelope.GameID,
			"container_id", envelope.ContainerID,
			"event_type", string(envelope.EventType),
			"err", err.Error(),
		)
		return
	}

	listener.telemetry.RecordHealthEvent(ctx, string(envelope.EventType))

	logArgs := []any{
		"game_id", envelope.GameID,
		"container_id", envelope.ContainerID,
		"event_type", string(envelope.EventType),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	listener.logger.InfoContext(ctx, "docker event published", logArgs...)
}

// containerExitedDetails builds the JSON payload required by the
// `container_exited` AsyncAPI variant.
func containerExitedDetails(exitCode int, oom bool) json.RawMessage {
	payload := struct {
		ExitCode int  `json:"exit_code"`
		OOM      bool `json:"oom"`
	}{ExitCode: exitCode, OOM: oom}
	encoded, _ := json.Marshal(payload)
	return encoded
}

// containerOOMDetails builds the JSON payload required by the
// `container_oom` AsyncAPI variant.
func containerOOMDetails(exitCode int) json.RawMessage {
	payload := struct {
		ExitCode int `json:"exit_code"`
	}{ExitCode: exitCode}
	encoded, _ := json.Marshal(payload)
	return encoded
}

// containerDisappearedDetails builds the empty JSON object the
// `container_disappeared` AsyncAPI variant requires.
func containerDisappearedDetails() json.RawMessage {
	return json.RawMessage(`{}`)
}

@@ -0,0 +1,584 @@
package dockerevents_test

import (
	"context"
	"encoding/json"
	"errors"
	"io"
	"log/slog"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/startruntime"
	"galaxy/rtmanager/internal/telemetry"
	"galaxy/rtmanager/internal/worker/dockerevents"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func silentLogger() *slog.Logger {
	return slog.New(slog.NewTextHandler(io.Discard, nil))
}

// fakeDockerEvents is a minimal ports.DockerClient implementation for
// the listener: only EventsListen is exercised. Tests push events
// through the currently subscribed events channel and observe
// reconnect attempts via the subscription counter.
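//
// Usage sketch (all helpers are defined below):
//
//	docker := newFakeDockerEvents()
//	// ...start the listener, then wait for the subscription:
//	docker.waitSubscribed(t)
//	docker.sendEvent(ports.DockerEvent{Action: "die", ExitCode: 1})
//	docker.sendErr(errors.New("boom")) // drops the subscription → reconnect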
type fakeDockerEvents struct {
	mu              sync.Mutex
	subscribeCount  int32
	subscribeErr    error
	currentEventsCh chan ports.DockerEvent
	currentErrsCh   chan error
	subscribed      chan struct{}
}

func newFakeDockerEvents() *fakeDockerEvents {
	return &fakeDockerEvents{subscribed: make(chan struct{}, 16)}
}

func (f *fakeDockerEvents) EventsListen(ctx context.Context) (<-chan ports.DockerEvent, <-chan error, error) {
	atomic.AddInt32(&f.subscribeCount, 1)
	f.mu.Lock()
	if f.subscribeErr != nil {
		err := f.subscribeErr
		f.mu.Unlock()
		return nil, nil, err
	}
	events := make(chan ports.DockerEvent, 16)
	errs := make(chan error, 1)
	f.currentEventsCh = events
	f.currentErrsCh = errs
	f.mu.Unlock()

	select {
	case f.subscribed <- struct{}{}:
	default:
	}

	go func() {
		<-ctx.Done()
		f.mu.Lock()
		if f.currentEventsCh == events {
			close(events)
			close(errs)
			f.currentEventsCh = nil
			f.currentErrsCh = nil
		}
		f.mu.Unlock()
	}()
	return events, errs, nil
}

func (f *fakeDockerEvents) sendEvent(event ports.DockerEvent) {
	f.mu.Lock()
	ch := f.currentEventsCh
	f.mu.Unlock()
	if ch != nil {
		ch <- event
	}
}

func (f *fakeDockerEvents) sendErr(err error) {
	f.mu.Lock()
	ch := f.currentErrsCh
	f.mu.Unlock()
	if ch != nil {
		ch <- err
	}
}

func (f *fakeDockerEvents) waitSubscribed(t *testing.T) {
	t.Helper()
	select {
	case <-f.subscribed:
	case <-time.After(time.Second):
		t.Fatalf("timed out waiting for EventsListen subscription")
	}
}

func (f *fakeDockerEvents) subscriptions() int {
	return int(atomic.LoadInt32(&f.subscribeCount))
}

// Unused DockerClient methods. The listener only consumes EventsListen.
func (f *fakeDockerEvents) EnsureNetwork(_ context.Context, _ string) error { return nil }
func (f *fakeDockerEvents) PullImage(_ context.Context, _ string, _ ports.PullPolicy) error {
	return nil
}
func (f *fakeDockerEvents) InspectImage(_ context.Context, _ string) (ports.ImageInspect, error) {
	return ports.ImageInspect{}, nil
}
func (f *fakeDockerEvents) InspectContainer(_ context.Context, _ string) (ports.ContainerInspect, error) {
	return ports.ContainerInspect{}, nil
}
func (f *fakeDockerEvents) Run(_ context.Context, _ ports.RunSpec) (ports.RunResult, error) {
	return ports.RunResult{}, nil
}
func (f *fakeDockerEvents) Stop(_ context.Context, _ string, _ time.Duration) error { return nil }
func (f *fakeDockerEvents) Remove(_ context.Context, _ string) error                { return nil }
func (f *fakeDockerEvents) List(_ context.Context, _ ports.ListFilter) ([]ports.ContainerSummary, error) {
	return nil, nil
}

// fakeRuntimeRecords supports Get only; the listener does not call any
// other method. Tests seed records via Set.
type fakeRuntimeRecords struct {
	mu     sync.Mutex
	stored map[string]runtime.RuntimeRecord
	getErr error
}

func newFakeRuntimeRecords() *fakeRuntimeRecords {
	return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}}
}

func (s *fakeRuntimeRecords) Set(record runtime.RuntimeRecord) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.stored[record.GameID] = record
}

func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.getErr != nil {
		return runtime.RuntimeRecord{}, s.getErr
	}
	record, ok := s.stored[gameID]
	if !ok {
		return runtime.RuntimeRecord{}, runtime.ErrNotFound
	}
	return record, nil
}

func (s *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error { return nil }
func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error {
	return nil
}
func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) { return nil, nil }
func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) {
	return nil, nil
}

// fakeHealthEvents captures every Publish call.
type fakeHealthEvents struct {
	mu         sync.Mutex
	published  []ports.HealthEventEnvelope
	publishErr error
}

func (s *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.publishErr != nil {
		return s.publishErr
	}
	s.published = append(s.published, envelope)
	return nil
}

func (s *fakeHealthEvents) Published() []ports.HealthEventEnvelope {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]ports.HealthEventEnvelope, len(s.published))
	copy(out, s.published)
	return out
}

// --- harness ----------------------------------------------------------

type harness struct {
	docker   *fakeDockerEvents
	records  *fakeRuntimeRecords
	health   *fakeHealthEvents
	listener *dockerevents.Listener
	clockNow time.Time
}

func newHarness(t *testing.T) *harness {
	t.Helper()
	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)

	docker := newFakeDockerEvents()
	records := newFakeRuntimeRecords()
	healthEvents := &fakeHealthEvents{}
	clockNow := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)

	listener, err := dockerevents.NewListener(dockerevents.Dependencies{
		Docker:           docker,
		RuntimeRecords:   records,
		HealthEvents:     healthEvents,
		Telemetry:        telemetryRuntime,
		Clock:            func() time.Time { return clockNow },
		Logger:           silentLogger(),
		ReconnectBackoff: 5 * time.Millisecond,
	})
	require.NoError(t, err)

	return &harness{
		docker:   docker,
		records:  records,
		health:   healthEvents,
		listener: listener,
		clockNow: clockNow,
	}
}

// --- constructor -------------------------------------------------------

func TestNewListenerRejectsMissingDeps(t *testing.T) {
	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)

	cases := []dockerevents.Dependencies{
		{},
		{Docker: newFakeDockerEvents()},
		{Docker: newFakeDockerEvents(), RuntimeRecords: newFakeRuntimeRecords()},
		{Docker: newFakeDockerEvents(), RuntimeRecords: newFakeRuntimeRecords(), HealthEvents: &fakeHealthEvents{}},
	}
	for index, deps := range cases {
		_, err := dockerevents.NewListener(deps)
		require.Errorf(t, err, "case %d should fail", index)
	}

	_, err = dockerevents.NewListener(dockerevents.Dependencies{
		Docker:         newFakeDockerEvents(),
		RuntimeRecords: newFakeRuntimeRecords(),
		HealthEvents:   &fakeHealthEvents{},
		Telemetry:      telemetryRuntime,
	})
	require.NoError(t, err)
}

// --- Run lifecycle -----------------------------------------------------

func TestRunPublishesContainerExitedOnNonZeroDie(t *testing.T) {
	h := newHarness(t)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	done := runListener(h, ctx)
	h.docker.waitSubscribed(t)

	occurredAt := h.clockNow.Add(-time.Minute)
	h.docker.sendEvent(ports.DockerEvent{
		Action:      "die",
		ContainerID: "ctr-die",
		Labels:      map[string]string{startruntime.LabelGameID: "game-die"},
		ExitCode:    137,
		OccurredAt:  occurredAt,
	})

	require.Eventually(t, func() bool { return len(h.health.Published()) == 1 }, time.Second, 5*time.Millisecond)

	envelopes := h.health.Published()
	require.Len(t, envelopes, 1)
	envelope := envelopes[0]
	assert.Equal(t, "game-die", envelope.GameID)
	assert.Equal(t, "ctr-die", envelope.ContainerID)
	assert.Equal(t, health.EventTypeContainerExited, envelope.EventType)
	assert.True(t, envelope.OccurredAt.Equal(occurredAt.UTC()))
	assertJSONEqual(t, `{"exit_code":137,"oom":false}`, envelope.Details)

	cancel()
	waitDone(t, done)
}

func TestRunSkipsZeroExitDie(t *testing.T) {
	h := newHarness(t)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	done := runListener(h, ctx)
	h.docker.waitSubscribed(t)

	h.docker.sendEvent(ports.DockerEvent{
		Action:      "die",
		ContainerID: "ctr-graceful",
		Labels:      map[string]string{startruntime.LabelGameID: "game-graceful"},
		ExitCode:    0,
		OccurredAt:  h.clockNow,
	})

	time.Sleep(20 * time.Millisecond)
	assert.Empty(t, h.health.Published(), "graceful exit must not emit container_exited")

	cancel()
	waitDone(t, done)
}

func TestRunPublishesContainerOOM(t *testing.T) {
	h := newHarness(t)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	done := runListener(h, ctx)
	h.docker.waitSubscribed(t)

	h.docker.sendEvent(ports.DockerEvent{
		Action:      "oom",
		ContainerID: "ctr-oom",
		Labels:      map[string]string{startruntime.LabelGameID: "game-oom"},
		ExitCode:    137,
		OccurredAt:  h.clockNow,
	})

	require.Eventually(t, func() bool { return len(h.health.Published()) == 1 }, time.Second, 5*time.Millisecond)
	envelope := h.health.Published()[0]
	assert.Equal(t, health.EventTypeContainerOOM, envelope.EventType)
	assertJSONEqual(t, `{"exit_code":137}`, envelope.Details)

	cancel()
	waitDone(t, done)
}

func TestRunDestroyEmitsDisappearedOnlyForRunningRecordWithMatchingContainer(t *testing.T) {
	h := newHarness(t)

	startedAt := h.clockNow.Add(-time.Hour)
	h.records.Set(runtime.RuntimeRecord{
		GameID:             "game-d",
		Status:             runtime.StatusRunning,
		CurrentContainerID: "ctr-current",
		CurrentImageRef:    "galaxy/game:1.0.0",
		EngineEndpoint:     "http://galaxy-game-game-d:8080",
		StatePath:          "/var/lib/galaxy/games/game-d",
		DockerNetwork:      "galaxy-net",
		StartedAt:          &startedAt,
		LastOpAt:           h.clockNow,
		CreatedAt:          startedAt,
	})

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	done := runListener(h, ctx)
	h.docker.waitSubscribed(t)

	// Matching destroy → emit.
	h.docker.sendEvent(ports.DockerEvent{
		Action:      "destroy",
		ContainerID: "ctr-current",
		Labels:      map[string]string{startruntime.LabelGameID: "game-d"},
		OccurredAt:  h.clockNow,
	})

	require.Eventually(t, func() bool { return len(h.health.Published()) == 1 }, time.Second, 5*time.Millisecond)
	envelope := h.health.Published()[0]
	assert.Equal(t, health.EventTypeContainerDisappeared, envelope.EventType)
	assertJSONEqual(t, `{}`, envelope.Details)

	// Non-matching container id → skip.
	h.docker.sendEvent(ports.DockerEvent{
		Action:      "destroy",
		ContainerID: "ctr-old",
		Labels:      map[string]string{startruntime.LabelGameID: "game-d"},
		OccurredAt:  h.clockNow,
	})
	time.Sleep(20 * time.Millisecond)
	assert.Len(t, h.health.Published(), 1, "destroy on outdated container_id must not emit again")

	cancel()
	waitDone(t, done)
}

func TestRunDestroySkipsNonRunningRecord(t *testing.T) {
	h := newHarness(t)

	startedAt := h.clockNow.Add(-time.Hour)
	stoppedAt := h.clockNow.Add(-time.Minute)
	h.records.Set(runtime.RuntimeRecord{
		GameID:             "game-stopped",
		Status:             runtime.StatusStopped,
		CurrentContainerID: "ctr-stopped",
		CurrentImageRef:    "galaxy/game:1.0.0",
		EngineEndpoint:     "http://galaxy-game-game-stopped:8080",
		StatePath:          "/var/lib/galaxy/games/game-stopped",
		DockerNetwork:      "galaxy-net",
		StartedAt:          &startedAt,
		StoppedAt:          &stoppedAt,
		LastOpAt:           stoppedAt,
		CreatedAt:          startedAt,
	})

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	done := runListener(h, ctx)
	h.docker.waitSubscribed(t)

	h.docker.sendEvent(ports.DockerEvent{
		Action:      "destroy",
		ContainerID: "ctr-stopped",
		Labels:      map[string]string{startruntime.LabelGameID: "game-stopped"},
		OccurredAt:  h.clockNow,
	})

	time.Sleep(20 * time.Millisecond)
	assert.Empty(t, h.health.Published(), "destroy on non-running record must not emit")

	cancel()
	waitDone(t, done)
}

func TestRunDestroySkipsUnknownGame(t *testing.T) {
	h := newHarness(t)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	done := runListener(h, ctx)
	h.docker.waitSubscribed(t)

	h.docker.sendEvent(ports.DockerEvent{
		Action:      "destroy",
		ContainerID: "ctr-unknown",
		Labels:      map[string]string{startruntime.LabelGameID: "game-unknown"},
		OccurredAt:  h.clockNow,
	})

	time.Sleep(20 * time.Millisecond)
	assert.Empty(t, h.health.Published(), "destroy with no record must not emit")

	cancel()
	waitDone(t, done)
}

func TestRunSkipsEventsWithoutGameIDLabel(t *testing.T) {
	h := newHarness(t)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	done := runListener(h, ctx)
	h.docker.waitSubscribed(t)

	h.docker.sendEvent(ports.DockerEvent{
		Action:      "die",
		ContainerID: "ctr-foreign",
		Labels:      map[string]string{},
		ExitCode:    1,
		OccurredAt:  h.clockNow,
	})

	time.Sleep(20 * time.Millisecond)
	assert.Empty(t, h.health.Published(), "events without game_id label must not emit")

	cancel()
	waitDone(t, done)
}

func TestRunSkipsUnrelatedActions(t *testing.T) {
	h := newHarness(t)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	done := runListener(h, ctx)
	h.docker.waitSubscribed(t)

	for _, action := range []string{"start", "kill", "pause", "create"} {
		h.docker.sendEvent(ports.DockerEvent{
			Action:      action,
			ContainerID: "ctr-x",
			Labels:      map[string]string{startruntime.LabelGameID: "game-x"},
			OccurredAt:  h.clockNow,
		})
	}

	time.Sleep(20 * time.Millisecond)
	assert.Empty(t, h.health.Published(), "non-die/oom/destroy actions must not emit")

	cancel()
	waitDone(t, done)
}

func TestRunReconnectsAfterSubscriptionError(t *testing.T) {
	h := newHarness(t)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	done := runListener(h, ctx)
	h.docker.waitSubscribed(t)

	h.docker.sendErr(errors.New("connection reset"))
	h.docker.waitSubscribed(t)

	// Send an event after reconnect to confirm pipeline resumed.
	h.docker.sendEvent(ports.DockerEvent{
		Action:      "die",
		ContainerID: "ctr-after",
		Labels:      map[string]string{startruntime.LabelGameID: "game-after"},
		ExitCode:    1,
		OccurredAt:  h.clockNow,
	})

	require.Eventually(t, func() bool { return len(h.health.Published()) == 1 }, time.Second, 5*time.Millisecond)
	assert.GreaterOrEqual(t, h.docker.subscriptions(), 2, "listener must reconnect after error")

	cancel()
	waitDone(t, done)
}

func TestRunFillsOccurredAtWhenZero(t *testing.T) {
	h := newHarness(t)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	done := runListener(h, ctx)
	h.docker.waitSubscribed(t)

	h.docker.sendEvent(ports.DockerEvent{
		Action:      "oom",
		ContainerID: "ctr-time",
		Labels:      map[string]string{startruntime.LabelGameID: "game-time"},
		ExitCode:    137,
	})

	require.Eventually(t, func() bool { return len(h.health.Published()) == 1 }, time.Second, 5*time.Millisecond)
	envelope := h.health.Published()[0]
	assert.True(t, envelope.OccurredAt.Equal(h.clockNow.UTC()))

	cancel()
	waitDone(t, done)
}

// --- helpers -----------------------------------------------------------

func runListener(h *harness, ctx context.Context) chan error {
	done := make(chan error, 1)
	go func() { done <- h.listener.Run(ctx) }()
	return done
}

func waitDone(t *testing.T, done chan error) {
	t.Helper()
	select {
	case <-done:
	case <-time.After(time.Second):
		t.Fatalf("Run did not exit within timeout")
	}
}

func assertJSONEqual(t *testing.T, want string, got json.RawMessage) {
	t.Helper()
	var wantValue, gotValue any
	require.NoError(t, json.Unmarshal([]byte(want), &wantValue))
	require.NoError(t, json.Unmarshal(got, &gotValue))
	assert.Equal(t, wantValue, gotValue)
}

// --- shutdown ----------------------------------------------------------

func TestShutdownIsNoOp(t *testing.T) {
	h := newHarness(t)
	require.NoError(t, h.listener.Shutdown(context.Background()))
}

// --- compile-time safety ----------------------------------------------

var (
	_ ports.DockerClient          = (*fakeDockerEvents)(nil)
	_ ports.RuntimeRecordStore    = (*fakeRuntimeRecords)(nil)
	_ ports.HealthEventPublisher  = (*fakeHealthEvents)(nil)
)