feat: runtime manager
@@ -0,0 +1,204 @@
// Package containercleanup ships the periodic TTL-cleanup worker
// described in `rtmanager/README.md §Lifecycles → Cleanup`.
//
// On every tick the worker lists `runtime_records.status='stopped'`
// rows whose `last_op_at` is older than the configured retention
// (`RTMANAGER_CONTAINER_RETENTION_DAYS`) and delegates removal to
// `cleanupcontainer.Service.Handle` with `op_source=auto_ttl`. The
// service owns the per-game lease, the Docker `Remove` call, the
// status transition, the telemetry counter, and the operation_log
// entry; this worker is intentionally tiny: a ticker plus a TTL
// filter.
//
// Idempotent outcomes (`replay_no_op`, `conflict`) are absorbed; a
// failure on one game does not abort the rest of the pass.
//
// Design rationale is captured in
// `rtmanager/docs/workers.md`.
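//
// As a sketch, the per-tick TTL filter amounts to the following
// (illustrative only; it mirrors the unexported tick method below):
//
//	threshold := clock().Add(-retention)
//	for _, record := range stopped {
//		if record.LastOpAt.Before(threshold) {
//			cleanup.Handle(ctx, cleanupcontainer.Input{
//				GameID:   record.GameID,
//				OpSource: operation.OpSourceAutoTTL,
//			})
//		}
//	}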
package containercleanup

import (
	"context"
	"errors"
	"log/slog"
	"time"

	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/cleanupcontainer"
)

// Cleaner is the narrow surface the worker uses to remove stopped
// containers. The production `*cleanupcontainer.Service` satisfies
// this interface verbatim; the package keeps the surface here so
// tests can substitute a fake without spinning the full service.
type Cleaner interface {
	Handle(ctx context.Context, input cleanupcontainer.Input) (cleanupcontainer.Result, error)
}

// Dependencies groups the collaborators required by Worker.
type Dependencies struct {
	// RuntimeRecords lists `status=stopped` records on every tick.
	RuntimeRecords ports.RuntimeRecordStore

	// Cleanup performs the actual container removal under the per-game
	// lease.
	Cleanup Cleaner

	// Retention is the TTL after which a stopped container becomes a
	// removal candidate. Mirrors `cfg.Container.Retention`.
	Retention time.Duration

	// Interval bounds the tick period. Mirrors
	// `cfg.Cleanup.CleanupInterval`.
	Interval time.Duration

	// Clock supplies the wall-clock used to compute the TTL threshold.
	// Defaults to `time.Now` when nil.
	Clock func() time.Time

	// Logger receives structured worker-level events. Defaults to
	// `slog.Default()` when nil.
	Logger *slog.Logger
}

// Worker drives the periodic TTL-cleanup loop.
type Worker struct {
	runtimeRecords ports.RuntimeRecordStore
	cleanup        Cleaner

	retention time.Duration
	interval  time.Duration

	clock  func() time.Time
	logger *slog.Logger
}

// NewWorker constructs one Worker from deps.
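//
// Typical wiring (an illustrative sketch: the cfg fields follow the
// Dependencies comments above, everything else is an assumed local
// name):
//
//	worker, err := NewWorker(Dependencies{
//		RuntimeRecords: store,
//		Cleanup:        cleanupService,
//		Retention:      cfg.Container.Retention,
//		Interval:       cfg.Cleanup.CleanupInterval,
//	})
//	if err != nil {
//		return err
//	}
//	go func() { _ = worker.Run(ctx) }()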
func NewWorker(deps Dependencies) (*Worker, error) {
	switch {
	case deps.RuntimeRecords == nil:
		return nil, errors.New("new container cleanup worker: nil runtime records store")
	case deps.Cleanup == nil:
		return nil, errors.New("new container cleanup worker: nil cleanup service")
	case deps.Retention <= 0:
		return nil, errors.New("new container cleanup worker: retention must be positive")
	case deps.Interval <= 0:
		return nil, errors.New("new container cleanup worker: interval must be positive")
	}

	clock := deps.Clock
	if clock == nil {
		clock = time.Now
	}
	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}

	return &Worker{
		runtimeRecords: deps.RuntimeRecords,
		cleanup:        deps.Cleanup,
		retention:      deps.Retention,
		interval:       deps.Interval,
		clock:          clock,
		logger:         logger.With("worker", "rtmanager.containercleanup"),
	}, nil
}

// Run drives the cleanup loop until ctx is cancelled. Per-tick errors
// are absorbed; the loop only exits on context cancellation.
func (worker *Worker) Run(ctx context.Context) error {
	if worker == nil {
		return errors.New("run container cleanup worker: nil worker")
	}
	if ctx == nil {
		return errors.New("run container cleanup worker: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}

	worker.logger.Info("container cleanup worker started",
		"interval", worker.interval.String(),
		"retention", worker.retention.String(),
	)
	defer worker.logger.Info("container cleanup worker stopped")

	ticker := time.NewTicker(worker.interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			worker.tick(ctx)
		}
	}
}

// Shutdown is a no-op; Run terminates on context cancellation.
func (worker *Worker) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown container cleanup worker: nil context")
	}
	return nil
}

// Tick performs one cleanup pass. Exported so tests can drive the
// worker deterministically without spinning a real ticker.
func (worker *Worker) Tick(ctx context.Context) {
	worker.tick(ctx)
}

// tick lists stopped records and delegates removal of expired ones to
// the cleanup service.
func (worker *Worker) tick(ctx context.Context) {
	if err := ctx.Err(); err != nil {
		return
	}

	records, err := worker.runtimeRecords.ListByStatus(ctx, runtime.StatusStopped)
	if err != nil {
		worker.logger.WarnContext(ctx, "list stopped records",
			"err", err.Error(),
		)
		return
	}

	threshold := worker.clock().Add(-worker.retention)
	for _, record := range records {
		if err := ctx.Err(); err != nil {
			return
		}
		if !record.LastOpAt.Before(threshold) {
			continue
		}

		result, err := worker.cleanup.Handle(ctx, cleanupcontainer.Input{
			GameID:   record.GameID,
			OpSource: operation.OpSourceAutoTTL,
		})
		if err != nil {
			worker.logger.ErrorContext(ctx, "cleanup handle returned error",
				"game_id", record.GameID,
				"err", err.Error(),
			)
			continue
		}
		if result.Outcome == operation.OutcomeFailure {
			worker.logger.InfoContext(ctx, "cleanup ttl pass: failure outcome",
				"game_id", record.GameID,
				"error_code", result.ErrorCode,
				"error_message", result.ErrorMessage,
			)
			continue
		}
		worker.logger.InfoContext(ctx, "cleanup ttl removed container",
			"game_id", record.GameID,
			"outcome", result.Outcome,
		)
	}
}
@@ -0,0 +1,296 @@
package containercleanup_test

import (
	"context"
	"errors"
	"io"
	"log/slog"
	"sync"
	"testing"
	"time"

	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/cleanupcontainer"
	"galaxy/rtmanager/internal/worker/containercleanup"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func silentLogger() *slog.Logger {
	return slog.New(slog.NewTextHandler(io.Discard, nil))
}

// fakeRuntimeRecords implements ports.RuntimeRecordStore; only
// ListByStatus is meaningful here, the remaining methods are inert
// stubs.
type fakeRuntimeRecords struct {
	mu      sync.Mutex
	stopped []runtime.RuntimeRecord
	listErr error
}

func newFakeRuntimeRecords() *fakeRuntimeRecords { return &fakeRuntimeRecords{} }

func (s *fakeRuntimeRecords) Set(records ...runtime.RuntimeRecord) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.stopped = append([]runtime.RuntimeRecord(nil), records...)
}

func (s *fakeRuntimeRecords) Get(_ context.Context, _ string) (runtime.RuntimeRecord, error) {
	return runtime.RuntimeRecord{}, runtime.ErrNotFound
}
func (s *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error { return nil }
func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error {
	return nil
}
func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
	return nil, nil
}

func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, status runtime.Status) ([]runtime.RuntimeRecord, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.listErr != nil {
		return nil, s.listErr
	}
	if status != runtime.StatusStopped {
		return nil, nil
	}
	out := make([]runtime.RuntimeRecord, len(s.stopped))
	copy(out, s.stopped)
	return out, nil
}

// fakeCleaner records every Handle call and returns canned responses.
type fakeCleaner struct {
	mu sync.Mutex

	calls     []cleanupcontainer.Input
	responses []cleanupcontainer.Result
	errs      []error

	defaultResult cleanupcontainer.Result
	defaultErr    error
}

func (c *fakeCleaner) Handle(_ context.Context, input cleanupcontainer.Input) (cleanupcontainer.Result, error) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.calls = append(c.calls, input)
	if len(c.errs) > 0 {
		err := c.errs[0]
		c.errs = c.errs[1:]
		return cleanupcontainer.Result{}, err
	}
	if len(c.responses) > 0 {
		result := c.responses[0]
		c.responses = c.responses[1:]
		return result, nil
	}
	if c.defaultErr != nil {
		return cleanupcontainer.Result{}, c.defaultErr
	}
	return c.defaultResult, nil
}

func (c *fakeCleaner) Calls() []cleanupcontainer.Input {
	c.mu.Lock()
	defer c.mu.Unlock()
	out := make([]cleanupcontainer.Input, len(c.calls))
	copy(out, c.calls)
	return out
}

// --- harness ----------------------------------------------------------

type harness struct {
	records *fakeRuntimeRecords
	cleaner *fakeCleaner

	now time.Time
}

func newHarness() *harness {
	return &harness{
		records: newFakeRuntimeRecords(),
		cleaner: &fakeCleaner{
			defaultResult: cleanupcontainer.Result{Outcome: operation.OutcomeSuccess},
		},
		now: time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC),
	}
}

func (h *harness) build(t *testing.T, retention time.Duration) *containercleanup.Worker {
	t.Helper()
	worker, err := containercleanup.NewWorker(containercleanup.Dependencies{
		RuntimeRecords: h.records,
		Cleanup:        h.cleaner,
		Retention:      retention,
		Interval:       50 * time.Millisecond,
		Clock:          func() time.Time { return h.now },
		Logger:         silentLogger(),
	})
	require.NoError(t, err)
	return worker
}

// stoppedRecord builds a baseline record with the requested LastOpAt.
func stoppedRecord(gameID string, lastOpAt time.Time) runtime.RuntimeRecord {
	stoppedAt := lastOpAt
	return runtime.RuntimeRecord{
		GameID:             gameID,
		Status:             runtime.StatusStopped,
		CurrentContainerID: "ctr-" + gameID,
		CurrentImageRef:    "galaxy/game:1.0.0",
		EngineEndpoint:     "http://galaxy-game-" + gameID + ":8080",
		StatePath:          "/var/lib/galaxy/games/" + gameID,
		DockerNetwork:      "galaxy-net",
		LastOpAt:           lastOpAt,
		CreatedAt:          lastOpAt.Add(-time.Hour),
		StoppedAt:          &stoppedAt,
	}
}

// --- constructor ------------------------------------------------------

func TestNewWorkerRejectsMissingDeps(t *testing.T) {
	cleaner := &fakeCleaner{defaultResult: cleanupcontainer.Result{Outcome: operation.OutcomeSuccess}}
	records := newFakeRuntimeRecords()

	defectives := []containercleanup.Dependencies{
		{},
		{RuntimeRecords: records},
		{RuntimeRecords: records, Cleanup: cleaner},
		{RuntimeRecords: records, Cleanup: cleaner, Retention: time.Hour},
	}
	for index, deps := range defectives {
		_, err := containercleanup.NewWorker(deps)
		require.Errorf(t, err, "case %d should fail", index)
	}

	_, err := containercleanup.NewWorker(containercleanup.Dependencies{
		RuntimeRecords: records,
		Cleanup:        cleaner,
		Retention:      time.Hour,
		Interval:       time.Minute,
	})
	require.NoError(t, err)
}

// --- TTL math ---------------------------------------------------------

func TestTickCallsHandleForExpiredRecordsOnly(t *testing.T) {
	h := newHarness()
	retention := 24 * time.Hour
	w := h.build(t, retention)

	// One stopped older than retention, one within retention.
	expired := stoppedRecord("game-old", h.now.Add(-30*time.Hour))
	fresh := stoppedRecord("game-new", h.now.Add(-time.Hour))
	h.records.Set(expired, fresh)

	w.Tick(context.Background())

	calls := h.cleaner.Calls()
	require.Len(t, calls, 1, "only the expired record should be passed to cleanup")
	assert.Equal(t, "game-old", calls[0].GameID)
	assert.Equal(t, operation.OpSourceAutoTTL, calls[0].OpSource)
	assert.Empty(t, calls[0].SourceRef)
}

func TestTickRespectsThresholdBoundaryExactly(t *testing.T) {
	h := newHarness()
	retention := 24 * time.Hour
	w := h.build(t, retention)

	// LastOpAt exactly equals the threshold; record.LastOpAt.Before(threshold)
	// must be false → record stays.
	exactly := stoppedRecord("game-edge", h.now.Add(-retention))
	h.records.Set(exactly)

	w.Tick(context.Background())
	assert.Empty(t, h.cleaner.Calls(), "boundary record (LastOpAt == threshold) is not yet expired")
}

// --- error absorption -------------------------------------------------

func TestTickAbsorbsListError(t *testing.T) {
	h := newHarness()
	w := h.build(t, time.Hour)
	h.records.listErr = errors.New("pg down")

	require.NotPanics(t, func() { w.Tick(context.Background()) })
	assert.Empty(t, h.cleaner.Calls())
}

func TestTickAbsorbsHandleErrorAndContinues(t *testing.T) {
	h := newHarness()
	retention := time.Hour
	w := h.build(t, retention)

	a := stoppedRecord("game-a", h.now.Add(-2*retention))
	b := stoppedRecord("game-b", h.now.Add(-2*retention))
	h.records.Set(a, b)

	h.cleaner.errs = []error{errors.New("docker hiccup")}

	w.Tick(context.Background())

	calls := h.cleaner.Calls()
	require.Len(t, calls, 2, "second game must still be processed after first error")
	assert.Equal(t, "game-a", calls[0].GameID)
	assert.Equal(t, "game-b", calls[1].GameID)
}

func TestTickAbsorbsFailureOutcomeAndContinues(t *testing.T) {
	h := newHarness()
	retention := time.Hour
	w := h.build(t, retention)

	a := stoppedRecord("game-a", h.now.Add(-2*retention))
	b := stoppedRecord("game-b", h.now.Add(-2*retention))
	h.records.Set(a, b)

	h.cleaner.responses = []cleanupcontainer.Result{
		{Outcome: operation.OutcomeFailure, ErrorCode: "service_unavailable", ErrorMessage: "docker"},
	}

	w.Tick(context.Background())

	calls := h.cleaner.Calls()
	require.Len(t, calls, 2)
}

// --- Run lifecycle ----------------------------------------------------

func TestRunRespectsContextCancel(t *testing.T) {
	h := newHarness()
	w := h.build(t, time.Hour)

	ctx, cancel := context.WithCancel(context.Background())
	done := make(chan error, 1)
	go func() { done <- w.Run(ctx) }()

	cancel()
	select {
	case err := <-done:
		assert.ErrorIs(t, err, context.Canceled)
	case <-time.After(time.Second):
		t.Fatalf("Run did not exit after cancel")
	}
}

func TestShutdownIsNoOp(t *testing.T) {
	h := newHarness()
	w := h.build(t, time.Hour)
	require.NoError(t, w.Shutdown(context.Background()))
}

// --- compile-time safety ----------------------------------------------

var (
	_ ports.RuntimeRecordStore = (*fakeRuntimeRecords)(nil)
	_ containercleanup.Cleaner = (*fakeCleaner)(nil)
)
@@ -0,0 +1,357 @@
// Package dockerevents subscribes to the Docker events stream and turns
// container-scoped events into entries on `runtime:health_events`.
//
// Three event kinds are emitted by this listener (per
// `rtmanager/README.md §Health Monitoring`):
//
//   - `container_exited` from a `die` action with non-zero exit code;
//   - `container_oom` from an `oom` action;
//   - `container_disappeared` from a `destroy` action observed for a
//     `runtime_records.status=running` row whose `current_container_id`
//     still matches the destroyed container, i.e. a destroy that
//     Runtime Manager did not initiate itself. Destroys triggered by
//     RTM's own restart / cleanup flow either find the record already
//     transitioned (status != running) or pointing at a different
//     container id, and are therefore skipped.
//
// `container_started` is emitted by the start service and is not
// duplicated here. Graceful stop produces a `die` event with exit code
// `0`; that case is suppressed to honour the README guarantee that
// `container_exited` carries a non-zero exit.
//
// Design rationale, including the destroy-disambiguation rule and the
// reconnect policy, is captured in
// `rtmanager/docs/workers.md`.
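//
// In short, the action-to-emission mapping is:
//
//	die, exit code != 0            → container_exited
//	die, exit code == 0            → suppressed
//	oom                            → container_oom
//	destroy, running + matching id → container_disappeared
//	any other action               → ignored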
package dockerevents

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"log/slog"
	"strings"
	"time"

	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/logging"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/startruntime"
	"galaxy/rtmanager/internal/telemetry"
)

// Docker event actions consumed by the listener. Other actions are
// observed but ignored.
const (
	actionDie     = "die"
	actionOOM     = "oom"
	actionDestroy = "destroy"
)

// defaultReconnectBackoff bounds the wait between two `EventsListen`
// reconnect attempts. Daemon hiccups in production are common; the
// listener never gives up while ctx is alive.
const defaultReconnectBackoff = 5 * time.Second

// Dependencies groups the collaborators required by Listener.
type Dependencies struct {
	// Docker provides the EventsListen subscription used by Run.
	Docker ports.DockerClient

	// RuntimeRecords resolves `(game_id, container_id)` for destroy
	// disambiguation.
	RuntimeRecords ports.RuntimeRecordStore

	// HealthEvents emits the entries produced by handleEvent. Failures
	// are best-effort: the listener logs and continues.
	HealthEvents ports.HealthEventPublisher

	// Telemetry records one health-event counter increment per emission.
	// Required.
	Telemetry *telemetry.Runtime

	// Clock supplies the wall-clock used as a fallback when a Docker
	// event arrives without a timestamp. Defaults to `time.Now`.
	Clock func() time.Time

	// Logger receives structured worker-level events. Defaults to
	// `slog.Default()` when nil.
	Logger *slog.Logger

	// ReconnectBackoff bounds the wait between reconnect attempts.
	// Defaults to defaultReconnectBackoff when zero or negative.
	ReconnectBackoff time.Duration
}

// Listener consumes Docker container events and emits the matching
// `runtime:health_events` entries.
type Listener struct {
	docker         ports.DockerClient
	runtimeRecords ports.RuntimeRecordStore
	healthEvents   ports.HealthEventPublisher
	telemetry      *telemetry.Runtime
	clock          func() time.Time
	logger         *slog.Logger

	reconnectBackoff time.Duration
}

// NewListener constructs one Listener from deps.
func NewListener(deps Dependencies) (*Listener, error) {
	switch {
	case deps.Docker == nil:
		return nil, errors.New("new docker events listener: nil docker client")
	case deps.RuntimeRecords == nil:
		return nil, errors.New("new docker events listener: nil runtime records store")
	case deps.HealthEvents == nil:
		return nil, errors.New("new docker events listener: nil health events publisher")
	case deps.Telemetry == nil:
		return nil, errors.New("new docker events listener: nil telemetry runtime")
	}

	clock := deps.Clock
	if clock == nil {
		clock = time.Now
	}
	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}
	backoff := deps.ReconnectBackoff
	if backoff <= 0 {
		backoff = defaultReconnectBackoff
	}

	return &Listener{
		docker:           deps.Docker,
		runtimeRecords:   deps.RuntimeRecords,
		healthEvents:     deps.HealthEvents,
		telemetry:        deps.Telemetry,
		clock:            clock,
		logger:           logger.With("worker", "rtmanager.dockerevents"),
		reconnectBackoff: backoff,
	}, nil
}

// Run drives the events subscription. The outer loop reconnects after a
// Docker subscription error with a fixed backoff; only `ctx`
// cancellation terminates Run.
func (listener *Listener) Run(ctx context.Context) error {
	if listener == nil {
		return errors.New("run docker events listener: nil listener")
	}
	if ctx == nil {
		return errors.New("run docker events listener: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}

	listener.logger.Info("docker events listener started",
		"reconnect_backoff", listener.reconnectBackoff.String(),
	)
	defer listener.logger.Info("docker events listener stopped")

	for {
		if err := ctx.Err(); err != nil {
			return err
		}

		err := listener.runOnce(ctx)
		if err == nil || errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
			if ctxErr := ctx.Err(); ctxErr != nil {
				return ctxErr
			}
		}
		if err != nil && !errors.Is(err, context.Canceled) && !errors.Is(err, context.DeadlineExceeded) {
			listener.logger.WarnContext(ctx, "docker events subscription dropped, will reconnect",
				"err", err.Error(),
				"backoff", listener.reconnectBackoff.String(),
			)
		}

		if waitErr := listener.sleep(ctx); waitErr != nil {
			return waitErr
		}
	}
}

// Shutdown is a no-op; Run terminates on context cancellation.
func (listener *Listener) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown docker events listener: nil context")
	}
	return nil
}

// runOnce subscribes once and processes events until the subscription
// reports an error or ctx is cancelled.
func (listener *Listener) runOnce(ctx context.Context) error {
	events, errs, err := listener.docker.EventsListen(ctx)
	if err != nil {
		return fmt.Errorf("subscribe docker events: %w", err)
	}

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case event, ok := <-events:
			if !ok {
				return errors.New("docker events channel closed")
			}
			listener.handleEvent(ctx, event)
		case subscribeErr, ok := <-errs:
			if !ok {
				return errors.New("docker errors channel closed")
			}
			if subscribeErr == nil {
				continue
			}
			return subscribeErr
		}
	}
}

// sleep waits reconnectBackoff or until ctx is cancelled.
func (listener *Listener) sleep(ctx context.Context) error {
	timer := time.NewTimer(listener.reconnectBackoff)
	defer timer.Stop()

	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-timer.C:
		return nil
	}
}

// handleEvent translates one Docker event into a health-events emission
// (if any). All branches are exercised by tests.
func (listener *Listener) handleEvent(ctx context.Context, event ports.DockerEvent) {
	gameID := strings.TrimSpace(event.Labels[startruntime.LabelGameID])
	if gameID == "" {
		return
	}

	occurredAt := event.OccurredAt
	if occurredAt.IsZero() {
		occurredAt = listener.clock()
	}
	occurredAt = occurredAt.UTC()

	switch event.Action {
	case actionDie:
		if event.ExitCode == 0 {
			return
		}
		listener.publish(ctx, ports.HealthEventEnvelope{
			GameID:      gameID,
			ContainerID: event.ContainerID,
			EventType:   health.EventTypeContainerExited,
			OccurredAt:  occurredAt,
			Details:     containerExitedDetails(event.ExitCode, false),
		})
	case actionOOM:
		listener.publish(ctx, ports.HealthEventEnvelope{
			GameID:      gameID,
			ContainerID: event.ContainerID,
			EventType:   health.EventTypeContainerOOM,
			OccurredAt:  occurredAt,
			Details:     containerOOMDetails(event.ExitCode),
		})
	case actionDestroy:
		if !listener.isUnexpectedDestroy(ctx, gameID, event.ContainerID) {
			return
		}
		listener.publish(ctx, ports.HealthEventEnvelope{
			GameID:      gameID,
			ContainerID: event.ContainerID,
			EventType:   health.EventTypeContainerDisappeared,
			OccurredAt:  occurredAt,
			Details:     containerDisappearedDetails(),
		})
	default:
		return
	}
}

// isUnexpectedDestroy returns true when the destroy event came from a
// source other than Runtime Manager itself. The check is conservative:
// any read error treats the destroy as expected (we cannot tell), and
// only a record currently `running` whose `current_container_id` still
// equals the destroyed id is considered unexpected.
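//
// For example (illustrative): a destroy for a game whose record is
// status=running with current_container_id equal to the destroyed id
// is unexpected; the same destroy arriving after RTM's own cleanup has
// moved the record to stopped, or repointed it at a newer container,
// is expected and suppressed.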
func (listener *Listener) isUnexpectedDestroy(ctx context.Context, gameID, containerID string) bool {
	record, err := listener.runtimeRecords.Get(ctx, gameID)
	switch {
	case errors.Is(err, runtime.ErrNotFound):
		return false
	case err != nil:
		listener.logger.WarnContext(ctx, "destroy lookup failed; suppressing emission",
			"game_id", gameID,
			"container_id", containerID,
			"err", err.Error(),
		)
		return false
	}
	if record.Status != runtime.StatusRunning {
		return false
	}
	if record.CurrentContainerID != containerID {
		return false
	}
	return true
}

// publish emits one envelope through the configured publisher, updates
// the telemetry counter, and logs the outcome. All side effects are
// best-effort; a publish error degrades to an error log.
func (listener *Listener) publish(ctx context.Context, envelope ports.HealthEventEnvelope) {
	if err := listener.healthEvents.Publish(ctx, envelope); err != nil {
		listener.logger.ErrorContext(ctx, "publish health event",
			"game_id", envelope.GameID,
			"container_id", envelope.ContainerID,
			"event_type", string(envelope.EventType),
			"err", err.Error(),
		)
		return
	}

	listener.telemetry.RecordHealthEvent(ctx, string(envelope.EventType))

	logArgs := []any{
		"game_id", envelope.GameID,
		"container_id", envelope.ContainerID,
		"event_type", string(envelope.EventType),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	listener.logger.InfoContext(ctx, "docker event published", logArgs...)
}

// containerExitedDetails builds the JSON payload required by the
// `container_exited` AsyncAPI variant.
func containerExitedDetails(exitCode int, oom bool) json.RawMessage {
	payload := struct {
		ExitCode int  `json:"exit_code"`
		OOM      bool `json:"oom"`
	}{ExitCode: exitCode, OOM: oom}
	encoded, _ := json.Marshal(payload)
	return encoded
}

// containerOOMDetails builds the JSON payload required by the
// `container_oom` AsyncAPI variant.
func containerOOMDetails(exitCode int) json.RawMessage {
	payload := struct {
		ExitCode int `json:"exit_code"`
	}{ExitCode: exitCode}
	encoded, _ := json.Marshal(payload)
	return encoded
}

// containerDisappearedDetails builds the empty JSON object the
// `container_disappeared` AsyncAPI variant requires.
func containerDisappearedDetails() json.RawMessage {
	return json.RawMessage(`{}`)
}
@@ -0,0 +1,584 @@
package dockerevents_test

import (
	"context"
	"encoding/json"
	"errors"
	"io"
	"log/slog"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/startruntime"
	"galaxy/rtmanager/internal/telemetry"
	"galaxy/rtmanager/internal/worker/dockerevents"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func silentLogger() *slog.Logger {
	return slog.New(slog.NewTextHandler(io.Discard, nil))
}

// fakeDockerEvents is a minimal ports.DockerClient implementation for
// the listener: only EventsListen is exercised. Tests push events
// through the current events channel and observe reconnect attempts
// via the subscribe counter.
type fakeDockerEvents struct {
	mu              sync.Mutex
	subscribeCount  int32
	subscribeErr    error
	currentEventsCh chan ports.DockerEvent
	currentErrsCh   chan error
	subscribed      chan struct{}
}

func newFakeDockerEvents() *fakeDockerEvents {
	return &fakeDockerEvents{subscribed: make(chan struct{}, 16)}
}

func (f *fakeDockerEvents) EventsListen(ctx context.Context) (<-chan ports.DockerEvent, <-chan error, error) {
	atomic.AddInt32(&f.subscribeCount, 1)
	f.mu.Lock()
	if f.subscribeErr != nil {
		err := f.subscribeErr
		f.mu.Unlock()
		return nil, nil, err
	}
	events := make(chan ports.DockerEvent, 16)
	errs := make(chan error, 1)
	f.currentEventsCh = events
	f.currentErrsCh = errs
	f.mu.Unlock()

	select {
	case f.subscribed <- struct{}{}:
	default:
	}

	go func() {
		<-ctx.Done()
		f.mu.Lock()
		if f.currentEventsCh == events {
			close(events)
			close(errs)
			f.currentEventsCh = nil
			f.currentErrsCh = nil
		}
		f.mu.Unlock()
	}()
	return events, errs, nil
}

func (f *fakeDockerEvents) sendEvent(event ports.DockerEvent) {
	f.mu.Lock()
	ch := f.currentEventsCh
	f.mu.Unlock()
	if ch != nil {
		ch <- event
	}
}

func (f *fakeDockerEvents) sendErr(err error) {
	f.mu.Lock()
	ch := f.currentErrsCh
	f.mu.Unlock()
	if ch != nil {
		ch <- err
	}
}

func (f *fakeDockerEvents) waitSubscribed(t *testing.T) {
	t.Helper()
	select {
	case <-f.subscribed:
	case <-time.After(time.Second):
		t.Fatalf("timed out waiting for EventsListen subscription")
	}
}

func (f *fakeDockerEvents) subscriptions() int {
	return int(atomic.LoadInt32(&f.subscribeCount))
}

// Unused DockerClient methods. The listener only consumes EventsListen.
func (f *fakeDockerEvents) EnsureNetwork(_ context.Context, _ string) error { return nil }
func (f *fakeDockerEvents) PullImage(_ context.Context, _ string, _ ports.PullPolicy) error {
	return nil
}
func (f *fakeDockerEvents) InspectImage(_ context.Context, _ string) (ports.ImageInspect, error) {
	return ports.ImageInspect{}, nil
}
func (f *fakeDockerEvents) InspectContainer(_ context.Context, _ string) (ports.ContainerInspect, error) {
	return ports.ContainerInspect{}, nil
}
func (f *fakeDockerEvents) Run(_ context.Context, _ ports.RunSpec) (ports.RunResult, error) {
	return ports.RunResult{}, nil
}
func (f *fakeDockerEvents) Stop(_ context.Context, _ string, _ time.Duration) error { return nil }
func (f *fakeDockerEvents) Remove(_ context.Context, _ string) error                { return nil }
func (f *fakeDockerEvents) List(_ context.Context, _ ports.ListFilter) ([]ports.ContainerSummary, error) {
	return nil, nil
}

// fakeRuntimeRecords supports Get only; the listener does not call any
// other method. Tests seed records via Set.
type fakeRuntimeRecords struct {
	mu     sync.Mutex
	stored map[string]runtime.RuntimeRecord
	getErr error
}

func newFakeRuntimeRecords() *fakeRuntimeRecords {
	return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}}
}

func (s *fakeRuntimeRecords) Set(record runtime.RuntimeRecord) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.stored[record.GameID] = record
}

func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.getErr != nil {
		return runtime.RuntimeRecord{}, s.getErr
	}
	record, ok := s.stored[gameID]
	if !ok {
		return runtime.RuntimeRecord{}, runtime.ErrNotFound
	}
	return record, nil
}

func (s *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error { return nil }
func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error {
	return nil
}
func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) { return nil, nil }
func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) {
	return nil, nil
}

// fakeHealthEvents captures every Publish call.
type fakeHealthEvents struct {
	mu         sync.Mutex
	published  []ports.HealthEventEnvelope
	publishErr error
}

func (s *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.publishErr != nil {
		return s.publishErr
	}
	s.published = append(s.published, envelope)
	return nil
}

func (s *fakeHealthEvents) Published() []ports.HealthEventEnvelope {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]ports.HealthEventEnvelope, len(s.published))
	copy(out, s.published)
	return out
}

// --- harness ----------------------------------------------------------

type harness struct {
	docker   *fakeDockerEvents
	records  *fakeRuntimeRecords
	health   *fakeHealthEvents
	listener *dockerevents.Listener
	clockNow time.Time
}

func newHarness(t *testing.T) *harness {
	t.Helper()
	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)

	docker := newFakeDockerEvents()
	records := newFakeRuntimeRecords()
	healthEvents := &fakeHealthEvents{}
	clockNow := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)

	listener, err := dockerevents.NewListener(dockerevents.Dependencies{
		Docker:           docker,
		RuntimeRecords:   records,
		HealthEvents:     healthEvents,
		Telemetry:        telemetryRuntime,
		Clock:            func() time.Time { return clockNow },
		Logger:           silentLogger(),
		ReconnectBackoff: 5 * time.Millisecond,
	})
	require.NoError(t, err)

	return &harness{
		docker:   docker,
		records:  records,
		health:   healthEvents,
		listener: listener,
		clockNow: clockNow,
	}
}

// --- constructor -------------------------------------------------------

func TestNewListenerRejectsMissingDeps(t *testing.T) {
	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)

	cases := []dockerevents.Dependencies{
		{},
		{Docker: newFakeDockerEvents()},
		{Docker: newFakeDockerEvents(), RuntimeRecords: newFakeRuntimeRecords()},
		{Docker: newFakeDockerEvents(), RuntimeRecords: newFakeRuntimeRecords(), HealthEvents: &fakeHealthEvents{}},
	}
	for index, deps := range cases {
		_, err := dockerevents.NewListener(deps)
		require.Errorf(t, err, "case %d should fail", index)
	}

	_, err = dockerevents.NewListener(dockerevents.Dependencies{
		Docker:         newFakeDockerEvents(),
		RuntimeRecords: newFakeRuntimeRecords(),
		HealthEvents:   &fakeHealthEvents{},
		Telemetry:      telemetryRuntime,
	})
	require.NoError(t, err)
}

// --- Run lifecycle -----------------------------------------------------

func TestRunPublishesContainerExitedOnNonZeroDie(t *testing.T) {
	h := newHarness(t)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	done := runListener(ctx, h)
	h.docker.waitSubscribed(t)

	occurredAt := h.clockNow.Add(-time.Minute)
	h.docker.sendEvent(ports.DockerEvent{
		Action:      "die",
		ContainerID: "ctr-die",
		Labels:      map[string]string{startruntime.LabelGameID: "game-die"},
		ExitCode:    137,
		OccurredAt:  occurredAt,
	})

	require.Eventually(t, func() bool { return len(h.health.Published()) == 1 }, time.Second, 5*time.Millisecond)

	envelopes := h.health.Published()
	require.Len(t, envelopes, 1)
	envelope := envelopes[0]
	assert.Equal(t, "game-die", envelope.GameID)
	assert.Equal(t, "ctr-die", envelope.ContainerID)
	assert.Equal(t, health.EventTypeContainerExited, envelope.EventType)
	assert.True(t, envelope.OccurredAt.Equal(occurredAt.UTC()))
	assertJSONEqual(t, `{"exit_code":137,"oom":false}`, envelope.Details)

	cancel()
	waitDone(t, done)
}

func TestRunSkipsZeroExitDie(t *testing.T) {
	h := newHarness(t)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	done := runListener(ctx, h)
	h.docker.waitSubscribed(t)

	h.docker.sendEvent(ports.DockerEvent{
		Action:      "die",
		ContainerID: "ctr-graceful",
		Labels:      map[string]string{startruntime.LabelGameID: "game-graceful"},
		ExitCode:    0,
		OccurredAt:  h.clockNow,
	})

	time.Sleep(20 * time.Millisecond)
	assert.Empty(t, h.health.Published(), "graceful exit must not emit container_exited")

	cancel()
	waitDone(t, done)
}

func TestRunPublishesContainerOOM(t *testing.T) {
	h := newHarness(t)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	done := runListener(ctx, h)
	h.docker.waitSubscribed(t)

	h.docker.sendEvent(ports.DockerEvent{
		Action:      "oom",
		ContainerID: "ctr-oom",
		Labels:      map[string]string{startruntime.LabelGameID: "game-oom"},
		ExitCode:    137,
		OccurredAt:  h.clockNow,
	})

	require.Eventually(t, func() bool { return len(h.health.Published()) == 1 }, time.Second, 5*time.Millisecond)
	envelope := h.health.Published()[0]
	assert.Equal(t, health.EventTypeContainerOOM, envelope.EventType)
	assertJSONEqual(t, `{"exit_code":137}`, envelope.Details)

	cancel()
	waitDone(t, done)
}

func TestRunDestroyEmitsDisappearedOnlyForRunningRecordWithMatchingContainer(t *testing.T) {
	h := newHarness(t)

	startedAt := h.clockNow.Add(-time.Hour)
	h.records.Set(runtime.RuntimeRecord{
		GameID:             "game-d",
		Status:             runtime.StatusRunning,
		CurrentContainerID: "ctr-current",
		CurrentImageRef:    "galaxy/game:1.0.0",
		EngineEndpoint:     "http://galaxy-game-game-d:8080",
		StatePath:          "/var/lib/galaxy/games/game-d",
		DockerNetwork:      "galaxy-net",
		StartedAt:          &startedAt,
		LastOpAt:           h.clockNow,
		CreatedAt:          startedAt,
	})

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	done := runListener(ctx, h)
	h.docker.waitSubscribed(t)

	// Matching destroy → emit.
	h.docker.sendEvent(ports.DockerEvent{
		Action:      "destroy",
		ContainerID: "ctr-current",
		Labels:      map[string]string{startruntime.LabelGameID: "game-d"},
		OccurredAt:  h.clockNow,
	})

	require.Eventually(t, func() bool { return len(h.health.Published()) == 1 }, time.Second, 5*time.Millisecond)
	envelope := h.health.Published()[0]
	assert.Equal(t, health.EventTypeContainerDisappeared, envelope.EventType)
	assertJSONEqual(t, `{}`, envelope.Details)

	// Non-matching container id → skip.
	h.docker.sendEvent(ports.DockerEvent{
		Action:      "destroy",
		ContainerID: "ctr-old",
		Labels:      map[string]string{startruntime.LabelGameID: "game-d"},
		OccurredAt:  h.clockNow,
	})
	time.Sleep(20 * time.Millisecond)
	assert.Len(t, h.health.Published(), 1, "destroy on outdated container_id must not emit again")

	cancel()
	waitDone(t, done)
}

func TestRunDestroySkipsNonRunningRecord(t *testing.T) {
	h := newHarness(t)

	startedAt := h.clockNow.Add(-time.Hour)
	stoppedAt := h.clockNow.Add(-time.Minute)
	h.records.Set(runtime.RuntimeRecord{
		GameID:             "game-stopped",
		Status:             runtime.StatusStopped,
		CurrentContainerID: "ctr-stopped",
		CurrentImageRef:    "galaxy/game:1.0.0",
		EngineEndpoint:     "http://galaxy-game-game-stopped:8080",
		StatePath:          "/var/lib/galaxy/games/game-stopped",
		DockerNetwork:      "galaxy-net",
		StartedAt:          &startedAt,
		StoppedAt:          &stoppedAt,
		LastOpAt:           stoppedAt,
		CreatedAt:          startedAt,
	})

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	done := runListener(ctx, h)
	h.docker.waitSubscribed(t)

	h.docker.sendEvent(ports.DockerEvent{
		Action:      "destroy",
		ContainerID: "ctr-stopped",
		Labels:      map[string]string{startruntime.LabelGameID: "game-stopped"},
		OccurredAt:  h.clockNow,
	})

	time.Sleep(20 * time.Millisecond)
	assert.Empty(t, h.health.Published(), "destroy on non-running record must not emit")

	cancel()
	waitDone(t, done)
}

func TestRunDestroySkipsUnknownGame(t *testing.T) {
	h := newHarness(t)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	done := runListener(ctx, h)
	h.docker.waitSubscribed(t)

	h.docker.sendEvent(ports.DockerEvent{
		Action:      "destroy",
		ContainerID: "ctr-unknown",
		Labels:      map[string]string{startruntime.LabelGameID: "game-unknown"},
		OccurredAt:  h.clockNow,
	})

	time.Sleep(20 * time.Millisecond)
	assert.Empty(t, h.health.Published(), "destroy with no record must not emit")

	cancel()
	waitDone(t, done)
}

func TestRunSkipsEventsWithoutGameIDLabel(t *testing.T) {
	h := newHarness(t)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	done := runListener(ctx, h)
	h.docker.waitSubscribed(t)

	h.docker.sendEvent(ports.DockerEvent{
		Action:      "die",
		ContainerID: "ctr-foreign",
		Labels:      map[string]string{},
		ExitCode:    1,
		OccurredAt:  h.clockNow,
	})

	time.Sleep(20 * time.Millisecond)
	assert.Empty(t, h.health.Published(), "events without game_id label must not emit")

	cancel()
	waitDone(t, done)
}

func TestRunSkipsUnrelatedActions(t *testing.T) {
	h := newHarness(t)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	done := runListener(ctx, h)
	h.docker.waitSubscribed(t)

	for _, action := range []string{"start", "kill", "pause", "create"} {
		h.docker.sendEvent(ports.DockerEvent{
			Action:      action,
			ContainerID: "ctr-x",
			Labels:      map[string]string{startruntime.LabelGameID: "game-x"},
			OccurredAt:  h.clockNow,
		})
	}

	time.Sleep(20 * time.Millisecond)
	assert.Empty(t, h.health.Published(), "non-die/oom/destroy actions must not emit")

	cancel()
	waitDone(t, done)
}

func TestRunReconnectsAfterSubscriptionError(t *testing.T) {
	h := newHarness(t)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	done := runListener(ctx, h)
	h.docker.waitSubscribed(t)

	h.docker.sendErr(errors.New("connection reset"))
	h.docker.waitSubscribed(t)

	// Send an event after reconnect to confirm the pipeline resumed.
	h.docker.sendEvent(ports.DockerEvent{
		Action:      "die",
		ContainerID: "ctr-after",
		Labels:      map[string]string{startruntime.LabelGameID: "game-after"},
		ExitCode:    1,
		OccurredAt:  h.clockNow,
	})

	require.Eventually(t, func() bool { return len(h.health.Published()) == 1 }, time.Second, 5*time.Millisecond)
	assert.GreaterOrEqual(t, h.docker.subscriptions(), 2, "listener must reconnect after error")

	cancel()
	waitDone(t, done)
}

func TestRunFillsOccurredAtWhenZero(t *testing.T) {
	h := newHarness(t)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	done := runListener(ctx, h)
	h.docker.waitSubscribed(t)

	h.docker.sendEvent(ports.DockerEvent{
		Action:      "oom",
		ContainerID: "ctr-time",
		Labels:      map[string]string{startruntime.LabelGameID: "game-time"},
		ExitCode:    137,
	})

	require.Eventually(t, func() bool { return len(h.health.Published()) == 1 }, time.Second, 5*time.Millisecond)
	envelope := h.health.Published()[0]
	assert.True(t, envelope.OccurredAt.Equal(h.clockNow.UTC()))

	cancel()
	waitDone(t, done)
}

// --- helpers -----------------------------------------------------------

func runListener(ctx context.Context, h *harness) chan error {
	done := make(chan error, 1)
	go func() { done <- h.listener.Run(ctx) }()
	return done
}

func waitDone(t *testing.T, done chan error) {
	t.Helper()
	select {
	case <-done:
	case <-time.After(time.Second):
		t.Fatalf("Run did not exit within timeout")
	}
}

func assertJSONEqual(t *testing.T, want string, got json.RawMessage) {
	t.Helper()
	var wantValue, gotValue any
	require.NoError(t, json.Unmarshal([]byte(want), &wantValue))
	require.NoError(t, json.Unmarshal(got, &gotValue))
	assert.Equal(t, wantValue, gotValue)
}

// --- shutdown ----------------------------------------------------------

func TestShutdownIsNoOp(t *testing.T) {
	h := newHarness(t)
	require.NoError(t, h.listener.Shutdown(context.Background()))
}

// --- compile-time safety ----------------------------------------------

var (
	_ ports.DockerClient         = (*fakeDockerEvents)(nil)
	_ ports.RuntimeRecordStore   = (*fakeRuntimeRecords)(nil)
	_ ports.HealthEventPublisher = (*fakeHealthEvents)(nil)
)
@@ -0,0 +1,318 @@
// Package dockerinspect runs the periodic Docker inspect described in
// `rtmanager/README.md §Health Monitoring`.
//
// On every tick the worker lists `runtime_records.status=running`,
// inspects each container, and emits `inspect_unhealthy` when any of
// the following holds:
//
//   - `RestartCount` increased between observations (delta detection
//     requires a prior observation; the first inspect of a record only
//     records the baseline);
//   - `State.Status != "running"`;
//   - `State.Health.Status == "unhealthy"` (only meaningful when the
//     image declares a Docker HEALTHCHECK).
//
// `ErrContainerNotFound` is left to the reconciler: the inspect
// worker logs and skips so that `container_disappeared` emission
// stays single-sourced (Docker events listener + reconciler).
//
// Per-game state is pruned at the start of every tick against the
// freshly-read running list, so a stopped or removed game never
// carries a stale baseline into a new lifecycle.
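//
// The emit predicate reduces to this sketch (illustrative only; it
// mirrors the unexported inspectOne method below):
//
//	unhealthy := (prev.seen && inspect.RestartCount > prev.lastRestartCount) ||
//		inspect.Status != dockerStateRunning ||
//		inspect.Health == dockerHealthUnhealthy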
package dockerinspect

import (
	"context"
	"encoding/json"
	"errors"
	"log/slog"
	"sync"
	"time"

	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/logging"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/telemetry"
)

// dockerStateRunning is the verbatim Docker `State.Status` value the
// worker treats as healthy.
const dockerStateRunning = "running"

// dockerHealthUnhealthy is the verbatim Docker `State.Health.Status`
// value the worker treats as unhealthy.
const dockerHealthUnhealthy = "unhealthy"

// Dependencies groups the collaborators required by Worker.
type Dependencies struct {
	// Docker provides the InspectContainer surface.
	Docker ports.DockerClient

	// RuntimeRecords lists running games on every tick.
	RuntimeRecords ports.RuntimeRecordStore

	// HealthEvents emits `inspect_unhealthy` entries.
	HealthEvents ports.HealthEventPublisher

	// Telemetry records one health-event counter per emission.
	Telemetry *telemetry.Runtime

	// Interval bounds the tick period.
	Interval time.Duration

	// Clock supplies the wall-clock used for emission timestamps.
	// Defaults to `time.Now` when nil.
	Clock func() time.Time

	// Logger receives structured worker-level events. Defaults to
	// `slog.Default()` when nil.
	Logger *slog.Logger
}

// Worker drives the periodic inspect loop.
type Worker struct {
	docker         ports.DockerClient
	runtimeRecords ports.RuntimeRecordStore
	healthEvents   ports.HealthEventPublisher
	telemetry      *telemetry.Runtime

	interval time.Duration

	clock  func() time.Time
	logger *slog.Logger

	mu     sync.Mutex
	states map[string]*inspectState
}

// inspectState stores the per-game baseline. Owned by Worker and
// protected by Worker.mu.
type inspectState struct {
	lastRestartCount int
	seen             bool
}

// NewWorker constructs one Worker from deps.
func NewWorker(deps Dependencies) (*Worker, error) {
	switch {
	case deps.Docker == nil:
		return nil, errors.New("new docker inspect worker: nil docker client")
	case deps.RuntimeRecords == nil:
		return nil, errors.New("new docker inspect worker: nil runtime records store")
	case deps.HealthEvents == nil:
		return nil, errors.New("new docker inspect worker: nil health events publisher")
	case deps.Telemetry == nil:
		return nil, errors.New("new docker inspect worker: nil telemetry runtime")
	case deps.Interval <= 0:
		return nil, errors.New("new docker inspect worker: interval must be positive")
	}

	clock := deps.Clock
	if clock == nil {
		clock = time.Now
	}
	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}

	return &Worker{
		docker:         deps.Docker,
		runtimeRecords: deps.RuntimeRecords,
		healthEvents:   deps.HealthEvents,
		telemetry:      deps.Telemetry,
		interval:       deps.Interval,
		clock:          clock,
		logger:         logger.With("worker", "rtmanager.dockerinspect"),
		states:         map[string]*inspectState{},
	}, nil
}

// Run drives the inspect loop until ctx is cancelled. Per-tick errors
// are absorbed; the loop only exits on context cancellation.
func (worker *Worker) Run(ctx context.Context) error {
	if worker == nil {
		return errors.New("run docker inspect worker: nil worker")
	}
	if ctx == nil {
		return errors.New("run docker inspect worker: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}

	worker.logger.Info("docker inspect worker started",
		"interval", worker.interval.String(),
	)
	defer worker.logger.Info("docker inspect worker stopped")

	ticker := time.NewTicker(worker.interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			worker.tick(ctx)
		}
	}
}

// Shutdown is a no-op; Run terminates on context cancellation.
func (worker *Worker) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown docker inspect worker: nil context")
	}
	return nil
}

// Tick performs one inspect pass. Exported so tests can drive the
// worker deterministically without spinning a real ticker.
func (worker *Worker) Tick(ctx context.Context) {
	worker.tick(ctx)
}

// tick performs one full pass: list running records, prune state for
// stopped games, then inspect every running container sequentially.
// Inspect calls are cheap; sequential execution avoids fan-out against
// the Docker daemon.
func (worker *Worker) tick(ctx context.Context) {
	if err := ctx.Err(); err != nil {
		return
	}

	records, err := worker.runtimeRecords.ListByStatus(ctx, runtime.StatusRunning)
	if err != nil {
		worker.logger.WarnContext(ctx, "list running records",
			"err", err.Error(),
		)
		return
	}

	worker.pruneStates(records)

	for _, record := range records {
		if err := ctx.Err(); err != nil {
			return
		}
		worker.inspectOne(ctx, record)
	}
}

// pruneStates removes per-game baselines for games no longer in the
// running list.
func (worker *Worker) pruneStates(records []runtime.RuntimeRecord) {
	worker.mu.Lock()
	defer worker.mu.Unlock()
	if len(worker.states) == 0 {
		return
	}
	running := make(map[string]struct{}, len(records))
	for _, record := range records {
		running[record.GameID] = struct{}{}
	}
	for gameID := range worker.states {
		if _, ok := running[gameID]; !ok {
			delete(worker.states, gameID)
		}
	}
}

// inspectOne issues one InspectContainer call and emits
// `inspect_unhealthy` when the observation crosses any of the three
// trigger conditions. The first observation of a record only seeds the
// baseline; deltas need at least two ticks.
func (worker *Worker) inspectOne(ctx context.Context, record runtime.RuntimeRecord) {
|
||||
inspect, err := worker.docker.InspectContainer(ctx, record.CurrentContainerID)
|
||||
if err != nil {
|
||||
if errors.Is(err, ports.ErrContainerNotFound) {
|
||||
worker.logger.DebugContext(ctx, "inspect skipped: container missing",
|
||||
"game_id", record.GameID,
|
||||
"container_id", record.CurrentContainerID,
|
||||
)
|
||||
return
|
||||
}
|
||||
worker.logger.WarnContext(ctx, "inspect failed",
|
||||
"game_id", record.GameID,
|
||||
"container_id", record.CurrentContainerID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
worker.mu.Lock()
|
||||
state, ok := worker.states[record.GameID]
|
||||
if !ok {
|
||||
state = &inspectState{}
|
||||
worker.states[record.GameID] = state
|
||||
}
|
||||
prev := *state
|
||||
state.lastRestartCount = inspect.RestartCount
|
||||
state.seen = true
|
||||
worker.mu.Unlock()
|
||||
|
||||
emit := false
|
||||
switch {
|
||||
case prev.seen && inspect.RestartCount > prev.lastRestartCount:
|
||||
emit = true
|
||||
case inspect.Status != dockerStateRunning:
|
||||
emit = true
|
||||
case inspect.Health == dockerHealthUnhealthy:
|
||||
emit = true
|
||||
}
|
||||
if !emit {
|
||||
return
|
||||
}
|
||||
|
||||
worker.publish(ctx, ports.HealthEventEnvelope{
|
||||
GameID: record.GameID,
|
||||
ContainerID: record.CurrentContainerID,
|
||||
EventType: health.EventTypeInspectUnhealthy,
|
||||
OccurredAt: worker.clock().UTC(),
|
||||
Details: inspectUnhealthyDetails(inspect.RestartCount, inspect.Status, inspect.Health),
|
||||
})
|
||||
}
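
// Illustrative walk-through of the trigger rules above (grounded in the
// package tests, not additional behaviour): a first observation with
// RestartCount=2 only seeds the baseline; a second tick observing
// RestartCount=3 emits `inspect_unhealthy`, while Status="exited" or
// Health="unhealthy" emit on the very first observation.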

// publish emits one envelope through the configured publisher, updates
// the telemetry counter, and logs the outcome. Failures degrade to an
// error log per `rtmanager/README.md §Notification Contracts`.
func (worker *Worker) publish(ctx context.Context, envelope ports.HealthEventEnvelope) {
	if err := worker.healthEvents.Publish(ctx, envelope); err != nil {
		worker.logger.ErrorContext(ctx, "publish health event",
			"game_id", envelope.GameID,
			"container_id", envelope.ContainerID,
			"event_type", string(envelope.EventType),
			"err", err.Error(),
		)
		return
	}

	worker.telemetry.RecordHealthEvent(ctx, string(envelope.EventType))

	logArgs := []any{
		"game_id", envelope.GameID,
		"container_id", envelope.ContainerID,
		"event_type", string(envelope.EventType),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	worker.logger.InfoContext(ctx, "inspect event published", logArgs...)
}

// inspectUnhealthyDetails builds the JSON payload required by the
// `inspect_unhealthy` AsyncAPI variant. All three fields are required
// even when their value is the zero value.
func inspectUnhealthyDetails(restartCount int, state, health string) json.RawMessage {
	payload := struct {
		RestartCount int    `json:"restart_count"`
		State        string `json:"state"`
		Health       string `json:"health"`
	}{
		RestartCount: restartCount,
		State:        state,
		Health:       health,
	}
	encoded, _ := json.Marshal(payload)
	return encoded
}
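
// For example (illustrative), inspectUnhealthyDetails(3, "running", "")
// encodes to:
//
//	{"restart_count":3,"state":"running","health":""}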
@@ -0,0 +1,388 @@
package dockerinspect_test

import (
	"context"
	"encoding/json"
	"errors"
	"io"
	"log/slog"
	"sync"
	"testing"
	"time"

	"galaxy/rtmanager/internal/adapters/docker/mocks"
	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/telemetry"
	"galaxy/rtmanager/internal/worker/dockerinspect"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"go.uber.org/mock/gomock"
)

func silentLogger() *slog.Logger {
	return slog.New(slog.NewTextHandler(io.Discard, nil))
}

// fakeRuntimeRecords supports ListByStatus only.
type fakeRuntimeRecords struct {
	mu      sync.Mutex
	running []runtime.RuntimeRecord
	listErr error
}

func newFakeRuntimeRecords() *fakeRuntimeRecords { return &fakeRuntimeRecords{} }

func (s *fakeRuntimeRecords) Set(records ...runtime.RuntimeRecord) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.running = append([]runtime.RuntimeRecord(nil), records...)
}

func (s *fakeRuntimeRecords) Clear() {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.running = nil
}

func (s *fakeRuntimeRecords) Get(_ context.Context, _ string) (runtime.RuntimeRecord, error) {
	return runtime.RuntimeRecord{}, runtime.ErrNotFound
}
func (s *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error { return nil }
func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error {
	return nil
}
func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
	return nil, nil
}

func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, status runtime.Status) ([]runtime.RuntimeRecord, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.listErr != nil {
		return nil, s.listErr
	}
	if status != runtime.StatusRunning {
		return nil, nil
	}
	out := make([]runtime.RuntimeRecord, len(s.running))
	copy(out, s.running)
	return out, nil
}

// fakeHealthEvents captures every Publish call.
type fakeHealthEvents struct {
	mu         sync.Mutex
	published  []ports.HealthEventEnvelope
	publishErr error
}

func (s *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.publishErr != nil {
		return s.publishErr
	}
	s.published = append(s.published, envelope)
	return nil
}

func (s *fakeHealthEvents) Published() []ports.HealthEventEnvelope {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]ports.HealthEventEnvelope, len(s.published))
	copy(out, s.published)
	return out
}

// --- harness ----------------------------------------------------------

type harness struct {
	docker  *mocks.MockDockerClient
	records *fakeRuntimeRecords
	health  *fakeHealthEvents
	worker  *dockerinspect.Worker
	now     time.Time
}

func newHarness(t *testing.T) *harness {
	t.Helper()
	ctrl := gomock.NewController(t)
	t.Cleanup(ctrl.Finish)

	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)

	docker := mocks.NewMockDockerClient(ctrl)
	records := newFakeRuntimeRecords()
	healthEvents := &fakeHealthEvents{}
	now := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)

	worker, err := dockerinspect.NewWorker(dockerinspect.Dependencies{
		Docker:         docker,
		RuntimeRecords: records,
		HealthEvents:   healthEvents,
		Telemetry:      telemetryRuntime,
		Interval:       50 * time.Millisecond,
		Clock:          func() time.Time { return now },
		Logger:         silentLogger(),
	})
	require.NoError(t, err)

	return &harness{
		docker:  docker,
		records: records,
		health:  healthEvents,
		worker:  worker,
		now:     now,
	}
}

func runningRecord(gameID string) runtime.RuntimeRecord {
	startedAt := time.Date(2026, 4, 27, 11, 0, 0, 0, time.UTC)
	return runtime.RuntimeRecord{
		GameID:             gameID,
		Status:             runtime.StatusRunning,
		CurrentContainerID: "ctr-" + gameID,
		CurrentImageRef:    "galaxy/game:1.0.0",
		EngineEndpoint:     "http://galaxy-game-" + gameID + ":8080",
		StatePath:          "/var/lib/galaxy/games/" + gameID,
		DockerNetwork:      "galaxy-net",
		StartedAt:          &startedAt,
		LastOpAt:           startedAt,
		CreatedAt:          startedAt,
	}
}

// --- constructor ------------------------------------------------------

func TestNewWorkerRejectsMissingDeps(t *testing.T) {
	ctrl := gomock.NewController(t)
	t.Cleanup(ctrl.Finish)
	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)

	base := dockerinspect.Dependencies{
		Docker:         mocks.NewMockDockerClient(ctrl),
		RuntimeRecords: newFakeRuntimeRecords(),
		HealthEvents:   &fakeHealthEvents{},
		Telemetry:      telemetryRuntime,
		Interval:       time.Second,
	}

	defectives := []dockerinspect.Dependencies{
		{},
		{Docker: base.Docker},
		{Docker: base.Docker, RuntimeRecords: base.RuntimeRecords},
		{Docker: base.Docker, RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents},
		{Docker: base.Docker, RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents, Telemetry: base.Telemetry},
	}
	for index, deps := range defectives {
		_, err := dockerinspect.NewWorker(deps)
		require.Errorf(t, err, "case %d should fail", index)
	}

	_, err = dockerinspect.NewWorker(base)
	require.NoError(t, err)
}

// --- behaviour --------------------------------------------------------

func TestTickFirstObservationOnlySeedsBaseline(t *testing.T) {
	h := newHarness(t)
	h.records.Set(runningRecord("game-a"))

	h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-a").Return(ports.ContainerInspect{
		ID:           "ctr-game-a",
		Status:       "running",
		Health:       "",
		RestartCount: 2,
	}, nil)

	h.worker.Tick(context.Background())
	assert.Empty(t, h.health.Published(), "first observation seeds baseline only")
}

func TestTickRestartCountGrowthEmits(t *testing.T) {
	h := newHarness(t)
	h.records.Set(runningRecord("game-a"))

	gomock.InOrder(
		h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-a").Return(ports.ContainerInspect{
			ID: "ctr-game-a", Status: "running", RestartCount: 2,
		}, nil),
		h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-a").Return(ports.ContainerInspect{
			ID: "ctr-game-a", Status: "running", RestartCount: 3,
		}, nil),
	)

	h.worker.Tick(context.Background())
	h.worker.Tick(context.Background())

	envelopes := h.health.Published()
	require.Len(t, envelopes, 1)
	envelope := envelopes[0]
	assert.Equal(t, health.EventTypeInspectUnhealthy, envelope.EventType)
	assert.Equal(t, "game-a", envelope.GameID)
	assert.Equal(t, "ctr-game-a", envelope.ContainerID)

	var details struct {
		RestartCount int    `json:"restart_count"`
		State        string `json:"state"`
		Health       string `json:"health"`
	}
	require.NoError(t, json.Unmarshal(envelope.Details, &details))
	assert.Equal(t, 3, details.RestartCount)
	assert.Equal(t, "running", details.State)
	assert.Empty(t, details.Health)
}

func TestTickStateNotRunningEmits(t *testing.T) {
	h := newHarness(t)
	h.records.Set(runningRecord("game-a"))

	h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-a").Return(ports.ContainerInspect{
		ID:           "ctr-game-a",
		Status:       "exited",
		Health:       "",
		RestartCount: 0,
	}, nil)

	h.worker.Tick(context.Background())
	envelopes := h.health.Published()
	require.Len(t, envelopes, 1, "state != running emits even on first observation")
	envelope := envelopes[0]
	assert.Equal(t, health.EventTypeInspectUnhealthy, envelope.EventType)

	var details struct {
		RestartCount int    `json:"restart_count"`
		State        string `json:"state"`
		Health       string `json:"health"`
	}
	require.NoError(t, json.Unmarshal(envelope.Details, &details))
	assert.Equal(t, "exited", details.State)
}

func TestTickHealthUnhealthyEmits(t *testing.T) {
	h := newHarness(t)
	h.records.Set(runningRecord("game-a"))

	h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-a").Return(ports.ContainerInspect{
		ID:           "ctr-game-a",
		Status:       "running",
		Health:       "unhealthy",
		RestartCount: 0,
	}, nil)

	h.worker.Tick(context.Background())
	envelopes := h.health.Published()
	require.Len(t, envelopes, 1, "Health == unhealthy emits even on first observation")
	envelope := envelopes[0]
	assert.Equal(t, health.EventTypeInspectUnhealthy, envelope.EventType)

	var details struct {
		Health string `json:"health"`
	}
	require.NoError(t, json.Unmarshal(envelope.Details, &details))
	assert.Equal(t, "unhealthy", details.Health)
}

func TestTickHealthyDoesNotEmitOnSecondPass(t *testing.T) {
	h := newHarness(t)
	h.records.Set(runningRecord("game-a"))

	gomock.InOrder(
		h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-a").Return(ports.ContainerInspect{
			ID: "ctr-game-a", Status: "running", RestartCount: 5,
		}, nil),
		h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-a").Return(ports.ContainerInspect{
			ID: "ctr-game-a", Status: "running", RestartCount: 5,
		}, nil),
	)

	h.worker.Tick(context.Background())
	h.worker.Tick(context.Background())
	assert.Empty(t, h.health.Published(), "stable healthy observations must not emit")
}

func TestTickContainerNotFoundIsSilent(t *testing.T) {
	h := newHarness(t)
	h.records.Set(runningRecord("game-a"))

	h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-a").Return(ports.ContainerInspect{}, ports.ErrContainerNotFound)

	h.worker.Tick(context.Background())
	assert.Empty(t, h.health.Published(), "ErrContainerNotFound must not emit; reconciler handles drift")
}

func TestTickArbitraryInspectErrorIsAbsorbed(t *testing.T) {
	h := newHarness(t)
	h.records.Set(runningRecord("game-a"))

	h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-a").Return(ports.ContainerInspect{}, errors.New("docker daemon broken"))

	require.NotPanics(t, func() { h.worker.Tick(context.Background()) })
	assert.Empty(t, h.health.Published())
}

func TestTickPrunesStateForGamesNoLongerRunning(t *testing.T) {
	h := newHarness(t)
	h.records.Set(runningRecord("game-a"))

	gomock.InOrder(
		h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-a").Return(ports.ContainerInspect{
			ID: "ctr-game-a", Status: "running", RestartCount: 5,
		}, nil),
		// After the game leaves running and re-enters, baseline must be
		// reset; a smaller RestartCount must NOT emit (no delta from a
		// stale state).
		h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-a").Return(ports.ContainerInspect{
			ID: "ctr-game-a", Status: "running", RestartCount: 1,
		}, nil),
	)

	h.worker.Tick(context.Background())
	h.records.Clear()
	h.worker.Tick(context.Background())
	h.records.Set(runningRecord("game-a"))
	h.worker.Tick(context.Background())

	assert.Empty(t, h.health.Published(), "fresh baseline after re-running must not compare against stale lastRestartCount")
}

func TestTickAbsorbsListError(t *testing.T) {
	h := newHarness(t)
	h.records.listErr = errors.New("pg down")

	require.NotPanics(t, func() { h.worker.Tick(context.Background()) })
	assert.Empty(t, h.health.Published())
}

func TestRunRespectsContextCancel(t *testing.T) {
	h := newHarness(t)
	ctx, cancel := context.WithCancel(context.Background())
	done := make(chan error, 1)
	go func() { done <- h.worker.Run(ctx) }()

	cancel()
	select {
	case err := <-done:
		assert.ErrorIs(t, err, context.Canceled)
	case <-time.After(time.Second):
		t.Fatalf("Run did not exit after cancel")
	}
}

func TestShutdownIsNoOp(t *testing.T) {
	h := newHarness(t)
	require.NoError(t, h.worker.Shutdown(context.Background()))
}

// --- compile-time safety ----------------------------------------------

var (
	_ ports.RuntimeRecordStore   = (*fakeRuntimeRecords)(nil)
	_ ports.HealthEventPublisher = (*fakeHealthEvents)(nil)
)
@@ -0,0 +1,411 @@
// Package healthprobe runs the active HTTP `/healthz` probe described in
// `rtmanager/README.md §Health Monitoring`.
//
// On every tick the worker lists `runtime_records.status=running`,
// probes each engine endpoint in parallel (capped at
// defaultMaxConcurrency), and applies the
// RTMANAGER_PROBE_FAILURES_THRESHOLD hysteresis to emit `probe_failed`
// (after N consecutive failures) and `probe_recovered` (on the first
// success after a `probe_failed` was published). In-memory state is
// pruned at the start of every tick against the freshly-read running
// list, so a game that stops between ticks never accumulates stale
// failure counters.
//
// Design rationale is captured in
// `rtmanager/docs/workers.md`.
package healthprobe

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"log/slog"
	"net/http"
	"strings"
	"sync"
	"time"

	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/logging"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/telemetry"
)

// defaultMaxConcurrency caps the number of in-flight `/healthz`
// requests inside a single tick. RTM v1 is single-instance with a
// modest active-game count; the cap keeps a slow engine from delaying
// the rest of the cohort while preventing pathological fan-out if the
// running list grows.
const defaultMaxConcurrency = 16

// healthzPath is the engine probe path. Stable per
// `game/README.md §/healthz`.
const healthzPath = "/healthz"

// Dependencies groups the collaborators required by Worker.
type Dependencies struct {
	// RuntimeRecords lists running games on every tick.
	RuntimeRecords ports.RuntimeRecordStore

	// HealthEvents emits `probe_failed` and `probe_recovered`.
	HealthEvents ports.HealthEventPublisher

	// HTTPClient performs the engine `/healthz` request. Required.
	// Production wiring supplies an `otelhttp`-instrumented client.
	HTTPClient *http.Client

	// Telemetry records one health-event counter per emission.
	Telemetry *telemetry.Runtime

	// Interval bounds the tick period.
	Interval time.Duration

	// ProbeTimeout bounds one engine `/healthz` call.
	ProbeTimeout time.Duration

	// FailuresThreshold is the consecutive-failure count that promotes
	// the in-memory counter to a `probe_failed` emission.
	FailuresThreshold int

	// MaxConcurrency caps the number of in-flight probes per tick.
	// Defaults to defaultMaxConcurrency when zero or negative.
	MaxConcurrency int

	// Clock supplies the wall-clock used for emission timestamps.
	// Defaults to `time.Now` when nil.
	Clock func() time.Time

	// Logger receives structured worker-level events. Defaults to
	// `slog.Default()` when nil.
	Logger *slog.Logger
}

// Worker drives the periodic active-probe loop.
type Worker struct {
	runtimeRecords ports.RuntimeRecordStore
	healthEvents   ports.HealthEventPublisher
	httpClient     *http.Client
	telemetry      *telemetry.Runtime

	interval          time.Duration
	probeTimeout      time.Duration
	failuresThreshold int
	maxConcurrency    int

	clock  func() time.Time
	logger *slog.Logger

	mu     sync.Mutex
	states map[string]*probeState
}

// probeState stores the per-game hysteresis counters. Owned by Worker
// and protected by Worker.mu.
type probeState struct {
	consecutiveFailures int
	failurePublished    bool
}
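
// Illustrative hysteresis walk-through with FailuresThreshold=3 (the
// value the package tests use, not additional behaviour): failures on
// ticks 1-2 only increment consecutiveFailures; the third failure sets
// failurePublished and emits `probe_failed`; further failures are
// suppressed; a success on the next tick resets both fields and emits
// `probe_recovered` with prior_failure_count=3.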

// NewWorker constructs one Worker from deps.
func NewWorker(deps Dependencies) (*Worker, error) {
	switch {
	case deps.RuntimeRecords == nil:
		return nil, errors.New("new health probe worker: nil runtime records store")
	case deps.HealthEvents == nil:
		return nil, errors.New("new health probe worker: nil health events publisher")
	case deps.HTTPClient == nil:
		return nil, errors.New("new health probe worker: nil http client")
	case deps.Telemetry == nil:
		return nil, errors.New("new health probe worker: nil telemetry runtime")
	case deps.Interval <= 0:
		return nil, errors.New("new health probe worker: interval must be positive")
	case deps.ProbeTimeout <= 0:
		return nil, errors.New("new health probe worker: probe timeout must be positive")
	case deps.FailuresThreshold <= 0:
		return nil, errors.New("new health probe worker: failures threshold must be positive")
	}

	clock := deps.Clock
	if clock == nil {
		clock = time.Now
	}
	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}
	maxConcurrency := deps.MaxConcurrency
	if maxConcurrency <= 0 {
		maxConcurrency = defaultMaxConcurrency
	}

	return &Worker{
		runtimeRecords:    deps.RuntimeRecords,
		healthEvents:      deps.HealthEvents,
		httpClient:        deps.HTTPClient,
		telemetry:         deps.Telemetry,
		interval:          deps.Interval,
		probeTimeout:      deps.ProbeTimeout,
		failuresThreshold: deps.FailuresThreshold,
		maxConcurrency:    maxConcurrency,
		clock:             clock,
		logger:            logger.With("worker", "rtmanager.healthprobe"),
		states:            map[string]*probeState{},
	}, nil
}

// Run drives the probe loop until ctx is cancelled. Per-tick errors are
// absorbed; the loop only exits on context cancellation.
func (worker *Worker) Run(ctx context.Context) error {
	if worker == nil {
		return errors.New("run health probe worker: nil worker")
	}
	if ctx == nil {
		return errors.New("run health probe worker: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}

	worker.logger.Info("health probe worker started",
		"interval", worker.interval.String(),
		"probe_timeout", worker.probeTimeout.String(),
		"failures_threshold", worker.failuresThreshold,
		"max_concurrency", worker.maxConcurrency,
	)
	defer worker.logger.Info("health probe worker stopped")

	ticker := time.NewTicker(worker.interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			worker.tick(ctx)
		}
	}
}

// Shutdown is a no-op; Run terminates on context cancellation.
func (worker *Worker) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown health probe worker: nil context")
	}
	return nil
}

// Tick performs one probe pass. Exported so tests can drive the worker
// deterministically without spinning a real ticker.
func (worker *Worker) Tick(ctx context.Context) {
	worker.tick(ctx)
}

// tick performs one full pass: list running records, prune state for
// stopped games, then probe every running game in parallel.
func (worker *Worker) tick(ctx context.Context) {
	if err := ctx.Err(); err != nil {
		return
	}

	records, err := worker.runtimeRecords.ListByStatus(ctx, runtime.StatusRunning)
	if err != nil {
		worker.logger.WarnContext(ctx, "list running records",
			"err", err.Error(),
		)
		return
	}

	worker.pruneStates(records)

	if len(records) == 0 {
		return
	}

	// Bounded fan-out: the buffered channel is a counting semaphore, so
	// at most maxConcurrency probes are in flight at any moment.
	semaphore := make(chan struct{}, worker.maxConcurrency)
	var waitGroup sync.WaitGroup
	for _, record := range records {
		select {
		case <-ctx.Done():
			waitGroup.Wait()
			return
		case semaphore <- struct{}{}:
		}
		waitGroup.Add(1)
		go func(record runtime.RuntimeRecord) {
			defer waitGroup.Done()
			defer func() { <-semaphore }()
			worker.probeOne(ctx, record)
		}(record)
	}
	waitGroup.Wait()
}

// pruneStates removes per-game state for games no longer in the running
// list. Stopped or removed games therefore start with a clean counter
// the next time they re-enter `running`.
func (worker *Worker) pruneStates(records []runtime.RuntimeRecord) {
	worker.mu.Lock()
	defer worker.mu.Unlock()
	if len(worker.states) == 0 {
		return
	}
	running := make(map[string]struct{}, len(records))
	for _, record := range records {
		running[record.GameID] = struct{}{}
	}
	for gameID := range worker.states {
		if _, ok := running[gameID]; !ok {
			delete(worker.states, gameID)
		}
	}
}

// probeOne issues one `/healthz` request and updates hysteresis state.
func (worker *Worker) probeOne(ctx context.Context, record runtime.RuntimeRecord) {
	probeCtx, cancel := context.WithTimeout(ctx, worker.probeTimeout)
	defer cancel()

	endpoint := strings.TrimRight(record.EngineEndpoint, "/") + healthzPath
	request, err := http.NewRequestWithContext(probeCtx, http.MethodGet, endpoint, nil)
	if err != nil {
		worker.recordFailure(ctx, record, 0, fmt.Errorf("build request: %w", err))
		return
	}

	response, err := worker.httpClient.Do(request)
	if err != nil {
		worker.recordFailure(ctx, record, 0, err)
		return
	}
	defer response.Body.Close()

	if response.StatusCode == http.StatusOK {
		worker.recordSuccess(ctx, record)
		return
	}
	worker.recordFailure(ctx, record, response.StatusCode, fmt.Errorf("unexpected status %d", response.StatusCode))
}
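
// For example (illustrative), a record whose EngineEndpoint is
// "http://engine:8080/" is probed at "http://engine:8080/healthz";
// trailing slashes are trimmed first so the path is never doubled.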

// recordSuccess updates state on a successful probe and emits
// `probe_recovered` when the prior tick had crossed the failure
// threshold.
func (worker *Worker) recordSuccess(ctx context.Context, record runtime.RuntimeRecord) {
	worker.mu.Lock()
	state, ok := worker.states[record.GameID]
	if !ok {
		worker.mu.Unlock()
		return
	}
	if !state.failurePublished {
		state.consecutiveFailures = 0
		worker.mu.Unlock()
		return
	}
	priorFailureCount := state.consecutiveFailures
	state.consecutiveFailures = 0
	state.failurePublished = false
	worker.mu.Unlock()

	worker.publish(ctx, ports.HealthEventEnvelope{
		GameID:      record.GameID,
		ContainerID: record.CurrentContainerID,
		EventType:   health.EventTypeProbeRecovered,
		OccurredAt:  worker.clock().UTC(),
		Details:     probeRecoveredDetails(priorFailureCount),
	})
}

// recordFailure updates state on a failed probe and emits
// `probe_failed` once the threshold is crossed.
func (worker *Worker) recordFailure(ctx context.Context, record runtime.RuntimeRecord, lastStatus int, lastErr error) {
	worker.mu.Lock()
	state, ok := worker.states[record.GameID]
	if !ok {
		state = &probeState{}
		worker.states[record.GameID] = state
	}
	state.consecutiveFailures++
	if state.failurePublished || state.consecutiveFailures < worker.failuresThreshold {
		count := state.consecutiveFailures
		worker.mu.Unlock()
		worker.logger.DebugContext(ctx, "probe failure",
			"game_id", record.GameID,
			"consecutive_failures", count,
			"threshold", worker.failuresThreshold,
			"err", errString(lastErr),
		)
		return
	}
	state.failurePublished = true
	count := state.consecutiveFailures
	worker.mu.Unlock()

	worker.publish(ctx, ports.HealthEventEnvelope{
		GameID:      record.GameID,
		ContainerID: record.CurrentContainerID,
		EventType:   health.EventTypeProbeFailed,
		OccurredAt:  worker.clock().UTC(),
		Details:     probeFailedDetails(count, lastStatus, errString(lastErr)),
	})
}

// publish emits one envelope through the configured publisher, updates
// the telemetry counter, and logs the outcome. Failures degrade to an
// error log per `rtmanager/README.md §Notification Contracts`.
func (worker *Worker) publish(ctx context.Context, envelope ports.HealthEventEnvelope) {
	if err := worker.healthEvents.Publish(ctx, envelope); err != nil {
		worker.logger.ErrorContext(ctx, "publish health event",
			"game_id", envelope.GameID,
			"container_id", envelope.ContainerID,
			"event_type", string(envelope.EventType),
			"err", err.Error(),
		)
		return
	}

	worker.telemetry.RecordHealthEvent(ctx, string(envelope.EventType))

	logArgs := []any{
		"game_id", envelope.GameID,
		"container_id", envelope.ContainerID,
		"event_type", string(envelope.EventType),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	worker.logger.InfoContext(ctx, "probe event published", logArgs...)
}

// probeFailedDetails builds the JSON payload required by the
// `probe_failed` AsyncAPI variant.
func probeFailedDetails(consecutiveFailures, lastStatus int, lastError string) json.RawMessage {
	payload := struct {
		ConsecutiveFailures int    `json:"consecutive_failures"`
		LastStatus          int    `json:"last_status"`
		LastError           string `json:"last_error"`
	}{
		ConsecutiveFailures: consecutiveFailures,
		LastStatus:          lastStatus,
		LastError:           lastError,
	}
	encoded, _ := json.Marshal(payload)
	return encoded
}
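
// For example (illustrative), probeFailedDetails(3, 500, "unexpected status 500")
// encodes to:
//
//	{"consecutive_failures":3,"last_status":500,"last_error":"unexpected status 500"}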

// probeRecoveredDetails builds the JSON payload required by the
// `probe_recovered` AsyncAPI variant.
func probeRecoveredDetails(priorFailureCount int) json.RawMessage {
	payload := struct {
		PriorFailureCount int `json:"prior_failure_count"`
	}{PriorFailureCount: priorFailureCount}
	encoded, _ := json.Marshal(payload)
	return encoded
}
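
// For example (illustrative), probeRecoveredDetails(3) encodes to:
//
//	{"prior_failure_count":3}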

func errString(err error) string {
	if err == nil {
		return ""
	}
	return err.Error()
}
@@ -0,0 +1,417 @@
package healthprobe_test

import (
	"context"
	"encoding/json"
	"errors"
	"io"
	"log/slog"
	"net/http"
	"net/http/httptest"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/telemetry"
	"galaxy/rtmanager/internal/worker/healthprobe"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func silentLogger() *slog.Logger {
	return slog.New(slog.NewTextHandler(io.Discard, nil))
}

// fakeRuntimeRecords supports List/ListByStatus only; the worker does
// not call other methods.
type fakeRuntimeRecords struct {
	mu      sync.Mutex
	running []runtime.RuntimeRecord
	listErr error
}

func newFakeRuntimeRecords() *fakeRuntimeRecords { return &fakeRuntimeRecords{} }

func (s *fakeRuntimeRecords) Set(records ...runtime.RuntimeRecord) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.running = append([]runtime.RuntimeRecord(nil), records...)
}

func (s *fakeRuntimeRecords) Clear() {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.running = nil
}

func (s *fakeRuntimeRecords) Get(_ context.Context, _ string) (runtime.RuntimeRecord, error) {
	return runtime.RuntimeRecord{}, runtime.ErrNotFound
}
func (s *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error { return nil }
func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error {
	return nil
}
func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
	return nil, nil
}

func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, status runtime.Status) ([]runtime.RuntimeRecord, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.listErr != nil {
		return nil, s.listErr
	}
	if status != runtime.StatusRunning {
		return nil, nil
	}
	out := make([]runtime.RuntimeRecord, len(s.running))
	copy(out, s.running)
	return out, nil
}

// fakeHealthEvents captures every Publish call.
type fakeHealthEvents struct {
	mu         sync.Mutex
	published  []ports.HealthEventEnvelope
	publishErr error
}

func (s *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.publishErr != nil {
		return s.publishErr
	}
	s.published = append(s.published, envelope)
	return nil
}

func (s *fakeHealthEvents) Published() []ports.HealthEventEnvelope {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]ports.HealthEventEnvelope, len(s.published))
	copy(out, s.published)
	return out
}

// engineServer is a per-game HTTP fake controlled by tests.
type engineServer struct {
	server   *httptest.Server
	status   atomic.Int32
	requests atomic.Int32
}

func newEngineServer(t *testing.T) *engineServer {
	t.Helper()
	es := &engineServer{}
	es.status.Store(http.StatusOK)
	es.server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		es.requests.Add(1)
		w.WriteHeader(int(es.status.Load()))
	}))
	t.Cleanup(es.server.Close)
	return es
}

func (e *engineServer) URL() string { return e.server.URL }

func (e *engineServer) SetStatus(code int) { e.status.Store(int32(code)) }

func (e *engineServer) Stop() { e.server.Close() }

// --- harness ----------------------------------------------------------

type harness struct {
	records *fakeRuntimeRecords
	health  *fakeHealthEvents
	worker  *healthprobe.Worker
	now     time.Time
}

func newHarness(t *testing.T) *harness {
	t.Helper()
	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)

	records := newFakeRuntimeRecords()
	healthEvents := &fakeHealthEvents{}

	worker, err := healthprobe.NewWorker(healthprobe.Dependencies{
		RuntimeRecords:    records,
		HealthEvents:      healthEvents,
		HTTPClient:        &http.Client{},
		Telemetry:         telemetryRuntime,
		Interval:          50 * time.Millisecond,
		ProbeTimeout:      100 * time.Millisecond,
		FailuresThreshold: 3,
		MaxConcurrency:    4,
		Clock:             func() time.Time { return time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC) },
		Logger:            silentLogger(),
	})
	require.NoError(t, err)

	return &harness{
		records: records,
		health:  healthEvents,
		worker:  worker,
		now:     time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC),
	}
}

func runningRecord(gameID, endpoint string) runtime.RuntimeRecord {
	startedAt := time.Date(2026, 4, 27, 11, 0, 0, 0, time.UTC)
	return runtime.RuntimeRecord{
		GameID:             gameID,
		Status:             runtime.StatusRunning,
		CurrentContainerID: "ctr-" + gameID,
		CurrentImageRef:    "galaxy/game:1.0.0",
		EngineEndpoint:     endpoint,
		StatePath:          "/var/lib/galaxy/games/" + gameID,
		DockerNetwork:      "galaxy-net",
		StartedAt:          &startedAt,
		LastOpAt:           startedAt,
		CreatedAt:          startedAt,
	}
}

// --- constructor -------------------------------------------------------

func TestNewWorkerRejectsMissingDeps(t *testing.T) {
	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)

	base := healthprobe.Dependencies{
		RuntimeRecords:    newFakeRuntimeRecords(),
		HealthEvents:      &fakeHealthEvents{},
		HTTPClient:        &http.Client{},
		Telemetry:         telemetryRuntime,
		Interval:          time.Second,
		ProbeTimeout:      time.Second,
		FailuresThreshold: 1,
	}

	defectives := []healthprobe.Dependencies{
		{},
		{RuntimeRecords: base.RuntimeRecords},
		{RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents},
		{RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents, HTTPClient: base.HTTPClient},
		{RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents, HTTPClient: base.HTTPClient, Telemetry: base.Telemetry},
		{RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents, HTTPClient: base.HTTPClient, Telemetry: base.Telemetry, Interval: time.Second},
		{RuntimeRecords: base.RuntimeRecords, HealthEvents: base.HealthEvents, HTTPClient: base.HTTPClient, Telemetry: base.Telemetry, Interval: time.Second, ProbeTimeout: time.Second},
	}
	for index, deps := range defectives {
		_, err := healthprobe.NewWorker(deps)
		require.Errorf(t, err, "case %d should fail", index)
	}

	_, err = healthprobe.NewWorker(base)
	require.NoError(t, err)
}

// --- behaviour --------------------------------------------------------

func TestTickHealthyDoesNotEmit(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)

	h.records.Set(runningRecord("game-a", engine.URL()))
	h.worker.Tick(context.Background())

	assert.Empty(t, h.health.Published(), "successful probe must not emit events")
	assert.Equal(t, int32(1), engine.requests.Load(), "exactly one probe request")
}

func TestTickFailureBelowThresholdDoesNotEmit(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusServiceUnavailable)

	h.records.Set(runningRecord("game-a", engine.URL()))
	h.worker.Tick(context.Background())
	h.worker.Tick(context.Background())

	assert.Empty(t, h.health.Published(), "two failures below threshold must not emit")
}

func TestTickFailuresCrossingThresholdEmitProbeFailedOnce(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)

	h.records.Set(runningRecord("game-a", engine.URL()))

	for range 5 {
		h.worker.Tick(context.Background())
	}

	envelopes := h.health.Published()
	require.Len(t, envelopes, 1, "probe_failed must publish exactly once across many failures")
	envelope := envelopes[0]
	assert.Equal(t, health.EventTypeProbeFailed, envelope.EventType)
	assert.Equal(t, "game-a", envelope.GameID)
	assert.Equal(t, "ctr-game-a", envelope.ContainerID)

	var details struct {
		ConsecutiveFailures int    `json:"consecutive_failures"`
		LastStatus          int    `json:"last_status"`
		LastError           string `json:"last_error"`
	}
	require.NoError(t, json.Unmarshal(envelope.Details, &details))
	assert.Equal(t, 3, details.ConsecutiveFailures, "consecutive_failures equals threshold at first emission")
	assert.Equal(t, http.StatusInternalServerError, details.LastStatus)
	assert.NotEmpty(t, details.LastError)
}

func TestTickRecoveryEmitsProbeRecoveredWithPriorFailureCount(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)

	h.records.Set(runningRecord("game-a", engine.URL()))

	for range 3 {
		h.worker.Tick(context.Background())
	}
	require.Len(t, h.health.Published(), 1, "expect probe_failed after threshold")

	engine.SetStatus(http.StatusOK)
	h.worker.Tick(context.Background())

	envelopes := h.health.Published()
	require.Len(t, envelopes, 2, "recovery must emit exactly one probe_recovered")
	envelope := envelopes[1]
	assert.Equal(t, health.EventTypeProbeRecovered, envelope.EventType)

	var details struct {
		PriorFailureCount int `json:"prior_failure_count"`
	}
	require.NoError(t, json.Unmarshal(envelope.Details, &details))
	assert.Equal(t, 3, details.PriorFailureCount)
}

func TestTickFlappingDoesNotDoublePublishProbeFailed(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)

	h.records.Set(runningRecord("game-a", engine.URL()))
	for range 5 {
		h.worker.Tick(context.Background())
	}
	require.Len(t, h.health.Published(), 1)

	// New failure after probe_failed has been published: must not emit again.
	h.worker.Tick(context.Background())
	assert.Len(t, h.health.Published(), 1, "no new probe_failed while already in failed state")
}

func TestTickPrunesStateForGamesNoLongerRunning(t *testing.T) {
	h := newHarness(t)
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)

	h.records.Set(runningRecord("game-a", engine.URL()))
	for range 3 {
		h.worker.Tick(context.Background())
	}
	require.Len(t, h.health.Published(), 1, "probe_failed published before stop")

	// Game leaves running; state must be pruned.
	h.records.Clear()
	h.worker.Tick(context.Background())

	// Re-introduce the same game: counter starts fresh, new failures
	// must accumulate from zero before another probe_failed fires.
	h.records.Set(runningRecord("game-a", engine.URL()))
	h.worker.Tick(context.Background())
	h.worker.Tick(context.Background())
	assert.Len(t, h.health.Published(), 1, "fresh state must require threshold failures again")

	h.worker.Tick(context.Background())
	assert.Len(t, h.health.Published(), 2, "third fresh failure crosses threshold")
}

func TestTickProbesMultipleGamesConcurrently(t *testing.T) {
	h := newHarness(t)

	// Two slow engines that simulate noticeable latency. Sequential
	// execution would take 2*latency; parallel finishes near 1*latency.
	const latency = 80 * time.Millisecond
	makeSlowEngine := func() *httptest.Server {
		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
			time.Sleep(latency)
			w.WriteHeader(http.StatusOK)
		}))
		t.Cleanup(server.Close)
		return server
	}
	a := makeSlowEngine()
	b := makeSlowEngine()

	h.records.Set(
		runningRecord("game-a", a.URL),
		runningRecord("game-b", b.URL),
	)

	start := time.Now()
	h.worker.Tick(context.Background())
	elapsed := time.Since(start)

	assert.Less(t, elapsed, 2*latency, "probes must run concurrently, not sequentially")
}

func TestTickAbsorbsListError(t *testing.T) {
	h := newHarness(t)
	h.records.listErr = errors.New("pg down")

	require.NotPanics(t, func() { h.worker.Tick(context.Background()) })
	assert.Empty(t, h.health.Published())
}

func TestTickAbsorbsPublishError(t *testing.T) {
	h := newHarness(t)
	h.health.publishErr = errors.New("redis down")
	engine := newEngineServer(t)
	engine.SetStatus(http.StatusInternalServerError)

	h.records.Set(runningRecord("game-a", engine.URL()))
	for range 3 {
		h.worker.Tick(context.Background())
	}
	// Publish failures are absorbed: the fake records no envelope, and
	// the worker keeps ticking without panicking.
	assert.Empty(t, h.health.Published())
}

func TestRunRespectsContextCancel(t *testing.T) {
	h := newHarness(t)

	ctx, cancel := context.WithCancel(context.Background())
	done := make(chan error, 1)
	go func() { done <- h.worker.Run(ctx) }()

	cancel()
	select {
	case err := <-done:
		assert.ErrorIs(t, err, context.Canceled)
	case <-time.After(time.Second):
		t.Fatalf("Run did not exit after cancel")
	}
}

func TestShutdownIsNoOp(t *testing.T) {
	h := newHarness(t)
	require.NoError(t, h.worker.Shutdown(context.Background()))
}

// --- compile-time safety ----------------------------------------------

var (
	_ ports.RuntimeRecordStore   = (*fakeRuntimeRecords)(nil)
	_ ports.HealthEventPublisher = (*fakeHealthEvents)(nil)
)
@@ -0,0 +1,678 @@
// Package reconcile implements the drift reconciliation worker
// described in `rtmanager/README.md §Reconciliation`. The reconciler
// is the single authority that brings `runtime_records` into agreement
// with the Docker daemon's view of `com.galaxy.owner=rtmanager`
// containers.
//
// Three drift kinds are handled:
//
//   - Adopt — a running container labelled `com.galaxy.owner=rtmanager`
//     has no matching `runtime_records` row. The reconciler inserts a
//     `status=running` record (`op_kind=reconcile_adopt`).
//   - Dispose — a `status=running` row whose `current_container_id` is
//     no longer reported by Docker. The reconciler updates the row to
//     `status=removed`, publishes `runtime:health_events`
//     `container_disappeared`, and appends `reconcile_dispose`.
//   - Observed exited — a `status=running` row whose container exists
//     but reports `State.Status=exited`. The reconciler transitions
//     the row to `status=stopped` and publishes `container_exited`
//     with the observed exit code. No `operation_log` entry is written
//     because `OpKind` does not include a value for this transition;
//     it is reflected in `rtmanager.reconcile_drift{kind=observed_exited}`
//     instead.
//
// All write decisions for a given `game_id` are guarded by the per-game
// Redis lease; the read pass that lists Docker containers and PG
// records is lockless.
//
// The reconciler runs once synchronously at process start
// (`ReconcileNow`) before any other worker is allowed to start, and
// then periodically via `Run` as an `app.Component`. Design rationale
// is captured in `rtmanager/docs/workers.md`.
package reconcile

import (
	"context"
	"crypto/rand"
	"encoding/base64"
	"encoding/json"
	"errors"
	"fmt"
	"log/slog"
	"path/filepath"
	"strconv"
	"time"

	"galaxy/rtmanager/internal/config"
	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/logging"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/startruntime"
	"galaxy/rtmanager/internal/telemetry"
)

// dockerStateRunning is the verbatim Docker `State.Status` value the
// reconciler treats as "the container is alive".
const dockerStateRunning = "running"

// dockerStateExited is the verbatim Docker `State.Status` value the
// reconciler treats as "the container has terminated".
const dockerStateExited = "exited"

// driftKindAdopt / driftKindDispose / driftKindObservedExited match the
// `kind` label vocabulary on `rtmanager.reconcile_drift`.
const (
	driftKindAdopt          = "adopt"
	driftKindDispose        = "dispose"
	driftKindObservedExited = "observed_exited"
)
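
// Illustrative mapping from observation to drift kind (a restatement of
// the package comment, not additional behaviour):
//
//	Docker running, no PG row                  → adopt
//	PG running, container gone from Docker     → dispose
//	PG running, container State.Status=exited  → observed_exited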

// leaseReleaseTimeout bounds the deferred lease-release call. A fresh
// background context is used so the release runs even if the request
// context was already canceled.
const leaseReleaseTimeout = 5 * time.Second

// Dependencies groups the collaborators required by Reconciler.
type Dependencies struct {
	Docker         ports.DockerClient
	RuntimeRecords ports.RuntimeRecordStore
	OperationLogs  ports.OperationLogStore
	HealthEvents   ports.HealthEventPublisher
	Leases         ports.GameLeaseStore

	Telemetry *telemetry.Runtime

	DockerCfg    config.DockerConfig
	ContainerCfg config.ContainerConfig
	Coordination config.CoordinationConfig

	// Interval bounds the periodic tick. ReconcileNow ignores it.
	Interval time.Duration

	Clock    func() time.Time
	Logger   *slog.Logger
	NewToken func() string
}

// Reconciler drives both the synchronous initial pass and the periodic
// drift reconciliation loop.
type Reconciler struct {
	docker         ports.DockerClient
	runtimeRecords ports.RuntimeRecordStore
	operationLogs  ports.OperationLogStore
	healthEvents   ports.HealthEventPublisher
	leases         ports.GameLeaseStore

	telemetry *telemetry.Runtime

	dockerNetwork string
	stateRoot     string
	leaseTTL      time.Duration

	interval time.Duration

	clock    func() time.Time
	logger   *slog.Logger
	newToken func() string
}

// NewReconciler constructs one Reconciler from deps.
func NewReconciler(deps Dependencies) (*Reconciler, error) {
	switch {
	case deps.Docker == nil:
		return nil, errors.New("new reconciler: nil docker client")
	case deps.RuntimeRecords == nil:
		return nil, errors.New("new reconciler: nil runtime records store")
	case deps.OperationLogs == nil:
		return nil, errors.New("new reconciler: nil operation log store")
	case deps.HealthEvents == nil:
		return nil, errors.New("new reconciler: nil health events publisher")
	case deps.Leases == nil:
		return nil, errors.New("new reconciler: nil lease store")
	case deps.Telemetry == nil:
		return nil, errors.New("new reconciler: nil telemetry runtime")
	case deps.Interval <= 0:
		return nil, errors.New("new reconciler: interval must be positive")
	}
	if err := deps.DockerCfg.Validate(); err != nil {
		return nil, fmt.Errorf("new reconciler: docker config: %w", err)
	}
	if err := deps.ContainerCfg.Validate(); err != nil {
		return nil, fmt.Errorf("new reconciler: container config: %w", err)
	}
	if err := deps.Coordination.Validate(); err != nil {
		return nil, fmt.Errorf("new reconciler: coordination config: %w", err)
	}

	clock := deps.Clock
	if clock == nil {
		clock = time.Now
	}
	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}
	newToken := deps.NewToken
	if newToken == nil {
		newToken = defaultTokenGenerator()
	}

	return &Reconciler{
		docker:         deps.Docker,
		runtimeRecords: deps.RuntimeRecords,
		operationLogs:  deps.OperationLogs,
		healthEvents:   deps.HealthEvents,
		leases:         deps.Leases,
		telemetry:      deps.Telemetry,
		dockerNetwork:  deps.DockerCfg.Network,
		stateRoot:      deps.ContainerCfg.GameStateRoot,
		leaseTTL:       deps.Coordination.GameLeaseTTL,
		interval:       deps.Interval,
		clock:          clock,
		logger:         logger.With("worker", "rtmanager.reconcile"),
		newToken:       newToken,
	}, nil
}

// ReconcileNow performs one full reconciliation pass synchronously.
// It is intended for the startup path described in
// `rtmanager/README.md §Startup dependencies` (step 6). Per-game
// errors are absorbed into telemetry and logs; only ctx errors are
// surfaced to the caller so a cancelled startup aborts immediately.
func (reconciler *Reconciler) ReconcileNow(ctx context.Context) error {
	if reconciler == nil {
		return errors.New("reconcile now: nil reconciler")
	}
	if ctx == nil {
		return errors.New("reconcile now: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}
	reconciler.tick(ctx)
	return ctx.Err()
}

// Run drives the periodic reconciliation loop. It does not perform an
// immediate first pass — `ReconcileNow` covers that path; the first
// tick fires after `Interval`. Run terminates on context cancellation.
func (reconciler *Reconciler) Run(ctx context.Context) error {
	if reconciler == nil {
		return errors.New("run reconciler: nil reconciler")
	}
	if ctx == nil {
		return errors.New("run reconciler: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}

	reconciler.logger.Info("reconciler started",
		"interval", reconciler.interval.String(),
	)
	defer reconciler.logger.Info("reconciler stopped")

	ticker := time.NewTicker(reconciler.interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			reconciler.tick(ctx)
		}
	}
}

// Shutdown is a no-op; Run terminates on context cancellation.
func (reconciler *Reconciler) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown reconciler: nil context")
	}
	return nil
}

// Tick performs one reconciliation pass. Exported so tests can drive
// the reconciler deterministically without spinning a real ticker.
func (reconciler *Reconciler) Tick(ctx context.Context) {
	reconciler.tick(ctx)
}
|
||||
|
||||
// tick executes one full pass: list Docker containers + PG records,
|
||||
// resolve drift, and apply lease-guarded mutations for each affected
|
||||
// game.
|
||||
func (reconciler *Reconciler) tick(ctx context.Context) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
containers, err := reconciler.docker.List(ctx, ports.ListFilter{
|
||||
Labels: map[string]string{startruntime.LabelOwner: startruntime.LabelOwnerValue},
|
||||
})
|
||||
if err != nil {
|
||||
reconciler.logger.WarnContext(ctx, "list owned containers",
|
||||
"err", err.Error(),
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
records, err := reconciler.runtimeRecords.ListByStatus(ctx, runtime.StatusRunning)
|
||||
if err != nil {
|
||||
reconciler.logger.WarnContext(ctx, "list running records",
|
||||
"err", err.Error(),
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
containerByGame := make(map[string]ports.ContainerSummary, len(containers))
|
||||
for _, summary := range containers {
|
||||
gameID := summary.Labels[startruntime.LabelGameID]
|
||||
if gameID == "" {
|
||||
continue
|
||||
}
|
||||
containerByGame[gameID] = summary
|
||||
}
|
||||
|
||||
recordByGame := make(map[string]runtime.RuntimeRecord, len(records))
|
||||
for _, record := range records {
|
||||
recordByGame[record.GameID] = record
|
||||
}
|
||||
|
||||
for gameID, summary := range containerByGame {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return
|
||||
}
|
||||
if _, ok := recordByGame[gameID]; ok {
|
||||
continue
|
||||
}
|
||||
if summary.Status != dockerStateRunning {
|
||||
continue
|
||||
}
|
||||
reconciler.adoptOne(ctx, gameID, summary)
|
||||
}
|
||||
|
||||
for _, record := range records {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return
|
||||
}
|
||||
summary, ok := containerByGame[record.GameID]
|
||||
if !ok {
|
||||
reconciler.disposeOne(ctx, record)
|
||||
continue
|
||||
}
|
||||
if summary.ID != record.CurrentContainerID {
|
||||
continue
|
||||
}
|
||||
if summary.Status == dockerStateExited {
|
||||
reconciler.observedExitedOne(ctx, record, summary)
|
||||
}
|
||||
}
|
||||
}
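
// Summarising the branches above, tick resolves drift as:
//
//	running container, no running record       -> adoptOne
//	running record, container gone             -> disposeOne
//	running record, same id, state "exited"    -> observedExitedOne
//	container id mismatch (restart in flight)  -> left alone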

// adoptOne installs a `runtime_records` row for an unrecorded running
// container under the per-game lease.
func (reconciler *Reconciler) adoptOne(ctx context.Context, gameID string, summary ports.ContainerSummary) {
	token := reconciler.newToken()
	acquired, err := reconciler.leases.TryAcquire(ctx, gameID, token, reconciler.leaseTTL)
	if err != nil {
		reconciler.logger.WarnContext(ctx, "adopt: acquire lease",
			"game_id", gameID,
			"err", err.Error(),
		)
		return
	}
	if !acquired {
		reconciler.logger.InfoContext(ctx, "adopt: lease busy, skipping",
			"game_id", gameID,
		)
		return
	}
	defer reconciler.releaseLease(ctx, gameID, token)

	if _, err := reconciler.runtimeRecords.Get(ctx, gameID); err == nil {
		reconciler.logger.InfoContext(ctx, "adopt: record appeared concurrently, skipping",
			"game_id", gameID,
		)
		return
	} else if !errors.Is(err, runtime.ErrNotFound) {
		reconciler.logger.WarnContext(ctx, "adopt: read record",
			"game_id", gameID,
			"err", err.Error(),
		)
		return
	}

	startedAt := reconciler.resolveStartedAt(ctx, summary)
	imageRef := summary.Labels[startruntime.LabelEngineImageRef]
	if imageRef == "" {
		imageRef = summary.ImageRef
	}

	now := reconciler.clock().UTC()
	createdAt := now
	if startedAt.Before(createdAt) {
		createdAt = startedAt
	}
	record := runtime.RuntimeRecord{
		GameID:             gameID,
		Status:             runtime.StatusRunning,
		CurrentContainerID: summary.ID,
		CurrentImageRef:    imageRef,
		EngineEndpoint:     reconciler.engineEndpoint(gameID),
		StatePath:          filepath.Join(reconciler.stateRoot, gameID),
		DockerNetwork:      reconciler.dockerNetwork,
		StartedAt:          &startedAt,
		LastOpAt:           now,
		CreatedAt:          createdAt,
	}
	if err := reconciler.runtimeRecords.Upsert(ctx, record); err != nil {
		reconciler.logger.ErrorContext(ctx, "adopt: upsert record",
			"game_id", gameID,
			"container_id", summary.ID,
			"err", err.Error(),
		)
		return
	}

	finishedAt := reconciler.clock().UTC()
	reconciler.bestEffortAppend(ctx, operation.OperationEntry{
		GameID:      gameID,
		OpKind:      operation.OpKindReconcileAdopt,
		OpSource:    operation.OpSourceAutoReconcile,
		ImageRef:    imageRef,
		ContainerID: summary.ID,
		Outcome:     operation.OutcomeSuccess,
		StartedAt:   now,
		FinishedAt:  &finishedAt,
	})
	reconciler.telemetry.RecordReconcileDrift(ctx, driftKindAdopt)

	logArgs := []any{
		"game_id", gameID,
		"container_id", summary.ID,
		"image_ref", imageRef,
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	reconciler.logger.InfoContext(ctx, "reconciler adopted unrecorded container", logArgs...)
}

// disposeOne transitions a `running` record whose container is missing
// in Docker to `removed` and publishes `container_disappeared`.
func (reconciler *Reconciler) disposeOne(ctx context.Context, record runtime.RuntimeRecord) {
	token := reconciler.newToken()
	acquired, err := reconciler.leases.TryAcquire(ctx, record.GameID, token, reconciler.leaseTTL)
	if err != nil {
		reconciler.logger.WarnContext(ctx, "dispose: acquire lease",
			"game_id", record.GameID,
			"err", err.Error(),
		)
		return
	}
	if !acquired {
		reconciler.logger.InfoContext(ctx, "dispose: lease busy, skipping",
			"game_id", record.GameID,
		)
		return
	}
	defer reconciler.releaseLease(ctx, record.GameID, token)

	current, err := reconciler.runtimeRecords.Get(ctx, record.GameID)
	if err != nil {
		if errors.Is(err, runtime.ErrNotFound) {
			return
		}
		reconciler.logger.WarnContext(ctx, "dispose: read record",
			"game_id", record.GameID,
			"err", err.Error(),
		)
		return
	}
	if current.Status != runtime.StatusRunning || current.CurrentContainerID != record.CurrentContainerID {
		reconciler.logger.InfoContext(ctx, "dispose: state changed, skipping",
			"game_id", record.GameID,
		)
		return
	}

	now := reconciler.clock().UTC()
	err = reconciler.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
		GameID:              record.GameID,
		ExpectedFrom:        runtime.StatusRunning,
		ExpectedContainerID: record.CurrentContainerID,
		To:                  runtime.StatusRemoved,
		Now:                 now,
	})
	if errors.Is(err, runtime.ErrConflict) || errors.Is(err, runtime.ErrNotFound) {
		reconciler.logger.InfoContext(ctx, "dispose: CAS lost, skipping",
			"game_id", record.GameID,
			"err", err.Error(),
		)
		return
	}
	if err != nil {
		reconciler.logger.ErrorContext(ctx, "dispose: update status",
			"game_id", record.GameID,
			"container_id", record.CurrentContainerID,
			"err", err.Error(),
		)
		return
	}

	reconciler.bestEffortPublishHealth(ctx, ports.HealthEventEnvelope{
		GameID:      record.GameID,
		ContainerID: record.CurrentContainerID,
		EventType:   health.EventTypeContainerDisappeared,
		OccurredAt:  now,
		Details:     containerDisappearedDetails(),
	})

	finishedAt := reconciler.clock().UTC()
	reconciler.bestEffortAppend(ctx, operation.OperationEntry{
		GameID:      record.GameID,
		OpKind:      operation.OpKindReconcileDispose,
		OpSource:    operation.OpSourceAutoReconcile,
		ImageRef:    record.CurrentImageRef,
		ContainerID: record.CurrentContainerID,
		Outcome:     operation.OutcomeSuccess,
		StartedAt:   now,
		FinishedAt:  &finishedAt,
	})
	reconciler.telemetry.RecordReconcileDrift(ctx, driftKindDispose)

	logArgs := []any{
		"game_id", record.GameID,
		"container_id", record.CurrentContainerID,
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	reconciler.logger.InfoContext(ctx, "reconciler disposed missing container", logArgs...)
}

// observedExitedOne transitions a `running` record whose container is
// reported as `exited` to `stopped` and publishes `container_exited`
// with the observed exit code. No `operation_log` entry is written;
// see decision record §6.
func (reconciler *Reconciler) observedExitedOne(ctx context.Context, record runtime.RuntimeRecord, summary ports.ContainerSummary) {
	token := reconciler.newToken()
	acquired, err := reconciler.leases.TryAcquire(ctx, record.GameID, token, reconciler.leaseTTL)
	if err != nil {
		reconciler.logger.WarnContext(ctx, "observed_exited: acquire lease",
			"game_id", record.GameID,
			"err", err.Error(),
		)
		return
	}
	if !acquired {
		reconciler.logger.InfoContext(ctx, "observed_exited: lease busy, skipping",
			"game_id", record.GameID,
		)
		return
	}
	defer reconciler.releaseLease(ctx, record.GameID, token)

	current, err := reconciler.runtimeRecords.Get(ctx, record.GameID)
	if err != nil {
		if errors.Is(err, runtime.ErrNotFound) {
			return
		}
		reconciler.logger.WarnContext(ctx, "observed_exited: read record",
			"game_id", record.GameID,
			"err", err.Error(),
		)
		return
	}
	if current.Status != runtime.StatusRunning || current.CurrentContainerID != summary.ID {
		reconciler.logger.InfoContext(ctx, "observed_exited: state changed, skipping",
			"game_id", record.GameID,
		)
		return
	}

	inspect, err := reconciler.docker.InspectContainer(ctx, summary.ID)
	if err != nil {
		reconciler.logger.WarnContext(ctx, "observed_exited: inspect container",
			"game_id", record.GameID,
			"container_id", summary.ID,
			"err", err.Error(),
		)
		return
	}

	now := reconciler.clock().UTC()
	err = reconciler.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
		GameID:              record.GameID,
		ExpectedFrom:        runtime.StatusRunning,
		ExpectedContainerID: summary.ID,
		To:                  runtime.StatusStopped,
		Now:                 now,
	})
	if errors.Is(err, runtime.ErrConflict) || errors.Is(err, runtime.ErrNotFound) {
		reconciler.logger.InfoContext(ctx, "observed_exited: CAS lost, skipping",
			"game_id", record.GameID,
			"err", err.Error(),
		)
		return
	}
	if err != nil {
		reconciler.logger.ErrorContext(ctx, "observed_exited: update status",
			"game_id", record.GameID,
			"container_id", summary.ID,
			"err", err.Error(),
		)
		return
	}

	reconciler.bestEffortPublishHealth(ctx, ports.HealthEventEnvelope{
		GameID:      record.GameID,
		ContainerID: summary.ID,
		EventType:   health.EventTypeContainerExited,
		OccurredAt:  now,
		Details:     containerExitedDetails(inspect.ExitCode, inspect.OOMKilled),
	})
	reconciler.telemetry.RecordReconcileDrift(ctx, driftKindObservedExited)

	logArgs := []any{
		"game_id", record.GameID,
		"container_id", summary.ID,
		"exit_code", inspect.ExitCode,
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	reconciler.logger.InfoContext(ctx, "reconciler observed exited container", logArgs...)
}

// resolveStartedAt prefers the `com.galaxy.started_at_ms` label written
// by the start service. When the label is absent or unparseable, it
// falls back to a full inspect of the container; if inspect also fails
// or returns a zero StartedAt, the current clock is used so the record
// still validates.
func (reconciler *Reconciler) resolveStartedAt(ctx context.Context, summary ports.ContainerSummary) time.Time {
	if raw, ok := summary.Labels[startruntime.LabelStartedAtMs]; ok && raw != "" {
		if ms, err := strconv.ParseInt(raw, 10, 64); err == nil && ms > 0 {
			return time.UnixMilli(ms).UTC()
		}
	}
	inspect, err := reconciler.docker.InspectContainer(ctx, summary.ID)
	if err == nil && !inspect.StartedAt.IsZero() {
		return inspect.StartedAt.UTC()
	}
	return reconciler.clock().UTC()
}

// engineEndpoint mirrors the URL shape produced by the docker adapter
// (`internal/adapters/docker/client.go::Run`).
func (reconciler *Reconciler) engineEndpoint(gameID string) string {
	return fmt.Sprintf("http://%s%s:8080", startruntime.HostnamePrefix, gameID)
}
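
// For example, with the `galaxy-game-` hostname prefix the start
// service uses (see the package tests), engineEndpoint("game-a")
// yields "http://galaxy-game-game-a:8080".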

// releaseLease releases the per-game lease in a fresh background
// context so a canceled tick context does not leave the lease pinned
// for its TTL.
func (reconciler *Reconciler) releaseLease(ctx context.Context, gameID, token string) {
	cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
	defer cancel()
	if err := reconciler.leases.Release(cleanupCtx, gameID, token); err != nil {
		reconciler.logger.WarnContext(ctx, "release game lease",
			"game_id", gameID,
			"err", err.Error(),
		)
	}
}

// bestEffortAppend writes one operation_log entry. A failure is logged
// and discarded; the durable runtime record (or its absence) remains
// the source of truth.
func (reconciler *Reconciler) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
	if _, err := reconciler.operationLogs.Append(ctx, entry); err != nil {
		reconciler.logger.ErrorContext(ctx, "append operation log",
			"game_id", entry.GameID,
			"op_kind", string(entry.OpKind),
			"err", err.Error(),
		)
	}
}

// bestEffortPublishHealth emits one health event + snapshot upsert.
// Failures degrade silently per `rtmanager/README.md §Notification
// Contracts`; the runtime record remains the source of truth.
func (reconciler *Reconciler) bestEffortPublishHealth(ctx context.Context, envelope ports.HealthEventEnvelope) {
	if err := reconciler.healthEvents.Publish(ctx, envelope); err != nil {
		reconciler.logger.ErrorContext(ctx, "publish health event",
			"game_id", envelope.GameID,
			"container_id", envelope.ContainerID,
			"event_type", string(envelope.EventType),
			"err", err.Error(),
		)
		return
	}
	reconciler.telemetry.RecordHealthEvent(ctx, string(envelope.EventType))
}

// containerExitedDetails matches the JSON shape produced by the events
// listener so consumers see a single contracted payload regardless of
// the source.
func containerExitedDetails(exitCode int, oom bool) json.RawMessage {
	payload := struct {
		ExitCode int  `json:"exit_code"`
		OOM      bool `json:"oom"`
	}{ExitCode: exitCode, OOM: oom}
	encoded, _ := json.Marshal(payload)
	return encoded
}
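
// For example, containerExitedDetails(137, false) encodes to
// `{"exit_code":137,"oom":false}`, the payload the package tests
// assert on.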

// containerDisappearedDetails returns the canonical empty-object
// payload required by the `container_disappeared` AsyncAPI variant.
func containerDisappearedDetails() json.RawMessage {
	return json.RawMessage(`{}`)
}

func defaultTokenGenerator() func() string {
	return func() string {
		var buf [32]byte
		if _, err := rand.Read(buf[:]); err != nil {
			// A failed rand.Read is pathological; the constant
			// fallback keeps the worker alive at the cost of a
			// non-unique token.
			return "rtmanager-fallback-token"
		}
		return base64.RawURLEncoding.EncodeToString(buf[:])
	}
}

@@ -0,0 +1,740 @@
package reconcile_test

import (
	"context"
	"encoding/json"
	"errors"
	"io"
	"log/slog"
	"strconv"
	"sync"
	"testing"
	"time"

	"galaxy/rtmanager/internal/adapters/docker/mocks"
	"galaxy/rtmanager/internal/config"
	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/startruntime"
	"galaxy/rtmanager/internal/telemetry"
	"galaxy/rtmanager/internal/worker/reconcile"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"go.uber.org/mock/gomock"
)

func silentLogger() *slog.Logger {
	return slog.New(slog.NewTextHandler(io.Discard, nil))
}

// --- fake doubles -----------------------------------------------------

type fakeRuntimeRecords struct {
	mu sync.Mutex

	stored          map[string]runtime.RuntimeRecord
	getErr          error
	upsertErr       error
	updateStatusErr error
	listErr         error

	upserts []runtime.RuntimeRecord
	updates []ports.UpdateStatusInput
}

func newFakeRuntimeRecords() *fakeRuntimeRecords {
	return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}}
}

func (s *fakeRuntimeRecords) Set(records ...runtime.RuntimeRecord) {
	s.mu.Lock()
	defer s.mu.Unlock()
	for _, record := range records {
		s.stored[record.GameID] = record
	}
}

func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.getErr != nil {
		return runtime.RuntimeRecord{}, s.getErr
	}
	record, ok := s.stored[gameID]
	if !ok {
		return runtime.RuntimeRecord{}, runtime.ErrNotFound
	}
	return record, nil
}

func (s *fakeRuntimeRecords) Upsert(_ context.Context, record runtime.RuntimeRecord) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.upsertErr != nil {
		return s.upsertErr
	}
	s.upserts = append(s.upserts, record)
	s.stored[record.GameID] = record
	return nil
}

func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, input ports.UpdateStatusInput) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.updates = append(s.updates, input)
	if s.updateStatusErr != nil {
		return s.updateStatusErr
	}
	record, ok := s.stored[input.GameID]
	if !ok {
		return runtime.ErrNotFound
	}
	if record.Status != input.ExpectedFrom {
		return runtime.ErrConflict
	}
	if input.ExpectedContainerID != "" && record.CurrentContainerID != input.ExpectedContainerID {
		return runtime.ErrConflict
	}
	record.Status = input.To
	record.LastOpAt = input.Now
	switch input.To {
	case runtime.StatusStopped:
		t := input.Now
		record.StoppedAt = &t
	case runtime.StatusRemoved:
		t := input.Now
		record.RemovedAt = &t
		record.CurrentContainerID = ""
	}
	s.stored[input.GameID] = record
	return nil
}

func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
	return nil, errors.New("not used in reconciler tests")
}

func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, status runtime.Status) ([]runtime.RuntimeRecord, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.listErr != nil {
		return nil, s.listErr
	}
	var out []runtime.RuntimeRecord
	for _, record := range s.stored {
		if record.Status == status {
			out = append(out, record)
		}
	}
	return out, nil
}

func (s *fakeRuntimeRecords) Upserts() []runtime.RuntimeRecord {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]runtime.RuntimeRecord, len(s.upserts))
	copy(out, s.upserts)
	return out
}

func (s *fakeRuntimeRecords) Updates() []ports.UpdateStatusInput {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]ports.UpdateStatusInput, len(s.updates))
	copy(out, s.updates)
	return out
}

type fakeOperationLogs struct {
	mu sync.Mutex

	appendErr error
	appends   []operation.OperationEntry
}

func (s *fakeOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.appendErr != nil {
		return 0, s.appendErr
	}
	s.appends = append(s.appends, entry)
	return int64(len(s.appends)), nil
}

func (s *fakeOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) {
	return nil, errors.New("not used in reconciler tests")
}

func (s *fakeOperationLogs) Appends() []operation.OperationEntry {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]operation.OperationEntry, len(s.appends))
	copy(out, s.appends)
	return out
}

type fakeHealthEvents struct {
	mu         sync.Mutex
	publishErr error
	published  []ports.HealthEventEnvelope
}

func (s *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.publishErr != nil {
		return s.publishErr
	}
	s.published = append(s.published, envelope)
	return nil
}

func (s *fakeHealthEvents) Published() []ports.HealthEventEnvelope {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]ports.HealthEventEnvelope, len(s.published))
	copy(out, s.published)
	return out
}

type fakeLeases struct {
	mu sync.Mutex

	acquired   bool
	acquireErr error
	releaseErr error

	acquires []string
	releases []string
}

func (l *fakeLeases) TryAcquire(_ context.Context, gameID, token string, _ time.Duration) (bool, error) {
	l.mu.Lock()
	defer l.mu.Unlock()
	l.acquires = append(l.acquires, gameID+":"+token)
	if l.acquireErr != nil {
		return false, l.acquireErr
	}
	return l.acquired, nil
}

func (l *fakeLeases) Release(_ context.Context, gameID, token string) error {
	l.mu.Lock()
	defer l.mu.Unlock()
	l.releases = append(l.releases, gameID+":"+token)
	return l.releaseErr
}

func (l *fakeLeases) Acquires() []string {
	l.mu.Lock()
	defer l.mu.Unlock()
	out := make([]string, len(l.acquires))
	copy(out, l.acquires)
	return out
}

func (l *fakeLeases) Releases() []string {
	l.mu.Lock()
	defer l.mu.Unlock()
	out := make([]string, len(l.releases))
	copy(out, l.releases)
	return out
}

// --- harness ----------------------------------------------------------

type harness struct {
	docker        *mocks.MockDockerClient
	records       *fakeRuntimeRecords
	operationLogs *fakeOperationLogs
	healthEvents  *fakeHealthEvents
	leases        *fakeLeases

	telemetry *telemetry.Runtime

	now time.Time
}

func newHarness(t *testing.T) *harness {
	t.Helper()
	ctrl := gomock.NewController(t)
	t.Cleanup(ctrl.Finish)

	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)

	return &harness{
		docker:        mocks.NewMockDockerClient(ctrl),
		records:       newFakeRuntimeRecords(),
		operationLogs: &fakeOperationLogs{},
		healthEvents:  &fakeHealthEvents{},
		leases:        &fakeLeases{acquired: true},
		telemetry:     telemetryRuntime,
		now:           time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC),
	}
}

func (h *harness) build(t *testing.T) *reconcile.Reconciler {
	t.Helper()
	r, err := reconcile.NewReconciler(reconcile.Dependencies{
		Docker:         h.docker,
		RuntimeRecords: h.records,
		OperationLogs:  h.operationLogs,
		HealthEvents:   h.healthEvents,
		Leases:         h.leases,
		Telemetry:      h.telemetry,
		DockerCfg: config.DockerConfig{
			Host:       "unix:///var/run/docker.sock",
			Network:    "galaxy-net",
			LogDriver:  "json-file",
			PullPolicy: config.ImagePullPolicyIfMissing,
		},
		ContainerCfg: config.ContainerConfig{
			DefaultCPUQuota:      1.0,
			DefaultMemory:        "512m",
			DefaultPIDsLimit:     512,
			StopTimeout:          30 * time.Second,
			Retention:            30 * 24 * time.Hour,
			EngineStateMountPath: "/var/lib/galaxy-game",
			EngineStateEnvName:   "GAME_STATE_PATH",
			GameStateDirMode:     0o750,
			GameStateRoot:        "/var/lib/galaxy/games",
		},
		Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
		Interval:     50 * time.Millisecond,
		Clock:        func() time.Time { return h.now },
		Logger:       silentLogger(),
		NewToken:     func() string { return "token-A" },
	})
	require.NoError(t, err)
	return r
}

// runningRecord builds a baseline runtime record in `running` state.
func runningRecord(gameID, containerID string, startedAt time.Time) runtime.RuntimeRecord {
	return runtime.RuntimeRecord{
		GameID:             gameID,
		Status:             runtime.StatusRunning,
		CurrentContainerID: containerID,
		CurrentImageRef:    "galaxy/game:1.0.0",
		EngineEndpoint:     "http://galaxy-game-" + gameID + ":8080",
		StatePath:          "/var/lib/galaxy/games/" + gameID,
		DockerNetwork:      "galaxy-net",
		StartedAt:          &startedAt,
		LastOpAt:           startedAt,
		CreatedAt:          startedAt,
	}
}

func ownedSummary(gameID, containerID, imageRef, status string, startedAtMs int64) ports.ContainerSummary {
	labels := map[string]string{
		startruntime.LabelOwner:          startruntime.LabelOwnerValue,
		startruntime.LabelKind:           startruntime.LabelKindValue,
		startruntime.LabelGameID:         gameID,
		startruntime.LabelEngineImageRef: imageRef,
	}
	if startedAtMs > 0 {
		labels[startruntime.LabelStartedAtMs] = strconv.FormatInt(startedAtMs, 10)
	}
	return ports.ContainerSummary{
		ID:        containerID,
		ImageRef:  imageRef,
		Hostname:  "galaxy-game-" + gameID,
		Labels:    labels,
		Status:    status,
		StartedAt: time.UnixMilli(startedAtMs).UTC(),
	}
}

// --- constructor ------------------------------------------------------

func TestNewReconcilerRejectsMissingDeps(t *testing.T) {
	ctrl := gomock.NewController(t)
	t.Cleanup(ctrl.Finish)
	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)

	dockerCfg := config.DockerConfig{
		Host:       "unix:///var/run/docker.sock",
		Network:    "galaxy-net",
		LogDriver:  "json-file",
		PullPolicy: config.ImagePullPolicyIfMissing,
	}
	containerCfg := config.ContainerConfig{
		DefaultCPUQuota:      1.0,
		DefaultMemory:        "512m",
		DefaultPIDsLimit:     512,
		StopTimeout:          30 * time.Second,
		Retention:            30 * 24 * time.Hour,
		EngineStateMountPath: "/var/lib/galaxy-game",
		EngineStateEnvName:   "GAME_STATE_PATH",
		GameStateDirMode:     0o750,
		GameStateRoot:        "/var/lib/galaxy/games",
	}
	coord := config.CoordinationConfig{GameLeaseTTL: time.Minute}

	base := reconcile.Dependencies{
		Docker:         mocks.NewMockDockerClient(ctrl),
		RuntimeRecords: newFakeRuntimeRecords(),
		OperationLogs:  &fakeOperationLogs{},
		HealthEvents:   &fakeHealthEvents{},
		Leases:         &fakeLeases{acquired: true},
		Telemetry:      telemetryRuntime,
		DockerCfg:      dockerCfg,
		ContainerCfg:   containerCfg,
		Coordination:   coord,
		Interval:       time.Second,
	}

	defectives := []reconcile.Dependencies{
		{},
		{Docker: base.Docker},
		{Docker: base.Docker, RuntimeRecords: base.RuntimeRecords},
		{Docker: base.Docker, RuntimeRecords: base.RuntimeRecords, OperationLogs: base.OperationLogs},
		{Docker: base.Docker, RuntimeRecords: base.RuntimeRecords, OperationLogs: base.OperationLogs, HealthEvents: base.HealthEvents},
		{Docker: base.Docker, RuntimeRecords: base.RuntimeRecords, OperationLogs: base.OperationLogs, HealthEvents: base.HealthEvents, Leases: base.Leases},
		{Docker: base.Docker, RuntimeRecords: base.RuntimeRecords, OperationLogs: base.OperationLogs, HealthEvents: base.HealthEvents, Leases: base.Leases, Telemetry: base.Telemetry},
	}
	for index, deps := range defectives {
		_, err := reconcile.NewReconciler(deps)
		require.Errorf(t, err, "case %d should fail", index)
	}

	_, err = reconcile.NewReconciler(base)
	require.NoError(t, err)
}

// --- adopt ------------------------------------------------------------

func TestReconcileAdoptInsertsRecord(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)

	startedAt := time.Date(2026, 4, 28, 11, 30, 0, 0, time.UTC)
	summary := ownedSummary("game-a", "ctr-game-a", "galaxy/game:1.2.3", "running", startedAt.UnixMilli())

	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return([]ports.ContainerSummary{summary}, nil)

	r.Tick(context.Background())

	upserts := h.records.Upserts()
	require.Len(t, upserts, 1)
	got := upserts[0]
	assert.Equal(t, "game-a", got.GameID)
	assert.Equal(t, runtime.StatusRunning, got.Status)
	assert.Equal(t, "ctr-game-a", got.CurrentContainerID)
	assert.Equal(t, "galaxy/game:1.2.3", got.CurrentImageRef)
	assert.Equal(t, "http://galaxy-game-game-a:8080", got.EngineEndpoint)
	assert.Equal(t, "/var/lib/galaxy/games/game-a", got.StatePath)
	assert.Equal(t, "galaxy-net", got.DockerNetwork)
	require.NotNil(t, got.StartedAt)
	assert.True(t, got.StartedAt.Equal(startedAt))

	appends := h.operationLogs.Appends()
	require.Len(t, appends, 1)
	assert.Equal(t, operation.OpKindReconcileAdopt, appends[0].OpKind)
	assert.Equal(t, operation.OpSourceAutoReconcile, appends[0].OpSource)
	assert.Equal(t, operation.OutcomeSuccess, appends[0].Outcome)
	assert.Equal(t, "ctr-game-a", appends[0].ContainerID)

	assert.Equal(t, []string{"game-a:token-A"}, h.leases.Acquires())
	assert.Equal(t, []string{"game-a:token-A"}, h.leases.Releases())
	assert.Empty(t, h.healthEvents.Published(), "adopt does not publish health events")
}

func TestReconcileAdoptFallsBackToInspectStartedAtWhenLabelMissing(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)

	summary := ownedSummary("game-b", "ctr-game-b", "galaxy/game:1.0.0", "running", 0)
	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return([]ports.ContainerSummary{summary}, nil)
	inspectStarted := time.Date(2026, 4, 28, 10, 0, 0, 0, time.UTC)
	h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-b").Return(ports.ContainerInspect{
		ID:        "ctr-game-b",
		StartedAt: inspectStarted,
		Status:    "running",
	}, nil)

	r.Tick(context.Background())

	upserts := h.records.Upserts()
	require.Len(t, upserts, 1)
	require.NotNil(t, upserts[0].StartedAt)
	assert.True(t, upserts[0].StartedAt.Equal(inspectStarted))
}

func TestReconcileAdoptSkipsWhenRecordAppearsConcurrently(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)

	startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC)
	h.records.Set(runningRecord("game-c", "ctr-game-c", startedAt))

	// Docker reports the same game running, but the record already
	// exists (the start service won the race). The list pass sees the
	// record, so the adopt path is never entered.
	summary := ownedSummary("game-c", "ctr-game-c", "galaxy/game:1.0.0", "running", startedAt.UnixMilli())
	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return([]ports.ContainerSummary{summary}, nil)

	r.Tick(context.Background())

	assert.Empty(t, h.records.Upserts())
	assert.Empty(t, h.operationLogs.Appends())
	assert.Empty(t, h.leases.Acquires(), "no mutation -> no lease acquired")
}

func TestReconcileAdoptSkipsNonRunningContainer(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)

	summary := ownedSummary("game-d", "ctr-game-d", "galaxy/game:1.0.0", "exited", time.Now().UnixMilli())
	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return([]ports.ContainerSummary{summary}, nil)

	r.Tick(context.Background())

	assert.Empty(t, h.records.Upserts(), "exited container without record is not adopted")
	assert.Empty(t, h.leases.Acquires())
}

// --- dispose ----------------------------------------------------------

func TestReconcileDisposeMarksRemoved(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)

	startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC)
	h.records.Set(runningRecord("game-e", "ctr-game-e", startedAt))

	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return(nil, nil)

	r.Tick(context.Background())

	updates := h.records.Updates()
	require.Len(t, updates, 1)
	assert.Equal(t, "game-e", updates[0].GameID)
	assert.Equal(t, runtime.StatusRunning, updates[0].ExpectedFrom)
	assert.Equal(t, "ctr-game-e", updates[0].ExpectedContainerID)
	assert.Equal(t, runtime.StatusRemoved, updates[0].To)

	published := h.healthEvents.Published()
	require.Len(t, published, 1)
	assert.Equal(t, health.EventTypeContainerDisappeared, published[0].EventType)
	assert.Equal(t, "game-e", published[0].GameID)
	assert.Equal(t, "ctr-game-e", published[0].ContainerID)
	assert.JSONEq(t, `{}`, string(published[0].Details))

	appends := h.operationLogs.Appends()
	require.Len(t, appends, 1)
	assert.Equal(t, operation.OpKindReconcileDispose, appends[0].OpKind)
	assert.Equal(t, operation.OpSourceAutoReconcile, appends[0].OpSource)
}

func TestReconcileDisposeSkipsOnCASConflict(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)

	startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC)
	h.records.Set(runningRecord("game-f", "ctr-game-f", startedAt))
	h.records.updateStatusErr = runtime.ErrConflict

	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return(nil, nil)

	r.Tick(context.Background())

	assert.Empty(t, h.healthEvents.Published(), "no health event when CAS lost")
	assert.Empty(t, h.operationLogs.Appends(), "no operation_log entry when CAS lost")
}

func TestReconcileDisposeSkipsWhenStateChangedAfterReread(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)

	// Running record observed by ListByStatus, but Get under the lease
	// returns a record whose status has changed.
	startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC)
	listed := runningRecord("game-g", "ctr-game-g", startedAt)
	h.records.Set(listed)

	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return(nil, nil)

	// Mutate the stored record to simulate a concurrent stop completing
	// between the list pass and the lease re-read. The fake's Get
	// observes the mutated state.
	h.records.mu.Lock()
	stoppedAt := startedAt.Add(time.Minute)
	listed.Status = runtime.StatusStopped
	listed.StoppedAt = &stoppedAt
	h.records.stored["game-g"] = listed
	h.records.mu.Unlock()

	r.Tick(context.Background())

	assert.Empty(t, h.records.Updates(), "re-read sees status != running -> skip")
	assert.Empty(t, h.healthEvents.Published())
	assert.Empty(t, h.operationLogs.Appends())
}

// --- observed_exited --------------------------------------------------

func TestReconcileObservedExitedMarksStopped(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)

	startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC)
	h.records.Set(runningRecord("game-h", "ctr-game-h", startedAt))

	summary := ownedSummary("game-h", "ctr-game-h", "galaxy/game:1.0.0", "exited", startedAt.UnixMilli())
	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return([]ports.ContainerSummary{summary}, nil)
	h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-h").Return(ports.ContainerInspect{
		ID:        "ctr-game-h",
		Status:    "exited",
		ExitCode:  137,
		OOMKilled: false,
	}, nil)

	r.Tick(context.Background())

	updates := h.records.Updates()
	require.Len(t, updates, 1)
	assert.Equal(t, runtime.StatusRunning, updates[0].ExpectedFrom)
	assert.Equal(t, "ctr-game-h", updates[0].ExpectedContainerID)
	assert.Equal(t, runtime.StatusStopped, updates[0].To)

	published := h.healthEvents.Published()
	require.Len(t, published, 1)
	assert.Equal(t, health.EventTypeContainerExited, published[0].EventType)
	var details struct {
		ExitCode int  `json:"exit_code"`
		OOM      bool `json:"oom"`
	}
	require.NoError(t, json.Unmarshal(published[0].Details, &details))
	assert.Equal(t, 137, details.ExitCode)
	assert.False(t, details.OOM)

	assert.Empty(t, h.operationLogs.Appends(), "observed_exited writes no operation_log entry")
}

// --- no-op paths ------------------------------------------------------

func TestReconcileNoDriftIsNoop(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)

	startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC)
	h.records.Set(runningRecord("game-i", "ctr-game-i", startedAt))

	summary := ownedSummary("game-i", "ctr-game-i", "galaxy/game:1.0.0", "running", startedAt.UnixMilli())
	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return([]ports.ContainerSummary{summary}, nil)

	r.Tick(context.Background())

	assert.Empty(t, h.records.Upserts())
	assert.Empty(t, h.records.Updates())
	assert.Empty(t, h.healthEvents.Published())
	assert.Empty(t, h.operationLogs.Appends())
	assert.Empty(t, h.leases.Acquires())
}

func TestReconcileSkipsWhenContainerIDMismatch(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)

	startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC)
	h.records.Set(runningRecord("game-j", "ctr-old", startedAt))

	// Docker reports the new container id; a restart is in flight.
	summary := ownedSummary("game-j", "ctr-new", "galaxy/game:1.0.0", "running", startedAt.UnixMilli())
	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return([]ports.ContainerSummary{summary}, nil)

	r.Tick(context.Background())

	assert.Empty(t, h.records.Updates(), "id mismatch -> reconciler stays out of the way")
	assert.Empty(t, h.healthEvents.Published())
}

// --- lease busy / errors ----------------------------------------------

func TestReconcileLeaseConflictSkipsGame(t *testing.T) {
	h := newHarness(t)
	h.leases.acquired = false
	r := h.build(t)

	startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC)
	h.records.Set(runningRecord("game-k", "ctr-game-k", startedAt))

	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return(nil, nil)

	r.Tick(context.Background())

	assert.Empty(t, h.records.Updates(), "lease busy -> dispose skipped")
	assert.Empty(t, h.healthEvents.Published())
	assert.Empty(t, h.leases.Releases(), "release not called when acquire returned false")
}

func TestReconcileNowAbsorbsListError(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)

	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return(nil, errors.New("docker daemon down"))

	require.NoError(t, r.ReconcileNow(context.Background()))
	assert.Empty(t, h.records.Updates())
	assert.Empty(t, h.records.Upserts())
}

func TestReconcileNowAbsorbsRecordsListError(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)
	h.records.listErr = errors.New("pg down")

	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return(nil, nil)

	require.NoError(t, r.ReconcileNow(context.Background()))
}

func TestReconcileNowReturnsContextError(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)
	_ = h

	ctx, cancel := context.WithCancel(context.Background())
	cancel()
	require.ErrorIs(t, r.ReconcileNow(ctx), context.Canceled)
}

// --- Run lifecycle ----------------------------------------------------

func TestRunRespectsContextCancel(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)

	ctx, cancel := context.WithCancel(context.Background())
	done := make(chan error, 1)
	go func() { done <- r.Run(ctx) }()

	cancel()
	select {
	case err := <-done:
		assert.ErrorIs(t, err, context.Canceled)
	case <-time.After(time.Second):
		t.Fatalf("Run did not exit after cancel")
	}
}

func TestShutdownIsNoOp(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)
	require.NoError(t, r.Shutdown(context.Background()))
}

// --- compile-time safety ----------------------------------------------

var (
	_ ports.RuntimeRecordStore   = (*fakeRuntimeRecords)(nil)
	_ ports.OperationLogStore    = (*fakeOperationLogs)(nil)
	_ ports.HealthEventPublisher = (*fakeHealthEvents)(nil)
	_ ports.GameLeaseStore       = (*fakeLeases)(nil)
)

@@ -0,0 +1,337 @@
// Package startjobsconsumer drives the asynchronous half of the
// Lobby ↔ Runtime Manager start contract. The consumer XREADs from
// `runtime:start_jobs` (produced by Lobby), decodes the envelope frozen
// in `rtmanager/api/runtime-jobs-asyncapi.yaml`, calls the production
// start orchestrator, and publishes one `runtime:job_results` outcome
// per consumed envelope.
//
// Replay safety is provided by the start service: an idempotent re-run
// surfaces as `Outcome=success` with `error_code=replay_no_op`. The
// consumer copies the service Result fields into the `RuntimeJobResult`
// payload verbatim. Per-message decode and publish errors are logged
// and absorbed; the offset advances unconditionally so a single poison
// message cannot pin the loop. Design rationale is captured in
// `rtmanager/docs/workers.md`.
package startjobsconsumer

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"strconv"
	"strings"
	"time"

	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/logging"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/startruntime"

	"github.com/redis/go-redis/v9"
)

// streamOffsetLabel identifies the start-jobs consumer in the stream
// offset store. The label stays stable when the underlying stream key
// is renamed via configuration. Matches the convention from
// `rtmanager/README.md §Persistence Layout → Redis runtime-coordination state`.
const streamOffsetLabel = "startjobs"

// Wire field names of the `RuntimeStartJob` payload. Frozen by
// `rtmanager/api/runtime-jobs-asyncapi.yaml`; renaming any of them
// requires a coordinated contract change with Lobby.
const (
	fieldGameID        = "game_id"
	fieldImageRef      = "image_ref"
	fieldRequestedAtMS = "requested_at_ms"
)
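
// A well-formed entry, as Lobby would produce it (illustrative values):
//
//	XADD runtime:start_jobs * game_id game-a image_ref galaxy/game:1.2.3 requested_at_ms 1714305600000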

// StartService is the narrow surface the consumer needs from the start
// orchestrator. The concrete `*startruntime.Service` satisfies this
// interface and is wired in production.
type StartService interface {
	Handle(ctx context.Context, input startruntime.Input) (startruntime.Result, error)
}

// Config groups the dependencies required to construct a Consumer.
type Config struct {
	// Client provides XREAD access to the start-jobs stream.
	Client *redis.Client

	// Stream stores the Redis Streams key consumed by the worker.
	Stream string

	// BlockTimeout bounds the blocking XREAD window.
	BlockTimeout time.Duration

	// StartService executes the start lifecycle for each decoded
	// envelope.
	StartService StartService

	// JobResults publishes one outcome entry per processed envelope.
	JobResults ports.JobResultPublisher

	// OffsetStore persists the last successfully processed entry id so
	// the consumer survives restarts without replaying processed
	// envelopes.
	OffsetStore ports.StreamOffsetStore

	// Logger receives structured worker-level events. Defaults to
	// `slog.Default` when nil.
	Logger *slog.Logger
}

// Consumer drives the start-jobs processing loop.
type Consumer struct {
	client       *redis.Client
	stream       string
	blockTimeout time.Duration
	startService StartService
	jobResults   ports.JobResultPublisher
	offsetStore  ports.StreamOffsetStore
	logger       *slog.Logger
}

// NewConsumer constructs one Consumer from cfg. Validation errors
// surface the missing collaborator verbatim.
func NewConsumer(cfg Config) (*Consumer, error) {
	switch {
	case cfg.Client == nil:
		return nil, errors.New("new start jobs consumer: nil redis client")
	case strings.TrimSpace(cfg.Stream) == "":
		return nil, errors.New("new start jobs consumer: stream must not be empty")
	case cfg.BlockTimeout <= 0:
		return nil, errors.New("new start jobs consumer: block timeout must be positive")
	case cfg.StartService == nil:
		return nil, errors.New("new start jobs consumer: nil start service")
	case cfg.JobResults == nil:
		return nil, errors.New("new start jobs consumer: nil job results publisher")
	case cfg.OffsetStore == nil:
		return nil, errors.New("new start jobs consumer: nil offset store")
	}

	logger := cfg.Logger
	if logger == nil {
		logger = slog.Default()
	}
	return &Consumer{
		client:       cfg.Client,
		stream:       cfg.Stream,
		blockTimeout: cfg.BlockTimeout,
		startService: cfg.StartService,
		jobResults:   cfg.JobResults,
		offsetStore:  cfg.OffsetStore,
		logger:       logger.With("worker", "rtmanager.startjobs", "stream", cfg.Stream),
	}, nil
}

// Run drives the XREAD loop until ctx is cancelled. Per-message
// outcomes are absorbed by HandleMessage; the loop only exits on
// context cancellation or a fatal Redis / offset-store error.
func (consumer *Consumer) Run(ctx context.Context) error {
	if consumer == nil || consumer.client == nil {
		return errors.New("run start jobs consumer: nil consumer")
	}
	if ctx == nil {
		return errors.New("run start jobs consumer: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}

	lastID, found, err := consumer.offsetStore.Load(ctx, streamOffsetLabel)
	if err != nil {
		return fmt.Errorf("run start jobs consumer: load offset: %w", err)
	}
	if !found {
		lastID = "0-0"
	}

	consumer.logger.Info("start jobs consumer started",
		"block_timeout", consumer.blockTimeout.String(),
		"start_entry_id", lastID,
	)
	defer consumer.logger.Info("start jobs consumer stopped")

	for {
		streams, err := consumer.client.XRead(ctx, &redis.XReadArgs{
			Streams: []string{consumer.stream, lastID},
			Count:   1,
			Block:   consumer.blockTimeout,
		}).Result()
		switch {
		case err == nil:
			for _, stream := range streams {
				for _, message := range stream.Messages {
					consumer.HandleMessage(ctx, message)
					if err := consumer.offsetStore.Save(ctx, streamOffsetLabel, message.ID); err != nil {
						return fmt.Errorf("run start jobs consumer: save offset: %w", err)
					}
					lastID = message.ID
				}
			}
		case errors.Is(err, redis.Nil):
			continue
		case ctx.Err() != nil && (errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) || errors.Is(err, redis.ErrClosed)):
			return ctx.Err()
		case errors.Is(err, context.Canceled), errors.Is(err, context.DeadlineExceeded), errors.Is(err, redis.ErrClosed):
			return fmt.Errorf("run start jobs consumer: %w", err)
		default:
			return fmt.Errorf("run start jobs consumer: %w", err)
		}
	}
}
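
// Restart semantics, for reference: with no saved offset the loop reads
// from "0-0", the start of the stream. Because the offset is saved only
// after HandleMessage returns, an envelope is re-handed to the start
// service when the process dies between handling and the save; the
// start service's replay safety turns that re-run into `replay_no_op`.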

// Shutdown is a no-op; the consumer relies on context cancellation.
func (consumer *Consumer) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown start jobs consumer: nil context")
	}
	return nil
}

// HandleMessage processes one Redis Stream message. Exported so tests
// can drive the consumer deterministically without spinning up a real
// XREAD loop.
//
// Per-message errors are logged and absorbed: the worker keeps running
// and the offset is allowed to advance.
func (consumer *Consumer) HandleMessage(ctx context.Context, message redis.XMessage) {
	if consumer == nil {
		return
	}

	envelope, err := decodeStartJob(message)
	if err != nil {
		consumer.logger.WarnContext(ctx, "decode start job",
			"stream_entry_id", message.ID,
			"err", err.Error(),
		)
		return
	}

	input := startruntime.Input{
		GameID:    envelope.GameID,
		ImageRef:  envelope.ImageRef,
		OpSource:  operation.OpSourceLobbyStream,
		SourceRef: message.ID,
	}
	result, err := consumer.startService.Handle(ctx, input)
	if err != nil {
		consumer.logger.ErrorContext(ctx, "start service returned go-level error",
			"stream_entry_id", message.ID,
			"game_id", envelope.GameID,
			"err", err.Error(),
		)
		return
	}

	jobResult := buildJobResult(envelope.GameID, result)
	if err := consumer.jobResults.Publish(ctx, jobResult); err != nil {
		consumer.logger.ErrorContext(ctx, "publish job result",
			"stream_entry_id", message.ID,
			"game_id", envelope.GameID,
			"outcome", jobResult.Outcome,
			"error_code", jobResult.ErrorCode,
			"err", err.Error(),
		)
		return
	}

	logArgs := []any{
		"stream_entry_id", message.ID,
		"game_id", envelope.GameID,
		"outcome", jobResult.Outcome,
		"error_code", jobResult.ErrorCode,
		"requested_at_ms", envelope.RequestedAtMS,
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	consumer.logger.InfoContext(ctx, "start job processed", logArgs...)
}

// startJobEnvelope stores the decoded shape of one `runtime:start_jobs`
// stream entry.
type startJobEnvelope struct {
	GameID        string
	ImageRef      string
	RequestedAtMS int64
}

func decodeStartJob(message redis.XMessage) (startJobEnvelope, error) {
	gameID := strings.TrimSpace(optionalString(message.Values, fieldGameID))
	if gameID == "" {
		return startJobEnvelope{}, errors.New("missing game_id")
	}
	imageRef := strings.TrimSpace(optionalString(message.Values, fieldImageRef))
	if imageRef == "" {
		return startJobEnvelope{}, errors.New("missing image_ref")
	}
	requestedAtMS, err := optionalInt64(message.Values, fieldRequestedAtMS)
	if err != nil {
		return startJobEnvelope{}, fmt.Errorf("invalid requested_at_ms: %w", err)
	}
	return startJobEnvelope{
		GameID:        gameID,
		ImageRef:      imageRef,
		RequestedAtMS: requestedAtMS,
	}, nil
}

// buildJobResult translates a startruntime.Result into the wire payload
// published on `runtime:job_results`. ContainerID and EngineEndpoint are
// taken from the service's Record on success / replay; on failure the
// service returns a zero Record and both fields stay empty per the
// AsyncAPI contract (required field, empty string is a valid value).
func buildJobResult(gameID string, result startruntime.Result) ports.JobResult {
	jobResult := ports.JobResult{
		GameID:       gameID,
		Outcome:      string(result.Outcome),
		ErrorCode:    result.ErrorCode,
		ErrorMessage: result.ErrorMessage,
	}
	if result.Outcome == operation.OutcomeSuccess {
		jobResult.ContainerID = result.Record.CurrentContainerID
		jobResult.EngineEndpoint = result.Record.EngineEndpoint
	}
	return jobResult
}
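
// For example (illustrative values), a successful start yields
//
//	ports.JobResult{GameID: "game-a", Outcome: "success",
//		ContainerID: "ctr-game-a", EngineEndpoint: "http://galaxy-game-game-a:8080"}
//
// while a failed start leaves ContainerID and EngineEndpoint empty and
// carries the service's ErrorCode / ErrorMessage through verbatim.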

func optionalString(values map[string]any, key string) string {
	raw, ok := values[key]
	if !ok {
		return ""
	}
	switch typed := raw.(type) {
	case string:
		return typed
	case []byte:
		return string(typed)
	default:
		return ""
	}
}

func optionalInt64(values map[string]any, key string) (int64, error) {
	raw, ok := values[key]
	if !ok {
		return 0, nil
	}
	var stringValue string
	switch typed := raw.(type) {
	case string:
		stringValue = typed
	case []byte:
		stringValue = string(typed)
	default:
		return 0, fmt.Errorf("unsupported type %T", raw)
	}
	stringValue = strings.TrimSpace(stringValue)
	if stringValue == "" {
		return 0, nil
	}
	parsed, err := strconv.ParseInt(stringValue, 10, 64)
	if err != nil {
		return 0, err
	}
	return parsed, nil
}

@@ -0,0 +1,631 @@
package startjobsconsumer_test

import (
	"context"
	"errors"
	"io"
	"log/slog"
	"strconv"
	"sync"
	"testing"
	"time"

	"galaxy/notificationintent"
	"galaxy/rtmanager/internal/adapters/docker/mocks"
	"galaxy/rtmanager/internal/adapters/jobresultspublisher"
	"galaxy/rtmanager/internal/adapters/redisstate/streamoffsets"
	"galaxy/rtmanager/internal/config"
	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/startruntime"
	"galaxy/rtmanager/internal/telemetry"
	"galaxy/rtmanager/internal/worker/startjobsconsumer"

	"github.com/alicebob/miniredis/v2"
	"github.com/redis/go-redis/v9"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"go.uber.org/mock/gomock"
)

func silentLogger() *slog.Logger {
	return slog.New(slog.NewTextHandler(io.Discard, nil))
}

type fakeStartService struct {
	mu     sync.Mutex
	inputs []startruntime.Input
	result startruntime.Result
	err    error
	hook   func(input startruntime.Input) (startruntime.Result, error)
}

func (s *fakeStartService) Handle(_ context.Context, input startruntime.Input) (startruntime.Result, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.inputs = append(s.inputs, input)
	if s.hook != nil {
		return s.hook(input)
	}
	return s.result, s.err
}

func (s *fakeStartService) Inputs() []startruntime.Input {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]startruntime.Input, len(s.inputs))
	copy(out, s.inputs)
	return out
}

type fakeJobResults struct {
	mu         sync.Mutex
	published  []ports.JobResult
	publishErr error
}

func (s *fakeJobResults) Publish(_ context.Context, result ports.JobResult) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.publishErr != nil {
		return s.publishErr
	}
	s.published = append(s.published, result)
	return nil
}

func (s *fakeJobResults) Published() []ports.JobResult {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]ports.JobResult, len(s.published))
	copy(out, s.published)
	return out
}

type fakeOffsetStore struct {
	mu      sync.Mutex
	offsets map[string]string
	loadErr error
	saveErr error
}

func newFakeOffsetStore() *fakeOffsetStore {
	return &fakeOffsetStore{offsets: map[string]string{}}
}

func (s *fakeOffsetStore) Load(_ context.Context, label string) (string, bool, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.loadErr != nil {
		return "", false, s.loadErr
	}
	value, ok := s.offsets[label]
	return value, ok, nil
}

func (s *fakeOffsetStore) Save(_ context.Context, label, entryID string) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.saveErr != nil {
		return s.saveErr
	}
	s.offsets[label] = entryID
	return nil
}

func (s *fakeOffsetStore) Get(label string) (string, bool) {
	s.mu.Lock()
	defer s.mu.Unlock()
	value, ok := s.offsets[label]
	return value, ok
}
|
||||
|
||||
type harness struct {
|
||||
consumer *startjobsconsumer.Consumer
|
||||
starts *fakeStartService
|
||||
results *fakeJobResults
|
||||
offsets *fakeOffsetStore
|
||||
stream string
|
||||
server *miniredis.Miniredis
|
||||
client *redis.Client
|
||||
}
|
||||
|
||||
func newHarness(t *testing.T) *harness {
|
||||
t.Helper()
|
||||
server := miniredis.RunT(t)
|
||||
client := redis.NewClient(&redis.Options{Addr: server.Addr()})
|
||||
t.Cleanup(func() { _ = client.Close() })
|
||||
|
||||
starts := &fakeStartService{}
|
||||
results := &fakeJobResults{}
|
||||
offsets := newFakeOffsetStore()
|
||||
stream := "runtime:start_jobs"
|
||||
|
||||
consumer, err := startjobsconsumer.NewConsumer(startjobsconsumer.Config{
|
||||
Client: client,
|
||||
Stream: stream,
|
||||
BlockTimeout: 50 * time.Millisecond,
|
||||
StartService: starts,
|
||||
JobResults: results,
|
||||
OffsetStore: offsets,
|
||||
Logger: silentLogger(),
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
return &harness{
|
||||
consumer: consumer,
|
||||
starts: starts,
|
||||
results: results,
|
||||
offsets: offsets,
|
||||
stream: stream,
|
||||
server: server,
|
||||
client: client,
|
||||
}
|
||||
}
|
||||
|
||||
func startMessage(id, gameID, imageRef string, requestedAtMS int64) redis.XMessage {
|
||||
return redis.XMessage{
|
||||
ID: id,
|
||||
Values: map[string]any{
|
||||
"game_id": gameID,
|
||||
"image_ref": imageRef,
|
||||
"requested_at_ms": strconv.FormatInt(requestedAtMS, 10),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewConsumerRejectsMissingDeps(t *testing.T) {
|
||||
server := miniredis.RunT(t)
|
||||
client := redis.NewClient(&redis.Options{Addr: server.Addr()})
|
||||
t.Cleanup(func() { _ = client.Close() })
|
||||
|
||||
cases := []startjobsconsumer.Config{
|
||||
{},
|
||||
{Client: client},
|
||||
{Client: client, Stream: "runtime:start_jobs"},
|
||||
{Client: client, Stream: "runtime:start_jobs", BlockTimeout: time.Second},
|
||||
{Client: client, Stream: "runtime:start_jobs", BlockTimeout: time.Second, StartService: &fakeStartService{}},
|
||||
{Client: client, Stream: "runtime:start_jobs", BlockTimeout: time.Second, StartService: &fakeStartService{}, JobResults: &fakeJobResults{}},
|
||||
}
|
||||
for index, cfg := range cases {
|
||||
_, err := startjobsconsumer.NewConsumer(cfg)
|
||||
require.Errorf(t, err, "case %d should fail", index)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleMessageSuccessPublishesSuccessResult(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.starts.result = startruntime.Result{
|
||||
Record: runtime.RuntimeRecord{
|
||||
GameID: "game-1",
|
||||
Status: runtime.StatusRunning,
|
||||
CurrentContainerID: "c-1",
|
||||
EngineEndpoint: "http://galaxy-game-game-1:8080",
|
||||
},
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
}
|
||||
|
||||
h.consumer.HandleMessage(context.Background(), startMessage("100-0", "game-1", "galaxy/game:1.0.0", 1700))
|
||||
|
||||
inputs := h.starts.Inputs()
|
||||
require.Len(t, inputs, 1)
|
||||
assert.Equal(t, "game-1", inputs[0].GameID)
|
||||
assert.Equal(t, "galaxy/game:1.0.0", inputs[0].ImageRef)
|
||||
assert.Equal(t, operation.OpSourceLobbyStream, inputs[0].OpSource)
|
||||
assert.Equal(t, "100-0", inputs[0].SourceRef)
|
||||
|
||||
published := h.results.Published()
|
||||
require.Len(t, published, 1)
|
||||
assert.Equal(t, ports.JobResult{
|
||||
GameID: "game-1",
|
||||
Outcome: ports.JobOutcomeSuccess,
|
||||
ContainerID: "c-1",
|
||||
EngineEndpoint: "http://galaxy-game-game-1:8080",
|
||||
}, published[0])
|
||||
}
|
||||
|
||||
func TestHandleMessageFailurePublishesFailureResult(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.starts.result = startruntime.Result{
|
||||
Outcome: operation.OutcomeFailure,
|
||||
ErrorCode: startruntime.ErrorCodeImagePullFailed,
|
||||
ErrorMessage: "manifest unknown",
|
||||
}
|
||||
|
||||
h.consumer.HandleMessage(context.Background(), startMessage("101-0", "game-2", "galaxy/game:bad", 1700))
|
||||
|
||||
published := h.results.Published()
|
||||
require.Len(t, published, 1)
|
||||
assert.Equal(t, ports.JobResult{
|
||||
GameID: "game-2",
|
||||
Outcome: ports.JobOutcomeFailure,
|
||||
ErrorCode: "image_pull_failed",
|
||||
ErrorMessage: "manifest unknown",
|
||||
}, published[0])
|
||||
}
|
||||
|
||||
func TestHandleMessageReplayNoOpKeepsContainerAndEndpoint(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.starts.result = startruntime.Result{
|
||||
Record: runtime.RuntimeRecord{
|
||||
GameID: "game-3",
|
||||
Status: runtime.StatusRunning,
|
||||
CurrentContainerID: "c-3",
|
||||
EngineEndpoint: "http://galaxy-game-game-3:8080",
|
||||
},
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
ErrorCode: startruntime.ErrorCodeReplayNoOp,
|
||||
}
|
||||
|
||||
h.consumer.HandleMessage(context.Background(), startMessage("102-0", "game-3", "galaxy/game:1.0.0", 1700))
|
||||
|
||||
published := h.results.Published()
|
||||
require.Len(t, published, 1)
|
||||
assert.Equal(t, ports.JobResult{
|
||||
GameID: "game-3",
|
||||
Outcome: ports.JobOutcomeSuccess,
|
||||
ContainerID: "c-3",
|
||||
EngineEndpoint: "http://galaxy-game-game-3:8080",
|
||||
ErrorCode: "replay_no_op",
|
||||
}, published[0])
|
||||
}
|
||||
|
||||
func TestHandleMessageMalformedEnvelopesAreAbsorbed(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
|
||||
cases := []redis.XMessage{
|
||||
{ID: "200-0", Values: map[string]any{"image_ref": "galaxy/game:1.0.0", "requested_at_ms": "1"}},
|
||||
{ID: "200-1", Values: map[string]any{"game_id": " ", "image_ref": "galaxy/game:1.0.0", "requested_at_ms": "1"}},
|
||||
{ID: "200-2", Values: map[string]any{"game_id": "game-x", "requested_at_ms": "1"}},
|
||||
{ID: "200-3", Values: map[string]any{"game_id": "game-x", "image_ref": " ", "requested_at_ms": "1"}},
|
||||
{ID: "200-4", Values: map[string]any{"game_id": "game-x", "image_ref": "galaxy/game:1.0.0", "requested_at_ms": "not-a-number"}},
|
||||
}
|
||||
for _, msg := range cases {
|
||||
h.consumer.HandleMessage(context.Background(), msg)
|
||||
}
|
||||
|
||||
assert.Empty(t, h.starts.Inputs(), "malformed envelopes must not reach the start service")
|
||||
assert.Empty(t, h.results.Published(), "malformed envelopes must not produce job results")
|
||||
}
|
||||
|
||||
func TestHandleMessagePublishFailureIsAbsorbed(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.starts.result = startruntime.Result{Outcome: operation.OutcomeFailure, ErrorCode: "internal_error"}
|
||||
h.results.publishErr = errors.New("redis transient")
|
||||
|
||||
h.consumer.HandleMessage(context.Background(), startMessage("300-0", "game-x", "galaxy/game:1.0.0", 1700))
|
||||
|
||||
require.Len(t, h.starts.Inputs(), 1, "service still runs even when publish fails")
|
||||
}
|
||||
|
||||
func TestHandleMessageGoLevelErrorIsAbsorbed(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.starts.err = errors.New("nil ctx")
|
||||
|
||||
h.consumer.HandleMessage(context.Background(), startMessage("400-0", "game-y", "galaxy/game:1.0.0", 1700))
|
||||
|
||||
assert.Empty(t, h.results.Published(), "go-level service errors must not surface as job results")
|
||||
}
|
||||
|
||||
func TestRunAdvancesOffsetPerMessage(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.starts.result = startruntime.Result{
|
||||
Record: runtime.RuntimeRecord{
|
||||
GameID: "game-5",
|
||||
Status: runtime.StatusRunning,
|
||||
CurrentContainerID: "c-5",
|
||||
EngineEndpoint: "http://galaxy-game-game-5:8080",
|
||||
},
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
done := make(chan error, 1)
|
||||
go func() { done <- h.consumer.Run(ctx) }()
|
||||
|
||||
mustXAdd(t, h.client, h.stream, "game-5", "galaxy/game:1.0.0", 1)
|
||||
mustXAdd(t, h.client, h.stream, "game-5", "galaxy/game:1.0.0", 2)
|
||||
|
||||
require.Eventually(t, func() bool {
|
||||
return len(h.results.Published()) == 2
|
||||
}, time.Second, 10*time.Millisecond, "consumer must produce one job result per envelope")
|
||||
|
||||
cancel()
|
||||
require.Eventually(t, func() bool {
|
||||
select {
|
||||
case <-done:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}, time.Second, 10*time.Millisecond, "Run must exit after context cancel")
|
||||
|
||||
id, ok := h.offsets.Get("startjobs")
|
||||
require.True(t, ok, "offset must be persisted after the run loop processed messages")
|
||||
assert.NotEmpty(t, id, "offset entry id must not be empty")
|
||||
}
|
||||
|
||||
func TestRunResumesFromPersistedOffset(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.starts.result = startruntime.Result{
|
||||
Record: runtime.RuntimeRecord{
|
||||
GameID: "game-6",
|
||||
Status: runtime.StatusRunning,
|
||||
CurrentContainerID: "c-6",
|
||||
EngineEndpoint: "http://galaxy-game-game-6:8080",
|
||||
},
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
}
|
||||
|
||||
preID := mustXAdd(t, h.client, h.stream, "game-6", "galaxy/game:1.0.0", 1)
|
||||
require.NoError(t, h.offsets.Save(context.Background(), "startjobs", preID))
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
done := make(chan error, 1)
|
||||
go func() { done <- h.consumer.Run(ctx) }()
|
||||
|
||||
mustXAdd(t, h.client, h.stream, "game-6", "galaxy/game:1.0.0", 2)
|
||||
|
||||
require.Eventually(t, func() bool {
|
||||
return len(h.results.Published()) == 1
|
||||
}, time.Second, 10*time.Millisecond, "consumer must skip the pre-existing entry and process only the new one")
|
||||
|
||||
cancel()
|
||||
<-done
|
||||
}
|
||||
|
||||
func TestRunExitsImmediatelyOnAlreadyCancelledContext(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cancel()
|
||||
|
||||
err := h.consumer.Run(ctx)
|
||||
require.ErrorIs(t, err, context.Canceled)
|
||||
assert.Empty(t, h.starts.Inputs())
|
||||
assert.Empty(t, h.results.Published())
|
||||
}
|
||||
|
||||
func mustXAdd(t *testing.T, client *redis.Client, stream, gameID, imageRef string, requestedAtMS int64) string {
|
||||
t.Helper()
|
||||
id, err := client.XAdd(context.Background(), &redis.XAddArgs{
|
||||
Stream: stream,
|
||||
Values: map[string]any{
|
||||
"game_id": gameID,
|
||||
"image_ref": imageRef,
|
||||
"requested_at_ms": strconv.FormatInt(requestedAtMS, 10),
|
||||
},
|
||||
}).Result()
|
||||
require.NoError(t, err)
|
||||
return id
|
||||
}
|
||||
|
||||
// --- in-memory fakes for the roundtrip integration test ----------------------
|
||||
|
||||
type memoryRecords struct {
|
||||
mu sync.Mutex
|
||||
store map[string]runtime.RuntimeRecord
|
||||
}
|
||||
|
||||
func newMemoryRecords() *memoryRecords {
|
||||
return &memoryRecords{store: map[string]runtime.RuntimeRecord{}}
|
||||
}
|
||||
|
||||
func (s *memoryRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
record, ok := s.store[gameID]
|
||||
if !ok {
|
||||
return runtime.RuntimeRecord{}, runtime.ErrNotFound
|
||||
}
|
||||
return record, nil
|
||||
}
|
||||
|
||||
func (s *memoryRecords) Upsert(_ context.Context, record runtime.RuntimeRecord) error {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
s.store[record.GameID] = record
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *memoryRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error {
|
||||
return errors.New("not used in start integration test")
|
||||
}
|
||||
|
||||
func (s *memoryRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) {
|
||||
return nil, errors.New("not used in start integration test")
|
||||
}
|
||||
|
||||
func (s *memoryRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
|
||||
return nil, errors.New("not used in start integration test")
|
||||
}
|
||||
|
||||
type memoryOperationLogs struct {
|
||||
mu sync.Mutex
|
||||
entries []operation.OperationEntry
|
||||
}
|
||||
|
||||
func (s *memoryOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
s.entries = append(s.entries, entry)
|
||||
return int64(len(s.entries)), nil
|
||||
}
|
||||
|
||||
func (s *memoryOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) {
|
||||
return nil, errors.New("not used in start integration test")
|
||||
}
|
||||
|
||||
type memoryLeases struct{}
|
||||
|
||||
func (l *memoryLeases) TryAcquire(_ context.Context, _, _ string, _ time.Duration) (bool, error) {
|
||||
return true, nil
|
||||
}
|
||||
|
||||
func (l *memoryLeases) Release(_ context.Context, _, _ string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
type memoryHealthEvents struct{}
|
||||
|
||||
func (h *memoryHealthEvents) Publish(_ context.Context, _ ports.HealthEventEnvelope) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
type memoryNotifications struct{}
|
||||
|
||||
func (n *memoryNotifications) Publish(_ context.Context, _ notificationintent.Intent) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// TestRoundTripStartJobThroughRealServiceAndPublisher exercises the
|
||||
// Lobby → RTM → Lobby contract end-to-end inside one process: an XADD
|
||||
// in the documented `runtime:start_jobs` shape is consumed, the real
|
||||
// `startruntime.Service` runs against an in-memory fake stack and a
|
||||
// gomock-backed Docker port, the real `jobresultspublisher` writes to
|
||||
// `runtime:job_results`, and the test asserts the symmetric wire shape.
|
||||
//
|
||||
// A second XADD of the same envelope must surface as
|
||||
// `error_code=replay_no_op` per the AsyncAPI replay-safety rule.
|
||||
func TestRoundTripStartJobThroughRealServiceAndPublisher(t *testing.T) {
|
||||
ctrl := gomock.NewController(t)
|
||||
t.Cleanup(ctrl.Finish)
|
||||
|
||||
server := miniredis.RunT(t)
|
||||
client := redis.NewClient(&redis.Options{Addr: server.Addr()})
|
||||
t.Cleanup(func() { _ = client.Close() })
|
||||
|
||||
now := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)
|
||||
records := newMemoryRecords()
|
||||
dockerMock := mocks.NewMockDockerClient(ctrl)
|
||||
|
||||
dockerMock.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil).Times(1)
|
||||
dockerMock.EXPECT().PullImage(gomock.Any(), "galaxy/game:1.0.0", ports.PullPolicy(config.ImagePullPolicyIfMissing)).Return(nil).Times(1)
|
||||
dockerMock.EXPECT().InspectImage(gomock.Any(), "galaxy/game:1.0.0").Return(ports.ImageInspect{
|
||||
Ref: "galaxy/game:1.0.0",
|
||||
Labels: map[string]string{},
|
||||
}, nil).Times(1)
|
||||
dockerMock.EXPECT().Run(gomock.Any(), gomock.Any()).Return(ports.RunResult{
|
||||
ContainerID: "ctr-roundtrip",
|
||||
EngineEndpoint: "http://galaxy-game-game-1:8080",
|
||||
StartedAt: now,
|
||||
}, nil).Times(1)
|
||||
|
||||
telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
|
||||
require.NoError(t, err)
|
||||
|
||||
containerCfg := config.ContainerConfig{
|
||||
DefaultCPUQuota: 1.0,
|
||||
DefaultMemory: "512m",
|
||||
DefaultPIDsLimit: 512,
|
||||
StopTimeout: 30 * time.Second,
|
||||
Retention: 30 * 24 * time.Hour,
|
||||
EngineStateMountPath: "/var/lib/galaxy-game",
|
||||
EngineStateEnvName: "GAME_STATE_PATH",
|
||||
GameStateDirMode: 0o750,
|
||||
GameStateRoot: "/var/lib/galaxy/games",
|
||||
}
|
||||
dockerCfg := config.DockerConfig{
|
||||
Host: "unix:///var/run/docker.sock",
|
||||
Network: "galaxy-net",
|
||||
LogDriver: "json-file",
|
||||
PullPolicy: config.ImagePullPolicyIfMissing,
|
||||
}
|
||||
coordinationCfg := config.CoordinationConfig{GameLeaseTTL: time.Minute}
|
||||
|
||||
startService, err := startruntime.NewService(startruntime.Dependencies{
|
||||
RuntimeRecords: records,
|
||||
OperationLogs: &memoryOperationLogs{},
|
||||
Docker: dockerMock,
|
||||
Leases: &memoryLeases{},
|
||||
HealthEvents: &memoryHealthEvents{},
|
||||
Notifications: &memoryNotifications{},
|
||||
Container: containerCfg,
|
||||
DockerCfg: dockerCfg,
|
||||
Coordination: coordinationCfg,
|
||||
Telemetry: telemetryRuntime,
|
||||
Logger: silentLogger(),
|
||||
Clock: func() time.Time { return now },
|
||||
NewToken: func() string { return "token-roundtrip" },
|
||||
PrepareStateDir: func(_ string) (string, error) {
|
||||
return "/var/lib/galaxy/games/game-1", nil
|
||||
},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
publisher, err := jobresultspublisher.NewPublisher(jobresultspublisher.Config{
|
||||
Client: client,
|
||||
Stream: "runtime:job_results",
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
offsetStore, err := streamoffsets.New(streamoffsets.Config{Client: client})
|
||||
require.NoError(t, err)
|
||||
|
||||
consumer, err := startjobsconsumer.NewConsumer(startjobsconsumer.Config{
|
||||
Client: client,
|
||||
Stream: "runtime:start_jobs",
|
||||
BlockTimeout: 50 * time.Millisecond,
|
||||
StartService: startService,
|
||||
JobResults: publisher,
|
||||
OffsetStore: offsetStore,
|
||||
Logger: silentLogger(),
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
t.Cleanup(cancel)
|
||||
|
||||
done := make(chan error, 1)
|
||||
go func() { done <- consumer.Run(ctx) }()
|
||||
|
||||
mustXAdd(t, client, "runtime:start_jobs", "game-1", "galaxy/game:1.0.0", 1700)
|
||||
|
||||
require.Eventually(t, func() bool {
|
||||
entries, err := client.XRange(ctx, "runtime:job_results", "-", "+").Result()
|
||||
return err == nil && len(entries) == 1
|
||||
}, 2*time.Second, 20*time.Millisecond, "first XADD must produce one job result entry")
|
||||
|
||||
entries, err := client.XRange(ctx, "runtime:job_results", "-", "+").Result()
|
||||
require.NoError(t, err)
|
||||
require.Len(t, entries, 1)
|
||||
values := entries[0].Values
|
||||
assert.Equal(t, "game-1", values["game_id"])
|
||||
assert.Equal(t, "success", values["outcome"])
|
||||
assert.Equal(t, "ctr-roundtrip", values["container_id"])
|
||||
assert.Equal(t, "http://galaxy-game-game-1:8080", values["engine_endpoint"])
|
||||
assert.Equal(t, "", values["error_code"], "fresh start must publish empty error_code")
|
||||
assert.Equal(t, "", values["error_message"])
|
||||
|
||||
// Replay: the same envelope must surface as success/replay_no_op
|
||||
// because the runtime record now reports `running` with the same
|
||||
// image_ref. The Docker mock has no further expectations, so a
|
||||
// second pull/run would fail the test.
|
||||
mustXAdd(t, client, "runtime:start_jobs", "game-1", "galaxy/game:1.0.0", 1701)
|
||||
|
||||
require.Eventually(t, func() bool {
|
||||
entries, err := client.XRange(ctx, "runtime:job_results", "-", "+").Result()
|
||||
return err == nil && len(entries) == 2
|
||||
}, 2*time.Second, 20*time.Millisecond, "second XADD must produce a replay_no_op job result")
|
||||
|
||||
entries, err = client.XRange(ctx, "runtime:job_results", "-", "+").Result()
|
||||
require.NoError(t, err)
|
||||
require.Len(t, entries, 2)
|
||||
replay := entries[1].Values
|
||||
assert.Equal(t, "game-1", replay["game_id"])
|
||||
assert.Equal(t, "success", replay["outcome"])
|
||||
assert.Equal(t, "ctr-roundtrip", replay["container_id"])
|
||||
assert.Equal(t, "http://galaxy-game-game-1:8080", replay["engine_endpoint"])
|
||||
assert.Equal(t, "replay_no_op", replay["error_code"])
|
||||
assert.Equal(t, "", replay["error_message"])
|
||||
|
||||
cancel()
|
||||
select {
|
||||
case <-done:
|
||||
case <-time.After(time.Second):
|
||||
t.Fatal("consumer Run did not exit after context cancel")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,332 @@
// Package stopjobsconsumer drives the asynchronous half of the
// Lobby ↔ Runtime Manager stop contract. The consumer XREADs from
// `runtime:stop_jobs` (produced by Lobby), decodes the envelope frozen
// in `rtmanager/api/runtime-jobs-asyncapi.yaml`, calls the production
// stop orchestrator, and publishes one `runtime:job_results` outcome
// per consumed envelope.
//
// Replay safety: the stop service surfaces an already-stopped or
// already-removed record as `Outcome=success` with
// `error_code=replay_no_op`. The consumer copies the result fields
// into the wire payload verbatim. Per-message decode and publish
// errors are logged and absorbed; the offset advances unconditionally
// so a single poison message cannot pin the loop. Design rationale is
// captured in `rtmanager/docs/workers.md`.
package stopjobsconsumer

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"strconv"
	"strings"
	"time"

	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/logging"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/stopruntime"

	"github.com/redis/go-redis/v9"
)

// streamOffsetLabel identifies the stop-jobs consumer in the stream
// offset store. Matches the convention from
// `rtmanager/README.md §Persistence Layout → Redis runtime-coordination state`.
const streamOffsetLabel = "stopjobs"

// Wire field names of the `RuntimeStopJob` payload. Frozen by
// `rtmanager/api/runtime-jobs-asyncapi.yaml`.
const (
	fieldGameID        = "game_id"
	fieldReason        = "reason"
	fieldRequestedAtMS = "requested_at_ms"
)
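
// Illustrative stop-job envelope (hypothetical values) as Lobby would
// XADD it onto the stream:
//
//	XADD runtime:stop_jobs * game_id game-1 reason cancelled requested_at_ms 1700000000000
//
// `reason` must decode to a known stopruntime.StopReason; see decodeStopJob.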

// StopService is the narrow surface the consumer needs from the stop
// orchestrator. The concrete `*stopruntime.Service` satisfies this
// interface and is wired in production.
type StopService interface {
	Handle(ctx context.Context, input stopruntime.Input) (stopruntime.Result, error)
}

// Config groups the dependencies required to construct a Consumer.
type Config struct {
	// Client provides XREAD access to the stop-jobs stream.
	Client *redis.Client

	// Stream stores the Redis Streams key consumed by the worker.
	Stream string

	// BlockTimeout bounds the blocking XREAD window.
	BlockTimeout time.Duration

	// StopService executes the stop lifecycle for each decoded envelope.
	StopService StopService

	// JobResults publishes one outcome entry per processed envelope.
	JobResults ports.JobResultPublisher

	// OffsetStore persists the last successfully processed entry id so
	// the consumer survives restarts without replaying processed
	// envelopes.
	OffsetStore ports.StreamOffsetStore

	// Logger receives structured worker-level events. Defaults to
	// `slog.Default` when nil.
	Logger *slog.Logger
}

// Consumer drives the stop-jobs processing loop.
type Consumer struct {
	client       *redis.Client
	stream       string
	blockTimeout time.Duration
	stopService  StopService
	jobResults   ports.JobResultPublisher
	offsetStore  ports.StreamOffsetStore
	logger       *slog.Logger
}

// NewConsumer constructs one Consumer from cfg.
func NewConsumer(cfg Config) (*Consumer, error) {
	switch {
	case cfg.Client == nil:
		return nil, errors.New("new stop jobs consumer: nil redis client")
	case strings.TrimSpace(cfg.Stream) == "":
		return nil, errors.New("new stop jobs consumer: stream must not be empty")
	case cfg.BlockTimeout <= 0:
		return nil, errors.New("new stop jobs consumer: block timeout must be positive")
	case cfg.StopService == nil:
		return nil, errors.New("new stop jobs consumer: nil stop service")
	case cfg.JobResults == nil:
		return nil, errors.New("new stop jobs consumer: nil job results publisher")
	case cfg.OffsetStore == nil:
		return nil, errors.New("new stop jobs consumer: nil offset store")
	}

	logger := cfg.Logger
	if logger == nil {
		logger = slog.Default()
	}
	return &Consumer{
		client:       cfg.Client,
		stream:       cfg.Stream,
		blockTimeout: cfg.BlockTimeout,
		stopService:  cfg.StopService,
		jobResults:   cfg.JobResults,
		offsetStore:  cfg.OffsetStore,
		logger:       logger.With("worker", "rtmanager.stopjobs", "stream", cfg.Stream),
	}, nil
}

// Run drives the XREAD loop until ctx is cancelled.
func (consumer *Consumer) Run(ctx context.Context) error {
	if consumer == nil || consumer.client == nil {
		return errors.New("run stop jobs consumer: nil consumer")
	}
	if ctx == nil {
		return errors.New("run stop jobs consumer: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}

	lastID, found, err := consumer.offsetStore.Load(ctx, streamOffsetLabel)
	if err != nil {
		return fmt.Errorf("run stop jobs consumer: load offset: %w", err)
	}
	if !found {
		lastID = "0-0"
	}

	consumer.logger.Info("stop jobs consumer started",
		"block_timeout", consumer.blockTimeout.String(),
		"start_entry_id", lastID,
	)
	defer consumer.logger.Info("stop jobs consumer stopped")

	for {
		streams, err := consumer.client.XRead(ctx, &redis.XReadArgs{
			Streams: []string{consumer.stream, lastID},
			Count:   1,
			Block:   consumer.blockTimeout,
		}).Result()
		switch {
		case err == nil:
			for _, stream := range streams {
				for _, message := range stream.Messages {
					consumer.HandleMessage(ctx, message)
					if err := consumer.offsetStore.Save(ctx, streamOffsetLabel, message.ID); err != nil {
						return fmt.Errorf("run stop jobs consumer: save offset: %w", err)
					}
					lastID = message.ID
				}
			}
		case errors.Is(err, redis.Nil):
			// Block timeout elapsed without new entries; poll again.
			continue
		case ctx.Err() != nil && (errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) || errors.Is(err, redis.ErrClosed)):
			// Cancellation surfacing through XRead; report the context
			// error so callers see a clean shutdown.
			return ctx.Err()
		default:
			// Includes context or closed-client errors observed without a
			// cancelled ctx; treated like any other transport failure.
			return fmt.Errorf("run stop jobs consumer: %w", err)
		}
	}
}
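
// Minimal wiring sketch (illustrative; the production composition root
// owns construction and lifecycle, so `cfg`, `ctx`, and `logger` here
// are placeholders):
//
//	consumer, err := stopjobsconsumer.NewConsumer(cfg)
//	if err != nil {
//		return err
//	}
//	go func() {
//		if err := consumer.Run(ctx); err != nil && !errors.Is(err, context.Canceled) {
//			logger.Error("stop jobs consumer exited", "err", err)
//		}
//	}()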

// Shutdown is a no-op; the consumer relies on context cancellation.
func (consumer *Consumer) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown stop jobs consumer: nil context")
	}
	return nil
}

// HandleMessage processes one Redis Stream message. Exported so tests
// can drive the consumer deterministically without spinning up a real
// XREAD loop.
func (consumer *Consumer) HandleMessage(ctx context.Context, message redis.XMessage) {
	if consumer == nil {
		return
	}

	envelope, err := decodeStopJob(message)
	if err != nil {
		consumer.logger.WarnContext(ctx, "decode stop job",
			"stream_entry_id", message.ID,
			"err", err.Error(),
		)
		return
	}

	input := stopruntime.Input{
		GameID:    envelope.GameID,
		Reason:    envelope.Reason,
		OpSource:  operation.OpSourceLobbyStream,
		SourceRef: message.ID,
	}
	result, err := consumer.stopService.Handle(ctx, input)
	if err != nil {
		consumer.logger.ErrorContext(ctx, "stop service returned go-level error",
			"stream_entry_id", message.ID,
			"game_id", envelope.GameID,
			"err", err.Error(),
		)
		return
	}

	jobResult := buildJobResult(envelope.GameID, result)
	if err := consumer.jobResults.Publish(ctx, jobResult); err != nil {
		consumer.logger.ErrorContext(ctx, "publish job result",
			"stream_entry_id", message.ID,
			"game_id", envelope.GameID,
			"outcome", jobResult.Outcome,
			"error_code", jobResult.ErrorCode,
			"err", err.Error(),
		)
		return
	}

	logArgs := []any{
		"stream_entry_id", message.ID,
		"game_id", envelope.GameID,
		"reason", string(envelope.Reason),
		"outcome", jobResult.Outcome,
		"error_code", jobResult.ErrorCode,
		"requested_at_ms", envelope.RequestedAtMS,
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	consumer.logger.InfoContext(ctx, "stop job processed", logArgs...)
}

// stopJobEnvelope stores the decoded shape of one `runtime:stop_jobs`
// stream entry.
type stopJobEnvelope struct {
	GameID        string
	Reason        stopruntime.StopReason
	RequestedAtMS int64
}

func decodeStopJob(message redis.XMessage) (stopJobEnvelope, error) {
	gameID := strings.TrimSpace(optionalString(message.Values, fieldGameID))
	if gameID == "" {
		return stopJobEnvelope{}, errors.New("missing game_id")
	}
	reasonRaw := strings.TrimSpace(optionalString(message.Values, fieldReason))
	if reasonRaw == "" {
		return stopJobEnvelope{}, errors.New("missing reason")
	}
	reason := stopruntime.StopReason(reasonRaw)
	if !reason.IsKnown() {
		return stopJobEnvelope{}, fmt.Errorf("unsupported reason %q", reasonRaw)
	}
	requestedAtMS, err := optionalInt64(message.Values, fieldRequestedAtMS)
	if err != nil {
		return stopJobEnvelope{}, fmt.Errorf("invalid requested_at_ms: %w", err)
	}
	return stopJobEnvelope{
		GameID:        gameID,
		Reason:        reason,
		RequestedAtMS: requestedAtMS,
	}, nil
}
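
// Illustrative decode outcomes (hypothetical values, informal notation):
//
//	{game_id: "game-1", reason: "cancelled", requested_at_ms: "1700"}
//	    → stopJobEnvelope{GameID: "game-1", Reason: "cancelled", RequestedAtMS: 1700}
//	{game_id: "game-1", reason: "not_a_known_reason"}
//	    → error: unsupported reason "not_a_known_reason"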

// buildJobResult translates a stopruntime.Result into the wire payload
// published on `runtime:job_results`. Stop replays for `status=removed`
// records carry an empty `CurrentContainerID`; the consumer publishes
// the empty fields verbatim, which the AsyncAPI contract permits.
func buildJobResult(gameID string, result stopruntime.Result) ports.JobResult {
	jobResult := ports.JobResult{
		GameID:       gameID,
		Outcome:      string(result.Outcome),
		ErrorCode:    result.ErrorCode,
		ErrorMessage: result.ErrorMessage,
	}
	if result.Outcome == operation.OutcomeSuccess {
		jobResult.ContainerID = result.Record.CurrentContainerID
		jobResult.EngineEndpoint = result.Record.EngineEndpoint
	}
	return jobResult
}
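
// Illustrative sketch (hypothetical values): a stop replay against a
// removed record surfaces as success with `replay_no_op`, and the empty
// container id is published verbatim.
//
//	payload := buildJobResult("game-3", stopruntime.Result{
//		Outcome:   operation.OutcomeSuccess,
//		ErrorCode: "replay_no_op",
//	})
//	// payload.ContainerID == "" (permitted by the AsyncAPI contract)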

func optionalString(values map[string]any, key string) string {
	raw, ok := values[key]
	if !ok {
		return ""
	}
	switch typed := raw.(type) {
	case string:
		return typed
	case []byte:
		return string(typed)
	default:
		return ""
	}
}

func optionalInt64(values map[string]any, key string) (int64, error) {
	raw, ok := values[key]
	if !ok {
		return 0, nil
	}
	var stringValue string
	switch typed := raw.(type) {
	case string:
		stringValue = typed
	case []byte:
		stringValue = string(typed)
	default:
		return 0, fmt.Errorf("unsupported type %T", raw)
	}
	stringValue = strings.TrimSpace(stringValue)
	if stringValue == "" {
		return 0, nil
	}
	parsed, err := strconv.ParseInt(stringValue, 10, 64)
	if err != nil {
		return 0, err
	}
	return parsed, nil
}
@@ -0,0 +1,357 @@
package stopjobsconsumer_test

import (
	"context"
	"errors"
	"io"
	"log/slog"
	"strconv"
	"sync"
	"testing"
	"time"

	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/startruntime"
	"galaxy/rtmanager/internal/service/stopruntime"
	"galaxy/rtmanager/internal/worker/stopjobsconsumer"

	"github.com/alicebob/miniredis/v2"
	"github.com/redis/go-redis/v9"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func silentLogger() *slog.Logger {
	return slog.New(slog.NewTextHandler(io.Discard, nil))
}

type fakeStopService struct {
	mu     sync.Mutex
	inputs []stopruntime.Input
	result stopruntime.Result
	err    error
}

func (s *fakeStopService) Handle(_ context.Context, input stopruntime.Input) (stopruntime.Result, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.inputs = append(s.inputs, input)
	return s.result, s.err
}

func (s *fakeStopService) Inputs() []stopruntime.Input {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]stopruntime.Input, len(s.inputs))
	copy(out, s.inputs)
	return out
}

type fakeJobResults struct {
	mu         sync.Mutex
	published  []ports.JobResult
	publishErr error
}

func (s *fakeJobResults) Publish(_ context.Context, result ports.JobResult) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.publishErr != nil {
		return s.publishErr
	}
	s.published = append(s.published, result)
	return nil
}

func (s *fakeJobResults) Published() []ports.JobResult {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]ports.JobResult, len(s.published))
	copy(out, s.published)
	return out
}

type fakeOffsetStore struct {
	mu      sync.Mutex
	offsets map[string]string
}

func newFakeOffsetStore() *fakeOffsetStore {
	return &fakeOffsetStore{offsets: map[string]string{}}
}

func (s *fakeOffsetStore) Load(_ context.Context, label string) (string, bool, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	value, ok := s.offsets[label]
	return value, ok, nil
}

func (s *fakeOffsetStore) Save(_ context.Context, label, entryID string) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.offsets[label] = entryID
	return nil
}

func (s *fakeOffsetStore) Get(label string) (string, bool) {
	s.mu.Lock()
	defer s.mu.Unlock()
	value, ok := s.offsets[label]
	return value, ok
}

type harness struct {
	consumer *stopjobsconsumer.Consumer
	stops    *fakeStopService
	results  *fakeJobResults
	offsets  *fakeOffsetStore
	stream   string
	server   *miniredis.Miniredis
	client   *redis.Client
}

func newHarness(t *testing.T) *harness {
	t.Helper()
	server := miniredis.RunT(t)
	client := redis.NewClient(&redis.Options{Addr: server.Addr()})
	t.Cleanup(func() { _ = client.Close() })

	stops := &fakeStopService{}
	results := &fakeJobResults{}
	offsets := newFakeOffsetStore()
	stream := "runtime:stop_jobs"

	consumer, err := stopjobsconsumer.NewConsumer(stopjobsconsumer.Config{
		Client:       client,
		Stream:       stream,
		BlockTimeout: 50 * time.Millisecond,
		StopService:  stops,
		JobResults:   results,
		OffsetStore:  offsets,
		Logger:       silentLogger(),
	})
	require.NoError(t, err)

	return &harness{
		consumer: consumer,
		stops:    stops,
		results:  results,
		offsets:  offsets,
		stream:   stream,
		server:   server,
		client:   client,
	}
}

func stopMessage(id, gameID, reason string, requestedAtMS int64) redis.XMessage {
	return redis.XMessage{
		ID: id,
		Values: map[string]any{
			"game_id":         gameID,
			"reason":          reason,
			"requested_at_ms": strconv.FormatInt(requestedAtMS, 10),
		},
	}
}

func TestNewConsumerRejectsMissingDeps(t *testing.T) {
	server := miniredis.RunT(t)
	client := redis.NewClient(&redis.Options{Addr: server.Addr()})
	t.Cleanup(func() { _ = client.Close() })

	cases := []stopjobsconsumer.Config{
		{},
		{Client: client},
		{Client: client, Stream: "runtime:stop_jobs"},
		{Client: client, Stream: "runtime:stop_jobs", BlockTimeout: time.Second},
		{Client: client, Stream: "runtime:stop_jobs", BlockTimeout: time.Second, StopService: &fakeStopService{}},
		{Client: client, Stream: "runtime:stop_jobs", BlockTimeout: time.Second, StopService: &fakeStopService{}, JobResults: &fakeJobResults{}},
	}
	for index, cfg := range cases {
		_, err := stopjobsconsumer.NewConsumer(cfg)
		require.Errorf(t, err, "case %d should fail", index)
	}
}

func TestHandleMessageSuccessPublishesSuccessResult(t *testing.T) {
	h := newHarness(t)
	h.stops.result = stopruntime.Result{
		Record: runtime.RuntimeRecord{
			GameID:             "game-1",
			Status:             runtime.StatusStopped,
			CurrentContainerID: "c-1",
			CurrentImageRef:    "galaxy/game:1.0.0",
			EngineEndpoint:     "http://galaxy-game-game-1:8080",
		},
		Outcome: operation.OutcomeSuccess,
	}

	h.consumer.HandleMessage(context.Background(), stopMessage("100-0", "game-1", "cancelled", 1700))

	inputs := h.stops.Inputs()
	require.Len(t, inputs, 1)
	assert.Equal(t, "game-1", inputs[0].GameID)
	assert.Equal(t, stopruntime.StopReasonCancelled, inputs[0].Reason)
	assert.Equal(t, operation.OpSourceLobbyStream, inputs[0].OpSource)
	assert.Equal(t, "100-0", inputs[0].SourceRef)

	published := h.results.Published()
	require.Len(t, published, 1)
	assert.Equal(t, ports.JobResult{
		GameID:         "game-1",
		Outcome:        ports.JobOutcomeSuccess,
		ContainerID:    "c-1",
		EngineEndpoint: "http://galaxy-game-game-1:8080",
	}, published[0])
}

func TestHandleMessageFailureNotFoundPublishesFailureResult(t *testing.T) {
	h := newHarness(t)
	h.stops.result = stopruntime.Result{
		Outcome:      operation.OutcomeFailure,
		ErrorCode:    startruntime.ErrorCodeNotFound,
		ErrorMessage: "runtime record for game \"game-2\" does not exist",
	}

	h.consumer.HandleMessage(context.Background(), stopMessage("101-0", "game-2", "admin_request", 1700))

	published := h.results.Published()
	require.Len(t, published, 1)
	assert.Equal(t, ports.JobResult{
		GameID:       "game-2",
		Outcome:      ports.JobOutcomeFailure,
		ErrorCode:    "not_found",
		ErrorMessage: "runtime record for game \"game-2\" does not exist",
	}, published[0])
}

func TestHandleMessageReplayNoOpForRemovedRecordHasEmptyContainerAndEndpoint(t *testing.T) {
	h := newHarness(t)
	h.stops.result = stopruntime.Result{
		Record: runtime.RuntimeRecord{
			GameID:             "game-3",
			Status:             runtime.StatusRemoved,
			CurrentContainerID: "",
			EngineEndpoint:     "http://galaxy-game-game-3:8080",
		},
		Outcome:   operation.OutcomeSuccess,
		ErrorCode: startruntime.ErrorCodeReplayNoOp,
	}

	h.consumer.HandleMessage(context.Background(), stopMessage("102-0", "game-3", "finished", 1700))

	published := h.results.Published()
	require.Len(t, published, 1)
	assert.Equal(t, ports.JobResult{
		GameID:         "game-3",
		Outcome:        ports.JobOutcomeSuccess,
		ContainerID:    "",
		EngineEndpoint: "http://galaxy-game-game-3:8080",
		ErrorCode:      "replay_no_op",
	}, published[0])
}

func TestHandleMessageMalformedEnvelopesAreAbsorbed(t *testing.T) {
	h := newHarness(t)

	cases := []redis.XMessage{
		{ID: "200-0", Values: map[string]any{"reason": "cancelled", "requested_at_ms": "1"}},
		{ID: "200-1", Values: map[string]any{"game_id": "game-x", "requested_at_ms": "1"}},
		{ID: "200-2", Values: map[string]any{"game_id": "game-x", "reason": " ", "requested_at_ms": "1"}},
		{ID: "200-3", Values: map[string]any{"game_id": "game-x", "reason": "not_a_known_reason", "requested_at_ms": "1"}},
		{ID: "200-4", Values: map[string]any{"game_id": "game-x", "reason": "cancelled", "requested_at_ms": "abc"}},
	}
	for _, msg := range cases {
		h.consumer.HandleMessage(context.Background(), msg)
	}

	assert.Empty(t, h.stops.Inputs(), "malformed envelopes must not reach the stop service")
	assert.Empty(t, h.results.Published(), "malformed envelopes must not produce job results")
}

func TestHandleMessagePublishFailureIsAbsorbed(t *testing.T) {
	h := newHarness(t)
	h.stops.result = stopruntime.Result{Outcome: operation.OutcomeFailure, ErrorCode: "internal_error"}
	h.results.publishErr = errors.New("redis transient")

	h.consumer.HandleMessage(context.Background(), stopMessage("300-0", "game-x", "cancelled", 1700))

	require.Len(t, h.stops.Inputs(), 1, "service still runs even when publish fails")
}

func TestHandleMessageGoLevelErrorIsAbsorbed(t *testing.T) {
	h := newHarness(t)
	h.stops.err = errors.New("nil ctx")

	h.consumer.HandleMessage(context.Background(), stopMessage("400-0", "game-y", "cancelled", 1700))

	assert.Empty(t, h.results.Published(), "go-level service errors must not surface as job results")
}

func TestRunAdvancesOffsetPerMessage(t *testing.T) {
	h := newHarness(t)
	h.stops.result = stopruntime.Result{
		Record: runtime.RuntimeRecord{
			GameID:             "game-5",
			Status:             runtime.StatusStopped,
			CurrentContainerID: "c-5",
			EngineEndpoint:     "http://galaxy-game-game-5:8080",
		},
		Outcome: operation.OutcomeSuccess,
	}

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	done := make(chan error, 1)
	go func() { done <- h.consumer.Run(ctx) }()

	mustXAdd(t, h.client, h.stream, "game-5", "cancelled", 1)
	mustXAdd(t, h.client, h.stream, "game-5", "finished", 2)

	require.Eventually(t, func() bool {
		return len(h.results.Published()) == 2
	}, time.Second, 10*time.Millisecond, "consumer must produce one job result per envelope")

	cancel()
	require.Eventually(t, func() bool {
		select {
		case <-done:
			return true
		default:
			return false
		}
	}, time.Second, 10*time.Millisecond, "Run must exit after context cancel")

	id, ok := h.offsets.Get("stopjobs")
	require.True(t, ok, "offset must be persisted after the run loop processed messages")
	assert.NotEmpty(t, id, "offset entry id must not be empty")
}

func TestRunExitsImmediatelyOnAlreadyCancelledContext(t *testing.T) {
	h := newHarness(t)
	ctx, cancel := context.WithCancel(context.Background())
	cancel()

	err := h.consumer.Run(ctx)
	require.ErrorIs(t, err, context.Canceled)
	assert.Empty(t, h.stops.Inputs())
	assert.Empty(t, h.results.Published())
}

func mustXAdd(t *testing.T, client *redis.Client, stream, gameID, reason string, requestedAtMS int64) string {
	t.Helper()
	id, err := client.XAdd(context.Background(), &redis.XAddArgs{
		Stream: stream,
		Values: map[string]any{
			"game_id":         gameID,
			"reason":          reason,
			"requested_at_ms": strconv.FormatInt(requestedAtMS, 10),
		},
	}).Result()
	require.NoError(t, err)
	return id
}