feat: runtime manager

2026-04-28 20:39:18 +02:00
parent e0a99b346b
commit a7cee15115
289 changed files with 45660 additions and 2207 deletions
@@ -0,0 +1,204 @@
+// Package containercleanup ships the periodic TTL-cleanup worker
+// described in `rtmanager/README.md §Lifecycles → Cleanup`.
+//
+// On every tick the worker lists `runtime_records.status='stopped'`
+// rows whose `last_op_at` is older than the configured retention
+// (`RTMANAGER_CONTAINER_RETENTION_DAYS`) and delegates removal to
+// `cleanupcontainer.Service.Handle` with `op_source=auto_ttl`. The
+// service owns the per-game lease, the Docker `Remove` call, the
+// status transition, the telemetry counter, and the operation_log
+// entry; this worker is intentionally tiny — a ticker plus a TTL
+// filter.
+//
+// Idempotent outcomes (`replay_no_op`, `conflict`) are absorbed; a
+// failure on one game does not abort the rest of the pass.
+//
+// Design rationale is captured in
+// `rtmanager/docs/workers.md`.
+package containercleanup
+
+import (
+	"context"
+	"errors"
+	"log/slog"
+	"time"
+
+	"galaxy/rtmanager/internal/domain/operation"
+	"galaxy/rtmanager/internal/domain/runtime"
+	"galaxy/rtmanager/internal/ports"
+	"galaxy/rtmanager/internal/service/cleanupcontainer"
+)
+
+// Cleaner abstracts the single operation this worker needs from the
+// cleanup service: handling one container-removal request. It is
+// declared on the consumer side so tests can plug in a fake instead
+// of the full service; the production implementation is
+// `*cleanupcontainer.Service`.
+type Cleaner interface {
+	// Handle processes the cleanup request described by input.
+	Handle(ctx context.Context, input cleanupcontainer.Input) (cleanupcontainer.Result, error)
+}
+
+// Dependencies bundles everything NewWorker needs to build a Worker.
+// RuntimeRecords, Cleanup, Retention and Interval are mandatory; Clock
+// and Logger fall back to defaults when left nil.
+type Dependencies struct {
+	// RuntimeRecords is the store queried for `status=stopped` records
+	// at the start of each pass.
+	RuntimeRecords ports.RuntimeRecordStore
+
+	// Cleanup is the service every expired record is handed to; it
+	// carries out the container removal under the per-game lease.
+	Cleanup Cleaner
+
+	// Retention is how long a stopped container is kept before it
+	// qualifies for removal. Must be positive. Mirrors
+	// `cfg.Container.Retention`.
+	Retention time.Duration
+
+	// Interval is the period between two cleanup passes. Must be
+	// positive. Mirrors `cfg.Cleanup.CleanupInterval`.
+	Interval time.Duration
+
+	// Clock returns the wall-clock time from which the TTL threshold
+	// is derived. `time.Now` is used when nil.
+	Clock func() time.Time
+
+	// Logger is the sink for the worker's structured events.
+	// `slog.Default()` is used when nil.
+	Logger *slog.Logger
+}
+
+// Worker runs the periodic TTL-cleanup loop. Build it with NewWorker;
+// the zero value has nil collaborators and is not usable.
+type Worker struct {
+	// runtimeRecords is the store listed for stopped records on each pass.
+	runtimeRecords ports.RuntimeRecordStore
+
+	// cleanup receives one Handle call per expired record.
+	cleanup Cleaner
+
+	// retention is the age past which a stopped record is expired.
+	retention time.Duration
+
+	// interval is the ticker period used by Run.
+	interval time.Duration
+
+	// clock yields the current time for the threshold computation.
+	clock func() time.Time
+
+	// logger is the worker-scoped structured logger.
+	logger *slog.Logger
+}
+
+// NewWorker validates deps and assembles a Worker from them.
+//
+// It returns an error when the runtime records store or the cleanup
+// service is nil, or when Retention or Interval is not strictly
+// positive. A nil Clock is replaced by `time.Now` and a nil Logger by
+// `slog.Default()`; the resulting logger is tagged with the worker
+// name.
+func NewWorker(deps Dependencies) (*Worker, error) {
+	if deps.RuntimeRecords == nil {
+		return nil, errors.New("new container cleanup worker: nil runtime records store")
+	}
+	if deps.Cleanup == nil {
+		return nil, errors.New("new container cleanup worker: nil cleanup service")
+	}
+	if deps.Retention <= 0 {
+		return nil, errors.New("new container cleanup worker: retention must be positive")
+	}
+	if deps.Interval <= 0 {
+		return nil, errors.New("new container cleanup worker: interval must be positive")
+	}
+
+	now := deps.Clock
+	if now == nil {
+		now = time.Now
+	}
+
+	base := deps.Logger
+	if base == nil {
+		base = slog.Default()
+	}
+
+	built := &Worker{
+		runtimeRecords: deps.RuntimeRecords,
+		cleanup:        deps.Cleanup,
+		retention:      deps.Retention,
+		interval:       deps.Interval,
+		clock:          now,
+		logger:         base.With("worker", "rtmanager.containercleanup"),
+	}
+	return built, nil
+}
+
+// Run executes one cleanup pass per ticker interval until ctx is
+// cancelled, then returns ctx.Err(). A nil receiver or nil ctx is
+// rejected with an error, and an already-cancelled ctx returns its
+// error before the loop starts. Errors inside a pass never stop the
+// loop.
+func (worker *Worker) Run(ctx context.Context) error {
+	switch {
+	case worker == nil:
+		return errors.New("run container cleanup worker: nil worker")
+	case ctx == nil:
+		return errors.New("run container cleanup worker: nil context")
+	}
+	if cancelled := ctx.Err(); cancelled != nil {
+		return cancelled
+	}
+
+	worker.logger.Info("container cleanup worker started",
+		"interval", worker.interval.String(),
+		"retention", worker.retention.String(),
+	)
+	defer worker.logger.Info("container cleanup worker stopped")
+
+	ticker := time.NewTicker(worker.interval)
+	defer ticker.Stop()
+
+	done := ctx.Done()
+	for {
+		select {
+		case <-ticker.C:
+			worker.tick(ctx)
+		case <-done:
+			return ctx.Err()
+		}
+	}
+}
+
+// Shutdown has nothing to release: Run stops by itself once its
+// context is cancelled. The only failure mode is a nil ctx.
+func (worker *Worker) Shutdown(ctx context.Context) error {
+	if ctx != nil {
+		return nil
+	}
+	return errors.New("shutdown container cleanup worker: nil context")
+}
+
+// Tick runs a single cleanup pass synchronously. It is the exported
+// entry point to the same pass Run triggers on every ticker fire, so
+// tests can exercise the worker without waiting on a real ticker.
+func (worker *Worker) Tick(ctx context.Context) {
+	worker.tick(ctx)
+}
+
+// tick performs one pass: it lists the stopped records, keeps those
+// whose LastOpAt lies strictly before now minus the retention, and
+// hands each of them to the cleanup service with the auto-TTL source.
+//
+// The pass stops early once ctx is cancelled. A listing error ends
+// the pass with a warning; an error or failure outcome for one game
+// is logged and the pass moves on to the next record.
+func (worker *Worker) tick(ctx context.Context) {
+	if ctx.Err() != nil {
+		return
+	}
+
+	stopped, listErr := worker.runtimeRecords.ListByStatus(ctx, runtime.StatusStopped)
+	if listErr != nil {
+		worker.logger.WarnContext(ctx, "list stopped records",
+			"err", listErr.Error(),
+		)
+		return
+	}
+
+	// Records last touched at or after the cutoff are still retained.
+	cutoff := worker.clock().Add(-worker.retention)
+	for i := range stopped {
+		if ctx.Err() != nil {
+			return
+		}
+		if !stopped[i].LastOpAt.Before(cutoff) {
+			continue
+		}
+
+		gameID := stopped[i].GameID
+		request := cleanupcontainer.Input{
+			GameID:   gameID,
+			OpSource: operation.OpSourceAutoTTL,
+		}
+		result, handleErr := worker.cleanup.Handle(ctx, request)
+
+		switch {
+		case handleErr != nil:
+			worker.logger.ErrorContext(ctx, "cleanup handle returned error",
+				"game_id", gameID,
+				"err", handleErr.Error(),
+			)
+		case result.Outcome == operation.OutcomeFailure:
+			worker.logger.InfoContext(ctx, "cleanup ttl pass: failure outcome",
+				"game_id", gameID,
+				"error_code", result.ErrorCode,
+				"error_message", result.ErrorMessage,
+			)
+		default:
+			worker.logger.InfoContext(ctx, "cleanup ttl removed container",
+				"game_id", gameID,
+				"error_code", result.ErrorCode,
+			)
+		}
+	}
+}
@@ -0,0 +1,296 @@
+package containercleanup_test
+
+import (
+	"context"
+	"errors"
+	"io"
+	"log/slog"
+	"sync"
+	"testing"
+	"time"
+
+	"galaxy/rtmanager/internal/domain/operation"
+	"galaxy/rtmanager/internal/domain/runtime"
+	"galaxy/rtmanager/internal/ports"
+	"galaxy/rtmanager/internal/service/cleanupcontainer"
+	"galaxy/rtmanager/internal/worker/containercleanup"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// silentLogger returns a logger whose text output goes to io.Discard,
+// keeping test output free of worker log lines.
+func silentLogger() *slog.Logger {
+	discard := slog.NewTextHandler(io.Discard, nil)
+	return slog.New(discard)
+}
+
+// fakeRuntimeRecords is an in-memory store double. Only ListByStatus
+// has real behaviour; the remaining store methods are inert stubs.
+type fakeRuntimeRecords struct {
+	// mu guards stopped and listErr inside Set and ListByStatus.
+	mu sync.Mutex
+
+	// stopped is the canned result served for the stopped status.
+	stopped []runtime.RuntimeRecord
+
+	// listErr, when non-nil, is returned by ListByStatus instead of data.
+	listErr error
+}
+
+// newFakeRuntimeRecords returns an empty fake store.
+func newFakeRuntimeRecords() *fakeRuntimeRecords {
+	return new(fakeRuntimeRecords)
+}
+
+func (s *fakeRuntimeRecords) Set(records ...runtime.RuntimeRecord) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.stopped = append([]runtime.RuntimeRecord(nil), records...)
+}
+
+func (s *fakeRuntimeRecords) Get(_ context.Context, _ string) (runtime.RuntimeRecord, error) {
+	return runtime.RuntimeRecord{}, runtime.ErrNotFound
+}
+// Upsert is a stub: it stores nothing and always succeeds.
+func (s *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error {
+	return nil
+}
+// UpdateStatus is a stub: it changes nothing and always succeeds.
+func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error {
+	return nil
+}
+// List is a stub: it always returns no records and no error.
+func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
+	return nil, nil
+}
+
+// ListByStatus returns listErr when one is configured. Otherwise it
+// returns a copy of the canned records for runtime.StatusStopped and
+// nothing for any other status.
+func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, status runtime.Status) ([]runtime.RuntimeRecord, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	switch {
+	case s.listErr != nil:
+		return nil, s.listErr
+	case status != runtime.StatusStopped:
+		return nil, nil
+	}
+
+	// Hand out a copy so callers cannot alter the canned slice.
+	snapshot := append(make([]runtime.RuntimeRecord, 0, len(s.stopped)), s.stopped...)
+	return snapshot, nil
+}
+
+// fakeCleaner is a scripted Cleaner double: it records every Handle
+// call and answers from queued errors, then queued results, then the
+// configured defaults.
+type fakeCleaner struct {
+	// mu guards all fields below during Handle and Calls.
+	mu sync.Mutex
+
+	// calls holds the inputs passed to Handle, in call order.
+	// responses and errs are FIFO queues of canned answers; Handle
+	// drains errs before it looks at responses.
+	calls     []cleanupcontainer.Input
+	responses []cleanupcontainer.Result
+	errs      []error
+
+	// defaultResult and defaultErr are the fallback answer once both
+	// queues are empty; a non-nil defaultErr takes precedence.
+	defaultResult cleanupcontainer.Result
+	defaultErr    error
+}
+
+// Handle records input and then answers from the first available
+// source, in this order: the next queued error, the next queued
+// result, the default error, the default result. Queued items are
+// consumed as they are returned.
+func (c *fakeCleaner) Handle(_ context.Context, input cleanupcontainer.Input) (cleanupcontainer.Result, error) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	c.calls = append(c.calls, input)
+
+	switch {
+	case len(c.errs) > 0:
+		next := c.errs[0]
+		c.errs = c.errs[1:]
+		return cleanupcontainer.Result{}, next
+	case len(c.responses) > 0:
+		next := c.responses[0]
+		c.responses = c.responses[1:]
+		return next, nil
+	case c.defaultErr != nil:
+		return cleanupcontainer.Result{}, c.defaultErr
+	default:
+		return c.defaultResult, nil
+	}
+}
+
+// Calls returns a copy of the inputs Handle has received so far, in
+// call order.
+func (c *fakeCleaner) Calls() []cleanupcontainer.Input {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	seen := make([]cleanupcontainer.Input, 0, len(c.calls))
+	return append(seen, c.calls...)
+}
+
+// --- harness ----------------------------------------------------------
+
+// harness groups the fakes and the frozen clock value shared by the
+// worker tests.
+type harness struct {
+	// records is the fake store handed to the worker.
+	records *fakeRuntimeRecords
+
+	// cleaner is the fake cleanup service handed to the worker.
+	cleaner *fakeCleaner
+
+	// now is the instant the worker's injected clock reports.
+	now time.Time
+}
+
+// newHarness returns a harness with an empty store, a cleaner that
+// succeeds by default, and the clock pinned to 2026-04-28 12:00 UTC.
+func newHarness() *harness {
+	succeed := cleanupcontainer.Result{Outcome: operation.OutcomeSuccess}
+	frozen := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
+
+	return &harness{
+		records: newFakeRuntimeRecords(),
+		cleaner: &fakeCleaner{defaultResult: succeed},
+		now:     frozen,
+	}
+}
+
+// build creates a Worker wired to the harness fakes with the given
+// retention, a 50ms interval, the harness clock and a silent logger.
+// The test fails immediately if construction returns an error.
+func (h *harness) build(t *testing.T, retention time.Duration) *containercleanup.Worker {
+	t.Helper()
+
+	deps := containercleanup.Dependencies{
+		RuntimeRecords: h.records,
+		Cleanup:        h.cleaner,
+		Retention:      retention,
+		Interval:       50 * time.Millisecond,
+		Clock:          func() time.Time { return h.now },
+		Logger:         silentLogger(),
+	}
+
+	built, err := containercleanup.NewWorker(deps)
+	require.NoError(t, err)
+	return built
+}
+
+// stoppedRecord returns a stopped record for gameID whose LastOpAt
+// and StoppedAt both equal lastOpAt and whose CreatedAt is one hour
+// earlier. The remaining fields are fixed values derived from gameID.
+func stoppedRecord(gameID string, lastOpAt time.Time) runtime.RuntimeRecord {
+	record := runtime.RuntimeRecord{
+		GameID:             gameID,
+		Status:             runtime.StatusStopped,
+		CurrentContainerID: "ctr-" + gameID,
+		CurrentImageRef:    "galaxy/game:1.0.0",
+		EngineEndpoint:     "http://galaxy-game-" + gameID + ":8080",
+		StatePath:          "/var/lib/galaxy/games/" + gameID,
+		DockerNetwork:      "galaxy-net",
+		LastOpAt:           lastOpAt,
+		CreatedAt:          lastOpAt.Add(-time.Hour),
+	}
+
+	// StoppedAt is a pointer field, so point it at a local copy.
+	stoppedAt := lastOpAt
+	record.StoppedAt = &stoppedAt
+	return record
+}
+
+// --- constructor ------------------------------------------------------
+
+// TestNewWorkerRejectsMissingDeps checks that NewWorker fails for
+// each incomplete dependency set and succeeds once the four mandatory
+// fields are present.
+func TestNewWorkerRejectsMissingDeps(t *testing.T) {
+	store := newFakeRuntimeRecords()
+	fake := &fakeCleaner{defaultResult: cleanupcontainer.Result{Outcome: operation.OutcomeSuccess}}
+
+	// Each entry adds one more mandatory field and is still incomplete.
+	incomplete := []containercleanup.Dependencies{
+		{},
+		{RuntimeRecords: store},
+		{RuntimeRecords: store, Cleanup: fake},
+		{RuntimeRecords: store, Cleanup: fake, Retention: time.Hour},
+	}
+	for i := range incomplete {
+		_, err := containercleanup.NewWorker(incomplete[i])
+		require.Errorf(t, err, "case %d should fail", i)
+	}
+
+	complete := containercleanup.Dependencies{
+		RuntimeRecords: store,
+		Cleanup:        fake,
+		Retention:      time.Hour,
+		Interval:       time.Minute,
+	}
+	_, err := containercleanup.NewWorker(complete)
+	require.NoError(t, err)
+}
+
+// --- TTL math ---------------------------------------------------------
+
+// TestTickCallsHandleForExpiredRecordsOnly checks that a pass hands
+// only the record older than the retention to the cleaner, tagged
+// with the auto-TTL source and no source reference.
+func TestTickCallsHandleForExpiredRecordsOnly(t *testing.T) {
+	h := newHarness()
+	w := h.build(t, 24*time.Hour)
+
+	// 30h old is past the 24h retention; 1h old is well within it.
+	h.records.Set(
+		stoppedRecord("game-old", h.now.Add(-30*time.Hour)),
+		stoppedRecord("game-new", h.now.Add(-time.Hour)),
+	)
+
+	w.Tick(context.Background())
+
+	got := h.cleaner.Calls()
+	require.Len(t, got, 1, "only the expired record should be passed to cleanup")
+
+	only := got[0]
+	assert.Equal(t, "game-old", only.GameID)
+	assert.Equal(t, operation.OpSourceAutoTTL, only.OpSource)
+	assert.Empty(t, only.SourceRef)
+}
+
+func TestTickRespectsThresholdBoundaryExactly(t *testing.T) {
+	h := newHarness()
+	retention := 24 * time.Hour
+	w := h.build(t, retention)
+
+	// LastOpAt exactly equals the threshold; record.LastOpAt.Before(threshold)
+	// must be false → record stays.
+	exactly := stoppedRecord("game-edge", h.now.Add(-retention))
+	h.records.Set(exactly)
+
+	w.Tick(context.Background())
+	assert.Empty(t, h.cleaner.Calls(), "boundary record (LastOpAt == threshold) is not yet expired")
+}
+
+// --- error absorption -------------------------------------------------
+
+// TestTickAbsorbsListError checks that a failing store listing does
+// not panic the pass and that no cleanup call is made.
+func TestTickAbsorbsListError(t *testing.T) {
+	h := newHarness()
+	h.records.listErr = errors.New("pg down")
+	w := h.build(t, time.Hour)
+
+	pass := func() { w.Tick(context.Background()) }
+	require.NotPanics(t, pass)
+
+	assert.Empty(t, h.cleaner.Calls())
+}
+
+func TestTickAbsorbsHandleErrorAndContinues(t *testing.T) {
+	h := newHarness()
+	retention := time.Hour
+	w := h.build(t, retention)
+
+	a := stoppedRecord("game-a", h.now.Add(-2*retention))
+	b := stoppedRecord("game-b", h.now.Add(-2*retention))
+	h.records.Set(a, b)
+
+	h.cleaner.errs = []error{errors.New("docker hiccup")}
+
+	w.Tick(context.Background())
+
+	calls := h.cleaner.Calls()
+	require.Len(t, calls, 2, "second game must still be processed after first error")
+	assert.Equal(t, "game-a", calls[0].GameID)
+	assert.Equal(t, "game-b", calls[1].GameID)
+}
+
+func TestTickAbsorbsFailureOutcomeAndContinues(t *testing.T) {
+	h := newHarness()
+	retention := time.Hour
+	w := h.build(t, retention)
+
+	a := stoppedRecord("game-a", h.now.Add(-2*retention))
+	b := stoppedRecord("game-b", h.now.Add(-2*retention))
+	h.records.Set(a, b)
+
+	h.cleaner.responses = []cleanupcontainer.Result{
+		{Outcome: operation.OutcomeFailure, ErrorCode: "service_unavailable", ErrorMessage: "docker"},
+	}
+
+	w.Tick(context.Background())
+
+	calls := h.cleaner.Calls()
+	require.Len(t, calls, 2)
+}
+
+// --- Run lifecycle ----------------------------------------------------
+
+// TestRunRespectsContextCancel checks that Run returns
+// context.Canceled within one second of its context being cancelled.
+func TestRunRespectsContextCancel(t *testing.T) {
+	h := newHarness()
+	w := h.build(t, time.Hour)
+
+	ctx, cancel := context.WithCancel(context.Background())
+
+	// Buffered so the goroutine can finish even if the test timed out.
+	finished := make(chan error, 1)
+	go func() {
+		finished <- w.Run(ctx)
+	}()
+
+	cancel()
+
+	select {
+	case <-time.After(time.Second):
+		t.Fatalf("Run did not exit after cancel")
+	case runErr := <-finished:
+		assert.ErrorIs(t, runErr, context.Canceled)
+	}
+}
+
+// TestShutdownIsNoOp checks that Shutdown succeeds when given a
+// non-nil context.
+func TestShutdownIsNoOp(t *testing.T) {
+	w := newHarness().build(t, time.Hour)
+
+	err := w.Shutdown(context.Background())
+	require.NoError(t, err)
+}
+
+// --- compile-time safety ----------------------------------------------
+
+// Compile-time checks that the fakes satisfy the interfaces the
+// worker consumes; a missing method fails the build, not a test.
+var (
+	// The store double must implement the full record-store port.
+	_ ports.RuntimeRecordStore = (*fakeRuntimeRecords)(nil)
+	// The cleaner double must implement the worker's Cleaner surface.
+	_ containercleanup.Cleaner = (*fakeCleaner)(nil)
+)