feat: runtime manager

This commit is contained in:
Ilia Denisov
2026-04-28 20:39:18 +02:00
committed by GitHub
parent e0a99b346b
commit a7cee15115
289 changed files with 45660 additions and 2207 deletions
@@ -0,0 +1,204 @@
// Package containercleanup ships the periodic TTL-cleanup worker
// described in `rtmanager/README.md §Lifecycles → Cleanup`.
//
// On every tick the worker lists `runtime_records.status='stopped'`
// rows whose `last_op_at` is older than the configured retention
// (`RTMANAGER_CONTAINER_RETENTION_DAYS`) and delegates removal to
// `cleanupcontainer.Service.Handle` with `op_source=auto_ttl`. The
// service owns the per-game lease, the Docker `Remove` call, the
// status transition, the telemetry counter, and the operation_log
// entry; this worker is intentionally tiny — a ticker plus a TTL
// filter.
//
// Idempotent outcomes (`replay_no_op`, `conflict`) are absorbed; a
// failure on one game does not abort the rest of the pass.
//
// Design rationale is captured in
// `rtmanager/docs/workers.md`.
package containercleanup
import (
"context"
"errors"
"log/slog"
"time"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/cleanupcontainer"
)
// Cleaner is the narrow surface the worker uses to remove stopped
// containers. The production `*cleanupcontainer.Service` satisfies
// this interface verbatim; the package keeps the surface here so
// tests can substitute a fake without spinning the full service.
type Cleaner interface {
	// Handle processes one cleanup request described by input. The
	// worker calls it once per expired record and inspects both the
	// returned error and the Outcome of the returned result.
	Handle(ctx context.Context, input cleanupcontainer.Input) (cleanupcontainer.Result, error)
}
// Dependencies groups the collaborators required by Worker. NewWorker
// validates the value: RuntimeRecords and Cleanup must be non-nil,
// Retention and Interval must be strictly positive, while Clock and
// Logger are optional and fall back to defaults.
type Dependencies struct {
	// RuntimeRecords lists `status=stopped` records on every tick.
	// Required.
	RuntimeRecords ports.RuntimeRecordStore
	// Cleanup performs the actual container removal under the per-game
	// lease. Required.
	Cleanup Cleaner
	// Retention is the TTL after which a stopped container becomes a
	// removal candidate. Mirrors `cfg.Container.Retention`. Must be
	// positive.
	Retention time.Duration
	// Interval bounds the tick period. Mirrors
	// `cfg.Cleanup.CleanupInterval`. Must be positive.
	Interval time.Duration
	// Clock supplies the wall-clock used to compute the TTL threshold.
	// Defaults to `time.Now` when nil.
	Clock func() time.Time
	// Logger receives structured worker-level events. Defaults to
	// `slog.Default()` when nil.
	Logger *slog.Logger
}
// Worker drives the periodic TTL-cleanup loop. Construct it with
// NewWorker, which validates the dependencies and fills in defaults.
type Worker struct {
	// runtimeRecords is queried for stopped records on every pass.
	runtimeRecords ports.RuntimeRecordStore
	// cleanup receives one Handle call per expired record.
	cleanup Cleaner
	// retention is subtracted from the current time to obtain the
	// expiry threshold.
	retention time.Duration
	// interval is the period of the ticker created by Run.
	interval time.Duration
	// clock returns the current time; never nil after NewWorker.
	clock func() time.Time
	// logger is the worker-scoped logger; never nil after NewWorker.
	logger *slog.Logger
}
// NewWorker validates deps and assembles a ready-to-run Worker.
//
// It returns an error when RuntimeRecords or Cleanup is nil, or when
// Retention or Interval is not strictly positive. A nil Clock falls
// back to `time.Now` and a nil Logger to `slog.Default()`; the logger
// kept by the worker is tagged with `worker=rtmanager.containercleanup`.
func NewWorker(deps Dependencies) (*Worker, error) {
	// Guards run in a fixed order so the first missing dependency is
	// the one reported.
	if deps.RuntimeRecords == nil {
		return nil, errors.New("new container cleanup worker: nil runtime records store")
	}
	if deps.Cleanup == nil {
		return nil, errors.New("new container cleanup worker: nil cleanup service")
	}
	if deps.Retention <= 0 {
		return nil, errors.New("new container cleanup worker: retention must be positive")
	}
	if deps.Interval <= 0 {
		return nil, errors.New("new container cleanup worker: interval must be positive")
	}

	built := &Worker{
		runtimeRecords: deps.RuntimeRecords,
		cleanup:        deps.Cleanup,
		retention:      deps.Retention,
		interval:       deps.Interval,
		clock:          deps.Clock,
	}
	if built.clock == nil {
		built.clock = time.Now
	}

	base := deps.Logger
	if base == nil {
		base = slog.Default()
	}
	built.logger = base.With("worker", "rtmanager.containercleanup")

	return built, nil
}
// Run blocks, executing one cleanup pass per ticker period, until ctx
// is cancelled; it then returns ctx.Err(). Errors inside a pass are
// absorbed by the pass itself and never terminate the loop.
//
// Run fails fast with an error when the receiver or ctx is nil, and
// returns ctx.Err() immediately when ctx is already done.
func (worker *Worker) Run(ctx context.Context) error {
	switch {
	case worker == nil:
		return errors.New("run container cleanup worker: nil worker")
	case ctx == nil:
		return errors.New("run container cleanup worker: nil context")
	}
	if cancelled := ctx.Err(); cancelled != nil {
		return cancelled
	}

	log := worker.logger
	log.Info("container cleanup worker started",
		"interval", worker.interval.String(),
		"retention", worker.retention.String(),
	)
	defer log.Info("container cleanup worker stopped")

	ticks := time.NewTicker(worker.interval)
	defer ticks.Stop()

	done := ctx.Done()
	for {
		select {
		case <-done:
			return ctx.Err()
		case <-ticks.C:
			worker.tick(ctx)
		}
	}
}
// Shutdown has nothing to release: Run stops by itself once its
// context is cancelled. The only failure mode is a nil ctx, which is
// rejected with an error.
func (worker *Worker) Shutdown(ctx context.Context) error {
	if ctx != nil {
		return nil
	}
	return errors.New("shutdown container cleanup worker: nil context")
}
// Tick performs one cleanup pass. Exported so tests can drive the
// worker deterministically without spinning a real ticker.
//
// Tick checks the same arguments as Run before starting the pass: a
// nil receiver makes the call a no-op, and a nil ctx is reported with
// a warning and the pass is skipped instead of panicking inside it.
func (worker *Worker) Tick(ctx context.Context) {
	if worker == nil {
		return
	}
	if ctx == nil {
		worker.logger.Warn("container cleanup tick skipped: nil context")
		return
	}
	worker.tick(ctx)
}
// tick performs a single TTL pass: it lists every stopped record and
// asks the cleanup service to remove the ones whose LastOpAt is
// strictly older than now minus retention. A record whose LastOpAt
// equals the threshold is kept.
//
// The pass never returns an error. A listing failure ends the pass
// with a warning; a Handle error or a failure outcome for one game is
// logged and the loop moves on to the next record. Cancellation of ctx
// stops the pass at the next check, and errors that surface after ctx
// has been cancelled are logged at debug level only, so an orderly
// shutdown does not produce warning or error entries.
func (worker *Worker) tick(ctx context.Context) {
	if ctx.Err() != nil {
		return
	}

	records, err := worker.runtimeRecords.ListByStatus(ctx, runtime.StatusStopped)
	if err != nil {
		if ctx.Err() != nil {
			// Cancelled while listing: this is shutdown, not a store failure.
			worker.logger.DebugContext(ctx, "cleanup ttl pass interrupted while listing",
				"err", err.Error(),
			)
			return
		}
		worker.logger.WarnContext(ctx, "list stopped records",
			"err", err.Error(),
		)
		return
	}

	// Computed once so every record in the pass is judged against the
	// same instant.
	threshold := worker.clock().Add(-worker.retention)

	for _, record := range records {
		if ctx.Err() != nil {
			return
		}
		if !record.LastOpAt.Before(threshold) {
			// Still inside the retention window.
			continue
		}

		result, err := worker.cleanup.Handle(ctx, cleanupcontainer.Input{
			GameID:   record.GameID,
			OpSource: operation.OpSourceAutoTTL,
		})
		if err != nil {
			if ctx.Err() != nil {
				// Cancelled mid-removal: stop quietly instead of
				// reporting the interruption as a cleanup error.
				worker.logger.DebugContext(ctx, "cleanup ttl pass interrupted",
					"game_id", record.GameID,
					"err", err.Error(),
				)
				return
			}
			worker.logger.ErrorContext(ctx, "cleanup handle returned error",
				"game_id", record.GameID,
				"err", err.Error(),
			)
			continue
		}

		if result.Outcome == operation.OutcomeFailure {
			worker.logger.InfoContext(ctx, "cleanup ttl pass: failure outcome",
				"game_id", record.GameID,
				"error_code", result.ErrorCode,
				"error_message", result.ErrorMessage,
			)
			continue
		}

		worker.logger.InfoContext(ctx, "cleanup ttl removed container",
			"game_id", record.GameID,
			"error_code", result.ErrorCode,
		)
	}
}
@@ -0,0 +1,296 @@
package containercleanup_test
import (
"context"
"errors"
"io"
"log/slog"
"sync"
"testing"
"time"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/cleanupcontainer"
"galaxy/rtmanager/internal/worker/containercleanup"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// silentLogger returns a logger whose text handler writes to
// io.Discard, keeping worker log output out of the test run.
func silentLogger() *slog.Logger {
	sink := slog.NewTextHandler(io.Discard, nil)
	return slog.New(sink)
}
// fakeRuntimeRecords is an in-memory stand-in for the runtime record
// store. Only ListByStatus has real behaviour; the remaining methods
// are stubs that exist so the type satisfies the store interface.
type fakeRuntimeRecords struct {
	// mu guards stopped and is held by ListByStatus while it reads
	// listErr.
	mu sync.Mutex
	// stopped is the slice returned for the stopped status.
	stopped []runtime.RuntimeRecord
	// listErr, when non-nil, is returned by ListByStatus instead of
	// any records.
	listErr error
}
// newFakeRuntimeRecords returns an empty fake store.
func newFakeRuntimeRecords() *fakeRuntimeRecords {
	return new(fakeRuntimeRecords)
}
// Set replaces the stored stopped records with a private copy of
// records, so later changes to the caller's slice are not observed.
func (s *fakeRuntimeRecords) Set(records ...runtime.RuntimeRecord) {
	snapshot := append([]runtime.RuntimeRecord(nil), records...)

	s.mu.Lock()
	s.stopped = snapshot
	s.mu.Unlock()
}
// Get is a stub: it always reports runtime.ErrNotFound.
func (s *fakeRuntimeRecords) Get(_ context.Context, _ string) (runtime.RuntimeRecord, error) {
	return runtime.RuntimeRecord{}, runtime.ErrNotFound
}

// Upsert is a stub: it discards the record and reports success.
func (s *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error { return nil }

// UpdateStatus is a stub: it ignores the input and reports success.
func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error {
	return nil
}

// List is a stub: it always returns no records and no error.
func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
	return nil, nil
}
// ListByStatus returns listErr when one is configured. Otherwise it
// returns a copy of the stored records for runtime.StatusStopped and
// nothing for any other status.
func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, status runtime.Status) ([]runtime.RuntimeRecord, error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	switch {
	case s.listErr != nil:
		return nil, s.listErr
	case status != runtime.StatusStopped:
		return nil, nil
	}

	// Hand out a copy so callers cannot alias the fake's own slice.
	snapshot := make([]runtime.RuntimeRecord, 0, len(s.stopped))
	snapshot = append(snapshot, s.stopped...)
	return snapshot, nil
}
// fakeCleaner records every Handle call and returns canned responses.
// Handle answers from errs first, then from responses, and finally
// from defaultErr / defaultResult once both queues are empty.
type fakeCleaner struct {
	// mu guards every field below.
	mu sync.Mutex
	// calls holds the inputs received so far, in call order.
	calls []cleanupcontainer.Input
	// responses is a queue of results, consumed one per call after
	// errs is exhausted.
	responses []cleanupcontainer.Result
	// errs is a queue of errors, consumed one per call before
	// responses is consulted.
	errs []error
	// defaultResult is returned when both queues are empty and
	// defaultErr is nil.
	defaultResult cleanupcontainer.Result
	// defaultErr, when non-nil, is returned once both queues are
	// empty.
	defaultErr error
}
// Handle records input and produces the next canned answer: the head
// of errs if any remain, otherwise the head of responses, otherwise
// defaultErr when set, and defaultResult as the last resort.
func (c *fakeCleaner) Handle(_ context.Context, input cleanupcontainer.Input) (cleanupcontainer.Result, error) {
	c.mu.Lock()
	defer c.mu.Unlock()

	c.calls = append(c.calls, input)

	switch {
	case len(c.errs) > 0:
		next := c.errs[0]
		c.errs = c.errs[1:]
		return cleanupcontainer.Result{}, next
	case len(c.responses) > 0:
		next := c.responses[0]
		c.responses = c.responses[1:]
		return next, nil
	case c.defaultErr != nil:
		return cleanupcontainer.Result{}, c.defaultErr
	default:
		return c.defaultResult, nil
	}
}
// Calls returns a copy of the inputs received so far, in call order.
func (c *fakeCleaner) Calls() []cleanupcontainer.Input {
	c.mu.Lock()
	defer c.mu.Unlock()

	snapshot := make([]cleanupcontainer.Input, 0, len(c.calls))
	return append(snapshot, c.calls...)
}
// --- harness ----------------------------------------------------------
// harness bundles the fakes and the frozen clock value shared by the
// tests in this file.
type harness struct {
	// records is the fake store handed to the worker.
	records *fakeRuntimeRecords
	// cleaner is the fake cleanup service handed to the worker.
	cleaner *fakeCleaner
	// now is the instant reported by the worker's clock.
	now time.Time
}
// newHarness returns a harness with an empty store, a cleaner that
// answers success by default, and the clock frozen at
// 2026-04-28 12:00:00 UTC.
func newHarness() *harness {
	success := cleanupcontainer.Result{Outcome: operation.OutcomeSuccess}
	frozen := time.Date(2026, time.April, 28, 12, 0, 0, 0, time.UTC)

	return &harness{
		records: newFakeRuntimeRecords(),
		cleaner: &fakeCleaner{defaultResult: success},
		now:     frozen,
	}
}
// build constructs a worker wired to the harness fakes, using the
// given retention, a 50ms tick interval, a clock that reports h.now,
// and a silent logger. It fails the test if construction fails.
func (h *harness) build(t *testing.T, retention time.Duration) *containercleanup.Worker {
	t.Helper()

	deps := containercleanup.Dependencies{
		RuntimeRecords: h.records,
		Cleanup:        h.cleaner,
		Retention:      retention,
		Interval:       50 * time.Millisecond,
		// Read h.now on every call so a test may move the clock.
		Clock:  func() time.Time { return h.now },
		Logger: silentLogger(),
	}

	built, err := containercleanup.NewWorker(deps)
	require.NoError(t, err)
	return built
}
// stoppedRecord returns a stopped-status record for gameID whose
// LastOpAt and StoppedAt both equal lastOpAt and whose CreatedAt is one
// hour earlier. The remaining fields are fixed values derived from
// gameID.
func stoppedRecord(gameID string, lastOpAt time.Time) runtime.RuntimeRecord {
	record := runtime.RuntimeRecord{
		GameID:             gameID,
		Status:             runtime.StatusStopped,
		CurrentContainerID: "ctr-" + gameID,
		CurrentImageRef:    "galaxy/game:1.0.0",
		EngineEndpoint:     "http://galaxy-game-" + gameID + ":8080",
		StatePath:          "/var/lib/galaxy/games/" + gameID,
		DockerNetwork:      "galaxy-net",
		LastOpAt:           lastOpAt,
		CreatedAt:          lastOpAt.Add(-time.Hour),
	}

	// Point StoppedAt at a private copy of the timestamp.
	stopped := lastOpAt
	record.StoppedAt = &stopped
	return record
}
// --- constructor ------------------------------------------------------
func TestNewWorkerRejectsMissingDeps(t *testing.T) {
cleaner := &fakeCleaner{defaultResult: cleanupcontainer.Result{Outcome: operation.OutcomeSuccess}}
records := newFakeRuntimeRecords()
defectives := []containercleanup.Dependencies{
{},
{RuntimeRecords: records},
{RuntimeRecords: records, Cleanup: cleaner},
{RuntimeRecords: records, Cleanup: cleaner, Retention: time.Hour},
}
for index, deps := range defectives {
_, err := containercleanup.NewWorker(deps)
require.Errorf(t, err, "case %d should fail", index)
}
_, err := containercleanup.NewWorker(containercleanup.Dependencies{
RuntimeRecords: records,
Cleanup: cleaner,
Retention: time.Hour,
Interval: time.Minute,
})
require.NoError(t, err)
}
// --- TTL math ---------------------------------------------------------
// TestTickCallsHandleForExpiredRecordsOnly checks that a pass forwards
// only the record older than the retention window, with the auto-TTL
// op source and an empty source ref.
func TestTickCallsHandleForExpiredRecordsOnly(t *testing.T) {
	const retention = 24 * time.Hour

	h := newHarness()
	worker := h.build(t, retention)

	// One record past the retention window, one well inside it.
	h.records.Set(
		stoppedRecord("game-old", h.now.Add(-30*time.Hour)),
		stoppedRecord("game-new", h.now.Add(-time.Hour)),
	)

	worker.Tick(context.Background())

	got := h.cleaner.Calls()
	require.Len(t, got, 1, "only the expired record should be passed to cleanup")

	only := got[0]
	assert.Equal(t, "game-old", only.GameID)
	assert.Equal(t, operation.OpSourceAutoTTL, only.OpSource)
	assert.Empty(t, only.SourceRef)
}
// TestTickRespectsThresholdBoundaryExactly checks that a record whose
// LastOpAt sits exactly on the threshold is not treated as expired.
func TestTickRespectsThresholdBoundaryExactly(t *testing.T) {
	const retention = 24 * time.Hour

	h := newHarness()
	worker := h.build(t, retention)

	// now - retention is the threshold itself; the worker compares
	// with Before, so this record must be left alone.
	h.records.Set(stoppedRecord("game-edge", h.now.Add(-retention)))

	worker.Tick(context.Background())

	got := h.cleaner.Calls()
	assert.Empty(t, got, "boundary record (LastOpAt == threshold) is not yet expired")
}
// --- error absorption -------------------------------------------------
// TestTickAbsorbsListError checks that a store listing error neither
// panics nor leads to any cleanup call.
func TestTickAbsorbsListError(t *testing.T) {
	h := newHarness()
	worker := h.build(t, time.Hour)
	h.records.listErr = errors.New("pg down")

	pass := func() { worker.Tick(context.Background()) }
	require.NotPanics(t, pass)

	assert.Empty(t, h.cleaner.Calls())
}
// TestTickAbsorbsHandleErrorAndContinues checks that an error returned
// for the first expired game does not stop the pass from reaching the
// second one.
func TestTickAbsorbsHandleErrorAndContinues(t *testing.T) {
	const retention = time.Hour

	h := newHarness()
	worker := h.build(t, retention)

	expiredAt := h.now.Add(-2 * retention)
	h.records.Set(
		stoppedRecord("game-a", expiredAt),
		stoppedRecord("game-b", expiredAt),
	)
	// Only the first Handle call fails; the next one gets the default.
	h.cleaner.errs = []error{errors.New("docker hiccup")}

	worker.Tick(context.Background())

	got := h.cleaner.Calls()
	require.Len(t, got, 2, "second game must still be processed after first error")
	assert.Equal(t, "game-a", got[0].GameID)
	assert.Equal(t, "game-b", got[1].GameID)
}
// TestTickAbsorbsFailureOutcomeAndContinues checks that a failure
// outcome reported for the first expired game does not stop the pass:
// both games must be handed to the cleaner, each exactly once and in
// listing order.
func TestTickAbsorbsFailureOutcomeAndContinues(t *testing.T) {
	h := newHarness()
	retention := time.Hour
	w := h.build(t, retention)

	a := stoppedRecord("game-a", h.now.Add(-2*retention))
	b := stoppedRecord("game-b", h.now.Add(-2*retention))
	h.records.Set(a, b)
	// Only the first Handle call reports a failure outcome; the second
	// falls back to the harness default.
	h.cleaner.responses = []cleanupcontainer.Result{
		{Outcome: operation.OutcomeFailure, ErrorCode: "service_unavailable", ErrorMessage: "docker"},
	}

	w.Tick(context.Background())

	calls := h.cleaner.Calls()
	require.Len(t, calls, 2, "second game must still be processed after a failure outcome")
	// A bare count would also pass if the same game were handled
	// twice, so pin the identity of each call.
	assert.Equal(t, "game-a", calls[0].GameID)
	assert.Equal(t, "game-b", calls[1].GameID)
}
// --- Run lifecycle ----------------------------------------------------
// TestRunRespectsContextCancel checks that Run leaves its ticker loop
// and returns context.Canceled once the context is cancelled.
//
// The test waits for at least one pass before cancelling. Without that
// wait the cancellation normally lands before Run reaches its loop, so
// only the early "context already done" return would be exercised.
func TestRunRespectsContextCancel(t *testing.T) {
	h := newHarness()
	retention := time.Hour
	w := h.build(t, retention)
	// An expired record makes the first pass observable through the
	// fake cleaner.
	h.records.Set(stoppedRecord("game-run", h.now.Add(-2*retention)))

	ctx, cancel := context.WithCancel(context.Background())
	// Also stops the goroutine if an assertion below aborts the test.
	defer cancel()

	done := make(chan error, 1)
	go func() { done <- w.Run(ctx) }()

	require.Eventually(t,
		func() bool { return len(h.cleaner.Calls()) > 0 },
		time.Second, 5*time.Millisecond,
		"Run should execute at least one pass before cancellation",
	)

	cancel()

	select {
	case err := <-done:
		assert.ErrorIs(t, err, context.Canceled)
	case <-time.After(time.Second):
		t.Fatalf("Run did not exit after cancel")
	}
}
// TestShutdownIsNoOp checks that Shutdown succeeds with a live context.
func TestShutdownIsNoOp(t *testing.T) {
	worker := newHarness().build(t, time.Hour)

	err := worker.Shutdown(context.Background())
	require.NoError(t, err)
}
// --- compile-time safety ----------------------------------------------
// Compile-time assertions: the build breaks if either fake stops
// satisfying the interface the worker depends on.
var (
	_ ports.RuntimeRecordStore = (*fakeRuntimeRecords)(nil)
	_ containercleanup.Cleaner = (*fakeCleaner)(nil)
)