feat: runtime manager
@@ -0,0 +1,204 @@
// Package containercleanup ships the periodic TTL-cleanup worker
// described in `rtmanager/README.md §Lifecycles → Cleanup`.
//
// On every tick the worker lists `runtime_records.status='stopped'`
// rows whose `last_op_at` is older than the configured retention
// (`RTMANAGER_CONTAINER_RETENTION_DAYS`) and delegates removal to
// `cleanupcontainer.Service.Handle` with `op_source=auto_ttl`. The
// service owns the per-game lease, the Docker `Remove` call, the
// status transition, the telemetry counter, and the operation_log
// entry; this worker is intentionally tiny: a ticker plus a TTL
// filter.
//
// Idempotent outcomes (`replay_no_op`, `conflict`) are absorbed; a
// failure on one game does not abort the rest of the pass.
//
// Design rationale is captured in `rtmanager/docs/workers.md`.
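//
// A minimal wiring sketch (illustrative only: runtimeStore, cleanupSvc and
// ctx are placeholders for the production ports.RuntimeRecordStore, the
// production *cleanupcontainer.Service and the process context, and the
// durations are example values that would normally come from
// cfg.Container.Retention and cfg.Cleanup.CleanupInterval):
//
//	worker, err := containercleanup.NewWorker(containercleanup.Dependencies{
//		RuntimeRecords: runtimeStore,
//		Cleanup:        cleanupSvc,
//		Retention:      7 * 24 * time.Hour,
//		Interval:       time.Hour,
//	})
//	if err != nil {
//		return err
//	}
//	go func() { _ = worker.Run(ctx) }()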
package containercleanup

import (
	"context"
	"errors"
	"log/slog"
	"time"

	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/cleanupcontainer"
)

// Cleaner is the narrow surface the worker uses to remove stopped
// containers. The production `*cleanupcontainer.Service` satisfies
// this interface verbatim; the package keeps the surface here so
// tests can substitute a fake without spinning the full service.
type Cleaner interface {
	Handle(ctx context.Context, input cleanupcontainer.Input) (cleanupcontainer.Result, error)
}

// Dependencies groups the collaborators required by Worker.
type Dependencies struct {
	// RuntimeRecords lists `status=stopped` records on every tick.
	RuntimeRecords ports.RuntimeRecordStore

	// Cleanup performs the actual container removal under the per-game
	// lease.
	Cleanup Cleaner

	// Retention is the TTL after which a stopped container becomes a
	// removal candidate. Mirrors `cfg.Container.Retention`.
	Retention time.Duration

	// Interval bounds the tick period. Mirrors
	// `cfg.Cleanup.CleanupInterval`.
	Interval time.Duration

	// Clock supplies the wall-clock used to compute the TTL threshold.
	// Defaults to `time.Now` when nil.
	Clock func() time.Time

	// Logger receives structured worker-level events. Defaults to
	// `slog.Default()` when nil.
	Logger *slog.Logger
}

// Worker drives the periodic TTL-cleanup loop.
type Worker struct {
	runtimeRecords ports.RuntimeRecordStore
	cleanup        Cleaner

	retention time.Duration
	interval  time.Duration

	clock  func() time.Time
	logger *slog.Logger
}

// NewWorker constructs one Worker from deps.
func NewWorker(deps Dependencies) (*Worker, error) {
	switch {
	case deps.RuntimeRecords == nil:
		return nil, errors.New("new container cleanup worker: nil runtime records store")
	case deps.Cleanup == nil:
		return nil, errors.New("new container cleanup worker: nil cleanup service")
	case deps.Retention <= 0:
		return nil, errors.New("new container cleanup worker: retention must be positive")
	case deps.Interval <= 0:
		return nil, errors.New("new container cleanup worker: interval must be positive")
	}

	clock := deps.Clock
	if clock == nil {
		clock = time.Now
	}
	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}

	return &Worker{
		runtimeRecords: deps.RuntimeRecords,
		cleanup:        deps.Cleanup,
		retention:      deps.Retention,
		interval:       deps.Interval,
		clock:          clock,
		logger:         logger.With("worker", "rtmanager.containercleanup"),
	}, nil
}

// Run drives the cleanup loop until ctx is cancelled. Per-tick errors
// are absorbed; the loop only exits on context cancellation.
func (worker *Worker) Run(ctx context.Context) error {
	if worker == nil {
		return errors.New("run container cleanup worker: nil worker")
	}
	if ctx == nil {
		return errors.New("run container cleanup worker: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}

	worker.logger.Info("container cleanup worker started",
		"interval", worker.interval.String(),
		"retention", worker.retention.String(),
	)
	defer worker.logger.Info("container cleanup worker stopped")
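
	// time.NewTicker does not fire immediately, so the first cleanup pass
	// happens one full interval after startup.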
	ticker := time.NewTicker(worker.interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			worker.tick(ctx)
		}
	}
}

// Shutdown is a no-op; Run terminates on context cancellation.
func (worker *Worker) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown container cleanup worker: nil context")
	}
	return nil
}

// Tick performs one cleanup pass. Exported so tests can drive the
// worker deterministically without spinning a real ticker.
func (worker *Worker) Tick(ctx context.Context) {
	worker.tick(ctx)
}

// tick lists stopped records and delegates removal of expired ones to
// the cleanup service.
func (worker *Worker) tick(ctx context.Context) {
	if err := ctx.Err(); err != nil {
		return
	}

	records, err := worker.runtimeRecords.ListByStatus(ctx, runtime.StatusStopped)
	if err != nil {
		worker.logger.WarnContext(ctx, "list stopped records",
			"err", err.Error(),
		)
		return
	}
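
	// A stopped record becomes a removal candidate only when its LastOpAt is
	// strictly before now minus the retention window.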
	threshold := worker.clock().Add(-worker.retention)
	for _, record := range records {
		if err := ctx.Err(); err != nil {
			return
		}
		if !record.LastOpAt.Before(threshold) {
			continue
		}

		result, err := worker.cleanup.Handle(ctx, cleanupcontainer.Input{
			GameID:   record.GameID,
			OpSource: operation.OpSourceAutoTTL,
		})
		if err != nil {
			worker.logger.ErrorContext(ctx, "cleanup handle returned error",
				"game_id", record.GameID,
				"err", err.Error(),
			)
			continue
		}
		if result.Outcome == operation.OutcomeFailure {
			worker.logger.InfoContext(ctx, "cleanup ttl pass: failure outcome",
				"game_id", record.GameID,
				"error_code", result.ErrorCode,
				"error_message", result.ErrorMessage,
			)
			continue
		}
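		// All non-failure outcomes land here, including the absorbed
		// idempotent results (replay_no_op, conflict) described in the
		// package comment; error_code carries their code in the log.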
		worker.logger.InfoContext(ctx, "cleanup ttl removed container",
			"game_id", record.GameID,
			"error_code", result.ErrorCode,
		)
	}
}
@@ -0,0 +1,296 @@
package containercleanup_test

import (
	"context"
	"errors"
	"io"
	"log/slog"
	"sync"
	"testing"
	"time"

	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/cleanupcontainer"
	"galaxy/rtmanager/internal/worker/containercleanup"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func silentLogger() *slog.Logger {
	return slog.New(slog.NewTextHandler(io.Discard, nil))
}

// fakeRuntimeRecords supports ListByStatus only.
type fakeRuntimeRecords struct {
	mu      sync.Mutex
	stopped []runtime.RuntimeRecord
	listErr error
}

func newFakeRuntimeRecords() *fakeRuntimeRecords { return &fakeRuntimeRecords{} }

func (s *fakeRuntimeRecords) Set(records ...runtime.RuntimeRecord) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.stopped = append([]runtime.RuntimeRecord(nil), records...)
}

func (s *fakeRuntimeRecords) Get(_ context.Context, _ string) (runtime.RuntimeRecord, error) {
	return runtime.RuntimeRecord{}, runtime.ErrNotFound
}
func (s *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error { return nil }
func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error {
	return nil
}
func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
	return nil, nil
}

func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, status runtime.Status) ([]runtime.RuntimeRecord, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.listErr != nil {
		return nil, s.listErr
	}
	if status != runtime.StatusStopped {
		return nil, nil
	}
	out := make([]runtime.RuntimeRecord, len(s.stopped))
	copy(out, s.stopped)
	return out, nil
}

// fakeCleaner records every Handle call and returns canned responses.
type fakeCleaner struct {
	mu sync.Mutex

	calls     []cleanupcontainer.Input
	responses []cleanupcontainer.Result
	errs      []error

	defaultResult cleanupcontainer.Result
	defaultErr    error
}

func (c *fakeCleaner) Handle(_ context.Context, input cleanupcontainer.Input) (cleanupcontainer.Result, error) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.calls = append(c.calls, input)
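	// Precedence: queued errors are consumed first, then queued results,
	// then the configured defaults.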
	if len(c.errs) > 0 {
		err := c.errs[0]
		c.errs = c.errs[1:]
		return cleanupcontainer.Result{}, err
	}
	if len(c.responses) > 0 {
		result := c.responses[0]
		c.responses = c.responses[1:]
		return result, nil
	}
	if c.defaultErr != nil {
		return cleanupcontainer.Result{}, c.defaultErr
	}
	return c.defaultResult, nil
}

func (c *fakeCleaner) Calls() []cleanupcontainer.Input {
	c.mu.Lock()
	defer c.mu.Unlock()
	out := make([]cleanupcontainer.Input, len(c.calls))
	copy(out, c.calls)
	return out
}

// --- harness ----------------------------------------------------------

type harness struct {
	records *fakeRuntimeRecords
	cleaner *fakeCleaner

	now time.Time
}

func newHarness() *harness {
	return &harness{
		records: newFakeRuntimeRecords(),
		cleaner: &fakeCleaner{
			defaultResult: cleanupcontainer.Result{Outcome: operation.OutcomeSuccess},
		},
		now: time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC),
	}
}

func (h *harness) build(t *testing.T, retention time.Duration) *containercleanup.Worker {
	t.Helper()
	worker, err := containercleanup.NewWorker(containercleanup.Dependencies{
		RuntimeRecords: h.records,
		Cleanup:        h.cleaner,
		Retention:      retention,
		Interval:       50 * time.Millisecond,
		Clock:          func() time.Time { return h.now },
		Logger:         silentLogger(),
	})
	require.NoError(t, err)
	return worker
}

// stoppedRecord builds a baseline record with the requested LastOpAt.
func stoppedRecord(gameID string, lastOpAt time.Time) runtime.RuntimeRecord {
	stoppedAt := lastOpAt
	return runtime.RuntimeRecord{
		GameID:             gameID,
		Status:             runtime.StatusStopped,
		CurrentContainerID: "ctr-" + gameID,
		CurrentImageRef:    "galaxy/game:1.0.0",
		EngineEndpoint:     "http://galaxy-game-" + gameID + ":8080",
		StatePath:          "/var/lib/galaxy/games/" + gameID,
		DockerNetwork:      "galaxy-net",
		LastOpAt:           lastOpAt,
		CreatedAt:          lastOpAt.Add(-time.Hour),
		StoppedAt:          &stoppedAt,
	}
}

// --- constructor ------------------------------------------------------

func TestNewWorkerRejectsMissingDeps(t *testing.T) {
	cleaner := &fakeCleaner{defaultResult: cleanupcontainer.Result{Outcome: operation.OutcomeSuccess}}
	records := newFakeRuntimeRecords()

	defectives := []containercleanup.Dependencies{
		{},
		{RuntimeRecords: records},
		{RuntimeRecords: records, Cleanup: cleaner},
		{RuntimeRecords: records, Cleanup: cleaner, Retention: time.Hour},
	}
	for index, deps := range defectives {
		_, err := containercleanup.NewWorker(deps)
		require.Errorf(t, err, "case %d should fail", index)
	}

	_, err := containercleanup.NewWorker(containercleanup.Dependencies{
		RuntimeRecords: records,
		Cleanup:        cleaner,
		Retention:      time.Hour,
		Interval:       time.Minute,
	})
	require.NoError(t, err)
}

// --- TTL math ---------------------------------------------------------

func TestTickCallsHandleForExpiredRecordsOnly(t *testing.T) {
	h := newHarness()
	retention := 24 * time.Hour
	w := h.build(t, retention)

	// One stopped record older than retention, one within retention.
	expired := stoppedRecord("game-old", h.now.Add(-30*time.Hour))
	fresh := stoppedRecord("game-new", h.now.Add(-time.Hour))
	h.records.Set(expired, fresh)

	w.Tick(context.Background())

	calls := h.cleaner.Calls()
	require.Len(t, calls, 1, "only the expired record should be passed to cleanup")
	assert.Equal(t, "game-old", calls[0].GameID)
	assert.Equal(t, operation.OpSourceAutoTTL, calls[0].OpSource)
	assert.Empty(t, calls[0].SourceRef)
}

func TestTickRespectsThresholdBoundaryExactly(t *testing.T) {
	h := newHarness()
	retention := 24 * time.Hour
	w := h.build(t, retention)

	// LastOpAt exactly equals the threshold; record.LastOpAt.Before(threshold)
	// must be false → record stays.
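	// Concretely, with the harness clock (2026-04-28T12:00:00Z) and a 24h
	// retention, the threshold is 2026-04-27T12:00:00Z; a LastOpAt of exactly
	// that instant is not Before(threshold), so the record survives this pass.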
	exactly := stoppedRecord("game-edge", h.now.Add(-retention))
	h.records.Set(exactly)

	w.Tick(context.Background())
	assert.Empty(t, h.cleaner.Calls(), "boundary record (LastOpAt == threshold) is not yet expired")
}

// --- error absorption -------------------------------------------------

func TestTickAbsorbsListError(t *testing.T) {
	h := newHarness()
	w := h.build(t, time.Hour)
	h.records.listErr = errors.New("pg down")

	require.NotPanics(t, func() { w.Tick(context.Background()) })
	assert.Empty(t, h.cleaner.Calls())
}

func TestTickAbsorbsHandleErrorAndContinues(t *testing.T) {
	h := newHarness()
	retention := time.Hour
	w := h.build(t, retention)

	a := stoppedRecord("game-a", h.now.Add(-2*retention))
	b := stoppedRecord("game-b", h.now.Add(-2*retention))
	h.records.Set(a, b)

	h.cleaner.errs = []error{errors.New("docker hiccup")}

	w.Tick(context.Background())

	calls := h.cleaner.Calls()
	require.Len(t, calls, 2, "second game must still be processed after first error")
	assert.Equal(t, "game-a", calls[0].GameID)
	assert.Equal(t, "game-b", calls[1].GameID)
}

func TestTickAbsorbsFailureOutcomeAndContinues(t *testing.T) {
	h := newHarness()
	retention := time.Hour
	w := h.build(t, retention)

	a := stoppedRecord("game-a", h.now.Add(-2*retention))
	b := stoppedRecord("game-b", h.now.Add(-2*retention))
	h.records.Set(a, b)

	h.cleaner.responses = []cleanupcontainer.Result{
		{Outcome: operation.OutcomeFailure, ErrorCode: "service_unavailable", ErrorMessage: "docker"},
	}

	w.Tick(context.Background())

	calls := h.cleaner.Calls()
	require.Len(t, calls, 2)
}

// --- Run lifecycle ----------------------------------------------------

func TestRunRespectsContextCancel(t *testing.T) {
	h := newHarness()
	w := h.build(t, time.Hour)

	ctx, cancel := context.WithCancel(context.Background())
	done := make(chan error, 1)
	go func() { done <- w.Run(ctx) }()

	cancel()
	select {
	case err := <-done:
		assert.ErrorIs(t, err, context.Canceled)
	case <-time.After(time.Second):
		t.Fatalf("Run did not exit after cancel")
	}
}

func TestShutdownIsNoOp(t *testing.T) {
	h := newHarness()
	w := h.build(t, time.Hour)
	require.NoError(t, w.Shutdown(context.Background()))
}

// --- compile-time safety ----------------------------------------------

var (
	_ ports.RuntimeRecordStore = (*fakeRuntimeRecords)(nil)
	_ containercleanup.Cleaner = (*fakeCleaner)(nil)
)