feat: runtime manager

Ilia Denisov
2026-04-28 20:39:18 +02:00
committed by GitHub
parent e0a99b346b
commit a7cee15115
289 changed files with 45660 additions and 2207 deletions
@@ -0,0 +1,170 @@
// Package app wires the Runtime Manager process lifecycle and
// coordinates component startup and graceful shutdown.
package app
import (
"context"
"errors"
"fmt"
"sync"
"galaxy/rtmanager/internal/config"
)
// Component is a long-lived Runtime Manager subsystem that participates
// in coordinated startup and graceful shutdown.
type Component interface {
// Run starts the component and blocks until it stops.
Run(context.Context) error
// Shutdown stops the component within the provided timeout-bounded
// context.
Shutdown(context.Context) error
}
// App owns the process-level lifecycle of Runtime Manager and its
// registered components.
type App struct {
cfg config.Config
components []Component
}
// New constructs App with a defensive copy of the supplied components.
func New(cfg config.Config, components ...Component) *App {
clonedComponents := append([]Component(nil), components...)
return &App{
cfg: cfg,
components: clonedComponents,
}
}
// Run starts all configured components, waits for cancellation or the
// first component failure, and then executes best-effort graceful
// shutdown.
func (app *App) Run(ctx context.Context) error {
if ctx == nil {
return errors.New("run rtmanager app: nil context")
}
if err := app.validate(); err != nil {
return err
}
if len(app.components) == 0 {
<-ctx.Done()
return nil
}
runCtx, cancel := context.WithCancel(ctx)
defer cancel()
results := make(chan componentResult, len(app.components))
var runWaitGroup sync.WaitGroup
for index, component := range app.components {
runWaitGroup.Add(1)
go func(componentIndex int, component Component) {
defer runWaitGroup.Done()
results <- componentResult{
index: componentIndex,
err: component.Run(runCtx),
}
}(index, component)
}
var runErr error
select {
case <-ctx.Done():
case result := <-results:
runErr = classifyComponentResult(ctx, result)
}
cancel()
shutdownErr := app.shutdownComponents()
waitErr := app.waitForComponents(&runWaitGroup)
return errors.Join(runErr, shutdownErr, waitErr)
}
type componentResult struct {
index int
err error
}
func (app *App) validate() error {
if app.cfg.ShutdownTimeout <= 0 {
return fmt.Errorf("run rtmanager app: shutdown timeout must be positive, got %s", app.cfg.ShutdownTimeout)
}
for index, component := range app.components {
if component == nil {
return fmt.Errorf("run rtmanager app: component %d is nil", index)
}
}
return nil
}
func classifyComponentResult(parentCtx context.Context, result componentResult) error {
switch {
case result.err == nil:
if parentCtx.Err() != nil {
return nil
}
return fmt.Errorf("run rtmanager app: component %d exited without error before shutdown", result.index)
case errors.Is(result.err, context.Canceled) && parentCtx.Err() != nil:
return nil
default:
return fmt.Errorf("run rtmanager app: component %d: %w", result.index, result.err)
}
}
func (app *App) shutdownComponents() error {
var shutdownWaitGroup sync.WaitGroup
errs := make(chan error, len(app.components))
for index, component := range app.components {
shutdownWaitGroup.Add(1)
go func(componentIndex int, component Component) {
defer shutdownWaitGroup.Done()
shutdownCtx, cancel := context.WithTimeout(context.Background(), app.cfg.ShutdownTimeout)
defer cancel()
if err := component.Shutdown(shutdownCtx); err != nil {
errs <- fmt.Errorf("shutdown rtmanager component %d: %w", componentIndex, err)
}
}(index, component)
}
shutdownWaitGroup.Wait()
close(errs)
var joined error
for err := range errs {
joined = errors.Join(joined, err)
}
return joined
}
func (app *App) waitForComponents(runWaitGroup *sync.WaitGroup) error {
done := make(chan struct{})
go func() {
runWaitGroup.Wait()
close(done)
}()
waitCtx, cancel := context.WithTimeout(context.Background(), app.cfg.ShutdownTimeout)
defer cancel()
select {
case <-done:
return nil
case <-waitCtx.Done():
return fmt.Errorf("wait for rtmanager components: %w", waitCtx.Err())
}
}
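To make the Component contract concrete, here is an illustrative usage sketch that is not part of this commit. It assumes the package lives at galaxy/rtmanager/internal/app (consistent with the config import above) and drives App.Run with one component that simply blocks until cancellation, mirroring the cancel-from-a-goroutine pattern used in the tests below.

package app_test

import (
	"context"
	"fmt"
	"time"

	"galaxy/rtmanager/internal/app"
	"galaxy/rtmanager/internal/config"
)

// blockingComponent stands in for a real worker: it runs until its
// context is canceled and has nothing to release on shutdown.
type blockingComponent struct{}

func (blockingComponent) Run(ctx context.Context) error {
	<-ctx.Done()
	return ctx.Err()
}

func (blockingComponent) Shutdown(context.Context) error { return nil }

func ExampleApp_Run() {
	cfg := config.Config{ShutdownTimeout: time.Second}
	application := app.New(cfg, blockingComponent{})

	ctx, cancel := context.WithCancel(context.Background())
	go func() {
		// Simulate an external shutdown signal shortly after startup.
		// A component returning context.Canceled after the parent
		// context is done counts as a clean exit.
		time.Sleep(50 * time.Millisecond)
		cancel()
	}()

	fmt.Println(application.Run(ctx))
	// Output: <nil>
}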
@@ -0,0 +1,137 @@
package app
import (
"context"
"errors"
"sync/atomic"
"testing"
"time"
"galaxy/rtmanager/internal/config"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
type fakeComponent struct {
runErr error
shutdownErr error
runHook func(context.Context) error
shutdownHook func(context.Context) error
runCount atomic.Int32
downCount atomic.Int32
blockForCtx bool
}
func (component *fakeComponent) Run(ctx context.Context) error {
component.runCount.Add(1)
if component.runHook != nil {
return component.runHook(ctx)
}
if component.blockForCtx {
<-ctx.Done()
return ctx.Err()
}
return component.runErr
}
func (component *fakeComponent) Shutdown(ctx context.Context) error {
component.downCount.Add(1)
if component.shutdownHook != nil {
return component.shutdownHook(ctx)
}
return component.shutdownErr
}
func newCfg() config.Config {
return config.Config{ShutdownTimeout: time.Second}
}
func TestAppRunWithoutComponentsBlocksUntilContextDone(t *testing.T) {
t.Parallel()
app := New(newCfg())
ctx, cancel := context.WithCancel(context.Background())
cancel()
require.NoError(t, app.Run(ctx))
}
func TestAppRunReturnsOnContextCancel(t *testing.T) {
t.Parallel()
component := &fakeComponent{blockForCtx: true}
app := New(newCfg(), component)
ctx, cancel := context.WithCancel(context.Background())
go func() {
time.Sleep(10 * time.Millisecond)
cancel()
}()
require.NoError(t, app.Run(ctx))
assert.EqualValues(t, 1, component.runCount.Load())
assert.EqualValues(t, 1, component.downCount.Load())
}
func TestAppRunPropagatesComponentFailure(t *testing.T) {
t.Parallel()
failure := errors.New("boom")
component := &fakeComponent{runErr: failure}
app := New(newCfg(), component)
err := app.Run(context.Background())
require.Error(t, err)
require.ErrorIs(t, err, failure)
assert.EqualValues(t, 1, component.downCount.Load())
}
func TestAppRunFailsOnNilContext(t *testing.T) {
t.Parallel()
app := New(newCfg())
var ctx context.Context
require.Error(t, app.Run(ctx))
}
func TestAppRunFailsOnNonPositiveShutdownTimeout(t *testing.T) {
t.Parallel()
app := New(config.Config{}, &fakeComponent{})
require.Error(t, app.Run(context.Background()))
}
func TestAppRunFailsOnNilComponent(t *testing.T) {
t.Parallel()
app := New(newCfg(), nil)
require.Error(t, app.Run(context.Background()))
}
func TestAppRunFlagsCleanExitBeforeShutdown(t *testing.T) {
t.Parallel()
component := &fakeComponent{}
app := New(newCfg(), component)
err := app.Run(context.Background())
require.ErrorContains(t, err, "exited without error")
}
@@ -0,0 +1,85 @@
package app
import (
"context"
"errors"
"fmt"
"time"
"galaxy/redisconn"
"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/telemetry"
"github.com/docker/docker/client"
"github.com/redis/go-redis/v9"
)
// newRedisClient builds the master Redis client from cfg via the shared
// `pkg/redisconn` helper. Replica clients are not opened in this iteration
// per ARCHITECTURE.md §Persistence Backends; they will be wired when read
// routing is introduced.
func newRedisClient(cfg config.RedisConfig) *redis.Client {
return redisconn.NewMasterClient(cfg.Conn)
}
// instrumentRedisClient attaches the OpenTelemetry tracing and metrics
// instrumentation to client when telemetryRuntime is available. The
// actual instrumentation lives in `pkg/redisconn` so every Galaxy service
// shares one surface.
func instrumentRedisClient(redisClient *redis.Client, telemetryRuntime *telemetry.Runtime) error {
if redisClient == nil {
return errors.New("instrument redis client: nil client")
}
if telemetryRuntime == nil {
return nil
}
return redisconn.Instrument(redisClient,
redisconn.WithTracerProvider(telemetryRuntime.TracerProvider()),
redisconn.WithMeterProvider(telemetryRuntime.MeterProvider()),
)
}
// pingRedis performs a single Redis PING bounded by
// cfg.Conn.OperationTimeout to confirm that the configured Redis endpoint
// is reachable at startup.
func pingRedis(ctx context.Context, cfg config.RedisConfig, redisClient *redis.Client) error {
return redisconn.Ping(ctx, redisClient, cfg.Conn.OperationTimeout)
}
// newDockerClient constructs a Docker SDK client for cfg.Host with an
// optional API version override. The bootstrap layer opens and pings
// the client; the production Docker adapter wraps it for the service
// layer.
func newDockerClient(cfg config.DockerConfig) (*client.Client, error) {
options := []client.Opt{client.WithHost(cfg.Host)}
if cfg.APIVersion == "" {
options = append(options, client.WithAPIVersionNegotiation())
} else {
options = append(options, client.WithVersion(cfg.APIVersion))
}
docker, err := client.NewClientWithOpts(options...)
if err != nil {
return nil, fmt.Errorf("new docker client: %w", err)
}
return docker, nil
}
// pingDocker bounds one Docker daemon ping under timeout and returns a
// wrapped error so startup failures are easy to spot in service logs.
func pingDocker(ctx context.Context, dockerClient *client.Client, timeout time.Duration) error {
if dockerClient == nil {
return errors.New("ping docker: nil client")
}
if timeout <= 0 {
return errors.New("ping docker: timeout must be positive")
}
pingCtx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
if _, err := dockerClient.Ping(pingCtx); err != nil {
return fmt.Errorf("ping docker: %w", err)
}
return nil
}
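Taken together, these helpers imply a construct-then-ping bootstrap order. Below is a minimal package-internal sketch of that order, not part of this commit; the five-second timeout is illustrative.

// bootstrapDockerSketch shows the intended call order: build the SDK
// client (API version negotiated unless cfg.APIVersion pins one), then
// confirm the daemon answers before any adapter wraps the client.
func bootstrapDockerSketch(ctx context.Context, cfg config.DockerConfig) (*client.Client, error) {
	docker, err := newDockerClient(cfg)
	if err != nil {
		return nil, err
	}
	if err := pingDocker(ctx, docker, 5*time.Second); err != nil {
		_ = docker.Close() // release the transport if the daemon is unreachable
		return nil, err
	}
	return docker, nil
}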
@@ -0,0 +1,82 @@
package app
import (
"context"
"testing"
"time"
"galaxy/redisconn"
"galaxy/rtmanager/internal/config"
"github.com/alicebob/miniredis/v2"
"github.com/stretchr/testify/require"
)
func newTestRedisCfg(addr string) config.RedisConfig {
return config.RedisConfig{
Conn: redisconn.Config{
MasterAddr: addr,
Password: "test",
OperationTimeout: time.Second,
},
}
}
func TestPingRedisSucceedsAgainstMiniredis(t *testing.T) {
t.Parallel()
server := miniredis.RunT(t)
redisCfg := newTestRedisCfg(server.Addr())
client := newRedisClient(redisCfg)
t.Cleanup(func() { _ = client.Close() })
require.NoError(t, pingRedis(context.Background(), redisCfg, client))
}
func TestPingRedisReturnsErrorWhenClosed(t *testing.T) {
t.Parallel()
server := miniredis.RunT(t)
redisCfg := newTestRedisCfg(server.Addr())
client := newRedisClient(redisCfg)
require.NoError(t, client.Close())
require.Error(t, pingRedis(context.Background(), redisCfg, client))
}
func TestNewDockerClientHonoursHostOverride(t *testing.T) {
t.Parallel()
docker, err := newDockerClient(config.DockerConfig{
Host: "unix:///var/run/docker.sock",
APIVersion: "1.43",
Network: "galaxy-net",
LogDriver: "json-file",
PullPolicy: config.ImagePullPolicyIfMissing,
})
require.NoError(t, err)
require.NotNil(t, docker)
require.NoError(t, docker.Close())
}
func TestPingDockerRejectsNilClient(t *testing.T) {
t.Parallel()
require.Error(t, pingDocker(context.Background(), nil, time.Second))
}
func TestPingDockerRejectsNonPositiveTimeout(t *testing.T) {
t.Parallel()
docker, err := newDockerClient(config.DockerConfig{
Host: "unix:///var/run/docker.sock",
Network: "galaxy-net",
LogDriver: "json-file",
})
require.NoError(t, err)
t.Cleanup(func() { _ = docker.Close() })
require.Error(t, pingDocker(context.Background(), docker, 0))
}
@@ -0,0 +1,262 @@
package app
import (
"context"
"database/sql"
"errors"
"fmt"
"log/slog"
"time"
"galaxy/postgres"
"galaxy/redisconn"
"galaxy/rtmanager/internal/adapters/postgres/migrations"
"galaxy/rtmanager/internal/api/internalhttp"
"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/telemetry"
dockerclient "github.com/docker/docker/client"
"github.com/redis/go-redis/v9"
)
// Runtime owns the runnable Runtime Manager process plus the cleanup
// functions that release runtime resources after shutdown.
type Runtime struct {
cfg config.Config
app *App
wiring *wiring
internalServer *internalhttp.Server
cleanupFns []func() error
}
// NewRuntime constructs the runnable Runtime Manager process from cfg.
//
// PostgreSQL migrations apply strictly before the internal HTTP listener
// becomes ready. The runtime opens one shared `*redis.Client`, one
// `*sql.DB`, one Docker SDK client, and one OpenTelemetry runtime; all
// are released in reverse construction order on shutdown.
func NewRuntime(ctx context.Context, cfg config.Config, logger *slog.Logger) (*Runtime, error) {
if ctx == nil {
return nil, errors.New("new rtmanager runtime: nil context")
}
if err := cfg.Validate(); err != nil {
return nil, fmt.Errorf("new rtmanager runtime: %w", err)
}
if logger == nil {
logger = slog.Default()
}
runtime := &Runtime{
cfg: cfg,
}
cleanupOnError := func(err error) (*Runtime, error) {
if cleanupErr := runtime.Close(); cleanupErr != nil {
return nil, fmt.Errorf("%w; cleanup: %w", err, cleanupErr)
}
return nil, err
}
telemetryRuntime, err := telemetry.NewProcess(ctx, telemetry.ProcessConfig{
ServiceName: cfg.Telemetry.ServiceName,
TracesExporter: cfg.Telemetry.TracesExporter,
MetricsExporter: cfg.Telemetry.MetricsExporter,
TracesProtocol: cfg.Telemetry.TracesProtocol,
MetricsProtocol: cfg.Telemetry.MetricsProtocol,
StdoutTracesEnabled: cfg.Telemetry.StdoutTracesEnabled,
StdoutMetricsEnabled: cfg.Telemetry.StdoutMetricsEnabled,
}, logger)
if err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: telemetry: %w", err))
}
runtime.cleanupFns = append(runtime.cleanupFns, func() error {
shutdownCtx, cancel := context.WithTimeout(context.Background(), cfg.ShutdownTimeout)
defer cancel()
return telemetryRuntime.Shutdown(shutdownCtx)
})
redisClient := newRedisClient(cfg.Redis)
if err := instrumentRedisClient(redisClient, telemetryRuntime); err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: %w", err))
}
runtime.cleanupFns = append(runtime.cleanupFns, func() error {
err := redisClient.Close()
if errors.Is(err, redis.ErrClosed) {
return nil
}
return err
})
if err := pingRedis(ctx, cfg.Redis, redisClient); err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: %w", err))
}
pgPool, err := postgres.OpenPrimary(ctx, cfg.Postgres.Conn,
postgres.WithTracerProvider(telemetryRuntime.TracerProvider()),
postgres.WithMeterProvider(telemetryRuntime.MeterProvider()),
)
if err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: open postgres: %w", err))
}
runtime.cleanupFns = append(runtime.cleanupFns, pgPool.Close)
unregisterPGStats, err := postgres.InstrumentDBStats(pgPool,
postgres.WithMeterProvider(telemetryRuntime.MeterProvider()),
)
if err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: instrument postgres: %w", err))
}
runtime.cleanupFns = append(runtime.cleanupFns, func() error {
return unregisterPGStats()
})
if err := postgres.Ping(ctx, pgPool, cfg.Postgres.Conn.OperationTimeout); err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: ping postgres: %w", err))
}
if err := postgres.RunMigrations(ctx, pgPool, migrations.FS(), "."); err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: run postgres migrations: %w", err))
}
dockerClient, err := newDockerClient(cfg.Docker)
if err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: %w", err))
}
runtime.cleanupFns = append(runtime.cleanupFns, dockerClient.Close)
// NOTE: the Docker startup ping reuses the Postgres operation timeout.
if err := pingDocker(ctx, dockerClient, cfg.Postgres.Conn.OperationTimeout); err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: %w", err))
}
wiring, err := newWiring(cfg, redisClient, pgPool, dockerClient, time.Now, logger, telemetryRuntime)
if err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: wiring: %w", err))
}
runtime.wiring = wiring
runtime.cleanupFns = append(runtime.cleanupFns, wiring.close)
if err := wiring.registerTelemetryGauges(); err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: register telemetry gauges: %w", err))
}
if err := wiring.reconciler.ReconcileNow(ctx); err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: initial reconcile: %w", err))
}
probe := newReadinessProbe(pgPool, redisClient, dockerClient, cfg)
internalServer, err := internalhttp.NewServer(internalhttp.Config{
Addr: cfg.InternalHTTP.Addr,
ReadHeaderTimeout: cfg.InternalHTTP.ReadHeaderTimeout,
ReadTimeout: cfg.InternalHTTP.ReadTimeout,
WriteTimeout: cfg.InternalHTTP.WriteTimeout,
IdleTimeout: cfg.InternalHTTP.IdleTimeout,
}, internalhttp.Dependencies{
Logger: logger,
Telemetry: telemetryRuntime,
Readiness: probe,
RuntimeRecords: wiring.runtimeRecordStore,
StartRuntime: wiring.startRuntimeService,
StopRuntime: wiring.stopRuntimeService,
RestartRuntime: wiring.restartRuntimeService,
PatchRuntime: wiring.patchRuntimeService,
CleanupContainer: wiring.cleanupContainerService,
})
if err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: internal HTTP server: %w", err))
}
runtime.internalServer = internalServer
runtime.app = New(cfg,
internalServer,
wiring.startJobsConsumer,
wiring.stopJobsConsumer,
wiring.dockerEventsListener,
wiring.healthProbeWorker,
wiring.dockerInspectWorker,
wiring.reconciler,
wiring.containerCleanupWorker,
)
return runtime, nil
}
// InternalServer returns the internal HTTP server owned by runtime. It is
// primarily exposed for tests; production code should not depend on it.
func (runtime *Runtime) InternalServer() *internalhttp.Server {
if runtime == nil {
return nil
}
return runtime.internalServer
}
// Run starts every registered component (the internal HTTP server, the
// stream consumers, and the background workers) and blocks until ctx is
// canceled or one component fails.
func (runtime *Runtime) Run(ctx context.Context) error {
if ctx == nil {
return errors.New("run rtmanager runtime: nil context")
}
if runtime == nil {
return errors.New("run rtmanager runtime: nil runtime")
}
if runtime.app == nil {
return errors.New("run rtmanager runtime: nil app")
}
return runtime.app.Run(ctx)
}
// Close releases every runtime dependency in reverse construction order.
// Close is safe to call multiple times.
func (runtime *Runtime) Close() error {
if runtime == nil {
return nil
}
var joined error
for index := len(runtime.cleanupFns) - 1; index >= 0; index-- {
if err := runtime.cleanupFns[index](); err != nil {
joined = errors.Join(joined, err)
}
}
runtime.cleanupFns = nil
return joined
}
// readinessProbe pings every steady-state dependency the listener
// guards: the PostgreSQL primary, the Redis master, and the Docker
// daemon.
type readinessProbe struct {
pgPool *sql.DB
redisClient *redis.Client
dockerClient *dockerclient.Client
postgresTimeout time.Duration
redisTimeout time.Duration
dockerTimeout time.Duration
}
func newReadinessProbe(pgPool *sql.DB, redisClient *redis.Client, dockerClient *dockerclient.Client, cfg config.Config) *readinessProbe {
return &readinessProbe{
pgPool: pgPool,
redisClient: redisClient,
dockerClient: dockerClient,
postgresTimeout: cfg.Postgres.Conn.OperationTimeout,
redisTimeout: cfg.Redis.Conn.OperationTimeout,
dockerTimeout: cfg.Postgres.Conn.OperationTimeout, // Docker checks reuse the Postgres operation timeout
}
}
// Check pings PostgreSQL, Redis, and Docker. The first failing
// dependency aborts the check so callers see a single, actionable
// error.
func (probe *readinessProbe) Check(ctx context.Context) error {
if err := postgres.Ping(ctx, probe.pgPool, probe.postgresTimeout); err != nil {
return err
}
if err := redisconn.Ping(ctx, probe.redisClient, probe.redisTimeout); err != nil {
return err
}
return pingDocker(ctx, probe.dockerClient, probe.dockerTimeout)
}
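For orientation, here is a hedged sketch of how a cmd entrypoint might drive Runtime. The config.Load helper and the signal handling are assumptions for illustration, not code from this commit; only NewRuntime, Run, and Close come from the file above.

package main

import (
	"context"
	"log/slog"
	"os"
	"os/signal"
	"syscall"

	"galaxy/rtmanager/internal/app"
	"galaxy/rtmanager/internal/config"
)

func main() {
	logger := slog.Default()

	// Stop on SIGINT/SIGTERM; both NewRuntime and Run honour the context.
	ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
	defer stop()

	if err := run(ctx, logger); err != nil {
		logger.Error("rtmanager exited with error", "error", err)
		os.Exit(1)
	}
}

func run(ctx context.Context, logger *slog.Logger) error {
	// config.Load is a placeholder for however the service reads its config.
	cfg, err := config.Load()
	if err != nil {
		return err
	}

	runtime, err := app.NewRuntime(ctx, cfg, logger)
	if err != nil {
		return err
	}
	// Close releases dependencies in reverse construction order and is
	// safe to call even if Run never started.
	defer func() { _ = runtime.Close() }()

	return runtime.Run(ctx)
}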
@@ -0,0 +1,541 @@
package app
import (
"context"
"database/sql"
"errors"
"fmt"
"log/slog"
"net/http"
"time"
"galaxy/rtmanager/internal/adapters/docker"
"galaxy/rtmanager/internal/adapters/healtheventspublisher"
"galaxy/rtmanager/internal/adapters/jobresultspublisher"
"galaxy/rtmanager/internal/adapters/lobbyclient"
"galaxy/rtmanager/internal/adapters/notificationpublisher"
"galaxy/rtmanager/internal/adapters/postgres/healthsnapshotstore"
"galaxy/rtmanager/internal/adapters/postgres/operationlogstore"
"galaxy/rtmanager/internal/adapters/postgres/runtimerecordstore"
"galaxy/rtmanager/internal/adapters/redisstate/gamelease"
"galaxy/rtmanager/internal/adapters/redisstate/streamoffsets"
"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/cleanupcontainer"
"galaxy/rtmanager/internal/service/patchruntime"
"galaxy/rtmanager/internal/service/restartruntime"
"galaxy/rtmanager/internal/service/startruntime"
"galaxy/rtmanager/internal/service/stopruntime"
"galaxy/rtmanager/internal/telemetry"
"galaxy/rtmanager/internal/worker/containercleanup"
"galaxy/rtmanager/internal/worker/dockerevents"
"galaxy/rtmanager/internal/worker/dockerinspect"
"galaxy/rtmanager/internal/worker/healthprobe"
"galaxy/rtmanager/internal/worker/reconcile"
"galaxy/rtmanager/internal/worker/startjobsconsumer"
"galaxy/rtmanager/internal/worker/stopjobsconsumer"
dockerclient "github.com/docker/docker/client"
"github.com/redis/go-redis/v9"
"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
)
// wiring owns the process-level singletons constructed once during
// `NewRuntime` and consumed by every worker and HTTP handler.
//
// The struct groups the persistence stores, external adapters, services,
// and workers as typed fields so the runtime bootstrap can hand them to
// the internal HTTP server and the app component list.
type wiring struct {
cfg config.Config
redisClient *redis.Client
pgPool *sql.DB
dockerClient *dockerclient.Client
clock func() time.Time
logger *slog.Logger
telemetry *telemetry.Runtime
// Persistence stores.
runtimeRecordStore *runtimerecordstore.Store
operationLogStore *operationlogstore.Store
healthSnapshotStore *healthsnapshotstore.Store
streamOffsetStore *streamoffsets.Store
gameLeaseStore *gamelease.Store
// External adapters.
dockerAdapter *docker.Client
lobbyClient *lobbyclient.Client
notificationPublisher *notificationpublisher.Publisher
healthEventsPublisher *healtheventspublisher.Publisher
jobResultsPublisher *jobresultspublisher.Publisher
// Service layer.
startRuntimeService *startruntime.Service
stopRuntimeService *stopruntime.Service
restartRuntimeService *restartruntime.Service
patchRuntimeService *patchruntime.Service
cleanupContainerService *cleanupcontainer.Service
// Worker layer.
startJobsConsumer *startjobsconsumer.Consumer
stopJobsConsumer *stopjobsconsumer.Consumer
dockerEventsListener *dockerevents.Listener
healthProbeWorker *healthprobe.Worker
dockerInspectWorker *dockerinspect.Worker
reconciler *reconcile.Reconciler
containerCleanupWorker *containercleanup.Worker
// closers releases adapter-level resources at runtime shutdown.
closers []func() error
}
// newWiring constructs the process-level dependency set, the persistence
// stores, the external adapters, and the service layer. It validates
// every required collaborator so callers can rely on them being non-nil.
func newWiring(
cfg config.Config,
redisClient *redis.Client,
pgPool *sql.DB,
dockerClient *dockerclient.Client,
clock func() time.Time,
logger *slog.Logger,
telemetryRuntime *telemetry.Runtime,
) (*wiring, error) {
if redisClient == nil {
return nil, errors.New("new rtmanager wiring: nil redis client")
}
if pgPool == nil {
return nil, errors.New("new rtmanager wiring: nil postgres pool")
}
if dockerClient == nil {
return nil, errors.New("new rtmanager wiring: nil docker client")
}
if clock == nil {
clock = time.Now
}
if logger == nil {
logger = slog.Default()
}
if telemetryRuntime == nil {
return nil, errors.New("new rtmanager wiring: nil telemetry runtime")
}
w := &wiring{
cfg: cfg,
redisClient: redisClient,
pgPool: pgPool,
dockerClient: dockerClient,
clock: clock,
logger: logger,
telemetry: telemetryRuntime,
}
if err := w.buildPersistence(); err != nil {
return nil, fmt.Errorf("new rtmanager wiring: %w", err)
}
if err := w.buildAdapters(); err != nil {
_ = w.close()
return nil, fmt.Errorf("new rtmanager wiring: %w", err)
}
if err := w.buildServices(); err != nil {
_ = w.close()
return nil, fmt.Errorf("new rtmanager wiring: %w", err)
}
if err := w.buildWorkers(); err != nil {
_ = w.close()
return nil, fmt.Errorf("new rtmanager wiring: %w", err)
}
return w, nil
}
func (w *wiring) buildPersistence() error {
runtimeStore, err := runtimerecordstore.New(runtimerecordstore.Config{
DB: w.pgPool,
OperationTimeout: w.cfg.Postgres.Conn.OperationTimeout,
})
if err != nil {
return fmt.Errorf("runtime record store: %w", err)
}
w.runtimeRecordStore = runtimeStore
operationStore, err := operationlogstore.New(operationlogstore.Config{
DB: w.pgPool,
OperationTimeout: w.cfg.Postgres.Conn.OperationTimeout,
})
if err != nil {
return fmt.Errorf("operation log store: %w", err)
}
w.operationLogStore = operationStore
snapshotStore, err := healthsnapshotstore.New(healthsnapshotstore.Config{
DB: w.pgPool,
OperationTimeout: w.cfg.Postgres.Conn.OperationTimeout,
})
if err != nil {
return fmt.Errorf("health snapshot store: %w", err)
}
w.healthSnapshotStore = snapshotStore
offsetStore, err := streamoffsets.New(streamoffsets.Config{Client: w.redisClient})
if err != nil {
return fmt.Errorf("stream offset store: %w", err)
}
w.streamOffsetStore = offsetStore
leaseStore, err := gamelease.New(gamelease.Config{Client: w.redisClient})
if err != nil {
return fmt.Errorf("game lease store: %w", err)
}
w.gameLeaseStore = leaseStore
return nil
}
func (w *wiring) buildAdapters() error {
dockerAdapter, err := docker.NewClient(docker.Config{
Docker: w.dockerClient,
LogDriver: w.cfg.Docker.LogDriver,
LogOpts: w.cfg.Docker.LogOpts,
Clock: w.clock,
})
if err != nil {
return fmt.Errorf("docker adapter: %w", err)
}
w.dockerAdapter = dockerAdapter
lobby, err := lobbyclient.NewClient(lobbyclient.Config{
BaseURL: w.cfg.Lobby.BaseURL,
RequestTimeout: w.cfg.Lobby.Timeout,
})
if err != nil {
return fmt.Errorf("lobby client: %w", err)
}
w.lobbyClient = lobby
w.closers = append(w.closers, lobby.Close)
notificationPub, err := notificationpublisher.NewPublisher(notificationpublisher.Config{
Client: w.redisClient,
Stream: w.cfg.Streams.NotificationIntents,
})
if err != nil {
return fmt.Errorf("notification publisher: %w", err)
}
w.notificationPublisher = notificationPub
healthPub, err := healtheventspublisher.NewPublisher(healtheventspublisher.Config{
Client: w.redisClient,
Snapshots: w.healthSnapshotStore,
Stream: w.cfg.Streams.HealthEvents,
})
if err != nil {
return fmt.Errorf("health events publisher: %w", err)
}
w.healthEventsPublisher = healthPub
jobResultsPub, err := jobresultspublisher.NewPublisher(jobresultspublisher.Config{
Client: w.redisClient,
Stream: w.cfg.Streams.JobResults,
})
if err != nil {
return fmt.Errorf("job results publisher: %w", err)
}
w.jobResultsPublisher = jobResultsPub
return nil
}
func (w *wiring) buildServices() error {
startService, err := startruntime.NewService(startruntime.Dependencies{
RuntimeRecords: w.runtimeRecordStore,
OperationLogs: w.operationLogStore,
Docker: w.dockerAdapter,
Leases: w.gameLeaseStore,
HealthEvents: w.healthEventsPublisher,
Notifications: w.notificationPublisher,
Lobby: w.lobbyClient,
Container: w.cfg.Container,
DockerCfg: w.cfg.Docker,
Coordination: w.cfg.Coordination,
Telemetry: w.telemetry,
Logger: w.logger,
Clock: w.clock,
})
if err != nil {
return fmt.Errorf("start runtime service: %w", err)
}
w.startRuntimeService = startService
stopService, err := stopruntime.NewService(stopruntime.Dependencies{
RuntimeRecords: w.runtimeRecordStore,
OperationLogs: w.operationLogStore,
Docker: w.dockerAdapter,
Leases: w.gameLeaseStore,
HealthEvents: w.healthEventsPublisher,
Container: w.cfg.Container,
Coordination: w.cfg.Coordination,
Telemetry: w.telemetry,
Logger: w.logger,
Clock: w.clock,
})
if err != nil {
return fmt.Errorf("stop runtime service: %w", err)
}
w.stopRuntimeService = stopService
restartService, err := restartruntime.NewService(restartruntime.Dependencies{
RuntimeRecords: w.runtimeRecordStore,
OperationLogs: w.operationLogStore,
Docker: w.dockerAdapter,
Leases: w.gameLeaseStore,
StopService: stopService,
StartService: startService,
Coordination: w.cfg.Coordination,
Telemetry: w.telemetry,
Logger: w.logger,
Clock: w.clock,
})
if err != nil {
return fmt.Errorf("restart runtime service: %w", err)
}
w.restartRuntimeService = restartService
patchService, err := patchruntime.NewService(patchruntime.Dependencies{
RuntimeRecords: w.runtimeRecordStore,
OperationLogs: w.operationLogStore,
Docker: w.dockerAdapter,
Leases: w.gameLeaseStore,
StopService: stopService,
StartService: startService,
Coordination: w.cfg.Coordination,
Telemetry: w.telemetry,
Logger: w.logger,
Clock: w.clock,
})
if err != nil {
return fmt.Errorf("patch runtime service: %w", err)
}
w.patchRuntimeService = patchService
cleanupService, err := cleanupcontainer.NewService(cleanupcontainer.Dependencies{
RuntimeRecords: w.runtimeRecordStore,
OperationLogs: w.operationLogStore,
Docker: w.dockerAdapter,
Leases: w.gameLeaseStore,
Coordination: w.cfg.Coordination,
Telemetry: w.telemetry,
Logger: w.logger,
Clock: w.clock,
})
if err != nil {
return fmt.Errorf("cleanup container service: %w", err)
}
w.cleanupContainerService = cleanupService
return nil
}
// buildWorkers constructs the asynchronous workers: the start/stop jobs
// stream consumers, the Docker events listener, the health probe and
// Docker inspect workers, the reconciler, and the container cleanup
// worker. All of them participate in the process lifecycle as
// `app.Component`s; `internal/app/runtime.go` passes them into `app.New`
// alongside the internal HTTP server.
func (w *wiring) buildWorkers() error {
startConsumer, err := startjobsconsumer.NewConsumer(startjobsconsumer.Config{
Client: w.redisClient,
Stream: w.cfg.Streams.StartJobs,
BlockTimeout: w.cfg.Streams.BlockTimeout,
StartService: w.startRuntimeService,
JobResults: w.jobResultsPublisher,
OffsetStore: w.streamOffsetStore,
Logger: w.logger,
})
if err != nil {
return fmt.Errorf("start jobs consumer: %w", err)
}
w.startJobsConsumer = startConsumer
stopConsumer, err := stopjobsconsumer.NewConsumer(stopjobsconsumer.Config{
Client: w.redisClient,
Stream: w.cfg.Streams.StopJobs,
BlockTimeout: w.cfg.Streams.BlockTimeout,
StopService: w.stopRuntimeService,
JobResults: w.jobResultsPublisher,
OffsetStore: w.streamOffsetStore,
Logger: w.logger,
})
if err != nil {
return fmt.Errorf("stop jobs consumer: %w", err)
}
w.stopJobsConsumer = stopConsumer
eventsListener, err := dockerevents.NewListener(dockerevents.Dependencies{
Docker: w.dockerAdapter,
RuntimeRecords: w.runtimeRecordStore,
HealthEvents: w.healthEventsPublisher,
Telemetry: w.telemetry,
Clock: w.clock,
Logger: w.logger,
})
if err != nil {
return fmt.Errorf("docker events listener: %w", err)
}
w.dockerEventsListener = eventsListener
probeHTTPClient, err := newProbeHTTPClient(w.telemetry)
if err != nil {
return fmt.Errorf("health probe http client: %w", err)
}
probeWorker, err := healthprobe.NewWorker(healthprobe.Dependencies{
RuntimeRecords: w.runtimeRecordStore,
HealthEvents: w.healthEventsPublisher,
HTTPClient: probeHTTPClient,
Telemetry: w.telemetry,
Interval: w.cfg.Health.ProbeInterval,
ProbeTimeout: w.cfg.Health.ProbeTimeout,
FailuresThreshold: w.cfg.Health.ProbeFailuresThreshold,
Clock: w.clock,
Logger: w.logger,
})
if err != nil {
return fmt.Errorf("health probe worker: %w", err)
}
w.healthProbeWorker = probeWorker
inspectWorker, err := dockerinspect.NewWorker(dockerinspect.Dependencies{
Docker: w.dockerAdapter,
RuntimeRecords: w.runtimeRecordStore,
HealthEvents: w.healthEventsPublisher,
Telemetry: w.telemetry,
Interval: w.cfg.Health.InspectInterval,
Clock: w.clock,
Logger: w.logger,
})
if err != nil {
return fmt.Errorf("docker inspect worker: %w", err)
}
w.dockerInspectWorker = inspectWorker
reconciler, err := reconcile.NewReconciler(reconcile.Dependencies{
Docker: w.dockerAdapter,
RuntimeRecords: w.runtimeRecordStore,
OperationLogs: w.operationLogStore,
HealthEvents: w.healthEventsPublisher,
Leases: w.gameLeaseStore,
Telemetry: w.telemetry,
DockerCfg: w.cfg.Docker,
ContainerCfg: w.cfg.Container,
Coordination: w.cfg.Coordination,
Interval: w.cfg.Cleanup.ReconcileInterval,
Clock: w.clock,
Logger: w.logger,
})
if err != nil {
return fmt.Errorf("reconciler: %w", err)
}
w.reconciler = reconciler
cleanupWorker, err := containercleanup.NewWorker(containercleanup.Dependencies{
RuntimeRecords: w.runtimeRecordStore,
Cleanup: w.cleanupContainerService,
Retention: w.cfg.Container.Retention,
Interval: w.cfg.Cleanup.CleanupInterval,
Clock: w.clock,
Logger: w.logger,
})
if err != nil {
return fmt.Errorf("container cleanup worker: %w", err)
}
w.containerCleanupWorker = cleanupWorker
return nil
}
// newProbeHTTPClient constructs the otelhttp-instrumented HTTP client
// the active health probe uses to call engine `/healthz`. It clones
// http.DefaultTransport so the instrumented copy stays isolated from
// the shared global transport (mirroring the lobby internal client).
func newProbeHTTPClient(telemetryRuntime *telemetry.Runtime) (*http.Client, error) {
transport, ok := http.DefaultTransport.(*http.Transport)
if !ok {
return nil, errors.New("default http transport is not *http.Transport")
}
cloned := transport.Clone()
instrumented := otelhttp.NewTransport(cloned,
otelhttp.WithTracerProvider(telemetryRuntime.TracerProvider()),
otelhttp.WithMeterProvider(telemetryRuntime.MeterProvider()),
)
return &http.Client{Transport: instrumented}, nil
}
// registerTelemetryGauges installs the runtime-records-by-status gauge
// callback so the telemetry runtime can observe the persistent store
// without holding a strong reference to the wiring.
func (w *wiring) registerTelemetryGauges() error {
probe := newRuntimeRecordsProbe(w.runtimeRecordStore)
return w.telemetry.RegisterGauges(telemetry.GaugeDependencies{
RuntimeRecordsByStatus: probe,
Logger: w.logger,
})
}
// close releases adapter-level resources owned by the wiring layer.
// Returns the joined error of every closer; the caller is expected to
// invoke this once during process shutdown.
func (w *wiring) close() error {
var joined error
for index := len(w.closers) - 1; index >= 0; index-- {
if err := w.closers[index](); err != nil {
joined = errors.Join(joined, err)
}
}
w.closers = nil
return joined
}
// runtimeRecordsProbe adapts runtimerecordstore.Store to
// telemetry.RuntimeRecordsByStatusProbe by translating the typed status
// keys into the string keys the gauge expects.
type runtimeRecordsProbe struct {
store *runtimerecordstore.Store
}
func newRuntimeRecordsProbe(store *runtimerecordstore.Store) *runtimeRecordsProbe {
return &runtimeRecordsProbe{store: store}
}
func (p *runtimeRecordsProbe) CountByStatus(ctx context.Context) (map[string]int, error) {
if p == nil || p.store == nil {
return nil, errors.New("runtime records probe: nil store")
}
counts, err := p.store.CountByStatus(ctx)
if err != nil {
return nil, err
}
out := make(map[string]int, len(counts))
for status, count := range counts {
out[string(status)] = count
}
return out, nil
}
// Compile-time assertions that the constructed adapters satisfy the
// expected port surfaces; these prevent silent regressions when a
// port shape changes.
var (
_ ports.RuntimeRecordStore = (*runtimerecordstore.Store)(nil)
_ ports.OperationLogStore = (*operationlogstore.Store)(nil)
_ ports.HealthSnapshotStore = (*healthsnapshotstore.Store)(nil)
_ ports.StreamOffsetStore = (*streamoffsets.Store)(nil)
_ ports.GameLeaseStore = (*gamelease.Store)(nil)
_ ports.DockerClient = (*docker.Client)(nil)
_ ports.LobbyInternalClient = (*lobbyclient.Client)(nil)
_ ports.NotificationIntentPublisher = (*notificationpublisher.Publisher)(nil)
_ ports.HealthEventPublisher = (*healtheventspublisher.Publisher)(nil)
_ ports.JobResultPublisher = (*jobresultspublisher.Publisher)(nil)
_ Component = (*reconcile.Reconciler)(nil)
_ Component = (*containercleanup.Worker)(nil)
_ containercleanup.Cleaner = (*cleanupcontainer.Service)(nil)
)
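Both wiring.close and Runtime.Close follow the same convention: push a cleanup function right after acquiring each resource, then release in reverse and join every error so one failed cleanup never hides the rest. A self-contained sketch of that pattern with illustrative names:

package main

import (
	"errors"
	"fmt"
)

// closerStack releases resources in reverse acquisition order, joining
// every error it encounters along the way.
type closerStack struct {
	closers []func() error
}

func (s *closerStack) push(fn func() error) { s.closers = append(s.closers, fn) }

func (s *closerStack) close() error {
	var joined error
	for i := len(s.closers) - 1; i >= 0; i-- {
		if err := s.closers[i](); err != nil {
			joined = errors.Join(joined, err)
		}
	}
	s.closers = nil
	return joined
}

func main() {
	var stack closerStack
	stack.push(func() error { fmt.Println("close telemetry"); return nil })
	stack.push(func() error { fmt.Println("close redis"); return nil })
	stack.push(func() error { fmt.Println("close docker"); return nil })

	// Prints docker, redis, telemetry: last acquired, first released.
	if err := stack.close(); err != nil {
		fmt.Println("cleanup failed:", err)
	}
}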