feat: runtime manager

Ilia Denisov
2026-04-28 20:39:18 +02:00
committed by GitHub
parent e0a99b346b
commit a7cee15115
289 changed files with 45660 additions and 2207 deletions
@@ -0,0 +1,170 @@
// Package app wires the Runtime Manager process lifecycle and
// coordinates component startup and graceful shutdown.
package app
import (
"context"
"errors"
"fmt"
"sync"
"galaxy/rtmanager/internal/config"
)
// Component is a long-lived Runtime Manager subsystem that participates
// in coordinated startup and graceful shutdown.
type Component interface {
// Run starts the component and blocks until it stops.
Run(context.Context) error
// Shutdown stops the component within the provided timeout-bounded
// context.
Shutdown(context.Context) error
}
// App owns the process-level lifecycle of Runtime Manager and its
// registered components.
type App struct {
cfg config.Config
components []Component
}
// New constructs App with a defensive copy of the supplied components.
func New(cfg config.Config, components ...Component) *App {
clonedComponents := append([]Component(nil), components...)
return &App{
cfg: cfg,
components: clonedComponents,
}
}
// Run starts all configured components, waits for cancellation or the
// first component failure, and then executes best-effort graceful
// shutdown.
func (app *App) Run(ctx context.Context) error {
if ctx == nil {
return errors.New("run rtmanager app: nil context")
}
if err := app.validate(); err != nil {
return err
}
if len(app.components) == 0 {
<-ctx.Done()
return nil
}
runCtx, cancel := context.WithCancel(ctx)
defer cancel()
results := make(chan componentResult, len(app.components))
var runWaitGroup sync.WaitGroup
for index, component := range app.components {
runWaitGroup.Add(1)
go func(componentIndex int, component Component) {
defer runWaitGroup.Done()
results <- componentResult{
index: componentIndex,
err: component.Run(runCtx),
}
}(index, component)
}
var runErr error
select {
case <-ctx.Done():
case result := <-results:
runErr = classifyComponentResult(ctx, result)
}
cancel()
shutdownErr := app.shutdownComponents()
waitErr := app.waitForComponents(&runWaitGroup)
return errors.Join(runErr, shutdownErr, waitErr)
}
type componentResult struct {
index int
err error
}
func (app *App) validate() error {
if app.cfg.ShutdownTimeout <= 0 {
return fmt.Errorf("run rtmanager app: shutdown timeout must be positive, got %s", app.cfg.ShutdownTimeout)
}
for index, component := range app.components {
if component == nil {
return fmt.Errorf("run rtmanager app: component %d is nil", index)
}
}
return nil
}
func classifyComponentResult(parentCtx context.Context, result componentResult) error {
switch {
case result.err == nil:
if parentCtx.Err() != nil {
return nil
}
return fmt.Errorf("run rtmanager app: component %d exited without error before shutdown", result.index)
case errors.Is(result.err, context.Canceled) && parentCtx.Err() != nil:
return nil
default:
return fmt.Errorf("run rtmanager app: component %d: %w", result.index, result.err)
}
}
func (app *App) shutdownComponents() error {
var shutdownWaitGroup sync.WaitGroup
errs := make(chan error, len(app.components))
for index, component := range app.components {
shutdownWaitGroup.Add(1)
go func(componentIndex int, component Component) {
defer shutdownWaitGroup.Done()
shutdownCtx, cancel := context.WithTimeout(context.Background(), app.cfg.ShutdownTimeout)
defer cancel()
if err := component.Shutdown(shutdownCtx); err != nil {
errs <- fmt.Errorf("shutdown rtmanager component %d: %w", componentIndex, err)
}
}(index, component)
}
shutdownWaitGroup.Wait()
close(errs)
var joined error
for err := range errs {
joined = errors.Join(joined, err)
}
return joined
}
func (app *App) waitForComponents(runWaitGroup *sync.WaitGroup) error {
done := make(chan struct{})
go func() {
runWaitGroup.Wait()
close(done)
}()
waitCtx, cancel := context.WithTimeout(context.Background(), app.cfg.ShutdownTimeout)
defer cancel()
select {
case <-done:
return nil
case <-waitCtx.Done():
return fmt.Errorf("wait for rtmanager components: %w", waitCtx.Err())
}
}
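To make the Component contract concrete, here is an illustrative usage sketch that is not part of this commit. It assumes the package lives at galaxy/rtmanager/internal/app (consistent with the config import above) and drives App.Run with one component that simply blocks until cancellation, mirroring the cancel-from-a-goroutine pattern used in the tests below.

package app_test

import (
	"context"
	"fmt"
	"time"

	"galaxy/rtmanager/internal/app"
	"galaxy/rtmanager/internal/config"
)

// blockingComponent stands in for a real worker: it runs until its
// context is canceled and has nothing to release on shutdown.
type blockingComponent struct{}

func (blockingComponent) Run(ctx context.Context) error {
	<-ctx.Done()
	return ctx.Err()
}

func (blockingComponent) Shutdown(context.Context) error { return nil }

func ExampleApp_Run() {
	cfg := config.Config{ShutdownTimeout: time.Second}
	application := app.New(cfg, blockingComponent{})

	ctx, cancel := context.WithCancel(context.Background())
	go func() {
		// Simulate an external shutdown signal shortly after startup.
		// A component returning context.Canceled after the parent
		// context is done counts as a clean exit.
		time.Sleep(50 * time.Millisecond)
		cancel()
	}()

	fmt.Println(application.Run(ctx))
	// Output: <nil>
}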
@@ -0,0 +1,137 @@
package app
import (
"context"
"errors"
"sync/atomic"
"testing"
"time"
"galaxy/rtmanager/internal/config"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
type fakeComponent struct {
runErr error
shutdownErr error
runHook func(context.Context) error
shutdownHook func(context.Context) error
runCount atomic.Int32
downCount atomic.Int32
blockForCtx bool
}
func (component *fakeComponent) Run(ctx context.Context) error {
component.runCount.Add(1)
if component.runHook != nil {
return component.runHook(ctx)
}
if component.blockForCtx {
<-ctx.Done()
return ctx.Err()
}
return component.runErr
}
func (component *fakeComponent) Shutdown(ctx context.Context) error {
component.downCount.Add(1)
if component.shutdownHook != nil {
return component.shutdownHook(ctx)
}
return component.shutdownErr
}
func newCfg() config.Config {
return config.Config{ShutdownTimeout: time.Second}
}
func TestAppRunWithoutComponentsBlocksUntilContextDone(t *testing.T) {
t.Parallel()
app := New(newCfg())
ctx, cancel := context.WithCancel(context.Background())
cancel()
require.NoError(t, app.Run(ctx))
}
func TestAppRunReturnsOnContextCancel(t *testing.T) {
t.Parallel()
component := &fakeComponent{blockForCtx: true}
app := New(newCfg(), component)
ctx, cancel := context.WithCancel(context.Background())
go func() {
time.Sleep(10 * time.Millisecond)
cancel()
}()
require.NoError(t, app.Run(ctx))
assert.EqualValues(t, 1, component.runCount.Load())
assert.EqualValues(t, 1, component.downCount.Load())
}
func TestAppRunPropagatesComponentFailure(t *testing.T) {
t.Parallel()
failure := errors.New("boom")
component := &fakeComponent{runErr: failure}
app := New(newCfg(), component)
err := app.Run(context.Background())
require.Error(t, err)
require.ErrorIs(t, err, failure)
assert.EqualValues(t, 1, component.downCount.Load())
}
func TestAppRunFailsOnNilContext(t *testing.T) {
t.Parallel()
app := New(newCfg())
var ctx context.Context
require.Error(t, app.Run(ctx))
}
func TestAppRunFailsOnNonPositiveShutdownTimeout(t *testing.T) {
t.Parallel()
app := New(config.Config{}, &fakeComponent{})
require.Error(t, app.Run(context.Background()))
}
func TestAppRunFailsOnNilComponent(t *testing.T) {
t.Parallel()
app := New(newCfg(), nil)
require.Error(t, app.Run(context.Background()))
}
func TestAppRunFlagsCleanExitBeforeShutdown(t *testing.T) {
t.Parallel()
component := &fakeComponent{}
app := New(newCfg(), component)
err := app.Run(context.Background())
require.ErrorContains(t, err, "exited without error")
}
@@ -0,0 +1,85 @@
package app
import (
"context"
"errors"
"fmt"
"time"
"galaxy/redisconn"
"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/telemetry"
"github.com/docker/docker/client"
"github.com/redis/go-redis/v9"
)
// newRedisClient builds the master Redis client from cfg via the shared
// `pkg/redisconn` helper. Replica clients are not opened in this iteration
// per ARCHITECTURE.md §Persistence Backends; they will be wired when read
// routing is introduced.
func newRedisClient(cfg config.RedisConfig) *redis.Client {
return redisconn.NewMasterClient(cfg.Conn)
}
// instrumentRedisClient attaches the OpenTelemetry tracing and metrics
// instrumentation to client when telemetryRuntime is available. The
// actual instrumentation lives in `pkg/redisconn` so every Galaxy service
// shares one surface.
func instrumentRedisClient(redisClient *redis.Client, telemetryRuntime *telemetry.Runtime) error {
if redisClient == nil {
return errors.New("instrument redis client: nil client")
}
if telemetryRuntime == nil {
return nil
}
return redisconn.Instrument(redisClient,
redisconn.WithTracerProvider(telemetryRuntime.TracerProvider()),
redisconn.WithMeterProvider(telemetryRuntime.MeterProvider()),
)
}
// pingRedis performs a single Redis PING bounded by
// cfg.Conn.OperationTimeout to confirm that the configured Redis endpoint
// is reachable at startup.
func pingRedis(ctx context.Context, cfg config.RedisConfig, redisClient *redis.Client) error {
return redisconn.Ping(ctx, redisClient, cfg.Conn.OperationTimeout)
}
// newDockerClient constructs a Docker SDK client for cfg.Host with an
// optional API version override. The bootstrap layer opens and pings
// the client; the production Docker adapter wraps it for the service
// layer.
func newDockerClient(cfg config.DockerConfig) (*client.Client, error) {
options := []client.Opt{client.WithHost(cfg.Host)}
if cfg.APIVersion == "" {
options = append(options, client.WithAPIVersionNegotiation())
} else {
options = append(options, client.WithVersion(cfg.APIVersion))
}
docker, err := client.NewClientWithOpts(options...)
if err != nil {
return nil, fmt.Errorf("new docker client: %w", err)
}
return docker, nil
}
// pingDocker bounds one Docker daemon ping under timeout and returns a
// wrapped error so startup failures are easy to spot in service logs.
func pingDocker(ctx context.Context, dockerClient *client.Client, timeout time.Duration) error {
if dockerClient == nil {
return errors.New("ping docker: nil client")
}
if timeout <= 0 {
return errors.New("ping docker: timeout must be positive")
}
pingCtx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
if _, err := dockerClient.Ping(pingCtx); err != nil {
return fmt.Errorf("ping docker: %w", err)
}
return nil
}
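Taken together, these helpers imply a construct-then-ping bootstrap order. Below is a minimal package-internal sketch of that order, not part of this commit; the five-second timeout is illustrative.

// bootstrapDockerSketch shows the intended call order: build the SDK
// client (API version negotiated unless cfg.APIVersion pins one), then
// confirm the daemon answers before any adapter wraps the client.
func bootstrapDockerSketch(ctx context.Context, cfg config.DockerConfig) (*client.Client, error) {
	docker, err := newDockerClient(cfg)
	if err != nil {
		return nil, err
	}
	if err := pingDocker(ctx, docker, 5*time.Second); err != nil {
		_ = docker.Close() // release the transport if the daemon is unreachable
		return nil, err
	}
	return docker, nil
}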
@@ -0,0 +1,82 @@
package app
import (
"context"
"testing"
"time"
"galaxy/redisconn"
"galaxy/rtmanager/internal/config"
"github.com/alicebob/miniredis/v2"
"github.com/stretchr/testify/require"
)
func newTestRedisCfg(addr string) config.RedisConfig {
return config.RedisConfig{
Conn: redisconn.Config{
MasterAddr: addr,
Password: "test",
OperationTimeout: time.Second,
},
}
}
func TestPingRedisSucceedsAgainstMiniredis(t *testing.T) {
t.Parallel()
server := miniredis.RunT(t)
redisCfg := newTestRedisCfg(server.Addr())
client := newRedisClient(redisCfg)
t.Cleanup(func() { _ = client.Close() })
require.NoError(t, pingRedis(context.Background(), redisCfg, client))
}
func TestPingRedisReturnsErrorWhenClosed(t *testing.T) {
t.Parallel()
server := miniredis.RunT(t)
redisCfg := newTestRedisCfg(server.Addr())
client := newRedisClient(redisCfg)
require.NoError(t, client.Close())
require.Error(t, pingRedis(context.Background(), redisCfg, client))
}
func TestNewDockerClientHonoursHostOverride(t *testing.T) {
t.Parallel()
docker, err := newDockerClient(config.DockerConfig{
Host: "unix:///var/run/docker.sock",
APIVersion: "1.43",
Network: "galaxy-net",
LogDriver: "json-file",
PullPolicy: config.ImagePullPolicyIfMissing,
})
require.NoError(t, err)
require.NotNil(t, docker)
require.NoError(t, docker.Close())
}
func TestPingDockerRejectsNilClient(t *testing.T) {
t.Parallel()
require.Error(t, pingDocker(context.Background(), nil, time.Second))
}
func TestPingDockerRejectsNonPositiveTimeout(t *testing.T) {
t.Parallel()
docker, err := newDockerClient(config.DockerConfig{
Host: "unix:///var/run/docker.sock",
Network: "galaxy-net",
LogDriver: "json-file",
})
require.NoError(t, err)
t.Cleanup(func() { _ = docker.Close() })
require.Error(t, pingDocker(context.Background(), docker, 0))
}
@@ -0,0 +1,262 @@
package app
import (
"context"
"database/sql"
"errors"
"fmt"
"log/slog"
"time"
"galaxy/postgres"
"galaxy/redisconn"
"galaxy/rtmanager/internal/adapters/postgres/migrations"
"galaxy/rtmanager/internal/api/internalhttp"
"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/telemetry"
dockerclient "github.com/docker/docker/client"
"github.com/redis/go-redis/v9"
)
// Runtime owns the runnable Runtime Manager process plus the cleanup
// functions that release runtime resources after shutdown.
type Runtime struct {
cfg config.Config
app *App
wiring *wiring
internalServer *internalhttp.Server
cleanupFns []func() error
}
// NewRuntime constructs the runnable Runtime Manager process from cfg.
//
// PostgreSQL migrations apply strictly before the internal HTTP listener
// becomes ready. The runtime opens one shared `*redis.Client`, one
// `*sql.DB`, one Docker SDK client, and one OpenTelemetry runtime; all
// are released in reverse construction order on shutdown.
func NewRuntime(ctx context.Context, cfg config.Config, logger *slog.Logger) (*Runtime, error) {
if ctx == nil {
return nil, errors.New("new rtmanager runtime: nil context")
}
if err := cfg.Validate(); err != nil {
return nil, fmt.Errorf("new rtmanager runtime: %w", err)
}
if logger == nil {
logger = slog.Default()
}
runtime := &Runtime{
cfg: cfg,
}
cleanupOnError := func(err error) (*Runtime, error) {
if cleanupErr := runtime.Close(); cleanupErr != nil {
return nil, fmt.Errorf("%w; cleanup: %w", err, cleanupErr)
}
return nil, err
}
telemetryRuntime, err := telemetry.NewProcess(ctx, telemetry.ProcessConfig{
ServiceName: cfg.Telemetry.ServiceName,
TracesExporter: cfg.Telemetry.TracesExporter,
MetricsExporter: cfg.Telemetry.MetricsExporter,
TracesProtocol: cfg.Telemetry.TracesProtocol,
MetricsProtocol: cfg.Telemetry.MetricsProtocol,
StdoutTracesEnabled: cfg.Telemetry.StdoutTracesEnabled,
StdoutMetricsEnabled: cfg.Telemetry.StdoutMetricsEnabled,
}, logger)
if err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: telemetry: %w", err))
}
runtime.cleanupFns = append(runtime.cleanupFns, func() error {
shutdownCtx, cancel := context.WithTimeout(context.Background(), cfg.ShutdownTimeout)
defer cancel()
return telemetryRuntime.Shutdown(shutdownCtx)
})
redisClient := newRedisClient(cfg.Redis)
if err := instrumentRedisClient(redisClient, telemetryRuntime); err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: %w", err))
}
runtime.cleanupFns = append(runtime.cleanupFns, func() error {
err := redisClient.Close()
if errors.Is(err, redis.ErrClosed) {
return nil
}
return err
})
if err := pingRedis(ctx, cfg.Redis, redisClient); err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: %w", err))
}
pgPool, err := postgres.OpenPrimary(ctx, cfg.Postgres.Conn,
postgres.WithTracerProvider(telemetryRuntime.TracerProvider()),
postgres.WithMeterProvider(telemetryRuntime.MeterProvider()),
)
if err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: open postgres: %w", err))
}
runtime.cleanupFns = append(runtime.cleanupFns, pgPool.Close)
unregisterPGStats, err := postgres.InstrumentDBStats(pgPool,
postgres.WithMeterProvider(telemetryRuntime.MeterProvider()),
)
if err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: instrument postgres: %w", err))
}
runtime.cleanupFns = append(runtime.cleanupFns, func() error {
return unregisterPGStats()
})
if err := postgres.Ping(ctx, pgPool, cfg.Postgres.Conn.OperationTimeout); err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: ping postgres: %w", err))
}
if err := postgres.RunMigrations(ctx, pgPool, migrations.FS(), "."); err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: run postgres migrations: %w", err))
}
dockerClient, err := newDockerClient(cfg.Docker)
if err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: %w", err))
}
runtime.cleanupFns = append(runtime.cleanupFns, dockerClient.Close)
// NOTE: the Docker startup ping reuses the Postgres operation timeout.
if err := pingDocker(ctx, dockerClient, cfg.Postgres.Conn.OperationTimeout); err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: %w", err))
}
wiring, err := newWiring(cfg, redisClient, pgPool, dockerClient, time.Now, logger, telemetryRuntime)
if err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: wiring: %w", err))
}
runtime.wiring = wiring
runtime.cleanupFns = append(runtime.cleanupFns, wiring.close)
if err := wiring.registerTelemetryGauges(); err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: register telemetry gauges: %w", err))
}
if err := wiring.reconciler.ReconcileNow(ctx); err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: initial reconcile: %w", err))
}
probe := newReadinessProbe(pgPool, redisClient, dockerClient, cfg)
internalServer, err := internalhttp.NewServer(internalhttp.Config{
Addr: cfg.InternalHTTP.Addr,
ReadHeaderTimeout: cfg.InternalHTTP.ReadHeaderTimeout,
ReadTimeout: cfg.InternalHTTP.ReadTimeout,
WriteTimeout: cfg.InternalHTTP.WriteTimeout,
IdleTimeout: cfg.InternalHTTP.IdleTimeout,
}, internalhttp.Dependencies{
Logger: logger,
Telemetry: telemetryRuntime,
Readiness: probe,
RuntimeRecords: wiring.runtimeRecordStore,
StartRuntime: wiring.startRuntimeService,
StopRuntime: wiring.stopRuntimeService,
RestartRuntime: wiring.restartRuntimeService,
PatchRuntime: wiring.patchRuntimeService,
CleanupContainer: wiring.cleanupContainerService,
})
if err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: internal HTTP server: %w", err))
}
runtime.internalServer = internalServer
runtime.app = New(cfg,
internalServer,
wiring.startJobsConsumer,
wiring.stopJobsConsumer,
wiring.dockerEventsListener,
wiring.healthProbeWorker,
wiring.dockerInspectWorker,
wiring.reconciler,
wiring.containerCleanupWorker,
)
return runtime, nil
}
// InternalServer returns the internal HTTP server owned by runtime. It is
// primarily exposed for tests; production code should not depend on it.
func (runtime *Runtime) InternalServer() *internalhttp.Server {
if runtime == nil {
return nil
}
return runtime.internalServer
}
// Run starts every registered component (the internal HTTP server, the
// stream consumers, and the background workers) and blocks until ctx is
// canceled or one component fails.
func (runtime *Runtime) Run(ctx context.Context) error {
if ctx == nil {
return errors.New("run rtmanager runtime: nil context")
}
if runtime == nil {
return errors.New("run rtmanager runtime: nil runtime")
}
if runtime.app == nil {
return errors.New("run rtmanager runtime: nil app")
}
return runtime.app.Run(ctx)
}
// Close releases every runtime dependency in reverse construction order.
// Close is safe to call multiple times.
func (runtime *Runtime) Close() error {
if runtime == nil {
return nil
}
var joined error
for index := len(runtime.cleanupFns) - 1; index >= 0; index-- {
if err := runtime.cleanupFns[index](); err != nil {
joined = errors.Join(joined, err)
}
}
runtime.cleanupFns = nil
return joined
}
// readinessProbe pings every steady-state dependency the listener
// guards: the PostgreSQL primary, the Redis master, and the Docker
// daemon.
type readinessProbe struct {
pgPool *sql.DB
redisClient *redis.Client
dockerClient *dockerclient.Client
postgresTimeout time.Duration
redisTimeout time.Duration
dockerTimeout time.Duration
}
func newReadinessProbe(pgPool *sql.DB, redisClient *redis.Client, dockerClient *dockerclient.Client, cfg config.Config) *readinessProbe {
return &readinessProbe{
pgPool: pgPool,
redisClient: redisClient,
dockerClient: dockerClient,
postgresTimeout: cfg.Postgres.Conn.OperationTimeout,
redisTimeout: cfg.Redis.Conn.OperationTimeout,
dockerTimeout: cfg.Postgres.Conn.OperationTimeout, // Docker checks reuse the Postgres operation timeout
}
}
// Check pings PostgreSQL, Redis, and Docker. The first failing
// dependency aborts the check so callers see a single, actionable
// error.
func (probe *readinessProbe) Check(ctx context.Context) error {
if err := postgres.Ping(ctx, probe.pgPool, probe.postgresTimeout); err != nil {
return err
}
if err := redisconn.Ping(ctx, probe.redisClient, probe.redisTimeout); err != nil {
return err
}
return pingDocker(ctx, probe.dockerClient, probe.dockerTimeout)
}
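For orientation, here is a hedged sketch of how a cmd entrypoint might drive Runtime. The config.Load helper and the signal handling are assumptions for illustration, not code from this commit; only NewRuntime, Run, and Close come from the file above.

package main

import (
	"context"
	"log/slog"
	"os"
	"os/signal"
	"syscall"

	"galaxy/rtmanager/internal/app"
	"galaxy/rtmanager/internal/config"
)

func main() {
	logger := slog.Default()

	// Stop on SIGINT/SIGTERM; both NewRuntime and Run honour the context.
	ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
	defer stop()

	if err := run(ctx, logger); err != nil {
		logger.Error("rtmanager exited with error", "error", err)
		os.Exit(1)
	}
}

func run(ctx context.Context, logger *slog.Logger) error {
	// config.Load is a placeholder for however the service reads its config.
	cfg, err := config.Load()
	if err != nil {
		return err
	}

	runtime, err := app.NewRuntime(ctx, cfg, logger)
	if err != nil {
		return err
	}
	// Close releases dependencies in reverse construction order and is
	// safe to call even if Run never started.
	defer func() { _ = runtime.Close() }()

	return runtime.Run(ctx)
}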
@@ -0,0 +1,541 @@
package app
import (
"context"
"database/sql"
"errors"
"fmt"
"log/slog"
"net/http"
"time"
"galaxy/rtmanager/internal/adapters/docker"
"galaxy/rtmanager/internal/adapters/healtheventspublisher"
"galaxy/rtmanager/internal/adapters/jobresultspublisher"
"galaxy/rtmanager/internal/adapters/lobbyclient"
"galaxy/rtmanager/internal/adapters/notificationpublisher"
"galaxy/rtmanager/internal/adapters/postgres/healthsnapshotstore"
"galaxy/rtmanager/internal/adapters/postgres/operationlogstore"
"galaxy/rtmanager/internal/adapters/postgres/runtimerecordstore"
"galaxy/rtmanager/internal/adapters/redisstate/gamelease"
"galaxy/rtmanager/internal/adapters/redisstate/streamoffsets"
"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/cleanupcontainer"
"galaxy/rtmanager/internal/service/patchruntime"
"galaxy/rtmanager/internal/service/restartruntime"
"galaxy/rtmanager/internal/service/startruntime"
"galaxy/rtmanager/internal/service/stopruntime"
"galaxy/rtmanager/internal/telemetry"
"galaxy/rtmanager/internal/worker/containercleanup"
"galaxy/rtmanager/internal/worker/dockerevents"
"galaxy/rtmanager/internal/worker/dockerinspect"
"galaxy/rtmanager/internal/worker/healthprobe"
"galaxy/rtmanager/internal/worker/reconcile"
"galaxy/rtmanager/internal/worker/startjobsconsumer"
"galaxy/rtmanager/internal/worker/stopjobsconsumer"
dockerclient "github.com/docker/docker/client"
"github.com/redis/go-redis/v9"
"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
)
// wiring owns the process-level singletons constructed once during
// `NewRuntime` and consumed by every worker and HTTP handler.
//
// The struct groups the persistence stores, external adapters, services,
// and workers as typed fields so the runtime bootstrap can hand them to
// the internal HTTP server and the app component list.
type wiring struct {
cfg config.Config
redisClient *redis.Client
pgPool *sql.DB
dockerClient *dockerclient.Client
clock func() time.Time
logger *slog.Logger
telemetry *telemetry.Runtime
// Persistence stores.
runtimeRecordStore *runtimerecordstore.Store
operationLogStore *operationlogstore.Store
healthSnapshotStore *healthsnapshotstore.Store
streamOffsetStore *streamoffsets.Store
gameLeaseStore *gamelease.Store
// External adapters.
dockerAdapter *docker.Client
lobbyClient *lobbyclient.Client
notificationPublisher *notificationpublisher.Publisher
healthEventsPublisher *healtheventspublisher.Publisher
jobResultsPublisher *jobresultspublisher.Publisher
// Service layer.
startRuntimeService *startruntime.Service
stopRuntimeService *stopruntime.Service
restartRuntimeService *restartruntime.Service
patchRuntimeService *patchruntime.Service
cleanupContainerService *cleanupcontainer.Service
// Worker layer.
startJobsConsumer *startjobsconsumer.Consumer
stopJobsConsumer *stopjobsconsumer.Consumer
dockerEventsListener *dockerevents.Listener
healthProbeWorker *healthprobe.Worker
dockerInspectWorker *dockerinspect.Worker
reconciler *reconcile.Reconciler
containerCleanupWorker *containercleanup.Worker
// closers releases adapter-level resources at runtime shutdown.
closers []func() error
}
// newWiring constructs the process-level dependency set, the persistence
// stores, the external adapters, and the service layer. It validates
// every required collaborator so callers can rely on them being non-nil.
func newWiring(
cfg config.Config,
redisClient *redis.Client,
pgPool *sql.DB,
dockerClient *dockerclient.Client,
clock func() time.Time,
logger *slog.Logger,
telemetryRuntime *telemetry.Runtime,
) (*wiring, error) {
if redisClient == nil {
return nil, errors.New("new rtmanager wiring: nil redis client")
}
if pgPool == nil {
return nil, errors.New("new rtmanager wiring: nil postgres pool")
}
if dockerClient == nil {
return nil, errors.New("new rtmanager wiring: nil docker client")
}
if clock == nil {
clock = time.Now
}
if logger == nil {
logger = slog.Default()
}
if telemetryRuntime == nil {
return nil, errors.New("new rtmanager wiring: nil telemetry runtime")
}
w := &wiring{
cfg: cfg,
redisClient: redisClient,
pgPool: pgPool,
dockerClient: dockerClient,
clock: clock,
logger: logger,
telemetry: telemetryRuntime,
}
if err := w.buildPersistence(); err != nil {
return nil, fmt.Errorf("new rtmanager wiring: %w", err)
}
if err := w.buildAdapters(); err != nil {
_ = w.close()
return nil, fmt.Errorf("new rtmanager wiring: %w", err)
}
if err := w.buildServices(); err != nil {
_ = w.close()
return nil, fmt.Errorf("new rtmanager wiring: %w", err)
}
if err := w.buildWorkers(); err != nil {
_ = w.close()
return nil, fmt.Errorf("new rtmanager wiring: %w", err)
}
return w, nil
}
func (w *wiring) buildPersistence() error {
runtimeStore, err := runtimerecordstore.New(runtimerecordstore.Config{
DB: w.pgPool,
OperationTimeout: w.cfg.Postgres.Conn.OperationTimeout,
})
if err != nil {
return fmt.Errorf("runtime record store: %w", err)
}
w.runtimeRecordStore = runtimeStore
operationStore, err := operationlogstore.New(operationlogstore.Config{
DB: w.pgPool,
OperationTimeout: w.cfg.Postgres.Conn.OperationTimeout,
})
if err != nil {
return fmt.Errorf("operation log store: %w", err)
}
w.operationLogStore = operationStore
snapshotStore, err := healthsnapshotstore.New(healthsnapshotstore.Config{
DB: w.pgPool,
OperationTimeout: w.cfg.Postgres.Conn.OperationTimeout,
})
if err != nil {
return fmt.Errorf("health snapshot store: %w", err)
}
w.healthSnapshotStore = snapshotStore
offsetStore, err := streamoffsets.New(streamoffsets.Config{Client: w.redisClient})
if err != nil {
return fmt.Errorf("stream offset store: %w", err)
}
w.streamOffsetStore = offsetStore
leaseStore, err := gamelease.New(gamelease.Config{Client: w.redisClient})
if err != nil {
return fmt.Errorf("game lease store: %w", err)
}
w.gameLeaseStore = leaseStore
return nil
}
func (w *wiring) buildAdapters() error {
dockerAdapter, err := docker.NewClient(docker.Config{
Docker: w.dockerClient,
LogDriver: w.cfg.Docker.LogDriver,
LogOpts: w.cfg.Docker.LogOpts,
Clock: w.clock,
})
if err != nil {
return fmt.Errorf("docker adapter: %w", err)
}
w.dockerAdapter = dockerAdapter
lobby, err := lobbyclient.NewClient(lobbyclient.Config{
BaseURL: w.cfg.Lobby.BaseURL,
RequestTimeout: w.cfg.Lobby.Timeout,
})
if err != nil {
return fmt.Errorf("lobby client: %w", err)
}
w.lobbyClient = lobby
w.closers = append(w.closers, lobby.Close)
notificationPub, err := notificationpublisher.NewPublisher(notificationpublisher.Config{
Client: w.redisClient,
Stream: w.cfg.Streams.NotificationIntents,
})
if err != nil {
return fmt.Errorf("notification publisher: %w", err)
}
w.notificationPublisher = notificationPub
healthPub, err := healtheventspublisher.NewPublisher(healtheventspublisher.Config{
Client: w.redisClient,
Snapshots: w.healthSnapshotStore,
Stream: w.cfg.Streams.HealthEvents,
})
if err != nil {
return fmt.Errorf("health events publisher: %w", err)
}
w.healthEventsPublisher = healthPub
jobResultsPub, err := jobresultspublisher.NewPublisher(jobresultspublisher.Config{
Client: w.redisClient,
Stream: w.cfg.Streams.JobResults,
})
if err != nil {
return fmt.Errorf("job results publisher: %w", err)
}
w.jobResultsPublisher = jobResultsPub
return nil
}
func (w *wiring) buildServices() error {
startService, err := startruntime.NewService(startruntime.Dependencies{
RuntimeRecords: w.runtimeRecordStore,
OperationLogs: w.operationLogStore,
Docker: w.dockerAdapter,
Leases: w.gameLeaseStore,
HealthEvents: w.healthEventsPublisher,
Notifications: w.notificationPublisher,
Lobby: w.lobbyClient,
Container: w.cfg.Container,
DockerCfg: w.cfg.Docker,
Coordination: w.cfg.Coordination,
Telemetry: w.telemetry,
Logger: w.logger,
Clock: w.clock,
})
if err != nil {
return fmt.Errorf("start runtime service: %w", err)
}
w.startRuntimeService = startService
stopService, err := stopruntime.NewService(stopruntime.Dependencies{
RuntimeRecords: w.runtimeRecordStore,
OperationLogs: w.operationLogStore,
Docker: w.dockerAdapter,
Leases: w.gameLeaseStore,
HealthEvents: w.healthEventsPublisher,
Container: w.cfg.Container,
Coordination: w.cfg.Coordination,
Telemetry: w.telemetry,
Logger: w.logger,
Clock: w.clock,
})
if err != nil {
return fmt.Errorf("stop runtime service: %w", err)
}
w.stopRuntimeService = stopService
restartService, err := restartruntime.NewService(restartruntime.Dependencies{
RuntimeRecords: w.runtimeRecordStore,
OperationLogs: w.operationLogStore,
Docker: w.dockerAdapter,
Leases: w.gameLeaseStore,
StopService: stopService,
StartService: startService,
Coordination: w.cfg.Coordination,
Telemetry: w.telemetry,
Logger: w.logger,
Clock: w.clock,
})
if err != nil {
return fmt.Errorf("restart runtime service: %w", err)
}
w.restartRuntimeService = restartService
patchService, err := patchruntime.NewService(patchruntime.Dependencies{
RuntimeRecords: w.runtimeRecordStore,
OperationLogs: w.operationLogStore,
Docker: w.dockerAdapter,
Leases: w.gameLeaseStore,
StopService: stopService,
StartService: startService,
Coordination: w.cfg.Coordination,
Telemetry: w.telemetry,
Logger: w.logger,
Clock: w.clock,
})
if err != nil {
return fmt.Errorf("patch runtime service: %w", err)
}
w.patchRuntimeService = patchService
cleanupService, err := cleanupcontainer.NewService(cleanupcontainer.Dependencies{
RuntimeRecords: w.runtimeRecordStore,
OperationLogs: w.operationLogStore,
Docker: w.dockerAdapter,
Leases: w.gameLeaseStore,
Coordination: w.cfg.Coordination,
Telemetry: w.telemetry,
Logger: w.logger,
Clock: w.clock,
})
if err != nil {
return fmt.Errorf("cleanup container service: %w", err)
}
w.cleanupContainerService = cleanupService
return nil
}
// buildWorkers constructs the asynchronous workers: the start/stop jobs
// stream consumers, the Docker events listener, the health probe and
// Docker inspect workers, the reconciler, and the container cleanup
// worker. All of them participate in the process lifecycle as
// `app.Component`s; `internal/app/runtime.go` passes them into `app.New`
// alongside the internal HTTP server.
func (w *wiring) buildWorkers() error {
startConsumer, err := startjobsconsumer.NewConsumer(startjobsconsumer.Config{
Client: w.redisClient,
Stream: w.cfg.Streams.StartJobs,
BlockTimeout: w.cfg.Streams.BlockTimeout,
StartService: w.startRuntimeService,
JobResults: w.jobResultsPublisher,
OffsetStore: w.streamOffsetStore,
Logger: w.logger,
})
if err != nil {
return fmt.Errorf("start jobs consumer: %w", err)
}
w.startJobsConsumer = startConsumer
stopConsumer, err := stopjobsconsumer.NewConsumer(stopjobsconsumer.Config{
Client: w.redisClient,
Stream: w.cfg.Streams.StopJobs,
BlockTimeout: w.cfg.Streams.BlockTimeout,
StopService: w.stopRuntimeService,
JobResults: w.jobResultsPublisher,
OffsetStore: w.streamOffsetStore,
Logger: w.logger,
})
if err != nil {
return fmt.Errorf("stop jobs consumer: %w", err)
}
w.stopJobsConsumer = stopConsumer
eventsListener, err := dockerevents.NewListener(dockerevents.Dependencies{
Docker: w.dockerAdapter,
RuntimeRecords: w.runtimeRecordStore,
HealthEvents: w.healthEventsPublisher,
Telemetry: w.telemetry,
Clock: w.clock,
Logger: w.logger,
})
if err != nil {
return fmt.Errorf("docker events listener: %w", err)
}
w.dockerEventsListener = eventsListener
probeHTTPClient, err := newProbeHTTPClient(w.telemetry)
if err != nil {
return fmt.Errorf("health probe http client: %w", err)
}
probeWorker, err := healthprobe.NewWorker(healthprobe.Dependencies{
RuntimeRecords: w.runtimeRecordStore,
HealthEvents: w.healthEventsPublisher,
HTTPClient: probeHTTPClient,
Telemetry: w.telemetry,
Interval: w.cfg.Health.ProbeInterval,
ProbeTimeout: w.cfg.Health.ProbeTimeout,
FailuresThreshold: w.cfg.Health.ProbeFailuresThreshold,
Clock: w.clock,
Logger: w.logger,
})
if err != nil {
return fmt.Errorf("health probe worker: %w", err)
}
w.healthProbeWorker = probeWorker
inspectWorker, err := dockerinspect.NewWorker(dockerinspect.Dependencies{
Docker: w.dockerAdapter,
RuntimeRecords: w.runtimeRecordStore,
HealthEvents: w.healthEventsPublisher,
Telemetry: w.telemetry,
Interval: w.cfg.Health.InspectInterval,
Clock: w.clock,
Logger: w.logger,
})
if err != nil {
return fmt.Errorf("docker inspect worker: %w", err)
}
w.dockerInspectWorker = inspectWorker
reconciler, err := reconcile.NewReconciler(reconcile.Dependencies{
Docker: w.dockerAdapter,
RuntimeRecords: w.runtimeRecordStore,
OperationLogs: w.operationLogStore,
HealthEvents: w.healthEventsPublisher,
Leases: w.gameLeaseStore,
Telemetry: w.telemetry,
DockerCfg: w.cfg.Docker,
ContainerCfg: w.cfg.Container,
Coordination: w.cfg.Coordination,
Interval: w.cfg.Cleanup.ReconcileInterval,
Clock: w.clock,
Logger: w.logger,
})
if err != nil {
return fmt.Errorf("reconciler: %w", err)
}
w.reconciler = reconciler
cleanupWorker, err := containercleanup.NewWorker(containercleanup.Dependencies{
RuntimeRecords: w.runtimeRecordStore,
Cleanup: w.cleanupContainerService,
Retention: w.cfg.Container.Retention,
Interval: w.cfg.Cleanup.CleanupInterval,
Clock: w.clock,
Logger: w.logger,
})
if err != nil {
return fmt.Errorf("container cleanup worker: %w", err)
}
w.containerCleanupWorker = cleanupWorker
return nil
}
// newProbeHTTPClient constructs the otelhttp-instrumented HTTP client
// the active health probe uses to call engine `/healthz`. It clones
// http.DefaultTransport so the instrumented copy stays isolated from
// the shared global transport (mirroring the lobby internal client).
func newProbeHTTPClient(telemetryRuntime *telemetry.Runtime) (*http.Client, error) {
transport, ok := http.DefaultTransport.(*http.Transport)
if !ok {
return nil, errors.New("default http transport is not *http.Transport")
}
cloned := transport.Clone()
instrumented := otelhttp.NewTransport(cloned,
otelhttp.WithTracerProvider(telemetryRuntime.TracerProvider()),
otelhttp.WithMeterProvider(telemetryRuntime.MeterProvider()),
)
return &http.Client{Transport: instrumented}, nil
}
// registerTelemetryGauges installs the runtime-records-by-status gauge
// callback so the telemetry runtime can observe the persistent store
// without holding a strong reference to the wiring.
func (w *wiring) registerTelemetryGauges() error {
probe := newRuntimeRecordsProbe(w.runtimeRecordStore)
return w.telemetry.RegisterGauges(telemetry.GaugeDependencies{
RuntimeRecordsByStatus: probe,
Logger: w.logger,
})
}
// close releases adapter-level resources owned by the wiring layer.
// Returns the joined error of every closer; the caller is expected to
// invoke this once during process shutdown.
func (w *wiring) close() error {
var joined error
for index := len(w.closers) - 1; index >= 0; index-- {
if err := w.closers[index](); err != nil {
joined = errors.Join(joined, err)
}
}
w.closers = nil
return joined
}
// runtimeRecordsProbe adapts runtimerecordstore.Store to
// telemetry.RuntimeRecordsByStatusProbe by translating the typed status
// keys into the string keys the gauge expects.
type runtimeRecordsProbe struct {
store *runtimerecordstore.Store
}
func newRuntimeRecordsProbe(store *runtimerecordstore.Store) *runtimeRecordsProbe {
return &runtimeRecordsProbe{store: store}
}
func (p *runtimeRecordsProbe) CountByStatus(ctx context.Context) (map[string]int, error) {
if p == nil || p.store == nil {
return nil, errors.New("runtime records probe: nil store")
}
counts, err := p.store.CountByStatus(ctx)
if err != nil {
return nil, err
}
out := make(map[string]int, len(counts))
for status, count := range counts {
out[string(status)] = count
}
return out, nil
}
// Compile-time assertions that the constructed adapters satisfy the
// expected port surfaces; these prevent silent regressions when a
// port shape changes.
var (
_ ports.RuntimeRecordStore = (*runtimerecordstore.Store)(nil)
_ ports.OperationLogStore = (*operationlogstore.Store)(nil)
_ ports.HealthSnapshotStore = (*healthsnapshotstore.Store)(nil)
_ ports.StreamOffsetStore = (*streamoffsets.Store)(nil)
_ ports.GameLeaseStore = (*gamelease.Store)(nil)
_ ports.DockerClient = (*docker.Client)(nil)
_ ports.LobbyInternalClient = (*lobbyclient.Client)(nil)
_ ports.NotificationIntentPublisher = (*notificationpublisher.Publisher)(nil)
_ ports.HealthEventPublisher = (*healtheventspublisher.Publisher)(nil)
_ ports.JobResultPublisher = (*jobresultspublisher.Publisher)(nil)
_ Component = (*reconcile.Reconciler)(nil)
_ Component = (*containercleanup.Worker)(nil)
_ containercleanup.Cleaner = (*cleanupcontainer.Service)(nil)
)
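Both wiring.close and Runtime.Close follow the same convention: push a cleanup function right after acquiring each resource, then release in reverse and join every error so one failed cleanup never hides the rest. A self-contained sketch of that pattern with illustrative names:

package main

import (
	"errors"
	"fmt"
)

// closerStack releases resources in reverse acquisition order, joining
// every error it encounters along the way.
type closerStack struct {
	closers []func() error
}

func (s *closerStack) push(fn func() error) { s.closers = append(s.closers, fn) }

func (s *closerStack) close() error {
	var joined error
	for i := len(s.closers) - 1; i >= 0; i-- {
		if err := s.closers[i](); err != nil {
			joined = errors.Join(joined, err)
		}
	}
	s.closers = nil
	return joined
}

func main() {
	var stack closerStack
	stack.push(func() error { fmt.Println("close telemetry"); return nil })
	stack.push(func() error { fmt.Println("close redis"); return nil })
	stack.push(func() error { fmt.Println("close docker"); return nil })

	// Prints docker, redis, telemetry: last acquired, first released.
	if err := stack.close(); err != nil {
		fmt.Println("cleanup failed:", err)
	}
}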