feat: runtime manager

This commit is contained in:
Ilia Denisov
2026-04-28 20:39:18 +02:00
committed by GitHub
parent e0a99b346b
commit a7cee15115
289 changed files with 45660 additions and 2207 deletions
+632
View File
@@ -0,0 +1,632 @@
// Package config loads the Runtime Manager process configuration from
// environment variables.
package config
import (
"fmt"
"strings"
"time"
"galaxy/postgres"
"galaxy/redisconn"
"galaxy/rtmanager/internal/telemetry"
)
// Environment variable names for the Runtime Manager configuration,
// grouped by topic.
const (
	// Common prefix of the service-specific variables.
	envPrefix = "RTMANAGER"

	// Process lifecycle and logging.
	shutdownTimeoutEnvVar = "RTMANAGER_SHUTDOWN_TIMEOUT"
	logLevelEnvVar        = "RTMANAGER_LOG_LEVEL"

	// Internal HTTP listener.
	internalHTTPAddrEnvVar              = "RTMANAGER_INTERNAL_HTTP_ADDR"
	internalHTTPReadHeaderTimeoutEnvVar = "RTMANAGER_INTERNAL_HTTP_READ_HEADER_TIMEOUT"
	internalHTTPReadTimeoutEnvVar       = "RTMANAGER_INTERNAL_HTTP_READ_TIMEOUT"
	internalHTTPWriteTimeoutEnvVar      = "RTMANAGER_INTERNAL_HTTP_WRITE_TIMEOUT"
	internalHTTPIdleTimeoutEnvVar       = "RTMANAGER_INTERNAL_HTTP_IDLE_TIMEOUT"

	// Docker client.
	dockerHostEnvVar       = "RTMANAGER_DOCKER_HOST"
	dockerAPIVersionEnvVar = "RTMANAGER_DOCKER_API_VERSION"
	dockerNetworkEnvVar    = "RTMANAGER_DOCKER_NETWORK"
	dockerLogDriverEnvVar  = "RTMANAGER_DOCKER_LOG_DRIVER"
	dockerLogOptsEnvVar    = "RTMANAGER_DOCKER_LOG_OPTS"
	imagePullPolicyEnvVar  = "RTMANAGER_IMAGE_PULL_POLICY"

	// Per-container defaults and game state directory.
	defaultCPUQuotaEnvVar             = "RTMANAGER_DEFAULT_CPU_QUOTA"
	defaultMemoryEnvVar               = "RTMANAGER_DEFAULT_MEMORY"
	defaultPIDsLimitEnvVar            = "RTMANAGER_DEFAULT_PIDS_LIMIT"
	containerStopTimeoutSecondsEnvVar = "RTMANAGER_CONTAINER_STOP_TIMEOUT_SECONDS"
	containerRetentionDaysEnvVar      = "RTMANAGER_CONTAINER_RETENTION_DAYS"
	engineStateMountPathEnvVar        = "RTMANAGER_ENGINE_STATE_MOUNT_PATH"
	engineStateEnvNameEnvVar          = "RTMANAGER_ENGINE_STATE_ENV_NAME"
	gameStateDirModeEnvVar            = "RTMANAGER_GAME_STATE_DIR_MODE"
	gameStateOwnerUIDEnvVar           = "RTMANAGER_GAME_STATE_OWNER_UID"
	gameStateOwnerGIDEnvVar           = "RTMANAGER_GAME_STATE_OWNER_GID"
	gameStateRootEnvVar               = "RTMANAGER_GAME_STATE_ROOT"

	// Redis Streams.
	startJobsStreamEnvVar        = "RTMANAGER_REDIS_START_JOBS_STREAM"
	stopJobsStreamEnvVar         = "RTMANAGER_REDIS_STOP_JOBS_STREAM"
	jobResultsStreamEnvVar       = "RTMANAGER_REDIS_JOB_RESULTS_STREAM"
	healthEventsStreamEnvVar     = "RTMANAGER_REDIS_HEALTH_EVENTS_STREAM"
	notificationIntentsStreamEnv = "RTMANAGER_NOTIFICATION_INTENTS_STREAM"
	streamBlockTimeoutEnvVar     = "RTMANAGER_STREAM_BLOCK_TIMEOUT"

	// Health monitoring.
	inspectIntervalEnvVar        = "RTMANAGER_INSPECT_INTERVAL"
	probeIntervalEnvVar          = "RTMANAGER_PROBE_INTERVAL"
	probeTimeoutEnvVar           = "RTMANAGER_PROBE_TIMEOUT"
	probeFailuresThresholdEnvVar = "RTMANAGER_PROBE_FAILURES_THRESHOLD"

	// Reconcile and cleanup workers.
	reconcileIntervalEnvVar = "RTMANAGER_RECONCILE_INTERVAL"
	cleanupIntervalEnvVar   = "RTMANAGER_CLEANUP_INTERVAL"

	// Per-game lease.
	gameLeaseTTLSecondsEnvVar = "RTMANAGER_GAME_LEASE_TTL_SECONDS"

	// Lobby internal REST client.
	lobbyInternalBaseURLEnvVar = "RTMANAGER_LOBBY_INTERNAL_BASE_URL"
	lobbyInternalTimeoutEnvVar = "RTMANAGER_LOBBY_INTERNAL_TIMEOUT"

	// OpenTelemetry. The OTEL_* names carry no RTMANAGER prefix.
	otelServiceNameEnvVar                 = "OTEL_SERVICE_NAME"
	otelTracesExporterEnvVar              = "OTEL_TRACES_EXPORTER"
	otelMetricsExporterEnvVar             = "OTEL_METRICS_EXPORTER"
	otelExporterOTLPProtocolEnvVar        = "OTEL_EXPORTER_OTLP_PROTOCOL"
	otelExporterOTLPTracesProtocolEnvVar  = "OTEL_EXPORTER_OTLP_TRACES_PROTOCOL"
	otelExporterOTLPMetricsProtocolEnvVar = "OTEL_EXPORTER_OTLP_METRICS_PROTOCOL"
	otelStdoutTracesEnabledEnvVar         = "RTMANAGER_OTEL_STDOUT_TRACES_ENABLED"
	otelStdoutMetricsEnabledEnvVar        = "RTMANAGER_OTEL_STDOUT_METRICS_ENABLED"
)

// Default values used by DefaultConfig, grouped by topic.
const (
	// Process lifecycle and logging.
	defaultShutdownTimeout = 30 * time.Second
	defaultLogLevel        = "info"

	// Internal HTTP listener.
	defaultInternalHTTPAddr  = ":8096"
	defaultReadHeaderTimeout = 2 * time.Second
	defaultReadTimeout       = 5 * time.Second
	defaultWriteTimeout      = 15 * time.Second
	defaultIdleTimeout       = 60 * time.Second

	// Docker client.
	defaultDockerHost      = "unix:///var/run/docker.sock"
	defaultDockerNetwork   = "galaxy-net"
	defaultDockerLogDriver = "json-file"
	defaultImagePullPolicy = ImagePullPolicyIfMissing

	// Per-container defaults and game state directory.
	defaultCPUQuota             = 1.0
	defaultMemory               = "512m"
	defaultPIDsLimit            = 512
	defaultContainerStopTimeout = 30 * time.Second
	defaultContainerRetention   = 30 * 24 * time.Hour
	defaultEngineStateMountPath = "/var/lib/galaxy-game"
	defaultEngineStateEnvName   = "GAME_STATE_PATH"
	defaultGameStateDirMode     = 0o750

	// Redis Streams.
	defaultStartJobsStream        = "runtime:start_jobs"
	defaultStopJobsStream         = "runtime:stop_jobs"
	defaultJobResultsStream       = "runtime:job_results"
	defaultHealthEventsStream     = "runtime:health_events"
	defaultNotificationIntentsKey = "notification:intents"
	defaultStreamBlockTimeout     = 5 * time.Second

	// Health monitoring.
	defaultInspectInterval        = 30 * time.Second
	defaultProbeInterval          = 15 * time.Second
	defaultProbeTimeout           = 2 * time.Second
	defaultProbeFailuresThreshold = 3

	// Reconcile and cleanup workers.
	defaultReconcileInterval = 5 * time.Minute
	defaultCleanupInterval   = time.Hour

	// Per-game lease.
	defaultGameLeaseTTL = 60 * time.Second

	// Lobby internal REST client.
	defaultLobbyInternalTimeout = 2 * time.Second

	// OpenTelemetry.
	defaultOTelServiceName = "galaxy-rtmanager"
)
// ImagePullPolicy enumerates the supported image pull policies. The start
// service validates a producer-supplied `image_ref` against this policy at
// start time. Validate accepts exactly the three constants declared below
// and rejects every other value, including the empty string.
type ImagePullPolicy string
// Supported pull policies, frozen by `rtmanager/README.md` §Configuration.
const (
// ImagePullPolicyIfMissing is the `if_missing` policy. DefaultConfig
// selects it for DockerConfig.PullPolicy.
ImagePullPolicyIfMissing ImagePullPolicy = "if_missing"
// ImagePullPolicyAlways is the `always` policy.
ImagePullPolicyAlways ImagePullPolicy = "always"
// ImagePullPolicyNever is the `never` policy.
ImagePullPolicyNever ImagePullPolicy = "never"
)
// Validate returns nil when p equals one of the supported pull policies.
// Any other value, including the empty string, yields an error that quotes
// p and lists the accepted policies.
func (p ImagePullPolicy) Validate() error {
	if p == ImagePullPolicyIfMissing || p == ImagePullPolicyAlways || p == ImagePullPolicyNever {
		return nil
	}

	return fmt.Errorf("image pull policy %q must be one of %q, %q, %q",
		p, ImagePullPolicyIfMissing, ImagePullPolicyAlways, ImagePullPolicyNever)
}
// Config stores the full Runtime Manager process configuration.
//
// DefaultConfig returns a Config filled with the package defaults. Most
// section types expose their own Validate method.
type Config struct {
// ShutdownTimeout bounds graceful shutdown of every long-lived
// component. DefaultConfig sets it to 30s.
ShutdownTimeout time.Duration
// Logging configures the process-wide structured logger.
Logging LoggingConfig
// InternalHTTP configures the trusted internal HTTP listener that
// serves probes and the GM/Admin REST surface.
InternalHTTP InternalHTTPConfig
// Docker configures the Docker SDK client RTM uses to drive the local
// Docker daemon.
Docker DockerConfig
// Postgres configures the PostgreSQL-backed durable store consumed via
// `pkg/postgres`.
Postgres PostgresConfig
// Redis configures the shared Redis connection topology consumed via
// `pkg/redisconn`.
Redis RedisConfig
// Streams stores the stable Redis Stream names RTM reads from and
// writes to.
Streams StreamsConfig
// Container stores the per-container defaults applied at start time
// when the resolved image does not declare its own labels.
Container ContainerConfig
// Health configures the periodic health-monitoring workers (events
// listener, inspect, active probe).
Health HealthConfig
// Cleanup configures the reconciler and container-cleanup workers.
Cleanup CleanupConfig
// Coordination configures the per-game Redis lease used to serialise
// operations across all entry points.
Coordination CoordinationConfig
// Lobby configures the synchronous Lobby internal REST client used by
// the start service for ancillary lookups.
Lobby LobbyConfig
// Telemetry configures the process-wide OpenTelemetry runtime.
Telemetry TelemetryConfig
}
// LoggingConfig configures the process-wide structured logger.
// DefaultConfig sets Level to "info".
type LoggingConfig struct {
// Level stores the process log level accepted by log/slog.
// NOTE(review): no validation of Level is visible in this part of the
// file; confirm where unknown levels are rejected.
Level string
}
// InternalHTTPConfig configures the trusted internal HTTP listener.
// Validate requires a non-blank Addr accepted by isTCPAddr and strictly
// positive values for all four timeouts.
type InternalHTTPConfig struct {
// Addr stores the TCP listen address. DefaultConfig sets it to ":8096".
Addr string
// ReadHeaderTimeout bounds request-header reading. DefaultConfig sets
// it to 2s.
ReadHeaderTimeout time.Duration
// ReadTimeout bounds reading one request. DefaultConfig sets it to 5s.
ReadTimeout time.Duration
// WriteTimeout bounds writing one response. DefaultConfig sets it to
// 15s.
WriteTimeout time.Duration
// IdleTimeout bounds how long keep-alive connections stay open.
// DefaultConfig sets it to 60s.
IdleTimeout time.Duration
}
// Validate checks the internal HTTP listener settings and returns an error
// describing the first problem found, or nil when cfg is usable.
//
// Checks run in this order: Addr must be non-blank, Addr must satisfy
// isTCPAddr, then ReadHeaderTimeout, ReadTimeout, WriteTimeout and
// IdleTimeout must each be strictly positive.
func (cfg InternalHTTPConfig) Validate() error {
	if strings.TrimSpace(cfg.Addr) == "" {
		return fmt.Errorf("internal HTTP addr must not be empty")
	}
	if !isTCPAddr(cfg.Addr) {
		return fmt.Errorf("internal HTTP addr %q must use host:port form", cfg.Addr)
	}
	if cfg.ReadHeaderTimeout <= 0 {
		return fmt.Errorf("internal HTTP read header timeout must be positive")
	}
	if cfg.ReadTimeout <= 0 {
		return fmt.Errorf("internal HTTP read timeout must be positive")
	}
	if cfg.WriteTimeout <= 0 {
		return fmt.Errorf("internal HTTP write timeout must be positive")
	}
	if cfg.IdleTimeout <= 0 {
		return fmt.Errorf("internal HTTP idle timeout must be positive")
	}

	return nil
}
// DockerConfig configures the Docker SDK client.
// Validate requires non-blank Host, Network and LogDriver and a supported
// PullPolicy; it does not inspect APIVersion or LogOpts.
type DockerConfig struct {
// Host stores the Docker daemon endpoint (e.g.
// `unix:///var/run/docker.sock`, which is also the DefaultConfig value).
Host string
// APIVersion overrides the Docker API version. Empty lets the SDK
// negotiate. DefaultConfig leaves it empty.
APIVersion string
// Network stores the user-defined Docker bridge network containers
// attach to. Provisioned outside RTM; missing network is a fail-fast
// condition at startup. DefaultConfig sets it to "galaxy-net".
Network string
// LogDriver stores the Docker logging driver applied to engine
// containers. DefaultConfig sets it to "json-file".
LogDriver string
// LogOpts stores the comma-separated `key=value` driver options.
// DefaultConfig leaves it empty.
LogOpts string
// PullPolicy stores the configured image pull policy. DefaultConfig
// sets it to ImagePullPolicyIfMissing.
PullPolicy ImagePullPolicy
}
// Validate checks the Docker client settings. Host, Network and LogDriver
// must each be non-blank, checked in that order; the result of
// cfg.PullPolicy.Validate is returned once those pass. APIVersion and
// LogOpts are not inspected.
func (cfg DockerConfig) Validate() error {
	if strings.TrimSpace(cfg.Host) == "" {
		return fmt.Errorf("docker host must not be empty")
	}
	if strings.TrimSpace(cfg.Network) == "" {
		return fmt.Errorf("docker network must not be empty")
	}
	if strings.TrimSpace(cfg.LogDriver) == "" {
		return fmt.Errorf("docker log driver must not be empty")
	}

	return cfg.PullPolicy.Validate()
}
// PostgresConfig configures the PostgreSQL-backed durable store consumed
// via `pkg/postgres`. Validate delegates entirely to Conn.Validate.
type PostgresConfig struct {
// Conn carries the primary plus replica DSN topology and pool tuning.
// DefaultConfig initialises it from postgres.DefaultConfig.
Conn postgres.Config
}
// Validate returns whatever cfg.Conn.Validate reports for the wrapped
// PostgreSQL connection settings; this type adds no checks of its own.
func (cfg PostgresConfig) Validate() error {
	conn := cfg.Conn

	return conn.Validate()
}
// RedisConfig configures the Runtime Manager Redis connection topology.
// Validate delegates entirely to Conn.Validate.
type RedisConfig struct {
// Conn carries the connection topology (master, replicas, password,
// db, per-call timeout). DefaultConfig initialises it from
// redisconn.DefaultConfig.
Conn redisconn.Config
}
// Validate returns whatever cfg.Conn.Validate reports for the wrapped Redis
// connection settings; this type adds no checks of its own.
func (cfg RedisConfig) Validate() error {
	conn := cfg.Conn

	return conn.Validate()
}
// StreamsConfig stores the stable Redis Stream names used by Runtime
// Manager. Validate requires every stream name to be non-blank and
// BlockTimeout to be strictly positive.
type StreamsConfig struct {
// StartJobs stores the Redis Streams key Lobby writes start jobs to.
// DefaultConfig sets it to "runtime:start_jobs".
StartJobs string
// StopJobs stores the Redis Streams key Lobby writes stop jobs to.
// DefaultConfig sets it to "runtime:stop_jobs".
StopJobs string
// JobResults stores the Redis Streams key RTM writes job outcomes
// to. DefaultConfig sets it to "runtime:job_results".
JobResults string
// HealthEvents stores the Redis Streams key RTM publishes
// technical health events to. DefaultConfig sets it to
// "runtime:health_events".
HealthEvents string
// NotificationIntents stores the Redis Streams key RTM publishes
// admin-only notification intents to. DefaultConfig sets it to
// "notification:intents".
NotificationIntents string
// BlockTimeout bounds the maximum blocking read window for stream
// consumers. DefaultConfig sets it to 5s.
BlockTimeout time.Duration
}
// Validate checks the Redis Stream settings and returns an error for the
// first problem found, or nil when cfg is usable.
//
// StartJobs, StopJobs, JobResults, HealthEvents and NotificationIntents
// must each be non-blank, checked in that order; BlockTimeout must then be
// strictly positive.
func (cfg StreamsConfig) Validate() error {
	if strings.TrimSpace(cfg.StartJobs) == "" {
		return fmt.Errorf("redis start jobs stream must not be empty")
	}
	if strings.TrimSpace(cfg.StopJobs) == "" {
		return fmt.Errorf("redis stop jobs stream must not be empty")
	}
	if strings.TrimSpace(cfg.JobResults) == "" {
		return fmt.Errorf("redis job results stream must not be empty")
	}
	if strings.TrimSpace(cfg.HealthEvents) == "" {
		return fmt.Errorf("redis health events stream must not be empty")
	}
	if strings.TrimSpace(cfg.NotificationIntents) == "" {
		return fmt.Errorf("redis notification intents stream must not be empty")
	}
	if cfg.BlockTimeout <= 0 {
		return fmt.Errorf("redis stream block timeout must be positive")
	}

	return nil
}
// ContainerConfig stores the per-container defaults applied at start
// time. Resource defaults apply when the resolved engine image does not
// expose `com.galaxy.cpu_quota` / `com.galaxy.memory` /
// `com.galaxy.pids_limit` labels.
//
// DefaultConfig leaves GameStateOwnerUID, GameStateOwnerGID and
// GameStateRoot at their zero values; Validate rejects an empty
// GameStateRoot, so it must be supplied before validation can pass.
type ContainerConfig struct {
// DefaultCPUQuota is the fallback `--cpus` value applied when the
// image does not declare `com.galaxy.cpu_quota`. DefaultConfig sets it
// to 1.0.
DefaultCPUQuota float64
// DefaultMemory is the fallback `--memory` value applied when the
// image does not declare `com.galaxy.memory`. DefaultConfig sets it to
// "512m".
DefaultMemory string
// DefaultPIDsLimit is the fallback `--pids-limit` value applied
// when the image does not declare `com.galaxy.pids_limit`.
// DefaultConfig sets it to 512.
DefaultPIDsLimit int
// StopTimeout bounds graceful container stop before Docker fires
// SIGKILL. DefaultConfig sets it to 30s.
StopTimeout time.Duration
// Retention stores the TTL after which `status=stopped` containers
// are removed by the cleanup worker. DefaultConfig sets it to 30 days
// (720h).
Retention time.Duration
// EngineStateMountPath is the in-container path the per-game state
// directory is bind-mounted to. DefaultConfig sets it to
// "/var/lib/galaxy-game".
EngineStateMountPath string
// EngineStateEnvName is the env-var name forwarded to the engine
// pointing at EngineStateMountPath. DefaultConfig sets it to
// "GAME_STATE_PATH".
EngineStateEnvName string
// GameStateDirMode stores the unix permissions applied to the
// per-game state directory on creation. DefaultConfig sets it to
// 0o750; Validate rejects zero.
GameStateDirMode uint32
// GameStateOwnerUID stores the unix uid applied to the per-game
// state directory on creation. Validate does not check it.
GameStateOwnerUID int
// GameStateOwnerGID stores the unix gid applied to the per-game
// state directory on creation. Validate does not check it.
GameStateOwnerGID int
// GameStateRoot is the host path under which per-game state
// directories are created. Validate requires it to be non-blank and to
// start with "/" once surrounding whitespace is trimmed.
GameStateRoot string
}
// Validate reports whether cfg stores usable container defaults. It returns
// an error describing the first violated rule, or nil when every rule holds.
//
// Rules, in evaluation order:
//   - DefaultCPUQuota is a finite, strictly positive number;
//   - DefaultMemory is non-blank;
//   - DefaultPIDsLimit, StopTimeout and Retention are strictly positive;
//   - EngineStateMountPath is non-blank and absolute;
//   - EngineStateEnvName is non-blank;
//   - GameStateDirMode is non-zero;
//   - GameStateRoot is non-blank and absolute.
//
// GameStateOwnerUID and GameStateOwnerGID are not checked.
func (cfg ContainerConfig) Validate() error {
	switch {
	// NaN compares false against everything, so it would slip past the
	// `<= 0` check below, and +Inf is "positive" as well. Multiplying by
	// zero yields NaN for NaN and ±Inf and 0 for every finite value, which
	// rejects all non-finite quotas without importing math.
	case cfg.DefaultCPUQuota*0 != 0:
		return fmt.Errorf("default cpu quota must be a finite number")
	case cfg.DefaultCPUQuota <= 0:
		return fmt.Errorf("default cpu quota must be positive")
	case strings.TrimSpace(cfg.DefaultMemory) == "":
		return fmt.Errorf("default memory must not be empty")
	case cfg.DefaultPIDsLimit <= 0:
		return fmt.Errorf("default pids limit must be positive")
	case cfg.StopTimeout <= 0:
		return fmt.Errorf("container stop timeout must be positive")
	case cfg.Retention <= 0:
		return fmt.Errorf("container retention must be positive")
	case strings.TrimSpace(cfg.EngineStateMountPath) == "":
		return fmt.Errorf("engine state mount path must not be empty")
	// Docker rejects a relative container path as a bind-mount target, so
	// fail at configuration time instead of at the first container start.
	// Same rule and form as the GameStateRoot check below.
	case !strings.HasPrefix(strings.TrimSpace(cfg.EngineStateMountPath), "/"):
		return fmt.Errorf("engine state mount path %q must be an absolute path", cfg.EngineStateMountPath)
	case strings.TrimSpace(cfg.EngineStateEnvName) == "":
		return fmt.Errorf("engine state env name must not be empty")
	case cfg.GameStateDirMode == 0:
		return fmt.Errorf("game state dir mode must be non-zero")
	case strings.TrimSpace(cfg.GameStateRoot) == "":
		return fmt.Errorf("game state root must not be empty")
	case !strings.HasPrefix(strings.TrimSpace(cfg.GameStateRoot), "/"):
		return fmt.Errorf("game state root %q must be an absolute path", cfg.GameStateRoot)
	default:
		return nil
	}
}
// HealthConfig configures the periodic health-monitoring workers
// (Docker events listener, periodic inspect, active probe). Validate
// requires every field to be strictly positive; it does not compare
// ProbeTimeout against ProbeInterval.
type HealthConfig struct {
// InspectInterval is the period between two periodic Docker inspect
// passes. DefaultConfig sets it to 30s.
InspectInterval time.Duration
// ProbeInterval is the period between two engine `/healthz` probe
// rounds. DefaultConfig sets it to 15s.
ProbeInterval time.Duration
// ProbeTimeout bounds one engine `/healthz` request. DefaultConfig
// sets it to 2s.
ProbeTimeout time.Duration
// ProbeFailuresThreshold is the consecutive-failure count that
// triggers a `probe_failed` event. DefaultConfig sets it to 3.
ProbeFailuresThreshold int
}
// Validate checks the health-monitoring settings. InspectInterval,
// ProbeInterval, ProbeTimeout and ProbeFailuresThreshold must each be
// strictly positive; the error names the first field, in that order, that
// is not. It returns nil when all four pass.
func (cfg HealthConfig) Validate() error {
	if cfg.InspectInterval <= 0 {
		return fmt.Errorf("inspect interval must be positive")
	}
	if cfg.ProbeInterval <= 0 {
		return fmt.Errorf("probe interval must be positive")
	}
	if cfg.ProbeTimeout <= 0 {
		return fmt.Errorf("probe timeout must be positive")
	}
	if cfg.ProbeFailuresThreshold <= 0 {
		return fmt.Errorf("probe failures threshold must be positive")
	}

	return nil
}
// CleanupConfig configures the reconciler and container-cleanup workers.
// Validate requires both intervals to be strictly positive.
type CleanupConfig struct {
// ReconcileInterval is the period between two reconciler passes.
// DefaultConfig sets it to 5m.
ReconcileInterval time.Duration
// CleanupInterval is the period between two container-cleanup
// passes. DefaultConfig sets it to 1h.
CleanupInterval time.Duration
}
// Validate checks the cleanup settings. ReconcileInterval is checked first,
// then CleanupInterval; each must be strictly positive. It returns nil when
// both pass.
func (cfg CleanupConfig) Validate() error {
	if cfg.ReconcileInterval <= 0 {
		return fmt.Errorf("reconcile interval must be positive")
	}
	if cfg.CleanupInterval <= 0 {
		return fmt.Errorf("cleanup interval must be positive")
	}

	return nil
}
// CoordinationConfig configures the per-game Redis lease. Validate
// requires GameLeaseTTL to be strictly positive.
type CoordinationConfig struct {
// GameLeaseTTL bounds the per-game lease lifetime renewed every
// half-TTL while an operation runs. DefaultConfig sets it to 60s.
GameLeaseTTL time.Duration
}
// Validate returns nil when GameLeaseTTL is strictly positive and an error
// otherwise.
func (cfg CoordinationConfig) Validate() error {
	if ttl := cfg.GameLeaseTTL; ttl > 0 {
		return nil
	}

	return fmt.Errorf("game lease ttl must be positive")
}
// LobbyConfig configures the synchronous Lobby internal REST client.
// Validate requires a non-blank BaseURL accepted by isHTTPURL and a
// strictly positive Timeout.
type LobbyConfig struct {
// BaseURL stores the trusted Lobby internal listener base URL.
// DefaultConfig leaves it empty, so it must be supplied before
// Validate can succeed.
BaseURL string
// Timeout bounds one Lobby internal request. DefaultConfig sets it to
// 2s.
Timeout time.Duration
}
// Validate checks the Lobby client settings and returns an error for the
// first problem found, or nil when cfg is usable. BaseURL must be non-blank
// and satisfy isHTTPURL, and Timeout must be strictly positive; the checks
// run in that order.
func (cfg LobbyConfig) Validate() error {
	if strings.TrimSpace(cfg.BaseURL) == "" {
		return fmt.Errorf("lobby internal base url must not be empty")
	}
	if !isHTTPURL(cfg.BaseURL) {
		return fmt.Errorf("lobby internal base url %q must be an absolute http(s) URL", cfg.BaseURL)
	}
	if cfg.Timeout <= 0 {
		return fmt.Errorf("lobby internal timeout must be positive")
	}

	return nil
}
// TelemetryConfig configures the Runtime Manager OpenTelemetry runtime.
// Validate copies every field into a telemetry.ProcessConfig and returns
// that value's Validate result.
type TelemetryConfig struct {
// ServiceName overrides the default OpenTelemetry service name.
// DefaultConfig sets it to "galaxy-rtmanager".
ServiceName string
// TracesExporter selects the external traces exporter. Supported
// values are `none` and `otlp`. DefaultConfig sets it to "none".
TracesExporter string
// MetricsExporter selects the external metrics exporter. Supported
// values are `none` and `otlp`. DefaultConfig sets it to "none".
MetricsExporter string
// TracesProtocol selects the OTLP traces protocol when
// TracesExporter is `otlp`. DefaultConfig leaves it empty.
TracesProtocol string
// MetricsProtocol selects the OTLP metrics protocol when
// MetricsExporter is `otlp`. DefaultConfig leaves it empty.
MetricsProtocol string
// StdoutTracesEnabled enables the additional stdout trace exporter
// used for local development and debugging. DefaultConfig leaves it
// false.
StdoutTracesEnabled bool
// StdoutMetricsEnabled enables the additional stdout metric
// exporter used for local development and debugging. DefaultConfig
// leaves it false.
StdoutMetricsEnabled bool
}
// Validate copies every field of cfg into a telemetry.ProcessConfig and
// returns the result of that value's Validate method; the accepted values
// are therefore defined by the telemetry package, not here.
func (cfg TelemetryConfig) Validate() error {
	processCfg := telemetry.ProcessConfig{
		ServiceName:          cfg.ServiceName,
		TracesExporter:       cfg.TracesExporter,
		MetricsExporter:      cfg.MetricsExporter,
		TracesProtocol:       cfg.TracesProtocol,
		MetricsProtocol:      cfg.MetricsProtocol,
		StdoutTracesEnabled:  cfg.StdoutTracesEnabled,
		StdoutMetricsEnabled: cfg.StdoutMetricsEnabled,
	}

	return processCfg.Validate()
}
// DefaultConfig builds a Config populated with the package defaults.
//
// Postgres and Redis connection settings come from postgres.DefaultConfig
// and redisconn.DefaultConfig. Fields without a package default keep their
// zero value: Docker.APIVersion, Docker.LogOpts,
// Container.GameStateOwnerUID, Container.GameStateOwnerGID,
// Container.GameStateRoot, Lobby.BaseURL, and the Telemetry protocol and
// stdout fields. ContainerConfig.Validate and LobbyConfig.Validate reject
// an empty GameStateRoot and BaseURL, so the returned value does not pass
// those checks until both are supplied.
func DefaultConfig() Config {
	var cfg Config

	cfg.ShutdownTimeout = defaultShutdownTimeout
	cfg.Logging.Level = defaultLogLevel

	cfg.InternalHTTP.Addr = defaultInternalHTTPAddr
	cfg.InternalHTTP.ReadHeaderTimeout = defaultReadHeaderTimeout
	cfg.InternalHTTP.ReadTimeout = defaultReadTimeout
	cfg.InternalHTTP.WriteTimeout = defaultWriteTimeout
	cfg.InternalHTTP.IdleTimeout = defaultIdleTimeout

	cfg.Docker.Host = defaultDockerHost
	cfg.Docker.Network = defaultDockerNetwork
	cfg.Docker.LogDriver = defaultDockerLogDriver
	cfg.Docker.PullPolicy = defaultImagePullPolicy

	cfg.Postgres.Conn = postgres.DefaultConfig()
	cfg.Redis.Conn = redisconn.DefaultConfig()

	cfg.Streams.StartJobs = defaultStartJobsStream
	cfg.Streams.StopJobs = defaultStopJobsStream
	cfg.Streams.JobResults = defaultJobResultsStream
	cfg.Streams.HealthEvents = defaultHealthEventsStream
	cfg.Streams.NotificationIntents = defaultNotificationIntentsKey
	cfg.Streams.BlockTimeout = defaultStreamBlockTimeout

	cfg.Container.DefaultCPUQuota = defaultCPUQuota
	cfg.Container.DefaultMemory = defaultMemory
	cfg.Container.DefaultPIDsLimit = defaultPIDsLimit
	cfg.Container.StopTimeout = defaultContainerStopTimeout
	cfg.Container.Retention = defaultContainerRetention
	cfg.Container.EngineStateMountPath = defaultEngineStateMountPath
	cfg.Container.EngineStateEnvName = defaultEngineStateEnvName
	cfg.Container.GameStateDirMode = defaultGameStateDirMode

	cfg.Health.InspectInterval = defaultInspectInterval
	cfg.Health.ProbeInterval = defaultProbeInterval
	cfg.Health.ProbeTimeout = defaultProbeTimeout
	cfg.Health.ProbeFailuresThreshold = defaultProbeFailuresThreshold

	cfg.Cleanup.ReconcileInterval = defaultReconcileInterval
	cfg.Cleanup.CleanupInterval = defaultCleanupInterval

	cfg.Coordination.GameLeaseTTL = defaultGameLeaseTTL

	cfg.Lobby.Timeout = defaultLobbyInternalTimeout

	cfg.Telemetry.ServiceName = defaultOTelServiceName
	cfg.Telemetry.TracesExporter = "none"
	cfg.Telemetry.MetricsExporter = "none"

	return cfg
}