// Package config loads the Runtime Manager process configuration from // environment variables. package config import ( "fmt" "strings" "time" "galaxy/postgres" "galaxy/redisconn" "galaxy/rtmanager/internal/telemetry" ) const ( envPrefix = "RTMANAGER" shutdownTimeoutEnvVar = "RTMANAGER_SHUTDOWN_TIMEOUT" logLevelEnvVar = "RTMANAGER_LOG_LEVEL" internalHTTPAddrEnvVar = "RTMANAGER_INTERNAL_HTTP_ADDR" internalHTTPReadHeaderTimeoutEnvVar = "RTMANAGER_INTERNAL_HTTP_READ_HEADER_TIMEOUT" internalHTTPReadTimeoutEnvVar = "RTMANAGER_INTERNAL_HTTP_READ_TIMEOUT" internalHTTPWriteTimeoutEnvVar = "RTMANAGER_INTERNAL_HTTP_WRITE_TIMEOUT" internalHTTPIdleTimeoutEnvVar = "RTMANAGER_INTERNAL_HTTP_IDLE_TIMEOUT" dockerHostEnvVar = "RTMANAGER_DOCKER_HOST" dockerAPIVersionEnvVar = "RTMANAGER_DOCKER_API_VERSION" dockerNetworkEnvVar = "RTMANAGER_DOCKER_NETWORK" dockerLogDriverEnvVar = "RTMANAGER_DOCKER_LOG_DRIVER" dockerLogOptsEnvVar = "RTMANAGER_DOCKER_LOG_OPTS" imagePullPolicyEnvVar = "RTMANAGER_IMAGE_PULL_POLICY" defaultCPUQuotaEnvVar = "RTMANAGER_DEFAULT_CPU_QUOTA" defaultMemoryEnvVar = "RTMANAGER_DEFAULT_MEMORY" defaultPIDsLimitEnvVar = "RTMANAGER_DEFAULT_PIDS_LIMIT" containerStopTimeoutSecondsEnvVar = "RTMANAGER_CONTAINER_STOP_TIMEOUT_SECONDS" containerRetentionDaysEnvVar = "RTMANAGER_CONTAINER_RETENTION_DAYS" engineStateMountPathEnvVar = "RTMANAGER_ENGINE_STATE_MOUNT_PATH" engineStateEnvNameEnvVar = "RTMANAGER_ENGINE_STATE_ENV_NAME" gameStateDirModeEnvVar = "RTMANAGER_GAME_STATE_DIR_MODE" gameStateOwnerUIDEnvVar = "RTMANAGER_GAME_STATE_OWNER_UID" gameStateOwnerGIDEnvVar = "RTMANAGER_GAME_STATE_OWNER_GID" gameStateRootEnvVar = "RTMANAGER_GAME_STATE_ROOT" startJobsStreamEnvVar = "RTMANAGER_REDIS_START_JOBS_STREAM" stopJobsStreamEnvVar = "RTMANAGER_REDIS_STOP_JOBS_STREAM" jobResultsStreamEnvVar = "RTMANAGER_REDIS_JOB_RESULTS_STREAM" healthEventsStreamEnvVar = "RTMANAGER_REDIS_HEALTH_EVENTS_STREAM" notificationIntentsStreamEnv = "RTMANAGER_NOTIFICATION_INTENTS_STREAM" streamBlockTimeoutEnvVar = "RTMANAGER_STREAM_BLOCK_TIMEOUT" inspectIntervalEnvVar = "RTMANAGER_INSPECT_INTERVAL" probeIntervalEnvVar = "RTMANAGER_PROBE_INTERVAL" probeTimeoutEnvVar = "RTMANAGER_PROBE_TIMEOUT" probeFailuresThresholdEnvVar = "RTMANAGER_PROBE_FAILURES_THRESHOLD" reconcileIntervalEnvVar = "RTMANAGER_RECONCILE_INTERVAL" cleanupIntervalEnvVar = "RTMANAGER_CLEANUP_INTERVAL" gameLeaseTTLSecondsEnvVar = "RTMANAGER_GAME_LEASE_TTL_SECONDS" lobbyInternalBaseURLEnvVar = "RTMANAGER_LOBBY_INTERNAL_BASE_URL" lobbyInternalTimeoutEnvVar = "RTMANAGER_LOBBY_INTERNAL_TIMEOUT" otelServiceNameEnvVar = "OTEL_SERVICE_NAME" otelTracesExporterEnvVar = "OTEL_TRACES_EXPORTER" otelMetricsExporterEnvVar = "OTEL_METRICS_EXPORTER" otelExporterOTLPProtocolEnvVar = "OTEL_EXPORTER_OTLP_PROTOCOL" otelExporterOTLPTracesProtocolEnvVar = "OTEL_EXPORTER_OTLP_TRACES_PROTOCOL" otelExporterOTLPMetricsProtocolEnvVar = "OTEL_EXPORTER_OTLP_METRICS_PROTOCOL" otelStdoutTracesEnabledEnvVar = "RTMANAGER_OTEL_STDOUT_TRACES_ENABLED" otelStdoutMetricsEnabledEnvVar = "RTMANAGER_OTEL_STDOUT_METRICS_ENABLED" defaultShutdownTimeout = 30 * time.Second defaultLogLevel = "info" defaultInternalHTTPAddr = ":8096" defaultReadHeaderTimeout = 2 * time.Second defaultReadTimeout = 5 * time.Second defaultWriteTimeout = 15 * time.Second defaultIdleTimeout = 60 * time.Second defaultDockerHost = "unix:///var/run/docker.sock" defaultDockerNetwork = "galaxy-net" defaultDockerLogDriver = "json-file" defaultImagePullPolicy = ImagePullPolicyIfMissing defaultCPUQuota = 1.0 defaultMemory = "512m" defaultPIDsLimit = 512 defaultContainerStopTimeout = 30 * time.Second defaultContainerRetention = 30 * 24 * time.Hour defaultEngineStateMountPath = "/var/lib/galaxy-game" defaultEngineStateEnvName = "GAME_STATE_PATH" defaultGameStateDirMode = 0o750 defaultStartJobsStream = "runtime:start_jobs" defaultStopJobsStream = "runtime:stop_jobs" defaultJobResultsStream = "runtime:job_results" defaultHealthEventsStream = "runtime:health_events" defaultNotificationIntentsKey = "notification:intents" defaultStreamBlockTimeout = 5 * time.Second defaultInspectInterval = 30 * time.Second defaultProbeInterval = 15 * time.Second defaultProbeTimeout = 2 * time.Second defaultProbeFailuresThreshold = 3 defaultReconcileInterval = 5 * time.Minute defaultCleanupInterval = time.Hour defaultGameLeaseTTL = 60 * time.Second defaultLobbyInternalTimeout = 2 * time.Second defaultOTelServiceName = "galaxy-rtmanager" ) // ImagePullPolicy enumerates the supported image pull policies. The start // service validates a producer-supplied `image_ref` against this policy at // start time. type ImagePullPolicy string // Supported pull policies, frozen by `rtmanager/README.md` §Configuration. const ( ImagePullPolicyIfMissing ImagePullPolicy = "if_missing" ImagePullPolicyAlways ImagePullPolicy = "always" ImagePullPolicyNever ImagePullPolicy = "never" ) // Validate reports whether p is one of the frozen pull policies. func (p ImagePullPolicy) Validate() error { switch p { case ImagePullPolicyIfMissing, ImagePullPolicyAlways, ImagePullPolicyNever: return nil default: return fmt.Errorf("image pull policy %q must be one of %q, %q, %q", p, ImagePullPolicyIfMissing, ImagePullPolicyAlways, ImagePullPolicyNever) } } // Config stores the full Runtime Manager process configuration. type Config struct { // ShutdownTimeout bounds graceful shutdown of every long-lived // component. ShutdownTimeout time.Duration // Logging configures the process-wide structured logger. Logging LoggingConfig // InternalHTTP configures the trusted internal HTTP listener that // serves probes and the GM/Admin REST surface. InternalHTTP InternalHTTPConfig // Docker configures the Docker SDK client RTM uses to drive the local // Docker daemon. Docker DockerConfig // Postgres configures the PostgreSQL-backed durable store consumed via // `pkg/postgres`. Postgres PostgresConfig // Redis configures the shared Redis connection topology consumed via // `pkg/redisconn`. Redis RedisConfig // Streams stores the stable Redis Stream names RTM reads from and // writes to. Streams StreamsConfig // Container stores the per-container defaults applied at start time // when the resolved image does not declare its own labels. Container ContainerConfig // Health configures the periodic health-monitoring workers (events // listener, inspect, active probe). Health HealthConfig // Cleanup configures the reconciler and container-cleanup workers. Cleanup CleanupConfig // Coordination configures the per-game Redis lease used to serialise // operations across all entry points. Coordination CoordinationConfig // Lobby configures the synchronous Lobby internal REST client used by // the start service for ancillary lookups. Lobby LobbyConfig // Telemetry configures the process-wide OpenTelemetry runtime. Telemetry TelemetryConfig } // LoggingConfig configures the process-wide structured logger. type LoggingConfig struct { // Level stores the process log level accepted by log/slog. Level string } // InternalHTTPConfig configures the trusted internal HTTP listener. type InternalHTTPConfig struct { // Addr stores the TCP listen address. Addr string // ReadHeaderTimeout bounds request-header reading. ReadHeaderTimeout time.Duration // ReadTimeout bounds reading one request. ReadTimeout time.Duration // WriteTimeout bounds writing one response. WriteTimeout time.Duration // IdleTimeout bounds how long keep-alive connections stay open. IdleTimeout time.Duration } // Validate reports whether cfg stores a usable internal HTTP listener // configuration. func (cfg InternalHTTPConfig) Validate() error { switch { case strings.TrimSpace(cfg.Addr) == "": return fmt.Errorf("internal HTTP addr must not be empty") case !isTCPAddr(cfg.Addr): return fmt.Errorf("internal HTTP addr %q must use host:port form", cfg.Addr) case cfg.ReadHeaderTimeout <= 0: return fmt.Errorf("internal HTTP read header timeout must be positive") case cfg.ReadTimeout <= 0: return fmt.Errorf("internal HTTP read timeout must be positive") case cfg.WriteTimeout <= 0: return fmt.Errorf("internal HTTP write timeout must be positive") case cfg.IdleTimeout <= 0: return fmt.Errorf("internal HTTP idle timeout must be positive") default: return nil } } // DockerConfig configures the Docker SDK client. type DockerConfig struct { // Host stores the Docker daemon endpoint (e.g. // `unix:///var/run/docker.sock`). Host string // APIVersion overrides the Docker API version. Empty lets the SDK // negotiate. APIVersion string // Network stores the user-defined Docker bridge network containers // attach to. Provisioned outside RTM; missing network is a fail-fast // condition at startup. Network string // LogDriver stores the Docker logging driver applied to engine // containers. LogDriver string // LogOpts stores the comma-separated `key=value` driver options. LogOpts string // PullPolicy stores the configured image pull policy. PullPolicy ImagePullPolicy } // Validate reports whether cfg stores a usable Docker configuration. func (cfg DockerConfig) Validate() error { switch { case strings.TrimSpace(cfg.Host) == "": return fmt.Errorf("docker host must not be empty") case strings.TrimSpace(cfg.Network) == "": return fmt.Errorf("docker network must not be empty") case strings.TrimSpace(cfg.LogDriver) == "": return fmt.Errorf("docker log driver must not be empty") } return cfg.PullPolicy.Validate() } // PostgresConfig configures the PostgreSQL-backed durable store consumed // via `pkg/postgres`. type PostgresConfig struct { // Conn carries the primary plus replica DSN topology and pool tuning. Conn postgres.Config } // Validate reports whether cfg stores a usable PostgreSQL configuration. func (cfg PostgresConfig) Validate() error { return cfg.Conn.Validate() } // RedisConfig configures the Runtime Manager Redis connection topology. type RedisConfig struct { // Conn carries the connection topology (master, replicas, password, // db, per-call timeout). Conn redisconn.Config } // Validate reports whether cfg stores a usable Redis configuration. func (cfg RedisConfig) Validate() error { return cfg.Conn.Validate() } // StreamsConfig stores the stable Redis Stream names used by Runtime // Manager. type StreamsConfig struct { // StartJobs stores the Redis Streams key Lobby writes start jobs to. StartJobs string // StopJobs stores the Redis Streams key Lobby writes stop jobs to. StopJobs string // JobResults stores the Redis Streams key RTM writes job outcomes // to. JobResults string // HealthEvents stores the Redis Streams key RTM publishes // technical health events to. HealthEvents string // NotificationIntents stores the Redis Streams key RTM publishes // admin-only notification intents to. NotificationIntents string // BlockTimeout bounds the maximum blocking read window for stream // consumers. BlockTimeout time.Duration } // Validate reports whether cfg stores usable stream names. func (cfg StreamsConfig) Validate() error { switch { case strings.TrimSpace(cfg.StartJobs) == "": return fmt.Errorf("redis start jobs stream must not be empty") case strings.TrimSpace(cfg.StopJobs) == "": return fmt.Errorf("redis stop jobs stream must not be empty") case strings.TrimSpace(cfg.JobResults) == "": return fmt.Errorf("redis job results stream must not be empty") case strings.TrimSpace(cfg.HealthEvents) == "": return fmt.Errorf("redis health events stream must not be empty") case strings.TrimSpace(cfg.NotificationIntents) == "": return fmt.Errorf("redis notification intents stream must not be empty") case cfg.BlockTimeout <= 0: return fmt.Errorf("redis stream block timeout must be positive") default: return nil } } // ContainerConfig stores the per-container defaults applied at start // time. Resource defaults apply when the resolved engine image does not // expose `com.galaxy.cpu_quota` / `com.galaxy.memory` / // `com.galaxy.pids_limit` labels. type ContainerConfig struct { // DefaultCPUQuota is the fallback `--cpus` value applied when the // image does not declare `com.galaxy.cpu_quota`. DefaultCPUQuota float64 // DefaultMemory is the fallback `--memory` value applied when the // image does not declare `com.galaxy.memory`. DefaultMemory string // DefaultPIDsLimit is the fallback `--pids-limit` value applied // when the image does not declare `com.galaxy.pids_limit`. DefaultPIDsLimit int // StopTimeout bounds graceful container stop before Docker fires // SIGKILL. StopTimeout time.Duration // Retention stores the TTL after which `status=stopped` containers // are removed by the cleanup worker. Retention time.Duration // EngineStateMountPath is the in-container path the per-game state // directory is bind-mounted to. EngineStateMountPath string // EngineStateEnvName is the env-var name forwarded to the engine // pointing at EngineStateMountPath. EngineStateEnvName string // GameStateDirMode stores the unix permissions applied to the // per-game state directory on creation. GameStateDirMode uint32 // GameStateOwnerUID stores the unix uid applied to the per-game // state directory on creation. GameStateOwnerUID int // GameStateOwnerGID stores the unix gid applied to the per-game // state directory on creation. GameStateOwnerGID int // GameStateRoot is the host path under which per-game state // directories are created. GameStateRoot string } // Validate reports whether cfg stores usable container defaults. func (cfg ContainerConfig) Validate() error { switch { case cfg.DefaultCPUQuota <= 0: return fmt.Errorf("default cpu quota must be positive") case strings.TrimSpace(cfg.DefaultMemory) == "": return fmt.Errorf("default memory must not be empty") case cfg.DefaultPIDsLimit <= 0: return fmt.Errorf("default pids limit must be positive") case cfg.StopTimeout <= 0: return fmt.Errorf("container stop timeout must be positive") case cfg.Retention <= 0: return fmt.Errorf("container retention must be positive") case strings.TrimSpace(cfg.EngineStateMountPath) == "": return fmt.Errorf("engine state mount path must not be empty") case strings.TrimSpace(cfg.EngineStateEnvName) == "": return fmt.Errorf("engine state env name must not be empty") case cfg.GameStateDirMode == 0: return fmt.Errorf("game state dir mode must be non-zero") case strings.TrimSpace(cfg.GameStateRoot) == "": return fmt.Errorf("game state root must not be empty") case !strings.HasPrefix(strings.TrimSpace(cfg.GameStateRoot), "/"): return fmt.Errorf("game state root %q must be an absolute path", cfg.GameStateRoot) default: return nil } } // HealthConfig configures the periodic health-monitoring workers // (Docker events listener, periodic inspect, active probe). type HealthConfig struct { // InspectInterval is the period between two periodic Docker inspect // passes. InspectInterval time.Duration // ProbeInterval is the period between two engine `/healthz` probe // rounds. ProbeInterval time.Duration // ProbeTimeout bounds one engine `/healthz` request. ProbeTimeout time.Duration // ProbeFailuresThreshold is the consecutive-failure count that // triggers a `probe_failed` event. ProbeFailuresThreshold int } // Validate reports whether cfg stores usable health-monitoring settings. func (cfg HealthConfig) Validate() error { switch { case cfg.InspectInterval <= 0: return fmt.Errorf("inspect interval must be positive") case cfg.ProbeInterval <= 0: return fmt.Errorf("probe interval must be positive") case cfg.ProbeTimeout <= 0: return fmt.Errorf("probe timeout must be positive") case cfg.ProbeFailuresThreshold <= 0: return fmt.Errorf("probe failures threshold must be positive") default: return nil } } // CleanupConfig configures the reconciler and container-cleanup workers. type CleanupConfig struct { // ReconcileInterval is the period between two reconciler passes. ReconcileInterval time.Duration // CleanupInterval is the period between two container-cleanup // passes. CleanupInterval time.Duration } // Validate reports whether cfg stores usable cleanup settings. func (cfg CleanupConfig) Validate() error { switch { case cfg.ReconcileInterval <= 0: return fmt.Errorf("reconcile interval must be positive") case cfg.CleanupInterval <= 0: return fmt.Errorf("cleanup interval must be positive") default: return nil } } // CoordinationConfig configures the per-game Redis lease. type CoordinationConfig struct { // GameLeaseTTL bounds the per-game lease lifetime renewed every // half-TTL while an operation runs. GameLeaseTTL time.Duration } // Validate reports whether cfg stores a usable lease configuration. func (cfg CoordinationConfig) Validate() error { if cfg.GameLeaseTTL <= 0 { return fmt.Errorf("game lease ttl must be positive") } return nil } // LobbyConfig configures the synchronous Lobby internal REST client. type LobbyConfig struct { // BaseURL stores the trusted Lobby internal listener base URL. BaseURL string // Timeout bounds one Lobby internal request. Timeout time.Duration } // Validate reports whether cfg stores a usable Lobby client // configuration. func (cfg LobbyConfig) Validate() error { switch { case strings.TrimSpace(cfg.BaseURL) == "": return fmt.Errorf("lobby internal base url must not be empty") case !isHTTPURL(cfg.BaseURL): return fmt.Errorf("lobby internal base url %q must be an absolute http(s) URL", cfg.BaseURL) case cfg.Timeout <= 0: return fmt.Errorf("lobby internal timeout must be positive") default: return nil } } // TelemetryConfig configures the Runtime Manager OpenTelemetry runtime. type TelemetryConfig struct { // ServiceName overrides the default OpenTelemetry service name. ServiceName string // TracesExporter selects the external traces exporter. Supported // values are `none` and `otlp`. TracesExporter string // MetricsExporter selects the external metrics exporter. Supported // values are `none` and `otlp`. MetricsExporter string // TracesProtocol selects the OTLP traces protocol when // TracesExporter is `otlp`. TracesProtocol string // MetricsProtocol selects the OTLP metrics protocol when // MetricsExporter is `otlp`. MetricsProtocol string // StdoutTracesEnabled enables the additional stdout trace exporter // used for local development and debugging. StdoutTracesEnabled bool // StdoutMetricsEnabled enables the additional stdout metric // exporter used for local development and debugging. StdoutMetricsEnabled bool } // Validate reports whether cfg contains a supported OpenTelemetry // configuration. func (cfg TelemetryConfig) Validate() error { return telemetry.ProcessConfig{ ServiceName: cfg.ServiceName, TracesExporter: cfg.TracesExporter, MetricsExporter: cfg.MetricsExporter, TracesProtocol: cfg.TracesProtocol, MetricsProtocol: cfg.MetricsProtocol, StdoutTracesEnabled: cfg.StdoutTracesEnabled, StdoutMetricsEnabled: cfg.StdoutMetricsEnabled, }.Validate() } // DefaultConfig returns the default Runtime Manager process configuration. func DefaultConfig() Config { return Config{ ShutdownTimeout: defaultShutdownTimeout, Logging: LoggingConfig{ Level: defaultLogLevel, }, InternalHTTP: InternalHTTPConfig{ Addr: defaultInternalHTTPAddr, ReadHeaderTimeout: defaultReadHeaderTimeout, ReadTimeout: defaultReadTimeout, WriteTimeout: defaultWriteTimeout, IdleTimeout: defaultIdleTimeout, }, Docker: DockerConfig{ Host: defaultDockerHost, Network: defaultDockerNetwork, LogDriver: defaultDockerLogDriver, PullPolicy: defaultImagePullPolicy, }, Postgres: PostgresConfig{ Conn: postgres.DefaultConfig(), }, Redis: RedisConfig{ Conn: redisconn.DefaultConfig(), }, Streams: StreamsConfig{ StartJobs: defaultStartJobsStream, StopJobs: defaultStopJobsStream, JobResults: defaultJobResultsStream, HealthEvents: defaultHealthEventsStream, NotificationIntents: defaultNotificationIntentsKey, BlockTimeout: defaultStreamBlockTimeout, }, Container: ContainerConfig{ DefaultCPUQuota: defaultCPUQuota, DefaultMemory: defaultMemory, DefaultPIDsLimit: defaultPIDsLimit, StopTimeout: defaultContainerStopTimeout, Retention: defaultContainerRetention, EngineStateMountPath: defaultEngineStateMountPath, EngineStateEnvName: defaultEngineStateEnvName, GameStateDirMode: defaultGameStateDirMode, }, Health: HealthConfig{ InspectInterval: defaultInspectInterval, ProbeInterval: defaultProbeInterval, ProbeTimeout: defaultProbeTimeout, ProbeFailuresThreshold: defaultProbeFailuresThreshold, }, Cleanup: CleanupConfig{ ReconcileInterval: defaultReconcileInterval, CleanupInterval: defaultCleanupInterval, }, Coordination: CoordinationConfig{ GameLeaseTTL: defaultGameLeaseTTL, }, Lobby: LobbyConfig{ Timeout: defaultLobbyInternalTimeout, }, Telemetry: TelemetryConfig{ ServiceName: defaultOTelServiceName, TracesExporter: "none", MetricsExporter: "none", }, } }