feat: runtime manager

Ilia Denisov
2026-04-28 20:39:18 +02:00
committed by GitHub
parent e0a99b346b
commit a7cee15115
289 changed files with 45660 additions and 2207 deletions
@@ -0,0 +1,493 @@
// Package docker provides the production Docker SDK adapter that
// implements `galaxy/rtmanager/internal/ports.DockerClient`. The
// adapter is the single component allowed to talk to the local Docker
// daemon; every Runtime Manager service path that needs container
// lifecycle operations goes through this surface.
//
// The adapter is intentionally narrow — it does not orchestrate, log,
// or retry. Cross-cutting concerns (lease coordination, durable state,
// notification side-effects) live in the service layer.
package docker
import (
"context"
"errors"
"fmt"
"io"
"maps"
"strings"
"sync"
"time"
cerrdefs "github.com/containerd/errdefs"
"github.com/docker/docker/api/types/container"
"github.com/docker/docker/api/types/events"
"github.com/docker/docker/api/types/filters"
"github.com/docker/docker/api/types/image"
"github.com/docker/docker/api/types/network"
dockerclient "github.com/docker/docker/client"
"github.com/docker/go-units"
"galaxy/rtmanager/internal/ports"
)
// EnginePort is the in-container HTTP port the engine listens on. The
// value is fixed by `rtmanager/README.md §Container Model` and by the
// engine's Dockerfile (`game/Dockerfile`); RTM never publishes the port
// to the host. Keeping the constant here lets the adapter own the URL
// shape so the start service does not have to know it.
const EnginePort = 8080
// Config groups the dependencies and per-process defaults required to
// construct a Client. The struct is value-typed so wiring code can
// build it inline without intermediate variables.
type Config struct {
// Docker stores the SDK client this adapter wraps. It must be
// non-nil; callers typically construct it via `client.NewClientWithOpts`.
Docker *dockerclient.Client
// LogDriver stores the Docker logging driver applied to every
// container the adapter creates (e.g. `json-file`).
LogDriver string
// LogOpts stores the comma-separated `key=value` driver options
// forwarded to Docker. Empty disables driver-specific options.
LogOpts string
// Clock supplies the wall-clock used for `RunResult.StartedAt`.
// Defaults to `time.Now` when nil.
Clock func() time.Time
}
// Client is the production adapter implementing `ports.DockerClient`.
// Construct it via NewClient; do not zero-initialise.
type Client struct {
docker *dockerclient.Client
logDriver string
logOpts string
clock func() time.Time
}
// NewClient constructs a Client from cfg. It returns an error if cfg
// does not carry the minimum collaborator set the adapter needs to
// function.
func NewClient(cfg Config) (*Client, error) {
if cfg.Docker == nil {
return nil, errors.New("new docker adapter: nil docker client")
}
if strings.TrimSpace(cfg.LogDriver) == "" {
return nil, errors.New("new docker adapter: log driver must not be empty")
}
clock := cfg.Clock
if clock == nil {
clock = time.Now
}
return &Client{
docker: cfg.Docker,
logDriver: cfg.LogDriver,
logOpts: cfg.LogOpts,
clock: clock,
}, nil
}
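// A minimal wiring sketch (illustrative only, not part of the adapter).
// The SDK options shown are assumptions about the caller, not
// requirements of this package:
//
//	sdk, err := dockerclient.NewClientWithOpts(
//		dockerclient.FromEnv,
//		dockerclient.WithAPIVersionNegotiation(),
//	)
//	if err != nil {
//		return err
//	}
//	adapter, err := NewClient(Config{
//		Docker:    sdk,
//		LogDriver: "json-file",
//		LogOpts:   "max-size=1m,max-file=3",
//	})
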
// EnsureNetwork verifies the user-defined Docker network is present.
// The adapter never creates networks; provisioning is the operator's
// job per `rtmanager/README.md §Container Model`.
func (client *Client) EnsureNetwork(ctx context.Context, name string) error {
if _, err := client.docker.NetworkInspect(ctx, name, network.InspectOptions{}); err != nil {
if cerrdefs.IsNotFound(err) {
return ports.ErrNetworkMissing
}
return fmt.Errorf("ensure network %q: %w", name, err)
}
return nil
}
// PullImage pulls ref according to policy. The pull stream is drained
// to completion because the Docker SDK only finishes the underlying
// pull when the body is consumed.
func (client *Client) PullImage(ctx context.Context, ref string, policy ports.PullPolicy) error {
if !policy.IsKnown() {
return fmt.Errorf("pull image %q: unknown pull policy %q", ref, policy)
}
switch policy {
case ports.PullPolicyAlways:
return client.runPull(ctx, ref)
case ports.PullPolicyIfMissing:
if present, err := client.imagePresent(ctx, ref); err != nil {
return err
} else if present {
return nil
}
return client.runPull(ctx, ref)
case ports.PullPolicyNever:
present, err := client.imagePresent(ctx, ref)
if err != nil {
return err
}
if !present {
return ports.ErrImageNotFound
}
return nil
default:
return fmt.Errorf("pull image %q: unsupported pull policy %q", ref, policy)
}
}
// InspectImage returns image metadata for ref. RTM only reads labels
// at start time, so the port type carries just the ref and labels; the
// broader SDK inspect stays available inside the adapter for
// diagnostics.
func (client *Client) InspectImage(ctx context.Context, ref string) (ports.ImageInspect, error) {
inspect, err := client.docker.ImageInspect(ctx, ref)
if err != nil {
if cerrdefs.IsNotFound(err) {
return ports.ImageInspect{}, ports.ErrImageNotFound
}
return ports.ImageInspect{}, fmt.Errorf("inspect image %q: %w", ref, err)
}
var labels map[string]string
if inspect.Config != nil {
labels = copyStringMap(inspect.Config.Labels)
}
return ports.ImageInspect{Ref: ref, Labels: labels}, nil
}
// InspectContainer returns container metadata for containerID. Docker
// timestamps are decoded on a best-effort basis: malformed values map
// to the zero time, and nil sub-structs in the SDK response are
// tolerated so callers never have to defend against them.
func (client *Client) InspectContainer(ctx context.Context, containerID string) (ports.ContainerInspect, error) {
inspect, err := client.docker.ContainerInspect(ctx, containerID)
if err != nil {
if cerrdefs.IsNotFound(err) {
return ports.ContainerInspect{}, ports.ErrContainerNotFound
}
return ports.ContainerInspect{}, fmt.Errorf("inspect container %q: %w", containerID, err)
}
result := ports.ContainerInspect{ID: inspect.ID}
if inspect.ContainerJSONBase != nil {
result.RestartCount = inspect.RestartCount
if inspect.State != nil {
result.Status = string(inspect.State.Status)
result.OOMKilled = inspect.State.OOMKilled
result.ExitCode = inspect.State.ExitCode
result.StartedAt = parseDockerTime(inspect.State.StartedAt)
result.FinishedAt = parseDockerTime(inspect.State.FinishedAt)
if inspect.State.Health != nil {
result.Health = string(inspect.State.Health.Status)
}
}
}
if inspect.Config != nil {
result.ImageRef = inspect.Config.Image
result.Hostname = inspect.Config.Hostname
result.Labels = copyStringMap(inspect.Config.Labels)
}
return result, nil
}
// Run creates and starts one container according to spec. On
// `ContainerStart` failure the adapter removes the partial container on
// a best-effort basis so the start service never has to clean up after
// a failed start.
func (client *Client) Run(ctx context.Context, spec ports.RunSpec) (ports.RunResult, error) {
if err := spec.Validate(); err != nil {
return ports.RunResult{}, fmt.Errorf("run container: %w", err)
}
memoryBytes, err := units.RAMInBytes(spec.Memory)
if err != nil {
return ports.RunResult{}, fmt.Errorf("run container %q: parse memory %q: %w", spec.Name, spec.Memory, err)
}
pidsLimit := int64(spec.PIDsLimit)
containerCfg := &container.Config{
Image: spec.Image,
Hostname: spec.Hostname,
Env: envMapToSlice(spec.Env),
Labels: copyStringMap(spec.Labels),
Cmd: append([]string(nil), spec.Cmd...),
}
hostCfg := &container.HostConfig{
Binds: bindMountsToBinds(spec.BindMounts),
LogConfig: container.LogConfig{
Type: client.logDriver,
Config: parseLogOpts(client.logOpts),
},
Resources: container.Resources{
NanoCPUs: int64(spec.CPUQuota * 1e9),
Memory: memoryBytes,
PidsLimit: &pidsLimit,
},
}
netCfg := &network.NetworkingConfig{
EndpointsConfig: map[string]*network.EndpointSettings{
spec.Network: {
Aliases: []string{spec.Hostname},
},
},
}
created, err := client.docker.ContainerCreate(ctx, containerCfg, hostCfg, netCfg, nil, spec.Name)
if err != nil {
return ports.RunResult{}, fmt.Errorf("create container %q: %w", spec.Name, err)
}
if err := client.docker.ContainerStart(ctx, created.ID, container.StartOptions{}); err != nil {
client.cleanupAfterFailedStart(created.ID)
return ports.RunResult{}, fmt.Errorf("start container %q: %w", spec.Name, err)
}
return ports.RunResult{
ContainerID: created.ID,
EngineEndpoint: fmt.Sprintf("http://%s:%d", spec.Hostname, EnginePort),
StartedAt: client.clock(),
}, nil
}
// Stop bounds graceful shutdown by timeout. A missing container is
// surfaced as ErrContainerNotFound so the service layer can treat it
// as already-stopped per `rtmanager/README.md §Lifecycles → Stop`.
func (client *Client) Stop(ctx context.Context, containerID string, timeout time.Duration) error {
seconds := max(int(timeout.Round(time.Second).Seconds()), 0)
if err := client.docker.ContainerStop(ctx, containerID, container.StopOptions{Timeout: &seconds}); err != nil {
if cerrdefs.IsNotFound(err) {
return ports.ErrContainerNotFound
}
return fmt.Errorf("stop container %q: %w", containerID, err)
}
return nil
}
// Remove removes the container without forcing a kill. A missing
// container is reported as success so callers can treat the operation
// as idempotent.
func (client *Client) Remove(ctx context.Context, containerID string) error {
if err := client.docker.ContainerRemove(ctx, containerID, container.RemoveOptions{}); err != nil {
if cerrdefs.IsNotFound(err) {
return nil
}
return fmt.Errorf("remove container %q: %w", containerID, err)
}
return nil
}
// List returns container summaries that match filter. Empty Labels
// match every container; the reconciler always passes
// `com.galaxy.owner=rtmanager`.
func (client *Client) List(ctx context.Context, filter ports.ListFilter) ([]ports.ContainerSummary, error) {
args := filters.NewArgs()
for key, value := range filter.Labels {
args.Add("label", key+"="+value)
}
summaries, err := client.docker.ContainerList(ctx, container.ListOptions{All: true, Filters: args})
if err != nil {
return nil, fmt.Errorf("list containers: %w", err)
}
out := make([]ports.ContainerSummary, 0, len(summaries))
for _, summary := range summaries {
hostname := ""
if len(summary.Names) > 0 {
hostname = strings.TrimPrefix(summary.Names[0], "/")
}
out = append(out, ports.ContainerSummary{
ID: summary.ID,
ImageRef: summary.Image,
Hostname: hostname,
Labels: copyStringMap(summary.Labels),
Status: string(summary.State),
StartedAt: time.Unix(summary.Created, 0).UTC(),
})
}
return out, nil
}
// EventsListen subscribes to the Docker events stream and returns a
// typed channel of decoded container events plus an asynchronous
// error channel. The caller cancels ctx to terminate the subscription;
// the goroutine closes both channels on termination.
func (client *Client) EventsListen(ctx context.Context) (<-chan ports.DockerEvent, <-chan error, error) {
msgs, sdkErrs := client.docker.Events(ctx, events.ListOptions{})
out := make(chan ports.DockerEvent)
outErrs := make(chan error, 1)
var closeOnce sync.Once
closeAll := func() {
closeOnce.Do(func() {
close(out)
close(outErrs)
})
}
go func() {
defer closeAll()
for {
select {
case <-ctx.Done():
return
case msg, ok := <-msgs:
if !ok {
return
}
if msg.Type != events.ContainerEventType {
continue
}
select {
case <-ctx.Done():
return
case out <- decodeEvent(msg):
}
case err, ok := <-sdkErrs:
if !ok {
return
}
if err == nil {
continue
}
select {
case <-ctx.Done():
case outErrs <- err:
}
return
}
}
}()
return out, outErrs, nil
}
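// A drain sketch for EventsListen callers (illustrative; `handle` is a
// placeholder, and the production consumer lives in the events-listener
// worker). Both channels close together once ctx is cancelled, and the
// error channel is buffered, so reading it after the range is safe:
//
//	events, errs, err := client.EventsListen(ctx)
//	if err != nil {
//		return err
//	}
//	for ev := range events {
//		handle(ev)
//	}
//	if err := <-errs; err != nil {
//		return err
//	}

// cleanupAfterFailedStart force-removes a container whose ContainerStart
// call failed. It uses a fresh, bounded context because the caller's ctx
// may already be cancelled; the removal error is deliberately ignored
// (best effort).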
func (client *Client) cleanupAfterFailedStart(containerID string) {
cleanupCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
_ = client.docker.ContainerRemove(cleanupCtx, containerID, container.RemoveOptions{Force: true})
}
func (client *Client) imagePresent(ctx context.Context, ref string) (bool, error) {
if _, err := client.docker.ImageInspect(ctx, ref); err != nil {
if cerrdefs.IsNotFound(err) {
return false, nil
}
return false, fmt.Errorf("inspect image %q: %w", ref, err)
}
return true, nil
}
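// runPull performs the actual ImagePull and drains the JSON progress
// stream so the pull has completed by the time the call returns.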
func (client *Client) runPull(ctx context.Context, ref string) error {
body, err := client.docker.ImagePull(ctx, ref, image.PullOptions{})
if err != nil {
if cerrdefs.IsNotFound(err) {
return ports.ErrImageNotFound
}
return fmt.Errorf("pull image %q: %w", ref, err)
}
defer body.Close()
if _, err := io.Copy(io.Discard, body); err != nil {
return fmt.Errorf("drain pull stream for %q: %w", ref, err)
}
return nil
}
func envMapToSlice(envMap map[string]string) []string {
if len(envMap) == 0 {
return nil
}
out := make([]string, 0, len(envMap))
for key, value := range envMap {
out = append(out, key+"="+value)
}
return out
}
func bindMountsToBinds(mounts []ports.BindMount) []string {
if len(mounts) == 0 {
return nil
}
binds := make([]string, 0, len(mounts))
for _, mount := range mounts {
bind := mount.HostPath + ":" + mount.MountPath
if mount.ReadOnly {
bind += ":ro"
}
binds = append(binds, bind)
}
return binds
}
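// parseLogOpts splits the comma-separated `key=value` log-driver options
// into a map. Blank entries and entries without a key are skipped; nil
// is returned when nothing usable remains.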
func parseLogOpts(raw string) map[string]string {
if strings.TrimSpace(raw) == "" {
return nil
}
out := make(map[string]string)
for part := range strings.SplitSeq(raw, ",") {
entry := strings.TrimSpace(part)
if entry == "" {
continue
}
index := strings.IndexByte(entry, '=')
if index <= 0 {
continue
}
out[entry[:index]] = entry[index+1:]
}
if len(out) == 0 {
return nil
}
return out
}
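// parseDockerTime decodes an RFC3339Nano timestamp from the Docker API.
// Empty or malformed values map to the zero time.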
func parseDockerTime(raw string) time.Time {
if raw == "" {
return time.Time{}
}
parsed, err := time.Parse(time.RFC3339Nano, raw)
if err != nil {
return time.Time{}
}
return parsed.UTC()
}
func copyStringMap(in map[string]string) map[string]string {
if in == nil {
return nil
}
out := make(map[string]string, len(in))
maps.Copy(out, in)
return out
}
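// decodeEvent converts one SDK events.Message into the port-level
// DockerEvent, preferring the nanosecond timestamp and decoding the
// `exitCode` actor attribute when present.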
func decodeEvent(msg events.Message) ports.DockerEvent {
occurredAt := time.Time{}
switch {
case msg.TimeNano != 0:
occurredAt = time.Unix(0, msg.TimeNano).UTC()
case msg.Time != 0:
occurredAt = time.Unix(msg.Time, 0).UTC()
}
exitCode := 0
if raw, ok := msg.Actor.Attributes["exitCode"]; ok {
if value, err := parseExitCode(raw); err == nil {
exitCode = value
}
}
return ports.DockerEvent{
Action: string(msg.Action),
ContainerID: msg.Actor.ID,
Labels: copyStringMap(msg.Actor.Attributes),
ExitCode: exitCode,
OccurredAt: occurredAt,
}
}
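// parseExitCode parses a non-negative decimal exit code; any non-digit
// character is an error.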
func parseExitCode(raw string) (int, error) {
value := 0
for _, r := range raw {
if r < '0' || r > '9' {
return 0, fmt.Errorf("non-numeric exit code %q", raw)
}
value = value*10 + int(r-'0')
}
return value, nil
}
// Compile-time assertion: Client implements ports.DockerClient.
var _ ports.DockerClient = (*Client)(nil)
@@ -0,0 +1,561 @@
package docker
import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"net/http/httptest"
"net/url"
"strings"
"sync/atomic"
"testing"
"time"
dockerclient "github.com/docker/docker/client"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"galaxy/rtmanager/internal/ports"
)
// newTestClient wires an httptest.Server-backed Docker SDK client to our
// adapter. The handler is invoked for every Docker API request issued
// during the test; tests assert on path and method to route the
// response.
func newTestClient(t *testing.T, handler http.HandlerFunc) *Client {
t.Helper()
server := httptest.NewServer(handler)
t.Cleanup(server.Close)
docker, err := dockerclient.NewClientWithOpts(
dockerclient.WithHost(server.URL),
dockerclient.WithHTTPClient(server.Client()),
dockerclient.WithVersion("1.45"),
)
require.NoError(t, err)
t.Cleanup(func() { _ = docker.Close() })
client, err := NewClient(Config{
Docker: docker,
LogDriver: "json-file",
LogOpts: "max-size=1m,max-file=3",
Clock: func() time.Time { return time.Date(2026, time.April, 27, 12, 0, 0, 0, time.UTC) },
})
require.NoError(t, err)
return client
}
func writeJSON(t *testing.T, w http.ResponseWriter, status int, body any) {
t.Helper()
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(status)
require.NoError(t, json.NewEncoder(w).Encode(body))
}
func writeNotFound(t *testing.T, w http.ResponseWriter, msg string) {
t.Helper()
writeJSON(t, w, http.StatusNotFound, map[string]string{"message": msg})
}
// Docker SDK uses /v1.45 prefix when client is pinned to API 1.45.
func dockerPath(suffix string) string {
return "/v1.45" + suffix
}
func TestNewClientValidatesConfig(t *testing.T) {
t.Run("nil docker client", func(t *testing.T) {
_, err := NewClient(Config{LogDriver: "json-file"})
require.Error(t, err)
assert.Contains(t, err.Error(), "nil docker client")
})
t.Run("empty log driver", func(t *testing.T) {
docker, err := dockerclient.NewClientWithOpts(dockerclient.WithHost("tcp://127.0.0.1:65535"))
require.NoError(t, err)
t.Cleanup(func() { _ = docker.Close() })
_, err = NewClient(Config{Docker: docker, LogDriver: " "})
require.Error(t, err)
assert.Contains(t, err.Error(), "log driver")
})
}
func TestEnsureNetwork(t *testing.T) {
t.Run("present", func(t *testing.T) {
client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
require.Equal(t, http.MethodGet, r.Method)
require.Equal(t, dockerPath("/networks/galaxy-net"), r.URL.Path)
writeJSON(t, w, http.StatusOK, map[string]any{"Id": "net-1", "Name": "galaxy-net"})
})
require.NoError(t, client.EnsureNetwork(context.Background(), "galaxy-net"))
})
t.Run("missing", func(t *testing.T) {
client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
writeNotFound(t, w, "no such network")
})
err := client.EnsureNetwork(context.Background(), "missing")
require.Error(t, err)
assert.ErrorIs(t, err, ports.ErrNetworkMissing)
})
t.Run("transport error", func(t *testing.T) {
client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
http.Error(w, "boom", http.StatusInternalServerError)
})
err := client.EnsureNetwork(context.Background(), "x")
require.Error(t, err)
assert.NotErrorIs(t, err, ports.ErrNetworkMissing)
})
}
func TestInspectImage(t *testing.T) {
t.Run("present", func(t *testing.T) {
client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
require.Equal(t, http.MethodGet, r.Method)
require.Equal(t, dockerPath("/images/galaxy/game:test/json"), r.URL.Path)
writeJSON(t, w, http.StatusOK, map[string]any{
"Id": "sha256:abc",
"Config": map[string]any{
"Labels": map[string]string{
"com.galaxy.cpu_quota": "1.0",
"com.galaxy.memory": "512m",
"com.galaxy.pids_limit": "512",
},
},
})
})
got, err := client.InspectImage(context.Background(), "galaxy/game:test")
require.NoError(t, err)
assert.Equal(t, "galaxy/game:test", got.Ref)
assert.Equal(t, "1.0", got.Labels["com.galaxy.cpu_quota"])
assert.Equal(t, "512m", got.Labels["com.galaxy.memory"])
})
t.Run("not found", func(t *testing.T) {
client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
writeNotFound(t, w, "no such image")
})
_, err := client.InspectImage(context.Background(), "galaxy/missing:tag")
require.Error(t, err)
assert.ErrorIs(t, err, ports.ErrImageNotFound)
})
}
func TestInspectContainer(t *testing.T) {
t.Run("present", func(t *testing.T) {
client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
require.Equal(t, http.MethodGet, r.Method)
require.Equal(t, dockerPath("/containers/cont-1/json"), r.URL.Path)
writeJSON(t, w, http.StatusOK, map[string]any{
"Id": "cont-1",
"RestartCount": 2,
"State": map[string]any{
"Status": "running",
"OOMKilled": false,
"ExitCode": 0,
"StartedAt": "2026-04-27T11:00:00.5Z",
"FinishedAt": "0001-01-01T00:00:00Z",
"Health": map[string]any{"Status": "healthy"},
},
"Config": map[string]any{
"Image": "galaxy/game:test",
"Hostname": "galaxy-game-game-1",
"Labels": map[string]string{
"com.galaxy.owner": "rtmanager",
"com.galaxy.game_id": "game-1",
},
},
})
})
got, err := client.InspectContainer(context.Background(), "cont-1")
require.NoError(t, err)
assert.Equal(t, "cont-1", got.ID)
assert.Equal(t, 2, got.RestartCount)
assert.Equal(t, "running", got.Status)
assert.Equal(t, "healthy", got.Health)
assert.Equal(t, "galaxy/game:test", got.ImageRef)
assert.Equal(t, "galaxy-game-game-1", got.Hostname)
assert.Equal(t, "rtmanager", got.Labels["com.galaxy.owner"])
assert.False(t, got.StartedAt.IsZero())
})
t.Run("not found", func(t *testing.T) {
client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
writeNotFound(t, w, "no such container")
})
_, err := client.InspectContainer(context.Background(), "missing")
require.Error(t, err)
assert.ErrorIs(t, err, ports.ErrContainerNotFound)
})
}
func TestPullImagePolicies(t *testing.T) {
t.Run("if_missing/found skips pull", func(t *testing.T) {
hits := struct {
inspect atomic.Int32
pull atomic.Int32
}{}
client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
switch {
case strings.HasSuffix(r.URL.Path, "/json") && r.Method == http.MethodGet:
hits.inspect.Add(1)
writeJSON(t, w, http.StatusOK, map[string]any{"Id": "sha256:x"})
case strings.Contains(r.URL.Path, "/images/create"):
hits.pull.Add(1)
w.WriteHeader(http.StatusOK)
default:
t.Fatalf("unexpected request %s %s", r.Method, r.URL.Path)
}
})
require.NoError(t, client.PullImage(context.Background(), "alpine:3.21", ports.PullPolicyIfMissing))
assert.Equal(t, int32(1), hits.inspect.Load())
assert.Equal(t, int32(0), hits.pull.Load())
})
t.Run("if_missing/absent triggers pull", func(t *testing.T) {
hits := struct {
inspect atomic.Int32
pull atomic.Int32
}{}
client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
switch {
case strings.HasSuffix(r.URL.Path, "/json") && r.Method == http.MethodGet:
hits.inspect.Add(1)
writeNotFound(t, w, "no such image")
case strings.Contains(r.URL.Path, "/images/create"):
hits.pull.Add(1)
w.WriteHeader(http.StatusOK)
_, _ = io.WriteString(w, `{"status":"Pulling..."}`+"\n"+`{"status":"Done"}`+"\n")
default:
t.Fatalf("unexpected request %s %s", r.Method, r.URL.Path)
}
})
require.NoError(t, client.PullImage(context.Background(), "alpine:3.21", ports.PullPolicyIfMissing))
assert.Equal(t, int32(1), hits.inspect.Load())
assert.Equal(t, int32(1), hits.pull.Load())
})
t.Run("always pulls regardless of cache", func(t *testing.T) {
var pullCount atomic.Int32
client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
require.Contains(t, r.URL.Path, "/images/create")
pullCount.Add(1)
w.WriteHeader(http.StatusOK)
})
require.NoError(t, client.PullImage(context.Background(), "alpine:3.21", ports.PullPolicyAlways))
assert.Equal(t, int32(1), pullCount.Load())
})
t.Run("never with absent image", func(t *testing.T) {
client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
require.Equal(t, http.MethodGet, r.Method)
writeNotFound(t, w, "no such image")
})
err := client.PullImage(context.Background(), "alpine:3.21", ports.PullPolicyNever)
require.Error(t, err)
assert.ErrorIs(t, err, ports.ErrImageNotFound)
})
t.Run("never with present image", func(t *testing.T) {
client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
require.Equal(t, http.MethodGet, r.Method)
writeJSON(t, w, http.StatusOK, map[string]any{"Id": "x"})
})
require.NoError(t, client.PullImage(context.Background(), "alpine:3.21", ports.PullPolicyNever))
})
t.Run("unknown policy", func(t *testing.T) {
client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
t.Fatal("must not call docker on unknown policy")
})
err := client.PullImage(context.Background(), "alpine:3.21", ports.PullPolicy("invalid"))
require.Error(t, err)
})
}
func TestRunHappyPath(t *testing.T) {
calls := struct {
create atomic.Int32
start atomic.Int32
remove atomic.Int32
}{}
client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
switch {
case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/containers/create"):
calls.create.Add(1)
require.Equal(t, "galaxy-game-game-1", r.URL.Query().Get("name"))
writeJSON(t, w, http.StatusCreated, map[string]any{"Id": "cont-new", "Warnings": []string{}})
case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/start"):
calls.start.Add(1)
require.Equal(t, dockerPath("/containers/cont-new/start"), r.URL.Path)
w.WriteHeader(http.StatusNoContent)
case r.Method == http.MethodDelete && strings.HasPrefix(r.URL.Path, dockerPath("/containers/")):
calls.remove.Add(1)
w.WriteHeader(http.StatusNoContent)
default:
t.Fatalf("unexpected %s %s", r.Method, r.URL.Path)
}
})
result, err := client.Run(context.Background(), ports.RunSpec{
Name: "galaxy-game-game-1",
Image: "galaxy/game:test",
Hostname: "galaxy-game-game-1",
Network: "galaxy-net",
Env: map[string]string{
"GAME_STATE_PATH": "/var/lib/galaxy-game",
"STORAGE_PATH": "/var/lib/galaxy-game",
},
Labels: map[string]string{"com.galaxy.owner": "rtmanager"},
LogDriver: "json-file",
BindMounts: []ports.BindMount{
{HostPath: "/var/lib/galaxy/games/game-1", MountPath: "/var/lib/galaxy-game"},
},
CPUQuota: 1.0,
Memory: "512m",
PIDsLimit: 512,
})
require.NoError(t, err)
assert.Equal(t, "cont-new", result.ContainerID)
assert.Equal(t, "http://galaxy-game-game-1:8080", result.EngineEndpoint)
assert.False(t, result.StartedAt.IsZero())
assert.Equal(t, int32(1), calls.create.Load())
assert.Equal(t, int32(1), calls.start.Load())
assert.Equal(t, int32(0), calls.remove.Load())
}
func TestRunStartFailureRemovesContainer(t *testing.T) {
calls := struct {
create atomic.Int32
start atomic.Int32
remove atomic.Int32
}{}
client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
switch {
case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/containers/create"):
calls.create.Add(1)
writeJSON(t, w, http.StatusCreated, map[string]any{"Id": "cont-x"})
case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/start"):
calls.start.Add(1)
http.Error(w, `{"message":"insufficient host resources"}`, http.StatusInternalServerError)
case r.Method == http.MethodDelete && strings.HasPrefix(r.URL.Path, dockerPath("/containers/cont-x")):
calls.remove.Add(1)
require.Equal(t, "1", r.URL.Query().Get("force"))
w.WriteHeader(http.StatusNoContent)
default:
t.Fatalf("unexpected %s %s", r.Method, r.URL.Path)
}
})
_, err := client.Run(context.Background(), ports.RunSpec{
Name: "x",
Image: "img",
Hostname: "x",
Network: "n",
LogDriver: "json-file",
CPUQuota: 1.0,
Memory: "64m",
PIDsLimit: 64,
})
require.Error(t, err)
assert.Equal(t, int32(1), calls.create.Load())
assert.Equal(t, int32(1), calls.start.Load())
assert.Equal(t, int32(1), calls.remove.Load(), "adapter must roll back the partial container")
}
func TestRunRejectsInvalidSpec(t *testing.T) {
client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
t.Fatal("must not contact docker on invalid spec")
})
_, err := client.Run(context.Background(), ports.RunSpec{Name: "x"})
require.Error(t, err)
assert.Contains(t, err.Error(), "image must not be empty")
}
func TestStop(t *testing.T) {
t.Run("graceful stop", func(t *testing.T) {
client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
require.Equal(t, http.MethodPost, r.Method)
require.Equal(t, dockerPath("/containers/cont-1/stop"), r.URL.Path)
require.Equal(t, "30", r.URL.Query().Get("t"))
w.WriteHeader(http.StatusNoContent)
})
require.NoError(t, client.Stop(context.Background(), "cont-1", 30*time.Second))
})
t.Run("missing container", func(t *testing.T) {
client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
writeNotFound(t, w, "no such container")
})
err := client.Stop(context.Background(), "missing", 30*time.Second)
assert.ErrorIs(t, err, ports.ErrContainerNotFound)
})
t.Run("negative timeout normalised to zero", func(t *testing.T) {
client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
require.Equal(t, "0", r.URL.Query().Get("t"))
w.WriteHeader(http.StatusNoContent)
})
require.NoError(t, client.Stop(context.Background(), "x", -5*time.Second))
})
}
func TestRemoveIsIdempotent(t *testing.T) {
t.Run("present", func(t *testing.T) {
client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
require.Equal(t, http.MethodDelete, r.Method)
w.WriteHeader(http.StatusNoContent)
})
require.NoError(t, client.Remove(context.Background(), "cont-1"))
})
t.Run("missing", func(t *testing.T) {
client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
writeNotFound(t, w, "no such container")
})
require.NoError(t, client.Remove(context.Background(), "missing"))
})
}
func TestListAppliesLabelFilter(t *testing.T) {
client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
require.Equal(t, http.MethodGet, r.Method)
require.Equal(t, dockerPath("/containers/json"), r.URL.Path)
require.Equal(t, "1", r.URL.Query().Get("all"))
filtersRaw := r.URL.Query().Get("filters")
require.NotEmpty(t, filtersRaw)
var args map[string]map[string]bool
require.NoError(t, json.Unmarshal([]byte(filtersRaw), &args))
require.True(t, args["label"]["com.galaxy.owner=rtmanager"])
writeJSON(t, w, http.StatusOK, []map[string]any{
{
"Id": "cont-a",
"Image": "galaxy/game:1.2.3",
"Names": []string{"/galaxy-game-game-1"},
"Labels": map[string]string{"com.galaxy.owner": "rtmanager"},
"State": "running",
"Created": int64(1700000000),
},
})
})
got, err := client.List(context.Background(), ports.ListFilter{
Labels: map[string]string{"com.galaxy.owner": "rtmanager"},
})
require.NoError(t, err)
require.Len(t, got, 1)
assert.Equal(t, "cont-a", got[0].ID)
assert.Equal(t, "galaxy/game:1.2.3", got[0].ImageRef)
assert.Equal(t, "galaxy-game-game-1", got[0].Hostname)
assert.Equal(t, "running", got[0].Status)
assert.False(t, got[0].StartedAt.IsZero())
assert.Equal(t, "rtmanager", got[0].Labels["com.galaxy.owner"])
}
func TestEventsListenDecodesContainerEvents(t *testing.T) {
done := make(chan struct{}) // blocks the events handler until the test returns
client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
require.Equal(t, http.MethodGet, r.Method)
require.Equal(t, dockerPath("/events"), r.URL.Path)
flusher, ok := w.(http.Flusher)
require.True(t, ok)
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusOK)
flusher.Flush()
// Container start event
writeEvent(t, w, "container", "start", "cont-1", map[string]string{
"image": "galaxy/game:1.2.3",
"name": "galaxy-game-game-1",
"com.galaxy.game_id": "game-1",
}, time.Now())
flusher.Flush()
// Container die event with exit code 137
writeEvent(t, w, "container", "die", "cont-1", map[string]string{
"exitCode": "137",
}, time.Now())
flusher.Flush()
// Image event must be filtered out by adapter
writeEvent(t, w, "image", "pull", "img", nil, time.Now())
flusher.Flush()
<-done
})
defer close(done)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
events, _, err := client.EventsListen(ctx)
require.NoError(t, err)
got := []ports.DockerEvent{}
deadline := time.After(2 * time.Second)
for len(got) < 2 {
select {
case ev, ok := <-events:
if !ok {
t.Fatalf("events channel closed; got %d events", len(got))
}
got = append(got, ev)
case <-deadline:
t.Fatalf("did not receive expected events; have %d", len(got))
}
}
require.Len(t, got, 2)
assert.Equal(t, "start", got[0].Action)
assert.Equal(t, "cont-1", got[0].ContainerID)
assert.Equal(t, "game-1", got[0].Labels["com.galaxy.game_id"])
assert.Equal(t, "die", got[1].Action)
assert.Equal(t, 137, got[1].ExitCode)
}
func writeEvent(t *testing.T, w io.Writer, eventType, action, id string, attributes map[string]string, when time.Time) {
t.Helper()
payload := map[string]any{
"Type": eventType,
"Action": action,
"Actor": map[string]any{"ID": id, "Attributes": attributes},
"time": when.Unix(),
"timeNano": when.UnixNano(),
}
data, err := json.Marshal(payload)
require.NoError(t, err)
_, err = fmt.Fprintln(w, string(data))
require.NoError(t, err)
}
// Sanity: parsing helpers.
func TestParseLogOpts(t *testing.T) {
got := parseLogOpts("max-size=1m,max-file=3, ,empty=,=novalue")
assert.Equal(t, "1m", got["max-size"])
assert.Equal(t, "3", got["max-file"])
assert.Equal(t, "", got["empty"])
_, hasNovalue := got["=novalue"]
assert.False(t, hasNovalue)
}
func TestParseDockerTime(t *testing.T) {
assert.True(t, parseDockerTime("").IsZero())
assert.True(t, parseDockerTime("not-a-date").IsZero())
parsed := parseDockerTime("2026-04-27T11:00:00.5Z")
assert.False(t, parsed.IsZero())
assert.Equal(t, time.UTC, parsed.Location())
}
func TestEnvMapToSliceDeterministicLength(t *testing.T) {
got := envMapToSlice(map[string]string{"A": "1", "B": "2"})
assert.Len(t, got, 2)
for _, kv := range got {
assert.Contains(t, []string{"A=1", "B=2"}, kv)
}
assert.Nil(t, envMapToSlice(nil))
}
// Sanity check: the sentinel errors stay distinct and the errors.Is wiring stays intact.
func TestSentinelErrorsAreDistinct(t *testing.T) {
require.True(t, errors.Is(ports.ErrNetworkMissing, ports.ErrNetworkMissing))
require.False(t, errors.Is(ports.ErrNetworkMissing, ports.ErrImageNotFound))
}
func TestURLPathEscapingForCharacters(t *testing.T) {
// Ensure special characters are percent-encoded when they land in a URL
// path segment; the adapter passes raw inputs through and lets the SDK
// escape them. Plain identifiers must pass through unchanged.
assert.Equal(t, "game-1", url.PathEscape("game-1"))
assert.Equal(t, "game%2F1", url.PathEscape("game/1"))
}
@@ -0,0 +1,175 @@
// Code generated by MockGen. DO NOT EDIT.
// Source: galaxy/rtmanager/internal/ports (interfaces: DockerClient)
//
// Generated by this command:
//
// mockgen -destination=../adapters/docker/mocks/mock_dockerclient.go -package=mocks galaxy/rtmanager/internal/ports DockerClient
//
// Package mocks is a generated GoMock package.
package mocks
import (
context "context"
ports "galaxy/rtmanager/internal/ports"
reflect "reflect"
time "time"
gomock "go.uber.org/mock/gomock"
)
// MockDockerClient is a mock of DockerClient interface.
type MockDockerClient struct {
ctrl *gomock.Controller
recorder *MockDockerClientMockRecorder
isgomock struct{}
}
// MockDockerClientMockRecorder is the mock recorder for MockDockerClient.
type MockDockerClientMockRecorder struct {
mock *MockDockerClient
}
// NewMockDockerClient creates a new mock instance.
func NewMockDockerClient(ctrl *gomock.Controller) *MockDockerClient {
mock := &MockDockerClient{ctrl: ctrl}
mock.recorder = &MockDockerClientMockRecorder{mock}
return mock
}
// EXPECT returns an object that allows the caller to indicate expected use.
func (m *MockDockerClient) EXPECT() *MockDockerClientMockRecorder {
return m.recorder
}
// EnsureNetwork mocks base method.
func (m *MockDockerClient) EnsureNetwork(ctx context.Context, name string) error {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "EnsureNetwork", ctx, name)
ret0, _ := ret[0].(error)
return ret0
}
// EnsureNetwork indicates an expected call of EnsureNetwork.
func (mr *MockDockerClientMockRecorder) EnsureNetwork(ctx, name any) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EnsureNetwork", reflect.TypeOf((*MockDockerClient)(nil).EnsureNetwork), ctx, name)
}
// EventsListen mocks base method.
func (m *MockDockerClient) EventsListen(ctx context.Context) (<-chan ports.DockerEvent, <-chan error, error) {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "EventsListen", ctx)
ret0, _ := ret[0].(<-chan ports.DockerEvent)
ret1, _ := ret[1].(<-chan error)
ret2, _ := ret[2].(error)
return ret0, ret1, ret2
}
// EventsListen indicates an expected call of EventsListen.
func (mr *MockDockerClientMockRecorder) EventsListen(ctx any) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EventsListen", reflect.TypeOf((*MockDockerClient)(nil).EventsListen), ctx)
}
// InspectContainer mocks base method.
func (m *MockDockerClient) InspectContainer(ctx context.Context, containerID string) (ports.ContainerInspect, error) {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "InspectContainer", ctx, containerID)
ret0, _ := ret[0].(ports.ContainerInspect)
ret1, _ := ret[1].(error)
return ret0, ret1
}
// InspectContainer indicates an expected call of InspectContainer.
func (mr *MockDockerClientMockRecorder) InspectContainer(ctx, containerID any) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InspectContainer", reflect.TypeOf((*MockDockerClient)(nil).InspectContainer), ctx, containerID)
}
// InspectImage mocks base method.
func (m *MockDockerClient) InspectImage(ctx context.Context, ref string) (ports.ImageInspect, error) {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "InspectImage", ctx, ref)
ret0, _ := ret[0].(ports.ImageInspect)
ret1, _ := ret[1].(error)
return ret0, ret1
}
// InspectImage indicates an expected call of InspectImage.
func (mr *MockDockerClientMockRecorder) InspectImage(ctx, ref any) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InspectImage", reflect.TypeOf((*MockDockerClient)(nil).InspectImage), ctx, ref)
}
// List mocks base method.
func (m *MockDockerClient) List(ctx context.Context, filter ports.ListFilter) ([]ports.ContainerSummary, error) {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "List", ctx, filter)
ret0, _ := ret[0].([]ports.ContainerSummary)
ret1, _ := ret[1].(error)
return ret0, ret1
}
// List indicates an expected call of List.
func (mr *MockDockerClientMockRecorder) List(ctx, filter any) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "List", reflect.TypeOf((*MockDockerClient)(nil).List), ctx, filter)
}
// PullImage mocks base method.
func (m *MockDockerClient) PullImage(ctx context.Context, ref string, policy ports.PullPolicy) error {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "PullImage", ctx, ref, policy)
ret0, _ := ret[0].(error)
return ret0
}
// PullImage indicates an expected call of PullImage.
func (mr *MockDockerClientMockRecorder) PullImage(ctx, ref, policy any) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "PullImage", reflect.TypeOf((*MockDockerClient)(nil).PullImage), ctx, ref, policy)
}
// Remove mocks base method.
func (m *MockDockerClient) Remove(ctx context.Context, containerID string) error {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "Remove", ctx, containerID)
ret0, _ := ret[0].(error)
return ret0
}
// Remove indicates an expected call of Remove.
func (mr *MockDockerClientMockRecorder) Remove(ctx, containerID any) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Remove", reflect.TypeOf((*MockDockerClient)(nil).Remove), ctx, containerID)
}
// Run mocks base method.
func (m *MockDockerClient) Run(ctx context.Context, spec ports.RunSpec) (ports.RunResult, error) {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "Run", ctx, spec)
ret0, _ := ret[0].(ports.RunResult)
ret1, _ := ret[1].(error)
return ret0, ret1
}
// Run indicates an expected call of Run.
func (mr *MockDockerClientMockRecorder) Run(ctx, spec any) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Run", reflect.TypeOf((*MockDockerClient)(nil).Run), ctx, spec)
}
// Stop mocks base method.
func (m *MockDockerClient) Stop(ctx context.Context, containerID string, timeout time.Duration) error {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "Stop", ctx, containerID, timeout)
ret0, _ := ret[0].(error)
return ret0
}
// Stop indicates an expected call of Stop.
func (mr *MockDockerClientMockRecorder) Stop(ctx, containerID, timeout any) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Stop", reflect.TypeOf((*MockDockerClient)(nil).Stop), ctx, containerID, timeout)
}
@@ -0,0 +1,11 @@
package mocks
import (
"galaxy/rtmanager/internal/ports"
)
// Compile-time assertion that the generated mock satisfies the port
// interface. Future signature drift between the port and the generated
// file fails the build at this line, which is more actionable than a
// runtime check from a service test.
var _ ports.DockerClient = (*MockDockerClient)(nil)
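// A usage sketch (illustrative) of how a service test might wire the
// mock; the expectation shown is an assumption about the test, not part
// of this package:
//
//	ctrl := gomock.NewController(t)
//	dockerMock := mocks.NewMockDockerClient(ctrl)
//	dockerMock.EXPECT().
//		EnsureNetwork(gomock.Any(), "galaxy-net").
//		Return(nil)
//	// pass dockerMock anywhere a ports.DockerClient is accepted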
@@ -0,0 +1,202 @@
// Package docker smoke tests exercise the production adapter against a
// real Docker daemon. The tests skip when no Docker socket is reachable
// (`skipUnlessDockerAvailable`), so they run in the default
// `go test ./...` pass without a build tag.
package docker
import (
"context"
"crypto/rand"
"encoding/hex"
"errors"
"os"
"testing"
"time"
"github.com/docker/docker/api/types/network"
dockerclient "github.com/docker/docker/client"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"galaxy/rtmanager/internal/ports"
)
const (
smokeImage = "alpine:3.21"
smokeNetPrefix = "rtmanager-smoke-"
)
func skipUnlessDockerAvailable(t *testing.T) {
t.Helper()
if os.Getenv("DOCKER_HOST") == "" {
if _, err := os.Stat("/var/run/docker.sock"); err != nil {
t.Skip("docker daemon not available; set DOCKER_HOST or expose /var/run/docker.sock")
}
}
}
func newSmokeAdapter(t *testing.T) (*Client, *dockerclient.Client) {
t.Helper()
docker, err := dockerclient.NewClientWithOpts(dockerclient.FromEnv, dockerclient.WithAPIVersionNegotiation())
require.NoError(t, err)
t.Cleanup(func() { _ = docker.Close() })
pingCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
if _, err := docker.Ping(pingCtx); err != nil {
// A reachable socket path may still be unusable in sandboxed
// environments (e.g. a macOS sandbox blocking the colima socket).
// The smoke test can only run when the daemon answers the ping, so a
// permission-denied or connection-refused error is treated as
// "Docker unavailable" and the test is skipped.
t.Skipf("docker daemon unavailable: %v", err)
}
adapter, err := NewClient(Config{
Docker: docker,
LogDriver: "json-file",
})
require.NoError(t, err)
return adapter, docker
}
func uniqueSuffix(t *testing.T) string {
t.Helper()
buf := make([]byte, 4)
_, err := rand.Read(buf)
require.NoError(t, err)
return hex.EncodeToString(buf)
}
// TestSmokeFullLifecycle runs the adapter through every method against
// the real Docker daemon: ensure-network → pull → run → events →
// inspect → stop → remove.
func TestSmokeFullLifecycle(t *testing.T) {
skipUnlessDockerAvailable(t)
adapter, docker := newSmokeAdapter(t)
suffix := uniqueSuffix(t)
netName := smokeNetPrefix + suffix
containerName := "rtmanager-smoke-cont-" + suffix
// Step 1 — provision a temporary user-defined bridge network.
createCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
_, err := docker.NetworkCreate(createCtx, netName, network.CreateOptions{Driver: "bridge"})
require.NoError(t, err)
t.Cleanup(func() {
removeCtx, removeCancel := context.WithTimeout(context.Background(), 30*time.Second)
defer removeCancel()
_ = docker.NetworkRemove(removeCtx, netName)
})
// Step 2 — EnsureNetwork present and missing paths.
require.NoError(t, adapter.EnsureNetwork(createCtx, netName))
missingErr := adapter.EnsureNetwork(createCtx, "rtmanager-smoke-missing-"+suffix)
require.Error(t, missingErr)
assert.ErrorIs(t, missingErr, ports.ErrNetworkMissing)
// Step 3 — pull alpine via the configured policy.
pullCtx, pullCancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer pullCancel()
require.NoError(t, adapter.PullImage(pullCtx, smokeImage, ports.PullPolicyIfMissing))
// Step 4 — subscribe to events before running the container so we
// observe the start event.
listenCtx, listenCancel := context.WithCancel(context.Background())
defer listenCancel()
events, listenErrs, err := adapter.EventsListen(listenCtx)
require.NoError(t, err)
// Step 5 — run a tiny container that sleeps so we can observe it.
stateDir := t.TempDir()
runCtx, runCancel := context.WithTimeout(context.Background(), 60*time.Second)
defer runCancel()
result, err := adapter.Run(runCtx, ports.RunSpec{
Name: containerName,
Image: smokeImage,
Hostname: "smoke-" + suffix,
Network: netName,
Env: map[string]string{
"GAME_STATE_PATH": "/tmp/state",
"STORAGE_PATH": "/tmp/state",
},
Labels: map[string]string{
"com.galaxy.owner": "rtmanager",
"com.galaxy.kind": "smoke",
},
BindMounts: []ports.BindMount{
{HostPath: stateDir, MountPath: "/tmp/state"},
},
LogDriver: "json-file",
CPUQuota: 0.5,
Memory: "64m",
PIDsLimit: 32,
Cmd: []string{"/bin/sh", "-c", "sleep 60"},
})
require.NoError(t, err)
t.Cleanup(func() {
removeCtx, removeCancel := context.WithTimeout(context.Background(), 30*time.Second)
defer removeCancel()
_ = adapter.Remove(removeCtx, result.ContainerID)
})
require.NotEmpty(t, result.ContainerID)
require.Equal(t, "http://smoke-"+suffix+":8080", result.EngineEndpoint)
// Step 6 — wait for a `start` event for the new container id.
startObserved := waitForEvent(t, events, listenErrs, "start", result.ContainerID, 15*time.Second)
require.True(t, startObserved, "did not observe start event for container %s", result.ContainerID)
// Step 7 — InspectContainer returns running state.
inspectCtx, inspectCancel := context.WithTimeout(context.Background(), 30*time.Second)
defer inspectCancel()
inspect, err := adapter.InspectContainer(inspectCtx, result.ContainerID)
require.NoError(t, err)
assert.Equal(t, "running", inspect.Status)
// Step 8 — Stop, then Remove, then InspectContainer must report
// not found.
stopCtx, stopCancel := context.WithTimeout(context.Background(), 30*time.Second)
defer stopCancel()
require.NoError(t, adapter.Stop(stopCtx, result.ContainerID, 5*time.Second))
require.NoError(t, adapter.Remove(stopCtx, result.ContainerID))
if _, err := adapter.InspectContainer(stopCtx, result.ContainerID); !errors.Is(err, ports.ErrContainerNotFound) {
t.Fatalf("expected ErrContainerNotFound, got %v", err)
}
// Step 9 — terminate the events subscription cleanly.
listenCancel()
select {
case _, ok := <-events:
_ = ok
case <-time.After(5 * time.Second):
t.Log("events channel did not close within timeout (best-effort)")
}
}
func waitForEvent(t *testing.T, events <-chan ports.DockerEvent, errs <-chan error, action, containerID string, timeout time.Duration) bool {
t.Helper()
deadline := time.After(timeout)
for {
select {
case ev, ok := <-events:
if !ok {
return false
}
if ev.Action == action && ev.ContainerID == containerID {
return true
}
case err := <-errs:
if err != nil {
t.Fatalf("events stream error: %v", err)
}
case <-deadline:
return false
}
}
}
@@ -0,0 +1,165 @@
// Package healtheventspublisher provides the Redis-Streams-backed
// publisher for `runtime:health_events`. Every Publish call upserts the
// latest `health_snapshots` row before XADDing the event so consumers
// observing the snapshot store can never lag the event stream by more
// than the duration of one network call.
//
// The publisher is shared across `ports.HealthEventPublisher` callers:
// the start service emits `container_started`; the probe, inspect, and
// events-listener workers emit the rest. The publisher's surface is
// stable across all of them.
package healtheventspublisher
import (
"context"
"encoding/json"
"errors"
"fmt"
"strconv"
"galaxy/rtmanager/internal/domain/health"
"galaxy/rtmanager/internal/ports"
"github.com/redis/go-redis/v9"
)
// emptyDetails is the canonical JSON payload installed when the caller
// supplies an empty Details slice. Matches the SQL DEFAULT for
// `health_snapshots.details`.
const emptyDetails = "{}"
// Wire field names used by the Redis Streams payload. Frozen by
// `rtmanager/api/runtime-health-asyncapi.yaml`; renaming any of them
// breaks consumers.
const (
fieldGameID = "game_id"
fieldContainerID = "container_id"
fieldEventType = "event_type"
fieldOccurredAtMS = "occurred_at_ms"
fieldDetails = "details"
)
// Config groups the dependencies and stream name required to construct
// a Publisher.
type Config struct {
// Client appends entries to the Redis Stream. Must be non-nil.
Client *redis.Client
// Snapshots upserts the latest health snapshot. Must be non-nil.
Snapshots ports.HealthSnapshotStore
// Stream stores the Redis Stream key events are published to (e.g.
// `runtime:health_events`). Must not be empty.
Stream string
}
// Publisher implements `ports.HealthEventPublisher` on top of a shared
// Redis client and the production `health_snapshots` store.
type Publisher struct {
client *redis.Client
snapshots ports.HealthSnapshotStore
stream string
}
// NewPublisher constructs one Publisher from cfg. Validation errors
// surface the missing collaborator verbatim.
func NewPublisher(cfg Config) (*Publisher, error) {
if cfg.Client == nil {
return nil, errors.New("new rtmanager health events publisher: nil redis client")
}
if cfg.Snapshots == nil {
return nil, errors.New("new rtmanager health events publisher: nil snapshot store")
}
if cfg.Stream == "" {
return nil, errors.New("new rtmanager health events publisher: stream must not be empty")
}
return &Publisher{
client: cfg.Client,
snapshots: cfg.Snapshots,
stream: cfg.Stream,
}, nil
}
// Publish upserts the matching health_snapshots row and then XADDs the
// envelope to the configured Redis Stream. Both side effects are
// required; the snapshot upsert runs first so a successful Publish
// always leaves the snapshot store at least as fresh as the stream.
func (publisher *Publisher) Publish(ctx context.Context, envelope ports.HealthEventEnvelope) error {
if publisher == nil || publisher.client == nil || publisher.snapshots == nil {
return errors.New("publish health event: nil publisher")
}
if ctx == nil {
return errors.New("publish health event: nil context")
}
if err := envelope.Validate(); err != nil {
return fmt.Errorf("publish health event: %w", err)
}
details := envelope.Details
if len(details) == 0 {
details = json.RawMessage(emptyDetails)
}
status, source := snapshotMappingFor(envelope.EventType)
snapshot := health.HealthSnapshot{
GameID: envelope.GameID,
ContainerID: envelope.ContainerID,
Status: status,
Source: source,
Details: details,
ObservedAt: envelope.OccurredAt.UTC(),
}
if err := publisher.snapshots.Upsert(ctx, snapshot); err != nil {
return fmt.Errorf("publish health event: upsert snapshot: %w", err)
}
occurredAtMS := envelope.OccurredAt.UTC().UnixMilli()
values := map[string]any{
fieldGameID: envelope.GameID,
fieldContainerID: envelope.ContainerID,
fieldEventType: string(envelope.EventType),
fieldOccurredAtMS: strconv.FormatInt(occurredAtMS, 10),
fieldDetails: string(details),
}
if err := publisher.client.XAdd(ctx, &redis.XAddArgs{
Stream: publisher.stream,
Values: values,
}).Err(); err != nil {
return fmt.Errorf("publish health event: xadd: %w", err)
}
return nil
}
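// A consumer-side sketch (illustrative; `client` stands for the
// consumer's own *redis.Client). The field names are the wire constants
// documented above, and the stream key is the one wired into
// Config.Stream:
//
//	entries, err := client.XRange(ctx, "runtime:health_events", "-", "+").Result()
//	if err != nil {
//		return err
//	}
//	for _, entry := range entries {
//		gameID, _ := entry.Values["game_id"].(string)
//		eventType, _ := entry.Values["event_type"].(string)
//		_, _ = gameID, eventType
//	}
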
// snapshotMappingFor returns the SnapshotStatus and SnapshotSource that
// match eventType per `rtmanager/README.md §Health Monitoring`.
//
// `container_started` is observed when the start service successfully
// runs the container; the snapshot collapses it to `healthy`.
// `probe_recovered` collapses to `healthy` per
// `rtmanager/docs/domain-and-ports.md` §4: it does not have its own
// snapshot status; the next observation overwrites the prior
// `probe_failed` with `healthy`.
func snapshotMappingFor(eventType health.EventType) (health.SnapshotStatus, health.SnapshotSource) {
switch eventType {
case health.EventTypeContainerStarted:
return health.SnapshotStatusHealthy, health.SnapshotSourceDockerEvent
case health.EventTypeContainerExited:
return health.SnapshotStatusExited, health.SnapshotSourceDockerEvent
case health.EventTypeContainerOOM:
return health.SnapshotStatusOOM, health.SnapshotSourceDockerEvent
case health.EventTypeContainerDisappeared:
return health.SnapshotStatusContainerDisappeared, health.SnapshotSourceDockerEvent
case health.EventTypeInspectUnhealthy:
return health.SnapshotStatusInspectUnhealthy, health.SnapshotSourceInspect
case health.EventTypeProbeFailed:
return health.SnapshotStatusProbeFailed, health.SnapshotSourceProbe
case health.EventTypeProbeRecovered:
return health.SnapshotStatusHealthy, health.SnapshotSourceProbe
default:
return "", ""
}
}
// Compile-time assertion: Publisher implements
// ports.HealthEventPublisher.
var _ ports.HealthEventPublisher = (*Publisher)(nil)
@@ -0,0 +1,197 @@
package healtheventspublisher_test
import (
"context"
"encoding/json"
"strconv"
"sync"
"testing"
"time"
"galaxy/rtmanager/internal/adapters/healtheventspublisher"
"galaxy/rtmanager/internal/domain/health"
"galaxy/rtmanager/internal/ports"
"github.com/alicebob/miniredis/v2"
"github.com/redis/go-redis/v9"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// fakeSnapshots captures Upsert invocations for assertions.
type fakeSnapshots struct {
mu sync.Mutex
upserts []health.HealthSnapshot
upsertErr error
}
func (s *fakeSnapshots) Upsert(_ context.Context, snapshot health.HealthSnapshot) error {
s.mu.Lock()
defer s.mu.Unlock()
if s.upsertErr != nil {
return s.upsertErr
}
s.upserts = append(s.upserts, snapshot)
return nil
}
func (s *fakeSnapshots) Get(_ context.Context, _ string) (health.HealthSnapshot, error) {
return health.HealthSnapshot{}, nil
}
func newPublisher(t *testing.T, snapshots ports.HealthSnapshotStore) (*healtheventspublisher.Publisher, *miniredis.Miniredis, *redis.Client) {
t.Helper()
server := miniredis.RunT(t)
client := redis.NewClient(&redis.Options{Addr: server.Addr()})
t.Cleanup(func() { _ = client.Close() })
publisher, err := healtheventspublisher.NewPublisher(healtheventspublisher.Config{
Client: client,
Snapshots: snapshots,
Stream: "runtime:health_events",
})
require.NoError(t, err)
return publisher, server, client
}
func TestNewPublisherRejectsMissingCollaborators(t *testing.T) {
_, err := healtheventspublisher.NewPublisher(healtheventspublisher.Config{})
require.Error(t, err)
_, err = healtheventspublisher.NewPublisher(healtheventspublisher.Config{
Client: redis.NewClient(&redis.Options{Addr: "127.0.0.1:0"}),
})
require.Error(t, err)
_, err = healtheventspublisher.NewPublisher(healtheventspublisher.Config{
Client: redis.NewClient(&redis.Options{Addr: "127.0.0.1:0"}),
Snapshots: &fakeSnapshots{},
})
require.Error(t, err)
}
func TestPublishContainerStartedUpsertsHealthyAndXAdds(t *testing.T) {
snapshots := &fakeSnapshots{}
publisher, _, client := newPublisher(t, snapshots)
occurredAt := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)
envelope := ports.HealthEventEnvelope{
GameID: "game-1",
ContainerID: "c-1",
EventType: health.EventTypeContainerStarted,
OccurredAt: occurredAt,
Details: json.RawMessage(`{"image_ref":"galaxy/game:1.2.3"}`),
}
require.NoError(t, publisher.Publish(context.Background(), envelope))
require.Len(t, snapshots.upserts, 1)
snapshot := snapshots.upserts[0]
assert.Equal(t, "game-1", snapshot.GameID)
assert.Equal(t, "c-1", snapshot.ContainerID)
assert.Equal(t, health.SnapshotStatusHealthy, snapshot.Status)
assert.Equal(t, health.SnapshotSourceDockerEvent, snapshot.Source)
assert.JSONEq(t, `{"image_ref":"galaxy/game:1.2.3"}`, string(snapshot.Details))
assert.Equal(t, occurredAt, snapshot.ObservedAt)
entries, err := client.XRange(context.Background(), "runtime:health_events", "-", "+").Result()
require.NoError(t, err)
require.Len(t, entries, 1)
values := entries[0].Values
assert.Equal(t, "game-1", values["game_id"])
assert.Equal(t, "c-1", values["container_id"])
assert.Equal(t, "container_started", values["event_type"])
assert.Equal(t, strconv.FormatInt(occurredAt.UnixMilli(), 10), values["occurred_at_ms"])
assert.JSONEq(t, `{"image_ref":"galaxy/game:1.2.3"}`, values["details"].(string))
}
func TestPublishMapsEveryEventTypeToASnapshot(t *testing.T) {
t.Parallel()
cases := []struct {
eventType health.EventType
expectStatus health.SnapshotStatus
expectSource health.SnapshotSource
}{
{health.EventTypeContainerStarted, health.SnapshotStatusHealthy, health.SnapshotSourceDockerEvent},
{health.EventTypeContainerExited, health.SnapshotStatusExited, health.SnapshotSourceDockerEvent},
{health.EventTypeContainerOOM, health.SnapshotStatusOOM, health.SnapshotSourceDockerEvent},
{health.EventTypeContainerDisappeared, health.SnapshotStatusContainerDisappeared, health.SnapshotSourceDockerEvent},
{health.EventTypeInspectUnhealthy, health.SnapshotStatusInspectUnhealthy, health.SnapshotSourceInspect},
{health.EventTypeProbeFailed, health.SnapshotStatusProbeFailed, health.SnapshotSourceProbe},
{health.EventTypeProbeRecovered, health.SnapshotStatusHealthy, health.SnapshotSourceProbe},
}
for _, tc := range cases {
t.Run(string(tc.eventType), func(t *testing.T) {
t.Parallel()
snapshots := &fakeSnapshots{}
publisher, _, _ := newPublisher(t, snapshots)
require.NoError(t, publisher.Publish(context.Background(), ports.HealthEventEnvelope{
GameID: "g",
ContainerID: "c",
EventType: tc.eventType,
OccurredAt: time.Now().UTC(),
Details: json.RawMessage(`{}`),
}))
require.Len(t, snapshots.upserts, 1)
assert.Equal(t, tc.expectStatus, snapshots.upserts[0].Status)
assert.Equal(t, tc.expectSource, snapshots.upserts[0].Source)
})
}
}
func TestPublishEmptyDetailsBecomesEmptyObject(t *testing.T) {
snapshots := &fakeSnapshots{}
publisher, _, client := newPublisher(t, snapshots)
envelope := ports.HealthEventEnvelope{
GameID: "g",
ContainerID: "c",
EventType: health.EventTypeContainerDisappeared,
OccurredAt: time.Now().UTC(),
}
require.NoError(t, publisher.Publish(context.Background(), envelope))
require.Len(t, snapshots.upserts, 1)
assert.JSONEq(t, "{}", string(snapshots.upserts[0].Details))
entries, err := client.XRange(context.Background(), "runtime:health_events", "-", "+").Result()
require.NoError(t, err)
require.Len(t, entries, 1)
assert.JSONEq(t, "{}", entries[0].Values["details"].(string))
}
func TestPublishRejectsInvalidEnvelope(t *testing.T) {
snapshots := &fakeSnapshots{}
publisher, _, client := newPublisher(t, snapshots)
require.Error(t, publisher.Publish(context.Background(), ports.HealthEventEnvelope{}))
entries, err := client.XRange(context.Background(), "runtime:health_events", "-", "+").Result()
require.NoError(t, err)
assert.Empty(t, entries)
assert.Empty(t, snapshots.upserts)
}
func TestPublishSurfacesSnapshotErrorWithoutXAdd(t *testing.T) {
snapshots := &fakeSnapshots{upsertErr: assertSentinelErr}
publisher, _, client := newPublisher(t, snapshots)
err := publisher.Publish(context.Background(), ports.HealthEventEnvelope{
GameID: "g",
ContainerID: "c",
EventType: health.EventTypeContainerStarted,
OccurredAt: time.Now().UTC(),
Details: json.RawMessage(`{"image_ref":"x"}`),
})
require.Error(t, err)
entries, err := client.XRange(context.Background(), "runtime:health_events", "-", "+").Result()
require.NoError(t, err)
assert.Empty(t, entries, "xadd must not run when snapshot upsert fails")
}
// assertSentinelErr is a sentinel for snapshot-failure assertions.
var assertSentinelErr = sentinelError("snapshot upsert failure")
type sentinelError string
func (s sentinelError) Error() string { return string(s) }
@@ -0,0 +1,100 @@
// Package jobresultspublisher provides the Redis-Streams-backed
// publisher for `runtime:job_results`. The start-jobs and stop-jobs
// consumers call this adapter so every consumed envelope produces
// exactly one outcome entry on the result stream.
//
// The wire fields mirror the AsyncAPI schema frozen in
// `rtmanager/api/runtime-jobs-asyncapi.yaml`. Every field is XADDed
// even when empty so consumers can rely on the schema's required-field
// set.
package jobresultspublisher
import (
"context"
"errors"
"fmt"
"strings"
"galaxy/rtmanager/internal/ports"
"github.com/redis/go-redis/v9"
)
// Wire field names used by the Redis Streams payload. Frozen by
// `rtmanager/api/runtime-jobs-asyncapi.yaml`; renaming any of them
// breaks consumers.
const (
fieldGameID = "game_id"
fieldOutcome = "outcome"
fieldContainerID = "container_id"
fieldEngineEndpoint = "engine_endpoint"
fieldErrorCode = "error_code"
fieldErrorMessage = "error_message"
)
// Config groups the dependencies and stream name required to construct
// a Publisher.
type Config struct {
// Client appends entries to the Redis Stream. Must be non-nil.
Client *redis.Client
// Stream stores the Redis Stream key job results are published to
// (e.g. `runtime:job_results`). Must not be empty.
Stream string
}
// Publisher implements `ports.JobResultPublisher` on top of a shared
// Redis client.
type Publisher struct {
client *redis.Client
stream string
}
// NewPublisher constructs one Publisher from cfg. Validation errors
// surface the missing collaborator verbatim.
func NewPublisher(cfg Config) (*Publisher, error) {
if cfg.Client == nil {
return nil, errors.New("new rtmanager job results publisher: nil redis client")
}
if strings.TrimSpace(cfg.Stream) == "" {
return nil, errors.New("new rtmanager job results publisher: stream must not be empty")
}
return &Publisher{
client: cfg.Client,
stream: cfg.Stream,
}, nil
}
// Publish XADDs result to the configured Redis Stream. The wire payload
// includes every field declared as required by the AsyncAPI schema —
// empty strings are kept so consumers always see the documented keys.
func (publisher *Publisher) Publish(ctx context.Context, result ports.JobResult) error {
if publisher == nil || publisher.client == nil {
return errors.New("publish job result: nil publisher")
}
if ctx == nil {
return errors.New("publish job result: nil context")
}
if err := result.Validate(); err != nil {
return fmt.Errorf("publish job result: %w", err)
}
values := map[string]any{
fieldGameID: result.GameID,
fieldOutcome: result.Outcome,
fieldContainerID: result.ContainerID,
fieldEngineEndpoint: result.EngineEndpoint,
fieldErrorCode: result.ErrorCode,
fieldErrorMessage: result.ErrorMessage,
}
if err := publisher.client.XAdd(ctx, &redis.XAddArgs{
Stream: publisher.stream,
Values: values,
}).Err(); err != nil {
return fmt.Errorf("publish job result: xadd: %w", err)
}
return nil
}
// Compile-time assertion: Publisher implements ports.JobResultPublisher.
var _ ports.JobResultPublisher = (*Publisher)(nil)
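// Illustrative wiring sketch (not part of this file's API): the shape
// mirrors this commit's tests; the Redis address and the result values
// below are placeholders.
//
//	client := redis.NewClient(&redis.Options{Addr: "localhost:6379"})
//	publisher, err := NewPublisher(Config{
//		Client: client,
//		Stream: "runtime:job_results",
//	})
//	if err != nil {
//		// handle construction error
//	}
//	// Every required wire field is XADDed, empty strings included.
//	_ = publisher.Publish(context.Background(), ports.JobResult{
//		GameID:         "game-1",
//		Outcome:        ports.JobOutcomeSuccess,
//		ContainerID:    "c-1",
//		EngineEndpoint: "http://galaxy-game-game-1:8080",
//	})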
@@ -0,0 +1,142 @@
package jobresultspublisher_test
import (
"context"
"testing"
"galaxy/rtmanager/internal/adapters/jobresultspublisher"
"galaxy/rtmanager/internal/ports"
"github.com/alicebob/miniredis/v2"
"github.com/redis/go-redis/v9"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func newPublisher(t *testing.T) (*jobresultspublisher.Publisher, *redis.Client) {
t.Helper()
server := miniredis.RunT(t)
client := redis.NewClient(&redis.Options{Addr: server.Addr()})
t.Cleanup(func() { _ = client.Close() })
publisher, err := jobresultspublisher.NewPublisher(jobresultspublisher.Config{
Client: client,
Stream: "runtime:job_results",
})
require.NoError(t, err)
return publisher, client
}
func TestNewPublisherRejectsMissingCollaborators(t *testing.T) {
_, err := jobresultspublisher.NewPublisher(jobresultspublisher.Config{})
require.Error(t, err)
server := miniredis.RunT(t)
client := redis.NewClient(&redis.Options{Addr: server.Addr()})
t.Cleanup(func() { _ = client.Close() })
_, err = jobresultspublisher.NewPublisher(jobresultspublisher.Config{Client: client})
require.Error(t, err)
_, err = jobresultspublisher.NewPublisher(jobresultspublisher.Config{Client: client, Stream: " "})
require.Error(t, err)
}
func TestPublishRejectsInvalidResult(t *testing.T) {
publisher, _ := newPublisher(t)
require.Error(t, publisher.Publish(context.Background(), ports.JobResult{}))
require.Error(t, publisher.Publish(context.Background(), ports.JobResult{
GameID: "game-1",
Outcome: "weird",
}))
}
func TestPublishStartSuccessXAddsAllRequiredFields(t *testing.T) {
publisher, client := newPublisher(t)
result := ports.JobResult{
GameID: "game-1",
Outcome: ports.JobOutcomeSuccess,
ContainerID: "c-1",
EngineEndpoint: "http://galaxy-game-game-1:8080",
ErrorCode: "",
ErrorMessage: "",
}
require.NoError(t, publisher.Publish(context.Background(), result))
entries, err := client.XRange(context.Background(), "runtime:job_results", "-", "+").Result()
require.NoError(t, err)
require.Len(t, entries, 1)
values := entries[0].Values
assert.Equal(t, "game-1", values["game_id"])
assert.Equal(t, "success", values["outcome"])
assert.Equal(t, "c-1", values["container_id"])
assert.Equal(t, "http://galaxy-game-game-1:8080", values["engine_endpoint"])
assert.Equal(t, "", values["error_code"])
assert.Equal(t, "", values["error_message"])
}
func TestPublishFailureXAddsEmptyContainerAndEndpoint(t *testing.T) {
publisher, client := newPublisher(t)
result := ports.JobResult{
GameID: "game-2",
Outcome: ports.JobOutcomeFailure,
ErrorCode: "image_pull_failed",
ErrorMessage: "manifest unknown",
}
require.NoError(t, publisher.Publish(context.Background(), result))
entries, err := client.XRange(context.Background(), "runtime:job_results", "-", "+").Result()
require.NoError(t, err)
require.Len(t, entries, 1)
values := entries[0].Values
assert.Equal(t, "game-2", values["game_id"])
assert.Equal(t, "failure", values["outcome"])
assert.Equal(t, "", values["container_id"], "failure must publish empty container id")
assert.Equal(t, "", values["engine_endpoint"], "failure must publish empty engine endpoint")
assert.Equal(t, "image_pull_failed", values["error_code"])
assert.Equal(t, "manifest unknown", values["error_message"])
}
func TestPublishReplayNoOpKeepsContainerAndEndpoint(t *testing.T) {
publisher, client := newPublisher(t)
result := ports.JobResult{
GameID: "game-3",
Outcome: ports.JobOutcomeSuccess,
ContainerID: "c-3",
EngineEndpoint: "http://galaxy-game-game-3:8080",
ErrorCode: "replay_no_op",
}
require.NoError(t, publisher.Publish(context.Background(), result))
entries, err := client.XRange(context.Background(), "runtime:job_results", "-", "+").Result()
require.NoError(t, err)
require.Len(t, entries, 1)
values := entries[0].Values
assert.Equal(t, "game-3", values["game_id"])
assert.Equal(t, "success", values["outcome"])
assert.Equal(t, "c-3", values["container_id"])
assert.Equal(t, "http://galaxy-game-game-3:8080", values["engine_endpoint"])
assert.Equal(t, "replay_no_op", values["error_code"])
assert.Equal(t, "", values["error_message"])
}
func TestPublishFailsOnClosedClient(t *testing.T) {
server := miniredis.RunT(t)
client := redis.NewClient(&redis.Options{Addr: server.Addr()})
publisher, err := jobresultspublisher.NewPublisher(jobresultspublisher.Config{
Client: client,
Stream: "runtime:job_results",
})
require.NoError(t, err)
require.NoError(t, client.Close())
err = publisher.Publish(context.Background(), ports.JobResult{
GameID: "game-4",
Outcome: ports.JobOutcomeSuccess,
})
require.Error(t, err)
}
@@ -0,0 +1,219 @@
// Package lobbyclient provides the trusted-internal Lobby REST client
// Runtime Manager uses to fetch ancillary game metadata for diagnostics.
//
// The client is intentionally minimal: the GetGame lookup is diagnostic
// only, because the start envelope already carries the one field the
// start path requires (`image_ref`). A missing game maps to
// `ports.ErrLobbyGameNotFound`, while transport faults and unexpected
// responses surface as `ports.ErrLobbyUnavailable`, so callers can tell
// the two apart and continue without aborting the start operation.
package lobbyclient
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"net/url"
"strings"
"time"
"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
"galaxy/rtmanager/internal/ports"
)
const (
getGamePathSuffix = "/api/v1/internal/games/%s"
)
// Config configures one HTTP-backed Lobby internal client.
type Config struct {
// BaseURL stores the absolute base URL of the Lobby internal HTTP
// listener (e.g. `http://lobby:8095`).
BaseURL string
// RequestTimeout bounds one outbound lookup request.
RequestTimeout time.Duration
}
// Client resolves Lobby game records through the trusted internal HTTP
// API.
type Client struct {
baseURL string
requestTimeout time.Duration
httpClient *http.Client
closeIdleConnections func()
}
type gameRecordEnvelope struct {
GameID string `json:"game_id"`
Status string `json:"status"`
TargetEngineVersion string `json:"target_engine_version"`
}
type errorEnvelope struct {
Error *errorBody `json:"error"`
}
type errorBody struct {
Code string `json:"code"`
Message string `json:"message"`
}
// NewClient constructs a Lobby internal client that uses
// repository-standard HTTP transport instrumentation through otelhttp.
// The transport is a clone of http.DefaultTransport, so the production
// wiring stays isolated from other users of the shared default transport.
func NewClient(cfg Config) (*Client, error) {
transport, ok := http.DefaultTransport.(*http.Transport)
if !ok {
return nil, errors.New("new lobby internal client: default transport is not *http.Transport")
}
cloned := transport.Clone()
return newClient(cfg, &http.Client{Transport: otelhttp.NewTransport(cloned)}, cloned.CloseIdleConnections)
}
func newClient(cfg Config, httpClient *http.Client, closeIdleConnections func()) (*Client, error) {
switch {
case strings.TrimSpace(cfg.BaseURL) == "":
return nil, errors.New("new lobby internal client: base URL must not be empty")
case cfg.RequestTimeout <= 0:
return nil, errors.New("new lobby internal client: request timeout must be positive")
case httpClient == nil:
return nil, errors.New("new lobby internal client: http client must not be nil")
}
parsed, err := url.Parse(strings.TrimRight(strings.TrimSpace(cfg.BaseURL), "/"))
if err != nil {
return nil, fmt.Errorf("new lobby internal client: parse base URL: %w", err)
}
if parsed.Scheme == "" || parsed.Host == "" {
return nil, errors.New("new lobby internal client: base URL must be absolute")
}
return &Client{
baseURL: parsed.String(),
requestTimeout: cfg.RequestTimeout,
httpClient: httpClient,
closeIdleConnections: closeIdleConnections,
}, nil
}
// Close releases idle HTTP connections owned by the client transport.
// Call once on shutdown.
func (client *Client) Close() error {
if client == nil || client.closeIdleConnections == nil {
return nil
}
client.closeIdleConnections()
return nil
}
// GetGame returns the Lobby game record for gameID. It maps Lobby's
// `404 not_found` to `ports.ErrLobbyGameNotFound`; every other failure
// (transport, timeout, non-2xx response) maps to
// `ports.ErrLobbyUnavailable` wrapped with the original error so callers
// keep the diagnostic detail.
func (client *Client) GetGame(ctx context.Context, gameID string) (ports.LobbyGameRecord, error) {
if client == nil || client.httpClient == nil {
return ports.LobbyGameRecord{}, errors.New("lobby get game: nil client")
}
if ctx == nil {
return ports.LobbyGameRecord{}, errors.New("lobby get game: nil context")
}
if err := ctx.Err(); err != nil {
return ports.LobbyGameRecord{}, err
}
if strings.TrimSpace(gameID) == "" {
return ports.LobbyGameRecord{}, errors.New("lobby get game: game id must not be empty")
}
payload, statusCode, err := client.doRequest(ctx, http.MethodGet, fmt.Sprintf(getGamePathSuffix, url.PathEscape(gameID)))
if err != nil {
return ports.LobbyGameRecord{}, fmt.Errorf("%w: %w", ports.ErrLobbyUnavailable, err)
}
switch statusCode {
case http.StatusOK:
var envelope gameRecordEnvelope
if err := decodeJSONPayload(payload, &envelope); err != nil {
return ports.LobbyGameRecord{}, fmt.Errorf("%w: decode success response: %w", ports.ErrLobbyUnavailable, err)
}
if strings.TrimSpace(envelope.GameID) == "" {
return ports.LobbyGameRecord{}, fmt.Errorf("%w: success response missing game_id", ports.ErrLobbyUnavailable)
}
return ports.LobbyGameRecord{
GameID: envelope.GameID,
Status: envelope.Status,
TargetEngineVersion: envelope.TargetEngineVersion,
}, nil
case http.StatusNotFound:
return ports.LobbyGameRecord{}, ports.ErrLobbyGameNotFound
default:
errorCode := decodeErrorCode(payload)
if errorCode != "" {
return ports.LobbyGameRecord{}, fmt.Errorf("%w: unexpected status %d (error_code=%s)", ports.ErrLobbyUnavailable, statusCode, errorCode)
}
return ports.LobbyGameRecord{}, fmt.Errorf("%w: unexpected status %d", ports.ErrLobbyUnavailable, statusCode)
}
}
func (client *Client) doRequest(ctx context.Context, method, requestPath string) ([]byte, int, error) {
attemptCtx, cancel := context.WithTimeout(ctx, client.requestTimeout)
defer cancel()
req, err := http.NewRequestWithContext(attemptCtx, method, client.baseURL+requestPath, nil)
if err != nil {
return nil, 0, fmt.Errorf("build request: %w", err)
}
req.Header.Set("Accept", "application/json")
resp, err := client.httpClient.Do(req)
if err != nil {
return nil, 0, err
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return nil, 0, fmt.Errorf("read response body: %w", err)
}
return body, resp.StatusCode, nil
}
// decodeJSONPayload tolerantly decodes a JSON object; unknown fields
// are ignored so additive Lobby schema changes do not break us.
func decodeJSONPayload(payload []byte, target any) error {
decoder := json.NewDecoder(bytes.NewReader(payload))
if err := decoder.Decode(target); err != nil {
return err
}
if err := decoder.Decode(&struct{}{}); !errors.Is(err, io.EOF) {
if err == nil {
return errors.New("unexpected trailing JSON input")
}
return err
}
return nil
}
func decodeErrorCode(payload []byte) string {
if len(payload) == 0 {
return ""
}
var envelope errorEnvelope
if err := json.Unmarshal(payload, &envelope); err != nil {
return ""
}
if envelope.Error == nil {
return ""
}
return envelope.Error.Code
}
// Compile-time assertion: Client implements ports.LobbyInternalClient.
var _ ports.LobbyInternalClient = (*Client)(nil)
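// Illustrative call-site sketch (the base URL, timeout, and game id are
// placeholders; the error mapping follows the GetGame contract documented
// above).
//
//	client, err := NewClient(Config{
//		BaseURL:        "http://lobby:8095",
//		RequestTimeout: 2 * time.Second,
//	})
//	if err != nil {
//		// handle construction error
//	}
//	defer client.Close()
//	record, err := client.GetGame(context.Background(), "game-1")
//	switch {
//	case errors.Is(err, ports.ErrLobbyGameNotFound):
//		// game unknown to Lobby; continue without metadata
//	case errors.Is(err, ports.ErrLobbyUnavailable):
//		// transport fault or unexpected response; continue without metadata
//	case err == nil:
//		_ = record.TargetEngineVersion // diagnostics only
//	}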
@@ -0,0 +1,153 @@
package lobbyclient
import (
"context"
"errors"
"net/http"
"net/http/httptest"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"galaxy/rtmanager/internal/ports"
)
func newTestClient(t *testing.T, baseURL string, timeout time.Duration) *Client {
t.Helper()
client, err := NewClient(Config{BaseURL: baseURL, RequestTimeout: timeout})
require.NoError(t, err)
t.Cleanup(func() { _ = client.Close() })
return client
}
func TestNewClientValidatesConfig(t *testing.T) {
cases := map[string]Config{
"empty base url": {BaseURL: "", RequestTimeout: time.Second},
"non-absolute base url": {BaseURL: "lobby:8095", RequestTimeout: time.Second},
"non-positive timeout": {BaseURL: "http://lobby:8095", RequestTimeout: 0},
}
for name, cfg := range cases {
t.Run(name, func(t *testing.T) {
_, err := NewClient(cfg)
require.Error(t, err)
})
}
}
func TestGetGameSuccess(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
require.Equal(t, http.MethodGet, r.Method)
require.Equal(t, "/api/v1/internal/games/game-1", r.URL.Path)
require.Equal(t, "application/json", r.Header.Get("Accept"))
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte(`{
"game_id": "game-1",
"game_name": "Sample",
"status": "running",
"target_engine_version": "1.4.2",
"current_turn": 0,
"runtime_status": "running"
}`))
}))
defer server.Close()
client := newTestClient(t, server.URL, time.Second)
got, err := client.GetGame(context.Background(), "game-1")
require.NoError(t, err)
assert.Equal(t, "game-1", got.GameID)
assert.Equal(t, "running", got.Status)
assert.Equal(t, "1.4.2", got.TargetEngineVersion)
}
func TestGetGameNotFound(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusNotFound)
_, _ = w.Write([]byte(`{"error":{"code":"not_found","message":"no such game"}}`))
}))
defer server.Close()
client := newTestClient(t, server.URL, time.Second)
_, err := client.GetGame(context.Background(), "missing")
require.Error(t, err)
assert.True(t, errors.Is(err, ports.ErrLobbyGameNotFound))
assert.False(t, errors.Is(err, ports.ErrLobbyUnavailable))
}
func TestGetGameInternalErrorMapsToUnavailable(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusInternalServerError)
_, _ = w.Write([]byte(`{"error":{"code":"internal_error","message":"boom"}}`))
}))
defer server.Close()
client := newTestClient(t, server.URL, time.Second)
_, err := client.GetGame(context.Background(), "x")
require.Error(t, err)
assert.True(t, errors.Is(err, ports.ErrLobbyUnavailable))
assert.Contains(t, err.Error(), "500")
assert.Contains(t, err.Error(), "internal_error")
}
func TestGetGameTimeoutMapsToUnavailable(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
time.Sleep(150 * time.Millisecond)
_, _ = w.Write([]byte(`{}`))
}))
defer server.Close()
client := newTestClient(t, server.URL, 50*time.Millisecond)
_, err := client.GetGame(context.Background(), "x")
require.Error(t, err)
assert.True(t, errors.Is(err, ports.ErrLobbyUnavailable))
}
func TestGetGameSuccessMissingGameIDIsUnavailable(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
_, _ = w.Write([]byte(`{"status":"running"}`))
}))
defer server.Close()
client := newTestClient(t, server.URL, time.Second)
_, err := client.GetGame(context.Background(), "x")
require.Error(t, err)
assert.True(t, errors.Is(err, ports.ErrLobbyUnavailable))
assert.Contains(t, err.Error(), "missing game_id")
}
func TestGetGameRejectsBadInput(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
t.Fatal("must not contact lobby on bad input")
}))
defer server.Close()
client := newTestClient(t, server.URL, time.Second)
t.Run("empty game id", func(t *testing.T) {
_, err := client.GetGame(context.Background(), " ")
require.Error(t, err)
assert.Contains(t, err.Error(), "game id")
})
t.Run("canceled context", func(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
cancel()
_, err := client.GetGame(ctx, "x")
require.Error(t, err)
assert.True(t, errors.Is(err, context.Canceled))
})
}
func TestCloseReleasesConnections(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
_, _ = w.Write([]byte(`{"game_id":"x","status":"running","target_engine_version":"1.0.0"}`))
}))
defer server.Close()
client := newTestClient(t, server.URL, time.Second)
_, err := client.GetGame(context.Background(), "x")
require.NoError(t, err)
assert.NoError(t, client.Close())
assert.NoError(t, client.Close()) // idempotent
}
@@ -0,0 +1,70 @@
// Package notificationpublisher provides the Redis-Streams-backed
// notification-intent publisher Runtime Manager uses to emit admin-only
// failure notifications. The adapter is a thin shim over
// `galaxy/notificationintent.Publisher` that drops the entry id at the
// wrapper boundary; rationale lives in
// `rtmanager/docs/domain-and-ports.md §7`.
package notificationpublisher
import (
"context"
"errors"
"fmt"
"github.com/redis/go-redis/v9"
"galaxy/notificationintent"
"galaxy/rtmanager/internal/ports"
)
// Config groups the dependencies and stream name required to
// construct a Publisher.
type Config struct {
// Client appends entries to Redis Streams. Must be non-nil.
Client *redis.Client
// Stream stores the Redis Stream key intents are published to.
// When empty, `notificationintent.DefaultIntentsStream` is used.
Stream string
}
// Publisher implements `ports.NotificationIntentPublisher` on top of
// the shared `notificationintent.Publisher`. The wrapper is the single
// point that drops the entry id returned by the underlying publisher.
type Publisher struct {
inner *notificationintent.Publisher
}
// NewPublisher constructs a Publisher from cfg. It wraps the shared
// publisher and delegates validation; transport errors and validation
// errors propagate verbatim.
func NewPublisher(cfg Config) (*Publisher, error) {
if cfg.Client == nil {
return nil, errors.New("new rtmanager notification publisher: nil redis client")
}
inner, err := notificationintent.NewPublisher(notificationintent.PublisherConfig{
Client: cfg.Client,
Stream: cfg.Stream,
})
if err != nil {
return nil, fmt.Errorf("new rtmanager notification publisher: %w", err)
}
return &Publisher{inner: inner}, nil
}
// Publish forwards intent to the underlying notificationintent
// publisher and discards the resulting Redis Stream entry id. A failed
// publish surfaces as the underlying error.
func (publisher *Publisher) Publish(ctx context.Context, intent notificationintent.Intent) error {
if publisher == nil || publisher.inner == nil {
return errors.New("publish notification intent: nil publisher")
}
if _, err := publisher.inner.Publish(ctx, intent); err != nil {
return err
}
return nil
}
// Compile-time assertion: Publisher implements
// ports.NotificationIntentPublisher.
var _ ports.NotificationIntentPublisher = (*Publisher)(nil)
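// Illustrative publish sketch (mirrors the package tests; redisClient, the
// idempotency key, and the payload values are placeholders):
//
//	publisher, err := NewPublisher(Config{
//		Client: redisClient, // empty Stream falls back to notificationintent.DefaultIntentsStream
//	})
//	if err != nil {
//		// handle construction error
//	}
//	intent, err := notificationintent.NewRuntimeImagePullFailedIntent(
//		notificationintent.Metadata{
//			IdempotencyKey: "rtmanager:start:game-1:abc",
//			OccurredAt:     time.Now().UTC(),
//		},
//		notificationintent.RuntimeImagePullFailedPayload{
//			GameID:        "game-1",
//			ImageRef:      "galaxy/game:1.4.2",
//			ErrorCode:     "image_pull_failed",
//			ErrorMessage:  "registry timeout",
//			AttemptedAtMs: time.Now().UnixMilli(),
//		},
//	)
//	if err == nil {
//		_ = publisher.Publish(context.Background(), intent)
//	}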
@@ -0,0 +1,123 @@
package notificationpublisher
import (
"context"
"encoding/json"
"testing"
"time"
"github.com/alicebob/miniredis/v2"
"github.com/redis/go-redis/v9"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"galaxy/notificationintent"
)
func newRedis(t *testing.T) (*redis.Client, *miniredis.Miniredis) {
t.Helper()
server := miniredis.RunT(t)
client := redis.NewClient(&redis.Options{Addr: server.Addr()})
t.Cleanup(func() { _ = client.Close() })
return client, server
}
func readStream(t *testing.T, client *redis.Client, stream string) []redis.XMessage {
t.Helper()
messages, err := client.XRange(context.Background(), stream, "-", "+").Result()
require.NoError(t, err)
return messages
}
func TestNewPublisherValidation(t *testing.T) {
t.Run("nil client", func(t *testing.T) {
_, err := NewPublisher(Config{})
require.Error(t, err)
assert.Contains(t, err.Error(), "nil redis client")
})
}
func TestPublisherWritesIntent(t *testing.T) {
client, _ := newRedis(t)
publisher, err := NewPublisher(Config{Client: client, Stream: "notification:intents"})
require.NoError(t, err)
intent, err := notificationintent.NewRuntimeImagePullFailedIntent(
notificationintent.Metadata{
IdempotencyKey: "rtmanager:start:game-1:abc",
OccurredAt: time.UnixMilli(1714200000000).UTC(),
},
notificationintent.RuntimeImagePullFailedPayload{
GameID: "game-1",
ImageRef: "galaxy/game:1.4.2",
ErrorCode: "image_pull_failed",
ErrorMessage: "registry timeout",
AttemptedAtMs: 1714200000000,
},
)
require.NoError(t, err)
require.NoError(t, publisher.Publish(context.Background(), intent))
messages := readStream(t, client, "notification:intents")
require.Len(t, messages, 1)
values := messages[0].Values
assert.Equal(t, "runtime.image_pull_failed", values["notification_type"])
assert.Equal(t, "runtime_manager", values["producer"])
assert.Equal(t, "admin_email", values["audience_kind"])
assert.Equal(t, "rtmanager:start:game-1:abc", values["idempotency_key"])
// recipient_user_ids_json must be absent for admin_email audience.
_, hasRecipients := values["recipient_user_ids_json"]
assert.False(t, hasRecipients)
payloadRaw, ok := values["payload_json"].(string)
require.True(t, ok)
var payload map[string]any
require.NoError(t, json.Unmarshal([]byte(payloadRaw), &payload))
assert.Equal(t, "game-1", payload["game_id"])
assert.Equal(t, "galaxy/game:1.4.2", payload["image_ref"])
}
func TestPublisherForwardsValidationError(t *testing.T) {
client, _ := newRedis(t)
publisher, err := NewPublisher(Config{Client: client})
require.NoError(t, err)
// Intent with a zero OccurredAt fails the shared validator.
bad := notificationintent.Intent{
NotificationType: notificationintent.NotificationTypeRuntimeImagePullFailed,
Producer: notificationintent.ProducerRuntimeManager,
AudienceKind: notificationintent.AudienceKindAdminEmail,
IdempotencyKey: "k",
PayloadJSON: `{"game_id":"g","image_ref":"r","error_code":"c","error_message":"m","attempted_at_ms":1}`,
}
require.Error(t, publisher.Publish(context.Background(), bad))
}
func TestPublisherDefaultsStreamName(t *testing.T) {
client, _ := newRedis(t)
publisher, err := NewPublisher(Config{Client: client, Stream: ""})
require.NoError(t, err)
intent, err := notificationintent.NewRuntimeContainerStartFailedIntent(
notificationintent.Metadata{
IdempotencyKey: "k",
OccurredAt: time.UnixMilli(1714200000000).UTC(),
},
notificationintent.RuntimeContainerStartFailedPayload{
GameID: "g",
ImageRef: "r",
ErrorCode: "container_start_failed",
ErrorMessage: "boom",
AttemptedAtMs: 1714200000000,
},
)
require.NoError(t, err)
require.NoError(t, publisher.Publish(context.Background(), intent))
messages := readStream(t, client, notificationintent.DefaultIntentsStream)
require.Len(t, messages, 1)
}
@@ -0,0 +1,203 @@
// Package healthsnapshotstore implements the PostgreSQL-backed adapter
// for `ports.HealthSnapshotStore`.
//
// The package owns the on-disk shape of the `health_snapshots` table
// defined in
// `galaxy/rtmanager/internal/adapters/postgres/migrations/00001_init.sql`
// and translates the schema-agnostic `ports.HealthSnapshotStore` interface
// declared in `internal/ports/healthsnapshotstore.go` into concrete
// go-jet/v2 statements driven by the pgx driver.
//
// The `details` jsonb column round-trips as a `json.RawMessage`. Empty
// payloads are substituted with the SQL default `{}` on Upsert so the
// CHECK constraints and downstream readers never observe a non-JSON
// empty string.
package healthsnapshotstore
import (
"context"
"database/sql"
"encoding/json"
"errors"
"fmt"
"strings"
"time"
"galaxy/rtmanager/internal/adapters/postgres/internal/sqlx"
pgtable "galaxy/rtmanager/internal/adapters/postgres/jet/rtmanager/table"
"galaxy/rtmanager/internal/domain/health"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/ports"
pg "github.com/go-jet/jet/v2/postgres"
)
// emptyDetails is the canonical jsonb payload installed when the caller
// supplies an empty Details slice. It matches the SQL DEFAULT for the
// column.
const emptyDetails = "{}"
// Config configures one PostgreSQL-backed health-snapshot store instance.
type Config struct {
// DB stores the connection pool the store uses for every query.
DB *sql.DB
// OperationTimeout bounds one round trip.
OperationTimeout time.Duration
}
// Store persists Runtime Manager health snapshots in PostgreSQL.
type Store struct {
db *sql.DB
operationTimeout time.Duration
}
// New constructs one PostgreSQL-backed health-snapshot store from cfg.
func New(cfg Config) (*Store, error) {
if cfg.DB == nil {
return nil, errors.New("new postgres health snapshot store: db must not be nil")
}
if cfg.OperationTimeout <= 0 {
return nil, errors.New("new postgres health snapshot store: operation timeout must be positive")
}
return &Store{
db: cfg.DB,
operationTimeout: cfg.OperationTimeout,
}, nil
}
// healthSnapshotSelectColumns is the canonical SELECT list for the
// health_snapshots table, matching scanSnapshot's column order.
var healthSnapshotSelectColumns = pg.ColumnList{
pgtable.HealthSnapshots.GameID,
pgtable.HealthSnapshots.ContainerID,
pgtable.HealthSnapshots.Status,
pgtable.HealthSnapshots.Source,
pgtable.HealthSnapshots.Details,
pgtable.HealthSnapshots.ObservedAt,
}
// Upsert installs snapshot as the latest observation for snapshot.GameID.
// snapshot is validated through health.HealthSnapshot.Validate before the
// SQL is issued.
func (store *Store) Upsert(ctx context.Context, snapshot health.HealthSnapshot) error {
if store == nil || store.db == nil {
return errors.New("upsert health snapshot: nil store")
}
if err := snapshot.Validate(); err != nil {
return fmt.Errorf("upsert health snapshot: %w", err)
}
operationCtx, cancel, err := sqlx.WithTimeout(ctx, "upsert health snapshot", store.operationTimeout)
if err != nil {
return err
}
defer cancel()
details := emptyDetails
if len(snapshot.Details) > 0 {
details = string(snapshot.Details)
}
stmt := pgtable.HealthSnapshots.INSERT(
pgtable.HealthSnapshots.GameID,
pgtable.HealthSnapshots.ContainerID,
pgtable.HealthSnapshots.Status,
pgtable.HealthSnapshots.Source,
pgtable.HealthSnapshots.Details,
pgtable.HealthSnapshots.ObservedAt,
).VALUES(
snapshot.GameID,
snapshot.ContainerID,
string(snapshot.Status),
string(snapshot.Source),
details,
snapshot.ObservedAt.UTC(),
).ON_CONFLICT(pgtable.HealthSnapshots.GameID).DO_UPDATE(
pg.SET(
pgtable.HealthSnapshots.ContainerID.SET(pgtable.HealthSnapshots.EXCLUDED.ContainerID),
pgtable.HealthSnapshots.Status.SET(pgtable.HealthSnapshots.EXCLUDED.Status),
pgtable.HealthSnapshots.Source.SET(pgtable.HealthSnapshots.EXCLUDED.Source),
pgtable.HealthSnapshots.Details.SET(pgtable.HealthSnapshots.EXCLUDED.Details),
pgtable.HealthSnapshots.ObservedAt.SET(pgtable.HealthSnapshots.EXCLUDED.ObservedAt),
),
)
query, args := stmt.Sql()
if _, err := store.db.ExecContext(operationCtx, query, args...); err != nil {
return fmt.Errorf("upsert health snapshot: %w", err)
}
return nil
}
// Get returns the latest snapshot for gameID. It returns
// runtime.ErrNotFound when no snapshot has been recorded yet.
func (store *Store) Get(ctx context.Context, gameID string) (health.HealthSnapshot, error) {
if store == nil || store.db == nil {
return health.HealthSnapshot{}, errors.New("get health snapshot: nil store")
}
if strings.TrimSpace(gameID) == "" {
return health.HealthSnapshot{}, fmt.Errorf("get health snapshot: game id must not be empty")
}
operationCtx, cancel, err := sqlx.WithTimeout(ctx, "get health snapshot", store.operationTimeout)
if err != nil {
return health.HealthSnapshot{}, err
}
defer cancel()
stmt := pg.SELECT(healthSnapshotSelectColumns).
FROM(pgtable.HealthSnapshots).
WHERE(pgtable.HealthSnapshots.GameID.EQ(pg.String(gameID)))
query, args := stmt.Sql()
row := store.db.QueryRowContext(operationCtx, query, args...)
snapshot, err := scanSnapshot(row)
if sqlx.IsNoRows(err) {
return health.HealthSnapshot{}, runtime.ErrNotFound
}
if err != nil {
return health.HealthSnapshot{}, fmt.Errorf("get health snapshot: %w", err)
}
return snapshot, nil
}
// rowScanner abstracts *sql.Row and *sql.Rows so scanSnapshot can be
// shared across both single-row reads and iterated reads.
type rowScanner interface {
Scan(dest ...any) error
}
// scanSnapshot scans one health_snapshots row from rs.
func scanSnapshot(rs rowScanner) (health.HealthSnapshot, error) {
var (
gameID string
containerID string
status string
source string
details []byte
observedAt time.Time
)
if err := rs.Scan(
&gameID,
&containerID,
&status,
&source,
&details,
&observedAt,
); err != nil {
return health.HealthSnapshot{}, err
}
return health.HealthSnapshot{
GameID: gameID,
ContainerID: containerID,
Status: health.SnapshotStatus(status),
Source: health.SnapshotSource(source),
Details: json.RawMessage(details),
ObservedAt: observedAt.UTC(),
}, nil
}
// Ensure Store satisfies the ports.HealthSnapshotStore interface at
// compile time.
var _ ports.HealthSnapshotStore = (*Store)(nil)
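// Illustrative round trip (the pool and the snapshot values are
// placeholders; the shape mirrors the package tests):
//
//	store, err := New(Config{
//		DB:               pool,
//		OperationTimeout: 10 * time.Second,
//	})
//	if err != nil {
//		// handle construction error
//	}
//	err = store.Upsert(ctx, health.HealthSnapshot{
//		GameID:      "game-001",
//		ContainerID: "container-1",
//		Status:      health.SnapshotStatusHealthy,
//		Source:      health.SnapshotSourceDockerEvent,
//		Details:     nil, // persisted as the SQL default {}
//		ObservedAt:  time.Now().UTC(),
//	})
//	got, err := store.Get(ctx, "game-001") // runtime.ErrNotFound when absent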
@@ -0,0 +1,157 @@
package healthsnapshotstore_test
import (
"context"
"encoding/json"
"testing"
"time"
"galaxy/rtmanager/internal/adapters/postgres/healthsnapshotstore"
"galaxy/rtmanager/internal/adapters/postgres/internal/pgtest"
"galaxy/rtmanager/internal/domain/health"
"galaxy/rtmanager/internal/domain/runtime"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestMain(m *testing.M) { pgtest.RunMain(m) }
func newStore(t *testing.T) *healthsnapshotstore.Store {
t.Helper()
pgtest.TruncateAll(t)
store, err := healthsnapshotstore.New(healthsnapshotstore.Config{
DB: pgtest.Ensure(t).Pool(),
OperationTimeout: pgtest.OperationTimeout,
})
require.NoError(t, err)
return store
}
func probeFailedSnapshot(gameID string, observedAt time.Time) health.HealthSnapshot {
return health.HealthSnapshot{
GameID: gameID,
ContainerID: "container-1",
Status: health.SnapshotStatusProbeFailed,
Source: health.SnapshotSourceProbe,
Details: json.RawMessage(`{"consecutive_failures":3,"last_status":503,"last_error":"timeout"}`),
ObservedAt: observedAt,
}
}
func TestUpsertAndGetRoundTrip(t *testing.T) {
ctx := context.Background()
store := newStore(t)
snapshot := probeFailedSnapshot("game-001",
time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC))
require.NoError(t, store.Upsert(ctx, snapshot))
got, err := store.Get(ctx, "game-001")
require.NoError(t, err)
assert.Equal(t, snapshot.GameID, got.GameID)
assert.Equal(t, snapshot.ContainerID, got.ContainerID)
assert.Equal(t, snapshot.Status, got.Status)
assert.Equal(t, snapshot.Source, got.Source)
assert.JSONEq(t, string(snapshot.Details), string(got.Details))
assert.True(t, snapshot.ObservedAt.Equal(got.ObservedAt))
assert.Equal(t, time.UTC, got.ObservedAt.Location())
}
func TestUpsertOverwritesPriorSnapshot(t *testing.T) {
ctx := context.Background()
store := newStore(t)
first := probeFailedSnapshot("game-001",
time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC))
require.NoError(t, store.Upsert(ctx, first))
second := health.HealthSnapshot{
GameID: "game-001",
ContainerID: "container-2",
Status: health.SnapshotStatusHealthy,
Source: health.SnapshotSourceInspect,
Details: json.RawMessage(`{"restart_count":0,"state":"running"}`),
ObservedAt: first.ObservedAt.Add(time.Minute),
}
require.NoError(t, store.Upsert(ctx, second))
got, err := store.Get(ctx, "game-001")
require.NoError(t, err)
assert.Equal(t, "container-2", got.ContainerID)
assert.Equal(t, health.SnapshotStatusHealthy, got.Status)
assert.Equal(t, health.SnapshotSourceInspect, got.Source)
assert.JSONEq(t, string(second.Details), string(got.Details))
assert.True(t, second.ObservedAt.Equal(got.ObservedAt))
}
func TestGetReturnsNotFound(t *testing.T) {
ctx := context.Background()
store := newStore(t)
_, err := store.Get(ctx, "game-missing")
require.ErrorIs(t, err, runtime.ErrNotFound)
}
func TestUpsertEmptyDetailsRoundTripsAsEmptyObject(t *testing.T) {
ctx := context.Background()
store := newStore(t)
snapshot := probeFailedSnapshot("game-001",
time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC))
snapshot.Details = nil
require.NoError(t, store.Upsert(ctx, snapshot))
got, err := store.Get(ctx, "game-001")
require.NoError(t, err)
assert.JSONEq(t, "{}", string(got.Details),
"empty json.RawMessage must round-trip as the SQL default {}, got %q",
string(got.Details))
}
func TestUpsertValidatesSnapshot(t *testing.T) {
ctx := context.Background()
store := newStore(t)
tests := []struct {
name string
mutate func(*health.HealthSnapshot)
}{
{"empty game id", func(s *health.HealthSnapshot) { s.GameID = "" }},
{"unknown status", func(s *health.HealthSnapshot) { s.Status = "exotic" }},
{"unknown source", func(s *health.HealthSnapshot) { s.Source = "exotic" }},
{"zero observed at", func(s *health.HealthSnapshot) { s.ObservedAt = time.Time{} }},
{"invalid json details", func(s *health.HealthSnapshot) {
s.Details = json.RawMessage("not json")
}},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
snapshot := probeFailedSnapshot("game-001",
time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC))
tt.mutate(&snapshot)
err := store.Upsert(ctx, snapshot)
require.Error(t, err)
})
}
}
func TestGetRejectsEmptyGameID(t *testing.T) {
ctx := context.Background()
store := newStore(t)
_, err := store.Get(ctx, "")
require.Error(t, err)
}
func TestNewRejectsNilDB(t *testing.T) {
_, err := healthsnapshotstore.New(healthsnapshotstore.Config{OperationTimeout: time.Second})
require.Error(t, err)
}
func TestNewRejectsNonPositiveTimeout(t *testing.T) {
_, err := healthsnapshotstore.New(healthsnapshotstore.Config{
DB: pgtest.Ensure(t).Pool(),
})
require.Error(t, err)
}
@@ -0,0 +1,209 @@
// Package pgtest exposes the testcontainers-backed PostgreSQL bootstrap
// shared by every Runtime Manager PG adapter test. The package is regular
// Go code — not a `_test.go` file — so it can be imported by the
// `_test.go` files in the three sibling store packages
// (`runtimerecordstore`, `operationlogstore`, `healthsnapshotstore`).
//
// No production code in `cmd/rtmanager` or in the runtime imports this
// package. The testcontainers-go dependency therefore stays out of the
// production binary's import graph.
package pgtest
import (
"context"
"database/sql"
"net/url"
"os"
"sync"
"testing"
"time"
"galaxy/postgres"
"galaxy/rtmanager/internal/adapters/postgres/migrations"
testcontainers "github.com/testcontainers/testcontainers-go"
tcpostgres "github.com/testcontainers/testcontainers-go/modules/postgres"
"github.com/testcontainers/testcontainers-go/wait"
)
const (
postgresImage = "postgres:16-alpine"
superUser = "galaxy"
superPassword = "galaxy"
superDatabase = "galaxy_rtmanager"
serviceRole = "rtmanagerservice"
servicePassword = "rtmanagerservice"
serviceSchema = "rtmanager"
containerStartup = 90 * time.Second
// OperationTimeout is the per-statement timeout used by every store
// constructed via the per-package newStore helpers. Tests may pass a
// smaller value if they need to assert deadline behaviour explicitly.
OperationTimeout = 10 * time.Second
)
// Env holds the per-process container plus the *sql.DB pool already
// provisioned with the rtmanager schema, role, and migrations applied.
type Env struct {
container *tcpostgres.PostgresContainer
pool *sql.DB
}
// Pool returns the shared pool. Tests truncate per-table state before
// each run via TruncateAll.
func (env *Env) Pool() *sql.DB { return env.pool }
var (
once sync.Once
cur *Env
curErr error
)
// Ensure starts the PostgreSQL container on first invocation and applies
// the embedded goose migrations. Subsequent invocations reuse the same
// container/pool. When Docker is unavailable Ensure calls t.Skip with the
// underlying error so the test suite still passes on machines without
// Docker.
func Ensure(t testing.TB) *Env {
t.Helper()
once.Do(func() {
cur, curErr = start()
})
if curErr != nil {
t.Skipf("postgres container start failed (Docker unavailable?): %v", curErr)
}
return cur
}
// TruncateAll wipes every Runtime Manager table inside the shared pool,
// leaving the schema and indexes intact. Use it from each test that needs
// a clean slate.
func TruncateAll(t testing.TB) {
t.Helper()
env := Ensure(t)
const stmt = `TRUNCATE TABLE runtime_records, operation_log, health_snapshots RESTART IDENTITY CASCADE`
if _, err := env.pool.ExecContext(context.Background(), stmt); err != nil {
t.Fatalf("truncate rtmanager tables: %v", err)
}
}
// Shutdown terminates the shared container and closes the pool. It is
// invoked from each test package's TestMain after `m.Run` returns so the
// container is released even if individual tests panic.
func Shutdown() {
if cur == nil {
return
}
if cur.pool != nil {
_ = cur.pool.Close()
}
if cur.container != nil {
_ = testcontainers.TerminateContainer(cur.container)
}
cur = nil
}
// RunMain is a convenience helper for each store package's TestMain: it
// runs the tests, captures the exit code, shuts the container down, and
// exits. Wiring it through one helper keeps every TestMain to a single
// call.
func RunMain(m *testing.M) {
code := m.Run()
Shutdown()
os.Exit(code)
}
func start() (*Env, error) {
ctx := context.Background()
container, err := tcpostgres.Run(ctx, postgresImage,
tcpostgres.WithDatabase(superDatabase),
tcpostgres.WithUsername(superUser),
tcpostgres.WithPassword(superPassword),
testcontainers.WithWaitStrategy(
wait.ForLog("database system is ready to accept connections").
WithOccurrence(2).
WithStartupTimeout(containerStartup),
),
)
if err != nil {
return nil, err
}
baseDSN, err := container.ConnectionString(ctx, "sslmode=disable")
if err != nil {
_ = testcontainers.TerminateContainer(container)
return nil, err
}
if err := provisionRoleAndSchema(ctx, baseDSN); err != nil {
_ = testcontainers.TerminateContainer(container)
return nil, err
}
scopedDSN, err := dsnForServiceRole(baseDSN)
if err != nil {
_ = testcontainers.TerminateContainer(container)
return nil, err
}
cfg := postgres.DefaultConfig()
cfg.PrimaryDSN = scopedDSN
cfg.OperationTimeout = OperationTimeout
pool, err := postgres.OpenPrimary(ctx, cfg)
if err != nil {
_ = testcontainers.TerminateContainer(container)
return nil, err
}
if err := postgres.Ping(ctx, pool, OperationTimeout); err != nil {
_ = pool.Close()
_ = testcontainers.TerminateContainer(container)
return nil, err
}
if err := postgres.RunMigrations(ctx, pool, migrations.FS(), "."); err != nil {
_ = pool.Close()
_ = testcontainers.TerminateContainer(container)
return nil, err
}
return &Env{container: container, pool: pool}, nil
}
func provisionRoleAndSchema(ctx context.Context, baseDSN string) error {
cfg := postgres.DefaultConfig()
cfg.PrimaryDSN = baseDSN
cfg.OperationTimeout = OperationTimeout
db, err := postgres.OpenPrimary(ctx, cfg)
if err != nil {
return err
}
defer func() { _ = db.Close() }()
statements := []string{
`DO $$ BEGIN
IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'rtmanagerservice') THEN
CREATE ROLE rtmanagerservice LOGIN PASSWORD 'rtmanagerservice';
END IF;
END $$;`,
`CREATE SCHEMA IF NOT EXISTS rtmanager AUTHORIZATION rtmanagerservice;`,
`GRANT USAGE ON SCHEMA rtmanager TO rtmanagerservice;`,
}
for _, statement := range statements {
if _, err := db.ExecContext(ctx, statement); err != nil {
return err
}
}
return nil
}
func dsnForServiceRole(baseDSN string) (string, error) {
parsed, err := url.Parse(baseDSN)
if err != nil {
return "", err
}
values := url.Values{}
values.Set("search_path", serviceSchema)
values.Set("sslmode", "disable")
scoped := url.URL{
Scheme: parsed.Scheme,
User: url.UserPassword(serviceRole, servicePassword),
Host: parsed.Host,
Path: parsed.Path,
RawQuery: values.Encode(),
}
return scoped.String(), nil
}
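// Illustrative consumer sketch (mirrors how the sibling store packages use
// this helper; TestSomething and its body are placeholders):
//
//	func TestMain(m *testing.M) { pgtest.RunMain(m) }
//
//	func TestSomething(t *testing.T) {
//		pgtest.TruncateAll(t)           // clean slate per test
//		pool := pgtest.Ensure(t).Pool() // skips when Docker is unavailable
//		_ = pool
//	}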
@@ -0,0 +1,112 @@
// Package sqlx contains the small set of helpers shared by every Runtime
// Manager PostgreSQL adapter (runtimerecordstore, operationlogstore,
// healthsnapshotstore). The helpers centralise the boundary translations
// for nullable timestamps and the pgx SQLSTATE codes the adapters
// interpret as domain conflicts.
package sqlx
import (
"context"
"database/sql"
"errors"
"fmt"
"time"
"github.com/jackc/pgx/v5/pgconn"
)
// PgUniqueViolationCode identifies the SQLSTATE returned by PostgreSQL
// when a UNIQUE constraint is violated by INSERT or UPDATE.
const PgUniqueViolationCode = "23505"
// IsUniqueViolation reports whether err is a PostgreSQL unique-violation,
// regardless of constraint name.
func IsUniqueViolation(err error) bool {
var pgErr *pgconn.PgError
if !errors.As(err, &pgErr) {
return false
}
return pgErr.Code == PgUniqueViolationCode
}
// IsNoRows reports whether err is sql.ErrNoRows.
func IsNoRows(err error) bool {
return errors.Is(err, sql.ErrNoRows)
}
// NullableTime returns t.UTC() when non-zero, otherwise nil so the column
// is bound as SQL NULL.
func NullableTime(t time.Time) any {
if t.IsZero() {
return nil
}
return t.UTC()
}
// NullableTimePtr returns t.UTC() when t is non-nil and non-zero, otherwise
// nil. Companion of NullableTime for domain types that use *time.Time to
// express absent timestamps.
func NullableTimePtr(t *time.Time) any {
if t == nil {
return nil
}
return NullableTime(*t)
}
// NullableString returns value when non-empty, otherwise nil so the column
// is bound as SQL NULL. Used for Runtime Manager columns that map empty
// domain strings to NULL (current_container_id, current_image_ref).
func NullableString(value string) any {
if value == "" {
return nil
}
return value
}
// StringFromNullable copies an optional sql.NullString into a domain
// string. NULL becomes the empty string, matching the Runtime Manager
// domain convention that empty == NULL for nullable text columns.
func StringFromNullable(value sql.NullString) string {
if !value.Valid {
return ""
}
return value.String
}
// TimeFromNullable copies an optional sql.NullTime into a domain
// time.Time, applying the global UTC normalisation rule. NULL values
// become the zero time.Time.
func TimeFromNullable(value sql.NullTime) time.Time {
if !value.Valid {
return time.Time{}
}
return value.Time.UTC()
}
// TimePtrFromNullable copies an optional sql.NullTime into a domain
// *time.Time. NULL becomes nil; non-NULL values are wrapped after UTC
// normalisation.
func TimePtrFromNullable(value sql.NullTime) *time.Time {
if !value.Valid {
return nil
}
t := value.Time.UTC()
return &t
}
// WithTimeout derives a child context bounded by timeout and prefixes
// context errors with operation. Callers must always invoke the returned
// cancel.
func WithTimeout(ctx context.Context, operation string, timeout time.Duration) (context.Context, context.CancelFunc, error) {
if ctx == nil {
return nil, nil, fmt.Errorf("%s: nil context", operation)
}
if err := ctx.Err(); err != nil {
return nil, nil, fmt.Errorf("%s: %w", operation, err)
}
if timeout <= 0 {
return nil, nil, fmt.Errorf("%s: operation timeout must be positive", operation)
}
bounded, cancel := context.WithTimeout(ctx, timeout)
return bounded, cancel, nil
}
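// Illustrative call patterns (the query, the store fields, and the conflict
// error below are placeholders; the timeout pattern matches how the store
// adapters in this commit use the helper):
//
//	operationCtx, cancel, err := sqlx.WithTimeout(ctx, "upsert runtime record", store.operationTimeout)
//	if err != nil {
//		return err
//	}
//	defer cancel()
//	if _, err := store.db.ExecContext(operationCtx, query, args...); err != nil {
//		if sqlx.IsUniqueViolation(err) {
//			return errConflict // placeholder domain conflict error
//		}
//		return fmt.Errorf("upsert runtime record: %w", err)
//	}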
@@ -0,0 +1,19 @@
//
// Code generated by go-jet DO NOT EDIT.
//
// WARNING: Changes to this file may cause incorrect behavior
// and will be lost if the code is regenerated
//
package model
import (
"time"
)
type GooseDbVersion struct {
ID int32 `sql:"primary_key"`
VersionID int64
IsApplied bool
Tstamp time.Time
}
@@ -0,0 +1,21 @@
//
// Code generated by go-jet DO NOT EDIT.
//
// WARNING: Changes to this file may cause incorrect behavior
// and will be lost if the code is regenerated
//
package model
import (
"time"
)
type HealthSnapshots struct {
GameID string `sql:"primary_key"`
ContainerID string
Status string
Source string
Details string
ObservedAt time.Time
}
@@ -0,0 +1,27 @@
//
// Code generated by go-jet DO NOT EDIT.
//
// WARNING: Changes to this file may cause incorrect behavior
// and will be lost if the code is regenerated
//
package model
import (
"time"
)
type OperationLog struct {
ID int64 `sql:"primary_key"`
GameID string
OpKind string
OpSource string
SourceRef string
ImageRef string
ContainerID string
Outcome string
ErrorCode string
ErrorMessage string
StartedAt time.Time
FinishedAt *time.Time
}
@@ -0,0 +1,27 @@
//
// Code generated by go-jet DO NOT EDIT.
//
// WARNING: Changes to this file may cause incorrect behavior
// and will be lost if the code is regenerated
//
package model
import (
"time"
)
type RuntimeRecords struct {
GameID string `sql:"primary_key"`
Status string
CurrentContainerID *string
CurrentImageRef *string
EngineEndpoint string
StatePath string
DockerNetwork string
StartedAt *time.Time
StoppedAt *time.Time
RemovedAt *time.Time
LastOpAt time.Time
CreatedAt time.Time
}
@@ -0,0 +1,87 @@
//
// Code generated by go-jet DO NOT EDIT.
//
// WARNING: Changes to this file may cause incorrect behavior
// and will be lost if the code is regenerated
//
package table
import (
"github.com/go-jet/jet/v2/postgres"
)
var GooseDbVersion = newGooseDbVersionTable("rtmanager", "goose_db_version", "")
type gooseDbVersionTable struct {
postgres.Table
// Columns
ID postgres.ColumnInteger
VersionID postgres.ColumnInteger
IsApplied postgres.ColumnBool
Tstamp postgres.ColumnTimestamp
AllColumns postgres.ColumnList
MutableColumns postgres.ColumnList
DefaultColumns postgres.ColumnList
}
type GooseDbVersionTable struct {
gooseDbVersionTable
EXCLUDED gooseDbVersionTable
}
// AS creates new GooseDbVersionTable with assigned alias
func (a GooseDbVersionTable) AS(alias string) *GooseDbVersionTable {
return newGooseDbVersionTable(a.SchemaName(), a.TableName(), alias)
}
// Schema creates new GooseDbVersionTable with assigned schema name
func (a GooseDbVersionTable) FromSchema(schemaName string) *GooseDbVersionTable {
return newGooseDbVersionTable(schemaName, a.TableName(), a.Alias())
}
// WithPrefix creates new GooseDbVersionTable with assigned table prefix
func (a GooseDbVersionTable) WithPrefix(prefix string) *GooseDbVersionTable {
return newGooseDbVersionTable(a.SchemaName(), prefix+a.TableName(), a.TableName())
}
// WithSuffix creates new GooseDbVersionTable with assigned table suffix
func (a GooseDbVersionTable) WithSuffix(suffix string) *GooseDbVersionTable {
return newGooseDbVersionTable(a.SchemaName(), a.TableName()+suffix, a.TableName())
}
func newGooseDbVersionTable(schemaName, tableName, alias string) *GooseDbVersionTable {
return &GooseDbVersionTable{
gooseDbVersionTable: newGooseDbVersionTableImpl(schemaName, tableName, alias),
EXCLUDED: newGooseDbVersionTableImpl("", "excluded", ""),
}
}
func newGooseDbVersionTableImpl(schemaName, tableName, alias string) gooseDbVersionTable {
var (
IDColumn = postgres.IntegerColumn("id")
VersionIDColumn = postgres.IntegerColumn("version_id")
IsAppliedColumn = postgres.BoolColumn("is_applied")
TstampColumn = postgres.TimestampColumn("tstamp")
allColumns = postgres.ColumnList{IDColumn, VersionIDColumn, IsAppliedColumn, TstampColumn}
mutableColumns = postgres.ColumnList{VersionIDColumn, IsAppliedColumn, TstampColumn}
defaultColumns = postgres.ColumnList{TstampColumn}
)
return gooseDbVersionTable{
Table: postgres.NewTable(schemaName, tableName, alias, allColumns...),
//Columns
ID: IDColumn,
VersionID: VersionIDColumn,
IsApplied: IsAppliedColumn,
Tstamp: TstampColumn,
AllColumns: allColumns,
MutableColumns: mutableColumns,
DefaultColumns: defaultColumns,
}
}
@@ -0,0 +1,93 @@
//
// Code generated by go-jet DO NOT EDIT.
//
// WARNING: Changes to this file may cause incorrect behavior
// and will be lost if the code is regenerated
//
package table
import (
"github.com/go-jet/jet/v2/postgres"
)
var HealthSnapshots = newHealthSnapshotsTable("rtmanager", "health_snapshots", "")
type healthSnapshotsTable struct {
postgres.Table
// Columns
GameID postgres.ColumnString
ContainerID postgres.ColumnString
Status postgres.ColumnString
Source postgres.ColumnString
Details postgres.ColumnString
ObservedAt postgres.ColumnTimestampz
AllColumns postgres.ColumnList
MutableColumns postgres.ColumnList
DefaultColumns postgres.ColumnList
}
type HealthSnapshotsTable struct {
healthSnapshotsTable
EXCLUDED healthSnapshotsTable
}
// AS creates new HealthSnapshotsTable with assigned alias
func (a HealthSnapshotsTable) AS(alias string) *HealthSnapshotsTable {
return newHealthSnapshotsTable(a.SchemaName(), a.TableName(), alias)
}
// Schema creates new HealthSnapshotsTable with assigned schema name
func (a HealthSnapshotsTable) FromSchema(schemaName string) *HealthSnapshotsTable {
return newHealthSnapshotsTable(schemaName, a.TableName(), a.Alias())
}
// WithPrefix creates new HealthSnapshotsTable with assigned table prefix
func (a HealthSnapshotsTable) WithPrefix(prefix string) *HealthSnapshotsTable {
return newHealthSnapshotsTable(a.SchemaName(), prefix+a.TableName(), a.TableName())
}
// WithSuffix creates new HealthSnapshotsTable with assigned table suffix
func (a HealthSnapshotsTable) WithSuffix(suffix string) *HealthSnapshotsTable {
return newHealthSnapshotsTable(a.SchemaName(), a.TableName()+suffix, a.TableName())
}
func newHealthSnapshotsTable(schemaName, tableName, alias string) *HealthSnapshotsTable {
return &HealthSnapshotsTable{
healthSnapshotsTable: newHealthSnapshotsTableImpl(schemaName, tableName, alias),
EXCLUDED: newHealthSnapshotsTableImpl("", "excluded", ""),
}
}
func newHealthSnapshotsTableImpl(schemaName, tableName, alias string) healthSnapshotsTable {
var (
GameIDColumn = postgres.StringColumn("game_id")
ContainerIDColumn = postgres.StringColumn("container_id")
StatusColumn = postgres.StringColumn("status")
SourceColumn = postgres.StringColumn("source")
DetailsColumn = postgres.StringColumn("details")
ObservedAtColumn = postgres.TimestampzColumn("observed_at")
allColumns = postgres.ColumnList{GameIDColumn, ContainerIDColumn, StatusColumn, SourceColumn, DetailsColumn, ObservedAtColumn}
mutableColumns = postgres.ColumnList{ContainerIDColumn, StatusColumn, SourceColumn, DetailsColumn, ObservedAtColumn}
defaultColumns = postgres.ColumnList{ContainerIDColumn, DetailsColumn}
)
return healthSnapshotsTable{
Table: postgres.NewTable(schemaName, tableName, alias, allColumns...),
//Columns
GameID: GameIDColumn,
ContainerID: ContainerIDColumn,
Status: StatusColumn,
Source: SourceColumn,
Details: DetailsColumn,
ObservedAt: ObservedAtColumn,
AllColumns: allColumns,
MutableColumns: mutableColumns,
DefaultColumns: defaultColumns,
}
}
@@ -0,0 +1,111 @@
//
// Code generated by go-jet DO NOT EDIT.
//
// WARNING: Changes to this file may cause incorrect behavior
// and will be lost if the code is regenerated
//
package table
import (
"github.com/go-jet/jet/v2/postgres"
)
var OperationLog = newOperationLogTable("rtmanager", "operation_log", "")
type operationLogTable struct {
postgres.Table
// Columns
ID postgres.ColumnInteger
GameID postgres.ColumnString
OpKind postgres.ColumnString
OpSource postgres.ColumnString
SourceRef postgres.ColumnString
ImageRef postgres.ColumnString
ContainerID postgres.ColumnString
Outcome postgres.ColumnString
ErrorCode postgres.ColumnString
ErrorMessage postgres.ColumnString
StartedAt postgres.ColumnTimestampz
FinishedAt postgres.ColumnTimestampz
AllColumns postgres.ColumnList
MutableColumns postgres.ColumnList
DefaultColumns postgres.ColumnList
}
type OperationLogTable struct {
operationLogTable
EXCLUDED operationLogTable
}
// AS creates new OperationLogTable with assigned alias
func (a OperationLogTable) AS(alias string) *OperationLogTable {
return newOperationLogTable(a.SchemaName(), a.TableName(), alias)
}
// Schema creates new OperationLogTable with assigned schema name
func (a OperationLogTable) FromSchema(schemaName string) *OperationLogTable {
return newOperationLogTable(schemaName, a.TableName(), a.Alias())
}
// WithPrefix creates new OperationLogTable with assigned table prefix
func (a OperationLogTable) WithPrefix(prefix string) *OperationLogTable {
return newOperationLogTable(a.SchemaName(), prefix+a.TableName(), a.TableName())
}
// WithSuffix creates new OperationLogTable with assigned table suffix
func (a OperationLogTable) WithSuffix(suffix string) *OperationLogTable {
return newOperationLogTable(a.SchemaName(), a.TableName()+suffix, a.TableName())
}
func newOperationLogTable(schemaName, tableName, alias string) *OperationLogTable {
return &OperationLogTable{
operationLogTable: newOperationLogTableImpl(schemaName, tableName, alias),
EXCLUDED: newOperationLogTableImpl("", "excluded", ""),
}
}
func newOperationLogTableImpl(schemaName, tableName, alias string) operationLogTable {
var (
IDColumn = postgres.IntegerColumn("id")
GameIDColumn = postgres.StringColumn("game_id")
OpKindColumn = postgres.StringColumn("op_kind")
OpSourceColumn = postgres.StringColumn("op_source")
SourceRefColumn = postgres.StringColumn("source_ref")
ImageRefColumn = postgres.StringColumn("image_ref")
ContainerIDColumn = postgres.StringColumn("container_id")
OutcomeColumn = postgres.StringColumn("outcome")
ErrorCodeColumn = postgres.StringColumn("error_code")
ErrorMessageColumn = postgres.StringColumn("error_message")
StartedAtColumn = postgres.TimestampzColumn("started_at")
FinishedAtColumn = postgres.TimestampzColumn("finished_at")
allColumns = postgres.ColumnList{IDColumn, GameIDColumn, OpKindColumn, OpSourceColumn, SourceRefColumn, ImageRefColumn, ContainerIDColumn, OutcomeColumn, ErrorCodeColumn, ErrorMessageColumn, StartedAtColumn, FinishedAtColumn}
mutableColumns = postgres.ColumnList{GameIDColumn, OpKindColumn, OpSourceColumn, SourceRefColumn, ImageRefColumn, ContainerIDColumn, OutcomeColumn, ErrorCodeColumn, ErrorMessageColumn, StartedAtColumn, FinishedAtColumn}
defaultColumns = postgres.ColumnList{IDColumn, SourceRefColumn, ImageRefColumn, ContainerIDColumn, ErrorCodeColumn, ErrorMessageColumn}
)
return operationLogTable{
Table: postgres.NewTable(schemaName, tableName, alias, allColumns...),
//Columns
ID: IDColumn,
GameID: GameIDColumn,
OpKind: OpKindColumn,
OpSource: OpSourceColumn,
SourceRef: SourceRefColumn,
ImageRef: ImageRefColumn,
ContainerID: ContainerIDColumn,
Outcome: OutcomeColumn,
ErrorCode: ErrorCodeColumn,
ErrorMessage: ErrorMessageColumn,
StartedAt: StartedAtColumn,
FinishedAt: FinishedAtColumn,
AllColumns: allColumns,
MutableColumns: mutableColumns,
DefaultColumns: defaultColumns,
}
}
@@ -0,0 +1,111 @@
//
// Code generated by go-jet DO NOT EDIT.
//
// WARNING: Changes to this file may cause incorrect behavior
// and will be lost if the code is regenerated
//
package table
import (
"github.com/go-jet/jet/v2/postgres"
)
var RuntimeRecords = newRuntimeRecordsTable("rtmanager", "runtime_records", "")
type runtimeRecordsTable struct {
postgres.Table
// Columns
GameID postgres.ColumnString
Status postgres.ColumnString
CurrentContainerID postgres.ColumnString
CurrentImageRef postgres.ColumnString
EngineEndpoint postgres.ColumnString
StatePath postgres.ColumnString
DockerNetwork postgres.ColumnString
StartedAt postgres.ColumnTimestampz
StoppedAt postgres.ColumnTimestampz
RemovedAt postgres.ColumnTimestampz
LastOpAt postgres.ColumnTimestampz
CreatedAt postgres.ColumnTimestampz
AllColumns postgres.ColumnList
MutableColumns postgres.ColumnList
DefaultColumns postgres.ColumnList
}
type RuntimeRecordsTable struct {
runtimeRecordsTable
EXCLUDED runtimeRecordsTable
}
// AS creates new RuntimeRecordsTable with assigned alias
func (a RuntimeRecordsTable) AS(alias string) *RuntimeRecordsTable {
return newRuntimeRecordsTable(a.SchemaName(), a.TableName(), alias)
}
// Schema creates new RuntimeRecordsTable with assigned schema name
func (a RuntimeRecordsTable) FromSchema(schemaName string) *RuntimeRecordsTable {
return newRuntimeRecordsTable(schemaName, a.TableName(), a.Alias())
}
// WithPrefix creates new RuntimeRecordsTable with assigned table prefix
func (a RuntimeRecordsTable) WithPrefix(prefix string) *RuntimeRecordsTable {
return newRuntimeRecordsTable(a.SchemaName(), prefix+a.TableName(), a.TableName())
}
// WithSuffix creates new RuntimeRecordsTable with assigned table suffix
func (a RuntimeRecordsTable) WithSuffix(suffix string) *RuntimeRecordsTable {
return newRuntimeRecordsTable(a.SchemaName(), a.TableName()+suffix, a.TableName())
}
func newRuntimeRecordsTable(schemaName, tableName, alias string) *RuntimeRecordsTable {
return &RuntimeRecordsTable{
runtimeRecordsTable: newRuntimeRecordsTableImpl(schemaName, tableName, alias),
EXCLUDED: newRuntimeRecordsTableImpl("", "excluded", ""),
}
}
func newRuntimeRecordsTableImpl(schemaName, tableName, alias string) runtimeRecordsTable {
var (
GameIDColumn = postgres.StringColumn("game_id")
StatusColumn = postgres.StringColumn("status")
CurrentContainerIDColumn = postgres.StringColumn("current_container_id")
CurrentImageRefColumn = postgres.StringColumn("current_image_ref")
EngineEndpointColumn = postgres.StringColumn("engine_endpoint")
StatePathColumn = postgres.StringColumn("state_path")
DockerNetworkColumn = postgres.StringColumn("docker_network")
StartedAtColumn = postgres.TimestampzColumn("started_at")
StoppedAtColumn = postgres.TimestampzColumn("stopped_at")
RemovedAtColumn = postgres.TimestampzColumn("removed_at")
LastOpAtColumn = postgres.TimestampzColumn("last_op_at")
CreatedAtColumn = postgres.TimestampzColumn("created_at")
allColumns = postgres.ColumnList{GameIDColumn, StatusColumn, CurrentContainerIDColumn, CurrentImageRefColumn, EngineEndpointColumn, StatePathColumn, DockerNetworkColumn, StartedAtColumn, StoppedAtColumn, RemovedAtColumn, LastOpAtColumn, CreatedAtColumn}
mutableColumns = postgres.ColumnList{StatusColumn, CurrentContainerIDColumn, CurrentImageRefColumn, EngineEndpointColumn, StatePathColumn, DockerNetworkColumn, StartedAtColumn, StoppedAtColumn, RemovedAtColumn, LastOpAtColumn, CreatedAtColumn}
defaultColumns = postgres.ColumnList{}
)
return runtimeRecordsTable{
Table: postgres.NewTable(schemaName, tableName, alias, allColumns...),
//Columns
GameID: GameIDColumn,
Status: StatusColumn,
CurrentContainerID: CurrentContainerIDColumn,
CurrentImageRef: CurrentImageRefColumn,
EngineEndpoint: EngineEndpointColumn,
StatePath: StatePathColumn,
DockerNetwork: DockerNetworkColumn,
StartedAt: StartedAtColumn,
StoppedAt: StoppedAtColumn,
RemovedAt: RemovedAtColumn,
LastOpAt: LastOpAtColumn,
CreatedAt: CreatedAtColumn,
AllColumns: allColumns,
MutableColumns: mutableColumns,
DefaultColumns: defaultColumns,
}
}
@@ -0,0 +1,17 @@
//
// Code generated by go-jet DO NOT EDIT.
//
// WARNING: Changes to this file may cause incorrect behavior
// and will be lost if the code is regenerated
//
package table
// UseSchema sets a new schema name for all generated table SQL builder types. It is recommended to invoke
// this method only once at the beginning of the program.
func UseSchema(schema string) {
GooseDbVersion = GooseDbVersion.FromSchema(schema)
HealthSnapshots = HealthSnapshots.FromSchema(schema)
OperationLog = OperationLog.FromSchema(schema)
RuntimeRecords = RuntimeRecords.FromSchema(schema)
}
@@ -0,0 +1,106 @@
-- +goose Up
-- Initial Runtime Manager PostgreSQL schema.
--
-- Three tables cover the durable surface of the service:
-- * runtime_records — one row per game with the latest known runtime
-- status and Docker container binding;
-- * operation_log — append-only audit of every start/stop/restart/
-- patch/cleanup/reconcile_* operation RTM performed;
-- * health_snapshots — latest technical health observation per game.
--
-- Schema and the matching `rtmanagerservice` role are provisioned
-- outside this script (in tests via cmd/jetgen/main.go::provisionRoleAndSchema;
-- in production via an ops init script). This migration runs as the
-- schema owner with `search_path=rtmanager` and only contains DDL for the
-- service-owned tables and indexes. ARCHITECTURE.md §Database topology
-- mandates that the per-service role's grants stay restricted to its own
-- schema; consequently this file deliberately deviates from PLAN.md
-- Stage 09's literal `CREATE SCHEMA IF NOT EXISTS rtmanager;` instruction.
-- runtime_records holds one durable record per game with the latest
-- known runtime status and Docker container binding. The status enum
-- (running | stopped | removed) is enforced by a CHECK so domain code
-- can rely on it without reading every callsite. The (status, last_op_at)
-- index drives the periodic container-cleanup worker that scans
-- `status='stopped' AND last_op_at < now() - retention`.
CREATE TABLE runtime_records (
game_id text PRIMARY KEY,
status text NOT NULL,
current_container_id text,
current_image_ref text,
engine_endpoint text NOT NULL,
state_path text NOT NULL,
docker_network text NOT NULL,
started_at timestamptz,
stopped_at timestamptz,
removed_at timestamptz,
last_op_at timestamptz NOT NULL,
created_at timestamptz NOT NULL,
CONSTRAINT runtime_records_status_chk
CHECK (status IN ('running', 'stopped', 'removed'))
);
CREATE INDEX runtime_records_status_last_op_idx
ON runtime_records (status, last_op_at);
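-- For illustration, the cleanup scan this index serves looks roughly
-- like the following (the retention interval is a placeholder):
--
--   SELECT game_id, current_container_id
--   FROM runtime_records
--   WHERE status = 'stopped'
--     AND last_op_at < now() - interval '24 hours'
--   ORDER BY last_op_at;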
-- operation_log is an append-only audit of every operation Runtime
-- Manager performed against a game's runtime. The (game_id, started_at
-- DESC) index drives audit reads from the GM/Admin REST surface;
-- finished_at is nullable for in-flight rows even though Stage 13+
-- always finalises the row in the same transaction. The op_kind /
-- op_source / outcome enums are enforced by CHECK constraints to keep
-- the audit schema honest without a separate Go validator.
CREATE TABLE operation_log (
id bigserial PRIMARY KEY,
game_id text NOT NULL,
op_kind text NOT NULL,
op_source text NOT NULL,
source_ref text NOT NULL DEFAULT '',
image_ref text NOT NULL DEFAULT '',
container_id text NOT NULL DEFAULT '',
outcome text NOT NULL,
error_code text NOT NULL DEFAULT '',
error_message text NOT NULL DEFAULT '',
started_at timestamptz NOT NULL,
finished_at timestamptz,
CONSTRAINT operation_log_op_kind_chk
CHECK (op_kind IN (
'start', 'stop', 'restart', 'patch',
'cleanup_container', 'reconcile_adopt', 'reconcile_dispose'
)),
CONSTRAINT operation_log_op_source_chk
CHECK (op_source IN (
'lobby_stream', 'gm_rest', 'admin_rest',
'auto_ttl', 'auto_reconcile'
)),
CONSTRAINT operation_log_outcome_chk
CHECK (outcome IN ('success', 'failure'))
);
CREATE INDEX operation_log_game_started_idx
ON operation_log (game_id, started_at DESC);
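-- For illustration, the audit read this index serves looks roughly like
-- the following (the limit is arbitrary):
--
--   SELECT * FROM operation_log
--   WHERE game_id = $1
--   ORDER BY started_at DESC, id DESC
--   LIMIT 50;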
-- health_snapshots stores the latest technical health observation per
-- game. One row per game; later observations overwrite. The status enum
-- mirrors the `event_type` vocabulary on `runtime:health_events`
-- (collapsed to a flat status column for the latest-observation view).
CREATE TABLE health_snapshots (
game_id text PRIMARY KEY,
container_id text NOT NULL DEFAULT '',
status text NOT NULL,
source text NOT NULL,
details jsonb NOT NULL DEFAULT '{}'::jsonb,
observed_at timestamptz NOT NULL,
CONSTRAINT health_snapshots_status_chk
CHECK (status IN (
'healthy', 'probe_failed', 'exited',
'oom', 'inspect_unhealthy', 'container_disappeared'
)),
CONSTRAINT health_snapshots_source_chk
CHECK (source IN ('docker_event', 'inspect', 'probe'))
);
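-- For illustration, the latest-observation write this table is shaped
-- for looks roughly like:
--
--   INSERT INTO health_snapshots
--     (game_id, container_id, status, source, details, observed_at)
--   VALUES ($1, $2, $3, $4, $5, $6)
--   ON CONFLICT (game_id) DO UPDATE
--   SET container_id = EXCLUDED.container_id,
--       status       = EXCLUDED.status,
--       source       = EXCLUDED.source,
--       details      = EXCLUDED.details,
--       observed_at  = EXCLUDED.observed_at;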
-- +goose Down
DROP TABLE IF EXISTS health_snapshots;
DROP TABLE IF EXISTS operation_log;
DROP TABLE IF EXISTS runtime_records;
@@ -0,0 +1,19 @@
// Package migrations exposes the embedded goose migration files used by
// Runtime Manager to provision its `rtmanager` schema in PostgreSQL.
//
// The embedded filesystem is consumed by `pkg/postgres.RunMigrations`
// during rtmanager-service startup and by `cmd/jetgen` when regenerating
// the `internal/adapters/postgres/jet/` code against a transient
// PostgreSQL instance.
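//
// Illustrative startup wiring (the exact RunMigrations signature is an
// assumption, not the verbatim API):
//
//	if err := postgres.RunMigrations(ctx, db, migrations.FS()); err != nil {
//		return fmt.Errorf("run rtmanager migrations: %w", err)
//	}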
package migrations
import "embed"
//go:embed *.sql
var fs embed.FS
// FS returns the embedded filesystem containing every numbered goose
// migration shipped with Runtime Manager.
func FS() embed.FS {
return fs
}
@@ -0,0 +1,235 @@
// Package operationlogstore implements the PostgreSQL-backed adapter for
// `ports.OperationLogStore`.
//
// The package owns the on-disk shape of the `operation_log` table defined
// in
// `galaxy/rtmanager/internal/adapters/postgres/migrations/00001_init.sql`
// and translates the schema-agnostic `ports.OperationLogStore` interface
// declared in `internal/ports/operationlogstore.go` into concrete
// go-jet/v2 statements driven by the pgx driver.
//
// Append uses `INSERT ... RETURNING id` to surface the bigserial id back
// to callers; ListByGame is index-driven by `operation_log_game_started_idx`.
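//
// A minimal usage sketch (wiring values illustrative):
//
//	store, err := operationlogstore.New(operationlogstore.Config{
//		DB:               db,
//		OperationTimeout: 3 * time.Second,
//	})
//	// entry is an operation.OperationEntry built by the caller.
//	id, err := store.Append(ctx, entry)
//	entries, err := store.ListByGame(ctx, "game-001", 50)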
package operationlogstore
import (
"context"
"database/sql"
"errors"
"fmt"
"strings"
"time"
"galaxy/rtmanager/internal/adapters/postgres/internal/sqlx"
pgtable "galaxy/rtmanager/internal/adapters/postgres/jet/rtmanager/table"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/ports"
pg "github.com/go-jet/jet/v2/postgres"
)
// Config configures one PostgreSQL-backed operation-log store instance.
type Config struct {
// DB stores the connection pool the store uses for every query.
DB *sql.DB
// OperationTimeout bounds one round trip.
OperationTimeout time.Duration
}
// Store persists Runtime Manager operation-log entries in PostgreSQL.
type Store struct {
db *sql.DB
operationTimeout time.Duration
}
// New constructs one PostgreSQL-backed operation-log store from cfg.
func New(cfg Config) (*Store, error) {
if cfg.DB == nil {
return nil, errors.New("new postgres operation log store: db must not be nil")
}
if cfg.OperationTimeout <= 0 {
return nil, errors.New("new postgres operation log store: operation timeout must be positive")
}
return &Store{
db: cfg.DB,
operationTimeout: cfg.OperationTimeout,
}, nil
}
// operationLogSelectColumns is the canonical SELECT list for the
// operation_log table, matching scanEntry's column order.
var operationLogSelectColumns = pg.ColumnList{
pgtable.OperationLog.ID,
pgtable.OperationLog.GameID,
pgtable.OperationLog.OpKind,
pgtable.OperationLog.OpSource,
pgtable.OperationLog.SourceRef,
pgtable.OperationLog.ImageRef,
pgtable.OperationLog.ContainerID,
pgtable.OperationLog.Outcome,
pgtable.OperationLog.ErrorCode,
pgtable.OperationLog.ErrorMessage,
pgtable.OperationLog.StartedAt,
pgtable.OperationLog.FinishedAt,
}
// Append inserts entry into the operation log and returns the generated
// bigserial id. entry is validated through operation.OperationEntry.Validate
// before the SQL is issued.
func (store *Store) Append(ctx context.Context, entry operation.OperationEntry) (int64, error) {
if store == nil || store.db == nil {
return 0, errors.New("append operation log entry: nil store")
}
if err := entry.Validate(); err != nil {
return 0, fmt.Errorf("append operation log entry: %w", err)
}
operationCtx, cancel, err := sqlx.WithTimeout(ctx, "append operation log entry", store.operationTimeout)
if err != nil {
return 0, err
}
defer cancel()
stmt := pgtable.OperationLog.INSERT(
pgtable.OperationLog.GameID,
pgtable.OperationLog.OpKind,
pgtable.OperationLog.OpSource,
pgtable.OperationLog.SourceRef,
pgtable.OperationLog.ImageRef,
pgtable.OperationLog.ContainerID,
pgtable.OperationLog.Outcome,
pgtable.OperationLog.ErrorCode,
pgtable.OperationLog.ErrorMessage,
pgtable.OperationLog.StartedAt,
pgtable.OperationLog.FinishedAt,
).VALUES(
entry.GameID,
string(entry.OpKind),
string(entry.OpSource),
entry.SourceRef,
entry.ImageRef,
entry.ContainerID,
string(entry.Outcome),
entry.ErrorCode,
entry.ErrorMessage,
entry.StartedAt.UTC(),
sqlx.NullableTimePtr(entry.FinishedAt),
).RETURNING(pgtable.OperationLog.ID)
query, args := stmt.Sql()
row := store.db.QueryRowContext(operationCtx, query, args...)
var id int64
if err := row.Scan(&id); err != nil {
return 0, fmt.Errorf("append operation log entry: %w", err)
}
return id, nil
}
// ListByGame returns the most recent entries for gameID, ordered by
// started_at descending and capped by limit. The (game_id,
// started_at DESC) index drives the read.
func (store *Store) ListByGame(ctx context.Context, gameID string, limit int) ([]operation.OperationEntry, error) {
if store == nil || store.db == nil {
return nil, errors.New("list operation log entries by game: nil store")
}
if strings.TrimSpace(gameID) == "" {
return nil, fmt.Errorf("list operation log entries by game: game id must not be empty")
}
if limit <= 0 {
return nil, fmt.Errorf("list operation log entries by game: limit must be positive, got %d", limit)
}
operationCtx, cancel, err := sqlx.WithTimeout(ctx, "list operation log entries by game", store.operationTimeout)
if err != nil {
return nil, err
}
defer cancel()
stmt := pg.SELECT(operationLogSelectColumns).
FROM(pgtable.OperationLog).
WHERE(pgtable.OperationLog.GameID.EQ(pg.String(gameID))).
ORDER_BY(pgtable.OperationLog.StartedAt.DESC(), pgtable.OperationLog.ID.DESC()).
LIMIT(int64(limit))
query, args := stmt.Sql()
rows, err := store.db.QueryContext(operationCtx, query, args...)
if err != nil {
return nil, fmt.Errorf("list operation log entries by game: %w", err)
}
defer rows.Close()
entries := make([]operation.OperationEntry, 0)
for rows.Next() {
entry, err := scanEntry(rows)
if err != nil {
return nil, fmt.Errorf("list operation log entries by game: scan: %w", err)
}
entries = append(entries, entry)
}
if err := rows.Err(); err != nil {
return nil, fmt.Errorf("list operation log entries by game: %w", err)
}
if len(entries) == 0 {
return nil, nil
}
return entries, nil
}
// rowScanner abstracts *sql.Row and *sql.Rows so scanEntry can be shared
// across both single-row reads and iterated reads.
type rowScanner interface {
Scan(dest ...any) error
}
// scanEntry scans one operation_log row from rs.
func scanEntry(rs rowScanner) (operation.OperationEntry, error) {
var (
id int64
gameID string
opKind string
opSource string
sourceRef string
imageRef string
containerID string
outcome string
errorCode string
errorMessage string
startedAt time.Time
finishedAt sql.NullTime
)
if err := rs.Scan(
&id,
&gameID,
&opKind,
&opSource,
&sourceRef,
&imageRef,
&containerID,
&outcome,
&errorCode,
&errorMessage,
&startedAt,
&finishedAt,
); err != nil {
return operation.OperationEntry{}, err
}
return operation.OperationEntry{
ID: id,
GameID: gameID,
OpKind: operation.OpKind(opKind),
OpSource: operation.OpSource(opSource),
SourceRef: sourceRef,
ImageRef: imageRef,
ContainerID: containerID,
Outcome: operation.Outcome(outcome),
ErrorCode: errorCode,
ErrorMessage: errorMessage,
StartedAt: startedAt.UTC(),
FinishedAt: sqlx.TimePtrFromNullable(finishedAt),
}, nil
}
// Ensure Store satisfies the ports.OperationLogStore interface at compile
// time.
var _ ports.OperationLogStore = (*Store)(nil)
@@ -0,0 +1,207 @@
package operationlogstore_test
import (
"context"
"testing"
"time"
"galaxy/rtmanager/internal/adapters/postgres/internal/pgtest"
"galaxy/rtmanager/internal/adapters/postgres/operationlogstore"
"galaxy/rtmanager/internal/domain/operation"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestMain(m *testing.M) { pgtest.RunMain(m) }
func newStore(t *testing.T) *operationlogstore.Store {
t.Helper()
pgtest.TruncateAll(t)
store, err := operationlogstore.New(operationlogstore.Config{
DB: pgtest.Ensure(t).Pool(),
OperationTimeout: pgtest.OperationTimeout,
})
require.NoError(t, err)
return store
}
func successStartEntry(gameID string, startedAt time.Time, sourceRef string) operation.OperationEntry {
finishedAt := startedAt.Add(time.Second)
return operation.OperationEntry{
GameID: gameID,
OpKind: operation.OpKindStart,
OpSource: operation.OpSourceLobbyStream,
SourceRef: sourceRef,
ImageRef: "galaxy/game:v1.2.3",
ContainerID: "container-1",
Outcome: operation.OutcomeSuccess,
StartedAt: startedAt,
FinishedAt: &finishedAt,
}
}
func TestAppendReturnsPositiveIDs(t *testing.T) {
ctx := context.Background()
store := newStore(t)
startedAt := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)
id1, err := store.Append(ctx, successStartEntry("game-001", startedAt, "1700000000000-0"))
require.NoError(t, err)
assert.Greater(t, id1, int64(0))
id2, err := store.Append(ctx, successStartEntry("game-001", startedAt.Add(time.Minute), "1700000000001-0"))
require.NoError(t, err)
assert.Greater(t, id2, id1)
}
func TestAppendValidatesEntry(t *testing.T) {
ctx := context.Background()
store := newStore(t)
tests := []struct {
name string
mutate func(*operation.OperationEntry)
}{
{"empty game id", func(e *operation.OperationEntry) { e.GameID = "" }},
{"unknown op kind", func(e *operation.OperationEntry) { e.OpKind = "exotic" }},
{"unknown op source", func(e *operation.OperationEntry) { e.OpSource = "exotic" }},
{"unknown outcome", func(e *operation.OperationEntry) { e.Outcome = "exotic" }},
{"zero started at", func(e *operation.OperationEntry) { e.StartedAt = time.Time{} }},
{"failure without error code", func(e *operation.OperationEntry) {
e.Outcome = operation.OutcomeFailure
e.ErrorCode = ""
}},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
entry := successStartEntry("game-001",
time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC), "ref")
tt.mutate(&entry)
_, err := store.Append(ctx, entry)
require.Error(t, err)
})
}
}
func TestListByGameReturnsEntriesNewestFirst(t *testing.T) {
ctx := context.Background()
store := newStore(t)
base := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)
for index := range 3 {
_, err := store.Append(ctx, successStartEntry("game-001",
base.Add(time.Duration(index)*time.Minute),
"ref-game-001-"))
require.NoError(t, err)
}
// Foreign-game entry must not appear in the list.
_, err := store.Append(ctx, successStartEntry("game-other", base, "ref-other"))
require.NoError(t, err)
entries, err := store.ListByGame(ctx, "game-001", 10)
require.NoError(t, err)
require.Len(t, entries, 3)
for index := range 2 {
assert.True(t,
!entries[index].StartedAt.Before(entries[index+1].StartedAt),
"entries must be ordered started_at DESC; got %s before %s",
entries[index].StartedAt, entries[index+1].StartedAt,
)
}
}
func TestListByGameRespectsLimit(t *testing.T) {
ctx := context.Background()
store := newStore(t)
base := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)
for index := range 5 {
_, err := store.Append(ctx, successStartEntry("game-001",
base.Add(time.Duration(index)*time.Minute), "ref"))
require.NoError(t, err)
}
entries, err := store.ListByGame(ctx, "game-001", 2)
require.NoError(t, err)
require.Len(t, entries, 2)
}
func TestListByGameReturnsEmptyForUnknownGame(t *testing.T) {
ctx := context.Background()
store := newStore(t)
entries, err := store.ListByGame(ctx, "game-missing", 10)
require.NoError(t, err)
assert.Empty(t, entries)
}
func TestListByGameRejectsInvalidArgs(t *testing.T) {
ctx := context.Background()
store := newStore(t)
_, err := store.ListByGame(ctx, "", 10)
require.Error(t, err)
_, err = store.ListByGame(ctx, "game-001", 0)
require.Error(t, err)
_, err = store.ListByGame(ctx, "game-001", -3)
require.Error(t, err)
}
func TestAppendRoundTripsAllFields(t *testing.T) {
ctx := context.Background()
store := newStore(t)
startedAt := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)
finishedAt := startedAt.Add(2 * time.Second)
original := operation.OperationEntry{
GameID: "game-001",
OpKind: operation.OpKindStop,
OpSource: operation.OpSourceGMRest,
SourceRef: "request-7",
ImageRef: "galaxy/game:v2.0.0",
ContainerID: "container-X",
Outcome: operation.OutcomeFailure,
ErrorCode: "container_start_failed",
ErrorMessage: "stop deadline exceeded",
StartedAt: startedAt,
FinishedAt: &finishedAt,
}
id, err := store.Append(ctx, original)
require.NoError(t, err)
entries, err := store.ListByGame(ctx, "game-001", 10)
require.NoError(t, err)
require.Len(t, entries, 1)
got := entries[0]
assert.Equal(t, id, got.ID)
assert.Equal(t, original.GameID, got.GameID)
assert.Equal(t, original.OpKind, got.OpKind)
assert.Equal(t, original.OpSource, got.OpSource)
assert.Equal(t, original.SourceRef, got.SourceRef)
assert.Equal(t, original.ImageRef, got.ImageRef)
assert.Equal(t, original.ContainerID, got.ContainerID)
assert.Equal(t, original.Outcome, got.Outcome)
assert.Equal(t, original.ErrorCode, got.ErrorCode)
assert.Equal(t, original.ErrorMessage, got.ErrorMessage)
assert.True(t, original.StartedAt.Equal(got.StartedAt))
require.NotNil(t, got.FinishedAt)
assert.True(t, original.FinishedAt.Equal(*got.FinishedAt))
assert.Equal(t, time.UTC, got.StartedAt.Location())
assert.Equal(t, time.UTC, got.FinishedAt.Location())
}
func TestNewRejectsNilDB(t *testing.T) {
_, err := operationlogstore.New(operationlogstore.Config{OperationTimeout: time.Second})
require.Error(t, err)
}
func TestNewRejectsNonPositiveTimeout(t *testing.T) {
_, err := operationlogstore.New(operationlogstore.Config{
DB: pgtest.Ensure(t).Pool(),
})
require.Error(t, err)
}
@@ -0,0 +1,500 @@
// Package runtimerecordstore implements the PostgreSQL-backed adapter for
// `ports.RuntimeRecordStore`.
//
// The package owns the on-disk shape of the `runtime_records` table
// defined in
// `galaxy/rtmanager/internal/adapters/postgres/migrations/00001_init.sql`
// and translates the schema-agnostic `ports.RuntimeRecordStore` interface
// declared in `internal/ports/runtimerecordstore.go` into concrete
// go-jet/v2 statements driven by the pgx driver.
//
// Lifecycle transitions (UpdateStatus) use compare-and-swap on
// `(status, current_container_id)` rather than holding a SELECT ... FOR
// UPDATE lock across the caller's logic, mirroring the pattern used by
// `lobby/internal/adapters/postgres/gamestore`.
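//
// A minimal transition sketch (values illustrative):
//
//	err := store.UpdateStatus(ctx, ports.UpdateStatusInput{
//		GameID:              "game-001",
//		ExpectedFrom:        runtime.StatusRunning,
//		ExpectedContainerID: "container-1",
//		To:                  runtime.StatusStopped,
//		Now:                 time.Now().UTC(),
//	})
//	// err == nil          → the CAS won and the row transitioned;
//	// runtime.ErrConflict → the guard did not match the stored row;
//	// runtime.ErrNotFound → no record exists for the game.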
package runtimerecordstore
import (
"context"
"database/sql"
"errors"
"fmt"
"strings"
"time"
"galaxy/rtmanager/internal/adapters/postgres/internal/sqlx"
pgtable "galaxy/rtmanager/internal/adapters/postgres/jet/rtmanager/table"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/ports"
pg "github.com/go-jet/jet/v2/postgres"
)
// Config configures one PostgreSQL-backed runtime-record store instance.
// The store does not own the underlying *sql.DB lifecycle: the caller
// (typically the service runtime) opens, instruments, migrates, and
// closes the pool.
type Config struct {
// DB stores the connection pool the store uses for every query.
DB *sql.DB
// OperationTimeout bounds one round trip. The store creates a
// derived context for each operation so callers cannot starve the
// pool with an unbounded ctx.
OperationTimeout time.Duration
}
// Store persists Runtime Manager runtime records in PostgreSQL.
type Store struct {
db *sql.DB
operationTimeout time.Duration
}
// New constructs one PostgreSQL-backed runtime-record store from cfg.
func New(cfg Config) (*Store, error) {
if cfg.DB == nil {
return nil, errors.New("new postgres runtime record store: db must not be nil")
}
if cfg.OperationTimeout <= 0 {
return nil, errors.New("new postgres runtime record store: operation timeout must be positive")
}
return &Store{
db: cfg.DB,
operationTimeout: cfg.OperationTimeout,
}, nil
}
// runtimeSelectColumns is the canonical SELECT list for the runtime_records
// table, matching scanRecord's column order.
var runtimeSelectColumns = pg.ColumnList{
pgtable.RuntimeRecords.GameID,
pgtable.RuntimeRecords.Status,
pgtable.RuntimeRecords.CurrentContainerID,
pgtable.RuntimeRecords.CurrentImageRef,
pgtable.RuntimeRecords.EngineEndpoint,
pgtable.RuntimeRecords.StatePath,
pgtable.RuntimeRecords.DockerNetwork,
pgtable.RuntimeRecords.StartedAt,
pgtable.RuntimeRecords.StoppedAt,
pgtable.RuntimeRecords.RemovedAt,
pgtable.RuntimeRecords.LastOpAt,
pgtable.RuntimeRecords.CreatedAt,
}
// Get returns the record identified by gameID. It returns
// runtime.ErrNotFound when no record exists.
func (store *Store) Get(ctx context.Context, gameID string) (runtime.RuntimeRecord, error) {
if store == nil || store.db == nil {
return runtime.RuntimeRecord{}, errors.New("get runtime record: nil store")
}
if strings.TrimSpace(gameID) == "" {
return runtime.RuntimeRecord{}, fmt.Errorf("get runtime record: game id must not be empty")
}
operationCtx, cancel, err := sqlx.WithTimeout(ctx, "get runtime record", store.operationTimeout)
if err != nil {
return runtime.RuntimeRecord{}, err
}
defer cancel()
stmt := pg.SELECT(runtimeSelectColumns).
FROM(pgtable.RuntimeRecords).
WHERE(pgtable.RuntimeRecords.GameID.EQ(pg.String(gameID)))
query, args := stmt.Sql()
row := store.db.QueryRowContext(operationCtx, query, args...)
record, err := scanRecord(row)
if sqlx.IsNoRows(err) {
return runtime.RuntimeRecord{}, runtime.ErrNotFound
}
if err != nil {
return runtime.RuntimeRecord{}, fmt.Errorf("get runtime record: %w", err)
}
return record, nil
}
// Upsert inserts record when no row exists for record.GameID and
// otherwise overwrites every mutable column verbatim. created_at is
// preserved across upserts so the "first time RTM saw the game"
// timestamp stays stable.
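//
// The effective statement is roughly the following (illustrative; note
// that created_at is absent from the DO UPDATE SET list):
//
//	INSERT INTO runtime_records (game_id, status, ..., created_at)
//	VALUES (...)
//	ON CONFLICT (game_id) DO UPDATE
//	SET status = EXCLUDED.status, ..., last_op_at = EXCLUDED.last_op_at;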
func (store *Store) Upsert(ctx context.Context, record runtime.RuntimeRecord) error {
if store == nil || store.db == nil {
return errors.New("upsert runtime record: nil store")
}
if err := record.Validate(); err != nil {
return fmt.Errorf("upsert runtime record: %w", err)
}
operationCtx, cancel, err := sqlx.WithTimeout(ctx, "upsert runtime record", store.operationTimeout)
if err != nil {
return err
}
defer cancel()
stmt := pgtable.RuntimeRecords.INSERT(
pgtable.RuntimeRecords.GameID,
pgtable.RuntimeRecords.Status,
pgtable.RuntimeRecords.CurrentContainerID,
pgtable.RuntimeRecords.CurrentImageRef,
pgtable.RuntimeRecords.EngineEndpoint,
pgtable.RuntimeRecords.StatePath,
pgtable.RuntimeRecords.DockerNetwork,
pgtable.RuntimeRecords.StartedAt,
pgtable.RuntimeRecords.StoppedAt,
pgtable.RuntimeRecords.RemovedAt,
pgtable.RuntimeRecords.LastOpAt,
pgtable.RuntimeRecords.CreatedAt,
).VALUES(
record.GameID,
string(record.Status),
sqlx.NullableString(record.CurrentContainerID),
sqlx.NullableString(record.CurrentImageRef),
record.EngineEndpoint,
record.StatePath,
record.DockerNetwork,
sqlx.NullableTimePtr(record.StartedAt),
sqlx.NullableTimePtr(record.StoppedAt),
sqlx.NullableTimePtr(record.RemovedAt),
record.LastOpAt.UTC(),
record.CreatedAt.UTC(),
).ON_CONFLICT(pgtable.RuntimeRecords.GameID).DO_UPDATE(
pg.SET(
pgtable.RuntimeRecords.Status.SET(pgtable.RuntimeRecords.EXCLUDED.Status),
pgtable.RuntimeRecords.CurrentContainerID.SET(pgtable.RuntimeRecords.EXCLUDED.CurrentContainerID),
pgtable.RuntimeRecords.CurrentImageRef.SET(pgtable.RuntimeRecords.EXCLUDED.CurrentImageRef),
pgtable.RuntimeRecords.EngineEndpoint.SET(pgtable.RuntimeRecords.EXCLUDED.EngineEndpoint),
pgtable.RuntimeRecords.StatePath.SET(pgtable.RuntimeRecords.EXCLUDED.StatePath),
pgtable.RuntimeRecords.DockerNetwork.SET(pgtable.RuntimeRecords.EXCLUDED.DockerNetwork),
pgtable.RuntimeRecords.StartedAt.SET(pgtable.RuntimeRecords.EXCLUDED.StartedAt),
pgtable.RuntimeRecords.StoppedAt.SET(pgtable.RuntimeRecords.EXCLUDED.StoppedAt),
pgtable.RuntimeRecords.RemovedAt.SET(pgtable.RuntimeRecords.EXCLUDED.RemovedAt),
pgtable.RuntimeRecords.LastOpAt.SET(pgtable.RuntimeRecords.EXCLUDED.LastOpAt),
),
)
query, args := stmt.Sql()
if _, err := store.db.ExecContext(operationCtx, query, args...); err != nil {
return fmt.Errorf("upsert runtime record: %w", err)
}
return nil
}
// UpdateStatus applies one status transition with a compare-and-swap
// guard on (status, current_container_id). input.Validate is invoked
// before any SQL is issued.
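//
// For the running → stopped transition the generated statement is
// roughly the following (illustrative shape, not verbatim go-jet
// output):
//
//	UPDATE runtime_records
//	SET status = 'stopped', last_op_at = <now>, stopped_at = <now>
//	WHERE game_id = <game_id>
//	  AND status = 'running'
//	  AND current_container_id = <expected_container_id>;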
func (store *Store) UpdateStatus(ctx context.Context, input ports.UpdateStatusInput) error {
if store == nil || store.db == nil {
return errors.New("update runtime status: nil store")
}
if err := input.Validate(); err != nil {
return err
}
operationCtx, cancel, err := sqlx.WithTimeout(ctx, "update runtime status", store.operationTimeout)
if err != nil {
return err
}
defer cancel()
now := input.Now.UTC()
stmt, err := buildUpdateStatusStatement(input, now)
if err != nil {
return err
}
query, args := stmt.Sql()
result, err := store.db.ExecContext(operationCtx, query, args...)
if err != nil {
return fmt.Errorf("update runtime status: %w", err)
}
affected, err := result.RowsAffected()
if err != nil {
return fmt.Errorf("update runtime status: rows affected: %w", err)
}
if affected == 0 {
return store.classifyMissingUpdate(operationCtx, input.GameID)
}
return nil
}
// classifyMissingUpdate distinguishes ErrNotFound from ErrConflict after
// an UPDATE that affected zero rows. A row that is absent yields
// ErrNotFound; a row whose status or container_id does not match the
// CAS predicate yields ErrConflict.
func (store *Store) classifyMissingUpdate(ctx context.Context, gameID string) error {
probe := pg.SELECT(pgtable.RuntimeRecords.Status).
FROM(pgtable.RuntimeRecords).
WHERE(pgtable.RuntimeRecords.GameID.EQ(pg.String(gameID)))
probeQuery, probeArgs := probe.Sql()
var current string
row := store.db.QueryRowContext(ctx, probeQuery, probeArgs...)
if err := row.Scan(&current); err != nil {
if sqlx.IsNoRows(err) {
return runtime.ErrNotFound
}
return fmt.Errorf("update runtime status: probe: %w", err)
}
return runtime.ErrConflict
}
// buildUpdateStatusStatement assembles the UPDATE statement applied for
// one runtime-status transition.
//
// status and last_op_at are always updated. The remaining columns are
// driven by the destination:
//
// - StatusStopped: stopped_at is captured at Now.
// - StatusRemoved: removed_at is captured at Now and current_container_id
// is NULLed (the container is gone; the prior id remains observable
// through operation_log).
// - StatusRunning: only status + last_op_at change. Fresh started_at
// and current_container_id are installed via Upsert before any
// stopped → running transition reaches this path; the path exists
// so runtime.AllowedTransitions stays one-to-one with the adapter
// capability matrix even though v1 services use Upsert for this
// case.
func buildUpdateStatusStatement(input ports.UpdateStatusInput, now time.Time) (pg.UpdateStatement, error) {
statusValue := pg.String(string(input.To))
nowValue := pg.TimestampzT(now)
var stmt pg.UpdateStatement
switch input.To {
case runtime.StatusStopped:
stmt = pgtable.RuntimeRecords.UPDATE(
pgtable.RuntimeRecords.Status,
pgtable.RuntimeRecords.LastOpAt,
pgtable.RuntimeRecords.StoppedAt,
).SET(
statusValue,
nowValue,
nowValue,
)
case runtime.StatusRemoved:
stmt = pgtable.RuntimeRecords.UPDATE(
pgtable.RuntimeRecords.Status,
pgtable.RuntimeRecords.LastOpAt,
pgtable.RuntimeRecords.RemovedAt,
pgtable.RuntimeRecords.CurrentContainerID,
).SET(
statusValue,
nowValue,
nowValue,
pg.NULL,
)
case runtime.StatusRunning:
stmt = pgtable.RuntimeRecords.UPDATE(
pgtable.RuntimeRecords.Status,
pgtable.RuntimeRecords.LastOpAt,
).SET(
statusValue,
nowValue,
)
default:
return nil, fmt.Errorf("update runtime status: destination status %q is unsupported", input.To)
}
whereExpr := pg.AND(
pgtable.RuntimeRecords.GameID.EQ(pg.String(input.GameID)),
pgtable.RuntimeRecords.Status.EQ(pg.String(string(input.ExpectedFrom))),
)
if input.ExpectedContainerID != "" {
whereExpr = pg.AND(
whereExpr,
pgtable.RuntimeRecords.CurrentContainerID.EQ(pg.String(input.ExpectedContainerID)),
)
}
return stmt.WHERE(whereExpr), nil
}
// ListByStatus returns every record currently indexed under status.
// Ordering is last_op_at DESC, game_id ASC; the read is driven by the
// `runtime_records_status_last_op_idx` index.
func (store *Store) ListByStatus(ctx context.Context, status runtime.Status) ([]runtime.RuntimeRecord, error) {
if store == nil || store.db == nil {
return nil, errors.New("list runtime records by status: nil store")
}
if !status.IsKnown() {
return nil, fmt.Errorf("list runtime records by status: status %q is unsupported", status)
}
operationCtx, cancel, err := sqlx.WithTimeout(ctx, "list runtime records by status", store.operationTimeout)
if err != nil {
return nil, err
}
defer cancel()
stmt := pg.SELECT(runtimeSelectColumns).
FROM(pgtable.RuntimeRecords).
WHERE(pgtable.RuntimeRecords.Status.EQ(pg.String(string(status)))).
ORDER_BY(pgtable.RuntimeRecords.LastOpAt.DESC(), pgtable.RuntimeRecords.GameID.ASC())
query, args := stmt.Sql()
rows, err := store.db.QueryContext(operationCtx, query, args...)
if err != nil {
return nil, fmt.Errorf("list runtime records by status: %w", err)
}
defer rows.Close()
records := make([]runtime.RuntimeRecord, 0)
for rows.Next() {
record, err := scanRecord(rows)
if err != nil {
return nil, fmt.Errorf("list runtime records by status: scan: %w", err)
}
records = append(records, record)
}
if err := rows.Err(); err != nil {
return nil, fmt.Errorf("list runtime records by status: %w", err)
}
if len(records) == 0 {
return nil, nil
}
return records, nil
}
// List returns every runtime record currently stored. Ordering matches
// ListByStatus — last_op_at DESC, game_id ASC — so the REST list
// endpoint sees the freshest activity first.
func (store *Store) List(ctx context.Context) ([]runtime.RuntimeRecord, error) {
if store == nil || store.db == nil {
return nil, errors.New("list runtime records: nil store")
}
operationCtx, cancel, err := sqlx.WithTimeout(ctx, "list runtime records", store.operationTimeout)
if err != nil {
return nil, err
}
defer cancel()
stmt := pg.SELECT(runtimeSelectColumns).
FROM(pgtable.RuntimeRecords).
ORDER_BY(pgtable.RuntimeRecords.LastOpAt.DESC(), pgtable.RuntimeRecords.GameID.ASC())
query, args := stmt.Sql()
rows, err := store.db.QueryContext(operationCtx, query, args...)
if err != nil {
return nil, fmt.Errorf("list runtime records: %w", err)
}
defer rows.Close()
records := make([]runtime.RuntimeRecord, 0)
for rows.Next() {
record, err := scanRecord(rows)
if err != nil {
return nil, fmt.Errorf("list runtime records: scan: %w", err)
}
records = append(records, record)
}
if err := rows.Err(); err != nil {
return nil, fmt.Errorf("list runtime records: %w", err)
}
if len(records) == 0 {
return nil, nil
}
return records, nil
}
// CountByStatus returns the number of records indexed under each status.
// Statuses with zero records are present in the result with a zero
// count so callers (e.g. the telemetry gauge) can publish a stable
// label set on every reading.
func (store *Store) CountByStatus(ctx context.Context) (map[runtime.Status]int, error) {
if store == nil || store.db == nil {
return nil, errors.New("count runtime records by status: nil store")
}
operationCtx, cancel, err := sqlx.WithTimeout(ctx, "count runtime records by status", store.operationTimeout)
if err != nil {
return nil, err
}
defer cancel()
countAlias := pg.COUNT(pg.STAR).AS("count")
stmt := pg.SELECT(pgtable.RuntimeRecords.Status, countAlias).
FROM(pgtable.RuntimeRecords).
GROUP_BY(pgtable.RuntimeRecords.Status)
query, args := stmt.Sql()
rows, err := store.db.QueryContext(operationCtx, query, args...)
if err != nil {
return nil, fmt.Errorf("count runtime records by status: %w", err)
}
defer rows.Close()
counts := make(map[runtime.Status]int, len(runtime.AllStatuses()))
for _, status := range runtime.AllStatuses() {
counts[status] = 0
}
for rows.Next() {
var status string
var count int
if err := rows.Scan(&status, &count); err != nil {
return nil, fmt.Errorf("count runtime records by status: scan: %w", err)
}
counts[runtime.Status(status)] = count
}
if err := rows.Err(); err != nil {
return nil, fmt.Errorf("count runtime records by status: %w", err)
}
return counts, nil
}
// rowScanner abstracts *sql.Row and *sql.Rows so scanRecord can be shared
// across both single-row reads and iterated reads.
type rowScanner interface {
Scan(dest ...any) error
}
// scanRecord scans one runtime_records row from rs. Returns sql.ErrNoRows
// verbatim so callers can distinguish "no row" from a hard error.
func scanRecord(rs rowScanner) (runtime.RuntimeRecord, error) {
var (
gameID string
status string
currentContainerID sql.NullString
currentImageRef sql.NullString
engineEndpoint string
statePath string
dockerNetwork string
startedAt sql.NullTime
stoppedAt sql.NullTime
removedAt sql.NullTime
lastOpAt time.Time
createdAt time.Time
)
if err := rs.Scan(
&gameID,
&status,
&currentContainerID,
&currentImageRef,
&engineEndpoint,
&statePath,
&dockerNetwork,
&startedAt,
&stoppedAt,
&removedAt,
&lastOpAt,
&createdAt,
); err != nil {
return runtime.RuntimeRecord{}, err
}
return runtime.RuntimeRecord{
GameID: gameID,
Status: runtime.Status(status),
CurrentContainerID: sqlx.StringFromNullable(currentContainerID),
CurrentImageRef: sqlx.StringFromNullable(currentImageRef),
EngineEndpoint: engineEndpoint,
StatePath: statePath,
DockerNetwork: dockerNetwork,
StartedAt: sqlx.TimePtrFromNullable(startedAt),
StoppedAt: sqlx.TimePtrFromNullable(stoppedAt),
RemovedAt: sqlx.TimePtrFromNullable(removedAt),
LastOpAt: lastOpAt.UTC(),
CreatedAt: createdAt.UTC(),
}, nil
}
// Ensure Store satisfies the ports.RuntimeRecordStore interface at
// compile time.
var _ ports.RuntimeRecordStore = (*Store)(nil)
@@ -0,0 +1,420 @@
package runtimerecordstore_test
import (
"context"
"errors"
"sync"
"testing"
"time"
"galaxy/rtmanager/internal/adapters/postgres/internal/pgtest"
"galaxy/rtmanager/internal/adapters/postgres/runtimerecordstore"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/ports"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestMain(m *testing.M) { pgtest.RunMain(m) }
func newStore(t *testing.T) *runtimerecordstore.Store {
t.Helper()
pgtest.TruncateAll(t)
store, err := runtimerecordstore.New(runtimerecordstore.Config{
DB: pgtest.Ensure(t).Pool(),
OperationTimeout: pgtest.OperationTimeout,
})
require.NoError(t, err)
return store
}
func runningRecord(t *testing.T, gameID, containerID, imageRef string) runtime.RuntimeRecord {
t.Helper()
now := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)
started := now
return runtime.RuntimeRecord{
GameID: gameID,
Status: runtime.StatusRunning,
CurrentContainerID: containerID,
CurrentImageRef: imageRef,
EngineEndpoint: "http://galaxy-game-" + gameID + ":8080",
StatePath: "/var/lib/galaxy/games/" + gameID,
DockerNetwork: "galaxy-net",
StartedAt: &started,
LastOpAt: now,
CreatedAt: now,
}
}
func TestUpsertAndGetRoundTrip(t *testing.T) {
ctx := context.Background()
store := newStore(t)
record := runningRecord(t, "game-001", "container-1", "galaxy/game:v1.2.3")
require.NoError(t, store.Upsert(ctx, record))
got, err := store.Get(ctx, record.GameID)
require.NoError(t, err)
assert.Equal(t, record.GameID, got.GameID)
assert.Equal(t, record.Status, got.Status)
assert.Equal(t, record.CurrentContainerID, got.CurrentContainerID)
assert.Equal(t, record.CurrentImageRef, got.CurrentImageRef)
assert.Equal(t, record.EngineEndpoint, got.EngineEndpoint)
assert.Equal(t, record.StatePath, got.StatePath)
assert.Equal(t, record.DockerNetwork, got.DockerNetwork)
require.NotNil(t, got.StartedAt)
assert.True(t, record.StartedAt.Equal(*got.StartedAt))
assert.Equal(t, time.UTC, got.StartedAt.Location())
assert.Equal(t, time.UTC, got.LastOpAt.Location())
assert.Equal(t, time.UTC, got.CreatedAt.Location())
assert.Nil(t, got.StoppedAt)
assert.Nil(t, got.RemovedAt)
}
func TestGetReturnsNotFound(t *testing.T) {
ctx := context.Background()
store := newStore(t)
_, err := store.Get(ctx, "game-missing")
require.ErrorIs(t, err, runtime.ErrNotFound)
}
func TestUpsertOverwritesMutableColumnsPreservesCreatedAt(t *testing.T) {
ctx := context.Background()
store := newStore(t)
original := runningRecord(t, "game-001", "container-1", "galaxy/game:v1.2.3")
require.NoError(t, store.Upsert(ctx, original))
updated := original
updated.CurrentContainerID = "container-2"
updated.CurrentImageRef = "galaxy/game:v1.2.4"
newStarted := original.LastOpAt.Add(time.Minute)
updated.StartedAt = &newStarted
updated.LastOpAt = newStarted
// Fresh CreatedAt simulates a caller passing "now"; the store must
// preserve the original CreatedAt value on conflict.
updated.CreatedAt = newStarted
require.NoError(t, store.Upsert(ctx, updated))
got, err := store.Get(ctx, original.GameID)
require.NoError(t, err)
assert.Equal(t, "container-2", got.CurrentContainerID)
assert.Equal(t, "galaxy/game:v1.2.4", got.CurrentImageRef)
assert.True(t, got.LastOpAt.Equal(newStarted))
assert.True(t, got.CreatedAt.Equal(original.CreatedAt),
"created_at must be preserved across upserts: got %s, want %s",
got.CreatedAt, original.CreatedAt)
}
func TestUpdateStatusRunningToStopped(t *testing.T) {
ctx := context.Background()
store := newStore(t)
record := runningRecord(t, "game-001", "container-1", "galaxy/game:v1.2.3")
require.NoError(t, store.Upsert(ctx, record))
now := record.LastOpAt.Add(2 * time.Minute)
require.NoError(t, store.UpdateStatus(ctx, ports.UpdateStatusInput{
GameID: record.GameID,
ExpectedFrom: runtime.StatusRunning,
ExpectedContainerID: record.CurrentContainerID,
To: runtime.StatusStopped,
Now: now,
}))
got, err := store.Get(ctx, record.GameID)
require.NoError(t, err)
assert.Equal(t, runtime.StatusStopped, got.Status)
require.NotNil(t, got.StoppedAt)
assert.True(t, now.Equal(*got.StoppedAt))
assert.True(t, now.Equal(got.LastOpAt))
// container id is preserved on stop; cleanup later NULLs it.
assert.Equal(t, record.CurrentContainerID, got.CurrentContainerID)
}
func TestUpdateStatusRunningToRemovedClearsContainerID(t *testing.T) {
ctx := context.Background()
store := newStore(t)
record := runningRecord(t, "game-001", "container-1", "galaxy/game:v1.2.3")
require.NoError(t, store.Upsert(ctx, record))
now := record.LastOpAt.Add(time.Minute)
require.NoError(t, store.UpdateStatus(ctx, ports.UpdateStatusInput{
GameID: record.GameID,
ExpectedFrom: runtime.StatusRunning,
To: runtime.StatusRemoved,
Now: now,
}))
got, err := store.Get(ctx, record.GameID)
require.NoError(t, err)
assert.Equal(t, runtime.StatusRemoved, got.Status)
require.NotNil(t, got.RemovedAt)
assert.True(t, now.Equal(*got.RemovedAt))
assert.True(t, now.Equal(got.LastOpAt))
assert.Empty(t, got.CurrentContainerID, "current_container_id must be NULL after removal")
}
func TestUpdateStatusStoppedToRemoved(t *testing.T) {
ctx := context.Background()
store := newStore(t)
record := runningRecord(t, "game-001", "container-1", "galaxy/game:v1.2.3")
require.NoError(t, store.Upsert(ctx, record))
stopAt := record.LastOpAt.Add(time.Minute)
require.NoError(t, store.UpdateStatus(ctx, ports.UpdateStatusInput{
GameID: record.GameID,
ExpectedFrom: runtime.StatusRunning,
To: runtime.StatusStopped,
Now: stopAt,
}))
removeAt := stopAt.Add(time.Hour)
require.NoError(t, store.UpdateStatus(ctx, ports.UpdateStatusInput{
GameID: record.GameID,
ExpectedFrom: runtime.StatusStopped,
To: runtime.StatusRemoved,
Now: removeAt,
}))
got, err := store.Get(ctx, record.GameID)
require.NoError(t, err)
assert.Equal(t, runtime.StatusRemoved, got.Status)
require.NotNil(t, got.RemovedAt)
assert.True(t, removeAt.Equal(*got.RemovedAt))
assert.True(t, removeAt.Equal(got.LastOpAt))
require.NotNil(t, got.StoppedAt, "stopped_at must remain populated through removal")
assert.True(t, stopAt.Equal(*got.StoppedAt))
assert.Empty(t, got.CurrentContainerID)
}
func TestUpdateStatusReturnsConflictOnFromMismatch(t *testing.T) {
ctx := context.Background()
store := newStore(t)
record := runningRecord(t, "game-001", "container-1", "galaxy/game:v1.2.3")
require.NoError(t, store.Upsert(ctx, record))
err := store.UpdateStatus(ctx, ports.UpdateStatusInput{
GameID: record.GameID,
ExpectedFrom: runtime.StatusStopped, // wrong
To: runtime.StatusRemoved,
Now: record.LastOpAt.Add(time.Minute),
})
require.ErrorIs(t, err, runtime.ErrConflict)
}
func TestUpdateStatusReturnsConflictOnContainerIDMismatch(t *testing.T) {
ctx := context.Background()
store := newStore(t)
record := runningRecord(t, "game-001", "container-1", "galaxy/game:v1.2.3")
require.NoError(t, store.Upsert(ctx, record))
err := store.UpdateStatus(ctx, ports.UpdateStatusInput{
GameID: record.GameID,
ExpectedFrom: runtime.StatusRunning,
ExpectedContainerID: "container-other",
To: runtime.StatusStopped,
Now: record.LastOpAt.Add(time.Minute),
})
require.ErrorIs(t, err, runtime.ErrConflict)
}
func TestUpdateStatusReturnsNotFoundForMissing(t *testing.T) {
ctx := context.Background()
store := newStore(t)
err := store.UpdateStatus(ctx, ports.UpdateStatusInput{
GameID: "game-missing",
ExpectedFrom: runtime.StatusRunning,
To: runtime.StatusStopped,
Now: time.Now().UTC(),
})
require.ErrorIs(t, err, runtime.ErrNotFound)
}
func TestUpdateStatusValidatesInputBeforeStore(t *testing.T) {
ctx := context.Background()
store := newStore(t)
err := store.UpdateStatus(ctx, ports.UpdateStatusInput{
GameID: "game-001",
ExpectedFrom: runtime.StatusRunning,
To: runtime.StatusStopped,
// Now intentionally zero — validation must reject.
})
require.Error(t, err)
}
// TestUpdateStatusConcurrentCAS asserts the CAS guard: when two callers
// race to apply the running → stopped transition on the same row,
// exactly one wins (returns nil) and the other observes
// runtime.ErrConflict.
func TestUpdateStatusConcurrentCAS(t *testing.T) {
ctx := context.Background()
store := newStore(t)
record := runningRecord(t, "game-001", "container-1", "galaxy/game:v1.2.3")
require.NoError(t, store.Upsert(ctx, record))
const concurrency = 8
results := make([]error, concurrency)
var wg sync.WaitGroup
wg.Add(concurrency)
for index := range concurrency {
go func() {
defer wg.Done()
results[index] = store.UpdateStatus(ctx, ports.UpdateStatusInput{
GameID: record.GameID,
ExpectedFrom: runtime.StatusRunning,
ExpectedContainerID: record.CurrentContainerID,
To: runtime.StatusStopped,
Now: record.LastOpAt.Add(time.Duration(index+1) * time.Second),
})
}()
}
wg.Wait()
wins, conflicts := 0, 0
for _, err := range results {
switch {
case err == nil:
wins++
case errors.Is(err, runtime.ErrConflict):
conflicts++
default:
t.Errorf("unexpected error from concurrent UpdateStatus: %v", err)
}
}
assert.Equal(t, 1, wins, "exactly one caller must win the CAS race")
assert.Equal(t, concurrency-1, conflicts, "the rest must observe runtime.ErrConflict")
}
func TestListByStatusReturnsExpectedRecords(t *testing.T) {
ctx := context.Background()
store := newStore(t)
a := runningRecord(t, "game-aaa", "container-a", "galaxy/game:v1.2.3")
b := runningRecord(t, "game-bbb", "container-b", "galaxy/game:v1.2.3")
c := runningRecord(t, "game-ccc", "container-c", "galaxy/game:v1.2.3")
for _, r := range []runtime.RuntimeRecord{a, b, c} {
require.NoError(t, store.Upsert(ctx, r))
}
stopAt := a.LastOpAt.Add(time.Minute)
require.NoError(t, store.UpdateStatus(ctx, ports.UpdateStatusInput{
GameID: b.GameID,
ExpectedFrom: runtime.StatusRunning,
To: runtime.StatusStopped,
Now: stopAt,
}))
running, err := store.ListByStatus(ctx, runtime.StatusRunning)
require.NoError(t, err)
gotIDs := map[string]struct{}{}
for _, r := range running {
gotIDs[r.GameID] = struct{}{}
}
assert.Contains(t, gotIDs, a.GameID)
assert.Contains(t, gotIDs, c.GameID)
assert.NotContains(t, gotIDs, b.GameID)
stopped, err := store.ListByStatus(ctx, runtime.StatusStopped)
require.NoError(t, err)
require.Len(t, stopped, 1)
assert.Equal(t, b.GameID, stopped[0].GameID)
}
func TestListByStatusRejectsUnknown(t *testing.T) {
ctx := context.Background()
store := newStore(t)
_, err := store.ListByStatus(ctx, runtime.Status("exotic"))
require.Error(t, err)
}
func TestListReturnsEveryStatus(t *testing.T) {
ctx := context.Background()
store := newStore(t)
a := runningRecord(t, "game-aaa", "container-a", "galaxy/game:v1.2.3")
b := runningRecord(t, "game-bbb", "container-b", "galaxy/game:v1.2.3")
c := runningRecord(t, "game-ccc", "container-c", "galaxy/game:v1.2.3")
for _, r := range []runtime.RuntimeRecord{a, b, c} {
require.NoError(t, store.Upsert(ctx, r))
}
require.NoError(t, store.UpdateStatus(ctx, ports.UpdateStatusInput{
GameID: b.GameID,
ExpectedFrom: runtime.StatusRunning,
To: runtime.StatusStopped,
Now: b.LastOpAt.Add(time.Minute),
}))
all, err := store.List(ctx)
require.NoError(t, err)
require.Len(t, all, 3)
gotIDs := map[string]runtime.Status{}
for _, r := range all {
gotIDs[r.GameID] = r.Status
}
assert.Equal(t, runtime.StatusRunning, gotIDs[a.GameID])
assert.Equal(t, runtime.StatusStopped, gotIDs[b.GameID])
assert.Equal(t, runtime.StatusRunning, gotIDs[c.GameID])
}
func TestListReturnsNilWhenEmpty(t *testing.T) {
ctx := context.Background()
store := newStore(t)
all, err := store.List(ctx)
require.NoError(t, err)
assert.Nil(t, all)
}
func TestCountByStatusReturnsAllBuckets(t *testing.T) {
ctx := context.Background()
store := newStore(t)
a := runningRecord(t, "game-1", "container-1", "galaxy/game:v1.2.3")
b := runningRecord(t, "game-2", "container-2", "galaxy/game:v1.2.3")
c := runningRecord(t, "game-3", "container-3", "galaxy/game:v1.2.3")
for _, r := range []runtime.RuntimeRecord{a, b, c} {
require.NoError(t, store.Upsert(ctx, r))
}
require.NoError(t, store.UpdateStatus(ctx, ports.UpdateStatusInput{
GameID: b.GameID,
ExpectedFrom: runtime.StatusRunning,
To: runtime.StatusStopped,
Now: b.LastOpAt.Add(time.Minute),
}))
counts, err := store.CountByStatus(ctx)
require.NoError(t, err)
for _, status := range runtime.AllStatuses() {
_, ok := counts[status]
assert.True(t, ok, "status %q must appear in counts even when zero", status)
}
assert.Equal(t, 2, counts[runtime.StatusRunning])
assert.Equal(t, 1, counts[runtime.StatusStopped])
assert.Equal(t, 0, counts[runtime.StatusRemoved])
}
func TestNewRejectsNilDB(t *testing.T) {
_, err := runtimerecordstore.New(runtimerecordstore.Config{OperationTimeout: time.Second})
require.Error(t, err)
}
func TestNewRejectsNonPositiveTimeout(t *testing.T) {
_, err := runtimerecordstore.New(runtimerecordstore.Config{
DB: pgtest.Ensure(t).Pool(),
})
require.Error(t, err)
}
@@ -0,0 +1,117 @@
// Package gamelease implements the Redis-backed adapter for
// `ports.GameLeaseStore`.
//
// The lease guards every lifecycle operation Runtime Manager runs
// against one game (start, stop, restart, patch, cleanup, plus the
// reconciler's drift mutations). Acquisition uses `SET NX PX <ttl>`
// with a random caller token; release runs a Lua compare-and-delete
// so a holder that lost the lease through TTL expiry cannot wipe
// another caller's claim.
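//
// A minimal usage sketch (the uuid token generator is an assumption;
// any sufficiently random value works):
//
//	token := uuid.NewString()
//	acquired, err := leases.TryAcquire(ctx, gameID, token, 30*time.Second)
//	if err != nil || !acquired {
//		return // transport failure, or another caller holds the lease
//	}
//	defer func() { _ = leases.Release(ctx, gameID, token) }()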
package gamelease
import (
"context"
"errors"
"fmt"
"strings"
"time"
"galaxy/rtmanager/internal/adapters/redisstate"
"galaxy/rtmanager/internal/ports"
"github.com/redis/go-redis/v9"
)
// releaseScript removes the per-game lease only when the supplied token
// still owns it. Compare-and-delete prevents a TTL-expired holder from
// clearing another caller's claim.
var releaseScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
return redis.call("DEL", KEYS[1])
end
return 0
`)
// Config configures one Redis-backed game lease store instance. The
// store does not own the redis client lifecycle; the caller (typically
// the service runtime) opens and closes it.
type Config struct {
// Client stores the Redis client the store uses for every command.
Client *redis.Client
}
// Store persists the per-game lifecycle lease in Redis.
type Store struct {
client *redis.Client
keys redisstate.Keyspace
}
// New constructs one Redis-backed game lease store from cfg.
func New(cfg Config) (*Store, error) {
if cfg.Client == nil {
return nil, errors.New("new rtmanager game lease store: nil redis client")
}
return &Store{
client: cfg.Client,
keys: redisstate.Keyspace{},
}, nil
}
// TryAcquire attempts to acquire the per-game lease for gameID owned by
// token for ttl. The boolean return is true on a successful claim and
// false when another caller still owns the lease. A non-nil error
// reports a transport failure and must not be read as a failed
// acquisition.
func (store *Store) TryAcquire(ctx context.Context, gameID, token string, ttl time.Duration) (bool, error) {
if store == nil || store.client == nil {
return false, errors.New("try acquire game lease: nil store")
}
if ctx == nil {
return false, errors.New("try acquire game lease: nil context")
}
if strings.TrimSpace(gameID) == "" {
return false, errors.New("try acquire game lease: game id must not be empty")
}
if strings.TrimSpace(token) == "" {
return false, errors.New("try acquire game lease: token must not be empty")
}
if ttl <= 0 {
return false, errors.New("try acquire game lease: ttl must be positive")
}
acquired, err := store.client.SetNX(ctx, store.keys.GameLease(gameID), token, ttl).Result()
if err != nil {
return false, fmt.Errorf("try acquire game lease: %w", err)
}
return acquired, nil
}
// Release removes the per-game lease for gameID only when token still
// matches the stored owner value. A token mismatch is a silent no-op.
func (store *Store) Release(ctx context.Context, gameID, token string) error {
if store == nil || store.client == nil {
return errors.New("release game lease: nil store")
}
if ctx == nil {
return errors.New("release game lease: nil context")
}
if strings.TrimSpace(gameID) == "" {
return errors.New("release game lease: game id must not be empty")
}
if strings.TrimSpace(token) == "" {
return errors.New("release game lease: token must not be empty")
}
if err := releaseScript.Run(
ctx,
store.client,
[]string{store.keys.GameLease(gameID)},
token,
).Err(); err != nil {
return fmt.Errorf("release game lease: %w", err)
}
return nil
}
// Compile-time assertion: Store implements ports.GameLeaseStore.
var _ ports.GameLeaseStore = (*Store)(nil)
@@ -0,0 +1,133 @@
package gamelease_test
import (
"context"
"testing"
"time"
"galaxy/rtmanager/internal/adapters/redisstate"
"galaxy/rtmanager/internal/adapters/redisstate/gamelease"
"github.com/alicebob/miniredis/v2"
"github.com/redis/go-redis/v9"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func newLeaseStore(t *testing.T) (*gamelease.Store, *miniredis.Miniredis) {
t.Helper()
server := miniredis.RunT(t)
client := redis.NewClient(&redis.Options{Addr: server.Addr()})
t.Cleanup(func() { _ = client.Close() })
store, err := gamelease.New(gamelease.Config{Client: client})
require.NoError(t, err)
return store, server
}
func TestNewRejectsNilClient(t *testing.T) {
_, err := gamelease.New(gamelease.Config{})
require.Error(t, err)
}
func TestTryAcquireSetsKeyAndTTL(t *testing.T) {
store, server := newLeaseStore(t)
acquired, err := store.TryAcquire(context.Background(), "game-1", "token-A", time.Minute)
require.NoError(t, err)
assert.True(t, acquired)
key := redisstate.Keyspace{}.GameLease("game-1")
assert.True(t, server.Exists(key), "key %q must exist after TryAcquire", key)
stored, err := server.Get(key)
require.NoError(t, err)
assert.Equal(t, "token-A", stored)
// TTL must be positive (miniredis returns the remaining duration).
ttl := server.TTL(key)
assert.Greater(t, ttl, time.Duration(0))
}
func TestTryAcquireReturnsFalseWhenAlreadyHeld(t *testing.T) {
store, _ := newLeaseStore(t)
acquired, err := store.TryAcquire(context.Background(), "game-1", "token-A", time.Minute)
require.NoError(t, err)
require.True(t, acquired)
acquired, err = store.TryAcquire(context.Background(), "game-1", "token-B", time.Minute)
require.NoError(t, err)
assert.False(t, acquired)
}
func TestReleaseRemovesKeyForOwnerToken(t *testing.T) {
store, server := newLeaseStore(t)
_, err := store.TryAcquire(context.Background(), "game-1", "token-A", time.Minute)
require.NoError(t, err)
require.NoError(t, store.Release(context.Background(), "game-1", "token-A"))
key := redisstate.Keyspace{}.GameLease("game-1")
assert.False(t, server.Exists(key), "key %q must be deleted after Release", key)
}
func TestReleaseIsNoOpForForeignToken(t *testing.T) {
store, server := newLeaseStore(t)
_, err := store.TryAcquire(context.Background(), "game-1", "token-A", time.Minute)
require.NoError(t, err)
require.NoError(t, store.Release(context.Background(), "game-1", "token-B"))
key := redisstate.Keyspace{}.GameLease("game-1")
assert.True(t, server.Exists(key), "key %q must still exist when foreign token is released", key)
stored, err := server.Get(key)
require.NoError(t, err)
assert.Equal(t, "token-A", stored)
}
func TestTryAcquireSucceedsAfterTTLExpiry(t *testing.T) {
store, server := newLeaseStore(t)
acquired, err := store.TryAcquire(context.Background(), "game-1", "token-A", time.Minute)
require.NoError(t, err)
require.True(t, acquired)
server.FastForward(2 * time.Minute)
acquired, err = store.TryAcquire(context.Background(), "game-1", "token-B", time.Minute)
require.NoError(t, err)
assert.True(t, acquired)
}
func TestTryAcquireRejectsInvalidArguments(t *testing.T) {
store, _ := newLeaseStore(t)
_, err := store.TryAcquire(context.Background(), "", "token", time.Minute)
require.Error(t, err)
_, err = store.TryAcquire(context.Background(), "game-1", "", time.Minute)
require.Error(t, err)
_, err = store.TryAcquire(context.Background(), "game-1", "token", 0)
require.Error(t, err)
}
func TestReleaseRejectsInvalidArguments(t *testing.T) {
store, _ := newLeaseStore(t)
require.Error(t, store.Release(context.Background(), "", "token"))
require.Error(t, store.Release(context.Background(), "game-1", ""))
}
func TestKeyspaceGameLeaseIsPrefixedAndEncoded(t *testing.T) {
key := redisstate.Keyspace{}.GameLease("game with spaces")
assert.NotEmpty(t, key)
assert.Contains(t, key, "rtmanager:game_lease:")
suffix := key[len("rtmanager:game_lease:"):]
// base64url-encoded suffix must not contain the original spaces.
assert.NotContains(t, suffix, " ")
}
@@ -0,0 +1,44 @@
// Package redisstate hosts the Runtime Manager Redis adapters that share
// a single keyspace. Each sibling subpackage (e.g. `streamoffsets`)
// implements one port and uses Keyspace to compose its keys, so the
// Redis namespace stays under one document and one prefix.
//
// The package itself only declares the keyspace; concrete stores live in
// nested packages so dependencies (testcontainers, miniredis) stay out
// of consumer build graphs that do not need them.
package redisstate
import "encoding/base64"
// defaultPrefix is the mandatory `rtmanager:` namespace prefix shared by
// every Runtime Manager Redis key.
const defaultPrefix = "rtmanager:"
// Keyspace builds the Runtime Manager Redis keys. The namespace covers
// the stream consumer offsets and the per-game lifecycle lease in v1.
//
// Dynamic key segments are encoded with base64url so raw key structure
// does not depend on caller-provided characters; this matches the
// encoding chosen by `lobby/internal/adapters/redisstate.Keyspace`.
type Keyspace struct{}
// StreamOffset returns the Redis key that stores the last successfully
// processed entry id for one Redis Stream consumer. The streamLabel is
// the short logical identifier of the consumer (e.g. `start_jobs`,
// `stop_jobs`), not the full stream name; it stays stable when the
// underlying stream key is renamed.
func (Keyspace) StreamOffset(streamLabel string) string {
return defaultPrefix + "stream_offsets:" + encodeKeyComponent(streamLabel)
}
// GameLease returns the Redis key that stores the per-game lifecycle
// lease guarding start / stop / restart / patch / cleanup operations
// against the same game. The gameID is base64url-encoded so callers can
// pass any opaque identifier without escaping raw key characters.
func (Keyspace) GameLease(gameID string) string {
return defaultPrefix + "game_lease:" + encodeKeyComponent(gameID)
}
func encodeKeyComponent(value string) string {
return base64.RawURLEncoding.EncodeToString([]byte(value))
}
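// For illustration, the unpadded base64url encoding yields keys such as:
//
//	Keyspace{}.GameLease("game-1")        // "rtmanager:game_lease:Z2FtZS0x"
//	Keyspace{}.StreamOffset("start_jobs") // "rtmanager:stream_offsets:c3RhcnRfam9icw"
//
// The encoded segment never contains ':' or '/', so the prefix structure
// stays parseable regardless of the caller-provided identifier.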
@@ -0,0 +1,94 @@
// Package streamoffsets implements the Redis-backed adapter for
// `ports.StreamOffsetStore`.
//
// The start-jobs and stop-jobs consumers call Load on startup to
// resume from the persisted offset and Save after every successful
// message handling. Keys are produced by
// `redisstate.Keyspace.StreamOffset`, mirroring the lobby pattern.
package streamoffsets
import (
"context"
"errors"
"fmt"
"strings"
"galaxy/rtmanager/internal/adapters/redisstate"
"galaxy/rtmanager/internal/ports"
"github.com/redis/go-redis/v9"
)
// Config configures one Redis-backed stream-offset store instance. The
// store does not own the redis client lifecycle; the caller (typically
// the service runtime) opens and closes it.
type Config struct {
// Client stores the Redis client the store uses for every command.
Client *redis.Client
}
// Store persists Runtime Manager stream consumer offsets in Redis.
type Store struct {
client *redis.Client
keys redisstate.Keyspace
}
// New constructs one Redis-backed stream-offset store from cfg.
func New(cfg Config) (*Store, error) {
if cfg.Client == nil {
return nil, errors.New("new rtmanager stream offset store: nil redis client")
}
return &Store{
client: cfg.Client,
keys: redisstate.Keyspace{},
}, nil
}
// Load returns the last processed entry id for streamLabel when one is
// stored. A missing key returns ("", false, nil).
func (store *Store) Load(ctx context.Context, streamLabel string) (string, bool, error) {
if store == nil || store.client == nil {
return "", false, errors.New("load rtmanager stream offset: nil store")
}
if ctx == nil {
return "", false, errors.New("load rtmanager stream offset: nil context")
}
if strings.TrimSpace(streamLabel) == "" {
return "", false, errors.New("load rtmanager stream offset: stream label must not be empty")
}
value, err := store.client.Get(ctx, store.keys.StreamOffset(streamLabel)).Result()
switch {
case errors.Is(err, redis.Nil):
return "", false, nil
case err != nil:
return "", false, fmt.Errorf("load rtmanager stream offset: %w", err)
}
return value, true, nil
}
// Save stores entryID as the new offset for streamLabel. The key has no
// TTL — offsets are durable and only overwritten by subsequent Saves.
func (store *Store) Save(ctx context.Context, streamLabel, entryID string) error {
if store == nil || store.client == nil {
return errors.New("save rtmanager stream offset: nil store")
}
if ctx == nil {
return errors.New("save rtmanager stream offset: nil context")
}
if strings.TrimSpace(streamLabel) == "" {
return errors.New("save rtmanager stream offset: stream label must not be empty")
}
if strings.TrimSpace(entryID) == "" {
return errors.New("save rtmanager stream offset: entry id must not be empty")
}
if err := store.client.Set(ctx, store.keys.StreamOffset(streamLabel), entryID, 0).Err(); err != nil {
return fmt.Errorf("save rtmanager stream offset: %w", err)
}
return nil
}
// Ensure Store satisfies the ports.StreamOffsetStore interface at
// compile time.
var _ ports.StreamOffsetStore = (*Store)(nil)
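// Usage sketch (illustrative): a consumer resumes from the persisted offset
// on startup and saves after each successfully handled entry. The
// readNextEntry and handleEntry helpers below are hypothetical.
//
//	offset, _, err := store.Load(ctx, "start_jobs")
//	if err != nil {
//		return err
//	}
//	for {
//		entry, err := readNextEntry(ctx, offset) // blocks for the next stream entry
//		if err != nil {
//			return err
//		}
//		if err := handleEntry(ctx, entry); err != nil {
//			return err
//		}
//		if err := store.Save(ctx, "start_jobs", entry.ID); err != nil {
//			return err
//		}
//		offset = entry.ID
//	}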
@@ -0,0 +1,86 @@
package streamoffsets_test
import (
"context"
"testing"
"galaxy/rtmanager/internal/adapters/redisstate"
"galaxy/rtmanager/internal/adapters/redisstate/streamoffsets"
"github.com/alicebob/miniredis/v2"
"github.com/redis/go-redis/v9"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func newOffsetStore(t *testing.T) (*streamoffsets.Store, *miniredis.Miniredis) {
t.Helper()
server := miniredis.RunT(t)
client := redis.NewClient(&redis.Options{Addr: server.Addr()})
t.Cleanup(func() { _ = client.Close() })
store, err := streamoffsets.New(streamoffsets.Config{Client: client})
require.NoError(t, err)
return store, server
}
func TestNewRejectsNilClient(t *testing.T) {
_, err := streamoffsets.New(streamoffsets.Config{})
require.Error(t, err)
}
func TestLoadMissingReturnsNotFound(t *testing.T) {
store, _ := newOffsetStore(t)
id, found, err := store.Load(context.Background(), "start_jobs")
require.NoError(t, err)
assert.False(t, found)
assert.Empty(t, id)
}
func TestSaveLoadRoundTrip(t *testing.T) {
store, server := newOffsetStore(t)
require.NoError(t, store.Save(context.Background(), "start_jobs", "1700000000000-0"))
id, found, err := store.Load(context.Background(), "start_jobs")
require.NoError(t, err)
assert.True(t, found)
assert.Equal(t, "1700000000000-0", id)
// The persisted key must follow the rtmanager keyspace prefix.
expectedKey := redisstate.Keyspace{}.StreamOffset("start_jobs")
assert.True(t, server.Exists(expectedKey),
"key %q must exist after Save", expectedKey)
}
func TestSaveOverwritesPriorValue(t *testing.T) {
store, _ := newOffsetStore(t)
require.NoError(t, store.Save(context.Background(), "start_jobs", "100-0"))
require.NoError(t, store.Save(context.Background(), "start_jobs", "200-0"))
id, found, err := store.Load(context.Background(), "start_jobs")
require.NoError(t, err)
assert.True(t, found)
assert.Equal(t, "200-0", id)
}
func TestLoadAndSaveRejectInvalidArguments(t *testing.T) {
store, _ := newOffsetStore(t)
require.Error(t, store.Save(context.Background(), "", "100-0"))
require.Error(t, store.Save(context.Background(), "start_jobs", ""))
_, _, err := store.Load(context.Background(), "")
require.Error(t, err)
}
func TestKeyspaceStreamOffsetIsPrefixed(t *testing.T) {
key := redisstate.Keyspace{}.StreamOffset("start_jobs")
assert.NotEmpty(t, key)
assert.Contains(t, key, "rtmanager:stream_offsets:")
// The base64url-encoded label must not contain raw colons.
suffix := key[len("rtmanager:stream_offsets:"):]
assert.NotContains(t, suffix, ":")
}
@@ -0,0 +1,367 @@
package internalhttp
import (
"bytes"
"context"
"errors"
"io"
"net/http"
"net/http/httptest"
"path/filepath"
"runtime"
"strings"
"sync"
"testing"
"time"
"galaxy/rtmanager/internal/api/internalhttp/handlers"
domainruntime "galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/cleanupcontainer"
"galaxy/rtmanager/internal/service/patchruntime"
"galaxy/rtmanager/internal/service/restartruntime"
"galaxy/rtmanager/internal/service/startruntime"
"galaxy/rtmanager/internal/service/stopruntime"
"github.com/getkin/kin-openapi/openapi3"
"github.com/getkin/kin-openapi/openapi3filter"
"github.com/getkin/kin-openapi/routers"
"github.com/getkin/kin-openapi/routers/legacy"
"github.com/stretchr/testify/require"
)
// TestInternalRESTConformance loads the OpenAPI specification, drives
// every runtime operation against the live internal HTTP listener
// backed by stub services, and validates each response body against
// the spec via `openapi3filter.ValidateResponse`. The test catches
// drift between the wire shape produced by the handler layer and the
// frozen contract; failure-path response shapes are validated by the
// per-handler tests in `handlers/<op>_test.go`.
func TestInternalRESTConformance(t *testing.T) {
t.Parallel()
doc := loadConformanceSpec(t)
router, err := legacy.NewRouter(doc)
require.NoError(t, err)
deps := newConformanceDeps(t)
server, err := NewServer(newConformanceConfig(), Dependencies{
Logger: nil,
Telemetry: nil,
Readiness: nil,
RuntimeRecords: deps.records,
StartRuntime: deps.start,
StopRuntime: deps.stop,
RestartRuntime: deps.restart,
PatchRuntime: deps.patch,
CleanupContainer: deps.cleanup,
})
require.NoError(t, err)
cases := []conformanceCase{
{
name: "internalListRuntimes",
method: http.MethodGet,
path: "/api/v1/internal/runtimes",
},
{
name: "internalGetRuntime",
method: http.MethodGet,
path: "/api/v1/internal/runtimes/" + conformanceGameID,
},
{
name: "internalStartRuntime",
method: http.MethodPost,
path: "/api/v1/internal/runtimes/" + conformanceGameID + "/start",
contentType: "application/json",
body: `{"image_ref":"galaxy/game:v1.2.3"}`,
},
{
name: "internalStopRuntime",
method: http.MethodPost,
path: "/api/v1/internal/runtimes/" + conformanceGameID + "/stop",
contentType: "application/json",
body: `{"reason":"admin_request"}`,
},
{
name: "internalRestartRuntime",
method: http.MethodPost,
path: "/api/v1/internal/runtimes/" + conformanceGameID + "/restart",
},
{
name: "internalPatchRuntime",
method: http.MethodPost,
path: "/api/v1/internal/runtimes/" + conformanceGameID + "/patch",
contentType: "application/json",
body: `{"image_ref":"galaxy/game:v1.2.4"}`,
},
{
name: "internalCleanupRuntimeContainer",
method: http.MethodDelete,
path: "/api/v1/internal/runtimes/" + conformanceGameID + "/container",
},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
runConformanceCase(t, server.handler, router, tc)
})
}
}
// conformanceGameID is the {game_id} path value used for every per-game
// conformance request.
const conformanceGameID = "game-conformance"
// conformanceServerURL mirrors the canonical `servers[0].url` entry in
// `rtmanager/api/internal-openapi.yaml`. The legacy router matches
// requests against this prefix; updating the spec's server URL
// requires updating this constant.
const conformanceServerURL = "http://localhost:8096"
// conformanceCase describes one request the conformance test drives.
type conformanceCase struct {
name string
method string
path string
contentType string
body string
}
func runConformanceCase(t *testing.T, handler http.Handler, router routers.Router, tc conformanceCase) {
t.Helper()
// Drive the handler with the path-only form so the listener's
// http.ServeMux matches the registered routes (which use raw paths,
// without the OpenAPI server URL prefix).
var bodyReader io.Reader
if tc.body != "" {
bodyReader = strings.NewReader(tc.body)
}
request := httptest.NewRequest(tc.method, tc.path, bodyReader)
if tc.contentType != "" {
request.Header.Set("Content-Type", tc.contentType)
}
request.Header.Set("X-Galaxy-Caller", "admin")
recorder := httptest.NewRecorder()
handler.ServeHTTP(recorder, request)
require.Equalf(t, http.StatusOK, recorder.Code, "operation %s returned %d: %s", tc.name, recorder.Code, recorder.Body.String())
// kin-openapi's legacy router requires the request URL to match a
// `servers[].url` entry; rebuild the validation request with the
// canonical local server URL declared in the spec.
validationURL := conformanceServerURL + tc.path
validationRequest := httptest.NewRequest(tc.method, validationURL, bodyReaderFor(tc.body))
if tc.contentType != "" {
validationRequest.Header.Set("Content-Type", tc.contentType)
}
validationRequest.Header.Set("X-Galaxy-Caller", "admin")
route, pathParams, err := router.FindRoute(validationRequest)
require.NoError(t, err)
requestInput := &openapi3filter.RequestValidationInput{
Request: validationRequest,
PathParams: pathParams,
Route: route,
Options: &openapi3filter.Options{
IncludeResponseStatus: true,
},
}
require.NoError(t, openapi3filter.ValidateRequest(context.Background(), requestInput))
responseInput := &openapi3filter.ResponseValidationInput{
RequestValidationInput: requestInput,
Status: recorder.Code,
Header: recorder.Header(),
Options: &openapi3filter.Options{
IncludeResponseStatus: true,
},
}
responseInput.SetBodyBytes(recorder.Body.Bytes())
require.NoError(t, openapi3filter.ValidateResponse(context.Background(), responseInput))
}
func loadConformanceSpec(t *testing.T) *openapi3.T {
t.Helper()
_, thisFile, _, ok := runtime.Caller(0)
require.True(t, ok)
specPath := filepath.Join(filepath.Dir(thisFile), "..", "..", "..", "api", "internal-openapi.yaml")
loader := openapi3.NewLoader()
doc, err := loader.LoadFromFile(specPath)
require.NoError(t, err)
require.NoError(t, doc.Validate(context.Background()))
return doc
}
func bodyReaderFor(raw string) io.Reader {
if raw == "" {
return http.NoBody
}
return bytes.NewBufferString(raw)
}
// conformanceDeps groups the stub collaborators handed to the listener.
type conformanceDeps struct {
records *conformanceRecords
start *conformanceStart
stop *conformanceStop
restart *conformanceRestart
patch *conformancePatch
cleanup *conformanceCleanup
}
func newConformanceDeps(t *testing.T) *conformanceDeps {
t.Helper()
return &conformanceDeps{
records: newConformanceRecords(),
start: &conformanceStart{},
stop: &conformanceStop{},
restart: &conformanceRestart{},
patch: &conformancePatch{},
cleanup: &conformanceCleanup{},
}
}
func newConformanceConfig() Config {
return Config{
Addr: ":0",
ReadHeaderTimeout: time.Second,
ReadTimeout: time.Second,
WriteTimeout: time.Second,
IdleTimeout: time.Second,
}
}
// conformanceRecord builds a canonical running record used by every
// stub service.
func conformanceRecord() domainruntime.RuntimeRecord {
started := time.Date(2026, 4, 26, 13, 0, 0, 0, time.UTC)
return domainruntime.RuntimeRecord{
GameID: conformanceGameID,
Status: domainruntime.StatusRunning,
CurrentContainerID: "container-conformance",
CurrentImageRef: "galaxy/game:v1.2.3",
EngineEndpoint: "http://galaxy-game-" + conformanceGameID + ":8080",
StatePath: "/var/lib/galaxy/" + conformanceGameID,
DockerNetwork: "galaxy-engine",
StartedAt: &started,
LastOpAt: started,
CreatedAt: started,
}
}
// conformanceRecords is an in-memory record store seeded with one
// canonical record so the get / list endpoints have something to
// return.
type conformanceRecords struct {
mu sync.Mutex
stored map[string]domainruntime.RuntimeRecord
}
func newConformanceRecords() *conformanceRecords {
return &conformanceRecords{
stored: map[string]domainruntime.RuntimeRecord{
conformanceGameID: conformanceRecord(),
},
}
}
func (s *conformanceRecords) Get(_ context.Context, gameID string) (domainruntime.RuntimeRecord, error) {
s.mu.Lock()
defer s.mu.Unlock()
record, ok := s.stored[gameID]
if !ok {
return domainruntime.RuntimeRecord{}, domainruntime.ErrNotFound
}
return record, nil
}
func (s *conformanceRecords) Upsert(_ context.Context, _ domainruntime.RuntimeRecord) error {
return errors.New("not used in conformance test")
}
func (s *conformanceRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error {
return errors.New("not used in conformance test")
}
func (s *conformanceRecords) ListByStatus(_ context.Context, _ domainruntime.Status) ([]domainruntime.RuntimeRecord, error) {
return nil, errors.New("not used in conformance test")
}
func (s *conformanceRecords) List(_ context.Context) ([]domainruntime.RuntimeRecord, error) {
s.mu.Lock()
defer s.mu.Unlock()
out := make([]domainruntime.RuntimeRecord, 0, len(s.stored))
for _, record := range s.stored {
out = append(out, record)
}
return out, nil
}
// conformanceStart is the stub StartService used by the conformance
// test. Every Handle call returns the canonical record.
type conformanceStart struct{}
func (s *conformanceStart) Handle(_ context.Context, _ startruntime.Input) (startruntime.Result, error) {
return startruntime.Result{
Record: conformanceRecord(),
Outcome: "success",
}, nil
}
type conformanceStop struct{}
func (s *conformanceStop) Handle(_ context.Context, _ stopruntime.Input) (stopruntime.Result, error) {
rec := conformanceRecord()
rec.Status = domainruntime.StatusStopped
stopped := rec.LastOpAt.Add(time.Second)
rec.StoppedAt = &stopped
rec.LastOpAt = stopped
return stopruntime.Result{Record: rec, Outcome: "success"}, nil
}
type conformanceRestart struct{}
func (s *conformanceRestart) Handle(_ context.Context, _ restartruntime.Input) (restartruntime.Result, error) {
return restartruntime.Result{Record: conformanceRecord(), Outcome: "success"}, nil
}
type conformancePatch struct{}
func (s *conformancePatch) Handle(_ context.Context, in patchruntime.Input) (patchruntime.Result, error) {
rec := conformanceRecord()
if in.NewImageRef != "" {
rec.CurrentImageRef = in.NewImageRef
}
return patchruntime.Result{Record: rec, Outcome: "success"}, nil
}
type conformanceCleanup struct{}
func (s *conformanceCleanup) Handle(_ context.Context, _ cleanupcontainer.Input) (cleanupcontainer.Result, error) {
rec := conformanceRecord()
rec.Status = domainruntime.StatusRemoved
rec.CurrentContainerID = ""
removed := rec.LastOpAt.Add(time.Minute)
rec.RemovedAt = &removed
rec.LastOpAt = removed
return cleanupcontainer.Result{Record: rec, Outcome: "success"}, nil
}
// Compile-time guards: the stubs must satisfy the handler-level
// service ports plus ports.RuntimeRecordStore so the listener accepts
// them.
var (
_ handlers.StartService = (*conformanceStart)(nil)
_ handlers.StopService = (*conformanceStop)(nil)
_ handlers.RestartService = (*conformanceRestart)(nil)
_ handlers.PatchService = (*conformancePatch)(nil)
_ handlers.CleanupService = (*conformanceCleanup)(nil)
_ ports.RuntimeRecordStore = (*conformanceRecords)(nil)
)
@@ -0,0 +1,55 @@
package handlers
import (
"net/http"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/service/cleanupcontainer"
"galaxy/rtmanager/internal/service/startruntime"
)
// newCleanupHandler returns the handler for
// `DELETE /api/v1/internal/runtimes/{game_id}/container`. The OpenAPI
// spec declares no request body for this operation; any client-provided
// body is ignored.
func newCleanupHandler(deps Dependencies) http.HandlerFunc {
logger := loggerFor(deps.Logger, "internal_rest.cleanup")
return func(writer http.ResponseWriter, request *http.Request) {
if deps.CleanupContainer == nil {
writeError(writer, http.StatusInternalServerError,
startruntime.ErrorCodeInternal,
"cleanup container service is not wired",
)
return
}
gameID, ok := extractGameID(writer, request)
if !ok {
return
}
result, err := deps.CleanupContainer.Handle(request.Context(), cleanupcontainer.Input{
GameID: gameID,
OpSource: resolveOpSource(request),
SourceRef: requestSourceRef(request),
})
if err != nil {
logger.ErrorContext(request.Context(), "cleanup container service errored",
"game_id", gameID,
"err", err.Error(),
)
writeError(writer, http.StatusInternalServerError,
startruntime.ErrorCodeInternal,
"cleanup container service failed",
)
return
}
if result.Outcome == operation.OutcomeFailure {
writeFailure(writer, result.ErrorCode, result.ErrorMessage)
return
}
writeJSON(writer, http.StatusOK, encodeRuntimeRecord(result.Record))
}
}
@@ -0,0 +1,238 @@
package handlers
import (
"encoding/json"
"errors"
"io"
"log/slog"
"net/http"
"strings"
"time"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/service/startruntime"
)
// JSONContentType is the Content-Type used by every internal REST
// response. Exported so the listener-level tests can match it without
// re-declaring the constant.
const JSONContentType = "application/json; charset=utf-8"
// gameIDPathParam is the name of the {game_id} path variable shared by
// every per-game runtime endpoint.
const gameIDPathParam = "game_id"
// callerHeader is the HTTP header that distinguishes Game Master from
// Admin Service in the operation log. Documented in
// `rtmanager/api/internal-openapi.yaml` and
// `rtmanager/docs/services.md` §18.
const callerHeader = "X-Galaxy-Caller"
// errorCodeDockerUnavailable mirrors the OpenAPI error code value. The
// lifecycle services do not currently emit it (they use
// `service_unavailable` for Docker daemon failures); the handler layer
// maps it to 503 anyway so future producers do not require a handler
// change.
const errorCodeDockerUnavailable = "docker_unavailable"
// errorBody mirrors the `error` element of the OpenAPI ErrorResponse
// schema.
type errorBody struct {
Code string `json:"code"`
Message string `json:"message"`
}
// errorResponse mirrors the OpenAPI ErrorResponse envelope.
type errorResponse struct {
Error errorBody `json:"error"`
}
// runtimeRecordResponse mirrors the OpenAPI RuntimeRecord schema.
// Required fields use plain strings; nullable fields use pointers so an
// absent value encodes as the JSON literal `null` (matches the
// `nullable: true` declaration in the spec). Times are RFC3339 UTC,
// formatted with `time.RFC3339Nano`.
type runtimeRecordResponse struct {
GameID string `json:"game_id"`
Status string `json:"status"`
CurrentContainerID *string `json:"current_container_id"`
CurrentImageRef *string `json:"current_image_ref"`
EngineEndpoint *string `json:"engine_endpoint"`
StatePath string `json:"state_path"`
DockerNetwork string `json:"docker_network"`
StartedAt *string `json:"started_at"`
StoppedAt *string `json:"stopped_at"`
RemovedAt *string `json:"removed_at"`
LastOpAt string `json:"last_op_at"`
CreatedAt string `json:"created_at"`
}
// runtimesListResponse mirrors the OpenAPI RuntimesList schema. Items
// is always non-nil so the JSON form carries `[]` rather than `null`
// for an empty result.
type runtimesListResponse struct {
Items []runtimeRecordResponse `json:"items"`
}
// encodeRuntimeRecord turns a domain RuntimeRecord into its wire shape.
func encodeRuntimeRecord(record runtime.RuntimeRecord) runtimeRecordResponse {
resp := runtimeRecordResponse{
GameID: record.GameID,
Status: string(record.Status),
StatePath: record.StatePath,
DockerNetwork: record.DockerNetwork,
LastOpAt: record.LastOpAt.UTC().Format(time.RFC3339Nano),
CreatedAt: record.CreatedAt.UTC().Format(time.RFC3339Nano),
}
if record.CurrentContainerID != "" {
v := record.CurrentContainerID
resp.CurrentContainerID = &v
}
if record.CurrentImageRef != "" {
v := record.CurrentImageRef
resp.CurrentImageRef = &v
}
if record.EngineEndpoint != "" {
v := record.EngineEndpoint
resp.EngineEndpoint = &v
}
if record.StartedAt != nil {
v := record.StartedAt.UTC().Format(time.RFC3339Nano)
resp.StartedAt = &v
}
if record.StoppedAt != nil {
v := record.StoppedAt.UTC().Format(time.RFC3339Nano)
resp.StoppedAt = &v
}
if record.RemovedAt != nil {
v := record.RemovedAt.UTC().Format(time.RFC3339Nano)
resp.RemovedAt = &v
}
return resp
}
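// For illustration, encoding a running record whose StoppedAt and RemovedAt
// are unset produces a body shaped like the following (pretty-printed here;
// the handlers emit compact JSON):
//
//	{
//	  "game_id": "game-test",
//	  "status": "running",
//	  "current_container_id": "container-test",
//	  "current_image_ref": "galaxy/game:v1.2.3",
//	  "engine_endpoint": "http://galaxy-game-game-test:8080",
//	  "state_path": "/var/lib/galaxy/game-test",
//	  "docker_network": "galaxy-engine",
//	  "started_at": "2026-04-26T13:00:00Z",
//	  "stopped_at": null,
//	  "removed_at": null,
//	  "last_op_at": "2026-04-26T13:00:00Z",
//	  "created_at": "2026-04-26T13:00:00Z"
//	}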
// encodeRuntimesList builds the wire shape returned by the list handler.
// records may be nil (empty store); the result still carries an empty
// items slice so the JSON form is `{"items":[]}`.
func encodeRuntimesList(records []runtime.RuntimeRecord) runtimesListResponse {
resp := runtimesListResponse{
Items: make([]runtimeRecordResponse, 0, len(records)),
}
for _, record := range records {
resp.Items = append(resp.Items, encodeRuntimeRecord(record))
}
return resp
}
// writeJSON writes payload as a JSON response with the given status code.
func writeJSON(writer http.ResponseWriter, statusCode int, payload any) {
writer.Header().Set("Content-Type", JSONContentType)
writer.WriteHeader(statusCode)
_ = json.NewEncoder(writer).Encode(payload)
}
// writeError writes the canonical error envelope at statusCode.
func writeError(writer http.ResponseWriter, statusCode int, code, message string) {
writeJSON(writer, statusCode, errorResponse{
Error: errorBody{Code: code, Message: message},
})
}
// writeFailure writes the canonical error envelope using the HTTP
// status mapped from code. Used by every lifecycle handler when its
// service returns `Outcome=failure`.
func writeFailure(writer http.ResponseWriter, code, message string) {
writeError(writer, mapErrorCodeToStatus(code), code, message)
}
// mapErrorCodeToStatus maps a stable error code to the HTTP status
// declared by `rtmanager/api/internal-openapi.yaml`. Unknown codes
// degrade to 500 so a future error code that ships ahead of its
// handler-layer mapping still produces a structurally valid response.
func mapErrorCodeToStatus(code string) int {
switch code {
case startruntime.ErrorCodeInvalidRequest,
startruntime.ErrorCodeStartConfigInvalid,
startruntime.ErrorCodeImageRefNotSemver:
return http.StatusBadRequest
case startruntime.ErrorCodeNotFound:
return http.StatusNotFound
case startruntime.ErrorCodeConflict,
startruntime.ErrorCodeSemverPatchOnly:
return http.StatusConflict
case startruntime.ErrorCodeServiceUnavailable,
errorCodeDockerUnavailable:
return http.StatusServiceUnavailable
case startruntime.ErrorCodeImagePullFailed,
startruntime.ErrorCodeContainerStartFailed,
startruntime.ErrorCodeInternal:
return http.StatusInternalServerError
default:
return http.StatusInternalServerError
}
}
// decodeStrictJSON decodes one request body into target with strict
// JSON semantics: unknown fields and trailing content after the document
// are both rejected. Mirrors the helper used by lobby's internal HTTP layer.
func decodeStrictJSON(body io.Reader, target any) error {
decoder := json.NewDecoder(body)
decoder.DisallowUnknownFields()
if err := decoder.Decode(target); err != nil {
return err
}
if decoder.More() {
return errors.New("unexpected trailing content after JSON body")
}
return nil
}
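// For illustration, decodeStrictJSON rejects both of the following bodies,
// which the mutating handlers then surface as `400 invalid_request`:
//
//	{"image_ref":"x","unexpected":true}   // unknown field
//	{"image_ref":"x"} {"again":true}      // trailing content after the document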
// extractGameID pulls the {game_id} path variable from request. An empty
// or whitespace-only value writes a `400 invalid_request` and returns
// ok=false so callers can short-circuit.
func extractGameID(writer http.ResponseWriter, request *http.Request) (string, bool) {
raw := request.PathValue(gameIDPathParam)
if strings.TrimSpace(raw) == "" {
writeError(writer, http.StatusBadRequest,
startruntime.ErrorCodeInvalidRequest,
"game id is required",
)
return "", false
}
return raw, true
}
// resolveOpSource maps the X-Galaxy-Caller header to an
// `operation.OpSource`. Missing or unknown values default to
// `OpSourceAdminRest`, matching the contract documented in
// `rtmanager/api/internal-openapi.yaml`.
func resolveOpSource(request *http.Request) operation.OpSource {
switch strings.ToLower(strings.TrimSpace(request.Header.Get(callerHeader))) {
case "gm":
return operation.OpSourceGMRest
default:
return operation.OpSourceAdminRest
}
}
// requestSourceRef returns an opaque per-request reference recorded in
// `operation_log.source_ref`. v1 reads the `X-Request-ID` header when
// present so callers may correlate REST requests with audit rows; the
// listener does not currently install a request-id middleware so the
// header path is the only source.
func requestSourceRef(request *http.Request) string {
if v := strings.TrimSpace(request.Header.Get("X-Request-ID")); v != "" {
return v
}
return ""
}
// loggerFor returns a logger annotated with the operation tag. Each
// handler scopes its logs by op so operators filtering on
// `op=internal_rest.start` see exactly the lifecycle they care about.
func loggerFor(parent *slog.Logger, op string) *slog.Logger {
if parent == nil {
parent = slog.Default()
}
return parent.With("component", "internal_http.handlers", "op", op)
}
@@ -0,0 +1,197 @@
package handlers
import (
"context"
"encoding/json"
"errors"
"io"
"net/http"
"net/http/httptest"
"strings"
"sync"
"testing"
"time"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/ports"
"github.com/stretchr/testify/require"
)
// fixedClock is the wall-clock used to build canonical sample records
// across the handler tests. 2026-04-26 13:00 UTC (a Sunday) is far enough
// in the future to be obvious in test output.
var fixedClock = time.Date(2026, 4, 26, 13, 0, 0, 0, time.UTC)
// sampleRunningRecord returns a canonical running record used by every
// happy-path test in this package.
func sampleRunningRecord(t *testing.T) runtime.RuntimeRecord {
t.Helper()
started := fixedClock
return runtime.RuntimeRecord{
GameID: "game-test",
Status: runtime.StatusRunning,
CurrentContainerID: "container-test",
CurrentImageRef: "galaxy/game:v1.2.3",
EngineEndpoint: "http://galaxy-game-game-test:8080",
StatePath: "/var/lib/galaxy/game-test",
DockerNetwork: "galaxy-engine",
StartedAt: &started,
LastOpAt: fixedClock,
CreatedAt: fixedClock,
}
}
// sampleStoppedRecord returns a canonical stopped record useful for
// cleanup-handler and list-handler tests.
func sampleStoppedRecord(t *testing.T) runtime.RuntimeRecord {
t.Helper()
started := fixedClock
stopped := fixedClock.Add(time.Minute)
return runtime.RuntimeRecord{
GameID: "game-stopped",
Status: runtime.StatusStopped,
CurrentContainerID: "container-stopped",
CurrentImageRef: "galaxy/game:v1.2.3",
EngineEndpoint: "http://galaxy-game-game-stopped:8080",
StatePath: "/var/lib/galaxy/game-stopped",
DockerNetwork: "galaxy-engine",
StartedAt: &started,
StoppedAt: &stopped,
LastOpAt: stopped,
CreatedAt: fixedClock,
}
}
// drive routes one request through a full mux configured by Register.
// It returns the captured ResponseRecorder so tests can assert on
// status, headers, and body.
func drive(t *testing.T, deps Dependencies, method, path string, headers http.Header, body io.Reader) *httptest.ResponseRecorder {
t.Helper()
mux := http.NewServeMux()
Register(mux, deps)
request := httptest.NewRequest(method, path, body)
for key, values := range headers {
for _, value := range values {
request.Header.Add(key, value)
}
}
recorder := httptest.NewRecorder()
mux.ServeHTTP(recorder, request)
return recorder
}
// decodeRecordResponse asserts that the response carried a 200 with
// the canonical content type and decodes the record body.
func decodeRecordResponse(t *testing.T, rec *httptest.ResponseRecorder) runtimeRecordResponse {
t.Helper()
require.Equalf(t, http.StatusOK, rec.Code, "expected 200, got body: %s", rec.Body.String())
require.Equal(t, JSONContentType, rec.Header().Get("Content-Type"))
var resp runtimeRecordResponse
require.NoError(t, json.NewDecoder(rec.Body).Decode(&resp))
return resp
}
// decodeErrorBody asserts the canonical error envelope and decodes it.
func decodeErrorBody(t *testing.T, rec *httptest.ResponseRecorder, wantStatus int) errorBody {
t.Helper()
require.Equalf(t, wantStatus, rec.Code, "expected %d, got body: %s", wantStatus, rec.Body.String())
require.Equal(t, JSONContentType, rec.Header().Get("Content-Type"))
var resp errorResponse
require.NoError(t, json.NewDecoder(rec.Body).Decode(&resp))
return resp.Error
}
// fakeRuntimeRecords is an in-memory ports.RuntimeRecordStore used by
// list / get tests. It is intentionally minimal — services use their
// own fakes in `internal/service/<op>/service_test.go` and do not
// share this helper.
type fakeRuntimeRecords struct {
mu sync.Mutex
stored map[string]runtime.RuntimeRecord
listErr error
getErr error
}
func newFakeRuntimeRecords() *fakeRuntimeRecords {
return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}}
}
func (s *fakeRuntimeRecords) put(record runtime.RuntimeRecord) {
s.mu.Lock()
defer s.mu.Unlock()
s.stored[record.GameID] = record
}
func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
s.mu.Lock()
defer s.mu.Unlock()
if s.getErr != nil {
return runtime.RuntimeRecord{}, s.getErr
}
record, ok := s.stored[gameID]
if !ok {
return runtime.RuntimeRecord{}, runtime.ErrNotFound
}
return record, nil
}
func (s *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error {
return errors.New("not used in handler tests")
}
func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error {
return errors.New("not used in handler tests")
}
func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) {
return nil, errors.New("not used in handler tests")
}
func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
s.mu.Lock()
defer s.mu.Unlock()
if s.listErr != nil {
return nil, s.listErr
}
if len(s.stored) == 0 {
return nil, nil
}
records := make([]runtime.RuntimeRecord, 0, len(s.stored))
for _, record := range s.stored {
records = append(records, record)
}
return records, nil
}
// jsonHeaders returns the default headers used by tests that send a
// JSON body.
func jsonHeaders() http.Header {
h := http.Header{}
h.Set("Content-Type", "application/json")
return h
}
// withCaller adds the X-Galaxy-Caller header to h and returns h. The
// helper exists to keep test cases readable when the header is the
// only difference between two table rows.
func withCaller(h http.Header, value string) http.Header {
if h == nil {
h = http.Header{}
}
h.Set(callerHeader, value)
return h
}
// strReader builds an io.Reader from raw JSON.
func strReader(raw string) io.Reader {
return strings.NewReader(raw)
}
// Compile-time assertion that the in-memory fake satisfies the port.
var _ ports.RuntimeRecordStore = (*fakeRuntimeRecords)(nil)
@@ -0,0 +1,55 @@
package handlers
import (
"errors"
"net/http"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/service/startruntime"
)
// newGetHandler returns the handler for
// `GET /api/v1/internal/runtimes/{game_id}`. The handler reads
// directly from the runtime record store and translates
// `runtime.ErrNotFound` to `404 not_found`. Like list, it does not
// run through the service layer and does not produce an operation_log
// row.
func newGetHandler(deps Dependencies) http.HandlerFunc {
logger := loggerFor(deps.Logger, "internal_rest.get")
return func(writer http.ResponseWriter, request *http.Request) {
if deps.RuntimeRecords == nil {
writeError(writer, http.StatusInternalServerError,
startruntime.ErrorCodeInternal,
"runtime records store is not wired",
)
return
}
gameID, ok := extractGameID(writer, request)
if !ok {
return
}
record, err := deps.RuntimeRecords.Get(request.Context(), gameID)
if errors.Is(err, runtime.ErrNotFound) {
writeError(writer, http.StatusNotFound,
startruntime.ErrorCodeNotFound,
"runtime record not found",
)
return
}
if err != nil {
logger.ErrorContext(request.Context(), "get runtime record",
"game_id", gameID,
"err", err.Error(),
)
writeError(writer, http.StatusInternalServerError,
startruntime.ErrorCodeInternal,
"failed to read runtime record",
)
return
}
writeJSON(writer, http.StatusOK, encodeRuntimeRecord(record))
}
}
@@ -0,0 +1,69 @@
package handlers
import (
"log/slog"
"net/http"
"galaxy/rtmanager/internal/ports"
)
// Route paths registered by Register. The values match the paths frozen
// by `rtmanager/api/internal-openapi.yaml` and verified by
// `rtmanager/contract_openapi_test.go`.
const (
listRuntimesPath = "/api/v1/internal/runtimes"
getRuntimePath = "/api/v1/internal/runtimes/{game_id}"
startRuntimePath = "/api/v1/internal/runtimes/{game_id}/start"
stopRuntimePath = "/api/v1/internal/runtimes/{game_id}/stop"
restartRuntimePath = "/api/v1/internal/runtimes/{game_id}/restart"
patchRuntimePath = "/api/v1/internal/runtimes/{game_id}/patch"
cleanupRuntimePath = "/api/v1/internal/runtimes/{game_id}/container"
)
// Dependencies bundles the collaborators required to serve the GM/Admin
// REST surface. Any service may be nil for tests that exercise a
// subset of the surface; in that case the unwired routes return
// `500 internal_error` (mirrors lobby's "service is not wired"
// pattern).
type Dependencies struct {
// Logger receives structured logs scoped per handler. nil falls back
// to slog.Default.
Logger *slog.Logger
// RuntimeRecords backs the read-only list and get handlers. They do
// not produce operation_log rows because they do not mutate state.
RuntimeRecords ports.RuntimeRecordStore
// StartRuntime executes the start lifecycle operation. Production
// wiring passes `*startruntime.Service` (the concrete service
// satisfies StartService).
StartRuntime StartService
// StopRuntime executes the stop lifecycle operation.
StopRuntime StopService
// RestartRuntime executes the restart lifecycle operation.
RestartRuntime RestartService
// PatchRuntime executes the patch lifecycle operation.
PatchRuntime PatchService
// CleanupContainer executes the cleanup_container lifecycle
// operation.
CleanupContainer CleanupService
}
// Register attaches every internal REST route to mux using deps. Each
// route reads its dependency lazily so a partially-wired Dependencies
// (e.g., a probe-only listener test) does not crash; missing
// dependencies surface as `500 internal_error`. Routes use Go 1.22
// method-aware mux patterns.
func Register(mux *http.ServeMux, deps Dependencies) {
mux.HandleFunc("GET "+listRuntimesPath, newListHandler(deps))
mux.HandleFunc("GET "+getRuntimePath, newGetHandler(deps))
mux.HandleFunc("POST "+startRuntimePath, newStartHandler(deps))
mux.HandleFunc("POST "+stopRuntimePath, newStopHandler(deps))
mux.HandleFunc("POST "+restartRuntimePath, newRestartHandler(deps))
mux.HandleFunc("POST "+patchRuntimePath, newPatchHandler(deps))
mux.HandleFunc("DELETE "+cleanupRuntimePath, newCleanupHandler(deps))
}
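// Wiring sketch (illustrative; the service and store identifiers below are
// placeholders): the listener builds a mux and registers every route with
// the production dependencies, mirroring the drive helper in the handler
// tests.
//
//	mux := http.NewServeMux()
//	handlers.Register(mux, handlers.Dependencies{
//		Logger:           logger,
//		RuntimeRecords:   recordStore,
//		StartRuntime:     startService,
//		StopRuntime:      stopService,
//		RestartRuntime:   restartService,
//		PatchRuntime:     patchService,
//		CleanupContainer: cleanupService,
//	})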
@@ -0,0 +1,610 @@
package handlers
import (
"context"
"net/http"
"testing"
"galaxy/rtmanager/internal/api/internalhttp/handlers/mocks"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/service/cleanupcontainer"
"galaxy/rtmanager/internal/service/patchruntime"
"galaxy/rtmanager/internal/service/restartruntime"
"galaxy/rtmanager/internal/service/startruntime"
"galaxy/rtmanager/internal/service/stopruntime"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/mock/gomock"
)
// Tests for the mutating handlers (start, stop, restart, patch,
// cleanup). Each handler delegates to one lifecycle service through a
// narrow `mockgen`-backed interface; the handler layer is responsible
// for input parsing, the `X-Galaxy-Caller` → `op_source` mapping, and
// the canonical `ErrorCode` → HTTP status table documented in
// `rtmanager/docs/services.md` §18.
// --- start ---
func TestStartHandlerReturnsRecordOnSuccess(t *testing.T) {
t.Parallel()
ctrl := gomock.NewController(t)
mock := mocks.NewMockStartService(ctrl)
record := sampleRunningRecord(t)
mock.EXPECT().
Handle(gomock.Any(), gomock.AssignableToTypeOf(startruntime.Input{})).
DoAndReturn(func(_ context.Context, in startruntime.Input) (startruntime.Result, error) {
assert.Equal(t, "game-test", in.GameID)
assert.Equal(t, "galaxy/game:v1.2.3", in.ImageRef)
assert.Equal(t, operation.OpSourceAdminRest, in.OpSource)
return startruntime.Result{Record: record, Outcome: operation.OutcomeSuccess}, nil
})
deps := Dependencies{StartRuntime: mock}
rec := drive(t, deps, http.MethodPost, "/api/v1/internal/runtimes/game-test/start",
jsonHeaders(),
strReader(`{"image_ref":"galaxy/game:v1.2.3"}`),
)
resp := decodeRecordResponse(t, rec)
assert.Equal(t, "game-test", resp.GameID)
assert.Equal(t, "running", resp.Status)
}
func TestStartHandlerReturnsRecordOnReplayNoOp(t *testing.T) {
t.Parallel()
ctrl := gomock.NewController(t)
mock := mocks.NewMockStartService(ctrl)
record := sampleRunningRecord(t)
mock.EXPECT().
Handle(gomock.Any(), gomock.Any()).
Return(startruntime.Result{
Record: record,
Outcome: operation.OutcomeSuccess,
ErrorCode: startruntime.ErrorCodeReplayNoOp,
}, nil)
rec := drive(t, Dependencies{StartRuntime: mock}, http.MethodPost,
"/api/v1/internal/runtimes/game-test/start",
jsonHeaders(),
strReader(`{"image_ref":"galaxy/game:v1.2.3"}`),
)
resp := decodeRecordResponse(t, rec)
assert.Equal(t, "game-test", resp.GameID)
}
func TestStartHandlerMapsServiceFailures(t *testing.T) {
t.Parallel()
cases := []struct {
name string
errorCode string
wantStatus int
}{
{"start_config_invalid", startruntime.ErrorCodeStartConfigInvalid, http.StatusBadRequest},
{"image_pull_failed", startruntime.ErrorCodeImagePullFailed, http.StatusInternalServerError},
{"container_start_failed", startruntime.ErrorCodeContainerStartFailed, http.StatusInternalServerError},
{"conflict", startruntime.ErrorCodeConflict, http.StatusConflict},
{"service_unavailable", startruntime.ErrorCodeServiceUnavailable, http.StatusServiceUnavailable},
{"internal_error", startruntime.ErrorCodeInternal, http.StatusInternalServerError},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
ctrl := gomock.NewController(t)
mock := mocks.NewMockStartService(ctrl)
mock.EXPECT().
Handle(gomock.Any(), gomock.Any()).
Return(startruntime.Result{
Outcome: operation.OutcomeFailure,
ErrorCode: tc.errorCode,
ErrorMessage: "synthetic " + tc.name,
}, nil)
rec := drive(t, Dependencies{StartRuntime: mock}, http.MethodPost,
"/api/v1/internal/runtimes/game-test/start",
jsonHeaders(),
strReader(`{"image_ref":"galaxy/game:v1.2.3"}`),
)
body := decodeErrorBody(t, rec, tc.wantStatus)
assert.Equal(t, tc.errorCode, body.Code)
assert.Equal(t, "synthetic "+tc.name, body.Message)
})
}
}
func TestStartHandlerRejectsUnknownJSONFields(t *testing.T) {
t.Parallel()
ctrl := gomock.NewController(t)
mock := mocks.NewMockStartService(ctrl)
rec := drive(t, Dependencies{StartRuntime: mock}, http.MethodPost,
"/api/v1/internal/runtimes/game-test/start",
jsonHeaders(),
strReader(`{"image_ref":"x","extra":"y"}`),
)
body := decodeErrorBody(t, rec, http.StatusBadRequest)
assert.Equal(t, "invalid_request", body.Code)
}
func TestStartHandlerRejectsMalformedJSON(t *testing.T) {
t.Parallel()
ctrl := gomock.NewController(t)
mock := mocks.NewMockStartService(ctrl)
rec := drive(t, Dependencies{StartRuntime: mock}, http.MethodPost,
"/api/v1/internal/runtimes/game-test/start",
jsonHeaders(),
strReader(`{"image_ref":`),
)
body := decodeErrorBody(t, rec, http.StatusBadRequest)
assert.Equal(t, "invalid_request", body.Code)
}
func TestStartHandlerHonoursXGalaxyCallerHeader(t *testing.T) {
t.Parallel()
cases := []struct {
header string
want operation.OpSource
hdrLabel string
}{
{"gm", operation.OpSourceGMRest, "gm"},
{"GM", operation.OpSourceGMRest, "uppercase gm"},
{"admin", operation.OpSourceAdminRest, "admin"},
{"unknown", operation.OpSourceAdminRest, "unknown value"},
{"", operation.OpSourceAdminRest, "missing header"},
}
for _, tc := range cases {
t.Run(tc.hdrLabel, func(t *testing.T) {
t.Parallel()
ctrl := gomock.NewController(t)
mock := mocks.NewMockStartService(ctrl)
record := sampleRunningRecord(t)
mock.EXPECT().
Handle(gomock.Any(), gomock.AssignableToTypeOf(startruntime.Input{})).
DoAndReturn(func(_ context.Context, in startruntime.Input) (startruntime.Result, error) {
assert.Equal(t, tc.want, in.OpSource)
return startruntime.Result{Record: record, Outcome: operation.OutcomeSuccess}, nil
})
headers := jsonHeaders()
if tc.header != "" {
headers = withCaller(headers, tc.header)
}
rec := drive(t, Dependencies{StartRuntime: mock}, http.MethodPost,
"/api/v1/internal/runtimes/game-test/start",
headers,
strReader(`{"image_ref":"galaxy/game:v1.2.3"}`),
)
require.Equal(t, http.StatusOK, rec.Code)
})
}
}
func TestStartHandlerForwardsXRequestIDAsSourceRef(t *testing.T) {
t.Parallel()
ctrl := gomock.NewController(t)
mock := mocks.NewMockStartService(ctrl)
mock.EXPECT().
Handle(gomock.Any(), gomock.AssignableToTypeOf(startruntime.Input{})).
DoAndReturn(func(_ context.Context, in startruntime.Input) (startruntime.Result, error) {
assert.Equal(t, "req-42", in.SourceRef)
return startruntime.Result{Record: sampleRunningRecord(t), Outcome: operation.OutcomeSuccess}, nil
})
headers := jsonHeaders()
headers.Set("X-Request-ID", "req-42")
rec := drive(t, Dependencies{StartRuntime: mock}, http.MethodPost,
"/api/v1/internal/runtimes/game-test/start",
headers,
strReader(`{"image_ref":"galaxy/game:v1.2.3"}`),
)
require.Equal(t, http.StatusOK, rec.Code)
}
func TestStartHandlerReturnsInternalErrorWhenServiceErrors(t *testing.T) {
t.Parallel()
ctrl := gomock.NewController(t)
mock := mocks.NewMockStartService(ctrl)
mock.EXPECT().
Handle(gomock.Any(), gomock.Any()).
Return(startruntime.Result{}, assert.AnError)
rec := drive(t, Dependencies{StartRuntime: mock}, http.MethodPost,
"/api/v1/internal/runtimes/game-test/start",
jsonHeaders(),
strReader(`{"image_ref":"galaxy/game:v1.2.3"}`),
)
body := decodeErrorBody(t, rec, http.StatusInternalServerError)
assert.Equal(t, "internal_error", body.Code)
}
func TestStartHandlerReturnsInternalErrorWhenServiceNotWired(t *testing.T) {
t.Parallel()
rec := drive(t, Dependencies{}, http.MethodPost,
"/api/v1/internal/runtimes/game-test/start",
jsonHeaders(),
strReader(`{"image_ref":"galaxy/game:v1.2.3"}`),
)
body := decodeErrorBody(t, rec, http.StatusInternalServerError)
assert.Equal(t, "internal_error", body.Code)
}
// --- stop ---
func TestStopHandlerReturnsRecordOnSuccess(t *testing.T) {
t.Parallel()
ctrl := gomock.NewController(t)
mock := mocks.NewMockStopService(ctrl)
record := sampleStoppedRecord(t)
mock.EXPECT().
Handle(gomock.Any(), gomock.AssignableToTypeOf(stopruntime.Input{})).
DoAndReturn(func(_ context.Context, in stopruntime.Input) (stopruntime.Result, error) {
assert.Equal(t, "game-test", in.GameID)
assert.Equal(t, stopruntime.StopReasonAdminRequest, in.Reason)
assert.Equal(t, operation.OpSourceAdminRest, in.OpSource)
return stopruntime.Result{Record: record, Outcome: operation.OutcomeSuccess}, nil
})
rec := drive(t, Dependencies{StopRuntime: mock}, http.MethodPost,
"/api/v1/internal/runtimes/game-test/stop",
jsonHeaders(),
strReader(`{"reason":"admin_request"}`),
)
resp := decodeRecordResponse(t, rec)
assert.Equal(t, "stopped", resp.Status)
}
func TestStopHandlerMapsServiceFailures(t *testing.T) {
t.Parallel()
cases := []struct {
name string
errorCode string
wantStatus int
}{
{"not_found", startruntime.ErrorCodeNotFound, http.StatusNotFound},
{"conflict", startruntime.ErrorCodeConflict, http.StatusConflict},
{"invalid_request", startruntime.ErrorCodeInvalidRequest, http.StatusBadRequest},
{"service_unavailable", startruntime.ErrorCodeServiceUnavailable, http.StatusServiceUnavailable},
{"internal_error", startruntime.ErrorCodeInternal, http.StatusInternalServerError},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
ctrl := gomock.NewController(t)
mock := mocks.NewMockStopService(ctrl)
mock.EXPECT().Handle(gomock.Any(), gomock.Any()).Return(stopruntime.Result{
Outcome: operation.OutcomeFailure, ErrorCode: tc.errorCode, ErrorMessage: tc.name,
}, nil)
rec := drive(t, Dependencies{StopRuntime: mock}, http.MethodPost,
"/api/v1/internal/runtimes/game-test/stop",
jsonHeaders(),
strReader(`{"reason":"admin_request"}`),
)
body := decodeErrorBody(t, rec, tc.wantStatus)
assert.Equal(t, tc.errorCode, body.Code)
})
}
}
func TestStopHandlerRejectsUnknownJSONFields(t *testing.T) {
t.Parallel()
ctrl := gomock.NewController(t)
mock := mocks.NewMockStopService(ctrl)
rec := drive(t, Dependencies{StopRuntime: mock}, http.MethodPost,
"/api/v1/internal/runtimes/game-test/stop",
jsonHeaders(),
strReader(`{"reason":"admin_request","extra":1}`),
)
body := decodeErrorBody(t, rec, http.StatusBadRequest)
assert.Equal(t, "invalid_request", body.Code)
}
func TestStopHandlerHonoursXGalaxyCallerHeader(t *testing.T) {
t.Parallel()
ctrl := gomock.NewController(t)
mock := mocks.NewMockStopService(ctrl)
mock.EXPECT().
Handle(gomock.Any(), gomock.AssignableToTypeOf(stopruntime.Input{})).
DoAndReturn(func(_ context.Context, in stopruntime.Input) (stopruntime.Result, error) {
assert.Equal(t, operation.OpSourceGMRest, in.OpSource)
return stopruntime.Result{Record: sampleStoppedRecord(t), Outcome: operation.OutcomeSuccess}, nil
})
rec := drive(t, Dependencies{StopRuntime: mock}, http.MethodPost,
"/api/v1/internal/runtimes/game-test/stop",
withCaller(jsonHeaders(), "gm"),
strReader(`{"reason":"cancelled"}`),
)
require.Equal(t, http.StatusOK, rec.Code)
}
func TestStopHandlerReturnsInternalErrorWhenServiceNotWired(t *testing.T) {
t.Parallel()
rec := drive(t, Dependencies{}, http.MethodPost,
"/api/v1/internal/runtimes/game-test/stop",
jsonHeaders(),
strReader(`{"reason":"admin_request"}`),
)
body := decodeErrorBody(t, rec, http.StatusInternalServerError)
assert.Equal(t, "internal_error", body.Code)
}
// --- restart ---
func TestRestartHandlerReturnsRecordOnSuccess(t *testing.T) {
t.Parallel()
ctrl := gomock.NewController(t)
mock := mocks.NewMockRestartService(ctrl)
record := sampleRunningRecord(t)
mock.EXPECT().
Handle(gomock.Any(), gomock.AssignableToTypeOf(restartruntime.Input{})).
DoAndReturn(func(_ context.Context, in restartruntime.Input) (restartruntime.Result, error) {
assert.Equal(t, "game-test", in.GameID)
assert.Equal(t, operation.OpSourceAdminRest, in.OpSource)
return restartruntime.Result{Record: record, Outcome: operation.OutcomeSuccess}, nil
})
rec := drive(t, Dependencies{RestartRuntime: mock}, http.MethodPost,
"/api/v1/internal/runtimes/game-test/restart", nil, nil,
)
resp := decodeRecordResponse(t, rec)
assert.Equal(t, "running", resp.Status)
}
func TestRestartHandlerMapsServiceFailures(t *testing.T) {
t.Parallel()
cases := []struct {
name string
errorCode string
wantStatus int
}{
{"not_found", startruntime.ErrorCodeNotFound, http.StatusNotFound},
{"conflict", startruntime.ErrorCodeConflict, http.StatusConflict},
{"service_unavailable", startruntime.ErrorCodeServiceUnavailable, http.StatusServiceUnavailable},
{"internal_error", startruntime.ErrorCodeInternal, http.StatusInternalServerError},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
ctrl := gomock.NewController(t)
mock := mocks.NewMockRestartService(ctrl)
mock.EXPECT().Handle(gomock.Any(), gomock.Any()).Return(restartruntime.Result{
Outcome: operation.OutcomeFailure, ErrorCode: tc.errorCode, ErrorMessage: tc.name,
}, nil)
rec := drive(t, Dependencies{RestartRuntime: mock}, http.MethodPost,
"/api/v1/internal/runtimes/game-test/restart", nil, nil,
)
body := decodeErrorBody(t, rec, tc.wantStatus)
assert.Equal(t, tc.errorCode, body.Code)
})
}
}
func TestRestartHandlerHonoursXGalaxyCallerHeader(t *testing.T) {
t.Parallel()
ctrl := gomock.NewController(t)
mock := mocks.NewMockRestartService(ctrl)
mock.EXPECT().
Handle(gomock.Any(), gomock.AssignableToTypeOf(restartruntime.Input{})).
DoAndReturn(func(_ context.Context, in restartruntime.Input) (restartruntime.Result, error) {
assert.Equal(t, operation.OpSourceGMRest, in.OpSource)
return restartruntime.Result{Record: sampleRunningRecord(t), Outcome: operation.OutcomeSuccess}, nil
})
rec := drive(t, Dependencies{RestartRuntime: mock}, http.MethodPost,
"/api/v1/internal/runtimes/game-test/restart",
withCaller(http.Header{}, "gm"), nil,
)
require.Equal(t, http.StatusOK, rec.Code)
}
func TestRestartHandlerReturnsInternalErrorWhenServiceNotWired(t *testing.T) {
t.Parallel()
rec := drive(t, Dependencies{}, http.MethodPost,
"/api/v1/internal/runtimes/game-test/restart", nil, nil,
)
body := decodeErrorBody(t, rec, http.StatusInternalServerError)
assert.Equal(t, "internal_error", body.Code)
}
// --- patch ---
func TestPatchHandlerReturnsRecordOnSuccess(t *testing.T) {
t.Parallel()
ctrl := gomock.NewController(t)
mock := mocks.NewMockPatchService(ctrl)
record := sampleRunningRecord(t)
mock.EXPECT().
Handle(gomock.Any(), gomock.AssignableToTypeOf(patchruntime.Input{})).
DoAndReturn(func(_ context.Context, in patchruntime.Input) (patchruntime.Result, error) {
assert.Equal(t, "game-test", in.GameID)
assert.Equal(t, "galaxy/game:v1.2.4", in.NewImageRef)
return patchruntime.Result{Record: record, Outcome: operation.OutcomeSuccess}, nil
})
rec := drive(t, Dependencies{PatchRuntime: mock}, http.MethodPost,
"/api/v1/internal/runtimes/game-test/patch",
jsonHeaders(),
strReader(`{"image_ref":"galaxy/game:v1.2.4"}`),
)
resp := decodeRecordResponse(t, rec)
assert.Equal(t, "running", resp.Status)
}
func TestPatchHandlerMapsServiceFailures(t *testing.T) {
t.Parallel()
cases := []struct {
name string
errorCode string
wantStatus int
}{
{"image_ref_not_semver", startruntime.ErrorCodeImageRefNotSemver, http.StatusBadRequest},
{"semver_patch_only", startruntime.ErrorCodeSemverPatchOnly, http.StatusConflict},
{"not_found", startruntime.ErrorCodeNotFound, http.StatusNotFound},
{"conflict", startruntime.ErrorCodeConflict, http.StatusConflict},
{"service_unavailable", startruntime.ErrorCodeServiceUnavailable, http.StatusServiceUnavailable},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
ctrl := gomock.NewController(t)
mock := mocks.NewMockPatchService(ctrl)
mock.EXPECT().Handle(gomock.Any(), gomock.Any()).Return(patchruntime.Result{
Outcome: operation.OutcomeFailure, ErrorCode: tc.errorCode, ErrorMessage: tc.name,
}, nil)
rec := drive(t, Dependencies{PatchRuntime: mock}, http.MethodPost,
"/api/v1/internal/runtimes/game-test/patch",
jsonHeaders(),
strReader(`{"image_ref":"galaxy/game:v1.2.4"}`),
)
body := decodeErrorBody(t, rec, tc.wantStatus)
assert.Equal(t, tc.errorCode, body.Code)
})
}
}
func TestPatchHandlerRejectsUnknownJSONFields(t *testing.T) {
t.Parallel()
ctrl := gomock.NewController(t)
mock := mocks.NewMockPatchService(ctrl)
rec := drive(t, Dependencies{PatchRuntime: mock}, http.MethodPost,
"/api/v1/internal/runtimes/game-test/patch",
jsonHeaders(),
strReader(`{"image_ref":"x","unexpected":true}`),
)
body := decodeErrorBody(t, rec, http.StatusBadRequest)
assert.Equal(t, "invalid_request", body.Code)
}
func TestPatchHandlerHonoursXGalaxyCallerHeader(t *testing.T) {
t.Parallel()
ctrl := gomock.NewController(t)
mock := mocks.NewMockPatchService(ctrl)
mock.EXPECT().
Handle(gomock.Any(), gomock.AssignableToTypeOf(patchruntime.Input{})).
DoAndReturn(func(_ context.Context, in patchruntime.Input) (patchruntime.Result, error) {
assert.Equal(t, operation.OpSourceGMRest, in.OpSource)
return patchruntime.Result{Record: sampleRunningRecord(t), Outcome: operation.OutcomeSuccess}, nil
})
rec := drive(t, Dependencies{PatchRuntime: mock}, http.MethodPost,
"/api/v1/internal/runtimes/game-test/patch",
withCaller(jsonHeaders(), "gm"),
strReader(`{"image_ref":"galaxy/game:v1.2.4"}`),
)
require.Equal(t, http.StatusOK, rec.Code)
}
func TestPatchHandlerReturnsInternalErrorWhenServiceNotWired(t *testing.T) {
t.Parallel()
rec := drive(t, Dependencies{}, http.MethodPost,
"/api/v1/internal/runtimes/game-test/patch",
jsonHeaders(),
strReader(`{"image_ref":"galaxy/game:v1.2.4"}`),
)
body := decodeErrorBody(t, rec, http.StatusInternalServerError)
assert.Equal(t, "internal_error", body.Code)
}
// --- cleanup ---
func TestCleanupHandlerReturnsRecordOnSuccess(t *testing.T) {
t.Parallel()
ctrl := gomock.NewController(t)
mock := mocks.NewMockCleanupService(ctrl)
record := sampleStoppedRecord(t)
record.Status = runtime.StatusRemoved
record.CurrentContainerID = ""
removed := record.LastOpAt
record.RemovedAt = &removed
mock.EXPECT().
Handle(gomock.Any(), gomock.AssignableToTypeOf(cleanupcontainer.Input{})).
DoAndReturn(func(_ context.Context, in cleanupcontainer.Input) (cleanupcontainer.Result, error) {
assert.Equal(t, "game-stopped", in.GameID)
assert.Equal(t, operation.OpSourceAdminRest, in.OpSource)
return cleanupcontainer.Result{Record: record, Outcome: operation.OutcomeSuccess}, nil
})
rec := drive(t, Dependencies{CleanupContainer: mock}, http.MethodDelete,
"/api/v1/internal/runtimes/game-stopped/container", nil, nil,
)
resp := decodeRecordResponse(t, rec)
assert.Equal(t, "removed", resp.Status)
assert.Nil(t, resp.CurrentContainerID, "container id must be null after cleanup")
}
func TestCleanupHandlerMapsServiceFailures(t *testing.T) {
t.Parallel()
cases := []struct {
name string
errorCode string
wantStatus int
}{
{"not_found", startruntime.ErrorCodeNotFound, http.StatusNotFound},
{"conflict", startruntime.ErrorCodeConflict, http.StatusConflict},
{"service_unavailable", startruntime.ErrorCodeServiceUnavailable, http.StatusServiceUnavailable},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
ctrl := gomock.NewController(t)
mock := mocks.NewMockCleanupService(ctrl)
mock.EXPECT().Handle(gomock.Any(), gomock.Any()).Return(cleanupcontainer.Result{
Outcome: operation.OutcomeFailure, ErrorCode: tc.errorCode, ErrorMessage: tc.name,
}, nil)
rec := drive(t, Dependencies{CleanupContainer: mock}, http.MethodDelete,
"/api/v1/internal/runtimes/game-test/container", nil, nil,
)
body := decodeErrorBody(t, rec, tc.wantStatus)
assert.Equal(t, tc.errorCode, body.Code)
})
}
}
func TestCleanupHandlerReturnsInternalErrorWhenServiceNotWired(t *testing.T) {
t.Parallel()
rec := drive(t, Dependencies{}, http.MethodDelete,
"/api/v1/internal/runtimes/game-test/container", nil, nil,
)
body := decodeErrorBody(t, rec, http.StatusInternalServerError)
assert.Equal(t, "internal_error", body.Code)
}
@@ -0,0 +1,115 @@
package handlers
import (
"encoding/json"
"errors"
"net/http"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// Tests for the read-only handlers (`internalListRuntimes`,
// `internalGetRuntime`). These bypass the service layer and read
// directly from `ports.RuntimeRecordStore` — see
// `rtmanager/docs/services.md` §18.
func TestListHandlerReturnsEmptyItemsForEmptyStore(t *testing.T) {
t.Parallel()
deps := Dependencies{RuntimeRecords: newFakeRuntimeRecords()}
rec := drive(t, deps, http.MethodGet, "/api/v1/internal/runtimes", nil, nil)
require.Equal(t, http.StatusOK, rec.Code)
require.Equal(t, JSONContentType, rec.Header().Get("Content-Type"))
var resp runtimesListResponse
require.NoError(t, json.NewDecoder(rec.Body).Decode(&resp))
require.NotNil(t, resp.Items, "items must never be nil")
assert.Empty(t, resp.Items)
}
func TestListHandlerReturnsEveryStoredRecord(t *testing.T) {
t.Parallel()
store := newFakeRuntimeRecords()
store.put(sampleRunningRecord(t))
store.put(sampleStoppedRecord(t))
rec := drive(t, Dependencies{RuntimeRecords: store}, http.MethodGet, "/api/v1/internal/runtimes", nil, nil)
require.Equal(t, http.StatusOK, rec.Code)
var resp runtimesListResponse
require.NoError(t, json.NewDecoder(rec.Body).Decode(&resp))
require.Len(t, resp.Items, 2)
gotIDs := map[string]string{}
for _, item := range resp.Items {
gotIDs[item.GameID] = item.Status
}
assert.Equal(t, "running", gotIDs["game-test"])
assert.Equal(t, "stopped", gotIDs["game-stopped"])
}
func TestListHandlerReturnsInternalErrorWhenStoreFails(t *testing.T) {
t.Parallel()
store := newFakeRuntimeRecords()
store.listErr = errors.New("postgres exploded")
rec := drive(t, Dependencies{RuntimeRecords: store}, http.MethodGet, "/api/v1/internal/runtimes", nil, nil)
body := decodeErrorBody(t, rec, http.StatusInternalServerError)
assert.Equal(t, "internal_error", body.Code)
}
func TestListHandlerReturnsInternalErrorWhenStoreNotWired(t *testing.T) {
t.Parallel()
rec := drive(t, Dependencies{}, http.MethodGet, "/api/v1/internal/runtimes", nil, nil)
body := decodeErrorBody(t, rec, http.StatusInternalServerError)
assert.Equal(t, "internal_error", body.Code)
}
func TestGetHandlerReturnsTheRecord(t *testing.T) {
t.Parallel()
store := newFakeRuntimeRecords()
record := sampleRunningRecord(t)
store.put(record)
rec := drive(t, Dependencies{RuntimeRecords: store}, http.MethodGet, "/api/v1/internal/runtimes/game-test", nil, nil)
resp := decodeRecordResponse(t, rec)
assert.Equal(t, "game-test", resp.GameID)
assert.Equal(t, "running", resp.Status)
if assert.NotNil(t, resp.CurrentImageRef) {
assert.Equal(t, "galaxy/game:v1.2.3", *resp.CurrentImageRef)
}
}
func TestGetHandlerReturnsNotFoundForMissingRecord(t *testing.T) {
t.Parallel()
rec := drive(t, Dependencies{RuntimeRecords: newFakeRuntimeRecords()}, http.MethodGet, "/api/v1/internal/runtimes/game-missing", nil, nil)
body := decodeErrorBody(t, rec, http.StatusNotFound)
assert.Equal(t, "not_found", body.Code)
}
func TestGetHandlerReturnsInternalErrorWhenStoreFails(t *testing.T) {
t.Parallel()
store := newFakeRuntimeRecords()
store.getErr = errors.New("transport blew up")
rec := drive(t, Dependencies{RuntimeRecords: store}, http.MethodGet, "/api/v1/internal/runtimes/game-test", nil, nil)
body := decodeErrorBody(t, rec, http.StatusInternalServerError)
assert.Equal(t, "internal_error", body.Code)
}
func TestGetHandlerReturnsInternalErrorWhenStoreNotWired(t *testing.T) {
t.Parallel()
rec := drive(t, Dependencies{}, http.MethodGet, "/api/v1/internal/runtimes/game-test", nil, nil)
body := decodeErrorBody(t, rec, http.StatusInternalServerError)
assert.Equal(t, "internal_error", body.Code)
}
@@ -0,0 +1,38 @@
package handlers
import (
"net/http"
"galaxy/rtmanager/internal/service/startruntime"
)
// newListHandler returns the handler for `GET /api/v1/internal/runtimes`.
// The handler reads directly from `ports.RuntimeRecordStore.List` —
// this surface is read-only and does not produce operation_log rows
// (rationale: see `rtmanager/docs/services.md` §18).
func newListHandler(deps Dependencies) http.HandlerFunc {
logger := loggerFor(deps.Logger, "internal_rest.list")
return func(writer http.ResponseWriter, request *http.Request) {
if deps.RuntimeRecords == nil {
writeError(writer, http.StatusInternalServerError,
startruntime.ErrorCodeInternal,
"runtime records store is not wired",
)
return
}
records, err := deps.RuntimeRecords.List(request.Context())
if err != nil {
logger.ErrorContext(request.Context(), "list runtime records",
"err", err.Error(),
)
writeError(writer, http.StatusInternalServerError,
startruntime.ErrorCodeInternal,
"failed to list runtime records",
)
return
}
writeJSON(writer, http.StatusOK, encodeRuntimesList(records))
}
}
@@ -0,0 +1,217 @@
// Code generated by MockGen. DO NOT EDIT.
// Source: galaxy/rtmanager/internal/api/internalhttp/handlers (interfaces: StartService,StopService,RestartService,PatchService,CleanupService)
//
// Generated by this command:
//
// mockgen -destination=mocks/mock_services.go -package=mocks galaxy/rtmanager/internal/api/internalhttp/handlers StartService,StopService,RestartService,PatchService,CleanupService
//
// Package mocks is a generated GoMock package.
package mocks
import (
context "context"
cleanupcontainer "galaxy/rtmanager/internal/service/cleanupcontainer"
patchruntime "galaxy/rtmanager/internal/service/patchruntime"
restartruntime "galaxy/rtmanager/internal/service/restartruntime"
startruntime "galaxy/rtmanager/internal/service/startruntime"
stopruntime "galaxy/rtmanager/internal/service/stopruntime"
reflect "reflect"
gomock "go.uber.org/mock/gomock"
)
// MockStartService is a mock of StartService interface.
type MockStartService struct {
ctrl *gomock.Controller
recorder *MockStartServiceMockRecorder
isgomock struct{}
}
// MockStartServiceMockRecorder is the mock recorder for MockStartService.
type MockStartServiceMockRecorder struct {
mock *MockStartService
}
// NewMockStartService creates a new mock instance.
func NewMockStartService(ctrl *gomock.Controller) *MockStartService {
mock := &MockStartService{ctrl: ctrl}
mock.recorder = &MockStartServiceMockRecorder{mock}
return mock
}
// EXPECT returns an object that allows the caller to indicate expected use.
func (m *MockStartService) EXPECT() *MockStartServiceMockRecorder {
return m.recorder
}
// Handle mocks base method.
func (m *MockStartService) Handle(ctx context.Context, in startruntime.Input) (startruntime.Result, error) {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "Handle", ctx, in)
ret0, _ := ret[0].(startruntime.Result)
ret1, _ := ret[1].(error)
return ret0, ret1
}
// Handle indicates an expected call of Handle.
func (mr *MockStartServiceMockRecorder) Handle(ctx, in any) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Handle", reflect.TypeOf((*MockStartService)(nil).Handle), ctx, in)
}
// MockStopService is a mock of StopService interface.
type MockStopService struct {
ctrl *gomock.Controller
recorder *MockStopServiceMockRecorder
isgomock struct{}
}
// MockStopServiceMockRecorder is the mock recorder for MockStopService.
type MockStopServiceMockRecorder struct {
mock *MockStopService
}
// NewMockStopService creates a new mock instance.
func NewMockStopService(ctrl *gomock.Controller) *MockStopService {
mock := &MockStopService{ctrl: ctrl}
mock.recorder = &MockStopServiceMockRecorder{mock}
return mock
}
// EXPECT returns an object that allows the caller to indicate expected use.
func (m *MockStopService) EXPECT() *MockStopServiceMockRecorder {
return m.recorder
}
// Handle mocks base method.
func (m *MockStopService) Handle(ctx context.Context, in stopruntime.Input) (stopruntime.Result, error) {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "Handle", ctx, in)
ret0, _ := ret[0].(stopruntime.Result)
ret1, _ := ret[1].(error)
return ret0, ret1
}
// Handle indicates an expected call of Handle.
func (mr *MockStopServiceMockRecorder) Handle(ctx, in any) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Handle", reflect.TypeOf((*MockStopService)(nil).Handle), ctx, in)
}
// MockRestartService is a mock of RestartService interface.
type MockRestartService struct {
ctrl *gomock.Controller
recorder *MockRestartServiceMockRecorder
isgomock struct{}
}
// MockRestartServiceMockRecorder is the mock recorder for MockRestartService.
type MockRestartServiceMockRecorder struct {
mock *MockRestartService
}
// NewMockRestartService creates a new mock instance.
func NewMockRestartService(ctrl *gomock.Controller) *MockRestartService {
mock := &MockRestartService{ctrl: ctrl}
mock.recorder = &MockRestartServiceMockRecorder{mock}
return mock
}
// EXPECT returns an object that allows the caller to indicate expected use.
func (m *MockRestartService) EXPECT() *MockRestartServiceMockRecorder {
return m.recorder
}
// Handle mocks base method.
func (m *MockRestartService) Handle(ctx context.Context, in restartruntime.Input) (restartruntime.Result, error) {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "Handle", ctx, in)
ret0, _ := ret[0].(restartruntime.Result)
ret1, _ := ret[1].(error)
return ret0, ret1
}
// Handle indicates an expected call of Handle.
func (mr *MockRestartServiceMockRecorder) Handle(ctx, in any) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Handle", reflect.TypeOf((*MockRestartService)(nil).Handle), ctx, in)
}
// MockPatchService is a mock of PatchService interface.
type MockPatchService struct {
ctrl *gomock.Controller
recorder *MockPatchServiceMockRecorder
isgomock struct{}
}
// MockPatchServiceMockRecorder is the mock recorder for MockPatchService.
type MockPatchServiceMockRecorder struct {
mock *MockPatchService
}
// NewMockPatchService creates a new mock instance.
func NewMockPatchService(ctrl *gomock.Controller) *MockPatchService {
mock := &MockPatchService{ctrl: ctrl}
mock.recorder = &MockPatchServiceMockRecorder{mock}
return mock
}
// EXPECT returns an object that allows the caller to indicate expected use.
func (m *MockPatchService) EXPECT() *MockPatchServiceMockRecorder {
return m.recorder
}
// Handle mocks base method.
func (m *MockPatchService) Handle(ctx context.Context, in patchruntime.Input) (patchruntime.Result, error) {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "Handle", ctx, in)
ret0, _ := ret[0].(patchruntime.Result)
ret1, _ := ret[1].(error)
return ret0, ret1
}
// Handle indicates an expected call of Handle.
func (mr *MockPatchServiceMockRecorder) Handle(ctx, in any) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Handle", reflect.TypeOf((*MockPatchService)(nil).Handle), ctx, in)
}
// MockCleanupService is a mock of CleanupService interface.
type MockCleanupService struct {
ctrl *gomock.Controller
recorder *MockCleanupServiceMockRecorder
isgomock struct{}
}
// MockCleanupServiceMockRecorder is the mock recorder for MockCleanupService.
type MockCleanupServiceMockRecorder struct {
mock *MockCleanupService
}
// NewMockCleanupService creates a new mock instance.
func NewMockCleanupService(ctrl *gomock.Controller) *MockCleanupService {
mock := &MockCleanupService{ctrl: ctrl}
mock.recorder = &MockCleanupServiceMockRecorder{mock}
return mock
}
// EXPECT returns an object that allows the caller to indicate expected use.
func (m *MockCleanupService) EXPECT() *MockCleanupServiceMockRecorder {
return m.recorder
}
// Handle mocks base method.
func (m *MockCleanupService) Handle(ctx context.Context, in cleanupcontainer.Input) (cleanupcontainer.Result, error) {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "Handle", ctx, in)
ret0, _ := ret[0].(cleanupcontainer.Result)
ret1, _ := ret[1].(error)
return ret0, ret1
}
// Handle indicates an expected call of Handle.
func (mr *MockCleanupServiceMockRecorder) Handle(ctx, in any) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Handle", reflect.TypeOf((*MockCleanupService)(nil).Handle), ctx, in)
}
@@ -0,0 +1,71 @@
package handlers
import (
"net/http"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/service/patchruntime"
"galaxy/rtmanager/internal/service/startruntime"
)
// patchRequestBody mirrors the OpenAPI PatchRequest schema. The
// service layer validates `image_ref` shape (semver, distribution
// reference) and surfaces `image_ref_not_semver` /
// `semver_patch_only` as needed.
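// An illustrative request body, matching the payload exercised by the
// handler tests:
//
//	{"image_ref":"galaxy/game:v1.2.4"}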
type patchRequestBody struct {
ImageRef string `json:"image_ref"`
}
// newPatchHandler returns the handler for
// `POST /api/v1/internal/runtimes/{game_id}/patch`.
func newPatchHandler(deps Dependencies) http.HandlerFunc {
logger := loggerFor(deps.Logger, "internal_rest.patch")
return func(writer http.ResponseWriter, request *http.Request) {
if deps.PatchRuntime == nil {
writeError(writer, http.StatusInternalServerError,
startruntime.ErrorCodeInternal,
"patch runtime service is not wired",
)
return
}
gameID, ok := extractGameID(writer, request)
if !ok {
return
}
var body patchRequestBody
if err := decodeStrictJSON(request.Body, &body); err != nil {
writeError(writer, http.StatusBadRequest,
startruntime.ErrorCodeInvalidRequest,
err.Error(),
)
return
}
result, err := deps.PatchRuntime.Handle(request.Context(), patchruntime.Input{
GameID: gameID,
NewImageRef: body.ImageRef,
OpSource: resolveOpSource(request),
SourceRef: requestSourceRef(request),
})
if err != nil {
logger.ErrorContext(request.Context(), "patch runtime service errored",
"game_id", gameID,
"err", err.Error(),
)
writeError(writer, http.StatusInternalServerError,
startruntime.ErrorCodeInternal,
"patch runtime service failed",
)
return
}
if result.Outcome == operation.OutcomeFailure {
writeFailure(writer, result.ErrorCode, result.ErrorMessage)
return
}
writeJSON(writer, http.StatusOK, encodeRuntimeRecord(result.Record))
}
}
@@ -0,0 +1,55 @@
package handlers
import (
"net/http"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/service/restartruntime"
"galaxy/rtmanager/internal/service/startruntime"
)
// newRestartHandler returns the handler for
// `POST /api/v1/internal/runtimes/{game_id}/restart`. The OpenAPI spec
// declares no request body for this operation; any client-provided
// body is ignored.
func newRestartHandler(deps Dependencies) http.HandlerFunc {
logger := loggerFor(deps.Logger, "internal_rest.restart")
return func(writer http.ResponseWriter, request *http.Request) {
if deps.RestartRuntime == nil {
writeError(writer, http.StatusInternalServerError,
startruntime.ErrorCodeInternal,
"restart runtime service is not wired",
)
return
}
gameID, ok := extractGameID(writer, request)
if !ok {
return
}
result, err := deps.RestartRuntime.Handle(request.Context(), restartruntime.Input{
GameID: gameID,
OpSource: resolveOpSource(request),
SourceRef: requestSourceRef(request),
})
if err != nil {
logger.ErrorContext(request.Context(), "restart runtime service errored",
"game_id", gameID,
"err", err.Error(),
)
writeError(writer, http.StatusInternalServerError,
startruntime.ErrorCodeInternal,
"restart runtime service failed",
)
return
}
if result.Outcome == operation.OutcomeFailure {
writeFailure(writer, result.ErrorCode, result.ErrorMessage)
return
}
writeJSON(writer, http.StatusOK, encodeRuntimeRecord(result.Record))
}
}
@@ -0,0 +1,54 @@
// Package handlers ships the GM/Admin-facing internal REST surface of
// Runtime Manager. The package is consumed by
// `galaxy/rtmanager/internal/api/internalhttp`; each handler delegates
// to one of the lifecycle services in `internal/service/`
// (`startruntime`, `stopruntime`, `restartruntime`, `patchruntime`,
// `cleanupcontainer`) or reads directly from `ports.RuntimeRecordStore`
// (list / get).
//
// The interfaces declared in this file mirror the single `Handle`
// method exposed by every concrete lifecycle service. Production wiring
// passes the concrete service pointers; tests pass `mockgen`-generated
// mocks. The narrow shape keeps the handler layer free of service
// internals (lease tokens, telemetry, durable side effects) and matches
// the repo-wide `mockgen` convention for wide / recorder ports.
package handlers
import (
"context"
"galaxy/rtmanager/internal/service/cleanupcontainer"
"galaxy/rtmanager/internal/service/patchruntime"
"galaxy/rtmanager/internal/service/restartruntime"
"galaxy/rtmanager/internal/service/startruntime"
"galaxy/rtmanager/internal/service/stopruntime"
)
//go:generate go run go.uber.org/mock/mockgen -destination=mocks/mock_services.go -package=mocks galaxy/rtmanager/internal/api/internalhttp/handlers StartService,StopService,RestartService,PatchService,CleanupService
// StartService is the narrow port the start handler depends on. It
// matches the public Handle method of `startruntime.Service`; the
// concrete service satisfies the interface implicitly.
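// An illustrative compile-time assertion (not code in this file); the
// wiring relies on the same implicit satisfaction:
//
//	var _ StartService = (*startruntime.Service)(nil)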
type StartService interface {
Handle(ctx context.Context, in startruntime.Input) (startruntime.Result, error)
}
// StopService is the narrow port the stop handler depends on.
type StopService interface {
Handle(ctx context.Context, in stopruntime.Input) (stopruntime.Result, error)
}
// RestartService is the narrow port the restart handler depends on.
type RestartService interface {
Handle(ctx context.Context, in restartruntime.Input) (restartruntime.Result, error)
}
// PatchService is the narrow port the patch handler depends on.
type PatchService interface {
Handle(ctx context.Context, in patchruntime.Input) (patchruntime.Result, error)
}
// CleanupService is the narrow port the cleanup handler depends on.
type CleanupService interface {
Handle(ctx context.Context, in cleanupcontainer.Input) (cleanupcontainer.Result, error)
}
@@ -0,0 +1,71 @@
package handlers
import (
"net/http"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/service/startruntime"
)
// startRequestBody mirrors the OpenAPI StartRequest schema. Only
// `image_ref` is accepted; unknown fields are rejected by
// decodeStrictJSON.
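// An illustrative request body (image reference format as used in the
// handler tests):
//
//	{"image_ref":"galaxy/game:v1.2.3"}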
type startRequestBody struct {
ImageRef string `json:"image_ref"`
}
// newStartHandler returns the handler for
// `POST /api/v1/internal/runtimes/{game_id}/start`. The handler
// delegates the entire lifecycle to `startruntime.Service`; failure
// codes are mapped to HTTP statuses via mapErrorCodeToStatus.
func newStartHandler(deps Dependencies) http.HandlerFunc {
logger := loggerFor(deps.Logger, "internal_rest.start")
return func(writer http.ResponseWriter, request *http.Request) {
if deps.StartRuntime == nil {
writeError(writer, http.StatusInternalServerError,
startruntime.ErrorCodeInternal,
"start runtime service is not wired",
)
return
}
gameID, ok := extractGameID(writer, request)
if !ok {
return
}
var body startRequestBody
if err := decodeStrictJSON(request.Body, &body); err != nil {
writeError(writer, http.StatusBadRequest,
startruntime.ErrorCodeInvalidRequest,
err.Error(),
)
return
}
result, err := deps.StartRuntime.Handle(request.Context(), startruntime.Input{
GameID: gameID,
ImageRef: body.ImageRef,
OpSource: resolveOpSource(request),
SourceRef: requestSourceRef(request),
})
if err != nil {
logger.ErrorContext(request.Context(), "start runtime service errored",
"game_id", gameID,
"err", err.Error(),
)
writeError(writer, http.StatusInternalServerError,
startruntime.ErrorCodeInternal,
"start runtime service failed",
)
return
}
if result.Outcome == operation.OutcomeFailure {
writeFailure(writer, result.ErrorCode, result.ErrorMessage)
return
}
writeJSON(writer, http.StatusOK, encodeRuntimeRecord(result.Record))
}
}
@@ -0,0 +1,70 @@
package handlers
import (
"net/http"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/service/startruntime"
"galaxy/rtmanager/internal/service/stopruntime"
)
// stopRequestBody mirrors the OpenAPI StopRequest schema. The reason
// enum is validated at the service layer (`stopruntime.Input.Validate`);
// unknown values surface as `invalid_request`.
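// Illustrative shape only; the accepted values are the
// stopruntime.StopReason constants:
//
//	{"reason":"<stop reason>"}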
type stopRequestBody struct {
Reason string `json:"reason"`
}
// newStopHandler returns the handler for
// `POST /api/v1/internal/runtimes/{game_id}/stop`.
func newStopHandler(deps Dependencies) http.HandlerFunc {
logger := loggerFor(deps.Logger, "internal_rest.stop")
return func(writer http.ResponseWriter, request *http.Request) {
if deps.StopRuntime == nil {
writeError(writer, http.StatusInternalServerError,
startruntime.ErrorCodeInternal,
"stop runtime service is not wired",
)
return
}
gameID, ok := extractGameID(writer, request)
if !ok {
return
}
var body stopRequestBody
if err := decodeStrictJSON(request.Body, &body); err != nil {
writeError(writer, http.StatusBadRequest,
startruntime.ErrorCodeInvalidRequest,
err.Error(),
)
return
}
result, err := deps.StopRuntime.Handle(request.Context(), stopruntime.Input{
GameID: gameID,
Reason: stopruntime.StopReason(body.Reason),
OpSource: resolveOpSource(request),
SourceRef: requestSourceRef(request),
})
if err != nil {
logger.ErrorContext(request.Context(), "stop runtime service errored",
"game_id", gameID,
"err", err.Error(),
)
writeError(writer, http.StatusInternalServerError,
startruntime.ErrorCodeInternal,
"stop runtime service failed",
)
return
}
if result.Outcome == operation.OutcomeFailure {
writeFailure(writer, result.ErrorCode, result.ErrorMessage)
return
}
writeJSON(writer, http.StatusOK, encodeRuntimeRecord(result.Record))
}
}
@@ -0,0 +1,363 @@
// Package internalhttp provides the trusted internal HTTP listener used
// by the runnable Runtime Manager process. It exposes `/healthz` and
// `/readyz` plus the GM/Admin REST surface backed by the lifecycle
// services in `internal/service/`.
package internalhttp
import (
"context"
"encoding/json"
"errors"
"fmt"
"log/slog"
"net"
"net/http"
"strconv"
"sync"
"time"
"galaxy/rtmanager/internal/api/internalhttp/handlers"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/telemetry"
"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
"go.opentelemetry.io/otel/attribute"
)
const jsonContentType = "application/json; charset=utf-8"
// errorCodeServiceUnavailable mirrors the stable error code declared in
// `rtmanager/api/internal-openapi.yaml` §Error Model.
const errorCodeServiceUnavailable = "service_unavailable"
// HealthzPath and ReadyzPath are the internal probe routes documented in
// `rtmanager/api/internal-openapi.yaml`.
const (
HealthzPath = "/healthz"
ReadyzPath = "/readyz"
)
// ReadinessProbe reports whether the dependencies the listener guards
// (PostgreSQL, Redis, Docker) are reachable. A non-nil error is reported
// to the caller as `503 service_unavailable` with the wrapped message.
type ReadinessProbe interface {
Check(ctx context.Context) error
}
// Config describes the trusted internal HTTP listener owned by Runtime
// Manager.
type Config struct {
// Addr is the TCP listen address used by the internal HTTP server.
Addr string
// ReadHeaderTimeout bounds how long the listener may spend reading
// request headers before the server rejects the connection.
ReadHeaderTimeout time.Duration
// ReadTimeout bounds how long the listener may spend reading one
// request.
ReadTimeout time.Duration
// WriteTimeout bounds how long the listener may spend writing one
// response.
WriteTimeout time.Duration
// IdleTimeout bounds how long the listener keeps an idle keep-alive
// connection open.
IdleTimeout time.Duration
}
// Validate reports whether cfg contains a usable internal HTTP listener
// configuration.
func (cfg Config) Validate() error {
switch {
case cfg.Addr == "":
return errors.New("internal HTTP addr must not be empty")
case cfg.ReadHeaderTimeout <= 0:
return errors.New("internal HTTP read header timeout must be positive")
case cfg.ReadTimeout <= 0:
return errors.New("internal HTTP read timeout must be positive")
case cfg.WriteTimeout <= 0:
return errors.New("internal HTTP write timeout must be positive")
case cfg.IdleTimeout <= 0:
return errors.New("internal HTTP idle timeout must be positive")
default:
return nil
}
}
// Dependencies describes the collaborators used by the internal HTTP
// transport layer. The listener still works when the lifecycle service
// fields are zero — handlers register but each returns
// `500 internal_error` until the runtime wires the real services.
type Dependencies struct {
// Logger writes structured listener lifecycle logs. When nil,
// slog.Default is used.
Logger *slog.Logger
// Telemetry records low-cardinality probe metrics and lifecycle
// events.
Telemetry *telemetry.Runtime
// Readiness reports whether PG / Redis / Docker are reachable. A
// nil readiness probe makes `/readyz` always answer `200`; the
// runtime always supplies a real probe in production wiring.
Readiness ReadinessProbe
// RuntimeRecords backs the read-only list/get handlers. When nil
// those routes return `500 internal_error`.
RuntimeRecords ports.RuntimeRecordStore
// StartRuntime, StopRuntime, RestartRuntime, PatchRuntime, and
// CleanupContainer back the lifecycle handlers. Each accepts a
// narrow interface so tests can pass `mockgen`-generated mocks;
// production wiring passes the concrete `*<lifecycle>.Service`
// pointer.
StartRuntime handlers.StartService
StopRuntime handlers.StopService
RestartRuntime handlers.RestartService
PatchRuntime handlers.PatchService
CleanupContainer handlers.CleanupService
}
// Server owns the trusted internal HTTP listener exposed by Runtime
// Manager.
type Server struct {
cfg Config
handler http.Handler
logger *slog.Logger
metrics *telemetry.Runtime
stateMu sync.RWMutex
server *http.Server
listener net.Listener
}
// NewServer constructs one trusted internal HTTP server for cfg and deps.
func NewServer(cfg Config, deps Dependencies) (*Server, error) {
if err := cfg.Validate(); err != nil {
return nil, fmt.Errorf("new internal HTTP server: %w", err)
}
logger := deps.Logger
if logger == nil {
logger = slog.Default()
}
return &Server{
cfg: cfg,
handler: newHandler(deps, logger),
logger: logger.With("component", "internal_http"),
metrics: deps.Telemetry,
}, nil
}
// Addr returns the currently bound listener address after Run is called.
// It returns an empty string if the server has not yet bound a listener.
func (server *Server) Addr() string {
server.stateMu.RLock()
defer server.stateMu.RUnlock()
if server.listener == nil {
return ""
}
return server.listener.Addr().String()
}
// Run binds the configured listener and serves the internal HTTP surface
// until Shutdown closes the server.
func (server *Server) Run(ctx context.Context) error {
if ctx == nil {
return errors.New("run internal HTTP server: nil context")
}
if err := ctx.Err(); err != nil {
return err
}
listener, err := net.Listen("tcp", server.cfg.Addr)
if err != nil {
return fmt.Errorf("run internal HTTP server: listen on %q: %w", server.cfg.Addr, err)
}
httpServer := &http.Server{
Handler: server.handler,
ReadHeaderTimeout: server.cfg.ReadHeaderTimeout,
ReadTimeout: server.cfg.ReadTimeout,
WriteTimeout: server.cfg.WriteTimeout,
IdleTimeout: server.cfg.IdleTimeout,
}
server.stateMu.Lock()
server.server = httpServer
server.listener = listener
server.stateMu.Unlock()
server.logger.Info("rtmanager internal HTTP server started", "addr", listener.Addr().String())
defer func() {
server.stateMu.Lock()
server.server = nil
server.listener = nil
server.stateMu.Unlock()
}()
err = httpServer.Serve(listener)
switch {
case err == nil:
return nil
case errors.Is(err, http.ErrServerClosed):
server.logger.Info("rtmanager internal HTTP server stopped")
return nil
default:
return fmt.Errorf("run internal HTTP server: serve on %q: %w", server.cfg.Addr, err)
}
}
// Shutdown gracefully stops the internal HTTP server within ctx.
func (server *Server) Shutdown(ctx context.Context) error {
if ctx == nil {
return errors.New("shutdown internal HTTP server: nil context")
}
server.stateMu.RLock()
httpServer := server.server
server.stateMu.RUnlock()
if httpServer == nil {
return nil
}
if err := httpServer.Shutdown(ctx); err != nil && !errors.Is(err, http.ErrServerClosed) {
return fmt.Errorf("shutdown internal HTTP server: %w", err)
}
return nil
}
func newHandler(deps Dependencies, logger *slog.Logger) http.Handler {
mux := http.NewServeMux()
mux.HandleFunc("GET "+HealthzPath, handleHealthz)
mux.HandleFunc("GET "+ReadyzPath, handleReadyz(deps.Readiness, logger))
handlers.Register(mux, handlers.Dependencies{
Logger: logger,
RuntimeRecords: deps.RuntimeRecords,
StartRuntime: deps.StartRuntime,
StopRuntime: deps.StopRuntime,
RestartRuntime: deps.RestartRuntime,
PatchRuntime: deps.PatchRuntime,
CleanupContainer: deps.CleanupContainer,
})
metrics := deps.Telemetry
options := []otelhttp.Option{}
if metrics != nil {
options = append(options,
otelhttp.WithTracerProvider(metrics.TracerProvider()),
otelhttp.WithMeterProvider(metrics.MeterProvider()),
)
}
return otelhttp.NewHandler(withObservability(mux, metrics), "rtmanager.internal_http", options...)
}
func withObservability(next http.Handler, metrics *telemetry.Runtime) http.Handler {
return http.HandlerFunc(func(writer http.ResponseWriter, request *http.Request) {
startedAt := time.Now()
recorder := &statusRecorder{
ResponseWriter: writer,
statusCode: http.StatusOK,
}
next.ServeHTTP(recorder, request)
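// request.Pattern carries the matched ServeMux pattern (empty when no
// route matched); fold rejected and unmatched requests into fixed
// labels so the route attribute stays low-cardinality.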
route := request.Pattern
switch recorder.statusCode {
case http.StatusMethodNotAllowed:
route = "method_not_allowed"
case http.StatusNotFound:
route = "not_found"
case 0:
route = "unmatched"
}
if route == "" {
route = "unmatched"
}
if metrics != nil {
metrics.RecordInternalHTTPRequest(
request.Context(),
[]attribute.KeyValue{
attribute.String("route", route),
attribute.String("method", request.Method),
attribute.String("status_code", strconv.Itoa(recorder.statusCode)),
},
time.Since(startedAt),
)
}
})
}
func handleHealthz(writer http.ResponseWriter, _ *http.Request) {
writeStatusResponse(writer, http.StatusOK, "ok")
}
func handleReadyz(probe ReadinessProbe, logger *slog.Logger) http.HandlerFunc {
return func(writer http.ResponseWriter, request *http.Request) {
if probe == nil {
writeStatusResponse(writer, http.StatusOK, "ready")
return
}
if err := probe.Check(request.Context()); err != nil {
logger.WarnContext(request.Context(), "rtmanager readiness probe failed",
"err", err.Error(),
)
writeServiceUnavailable(writer, err.Error())
return
}
writeStatusResponse(writer, http.StatusOK, "ready")
}
}
func writeStatusResponse(writer http.ResponseWriter, statusCode int, status string) {
writer.Header().Set("Content-Type", jsonContentType)
writer.WriteHeader(statusCode)
_ = json.NewEncoder(writer).Encode(statusResponse{Status: status})
}
func writeServiceUnavailable(writer http.ResponseWriter, message string) {
writer.Header().Set("Content-Type", jsonContentType)
writer.WriteHeader(http.StatusServiceUnavailable)
_ = json.NewEncoder(writer).Encode(errorResponse{
Error: errorBody{
Code: errorCodeServiceUnavailable,
Message: message,
},
})
}
type statusResponse struct {
Status string `json:"status"`
}
type errorBody struct {
Code string `json:"code"`
Message string `json:"message"`
}
type errorResponse struct {
Error errorBody `json:"error"`
}
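// statusRecorder wraps http.ResponseWriter so withObservability can
// report the status code the handler actually wrote.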
type statusRecorder struct {
http.ResponseWriter
statusCode int
}
func (recorder *statusRecorder) WriteHeader(statusCode int) {
recorder.statusCode = statusCode
recorder.ResponseWriter.WriteHeader(statusCode)
}
@@ -0,0 +1,115 @@
package internalhttp
import (
"context"
"encoding/json"
"errors"
"net/http"
"net/http/httptest"
"strings"
"testing"
"time"
"github.com/stretchr/testify/require"
)
func newTestConfig() Config {
return Config{
Addr: ":0",
ReadHeaderTimeout: time.Second,
ReadTimeout: time.Second,
WriteTimeout: time.Second,
IdleTimeout: time.Second,
}
}
type stubReadiness struct {
err error
}
func (probe stubReadiness) Check(_ context.Context) error {
return probe.err
}
func newTestServer(t *testing.T, deps Dependencies) http.Handler {
t.Helper()
server, err := NewServer(newTestConfig(), deps)
require.NoError(t, err)
return server.handler
}
func TestHealthzReturnsOK(t *testing.T) {
t.Parallel()
handler := newTestServer(t, Dependencies{})
rec := httptest.NewRecorder()
req := httptest.NewRequest(http.MethodGet, HealthzPath, nil)
handler.ServeHTTP(rec, req)
require.Equal(t, http.StatusOK, rec.Code)
require.Equal(t, jsonContentType, rec.Header().Get("Content-Type"))
var body statusResponse
require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &body))
require.Equal(t, "ok", body.Status)
}
func TestReadyzReturnsReadyWhenProbeIsNil(t *testing.T) {
t.Parallel()
handler := newTestServer(t, Dependencies{})
rec := httptest.NewRecorder()
req := httptest.NewRequest(http.MethodGet, ReadyzPath, nil)
handler.ServeHTTP(rec, req)
require.Equal(t, http.StatusOK, rec.Code)
var body statusResponse
require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &body))
require.Equal(t, "ready", body.Status)
}
func TestReadyzReturnsReadyWhenProbeSucceeds(t *testing.T) {
t.Parallel()
handler := newTestServer(t, Dependencies{Readiness: stubReadiness{}})
rec := httptest.NewRecorder()
req := httptest.NewRequest(http.MethodGet, ReadyzPath, nil)
handler.ServeHTTP(rec, req)
require.Equal(t, http.StatusOK, rec.Code)
var body statusResponse
require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &body))
require.Equal(t, "ready", body.Status)
}
func TestReadyzReturnsServiceUnavailableWhenProbeFails(t *testing.T) {
t.Parallel()
handler := newTestServer(t, Dependencies{
Readiness: stubReadiness{err: errors.New("postgres ping: connection refused")},
})
rec := httptest.NewRecorder()
req := httptest.NewRequest(http.MethodGet, ReadyzPath, nil)
handler.ServeHTTP(rec, req)
require.Equal(t, http.StatusServiceUnavailable, rec.Code)
require.Equal(t, jsonContentType, rec.Header().Get("Content-Type"))
var body errorResponse
require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &body))
require.Equal(t, errorCodeServiceUnavailable, body.Error.Code)
require.True(t, strings.Contains(body.Error.Message, "postgres"))
}
func TestNewServerRejectsInvalidConfig(t *testing.T) {
t.Parallel()
_, err := NewServer(Config{}, Dependencies{})
require.Error(t, err)
}
@@ -0,0 +1,170 @@
// Package app wires the Runtime Manager process lifecycle and
// coordinates component startup and graceful shutdown.
package app
import (
"context"
"errors"
"fmt"
"sync"
"galaxy/rtmanager/internal/config"
)
// Component is a long-lived Runtime Manager subsystem that participates
// in coordinated startup and graceful shutdown.
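// The internal HTTP server and the background workers registered in
// runtime.go are the production implementations.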
type Component interface {
// Run starts the component and blocks until it stops.
Run(context.Context) error
// Shutdown stops the component within the provided timeout-bounded
// context.
Shutdown(context.Context) error
}
// App owns the process-level lifecycle of Runtime Manager and its
// registered components.
type App struct {
cfg config.Config
components []Component
}
// New constructs App with a defensive copy of the supplied components.
func New(cfg config.Config, components ...Component) *App {
clonedComponents := append([]Component(nil), components...)
return &App{
cfg: cfg,
components: clonedComponents,
}
}
// Run starts all configured components, waits for cancellation or the
// first component failure, and then executes best-effort graceful
// shutdown.
func (app *App) Run(ctx context.Context) error {
if ctx == nil {
return errors.New("run rtmanager app: nil context")
}
if err := app.validate(); err != nil {
return err
}
if len(app.components) == 0 {
<-ctx.Done()
return nil
}
runCtx, cancel := context.WithCancel(ctx)
defer cancel()
results := make(chan componentResult, len(app.components))
var runWaitGroup sync.WaitGroup
for index, component := range app.components {
runWaitGroup.Add(1)
go func(componentIndex int, component Component) {
defer runWaitGroup.Done()
results <- componentResult{
index: componentIndex,
err: component.Run(runCtx),
}
}(index, component)
}
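// Block until the parent context is canceled or the first component
// returns; classifyComponentResult decides whether that return counts
// as a failure.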
var runErr error
select {
case <-ctx.Done():
case result := <-results:
runErr = classifyComponentResult(ctx, result)
}
cancel()
shutdownErr := app.shutdownComponents()
waitErr := app.waitForComponents(&runWaitGroup)
return errors.Join(runErr, shutdownErr, waitErr)
}
type componentResult struct {
index int
err error
}
func (app *App) validate() error {
if app.cfg.ShutdownTimeout <= 0 {
return fmt.Errorf("run rtmanager app: shutdown timeout must be positive, got %s", app.cfg.ShutdownTimeout)
}
for index, component := range app.components {
if component == nil {
return fmt.Errorf("run rtmanager app: component %d is nil", index)
}
}
return nil
}
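// classifyComponentResult maps one component's exit into the error App.Run
// reports: expected shutdown exits (context.Canceled after cancellation, or
// a nil error once the parent context is done) are swallowed, while a clean
// exit before shutdown or any other error surfaces as a failure.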
func classifyComponentResult(parentCtx context.Context, result componentResult) error {
switch {
case result.err == nil:
if parentCtx.Err() != nil {
return nil
}
return fmt.Errorf("run rtmanager app: component %d exited without error before shutdown", result.index)
case errors.Is(result.err, context.Canceled) && parentCtx.Err() != nil:
return nil
default:
return fmt.Errorf("run rtmanager app: component %d: %w", result.index, result.err)
}
}
func (app *App) shutdownComponents() error {
var shutdownWaitGroup sync.WaitGroup
errs := make(chan error, len(app.components))
for index, component := range app.components {
shutdownWaitGroup.Add(1)
go func(componentIndex int, component Component) {
defer shutdownWaitGroup.Done()
shutdownCtx, cancel := context.WithTimeout(context.Background(), app.cfg.ShutdownTimeout)
defer cancel()
if err := component.Shutdown(shutdownCtx); err != nil {
errs <- fmt.Errorf("shutdown rtmanager component %d: %w", componentIndex, err)
}
}(index, component)
}
shutdownWaitGroup.Wait()
close(errs)
var joined error
for err := range errs {
joined = errors.Join(joined, err)
}
return joined
}
func (app *App) waitForComponents(runWaitGroup *sync.WaitGroup) error {
done := make(chan struct{})
go func() {
runWaitGroup.Wait()
close(done)
}()
waitCtx, cancel := context.WithTimeout(context.Background(), app.cfg.ShutdownTimeout)
defer cancel()
select {
case <-done:
return nil
case <-waitCtx.Done():
return fmt.Errorf("wait for rtmanager components: %w", waitCtx.Err())
}
}
@@ -0,0 +1,137 @@
package app
import (
"context"
"errors"
"sync/atomic"
"testing"
"time"
"galaxy/rtmanager/internal/config"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
type fakeComponent struct {
runErr error
shutdownErr error
runHook func(context.Context) error
shutdownHook func(context.Context) error
runCount atomic.Int32
downCount atomic.Int32
blockForCtx bool
}
func (component *fakeComponent) Run(ctx context.Context) error {
component.runCount.Add(1)
if component.runHook != nil {
return component.runHook(ctx)
}
if component.blockForCtx {
<-ctx.Done()
return ctx.Err()
}
return component.runErr
}
func (component *fakeComponent) Shutdown(ctx context.Context) error {
component.downCount.Add(1)
if component.shutdownHook != nil {
return component.shutdownHook(ctx)
}
return component.shutdownErr
}
func newCfg() config.Config {
return config.Config{ShutdownTimeout: time.Second}
}
func TestAppRunWithoutComponentsBlocksUntilContextDone(t *testing.T) {
t.Parallel()
app := New(newCfg())
ctx, cancel := context.WithCancel(context.Background())
cancel()
require.NoError(t, app.Run(ctx))
}
func TestAppRunReturnsOnContextCancel(t *testing.T) {
t.Parallel()
component := &fakeComponent{blockForCtx: true}
app := New(newCfg(), component)
ctx, cancel := context.WithCancel(context.Background())
go func() {
time.Sleep(10 * time.Millisecond)
cancel()
}()
require.NoError(t, app.Run(ctx))
assert.EqualValues(t, 1, component.runCount.Load())
assert.EqualValues(t, 1, component.downCount.Load())
}
func TestAppRunPropagatesComponentFailure(t *testing.T) {
t.Parallel()
failure := errors.New("boom")
component := &fakeComponent{runErr: failure}
app := New(newCfg(), component)
err := app.Run(context.Background())
require.Error(t, err)
require.ErrorIs(t, err, failure)
assert.EqualValues(t, 1, component.downCount.Load())
}
func TestAppRunFailsOnNilContext(t *testing.T) {
t.Parallel()
app := New(newCfg())
var ctx context.Context
require.Error(t, app.Run(ctx))
}
func TestAppRunFailsOnNonPositiveShutdownTimeout(t *testing.T) {
t.Parallel()
app := New(config.Config{}, &fakeComponent{})
require.Error(t, app.Run(context.Background()))
}
func TestAppRunFailsOnNilComponent(t *testing.T) {
t.Parallel()
app := New(newCfg(), nil)
require.Error(t, app.Run(context.Background()))
}
func TestAppRunFlagsCleanExitBeforeShutdown(t *testing.T) {
t.Parallel()
component := &fakeComponent{}
app := New(newCfg(), component)
err := app.Run(context.Background())
require.ErrorContains(t, err, "exited without error")
}
@@ -0,0 +1,85 @@
package app
import (
"context"
"errors"
"fmt"
"time"
"galaxy/redisconn"
"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/telemetry"
"github.com/docker/docker/client"
"github.com/redis/go-redis/v9"
)
// newRedisClient builds the master Redis client from cfg via the shared
// `pkg/redisconn` helper. Replica clients are not opened in this iteration
// per ARCHITECTURE.md §Persistence Backends; they will be wired when read
// routing is introduced.
func newRedisClient(cfg config.RedisConfig) *redis.Client {
return redisconn.NewMasterClient(cfg.Conn)
}
// instrumentRedisClient attaches the OpenTelemetry tracing and metrics
// instrumentation to client when telemetryRuntime is available. The
// actual instrumentation lives in `pkg/redisconn` so every Galaxy service
// shares one surface.
func instrumentRedisClient(redisClient *redis.Client, telemetryRuntime *telemetry.Runtime) error {
if redisClient == nil {
return errors.New("instrument redis client: nil client")
}
if telemetryRuntime == nil {
return nil
}
return redisconn.Instrument(redisClient,
redisconn.WithTracerProvider(telemetryRuntime.TracerProvider()),
redisconn.WithMeterProvider(telemetryRuntime.MeterProvider()),
)
}
// pingRedis performs a single Redis PING bounded by
// cfg.Conn.OperationTimeout to confirm that the configured Redis endpoint
// is reachable at startup.
func pingRedis(ctx context.Context, cfg config.RedisConfig, redisClient *redis.Client) error {
return redisconn.Ping(ctx, redisClient, cfg.Conn.OperationTimeout)
}
// newDockerClient constructs a Docker SDK client for cfg.Host with an
// optional API version override. The bootstrap layer opens and pings
// the client; the production Docker adapter wraps it for the service
// layer.
func newDockerClient(cfg config.DockerConfig) (*client.Client, error) {
options := []client.Opt{client.WithHost(cfg.Host)}
if cfg.APIVersion == "" {
options = append(options, client.WithAPIVersionNegotiation())
} else {
options = append(options, client.WithVersion(cfg.APIVersion))
}
docker, err := client.NewClientWithOpts(options...)
if err != nil {
return nil, fmt.Errorf("new docker client: %w", err)
}
return docker, nil
}
// pingDocker bounds one Docker daemon ping under timeout and returns a
// wrapped error so startup failures are easy to spot in service logs.
func pingDocker(ctx context.Context, dockerClient *client.Client, timeout time.Duration) error {
if dockerClient == nil {
return errors.New("ping docker: nil client")
}
if timeout <= 0 {
return errors.New("ping docker: timeout must be positive")
}
pingCtx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
if _, err := dockerClient.Ping(pingCtx); err != nil {
return fmt.Errorf("ping docker: %w", err)
}
return nil
}
@@ -0,0 +1,82 @@
package app
import (
"context"
"testing"
"time"
"galaxy/redisconn"
"galaxy/rtmanager/internal/config"
"github.com/alicebob/miniredis/v2"
"github.com/stretchr/testify/require"
)
func newTestRedisCfg(addr string) config.RedisConfig {
return config.RedisConfig{
Conn: redisconn.Config{
MasterAddr: addr,
Password: "test",
OperationTimeout: time.Second,
},
}
}
func TestPingRedisSucceedsAgainstMiniredis(t *testing.T) {
t.Parallel()
server := miniredis.RunT(t)
redisCfg := newTestRedisCfg(server.Addr())
client := newRedisClient(redisCfg)
t.Cleanup(func() { _ = client.Close() })
require.NoError(t, pingRedis(context.Background(), redisCfg, client))
}
func TestPingRedisReturnsErrorWhenClosed(t *testing.T) {
t.Parallel()
server := miniredis.RunT(t)
redisCfg := newTestRedisCfg(server.Addr())
client := newRedisClient(redisCfg)
require.NoError(t, client.Close())
require.Error(t, pingRedis(context.Background(), redisCfg, client))
}
func TestNewDockerClientHonoursHostOverride(t *testing.T) {
t.Parallel()
docker, err := newDockerClient(config.DockerConfig{
Host: "unix:///var/run/docker.sock",
APIVersion: "1.43",
Network: "galaxy-net",
LogDriver: "json-file",
PullPolicy: config.ImagePullPolicyIfMissing,
})
require.NoError(t, err)
require.NotNil(t, docker)
require.NoError(t, docker.Close())
}
func TestPingDockerRejectsNilClient(t *testing.T) {
t.Parallel()
require.Error(t, pingDocker(context.Background(), nil, time.Second))
}
func TestPingDockerRejectsNonPositiveTimeout(t *testing.T) {
t.Parallel()
docker, err := newDockerClient(config.DockerConfig{
Host: "unix:///var/run/docker.sock",
Network: "galaxy-net",
LogDriver: "json-file",
})
require.NoError(t, err)
t.Cleanup(func() { _ = docker.Close() })
require.Error(t, pingDocker(context.Background(), docker, 0))
}
@@ -0,0 +1,262 @@
package app
import (
"context"
"database/sql"
"errors"
"fmt"
"log/slog"
"time"
"galaxy/postgres"
"galaxy/redisconn"
"galaxy/rtmanager/internal/adapters/postgres/migrations"
"galaxy/rtmanager/internal/api/internalhttp"
"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/telemetry"
dockerclient "github.com/docker/docker/client"
"github.com/redis/go-redis/v9"
)
// Runtime owns the runnable Runtime Manager process plus the cleanup
// functions that release runtime resources after shutdown.
type Runtime struct {
cfg config.Config
app *App
wiring *wiring
internalServer *internalhttp.Server
cleanupFns []func() error
}
// NewRuntime constructs the runnable Runtime Manager process from cfg.
//
// PostgreSQL migrations apply strictly before the internal HTTP listener
// becomes ready. The runtime opens one shared `*redis.Client`, one
// `*sql.DB`, one Docker SDK client, and one OpenTelemetry runtime; all
// are released in reverse construction order on shutdown.
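//
// A minimal usage sketch from a hypothetical entrypoint (error handling
// elided):
//
//	rt, err := NewRuntime(ctx, cfg, logger)
//	if err != nil { /* handle */ }
//	defer rt.Close()
//	_ = rt.Run(ctx)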
func NewRuntime(ctx context.Context, cfg config.Config, logger *slog.Logger) (*Runtime, error) {
if ctx == nil {
return nil, errors.New("new rtmanager runtime: nil context")
}
if err := cfg.Validate(); err != nil {
return nil, fmt.Errorf("new rtmanager runtime: %w", err)
}
if logger == nil {
logger = slog.Default()
}
runtime := &Runtime{
cfg: cfg,
}
cleanupOnError := func(err error) (*Runtime, error) {
if cleanupErr := runtime.Close(); cleanupErr != nil {
return nil, fmt.Errorf("%w; cleanup: %w", err, cleanupErr)
}
return nil, err
}
telemetryRuntime, err := telemetry.NewProcess(ctx, telemetry.ProcessConfig{
ServiceName: cfg.Telemetry.ServiceName,
TracesExporter: cfg.Telemetry.TracesExporter,
MetricsExporter: cfg.Telemetry.MetricsExporter,
TracesProtocol: cfg.Telemetry.TracesProtocol,
MetricsProtocol: cfg.Telemetry.MetricsProtocol,
StdoutTracesEnabled: cfg.Telemetry.StdoutTracesEnabled,
StdoutMetricsEnabled: cfg.Telemetry.StdoutMetricsEnabled,
}, logger)
if err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: telemetry: %w", err))
}
runtime.cleanupFns = append(runtime.cleanupFns, func() error {
shutdownCtx, cancel := context.WithTimeout(context.Background(), cfg.ShutdownTimeout)
defer cancel()
return telemetryRuntime.Shutdown(shutdownCtx)
})
redisClient := newRedisClient(cfg.Redis)
if err := instrumentRedisClient(redisClient, telemetryRuntime); err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: %w", err))
}
runtime.cleanupFns = append(runtime.cleanupFns, func() error {
err := redisClient.Close()
if errors.Is(err, redis.ErrClosed) {
return nil
}
return err
})
if err := pingRedis(ctx, cfg.Redis, redisClient); err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: %w", err))
}
pgPool, err := postgres.OpenPrimary(ctx, cfg.Postgres.Conn,
postgres.WithTracerProvider(telemetryRuntime.TracerProvider()),
postgres.WithMeterProvider(telemetryRuntime.MeterProvider()),
)
if err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: open postgres: %w", err))
}
runtime.cleanupFns = append(runtime.cleanupFns, pgPool.Close)
unregisterPGStats, err := postgres.InstrumentDBStats(pgPool,
postgres.WithMeterProvider(telemetryRuntime.MeterProvider()),
)
if err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: instrument postgres: %w", err))
}
runtime.cleanupFns = append(runtime.cleanupFns, func() error {
return unregisterPGStats()
})
if err := postgres.Ping(ctx, pgPool, cfg.Postgres.Conn.OperationTimeout); err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: ping postgres: %w", err))
}
if err := postgres.RunMigrations(ctx, pgPool, migrations.FS(), "."); err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: run postgres migrations: %w", err))
}
dockerClient, err := newDockerClient(cfg.Docker)
if err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: %w", err))
}
runtime.cleanupFns = append(runtime.cleanupFns, dockerClient.Close)
if err := pingDocker(ctx, dockerClient, cfg.Postgres.Conn.OperationTimeout); err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: %w", err))
}
wiring, err := newWiring(cfg, redisClient, pgPool, dockerClient, time.Now, logger, telemetryRuntime)
if err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: wiring: %w", err))
}
runtime.wiring = wiring
runtime.cleanupFns = append(runtime.cleanupFns, wiring.close)
if err := wiring.registerTelemetryGauges(); err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: register telemetry gauges: %w", err))
}
if err := wiring.reconciler.ReconcileNow(ctx); err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: initial reconcile: %w", err))
}
probe := newReadinessProbe(pgPool, redisClient, dockerClient, cfg)
internalServer, err := internalhttp.NewServer(internalhttp.Config{
Addr: cfg.InternalHTTP.Addr,
ReadHeaderTimeout: cfg.InternalHTTP.ReadHeaderTimeout,
ReadTimeout: cfg.InternalHTTP.ReadTimeout,
WriteTimeout: cfg.InternalHTTP.WriteTimeout,
IdleTimeout: cfg.InternalHTTP.IdleTimeout,
}, internalhttp.Dependencies{
Logger: logger,
Telemetry: telemetryRuntime,
Readiness: probe,
RuntimeRecords: wiring.runtimeRecordStore,
StartRuntime: wiring.startRuntimeService,
StopRuntime: wiring.stopRuntimeService,
RestartRuntime: wiring.restartRuntimeService,
PatchRuntime: wiring.patchRuntimeService,
CleanupContainer: wiring.cleanupContainerService,
})
if err != nil {
return cleanupOnError(fmt.Errorf("new rtmanager runtime: internal HTTP server: %w", err))
}
runtime.internalServer = internalServer
runtime.app = New(cfg,
internalServer,
wiring.startJobsConsumer,
wiring.stopJobsConsumer,
wiring.dockerEventsListener,
wiring.healthProbeWorker,
wiring.dockerInspectWorker,
wiring.reconciler,
wiring.containerCleanupWorker,
)
return runtime, nil
}
// InternalServer returns the internal HTTP server owned by runtime. It is
// primarily exposed for tests; production code should not depend on it.
func (runtime *Runtime) InternalServer() *internalhttp.Server {
if runtime == nil {
return nil
}
return runtime.internalServer
}
// Run serves the internal HTTP listener until ctx is canceled or one
// component fails.
func (runtime *Runtime) Run(ctx context.Context) error {
if ctx == nil {
return errors.New("run rtmanager runtime: nil context")
}
if runtime == nil {
return errors.New("run rtmanager runtime: nil runtime")
}
if runtime.app == nil {
return errors.New("run rtmanager runtime: nil app")
}
return runtime.app.Run(ctx)
}
// Close releases every runtime dependency in reverse construction order.
// Close is safe to call multiple times.
func (runtime *Runtime) Close() error {
if runtime == nil {
return nil
}
var joined error
for index := len(runtime.cleanupFns) - 1; index >= 0; index-- {
if err := runtime.cleanupFns[index](); err != nil {
joined = errors.Join(joined, err)
}
}
runtime.cleanupFns = nil
return joined
}
// readinessProbe pings every steady-state dependency the listener
// guards: the PostgreSQL primary, the Redis master, and the Docker
// daemon.
type readinessProbe struct {
pgPool *sql.DB
redisClient *redis.Client
dockerClient *dockerclient.Client
postgresTimeout time.Duration
redisTimeout time.Duration
dockerTimeout time.Duration
}
func newReadinessProbe(pgPool *sql.DB, redisClient *redis.Client, dockerClient *dockerclient.Client, cfg config.Config) *readinessProbe {
return &readinessProbe{
pgPool: pgPool,
redisClient: redisClient,
dockerClient: dockerClient,
postgresTimeout: cfg.Postgres.Conn.OperationTimeout,
redisTimeout: cfg.Redis.Conn.OperationTimeout,
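// The Docker config carries no dedicated probe timeout, so the
// Postgres operation timeout is reused (mirroring the startup ping in
// NewRuntime).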
dockerTimeout: cfg.Postgres.Conn.OperationTimeout,
}
}
// Check pings PostgreSQL, Redis, and Docker. The first failing
// dependency aborts the check so callers see a single, actionable
// error.
func (probe *readinessProbe) Check(ctx context.Context) error {
if err := postgres.Ping(ctx, probe.pgPool, probe.postgresTimeout); err != nil {
return err
}
if err := redisconn.Ping(ctx, probe.redisClient, probe.redisTimeout); err != nil {
return err
}
return pingDocker(ctx, probe.dockerClient, probe.dockerTimeout)
}
@@ -0,0 +1,541 @@
package app
import (
"context"
"database/sql"
"errors"
"fmt"
"log/slog"
"net/http"
"time"
"galaxy/rtmanager/internal/adapters/docker"
"galaxy/rtmanager/internal/adapters/healtheventspublisher"
"galaxy/rtmanager/internal/adapters/jobresultspublisher"
"galaxy/rtmanager/internal/adapters/lobbyclient"
"galaxy/rtmanager/internal/adapters/notificationpublisher"
"galaxy/rtmanager/internal/adapters/postgres/healthsnapshotstore"
"galaxy/rtmanager/internal/adapters/postgres/operationlogstore"
"galaxy/rtmanager/internal/adapters/postgres/runtimerecordstore"
"galaxy/rtmanager/internal/adapters/redisstate/gamelease"
"galaxy/rtmanager/internal/adapters/redisstate/streamoffsets"
"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/cleanupcontainer"
"galaxy/rtmanager/internal/service/patchruntime"
"galaxy/rtmanager/internal/service/restartruntime"
"galaxy/rtmanager/internal/service/startruntime"
"galaxy/rtmanager/internal/service/stopruntime"
"galaxy/rtmanager/internal/telemetry"
"galaxy/rtmanager/internal/worker/containercleanup"
"galaxy/rtmanager/internal/worker/dockerevents"
"galaxy/rtmanager/internal/worker/dockerinspect"
"galaxy/rtmanager/internal/worker/healthprobe"
"galaxy/rtmanager/internal/worker/reconcile"
"galaxy/rtmanager/internal/worker/startjobsconsumer"
"galaxy/rtmanager/internal/worker/stopjobsconsumer"
dockerclient "github.com/docker/docker/client"
"github.com/redis/go-redis/v9"
"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
)
// wiring owns the process-level singletons constructed once during
// `NewRuntime` and consumed by every worker and HTTP handler.
//
// Callers in this package reach the store / adapter / service
// singletons directly through the struct fields.
type wiring struct {
cfg config.Config
redisClient *redis.Client
pgPool *sql.DB
dockerClient *dockerclient.Client
clock func() time.Time
logger *slog.Logger
telemetry *telemetry.Runtime
// Persistence stores.
runtimeRecordStore *runtimerecordstore.Store
operationLogStore *operationlogstore.Store
healthSnapshotStore *healthsnapshotstore.Store
streamOffsetStore *streamoffsets.Store
gameLeaseStore *gamelease.Store
// External adapters.
dockerAdapter *docker.Client
lobbyClient *lobbyclient.Client
notificationPublisher *notificationpublisher.Publisher
healthEventsPublisher *healtheventspublisher.Publisher
jobResultsPublisher *jobresultspublisher.Publisher
// Service layer.
startRuntimeService *startruntime.Service
stopRuntimeService *stopruntime.Service
restartRuntimeService *restartruntime.Service
patchRuntimeService *patchruntime.Service
cleanupContainerService *cleanupcontainer.Service
// Worker layer.
startJobsConsumer *startjobsconsumer.Consumer
stopJobsConsumer *stopjobsconsumer.Consumer
dockerEventsListener *dockerevents.Listener
healthProbeWorker *healthprobe.Worker
dockerInspectWorker *dockerinspect.Worker
reconciler *reconcile.Reconciler
containerCleanupWorker *containercleanup.Worker
// closers stores the functions that release adapter-level resources at runtime shutdown.
closers []func() error
}
// newWiring constructs the process-level dependency set: the persistence
// stores, the external adapters, the service layer, and the worker layer.
// It validates every required collaborator so callers can rely on them
// being non-nil.
func newWiring(
cfg config.Config,
redisClient *redis.Client,
pgPool *sql.DB,
dockerClient *dockerclient.Client,
clock func() time.Time,
logger *slog.Logger,
telemetryRuntime *telemetry.Runtime,
) (*wiring, error) {
if redisClient == nil {
return nil, errors.New("new rtmanager wiring: nil redis client")
}
if pgPool == nil {
return nil, errors.New("new rtmanager wiring: nil postgres pool")
}
if dockerClient == nil {
return nil, errors.New("new rtmanager wiring: nil docker client")
}
if clock == nil {
clock = time.Now
}
if logger == nil {
logger = slog.Default()
}
if telemetryRuntime == nil {
return nil, fmt.Errorf("new rtmanager wiring: nil telemetry runtime")
}
w := &wiring{
cfg: cfg,
redisClient: redisClient,
pgPool: pgPool,
dockerClient: dockerClient,
clock: clock,
logger: logger,
telemetry: telemetryRuntime,
}
if err := w.buildPersistence(); err != nil {
return nil, fmt.Errorf("new rtmanager wiring: %w", err)
}
if err := w.buildAdapters(); err != nil {
_ = w.close()
return nil, fmt.Errorf("new rtmanager wiring: %w", err)
}
if err := w.buildServices(); err != nil {
_ = w.close()
return nil, fmt.Errorf("new rtmanager wiring: %w", err)
}
if err := w.buildWorkers(); err != nil {
_ = w.close()
return nil, fmt.Errorf("new rtmanager wiring: %w", err)
}
return w, nil
}
func (w *wiring) buildPersistence() error {
runtimeStore, err := runtimerecordstore.New(runtimerecordstore.Config{
DB: w.pgPool,
OperationTimeout: w.cfg.Postgres.Conn.OperationTimeout,
})
if err != nil {
return fmt.Errorf("runtime record store: %w", err)
}
w.runtimeRecordStore = runtimeStore
operationStore, err := operationlogstore.New(operationlogstore.Config{
DB: w.pgPool,
OperationTimeout: w.cfg.Postgres.Conn.OperationTimeout,
})
if err != nil {
return fmt.Errorf("operation log store: %w", err)
}
w.operationLogStore = operationStore
snapshotStore, err := healthsnapshotstore.New(healthsnapshotstore.Config{
DB: w.pgPool,
OperationTimeout: w.cfg.Postgres.Conn.OperationTimeout,
})
if err != nil {
return fmt.Errorf("health snapshot store: %w", err)
}
w.healthSnapshotStore = snapshotStore
offsetStore, err := streamoffsets.New(streamoffsets.Config{Client: w.redisClient})
if err != nil {
return fmt.Errorf("stream offset store: %w", err)
}
w.streamOffsetStore = offsetStore
leaseStore, err := gamelease.New(gamelease.Config{Client: w.redisClient})
if err != nil {
return fmt.Errorf("game lease store: %w", err)
}
w.gameLeaseStore = leaseStore
return nil
}
func (w *wiring) buildAdapters() error {
dockerAdapter, err := docker.NewClient(docker.Config{
Docker: w.dockerClient,
LogDriver: w.cfg.Docker.LogDriver,
LogOpts: w.cfg.Docker.LogOpts,
Clock: w.clock,
})
if err != nil {
return fmt.Errorf("docker adapter: %w", err)
}
w.dockerAdapter = dockerAdapter
lobby, err := lobbyclient.NewClient(lobbyclient.Config{
BaseURL: w.cfg.Lobby.BaseURL,
RequestTimeout: w.cfg.Lobby.Timeout,
})
if err != nil {
return fmt.Errorf("lobby client: %w", err)
}
w.lobbyClient = lobby
w.closers = append(w.closers, lobby.Close)
notificationPub, err := notificationpublisher.NewPublisher(notificationpublisher.Config{
Client: w.redisClient,
Stream: w.cfg.Streams.NotificationIntents,
})
if err != nil {
return fmt.Errorf("notification publisher: %w", err)
}
w.notificationPublisher = notificationPub
healthPub, err := healtheventspublisher.NewPublisher(healtheventspublisher.Config{
Client: w.redisClient,
Snapshots: w.healthSnapshotStore,
Stream: w.cfg.Streams.HealthEvents,
})
if err != nil {
return fmt.Errorf("health events publisher: %w", err)
}
w.healthEventsPublisher = healthPub
jobResultsPub, err := jobresultspublisher.NewPublisher(jobresultspublisher.Config{
Client: w.redisClient,
Stream: w.cfg.Streams.JobResults,
})
if err != nil {
return fmt.Errorf("job results publisher: %w", err)
}
w.jobResultsPublisher = jobResultsPub
return nil
}
func (w *wiring) buildServices() error {
startService, err := startruntime.NewService(startruntime.Dependencies{
RuntimeRecords: w.runtimeRecordStore,
OperationLogs: w.operationLogStore,
Docker: w.dockerAdapter,
Leases: w.gameLeaseStore,
HealthEvents: w.healthEventsPublisher,
Notifications: w.notificationPublisher,
Lobby: w.lobbyClient,
Container: w.cfg.Container,
DockerCfg: w.cfg.Docker,
Coordination: w.cfg.Coordination,
Telemetry: w.telemetry,
Logger: w.logger,
Clock: w.clock,
})
if err != nil {
return fmt.Errorf("start runtime service: %w", err)
}
w.startRuntimeService = startService
stopService, err := stopruntime.NewService(stopruntime.Dependencies{
RuntimeRecords: w.runtimeRecordStore,
OperationLogs: w.operationLogStore,
Docker: w.dockerAdapter,
Leases: w.gameLeaseStore,
HealthEvents: w.healthEventsPublisher,
Container: w.cfg.Container,
Coordination: w.cfg.Coordination,
Telemetry: w.telemetry,
Logger: w.logger,
Clock: w.clock,
})
if err != nil {
return fmt.Errorf("stop runtime service: %w", err)
}
w.stopRuntimeService = stopService
restartService, err := restartruntime.NewService(restartruntime.Dependencies{
RuntimeRecords: w.runtimeRecordStore,
OperationLogs: w.operationLogStore,
Docker: w.dockerAdapter,
Leases: w.gameLeaseStore,
StopService: stopService,
StartService: startService,
Coordination: w.cfg.Coordination,
Telemetry: w.telemetry,
Logger: w.logger,
Clock: w.clock,
})
if err != nil {
return fmt.Errorf("restart runtime service: %w", err)
}
w.restartRuntimeService = restartService
patchService, err := patchruntime.NewService(patchruntime.Dependencies{
RuntimeRecords: w.runtimeRecordStore,
OperationLogs: w.operationLogStore,
Docker: w.dockerAdapter,
Leases: w.gameLeaseStore,
StopService: stopService,
StartService: startService,
Coordination: w.cfg.Coordination,
Telemetry: w.telemetry,
Logger: w.logger,
Clock: w.clock,
})
if err != nil {
return fmt.Errorf("patch runtime service: %w", err)
}
w.patchRuntimeService = patchService
cleanupService, err := cleanupcontainer.NewService(cleanupcontainer.Dependencies{
RuntimeRecords: w.runtimeRecordStore,
OperationLogs: w.operationLogStore,
Docker: w.dockerAdapter,
Leases: w.gameLeaseStore,
Coordination: w.cfg.Coordination,
Telemetry: w.telemetry,
Logger: w.logger,
Clock: w.clock,
})
if err != nil {
return fmt.Errorf("cleanup container service: %w", err)
}
w.cleanupContainerService = cleanupService
return nil
}
// buildWorkers constructs the asynchronous workers: the Lobby ↔ RTM
// stream consumers, the Docker events listener, the health probe and
// inspect workers, the reconciler, and the container cleanup worker.
// All of them participate in the process lifecycle as `app.Component`s;
// `internal/app/runtime.go` passes them into `app.New` alongside the
// internal HTTP server.
func (w *wiring) buildWorkers() error {
startConsumer, err := startjobsconsumer.NewConsumer(startjobsconsumer.Config{
Client: w.redisClient,
Stream: w.cfg.Streams.StartJobs,
BlockTimeout: w.cfg.Streams.BlockTimeout,
StartService: w.startRuntimeService,
JobResults: w.jobResultsPublisher,
OffsetStore: w.streamOffsetStore,
Logger: w.logger,
})
if err != nil {
return fmt.Errorf("start jobs consumer: %w", err)
}
w.startJobsConsumer = startConsumer
stopConsumer, err := stopjobsconsumer.NewConsumer(stopjobsconsumer.Config{
Client: w.redisClient,
Stream: w.cfg.Streams.StopJobs,
BlockTimeout: w.cfg.Streams.BlockTimeout,
StopService: w.stopRuntimeService,
JobResults: w.jobResultsPublisher,
OffsetStore: w.streamOffsetStore,
Logger: w.logger,
})
if err != nil {
return fmt.Errorf("stop jobs consumer: %w", err)
}
w.stopJobsConsumer = stopConsumer
eventsListener, err := dockerevents.NewListener(dockerevents.Dependencies{
Docker: w.dockerAdapter,
RuntimeRecords: w.runtimeRecordStore,
HealthEvents: w.healthEventsPublisher,
Telemetry: w.telemetry,
Clock: w.clock,
Logger: w.logger,
})
if err != nil {
return fmt.Errorf("docker events listener: %w", err)
}
w.dockerEventsListener = eventsListener
probeHTTPClient, err := newProbeHTTPClient(w.telemetry)
if err != nil {
return fmt.Errorf("health probe http client: %w", err)
}
probeWorker, err := healthprobe.NewWorker(healthprobe.Dependencies{
RuntimeRecords: w.runtimeRecordStore,
HealthEvents: w.healthEventsPublisher,
HTTPClient: probeHTTPClient,
Telemetry: w.telemetry,
Interval: w.cfg.Health.ProbeInterval,
ProbeTimeout: w.cfg.Health.ProbeTimeout,
FailuresThreshold: w.cfg.Health.ProbeFailuresThreshold,
Clock: w.clock,
Logger: w.logger,
})
if err != nil {
return fmt.Errorf("health probe worker: %w", err)
}
w.healthProbeWorker = probeWorker
inspectWorker, err := dockerinspect.NewWorker(dockerinspect.Dependencies{
Docker: w.dockerAdapter,
RuntimeRecords: w.runtimeRecordStore,
HealthEvents: w.healthEventsPublisher,
Telemetry: w.telemetry,
Interval: w.cfg.Health.InspectInterval,
Clock: w.clock,
Logger: w.logger,
})
if err != nil {
return fmt.Errorf("docker inspect worker: %w", err)
}
w.dockerInspectWorker = inspectWorker
reconciler, err := reconcile.NewReconciler(reconcile.Dependencies{
Docker: w.dockerAdapter,
RuntimeRecords: w.runtimeRecordStore,
OperationLogs: w.operationLogStore,
HealthEvents: w.healthEventsPublisher,
Leases: w.gameLeaseStore,
Telemetry: w.telemetry,
DockerCfg: w.cfg.Docker,
ContainerCfg: w.cfg.Container,
Coordination: w.cfg.Coordination,
Interval: w.cfg.Cleanup.ReconcileInterval,
Clock: w.clock,
Logger: w.logger,
})
if err != nil {
return fmt.Errorf("reconciler: %w", err)
}
w.reconciler = reconciler
cleanupWorker, err := containercleanup.NewWorker(containercleanup.Dependencies{
RuntimeRecords: w.runtimeRecordStore,
Cleanup: w.cleanupContainerService,
Retention: w.cfg.Container.Retention,
Interval: w.cfg.Cleanup.CleanupInterval,
Clock: w.clock,
Logger: w.logger,
})
if err != nil {
return fmt.Errorf("container cleanup worker: %w", err)
}
w.containerCleanupWorker = cleanupWorker
return nil
}
// newProbeHTTPClient constructs the otelhttp-instrumented HTTP client
// the active health probe uses to call engine `/healthz`. It clones
// http.DefaultTransport so this client's transport settings stay
// isolated from the shared default (mirrors the lobby internal client).
func newProbeHTTPClient(telemetryRuntime *telemetry.Runtime) (*http.Client, error) {
transport, ok := http.DefaultTransport.(*http.Transport)
if !ok {
return nil, errors.New("default http transport is not *http.Transport")
}
cloned := transport.Clone()
instrumented := otelhttp.NewTransport(cloned,
otelhttp.WithTracerProvider(telemetryRuntime.TracerProvider()),
otelhttp.WithMeterProvider(telemetryRuntime.MeterProvider()),
)
return &http.Client{Transport: instrumented}, nil
}
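// Illustrative sketch (not part of the wiring): the probe worker is
// expected to issue bounded GETs against the engine's /healthz with the
// instrumented client. The URL and timeout variable below are
// hypothetical placeholders.
//
//	probeCtx, cancel := context.WithTimeout(ctx, probeTimeout)
//	defer cancel()
//	req, err := http.NewRequestWithContext(probeCtx, http.MethodGet, "http://<container-addr>/healthz", nil)
//	if err == nil {
//		resp, err := client.Do(req)
//		// inspect resp.StatusCode / err and feed the failure counter
//		_ = resp
//		_ = err
//	}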
// registerTelemetryGauges installs the runtime-records-by-status gauge
// callback so the telemetry runtime can observe the persistent store
// without holding a strong reference to the wiring.
func (w *wiring) registerTelemetryGauges() error {
probe := newRuntimeRecordsProbe(w.runtimeRecordStore)
return w.telemetry.RegisterGauges(telemetry.GaugeDependencies{
RuntimeRecordsByStatus: probe,
Logger: w.logger,
})
}
// close releases adapter-level resources owned by the wiring layer.
// Returns the joined error of every closer; the caller is expected to
// invoke this once during process shutdown.
func (w *wiring) close() error {
var joined error
for index := len(w.closers) - 1; index >= 0; index-- {
if err := w.closers[index](); err != nil {
joined = errors.Join(joined, err)
}
}
w.closers = nil
return joined
}
// runtimeRecordsProbe adapts runtimerecordstore.Store to
// telemetry.RuntimeRecordsByStatusProbe by translating the typed status
// keys into the string keys the gauge expects.
type runtimeRecordsProbe struct {
store *runtimerecordstore.Store
}
func newRuntimeRecordsProbe(store *runtimerecordstore.Store) *runtimeRecordsProbe {
return &runtimeRecordsProbe{store: store}
}
func (p *runtimeRecordsProbe) CountByStatus(ctx context.Context) (map[string]int, error) {
if p == nil || p.store == nil {
return nil, errors.New("runtime records probe: nil store")
}
counts, err := p.store.CountByStatus(ctx)
if err != nil {
return nil, err
}
out := make(map[string]int, len(counts))
for status, count := range counts {
out[string(status)] = count
}
return out, nil
}
// Compile-time assertions that the constructed adapters satisfy the
// expected port surfaces; these prevent silent regressions when a
// port shape changes.
var (
_ ports.RuntimeRecordStore = (*runtimerecordstore.Store)(nil)
_ ports.OperationLogStore = (*operationlogstore.Store)(nil)
_ ports.HealthSnapshotStore = (*healthsnapshotstore.Store)(nil)
_ ports.StreamOffsetStore = (*streamoffsets.Store)(nil)
_ ports.GameLeaseStore = (*gamelease.Store)(nil)
_ ports.DockerClient = (*docker.Client)(nil)
_ ports.LobbyInternalClient = (*lobbyclient.Client)(nil)
_ ports.NotificationIntentPublisher = (*notificationpublisher.Publisher)(nil)
_ ports.HealthEventPublisher = (*healtheventspublisher.Publisher)(nil)
_ ports.JobResultPublisher = (*jobresultspublisher.Publisher)(nil)
_ Component = (*reconcile.Reconciler)(nil)
_ Component = (*containercleanup.Worker)(nil)
_ containercleanup.Cleaner = (*cleanupcontainer.Service)(nil)
)
@@ -0,0 +1,632 @@
// Package config loads the Runtime Manager process configuration from
// environment variables.
package config
import (
"fmt"
"strings"
"time"
"galaxy/postgres"
"galaxy/redisconn"
"galaxy/rtmanager/internal/telemetry"
)
const (
envPrefix = "RTMANAGER"
shutdownTimeoutEnvVar = "RTMANAGER_SHUTDOWN_TIMEOUT"
logLevelEnvVar = "RTMANAGER_LOG_LEVEL"
internalHTTPAddrEnvVar = "RTMANAGER_INTERNAL_HTTP_ADDR"
internalHTTPReadHeaderTimeoutEnvVar = "RTMANAGER_INTERNAL_HTTP_READ_HEADER_TIMEOUT"
internalHTTPReadTimeoutEnvVar = "RTMANAGER_INTERNAL_HTTP_READ_TIMEOUT"
internalHTTPWriteTimeoutEnvVar = "RTMANAGER_INTERNAL_HTTP_WRITE_TIMEOUT"
internalHTTPIdleTimeoutEnvVar = "RTMANAGER_INTERNAL_HTTP_IDLE_TIMEOUT"
dockerHostEnvVar = "RTMANAGER_DOCKER_HOST"
dockerAPIVersionEnvVar = "RTMANAGER_DOCKER_API_VERSION"
dockerNetworkEnvVar = "RTMANAGER_DOCKER_NETWORK"
dockerLogDriverEnvVar = "RTMANAGER_DOCKER_LOG_DRIVER"
dockerLogOptsEnvVar = "RTMANAGER_DOCKER_LOG_OPTS"
imagePullPolicyEnvVar = "RTMANAGER_IMAGE_PULL_POLICY"
defaultCPUQuotaEnvVar = "RTMANAGER_DEFAULT_CPU_QUOTA"
defaultMemoryEnvVar = "RTMANAGER_DEFAULT_MEMORY"
defaultPIDsLimitEnvVar = "RTMANAGER_DEFAULT_PIDS_LIMIT"
containerStopTimeoutSecondsEnvVar = "RTMANAGER_CONTAINER_STOP_TIMEOUT_SECONDS"
containerRetentionDaysEnvVar = "RTMANAGER_CONTAINER_RETENTION_DAYS"
engineStateMountPathEnvVar = "RTMANAGER_ENGINE_STATE_MOUNT_PATH"
engineStateEnvNameEnvVar = "RTMANAGER_ENGINE_STATE_ENV_NAME"
gameStateDirModeEnvVar = "RTMANAGER_GAME_STATE_DIR_MODE"
gameStateOwnerUIDEnvVar = "RTMANAGER_GAME_STATE_OWNER_UID"
gameStateOwnerGIDEnvVar = "RTMANAGER_GAME_STATE_OWNER_GID"
gameStateRootEnvVar = "RTMANAGER_GAME_STATE_ROOT"
startJobsStreamEnvVar = "RTMANAGER_REDIS_START_JOBS_STREAM"
stopJobsStreamEnvVar = "RTMANAGER_REDIS_STOP_JOBS_STREAM"
jobResultsStreamEnvVar = "RTMANAGER_REDIS_JOB_RESULTS_STREAM"
healthEventsStreamEnvVar = "RTMANAGER_REDIS_HEALTH_EVENTS_STREAM"
notificationIntentsStreamEnv = "RTMANAGER_NOTIFICATION_INTENTS_STREAM"
streamBlockTimeoutEnvVar = "RTMANAGER_STREAM_BLOCK_TIMEOUT"
inspectIntervalEnvVar = "RTMANAGER_INSPECT_INTERVAL"
probeIntervalEnvVar = "RTMANAGER_PROBE_INTERVAL"
probeTimeoutEnvVar = "RTMANAGER_PROBE_TIMEOUT"
probeFailuresThresholdEnvVar = "RTMANAGER_PROBE_FAILURES_THRESHOLD"
reconcileIntervalEnvVar = "RTMANAGER_RECONCILE_INTERVAL"
cleanupIntervalEnvVar = "RTMANAGER_CLEANUP_INTERVAL"
gameLeaseTTLSecondsEnvVar = "RTMANAGER_GAME_LEASE_TTL_SECONDS"
lobbyInternalBaseURLEnvVar = "RTMANAGER_LOBBY_INTERNAL_BASE_URL"
lobbyInternalTimeoutEnvVar = "RTMANAGER_LOBBY_INTERNAL_TIMEOUT"
otelServiceNameEnvVar = "OTEL_SERVICE_NAME"
otelTracesExporterEnvVar = "OTEL_TRACES_EXPORTER"
otelMetricsExporterEnvVar = "OTEL_METRICS_EXPORTER"
otelExporterOTLPProtocolEnvVar = "OTEL_EXPORTER_OTLP_PROTOCOL"
otelExporterOTLPTracesProtocolEnvVar = "OTEL_EXPORTER_OTLP_TRACES_PROTOCOL"
otelExporterOTLPMetricsProtocolEnvVar = "OTEL_EXPORTER_OTLP_METRICS_PROTOCOL"
otelStdoutTracesEnabledEnvVar = "RTMANAGER_OTEL_STDOUT_TRACES_ENABLED"
otelStdoutMetricsEnabledEnvVar = "RTMANAGER_OTEL_STDOUT_METRICS_ENABLED"
defaultShutdownTimeout = 30 * time.Second
defaultLogLevel = "info"
defaultInternalHTTPAddr = ":8096"
defaultReadHeaderTimeout = 2 * time.Second
defaultReadTimeout = 5 * time.Second
defaultWriteTimeout = 15 * time.Second
defaultIdleTimeout = 60 * time.Second
defaultDockerHost = "unix:///var/run/docker.sock"
defaultDockerNetwork = "galaxy-net"
defaultDockerLogDriver = "json-file"
defaultImagePullPolicy = ImagePullPolicyIfMissing
defaultCPUQuota = 1.0
defaultMemory = "512m"
defaultPIDsLimit = 512
defaultContainerStopTimeout = 30 * time.Second
defaultContainerRetention = 30 * 24 * time.Hour
defaultEngineStateMountPath = "/var/lib/galaxy-game"
defaultEngineStateEnvName = "GAME_STATE_PATH"
defaultGameStateDirMode = 0o750
defaultStartJobsStream = "runtime:start_jobs"
defaultStopJobsStream = "runtime:stop_jobs"
defaultJobResultsStream = "runtime:job_results"
defaultHealthEventsStream = "runtime:health_events"
defaultNotificationIntentsKey = "notification:intents"
defaultStreamBlockTimeout = 5 * time.Second
defaultInspectInterval = 30 * time.Second
defaultProbeInterval = 15 * time.Second
defaultProbeTimeout = 2 * time.Second
defaultProbeFailuresThreshold = 3
defaultReconcileInterval = 5 * time.Minute
defaultCleanupInterval = time.Hour
defaultGameLeaseTTL = 60 * time.Second
defaultLobbyInternalTimeout = 2 * time.Second
defaultOTelServiceName = "galaxy-rtmanager"
)
// ImagePullPolicy enumerates the supported image pull policies. The start
// service validates a producer-supplied `image_ref` against this policy at
// start time.
type ImagePullPolicy string
// Supported pull policies, frozen by `rtmanager/README.md` §Configuration.
const (
ImagePullPolicyIfMissing ImagePullPolicy = "if_missing"
ImagePullPolicyAlways ImagePullPolicy = "always"
ImagePullPolicyNever ImagePullPolicy = "never"
)
// Validate reports whether p is one of the frozen pull policies.
func (p ImagePullPolicy) Validate() error {
switch p {
case ImagePullPolicyIfMissing, ImagePullPolicyAlways, ImagePullPolicyNever:
return nil
default:
return fmt.Errorf("image pull policy %q must be one of %q, %q, %q",
p, ImagePullPolicyIfMissing, ImagePullPolicyAlways, ImagePullPolicyNever)
}
}
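// Illustrative sketch (not taken from the start service): a caller that
// has already resolved whether the image is present locally could branch
// on the policy like this; imageIsLocal and pullImage are hypothetical.
//
//	switch cfg.PullPolicy {
//	case ImagePullPolicyAlways:
//		err = pullImage(ctx, imageRef)
//	case ImagePullPolicyIfMissing:
//		if !imageIsLocal {
//			err = pullImage(ctx, imageRef)
//		}
//	case ImagePullPolicyNever:
//		if !imageIsLocal {
//			err = fmt.Errorf("image %q not present and pull policy is %q", imageRef, ImagePullPolicyNever)
//		}
//	}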
// Config stores the full Runtime Manager process configuration.
type Config struct {
// ShutdownTimeout bounds graceful shutdown of every long-lived
// component.
ShutdownTimeout time.Duration
// Logging configures the process-wide structured logger.
Logging LoggingConfig
// InternalHTTP configures the trusted internal HTTP listener that
// serves probes and the GM/Admin REST surface.
InternalHTTP InternalHTTPConfig
// Docker configures the Docker SDK client RTM uses to drive the local
// Docker daemon.
Docker DockerConfig
// Postgres configures the PostgreSQL-backed durable store consumed via
// `pkg/postgres`.
Postgres PostgresConfig
// Redis configures the shared Redis connection topology consumed via
// `pkg/redisconn`.
Redis RedisConfig
// Streams stores the stable Redis Stream names RTM reads from and
// writes to.
Streams StreamsConfig
// Container stores the per-container defaults applied at start time
// when the resolved image does not declare its own labels.
Container ContainerConfig
// Health configures the periodic health-monitoring workers (events
// listener, inspect, active probe).
Health HealthConfig
// Cleanup configures the reconciler and container-cleanup workers.
Cleanup CleanupConfig
// Coordination configures the per-game Redis lease used to serialise
// operations across all entry points.
Coordination CoordinationConfig
// Lobby configures the synchronous Lobby internal REST client used by
// the start service for ancillary lookups.
Lobby LobbyConfig
// Telemetry configures the process-wide OpenTelemetry runtime.
Telemetry TelemetryConfig
}
// LoggingConfig configures the process-wide structured logger.
type LoggingConfig struct {
// Level stores the process log level accepted by log/slog.
Level string
}
// InternalHTTPConfig configures the trusted internal HTTP listener.
type InternalHTTPConfig struct {
// Addr stores the TCP listen address.
Addr string
// ReadHeaderTimeout bounds request-header reading.
ReadHeaderTimeout time.Duration
// ReadTimeout bounds reading one request.
ReadTimeout time.Duration
// WriteTimeout bounds writing one response.
WriteTimeout time.Duration
// IdleTimeout bounds how long keep-alive connections stay open.
IdleTimeout time.Duration
}
// Validate reports whether cfg stores a usable internal HTTP listener
// configuration.
func (cfg InternalHTTPConfig) Validate() error {
switch {
case strings.TrimSpace(cfg.Addr) == "":
return fmt.Errorf("internal HTTP addr must not be empty")
case !isTCPAddr(cfg.Addr):
return fmt.Errorf("internal HTTP addr %q must use host:port form", cfg.Addr)
case cfg.ReadHeaderTimeout <= 0:
return fmt.Errorf("internal HTTP read header timeout must be positive")
case cfg.ReadTimeout <= 0:
return fmt.Errorf("internal HTTP read timeout must be positive")
case cfg.WriteTimeout <= 0:
return fmt.Errorf("internal HTTP write timeout must be positive")
case cfg.IdleTimeout <= 0:
return fmt.Errorf("internal HTTP idle timeout must be positive")
default:
return nil
}
}
// DockerConfig configures the Docker SDK client.
type DockerConfig struct {
// Host stores the Docker daemon endpoint (e.g.
// `unix:///var/run/docker.sock`).
Host string
// APIVersion overrides the Docker API version. Empty lets the SDK
// negotiate.
APIVersion string
// Network stores the user-defined Docker bridge network containers
// attach to. Provisioned outside RTM; missing network is a fail-fast
// condition at startup.
Network string
// LogDriver stores the Docker logging driver applied to engine
// containers.
LogDriver string
// LogOpts stores the comma-separated `key=value` driver options.
LogOpts string
// PullPolicy stores the configured image pull policy.
PullPolicy ImagePullPolicy
}
// Validate reports whether cfg stores a usable Docker configuration.
func (cfg DockerConfig) Validate() error {
switch {
case strings.TrimSpace(cfg.Host) == "":
return fmt.Errorf("docker host must not be empty")
case strings.TrimSpace(cfg.Network) == "":
return fmt.Errorf("docker network must not be empty")
case strings.TrimSpace(cfg.LogDriver) == "":
return fmt.Errorf("docker log driver must not be empty")
}
return cfg.PullPolicy.Validate()
}
// PostgresConfig configures the PostgreSQL-backed durable store consumed
// via `pkg/postgres`.
type PostgresConfig struct {
// Conn carries the primary plus replica DSN topology and pool tuning.
Conn postgres.Config
}
// Validate reports whether cfg stores a usable PostgreSQL configuration.
func (cfg PostgresConfig) Validate() error {
return cfg.Conn.Validate()
}
// RedisConfig configures the Runtime Manager Redis connection topology.
type RedisConfig struct {
// Conn carries the connection topology (master, replicas, password,
// db, per-call timeout).
Conn redisconn.Config
}
// Validate reports whether cfg stores a usable Redis configuration.
func (cfg RedisConfig) Validate() error {
return cfg.Conn.Validate()
}
// StreamsConfig stores the stable Redis Stream names used by Runtime
// Manager.
type StreamsConfig struct {
// StartJobs stores the Redis Streams key Lobby writes start jobs to.
StartJobs string
// StopJobs stores the Redis Streams key Lobby writes stop jobs to.
StopJobs string
// JobResults stores the Redis Streams key RTM writes job outcomes
// to.
JobResults string
// HealthEvents stores the Redis Streams key RTM publishes
// technical health events to.
HealthEvents string
// NotificationIntents stores the Redis Streams key RTM publishes
// admin-only notification intents to.
NotificationIntents string
// BlockTimeout bounds the maximum blocking read window for stream
// consumers.
BlockTimeout time.Duration
}
// Validate reports whether cfg stores usable stream names.
func (cfg StreamsConfig) Validate() error {
switch {
case strings.TrimSpace(cfg.StartJobs) == "":
return fmt.Errorf("redis start jobs stream must not be empty")
case strings.TrimSpace(cfg.StopJobs) == "":
return fmt.Errorf("redis stop jobs stream must not be empty")
case strings.TrimSpace(cfg.JobResults) == "":
return fmt.Errorf("redis job results stream must not be empty")
case strings.TrimSpace(cfg.HealthEvents) == "":
return fmt.Errorf("redis health events stream must not be empty")
case strings.TrimSpace(cfg.NotificationIntents) == "":
return fmt.Errorf("redis notification intents stream must not be empty")
case cfg.BlockTimeout <= 0:
return fmt.Errorf("redis stream block timeout must be positive")
default:
return nil
}
}
// ContainerConfig stores the per-container defaults applied at start
// time. Resource defaults apply when the resolved engine image does not
// expose `com.galaxy.cpu_quota` / `com.galaxy.memory` /
// `com.galaxy.pids_limit` labels.
type ContainerConfig struct {
// DefaultCPUQuota is the fallback `--cpus` value applied when the
// image does not declare `com.galaxy.cpu_quota`.
DefaultCPUQuota float64
// DefaultMemory is the fallback `--memory` value applied when the
// image does not declare `com.galaxy.memory`.
DefaultMemory string
// DefaultPIDsLimit is the fallback `--pids-limit` value applied
// when the image does not declare `com.galaxy.pids_limit`.
DefaultPIDsLimit int
// StopTimeout bounds graceful container stop before Docker fires
// SIGKILL.
StopTimeout time.Duration
// Retention stores the TTL after which `status=stopped` containers
// are removed by the cleanup worker.
Retention time.Duration
// EngineStateMountPath is the in-container path the per-game state
// directory is bind-mounted to.
EngineStateMountPath string
// EngineStateEnvName is the env-var name forwarded to the engine
// pointing at EngineStateMountPath.
EngineStateEnvName string
// GameStateDirMode stores the unix permissions applied to the
// per-game state directory on creation.
GameStateDirMode uint32
// GameStateOwnerUID stores the unix uid applied to the per-game
// state directory on creation.
GameStateOwnerUID int
// GameStateOwnerGID stores the unix gid applied to the per-game
// state directory on creation.
GameStateOwnerGID int
// GameStateRoot is the host path under which per-game state
// directories are created.
GameStateRoot string
}
// Validate reports whether cfg stores usable container defaults.
func (cfg ContainerConfig) Validate() error {
switch {
case cfg.DefaultCPUQuota <= 0:
return fmt.Errorf("default cpu quota must be positive")
case strings.TrimSpace(cfg.DefaultMemory) == "":
return fmt.Errorf("default memory must not be empty")
case cfg.DefaultPIDsLimit <= 0:
return fmt.Errorf("default pids limit must be positive")
case cfg.StopTimeout <= 0:
return fmt.Errorf("container stop timeout must be positive")
case cfg.Retention <= 0:
return fmt.Errorf("container retention must be positive")
case strings.TrimSpace(cfg.EngineStateMountPath) == "":
return fmt.Errorf("engine state mount path must not be empty")
case strings.TrimSpace(cfg.EngineStateEnvName) == "":
return fmt.Errorf("engine state env name must not be empty")
case cfg.GameStateDirMode == 0:
return fmt.Errorf("game state dir mode must be non-zero")
case strings.TrimSpace(cfg.GameStateRoot) == "":
return fmt.Errorf("game state root must not be empty")
case !strings.HasPrefix(strings.TrimSpace(cfg.GameStateRoot), "/"):
return fmt.Errorf("game state root %q must be an absolute path", cfg.GameStateRoot)
default:
return nil
}
}
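// Illustrative sketch (the real resolution lives in the start service):
// per-image labels win over the configured defaults. The imageLabels map
// and the parsing shown here are hypothetical.
//
//	cpu := cfg.DefaultCPUQuota
//	if raw, ok := imageLabels["com.galaxy.cpu_quota"]; ok {
//		if parsed, err := strconv.ParseFloat(raw, 64); err == nil && parsed > 0 {
//			cpu = parsed
//		}
//	}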
// HealthConfig configures the periodic health-monitoring workers
// (Docker events listener, periodic inspect, active probe).
type HealthConfig struct {
// InspectInterval is the period between two periodic Docker inspect
// passes.
InspectInterval time.Duration
// ProbeInterval is the period between two engine `/healthz` probe
// rounds.
ProbeInterval time.Duration
// ProbeTimeout bounds one engine `/healthz` request.
ProbeTimeout time.Duration
// ProbeFailuresThreshold is the consecutive-failure count that
// triggers a `probe_failed` event.
ProbeFailuresThreshold int
}
// Validate reports whether cfg stores usable health-monitoring settings.
func (cfg HealthConfig) Validate() error {
switch {
case cfg.InspectInterval <= 0:
return fmt.Errorf("inspect interval must be positive")
case cfg.ProbeInterval <= 0:
return fmt.Errorf("probe interval must be positive")
case cfg.ProbeTimeout <= 0:
return fmt.Errorf("probe timeout must be positive")
case cfg.ProbeFailuresThreshold <= 0:
return fmt.Errorf("probe failures threshold must be positive")
default:
return nil
}
}
// CleanupConfig configures the reconciler and container-cleanup workers.
type CleanupConfig struct {
// ReconcileInterval is the period between two reconciler passes.
ReconcileInterval time.Duration
// CleanupInterval is the period between two container-cleanup
// passes.
CleanupInterval time.Duration
}
// Validate reports whether cfg stores usable cleanup settings.
func (cfg CleanupConfig) Validate() error {
switch {
case cfg.ReconcileInterval <= 0:
return fmt.Errorf("reconcile interval must be positive")
case cfg.CleanupInterval <= 0:
return fmt.Errorf("cleanup interval must be positive")
default:
return nil
}
}
// CoordinationConfig configures the per-game Redis lease.
type CoordinationConfig struct {
// GameLeaseTTL bounds the per-game lease lifetime renewed every
// half-TTL while an operation runs.
GameLeaseTTL time.Duration
}
// Validate reports whether cfg stores a usable lease configuration.
func (cfg CoordinationConfig) Validate() error {
if cfg.GameLeaseTTL <= 0 {
return fmt.Errorf("game lease ttl must be positive")
}
return nil
}
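// Illustrative sketch (not the adapter implementation): while an
// operation runs, the lease holder renews at half-TTL. The Renew call
// and loop shape below are hypothetical.
//
//	ticker := time.NewTicker(cfg.GameLeaseTTL / 2)
//	defer ticker.Stop()
//	for {
//		select {
//		case <-ctx.Done():
//			return
//		case <-ticker.C:
//			_ = leases.Renew(ctx, gameID, cfg.GameLeaseTTL)
//		}
//	}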
// LobbyConfig configures the synchronous Lobby internal REST client.
type LobbyConfig struct {
// BaseURL stores the trusted Lobby internal listener base URL.
BaseURL string
// Timeout bounds one Lobby internal request.
Timeout time.Duration
}
// Validate reports whether cfg stores a usable Lobby client
// configuration.
func (cfg LobbyConfig) Validate() error {
switch {
case strings.TrimSpace(cfg.BaseURL) == "":
return fmt.Errorf("lobby internal base url must not be empty")
case !isHTTPURL(cfg.BaseURL):
return fmt.Errorf("lobby internal base url %q must be an absolute http(s) URL", cfg.BaseURL)
case cfg.Timeout <= 0:
return fmt.Errorf("lobby internal timeout must be positive")
default:
return nil
}
}
// TelemetryConfig configures the Runtime Manager OpenTelemetry runtime.
type TelemetryConfig struct {
// ServiceName overrides the default OpenTelemetry service name.
ServiceName string
// TracesExporter selects the external traces exporter. Supported
// values are `none` and `otlp`.
TracesExporter string
// MetricsExporter selects the external metrics exporter. Supported
// values are `none` and `otlp`.
MetricsExporter string
// TracesProtocol selects the OTLP traces protocol when
// TracesExporter is `otlp`.
TracesProtocol string
// MetricsProtocol selects the OTLP metrics protocol when
// MetricsExporter is `otlp`.
MetricsProtocol string
// StdoutTracesEnabled enables the additional stdout trace exporter
// used for local development and debugging.
StdoutTracesEnabled bool
// StdoutMetricsEnabled enables the additional stdout metric
// exporter used for local development and debugging.
StdoutMetricsEnabled bool
}
// Validate reports whether cfg contains a supported OpenTelemetry
// configuration.
func (cfg TelemetryConfig) Validate() error {
return telemetry.ProcessConfig{
ServiceName: cfg.ServiceName,
TracesExporter: cfg.TracesExporter,
MetricsExporter: cfg.MetricsExporter,
TracesProtocol: cfg.TracesProtocol,
MetricsProtocol: cfg.MetricsProtocol,
StdoutTracesEnabled: cfg.StdoutTracesEnabled,
StdoutMetricsEnabled: cfg.StdoutMetricsEnabled,
}.Validate()
}
// DefaultConfig returns the default Runtime Manager process configuration.
func DefaultConfig() Config {
return Config{
ShutdownTimeout: defaultShutdownTimeout,
Logging: LoggingConfig{
Level: defaultLogLevel,
},
InternalHTTP: InternalHTTPConfig{
Addr: defaultInternalHTTPAddr,
ReadHeaderTimeout: defaultReadHeaderTimeout,
ReadTimeout: defaultReadTimeout,
WriteTimeout: defaultWriteTimeout,
IdleTimeout: defaultIdleTimeout,
},
Docker: DockerConfig{
Host: defaultDockerHost,
Network: defaultDockerNetwork,
LogDriver: defaultDockerLogDriver,
PullPolicy: defaultImagePullPolicy,
},
Postgres: PostgresConfig{
Conn: postgres.DefaultConfig(),
},
Redis: RedisConfig{
Conn: redisconn.DefaultConfig(),
},
Streams: StreamsConfig{
StartJobs: defaultStartJobsStream,
StopJobs: defaultStopJobsStream,
JobResults: defaultJobResultsStream,
HealthEvents: defaultHealthEventsStream,
NotificationIntents: defaultNotificationIntentsKey,
BlockTimeout: defaultStreamBlockTimeout,
},
Container: ContainerConfig{
DefaultCPUQuota: defaultCPUQuota,
DefaultMemory: defaultMemory,
DefaultPIDsLimit: defaultPIDsLimit,
StopTimeout: defaultContainerStopTimeout,
Retention: defaultContainerRetention,
EngineStateMountPath: defaultEngineStateMountPath,
EngineStateEnvName: defaultEngineStateEnvName,
GameStateDirMode: defaultGameStateDirMode,
},
Health: HealthConfig{
InspectInterval: defaultInspectInterval,
ProbeInterval: defaultProbeInterval,
ProbeTimeout: defaultProbeTimeout,
ProbeFailuresThreshold: defaultProbeFailuresThreshold,
},
Cleanup: CleanupConfig{
ReconcileInterval: defaultReconcileInterval,
CleanupInterval: defaultCleanupInterval,
},
Coordination: CoordinationConfig{
GameLeaseTTL: defaultGameLeaseTTL,
},
Lobby: LobbyConfig{
Timeout: defaultLobbyInternalTimeout,
},
Telemetry: TelemetryConfig{
ServiceName: defaultOTelServiceName,
TracesExporter: "none",
MetricsExporter: "none",
},
}
}
@@ -0,0 +1,142 @@
package config
import (
"strings"
"testing"
"time"
"github.com/stretchr/testify/require"
)
func validEnv(t *testing.T) {
t.Helper()
t.Setenv("RTMANAGER_POSTGRES_PRIMARY_DSN", "postgres://rtm:secret@localhost:5432/galaxy?search_path=rtmanager&sslmode=disable")
t.Setenv("RTMANAGER_REDIS_MASTER_ADDR", "localhost:6379")
t.Setenv("RTMANAGER_REDIS_PASSWORD", "secret")
t.Setenv("RTMANAGER_GAME_STATE_ROOT", "/var/lib/galaxy/games")
t.Setenv("RTMANAGER_LOBBY_INTERNAL_BASE_URL", "http://lobby:8095")
}
func TestLoadFromEnvAcceptsDefaults(t *testing.T) {
validEnv(t)
cfg, err := LoadFromEnv()
require.NoError(t, err)
require.Equal(t, ":8096", cfg.InternalHTTP.Addr)
require.Equal(t, "unix:///var/run/docker.sock", cfg.Docker.Host)
require.Equal(t, "galaxy-net", cfg.Docker.Network)
require.Equal(t, "json-file", cfg.Docker.LogDriver)
require.Equal(t, ImagePullPolicyIfMissing, cfg.Docker.PullPolicy)
require.Equal(t, "runtime:start_jobs", cfg.Streams.StartJobs)
require.Equal(t, "runtime:stop_jobs", cfg.Streams.StopJobs)
require.Equal(t, "runtime:job_results", cfg.Streams.JobResults)
require.Equal(t, "runtime:health_events", cfg.Streams.HealthEvents)
require.Equal(t, "notification:intents", cfg.Streams.NotificationIntents)
require.Equal(t, 30*time.Second, cfg.Container.StopTimeout)
require.Equal(t, 30*24*time.Hour, cfg.Container.Retention)
require.Equal(t, "/var/lib/galaxy-game", cfg.Container.EngineStateMountPath)
require.Equal(t, "GAME_STATE_PATH", cfg.Container.EngineStateEnvName)
require.EqualValues(t, 0o750, cfg.Container.GameStateDirMode)
require.Equal(t, 60*time.Second, cfg.Coordination.GameLeaseTTL)
require.Equal(t, "http://lobby:8095", cfg.Lobby.BaseURL)
require.Equal(t, 2*time.Second, cfg.Lobby.Timeout)
require.Equal(t, "galaxy-rtmanager", cfg.Telemetry.ServiceName)
}
func TestLoadFromEnvHonoursOverrides(t *testing.T) {
validEnv(t)
t.Setenv("RTMANAGER_INTERNAL_HTTP_ADDR", ":9000")
t.Setenv("RTMANAGER_DOCKER_NETWORK", "custom-net")
t.Setenv("RTMANAGER_IMAGE_PULL_POLICY", "always")
t.Setenv("RTMANAGER_REDIS_START_JOBS_STREAM", "custom:start_jobs")
t.Setenv("RTMANAGER_GAME_LEASE_TTL_SECONDS", "120")
t.Setenv("RTMANAGER_CONTAINER_STOP_TIMEOUT_SECONDS", "45")
t.Setenv("RTMANAGER_CONTAINER_RETENTION_DAYS", "7")
t.Setenv("RTMANAGER_GAME_STATE_DIR_MODE", "0700")
cfg, err := LoadFromEnv()
require.NoError(t, err)
require.Equal(t, ":9000", cfg.InternalHTTP.Addr)
require.Equal(t, "custom-net", cfg.Docker.Network)
require.Equal(t, ImagePullPolicyAlways, cfg.Docker.PullPolicy)
require.Equal(t, "custom:start_jobs", cfg.Streams.StartJobs)
require.Equal(t, 120*time.Second, cfg.Coordination.GameLeaseTTL)
require.Equal(t, 45*time.Second, cfg.Container.StopTimeout)
require.Equal(t, 7*24*time.Hour, cfg.Container.Retention)
require.EqualValues(t, 0o700, cfg.Container.GameStateDirMode)
}
func TestLoadFromEnvRejectsUnknownPullPolicy(t *testing.T) {
validEnv(t)
t.Setenv("RTMANAGER_IMAGE_PULL_POLICY", "weekly")
_, err := LoadFromEnv()
require.Error(t, err)
require.Contains(t, err.Error(), "image pull policy")
}
func TestLoadFromEnvRequiresGameStateRoot(t *testing.T) {
t.Setenv("RTMANAGER_POSTGRES_PRIMARY_DSN", "postgres://rtm:secret@localhost:5432/galaxy")
t.Setenv("RTMANAGER_REDIS_MASTER_ADDR", "localhost:6379")
t.Setenv("RTMANAGER_REDIS_PASSWORD", "secret")
t.Setenv("RTMANAGER_LOBBY_INTERNAL_BASE_URL", "http://lobby:8095")
_, err := LoadFromEnv()
require.Error(t, err)
require.Contains(t, err.Error(), "RTMANAGER_GAME_STATE_ROOT")
}
func TestLoadFromEnvRequiresLobbyBaseURL(t *testing.T) {
t.Setenv("RTMANAGER_POSTGRES_PRIMARY_DSN", "postgres://rtm:secret@localhost:5432/galaxy")
t.Setenv("RTMANAGER_REDIS_MASTER_ADDR", "localhost:6379")
t.Setenv("RTMANAGER_REDIS_PASSWORD", "secret")
t.Setenv("RTMANAGER_GAME_STATE_ROOT", "/var/lib/galaxy/games")
_, err := LoadFromEnv()
require.Error(t, err)
require.Contains(t, err.Error(), "RTMANAGER_LOBBY_INTERNAL_BASE_URL")
}
func TestLoadFromEnvRejectsRelativeStateRoot(t *testing.T) {
validEnv(t)
t.Setenv("RTMANAGER_GAME_STATE_ROOT", "relative/path")
_, err := LoadFromEnv()
require.Error(t, err)
require.Contains(t, err.Error(), "absolute path")
}
func TestLoadFromEnvRejectsBadLogLevel(t *testing.T) {
validEnv(t)
t.Setenv("RTMANAGER_LOG_LEVEL", "verbose")
_, err := LoadFromEnv()
require.Error(t, err)
require.Contains(t, err.Error(), "RTMANAGER_LOG_LEVEL")
}
func TestImagePullPolicyValidate(t *testing.T) {
require.NoError(t, ImagePullPolicyIfMissing.Validate())
require.NoError(t, ImagePullPolicyAlways.Validate())
require.NoError(t, ImagePullPolicyNever.Validate())
require.Error(t, ImagePullPolicy("monthly").Validate())
}
func TestInternalHTTPValidateRejectsBadAddr(t *testing.T) {
cfg := DefaultConfig().InternalHTTP
cfg.Addr = "not-an-addr"
err := cfg.Validate()
require.Error(t, err)
require.Contains(t, err.Error(), "host:port")
}
func TestStreamsValidateRequiresAllNames(t *testing.T) {
cfg := DefaultConfig().Streams
cfg.StartJobs = " "
err := cfg.Validate()
require.Error(t, err)
require.True(t, strings.Contains(err.Error(), "start jobs"))
}
@@ -0,0 +1,319 @@
package config
import (
"fmt"
"os"
"strconv"
"strings"
"time"
"galaxy/postgres"
"galaxy/redisconn"
)
// LoadFromEnv builds Config from environment variables and validates the
// resulting configuration.
func LoadFromEnv() (Config, error) {
cfg := DefaultConfig()
var err error
cfg.ShutdownTimeout, err = durationEnv(shutdownTimeoutEnvVar, cfg.ShutdownTimeout)
if err != nil {
return Config{}, err
}
cfg.Logging.Level = stringEnv(logLevelEnvVar, cfg.Logging.Level)
cfg.InternalHTTP.Addr = stringEnv(internalHTTPAddrEnvVar, cfg.InternalHTTP.Addr)
cfg.InternalHTTP.ReadHeaderTimeout, err = durationEnv(internalHTTPReadHeaderTimeoutEnvVar, cfg.InternalHTTP.ReadHeaderTimeout)
if err != nil {
return Config{}, err
}
cfg.InternalHTTP.ReadTimeout, err = durationEnv(internalHTTPReadTimeoutEnvVar, cfg.InternalHTTP.ReadTimeout)
if err != nil {
return Config{}, err
}
cfg.InternalHTTP.WriteTimeout, err = durationEnv(internalHTTPWriteTimeoutEnvVar, cfg.InternalHTTP.WriteTimeout)
if err != nil {
return Config{}, err
}
cfg.InternalHTTP.IdleTimeout, err = durationEnv(internalHTTPIdleTimeoutEnvVar, cfg.InternalHTTP.IdleTimeout)
if err != nil {
return Config{}, err
}
cfg.Docker.Host = stringEnv(dockerHostEnvVar, cfg.Docker.Host)
cfg.Docker.APIVersion = stringEnv(dockerAPIVersionEnvVar, cfg.Docker.APIVersion)
cfg.Docker.Network = stringEnv(dockerNetworkEnvVar, cfg.Docker.Network)
cfg.Docker.LogDriver = stringEnv(dockerLogDriverEnvVar, cfg.Docker.LogDriver)
cfg.Docker.LogOpts = stringEnv(dockerLogOptsEnvVar, cfg.Docker.LogOpts)
if raw, ok := os.LookupEnv(imagePullPolicyEnvVar); ok {
cfg.Docker.PullPolicy = ImagePullPolicy(strings.TrimSpace(raw))
}
pgConn, err := postgres.LoadFromEnv(envPrefix)
if err != nil {
return Config{}, err
}
cfg.Postgres.Conn = pgConn
redisConn, err := redisconn.LoadFromEnv(envPrefix)
if err != nil {
return Config{}, err
}
cfg.Redis.Conn = redisConn
cfg.Streams.StartJobs = stringEnv(startJobsStreamEnvVar, cfg.Streams.StartJobs)
cfg.Streams.StopJobs = stringEnv(stopJobsStreamEnvVar, cfg.Streams.StopJobs)
cfg.Streams.JobResults = stringEnv(jobResultsStreamEnvVar, cfg.Streams.JobResults)
cfg.Streams.HealthEvents = stringEnv(healthEventsStreamEnvVar, cfg.Streams.HealthEvents)
cfg.Streams.NotificationIntents = stringEnv(notificationIntentsStreamEnv, cfg.Streams.NotificationIntents)
cfg.Streams.BlockTimeout, err = durationEnv(streamBlockTimeoutEnvVar, cfg.Streams.BlockTimeout)
if err != nil {
return Config{}, err
}
cfg.Container.DefaultCPUQuota, err = floatEnv(defaultCPUQuotaEnvVar, cfg.Container.DefaultCPUQuota)
if err != nil {
return Config{}, err
}
cfg.Container.DefaultMemory = stringEnv(defaultMemoryEnvVar, cfg.Container.DefaultMemory)
cfg.Container.DefaultPIDsLimit, err = intEnv(defaultPIDsLimitEnvVar, cfg.Container.DefaultPIDsLimit)
if err != nil {
return Config{}, err
}
cfg.Container.StopTimeout, err = secondsEnv(containerStopTimeoutSecondsEnvVar, cfg.Container.StopTimeout)
if err != nil {
return Config{}, err
}
cfg.Container.Retention, err = daysEnv(containerRetentionDaysEnvVar, cfg.Container.Retention)
if err != nil {
return Config{}, err
}
cfg.Container.EngineStateMountPath = stringEnv(engineStateMountPathEnvVar, cfg.Container.EngineStateMountPath)
cfg.Container.EngineStateEnvName = stringEnv(engineStateEnvNameEnvVar, cfg.Container.EngineStateEnvName)
cfg.Container.GameStateDirMode, err = octalUint32Env(gameStateDirModeEnvVar, cfg.Container.GameStateDirMode)
if err != nil {
return Config{}, err
}
cfg.Container.GameStateOwnerUID, err = intEnv(gameStateOwnerUIDEnvVar, cfg.Container.GameStateOwnerUID)
if err != nil {
return Config{}, err
}
cfg.Container.GameStateOwnerGID, err = intEnv(gameStateOwnerGIDEnvVar, cfg.Container.GameStateOwnerGID)
if err != nil {
return Config{}, err
}
root, ok := os.LookupEnv(gameStateRootEnvVar)
if !ok || strings.TrimSpace(root) == "" {
return Config{}, fmt.Errorf("%s must be set", gameStateRootEnvVar)
}
cfg.Container.GameStateRoot = strings.TrimSpace(root)
cfg.Health.InspectInterval, err = durationEnv(inspectIntervalEnvVar, cfg.Health.InspectInterval)
if err != nil {
return Config{}, err
}
cfg.Health.ProbeInterval, err = durationEnv(probeIntervalEnvVar, cfg.Health.ProbeInterval)
if err != nil {
return Config{}, err
}
cfg.Health.ProbeTimeout, err = durationEnv(probeTimeoutEnvVar, cfg.Health.ProbeTimeout)
if err != nil {
return Config{}, err
}
cfg.Health.ProbeFailuresThreshold, err = intEnv(probeFailuresThresholdEnvVar, cfg.Health.ProbeFailuresThreshold)
if err != nil {
return Config{}, err
}
cfg.Cleanup.ReconcileInterval, err = durationEnv(reconcileIntervalEnvVar, cfg.Cleanup.ReconcileInterval)
if err != nil {
return Config{}, err
}
cfg.Cleanup.CleanupInterval, err = durationEnv(cleanupIntervalEnvVar, cfg.Cleanup.CleanupInterval)
if err != nil {
return Config{}, err
}
cfg.Coordination.GameLeaseTTL, err = secondsEnv(gameLeaseTTLSecondsEnvVar, cfg.Coordination.GameLeaseTTL)
if err != nil {
return Config{}, err
}
lobbyURL, ok := os.LookupEnv(lobbyInternalBaseURLEnvVar)
if !ok || strings.TrimSpace(lobbyURL) == "" {
return Config{}, fmt.Errorf("%s must be set", lobbyInternalBaseURLEnvVar)
}
cfg.Lobby.BaseURL = strings.TrimSpace(lobbyURL)
cfg.Lobby.Timeout, err = durationEnv(lobbyInternalTimeoutEnvVar, cfg.Lobby.Timeout)
if err != nil {
return Config{}, err
}
cfg.Telemetry.ServiceName = stringEnv(otelServiceNameEnvVar, cfg.Telemetry.ServiceName)
cfg.Telemetry.TracesExporter = normalizeExporterValue(stringEnv(otelTracesExporterEnvVar, cfg.Telemetry.TracesExporter))
cfg.Telemetry.MetricsExporter = normalizeExporterValue(stringEnv(otelMetricsExporterEnvVar, cfg.Telemetry.MetricsExporter))
cfg.Telemetry.TracesProtocol = normalizeProtocolValue(
os.Getenv(otelExporterOTLPTracesProtocolEnvVar),
os.Getenv(otelExporterOTLPProtocolEnvVar),
cfg.Telemetry.TracesProtocol,
)
cfg.Telemetry.MetricsProtocol = normalizeProtocolValue(
os.Getenv(otelExporterOTLPMetricsProtocolEnvVar),
os.Getenv(otelExporterOTLPProtocolEnvVar),
cfg.Telemetry.MetricsProtocol,
)
cfg.Telemetry.StdoutTracesEnabled, err = boolEnv(otelStdoutTracesEnabledEnvVar, cfg.Telemetry.StdoutTracesEnabled)
if err != nil {
return Config{}, err
}
cfg.Telemetry.StdoutMetricsEnabled, err = boolEnv(otelStdoutMetricsEnabledEnvVar, cfg.Telemetry.StdoutMetricsEnabled)
if err != nil {
return Config{}, err
}
if err := cfg.Validate(); err != nil {
return Config{}, err
}
return cfg, nil
}
func stringEnv(name string, fallback string) string {
value, ok := os.LookupEnv(name)
if !ok {
return fallback
}
return strings.TrimSpace(value)
}
func durationEnv(name string, fallback time.Duration) (time.Duration, error) {
value, ok := os.LookupEnv(name)
if !ok {
return fallback, nil
}
parsed, err := time.ParseDuration(strings.TrimSpace(value))
if err != nil {
return 0, fmt.Errorf("%s: parse duration: %w", name, err)
}
return parsed, nil
}
func secondsEnv(name string, fallback time.Duration) (time.Duration, error) {
value, ok := os.LookupEnv(name)
if !ok {
return fallback, nil
}
parsed, err := strconv.Atoi(strings.TrimSpace(value))
if err != nil {
return 0, fmt.Errorf("%s: parse seconds: %w", name, err)
}
if parsed <= 0 {
return 0, fmt.Errorf("%s: must be positive", name)
}
return time.Duration(parsed) * time.Second, nil
}
func daysEnv(name string, fallback time.Duration) (time.Duration, error) {
value, ok := os.LookupEnv(name)
if !ok {
return fallback, nil
}
parsed, err := strconv.Atoi(strings.TrimSpace(value))
if err != nil {
return 0, fmt.Errorf("%s: parse days: %w", name, err)
}
if parsed <= 0 {
return 0, fmt.Errorf("%s: must be positive", name)
}
return time.Duration(parsed) * 24 * time.Hour, nil
}
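// For example (values mirrored from the config tests):
// RTMANAGER_GAME_LEASE_TTL_SECONDS=120 yields 2m0s via secondsEnv, and
// RTMANAGER_CONTAINER_RETENTION_DAYS=7 yields 168h0m0s via daysEnv.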
func intEnv(name string, fallback int) (int, error) {
value, ok := os.LookupEnv(name)
if !ok {
return fallback, nil
}
parsed, err := strconv.Atoi(strings.TrimSpace(value))
if err != nil {
return 0, fmt.Errorf("%s: parse int: %w", name, err)
}
return parsed, nil
}
func floatEnv(name string, fallback float64) (float64, error) {
value, ok := os.LookupEnv(name)
if !ok {
return fallback, nil
}
parsed, err := strconv.ParseFloat(strings.TrimSpace(value), 64)
if err != nil {
return 0, fmt.Errorf("%s: parse float: %w", name, err)
}
return parsed, nil
}
func boolEnv(name string, fallback bool) (bool, error) {
value, ok := os.LookupEnv(name)
if !ok {
return fallback, nil
}
parsed, err := strconv.ParseBool(strings.TrimSpace(value))
if err != nil {
return false, fmt.Errorf("%s: parse bool: %w", name, err)
}
return parsed, nil
}
func octalUint32Env(name string, fallback uint32) (uint32, error) {
value, ok := os.LookupEnv(name)
if !ok {
return fallback, nil
}
parsed, err := strconv.ParseUint(strings.TrimSpace(value), 8, 32)
if err != nil {
return 0, fmt.Errorf("%s: parse octal: %w", name, err)
}
return uint32(parsed), nil
}
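// For example, RTMANAGER_GAME_STATE_DIR_MODE="0700" is parsed base-8 and
// yields 0o700; a value such as "750" is likewise read as octal (0o750),
// so operators should supply the mode as plain octal digits.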
func normalizeExporterValue(value string) string {
trimmed := strings.TrimSpace(value)
switch trimmed {
case "", "none":
return "none"
default:
return trimmed
}
}
func normalizeProtocolValue(primary string, fallback string, defaultValue string) string {
primary = strings.TrimSpace(primary)
if primary != "" {
return primary
}
fallback = strings.TrimSpace(fallback)
if fallback != "" {
return fallback
}
return strings.TrimSpace(defaultValue)
}
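// For example, with OTEL_EXPORTER_OTLP_TRACES_PROTOCOL="grpc" and
// OTEL_EXPORTER_OTLP_PROTOCOL="http/protobuf", the traces protocol
// resolves to "grpc" (the per-signal variable wins); with only the
// generic variable set, both signals fall back to "http/protobuf".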
@@ -0,0 +1,93 @@
package config
import (
"fmt"
"log/slog"
"net"
"net/url"
"strings"
)
// Validate reports whether cfg stores a usable Runtime Manager process
// configuration.
func (cfg Config) Validate() error {
if cfg.ShutdownTimeout <= 0 {
return fmt.Errorf("%s must be positive", shutdownTimeoutEnvVar)
}
if err := validateSlogLevel(cfg.Logging.Level); err != nil {
return fmt.Errorf("%s: %w", logLevelEnvVar, err)
}
if err := cfg.InternalHTTP.Validate(); err != nil {
return err
}
if err := cfg.Docker.Validate(); err != nil {
return err
}
if err := cfg.Postgres.Validate(); err != nil {
return err
}
if err := cfg.Redis.Validate(); err != nil {
return err
}
if err := cfg.Streams.Validate(); err != nil {
return err
}
if err := cfg.Container.Validate(); err != nil {
return err
}
if err := cfg.Health.Validate(); err != nil {
return err
}
if err := cfg.Cleanup.Validate(); err != nil {
return err
}
if err := cfg.Coordination.Validate(); err != nil {
return err
}
if err := cfg.Lobby.Validate(); err != nil {
return err
}
if err := cfg.Telemetry.Validate(); err != nil {
return err
}
return nil
}
func validateSlogLevel(level string) error {
var slogLevel slog.Level
if err := slogLevel.UnmarshalText([]byte(strings.TrimSpace(level))); err != nil {
return fmt.Errorf("invalid slog level %q: %w", level, err)
}
return nil
}
func isTCPAddr(value string) bool {
host, port, err := net.SplitHostPort(strings.TrimSpace(value))
if err != nil {
return false
}
if port == "" {
return false
}
if host == "" {
return true
}
return !strings.Contains(host, " ")
}
func isHTTPURL(value string) bool {
parsed, err := url.Parse(strings.TrimSpace(value))
if err != nil {
return false
}
if parsed.Scheme != "http" && parsed.Scheme != "https" {
return false
}
return parsed.Host != ""
}
@@ -0,0 +1,231 @@
// Package health defines the technical-health domain types owned by
// Runtime Manager.
//
// EventType matches the `event_type` enum frozen in
// `galaxy/rtmanager/api/runtime-health-asyncapi.yaml`. SnapshotStatus
// matches the SQL CHECK on `health_snapshots.status` and is intentionally
// narrower than EventType (the snapshot table collapses
// `container_started → healthy` and drops `probe_recovered` per
// `galaxy/rtmanager/README.md §Health Monitoring`).
package health
import (
"encoding/json"
"fmt"
"strings"
"time"
)
// EventType identifies one entry on the `runtime:health_events` Redis
// Stream. Used by the health-event publishers and consumers.
type EventType string
const (
// EventTypeContainerStarted reports a successful container start.
EventTypeContainerStarted EventType = "container_started"
// EventTypeContainerExited reports a non-zero Docker `die` event.
EventTypeContainerExited EventType = "container_exited"
// EventTypeContainerOOM reports a Docker `oom` event.
EventTypeContainerOOM EventType = "container_oom"
// EventTypeContainerDisappeared reports that the listener observed
// a `destroy` event for a record Runtime Manager did not initiate.
EventTypeContainerDisappeared EventType = "container_disappeared"
// EventTypeInspectUnhealthy reports an unexpected outcome of the
// periodic Docker inspect (RestartCount growth, unexpected status,
// declared HEALTHCHECK reporting unhealthy).
EventTypeInspectUnhealthy EventType = "inspect_unhealthy"
// EventTypeProbeFailed reports that the active HTTP probe crossed
// the configured failure threshold.
EventTypeProbeFailed EventType = "probe_failed"
// EventTypeProbeRecovered reports the first probe success after a
// `probe_failed` event was published.
EventTypeProbeRecovered EventType = "probe_recovered"
)
// IsKnown reports whether eventType belongs to the frozen event-type
// vocabulary.
func (eventType EventType) IsKnown() bool {
switch eventType {
case EventTypeContainerStarted,
EventTypeContainerExited,
EventTypeContainerOOM,
EventTypeContainerDisappeared,
EventTypeInspectUnhealthy,
EventTypeProbeFailed,
EventTypeProbeRecovered:
return true
default:
return false
}
}
// AllEventTypes returns the frozen list of every event-type value.
func AllEventTypes() []EventType {
return []EventType{
EventTypeContainerStarted,
EventTypeContainerExited,
EventTypeContainerOOM,
EventTypeContainerDisappeared,
EventTypeInspectUnhealthy,
EventTypeProbeFailed,
EventTypeProbeRecovered,
}
}
// SnapshotStatus identifies one latest-observation status value stored
// in the `health_snapshots.status` column. Distinct from EventType: the
// table collapses `container_started → healthy` and never persists
// `probe_recovered` (it is conveyed only as a `runtime:health_events`
// entry with status=healthy in the next observation).
type SnapshotStatus string
const (
// SnapshotStatusHealthy reports that the most recent observation
// found the container live and the engine probe responsive.
SnapshotStatusHealthy SnapshotStatus = "healthy"
// SnapshotStatusProbeFailed reports that the active probe crossed
// the failure threshold.
SnapshotStatusProbeFailed SnapshotStatus = "probe_failed"
// SnapshotStatusExited reports that the container exited.
SnapshotStatusExited SnapshotStatus = "exited"
// SnapshotStatusOOM reports that the container was killed by the
// OOM killer.
SnapshotStatusOOM SnapshotStatus = "oom"
// SnapshotStatusInspectUnhealthy reports that the periodic inspect
// observed an unexpected state.
SnapshotStatusInspectUnhealthy SnapshotStatus = "inspect_unhealthy"
// SnapshotStatusContainerDisappeared reports that Docker no longer
// reports the container.
SnapshotStatusContainerDisappeared SnapshotStatus = "container_disappeared"
)
// IsKnown reports whether status belongs to the frozen snapshot-status
// vocabulary.
func (status SnapshotStatus) IsKnown() bool {
switch status {
case SnapshotStatusHealthy,
SnapshotStatusProbeFailed,
SnapshotStatusExited,
SnapshotStatusOOM,
SnapshotStatusInspectUnhealthy,
SnapshotStatusContainerDisappeared:
return true
default:
return false
}
}
// AllSnapshotStatuses returns the frozen list of every snapshot-status
// value.
func AllSnapshotStatuses() []SnapshotStatus {
return []SnapshotStatus{
SnapshotStatusHealthy,
SnapshotStatusProbeFailed,
SnapshotStatusExited,
SnapshotStatusOOM,
SnapshotStatusInspectUnhealthy,
SnapshotStatusContainerDisappeared,
}
}
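// Illustrative sketch (not part of this package): the collapse described
// in the package comment maps event types onto snapshot statuses roughly
// as below; the mapping function itself is hypothetical and adapters may
// implement it differently.
//
//	func snapshotStatusFor(eventType EventType) SnapshotStatus {
//		switch eventType {
//		case EventTypeContainerStarted, EventTypeProbeRecovered:
//			return SnapshotStatusHealthy
//		case EventTypeContainerExited:
//			return SnapshotStatusExited
//		case EventTypeContainerOOM:
//			return SnapshotStatusOOM
//		case EventTypeContainerDisappeared:
//			return SnapshotStatusContainerDisappeared
//		case EventTypeInspectUnhealthy:
//			return SnapshotStatusInspectUnhealthy
//		case EventTypeProbeFailed:
//			return SnapshotStatusProbeFailed
//		default:
//			return ""
//		}
//	}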
// SnapshotSource identifies the observation source that produced one
// snapshot. Matches the SQL CHECK on `health_snapshots.source`.
type SnapshotSource string
const (
// SnapshotSourceDockerEvent reports that the latest observation
// arrived through the Docker events listener.
SnapshotSourceDockerEvent SnapshotSource = "docker_event"
// SnapshotSourceInspect reports that the latest observation arrived
// through the periodic Docker inspect worker.
SnapshotSourceInspect SnapshotSource = "inspect"
// SnapshotSourceProbe reports that the latest observation arrived
// through the active HTTP probe.
SnapshotSourceProbe SnapshotSource = "probe"
)
// IsKnown reports whether source belongs to the frozen snapshot-source
// vocabulary.
func (source SnapshotSource) IsKnown() bool {
switch source {
case SnapshotSourceDockerEvent,
SnapshotSourceInspect,
SnapshotSourceProbe:
return true
default:
return false
}
}
// AllSnapshotSources returns the frozen list of every snapshot-source
// value.
func AllSnapshotSources() []SnapshotSource {
return []SnapshotSource{
SnapshotSourceDockerEvent,
SnapshotSourceInspect,
SnapshotSourceProbe,
}
}
// HealthSnapshot stores the latest technical-health observation for one
// game. One row per game_id; later observations overwrite.
type HealthSnapshot struct {
// GameID identifies the platform game.
GameID string
// ContainerID stores the Docker container id observed by the
// snapshot source. Empty when the source could not associate a
// container (e.g., reconciler dispose for a record whose container
// is already gone).
ContainerID string
// Status stores the latest observed snapshot status.
Status SnapshotStatus
// Source stores the observation source that produced this entry.
Source SnapshotSource
// Details stores the source-specific JSON detail payload. Adapters
// store and retrieve it verbatim. Empty / nil values are persisted
// as the SQL default `{}`.
Details json.RawMessage
// ObservedAt stores the wall-clock at which the source captured the
// observation.
ObservedAt time.Time
}
// Validate reports whether snapshot satisfies the snapshot invariants
// implied by the SQL CHECK constraints.
func (snapshot HealthSnapshot) Validate() error {
if strings.TrimSpace(snapshot.GameID) == "" {
return fmt.Errorf("game id must not be empty")
}
if !snapshot.Status.IsKnown() {
return fmt.Errorf("status %q is unsupported", snapshot.Status)
}
if !snapshot.Source.IsKnown() {
return fmt.Errorf("source %q is unsupported", snapshot.Source)
}
if snapshot.ObservedAt.IsZero() {
return fmt.Errorf("observed at must not be zero")
}
if len(snapshot.Details) > 0 && !json.Valid(snapshot.Details) {
return fmt.Errorf("details must be valid JSON when non-empty")
}
return nil
}
@@ -0,0 +1,133 @@
package health
import (
"encoding/json"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestEventTypeIsKnown(t *testing.T) {
for _, eventType := range AllEventTypes() {
assert.Truef(t, eventType.IsKnown(), "expected %q known", eventType)
}
assert.False(t, EventType("").IsKnown())
assert.False(t, EventType("paused").IsKnown())
}
func TestAllEventTypesCoverFrozenSet(t *testing.T) {
assert.ElementsMatch(t,
[]EventType{
EventTypeContainerStarted,
EventTypeContainerExited,
EventTypeContainerOOM,
EventTypeContainerDisappeared,
EventTypeInspectUnhealthy,
EventTypeProbeFailed,
EventTypeProbeRecovered,
},
AllEventTypes(),
)
}
func TestSnapshotStatusIsKnown(t *testing.T) {
for _, status := range AllSnapshotStatuses() {
assert.Truef(t, status.IsKnown(), "expected %q known", status)
}
assert.False(t, SnapshotStatus("").IsKnown())
assert.False(t, SnapshotStatus("starting").IsKnown())
assert.False(t, SnapshotStatus("probe_recovered").IsKnown(),
"snapshot status must not include event-only values")
assert.False(t, SnapshotStatus("container_started").IsKnown(),
"snapshot status must not include event-only values")
}
func TestAllSnapshotStatusesCoverFrozenSet(t *testing.T) {
assert.ElementsMatch(t,
[]SnapshotStatus{
SnapshotStatusHealthy,
SnapshotStatusProbeFailed,
SnapshotStatusExited,
SnapshotStatusOOM,
SnapshotStatusInspectUnhealthy,
SnapshotStatusContainerDisappeared,
},
AllSnapshotStatuses(),
)
}
func TestSnapshotSourceIsKnown(t *testing.T) {
for _, source := range AllSnapshotSources() {
assert.Truef(t, source.IsKnown(), "expected %q known", source)
}
assert.False(t, SnapshotSource("").IsKnown())
assert.False(t, SnapshotSource("manual").IsKnown())
}
func TestAllSnapshotSourcesCoverFrozenSet(t *testing.T) {
assert.ElementsMatch(t,
[]SnapshotSource{
SnapshotSourceDockerEvent,
SnapshotSourceInspect,
SnapshotSourceProbe,
},
AllSnapshotSources(),
)
}
func sampleSnapshot() HealthSnapshot {
return HealthSnapshot{
GameID: "game-test",
ContainerID: "container-1",
Status: SnapshotStatusHealthy,
Source: SnapshotSourceProbe,
Details: json.RawMessage(`{"prior_failure_count":0}`),
ObservedAt: time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC),
}
}
func TestHealthSnapshotValidateHappy(t *testing.T) {
require.NoError(t, sampleSnapshot().Validate())
}
func TestHealthSnapshotValidateAcceptsEmptyDetails(t *testing.T) {
snapshot := sampleSnapshot()
snapshot.Details = nil
assert.NoError(t, snapshot.Validate())
}
func TestHealthSnapshotValidateAcceptsEmptyContainerID(t *testing.T) {
snapshot := sampleSnapshot()
snapshot.ContainerID = ""
assert.NoError(t, snapshot.Validate())
}
func TestHealthSnapshotValidateRejects(t *testing.T) {
tests := []struct {
name string
mutate func(*HealthSnapshot)
}{
{"empty game id", func(s *HealthSnapshot) { s.GameID = "" }},
{"unknown status", func(s *HealthSnapshot) { s.Status = "exotic" }},
{"unknown source", func(s *HealthSnapshot) { s.Source = "exotic" }},
{"zero observed at", func(s *HealthSnapshot) { s.ObservedAt = time.Time{} }},
{"invalid details json", func(s *HealthSnapshot) {
s.Details = json.RawMessage("not-json")
}},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
snapshot := sampleSnapshot()
tt.mutate(&snapshot)
assert.Error(t, snapshot.Validate())
})
}
}
@@ -0,0 +1,245 @@
// Package operation defines the runtime-operation audit-log domain types
// owned by Runtime Manager.
//
// One OperationEntry maps to one row of the `operation_log` PostgreSQL
// table (see
// `galaxy/rtmanager/internal/adapters/postgres/migrations/00001_init.sql`).
// The OpKind / OpSource / Outcome enums match the SQL CHECK constraints
// verbatim and feed the telemetry counters declared in
// `galaxy/rtmanager/README.md §Observability`.
package operation
import (
"fmt"
"strings"
"time"
)
// OpKind identifies the kind of operation Runtime Manager performed.
type OpKind string
const (
// OpKindStart records a start lifecycle operation.
OpKindStart OpKind = "start"
// OpKindStop records a stop lifecycle operation.
OpKindStop OpKind = "stop"
// OpKindRestart records a restart lifecycle operation
// (recreate with the same image_ref).
OpKindRestart OpKind = "restart"
// OpKindPatch records a semver-patch lifecycle operation
// (recreate with a new image_ref).
OpKindPatch OpKind = "patch"
// OpKindCleanupContainer records a container removal performed by
// the cleanup TTL worker or the admin DELETE endpoint.
OpKindCleanupContainer OpKind = "cleanup_container"
// OpKindReconcileAdopt records that the reconciler discovered an
// unrecorded container labelled `com.galaxy.owner=rtmanager` and
// inserted a runtime record for it.
OpKindReconcileAdopt OpKind = "reconcile_adopt"
// OpKindReconcileDispose records that the reconciler observed a
// running record whose container is missing in Docker and marked it
// as removed.
OpKindReconcileDispose OpKind = "reconcile_dispose"
)
// IsKnown reports whether kind belongs to the frozen op-kind vocabulary.
func (kind OpKind) IsKnown() bool {
switch kind {
case OpKindStart,
OpKindStop,
OpKindRestart,
OpKindPatch,
OpKindCleanupContainer,
OpKindReconcileAdopt,
OpKindReconcileDispose:
return true
default:
return false
}
}
// AllOpKinds returns the frozen list of every op-kind value. The slice
// order is stable across calls.
func AllOpKinds() []OpKind {
return []OpKind{
OpKindStart,
OpKindStop,
OpKindRestart,
OpKindPatch,
OpKindCleanupContainer,
OpKindReconcileAdopt,
OpKindReconcileDispose,
}
}
// OpSource identifies where one operation entered Runtime Manager.
type OpSource string
const (
// OpSourceLobbyStream identifies entries triggered by the
// `runtime:start_jobs` or `runtime:stop_jobs` Redis Stream consumer.
OpSourceLobbyStream OpSource = "lobby_stream"
// OpSourceGMRest identifies entries triggered by Game Master through
// the internal REST surface.
OpSourceGMRest OpSource = "gm_rest"
// OpSourceAdminRest identifies entries triggered by Admin Service
// through the internal REST surface.
OpSourceAdminRest OpSource = "admin_rest"
// OpSourceAutoTTL identifies entries triggered by the periodic
// container-cleanup worker.
OpSourceAutoTTL OpSource = "auto_ttl"
// OpSourceAutoReconcile identifies entries triggered by the
// reconciler at startup or on its periodic interval.
OpSourceAutoReconcile OpSource = "auto_reconcile"
)
// IsKnown reports whether source belongs to the frozen op-source
// vocabulary.
func (source OpSource) IsKnown() bool {
switch source {
case OpSourceLobbyStream,
OpSourceGMRest,
OpSourceAdminRest,
OpSourceAutoTTL,
OpSourceAutoReconcile:
return true
default:
return false
}
}
// AllOpSources returns the frozen list of every op-source value. The
// slice order is stable across calls.
func AllOpSources() []OpSource {
return []OpSource{
OpSourceLobbyStream,
OpSourceGMRest,
OpSourceAdminRest,
OpSourceAutoTTL,
OpSourceAutoReconcile,
}
}
// Outcome reports the high-level outcome of one operation.
type Outcome string
const (
// OutcomeSuccess reports that the operation completed without
// surfacing an error.
OutcomeSuccess Outcome = "success"
// OutcomeFailure reports that the operation surfaced a stable error
// code recorded in OperationEntry.ErrorCode.
OutcomeFailure Outcome = "failure"
)
// IsKnown reports whether outcome belongs to the frozen outcome
// vocabulary.
func (outcome Outcome) IsKnown() bool {
switch outcome {
case OutcomeSuccess, OutcomeFailure:
return true
default:
return false
}
}
// AllOutcomes returns the frozen list of every outcome value.
func AllOutcomes() []Outcome {
return []Outcome{OutcomeSuccess, OutcomeFailure}
}
// OperationEntry stores one append-only audit row of the `operation_log`
// table. ID is zero on records that have not been persisted yet; the
// store assigns it from the table's bigserial column. FinishedAt is a
// pointer because the column is nullable for in-flight rows even though
// the lifecycle services finalise the row in the same transaction.
type OperationEntry struct {
// ID identifies the persisted row. Zero before persistence.
ID int64
// GameID identifies the platform game this operation acted on.
GameID string
// OpKind classifies what the operation did.
OpKind OpKind
// OpSource classifies how the operation entered Runtime Manager.
OpSource OpSource
// SourceRef stores an opaque per-source reference such as a Redis
// Stream entry id, a REST request id, or an admin user id. Empty
// when the source does not provide one.
SourceRef string
// ImageRef stores the engine image reference associated with the
// operation, when applicable. Empty for operations that do not
// touch an image (e.g., cleanup_container).
ImageRef string
// ContainerID stores the Docker container id observed at the time
// of the operation, when applicable.
ContainerID string
// Outcome reports whether the operation succeeded or failed.
Outcome Outcome
// ErrorCode stores the stable error code on failure. Empty on
// success.
ErrorCode string
// ErrorMessage stores the operator-readable detail on failure.
// Empty on success.
ErrorMessage string
// StartedAt stores the wall-clock at which the operation began.
StartedAt time.Time
// FinishedAt stores the wall-clock at which the operation
// finalised. Nil for in-flight rows.
FinishedAt *time.Time
}
// Validate reports whether entry satisfies the operation-log invariants
// implied by the SQL CHECK constraints and the README §Persistence
// Layout.
func (entry OperationEntry) Validate() error {
if strings.TrimSpace(entry.GameID) == "" {
return fmt.Errorf("game id must not be empty")
}
if !entry.OpKind.IsKnown() {
return fmt.Errorf("op kind %q is unsupported", entry.OpKind)
}
if !entry.OpSource.IsKnown() {
return fmt.Errorf("op source %q is unsupported", entry.OpSource)
}
if !entry.Outcome.IsKnown() {
return fmt.Errorf("outcome %q is unsupported", entry.Outcome)
}
if entry.StartedAt.IsZero() {
return fmt.Errorf("started at must not be zero")
}
if entry.FinishedAt != nil {
if entry.FinishedAt.IsZero() {
return fmt.Errorf("finished at must not be zero when present")
}
if entry.FinishedAt.Before(entry.StartedAt) {
return fmt.Errorf("finished at must not be before started at")
}
}
if entry.Outcome == OutcomeFailure && strings.TrimSpace(entry.ErrorCode) == "" {
return fmt.Errorf("error code must not be empty for failure entries")
}
return nil
}
@@ -0,0 +1,130 @@
package operation
import (
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestOpKindIsKnown(t *testing.T) {
for _, kind := range AllOpKinds() {
assert.Truef(t, kind.IsKnown(), "expected %q known", kind)
}
assert.False(t, OpKind("").IsKnown())
assert.False(t, OpKind("rollback").IsKnown())
}
func TestAllOpKindsCoverFrozenSet(t *testing.T) {
assert.ElementsMatch(t,
[]OpKind{
OpKindStart, OpKindStop, OpKindRestart, OpKindPatch,
OpKindCleanupContainer, OpKindReconcileAdopt, OpKindReconcileDispose,
},
AllOpKinds(),
)
}
func TestOpSourceIsKnown(t *testing.T) {
for _, source := range AllOpSources() {
assert.Truef(t, source.IsKnown(), "expected %q known", source)
}
assert.False(t, OpSource("").IsKnown())
assert.False(t, OpSource("manual").IsKnown())
}
func TestAllOpSourcesCoverFrozenSet(t *testing.T) {
assert.ElementsMatch(t,
[]OpSource{
OpSourceLobbyStream, OpSourceGMRest, OpSourceAdminRest,
OpSourceAutoTTL, OpSourceAutoReconcile,
},
AllOpSources(),
)
}
func TestOutcomeIsKnown(t *testing.T) {
for _, outcome := range AllOutcomes() {
assert.Truef(t, outcome.IsKnown(), "expected %q known", outcome)
}
assert.False(t, Outcome("").IsKnown())
assert.False(t, Outcome("partial").IsKnown())
}
func TestAllOutcomesCoverFrozenSet(t *testing.T) {
assert.ElementsMatch(t,
[]Outcome{OutcomeSuccess, OutcomeFailure},
AllOutcomes(),
)
}
func successEntry() OperationEntry {
started := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)
finished := started.Add(time.Second)
return OperationEntry{
GameID: "game-test",
OpKind: OpKindStart,
OpSource: OpSourceLobbyStream,
SourceRef: "1700000000000-0",
ImageRef: "galaxy/game:1.0.0",
ContainerID: "container-1",
Outcome: OutcomeSuccess,
StartedAt: started,
FinishedAt: &finished,
}
}
func TestOperationEntryValidateHappy(t *testing.T) {
require.NoError(t, successEntry().Validate())
}
func TestOperationEntryValidateAcceptsReplayNoOp(t *testing.T) {
entry := successEntry()
entry.ErrorCode = "replay_no_op"
assert.NoError(t, entry.Validate())
}
func TestOperationEntryValidateAcceptsInFlight(t *testing.T) {
entry := successEntry()
entry.FinishedAt = nil
assert.NoError(t, entry.Validate())
}
func TestOperationEntryValidateRejects(t *testing.T) {
tests := []struct {
name string
mutate func(*OperationEntry)
}{
{"empty game id", func(e *OperationEntry) { e.GameID = "" }},
{"unknown op kind", func(e *OperationEntry) { e.OpKind = "exotic" }},
{"unknown op source", func(e *OperationEntry) { e.OpSource = "exotic" }},
{"unknown outcome", func(e *OperationEntry) { e.Outcome = "partial" }},
{"zero started at", func(e *OperationEntry) { e.StartedAt = time.Time{} }},
{"zero finished at", func(e *OperationEntry) {
zero := time.Time{}
e.FinishedAt = &zero
}},
{"finished before started", func(e *OperationEntry) {
before := e.StartedAt.Add(-time.Second)
e.FinishedAt = &before
}},
{"failure without error code", func(e *OperationEntry) {
e.Outcome = OutcomeFailure
e.ErrorCode = ""
}},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
entry := successEntry()
tt.mutate(&entry)
assert.Error(t, entry.Validate())
})
}
}
@@ -0,0 +1,43 @@
package runtime
import (
"errors"
"fmt"
)
// ErrNotFound reports that a runtime record was requested but does not
// exist in the store.
var ErrNotFound = errors.New("runtime record not found")
// ErrConflict reports that a runtime mutation could not be applied
// because the record changed concurrently or failed a compare-and-swap
// guard.
var ErrConflict = errors.New("runtime record conflict")
// ErrInvalidTransition is the sentinel returned when Transition rejects
// a `(from, to)` pair.
var ErrInvalidTransition = errors.New("invalid runtime status transition")
// InvalidTransitionError stores the rejected `(from, to)` pair and wraps
// ErrInvalidTransition so callers can match it with errors.Is.
type InvalidTransitionError struct {
	// From stores the source status the rejected transition attempted to
	// leave.
	From Status
	// To stores the destination status the rejected transition attempted
	// to enter.
	To Status
}
// Error reports a human-readable summary of the rejected pair.
func (err *InvalidTransitionError) Error() string {
return fmt.Sprintf(
"invalid runtime status transition from %q to %q",
err.From, err.To,
)
}
// Unwrap returns ErrInvalidTransition so errors.Is recognizes the
// sentinel.
func (err *InvalidTransitionError) Unwrap() error {
return ErrInvalidTransition
}
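// Illustrative sketch (not part of this commit): how a caller can branch on
// the sentinels above with errors.Is / errors.As. The returned labels are
// hypothetical service-side classifications.
func classifyStoreError(err error) string {
	switch {
	case err == nil:
		return "ok"
	case errors.Is(err, ErrNotFound):
		return "not_found"
	case errors.Is(err, ErrConflict):
		return "conflict"
	case errors.Is(err, ErrInvalidTransition):
		var transitionErr *InvalidTransitionError
		if errors.As(err, &transitionErr) {
			return fmt.Sprintf("invalid_transition:%s->%s", transitionErr.From, transitionErr.To)
		}
		return "invalid_transition"
	default:
		return "unknown"
	}
}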
@@ -0,0 +1,197 @@
// Package runtime defines the runtime-record domain model, status machine,
// and sentinel errors owned by Runtime Manager.
//
// The package mirrors the durable shape of the `runtime_records`
// PostgreSQL table (see
// `galaxy/rtmanager/internal/adapters/postgres/migrations/00001_init.sql`).
// Every status / transition / required-field rule already documented in
// `galaxy/rtmanager/README.md` lives here as code so adapter and service
// layers do not re-derive it.
package runtime
import (
"fmt"
"strings"
"time"
)
// Status identifies one runtime-record lifecycle state.
type Status string
const (
// StatusRunning reports that an engine container is live and bound to
// the record. The associated container id and image ref are non-empty
// and StartedAt is set.
StatusRunning Status = "running"
// StatusStopped reports that the engine container has exited (graceful
// stop, observed Docker exit, or reconciled exit). The container is
// still present in Docker until the cleanup worker removes it.
StatusStopped Status = "stopped"
// StatusRemoved reports that the container has been removed from
// Docker (admin cleanup or reconcile_dispose). The record stays in
// PostgreSQL for audit; there is no transition out of this state.
StatusRemoved Status = "removed"
)
// IsKnown reports whether status belongs to the frozen runtime status
// vocabulary.
func (status Status) IsKnown() bool {
switch status {
case StatusRunning, StatusStopped, StatusRemoved:
return true
default:
return false
}
}
// IsTerminal reports whether status can no longer accept lifecycle
// transitions.
func (status Status) IsTerminal() bool {
return status == StatusRemoved
}
// AllStatuses returns the frozen list of every runtime status value. The
// slice order is stable across calls and matches the README §Persistence
// Layout listing.
func AllStatuses() []Status {
return []Status{
StatusRunning,
StatusStopped,
StatusRemoved,
}
}
// RuntimeRecord stores one durable runtime record owned by Runtime
// Manager. It mirrors one row of the `runtime_records` table.
//
// CurrentContainerID and CurrentImageRef are stored as plain strings; an
// empty value represents SQL NULL and is bridged at the adapter layer.
// StartedAt, StoppedAt, and RemovedAt are *time.Time so a missing value
// is unambiguous and aligns with the jet-generated model.
type RuntimeRecord struct {
// GameID identifies the platform game owning this runtime record.
GameID string
// Status stores the current lifecycle state.
Status Status
// CurrentContainerID identifies the bound Docker container. Empty
// when status is removed and after a reconciler observes
// disappearance.
CurrentContainerID string
// CurrentImageRef stores the Docker reference of the currently-bound
// engine image. Non-empty when status is running or stopped.
CurrentImageRef string
// EngineEndpoint stores the stable URL Game Master uses to reach the
// engine container, in `http://galaxy-game-{game_id}:8080` form.
EngineEndpoint string
// StatePath stores the absolute host path of the bind-mounted engine
// state directory.
StatePath string
// DockerNetwork stores the Docker network the container was attached
// to at create time.
DockerNetwork string
// StartedAt stores the wall-clock at which the container became
// running. Non-nil when status is running or stopped.
StartedAt *time.Time
// StoppedAt stores the wall-clock at which the container exited.
// Non-nil when status is stopped or removed (when the record passed
// through stopped before removal).
StoppedAt *time.Time
// RemovedAt stores the wall-clock at which the container was removed
// from Docker. Non-nil when status is removed.
RemovedAt *time.Time
// LastOpAt stores the wall-clock of the most recent operation
// affecting this record. Drives the cleanup TTL.
LastOpAt time.Time
// CreatedAt stores the wall-clock at which Runtime Manager first saw
// this game.
CreatedAt time.Time
}
// Validate reports whether record satisfies the runtime-record invariants
// implied by README §Lifecycles and the SQL CHECK on `runtime_records`.
func (record RuntimeRecord) Validate() error {
if strings.TrimSpace(record.GameID) == "" {
return fmt.Errorf("game id must not be empty")
}
if !record.Status.IsKnown() {
return fmt.Errorf("status %q is unsupported", record.Status)
}
if strings.TrimSpace(record.EngineEndpoint) == "" {
return fmt.Errorf("engine endpoint must not be empty")
}
if strings.TrimSpace(record.StatePath) == "" {
return fmt.Errorf("state path must not be empty")
}
if strings.TrimSpace(record.DockerNetwork) == "" {
return fmt.Errorf("docker network must not be empty")
}
if record.LastOpAt.IsZero() {
return fmt.Errorf("last op at must not be zero")
}
if record.CreatedAt.IsZero() {
return fmt.Errorf("created at must not be zero")
}
if record.LastOpAt.Before(record.CreatedAt) {
return fmt.Errorf("last op at must not be before created at")
}
switch record.Status {
case StatusRunning:
if strings.TrimSpace(record.CurrentContainerID) == "" {
return fmt.Errorf("current container id must not be empty for running records")
}
if strings.TrimSpace(record.CurrentImageRef) == "" {
return fmt.Errorf("current image ref must not be empty for running records")
}
if record.StartedAt == nil {
return fmt.Errorf("started at must not be nil for running records")
}
if record.StartedAt.IsZero() {
return fmt.Errorf("started at must not be zero when present")
}
case StatusStopped:
if strings.TrimSpace(record.CurrentImageRef) == "" {
return fmt.Errorf("current image ref must not be empty for stopped records")
}
if record.StoppedAt == nil {
return fmt.Errorf("stopped at must not be nil for stopped records")
}
if record.StoppedAt.IsZero() {
return fmt.Errorf("stopped at must not be zero when present")
}
case StatusRemoved:
if record.RemovedAt == nil {
return fmt.Errorf("removed at must not be nil for removed records")
}
if record.RemovedAt.IsZero() {
return fmt.Errorf("removed at must not be zero when present")
}
}
if record.StartedAt != nil && record.StartedAt.Before(record.CreatedAt) {
return fmt.Errorf("started at must not be before created at")
}
if record.StoppedAt != nil && record.StartedAt != nil && record.StoppedAt.Before(*record.StartedAt) {
return fmt.Errorf("stopped at must not be before started at")
}
if record.RemovedAt != nil && record.RemovedAt.Before(record.CreatedAt) {
return fmt.Errorf("removed at must not be before created at")
}
return nil
}
@@ -0,0 +1,156 @@
package runtime
import (
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestStatusIsKnown(t *testing.T) {
for _, status := range AllStatuses() {
assert.Truef(t, status.IsKnown(), "expected %q known", status)
}
assert.False(t, Status("").IsKnown())
assert.False(t, Status("unknown").IsKnown())
}
func TestStatusIsTerminal(t *testing.T) {
assert.True(t, StatusRemoved.IsTerminal())
for _, status := range []Status{StatusRunning, StatusStopped} {
assert.Falsef(t, status.IsTerminal(), "expected %q non-terminal", status)
}
}
func TestAllStatuses(t *testing.T) {
statuses := AllStatuses()
assert.ElementsMatch(t,
[]Status{StatusRunning, StatusStopped, StatusRemoved},
statuses,
)
statuses[0] = "tampered"
assert.Equal(t, StatusRunning, AllStatuses()[0],
"AllStatuses must return an independent slice")
}
func runningRecord() RuntimeRecord {
created := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)
started := created.Add(time.Second)
return RuntimeRecord{
GameID: "game-test",
Status: StatusRunning,
CurrentContainerID: "container-1",
CurrentImageRef: "galaxy/game:1.0.0",
EngineEndpoint: "http://galaxy-game-game-test:8080",
StatePath: "/var/lib/galaxy/games/game-test",
DockerNetwork: "galaxy-net",
StartedAt: &started,
LastOpAt: started,
CreatedAt: created,
}
}
func TestRuntimeRecordValidateRunningHappy(t *testing.T) {
require.NoError(t, runningRecord().Validate())
}
func TestRuntimeRecordValidateStoppedHappy(t *testing.T) {
record := runningRecord()
stopped := record.StartedAt.Add(time.Minute)
record.Status = StatusStopped
record.StoppedAt = &stopped
record.LastOpAt = stopped
require.NoError(t, record.Validate())
}
func TestRuntimeRecordValidateRemovedHappy(t *testing.T) {
record := runningRecord()
stopped := record.StartedAt.Add(time.Minute)
removed := stopped.Add(time.Minute)
record.Status = StatusRemoved
record.StoppedAt = &stopped
record.RemovedAt = &removed
record.CurrentContainerID = ""
record.LastOpAt = removed
require.NoError(t, record.Validate())
}
func TestRuntimeRecordValidateRejects(t *testing.T) {
tests := []struct {
name string
mutate func(*RuntimeRecord)
}{
{"empty game id", func(r *RuntimeRecord) { r.GameID = "" }},
{"unknown status", func(r *RuntimeRecord) { r.Status = "exotic" }},
{"empty engine endpoint", func(r *RuntimeRecord) { r.EngineEndpoint = "" }},
{"empty state path", func(r *RuntimeRecord) { r.StatePath = "" }},
{"empty docker network", func(r *RuntimeRecord) { r.DockerNetwork = "" }},
{"zero last op at", func(r *RuntimeRecord) { r.LastOpAt = time.Time{} }},
{"zero created at", func(r *RuntimeRecord) { r.CreatedAt = time.Time{} }},
{"last op at before created at", func(r *RuntimeRecord) {
r.LastOpAt = r.CreatedAt.Add(-time.Second)
}},
{"running without container id", func(r *RuntimeRecord) {
r.CurrentContainerID = ""
}},
{"running without image ref", func(r *RuntimeRecord) {
r.CurrentImageRef = ""
}},
{"running without started at", func(r *RuntimeRecord) {
r.StartedAt = nil
}},
{"started at before created at", func(r *RuntimeRecord) {
before := r.CreatedAt.Add(-time.Second)
r.StartedAt = &before
}},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
record := runningRecord()
tt.mutate(&record)
assert.Error(t, record.Validate())
})
}
}
func TestRuntimeRecordValidateRejectsStoppedWithoutStoppedAt(t *testing.T) {
record := runningRecord()
record.Status = StatusStopped
record.StoppedAt = nil
assert.Error(t, record.Validate())
}
func TestRuntimeRecordValidateRejectsStoppedBeforeStarted(t *testing.T) {
record := runningRecord()
stopped := record.StartedAt.Add(-time.Second)
record.Status = StatusStopped
record.StoppedAt = &stopped
assert.Error(t, record.Validate())
}
func TestRuntimeRecordValidateRejectsRemovedWithoutRemovedAt(t *testing.T) {
record := runningRecord()
record.Status = StatusRemoved
record.RemovedAt = nil
assert.Error(t, record.Validate())
}
func TestRuntimeRecordValidateRejectsRemovedBeforeCreated(t *testing.T) {
record := runningRecord()
before := record.CreatedAt.Add(-time.Second)
record.Status = StatusRemoved
record.RemovedAt = &before
assert.Error(t, record.Validate())
}
@@ -0,0 +1,51 @@
package runtime
// transitionKey stores one `(from, to)` pair in the allowed-transitions
// table.
type transitionKey struct {
from Status
to Status
}
// allowedTransitions stores the set of permitted `(from, to)` status
// pairs. The four pairs mirror the lifecycle flows frozen in
// `galaxy/rtmanager/README.md §Lifecycles`:
//
// - running → stopped: graceful stop, observed Docker exit, or
// reconcile observing an exited container.
// - running → removed: reconcile_dispose when Docker no longer reports
// the container at all.
// - stopped → running: restart and patch inner start steps.
// - stopped → removed: cleanup_container, both the periodic TTL worker
// and the admin DELETE endpoint.
var allowedTransitions = map[transitionKey]struct{}{
{StatusRunning, StatusStopped}: {},
{StatusRunning, StatusRemoved}: {},
{StatusStopped, StatusRunning}: {},
{StatusStopped, StatusRemoved}: {},
}
// AllowedTransitions returns a copy of the `(from, to)` allowed
// transitions table used by Transition. The returned map is safe to
// mutate; callers should not rely on iteration order.
func AllowedTransitions() map[Status][]Status {
result := make(map[Status][]Status)
for key := range allowedTransitions {
result[key.from] = append(result[key.from], key.to)
}
return result
}
// Transition reports whether from may transition to next. The function
// returns nil when the pair is permitted, and an *InvalidTransitionError
// wrapping ErrInvalidTransition otherwise. It does not touch any store
// and is safe to call from any layer.
func Transition(from Status, next Status) error {
if !from.IsKnown() || !next.IsKnown() {
return &InvalidTransitionError{From: from, To: next}
}
if _, ok := allowedTransitions[transitionKey{from: from, to: next}]; !ok {
return &InvalidTransitionError{From: from, To: next}
}
return nil
}
@@ -0,0 +1,88 @@
package runtime
import (
"errors"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestTransitionAllowed(t *testing.T) {
cases := []struct {
from Status
to Status
}{
{StatusRunning, StatusStopped},
{StatusRunning, StatusRemoved},
{StatusStopped, StatusRunning},
{StatusStopped, StatusRemoved},
}
for _, tc := range cases {
assert.NoErrorf(t, Transition(tc.from, tc.to),
"expected %q -> %q allowed", tc.from, tc.to)
}
}
func TestTransitionRejected(t *testing.T) {
cases := []struct {
from Status
to Status
}{
{StatusRemoved, StatusRunning},
{StatusRemoved, StatusStopped},
{StatusRemoved, StatusRemoved},
{StatusRunning, StatusRunning},
{StatusStopped, StatusStopped},
{Status("unknown"), StatusRunning},
{StatusRunning, Status("unknown")},
{Status(""), Status("")},
}
for _, tc := range cases {
err := Transition(tc.from, tc.to)
require.Errorf(t, err, "expected %q -> %q rejected", tc.from, tc.to)
assert.ErrorIs(t, err, ErrInvalidTransition)
var transitionErr *InvalidTransitionError
require.True(t, errors.As(err, &transitionErr),
"expected *InvalidTransitionError for %q -> %q", tc.from, tc.to)
assert.Equal(t, tc.from, transitionErr.From)
assert.Equal(t, tc.to, transitionErr.To)
}
}
func TestAllowedTransitionsReturnsCopy(t *testing.T) {
first := AllowedTransitions()
require.NotEmpty(t, first)
for from := range first {
first[from] = nil
}
second := AllowedTransitions()
assert.NotEmpty(t, second[StatusRunning],
"AllowedTransitions must return an independent map per call")
}
func TestAllowedTransitionsCoversFourPairs(t *testing.T) {
transitions := AllowedTransitions()
assert.ElementsMatch(t,
[]Status{StatusStopped, StatusRemoved},
transitions[StatusRunning],
)
assert.ElementsMatch(t,
[]Status{StatusRunning, StatusRemoved},
transitions[StatusStopped],
)
assert.Empty(t, transitions[StatusRemoved],
"removed has no outgoing transitions")
}
func TestInvalidTransitionErrorMessage(t *testing.T) {
err := &InvalidTransitionError{From: StatusRunning, To: Status("bogus")}
assert.Contains(t, err.Error(), "running")
assert.Contains(t, err.Error(), "bogus")
}
@@ -0,0 +1,43 @@
package logging
import "context"
// requestIDKey is the unexported context key under which the HTTP layer
// stores the request id propagated from the X-Request-Id header.
type requestIDKey struct{}
// WithRequestID returns a child context that carries requestID. A nil ctx
// or an empty requestID returns ctx unchanged so callers do not have to
// branch.
func WithRequestID(ctx context.Context, requestID string) context.Context {
if ctx == nil || requestID == "" {
return ctx
}
return context.WithValue(ctx, requestIDKey{}, requestID)
}
// RequestIDFromContext returns the request id stored on ctx by
// WithRequestID, or an empty string when no value is present.
func RequestIDFromContext(ctx context.Context) string {
if ctx == nil {
return ""
}
value, _ := ctx.Value(requestIDKey{}).(string)
return value
}
// ContextAttrs returns slog key-value pairs that materialise the frozen
// `rtmanager/README.md` §Observability log fields `request_id`,
// `trace_id`, and `span_id` from ctx. Pairs whose value is empty are
// omitted so logs stay tight.
func ContextAttrs(ctx context.Context) []any {
if ctx == nil {
return nil
}
var attrs []any
if requestID := RequestIDFromContext(ctx); requestID != "" {
attrs = append(attrs, "request_id", requestID)
}
attrs = append(attrs, TraceAttrsFromContext(ctx)...)
return attrs
}
@@ -0,0 +1,45 @@
// Package logging configures the Runtime Manager process logger and
// provides context-aware helpers for trace fields.
package logging
import (
"context"
"fmt"
"log/slog"
"os"
"strings"
"go.opentelemetry.io/otel/trace"
)
// New constructs the process-wide JSON logger from level.
func New(level string) (*slog.Logger, error) {
var slogLevel slog.Level
if err := slogLevel.UnmarshalText([]byte(strings.TrimSpace(level))); err != nil {
return nil, fmt.Errorf("build logger: %w", err)
}
return slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
Level: slogLevel,
})), nil
}
// TraceAttrsFromContext returns slog key-value pairs for the active
// OpenTelemetry span when ctx carries a valid span context. The keys match
// the frozen `rtmanager/README.md` §Observability log fields `trace_id`
// and `span_id`.
func TraceAttrsFromContext(ctx context.Context) []any {
if ctx == nil {
return nil
}
spanContext := trace.SpanContextFromContext(ctx)
if !spanContext.IsValid() {
return nil
}
return []any{
"trace_id", spanContext.TraceID().String(),
"span_id", spanContext.SpanID().String(),
}
}
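// Illustrative sketch (not part of this commit): attaching the
// context-derived fields to one log line. The message and the game_id field
// are hypothetical; New, ContextAttrs, and TraceAttrsFromContext are the
// package's real surface.
func logWithContextFields(ctx context.Context, logger *slog.Logger, gameID string) {
	args := append([]any{"game_id", gameID}, ContextAttrs(ctx)...)
	logger.InfoContext(ctx, "start job accepted", args...)
}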
@@ -0,0 +1,336 @@
package ports
import (
"context"
"errors"
"fmt"
"time"
)
// PullPolicy enumerates the supported image pull policies. The value
// set mirrors `config.ImagePullPolicy`; the runtime/wiring layer
// translates between the two so the docker adapter does not import
// `internal/config` and the port package stays free of configuration
// concerns.
type PullPolicy string
// Supported pull policies, frozen by `rtmanager/README.md §Configuration`.
const (
// PullPolicyIfMissing pulls the image only when it is absent from
// the local Docker daemon.
PullPolicyIfMissing PullPolicy = "if_missing"
// PullPolicyAlways pulls the image on every start.
PullPolicyAlways PullPolicy = "always"
// PullPolicyNever skips the pull and fails the start when the image
// is absent.
PullPolicyNever PullPolicy = "never"
)
// IsKnown reports whether policy belongs to the frozen pull-policy
// vocabulary.
func (policy PullPolicy) IsKnown() bool {
switch policy {
case PullPolicyIfMissing, PullPolicyAlways, PullPolicyNever:
return true
default:
return false
}
}
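// Illustrative sketch (not part of this commit): how wiring code might map
// the raw `config.ImagePullPolicy` string onto this port's vocabulary. The
// function name and error text are hypothetical.
func parsePullPolicy(raw string) (PullPolicy, error) {
	policy := PullPolicy(raw)
	if !policy.IsKnown() {
		return "", fmt.Errorf("parse pull policy: %q is unsupported", raw)
	}
	return policy, nil
}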
//go:generate go run go.uber.org/mock/mockgen -destination=../adapters/docker/mocks/mock_dockerclient.go -package=mocks galaxy/rtmanager/internal/ports DockerClient
// DockerClient is the narrow Docker port Runtime Manager uses. The
// production adapter wraps `github.com/docker/docker/client`; service
// tests use a generated mock. The surface intentionally exposes only
// the operations RTM needs; `docker logs` and stream attach are out
// of scope for v1.
type DockerClient interface {
// EnsureNetwork verifies the configured Docker network is present
// on the daemon. It returns ErrNetworkMissing when the network does
// not exist; RTM never creates networks itself.
EnsureNetwork(ctx context.Context, name string) error
	// PullImage pulls ref according to policy. It returns nil on
	// success and a wrapped Docker error otherwise. Under
	// PullPolicyNever implementations skip the pull entirely: they
	// return nil when the image is already present locally and
	// ErrImageNotFound when it is absent.
PullImage(ctx context.Context, ref string, policy PullPolicy) error
// InspectImage returns image metadata for ref. It returns
// ErrImageNotFound when no such image exists locally.
InspectImage(ctx context.Context, ref string) (ImageInspect, error)
// InspectContainer returns container metadata for containerID. It
// returns ErrContainerNotFound when no such container exists.
InspectContainer(ctx context.Context, containerID string) (ContainerInspect, error)
// Run creates and starts one container according to spec. The
// returned RunResult carries the assigned container id, the stable
// engine endpoint, and the wall-clock observed by the daemon.
Run(ctx context.Context, spec RunSpec) (RunResult, error)
// Stop sends SIGTERM to the container followed by SIGKILL after
// timeout. It returns nil when the container exited cleanly and
// ErrContainerNotFound when it is already gone.
Stop(ctx context.Context, containerID string, timeout time.Duration) error
// Remove removes the container. It returns nil when the container
// no longer exists (idempotent removal).
Remove(ctx context.Context, containerID string) error
// List returns container summaries that match filter. Implementations
// translate ListFilter into the appropriate Docker filters argument.
List(ctx context.Context, filter ListFilter) ([]ContainerSummary, error)
// EventsListen subscribes to the Docker events stream and returns
// the decoded event channel together with an asynchronous error
// channel. The caller cancels ctx to terminate the subscription.
// Implementations close events when the subscription terminates.
EventsListen(ctx context.Context) (events <-chan DockerEvent, errs <-chan error, err error)
}
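// Illustrative sketch (not part of this commit): the consumption loop an
// events-listener worker might run around EventsListen. The handle callback
// is hypothetical; the channel semantics follow the doc comment above.
func consumeDockerEvents(ctx context.Context, client DockerClient, handle func(DockerEvent)) error {
	events, errs, err := client.EventsListen(ctx)
	if err != nil {
		return err
	}
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case event, ok := <-events:
			if !ok {
				// Implementations close events when the subscription ends.
				return nil
			}
			handle(event)
		case err, ok := <-errs:
			if !ok {
				errs = nil // closed error channel: keep draining events
				continue
			}
			return err
		}
	}
}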
// RunSpec stores the request shape used by DockerClient.Run.
type RunSpec struct {
// Name stores the container name (typically `galaxy-game-{game_id}`).
Name string
// Image stores the image reference resolved by the producer.
Image string
// Hostname stores the container hostname assigned for the embedded
// Docker DNS to resolve from other containers on the network.
Hostname string
// Network stores the user-defined Docker network the container
// attaches to.
Network string
// Env stores the environment variables forwarded to the container
// (e.g. GAME_STATE_PATH, STORAGE_PATH).
Env map[string]string
// Cmd overrides the entrypoint arguments for the container. Production
// callers leave it nil so the engine image's own CMD runs; tests use
// it to drive a tiny container that does not embed RTM-specific
// behaviour. Empty Cmd means "use image default", which mirrors the
// Docker SDK contract.
Cmd []string
// Labels stores the labels applied to the container so the
// reconciler and the events listener can identify it.
Labels map[string]string
// BindMounts stores the host-to-container bind mounts. RTM uses
// exactly one mount in v1 (the per-game state directory).
BindMounts []BindMount
// LogDriver stores the Docker logging driver name.
LogDriver string
// LogOpts stores the logging-driver options as key=value pairs.
LogOpts map[string]string
// CPUQuota stores the `--cpus` value applied as a resource limit.
CPUQuota float64
// Memory stores the `--memory` value (e.g. `512m`) applied as a
// resource limit.
Memory string
// PIDsLimit stores the `--pids-limit` value.
PIDsLimit int
}
// BindMount stores one host-to-container bind mount.
type BindMount struct {
// HostPath stores the absolute host path bound into the container.
HostPath string
// MountPath stores the absolute in-container path the host
// directory is mounted at.
MountPath string
// ReadOnly mounts the host path read-only when true.
ReadOnly bool
}
// RunResult stores the response shape returned by DockerClient.Run.
type RunResult struct {
// ContainerID identifies the created container.
ContainerID string
// EngineEndpoint stores the stable URL Game Master uses to reach
// the engine container.
EngineEndpoint string
// StartedAt stores the wall-clock the daemon observed for the
// start event.
StartedAt time.Time
}
// ImageInspect stores the subset of `docker image inspect` fields RTM
// reads. Only Labels are required at start time (resource limits live
// there); other fields may be populated when convenient for diagnostics.
type ImageInspect struct {
// Ref stores the image reference the inspection was scoped to.
Ref string
// Labels stores the image-level labels (e.g.
// `com.galaxy.cpu_quota`).
Labels map[string]string
}
// ContainerInspect stores the subset of `docker inspect` fields RTM
// reads from a running or exited container.
type ContainerInspect struct {
// ID identifies the container.
ID string
// ImageRef stores the image reference the container was started
// from.
ImageRef string
// Hostname stores the container hostname.
Hostname string
// Labels stores the container labels assigned at create time.
Labels map[string]string
// Status stores the verbatim Docker `State.Status` value (e.g.
// `running`, `exited`).
Status string
// Health stores the verbatim Docker `State.Health.Status` value
// (e.g. `healthy`, `unhealthy`). Empty when the image declares no
// HEALTHCHECK.
Health string
// RestartCount stores the Docker `RestartCount` observed at
// inspection time.
RestartCount int
// StartedAt stores the daemon-observed start wall-clock.
StartedAt time.Time
// FinishedAt stores the daemon-observed exit wall-clock. Zero when
// the container is still running.
FinishedAt time.Time
// ExitCode stores the exit code reported by the daemon. Zero when
// the container is still running.
ExitCode int
// OOMKilled reports whether the container was killed by the OOM
// killer.
OOMKilled bool
}
// ContainerSummary stores the subset of `docker ps` fields RTM reads.
type ContainerSummary struct {
// ID identifies the container.
ID string
// ImageRef stores the image reference.
ImageRef string
// Hostname stores the container hostname.
Hostname string
// Labels stores the container labels assigned at create time.
Labels map[string]string
// Status stores the verbatim Docker `State.Status` value.
Status string
// StartedAt stores the daemon-observed start wall-clock.
StartedAt time.Time
}
// ListFilter stores the criteria used by DockerClient.List.
type ListFilter struct {
// Labels stores label key=value pairs that must all be present on
// the container. Empty matches every container.
Labels map[string]string
}
// DockerEvent stores one decoded entry from the Docker events stream.
// RTM only consumes container-scoped events.
type DockerEvent struct {
// Action stores the Docker event action verbatim (e.g. `start`,
// `die`, `oom`, `destroy`).
Action string
// ContainerID identifies the container the event refers to.
ContainerID string
// Labels stores the container labels carried by the event
// attributes when present.
Labels map[string]string
// ExitCode stores the exit code attribute when applicable (e.g.
// `die` events). Zero when the action does not carry one.
ExitCode int
// OccurredAt stores the daemon-observed event wall-clock.
OccurredAt time.Time
}
// String returns policy as its stored enum value. Convenient for use in
// log fields and error messages.
func (policy PullPolicy) String() string {
return string(policy)
}
// ErrNetworkMissing reports that the configured Docker network is not
// present on the daemon.
var ErrNetworkMissing = errors.New("docker network missing")
// ErrImageNotFound reports that an image reference does not resolve to
// a local Docker image.
var ErrImageNotFound = errors.New("docker image not found")
// ErrContainerNotFound reports that a container id does not resolve to
// a Docker container.
var ErrContainerNotFound = errors.New("docker container not found")
// Validate reports whether spec carries the structural invariants
// required by DockerClient.Run. Adapters use it as the first defence
// against malformed specs originating in service code.
func (spec RunSpec) Validate() error {
if spec.Name == "" {
return fmt.Errorf("run spec: name must not be empty")
}
if spec.Image == "" {
return fmt.Errorf("run spec: image must not be empty")
}
if spec.Hostname == "" {
return fmt.Errorf("run spec: hostname must not be empty")
}
if spec.Network == "" {
return fmt.Errorf("run spec: network must not be empty")
}
if spec.LogDriver == "" {
return fmt.Errorf("run spec: log driver must not be empty")
}
if spec.CPUQuota <= 0 {
return fmt.Errorf("run spec: cpu quota must be positive")
}
if spec.Memory == "" {
return fmt.Errorf("run spec: memory must not be empty")
}
if spec.PIDsLimit <= 0 {
return fmt.Errorf("run spec: pids limit must be positive")
}
for index, mount := range spec.BindMounts {
if mount.HostPath == "" {
return fmt.Errorf("run spec: bind mounts[%d]: host path must not be empty", index)
}
if mount.MountPath == "" {
return fmt.Errorf("run spec: bind mounts[%d]: mount path must not be empty", index)
}
}
return nil
}
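// Illustrative sketch (not part of this commit): the rough shape of a
// RunSpec a start service might assemble for one game. Every concrete value
// here (label keys, env names, mount paths, limits, log driver) is a
// hypothetical placeholder, not a value frozen by the README.
func exampleRunSpec(gameID, imageRef, statePath, network string) RunSpec {
	name := "galaxy-game-" + gameID
	return RunSpec{
		Name:     name,
		Image:    imageRef,
		Hostname: name,
		Network:  network,
		Env:      map[string]string{"GAME_STATE_PATH": "/data"},
		Labels:   map[string]string{"com.galaxy.owner": "rtmanager"},
		BindMounts: []BindMount{
			{HostPath: statePath, MountPath: "/data"},
		},
		LogDriver: "json-file",
		CPUQuota:  1.0,
		Memory:    "512m",
		PIDsLimit: 256,
	}
}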
@@ -0,0 +1,38 @@
package ports
import (
"context"
"time"
)
// GameLeaseStore guards every lifecycle operation Runtime Manager runs
// against one game. The lease serialises starts, stops, restarts, patches,
// and cleanup operations on the same `game_id` across all entry points
// (Lobby stream consumer, GM REST handler, Admin REST handler, periodic
// workers) so concurrent operations cannot corrupt each other's
// intermediate Docker / PostgreSQL state.
//
// The lease is a per-game key with a random token. Adapters use SETNX with
// PX TTL on TryAcquire and a compare-and-delete on Release so a publisher
// that lost the lease (TTL expiry, replica swap) cannot clear another
// caller's claim.
//
// In v1 the lease is not renewed mid-operation; callers must keep the
// total operation duration below the configured TTL
// (`RTMANAGER_GAME_LEASE_TTL_SECONDS`, default 60s). Multi-GB image pulls
// can exceed this in production and remain a known limitation; later
// stages may introduce a renewal helper if it bites.
type GameLeaseStore interface {
// TryAcquire attempts to acquire the per-game lease for gameID owned
// by token for ttl. It returns true when the lease was acquired and
// false when another holder still owns it. A non-nil error reports
// transport-level failures (Redis unreachable, network timeout) and
// must not be confused with a missed lease.
TryAcquire(ctx context.Context, gameID, token string, ttl time.Duration) (acquired bool, err error)
// Release removes the per-game lease for gameID only when token still
// matches the stored owner value. Releasing a lease the caller no
// longer owns is a silent no-op so a TTL-driven release race never
// clears another caller's claim.
Release(ctx context.Context, gameID, token string) error
}
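// Illustrative sketch (not part of this commit): one way a lifecycle service
// could wrap an operation in the per-game lease. The op callback and the
// token are supplied by the caller; only TryAcquire and Release come from
// this port. Returns false without running op when another holder owns the
// lease.
func withGameLease(
	ctx context.Context,
	lease GameLeaseStore,
	gameID, token string,
	ttl time.Duration,
	op func(context.Context) error,
) (ran bool, err error) {
	acquired, err := lease.TryAcquire(ctx, gameID, token, ttl)
	if err != nil || !acquired {
		return false, err // transport failure or lease held elsewhere
	}
	// Compare-and-delete release: if the TTL expired and another caller
	// re-acquired the lease, this call is a silent no-op.
	defer func() { _ = lease.Release(ctx, gameID, token) }()
	return true, op(ctx)
}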
@@ -0,0 +1,81 @@
package ports
import (
"context"
"encoding/json"
"fmt"
"strings"
"time"
"galaxy/rtmanager/internal/domain/health"
)
// HealthEventPublisher emits one entry on the `runtime:health_events`
// Redis Stream and updates `health_snapshots` with the latest observation
// for the affected game. Adapters publish and snapshot in one call so
// every emission durably advances both surfaces; partial publishes (event
// without snapshot, or vice versa) are not allowed.
//
// The start service emits `container_started` through this port; the
// periodic Docker inspect, the active probe, and the Docker events
// listener publish the rest of the event types through the same port
// without changing its surface.
type HealthEventPublisher interface {
// Publish records envelope on the configured `runtime:health_events`
// stream and upserts the matching `health_snapshots` row. A non-nil
// error reports a transport or storage failure; the caller treats it
// as a degraded emission per `rtmanager/README.md §Notification
// Contracts` (the underlying business state is the source of truth,
// not the event stream).
Publish(ctx context.Context, envelope HealthEventEnvelope) error
}
// HealthEventEnvelope carries the payload published on
// `runtime:health_events`. The fields mirror the AsyncAPI schema frozen
// in `rtmanager/api/runtime-health-asyncapi.yaml`; adapters serialise
// every field verbatim so consumers see the contracted shape.
type HealthEventEnvelope struct {
// GameID identifies the platform game the event refers to.
GameID string
// ContainerID identifies the Docker container observed by the event
// source. May differ from the record's current container id after a
// restart race; consumers are expected to treat the value as the
// observation's container, not the record's.
ContainerID string
// EventType classifies the event per the frozen vocabulary in
// `galaxy/rtmanager/internal/domain/health.EventType`.
EventType health.EventType
// OccurredAt stores the wall-clock at which Runtime Manager observed
// the event. Adapters convert it to UTC milliseconds for the wire
// payload (`occurred_at_ms`).
OccurredAt time.Time
// Details stores the event-type-specific JSON payload. Adapters
// persist and stream it verbatim; nil and empty values are treated as
// the canonical empty-object payload.
Details json.RawMessage
}
// Validate reports whether envelope satisfies the structural invariants
// implied by the AsyncAPI schema.
func (envelope HealthEventEnvelope) Validate() error {
if strings.TrimSpace(envelope.GameID) == "" {
return fmt.Errorf("health event envelope: game id must not be empty")
}
if strings.TrimSpace(envelope.ContainerID) == "" {
return fmt.Errorf("health event envelope: container id must not be empty")
}
if !envelope.EventType.IsKnown() {
return fmt.Errorf("health event envelope: event type %q is unsupported", envelope.EventType)
}
if envelope.OccurredAt.IsZero() {
return fmt.Errorf("health event envelope: occurred at must not be zero")
}
if len(envelope.Details) > 0 && !json.Valid(envelope.Details) {
return fmt.Errorf("health event envelope: details must be valid JSON when non-empty")
}
return nil
}
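// Illustrative sketch (not part of this commit): the envelope a start path
// might publish once a container is observed running. The details payload
// keys are hypothetical; the event type and required fields follow the
// schema mirrored by HealthEventEnvelope.
func exampleStartedEnvelope(gameID, containerID string, now time.Time) HealthEventEnvelope {
	return HealthEventEnvelope{
		GameID:      gameID,
		ContainerID: containerID,
		EventType:   health.EventTypeContainerStarted,
		OccurredAt:  now,
		Details:     json.RawMessage(`{"image_ref":"galaxy/game:1.0.0"}`),
	}
}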
@@ -0,0 +1,22 @@
package ports
import (
"context"
"galaxy/rtmanager/internal/domain/health"
)
// HealthSnapshotStore stores the latest technical-health observation per
// game. Adapters keep one row per game_id; later observations overwrite.
type HealthSnapshotStore interface {
// Upsert installs snapshot as the latest observation for
// snapshot.GameID. Adapters validate snapshot through
// health.HealthSnapshot.Validate before touching the store.
Upsert(ctx context.Context, snapshot health.HealthSnapshot) error
// Get returns the latest snapshot for gameID. It returns
// runtime.ErrNotFound (declared in
// `galaxy/rtmanager/internal/domain/runtime`) when no snapshot has
// been recorded yet.
Get(ctx context.Context, gameID string) (health.HealthSnapshot, error)
}
@@ -0,0 +1,91 @@
package ports
import (
"context"
"fmt"
"strings"
)
// JobResultPublisher emits one entry on the `runtime:job_results` Redis
// Stream per finalised start or stop runtime job. Adapters serialise
// every JobResult field verbatim so consumers (Game Lobby's
// runtime-job-result worker today, future services tomorrow) see the
// AsyncAPI shape frozen in `rtmanager/api/runtime-jobs-asyncapi.yaml`.
//
// The start-jobs and stop-jobs consumers publish through this port.
// The synchronous REST handlers do not — REST callers receive the same
// `Result` shape directly from the service layer.
type JobResultPublisher interface {
// Publish records result on the configured `runtime:job_results`
// stream. A non-nil error reports a transport or serialisation
// failure; the caller treats the failure as a degraded emission
// (the operation_log already records the durable outcome).
Publish(ctx context.Context, result JobResult) error
}
// JobResult outcome values. The set is frozen by the
// `RuntimeJobResultPayload.outcome` enum.
const (
// JobOutcomeSuccess marks a successful start or stop, including the
// idempotent replay variant (`error_code=replay_no_op`).
JobOutcomeSuccess = "success"
// JobOutcomeFailure marks a stable failure for which the payload
// carries a non-empty `error_code`.
JobOutcomeFailure = "failure"
)
// JobResult carries the wire payload published on
// `runtime:job_results`. The fields mirror the AsyncAPI schema frozen
// in `rtmanager/api/runtime-jobs-asyncapi.yaml`; adapters serialise
// every field verbatim so consumers see the contracted shape. Fields
// that are required by the contract (every field on this struct) are
// always present in the wire entry — even when their string value is
// empty (allowed for `container_id` / `engine_endpoint` / `error_code`
// / `error_message` on appropriate variants).
type JobResult struct {
// GameID identifies the platform game the job acted on. Required.
GameID string
// Outcome reports the high-level outcome. Must be `success` or
// `failure` (use the JobOutcome* constants).
Outcome string
// ContainerID stores the Docker container id. Populated on
// `success` for fresh starts and replays; empty on `failure` and
// on `success/replay_no_op` for stop jobs that observed a removed
// record.
ContainerID string
// EngineEndpoint stores the stable engine URL
// `http://galaxy-game-{game_id}:8080`. Populated alongside
// ContainerID, empty in the same cases.
EngineEndpoint string
// ErrorCode stores the stable error code from
// `rtmanager/README.md §Error Model`. Empty for fresh successes,
// `replay_no_op` for idempotent replays, one of the failure
// codes otherwise.
ErrorCode string
// ErrorMessage stores the operator-readable detail. Empty for
// successes; populated alongside ErrorCode on failure.
ErrorMessage string
}
// Validate reports whether result satisfies the structural invariants
// implied by the AsyncAPI schema: a non-empty game id and one of the
// two known outcome values. The remaining fields are required to be
// present on the wire but may be empty strings, so Validate does not
// constrain them.
func (result JobResult) Validate() error {
if strings.TrimSpace(result.GameID) == "" {
return fmt.Errorf("job result: game id must not be empty")
}
switch result.Outcome {
case JobOutcomeSuccess, JobOutcomeFailure:
return nil
default:
return fmt.Errorf("job result: outcome %q is unsupported", result.Outcome)
}
}
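// Illustrative sketch (not part of this commit): the result a start-jobs
// consumer might publish for an idempotent replay, per the field comments
// above. The concrete endpoint value comes from the runtime record; the
// arguments here are placeholders.
func exampleReplayResult(gameID, containerID, endpoint string) JobResult {
	return JobResult{
		GameID:         gameID,
		Outcome:        JobOutcomeSuccess,
		ContainerID:    containerID,
		EngineEndpoint: endpoint,
		ErrorCode:      "replay_no_op",
	}
}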
@@ -0,0 +1,47 @@
package ports
import (
"context"
"errors"
)
// LobbyInternalClient is the synchronous trusted-REST port Runtime
// Manager uses to read ancillary game metadata from Game Lobby. Stage
// 13 calls GetGame purely for diagnostic context; the start envelope
// already carries the only required field (`image_ref`) so a
// LobbyInternalClient failure must not abort the start operation.
type LobbyInternalClient interface {
// GetGame returns the Lobby game record for gameID. It returns
// ErrLobbyGameNotFound when no record exists and ErrLobbyUnavailable
// for transport / timeout / non-2xx responses.
GetGame(ctx context.Context, gameID string) (LobbyGameRecord, error)
}
// LobbyGameRecord stores the subset of the Lobby `GameRecord` schema
// Runtime Manager uses. The shape is intentionally minimal: this fetch
// is ancillary diagnostics and v1 has no required field. The struct
// may be extended additively without breaking existing callers.
type LobbyGameRecord struct {
// GameID identifies the platform game.
GameID string
// Status stores the verbatim Lobby status string (e.g. `starting`,
// `running`, `paused`). Runtime Manager does not interpret it; it
// is exposed for log enrichment and diagnostics only.
Status string
// TargetEngineVersion stores the semver of the engine version Lobby
// resolved into the start envelope's image_ref. Empty when Lobby
// did not return one.
TargetEngineVersion string
}
// ErrLobbyGameNotFound reports that the Lobby internal API returned 404
// for the requested game id.
var ErrLobbyGameNotFound = errors.New("lobby game not found")
// ErrLobbyUnavailable reports that the Lobby internal API could not be
// reached (transport error, timeout, non-2xx response). Callers must
// treat the failure as recoverable: Runtime Manager continues the
// operation when the call is purely diagnostic.
var ErrLobbyUnavailable = errors.New("lobby internal api unavailable")
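// Illustrative sketch (not part of this commit): how the start path might
// use GetGame strictly for log enrichment. Degrading to an empty status on
// any error is an assumption drawn from the doc comment above.
func lobbyStatusForLogs(ctx context.Context, client LobbyInternalClient, gameID string) string {
	record, err := client.GetGame(ctx, gameID)
	if err != nil {
		// Both ErrLobbyGameNotFound and ErrLobbyUnavailable are tolerated:
		// the fetch is ancillary and must not abort the operation.
		return ""
	}
	return record.Status
}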
@@ -0,0 +1,25 @@
package ports
import (
"context"
"galaxy/notificationintent"
)
// NotificationIntentPublisher is the producer port Runtime Manager uses
// to publish admin-only notification intents to Notification Service.
// The production adapter is a thin wrapper around
// `notificationintent.Publisher`; the wrapper drops the entry id
// returned by the underlying publisher because Runtime Manager does
// not track per-intent ids in v1.
//
// A failed Publish call is a notification degradation per
// `galaxy/rtmanager/README.md §Notification Contracts` and must not roll
// back already committed business state. Callers log the error and
// proceed.
type NotificationIntentPublisher interface {
// Publish normalises intent and appends it to the configured Redis
// Stream. Validation failures and transport errors are returned
// verbatim.
Publish(ctx context.Context, intent notificationintent.Intent) error
}
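// Illustrative call-site sketch (an assumption; `logger` is a hypothetical
// caller-side slog.Logger): a failed Publish is logged and swallowed so
// committed business state is never rolled back.
//
//	if err := notifications.Publish(ctx, intent); err != nil {
//		logger.WarnContext(ctx, "publish notification intent", "err", err.Error())
//		// proceed: this is a notification degradation only
//	}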
@@ -0,0 +1,23 @@
package ports
import (
"context"
"galaxy/rtmanager/internal/domain/operation"
)
// OperationLogStore stores append-only audit entries for every
// lifecycle operation Runtime Manager performed against a game's
// runtime. Adapters must persist entry verbatim and return the
// generated bigserial id from Append.
type OperationLogStore interface {
// Append inserts entry into the operation log and returns the
// generated bigserial id. Adapters validate entry through
// operation.OperationEntry.Validate before touching the store.
Append(ctx context.Context, entry operation.OperationEntry) (id int64, err error)
// ListByGame returns the most recent entries for gameID, ordered by
// started_at descending and capped by limit. A non-positive limit
// is rejected as invalid input by adapters.
ListByGame(ctx context.Context, gameID string, limit int) ([]operation.OperationEntry, error)
}
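// Illustrative call-site sketch (mirrors the best-effort append used by the
// lifecycle services in this commit; `logs` and `logger` are hypothetical
// caller variables): an Append failure is logged, never surfaced as a
// business failure.
//
//	if _, err := logs.Append(ctx, entry); err != nil {
//		logger.ErrorContext(ctx, "append operation log", "err", err.Error())
//	}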
@@ -0,0 +1,112 @@
// Package ports defines the stable interfaces that connect Runtime
// Manager use cases to external state and external services.
package ports
import (
"context"
"fmt"
"strings"
"time"
"galaxy/rtmanager/internal/domain/runtime"
)
// RuntimeRecordStore stores runtime records and exposes the operations
// used by the service layer (Stages 13+) and the workers (Stages 15-18).
// Adapters must preserve domain semantics:
//
// - Get returns runtime.ErrNotFound when no record exists for gameID.
// - Upsert installs a record verbatim; the caller is responsible for
// domain validation through runtime.RuntimeRecord.Validate.
// - UpdateStatus applies one transition through a compare-and-swap
// guard on (status, current_container_id) and returns
// runtime.ErrConflict on a stale CAS.
// - List returns every record currently stored, regardless of status.
// - ListByStatus returns every record currently indexed under status.
type RuntimeRecordStore interface {
// Get returns the record identified by gameID. It returns
// runtime.ErrNotFound when no record exists.
Get(ctx context.Context, gameID string) (runtime.RuntimeRecord, error)
// Upsert inserts record when no row exists for record.GameID and
// otherwise overwrites every column verbatim. The start service uses
// Upsert to install fresh records on start, the inner start of
// restart and patch, and the reconcile_adopt path.
Upsert(ctx context.Context, record runtime.RuntimeRecord) error
// UpdateStatus applies one status transition in a compare-and-swap
// fashion. The adapter must first call runtime.Transition to reject
// invalid pairs without touching the store, then verify that the
// stored status equals input.ExpectedFrom, and (when
// input.ExpectedContainerID is non-empty) that the stored
// current_container_id equals it. The adapter derives stopped_at /
// removed_at and updates last_op_at from input.Now per the
// destination status.
UpdateStatus(ctx context.Context, input UpdateStatusInput) error
// List returns every runtime record currently stored. Used by the
// internal REST list endpoint; the v1 working set is bounded by the
// games tracked by Lobby and is small enough to return in one
// response (pagination is not supported). The order is
// adapter-defined; callers may reorder as needed.
List(ctx context.Context) ([]runtime.RuntimeRecord, error)
// ListByStatus returns every record currently indexed under status.
// The order is adapter-defined; callers may reorder as needed.
ListByStatus(ctx context.Context, status runtime.Status) ([]runtime.RuntimeRecord, error)
}
// UpdateStatusInput stores the arguments required to apply one status
// transition through a RuntimeRecordStore. The adapter is responsible
// for translating the destination status into the matching column
// updates (stopped_at / removed_at / current_container_id NULLing) and
// for the CAS guard.
type UpdateStatusInput struct {
// GameID identifies the record to mutate.
GameID string
// ExpectedFrom stores the status the caller believes the record
// currently has. A mismatch results in runtime.ErrConflict.
ExpectedFrom runtime.Status
// ExpectedContainerID is an optional CAS guard. When non-empty, the
// adapter rejects the update with runtime.ErrConflict if the stored
// current_container_id does not equal it. Used by stop / cleanup /
// reconcile to protect against concurrent restart races. Empty
// disables the container-id CAS while keeping the status CAS.
ExpectedContainerID string
// To stores the destination status.
To runtime.Status
// Now stores the wall-clock used to derive stopped_at / removed_at
// and last_op_at depending on To.
Now time.Time
}
// Validate reports whether input contains a structurally valid status
// transition request. Adapters call Validate before touching the store.
func (input UpdateStatusInput) Validate() error {
if strings.TrimSpace(input.GameID) == "" {
return fmt.Errorf("update runtime status: game id must not be empty")
}
if !input.ExpectedFrom.IsKnown() {
return fmt.Errorf(
"update runtime status: expected from status %q is unsupported",
input.ExpectedFrom,
)
}
if !input.To.IsKnown() {
return fmt.Errorf(
"update runtime status: to status %q is unsupported",
input.To,
)
}
if err := runtime.Transition(input.ExpectedFrom, input.To); err != nil {
return fmt.Errorf("update runtime status: %w", err)
}
if input.Now.IsZero() {
return fmt.Errorf("update runtime status: now must not be zero")
}
return nil
}
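// Illustrative input sketch (an assumption about a typical stop transition;
// `store` is a hypothetical adapter): the container-id CAS guards against a
// concurrent restart swapping the container underneath the caller.
//
//	input := UpdateStatusInput{
//		GameID:              "game-1",
//		ExpectedFrom:        runtime.StatusRunning,
//		ExpectedContainerID: "ctr-old",
//		To:                  runtime.StatusStopped,
//		Now:                 time.Now().UTC(),
//	}
//	if err := input.Validate(); err != nil {
//		// reject before touching the store
//	}
//	err := store.UpdateStatus(ctx, input) // runtime.ErrConflict on a stale CAS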
@@ -0,0 +1,70 @@
package ports
import (
"errors"
"testing"
"time"
"galaxy/rtmanager/internal/domain/runtime"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func validUpdateStatusInput() UpdateStatusInput {
return UpdateStatusInput{
GameID: "game-test",
ExpectedFrom: runtime.StatusRunning,
ExpectedContainerID: "container-1",
To: runtime.StatusStopped,
Now: time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC),
}
}
func TestUpdateStatusInputValidateHappy(t *testing.T) {
require.NoError(t, validUpdateStatusInput().Validate())
}
func TestUpdateStatusInputValidateAcceptsEmptyContainerCAS(t *testing.T) {
input := validUpdateStatusInput()
input.ExpectedContainerID = ""
assert.NoError(t, input.Validate())
}
func TestUpdateStatusInputValidateRejects(t *testing.T) {
tests := []struct {
name string
mutate func(*UpdateStatusInput)
}{
{"empty game id", func(i *UpdateStatusInput) { i.GameID = "" }},
{"unknown expected from", func(i *UpdateStatusInput) {
i.ExpectedFrom = "exotic"
}},
{"unknown to", func(i *UpdateStatusInput) {
i.To = "exotic"
}},
{"zero now", func(i *UpdateStatusInput) {
i.Now = time.Time{}
}},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
input := validUpdateStatusInput()
tt.mutate(&input)
assert.Error(t, input.Validate())
})
}
}
func TestUpdateStatusInputValidateRejectsForbiddenTransition(t *testing.T) {
input := validUpdateStatusInput()
input.ExpectedFrom = runtime.StatusRemoved
input.To = runtime.StatusRunning
err := input.Validate()
require.Error(t, err)
assert.True(t, errors.Is(err, runtime.ErrInvalidTransition),
"want runtime.ErrInvalidTransition, got %v", err)
}
@@ -0,0 +1,23 @@
package ports
import "context"
// StreamOffsetStore persists the last successfully processed Redis
// Stream entry id per consumer label. Workers call Load on startup to
// resume from the persisted offset and Save after every successfully
// handled message so the next iteration advances past the
// just-processed entry. The label is the short logical identifier of
// the consumer (e.g. `start_jobs`, `stop_jobs`), not the full stream
// name; it stays stable when the underlying stream key is renamed.
type StreamOffsetStore interface {
// Load returns the last processed entry id for the consumer
// labelled by stream when one is stored. The boolean return reports
// whether a value was present; implementations must not return an
// error for a missing key.
Load(ctx context.Context, stream string) (entryID string, found bool, err error)
// Save stores entryID as the new last processed offset for the
// consumer labelled by stream. Implementations overwrite any previous
// value unconditionally.
Save(ctx context.Context, stream, entryID string) error
}
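// Illustrative worker-loop sketch (an assumption about consumer usage;
// `offsets` and `entryID` are hypothetical worker variables): resume from
// the persisted offset, then advance it after every handled entry.
//
//	offset, found, err := offsets.Load(ctx, "start_jobs")
//	if err != nil {
//		// back off and retry; do not start consuming blind
//	}
//	if !found {
//		offset = "0" // assumption: read the stream from the beginning
//	}
//	// ... read entries after offset, handle one, then persist the new offset:
//	if err := offsets.Save(ctx, "start_jobs", entryID); err != nil {
//		// log; the entry may be redelivered on the next iteration
//	}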
@@ -0,0 +1,442 @@
// Package cleanupcontainer implements the `cleanup_container` lifecycle
// operation owned by Runtime Manager. The service removes the Docker
// container of an already-stopped runtime and transitions the record
// to `removed`. It refuses to operate on a still-running runtime —
// callers must stop first.
//
// Two callers exercise this surface: the administrative
// `DELETE /api/v1/internal/runtimes/{game_id}/container` endpoint, and
// the periodic container-cleanup worker that walks
// `runtime_records.status='stopped'` rows older than
// `RTMANAGER_CONTAINER_RETENTION_DAYS`. Both paths flow through Handle.
//
// Lifecycle and failure-mode semantics follow `rtmanager/README.md
// §Lifecycles → Cleanup`. Design rationale is captured in
// `rtmanager/docs/services.md`.
package cleanupcontainer
import (
"context"
"crypto/rand"
"encoding/base64"
"errors"
"fmt"
"log/slog"
"strings"
"time"
"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/logging"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/startruntime"
"galaxy/rtmanager/internal/telemetry"
)
const leaseReleaseTimeout = 5 * time.Second
// Input stores the per-call arguments for one cleanup operation.
type Input struct {
// GameID identifies the platform game whose container is removed.
GameID string
// OpSource classifies how the request entered Runtime Manager.
// Required: every operation_log entry carries an op_source.
OpSource operation.OpSource
// SourceRef stores the optional opaque per-source reference (REST
// request id, admin user id). Empty for the periodic auto-TTL
// caller.
SourceRef string
}
// Validate reports whether input carries the structural invariants the
// service requires.
func (input Input) Validate() error {
if strings.TrimSpace(input.GameID) == "" {
return fmt.Errorf("game id must not be empty")
}
if !input.OpSource.IsKnown() {
return fmt.Errorf("op source %q is unsupported", input.OpSource)
}
return nil
}
// Result stores the deterministic outcome of one Handle call.
type Result struct {
// Record carries the updated runtime record on success and on
// idempotent replay; zero on failure.
Record runtime.RuntimeRecord
// Outcome reports whether the operation completed (success) or
// produced a stable failure code.
Outcome operation.Outcome
// ErrorCode stores the stable error code on failure, or
// `replay_no_op` on idempotent replay. Empty for fresh successes.
ErrorCode string
// ErrorMessage stores the operator-readable detail on failure.
ErrorMessage string
}
// Dependencies groups the collaborators required by Service.
type Dependencies struct {
RuntimeRecords ports.RuntimeRecordStore
OperationLogs ports.OperationLogStore
Docker ports.DockerClient
Leases ports.GameLeaseStore
Coordination config.CoordinationConfig
Telemetry *telemetry.Runtime
Logger *slog.Logger
Clock func() time.Time
NewToken func() string
}
// Service executes the cleanup_container lifecycle operation.
type Service struct {
runtimeRecords ports.RuntimeRecordStore
operationLogs ports.OperationLogStore
docker ports.DockerClient
leases ports.GameLeaseStore
leaseTTL time.Duration
telemetry *telemetry.Runtime
logger *slog.Logger
clock func() time.Time
newToken func() string
}
// NewService constructs one Service from deps.
func NewService(deps Dependencies) (*Service, error) {
switch {
case deps.RuntimeRecords == nil:
return nil, errors.New("new cleanup container service: nil runtime records")
case deps.OperationLogs == nil:
return nil, errors.New("new cleanup container service: nil operation logs")
case deps.Docker == nil:
return nil, errors.New("new cleanup container service: nil docker client")
case deps.Leases == nil:
return nil, errors.New("new cleanup container service: nil lease store")
case deps.Telemetry == nil:
return nil, errors.New("new cleanup container service: nil telemetry runtime")
}
if err := deps.Coordination.Validate(); err != nil {
return nil, fmt.Errorf("new cleanup container service: coordination config: %w", err)
}
clock := deps.Clock
if clock == nil {
clock = time.Now
}
logger := deps.Logger
if logger == nil {
logger = slog.Default()
}
logger = logger.With("service", "rtmanager.cleanupcontainer")
newToken := deps.NewToken
if newToken == nil {
newToken = defaultTokenGenerator()
}
return &Service{
runtimeRecords: deps.RuntimeRecords,
operationLogs: deps.OperationLogs,
docker: deps.Docker,
leases: deps.Leases,
leaseTTL: deps.Coordination.GameLeaseTTL,
telemetry: deps.Telemetry,
logger: logger,
clock: clock,
newToken: newToken,
}, nil
}
// Handle executes one cleanup operation end-to-end. The Go-level error
// return is reserved for non-business failures (nil context, nil
// receiver). Every business outcome — success, idempotent replay, or
// any of the stable failure modes — flows through Result.
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
if service == nil {
return Result{}, errors.New("cleanup container: nil service")
}
if ctx == nil {
return Result{}, errors.New("cleanup container: nil context")
}
opStartedAt := service.clock().UTC()
if err := input.Validate(); err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInvalidRequest,
errorMessage: err.Error(),
}), nil
}
token := service.newToken()
leaseStart := service.clock()
acquired, err := service.leases.TryAcquire(ctx, input.GameID, token, service.leaseTTL)
service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(leaseStart))
if err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeServiceUnavailable,
errorMessage: fmt.Sprintf("acquire game lease: %s", err.Error()),
}), nil
}
if !acquired {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeConflict,
errorMessage: "another lifecycle operation is in progress for this game",
}), nil
}
defer service.releaseLease(ctx, input.GameID, token)
return service.runUnderLease(ctx, input, opStartedAt)
}
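// Illustrative call-site sketch (an assumption about the REST handler and the
// retention worker, neither of which lives in this file): the Go error is
// checked first, then the business outcome is read from Result.
//
//	result, err := service.Handle(ctx, Input{
//		GameID:   gameID,
//		OpSource: operation.OpSourceAdminRest,
//	})
//	if err != nil {
//		// programming error only (nil context / nil service)
//	}
//	if result.Outcome == operation.OutcomeFailure {
//		// map result.ErrorCode to the caller's own protocol
//	}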
// runUnderLease executes the lease-protected cleanup steps.
func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) {
existing, err := service.runtimeRecords.Get(ctx, input.GameID)
if errors.Is(err, runtime.ErrNotFound) {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeNotFound,
errorMessage: fmt.Sprintf("runtime record for game %q does not exist", input.GameID),
}), nil
}
if err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInternal,
errorMessage: fmt.Sprintf("load runtime record: %s", err.Error()),
}), nil
}
switch existing.Status {
case runtime.StatusRemoved:
return service.recordReplayNoOp(ctx, opStartedAt, input, existing), nil
case runtime.StatusRunning:
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeConflict,
errorMessage: fmt.Sprintf("runtime for game %q is running; stop the runtime first", input.GameID),
containerID: existing.CurrentContainerID,
imageRef: existing.CurrentImageRef,
}), nil
case runtime.StatusStopped:
// proceed
default:
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInternal,
errorMessage: fmt.Sprintf("runtime record has unsupported status %q", existing.Status),
}), nil
}
if existing.CurrentContainerID != "" {
if err := service.docker.Remove(ctx, existing.CurrentContainerID); err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeServiceUnavailable,
errorMessage: fmt.Sprintf("docker remove: %s", err.Error()),
containerID: existing.CurrentContainerID,
imageRef: existing.CurrentImageRef,
}), nil
}
}
updateNow := service.clock().UTC()
err = service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
GameID: input.GameID,
ExpectedFrom: runtime.StatusStopped,
ExpectedContainerID: existing.CurrentContainerID,
To: runtime.StatusRemoved,
Now: updateNow,
})
if errors.Is(err, runtime.ErrConflict) {
// CAS race: another caller (reconciler dispose, concurrent admin)
// already moved the record. The desired terminal state was
// reached by another path.
return service.recordReplayNoOp(ctx, opStartedAt, input, existing), nil
}
if errors.Is(err, runtime.ErrNotFound) {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeNotFound,
errorMessage: fmt.Sprintf("runtime record for game %q vanished mid-cleanup", input.GameID),
containerID: existing.CurrentContainerID,
imageRef: existing.CurrentImageRef,
}), nil
}
if err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInternal,
errorMessage: fmt.Sprintf("update runtime status: %s", err.Error()),
containerID: existing.CurrentContainerID,
imageRef: existing.CurrentImageRef,
}), nil
}
finishedAt := service.clock().UTC()
service.bestEffortAppend(ctx, operation.OperationEntry{
GameID: input.GameID,
OpKind: operation.OpKindCleanupContainer,
OpSource: input.OpSource,
SourceRef: input.SourceRef,
ImageRef: existing.CurrentImageRef,
ContainerID: existing.CurrentContainerID,
Outcome: operation.OutcomeSuccess,
StartedAt: opStartedAt,
FinishedAt: &finishedAt,
})
service.telemetry.RecordCleanupOutcome(ctx, string(operation.OutcomeSuccess), string(input.OpSource))
record := existing
record.Status = runtime.StatusRemoved
record.CurrentContainerID = ""
removedAt := updateNow
record.RemovedAt = &removedAt
record.LastOpAt = updateNow
logArgs := []any{
"game_id", input.GameID,
"container_id", existing.CurrentContainerID,
"op_source", string(input.OpSource),
}
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
service.logger.InfoContext(ctx, "runtime container cleaned up", logArgs...)
return Result{
Record: record,
Outcome: operation.OutcomeSuccess,
}, nil
}
// recordReplayNoOp records the idempotent replay outcome and returns the
// existing record unchanged.
func (service *Service) recordReplayNoOp(ctx context.Context, opStartedAt time.Time, input Input, existing runtime.RuntimeRecord) Result {
finishedAt := service.clock().UTC()
service.bestEffortAppend(ctx, operation.OperationEntry{
GameID: input.GameID,
OpKind: operation.OpKindCleanupContainer,
OpSource: input.OpSource,
SourceRef: input.SourceRef,
ImageRef: existing.CurrentImageRef,
ContainerID: existing.CurrentContainerID,
Outcome: operation.OutcomeSuccess,
ErrorCode: startruntime.ErrorCodeReplayNoOp,
StartedAt: opStartedAt,
FinishedAt: &finishedAt,
})
service.telemetry.RecordCleanupOutcome(ctx, string(operation.OutcomeSuccess), string(input.OpSource))
logArgs := []any{
"game_id", input.GameID,
"container_id", existing.CurrentContainerID,
"op_source", string(input.OpSource),
}
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
service.logger.InfoContext(ctx, "runtime cleanup replay no-op", logArgs...)
return Result{
Record: existing,
Outcome: operation.OutcomeSuccess,
ErrorCode: startruntime.ErrorCodeReplayNoOp,
}
}
// failureCtx groups the inputs to recordFailure.
type failureCtx struct {
opStartedAt time.Time
input Input
errorCode string
errorMessage string
containerID string
imageRef string
}
func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result {
finishedAt := service.clock().UTC()
service.bestEffortAppend(ctx, operation.OperationEntry{
GameID: fc.input.GameID,
OpKind: operation.OpKindCleanupContainer,
OpSource: fc.input.OpSource,
SourceRef: fc.input.SourceRef,
ImageRef: fc.imageRef,
ContainerID: fc.containerID,
Outcome: operation.OutcomeFailure,
ErrorCode: fc.errorCode,
ErrorMessage: fc.errorMessage,
StartedAt: fc.opStartedAt,
FinishedAt: &finishedAt,
})
service.telemetry.RecordCleanupOutcome(ctx, string(operation.OutcomeFailure), string(fc.input.OpSource))
logArgs := []any{
"game_id", fc.input.GameID,
"op_source", string(fc.input.OpSource),
"error_code", fc.errorCode,
"error_message", fc.errorMessage,
}
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
service.logger.WarnContext(ctx, "runtime cleanup failed", logArgs...)
return Result{
Outcome: operation.OutcomeFailure,
ErrorCode: fc.errorCode,
ErrorMessage: fc.errorMessage,
}
}
func (service *Service) releaseLease(ctx context.Context, gameID, token string) {
cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
defer cancel()
if err := service.leases.Release(cleanupCtx, gameID, token); err != nil {
service.logger.WarnContext(ctx, "release game lease",
"game_id", gameID,
"err", err.Error(),
)
}
}
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
if _, err := service.operationLogs.Append(ctx, entry); err != nil {
service.logger.ErrorContext(ctx, "append operation log",
"game_id", entry.GameID,
"op_kind", string(entry.OpKind),
"outcome", string(entry.Outcome),
"error_code", entry.ErrorCode,
"err", err.Error(),
)
}
}
func defaultTokenGenerator() func() string {
return func() string {
var buf [32]byte
if _, err := rand.Read(buf[:]); err != nil {
return "rtmanager-fallback-token"
}
return base64.RawURLEncoding.EncodeToString(buf[:])
}
}
@@ -0,0 +1,382 @@
package cleanupcontainer_test
import (
"context"
"errors"
"sync"
"testing"
"time"
"galaxy/rtmanager/internal/adapters/docker/mocks"
"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/cleanupcontainer"
"galaxy/rtmanager/internal/service/startruntime"
"galaxy/rtmanager/internal/telemetry"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/mock/gomock"
)
// --- shared fake doubles ----------------------------------------------
type fakeRuntimeRecords struct {
mu sync.Mutex
stored map[string]runtime.RuntimeRecord
getErr error
updateStatusErr error
updates []ports.UpdateStatusInput
}
func newFakeRuntimeRecords() *fakeRuntimeRecords {
return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}}
}
func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
s.mu.Lock()
defer s.mu.Unlock()
if s.getErr != nil {
return runtime.RuntimeRecord{}, s.getErr
}
record, ok := s.stored[gameID]
if !ok {
return runtime.RuntimeRecord{}, runtime.ErrNotFound
}
return record, nil
}
func (s *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error {
return errors.New("not used in cleanup tests")
}
func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, input ports.UpdateStatusInput) error {
s.mu.Lock()
defer s.mu.Unlock()
s.updates = append(s.updates, input)
if s.updateStatusErr != nil {
return s.updateStatusErr
}
record, ok := s.stored[input.GameID]
if !ok {
return runtime.ErrNotFound
}
if record.Status != input.ExpectedFrom {
return runtime.ErrConflict
}
if input.ExpectedContainerID != "" && record.CurrentContainerID != input.ExpectedContainerID {
return runtime.ErrConflict
}
record.Status = input.To
record.LastOpAt = input.Now
if input.To == runtime.StatusRemoved {
removedAt := input.Now
record.RemovedAt = &removedAt
record.CurrentContainerID = ""
}
s.stored[input.GameID] = record
return nil
}
func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) {
return nil, errors.New("not used in cleanup tests")
}
func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
return nil, errors.New("not used in cleanup tests")
}
type fakeOperationLogs struct {
mu sync.Mutex
appendErr error
appends []operation.OperationEntry
}
func (s *fakeOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) {
s.mu.Lock()
defer s.mu.Unlock()
if s.appendErr != nil {
return 0, s.appendErr
}
s.appends = append(s.appends, entry)
return int64(len(s.appends)), nil
}
func (s *fakeOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) {
return nil, errors.New("not used in cleanup tests")
}
func (s *fakeOperationLogs) lastAppend() (operation.OperationEntry, bool) {
s.mu.Lock()
defer s.mu.Unlock()
if len(s.appends) == 0 {
return operation.OperationEntry{}, false
}
return s.appends[len(s.appends)-1], true
}
type fakeLeases struct {
mu sync.Mutex
acquired bool
acquireErr error
releaseErr error
acquires []string
releases []string
}
func (l *fakeLeases) TryAcquire(_ context.Context, _, token string, _ time.Duration) (bool, error) {
l.mu.Lock()
defer l.mu.Unlock()
l.acquires = append(l.acquires, token)
if l.acquireErr != nil {
return false, l.acquireErr
}
return l.acquired, nil
}
func (l *fakeLeases) Release(_ context.Context, _, token string) error {
l.mu.Lock()
defer l.mu.Unlock()
l.releases = append(l.releases, token)
return l.releaseErr
}
// --- harness ----------------------------------------------------------
type harness struct {
records *fakeRuntimeRecords
operationLogs *fakeOperationLogs
docker *mocks.MockDockerClient
leases *fakeLeases
telemetry *telemetry.Runtime
now time.Time
}
func newHarness(t *testing.T) *harness {
t.Helper()
ctrl := gomock.NewController(t)
t.Cleanup(ctrl.Finish)
telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
require.NoError(t, err)
return &harness{
records: newFakeRuntimeRecords(),
operationLogs: &fakeOperationLogs{},
docker: mocks.NewMockDockerClient(ctrl),
leases: &fakeLeases{acquired: true},
telemetry: telemetryRuntime,
now: time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC),
}
}
func (h *harness) build(t *testing.T) *cleanupcontainer.Service {
t.Helper()
service, err := cleanupcontainer.NewService(cleanupcontainer.Dependencies{
RuntimeRecords: h.records,
OperationLogs: h.operationLogs,
Docker: h.docker,
Leases: h.leases,
Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
Telemetry: h.telemetry,
Clock: func() time.Time { return h.now },
NewToken: func() string { return "token-A" },
})
require.NoError(t, err)
return service
}
func basicInput() cleanupcontainer.Input {
return cleanupcontainer.Input{
GameID: "game-1",
OpSource: operation.OpSourceAdminRest,
SourceRef: "rest-cleanup-1",
}
}
func stoppedRecord(now time.Time) runtime.RuntimeRecord {
startedAt := now.Add(-2 * time.Hour)
stoppedAt := now.Add(-time.Hour)
return runtime.RuntimeRecord{
GameID: "game-1",
Status: runtime.StatusStopped,
CurrentContainerID: "ctr-old",
CurrentImageRef: "registry.example.com/galaxy/game:1.4.7",
EngineEndpoint: "http://galaxy-game-game-1:8080",
StatePath: "/var/lib/galaxy/games/game-1",
DockerNetwork: "galaxy-net",
StartedAt: &startedAt,
StoppedAt: &stoppedAt,
LastOpAt: stoppedAt,
CreatedAt: startedAt,
}
}
// --- happy path -----------------------------------------------------
func TestHandleCleanupHappyPath(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = stoppedRecord(h.now)
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
assert.Empty(t, result.ErrorCode)
assert.Equal(t, runtime.StatusRemoved, result.Record.Status)
assert.Empty(t, result.Record.CurrentContainerID)
require.Len(t, h.records.updates, 1)
assert.Equal(t, runtime.StatusStopped, h.records.updates[0].ExpectedFrom)
assert.Equal(t, runtime.StatusRemoved, h.records.updates[0].To)
require.Len(t, h.operationLogs.appends, 1)
last, _ := h.operationLogs.lastAppend()
assert.Equal(t, operation.OpKindCleanupContainer, last.OpKind)
assert.Equal(t, operation.OutcomeSuccess, last.Outcome)
assert.Empty(t, last.ErrorCode)
}
// --- replay ---------------------------------------------------------
func TestHandleReplayNoOpForRemovedRecord(t *testing.T) {
h := newHarness(t)
removed := stoppedRecord(h.now)
removed.Status = runtime.StatusRemoved
removed.CurrentContainerID = ""
removedAt := h.now.Add(-30 * time.Minute)
removed.RemovedAt = &removedAt
h.records.stored["game-1"] = removed
service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeReplayNoOp, result.ErrorCode)
assert.Empty(t, h.records.updates)
last, _ := h.operationLogs.lastAppend()
assert.Equal(t, startruntime.ErrorCodeReplayNoOp, last.ErrorCode)
}
func TestHandleReplayNoOpOnUpdateStatusConflict(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = stoppedRecord(h.now)
h.records.updateStatusErr = runtime.ErrConflict
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeReplayNoOp, result.ErrorCode)
}
// --- failure paths --------------------------------------------------
func TestHandleConflictOnRunningRecord(t *testing.T) {
h := newHarness(t)
running := stoppedRecord(h.now)
running.Status = runtime.StatusRunning
startedAt := h.now.Add(-time.Hour)
running.StartedAt = &startedAt
running.StoppedAt = nil
h.records.stored["game-1"] = running
service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)
assert.Contains(t, result.ErrorMessage, "stop the runtime first")
}
func TestHandleNotFoundForMissingRecord(t *testing.T) {
h := newHarness(t)
service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeNotFound, result.ErrorCode)
}
func TestHandleServiceUnavailableOnDockerRemoveFailure(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = stoppedRecord(h.now)
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(errors.New("disk i/o"))
service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode)
assert.Empty(t, h.records.updates, "no record mutation on docker remove failure")
}
func TestHandleInternalErrorOnGenericUpdateError(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = stoppedRecord(h.now)
h.records.updateStatusErr = errors.New("postgres down")
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeInternal, result.ErrorCode)
}
func TestHandleConflictWhenLeaseBusy(t *testing.T) {
h := newHarness(t)
h.leases.acquired = false
service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)
}
// --- input validation ----------------------------------------------
func TestHandleRejectsInvalidInput(t *testing.T) {
h := newHarness(t)
service := h.build(t)
cases := []cleanupcontainer.Input{
{GameID: "", OpSource: operation.OpSourceAdminRest},
{GameID: "g", OpSource: operation.OpSource("bogus")},
}
for _, input := range cases {
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, startruntime.ErrorCodeInvalidRequest, result.ErrorCode)
}
}
// --- constructor ---------------------------------------------------
func TestNewServiceRejectsMissingDependencies(t *testing.T) {
h := newHarness(t)
deps := cleanupcontainer.Dependencies{
Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
Telemetry: h.telemetry,
}
_, err := cleanupcontainer.NewService(deps)
require.Error(t, err)
}
@@ -0,0 +1,52 @@
package patchruntime
import (
"errors"
"fmt"
"strings"
"github.com/distribution/reference"
"golang.org/x/mod/semver"
)
// errImageRefNoTag reports that an image reference does not declare a
// tag. The patch service maps it to `image_ref_not_semver` because a
// digest-only or tagless reference cannot carry a semver-comparable
// version.
var errImageRefNoTag = errors.New("image reference is missing a tag")
// extractSemverTag returns the canonical semver string ("v1.4.7") for
// imageRef, ready to feed into golang.org/x/mod/semver. The leading "v"
// is added when the underlying tag omits it.
//
// Errors returned by this function are pre-formatted for inclusion in
// the patch service's `image_ref_not_semver` failure message.
func extractSemverTag(imageRef string) (string, error) {
parsed, err := reference.ParseNormalizedNamed(imageRef)
if err != nil {
return "", fmt.Errorf("parse image reference %q: %w", imageRef, err)
}
tagged, ok := parsed.(reference.NamedTagged)
if !ok {
return "", fmt.Errorf("%w: %q", errImageRefNoTag, imageRef)
}
tag := strings.TrimSpace(tagged.Tag())
if tag == "" {
return "", fmt.Errorf("%w: %q", errImageRefNoTag, imageRef)
}
candidate := tag
if !strings.HasPrefix(candidate, "v") {
candidate = "v" + candidate
}
if !semver.IsValid(candidate) {
return "", fmt.Errorf("tag %q on image reference %q is not a valid semver", tag, imageRef)
}
return candidate, nil
}
// samePatchSeries reports whether two canonical semver strings (with
// the leading "v") share their major and minor components. The third
// component (patch) and any pre-release / build metadata are ignored.
func samePatchSeries(currentSemver, newSemver string) bool {
return semver.MajorMinor(currentSemver) == semver.MajorMinor(newSemver)
}
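// Illustrative worked example (values match the tests in this commit):
//
//	extractSemverTag("registry.example.com/galaxy/game:1.4.7")  // "v1.4.7", nil
//	extractSemverTag("registry.example.com/galaxy/game:latest") // error: tag is not valid semver
//	samePatchSeries("v1.4.7", "v1.4.8") // true: patch bump stays in the 1.4 series
//	samePatchSeries("v1.4.7", "v2.0.0") // false: major bump surfaces as semver_patch_only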
@@ -0,0 +1,483 @@
// Package patchruntime implements the `patch` lifecycle operation owned
// by Runtime Manager. Patch is restart with a new `image_ref`: under
// one outer per-game lease the service runs the stop service, removes
// the container, and runs the start service with the new image. The
// engine reads its state from the bind-mount on startup, so any data
// written before the patch survives.
//
// The new and current image references must both parse as semver tags
// and share their major and minor components. A new tag that bumps the
// major or the minor surfaces as `semver_patch_only`; a tag that is
// not parseable as semver surfaces as `image_ref_not_semver`. These
// pre-checks run before any Docker work so a rejected patch never
// disturbs the running runtime.
//
// Lifecycle and failure-mode semantics follow `rtmanager/README.md
// §Lifecycles → Patch`. Design rationale is captured in
// `rtmanager/docs/services.md`.
package patchruntime
import (
"context"
"crypto/rand"
"encoding/base64"
"errors"
"fmt"
"log/slog"
"strings"
"time"
"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/logging"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/startruntime"
"galaxy/rtmanager/internal/service/stopruntime"
"galaxy/rtmanager/internal/telemetry"
)
const leaseReleaseTimeout = 5 * time.Second
// Input stores the per-call arguments for one patch operation.
type Input struct {
// GameID identifies the platform game to patch.
GameID string
// NewImageRef stores the new Docker reference the patch installs.
// Must be a valid Docker reference whose tag parses as semver.
NewImageRef string
// OpSource classifies how the request entered Runtime Manager.
OpSource operation.OpSource
// SourceRef stores the optional opaque per-source reference. When
// non-empty it is reused as the correlation id linking the outer
// patch entry to the inner stop and start log entries.
SourceRef string
}
// Validate reports whether input carries the structural invariants the
// service requires. Image-reference shape and semver checks happen
// later inside Handle so that they run after the runtime record has
// been loaded.
func (input Input) Validate() error {
if strings.TrimSpace(input.GameID) == "" {
return fmt.Errorf("game id must not be empty")
}
if strings.TrimSpace(input.NewImageRef) == "" {
return fmt.Errorf("new image ref must not be empty")
}
if !input.OpSource.IsKnown() {
return fmt.Errorf("op source %q is unsupported", input.OpSource)
}
return nil
}
// Result stores the deterministic outcome of one Handle call.
type Result struct {
// Record carries the runtime record installed by the inner start on
// success; zero on failure.
Record runtime.RuntimeRecord
// Outcome reports whether the operation completed (success) or
// produced a stable failure code.
Outcome operation.Outcome
// ErrorCode stores the stable error code on failure.
ErrorCode string
// ErrorMessage stores the operator-readable detail on failure.
ErrorMessage string
}
// Dependencies groups the collaborators required by Service.
type Dependencies struct {
RuntimeRecords ports.RuntimeRecordStore
OperationLogs ports.OperationLogStore
Docker ports.DockerClient
Leases ports.GameLeaseStore
// StopService runs the inner stop step.
StopService *stopruntime.Service
// StartService runs the inner start step with the new image_ref.
StartService *startruntime.Service
Coordination config.CoordinationConfig
Telemetry *telemetry.Runtime
Logger *slog.Logger
Clock func() time.Time
NewToken func() string
}
// Service executes the patch lifecycle operation.
type Service struct {
runtimeRecords ports.RuntimeRecordStore
operationLogs ports.OperationLogStore
docker ports.DockerClient
leases ports.GameLeaseStore
stopService *stopruntime.Service
startService *startruntime.Service
leaseTTL time.Duration
telemetry *telemetry.Runtime
logger *slog.Logger
clock func() time.Time
newToken func() string
}
// NewService constructs one Service from deps.
func NewService(deps Dependencies) (*Service, error) {
switch {
case deps.RuntimeRecords == nil:
return nil, errors.New("new patch runtime service: nil runtime records")
case deps.OperationLogs == nil:
return nil, errors.New("new patch runtime service: nil operation logs")
case deps.Docker == nil:
return nil, errors.New("new patch runtime service: nil docker client")
case deps.Leases == nil:
return nil, errors.New("new patch runtime service: nil lease store")
case deps.StopService == nil:
return nil, errors.New("new patch runtime service: nil stop service")
case deps.StartService == nil:
return nil, errors.New("new patch runtime service: nil start service")
case deps.Telemetry == nil:
return nil, errors.New("new patch runtime service: nil telemetry runtime")
}
if err := deps.Coordination.Validate(); err != nil {
return nil, fmt.Errorf("new patch runtime service: coordination config: %w", err)
}
clock := deps.Clock
if clock == nil {
clock = time.Now
}
logger := deps.Logger
if logger == nil {
logger = slog.Default()
}
logger = logger.With("service", "rtmanager.patchruntime")
newToken := deps.NewToken
if newToken == nil {
newToken = defaultTokenGenerator()
}
return &Service{
runtimeRecords: deps.RuntimeRecords,
operationLogs: deps.OperationLogs,
docker: deps.Docker,
leases: deps.Leases,
stopService: deps.StopService,
startService: deps.StartService,
leaseTTL: deps.Coordination.GameLeaseTTL,
telemetry: deps.Telemetry,
logger: logger,
clock: clock,
newToken: newToken,
}, nil
}
// Handle executes one patch operation end-to-end. The Go-level error
// return is reserved for non-business failures (nil context, nil
// receiver). Every business outcome — success or any of the stable
// failure codes — flows through Result.
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
if service == nil {
return Result{}, errors.New("patch runtime: nil service")
}
if ctx == nil {
return Result{}, errors.New("patch runtime: nil context")
}
opStartedAt := service.clock().UTC()
if err := input.Validate(); err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInvalidRequest,
errorMessage: err.Error(),
}), nil
}
token := service.newToken()
leaseStart := service.clock()
acquired, err := service.leases.TryAcquire(ctx, input.GameID, token, service.leaseTTL)
service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(leaseStart))
if err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeServiceUnavailable,
errorMessage: fmt.Sprintf("acquire game lease: %s", err.Error()),
}), nil
}
if !acquired {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeConflict,
errorMessage: "another lifecycle operation is in progress for this game",
}), nil
}
defer service.releaseLease(ctx, input.GameID, token)
return service.runUnderLease(ctx, input, opStartedAt)
}
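// Illustrative call-site sketch (an assumption about the admin REST caller,
// which is not defined in this file): semver pre-check rejections surface as
// stable error codes on Result, never as a Go error.
//
//	result, err := service.Handle(ctx, Input{
//		GameID:      gameID,
//		NewImageRef: "registry.example.com/galaxy/game:1.4.8",
//		OpSource:    operation.OpSourceGMRest,
//	})
//	if err != nil {
//		// nil context / nil service only
//	}
//	switch result.ErrorCode {
//	case startruntime.ErrorCodeSemverPatchOnly, startruntime.ErrorCodeImageRefNotSemver:
//		// rejected before any Docker work; the running runtime is untouched
//	}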
// runUnderLease executes the lease-protected patch sequence: load the
// runtime record, validate semver compatibility, run inner stop,
// remove the container, run inner start with the new image.
func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) {
existing, err := service.runtimeRecords.Get(ctx, input.GameID)
if errors.Is(err, runtime.ErrNotFound) {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeNotFound,
errorMessage: fmt.Sprintf("runtime record for game %q does not exist", input.GameID),
}), nil
}
if err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInternal,
errorMessage: fmt.Sprintf("load runtime record: %s", err.Error()),
}), nil
}
if existing.Status == runtime.StatusRemoved {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeConflict,
errorMessage: fmt.Sprintf("runtime for game %q is removed; cannot patch", input.GameID),
}), nil
}
if strings.TrimSpace(existing.CurrentImageRef) == "" {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInternal,
errorMessage: fmt.Sprintf("runtime record for game %q has no current image_ref", input.GameID),
}), nil
}
currentSemver, err := extractSemverTag(existing.CurrentImageRef)
if err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeImageRefNotSemver,
errorMessage: fmt.Sprintf("current image_ref: %s", err.Error()),
imageRef: existing.CurrentImageRef,
}), nil
}
newSemver, err := extractSemverTag(input.NewImageRef)
if err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeImageRefNotSemver,
errorMessage: fmt.Sprintf("new image_ref: %s", err.Error()),
imageRef: input.NewImageRef,
}), nil
}
if !samePatchSeries(currentSemver, newSemver) {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeSemverPatchOnly,
errorMessage: fmt.Sprintf(
"patch must keep major.minor; current=%s new=%s",
currentSemver, newSemver,
),
imageRef: input.NewImageRef,
}), nil
}
correlationRef := input.SourceRef
if correlationRef == "" {
correlationRef = service.newToken()
}
containerID := existing.CurrentContainerID
stopResult, err := service.stopService.Run(ctx, stopruntime.Input{
GameID: input.GameID,
Reason: stopruntime.StopReasonAdminRequest,
OpSource: input.OpSource,
SourceRef: correlationRef,
})
if err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInternal,
errorMessage: fmt.Sprintf("inner stop: %s", err.Error()),
imageRef: input.NewImageRef,
containerID: containerID,
}), nil
}
if stopResult.Outcome == operation.OutcomeFailure {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: stopResult.ErrorCode,
errorMessage: fmt.Sprintf("inner stop failed: %s", stopResult.ErrorMessage),
imageRef: input.NewImageRef,
containerID: containerID,
}), nil
}
if containerID != "" {
if err := service.docker.Remove(ctx, containerID); err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeServiceUnavailable,
errorMessage: fmt.Sprintf("docker remove: %s", err.Error()),
imageRef: input.NewImageRef,
containerID: containerID,
}), nil
}
}
startResult, err := service.startService.Run(ctx, startruntime.Input{
GameID: input.GameID,
ImageRef: input.NewImageRef,
OpSource: input.OpSource,
SourceRef: correlationRef,
})
if err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInternal,
errorMessage: fmt.Sprintf("inner start: %s", err.Error()),
imageRef: input.NewImageRef,
}), nil
}
if startResult.Outcome == operation.OutcomeFailure {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startResult.ErrorCode,
errorMessage: fmt.Sprintf("inner start failed: %s", startResult.ErrorMessage),
imageRef: input.NewImageRef,
}), nil
}
finishedAt := service.clock().UTC()
service.bestEffortAppend(ctx, operation.OperationEntry{
GameID: input.GameID,
OpKind: operation.OpKindPatch,
OpSource: input.OpSource,
SourceRef: correlationRef,
ImageRef: input.NewImageRef,
ContainerID: startResult.Record.CurrentContainerID,
Outcome: operation.OutcomeSuccess,
StartedAt: opStartedAt,
FinishedAt: &finishedAt,
})
service.telemetry.RecordPatchOutcome(ctx, string(operation.OutcomeSuccess), "")
logArgs := []any{
"game_id", input.GameID,
"prev_image_ref", existing.CurrentImageRef,
"new_image_ref", input.NewImageRef,
"prev_container_id", containerID,
"new_container_id", startResult.Record.CurrentContainerID,
"op_source", string(input.OpSource),
}
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
service.logger.InfoContext(ctx, "runtime patched", logArgs...)
return Result{
Record: startResult.Record,
Outcome: operation.OutcomeSuccess,
}, nil
}
// failureCtx groups the inputs to recordFailure.
type failureCtx struct {
opStartedAt time.Time
input Input
errorCode string
errorMessage string
imageRef string
containerID string
}
// recordFailure writes the outer failure operation_log entry and emits
// telemetry. The inner stop / start services, when they ran, have
// already recorded their own entries; this is the outer summary.
func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result {
finishedAt := service.clock().UTC()
service.bestEffortAppend(ctx, operation.OperationEntry{
GameID: fc.input.GameID,
OpKind: operation.OpKindPatch,
OpSource: fc.input.OpSource,
SourceRef: fc.input.SourceRef,
ImageRef: fc.imageRef,
ContainerID: fc.containerID,
Outcome: operation.OutcomeFailure,
ErrorCode: fc.errorCode,
ErrorMessage: fc.errorMessage,
StartedAt: fc.opStartedAt,
FinishedAt: &finishedAt,
})
service.telemetry.RecordPatchOutcome(ctx, string(operation.OutcomeFailure), fc.errorCode)
logArgs := []any{
"game_id", fc.input.GameID,
"image_ref", fc.imageRef,
"op_source", string(fc.input.OpSource),
"error_code", fc.errorCode,
"error_message", fc.errorMessage,
}
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
service.logger.WarnContext(ctx, "runtime patch failed", logArgs...)
return Result{
Outcome: operation.OutcomeFailure,
ErrorCode: fc.errorCode,
ErrorMessage: fc.errorMessage,
}
}
func (service *Service) releaseLease(ctx context.Context, gameID, token string) {
cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
defer cancel()
if err := service.leases.Release(cleanupCtx, gameID, token); err != nil {
service.logger.WarnContext(ctx, "release game lease",
"game_id", gameID,
"err", err.Error(),
)
}
}
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
if _, err := service.operationLogs.Append(ctx, entry); err != nil {
service.logger.ErrorContext(ctx, "append operation log",
"game_id", entry.GameID,
"op_kind", string(entry.OpKind),
"outcome", string(entry.Outcome),
"error_code", entry.ErrorCode,
"err", err.Error(),
)
}
}
func defaultTokenGenerator() func() string {
return func() string {
var buf [32]byte
if _, err := rand.Read(buf[:]); err != nil {
return "rtmanager-fallback-token"
}
return base64.RawURLEncoding.EncodeToString(buf[:])
}
}
@@ -0,0 +1,597 @@
package patchruntime_test
import (
"context"
"errors"
"sync"
"testing"
"time"
"galaxy/notificationintent"
"galaxy/rtmanager/internal/adapters/docker/mocks"
"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/patchruntime"
"galaxy/rtmanager/internal/service/startruntime"
"galaxy/rtmanager/internal/service/stopruntime"
"galaxy/rtmanager/internal/telemetry"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/mock/gomock"
)
// --- shared fake doubles (mirror the restartruntime test pattern) ---
type fakeRuntimeRecords struct {
mu sync.Mutex
stored map[string]runtime.RuntimeRecord
getErr error
upsertErr error
updateStatusErr error
upserts []runtime.RuntimeRecord
updates []ports.UpdateStatusInput
}
func newFakeRuntimeRecords() *fakeRuntimeRecords {
return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}}
}
func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
s.mu.Lock()
defer s.mu.Unlock()
if s.getErr != nil {
return runtime.RuntimeRecord{}, s.getErr
}
record, ok := s.stored[gameID]
if !ok {
return runtime.RuntimeRecord{}, runtime.ErrNotFound
}
return record, nil
}
func (s *fakeRuntimeRecords) Upsert(_ context.Context, record runtime.RuntimeRecord) error {
s.mu.Lock()
defer s.mu.Unlock()
if s.upsertErr != nil {
return s.upsertErr
}
s.upserts = append(s.upserts, record)
s.stored[record.GameID] = record
return nil
}
func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, input ports.UpdateStatusInput) error {
s.mu.Lock()
defer s.mu.Unlock()
s.updates = append(s.updates, input)
if s.updateStatusErr != nil {
return s.updateStatusErr
}
record, ok := s.stored[input.GameID]
if !ok {
return runtime.ErrNotFound
}
if record.Status != input.ExpectedFrom {
return runtime.ErrConflict
}
if input.ExpectedContainerID != "" && record.CurrentContainerID != input.ExpectedContainerID {
return runtime.ErrConflict
}
record.Status = input.To
record.LastOpAt = input.Now
switch input.To {
case runtime.StatusStopped:
stoppedAt := input.Now
record.StoppedAt = &stoppedAt
case runtime.StatusRemoved:
removedAt := input.Now
record.RemovedAt = &removedAt
record.CurrentContainerID = ""
}
s.stored[input.GameID] = record
return nil
}
func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) {
return nil, errors.New("not used in patch tests")
}
func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
return nil, errors.New("not used in patch tests")
}
type fakeOperationLogs struct {
mu sync.Mutex
appendErr error
appends []operation.OperationEntry
}
func (s *fakeOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) {
s.mu.Lock()
defer s.mu.Unlock()
if s.appendErr != nil {
return 0, s.appendErr
}
s.appends = append(s.appends, entry)
return int64(len(s.appends)), nil
}
func (s *fakeOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) {
return nil, errors.New("not used in patch tests")
}
func (s *fakeOperationLogs) byKind(kind operation.OpKind) []operation.OperationEntry {
s.mu.Lock()
defer s.mu.Unlock()
out := []operation.OperationEntry{}
for _, entry := range s.appends {
if entry.OpKind == kind {
out = append(out, entry)
}
}
return out
}
type fakeLeases struct {
mu sync.Mutex
acquired bool
acquireErr error
releaseErr error
acquires []string
releases []string
}
func (l *fakeLeases) TryAcquire(_ context.Context, _, token string, _ time.Duration) (bool, error) {
l.mu.Lock()
defer l.mu.Unlock()
l.acquires = append(l.acquires, token)
if l.acquireErr != nil {
return false, l.acquireErr
}
return l.acquired, nil
}
func (l *fakeLeases) Release(_ context.Context, _, token string) error {
l.mu.Lock()
defer l.mu.Unlock()
l.releases = append(l.releases, token)
return l.releaseErr
}
type fakeHealthEvents struct {
mu sync.Mutex
envelopes []ports.HealthEventEnvelope
}
func (h *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error {
h.mu.Lock()
defer h.mu.Unlock()
h.envelopes = append(h.envelopes, envelope)
return nil
}
type fakeNotifications struct {
mu sync.Mutex
intents []notificationintent.Intent
}
func (n *fakeNotifications) Publish(_ context.Context, intent notificationintent.Intent) error {
n.mu.Lock()
defer n.mu.Unlock()
n.intents = append(n.intents, intent)
return nil
}
type fakeLobby struct{}
func (l *fakeLobby) GetGame(_ context.Context, _ string) (ports.LobbyGameRecord, error) {
return ports.LobbyGameRecord{}, nil
}
// --- harness ---------------------------------------------------------
type harness struct {
records *fakeRuntimeRecords
operationLogs *fakeOperationLogs
docker *mocks.MockDockerClient
leases *fakeLeases
healthEvents *fakeHealthEvents
notifications *fakeNotifications
lobby *fakeLobby
telemetry *telemetry.Runtime
now time.Time
stateDir string
startService *startruntime.Service
stopService *stopruntime.Service
}
func newHarness(t *testing.T) *harness {
t.Helper()
ctrl := gomock.NewController(t)
t.Cleanup(ctrl.Finish)
telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
require.NoError(t, err)
h := &harness{
records: newFakeRuntimeRecords(),
operationLogs: &fakeOperationLogs{},
docker: mocks.NewMockDockerClient(ctrl),
leases: &fakeLeases{acquired: true},
healthEvents: &fakeHealthEvents{},
notifications: &fakeNotifications{},
lobby: &fakeLobby{},
telemetry: telemetryRuntime,
now: time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC),
stateDir: "/var/lib/galaxy/games/game-1",
}
containerCfg := config.ContainerConfig{
DefaultCPUQuota: 1.0,
DefaultMemory: "512m",
DefaultPIDsLimit: 512,
StopTimeout: 30 * time.Second,
Retention: 30 * 24 * time.Hour,
EngineStateMountPath: "/var/lib/galaxy-game",
EngineStateEnvName: "GAME_STATE_PATH",
GameStateDirMode: 0o750,
GameStateRoot: "/var/lib/galaxy/games",
}
dockerCfg := config.DockerConfig{
Host: "unix:///var/run/docker.sock",
Network: "galaxy-net",
LogDriver: "json-file",
PullPolicy: config.ImagePullPolicyIfMissing,
}
coordinationCfg := config.CoordinationConfig{GameLeaseTTL: time.Minute}
startService, err := startruntime.NewService(startruntime.Dependencies{
RuntimeRecords: h.records,
OperationLogs: h.operationLogs,
Docker: h.docker,
Leases: h.leases,
HealthEvents: h.healthEvents,
Notifications: h.notifications,
Lobby: h.lobby,
Container: containerCfg,
DockerCfg: dockerCfg,
Coordination: coordinationCfg,
Telemetry: h.telemetry,
Clock: func() time.Time { return h.now },
NewToken: func() string { return "inner-start-token" },
PrepareStateDir: func(_ string) (string, error) { return h.stateDir, nil },
})
require.NoError(t, err)
h.startService = startService
stopService, err := stopruntime.NewService(stopruntime.Dependencies{
RuntimeRecords: h.records,
OperationLogs: h.operationLogs,
Docker: h.docker,
Leases: h.leases,
HealthEvents: h.healthEvents,
Container: containerCfg,
Coordination: coordinationCfg,
Telemetry: h.telemetry,
Clock: func() time.Time { return h.now },
NewToken: func() string { return "inner-stop-token" },
})
require.NoError(t, err)
h.stopService = stopService
return h
}
func (h *harness) build(t *testing.T, tokens ...string) *patchruntime.Service {
t.Helper()
tokenIdx := 0
tokenGen := func() string {
if tokenIdx >= len(tokens) {
return "outer-fallback"
}
next := tokens[tokenIdx]
tokenIdx++
return next
}
service, err := patchruntime.NewService(patchruntime.Dependencies{
RuntimeRecords: h.records,
OperationLogs: h.operationLogs,
Docker: h.docker,
Leases: h.leases,
StopService: h.stopService,
StartService: h.startService,
Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
Telemetry: h.telemetry,
Clock: func() time.Time { return h.now },
NewToken: tokenGen,
})
require.NoError(t, err)
return service
}
const (
currentImage = "registry.example.com/galaxy/game:1.4.7"
patchImage = "registry.example.com/galaxy/game:1.4.8"
majorBump = "registry.example.com/galaxy/game:2.0.0"
tagless = "registry.example.com/galaxy/game"
notSemver = "registry.example.com/galaxy/game:latest"
)
func runningRecord(now time.Time) runtime.RuntimeRecord {
startedAt := now.Add(-time.Hour)
return runtime.RuntimeRecord{
GameID: "game-1",
Status: runtime.StatusRunning,
CurrentContainerID: "ctr-old",
CurrentImageRef: currentImage,
EngineEndpoint: "http://galaxy-game-game-1:8080",
StatePath: "/var/lib/galaxy/games/game-1",
DockerNetwork: "galaxy-net",
StartedAt: &startedAt,
LastOpAt: startedAt,
CreatedAt: startedAt,
}
}
func basicInput() patchruntime.Input {
return patchruntime.Input{
GameID: "game-1",
NewImageRef: patchImage,
OpSource: operation.OpSourceGMRest,
SourceRef: "rest-req-99",
}
}
func sampleRunResult(now time.Time) ports.RunResult {
return ports.RunResult{
ContainerID: "ctr-new",
EngineEndpoint: "http://galaxy-game-game-1:8080",
StartedAt: now,
}
}
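// expectInnerStart registers the Docker expectations of a successful inner
// start: network check, image pull, image inspect, and container run.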
func expectInnerStart(h *harness, image string) {
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
h.docker.EXPECT().PullImage(gomock.Any(), image, gomock.Any()).Return(nil)
h.docker.EXPECT().InspectImage(gomock.Any(), image).Return(ports.ImageInspect{Ref: image}, nil)
h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(sampleRunResult(h.now), nil)
}
// --- happy path -----------------------------------------------------
func TestHandlePatchHappyPath(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
expectInnerStart(h, patchImage)
service := h.build(t, "outer-token")
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
assert.Equal(t, patchImage, result.Record.CurrentImageRef)
patches := h.operationLogs.byKind(operation.OpKindPatch)
require.Len(t, patches, 1)
assert.Equal(t, "rest-req-99", patches[0].SourceRef)
assert.Equal(t, patchImage, patches[0].ImageRef)
assert.Equal(t, "ctr-new", patches[0].ContainerID)
assert.Len(t, h.operationLogs.byKind(operation.OpKindStop), 1)
assert.Len(t, h.operationLogs.byKind(operation.OpKindStart), 1)
}
func TestHandlePatchSameImageProceedsAsRecreate(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
expectInnerStart(h, currentImage)
input := basicInput()
input.NewImageRef = currentImage
service := h.build(t, "outer-token")
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
require.Len(t, h.operationLogs.byKind(operation.OpKindPatch), 1, "patch entry recorded even when image is unchanged")
}
// --- semver pre-checks ---------------------------------------------
func TestHandleImageRefNotSemverWhenNewIsTagless(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)
input := basicInput()
input.NewImageRef = tagless
service := h.build(t, "outer-token")
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeImageRefNotSemver, result.ErrorCode)
assert.Empty(t, h.operationLogs.byKind(operation.OpKindStop), "no inner stop on pre-check failure")
assert.Empty(t, h.operationLogs.byKind(operation.OpKindStart))
}
func TestHandleImageRefNotSemverWhenNewIsNonSemver(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)
input := basicInput()
input.NewImageRef = notSemver
service := h.build(t, "outer-token")
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, startruntime.ErrorCodeImageRefNotSemver, result.ErrorCode)
}
func TestHandleImageRefNotSemverWhenCurrentIsTagless(t *testing.T) {
h := newHarness(t)
record := runningRecord(h.now)
record.CurrentImageRef = tagless
h.records.stored["game-1"] = record
service := h.build(t, "outer-token")
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, startruntime.ErrorCodeImageRefNotSemver, result.ErrorCode)
}
func TestHandleSemverPatchOnlyOnMajorBump(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)
input := basicInput()
input.NewImageRef = majorBump
service := h.build(t, "outer-token")
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeSemverPatchOnly, result.ErrorCode)
assert.Empty(t, h.operationLogs.byKind(operation.OpKindStop))
assert.Empty(t, h.operationLogs.byKind(operation.OpKindStart))
}
func TestHandleSemverPatchOnlyOnMinorBump(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)
input := basicInput()
input.NewImageRef = "registry.example.com/galaxy/game:1.5.0"
service := h.build(t, "outer-token")
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, startruntime.ErrorCodeSemverPatchOnly, result.ErrorCode)
}
// --- record state checks -------------------------------------------
func TestHandleNotFoundForMissingRecord(t *testing.T) {
h := newHarness(t)
service := h.build(t, "outer-token")
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, startruntime.ErrorCodeNotFound, result.ErrorCode)
}
func TestHandleConflictForRemovedRecord(t *testing.T) {
h := newHarness(t)
removed := runningRecord(h.now)
removed.Status = runtime.StatusRemoved
removed.CurrentContainerID = ""
removedAt := h.now.Add(-time.Hour)
removed.RemovedAt = &removedAt
h.records.stored["game-1"] = removed
service := h.build(t, "outer-token")
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)
}
// --- failures from inner ops ---------------------------------------
func TestHandlePropagatesInnerStopFailure(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(errors.New("daemon unreachable"))
service := h.build(t, "outer-token")
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode)
}
func TestHandleServiceUnavailableOnDockerRemoveFailure(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(errors.New("disk i/o"))
service := h.build(t, "outer-token")
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode)
}
func TestHandlePropagatesInnerStartFailure(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
h.docker.EXPECT().PullImage(gomock.Any(), patchImage, gomock.Any()).Return(errors.New("manifest unknown"))
service := h.build(t, "outer-token")
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, startruntime.ErrorCodeImagePullFailed, result.ErrorCode)
}
// --- conflicts ------------------------------------------------------
func TestHandleConflictWhenLeaseBusy(t *testing.T) {
h := newHarness(t)
h.leases.acquired = false
service := h.build(t, "outer-token")
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)
}
// --- input validation ----------------------------------------------
func TestHandleRejectsInvalidInput(t *testing.T) {
h := newHarness(t)
service := h.build(t, "outer-token")
cases := []patchruntime.Input{
{GameID: "", NewImageRef: patchImage, OpSource: operation.OpSourceGMRest},
{GameID: "g", NewImageRef: "", OpSource: operation.OpSourceGMRest},
{GameID: "g", NewImageRef: patchImage, OpSource: operation.OpSource("bogus")},
}
for _, input := range cases {
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, startruntime.ErrorCodeInvalidRequest, result.ErrorCode)
}
}
// --- constructor ---------------------------------------------------
func TestNewServiceRejectsMissingDependencies(t *testing.T) {
h := newHarness(t)
deps := patchruntime.Dependencies{
Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
Telemetry: h.telemetry,
}
_, err := patchruntime.NewService(deps)
require.Error(t, err)
}
@@ -0,0 +1,482 @@
// Package restartruntime implements the `restart` lifecycle operation
// owned by Runtime Manager. Restart is a recreate: under one outer
// per-game lease the service runs the stop service, removes the
// container with `docker rm`, and runs the start service with the
// runtime's current `image_ref`. The hostname / engine endpoint stays
// stable across the recreate; `container_id` changes.
//
// Lifecycle and failure-mode semantics follow `rtmanager/README.md
// §Lifecycles → Restart`. Design rationale is captured in
// `rtmanager/docs/services.md`, in particular the lease-sharing
// pattern with `startruntime.Service.Run` / `stopruntime.Service.Run`,
// the correlation-id reuse on `source_ref`, and the
// inner-stop-then-rm-failure recovery rule.
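//
// A minimal, illustrative wiring sketch; the collaborator names below
// (records, logs, dockerClient, leases, stopSvc, startSvc, tel) are
// placeholders, not identifiers from this repository:
//
//	svc, err := restartruntime.NewService(restartruntime.Dependencies{
//		RuntimeRecords: records,
//		OperationLogs:  logs,
//		Docker:         dockerClient,
//		Leases:         leases,
//		StopService:    stopSvc,
//		StartService:   startSvc,
//		Coordination:   config.CoordinationConfig{GameLeaseTTL: time.Minute},
//		Telemetry:      tel,
//	})
//	if err != nil {
//		// wiring error: a required dependency is missing
//	}
//	result, err := svc.Handle(ctx, restartruntime.Input{
//		GameID:   "game-1",
//		OpSource: operation.OpSourceGMRest,
//	})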
package restartruntime
import (
"context"
"crypto/rand"
"encoding/base64"
"errors"
"fmt"
"log/slog"
"strings"
"time"
"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/logging"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/startruntime"
"galaxy/rtmanager/internal/service/stopruntime"
"galaxy/rtmanager/internal/telemetry"
)
// leaseReleaseTimeout bounds the deferred lease-release call.
const leaseReleaseTimeout = 5 * time.Second
// Input stores the per-call arguments for one restart operation.
type Input struct {
// GameID identifies the platform game to restart.
GameID string
// OpSource classifies how the request entered Runtime Manager.
// Required: every operation_log entry carries an op_source.
OpSource operation.OpSource
// SourceRef stores the optional opaque per-source reference (REST
// request id, admin user id). When non-empty it is reused as the
// correlation id linking the outer restart entry to the inner stop
// and start log entries.
SourceRef string
}
// Validate reports whether input carries the structural invariants the
// service requires.
func (input Input) Validate() error {
if strings.TrimSpace(input.GameID) == "" {
return fmt.Errorf("game id must not be empty")
}
if !input.OpSource.IsKnown() {
return fmt.Errorf("op source %q is unsupported", input.OpSource)
}
return nil
}
// Result stores the deterministic outcome of one Handle call.
type Result struct {
// Record carries the runtime record installed by the inner start on
// success; zero on failure.
Record runtime.RuntimeRecord
// Outcome reports whether the operation completed (success) or
// produced a stable failure code.
Outcome operation.Outcome
// ErrorCode stores the stable error code on failure. Empty for
// success.
ErrorCode string
// ErrorMessage stores the operator-readable detail on failure.
// Empty for success.
ErrorMessage string
}
// Dependencies groups the collaborators required by Service.
type Dependencies struct {
// RuntimeRecords reads the runtime record at the start of restart
// to capture the current image_ref and container_id.
RuntimeRecords ports.RuntimeRecordStore
// OperationLogs records the outer restart audit entry. Inner stop
// and start services append their own entries through their own
// stores.
OperationLogs ports.OperationLogStore
// Docker drives the docker rm step between the inner stop and
// inner start.
Docker ports.DockerClient
// Leases serialises operations against the same game id. The outer
// lease is held for the entire stop + rm + start sequence.
Leases ports.GameLeaseStore
// StopService runs the inner stop step under the outer lease.
StopService *stopruntime.Service
// StartService runs the inner start step under the outer lease.
StartService *startruntime.Service
// Coordination supplies the per-game lease TTL.
Coordination config.CoordinationConfig
// Telemetry records restart outcomes and lease latency. Required.
Telemetry *telemetry.Runtime
// Logger records structured service-level events. Defaults to
// `slog.Default()` when nil.
Logger *slog.Logger
// Clock supplies the wall-clock used for operation timestamps.
// Defaults to `time.Now` when nil.
Clock func() time.Time
// NewToken supplies a unique opaque token. Used both for the lease
// and for the correlation id when Input.SourceRef is empty.
// Defaults to a 32-byte random base64url string when nil.
NewToken func() string
}
// Service executes the restart lifecycle operation.
type Service struct {
runtimeRecords ports.RuntimeRecordStore
operationLogs ports.OperationLogStore
docker ports.DockerClient
leases ports.GameLeaseStore
stopService *stopruntime.Service
startService *startruntime.Service
leaseTTL time.Duration
telemetry *telemetry.Runtime
logger *slog.Logger
clock func() time.Time
newToken func() string
}
// NewService constructs one Service from deps.
func NewService(deps Dependencies) (*Service, error) {
switch {
case deps.RuntimeRecords == nil:
return nil, errors.New("new restart runtime service: nil runtime records")
case deps.OperationLogs == nil:
return nil, errors.New("new restart runtime service: nil operation logs")
case deps.Docker == nil:
return nil, errors.New("new restart runtime service: nil docker client")
case deps.Leases == nil:
return nil, errors.New("new restart runtime service: nil lease store")
case deps.StopService == nil:
return nil, errors.New("new restart runtime service: nil stop service")
case deps.StartService == nil:
return nil, errors.New("new restart runtime service: nil start service")
case deps.Telemetry == nil:
return nil, errors.New("new restart runtime service: nil telemetry runtime")
}
if err := deps.Coordination.Validate(); err != nil {
return nil, fmt.Errorf("new restart runtime service: coordination config: %w", err)
}
clock := deps.Clock
if clock == nil {
clock = time.Now
}
logger := deps.Logger
if logger == nil {
logger = slog.Default()
}
logger = logger.With("service", "rtmanager.restartruntime")
newToken := deps.NewToken
if newToken == nil {
newToken = defaultTokenGenerator()
}
return &Service{
runtimeRecords: deps.RuntimeRecords,
operationLogs: deps.OperationLogs,
docker: deps.Docker,
leases: deps.Leases,
stopService: deps.StopService,
startService: deps.StartService,
leaseTTL: deps.Coordination.GameLeaseTTL,
telemetry: deps.Telemetry,
logger: logger,
clock: clock,
newToken: newToken,
}, nil
}
// Handle executes one restart operation end-to-end. The Go-level error
// return is reserved for non-business failures (nil context, nil
// receiver). Every business outcome — success or any of the stable
// failure codes — flows through Result.
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
if service == nil {
return Result{}, errors.New("restart runtime: nil service")
}
if ctx == nil {
return Result{}, errors.New("restart runtime: nil context")
}
opStartedAt := service.clock().UTC()
if err := input.Validate(); err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInvalidRequest,
errorMessage: err.Error(),
}), nil
}
token := service.newToken()
leaseStart := service.clock()
acquired, err := service.leases.TryAcquire(ctx, input.GameID, token, service.leaseTTL)
service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(leaseStart))
if err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeServiceUnavailable,
errorMessage: fmt.Sprintf("acquire game lease: %s", err.Error()),
}), nil
}
if !acquired {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeConflict,
errorMessage: "another lifecycle operation is in progress for this game",
}), nil
}
defer service.releaseLease(ctx, input.GameID, token)
return service.runUnderLease(ctx, input, opStartedAt)
}
// runUnderLease executes the lease-protected restart sequence: it loads
// the runtime record, runs the inner stop, removes the container, and
// runs the inner start.
func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) {
existing, err := service.runtimeRecords.Get(ctx, input.GameID)
if errors.Is(err, runtime.ErrNotFound) {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeNotFound,
errorMessage: fmt.Sprintf("runtime record for game %q does not exist", input.GameID),
}), nil
}
if err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInternal,
errorMessage: fmt.Sprintf("load runtime record: %s", err.Error()),
}), nil
}
if existing.Status == runtime.StatusRemoved {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeConflict,
errorMessage: fmt.Sprintf("runtime for game %q is removed; cannot restart", input.GameID),
imageRef: existing.CurrentImageRef,
}), nil
}
if strings.TrimSpace(existing.CurrentImageRef) == "" {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInternal,
errorMessage: fmt.Sprintf("runtime record for game %q has no image_ref to restart with", input.GameID),
}), nil
}
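// Reuse the caller-supplied source_ref as the correlation id linking the
// outer restart entry to the inner stop and start entries; generate a
// fresh one when the caller did not provide it.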
correlationRef := input.SourceRef
if correlationRef == "" {
correlationRef = service.newToken()
}
containerID := existing.CurrentContainerID
imageRef := existing.CurrentImageRef
stopResult, err := service.stopService.Run(ctx, stopruntime.Input{
GameID: input.GameID,
Reason: stopruntime.StopReasonAdminRequest,
OpSource: input.OpSource,
SourceRef: correlationRef,
})
if err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInternal,
errorMessage: fmt.Sprintf("inner stop: %s", err.Error()),
imageRef: imageRef,
containerID: containerID,
}), nil
}
if stopResult.Outcome == operation.OutcomeFailure {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: stopResult.ErrorCode,
errorMessage: fmt.Sprintf("inner stop failed: %s", stopResult.ErrorMessage),
imageRef: imageRef,
containerID: containerID,
}), nil
}
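// Remove the old container (when the record still references one) so the
// inner start can recreate it under the same name.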
if containerID != "" {
if err := service.docker.Remove(ctx, containerID); err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeServiceUnavailable,
errorMessage: fmt.Sprintf("docker remove: %s", err.Error()),
imageRef: imageRef,
containerID: containerID,
}), nil
}
}
startResult, err := service.startService.Run(ctx, startruntime.Input{
GameID: input.GameID,
ImageRef: imageRef,
OpSource: input.OpSource,
SourceRef: correlationRef,
})
if err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInternal,
errorMessage: fmt.Sprintf("inner start: %s", err.Error()),
imageRef: imageRef,
}), nil
}
if startResult.Outcome == operation.OutcomeFailure {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startResult.ErrorCode,
errorMessage: fmt.Sprintf("inner start failed: %s", startResult.ErrorMessage),
imageRef: imageRef,
}), nil
}
finishedAt := service.clock().UTC()
service.bestEffortAppend(ctx, operation.OperationEntry{
GameID: input.GameID,
OpKind: operation.OpKindRestart,
OpSource: input.OpSource,
SourceRef: correlationRef,
ImageRef: imageRef,
ContainerID: startResult.Record.CurrentContainerID,
Outcome: operation.OutcomeSuccess,
StartedAt: opStartedAt,
FinishedAt: &finishedAt,
})
service.telemetry.RecordRestartOutcome(ctx, string(operation.OutcomeSuccess), "")
logArgs := []any{
"game_id", input.GameID,
"prev_container_id", containerID,
"new_container_id", startResult.Record.CurrentContainerID,
"image_ref", imageRef,
"op_source", string(input.OpSource),
}
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
service.logger.InfoContext(ctx, "runtime restarted", logArgs...)
return Result{
Record: startResult.Record,
Outcome: operation.OutcomeSuccess,
}, nil
}
// failureCtx groups the inputs to recordFailure.
type failureCtx struct {
opStartedAt time.Time
input Input
errorCode string
errorMessage string
imageRef string
containerID string
}
// recordFailure records the outer failure operation_log entry and emits
// telemetry. Inner stop / start services have already recorded their
// own entries; this is the outer summary.
func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result {
finishedAt := service.clock().UTC()
service.bestEffortAppend(ctx, operation.OperationEntry{
GameID: fc.input.GameID,
OpKind: operation.OpKindRestart,
OpSource: fc.input.OpSource,
SourceRef: correlationRefOrEmpty(fc.input),
ImageRef: fc.imageRef,
ContainerID: fc.containerID,
Outcome: operation.OutcomeFailure,
ErrorCode: fc.errorCode,
ErrorMessage: fc.errorMessage,
StartedAt: fc.opStartedAt,
FinishedAt: &finishedAt,
})
service.telemetry.RecordRestartOutcome(ctx, string(operation.OutcomeFailure), fc.errorCode)
logArgs := []any{
"game_id", fc.input.GameID,
"image_ref", fc.imageRef,
"op_source", string(fc.input.OpSource),
"error_code", fc.errorCode,
"error_message", fc.errorMessage,
}
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
service.logger.WarnContext(ctx, "runtime restart failed", logArgs...)
return Result{
Outcome: operation.OutcomeFailure,
ErrorCode: fc.errorCode,
ErrorMessage: fc.errorMessage,
}
}
// correlationRefOrEmpty returns the original Input.SourceRef for the
// outer entry. Outer-failure paths that did not yet generate a
// correlation id (input validation, lease busy) keep the original
// `source_ref`, which is the actor's reference.
func correlationRefOrEmpty(input Input) string {
return input.SourceRef
}
// releaseLease releases the per-game lease in a fresh background context.
func (service *Service) releaseLease(ctx context.Context, gameID, token string) {
cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
defer cancel()
if err := service.leases.Release(cleanupCtx, gameID, token); err != nil {
service.logger.WarnContext(ctx, "release game lease",
"game_id", gameID,
"err", err.Error(),
)
}
}
// bestEffortAppend writes one outer operation_log entry. Inner ops have
// already appended their own; a failure here only loses the outer
// summary, which is acceptable.
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
if _, err := service.operationLogs.Append(ctx, entry); err != nil {
service.logger.ErrorContext(ctx, "append operation log",
"game_id", entry.GameID,
"op_kind", string(entry.OpKind),
"outcome", string(entry.Outcome),
"error_code", entry.ErrorCode,
"err", err.Error(),
)
}
}
// defaultTokenGenerator returns a function that produces base64url-encoded
// tokens derived from 32 random bytes.
func defaultTokenGenerator() func() string {
return func() string {
var buf [32]byte
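// A crypto/rand read failure is effectively impossible; fall back to a
// fixed sentinel token rather than surface an error.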
if _, err := rand.Read(buf[:]); err != nil {
return "rtmanager-fallback-token"
}
return base64.RawURLEncoding.EncodeToString(buf[:])
}
}
@@ -0,0 +1,584 @@
package restartruntime_test
import (
"context"
"errors"
"sync"
"testing"
"time"
"galaxy/notificationintent"
"galaxy/rtmanager/internal/adapters/docker/mocks"
"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/restartruntime"
"galaxy/rtmanager/internal/service/startruntime"
"galaxy/rtmanager/internal/service/stopruntime"
"galaxy/rtmanager/internal/telemetry"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/mock/gomock"
)
// --- shared fake doubles ----------------------------------------------
type fakeRuntimeRecords struct {
mu sync.Mutex
stored map[string]runtime.RuntimeRecord
getErr error
upsertErr error
updateStatusErr error
upserts []runtime.RuntimeRecord
updates []ports.UpdateStatusInput
}
func newFakeRuntimeRecords() *fakeRuntimeRecords {
return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}}
}
func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
s.mu.Lock()
defer s.mu.Unlock()
if s.getErr != nil {
return runtime.RuntimeRecord{}, s.getErr
}
record, ok := s.stored[gameID]
if !ok {
return runtime.RuntimeRecord{}, runtime.ErrNotFound
}
return record, nil
}
func (s *fakeRuntimeRecords) Upsert(_ context.Context, record runtime.RuntimeRecord) error {
s.mu.Lock()
defer s.mu.Unlock()
if s.upsertErr != nil {
return s.upsertErr
}
s.upserts = append(s.upserts, record)
s.stored[record.GameID] = record
return nil
}
func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, input ports.UpdateStatusInput) error {
s.mu.Lock()
defer s.mu.Unlock()
s.updates = append(s.updates, input)
if s.updateStatusErr != nil {
return s.updateStatusErr
}
record, ok := s.stored[input.GameID]
if !ok {
return runtime.ErrNotFound
}
if record.Status != input.ExpectedFrom {
return runtime.ErrConflict
}
if input.ExpectedContainerID != "" && record.CurrentContainerID != input.ExpectedContainerID {
return runtime.ErrConflict
}
record.Status = input.To
record.LastOpAt = input.Now
switch input.To {
case runtime.StatusStopped:
stoppedAt := input.Now
record.StoppedAt = &stoppedAt
case runtime.StatusRemoved:
removedAt := input.Now
record.RemovedAt = &removedAt
record.CurrentContainerID = ""
}
s.stored[input.GameID] = record
return nil
}
func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) {
return nil, errors.New("not used in restart tests")
}
func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
return nil, errors.New("not used in restart tests")
}
type fakeOperationLogs struct {
mu sync.Mutex
appendErr error
appends []operation.OperationEntry
}
func (s *fakeOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) {
s.mu.Lock()
defer s.mu.Unlock()
if s.appendErr != nil {
return 0, s.appendErr
}
s.appends = append(s.appends, entry)
return int64(len(s.appends)), nil
}
func (s *fakeOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) {
return nil, errors.New("not used in restart tests")
}
func (s *fakeOperationLogs) byKind(kind operation.OpKind) []operation.OperationEntry {
s.mu.Lock()
defer s.mu.Unlock()
out := []operation.OperationEntry{}
for _, entry := range s.appends {
if entry.OpKind == kind {
out = append(out, entry)
}
}
return out
}
type fakeLeases struct {
mu sync.Mutex
acquired bool
acquireErr error
releaseErr error
acquires []string
releases []string
}
func (l *fakeLeases) TryAcquire(_ context.Context, _, token string, _ time.Duration) (bool, error) {
l.mu.Lock()
defer l.mu.Unlock()
l.acquires = append(l.acquires, token)
if l.acquireErr != nil {
return false, l.acquireErr
}
return l.acquired, nil
}
func (l *fakeLeases) Release(_ context.Context, _, token string) error {
l.mu.Lock()
defer l.mu.Unlock()
l.releases = append(l.releases, token)
return l.releaseErr
}
type fakeHealthEvents struct {
mu sync.Mutex
publishErr error
envelopes []ports.HealthEventEnvelope
}
func (h *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error {
h.mu.Lock()
defer h.mu.Unlock()
if h.publishErr != nil {
return h.publishErr
}
h.envelopes = append(h.envelopes, envelope)
return nil
}
type fakeNotifications struct {
mu sync.Mutex
publishErr error
intents []notificationintent.Intent
}
func (n *fakeNotifications) Publish(_ context.Context, intent notificationintent.Intent) error {
n.mu.Lock()
defer n.mu.Unlock()
if n.publishErr != nil {
return n.publishErr
}
n.intents = append(n.intents, intent)
return nil
}
type fakeLobby struct {
record ports.LobbyGameRecord
err error
}
func (l *fakeLobby) GetGame(_ context.Context, _ string) (ports.LobbyGameRecord, error) {
if l.err != nil {
return ports.LobbyGameRecord{}, l.err
}
return l.record, nil
}
// --- harness ----------------------------------------------------------
type harness struct {
records *fakeRuntimeRecords
operationLogs *fakeOperationLogs
docker *mocks.MockDockerClient
leases *fakeLeases
healthEvents *fakeHealthEvents
notifications *fakeNotifications
lobby *fakeLobby
telemetry *telemetry.Runtime
now time.Time
stateDir string
startService *startruntime.Service
stopService *stopruntime.Service
}
func newHarness(t *testing.T) *harness {
t.Helper()
ctrl := gomock.NewController(t)
t.Cleanup(ctrl.Finish)
telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
require.NoError(t, err)
h := &harness{
records: newFakeRuntimeRecords(),
operationLogs: &fakeOperationLogs{},
docker: mocks.NewMockDockerClient(ctrl),
leases: &fakeLeases{acquired: true},
healthEvents: &fakeHealthEvents{},
notifications: &fakeNotifications{},
lobby: &fakeLobby{},
telemetry: telemetryRuntime,
now: time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC),
stateDir: "/var/lib/galaxy/games/game-1",
}
containerCfg := config.ContainerConfig{
DefaultCPUQuota: 1.0,
DefaultMemory: "512m",
DefaultPIDsLimit: 512,
StopTimeout: 30 * time.Second,
Retention: 30 * 24 * time.Hour,
EngineStateMountPath: "/var/lib/galaxy-game",
EngineStateEnvName: "GAME_STATE_PATH",
GameStateDirMode: 0o750,
GameStateRoot: "/var/lib/galaxy/games",
}
dockerCfg := config.DockerConfig{
Host: "unix:///var/run/docker.sock",
Network: "galaxy-net",
LogDriver: "json-file",
PullPolicy: config.ImagePullPolicyIfMissing,
}
coordinationCfg := config.CoordinationConfig{GameLeaseTTL: time.Minute}
startService, err := startruntime.NewService(startruntime.Dependencies{
RuntimeRecords: h.records,
OperationLogs: h.operationLogs,
Docker: h.docker,
Leases: h.leases,
HealthEvents: h.healthEvents,
Notifications: h.notifications,
Lobby: h.lobby,
Container: containerCfg,
DockerCfg: dockerCfg,
Coordination: coordinationCfg,
Telemetry: h.telemetry,
Clock: func() time.Time { return h.now },
NewToken: func() string { return "inner-start-token" },
PrepareStateDir: func(_ string) (string, error) { return h.stateDir, nil },
})
require.NoError(t, err)
h.startService = startService
stopService, err := stopruntime.NewService(stopruntime.Dependencies{
RuntimeRecords: h.records,
OperationLogs: h.operationLogs,
Docker: h.docker,
Leases: h.leases,
HealthEvents: h.healthEvents,
Container: containerCfg,
Coordination: coordinationCfg,
Telemetry: h.telemetry,
Clock: func() time.Time { return h.now },
NewToken: func() string { return "inner-stop-token" },
})
require.NoError(t, err)
h.stopService = stopService
return h
}
func (h *harness) build(t *testing.T, tokens ...string) *restartruntime.Service {
t.Helper()
tokenIdx := 0
tokenGen := func() string {
if tokenIdx >= len(tokens) {
return "outer-fallback"
}
token := tokens[tokenIdx]
tokenIdx++
return token
}
service, err := restartruntime.NewService(restartruntime.Dependencies{
RuntimeRecords: h.records,
OperationLogs: h.operationLogs,
Docker: h.docker,
Leases: h.leases,
StopService: h.stopService,
StartService: h.startService,
Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
Telemetry: h.telemetry,
Clock: func() time.Time { return h.now },
NewToken: tokenGen,
})
require.NoError(t, err)
return service
}
const imageRef = "registry.example.com/galaxy/game:1.4.7"
func runningRecord(now time.Time) runtime.RuntimeRecord {
startedAt := now.Add(-time.Hour)
return runtime.RuntimeRecord{
GameID: "game-1",
Status: runtime.StatusRunning,
CurrentContainerID: "ctr-old",
CurrentImageRef: imageRef,
EngineEndpoint: "http://galaxy-game-game-1:8080",
StatePath: "/var/lib/galaxy/games/game-1",
DockerNetwork: "galaxy-net",
StartedAt: &startedAt,
LastOpAt: startedAt,
CreatedAt: startedAt,
}
}
func basicInput() restartruntime.Input {
return restartruntime.Input{
GameID: "game-1",
OpSource: operation.OpSourceGMRest,
SourceRef: "rest-req-42",
}
}
func sampleRunResult(now time.Time) ports.RunResult {
return ports.RunResult{
ContainerID: "ctr-new",
EngineEndpoint: "http://galaxy-game-game-1:8080",
StartedAt: now,
}
}
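// expectInnerStart registers the Docker expectations of a successful inner
// start: network check, image pull, image inspect, and container run.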
func expectInnerStart(h *harness) {
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
h.docker.EXPECT().PullImage(gomock.Any(), imageRef, gomock.Any()).Return(nil)
h.docker.EXPECT().InspectImage(gomock.Any(), imageRef).Return(ports.ImageInspect{Ref: imageRef}, nil)
h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(sampleRunResult(h.now), nil)
}
// --- happy path -------------------------------------------------------
func TestHandleRestartFromRunning(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
expectInnerStart(h)
service := h.build(t, "outer-token")
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
assert.Empty(t, result.ErrorCode)
assert.Equal(t, "ctr-new", result.Record.CurrentContainerID)
assert.Equal(t, imageRef, result.Record.CurrentImageRef)
assert.Equal(t, runtime.StatusRunning, result.Record.Status)
stops := h.operationLogs.byKind(operation.OpKindStop)
starts := h.operationLogs.byKind(operation.OpKindStart)
restarts := h.operationLogs.byKind(operation.OpKindRestart)
require.Len(t, stops, 1, "inner stop appended its own entry")
require.Len(t, starts, 1, "inner start appended its own entry")
require.Len(t, restarts, 1, "outer restart appended one summary entry")
assert.Equal(t, "rest-req-42", stops[0].SourceRef, "correlation id propagated to inner stop")
assert.Equal(t, "rest-req-42", starts[0].SourceRef, "correlation id propagated to inner start")
assert.Equal(t, "rest-req-42", restarts[0].SourceRef, "correlation id stored on outer restart")
assert.Equal(t, "ctr-new", restarts[0].ContainerID)
assert.Equal(t, imageRef, restarts[0].ImageRef)
assert.Equal(t, []string{"outer-token"}, h.leases.acquires)
assert.Equal(t, []string{"outer-token"}, h.leases.releases)
}
func TestHandleRestartFromStopped(t *testing.T) {
h := newHarness(t)
stoppedRecord := runningRecord(h.now)
stoppedRecord.Status = runtime.StatusStopped
stoppedAt := h.now.Add(-30 * time.Minute)
stoppedRecord.StoppedAt = &stoppedAt
h.records.stored["game-1"] = stoppedRecord
// No docker.Stop because inner stop short-circuits via replay no-op.
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
expectInnerStart(h)
service := h.build(t, "outer-token")
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
assert.Equal(t, "ctr-new", result.Record.CurrentContainerID)
}
// --- correlation id fallback -----------------------------------------
func TestHandleGeneratesCorrelationWhenSourceRefEmpty(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
expectInnerStart(h)
input := basicInput()
input.SourceRef = ""
// First newToken call yields the lease token, second yields the
// correlation id fallback.
service := h.build(t, "outer-token", "correlation-fallback")
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
stops := h.operationLogs.byKind(operation.OpKindStop)
starts := h.operationLogs.byKind(operation.OpKindStart)
restarts := h.operationLogs.byKind(operation.OpKindRestart)
require.Len(t, stops, 1)
require.Len(t, starts, 1)
require.Len(t, restarts, 1)
assert.Equal(t, "correlation-fallback", stops[0].SourceRef)
assert.Equal(t, "correlation-fallback", starts[0].SourceRef)
assert.Equal(t, "correlation-fallback", restarts[0].SourceRef)
}
// --- failure paths ---------------------------------------------------
func TestHandleNotFoundForMissingRecord(t *testing.T) {
h := newHarness(t)
service := h.build(t, "outer-token")
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeNotFound, result.ErrorCode)
assert.Empty(t, h.operationLogs.byKind(operation.OpKindStop))
assert.Empty(t, h.operationLogs.byKind(operation.OpKindStart))
require.Len(t, h.operationLogs.byKind(operation.OpKindRestart), 1)
}
func TestHandleConflictForRemovedRecord(t *testing.T) {
h := newHarness(t)
removed := runningRecord(h.now)
removed.Status = runtime.StatusRemoved
removed.CurrentContainerID = ""
removedAt := h.now.Add(-time.Hour)
removed.RemovedAt = &removedAt
h.records.stored["game-1"] = removed
service := h.build(t, "outer-token")
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)
}
func TestHandleConflictWhenLeaseBusy(t *testing.T) {
h := newHarness(t)
h.leases.acquired = false
service := h.build(t, "outer-token")
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)
assert.Empty(t, h.leases.releases, "release must not run when acquire returned false")
}
func TestHandlePropagatesInnerStopFailure(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(errors.New("daemon unreachable"))
service := h.build(t, "outer-token")
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode)
assert.Contains(t, result.ErrorMessage, "inner stop failed")
}
func TestHandleServiceUnavailableOnDockerRemoveFailure(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(errors.New("disk i/o"))
service := h.build(t, "outer-token")
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode)
assert.Contains(t, result.ErrorMessage, "docker remove")
// inner stop succeeded and wrote its log entry; the outer restart records the failure.
require.Len(t, h.operationLogs.byKind(operation.OpKindStop), 1)
require.Len(t, h.operationLogs.byKind(operation.OpKindRestart), 1)
}
func TestHandlePropagatesInnerStartFailure(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
h.docker.EXPECT().PullImage(gomock.Any(), imageRef, gomock.Any()).Return(errors.New("manifest unknown"))
service := h.build(t, "outer-token")
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeImagePullFailed, result.ErrorCode)
assert.Contains(t, result.ErrorMessage, "inner start failed")
}
// --- input validation ------------------------------------------------
func TestHandleRejectsInvalidInput(t *testing.T) {
h := newHarness(t)
service := h.build(t, "outer-token")
cases := []restartruntime.Input{
{GameID: "", OpSource: operation.OpSourceGMRest},
{GameID: "g", OpSource: operation.OpSource("bogus")},
}
for _, input := range cases {
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, startruntime.ErrorCodeInvalidRequest, result.ErrorCode)
}
}
// --- constructor -----------------------------------------------------
func TestNewServiceRejectsMissingDependencies(t *testing.T) {
h := newHarness(t)
deps := restartruntime.Dependencies{
Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
Telemetry: h.telemetry,
}
_, err := restartruntime.NewService(deps)
require.Error(t, err)
}
@@ -0,0 +1,68 @@
package startruntime
// Stable error codes returned in `Result.ErrorCode`. The values match the
// vocabulary frozen by `rtmanager/README.md §Error Model`,
// `rtmanager/api/internal-openapi.yaml`, and
// `rtmanager/api/runtime-jobs-asyncapi.yaml`. Although the constants live
// in the start-service package, they are the canonical set for every
// lifecycle service in `internal/service/`. Stop, restart, patch,
// cleanup, the REST handlers, and the stream consumers import these
// names rather than redeclare them; renaming any of them is a contract
// change.
const (
// ErrorCodeReplayNoOp reports that the request was an idempotent
// replay against an already-running record with the same image_ref.
ErrorCodeReplayNoOp = "replay_no_op"
// ErrorCodeStartConfigInvalid reports that the start request was
// rejected before any Docker work because of a validation failure
// (invalid image_ref shape, missing Docker network, unwritable state
// directory).
ErrorCodeStartConfigInvalid = "start_config_invalid"
// ErrorCodeImagePullFailed reports that the image pull stage failed.
ErrorCodeImagePullFailed = "image_pull_failed"
// ErrorCodeContainerStartFailed reports that `docker create` or
// `docker start` failed, or that the runtime record could not be
// installed after a successful Run.
ErrorCodeContainerStartFailed = "container_start_failed"
// ErrorCodeConflict reports an operation incompatible with the
// current runtime state (lease busy, running record with a different
// image_ref, cleanup attempted on a running runtime, restart or
// patch attempted on a removed record).
ErrorCodeConflict = "conflict"
// ErrorCodeServiceUnavailable reports that a steady-state dependency
// (Docker daemon, PostgreSQL, Redis) was unreachable for this call.
ErrorCodeServiceUnavailable = "service_unavailable"
// ErrorCodeInternal reports an unexpected error not classified by
// the other codes.
ErrorCodeInternal = "internal_error"
// ErrorCodeInvalidRequest reports that the request was rejected
// because of structural input validation (empty required fields,
// unknown enum values). Used by the stop / restart / patch /
// cleanup services for malformed Input. The start service uses the
// stricter `start_config_invalid` code instead because every start
// validation failure also raises an admin notification intent.
ErrorCodeInvalidRequest = "invalid_request"
// ErrorCodeNotFound reports that the runtime record requested by a
// stop, restart, patch or cleanup operation does not exist. Those
// services raise it; the start service never does (start installs
// the record on first call).
ErrorCodeNotFound = "not_found"
// ErrorCodeImageRefNotSemver reports that a patch operation was
// rejected because either the current or the new image reference
// could not be parsed as a semver tag.
ErrorCodeImageRefNotSemver = "image_ref_not_semver"
// ErrorCodeSemverPatchOnly reports that a patch operation was
// rejected because the major or minor component differs between the
// current and new image references.
ErrorCodeSemverPatchOnly = "semver_patch_only"
)
@@ -0,0 +1,940 @@
// Package startruntime implements the `start` lifecycle operation owned
// by Runtime Manager. The service is the single orchestrator behind
// both the asynchronous `runtime:start_jobs` consumer and the
// synchronous `POST /api/v1/internal/runtimes/{game_id}/start` REST
// handler; both callers obtain a deterministic Result with a stable
// `Outcome` / `ErrorCode` pair.
//
// Lifecycle and failure-mode semantics follow `rtmanager/README.md
// §Lifecycles → Start`. Design rationale is captured in
// `rtmanager/docs/services.md`.
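//
// A minimal, illustrative call sketch; svc stands for an already-wired
// *Service and the literal values are placeholders:
//
//	result, err := svc.Handle(ctx, startruntime.Input{
//		GameID:   "game-1",
//		ImageRef: "registry.example.com/galaxy/game:1.4.7",
//		OpSource: operation.OpSourceGMRest,
//	})
//	if err != nil {
//		// non-business failure only (nil receiver or nil context)
//	}
//	if result.Outcome == operation.OutcomeFailure {
//		// inspect result.ErrorCode and result.ErrorMessage
//	}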
package startruntime
import (
"context"
"crypto/rand"
"encoding/base64"
"encoding/json"
"errors"
"fmt"
"log/slog"
"os"
"path/filepath"
"strconv"
"strings"
"time"
"galaxy/notificationintent"
"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/domain/health"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/logging"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/telemetry"
"github.com/distribution/reference"
)
// Container labels applied to every engine container created by the
// start service. Frozen by `rtmanager/README.md §Container Model`.
const (
LabelOwner = "com.galaxy.owner"
LabelOwnerValue = "rtmanager"
LabelKind = "com.galaxy.kind"
LabelKindValue = "game-engine"
LabelGameID = "com.galaxy.game_id"
LabelEngineImageRef = "com.galaxy.engine_image_ref"
LabelStartedAtMs = "com.galaxy.started_at_ms"
// Image labels read at start time to derive resource limits.
imageLabelCPUQuota = "com.galaxy.cpu_quota"
imageLabelMemory = "com.galaxy.memory"
imageLabelPIDsLimit = "com.galaxy.pids_limit"
// HostnamePrefix is the constant prefix used to build the per-game
// container hostname (`galaxy-game-{game_id}`). The full hostname
// also forms the container name; restart and patch keep the same
// value so the engine endpoint stays stable across container
// recreates.
HostnamePrefix = "galaxy-game-"
// EngineStateBackCompatEnvName is the secondary env var name v1
// engines accept for the bind-mounted state directory. Always set
// alongside the configured primary name to honour the v1 backward
// compatibility commitment in `rtmanager/README.md §Container Model`.
EngineStateBackCompatEnvName = "STORAGE_PATH"
// leaseReleaseTimeout bounds the deferred lease-release call. A
// fresh background context is used so the release runs even when
// the request context was already canceled.
leaseReleaseTimeout = 5 * time.Second
)
// Input stores the per-call arguments for one start operation.
type Input struct {
// GameID identifies the platform game to start.
GameID string
// ImageRef stores the producer-resolved Docker reference of the
// engine image. Validated against `distribution/reference` before
// any Docker work.
ImageRef string
// OpSource classifies how the request entered Runtime Manager.
// Required: every operation_log entry carries an op_source.
OpSource operation.OpSource
// SourceRef stores the optional opaque per-source reference
// (Redis Stream entry id, REST request id, admin user id). Empty
// when the caller does not provide one.
SourceRef string
}
// Validate reports whether input carries the structural invariants the
// service requires.
func (input Input) Validate() error {
if strings.TrimSpace(input.GameID) == "" {
return fmt.Errorf("game id must not be empty")
}
if strings.TrimSpace(input.ImageRef) == "" {
return fmt.Errorf("image ref must not be empty")
}
if !input.OpSource.IsKnown() {
return fmt.Errorf("op source %q is unsupported", input.OpSource)
}
return nil
}
// Result stores the deterministic outcome of one Handle call.
type Result struct {
// Record carries the runtime record installed by the operation.
// Populated on success and on idempotent replay (`replay_no_op`);
// zero on failure.
Record runtime.RuntimeRecord
// Outcome reports whether the operation completed (success) or
// produced a stable failure code.
Outcome operation.Outcome
// ErrorCode stores the stable error code on failure, or
// `replay_no_op` on idempotent replay. Empty for fresh successes.
ErrorCode string
// ErrorMessage stores the operator-readable detail on failure.
// Empty for successes.
ErrorMessage string
}
// Dependencies groups the collaborators required by Service.
type Dependencies struct {
// RuntimeRecords reads and installs the durable runtime record.
RuntimeRecords ports.RuntimeRecordStore
// OperationLogs records the success / failure audit entry.
OperationLogs ports.OperationLogStore
// Docker drives the Docker daemon (network check, pull, inspect,
// run, remove).
Docker ports.DockerClient
// Leases serialises operations against the same game id.
Leases ports.GameLeaseStore
// HealthEvents publishes `runtime:health_events` and upserts the
// matching `health_snapshots` row.
HealthEvents ports.HealthEventPublisher
// Notifications publishes admin-only failure intents.
Notifications ports.NotificationIntentPublisher
// Lobby provides best-effort diagnostic context for the started
// game. May be nil; the start operation does not depend on it.
Lobby ports.LobbyInternalClient
// Container groups the per-container defaults and state-directory
// settings consumed at start time.
Container config.ContainerConfig
// Docker groups the Docker daemon settings (network, log driver,
// pull policy) consumed at start time.
DockerCfg config.DockerConfig
// Coordination supplies the per-game lease TTL.
Coordination config.CoordinationConfig
// Telemetry records start outcomes, lease latency, and health
// event counters. Required.
Telemetry *telemetry.Runtime
// Logger records structured service-level events. Defaults to
// `slog.Default()` when nil.
Logger *slog.Logger
// Clock supplies the wall-clock used for operation timestamps.
// Defaults to `time.Now` when nil.
Clock func() time.Time
// NewToken supplies a unique opaque lease token. Defaults to a
// 32-byte random base64url string when nil. Tests may override.
NewToken func() string
// PrepareStateDir creates the per-game state directory and
// returns its absolute host path. Defaults to a real-filesystem
// implementation that honours Container.GameStateRoot,
// Container.GameStateDirMode, and Container.GameStateOwner{UID,GID}.
// Tests override to point at a temporary directory.
PrepareStateDir func(gameID string) (string, error)
}
// Service executes the start lifecycle operation.
type Service struct {
runtimeRecords ports.RuntimeRecordStore
operationLogs ports.OperationLogStore
docker ports.DockerClient
leases ports.GameLeaseStore
healthEvents ports.HealthEventPublisher
notifications ports.NotificationIntentPublisher
lobby ports.LobbyInternalClient
containerCfg config.ContainerConfig
dockerCfg config.DockerConfig
leaseTTL time.Duration
telemetry *telemetry.Runtime
logger *slog.Logger
clock func() time.Time
newToken func() string
prepareStateDir func(gameID string) (string, error)
}
// NewService constructs one Service from deps.
func NewService(deps Dependencies) (*Service, error) {
switch {
case deps.RuntimeRecords == nil:
return nil, errors.New("new start runtime service: nil runtime records")
case deps.OperationLogs == nil:
return nil, errors.New("new start runtime service: nil operation logs")
case deps.Docker == nil:
return nil, errors.New("new start runtime service: nil docker client")
case deps.Leases == nil:
return nil, errors.New("new start runtime service: nil lease store")
case deps.HealthEvents == nil:
return nil, errors.New("new start runtime service: nil health events publisher")
case deps.Notifications == nil:
return nil, errors.New("new start runtime service: nil notification publisher")
case deps.Telemetry == nil:
return nil, errors.New("new start runtime service: nil telemetry runtime")
}
if err := deps.Container.Validate(); err != nil {
return nil, fmt.Errorf("new start runtime service: container config: %w", err)
}
if err := deps.DockerCfg.Validate(); err != nil {
return nil, fmt.Errorf("new start runtime service: docker config: %w", err)
}
if err := deps.Coordination.Validate(); err != nil {
return nil, fmt.Errorf("new start runtime service: coordination config: %w", err)
}
clock := deps.Clock
if clock == nil {
clock = time.Now
}
logger := deps.Logger
if logger == nil {
logger = slog.Default()
}
logger = logger.With("service", "rtmanager.startruntime")
newToken := deps.NewToken
if newToken == nil {
newToken = defaultTokenGenerator()
}
prepareStateDir := deps.PrepareStateDir
if prepareStateDir == nil {
prepareStateDir = newDefaultStateDirPreparer(deps.Container)
}
return &Service{
runtimeRecords: deps.RuntimeRecords,
operationLogs: deps.OperationLogs,
docker: deps.Docker,
leases: deps.Leases,
healthEvents: deps.HealthEvents,
notifications: deps.Notifications,
lobby: deps.Lobby,
containerCfg: deps.Container,
dockerCfg: deps.DockerCfg,
leaseTTL: deps.Coordination.GameLeaseTTL,
telemetry: deps.Telemetry,
logger: logger,
clock: clock,
newToken: newToken,
prepareStateDir: prepareStateDir,
}, nil
}
// Handle executes one start operation end-to-end. The Go-level error
// return is reserved for non-business failures (nil context, nil
// receiver). Every business outcome — fresh success, idempotent
// replay, or any of the stable failure modes — flows through Result.
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
if service == nil {
return Result{}, errors.New("start runtime: nil service")
}
if ctx == nil {
return Result{}, errors.New("start runtime: nil context")
}
opStartedAt := service.clock().UTC()
if err := input.Validate(); err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: ErrorCodeStartConfigInvalid,
errorMessage: err.Error(),
notificationType: notificationintent.NotificationTypeRuntimeStartConfigInvalid,
}), nil
}
token := service.newToken()
leaseStart := service.clock()
acquired, err := service.leases.TryAcquire(ctx, input.GameID, token, service.leaseTTL)
service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(leaseStart))
if err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: ErrorCodeServiceUnavailable,
errorMessage: fmt.Sprintf("acquire game lease: %s", err.Error()),
}), nil
}
if !acquired {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: ErrorCodeConflict,
errorMessage: "another lifecycle operation is in progress for this game",
}), nil
}
defer service.releaseLease(ctx, input.GameID, token)
return service.runUnderLease(ctx, input, opStartedAt)
}
// Run executes the start lifecycle assuming the per-game lease is
// already held by the caller. The method is reserved for orchestrator
// services in `internal/service/` that compose start with another
// operation under a single outer lease (restart and patch). External
// callers must use Handle, which acquires and releases the lease
// itself.
//
// Run still validates input and reports business outcomes through
// Result; the Go-level error return is reserved for non-business
// failures (nil context, nil receiver). Operation log entries,
// telemetry counters, health events and admin-only notification
// intents fire identically to Handle.
func (service *Service) Run(ctx context.Context, input Input) (Result, error) {
if service == nil {
return Result{}, errors.New("start runtime: nil service")
}
if ctx == nil {
return Result{}, errors.New("start runtime: nil context")
}
opStartedAt := service.clock().UTC()
if err := input.Validate(); err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: ErrorCodeStartConfigInvalid,
errorMessage: err.Error(),
notificationType: notificationintent.NotificationTypeRuntimeStartConfigInvalid,
}), nil
}
return service.runUnderLease(ctx, input, opStartedAt)
}
// runUnderLease executes the post-validation, lease-protected start
// steps shared by Handle and Run. Callers must validate input and
// acquire the lease (when applicable) before invocation.
func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) {
existing, hasExisting, err := service.loadExisting(ctx, input.GameID)
if err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: ErrorCodeInternal,
errorMessage: fmt.Sprintf("load runtime record: %s", err.Error()),
}), nil
}
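// A running record with the same image replays as a no-op; a running
// record with a different image is reported as a conflict.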
if hasExisting && existing.Status == runtime.StatusRunning {
if existing.CurrentImageRef == input.ImageRef {
return service.recordReplayNoOp(ctx, opStartedAt, input, existing), nil
}
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: ErrorCodeConflict,
errorMessage: fmt.Sprintf("runtime already running with image_ref %q", existing.CurrentImageRef),
}), nil
}
service.fetchLobbyDiagnostic(ctx, input.GameID)
if err := validateImageRef(input.ImageRef); err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: ErrorCodeStartConfigInvalid,
errorMessage: fmt.Sprintf("invalid image_ref: %s", err.Error()),
notificationType: notificationintent.NotificationTypeRuntimeStartConfigInvalid,
}), nil
}
if err := service.docker.EnsureNetwork(ctx, service.dockerCfg.Network); err != nil {
if errors.Is(err, ports.ErrNetworkMissing) {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: ErrorCodeStartConfigInvalid,
errorMessage: fmt.Sprintf("docker network %q is missing", service.dockerCfg.Network),
notificationType: notificationintent.NotificationTypeRuntimeStartConfigInvalid,
}), nil
}
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: ErrorCodeServiceUnavailable,
errorMessage: fmt.Sprintf("ensure docker network: %s", err.Error()),
}), nil
}
if err := service.docker.PullImage(ctx, input.ImageRef, ports.PullPolicy(service.dockerCfg.PullPolicy)); err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: ErrorCodeImagePullFailed,
errorMessage: err.Error(),
notificationType: notificationintent.NotificationTypeRuntimeImagePullFailed,
}), nil
}
imageInspect, err := service.docker.InspectImage(ctx, input.ImageRef)
if err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: ErrorCodeImagePullFailed,
errorMessage: fmt.Sprintf("inspect image: %s", err.Error()),
notificationType: notificationintent.NotificationTypeRuntimeImagePullFailed,
}), nil
}
cpuQuota, memory, pidsLimit := service.resolveLimits(imageInspect.Labels)
statePath, err := service.prepareStateDir(input.GameID)
if err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: ErrorCodeStartConfigInvalid,
errorMessage: fmt.Sprintf("prepare state directory: %s", err.Error()),
notificationType: notificationintent.NotificationTypeRuntimeStartConfigInvalid,
}), nil
}
hostname := containerHostname(input.GameID)
spec := ports.RunSpec{
Name: hostname,
Image: input.ImageRef,
Hostname: hostname,
Network: service.dockerCfg.Network,
Env: service.buildEnv(),
Labels: service.buildLabels(input.GameID, input.ImageRef, opStartedAt),
BindMounts: []ports.BindMount{{
HostPath: statePath,
MountPath: service.containerCfg.EngineStateMountPath,
ReadOnly: false,
}},
LogDriver: service.dockerCfg.LogDriver,
LogOpts: parseLogOpts(service.dockerCfg.LogOpts),
CPUQuota: cpuQuota,
Memory: memory,
PIDsLimit: pidsLimit,
}
runResult, err := service.docker.Run(ctx, spec)
if err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: ErrorCodeContainerStartFailed,
errorMessage: err.Error(),
notificationType: notificationintent.NotificationTypeRuntimeContainerStartFailed,
}), nil
}
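// Preserve the original CreatedAt when re-starting a previously stopped
// or removed record so creation time is not reset by re-starts.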
createdAt := opStartedAt
if hasExisting && !existing.CreatedAt.IsZero() {
createdAt = existing.CreatedAt
}
startedAt := runResult.StartedAt
record := runtime.RuntimeRecord{
GameID: input.GameID,
Status: runtime.StatusRunning,
CurrentContainerID: runResult.ContainerID,
CurrentImageRef: input.ImageRef,
EngineEndpoint: runResult.EngineEndpoint,
StatePath: statePath,
DockerNetwork: service.dockerCfg.Network,
StartedAt: &startedAt,
LastOpAt: startedAt,
CreatedAt: createdAt,
}
if err := service.runtimeRecords.Upsert(ctx, record); err != nil {
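// Roll back the container we just started so it is not left running
// without a durable runtime record pointing at it.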
service.bestEffortRemove(input.GameID, runResult.ContainerID)
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: ErrorCodeContainerStartFailed,
errorMessage: fmt.Sprintf("upsert runtime record: %s", err.Error()),
containerID: runResult.ContainerID,
notificationType: notificationintent.NotificationTypeRuntimeContainerStartFailed,
}), nil
}
finishedAt := service.clock().UTC()
service.bestEffortAppend(ctx, operation.OperationEntry{
GameID: input.GameID,
OpKind: operation.OpKindStart,
OpSource: input.OpSource,
SourceRef: input.SourceRef,
ImageRef: input.ImageRef,
ContainerID: runResult.ContainerID,
Outcome: operation.OutcomeSuccess,
StartedAt: opStartedAt,
FinishedAt: &finishedAt,
})
service.bestEffortPublishHealth(ctx, ports.HealthEventEnvelope{
GameID: input.GameID,
ContainerID: runResult.ContainerID,
EventType: health.EventTypeContainerStarted,
OccurredAt: startedAt,
Details: containerStartedDetails(input.ImageRef),
})
service.telemetry.RecordStartOutcome(ctx, string(operation.OutcomeSuccess), "", string(input.OpSource))
service.telemetry.RecordHealthEvent(ctx, string(health.EventTypeContainerStarted))
logArgs := []any{
"game_id", input.GameID,
"container_id", runResult.ContainerID,
"image_ref", input.ImageRef,
"op_source", string(input.OpSource),
}
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
service.logger.InfoContext(ctx, "runtime started", logArgs...)
return Result{
Record: record,
Outcome: operation.OutcomeSuccess,
}, nil
}
// failureCtx groups the inputs to recordFailure so the Handle, Run, and
// runUnderLease call sites stay readable.
type failureCtx struct {
opStartedAt time.Time
input Input
errorCode string
errorMessage string
containerID string
notificationType notificationintent.NotificationType
}
// recordFailure records the failure operation_log entry, publishes the
// matching admin-only notification intent (when applicable), and emits
// telemetry. All side effects are best-effort; a downstream failure is
// logged but does not change the returned Result.
func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result {
finishedAt := service.clock().UTC()
service.bestEffortAppend(ctx, operation.OperationEntry{
GameID: fc.input.GameID,
OpKind: operation.OpKindStart,
OpSource: fc.input.OpSource,
SourceRef: fc.input.SourceRef,
ImageRef: fc.input.ImageRef,
ContainerID: fc.containerID,
Outcome: operation.OutcomeFailure,
ErrorCode: fc.errorCode,
ErrorMessage: fc.errorMessage,
StartedAt: fc.opStartedAt,
FinishedAt: &finishedAt,
})
if fc.notificationType != "" {
service.bestEffortNotify(ctx, fc)
}
service.telemetry.RecordStartOutcome(ctx, string(operation.OutcomeFailure), fc.errorCode, string(fc.input.OpSource))
logArgs := []any{
"game_id", fc.input.GameID,
"image_ref", fc.input.ImageRef,
"op_source", string(fc.input.OpSource),
"error_code", fc.errorCode,
"error_message", fc.errorMessage,
}
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
service.logger.WarnContext(ctx, "runtime start failed", logArgs...)
return Result{
Outcome: operation.OutcomeFailure,
ErrorCode: fc.errorCode,
ErrorMessage: fc.errorMessage,
}
}
// recordReplayNoOp records the idempotent replay outcome and returns
// the existing record. The operation_log entry is appended best-effort
// so audit history captures the replay; telemetry counts the call as a
// successful start with `error_code=replay_no_op`.
func (service *Service) recordReplayNoOp(ctx context.Context, opStartedAt time.Time, input Input, existing runtime.RuntimeRecord) Result {
finishedAt := service.clock().UTC()
service.bestEffortAppend(ctx, operation.OperationEntry{
GameID: input.GameID,
OpKind: operation.OpKindStart,
OpSource: input.OpSource,
SourceRef: input.SourceRef,
ImageRef: input.ImageRef,
ContainerID: existing.CurrentContainerID,
Outcome: operation.OutcomeSuccess,
ErrorCode: ErrorCodeReplayNoOp,
StartedAt: opStartedAt,
FinishedAt: &finishedAt,
})
service.telemetry.RecordStartOutcome(ctx, string(operation.OutcomeSuccess), ErrorCodeReplayNoOp, string(input.OpSource))
logArgs := []any{
"game_id", input.GameID,
"container_id", existing.CurrentContainerID,
"image_ref", input.ImageRef,
"op_source", string(input.OpSource),
}
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
service.logger.InfoContext(ctx, "runtime start replay no-op", logArgs...)
return Result{
Record: existing,
Outcome: operation.OutcomeSuccess,
ErrorCode: ErrorCodeReplayNoOp,
}
}
// loadExisting reads the runtime record for gameID. The boolean return
// reports whether a record exists; ErrNotFound is translated to
// (zero, false, nil) so the caller does not branch on the sentinel
// elsewhere.
func (service *Service) loadExisting(ctx context.Context, gameID string) (runtime.RuntimeRecord, bool, error) {
record, err := service.runtimeRecords.Get(ctx, gameID)
switch {
case errors.Is(err, runtime.ErrNotFound):
return runtime.RuntimeRecord{}, false, nil
case err != nil:
return runtime.RuntimeRecord{}, false, err
default:
return record, true, nil
}
}
// fetchLobbyDiagnostic enriches the request log with the Lobby-side game
// record on a best-effort basis. A nil Lobby client or any transport
// failure is logged and the start operation continues.
func (service *Service) fetchLobbyDiagnostic(ctx context.Context, gameID string) {
if service.lobby == nil {
return
}
record, err := service.lobby.GetGame(ctx, gameID)
if err != nil {
service.logger.DebugContext(ctx, "lobby diagnostic fetch failed",
"game_id", gameID,
"err", err.Error(),
)
return
}
service.logger.DebugContext(ctx, "lobby diagnostic fetched",
"game_id", gameID,
"lobby_status", record.Status,
"lobby_target_engine_version", record.TargetEngineVersion,
)
}
// resolveLimits derives the per-container resource limits from the
// resolved image's labels with config-driven fallbacks. Unparseable
// label values silently fall back to the configured default; operators
// see the chosen value through `rtmanager.docker_op_latency` and start
// logs.
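// For example, a `com.galaxy.cpu_quota` label value of "0.5" overrides
// DefaultCPUQuota, while a value such as "abc" or "-1" keeps the default.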
func (service *Service) resolveLimits(labels map[string]string) (cpuQuota float64, memory string, pidsLimit int) {
cpuQuota = service.containerCfg.DefaultCPUQuota
memory = service.containerCfg.DefaultMemory
pidsLimit = service.containerCfg.DefaultPIDsLimit
if raw, ok := labels[imageLabelCPUQuota]; ok {
if value, err := strconv.ParseFloat(raw, 64); err == nil && value > 0 {
cpuQuota = value
}
}
if raw, ok := labels[imageLabelMemory]; ok && strings.TrimSpace(raw) != "" {
memory = raw
}
if raw, ok := labels[imageLabelPIDsLimit]; ok {
if value, err := strconv.Atoi(raw); err == nil && value > 0 {
pidsLimit = value
}
}
return cpuQuota, memory, pidsLimit
}
// buildEnv assembles the env-var map handed to the engine. Both the
// configured primary name and `STORAGE_PATH` are set per
// `rtmanager/README.md §Container Model` v1 backward compatibility.
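// For example, with EngineStateEnvName `GAME_STATE_PATH`, both it and the
// back-compat `STORAGE_PATH` variable resolve to the engine state mount path.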
func (service *Service) buildEnv() map[string]string {
mount := service.containerCfg.EngineStateMountPath
env := map[string]string{
service.containerCfg.EngineStateEnvName: mount,
EngineStateBackCompatEnvName: mount,
}
return env
}
// buildLabels assembles the container labels per
// `rtmanager/README.md §Container Model`.
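// The owner and kind labels mark the container as an RTM-managed game
// engine; the started-at label is encoded as Unix milliseconds.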
func (service *Service) buildLabels(gameID, imageRef string, startedAt time.Time) map[string]string {
return map[string]string{
LabelOwner: LabelOwnerValue,
LabelKind: LabelKindValue,
LabelGameID: gameID,
LabelEngineImageRef: imageRef,
LabelStartedAtMs: strconv.FormatInt(startedAt.UTC().UnixMilli(), 10),
}
}
// releaseLease releases the per-game lease in a fresh background
// context so a canceled request context does not leave the lease
// pinned for its TTL.
func (service *Service) releaseLease(ctx context.Context, gameID, token string) {
cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
defer cancel()
if err := service.leases.Release(cleanupCtx, gameID, token); err != nil {
service.logger.WarnContext(ctx, "release game lease",
"game_id", gameID,
"err", err.Error(),
)
}
}
// bestEffortAppend writes one operation_log entry. A failure is logged
// and discarded; the durable runtime record (or its absence) remains
// the source of truth.
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
if _, err := service.operationLogs.Append(ctx, entry); err != nil {
service.logger.ErrorContext(ctx, "append operation log",
"game_id", entry.GameID,
"op_kind", string(entry.OpKind),
"outcome", string(entry.Outcome),
"error_code", entry.ErrorCode,
"err", err.Error(),
)
}
}
// bestEffortPublishHealth emits one health event + snapshot upsert.
// Failures degrade silently per `rtmanager/README.md §Notification
// Contracts`; the runtime record remains the source of truth.
func (service *Service) bestEffortPublishHealth(ctx context.Context, envelope ports.HealthEventEnvelope) {
if err := service.healthEvents.Publish(ctx, envelope); err != nil {
service.logger.ErrorContext(ctx, "publish health event",
"game_id", envelope.GameID,
"container_id", envelope.ContainerID,
"event_type", string(envelope.EventType),
"err", err.Error(),
)
}
}
// bestEffortNotify publishes one admin-only failure intent. Failures
// degrade silently because the source business state already reflects
// the outcome.
func (service *Service) bestEffortNotify(ctx context.Context, fc failureCtx) {
intent, err := buildFailureIntent(fc, service.clock().UTC())
if err != nil {
service.logger.ErrorContext(ctx, "build notification intent",
"game_id", fc.input.GameID,
"notification_type", string(fc.notificationType),
"err", err.Error(),
)
return
}
if err := service.notifications.Publish(ctx, intent); err != nil {
service.logger.ErrorContext(ctx, "publish notification intent",
"game_id", fc.input.GameID,
"notification_type", string(fc.notificationType),
"err", err.Error(),
)
return
}
service.telemetry.RecordNotificationIntent(ctx, string(fc.notificationType))
}
// bestEffortRemove forces removal of a container left running by a
// failed start that progressed past Run but could not register the
// runtime record. Failures degrade silently — the reconciler's periodic
// pass adopts any orphaned container it observes.
func (service *Service) bestEffortRemove(gameID, containerID string) {
cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
defer cancel()
if err := service.docker.Remove(cleanupCtx, containerID); err != nil {
service.logger.ErrorContext(cleanupCtx, "rollback container after upsert failure",
"game_id", gameID,
"container_id", containerID,
"err", err.Error(),
)
}
}
// containerHostname builds the per-game hostname that doubles as the
// Docker container name.
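// For example, game id "game-1" yields "galaxy-game-game-1".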
func containerHostname(gameID string) string {
return HostnamePrefix + gameID
}
// containerStartedDetails builds the `details` payload required by the
// `container_started` AsyncAPI variant.
func containerStartedDetails(imageRef string) json.RawMessage {
payload := map[string]string{"image_ref": imageRef}
encoded, _ := json.Marshal(payload)
return encoded
}
// validateImageRef rejects malformed Docker references before any
// daemon round-trip. The validation surfaces as `start_config_invalid`;
// daemon-side rejections after a valid parse are reported as
// `image_pull_failed`.
func validateImageRef(ref string) error {
if strings.TrimSpace(ref) == "" {
return fmt.Errorf("image ref must not be empty")
}
if _, err := reference.ParseNormalizedNamed(ref); err != nil {
return err
}
return nil
}
// parseLogOpts turns the `key=value,key2=value2` shape of the
// `RTMANAGER_DOCKER_LOG_OPTS` config into a map suitable for the
// Docker SDK. Empty input returns nil so the SDK uses driver defaults.
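// For example, "max-size=10m,max-file=3" yields
// {"max-size": "10m", "max-file": "3"}; entries with no '=' or an empty
// key are skipped.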
func parseLogOpts(raw string) map[string]string {
if strings.TrimSpace(raw) == "" {
return nil
}
out := make(map[string]string)
for part := range strings.SplitSeq(raw, ",") {
entry := strings.TrimSpace(part)
if entry == "" {
continue
}
index := strings.IndexByte(entry, '=')
if index <= 0 {
continue
}
out[entry[:index]] = entry[index+1:]
}
if len(out) == 0 {
return nil
}
return out
}
// buildFailureIntent constructs the admin-only notification intent for
// fc. The idempotency key is scoped per (notification_type, game_id,
// attempted_at_ms) so the same failure published twice for one attempt is
// recognised as a duplicate by Notification Service.
func buildFailureIntent(fc failureCtx, attemptedAt time.Time) (notificationintent.Intent, error) {
attemptedAtMs := attemptedAt.UnixMilli()
idempotencyKey := fmt.Sprintf("%s.%s.%d", fc.notificationType, fc.input.GameID, attemptedAtMs)
metadata := notificationintent.Metadata{
IdempotencyKey: idempotencyKey,
OccurredAt: attemptedAt,
}
switch fc.notificationType {
case notificationintent.NotificationTypeRuntimeImagePullFailed:
return notificationintent.NewRuntimeImagePullFailedIntent(metadata, notificationintent.RuntimeImagePullFailedPayload{
GameID: fc.input.GameID,
ImageRef: fc.input.ImageRef,
ErrorCode: fc.errorCode,
ErrorMessage: fc.errorMessage,
AttemptedAtMs: attemptedAtMs,
})
case notificationintent.NotificationTypeRuntimeContainerStartFailed:
return notificationintent.NewRuntimeContainerStartFailedIntent(metadata, notificationintent.RuntimeContainerStartFailedPayload{
GameID: fc.input.GameID,
ImageRef: fc.input.ImageRef,
ErrorCode: fc.errorCode,
ErrorMessage: fc.errorMessage,
AttemptedAtMs: attemptedAtMs,
})
case notificationintent.NotificationTypeRuntimeStartConfigInvalid:
return notificationintent.NewRuntimeStartConfigInvalidIntent(metadata, notificationintent.RuntimeStartConfigInvalidPayload{
GameID: fc.input.GameID,
ImageRef: fc.input.ImageRef,
ErrorCode: fc.errorCode,
ErrorMessage: fc.errorMessage,
AttemptedAtMs: attemptedAtMs,
})
default:
return notificationintent.Intent{}, fmt.Errorf("unsupported notification type %q", fc.notificationType)
}
}
// defaultTokenGenerator returns a function that produces 32-byte
// base64url-encoded tokens. The randomness source is `crypto/rand`;
// a read failure falls back to a fixed, non-unique token so the caller
// observes a TryAcquire collision rather than a panic on a degraded
// entropy source.
func defaultTokenGenerator() func() string {
return func() string {
var buf [32]byte
if _, err := rand.Read(buf[:]); err != nil {
return "rtmanager-fallback-token"
}
return base64.RawURLEncoding.EncodeToString(buf[:])
}
}
// newDefaultStateDirPreparer returns a function that creates the
// per-game state directory under cfg.GameStateRoot with the configured
// permissions and ownership. The function is overridable through
// Dependencies.PrepareStateDir; tests inject a temporary-dir fake.
func newDefaultStateDirPreparer(cfg config.ContainerConfig) func(gameID string) (string, error) {
mode := os.FileMode(cfg.GameStateDirMode)
uid := cfg.GameStateOwnerUID
gid := cfg.GameStateOwnerGID
root := cfg.GameStateRoot
return func(gameID string) (string, error) {
path := filepath.Join(root, gameID)
if err := os.MkdirAll(path, mode); err != nil {
return "", fmt.Errorf("create state dir %q: %w", path, err)
}
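// MkdirAll applies the process umask to the requested mode, so chmod
// explicitly to guarantee the configured permissions on the directory.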
if err := os.Chmod(path, mode); err != nil {
return "", fmt.Errorf("chmod state dir %q: %w", path, err)
}
if err := os.Chown(path, uid, gid); err != nil {
return "", fmt.Errorf("chown state dir %q: %w", path, err)
}
return path, nil
}
}
@@ -0,0 +1,693 @@
package startruntime_test
import (
"context"
"encoding/json"
"errors"
"sync"
"testing"
"time"
"galaxy/notificationintent"
"galaxy/rtmanager/internal/adapters/docker/mocks"
"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/domain/health"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/startruntime"
"galaxy/rtmanager/internal/telemetry"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/mock/gomock"
)
// --- test doubles -----------------------------------------------------
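// The hand-written fakes below record every call so tests can assert on
// side effects; the Docker client is the only generated gomock double.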
type fakeRuntimeRecords struct {
mu sync.Mutex
stored map[string]runtime.RuntimeRecord
getErr error
upsertErr error
upserts []runtime.RuntimeRecord
}
func newFakeRuntimeRecords() *fakeRuntimeRecords {
return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}}
}
func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
s.mu.Lock()
defer s.mu.Unlock()
if s.getErr != nil {
return runtime.RuntimeRecord{}, s.getErr
}
record, ok := s.stored[gameID]
if !ok {
return runtime.RuntimeRecord{}, runtime.ErrNotFound
}
return record, nil
}
func (s *fakeRuntimeRecords) Upsert(_ context.Context, record runtime.RuntimeRecord) error {
s.mu.Lock()
defer s.mu.Unlock()
if s.upsertErr != nil {
return s.upsertErr
}
s.upserts = append(s.upserts, record)
s.stored[record.GameID] = record
return nil
}
func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error {
return errors.New("not used in start tests")
}
func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) {
return nil, errors.New("not used in start tests")
}
func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
return nil, errors.New("not used in start tests")
}
type fakeOperationLogs struct {
mu sync.Mutex
appendErr error
appends []operation.OperationEntry
}
func (s *fakeOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) {
s.mu.Lock()
defer s.mu.Unlock()
if s.appendErr != nil {
return 0, s.appendErr
}
s.appends = append(s.appends, entry)
return int64(len(s.appends)), nil
}
func (s *fakeOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) {
return nil, errors.New("not used in start tests")
}
func (s *fakeOperationLogs) lastAppend() (operation.OperationEntry, bool) {
s.mu.Lock()
defer s.mu.Unlock()
if len(s.appends) == 0 {
return operation.OperationEntry{}, false
}
return s.appends[len(s.appends)-1], true
}
type fakeLeases struct {
acquired bool
acquireErr error
releaseErr error
mu sync.Mutex
acquires []string
releases []string
}
func (l *fakeLeases) TryAcquire(_ context.Context, _, token string, _ time.Duration) (bool, error) {
l.mu.Lock()
defer l.mu.Unlock()
l.acquires = append(l.acquires, token)
if l.acquireErr != nil {
return false, l.acquireErr
}
return l.acquired, nil
}
func (l *fakeLeases) Release(_ context.Context, _, token string) error {
l.mu.Lock()
defer l.mu.Unlock()
l.releases = append(l.releases, token)
return l.releaseErr
}
type fakeHealthEvents struct {
mu sync.Mutex
publishErr error
envelopes []ports.HealthEventEnvelope
}
func (h *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error {
h.mu.Lock()
defer h.mu.Unlock()
if h.publishErr != nil {
return h.publishErr
}
h.envelopes = append(h.envelopes, envelope)
return nil
}
type fakeNotifications struct {
mu sync.Mutex
publishErr error
intents []notificationintent.Intent
}
func (n *fakeNotifications) Publish(_ context.Context, intent notificationintent.Intent) error {
n.mu.Lock()
defer n.mu.Unlock()
if n.publishErr != nil {
return n.publishErr
}
n.intents = append(n.intents, intent)
return nil
}
type fakeLobby struct {
record ports.LobbyGameRecord
err error
mu sync.Mutex
calls []string
}
func (l *fakeLobby) GetGame(_ context.Context, gameID string) (ports.LobbyGameRecord, error) {
l.mu.Lock()
defer l.mu.Unlock()
l.calls = append(l.calls, gameID)
if l.err != nil {
return ports.LobbyGameRecord{}, l.err
}
return l.record, nil
}
// --- harness ----------------------------------------------------------
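// harness bundles the fakes, the Docker mock, and a fixed clock shared by
// every test; build wires them into a Service with representative config.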
type harness struct {
records *fakeRuntimeRecords
operationLogs *fakeOperationLogs
docker *mocks.MockDockerClient
leases *fakeLeases
healthEvents *fakeHealthEvents
notifications *fakeNotifications
lobby *fakeLobby
telemetry *telemetry.Runtime
now time.Time
stateDir string
}
func newHarness(t *testing.T) *harness {
t.Helper()
ctrl := gomock.NewController(t)
t.Cleanup(ctrl.Finish)
telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
require.NoError(t, err)
return &harness{
records: newFakeRuntimeRecords(),
operationLogs: &fakeOperationLogs{},
docker: mocks.NewMockDockerClient(ctrl),
leases: &fakeLeases{acquired: true},
healthEvents: &fakeHealthEvents{},
notifications: &fakeNotifications{},
lobby: &fakeLobby{},
telemetry: telemetryRuntime,
now: time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC),
stateDir: "/var/lib/galaxy/games/game-1",
}
}
func (h *harness) build(t *testing.T) *startruntime.Service {
t.Helper()
containerCfg := config.ContainerConfig{
DefaultCPUQuota: 1.0,
DefaultMemory: "512m",
DefaultPIDsLimit: 512,
StopTimeout: 30 * time.Second,
Retention: 30 * 24 * time.Hour,
EngineStateMountPath: "/var/lib/galaxy-game",
EngineStateEnvName: "GAME_STATE_PATH",
GameStateDirMode: 0o750,
GameStateRoot: "/var/lib/galaxy/games",
}
dockerCfg := config.DockerConfig{
Host: "unix:///var/run/docker.sock",
Network: "galaxy-net",
LogDriver: "json-file",
PullPolicy: config.ImagePullPolicyIfMissing,
}
coordinationCfg := config.CoordinationConfig{GameLeaseTTL: time.Minute}
service, err := startruntime.NewService(startruntime.Dependencies{
RuntimeRecords: h.records,
OperationLogs: h.operationLogs,
Docker: h.docker,
Leases: h.leases,
HealthEvents: h.healthEvents,
Notifications: h.notifications,
Lobby: h.lobby,
Container: containerCfg,
DockerCfg: dockerCfg,
Coordination: coordinationCfg,
Telemetry: h.telemetry,
Clock: func() time.Time { return h.now },
NewToken: func() string { return "token-A" },
PrepareStateDir: func(_ string) (string, error) {
return h.stateDir, nil
},
})
require.NoError(t, err)
return service
}
func basicInput() startruntime.Input {
return startruntime.Input{
GameID: "game-1",
ImageRef: "registry.example.com/galaxy/game:1.4.7",
OpSource: operation.OpSourceLobbyStream,
SourceRef: "1700000000000-0",
}
}
func sampleRunResult(now time.Time) ports.RunResult {
return ports.RunResult{
ContainerID: "ctr-123",
EngineEndpoint: "http://galaxy-game-game-1:8080",
StartedAt: now,
}
}
// --- happy path -------------------------------------------------------
func TestHandleHappyPath(t *testing.T) {
h := newHarness(t)
input := basicInput()
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, ports.PullPolicy(config.ImagePullPolicyIfMissing)).Return(nil)
h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{
Ref: input.ImageRef,
Labels: map[string]string{
"com.galaxy.cpu_quota": "0.5",
"com.galaxy.memory": "256m",
"com.galaxy.pids_limit": "256",
},
}, nil)
h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).DoAndReturn(func(_ context.Context, spec ports.RunSpec) (ports.RunResult, error) {
assert.Equal(t, "galaxy-game-game-1", spec.Name)
assert.Equal(t, "galaxy-game-game-1", spec.Hostname)
assert.Equal(t, input.ImageRef, spec.Image)
assert.Equal(t, "galaxy-net", spec.Network)
assert.Equal(t, "json-file", spec.LogDriver)
assert.InDelta(t, 0.5, spec.CPUQuota, 0)
assert.Equal(t, "256m", spec.Memory)
assert.Equal(t, 256, spec.PIDsLimit)
assert.Equal(t, h.stateDir, spec.BindMounts[0].HostPath)
assert.Equal(t, "/var/lib/galaxy-game", spec.BindMounts[0].MountPath)
assert.Equal(t, "/var/lib/galaxy-game", spec.Env["GAME_STATE_PATH"])
assert.Equal(t, "/var/lib/galaxy-game", spec.Env["STORAGE_PATH"])
assert.Equal(t, "rtmanager", spec.Labels[startruntime.LabelOwner])
assert.Equal(t, "game-engine", spec.Labels[startruntime.LabelKind])
assert.Equal(t, input.GameID, spec.Labels[startruntime.LabelGameID])
assert.Equal(t, input.ImageRef, spec.Labels[startruntime.LabelEngineImageRef])
return sampleRunResult(h.now), nil
})
service := h.build(t)
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
assert.Empty(t, result.ErrorCode)
assert.Equal(t, runtime.StatusRunning, result.Record.Status)
assert.Equal(t, "ctr-123", result.Record.CurrentContainerID)
assert.Equal(t, input.ImageRef, result.Record.CurrentImageRef)
assert.Equal(t, "http://galaxy-game-game-1:8080", result.Record.EngineEndpoint)
assert.Equal(t, h.stateDir, result.Record.StatePath)
assert.Equal(t, "galaxy-net", result.Record.DockerNetwork)
require.NotNil(t, result.Record.StartedAt)
assert.Equal(t, h.now, *result.Record.StartedAt)
assert.Equal(t, h.now, result.Record.LastOpAt)
assert.Equal(t, h.now, result.Record.CreatedAt)
require.Len(t, h.records.upserts, 1)
require.Len(t, h.operationLogs.appends, 1)
last, _ := h.operationLogs.lastAppend()
assert.Equal(t, operation.OpKindStart, last.OpKind)
assert.Equal(t, operation.OutcomeSuccess, last.Outcome)
assert.Empty(t, last.ErrorCode)
assert.Equal(t, "ctr-123", last.ContainerID)
require.Len(t, h.healthEvents.envelopes, 1)
assert.Equal(t, health.EventTypeContainerStarted, h.healthEvents.envelopes[0].EventType)
var details map[string]string
require.NoError(t, json.Unmarshal(h.healthEvents.envelopes[0].Details, &details))
assert.Equal(t, input.ImageRef, details["image_ref"])
assert.Empty(t, h.notifications.intents, "no notification intent expected on success")
assert.Equal(t, []string{"token-A"}, h.leases.acquires)
assert.Equal(t, []string{"token-A"}, h.leases.releases)
assert.Equal(t, []string{input.GameID}, h.lobby.calls)
}
// --- idempotent replay ------------------------------------------------
func TestHandleReplayNoOpForRunningRecordWithSameImageRef(t *testing.T) {
h := newHarness(t)
input := basicInput()
startedAt := h.now.Add(-time.Hour)
h.records.stored[input.GameID] = runtime.RuntimeRecord{
GameID: input.GameID,
Status: runtime.StatusRunning,
CurrentContainerID: "ctr-prev",
CurrentImageRef: input.ImageRef,
EngineEndpoint: "http://galaxy-game-game-1:8080",
StatePath: h.stateDir,
DockerNetwork: "galaxy-net",
StartedAt: &startedAt,
LastOpAt: startedAt,
CreatedAt: startedAt,
}
service := h.build(t)
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeReplayNoOp, result.ErrorCode)
assert.Equal(t, "ctr-prev", result.Record.CurrentContainerID)
assert.Empty(t, h.records.upserts, "replay must not Upsert a fresh record")
require.Len(t, h.operationLogs.appends, 1)
last, _ := h.operationLogs.lastAppend()
assert.Equal(t, operation.OutcomeSuccess, last.Outcome)
assert.Equal(t, startruntime.ErrorCodeReplayNoOp, last.ErrorCode)
assert.Equal(t, "ctr-prev", last.ContainerID)
assert.Empty(t, h.notifications.intents)
assert.Equal(t, []string{"token-A"}, h.leases.releases, "lease must be released after replay no-op")
}
// --- conflicts --------------------------------------------------------
func TestHandleConflictWhenLeaseBusy(t *testing.T) {
h := newHarness(t)
h.leases.acquired = false
input := basicInput()
service := h.build(t)
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)
require.Len(t, h.operationLogs.appends, 1)
last, _ := h.operationLogs.lastAppend()
assert.Equal(t, operation.OutcomeFailure, last.Outcome)
assert.Equal(t, startruntime.ErrorCodeConflict, last.ErrorCode)
assert.Empty(t, h.notifications.intents, "lease conflicts must not raise admin notifications")
assert.Empty(t, h.leases.releases, "release must not run when acquire returned false")
}
func TestHandleConflictWhenRunningWithDifferentImageRef(t *testing.T) {
h := newHarness(t)
input := basicInput()
startedAt := h.now.Add(-time.Hour)
h.records.stored[input.GameID] = runtime.RuntimeRecord{
GameID: input.GameID,
Status: runtime.StatusRunning,
CurrentContainerID: "ctr-prev",
CurrentImageRef: "registry.example.com/galaxy/game:1.4.6",
EngineEndpoint: "http://galaxy-game-game-1:8080",
StatePath: h.stateDir,
DockerNetwork: "galaxy-net",
StartedAt: &startedAt,
LastOpAt: startedAt,
CreatedAt: startedAt,
}
service := h.build(t)
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)
last, _ := h.operationLogs.lastAppend()
assert.Equal(t, startruntime.ErrorCodeConflict, last.ErrorCode)
assert.Empty(t, h.notifications.intents)
assert.Empty(t, h.records.upserts)
}
// --- start_config_invalid ---------------------------------------------
func TestHandleStartConfigInvalidWhenImageRefMalformed(t *testing.T) {
h := newHarness(t)
input := basicInput()
input.ImageRef = "::not a docker reference::"
service := h.build(t)
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeStartConfigInvalid, result.ErrorCode)
require.Len(t, h.notifications.intents, 1)
assert.Equal(t, notificationintent.NotificationTypeRuntimeStartConfigInvalid, h.notifications.intents[0].NotificationType)
last, _ := h.operationLogs.lastAppend()
assert.Equal(t, operation.OutcomeFailure, last.Outcome)
}
func TestHandleStartConfigInvalidWhenNetworkMissing(t *testing.T) {
h := newHarness(t)
input := basicInput()
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(ports.ErrNetworkMissing)
service := h.build(t)
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, startruntime.ErrorCodeStartConfigInvalid, result.ErrorCode)
require.Len(t, h.notifications.intents, 1)
assert.Equal(t, notificationintent.NotificationTypeRuntimeStartConfigInvalid, h.notifications.intents[0].NotificationType)
}
func TestHandleStartConfigInvalidWhenStateDirFails(t *testing.T) {
h := newHarness(t)
input := basicInput()
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil)
h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil)
service, err := startruntime.NewService(startruntime.Dependencies{
RuntimeRecords: h.records,
OperationLogs: h.operationLogs,
Docker: h.docker,
Leases: h.leases,
HealthEvents: h.healthEvents,
Notifications: h.notifications,
Lobby: h.lobby,
Container: config.ContainerConfig{
DefaultCPUQuota: 1.0,
DefaultMemory: "512m",
DefaultPIDsLimit: 512,
StopTimeout: 30 * time.Second,
Retention: 30 * 24 * time.Hour,
EngineStateMountPath: "/var/lib/galaxy-game",
EngineStateEnvName: "GAME_STATE_PATH",
GameStateDirMode: 0o750,
GameStateRoot: "/var/lib/galaxy/games",
},
DockerCfg: config.DockerConfig{
Host: "unix:///var/run/docker.sock",
Network: "galaxy-net",
LogDriver: "json-file",
PullPolicy: config.ImagePullPolicyIfMissing,
},
Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
Telemetry: h.telemetry,
Clock: func() time.Time { return h.now },
NewToken: func() string { return "token-A" },
PrepareStateDir: func(_ string) (string, error) {
return "", errors.New("disk full")
},
})
require.NoError(t, err)
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, startruntime.ErrorCodeStartConfigInvalid, result.ErrorCode)
require.Len(t, h.notifications.intents, 1)
assert.Equal(t, notificationintent.NotificationTypeRuntimeStartConfigInvalid, h.notifications.intents[0].NotificationType)
}
// --- image_pull_failed ------------------------------------------------
func TestHandleImagePullFailed(t *testing.T) {
h := newHarness(t)
input := basicInput()
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(errors.New("manifest unknown"))
service := h.build(t)
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, startruntime.ErrorCodeImagePullFailed, result.ErrorCode)
require.Len(t, h.notifications.intents, 1)
assert.Equal(t, notificationintent.NotificationTypeRuntimeImagePullFailed, h.notifications.intents[0].NotificationType)
assert.Empty(t, h.records.upserts)
}
// --- container_start_failed ------------------------------------------
func TestHandleContainerStartFailedOnRunError(t *testing.T) {
h := newHarness(t)
input := basicInput()
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil)
h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil)
h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(ports.RunResult{}, errors.New("container name conflict"))
service := h.build(t)
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, startruntime.ErrorCodeContainerStartFailed, result.ErrorCode)
require.Len(t, h.notifications.intents, 1)
assert.Equal(t, notificationintent.NotificationTypeRuntimeContainerStartFailed, h.notifications.intents[0].NotificationType)
assert.Empty(t, h.records.upserts)
}
func TestHandleRollsBackContainerWhenUpsertFails(t *testing.T) {
h := newHarness(t)
h.records.upsertErr = errors.New("connection refused")
input := basicInput()
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil)
h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil)
h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(sampleRunResult(h.now), nil)
h.docker.EXPECT().Remove(gomock.Any(), "ctr-123").Return(nil)
service := h.build(t)
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, startruntime.ErrorCodeContainerStartFailed, result.ErrorCode)
require.Len(t, h.notifications.intents, 1)
assert.Equal(t, notificationintent.NotificationTypeRuntimeContainerStartFailed, h.notifications.intents[0].NotificationType)
}
// --- best-effort degradation -----------------------------------------
func TestHandleSuccessSurvivesOperationLogFailure(t *testing.T) {
h := newHarness(t)
h.operationLogs.appendErr = errors.New("postgres down")
input := basicInput()
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil)
h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil)
h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(sampleRunResult(h.now), nil)
service := h.build(t)
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
assert.Empty(t, result.ErrorCode)
assert.Len(t, h.records.upserts, 1)
}
func TestHandleSuccessSurvivesHealthPublishFailure(t *testing.T) {
h := newHarness(t)
h.healthEvents.publishErr = errors.New("redis down")
input := basicInput()
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil)
h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil)
h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(sampleRunResult(h.now), nil)
service := h.build(t)
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
assert.Len(t, h.records.upserts, 1)
}
// --- pre-existing stopped record proceeds with fresh start ----------
func TestHandlePreservesCreatedAtForExistingRecord(t *testing.T) {
h := newHarness(t)
input := basicInput()
originalCreatedAt := h.now.Add(-72 * time.Hour)
stoppedAt := h.now.Add(-time.Hour)
h.records.stored[input.GameID] = runtime.RuntimeRecord{
GameID: input.GameID,
Status: runtime.StatusStopped,
CurrentImageRef: "registry.example.com/galaxy/game:1.4.6",
EngineEndpoint: "http://galaxy-game-game-1:8080",
StatePath: h.stateDir,
DockerNetwork: "galaxy-net",
StoppedAt: &stoppedAt,
LastOpAt: stoppedAt,
CreatedAt: originalCreatedAt,
}
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil)
h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil)
h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(sampleRunResult(h.now), nil)
service := h.build(t)
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
assert.Equal(t, originalCreatedAt, result.Record.CreatedAt, "created_at must be preserved across re-starts")
assert.Equal(t, runtime.StatusRunning, result.Record.Status)
assert.Equal(t, input.ImageRef, result.Record.CurrentImageRef)
}
// --- input validation -----------------------------------------------
func TestHandleRejectsInvalidInput(t *testing.T) {
h := newHarness(t)
service := h.build(t)
cases := []startruntime.Input{
{GameID: "", ImageRef: "x", OpSource: operation.OpSourceLobbyStream},
{GameID: "g", ImageRef: "", OpSource: operation.OpSourceLobbyStream},
{GameID: "g", ImageRef: "x", OpSource: operation.OpSource("bogus")},
}
for _, input := range cases {
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, startruntime.ErrorCodeStartConfigInvalid, result.ErrorCode)
}
}
func TestNewServiceRejectsMissingDependencies(t *testing.T) {
h := newHarness(t)
deps := startruntime.Dependencies{
Container: config.ContainerConfig{
DefaultCPUQuota: 1.0,
DefaultMemory: "512m",
DefaultPIDsLimit: 512,
StopTimeout: 30 * time.Second,
Retention: 30 * 24 * time.Hour,
EngineStateMountPath: "/var/lib/galaxy-game",
EngineStateEnvName: "GAME_STATE_PATH",
GameStateDirMode: 0o750,
GameStateRoot: "/var/lib/galaxy/games",
},
DockerCfg: config.DockerConfig{
Host: "unix:///var/run/docker.sock",
Network: "galaxy-net",
LogDriver: "json-file",
PullPolicy: config.ImagePullPolicyIfMissing,
},
Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
Telemetry: h.telemetry,
}
_, err := startruntime.NewService(deps)
require.Error(t, err)
}
@@ -0,0 +1,612 @@
// Package stopruntime implements the `stop` lifecycle operation owned by
// Runtime Manager. The service is the single orchestrator behind both
// the asynchronous `runtime:stop_jobs` consumer and the synchronous
// `POST /api/v1/internal/runtimes/{game_id}/stop` REST handler. It is
// also the inner stop step of the restart and patch services, which
// call Run while holding the outer per-game lease.
//
// Lifecycle and failure-mode semantics follow `rtmanager/README.md
// §Lifecycles → Stop`. Design rationale is captured in
// `rtmanager/docs/services.md`.
package stopruntime
import (
"context"
"crypto/rand"
"encoding/base64"
"encoding/json"
"errors"
"fmt"
"log/slog"
"strings"
"time"
"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/domain/health"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/logging"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/startruntime"
"galaxy/rtmanager/internal/telemetry"
)
// leaseReleaseTimeout bounds the deferred lease-release call. A fresh
// background context is used so the release runs even when the request
// context was already canceled.
const leaseReleaseTimeout = 5 * time.Second
// Input stores the per-call arguments for one stop operation.
type Input struct {
// GameID identifies the platform game to stop.
GameID string
// Reason classifies the trigger of the stop. Required.
Reason StopReason
// OpSource classifies how the request entered Runtime Manager.
// Required: every operation_log entry carries an op_source.
OpSource operation.OpSource
// SourceRef stores the optional opaque per-source reference (Redis
// Stream entry id, REST request id, admin user id). Empty when the
// caller does not provide one. For inner calls invoked by the
// restart and patch orchestrators it carries the outer correlation
// id so the three operation_log entries share it.
SourceRef string
}
// Validate reports whether input carries the structural invariants the
// service requires.
func (input Input) Validate() error {
if strings.TrimSpace(input.GameID) == "" {
return fmt.Errorf("game id must not be empty")
}
if !input.OpSource.IsKnown() {
return fmt.Errorf("op source %q is unsupported", input.OpSource)
}
if err := input.Reason.Validate(); err != nil {
return err
}
return nil
}
// Result stores the deterministic outcome of one Handle / Run call.
type Result struct {
// Record carries the runtime record installed by the operation.
// Populated on success and on idempotent replay; zero on failure.
Record runtime.RuntimeRecord
// Outcome reports whether the operation completed (success) or
// produced a stable failure code.
Outcome operation.Outcome
// ErrorCode stores the stable error code on failure, or
// `replay_no_op` on idempotent replay. Empty for fresh successes.
ErrorCode string
// ErrorMessage stores the operator-readable detail on failure.
// Empty for successes.
ErrorMessage string
}
// Dependencies groups the collaborators required by Service.
type Dependencies struct {
// RuntimeRecords reads and updates the durable runtime record.
RuntimeRecords ports.RuntimeRecordStore
// OperationLogs records the success / failure audit entry.
OperationLogs ports.OperationLogStore
// Docker drives the Docker daemon (container stop).
Docker ports.DockerClient
// Leases serialises operations against the same game id.
Leases ports.GameLeaseStore
// HealthEvents publishes `runtime:health_events` and upserts the
// matching `health_snapshots` row. Used on the vanished-container
// path to emit `container_disappeared`.
HealthEvents ports.HealthEventPublisher
// Container groups the per-container settings consumed at stop time
// (the graceful stop timeout).
Container config.ContainerConfig
// Coordination supplies the per-game lease TTL.
Coordination config.CoordinationConfig
// Telemetry records stop outcomes and lease latency. Required.
Telemetry *telemetry.Runtime
// Logger records structured service-level events. Defaults to
// `slog.Default()` when nil.
Logger *slog.Logger
// Clock supplies the wall-clock used for operation timestamps.
// Defaults to `time.Now` when nil.
Clock func() time.Time
// NewToken supplies a unique opaque lease token. Defaults to a
// 32-byte random base64url string when nil. Tests may override.
NewToken func() string
}
// Service executes the stop lifecycle operation.
type Service struct {
runtimeRecords ports.RuntimeRecordStore
operationLogs ports.OperationLogStore
docker ports.DockerClient
leases ports.GameLeaseStore
healthEvents ports.HealthEventPublisher
stopTimeout time.Duration
leaseTTL time.Duration
telemetry *telemetry.Runtime
logger *slog.Logger
clock func() time.Time
newToken func() string
}
// NewService constructs one Service from deps.
func NewService(deps Dependencies) (*Service, error) {
switch {
case deps.RuntimeRecords == nil:
return nil, errors.New("new stop runtime service: nil runtime records")
case deps.OperationLogs == nil:
return nil, errors.New("new stop runtime service: nil operation logs")
case deps.Docker == nil:
return nil, errors.New("new stop runtime service: nil docker client")
case deps.Leases == nil:
return nil, errors.New("new stop runtime service: nil lease store")
case deps.HealthEvents == nil:
return nil, errors.New("new stop runtime service: nil health events publisher")
case deps.Telemetry == nil:
return nil, errors.New("new stop runtime service: nil telemetry runtime")
}
if err := deps.Container.Validate(); err != nil {
return nil, fmt.Errorf("new stop runtime service: container config: %w", err)
}
if err := deps.Coordination.Validate(); err != nil {
return nil, fmt.Errorf("new stop runtime service: coordination config: %w", err)
}
clock := deps.Clock
if clock == nil {
clock = time.Now
}
logger := deps.Logger
if logger == nil {
logger = slog.Default()
}
logger = logger.With("service", "rtmanager.stopruntime")
newToken := deps.NewToken
if newToken == nil {
newToken = defaultTokenGenerator()
}
return &Service{
runtimeRecords: deps.RuntimeRecords,
operationLogs: deps.OperationLogs,
docker: deps.Docker,
leases: deps.Leases,
healthEvents: deps.HealthEvents,
stopTimeout: deps.Container.StopTimeout,
leaseTTL: deps.Coordination.GameLeaseTTL,
telemetry: deps.Telemetry,
logger: logger,
clock: clock,
newToken: newToken,
}, nil
}
// Handle executes one stop operation end-to-end. The Go-level error
// return is reserved for non-business failures (nil context, nil
// receiver). Every business outcome — success, idempotent replay, or
// any of the stable failure modes — flows through Result.
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
if service == nil {
return Result{}, errors.New("stop runtime: nil service")
}
if ctx == nil {
return Result{}, errors.New("stop runtime: nil context")
}
opStartedAt := service.clock().UTC()
if err := input.Validate(); err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInvalidRequest,
errorMessage: err.Error(),
}), nil
}
token := service.newToken()
leaseStart := service.clock()
acquired, err := service.leases.TryAcquire(ctx, input.GameID, token, service.leaseTTL)
service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(leaseStart))
if err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeServiceUnavailable,
errorMessage: fmt.Sprintf("acquire game lease: %s", err.Error()),
}), nil
}
if !acquired {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeConflict,
errorMessage: "another lifecycle operation is in progress for this game",
}), nil
}
defer service.releaseLease(ctx, input.GameID, token)
return service.runUnderLease(ctx, input, opStartedAt)
}
// Run executes the stop lifecycle assuming the per-game lease is
// already held by the caller. The method is reserved for orchestrator
// services in `internal/service/` that compose stop with another
// operation under a single outer lease (restart and patch). External
// callers must use Handle.
func (service *Service) Run(ctx context.Context, input Input) (Result, error) {
if service == nil {
return Result{}, errors.New("stop runtime: nil service")
}
if ctx == nil {
return Result{}, errors.New("stop runtime: nil context")
}
opStartedAt := service.clock().UTC()
if err := input.Validate(); err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInvalidRequest,
errorMessage: err.Error(),
}), nil
}
return service.runUnderLease(ctx, input, opStartedAt)
}
// runUnderLease executes the post-validation, lease-protected stop
// steps shared by Handle and Run.
func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) {
existing, err := service.runtimeRecords.Get(ctx, input.GameID)
if errors.Is(err, runtime.ErrNotFound) {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeNotFound,
errorMessage: fmt.Sprintf("runtime record for game %q does not exist", input.GameID),
}), nil
}
if err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInternal,
errorMessage: fmt.Sprintf("load runtime record: %s", err.Error()),
}), nil
}
switch existing.Status {
case runtime.StatusStopped, runtime.StatusRemoved:
return service.recordReplayNoOp(ctx, opStartedAt, input, existing), nil
case runtime.StatusRunning:
// proceed
default:
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInternal,
errorMessage: fmt.Sprintf("runtime record has unsupported status %q", existing.Status),
}), nil
}
if err := service.docker.Stop(ctx, existing.CurrentContainerID, service.stopTimeout); err != nil {
if errors.Is(err, ports.ErrContainerNotFound) {
return service.handleVanished(ctx, input, opStartedAt, existing), nil
}
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeServiceUnavailable,
errorMessage: fmt.Sprintf("docker stop: %s", err.Error()),
containerID: existing.CurrentContainerID,
imageRef: existing.CurrentImageRef,
}), nil
}
updateNow := service.clock().UTC()
err = service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
GameID: input.GameID,
ExpectedFrom: runtime.StatusRunning,
ExpectedContainerID: existing.CurrentContainerID,
To: runtime.StatusStopped,
Now: updateNow,
})
if errors.Is(err, runtime.ErrConflict) {
// CAS race: a concurrent reconciler / restart already moved the
// record. The desired terminal state was reached by another path.
return service.recordReplayNoOp(ctx, opStartedAt, input, existing), nil
}
if errors.Is(err, runtime.ErrNotFound) {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeNotFound,
errorMessage: fmt.Sprintf("runtime record for game %q vanished mid-stop", input.GameID),
containerID: existing.CurrentContainerID,
imageRef: existing.CurrentImageRef,
}), nil
}
if err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInternal,
errorMessage: fmt.Sprintf("update runtime status: %s", err.Error()),
containerID: existing.CurrentContainerID,
imageRef: existing.CurrentImageRef,
}), nil
}
finishedAt := service.clock().UTC()
service.bestEffortAppend(ctx, operation.OperationEntry{
GameID: input.GameID,
OpKind: operation.OpKindStop,
OpSource: input.OpSource,
SourceRef: input.SourceRef,
ImageRef: existing.CurrentImageRef,
ContainerID: existing.CurrentContainerID,
Outcome: operation.OutcomeSuccess,
StartedAt: opStartedAt,
FinishedAt: &finishedAt,
})
service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeSuccess), string(input.Reason), string(input.OpSource))
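// Mirror the status transition locally so the returned Record reflects
// the stopped state without re-reading the store.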
record := existing
record.Status = runtime.StatusStopped
stoppedAt := updateNow
record.StoppedAt = &stoppedAt
record.LastOpAt = updateNow
logArgs := []any{
"game_id", input.GameID,
"container_id", existing.CurrentContainerID,
"reason", string(input.Reason),
"op_source", string(input.OpSource),
}
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
service.logger.InfoContext(ctx, "runtime stopped", logArgs...)
return Result{
Record: record,
Outcome: operation.OutcomeSuccess,
}, nil
}
// handleVanished records the success outcome for the case where docker
// stop reports the container as already gone. It updates the record to
// removed, publishes container_disappeared, and returns success.
func (service *Service) handleVanished(ctx context.Context, input Input, opStartedAt time.Time, existing runtime.RuntimeRecord) Result {
updateNow := service.clock().UTC()
err := service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
GameID: input.GameID,
ExpectedFrom: runtime.StatusRunning,
ExpectedContainerID: existing.CurrentContainerID,
To: runtime.StatusRemoved,
Now: updateNow,
})
if errors.Is(err, runtime.ErrConflict) {
return service.recordReplayNoOp(ctx, opStartedAt, input, existing)
}
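// ErrNotFound is tolerated: the record being gone is consistent with the
// removed outcome this path reports.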
if err != nil && !errors.Is(err, runtime.ErrNotFound) {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInternal,
errorMessage: fmt.Sprintf("update runtime status to removed: %s", err.Error()),
containerID: existing.CurrentContainerID,
imageRef: existing.CurrentImageRef,
})
}
service.bestEffortPublishHealth(ctx, ports.HealthEventEnvelope{
GameID: input.GameID,
ContainerID: existing.CurrentContainerID,
EventType: health.EventTypeContainerDisappeared,
OccurredAt: updateNow,
Details: emptyHealthDetails(),
})
finishedAt := service.clock().UTC()
service.bestEffortAppend(ctx, operation.OperationEntry{
GameID: input.GameID,
OpKind: operation.OpKindStop,
OpSource: input.OpSource,
SourceRef: input.SourceRef,
ImageRef: existing.CurrentImageRef,
ContainerID: existing.CurrentContainerID,
Outcome: operation.OutcomeSuccess,
StartedAt: opStartedAt,
FinishedAt: &finishedAt,
})
service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeSuccess), string(input.Reason), string(input.OpSource))
service.telemetry.RecordHealthEvent(ctx, string(health.EventTypeContainerDisappeared))
record := existing
record.Status = runtime.StatusRemoved
record.CurrentContainerID = ""
removedAt := updateNow
record.RemovedAt = &removedAt
record.LastOpAt = updateNow
logArgs := []any{
"game_id", input.GameID,
"container_id", existing.CurrentContainerID,
"reason", string(input.Reason),
"op_source", string(input.OpSource),
}
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
service.logger.InfoContext(ctx, "runtime stop on vanished container", logArgs...)
return Result{
Record: record,
Outcome: operation.OutcomeSuccess,
}
}
// recordReplayNoOp records the idempotent replay outcome and returns the
// existing record unchanged.
func (service *Service) recordReplayNoOp(ctx context.Context, opStartedAt time.Time, input Input, existing runtime.RuntimeRecord) Result {
finishedAt := service.clock().UTC()
service.bestEffortAppend(ctx, operation.OperationEntry{
GameID: input.GameID,
OpKind: operation.OpKindStop,
OpSource: input.OpSource,
SourceRef: input.SourceRef,
ImageRef: existing.CurrentImageRef,
ContainerID: existing.CurrentContainerID,
Outcome: operation.OutcomeSuccess,
ErrorCode: startruntime.ErrorCodeReplayNoOp,
StartedAt: opStartedAt,
FinishedAt: &finishedAt,
})
service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeSuccess), string(input.Reason), string(input.OpSource))
logArgs := []any{
"game_id", input.GameID,
"container_id", existing.CurrentContainerID,
"reason", string(input.Reason),
"op_source", string(input.OpSource),
}
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
service.logger.InfoContext(ctx, "runtime stop replay no-op", logArgs...)
return Result{
Record: existing,
Outcome: operation.OutcomeSuccess,
ErrorCode: startruntime.ErrorCodeReplayNoOp,
}
}
// failureCtx groups the inputs to recordFailure so the runUnderLease
// method stays readable.
type failureCtx struct {
opStartedAt time.Time
input Input
errorCode string
errorMessage string
containerID string
imageRef string
}
// recordFailure records the failure operation_log entry and emits
// telemetry. The runtime record stays untouched.
func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result {
finishedAt := service.clock().UTC()
service.bestEffortAppend(ctx, operation.OperationEntry{
GameID: fc.input.GameID,
OpKind: operation.OpKindStop,
OpSource: fc.input.OpSource,
SourceRef: fc.input.SourceRef,
ImageRef: fc.imageRef,
ContainerID: fc.containerID,
Outcome: operation.OutcomeFailure,
ErrorCode: fc.errorCode,
ErrorMessage: fc.errorMessage,
StartedAt: fc.opStartedAt,
FinishedAt: &finishedAt,
})
service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeFailure), string(fc.input.Reason), string(fc.input.OpSource))
logArgs := []any{
"game_id", fc.input.GameID,
"reason", string(fc.input.Reason),
"op_source", string(fc.input.OpSource),
"error_code", fc.errorCode,
"error_message", fc.errorMessage,
}
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
service.logger.WarnContext(ctx, "runtime stop failed", logArgs...)
return Result{
Outcome: operation.OutcomeFailure,
ErrorCode: fc.errorCode,
ErrorMessage: fc.errorMessage,
}
}
// releaseLease releases the per-game lease in a fresh background context
// so a canceled request context does not leave the lease pinned for its
// TTL.
func (service *Service) releaseLease(ctx context.Context, gameID, token string) {
cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
defer cancel()
if err := service.leases.Release(cleanupCtx, gameID, token); err != nil {
service.logger.WarnContext(ctx, "release game lease",
"game_id", gameID,
"err", err.Error(),
)
}
}
// bestEffortAppend writes one operation_log entry. A failure is logged
// and discarded; the durable runtime record (or its absence) remains
// the source of truth.
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
if _, err := service.operationLogs.Append(ctx, entry); err != nil {
service.logger.ErrorContext(ctx, "append operation log",
"game_id", entry.GameID,
"op_kind", string(entry.OpKind),
"outcome", string(entry.Outcome),
"error_code", entry.ErrorCode,
"err", err.Error(),
)
}
}
// bestEffortPublishHealth emits one health event + snapshot upsert.
// A publish failure is logged and discarded per `rtmanager/README.md
// §Notification Contracts`; the runtime record remains the source of
// truth.
func (service *Service) bestEffortPublishHealth(ctx context.Context, envelope ports.HealthEventEnvelope) {
if err := service.healthEvents.Publish(ctx, envelope); err != nil {
service.logger.ErrorContext(ctx, "publish health event",
"game_id", envelope.GameID,
"container_id", envelope.ContainerID,
"event_type", string(envelope.EventType),
"err", err.Error(),
)
}
}
// defaultTokenGenerator returns a function that produces 32-byte
// base64url-encoded tokens. Mirrors the start service: a degraded
// entropy source falls back to a sentinel token so the next TryAcquire
// observes a collision rather than a panic.
func defaultTokenGenerator() func() string {
return func() string {
var buf [32]byte
if _, err := rand.Read(buf[:]); err != nil {
return "rtmanager-fallback-token"
}
return base64.RawURLEncoding.EncodeToString(buf[:])
}
}
// emptyHealthDetails returns the canonical empty-object payload required
// by the `container_disappeared` AsyncAPI variant.
func emptyHealthDetails() json.RawMessage {
return json.RawMessage("{}")
}
@@ -0,0 +1,537 @@
package stopruntime_test
import (
"context"
"errors"
"sync"
"testing"
"time"
"galaxy/rtmanager/internal/adapters/docker/mocks"
"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/domain/health"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/startruntime"
"galaxy/rtmanager/internal/service/stopruntime"
"galaxy/rtmanager/internal/telemetry"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/mock/gomock"
)
// --- test doubles -----------------------------------------------------
type fakeRuntimeRecords struct {
mu sync.Mutex
stored map[string]runtime.RuntimeRecord
getErr error
updateStatusErr error
updates []ports.UpdateStatusInput
}
func newFakeRuntimeRecords() *fakeRuntimeRecords {
return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}}
}
func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
s.mu.Lock()
defer s.mu.Unlock()
if s.getErr != nil {
return runtime.RuntimeRecord{}, s.getErr
}
record, ok := s.stored[gameID]
if !ok {
return runtime.RuntimeRecord{}, runtime.ErrNotFound
}
return record, nil
}
func (s *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error {
return errors.New("not used in stop tests")
}
func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, input ports.UpdateStatusInput) error {
s.mu.Lock()
defer s.mu.Unlock()
s.updates = append(s.updates, input)
if s.updateStatusErr != nil {
return s.updateStatusErr
}
record, ok := s.stored[input.GameID]
if !ok {
return runtime.ErrNotFound
}
if record.Status != input.ExpectedFrom {
return runtime.ErrConflict
}
if input.ExpectedContainerID != "" && record.CurrentContainerID != input.ExpectedContainerID {
return runtime.ErrConflict
}
record.Status = input.To
record.LastOpAt = input.Now
switch input.To {
case runtime.StatusStopped:
stoppedAt := input.Now
record.StoppedAt = &stoppedAt
case runtime.StatusRemoved:
removedAt := input.Now
record.RemovedAt = &removedAt
record.CurrentContainerID = ""
}
s.stored[input.GameID] = record
return nil
}
func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) {
return nil, errors.New("not used in stop tests")
}
func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
return nil, errors.New("not used in stop tests")
}
type fakeOperationLogs struct {
mu sync.Mutex
appendErr error
appends []operation.OperationEntry
}
func (s *fakeOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) {
s.mu.Lock()
defer s.mu.Unlock()
if s.appendErr != nil {
return 0, s.appendErr
}
s.appends = append(s.appends, entry)
return int64(len(s.appends)), nil
}
func (s *fakeOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) {
return nil, errors.New("not used in stop tests")
}
func (s *fakeOperationLogs) lastAppend() (operation.OperationEntry, bool) {
s.mu.Lock()
defer s.mu.Unlock()
if len(s.appends) == 0 {
return operation.OperationEntry{}, false
}
return s.appends[len(s.appends)-1], true
}
type fakeLeases struct {
acquired bool
acquireErr error
releaseErr error
mu sync.Mutex
acquires []string
releases []string
}
func (l *fakeLeases) TryAcquire(_ context.Context, _, token string, _ time.Duration) (bool, error) {
l.mu.Lock()
defer l.mu.Unlock()
l.acquires = append(l.acquires, token)
if l.acquireErr != nil {
return false, l.acquireErr
}
return l.acquired, nil
}
func (l *fakeLeases) Release(_ context.Context, _, token string) error {
l.mu.Lock()
defer l.mu.Unlock()
l.releases = append(l.releases, token)
return l.releaseErr
}
type fakeHealthEvents struct {
mu sync.Mutex
publishErr error
envelopes []ports.HealthEventEnvelope
}
func (h *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error {
h.mu.Lock()
defer h.mu.Unlock()
if h.publishErr != nil {
return h.publishErr
}
h.envelopes = append(h.envelopes, envelope)
return nil
}
// --- harness ----------------------------------------------------------
type harness struct {
records *fakeRuntimeRecords
operationLogs *fakeOperationLogs
docker *mocks.MockDockerClient
leases *fakeLeases
healthEvents *fakeHealthEvents
telemetry *telemetry.Runtime
now time.Time
}
func newHarness(t *testing.T) *harness {
t.Helper()
ctrl := gomock.NewController(t)
t.Cleanup(ctrl.Finish)
telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
require.NoError(t, err)
return &harness{
records: newFakeRuntimeRecords(),
operationLogs: &fakeOperationLogs{},
docker: mocks.NewMockDockerClient(ctrl),
leases: &fakeLeases{acquired: true},
healthEvents: &fakeHealthEvents{},
telemetry: telemetryRuntime,
now: time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC),
}
}
func (h *harness) build(t *testing.T) *stopruntime.Service {
t.Helper()
containerCfg := config.ContainerConfig{
DefaultCPUQuota: 1.0,
DefaultMemory: "512m",
DefaultPIDsLimit: 512,
StopTimeout: 30 * time.Second,
Retention: 30 * 24 * time.Hour,
EngineStateMountPath: "/var/lib/galaxy-game",
EngineStateEnvName: "GAME_STATE_PATH",
GameStateDirMode: 0o750,
GameStateRoot: "/var/lib/galaxy/games",
}
coordinationCfg := config.CoordinationConfig{GameLeaseTTL: time.Minute}
service, err := stopruntime.NewService(stopruntime.Dependencies{
RuntimeRecords: h.records,
OperationLogs: h.operationLogs,
Docker: h.docker,
Leases: h.leases,
HealthEvents: h.healthEvents,
Container: containerCfg,
Coordination: coordinationCfg,
Telemetry: h.telemetry,
Clock: func() time.Time { return h.now },
NewToken: func() string { return "token-A" },
})
require.NoError(t, err)
return service
}
func basicInput() stopruntime.Input {
return stopruntime.Input{
GameID: "game-1",
Reason: stopruntime.StopReasonCancelled,
OpSource: operation.OpSourceLobbyStream,
SourceRef: "1700000000000-0",
}
}
func runningRecord(now time.Time) runtime.RuntimeRecord {
startedAt := now.Add(-time.Hour)
return runtime.RuntimeRecord{
GameID: "game-1",
Status: runtime.StatusRunning,
CurrentContainerID: "ctr-123",
CurrentImageRef: "registry.example.com/galaxy/game:1.4.7",
EngineEndpoint: "http://galaxy-game-game-1:8080",
StatePath: "/var/lib/galaxy/games/game-1",
DockerNetwork: "galaxy-net",
StartedAt: &startedAt,
LastOpAt: startedAt,
CreatedAt: startedAt,
}
}
// --- happy path -------------------------------------------------------
func TestHandleHappyPath(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)
h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(nil)
service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
assert.Empty(t, result.ErrorCode)
assert.Equal(t, runtime.StatusStopped, result.Record.Status)
require.NotNil(t, result.Record.StoppedAt)
assert.Equal(t, h.now, *result.Record.StoppedAt)
assert.Equal(t, h.now, result.Record.LastOpAt)
require.Len(t, h.records.updates, 1)
assert.Equal(t, runtime.StatusRunning, h.records.updates[0].ExpectedFrom)
assert.Equal(t, runtime.StatusStopped, h.records.updates[0].To)
assert.Equal(t, "ctr-123", h.records.updates[0].ExpectedContainerID)
require.Len(t, h.operationLogs.appends, 1)
last, _ := h.operationLogs.lastAppend()
assert.Equal(t, operation.OpKindStop, last.OpKind)
assert.Equal(t, operation.OutcomeSuccess, last.Outcome)
assert.Empty(t, last.ErrorCode)
assert.Equal(t, "ctr-123", last.ContainerID)
assert.Empty(t, h.healthEvents.envelopes)
assert.Equal(t, []string{"token-A"}, h.leases.acquires)
assert.Equal(t, []string{"token-A"}, h.leases.releases)
}
// --- replay ----------------------------------------------------------
func TestHandleReplayNoOpForStoppedRecord(t *testing.T) {
h := newHarness(t)
stoppedRecord := runningRecord(h.now)
stoppedRecord.Status = runtime.StatusStopped
stoppedAt := h.now.Add(-time.Minute)
stoppedRecord.StoppedAt = &stoppedAt
h.records.stored["game-1"] = stoppedRecord
service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeReplayNoOp, result.ErrorCode)
assert.Equal(t, runtime.StatusStopped, result.Record.Status)
assert.Empty(t, h.records.updates)
require.Len(t, h.operationLogs.appends, 1)
last, _ := h.operationLogs.lastAppend()
assert.Equal(t, startruntime.ErrorCodeReplayNoOp, last.ErrorCode)
assert.Equal(t, []string{"token-A"}, h.leases.releases)
}
func TestHandleReplayNoOpForRemovedRecord(t *testing.T) {
h := newHarness(t)
removed := runningRecord(h.now)
removed.Status = runtime.StatusRemoved
removed.CurrentContainerID = ""
removedAt := h.now.Add(-time.Minute)
removed.RemovedAt = &removedAt
h.records.stored["game-1"] = removed
service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeReplayNoOp, result.ErrorCode)
}
// --- vanished container ----------------------------------------------
func TestHandleVanishedContainerMarksRemoved(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)
h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(ports.ErrContainerNotFound)
service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
assert.Empty(t, result.ErrorCode)
assert.Equal(t, runtime.StatusRemoved, result.Record.Status)
assert.Empty(t, result.Record.CurrentContainerID)
require.Len(t, h.records.updates, 1)
assert.Equal(t, runtime.StatusRemoved, h.records.updates[0].To)
require.Len(t, h.healthEvents.envelopes, 1)
assert.Equal(t, health.EventTypeContainerDisappeared, h.healthEvents.envelopes[0].EventType)
require.Len(t, h.operationLogs.appends, 1)
last, _ := h.operationLogs.lastAppend()
assert.Equal(t, operation.OutcomeSuccess, last.Outcome)
assert.Empty(t, last.ErrorCode)
}
// --- failure paths ---------------------------------------------------
func TestHandleNotFoundForMissingRecord(t *testing.T) {
h := newHarness(t)
service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeNotFound, result.ErrorCode)
assert.Empty(t, h.healthEvents.envelopes)
assert.Empty(t, h.records.updates)
}
func TestHandleServiceUnavailableOnDockerError(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)
h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(errors.New("docker daemon timeout"))
service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode)
last, _ := h.operationLogs.lastAppend()
assert.Equal(t, operation.OutcomeFailure, last.Outcome)
assert.Equal(t, "ctr-123", last.ContainerID)
assert.Empty(t, h.records.updates, "no record mutation on docker stop failure")
}
func TestHandleReplayNoOpOnUpdateStatusConflict(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)
h.records.updateStatusErr = runtime.ErrConflict
h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(nil)
service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeReplayNoOp, result.ErrorCode)
}
func TestHandleInternalErrorOnUpdateStatusGenericError(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)
h.records.updateStatusErr = errors.New("postgres down")
h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(nil)
service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeInternal, result.ErrorCode)
}
// --- conflicts -------------------------------------------------------
func TestHandleConflictWhenLeaseBusy(t *testing.T) {
h := newHarness(t)
h.leases.acquired = false
service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)
assert.Empty(t, h.leases.releases, "release must not run when acquire returned false")
}
func TestHandleServiceUnavailableOnLeaseError(t *testing.T) {
h := newHarness(t)
h.leases.acquireErr = errors.New("redis timeout")
service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode)
}
// --- input validation ------------------------------------------------
func TestHandleRejectsInvalidInput(t *testing.T) {
h := newHarness(t)
service := h.build(t)
cases := []stopruntime.Input{
{GameID: "", Reason: stopruntime.StopReasonCancelled, OpSource: operation.OpSourceLobbyStream},
{GameID: "g", Reason: "", OpSource: operation.OpSourceLobbyStream},
{GameID: "g", Reason: stopruntime.StopReason("bogus"), OpSource: operation.OpSourceLobbyStream},
{GameID: "g", Reason: stopruntime.StopReasonCancelled, OpSource: operation.OpSource("bogus")},
}
for _, input := range cases {
result, err := service.Handle(context.Background(), input)
require.NoError(t, err)
assert.Equal(t, startruntime.ErrorCodeInvalidRequest, result.ErrorCode)
}
}
// --- Run path (no-lease) ---------------------------------------------
func TestRunSkipsLease(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)
h.leases.acquired = false // would block Handle; Run must ignore
h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(nil)
service := h.build(t)
result, err := service.Run(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
assert.Empty(t, h.leases.acquires, "Run must not touch the lease store")
assert.Empty(t, h.leases.releases)
}
// --- best-effort degradation ----------------------------------------
func TestHandleSurvivesOperationLogFailure(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)
h.operationLogs.appendErr = errors.New("postgres down")
h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(nil)
service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
}
func TestHandleSurvivesHealthPublishFailureOnVanished(t *testing.T) {
h := newHarness(t)
h.records.stored["game-1"] = runningRecord(h.now)
h.healthEvents.publishErr = errors.New("redis down")
h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(ports.ErrContainerNotFound)
service := h.build(t)
result, err := service.Handle(context.Background(), basicInput())
require.NoError(t, err)
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
assert.Equal(t, runtime.StatusRemoved, result.Record.Status)
}
// --- constructor -----------------------------------------------------
func TestNewServiceRejectsMissingDependencies(t *testing.T) {
h := newHarness(t)
deps := stopruntime.Dependencies{
Container: config.ContainerConfig{
DefaultCPUQuota: 1.0,
DefaultMemory: "512m",
DefaultPIDsLimit: 512,
StopTimeout: 30 * time.Second,
Retention: 30 * 24 * time.Hour,
EngineStateMountPath: "/var/lib/galaxy-game",
EngineStateEnvName: "GAME_STATE_PATH",
GameStateDirMode: 0o750,
GameStateRoot: "/var/lib/galaxy/games",
},
Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
Telemetry: h.telemetry,
}
_, err := stopruntime.NewService(deps)
require.Error(t, err)
}
@@ -0,0 +1,82 @@
package stopruntime
import "fmt"
// StopReason classifies why a caller is asking Runtime Manager to stop a
// game container. The enum is part of the `runtime:stop_jobs` envelope
// produced by Game Lobby and the body of the `POST
// /api/v1/internal/runtimes/{game_id}/stop` REST endpoint, and mirrors
// the AsyncAPI contract frozen in
// `rtmanager/api/runtime-jobs-asyncapi.yaml`.
//
// The vocabulary is shared with `lobby/internal/ports/runtimemanager.go`;
// the two declarations stay byte-identical and adding a new value
// requires a coordinated contract bump on both sides.
type StopReason string
// StopReason enum values. Adding a new value is a contract change that
// touches the AsyncAPI spec, the Lobby producer, and every Runtime
// Manager consumer.
const (
// StopReasonOrphanCleanup releases a container whose post-start
// metadata persistence failed in Lobby.
StopReasonOrphanCleanup StopReason = "orphan_cleanup"
// StopReasonCancelled covers user-lifecycle cascade and explicit
// cancel paths for in-flight games.
StopReasonCancelled StopReason = "cancelled"
// StopReasonFinished is reserved for engine-driven game finish flows.
StopReasonFinished StopReason = "finished"
// StopReasonAdminRequest is reserved for admin-initiated stop paths.
StopReasonAdminRequest StopReason = "admin_request"
// StopReasonTimeout is reserved for timeout-driven stop paths.
StopReasonTimeout StopReason = "timeout"
)
// IsKnown reports whether reason belongs to the frozen stop-reason
// vocabulary.
func (reason StopReason) IsKnown() bool {
switch reason {
case StopReasonOrphanCleanup,
StopReasonCancelled,
StopReasonFinished,
StopReasonAdminRequest,
StopReasonTimeout:
return true
default:
return false
}
}
// AllStopReasons returns the frozen list of every stop-reason value. The
// slice order is stable across calls and matches the AsyncAPI enum order.
func AllStopReasons() []StopReason {
return []StopReason{
StopReasonOrphanCleanup,
StopReasonCancelled,
StopReasonFinished,
StopReasonAdminRequest,
StopReasonTimeout,
}
}
// String returns reason as its stored enum value. Useful in log fields
// and telemetry attributes.
func (reason StopReason) String() string {
return string(reason)
}
// Validate returns an error unless reason carries one of the five
// values fixed by the AsyncAPI contract.
func (reason StopReason) Validate() error {
if reason == "" {
return fmt.Errorf("stop reason must not be empty")
}
if !reason.IsKnown() {
return fmt.Errorf("stop reason %q is unsupported", reason)
}
return nil
}
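A small hedged sketch of how a consumer might validate the reason string carried on a stop job before building an `Input`; the `parseStopReason` helper and its package are hypothetical.

```go
package example

import (
	"fmt"

	"galaxy/rtmanager/internal/service/stopruntime"
)

// parseStopReason converts the raw reason string from a job envelope
// into a validated StopReason. Hypothetical decoder helper.
func parseStopReason(raw string) (stopruntime.StopReason, error) {
	reason := stopruntime.StopReason(raw)
	if err := reason.Validate(); err != nil {
		return "", fmt.Errorf("decode stop job: %w (known values: %v)", err, stopruntime.AllStopReasons())
	}
	return reason, nil
}
```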
@@ -0,0 +1,651 @@
// Package telemetry provides lightweight OpenTelemetry helpers and
// low-cardinality Runtime Manager instruments used by the runnable
// skeleton. Later stages emit into the instruments declared here without
// touching this package.
package telemetry
import (
"context"
"errors"
"fmt"
"log/slog"
"os"
"strings"
"sync"
"time"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp"
"go.opentelemetry.io/otel/exporters/stdout/stdoutmetric"
"go.opentelemetry.io/otel/exporters/stdout/stdouttrace"
"go.opentelemetry.io/otel/metric"
"go.opentelemetry.io/otel/propagation"
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
"go.opentelemetry.io/otel/sdk/resource"
sdktrace "go.opentelemetry.io/otel/sdk/trace"
oteltrace "go.opentelemetry.io/otel/trace"
)
const meterName = "galaxy/rtmanager"
const (
defaultServiceName = "galaxy-rtmanager"
processExporterNone = "none"
processExporterOTLP = "otlp"
processProtocolHTTPProtobuf = "http/protobuf"
processProtocolGRPC = "grpc"
)
// ProcessConfig configures the process-wide OpenTelemetry runtime.
type ProcessConfig struct {
// ServiceName overrides the default OpenTelemetry service name.
ServiceName string
// TracesExporter selects the external traces exporter. Supported values
// are `none` and `otlp`.
TracesExporter string
// MetricsExporter selects the external metrics exporter. Supported
// values are `none` and `otlp`.
MetricsExporter string
// TracesProtocol selects the OTLP traces protocol when TracesExporter is
// `otlp`.
TracesProtocol string
// MetricsProtocol selects the OTLP metrics protocol when
// MetricsExporter is `otlp`.
MetricsProtocol string
// StdoutTracesEnabled enables the additional stdout trace exporter used
// for local development and debugging.
StdoutTracesEnabled bool
// StdoutMetricsEnabled enables the additional stdout metric exporter
// used for local development and debugging.
StdoutMetricsEnabled bool
}
// Validate returns an error unless cfg selects a supported OpenTelemetry
// exporter and OTLP protocol combination.
func (cfg ProcessConfig) Validate() error {
switch cfg.TracesExporter {
case processExporterNone, processExporterOTLP:
default:
return fmt.Errorf("unsupported traces exporter %q", cfg.TracesExporter)
}
switch cfg.MetricsExporter {
case processExporterNone, processExporterOTLP:
default:
return fmt.Errorf("unsupported metrics exporter %q", cfg.MetricsExporter)
}
if cfg.TracesProtocol != "" && cfg.TracesProtocol != processProtocolHTTPProtobuf && cfg.TracesProtocol != processProtocolGRPC {
return fmt.Errorf("unsupported OTLP traces protocol %q", cfg.TracesProtocol)
}
if cfg.MetricsProtocol != "" && cfg.MetricsProtocol != processProtocolHTTPProtobuf && cfg.MetricsProtocol != processProtocolGRPC {
return fmt.Errorf("unsupported OTLP metrics protocol %q", cfg.MetricsProtocol)
}
return nil
}
// Runtime owns the Runtime Manager OpenTelemetry providers and the
// low-cardinality custom instruments listed in `rtmanager/README.md`
// §Observability.
type Runtime struct {
tracerProvider oteltrace.TracerProvider
meterProvider metric.MeterProvider
meter metric.Meter
shutdownMu sync.Mutex
shutdownDone bool
shutdownErr error
shutdownFns []func(context.Context) error
internalHTTPRequests metric.Int64Counter
internalHTTPDuration metric.Float64Histogram
startOutcomes metric.Int64Counter
stopOutcomes metric.Int64Counter
restartOutcomes metric.Int64Counter
patchOutcomes metric.Int64Counter
cleanupOutcomes metric.Int64Counter
healthEvents metric.Int64Counter
reconcileDrift metric.Int64Counter
notificationIntents metric.Int64Counter
dockerOpLatency metric.Float64Histogram
leaseAcquireLatency metric.Float64Histogram
runtimeRecordsByStatus metric.Int64ObservableGauge
gaugeMu sync.Mutex
gaugeRegistration metric.Registration
}
// NewWithProviders constructs a telemetry runtime around explicitly supplied
// meterProvider and tracerProvider values. Nil values fall back to the
// global otel providers.
func NewWithProviders(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider) (*Runtime, error) {
if meterProvider == nil {
meterProvider = otel.GetMeterProvider()
}
if tracerProvider == nil {
tracerProvider = otel.GetTracerProvider()
}
if meterProvider == nil {
return nil, errors.New("new rtmanager telemetry runtime: nil meter provider")
}
if tracerProvider == nil {
return nil, errors.New("new rtmanager telemetry runtime: nil tracer provider")
}
return buildRuntime(meterProvider, tracerProvider, nil)
}
// NewProcess constructs the process-wide Runtime Manager OpenTelemetry
// runtime from cfg, installs the resulting providers globally, and
// returns the runtime.
func NewProcess(ctx context.Context, cfg ProcessConfig, logger *slog.Logger) (*Runtime, error) {
if ctx == nil {
return nil, errors.New("new rtmanager telemetry process: nil context")
}
if err := cfg.Validate(); err != nil {
return nil, fmt.Errorf("new rtmanager telemetry process: %w", err)
}
if logger == nil {
logger = slog.Default()
}
serviceName := strings.TrimSpace(cfg.ServiceName)
if serviceName == "" {
serviceName = defaultServiceName
}
res := resource.NewSchemaless(attribute.String("service.name", serviceName))
tracerProvider, err := newTracerProvider(ctx, res, cfg)
if err != nil {
return nil, fmt.Errorf("new rtmanager telemetry process: tracer provider: %w", err)
}
meterProvider, err := newMeterProvider(ctx, res, cfg)
if err != nil {
return nil, fmt.Errorf("new rtmanager telemetry process: meter provider: %w", err)
}
otel.SetTracerProvider(tracerProvider)
otel.SetMeterProvider(meterProvider)
otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
propagation.TraceContext{},
propagation.Baggage{},
))
runtime, err := buildRuntime(meterProvider, tracerProvider, []func(context.Context) error{
meterProvider.Shutdown,
tracerProvider.Shutdown,
})
if err != nil {
return nil, fmt.Errorf("new rtmanager telemetry process: runtime: %w", err)
}
logger.Info("rtmanager telemetry configured",
"service_name", serviceName,
"traces_exporter", cfg.TracesExporter,
"metrics_exporter", cfg.MetricsExporter,
)
return runtime, nil
}
// TracerProvider returns the runtime tracer provider.
func (runtime *Runtime) TracerProvider() oteltrace.TracerProvider {
if runtime == nil || runtime.tracerProvider == nil {
return otel.GetTracerProvider()
}
return runtime.tracerProvider
}
// MeterProvider returns the runtime meter provider.
func (runtime *Runtime) MeterProvider() metric.MeterProvider {
if runtime == nil || runtime.meterProvider == nil {
return otel.GetMeterProvider()
}
return runtime.meterProvider
}
// Shutdown flushes and stops the configured telemetry providers. Shutdown
// is idempotent.
func (runtime *Runtime) Shutdown(ctx context.Context) error {
if runtime == nil {
return nil
}
runtime.shutdownMu.Lock()
if runtime.shutdownDone {
err := runtime.shutdownErr
runtime.shutdownMu.Unlock()
return err
}
runtime.shutdownDone = true
runtime.shutdownMu.Unlock()
runtime.gaugeMu.Lock()
if runtime.gaugeRegistration != nil {
_ = runtime.gaugeRegistration.Unregister()
runtime.gaugeRegistration = nil
}
runtime.gaugeMu.Unlock()
var shutdownErr error
for index := len(runtime.shutdownFns) - 1; index >= 0; index-- {
shutdownErr = errors.Join(shutdownErr, runtime.shutdownFns[index](ctx))
}
runtime.shutdownMu.Lock()
runtime.shutdownErr = shutdownErr
runtime.shutdownMu.Unlock()
return shutdownErr
}
// RecordInternalHTTPRequest records one internal HTTP request outcome.
func (runtime *Runtime) RecordInternalHTTPRequest(ctx context.Context, attrs []attribute.KeyValue, duration time.Duration) {
if runtime == nil || runtime.internalHTTPRequests == nil || runtime.internalHTTPDuration == nil {
return
}
options := metric.WithAttributes(attrs...)
runtime.internalHTTPRequests.Add(normalizeContext(ctx), 1, options)
runtime.internalHTTPDuration.Record(normalizeContext(ctx), duration.Seconds()*1000, options)
}
// RecordStartOutcome records one terminal outcome of the start operation.
// outcome is `success` or `failure`; errorCode is `replay_no_op` or one of
// the stable failure codes from `rtmanager/README.md` §Error Model;
// opSource is `lobby_stream`, `gm_rest`, or `admin_rest`.
func (runtime *Runtime) RecordStartOutcome(ctx context.Context, outcome, errorCode, opSource string) {
if runtime == nil || runtime.startOutcomes == nil {
return
}
runtime.startOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
attribute.String("outcome", outcome),
attribute.String("error_code", errorCode),
attribute.String("op_source", opSource),
))
}
// RecordStopOutcome records one terminal outcome of the stop operation.
// reason is the value carried on `runtime:stop_jobs` or the matching REST
// reason; opSource is `lobby_stream`, `gm_rest`, or `admin_rest`.
func (runtime *Runtime) RecordStopOutcome(ctx context.Context, outcome, reason, opSource string) {
if runtime == nil || runtime.stopOutcomes == nil {
return
}
runtime.stopOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
attribute.String("outcome", outcome),
attribute.String("reason", reason),
attribute.String("op_source", opSource),
))
}
// RecordRestartOutcome records one terminal outcome of the restart
// operation.
func (runtime *Runtime) RecordRestartOutcome(ctx context.Context, outcome, errorCode string) {
if runtime == nil || runtime.restartOutcomes == nil {
return
}
runtime.restartOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
attribute.String("outcome", outcome),
attribute.String("error_code", errorCode),
))
}
// RecordPatchOutcome records one terminal outcome of the patch operation.
func (runtime *Runtime) RecordPatchOutcome(ctx context.Context, outcome, errorCode string) {
if runtime == nil || runtime.patchOutcomes == nil {
return
}
runtime.patchOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
attribute.String("outcome", outcome),
attribute.String("error_code", errorCode),
))
}
// RecordCleanupOutcome records one terminal outcome of the cleanup
// operation. opSource is `auto_ttl` for the periodic cleanup worker and
// `admin_rest` for explicit administrative removal.
func (runtime *Runtime) RecordCleanupOutcome(ctx context.Context, outcome, opSource string) {
if runtime == nil || runtime.cleanupOutcomes == nil {
return
}
runtime.cleanupOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
attribute.String("outcome", outcome),
attribute.String("op_source", opSource),
))
}
// RecordHealthEvent records one technical runtime event published on
// `runtime:health_events`. eventType comes from the frozen vocabulary in
// `rtmanager/README.md` §Async Stream Contracts.
func (runtime *Runtime) RecordHealthEvent(ctx context.Context, eventType string) {
if runtime == nil || runtime.healthEvents == nil {
return
}
runtime.healthEvents.Add(normalizeContext(ctx), 1, metric.WithAttributes(
attribute.String("event_type", eventType),
))
}
// RecordReconcileDrift records one drift outcome from the reconciler. kind
// is `adopt`, `dispose`, or `observed_exited`.
func (runtime *Runtime) RecordReconcileDrift(ctx context.Context, kind string) {
if runtime == nil || runtime.reconcileDrift == nil {
return
}
runtime.reconcileDrift.Add(normalizeContext(ctx), 1, metric.WithAttributes(
attribute.String("kind", kind),
))
}
// RecordNotificationIntent records one admin-only notification intent
// publish attempt. notificationType is `runtime.image_pull_failed`,
// `runtime.container_start_failed`, or `runtime.start_config_invalid`.
func (runtime *Runtime) RecordNotificationIntent(ctx context.Context, notificationType string) {
if runtime == nil || runtime.notificationIntents == nil {
return
}
runtime.notificationIntents.Add(normalizeContext(ctx), 1, metric.WithAttributes(
attribute.String("notification_type", notificationType),
))
}
// RecordDockerOpLatency records the wall-clock duration of one Docker SDK
// call. op is one of `pull`, `create`, `start`, `stop`, `rm`, `inspect`,
// `events`.
func (runtime *Runtime) RecordDockerOpLatency(ctx context.Context, op string, duration time.Duration) {
if runtime == nil || runtime.dockerOpLatency == nil {
return
}
runtime.dockerOpLatency.Record(normalizeContext(ctx), duration.Seconds()*1000, metric.WithAttributes(
attribute.String("op", op),
))
}
// RecordLeaseAcquireLatency records the wall-clock latency of one
// per-game Redis lease acquisition.
func (runtime *Runtime) RecordLeaseAcquireLatency(ctx context.Context, duration time.Duration) {
if runtime == nil || runtime.leaseAcquireLatency == nil {
return
}
runtime.leaseAcquireLatency.Record(normalizeContext(ctx), duration.Seconds()*1000)
}
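As a usage sketch (not part of this commit), one way a service path could feed `RecordDockerOpLatency`: time the SDK call and record the duration whether or not it failed. The `timedStop` wrapper and its wiring are assumptions; only the `Runtime` method and the `ports.DockerClient.Stop` signature come from this codebase.

```go
package example

import (
	"context"
	"time"

	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/telemetry"
)

// timedStop times one Docker stop call and records the duration under
// the "stop" op label. Hypothetical wrapper; the real service paths do
// their own wiring.
func timedStop(ctx context.Context, tel *telemetry.Runtime, docker ports.DockerClient, containerID string, timeout time.Duration) error {
	started := time.Now()
	err := docker.Stop(ctx, containerID, timeout)
	// Latency is recorded for failures as well; outcome counts are emitted
	// separately by the per-operation counters.
	tel.RecordDockerOpLatency(ctx, "stop", time.Since(started))
	return err
}
```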
// RuntimeRecordsByStatusProbe reports the number of runtime_records rows
// per status. The production probe wraps the runtime record store; tests
// may pass a stub.
type RuntimeRecordsByStatusProbe interface {
CountByStatus(ctx context.Context) (map[string]int, error)
}
// GaugeDependencies groups the collaborators required by RegisterGauges.
type GaugeDependencies struct {
// RuntimeRecordsByStatus probes the per-status row count for
// `rtmanager.runtime_records_by_status`.
RuntimeRecordsByStatus RuntimeRecordsByStatusProbe
// Logger records non-fatal probe errors. Defaults to slog.Default
// when nil.
Logger *slog.Logger
}
// RegisterGauges installs the observable-gauge callback that reports
// `rtmanager.runtime_records_by_status`. It may be called more than once
// per Runtime; each later call replaces the previous registration. The runtime
// keeps no strong reference to deps beyond the callback closure.
//
// The wiring layer registers the gauge once the persistence adapters
// are constructed.
func (runtime *Runtime) RegisterGauges(deps GaugeDependencies) error {
if runtime == nil {
return errors.New("register rtmanager gauges: nil runtime")
}
if deps.RuntimeRecordsByStatus == nil {
return errors.New("register rtmanager gauges: nil runtime records probe")
}
logger := deps.Logger
if logger == nil {
logger = slog.Default()
}
runtime.gaugeMu.Lock()
defer runtime.gaugeMu.Unlock()
if runtime.gaugeRegistration != nil {
_ = runtime.gaugeRegistration.Unregister()
runtime.gaugeRegistration = nil
}
callback := func(ctx context.Context, observer metric.Observer) error {
counts, err := deps.RuntimeRecordsByStatus.CountByStatus(ctx)
if err != nil {
logger.WarnContext(ctx, "runtime records probe failed",
"err", err.Error(),
)
return nil
}
for status, count := range counts {
observer.ObserveInt64(runtime.runtimeRecordsByStatus, int64(count), metric.WithAttributes(
attribute.String("status", status),
))
}
return nil
}
registration, err := runtime.meter.RegisterCallback(callback, runtime.runtimeRecordsByStatus)
if err != nil {
return fmt.Errorf("register rtmanager gauges: %w", err)
}
runtime.gaugeRegistration = registration
return nil
}
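A minimal wiring sketch for `RegisterGauges`, assuming a stub probe with fixed counts; the production probe wraps the runtime record store as noted above. The `staticProbe` and `wireGauges` names and the counts are illustrative only.

```go
package example

import (
	"context"
	"log/slog"

	"galaxy/rtmanager/internal/telemetry"
)

// staticProbe satisfies telemetry.RuntimeRecordsByStatusProbe with fixed
// counts; the production probe wraps the runtime record store.
type staticProbe struct{ counts map[string]int }

func (p staticProbe) CountByStatus(context.Context) (map[string]int, error) {
	return p.counts, nil
}

// wireGauges shows the wiring-layer call shape with illustrative counts.
func wireGauges(rt *telemetry.Runtime) error {
	return rt.RegisterGauges(telemetry.GaugeDependencies{
		RuntimeRecordsByStatus: staticProbe{counts: map[string]int{"running": 3, "stopped": 1}},
		Logger:                 slog.Default(),
	})
}
```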
func buildRuntime(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider, shutdownFns []func(context.Context) error) (*Runtime, error) {
meter := meterProvider.Meter(meterName)
runtime := &Runtime{
tracerProvider: tracerProvider,
meterProvider: meterProvider,
meter: meter,
shutdownFns: append([]func(context.Context) error(nil), shutdownFns...),
}
internalHTTPRequests, err := meter.Int64Counter("rtmanager.internal_http.requests")
if err != nil {
return nil, fmt.Errorf("build rtmanager telemetry runtime: internal_http.requests: %w", err)
}
internalHTTPDuration, err := meter.Float64Histogram("rtmanager.internal_http.duration", metric.WithUnit("ms"))
if err != nil {
return nil, fmt.Errorf("build rtmanager telemetry runtime: internal_http.duration: %w", err)
}
runtime.internalHTTPRequests = internalHTTPRequests
runtime.internalHTTPDuration = internalHTTPDuration
if err := registerCounters(meter, runtime); err != nil {
return nil, err
}
if err := registerHistograms(meter, runtime); err != nil {
return nil, err
}
if err := registerObservableGauges(meter, runtime); err != nil {
return nil, err
}
return runtime, nil
}
func registerCounters(meter metric.Meter, runtime *Runtime) error {
specs := []struct {
name string
target *metric.Int64Counter
}{
{"rtmanager.start_outcomes", &runtime.startOutcomes},
{"rtmanager.stop_outcomes", &runtime.stopOutcomes},
{"rtmanager.restart_outcomes", &runtime.restartOutcomes},
{"rtmanager.patch_outcomes", &runtime.patchOutcomes},
{"rtmanager.cleanup_outcomes", &runtime.cleanupOutcomes},
{"rtmanager.health_events", &runtime.healthEvents},
{"rtmanager.reconcile_drift", &runtime.reconcileDrift},
{"rtmanager.notification_intents", &runtime.notificationIntents},
}
for _, spec := range specs {
counter, err := meter.Int64Counter(spec.name)
if err != nil {
return fmt.Errorf("build rtmanager telemetry runtime: %s: %w", spec.name, err)
}
*spec.target = counter
}
return nil
}
func registerHistograms(meter metric.Meter, runtime *Runtime) error {
specs := []struct {
name string
unit string
target *metric.Float64Histogram
}{
{"rtmanager.docker_op_latency", "ms", &runtime.dockerOpLatency},
{"rtmanager.lease_acquire_latency", "ms", &runtime.leaseAcquireLatency},
}
for _, spec := range specs {
options := []metric.Float64HistogramOption{}
if spec.unit != "" {
options = append(options, metric.WithUnit(spec.unit))
}
histogram, err := meter.Float64Histogram(spec.name, options...)
if err != nil {
return fmt.Errorf("build rtmanager telemetry runtime: %s: %w", spec.name, err)
}
*spec.target = histogram
}
return nil
}
func registerObservableGauges(meter metric.Meter, runtime *Runtime) error {
gauge, err := meter.Int64ObservableGauge("rtmanager.runtime_records_by_status")
if err != nil {
return fmt.Errorf("build rtmanager telemetry runtime: runtime_records_by_status: %w", err)
}
runtime.runtimeRecordsByStatus = gauge
return nil
}
func newTracerProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig) (*sdktrace.TracerProvider, error) {
options := []sdktrace.TracerProviderOption{
sdktrace.WithResource(res),
}
if exporter, err := traceExporter(ctx, cfg); err != nil {
return nil, err
} else if exporter != nil {
options = append(options, sdktrace.WithBatcher(exporter))
}
if cfg.StdoutTracesEnabled {
exporter, err := stdouttrace.New(stdouttrace.WithWriter(os.Stdout))
if err != nil {
return nil, fmt.Errorf("stdout traces exporter: %w", err)
}
options = append(options, sdktrace.WithBatcher(exporter))
}
return sdktrace.NewTracerProvider(options...), nil
}
func newMeterProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig) (*sdkmetric.MeterProvider, error) {
options := []sdkmetric.Option{
sdkmetric.WithResource(res),
}
if exporter, err := metricExporter(ctx, cfg); err != nil {
return nil, err
} else if exporter != nil {
options = append(options, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter)))
}
if cfg.StdoutMetricsEnabled {
exporter, err := stdoutmetric.New(stdoutmetric.WithWriter(os.Stdout))
if err != nil {
return nil, fmt.Errorf("stdout metrics exporter: %w", err)
}
options = append(options, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter)))
}
return sdkmetric.NewMeterProvider(options...), nil
}
func traceExporter(ctx context.Context, cfg ProcessConfig) (sdktrace.SpanExporter, error) {
if cfg.TracesExporter != processExporterOTLP {
return nil, nil
}
switch normalizeProtocol(cfg.TracesProtocol) {
case processProtocolGRPC:
exporter, err := otlptracegrpc.New(ctx)
if err != nil {
return nil, fmt.Errorf("otlp grpc traces exporter: %w", err)
}
return exporter, nil
default:
exporter, err := otlptracehttp.New(ctx)
if err != nil {
return nil, fmt.Errorf("otlp http traces exporter: %w", err)
}
return exporter, nil
}
}
func metricExporter(ctx context.Context, cfg ProcessConfig) (sdkmetric.Exporter, error) {
if cfg.MetricsExporter != processExporterOTLP {
return nil, nil
}
switch normalizeProtocol(cfg.MetricsProtocol) {
case processProtocolGRPC:
exporter, err := otlpmetricgrpc.New(ctx)
if err != nil {
return nil, fmt.Errorf("otlp grpc metrics exporter: %w", err)
}
return exporter, nil
default:
exporter, err := otlpmetrichttp.New(ctx)
if err != nil {
return nil, fmt.Errorf("otlp http metrics exporter: %w", err)
}
return exporter, nil
}
}
func normalizeProtocol(value string) string {
switch strings.TrimSpace(value) {
case processProtocolGRPC:
return processProtocolGRPC
default:
return processProtocolHTTPProtobuf
}
}
func normalizeContext(ctx context.Context) context.Context {
if ctx == nil {
return context.Background()
}
return ctx
}
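To round off the package, a hedged sketch of process wiring: build a `ProcessConfig`, call `NewProcess`, and defer a bounded `Shutdown`. The exporter and protocol values come from the supported set above; the `setupTelemetry` helper name and the five-second flush timeout are assumptions.

```go
package example

import (
	"context"
	"log/slog"
	"time"

	"galaxy/rtmanager/internal/telemetry"
)

// setupTelemetry builds the process-wide runtime with OTLP traces over
// gRPC and no external metrics exporter, and returns a shutdown func for
// main to defer. Hypothetical main-package helper.
func setupTelemetry(ctx context.Context, logger *slog.Logger) (*telemetry.Runtime, func(), error) {
	if logger == nil {
		logger = slog.Default()
	}
	rt, err := telemetry.NewProcess(ctx, telemetry.ProcessConfig{
		ServiceName:     "galaxy-rtmanager",
		TracesExporter:  "otlp",
		TracesProtocol:  "grpc",
		MetricsExporter: "none",
	}, logger)
	if err != nil {
		return nil, nil, err
	}
	shutdown := func() {
		// Bound the flush so a hung collector cannot stall process exit.
		flushCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		defer cancel()
		if shutdownErr := rt.Shutdown(flushCtx); shutdownErr != nil {
			logger.Warn("telemetry shutdown", "err", shutdownErr.Error())
		}
	}
	return rt, shutdown, nil
}
```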
