// Package docker provides the production Docker SDK adapter that
// implements `galaxy/rtmanager/internal/ports.DockerClient`. The
// adapter is the single component allowed to talk to the local Docker
// daemon; every Runtime Manager service path that needs container
// lifecycle operations goes through this surface.
//
// The adapter is intentionally narrow — it does not orchestrate, log,
// or retry. Cross-cutting concerns (lease coordination, durable state,
// notification side-effects) live in the service layer.
package docker
import (
"context"
"errors"
"fmt"
"io"
"maps"
"strings"
"sync"
"time"
cerrdefs "github.com/containerd/errdefs"
"github.com/docker/docker/api/types/container"
"github.com/docker/docker/api/types/events"
"github.com/docker/docker/api/types/filters"
"github.com/docker/docker/api/types/image"
"github.com/docker/docker/api/types/network"
dockerclient "github.com/docker/docker/client"
"github.com/docker/go-units"
"galaxy/rtmanager/internal/ports"
)
// EnginePort is the in-container HTTP port the engine listens on. The
// value is fixed by `rtmanager/README.md §Container Model` and by the
// engine's Dockerfile (`game/Dockerfile`); RTM never publishes the port
// to the host. Keeping the constant here lets the adapter own the URL
// shape so the start service does not have to know it.
const EnginePort = 8080
// Config groups the dependencies and per-process defaults required to
// construct a Client. The struct is value-typed so wiring code can
// build it inline without intermediate variables.
type Config struct {
// Docker stores the SDK client this adapter wraps. It must be
// non-nil; callers typically construct it via `client.NewClientWithOpts`.
Docker *dockerclient.Client
// LogDriver stores the Docker logging driver applied to every
// container the adapter creates (e.g. `json-file`).
LogDriver string
// LogOpts stores the comma-separated `key=value` driver options
// forwarded to Docker. Empty disables driver-specific options.
LogOpts string
// Clock supplies the wall-clock used for `RunResult.StartedAt`.
// Defaults to `time.Now` when nil.
Clock func() time.Time
}
// Client is the production adapter implementing `ports.DockerClient`.
// Construct it via NewClient; do not zero-initialise.
type Client struct {
docker *dockerclient.Client
logDriver string
logOpts string
clock func() time.Time
}
// NewClient constructs a Client from cfg. It returns an error if cfg
// does not carry the minimum collaborator set the adapter needs to
// function.
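//
// A minimal wiring sketch; the SDK options and log settings shown here
// are illustrative, not mandated by RTM:
//
//	sdk, err := dockerclient.NewClientWithOpts(dockerclient.FromEnv, dockerclient.WithAPIVersionNegotiation())
//	if err != nil {
//		return err
//	}
//	adapter, err := NewClient(Config{
//		Docker:    sdk,
//		LogDriver: "json-file",
//		LogOpts:   "max-size=10m,max-file=3",
//	})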
func NewClient(cfg Config) (*Client, error) {
if cfg.Docker == nil {
return nil, errors.New("new docker adapter: nil docker client")
}
if strings.TrimSpace(cfg.LogDriver) == "" {
return nil, errors.New("new docker adapter: log driver must not be empty")
}
clock := cfg.Clock
if clock == nil {
clock = time.Now
}
return &Client{
docker: cfg.Docker,
logDriver: cfg.LogDriver,
logOpts: cfg.LogOpts,
clock: clock,
}, nil
}
// EnsureNetwork verifies the user-defined Docker network is present.
// The adapter never creates networks; provisioning is the operator's
// job per `rtmanager/README.md §Container Model`.
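//
// A usage sketch (the network name is illustrative):
//
//	if err := adapter.EnsureNetwork(ctx, "galaxy-net"); errors.Is(err, ports.ErrNetworkMissing) {
//		// The operator must create the network (e.g. `docker network create galaxy-net`)
//		// before any engine container can be started.
//	}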
func (client *Client) EnsureNetwork(ctx context.Context, name string) error {
if _, err := client.docker.NetworkInspect(ctx, name, network.InspectOptions{}); err != nil {
if cerrdefs.IsNotFound(err) {
return ports.ErrNetworkMissing
}
return fmt.Errorf("ensure network %q: %w", name, err)
}
return nil
}
// PullImage pulls ref according to policy. The pull stream is drained
// to completion because the Docker SDK only finishes the underlying
// pull when the body is consumed.
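//
// A usage sketch (the image ref is illustrative):
//
//	// Pull only when the image is absent locally; under PullPolicyNever a
//	// missing image surfaces as ports.ErrImageNotFound instead.
//	err := adapter.PullImage(ctx, "registry.example/galaxy/engine:1.2.3", ports.PullPolicyIfMissing)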
func (client *Client) PullImage(ctx context.Context, ref string, policy ports.PullPolicy) error {
if !policy.IsKnown() {
return fmt.Errorf("pull image %q: unknown pull policy %q", ref, policy)
}
switch policy {
case ports.PullPolicyAlways:
return client.runPull(ctx, ref)
case ports.PullPolicyIfMissing:
if present, err := client.imagePresent(ctx, ref); err != nil {
return err
} else if present {
return nil
}
return client.runPull(ctx, ref)
case ports.PullPolicyNever:
present, err := client.imagePresent(ctx, ref)
if err != nil {
return err
}
if !present {
return ports.ErrImageNotFound
}
return nil
default:
return fmt.Errorf("pull image %q: unsupported pull policy %q", ref, policy)
}
}
// InspectImage returns image metadata for ref. RTM only reads image
// labels at start time, so the adapter narrows the SDK response down to
// the ref and its labels.
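//
// A usage sketch; the label key is illustrative:
//
//	img, err := adapter.InspectImage(ctx, ref)
//	if err != nil {
//		return err
//	}
//	engineVersion := img.Labels["com.galaxy.engine.version"]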
func (client *Client) InspectImage(ctx context.Context, ref string) (ports.ImageInspect, error) {
inspect, err := client.docker.ImageInspect(ctx, ref)
if err != nil {
if cerrdefs.IsNotFound(err) {
return ports.ImageInspect{}, ports.ErrImageNotFound
}
return ports.ImageInspect{}, fmt.Errorf("inspect image %q: %w", ref, err)
}
var labels map[string]string
if inspect.Config != nil {
labels = copyStringMap(inspect.Config.Labels)
}
return ports.ImageInspect{Ref: ref, Labels: labels}, nil
}
// InspectContainer returns container metadata for containerID. Docker
// timestamps are decoded best-effort: malformed values map to the zero
// time. Nil sub-structs in the SDK response are tolerated so callers do
// not have to defend against them.
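//
// A usage sketch:
//
//	info, err := adapter.InspectContainer(ctx, containerID)
//	if err != nil {
//		return err
//	}
//	if info.OOMKilled {
//		// e.g. report an out-of-memory crash with info.ExitCode and info.FinishedAt
//	}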
func (client *Client) InspectContainer(ctx context.Context, containerID string) (ports.ContainerInspect, error) {
inspect, err := client.docker.ContainerInspect(ctx, containerID)
if err != nil {
if cerrdefs.IsNotFound(err) {
return ports.ContainerInspect{}, ports.ErrContainerNotFound
}
return ports.ContainerInspect{}, fmt.Errorf("inspect container %q: %w", containerID, err)
}
result := ports.ContainerInspect{ID: inspect.ID}
if inspect.ContainerJSONBase != nil {
result.RestartCount = inspect.RestartCount
if inspect.State != nil {
result.Status = string(inspect.State.Status)
result.OOMKilled = inspect.State.OOMKilled
result.ExitCode = inspect.State.ExitCode
result.StartedAt = parseDockerTime(inspect.State.StartedAt)
result.FinishedAt = parseDockerTime(inspect.State.FinishedAt)
if inspect.State.Health != nil {
result.Health = string(inspect.State.Health.Status)
}
}
}
if inspect.Config != nil {
result.ImageRef = inspect.Config.Image
result.Hostname = inspect.Config.Hostname
result.Labels = copyStringMap(inspect.Config.Labels)
}
return result, nil
}
// Run creates and starts one container according to spec. On
// `ContainerStart` failure the adapter best-effort removes the partial
// container so the start service never has to clean up after a failed
// start path.
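//
// A usage sketch; every value below is illustrative and only the fields
// this adapter exercises are shown:
//
//	res, err := adapter.Run(ctx, ports.RunSpec{
//		Name:      "galaxy-engine-4242",
//		Image:     "registry.example/galaxy/engine:1.2.3",
//		Hostname:  "engine-4242",
//		Network:   "galaxy-net",
//		Memory:    "512m",
//		CPUQuota:  0.5,
//		PIDsLimit: 256,
//		Env:       map[string]string{"GAME_ID": "4242"},
//		Labels:    map[string]string{"com.galaxy.owner": "rtmanager"},
//	})
//	// res.EngineEndpoint == "http://engine-4242:8080"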
func (client *Client) Run(ctx context.Context, spec ports.RunSpec) (ports.RunResult, error) {
if err := spec.Validate(); err != nil {
return ports.RunResult{}, fmt.Errorf("run container: %w", err)
}
memoryBytes, err := units.RAMInBytes(spec.Memory)
if err != nil {
return ports.RunResult{}, fmt.Errorf("run container %q: parse memory %q: %w", spec.Name, spec.Memory, err)
}
pidsLimit := int64(spec.PIDsLimit)
containerCfg := &container.Config{
Image: spec.Image,
Hostname: spec.Hostname,
Env: envMapToSlice(spec.Env),
Labels: copyStringMap(spec.Labels),
Cmd: append([]string(nil), spec.Cmd...),
}
hostCfg := &container.HostConfig{
Binds: bindMountsToBinds(spec.BindMounts),
LogConfig: container.LogConfig{
Type: client.logDriver,
Config: parseLogOpts(client.logOpts),
},
Resources: container.Resources{
NanoCPUs: int64(spec.CPUQuota * 1e9),
Memory: memoryBytes,
PidsLimit: &pidsLimit,
},
}
netCfg := &network.NetworkingConfig{
EndpointsConfig: map[string]*network.EndpointSettings{
spec.Network: {
Aliases: []string{spec.Hostname},
},
},
}
created, err := client.docker.ContainerCreate(ctx, containerCfg, hostCfg, netCfg, nil, spec.Name)
if err != nil {
return ports.RunResult{}, fmt.Errorf("create container %q: %w", spec.Name, err)
}
if err := client.docker.ContainerStart(ctx, created.ID, container.StartOptions{}); err != nil {
client.cleanupAfterFailedStart(created.ID)
return ports.RunResult{}, fmt.Errorf("start container %q: %w", spec.Name, err)
}
return ports.RunResult{
ContainerID: created.ID,
EngineEndpoint: fmt.Sprintf("http://%s:%d", spec.Hostname, EnginePort),
StartedAt: client.clock(),
}, nil
}
// Stop bounds graceful shutdown by timeout. A missing container is
// surfaced as ErrContainerNotFound so the service layer can treat it
// as already-stopped per `rtmanager/README.md §Lifecycles → Stop`.
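//
// A usage sketch:
//
//	if err := adapter.Stop(ctx, containerID, 30*time.Second); err != nil &&
//		!errors.Is(err, ports.ErrContainerNotFound) {
//		return err // anything but "already gone" is a real failure
//	}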
func (client *Client) Stop(ctx context.Context, containerID string, timeout time.Duration) error {
seconds := max(int(timeout.Round(time.Second).Seconds()), 0)
if err := client.docker.ContainerStop(ctx, containerID, container.StopOptions{Timeout: &seconds}); err != nil {
if cerrdefs.IsNotFound(err) {
return ports.ErrContainerNotFound
}
return fmt.Errorf("stop container %q: %w", containerID, err)
}
return nil
}
// Remove removes the container without forcing kill. A missing
// container is reported as success so callers can treat the operation
// as idempotent.
func (client *Client) Remove(ctx context.Context, containerID string) error {
if err := client.docker.ContainerRemove(ctx, containerID, container.RemoveOptions{}); err != nil {
if cerrdefs.IsNotFound(err) {
return nil
}
return fmt.Errorf("remove container %q: %w", containerID, err)
}
return nil
}
// List returns container summaries that match filter. Empty Labels
// match every container; the reconciler always passes
// `com.galaxy.owner=rtmanager`.
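//
// A usage sketch mirroring the reconciler's call:
//
//	owned, err := adapter.List(ctx, ports.ListFilter{
//		Labels: map[string]string{"com.galaxy.owner": "rtmanager"},
//	})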
func (client *Client) List(ctx context.Context, filter ports.ListFilter) ([]ports.ContainerSummary, error) {
args := filters.NewArgs()
for key, value := range filter.Labels {
args.Add("label", key+"="+value)
}
summaries, err := client.docker.ContainerList(ctx, container.ListOptions{All: true, Filters: args})
if err != nil {
return nil, fmt.Errorf("list containers: %w", err)
}
out := make([]ports.ContainerSummary, 0, len(summaries))
for _, summary := range summaries {
hostname := ""
if len(summary.Names) > 0 {
hostname = strings.TrimPrefix(summary.Names[0], "/")
}
out = append(out, ports.ContainerSummary{
ID: summary.ID,
ImageRef: summary.Image,
Hostname: hostname,
Labels: copyStringMap(summary.Labels),
Status: string(summary.State),
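// The list API exposes only the creation timestamp, so Created
// stands in for StartedAt here.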
StartedAt: time.Unix(summary.Created, 0).UTC(),
})
}
return out, nil
}
// EventsListen subscribes to the Docker events stream and returns a
// typed channel of decoded container events plus an asynchronous
// error channel. The caller cancels ctx to terminate the subscription;
// the goroutine closes both channels on termination.
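//
// A consumer sketch:
//
//	evs, errs, err := adapter.EventsListen(ctx)
//	if err != nil {
//		return err
//	}
//	for {
//		select {
//		case ev, ok := <-evs:
//			if !ok {
//				return nil // subscription ended
//			}
//			// react to ev.Action, ev.ContainerID, ev.ExitCode, ...
//		case err, ok := <-errs:
//			if !ok {
//				return nil // subscription ended
//			}
//			return err // stream broke; caller decides whether to resubscribe
//		}
//	}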
func (client *Client) EventsListen(ctx context.Context) (<-chan ports.DockerEvent, <-chan error, error) {
msgs, sdkErrs := client.docker.Events(ctx, events.ListOptions{})
out := make(chan ports.DockerEvent)
outErrs := make(chan error, 1)
var closeOnce sync.Once
closeAll := func() {
closeOnce.Do(func() {
close(out)
close(outErrs)
})
}
go func() {
defer closeAll()
for {
select {
case <-ctx.Done():
return
case msg, ok := <-msgs:
if !ok {
return
}
if msg.Type != events.ContainerEventType {
continue
}
select {
case <-ctx.Done():
return
case out <- decodeEvent(msg):
}
case err, ok := <-sdkErrs:
if !ok {
return
}
if err == nil {
continue
}
select {
case <-ctx.Done():
case outErrs <- err:
}
return
}
}
}()
return out, outErrs, nil
}
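// cleanupAfterFailedStart force-removes a container whose start failed.
// It uses a fresh background context with its own timeout because the
// caller's context may already be cancelled or expired.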
func (client *Client) cleanupAfterFailedStart(containerID string) {
cleanupCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
_ = client.docker.ContainerRemove(cleanupCtx, containerID, container.RemoveOptions{Force: true})
}
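// imagePresent reports whether ref exists in the local image store.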
func (client *Client) imagePresent(ctx context.Context, ref string) (bool, error) {
if _, err := client.docker.ImageInspect(ctx, ref); err != nil {
if cerrdefs.IsNotFound(err) {
return false, nil
}
return false, fmt.Errorf("inspect image %q: %w", ref, err)
}
return true, nil
}
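// runPull performs the pull and drains the progress stream so the SDK
// actually completes the download before returning.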
func (client *Client) runPull(ctx context.Context, ref string) error {
body, err := client.docker.ImagePull(ctx, ref, image.PullOptions{})
if err != nil {
if cerrdefs.IsNotFound(err) {
return ports.ErrImageNotFound
}
return fmt.Errorf("pull image %q: %w", ref, err)
}
defer body.Close()
if _, err := io.Copy(io.Discard, body); err != nil {
return fmt.Errorf("drain pull stream for %q: %w", ref, err)
}
return nil
}
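// envMapToSlice flattens the environment map into Docker's KEY=value
// slice form; iteration order is unspecified.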
func envMapToSlice(envMap map[string]string) []string {
if len(envMap) == 0 {
return nil
}
out := make([]string, 0, len(envMap))
for key, value := range envMap {
out = append(out, key+"="+value)
}
return out
}
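// bindMountsToBinds renders bind mounts as Docker host:container[:ro]
// bind strings.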
func bindMountsToBinds(mounts []ports.BindMount) []string {
if len(mounts) == 0 {
return nil
}
binds := make([]string, 0, len(mounts))
for _, mount := range mounts {
bind := mount.HostPath + ":" + mount.MountPath
if mount.ReadOnly {
bind += ":ro"
}
binds = append(binds, bind)
}
return binds
}
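// parseLogOpts splits the comma-separated key=value log options into the
// map Docker expects; blank or malformed entries are skipped.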
func parseLogOpts(raw string) map[string]string {
if strings.TrimSpace(raw) == "" {
return nil
}
out := make(map[string]string)
for part := range strings.SplitSeq(raw, ",") {
entry := strings.TrimSpace(part)
if entry == "" {
continue
}
index := strings.IndexByte(entry, '=')
if index <= 0 {
continue
}
out[entry[:index]] = entry[index+1:]
}
if len(out) == 0 {
return nil
}
return out
}
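// parseDockerTime decodes Docker's RFC3339Nano timestamps; empty or
// malformed values map to the zero time.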
func parseDockerTime(raw string) time.Time {
if raw == "" {
return time.Time{}
}
parsed, err := time.Parse(time.RFC3339Nano, raw)
if err != nil {
return time.Time{}
}
return parsed.UTC()
}
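// copyStringMap returns a defensive copy so callers never share or
// mutate SDK-owned maps; nil stays nil.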
func copyStringMap(in map[string]string) map[string]string {
if in == nil {
return nil
}
out := make(map[string]string, len(in))
maps.Copy(out, in)
return out
}
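// decodeEvent maps one SDK event message onto the port's DockerEvent,
// preferring the nanosecond timestamp and tolerating a missing or
// non-numeric exitCode attribute.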
func decodeEvent(msg events.Message) ports.DockerEvent {
occurredAt := time.Time{}
switch {
case msg.TimeNano != 0:
occurredAt = time.Unix(0, msg.TimeNano).UTC()
case msg.Time != 0:
occurredAt = time.Unix(msg.Time, 0).UTC()
}
exitCode := 0
if raw, ok := msg.Actor.Attributes["exitCode"]; ok {
if value, err := parseExitCode(raw); err == nil {
exitCode = value
}
}
return ports.DockerEvent{
Action: string(msg.Action),
ContainerID: msg.Actor.ID,
Labels: copyStringMap(msg.Actor.Attributes),
ExitCode: exitCode,
OccurredAt: occurredAt,
}
}
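// parseExitCode converts Docker's decimal exitCode attribute to an int;
// any non-digit rune is rejected.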
func parseExitCode(raw string) (int, error) {
value := 0
for _, r := range raw {
if r < '0' || r > '9' {
return 0, fmt.Errorf("non-numeric exit code %q", raw)
}
value = value*10 + int(r-'0')
}
return value, nil
}
// Compile-time assertion: Client implements ports.DockerClient.
var _ ports.DockerClient = (*Client)(nil)