Files
galaxy-game/integration/testenv/backend.go
T
Ilia Denisov a338ebf058
Tests · Integration / integration (pull_request) Successful in 1m37s
fix(integration): scope preclean to galaxy.stack=integration
Root cause for the long-standing "Dev Sandbox flips to cancelled
after dev-deploy" symptom in push-triggered cycles: when
`integration.yaml` runs in parallel with `dev-deploy.yaml`, its
`integration/scripts/preclean.sh` issues a `docker rm -f` over every
container labelled `galaxy.backend=1`. That label is stamped by the
backend's runtime adapter on every engine it spawns — including the
engines living in the long-lived dev-deploy environment on the same
Docker daemon. Each post-merge auto-deploy therefore had the
integration preclean wipe the dev-sandbox engine, and the new
backend's reconciler tick observed `container disappeared` and
cascaded the sandbox into `cancelled`.

Fix:

- `integration/testenv/backend.go` now sets
  `BACKEND_STACK_LABEL=integration` on every backend-under-test, so
  the engines spawned by integration carry
  `galaxy.stack=integration` in addition to `galaxy.backend=1`. The
  backend support for this env was added in the previous CI tidy-up
  PR (#13).

- `integration/scripts/preclean.sh` gains a multi-label AND filter
  helper and uses it to scope engine cleanup to the combination
  `galaxy.backend=1 AND galaxy.stack=integration`. dev-deploy and
  local-dev engines carry different `galaxy.stack` values, so the
  AND match leaves them alone.

- `docs/ARCHITECTURE.md` "Container labels" — refreshed to call out
  the AND-scoping rule and the new integration backend stamp.

- `tools/dev-deploy/KNOWN-ISSUES.md` — the sandbox-cancel entry
  gets an "Update" section recording the root cause and the fix; the
  status is downgraded to "partially fixed" because the solo
  `workflow_dispatch` reproduction (which does NOT trigger
  integration) remains unexplained.

- `tools/dev-deploy/KNOWN-ISSUES.md` — separately, document the
  `docker restart galaxy-dev-backend` failure caused by the
  runner-workspace bind-mount that surfaced while diagnosing this
  issue. Workaround: `make -C tools/dev-deploy up` from the
  persistent checkout. Real fix is a follow-up (bake fixture into
  image or copy to named volume).

Verification:

- `go build ./backend/... ./integration/...` — clean.
- `bash -n integration/scripts/preclean.sh` — syntax OK.
- Live AND-filter check on the dev host:
  `docker ps -aq --filter label=galaxy.backend=1 --filter label=galaxy.stack=integration`
  returns nothing while the dev-deploy engine
  `galaxy-game-80f3ce86-...` keeps running.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 01:37:55 +02:00

189 lines
6.2 KiB
Go

package testenv
import (
"context"
"fmt"
"path/filepath"
"testing"
"time"
"github.com/google/uuid"
"github.com/moby/moby/api/types/container"
"github.com/moby/moby/api/types/mount"
"github.com/testcontainers/testcontainers-go"
tcnetwork "github.com/testcontainers/testcontainers-go/network"
"github.com/testcontainers/testcontainers-go/wait"
)
// BackendContainer is the handle StartBackend returns for a running
// galaxy/backend:integration container. It bundles the testcontainers
// object with the host-reachable HTTP and gRPC endpoints and the
// bootstrap admin credentials the container was started with.
type BackendContainer struct {
	// Container is the underlying testcontainers handle.
	Container testcontainers.Container

	// HTTPHost and HTTPPort address the container's 8080/tcp port as
	// mapped on the host; HTTPURL is the same endpoint rendered as
	// "http://host:port".
	HTTPHost string
	HTTPPort int
	HTTPURL  string

	// GRPCHost and GRPCPort address the container's 8081/tcp port as
	// mapped on the host; GRPCURL is the same endpoint rendered as
	// "host:port" (no scheme).
	GRPCHost string
	GRPCPort int
	GRPCURL  string

	// AdminUser and AdminPassword hold the bootstrap admin credentials
	// this container booted with, so tests exercising the admin
	// surface can reuse them directly.
	AdminUser     string
	AdminPassword string
}
// BackendOptions carries the per-test settings StartBackend applies to
// a backend container before it boots.
type BackendOptions struct {
	// NetworkAlias is the alias the container gets on NetworkName.
	// StartBackend substitutes "backend" when it is empty.
	NetworkAlias string
	// NetworkName is the Docker network to attach to; it is also passed
	// to the backend as BACKEND_DOCKER_NETWORK. When empty, StartBackend
	// performs no network attachment.
	NetworkName string

	// PostgresDSN is passed to the backend as BACKEND_POSTGRES_DSN.
	PostgresDSN string

	// MailpitHost and MailpitPort are passed to the backend as
	// BACKEND_SMTP_HOST and BACKEND_SMTP_PORT.
	MailpitHost string
	MailpitPort int

	// GeoIPHostPath is the host-side file StartBackend copies into the
	// container as the GeoIP database.
	GeoIPHostPath string

	// AdminEmail is passed as BACKEND_NOTIFICATION_ADMIN_EMAIL.
	// StartBackend substitutes "admin@galaxy.test" when it is empty.
	AdminEmail string

	// Extra holds additional environment variables. They are applied
	// after the built-in ones and override them on key collision.
	Extra map[string]string
}
// StartBackend boots a galaxy/backend:integration container configured
// from opts and returns its host-reachable endpoints together with the
// bootstrap admin credentials it was started with.
//
// Defaults: an empty opts.NetworkAlias becomes "backend" and an empty
// opts.AdminEmail becomes "admin@galaxy.test". Entries in opts.Extra are
// applied last and override any built-in environment variable. When
// opts.NetworkName is non-empty the container is attached to that Docker
// network under opts.NetworkAlias.
//
// Any failure aborts the calling test through t.Fatalf. Container
// termination is registered with t.Cleanup; a termination error is only
// logged.
func StartBackend(t *testing.T, opts BackendOptions) *BackendContainer {
	t.Helper()
	EnsureBackendImage(t)

	if opts.NetworkAlias == "" {
		opts.NetworkAlias = "backend"
	}
	if opts.AdminEmail == "" {
		opts.AdminEmail = "admin@galaxy.test"
	}

	geoIPInContainer := "/var/lib/galaxy/geoip.mmdb"

	// Use a unique daemon-side path for each test so concurrent runs
	// cannot collide. Docker creates the source directory at container
	// start because BindOptions.CreateMountpoint=true.
	// NOTE(review): nothing in this function removes stateRoot from the
	// daemon host afterwards — confirm cleanup happens elsewhere.
	stateRoot := "/tmp/galaxy-state-" + uuid.NewString()

	env := map[string]string{
		"BACKEND_HTTP_LISTEN_ADDR":             ":8080",
		"BACKEND_GRPC_PUSH_LISTEN_ADDR":        ":8081",
		"BACKEND_LOGGING_LEVEL":                "info",
		"BACKEND_POSTGRES_DSN":                 opts.PostgresDSN,
		"BACKEND_SMTP_HOST":                    opts.MailpitHost,
		"BACKEND_SMTP_PORT":                    fmt.Sprintf("%d", opts.MailpitPort),
		"BACKEND_SMTP_FROM":                    "galaxy-backend@galaxy.test",
		"BACKEND_SMTP_TLS_MODE":                "none",
		"BACKEND_DOCKER_NETWORK":               opts.NetworkName,
		"BACKEND_GAME_STATE_ROOT":              stateRoot,
		"BACKEND_ADMIN_BOOTSTRAP_USER":         "bootstrap",
		"BACKEND_ADMIN_BOOTSTRAP_PASSWORD":     "bootstrap-secret",
		"BACKEND_GEOIP_DB_PATH":                geoIPInContainer,
		"BACKEND_OTEL_TRACES_EXPORTER":         "none",
		"BACKEND_OTEL_METRICS_EXPORTER":        "none",
		"BACKEND_NOTIFICATION_ADMIN_EMAIL":     opts.AdminEmail,
		"BACKEND_AUTH_CHALLENGE_THROTTLE_MAX":  "100",
		"BACKEND_MAIL_WORKER_INTERVAL":         "500ms",
		"BACKEND_NOTIFICATION_WORKER_INTERVAL": "500ms",

		// Stamp galaxy.stack=integration on every engine container the
		// backend-under-test spawns so preclean.sh can scope its
		// cleanup to integration-owned engines and leave dev-deploy /
		// local-dev stacks running on the same daemon untouched. See
		// `integration/scripts/preclean.sh` and the "Container labels"
		// section in `docs/ARCHITECTURE.md`.
		"BACKEND_STACK_LABEL": "integration",
	}
	// Caller-supplied variables win over the defaults above.
	for k, v := range opts.Extra {
		env[k] = v
	}

	dockerSocket := DockerSocketPath()

	// The backend counts as ready once /healthz answers on the HTTP port.
	ready := wait.ForHTTP("/healthz").
		WithPort("8080/tcp").
		WithStartupTimeout(60 * time.Second)

	geoIPFile := testcontainers.ContainerFile{
		HostFilePath:      opts.GeoIPHostPath,
		ContainerFilePath: geoIPInContainer,
		FileMode:          0o644,
	}

	hostConfig := func(hc *container.HostConfig) {
		// Hand the Docker daemon socket to the backend.
		hc.Binds = append(hc.Binds, dockerSocket+":/var/run/docker.sock")
		// Bind a unique daemon-side directory at the same path inside
		// the backend container. CreateMountpoint=true asks the daemon
		// to create the source directory if it is missing, so we do not
		// need a second container just to mkdir on the daemon host.
		// Per-game subdirectories are created by backend's runtime via
		// os.MkdirAll before each engine container start.
		hc.Mounts = append(hc.Mounts, mount.Mount{
			Type:   mount.TypeBind,
			Source: stateRoot,
			Target: stateRoot,
			BindOptions: &mount.BindOptions{
				CreateMountpoint: true,
			},
		})
	}

	gcr := &testcontainers.GenericContainerRequest{
		ContainerRequest: testcontainers.ContainerRequest{
			Image:              BackendImage,
			ExposedPorts:       []string{"8080/tcp", "8081/tcp"},
			Env:                env,
			WaitingFor:         ready,
			Files:              []testcontainers.ContainerFile{geoIPFile},
			HostConfigModifier: hostConfig,
			// The distroless `nonroot` user (uid 65532) cannot reach
			// the Docker daemon socket that backend mounts to manage
			// engine containers. In integration tests we run as root
			// so the dockerclient.EnsureNetwork startup probe
			// succeeds; the production deployment will rely on a
			// docker-socket-proxy sidecar (see ARCHITECTURE.md §13).
			User: "0:0",
		},
		Started: true,
	}

	if opts.NetworkName != "" {
		attach := tcnetwork.WithNetwork(
			[]string{opts.NetworkAlias},
			&testcontainers.DockerNetwork{Name: opts.NetworkName},
		)
		// Surface a failed customization here instead of letting it
		// show up later as an unreachable-backend symptom.
		if err := attach.Customize(gcr); err != nil {
			t.Fatalf("attach backend to network %q: %v", opts.NetworkName, err)
		}
	}

	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
	defer cancel()

	ctr, err := testcontainers.GenericContainer(ctx, *gcr)
	// Register termination before looking at err: GenericContainer can
	// hand back a created-but-not-started container together with an
	// error, and that container must still be removed. Per the
	// testcontainers-go docs TerminateContainer ignores a nil container.
	t.Cleanup(func() {
		if err := testcontainers.TerminateContainer(ctr); err != nil {
			t.Logf("terminate backend: %v", err)
		}
	})
	if err != nil {
		t.Fatalf("start backend container: %v", err)
	}

	host, err := ctr.Host(ctx)
	if err != nil {
		t.Fatalf("backend host: %v", err)
	}
	httpPort, err := ctr.MappedPort(ctx, "8080/tcp")
	if err != nil {
		t.Fatalf("backend http port: %v", err)
	}
	grpcPort, err := ctr.MappedPort(ctx, "8081/tcp")
	if err != nil {
		t.Fatalf("backend grpc port: %v", err)
	}

	return &BackendContainer{
		Container: ctr,
		HTTPHost:  host,
		HTTPPort:  int(httpPort.Num()),
		HTTPURL:   fmt.Sprintf("http://%s:%d", host, httpPort.Num()),
		GRPCHost:  host,
		GRPCPort:  int(grpcPort.Num()),
		GRPCURL:   fmt.Sprintf("%s:%d", host, grpcPort.Num()),
		// Read the credentials back from env so an opts.Extra override
		// is reflected in what tests log in with.
		AdminUser:     env["BACKEND_ADMIN_BOOTSTRAP_USER"],
		AdminPassword: env["BACKEND_ADMIN_BOOTSTRAP_PASSWORD"],
	}
}
// The blank identifier keeps the path/filepath import referenced:
// nothing else visible in this file uses the package, and the Go
// compiler rejects unused imports. The guard exists so the import can
// stay in place for when the network helper grows here later; drop it
// once real filepath usage lands.
var _ = filepath.Separator