Merge pull request 'dev-deploy: production mirror + full observability behind the /_gm gate' (#88) from feature/dev-prod-mirror into development
Deploy · Dev / deploy (push) Successful in 42s
Tests · Go / test (push) Successful in 1m58s
Tests · Integration / integration (push) Successful in 1m44s
Tests · UI / test (push) Successful in 3m20s

This commit was merged in pull request #88.
This commit is contained in:
2026-06-01 04:56:45 +00:00
32 changed files with 721 additions and 753 deletions
+40 -5
View File
@@ -148,12 +148,37 @@ jobs:
-v "${{ gitea.workspace }}/pkg/geoip/test-data/test-data:/src:ro" \ -v "${{ gitea.workspace }}/pkg/geoip/test-data/test-data:/src:ro" \
alpine sh -c 'cp /src/GeoIP2-Country-Test.mmdb /dst/geoip.mmdb' alpine sh -c 'cp /src/GeoIP2-Country-Test.mmdb /dst/geoip.mmdb'
- name: Seed mailpit relay config
  env:
    GALAXY_DEV_MAIL_RELAY_USERNAME: ${{ secrets.GALAXY_DEV_MAIL_RELAY_USERNAME }}
    GALAXY_DEV_MAIL_RELAY_PASSWORD: ${{ secrets.GALAXY_DEV_MAIL_RELAY_PASSWORD }}
  run: |
    # Render the Mailpit relay upstream config from the template,
    # substituting the relay credentials from Gitea secrets, then
    # seed it into a named volume (same rationale as the geoip seed:
    # a workspace bind-mount would vanish with the runner workspace).
    # The secrets never land in git or a committed file; they are
    # rendered to a tmpfile outside the repo.
    #
    # sed safety: Gmail App Passwords are [a-z]{16}, so the `|`
    # delimiter is safe for the password. The username is assumed to
    # contain none of `|`, `&` or `\` (all special in a sed
    # replacement) - keep it a plain mailbox address.
    #
    # When the secrets are unset the creds render empty and the compose
    # default relay-match is non-routable, so the stack only captures.
    rendered="$(mktemp)"
    # Remove the rendered credentials on every exit path, including a
    # failing sed/docker command, so they never outlive this step.
    trap 'rm -f "$rendered"' EXIT
    sed -e "s|\${GALAXY_DEV_MAIL_RELAY_USERNAME}|${GALAXY_DEV_MAIL_RELAY_USERNAME}|g" \
        -e "s|\${GALAXY_DEV_MAIL_RELAY_PASSWORD}|${GALAXY_DEV_MAIL_RELAY_PASSWORD}|g" \
        "${{ gitea.workspace }}/tools/dev-deploy/mailpit/relay.conf.tmpl" > "$rendered"
    docker volume create galaxy-dev-mailpit-config >/dev/null
    docker run --rm \
      -v galaxy-dev-mailpit-config:/dst \
      -v "$rendered:/src/relay.conf:ro" \
      alpine sh -c 'cp /src/relay.conf /dst/relay.conf && chmod 600 /dst/relay.conf'
- name: Recycle engine containers on image drift - name: Recycle engine containers on image drift
run: | run: |
# Compare the freshly-built `galaxy-engine:dev` SHA against # Compare the freshly-built `galaxy-engine:dev` SHA against
# every running `galaxy-game-*` container. The backend # every running `galaxy-game-*` container. The backend
# reconciler adopts pre-existing labelled engine containers # reconciler adopts pre-existing labelled engine containers
# without checking image drift, so a running sandbox would # without checking image drift, so a running game would
# otherwise keep serving the previous engine code until the # otherwise keep serving the previous engine code until the
# container is recycled by hand. This step makes the recycle # container is recycled by hand. This step makes the recycle
# automatic but only when it is actually needed: # automatic but only when it is actually needed:
@@ -168,10 +193,7 @@ jobs:
# silent state corruption otherwise), and cascade-delete # silent state corruption otherwise), and cascade-delete
# the lobby `games` row (the FKs in `00001_init.sql` # the lobby `games` row (the FKs in `00001_init.sql`
# drop the matching `runtime_records`, `memberships`, # drop the matching `runtime_records`, `memberships`,
# `player_mappings`, etc. in the same write). The # `player_mappings`, etc. in the same write).
# `dev-sandbox` bootstrap on the next backend boot finds
# no live sandbox and provisions a fresh one on the new
# engine image.
# #
# Backend is stopped first to keep the reconciler from # Backend is stopped first to keep the reconciler from
# racing the recycle (mid-stream adoption / restart). The # racing the recycle (mid-stream adoption / restart). The
@@ -234,11 +256,24 @@ jobs:
- name: Bring up the stack - name: Bring up the stack
working-directory: tools/dev-deploy working-directory: tools/dev-deploy
env:
# Recipient regex Mailpit auto-relays to the owner's Gmail.
# Unset/empty → the compose default (non-routable) keeps the
# stack capture-only.
GALAXY_DEV_MAIL_RELAY_MATCH: ${{ vars.GALAXY_DEV_MAIL_RELAY_MATCH }}
# Grafana admin password; unset/empty -> compose default 'admin'.
GALAXY_DEV_GRAFANA_ADMIN_PASSWORD: ${{ secrets.GALAXY_DEV_GRAFANA_ADMIN_PASSWORD }}
run: | run: |
# Resolve in the shell, not in YAML expressions — `env.HOME` # Resolve in the shell, not in YAML expressions — `env.HOME`
# is empty at the workflow-evaluation stage. # is empty at the workflow-evaluation stage.
export GALAXY_DEV_GAME_STATE_DIR="$HOME/.galaxy-dev/game-state" export GALAXY_DEV_GAME_STATE_DIR="$HOME/.galaxy-dev/game-state"
mkdir -p "$GALAXY_DEV_GAME_STATE_DIR" mkdir -p "$GALAXY_DEV_GAME_STATE_DIR"
# Seed the monitoring config to a stable, reboot-surviving host
# path (compose binds \${GALAXY_DEV_MONITORING_DIR} read-only).
export GALAXY_DEV_MONITORING_DIR="$HOME/.galaxy-dev/monitoring"
rm -rf "$GALAXY_DEV_MONITORING_DIR"
mkdir -p "$GALAXY_DEV_MONITORING_DIR"
cp -r monitoring/. "$GALAXY_DEV_MONITORING_DIR/"
docker compose up -d --wait --remove-orphans docker compose up -d --wait --remove-orphans
- name: Probe the stack - name: Probe the stack
+6 -18
View File
@@ -26,7 +26,6 @@ import (
"galaxy/backend/internal/app" "galaxy/backend/internal/app"
"galaxy/backend/internal/auth" "galaxy/backend/internal/auth"
"galaxy/backend/internal/config" "galaxy/backend/internal/config"
"galaxy/backend/internal/devsandbox"
"galaxy/backend/internal/diplomail" "galaxy/backend/internal/diplomail"
"galaxy/backend/internal/diplomail/detector" "galaxy/backend/internal/diplomail/detector"
"galaxy/backend/internal/diplomail/translator" "galaxy/backend/internal/diplomail/translator"
@@ -274,29 +273,18 @@ func run(ctx context.Context) (err error) {
) )
runtimeGateway.svc = runtimeSvc runtimeGateway.svc = runtimeSvc
// Run a single reconciliation pass before the dev-sandbox // Run a single reconciliation pass at startup so any runtime row
// bootstrap so any runtime row pointing at a vanished engine // pointing at a vanished engine container (a host reboot wiped
// container (host reboot wiped /tmp/galaxy-game-state/<uuid>; // /tmp/galaxy-game-state/<uuid>; `tools/local-dev`'s
// `tools/local-dev`'s `prune-broken-engines` target reaped the // `prune-broken-engines` target reaped the husk) is cascaded
// husk) is already cascaded through `markRemoved` → lobby // through `markRemoved` → lobby `cancelled` before the server
// `cancelled` by the time the bootstrap walks the sandbox list. // starts serving requests. Failures are
// Without this pre-tick the bootstrap would reuse the
// soon-to-be-cancelled game and force the developer into a
// second `make up` cycle to land a healthy sandbox. Failures are
// non-fatal: the periodic ticker started later catches up, and // non-fatal: the periodic ticker started later catches up, and
// the worst case degrades to the legacy two-cycle recovery. // the worst case degrades to the legacy two-cycle recovery.
if err := runtimeSvc.Reconciler().Tick(ctx); err != nil { if err := runtimeSvc.Reconciler().Tick(ctx); err != nil {
logger.Warn("pre-bootstrap reconciler tick failed", zap.Error(err)) logger.Warn("pre-bootstrap reconciler tick failed", zap.Error(err))
} }
if err := devsandbox.Bootstrap(ctx, devsandbox.Deps{
Users: userSvc,
Lobby: lobbySvc,
EngineVersions: engineVersionSvc,
}, cfg.DevSandbox, logger); err != nil {
return fmt.Errorf("dev sandbox bootstrap: %w", err)
}
notifStore := notification.NewStore(db) notifStore := notification.NewStore(db)
notifSvc := notification.NewService(notification.Deps{ notifSvc := notification.NewService(notification.Deps{
Store: notifStore, Store: notifStore,
@@ -17,6 +17,8 @@
<a href="/_gm/games"{{if eq .ActiveNav "games"}} class="active"{{end}}>Games</a> <a href="/_gm/games"{{if eq .ActiveNav "games"}} class="active"{{end}}>Games</a>
<a href="/_gm/operators"{{if eq .ActiveNav "operators"}} class="active"{{end}}>Operators</a> <a href="/_gm/operators"{{if eq .ActiveNav "operators"}} class="active"{{end}}>Operators</a>
<a href="/_gm/mail"{{if eq .ActiveNav "mail"}} class="active"{{end}}>Mail</a> <a href="/_gm/mail"{{if eq .ActiveNav "mail"}} class="active"{{end}}>Mail</a>
<a href="/_gm/grafana/" target="_blank" rel="noopener">Grafana</a>
<a href="/_gm/mailpit/" target="_blank" rel="noopener">Mailpit</a>
</nav> </nav>
<span class="who">{{.Username}}</span> <span class="who">{{.Username}}</span>
</header> </header>
-51
View File
@@ -105,11 +105,6 @@ const (
envDiplomailTranslatorTimeout = "BACKEND_DIPLOMAIL_TRANSLATOR_TIMEOUT" envDiplomailTranslatorTimeout = "BACKEND_DIPLOMAIL_TRANSLATOR_TIMEOUT"
envDiplomailTranslatorMaxAttempts = "BACKEND_DIPLOMAIL_TRANSLATOR_MAX_ATTEMPTS" envDiplomailTranslatorMaxAttempts = "BACKEND_DIPLOMAIL_TRANSLATOR_MAX_ATTEMPTS"
envDiplomailWorkerInterval = "BACKEND_DIPLOMAIL_WORKER_INTERVAL" envDiplomailWorkerInterval = "BACKEND_DIPLOMAIL_WORKER_INTERVAL"
envDevSandboxEmail = "BACKEND_DEV_SANDBOX_EMAIL"
envDevSandboxEngineImage = "BACKEND_DEV_SANDBOX_ENGINE_IMAGE"
envDevSandboxEngineVersion = "BACKEND_DEV_SANDBOX_ENGINE_VERSION"
envDevSandboxPlayerCount = "BACKEND_DEV_SANDBOX_PLAYER_COUNT"
) )
// Default values applied when an environment variable is absent. // Default values applied when an environment variable is absent.
@@ -178,9 +173,6 @@ const (
defaultDiplomailTranslatorTimeout = 10 * time.Second defaultDiplomailTranslatorTimeout = 10 * time.Second
defaultDiplomailTranslatorMaxAttempts = 5 defaultDiplomailTranslatorMaxAttempts = 5
defaultDiplomailWorkerInterval = 2 * time.Second defaultDiplomailWorkerInterval = 2 * time.Second
defaultDevSandboxEngineVersion = "0.1.0"
defaultDevSandboxPlayerCount = 20
) )
// Allowed values for the closed-set string options. // Allowed values for the closed-set string options.
@@ -219,29 +211,12 @@ type Config struct {
Runtime RuntimeConfig Runtime RuntimeConfig
Notification NotificationConfig Notification NotificationConfig
Diplomail DiplomailConfig Diplomail DiplomailConfig
DevSandbox DevSandboxConfig
// FreshnessWindow mirrors the gateway freshness window and is used by the // FreshnessWindow mirrors the gateway freshness window and is used by the
// push server to bound the cursor TTL. // push server to bound the cursor TTL.
FreshnessWindow time.Duration FreshnessWindow time.Duration
} }
// DevSandboxConfig configures the boot-time bootstrap implemented in
// `backend/internal/devsandbox`. When Email is empty the bootstrap
// is a no-op, which is the production posture. When Email is set —
// from `BACKEND_DEV_SANDBOX_EMAIL` in the `tools/local-dev` stack —
// the bootstrap idempotently provisions a real user, the configured
// number of dummy participants, a private "Dev Sandbox" game, the
// matching memberships, and drives the lifecycle to `running`. The
// engine image and engine version refer to a row that the bootstrap
// also seeds in `engine_versions`.
type DevSandboxConfig struct {
Email string
EngineImage string
EngineVersion string
PlayerCount int
}
// LoggingConfig stores the parameters used by the structured logger. // LoggingConfig stores the parameters used by the structured logger.
type LoggingConfig struct { type LoggingConfig struct {
// Level is the zap level name (e.g. "debug", "info", "warn", "error"). // Level is the zap level name (e.g. "debug", "info", "warn", "error").
@@ -572,10 +547,6 @@ func DefaultConfig() Config {
TranslatorMaxAttempts: defaultDiplomailTranslatorMaxAttempts, TranslatorMaxAttempts: defaultDiplomailTranslatorMaxAttempts,
WorkerInterval: defaultDiplomailWorkerInterval, WorkerInterval: defaultDiplomailWorkerInterval,
}, },
DevSandbox: DevSandboxConfig{
EngineVersion: defaultDevSandboxEngineVersion,
PlayerCount: defaultDevSandboxPlayerCount,
},
Runtime: RuntimeConfig{ Runtime: RuntimeConfig{
WorkerPoolSize: defaultRuntimeWorkerPoolSize, WorkerPoolSize: defaultRuntimeWorkerPoolSize,
JobQueueSize: defaultRuntimeJobQueueSize, JobQueueSize: defaultRuntimeJobQueueSize,
@@ -755,13 +726,6 @@ func LoadFromEnv() (Config, error) {
return Config{}, err return Config{}, err
} }
cfg.DevSandbox.Email = strings.TrimSpace(loadString(envDevSandboxEmail, cfg.DevSandbox.Email))
cfg.DevSandbox.EngineImage = strings.TrimSpace(loadString(envDevSandboxEngineImage, cfg.DevSandbox.EngineImage))
cfg.DevSandbox.EngineVersion = strings.TrimSpace(loadString(envDevSandboxEngineVersion, cfg.DevSandbox.EngineVersion))
if cfg.DevSandbox.PlayerCount, err = loadInt(envDevSandboxPlayerCount, cfg.DevSandbox.PlayerCount); err != nil {
return Config{}, err
}
if err := cfg.Validate(); err != nil { if err := cfg.Validate(); err != nil {
return Config{}, err return Config{}, err
} }
@@ -973,21 +937,6 @@ func (c Config) Validate() error {
} }
} }
if email := strings.TrimSpace(c.DevSandbox.Email); email != "" {
if _, err := netmail.ParseAddress(email); err != nil {
return fmt.Errorf("%s must be a valid RFC 5322 address: %w", envDevSandboxEmail, err)
}
if strings.TrimSpace(c.DevSandbox.EngineImage) == "" {
return fmt.Errorf("%s must not be empty when %s is set", envDevSandboxEngineImage, envDevSandboxEmail)
}
if strings.TrimSpace(c.DevSandbox.EngineVersion) == "" {
return fmt.Errorf("%s must not be empty when %s is set", envDevSandboxEngineVersion, envDevSandboxEmail)
}
if c.DevSandbox.PlayerCount <= 0 {
return fmt.Errorf("%s must be positive when %s is set", envDevSandboxPlayerCount, envDevSandboxEmail)
}
}
return nil return nil
} }
-287
View File
@@ -1,287 +0,0 @@
// Package devsandbox provisions a ready-to-play game on backend boot
// for the `tools/local-dev` stack.
//
// Bootstrap is invoked from `backend/cmd/backend/main.go` after the
// admin bootstrap and before the HTTP listener starts. It reads
// `cfg.DevSandbox`; when `Email` is empty (the production posture)
// the function logs "skipped" and returns nil. When set, it
// idempotently:
//
// 1. registers the configured engine version and image;
// 2. find-or-creates the real dev user with the configured email;
// 3. find-or-creates `cfg.PlayerCount - 1` deterministic dummy
// users so the engine's minimum-players constraint is met;
// 4. find-or-creates a private "Dev Sandbox" game owned by the
// real user with min/max_players = cfg.PlayerCount and a
// year-out turn schedule (effectively frozen at turn 1);
// 5. inserts memberships for all participants bypassing the
// application/approval flow;
// 6. drives the lifecycle to `running` (or as far as possible if
// the runtime is busy).
//
// The function is a no-op on subsequent boots once the game is
// running; partial states from earlier crashes are recovered.
package devsandbox
import (
"context"
"errors"
"fmt"
"time"
"galaxy/backend/internal/config"
"galaxy/backend/internal/lobby"
"galaxy/backend/internal/runtime"
"github.com/google/uuid"
"go.uber.org/zap"
)
// SandboxGameName is the display name used to identify the
// auto-provisioned game on subsequent reboots. The combination of
// game_name and owner_user_id is unique enough in practice — only
// the dev sandbox bootstrap creates a game owned by the configured
// real user with this exact name.
const SandboxGameName = "Dev Sandbox"
// SandboxTurnSchedule keeps the game on turn 1 by scheduling the
// next turn a year out. The runtime scheduler still parses this and
// will tick once a year — long enough to never interfere with
// solo UI development.
const SandboxTurnSchedule = "0 0 1 1 *"
// UserEnsurer matches `auth.UserEnsurer`. We define a local
// interface to avoid importing the auth package and circular
// dependencies — the production wiring passes the same `*user.Service`
// instance used by auth.
type UserEnsurer interface {
EnsureByEmail(ctx context.Context, email, preferredLanguage, timeZone, declaredCountry string) (uuid.UUID, error)
}
// Deps aggregates the collaborators Bootstrap needs. All three fields
// are required whenever the sandbox email is configured; Bootstrap
// returns an error if any of them is nil.
type Deps struct {
// Users find-or-creates the real dev user and the dummy users by email.
Users UserEnsurer
// Lobby lists, creates and deletes sandbox games, inserts memberships,
// and drives the game lifecycle.
Lobby *lobby.Service
// EngineVersions registers the configured engine version and image.
EngineVersions *runtime.EngineVersionService
}
// Bootstrap runs the provisioning flow described in the package doc
// comment: register the engine version, ensure the real and dummy
// users, purge terminal sandbox games, find or create the sandbox
// game, ensure memberships, and drive the lifecycle forward.
//
// When cfg.Email is empty the function logs "skipped" and returns nil
// without touching deps. Otherwise cfg.PlayerCount must be positive
// and every field of deps must be non-nil. A nil logger is replaced by
// a no-op logger.
//
// Errors are returned to the caller; the boot path in
// `cmd/backend/main.go` aborts startup if Bootstrap fails so a
// misconfigured dev environment surfaces immediately rather than
// silently leaving the lobby empty.
func Bootstrap(ctx context.Context, deps Deps, cfg config.DevSandboxConfig, logger *zap.Logger) error {
	if logger == nil {
		logger = zap.NewNop()
	}
	logger = logger.Named("dev_sandbox")

	if cfg.Email == "" {
		logger.Info("skipped (no email)")
		return nil
	}
	// Validate the configuration before the wiring so an invalid
	// PlayerCount is reported as such even when deps are incomplete.
	if cfg.PlayerCount <= 0 {
		return fmt.Errorf("dev_sandbox: PlayerCount must be positive, got %d", cfg.PlayerCount)
	}
	if deps.Users == nil || deps.Lobby == nil || deps.EngineVersions == nil {
		return errors.New("dev_sandbox: deps.Users, deps.Lobby and deps.EngineVersions are required")
	}

	if err := ensureEngineVersion(ctx, deps.EngineVersions, cfg, logger); err != nil {
		return err
	}

	realID, err := deps.Users.EnsureByEmail(ctx, cfg.Email, "en", "UTC", "")
	if err != nil {
		return fmt.Errorf("dev_sandbox: ensure real user: %w", err)
	}

	// The real user occupies one seat; the remaining PlayerCount-1
	// seats are filled with deterministic dummy accounts.
	dummyIDs := make([]uuid.UUID, 0, cfg.PlayerCount-1)
	for i := 1; i < cfg.PlayerCount; i++ {
		email := fmt.Sprintf("dev-dummy-%02d@local.test", i)
		id, err := deps.Users.EnsureByEmail(ctx, email, "en", "UTC", "")
		if err != nil {
			return fmt.Errorf("dev_sandbox: ensure dummy %d: %w", i, err)
		}
		dummyIDs = append(dummyIDs, id)
	}

	// Purge first so findOrCreateSandboxGame never reuses a dead game.
	if err := purgeTerminalSandboxGames(ctx, deps.Lobby, realID, logger); err != nil {
		return err
	}

	game, err := findOrCreateSandboxGame(ctx, deps.Lobby, realID, cfg)
	if err != nil {
		return err
	}

	game, err = ensureMembershipsAndDrive(ctx, deps.Lobby, game, realID, dummyIDs, logger)
	if err != nil {
		return err
	}

	logger.Info("bootstrap complete",
		zap.String("user_id", realID.String()),
		zap.String("game_id", game.GameID.String()),
		zap.String("status", game.Status),
	)
	return nil
}
// ensureEngineVersion registers cfg.EngineVersion with cfg.EngineImage.
// An already-registered version (runtime.ErrEngineVersionTaken) counts
// as success; any other registration error is wrapped and returned.
func ensureEngineVersion(ctx context.Context, svc *runtime.EngineVersionService, cfg config.DevSandboxConfig, logger *zap.Logger) error {
	input := runtime.RegisterInput{
		Version:  cfg.EngineVersion,
		ImageRef: cfg.EngineImage,
	}
	if _, err := svc.Register(ctx, input); err != nil {
		if errors.Is(err, runtime.ErrEngineVersionTaken) {
			logger.Debug("engine version already registered",
				zap.String("version", cfg.EngineVersion),
			)
			return nil
		}
		return fmt.Errorf("dev_sandbox: register engine version: %w", err)
	}

	logger.Info("engine version registered",
		zap.String("version", cfg.EngineVersion),
		zap.String("image", cfg.EngineImage),
	)
	return nil
}
// terminalSandboxStatus reports whether status is one of the lobby
// states (cancelled, finished, start_failed) from which a sandbox game
// can no longer be driven back to running. Such games are treated as
// "absent" so the next bootstrap creates a fresh one rather than
// handing the developer a dead lobby tile.
func terminalSandboxStatus(status string) bool {
	return status == lobby.GameStatusCancelled ||
		status == lobby.GameStatusFinished ||
		status == lobby.GameStatusStartFailed
}
// purgeTerminalSandboxGames removes every "Dev Sandbox" game owned by
// ownerID whose status is terminal (cancelled / finished /
// start_failed). Deletion goes through lobby.Service.DeleteGame, which
// relies on the ON DELETE CASCADE constraints declared in
// `00001_init.sql` to drop the dependent rows, so dead tiles do not
// accumulate between `make rebuild` cycles. Games in any other status
// are left untouched; a `running` sandbox from a previous boot is the
// happy path.
func purgeTerminalSandboxGames(ctx context.Context, svc *lobby.Service, ownerID uuid.UUID, logger *zap.Logger) error {
	owned, err := svc.ListMyGames(ctx, ownerID)
	if err != nil {
		return fmt.Errorf("dev_sandbox: list my games: %w", err)
	}

	for _, game := range owned {
		// Only games with the sandbox name that this user owns qualify.
		isSandbox := game.GameName == SandboxGameName &&
			game.OwnerUserID != nil &&
			*game.OwnerUserID == ownerID
		if !isSandbox || !terminalSandboxStatus(game.Status) {
			continue
		}

		if err := svc.DeleteGame(ctx, game.GameID); err != nil {
			return fmt.Errorf("dev_sandbox: delete terminal sandbox %s: %w", game.GameID, err)
		}
		logger.Info("purged terminal sandbox game",
			zap.String("game_id", game.GameID.String()),
			zap.String("status", game.Status),
		)
	}
	return nil
}
// findOrCreateSandboxGame returns the first "Dev Sandbox" game owned by
// ownerID, or creates a new private one sized to cfg.PlayerCount when
// none exists. Callers are expected to have run
// purgeTerminalSandboxGames first, so any sandbox game still listed is
// either live or in a transient state that can be driven forward.
func findOrCreateSandboxGame(ctx context.Context, svc *lobby.Service, ownerID uuid.UUID, cfg config.DevSandboxConfig) (lobby.GameRecord, error) {
	owned, err := svc.ListMyGames(ctx, ownerID)
	if err != nil {
		return lobby.GameRecord{}, fmt.Errorf("dev_sandbox: list my games: %w", err)
	}

	for _, existing := range owned {
		if existing.GameName == SandboxGameName &&
			existing.OwnerUserID != nil &&
			*existing.OwnerUserID == ownerID {
			return existing, nil
		}
	}

	// No reusable sandbox: create one with min == max players and an
	// enrollment deadline a year out.
	players := int32(cfg.PlayerCount)
	input := lobby.CreateGameInput{
		OwnerUserID:         &ownerID,
		Visibility:          lobby.VisibilityPrivate,
		GameName:            SandboxGameName,
		Description:         "Auto-provisioned by backend/internal/devsandbox for solo UI development.",
		MinPlayers:          players,
		MaxPlayers:          players,
		StartGapHours:       0,
		StartGapPlayers:     0,
		EnrollmentEndsAt:    time.Now().Add(365 * 24 * time.Hour),
		TurnSchedule:        SandboxTurnSchedule,
		TargetEngineVersion: cfg.EngineVersion,
	}
	created, err := svc.CreateGame(ctx, input)
	if err != nil {
		return lobby.GameRecord{}, fmt.Errorf("dev_sandbox: create game: %w", err)
	}
	return created, nil
}
// ensureMembershipsAndDrive advances game through the lobby lifecycle
// as far as its current status allows and returns the latest record.
// The status checks run in order, and each transition replaces game, so
// a single call can carry a draft game through several steps:
//
//   - draft: open enrollment;
//   - enrollment_open: insert a membership for the real user and every
//     dummy user (race names "Sandbox-01", "Sandbox-02", ...), then
//     mark the game ready to start;
//   - ready_to_start: start the game;
//   - start_failed: retry the start and, if that puts the game back in
//     ready_to_start, start it again.
//
// A game in any other status is returned unchanged. Every failure is
// wrapped and returned together with the last known record, except a
// failed RetryStart, which is only logged as a warning.
func ensureMembershipsAndDrive(ctx context.Context, svc *lobby.Service, game lobby.GameRecord, realID uuid.UUID, dummyIDs []uuid.UUID, logger *zap.Logger) (lobby.GameRecord, error) {
// All lifecycle calls are issued on behalf of the real (owner) user.
// NOTE(review): the `false` argument passed alongside &caller below is
// presumably an "is admin" style flag — confirm against lobby.Service.
caller := realID
if game.Status == lobby.GameStatusDraft {
next, err := svc.OpenEnrollment(ctx, &caller, false, game.GameID)
if err != nil {
return game, fmt.Errorf("dev_sandbox: open enrollment: %w", err)
}
game = next
}
if game.Status == lobby.GameStatusEnrollmentOpen {
// The real user is seated first, followed by the dummies in order.
users := append([]uuid.UUID{realID}, dummyIDs...)
for i, uid := range users {
raceName := fmt.Sprintf("Sandbox-%02d", i+1)
if _, err := svc.InsertMembershipDirect(ctx, lobby.InsertMembershipDirectInput{
GameID: game.GameID,
UserID: uid,
RaceName: raceName,
}); err != nil {
return game, fmt.Errorf("dev_sandbox: insert membership %d: %w", i+1, err)
}
}
logger.Info("memberships ensured",
zap.Int("count", len(users)),
zap.String("game_id", game.GameID.String()),
)
next, err := svc.ReadyToStart(ctx, &caller, false, game.GameID)
if err != nil {
return game, fmt.Errorf("dev_sandbox: ready to start: %w", err)
}
game = next
}
if game.Status == lobby.GameStatusReadyToStart {
next, err := svc.Start(ctx, &caller, false, game.GameID)
if err != nil {
return game, fmt.Errorf("dev_sandbox: start: %w", err)
}
game = next
}
if game.Status == lobby.GameStatusStartFailed {
next, err := svc.RetryStart(ctx, &caller, false, game.GameID)
if err != nil {
// Best effort: a failed retry is logged, not returned, so the
// caller still gets the start_failed record with a nil error.
logger.Warn("retry start failed", zap.Error(err))
return game, nil
}
game = next
if game.Status == lobby.GameStatusReadyToStart {
next, err := svc.Start(ctx, &caller, false, game.GameID)
if err != nil {
return game, fmt.Errorf("dev_sandbox: start after retry: %w", err)
}
game = next
}
}
return game, nil
}
@@ -1,106 +0,0 @@
package devsandbox
import (
"context"
"errors"
"testing"
"galaxy/backend/internal/config"
"github.com/google/uuid"
"go.uber.org/zap"
)
// TestBootstrapSkippedWhenEmailEmpty exercises the no-op branch: with
// the production posture (Email == "") Bootstrap must return without
// touching any dependency. The fact that Users/Lobby/EngineVersions
// are nil here doubles as a check that the early-return runs first.
func TestBootstrapSkippedWhenEmailEmpty(t *testing.T) {
err := Bootstrap(
context.Background(),
Deps{},
config.DevSandboxConfig{},
zap.NewNop(),
)
if err != nil {
t.Fatalf("expected nil error on empty email, got: %v", err)
}
}
// TestBootstrapRejectsZeroPlayerCount calls Bootstrap with Email set
// and PlayerCount == 0 and requires a non-nil error, i.e. the flow is
// rejected before any service call is made.
//
// Lobby and EngineVersions are nil here and Bootstrap also rejects nil
// deps, so the assertion alone does not identify which guard produced
// the error.
// NOTE(review): to pin the PlayerCount guard specifically, supply
// non-nil deps or assert on the error text.
func TestBootstrapRejectsZeroPlayerCount(t *testing.T) {
err := Bootstrap(
context.Background(),
Deps{Users: stubEnsurer{}, Lobby: nil, EngineVersions: nil},
config.DevSandboxConfig{
Email: "dev@local.test",
EngineImage: "galaxy-engine:local-dev",
EngineVersion: "0.0.0-local-dev",
PlayerCount: 0,
},
zap.NewNop(),
)
if err == nil {
t.Fatal("expected error on zero PlayerCount, got nil")
}
}
// TestBootstrapRejectsMissingDeps checks that a misconfigured wiring
// (Email set but one of the required services nil) fails fast rather
// than panicking when the bootstrap reaches its first service call.
func TestBootstrapRejectsMissingDeps(t *testing.T) {
err := Bootstrap(
context.Background(),
Deps{Users: stubEnsurer{}, Lobby: nil, EngineVersions: nil},
config.DevSandboxConfig{
Email: "dev@local.test",
EngineImage: "galaxy-engine:local-dev",
EngineVersion: "0.0.0-local-dev",
PlayerCount: 20,
},
zap.NewNop(),
)
if err == nil {
t.Fatal("expected error on missing deps, got nil")
}
if !errors.Is(err, errMissingDepsSentinel) && err.Error() == "" {
// The exact wording is not part of the contract; this branch
// only asserts the error is non-nil and human-readable.
t.Fatalf("error has empty message: %v", err)
}
}
// errMissingDepsSentinel exists so the assertion above can compile;
// the real error is constructed via errors.New inside Bootstrap and
// is intentionally not exported. The test only needs to confirm the
// returned error has a message.
var errMissingDepsSentinel = errors.New("sentinel")
// TestTerminalSandboxStatus pins which lobby statuses count as terminal
// for a sandbox game, and therefore which games are purged on the next
// boot. Terminal games are deleted so the developer's lobby never piles
// up dead tiles between `make rebuild` cycles.
func TestTerminalSandboxStatus(t *testing.T) {
	cases := []struct {
		status   string
		terminal bool
	}{
		{"cancelled", true},
		{"finished", true},
		{"start_failed", true},
		{"draft", false},
		{"enrollment_open", false},
		{"ready_to_start", false},
		{"starting", false},
		{"running", false},
		{"paused", false},
	}

	for _, tc := range cases {
		got := terminalSandboxStatus(tc.status)
		switch {
		case tc.terminal && !got:
			t.Errorf("expected %q to be terminal", tc.status)
		case !tc.terminal && got:
			t.Errorf("expected %q to be non-terminal", tc.status)
		}
	}
}
// stubEnsurer is a do-nothing UserEnsurer used to populate Deps.Users
// in tests that must fail before any user lookup happens.
type stubEnsurer struct{}
// EnsureByEmail ignores its arguments and returns the zero UUID with a
// nil error.
func (stubEnsurer) EnsureByEmail(_ context.Context, _, _, _, _ string) (uuid.UUID, error) {
return uuid.UUID{}, nil
}
+4 -5
View File
@@ -274,11 +274,10 @@ func (s *Service) ListFinishedGamesBefore(ctx context.Context, cutoff time.Time)
// `ON DELETE CASCADE` constraints declared in `00001_init.sql`. // `ON DELETE CASCADE` constraints declared in `00001_init.sql`.
// Idempotent: returns nil when no game matches. // Idempotent: returns nil when no game matches.
// //
// Phase 14 introduces this method for the dev-sandbox bootstrap so a // `DeleteGame` is destructive — a hard delete that bypasses the
// terminal "Dev Sandbox" tile from a previous local-dev session can // cascade-notification machinery — so production callers stay on the
// be scrubbed before a fresh game spawns. Production callers must // regular cancel / finish lifecycle. It is exercised by the lobby
// stay on the regular cancel / finish lifecycle — `DeleteGame` is // integration tests.
// destructive and bypasses the cascade-notification machinery.
func (s *Service) DeleteGame(ctx context.Context, gameID uuid.UUID) error { func (s *Service) DeleteGame(ctx context.Context, gameID uuid.UUID) error {
if err := s.deps.Store.DeleteGame(ctx, gameID); err != nil { if err := s.deps.Store.DeleteGame(ctx, gameID); err != nil {
return err return err
+2 -2
View File
@@ -248,8 +248,8 @@ func TestEndToEndPrivateGameFlow(t *testing.T) {
} }
} }
// TestDeleteGameCascadesEverything pins the contract the dev-sandbox // TestDeleteGameCascadesEverything pins the DeleteGame contract:
// bootstrap relies on: removing a game wipes every referencing row // removing a game wipes every referencing row
// (memberships, applications, invites, runtime_records, // (memberships, applications, invites, runtime_records,
// player_mappings) in a single SQL statement. Before this is wired // player_mappings) in a single SQL statement. Before this is wired
// the developer's lobby piles up cancelled tiles between // the developer's lobby piles up cancelled tiles between
+5 -6
View File
@@ -20,9 +20,9 @@ type InsertMembershipDirectInput struct {
// writes as ApproveApplication: the per-game race-name reservation // writes as ApproveApplication: the per-game race-name reservation
// row plus the membership row, and refreshes the in-memory caches. // row plus the membership row, and refreshes the in-memory caches.
// //
// The method is intended for boot-time provisioning by // The method is intended for trusted boot-time provisioning and
// `backend/internal/devsandbox` and similar trusted callers. It is // integration tests; it is not exposed through any HTTP handler. The
// not exposed through any HTTP handler. The caller must guarantee // caller must guarantee
// game.Status == GameStatusEnrollmentOpen — the function returns // game.Status == GameStatusEnrollmentOpen — the function returns
// ErrConflict otherwise — and that the race-name policy and // ErrConflict otherwise — and that the race-name policy and
// canonical-key invariants are honoured (the implementation reuses // canonical-key invariants are honoured (the implementation reuses
@@ -30,9 +30,8 @@ type InsertMembershipDirectInput struct {
// or unsuitable name still fails). // or unsuitable name still fails).
// //
// Idempotency: if a membership for (GameID, UserID) already exists // Idempotency: if a membership for (GameID, UserID) already exists
// the function returns the existing row without modifying state. // the function returns the existing row without modifying state, so
// This makes the helper safe to call on every backend boot from // the helper is safe to call repeatedly.
// devsandbox.Bootstrap.
func (s *Service) InsertMembershipDirect(ctx context.Context, in InsertMembershipDirectInput) (Membership, error) { func (s *Service) InsertMembershipDirect(ctx context.Context, in InsertMembershipDirectInput) (Membership, error) {
displayName, err := ValidateDisplayName(in.RaceName) displayName, err := ValidateDisplayName(in.RaceName)
if err != nil { if err != nil {
+2 -3
View File
@@ -236,9 +236,8 @@ func (s *Store) ListMyGames(ctx context.Context, userID uuid.UUID) ([]GameRecord
// referencing table (memberships / applications / invites / // referencing table (memberships / applications / invites /
// runtime_records / player_mappings — all declared with ON DELETE // runtime_records / player_mappings — all declared with ON DELETE
// CASCADE in `00001_init.sql`). Idempotent: returns nil when no row // CASCADE in `00001_init.sql`). Idempotent: returns nil when no row
// matches. Used by the dev-sandbox bootstrap to scrub terminal // matches. A hard delete for trusted callers and integration tests;
// games on every backend boot so the developer's lobby never piles // production lifecycle uses cancel / finish.
// up cancelled tiles.
func (s *Store) DeleteGame(ctx context.Context, gameID uuid.UUID) error { func (s *Store) DeleteGame(ctx context.Context, gameID uuid.UUID) error {
g := table.Games g := table.Games
stmt := g.DELETE().WHERE(g.GameID.EQ(postgres.UUID(gameID))) stmt := g.DELETE().WHERE(g.GameID.EQ(postgres.UUID(gameID)))
+13
View File
@@ -888,6 +888,19 @@ addition.
- Health probes are unauthenticated `GET /healthz` (process liveness) and - Health probes are unauthenticated `GET /healthz` (process liveness) and
`GET /readyz` (Postgres reachable, migrations applied, gRPC listener `GET /readyz` (Postgres reachable, migrations applied, gRPC listener
bound). Probes are excluded from anti-replay and rate limiting. bound). Probes are excluded from anti-replay and rate limiting.
- **Collection (dev, production mirror).** The long-lived dev environment
(`tools/dev-deploy/`) runs a full metrics + logs + traces stack on its
internal network with no host ports: Prometheus scrapes the backend
(`:9100`) and gateway (`:9191`) endpoints plus `node-exporter` and
cAdvisor; Tempo ingests OTLP traces from backend and gateway; Loki
stores container logs shipped by promtail (Docker service-discovery on
the `galaxy.stack=dev-deploy` label). Grafana (provisioned datasources
+ dashboards) and the Mailpit capture UI are reached only through the
operator console's single `/_gm` Basic Auth gate (§14.1) — at
`/_gm/grafana/` and `/_gm/mailpit/` — so one password covers the
console and both UIs. Retention is tuned small (Prometheus 15d, Loki
7d, Tempo 3d). The same compose fragment is meant to back production.
See `tools/dev-deploy/monitoring/README.md`.
## 18. CI and Environments ## 18. CI and Environments
+4 -1
View File
@@ -1182,7 +1182,10 @@ The console landing page is a dashboard that summarises operational
health: whether the backend is ready and the database reachable, how many health: whether the backend is ready and the database reachable, how many
game runtimes sit in each state, and the depth of the mail and game runtimes sit in each state, and the depth of the mail and
notification queues. It is a read-only point-in-time view for quick notification queues. It is a read-only point-in-time view for quick
triage, not a metrics history. triage, not a metrics history. The console nav also links to Grafana
(metrics, logs and traces) and the Mailpit capture UI, which the
deployment serves under the same `/_gm` Basic Auth gate — one sign-in
covers the console and both UIs.
### 10.3 Admin account management ### 10.3 Admin account management
+3 -1
View File
@@ -1218,7 +1218,9 @@ admin-API, либо через серверно-рендеримую веб-ко
здоровье: готов ли backend и доступна ли БД, сколько игровых рантаймов здоровье: готов ли backend и доступна ли БД, сколько игровых рантаймов
в каждом состоянии, какова глубина очередей почты и уведомлений. Это в каждом состоянии, какова глубина очередей почты и уведомлений. Это
read-only-срез на текущий момент для быстрой диагностики, не история read-only-срез на текущий момент для быстрой диагностики, не история
метрик. метрик. Навигация консоли также ведёт в Grafana (метрики, логи и
трейсы) и в UI захвата почты Mailpit, которые деплой отдаёт под тем же
шлюзом Basic Auth `/_gm` — один вход покрывает консоль и оба UI.
### 10.3 Управление admin-аккаунтами ### 10.3 Управление admin-аккаунтами
-6
View File
@@ -7,12 +7,6 @@
# baked into `docker-compose.yml`, so this file documents the knobs # baked into `docker-compose.yml`, so this file documents the knobs
# rather than driving them. # rather than driving them.
# Auto-provisioned sandbox bootstrap. Empty disables the bootstrap.
BACKEND_DEV_SANDBOX_EMAIL=dev@galaxy.lan
BACKEND_DEV_SANDBOX_ENGINE_IMAGE=galaxy-engine:dev
BACKEND_DEV_SANDBOX_ENGINE_VERSION=0.1.0
BACKEND_DEV_SANDBOX_PLAYER_COUNT=20
# `123456` short-circuits the email-code path for the dev account. # `123456` short-circuits the email-code path for the dev account.
# This is also the docker-compose default — set the variable to an # This is also the docker-compose default — set the variable to an
# empty string here when the environment must rely on real Mailpit # empty string here when the environment must rely on real Mailpit
+25 -3
View File
@@ -29,13 +29,35 @@
reverse_proxy galaxy-api:8080 reverse_proxy galaxy-api:8080
} }
# Operator console. Shares the gateway public listener with `/api`; the # Operator console + observability behind one Basic Auth gate. The gate
# gateway applies the admin anti-abuse class and reverse-proxies to the # credential equals the admin-console account (dev: gm / gm-dev-password),
# backend `/_gm` surface, which enforces Basic Auth and renders the pages. # so Caddy forwards the same Authorization header to the backend `/_gm`
# surface (its own Basic Auth) and to Grafana/Mailpit — one prompt covers
# all three. The gateway applies the admin anti-abuse class to the console.
@gm path /_gm /_gm/* @gm path /_gm /_gm/*
handle @gm { handle @gm {
basic_auth {
gm "$2a$14$xVh1TLaZxh8fazlKrI9Mx.NQMQlMarYWtr3FRELmZIXuac/DeeTRO"
}
# Grafana under /_gm/grafana/ (sub-path mode; anonymous Admin, so the
# /_gm gate is the only barrier — GF_AUTH_BASIC_ENABLED=false makes it
# ignore the forwarded Authorization header).
handle /_gm/grafana/* {
reverse_proxy galaxy-grafana:3000
}
# Mailpit captured-mail UI under /_gm/mailpit/ (MP_WEBROOT). Shows
# every message the backend sent, relayed or not.
handle /_gm/mailpit/* {
reverse_proxy galaxy-mailpit:8025
}
# The operator console itself (gateway -> backend /_gm surface).
handle {
reverse_proxy galaxy-api:8080 reverse_proxy galaxy-api:8080
} }
}
# Bare `/game` (no trailing slash) -> `/game/` so the SPA root # Bare `/game` (no trailing slash) -> `/game/` so the SPA root
# resolves before the site catch-all can claim it. # resolves before the site catch-all can claim it.
+3 -159
View File
@@ -1,164 +1,8 @@
# `tools/dev-deploy/` — known issues # `tools/dev-deploy/` — known issues
Issues that surface in the long-lived dev environment but are not yet Issues that surfaced in the long-lived dev environment. Each entry lists
fixed. Each entry lists the observed symptom, the diagnostic evidence, the observed symptom, the diagnostic evidence, and the fix or the open
the working hypothesis, and the open questions that have to be questions that have to be answered before a fix lands.
answered before a fix lands.
## Dev Sandbox game flips to `cancelled` after a `dev-deploy` redispatch
### Symptom
A previously `running` "Dev Sandbox" game (created by
`backend/internal/devsandbox`) transitions to `cancelled` ~15 minutes
after a `dev-deploy.yaml` workflow_dispatch run finishes. The user's
browser session survives (the same `device_session_id` keeps working),
but the lobby shows no game because the only game it had is now
terminal. `purgeTerminalSandboxGames` does pick it up on the **next**
boot and creates a fresh sandbox — but the first redispatch leaves
the user with an empty lobby until backend restarts again.
### Diagnostic evidence
Backend logs from the broken cycle (timestamps abbreviated):
```text
20:24:40 dev_sandbox: purged terminal sandbox game game_id=<prev> status=cancelled
20:24:40 dev_sandbox: memberships ensured count=20 game_id=<new>
20:24:40 dev_sandbox: bootstrap complete user_id=<owner> game_id=<new> status=starting
...
20:25:09 user mail sent failed (diplomail tables missing — unrelated)
...
20:39:40 lobby: game cancelled by runtime reconciler game_id=<new>
op=reconcile status=removed message="container disappeared"
```
Between 20:24:40 (`status=starting`) and 20:39:40 (reconciler cancel)
the backend logs are silent on the runtime / engine paths — no
`engine spawned`, no `engine container started`, no `runtime
transition` lines. The reconciler then fires and reports the engine
container as missing.
`docker ps -a --filter 'label=org.opencontainers.image.title=galaxy-game-engine'`
returns no rows during this window — the engine container is neither
running nor stopped on the host, so it either was never spawned or
was removed before the host snapshot.
### What has been ruled out
A live `docker inspect` on a healthy engine container shows:
```text
Labels: galaxy.backend=1, galaxy.engine_version=0.1.0,
galaxy.game_id=<uuid>,
org.opencontainers.image.title=galaxy-game-engine,
com.galaxy.{cpu_quota,memory,pids_limit}
AutoRemove: false
RestartPolicy: on-failure
NetworkMode: galaxy-dev-internal
```
There are no `com.docker.compose.*` labels and `AutoRemove=false`,
so `--remove-orphans` cannot reap the engine and a `--rm`-style
self-destruct is not in play. Two redispatches captured under
`docker events --filter event=create,start,die,destroy,kill,stop`
also confirmed it: across both runs the only `die` / `destroy`
events were for `galaxy-dev-{backend,api,caddy}`. The live engine
container survived both redispatches, and the reconciler that
fires 60 seconds after the new backend boots correctly matched
it through `byGameID` / `byContainerID`.
`backend/internal/runtime/service.go` only removes engine
containers from the explicit `runStop` / `runRestart` / `runPatch`
paths. There is no `runtime.Service.Shutdown` that proactively
kills containers on backend exit, so a graceful SIGTERM to
`galaxy-dev-backend` will not touch its child engine containers.
### Host-side hypotheses considered and rejected by the owner
The natural follow-up suspects after compose was cleared — host-side
`docker prune` cron jobs, a manual `docker rm`, an out-of-band
`dockerd` restart, and an idle-state engine crash — were all
rejected by the project owner: the dev host runs none of those
periodic cleanups, no one manually removed the container, dockerd
was not restarted in the window, and the engine binary does not
crash while idling on API calls.
### Best remaining suspicion
Something the `dev-deploy.yaml` CI run does between successful
image builds and the final `docker compose up -d --wait
--remove-orphans` clobbers the previously-spawned engine container.
The chain at runtime contains:
1. `docker build -t galaxy-engine:dev -f game/Dockerfile .`
2. `docker compose build galaxy-backend galaxy-api`
3. `docker run --rm` alpine for the UI volume seed
4. `docker compose up -d --wait --remove-orphans`
None of these *should* touch an unmanaged engine container, but
the reproduction window points squarely inside this sequence. A
deliberate next reproduction with `docker events --since 0` armed
*before* the deploy starts and live for the entire job — captured
end-to-end on the dev host, not just the chunk after backend
recreate — would pin which step emits the `destroy` on the engine.
### Update 2026-05-19: integration preclean identified as one cause
A live reproduction during the post-merge auto-deploy cycle (Gitea
run #188 dev-deploy plus parallel run #190 integration) pinned one
clobbering source: `integration/scripts/preclean.sh` was unscoped
and removed *every* container labelled `galaxy.backend=1`, including
the dev-deploy engine. Timeline from the dev host:
```text
23:10:40 backend pre-bootstrap reconciler tick: engine alive
23:10:40 dev_sandbox bootstrap: status=running
23:10:56 preclean: removing 1 backend-managed engine containers ← integration run #190
23:11:40 reconciler: container disappeared → game cancelled
```
Fix landed: `BACKEND_STACK_LABEL=integration` is now passed to
every integration backend (see
`integration/testenv/backend.go`) and `preclean.sh` AND-combines
`galaxy.backend=1` with `galaxy.stack=integration`, so dev-deploy /
local-dev engines stamped with different stack values are no longer
collateral.
This covers **push**-triggered cycles where `dev-deploy.yaml` and
`integration.yaml` run on the same Gitea host. The original
hypothesis (a `workflow_dispatch dev-deploy` solo run also losing
the engine) is *not* explained by the integration fix — manual
dispatches do not trigger `integration.yaml`. Keep this entry open
until a solo-dispatch reproduction confirms whether the symptom
still occurs.
### Status
Partially fixed (push-triggered cycles). Solo `workflow_dispatch`
reproductions still open. If the symptom recurs after the
integration fix lands, capture `docker events --since 0` for the
full dispatch window and attach here.
### Workaround in use today
When the sandbox game flips to `cancelled`, redispatch `dev-deploy`:
```sh
curl -X POST -n -H 'Content-Type: application/json' \
-d '{"ref":"<branch>"}' \
https://gitea.iliadenisov.ru/api/v1/repos/developer/galaxy-game/actions/workflows/dev-deploy.yaml/dispatches
```
The next boot's `purgeTerminalSandboxGames` removes the cancelled
row, `findOrCreateSandboxGame` creates a fresh one, and
`ensureMembershipsAndDrive` puts the new game back to `running`.
### Owner
Unassigned. File an issue once we have the runtime / reconciler
analysis above; reference this section in the issue body so future
redeploys can short-circuit the diagnostic loop.
## `docker restart galaxy-dev-backend` fails after the CI runner cleans up ## `docker restart galaxy-dev-backend` fails after the CI runner cleans up
+79 -19
View File
@@ -114,17 +114,72 @@ calls `make clean-data`.
The same dev-mode email-code override as `tools/local-dev/` applies, The same dev-mode email-code override as `tools/local-dev/` applies,
and the dev-deploy compose ships with it enabled by default: and the dev-deploy compose ships with it enabled by default:
1. Enter `dev@galaxy.lan` (or whatever `BACKEND_DEV_SANDBOX_EMAIL` 1. Enter your email address in the login form.
resolves to) in the login form.
2. Submit `123456` as the code — the docker-compose default for 2. Submit `123456` as the code — the docker-compose default for
`BACKEND_AUTH_DEV_FIXED_CODE` is `123456`, so the bcrypt-hashed `BACKEND_AUTH_DEV_FIXED_CODE` is `123456`, so the bcrypt-hashed
email code stays a fallback. To force real Mailpit codes (e.g. for email code stays a fallback. To force the real email code (which
mail-flow QA), set `BACKEND_AUTH_DEV_FIXED_CODE=` (empty) in a Mailpit then relays to your Gmail — see **Mail** below), set
local `.env` and `make rebuild`. `BACKEND_AUTH_DEV_FIXED_CODE=` (empty) and redeploy.
The fixed-code override is rejected by production env loaders, so it The fixed-code override is rejected by production env loaders, so it
cannot leak into the prod environment. cannot leak into the prod environment.
## Mail
The backend always submits mail to **Mailpit** (`galaxy-mailpit:1025`),
exactly as it would to a production SMTP server. Mailpit captures every
message in its UI (internal `:8025`) and, when configured, **relays**
the ones whose recipient matches `GALAXY_DEV_MAIL_RELAY_MATCH` up to a
real Gmail account — so an OTP addressed to you lands in your real inbox
while everything else stays captured-only.
Configure the relay through Gitea Actions secrets/vars (never
committed); the `dev-deploy.yaml` workflow renders Mailpit's
`relay.conf` (from `tools/dev-deploy/mailpit/relay.conf.tmpl`) and seeds
it into the `galaxy-dev-mailpit-config` volume:
| Name | Kind | Purpose |
| --- | --- | --- |
| `GALAXY_DEV_MAIL_RELAY_USERNAME` | secret | Gmail address used as the relay login + From. |
| `GALAXY_DEV_MAIL_RELAY_PASSWORD` | secret | Gmail **App Password** (requires 2FA; not the account password). |
| `GALAXY_DEV_MAIL_RELAY_MATCH` | var | Recipient regex to auto-relay (e.g. your Gmail address). Unset → capture-only. |
With none of these set, the stack only captures mail (the compose
relay-match defaults to a non-routable address), so it can never email
third parties.
The capture UI is exposed through the operator console's `/_gm` gate at
[`/_gm/mailpit/`](https://galaxy.lan/_gm/mailpit/) — one Basic Auth for
the console, Grafana and Mailpit (see **Observability**). It shows
**every** message the backend sent, relayed or not, so you can read any
account's OTP regardless of the relay-match. For multi-account testing:
register several `you+tag@gmail.com` aliases and widen the match to a
regex such as `^you(\+[^@]+)?@gmail\.com$` (Gmail folds every `+tag`
into one inbox), or just read the codes in the Mailpit UI, or skip mail
entirely with the `123456` dev-code.
## Observability
A full metrics + logs + traces stack runs alongside the app on the
internal network (no host ports), as a production mirror. **Grafana**
and the **Mailpit** UI are reached only through the operator console's
single `/_gm` Basic Auth gate — one password (the admin-console account)
unlocks the console, [`/_gm/grafana/`](https://galaxy.lan/_gm/grafana/)
and [`/_gm/mailpit/`](https://galaxy.lan/_gm/mailpit/), with links in the
console nav. Grafana runs anonymous-Admin behind the gate (no own
login); Prometheus, Loki and Tempo stay internal-only.
- **Metrics** — Prometheus scrapes backend, gateway, `node-exporter` and
cAdvisor.
- **Logs** — promtail → Loki (Docker SD on the `galaxy.stack=dev-deploy`
label).
- **Traces** — backend + gateway → Tempo over OTLP.
Grafana's admin user is seeded from `GALAXY_DEV_GRAFANA_ADMIN_PASSWORD`
(for provisioning/API; the UI needs no Grafana login). See
[`monitoring/README.md`](monitoring/README.md) for services, configs and
tuning knobs.
## Networking ## Networking
``` ```
@@ -139,6 +194,8 @@ galaxy-caddy (networks: edge + galaxy-dev-internal)
│ /game/* -> file_server /srv/galaxy-ui (volume galaxy-dev-ui-dist) │ /game/* -> file_server /srv/galaxy-ui (volume galaxy-dev-ui-dist)
│ /api/*, /healthz -> reverse_proxy galaxy-api:8080 │ /api/*, /healthz -> reverse_proxy galaxy-api:8080
│ /rpc/* -> reverse_proxy galaxy-api:9090 (strips /rpc) │ /rpc/* -> reverse_proxy galaxy-api:9090 (strips /rpc)
│ /_gm, /_gm/* -> reverse_proxy galaxy-api:8080 (Basic Auth gate;
│ /_gm/grafana/ -> grafana, /_gm/mailpit/ -> mailpit)
galaxy-dev-internal galaxy-dev-internal
├─ galaxy-api (gateway: :8080 REST, :9090 gRPC) ├─ galaxy-api (gateway: :8080 REST, :9090 gRPC)
@@ -146,7 +203,9 @@ galaxy-dev-internal
├─ galaxy-postgres (postgres: :5432) ├─ galaxy-postgres (postgres: :5432)
├─ galaxy-redis (redis: :6379) ├─ galaxy-redis (redis: :6379)
├─ galaxy-mailpit (mailpit: :8025 UI, :1025 SMTP) ├─ galaxy-mailpit (mailpit: :8025 UI, :1025 SMTP)
└─ engine containers (spawned by backend on demand) ├─ engine containers (spawned by backend on demand)
└─ observability (prometheus, grafana, loki, promtail, tempo,
node-exporter, cadvisor)
``` ```
The compose project deliberately exposes no host ports. Diagnostics The compose project deliberately exposes no host ports. Diagnostics
@@ -191,8 +250,10 @@ make clean-data Stop everything and wipe volumes + game-state dir
## Files ## Files
- `docker-compose.yml` — six services: postgres, redis, mailpit, - `docker-compose.yml` — the application services (postgres, redis,
galaxy-backend, galaxy-api, galaxy-caddy. `galaxy-caddy` mounts both mailpit, galaxy-backend, galaxy-api, galaxy-caddy) plus the
observability stack (prometheus, grafana, loki, promtail, tempo,
node-exporter, cadvisor). `galaxy-caddy` mounts both
the `galaxy-dev-site-dist` (`/srv/galaxy-site`) and the `galaxy-dev-site-dist` (`/srv/galaxy-site`) and
`galaxy-dev-ui-dist` (`/srv/galaxy-ui`) volumes and reverse-proxies `galaxy-dev-ui-dist` (`/srv/galaxy-ui`) volumes and reverse-proxies
both gateway tiers (REST/health on `:8080`, Connect/gRPC-web on both gateway tiers (REST/health on `:8080`, Connect/gRPC-web on
@@ -204,6 +265,8 @@ make clean-data Stop everything and wipe volumes + game-state dir
at `/etc/caddy/Caddyfile`. at `/etc/caddy/Caddyfile`.
- `Caddyfile.prod` — placeholder for a future prod deployment; not used - `Caddyfile.prod` — placeholder for a future prod deployment; not used
by this compose. by this compose.
- `monitoring/` — Prometheus / Loki / promtail / Tempo / Grafana
configuration, provisioned as code; see `monitoring/README.md`.
- `Makefile` — wrapper over `docker compose` with helpers for engine, - `Makefile` — wrapper over `docker compose` with helpers for engine,
site/UI seeding, health probes, and full wipe. site/UI seeding, health probes, and full wipe.
- `.env.example` — non-secret defaults for the compose `${VAR:-}` - `.env.example` — non-secret defaults for the compose `${VAR:-}`
@@ -212,8 +275,7 @@ make clean-data Stop everything and wipe volumes + game-state dir
## Known issues ## Known issues
See [`KNOWN-ISSUES.md`](KNOWN-ISSUES.md) for symptoms that surface See [`KNOWN-ISSUES.md`](KNOWN-ISSUES.md) for symptoms that surface
in the long-lived dev environment but are not yet fixed (currently: in the long-lived dev environment but are not yet fixed.
the sandbox game flipping to `cancelled` after a redispatch).
## Deployment cadence ## Deployment cadence
@@ -237,12 +299,12 @@ behind. There is no separate state to clean up between the two paths.
### Engine image drift recycle ### Engine image drift recycle
`backend` spawns one engine container per game (the long-lived "Dev `backend` spawns one engine container per running game and the
Sandbox" plus any user-created games) and the reconciler reattaches reconciler reattaches to whatever it finds with the
to whatever it finds with the `galaxy.stack=dev-deploy` label. That `galaxy.stack=dev-deploy` label. That reattach does not check the
reattach does not check the running container's image SHA against the running container's image SHA against the freshly-built
freshly-built `galaxy-engine:dev` tag, so an unchanged container would `galaxy-engine:dev` tag, so an unchanged container would otherwise
otherwise keep serving the previous engine code after a redeploy. keep serving the previous engine code after a redeploy.
The `dev-deploy.yaml` workflow handles this in the The `dev-deploy.yaml` workflow handles this in the
`Recycle engine containers on image drift` step. When `docker build` `Recycle engine containers on image drift` step. When `docker build`
@@ -250,9 +312,7 @@ produces a new `galaxy-engine:dev` SHA, the step compares it against
every running `galaxy-game-*` container and, for each drifted one, every running `galaxy-game-*` container and, for each drifted one,
stops the backend, removes the container, wipes its bind-mounted stops the backend, removes the container, wipes its bind-mounted
state directory (Engine.Init() writes turn-0 over any pre-existing state directory (Engine.Init() writes turn-0 over any pre-existing
`turn-N` files), and cascade-deletes the lobby `games` row. The `turn-N` files), and cascade-deletes the lobby `games` row.
`dev-sandbox` bootstrap on the next backend boot finds no live
sandbox and provisions a fresh one on the new engine image.
When the engine sources are unchanged, the BuildKit cache hits and When the engine sources are unchanged, the BuildKit cache hits and
the SHA stays the same — the recycle step is a no-op and the running the SHA stays the same — the recycle step is a no-op and the running
+195 -11
View File
@@ -66,12 +66,26 @@ services:
image: axllent/mailpit:v1.21 image: axllent/mailpit:v1.21
container_name: galaxy-dev-mailpit container_name: galaxy-dev-mailpit
restart: unless-stopped restart: unless-stopped
# Mailpit is both the SMTP submission point and a relay: it captures
# every message in its UI and auto-relays the ones whose recipient
# matches GALAXY_DEV_MAIL_RELAY_MATCH to the Gmail account in the
# secret-rendered relay config. The default match is non-routable, so
# a stack brought up without the relay secret only captures, never sends.
command:
- "--smtp-relay-config=/etc/mailpit/relay.conf"
- "--smtp-relay-matching=${GALAXY_DEV_MAIL_RELAY_MATCH:-nobody@invalid.example}"
# Serve the capture UI under /_gm/mailpit so the host Caddy can expose
# it at https://galaxy.lan/_gm/mailpit/ behind the shared /_gm gate;
# SMTP is unaffected.
- "--webroot=/_gm/mailpit"
labels: labels:
galaxy.stack: dev-deploy galaxy.stack: dev-deploy
networks: networks:
- galaxy-internal - galaxy-internal
volumes:
- galaxy-dev-mailpit-config:/etc/mailpit:ro
healthcheck: healthcheck:
test: ["CMD", "wget", "-q", "-O-", "http://localhost:8025/livez"] test: ["CMD", "wget", "-q", "-O-", "http://localhost:8025/_gm/mailpit/livez"]
interval: 3s interval: 3s
timeout: 3s timeout: 3s
retries: 30 retries: 30
@@ -108,7 +122,13 @@ services:
BACKEND_NOTIFICATION_ADMIN_EMAIL: admin@galaxy.lan BACKEND_NOTIFICATION_ADMIN_EMAIL: admin@galaxy.lan
BACKEND_MAIL_WORKER_INTERVAL: 500ms BACKEND_MAIL_WORKER_INTERVAL: 500ms
BACKEND_NOTIFICATION_WORKER_INTERVAL: 500ms BACKEND_NOTIFICATION_WORKER_INTERVAL: 500ms
BACKEND_OTEL_TRACES_EXPORTER: none BACKEND_OTEL_TRACES_EXPORTER: otlp
BACKEND_OTEL_PROTOCOL: grpc
BACKEND_OTEL_ENDPOINT: "galaxy-tempo:4317"
# Tempo's OTLP receiver is plaintext on the internal network; the
# backend's gRPC exporter defaults to TLS, so disable it via the
# standard SDK env (applied on top of WithEndpoint).
OTEL_EXPORTER_OTLP_INSECURE: "true"
# Prometheus metrics are enabled in dev so the `/metrics` scrape # Prometheus metrics are enabled in dev so the `/metrics` scrape
# endpoint is available to the `galaxy-prometheus` service on the # endpoint is available to the `galaxy-prometheus` service on the
# internal network. The listener stays internal # internal network. The listener stays internal
@@ -127,15 +147,6 @@ services:
# bcrypt-hashed code is single-use). Set the var to an empty # bcrypt-hashed code is single-use). Set the var to an empty
# string in `.env` to disable. # string in `.env` to disable.
BACKEND_AUTH_DEV_FIXED_CODE: ${BACKEND_AUTH_DEV_FIXED_CODE:-123456} BACKEND_AUTH_DEV_FIXED_CODE: ${BACKEND_AUTH_DEV_FIXED_CODE:-123456}
# Long-lived dev environment always bootstraps the "Dev Sandbox"
# game owned by this email so a freshly redeployed stack already
# has one ready-to-play game in the lobby. Set the variable to an
# empty string in `.env` to disable the bootstrap (e.g. for a
# cold-start QA pass).
BACKEND_DEV_SANDBOX_EMAIL: ${BACKEND_DEV_SANDBOX_EMAIL:-dev@galaxy.lan}
BACKEND_DEV_SANDBOX_ENGINE_IMAGE: ${BACKEND_DEV_SANDBOX_ENGINE_IMAGE:-galaxy-engine:dev}
BACKEND_DEV_SANDBOX_ENGINE_VERSION: ${BACKEND_DEV_SANDBOX_ENGINE_VERSION:-0.1.0}
BACKEND_DEV_SANDBOX_PLAYER_COUNT: ${BACKEND_DEV_SANDBOX_PLAYER_COUNT:-20}
volumes: volumes:
- /var/run/docker.sock:/var/run/docker.sock - /var/run/docker.sock:/var/run/docker.sock
# Per-game state directories live under the same absolute path # Per-game state directories live under the same absolute path
@@ -195,6 +206,12 @@ services:
# the internal network — scraped by `galaxy-prometheus`, not # the internal network — scraped by `galaxy-prometheus`, not
# mapped to the host. # mapped to the host.
GATEWAY_ADMIN_HTTP_ADDR: ":9191" GATEWAY_ADMIN_HTTP_ADDR: ":9191"
# Traces -> Tempo over OTLP gRPC (plaintext on the internal net).
OTEL_SERVICE_NAME: galaxy-gateway
OTEL_TRACES_EXPORTER: otlp
OTEL_EXPORTER_OTLP_PROTOCOL: grpc
OTEL_EXPORTER_OTLP_ENDPOINT: "http://galaxy-tempo:4317"
OTEL_EXPORTER_OTLP_INSECURE: "true"
GATEWAY_BACKEND_HTTP_URL: "http://galaxy-backend:8080" GATEWAY_BACKEND_HTTP_URL: "http://galaxy-backend:8080"
GATEWAY_BACKEND_GRPC_PUSH_URL: "galaxy-backend:8081" GATEWAY_BACKEND_GRPC_PUSH_URL: "galaxy-backend:8081"
GATEWAY_BACKEND_GATEWAY_CLIENT_ID: dev-gateway-1 GATEWAY_BACKEND_GATEWAY_CLIENT_ID: dev-gateway-1
@@ -263,6 +280,163 @@ services:
- galaxy-internal - galaxy-internal
- edge - edge
galaxy-prometheus:
image: prom/prometheus:v2.55.1
container_name: galaxy-dev-prometheus
restart: unless-stopped
labels:
galaxy.stack: dev-deploy
command:
- --config.file=/etc/prometheus/prometheus.yml
- --storage.tsdb.path=/prometheus
- --storage.tsdb.retention.time=15d
- --web.enable-lifecycle
volumes:
- ${GALAXY_DEV_MONITORING_DIR:-./monitoring}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- galaxy-dev-prometheus-data:/prometheus
networks:
- galaxy-internal
deploy:
resources:
limits:
memory: 384m
galaxy-loki:
image: grafana/loki:3.3.2
container_name: galaxy-dev-loki
restart: unless-stopped
labels:
galaxy.stack: dev-deploy
command: ["-config.file=/etc/loki/loki.yml"]
volumes:
- ${GALAXY_DEV_MONITORING_DIR:-./monitoring}/loki/loki.yml:/etc/loki/loki.yml:ro
- galaxy-dev-loki-data:/loki
networks:
- galaxy-internal
deploy:
resources:
limits:
memory: 384m
galaxy-promtail:
image: grafana/promtail:3.3.2
container_name: galaxy-dev-promtail
restart: unless-stopped
labels:
galaxy.stack: dev-deploy
command: ["-config.file=/etc/promtail/promtail.yml"]
volumes:
- ${GALAXY_DEV_MONITORING_DIR:-./monitoring}/promtail/promtail.yml:/etc/promtail/promtail.yml:ro
- /var/run/docker.sock:/var/run/docker.sock:ro
networks:
- galaxy-internal
deploy:
resources:
limits:
memory: 128m
galaxy-tempo:
image: grafana/tempo:2.7.1
container_name: galaxy-dev-tempo
restart: unless-stopped
labels:
galaxy.stack: dev-deploy
command: ["-config.file=/etc/tempo/tempo.yml"]
volumes:
- ${GALAXY_DEV_MONITORING_DIR:-./monitoring}/tempo/tempo.yml:/etc/tempo/tempo.yml:ro
- galaxy-dev-tempo-data:/var/tempo
networks:
- galaxy-internal
deploy:
resources:
limits:
memory: 384m
galaxy-node-exporter:
image: prom/node-exporter:v1.8.2
container_name: galaxy-dev-node-exporter
restart: unless-stopped
labels:
galaxy.stack: dev-deploy
command:
- --path.procfs=/host/proc
- --path.sysfs=/host/sys
- --path.rootfs=/rootfs
- --collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
pid: host
networks:
- galaxy-internal
deploy:
resources:
limits:
memory: 64m
galaxy-cadvisor:
image: gcr.io/cadvisor/cadvisor:v0.49.1
container_name: galaxy-dev-cadvisor
restart: unless-stopped
labels:
galaxy.stack: dev-deploy
command:
- --housekeeping_interval=30s
- --docker_only=true
- --store_container_labels=false
privileged: true
devices:
- /dev/kmsg
volumes:
- /:/rootfs:ro
- /var/run:/var/run:ro
- /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro
- /dev/disk/:/dev/disk:ro
networks:
- galaxy-internal
deploy:
resources:
limits:
memory: 256m
galaxy-grafana:
image: grafana/grafana:11.4.0
container_name: galaxy-dev-grafana
restart: unless-stopped
labels:
galaxy.stack: dev-deploy
depends_on:
- galaxy-prometheus
- galaxy-loki
- galaxy-tempo
environment:
GF_SECURITY_ADMIN_PASSWORD: ${GALAXY_DEV_GRAFANA_ADMIN_PASSWORD:-admin}
GF_SERVER_ROOT_URL: https://galaxy.lan/_gm/grafana/
GF_SERVER_SERVE_FROM_SUB_PATH: "true"
# No own login: the /_gm Basic Auth gate is the only barrier, so
# serve everyone as anonymous Admin and ignore the forwarded
# Authorization header (basic auth off, login form off).
GF_AUTH_ANONYMOUS_ENABLED: "true"
GF_AUTH_ANONYMOUS_ORG_ROLE: Admin
GF_AUTH_DISABLE_LOGIN_FORM: "true"
GF_AUTH_BASIC_ENABLED: "false"
GF_USERS_ALLOW_SIGN_UP: "false"
GF_ANALYTICS_REPORTING_ENABLED: "false"
GF_ANALYTICS_CHECK_FOR_UPDATES: "false"
GF_NEWS_NEWS_FEED_ENABLED: "false"
volumes:
- ${GALAXY_DEV_MONITORING_DIR:-./monitoring}/grafana/provisioning:/etc/grafana/provisioning:ro
- ${GALAXY_DEV_MONITORING_DIR:-./monitoring}/grafana/dashboards:/var/lib/grafana/dashboards:ro
- galaxy-dev-grafana-data:/var/lib/grafana
networks:
- galaxy-internal
deploy:
resources:
limits:
memory: 256m
networks: networks:
galaxy-internal: galaxy-internal:
name: galaxy-dev-internal name: galaxy-dev-internal
@@ -292,3 +466,13 @@ volumes:
name: galaxy-dev-site-dist name: galaxy-dev-site-dist
galaxy-dev-geoip-data: galaxy-dev-geoip-data:
name: galaxy-dev-geoip-data name: galaxy-dev-geoip-data
galaxy-dev-mailpit-config:
name: galaxy-dev-mailpit-config
galaxy-dev-prometheus-data:
name: galaxy-dev-prometheus-data
galaxy-dev-grafana-data:
name: galaxy-dev-grafana-data
galaxy-dev-loki-data:
name: galaxy-dev-loki-data
galaxy-dev-tempo-data:
name: galaxy-dev-tempo-data
+18
View File
@@ -0,0 +1,18 @@
# Mailpit SMTP relay upstream — RENDERED AT DEPLOY TIME by
# .gitea/workflows/dev-deploy.yaml from Gitea Actions secrets, then
# seeded into the `galaxy-dev-mailpit-config` volume. The Gmail App
# Password is a secret and MUST NOT be committed: this template only
# carries ${PLACEHOLDER}s that the workflow substitutes. See
# tools/dev-deploy/README.md ("Mail").
#
# Mailpit captures every message; the `--smtp-relay-matching` flag (set
# from GALAXY_DEV_MAIL_RELAY_MATCH in the compose) decides which
# recipients are actually relayed up to this Gmail account.
host: smtp.gmail.com
port: 587
starttls: true
allow-insecure: false
auth: login
username: ${GALAXY_DEV_MAIL_RELAY_USERNAME}
password: ${GALAXY_DEV_MAIL_RELAY_PASSWORD}
return-path: ${GALAXY_DEV_MAIL_RELAY_USERNAME}
+77
View File
@@ -0,0 +1,77 @@
# `tools/dev-deploy/monitoring/` — observability stack
The long-lived dev environment runs a full metrics + logs + traces stack
alongside the application as a **production mirror**: the same compose
fragment and collector configs are meant to back production later. Every
collector lives on the internal `galaxy-dev-internal` network and
publishes **no host port**. The browser-reachable pieces (Grafana and
the Mailpit UI) sit behind the operator console's single `/_gm` Basic
Auth gate — see [`../README.md`](../README.md) and `ARCHITECTURE.md §14`.
## Services
| Service | Image | Role | Reachable |
| --- | --- | --- | --- |
| `galaxy-prometheus` | `prom/prometheus` | Scrape + store metrics (15d) | internal `:9090` |
| `galaxy-loki` | `grafana/loki` | Log store (7d) | internal `:3100` |
| `galaxy-promtail` | `grafana/promtail` | Ship container logs to Loki | — |
| `galaxy-tempo` | `grafana/tempo` | Trace store (3d), OTLP receiver | internal `:3200`, OTLP `:4317`/`:4318` |
| `galaxy-node-exporter` | `prom/node-exporter` | Host metrics | internal `:9100` |
| `galaxy-cadvisor` | `cadvisor` | Per-container CPU/memory/IO | internal `:8080` |
| `galaxy-grafana` | `grafana/grafana` | Dashboards + Explore | Caddy `/_gm/grafana/` |
## What is collected
- **Metrics.** Prometheus (30s interval) scrapes the backend Prometheus
endpoint (`galaxy-backend:9100`), the gateway admin endpoint
(`galaxy-api:9191`), `node-exporter` (host), cAdvisor (per container)
and Prometheus itself. Engine containers expose no `/metrics`; cAdvisor
covers their resource use.
- **Logs.** promtail discovers containers through the Docker API,
filtered to the `galaxy.stack=dev-deploy` label, and ships their
stdout/stderr to Loki labelled by `container`, `stream` and `game_id`
(the latter taken from the container's game-id label).
- **Traces.** backend and gateway export OTLP traces over gRPC to Tempo
(`galaxy-tempo:4317`), plaintext on the internal network
(`OTEL_EXPORTER_OTLP_INSECURE=true`, since Tempo's receiver is not
TLS-wrapped inside the contour).
## Grafana access (behind the `/_gm` gate)
Grafana is served under `/_gm/grafana/` (`GF_SERVER_ROOT_URL` +
`GF_SERVER_SERVE_FROM_SUB_PATH=true`) **behind the shared operator gate**:
the Caddy `/_gm/*` Basic Auth (the admin-console account) is the only
barrier. Grafana itself runs as **anonymous Admin** with its login form
and basic auth disabled (`GF_AUTH_ANONYMOUS_ENABLED=true`,
`GF_AUTH_ANONYMOUS_ORG_ROLE=Admin`, `GF_AUTH_DISABLE_LOGIN_FORM=true`,
`GF_AUTH_BASIC_ENABLED=false`), so it ignores the forwarded credentials
and asks for no second password. `GALAXY_DEV_GRAFANA_ADMIN_PASSWORD`
still seeds the admin user for provisioning/API use.
Datasources (Prometheus, Loki, Tempo) and a starter dashboard
(`grafana/dashboards/galaxy-overview.json`) are provisioned as code under
`grafana/provisioning/`.
## Config delivery
`dev-deploy.yaml` copies this directory to a stable host path
(`$HOME/.galaxy-dev/monitoring`, exported as `GALAXY_DEV_MONITORING_DIR`)
before `compose up`, and the compose binds it read-only into the
collectors. A stable path — not the ephemeral CI workspace — keeps the
mounts valid across container restarts and host reboots (the same lesson
as the geoip volume; see `../KNOWN-ISSUES.md`).
## Tuning (cost knobs)
Defaults favour the smallest workable footprint; all are config/compose
values:
- Prometheus `scrape_interval=30s`, `--storage.tsdb.retention.time=15d`.
- Loki `retention_period=168h` (7d); Tempo `block_retention=72h` (3d).
- cAdvisor `--housekeeping_interval=30s`.
- Per-service `deploy.resources.limits.memory` caps (~1.5 GB total cap;
steady-state well under that).
Seven always-on containers cost roughly 1.1 GB steady RAM and
~1.5–2.5 GB disk at these retention windows. cAdvisor is the main CPU
cost; on a constrained host it can be dropped (host + app metrics still
cover most needs).
@@ -0,0 +1,46 @@
{
"annotations": { "list": [] },
"editable": true,
"graphTooltip": 0,
"panels": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": { "defaults": { "unit": "reqps" }, "overrides": [] },
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
"id": 1,
"title": "Backend HTTP request rate",
"type": "timeseries",
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "sum by (group) (rate(http_requests_total[5m]))",
"legendFormat": "{{group}}"
}
]
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": { "defaults": { "unit": "bytes" }, "overrides": [] },
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
"id": 2,
"title": "Container memory (cadvisor)",
"type": "timeseries",
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "sum by (name) (container_memory_usage_bytes{name=~\"galaxy-dev-.*|galaxy-game-.*\"})",
"legendFormat": "{{name}}"
}
]
}
],
"schemaVersion": 39,
"tags": ["galaxy"],
"templating": { "list": [] },
"time": { "from": "now-6h", "to": "now" },
"timepicker": {},
"title": "Galaxy — overview",
"uid": "galaxy-overview",
"version": 1,
"weekStart": ""
}
@@ -0,0 +1,12 @@
# Grafana dashboard provider: load every JSON under the mounted
# dashboards directory at startup (provisioned as code).
apiVersion: 1
providers:
- name: galaxy
type: file
disableDeletion: false
allowUiUpdates: true
options:
path: /var/lib/grafana/dashboards
foldersFromFilesStructure: false
@@ -0,0 +1,22 @@
# Grafana datasources provisioned as code (dev↔prod parity). All reach
# the collectors by Docker DNS (compose service names) on
# galaxy-dev-internal.
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
access: proxy
uid: prometheus
url: http://galaxy-prometheus:9090
isDefault: true
- name: Loki
type: loki
access: proxy
uid: loki
url: http://galaxy-loki:3100
- name: Tempo
type: tempo
access: proxy
uid: tempo
url: http://galaxy-tempo:3200
+47
View File
@@ -0,0 +1,47 @@
# Single-binary Loki for the dev stack: filesystem storage, in-memory
# ring, 7-day retention. Internal-only (no host port).
auth_enabled: false
server:
http_listen_port: 3100
grpc_listen_port: 9095
log_level: warn
common:
instance_addr: 127.0.0.1
path_prefix: /loki
storage:
filesystem:
chunks_directory: /loki/chunks
rules_directory: /loki/rules
replication_factor: 1
ring:
kvstore:
store: inmemory
schema_config:
configs:
- from: 2024-01-01
store: tsdb
object_store: filesystem
schema: v13
index:
prefix: index_
period: 24h
limits_config:
retention_period: 168h
reject_old_samples: true
reject_old_samples_max_age: 168h
compactor:
working_directory: /loki/compactor
retention_enabled: true
delete_request_store: filesystem
query_range:
results_cache:
cache:
embedded_cache:
enabled: true
max_size_mb: 64
@@ -0,0 +1,24 @@
# Prometheus scrape config for the dev observability stack. Retention is
# a CLI flag in the compose command, not here. Targets are reached by
# Docker DNS (compose service names) on galaxy-dev-internal; nothing is
# published to the host.
global:
scrape_interval: 30s
evaluation_interval: 30s
scrape_configs:
- job_name: backend
static_configs:
- targets: ["galaxy-backend:9100"]
- job_name: gateway
static_configs:
- targets: ["galaxy-api:9191"]
- job_name: node
static_configs:
- targets: ["galaxy-node-exporter:9100"]
- job_name: cadvisor
static_configs:
- targets: ["galaxy-cadvisor:8080"]
- job_name: prometheus
static_configs:
- targets: ["localhost:9090"]
@@ -0,0 +1,30 @@
# Promtail tails the dev stack's container logs via the Docker API
# (service discovery filtered to the galaxy.stack=dev-deploy label) and
# ships them to Loki. Requires the Docker socket mounted read-only.
server:
http_listen_port: 9080
grpc_listen_port: 0
log_level: warn
positions:
filename: /tmp/positions.yaml
clients:
- url: http://galaxy-loki:3100/loki/api/v1/push
scrape_configs:
- job_name: docker
docker_sd_configs:
- host: unix:///var/run/docker.sock
refresh_interval: 15s
filters:
- name: label
values: ["galaxy.stack=dev-deploy"]
relabel_configs:
- source_labels: ["__meta_docker_container_name"]
regex: "/?(.*)"
target_label: container
- source_labels: ["__meta_docker_container_label_galaxy_game_id"]
target_label: game_id
- source_labels: ["__meta_docker_container_log_stream"]
target_label: stream
@@ -0,0 +1,30 @@
# Single-binary Tempo for the dev stack: OTLP receivers, local block
# storage, 3-day retention. Internal-only (no host port). Backend and
# gateway push traces here over OTLP gRPC (4317).
server:
http_listen_port: 3200
log_level: warn
distributor:
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
ingester:
max_block_duration: 5m
compactor:
compaction:
block_retention: 72h
storage:
trace:
backend: local
local:
path: /var/tempo/blocks
wal:
path: /var/tempo/wal
+7 -9
View File
@@ -22,7 +22,7 @@ help:
@echo " make up Build (if needed) and bring up the stack, wait until healthy" @echo " make up Build (if needed) and bring up the stack, wait until healthy"
@echo " make down Stop compose containers, leave engines + volumes intact" @echo " make down Stop compose containers, leave engines + volumes intact"
@echo " make rebuild Force rebuild of backend / gateway images and bring up" @echo " make rebuild Force rebuild of backend / gateway images and bring up"
@echo " make build-engine Build the engine image $(ENGINE_IMAGE) used by the dev sandbox" @echo " make build-engine Build the engine image $(ENGINE_IMAGE) used by running games"
@echo " make stop-engines Stop and remove only the per-game engine containers" @echo " make stop-engines Stop and remove only the per-game engine containers"
@echo " make prune-broken-engines Remove non-running engine containers Docker can't heal (run inside 'up')" @echo " make prune-broken-engines Remove non-running engine containers Docker can't heal (run inside 'up')"
@echo " make clean Stop everything (incl. engines) and wipe volumes + game state" @echo " make clean Stop everything (incl. engines) and wipe volumes + game state"
@@ -37,8 +37,9 @@ help:
@echo " pnpm -C ui/frontend dev" @echo " pnpm -C ui/frontend dev"
@echo "and open http://localhost:5173 (UI) plus http://localhost:8025 (Mailpit)." @echo "and open http://localhost:5173 (UI) plus http://localhost:8025 (Mailpit)."
@echo "" @echo ""
@echo "Default login for the auto-provisioned dev sandbox: dev@local.test" @echo "Sign in with email-OTP; the fixed login code 123456 works when"
@echo "(see BACKEND_DEV_SANDBOX_EMAIL in .env). Login code: 123456." @echo "BACKEND_AUTH_DEV_FIXED_CODE is set in .env. No game is auto-provisioned —"
@echo "load a legacy report via the UI's DEV report loader to exercise the map."
up: build-engine prune-broken-engines up: build-engine prune-broken-engines
$(COMPOSE) up -d --wait $(COMPOSE) up -d --wait
@@ -88,12 +89,9 @@ stop-engines:
# bind-mount source and leaves it stuck in `exited` / `created` # bind-mount source and leaves it stuck in `exited` / `created`
# state. This target prunes the husks before `compose up`; the # state. This target prunes the husks before `compose up`; the
# backend's pre-bootstrap reconciler tick (`backend/cmd/backend/main.go`) # backend's pre-bootstrap reconciler tick (`backend/cmd/backend/main.go`)
# then cascades the orphan runtime row to `removed`, the lobby # then cascades the orphan runtime row to `removed` and the lobby
# cancels the game, and the dev-sandbox bootstrap purges the # cancels the game. Healthy `running` / `restarting` containers are
# cancelled tile and provisions a fresh sandbox in the same # left intact so a long-lived game survives normal up/down cycles.
# `make up` cycle. Healthy `running` / `restarting` containers are
# left intact so a long-lived sandbox survives normal up/down
# cycles.
prune-broken-engines: prune-broken-engines:
@ids=""; \ @ids=""; \
for cid in $$(docker ps -aq \ for cid in $$(docker ps -aq \
+16 -50
View File
@@ -78,49 +78,24 @@ To force the second path (no fast-bypass), edit
`make rebuild` (or simply `docker compose up -d backend` to recreate `make rebuild` (or simply `docker compose up -d backend` to recreate
the backend with the new env). the backend with the new env).
## Auto-provisioned dev sandbox ## No auto-provisioned game
`make up` provisions a private game called **Dev Sandbox** owned by `make up` brings up the stack with an empty lobby — there is no
the dev user (default `dev@local.test`). The flow is implemented in auto-provisioned game. Sign in with email-OTP (the fixed dev code
`backend/internal/devsandbox` and runs on every backend boot when `123456` works when `BACKEND_AUTH_DEV_FIXED_CODE` is set in
`BACKEND_DEV_SANDBOX_EMAIL` is non-empty in `tools/local-dev/.env`. `tools/local-dev/.env`):
Bootstrap is idempotent — re-running `make up` after a `make down`
finds the existing user, dummy participants, game, and memberships
without creating duplicates. If a previous boot crashed mid-way
(game stuck in `enrollment_open` or `ready_to_start`), the next boot
resumes the lifecycle.
To log in straight into the sandbox:
1. `make -C tools/local-dev up` 1. `make -C tools/local-dev up`
2. `pnpm -C ui/frontend dev` (in another terminal) 2. `pnpm -C ui/frontend dev` (in another terminal)
3. Open <http://localhost:5173/login>, enter `dev@local.test`, then 3. Open <http://localhost:5173/login>, enter your email, then the dev
the dev code `123456`. code `123456`.
4. The lobby shows **Dev Sandbox** in *My Games*; click in.
To disable the bootstrap, clear `BACKEND_DEV_SANDBOX_EMAIL` in To exercise the map and report views without running a full game, use
`tools/local-dev/.env` and `docker compose up -d backend` (or the UI's DEV **synthetic report loader**: convert a legacy `.REP` with
`make rebuild`). Existing users / games are not removed. `tools/local-dev/legacy-report/` and load the resulting JSON through the
loader (see that tool's README). To play a real game, create one in the
Terminal sandbox games — anything in `cancelled`, `finished`, or lobby and let the engine (`galaxy-engine:local-dev`, built by
`start_failed` — are deleted on every boot before find-or-create `make build-engine`) run it.
runs. The cascade declared in `00001_init.sql` removes the
matching memberships, applications, invites, runtime records,
and player mappings in the same write, so the dev user's lobby
shows exactly one running tile at all times. Cancelling the
sandbox manually and running `docker compose restart backend`
(or `make rebuild`) yields a fresh game without leaving dead
tiles behind.
The bootstrap requires:
- `galaxy-engine:local-dev` Docker image (`make build-engine`).
- `BACKEND_DEV_SANDBOX_ENGINE_VERSION` parses as plain semver
(`MAJOR.MINOR.PATCH`); the default `0.1.0` is what the bootstrap
registers in the `engine_versions` row that points at the image.
- `BACKEND_DEV_SANDBOX_PLAYER_COUNT` ≥ 20 (the engine's minimum;
19 deterministic dummies fill the slots so the single real user
can start the game).
- A frozen turn schedule (`0 0 1 1 *` — once a year) so the visible - A frozen turn schedule (`0 0 1 1 *` — once a year) so the visible
game state stays at turn 1 until you explicitly progress it. game state stays at turn 1 until you explicitly progress it.
@@ -239,24 +214,15 @@ make status docker compose ps
this in one cycle: `prune-broken-engines` (runs as part of `up`) this in one cycle: `prune-broken-engines` (runs as part of `up`)
removes every engine container that is not in `running` / removes every engine container that is not in `running` /
`restarting` state, the backend's pre-bootstrap reconciler tick `restarting` state, the backend's pre-bootstrap reconciler tick
cascades the orphan runtime row to `removed`, the lobby cancels cascades the orphan runtime row to `removed`, and the lobby cancels
the matching sandbox game, and the dev-sandbox bootstrap purges the matching game. To run the cleanup by hand without restarting the
the cancelled tile and provisions a fresh sandbox with a brand rest of the stack, `make prune-broken-engines`.
new state directory. To run the cleanup by hand without restarting
the rest of the stack, `make prune-broken-engines`.
The cycle relies on the backend image carrying the pre-bootstrap The cycle relies on the backend image carrying the pre-bootstrap
reconciler tick (`backend/cmd/backend/main.go`). `make up` reuses reconciler tick (`backend/cmd/backend/main.go`). `make up` reuses
the cached image, so after pulling this commit the first time you the cached image, so after pulling this commit the first time you
must `make rebuild` once to bake the fix in. Future `make up` must `make rebuild` once to bake the fix in. Future `make up`
cycles will heal in one shot. cycles will heal in one shot.
If after the heal cycle the lobby still shows only a `cancelled`
sandbox tile and no running game, the running backend image
predates the pre-bootstrap reconciler tick — the periodic ticker
cancels the orphan after bootstrap has already returned, leaving
the lobby in the half-baked state. `make rebuild` recreates the
image and then `make up` lands a fresh sandbox.
- **`make up` reports a build error mentioning `pkg/cronutil`** — - **`make up` reports a build error mentioning `pkg/cronutil`** —
upstream module list drifted; copy any new `pkg/<name>/` line into upstream module list drifted; copy any new `pkg/<name>/` line into
the local-dev `backend.Dockerfile` / `gateway.Dockerfile` to match the local-dev `backend.Dockerfile` / `gateway.Dockerfile` to match
-4
View File
@@ -122,10 +122,6 @@ services:
BACKEND_OTEL_TRACES_EXPORTER: none BACKEND_OTEL_TRACES_EXPORTER: none
BACKEND_OTEL_METRICS_EXPORTER: none BACKEND_OTEL_METRICS_EXPORTER: none
BACKEND_AUTH_DEV_FIXED_CODE: ${BACKEND_AUTH_DEV_FIXED_CODE:-} BACKEND_AUTH_DEV_FIXED_CODE: ${BACKEND_AUTH_DEV_FIXED_CODE:-}
BACKEND_DEV_SANDBOX_EMAIL: ${BACKEND_DEV_SANDBOX_EMAIL:-}
BACKEND_DEV_SANDBOX_ENGINE_IMAGE: ${BACKEND_DEV_SANDBOX_ENGINE_IMAGE:-}
BACKEND_DEV_SANDBOX_ENGINE_VERSION: ${BACKEND_DEV_SANDBOX_ENGINE_VERSION:-}
BACKEND_DEV_SANDBOX_PLAYER_COUNT: ${BACKEND_DEV_SANDBOX_PLAYER_COUNT:-}
volumes: volumes:
- /var/run/docker.sock:/var/run/docker.sock - /var/run/docker.sock:/var/run/docker.sock
# Per-game state directories live under the same absolute path # Per-game state directories live under the same absolute path
+4 -4
View File
@@ -85,16 +85,16 @@ report to fetch. Two alternatives were rejected:
- a brand-new `user.games.state` message — adds a full wire-flow - a brand-new `user.games.state` message — adds a full wire-flow
(fbs schema, transcoder, gateway routing, backend handler) for a (fbs schema, transcoder, gateway routing, backend handler) for a
one-field response; one-field response;
- hard-coding `turn=0` for all games — works for the dev sandbox - hard-coding `turn=0` for all games — works for a synthetic report
(which never advances past turn zero) but renders the initial loaded at turn zero but mis-renders the initial state for any real
state for any real game past turn zero. game past turn zero.
Extending `GameSummary` reuses the existing lobby pipeline; the Extending `GameSummary` reuses the existing lobby pipeline; the
backend already tracks `current_turn` in its runtime projection backend already tracks `current_turn` in its runtime projection
(`backend/internal/server/handlers_user_lobby_helpers.go` (`backend/internal/server/handlers_user_lobby_helpers.go`
`gameSummaryToWire` reads it from `g.RuntimeSnapshot.CurrentTurn`). `gameSummaryToWire` reads it from `g.RuntimeSnapshot.CurrentTurn`).
The `current_turn` field defaults to zero on the FB side, so existing The `current_turn` field defaults to zero on the FB side, so existing
tests and the dev sandbox flow continue to work unchanged. tests and the synthetic-report flow continue to work unchanged.
## State binding ## State binding
+4 -2
View File
@@ -1,6 +1,8 @@
// DEV-only synthetic-report loader. Backs the "Load synthetic report" // DEV-only synthetic-report loader. Backs the "Load synthetic report"
// affordance on the lobby (visible behind `import.meta.env.DEV`) and // affordance on the lobby (visible when the build-time flag
// the in-game shell layout's bypass for the synthetic game id range. // `VITE_GALAXY_DEV_AFFORDANCES === "true"` — the dev and dev-deploy
// bundles; stripped from prod) and the in-game shell layout's bypass
// for the synthetic game id range.
// //
// The accepted JSON shape mirrors `pkg/model/report.Report` as // The accepted JSON shape mirrors `pkg/model/report.Report` as
// emitted by `tools/local-dev/legacy-report/cmd/legacy-report-to-json`. // emitted by `tools/local-dev/legacy-report/cmd/legacy-report-to-json`.