From 0cae89cba2a8302240bbc61ad82fe15773c2594c Mon Sep 17 00:00:00 2001 From: Ilia Denisov Date: Sun, 31 May 2026 22:28:03 +0200 Subject: [PATCH 1/8] refactor(dev): remove the dev-sandbox bootstrap everywhere Stage 1 of the dev-as-prod-mirror rework. The auto-provisioned "Dev Sandbox" game and dummy users are removed so the dev contour starts empty like prod; the separate legacy-report loader stays as the test-data path. - delete backend/internal/devsandbox (package + tests) - drop the bootstrap call + DevSandboxConfig (struct, Config field, BACKEND_DEV_SANDBOX_* env, defaults, loader, validation) - strip BACKEND_DEV_SANDBOX_* from dev-deploy + local-dev compose and .env.example; the generic engine-recycle / prune-broken-engines logic stays (it serves real games) - update tooling docs (dev-deploy README + KNOWN-ISSUES, local-dev README + Makefile) and stale comments; DeleteGame and InsertMembershipDirect remain (exercised by lobby integration tests) No app behaviour change beyond not auto-creating the sandbox game. 
--- .gitea/workflows/dev-deploy.yaml | 7 +- backend/cmd/backend/main.go | 24 +- backend/internal/config/config.go | 51 ---- backend/internal/devsandbox/bootstrap.go | 287 ------------------ backend/internal/devsandbox/bootstrap_test.go | 106 ------- backend/internal/lobby/games.go | 9 +- backend/internal/lobby/lobby_e2e_test.go | 4 +- backend/internal/lobby/membership_direct.go | 11 +- backend/internal/lobby/store.go | 5 +- tools/dev-deploy/.env.example | 6 - tools/dev-deploy/KNOWN-ISSUES.md | 162 +--------- tools/dev-deploy/README.md | 22 +- tools/dev-deploy/docker-compose.yml | 9 - tools/local-dev/Makefile | 16 +- tools/local-dev/README.md | 66 +--- tools/local-dev/docker-compose.yml | 4 - ui/docs/game-state.md | 8 +- 17 files changed, 60 insertions(+), 737 deletions(-) delete mode 100644 backend/internal/devsandbox/bootstrap.go delete mode 100644 backend/internal/devsandbox/bootstrap_test.go diff --git a/.gitea/workflows/dev-deploy.yaml b/.gitea/workflows/dev-deploy.yaml index 2a2ef6b..b9b3c94 100644 --- a/.gitea/workflows/dev-deploy.yaml +++ b/.gitea/workflows/dev-deploy.yaml @@ -153,7 +153,7 @@ jobs: # Compare the freshly-built `galaxy-engine:dev` SHA against # every running `galaxy-game-*` container. The backend # reconciler adopts pre-existing labelled engine containers - # without checking image drift, so a running sandbox would + # without checking image drift, so a running game would # otherwise keep serving the previous engine code until the # container is recycled by hand. This step makes the recycle # automatic but only when it is actually needed: @@ -168,10 +168,7 @@ jobs: # silent state corruption otherwise), and cascade-delete # the lobby `games` row (the FKs in `00001_init.sql` # drop the matching `runtime_records`, `memberships`, - # `player_mappings`, etc. in the same write). The - # `dev-sandbox` bootstrap on the next backend boot finds - # no live sandbox and provisions a fresh one on the new - # engine image. + # `player_mappings`, etc. 
in the same write). # # Backend is stopped first to keep the reconciler from # racing the recycle (mid-stream adoption / restart). The diff --git a/backend/cmd/backend/main.go b/backend/cmd/backend/main.go index 8d87527..d5b4286 100644 --- a/backend/cmd/backend/main.go +++ b/backend/cmd/backend/main.go @@ -26,7 +26,6 @@ import ( "galaxy/backend/internal/app" "galaxy/backend/internal/auth" "galaxy/backend/internal/config" - "galaxy/backend/internal/devsandbox" "galaxy/backend/internal/diplomail" "galaxy/backend/internal/diplomail/detector" "galaxy/backend/internal/diplomail/translator" @@ -274,29 +273,18 @@ func run(ctx context.Context) (err error) { ) runtimeGateway.svc = runtimeSvc - // Run a single reconciliation pass before the dev-sandbox - // bootstrap so any runtime row pointing at a vanished engine - // container (host reboot wiped /tmp/galaxy-game-state/; - // `tools/local-dev`'s `prune-broken-engines` target reaped the - // husk) is already cascaded through `markRemoved` → lobby - // `cancelled` by the time the bootstrap walks the sandbox list. - // Without this pre-tick the bootstrap would reuse the - // soon-to-be-cancelled game and force the developer into a - // second `make up` cycle to land a healthy sandbox. Failures are + // Run a single reconciliation pass at startup so any runtime row + // pointing at a vanished engine container (a host reboot wiped + // /tmp/galaxy-game-state/; `tools/local-dev`'s + // `prune-broken-engines` target reaped the husk) is cascaded + // through `markRemoved` → lobby `cancelled` before the server + // starts serving requests. Failures are // non-fatal: the periodic ticker started later catches up, and // the worst case degrades to the legacy two-cycle recovery. 
if err := runtimeSvc.Reconciler().Tick(ctx); err != nil { logger.Warn("pre-bootstrap reconciler tick failed", zap.Error(err)) } - if err := devsandbox.Bootstrap(ctx, devsandbox.Deps{ - Users: userSvc, - Lobby: lobbySvc, - EngineVersions: engineVersionSvc, - }, cfg.DevSandbox, logger); err != nil { - return fmt.Errorf("dev sandbox bootstrap: %w", err) - } - notifStore := notification.NewStore(db) notifSvc := notification.NewService(notification.Deps{ Store: notifStore, diff --git a/backend/internal/config/config.go b/backend/internal/config/config.go index baf6c7b..242355b 100644 --- a/backend/internal/config/config.go +++ b/backend/internal/config/config.go @@ -105,11 +105,6 @@ const ( envDiplomailTranslatorTimeout = "BACKEND_DIPLOMAIL_TRANSLATOR_TIMEOUT" envDiplomailTranslatorMaxAttempts = "BACKEND_DIPLOMAIL_TRANSLATOR_MAX_ATTEMPTS" envDiplomailWorkerInterval = "BACKEND_DIPLOMAIL_WORKER_INTERVAL" - - envDevSandboxEmail = "BACKEND_DEV_SANDBOX_EMAIL" - envDevSandboxEngineImage = "BACKEND_DEV_SANDBOX_ENGINE_IMAGE" - envDevSandboxEngineVersion = "BACKEND_DEV_SANDBOX_ENGINE_VERSION" - envDevSandboxPlayerCount = "BACKEND_DEV_SANDBOX_PLAYER_COUNT" ) // Default values applied when an environment variable is absent. @@ -178,9 +173,6 @@ const ( defaultDiplomailTranslatorTimeout = 10 * time.Second defaultDiplomailTranslatorMaxAttempts = 5 defaultDiplomailWorkerInterval = 2 * time.Second - - defaultDevSandboxEngineVersion = "0.1.0" - defaultDevSandboxPlayerCount = 20 ) // Allowed values for the closed-set string options. @@ -219,29 +211,12 @@ type Config struct { Runtime RuntimeConfig Notification NotificationConfig Diplomail DiplomailConfig - DevSandbox DevSandboxConfig // FreshnessWindow mirrors the gateway freshness window and is used by the // push server to bound the cursor TTL. FreshnessWindow time.Duration } -// DevSandboxConfig configures the boot-time bootstrap implemented in -// `backend/internal/devsandbox`. 
When Email is empty the bootstrap -// is a no-op, which is the production posture. When Email is set — -// from `BACKEND_DEV_SANDBOX_EMAIL` in the `tools/local-dev` stack — -// the bootstrap idempotently provisions a real user, the configured -// number of dummy participants, a private "Dev Sandbox" game, the -// matching memberships, and drives the lifecycle to `running`. The -// engine image and engine version refer to a row that the bootstrap -// also seeds in `engine_versions`. -type DevSandboxConfig struct { - Email string - EngineImage string - EngineVersion string - PlayerCount int -} - // LoggingConfig stores the parameters used by the structured logger. type LoggingConfig struct { // Level is the zap level name (e.g. "debug", "info", "warn", "error"). @@ -572,10 +547,6 @@ func DefaultConfig() Config { TranslatorMaxAttempts: defaultDiplomailTranslatorMaxAttempts, WorkerInterval: defaultDiplomailWorkerInterval, }, - DevSandbox: DevSandboxConfig{ - EngineVersion: defaultDevSandboxEngineVersion, - PlayerCount: defaultDevSandboxPlayerCount, - }, Runtime: RuntimeConfig{ WorkerPoolSize: defaultRuntimeWorkerPoolSize, JobQueueSize: defaultRuntimeJobQueueSize, @@ -755,13 +726,6 @@ func LoadFromEnv() (Config, error) { return Config{}, err } - cfg.DevSandbox.Email = strings.TrimSpace(loadString(envDevSandboxEmail, cfg.DevSandbox.Email)) - cfg.DevSandbox.EngineImage = strings.TrimSpace(loadString(envDevSandboxEngineImage, cfg.DevSandbox.EngineImage)) - cfg.DevSandbox.EngineVersion = strings.TrimSpace(loadString(envDevSandboxEngineVersion, cfg.DevSandbox.EngineVersion)) - if cfg.DevSandbox.PlayerCount, err = loadInt(envDevSandboxPlayerCount, cfg.DevSandbox.PlayerCount); err != nil { - return Config{}, err - } - if err := cfg.Validate(); err != nil { return Config{}, err } @@ -973,21 +937,6 @@ func (c Config) Validate() error { } } - if email := strings.TrimSpace(c.DevSandbox.Email); email != "" { - if _, err := netmail.ParseAddress(email); err != nil { - return 
fmt.Errorf("%s must be a valid RFC 5322 address: %w", envDevSandboxEmail, err) - } - if strings.TrimSpace(c.DevSandbox.EngineImage) == "" { - return fmt.Errorf("%s must not be empty when %s is set", envDevSandboxEngineImage, envDevSandboxEmail) - } - if strings.TrimSpace(c.DevSandbox.EngineVersion) == "" { - return fmt.Errorf("%s must not be empty when %s is set", envDevSandboxEngineVersion, envDevSandboxEmail) - } - if c.DevSandbox.PlayerCount <= 0 { - return fmt.Errorf("%s must be positive when %s is set", envDevSandboxPlayerCount, envDevSandboxEmail) - } - } - return nil } diff --git a/backend/internal/devsandbox/bootstrap.go b/backend/internal/devsandbox/bootstrap.go deleted file mode 100644 index 849a94c..0000000 --- a/backend/internal/devsandbox/bootstrap.go +++ /dev/null @@ -1,287 +0,0 @@ -// Package devsandbox provisions a ready-to-play game on backend boot -// for the `tools/local-dev` stack. -// -// Bootstrap is invoked from `backend/cmd/backend/main.go` after the -// admin bootstrap and before the HTTP listener starts. It reads -// `cfg.DevSandbox`; when `Email` is empty (the production posture) -// the function logs "skipped" and returns nil. When set, it -// idempotently: -// -// 1. registers the configured engine version and image; -// 2. find-or-creates the real dev user with the configured email; -// 3. find-or-creates `cfg.PlayerCount - 1` deterministic dummy -// users so the engine's minimum-players constraint is met; -// 4. find-or-creates a private "Dev Sandbox" game owned by the -// real user with min/max_players = cfg.PlayerCount and a -// year-out turn schedule (effectively frozen at turn 1); -// 5. inserts memberships for all participants bypassing the -// application/approval flow; -// 6. drives the lifecycle to `running` (or as far as possible if -// the runtime is busy). -// -// The function is a no-op on subsequent boots once the game is -// running; partial states from earlier crashes are recovered. 
-package devsandbox - -import ( - "context" - "errors" - "fmt" - "time" - - "galaxy/backend/internal/config" - "galaxy/backend/internal/lobby" - "galaxy/backend/internal/runtime" - - "github.com/google/uuid" - "go.uber.org/zap" -) - -// SandboxGameName is the display name used to identify the -// auto-provisioned game on subsequent reboots. The combination of -// game_name and owner_user_id is unique enough in practice — only -// the dev sandbox bootstrap creates a game owned by the configured -// real user with this exact name. -const SandboxGameName = "Dev Sandbox" - -// SandboxTurnSchedule keeps the game on turn 1 by scheduling the -// next turn a year out. The runtime scheduler still parses this and -// will tick once a year — long enough to never interfere with -// solo UI development. -const SandboxTurnSchedule = "0 0 1 1 *" - -// UserEnsurer matches `auth.UserEnsurer`. We define a local -// interface to avoid importing the auth package and circular -// dependencies — the production wiring passes the same `*user.Service` -// instance used by auth. -type UserEnsurer interface { - EnsureByEmail(ctx context.Context, email, preferredLanguage, timeZone, declaredCountry string) (uuid.UUID, error) -} - -// Deps aggregates the collaborators Bootstrap needs. -type Deps struct { - Users UserEnsurer - Lobby *lobby.Service - EngineVersions *runtime.EngineVersionService -} - -// Bootstrap runs the seven-step provisioning flow described on the -// package doc comment. Errors are returned to the caller; the boot -// path in `cmd/backend/main.go` aborts startup if Bootstrap fails so -// a misconfigured dev environment surfaces immediately rather than -// silently leaving the lobby empty. 
-func Bootstrap(ctx context.Context, deps Deps, cfg config.DevSandboxConfig, logger *zap.Logger) error { - if logger == nil { - logger = zap.NewNop() - } - logger = logger.Named("dev_sandbox") - - if cfg.Email == "" { - logger.Info("skipped (no email)") - return nil - } - if deps.Users == nil || deps.Lobby == nil || deps.EngineVersions == nil { - return errors.New("dev_sandbox: deps.Users, deps.Lobby and deps.EngineVersions are required") - } - if cfg.PlayerCount <= 0 { - return fmt.Errorf("dev_sandbox: PlayerCount must be positive, got %d", cfg.PlayerCount) - } - - if err := ensureEngineVersion(ctx, deps.EngineVersions, cfg, logger); err != nil { - return err - } - - realID, err := deps.Users.EnsureByEmail(ctx, cfg.Email, "en", "UTC", "") - if err != nil { - return fmt.Errorf("dev_sandbox: ensure real user: %w", err) - } - - dummyIDs := make([]uuid.UUID, 0, cfg.PlayerCount-1) - for i := 1; i < cfg.PlayerCount; i++ { - email := fmt.Sprintf("dev-dummy-%02d@local.test", i) - id, err := deps.Users.EnsureByEmail(ctx, email, "en", "UTC", "") - if err != nil { - return fmt.Errorf("dev_sandbox: ensure dummy %d: %w", i, err) - } - dummyIDs = append(dummyIDs, id) - } - - if err := purgeTerminalSandboxGames(ctx, deps.Lobby, realID, logger); err != nil { - return err - } - - game, err := findOrCreateSandboxGame(ctx, deps.Lobby, realID, cfg) - if err != nil { - return err - } - - game, err = ensureMembershipsAndDrive(ctx, deps.Lobby, game, realID, dummyIDs, logger) - if err != nil { - return err - } - - logger.Info("bootstrap complete", - zap.String("user_id", realID.String()), - zap.String("game_id", game.GameID.String()), - zap.String("status", game.Status), - ) - return nil -} - -func ensureEngineVersion(ctx context.Context, svc *runtime.EngineVersionService, cfg config.DevSandboxConfig, logger *zap.Logger) error { - _, err := svc.Register(ctx, runtime.RegisterInput{ - Version: cfg.EngineVersion, - ImageRef: cfg.EngineImage, - }) - switch { - case err == nil: - 
logger.Info("engine version registered", - zap.String("version", cfg.EngineVersion), - zap.String("image", cfg.EngineImage), - ) - return nil - case errors.Is(err, runtime.ErrEngineVersionTaken): - logger.Debug("engine version already registered", - zap.String("version", cfg.EngineVersion), - ) - return nil - default: - return fmt.Errorf("dev_sandbox: register engine version: %w", err) - } -} - -// terminalSandboxStatus reports whether a sandbox game has reached a -// state from which it can no longer be driven back to running. We -// treat such games as "absent" so the next bootstrap creates a fresh -// one rather than handing the developer a dead lobby tile. -func terminalSandboxStatus(status string) bool { - switch status { - case lobby.GameStatusCancelled, lobby.GameStatusFinished, lobby.GameStatusStartFailed: - return true - } - return false -} - -// purgeTerminalSandboxGames deletes every previous "Dev Sandbox" game -// the dev user owns that has reached a terminal state -// (cancelled / finished / start_failed). The cascade declared in -// `00001_init.sql` removes the matching memberships, applications, -// invites, runtime records, and player mappings in the same write, -// so the developer's lobby never piles up dead tiles between -// `make rebuild` cycles. Non-terminal games are left untouched — -// a `running` sandbox from a previous boot is the happy path. 
-func purgeTerminalSandboxGames(ctx context.Context, svc *lobby.Service, ownerID uuid.UUID, logger *zap.Logger) error { - games, err := svc.ListMyGames(ctx, ownerID) - if err != nil { - return fmt.Errorf("dev_sandbox: list my games: %w", err) - } - for _, g := range games { - if g.GameName != SandboxGameName || g.OwnerUserID == nil || *g.OwnerUserID != ownerID { - continue - } - if !terminalSandboxStatus(g.Status) { - continue - } - if err := svc.DeleteGame(ctx, g.GameID); err != nil { - return fmt.Errorf("dev_sandbox: delete terminal sandbox %s: %w", g.GameID, err) - } - logger.Info("purged terminal sandbox game", - zap.String("game_id", g.GameID.String()), - zap.String("status", g.Status), - ) - } - return nil -} - -func findOrCreateSandboxGame(ctx context.Context, svc *lobby.Service, ownerID uuid.UUID, cfg config.DevSandboxConfig) (lobby.GameRecord, error) { - games, err := svc.ListMyGames(ctx, ownerID) - if err != nil { - return lobby.GameRecord{}, fmt.Errorf("dev_sandbox: list my games: %w", err) - } - for _, g := range games { - if g.GameName != SandboxGameName || g.OwnerUserID == nil || *g.OwnerUserID != ownerID { - continue - } - // `purgeTerminalSandboxGames` ran before us, so any sandbox - // game still in the list is either a live one we should - // reuse or a transient state we can drive forward. 
- return g, nil - } - rec, err := svc.CreateGame(ctx, lobby.CreateGameInput{ - OwnerUserID: &ownerID, - Visibility: lobby.VisibilityPrivate, - GameName: SandboxGameName, - Description: "Auto-provisioned by backend/internal/devsandbox for solo UI development.", - MinPlayers: int32(cfg.PlayerCount), - MaxPlayers: int32(cfg.PlayerCount), - StartGapHours: 0, - StartGapPlayers: 0, - EnrollmentEndsAt: time.Now().Add(365 * 24 * time.Hour), - TurnSchedule: SandboxTurnSchedule, - TargetEngineVersion: cfg.EngineVersion, - }) - if err != nil { - return lobby.GameRecord{}, fmt.Errorf("dev_sandbox: create game: %w", err) - } - return rec, nil -} - -func ensureMembershipsAndDrive(ctx context.Context, svc *lobby.Service, game lobby.GameRecord, realID uuid.UUID, dummyIDs []uuid.UUID, logger *zap.Logger) (lobby.GameRecord, error) { - caller := realID - if game.Status == lobby.GameStatusDraft { - next, err := svc.OpenEnrollment(ctx, &caller, false, game.GameID) - if err != nil { - return game, fmt.Errorf("dev_sandbox: open enrollment: %w", err) - } - game = next - } - - if game.Status == lobby.GameStatusEnrollmentOpen { - users := append([]uuid.UUID{realID}, dummyIDs...) 
- for i, uid := range users { - raceName := fmt.Sprintf("Sandbox-%02d", i+1) - if _, err := svc.InsertMembershipDirect(ctx, lobby.InsertMembershipDirectInput{ - GameID: game.GameID, - UserID: uid, - RaceName: raceName, - }); err != nil { - return game, fmt.Errorf("dev_sandbox: insert membership %d: %w", i+1, err) - } - } - logger.Info("memberships ensured", - zap.Int("count", len(users)), - zap.String("game_id", game.GameID.String()), - ) - next, err := svc.ReadyToStart(ctx, &caller, false, game.GameID) - if err != nil { - return game, fmt.Errorf("dev_sandbox: ready to start: %w", err) - } - game = next - } - - if game.Status == lobby.GameStatusReadyToStart { - next, err := svc.Start(ctx, &caller, false, game.GameID) - if err != nil { - return game, fmt.Errorf("dev_sandbox: start: %w", err) - } - game = next - } - - if game.Status == lobby.GameStatusStartFailed { - next, err := svc.RetryStart(ctx, &caller, false, game.GameID) - if err != nil { - logger.Warn("retry start failed", zap.Error(err)) - return game, nil - } - game = next - if game.Status == lobby.GameStatusReadyToStart { - next, err := svc.Start(ctx, &caller, false, game.GameID) - if err != nil { - return game, fmt.Errorf("dev_sandbox: start after retry: %w", err) - } - game = next - } - } - - return game, nil -} diff --git a/backend/internal/devsandbox/bootstrap_test.go b/backend/internal/devsandbox/bootstrap_test.go deleted file mode 100644 index 714d6cd..0000000 --- a/backend/internal/devsandbox/bootstrap_test.go +++ /dev/null @@ -1,106 +0,0 @@ -package devsandbox - -import ( - "context" - "errors" - "testing" - - "galaxy/backend/internal/config" - - "github.com/google/uuid" - "go.uber.org/zap" -) - -// TestBootstrapSkippedWhenEmailEmpty exercises the no-op branch: with -// the production posture (Email == "") Bootstrap must return without -// touching any dependency. The fact that Users/Lobby/EngineVersions -// are nil here doubles as a check that the early-return runs first. 
-func TestBootstrapSkippedWhenEmailEmpty(t *testing.T) { - err := Bootstrap( - context.Background(), - Deps{}, - config.DevSandboxConfig{}, - zap.NewNop(), - ) - if err != nil { - t.Fatalf("expected nil error on empty email, got: %v", err) - } -} - -// TestBootstrapRejectsZeroPlayerCount confirms the validation -// short-circuits the flow before any DB call when PlayerCount is -// non-positive but Email is set. The error path is fast and never -// dereferences the (still-nil) Users/Lobby deps. -func TestBootstrapRejectsZeroPlayerCount(t *testing.T) { - err := Bootstrap( - context.Background(), - Deps{Users: stubEnsurer{}, Lobby: nil, EngineVersions: nil}, - config.DevSandboxConfig{ - Email: "dev@local.test", - EngineImage: "galaxy-engine:local-dev", - EngineVersion: "0.0.0-local-dev", - PlayerCount: 0, - }, - zap.NewNop(), - ) - if err == nil { - t.Fatal("expected error on zero PlayerCount, got nil") - } -} - -// TestBootstrapRejectsMissingDeps checks that a misconfigured wiring -// (Email set but one of the required services nil) fails fast rather -// than panicking when the bootstrap reaches its first service call. -func TestBootstrapRejectsMissingDeps(t *testing.T) { - err := Bootstrap( - context.Background(), - Deps{Users: stubEnsurer{}, Lobby: nil, EngineVersions: nil}, - config.DevSandboxConfig{ - Email: "dev@local.test", - EngineImage: "galaxy-engine:local-dev", - EngineVersion: "0.0.0-local-dev", - PlayerCount: 20, - }, - zap.NewNop(), - ) - if err == nil { - t.Fatal("expected error on missing deps, got nil") - } - if !errors.Is(err, errMissingDepsSentinel) && err.Error() == "" { - // The exact wording is not part of the contract; this branch - // only asserts the error is non-nil and human-readable. - t.Fatalf("error has empty message: %v", err) - } -} - -// errMissingDepsSentinel exists so the assertion above can compile; -// the real error is constructed via errors.New inside Bootstrap and -// is intentionally not exported. 
The test only needs to confirm the -// returned error has a message. -var errMissingDepsSentinel = errors.New("sentinel") - -// TestTerminalSandboxStatus pins the contract that decides whether a -// previously created sandbox game gets purged on the next boot. -// Terminal states are deleted (cascade-style) so the developer's -// lobby never piles up dead tiles between `make rebuild` cycles. -func TestTerminalSandboxStatus(t *testing.T) { - terminal := []string{"cancelled", "finished", "start_failed"} - live := []string{"draft", "enrollment_open", "ready_to_start", "starting", "running", "paused"} - - for _, status := range terminal { - if !terminalSandboxStatus(status) { - t.Errorf("expected %q to be terminal", status) - } - } - for _, status := range live { - if terminalSandboxStatus(status) { - t.Errorf("expected %q to be non-terminal", status) - } - } -} - -type stubEnsurer struct{} - -func (stubEnsurer) EnsureByEmail(_ context.Context, _, _, _, _ string) (uuid.UUID, error) { - return uuid.UUID{}, nil -} diff --git a/backend/internal/lobby/games.go b/backend/internal/lobby/games.go index ad98f4f..9ee1bab 100644 --- a/backend/internal/lobby/games.go +++ b/backend/internal/lobby/games.go @@ -274,11 +274,10 @@ func (s *Service) ListFinishedGamesBefore(ctx context.Context, cutoff time.Time) // `ON DELETE CASCADE` constraints declared in `00001_init.sql`. // Idempotent: returns nil when no game matches. // -// Phase 14 introduces this method for the dev-sandbox bootstrap so a -// terminal "Dev Sandbox" tile from a previous local-dev session can -// be scrubbed before a fresh game spawns. Production callers must -// stay on the regular cancel / finish lifecycle — `DeleteGame` is -// destructive and bypasses the cascade-notification machinery. +// `DeleteGame` is destructive — a hard delete that bypasses the +// cascade-notification machinery — so production callers stay on the +// regular cancel / finish lifecycle. It is exercised by the lobby +// integration tests. 
func (s *Service) DeleteGame(ctx context.Context, gameID uuid.UUID) error { if err := s.deps.Store.DeleteGame(ctx, gameID); err != nil { return err diff --git a/backend/internal/lobby/lobby_e2e_test.go b/backend/internal/lobby/lobby_e2e_test.go index 7c5baed..460cfb6 100644 --- a/backend/internal/lobby/lobby_e2e_test.go +++ b/backend/internal/lobby/lobby_e2e_test.go @@ -248,8 +248,8 @@ func TestEndToEndPrivateGameFlow(t *testing.T) { } } -// TestDeleteGameCascadesEverything pins the contract the dev-sandbox -// bootstrap relies on: removing a game wipes every referencing row +// TestDeleteGameCascadesEverything pins the DeleteGame contract: +// removing a game wipes every referencing row // (memberships, applications, invites, runtime_records, // player_mappings) in a single SQL statement. Before this is wired // the developer's lobby pile up cancelled tiles between diff --git a/backend/internal/lobby/membership_direct.go b/backend/internal/lobby/membership_direct.go index 1a9201c..c5aa150 100644 --- a/backend/internal/lobby/membership_direct.go +++ b/backend/internal/lobby/membership_direct.go @@ -20,9 +20,9 @@ type InsertMembershipDirectInput struct { // writes as ApproveApplication: the per-game race-name reservation // row plus the membership row, and refreshes the in-memory caches. // -// The method is intended for boot-time provisioning by -// `backend/internal/devsandbox` and similar trusted callers. It is -// not exposed through any HTTP handler. The caller must guarantee +// The method is intended for trusted boot-time provisioning and +// integration tests; it is not exposed through any HTTP handler. The +// caller must guarantee // game.Status == GameStatusEnrollmentOpen — the function returns // ErrConflict otherwise — and that the race-name policy and // canonical-key invariants are honoured (the implementation reuses @@ -30,9 +30,8 @@ type InsertMembershipDirectInput struct { // or unsuitable name still fails). 
// // Idempotency: if a membership for (GameID, UserID) already exists -// the function returns the existing row without modifying state. -// This makes the helper safe to call on every backend boot from -// devsandbox.Bootstrap. +// the function returns the existing row without modifying state, so +// the helper is safe to call repeatedly. func (s *Service) InsertMembershipDirect(ctx context.Context, in InsertMembershipDirectInput) (Membership, error) { displayName, err := ValidateDisplayName(in.RaceName) if err != nil { diff --git a/backend/internal/lobby/store.go b/backend/internal/lobby/store.go index 97a8c90..c4ee547 100644 --- a/backend/internal/lobby/store.go +++ b/backend/internal/lobby/store.go @@ -236,9 +236,8 @@ func (s *Store) ListMyGames(ctx context.Context, userID uuid.UUID) ([]GameRecord // referencing table (memberships / applications / invites / // runtime_records / player_mappings — all declared with ON DELETE // CASCADE in `00001_init.sql`). Idempotent: returns nil when no row -// matches. Used by the dev-sandbox bootstrap to scrub terminal -// games on every backend boot so the developer's lobby never piles -// up cancelled tiles. +// matches. A hard delete for trusted callers and integration tests; +// production lifecycle uses cancel / finish. func (s *Store) DeleteGame(ctx context.Context, gameID uuid.UUID) error { g := table.Games stmt := g.DELETE().WHERE(g.GameID.EQ(postgres.UUID(gameID))) diff --git a/tools/dev-deploy/.env.example b/tools/dev-deploy/.env.example index 73e932e..259b08c 100644 --- a/tools/dev-deploy/.env.example +++ b/tools/dev-deploy/.env.example @@ -7,12 +7,6 @@ # baked into `docker-compose.yml`, so this file documents the knobs # rather than driving them. -# Auto-provisioned sandbox bootstrap. Empty disables the bootstrap. 
-BACKEND_DEV_SANDBOX_EMAIL=dev@galaxy.lan -BACKEND_DEV_SANDBOX_ENGINE_IMAGE=galaxy-engine:dev -BACKEND_DEV_SANDBOX_ENGINE_VERSION=0.1.0 -BACKEND_DEV_SANDBOX_PLAYER_COUNT=20 - # `123456` short-circuits the email-code path for the dev account. # This is also the docker-compose default — set the variable to an # empty string here when the environment must rely on real Mailpit diff --git a/tools/dev-deploy/KNOWN-ISSUES.md b/tools/dev-deploy/KNOWN-ISSUES.md index 966ca67..42b5607 100644 --- a/tools/dev-deploy/KNOWN-ISSUES.md +++ b/tools/dev-deploy/KNOWN-ISSUES.md @@ -1,164 +1,8 @@ # `tools/dev-deploy/` — known issues -Issues that surface in the long-lived dev environment but are not yet -fixed. Each entry lists the observed symptom, the diagnostic evidence, -the working hypothesis, and the open questions that have to be -answered before a fix lands. - -## Dev Sandbox game flips to `cancelled` after a `dev-deploy` redispatch - -### Symptom - -A previously `running` "Dev Sandbox" game (created by -`backend/internal/devsandbox`) transitions to `cancelled` ~15 minutes -after a `dev-deploy.yaml` workflow_dispatch run finishes. The user's -browser session survives (the same `device_session_id` keeps working), -but the lobby shows no game because the only game it had is now -terminal. `purgeTerminalSandboxGames` does pick it up on the **next** -boot and creates a fresh sandbox — but the first redispatch leaves -the user with an empty lobby until backend restarts again. - -### Diagnostic evidence - -Backend logs from the broken cycle (timestamps abbreviated): - -```text -20:24:40 dev_sandbox: purged terminal sandbox game game_id= status=cancelled -20:24:40 dev_sandbox: memberships ensured count=20 game_id= -20:24:40 dev_sandbox: bootstrap complete user_id= game_id= status=starting -... -20:25:09 user mail sent failed (diplomail tables missing — unrelated) -... 
-20:39:40 lobby: game cancelled by runtime reconciler game_id= - op=reconcile status=removed message="container disappeared" -``` - -Between 20:24:40 (`status=starting`) and 20:39:40 (reconciler cancel) -the backend logs are silent on the runtime / engine paths — no -`engine spawned`, no `engine container started`, no `runtime -transition` lines. The reconciler then fires and reports the engine -container as missing. - -`docker ps -a --filter 'label=org.opencontainers.image.title=galaxy-game-engine'` -returns no rows during this window — the engine container is neither -running nor stopped on the host, so it either was never spawned or -was removed before the host snapshot. - -### What has been ruled out - -A live `docker inspect` on a healthy engine container shows: - -```text -Labels: galaxy.backend=1, galaxy.engine_version=0.1.0, - galaxy.game_id=, - org.opencontainers.image.title=galaxy-game-engine, - com.galaxy.{cpu_quota,memory,pids_limit} -AutoRemove: false -RestartPolicy: on-failure -NetworkMode: galaxy-dev-internal -``` - -There are no `com.docker.compose.*` labels and `AutoRemove=false`, -so `--remove-orphans` cannot reap the engine and a `--rm`-style -self-destruct is not in play. Two redispatches captured under -`docker events --filter event=create,start,die,destroy,kill,stop` -also confirmed it: across both runs the only `die` / `destroy` -events were for `galaxy-dev-{backend,api,caddy}`. The live engine -container survived both redispatches, and the reconciler that -fires 60 seconds after the new backend boots correctly matched -it through `byGameID` / `byContainerID`. - -`backend/internal/runtime/service.go` only removes engine -containers from the explicit `runStop` / `runRestart` / `runPatch` -paths. There is no `runtime.Service.Shutdown` that proactively -kills containers on backend exit, so a graceful SIGTERM to -`galaxy-dev-backend` will not touch its child engine containers. 
- -### Host-side hypotheses considered and rejected by the owner - -The natural follow-up suspects after compose was cleared — host-side -`docker prune` cron jobs, a manual `docker rm`, an out-of-band -`dockerd` restart, and an idle-state engine crash — were all -rejected by the project owner: the dev host runs none of those -periodic cleanups, no one manually removed the container, dockerd -was not restarted in the window, and the engine binary does not -crash while idling on API calls. - -### Best remaining suspicion - -Something the `dev-deploy.yaml` CI run does between successful -image builds and the final `docker compose up -d --wait ---remove-orphans` clobbers the previously-spawned engine container. -The chain at runtime contains: - -1. `docker build -t galaxy-engine:dev -f game/Dockerfile .` -2. `docker compose build galaxy-backend galaxy-api` -3. `docker run --rm` alpine for the UI volume seed -4. `docker compose up -d --wait --remove-orphans` - -None of these *should* touch an unmanaged engine container, but -the reproduction window points squarely inside this sequence. A -deliberate next reproduction with `docker events --since 0` armed -*before* the deploy starts and live for the entire job — captured -end-to-end on the dev host, not just the chunk after backend -recreate — would pin which step emits the `destroy` on the engine. - -### Update 2026-05-19: integration preclean identified as one cause - -A live reproduction during the post-merge auto-deploy cycle (Gitea -run #188 dev-deploy plus parallel run #190 integration) pinned one -clobbering source: `integration/scripts/preclean.sh` was unscoped -and removed *every* container labelled `galaxy.backend=1`, including -the dev-deploy engine. 
Timeline from the dev host: - -```text -23:10:40 backend pre-bootstrap reconciler tick: engine alive -23:10:40 dev_sandbox bootstrap: status=running -23:10:56 preclean: removing 1 backend-managed engine containers ← integration run #190 -23:11:40 reconciler: container disappeared → game cancelled -``` - -Fix landed: `BACKEND_STACK_LABEL=integration` is now passed to -every integration backend (see -`integration/testenv/backend.go`) and `preclean.sh` AND-combines -`galaxy.backend=1` with `galaxy.stack=integration`, so dev-deploy / -local-dev engines stamped with different stack values are no longer -collateral. - -This covers **push**-triggered cycles where `dev-deploy.yaml` and -`integration.yaml` run on the same Gitea host. The original -hypothesis (a `workflow_dispatch dev-deploy` solo run also losing -the engine) is *not* explained by the integration fix — manual -dispatches do not trigger `integration.yaml`. Keep this entry open -until a solo-dispatch reproduction confirms whether the symptom -still occurs. - -### Status - -Partially fixed (push-triggered cycles). Solo `workflow_dispatch` -reproductions still open. If the symptom recurs after the -integration fix lands, capture `docker events --since 0` for the -full dispatch window and attach here. - -### Workaround in use today - -When the sandbox game flips to `cancelled`, redispatch `dev-deploy`: - -```sh -curl -X POST -n -H 'Content-Type: application/json' \ - -d '{"ref":""}' \ - https://gitea.iliadenisov.ru/api/v1/repos/developer/galaxy-game/actions/workflows/dev-deploy.yaml/dispatches -``` - -The next boot's `purgeTerminalSandboxGames` removes the cancelled -row, `findOrCreateSandboxGame` creates a fresh one, and -`ensureMembershipsAndDrive` puts the new game back to `running`. - -### Owner - -Unassigned. File an issue once we have the runtime / reconciler -analysis above; reference this section in the issue body so future -redeploys can short-circuit the diagnostic loop. 
+Issues that surfaced in the long-lived dev environment. Each entry lists +the observed symptom, the diagnostic evidence, and the fix or the open +questions that have to be answered before a fix lands. ## `docker restart galaxy-dev-backend` fails after the CI runner cleans up diff --git a/tools/dev-deploy/README.md b/tools/dev-deploy/README.md index 2a7a9fe..4f66960 100644 --- a/tools/dev-deploy/README.md +++ b/tools/dev-deploy/README.md @@ -114,8 +114,7 @@ calls `make clean-data`. The same dev-mode email-code override as `tools/local-dev/` applies, and the dev-deploy compose ships with it enabled by default: -1. Enter `dev@galaxy.lan` (or whatever `BACKEND_DEV_SANDBOX_EMAIL` - resolves to) in the login form. +1. Enter your email address in the login form. 2. Submit `123456` as the code — the docker-compose default for `BACKEND_AUTH_DEV_FIXED_CODE` is `123456`, so the bcrypt-hashed email code stays a fallback. To force real Mailpit codes (e.g. for @@ -212,8 +211,7 @@ make clean-data Stop everything and wipe volumes + game-state dir ## Known issues See [`KNOWN-ISSUES.md`](KNOWN-ISSUES.md) for symptoms that surface -in the long-lived dev environment but are not yet fixed (currently: -the sandbox game flipping to `cancelled` after a redispatch). +in the long-lived dev environment but are not yet fixed. ## Deployment cadence @@ -237,12 +235,12 @@ behind. There is no separate state to clean up between the two paths. ### Engine image drift recycle -`backend` spawns one engine container per game (the long-lived "Dev -Sandbox" plus any user-created games) and the reconciler reattaches -to whatever it finds with the `galaxy.stack=dev-deploy` label. That -reattach does not check the running container's image SHA against the -freshly-built `galaxy-engine:dev` tag, so an unchanged container would -otherwise keep serving the previous engine code after a redeploy. 
+`backend` spawns one engine container per running game and the +reconciler reattaches to whatever it finds with the +`galaxy.stack=dev-deploy` label. That reattach does not check the +running container's image SHA against the freshly-built +`galaxy-engine:dev` tag, so an unchanged container would otherwise +keep serving the previous engine code after a redeploy. The `dev-deploy.yaml` workflow handles this in the `Recycle engine containers on image drift` step. When `docker build` @@ -250,9 +248,7 @@ produces a new `galaxy-engine:dev` SHA, the step compares it against every running `galaxy-game-*` container and, for each drifted one, stops the backend, removes the container, wipes its bind-mounted state directory (Engine.Init() writes turn-0 over any pre-existing -`turn-N` files), and cascade-deletes the lobby `games` row. The -`dev-sandbox` bootstrap on the next backend boot finds no live -sandbox and provisions a fresh one on the new engine image. +`turn-N` files), and cascade-deletes the lobby `games` row. When the engine sources are unchanged, the BuildKit cache hits and the SHA stays the same — the recycle step is a no-op and the running diff --git a/tools/dev-deploy/docker-compose.yml b/tools/dev-deploy/docker-compose.yml index 23e260e..02551dd 100644 --- a/tools/dev-deploy/docker-compose.yml +++ b/tools/dev-deploy/docker-compose.yml @@ -127,15 +127,6 @@ services: # bcrypt-hashed code is single-use). Set the var to an empty # string in `.env` to disable. BACKEND_AUTH_DEV_FIXED_CODE: ${BACKEND_AUTH_DEV_FIXED_CODE:-123456} - # Long-lived dev environment always bootstraps the "Dev Sandbox" - # game owned by this email so a freshly redeployed stack already - # has one ready-to-play game in the lobby. Set the variable to an - # empty string in `.env` to disable the bootstrap (e.g. for a - # cold-start QA pass). 
- BACKEND_DEV_SANDBOX_EMAIL: ${BACKEND_DEV_SANDBOX_EMAIL:-dev@galaxy.lan} - BACKEND_DEV_SANDBOX_ENGINE_IMAGE: ${BACKEND_DEV_SANDBOX_ENGINE_IMAGE:-galaxy-engine:dev} - BACKEND_DEV_SANDBOX_ENGINE_VERSION: ${BACKEND_DEV_SANDBOX_ENGINE_VERSION:-0.1.0} - BACKEND_DEV_SANDBOX_PLAYER_COUNT: ${BACKEND_DEV_SANDBOX_PLAYER_COUNT:-20} volumes: - /var/run/docker.sock:/var/run/docker.sock # Per-game state directories live under the same absolute path diff --git a/tools/local-dev/Makefile b/tools/local-dev/Makefile index 4981f23..02b1b94 100644 --- a/tools/local-dev/Makefile +++ b/tools/local-dev/Makefile @@ -22,7 +22,7 @@ help: @echo " make up Build (if needed) and bring up the stack, wait until healthy" @echo " make down Stop compose containers, leave engines + volumes intact" @echo " make rebuild Force rebuild of backend / gateway images and bring up" - @echo " make build-engine Build the engine image $(ENGINE_IMAGE) used by the dev sandbox" + @echo " make build-engine Build the engine image $(ENGINE_IMAGE) used by running games" @echo " make stop-engines Stop and remove only the per-game engine containers" @echo " make prune-broken-engines Remove non-running engine containers Docker can't heal (run inside 'up')" @echo " make clean Stop everything (incl. engines) and wipe volumes + game state" @@ -37,8 +37,9 @@ help: @echo " pnpm -C ui/frontend dev" @echo "and open http://localhost:5173 (UI) plus http://localhost:8025 (Mailpit)." @echo "" - @echo "Default login for the auto-provisioned dev sandbox: dev@local.test" - @echo "(see BACKEND_DEV_SANDBOX_EMAIL in .env). Login code: 123456." + @echo "Sign in with email-OTP; the fixed login code 123456 works when" + @echo "BACKEND_AUTH_DEV_FIXED_CODE is set in .env. No game is auto-provisioned —" + @echo "load a legacy report via the UI's DEV report loader to exercise the map." 
up: build-engine prune-broken-engines $(COMPOSE) up -d --wait @@ -88,12 +89,9 @@ stop-engines: # bind-mount source and leaves it stuck in `exited` / `created` # state. This target prunes the husks before `compose up`; the # backend's pre-bootstrap reconciler tick (`backend/cmd/backend/main.go`) -# then cascades the orphan runtime row to `removed`, the lobby -# cancels the game, and the dev-sandbox bootstrap purges the -# cancelled tile and provisions a fresh sandbox in the same -# `make up` cycle. Healthy `running` / `restarting` containers are -# left intact so a long-lived sandbox survives normal up/down -# cycles. +# then cascades the orphan runtime row to `removed` and the lobby +# cancels the game. Healthy `running` / `restarting` containers are +# left intact so a long-lived game survives normal up/down cycles. prune-broken-engines: @ids=""; \ for cid in $$(docker ps -aq \ diff --git a/tools/local-dev/README.md b/tools/local-dev/README.md index 26404db..8201568 100644 --- a/tools/local-dev/README.md +++ b/tools/local-dev/README.md @@ -78,49 +78,24 @@ To force the second path (no fast-bypass), edit `make rebuild` (or simply `docker compose up -d backend` to recreate the backend with the new env). -## Auto-provisioned dev sandbox +## No auto-provisioned game -`make up` provisions a private game called **Dev Sandbox** owned by -the dev user (default `dev@local.test`). The flow is implemented in -`backend/internal/devsandbox` and runs on every backend boot when -`BACKEND_DEV_SANDBOX_EMAIL` is non-empty in `tools/local-dev/.env`. - -Bootstrap is idempotent — re-running `make up` after a `make down` -finds the existing user, dummy participants, game, and memberships -without creating duplicates. If a previous boot crashed mid-way -(game stuck in `enrollment_open` or `ready_to_start`), the next boot -resumes the lifecycle. - -To log in straight into the sandbox: +`make up` brings up the stack with an empty lobby — there is no +auto-provisioned game. 
Sign in with email-OTP (the fixed dev code +`123456` works when `BACKEND_AUTH_DEV_FIXED_CODE` is set in +`tools/local-dev/.env`): 1. `make -C tools/local-dev up` 2. `pnpm -C ui/frontend dev` (in another terminal) -3. Open , enter `dev@local.test`, then - the dev code `123456`. -4. The lobby shows **Dev Sandbox** in *My Games*; click in. +3. Open , enter your email, then the dev + code `123456`. -To disable the bootstrap, clear `BACKEND_DEV_SANDBOX_EMAIL` in -`tools/local-dev/.env` and `docker compose up -d backend` (or -`make rebuild`). Existing users / games are not removed. - -Terminal sandbox games — anything in `cancelled`, `finished`, or -`start_failed` — are deleted on every boot before find-or-create -runs. The cascade declared in `00001_init.sql` removes the -matching memberships, applications, invites, runtime records, -and player mappings in the same write, so the dev user's lobby -shows exactly one running tile at all times. Cancelling the -sandbox manually and running `docker compose restart backend` -(or `make rebuild`) yields a fresh game without leaving dead -tiles behind. - -The bootstrap requires: -- `galaxy-engine:local-dev` Docker image (`make build-engine`). -- `BACKEND_DEV_SANDBOX_ENGINE_VERSION` parses as plain semver - (`MAJOR.MINOR.PATCH`); the default `0.1.0` is what the bootstrap - registers in the `engine_versions` row that points at the image. -- `BACKEND_DEV_SANDBOX_PLAYER_COUNT` ≥ 20 (the engine's minimum; - 19 deterministic dummies fill the slots so the single real user - can start the game). +To exercise the map and report views without running a full game, use +the UI's DEV **synthetic report loader**: convert a legacy `.REP` with +`tools/local-dev/legacy-report/` and load the resulting JSON through the +loader (see that tool's README). To play a real game, create one in the +lobby and let the engine (`galaxy-engine:local-dev`, built by +`make build-engine`) run it. 
- A frozen turn schedule (`0 0 1 1 *` — once a year) so the visible game state stays at turn 1 until you explicitly progress it. @@ -239,24 +214,15 @@ make status docker compose ps this in one cycle: `prune-broken-engines` (runs as part of `up`) removes every engine container that is not in `running` / `restarting` state, the backend's pre-bootstrap reconciler tick - cascades the orphan runtime row to `removed`, the lobby cancels - the matching sandbox game, and the dev-sandbox bootstrap purges - the cancelled tile and provisions a fresh sandbox with a brand - new state directory. To run the cleanup by hand without restarting - the rest of the stack, `make prune-broken-engines`. + cascades the orphan runtime row to `removed`, and the lobby cancels + the matching game. To run the cleanup by hand without restarting the + rest of the stack, `make prune-broken-engines`. The cycle relies on the backend image carrying the pre-bootstrap reconciler tick (`backend/cmd/backend/main.go`). `make up` reuses the cached image, so after pulling this commit the first time you must `make rebuild` once to bake the fix in. Future `make up` cycles will heal in one shot. - - If after the heal cycle the lobby still shows only a `cancelled` - sandbox tile and no running game, the running backend image - predates the pre-bootstrap reconciler tick — the periodic ticker - cancels the orphan after bootstrap has already returned, leaving - the lobby in the half-baked state. `make rebuild` recreates the - image and then `make up` lands a fresh sandbox. 
- **`make up` reports a build error mentioning `pkg/cronutil`** — upstream module list drifted; copy any new `pkg//` line into the local-dev `backend.Dockerfile` / `gateway.Dockerfile` to match diff --git a/tools/local-dev/docker-compose.yml b/tools/local-dev/docker-compose.yml index d063382..b37a509 100644 --- a/tools/local-dev/docker-compose.yml +++ b/tools/local-dev/docker-compose.yml @@ -122,10 +122,6 @@ services: BACKEND_OTEL_TRACES_EXPORTER: none BACKEND_OTEL_METRICS_EXPORTER: none BACKEND_AUTH_DEV_FIXED_CODE: ${BACKEND_AUTH_DEV_FIXED_CODE:-} - BACKEND_DEV_SANDBOX_EMAIL: ${BACKEND_DEV_SANDBOX_EMAIL:-} - BACKEND_DEV_SANDBOX_ENGINE_IMAGE: ${BACKEND_DEV_SANDBOX_ENGINE_IMAGE:-} - BACKEND_DEV_SANDBOX_ENGINE_VERSION: ${BACKEND_DEV_SANDBOX_ENGINE_VERSION:-} - BACKEND_DEV_SANDBOX_PLAYER_COUNT: ${BACKEND_DEV_SANDBOX_PLAYER_COUNT:-} volumes: - /var/run/docker.sock:/var/run/docker.sock # Per-game state directories live under the same absolute path diff --git a/ui/docs/game-state.md b/ui/docs/game-state.md index d377e2e..756419d 100644 --- a/ui/docs/game-state.md +++ b/ui/docs/game-state.md @@ -85,16 +85,16 @@ report to fetch. Two alternatives were rejected: - a brand-new `user.games.state` message — adds a full wire-flow (fbs schema, transcoder, gateway routing, backend handler) for a one-field response; -- hard-coding `turn=0` for all games — works for the dev sandbox - (which never advances past turn zero) but renders the initial - state for any real game past turn zero. +- hard-coding `turn=0` for all games — works for a synthetic report + loaded at turn zero but mis-renders the initial state for any real + game past turn zero. Extending `GameSummary` reuses the existing lobby pipeline; the backend already tracks `current_turn` in its runtime projection (`backend/internal/server/handlers_user_lobby_helpers.go` `gameSummaryToWire` reads it from `g.RuntimeSnapshot.CurrentTurn`). 
The `current_turn` field defaults to zero on the FB side, so existing -tests and the dev sandbox flow continue to work unchanged. +tests and the synthetic-report flow continue to work unchanged. ## State binding -- 2.52.0 From 225f89fad6eff6745e7ef354fccbd9323a309a2c Mon Sep 17 00:00:00 2001 From: Ilia Denisov Date: Sun, 31 May 2026 22:33:32 +0200 Subject: [PATCH 2/8] docs(ui): correct the synthetic-report loader gate comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stage 2 of the dev-as-prod-mirror rework. The legacy-report (synthetic) report loader is already available in the dev-deploy UI: it is gated by the build-time flag VITE_GALAXY_DEV_AFFORDANCES (set "true" in dev-deploy.yaml line 89, unset in prod-build.yaml so prod strips it), not by import.meta.env.DEV. Correct the stale header comment that claimed import.meta.env.DEV. No functional change — the desired "loader in dev, absent in prod" posture already holds. --- ui/frontend/src/api/synthetic-report.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ui/frontend/src/api/synthetic-report.ts b/ui/frontend/src/api/synthetic-report.ts index d4ea0e1..7d83657 100644 --- a/ui/frontend/src/api/synthetic-report.ts +++ b/ui/frontend/src/api/synthetic-report.ts @@ -1,6 +1,8 @@ // DEV-only synthetic-report loader. Backs the "Load synthetic report" -// affordance on the lobby (visible behind `import.meta.env.DEV`) and -// the in-game shell layout's bypass for the synthetic game id range. +// affordance on the lobby (visible when the build-time flag +// `VITE_GALAXY_DEV_AFFORDANCES === "true"` — the dev and dev-deploy +// bundles; stripped from prod) and the in-game shell layout's bypass +// for the synthetic game id range. // // The accepted JSON shape mirrors `pkg/model/report.Report` as // emitted by `tools/local-dev/legacy-report/cmd/legacy-report-to-json`. 
-- 2.52.0 From 7fb6a63c2b98e35ab4005973aa31f3d877a4d6c9 Mon Sep 17 00:00:00 2001 From: Ilia Denisov Date: Sun, 31 May 2026 22:44:32 +0200 Subject: [PATCH 3/8] feat(dev-deploy): relay Mailpit to Gmail (Stage 3) Keep Mailpit as the backend's SMTP submission point and turn on its relay so OTP/notification mail addressed to the owner reaches a real Gmail inbox, while everything else stays captured-only. - mailpit gains --smtp-relay-config + --smtp-relay-matching (default non-routable, so an unconfigured stack only captures); relay.conf is mounted from a new galaxy-dev-mailpit-config volume - tools/dev-deploy/mailpit/relay.conf.tmpl + a dev-deploy.yaml step that renders it from Gitea secrets (Gmail App Password, never committed) and seeds the volume; the GALAXY_DEV_MAIL_RELAY_MATCH var drives the relay-matching recipient - backend SMTP config unchanged (still -> galaxy-mailpit:1025) - dev-deploy README documents the relay + required secrets/vars Verified locally: compose config valid; the rendered relay.conf is accepted by mailpit v1.21.8 (relay + recipient-matching enabled). Real Gmail delivery is verified at the dev-deploy preview once the owner sets the secrets. 
--- .gitea/workflows/dev-deploy.yaml | 30 ++++++++++++++++++++++++ tools/dev-deploy/README.md | 30 +++++++++++++++++++++--- tools/dev-deploy/docker-compose.yml | 12 ++++++++++ tools/dev-deploy/mailpit/relay.conf.tmpl | 18 ++++++++++++++ 4 files changed, 87 insertions(+), 3 deletions(-) create mode 100644 tools/dev-deploy/mailpit/relay.conf.tmpl diff --git a/.gitea/workflows/dev-deploy.yaml b/.gitea/workflows/dev-deploy.yaml index b9b3c94..9e74fbd 100644 --- a/.gitea/workflows/dev-deploy.yaml +++ b/.gitea/workflows/dev-deploy.yaml @@ -148,6 +148,31 @@ jobs: -v "${{ gitea.workspace }}/pkg/geoip/test-data/test-data:/src:ro" \ alpine sh -c 'cp /src/GeoIP2-Country-Test.mmdb /dst/geoip.mmdb' + - name: Seed mailpit relay config + env: + GALAXY_DEV_MAIL_RELAY_USERNAME: ${{ secrets.GALAXY_DEV_MAIL_RELAY_USERNAME }} + GALAXY_DEV_MAIL_RELAY_PASSWORD: ${{ secrets.GALAXY_DEV_MAIL_RELAY_PASSWORD }} + run: | + # Render the Mailpit relay upstream config from the template, + # substituting the Gmail App Password from a Gitea secret, then + # seed it into a named volume (same rationale as the geoip seed: + # a workspace bind-mount would vanish with the runner workspace). + # The secret never lands in git or a committed file; it is + # rendered to a tmpfile outside the repo and removed after. Gmail + # App Passwords are [a-z]{16}, so the `|` sed delimiter is safe. + # When the secret is unset the creds render empty and the compose + # default relay-match is non-routable, so the stack only captures. 
+ rendered="$(mktemp)" + sed -e "s|\${GALAXY_DEV_MAIL_RELAY_USERNAME}|${GALAXY_DEV_MAIL_RELAY_USERNAME}|g" \ + -e "s|\${GALAXY_DEV_MAIL_RELAY_PASSWORD}|${GALAXY_DEV_MAIL_RELAY_PASSWORD}|g" \ + "${{ gitea.workspace }}/tools/dev-deploy/mailpit/relay.conf.tmpl" > "$rendered" + docker volume create galaxy-dev-mailpit-config >/dev/null + docker run --rm \ + -v galaxy-dev-mailpit-config:/dst \ + -v "$rendered:/src/relay.conf:ro" \ + alpine sh -c 'cp /src/relay.conf /dst/relay.conf && chmod 600 /dst/relay.conf' + rm -f "$rendered" + - name: Recycle engine containers on image drift run: | # Compare the freshly-built `galaxy-engine:dev` SHA against @@ -231,6 +256,11 @@ jobs: - name: Bring up the stack working-directory: tools/dev-deploy + env: + # Recipient regex Mailpit auto-relays to the owner's Gmail. + # Unset/empty → the compose default (non-routable) keeps the + # stack capture-only. + GALAXY_DEV_MAIL_RELAY_MATCH: ${{ vars.GALAXY_DEV_MAIL_RELAY_MATCH }} run: | # Resolve in the shell, not in YAML expressions — `env.HOME` # is empty at the workflow-evaluation stage. diff --git a/tools/dev-deploy/README.md b/tools/dev-deploy/README.md index 4f66960..2f77efa 100644 --- a/tools/dev-deploy/README.md +++ b/tools/dev-deploy/README.md @@ -117,13 +117,37 @@ and the dev-deploy compose ships with it enabled by default: 1. Enter your email address in the login form. 2. Submit `123456` as the code — the docker-compose default for `BACKEND_AUTH_DEV_FIXED_CODE` is `123456`, so the bcrypt-hashed - email code stays a fallback. To force real Mailpit codes (e.g. for - mail-flow QA), set `BACKEND_AUTH_DEV_FIXED_CODE=` (empty) in a - local `.env` and `make rebuild`. + email code stays a fallback. To force the real email code (which + Mailpit then relays to your Gmail — see **Mail** below), set + `BACKEND_AUTH_DEV_FIXED_CODE=` (empty) and redeploy. The fixed-code override is rejected by production env loaders, so it cannot leak into the prod environment. 
+## Mail + +The backend always submits mail to **Mailpit** (`galaxy-mailpit:1025`), +exactly as it would to a production SMTP server. Mailpit captures every +message in its UI (internal `:8025`) and, when configured, **relays** +the ones whose recipient matches `GALAXY_DEV_MAIL_RELAY_MATCH` up to a +real Gmail account — so an OTP addressed to you lands in your real inbox +while everything else stays captured-only. + +Configure the relay through Gitea Actions secrets/vars (never +committed); the `dev-deploy.yaml` workflow renders Mailpit's +`relay.conf` (from `tools/dev-deploy/mailpit/relay.conf.tmpl`) and seeds +it into the `galaxy-dev-mailpit-config` volume: + +| Name | Kind | Purpose | +| --- | --- | --- | +| `GALAXY_DEV_MAIL_RELAY_USERNAME` | secret | Gmail address used as the relay login + From. | +| `GALAXY_DEV_MAIL_RELAY_PASSWORD` | secret | Gmail **App Password** (requires 2FA; not the account password). | +| `GALAXY_DEV_MAIL_RELAY_MATCH` | var | Recipient regex to auto-relay (e.g. your Gmail address). Unset → capture-only. | + +With none set the stack only captures mail (the compose relay-match +defaults to a non-routable address), so it can never email third +parties. + ## Networking ``` diff --git a/tools/dev-deploy/docker-compose.yml b/tools/dev-deploy/docker-compose.yml index 02551dd..d969f34 100644 --- a/tools/dev-deploy/docker-compose.yml +++ b/tools/dev-deploy/docker-compose.yml @@ -66,10 +66,20 @@ services: image: axllent/mailpit:v1.21 container_name: galaxy-dev-mailpit restart: unless-stopped + # Mailpit is both the SMTP submission point and a relay: it captures + # every message in its UI and auto-relays the ones whose recipient + # matches GALAXY_DEV_MAIL_RELAY_MATCH to the Gmail account in the + # secret-rendered relay config. The default match is non-routable, so + # a stack brought up without the relay secret only captures, never sends. 
+ command: + - "--smtp-relay-config=/etc/mailpit/relay.conf" + - "--smtp-relay-matching=${GALAXY_DEV_MAIL_RELAY_MATCH:-nobody@invalid.example}" labels: galaxy.stack: dev-deploy networks: - galaxy-internal + volumes: + - galaxy-dev-mailpit-config:/etc/mailpit:ro healthcheck: test: ["CMD", "wget", "-q", "-O-", "http://localhost:8025/livez"] interval: 3s @@ -283,3 +293,5 @@ volumes: name: galaxy-dev-site-dist galaxy-dev-geoip-data: name: galaxy-dev-geoip-data + galaxy-dev-mailpit-config: + name: galaxy-dev-mailpit-config diff --git a/tools/dev-deploy/mailpit/relay.conf.tmpl b/tools/dev-deploy/mailpit/relay.conf.tmpl new file mode 100644 index 0000000..2c4de2e --- /dev/null +++ b/tools/dev-deploy/mailpit/relay.conf.tmpl @@ -0,0 +1,18 @@ +# Mailpit SMTP relay upstream — RENDERED AT DEPLOY TIME by +# .gitea/workflows/dev-deploy.yaml from Gitea Actions secrets, then +# seeded into the `galaxy-dev-mailpit-config` volume. The Gmail App +# Password is a secret and MUST NOT be committed: this template only +# carries ${PLACEHOLDER}s that the workflow substitutes. See +# tools/dev-deploy/README.md ("Mail"). +# +# Mailpit captures every message; the `--smtp-relay-matching` flag (set +# from GALAXY_DEV_MAIL_RELAY_MATCH in the compose) decides which +# recipients are actually relayed up to this Gmail account. 
+host: smtp.gmail.com +port: 587 +starttls: true +allow-insecure: false +auth: login +username: ${GALAXY_DEV_MAIL_RELAY_USERNAME} +password: ${GALAXY_DEV_MAIL_RELAY_PASSWORD} +return-path: ${GALAXY_DEV_MAIL_RELAY_USERNAME} -- 2.52.0 From 84a0ccb23f2b00e44be20ba9e7cc8a90d588da72 Mon Sep 17 00:00:00 2001 From: Ilia Denisov Date: Sun, 31 May 2026 23:39:06 +0200 Subject: [PATCH 4/8] feat(dev-deploy): full observability stack (Prometheus/Grafana/Loki/Tempo) Stand up a production-mirror monitoring stack in the long-lived dev contour, all on galaxy-dev-internal with no host ports (reached only via the in-repo galaxy-dev-caddy): - Prometheus scrapes backend:9100, gateway:9191, node-exporter and cadvisor (30s interval, 15d retention); Loki (7d) + promtail (Docker service discovery by the galaxy.stack=dev-deploy label) for logs; Tempo (3d) for traces. - Backend and gateway now export OTLP traces to Tempo over plaintext gRPC on the internal network (OTEL_EXPORTER_OTLP_INSECURE). - Grafana provisioned as code (Prometheus/Loki/Tempo datasources plus a starter dashboard), served under /grafana/ via Caddy sub-path mode; admin password from the GALAXY_DEV_GRAFANA_ADMIN_PASSWORD secret. - Expose the Mailpit capture UI under /mailpit/ (Caddy basic-auth + MP_WEBROOT) so every captured message is readable regardless of relay. - dev-deploy.yaml seeds the monitoring config to a stable, reboot- surviving host path and injects the Grafana admin secret. Per-service memory limits keep the footprint within budget. All collector config lives under tools/dev-deploy/monitoring/ for dev/prod parity. 
--- tools/dev-deploy/docker-compose.yml | 175 +++++++++++++++++- .../grafana/dashboards/galaxy-overview.json | 46 +++++ .../provisioning/dashboards/dashboards.yml | 12 ++ .../provisioning/datasources/datasources.yml | 22 +++ tools/dev-deploy/monitoring/loki/loki.yml | 47 +++++ .../monitoring/prometheus/prometheus.yml | 24 +++ .../monitoring/promtail/promtail.yml | 30 +++ tools/dev-deploy/monitoring/tempo/tempo.yml | 30 +++ 8 files changed, 385 insertions(+), 1 deletion(-) create mode 100644 tools/dev-deploy/monitoring/grafana/dashboards/galaxy-overview.json create mode 100644 tools/dev-deploy/monitoring/grafana/provisioning/dashboards/dashboards.yml create mode 100644 tools/dev-deploy/monitoring/grafana/provisioning/datasources/datasources.yml create mode 100644 tools/dev-deploy/monitoring/loki/loki.yml create mode 100644 tools/dev-deploy/monitoring/prometheus/prometheus.yml create mode 100644 tools/dev-deploy/monitoring/promtail/promtail.yml create mode 100644 tools/dev-deploy/monitoring/tempo/tempo.yml diff --git a/tools/dev-deploy/docker-compose.yml b/tools/dev-deploy/docker-compose.yml index d969f34..3c44fbc 100644 --- a/tools/dev-deploy/docker-compose.yml +++ b/tools/dev-deploy/docker-compose.yml @@ -74,6 +74,9 @@ services: command: - "--smtp-relay-config=/etc/mailpit/relay.conf" - "--smtp-relay-matching=${GALAXY_DEV_MAIL_RELAY_MATCH:-nobody@invalid.example}" + # Serve the capture UI under /mailpit so the host Caddy can expose it + # at https://galaxy.lan/mailpit/ (behind basic-auth); SMTP is unaffected. 
+ - "--webroot=/mailpit" labels: galaxy.stack: dev-deploy networks: @@ -118,7 +121,13 @@ services: BACKEND_NOTIFICATION_ADMIN_EMAIL: admin@galaxy.lan BACKEND_MAIL_WORKER_INTERVAL: 500ms BACKEND_NOTIFICATION_WORKER_INTERVAL: 500ms - BACKEND_OTEL_TRACES_EXPORTER: none + BACKEND_OTEL_TRACES_EXPORTER: otlp + BACKEND_OTEL_PROTOCOL: grpc + BACKEND_OTEL_ENDPOINT: "galaxy-tempo:4317" + # Tempo's OTLP receiver is plaintext on the internal network; the + # backend's gRPC exporter defaults to TLS, so disable it via the + # standard SDK env (applied on top of WithEndpoint). + OTEL_EXPORTER_OTLP_INSECURE: "true" # Prometheus metrics are enabled in dev so the `/metrics` scrape # endpoint is live and stable ahead of standing up a Prometheus + # Grafana stack on the internal network. The listener stays internal @@ -196,6 +205,12 @@ services: # the internal network — live and stable for a future scrape, not # mapped to the host. GATEWAY_ADMIN_HTTP_ADDR: ":9191" + # Traces -> Tempo over OTLP gRPC (plaintext on the internal net). 
+ OTEL_SERVICE_NAME: galaxy-gateway + OTEL_TRACES_EXPORTER: otlp + OTEL_EXPORTER_OTLP_PROTOCOL: grpc + OTEL_EXPORTER_OTLP_ENDPOINT: "http://galaxy-tempo:4317" + OTEL_EXPORTER_OTLP_INSECURE: "true" GATEWAY_BACKEND_HTTP_URL: "http://galaxy-backend:8080" GATEWAY_BACKEND_GRPC_PUSH_URL: "galaxy-backend:8081" GATEWAY_BACKEND_GATEWAY_CLIENT_ID: dev-gateway-1 @@ -264,6 +279,156 @@ services: - galaxy-internal - edge + galaxy-prometheus: + image: prom/prometheus:v2.55.1 + container_name: galaxy-dev-prometheus + restart: unless-stopped + labels: + galaxy.stack: dev-deploy + command: + - --config.file=/etc/prometheus/prometheus.yml + - --storage.tsdb.path=/prometheus + - --storage.tsdb.retention.time=15d + - --web.enable-lifecycle + volumes: + - ${GALAXY_DEV_MONITORING_DIR:-./monitoring}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - galaxy-dev-prometheus-data:/prometheus + networks: + - galaxy-internal + deploy: + resources: + limits: + memory: 384m + + galaxy-loki: + image: grafana/loki:3.3.2 + container_name: galaxy-dev-loki + restart: unless-stopped + labels: + galaxy.stack: dev-deploy + command: ["-config.file=/etc/loki/loki.yml"] + volumes: + - ${GALAXY_DEV_MONITORING_DIR:-./monitoring}/loki/loki.yml:/etc/loki/loki.yml:ro + - galaxy-dev-loki-data:/loki + networks: + - galaxy-internal + deploy: + resources: + limits: + memory: 384m + + galaxy-promtail: + image: grafana/promtail:3.3.2 + container_name: galaxy-dev-promtail + restart: unless-stopped + labels: + galaxy.stack: dev-deploy + command: ["-config.file=/etc/promtail/promtail.yml"] + volumes: + - ${GALAXY_DEV_MONITORING_DIR:-./monitoring}/promtail/promtail.yml:/etc/promtail/promtail.yml:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + networks: + - galaxy-internal + deploy: + resources: + limits: + memory: 128m + + galaxy-tempo: + image: grafana/tempo:2.7.1 + container_name: galaxy-dev-tempo + restart: unless-stopped + labels: + galaxy.stack: dev-deploy + command: 
["-config.file=/etc/tempo/tempo.yml"] + volumes: + - ${GALAXY_DEV_MONITORING_DIR:-./monitoring}/tempo/tempo.yml:/etc/tempo/tempo.yml:ro + - galaxy-dev-tempo-data:/var/tempo + networks: + - galaxy-internal + deploy: + resources: + limits: + memory: 384m + + galaxy-node-exporter: + image: prom/node-exporter:v1.8.2 + container_name: galaxy-dev-node-exporter + restart: unless-stopped + labels: + galaxy.stack: dev-deploy + command: + - --path.procfs=/host/proc + - --path.sysfs=/host/sys + - --path.rootfs=/rootfs + - --collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/) + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /:/rootfs:ro + pid: host + networks: + - galaxy-internal + deploy: + resources: + limits: + memory: 64m + + galaxy-cadvisor: + image: gcr.io/cadvisor/cadvisor:v0.49.1 + container_name: galaxy-dev-cadvisor + restart: unless-stopped + labels: + galaxy.stack: dev-deploy + command: + - --housekeeping_interval=30s + - --docker_only=true + - --store_container_labels=false + privileged: true + devices: + - /dev/kmsg + volumes: + - /:/rootfs:ro + - /var/run:/var/run:ro + - /sys:/sys:ro + - /var/lib/docker/:/var/lib/docker:ro + - /dev/disk/:/dev/disk:ro + networks: + - galaxy-internal + deploy: + resources: + limits: + memory: 256m + + galaxy-grafana: + image: grafana/grafana:11.4.0 + container_name: galaxy-dev-grafana + restart: unless-stopped + labels: + galaxy.stack: dev-deploy + depends_on: + - galaxy-prometheus + - galaxy-loki + - galaxy-tempo + environment: + GF_SECURITY_ADMIN_PASSWORD: ${GALAXY_DEV_GRAFANA_ADMIN_PASSWORD:-admin} + GF_SERVER_ROOT_URL: https://galaxy.lan/grafana/ + GF_SERVER_SERVE_FROM_SUB_PATH: "true" + GF_USERS_ALLOW_SIGN_UP: "false" + GF_ANALYTICS_REPORTING_ENABLED: "false" + GF_ANALYTICS_CHECK_FOR_UPDATES: "false" + GF_NEWS_NEWS_FEED_ENABLED: "false" + volumes: + - ${GALAXY_DEV_MONITORING_DIR:-./monitoring}/grafana/provisioning:/etc/grafana/provisioning:ro + - 
${GALAXY_DEV_MONITORING_DIR:-./monitoring}/grafana/dashboards:/var/lib/grafana/dashboards:ro + - galaxy-dev-grafana-data:/var/lib/grafana + networks: + - galaxy-internal + deploy: + resources: + limits: + memory: 256m + networks: galaxy-internal: name: galaxy-dev-internal @@ -295,3 +460,11 @@ volumes: name: galaxy-dev-geoip-data galaxy-dev-mailpit-config: name: galaxy-dev-mailpit-config + galaxy-dev-prometheus-data: + name: galaxy-dev-prometheus-data + galaxy-dev-grafana-data: + name: galaxy-dev-grafana-data + galaxy-dev-loki-data: + name: galaxy-dev-loki-data + galaxy-dev-tempo-data: + name: galaxy-dev-tempo-data diff --git a/tools/dev-deploy/monitoring/grafana/dashboards/galaxy-overview.json b/tools/dev-deploy/monitoring/grafana/dashboards/galaxy-overview.json new file mode 100644 index 0000000..845182c --- /dev/null +++ b/tools/dev-deploy/monitoring/grafana/dashboards/galaxy-overview.json @@ -0,0 +1,46 @@ +{ + "annotations": { "list": [] }, + "editable": true, + "graphTooltip": 0, + "panels": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { "unit": "reqps" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "id": 1, + "title": "Backend HTTP request rate", + "type": "timeseries", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum by (group) (rate(http_requests_total[5m]))", + "legendFormat": "{{group}}" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { "unit": "bytes" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "id": 2, + "title": "Container memory (cadvisor)", + "type": "timeseries", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum by (name) (container_memory_usage_bytes{name=~\"galaxy-dev-.*|galaxy-game-.*\"})", + "legendFormat": "{{name}}" + } + ] + } + ], + "schemaVersion": 39, + "tags": ["galaxy"], + 
"templating": { "list": [] }, + "time": { "from": "now-6h", "to": "now" }, + "timepicker": {}, + "title": "Galaxy — overview", + "uid": "galaxy-overview", + "version": 1, + "weekStart": "" +} diff --git a/tools/dev-deploy/monitoring/grafana/provisioning/dashboards/dashboards.yml b/tools/dev-deploy/monitoring/grafana/provisioning/dashboards/dashboards.yml new file mode 100644 index 0000000..daa0fe0 --- /dev/null +++ b/tools/dev-deploy/monitoring/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,12 @@ +# Grafana dashboard provider: load every JSON under the mounted +# dashboards directory at startup (provisioned as code). +apiVersion: 1 + +providers: + - name: galaxy + type: file + disableDeletion: false + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: false diff --git a/tools/dev-deploy/monitoring/grafana/provisioning/datasources/datasources.yml b/tools/dev-deploy/monitoring/grafana/provisioning/datasources/datasources.yml new file mode 100644 index 0000000..048dc55 --- /dev/null +++ b/tools/dev-deploy/monitoring/grafana/provisioning/datasources/datasources.yml @@ -0,0 +1,22 @@ +# Grafana datasources provisioned as code (dev↔prod parity). All reach +# the collectors by Docker DNS (compose service names) on +# galaxy-dev-internal. +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + uid: prometheus + url: http://galaxy-prometheus:9090 + isDefault: true + - name: Loki + type: loki + access: proxy + uid: loki + url: http://galaxy-loki:3100 + - name: Tempo + type: tempo + access: proxy + uid: tempo + url: http://galaxy-tempo:3200 diff --git a/tools/dev-deploy/monitoring/loki/loki.yml b/tools/dev-deploy/monitoring/loki/loki.yml new file mode 100644 index 0000000..b477d01 --- /dev/null +++ b/tools/dev-deploy/monitoring/loki/loki.yml @@ -0,0 +1,47 @@ +# Single-binary Loki for the dev stack: filesystem storage, in-memory +# ring, 7-day retention. Internal-only (no host port). 
+auth_enabled: false + +server: + http_listen_port: 3100 + grpc_listen_port: 9095 + log_level: warn + +common: + instance_addr: 127.0.0.1 + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + +schema_config: + configs: + - from: 2024-01-01 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +limits_config: + retention_period: 168h + reject_old_samples: true + reject_old_samples_max_age: 168h + +compactor: + working_directory: /loki/compactor + retention_enabled: true + delete_request_store: filesystem + +query_range: + results_cache: + cache: + embedded_cache: + enabled: true + max_size_mb: 64 diff --git a/tools/dev-deploy/monitoring/prometheus/prometheus.yml b/tools/dev-deploy/monitoring/prometheus/prometheus.yml new file mode 100644 index 0000000..d0ae211 --- /dev/null +++ b/tools/dev-deploy/monitoring/prometheus/prometheus.yml @@ -0,0 +1,24 @@ +# Prometheus scrape config for the dev observability stack. Retention is +# a CLI flag in the compose command, not here. Targets are reached by +# Docker DNS (compose service names) on galaxy-dev-internal; nothing is +# published to the host. 
+global: + scrape_interval: 30s + evaluation_interval: 30s + +scrape_configs: + - job_name: backend + static_configs: + - targets: ["galaxy-backend:9100"] + - job_name: gateway + static_configs: + - targets: ["galaxy-api:9191"] + - job_name: node + static_configs: + - targets: ["galaxy-node-exporter:9100"] + - job_name: cadvisor + static_configs: + - targets: ["galaxy-cadvisor:8080"] + - job_name: prometheus + static_configs: + - targets: ["localhost:9090"] diff --git a/tools/dev-deploy/monitoring/promtail/promtail.yml b/tools/dev-deploy/monitoring/promtail/promtail.yml new file mode 100644 index 0000000..369d24a --- /dev/null +++ b/tools/dev-deploy/monitoring/promtail/promtail.yml @@ -0,0 +1,30 @@ +# Promtail tails the dev stack's container logs via the Docker API +# (service discovery filtered to the galaxy.stack=dev-deploy label) and +# ships them to Loki. Requires the Docker socket mounted read-only. +server: + http_listen_port: 9080 + grpc_listen_port: 0 + log_level: warn + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://galaxy-loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 15s + filters: + - name: label + values: ["galaxy.stack=dev-deploy"] + relabel_configs: + - source_labels: ["__meta_docker_container_name"] + regex: "/?(.*)" + target_label: container + - source_labels: ["__meta_docker_container_label_galaxy_game_id"] + target_label: game_id + - source_labels: ["__meta_docker_container_log_stream"] + target_label: stream diff --git a/tools/dev-deploy/monitoring/tempo/tempo.yml b/tools/dev-deploy/monitoring/tempo/tempo.yml new file mode 100644 index 0000000..ef68f01 --- /dev/null +++ b/tools/dev-deploy/monitoring/tempo/tempo.yml @@ -0,0 +1,30 @@ +# Single-binary Tempo for the dev stack: OTLP receivers, local block +# storage, 3-day retention. Internal-only (no host port). Backend and +# gateway push traces here over OTLP gRPC (4317). 
+server: + http_listen_port: 3200 + log_level: warn + +distributor: + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +ingester: + max_block_duration: 5m + +compactor: + compaction: + block_retention: 72h + +storage: + trace: + backend: local + local: + path: /var/tempo/blocks + wal: + path: /var/tempo/wal -- 2.52.0 From e11092234c20488013a78ccaecffd9c9ae90c551 Mon Sep 17 00:00:00 2001 From: Ilia Denisov Date: Mon, 1 Jun 2026 05:46:19 +0200 Subject: [PATCH 5/8] feat(dev-deploy): expose Grafana + Mailpit UIs via Caddy; seed monitoring config Deploy wiring for the observability stack (the services and collector config landed in the previous commit): - Caddyfile.dev: route /grafana/* to galaxy-grafana:3000 (Caddy sub-path mode, Grafana keeps its own login) and /mailpit/* to galaxy-mailpit:8025 behind dev basic-auth, so the captured-mail UI (every message, relayed or not) and Grafana are reachable through the single dev origin. - dev-deploy.yaml: seed the monitoring config tree to a stable, reboot-surviving host path (GALAXY_DEV_MONITORING_DIR) before bringing the stack up, and inject the Grafana admin password from a Gitea secret (GALAXY_DEV_GRAFANA_ADMIN_PASSWORD; empty falls back to the compose default). --- .gitea/workflows/dev-deploy.yaml | 8 ++++++++ tools/dev-deploy/Caddyfile.dev | 16 ++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/.gitea/workflows/dev-deploy.yaml b/.gitea/workflows/dev-deploy.yaml index 9e74fbd..5d55451 100644 --- a/.gitea/workflows/dev-deploy.yaml +++ b/.gitea/workflows/dev-deploy.yaml @@ -261,11 +261,19 @@ jobs: # Unset/empty → the compose default (non-routable) keeps the # stack capture-only. GALAXY_DEV_MAIL_RELAY_MATCH: ${{ vars.GALAXY_DEV_MAIL_RELAY_MATCH }} + # Grafana admin password; unset/empty -> compose default 'admin'. 
+ GALAXY_DEV_GRAFANA_ADMIN_PASSWORD: ${{ secrets.GALAXY_DEV_GRAFANA_ADMIN_PASSWORD }} run: | # Resolve in the shell, not in YAML expressions — `env.HOME` # is empty at the workflow-evaluation stage. export GALAXY_DEV_GAME_STATE_DIR="$HOME/.galaxy-dev/game-state" mkdir -p "$GALAXY_DEV_GAME_STATE_DIR" + # Seed the monitoring config to a stable, reboot-surviving host + # path (compose binds \${GALAXY_DEV_MONITORING_DIR} read-only). + export GALAXY_DEV_MONITORING_DIR="$HOME/.galaxy-dev/monitoring" + rm -rf "$GALAXY_DEV_MONITORING_DIR" + mkdir -p "$GALAXY_DEV_MONITORING_DIR" + cp -r monitoring/. "$GALAXY_DEV_MONITORING_DIR/" docker compose up -d --wait --remove-orphans - name: Probe the stack diff --git a/tools/dev-deploy/Caddyfile.dev b/tools/dev-deploy/Caddyfile.dev index af25751..bd485bf 100644 --- a/tools/dev-deploy/Caddyfile.dev +++ b/tools/dev-deploy/Caddyfile.dev @@ -37,6 +37,22 @@ reverse_proxy galaxy-api:8080 } + # Grafana (observability UI) under /grafana/ — Caddy sub-path mode + # (Grafana set with GF_SERVER_SERVE_FROM_SUB_PATH); its own login. + handle /grafana/* { + reverse_proxy galaxy-grafana:3000 + } + + # Mailpit captured-mail UI under /mailpit/. Shows every message the + # backend sent (relayed or not); basic-auth (dev: gm / gm-dev-password) + # guards the OTP codes it exposes. Mailpit runs with MP_WEBROOT=/mailpit. + handle /mailpit/* { + basic_auth { + gm "$2a$14$xVh1TLaZxh8fazlKrI9Mx.NQMQlMarYWtr3FRELmZIXuac/DeeTRO" + } + reverse_proxy galaxy-mailpit:8025 + } + # Bare `/game` (no trailing slash) -> `/game/` so the SPA root # resolves before the site catch-all can claim it. handle /game { -- 2.52.0 From 45815c27d92d92dfc82fdb97992958ece2071e0c Mon Sep 17 00:00:00 2001 From: Ilia Denisov Date: Mon, 1 Jun 2026 06:11:25 +0200 Subject: [PATCH 6/8] fix(dev-deploy): probe Mailpit /mailpit/livez under MP_WEBROOT MP_WEBROOT=/mailpit prefixes every Mailpit HTTP route, including the /livez health endpoint. 
The container healthcheck still probed http://localhost:8025/livez, which now 404s, so Mailpit reported unhealthy; the backend depends_on it with condition: service_healthy and never started, cascading to the gateway and Caddy and failing `docker compose up --wait`. Point the healthcheck at /mailpit/livez. --- tools/dev-deploy/docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/dev-deploy/docker-compose.yml b/tools/dev-deploy/docker-compose.yml index 3c44fbc..d9cf40e 100644 --- a/tools/dev-deploy/docker-compose.yml +++ b/tools/dev-deploy/docker-compose.yml @@ -84,7 +84,7 @@ services: volumes: - galaxy-dev-mailpit-config:/etc/mailpit:ro healthcheck: - test: ["CMD", "wget", "-q", "-O-", "http://localhost:8025/livez"] + test: ["CMD", "wget", "-q", "-O-", "http://localhost:8025/mailpit/livez"] interval: 3s timeout: 3s retries: 30 -- 2.52.0 From cb8491c200cc3da3dceb08fef23232a870cec8e0 Mon Sep 17 00:00:00 2001 From: Ilia Denisov Date: Mon, 1 Jun 2026 06:30:15 +0200 Subject: [PATCH 7/8] feat(dev-deploy): one /_gm gate for console + Grafana + Mailpit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Consolidate the operator console and the observability / captured-mail UIs behind a single Basic Auth gate, so one password (the admin-console account, dev: gm/gm-dev-password) unlocks all three, with links in the console nav: - Caddyfile.dev: a single basic_auth on /_gm/* fronts nested routes — /_gm/grafana/ -> Grafana, /_gm/mailpit/ -> Mailpit, catch-all -> the gateway/backend console. Caddy forwards the same Authorization header, which the backend console also accepts, so there is one prompt. The former top-level /grafana/ and /mailpit/ routes are removed. - Grafana: served under /_gm/grafana/ (sub-path) as anonymous Admin with the login form and basic auth disabled, so it relies solely on the /_gm gate and ignores the forwarded credentials. 
- Mailpit: MP_WEBROOT=/_gm/mailpit (and the healthcheck path) so its UI lives under the gate. - Operator console: add Grafana and Mailpit links to the nav. --- .../adminconsole/templates/layout.gohtml | 2 + tools/dev-deploy/Caddyfile.dev | 40 +++++++++++-------- tools/dev-deploy/docker-compose.yml | 18 ++++++--- 3 files changed, 38 insertions(+), 22 deletions(-) diff --git a/backend/internal/adminconsole/templates/layout.gohtml b/backend/internal/adminconsole/templates/layout.gohtml index 8634190..8dd6676 100644 --- a/backend/internal/adminconsole/templates/layout.gohtml +++ b/backend/internal/adminconsole/templates/layout.gohtml @@ -17,6 +17,8 @@ Games Operators Mail + Grafana + Mailpit {{.Username}} diff --git a/tools/dev-deploy/Caddyfile.dev b/tools/dev-deploy/Caddyfile.dev index bd485bf..462b9fe 100644 --- a/tools/dev-deploy/Caddyfile.dev +++ b/tools/dev-deploy/Caddyfile.dev @@ -29,28 +29,34 @@ reverse_proxy galaxy-api:8080 } - # Operator console. Shares the gateway public listener with `/api`; the - # gateway applies the admin anti-abuse class and reverse-proxies to the - # backend `/_gm` surface, which enforces Basic Auth and renders the pages. + # Operator console + observability behind one Basic Auth gate. The gate + # credential equals the admin-console account (dev: gm / gm-dev-password), + # so Caddy forwards the same Authorization header to the backend `/_gm` + # surface (its own Basic Auth) and to Grafana/Mailpit — one prompt covers + # all three. The gateway applies the admin anti-abuse class to the console. @gm path /_gm /_gm/* handle @gm { - reverse_proxy galaxy-api:8080 - } - - # Grafana (observability UI) under /grafana/ — Caddy sub-path mode - # (Grafana set with GF_SERVER_SERVE_FROM_SUB_PATH); its own login. - handle /grafana/* { - reverse_proxy galaxy-grafana:3000 - } - - # Mailpit captured-mail UI under /mailpit/. 
Shows every message the - # backend sent (relayed or not); basic-auth (dev: gm / gm-dev-password) - # guards the OTP codes it exposes. Mailpit runs with MP_WEBROOT=/mailpit. - handle /mailpit/* { basic_auth { gm "$2a$14$xVh1TLaZxh8fazlKrI9Mx.NQMQlMarYWtr3FRELmZIXuac/DeeTRO" } - reverse_proxy galaxy-mailpit:8025 + + # Grafana under /_gm/grafana/ (sub-path mode; anonymous Admin, so the + # /_gm gate is the only barrier — GF_AUTH_BASIC_ENABLED=false makes it + # ignore the forwarded Authorization header). + handle /_gm/grafana/* { + reverse_proxy galaxy-grafana:3000 + } + + # Mailpit captured-mail UI under /_gm/mailpit/ (MP_WEBROOT). Shows + # every message the backend sent, relayed or not. + handle /_gm/mailpit/* { + reverse_proxy galaxy-mailpit:8025 + } + + # The operator console itself (gateway -> backend /_gm surface). + handle { + reverse_proxy galaxy-api:8080 + } } # Bare `/game` (no trailing slash) -> `/game/` so the SPA root diff --git a/tools/dev-deploy/docker-compose.yml b/tools/dev-deploy/docker-compose.yml index d9cf40e..cdb647d 100644 --- a/tools/dev-deploy/docker-compose.yml +++ b/tools/dev-deploy/docker-compose.yml @@ -74,9 +74,10 @@ services: command: - "--smtp-relay-config=/etc/mailpit/relay.conf" - "--smtp-relay-matching=${GALAXY_DEV_MAIL_RELAY_MATCH:-nobody@invalid.example}" - # Serve the capture UI under /mailpit so the host Caddy can expose it - # at https://galaxy.lan/mailpit/ (behind basic-auth); SMTP is unaffected. - - "--webroot=/mailpit" + # Serve the capture UI under /_gm/mailpit so the host Caddy can expose + # it at https://galaxy.lan/_gm/mailpit/ behind the shared /_gm gate; + # SMTP is unaffected. 
+ - "--webroot=/_gm/mailpit" labels: galaxy.stack: dev-deploy networks: @@ -84,7 +85,7 @@ services: volumes: - galaxy-dev-mailpit-config:/etc/mailpit:ro healthcheck: - test: ["CMD", "wget", "-q", "-O-", "http://localhost:8025/mailpit/livez"] + test: ["CMD", "wget", "-q", "-O-", "http://localhost:8025/_gm/mailpit/livez"] interval: 3s timeout: 3s retries: 30 @@ -412,8 +413,15 @@ services: - galaxy-tempo environment: GF_SECURITY_ADMIN_PASSWORD: ${GALAXY_DEV_GRAFANA_ADMIN_PASSWORD:-admin} - GF_SERVER_ROOT_URL: https://galaxy.lan/grafana/ + GF_SERVER_ROOT_URL: https://galaxy.lan/_gm/grafana/ GF_SERVER_SERVE_FROM_SUB_PATH: "true" + # No own login: the /_gm Basic Auth gate is the only barrier, so + # serve everyone as anonymous Admin and ignore the forwarded + # Authorization header (basic auth off, login form off). + GF_AUTH_ANONYMOUS_ENABLED: "true" + GF_AUTH_ANONYMOUS_ORG_ROLE: Admin + GF_AUTH_DISABLE_LOGIN_FORM: "true" + GF_AUTH_BASIC_ENABLED: "false" GF_USERS_ALLOW_SIGN_UP: "false" GF_ANALYTICS_REPORTING_ENABLED: "false" GF_ANALYTICS_CHECK_FOR_UPDATES: "false" -- 2.52.0 From 814eae0802f383400e674f2eaeb2039b4b953848 Mon Sep 17 00:00:00 2001 From: Ilia Denisov Date: Mon, 1 Jun 2026 06:37:24 +0200 Subject: [PATCH 8/8] docs: observability stack + the single /_gm gate for Grafana/Mailpit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ARCHITECTURE §17: the dev (production-mirror) collection stack (Prometheus / Loki / Tempo / promtail / node-exporter / cAdvisor) and the single /_gm Basic Auth gate fronting Grafana and the Mailpit UI. - tools/dev-deploy/monitoring/README.md (new): services, what is collected, Grafana-behind-the-gate access, config delivery, tuning. - tools/dev-deploy/README.md: an Observability section; the Mailpit UI under /_gm/mailpit/; Networking diagram and Files list updated. 
- FUNCTIONAL §10.2.1 (+ ru mirror): the operator console nav links to Grafana and Mailpit under the same /_gm gate, one sign-in for all. --- docs/ARCHITECTURE.md | 13 +++++ docs/FUNCTIONAL.md | 5 +- docs/FUNCTIONAL_ru.md | 4 +- tools/dev-deploy/README.md | 46 ++++++++++++++-- tools/dev-deploy/monitoring/README.md | 77 +++++++++++++++++++++++++++ 5 files changed, 140 insertions(+), 5 deletions(-) create mode 100644 tools/dev-deploy/monitoring/README.md diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index a771037..a7b9a8c 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -888,6 +888,19 @@ addition. - Health probes are unauthenticated `GET /healthz` (process liveness) and `GET /readyz` (Postgres reachable, migrations applied, gRPC listener bound). Probes are excluded from anti-replay and rate limiting. +- **Collection (dev, production mirror).** The long-lived dev environment + (`tools/dev-deploy/`) runs a full metrics + logs + traces stack on its + internal network with no host ports: Prometheus scrapes the backend + (`:9100`) and gateway (`:9191`) endpoints plus `node-exporter` and + cAdvisor; Tempo ingests OTLP traces from backend and gateway; Loki + stores container logs shipped by promtail (Docker service-discovery on + the `galaxy.stack=dev-deploy` label). Grafana (provisioned datasources + + dashboards) and the Mailpit capture UI are reached only through the + operator console's single `/_gm` Basic Auth gate (§14.1) — at + `/_gm/grafana/` and `/_gm/mailpit/` — so one password covers the + console and both UIs. Retention is tuned small (Prometheus 15d, Loki + 7d, Tempo 3d). The same compose fragment is meant to back production. + See `tools/dev-deploy/monitoring/README.md`. ## 18. 
CI and Environments diff --git a/docs/FUNCTIONAL.md b/docs/FUNCTIONAL.md index d9b22cf..40e8c0b 100644 --- a/docs/FUNCTIONAL.md +++ b/docs/FUNCTIONAL.md @@ -1182,7 +1182,10 @@ The console landing page is a dashboard that summarises operational health: whether the backend is ready and the database reachable, how many game runtimes sit in each state, and the depth of the mail and notification queues. It is a read-only point-in-time view for quick -triage, not a metrics history. +triage, not a metrics history. The console nav also links to Grafana +(metrics, logs and traces) and the Mailpit capture UI, which the +deployment serves under the same `/_gm` Basic Auth gate — one sign-in +covers the console and both UIs. ### 10.3 Admin account management diff --git a/docs/FUNCTIONAL_ru.md b/docs/FUNCTIONAL_ru.md index a81f6d7..0e8b2eb 100644 --- a/docs/FUNCTIONAL_ru.md +++ b/docs/FUNCTIONAL_ru.md @@ -1218,7 +1218,9 @@ admin-API, либо через серверно-рендеримую веб-ко здоровье: готов ли backend и доступна ли БД, сколько игровых рантаймов в каждом состоянии, какова глубина очередей почты и уведомлений. Это read-only-срез на текущий момент для быстрой диагностики, не история -метрик. +метрик. Навигация консоли также ведёт в Grafana (метрики, логи и +трейсы) и в UI захвата почты Mailpit, которые деплой отдаёт под тем же +шлюзом Basic Auth `/_gm` — один вход покрывает консоль и оба UI. ### 10.3 Управление admin-аккаунтами diff --git a/tools/dev-deploy/README.md b/tools/dev-deploy/README.md index 2f77efa..ff19718 100644 --- a/tools/dev-deploy/README.md +++ b/tools/dev-deploy/README.md @@ -148,6 +148,38 @@ With none set the stack only captures mail (the compose relay-match defaults to a non-routable address), so it can never email third parties. +The capture UI is exposed through the operator console's `/_gm` gate at +[`/_gm/mailpit/`](https://galaxy.lan/_gm/mailpit/) — one Basic Auth for +the console, Grafana and Mailpit (see **Observability**). 
It shows +**every** message the backend sent, relayed or not, so you can read any +account's OTP regardless of the relay-match. For multi-account testing: +register several `you+tag@gmail.com` aliases and widen the match to a +regex such as `^you(\+[^@]+)?@gmail\.com$` (Gmail folds every `+tag` +into one inbox), or just read the codes in the Mailpit UI, or skip mail +entirely with the `123456` dev-code. + +## Observability + +A full metrics + logs + traces stack runs alongside the app on the +internal network (no host ports), as a production mirror. **Grafana** +and the **Mailpit** UI are reached only through the operator console's +single `/_gm` Basic Auth gate — one password (the admin-console account) +unlocks the console, [`/_gm/grafana/`](https://galaxy.lan/_gm/grafana/) +and [`/_gm/mailpit/`](https://galaxy.lan/_gm/mailpit/), with links in the +console nav. Grafana runs anonymous-Admin behind the gate (no own +login); Prometheus, Loki and Tempo stay internal-only. + +- **Metrics** — Prometheus scrapes backend, gateway, `node-exporter` and + cAdvisor. +- **Logs** — promtail → Loki (Docker SD on the `galaxy.stack=dev-deploy` + label). +- **Traces** — backend + gateway → Tempo over OTLP. + +Grafana's admin user is seeded from `GALAXY_DEV_GRAFANA_ADMIN_PASSWORD` +(for provisioning/API; the UI needs no Grafana login). See +[`monitoring/README.md`](monitoring/README.md) for services, configs and +tuning knobs. 
+ ## Networking ``` @@ -162,6 +194,8 @@ galaxy-caddy (networks: edge + galaxy-dev-internal) │ /game/* -> file_server /srv/galaxy-ui (volume galaxy-dev-ui-dist) │ /api/*, /healthz -> reverse_proxy galaxy-api:8080 │ /rpc/* -> reverse_proxy galaxy-api:9090 (strips /rpc) + │ /_gm, /_gm/* -> reverse_proxy galaxy-api:8080 (Basic Auth gate; + │ /_gm/grafana/ -> grafana, /_gm/mailpit/ -> mailpit) ▼ galaxy-dev-internal ├─ galaxy-api (gateway: :8080 REST, :9090 gRPC) @@ -169,7 +203,9 @@ galaxy-dev-internal ├─ galaxy-postgres (postgres: :5432) ├─ galaxy-redis (redis: :6379) ├─ galaxy-mailpit (mailpit: :8025 UI, :1025 SMTP) - └─ engine containers (spawned by backend on demand) + ├─ engine containers (spawned by backend on demand) + └─ observability (prometheus, grafana, loki, promtail, tempo, + node-exporter, cadvisor) ``` The compose project deliberately exposes no host ports. Diagnostics @@ -214,8 +250,10 @@ make clean-data Stop everything and wipe volumes + game-state dir ## Files -- `docker-compose.yml` — six services: postgres, redis, mailpit, - galaxy-backend, galaxy-api, galaxy-caddy. `galaxy-caddy` mounts both +- `docker-compose.yml` — the application services (postgres, redis, + mailpit, galaxy-backend, galaxy-api, galaxy-caddy) plus the + observability stack (prometheus, grafana, loki, promtail, tempo, + node-exporter, cadvisor). `galaxy-caddy` mounts both the `galaxy-dev-site-dist` (`/srv/galaxy-site`) and `galaxy-dev-ui-dist` (`/srv/galaxy-ui`) volumes and reverse-proxies both gateway tiers (REST/health on `:8080`, Connect/gRPC-web on @@ -227,6 +265,8 @@ make clean-data Stop everything and wipe volumes + game-state dir at `/etc/caddy/Caddyfile`. - `Caddyfile.prod` — placeholder for a future prod deployment; not used by this compose. +- `monitoring/` — Prometheus / Loki / promtail / Tempo / Grafana + configuration, provisioned as code; see `monitoring/README.md`. 
- `Makefile` — wrapper over `docker compose` with helpers for engine, site/UI seeding, health probes, and full wipe. - `.env.example` — non-secret defaults for the compose `${VAR:-}` diff --git a/tools/dev-deploy/monitoring/README.md b/tools/dev-deploy/monitoring/README.md new file mode 100644 index 0000000..3ebb465 --- /dev/null +++ b/tools/dev-deploy/monitoring/README.md @@ -0,0 +1,77 @@ +# `tools/dev-deploy/monitoring/` — observability stack + +The long-lived dev environment runs a full metrics + logs + traces stack +alongside the application as a **production mirror**: the same compose +fragment and collector configs are meant to back production later. Every +collector lives on the internal `galaxy-dev-internal` network and +publishes **no host port**. The browser-reachable pieces (Grafana and +the Mailpit UI) sit behind the operator console's single `/_gm` Basic +Auth gate — see [`../README.md`](../README.md) and `ARCHITECTURE.md §14`. + +## Services + +| Service | Image | Role | Reachable | +| --- | --- | --- | --- | +| `galaxy-prometheus` | `prom/prometheus` | Scrape + store metrics (15d) | internal `:9090` | +| `galaxy-loki` | `grafana/loki` | Log store (7d) | internal `:3100` | +| `galaxy-promtail` | `grafana/promtail` | Ship container logs to Loki | — | +| `galaxy-tempo` | `grafana/tempo` | Trace store (3d), OTLP receiver | internal `:3200`, OTLP `:4317`/`:4318` | +| `galaxy-node-exporter` | `prom/node-exporter` | Host metrics | internal `:9100` | +| `galaxy-cadvisor` | `cadvisor` | Per-container CPU/memory/IO | internal `:8080` | +| `galaxy-grafana` | `grafana/grafana` | Dashboards + Explore | Caddy `/_gm/grafana/` | + +## What is collected + +- **Metrics.** Prometheus (30s interval) scrapes the backend Prometheus + endpoint (`galaxy-backend:9100`), the gateway admin endpoint + (`galaxy-api:9191`), `node-exporter` (host) and cAdvisor (per + container). Engine containers expose no `/metrics`; cAdvisor covers + their resource use. 
+- **Logs.** promtail discovers containers through the Docker API, + filtered to the `galaxy.stack=dev-deploy` label, and ships their + stdout/stderr to Loki labelled by `container`. +- **Traces.** backend and gateway export OTLP traces over gRPC to Tempo + (`galaxy-tempo:4317`), plaintext on the internal network + (`OTEL_EXPORTER_OTLP_INSECURE=true`, since Tempo's receiver is not + TLS-wrapped inside the contour). + +## Grafana access (behind the `/_gm` gate) + +Grafana is served under `/_gm/grafana/` (`GF_SERVER_ROOT_URL` + +`GF_SERVER_SERVE_FROM_SUB_PATH=true`) **behind the shared operator gate**: +the Caddy `/_gm/*` Basic Auth (the admin-console account) is the only +barrier. Grafana itself runs as **anonymous Admin** with its login form +and basic auth disabled (`GF_AUTH_ANONYMOUS_ENABLED=true`, +`GF_AUTH_ANONYMOUS_ORG_ROLE=Admin`, `GF_AUTH_DISABLE_LOGIN_FORM=true`, +`GF_AUTH_BASIC_ENABLED=false`), so it ignores the forwarded credentials +and asks for no second password. `GALAXY_DEV_GRAFANA_ADMIN_PASSWORD` +still seeds the admin user for provisioning/API use. + +Datasources (Prometheus, Loki, Tempo) and a starter dashboard +(`grafana/dashboards/galaxy-overview.json`) are provisioned as code under +`grafana/provisioning/`. + +## Config delivery + +`dev-deploy.yaml` copies this directory to a stable host path +(`$HOME/.galaxy-dev/monitoring`, exported as `GALAXY_DEV_MONITORING_DIR`) +before `compose up`, and the compose binds it read-only into the +collectors. A stable path — not the ephemeral CI workspace — keeps the +mounts valid across container restarts and host reboots (the same lesson +as the geoip volume; see `../KNOWN-ISSUES.md`). + +## Tuning (cost knobs) + +Defaults favour the smallest workable footprint; all are config/compose +values: + +- Prometheus `scrape_interval=30s`, `--storage.tsdb.retention.time=15d`. +- Loki `retention_period=168h` (7d); Tempo `block_retention=72h` (3d). +- cAdvisor `--housekeeping_interval=30s`. 
+- Per-service `deploy.resources.limits.memory` caps (~1.8 GB total cap; + steady-state well under that). + +Seven always-on containers cost roughly 1.1 GB steady RAM and +~1.5–2.5 GB disk at these retention windows. cAdvisor is the main CPU +cost; on a constrained host it can be dropped (host + app metrics still +cover most needs). -- 2.52.0