diff --git a/.gitea/workflows/ci.yaml b/.gitea/workflows/ci.yaml index 3283130..00df876 100644 --- a/.gitea/workflows/ci.yaml +++ b/.gitea/workflows/ci.yaml @@ -298,17 +298,21 @@ jobs: # pick up the fresh config. docker compose --ansi never up -d --force-recreate --no-deps caddy otelcol prometheus tempo grafana - - name: Probe the gateway through caddy + - name: Probe the landing and the gateway through caddy run: | set -u + # Two probes through the contour caddy (R3 split): "/" is the static + # landing container, "/app/" is the gateway-served SPA shell. for i in $(seq 1 20); do - if docker run --rm --network edge alpine:3.20 wget -q -T 5 -O /dev/null http://scrabble/; then - echo "healthy: GET http://scrabble/" + if docker run --rm --network edge alpine:3.20 wget -q -T 5 -O /dev/null http://scrabble/ && + docker run --rm --network edge alpine:3.20 wget -q -T 5 -O /dev/null http://scrabble/app/; then + echo "healthy: GET http://scrabble/ (landing) + /app/ (gateway)" exit 0 fi sleep 3 done - echo "probe failed; recent gateway logs:" + echo "probe failed; recent landing + gateway logs:" + docker logs --tail 50 scrabble-landing || true docker logs --tail 50 scrabble-gateway || true exit 1 diff --git a/.gitignore b/.gitignore index 259564b..912d4c2 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,6 @@ # Local, unstaged env overrides **/.env.local **/.env.*.local + +# Claude Code harness runtime artifacts +.claude/scheduled_tasks.lock diff --git a/CLAUDE.md b/CLAUDE.md index ba32697..28a1502 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -127,8 +127,8 @@ docs/ .gitea/workflows/ PLAN.md CLAUDE.md README.md gateway/ ui/ pkg/ # added by their stages platform/telegram/ # Telegram connector side-service (Stage 9): bot + gRPC API loadtest/ # module scrabble/loadtest: the pre-release stress harness (R2) -backend/Dockerfile gateway/Dockerfile platform/telegram/Dockerfile loadtest/Dockerfile # multi-stage distroless (Stage 16; loadtest R2) -deploy/ # docker-compose + caddy + otelcol/prometheus/tempo/grafana (+ cAdvisor/postgres_exporter, R2) +backend/Dockerfile gateway/Dockerfile platform/telegram/Dockerfile loadtest/Dockerfile # multi-stage distroless (Stage 16; loadtest R2); gateway/Dockerfile also has the `landing` target (R3) +deploy/ # docker-compose + caddy + landing + otelcol/prometheus/tempo/grafana (+ cAdvisor/postgres_exporter, R2) ``` ## Build & test @@ -144,8 +144,9 @@ go run ./backend/cmd/backend # /healthz, /readyz on :8080 cd ui && pnpm install && pnpm check && pnpm test:unit && pnpm build # the UI (Stage 7+) pnpm start # UI mock mode: lobby -> game, no backend -docker build -f backend/Dockerfile -t scrabble-backend . # images (Stage 16); gateway embeds the UI -docker build -f gateway/Dockerfile -t scrabble-gateway . +docker build -f backend/Dockerfile -t scrabble-backend . # images (Stage 16); gateway embeds the SPA +docker build -f gateway/Dockerfile --target gateway -t scrabble-gateway . +docker build -f gateway/Dockerfile --target landing -t scrabble-landing . # static landing (R3) docker compose -f deploy/docker-compose.yml config # validate the full contour ``` diff --git a/PRERELEASE.md b/PRERELEASE.md index 46cf842..51a2c75 100644 --- a/PRERELEASE.md +++ b/PRERELEASE.md @@ -19,7 +19,7 @@ the edge before prod. Each phase maps back to the owner's raw pre-release TODO l |---|-------|-----------|--------| | R1 | Schema & naming reset | 1 + 10 | **done** | | R2 | Stress harness + contour observability + early run | 9a | **done** | -| R3 | Edge hardening | 2 + 8 + 3 | todo | +| R3 | Edge hardening | 2 + 8 + 3 | **done** | | R4 | Push enrichment + kill the last poll | 4 + 5 | todo | | R5 | Bundle slimming | 6 | todo | | R6 | Refactor + docs reconciliation + de-staging | 7 | todo | @@ -253,3 +253,31 @@ Then Stage 18. it feeds R3 (h2c `MaxConcurrentStreams`/timeouts, body-size cap), R6 and R7 (per-player transports, separate hardware, pool/limit sizing). - **CI:** `./loadtest/...` added to the path filter + vet/build/test; `go.work.sum` carries the new deps. + +- **R3** (interview + implementation): + - **Locked decisions:** the flag column lands by **editing the R1 baseline** (+ a contour schema + wipe after merge — no migration chain accrues before prod); auto-flag defaults **1000 rejected / + 10 min** (`BACKEND_HIGHRATE_FLAG_THRESHOLD`/`_WINDOW`, rolling window, set-once, operator clears, + no auto-ban); landing image = **caddy:2-alpine**; throttle data flows **gateway → backend** (a + 30 s per-key summary POST to the new `/api/v1/internal/ratelimit/report`, the existing trusted + direction) with the episode window + flag rule in the backend (`internal/ratewatch`); rejection + logging = **Warn summary per key per window + Debug per rejection** — a deliberate deviation from + the phase's "structured log per rejection" (the R2 hammer would have logged ~522k lines in + minutes); all three R2-report tails included (explicit h2c sizing, the session-resolve failure + cause at Warn, reviving the admin limiter). + - **Body cap:** `GATEWAY_MAX_BODY_BYTES` (default 1 MiB) as both the Connect per-message read limit + and an `http.MaxBytesReader` wrap of the public mux; an oversized Execute is `resource_exhausted`. + - **Dead config found:** `AdminPerMinute`/`AdminBurst` were never wired — the gateway `/_gm` mount is + now 429-guarded per IP ahead of its Basic-Auth. The caddy-fronted contour path stays unlimited + (stock caddy has no limiter) — an accepted gap, recorded in `docs/ARCHITECTURE.md` §12. + - **Landing split:** a `landing` target in `gateway/Dockerfile` (the UI build stage is shared; + identical compose build args keep it one cached build); the gateway drops `landing.html` from the + embed and 308-redirects `/` → `/app/`; the contour caddy routes `/app/`, `/telegram/` and the + Connect path to the gateway and the catch-all to the landing container; the CI deploy probe now + checks both `/` (landing) and `/app/` (gateway). + - **Observability:** `gateway_rate_limited_total{class}` (user/public/email/admin, aggregate-only) + + a rate-vs-rejections panel on the Edge/UX dashboard; the admin console gains the **Throttled** + page (the in-memory episode window, reset-on-restart like `active_users`, plus the flagged-account + queue) and the flag badge / clear action on the user list / card. + - The jet regen also restored the previously missing `game_drafts`/`game_hidden` generated models + (their tables were added after the last jetgen run; no behaviour change). diff --git a/backend/README.md b/backend/README.md index 03453ce..724642f 100644 --- a/backend/README.md +++ b/backend/README.md @@ -99,6 +99,14 @@ durable owner — then the durable account wins and a fresh session is minted fo The `accounts.paid_account`/`merged_into`/`merged_at` columns back this. This supersedes the Stage 8 `email.bind.*` edge surface (the `RequestCode`/`ConfirmCode` primitives stay). +**R3** adds rate-limit observability: the gateway posts its periodic rejection +summaries to `POST /api/v1/internal/ratelimit/report`; `internal/ratewatch` keeps a +bounded in-memory episode window for the console's **Throttled** page and applies the +conservative auto-flag — an account sustaining `BACKEND_HIGHRATE_FLAG_THRESHOLD` +rejected calls within `BACKEND_HIGHRATE_FLAG_WINDOW` gets the soft, reversible +`accounts.flagged_high_rate_at` marker (set-once; a badge in the user list and a +**Clear** action on the user card; never an automatic ban). + ## Package layout ``` @@ -121,6 +129,7 @@ internal/lobby/ # in-memory matchmaking pool (+ robot substitution) + frien internal/robot/ # human-like robot opponent: account pool, seed-derived strategy, move driver internal/adminconsole/ # server-rendered admin console (Go templates + embedded CSS, view models), served at /_gm internal/connector/ # backend gRPC client to the Telegram connector (operator broadcasts) +internal/ratewatch/ # gateway rate-limit reports: episode window for the console + the high-rate auto-flag (R3) ``` ## Configuration (environment) @@ -153,6 +162,8 @@ internal/connector/ # backend gRPC client to the Telegram connector (operator b | `BACKEND_CONNECTOR_ADDR` | — | Telegram connector gRPC address for admin-console operator broadcasts. Empty disables broadcasts. | | `BACKEND_GUEST_REAP_INTERVAL` | `1h` | How often the abandoned-guest reaper sweeps. | | `BACKEND_GUEST_RETENTION` | `720h` | Account age past which a guest with no game seat is deleted. | +| `BACKEND_HIGHRATE_FLAG_THRESHOLD` | `1000` | Gateway-reported rejected calls within the window past which an account is soft-flagged (R3). | +| `BACKEND_HIGHRATE_FLAG_WINDOW` | `10m` | The rolling window those rejections accumulate over. | ## Run diff --git a/backend/cmd/backend/main.go b/backend/cmd/backend/main.go index 0fca27e..0854508 100644 --- a/backend/cmd/backend/main.go +++ b/backend/cmd/backend/main.go @@ -28,6 +28,7 @@ import ( "scrabble/backend/internal/notify" "scrabble/backend/internal/postgres" "scrabble/backend/internal/pushgrpc" + "scrabble/backend/internal/ratewatch" "scrabble/backend/internal/robot" "scrabble/backend/internal/server" "scrabble/backend/internal/session" @@ -177,6 +178,13 @@ func run(ctx context.Context, cfg config.Config, logger *zap.Logger) error { invitations.SetNotifier(hub) logger.Info("lobby and social domains ready", zap.Duration("robot_wait", cfg.Lobby.RobotWait)) + // R3 rate-limit observability: ingest the gateway's rejection reports for the + // admin throttled view and the conservative high-rate auto-flag. + rateWatch := ratewatch.New(cfg.RateWatch, accounts, logger) + logger.Info("rate watch ready", + zap.Int("flag_threshold", cfg.RateWatch.FlagThreshold), + zap.Duration("flag_window", cfg.RateWatch.FlagWindow)) + srv := server.New(cfg.HTTPAddr, server.Deps{ Logger: logger, DB: db, @@ -193,6 +201,7 @@ func run(ctx context.Context, cfg config.Config, logger *zap.Logger) error { Registry: registry, DictDir: cfg.Game.DictDir, Connector: conn, + RateWatch: rateWatch, }) pushSrv := pushgrpc.NewServer(cfg.GRPCAddr, hub, logger) diff --git a/backend/internal/account/account.go b/backend/internal/account/account.go index 8fee67a..975484a 100644 --- a/backend/internal/account/account.go +++ b/backend/internal/account/account.go @@ -76,8 +76,13 @@ type Account struct { // uuid.Nil for a live account. A tombstone keeps the row so the no-cascade // foreign keys of a shared finished game stay valid (Stage 11). MergedInto uuid.UUID - CreatedAt time.Time - UpdatedAt time.Time + // FlaggedHighRateAt is the soft, reversible "suspected high-rate" marker: the + // zero time for an unflagged account, otherwise when the gateway-reported + // rate-limiter rejections first crossed the sustained threshold (R3). An + // operator clears it in the admin console; it never gates any request. + FlaggedHighRateAt time.Time + CreatedAt time.Time + UpdatedAt time.Time } // Identity is one of an account's platform/email identities, surfaced on the @@ -422,6 +427,43 @@ func (s *Store) SpendHint(ctx context.Context, id uuid.UUID) (bool, error) { return n > 0, nil } +// FlagHighRate stamps the soft "suspected high-rate" marker with at, only when +// the account is not already flagged — the first sustained episode wins, and a +// re-flag after an operator clear starts a fresh timestamp. An infra marker, not +// a profile edit, so updated_at is untouched; it never gates any request (R3). +// It reports whether the flag was newly set. +func (s *Store) FlagHighRate(ctx context.Context, id uuid.UUID, at time.Time) (bool, error) { + stmt := table.Accounts. + UPDATE(table.Accounts.FlaggedHighRateAt). + SET(postgres.TimestampzT(at.UTC())). + WHERE( + table.Accounts.AccountID.EQ(postgres.UUID(id)). + AND(table.Accounts.FlaggedHighRateAt.IS_NULL()), + ) + res, err := stmt.ExecContext(ctx, s.db) + if err != nil { + return false, fmt.Errorf("account: flag high rate %s: %w", id, err) + } + n, err := res.RowsAffected() + if err != nil { + return false, fmt.Errorf("account: flag high rate rows %s: %w", id, err) + } + return n > 0, nil +} + +// ClearHighRateFlag removes the high-rate marker — the operator's reversible +// action in the admin console. Clearing an unflagged account is a no-op. +func (s *Store) ClearHighRateFlag(ctx context.Context, id uuid.UUID) error { + stmt := table.Accounts. + UPDATE(table.Accounts.FlaggedHighRateAt). + SET(postgres.NULL). + WHERE(table.Accounts.AccountID.EQ(postgres.UUID(id))) + if _, err := stmt.ExecContext(ctx, s.db); err != nil { + return fmt.Errorf("account: clear high-rate flag %s: %w", id, err) + } + return nil +} + // SetServiceLanguage records the service language (en/ru) of the bot a Telegram // user authenticated through. It is called on every Telegram login — new and // existing accounts — so it tracks the bot the user last came through (last-login- @@ -452,6 +494,10 @@ func modelToAccount(row model.Accounts) Account { if row.ServiceLanguage != nil { serviceLanguage = *row.ServiceLanguage } + var flaggedHighRateAt time.Time + if row.FlaggedHighRateAt != nil { + flaggedHighRateAt = *row.FlaggedHighRateAt + } return Account{ ID: row.AccountID, DisplayName: row.DisplayName, @@ -467,6 +513,7 @@ func modelToAccount(row model.Accounts) Account { NotificationsInAppOnly: row.NotificationsInAppOnly, PaidAccount: row.PaidAccount, MergedInto: mergedInto, + FlaggedHighRateAt: flaggedHighRateAt, CreatedAt: row.CreatedAt, UpdatedAt: row.UpdatedAt, } diff --git a/backend/internal/account/userlist.go b/backend/internal/account/userlist.go index b2bd372..6f2d877 100644 --- a/backend/internal/account/userlist.go +++ b/backend/internal/account/userlist.go @@ -2,6 +2,7 @@ package account import ( "context" + "database/sql" "fmt" "strings" "time" @@ -18,6 +19,9 @@ type UserListItem struct { PreferredLanguage string IsGuest bool IsRobot bool + // FlaggedHighRateAt is the soft high-rate marker (zero when unflagged), shown + // as a badge in the console list (R3). + FlaggedHighRateAt time.Time CreatedAt time.Time } @@ -65,7 +69,7 @@ func userListWhere(f UserFilter) (string, []any) { // ListUsers returns the filtered admin user list, newest first, paginated. func (s *Store) ListUsers(ctx context.Context, f UserFilter, limit, offset int) ([]UserListItem, error) { where, args := userListWhere(f) - q := `SELECT a.account_id, a.display_name, a.preferred_language, a.is_guest, a.created_at, ` + robotExists + ` AS is_robot + q := `SELECT a.account_id, a.display_name, a.preferred_language, a.is_guest, a.flagged_high_rate_at, a.created_at, ` + robotExists + ` AS is_robot FROM backend.accounts a WHERE ` + where + fmt.Sprintf(` ORDER BY a.created_at DESC LIMIT $%d OFFSET $%d`, len(args)+1, len(args)+2) args = append(args, limit, offset) @@ -77,14 +81,51 @@ FROM backend.accounts a WHERE ` + where + var out []UserListItem for rows.Next() { var it UserListItem - if err := rows.Scan(&it.ID, &it.DisplayName, &it.PreferredLanguage, &it.IsGuest, &it.CreatedAt, &it.IsRobot); err != nil { + var flagged sql.NullTime + if err := rows.Scan(&it.ID, &it.DisplayName, &it.PreferredLanguage, &it.IsGuest, &flagged, &it.CreatedAt, &it.IsRobot); err != nil { return nil, fmt.Errorf("account: scan user: %w", err) } + if flagged.Valid { + it.FlaggedHighRateAt = flagged.Time + } out = append(out, it) } return out, rows.Err() } +// FlaggedAccount is one row of the console's high-rate review queue. +type FlaggedAccount struct { + ID uuid.UUID + DisplayName string + FlaggedHighRateAt time.Time +} + +// flaggedListCap bounds the console's flagged-account list; the operator clears +// flags as they are reviewed, so the queue stays short in practice. +const flaggedListCap = 200 + +// ListFlaggedHighRate returns the accounts carrying the high-rate flag, most +// recently flagged first (R3). +func (s *Store) ListFlaggedHighRate(ctx context.Context) ([]FlaggedAccount, error) { + rows, err := s.db.QueryContext(ctx, + `SELECT account_id, display_name, flagged_high_rate_at +FROM backend.accounts WHERE flagged_high_rate_at IS NOT NULL +ORDER BY flagged_high_rate_at DESC LIMIT $1`, flaggedListCap) + if err != nil { + return nil, fmt.Errorf("account: list flagged: %w", err) + } + defer rows.Close() + var out []FlaggedAccount + for rows.Next() { + var fa FlaggedAccount + if err := rows.Scan(&fa.ID, &fa.DisplayName, &fa.FlaggedHighRateAt); err != nil { + return nil, fmt.Errorf("account: scan flagged: %w", err) + } + out = append(out, fa) + } + return out, rows.Err() +} + // CountUsers counts the filtered admin user list, for pagination. func (s *Store) CountUsers(ctx context.Context, f UserFilter) (int, error) { where, args := userListWhere(f) diff --git a/backend/internal/adminconsole/render_test.go b/backend/internal/adminconsole/render_test.go index 4bdffad..eb6ab9c 100644 --- a/backend/internal/adminconsole/render_test.go +++ b/backend/internal/adminconsole/render_test.go @@ -21,8 +21,14 @@ func TestRendererRendersEveryPage(t *testing.T) { want string }{ {"dashboard", DashboardView{Accounts: 3, Variants: []VariantVersions{{Variant: "scrabble_en", Latest: "v1", Versions: []string{"v1"}}}}, "Dashboard"}, - {"users", UsersView{Items: []UserRow{{ID: "a1", DisplayName: "Kaya"}}, Pager: NewPager(1, 50, 1)}, "Kaya"}, + {"users", UsersView{Items: []UserRow{{ID: "a1", DisplayName: "Kaya", FlaggedHighRate: true}}, Pager: NewPager(1, 50, 1)}, "high-rate"}, {"user_detail", UserDetailView{ID: "a1", DisplayName: "Kaya", HasStats: true, Stats: StatsRow{Wins: 2}, TelegramID: "123", ConnectorEnabled: true}, "Send Telegram message"}, + {"user_detail", UserDetailView{ID: "a1", DisplayName: "Kaya", FlaggedHighRateAt: "2026-06-10 12:00"}, "Clear high-rate flag"}, + {"throttled", ThrottledView{ + Episodes: []ThrottleEpisodeRow{{Class: "user", Key: "a1", UserID: "a1", Rejected: 1234, FirstSeen: "2026-06-10 12:00", LastSeen: "2026-06-10 12:05"}}, + Flagged: []FlaggedAccountRow{{ID: "a1", DisplayName: "Kaya", FlaggedAt: "2026-06-10 12:05"}}, + FlagThreshold: 1000, FlagWindow: "10m0s", + }, "Recent episodes"}, {"games", GamesView{Items: []GameRow{{ID: "g1", Variant: "scrabble_en", Status: "active"}}, Status: "active", Pager: NewPager(1, 50, 1)}, "g1"}, {"game_detail", GameDetailView{ID: "g1", Variant: "scrabble_en", Seats: []SeatRow{{Seat: 0, DisplayName: "Kaya"}}}, "Seats"}, {"complaints", ComplaintsView{Items: []ComplaintRow{{ID: "c1", Word: "qi", Status: "open"}}, Status: "open", Pager: NewPager(1, 50, 1)}, "qi"}, diff --git a/backend/internal/adminconsole/templates/layout.gohtml b/backend/internal/adminconsole/templates/layout.gohtml index 970d876..14e09d9 100644 --- a/backend/internal/adminconsole/templates/layout.gohtml +++ b/backend/internal/adminconsole/templates/layout.gohtml @@ -17,6 +17,7 @@ Games Complaints Messages + Throttled Dictionary Broadcast Grafana ↗ diff --git a/backend/internal/adminconsole/templates/pages/throttled.gohtml b/backend/internal/adminconsole/templates/pages/throttled.gohtml new file mode 100644 index 0000000..696b339 --- /dev/null +++ b/backend/internal/adminconsole/templates/pages/throttled.gohtml @@ -0,0 +1,39 @@ +{{define "content" -}} +

Throttled

+{{with .Data}} +

Rate-limiter rejections reported periodically by the gateway. The episode +list is in-memory and resets on a backend restart. An account sustaining +{{.FlagThreshold}}+ rejected calls within {{.FlagWindow}} is soft-flagged for review +below — never banned automatically; clear the flag on the user card.

+

Recent episodes

+ + + +{{range .Episodes}} + + + + + + + +{{else}} + +{{end}} + +
ClassKeyRejectedFirst seenLast seen
{{.Class}}{{if .UserID}}{{.Key}}{{else}}{{.Key}}{{end}}{{.Rejected}}{{.FirstSeen}}{{.LastSeen}}
nothing throttled recently
+
+

Flagged accounts

+ + + +{{range .Flagged}} + +{{else}} + +{{end}} + +
AccountDisplay nameFlagged
{{.ID}}{{.DisplayName}}{{.FlaggedAt}}
no flagged accounts
+
+{{end}} +{{- end}} diff --git a/backend/internal/adminconsole/templates/pages/user_detail.gohtml b/backend/internal/adminconsole/templates/pages/user_detail.gohtml index 08b75b2..f4396d1 100644 --- a/backend/internal/adminconsole/templates/pages/user_detail.gohtml +++ b/backend/internal/adminconsole/templates/pages/user_detail.gohtml @@ -13,8 +13,14 @@
  • Paid {{if .PaidAccount}}yes{{else}}no{{end}}
  • Hint wallet {{.HintBalance}}
  • {{if .MergedInto}}
  • Merged into {{.MergedInto}}
  • {{end}} +{{if .FlaggedHighRateAt}}
  • High-rate flag {{.FlaggedHighRateAt}}
  • {{end}}
  • Created {{.CreatedAt}}
  • +{{if .FlaggedHighRateAt}} +
    + +
    +{{end}}

    Statistics

    {{if .HasStats}} diff --git a/backend/internal/adminconsole/templates/pages/users.gohtml b/backend/internal/adminconsole/templates/pages/users.gohtml index ef8cf21..9a116a0 100644 --- a/backend/internal/adminconsole/templates/pages/users.gohtml +++ b/backend/internal/adminconsole/templates/pages/users.gohtml @@ -17,7 +17,7 @@ {{range .Items}} {{.ID}} -{{.DisplayName}}{{if .Guest}} guest{{end}} +{{.DisplayName}}{{if .Guest}} guest{{end}}{{if .FlaggedHighRate}} high-rate{{end}} {{.Kind}} {{.Language}} {{.CreatedAt}} diff --git a/backend/internal/adminconsole/views.go b/backend/internal/adminconsole/views.go index 1ef4aed..8aef4c0 100644 --- a/backend/internal/adminconsole/views.go +++ b/backend/internal/adminconsole/views.go @@ -59,18 +59,20 @@ type UsersView struct { } // UserRow is one account row in the list. MoveMin/Avg/Max are the account's -// pre-formatted move-duration summary (empty when it has no timed move). +// pre-formatted move-duration summary (empty when it has no timed move); +// FlaggedHighRate marks the soft high-rate badge (R3). type UserRow struct { - ID string - DisplayName string - Kind string - Language string - Guest bool - CreatedAt string - HasMoveStats bool - MoveMin string - MoveAvg string - MoveMax string + ID string + DisplayName string + Kind string + Language string + Guest bool + FlaggedHighRate bool + CreatedAt string + HasMoveStats bool + MoveMin string + MoveAvg string + MoveMax string } // MessagesView is the paginated chat-message moderation list. NameMask/ExtMask are the @@ -110,15 +112,18 @@ type UserDetailView struct { PaidAccount bool // MergedInto is the primary account id when this account has been retired by a // merge (Stage 11), or empty for a live account. - MergedInto string - HintBalance int - CreatedAt string - HasStats bool - Stats StatsRow - Identities []IdentityRow - Games []GameRow - TelegramID string - ConnectorEnabled bool + MergedInto string + // FlaggedHighRateAt is the pre-formatted soft high-rate marker timestamp, + // empty for an unflagged account; the card shows it with the Clear action (R3). + FlaggedHighRateAt string + HintBalance int + CreatedAt string + HasStats bool + Stats StatsRow + Identities []IdentityRow + Games []GameRow + TelegramID string + ConnectorEnabled bool // MoveChart is the pre-rendered inline SVG of the account's per-move-number think // time (min/mean/max), empty when the account has no timed move. MoveChart template.HTML @@ -247,6 +252,35 @@ type BroadcastView struct { ConnectorEnabled bool } +// ThrottledView is the rate-limit observability page: the recent gateway-reported +// throttle episodes (in-memory, reset on restart) and the accounts currently +// carrying the high-rate flag. FlagThreshold and FlagWindow caption the active +// auto-flag tuning. +type ThrottledView struct { + Episodes []ThrottleEpisodeRow + Flagged []FlaggedAccountRow + FlagThreshold int + FlagWindow string +} + +// ThrottleEpisodeRow is one recently throttled limiter key. UserID links to the +// user card and is set only for the user class (the other classes key by IP). +type ThrottleEpisodeRow struct { + Class string + Key string + UserID string + Rejected int + FirstSeen string + LastSeen string +} + +// FlaggedAccountRow is one account carrying the high-rate flag. +type FlaggedAccountRow struct { + ID string + DisplayName string + FlaggedAt string +} + // MessageView is the result page shown after a POST action. type MessageView struct { Heading string diff --git a/backend/internal/config/config.go b/backend/internal/config/config.go index 9cc69e0..063911f 100644 --- a/backend/internal/config/config.go +++ b/backend/internal/config/config.go @@ -12,6 +12,7 @@ import ( "scrabble/backend/internal/game" "scrabble/backend/internal/lobby" "scrabble/backend/internal/postgres" + "scrabble/backend/internal/ratewatch" "scrabble/backend/internal/robot" "scrabble/backend/internal/telemetry" ) @@ -35,6 +36,9 @@ type Config struct { Lobby lobby.Config // Robot configures the robot opponent driver (scan cadence). Robot robot.Config + // RateWatch tunes the conservative high-rate auto-flag applied to the + // gateway's rate-limiter rejection reports (R3). + RateWatch ratewatch.Config // SMTP configures the email relay used for confirm-codes. An empty Host // selects the development log mailer (the code is logged, not sent). SMTP account.SMTPConfig @@ -105,6 +109,14 @@ func Load() (Config, error) { return Config{}, err } + rw := ratewatch.DefaultConfig() + if rw.FlagThreshold, err = envInt("BACKEND_HIGHRATE_FLAG_THRESHOLD", rw.FlagThreshold); err != nil { + return Config{}, err + } + if rw.FlagWindow, err = envDuration("BACKEND_HIGHRATE_FLAG_WINDOW", rw.FlagWindow); err != nil { + return Config{}, err + } + guestReapInterval, err := envDuration("BACKEND_GUEST_REAP_INTERVAL", defaultGuestReapInterval) if err != nil { return Config{}, err @@ -131,6 +143,7 @@ func Load() (Config, error) { Game: gm, Lobby: lb, Robot: rb, + RateWatch: rw, SMTP: smtp, ConnectorAddr: os.Getenv("BACKEND_CONNECTOR_ADDR"), GuestReapInterval: guestReapInterval, @@ -170,6 +183,9 @@ func (c Config) validate() error { if err := c.Robot.Validate(); err != nil { return fmt.Errorf("config: %w", err) } + if err := c.RateWatch.Validate(); err != nil { + return fmt.Errorf("config: %w", err) + } if c.GuestReapInterval <= 0 { return fmt.Errorf("config: BACKEND_GUEST_REAP_INTERVAL must be positive") } diff --git a/backend/internal/inttest/account_test.go b/backend/internal/inttest/account_test.go index 1be5922..286b171 100644 --- a/backend/internal/inttest/account_test.go +++ b/backend/internal/inttest/account_test.go @@ -6,6 +6,7 @@ import ( "context" "errors" "testing" + "time" "github.com/google/uuid" @@ -195,6 +196,62 @@ func TestServiceLanguageRoundTrip(t *testing.T) { } } +// TestHighRateFlagRoundTrip covers the R3 soft high-rate marker: a fresh account +// is unflagged, FlagHighRate stamps it exactly once (a second sustained episode +// never moves the timestamp), ClearHighRateFlag reverses it, and a re-flag after +// the operator clear takes a fresh timestamp. +func TestHighRateFlagRoundTrip(t *testing.T) { + ctx := context.Background() + store := account.NewStore(testDB) + acc, err := store.ProvisionTelegram(ctx, "tg-"+uuid.NewString(), "en", "", "Player") + if err != nil { + t.Fatalf("provision telegram: %v", err) + } + if !acc.FlaggedHighRateAt.IsZero() { + t.Fatalf("fresh FlaggedHighRateAt = %v, want zero", acc.FlaggedHighRateAt) + } + + first := time.Date(2026, 6, 1, 12, 0, 0, 0, time.UTC) + set, err := store.FlagHighRate(ctx, acc.ID, first) + if err != nil { + t.Fatalf("flag: %v", err) + } + if !set { + t.Fatal("first FlagHighRate reported not set") + } + if set, err = store.FlagHighRate(ctx, acc.ID, first.Add(time.Hour)); err != nil { + t.Fatalf("re-flag: %v", err) + } else if set { + t.Fatal("second FlagHighRate must not overwrite the marker") + } + got, err := store.GetByID(ctx, acc.ID) + if err != nil { + t.Fatalf("get by id: %v", err) + } + if !got.FlaggedHighRateAt.Equal(first) { + t.Errorf("FlaggedHighRateAt = %v, want %v", got.FlaggedHighRateAt, first) + } + + if err := store.ClearHighRateFlag(ctx, acc.ID); err != nil { + t.Fatalf("clear: %v", err) + } + if got, err = store.GetByID(ctx, acc.ID); err != nil { + t.Fatalf("get by id: %v", err) + } else if !got.FlaggedHighRateAt.IsZero() { + t.Errorf("cleared FlaggedHighRateAt = %v, want zero", got.FlaggedHighRateAt) + } + + second := first.Add(24 * time.Hour) + if set, err = store.FlagHighRate(ctx, acc.ID, second); err != nil || !set { + t.Fatalf("re-flag after clear = (%v, %v), want (true, nil)", set, err) + } + if got, err = store.GetByID(ctx, acc.ID); err != nil { + t.Fatalf("get by id: %v", err) + } else if !got.FlaggedHighRateAt.Equal(second) { + t.Errorf("re-flagged FlaggedHighRateAt = %v, want %v", got.FlaggedHighRateAt, second) + } +} + // TestIdentityExternalID covers the reverse identity lookup the push-target route // uses: it returns the external_id for the matching kind and ErrNotFound otherwise, // including for a guest that carries no identity. diff --git a/backend/internal/inttest/admin_test.go b/backend/internal/inttest/admin_test.go index 6b77c97..0171c54 100644 --- a/backend/internal/inttest/admin_test.go +++ b/backend/internal/inttest/admin_test.go @@ -16,6 +16,7 @@ import ( "scrabble/backend/internal/account" "scrabble/backend/internal/engine" "scrabble/backend/internal/game" + "scrabble/backend/internal/ratewatch" "scrabble/backend/internal/server" ) @@ -206,6 +207,72 @@ func TestConsoleGameDetailRobotSchedule(t *testing.T) { } } +// TestConsoleThrottledViewAndFlagClear drives the R3 rate-limit surface end to +// end against real stores: a gateway report past the threshold auto-flags the +// account, the throttled view shows the episode and the flagged account, the +// user card carries the marker, and the operator clear (a same-origin POST) +// reverses it. +func TestConsoleThrottledViewAndFlagClear(t *testing.T) { + ctx := context.Background() + accounts := account.NewStore(testDB) + acc, err := accounts.ProvisionTelegram(ctx, "tg-"+uuid.NewString(), "en", "", "Throttled Player") + if err != nil { + t.Fatalf("provision: %v", err) + } + + watch := ratewatch.New(ratewatch.Config{FlagThreshold: 100, FlagWindow: 10 * time.Minute}, accounts, zap.NewNop()) + srv := server.New(":0", server.Deps{ + Logger: zap.NewNop(), + Accounts: accounts, + Games: newGameService(), + Registry: testRegistry, + DictDir: dictDir(), + RateWatch: watch, + }) + h := srv.Handler() + + report := `{"window_seconds":30,"entries":[` + + `{"class":"user","key":"` + acc.ID.String() + `","rejected":150},` + + `{"class":"public","key":"10.1.2.3","rejected":7}]}` + req := httptest.NewRequest(http.MethodPost, "http://admin.test/api/v1/internal/ratelimit/report", strings.NewReader(report)) + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + if rec.Code != http.StatusNoContent { + t.Fatalf("report = %d, want 204", rec.Code) + } + + got, err := accounts.GetByID(ctx, acc.ID) + if err != nil { + t.Fatalf("get by id: %v", err) + } + if got.FlaggedHighRateAt.IsZero() { + t.Fatal("account not auto-flagged past the threshold") + } + + base := "http://admin.test/_gm" + code, body := consoleDo(h, http.MethodGet, base+"/throttled", "", "") + if code != http.StatusOK || !strings.Contains(body, acc.ID.String()) || + !strings.Contains(body, "10.1.2.3") || !strings.Contains(body, "Throttled Player") { + t.Fatalf("throttled view = %d, episode/flag shown = %v/%v", + code, strings.Contains(body, "10.1.2.3"), strings.Contains(body, "Throttled Player")) + } + if code, body = consoleDo(h, http.MethodGet, base+"/users/"+acc.ID.String(), "", ""); code != http.StatusOK || !strings.Contains(body, "Clear high-rate flag") { + t.Fatalf("user card = %d, has clear action = %v", code, strings.Contains(body, "Clear high-rate flag")) + } + + // The clear POST is CSRF-guarded like every console action. + if code, _ = consoleDo(h, http.MethodPost, base+"/users/"+acc.ID.String()+"/clear-high-rate-flag", "", ""); code != http.StatusForbidden { + t.Fatalf("clear without origin = %d, want 403", code) + } + if code, body = consoleDo(h, http.MethodPost, base+"/users/"+acc.ID.String()+"/clear-high-rate-flag", "x=1", "http://admin.test"); code != http.StatusOK || !strings.Contains(body, "Cleared") { + t.Fatalf("clear with origin = %d, has Cleared = %v", code, strings.Contains(body, "Cleared")) + } + if got, err = accounts.GetByID(ctx, acc.ID); err != nil || !got.FlaggedHighRateAt.IsZero() { + t.Fatalf("flag survived the clear: %v (err %v)", got.FlaggedHighRateAt, err) + } +} + // consoleDo issues a request to h, optionally with an Origin header, and returns // the status and body. Form bodies are sent as application/x-www-form-urlencoded. func consoleDo(h http.Handler, method, target, body, origin string) (int, string) { diff --git a/backend/internal/postgres/jet/backend/model/accounts.go b/backend/internal/postgres/jet/backend/model/accounts.go index 2501d3a..ae26769 100644 --- a/backend/internal/postgres/jet/backend/model/accounts.go +++ b/backend/internal/postgres/jet/backend/model/accounts.go @@ -30,4 +30,5 @@ type Accounts struct { MergedInto *uuid.UUID MergedAt *time.Time ServiceLanguage *string + FlaggedHighRateAt *time.Time } diff --git a/backend/internal/postgres/jet/backend/model/game_drafts.go b/backend/internal/postgres/jet/backend/model/game_drafts.go new file mode 100644 index 0000000..c9c32c2 --- /dev/null +++ b/backend/internal/postgres/jet/backend/model/game_drafts.go @@ -0,0 +1,21 @@ +// +// Code generated by go-jet DO NOT EDIT. +// +// WARNING: Changes to this file may cause incorrect behavior +// and will be lost if the code is regenerated +// + +package model + +import ( + "github.com/google/uuid" + "time" +) + +type GameDrafts struct { + GameID uuid.UUID `sql:"primary_key"` + AccountID uuid.UUID `sql:"primary_key"` + RackOrder string + BoardTiles string + UpdatedAt time.Time +} diff --git a/backend/internal/postgres/jet/backend/model/game_hidden.go b/backend/internal/postgres/jet/backend/model/game_hidden.go new file mode 100644 index 0000000..d64f4a1 --- /dev/null +++ b/backend/internal/postgres/jet/backend/model/game_hidden.go @@ -0,0 +1,19 @@ +// +// Code generated by go-jet DO NOT EDIT. +// +// WARNING: Changes to this file may cause incorrect behavior +// and will be lost if the code is regenerated +// + +package model + +import ( + "github.com/google/uuid" + "time" +) + +type GameHidden struct { + AccountID uuid.UUID `sql:"primary_key"` + GameID uuid.UUID `sql:"primary_key"` + CreatedAt time.Time +} diff --git a/backend/internal/postgres/jet/backend/table/accounts.go b/backend/internal/postgres/jet/backend/table/accounts.go index 906a396..021e7a4 100644 --- a/backend/internal/postgres/jet/backend/table/accounts.go +++ b/backend/internal/postgres/jet/backend/table/accounts.go @@ -34,6 +34,7 @@ type accountsTable struct { MergedInto postgres.ColumnString MergedAt postgres.ColumnTimestampz ServiceLanguage postgres.ColumnString + FlaggedHighRateAt postgres.ColumnTimestampz AllColumns postgres.ColumnList MutableColumns postgres.ColumnList @@ -92,8 +93,9 @@ func newAccountsTableImpl(schemaName, tableName, alias string) accountsTable { MergedIntoColumn = postgres.StringColumn("merged_into") MergedAtColumn = postgres.TimestampzColumn("merged_at") ServiceLanguageColumn = postgres.StringColumn("service_language") - allColumns = postgres.ColumnList{AccountIDColumn, DisplayNameColumn, PreferredLanguageColumn, TimeZoneColumn, BlockChatColumn, BlockFriendRequestsColumn, CreatedAtColumn, UpdatedAtColumn, AwayStartColumn, AwayEndColumn, HintBalanceColumn, IsGuestColumn, NotificationsInAppOnlyColumn, PaidAccountColumn, MergedIntoColumn, MergedAtColumn, ServiceLanguageColumn} - mutableColumns = postgres.ColumnList{DisplayNameColumn, PreferredLanguageColumn, TimeZoneColumn, BlockChatColumn, BlockFriendRequestsColumn, CreatedAtColumn, UpdatedAtColumn, AwayStartColumn, AwayEndColumn, HintBalanceColumn, IsGuestColumn, NotificationsInAppOnlyColumn, PaidAccountColumn, MergedIntoColumn, MergedAtColumn, ServiceLanguageColumn} + FlaggedHighRateAtColumn = postgres.TimestampzColumn("flagged_high_rate_at") + allColumns = postgres.ColumnList{AccountIDColumn, DisplayNameColumn, PreferredLanguageColumn, TimeZoneColumn, BlockChatColumn, BlockFriendRequestsColumn, CreatedAtColumn, UpdatedAtColumn, AwayStartColumn, AwayEndColumn, HintBalanceColumn, IsGuestColumn, NotificationsInAppOnlyColumn, PaidAccountColumn, MergedIntoColumn, MergedAtColumn, ServiceLanguageColumn, FlaggedHighRateAtColumn} + mutableColumns = postgres.ColumnList{DisplayNameColumn, PreferredLanguageColumn, TimeZoneColumn, BlockChatColumn, BlockFriendRequestsColumn, CreatedAtColumn, UpdatedAtColumn, AwayStartColumn, AwayEndColumn, HintBalanceColumn, IsGuestColumn, NotificationsInAppOnlyColumn, PaidAccountColumn, MergedIntoColumn, MergedAtColumn, ServiceLanguageColumn, FlaggedHighRateAtColumn} defaultColumns = postgres.ColumnList{DisplayNameColumn, PreferredLanguageColumn, TimeZoneColumn, BlockChatColumn, BlockFriendRequestsColumn, CreatedAtColumn, UpdatedAtColumn, AwayStartColumn, AwayEndColumn, HintBalanceColumn, IsGuestColumn, NotificationsInAppOnlyColumn, PaidAccountColumn} ) @@ -118,6 +120,7 @@ func newAccountsTableImpl(schemaName, tableName, alias string) accountsTable { MergedInto: MergedIntoColumn, MergedAt: MergedAtColumn, ServiceLanguage: ServiceLanguageColumn, + FlaggedHighRateAt: FlaggedHighRateAtColumn, AllColumns: allColumns, MutableColumns: mutableColumns, diff --git a/backend/internal/postgres/jet/backend/table/game_drafts.go b/backend/internal/postgres/jet/backend/table/game_drafts.go new file mode 100644 index 0000000..18f099e --- /dev/null +++ b/backend/internal/postgres/jet/backend/table/game_drafts.go @@ -0,0 +1,90 @@ +// +// Code generated by go-jet DO NOT EDIT. +// +// WARNING: Changes to this file may cause incorrect behavior +// and will be lost if the code is regenerated +// + +package table + +import ( + "github.com/go-jet/jet/v2/postgres" +) + +var GameDrafts = newGameDraftsTable("backend", "game_drafts", "") + +type gameDraftsTable struct { + postgres.Table + + // Columns + GameID postgres.ColumnString + AccountID postgres.ColumnString + RackOrder postgres.ColumnString + BoardTiles postgres.ColumnString + UpdatedAt postgres.ColumnTimestampz + + AllColumns postgres.ColumnList + MutableColumns postgres.ColumnList + DefaultColumns postgres.ColumnList +} + +type GameDraftsTable struct { + gameDraftsTable + + EXCLUDED gameDraftsTable +} + +// AS creates new GameDraftsTable with assigned alias +func (a GameDraftsTable) AS(alias string) *GameDraftsTable { + return newGameDraftsTable(a.SchemaName(), a.TableName(), alias) +} + +// Schema creates new GameDraftsTable with assigned schema name +func (a GameDraftsTable) FromSchema(schemaName string) *GameDraftsTable { + return newGameDraftsTable(schemaName, a.TableName(), a.Alias()) +} + +// WithPrefix creates new GameDraftsTable with assigned table prefix +func (a GameDraftsTable) WithPrefix(prefix string) *GameDraftsTable { + return newGameDraftsTable(a.SchemaName(), prefix+a.TableName(), a.TableName()) +} + +// WithSuffix creates new GameDraftsTable with assigned table suffix +func (a GameDraftsTable) WithSuffix(suffix string) *GameDraftsTable { + return newGameDraftsTable(a.SchemaName(), a.TableName()+suffix, a.TableName()) +} + +func newGameDraftsTable(schemaName, tableName, alias string) *GameDraftsTable { + return &GameDraftsTable{ + gameDraftsTable: newGameDraftsTableImpl(schemaName, tableName, alias), + EXCLUDED: newGameDraftsTableImpl("", "excluded", ""), + } +} + +func newGameDraftsTableImpl(schemaName, tableName, alias string) gameDraftsTable { + var ( + GameIDColumn = postgres.StringColumn("game_id") + AccountIDColumn = postgres.StringColumn("account_id") + RackOrderColumn = postgres.StringColumn("rack_order") + BoardTilesColumn = postgres.StringColumn("board_tiles") + UpdatedAtColumn = postgres.TimestampzColumn("updated_at") + allColumns = postgres.ColumnList{GameIDColumn, AccountIDColumn, RackOrderColumn, BoardTilesColumn, UpdatedAtColumn} + mutableColumns = postgres.ColumnList{RackOrderColumn, BoardTilesColumn, UpdatedAtColumn} + defaultColumns = postgres.ColumnList{RackOrderColumn, BoardTilesColumn, UpdatedAtColumn} + ) + + return gameDraftsTable{ + Table: postgres.NewTable(schemaName, tableName, alias, allColumns...), + + //Columns + GameID: GameIDColumn, + AccountID: AccountIDColumn, + RackOrder: RackOrderColumn, + BoardTiles: BoardTilesColumn, + UpdatedAt: UpdatedAtColumn, + + AllColumns: allColumns, + MutableColumns: mutableColumns, + DefaultColumns: defaultColumns, + } +} diff --git a/backend/internal/postgres/jet/backend/table/game_hidden.go b/backend/internal/postgres/jet/backend/table/game_hidden.go new file mode 100644 index 0000000..d55a2a9 --- /dev/null +++ b/backend/internal/postgres/jet/backend/table/game_hidden.go @@ -0,0 +1,84 @@ +// +// Code generated by go-jet DO NOT EDIT. +// +// WARNING: Changes to this file may cause incorrect behavior +// and will be lost if the code is regenerated +// + +package table + +import ( + "github.com/go-jet/jet/v2/postgres" +) + +var GameHidden = newGameHiddenTable("backend", "game_hidden", "") + +type gameHiddenTable struct { + postgres.Table + + // Columns + AccountID postgres.ColumnString + GameID postgres.ColumnString + CreatedAt postgres.ColumnTimestampz + + AllColumns postgres.ColumnList + MutableColumns postgres.ColumnList + DefaultColumns postgres.ColumnList +} + +type GameHiddenTable struct { + gameHiddenTable + + EXCLUDED gameHiddenTable +} + +// AS creates new GameHiddenTable with assigned alias +func (a GameHiddenTable) AS(alias string) *GameHiddenTable { + return newGameHiddenTable(a.SchemaName(), a.TableName(), alias) +} + +// Schema creates new GameHiddenTable with assigned schema name +func (a GameHiddenTable) FromSchema(schemaName string) *GameHiddenTable { + return newGameHiddenTable(schemaName, a.TableName(), a.Alias()) +} + +// WithPrefix creates new GameHiddenTable with assigned table prefix +func (a GameHiddenTable) WithPrefix(prefix string) *GameHiddenTable { + return newGameHiddenTable(a.SchemaName(), prefix+a.TableName(), a.TableName()) +} + +// WithSuffix creates new GameHiddenTable with assigned table suffix +func (a GameHiddenTable) WithSuffix(suffix string) *GameHiddenTable { + return newGameHiddenTable(a.SchemaName(), a.TableName()+suffix, a.TableName()) +} + +func newGameHiddenTable(schemaName, tableName, alias string) *GameHiddenTable { + return &GameHiddenTable{ + gameHiddenTable: newGameHiddenTableImpl(schemaName, tableName, alias), + EXCLUDED: newGameHiddenTableImpl("", "excluded", ""), + } +} + +func newGameHiddenTableImpl(schemaName, tableName, alias string) gameHiddenTable { + var ( + AccountIDColumn = postgres.StringColumn("account_id") + GameIDColumn = postgres.StringColumn("game_id") + CreatedAtColumn = postgres.TimestampzColumn("created_at") + allColumns = postgres.ColumnList{AccountIDColumn, GameIDColumn, CreatedAtColumn} + mutableColumns = postgres.ColumnList{CreatedAtColumn} + defaultColumns = postgres.ColumnList{CreatedAtColumn} + ) + + return gameHiddenTable{ + Table: postgres.NewTable(schemaName, tableName, alias, allColumns...), + + //Columns + AccountID: AccountIDColumn, + GameID: GameIDColumn, + CreatedAt: CreatedAtColumn, + + AllColumns: allColumns, + MutableColumns: mutableColumns, + DefaultColumns: defaultColumns, + } +} diff --git a/backend/internal/postgres/jet/backend/table/table_use_schema.go b/backend/internal/postgres/jet/backend/table/table_use_schema.go index edc7307..70d757d 100644 --- a/backend/internal/postgres/jet/backend/table/table_use_schema.go +++ b/backend/internal/postgres/jet/backend/table/table_use_schema.go @@ -18,6 +18,8 @@ func UseSchema(schema string) { EmailConfirmations = EmailConfirmations.FromSchema(schema) FriendCodes = FriendCodes.FromSchema(schema) Friendships = Friendships.FromSchema(schema) + GameDrafts = GameDrafts.FromSchema(schema) + GameHidden = GameHidden.FromSchema(schema) GameInvitationInvitees = GameInvitationInvitees.FromSchema(schema) GameInvitations = GameInvitations.FromSchema(schema) GameMoves = GameMoves.FromSchema(schema) diff --git a/backend/internal/postgres/migrations/00001_baseline.sql b/backend/internal/postgres/migrations/00001_baseline.sql index 7a63bef..f33592e 100644 --- a/backend/internal/postgres/migrations/00001_baseline.sql +++ b/backend/internal/postgres/migrations/00001_baseline.sql @@ -35,6 +35,10 @@ CREATE TABLE accounts ( merged_into uuid REFERENCES accounts (account_id) ON DELETE SET NULL, merged_at timestamptz, service_language text CHECK (service_language IN ('en', 'ru')), + -- Soft, reversible "suspected high-rate" marker (R3): set once when the gateway + -- reports sustained rate-limiter rejections past the threshold; an operator + -- clears it in the admin console. Never an automatic ban. + flagged_high_rate_at timestamptz, CONSTRAINT accounts_preferred_language_chk CHECK (preferred_language IN ('en', 'ru')), CONSTRAINT accounts_hint_balance_chk CHECK (hint_balance >= 0) ); diff --git a/backend/internal/ratewatch/ratewatch.go b/backend/internal/ratewatch/ratewatch.go new file mode 100644 index 0000000..f236f78 --- /dev/null +++ b/backend/internal/ratewatch/ratewatch.go @@ -0,0 +1,235 @@ +// Package ratewatch ingests the gateway's periodic rate-limiter rejection +// reports (R3). It keeps an in-memory window of recent throttle episodes for +// the admin console's view and applies the conservative high-rate auto-flag: +// when one account's rejections within the rolling window cross the threshold, +// the account store stamps the soft, reversible flagged_high_rate_at marker +// (set-once; an operator clears it; never an automatic ban). Like the gateway's +// active_users gauge it is single-instance and resets on restart by design — +// the durable part is the account flag, not the episode window. +package ratewatch + +import ( + "context" + "fmt" + "sort" + "sync" + "time" + + "github.com/google/uuid" + "go.uber.org/zap" +) + +// ClassUser is the limiter class whose keys are account ids — the only class +// the auto-flag applies to (the others are keyed by client IP). +const ClassUser = "user" + +const ( + // maxSeries bounds the distinct (class, key) series kept for the console + // view, so a key-spraying client cannot grow the map: past the bound the + // least-recently-throttled series is evicted. + maxSeries = 200 + // minRetention keeps an episode visible in the console for at least an hour + // after its last rejection (longer when the flag window is longer). + minRetention = time.Hour +) + +// Config tunes the conservative high-rate auto-flag. +type Config struct { + // FlagThreshold is the rejected-call count within FlagWindow past which a + // user account is flagged. + FlagThreshold int + // FlagWindow is the rolling window the rejections accumulate over. + FlagWindow time.Duration +} + +// DefaultConfig returns the agreed conservative defaults — 1000 rejected calls +// within a rolling 10 minutes (~1.7/s sustained, far above the client's +// capped-backoff retry noise yet a fraction of an abusive loop). +func DefaultConfig() Config { + return Config{FlagThreshold: 1000, FlagWindow: 10 * time.Minute} +} + +// Validate reports whether the configuration values are acceptable. +func (c Config) Validate() error { + if c.FlagThreshold <= 0 { + return fmt.Errorf("ratewatch: flag threshold must be positive") + } + if c.FlagWindow <= 0 { + return fmt.Errorf("ratewatch: flag window must be positive") + } + return nil +} + +// Flagger stamps the account-level high-rate marker; account.Store satisfies it. +type Flagger interface { + FlagHighRate(ctx context.Context, id uuid.UUID, at time.Time) (bool, error) +} + +// Entry is one reported aggregate: the rejections of one limiter key within one +// gateway report window (the wire mirror of the gateway's rejection summary). +type Entry struct { + Class string + Key string + Rejected int +} + +// Episode is one key's recent-throttle aggregate for the admin view. +type Episode struct { + Class string + Key string + Rejected int + FirstSeen time.Time + LastSeen time.Time +} + +// Watch accumulates reports and applies the auto-flag rule. +type Watch struct { + cfg Config + flagger Flagger + log *zap.Logger + now func() time.Time + + mu sync.Mutex + series map[seriesKey]*series +} + +type seriesKey struct{ class, key string } + +type series struct { + points []point // ascending by time +} + +type point struct { + at time.Time + n int +} + +// New constructs a Watch over flagger with cfg. A nil logger is replaced by a +// no-op one; a nil flagger disables the auto-flag (the view still works). +func New(cfg Config, flagger Flagger, log *zap.Logger) *Watch { + if log == nil { + log = zap.NewNop() + } + return &Watch{ + cfg: cfg, + flagger: flagger, + log: log, + now: time.Now, + series: make(map[seriesKey]*series), + } +} + +// Ingest records one gateway report. Entries with an empty class or key or a +// non-positive count are skipped. When a user-class series crosses the flag +// threshold within the flag window, the account is flagged (the store keeps it +// set-once, so a sustained episode costs one no-op UPDATE per report). +func (w *Watch) Ingest(ctx context.Context, entries []Entry) { + if len(entries) == 0 { + return + } + now := w.now() + var flag []uuid.UUID + w.mu.Lock() + for _, e := range entries { + if e.Class == "" || e.Key == "" || e.Rejected <= 0 { + continue + } + k := seriesKey{class: e.Class, key: e.Key} + s := w.series[k] + if s == nil { + s = &series{} + w.series[k] = s + } + s.points = append(s.points, point{at: now, n: e.Rejected}) + if e.Class == ClassUser && s.sumSince(now.Add(-w.cfg.FlagWindow)) >= w.cfg.FlagThreshold { + if id, err := uuid.Parse(e.Key); err == nil { + flag = append(flag, id) + } + } + } + w.pruneLocked(now) + w.mu.Unlock() + + if w.flagger == nil { + return + } + for _, id := range flag { + set, err := w.flagger.FlagHighRate(ctx, id, now) + switch { + case err != nil: + w.log.Warn("high-rate flag failed", zap.String("account_id", id.String()), zap.Error(err)) + case set: + w.log.Info("account flagged high-rate", + zap.String("account_id", id.String()), + zap.Int("threshold", w.cfg.FlagThreshold), + zap.Duration("window", w.cfg.FlagWindow)) + } + } +} + +// Config returns the active auto-flag tuning (the admin console captions it). +func (w *Watch) Config() Config { return w.cfg } + +// Recent returns the retained throttle episodes, most recently throttled first. +func (w *Watch) Recent() []Episode { + w.mu.Lock() + defer w.mu.Unlock() + out := make([]Episode, 0, len(w.series)) + for k, s := range w.series { + if len(s.points) == 0 { + continue + } + ep := Episode{ + Class: k.class, + Key: k.key, + FirstSeen: s.points[0].at, + LastSeen: s.points[len(s.points)-1].at, + } + for _, p := range s.points { + ep.Rejected += p.n + } + out = append(out, ep) + } + sort.Slice(out, func(i, j int) bool { return out[i].LastSeen.After(out[j].LastSeen) }) + return out +} + +// sumSince totals the points at or after cutoff. +func (s *series) sumSince(cutoff time.Time) int { + sum := 0 + for i := len(s.points) - 1; i >= 0; i-- { + if s.points[i].at.Before(cutoff) { + break + } + sum += s.points[i].n + } + return sum +} + +// pruneLocked drops points past retention, empty series, and — past maxSeries — +// the least-recently-throttled series. The caller holds w.mu. +func (w *Watch) pruneLocked(now time.Time) { + cutoff := now.Add(-max(minRetention, w.cfg.FlagWindow)) + for k, s := range w.series { + i := 0 + for i < len(s.points) && s.points[i].at.Before(cutoff) { + i++ + } + s.points = s.points[i:] + if len(s.points) == 0 { + delete(w.series, k) + } + } + for len(w.series) > maxSeries { + var oldest seriesKey + var oldestAt time.Time + first := true + for k, s := range w.series { + last := s.points[len(s.points)-1].at + if first || last.Before(oldestAt) { + oldest, oldestAt, first = k, last, false + } + } + delete(w.series, oldest) + } +} diff --git a/backend/internal/ratewatch/ratewatch_test.go b/backend/internal/ratewatch/ratewatch_test.go new file mode 100644 index 0000000..0bde791 --- /dev/null +++ b/backend/internal/ratewatch/ratewatch_test.go @@ -0,0 +1,140 @@ +package ratewatch + +import ( + "context" + "fmt" + "testing" + "time" + + "github.com/google/uuid" +) + +// fakeFlagger records flag calls and reports them as newly set. +type fakeFlagger struct { + calls []uuid.UUID +} + +func (f *fakeFlagger) FlagHighRate(_ context.Context, id uuid.UUID, _ time.Time) (bool, error) { + f.calls = append(f.calls, id) + return true, nil +} + +// watchAt returns a Watch with a controllable clock. +func watchAt(cfg Config, flagger Flagger, at *time.Time) *Watch { + w := New(cfg, flagger, nil) + w.now = func() time.Time { return *at } + return w +} + +// TestIngestAggregatesAndRecent verifies episodes accumulate per (class, key), +// invalid entries are skipped, and Recent orders by last rejection. +func TestIngestAggregatesAndRecent(t *testing.T) { + now := time.Date(2026, 6, 10, 12, 0, 0, 0, time.UTC) + w := watchAt(DefaultConfig(), nil, &now) + ctx := context.Background() + + w.Ingest(ctx, []Entry{ + {Class: "public", Key: "10.0.0.1", Rejected: 3}, + {Class: "user", Key: "u-1", Rejected: 5}, + {Class: "", Key: "x", Rejected: 1}, + {Class: "user", Key: "", Rejected: 1}, + {Class: "user", Key: "u-1", Rejected: 0}, + }) + now = now.Add(30 * time.Second) + w.Ingest(ctx, []Entry{{Class: "public", Key: "10.0.0.1", Rejected: 4}}) + + got := w.Recent() + if len(got) != 2 { + t.Fatalf("Recent returned %d episodes, want 2", len(got)) + } + if got[0].Class != "public" || got[0].Key != "10.0.0.1" || got[0].Rejected != 7 { + t.Errorf("first episode = %+v, want public/10.0.0.1 rejected=7", got[0]) + } + if !got[0].LastSeen.After(got[0].FirstSeen) { + t.Errorf("episode span = [%v, %v], want a positive span", got[0].FirstSeen, got[0].LastSeen) + } + if got[1].Class != "user" || got[1].Rejected != 5 { + t.Errorf("second episode = %+v, want user rejected=5", got[1]) + } +} + +// TestAutoFlagThreshold verifies the flag fires only for a user-class series +// crossing the threshold within the window, with a parseable account id. +func TestAutoFlagThreshold(t *testing.T) { + now := time.Date(2026, 6, 10, 12, 0, 0, 0, time.UTC) + flagged := &fakeFlagger{} + id := uuid.New() + w := watchAt(Config{FlagThreshold: 100, FlagWindow: 10 * time.Minute}, flagged, &now) + ctx := context.Background() + + w.Ingest(ctx, []Entry{ + {Class: "user", Key: id.String(), Rejected: 99}, + {Class: "public", Key: "10.0.0.1", Rejected: 1000}, + {Class: "user", Key: "not-a-uuid", Rejected: 1000}, + }) + if len(flagged.calls) != 0 { + t.Fatalf("flagged %v below the threshold", flagged.calls) + } + + now = now.Add(30 * time.Second) + w.Ingest(ctx, []Entry{{Class: "user", Key: id.String(), Rejected: 1}}) + if len(flagged.calls) != 1 || flagged.calls[0] != id { + t.Fatalf("flag calls = %v, want exactly [%s]", flagged.calls, id) + } +} + +// TestAutoFlagWindowExpiry verifies rejections age out of the rolling window. +func TestAutoFlagWindowExpiry(t *testing.T) { + now := time.Date(2026, 6, 10, 12, 0, 0, 0, time.UTC) + flagged := &fakeFlagger{} + id := uuid.New() + w := watchAt(Config{FlagThreshold: 100, FlagWindow: 10 * time.Minute}, flagged, &now) + ctx := context.Background() + + w.Ingest(ctx, []Entry{{Class: "user", Key: id.String(), Rejected: 60}}) + now = now.Add(11 * time.Minute) + w.Ingest(ctx, []Entry{{Class: "user", Key: id.String(), Rejected: 60}}) + if len(flagged.calls) != 0 { + t.Fatalf("flagged %v across an expired window", flagged.calls) + } + now = now.Add(time.Minute) + w.Ingest(ctx, []Entry{{Class: "user", Key: id.String(), Rejected: 50}}) + if len(flagged.calls) != 1 { + t.Fatalf("flag calls = %v, want one in-window crossing", flagged.calls) + } +} + +// TestSeriesBound verifies the episode map stays bounded by evicting the +// least-recently-throttled series. +func TestSeriesBound(t *testing.T) { + now := time.Date(2026, 6, 10, 12, 0, 0, 0, time.UTC) + w := watchAt(DefaultConfig(), nil, &now) + ctx := context.Background() + + for i := range maxSeries + 10 { + now = now.Add(time.Second) + w.Ingest(ctx, []Entry{{Class: "public", Key: fmt.Sprintf("10.0.%d.%d", i/256, i%256), Rejected: 1}}) + } + got := w.Recent() + if len(got) != maxSeries { + t.Fatalf("retained %d series, want %d", len(got), maxSeries) + } + for _, ep := range got { + if ep.Key == "10.0.0.0" { + t.Fatal("the least-recently-throttled series survived the bound") + } + } +} + +// TestConfigValidate covers the tuning guards. +func TestConfigValidate(t *testing.T) { + if err := DefaultConfig().Validate(); err != nil { + t.Errorf("default config invalid: %v", err) + } + if err := (Config{FlagThreshold: 0, FlagWindow: time.Minute}).Validate(); err == nil { + t.Error("zero threshold passed validation") + } + if err := (Config{FlagThreshold: 1, FlagWindow: 0}).Validate(); err == nil { + t.Error("zero window passed validation") + } +} diff --git a/backend/internal/server/handlers.go b/backend/internal/server/handlers.go index f760409..374df86 100644 --- a/backend/internal/server/handlers.go +++ b/backend/internal/server/handlers.go @@ -37,6 +37,11 @@ func (s *Server) registerRoutes() { // before delivering an out-of-app notification. in.POST("/push-target", s.handlePushTarget) } + if s.ratewatch != nil { + // The gateway's periodic rate-limiter rejection summary (R3): feeds the + // admin console's throttled view and the high-rate auto-flag. + s.internal.POST("/ratelimit/report", s.handleRateLimitReport) + } u := s.user if s.accounts != nil { u.GET("/profile", s.handleProfile) @@ -120,10 +125,8 @@ func gameIDParam(c *gin.Context) (uuid.UUID, bool) { // X-Forwarded-For (the first hop), falling back to the direct peer. func clientIP(c *gin.Context) string { if xff := c.GetHeader("X-Forwarded-For"); xff != "" { - if i := strings.IndexByte(xff, ','); i >= 0 { - return strings.TrimSpace(xff[:i]) - } - return strings.TrimSpace(xff) + first, _, _ := strings.Cut(xff, ",") + return strings.TrimSpace(first) } return c.ClientIP() } diff --git a/backend/internal/server/handlers_admin_console.go b/backend/internal/server/handlers_admin_console.go index bfa3758..e86302f 100644 --- a/backend/internal/server/handlers_admin_console.go +++ b/backend/internal/server/handlers_admin_console.go @@ -19,6 +19,7 @@ import ( "scrabble/backend/internal/adminconsole" "scrabble/backend/internal/engine" "scrabble/backend/internal/game" + "scrabble/backend/internal/ratewatch" "scrabble/backend/internal/robot" "scrabble/backend/internal/social" ) @@ -48,6 +49,8 @@ func (s *Server) registerConsole(router *gin.Engine) { gm.GET("/users", s.consoleUsers) gm.GET("/users/:id", s.consoleUserDetail) gm.POST("/users/:id/message", s.consoleUserMessage) + gm.POST("/users/:id/clear-high-rate-flag", s.consoleClearHighRateFlag) + gm.GET("/throttled", s.consoleThrottled) gm.GET("/games", s.consoleGames) gm.GET("/games/:id", s.consoleGameDetail) gm.GET("/complaints", s.consoleComplaints) @@ -117,7 +120,8 @@ func (s *Server) consoleUsers(c *gin.Context) { } view.Items = append(view.Items, adminconsole.UserRow{ ID: it.ID.String(), DisplayName: it.DisplayName, Kind: kind, - Language: it.PreferredLanguage, Guest: it.IsGuest, CreatedAt: fmtTime(it.CreatedAt), + Language: it.PreferredLanguage, Guest: it.IsGuest, + FlaggedHighRate: !it.FlaggedHighRateAt.IsZero(), CreatedAt: fmtTime(it.CreatedAt), }) ids = append(ids, it.ID) } @@ -257,6 +261,9 @@ func (s *Server) consoleUserDetail(c *gin.Context) { if acc.MergedInto != uuid.Nil { view.MergedInto = acc.MergedInto.String() } + if !acc.FlaggedHighRateAt.IsZero() { + view.FlaggedHighRateAt = fmtTime(acc.FlaggedHighRateAt) + } if view.HasStats { if st, err := s.accounts.GetStats(ctx, id); err == nil { view.Stats = adminconsole.StatsRow{Wins: st.Wins, Losses: st.Losses, Draws: st.Draws, MaxGamePoints: st.MaxGamePoints, MaxWordPoints: st.MaxWordPoints} @@ -551,6 +558,56 @@ func (s *Server) consolePostBroadcast(c *gin.Context) { } } +// consoleThrottled renders the rate-limit observability page: the recent +// gateway-reported throttle episodes (in-memory, reset on a backend restart) +// and the accounts currently carrying the soft high-rate flag (R3). +func (s *Server) consoleThrottled(c *gin.Context) { + ctx := c.Request.Context() + var view adminconsole.ThrottledView + if s.ratewatch != nil { + cfg := s.ratewatch.Config() + view.FlagThreshold = cfg.FlagThreshold + view.FlagWindow = cfg.FlagWindow.String() + for _, ep := range s.ratewatch.Recent() { + row := adminconsole.ThrottleEpisodeRow{ + Class: ep.Class, Key: ep.Key, Rejected: ep.Rejected, + FirstSeen: fmtTime(ep.FirstSeen), LastSeen: fmtTime(ep.LastSeen), + } + if ep.Class == ratewatch.ClassUser { + if id, err := uuid.Parse(ep.Key); err == nil { + row.UserID = id.String() + } + } + view.Episodes = append(view.Episodes, row) + } + } + flagged, err := s.accounts.ListFlaggedHighRate(ctx) + if err != nil { + s.consoleError(c, err) + return + } + for _, fa := range flagged { + view.Flagged = append(view.Flagged, adminconsole.FlaggedAccountRow{ + ID: fa.ID.String(), DisplayName: fa.DisplayName, FlaggedAt: fmtTime(fa.FlaggedHighRateAt), + }) + } + s.renderConsole(c, "throttled", "throttled", "Throttled", view) +} + +// consoleClearHighRateFlag clears the soft high-rate marker — the operator's +// reversible review action (R3). +func (s *Server) consoleClearHighRateFlag(c *gin.Context) { + id, ok := s.consoleUUID(c, "/_gm/users") + if !ok { + return + } + if err := s.accounts.ClearHighRateFlag(c.Request.Context(), id); err != nil { + s.consoleError(c, err) + return + } + s.renderConsoleMessage(c, "Cleared", "high-rate flag cleared", "/_gm/users/"+id.String()) +} + // variantVersions builds the per-variant resident-version summary from the registry. func (s *Server) variantVersions() []adminconsole.VariantVersions { out := make([]adminconsole.VariantVersions, 0, len(engine.Variants())) diff --git a/backend/internal/server/handlers_ratelimit.go b/backend/internal/server/handlers_ratelimit.go new file mode 100644 index 0000000..512a9ae --- /dev/null +++ b/backend/internal/server/handlers_ratelimit.go @@ -0,0 +1,41 @@ +package server + +import ( + "net/http" + + "github.com/gin-gonic/gin" + + "scrabble/backend/internal/ratewatch" +) + +// rateLimitReportRequest mirrors the gateway's periodic rejection summary: every +// entry aggregates one limiter key (class + key) over the report window. +type rateLimitReportRequest struct { + WindowSeconds int `json:"window_seconds"` + Entries []rateLimitReportEntry `json:"entries"` +} + +// rateLimitReportEntry is one (class, key) aggregate of the report. +type rateLimitReportEntry struct { + Class string `json:"class"` + Key string `json:"key"` + Rejected int `json:"rejected"` +} + +// handleRateLimitReport ingests one gateway rejection report into the rate +// watch — the admin console's throttled view and the high-rate auto-flag (R3). +// Internal, gateway-only: like sessions/resolve it trusts the network segment. +// Malformed individual entries are skipped by the watch itself. +func (s *Server) handleRateLimitReport(c *gin.Context) { + var req rateLimitReportRequest + if err := c.ShouldBindJSON(&req); err != nil { + abortBadRequest(c, "invalid rate-limit report") + return + } + entries := make([]ratewatch.Entry, 0, len(req.Entries)) + for _, e := range req.Entries { + entries = append(entries, ratewatch.Entry{Class: e.Class, Key: e.Key, Rejected: e.Rejected}) + } + s.ratewatch.Ingest(c.Request.Context(), entries) + c.Status(http.StatusNoContent) +} diff --git a/backend/internal/server/handlers_test.go b/backend/internal/server/handlers_test.go index d3257dd..c8e086a 100644 --- a/backend/internal/server/handlers_test.go +++ b/backend/internal/server/handlers_test.go @@ -10,6 +10,7 @@ import ( "scrabble/backend/internal/account" "scrabble/backend/internal/game" + "scrabble/backend/internal/ratewatch" "scrabble/backend/internal/session" ) @@ -57,6 +58,23 @@ func TestResolveSessionRejectsEmptyToken(t *testing.T) { } } +// TestRateLimitReportEndpoint covers the internal R3 report route: a malformed +// body is a 400, a valid report lands in the rate watch with 204. +func TestRateLimitReportEndpoint(t *testing.T) { + watch := ratewatch.New(ratewatch.DefaultConfig(), nil, nil) + s := New(":0", Deps{RateWatch: watch}) + if rec := do(t, s, http.MethodPost, "/api/v1/internal/ratelimit/report", `{bad`, nil); rec.Code != http.StatusBadRequest { + t.Fatalf("malformed report = %d, want 400", rec.Code) + } + body := `{"window_seconds":30,"entries":[{"class":"user","key":"` + uuid.NewString() + `","rejected":7}]}` + if rec := do(t, s, http.MethodPost, "/api/v1/internal/ratelimit/report", body, nil); rec.Code != http.StatusNoContent { + t.Fatalf("report = %d, want 204", rec.Code) + } + if eps := watch.Recent(); len(eps) != 1 || eps[0].Rejected != 7 { + t.Fatalf("watch episodes = %+v, want one entry with rejected=7", eps) + } +} + func TestSubmitPlayRejectsBadDirection(t *testing.T) { headers := map[string]string{"X-User-ID": uuid.New().String()} path := "/api/v1/user/games/" + uuid.New().String() + "/play" diff --git a/backend/internal/server/server.go b/backend/internal/server/server.go index 9c914f7..618efdc 100644 --- a/backend/internal/server/server.go +++ b/backend/internal/server/server.go @@ -24,6 +24,7 @@ import ( "scrabble/backend/internal/game" "scrabble/backend/internal/link" "scrabble/backend/internal/lobby" + "scrabble/backend/internal/ratewatch" "scrabble/backend/internal/session" "scrabble/backend/internal/social" "scrabble/backend/internal/telemetry" @@ -71,6 +72,10 @@ type Deps struct { // nil when BACKEND_CONNECTOR_ADDR is unset (broadcasts show a "not configured" // notice). Connector *connector.Client + // RateWatch ingests the gateway's rate-limiter rejection reports (R3): the + // admin console's throttled view + the high-rate auto-flag. A nil RateWatch + // disables the internal report endpoint and the console view. + RateWatch *ratewatch.Watch } // Server owns the gin engine, the underlying HTTP server and the readiness @@ -93,6 +98,7 @@ type Server struct { registry *engine.Registry dictDir string connector *connector.Client + ratewatch *ratewatch.Watch console *adminconsole.Renderer public *gin.RouterGroup @@ -133,6 +139,7 @@ func New(addr string, deps Deps) *Server { registry: deps.Registry, dictDir: deps.DictDir, connector: deps.Connector, + ratewatch: deps.RateWatch, http: &http.Server{Addr: addr, Handler: engine}, } s.registerProbes(engine) diff --git a/deploy/README.md b/deploy/README.md index 9435c8a..5caa0e5 100644 --- a/deploy/README.md +++ b/deploy/README.md @@ -1,9 +1,9 @@ # deploy -The full Scrabble contour: `backend` + `gateway` + Postgres + the Telegram -connector (with a VPN sidecar) + the observability stack (OTel Collector → -Prometheus + Tempo → Grafana), fronted by a **caddy** that owns a single `/_gm` -Basic-Auth (the admin console + Grafana). Topology and the decision record are in +The full Scrabble contour: `backend` + `gateway` + the static `landing` + Postgres + +the Telegram connector (with a VPN sidecar) + the observability stack (OTel +Collector → Prometheus + Tempo → Grafana), fronted by a **caddy** that owns a single +`/_gm` Basic-Auth (the admin console + Grafana). Topology and the decision record are in [`../docs/ARCHITECTURE.md`](../docs/ARCHITECTURE.md) §13; this file is the operational reference for **every environment variable**. @@ -11,8 +11,9 @@ operational reference for **every environment variable**. | Service | Image | Role | | --- | --- | --- | -| `caddy` | `caddy:2-alpine` | Edge proxy (alias `scrabble` on `edge`): single `/_gm` Basic-Auth → admin console + Grafana; everything else → gateway. TLS per `CADDY_SITE_ADDRESS`. | -| `gateway` | built (`gateway/Dockerfile`) | Public edge; serves the embedded landing at `/` and the game SPA at `/app/` + `/telegram/`; Connect-RPC edge. | +| `caddy` | `caddy:2-alpine` | Edge proxy (alias `scrabble` on `edge`): single `/_gm` Basic-Auth → admin console + Grafana; `/app/`, `/telegram/` + the Connect path → gateway; the catch-all (incl. `/`) → landing. TLS per `CADDY_SITE_ADDRESS`. | +| `gateway` | built (`gateway/Dockerfile`, target `gateway`) | Public edge; serves the embedded game SPA at `/app/` + `/telegram/`; Connect-RPC edge. `/` redirects to `/app/`. | +| `landing` | built (`gateway/Dockerfile`, target `landing`) | Static landing page at `/` (caddy:2-alpine + the shared Vite build, `deploy/landing/Caddyfile`); absorbs stray public paths (R3). | | `backend` | built (`backend/Dockerfile`) | Domain service; bakes in the DAWG dictionaries; runs migrations at boot. | | `postgres` | `postgres:17-alpine` | Database (named volume, `pg_isready` healthcheck). | | `vpn` + `telegram` | sidecar + built (`platform/telegram/Dockerfile`) | Telegram connector; egresses through the AmneziaWG sidecar; internal gRPC at `telegram:9091`. | @@ -88,8 +89,9 @@ connector **fails at boot** if both are empty. | `VITE_TELEGRAM_GAME_CHANNEL_NAME_RU` | variable | _(empty)_ | UI build-arg: the landing "Play in Telegram" link for the **Russian** bot (e.g. `https://t.me/Erudit_Game`). | | `VITE_GATEWAY_URL` | variable | _(empty)_ | UI build-arg: gateway origin; empty = same-origin (the usual single-origin deploy). | -The five `VITE_*` are **build-args** baked into the gateway image at build time, so -changing them requires a rebuild (`--build`), not just a restart. +The five `VITE_*` are **build-args** baked into the gateway and landing images at +build time (both targets share one UI build stage — keep the args identical so it is +built once), so changing them requires a rebuild (`--build`), not just a restart. ## Fixed internal wiring (not operator-set) diff --git a/deploy/caddy/Caddyfile b/deploy/caddy/Caddyfile index ba26fbe..8860263 100644 --- a/deploy/caddy/Caddyfile +++ b/deploy/caddy/Caddyfile @@ -1,7 +1,9 @@ # Edge reverse proxy for the Scrabble contour. A single Basic-Auth gate covers # every operator surface under /_gm (the backend-rendered admin console and the -# Grafana subpath); everything else (the SPA at / and /telegram/, plus the -# Connect edge) goes to the gateway. Mirrors ../galaxy-game's /_gm model. +# Grafana subpath); the game SPA (/app/, /telegram/) and the Connect edge go to +# the gateway; the catch-all — notably the public landing at / — goes to the +# static landing container (R3), so stray traffic never reaches the Go edge. +# Mirrors ../galaxy-game's /_gm model. # # CADDY_SITE_ADDRESS is ":80" in the test contour (the host caddy terminates TLS # and forwards); set it to a domain in prod (Stage 18) so this caddy does its own @@ -36,8 +38,14 @@ } } - # The SPA (/, /telegram/) and the Connect edge are served by the gateway. - handle { + # The game SPA and the Connect edge are served by the gateway. + @gateway path /app /app/* /telegram /telegram/* /scrabble.edge.v1.Gateway/* + handle @gateway { reverse_proxy gateway:8081 } + + # Everything else — the public landing at / and any stray path — is static. + handle { + reverse_proxy landing:80 + } } diff --git a/deploy/docker-compose.yml b/deploy/docker-compose.yml index cb071f6..953825a 100644 --- a/deploy/docker-compose.yml +++ b/deploy/docker-compose.yml @@ -75,6 +75,7 @@ services: build: context: .. dockerfile: gateway/Dockerfile + target: gateway args: VITE_TELEGRAM_BOT_ID: ${VITE_TELEGRAM_BOT_ID:-} VITE_TELEGRAM_LINK: ${VITE_TELEGRAM_LINK:-} @@ -100,6 +101,28 @@ services: # caddy owns the /_gm Basic-Auth and routes /_gm to the backend directly. networks: [internal] + # --- Landing (static) ------------------------------------------------------- + # The public landing page in its own caddy container (R3): the contour caddy + # routes the catch-all (notably /) here, the gateway keeps only /app/, + # /telegram/ and the Connect edge. Shares the gateway Dockerfile's UI build + # stage — identical build args keep that stage a single cached build. + landing: + container_name: scrabble-landing + image: scrabble-landing:latest + build: + context: .. + dockerfile: gateway/Dockerfile + target: landing + args: + VITE_TELEGRAM_BOT_ID: ${VITE_TELEGRAM_BOT_ID:-} + VITE_TELEGRAM_LINK: ${VITE_TELEGRAM_LINK:-} + VITE_TELEGRAM_GAME_CHANNEL_NAME_EN: ${VITE_TELEGRAM_GAME_CHANNEL_NAME_EN:-} + VITE_TELEGRAM_GAME_CHANNEL_NAME_RU: ${VITE_TELEGRAM_GAME_CHANNEL_NAME_RU:-} + VITE_GATEWAY_URL: ${VITE_GATEWAY_URL:-} + VITE_APP_VERSION: ${APP_VERSION:-dev} + restart: unless-stopped + networks: [internal] + # --- Telegram connector (egress via the VPN sidecar) ----------------------- vpn: container_name: scrabble-telegram-vpn @@ -145,12 +168,13 @@ services: OTEL_EXPORTER_OTLP_ENDPOINT: http://otelcol:4317 OTEL_EXPORTER_OTLP_INSECURE: "true" - # --- Edge reverse proxy (single /_gm Basic-Auth; SPA + Connect -> gateway) -- + # --- Edge reverse proxy (single /_gm Basic-Auth; SPA + Connect -> gateway; + # the catch-all incl. the landing -> the static landing container) ------- caddy: container_name: scrabble-caddy image: caddy:2-alpine restart: unless-stopped - depends_on: [gateway, backend, grafana] + depends_on: [gateway, backend, grafana, landing] environment: # Test: ":80" (host caddy terminates TLS). Prod: a domain for own ACME. CADDY_SITE_ADDRESS: ${CADDY_SITE_ADDRESS:-:80} diff --git a/deploy/grafana/dashboards/edge-ux.json b/deploy/grafana/dashboards/edge-ux.json index 44b49d6..413a268 100644 --- a/deploy/grafana/dashboards/edge-ux.json +++ b/deploy/grafana/dashboards/edge-ux.json @@ -34,6 +34,18 @@ "fieldConfig": { "defaults": { "unit": "reqps" }, "overrides": [] }, "datasource": { "type": "prometheus", "uid": "prometheus" }, "targets": [{ "refId": "A", "expr": "sum(rate(edge_request_duration_count[5m])) by (result)", "legendFormat": "{{result}}" }] + }, + { + "type": "timeseries", + "title": "Rate limiting — request rate vs rejections (R3)", + "description": "Aggregate only (no per-user labels, the Stage 12/17 discipline): total edge request rate against the limiter rejection rate by class. Per-key detail lives in the admin console's Throttled view.", + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 16 }, + "fieldConfig": { "defaults": { "unit": "reqps" }, "overrides": [] }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { "refId": "A", "expr": "sum(rate(edge_request_duration_count[5m]))", "legendFormat": "requests" }, + { "refId": "B", "expr": "sum(rate(gateway_rate_limited_total[5m])) by (class)", "legendFormat": "rejected · {{class}}" } + ] } ] } diff --git a/deploy/landing/Caddyfile b/deploy/landing/Caddyfile new file mode 100644 index 0000000..f14b522 --- /dev/null +++ b/deploy/landing/Caddyfile @@ -0,0 +1,27 @@ +# Static landing container (R3). Serves the public landing page and the built +# assets it references at /; the game SPA (/app/, /telegram/) and the Connect +# edge stay on the gateway. The contour caddy routes the catch-all here, so +# stray public paths are absorbed by static file serving and never reach the Go +# edge. This file is baked into the image at build time (gateway/Dockerfile, +# target `landing`), not bind-mounted. +{ + admin off +} + +:80 { + root * /srv + encode zstd gzip + + # Mirror the gateway webui caching: hash-named build assets are immutable, + # every HTML shell is no-cache so a new deploy is picked up immediately. + header /assets/* Cache-Control "public, max-age=31536000, immutable" + @shell not path /assets/* + header @shell Cache-Control "no-cache" + + # An unknown path falls back to the landing shell (the gateway's old "/" + # behaviour); "/" itself resolves through the index below. + try_files {path} /landing.html + file_server { + index landing.html + } +} diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 6b6e0f2..d1546a5 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -98,6 +98,15 @@ dropped). Horizontal scaling is explicit future work. response was lost — its button is disabled while offline and the player re-issues it on reconnect). A reachability watcher (a lightweight `profile.get` probe) clears the signal when no other traffic is in flight; the live `Subscribe` stream's drop/recovery feeds the same signal. + **Edge hardening (R3):** every request body on the public listener is capped at + `GATEWAY_MAX_BODY_BYTES` (default 1 MiB — far above any legitimate payload), both at the HTTP + layer (`http.MaxBytesReader`) and as the Connect per-message read limit, so an oversized + `Execute` is refused (`resource_exhausted`) without buffering. The h2c server carries explicit + sizing: `MaxConcurrentStreams` 250 (the x/net default made visible — a real client holds one + `Subscribe` stream plus a few unary calls) and a 3-minute connection `IdleTimeout` (a live + `Subscribe` stream keeps its connection active, so only abandoned connections are reaped); the + `http.Server` sets only `ReadHeaderTimeout` (10 s) — Read/WriteTimeout would kill the stream. + R7 revisits the exact values under load. - **Alphabet on the wire (Stage 13)**: live play exchanges **alphabet indices**, not concrete letters. The rack (`StateView.rack`), the `SubmitPlay`/`Evaluate` tiles, the `Exchange` tiles and the `CheckWord` word are `ubyte` indices into the variant's alphabet @@ -572,6 +581,21 @@ promotions) is future work and would deliver short markdown messages (text + lin distinct accounts that performed an authenticated edge action in the window. The gauge is single-process by design (single-instance MVP, §10): it is correct for one gateway, resets on restart, and is a live operational figure, not a billing count. +- **Rate-limit observability (R3):** every limiter rejection increments the gateway + counter `gateway_rate_limited_total` (`class` = user/public/email/admin — aggregate + only, honouring the no-per-user-label discipline above) and logs one **Debug** line; + a gateway reporter drains the per-key rejection tracker every 30 s, emits one **Warn** + summary per throttled key and posts the report to the backend + (`POST /api/v1/internal/ratelimit/report`, network-trusted like `sessions/resolve`). + The backend's `ratewatch` keeps a bounded in-memory episode window (single-instance, + resets on restart, like `active_users`) surfaced on the admin console's **Throttled** + page next to the flagged-account review queue, and applies the **conservative + auto-flag**: an account sustaining `BACKEND_HIGHRATE_FLAG_THRESHOLD` rejected calls + (default 1000) within `BACKEND_HIGHRATE_FLAG_WINDOW` (default 10 min) gets the soft, + reversible `accounts.flagged_high_rate_at` marker — set once, shown in the user + list/detail, cleared by the operator, **never an automatic ban** and never a request + gate. The Edge/UX dashboard graphs the aggregate request rate against the rejection + rate by class. - Unauthenticated `GET /healthz` (liveness) and `GET /readyz` (readiness — the database answers a bounded ping and the session cache is warmed). - The backend serves a **second listener** — a gRPC server @@ -582,12 +606,12 @@ promotions) is future work and would deliver short markdown messages (text + lin | Concern | Enforced by | | --- | --- | -| Public rate limiting / anti-abuse | gateway | +| Public rate limiting / anti-abuse | gateway (per-IP public/email/admin classes, per-user authenticated class; a request body cap of `GATEWAY_MAX_BODY_BYTES`; rejections are metered, summarised to the backend and surfaced in the admin console with a conservative reversible auto-flag — R3, §11) | | Telegram initData validation (bot-token HMAC) | the Telegram connector; the gateway delegates it over gRPC, so the bot token lives only in the connector | | Session minting; email-code / guest validation | gateway (with backend) | | Session → `user_id` resolution, `X-User-ID` injection | gateway | | Authorisation, ownership, state transitions | backend (`X-User-ID` is the sole identity input) | -| Admin authentication | a single Basic-Auth gate on `/_gm/*`, forwarded **verbatim** to the backend's server-rendered admin console (and, in the deployed contour, routing `/_gm/grafana/*` to Grafana). In the deploy the **caddy** owns this gate (§13); a local non-caddy run uses the gateway's own `GATEWAY_ADMIN_*` proxy. The backend trusts the proxy (no admin principal) and guards its state-changing POSTs with a **same-origin** check — the console's CSRF defence. No operator identity is tracked | +| Admin authentication | a single Basic-Auth gate on `/_gm/*`, forwarded **verbatim** to the backend's server-rendered admin console (and, in the deployed contour, routing `/_gm/grafana/*` to Grafana). In the deploy the **caddy** owns this gate (§13); a local non-caddy run uses the gateway's own `GATEWAY_ADMIN_*` proxy, which the per-IP admin limiter class guards ahead of its Basic-Auth (R3) — the caddy-fronted path has no limiter (stock caddy), an accepted gap. The backend trusts the proxy (no admin principal) and guards its state-changing POSTs with a **same-origin** check — the console's CSRF defence. No operator identity is tracked | | backend ↔ gateway ↔ connector trust | the network (only gateway may reach backend; the connector serves unauthenticated gRPC on the internal segment) | This is an explicit, accepted MVP risk: compromise of the gateway↔backend @@ -597,7 +621,7 @@ mutual auth is a future hardening step. **Short numeric codes** (email confirm-codes and Stage 8 friend codes) are stored only as SHA-256 hashes and are short-lived and single-use. The unauthenticated email path carries a tight per-IP sub-limit (5 / 10 min); the **friend-code redeem** -is authenticated, so it rides the per-user limit (120 / min) and is further bounded +is authenticated, so it rides the per-user limit (300 / min) and is further bounded by the code's 12 h TTL, single use, and **one live code per issuer** (which caps the valid-code population). Brute-forcing a 6-digit friend code within these limits is an accepted MVP risk with low blast radius (an unwanted friendship is removable/blockable); @@ -605,22 +629,27 @@ a dedicated redeem sub-limit or a longer code is the hardening step if abuse app ## 13. Deployment (informational) -Single public origin, path-routed. The gateway **embeds** the static UI build -(`go:embed`, baked in by a node stage in `gateway/Dockerfile`). The Vite build has two -entries: a lightweight **landing page** served at `/`, and the game **SPA** served at +Single public origin, path-routed. The Vite build has two entries: a lightweight +**landing page** and the game **SPA**. The gateway **embeds** the SPA build +(`go:embed`, baked in by a node stage in `gateway/Dockerfile`) and serves it at `/app/` (web) and `/telegram/` (the Telegram Mini App; outside Telegram that path -redirects to the root — the client-side guard). Hash-named `/assets/*` are served +redirects to the root — the client-side guard); a stray hit on the gateway's `/` +308-redirects to `/app/`. The **landing** ships in its own static container (R3): the +`landing` target of `gateway/Dockerfile` (caddy:2-alpine + the same Vite build, +`deploy/landing/Caddyfile`) serves it at `/`, so stray public traffic is absorbed by +static file serving and never reaches the Go edge. Hash-named `/assets/*` are served `immutable` (a relaunch is a cache hit, not a re-download); the HTML shells are -`no-cache` so a new deploy is picked up. An in-compose **caddy** is the -contour's edge: it owns a single `/_gm` Basic-Auth and routes `/_gm/grafana/*` to -**Grafana** (anonymous-admin, so the one shared login gates it with no per-user -Grafana accounts) and the rest of `/_gm/*` to the backend-rendered **admin console**; -everything else (`/`, `/app/`, `/telegram/`, the Connect edge) goes to the gateway. The +`no-cache` so a new deploy is picked up — both containers apply the same caching. An +in-compose **caddy** is the contour's edge: it owns a single `/_gm` Basic-Auth and +routes `/_gm/grafana/*` to **Grafana** (anonymous-admin, so the one shared login gates +it with no per-user Grafana accounts) and the rest of `/_gm/*` to the backend-rendered +**admin console**; `/app/`, `/telegram/` and the Connect path go to the gateway; the +catch-all — notably the landing at `/` — goes to the landing container. The **Telegram connector** runs as a separate container with **no public ingress** — it long-polls Telegram and egresses through a VPN sidecar, answering only internal gRPC. The full contour (`deploy/docker-compose.yml`) runs one `gateway`, one `backend`, -one Postgres, the connector (+ its VPN sidecar) and the **observability stack** — +one Postgres, the static `landing`, the connector (+ its VPN sidecar) and the **observability stack** — OTel Collector (OTLP/gRPC ingest → Prometheus metrics + Tempo traces) and Grafana with provisioned datasources and dashboards. All three services export OTLP to the collector; the connector shares the VPN sidecar's netns, so its `AWG_CONF` must not @@ -633,7 +662,8 @@ network (project-scoped DNS); only caddy joins the shared external `edge` networ Two contours, two secret/variable prefixes (`TEST_` / `PROD_`): - **Test** (Stage 16): auto-deploys on a PR into — or a push to — `development` (`.gitea/workflows/ci.yaml` → `docker compose up -d --build` on the Gitea runner - host, then a `GET /` probe through caddy). The host caddy terminates TLS and + host, then `GET /` + `GET /app/` probes through caddy — the landing container and + the gateway, R3). The host caddy terminates TLS and forwards the domain to `scrabble:80`, so the in-compose caddy serves plain HTTP (`CADDY_SITE_ADDRESS=:80`). The in-compose caddy **trusts X-Forwarded-For from private-range upstreams** (`trusted_proxies private_ranges`), so the real client IP — diff --git a/docs/FUNCTIONAL.md b/docs/FUNCTIONAL.md index c425027..3399b89 100644 --- a/docs/FUNCTIONAL.md +++ b/docs/FUNCTIONAL.md @@ -171,3 +171,11 @@ applied after a reload). When a Telegram connector is configured an operator can **message a user** (by their Telegram identity) or **post to the game channel**. State-changing actions are protected by a same-origin check; the console tracks no operator identity. + +The console also surfaces **rate-limit abuse** (R3): a **Throttled** page lists the +recently throttled users/IPs the gateway reported (an in-memory window — it resets on +a backend restart) and the accounts currently carrying the soft **high-rate flag**. An +account sustaining rejections past a tunable threshold is flagged automatically — +the marker is reversible, shown as a badge in the user list and on the user card, and +**never blocks play**; the operator reviews and clears it from the user card. There is +no automatic ban. diff --git a/docs/FUNCTIONAL_ru.md b/docs/FUNCTIONAL_ru.md index f82266b..ee0e7b3 100644 --- a/docs/FUNCTIONAL_ru.md +++ b/docs/FUNCTIONAL_ru.md @@ -175,3 +175,11 @@ identity, их игры) и **игры** (сводка + места), разби подключён Telegram-коннектор, оператор также может **написать пользователю** (по его Telegram-identity) или **отправить пост в игровой канал**. Изменяющие действия защищены проверкой same-origin; личность оператора не отслеживается. + +Консоль также показывает **злоупотребление лимитами** (R3): страница **Throttled** +перечисляет недавно затроттленных пользователей/IP по отчётам gateway (окно в памяти — +сбрасывается при рестарте backend) и аккаунты с действующим мягким **high-rate +флагом**. Аккаунт, устойчиво превышающий настраиваемый порог отказов, помечается +автоматически — маркер обратим, виден бейджем в списке пользователей и на карточке +аккаунта и **никогда не блокирует игру**; оператор рассматривает и снимает его с +карточки пользователя. Автоматического бана нет. diff --git a/docs/TESTING.md b/docs/TESTING.md index 2319a8e..02dc510 100644 --- a/docs/TESTING.md +++ b/docs/TESTING.md @@ -76,7 +76,14 @@ tests or touching CI. unsubscribe), the transcode round-trips (FlatBuffers↔JSON, X-User-ID forwarding, nested GameView, domain-code surfacing), the admin Basic-Auth reverse proxy (401 / forward), and a full Connect `Execute` path end to end - (guest auth, unauthenticated rejection, unknown message type). The backend gains + (guest auth, unauthenticated rejection, unknown message type). **R3** adds the + edge-hardening cases: an oversized `Execute` payload is refused + (`resource_exhausted`, the `GATEWAY_MAX_BODY_BYTES` cap), a limiter rejection + lands in `gateway_rate_limited_total{class}` and the rejection tracker + (drain/aggregate unit tests), the report POST reaches + `/api/v1/internal/ratelimit/report` with the agreed JSON shape, the `/_gm` + mount is 429-guarded by the per-IP admin class, and the gateway's `/` + 308-redirects to `/app/` (the landing left the embed). The backend gains the **guest** lifecycle (a guest plays an auto-match to a natural end yet accrues no statistics) and the **email-as-login** flow (request/verify, returning user) in `inttest`. Stage 8 adds gateway transcode round-trips for the new social/account @@ -92,7 +99,12 @@ tests or touching CI. 404 when not). Postgres-backed `inttest` drives the **complaint resolution → dictionary-change pipeline** (file → resolve with a disposition → pending change → mark applied), the admin **list/count** read queries, and the **/_gm console over HTTP** - (pages render; a resolve POST needs a same-origin header). + (pages render; a resolve POST needs a same-origin header). **R3** adds `ratewatch` + unit tests (window accumulation, the auto-flag threshold + expiry, the bounded + episode map), the account-store **high-rate flag round-trip** (set-once / clear / + re-flag) and a console flow in `inttest`: a gateway report auto-flags the account, + the **Throttled** page shows the episode and the flagged queue, the user card + carries the marker and the CSRF-guarded **Clear** reverses it. - **Observability & performance** *(Stage 12)* — `pkg/telemetry` unit-tests the exporter selection (`none`/`stdout`/`otlp` build providers; OTLP constructs with no collector; the nil-runtime fallback). The domain metrics are exercised through a manual diff --git a/gateway/Dockerfile b/gateway/Dockerfile index 8a45a94..092fd6c 100644 --- a/gateway/Dockerfile +++ b/gateway/Dockerfile @@ -1,15 +1,18 @@ -# Multi-stage build for the gateway service. A node stage builds the static UI -# (Vite), the result is embedded into the Go binary (gateway/internal/webui/dist), -# and the Go stage — mirroring platform/telegram/Dockerfile — yields a static -# binary shipped on distroless nonroot. So the single binary serves the SPA at / -# and /telegram/ (docs/ARCHITECTURE.md §13) with no separate static container. +# Multi-stage build for the gateway service and the landing image. A node stage +# builds the static UI (Vite) once; the `landing` target serves that build from a +# static caddy container (the public landing at /, R3), while the final gateway +# target embeds it — minus landing.html — into the Go binary +# (gateway/internal/webui/dist), which serves the game SPA at /app/ and +# /telegram/ (docs/ARCHITECTURE.md §13). The Go stage mirrors +# platform/telegram/Dockerfile and ships on distroless nonroot. # # The production UI build vars are image build-args, baked into the bundle. -# Build from the repository root so go.work, pkg/, gateway/ and ui/ are all in the -# Docker context: +# Build from the repository root so go.work, pkg/, gateway/, ui/ and +# deploy/landing/ are all in the Docker context: # docker build -f gateway/Dockerfile \ # --build-arg VITE_GATEWAY_URL=https://example \ # -t scrabble-gateway . +# docker build -f gateway/Dockerfile --target landing -t scrabble-landing . # --- UI build ---------------------------------------------------------------- FROM node:22-alpine AS ui @@ -38,6 +41,14 @@ RUN pnpm install --frozen-lockfile COPY ui ./ RUN pnpm build +# --- landing ------------------------------------------------------------------- +# The public landing page as its own static container (R3): the same Vite build +# served by caddy at /, so stray public traffic is absorbed by static file +# serving and never reaches the Go edge. +FROM caddy:2-alpine AS landing +COPY deploy/landing/Caddyfile /etc/caddy/Caddyfile +COPY --from=ui /ui/dist /srv + # --- Go build ---------------------------------------------------------------- FROM golang:1.26.3-alpine AS build WORKDIR /src @@ -46,9 +57,11 @@ COPY pkg ./pkg COPY gateway ./gateway # Replace the committed placeholder with the freshly built UI before compiling, so -# go:embed bakes the real bundle into the binary. +# go:embed bakes the real bundle into the binary. The landing shell ships in the +# landing image, not in the gateway (R3). RUN rm -rf gateway/internal/webui/dist COPY --from=ui /ui/dist gateway/internal/webui/dist +RUN rm gateway/internal/webui/dist/landing.html # Reduce the workspace to what the gateway needs: gateway + pkg (loadtest is not in # this context; its scrabble/gateway replace targets ./gateway, which is present here). @@ -56,6 +69,6 @@ RUN go work edit -dropuse=./backend -dropuse=./platform/telegram -dropuse=./load RUN CGO_ENABLED=0 GOOS=linux go build -trimpath -o /out/gateway ./gateway/cmd/gateway # --- runtime ----------------------------------------------------------------- -FROM gcr.io/distroless/static-debian12:nonroot +FROM gcr.io/distroless/static-debian12:nonroot AS gateway COPY --from=build /out/gateway /usr/local/bin/gateway ENTRYPOINT ["/usr/local/bin/gateway"] diff --git a/gateway/README.md b/gateway/README.md index af4201f..d162bc0 100644 --- a/gateway/README.md +++ b/gateway/README.md @@ -23,7 +23,7 @@ proto/edge/v1/ # Connect envelope contract (committed generated Go) internal/config/ # GATEWAY_* env config internal/backendclient/ # typed REST client (+ X-User-ID) and push gRPC client internal/session/ # in-memory session cache (LRU/TTL, backend fallback) -internal/ratelimit/ # token-bucket limiter (golang.org/x/time/rate) +internal/ratelimit/ # token-bucket limiter (golang.org/x/time/rate) + the rejection tracker (R3) internal/connector/ # gRPC client to the Telegram connector (initData validate, out-of-app push) + routing internal/push/ # live-event fan-out hub (per-user client streams) internal/transcode/ # FlatBuffers<->REST bridge + message_type registry @@ -79,12 +79,21 @@ connector (`ValidateLoginWidget`) and forward the trusted `external_id`. These | `GATEWAY_SESSION_TTL` | `10m` | cached session lifetime | | `GATEWAY_SESSION_CACHE_MAX` | `50000` | cached session cap | | `GATEWAY_PUSH_HEARTBEAT_INTERVAL` | `10s` | live-stream keep-alive (an immediate heartbeat also fires on open, under the ~15s edge idle timeout) | +| `GATEWAY_MAX_BODY_BYTES` | `1048576` | caps one request body and one Connect message read; an oversized Execute is refused with `resource_exhausted` (R3) | | `GATEWAY_SERVICE_NAME` | `scrabble-gateway` | OpenTelemetry `service.name` | | `GATEWAY_OTEL_TRACES_EXPORTER` | `none` | `none`, `stdout` or `otlp` (gRPC; endpoint from `OTEL_EXPORTER_OTLP_*`) | | `GATEWAY_OTEL_METRICS_EXPORTER` | `none` | `none`, `stdout` or `otlp` | Rate-limit defaults (built-in): public 30/min·IP (burst 10), authenticated -120/min·user (burst 40), admin 60/min·IP (burst 20), email-code 5/10 min·IP. +300/min·user (burst 80, raised in Stage 17 for multi-device play), admin +60/min·IP (burst 20, guarding the `/_gm` mount ahead of its Basic-Auth), +email-code 5/10 min·IP (burst 2). + +Every rejection increments `gateway_rate_limited_total{class}` +(`user`/`public`/`email`/`admin`) and logs one Debug line; a reporter drains the +per-key rejection tracker every 30 s, emits a Warn summary per throttled key and +posts the report to the backend (`/api/v1/internal/ratelimit/report`), feeding +the admin console's throttled view and the high-rate auto-flag (R3). ## Run diff --git a/gateway/cmd/gateway/main.go b/gateway/cmd/gateway/main.go index 458b287..d6439f0 100644 --- a/gateway/cmd/gateway/main.go +++ b/gateway/cmd/gateway/main.go @@ -39,6 +39,14 @@ const ( pushReconnectDelay = 2 * time.Second // gatewayID identifies this gateway instance to the backend push channel. gatewayID = "gateway" + // readHeaderTimeout bounds reading one request's headers on the public + // listener (a slowloris guard). Bodies and long-lived streams are governed by + // the h2c settings in connectsrv — Read/WriteTimeout stay unset on purpose, + // they would kill the Subscribe stream (R3). + readHeaderTimeout = 10 * time.Second + // throttleReportInterval is the cadence of the rate-limiter rejection + // summary: the Warn log per throttled key and the report to the backend (R3). + throttleReportInterval = 30 * time.Second ) func main() { @@ -89,6 +97,7 @@ func run(ctx context.Context, cfg config.Config, logger *zap.Logger) error { sessions := session.NewCache(backend, cfg.SessionTTL, cfg.SessionCacheMax) limiter := ratelimit.New() + tracker := ratelimit.NewTracker() hub := push.NewHub(0) var conn *connector.Client @@ -119,22 +128,26 @@ func run(ctx context.Context, cfg config.Config, logger *zap.Logger) error { registry := transcode.NewRegistry(backend, validator, cfg.DefaultSupportedLanguages...) edge := connectsrv.NewServer(connectsrv.Deps{ - Registry: registry, - Sessions: sessions, - Limiter: limiter, - Hub: hub, - RateLimit: cfg.RateLimit, - Heartbeat: cfg.PushHeartbeatInterval, - Logger: logger, - AdminProxy: adminProxy, - Meter: tel.MeterProvider().Meter("scrabble/gateway/edge"), + Registry: registry, + Sessions: sessions, + Limiter: limiter, + Tracker: tracker, + Hub: hub, + RateLimit: cfg.RateLimit, + Heartbeat: cfg.PushHeartbeatInterval, + Logger: logger, + AdminProxy: adminProxy, + Meter: tel.MeterProvider().Meter("scrabble/gateway/edge"), + MaxBodyBytes: cfg.MaxBodyBytes, }) // Bridge the backend push stream into the fan-out hub (and the out-of-app // channel via the connector). go runPushPump(ctx, backend, hub, conn, logger) + // Periodically summarise rate-limiter rejections (Warn log + backend report). + go runThrottleReporter(ctx, tracker, backend, logger) - public := &http.Server{Addr: cfg.HTTPAddr, Handler: edge.HTTPHandler()} + public := &http.Server{Addr: cfg.HTTPAddr, Handler: edge.HTTPHandler(), ReadHeaderTimeout: readHeaderTimeout} servers := []*namedServer{{name: "public", srv: public}} logger.Info("gateway starting", @@ -182,6 +195,37 @@ func runServers(ctx context.Context, cancel context.CancelFunc, servers []*named return first } +// runThrottleReporter drains the rate-limiter rejection tracker on a fixed +// cadence, emits one Warn summary per throttled key and forwards the report to +// the backend (which feeds the admin throttled view and the high-rate +// auto-flag), until the context is done. A failed delivery is logged and +// dropped — the next window reports fresh data anyway. +func runThrottleReporter(ctx context.Context, tracker *ratelimit.Tracker, backend *backendclient.Client, logger *zap.Logger) { + ticker := time.NewTicker(throttleReportInterval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + } + entries := tracker.Drain() + if len(entries) == 0 { + continue + } + for _, e := range entries { + logger.Warn("rate limited", + zap.String("class", e.Class), + zap.String("key", e.Key), + zap.Int("rejected", e.Rejected), + zap.Duration("window", throttleReportInterval)) + } + if err := backend.ReportRateLimited(ctx, int(throttleReportInterval.Seconds()), entries); err != nil { + logger.Warn("rate-limit report failed", zap.Error(err)) + } + } +} + // runPushPump keeps a backend push subscription open, forwarding every event to // the hub and re-subscribing after the stream ends, until the context is done. For // the out-of-app push kinds it also routes events whose recipient has no live diff --git a/gateway/internal/backendclient/client.go b/gateway/internal/backendclient/client.go index c63f2fb..d1caeac 100644 --- a/gateway/internal/backendclient/client.go +++ b/gateway/internal/backendclient/client.go @@ -18,6 +18,7 @@ import ( "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" + "scrabble/gateway/internal/ratelimit" pushv1 "scrabble/pkg/proto/push/v1" ) @@ -124,3 +125,15 @@ func parseAPIError(status int, data []byte) *APIError { func (c *Client) SubscribePush(ctx context.Context, gatewayID string) (grpc.ServerStreamingClient[pushv1.Event], error) { return c.push.Subscribe(ctx, &pushv1.SubscribeRequest{GatewayId: gatewayID}) } + +// ReportRateLimited posts the gateway's periodic rate-limiter rejection summary +// to the backend, which feeds the admin console's throttled view and the +// high-rate auto-flag. The endpoint carries no user identity: like +// sessions/resolve it rides the trusted internal segment (R3). +func (c *Client) ReportRateLimited(ctx context.Context, windowSeconds int, entries []ratelimit.Rejection) error { + body := struct { + WindowSeconds int `json:"window_seconds"` + Entries []ratelimit.Rejection `json:"entries"` + }{WindowSeconds: windowSeconds, Entries: entries} + return c.do(ctx, http.MethodPost, "/api/v1/internal/ratelimit/report", "", "", body, nil) +} diff --git a/gateway/internal/backendclient/client_test.go b/gateway/internal/backendclient/client_test.go new file mode 100644 index 0000000..3ad3612 --- /dev/null +++ b/gateway/internal/backendclient/client_test.go @@ -0,0 +1,48 @@ +package backendclient_test + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + "time" + + "scrabble/gateway/internal/backendclient" + "scrabble/gateway/internal/ratelimit" +) + +// TestReportRateLimited verifies the rejection report reaches the backend's +// internal endpoint with the agreed JSON shape and no user identity. +func TestReportRateLimited(t *testing.T) { + var got struct { + WindowSeconds int `json:"window_seconds"` + Entries []ratelimit.Rejection `json:"entries"` + } + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost || r.URL.Path != "/api/v1/internal/ratelimit/report" { + t.Errorf("call = %s %s, want POST /api/v1/internal/ratelimit/report", r.Method, r.URL.Path) + } + if uid := r.Header.Get("X-User-ID"); uid != "" { + t.Errorf("X-User-ID = %q, want empty", uid) + } + if err := json.NewDecoder(r.Body).Decode(&got); err != nil { + t.Errorf("decode report: %v", err) + } + })) + defer srv.Close() + + c, err := backendclient.New(srv.URL, "localhost:9090", 2*time.Second) + if err != nil { + t.Fatalf("backendclient: %v", err) + } + defer func() { _ = c.Close() }() + + entries := []ratelimit.Rejection{{Class: "user", Key: "u-1", Rejected: 5}} + if err := c.ReportRateLimited(context.Background(), 30, entries); err != nil { + t.Fatalf("ReportRateLimited: %v", err) + } + if got.WindowSeconds != 30 || len(got.Entries) != 1 || got.Entries[0] != entries[0] { + t.Fatalf("backend received %+v, want window 30 + %+v", got, entries[0]) + } +} diff --git a/gateway/internal/config/config.go b/gateway/internal/config/config.go index e035687..3800be6 100644 --- a/gateway/internal/config/config.go +++ b/gateway/internal/config/config.go @@ -44,6 +44,9 @@ type Config struct { SessionCacheMax int // PushHeartbeatInterval is the idle keep-alive cadence on a client live stream. PushHeartbeatInterval time.Duration + // MaxBodyBytes caps one inbound request body on the public listener and one + // Connect message read; oversized requests are refused without buffering. + MaxBodyBytes int // RateLimit configures the in-memory anti-abuse limiter. RateLimit RateLimitConfig // Telemetry configures the OpenTelemetry providers (shared bootstrap). @@ -77,6 +80,11 @@ const ( defaultServiceName = "scrabble-gateway" ) +// DefaultMaxBodyBytes is the default request-body cap (GATEWAY_MAX_BODY_BYTES): +// 1 MiB — far above any legitimate edge payload (drafts and chat are a few KB) +// yet small enough to stop a cheap memory-amplification upload (R3). +const DefaultMaxBodyBytes = 1 << 20 + // supportedLanguages is the set of game languages a service may declare for the // New Game variant gating; defaultSupportedLanguages is the non-platform fallback. var ( @@ -130,6 +138,9 @@ func Load() (Config, error) { if c.PushHeartbeatInterval, err = envDuration("GATEWAY_PUSH_HEARTBEAT_INTERVAL", defaultPushHeartbeatInterval); err != nil { return Config{}, err } + if c.MaxBodyBytes, err = envInt("GATEWAY_MAX_BODY_BYTES", DefaultMaxBodyBytes); err != nil { + return Config{}, err + } if c.DefaultSupportedLanguages, err = envLanguages("GATEWAY_DEFAULT_SUPPORTED_LANGUAGES", defaultSupportedLanguages); err != nil { return Config{}, err } @@ -161,6 +172,9 @@ func (c Config) validate() error { if c.BackendGRPCAddr == "" { return fmt.Errorf("config: GATEWAY_BACKEND_GRPC_ADDR must not be empty") } + if c.MaxBodyBytes <= 0 { + return fmt.Errorf("config: GATEWAY_MAX_BODY_BYTES must be positive") + } if err := c.Telemetry.Validate(); err != nil { return fmt.Errorf("config: %w", err) } diff --git a/gateway/internal/config/config_test.go b/gateway/internal/config/config_test.go index 8f9afcd..9667ea4 100644 --- a/gateway/internal/config/config_test.go +++ b/gateway/internal/config/config_test.go @@ -29,3 +29,19 @@ func TestLoadRejectsUnsupportedExporter(t *testing.T) { t.Fatal("Load: expected an error for an unsupported exporter, got nil") } } + +// TestLoadMaxBodyBytes verifies the body-cap default and that a non-positive +// override fails validation. +func TestLoadMaxBodyBytes(t *testing.T) { + c, err := Load() + if err != nil { + t.Fatalf("Load: %v", err) + } + if c.MaxBodyBytes != DefaultMaxBodyBytes { + t.Errorf("MaxBodyBytes = %d, want %d", c.MaxBodyBytes, DefaultMaxBodyBytes) + } + t.Setenv("GATEWAY_MAX_BODY_BYTES", "0") + if _, err := Load(); err == nil { + t.Fatal("Load: expected an error for a non-positive body cap, got nil") + } +} diff --git a/gateway/internal/connectsrv/metrics.go b/gateway/internal/connectsrv/metrics.go index a08c9c2..ab18bf1 100644 --- a/gateway/internal/connectsrv/metrics.go +++ b/gateway/internal/connectsrv/metrics.go @@ -24,8 +24,9 @@ var activeUserWindows = []struct { // serverMetrics holds the edge's operational instruments. It defaults to no-ops; // NewServer installs the real meter when one is supplied in Deps. type serverMetrics struct { - edge metric.Float64Histogram - active *activeUsers + edge metric.Float64Histogram + rateLimited metric.Int64Counter + active *activeUsers } // newServerMetrics builds the instruments on meter (nil selects a no-op meter), @@ -42,7 +43,12 @@ func newServerMetrics(meter metric.Meter) *serverMetrics { if err != nil { h, _ = noop.NewMeterProvider().Meter(meterName).Float64Histogram("edge_request_duration") } - m := &serverMetrics{edge: h, active: newActiveUsers()} + c, err := meter.Int64Counter("gateway_rate_limited_total", + metric.WithDescription("Rate-limiter rejections at the edge, by limiter class (user, public, email or admin) — aggregate only, no per-user attributes.")) + if err != nil { + c, _ = noop.NewMeterProvider().Meter(meterName).Int64Counter("gateway_rate_limited_total") + } + m := &serverMetrics{edge: h, rateLimited: c, active: newActiveUsers()} gauge, err := meter.Int64ObservableGauge("active_users", metric.WithDescription("Distinct accounts that performed an authenticated action within the window (in-memory, single gateway instance).")) @@ -75,3 +81,8 @@ func (m *serverMetrics) recordEdge(ctx context.Context, msgType, result string, func (m *serverMetrics) recordActive(uid string) { m.active.seen(uid) } + +// recordRateLimited counts one limiter rejection under class. +func (m *serverMetrics) recordRateLimited(ctx context.Context, class string) { + m.rateLimited.Add(ctx, 1, metric.WithAttributes(attribute.String("class", class))) +} diff --git a/gateway/internal/connectsrv/metrics_test.go b/gateway/internal/connectsrv/metrics_test.go index 80987cc..2f0910c 100644 --- a/gateway/internal/connectsrv/metrics_test.go +++ b/gateway/internal/connectsrv/metrics_test.go @@ -52,3 +52,41 @@ func TestEdgeMetric(t *testing.T) { t.Errorf("edge auth.guest/domain = %d, want 1", got) } } + +// TestRateLimitedMetric records limiter rejections through a manual reader and +// asserts gateway_rate_limited_total splits by class. +func TestRateLimitedMetric(t *testing.T) { + ctx := context.Background() + reader := sdkmetric.NewManualReader() + meter := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)).Meter("test") + m := newServerMetrics(meter) + + m.recordRateLimited(ctx, "user") + m.recordRateLimited(ctx, "user") + m.recordRateLimited(ctx, "public") + + var rm metricdata.ResourceMetrics + if err := reader.Collect(ctx, &rm); err != nil { + t.Fatalf("collect: %v", err) + } + + counts := map[string]int64{} + for _, sm := range rm.ScopeMetrics { + for _, md := range sm.Metrics { + if md.Name != "gateway_rate_limited_total" { + continue + } + sum, ok := md.Data.(metricdata.Sum[int64]) + if !ok { + t.Fatalf("gateway_rate_limited_total is not an int64 sum") + } + for _, dp := range sum.DataPoints { + class, _ := dp.Attributes.Value(attribute.Key("class")) + counts[class.AsString()] += dp.Value + } + } + } + if counts["user"] != 2 || counts["public"] != 1 { + t.Errorf("rate_limited counts = %v, want user=2 public=1", counts) + } +} diff --git a/gateway/internal/connectsrv/server.go b/gateway/internal/connectsrv/server.go index 455ad3d..f1880d3 100644 --- a/gateway/internal/connectsrv/server.go +++ b/gateway/internal/connectsrv/server.go @@ -8,6 +8,7 @@ package connectsrv import ( "context" + "errors" "net" "net/http" "strings" @@ -19,6 +20,7 @@ import ( "golang.org/x/net/http2" "golang.org/x/net/http2/h2c" + "scrabble/gateway/internal/backendclient" "scrabble/gateway/internal/config" "scrabble/gateway/internal/push" "scrabble/gateway/internal/ratelimit" @@ -32,33 +34,68 @@ import ( // heartbeatKind is the live-stream keep-alive event kind. const heartbeatKind = "heartbeat" +// Limiter classes, the `class` attribute of gateway_rate_limited_total and the +// class field of the periodic rejection report (R3). +const ( + classUser = "user" + classPublic = "public" + classEmail = "email" + classAdmin = "admin" +) + +// Explicit h2c server sizing (R3, after the R2 stress run questioned the +// implicit defaults). +const ( + // h2cMaxConcurrentStreams bounds the open streams per client connection — the + // x/net default made explicit. A real client holds one Subscribe stream plus a + // few unary calls; only a synthetic load multiplexing many players over one + // transport approaches it. R7 revisits the sizing. + h2cMaxConcurrentStreams = 250 + // h2cIdleTimeout closes a connection with no open streams. A live Subscribe + // stream keeps its connection active, so long-lived clients are unaffected; + // only abandoned connections are reaped. + h2cIdleTimeout = 3 * time.Minute +) + // Server implements edgev1connect.GatewayHandler. type Server struct { registry *transcode.Registry sessions *session.Cache limiter *ratelimit.Limiter + tracker *ratelimit.Tracker hub *push.Hub heartbeat time.Duration log *zap.Logger adminProxy http.Handler metrics *serverMetrics + maxBodyBytes int + publicPolicy ratelimit.Policy userPolicy ratelimit.Policy emailPolicy ratelimit.Policy + adminPolicy ratelimit.Policy } -// Deps carries the Server's dependencies. +// Deps carries the Server's dependencies. A nil Limiter, nil Tracker, zero +// RateLimit and non-positive MaxBodyBytes each select a safe default. type Deps struct { - Registry *transcode.Registry - Sessions *session.Cache - Limiter *ratelimit.Limiter + Registry *transcode.Registry + Sessions *session.Cache + Limiter *ratelimit.Limiter + // Tracker accumulates limiter rejections for the periodic report; nil + // selects a private tracker (rejections are then only counted, never + // reported). + Tracker *ratelimit.Tracker Hub *push.Hub RateLimit config.RateLimitConfig Heartbeat time.Duration Logger *zap.Logger AdminProxy http.Handler Meter metric.Meter + // MaxBodyBytes caps one inbound request body and one Connect message read; + // zero or negative selects config.DefaultMaxBodyBytes. + MaxBodyBytes int } // NewServer constructs the edge service. @@ -67,47 +104,85 @@ func NewServer(d Deps) *Server { if log == nil { log = zap.NewNop() } + maxBody := d.MaxBodyBytes + if maxBody <= 0 { + maxBody = config.DefaultMaxBodyBytes + } + tracker := d.Tracker + if tracker == nil { + tracker = ratelimit.NewTracker() + } + limiter := d.Limiter + if limiter == nil { + limiter = ratelimit.New() + } + rl := d.RateLimit + if rl == (config.RateLimitConfig{}) { + rl = config.DefaultRateLimit() + } return &Server{ registry: d.Registry, sessions: d.Sessions, - limiter: d.Limiter, + limiter: limiter, + tracker: tracker, hub: d.Hub, heartbeat: d.Heartbeat, log: log, adminProxy: d.AdminProxy, metrics: newServerMetrics(d.Meter), - publicPolicy: ratelimit.PerMinute(d.RateLimit.PublicPerMinute, d.RateLimit.PublicBurst), - userPolicy: ratelimit.PerMinute(d.RateLimit.UserPerMinute, d.RateLimit.UserBurst), - emailPolicy: ratelimit.Per(d.RateLimit.EmailPer10Min, 10*time.Minute, d.RateLimit.EmailBurst), + maxBodyBytes: maxBody, + publicPolicy: ratelimit.PerMinute(rl.PublicPerMinute, rl.PublicBurst), + userPolicy: ratelimit.PerMinute(rl.UserPerMinute, rl.UserBurst), + emailPolicy: ratelimit.Per(rl.EmailPer10Min, 10*time.Minute, rl.EmailBurst), + adminPolicy: ratelimit.PerMinute(rl.AdminPerMinute, rl.AdminBurst), } } // HTTPHandler returns the h2c-wrapped Connect handler ready to serve. func (s *Server) HTTPHandler() http.Handler { mux := http.NewServeMux() - path, h := edgev1connect.NewGatewayHandler(s) + // The Connect read cap mirrors the HTTP-level body cap below; an oversized + // Execute message is refused (resource_exhausted) instead of buffered. + path, h := edgev1connect.NewGatewayHandler(s, connect.WithReadMaxBytes(s.maxBodyBytes)) mux.Handle(path, h) if s.adminProxy != nil { // The admin console (backend /_gm) is served on the public listener behind // the proxy's Basic-Auth, mounted below the h2c wrap so the Connect edge keeps // working over h2c (docs/ARCHITECTURE.md §12). In the deployed contour the // front caddy owns the /_gm Basic-Auth and Grafana routing; this mount serves - // a non-caddy (local) setup. - mux.Handle("/_gm/", s.adminProxy) + // a non-caddy (local) setup. The per-IP admin limiter class guards it — + // notably a Basic-Auth brute force (R3). + mux.Handle("/_gm/", s.limitAdmin(s.adminProxy)) } else { // With the console disabled here, keep /_gm a 404 so the SPA catch-all below // does not serve the app shell at the operator path. mux.Handle("/_gm/", http.NotFoundHandler()) } - // The embedded UI: the game SPA under /app/ (web) and /telegram/ (the Telegram Mini - // App), with a separate landing page at the catch-all "/" — the single-origin model - // (docs/ARCHITECTURE.md §13). All sit below the h2c wrap so the Connect edge (a more - // specific prefix) keeps priority. Each SPA mount falls back to the app shell - // (index.html) for the hash router; "/" falls back to the landing (landing.html). + // The embedded UI: the game SPA under /app/ (web) and /telegram/ (the Telegram + // Mini App) — the single-origin model (docs/ARCHITECTURE.md §13). Both sit below + // the h2c wrap so the Connect edge (a more specific prefix) keeps priority, and + // each mount falls back to the app shell (index.html) for the hash router. The + // public landing moved to its own static container behind the contour caddy + // (R3), so the catch-all redirects a stray root hit to the app shell — which + // keeps a local no-caddy run usable. mux.Handle("/telegram/", webui.Handler("/telegram/", "index.html")) mux.Handle("/app/", webui.Handler("/app/", "index.html")) - mux.Handle("/", webui.Handler("", "landing.html")) - return h2c.NewHandler(mux, &http2.Server{}) + mux.Handle("/", http.RedirectHandler("/app/", http.StatusPermanentRedirect)) + // Every request body on the public listener is capped (the admin proxy POSTs + // included); the h2c server carries explicit stream/idle sizing (R3). + return h2c.NewHandler(maxBodyHandler(s.maxBodyBytes, mux), &http2.Server{ + MaxConcurrentStreams: h2cMaxConcurrentStreams, + IdleTimeout: h2cIdleTimeout, + }) +} + +// maxBodyHandler caps every inbound request body at limit bytes: a read past the +// cap fails with *http.MaxBytesError and the connection is marked to close. +func maxBodyHandler(limit int, next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + r.Body = http.MaxBytesReader(w, r.Body, int64(limit)) + next.ServeHTTP(w, r) + }) } // Execute runs one unary operation. Domain failures are returned in the envelope @@ -138,17 +213,17 @@ func (s *Server) Execute(ctx context.Context, req *connect.Request[edgev1.Execut s.metrics.recordActive(uid) if !s.limiter.Allow("user:"+uid, s.userPolicy) { result = "rate_limited" - return nil, connect.NewError(connect.CodeResourceExhausted, errRateLimited) + return nil, s.rejectRateLimited(ctx, classUser, uid, msgType) } tr.UserID = uid } else { if !s.limiter.Allow("ip:"+clientIP, s.publicPolicy) { result = "rate_limited" - return nil, connect.NewError(connect.CodeResourceExhausted, errRateLimited) + return nil, s.rejectRateLimited(ctx, classPublic, clientIP, msgType) } if op.Email && !s.limiter.Allow("email:"+clientIP, s.emailPolicy) { result = "rate_limited" - return nil, connect.NewError(connect.CodeResourceExhausted, errRateLimited) + return nil, s.rejectRateLimited(ctx, classEmail, clientIP, msgType) } } @@ -180,7 +255,7 @@ func (s *Server) Subscribe(ctx context.Context, req *connect.Request[edgev1.Subs return err } if !s.limiter.Allow("user:"+uid, s.userPolicy) { - return connect.NewError(connect.CodeResourceExhausted, errRateLimited) + return s.rejectRateLimited(ctx, classUser, uid, "subscribe") } events, cancel := s.hub.Subscribe(uid) @@ -216,6 +291,43 @@ func (s *Server) Subscribe(ctx context.Context, req *connect.Request[edgev1.Subs } } +// noteRateLimited accounts one limiter rejection: the aggregate counter, the +// per-rejection Debug line and the periodic-report tracker. The operational +// signal is the reporter's Warn summary; per-rejection logging stays at Debug so +// a rejection flood cannot flood the log (R3). +func (s *Server) noteRateLimited(ctx context.Context, class, key, msgType string) { + s.metrics.recordRateLimited(ctx, class) + s.tracker.Add(class, key) + s.log.Debug("rate limited", + zap.String("class", class), + zap.String("key", key), + zap.String("message_type", msgType)) +} + +// rejectRateLimited accounts one limiter rejection and returns the Connect error +// for the caller. +func (s *Server) rejectRateLimited(ctx context.Context, class, key, msgType string) error { + s.noteRateLimited(ctx, class, key, msgType) + return connect.NewError(connect.CodeResourceExhausted, errRateLimited) +} + +// limitAdmin guards the admin proxy with the per-IP admin limiter class, ahead +// of its Basic-Auth check (a credential brute force is exactly what it bounds). +// It covers the gateway-fronted /_gm mount; in the deployed contour /_gm reaches +// the backend through caddy, whose Basic-Auth has no limiter (stock caddy) — see +// docs/ARCHITECTURE.md §12 (R3). +func (s *Server) limitAdmin(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + ip := peerIP(r.RemoteAddr, r.Header) + if !s.limiter.Allow("admin:"+ip, s.adminPolicy) { + s.noteRateLimited(r.Context(), classAdmin, ip, "admin") + http.Error(w, "rate limited", http.StatusTooManyRequests) + return + } + next.ServeHTTP(w, r) + }) +} + // resolve extracts and resolves the Authorization bearer token to an account id, // returning a Connect Unauthenticated error when it is missing or unknown. func (s *Server) resolve(ctx context.Context, h http.Header) (string, error) { @@ -225,6 +337,15 @@ func (s *Server) resolve(ctx context.Context, h http.Header) (string, error) { } uid, err := s.sessions.Resolve(ctx, token) if err != nil { + // An unknown or expired token (a backend 4xx) is the client's problem and + // stays silent; anything else — a resolve timeout, a refused connection, a + // backend 5xx — is an infra failure misread as "unauthenticated" by the + // client, so surface the cause (the transient resolves seen under load in + // the R2 stress run). The token itself is never logged. + var apiErr *backendclient.APIError + if !errors.As(err, &apiErr) || apiErr.Status >= http.StatusInternalServerError { + s.log.Warn("session resolve failed", zap.Error(err)) + } return "", connect.NewError(connect.CodeUnauthenticated, errInvalidSession) } return uid, nil @@ -247,10 +368,8 @@ func bearerToken(header string) string { // peer address (host part). func peerIP(peerAddr string, h http.Header) string { if xff := h.Get("X-Forwarded-For"); xff != "" { - if i := strings.IndexByte(xff, ','); i >= 0 { - return strings.TrimSpace(xff[:i]) - } - return strings.TrimSpace(xff) + first, _, _ := strings.Cut(xff, ",") + return strings.TrimSpace(first) } if host, _, err := net.SplitHostPort(peerAddr); err == nil { return host diff --git a/gateway/internal/connectsrv/server_test.go b/gateway/internal/connectsrv/server_test.go index 59fcadf..d5c8f52 100644 --- a/gateway/internal/connectsrv/server_test.go +++ b/gateway/internal/connectsrv/server_test.go @@ -83,6 +83,140 @@ func TestExecuteAuthedRequiresSession(t *testing.T) { } } +// TestExecuteRateLimitedTracked verifies a limiter rejection returns +// ResourceExhausted and lands in the rejection tracker under the public class, +// keyed by the client IP (R3). +func TestExecuteRateLimitedTracked(t *testing.T) { + backendSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write([]byte(`{"token":"tok","user_id":"u-1","is_guest":true,"display_name":"Guest"}`)) + })) + defer backendSrv.Close() + backend, err := backendclient.New(backendSrv.URL, "localhost:9090", 2*time.Second) + if err != nil { + t.Fatalf("backendclient: %v", err) + } + defer func() { _ = backend.Close() }() + + limits := config.DefaultRateLimit() + limits.PublicPerMinute, limits.PublicBurst = 1, 1 + tracker := ratelimit.NewTracker() + edge := connectsrv.NewServer(connectsrv.Deps{ + Registry: transcode.NewRegistry(backend, nil), + Sessions: session.NewCache(backend, time.Minute, 100), + Limiter: ratelimit.New(), + Tracker: tracker, + Hub: push.NewHub(0), + RateLimit: limits, + Heartbeat: 15 * time.Second, + }) + edgeSrv := httptest.NewServer(edge.HTTPHandler()) + defer edgeSrv.Close() + client := edgev1connect.NewGatewayClient(http.DefaultClient, edgeSrv.URL) + + if _, err := client.Execute(context.Background(), connect.NewRequest(&edgev1.ExecuteRequest{ + MessageType: transcode.MsgAuthGuest, + })); err != nil { + t.Fatalf("first execute: %v", err) + } + _, err = client.Execute(context.Background(), connect.NewRequest(&edgev1.ExecuteRequest{ + MessageType: transcode.MsgAuthGuest, + })) + if connect.CodeOf(err) != connect.CodeResourceExhausted { + t.Fatalf("code = %v, want ResourceExhausted", connect.CodeOf(err)) + } + + entries := tracker.Drain() + if len(entries) != 1 { + t.Fatalf("tracker drained %d entries, want 1", len(entries)) + } + if e := entries[0]; e.Class != "public" || e.Key != "127.0.0.1" || e.Rejected != 1 { + t.Fatalf("tracked %+v, want public/127.0.0.1 rejected=1", e) + } +} + +// TestAdminMountRateLimited verifies the /_gm mount is guarded by the per-IP +// admin limiter class ahead of the proxy's Basic-Auth (R3). +func TestAdminMountRateLimited(t *testing.T) { + backendSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {})) + defer backendSrv.Close() + backend, err := backendclient.New(backendSrv.URL, "localhost:9090", 2*time.Second) + if err != nil { + t.Fatalf("backendclient: %v", err) + } + defer func() { _ = backend.Close() }() + + limits := config.DefaultRateLimit() + limits.AdminPerMinute, limits.AdminBurst = 1, 1 + edge := connectsrv.NewServer(connectsrv.Deps{ + Registry: transcode.NewRegistry(backend, nil), + Sessions: session.NewCache(backend, time.Minute, 100), + Limiter: ratelimit.New(), + Hub: push.NewHub(0), + RateLimit: limits, + Heartbeat: 15 * time.Second, + AdminProxy: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + }), + }) + edgeSrv := httptest.NewServer(edge.HTTPHandler()) + defer edgeSrv.Close() + + first, err := http.Get(edgeSrv.URL + "/_gm/") + if err != nil { + t.Fatalf("first /_gm: %v", err) + } + _ = first.Body.Close() + if first.StatusCode != http.StatusOK { + t.Fatalf("first /_gm = %d, want 200", first.StatusCode) + } + second, err := http.Get(edgeSrv.URL + "/_gm/") + if err != nil { + t.Fatalf("second /_gm: %v", err) + } + _ = second.Body.Close() + if second.StatusCode != http.StatusTooManyRequests { + t.Fatalf("second /_gm = %d, want 429", second.StatusCode) + } +} + +// TestExecuteOversizedPayloadRejected verifies the request-body cap: an Execute +// message above GATEWAY_MAX_BODY_BYTES is refused at the edge without reaching +// the backend (R3). +func TestExecuteOversizedPayloadRejected(t *testing.T) { + client, cleanup := newEdge(t, func(w http.ResponseWriter, r *http.Request) { + t.Error("backend must not be called for an oversized payload") + }) + defer cleanup() + + _, err := client.Execute(context.Background(), connect.NewRequest(&edgev1.ExecuteRequest{ + MessageType: transcode.MsgAuthGuest, + Payload: make([]byte, config.DefaultMaxBodyBytes+1), + })) + if connect.CodeOf(err) != connect.CodeResourceExhausted { + t.Fatalf("code = %v, want ResourceExhausted", connect.CodeOf(err)) + } +} + +// TestRootRedirectsToApp verifies the gateway no longer serves a landing at "/" +// (it lives in the landing container since R3): a stray root hit is redirected +// to the app shell. +func TestRootRedirectsToApp(t *testing.T) { + front := httptest.NewServer(connectsrv.NewServer(connectsrv.Deps{}).HTTPHandler()) + defer front.Close() + + client := &http.Client{CheckRedirect: func(*http.Request, []*http.Request) error { + return http.ErrUseLastResponse + }} + resp, err := client.Get(front.URL + "/") + if err != nil { + t.Fatalf("get /: %v", err) + } + defer func() { _ = resp.Body.Close() }() + if resp.StatusCode != http.StatusPermanentRedirect || resp.Header.Get("Location") != "/app/" { + t.Fatalf("GET / = %d -> %q, want 308 -> /app/", resp.StatusCode, resp.Header.Get("Location")) + } +} + func TestExecuteUnknownMessageType(t *testing.T) { client, cleanup := newEdge(t, func(w http.ResponseWriter, r *http.Request) {}) defer cleanup() diff --git a/gateway/internal/ratelimit/tracker.go b/gateway/internal/ratelimit/tracker.go new file mode 100644 index 0000000..92aa602 --- /dev/null +++ b/gateway/internal/ratelimit/tracker.go @@ -0,0 +1,52 @@ +package ratelimit + +import "sync" + +// Rejection aggregates the limiter rejections of one key within one report +// window. Class is the limiter class (user, public, email or admin); Key is the +// class-specific subject — an account id for the user class, a client IP for the +// others. The JSON shape is the gateway→backend rate-limit report wire contract. +type Rejection struct { + Class string `json:"class"` + Key string `json:"key"` + Rejected int `json:"rejected"` +} + +// Tracker accumulates limiter rejections between drains. The gateway's periodic +// reporter drains it to emit the per-key log summary and the backend report; the +// per-rejection cost is one map increment under a mutex, safe on the hot path. +type Tracker struct { + mu sync.Mutex + m map[trackerKey]int +} + +type trackerKey struct{ class, key string } + +// NewTracker constructs an empty Tracker. +func NewTracker() *Tracker { + return &Tracker{m: make(map[trackerKey]int)} +} + +// Add counts one rejection of key under class. +func (t *Tracker) Add(class, key string) { + t.mu.Lock() + defer t.mu.Unlock() + t.m[trackerKey{class: class, key: key}]++ +} + +// Drain returns the rejections accumulated since the previous drain, in +// unspecified order, and resets the tracker. It returns nil when nothing was +// rejected. +func (t *Tracker) Drain() []Rejection { + t.mu.Lock() + defer t.mu.Unlock() + if len(t.m) == 0 { + return nil + } + out := make([]Rejection, 0, len(t.m)) + for k, n := range t.m { + out = append(out, Rejection{Class: k.class, Key: k.key, Rejected: n}) + } + clear(t.m) + return out +} diff --git a/gateway/internal/ratelimit/tracker_test.go b/gateway/internal/ratelimit/tracker_test.go new file mode 100644 index 0000000..4f10f3d --- /dev/null +++ b/gateway/internal/ratelimit/tracker_test.go @@ -0,0 +1,38 @@ +package ratelimit_test + +import ( + "testing" + + "scrabble/gateway/internal/ratelimit" +) + +// TestTrackerDrain verifies rejections aggregate per (class, key) and that a +// drain resets the tracker. +func TestTrackerDrain(t *testing.T) { + tr := ratelimit.NewTracker() + if got := tr.Drain(); got != nil { + t.Fatalf("empty tracker drained %v, want nil", got) + } + + tr.Add("user", "u-1") + tr.Add("user", "u-1") + tr.Add("public", "10.0.0.1") + + got := map[string]ratelimit.Rejection{} + for _, r := range tr.Drain() { + got[r.Class+"/"+r.Key] = r + } + if len(got) != 2 { + t.Fatalf("drained %d entries, want 2", len(got)) + } + if r := got["user/u-1"]; r.Rejected != 2 { + t.Errorf("user/u-1 rejected = %d, want 2", r.Rejected) + } + if r := got["public/10.0.0.1"]; r.Rejected != 1 { + t.Errorf("public/10.0.0.1 rejected = %d, want 1", r.Rejected) + } + + if got := tr.Drain(); got != nil { + t.Fatalf("second drain = %v, want nil", got) + } +} diff --git a/gateway/internal/webui/dist/landing.html b/gateway/internal/webui/dist/landing.html deleted file mode 100644 index f06ae67..0000000 --- a/gateway/internal/webui/dist/landing.html +++ /dev/null @@ -1,16 +0,0 @@ - - - - - - Scrabble - - - -

    - Landing build placeholder. The production gateway image embeds the real Vite - build (see gateway/Dockerfile); seeing this page means the binary was built - without a UI build. -

    - - diff --git a/gateway/internal/webui/webui.go b/gateway/internal/webui/webui.go index 5bde63e..2336265 100644 --- a/gateway/internal/webui/webui.go +++ b/gateway/internal/webui/webui.go @@ -1,12 +1,13 @@ // Package webui serves the embedded static UI build over the public edge. // -// The committed dist/ holds only placeholder index.html / landing.html so the gateway -// module compiles with a plain `go build` (and in CI) without a UI build. The production -// gateway image replaces dist/ with the real Vite build before compiling (see -// gateway/Dockerfile), so the binary ships the UI inside it. Because Vite is built with a -// relative asset base, one build serves under any path: the game SPA is mounted at /app/ -// (web) and /telegram/ (the Telegram Mini App), with a separate landing page at / — the -// single-origin model in docs/ARCHITECTURE.md §13. +// The committed dist/ holds only a placeholder index.html so the gateway module +// compiles with a plain `go build` (and in CI) without a UI build. The production +// gateway image replaces dist/ with the real Vite build — minus landing.html, which +// ships in the separate landing container since R3 — before compiling (see +// gateway/Dockerfile), so the binary ships the UI inside it. Because Vite is built +// with a relative asset base, one build serves under any path: the game SPA is +// mounted at /app/ (web) and /telegram/ (the Telegram Mini App) — the single-origin +// model in docs/ARCHITECTURE.md §13. // // Caching (Stage 17): Vite emits hash-named files under assets/, so those are immutable and // cached hard (a reload/relaunch is a cache hit, not a re-download); the HTML shells carry @@ -35,10 +36,10 @@ func distFS() fs.FS { } // Handler serves the embedded UI. An existing file is served directly (hash-named assets get -// an immutable cache); every other path falls back to indexName (the SPA shell or the landing -// page) so a client-side deep link still loads. When stripPrefix is non-empty it is removed -// from the request path before lookup, so the same build serves under a sub-path (e.g. -// "/app/" or "/telegram/"). +// an immutable cache); every other path falls back to indexName (the SPA shell) so a +// client-side deep link still loads. When stripPrefix is non-empty it is removed from the +// request path before lookup, so the same build serves under a sub-path (e.g. "/app/" or +// "/telegram/"). func Handler(stripPrefix, indexName string) http.Handler { content := distFS() files := http.FileServer(http.FS(content)) diff --git a/gateway/internal/webui/webui_test.go b/gateway/internal/webui/webui_test.go index c7a4d4a..c40d30f 100644 --- a/gateway/internal/webui/webui_test.go +++ b/gateway/internal/webui/webui_test.go @@ -22,20 +22,12 @@ func body(t *testing.T, resp *http.Response) string { return string(b) } -// TestLandingMountServesLandingAndFallsBack: "/" serves the landing shell (no-cache) and -// any unknown path falls back to it. -func TestLandingMountServesLandingAndFallsBack(t *testing.T) { - h := Handler("", "landing.html") - - resp := get(t, h, "/") - if resp.StatusCode != http.StatusOK || !strings.Contains(body(t, resp), "scrabble-landing") { - t.Fatalf("GET / did not serve the landing shell (status %d)", resp.StatusCode) - } - if cc := get(t, h, "/").Header.Get("Cache-Control"); cc != "no-cache" { - t.Errorf("landing Cache-Control = %q, want no-cache", cc) - } - if resp := get(t, h, "/whatever"); resp.StatusCode != http.StatusOK { - t.Fatalf("GET /whatever status = %d, want 200 (fallback)", resp.StatusCode) +// TestShellNoCache: the served HTML shell carries no-cache so a new deploy's +// shell (and the asset URLs it references) is fetched fresh. +func TestShellNoCache(t *testing.T) { + h := Handler("/app/", "index.html") + if cc := get(t, h, "/app/").Header.Get("Cache-Control"); cc != "no-cache" { + t.Errorf("shell Cache-Control = %q, want no-cache", cc) } }