From dcd8de8b00312501a718d1728f5126c569b0ef77 Mon Sep 17 00:00:00 2001 From: Ilia Denisov Date: Thu, 4 Jun 2026 14:22:15 +0200 Subject: [PATCH 1/2] Stage 12: observability & performance (OTel/OTLP, domain metrics, guest GC) - pkg/telemetry: shared OTel provider bootstrap (none/stdout/otlp + W3C propagators + Go runtime metrics); backend/internal/telemetry becomes a thin facade keeping its gin middleware. - Telemetry parity: gateway and the Telegram connector gain telemetry runtimes and config (GATEWAY_/TELEGRAM_ SERVICE_NAME + OTEL_*); otelgrpc instruments the backend push server, the gateway's backend+connector clients and the connector server. Default exporter stays none (collector/dashboards are Stage 14). - Operational metrics (variant attribute on game-scoped ones): game_replay_duration, game_move_validate_duration, games_started_total, games_abandoned_total, game_cache_active, chat_messages_total{kind}, gateway edge_request_duration. Wired via the SetMetrics setter pattern (default no-op meter). - TODO-3: account.GuestReaper deletes guests with no game seat past BACKEND_GUEST_RETENTION (default 30d, swept every BACKEND_GUEST_REAP_INTERVAL). - Tests: pkg/telemetry exporter selection; game/social/edge metric recording via a manual reader; config (otlp accepted, guest knobs); inttest guest reaper. - Docs: PLAN.md re-scopes Stage 12 and adds Stage 13 (alphabet-on-wire) + Stage 14 (CI/deploy) with the agreed dictionary-versioning resolution; ARCHITECTURE 11/13, TESTING, the three READMEs and FUNCTIONAL(+ru) updated. --- PLAN.md | 136 ++++++++++- backend/README.md | 6 +- backend/cmd/backend/main.go | 13 ++ backend/go.mod | 1 + backend/internal/account/reaper.go | 87 +++++++ backend/internal/config/config.go | 50 ++-- backend/internal/config/config_test.go | 48 +++- backend/internal/game/cache.go | 21 +- backend/internal/game/helpers_test.go | 2 +- backend/internal/game/metrics.go | 109 +++++++++ backend/internal/game/metrics_test.go | 95 ++++++++ backend/internal/game/service.go | 11 +- backend/internal/inttest/guest_reaper_test.go | 76 ++++++ backend/internal/pushgrpc/server.go | 3 +- backend/internal/social/chat.go | 2 + backend/internal/social/metrics.go | 49 ++++ backend/internal/social/metrics_test.go | 48 ++++ backend/internal/social/social.go | 2 + backend/internal/telemetry/telemetry.go | 200 +++------------- backend/internal/telemetry/telemetry_test.go | 12 +- docs/ARCHITECTURE.md | 31 ++- docs/FUNCTIONAL.md | 3 +- docs/FUNCTIONAL_ru.md | 3 +- docs/TESTING.md | 10 + gateway/README.md | 3 + gateway/cmd/gateway/main.go | 19 ++ gateway/go.mod | 4 + gateway/internal/backendclient/client.go | 6 +- gateway/internal/config/config.go | 13 ++ gateway/internal/config/config_test.go | 31 +++ gateway/internal/connector/client.go | 6 +- gateway/internal/connectsrv/metrics.go | 43 ++++ gateway/internal/connectsrv/metrics_test.go | 54 +++++ gateway/internal/connectsrv/server.go | 15 ++ go.work.sum | 2 + pkg/go.mod | 22 +- pkg/go.sum | 40 ++++ pkg/telemetry/telemetry.go | 218 ++++++++++++++++++ pkg/telemetry/telemetry_test.go | 84 +++++++ platform/telegram/README.md | 3 + platform/telegram/cmd/telegram/main.go | 23 +- platform/telegram/go.mod | 1 + platform/telegram/internal/config/config.go | 12 + .../telegram/internal/config/config_test.go | 41 ++++ 44 files changed, 1434 insertions(+), 224 deletions(-) create mode 100644 backend/internal/account/reaper.go create mode 100644 backend/internal/game/metrics.go create mode 100644 backend/internal/game/metrics_test.go create mode 100644 backend/internal/inttest/guest_reaper_test.go create mode 100644 backend/internal/social/metrics.go create mode 100644 backend/internal/social/metrics_test.go create mode 100644 gateway/internal/config/config_test.go create mode 100644 gateway/internal/connectsrv/metrics.go create mode 100644 gateway/internal/connectsrv/metrics_test.go create mode 100644 pkg/telemetry/telemetry.go create mode 100644 pkg/telemetry/telemetry_test.go create mode 100644 platform/telegram/internal/config/config_test.go diff --git a/PLAN.md b/PLAN.md index f6c8abd..404f902 100644 --- a/PLAN.md +++ b/PLAN.md @@ -45,7 +45,9 @@ independent (see ARCHITECTURE §9.1). | 9 | Telegram integration (bot side-service, deep-link, push) | **done** | | 10 | Admin & dictionary ops (complaint review, version reload) | **done** | | 11 | Account linking & merge | **done** | -| 12 | Polish (observability, perf with evidence, deploy) | todo | +| 12 | Observability & performance (telemetry, metrics, guest GC) | todo | +| 13 | Alphabet on the wire (UI alphabet-agnostic) | todo | +| 14 | CI & deploy (multi-service, dictionary artifacts) | todo | Scaffolding is incremental: `go.work` lists only existing modules; each stage adds the modules it needs. @@ -204,10 +206,68 @@ dedupe). High blast-radius — focused regression tests. Open details: conflict resolution (active games on both, duplicate friends, display-name collisions); irreversibility/audit; confirm-flow per platform. -### Stage 12 — Polish -Scope: observability dashboards, evidence-based performance work, prod -build/deploy. -Open details: deployment target/host; dashboards; load expectations. +### Stage 12 — Observability & performance +Scope: wire a configurable **OTLP** exporter (alongside `none`/`stdout`), shared in a +new `pkg/telemetry`; add telemetry to the **gateway** and the **Telegram connector** +(providers + `otelgrpc` on the gRPC hops) for parity with the backend; add +domain/operational **metrics** close to the business (game replay/validate timings, +started/abandoned games, live-cache size, chat/nudge counts, the edge roundtrip, Go +runtime metrics); discharge **TODO-3** (abandoned-guest GC). The OTLP collector and +dashboards are stood up with the deploy (Stage 14); the default exporter stays `none`, +so CI needs no collector. Performance is operational-metric instrumentation, not +speculative optimisation (the standing "evidence first" rule — no measured hotspot yet). +Open details: exporter default and whether a collector is stood up now; the metric set +and its attributes; the guest-reaper trigger given revoke-only sessions. + +### Stage 13 — Alphabet on the wire (TODO-4) +Scope: make the UI **alphabet-agnostic**. On game-screen load the client receives the +variant's alphabet table `(letter, index, value)` for **display only**, caches it in +memory by variant (a request flag gates whether the table is included, so it is not +resent on every state poll); live play then exchanges **letter indices** both ways, and +**word-check** sends indices, constraining input to the variant's alphabet. The engine +already works in alphabet-index bytes, so the wire does *less* decoding in live play; the +durable journal / history / GCG stay decoded concrete characters (the §9.1 +dictionary-independent invariant is untouched). The alphabet comes from the **solver's +rules** (not the DAWG), so the wire table is pinned by the solver version. **Index-drift +caveat:** the running solver, the DAWGs (built against it — Stage 14 / TODO-2) and the +wire table must agree, or letter indexing silently corrupts. Blast radius: `pkg/fbs` +(a new Alphabet table; index fields in `StateView`/rack and in +`SubmitPlay`/`Exchange`/`check_word`) → backend DTO encode/decode → UI +`codec.ts`/`premiums.ts` → board/rack render, the move/exchange/word-check senders, the +mock transport and the Vitest tests. +Open details: the fbs shape and `include_alphabet` flag placement; whether to keep +concrete-letter fields during the transition; whether tile exchange moves fully to +indices; the premiums.ts parity-test rework. + +### Stage 14 — CI & deploy +Scope: the full **multi-service production deploy** plus the observability backend, also +discharging **TODO-1** and **TODO-2**. Backend + gateway **Dockerfiles** (multi-stage +distroless, mirroring the Stage 9 connector image); the gateway gains **static UI +serving** (the §13 single-origin model — mini-landing at `/`, Mini App under +`/telegram/`), documented since Stage 9 but **not yet implemented**; prod UI build vars +(`VITE_TELEGRAM_BOT_ID` for the Login Widget, the Mini App URL / share link); a root +`deploy/docker-compose.yml` (backend + gateway + Postgres + connector + the OTLP +collector / Grafana stack) on the external `edge` network behind the host caddy, the VPN +sidecar only for the connector; a **deploy workflow** mirroring `../15-puzzle` (host-mode +runner, `docker compose up -d --build`, no external registry, env from Gitea secrets, a +post-deploy probe). Stand up the **OTLP collector + dashboards** (the export wiring landed +in Stage 12). +- **TODO-1 — publish & version the solver:** tag/publish `scrabble-solver`, drop the + `go.work` replace + the CI clone, pin a version in `backend/go.mod` (or keep cloning the + sibling as the minimal-diff fallback). The DAWGs are delivered separately regardless. +- **TODO-2 — versioned dictionary artifacts:** a **new versioned repo** for the wordlist + parsers + built DAWGs, delivered as a **release artifact** (Gitea release / OCI / object + store — not `go get`; DAWGs are data). **One semver label `vX.Y.Z` for the whole set**, + additive: a deploy drops a new `BACKEND_DICT_DIR//` subdir; + `engine.OpenWithVersions` loads every present subdir at boot; `BACKEND_DICT_VERSION` + selects the default for **new** games. A new version never breaks a running backend + (each game pins its `dict_version`; versions are additive); **only active games need a + dictionary** (validate-at-submit — finished games replay the dictionary-independent + journal), so a version is safe to retire once no active game pins it. The dict repo must + build against the **same `dafsa`/`alphabet`/solver** the backend runs, or letter indexing + drifts (ties into Stage 13). +Open details: embed-vs-mount for the UI build and the DAWG set; the OTLP collector / +dashboard stack; solver-publish vs clone-in-build; load expectations. ## Refinements logged during implementation @@ -796,12 +856,50 @@ Open details: deployment target/host; dashboards; load expectations. ./pkg/... ./platform/telegram/...`; integration stays `./backend/...`. UI ~90 KB gzip JS (budget 100 KB). New error code `merge_active_game_conflict`. +- **Stage 12** (interview + implementation): + - **Re-scoped & split** (interview): the original "Polish (observability + perf + + deploy)" was too large for one session, so it was split — **Stage 12** = observability + + performance + guest GC; **Stage 13** = alphabet-on-the-wire (TODO-4); **Stage 14** = + CI & deploy (TODO-1, TODO-2, the collector + dashboards). The latter two were written + into the plan now as the agreed baseline (each still re-interviews at its own start). + - **Shared telemetry** (interview): a new `pkg/telemetry` owns the OTel provider + bootstrap (exporter selection, W3C propagators, shutdown, Go runtime metrics); the + backend `internal/telemetry` is now a thin facade over it (keeping its gin middleware), + and the gateway and connector gained telemetry runtimes. A configurable **`otlp`** + exporter was added alongside `none`/`stdout`; the **default stays `none`**, the OTLP + endpoint comes from the standard `OTEL_EXPORTER_OTLP_*` env, and the collector + + dashboards are Stage 14 (so CI needs none). `otelgrpc` instruments the backend push + server, the gateway's backend + connector clients, and the connector's gRPC server. + New config `GATEWAY_SERVICE_NAME`/`GATEWAY_OTEL_*` and `TELEGRAM_SERVICE_NAME`/ + `TELEGRAM_OTEL_*`; the backend's existing `BACKEND_OTEL_*` gained the `otlp` value. + - **Metrics = operational, business-near** (interview): histograms + `game_replay_duration` and `game_move_validate_duration`; counters + `games_started_total`, `games_abandoned_total` (a turn-timeout seat drop) and + `chat_messages_total` (`kind`=message/nudge); an observable gauge `game_cache_active`; + the gateway `edge_request_duration` (`message_type`/`result`); plus Go runtime/heap + metrics. Game-scoped metrics carry a **`variant`** attribute + (english/russian_scrabble/erudit — chosen over a coarser `language`, which it + subsumes); the gateway edge metric is variant-agnostic. Optional wiring uses the + established `SetMetrics`/`SetNotifier` setter pattern (default no-op meter), so existing + constructors and tests are untouched. **No speculative optimisation** — there is no + measured hotspot; the deliverable is the instrumentation (the standing "performance only + with evidence" rule). pprof was not added (reframed away by the owner). + - **Guest GC** (interview, TODO-3): age-based, no-seat-only — see the discharged TODO-3 + below; new config `BACKEND_GUEST_REAP_INTERVAL`/`BACKEND_GUEST_RETENTION`. + - **Deps/CI**: new OTel modules (the OTLP exporters, + `contrib/instrumentation/runtime`, `otelgrpc`) added with the no-tidy pattern + (`go mod edit` + `go mod download` + `go work sync`; `pkg` carries no bare-path dep, so + it tidies cleanly). No workflow change — the Go workflows already span + `./backend/... ./gateway/... ./pkg/... ./platform/telegram/...`, integration stays + `./backend/...`, and the default `none` exporter keeps CI collector-free. + ## Deferred TODOs (cross-stage) - **TODO-1 — publish & version the solver.** Once `scrabble-solver` is stable, give it a real module URL and switch `backend` to a versioned dependency, dropping the `go.work` replace and the CI clone. Removes the floating - `master` dependency accepted for now (Stage 2 interview). + `master` dependency accepted for now (Stage 2 interview). **Planned for Stage 14** + (it cleans up the backend Docker build; a clone-in-build fallback stays available). - **TODO-2 — split the solver into engine vs dictionary generator + versioned dictionary artifacts.** Owner's idea, with the caveats agreed at the Stage 2 interview: the split is sound (build-time wordlist→DAWG vs runtime load have @@ -817,12 +915,22 @@ Open details: deployment target/host; dashboards; load expectations. `BACKEND_DICT_DIR//` loaded via `Registry.LoadAvailable`, restart-restored by `engine.OpenWithVersions`) — keep the `BACKEND_DICT_DIR` directory as the runtime contract: a new `.dawg` appears in it and is loaded with - `dawg.Load`. -- **TODO-3 — garbage-collect abandoned guest accounts.** Stage 6 makes a guest a - durable `accounts` row (no identity, `is_guest`), so an ephemeral guest leaves a - row behind. Add a periodic reaper (or a finished-and-idle sweep) that deletes - guest accounts with no active games once their last session is gone; the - `ON DELETE CASCADE` foreign keys clean up the dependent rows. + `dawg.Load`. **Planned for Stage 14**, agreed resolution: a **new versioned repo** + for the parsers + built DAWGs, delivered as a **release artifact** (not `go get`), + versioned with **one semver label for the whole set** (additive; old versions retired + once no active game pins them — see Stage 14). The generator must build against the same + `dafsa`/`alphabet`/solver as the runtime (the index-drift caveat, shared with TODO-4). +- ~~**TODO-3 — garbage-collect abandoned guest accounts.**~~ **Done in Stage 12.** + A periodic `account.GuestReaper` deletes guests (`is_guest`) **with no game seat at + all** whose account age exceeds `BACKEND_GUEST_RETENTION` (default 30 d, swept every + `BACKEND_GUEST_REAP_INTERVAL`, default 1 h). Two schema facts shaped this, narrowing + the original sketch: (1) `game_players`/`chat_messages`/`complaints` reference accounts + **without** `ON DELETE CASCADE`, and a finished game belongs to the other players' + history, so a guest with any seat is retained (a delete would be blocked anyway) — hence + "no seat", not "no active game"; (2) sessions are revoke-only with no maintained + `last_seen_at`, so a lingering session never expires and **account age** is the + abandonment trigger, not "last session gone". The reaped guest's `sessions`/`identities`/ + `account_stats` fall away via their own `ON DELETE CASCADE`. - **TODO-4 — put the per-game alphabet on the wire (owner's idea, Stage 7).** Today the client hardcodes each variant's letters/values (ported into `ui/src/lib/premiums.ts` from `scrabble-solver/rules/rules.go`) and the edge exchanges plays/hints by concrete @@ -830,6 +938,10 @@ Open details: deployment target/host; dashboards; load expectations. value)` table so the UI stops duplicating it, and optionally moving tile exchange to letter **indices** end-to-end. Caveat (as for the dictionaries, TODO-2): the wire table must stay pinned to the same `rules.Alphabet` the engine uses, or indices drift. + **Planned for Stage 13**, expanded (owner) to a fully **alphabet-agnostic UI**: the + client caches the per-variant table (display only) behind an `include_alphabet` request + flag and exchanges indices both ways, word-check included; the durable journal stays + concrete characters (§9.1). See Stage 13. - **TODO-5 — QR friend codes (owner's idea, Stage 8).** *Partially done in Stage 9:* the deep-link scheme now exists (`f`, shared Go ↔ TS), the bot redeems it on launch, and the UI shows a **share-to-Telegram** link for an issued code when diff --git a/backend/README.md b/backend/README.md index 8262bc8..7d3d2bb 100644 --- a/backend/README.md +++ b/backend/README.md @@ -132,8 +132,8 @@ internal/connector/ # backend gRPC client to the Telegram connector (operator b | `BACKEND_POSTGRES_CONN_MAX_LIFETIME` | `30m` | Max connection lifetime. | | `BACKEND_POSTGRES_OPERATION_TIMEOUT` | `5s` | Connect attempt + `/readyz` ping bound. | | `BACKEND_SERVICE_NAME` | `scrabble-backend` | OpenTelemetry `service.name`. | -| `BACKEND_OTEL_TRACES_EXPORTER` | `none` | `none` or `stdout` (OTLP arrives later). | -| `BACKEND_OTEL_METRICS_EXPORTER` | `none` | `none` or `stdout`. | +| `BACKEND_OTEL_TRACES_EXPORTER` | `none` | `none`, `stdout` or `otlp` (gRPC; endpoint from the standard `OTEL_EXPORTER_OTLP_*`). | +| `BACKEND_OTEL_METRICS_EXPORTER` | `none` | `none`, `stdout` or `otlp`. | | `BACKEND_DICT_DIR` | — | **Required.** Directory of committed `.dawg` dictionaries. | | `BACKEND_DICT_VERSION` | `v1` | Dictionary version new games pin. | | `BACKEND_GAME_TIMEOUT_SWEEP_INTERVAL` | `1m` | How often the turn-timeout sweeper runs. | @@ -147,6 +147,8 @@ internal/connector/ # backend gRPC client to the Telegram connector (operator b | `BACKEND_SMTP_PASSWORD` | — | SMTP password. | | `BACKEND_SMTP_FROM` | `no-reply@localhost` | Envelope/From address for confirm-codes. | | `BACKEND_CONNECTOR_ADDR` | — | Telegram connector gRPC address for admin-console operator broadcasts. Empty disables broadcasts. | +| `BACKEND_GUEST_REAP_INTERVAL` | `1h` | How often the abandoned-guest reaper sweeps. | +| `BACKEND_GUEST_RETENTION` | `720h` | Account age past which a guest with no game seat is deleted. | ## Run diff --git a/backend/cmd/backend/main.go b/backend/cmd/backend/main.go index 7cbc0fb..4b8ce31 100644 --- a/backend/cmd/backend/main.go +++ b/backend/cmd/backend/main.go @@ -80,6 +80,9 @@ func run(ctx context.Context, cfg config.Config, logger *zap.Logger) error { logger.Warn("telemetry shutdown", zap.Error(err)) } }() + if err := tel.StartRuntimeMetrics(); err != nil { + logger.Warn("telemetry: start runtime metrics", zap.Error(err)) + } db, err := postgres.Open(ctx, cfg.Postgres, postgres.WithTracerProvider(tel.TracerProvider()), @@ -131,10 +134,19 @@ func run(ctx context.Context, cfg config.Config, logger *zap.Logger) error { accounts := account.NewStore(db) games := game.NewService(game.NewStore(db), accounts, registry, cfg.Game, logger) games.SetNotifier(hub) + games.SetMetrics(tel.MeterProvider().Meter("scrabble/backend/game")) go games.RunSweeper(ctx, cfg.Game.TimeoutSweepInterval) logger.Info("game turn-timeout sweeper started", zap.Duration("interval", cfg.Game.TimeoutSweepInterval)) + // Stage 12 TODO-3: reap abandoned guest accounts (no game seat, account age past + // the retention window). Dependent rows fall away via ON DELETE CASCADE. + guestReaper := account.NewGuestReaper(accounts, cfg.GuestRetention, logger) + go guestReaper.Run(ctx, cfg.GuestReapInterval) + logger.Info("guest reaper started", + zap.Duration("interval", cfg.GuestReapInterval), + zap.Duration("retention", cfg.GuestRetention)) + // Stage 4 lobby & social domains. Their REST and stream surface is added with // the gateway in Stage 6, so they are handed to the server (like the route // groups) for the handlers to come. @@ -145,6 +157,7 @@ func run(ctx context.Context, cfg config.Config, logger *zap.Logger) error { links := link.NewService(emails, accounts, accountmerge.NewMerger(db), sessions) socialSvc := social.NewService(social.NewStore(db), accounts, games) socialSvc.SetNotifier(hub) + socialSvc.SetMetrics(tel.MeterProvider().Meter("scrabble/backend/social")) // Stage 5 robot opponent: provision its durable account pool (a hard startup // dependency, like the dictionaries) and start its move driver. The matchmaker diff --git a/backend/go.mod b/backend/go.mod index bb0a4fc..fcae0af 100644 --- a/backend/go.mod +++ b/backend/go.mod @@ -101,6 +101,7 @@ require ( github.com/yusufpapurcu/wmi v1.2.4 // indirect go.mongodb.org/mongo-driver/v2 v2.5.0 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect + go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.68.0 go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.68.0 // indirect go.uber.org/multierr v1.11.0 // indirect golang.org/x/arch v0.22.0 // indirect diff --git a/backend/internal/account/reaper.go b/backend/internal/account/reaper.go new file mode 100644 index 0000000..b319441 --- /dev/null +++ b/backend/internal/account/reaper.go @@ -0,0 +1,87 @@ +package account + +import ( + "context" + "fmt" + "time" + + "github.com/go-jet/jet/v2/postgres" + "go.uber.org/zap" + + "scrabble/backend/internal/postgres/jet/backend/table" +) + +// ReapAbandonedGuests deletes guest accounts created before olderThan that are +// not seated in any game. It returns the number deleted. +// +// Scope is deliberately "no game seat at all", not merely "no active game": a +// finished game belongs to the other players' history, and game_players carries no +// ON DELETE CASCADE to accounts (docs/ARCHITECTURE.md §4), so a guest with any seat +// is retained (and a delete would be blocked by that foreign key regardless). The +// dependent rows of a reaped guest — sessions, identities, account_stats — fall +// away through their own ON DELETE CASCADE foreign keys. Account age is the +// abandonment signal because sessions are revoke-only with no maintained +// last_seen_at, so a lingering session never expires on its own. +func (s *Store) ReapAbandonedGuests(ctx context.Context, olderThan time.Time) (int64, error) { + stmt := table.Accounts.DELETE().WHERE( + table.Accounts.IsGuest.EQ(postgres.Bool(true)). + AND(table.Accounts.CreatedAt.LT(postgres.TimestampzT(olderThan))). + AND(postgres.NOT(postgres.EXISTS( + postgres.SELECT(table.GamePlayers.AccountID). + FROM(table.GamePlayers). + WHERE(table.GamePlayers.AccountID.EQ(table.Accounts.AccountID)), + ))), + ) + res, err := stmt.ExecContext(ctx, s.db) + if err != nil { + return 0, fmt.Errorf("account: reap guests: %w", err) + } + n, err := res.RowsAffected() + if err != nil { + return 0, fmt.Errorf("account: reap guests rows affected: %w", err) + } + return n, nil +} + +// GuestReaper periodically deletes abandoned guest accounts via +// Store.ReapAbandonedGuests. It mirrors the game turn-timeout sweeper and the +// matchmaker reaper: one background goroutine, started once from main. +type GuestReaper struct { + store *Store + retention time.Duration + clock func() time.Time + log *zap.Logger +} + +// NewGuestReaper constructs a reaper deleting guests whose account age exceeds +// retention. log may be nil. +func NewGuestReaper(store *Store, retention time.Duration, log *zap.Logger) *GuestReaper { + if log == nil { + log = zap.NewNop() + } + return &GuestReaper{ + store: store, + retention: retention, + clock: func() time.Time { return time.Now().UTC() }, + log: log, + } +} + +// Run reaps abandoned guests on each tick until ctx is cancelled. +func (r *GuestReaper) Run(ctx context.Context, interval time.Duration) { + ticker := time.NewTicker(interval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + n, err := r.store.ReapAbandonedGuests(ctx, r.clock().Add(-r.retention)) + if err != nil { + r.log.Warn("guest reap failed", zap.Error(err)) + } else if n > 0 { + r.log.Info("reaped abandoned guests", zap.Int64("count", n)) + } + } + } +} diff --git a/backend/internal/config/config.go b/backend/internal/config/config.go index a6726ab..9cc69e0 100644 --- a/backend/internal/config/config.go +++ b/backend/internal/config/config.go @@ -42,13 +42,20 @@ type Config struct { // side-service, used by the admin console to send operator broadcasts. Empty // disables broadcasts (the admin broadcast actions report "not configured"). ConnectorAddr string + // GuestReapInterval is the cadence of the abandoned-guest reaper sweep. + GuestReapInterval time.Duration + // GuestRetention is the account age past which an unused guest (no game seat) + // is eligible for deletion by the reaper. + GuestRetention time.Duration } // Defaults applied when the corresponding environment variable is unset. const ( - defaultHTTPAddr = ":8080" - defaultGRPCAddr = ":9090" - defaultLogLevel = "info" + defaultHTTPAddr = ":8080" + defaultGRPCAddr = ":9090" + defaultLogLevel = "info" + defaultGuestReapInterval = time.Hour + defaultGuestRetention = 30 * 24 * time.Hour ) // Load reads the configuration from the environment, applies defaults for unset @@ -98,6 +105,15 @@ func Load() (Config, error) { return Config{}, err } + guestReapInterval, err := envDuration("BACKEND_GUEST_REAP_INTERVAL", defaultGuestReapInterval) + if err != nil { + return Config{}, err + } + guestRetention, err := envDuration("BACKEND_GUEST_RETENTION", defaultGuestRetention) + if err != nil { + return Config{}, err + } + smtp := account.SMTPConfig{ Host: os.Getenv("BACKEND_SMTP_HOST"), Port: envOr("BACKEND_SMTP_PORT", "587"), @@ -107,16 +123,18 @@ func Load() (Config, error) { } c := Config{ - HTTPAddr: envOr("BACKEND_HTTP_ADDR", defaultHTTPAddr), - GRPCAddr: envOr("BACKEND_GRPC_ADDR", defaultGRPCAddr), - LogLevel: envOr("BACKEND_LOG_LEVEL", defaultLogLevel), - Postgres: pg, - Telemetry: tel, - Game: gm, - Lobby: lb, - Robot: rb, - SMTP: smtp, - ConnectorAddr: os.Getenv("BACKEND_CONNECTOR_ADDR"), + HTTPAddr: envOr("BACKEND_HTTP_ADDR", defaultHTTPAddr), + GRPCAddr: envOr("BACKEND_GRPC_ADDR", defaultGRPCAddr), + LogLevel: envOr("BACKEND_LOG_LEVEL", defaultLogLevel), + Postgres: pg, + Telemetry: tel, + Game: gm, + Lobby: lb, + Robot: rb, + SMTP: smtp, + ConnectorAddr: os.Getenv("BACKEND_CONNECTOR_ADDR"), + GuestReapInterval: guestReapInterval, + GuestRetention: guestRetention, } if err := c.validate(); err != nil { return Config{}, err @@ -152,6 +170,12 @@ func (c Config) validate() error { if err := c.Robot.Validate(); err != nil { return fmt.Errorf("config: %w", err) } + if c.GuestReapInterval <= 0 { + return fmt.Errorf("config: BACKEND_GUEST_REAP_INTERVAL must be positive") + } + if c.GuestRetention <= 0 { + return fmt.Errorf("config: BACKEND_GUEST_RETENTION must be positive") + } return nil } diff --git a/backend/internal/config/config_test.go b/backend/internal/config/config_test.go index c568e1b..90ed415 100644 --- a/backend/internal/config/config_test.go +++ b/backend/internal/config/config_test.go @@ -151,12 +151,54 @@ func TestLoadRejectsMalformedDuration(t *testing.T) { } } -// TestLoadRejectsUnsupportedExporter verifies that an exporter outside the MVP -// set is rejected. +// TestLoadRejectsUnsupportedExporter verifies that an exporter outside the +// supported set is rejected. func TestLoadRejectsUnsupportedExporter(t *testing.T) { t.Setenv("BACKEND_POSTGRES_DSN", testDSN) - t.Setenv("BACKEND_OTEL_TRACES_EXPORTER", "otlp") + t.Setenv("BACKEND_OTEL_TRACES_EXPORTER", "prometheus") if _, err := Load(); err == nil { t.Fatal("Load: expected an error for an unsupported exporter, got nil") } } + +// TestLoadAcceptsOTLPExporter verifies that the otlp exporter is now accepted +// (the collector is stood up with the deploy; the default stays none). +func TestLoadAcceptsOTLPExporter(t *testing.T) { + t.Setenv("BACKEND_POSTGRES_DSN", testDSN) + t.Setenv("BACKEND_DICT_DIR", "/dict") + t.Setenv("BACKEND_OTEL_TRACES_EXPORTER", "otlp") + t.Setenv("BACKEND_OTEL_METRICS_EXPORTER", "otlp") + if _, err := Load(); err != nil { + t.Fatalf("Load with otlp exporters: %v", err) + } +} + +// TestLoadGuestReaperDefaultsAndOverride covers the guest-reaper knobs: defaults +// when unset, an override, and rejection of a non-positive value. +func TestLoadGuestReaperDefaultsAndOverride(t *testing.T) { + t.Setenv("BACKEND_POSTGRES_DSN", testDSN) + t.Setenv("BACKEND_DICT_DIR", "/dict") + + c, err := Load() + if err != nil { + t.Fatalf("Load: %v", err) + } + if c.GuestReapInterval != defaultGuestReapInterval { + t.Errorf("GuestReapInterval = %s, want %s", c.GuestReapInterval, defaultGuestReapInterval) + } + if c.GuestRetention != defaultGuestRetention { + t.Errorf("GuestRetention = %s, want %s", c.GuestRetention, defaultGuestRetention) + } + + t.Setenv("BACKEND_GUEST_RETENTION", "168h") + if c, err = Load(); err != nil { + t.Fatalf("Load (override): %v", err) + } else if c.GuestRetention != 168*time.Hour { + t.Errorf("GuestRetention = %s, want 168h", c.GuestRetention) + } + + t.Setenv("BACKEND_GUEST_REAP_INTERVAL", "0s") + if _, err := Load(); err == nil { + t.Fatal("Load: expected an error for a non-positive reap interval, got nil") + } +} diff --git a/backend/internal/game/cache.go b/backend/internal/game/cache.go index 5056069..8ec2c2f 100644 --- a/backend/internal/game/cache.go +++ b/backend/internal/game/cache.go @@ -63,6 +63,7 @@ type gameCache struct { type cachedGame struct { game *engine.Game + variant string lastAccess time.Time } @@ -82,11 +83,12 @@ func (c *gameCache) get(id uuid.UUID) (*engine.Game, bool) { return e.game, true } -// put stores g as the live game for id. -func (c *gameCache) put(id uuid.UUID, g *engine.Game) { +// put stores g as the live game for id. variant labels the entry so the active- +// games gauge can report counts by variant without inspecting engine internals. +func (c *gameCache) put(id uuid.UUID, g *engine.Game, variant string) { c.mu.Lock() defer c.mu.Unlock() - c.entries[id] = &cachedGame{game: g, lastAccess: c.now()} + c.entries[id] = &cachedGame{game: g, variant: variant, lastAccess: c.now()} } // remove drops id from the cache (used on a finished game and after a failed @@ -119,3 +121,16 @@ func (c *gameCache) size() int { defer c.mu.Unlock() return len(c.entries) } + +// countByVariant tallies the resident games by their variant label. It backs the +// game_cache_active observable gauge; the resident set is the bounded number of +// live (active) games, so the scan under the lock is cheap. +func (c *gameCache) countByVariant() map[string]int { + c.mu.Lock() + defer c.mu.Unlock() + out := make(map[string]int, len(c.entries)) + for _, e := range c.entries { + out[e.variant]++ + } + return out +} diff --git a/backend/internal/game/helpers_test.go b/backend/internal/game/helpers_test.go index c5e7f94..80561d7 100644 --- a/backend/internal/game/helpers_test.go +++ b/backend/internal/game/helpers_test.go @@ -94,7 +94,7 @@ func TestGameCacheEviction(t *testing.T) { cur := time.Unix(1_700_000_000, 0) cache := newGameCache(time.Hour, func() time.Time { return cur }) id := uuid.New() - cache.put(id, nil) + cache.put(id, nil, "english") if _, ok := cache.get(id); !ok { t.Fatal("game must be resident after put") } diff --git a/backend/internal/game/metrics.go b/backend/internal/game/metrics.go new file mode 100644 index 0000000..8a59185 --- /dev/null +++ b/backend/internal/game/metrics.go @@ -0,0 +1,109 @@ +package game + +import ( + "context" + "time" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/metric/noop" + "go.uber.org/zap" + + "scrabble/backend/internal/engine" +) + +// meterName scopes the game domain's OpenTelemetry instruments. +const meterName = "scrabble/backend/game" + +// gameMetrics holds the game domain's operational instruments. Every game-scoped +// measurement carries a "variant" attribute (english/russian/erudit). The +// instruments default to no-ops (see defaultGameMetrics), so recording is always +// safe; SetMetrics installs the real meter during startup wiring. +type gameMetrics struct { + replay metric.Float64Histogram + validate metric.Float64Histogram + started metric.Int64Counter + abandoned metric.Int64Counter +} + +// defaultGameMetrics returns instruments backed by a no-op meter, recording +// nothing until SetMetrics installs a real one. +func defaultGameMetrics() *gameMetrics { + return newGameMetrics(noop.NewMeterProvider().Meter(meterName)) +} + +// newGameMetrics builds the instruments on meter, falling back to no-op +// instruments on the (rare) construction error so the game domain never fails to +// start over telemetry. +func newGameMetrics(meter metric.Meter) *gameMetrics { + return &gameMetrics{ + replay: histogram(meter, "game_replay_duration", "Seconds to rebuild a live game from its journal on a cache miss."), + validate: histogram(meter, "game_move_validate_duration", "Seconds to validate and score a tentative play (EvaluatePlay)."), + started: counter(meter, "games_started_total", "Games created and started."), + abandoned: counter(meter, "games_abandoned_total", "Player seats dropped by the turn-timeout sweeper."), + } +} + +// SetMetrics installs the meter the game domain records to and registers the +// observable gauge reporting the live games resident in the cache by variant. It +// must be called during startup wiring; the default is a no-op meter. +func (svc *Service) SetMetrics(meter metric.Meter) { + if meter == nil { + return + } + svc.metrics = newGameMetrics(meter) + if _, err := meter.Int64ObservableGauge("game_cache_active", + metric.WithDescription("Live games currently resident in the in-memory cache, by variant."), + metric.WithInt64Callback(func(_ context.Context, o metric.Int64Observer) error { + for variant, n := range svc.cache.countByVariant() { + o.Observe(int64(n), metric.WithAttributes(attribute.String("variant", variant))) + } + return nil + }), + ); err != nil { + svc.log.Warn("game: register cache gauge", zap.Error(err)) + } +} + +// recordReplay records the duration of a cache-miss journal replay for variant. +func (m *gameMetrics) recordReplay(ctx context.Context, v engine.Variant, start time.Time) { + m.replay.Record(ctx, time.Since(start).Seconds(), variantAttr(v)) +} + +// recordValidate records the duration of one play validation for variant. +func (m *gameMetrics) recordValidate(ctx context.Context, v engine.Variant, start time.Time) { + m.validate.Record(ctx, time.Since(start).Seconds(), variantAttr(v)) +} + +// recordStarted counts one started game of variant. +func (m *gameMetrics) recordStarted(ctx context.Context, v engine.Variant) { + m.started.Add(ctx, 1, variantAttr(v)) +} + +// recordAbandoned counts one seat dropped by the turn-timeout sweeper in a game of +// variant. +func (m *gameMetrics) recordAbandoned(ctx context.Context, v engine.Variant) { + m.abandoned.Add(ctx, 1, variantAttr(v)) +} + +// variantAttr is the shared "variant" attribute option, usable for both Record and +// Add measurements. +func variantAttr(v engine.Variant) metric.MeasurementOption { + return metric.WithAttributes(attribute.String("variant", v.String())) +} + +func histogram(m metric.Meter, name, desc string) metric.Float64Histogram { + h, err := m.Float64Histogram(name, metric.WithUnit("s"), metric.WithDescription(desc)) + if err != nil { + h, _ = noop.NewMeterProvider().Meter(meterName).Float64Histogram(name) + } + return h +} + +func counter(m metric.Meter, name, desc string) metric.Int64Counter { + c, err := m.Int64Counter(name, metric.WithDescription(desc)) + if err != nil { + c, _ = noop.NewMeterProvider().Meter(meterName).Int64Counter(name) + } + return c +} diff --git a/backend/internal/game/metrics_test.go b/backend/internal/game/metrics_test.go new file mode 100644 index 0000000..dad8b97 --- /dev/null +++ b/backend/internal/game/metrics_test.go @@ -0,0 +1,95 @@ +package game + +import ( + "context" + "testing" + "time" + + "go.opentelemetry.io/otel/attribute" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" + + "scrabble/backend/internal/engine" +) + +// TestGameMetrics records each game instrument through a manual reader and asserts +// the counters carry the right "variant" attribute and the histograms observe. +func TestGameMetrics(t *testing.T) { + ctx := context.Background() + reader := sdkmetric.NewManualReader() + meter := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)).Meter("test") + m := newGameMetrics(meter) + + m.recordStarted(ctx, engine.VariantEnglish) + m.recordStarted(ctx, engine.VariantEnglish) + m.recordStarted(ctx, engine.VariantRussianScrabble) + m.recordAbandoned(ctx, engine.VariantErudit) + m.recordReplay(ctx, engine.VariantEnglish, time.Now().Add(-time.Millisecond)) + m.recordValidate(ctx, engine.VariantRussianScrabble, time.Now().Add(-time.Millisecond)) + + var rm metricdata.ResourceMetrics + if err := reader.Collect(ctx, &rm); err != nil { + t.Fatalf("collect: %v", err) + } + + started := counterByAttr(t, rm, "games_started_total", "variant") + if started["english"] != 2 || started["russian_scrabble"] != 1 { + t.Errorf("games_started_total = %v, want english:2 russian_scrabble:1", started) + } + if abandoned := counterByAttr(t, rm, "games_abandoned_total", "variant"); abandoned["erudit"] != 1 { + t.Errorf("games_abandoned_total = %v, want erudit:1", abandoned) + } + if c := histogramCount(t, rm, "game_replay_duration"); c != 1 { + t.Errorf("game_replay_duration observations = %d, want 1", c) + } + if c := histogramCount(t, rm, "game_move_validate_duration"); c != 1 { + t.Errorf("game_move_validate_duration observations = %d, want 1", c) + } +} + +// counterByAttr sums the int64 counter named name, grouped by the value of the +// attribute key attr. +func counterByAttr(t *testing.T, rm metricdata.ResourceMetrics, name, attr string) map[string]int64 { + t.Helper() + out := map[string]int64{} + for _, sm := range rm.ScopeMetrics { + for _, md := range sm.Metrics { + if md.Name != name { + continue + } + sum, ok := md.Data.(metricdata.Sum[int64]) + if !ok { + t.Fatalf("%s is not an int64 sum", name) + } + for _, dp := range sum.DataPoints { + v, _ := dp.Attributes.Value(attribute.Key(attr)) + out[v.AsString()] += dp.Value + } + } + } + return out +} + +// histogramCount returns the total observation count of the float64 histogram +// named name. +func histogramCount(t *testing.T, rm metricdata.ResourceMetrics, name string) uint64 { + t.Helper() + for _, sm := range rm.ScopeMetrics { + for _, md := range sm.Metrics { + if md.Name != name { + continue + } + h, ok := md.Data.(metricdata.Histogram[float64]) + if !ok { + t.Fatalf("%s is not a float64 histogram", name) + } + var n uint64 + for _, dp := range h.DataPoints { + n += dp.Count + } + return n + } + } + t.Fatalf("%s not found", name) + return 0 +} diff --git a/backend/internal/game/service.go b/backend/internal/game/service.go index d7e85c9..287e336 100644 --- a/backend/internal/game/service.go +++ b/backend/internal/game/service.go @@ -33,6 +33,7 @@ type Service struct { clock func() time.Time rng func() int64 pub notify.Publisher + metrics *gameMetrics log *zap.Logger } @@ -51,6 +52,7 @@ func NewService(store *Store, accounts *account.Store, registry *engine.Registry clock: clock, rng: randomSeed, pub: notify.Nop{}, + metrics: defaultGameMetrics(), log: log, } } @@ -135,7 +137,8 @@ func (svc *Service) Create(ctx context.Context, params CreateParams) (Game, erro if err := svc.store.CreateGame(ctx, ins, params.Seats); err != nil { return Game{}, err } - svc.cache.put(id, g) + svc.cache.put(id, g, params.Variant.String()) + svc.metrics.recordStarted(ctx, params.Variant) return svc.store.GetGame(ctx, id) } @@ -350,6 +353,7 @@ func (svc *Service) timeoutGame(ctx context.Context, gameID uuid.UUID, now time. if _, err := svc.commit(ctx, gameID, g, rec, "timeout", rackBefore, nil, cur.Seats); err != nil { return false, err } + svc.metrics.recordAbandoned(ctx, cur.Variant) return true, nil } @@ -373,7 +377,9 @@ func (svc *Service) EvaluatePlay(ctx context.Context, gameID, accountID uuid.UUI if err != nil { return EvalResult{}, err } + validateStart := time.Now() rec, err := g.EvaluatePlay(dir, tiles) + svc.metrics.recordValidate(ctx, pre.Variant, validateStart) if err != nil { if errors.Is(err, engine.ErrIllegalPlay) { return EvalResult{Valid: false}, nil @@ -704,7 +710,7 @@ func (svc *Service) liveGame(ctx context.Context, pre Game) (*engine.Game, error return nil, err } if !g.Over() { - svc.cache.put(pre.ID, g) + svc.cache.put(pre.ID, g, pre.Variant.String()) } return g, nil } @@ -713,6 +719,7 @@ func (svc *Service) liveGame(ctx context.Context, pre Game) (*engine.Game, error // re-applying every journalled move in order. The deterministic bag makes the // reconstruction exact. func (svc *Service) replay(ctx context.Context, pre Game) (*engine.Game, error) { + defer svc.metrics.recordReplay(ctx, pre.Variant, time.Now()) seed, err := svc.store.GameSeed(ctx, pre.ID) if err != nil { return nil, err diff --git a/backend/internal/inttest/guest_reaper_test.go b/backend/internal/inttest/guest_reaper_test.go new file mode 100644 index 0000000..d63cae1 --- /dev/null +++ b/backend/internal/inttest/guest_reaper_test.go @@ -0,0 +1,76 @@ +//go:build integration + +package inttest + +import ( + "context" + "errors" + "testing" + "time" + + "github.com/google/uuid" + + "scrabble/backend/internal/account" + "scrabble/backend/internal/engine" + "scrabble/backend/internal/game" +) + +// TestGuestReaper verifies the abandoned-guest reaper: it deletes guests with no +// game seat once their account age is past the cutoff, while sparing guests that +// are too young, guests seated in a game (the FK-protected opponent history), and +// durable accounts. +func TestGuestReaper(t *testing.T) { + ctx := context.Background() + store := account.NewStore(testDB) + + guestA := provisionGuest(t) // guest, no seat → reaped on a future cutoff + guestB := provisionGuest(t) // guest, no seat → reaped on a future cutoff + seated := provisionGuest(t) // guest seated in a game → kept + durable := provisionAccount(t) + + // Seat the third guest in a game with a durable opponent (Create needs 2-4). + opp := provisionAccount(t) + if _, err := newGameService().Create(ctx, game.CreateParams{ + Variant: engine.VariantEnglish, Seats: []uuid.UUID{seated, opp}, TurnTimeout: 24 * time.Hour, Seed: 1, + }); err != nil { + t.Fatalf("create game: %v", err) + } + + // A cutoff in the past: every account is younger than the window, so the age + // gate spares them all. + if n, err := store.ReapAbandonedGuests(ctx, time.Now().Add(-time.Hour)); err != nil { + t.Fatalf("reap (past cutoff): %v", err) + } else if n != 0 { + t.Fatalf("reap with a past cutoff deleted %d, want 0", n) + } + assertAccount(t, store, guestA, true) + + // A cutoff in the future: every account predates it, so the no-seat guests are + // reaped and the seated guest and the durable account survive. + if _, err := store.ReapAbandonedGuests(ctx, time.Now().Add(time.Hour)); err != nil { + t.Fatalf("reap (future cutoff): %v", err) + } + assertAccount(t, store, guestA, false) + assertAccount(t, store, guestB, false) + assertAccount(t, store, seated, true) + assertAccount(t, store, durable, true) +} + +// assertAccount checks whether the account with id is present, failing the test +// when its presence differs from want. +func assertAccount(t *testing.T, store *account.Store, id uuid.UUID, want bool) { + t.Helper() + _, err := store.GetByID(context.Background(), id) + switch { + case err == nil: + if !want { + t.Errorf("account %s still exists, want reaped", id) + } + case errors.Is(err, account.ErrNotFound): + if want { + t.Errorf("account %s was reaped, want kept", id) + } + default: + t.Fatalf("get account %s: %v", id, err) + } +} diff --git a/backend/internal/pushgrpc/server.go b/backend/internal/pushgrpc/server.go index e9a73bf..8a3dcad 100644 --- a/backend/internal/pushgrpc/server.go +++ b/backend/internal/pushgrpc/server.go @@ -11,6 +11,7 @@ import ( "fmt" "net" + "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc" "go.uber.org/zap" "google.golang.org/grpc" @@ -78,7 +79,7 @@ func NewServer(addr string, hub *notify.Hub, log *zap.Logger) *Server { if log == nil { log = zap.NewNop() } - gs := grpc.NewServer() + gs := grpc.NewServer(grpc.StatsHandler(otelgrpc.NewServerHandler())) pushv1.RegisterPushServer(gs, NewService(hub, log)) return &Server{grpc: gs, addr: addr, log: log} } diff --git a/backend/internal/social/chat.go b/backend/internal/social/chat.go index 6a45893..387440e 100644 --- a/backend/internal/social/chat.go +++ b/backend/internal/social/chat.go @@ -77,6 +77,7 @@ func (svc *Service) PostMessage(ctx context.Context, gameID, senderID uuid.UUID, if err != nil { return Message{}, err } + svc.metrics.recordChat(ctx, kindMessage) svc.emitChat(seats, senderID, msg) return msg, nil } @@ -110,6 +111,7 @@ func (svc *Service) Nudge(ctx context.Context, gameID, senderID uuid.UUID) (Mess if err != nil { return Message{}, err } + svc.metrics.recordChat(ctx, kindNudge) if toMove >= 0 && toMove < len(seats) { svc.pub.Publish(notify.Nudge(seats[toMove], gameID, senderID)) } diff --git a/backend/internal/social/metrics.go b/backend/internal/social/metrics.go new file mode 100644 index 0000000..9971148 --- /dev/null +++ b/backend/internal/social/metrics.go @@ -0,0 +1,49 @@ +package social + +import ( + "context" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/metric/noop" +) + +// meterName scopes the social domain's OpenTelemetry instruments. +const meterName = "scrabble/backend/social" + +// socialMetrics holds the social domain's operational instruments. It defaults to +// no-ops (see defaultSocialMetrics); SetMetrics installs the real meter during +// startup wiring. +type socialMetrics struct { + messages metric.Int64Counter +} + +// defaultSocialMetrics returns instruments backed by a no-op meter. +func defaultSocialMetrics() *socialMetrics { + return newSocialMetrics(noop.NewMeterProvider().Meter(meterName)) +} + +// newSocialMetrics builds the instruments on meter, falling back to a no-op +// counter on the (rare) construction error. +func newSocialMetrics(meter metric.Meter) *socialMetrics { + c, err := meter.Int64Counter("chat_messages_total", + metric.WithDescription("Per-game chat entries posted, labelled by kind (message/nudge).")) + if err != nil { + c, _ = noop.NewMeterProvider().Meter(meterName).Int64Counter("chat_messages_total") + } + return &socialMetrics{messages: c} +} + +// SetMetrics installs the meter the social domain records to. It must be called +// during startup wiring; the default is a no-op meter. +func (svc *Service) SetMetrics(meter metric.Meter) { + if meter == nil { + return + } + svc.metrics = newSocialMetrics(meter) +} + +// recordChat counts one posted chat entry of the given kind (message or nudge). +func (m *socialMetrics) recordChat(ctx context.Context, kind string) { + m.messages.Add(ctx, 1, metric.WithAttributes(attribute.String("kind", kind))) +} diff --git a/backend/internal/social/metrics_test.go b/backend/internal/social/metrics_test.go new file mode 100644 index 0000000..9fbaddf --- /dev/null +++ b/backend/internal/social/metrics_test.go @@ -0,0 +1,48 @@ +package social + +import ( + "context" + "testing" + + "go.opentelemetry.io/otel/attribute" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" +) + +// TestSocialMetrics records chat and nudge entries through a manual reader and +// asserts chat_messages_total splits by the "kind" attribute. +func TestSocialMetrics(t *testing.T) { + ctx := context.Background() + reader := sdkmetric.NewManualReader() + meter := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)).Meter("test") + m := newSocialMetrics(meter) + + m.recordChat(ctx, kindMessage) + m.recordChat(ctx, kindMessage) + m.recordChat(ctx, kindNudge) + + var rm metricdata.ResourceMetrics + if err := reader.Collect(ctx, &rm); err != nil { + t.Fatalf("collect: %v", err) + } + + got := map[string]int64{} + for _, sm := range rm.ScopeMetrics { + for _, md := range sm.Metrics { + if md.Name != "chat_messages_total" { + continue + } + sum, ok := md.Data.(metricdata.Sum[int64]) + if !ok { + t.Fatalf("chat_messages_total is not an int64 sum") + } + for _, dp := range sum.DataPoints { + v, _ := dp.Attributes.Value(attribute.Key("kind")) + got[v.AsString()] += dp.Value + } + } + } + if got[kindMessage] != 2 || got[kindNudge] != 1 { + t.Errorf("chat_messages_total = %v, want message:2 nudge:1", got) + } +} diff --git a/backend/internal/social/social.go b/backend/internal/social/social.go index 7f97f1f..43002c2 100644 --- a/backend/internal/social/social.go +++ b/backend/internal/social/social.go @@ -76,6 +76,7 @@ type Service struct { accounts *account.Store games GameReader pub notify.Publisher + metrics *socialMetrics now func() time.Time } @@ -87,6 +88,7 @@ func NewService(store *Store, accounts *account.Store, games GameReader) *Servic accounts: accounts, games: games, pub: notify.Nop{}, + metrics: defaultSocialMetrics(), now: func() time.Time { return time.Now().UTC() }, } } diff --git a/backend/internal/telemetry/telemetry.go b/backend/internal/telemetry/telemetry.go index 49a59af..0e86589 100644 --- a/backend/internal/telemetry/telemetry.go +++ b/backend/internal/telemetry/telemetry.go @@ -1,158 +1,58 @@ -// Package telemetry owns the OpenTelemetry runtime for the backend process. -// -// New constructs the configured tracer and meter providers, registers them as -// the OpenTelemetry globals, and exposes Shutdown for orderly exit. The MVP -// supports the `none` and `stdout` exporters; OTLP export and dashboards arrive -// in a later stage. The per-request timing middleware lives in middleware.go and -// uses the registered global tracer, so requests are timed and logged even when -// the exporter is `none`. +// Package telemetry owns the backend's OpenTelemetry wiring. The provider +// bootstrap (exporter selection, propagators, shutdown, Go runtime metrics) is +// shared across the Scrabble services in scrabble/pkg/telemetry; this package is a +// thin backend-flavoured facade over it (the "scrabble-backend" default service +// name) plus the backend-specific gin request-timing middleware (middleware.go), +// which uses the registered global tracer so requests are timed and logged even +// when the exporter is "none". package telemetry import ( "context" - "errors" - "fmt" - "go.opentelemetry.io/otel" - "go.opentelemetry.io/otel/attribute" - "go.opentelemetry.io/otel/exporters/stdout/stdoutmetric" - "go.opentelemetry.io/otel/exporters/stdout/stdouttrace" - "go.opentelemetry.io/otel/metric" - "go.opentelemetry.io/otel/propagation" - sdkmetric "go.opentelemetry.io/otel/sdk/metric" - "go.opentelemetry.io/otel/sdk/resource" - sdktrace "go.opentelemetry.io/otel/sdk/trace" "go.opentelemetry.io/otel/trace" "go.uber.org/zap" + + pkgtel "scrabble/pkg/telemetry" ) -// Exporter selectors supported by the backend. +// Exporter selectors, re-exported from scrabble/pkg/telemetry so the backend's +// config and tests need not import the shared package directly. const ( - ExporterNone = "none" - ExporterStdout = "stdout" + ExporterNone = pkgtel.ExporterNone + ExporterStdout = pkgtel.ExporterStdout + ExporterOTLP = pkgtel.ExporterOTLP ) -// DefaultServiceName labels traces and metrics when BACKEND_SERVICE_NAME is -// unset. +// DefaultServiceName labels traces and metrics when BACKEND_SERVICE_NAME is unset. const DefaultServiceName = "scrabble-backend" -// Config selects the telemetry providers' service name and exporters. -type Config struct { - // ServiceName is reported as the OpenTelemetry service.name resource. - ServiceName string - // TracesExporter is one of ExporterNone or ExporterStdout. - TracesExporter string - // MetricsExporter is one of ExporterNone or ExporterStdout. - MetricsExporter string -} +// Config selects the telemetry providers' service name and exporters. It aliases +// the shared configuration type. +type Config = pkgtel.Config -// DefaultConfig returns the MVP telemetry configuration: named service, no -// exporters (so no collector is required locally or in CI). +// Runtime owns the shared OpenTelemetry providers. It aliases the shared runtime +// type, so callers keep using telemetry.Runtime. +type Runtime = pkgtel.Runtime + +// DefaultConfig returns the backend's telemetry configuration: the +// "scrabble-backend" service name and both exporters off (so no collector is +// required locally or in CI). func DefaultConfig() Config { - return Config{ - ServiceName: DefaultServiceName, - TracesExporter: ExporterNone, - MetricsExporter: ExporterNone, - } -} - -// Validate reports whether the configuration selects supported exporters. -func (c Config) Validate() error { - if c.ServiceName == "" { - return errors.New("telemetry: ServiceName must not be empty") - } - if err := validateExporter("traces", c.TracesExporter); err != nil { - return err - } - return validateExporter("metrics", c.MetricsExporter) -} - -func validateExporter(kind, value string) error { - switch value { - case ExporterNone, ExporterStdout: - return nil - default: - return fmt.Errorf("telemetry: unsupported %s exporter %q", kind, value) - } -} - -// Runtime owns the shared OpenTelemetry providers. -type Runtime struct { - tracerProvider *sdktrace.TracerProvider - meterProvider *sdkmetric.MeterProvider + return pkgtel.DefaultConfig(DefaultServiceName) } // New constructs the telemetry runtime, registers the global providers and the -// W3C trace-context/baggage propagators, and returns the Runtime. Callers must -// invoke Runtime.Shutdown during process exit. +// W3C propagators, and returns the Runtime. Callers must invoke Runtime.Shutdown +// during process exit. func New(ctx context.Context, cfg Config) (*Runtime, error) { - if err := cfg.Validate(); err != nil { - return nil, err - } - - res, err := resource.New(ctx, resource.WithAttributes( - attribute.String("service.name", cfg.ServiceName), - )) - if err != nil { - return nil, fmt.Errorf("telemetry: build resource: %w", err) - } - - tracerProvider, err := newTracerProvider(cfg, res) - if err != nil { - return nil, fmt.Errorf("telemetry: build tracer provider: %w", err) - } - meterProvider, err := newMeterProvider(cfg, res) - if err != nil { - _ = tracerProvider.Shutdown(ctx) - return nil, fmt.Errorf("telemetry: build meter provider: %w", err) - } - - otel.SetTracerProvider(tracerProvider) - otel.SetMeterProvider(meterProvider) - otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator( - propagation.TraceContext{}, - propagation.Baggage{}, - )) - - return &Runtime{tracerProvider: tracerProvider, meterProvider: meterProvider}, nil -} - -// TracerProvider returns the runtime tracer provider, or the global one when r -// is not initialised. -func (r *Runtime) TracerProvider() trace.TracerProvider { - if r == nil || r.tracerProvider == nil { - return otel.GetTracerProvider() - } - return r.tracerProvider -} - -// MeterProvider returns the runtime meter provider, or the global one when r is -// not initialised. -func (r *Runtime) MeterProvider() metric.MeterProvider { - if r == nil || r.meterProvider == nil { - return otel.GetMeterProvider() - } - return r.meterProvider -} - -// Shutdown flushes both providers within ctx. -func (r *Runtime) Shutdown(ctx context.Context) error { - if r == nil { - return nil - } - var err error - if r.meterProvider != nil { - err = errors.Join(err, r.meterProvider.Shutdown(ctx)) - } - if r.tracerProvider != nil { - err = errors.Join(err, r.tracerProvider.Shutdown(ctx)) - } - return err + return pkgtel.New(ctx, cfg) } // TraceFieldsFromContext returns zap fields identifying the active span, or nil -// when ctx carries no valid span context. Collocated here so callers do not -// import the OpenTelemetry API directly. +// when ctx carries no valid span context. Collocated here so callers (the +// request-timing middleware and the access log) do not import the OpenTelemetry +// API directly. func TraceFieldsFromContext(ctx context.Context) []zap.Field { if ctx == nil { return nil @@ -166,39 +66,3 @@ func TraceFieldsFromContext(ctx context.Context) []zap.Field { zap.String("otel_span_id", sc.SpanID().String()), } } - -func newTracerProvider(cfg Config, res *resource.Resource) (*sdktrace.TracerProvider, error) { - switch cfg.TracesExporter { - case ExporterNone: - return sdktrace.NewTracerProvider(sdktrace.WithResource(res)), nil - case ExporterStdout: - exporter, err := stdouttrace.New() - if err != nil { - return nil, fmt.Errorf("stdout trace exporter: %w", err) - } - return sdktrace.NewTracerProvider( - sdktrace.WithBatcher(exporter), - sdktrace.WithResource(res), - ), nil - default: - return nil, fmt.Errorf("unsupported traces exporter %q", cfg.TracesExporter) - } -} - -func newMeterProvider(cfg Config, res *resource.Resource) (*sdkmetric.MeterProvider, error) { - switch cfg.MetricsExporter { - case ExporterNone: - return sdkmetric.NewMeterProvider(sdkmetric.WithResource(res)), nil - case ExporterStdout: - exporter, err := stdoutmetric.New() - if err != nil { - return nil, fmt.Errorf("stdout metric exporter: %w", err) - } - return sdkmetric.NewMeterProvider( - sdkmetric.WithResource(res), - sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter)), - ), nil - default: - return nil, fmt.Errorf("unsupported metrics exporter %q", cfg.MetricsExporter) - } -} diff --git a/backend/internal/telemetry/telemetry_test.go b/backend/internal/telemetry/telemetry_test.go index 1c16d5f..efe58ae 100644 --- a/backend/internal/telemetry/telemetry_test.go +++ b/backend/internal/telemetry/telemetry_test.go @@ -12,9 +12,15 @@ func TestConfigValidate(t *testing.T) { } otlp := DefaultConfig() - otlp.TracesExporter = "otlp" - if err := otlp.Validate(); err == nil { - t.Error("otlp exporter must be rejected in the MVP set") + otlp.TracesExporter = ExporterOTLP + if err := otlp.Validate(); err != nil { + t.Errorf("otlp exporter must be accepted: %v", err) + } + + bad := DefaultConfig() + bad.MetricsExporter = "prometheus" + if err := bad.Validate(); err == nil { + t.Error("unsupported exporter must be rejected") } noName := DefaultConfig() diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index ba7d285..ff5d018 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -444,14 +444,27 @@ promotions) is future work and would deliver short markdown messages (text + lin ## 11. Observability - Structured logging with `go.uber.org/zap` (JSON). OpenTelemetry tracer and - meter providers are wired (Stage 1), env-gated by - `BACKEND_OTEL_{TRACES,METRICS}_EXPORTER` with a default of `none` (so no - collector is required locally or in CI); `stdout` is available for debugging - and the Postgres pool is instrumented with otelsql. OTLP export, a Prometheus - pull endpoint, and dashboards arrive with the first real workload. + meter providers are wired in **all three services** (backend, gateway, the + Telegram connector) through a shared `pkg/telemetry` bootstrap, env-gated per + service by `{BACKEND,GATEWAY,TELEGRAM}_OTEL_{TRACES,METRICS}_EXPORTER` with a + default of `none` (so no collector is required locally or in CI). `stdout` is + available for debugging; **`otlp`** (gRPC, endpoint from the standard + `OTEL_EXPORTER_OTLP_*` environment) exports to a collector. The Postgres pool is + instrumented with otelsql and `otelgrpc` traces the backend↔gateway push stream + and the gateway↔connector calls. The OTLP collector and Grafana dashboards are + stood up with the deploy (Stage 14). - Per-request server-side timing via gin middleware from day one (the access log carries method, route, status, latency and the active trace id). A client-measured RTT piggybacked on the next request is a later enhancement. +- Domain/operational metrics (Stage 12), recorded through the meter and invisible + until an exporter is configured: histograms `game_replay_duration` (journal + rebuild on a cache miss) and `game_move_validate_duration`; counters + `games_started_total`, `games_abandoned_total` (a turn-timeout seat drop), + `chat_messages_total` (`kind` = message/nudge) and `robot_games_finished_total`; + an observable gauge `game_cache_active`; the gateway `edge_request_duration` + (the UI-perceived roundtrip, by `message_type`/`result`); and Go runtime/heap + metrics. Game-scoped metrics carry a `variant` attribute + (english/russian_scrabble/erudit). - Unauthenticated `GET /healthz` (liveness) and `GET /readyz` (readiness — the database answers a bounded ping and the session cache is warmed). - The backend serves a **second listener** — a gRPC server @@ -486,15 +499,15 @@ a dedicated redeem sub-limit or a longer code is the hardening step if abuse app ## 13. Deployment (informational) Single public origin, path-routed: a mini-landing at the root, the **Telegram Mini -App under `/telegram/`** (the gateway serves the static UI build; outside Telegram -that path redirects to the root), the gateway public surface and the **admin console +App under `/telegram/`** (the gateway serves the static UI build, wired in Stage 14; +outside Telegram that path redirects to the root), the gateway public surface and the **admin console at `/_gm`** (backend-rendered, Basic-Auth at the gateway) share one host that terminates TLS. The **Telegram connector** runs as a separate container with **no public ingress** — it long-polls Telegram and egresses through a VPN sidecar, answering only internal gRPC. MVP runs one `gateway`, one `backend`, one Postgres, plus the connector. The connector's Docker/compose ships now -(`platform/telegram/deploy`, mirroring `../15-puzzle`); the full multi-service deploy -is Stage 12. +(`platform/telegram/deploy`, mirroring `../15-puzzle`); the gateway's static UI serving +and the full multi-service deploy land in Stage 14. ## 14. CI & branches diff --git a/docs/FUNCTIONAL.md b/docs/FUNCTIONAL.md index 2444c52..1b3c6cb 100644 --- a/docs/FUNCTIONAL.md +++ b/docs/FUNCTIONAL.md @@ -29,7 +29,8 @@ session token; the backend resolves it to an internal `user_id`. A **Telegram Mi App** launch authenticates from the platform's signed `initData`, themes the UI to the Telegram colours, and — on first contact — seeds the new account's interface language from the Telegram client. Guests are session-only with restricted features -(auto-match only; no friends, stats or history). While the app is open the client +(auto-match only; no friends, stats or history); an abandoned guest that never +joined a game and has been idle past the retention window is garbage-collected. While the app is open the client keeps a live stream and receives in-app updates in real time — the opponent's move, your turn, chat, nudges and a found match. When the app is **closed**, the chosen out-of-app events (your turn, nudge, a found match, an invitation or friend request) diff --git a/docs/FUNCTIONAL_ru.md b/docs/FUNCTIONAL_ru.md index 049ef5b..d20e7be 100644 --- a/docs/FUNCTIONAL_ru.md +++ b/docs/FUNCTIONAL_ru.md @@ -30,7 +30,8 @@ session-токен; backend сопоставляет его с внутренн Mini App** авторизует по подписанным `initData` платформы, перекрашивает интерфейс в цвета Telegram и — при первом контакте — задаёт язык интерфейса нового аккаунта по языку Telegram-клиента. Гость — только сессия, с урезанными функциями (только -авто-подбор; без друзей, статистики и истории). Пока приложение открыто, клиент +авто-подбор; без друзей, статистики и истории); заброшенный гость, не вошедший ни +в одну игру и простаивавший дольше окна удержания, удаляется сборщиком. Пока приложение открыто, клиент держит живой стрим и получает обновления в реальном времени — ход соперника, ваш ход, чат, nudge и найденный матч. Когда приложение **закрыто**, выбранные внеприложенческие события (ваш ход, nudge, найденный матч, приглашение или заявка в друзья) приходят diff --git a/docs/TESTING.md b/docs/TESTING.md index b23d4fb..0244883 100644 --- a/docs/TESTING.md +++ b/docs/TESTING.md @@ -93,6 +93,16 @@ tests or touching CI. dictionary-change pipeline** (file → resolve with a disposition → pending change → mark applied), the admin **list/count** read queries, and the **/_gm console over HTTP** (pages render; a resolve POST needs a same-origin header). +- **Observability & performance** *(Stage 12)* — `pkg/telemetry` unit-tests the exporter + selection (`none`/`stdout`/`otlp` build providers; OTLP constructs with no collector; + the nil-runtime fallback). The domain metrics are exercised through a manual + `sdkmetric` reader: `backend/internal/game` and `…/social` assert the counters and + histograms record with the right `variant`/`kind` attributes, and + `gateway/internal/connectsrv` asserts `edge_request_duration` by `message_type`/ + `result`. Config tests cover the new telemetry env vars (backend/gateway/connector — + `otlp` now accepted, an unsupported exporter rejected) and the guest-reaper knobs. + Postgres-backed `inttest` drives the **guest reaper** end to end (an abandoned guest is + reaped; a too-young guest, a seated guest and a durable account are kept). ## Principles diff --git a/gateway/README.md b/gateway/README.md index b293f49..21f53b9 100644 --- a/gateway/README.md +++ b/gateway/README.md @@ -72,6 +72,9 @@ connector (`ValidateLoginWidget`) and forward the trusted `external_id`. These | `GATEWAY_SESSION_TTL` | `10m` | cached session lifetime | | `GATEWAY_SESSION_CACHE_MAX` | `50000` | cached session cap | | `GATEWAY_PUSH_HEARTBEAT_INTERVAL` | `15s` | live-stream keep-alive | +| `GATEWAY_SERVICE_NAME` | `scrabble-gateway` | OpenTelemetry `service.name` | +| `GATEWAY_OTEL_TRACES_EXPORTER` | `none` | `none`, `stdout` or `otlp` (gRPC; endpoint from `OTEL_EXPORTER_OTLP_*`) | +| `GATEWAY_OTEL_METRICS_EXPORTER` | `none` | `none`, `stdout` or `otlp` | Rate-limit defaults (built-in): public 30/min·IP (burst 10), authenticated 120/min·user (burst 40), admin 60/min·IP (burst 20), email-code 5/10 min·IP. diff --git a/gateway/cmd/gateway/main.go b/gateway/cmd/gateway/main.go index 766c431..b96634d 100644 --- a/gateway/cmd/gateway/main.go +++ b/gateway/cmd/gateway/main.go @@ -26,11 +26,14 @@ import ( "scrabble/gateway/internal/ratelimit" "scrabble/gateway/internal/session" "scrabble/gateway/internal/transcode" + pkgtel "scrabble/pkg/telemetry" ) const ( // shutdownTimeout bounds the graceful HTTP shutdown. shutdownTimeout = 10 * time.Second + // telemetryShutdownTimeout bounds the OpenTelemetry flush during process exit. + telemetryShutdownTimeout = 5 * time.Second // pushReconnectDelay is the pause before re-subscribing to the backend push // stream after it ends. pushReconnectDelay = 2 * time.Second @@ -63,6 +66,21 @@ func run(ctx context.Context, cfg config.Config, logger *zap.Logger) error { ctx, cancel := context.WithCancel(ctx) defer cancel() + tel, err := pkgtel.New(ctx, cfg.Telemetry) + if err != nil { + return err + } + defer func() { + shutdownCtx, sc := context.WithTimeout(context.Background(), telemetryShutdownTimeout) + defer sc() + if err := tel.Shutdown(shutdownCtx); err != nil { + logger.Warn("telemetry shutdown", zap.Error(err)) + } + }() + if err := tel.StartRuntimeMetrics(); err != nil { + logger.Warn("telemetry: start runtime metrics", zap.Error(err)) + } + backend, err := backendclient.New(cfg.BackendHTTPURL, cfg.BackendGRPCAddr, cfg.BackendTimeout) if err != nil { return err @@ -109,6 +127,7 @@ func run(ctx context.Context, cfg config.Config, logger *zap.Logger) error { Heartbeat: cfg.PushHeartbeatInterval, Logger: logger, AdminProxy: adminProxy, + Meter: tel.MeterProvider().Meter("scrabble/gateway/edge"), }) // Bridge the backend push stream into the fan-out hub (and the out-of-app diff --git a/gateway/go.mod b/gateway/go.mod index 0117328..be4f349 100644 --- a/gateway/go.mod +++ b/gateway/go.mod @@ -14,6 +14,10 @@ require ( ) require ( + go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.68.0 + go.opentelemetry.io/otel v1.43.0 + go.opentelemetry.io/otel/metric v1.43.0 + go.opentelemetry.io/otel/sdk/metric v1.43.0 go.uber.org/multierr v1.10.0 // indirect golang.org/x/sys v0.43.0 // indirect golang.org/x/text v0.36.0 // indirect diff --git a/gateway/internal/backendclient/client.go b/gateway/internal/backendclient/client.go index 9252e09..c63f2fb 100644 --- a/gateway/internal/backendclient/client.go +++ b/gateway/internal/backendclient/client.go @@ -14,6 +14,7 @@ import ( "strings" "time" + "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc" "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" @@ -32,7 +33,10 @@ type Client struct { // backend lives on a trusted network segment, so the gRPC connection uses // insecure (plaintext) transport credentials (ARCHITECTURE.md §12). func New(httpURL, grpcAddr string, timeout time.Duration) (*Client, error) { - conn, err := grpc.NewClient(grpcAddr, grpc.WithTransportCredentials(insecure.NewCredentials())) + conn, err := grpc.NewClient(grpcAddr, + grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithStatsHandler(otelgrpc.NewClientHandler()), + ) if err != nil { return nil, fmt.Errorf("backendclient: dial push %s: %w", grpcAddr, err) } diff --git a/gateway/internal/config/config.go b/gateway/internal/config/config.go index eb32ec8..6ea46e4 100644 --- a/gateway/internal/config/config.go +++ b/gateway/internal/config/config.go @@ -7,6 +7,8 @@ import ( "os" "strconv" "time" + + pkgtel "scrabble/pkg/telemetry" ) // Config holds the gateway's runtime configuration. @@ -38,6 +40,8 @@ type Config struct { PushHeartbeatInterval time.Duration // RateLimit configures the in-memory anti-abuse limiter. RateLimit RateLimitConfig + // Telemetry configures the OpenTelemetry providers (shared bootstrap). + Telemetry pkgtel.Config } // RateLimitConfig holds the token-bucket limits per class. Public and admin are @@ -64,6 +68,7 @@ const ( defaultSessionTTL = 10 * time.Minute defaultSessionCacheMax = 50000 defaultPushHeartbeatInterval = 15 * time.Second + defaultServiceName = "scrabble-gateway" ) // DefaultRateLimit returns the built-in anti-abuse limits. @@ -91,6 +96,11 @@ func Load() (Config, error) { SessionCacheMax: defaultSessionCacheMax, RateLimit: DefaultRateLimit(), } + tel := pkgtel.DefaultConfig(defaultServiceName) + tel.ServiceName = envOr("GATEWAY_SERVICE_NAME", tel.ServiceName) + tel.TracesExporter = envOr("GATEWAY_OTEL_TRACES_EXPORTER", tel.TracesExporter) + tel.MetricsExporter = envOr("GATEWAY_OTEL_METRICS_EXPORTER", tel.MetricsExporter) + c.Telemetry = tel if c.BackendTimeout, err = envDuration("GATEWAY_BACKEND_TIMEOUT", defaultBackendTimeout); err != nil { return Config{}, err } @@ -131,6 +141,9 @@ func (c Config) validate() error { if c.BackendGRPCAddr == "" { return fmt.Errorf("config: GATEWAY_BACKEND_GRPC_ADDR must not be empty") } + if err := c.Telemetry.Validate(); err != nil { + return fmt.Errorf("config: %w", err) + } return nil } diff --git a/gateway/internal/config/config_test.go b/gateway/internal/config/config_test.go new file mode 100644 index 0000000..8f9afcd --- /dev/null +++ b/gateway/internal/config/config_test.go @@ -0,0 +1,31 @@ +package config + +import ( + "testing" + + pkgtel "scrabble/pkg/telemetry" +) + +// TestLoadTelemetryDefaults verifies the gateway telemetry defaults: the +// "scrabble-gateway" service name and both exporters off. +func TestLoadTelemetryDefaults(t *testing.T) { + c, err := Load() + if err != nil { + t.Fatalf("Load: %v", err) + } + if c.Telemetry.ServiceName != defaultServiceName { + t.Errorf("Telemetry.ServiceName = %q, want %q", c.Telemetry.ServiceName, defaultServiceName) + } + if c.Telemetry.TracesExporter != pkgtel.ExporterNone || c.Telemetry.MetricsExporter != pkgtel.ExporterNone { + t.Errorf("exporters = %q/%q, want none/none", c.Telemetry.TracesExporter, c.Telemetry.MetricsExporter) + } +} + +// TestLoadRejectsUnsupportedExporter verifies an exporter outside the supported +// set fails validation. +func TestLoadRejectsUnsupportedExporter(t *testing.T) { + t.Setenv("GATEWAY_OTEL_METRICS_EXPORTER", "prometheus") + if _, err := Load(); err == nil { + t.Fatal("Load: expected an error for an unsupported exporter, got nil") + } +} diff --git a/gateway/internal/connector/client.go b/gateway/internal/connector/client.go index 40f0d17..1f3ce78 100644 --- a/gateway/internal/connector/client.go +++ b/gateway/internal/connector/client.go @@ -9,6 +9,7 @@ import ( "errors" "fmt" + "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc" "google.golang.org/grpc" "google.golang.org/grpc/codes" "google.golang.org/grpc/credentials/insecure" @@ -42,7 +43,10 @@ type Client struct { // New dials the connector gRPC endpoint. func New(addr string) (*Client, error) { - conn, err := grpc.NewClient(addr, grpc.WithTransportCredentials(insecure.NewCredentials())) + conn, err := grpc.NewClient(addr, + grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithStatsHandler(otelgrpc.NewClientHandler()), + ) if err != nil { return nil, fmt.Errorf("connector: dial %s: %w", addr, err) } diff --git a/gateway/internal/connectsrv/metrics.go b/gateway/internal/connectsrv/metrics.go new file mode 100644 index 0000000..abcfd97 --- /dev/null +++ b/gateway/internal/connectsrv/metrics.go @@ -0,0 +1,43 @@ +package connectsrv + +import ( + "context" + "time" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/metric/noop" +) + +// meterName scopes the gateway edge's OpenTelemetry instruments. +const meterName = "scrabble/gateway/edge" + +// serverMetrics holds the edge's operational instruments. It defaults to no-ops; +// NewServer installs the real meter when one is supplied in Deps. +type serverMetrics struct { + edge metric.Float64Histogram +} + +// newServerMetrics builds the instruments on meter (nil selects a no-op meter), +// falling back to a no-op histogram on the (rare) construction error. +func newServerMetrics(meter metric.Meter) *serverMetrics { + if meter == nil { + meter = noop.NewMeterProvider().Meter(meterName) + } + h, err := meter.Float64Histogram("edge_request_duration", + metric.WithUnit("s"), + metric.WithDescription("Seconds to serve one Connect Execute call, by message type and result.")) + if err != nil { + h, _ = noop.NewMeterProvider().Meter(meterName).Float64Histogram("edge_request_duration") + } + return &serverMetrics{edge: h} +} + +// recordEdge records the duration of one Execute call labelled by message type and +// outcome (ok, domain, unauthenticated, rate_limited, unknown_type or internal). +func (m *serverMetrics) recordEdge(ctx context.Context, msgType, result string, start time.Time) { + m.edge.Record(ctx, time.Since(start).Seconds(), metric.WithAttributes( + attribute.String("message_type", msgType), + attribute.String("result", result), + )) +} diff --git a/gateway/internal/connectsrv/metrics_test.go b/gateway/internal/connectsrv/metrics_test.go new file mode 100644 index 0000000..80987cc --- /dev/null +++ b/gateway/internal/connectsrv/metrics_test.go @@ -0,0 +1,54 @@ +package connectsrv + +import ( + "context" + "testing" + "time" + + "go.opentelemetry.io/otel/attribute" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" +) + +// TestEdgeMetric records Execute outcomes through a manual reader and asserts the +// edge_request_duration histogram splits by message_type and result. +func TestEdgeMetric(t *testing.T) { + ctx := context.Background() + reader := sdkmetric.NewManualReader() + meter := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)).Meter("test") + m := newServerMetrics(meter) + + m.recordEdge(ctx, "game.submit_play", "ok", time.Now().Add(-time.Millisecond)) + m.recordEdge(ctx, "game.submit_play", "ok", time.Now().Add(-time.Millisecond)) + m.recordEdge(ctx, "auth.guest", "domain", time.Now().Add(-time.Millisecond)) + + var rm metricdata.ResourceMetrics + if err := reader.Collect(ctx, &rm); err != nil { + t.Fatalf("collect: %v", err) + } + + type key struct{ messageType, result string } + counts := map[key]uint64{} + for _, sm := range rm.ScopeMetrics { + for _, md := range sm.Metrics { + if md.Name != "edge_request_duration" { + continue + } + h, ok := md.Data.(metricdata.Histogram[float64]) + if !ok { + t.Fatalf("edge_request_duration is not a float64 histogram") + } + for _, dp := range h.DataPoints { + mt, _ := dp.Attributes.Value(attribute.Key("message_type")) + res, _ := dp.Attributes.Value(attribute.Key("result")) + counts[key{mt.AsString(), res.AsString()}] += dp.Count + } + } + } + if got := counts[key{"game.submit_play", "ok"}]; got != 2 { + t.Errorf("edge game.submit_play/ok = %d, want 2", got) + } + if got := counts[key{"auth.guest", "domain"}]; got != 1 { + t.Errorf("edge auth.guest/domain = %d, want 1", got) + } +} diff --git a/gateway/internal/connectsrv/server.go b/gateway/internal/connectsrv/server.go index 9c478b1..db3c72a 100644 --- a/gateway/internal/connectsrv/server.go +++ b/gateway/internal/connectsrv/server.go @@ -14,6 +14,7 @@ import ( "time" "connectrpc.com/connect" + "go.opentelemetry.io/otel/metric" "go.uber.org/zap" "golang.org/x/net/http2" "golang.org/x/net/http2/h2c" @@ -39,6 +40,7 @@ type Server struct { heartbeat time.Duration log *zap.Logger adminProxy http.Handler + metrics *serverMetrics publicPolicy ratelimit.Policy userPolicy ratelimit.Policy @@ -55,6 +57,7 @@ type Deps struct { Heartbeat time.Duration Logger *zap.Logger AdminProxy http.Handler + Meter metric.Meter } // NewServer constructs the edge service. @@ -71,6 +74,7 @@ func NewServer(d Deps) *Server { heartbeat: d.Heartbeat, log: log, adminProxy: d.AdminProxy, + metrics: newServerMetrics(d.Meter), publicPolicy: ratelimit.PerMinute(d.RateLimit.PublicPerMinute, d.RateLimit.PublicBurst), userPolicy: ratelimit.PerMinute(d.RateLimit.UserPerMinute, d.RateLimit.UserBurst), emailPolicy: ratelimit.Per(d.RateLimit.EmailPer10Min, 10*time.Minute, d.RateLimit.EmailBurst), @@ -95,9 +99,14 @@ func (s *Server) HTTPHandler() http.Handler { // (result_code != "ok", HTTP 200); only edge failures (rate limit, missing // session, unknown type, internal) become Connect errors. func (s *Server) Execute(ctx context.Context, req *connect.Request[edgev1.ExecuteRequest]) (*connect.Response[edgev1.ExecuteResponse], error) { + start := time.Now() msgType := req.Msg.GetMessageType() + result := "internal" + defer func() { s.metrics.recordEdge(ctx, msgType, result, start) }() + op, ok := s.registry.Lookup(msgType) if !ok { + result = "unknown_type" return nil, connect.NewError(connect.CodeNotFound, errUnknownMessageType(msgType)) } clientIP := peerIP(req.Peer().Addr, req.Header()) @@ -106,17 +115,21 @@ func (s *Server) Execute(ctx context.Context, req *connect.Request[edgev1.Execut if op.Auth { uid, err := s.resolve(ctx, req.Header()) if err != nil { + result = "unauthenticated" return nil, err } if !s.limiter.Allow("user:"+uid, s.userPolicy) { + result = "rate_limited" return nil, connect.NewError(connect.CodeResourceExhausted, errRateLimited) } tr.UserID = uid } else { if !s.limiter.Allow("ip:"+clientIP, s.publicPolicy) { + result = "rate_limited" return nil, connect.NewError(connect.CodeResourceExhausted, errRateLimited) } if op.Email && !s.limiter.Allow("email:"+clientIP, s.emailPolicy) { + result = "rate_limited" return nil, connect.NewError(connect.CodeResourceExhausted, errRateLimited) } } @@ -124,6 +137,7 @@ func (s *Server) Execute(ctx context.Context, req *connect.Request[edgev1.Execut payload, err := op.Handler(ctx, tr) if err != nil { if code, domain := transcode.DomainCode(err); domain { + result = "domain" return connect.NewResponse(&edgev1.ExecuteResponse{ RequestId: req.Msg.GetRequestId(), ResultCode: code, @@ -132,6 +146,7 @@ func (s *Server) Execute(ctx context.Context, req *connect.Request[edgev1.Execut s.log.Error("execute failed", zap.String("message_type", msgType), zap.Error(err)) return nil, connect.NewError(connect.CodeInternal, errInternal) } + result = "ok" return connect.NewResponse(&edgev1.ExecuteResponse{ RequestId: req.Msg.GetRequestId(), ResultCode: "ok", diff --git a/go.work.sum b/go.work.sum index 33a2549..c2a80ce 100644 --- a/go.work.sum +++ b/go.work.sum @@ -72,6 +72,8 @@ github.com/ydb-platform/ydb-go-sdk/v3 v3.135.0/go.mod h1:VYUUkRJkKuQPkIpgtZJj6+5 github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78/go.mod h1:aL8wCCfTfSfmXjznFBSZNN13rSJjlIOI1fUNAtF7rmI= github.com/ziutek/mymysql v1.5.4/go.mod h1:LMSpPZ6DbqWFxNCHW77HeMg9I646SAhApZ/wKdgO/C0= go.opentelemetry.io/contrib/detectors/gcp v1.39.0/go.mod h1:t/OGqzHBa5v6RHZwrDBJ2OirWc+4q/w2fTbLZwAKjTk= +go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.68.0 h1:0Qx7VGBacMm9ZENQ7TnNObTYI4ShC+lHI16seduaxZo= +go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.68.0/go.mod h1:Sje3i3MjSPKTSPvVWCaL8ugBzJwik3u4smCjUeuupqg= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/exp v0.0.0-20260410095643-746e56fc9e2f h1:W3F4c+6OLc6H2lb//N1q4WpJkhzJCK5J6kUi1NTVXfM= golang.org/x/exp v0.0.0-20260410095643-746e56fc9e2f/go.mod h1:J1xhfL/vlindoeF/aINzNzt2Bket5bjo9sdOYzOsU80= diff --git a/pkg/go.mod b/pkg/go.mod index c1b54a7..3d40996 100644 --- a/pkg/go.mod +++ b/pkg/go.mod @@ -4,15 +4,33 @@ go 1.26.3 require ( github.com/google/flatbuffers v23.5.26+incompatible + go.opentelemetry.io/contrib/instrumentation/runtime v0.68.0 + go.opentelemetry.io/otel v1.43.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 + go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.43.0 + go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.43.0 + go.opentelemetry.io/otel/metric v1.43.0 + go.opentelemetry.io/otel/sdk v1.43.0 + go.opentelemetry.io/otel/sdk/metric v1.43.0 + go.opentelemetry.io/otel/trace v1.43.0 google.golang.org/grpc v1.80.0 google.golang.org/protobuf v1.36.11 ) require ( - go.opentelemetry.io/otel v1.43.0 // indirect - go.opentelemetry.io/otel/sdk/metric v1.43.0 // indirect + github.com/cenkalti/backoff/v5 v5.0.3 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/go-logr/logr v1.4.3 // indirect + github.com/go-logr/stdr v1.2.2 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 // indirect + go.opentelemetry.io/auto/sdk v1.2.1 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 // indirect + go.opentelemetry.io/proto/otlp v1.10.0 // indirect golang.org/x/net v0.53.0 // indirect golang.org/x/sys v0.43.0 // indirect golang.org/x/text v0.36.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20260420184626-e10c466a9529 // indirect ) diff --git a/pkg/go.sum b/pkg/go.sum index 783494b..a9f02bc 100644 --- a/pkg/go.sum +++ b/pkg/go.sum @@ -1,5 +1,10 @@ +github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= +github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= @@ -12,20 +17,55 @@ github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 h1:HWRh5R2+9EifMyIHV7ZV+MIZqgz+PMpZ14Jynv3O2Zs= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0/go.mod h1:JfhWUomR1baixubs02l85lZYYOm7LV6om4ceouMv45c= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= +go.opentelemetry.io/contrib/instrumentation/runtime v0.68.0 h1:jhVIQEprwUTV+KfzzliLidclhoTOoHTgdz96kAyR8mU= +go.opentelemetry.io/contrib/instrumentation/runtime v0.68.0/go.mod h1:4HsdbLUbernaTnA8CNaNE+1g026SciXb3juRYe3l8EY= go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I= +go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0 h1:8UQVDcZxOJLtX6gxtDt3vY2WTgvZqMQRzjsqiIHQdkc= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0/go.mod h1:2lmweYCiHYpEjQ/lSJBYhj9jP1zvCvQW4BqL9dnT7FQ= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 h1:88Y4s2C8oTui1LGM6bTWkw0ICGcOLCAI5l6zsD1j20k= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0/go.mod h1:Vl1/iaggsuRlrHf/hfPJPvVag77kKyvrLeD10kpMl+A= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 h1:RAE+JPfvEmvy+0LzyUA25/SGawPwIUbZ6u0Wug54sLc= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0/go.mod h1:AGmbycVGEsRx9mXMZ75CsOyhSP6MFIcj/6dnG+vhVjk= +go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.43.0 h1:TC+BewnDpeiAmcscXbGMfxkO+mwYUwE/VySwvw88PfA= +go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.43.0/go.mod h1:J/ZyF4vfPwsSr9xJSPyQ4LqtcTPULFR64KwTikGLe+A= +go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.43.0 h1:mS47AX77OtFfKG4vtp+84kuGSFZHTyxtXIN269vChY0= +go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.43.0/go.mod h1:PJnsC41lAGncJlPUniSwM81gc80GkgWJWr3cu2nKEtU= go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM= +go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY= go.opentelemetry.io/otel/sdk v1.43.0 h1:pi5mE86i5rTeLXqoF/hhiBtUNcrAGHLKQdhg4h4V9Dg= +go.opentelemetry.io/otel/sdk v1.43.0/go.mod h1:P+IkVU3iWukmiit/Yf9AWvpyRDlUeBaRg6Y+C58QHzg= go.opentelemetry.io/otel/sdk/metric v1.43.0 h1:S88dyqXjJkuBNLeMcVPRFXpRw2fuwdvfCGLEo89fDkw= +go.opentelemetry.io/otel/sdk/metric v1.43.0/go.mod h1:C/RJtwSEJ5hzTiUz5pXF1kILHStzb9zFlIEe85bhj6A= go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A= +go.opentelemetry.io/otel/trace v1.43.0/go.mod h1:/QJhyVBUUswCphDVxq+8mld+AvhXZLhe+8WVFxiFff0= +go.opentelemetry.io/proto/otlp v1.10.0 h1:IQRWgT5srOCYfiWnpqUYz9CVmbO8bFmKcwYxpuCSL2g= +go.opentelemetry.io/proto/otlp v1.10.0/go.mod h1:/CV4QoCR/S9yaPj8utp3lvQPoqMtxXdzn7ozvvozVqk= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= golang.org/x/net v0.53.0 h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA= +golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs= golang.org/x/sys v0.43.0 h1:Rlag2XtaFTxp19wS8MXlJwTvoh8ArU6ezoyFsMyCTNI= +golang.org/x/sys v0.43.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/text v0.36.0 h1:JfKh3XmcRPqZPKevfXVpI1wXPTqbkE5f7JA92a55Yxg= +golang.org/x/text v0.36.0/go.mod h1:NIdBknypM8iqVmPiuco0Dh6P5Jcdk8lJL0CUebqK164= gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4= gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E= +google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 h1:VPWxll4HlMw1Vs/qXtN7BvhZqsS9cdAittCNvVENElA= +google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:7QBABkRtR8z+TEnmXTqIqwJLlzrZKVfAUm7tY3yGv0M= google.golang.org/genproto/googleapis/rpc v0.0.0-20260420184626-e10c466a9529 h1:XF8+t6QQiS0o9ArVan/HW8Q7cycNPGsJf6GA2nXxYAg= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260420184626-e10c466a9529/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8= google.golang.org/grpc v1.80.0 h1:Xr6m2WmWZLETvUNvIUmeD5OAagMw3FiKmMlTdViWsHM= google.golang.org/grpc v1.80.0/go.mod h1:ho/dLnxwi3EDJA4Zghp7k2Ec1+c2jqup0bFkw07bwF4= google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/pkg/telemetry/telemetry.go b/pkg/telemetry/telemetry.go new file mode 100644 index 0000000..6e76606 --- /dev/null +++ b/pkg/telemetry/telemetry.go @@ -0,0 +1,218 @@ +// Package telemetry provides the shared OpenTelemetry runtime bootstrap for the +// Scrabble services (backend, gateway and the Telegram connector). New builds the +// tracer and meter providers from a Config, registers them as the OpenTelemetry +// globals and installs the W3C trace-context/baggage propagators; the per-service +// HTTP/RPC middleware and instruments live in the owning service. +// +// Three exporters are supported per signal: "none" (the default — no collector is +// required locally or in CI), "stdout" (debugging) and "otlp" (gRPC export to a +// collector). The OTLP endpoint and security are taken from the standard +// OTEL_EXPORTER_OTLP_* environment variables read by the SDK, so no bespoke +// configuration is introduced; the collector itself is stood up with the deploy +// (PLAN.md Stage 14). +package telemetry + +import ( + "context" + "errors" + "fmt" + + "go.opentelemetry.io/contrib/instrumentation/runtime" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" + "go.opentelemetry.io/otel/exporters/stdout/stdoutmetric" + "go.opentelemetry.io/otel/exporters/stdout/stdouttrace" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/propagation" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/resource" + sdktrace "go.opentelemetry.io/otel/sdk/trace" + "go.opentelemetry.io/otel/trace" +) + +// Exporter selectors supported per signal. +const ( + ExporterNone = "none" + ExporterStdout = "stdout" + ExporterOTLP = "otlp" +) + +// Config selects a service's telemetry providers: the reported service name and +// the per-signal exporters. +type Config struct { + // ServiceName is reported as the OpenTelemetry service.name resource. + ServiceName string + // TracesExporter is one of ExporterNone, ExporterStdout or ExporterOTLP. + TracesExporter string + // MetricsExporter is one of ExporterNone, ExporterStdout or ExporterOTLP. + MetricsExporter string +} + +// DefaultConfig returns the default telemetry configuration for serviceName: both +// exporters off, so no collector is required locally or in CI. +func DefaultConfig(serviceName string) Config { + return Config{ + ServiceName: serviceName, + TracesExporter: ExporterNone, + MetricsExporter: ExporterNone, + } +} + +// Validate reports whether the configuration names a service and selects +// supported exporters. +func (c Config) Validate() error { + if c.ServiceName == "" { + return errors.New("telemetry: ServiceName must not be empty") + } + if err := validateExporter("traces", c.TracesExporter); err != nil { + return err + } + return validateExporter("metrics", c.MetricsExporter) +} + +func validateExporter(kind, value string) error { + switch value { + case ExporterNone, ExporterStdout, ExporterOTLP: + return nil + default: + return fmt.Errorf("telemetry: unsupported %s exporter %q", kind, value) + } +} + +// Runtime owns a service's OpenTelemetry providers. +type Runtime struct { + tracerProvider *sdktrace.TracerProvider + meterProvider *sdkmetric.MeterProvider +} + +// New constructs the telemetry runtime, registers the global providers and the +// W3C trace-context/baggage propagators, and returns the Runtime. Callers must +// invoke Runtime.Shutdown during process exit. The OTLP exporters dial lazily, so +// New does not fail when no collector is reachable. +func New(ctx context.Context, cfg Config) (*Runtime, error) { + if err := cfg.Validate(); err != nil { + return nil, err + } + + res, err := resource.New(ctx, resource.WithAttributes( + attribute.String("service.name", cfg.ServiceName), + )) + if err != nil { + return nil, fmt.Errorf("telemetry: build resource: %w", err) + } + + tracerProvider, err := newTracerProvider(ctx, cfg, res) + if err != nil { + return nil, fmt.Errorf("telemetry: build tracer provider: %w", err) + } + meterProvider, err := newMeterProvider(ctx, cfg, res) + if err != nil { + _ = tracerProvider.Shutdown(ctx) + return nil, fmt.Errorf("telemetry: build meter provider: %w", err) + } + + otel.SetTracerProvider(tracerProvider) + otel.SetMeterProvider(meterProvider) + otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator( + propagation.TraceContext{}, + propagation.Baggage{}, + )) + + return &Runtime{tracerProvider: tracerProvider, meterProvider: meterProvider}, nil +} + +// TracerProvider returns the runtime tracer provider, or the global one when r is +// not initialised. +func (r *Runtime) TracerProvider() trace.TracerProvider { + if r == nil || r.tracerProvider == nil { + return otel.GetTracerProvider() + } + return r.tracerProvider +} + +// MeterProvider returns the runtime meter provider, or the global one when r is +// not initialised. +func (r *Runtime) MeterProvider() metric.MeterProvider { + if r == nil || r.meterProvider == nil { + return otel.GetMeterProvider() + } + return r.meterProvider +} + +// StartRuntimeMetrics begins collecting Go runtime metrics (heap, GC, goroutines) +// against the runtime meter provider. It is a no-op observer set: nothing is +// exported while the metrics exporter is "none". Call it once after New. +func (r *Runtime) StartRuntimeMetrics() error { + return runtime.Start(runtime.WithMeterProvider(r.MeterProvider())) +} + +// Shutdown flushes both providers within ctx. +func (r *Runtime) Shutdown(ctx context.Context) error { + if r == nil { + return nil + } + var err error + if r.meterProvider != nil { + err = errors.Join(err, r.meterProvider.Shutdown(ctx)) + } + if r.tracerProvider != nil { + err = errors.Join(err, r.tracerProvider.Shutdown(ctx)) + } + return err +} + +func newTracerProvider(ctx context.Context, cfg Config, res *resource.Resource) (*sdktrace.TracerProvider, error) { + switch cfg.TracesExporter { + case ExporterNone: + return sdktrace.NewTracerProvider(sdktrace.WithResource(res)), nil + case ExporterStdout: + exporter, err := stdouttrace.New() + if err != nil { + return nil, fmt.Errorf("stdout trace exporter: %w", err) + } + return sdktrace.NewTracerProvider( + sdktrace.WithBatcher(exporter), + sdktrace.WithResource(res), + ), nil + case ExporterOTLP: + exporter, err := otlptracegrpc.New(ctx) + if err != nil { + return nil, fmt.Errorf("otlp trace exporter: %w", err) + } + return sdktrace.NewTracerProvider( + sdktrace.WithBatcher(exporter), + sdktrace.WithResource(res), + ), nil + default: + return nil, fmt.Errorf("unsupported traces exporter %q", cfg.TracesExporter) + } +} + +func newMeterProvider(ctx context.Context, cfg Config, res *resource.Resource) (*sdkmetric.MeterProvider, error) { + switch cfg.MetricsExporter { + case ExporterNone: + return sdkmetric.NewMeterProvider(sdkmetric.WithResource(res)), nil + case ExporterStdout: + exporter, err := stdoutmetric.New() + if err != nil { + return nil, fmt.Errorf("stdout metric exporter: %w", err) + } + return sdkmetric.NewMeterProvider( + sdkmetric.WithResource(res), + sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter)), + ), nil + case ExporterOTLP: + exporter, err := otlpmetricgrpc.New(ctx) + if err != nil { + return nil, fmt.Errorf("otlp metric exporter: %w", err) + } + return sdkmetric.NewMeterProvider( + sdkmetric.WithResource(res), + sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter)), + ), nil + default: + return nil, fmt.Errorf("unsupported metrics exporter %q", cfg.MetricsExporter) + } +} diff --git a/pkg/telemetry/telemetry_test.go b/pkg/telemetry/telemetry_test.go new file mode 100644 index 0000000..d6edf23 --- /dev/null +++ b/pkg/telemetry/telemetry_test.go @@ -0,0 +1,84 @@ +package telemetry + +import ( + "context" + "testing" + "time" +) + +// TestConfigValidate covers the supported and rejected exporter selections. +func TestConfigValidate(t *testing.T) { + for _, exp := range []string{ExporterNone, ExporterStdout, ExporterOTLP} { + c := DefaultConfig("svc") + c.TracesExporter = exp + c.MetricsExporter = exp + if err := c.Validate(); err != nil { + t.Errorf("exporter %q must be valid: %v", exp, err) + } + } + bad := DefaultConfig("svc") + bad.TracesExporter = "prometheus" + if err := bad.Validate(); err == nil { + t.Error("unsupported exporter must be rejected") + } + if err := DefaultConfig("").Validate(); err == nil { + t.Error("empty service name must be rejected") + } +} + +// TestNewNoneAndStdout builds the providers with the none and stdout exporters, +// starts the runtime metrics and shuts them down. +func TestNewNoneAndStdout(t *testing.T) { + for _, exp := range []string{ExporterNone, ExporterStdout} { + cfg := DefaultConfig("svc") + cfg.TracesExporter = exp + cfg.MetricsExporter = exp + rt, err := New(context.Background(), cfg) + if err != nil { + t.Fatalf("New(%q): %v", exp, err) + } + if rt.TracerProvider() == nil || rt.MeterProvider() == nil { + t.Errorf("New(%q): nil provider", exp) + } + if err := rt.StartRuntimeMetrics(); err != nil { + t.Errorf("StartRuntimeMetrics(%q): %v", exp, err) + } + if err := rt.Shutdown(context.Background()); err != nil { + t.Errorf("Shutdown(%q): %v", exp, err) + } + } +} + +// TestNewOTLPConstructs verifies the otlp exporter selection builds providers +// without a reachable collector (the gRPC exporters dial lazily). Shutdown is +// best-effort: with no collector the final export may error, so it is bounded and +// its result ignored. +func TestNewOTLPConstructs(t *testing.T) { + cfg := DefaultConfig("svc") + cfg.TracesExporter = ExporterOTLP + cfg.MetricsExporter = ExporterOTLP + rt, err := New(context.Background(), cfg) + if err != nil { + t.Fatalf("New(otlp): %v", err) + } + if rt.TracerProvider() == nil || rt.MeterProvider() == nil { + t.Error("New(otlp): nil provider") + } + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _ = rt.Shutdown(ctx) +} + +// TestNilRuntime checks the nil-receiver fallbacks used before initialisation. +func TestNilRuntime(t *testing.T) { + var rt *Runtime + if rt.TracerProvider() == nil { + t.Error("nil runtime must fall back to the global tracer provider") + } + if rt.MeterProvider() == nil { + t.Error("nil runtime must fall back to the global meter provider") + } + if err := rt.Shutdown(context.Background()); err != nil { + t.Errorf("nil runtime Shutdown: %v", err) + } +} diff --git a/platform/telegram/README.md b/platform/telegram/README.md index 2d710ff..8195537 100644 --- a/platform/telegram/README.md +++ b/platform/telegram/README.md @@ -63,6 +63,9 @@ The bot turns a `/start ` or a notification target into a launch-button | `TELEGRAM_TEST_ENV` | `false` | route to the Bot API **test environment** (`/bot/test/METHOD`) | | `TELEGRAM_GAME_CHANNEL_ID` | — | game channel chat id for `SendToGameChannel` | | `TELEGRAM_LOG_LEVEL` | `info` | zap log level | +| `TELEGRAM_SERVICE_NAME` | `scrabble-telegram` | OpenTelemetry `service.name` | +| `TELEGRAM_OTEL_TRACES_EXPORTER` | `none` | `none`, `stdout` or `otlp` (gRPC; endpoint from `OTEL_EXPORTER_OTLP_*`) | +| `TELEGRAM_OTEL_METRICS_EXPORTER` | `none` | `none`, `stdout` or `otlp` | The **test environment** is selected by `TELEGRAM_TEST_ENV=true`, which suffixes the Bot API path with `/test` (the connector appends it to the token, since the client diff --git a/platform/telegram/cmd/telegram/main.go b/platform/telegram/cmd/telegram/main.go index 32cdfae..ef48286 100644 --- a/platform/telegram/cmd/telegram/main.go +++ b/platform/telegram/cmd/telegram/main.go @@ -12,11 +12,14 @@ import ( "net" "os/signal" "syscall" + "time" + "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc" "go.uber.org/zap" "google.golang.org/grpc" telegramv1 "scrabble/pkg/proto/telegram/v1" + pkgtel "scrabble/pkg/telemetry" "scrabble/platform/telegram/internal/bot" "scrabble/platform/telegram/internal/config" "scrabble/platform/telegram/internal/connector" @@ -24,6 +27,9 @@ import ( "scrabble/platform/telegram/internal/loginwidget" ) +// telemetryShutdownTimeout bounds the OpenTelemetry flush during process exit. +const telemetryShutdownTimeout = 5 * time.Second + func main() { cfg, err := config.Load() if err != nil { @@ -46,6 +52,21 @@ func main() { // run wires the bot and the gRPC server and serves both until the context is // cancelled. func run(ctx context.Context, cfg config.Config, logger *zap.Logger) error { + tel, err := pkgtel.New(ctx, cfg.Telemetry) + if err != nil { + return err + } + defer func() { + shutdownCtx, cancel := context.WithTimeout(context.Background(), telemetryShutdownTimeout) + defer cancel() + if err := tel.Shutdown(shutdownCtx); err != nil { + logger.Warn("telemetry shutdown", zap.Error(err)) + } + }() + if err := tel.StartRuntimeMetrics(); err != nil { + logger.Warn("telemetry: start runtime metrics", zap.Error(err)) + } + b, err := bot.New(bot.Config{ Token: cfg.BotToken, APIBaseURL: cfg.APIBaseURL, @@ -60,7 +81,7 @@ func run(ctx context.Context, cfg config.Config, logger *zap.Logger) error { loginwidget.NewHMACValidator(cfg.BotToken), b, cfg.GameChannelID, logger) - grpcServer := grpc.NewServer() + grpcServer := grpc.NewServer(grpc.StatsHandler(otelgrpc.NewServerHandler())) telegramv1.RegisterTelegramServer(grpcServer, srv) lis, err := net.Listen("tcp", cfg.GRPCAddr) diff --git a/platform/telegram/go.mod b/platform/telegram/go.mod index 29bee4c..d300118 100644 --- a/platform/telegram/go.mod +++ b/platform/telegram/go.mod @@ -5,6 +5,7 @@ go 1.26.3 require ( github.com/go-telegram/bot v1.21.0 github.com/google/flatbuffers v23.5.26+incompatible + go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.68.0 go.uber.org/zap v1.27.1 google.golang.org/grpc v1.80.0 google.golang.org/protobuf v1.36.11 diff --git a/platform/telegram/internal/config/config.go b/platform/telegram/internal/config/config.go index 5e13130..3aca704 100644 --- a/platform/telegram/internal/config/config.go +++ b/platform/telegram/internal/config/config.go @@ -6,6 +6,8 @@ import ( "os" "strconv" "strings" + + pkgtel "scrabble/pkg/telemetry" ) // Config is the Telegram connector's runtime configuration, read from the @@ -34,6 +36,8 @@ type Config struct { GameChannelID int64 // LogLevel is the zap log level (TELEGRAM_LOG_LEVEL, default info). LogLevel string + // Telemetry configures the OpenTelemetry providers (shared bootstrap). + Telemetry pkgtel.Config } // Load reads the connector configuration from the environment, applying defaults @@ -47,6 +51,11 @@ func Load() (Config, error) { TestEnv: os.Getenv("TELEGRAM_TEST_ENV") == "true", LogLevel: envOr("TELEGRAM_LOG_LEVEL", "info"), } + tel := pkgtel.DefaultConfig("scrabble-telegram") + tel.ServiceName = envOr("TELEGRAM_SERVICE_NAME", tel.ServiceName) + tel.TracesExporter = envOr("TELEGRAM_OTEL_TRACES_EXPORTER", tel.TracesExporter) + tel.MetricsExporter = envOr("TELEGRAM_OTEL_METRICS_EXPORTER", tel.MetricsExporter) + cfg.Telemetry = tel if cfg.BotToken == "" { return Config{}, fmt.Errorf("config: TELEGRAM_BOT_TOKEN is required") } @@ -60,6 +69,9 @@ func Load() (Config, error) { } cfg.GameChannelID = id } + if err := cfg.Telemetry.Validate(); err != nil { + return Config{}, fmt.Errorf("config: %w", err) + } return cfg, nil } diff --git a/platform/telegram/internal/config/config_test.go b/platform/telegram/internal/config/config_test.go new file mode 100644 index 0000000..6a733bb --- /dev/null +++ b/platform/telegram/internal/config/config_test.go @@ -0,0 +1,41 @@ +package config + +import ( + "testing" + + pkgtel "scrabble/pkg/telemetry" +) + +// setRequired sets the two required connector variables so Load reaches the +// telemetry checks. +func setRequired(t *testing.T) { + t.Helper() + t.Setenv("TELEGRAM_BOT_TOKEN", "test-token") + t.Setenv("TELEGRAM_MINIAPP_URL", "https://example.org/app") +} + +// TestLoadTelemetryDefaults verifies the connector telemetry defaults: the +// "scrabble-telegram" service name and both exporters off. +func TestLoadTelemetryDefaults(t *testing.T) { + setRequired(t) + c, err := Load() + if err != nil { + t.Fatalf("Load: %v", err) + } + if c.Telemetry.ServiceName != "scrabble-telegram" { + t.Errorf("Telemetry.ServiceName = %q, want scrabble-telegram", c.Telemetry.ServiceName) + } + if c.Telemetry.TracesExporter != pkgtel.ExporterNone || c.Telemetry.MetricsExporter != pkgtel.ExporterNone { + t.Errorf("exporters = %q/%q, want none/none", c.Telemetry.TracesExporter, c.Telemetry.MetricsExporter) + } +} + +// TestLoadRejectsUnsupportedExporter verifies an exporter outside the supported +// set fails validation. +func TestLoadRejectsUnsupportedExporter(t *testing.T) { + setRequired(t) + t.Setenv("TELEGRAM_OTEL_TRACES_EXPORTER", "jaeger") + if _, err := Load(); err == nil { + t.Fatal("Load: expected an error for an unsupported exporter, got nil") + } +} From d99705645f7da77ca8a5c891272aa508ea08094e Mon Sep 17 00:00:00 2001 From: Ilia Denisov Date: Thu, 4 Jun 2026 14:24:42 +0200 Subject: [PATCH 2/2] Stage 12: mark done in the stage tracker (CI green) --- PLAN.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PLAN.md b/PLAN.md index 404f902..35e26dd 100644 --- a/PLAN.md +++ b/PLAN.md @@ -45,7 +45,7 @@ independent (see ARCHITECTURE §9.1). | 9 | Telegram integration (bot side-service, deep-link, push) | **done** | | 10 | Admin & dictionary ops (complaint review, version reload) | **done** | | 11 | Account linking & merge | **done** | -| 12 | Observability & performance (telemetry, metrics, guest GC) | todo | +| 12 | Observability & performance (telemetry, metrics, guest GC) | **done** | | 13 | Alphabet on the wire (UI alphabet-agnostic) | todo | | 14 | CI & deploy (multi-service, dictionary artifacts) | todo |