Stage 16: deploy infra & test contour
CI / unit (pull_request) Successful in 9s
CI / integration (pull_request) Successful in 11s
CI / ui (pull_request) Successful in 19s
CI / deploy (pull_request) Failing after 1s

- backend + gateway multi-stage distroless Dockerfiles; the gateway embeds and
  serves the SPA at / and /telegram/ via go:embed (committed dist placeholder,
  real build baked in by the image's node stage)
- deploy/docker-compose.yml: backend + gateway + Postgres + Telegram connector
  (VPN sidecar) + OTel Collector + Prometheus (15d) + Tempo (72h) + Grafana,
  fronted by a caddy owning a single /_gm Basic-Auth (admin console + Grafana
  subpath); inter-service on a private network, only caddy on the edge network
- new metrics: backend accounts_created_total{kind} (robots excluded) and an
  in-memory gateway active_users{window=24h,7d} gauge
- CI: single .gitea/workflows/ci.yaml (unit/integration/ui + a gated test-contour
  deploy) on the new feature/* -> development -> master branch model; the old
  go-unit/integration/ui-test workflows are folded in; the connector-scoped
  compose is retired (superseded by deploy/)
- docs: ARCHITECTURE §11/§12/§13, root + gateway READMEs, CLAUDE.md branching,
  PLAN.md (stage 16 done + refinements + Stage 17 forward-notes)
This commit is contained in:
Ilia Denisov
2026-06-05 11:42:26 +02:00
parent 8c8f8c4d42
commit 8700fbfae1
35 changed files with 1413 additions and 318 deletions
@@ -0,0 +1,63 @@
package connectsrv
import (
"sync"
"time"
)
// activeUsers tracks distinct authenticated accounts by last-action time, backing
// the in-memory active_users gauge. It is single-process by design (the gateway is
// single-instance in the MVP, docs/ARCHITECTURE.md §10): the distinct count is
// correct for one process, resets on restart, and is a live operational gauge, not
// a billing figure. Memory is bounded by the number of distinct accounts active
// within the longest window; stale entries are pruned on observation.
type activeUsers struct {
	mu       sync.Mutex           // guards lastSeen
	lastSeen map[string]time.Time // account uid -> time of its most recent action
	now      func() time.Time     // clock source; newActiveUsers sets it to time.Now
}

// newActiveUsers returns an empty tracker using the wall clock.
func newActiveUsers() *activeUsers {
	return &activeUsers{lastSeen: make(map[string]time.Time), now: time.Now}
}

// seen records that account uid performed an authenticated action now.
// An empty uid is ignored so that it never counts as an account.
func (a *activeUsers) seen(uid string) {
	if uid == "" {
		return
	}
	a.mu.Lock()
	a.lastSeen[uid] = a.now()
	a.mu.Unlock()
}

// counts returns, for each window, the number of distinct accounts last seen
// within it, pruning entries older than the longest window in the same pass.
// The result has one element per entry of windows, in the same order.
//
// When windows is empty there is no pruning horizon, so the tracker is left
// untouched and an empty result is returned; without this guard the longest
// window would be zero and every previously seen account would be dropped.
func (a *activeUsers) counts(windows []time.Duration) []int {
	res := make([]int, len(windows))
	if len(windows) == 0 {
		return res
	}

	var longest time.Duration
	for _, w := range windows {
		if w > longest {
			longest = w
		}
	}

	a.mu.Lock()
	defer a.mu.Unlock()

	now := a.now()
	for uid, ts := range a.lastSeen {
		age := now.Sub(ts)
		if age > longest {
			// Outside every window: forget the account to keep the map bounded.
			// Deleting the current key while ranging over a map is safe in Go.
			delete(a.lastSeen, uid)
			continue
		}
		// A negative age (clock stepped backwards) falls inside every window.
		for i, w := range windows {
			if age <= w {
				res[i]++
			}
		}
	}
	return res
}
@@ -0,0 +1,45 @@
package connectsrv
import (
"testing"
"time"
)
// TestActiveUsersCountsAndPrune drives the tracker with a fake clock and checks
// both the per-window distinct counts and the pruning of entries that have aged
// past the longest window.
func TestActiveUsersCountsAndPrune(t *testing.T) {
	start := time.Date(2026, 6, 5, 12, 0, 0, 0, time.UTC)
	clock := start

	tracker := newActiveUsers()
	tracker.now = func() time.Time { return clock }

	// advance moves the fake clock to start+h hours.
	advance := func(h int) { clock = start.Add(time.Duration(h) * time.Hour) }

	tracker.seen("u1") // recorded at start
	advance(2)
	tracker.seen("u2") // recorded at start+2h
	advance(50)
	tracker.seen("u3") // recorded at start+50h

	day := 24 * time.Hour
	spans := []time.Duration{day, 7 * day}

	// Clock at start+50h: only u3 falls inside 24h, all three inside 7d.
	if res := tracker.counts(spans); res[0] != 1 || res[1] != 3 {
		t.Fatalf("counts at +50h = %v, want [1 3]", res)
	}

	// Clock at start+169h: u1 is 169h old and drops out of the 7d window,
	// while u2 and u3 are still inside it; nobody is inside 24h.
	advance(169)
	if res := tracker.counts(spans); res[0] != 0 || res[1] != 2 {
		t.Fatalf("counts at +169h = %v, want [0 2]", res)
	}

	if _, present := tracker.lastSeen["u1"]; present {
		t.Fatalf("u1 should have been pruned from the tracker")
	}
}
// TestActiveUsersIgnoresEmpty verifies that an empty uid is never recorded as an
// active account.
func TestActiveUsersIgnoresEmpty(t *testing.T) {
	tracker := newActiveUsers()
	tracker.seen("")

	res := tracker.counts([]time.Duration{time.Hour})
	if res[0] == 0 {
		return
	}
	t.Fatalf("empty uid recorded: got %v", res)
}
+37 -3
View File
@@ -12,14 +12,26 @@ import (
// meterName scopes the gateway edge's OpenTelemetry instruments.
const meterName = "scrabble/gateway/edge"
// activeUserWindows lists the rolling windows the active_users gauge reports.
// Each entry pairs the label emitted as the gauge's "window" attribute with the
// look-back duration used when counting distinct accounts.
var activeUserWindows = []struct {
label string // value of the "window" attribute on the gauge observation
dur time.Duration // length of the rolling window
}{
{label: "24h", dur: 24 * time.Hour},
{label: "7d", dur: 7 * 24 * time.Hour},
}
// serverMetrics holds the edge's operational instruments. It defaults to no-ops;
// NewServer installs the real meter when one is supplied in Deps.
type serverMetrics struct {
edge metric.Float64Histogram
edge metric.Float64Histogram
active *activeUsers
}
// newServerMetrics builds the instruments on meter (nil selects a no-op meter),
// falling back to a no-op histogram on the (rare) construction error.
// falling back to a no-op histogram on the (rare) construction error. The
// active_users gauge is registered as an observable callback over the in-memory
// tracker.
func newServerMetrics(meter metric.Meter) *serverMetrics {
if meter == nil {
meter = noop.NewMeterProvider().Meter(meterName)
@@ -30,7 +42,24 @@ func newServerMetrics(meter metric.Meter) *serverMetrics {
if err != nil {
h, _ = noop.NewMeterProvider().Meter(meterName).Float64Histogram("edge_request_duration")
}
return &serverMetrics{edge: h}
m := &serverMetrics{edge: h, active: newActiveUsers()}
gauge, err := meter.Int64ObservableGauge("active_users",
metric.WithDescription("Distinct accounts that performed an authenticated action within the window (in-memory, single gateway instance)."))
if err == nil {
windows := make([]time.Duration, len(activeUserWindows))
for i, w := range activeUserWindows {
windows[i] = w.dur
}
_, _ = meter.RegisterCallback(func(_ context.Context, o metric.Observer) error {
counts := m.active.counts(windows)
for i, w := range activeUserWindows {
o.ObserveInt64(gauge, int64(counts[i]), metric.WithAttributes(attribute.String("window", w.label)))
}
return nil
}, gauge)
}
return m
}
// recordEdge records the duration of one Execute call labelled by message type and
@@ -41,3 +70,8 @@ func (m *serverMetrics) recordEdge(ctx context.Context, msgType, result string,
attribute.String("result", result),
))
}
// recordActive marks account uid active now, feeding the active_users gauge.
// It delegates to the in-memory tracker's seen method, which ignores an empty uid.
func (m *serverMetrics) recordActive(uid string) {
m.active.seen(uid)
}
+17 -1
View File
@@ -24,6 +24,7 @@ import (
"scrabble/gateway/internal/ratelimit"
"scrabble/gateway/internal/session"
"scrabble/gateway/internal/transcode"
"scrabble/gateway/internal/webui"
edgev1 "scrabble/gateway/proto/edge/v1"
"scrabble/gateway/proto/edge/v1/edgev1connect"
)
@@ -89,9 +90,21 @@ func (s *Server) HTTPHandler() http.Handler {
if s.adminProxy != nil {
// The admin console (backend /_gm) is served on the public listener behind
// the proxy's Basic-Auth, mounted below the h2c wrap so the Connect edge keeps
// working over h2c (docs/ARCHITECTURE.md §12).
// working over h2c (docs/ARCHITECTURE.md §12). In the deployed contour the
// front caddy owns the /_gm Basic-Auth and Grafana routing; this mount serves
// a non-caddy (local) setup.
mux.Handle("/_gm/", s.adminProxy)
} else {
// With the console disabled here, keep /_gm a 404 so the SPA catch-all below
// does not serve the app shell at the operator path.
mux.Handle("/_gm/", http.NotFoundHandler())
}
// The embedded single-page UI is served at the site root and, for the Telegram
// Mini App, under /telegram/ — the single-origin model (docs/ARCHITECTURE.md
// §13). Both mounts sit below the h2c wrap so the Connect edge (a more specific
// prefix) keeps priority; "/" is the catch-all SPA fallback for the hash router.
mux.Handle("/telegram/", webui.Handler("/telegram/"))
mux.Handle("/", webui.Handler(""))
return h2c.NewHandler(mux, &http2.Server{})
}
@@ -118,6 +131,9 @@ func (s *Server) Execute(ctx context.Context, req *connect.Request[edgev1.Execut
result = "unauthenticated"
return nil, err
}
// A valid session proving an authenticated request is an "action" for the
// active_users gauge, counted before the rate-limit/domain outcome.
s.metrics.recordActive(uid)
if !s.limiter.Allow("user:"+uid, s.userPolicy) {
result = "rate_limited"
return nil, connect.NewError(connect.CodeResourceExhausted, errRateLimited)