R3: gateway edge hardening — body cap, h2c sizing, rate-limit observability

- GATEWAY_MAX_BODY_BYTES (1 MiB): connect WithReadMaxBytes + http.MaxBytesReader
  on the public mux; explicit http2.Server MaxConcurrentStreams/IdleTimeout and
  an http.Server ReadHeaderTimeout (R2 report follow-up).
- gateway_rate_limited_total{class} counter, Debug per rejection, a rejection
  tracker drained every 30 s into a Warn summary per key and a report POST to
  /api/v1/internal/ratelimit/report (feeds the admin view + auto-flag).
- The dead AdminPerMinute/AdminBurst policy now guards the /_gm mount (429),
  ahead of its Basic-Auth.
- resolve() logs the cause of infra session-resolve failures at Warn (the
  transient unauthenticated dips from the R2 run); unknown tokens stay silent.
This commit is contained in:
Ilia Denisov
2026-06-10 01:58:48 +02:00
parent c23ac94c4e
commit 8878711cf3
12 changed files with 549 additions and 35 deletions
+54 -10
View File
@@ -39,6 +39,14 @@ const (
pushReconnectDelay = 2 * time.Second
// gatewayID identifies this gateway instance to the backend push channel.
gatewayID = "gateway"
// readHeaderTimeout bounds reading one request's headers on the public
// listener (a slowloris guard). Bodies and long-lived streams are governed by
// the h2c settings in connectsrv — Read/WriteTimeout stay unset on purpose,
// because they would kill the Subscribe stream (R3).
readHeaderTimeout = 10 * time.Second
// throttleReportInterval is the cadence of the rate-limiter rejection
// summary: the Warn log per throttled key and the report to the backend (R3).
throttleReportInterval = 30 * time.Second
)
func main() {
@@ -89,6 +97,7 @@ func run(ctx context.Context, cfg config.Config, logger *zap.Logger) error {
sessions := session.NewCache(backend, cfg.SessionTTL, cfg.SessionCacheMax)
limiter := ratelimit.New()
tracker := ratelimit.NewTracker()
hub := push.NewHub(0)
var conn *connector.Client
@@ -119,22 +128,26 @@ func run(ctx context.Context, cfg config.Config, logger *zap.Logger) error {
registry := transcode.NewRegistry(backend, validator, cfg.DefaultSupportedLanguages...)
edge := connectsrv.NewServer(connectsrv.Deps{
Registry: registry,
Sessions: sessions,
Limiter: limiter,
Hub: hub,
RateLimit: cfg.RateLimit,
Heartbeat: cfg.PushHeartbeatInterval,
Logger: logger,
AdminProxy: adminProxy,
Meter: tel.MeterProvider().Meter("scrabble/gateway/edge"),
Registry: registry,
Sessions: sessions,
Limiter: limiter,
Tracker: tracker,
Hub: hub,
RateLimit: cfg.RateLimit,
Heartbeat: cfg.PushHeartbeatInterval,
Logger: logger,
AdminProxy: adminProxy,
Meter: tel.MeterProvider().Meter("scrabble/gateway/edge"),
MaxBodyBytes: cfg.MaxBodyBytes,
})
// Bridge the backend push stream into the fan-out hub (and the out-of-app
// channel via the connector).
go runPushPump(ctx, backend, hub, conn, logger)
// Periodically summarise rate-limiter rejections (Warn log + backend report).
go runThrottleReporter(ctx, tracker, backend, logger)
public := &http.Server{Addr: cfg.HTTPAddr, Handler: edge.HTTPHandler()}
public := &http.Server{Addr: cfg.HTTPAddr, Handler: edge.HTTPHandler(), ReadHeaderTimeout: readHeaderTimeout}
servers := []*namedServer{{name: "public", srv: public}}
logger.Info("gateway starting",
@@ -182,6 +195,37 @@ func runServers(ctx context.Context, cancel context.CancelFunc, servers []*named
return first
}
// runThrottleReporter periodically flushes the rate-limiter rejection tracker.
//
// Every throttleReportInterval it drains the tracker; when the drained window
// holds at least one entry it logs one Warn line per entry (class, key,
// rejected count, window length) and then hands the whole batch to the backend
// via ReportRateLimited, which feeds the admin throttled view and the
// high-rate auto-flag. An empty window produces neither log lines nor a report.
//
// A failed report is logged at Warn and discarded; nothing is retried, since
// the following window delivers fresh data. The loop exits as soon as ctx is
// done, and the ticker is stopped on return.
func runThrottleReporter(ctx context.Context, tracker *ratelimit.Tracker, backend *backendclient.Client, logger *zap.Logger) {
	tick := time.NewTicker(throttleReportInterval)
	defer tick.Stop()

	// Both values depend only on the constant interval, so compute them once.
	windowSeconds := int(throttleReportInterval.Seconds())
	windowField := zap.Duration("window", throttleReportInterval)

	for {
		select {
		case <-ctx.Done():
			return

		case <-tick.C:
			rejections := tracker.Drain()
			if len(rejections) == 0 {
				// Quiet window: skip both the summary and the backend call.
				continue
			}

			for _, entry := range rejections {
				logger.Warn("rate limited",
					zap.String("class", entry.Class),
					zap.String("key", entry.Key),
					zap.Int("rejected", entry.Rejected),
					windowField)
			}

			// Best-effort delivery: a failure is only logged.
			reportErr := backend.ReportRateLimited(ctx, windowSeconds, rejections)
			if reportErr != nil {
				logger.Warn("rate-limit report failed", zap.Error(reportErr))
			}
		}
	}
}
// runPushPump keeps a backend push subscription open, forwarding every event to
// the hub and re-subscribing after the stream ends, until the context is done. For
// the out-of-app push kinds it also routes events whose recipient has no live