R3: gateway edge hardening — body cap, h2c sizing, rate-limit observability
- GATEWAY_MAX_BODY_BYTES (1 MiB): connect WithReadMaxBytes + http.MaxBytesReader
on the public mux; explicit http2.Server MaxConcurrentStreams/IdleTimeout and
an http.Server ReadHeaderTimeout (R2 report follow-up).
- gateway_rate_limited_total{class} counter plus a Debug log per rejection; a
rejection tracker is drained every 30 s into one Warn summary per key and a
report POSTed to /api/v1/internal/ratelimit/report (feeds the admin view + auto-flag).
- The dead AdminPerMinute/AdminBurst policy now guards the /_gm mount (429),
ahead of its Basic-Auth.
- resolve() logs the cause of infra session-resolve failures at Warn (the
transient unauthenticated dips from the R2 run); unknown tokens stay silent.
This commit is contained in:
+54
-10
@@ -39,6 +39,14 @@ const (
|
||||
pushReconnectDelay = 2 * time.Second
|
||||
// gatewayID identifies this gateway instance to the backend push channel.
|
||||
gatewayID = "gateway"
|
||||
// readHeaderTimeout bounds reading one request's headers on the public
|
||||
// listener (a slowloris guard). Bodies and long-lived streams are governed by
|
||||
// the h2c settings in connectsrv. Read/WriteTimeout stay unset on purpose:
|
||||
// they would kill the Subscribe stream (R3).
|
||||
readHeaderTimeout = 10 * time.Second
|
||||
// throttleReportInterval is the cadence of the rate-limiter rejection
|
||||
// summary: the Warn log per throttled key and the report to the backend (R3).
|
||||
throttleReportInterval = 30 * time.Second
|
||||
)
|
||||
|
||||
func main() {
|
||||
@@ -89,6 +97,7 @@ func run(ctx context.Context, cfg config.Config, logger *zap.Logger) error {
|
||||
|
||||
sessions := session.NewCache(backend, cfg.SessionTTL, cfg.SessionCacheMax)
|
||||
limiter := ratelimit.New()
|
||||
tracker := ratelimit.NewTracker()
|
||||
hub := push.NewHub(0)
|
||||
|
||||
var conn *connector.Client
|
||||
@@ -119,22 +128,26 @@ func run(ctx context.Context, cfg config.Config, logger *zap.Logger) error {
|
||||
|
||||
registry := transcode.NewRegistry(backend, validator, cfg.DefaultSupportedLanguages...)
|
||||
edge := connectsrv.NewServer(connectsrv.Deps{
|
||||
Registry: registry,
|
||||
Sessions: sessions,
|
||||
Limiter: limiter,
|
||||
Hub: hub,
|
||||
RateLimit: cfg.RateLimit,
|
||||
Heartbeat: cfg.PushHeartbeatInterval,
|
||||
Logger: logger,
|
||||
AdminProxy: adminProxy,
|
||||
Meter: tel.MeterProvider().Meter("scrabble/gateway/edge"),
|
||||
Registry: registry,
|
||||
Sessions: sessions,
|
||||
Limiter: limiter,
|
||||
Tracker: tracker,
|
||||
Hub: hub,
|
||||
RateLimit: cfg.RateLimit,
|
||||
Heartbeat: cfg.PushHeartbeatInterval,
|
||||
Logger: logger,
|
||||
AdminProxy: adminProxy,
|
||||
Meter: tel.MeterProvider().Meter("scrabble/gateway/edge"),
|
||||
MaxBodyBytes: cfg.MaxBodyBytes,
|
||||
})
|
||||
|
||||
// Bridge the backend push stream into the fan-out hub (and the out-of-app
|
||||
// channel via the connector).
|
||||
go runPushPump(ctx, backend, hub, conn, logger)
|
||||
// Periodically summarise rate-limiter rejections (Warn log + backend report).
|
||||
go runThrottleReporter(ctx, tracker, backend, logger)
|
||||
|
||||
public := &http.Server{Addr: cfg.HTTPAddr, Handler: edge.HTTPHandler()}
|
||||
public := &http.Server{Addr: cfg.HTTPAddr, Handler: edge.HTTPHandler(), ReadHeaderTimeout: readHeaderTimeout}
|
||||
servers := []*namedServer{{name: "public", srv: public}}
|
||||
|
||||
logger.Info("gateway starting",
|
||||
@@ -182,6 +195,37 @@ func runServers(ctx context.Context, cancel context.CancelFunc, servers []*named
|
||||
return first
|
||||
}
|
||||
|
||||
// runThrottleReporter drains the rate-limiter rejection tracker on a fixed
|
||||
// cadence, emits one Warn summary per throttled key and forwards the report to
|
||||
// the backend (which feeds the admin throttled view and the high-rate
|
||||
// auto-flag), until the context is done. A failed delivery is logged and
|
||||
// dropped — the next window reports fresh data anyway.
|
||||
func runThrottleReporter(ctx context.Context, tracker *ratelimit.Tracker, backend *backendclient.Client, logger *zap.Logger) {
|
||||
ticker := time.NewTicker(throttleReportInterval)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
}
|
||||
entries := tracker.Drain()
|
||||
if len(entries) == 0 {
|
||||
continue
|
||||
}
|
||||
for _, e := range entries {
|
||||
logger.Warn("rate limited",
|
||||
zap.String("class", e.Class),
|
||||
zap.String("key", e.Key),
|
||||
zap.Int("rejected", e.Rejected),
|
||||
zap.Duration("window", throttleReportInterval))
|
||||
}
|
||||
if err := backend.ReportRateLimited(ctx, int(throttleReportInterval.Seconds()), entries); err != nil {
|
||||
logger.Warn("rate-limit report failed", zap.Error(err))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// runPushPump keeps a backend push subscription open, forwarding every event to
|
||||
// the hub and re-subscribing after the stream ends, until the context is done. For
|
||||
// the out-of-app push kinds it also routes events whose recipient has no live
|
||||
|
||||
Reference in New Issue
Block a user