Files
scrabble-game/gateway/cmd/gateway/main.go
T
Ilia Denisov 8878711cf3 R3: gateway edge hardening — body cap, h2c sizing, rate-limit observability
- GATEWAY_MAX_BODY_BYTES (1 MiB): connect WithReadMaxBytes + http.MaxBytesReader
  on the public mux; explicit http2.Server MaxConcurrentStreams/IdleTimeout and
  an http.Server ReadHeaderTimeout (R2 report follow-up).
- gateway_rate_limited_total{class} counter, Debug per rejection, a rejection
  tracker drained every 30 s into a Warn summary per key and a report POST to
  /api/v1/internal/ratelimit/report (feeds the admin view + auto-flag).
- The dead AdminPerMinute/AdminBurst policy now guards the /_gm mount (429),
  ahead of its Basic-Auth.
- resolve() logs the cause of infra session-resolve failures at Warn (the
  transient unauthenticated dips from the R2 run); unknown tokens stay silent.
2026-06-10 01:58:48 +02:00

317 lines
11 KiB
Go

// Command gateway is the Scrabble platform's only public ingress. It terminates
// the client's Connect-RPC/FlatBuffers traffic over h2c, validates platform /
// email / guest credentials and mints opaque sessions, rate-limits, injects
// X-User-ID when forwarding to the backend over REST, and bridges the backend's
// gRPC push stream to each client's in-app live channel. It also serves the
// backend's admin console at /_gm on the public listener behind HTTP Basic-Auth.
package main
import (
	"context"
	"errors"
	"fmt"
	"log"
	"net/http"
	"os/signal"
	"syscall"
	"time"

	"go.uber.org/zap"
	"scrabble/gateway/internal/admin"
	"scrabble/gateway/internal/backendclient"
	"scrabble/gateway/internal/config"
	"scrabble/gateway/internal/connector"
	"scrabble/gateway/internal/connectsrv"
	"scrabble/gateway/internal/push"
	"scrabble/gateway/internal/ratelimit"
	"scrabble/gateway/internal/session"
	"scrabble/gateway/internal/transcode"
	pkgtel "scrabble/pkg/telemetry"
)
const (
// shutdownTimeout bounds the graceful HTTP shutdown. runServers applies it as
// one deadline shared by every listener's Shutdown call, not per listener.
shutdownTimeout = 10 * time.Second
// telemetryShutdownTimeout bounds the OpenTelemetry flush during process exit.
telemetryShutdownTimeout = 5 * time.Second
// pushReconnectDelay is the pause before re-subscribing to the backend push
// stream, both after a failed subscribe attempt and after an established
// stream ends.
pushReconnectDelay = 2 * time.Second
// gatewayID identifies this gateway instance to the backend push channel.
// NOTE(review): the value is a fixed constant, so every running gateway
// process presents the same ID — confirm the backend tolerates that if more
// than one replica is ever deployed.
gatewayID = "gateway"
// readHeaderTimeout bounds reading one request's headers on the public
// listener (a slowloris guard). Bodies and long-lived streams are governed by
// the h2c settings in connectsrv — Read/WriteTimeout stay unset on purpose,
// they would kill the Subscribe stream (R3).
readHeaderTimeout = 10 * time.Second
// throttleReportInterval is the cadence of the rate-limiter rejection
// summary: the Warn log per throttled key and the report to the backend (R3).
// The same value, in whole seconds, is sent to the backend as the window
// length of each report.
throttleReportInterval = 30 * time.Second
)
// main loads the configuration, builds the structured logger, and runs the
// gateway until SIGINT or SIGTERM is received. Failures that occur before the
// structured logger exists are reported through the standard library logger;
// a failure returned by run is reported through the structured logger at Fatal
// level.
func main() {
	cfg, cfgErr := config.Load()
	if cfgErr != nil {
		log.Fatalf("gateway: load config: %v", cfgErr)
	}

	logger, logErr := newLogger(cfg.LogLevel)
	if logErr != nil {
		log.Fatalf("gateway: build logger: %v", logErr)
	}
	// The Sync error is deliberately discarded: there is nowhere left to
	// report it once the logger itself is being flushed.
	defer func() { _ = logger.Sync() }()

	// sigCtx is cancelled by the first SIGINT/SIGTERM, which starts the
	// graceful shutdown inside run.
	sigCtx, release := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
	defer release()

	runErr := run(sigCtx, cfg, logger)
	if runErr == nil {
		return
	}
	logger.Fatal("gateway: terminated", zap.Error(runErr))
}
// run wires the gateway dependencies and serves the public listener (which also
// fronts the admin console at /_gm) until the context is cancelled.
//
// Start-up aborts on the first dependency that cannot be constructed. The
// returned error is wrapped with the name of the failing step (telemetry,
// backend client, connector client, admin proxy) so the caller's single
// termination log line identifies which one it was; the underlying error stays
// reachable through errors.Is / errors.As. Once the listener is up, the return
// value is whatever runServers reports.
//
// Resources are released by deferred calls in reverse order of acquisition:
// connector, backend client, telemetry, and finally the derived context.
func run(ctx context.Context, cfg config.Config, logger *zap.Logger) error {
	// A derived context lets runServers stop the background goroutines started
	// below even when the listener fails on its own.
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	tel, err := pkgtel.New(ctx, cfg.Telemetry)
	if err != nil {
		return fmt.Errorf("init telemetry: %w", err)
	}
	defer func() {
		// The flush gets its own bounded context instead of ctx, which
		// runServers cancels before it returns.
		shutdownCtx, sc := context.WithTimeout(context.Background(), telemetryShutdownTimeout)
		defer sc()
		if err := tel.Shutdown(shutdownCtx); err != nil {
			logger.Warn("telemetry shutdown", zap.Error(err))
		}
	}()
	// Runtime metrics are optional: a failure is logged and start-up continues.
	if err := tel.StartRuntimeMetrics(); err != nil {
		logger.Warn("telemetry: start runtime metrics", zap.Error(err))
	}

	backend, err := backendclient.New(cfg.BackendHTTPURL, cfg.BackendGRPCAddr, cfg.BackendTimeout)
	if err != nil {
		return fmt.Errorf("create backend client: %w", err)
	}
	defer func() { _ = backend.Close() }()

	sessions := session.NewCache(backend, cfg.SessionTTL, cfg.SessionCacheMax)
	limiter := ratelimit.New()
	tracker := ratelimit.NewTracker()
	hub := push.NewHub(0)

	// validator is assigned only when a connector exists, so it stays a true nil
	// interface (not an interface wrapping a nil *connector.Client) otherwise.
	var conn *connector.Client
	var validator transcode.TelegramValidator
	if cfg.ConnectorAddr != "" {
		conn, err = connector.New(cfg.ConnectorAddr)
		if err != nil {
			return fmt.Errorf("create connector client: %w", err)
		}
		defer func() { _ = conn.Close() }()
		validator = conn
	} else {
		logger.Warn("telegram disabled (GATEWAY_CONNECTOR_ADDR unset)")
	}

	// The admin console (backend /_gm) is fronted on the public listener behind
	// Basic-Auth, enabled when both credentials are set; it is mounted on the edge
	// mux so the Connect h2c handler stays the top-level handler.
	var adminProxy http.Handler
	if cfg.AdminEnabled() {
		adminProxy, err = admin.NewProxy(cfg.BackendHTTPURL, cfg.AdminUser, cfg.AdminPassword, logger)
		if err != nil {
			return fmt.Errorf("create admin proxy: %w", err)
		}
	} else {
		logger.Info("admin console disabled (set GATEWAY_ADMIN_USER and GATEWAY_ADMIN_PASSWORD)")
	}

	registry := transcode.NewRegistry(backend, validator, cfg.DefaultSupportedLanguages...)
	edge := connectsrv.NewServer(connectsrv.Deps{
		Registry:     registry,
		Sessions:     sessions,
		Limiter:      limiter,
		Tracker:      tracker,
		Hub:          hub,
		RateLimit:    cfg.RateLimit,
		Heartbeat:    cfg.PushHeartbeatInterval,
		Logger:       logger,
		AdminProxy:   adminProxy,
		Meter:        tel.MeterProvider().Meter("scrabble/gateway/edge"),
		MaxBodyBytes: cfg.MaxBodyBytes,
	})

	// Bridge the backend push stream into the fan-out hub (and the out-of-app
	// channel via the connector).
	go runPushPump(ctx, backend, hub, conn, logger)
	// Periodically summarise rate-limiter rejections (Warn log + backend report).
	go runThrottleReporter(ctx, tracker, backend, logger)

	public := &http.Server{Addr: cfg.HTTPAddr, Handler: edge.HTTPHandler(), ReadHeaderTimeout: readHeaderTimeout}
	servers := []*namedServer{{name: "public", srv: public}}
	logger.Info("gateway starting",
		zap.String("http_addr", cfg.HTTPAddr),
		zap.String("backend_http", cfg.BackendHTTPURL),
		zap.String("backend_grpc", cfg.BackendGRPCAddr))
	return runServers(ctx, cancel, servers, logger)
}
// namedServer pairs an HTTP server with a label for diagnostics.
type namedServer struct {
// name is the label logged as the "server" field of the listener start-up and
// shutdown messages.
name string
// srv is the HTTP server that runServers starts and later shuts down.
srv *http.Server
}
// runServers starts one goroutine per listener, blocks until either the context
// is cancelled or the first listener stops, then cancels the shared context and
// gracefully shuts every listener down under a single shutdownTimeout deadline.
//
// The return value is the result of the first listener that stopped (nil when
// it stopped with http.ErrServerClosed), or nil when the context was cancelled
// first. Shutdown failures are logged at Warn and never returned.
func runServers(ctx context.Context, cancel context.CancelFunc, servers []*namedServer, logger *zap.Logger) error {
	// Buffered to the number of listeners so no serving goroutine blocks on
	// send after this function has stopped receiving.
	results := make(chan error, len(servers))

	serve := func(ns *namedServer) {
		logger.Info("listener starting", zap.String("server", ns.name), zap.String("addr", ns.srv.Addr))
		serveErr := ns.srv.ListenAndServe()
		if errors.Is(serveErr, http.ErrServerClosed) {
			// An orderly close is not a failure.
			results <- nil
			return
		}
		results <- serveErr
	}
	for _, ns := range servers {
		go serve(ns)
	}

	var firstErr error
	select {
	case firstErr = <-results:
	case <-ctx.Done():
	}

	// Stop everything else that hangs off the shared context before draining
	// the listeners.
	cancel()

	drainCtx, releaseDrain := context.WithTimeout(context.Background(), shutdownTimeout)
	defer releaseDrain()
	for _, ns := range servers {
		shutdownErr := ns.srv.Shutdown(drainCtx)
		if shutdownErr == nil {
			continue
		}
		logger.Warn("listener shutdown", zap.String("server", ns.name), zap.Error(shutdownErr))
	}
	return firstErr
}
// runThrottleReporter wakes up every throttleReportInterval, drains the
// rate-limiter rejection tracker, logs one Warn line per drained entry and
// forwards the whole batch to the backend (which feeds the admin throttled view
// and the high-rate auto-flag). It returns when the context is done.
//
// An empty drain produces neither log lines nor a backend call. A failed
// delivery is logged and dropped — the next window reports fresh data anyway.
func runThrottleReporter(ctx context.Context, tracker *ratelimit.Tracker, backend *backendclient.Client, logger *zap.Logger) {
	// The backend receives the window length in whole seconds.
	windowSeconds := int(throttleReportInterval.Seconds())

	tick := time.NewTicker(throttleReportInterval)
	defer tick.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-tick.C:
			rejections := tracker.Drain()
			if len(rejections) == 0 {
				continue
			}
			for _, entry := range rejections {
				logger.Warn("rate limited",
					zap.String("class", entry.Class),
					zap.String("key", entry.Key),
					zap.Int("rejected", entry.Rejected),
					zap.Duration("window", throttleReportInterval))
			}
			reportErr := backend.ReportRateLimited(ctx, windowSeconds, rejections)
			if reportErr != nil {
				logger.Warn("rate-limit report failed", zap.Error(reportErr))
			}
		}
	}
}
// runPushPump maintains the backend push subscription for as long as the
// context is live. Each round subscribes, publishes every received event to the
// hub, and — whether the subscribe attempt failed or the stream ended — pauses
// for pushReconnectDelay before trying again. It returns as soon as the context
// is cancelled, including during that pause.
//
// For the out-of-app push kinds it also hands events whose recipient has no
// live in-app stream to the platform connector; a nil connector disables that
// channel.
func runPushPump(ctx context.Context, backend *backendclient.Client, hub *push.Hub, conn *connector.Client, logger *zap.Logger) {
	for ctx.Err() == nil {
		stream, subErr := backend.SubscribePush(ctx, gatewayID)
		if subErr != nil {
			logger.Warn("push subscribe failed", zap.Error(subErr))
		} else {
			for {
				ev, recvErr := stream.Recv()
				if recvErr != nil {
					// A receive error caused by our own shutdown is expected
					// and not worth a warning.
					if ctx.Err() == nil {
						logger.Warn("push stream ended", zap.Error(recvErr))
					}
					break
				}

				userID, kind, payload := ev.GetUserId(), ev.GetKind(), ev.GetPayload()
				hub.Publish(push.Event{
					UserID:  userID,
					Kind:    kind,
					Payload: payload,
					EventID: ev.GetEventId(),
				})

				// Out-of-app fallback applies only when a connector exists, the
				// kind is eligible, and the recipient has no live in-app stream.
				if conn == nil || !connector.OutOfAppKind(kind) || hub.HasSubscribers(userID) {
					continue
				}
				// Delivered in its own goroutine so a slow connector never
				// stalls the in-app firehose.
				go deliverOutOfApp(ctx, backend, conn, userID, kind, payload, ev.GetLanguage(), logger)
			}
		}

		// Shared back-off for both the failed-subscribe and ended-stream paths.
		if !sleep(ctx, pushReconnectDelay) {
			return
		}
	}
}
// deliverOutOfApp resolves the recipient's push target and, when they have a
// Telegram identity and have not confined notifications to the app, asks the
// connector to deliver the event. It is best-effort: every failure is logged and
// dropped (the in-app stream remains the primary channel).
//
// userID is the recipient, kind and payload describe the event, and gameLang is
// the language carried by the event itself (empty when the event has none).
// Both failure logs carry the user_id field so a failed lookup and a failed
// notify for the same recipient can be correlated.
func deliverOutOfApp(ctx context.Context, backend *backendclient.Client, conn *connector.Client, userID, kind string, payload []byte, gameLang string, logger *zap.Logger) {
	target, err := backend.PushTarget(ctx, userID)
	if err != nil {
		logger.Warn("push target lookup failed", zap.String("user_id", userID), zap.Error(err))
		return
	}
	// The connector package decides eligibility from the external identity and
	// the in-app-only preference.
	if !connector.DeliverToTarget(target.ExternalID, target.NotificationsInAppOnly) {
		return
	}
	// A game event carries its own language, so the push comes from the game's bot rather than
	// the recipient's last-login bot (Stage 17); other events fall back to the service language.
	lang := target.Language
	if gameLang != "" {
		lang = gameLang
	}
	if _, err := conn.Notify(ctx, target.ExternalID, kind, payload, lang); err != nil {
		logger.Warn("out-of-app notify failed",
			zap.String("user_id", userID),
			zap.String("kind", kind),
			zap.Error(err))
	}
}
// sleep blocks for d unless ctx is cancelled first. It returns true when the
// full duration elapsed and false when cancellation cut the wait short.
func sleep(ctx context.Context, d time.Duration) bool {
	timer := time.NewTimer(d)
	// Release the timer promptly when cancellation wins the race.
	defer timer.Stop()

	elapsed := false
	select {
	case <-timer.C:
		elapsed = true
	case <-ctx.Done():
	}
	return elapsed
}
// newLogger builds a logger from zap's production configuration with its level
// set from the textual level name. It returns a nil logger and the parse error
// when level is not a name zap recognises, or the build error when the
// configuration cannot be turned into a logger.
func newLogger(level string) (*zap.Logger, error) {
	parsed, err := zap.ParseAtomicLevel(level)
	if err != nil {
		return nil, err
	}

	prodCfg := zap.NewProductionConfig()
	prodCfg.Level = parsed
	return prodCfg.Build()
}