8881214213
Mechanical, behaviour-preserving removal of Stage N / TODO-N / phase (RN) references from comments, doc-comments, service READMEs, the current-state docs (ARCHITECTURE, FUNCTIONAL+_ru, TESTING, UI_DESIGN), config-file comments, and the .fbs/.proto schema comments. PLAN.md / PRERELEASE.md / CLAUDE.md keep the stage history. - Rename the only stage-named identifiers: registerStage8 -> registerSocialOps, registerStage11 -> registerLinkOps (gateway transcode). - Split stage6_test.go: TestEmailLoginFlow -> email_test.go, TestGuestAutoMatchLeavesNoStats (+ provisionGuest) -> account_test.go. - Regenerated proto bindings (push.pb.go, telegram_grpc.pb.go) from the de-staged .proto comments; FB Go/TS bindings unchanged (flatc strips schema comments). go build/vet/gofmt clean across modules; integration typecheck and pnpm check green.
317 lines
10 KiB
Go
317 lines
10 KiB
Go
// Command gateway is the Scrabble platform's only public ingress. It terminates
// the client's Connect-RPC/FlatBuffers traffic over h2c, validates platform /
// email / guest credentials and mints opaque sessions, rate-limits, injects
// X-User-ID when forwarding to the backend over REST, and bridges the backend's
// gRPC push stream to each client's in-app live channel. It also serves the
// backend's admin console at /_gm on the public listener behind HTTP Basic-Auth.
package main
|
|
|
|
import (
	"context"
	"errors"
	"fmt"
	"log"
	"net/http"
	"os/signal"
	"syscall"
	"time"

	"go.uber.org/zap"

	"scrabble/gateway/internal/admin"
	"scrabble/gateway/internal/backendclient"
	"scrabble/gateway/internal/config"
	"scrabble/gateway/internal/connector"
	"scrabble/gateway/internal/connectsrv"
	"scrabble/gateway/internal/push"
	"scrabble/gateway/internal/ratelimit"
	"scrabble/gateway/internal/session"
	"scrabble/gateway/internal/transcode"
	pkgtel "scrabble/pkg/telemetry"
)
|
|
|
|
// Process-wide timing knobs and identifiers for the gateway command.
const (
	// shutdownTimeout is the upper bound on the graceful HTTP shutdown.
	shutdownTimeout = 10 * time.Second

	// telemetryShutdownTimeout is the upper bound on the OpenTelemetry flush
	// performed while the process exits.
	telemetryShutdownTimeout = 5 * time.Second

	// pushReconnectDelay is how long the push pump pauses before it subscribes
	// to the backend push stream again once the previous stream has ended.
	pushReconnectDelay = 2 * time.Second

	// gatewayID is the name this gateway instance presents to the backend push
	// channel.
	gatewayID = "gateway"

	// readHeaderTimeout caps the time spent reading a single request's headers
	// on the public listener (a slowloris guard). Read/WriteTimeout are left
	// unset deliberately: they would kill the Subscribe stream, so bodies and
	// long-lived streams are governed by the h2c settings in connectsrv.
	readHeaderTimeout = 10 * time.Second

	// throttleReportInterval is how often rate-limiter rejections are
	// summarised: one Warn log per throttled key plus the report sent to the
	// backend.
	throttleReportInterval = 30 * time.Second
)
|
|
|
|
func main() {
|
|
cfg, err := config.Load()
|
|
if err != nil {
|
|
log.Fatalf("gateway: load config: %v", err)
|
|
}
|
|
logger, err := newLogger(cfg.LogLevel)
|
|
if err != nil {
|
|
log.Fatalf("gateway: build logger: %v", err)
|
|
}
|
|
defer func() { _ = logger.Sync() }()
|
|
|
|
ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
|
|
defer stop()
|
|
|
|
if err := run(ctx, cfg, logger); err != nil {
|
|
logger.Fatal("gateway: terminated", zap.Error(err))
|
|
}
|
|
}
|
|
|
|
// run wires the gateway dependencies and serves the public listener (which also
|
|
// fronts the admin console at /_gm) until the context is cancelled.
|
|
func run(ctx context.Context, cfg config.Config, logger *zap.Logger) error {
|
|
ctx, cancel := context.WithCancel(ctx)
|
|
defer cancel()
|
|
|
|
tel, err := pkgtel.New(ctx, cfg.Telemetry)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer func() {
|
|
shutdownCtx, sc := context.WithTimeout(context.Background(), telemetryShutdownTimeout)
|
|
defer sc()
|
|
if err := tel.Shutdown(shutdownCtx); err != nil {
|
|
logger.Warn("telemetry shutdown", zap.Error(err))
|
|
}
|
|
}()
|
|
if err := tel.StartRuntimeMetrics(); err != nil {
|
|
logger.Warn("telemetry: start runtime metrics", zap.Error(err))
|
|
}
|
|
|
|
backend, err := backendclient.New(cfg.BackendHTTPURL, cfg.BackendGRPCAddr, cfg.BackendTimeout)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer func() { _ = backend.Close() }()
|
|
|
|
sessions := session.NewCache(backend, cfg.SessionTTL, cfg.SessionCacheMax)
|
|
limiter := ratelimit.New()
|
|
tracker := ratelimit.NewTracker()
|
|
hub := push.NewHub(0)
|
|
|
|
var conn *connector.Client
|
|
var validator transcode.TelegramValidator
|
|
if cfg.ConnectorAddr != "" {
|
|
conn, err = connector.New(cfg.ConnectorAddr)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer func() { _ = conn.Close() }()
|
|
validator = conn
|
|
} else {
|
|
logger.Warn("telegram disabled (GATEWAY_CONNECTOR_ADDR unset)")
|
|
}
|
|
|
|
// The admin console (backend /_gm) is fronted on the public listener behind
|
|
// Basic-Auth, enabled when both credentials are set; it is mounted on the edge
|
|
// mux so the Connect h2c handler stays the top-level handler.
|
|
var adminProxy http.Handler
|
|
if cfg.AdminEnabled() {
|
|
adminProxy, err = admin.NewProxy(cfg.BackendHTTPURL, cfg.AdminUser, cfg.AdminPassword, logger)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
} else {
|
|
logger.Info("admin console disabled (set GATEWAY_ADMIN_USER and GATEWAY_ADMIN_PASSWORD)")
|
|
}
|
|
|
|
registry := transcode.NewRegistry(backend, validator, cfg.DefaultSupportedLanguages...)
|
|
edge := connectsrv.NewServer(connectsrv.Deps{
|
|
Registry: registry,
|
|
Sessions: sessions,
|
|
Limiter: limiter,
|
|
Tracker: tracker,
|
|
Hub: hub,
|
|
RateLimit: cfg.RateLimit,
|
|
Heartbeat: cfg.PushHeartbeatInterval,
|
|
Logger: logger,
|
|
AdminProxy: adminProxy,
|
|
Meter: tel.MeterProvider().Meter("scrabble/gateway/edge"),
|
|
MaxBodyBytes: cfg.MaxBodyBytes,
|
|
})
|
|
|
|
// Bridge the backend push stream into the fan-out hub (and the out-of-app
|
|
// channel via the connector).
|
|
go runPushPump(ctx, backend, hub, conn, logger)
|
|
// Periodically summarise rate-limiter rejections (Warn log + backend report).
|
|
go runThrottleReporter(ctx, tracker, backend, logger)
|
|
|
|
public := &http.Server{Addr: cfg.HTTPAddr, Handler: edge.HTTPHandler(), ReadHeaderTimeout: readHeaderTimeout}
|
|
servers := []*namedServer{{name: "public", srv: public}}
|
|
|
|
logger.Info("gateway starting",
|
|
zap.String("http_addr", cfg.HTTPAddr),
|
|
zap.String("backend_http", cfg.BackendHTTPURL),
|
|
zap.String("backend_grpc", cfg.BackendGRPCAddr))
|
|
return runServers(ctx, cancel, servers, logger)
|
|
}
|
|
|
|
// namedServer couples an HTTP server with a human-readable label so log lines
// can say which listener they refer to.
type namedServer struct {
	// name is the label attached to diagnostics for this listener.
	name string
	// srv is the HTTP server being run and shut down.
	srv *http.Server
}
|
|
|
|
// runServers serves every listener and shuts them all down when the first one
|
|
// stops or the context is cancelled.
|
|
func runServers(ctx context.Context, cancel context.CancelFunc, servers []*namedServer, logger *zap.Logger) error {
|
|
errc := make(chan error, len(servers))
|
|
for _, s := range servers {
|
|
go func(s *namedServer) {
|
|
logger.Info("listener starting", zap.String("server", s.name), zap.String("addr", s.srv.Addr))
|
|
err := s.srv.ListenAndServe()
|
|
if errors.Is(err, http.ErrServerClosed) {
|
|
err = nil
|
|
}
|
|
errc <- err
|
|
}(s)
|
|
}
|
|
|
|
var first error
|
|
select {
|
|
case <-ctx.Done():
|
|
case first = <-errc:
|
|
}
|
|
cancel()
|
|
|
|
shutdownCtx, sc := context.WithTimeout(context.Background(), shutdownTimeout)
|
|
defer sc()
|
|
for _, s := range servers {
|
|
if err := s.srv.Shutdown(shutdownCtx); err != nil {
|
|
logger.Warn("listener shutdown", zap.String("server", s.name), zap.Error(err))
|
|
}
|
|
}
|
|
return first
|
|
}
|
|
|
|
// runThrottleReporter drains the rate-limiter rejection tracker on a fixed
|
|
// cadence, emits one Warn summary per throttled key and forwards the report to
|
|
// the backend (which feeds the admin throttled view and the high-rate
|
|
// auto-flag), until the context is done. A failed delivery is logged and
|
|
// dropped — the next window reports fresh data anyway.
|
|
func runThrottleReporter(ctx context.Context, tracker *ratelimit.Tracker, backend *backendclient.Client, logger *zap.Logger) {
|
|
ticker := time.NewTicker(throttleReportInterval)
|
|
defer ticker.Stop()
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-ticker.C:
|
|
}
|
|
entries := tracker.Drain()
|
|
if len(entries) == 0 {
|
|
continue
|
|
}
|
|
for _, e := range entries {
|
|
logger.Warn("rate limited",
|
|
zap.String("class", e.Class),
|
|
zap.String("key", e.Key),
|
|
zap.Int("rejected", e.Rejected),
|
|
zap.Duration("window", throttleReportInterval))
|
|
}
|
|
if err := backend.ReportRateLimited(ctx, int(throttleReportInterval.Seconds()), entries); err != nil {
|
|
logger.Warn("rate-limit report failed", zap.Error(err))
|
|
}
|
|
}
|
|
}
|
|
|
|
// runPushPump keeps a backend push subscription open, forwarding every event to
|
|
// the hub and re-subscribing after the stream ends, until the context is done. For
|
|
// the out-of-app push kinds it also routes events whose recipient has no live
|
|
// in-app stream to the platform connector (a nil connector disables that channel).
|
|
func runPushPump(ctx context.Context, backend *backendclient.Client, hub *push.Hub, conn *connector.Client, logger *zap.Logger) {
|
|
for ctx.Err() == nil {
|
|
stream, err := backend.SubscribePush(ctx, gatewayID)
|
|
if err != nil {
|
|
logger.Warn("push subscribe failed", zap.Error(err))
|
|
if !sleep(ctx, pushReconnectDelay) {
|
|
return
|
|
}
|
|
continue
|
|
}
|
|
for {
|
|
ev, err := stream.Recv()
|
|
if err != nil {
|
|
if ctx.Err() == nil {
|
|
logger.Warn("push stream ended", zap.Error(err))
|
|
}
|
|
break
|
|
}
|
|
hub.Publish(push.Event{
|
|
UserID: ev.GetUserId(),
|
|
Kind: ev.GetKind(),
|
|
Payload: ev.GetPayload(),
|
|
EventID: ev.GetEventId(),
|
|
})
|
|
// Out-of-app fallback: when the recipient has no live in-app stream,
|
|
// deliver the event over the platform push channel. Done in a goroutine
|
|
// so a slow connector never stalls the in-app firehose.
|
|
if conn != nil && connector.OutOfAppKind(ev.GetKind()) && !hub.HasSubscribers(ev.GetUserId()) {
|
|
go deliverOutOfApp(ctx, backend, conn, ev.GetUserId(), ev.GetKind(), ev.GetPayload(), ev.GetLanguage(), logger)
|
|
}
|
|
}
|
|
if !sleep(ctx, pushReconnectDelay) {
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// deliverOutOfApp resolves the recipient's push target and, when they have a
|
|
// Telegram identity and have not confined notifications to the app, asks the
|
|
// connector to deliver the event. It is best-effort: every failure is logged and
|
|
// dropped (the in-app stream remains the primary channel).
|
|
func deliverOutOfApp(ctx context.Context, backend *backendclient.Client, conn *connector.Client, userID, kind string, payload []byte, gameLang string, logger *zap.Logger) {
|
|
target, err := backend.PushTarget(ctx, userID)
|
|
if err != nil {
|
|
logger.Warn("push target lookup failed", zap.String("user_id", userID), zap.Error(err))
|
|
return
|
|
}
|
|
if !connector.DeliverToTarget(target.ExternalID, target.NotificationsInAppOnly) {
|
|
return
|
|
}
|
|
// A game event carries its own language, so the push comes from the game's bot rather than
|
|
// the recipient's last-login bot; other events fall back to the service language.
|
|
lang := target.Language
|
|
if gameLang != "" {
|
|
lang = gameLang
|
|
}
|
|
if _, err := conn.Notify(ctx, target.ExternalID, kind, payload, lang); err != nil {
|
|
logger.Warn("out-of-app notify failed", zap.String("kind", kind), zap.Error(err))
|
|
}
|
|
}
|
|
|
|
// sleep blocks until d has elapsed or ctx is cancelled, whichever comes first.
// It returns true when the full duration elapsed and false when the context
// ended the wait early.
func sleep(ctx context.Context, d time.Duration) bool {
	timer := time.NewTimer(d)
	defer timer.Stop()

	elapsed := false
	select {
	case <-timer.C:
		elapsed = true
	case <-ctx.Done():
	}
	return elapsed
}
|
|
|
|
// newLogger builds a production JSON logger at the given level.
|
|
func newLogger(level string) (*zap.Logger, error) {
|
|
var lvl zap.AtomicLevel
|
|
if err := lvl.UnmarshalText([]byte(level)); err != nil {
|
|
return nil, err
|
|
}
|
|
cfg := zap.NewProductionConfig()
|
|
cfg.Level = lvl
|
|
return cfg.Build()
|
|
}
|