R7: contour docker_stats observability + container limits/GOMAXPROCS

Observability: replace cAdvisor (which resolves only the root cgroup on the contour host — separate-XFS /var/lib/docker) with the otelcol docker_stats receiver, which reads per-container CPU/memory/network straight from the Docker API and works the same in prod. The collector joins the host docker group (DOCKER_GID, default 989) and mounts the socket read-only; its metrics flow out through the existing prometheus exporter, so the cAdvisor scrape job and the privileged cAdvisor service are removed. The Resources dashboard panels are retargeted to the docker_stats metric names (container_name label; container.cpu.utilization/100 == cores). Container limits: apply deploy.resources.limits (honoured by Compose v2) across the contour and pin GOMAXPROCS to the CPU limit on the Go services so the runtime matches the cgroup quota. Starting values are generous over the R2 peak (~1 core / <=100 MiB per app service) to avoid skewing or OOM-killing the measurement run; they are tightened to the agreed prod sizing after the final stress run (R7 Round 2). The privileged VPN sidecar is left unconstrained.
2026-06-10 18:53:19 +02:00
parent 04263a17ca
commit c16f27475f
4 changed files with 94 additions and 35 deletions
@@ -39,6 +39,13 @@ services:
      retries: 30
    volumes:
      - postgres-data:/var/lib/postgresql/data
+    # R7 starting limits: 512M leaves headroom over the default 128 MB shared_buffers +
+    # per-connection memory (R2 peaked at 28 backends / 69 MiB RSS); tighten after the run.
+    deploy:
+      resources:
+        limits:
+          cpus: "2.0"
+          memory: 512M
    networks: [internal]

  backend:
@@ -65,8 +72,19 @@ services:
      BACKEND_OTEL_METRICS_EXPORTER: otlp
      OTEL_EXPORTER_OTLP_ENDPOINT: http://otelcol:4317
      OTEL_EXPORTER_OTLP_INSECURE: "true"
+      # GOMAXPROCS matches the CPU limit below so the Go scheduler aligns with the
+      # cgroup quota (before Go 1.25 the runtime defaults to all of the host's cores).
+      GOMAXPROCS: "2"
    # No container healthcheck: the distroless image has no shell/wget. Readiness
    # is covered by the CI post-deploy probe (GET / through caddy).
+    # R7 starting limits (generous over the R2 ~1-core / <=100 MiB peak); tightened to
+    # the agreed prod values after the final stress run. deploy.resources.limits is
+    # honoured by `docker compose up` (Compose v2), not only by swarm.
+    deploy:
+      resources:
+        limits:
+          cpus: "2.0"
+          memory: 512M
    networks: [internal]

  gateway:
@@ -97,8 +115,17 @@ services:
      GATEWAY_OTEL_METRICS_EXPORTER: otlp
      OTEL_EXPORTER_OTLP_ENDPOINT: http://otelcol:4317
      OTEL_EXPORTER_OTLP_INSECURE: "true"
+      # GOMAXPROCS matches the CPU limit below (see backend).
+      GOMAXPROCS: "2"
      # GATEWAY_ADMIN_* intentionally unset: in the deployed contour the front
      # caddy owns the /_gm Basic-Auth and routes /_gm to the backend directly.
+    # R7 starting limits (generous over the R2 ~1-core / <=100 MiB peak); tighten after
+    # the final stress run.
+    deploy:
+      resources:
+        limits:
+          cpus: "2.0"
+          memory: 512M
    networks: [internal]

  # --- Landing (static) -------------------------------------------------------
@@ -121,6 +148,10 @@ services:
        VITE_GATEWAY_URL: ${VITE_GATEWAY_URL:-}
        VITE_APP_VERSION: ${APP_VERSION:-dev}
    restart: unless-stopped
+    deploy:
+      resources:
+        limits:
+          memory: 128M
    networks: [internal]

  # --- Telegram connector (egress via the VPN sidecar) -----------------------
@@ -167,6 +198,13 @@ services:
      TELEGRAM_OTEL_METRICS_EXPORTER: otlp
      OTEL_EXPORTER_OTLP_ENDPOINT: http://otelcol:4317
      OTEL_EXPORTER_OTLP_INSECURE: "true"
+      # The connector is light (the stress run does not drive Telegram); one P suffices.
+      GOMAXPROCS: "1"
+    deploy:
+      resources:
+        limits:
+          cpus: "1.0"
+          memory: 256M

  # --- Edge reverse proxy (single /_gm Basic-Auth; SPA + Connect -> gateway;
  #     the catch-all incl. the landing -> the static landing container) -------
@@ -183,6 +221,10 @@ services:
    volumes:
      - ${SCRABBLE_CONFIG_DIR:-.}/caddy/Caddyfile:/etc/caddy/Caddyfile:ro
      - caddy-data:/data
+    deploy:
+      resources:
+        limits:
+          memory: 128M
    networks:
      internal: {}
      edge:
@@ -194,8 +236,19 @@ services:
    image: otel/opentelemetry-collector-contrib:0.119.0
    restart: unless-stopped
    command: ["--config=/etc/otelcol/config.yaml"]
+    # The docker_stats receiver reads per-container metrics from the Docker API, so the
+    # collector (image UID 10001) joins the host's docker group to reach the socket;
+    # DOCKER_GID defaults to the contour host's 989 — set it for other hosts (prod). The
+    # socket is mounted :ro, which guards only the socket file, not Docker API calls.
+    # Replaces cAdvisor: its per-container metrics are empty here (separate-XFS /var/lib/docker).
+    group_add: ["${DOCKER_GID:-989}"]
    volumes:
      - ${SCRABBLE_CONFIG_DIR:-.}/otelcol/config.yaml:/etc/otelcol/config.yaml:ro
+      - /var/run/docker.sock:/var/run/docker.sock:ro
+    deploy:
+      resources:
+        limits:
+          memory: 512M
    networks: [internal]

  prometheus:
@@ -208,6 +261,10 @@ services:
    volumes:
      - ${SCRABBLE_CONFIG_DIR:-.}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus-data:/prometheus
+    deploy:
+      resources:
+        limits:
+          memory: 512M
    networks: [internal]

  tempo:
@@ -218,6 +275,11 @@ services:
    volumes:
      - ${SCRABBLE_CONFIG_DIR:-.}/tempo/tempo.yaml:/etc/tempo/tempo.yaml:ro
      - tempo-data:/var/tempo
+    # tempo peaked at ~446 MiB in R2; 1G leaves headroom for the final run.
+    deploy:
+      resources:
+        limits:
+          memory: 1G
    networks: [internal]

  grafana:
@@ -247,26 +309,10 @@ services:
      # provider logs "no such file or directory").
      - ${SCRABBLE_CONFIG_DIR:-.}/grafana/dashboards:/etc/grafana/dashboards:ro
      - grafana-data:/var/lib/grafana
-    networks: [internal]
-
-  # cAdvisor exports per-container resource metrics (CPU / memory / network / disk)
-  # for the R2/R7 stress runs' resource baseline. Prometheus scrapes it at :8080
-  # over the internal network. It needs read access to the host's cgroup and
-  # container state; --docker_only trims non-container cgroup series.
-  cadvisor:
-    container_name: scrabble-cadvisor
-    image: gcr.io/cadvisor/cadvisor:v0.49.1
-    restart: unless-stopped
-    privileged: true
-    command: ["--docker_only=true", "--housekeeping_interval=15s"]
-    devices:
-      - /dev/kmsg
-    volumes:
-      - /:/rootfs:ro
-      - /var/run:/var/run:ro
-      - /sys:/sys:ro
-      - /var/lib/docker/:/var/lib/docker:ro
-      - /dev/disk/:/dev/disk:ro
+    deploy:
+      resources:
+        limits:
+          memory: 512M
    networks: [internal]

  # postgres_exporter exports Postgres server metrics (connections, cache hit ratio,
@@ -279,6 +325,10 @@ services:
    depends_on: [postgres]
    environment:
      DATA_SOURCE_NAME: postgresql://${POSTGRES_USER:-scrabble}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-scrabble}?sslmode=disable
+    deploy:
+      resources:
+        limits:
+          memory: 128M
    networks: [internal]

 networks:
@@ -4,7 +4,7 @@
  "tags": ["scrabble"],
  "timezone": "",
  "schemaVersion": 39,
-  "version": 1,
+  "version": 2,
  "refresh": "30s",
  "time": { "from": "now-1h", "to": "now" },
  "panels": [
@@ -43,30 +43,30 @@
    {
      "type": "timeseries",
      "title": "Container CPU (cores) by container",
-      "description": "cAdvisor container_cpu_usage_seconds_total rate, per scrabble-* container (the load harness appears when run as --name scrabble-loadtest). Verify the metric name against live Prometheus if empty.",
+      "description": "docker_stats container.cpu.utilization (a gauge where 100 == one core) / 100, per scrabble-* container; the load harness appears when run as --name scrabble-loadtest. Verify the scaling against live Prometheus.",
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 5 },
      "datasource": { "type": "prometheus", "uid": "prometheus" },
-      "targets": [{ "refId": "A", "expr": "sum(rate(container_cpu_usage_seconds_total{name=~\"scrabble-.+\"}[5m])) by (name)", "legendFormat": "{{name}}" }]
+      "targets": [{ "refId": "A", "expr": "max(container_cpu_utilization{container_name=~\"scrabble-.+\"}) by (container_name) / 100", "legendFormat": "{{container_name}}" }]
    },
    {
      "type": "timeseries",
-      "title": "Container memory (working set) by container",
-      "description": "cAdvisor container_memory_working_set_bytes, per scrabble-* container.",
+      "title": "Container memory (usage) by container",
+      "description": "docker_stats container.memory.usage.total bytes, per scrabble-* container.",
      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 5 },
      "fieldConfig": { "defaults": { "unit": "bytes" }, "overrides": [] },
      "datasource": { "type": "prometheus", "uid": "prometheus" },
-      "targets": [{ "refId": "A", "expr": "max(container_memory_working_set_bytes{name=~\"scrabble-.+\"}) by (name)", "legendFormat": "{{name}}" }]
+      "targets": [{ "refId": "A", "expr": "max(container_memory_usage_total{container_name=~\"scrabble-.+\"}) by (container_name)", "legendFormat": "{{container_name}}" }]
    },
    {
      "type": "timeseries",
      "title": "Container network I/O by container",
-      "description": "cAdvisor receive (+) and transmit (-) byte rates per scrabble-* container.",
+      "description": "docker_stats receive (+) and transmit (-) byte rates per scrabble-* container (summed across interfaces).",
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 13 },
      "fieldConfig": { "defaults": { "unit": "Bps" }, "overrides": [] },
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "targets": [
-        { "refId": "A", "expr": "sum(rate(container_network_receive_bytes_total{name=~\"scrabble-.+\"}[5m])) by (name)", "legendFormat": "rx {{name}}" },
-        { "refId": "B", "expr": "-sum(rate(container_network_transmit_bytes_total{name=~\"scrabble-.+\"}[5m])) by (name)", "legendFormat": "tx {{name}}" }
+        { "refId": "A", "expr": "sum(rate(container_network_io_usage_rx_bytes{container_name=~\"scrabble-.+\"}[5m])) by (container_name)", "legendFormat": "rx {{container_name}}" },
+        { "refId": "B", "expr": "-sum(rate(container_network_io_usage_tx_bytes{container_name=~\"scrabble-.+\"}[5m])) by (container_name)", "legendFormat": "tx {{container_name}}" }
      ]
    },
    {
@@ -6,6 +6,18 @@ receivers:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
+  # Per-container resource metrics (CPU / memory / network) read straight from the
+  # Docker API. This replaces cAdvisor, which on the contour host resolves only the
+  # root cgroup (its /var/lib/docker is a separate XFS mount), and works the same in
+  # prod. The collector reaches the socket via group_add in docker-compose.yml.
+  # collection_interval matches Prometheus' 30s scrape. container.cpu.utilization is a
+  # gauge where 100 == one core (it mirrors `docker stats` CPU%).
+  docker_stats:
+    endpoint: unix:///var/run/docker.sock
+    collection_interval: 30s
+    metrics:
+      container.cpu.utilization:
+        enabled: true

 processors:
  batch: {}
@@ -33,6 +45,6 @@ service:
      processors: [batch]
      exporters: [otlp/tempo]
    metrics:
-      receivers: [otlp]
+      receivers: [otlp, docker_stats]
      processors: [batch]
      exporters: [prometheus]
@@ -6,17 +6,14 @@ global:
  evaluation_interval: 30s

 scrape_configs:
+  # otelcol exposes both the services' OTLP metrics and the docker_stats receiver's
+  # per-container resource metrics (CPU/memory/network) on one endpoint.
  - job_name: otelcol
    static_configs:
      - targets: ["otelcol:9464"]
  - job_name: prometheus
    static_configs:
      - targets: ["localhost:9090"]
-  # Container resource metrics (CPU/memory/network/disk) for every contour
-  # container, for the R2/R7 stress runs' resource baseline.
-  - job_name: cadvisor
-    static_configs:
-      - targets: ["cadvisor:8080"]
  # Postgres server metrics (connections, cache hit ratio, transactions, db size).
  - job_name: postgres_exporter
    static_configs: