From 4a07d48a7bf126a28473203bb618194f253d7142 Mon Sep 17 00:00:00 2001 From: Ilia Denisov Date: Fri, 5 Jun 2026 17:34:33 +0200 Subject: [PATCH 1/2] Fix Grafana dashboards mount; keep connector OTLP (AWG_CONF must omit DNS=) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - deploy/docker-compose.yml: mount the provisioned dashboards at /etc/grafana/dashboards, not /var/lib/grafana/dashboards — the grafana-data volume mounts over the latter and shadows the nested bind, so the provider logged "readdirent /var/lib/grafana/dashboards: no such file or directory". dashboards.yaml provider path updated to match. - Connector telemetry stays OTLP. The VPN sidecar's netns reaches the collector's internal IP fine (connected route, off-tunnel), but the sidecar's DNS hijacks name resolution: AWG_CONF must NOT carry a DNS= directive, else otelcol won't resolve ("produced zero addresses"). Without DNS= the netns uses Docker's resolver (resolves both otelcol and api.telegram.org). Documented in deploy/README.md (AWG_CONF row + wiring note), ARCHITECTURE §13, compose comment. --- deploy/README.md | 14 +++++++++----- deploy/docker-compose.yml | 11 ++++++++++- .../provisioning/dashboards/dashboards.yaml | 2 +- docs/ARCHITECTURE.md | 10 +++++++--- 4 files changed, 27 insertions(+), 10 deletions(-) diff --git a/deploy/README.md b/deploy/README.md index 8797966..0b28545 100644 --- a/deploy/README.md +++ b/deploy/README.md @@ -49,7 +49,7 @@ feeds the compose's `POSTGRES_PASSWORD`, etc. | Variable | Gitea kind | Purpose | | --- | --- | --- | | `POSTGRES_PASSWORD` | secret | Postgres password (also embedded in `BACKEND_POSTGRES_DSN`). | -| `AWG_CONF` | secret | AmneziaWG config for the VPN sidecar (the connector's only egress). | +| `AWG_CONF` | secret | AmneziaWG config for the VPN sidecar (the connector's only egress). **Must not contain a `DNS=` line** — it hijacks the shared netns's resolv.conf and breaks the connector resolving `otelcol` (telemetry export). Without it, Docker's resolver handles both `otelcol` and `api.telegram.org`. | | `GM_BASICAUTH_HASH` | secret | bcrypt hash gating `/_gm` (admin console + Grafana). Generate with `docker run --rm caddy:2-alpine caddy hash-password --plaintext ''`. | | `TELEGRAM_MINIAPP_URL` | variable | The Mini App URL the connector hands out in deep links / buttons. | @@ -87,10 +87,14 @@ These are hard-wired in `docker-compose.yml` (no `${...}`), pointing the service at each other on the `internal` network — listed here so they are not mistaken for missing config: `BACKEND_POSTGRES_DSN` (→ `postgres`, `search_path=backend`), `GATEWAY_BACKEND_HTTP_URL`/`_GRPC_ADDR` (→ `backend`), -`GATEWAY_CONNECTOR_ADDR`/`BACKEND_CONNECTOR_ADDR` (→ `telegram:9091`), the three -services' `*_OTEL_*_EXPORTER=otlp` + `OTEL_EXPORTER_OTLP_ENDPOINT=http://otelcol:4317` -(`_INSECURE=true`). `GATEWAY_ADMIN_*` is intentionally **unset** — caddy owns `/_gm` -in the contour. +`GATEWAY_CONNECTOR_ADDR`/`BACKEND_CONNECTOR_ADDR` (→ `telegram:9091`), and all three +services' `*_OTEL_*_EXPORTER=otlp` → `OTEL_EXPORTER_OTLP_ENDPOINT=http://otelcol:4317` +(`_INSECURE=true`). The connector shares the VPN sidecar's netns: routing to the +collector's internal IP is fine (connected route), but its `AWG_CONF` must **not** +set a `DNS=` directive — that hijacks resolv.conf and breaks resolving `otelcol` +("produced zero addresses"); without it the netns uses Docker's resolver, which +resolves both `otelcol` and `api.telegram.org`. `GATEWAY_ADMIN_*` is intentionally +**unset** — caddy owns `/_gm` in the contour. ## Host-side setup (outside this repo) diff --git a/deploy/docker-compose.yml b/deploy/docker-compose.yml index 6868737..f36a531 100644 --- a/deploy/docker-compose.yml +++ b/deploy/docker-compose.yml @@ -125,6 +125,12 @@ services: TELEGRAM_API_BASE_URL: ${TELEGRAM_API_BASE_URL:-} TELEGRAM_LOG_LEVEL: ${LOG_LEVEL:-info} TELEGRAM_SERVICE_NAME: scrabble-telegram + # The connector shares the VPN sidecar's netns. Routing to the collector's + # internal IP stays off the tunnel (connected route), but the sidecar's DNS + # hijacks name resolution: AWG_CONF must NOT carry a `DNS=` directive, else + # `otelcol` won't resolve ("produced zero addresses"). Without DNS= the netns + # uses Docker's resolver, which resolves both otelcol and api.telegram.org + # (see deploy/README.md). TELEGRAM_OTEL_TRACES_EXPORTER: otlp TELEGRAM_OTEL_METRICS_EXPORTER: otlp OTEL_EXPORTER_OTLP_ENDPOINT: http://otelcol:4317 @@ -199,7 +205,10 @@ services: GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-admin} volumes: - ./grafana/provisioning:/etc/grafana/provisioning:ro - - ./grafana/dashboards:/var/lib/grafana/dashboards:ro + # Dashboards live under /etc/grafana (NOT /var/lib/grafana, which the + # grafana-data volume mounts over — a nested bind there is shadowed and the + # provider logs "no such file or directory"). + - ./grafana/dashboards:/etc/grafana/dashboards:ro - grafana-data:/var/lib/grafana networks: [internal] diff --git a/deploy/grafana/provisioning/dashboards/dashboards.yaml b/deploy/grafana/provisioning/dashboards/dashboards.yaml index 3772be2..8b92fd6 100644 --- a/deploy/grafana/provisioning/dashboards/dashboards.yaml +++ b/deploy/grafana/provisioning/dashboards/dashboards.yaml @@ -11,5 +11,5 @@ providers: editable: true allowUiUpdates: true options: - path: /var/lib/grafana/dashboards + path: /etc/grafana/dashboards foldersFromFilesStructure: false diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index f172a9f..6f369f0 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -559,9 +559,13 @@ long-polls Telegram and egresses through a VPN sidecar, answering only internal The full contour (`deploy/docker-compose.yml`) runs one `gateway`, one `backend`, one Postgres, the connector (+ its VPN sidecar) and the **observability stack** — OTel Collector (OTLP/gRPC ingest → Prometheus metrics + Tempo traces) and Grafana -with provisioned datasources and dashboards. Inter-service traffic uses a private -`internal` network (project-scoped DNS); only caddy joins the shared external `edge` -network (alias `scrabble`). +with provisioned datasources and dashboards. All three services export OTLP to the +collector; the connector shares the VPN sidecar's netns, so its `AWG_CONF` must not +carry a `DNS=` directive (that would hijack resolv.conf and stop it resolving +`otelcol`; without it the netns uses Docker's resolver, which resolves both +`otelcol` and `api.telegram.org`). Inter-service traffic uses a private `internal` +network (project-scoped DNS); only caddy joins the shared external `edge` network +(alias `scrabble`). Two contours, two secret/variable prefixes (`TEST_` / `PROD_`): - **Test** (Stage 16): auto-deploys on a PR into — or a push to — `development` From 831ecd0cab9efc01d8baff1d622daa3ad15efa17 Mon Sep 17 00:00:00 2001 From: Ilia Denisov Date: Fri, 5 Jun 2026 17:42:21 +0200 Subject: [PATCH 2/2] Fix dangling config binds: seed configs to a stable host path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause of the Grafana "readdirent /etc/grafana/dashboards: no such file or directory": the CI runner checks out into an ephemeral act workspace that is removed after the job, so binding the compose config files straight from it dangles the mounts in the long-lived containers (verified the act source dir is emptied after the job). caddy/otelcol/prometheus/tempo read their config once at startup so they survive, but would break on a restart — same latent bug. Fix (mirrors ../galaxy-game's $HOME/.galaxy-dev/monitoring): the deploy job seeds the config dirs to a stable $HOME/.scrabble-deploy and the compose binds them via ${SCRABBLE_CONFIG_DIR:-.} (local runs keep "."). Documented in the compose header, deploy/README.md and the ci.yaml step. --- .gitea/workflows/ci.yaml | 10 ++++++++++ deploy/README.md | 8 ++++++++ deploy/docker-compose.yml | 18 ++++++++++++------ 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/.gitea/workflows/ci.yaml b/.gitea/workflows/ci.yaml index 75228cb..bfb321b 100644 --- a/.gitea/workflows/ci.yaml +++ b/.gitea/workflows/ci.yaml @@ -188,6 +188,16 @@ jobs: DICT_VERSION: ${{ vars.TEST_DICT_VERSION }} LOG_LEVEL: ${{ vars.TEST_LOG_LEVEL }} run: | + # Seed the config files to a stable host path. The runner checks out into + # an ephemeral act workspace that is removed after the job, which would + # dangle the compose config bind mounts in the long-lived containers + # (e.g. Grafana then logs "no such file or directory"). Bind from a stable + # dir instead (mirrors ../galaxy-game's $HOME/.galaxy-dev/monitoring). + conf="$HOME/.scrabble-deploy" + rm -rf "$conf" + mkdir -p "$conf" + cp -r caddy otelcol prometheus tempo grafana "$conf"/ + export SCRABBLE_CONFIG_DIR="$conf" docker compose --ansi never build --progress plain docker compose --ansi never up -d --remove-orphans diff --git a/deploy/README.md b/deploy/README.md index 0b28545..62ab89d 100644 --- a/deploy/README.md +++ b/deploy/README.md @@ -42,6 +42,14 @@ runs `docker compose up -d --build` on the runner host. Stage 18 (prod) maps the **`PROD_`** set the same way. So a Gitea secret named `TEST_POSTGRES_PASSWORD` feeds the compose's `POSTGRES_PASSWORD`, etc. +The deploy job also **seeds the config files** (`caddy`, `otelcol`, `prometheus`, +`tempo`, `grafana`) to a stable host path (`$HOME/.scrabble-deploy`) and sets +`SCRABBLE_CONFIG_DIR` to it before `up`. The runner's checkout is an ephemeral act +workspace that is removed after the job — binding config straight from it would +dangle the mounts in the long-lived containers (Grafana would log +`no such file or directory`). Locally `SCRABBLE_CONFIG_DIR` defaults to `.`, so the +compose binds from this directory. + ## Required variables `docker compose` aborts immediately if any of these is unset (they use `:?`): diff --git a/deploy/docker-compose.yml b/deploy/docker-compose.yml index f36a531..09cb03d 100644 --- a/deploy/docker-compose.yml +++ b/deploy/docker-compose.yml @@ -5,6 +5,12 @@ # interpolated from Gitea Actions TEST_ secrets/variables exported by the deploy # job (see deploy/.env.example for the unprefixed names). # +# Config bind sources are prefixed with ${SCRABBLE_CONFIG_DIR:-.}: locally they bind +# straight from this directory, but CI seeds them to a stable host path and sets +# SCRABBLE_CONFIG_DIR to it, because the runner's checkout is ephemeral (act removes +# it after the job) and the bind mounts must outlive the job in the long-running +# containers (see .gitea/workflows/ci.yaml + deploy/README.md). +# # Networking (mirrors ../galaxy-game): # - `internal` (scrabble-internal): all inter-service traffic, project-private # DNS so service names never collide on the shared `edge` network. @@ -148,7 +154,7 @@ services: GM_BASICAUTH_USER: ${GM_BASICAUTH_USER:-gm} GM_BASICAUTH_HASH: ${GM_BASICAUTH_HASH:?set GM_BASICAUTH_HASH} volumes: - - ./caddy/Caddyfile:/etc/caddy/Caddyfile:ro + - ${SCRABBLE_CONFIG_DIR:-.}/caddy/Caddyfile:/etc/caddy/Caddyfile:ro - caddy-data:/data networks: internal: {} @@ -162,7 +168,7 @@ services: restart: unless-stopped command: ["--config=/etc/otelcol/config.yaml"] volumes: - - ./otelcol/config.yaml:/etc/otelcol/config.yaml:ro + - ${SCRABBLE_CONFIG_DIR:-.}/otelcol/config.yaml:/etc/otelcol/config.yaml:ro networks: [internal] prometheus: @@ -173,7 +179,7 @@ services: - --config.file=/etc/prometheus/prometheus.yml - --storage.tsdb.retention.time=15d volumes: - - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ${SCRABBLE_CONFIG_DIR:-.}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro - prometheus-data:/prometheus networks: [internal] @@ -183,7 +189,7 @@ services: restart: unless-stopped command: ["-config.file=/etc/tempo/tempo.yaml"] volumes: - - ./tempo/tempo.yaml:/etc/tempo/tempo.yaml:ro + - ${SCRABBLE_CONFIG_DIR:-.}/tempo/tempo.yaml:/etc/tempo/tempo.yaml:ro - tempo-data:/var/tempo networks: [internal] @@ -204,11 +210,11 @@ services: GF_USERS_ALLOW_SIGN_UP: "false" GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-admin} volumes: - - ./grafana/provisioning:/etc/grafana/provisioning:ro + - ${SCRABBLE_CONFIG_DIR:-.}/grafana/provisioning:/etc/grafana/provisioning:ro # Dashboards live under /etc/grafana (NOT /var/lib/grafana, which the # grafana-data volume mounts over — a nested bind there is shadowed and the # provider logs "no such file or directory"). - - ./grafana/dashboards:/etc/grafana/dashboards:ro + - ${SCRABBLE_CONFIG_DIR:-.}/grafana/dashboards:/etc/grafana/dashboards:ro - grafana-data:/var/lib/grafana networks: [internal]