feat(dev-deploy): full observability stack (Prometheus/Grafana/Loki/Tempo)

Stand up a production-mirror monitoring stack in the long-lived dev
contour, all on galaxy-dev-internal with no host ports (reached only via
the in-repo galaxy-dev-caddy):

- Prometheus scrapes backend:9100, gateway:9191, node-exporter and
  cadvisor (30s interval, 15d retention); Loki (7d) + promtail (Docker
  service discovery by the galaxy.stack=dev-deploy label) for logs;
  Tempo (3d) for traces.
- Backend and gateway now export OTLP traces to Tempo over plaintext
  gRPC on the internal network (OTEL_EXPORTER_OTLP_INSECURE).
- Grafana provisioned as code (Prometheus/Loki/Tempo datasources plus a
  starter dashboard), served under /grafana/ via Caddy sub-path mode;
  admin password from the GALAXY_DEV_GRAFANA_ADMIN_PASSWORD secret.
- Expose the Mailpit capture UI under /mailpit/ (Caddy basic-auth +
  Mailpit --webroot=/mailpit) so every captured message is readable
  whether or not it was relayed.
- dev-deploy.yaml seeds the monitoring config to a stable, reboot-
  surviving host path and injects the Grafana admin secret.

Per-service memory limits keep the footprint within budget. All
collector config lives under tools/dev-deploy/monitoring/ for dev/prod
parity.
This commit is contained in:
Ilia Denisov
2026-05-31 23:39:06 +02:00
parent 7fb6a63c2b
commit 84a0ccb23f
8 changed files with 385 additions and 1 deletions
+174 -1
View File
@@ -74,6 +74,9 @@ services:
command:
- "--smtp-relay-config=/etc/mailpit/relay.conf"
- "--smtp-relay-matching=${GALAXY_DEV_MAIL_RELAY_MATCH:-nobody@invalid.example}"
# Serve the capture UI under /mailpit so the host Caddy can expose it
# at https://galaxy.lan/mailpit/ (behind basic-auth); SMTP is unaffected.
- "--webroot=/mailpit"
labels:
galaxy.stack: dev-deploy
networks:
@@ -118,7 +121,13 @@ services:
BACKEND_NOTIFICATION_ADMIN_EMAIL: admin@galaxy.lan
BACKEND_MAIL_WORKER_INTERVAL: 500ms
BACKEND_NOTIFICATION_WORKER_INTERVAL: 500ms
BACKEND_OTEL_TRACES_EXPORTER: none
BACKEND_OTEL_TRACES_EXPORTER: otlp
BACKEND_OTEL_PROTOCOL: grpc
BACKEND_OTEL_ENDPOINT: "galaxy-tempo:4317"
# Tempo's OTLP receiver is plaintext on the internal network; the
# backend's gRPC exporter defaults to TLS, so disable it via the
# standard SDK env (applied on top of WithEndpoint).
OTEL_EXPORTER_OTLP_INSECURE: "true"
# Prometheus metrics are enabled in dev so the `/metrics` scrape
# endpoint is live for the galaxy-prometheus service, which scrapes
# it over the internal network. The listener stays internal
@@ -196,6 +205,12 @@ services:
# the internal network — scraped by galaxy-prometheus, not
# mapped to the host.
GATEWAY_ADMIN_HTTP_ADDR: ":9191"
# Traces -> Tempo over OTLP gRPC (plaintext on the internal net).
OTEL_SERVICE_NAME: galaxy-gateway
OTEL_TRACES_EXPORTER: otlp
OTEL_EXPORTER_OTLP_PROTOCOL: grpc
OTEL_EXPORTER_OTLP_ENDPOINT: "http://galaxy-tempo:4317"
OTEL_EXPORTER_OTLP_INSECURE: "true"
GATEWAY_BACKEND_HTTP_URL: "http://galaxy-backend:8080"
GATEWAY_BACKEND_GRPC_PUSH_URL: "galaxy-backend:8081"
GATEWAY_BACKEND_GATEWAY_CLIENT_ID: dev-gateway-1
@@ -264,6 +279,156 @@ services:
- galaxy-internal
- edge
# galaxy-prometheus: Prometheus server for the dev-deploy stack.
# Attached only to galaxy-internal; this block declares no `ports:`, so
# nothing is published on the host.
galaxy-prometheus:
image: prom/prometheus:v2.55.1
container_name: galaxy-dev-prometheus
restart: unless-stopped
labels:
galaxy.stack: dev-deploy
command:
- --config.file=/etc/prometheus/prometheus.yml
- --storage.tsdb.path=/prometheus
# Keep 15 days of TSDB data.
- --storage.tsdb.retention.time=15d
# Enables the HTTP lifecycle endpoints (reload / shutdown via HTTP request).
- --web.enable-lifecycle
volumes:
# Scrape config, bind-mounted read-only from the monitoring directory
# (GALAXY_DEV_MONITORING_DIR, defaulting to ./monitoring).
- ${GALAXY_DEV_MONITORING_DIR:-./monitoring}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
# Named volume mounted at the --storage.tsdb.path location above.
- galaxy-dev-prometheus-data:/prometheus
networks:
- galaxy-internal
deploy:
resources:
limits:
# Memory cap for this container.
memory: 384m
# galaxy-loki: Loki log store for the dev-deploy stack. Attached only to
# galaxy-internal; no host ports are declared in this block.
galaxy-loki:
image: grafana/loki:3.3.2
container_name: galaxy-dev-loki
restart: unless-stopped
labels:
galaxy.stack: dev-deploy
# Point Loki at the config file bind-mounted below.
command: ["-config.file=/etc/loki/loki.yml"]
volumes:
# Config, bind-mounted read-only from the monitoring directory
# (GALAXY_DEV_MONITORING_DIR, defaulting to ./monitoring).
- ${GALAXY_DEV_MONITORING_DIR:-./monitoring}/loki/loki.yml:/etc/loki/loki.yml:ro
# Named volume for Loki's data.
# NOTE(review): /loki is presumably the storage path set in loki.yml — confirm.
- galaxy-dev-loki-data:/loki
networks:
- galaxy-internal
deploy:
resources:
limits:
# Memory cap for this container.
memory: 384m
# galaxy-promtail: Promtail log shipper for the dev-deploy stack. Attached
# only to galaxy-internal; no host ports are declared in this block.
galaxy-promtail:
image: grafana/promtail:3.3.2
container_name: galaxy-dev-promtail
restart: unless-stopped
labels:
galaxy.stack: dev-deploy
# Point Promtail at the config file bind-mounted below.
command: ["-config.file=/etc/promtail/promtail.yml"]
volumes:
# Config, bind-mounted read-only from the monitoring directory
# (GALAXY_DEV_MONITORING_DIR, defaulting to ./monitoring).
- ${GALAXY_DEV_MONITORING_DIR:-./monitoring}/promtail/promtail.yml:/etc/promtail/promtail.yml:ro
# Host Docker socket, mounted read-only.
# NOTE(review): presumably used by promtail.yml for Docker service
# discovery of containers labelled galaxy.stack=dev-deploy — confirm.
- /var/run/docker.sock:/var/run/docker.sock:ro
# NOTE(review): no volume is declared for Promtail's positions file, so
# read positions presumably reset when the container is recreated —
# confirm this is acceptable.
networks:
- galaxy-internal
deploy:
resources:
limits:
# Memory cap for this container.
memory: 128m
# galaxy-tempo: Tempo trace store for the dev-deploy stack. The backend and
# gateway services in this file export OTLP traces to galaxy-tempo:4317.
# Attached only to galaxy-internal; no host ports are declared in this block.
galaxy-tempo:
image: grafana/tempo:2.7.1
container_name: galaxy-dev-tempo
restart: unless-stopped
labels:
galaxy.stack: dev-deploy
# Point Tempo at the config file bind-mounted below.
command: ["-config.file=/etc/tempo/tempo.yml"]
volumes:
# Config, bind-mounted read-only from the monitoring directory
# (GALAXY_DEV_MONITORING_DIR, defaulting to ./monitoring).
- ${GALAXY_DEV_MONITORING_DIR:-./monitoring}/tempo/tempo.yml:/etc/tempo/tempo.yml:ro
# Named volume for Tempo's data.
# NOTE(review): /var/tempo is presumably the storage path set in
# tempo.yml — confirm.
- galaxy-dev-tempo-data:/var/tempo
networks:
- galaxy-internal
deploy:
resources:
limits:
# Memory cap for this container.
memory: 384m
# galaxy-node-exporter: Prometheus node-exporter reporting host metrics.
# The host's /proc, /sys and / are mounted read-only and the collectors
# are pointed at those mount points. Attached only to galaxy-internal; no
# host ports are declared in this block.
galaxy-node-exporter:
image: prom/node-exporter:v1.8.2
container_name: galaxy-dev-node-exporter
restart: unless-stopped
labels:
galaxy.stack: dev-deploy
command:
# These paths match the read-only host mounts declared under `volumes:`.
- --path.procfs=/host/proc
- --path.sysfs=/host/sys
- --path.rootfs=/rootfs
# `$$` is Compose's escape for a literal `$`, so the regex the exporter
# receives ends in `($|/)`.
- --collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
# Share the host PID namespace with this container.
pid: host
networks:
- galaxy-internal
deploy:
resources:
limits:
# Memory cap for this container.
memory: 64m
# galaxy-cadvisor: cAdvisor reporting per-container resource metrics.
# Runs privileged with read-only views of host paths. Attached only to
# galaxy-internal; no host ports are declared in this block.
galaxy-cadvisor:
image: gcr.io/cadvisor/cadvisor:v0.49.1
container_name: galaxy-dev-cadvisor
restart: unless-stopped
labels:
galaxy.stack: dev-deploy
command:
# Collect container stats every 30s.
- --housekeeping_interval=30s
# Report only Docker containers (in addition to root stats).
- --docker_only=true
# Do not turn container labels into Prometheus metric labels.
- --store_container_labels=false
# Privileged mode plus the kernel message device.
# NOTE(review): presumably needed for cAdvisor's host-level access
# (e.g. OOM event detection via /dev/kmsg) — confirm.
privileged: true
devices:
- /dev/kmsg
volumes:
# All host paths are mounted read-only.
- /:/rootfs:ro
- /var/run:/var/run:ro
- /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro
- /dev/disk/:/dev/disk:ro
networks:
- galaxy-internal
deploy:
resources:
limits:
# Memory cap for this container.
memory: 256m
# galaxy-grafana: Grafana UI for the dev-deploy stack, configured to be
# served under the /grafana/ sub-path of https://galaxy.lan. Attached only
# to galaxy-internal; no host ports are declared in this block.
galaxy-grafana:
image: grafana/grafana:11.4.0
container_name: galaxy-dev-grafana
restart: unless-stopped
labels:
galaxy.stack: dev-deploy
# Start the three datasource backends before Grafana (start order only).
depends_on:
- galaxy-prometheus
- galaxy-loki
- galaxy-tempo
environment:
# Required: interpolation fails when the secret is unset or empty, so
# Grafana never comes up with the well-known "admin" password.
# Grafana applies this value only on first run; changing the secret does
# not change the password stored in an already-initialised
# galaxy-dev-grafana-data volume.
GF_SECURITY_ADMIN_PASSWORD: "${GALAXY_DEV_GRAFANA_ADMIN_PASSWORD:?GALAXY_DEV_GRAFANA_ADMIN_PASSWORD must be set to a non-empty value}"
# Sub-path mode: the public URL carries the /grafana/ prefix and Grafana
# serves from that sub-path itself.
GF_SERVER_ROOT_URL: https://galaxy.lan/grafana/
GF_SERVER_SERVE_FROM_SUB_PATH: "true"
# No self-service sign-up.
GF_USERS_ALLOW_SIGN_UP: "false"
# Turn off usage reporting, update checks and the news feed.
GF_ANALYTICS_REPORTING_ENABLED: "false"
GF_ANALYTICS_CHECK_FOR_UPDATES: "false"
GF_NEWS_NEWS_FEED_ENABLED: "false"
volumes:
# Provisioning config and dashboards, bind-mounted read-only from the
# monitoring directory (GALAXY_DEV_MONITORING_DIR, defaulting to
# ./monitoring).
- ${GALAXY_DEV_MONITORING_DIR:-./monitoring}/grafana/provisioning:/etc/grafana/provisioning:ro
- ${GALAXY_DEV_MONITORING_DIR:-./monitoring}/grafana/dashboards:/var/lib/grafana/dashboards:ro
# Named volume for Grafana's own state.
- galaxy-dev-grafana-data:/var/lib/grafana
networks:
- galaxy-internal
deploy:
resources:
limits:
# Memory cap for this container.
memory: 256m
networks:
galaxy-internal:
name: galaxy-dev-internal
@@ -295,3 +460,11 @@ volumes:
name: galaxy-dev-geoip-data
galaxy-dev-mailpit-config:
name: galaxy-dev-mailpit-config
galaxy-dev-prometheus-data:
name: galaxy-dev-prometheus-data
galaxy-dev-grafana-data:
name: galaxy-dev-grafana-data
galaxy-dev-loki-data:
name: galaxy-dev-loki-data
galaxy-dev-tempo-data:
name: galaxy-dev-tempo-data