feat(dev-deploy): full observability stack (Prometheus/Grafana/Loki/Tempo)
Stand up a production-mirror monitoring stack in the long-lived dev contour, all on galaxy-dev-internal with no host ports (reached only via the in-repo galaxy-dev-caddy): - Prometheus scrapes backend:9100, gateway:9191, node-exporter and cadvisor (30s interval, 15d retention); Loki (7d) + promtail (Docker service discovery by the galaxy.stack=dev-deploy label) for logs; Tempo (3d) for traces. - Backend and gateway now export OTLP traces to Tempo over plaintext gRPC on the internal network (OTEL_EXPORTER_OTLP_INSECURE). - Grafana provisioned as code (Prometheus/Loki/Tempo datasources plus a starter dashboard), served under /grafana/ via Caddy sub-path mode; admin password from the GALAXY_DEV_GRAFANA_ADMIN_PASSWORD secret. - Expose the Mailpit capture UI under /mailpit/ (Caddy basic-auth + MP_WEBROOT) so every captured message is readable regardless of relay. - dev-deploy.yaml seeds the monitoring config to a stable, reboot-surviving host path and injects the Grafana admin secret. Per-service memory limits keep the footprint within budget. All collector config lives under tools/dev-deploy/monitoring/ for dev/prod parity.
This commit is contained in:
@@ -74,6 +74,9 @@ services:
|
||||
command:
|
||||
- "--smtp-relay-config=/etc/mailpit/relay.conf"
|
||||
- "--smtp-relay-matching=${GALAXY_DEV_MAIL_RELAY_MATCH:-nobody@invalid.example}"
|
||||
# Serve the capture UI under /mailpit so the host Caddy can expose it
|
||||
# at https://galaxy.lan/mailpit/ (behind basic-auth); SMTP is unaffected.
|
||||
- "--webroot=/mailpit"
|
||||
labels:
|
||||
galaxy.stack: dev-deploy
|
||||
networks:
|
||||
@@ -118,7 +121,13 @@ services:
|
||||
BACKEND_NOTIFICATION_ADMIN_EMAIL: admin@galaxy.lan
|
||||
BACKEND_MAIL_WORKER_INTERVAL: 500ms
|
||||
BACKEND_NOTIFICATION_WORKER_INTERVAL: 500ms
|
||||
BACKEND_OTEL_TRACES_EXPORTER: none
|
||||
BACKEND_OTEL_TRACES_EXPORTER: otlp
|
||||
BACKEND_OTEL_PROTOCOL: grpc
|
||||
BACKEND_OTEL_ENDPOINT: "galaxy-tempo:4317"
|
||||
# Tempo's OTLP receiver is plaintext on the internal network; the
|
||||
# backend's gRPC exporter defaults to TLS, so disable it via the
|
||||
# standard SDK env (applied on top of WithEndpoint).
|
||||
OTEL_EXPORTER_OTLP_INSECURE: "true"
|
||||
# Prometheus metrics are enabled in dev so the `/metrics` scrape
|
||||
# endpoint is live and stable ahead of standing up a Prometheus +
|
||||
# Grafana stack on the internal network. The listener stays internal
|
||||
@@ -196,6 +205,12 @@ services:
|
||||
# the internal network — live and stable for a future scrape, not
|
||||
# mapped to the host.
|
||||
GATEWAY_ADMIN_HTTP_ADDR: ":9191"
|
||||
# Traces -> Tempo over OTLP gRPC (plaintext on the internal net).
|
||||
OTEL_SERVICE_NAME: galaxy-gateway
|
||||
OTEL_TRACES_EXPORTER: otlp
|
||||
OTEL_EXPORTER_OTLP_PROTOCOL: grpc
|
||||
OTEL_EXPORTER_OTLP_ENDPOINT: "http://galaxy-tempo:4317"
|
||||
OTEL_EXPORTER_OTLP_INSECURE: "true"
|
||||
GATEWAY_BACKEND_HTTP_URL: "http://galaxy-backend:8080"
|
||||
GATEWAY_BACKEND_GRPC_PUSH_URL: "galaxy-backend:8081"
|
||||
GATEWAY_BACKEND_GATEWAY_CLIENT_ID: dev-gateway-1
|
||||
@@ -264,6 +279,156 @@ services:
|
||||
- galaxy-internal
|
||||
- edge
|
||||
|
||||
# Prometheus: metrics store for the dev contour. Attached to the internal
# network only; no host ports are published by this service.
galaxy-prometheus:
  container_name: galaxy-dev-prometheus
  image: "prom/prometheus:v2.55.1"
  restart: "unless-stopped"
  labels:
    galaxy.stack: "dev-deploy"
  networks:
    - galaxy-internal
  volumes:
    # Scrape configuration, mounted read-only from the monitoring config dir
    # (GALAXY_DEV_MONITORING_DIR, falling back to ./monitoring).
    - "${GALAXY_DEV_MONITORING_DIR:-./monitoring}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro"
    # Named volume backing the TSDB path passed via --storage.tsdb.path.
    - "galaxy-dev-prometheus-data:/prometheus"
  command:
    - "--config.file=/etc/prometheus/prometheus.yml"
    - "--storage.tsdb.path=/prometheus"
    # Sample retention window: 15 days.
    - "--storage.tsdb.retention.time=15d"
    # Turns on Prometheus' HTTP lifecycle endpoints (config reload / quit).
    - "--web.enable-lifecycle"
  # Per-service memory cap.
  deploy:
    resources:
      limits:
        memory: "384m"
|
||||
|
||||
# Loki: log store for the dev contour. Internal network only; its
# configuration is read from the mounted loki.yml.
galaxy-loki:
  container_name: galaxy-dev-loki
  image: "grafana/loki:3.3.2"
  restart: "unless-stopped"
  labels:
    galaxy.stack: "dev-deploy"
  networks:
    - galaxy-internal
  volumes:
    # Loki configuration, mounted read-only from the monitoring config dir.
    - "${GALAXY_DEV_MONITORING_DIR:-./monitoring}/loki/loki.yml:/etc/loki/loki.yml:ro"
    # Named volume mounted at /loki for persistent data.
    - "galaxy-dev-loki-data:/loki"
  command:
    - "-config.file=/etc/loki/loki.yml"
  # Per-service memory cap.
  deploy:
    resources:
      limits:
        memory: "384m"
|
||||
|
||||
# Promtail: log collector for the dev contour. It has no data volume of its
# own; it reads its config and the Docker socket, both mounted read-only.
galaxy-promtail:
  container_name: galaxy-dev-promtail
  image: "grafana/promtail:3.3.2"
  restart: "unless-stopped"
  labels:
    galaxy.stack: "dev-deploy"
  networks:
    - galaxy-internal
  volumes:
    # Promtail configuration, mounted read-only from the monitoring config dir.
    - "${GALAXY_DEV_MONITORING_DIR:-./monitoring}/promtail/promtail.yml:/etc/promtail/promtail.yml:ro"
    # Docker socket (read-only) so containers can be discovered via the
    # Docker API.
    - "/var/run/docker.sock:/var/run/docker.sock:ro"
  command:
    - "-config.file=/etc/promtail/promtail.yml"
  # Per-service memory cap.
  deploy:
    resources:
      limits:
        memory: "128m"
|
||||
|
||||
# Tempo: trace store for the dev contour. Internal network only; the backend
# and gateway services are pointed at galaxy-tempo:4317 for OTLP export.
galaxy-tempo:
  container_name: galaxy-dev-tempo
  image: "grafana/tempo:2.7.1"
  restart: "unless-stopped"
  labels:
    galaxy.stack: "dev-deploy"
  networks:
    - galaxy-internal
  volumes:
    # Tempo configuration, mounted read-only from the monitoring config dir.
    - "${GALAXY_DEV_MONITORING_DIR:-./monitoring}/tempo/tempo.yml:/etc/tempo/tempo.yml:ro"
    # Named volume mounted at /var/tempo for persistent data.
    # NOTE(review): confirm the image's runtime user can write to this
    # volume on first start.
    - "galaxy-dev-tempo-data:/var/tempo"
  command:
    - "-config.file=/etc/tempo/tempo.yml"
  # Per-service memory cap.
  deploy:
    resources:
      limits:
        memory: "384m"
|
||||
|
||||
# node-exporter: host-level metrics. The host's /proc, /sys and / are
# bind-mounted read-only and the exporter is pointed at them via --path.*.
galaxy-node-exporter:
  container_name: galaxy-dev-node-exporter
  image: "prom/node-exporter:v1.8.2"
  restart: "unless-stopped"
  labels:
    galaxy.stack: "dev-deploy"
  # Share the host PID namespace.
  pid: "host"
  networks:
    - galaxy-internal
  volumes:
    - "/proc:/host/proc:ro"
    - "/sys:/host/sys:ro"
    # NOTE(review): the upstream example mounts the root filesystem with
    # "ro,rslave" so later host mounts stay visible — consider if needed.
    - "/:/rootfs:ro"
  command:
    - "--path.procfs=/host/proc"
    - "--path.sysfs=/host/sys"
    - "--path.rootfs=/rootfs"
    # "$$" is Compose's escape for a literal "$" (regex end-of-string anchor).
    - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
  # Per-service memory cap.
  deploy:
    resources:
      limits:
        memory: "64m"
|
||||
|
||||
# cAdvisor: per-container resource metrics. Runs privileged with read-only
# views of the host root, /var/run, /sys, the Docker data dir and /dev/disk.
galaxy-cadvisor:
  container_name: galaxy-dev-cadvisor
  image: "gcr.io/cadvisor/cadvisor:v0.49.1"
  restart: "unless-stopped"
  labels:
    galaxy.stack: "dev-deploy"
  privileged: true
  devices:
    - "/dev/kmsg"
  networks:
    - galaxy-internal
  volumes:
    - "/:/rootfs:ro"
    - "/var/run:/var/run:ro"
    - "/sys:/sys:ro"
    - "/var/lib/docker/:/var/lib/docker:ro"
    - "/dev/disk/:/dev/disk:ro"
  command:
    # Housekeeping every 30s.
    - "--housekeeping_interval=30s"
    # Restrict reporting to Docker containers.
    - "--docker_only=true"
    # Do not attach container labels to the exported metrics.
    - "--store_container_labels=false"
  # Per-service memory cap.
  deploy:
    resources:
      limits:
        memory: "256m"
|
||||
|
||||
# Grafana: dashboards for the dev contour. Internal network only; configured
# to be served from the /grafana/ sub-path of https://galaxy.lan.
galaxy-grafana:
  container_name: galaxy-dev-grafana
  image: "grafana/grafana:11.4.0"
  restart: "unless-stopped"
  labels:
    galaxy.stack: "dev-deploy"
  # Started after the three datasource backends.
  depends_on:
    - galaxy-prometheus
    - galaxy-loki
    - galaxy-tempo
  networks:
    - galaxy-internal
  environment:
    # Taken from GALAXY_DEV_GRAFANA_ADMIN_PASSWORD; falls back to "admin"
    # when that variable is unset or empty.
    # NOTE(review): make sure the secret is always injected on shared hosts.
    GF_SECURITY_ADMIN_PASSWORD: "${GALAXY_DEV_GRAFANA_ADMIN_PASSWORD:-admin}"
    # Public URL and sub-path serving must agree with the reverse proxy route.
    GF_SERVER_ROOT_URL: "https://galaxy.lan/grafana/"
    GF_SERVER_SERVE_FROM_SUB_PATH: "true"
    # Self-registration, usage reporting, update checks and the news feed
    # are all switched off.
    GF_USERS_ALLOW_SIGN_UP: "false"
    GF_ANALYTICS_REPORTING_ENABLED: "false"
    GF_ANALYTICS_CHECK_FOR_UPDATES: "false"
    GF_NEWS_NEWS_FEED_ENABLED: "false"
  volumes:
    # Provisioning files and dashboards, mounted read-only from the
    # monitoring config dir.
    - "${GALAXY_DEV_MONITORING_DIR:-./monitoring}/grafana/provisioning:/etc/grafana/provisioning:ro"
    - "${GALAXY_DEV_MONITORING_DIR:-./monitoring}/grafana/dashboards:/var/lib/grafana/dashboards:ro"
    # Named volume mounted at /var/lib/grafana for Grafana's own state.
    - "galaxy-dev-grafana-data:/var/lib/grafana"
  # Per-service memory cap.
  deploy:
    resources:
      limits:
        memory: "256m"
|
||||
|
||||
networks:
|
||||
galaxy-internal:
|
||||
name: galaxy-dev-internal
|
||||
@@ -295,3 +460,11 @@ volumes:
|
||||
name: galaxy-dev-geoip-data
|
||||
galaxy-dev-mailpit-config:
|
||||
name: galaxy-dev-mailpit-config
|
||||
galaxy-dev-prometheus-data:
|
||||
name: galaxy-dev-prometheus-data
|
||||
galaxy-dev-grafana-data:
|
||||
name: galaxy-dev-grafana-data
|
||||
galaxy-dev-loki-data:
|
||||
name: galaxy-dev-loki-data
|
||||
galaxy-dev-tempo-data:
|
||||
name: galaxy-dev-tempo-data
|
||||
|
||||
Reference in New Issue
Block a user