From 84a0ccb23f2b00e44be20ba9e7cc8a90d588da72 Mon Sep 17 00:00:00 2001 From: Ilia Denisov Date: Sun, 31 May 2026 23:39:06 +0200 Subject: [PATCH] feat(dev-deploy): full observability stack (Prometheus/Grafana/Loki/Tempo) Stand up a production-mirror monitoring stack in the long-lived dev contour, all on galaxy-dev-internal with no host ports (reached only via the in-repo galaxy-dev-caddy): - Prometheus scrapes backend:9100, gateway:9191, node-exporter and cadvisor (30s interval, 15d retention); Loki (7d) + promtail (Docker service discovery by the galaxy.stack=dev-deploy label) for logs; Tempo (3d) for traces. - Backend and gateway now export OTLP traces to Tempo over plaintext gRPC on the internal network (OTEL_EXPORTER_OTLP_INSECURE). - Grafana provisioned as code (Prometheus/Loki/Tempo datasources plus a starter dashboard), served under /grafana/ behind Caddy using Grafana's sub-path mode (GF_SERVER_SERVE_FROM_SUB_PATH); admin password from the GALAXY_DEV_GRAFANA_ADMIN_PASSWORD secret. - Expose the Mailpit capture UI under /mailpit/ (Caddy basic-auth + Mailpit --webroot) so every captured message is readable regardless of relay. - dev-deploy.yaml seeds the monitoring config to a stable, reboot-surviving host path and injects the Grafana admin secret. Per-service memory limits keep the footprint within budget. All collector config lives under tools/dev-deploy/monitoring/ for dev/prod parity. 
--- tools/dev-deploy/docker-compose.yml | 175 +++++++++++++++++- .../grafana/dashboards/galaxy-overview.json | 46 +++++ .../provisioning/dashboards/dashboards.yml | 12 ++ .../provisioning/datasources/datasources.yml | 22 +++ tools/dev-deploy/monitoring/loki/loki.yml | 47 +++++ .../monitoring/prometheus/prometheus.yml | 24 +++ .../monitoring/promtail/promtail.yml | 30 +++ tools/dev-deploy/monitoring/tempo/tempo.yml | 30 +++ 8 files changed, 385 insertions(+), 1 deletion(-) create mode 100644 tools/dev-deploy/monitoring/grafana/dashboards/galaxy-overview.json create mode 100644 tools/dev-deploy/monitoring/grafana/provisioning/dashboards/dashboards.yml create mode 100644 tools/dev-deploy/monitoring/grafana/provisioning/datasources/datasources.yml create mode 100644 tools/dev-deploy/monitoring/loki/loki.yml create mode 100644 tools/dev-deploy/monitoring/prometheus/prometheus.yml create mode 100644 tools/dev-deploy/monitoring/promtail/promtail.yml create mode 100644 tools/dev-deploy/monitoring/tempo/tempo.yml diff --git a/tools/dev-deploy/docker-compose.yml b/tools/dev-deploy/docker-compose.yml index d969f34..3c44fbc 100644 --- a/tools/dev-deploy/docker-compose.yml +++ b/tools/dev-deploy/docker-compose.yml @@ -74,6 +74,9 @@ services: command: - "--smtp-relay-config=/etc/mailpit/relay.conf" - "--smtp-relay-matching=${GALAXY_DEV_MAIL_RELAY_MATCH:-nobody@invalid.example}" + # Serve the capture UI under /mailpit so the host Caddy can expose it + # at https://galaxy.lan/mailpit/ (behind basic-auth); SMTP is unaffected. 
+ - "--webroot=/mailpit" labels: galaxy.stack: dev-deploy networks: @@ -118,7 +121,13 @@ services: BACKEND_NOTIFICATION_ADMIN_EMAIL: admin@galaxy.lan BACKEND_MAIL_WORKER_INTERVAL: 500ms BACKEND_NOTIFICATION_WORKER_INTERVAL: 500ms - BACKEND_OTEL_TRACES_EXPORTER: none + BACKEND_OTEL_TRACES_EXPORTER: otlp + BACKEND_OTEL_PROTOCOL: grpc + BACKEND_OTEL_ENDPOINT: "galaxy-tempo:4317" + # Tempo's OTLP receiver is plaintext on the internal network; the + # backend's gRPC exporter defaults to TLS, so disable it via the + # standard SDK env (applied on top of WithEndpoint). + OTEL_EXPORTER_OTLP_INSECURE: "true" # Prometheus metrics are enabled in dev so the `/metrics` scrape # endpoint is live and stable ahead of standing up a Prometheus + # Grafana stack on the internal network. The listener stays internal @@ -196,6 +205,12 @@ services: # the internal network — live and stable for a future scrape, not # mapped to the host. GATEWAY_ADMIN_HTTP_ADDR: ":9191" + # Traces -> Tempo over OTLP gRPC (plaintext on the internal net). 
+ OTEL_SERVICE_NAME: galaxy-gateway + OTEL_TRACES_EXPORTER: otlp + OTEL_EXPORTER_OTLP_PROTOCOL: grpc + OTEL_EXPORTER_OTLP_ENDPOINT: "http://galaxy-tempo:4317" + OTEL_EXPORTER_OTLP_INSECURE: "true" GATEWAY_BACKEND_HTTP_URL: "http://galaxy-backend:8080" GATEWAY_BACKEND_GRPC_PUSH_URL: "galaxy-backend:8081" GATEWAY_BACKEND_GATEWAY_CLIENT_ID: dev-gateway-1 @@ -264,6 +279,156 @@ services: - galaxy-internal - edge + galaxy-prometheus: + image: prom/prometheus:v2.55.1 + container_name: galaxy-dev-prometheus + restart: unless-stopped + labels: + galaxy.stack: dev-deploy + command: + - --config.file=/etc/prometheus/prometheus.yml + - --storage.tsdb.path=/prometheus + - --storage.tsdb.retention.time=15d + - --web.enable-lifecycle + volumes: + - ${GALAXY_DEV_MONITORING_DIR:-./monitoring}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - galaxy-dev-prometheus-data:/prometheus + networks: + - galaxy-internal + deploy: + resources: + limits: + memory: 384m + + galaxy-loki: + image: grafana/loki:3.3.2 + container_name: galaxy-dev-loki + restart: unless-stopped + labels: + galaxy.stack: dev-deploy + command: ["-config.file=/etc/loki/loki.yml"] + volumes: + - ${GALAXY_DEV_MONITORING_DIR:-./monitoring}/loki/loki.yml:/etc/loki/loki.yml:ro + - galaxy-dev-loki-data:/loki + networks: + - galaxy-internal + deploy: + resources: + limits: + memory: 384m + + galaxy-promtail: + image: grafana/promtail:3.3.2 + container_name: galaxy-dev-promtail + restart: unless-stopped + labels: + galaxy.stack: dev-deploy + command: ["-config.file=/etc/promtail/promtail.yml"] + volumes: + - ${GALAXY_DEV_MONITORING_DIR:-./monitoring}/promtail/promtail.yml:/etc/promtail/promtail.yml:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + networks: + - galaxy-internal + deploy: + resources: + limits: + memory: 128m + + galaxy-tempo: + image: grafana/tempo:2.7.1 + container_name: galaxy-dev-tempo + restart: unless-stopped + labels: + galaxy.stack: dev-deploy + command: 
["-config.file=/etc/tempo/tempo.yml"] + volumes: + - ${GALAXY_DEV_MONITORING_DIR:-./monitoring}/tempo/tempo.yml:/etc/tempo/tempo.yml:ro + - galaxy-dev-tempo-data:/var/tempo + networks: + - galaxy-internal + deploy: + resources: + limits: + memory: 384m + + galaxy-node-exporter: + image: prom/node-exporter:v1.8.2 + container_name: galaxy-dev-node-exporter + restart: unless-stopped + labels: + galaxy.stack: dev-deploy + command: + - --path.procfs=/host/proc + - --path.sysfs=/host/sys + - --path.rootfs=/rootfs + - --collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/) + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /:/rootfs:ro + pid: host + networks: + - galaxy-internal + deploy: + resources: + limits: + memory: 64m + + galaxy-cadvisor: + image: gcr.io/cadvisor/cadvisor:v0.49.1 + container_name: galaxy-dev-cadvisor + restart: unless-stopped + labels: + galaxy.stack: dev-deploy + command: + - --housekeeping_interval=30s + - --docker_only=true + - --store_container_labels=false + privileged: true + devices: + - /dev/kmsg + volumes: + - /:/rootfs:ro + - /var/run:/var/run:ro + - /sys:/sys:ro + - /var/lib/docker/:/var/lib/docker:ro + - /dev/disk/:/dev/disk:ro + networks: + - galaxy-internal + deploy: + resources: + limits: + memory: 256m + + galaxy-grafana: + image: grafana/grafana:11.4.0 + container_name: galaxy-dev-grafana + restart: unless-stopped + labels: + galaxy.stack: dev-deploy + depends_on: + - galaxy-prometheus + - galaxy-loki + - galaxy-tempo + environment: + GF_SECURITY_ADMIN_PASSWORD: ${GALAXY_DEV_GRAFANA_ADMIN_PASSWORD:-admin} + GF_SERVER_ROOT_URL: https://galaxy.lan/grafana/ + GF_SERVER_SERVE_FROM_SUB_PATH: "true" + GF_USERS_ALLOW_SIGN_UP: "false" + GF_ANALYTICS_REPORTING_ENABLED: "false" + GF_ANALYTICS_CHECK_FOR_UPDATES: "false" + GF_NEWS_NEWS_FEED_ENABLED: "false" + volumes: + - ${GALAXY_DEV_MONITORING_DIR:-./monitoring}/grafana/provisioning:/etc/grafana/provisioning:ro + - 
${GALAXY_DEV_MONITORING_DIR:-./monitoring}/grafana/dashboards:/var/lib/grafana/dashboards:ro + - galaxy-dev-grafana-data:/var/lib/grafana + networks: + - galaxy-internal + deploy: + resources: + limits: + memory: 256m + networks: galaxy-internal: name: galaxy-dev-internal @@ -295,3 +460,11 @@ volumes: name: galaxy-dev-geoip-data galaxy-dev-mailpit-config: name: galaxy-dev-mailpit-config + galaxy-dev-prometheus-data: + name: galaxy-dev-prometheus-data + galaxy-dev-grafana-data: + name: galaxy-dev-grafana-data + galaxy-dev-loki-data: + name: galaxy-dev-loki-data + galaxy-dev-tempo-data: + name: galaxy-dev-tempo-data diff --git a/tools/dev-deploy/monitoring/grafana/dashboards/galaxy-overview.json b/tools/dev-deploy/monitoring/grafana/dashboards/galaxy-overview.json new file mode 100644 index 0000000..845182c --- /dev/null +++ b/tools/dev-deploy/monitoring/grafana/dashboards/galaxy-overview.json @@ -0,0 +1,46 @@ +{ + "annotations": { "list": [] }, + "editable": true, + "graphTooltip": 0, + "panels": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { "unit": "reqps" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "id": 1, + "title": "Backend HTTP request rate", + "type": "timeseries", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum by (group) (rate(http_requests_total[5m]))", + "legendFormat": "{{group}}" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { "unit": "bytes" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "id": 2, + "title": "Container memory (cadvisor)", + "type": "timeseries", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum by (name) (container_memory_usage_bytes{name=~\"galaxy-dev-.*|galaxy-game-.*\"})", + "legendFormat": "{{name}}" + } + ] + } + ], + "schemaVersion": 39, + "tags": ["galaxy"], + 
"templating": { "list": [] }, + "time": { "from": "now-6h", "to": "now" }, + "timepicker": {}, + "title": "Galaxy — overview", + "uid": "galaxy-overview", + "version": 1, + "weekStart": "" +} diff --git a/tools/dev-deploy/monitoring/grafana/provisioning/dashboards/dashboards.yml b/tools/dev-deploy/monitoring/grafana/provisioning/dashboards/dashboards.yml new file mode 100644 index 0000000..daa0fe0 --- /dev/null +++ b/tools/dev-deploy/monitoring/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,12 @@ +# Grafana dashboard provider: load every JSON under the mounted +# dashboards directory at startup (provisioned as code). +apiVersion: 1 + +providers: + - name: galaxy + type: file + disableDeletion: false + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: false diff --git a/tools/dev-deploy/monitoring/grafana/provisioning/datasources/datasources.yml b/tools/dev-deploy/monitoring/grafana/provisioning/datasources/datasources.yml new file mode 100644 index 0000000..048dc55 --- /dev/null +++ b/tools/dev-deploy/monitoring/grafana/provisioning/datasources/datasources.yml @@ -0,0 +1,22 @@ +# Grafana datasources provisioned as code (dev↔prod parity). All reach +# the collectors by Docker DNS (compose service names) on +# galaxy-dev-internal. +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + uid: prometheus + url: http://galaxy-prometheus:9090 + isDefault: true + - name: Loki + type: loki + access: proxy + uid: loki + url: http://galaxy-loki:3100 + - name: Tempo + type: tempo + access: proxy + uid: tempo + url: http://galaxy-tempo:3200 diff --git a/tools/dev-deploy/monitoring/loki/loki.yml b/tools/dev-deploy/monitoring/loki/loki.yml new file mode 100644 index 0000000..b477d01 --- /dev/null +++ b/tools/dev-deploy/monitoring/loki/loki.yml @@ -0,0 +1,47 @@ +# Single-binary Loki for the dev stack: filesystem storage, in-memory +# ring, 7-day retention. Internal-only (no host port). 
+auth_enabled: false + +server: + http_listen_port: 3100 + grpc_listen_port: 9095 + log_level: warn + +common: + instance_addr: 127.0.0.1 + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + +schema_config: + configs: + - from: 2024-01-01 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +limits_config: + retention_period: 168h + reject_old_samples: true + reject_old_samples_max_age: 168h + +compactor: + working_directory: /loki/compactor + retention_enabled: true + delete_request_store: filesystem + +query_range: + results_cache: + cache: + embedded_cache: + enabled: true + max_size_mb: 64 diff --git a/tools/dev-deploy/monitoring/prometheus/prometheus.yml b/tools/dev-deploy/monitoring/prometheus/prometheus.yml new file mode 100644 index 0000000..d0ae211 --- /dev/null +++ b/tools/dev-deploy/monitoring/prometheus/prometheus.yml @@ -0,0 +1,24 @@ +# Prometheus scrape config for the dev observability stack. Retention is +# a CLI flag in the compose command, not here. Targets are reached by +# Docker DNS (compose service names) on galaxy-dev-internal; nothing is +# published to the host. 
+global: + scrape_interval: 30s + evaluation_interval: 30s + +scrape_configs: + - job_name: backend + static_configs: + - targets: ["galaxy-backend:9100"] + - job_name: gateway + static_configs: + - targets: ["galaxy-api:9191"] + - job_name: node + static_configs: + - targets: ["galaxy-node-exporter:9100"] + - job_name: cadvisor + static_configs: + - targets: ["galaxy-cadvisor:8080"] + - job_name: prometheus + static_configs: + - targets: ["localhost:9090"] diff --git a/tools/dev-deploy/monitoring/promtail/promtail.yml b/tools/dev-deploy/monitoring/promtail/promtail.yml new file mode 100644 index 0000000..369d24a --- /dev/null +++ b/tools/dev-deploy/monitoring/promtail/promtail.yml @@ -0,0 +1,30 @@ +# Promtail tails the dev stack's container logs via the Docker API +# (service discovery filtered to the galaxy.stack=dev-deploy label) and +# ships them to Loki. Requires the Docker socket mounted read-only. +server: + http_listen_port: 9080 + grpc_listen_port: 0 + log_level: warn + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://galaxy-loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 15s + filters: + - name: label + values: ["galaxy.stack=dev-deploy"] + relabel_configs: + - source_labels: ["__meta_docker_container_name"] + regex: "/?(.*)" + target_label: container + - source_labels: ["__meta_docker_container_label_galaxy_game_id"] + target_label: game_id + - source_labels: ["__meta_docker_container_log_stream"] + target_label: stream diff --git a/tools/dev-deploy/monitoring/tempo/tempo.yml b/tools/dev-deploy/monitoring/tempo/tempo.yml new file mode 100644 index 0000000..ef68f01 --- /dev/null +++ b/tools/dev-deploy/monitoring/tempo/tempo.yml @@ -0,0 +1,30 @@ +# Single-binary Tempo for the dev stack: OTLP receivers, local block +# storage, 3-day retention. Internal-only (no host port). Backend and +# gateway push traces here over OTLP gRPC (4317). 
+server: + http_listen_port: 3200 + log_level: warn + +distributor: + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +ingester: + max_block_duration: 5m + +compactor: + compaction: + block_retention: 72h + +storage: + trace: + backend: local + local: + path: /var/tempo/blocks + wal: + path: /var/tempo/wal