feat(dev-deploy): full observability stack (Prometheus/Grafana/Loki/Tempo)
Stand up a production-mirror monitoring stack in the long-lived dev contour, all on galaxy-dev-internal with no host ports (reached only via the in-repo galaxy-dev-caddy): - Prometheus scrapes backend:9100, gateway:9191, node-exporter and cadvisor (30s interval, 15d retention); Loki (7d) + promtail (Docker service discovery by the galaxy.stack=dev-deploy label) for logs; Tempo (3d) for traces. - Backend and gateway now export OTLP traces to Tempo over plaintext gRPC on the internal network (OTEL_EXPORTER_OTLP_INSECURE). - Grafana is provisioned as code (Prometheus/Loki/Tempo datasources plus a starter dashboard) and served under /grafana/ via Caddy sub-path mode; the admin password comes from the GALAXY_DEV_GRAFANA_ADMIN_PASSWORD secret. - Expose the Mailpit capture UI under /mailpit/ (Caddy basic-auth + Mailpit's --webroot flag) so every captured message is readable regardless of relay. - dev-deploy.yaml seeds the monitoring config to a stable, reboot-surviving host path and injects the Grafana admin secret. Per-service memory limits keep the footprint within budget. All collector config lives under tools/dev-deploy/monitoring/ for dev/prod parity.
This commit is contained in:
@@ -74,6 +74,9 @@ services:
|
|||||||
command:
|
command:
|
||||||
- "--smtp-relay-config=/etc/mailpit/relay.conf"
|
- "--smtp-relay-config=/etc/mailpit/relay.conf"
|
||||||
- "--smtp-relay-matching=${GALAXY_DEV_MAIL_RELAY_MATCH:-nobody@invalid.example}"
|
- "--smtp-relay-matching=${GALAXY_DEV_MAIL_RELAY_MATCH:-nobody@invalid.example}"
|
||||||
|
# Serve the capture UI under /mailpit so the host Caddy can expose it
|
||||||
|
# at https://galaxy.lan/mailpit/ (behind basic-auth); SMTP is unaffected.
|
||||||
|
- "--webroot=/mailpit"
|
||||||
labels:
|
labels:
|
||||||
galaxy.stack: dev-deploy
|
galaxy.stack: dev-deploy
|
||||||
networks:
|
networks:
|
||||||
@@ -118,7 +121,13 @@ services:
|
|||||||
BACKEND_NOTIFICATION_ADMIN_EMAIL: admin@galaxy.lan
|
BACKEND_NOTIFICATION_ADMIN_EMAIL: admin@galaxy.lan
|
||||||
BACKEND_MAIL_WORKER_INTERVAL: 500ms
|
BACKEND_MAIL_WORKER_INTERVAL: 500ms
|
||||||
BACKEND_NOTIFICATION_WORKER_INTERVAL: 500ms
|
BACKEND_NOTIFICATION_WORKER_INTERVAL: 500ms
|
||||||
BACKEND_OTEL_TRACES_EXPORTER: none
|
BACKEND_OTEL_TRACES_EXPORTER: otlp
|
||||||
|
BACKEND_OTEL_PROTOCOL: grpc
|
||||||
|
BACKEND_OTEL_ENDPOINT: "galaxy-tempo:4317"
|
||||||
|
# Tempo's OTLP receiver is plaintext on the internal network; the
|
||||||
|
# backend's gRPC exporter defaults to TLS, so disable it via the
|
||||||
|
# standard SDK env (applied on top of WithEndpoint).
|
||||||
|
OTEL_EXPORTER_OTLP_INSECURE: "true"
|
||||||
# Prometheus metrics are enabled in dev so the `/metrics` scrape
|
# Prometheus metrics are enabled in dev so the `/metrics` scrape
|
||||||
# endpoint is live and stable ahead of standing up a Prometheus +
|
# endpoint is live and stable ahead of standing up a Prometheus +
|
||||||
# Grafana stack on the internal network. The listener stays internal
|
# Grafana stack on the internal network. The listener stays internal
|
||||||
@@ -196,6 +205,12 @@ services:
|
|||||||
# the internal network — live and stable for a future scrape, not
|
# the internal network — live and stable for a future scrape, not
|
||||||
# mapped to the host.
|
# mapped to the host.
|
||||||
GATEWAY_ADMIN_HTTP_ADDR: ":9191"
|
GATEWAY_ADMIN_HTTP_ADDR: ":9191"
|
||||||
|
# Traces -> Tempo over OTLP gRPC (plaintext on the internal net).
|
||||||
|
OTEL_SERVICE_NAME: galaxy-gateway
|
||||||
|
OTEL_TRACES_EXPORTER: otlp
|
||||||
|
OTEL_EXPORTER_OTLP_PROTOCOL: grpc
|
||||||
|
OTEL_EXPORTER_OTLP_ENDPOINT: "http://galaxy-tempo:4317"
|
||||||
|
OTEL_EXPORTER_OTLP_INSECURE: "true"
|
||||||
GATEWAY_BACKEND_HTTP_URL: "http://galaxy-backend:8080"
|
GATEWAY_BACKEND_HTTP_URL: "http://galaxy-backend:8080"
|
||||||
GATEWAY_BACKEND_GRPC_PUSH_URL: "galaxy-backend:8081"
|
GATEWAY_BACKEND_GRPC_PUSH_URL: "galaxy-backend:8081"
|
||||||
GATEWAY_BACKEND_GATEWAY_CLIENT_ID: dev-gateway-1
|
GATEWAY_BACKEND_GATEWAY_CLIENT_ID: dev-gateway-1
|
||||||
@@ -264,6 +279,156 @@ services:
|
|||||||
- galaxy-internal
|
- galaxy-internal
|
||||||
- edge
|
- edge
|
||||||
|
|
||||||
|
# Prometheus: metrics store for the dev stack. Scrape targets come from the
# read-only mounted prometheus.yml; retention is controlled by the CLI flag
# below, and the TSDB lives on a named volume.
galaxy-prometheus:
  container_name: galaxy-dev-prometheus
  image: prom/prometheus:v2.55.1
  restart: unless-stopped
  labels:
    galaxy.stack: dev-deploy
  networks:
    - galaxy-internal
  volumes:
    - "${GALAXY_DEV_MONITORING_DIR:-./monitoring}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro"
    - "galaxy-dev-prometheus-data:/prometheus"
  command:
    - "--config.file=/etc/prometheus/prometheus.yml"
    - "--storage.tsdb.path=/prometheus"
    # Keep 15 days of samples.
    - "--storage.tsdb.retention.time=15d"
    - "--web.enable-lifecycle"
  deploy:
    resources:
      limits:
        memory: 384m
|
||||||
|
|
||||||
|
# Loki: log store for the dev stack. Configuration is mounted read-only from
# the monitoring directory; data is kept on a named volume under /loki.
galaxy-loki:
  container_name: galaxy-dev-loki
  image: grafana/loki:3.3.2
  restart: unless-stopped
  labels:
    galaxy.stack: dev-deploy
  networks:
    - galaxy-internal
  volumes:
    - "${GALAXY_DEV_MONITORING_DIR:-./monitoring}/loki/loki.yml:/etc/loki/loki.yml:ro"
    - "galaxy-dev-loki-data:/loki"
  command:
    - "-config.file=/etc/loki/loki.yml"
  deploy:
    resources:
      limits:
        memory: 384m
|
||||||
|
|
||||||
|
# Promtail: log shipper. It is given the Docker socket (read-only mount) and
# its configuration file from the monitoring directory.
galaxy-promtail:
  container_name: galaxy-dev-promtail
  image: grafana/promtail:3.3.2
  restart: unless-stopped
  labels:
    galaxy.stack: dev-deploy
  networks:
    - galaxy-internal
  volumes:
    - "${GALAXY_DEV_MONITORING_DIR:-./monitoring}/promtail/promtail.yml:/etc/promtail/promtail.yml:ro"
    - "/var/run/docker.sock:/var/run/docker.sock:ro"
  command:
    - "-config.file=/etc/promtail/promtail.yml"
  deploy:
    resources:
      limits:
        memory: 128m
|
||||||
|
|
||||||
|
# Tempo: trace store for the dev stack. Configuration is mounted read-only;
# trace data is kept on a named volume under /var/tempo.
galaxy-tempo:
  container_name: galaxy-dev-tempo
  image: grafana/tempo:2.7.1
  restart: unless-stopped
  labels:
    galaxy.stack: dev-deploy
  networks:
    - galaxy-internal
  volumes:
    - "${GALAXY_DEV_MONITORING_DIR:-./monitoring}/tempo/tempo.yml:/etc/tempo/tempo.yml:ro"
    - "galaxy-dev-tempo-data:/var/tempo"
  command:
    - "-config.file=/etc/tempo/tempo.yml"
  deploy:
    resources:
      limits:
        memory: 384m
|
||||||
|
|
||||||
|
# node-exporter: host-level metrics. The host's /proc, /sys and / are mounted
# read-only and the exporter is pointed at them through the --path.* flags;
# the container also shares the host PID namespace.
galaxy-node-exporter:
  container_name: galaxy-dev-node-exporter
  image: prom/node-exporter:v1.8.2
  restart: unless-stopped
  labels:
    galaxy.stack: dev-deploy
  pid: host
  networks:
    - galaxy-internal
  volumes:
    - "/proc:/host/proc:ro"
    - "/sys:/host/sys:ro"
    - "/:/rootfs:ro"
  command:
    - "--path.procfs=/host/proc"
    - "--path.sysfs=/host/sys"
    - "--path.rootfs=/rootfs"
    # `$$` is Compose's escape for a literal `$` in the regex.
    - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
  deploy:
    resources:
      limits:
        memory: 64m
|
||||||
|
|
||||||
|
# cAdvisor: per-container resource metrics. Runs privileged with /dev/kmsg
# and read-only views of the host filesystem, /var/run, /sys, the Docker data
# directory and /dev/disk.
galaxy-cadvisor:
  container_name: galaxy-dev-cadvisor
  image: gcr.io/cadvisor/cadvisor:v0.49.1
  restart: unless-stopped
  labels:
    galaxy.stack: dev-deploy
  privileged: true
  devices:
    - "/dev/kmsg"
  networks:
    - galaxy-internal
  volumes:
    - "/:/rootfs:ro"
    - "/var/run:/var/run:ro"
    - "/sys:/sys:ro"
    - "/var/lib/docker/:/var/lib/docker:ro"
    - "/dev/disk/:/dev/disk:ro"
  command:
    - "--housekeeping_interval=30s"
    - "--docker_only=true"
    - "--store_container_labels=false"
  deploy:
    resources:
      limits:
        memory: 256m
|
||||||
|
|
||||||
|
# Grafana: UI for the stack, configured to serve from the /grafana/ sub-path
# of https://galaxy.lan. Provisioning files and dashboards are mounted
# read-only from the monitoring directory; Grafana's own state lives on a
# named volume.
galaxy-grafana:
  container_name: galaxy-dev-grafana
  image: grafana/grafana:11.4.0
  restart: unless-stopped
  labels:
    galaxy.stack: dev-deploy
  depends_on:
    - galaxy-prometheus
    - galaxy-loki
    - galaxy-tempo
  networks:
    - galaxy-internal
  environment:
    # Falls back to `admin` when GALAXY_DEV_GRAFANA_ADMIN_PASSWORD is unset
    # or empty.
    GF_SECURITY_ADMIN_PASSWORD: "${GALAXY_DEV_GRAFANA_ADMIN_PASSWORD:-admin}"
    GF_SERVER_ROOT_URL: "https://galaxy.lan/grafana/"
    GF_SERVER_SERVE_FROM_SUB_PATH: "true"
    GF_USERS_ALLOW_SIGN_UP: "false"
    # Outbound reporting, update checks and the news feed are all disabled.
    GF_ANALYTICS_REPORTING_ENABLED: "false"
    GF_ANALYTICS_CHECK_FOR_UPDATES: "false"
    GF_NEWS_NEWS_FEED_ENABLED: "false"
  volumes:
    - "${GALAXY_DEV_MONITORING_DIR:-./monitoring}/grafana/provisioning:/etc/grafana/provisioning:ro"
    - "${GALAXY_DEV_MONITORING_DIR:-./monitoring}/grafana/dashboards:/var/lib/grafana/dashboards:ro"
    - "galaxy-dev-grafana-data:/var/lib/grafana"
  deploy:
    resources:
      limits:
        memory: 256m
|
||||||
|
|
||||||
networks:
|
networks:
|
||||||
galaxy-internal:
|
galaxy-internal:
|
||||||
name: galaxy-dev-internal
|
name: galaxy-dev-internal
|
||||||
@@ -295,3 +460,11 @@ volumes:
|
|||||||
name: galaxy-dev-geoip-data
|
name: galaxy-dev-geoip-data
|
||||||
galaxy-dev-mailpit-config:
|
galaxy-dev-mailpit-config:
|
||||||
name: galaxy-dev-mailpit-config
|
name: galaxy-dev-mailpit-config
|
||||||
|
galaxy-dev-prometheus-data:
|
||||||
|
name: galaxy-dev-prometheus-data
|
||||||
|
galaxy-dev-grafana-data:
|
||||||
|
name: galaxy-dev-grafana-data
|
||||||
|
galaxy-dev-loki-data:
|
||||||
|
name: galaxy-dev-loki-data
|
||||||
|
galaxy-dev-tempo-data:
|
||||||
|
name: galaxy-dev-tempo-data
|
||||||
|
|||||||
@@ -0,0 +1,46 @@
|
|||||||
|
{
  "annotations": {
    "list": []
  },
  "editable": true,
  "graphTooltip": 0,
  "panels": [
    {
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "fieldConfig": {
        "defaults": {
          "unit": "reqps"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 0
      },
      "id": 1,
      "title": "Backend HTTP request rate",
      "type": "timeseries",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "expr": "sum by (group) (rate(http_requests_total[5m]))",
          "legendFormat": "{{group}}"
        }
      ]
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "fieldConfig": {
        "defaults": {
          "unit": "bytes"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 12,
        "y": 0
      },
      "id": 2,
      "title": "Container memory (cadvisor)",
      "type": "timeseries",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "expr": "sum by (name) (container_memory_usage_bytes{name=~\"galaxy-dev-.*|galaxy-game-.*\"})",
          "legendFormat": "{{name}}"
        }
      ]
    }
  ],
  "schemaVersion": 39,
  "tags": [
    "galaxy"
  ],
  "templating": {
    "list": []
  },
  "time": {
    "from": "now-6h",
    "to": "now"
  },
  "timepicker": {},
  "title": "Galaxy — overview",
  "uid": "galaxy-overview",
  "version": 1,
  "weekStart": ""
}
|
||||||
@@ -0,0 +1,12 @@
|
|||||||
|
# Dashboard provisioning for Grafana: a single file-based provider that reads
# dashboard JSON from the mounted dashboards directory, so dashboards are
# managed as code.
apiVersion: 1

providers:
  - name: "galaxy"
    type: "file"
    options:
      path: "/var/lib/grafana/dashboards"
      foldersFromFilesStructure: false
    allowUiUpdates: true
    disableDeletion: false
|
||||||
@@ -0,0 +1,22 @@
|
|||||||
|
# Datasource provisioning for Grafana (kept as code for dev/prod parity).
# Each URL uses the collector's Docker DNS name (compose service name) on
# galaxy-dev-internal.
apiVersion: 1

datasources:
  # Default datasource; the starter dashboard refers to it by uid.
  - uid: prometheus
    name: Prometheus
    type: prometheus
    access: proxy
    url: "http://galaxy-prometheus:9090"
    isDefault: true

  - uid: loki
    name: Loki
    type: loki
    access: proxy
    url: "http://galaxy-loki:3100"

  - uid: tempo
    name: Tempo
    type: tempo
    access: proxy
    url: "http://galaxy-tempo:3200"
|
||||||
@@ -0,0 +1,47 @@
|
|||||||
|
# Single-binary Loki for the dev stack: filesystem storage, in-memory ring,
# 7-day retention. Internal-only (no host port).
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9095
  log_level: warn

common:
  instance_addr: 127.0.0.1
  path_prefix: /loki
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory

schema_config:
  configs:
    # Quoted so YAML parsers do not coerce the value into a date type.
    - from: "2024-01-01"
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

limits_config:
  # 168h = 7 days; entries older than the retention window are rejected.
  retention_period: 168h
  reject_old_samples: true
  reject_old_samples_max_age: 168h

compactor:
  working_directory: /loki/compactor
  # Retention is applied by the compactor, so it must be enabled here.
  retention_enabled: true
  delete_request_store: filesystem

query_range:
  results_cache:
    cache:
      embedded_cache:
        enabled: true
        max_size_mb: 64

# Turn off anonymous usage reporting to Grafana Labs: this is an internal-only
# dev stack, and the Grafana container in the same stack disables its own
# analytics reporting as well.
analytics:
  reporting_enabled: false
|
||||||
@@ -0,0 +1,24 @@
|
|||||||
|
# Scrape configuration for the dev observability stack. Retention is not set
# here; it is a CLI flag on the Prometheus container. Every target is a
# Docker DNS name (compose service name) on galaxy-dev-internal, and nothing
# is published to the host.
global:
  scrape_interval: 30s
  evaluation_interval: 30s

scrape_configs:
  - job_name: backend
    static_configs:
      - targets:
          - "galaxy-backend:9100"

  - job_name: gateway
    static_configs:
      - targets:
          - "galaxy-api:9191"

  - job_name: node
    static_configs:
      - targets:
          - "galaxy-node-exporter:9100"

  - job_name: cadvisor
    static_configs:
      - targets:
          - "galaxy-cadvisor:8080"

  # Prometheus scraping itself.
  - job_name: prometheus
    static_configs:
      - targets:
          - "localhost:9090"
|
||||||
@@ -0,0 +1,30 @@
|
|||||||
|
# Promtail discovers the dev stack's containers through the Docker API
# (service discovery restricted to the galaxy.stack=dev-deploy label), tails
# their logs and pushes them to Loki. The Docker socket must be mounted into
# the container (read-only is sufficient).
server:
  http_listen_port: 9080
  grpc_listen_port: 0
  log_level: warn

positions:
  filename: /tmp/positions.yaml

clients:
  - url: "http://galaxy-loki:3100/loki/api/v1/push"

scrape_configs:
  - job_name: docker
    docker_sd_configs:
      - host: "unix:///var/run/docker.sock"
        refresh_interval: 15s
        filters:
          - name: label
            values:
              - "galaxy.stack=dev-deploy"
    relabel_configs:
      # Container name without the optional leading slash.
      - source_labels:
          - "__meta_docker_container_name"
        regex: "/?(.*)"
        target_label: container
      # Copy the galaxy game-id container label into a `game_id` log label.
      - source_labels:
          - "__meta_docker_container_label_galaxy_game_id"
        target_label: game_id
      # Log stream reported by Docker service discovery.
      - source_labels:
          - "__meta_docker_container_log_stream"
        target_label: stream
|
||||||
@@ -0,0 +1,30 @@
|
|||||||
|
# Single-binary Tempo for the dev stack: OTLP receivers, local block storage,
# 3-day retention. Internal-only (no host port). Backend and gateway push
# traces here over OTLP gRPC (4317).
server:
  http_listen_port: 3200
  log_level: warn

distributor:
  receivers:
    otlp:
      protocols:
        # Bind on all interfaces so other containers on the network can
        # reach the receivers.
        grpc:
          endpoint: "0.0.0.0:4317"
        http:
          endpoint: "0.0.0.0:4318"

ingester:
  max_block_duration: 5m

compactor:
  compaction:
    # 72h = 3 days of trace retention.
    block_retention: 72h

storage:
  trace:
    backend: local
    local:
      path: /var/tempo/blocks
    wal:
      path: /var/tempo/wal

# Turn off anonymous usage reporting to Grafana Labs: this is an internal-only
# dev stack, and the Grafana container in the same stack disables its own
# analytics reporting as well.
usage_report:
  reporting_enabled: false
|
||||||
Reference in New Issue
Block a user