feat(dev-deploy): full observability stack (Prometheus/Grafana/Loki/Tempo)

Stand up a production-mirror monitoring stack in the long-lived dev
contour, all on galaxy-dev-internal with no host ports (reached only via
the in-repo galaxy-dev-caddy):

- Prometheus scrapes backend:9100, gateway:9191, node-exporter and
  cadvisor (30s interval, 15d retention); Loki (7d) + promtail (Docker
  service discovery by the galaxy.stack=dev-deploy label) for logs;
  Tempo (3d) for traces.
- Backend and gateway now export OTLP traces to Tempo over plaintext
  gRPC on the internal network (OTEL_EXPORTER_OTLP_INSECURE).
- Grafana provisioned as code (Prometheus/Loki/Tempo datasources plus a
  starter dashboard), served under /grafana/ via Caddy sub-path mode;
  admin password from the GALAXY_DEV_GRAFANA_ADMIN_PASSWORD secret.
- Expose the Mailpit capture UI under /mailpit/ (Caddy basic-auth +
  MP_WEBROOT) so every captured message is readable regardless of relay.
- dev-deploy.yaml seeds the monitoring config to a stable, reboot-
  surviving host path and injects the Grafana admin secret.

Per-service memory limits keep the footprint within budget. All
collector config lives under tools/dev-deploy/monitoring/ for dev/prod
parity.
This commit is contained in:
Ilia Denisov
2026-05-31 23:39:06 +02:00
parent 7fb6a63c2b
commit 84a0ccb23f
8 changed files with 385 additions and 1 deletions
@@ -0,0 +1,46 @@
{
  "annotations": {
    "list": []
  },
  "editable": true,
  "graphTooltip": 0,
  "panels": [
    {
      "id": 1,
      "title": "Backend HTTP request rate",
      "type": "timeseries",
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 0
      },
      "fieldConfig": {
        "defaults": {
          "unit": "reqps"
        },
        "overrides": []
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "expr": "sum by (group) (rate(http_requests_total[5m]))",
          "legendFormat": "{{group}}"
        }
      ]
    },
    {
      "id": 2,
      "title": "Container memory (cadvisor)",
      "type": "timeseries",
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 12,
        "y": 0
      },
      "fieldConfig": {
        "defaults": {
          "unit": "bytes"
        },
        "overrides": []
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "expr": "sum by (name) (container_memory_usage_bytes{name=~\"galaxy-dev-.*|galaxy-game-.*\"})",
          "legendFormat": "{{name}}"
        }
      ]
    }
  ],
  "schemaVersion": 39,
  "tags": [
    "galaxy"
  ],
  "templating": {
    "list": []
  },
  "time": {
    "from": "now-6h",
    "to": "now"
  },
  "timepicker": {},
  "title": "Galaxy — overview",
  "uid": "galaxy-overview",
  "version": 1,
  "weekStart": ""
}
@@ -0,0 +1,12 @@
# Grafana dashboard provisioning (dashboards as code).
# A single file-based provider named "galaxy" reads the dashboard JSON
# files found under the mounted dashboards directory given in
# options.path.
apiVersion: 1

providers:
  - name: galaxy
    type: file
    # Provisioned dashboards may be deleted and edited from the UI.
    disableDeletion: false
    allowUiUpdates: true
    options:
      path: /var/lib/grafana/dashboards
      # Keep a flat layout: do not derive Grafana folders from
      # sub-directories of the path above.
      foldersFromFilesStructure: false
@@ -0,0 +1,22 @@
# Grafana datasource provisioning (kept as code for dev/prod parity).
# Every URL below is a compose service name resolved through Docker DNS
# on the galaxy-dev-internal network.
apiVersion: 1

datasources:
  # Metrics. Marked as the default datasource. The uid is fixed so that
  # provisioned dashboards can reference it by that uid.
  - name: Prometheus
    type: prometheus
    access: proxy
    uid: prometheus
    url: http://galaxy-prometheus:9090
    isDefault: true

  # Logs.
  - name: Loki
    type: loki
    access: proxy
    uid: loki
    url: http://galaxy-loki:3100

  # Traces.
  - name: Tempo
    type: tempo
    access: proxy
    uid: tempo
    url: http://galaxy-tempo:3200
+47
View File
@@ -0,0 +1,47 @@
# Loki for the dev stack, run as a single binary.
# Storage is the local filesystem under /loki, the ring lives in memory,
# and logs are kept for 7 days (168h). Internal-only: no host port.
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9095
  log_level: warn

common:
  instance_addr: 127.0.0.1
  path_prefix: /loki
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules
  # One instance only, so no replication and an in-memory ring.
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory

schema_config:
  configs:
    - from: 2024-01-01
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

limits_config:
  # 168h = 7 days. Samples older than the retention window are rejected
  # at ingest as well.
  retention_period: 168h
  reject_old_samples: true
  reject_old_samples_max_age: 168h

compactor:
  working_directory: /loki/compactor
  # Retention is switched on here; the period itself is
  # limits_config.retention_period above.
  retention_enabled: true
  delete_request_store: filesystem

query_range:
  results_cache:
    cache:
      embedded_cache:
        enabled: true
        max_size_mb: 64
@@ -0,0 +1,24 @@
# Prometheus scrape configuration for the dev observability stack.
# Retention is not set in this file; it is passed as a CLI flag in the
# compose command. All targets are compose service names resolved by
# Docker DNS on galaxy-dev-internal, and nothing is published to the
# host.
global:
  scrape_interval: 30s
  evaluation_interval: 30s

scrape_configs:
  - job_name: backend
    static_configs:
      - targets:
          - "galaxy-backend:9100"

  - job_name: gateway
    static_configs:
      - targets:
          - "galaxy-api:9191"

  - job_name: node
    static_configs:
      - targets:
          - "galaxy-node-exporter:9100"

  - job_name: cadvisor
    static_configs:
      - targets:
          - "galaxy-cadvisor:8080"

  # Prometheus scraping its own metrics endpoint.
  - job_name: prometheus
    static_configs:
      - targets:
          - "localhost:9090"
@@ -0,0 +1,30 @@
# Promtail log shipper for the dev stack.
# Containers are discovered through the Docker API, restricted to those
# carrying the galaxy.stack=dev-deploy label, and their logs are pushed
# to Loki. The Docker socket must be mounted (read-only) into the
# container.
server:
  http_listen_port: 9080
  grpc_listen_port: 0
  log_level: warn

positions:
  # NOTE(review): /tmp is usually not preserved when a container is
  # recreated - confirm whether the read positions are meant to survive
  # that, and whether a volume backs this path.
  filename: /tmp/positions.yaml

clients:
  - url: http://galaxy-loki:3100/loki/api/v1/push

scrape_configs:
  - job_name: docker
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
        refresh_interval: 15s
        filters:
          - name: label
            values:
              - "galaxy.stack=dev-deploy"
    relabel_configs:
      # Container name with an optional leading slash removed.
      - source_labels:
          - "__meta_docker_container_name"
        regex: "/?(.*)"
        target_label: container
      # Value of the container's galaxy.game_id Docker label.
      - source_labels:
          - "__meta_docker_container_label_galaxy_game_id"
        target_label: game_id
      # Log stream the line was read from.
      - source_labels:
          - "__meta_docker_container_log_stream"
        target_label: stream
@@ -0,0 +1,30 @@
# Tempo for the dev stack, run as a single binary.
# Traces arrive over OTLP, blocks are stored on local disk, and they are
# kept for 3 days (72h). Internal-only: no host port. Backend and
# gateway push their traces here over OTLP gRPC on port 4317.
server:
  http_listen_port: 3200
  log_level: warn

distributor:
  receivers:
    otlp:
      protocols:
        # Receivers listen on all interfaces.
        grpc:
          endpoint: "0.0.0.0:4317"
        http:
          endpoint: "0.0.0.0:4318"

ingester:
  max_block_duration: 5m

compactor:
  compaction:
    # 72h = 3 days.
    block_retention: 72h

storage:
  trace:
    backend: local
    local:
      path: /var/tempo/blocks
    wal:
      path: /var/tempo/wal