feat(dev-deploy): full observability stack (Prometheus/Grafana/Loki/Tempo)
Stand up a production-mirror monitoring stack in the long-lived dev contour, all on galaxy-dev-internal with no host ports (reached only via the in-repo galaxy-dev-caddy):
- Prometheus scrapes backend:9100, gateway:9191, node-exporter and cadvisor (30s interval, 15d retention); Loki (7d) + promtail (Docker service discovery by the galaxy.stack=dev-deploy label) for logs; Tempo (3d) for traces.
- Backend and gateway now export OTLP traces to Tempo over plaintext gRPC on the internal network (OTEL_EXPORTER_OTLP_INSECURE).
- Grafana provisioned as code (Prometheus/Loki/Tempo datasources plus a starter dashboard), served under /grafana/ via Caddy sub-path mode; admin password from the GALAXY_DEV_GRAFANA_ADMIN_PASSWORD secret.
- Expose the Mailpit capture UI under /mailpit/ (Caddy basic-auth + MP_WEBROOT) so every captured message is readable regardless of relay.
- dev-deploy.yaml seeds the monitoring config to a stable, reboot-surviving host path and injects the Grafana admin secret. Per-service memory limits keep the footprint within budget.
All collector config lives under tools/dev-deploy/monitoring/ for dev/prod parity.
This commit is contained in:
@@ -0,0 +1,46 @@
{
  "annotations": { "list": [] },
  "editable": true,
  "graphTooltip": 0,
  "panels": [
    {
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "fieldConfig": { "defaults": { "unit": "reqps" }, "overrides": [] },
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
      "id": 1,
      "title": "Backend HTTP request rate",
      "type": "timeseries",
      "targets": [
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "expr": "sum by (group) (rate(http_requests_total[5m]))",
          "legendFormat": "{{group}}"
        }
      ]
    },
    {
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "fieldConfig": { "defaults": { "unit": "bytes" }, "overrides": [] },
      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
      "id": 2,
      "title": "Container memory (cadvisor)",
      "type": "timeseries",
      "targets": [
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "expr": "sum by (name) (container_memory_usage_bytes{name=~\"galaxy-dev-.*|galaxy-game-.*\"})",
          "legendFormat": "{{name}}"
        }
      ]
    }
  ],
  "schemaVersion": 39,
  "tags": ["galaxy"],
  "templating": { "list": [] },
  "time": { "from": "now-6h", "to": "now" },
  "timepicker": {},
  "title": "Galaxy — overview",
  "uid": "galaxy-overview",
  "version": 1,
  "weekStart": ""
}
Reference in New Issue
Block a user