feat(dev-deploy): full observability stack (Prometheus/Grafana/Loki/Tempo)
Stand up a production-mirror monitoring stack in the long-lived dev contour, all on galaxy-dev-internal with no host ports (reached only via the in-repo galaxy-dev-caddy):

- Prometheus scrapes backend:9100, gateway:9191, node-exporter and cadvisor (30s interval, 15d retention); Loki (7d) + promtail (Docker service discovery by the galaxy.stack=dev-deploy label) for logs; Tempo (3d) for traces.
- Backend and gateway now export OTLP traces to Tempo over plaintext gRPC on the internal network (OTEL_EXPORTER_OTLP_INSECURE).
- Grafana provisioned as code (Prometheus/Loki/Tempo datasources plus a starter dashboard), served under /grafana/ via Caddy sub-path mode; admin password from the GALAXY_DEV_GRAFANA_ADMIN_PASSWORD secret.
- Expose the Mailpit capture UI under /mailpit/ (Caddy basic-auth + MP_WEBROOT) so every captured message is readable regardless of relay.
- dev-deploy.yaml seeds the monitoring config to a stable, reboot-surviving host path and injects the Grafana admin secret.

Per-service memory limits keep the footprint within budget. All collector config lives under tools/dev-deploy/monitoring/ for dev/prod parity.
This commit is contained in:
@@ -0,0 +1,46 @@
|
||||
{
  "annotations": { "list": [] },
  "editable": true,
  "graphTooltip": 0,
  "panels": [
    {
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "fieldConfig": { "defaults": { "unit": "reqps" }, "overrides": [] },
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
      "id": 1,
      "title": "Backend HTTP request rate",
      "type": "timeseries",
      "targets": [
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "expr": "sum by (group) (rate(http_requests_total[5m]))",
          "legendFormat": "{{group}}"
        }
      ]
    },
    {
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "fieldConfig": { "defaults": { "unit": "bytes" }, "overrides": [] },
      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
      "id": 2,
      "title": "Container memory (cadvisor)",
      "type": "timeseries",
      "targets": [
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "expr": "sum by (name) (container_memory_usage_bytes{name=~\"galaxy-dev-.*|galaxy-game-.*\"})",
          "legendFormat": "{{name}}"
        }
      ]
    }
  ],
  "schemaVersion": 39,
  "tags": ["galaxy"],
  "templating": { "list": [] },
  "time": { "from": "now-6h", "to": "now" },
  "timepicker": {},
  "title": "Galaxy — overview",
  "uid": "galaxy-overview",
  "version": 1,
  "weekStart": ""
}
|
||||
@@ -0,0 +1,12 @@
|
||||
# Grafana dashboard provider: load every JSON under the mounted
# dashboards directory at startup (provisioned as code).
apiVersion: 1

providers:
  - name: galaxy
    # Dashboards are read from files on disk rather than from the database.
    type: file
    disableDeletion: false
    # Edits made in the Grafana UI are permitted for provisioned dashboards.
    allowUiUpdates: true
    options:
      # Directory scanned for dashboard JSON files.
      path: /var/lib/grafana/dashboards
      # Keep a flat layout: sub-directories are not mapped to Grafana folders.
      foldersFromFilesStructure: false
|
||||
@@ -0,0 +1,22 @@
|
||||
# Grafana datasources provisioned as code (dev↔prod parity). All reach
# the collectors by Docker DNS (compose service names) on
# galaxy-dev-internal.
apiVersion: 1

# The uid values are fixed so dashboards can reference a datasource by a
# stable identifier (the starter dashboard uses uid "prometheus").
# access: proxy means queries go through the Grafana server, not the browser.
datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    uid: prometheus
    url: http://galaxy-prometheus:9090
    # Used by panels and Explore when no datasource is chosen explicitly.
    isDefault: true
  - name: Loki
    type: loki
    access: proxy
    uid: loki
    url: http://galaxy-loki:3100
  - name: Tempo
    type: tempo
    access: proxy
    uid: tempo
    url: http://galaxy-tempo:3200
|
||||
@@ -0,0 +1,47 @@
|
||||
# Single-binary Loki for the dev stack: filesystem storage, in-memory
# ring, 7-day retention. Internal-only (no host port).
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9095
  log_level: warn

common:
  instance_addr: 127.0.0.1
  path_prefix: /loki
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules
  # Single instance: one copy of the data and an in-memory ring, no
  # external KV store.
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory

schema_config:
  configs:
    # Quoted so the value is always read as a string, never as a YAML date.
    - from: "2024-01-01"
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

limits_config:
  # 168h = 7 days. Deletion is carried out by the compactor, which has
  # retention_enabled set below.
  retention_period: 168h
  reject_old_samples: true
  reject_old_samples_max_age: 168h

compactor:
  working_directory: /loki/compactor
  retention_enabled: true
  delete_request_store: filesystem

query_range:
  results_cache:
    cache:
      # In-process results cache capped at 64 MB.
      embedded_cache:
        enabled: true
        max_size_mb: 64
|
||||
@@ -0,0 +1,24 @@
|
||||
# Prometheus scrape config for the dev observability stack. Retention is
# a CLI flag in the compose command, not here. Targets are reached by
# Docker DNS (compose service names) on galaxy-dev-internal; nothing is
# published to the host.
global:
  scrape_interval: 30s
  evaluation_interval: 30s

# One job per exporter; every job inherits the 30s global interval.
scrape_configs:
  - job_name: backend
    static_configs:
      - targets: ["galaxy-backend:9100"]
  - job_name: gateway
    static_configs:
      - targets: ["galaxy-api:9191"]
  - job_name: node
    static_configs:
      - targets: ["galaxy-node-exporter:9100"]
  - job_name: cadvisor
    static_configs:
      - targets: ["galaxy-cadvisor:8080"]
  # Prometheus scraping its own metrics endpoint.
  - job_name: prometheus
    static_configs:
      - targets: ["localhost:9090"]
|
||||
@@ -0,0 +1,30 @@
|
||||
# Promtail tails the dev stack's container logs via the Docker API
# (service discovery filtered to the galaxy.stack=dev-deploy label) and
# ships them to Loki. Requires the Docker socket mounted read-only.
server:
  http_listen_port: 9080
  grpc_listen_port: 0
  log_level: warn

# NOTE(review): the positions file lives under /tmp; unless that path is
# backed by a volume, read offsets are lost when the container is
# recreated — confirm this is acceptable for the dev stack.
positions:
  filename: /tmp/positions.yaml

clients:
  - url: http://galaxy-loki:3100/loki/api/v1/push

scrape_configs:
  - job_name: docker
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
        refresh_interval: 15s
        # Only discover containers carrying the dev-deploy stack label.
        filters:
          - name: label
            values: ["galaxy.stack=dev-deploy"]
    relabel_configs:
      # Drop the optional leading "/" from the container name.
      - source_labels: ["__meta_docker_container_name"]
        regex: "/?(.*)"
        target_label: container
      # Copy the galaxy.game_id container label into a game_id log label.
      - source_labels: ["__meta_docker_container_label_galaxy_game_id"]
        target_label: game_id
      # Expose the log stream reported by Docker as a "stream" label.
      - source_labels: ["__meta_docker_container_log_stream"]
        target_label: stream
|
||||
@@ -0,0 +1,30 @@
|
||||
# Single-binary Tempo for the dev stack: OTLP receivers, local block
# storage, 3-day retention. Internal-only (no host port). Backend and
# gateway push traces here over OTLP gRPC (4317).
server:
  http_listen_port: 3200
  log_level: warn

distributor:
  receivers:
    otlp:
      protocols:
        # Bind on all interfaces so other containers on the network can
        # reach the receivers.
        grpc:
          endpoint: 0.0.0.0:4317
        http:
          endpoint: 0.0.0.0:4318

ingester:
  max_block_duration: 5m

compactor:
  compaction:
    # 72h = 3 days of trace retention.
    block_retention: 72h

storage:
  trace:
    backend: local
    local:
      path: /var/tempo/blocks
    wal:
      path: /var/tempo/wal
|
||||
Reference in New Issue
Block a user