dev-deploy: production mirror + full observability behind the /_gm gate #88

Merged
developer merged 8 commits from feature/dev-prod-mirror into development 2026-06-01 04:56:46 +00:00
8 changed files with 385 additions and 1 deletions
Showing only changes of commit 84a0ccb23f - Show all commits
+174 -1
View File
@@ -74,6 +74,9 @@ services:
command: command:
- "--smtp-relay-config=/etc/mailpit/relay.conf" - "--smtp-relay-config=/etc/mailpit/relay.conf"
- "--smtp-relay-matching=${GALAXY_DEV_MAIL_RELAY_MATCH:-nobody@invalid.example}" - "--smtp-relay-matching=${GALAXY_DEV_MAIL_RELAY_MATCH:-nobody@invalid.example}"
# Serve the capture UI under /mailpit so the host Caddy can expose it
# at https://galaxy.lan/mailpit/ (behind basic-auth); SMTP is unaffected.
- "--webroot=/mailpit"
labels: labels:
galaxy.stack: dev-deploy galaxy.stack: dev-deploy
networks: networks:
@@ -118,7 +121,13 @@ services:
BACKEND_NOTIFICATION_ADMIN_EMAIL: admin@galaxy.lan BACKEND_NOTIFICATION_ADMIN_EMAIL: admin@galaxy.lan
BACKEND_MAIL_WORKER_INTERVAL: 500ms BACKEND_MAIL_WORKER_INTERVAL: 500ms
BACKEND_NOTIFICATION_WORKER_INTERVAL: 500ms BACKEND_NOTIFICATION_WORKER_INTERVAL: 500ms
BACKEND_OTEL_TRACES_EXPORTER: none BACKEND_OTEL_TRACES_EXPORTER: otlp
BACKEND_OTEL_PROTOCOL: grpc
BACKEND_OTEL_ENDPOINT: "galaxy-tempo:4317"
# Tempo's OTLP receiver is plaintext on the internal network, but the
# backend's gRPC exporter defaults to TLS; turn TLS off with the
# standard SDK env var below (applied on top of WithEndpoint).
OTEL_EXPORTER_OTLP_INSECURE: "true"
# Prometheus metrics are enabled in dev so the `/metrics` scrape # Prometheus metrics are enabled in dev so the `/metrics` scrape
# endpoint is live and stable ahead of standing up a Prometheus + # endpoint is live and stable ahead of standing up a Prometheus +
# Grafana stack on the internal network. The listener stays internal # Grafana stack on the internal network. The listener stays internal
@@ -196,6 +205,12 @@ services:
# the internal network — live and stable for a future scrape, not # the internal network — live and stable for a future scrape, not
# mapped to the host. # mapped to the host.
GATEWAY_ADMIN_HTTP_ADDR: ":9191" GATEWAY_ADMIN_HTTP_ADDR: ":9191"
# Traces -> Tempo over OTLP gRPC (plaintext on the internal net).
OTEL_SERVICE_NAME: galaxy-gateway
OTEL_TRACES_EXPORTER: otlp
OTEL_EXPORTER_OTLP_PROTOCOL: grpc
OTEL_EXPORTER_OTLP_ENDPOINT: "http://galaxy-tempo:4317"
OTEL_EXPORTER_OTLP_INSECURE: "true"
GATEWAY_BACKEND_HTTP_URL: "http://galaxy-backend:8080" GATEWAY_BACKEND_HTTP_URL: "http://galaxy-backend:8080"
GATEWAY_BACKEND_GRPC_PUSH_URL: "galaxy-backend:8081" GATEWAY_BACKEND_GRPC_PUSH_URL: "galaxy-backend:8081"
GATEWAY_BACKEND_GATEWAY_CLIENT_ID: dev-gateway-1 GATEWAY_BACKEND_GATEWAY_CLIENT_ID: dev-gateway-1
@@ -264,6 +279,156 @@ services:
- galaxy-internal - galaxy-internal
- edge - edge
# Prometheus for the dev stack. The scrape config is bind-mounted
# read-only, the TSDB lives in a named volume with 15-day retention, and
# no `ports:` mapping is declared: the service is attached to the
# galaxy-internal network only.
galaxy-prometheus:
  container_name: galaxy-dev-prometheus
  image: "prom/prometheus:v2.55.1"
  restart: unless-stopped
  labels:
    galaxy.stack: "dev-deploy"
  networks:
    - galaxy-internal
  volumes:
    - "${GALAXY_DEV_MONITORING_DIR:-./monitoring}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro"
    - "galaxy-dev-prometheus-data:/prometheus"
  # Overriding `command` replaces the image's default flags, so the
  # config file and storage path are spelled out explicitly.
  command:
    - "--config.file=/etc/prometheus/prometheus.yml"
    - "--storage.tsdb.path=/prometheus"
    - "--storage.tsdb.retention.time=15d"
    - "--web.enable-lifecycle"
  deploy:
    resources:
      limits:
        memory: "384m"
# Loki log store. Its config is bind-mounted read-only and its data is
# kept in a named volume at /loki; attached to galaxy-internal only.
galaxy-loki:
  container_name: galaxy-dev-loki
  image: "grafana/loki:3.3.2"
  restart: unless-stopped
  labels:
    galaxy.stack: "dev-deploy"
  networks:
    - galaxy-internal
  volumes:
    - "${GALAXY_DEV_MONITORING_DIR:-./monitoring}/loki/loki.yml:/etc/loki/loki.yml:ro"
    - "galaxy-dev-loki-data:/loki"
  command:
    - "-config.file=/etc/loki/loki.yml"
  deploy:
    resources:
      limits:
        memory: "384m"
# Promtail log shipper. It gets the host Docker socket read-only plus
# its own read-only config; no data volume is mounted for this service.
galaxy-promtail:
  container_name: galaxy-dev-promtail
  image: "grafana/promtail:3.3.2"
  restart: unless-stopped
  labels:
    galaxy.stack: "dev-deploy"
  networks:
    - galaxy-internal
  volumes:
    - "${GALAXY_DEV_MONITORING_DIR:-./monitoring}/promtail/promtail.yml:/etc/promtail/promtail.yml:ro"
    - "/var/run/docker.sock:/var/run/docker.sock:ro"
  command:
    - "-config.file=/etc/promtail/promtail.yml"
  deploy:
    resources:
      limits:
        memory: "128m"
# Tempo trace store. Its config is bind-mounted read-only and its data
# is kept in a named volume at /var/tempo; attached to galaxy-internal
# only.
galaxy-tempo:
  container_name: galaxy-dev-tempo
  image: "grafana/tempo:2.7.1"
  restart: unless-stopped
  labels:
    galaxy.stack: "dev-deploy"
  networks:
    - galaxy-internal
  volumes:
    - "${GALAXY_DEV_MONITORING_DIR:-./monitoring}/tempo/tempo.yml:/etc/tempo/tempo.yml:ro"
    - "galaxy-dev-tempo-data:/var/tempo"
  command:
    - "-config.file=/etc/tempo/tempo.yml"
  deploy:
    resources:
      limits:
        memory: "384m"
# Host metrics exporter. The host's /proc, /sys and / are bind-mounted
# read-only and the collectors are pointed at those mount paths; the
# container shares the host PID namespace.
galaxy-node-exporter:
  container_name: galaxy-dev-node-exporter
  image: "prom/node-exporter:v1.8.2"
  restart: unless-stopped
  labels:
    galaxy.stack: "dev-deploy"
  pid: host
  networks:
    - galaxy-internal
  volumes:
    - "/proc:/host/proc:ro"
    - "/sys:/host/sys:ro"
    - "/:/rootfs:ro"
  command:
    - "--path.procfs=/host/proc"
    - "--path.sysfs=/host/sys"
    - "--path.rootfs=/rootfs"
    # `$$` is Compose's escape for a literal `$` in the regex.
    - "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
  deploy:
    resources:
      limits:
        memory: "64m"
# Per-container metrics via cAdvisor. Runs privileged with read-only
# views of the host root, /var/run, /sys, the Docker data directory and
# /dev/disk, plus the /dev/kmsg device.
galaxy-cadvisor:
  container_name: galaxy-dev-cadvisor
  image: "gcr.io/cadvisor/cadvisor:v0.49.1"
  restart: unless-stopped
  labels:
    galaxy.stack: "dev-deploy"
  privileged: true
  devices:
    - "/dev/kmsg"
  networks:
    - galaxy-internal
  volumes:
    - "/:/rootfs:ro"
    - "/var/run:/var/run:ro"
    - "/sys:/sys:ro"
    - "/var/lib/docker/:/var/lib/docker:ro"
    - "/dev/disk/:/dev/disk:ro"
  command:
    - "--housekeeping_interval=30s"
    - "--docker_only=true"
    - "--store_container_labels=false"
  deploy:
    resources:
      limits:
        memory: "256m"
# Grafana UI, configured to be served from the /grafana/ sub-path of
# https://galaxy.lan. Provisioning and dashboards are bind-mounted
# read-only; Grafana's own state lives in a named volume. Started after
# the three datasource backends it depends on.
galaxy-grafana:
  container_name: galaxy-dev-grafana
  image: "grafana/grafana:11.4.0"
  restart: unless-stopped
  labels:
    galaxy.stack: "dev-deploy"
  depends_on:
    - galaxy-prometheus
    - galaxy-loki
    - galaxy-tempo
  networks:
    - galaxy-internal
  environment:
    # Sub-path serving.
    GF_SERVER_ROOT_URL: "https://galaxy.lan/grafana/"
    GF_SERVER_SERVE_FROM_SUB_PATH: "true"
    # Falls back to "admin" when GALAXY_DEV_GRAFANA_ADMIN_PASSWORD is
    # unset or empty.
    GF_SECURITY_ADMIN_PASSWORD: "${GALAXY_DEV_GRAFANA_ADMIN_PASSWORD:-admin}"
    GF_USERS_ALLOW_SIGN_UP: "false"
    # Reporting, update checks and the news feed are switched off.
    GF_ANALYTICS_REPORTING_ENABLED: "false"
    GF_ANALYTICS_CHECK_FOR_UPDATES: "false"
    GF_NEWS_NEWS_FEED_ENABLED: "false"
  volumes:
    - "${GALAXY_DEV_MONITORING_DIR:-./monitoring}/grafana/provisioning:/etc/grafana/provisioning:ro"
    - "${GALAXY_DEV_MONITORING_DIR:-./monitoring}/grafana/dashboards:/var/lib/grafana/dashboards:ro"
    - "galaxy-dev-grafana-data:/var/lib/grafana"
  deploy:
    resources:
      limits:
        memory: "256m"
networks: networks:
galaxy-internal: galaxy-internal:
name: galaxy-dev-internal name: galaxy-dev-internal
@@ -295,3 +460,11 @@ volumes:
name: galaxy-dev-geoip-data name: galaxy-dev-geoip-data
galaxy-dev-mailpit-config: galaxy-dev-mailpit-config:
name: galaxy-dev-mailpit-config name: galaxy-dev-mailpit-config
# Named volumes backing the observability services; each is given an
# explicit name identical to its key.
galaxy-dev-prometheus-data:
  name: "galaxy-dev-prometheus-data"
galaxy-dev-grafana-data:
  name: "galaxy-dev-grafana-data"
galaxy-dev-loki-data:
  name: "galaxy-dev-loki-data"
galaxy-dev-tempo-data:
  name: "galaxy-dev-tempo-data"
@@ -0,0 +1,46 @@
{
  "uid": "galaxy-overview",
  "title": "Galaxy — overview",
  "tags": [
    "galaxy"
  ],
  "version": 1,
  "schemaVersion": 39,
  "editable": true,
  "graphTooltip": 0,
  "weekStart": "",
  "time": {
    "from": "now-6h",
    "to": "now"
  },
  "timepicker": {},
  "annotations": {
    "list": []
  },
  "templating": {
    "list": []
  },
  "panels": [
    {
      "id": 1,
      "title": "Backend HTTP request rate",
      "type": "timeseries",
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 0
      },
      "fieldConfig": {
        "defaults": {
          "unit": "reqps"
        },
        "overrides": []
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "expr": "sum by (group) (rate(http_requests_total[5m]))",
          "legendFormat": "{{group}}"
        }
      ]
    },
    {
      "id": 2,
      "title": "Container memory (cadvisor)",
      "type": "timeseries",
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 12,
        "y": 0
      },
      "fieldConfig": {
        "defaults": {
          "unit": "bytes"
        },
        "overrides": []
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "expr": "sum by (name) (container_memory_usage_bytes{name=~\"galaxy-dev-.*|galaxy-game-.*\"})",
          "legendFormat": "{{name}}"
        }
      ]
    }
  ]
}
@@ -0,0 +1,12 @@
# Grafana dashboard provisioning (dashboards as code): a single file
# provider that reads every JSON dashboard found under the mounted
# dashboards directory.
apiVersion: 1

providers:
  - name: "galaxy"
    type: "file"
    options:
      path: "/var/lib/grafana/dashboards"
      # Sub-directories are not mapped to Grafana folders.
      foldersFromFilesStructure: false
    allowUiUpdates: true
    disableDeletion: false
@@ -0,0 +1,22 @@
# Grafana datasources as code (dev↔prod parity). Each URL uses a compose
# service name, resolved by Docker DNS on galaxy-dev-internal. The fixed
# `uid` values give dashboards a stable handle for each datasource.
apiVersion: 1

datasources:
  # Metrics — the default datasource.
  - uid: "prometheus"
    name: "Prometheus"
    type: "prometheus"
    url: "http://galaxy-prometheus:9090"
    access: "proxy"
    isDefault: true

  # Logs.
  - uid: "loki"
    name: "Loki"
    type: "loki"
    url: "http://galaxy-loki:3100"
    access: "proxy"

  # Traces.
  - uid: "tempo"
    name: "Tempo"
    type: "tempo"
    url: "http://galaxy-tempo:3200"
    access: "proxy"
+47
View File
@@ -0,0 +1,47 @@
# Loki for the dev stack, run as a single binary: chunks and rules on
# the local filesystem under /loki, an in-memory ring, and 7-day
# (168h) retention. Internal-only (no host port).
auth_enabled: false

server:
  log_level: warn
  http_listen_port: 3100
  grpc_listen_port: 9095

common:
  path_prefix: "/loki"
  instance_addr: "127.0.0.1"
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory
  storage:
    filesystem:
      chunks_directory: "/loki/chunks"
      rules_directory: "/loki/rules"

schema_config:
  configs:
    - from: "2024-01-01"
      schema: v13
      store: tsdb
      object_store: filesystem
      index:
        period: 24h
        prefix: "index_"

# Retention window, and the matching cut-off for samples that arrive
# with timestamps older than that window.
limits_config:
  retention_period: 168h
  reject_old_samples: true
  reject_old_samples_max_age: 168h

compactor:
  working_directory: "/loki/compactor"
  retention_enabled: true
  delete_request_store: filesystem

query_range:
  results_cache:
    cache:
      embedded_cache:
        enabled: true
        max_size_mb: 64
@@ -0,0 +1,24 @@
# Scrape configuration for the dev observability stack. Retention is
# set by a CLI flag in the compose command rather than in this file.
# Every target except Prometheus itself is addressed by its compose
# service name (Docker DNS on galaxy-dev-internal); nothing is published
# to the host.
global:
  scrape_interval: 30s
  evaluation_interval: 30s

scrape_configs:
  # Application services.
  - job_name: "backend"
    static_configs:
      - targets:
          - "galaxy-backend:9100"
  - job_name: "gateway"
    static_configs:
      - targets:
          - "galaxy-api:9191"

  # Host and container exporters.
  - job_name: "node"
    static_configs:
      - targets:
          - "galaxy-node-exporter:9100"
  - job_name: "cadvisor"
    static_configs:
      - targets:
          - "galaxy-cadvisor:8080"

  # Prometheus scraping itself.
  - job_name: "prometheus"
    static_configs:
      - targets:
          - "localhost:9090"
@@ -0,0 +1,30 @@
# Promtail discovers the dev stack's containers through the Docker API,
# keeping only those labelled galaxy.stack=dev-deploy, and pushes their
# logs to Loki. Requires the Docker socket mounted read-only.
server:
  log_level: warn
  http_listen_port: 9080
  grpc_listen_port: 0

# NOTE(review): the positions file sits under /tmp; presumably it is
# lost whenever the container is recreated — confirm that is acceptable.
positions:
  filename: "/tmp/positions.yaml"

clients:
  - url: "http://galaxy-loki:3100/loki/api/v1/push"

scrape_configs:
  - job_name: "docker"
    docker_sd_configs:
      - host: "unix:///var/run/docker.sock"
        refresh_interval: 15s
        filters:
          - name: "label"
            values:
              - "galaxy.stack=dev-deploy"
    relabel_configs:
      # Container name with an optional leading slash matched outside
      # the capture group -> `container`.
      - target_label: "container"
        source_labels:
          - "__meta_docker_container_name"
        regex: "/?(.*)"
      # Container label `galaxy_game_id` (as exposed by discovery) ->
      # `game_id`.
      - target_label: "game_id"
        source_labels:
          - "__meta_docker_container_label_galaxy_game_id"
      # Log stream reported by discovery -> `stream`.
      - target_label: "stream"
        source_labels:
          - "__meta_docker_container_log_stream"
@@ -0,0 +1,30 @@
# Tempo for the dev stack, run as a single binary: OTLP receivers on
# gRPC (4317) and HTTP (4318), blocks and WAL on local disk under
# /var/tempo, 3-day (72h) block retention. Internal-only (no host
# port). Backend and gateway push traces here over OTLP gRPC.
server:
  log_level: warn
  http_listen_port: 3200

distributor:
  receivers:
    otlp:
      protocols:
        # Both receivers bind on all interfaces.
        grpc:
          endpoint: "0.0.0.0:4317"
        http:
          endpoint: "0.0.0.0:4318"

ingester:
  max_block_duration: 5m

compactor:
  compaction:
    block_retention: 72h

storage:
  trace:
    backend: local
    wal:
      path: "/var/tempo/wal"
    local:
      path: "/var/tempo/blocks"