Stage 16: deploy infra & test contour
- backend + gateway multi-stage distroless Dockerfiles; the gateway embeds and
serves the SPA at / and /telegram/ via go:embed (committed dist placeholder,
real build baked in by the image's node stage)
- deploy/docker-compose.yml: backend + gateway + Postgres + Telegram connector
(VPN sidecar) + OTel Collector + Prometheus (15d) + Tempo (72h) + Grafana,
fronted by a caddy owning a single /_gm Basic-Auth (admin console + Grafana
subpath); inter-service on a private network, only caddy on the edge network
- new metrics: backend accounts_created_total{kind} (robots excluded) and an
in-memory gateway active_users{window=24h,7d} gauge
- CI: single .gitea/workflows/ci.yaml (unit/integration/ui + a gated test-contour
deploy) on the new feature/* -> development -> master branch model; the old
go-unit/integration/ui-test workflows are folded in; the connector-scoped
compose is retired (superseded by deploy/)
- docs: ARCHITECTURE §11/§12/§13, root + gateway READMEs, CLAUDE.md branching,
PLAN.md (stage 16 done + refinements + Stage 17 forward-notes)
This commit is contained in:
@@ -0,0 +1,39 @@
|
||||
{
|
||||
"uid": "scrabble-edge",
|
||||
"title": "Scrabble — Edge / UX",
|
||||
"tags": ["scrabble"],
|
||||
"timezone": "",
|
||||
"schemaVersion": 39,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-6h", "to": "now" },
|
||||
"panels": [
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Edge request rate by message type",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
|
||||
"fieldConfig": { "defaults": { "unit": "reqps" }, "overrides": [] },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "A", "expr": "sum(rate(edge_request_duration_count[5m])) by (message_type)", "legendFormat": "{{message_type}}" }]
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Edge p95 latency",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
|
||||
"fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [
|
||||
{ "refId": "A", "expr": "histogram_quantile(0.95, sum(rate(edge_request_duration_bucket[5m])) by (le))", "legendFormat": "p95" },
|
||||
{ "refId": "B", "expr": "histogram_quantile(0.50, sum(rate(edge_request_duration_bucket[5m])) by (le))", "legendFormat": "p50" }
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Edge requests by result",
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 8 },
|
||||
"fieldConfig": { "defaults": { "unit": "reqps" }, "overrides": [] },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "A", "expr": "sum(rate(edge_request_duration_count[5m])) by (result)", "legendFormat": "{{result}}" }]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,59 @@
|
||||
{
|
||||
"uid": "scrabble-game",
|
||||
"title": "Scrabble — Game domain",
|
||||
"tags": ["scrabble"],
|
||||
"timezone": "",
|
||||
"schemaVersion": 39,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-24h", "to": "now" },
|
||||
"panels": [
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Games started / abandoned (rate by variant)",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [
|
||||
{ "refId": "A", "expr": "sum(rate(games_started_total[15m])) by (variant)", "legendFormat": "started {{variant}}" },
|
||||
{ "refId": "B", "expr": "sum(rate(games_abandoned_total[15m])) by (variant)", "legendFormat": "abandoned {{variant}}" }
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Robot games finished (rate)",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "A", "expr": "sum(rate(robot_games_finished_total[15m]))", "legendFormat": "robot games" }]
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Live games in cache (by variant)",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "A", "expr": "sum(game_cache_active) by (variant)", "legendFormat": "{{variant}}" }]
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Chat messages (rate by kind)",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "A", "expr": "sum(rate(chat_messages_total[15m])) by (kind)", "legendFormat": "{{kind}}" }]
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Journal replay p95 (by variant)",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
|
||||
"fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "A", "expr": "histogram_quantile(0.95, sum(rate(game_replay_duration_bucket[5m])) by (le, variant))", "legendFormat": "{{variant}}" }]
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Move validate p95 (by variant)",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
|
||||
"fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "A", "expr": "histogram_quantile(0.95, sum(rate(game_move_validate_duration_bucket[5m])) by (le, variant))", "legendFormat": "{{variant}}" }]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,58 @@
|
||||
{
|
||||
"uid": "scrabble-overview",
|
||||
"title": "Scrabble — Service overview",
|
||||
"tags": ["scrabble"],
|
||||
"timezone": "",
|
||||
"schemaVersion": 39,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-6h", "to": "now" },
|
||||
"panels": [
|
||||
{
|
||||
"type": "stat",
|
||||
"title": "Active users (24h)",
|
||||
"gridPos": { "h": 5, "w": 6, "x": 0, "y": 0 },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "A", "expr": "max(active_users{window=\"24h\"})" }]
|
||||
},
|
||||
{
|
||||
"type": "stat",
|
||||
"title": "Active users (7d)",
|
||||
"gridPos": { "h": 5, "w": 6, "x": 6, "y": 0 },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "A", "expr": "max(active_users{window=\"7d\"})" }]
|
||||
},
|
||||
{
  "type": "stat",
  "title": "Edge requests/s",
  "description": "Total edge request throughput: 5m rate of edge_request_duration_count summed over all series.",
  "gridPos": { "h": 5, "w": 6, "x": 12, "y": 0 },
  "fieldConfig": { "defaults": { "unit": "reqps" }, "overrides": [] },
  "datasource": { "type": "prometheus", "uid": "prometheus" },
  "targets": [{ "refId": "A", "expr": "sum(rate(edge_request_duration_count[5m]))" }]
},
|
||||
{
  "type": "stat",
  "title": "Edge error ratio",
  "description": "Share of edge requests whose result label is not ok, over a 5m rate window. Reads 0 when no failing requests exist; the tiny denominator floor only guards against division by zero and does not distort the ratio at low traffic.",
  "gridPos": { "h": 5, "w": 6, "x": 18, "y": 0 },
  "fieldConfig": { "defaults": { "unit": "percentunit" }, "overrides": [] },
  "datasource": { "type": "prometheus", "uid": "prometheus" },
  "targets": [{ "refId": "A", "expr": "(sum(rate(edge_request_duration_count{result!=\"ok\"}[5m])) or vector(0)) / clamp_min(sum(rate(edge_request_duration_count[5m])), 1e-9)" }]
},
|
||||
{
  "type": "timeseries",
  "title": "Goroutines by service",
  "description": "OTel Go runtime metric; verify the exact name against live Prometheus if empty (go_goroutine_count / process_runtime_go_goroutines depending on the contrib runtime version).",
  "gridPos": { "h": 8, "w": 12, "x": 0, "y": 5 },
  "datasource": { "type": "prometheus", "uid": "prometheus" },
  "targets": [{ "refId": "A", "expr": "sum(go_goroutine_count) by (service_name)", "legendFormat": "{{service_name}}" }]
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Heap memory used by service",
|
||||
"description": "OTel Go runtime metric (best-effort name go_memory_used); verify against live Prometheus if empty.",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 5 },
|
||||
"fieldConfig": { "defaults": { "unit": "bytes" }, "overrides": [] },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "A", "expr": "sum(go_memory_used) by (service_name)", "legendFormat": "{{service_name}}" }]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,34 @@
|
||||
{
|
||||
"uid": "scrabble-users",
|
||||
"title": "Scrabble — Users",
|
||||
"tags": ["scrabble"],
|
||||
"timezone": "",
|
||||
"schemaVersion": 39,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-7d", "to": "now" },
|
||||
"panels": [
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Active users (in-memory, single gateway)",
|
||||
"description": "Distinct accounts with an authenticated action within the window. Resets on gateway restart; correct for a single instance (MVP).",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "A", "expr": "max(active_users) by (window)", "legendFormat": "{{window}}" }]
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "New accounts (rate by kind)",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "A", "expr": "sum(rate(accounts_created_total[1h])) by (kind)", "legendFormat": "{{kind}}" }]
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "New accounts (cumulative by kind)",
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 8 },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "A", "expr": "sum(accounts_created_total) by (kind)", "legendFormat": "{{kind}}" }]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
# Loads the committed dashboard JSON from /var/lib/grafana/dashboards (mounted
# read-only from deploy/grafana/dashboards).
apiVersion: 1

providers:
  # Single file-based provider; every dashboard is placed in the "Scrabble"
  # folder of organisation 1.
  - name: scrabble
    orgId: 1
    folder: Scrabble
    type: file
    disableDeletion: false
    editable: true
    # UI edits are allowed, but the source directory is mounted read-only (see
    # header), so lasting changes must be committed to the dashboard JSON.
    allowUiUpdates: true
    options:
      path: /var/lib/grafana/dashboards
      # Keep all dashboards in the one folder named above instead of deriving
      # folders from the directory layout.
      foldersFromFilesStructure: false
|
||||
@@ -0,0 +1,16 @@
|
||||
# Grafana datasources for the Scrabble contour, provisioned at startup. Metrics
# come from Prometheus (scraping the collector) and traces from Tempo.
apiVersion: 1

datasources:
  # The uid is referenced verbatim by the provisioned dashboards
  # ("datasource": { "type": "prometheus", "uid": "prometheus" }); keep them in sync.
  - name: Prometheus
    type: prometheus
    uid: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
  # Trace backend, reached over the compose network by service name.
  - name: Tempo
    type: tempo
    uid: tempo
    access: proxy
    url: http://tempo:3200
|
||||
Reference in New Issue
Block a user