galaxy-game/backend/internal/opsstatus/opsstatus.go

// Package opsstatus reads point-in-time operational signals from Postgres for
// the admin console dashboard: database reachability, per-status counts of game
// runtimes, mail deliveries, and notification routes, plus the malformed
// notification-intent count.
//
// It is a read-only projection built entirely through the go-jet query builder
// against the generated table bindings; it owns no business logic and mutates
// nothing. Richer, historical metrics are out of scope — those belong to the
// Prometheus exporters wired on `backend` and `gateway`.
package opsstatus

import (
	"context"
	"database/sql"
	"fmt"
	"time"

	"galaxy/backend/internal/postgres/jet/backend/table"

	"github.com/go-jet/jet/v2/postgres"
)

// defaultCollectTimeout is the timeout NewStore assigns to every Store. Collect
// derives a context with this deadline and shares it across the ping and all
// count queries, so the bound applies to the whole call rather than to each
// query: a slow or wedged database cannot hang the dashboard request.
const defaultCollectTimeout = 3 * time.Second

// StatusCount pairs a status value with the number of rows currently in it.
// It is the element type of the per-status breakdowns carried by Snapshot.
type StatusCount struct {
	// Status is the value of the table's status column for this group.
	Status string
	// Count is the number of rows that share Status.
	Count int64
}

// Snapshot is a point-in-time view of the operational signals rendered on the
// dashboard. Errors collects per-query failures so a single failing probe
// degrades to a visible note rather than failing the whole page.
//
// A field whose probe failed (or never ran) keeps its zero value, so consult
// Errors before treating an empty breakdown or a zero count as meaningful.
type Snapshot struct {
	// PostgresHealthy is true once the database ping has succeeded.
	PostgresHealthy bool
	// Runtimes is the per-status row count of the runtime records table.
	Runtimes []StatusCount
	// MailDeliveries is the per-status row count of the mail deliveries table.
	MailDeliveries []StatusCount
	// NotificationRoutes is the per-status row count of the notification
	// routes table.
	NotificationRoutes []StatusCount
	// NotificationMalformed is the total row count of the malformed
	// notification intents table.
	NotificationMalformed int64
	// Errors holds one human-readable message per failed probe, in the order
	// the probes ran. It is nil when every probe succeeded.
	Errors []string
}

// Reader collects an operational Snapshot. The admin console depends on this
// interface so the dashboard can be tested without a database.
type Reader interface {
	// Collect returns the current Snapshot. The signature has no error
	// result: failures are reported through Snapshot.Errors instead.
	Collect(ctx context.Context) Snapshot
}

// Store is the Postgres-backed Reader. Construct it with NewStore so that
// timeout is populated.
type Store struct {
	// db is the handle used for the ping and for every count query.
	db *sql.DB
	// timeout bounds one whole Collect call (ping plus all queries).
	timeout time.Duration
}

// NewStore returns a Store that issues its ping and count queries against db
// and bounds every Collect call by defaultCollectTimeout.
func NewStore(db *sql.DB) *Store {
	store := new(Store)
	store.db = db
	store.timeout = defaultCollectTimeout
	return store
}

// Collect assembles a Snapshot of the dashboard signals, with the whole call
// bounded by the Store's timeout on top of whatever deadline ctx already
// carries.
//
// It has no error result. Each probe that fails contributes one message to
// Snapshot.Errors and leaves its own field at the zero value, while the
// remaining probes still run. The one exception is the initial Postgres ping:
// if it fails, Collect returns at once with PostgresHealthy false, because
// every dependent query would only fail the same way.
func (s *Store) Collect(ctx context.Context) Snapshot {
	bounded, release := context.WithTimeout(ctx, s.timeout)
	defer release()

	var result Snapshot

	// Reachability gate: nothing below is worth attempting without it.
	pingErr := s.db.PingContext(bounded)
	if pingErr != nil {
		result.Errors = append(result.Errors, fmt.Sprintf("postgres ping: %v", pingErr))
		return result
	}
	result.PostgresHealthy = true

	// Game runtimes grouped by status.
	runtimes, err := s.statusCounts(bounded, table.RuntimeRecords.Status, table.RuntimeRecords)
	if err == nil {
		result.Runtimes = runtimes
	} else {
		result.Errors = append(result.Errors, fmt.Sprintf("runtime status counts: %v", err))
	}

	// Mail deliveries grouped by status.
	deliveries, err := s.statusCounts(bounded, table.MailDeliveries.Status, table.MailDeliveries)
	if err == nil {
		result.MailDeliveries = deliveries
	} else {
		result.Errors = append(result.Errors, fmt.Sprintf("mail delivery counts: %v", err))
	}

	// Notification routes grouped by status.
	routes, err := s.statusCounts(bounded, table.NotificationRoutes.Status, table.NotificationRoutes)
	if err == nil {
		result.NotificationRoutes = routes
	} else {
		result.Errors = append(result.Errors, fmt.Sprintf("notification route counts: %v", err))
	}

	// Total number of malformed notification intents.
	malformed, err := s.countAll(bounded, table.NotificationMalformedIntents)
	if err == nil {
		result.NotificationMalformed = malformed
	} else {
		result.Errors = append(result.Errors, fmt.Sprintf("malformed notification count: %v", err))
	}

	return result
}

// statusCounts executes, through jet, the equivalent of
//
//	SELECT status, COUNT(*) FROM <from> GROUP BY status ORDER BY status ASC
//
// where status is the status column of from. It returns one StatusCount per
// distinct status value, in ascending status order. A table with no rows
// yields an empty, non-nil slice. Any query or scan error is returned as-is
// with a nil slice.
func (s *Store) statusCounts(ctx context.Context, status postgres.ColumnString, from postgres.ReadableTable) ([]StatusCount, error) {
	// Scan destination; the alias tags tie each field to the projection
	// aliases used in the statement below.
	var scanned []struct {
		Status string `alias:"status_count.status"`
		Count  int64  `alias:"status_count.count"`
	}

	statusCol := status.AS("status_count.status")
	rowCount := postgres.COUNT(postgres.STAR).AS("status_count.count")
	query := postgres.SELECT(statusCol, rowCount).
		FROM(from).
		GROUP_BY(status).
		ORDER_BY(status.ASC())

	err := query.QueryContext(ctx, s.db, &scanned)
	if err != nil {
		return nil, err
	}

	// Copy into the exported type so callers never see the tagged scan struct.
	counts := make([]StatusCount, 0, len(scanned))
	for i := range scanned {
		counts = append(counts, StatusCount{
			Status: scanned[i].Status,
			Count:  scanned[i].Count,
		})
	}
	return counts, nil
}

// countAll executes `SELECT COUNT(*) FROM <from>` through jet and returns the
// resulting row count. On a query or scan error it returns 0 together with
// that error, unwrapped.
func (s *Store) countAll(ctx context.Context, from postgres.ReadableTable) (int64, error) {
	// Single-row scan destination for the aliased aggregate.
	var result struct {
		Count int64 `alias:"count"`
	}

	total := postgres.COUNT(postgres.STAR).AS("count")
	query := postgres.SELECT(total).FROM(from)

	err := query.QueryContext(ctx, s.db, &result)
	if err != nil {
		return 0, err
	}
	return result.Count, nil
}