fix(backend): retry migrations on transient connection errors
Tests · Go / test (push) Successful in 2m1s
Tests · Go / test (pull_request) Successful in 3m0s
Tests · Integration / integration (pull_request) Successful in 1m42s

Backend e2e tests (and, more rarely, service startup) intermittently
failed applying migrations with `driver: bad connection`: a freshly
started Postgres — notably a test container — can reset a pooled
connection moments after it reports ready, killing the migration
transaction. The harness already waits for the double "ready" log and
pings before migrating, yet goose can still draw a connection that
Postgres then resets.

ApplyMigrations now wraps the schema-create + goose run in a bounded
retry that fires only on transient connection errors (driver.ErrBadConn
and the connection-failure messages Postgres drivers surface); both
steps are idempotent, so a retry resumes cleanly. Deterministic SQL
errors still fail fast.

Fixes the intermittent TestDiplomailAsyncFallbackOnUnsupportedPair (and
the eight other testcontainer e2e harnesses that share ApplyMigrations).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Ilia Denisov
2026-05-30 14:45:53 +02:00
parent 2f55fc4988
commit 06a2e631c9
2 changed files with 179 additions and 7 deletions
+76 -7
View File
@@ -10,7 +10,10 @@ package postgres
import (
"context"
"database/sql"
"database/sql/driver"
"errors"
"fmt"
"strings"
"time"
"galaxy/backend/internal/config"
@@ -67,18 +70,84 @@ func Open(ctx context.Context, cfg config.PostgresConfig, runtime *telemetry.Run
// backend table lives here.
const schemaName = "backend"
// Retry bounds for ApplyMigrations. A freshly started Postgres — notably a
// test container — can reset a pooled connection moments after it reports
// ready, which surfaces as `driver: bad connection` mid-migration. A handful
// of quick retries rides over that without masking real failures.

// migrationRetryAttempts is the maximum number of times the migration apply
// is run before the last error is reported.
const migrationRetryAttempts = 5

// migrationRetryBackoff is the pause between two consecutive attempts.
const migrationRetryBackoff = 250 * time.Millisecond
// ApplyMigrations runs every pending Up migration embedded in the backend
// binary against db. The schema is created upfront so goose's bookkeeping
// table (`goose_db_version`, scoped to the DSN `search_path = backend`)
// has somewhere to land before the first migration runs; migration
// `00001_init.sql` re-asserts the schema with `IF NOT EXISTS`, so the
// double-create is idempotent.
//
// The apply is retried on transient connection errors (see retryOnTransient).
// Both steps are idempotent — `CREATE SCHEMA IF NOT EXISTS` and goose's
// version tracking — so a retry after a dropped connection re-runs cleanly and
// resumes from the last committed migration.
func ApplyMigrations(ctx context.Context, db *sql.DB) error {
if _, err := db.ExecContext(ctx, "CREATE SCHEMA IF NOT EXISTS "+schemaName); err != nil {
return fmt.Errorf("ensure backend schema: %w", err)
}
if err := pgshared.RunMigrations(ctx, db, migrations.Migrations(), "."); err != nil {
return fmt.Errorf("apply backend migrations: %w", err)
}
return nil
return retryOnTransient(ctx, migrationRetryAttempts, migrationRetryBackoff, func() error {
if _, err := db.ExecContext(ctx, "CREATE SCHEMA IF NOT EXISTS "+schemaName); err != nil {
return fmt.Errorf("ensure backend schema: %w", err)
}
if err := pgshared.RunMigrations(ctx, db, migrations.Migrations(), "."); err != nil {
return fmt.Errorf("apply backend migrations: %w", err)
}
return nil
})
}
// retryOnTransient runs op up to attempts times, retrying only when op fails
// with a transient connection error (see isTransientConnError) — a dropped,
// reset, or refused connection, as opposed to a deterministic SQL error.
//
// Between attempts it waits backoff. If ctx is cancelled during that wait the
// function stops early and returns op's last error joined with ctx.Err(), so
// callers can match either with errors.Is. A non-transient error, or the
// error from the final attempt, is returned as-is. A nil result means some
// attempt succeeded.
//
// attempts below 1 is treated as 1: op always runs at least once, so a
// misconfigured bound can never report success without doing the work.
func retryOnTransient(ctx context.Context, attempts int, backoff time.Duration, op func() error) error {
	if attempts < 1 {
		attempts = 1
	}
	var err error
	for attempt := 1; attempt <= attempts; attempt++ {
		if err = op(); err == nil {
			return nil
		}
		if attempt == attempts || !isTransientConnError(err) {
			return err
		}
		// An explicit timer (rather than time.After) lets the wait be
		// released immediately when ctx is cancelled first.
		timer := time.NewTimer(backoff)
		select {
		case <-ctx.Done():
			timer.Stop()
			return errors.Join(err, ctx.Err())
		case <-timer.C:
		}
	}
	return err
}
// isTransientConnError classifies err as retryable or not. It returns true
// when err wraps database/sql's driver.ErrBadConn, or when its message
// contains — case-insensitively — one of the connection-failure phrases
// Postgres drivers surface. A nil error and every other error, including
// deterministic SQL errors such as syntax errors or constraint violations,
// yield false so that they fail fast.
func isTransientConnError(err error) bool {
	switch {
	case err == nil:
		return false
	case errors.Is(err, driver.ErrBadConn):
		return true
	}

	// Fall back to message matching for drivers that report connection
	// failures as plain text rather than as driver.ErrBadConn.
	text := strings.ToLower(err.Error())
	fragments := [...]string{
		"bad connection",
		"connection refused",
		"connection reset",
		"broken pipe",
		"server closed the connection",
	}
	for i := range fragments {
		if strings.Contains(text, fragments[i]) {
			return true
		}
	}
	return false
}