// Package reconcile implements the drift reconciliation worker
// described in `rtmanager/README.md §Reconciliation`. The reconciler
// is the single authority that brings `runtime_records` into agreement
// with the Docker daemon's view of `com.galaxy.owner=rtmanager`
// containers.
//
// Three drift kinds are handled:
//
//   - Adopt — a running container labelled `com.galaxy.owner=rtmanager`
//     has no matching `runtime_records` row. The reconciler inserts a
//     `status=running` record (`op_kind=reconcile_adopt`).
//   - Dispose — a `status=running` row whose `current_container_id` is
//     no longer reported by Docker. The reconciler updates the row to
//     `status=removed`, publishes `runtime:health_events`
//     `container_disappeared`, and appends `reconcile_dispose`.
//   - Observed exited — a `status=running` row whose container exists
//     but reports `State.Status=exited`. The reconciler transitions
//     the row to `status=stopped` and publishes `container_exited`
//     with the observed exit code. No `operation_log` entry is written
//     because `OpKind` does not include a value for this transition;
//     it is reflected in `rtmanager.reconcile_drift{kind=observed_exited}`
//     instead.
//
// All write decisions for a given `game_id` are guarded by the per-game
// Redis lease; the read pass that lists Docker containers and PG
// records is lockless.
//
// The reconciler runs once synchronously at process start
// (`ReconcileNow`) before any other worker is allowed to start, and
// then periodically via `Run` as an `app.Component`. Design rationale
// is captured in `rtmanager/docs/workers.md`.
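//
// A minimal wiring sketch (illustrative only: the collaborator variables,
// config values, and interval below are placeholders, not the real
// application wiring):
//
//	reconciler, err := NewReconciler(Dependencies{
//		Docker:         dockerClient,
//		RuntimeRecords: recordStore,
//		OperationLogs:  operationLogStore,
//		HealthEvents:   healthPublisher,
//		Leases:         leaseStore,
//		Telemetry:      telemetryRuntime,
//		DockerCfg:      dockerCfg,
//		ContainerCfg:   containerCfg,
//		Coordination:   coordinationCfg,
//		Interval:       time.Minute,
//	})
//	if err != nil {
//		// handle construction error
//	}
//	// Initial synchronous pass; it fails only on context cancellation.
//	if err := reconciler.ReconcileNow(ctx); err != nil {
//		// abort startup
//	}
//	// Periodic loop; the first tick fires after Interval.
//	go func() { _ = reconciler.Run(ctx) }()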
package reconcile

import (
	"context"
	"crypto/rand"
	"encoding/base64"
	"encoding/json"
	"errors"
	"fmt"
	"log/slog"
	"path/filepath"
	"strconv"
	"time"

	"galaxy/rtmanager/internal/config"
	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/logging"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/startruntime"
	"galaxy/rtmanager/internal/telemetry"
)

// dockerStateRunning is the verbatim Docker `State.Status` value the
// reconciler treats as "the container is alive".
const dockerStateRunning = "running"

// dockerStateExited is the verbatim Docker `State.Status` value the
// reconciler treats as "the container has terminated".
const dockerStateExited = "exited"

// driftKindAdopt / driftKindDispose / driftKindObservedExited match the
// `kind` label vocabulary on `rtmanager.reconcile_drift`.
const (
	driftKindAdopt          = "adopt"
	driftKindDispose        = "dispose"
	driftKindObservedExited = "observed_exited"
)

// leaseReleaseTimeout bounds the deferred lease-release call. A fresh
// background context is used so the release runs even if the request
// context was already canceled.
const leaseReleaseTimeout = 5 * time.Second

// Dependencies groups the collaborators required by Reconciler.
type Dependencies struct {
	Docker         ports.DockerClient
	RuntimeRecords ports.RuntimeRecordStore
	OperationLogs  ports.OperationLogStore
	HealthEvents   ports.HealthEventPublisher
	Leases         ports.GameLeaseStore

	Telemetry *telemetry.Runtime

	DockerCfg    config.DockerConfig
	ContainerCfg config.ContainerConfig
	Coordination config.CoordinationConfig

	// Interval bounds the periodic tick. ReconcileNow ignores it.
	Interval time.Duration

	Clock    func() time.Time
	Logger   *slog.Logger
	NewToken func() string
}

// Reconciler drives both the synchronous initial pass and the periodic
// drift reconciliation loop.
type Reconciler struct {
	docker         ports.DockerClient
	runtimeRecords ports.RuntimeRecordStore
	operationLogs  ports.OperationLogStore
	healthEvents   ports.HealthEventPublisher
	leases         ports.GameLeaseStore

	telemetry *telemetry.Runtime

	dockerNetwork string
	stateRoot     string
	leaseTTL      time.Duration

	interval time.Duration

	clock    func() time.Time
	logger   *slog.Logger
	newToken func() string
}

// NewReconciler constructs one Reconciler from deps.
func NewReconciler(deps Dependencies) (*Reconciler, error) {
	switch {
	case deps.Docker == nil:
		return nil, errors.New("new reconciler: nil docker client")
	case deps.RuntimeRecords == nil:
		return nil, errors.New("new reconciler: nil runtime records store")
	case deps.OperationLogs == nil:
		return nil, errors.New("new reconciler: nil operation log store")
	case deps.HealthEvents == nil:
		return nil, errors.New("new reconciler: nil health events publisher")
	case deps.Leases == nil:
		return nil, errors.New("new reconciler: nil lease store")
	case deps.Telemetry == nil:
		return nil, errors.New("new reconciler: nil telemetry runtime")
	case deps.Interval <= 0:
		return nil, errors.New("new reconciler: interval must be positive")
	}
	if err := deps.DockerCfg.Validate(); err != nil {
		return nil, fmt.Errorf("new reconciler: docker config: %w", err)
	}
	if err := deps.ContainerCfg.Validate(); err != nil {
		return nil, fmt.Errorf("new reconciler: container config: %w", err)
	}
	if err := deps.Coordination.Validate(); err != nil {
		return nil, fmt.Errorf("new reconciler: coordination config: %w", err)
	}

	clock := deps.Clock
	if clock == nil {
		clock = time.Now
	}
	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}
	newToken := deps.NewToken
	if newToken == nil {
		newToken = defaultTokenGenerator()
	}

	return &Reconciler{
		docker:         deps.Docker,
		runtimeRecords: deps.RuntimeRecords,
		operationLogs:  deps.OperationLogs,
		healthEvents:   deps.HealthEvents,
		leases:         deps.Leases,
		telemetry:      deps.Telemetry,
		dockerNetwork:  deps.DockerCfg.Network,
		stateRoot:      deps.ContainerCfg.GameStateRoot,
		leaseTTL:       deps.Coordination.GameLeaseTTL,
		interval:       deps.Interval,
		clock:          clock,
		logger:         logger.With("worker", "rtmanager.reconcile"),
		newToken:       newToken,
	}, nil
}

// ReconcileNow performs one full reconciliation pass synchronously.
// It is intended for the startup path described in
// `rtmanager/README.md §Startup dependencies` (step 6). Per-game
// errors are absorbed into telemetry and logs; only ctx errors are
// surfaced to the caller so a cancelled startup aborts immediately.
func (reconciler *Reconciler) ReconcileNow(ctx context.Context) error {
	if reconciler == nil {
		return errors.New("reconcile now: nil reconciler")
	}
	if ctx == nil {
		return errors.New("reconcile now: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}
	reconciler.tick(ctx)
	return ctx.Err()
}

// Run drives the periodic reconciliation loop. It does not perform an
// immediate first pass — `ReconcileNow` covers that path; the first
// tick fires after `Interval`. Run terminates on context cancellation.
func (reconciler *Reconciler) Run(ctx context.Context) error {
	if reconciler == nil {
		return errors.New("run reconciler: nil reconciler")
	}
	if ctx == nil {
		return errors.New("run reconciler: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}

	reconciler.logger.Info("reconciler started",
		"interval", reconciler.interval.String(),
	)
	defer reconciler.logger.Info("reconciler stopped")

	ticker := time.NewTicker(reconciler.interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			reconciler.tick(ctx)
		}
	}
}

// Shutdown is a no-op; Run terminates on context cancellation.
func (reconciler *Reconciler) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown reconciler: nil context")
	}
	return nil
}

// Tick performs one reconciliation pass. Exported so tests can drive
// the reconciler deterministically without spinning a real ticker.
func (reconciler *Reconciler) Tick(ctx context.Context) {
	reconciler.tick(ctx)
}

// tick executes one full pass: list Docker containers + PG records,
// resolve drift, and apply lease-guarded mutations for each affected
// game.
func (reconciler *Reconciler) tick(ctx context.Context) {
	if err := ctx.Err(); err != nil {
		return
	}

	containers, err := reconciler.docker.List(ctx, ports.ListFilter{
		Labels: map[string]string{startruntime.LabelOwner: startruntime.LabelOwnerValue},
	})
	if err != nil {
		reconciler.logger.WarnContext(ctx, "list owned containers",
			"err", err.Error(),
		)
		return
	}

	records, err := reconciler.runtimeRecords.ListByStatus(ctx, runtime.StatusRunning)
	if err != nil {
		reconciler.logger.WarnContext(ctx, "list running records",
			"err", err.Error(),
		)
		return
	}

	containerByGame := make(map[string]ports.ContainerSummary, len(containers))
	for _, summary := range containers {
		gameID := summary.Labels[startruntime.LabelGameID]
		if gameID == "" {
			continue
		}
		containerByGame[gameID] = summary
	}

	recordByGame := make(map[string]runtime.RuntimeRecord, len(records))
	for _, record := range records {
		recordByGame[record.GameID] = record
	}

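	// Adopt pass: owned containers Docker reports as running that have
	// no matching `status=running` record.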
	for gameID, summary := range containerByGame {
		if err := ctx.Err(); err != nil {
			return
		}
		if _, ok := recordByGame[gameID]; ok {
			continue
		}
		if summary.Status != dockerStateRunning {
			continue
		}
		reconciler.adoptOne(ctx, gameID, summary)
	}

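	// Dispose / observed-exited pass: running records whose container is
	// no longer listed, or whose current container now reports `exited`.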
	for _, record := range records {
		if err := ctx.Err(); err != nil {
			return
		}
		summary, ok := containerByGame[record.GameID]
		if !ok {
			reconciler.disposeOne(ctx, record)
			continue
		}
		if summary.ID != record.CurrentContainerID {
			continue
		}
		if summary.Status == dockerStateExited {
			reconciler.observedExitedOne(ctx, record, summary)
		}
	}
}

// adoptOne installs a `runtime_records` row for an unrecorded running
// container under the per-game lease.
func (reconciler *Reconciler) adoptOne(ctx context.Context, gameID string, summary ports.ContainerSummary) {
	token := reconciler.newToken()
	acquired, err := reconciler.leases.TryAcquire(ctx, gameID, token, reconciler.leaseTTL)
	if err != nil {
		reconciler.logger.WarnContext(ctx, "adopt: acquire lease",
			"game_id", gameID,
			"err", err.Error(),
		)
		return
	}
	if !acquired {
		reconciler.logger.InfoContext(ctx, "adopt: lease busy, skipping",
			"game_id", gameID,
		)
		return
	}
	defer reconciler.releaseLease(ctx, gameID, token)

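	// Re-check under the lease: the record may have appeared between the
	// lockless scan and lease acquisition.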
	if _, err := reconciler.runtimeRecords.Get(ctx, gameID); err == nil {
		reconciler.logger.InfoContext(ctx, "adopt: record appeared concurrently, skipping",
			"game_id", gameID,
		)
		return
	} else if !errors.Is(err, runtime.ErrNotFound) {
		reconciler.logger.WarnContext(ctx, "adopt: read record",
			"game_id", gameID,
			"err", err.Error(),
		)
		return
	}

	startedAt := reconciler.resolveStartedAt(ctx, summary)
	imageRef := summary.Labels[startruntime.LabelEngineImageRef]
	if imageRef == "" {
		imageRef = summary.ImageRef
	}

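	// Clamp CreatedAt to the observed StartedAt so it never post-dates the
	// container's start time.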
	now := reconciler.clock().UTC()
	createdAt := now
	if startedAt.Before(createdAt) {
		createdAt = startedAt
	}
	record := runtime.RuntimeRecord{
		GameID:             gameID,
		Status:             runtime.StatusRunning,
		CurrentContainerID: summary.ID,
		CurrentImageRef:    imageRef,
		EngineEndpoint:     reconciler.engineEndpoint(gameID),
		StatePath:          filepath.Join(reconciler.stateRoot, gameID),
		DockerNetwork:      reconciler.dockerNetwork,
		StartedAt:          &startedAt,
		LastOpAt:           now,
		CreatedAt:          createdAt,
	}
	if err := reconciler.runtimeRecords.Upsert(ctx, record); err != nil {
		reconciler.logger.ErrorContext(ctx, "adopt: upsert record",
			"game_id", gameID,
			"container_id", summary.ID,
			"err", err.Error(),
		)
		return
	}

	finishedAt := reconciler.clock().UTC()
	reconciler.bestEffortAppend(ctx, operation.OperationEntry{
		GameID:      gameID,
		OpKind:      operation.OpKindReconcileAdopt,
		OpSource:    operation.OpSourceAutoReconcile,
		ImageRef:    imageRef,
		ContainerID: summary.ID,
		Outcome:     operation.OutcomeSuccess,
		StartedAt:   now,
		FinishedAt:  &finishedAt,
	})
	reconciler.telemetry.RecordReconcileDrift(ctx, driftKindAdopt)

	logArgs := []any{
		"game_id", gameID,
		"container_id", summary.ID,
		"image_ref", imageRef,
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	reconciler.logger.InfoContext(ctx, "reconciler adopted unrecorded container", logArgs...)
}

// disposeOne transitions a `running` record whose container is missing
// in Docker to `removed` and publishes `container_disappeared`.
func (reconciler *Reconciler) disposeOne(ctx context.Context, record runtime.RuntimeRecord) {
	token := reconciler.newToken()
	acquired, err := reconciler.leases.TryAcquire(ctx, record.GameID, token, reconciler.leaseTTL)
	if err != nil {
		reconciler.logger.WarnContext(ctx, "dispose: acquire lease",
			"game_id", record.GameID,
			"err", err.Error(),
		)
		return
	}
	if !acquired {
		reconciler.logger.InfoContext(ctx, "dispose: lease busy, skipping",
			"game_id", record.GameID,
		)
		return
	}
	defer reconciler.releaseLease(ctx, record.GameID, token)

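	// Re-read under the lease and only act if the row still matches the
	// snapshot taken by the lockless scan.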
	current, err := reconciler.runtimeRecords.Get(ctx, record.GameID)
	if err != nil {
		if errors.Is(err, runtime.ErrNotFound) {
			return
		}
		reconciler.logger.WarnContext(ctx, "dispose: read record",
			"game_id", record.GameID,
			"err", err.Error(),
		)
		return
	}
	if current.Status != runtime.StatusRunning || current.CurrentContainerID != record.CurrentContainerID {
		reconciler.logger.InfoContext(ctx, "dispose: state changed, skipping",
			"game_id", record.GameID,
		)
		return
	}

	now := reconciler.clock().UTC()
	err = reconciler.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
		GameID:              record.GameID,
		ExpectedFrom:        runtime.StatusRunning,
		ExpectedContainerID: record.CurrentContainerID,
		To:                  runtime.StatusRemoved,
		Now:                 now,
	})
	if errors.Is(err, runtime.ErrConflict) || errors.Is(err, runtime.ErrNotFound) {
		reconciler.logger.InfoContext(ctx, "dispose: CAS lost, skipping",
			"game_id", record.GameID,
			"err", err.Error(),
		)
		return
	}
	if err != nil {
		reconciler.logger.ErrorContext(ctx, "dispose: update status",
			"game_id", record.GameID,
			"container_id", record.CurrentContainerID,
			"err", err.Error(),
		)
		return
	}

	reconciler.bestEffortPublishHealth(ctx, ports.HealthEventEnvelope{
		GameID:      record.GameID,
		ContainerID: record.CurrentContainerID,
		EventType:   health.EventTypeContainerDisappeared,
		OccurredAt:  now,
		Details:     containerDisappearedDetails(),
	})

	finishedAt := reconciler.clock().UTC()
	reconciler.bestEffortAppend(ctx, operation.OperationEntry{
		GameID:      record.GameID,
		OpKind:      operation.OpKindReconcileDispose,
		OpSource:    operation.OpSourceAutoReconcile,
		ImageRef:    record.CurrentImageRef,
		ContainerID: record.CurrentContainerID,
		Outcome:     operation.OutcomeSuccess,
		StartedAt:   now,
		FinishedAt:  &finishedAt,
	})
	reconciler.telemetry.RecordReconcileDrift(ctx, driftKindDispose)

	logArgs := []any{
		"game_id", record.GameID,
		"container_id", record.CurrentContainerID,
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	reconciler.logger.InfoContext(ctx, "reconciler disposed missing container", logArgs...)
}

// observedExitedOne transitions a `running` record whose container is
// reported as `exited` to `stopped` and publishes `container_exited`
// with the observed exit code. No `operation_log` entry is written;
// see decision record §6.
func (reconciler *Reconciler) observedExitedOne(ctx context.Context, record runtime.RuntimeRecord, summary ports.ContainerSummary) {
	token := reconciler.newToken()
	acquired, err := reconciler.leases.TryAcquire(ctx, record.GameID, token, reconciler.leaseTTL)
	if err != nil {
		reconciler.logger.WarnContext(ctx, "observed_exited: acquire lease",
			"game_id", record.GameID,
			"err", err.Error(),
		)
		return
	}
	if !acquired {
		reconciler.logger.InfoContext(ctx, "observed_exited: lease busy, skipping",
			"game_id", record.GameID,
		)
		return
	}
	defer reconciler.releaseLease(ctx, record.GameID, token)

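	// Re-read under the lease and only act if the row still points at the
	// exited container observed by the lockless scan.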
	current, err := reconciler.runtimeRecords.Get(ctx, record.GameID)
	if err != nil {
		if errors.Is(err, runtime.ErrNotFound) {
			return
		}
		reconciler.logger.WarnContext(ctx, "observed_exited: read record",
			"game_id", record.GameID,
			"err", err.Error(),
		)
		return
	}
	if current.Status != runtime.StatusRunning || current.CurrentContainerID != summary.ID {
		reconciler.logger.InfoContext(ctx, "observed_exited: state changed, skipping",
			"game_id", record.GameID,
		)
		return
	}

	inspect, err := reconciler.docker.InspectContainer(ctx, summary.ID)
	if err != nil {
		reconciler.logger.WarnContext(ctx, "observed_exited: inspect container",
			"game_id", record.GameID,
			"container_id", summary.ID,
			"err", err.Error(),
		)
		return
	}

	now := reconciler.clock().UTC()
	err = reconciler.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
		GameID:              record.GameID,
		ExpectedFrom:        runtime.StatusRunning,
		ExpectedContainerID: summary.ID,
		To:                  runtime.StatusStopped,
		Now:                 now,
	})
	if errors.Is(err, runtime.ErrConflict) || errors.Is(err, runtime.ErrNotFound) {
		reconciler.logger.InfoContext(ctx, "observed_exited: CAS lost, skipping",
			"game_id", record.GameID,
			"err", err.Error(),
		)
		return
	}
	if err != nil {
		reconciler.logger.ErrorContext(ctx, "observed_exited: update status",
			"game_id", record.GameID,
			"container_id", summary.ID,
			"err", err.Error(),
		)
		return
	}

	reconciler.bestEffortPublishHealth(ctx, ports.HealthEventEnvelope{
		GameID:      record.GameID,
		ContainerID: summary.ID,
		EventType:   health.EventTypeContainerExited,
		OccurredAt:  now,
		Details:     containerExitedDetails(inspect.ExitCode, inspect.OOMKilled),
	})
	reconciler.telemetry.RecordReconcileDrift(ctx, driftKindObservedExited)

	logArgs := []any{
		"game_id", record.GameID,
		"container_id", summary.ID,
		"exit_code", inspect.ExitCode,
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	reconciler.logger.InfoContext(ctx, "reconciler observed exited container", logArgs...)
}

// resolveStartedAt prefers the `com.galaxy.started_at_ms` label written
// by the start service. When the label is absent or unparseable, it
// falls back to a full inspect of the container; if inspect also fails
// or returns a zero StartedAt, the current clock is used so the record
// still validates.
func (reconciler *Reconciler) resolveStartedAt(ctx context.Context, summary ports.ContainerSummary) time.Time {
	if raw, ok := summary.Labels[startruntime.LabelStartedAtMs]; ok && raw != "" {
		if ms, err := strconv.ParseInt(raw, 10, 64); err == nil && ms > 0 {
			return time.UnixMilli(ms).UTC()
		}
	}
	inspect, err := reconciler.docker.InspectContainer(ctx, summary.ID)
	if err == nil && !inspect.StartedAt.IsZero() {
		return inspect.StartedAt.UTC()
	}
	return reconciler.clock().UTC()
}

// engineEndpoint mirrors the URL shape produced by the docker adapter
// (`internal/adapters/docker/client.go::Run`).
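// For example, with a hypothetical HostnamePrefix of "game-" (the real
// value is defined in the startruntime package) and gameID "abc123",
// the endpoint is "http://game-abc123:8080".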
func (reconciler *Reconciler) engineEndpoint(gameID string) string {
	return fmt.Sprintf("http://%s%s:8080", startruntime.HostnamePrefix, gameID)
}

// releaseLease releases the per-game lease in a fresh background
// context so a canceled tick context does not leave the lease pinned
// for its TTL.
func (reconciler *Reconciler) releaseLease(ctx context.Context, gameID, token string) {
	cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
	defer cancel()
	if err := reconciler.leases.Release(cleanupCtx, gameID, token); err != nil {
		reconciler.logger.WarnContext(ctx, "release game lease",
			"game_id", gameID,
			"err", err.Error(),
		)
	}
}

// bestEffortAppend writes one operation_log entry. A failure is logged
// and discarded; the durable runtime record (or its absence) remains
// the source of truth.
func (reconciler *Reconciler) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
	if _, err := reconciler.operationLogs.Append(ctx, entry); err != nil {
		reconciler.logger.ErrorContext(ctx, "append operation log",
			"game_id", entry.GameID,
			"op_kind", string(entry.OpKind),
			"err", err.Error(),
		)
	}
}

// bestEffortPublishHealth emits one health event + snapshot upsert.
// Failures degrade silently per `rtmanager/README.md §Notification
// Contracts`; the runtime record remains the source of truth.
func (reconciler *Reconciler) bestEffortPublishHealth(ctx context.Context, envelope ports.HealthEventEnvelope) {
	if err := reconciler.healthEvents.Publish(ctx, envelope); err != nil {
		reconciler.logger.ErrorContext(ctx, "publish health event",
			"game_id", envelope.GameID,
			"container_id", envelope.ContainerID,
			"event_type", string(envelope.EventType),
			"err", err.Error(),
		)
		return
	}
	reconciler.telemetry.RecordHealthEvent(ctx, string(envelope.EventType))
}

// containerExitedDetails matches the JSON shape produced by the events
// listener so consumers see a single contracted payload regardless of
// the source.
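// An illustrative payload: {"exit_code":137,"oom":true}.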
func containerExitedDetails(exitCode int, oom bool) json.RawMessage {
	payload := struct {
		ExitCode int  `json:"exit_code"`
		OOM      bool `json:"oom"`
	}{ExitCode: exitCode, OOM: oom}
	encoded, _ := json.Marshal(payload)
	return encoded
}

// containerDisappearedDetails returns the canonical empty-object
// payload required by the `container_disappeared` AsyncAPI variant.
func containerDisappearedDetails() json.RawMessage {
	return json.RawMessage(`{}`)
}

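// defaultTokenGenerator returns the lease-token generator used when
// Dependencies.NewToken is nil: 32 bytes from crypto/rand, encoded with
// unpadded base64url. If the random source fails, a fixed fallback token
// is returned rather than an error.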
func defaultTokenGenerator() func() string {
	return func() string {
		var buf [32]byte
		if _, err := rand.Read(buf[:]); err != nil {
			return "rtmanager-fallback-token"
		}
		return base64.RawURLEncoding.EncodeToString(buf[:])
	}
}