// Package stopruntime implements the `stop` lifecycle operation owned by
// Runtime Manager. The service is the single orchestrator behind both
// the asynchronous `runtime:stop_jobs` consumer and the synchronous
// `POST /api/v1/internal/runtimes/{game_id}/stop` REST handler. It is
// also the inner stop step of the restart and patch services, which
// call Run while holding the outer per-game lease.
//
// Lifecycle and failure-mode semantics follow `rtmanager/README.md
// §Lifecycles → Stop`. Design rationale is captured in
// `rtmanager/docs/services.md`.
package stopruntime

import (
	"context"
	"crypto/rand"
	"encoding/base64"
	"encoding/json"
	"errors"
	"fmt"
	"log/slog"
	"strings"
	"time"

	"galaxy/rtmanager/internal/config"
	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/logging"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/startruntime"
	"galaxy/rtmanager/internal/telemetry"
)

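// Usage sketch (illustrative, not part of this package's API): how a caller
// might wire the service and read business outcomes from Result rather than
// from the Go error. The concrete dependency values (recordStore,
// dockerClient, ...) and the StopReason / OpSource values are placeholders,
// not names defined here.
//
//	svc, err := stopruntime.NewService(stopruntime.Dependencies{
//		RuntimeRecords: recordStore,
//		OperationLogs:  opLogStore,
//		Docker:         dockerClient,
//		Leases:         leaseStore,
//		HealthEvents:   healthPublisher,
//		Container:      cfg.Container,
//		Coordination:   cfg.Coordination,
//		Telemetry:      tel,
//	})
//	if err != nil {
//		// construction error: a nil dependency or invalid config
//	}
//
//	result, err := svc.Handle(ctx, stopruntime.Input{
//		GameID:    "game-123",
//		Reason:    someStopReason, // any valid StopReason value
//		OpSource:  someOpSource,   // any valid operation.OpSource value
//		SourceRef: requestID,
//	})
//	if err != nil {
//		// non-business failure only (nil service / nil context)
//	}
//	switch {
//	case result.Outcome == operation.OutcomeFailure:
//		// stable failure: inspect result.ErrorCode / result.ErrorMessage
//	case result.ErrorCode == startruntime.ErrorCodeReplayNoOp:
//		// idempotent replay: the record was already stopped or removed
//	default:
//		// fresh success: result.Record carries the updated runtime record
//	}
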
// leaseReleaseTimeout bounds the deferred lease-release call. A fresh
// background context is used so the release runs even when the request
// context was already canceled.
const leaseReleaseTimeout = 5 * time.Second

// Input stores the per-call arguments for one stop operation.
type Input struct {
	// GameID identifies the platform game to stop.
	GameID string

	// Reason classifies the trigger of the stop. Required.
	Reason StopReason

	// OpSource classifies how the request entered Runtime Manager.
	// Required: every operation_log entry carries an op_source.
	OpSource operation.OpSource

	// SourceRef stores the optional opaque per-source reference (Redis
	// Stream entry id, REST request id, admin user id). Empty when the
	// caller does not provide one. For inner calls invoked by the
	// restart and patch orchestrators it carries the outer correlation
	// id so the three operation_log entries share it.
	SourceRef string
}

// Validate reports whether input carries the structural invariants the
// service requires.
func (input Input) Validate() error {
	if strings.TrimSpace(input.GameID) == "" {
		return fmt.Errorf("game id must not be empty")
	}
	if !input.OpSource.IsKnown() {
		return fmt.Errorf("op source %q is unsupported", input.OpSource)
	}
	if err := input.Reason.Validate(); err != nil {
		return err
	}
	return nil
}

// Result stores the deterministic outcome of one Handle / Run call.
type Result struct {
	// Record carries the runtime record installed by the operation.
	// Populated on success and on idempotent replay; zero on failure.
	Record runtime.RuntimeRecord

	// Outcome reports whether the operation completed (success) or
	// produced a stable failure code.
	Outcome operation.Outcome

	// ErrorCode stores the stable error code on failure, or
	// `replay_no_op` on idempotent replay. Empty for fresh successes.
	ErrorCode string

	// ErrorMessage stores the operator-readable detail on failure.
	// Empty for successes.
	ErrorMessage string
}

// Dependencies groups the collaborators required by Service.
type Dependencies struct {
	// RuntimeRecords reads and updates the durable runtime record.
	RuntimeRecords ports.RuntimeRecordStore

	// OperationLogs records the success / failure audit entry.
	OperationLogs ports.OperationLogStore

	// Docker drives the Docker daemon (container stop).
	Docker ports.DockerClient

	// Leases serialises operations against the same game id.
	Leases ports.GameLeaseStore

	// HealthEvents publishes `runtime:health_events` and upserts the
	// matching `health_snapshots` row. Used on the vanished-container
	// path to emit `container_disappeared`.
	HealthEvents ports.HealthEventPublisher

	// Container groups the per-container settings consumed at stop time
	// (the graceful stop timeout).
	Container config.ContainerConfig

	// Coordination supplies the per-game lease TTL.
	Coordination config.CoordinationConfig

	// Telemetry records stop outcomes and lease latency. Required.
	Telemetry *telemetry.Runtime

	// Logger records structured service-level events. Defaults to
	// `slog.Default()` when nil.
	Logger *slog.Logger

	// Clock supplies the wall-clock used for operation timestamps.
	// Defaults to `time.Now` when nil.
	Clock func() time.Time

	// NewToken supplies a unique opaque lease token. Defaults to a
	// 32-byte random base64url string when nil. Tests may override.
	NewToken func() string
}

// Service executes the stop lifecycle operation.
type Service struct {
	runtimeRecords ports.RuntimeRecordStore
	operationLogs  ports.OperationLogStore
	docker         ports.DockerClient
	leases         ports.GameLeaseStore
	healthEvents   ports.HealthEventPublisher

	stopTimeout time.Duration
	leaseTTL    time.Duration

	telemetry *telemetry.Runtime
	logger    *slog.Logger

	clock    func() time.Time
	newToken func() string
}

// NewService constructs one Service from deps.
func NewService(deps Dependencies) (*Service, error) {
	switch {
	case deps.RuntimeRecords == nil:
		return nil, errors.New("new stop runtime service: nil runtime records")
	case deps.OperationLogs == nil:
		return nil, errors.New("new stop runtime service: nil operation logs")
	case deps.Docker == nil:
		return nil, errors.New("new stop runtime service: nil docker client")
	case deps.Leases == nil:
		return nil, errors.New("new stop runtime service: nil lease store")
	case deps.HealthEvents == nil:
		return nil, errors.New("new stop runtime service: nil health events publisher")
	case deps.Telemetry == nil:
		return nil, errors.New("new stop runtime service: nil telemetry runtime")
	}
	if err := deps.Container.Validate(); err != nil {
		return nil, fmt.Errorf("new stop runtime service: container config: %w", err)
	}
	if err := deps.Coordination.Validate(); err != nil {
		return nil, fmt.Errorf("new stop runtime service: coordination config: %w", err)
	}

	clock := deps.Clock
	if clock == nil {
		clock = time.Now
	}
	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}
	logger = logger.With("service", "rtmanager.stopruntime")

	newToken := deps.NewToken
	if newToken == nil {
		newToken = defaultTokenGenerator()
	}

	return &Service{
		runtimeRecords: deps.RuntimeRecords,
		operationLogs:  deps.OperationLogs,
		docker:         deps.Docker,
		leases:         deps.Leases,
		healthEvents:   deps.HealthEvents,
		stopTimeout:    deps.Container.StopTimeout,
		leaseTTL:       deps.Coordination.GameLeaseTTL,
		telemetry:      deps.Telemetry,
		logger:         logger,
		clock:          clock,
		newToken:       newToken,
	}, nil
}

// Handle executes one stop operation end-to-end. The Go-level error
// return is reserved for non-business failures (nil context, nil
// receiver). Every business outcome — success, idempotent replay, or
// any of the stable failure modes — flows through Result.
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
	if service == nil {
		return Result{}, errors.New("stop runtime: nil service")
	}
	if ctx == nil {
		return Result{}, errors.New("stop runtime: nil context")
	}

	opStartedAt := service.clock().UTC()

	if err := input.Validate(); err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInvalidRequest,
			errorMessage: err.Error(),
		}), nil
	}

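	// Acquire the per-game lease with a fresh opaque token and record the
	// acquisition latency. A lease-store error is reported as
	// startruntime.ErrorCodeServiceUnavailable; a lease already held by
	// another operation is reported as startruntime.ErrorCodeConflict.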
	token := service.newToken()
	leaseStart := service.clock()
	acquired, err := service.leases.TryAcquire(ctx, input.GameID, token, service.leaseTTL)
	service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(leaseStart))
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeServiceUnavailable,
			errorMessage: fmt.Sprintf("acquire game lease: %s", err.Error()),
		}), nil
	}
	if !acquired {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeConflict,
			errorMessage: "another lifecycle operation is in progress for this game",
		}), nil
	}
	defer service.releaseLease(ctx, input.GameID, token)

	return service.runUnderLease(ctx, input, opStartedAt)
}

// Run executes the stop lifecycle assuming the per-game lease is
// already held by the caller. The method is reserved for orchestrator
// services in `internal/service/` that compose stop with another
// operation under a single outer lease (restart and patch). External
// callers must use Handle.
func (service *Service) Run(ctx context.Context, input Input) (Result, error) {
	if service == nil {
		return Result{}, errors.New("stop runtime: nil service")
	}
	if ctx == nil {
		return Result{}, errors.New("stop runtime: nil context")
	}

	opStartedAt := service.clock().UTC()

	if err := input.Validate(); err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInvalidRequest,
			errorMessage: err.Error(),
		}), nil
	}

	return service.runUnderLease(ctx, input, opStartedAt)
}

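// Illustrative composition (a sketch, not the actual restart or patch
// service): an orchestrator that already holds the per-game lease calls Run
// directly and forwards its own correlation id through SourceRef so the
// inner stop entry shares it. The follow-up start step is elided because its
// API lives in the startruntime package, not here; all names below are
// placeholders.
//
//	stopResult, err := stopService.Run(ctx, stopruntime.Input{
//		GameID:    gameID,
//		Reason:    restartReason,  // a valid StopReason value
//		OpSource:  outerOpSource,  // the orchestrator's operation.OpSource
//		SourceRef: correlationID,  // outer correlation id shared by all entries
//	})
//	if err != nil || stopResult.Outcome == operation.OutcomeFailure {
//		// abort the composed operation; the outer lease is still held by the caller
//	}
//	// ... continue with the start step under the same outer lease ...
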
// runUnderLease executes the post-validation, lease-protected stop
// steps shared by Handle and Run.
func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) {
	existing, err := service.runtimeRecords.Get(ctx, input.GameID)
	if errors.Is(err, runtime.ErrNotFound) {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeNotFound,
			errorMessage: fmt.Sprintf("runtime record for game %q does not exist", input.GameID),
		}), nil
	}
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInternal,
			errorMessage: fmt.Sprintf("load runtime record: %s", err.Error()),
		}), nil
	}

	switch existing.Status {
	case runtime.StatusStopped, runtime.StatusRemoved:
		return service.recordReplayNoOp(ctx, opStartedAt, input, existing), nil
	case runtime.StatusRunning:
		// proceed
	default:
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInternal,
			errorMessage: fmt.Sprintf("runtime record has unsupported status %q", existing.Status),
		}), nil
	}

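	// Graceful container stop bounded by the configured stop timeout. A
	// container Docker no longer knows about is not treated as a failure
	// here; it takes the vanished-container path in handleVanished.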
	if err := service.docker.Stop(ctx, existing.CurrentContainerID, service.stopTimeout); err != nil {
		if errors.Is(err, ports.ErrContainerNotFound) {
			return service.handleVanished(ctx, input, opStartedAt, existing), nil
		}
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeServiceUnavailable,
			errorMessage: fmt.Sprintf("docker stop: %s", err.Error()),
			containerID:  existing.CurrentContainerID,
			imageRef:     existing.CurrentImageRef,
		}), nil
	}

	updateNow := service.clock().UTC()
	err = service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
		GameID:              input.GameID,
		ExpectedFrom:        runtime.StatusRunning,
		ExpectedContainerID: existing.CurrentContainerID,
		To:                  runtime.StatusStopped,
		Now:                 updateNow,
	})
	if errors.Is(err, runtime.ErrConflict) {
		// CAS race: a concurrent reconciler / restart already moved the
		// record. The desired terminal state was reached by another path.
		return service.recordReplayNoOp(ctx, opStartedAt, input, existing), nil
	}
	if errors.Is(err, runtime.ErrNotFound) {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeNotFound,
			errorMessage: fmt.Sprintf("runtime record for game %q vanished mid-stop", input.GameID),
			containerID:  existing.CurrentContainerID,
			imageRef:     existing.CurrentImageRef,
		}), nil
	}
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInternal,
			errorMessage: fmt.Sprintf("update runtime status: %s", err.Error()),
			containerID:  existing.CurrentContainerID,
			imageRef:     existing.CurrentImageRef,
		}), nil
	}

	finishedAt := service.clock().UTC()
	service.bestEffortAppend(ctx, operation.OperationEntry{
		GameID:      input.GameID,
		OpKind:      operation.OpKindStop,
		OpSource:    input.OpSource,
		SourceRef:   input.SourceRef,
		ImageRef:    existing.CurrentImageRef,
		ContainerID: existing.CurrentContainerID,
		Outcome:     operation.OutcomeSuccess,
		StartedAt:   opStartedAt,
		FinishedAt:  &finishedAt,
	})
	service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeSuccess), string(input.Reason), string(input.OpSource))

	record := existing
	record.Status = runtime.StatusStopped
	stoppedAt := updateNow
	record.StoppedAt = &stoppedAt
	record.LastOpAt = updateNow

	logArgs := []any{
		"game_id", input.GameID,
		"container_id", existing.CurrentContainerID,
		"reason", string(input.Reason),
		"op_source", string(input.OpSource),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	service.logger.InfoContext(ctx, "runtime stopped", logArgs...)

	return Result{
		Record:  record,
		Outcome: operation.OutcomeSuccess,
	}, nil
}

// handleVanished records the success outcome for the case where docker
// stop reports the container as already gone. It updates the record to
// removed, publishes container_disappeared, and returns success.
func (service *Service) handleVanished(ctx context.Context, input Input, opStartedAt time.Time, existing runtime.RuntimeRecord) Result {
	updateNow := service.clock().UTC()
	err := service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
		GameID:              input.GameID,
		ExpectedFrom:        runtime.StatusRunning,
		ExpectedContainerID: existing.CurrentContainerID,
		To:                  runtime.StatusRemoved,
		Now:                 updateNow,
	})
	if errors.Is(err, runtime.ErrConflict) {
		return service.recordReplayNoOp(ctx, opStartedAt, input, existing)
	}
	if err != nil && !errors.Is(err, runtime.ErrNotFound) {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInternal,
			errorMessage: fmt.Sprintf("update runtime status to removed: %s", err.Error()),
			containerID:  existing.CurrentContainerID,
			imageRef:     existing.CurrentImageRef,
		})
	}

	service.bestEffortPublishHealth(ctx, ports.HealthEventEnvelope{
		GameID:      input.GameID,
		ContainerID: existing.CurrentContainerID,
		EventType:   health.EventTypeContainerDisappeared,
		OccurredAt:  updateNow,
		Details:     emptyHealthDetails(),
	})

	finishedAt := service.clock().UTC()
	service.bestEffortAppend(ctx, operation.OperationEntry{
		GameID:      input.GameID,
		OpKind:      operation.OpKindStop,
		OpSource:    input.OpSource,
		SourceRef:   input.SourceRef,
		ImageRef:    existing.CurrentImageRef,
		ContainerID: existing.CurrentContainerID,
		Outcome:     operation.OutcomeSuccess,
		StartedAt:   opStartedAt,
		FinishedAt:  &finishedAt,
	})
	service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeSuccess), string(input.Reason), string(input.OpSource))
	service.telemetry.RecordHealthEvent(ctx, string(health.EventTypeContainerDisappeared))

	record := existing
	record.Status = runtime.StatusRemoved
	record.CurrentContainerID = ""
	removedAt := updateNow
	record.RemovedAt = &removedAt
	record.LastOpAt = updateNow

	logArgs := []any{
		"game_id", input.GameID,
		"container_id", existing.CurrentContainerID,
		"reason", string(input.Reason),
		"op_source", string(input.OpSource),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	service.logger.InfoContext(ctx, "runtime stop on vanished container", logArgs...)

	return Result{
		Record:  record,
		Outcome: operation.OutcomeSuccess,
	}
}

// recordReplayNoOp records the idempotent replay outcome and returns the
// existing record unchanged.
func (service *Service) recordReplayNoOp(ctx context.Context, opStartedAt time.Time, input Input, existing runtime.RuntimeRecord) Result {
	finishedAt := service.clock().UTC()
	service.bestEffortAppend(ctx, operation.OperationEntry{
		GameID:      input.GameID,
		OpKind:      operation.OpKindStop,
		OpSource:    input.OpSource,
		SourceRef:   input.SourceRef,
		ImageRef:    existing.CurrentImageRef,
		ContainerID: existing.CurrentContainerID,
		Outcome:     operation.OutcomeSuccess,
		ErrorCode:   startruntime.ErrorCodeReplayNoOp,
		StartedAt:   opStartedAt,
		FinishedAt:  &finishedAt,
	})
	service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeSuccess), string(input.Reason), string(input.OpSource))

	logArgs := []any{
		"game_id", input.GameID,
		"container_id", existing.CurrentContainerID,
		"reason", string(input.Reason),
		"op_source", string(input.OpSource),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	service.logger.InfoContext(ctx, "runtime stop replay no-op", logArgs...)

	return Result{
		Record:    existing,
		Outcome:   operation.OutcomeSuccess,
		ErrorCode: startruntime.ErrorCodeReplayNoOp,
	}
}

// failureCtx groups the inputs to recordFailure so the runUnderLease
// method stays readable.
type failureCtx struct {
	opStartedAt  time.Time
	input        Input
	errorCode    string
	errorMessage string
	containerID  string
	imageRef     string
}

// recordFailure records the failure operation_log entry and emits
// telemetry. The runtime record stays untouched.
func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result {
	finishedAt := service.clock().UTC()
	service.bestEffortAppend(ctx, operation.OperationEntry{
		GameID:       fc.input.GameID,
		OpKind:       operation.OpKindStop,
		OpSource:     fc.input.OpSource,
		SourceRef:    fc.input.SourceRef,
		ImageRef:     fc.imageRef,
		ContainerID:  fc.containerID,
		Outcome:      operation.OutcomeFailure,
		ErrorCode:    fc.errorCode,
		ErrorMessage: fc.errorMessage,
		StartedAt:    fc.opStartedAt,
		FinishedAt:   &finishedAt,
	})
	service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeFailure), string(fc.input.Reason), string(fc.input.OpSource))

	logArgs := []any{
		"game_id", fc.input.GameID,
		"reason", string(fc.input.Reason),
		"op_source", string(fc.input.OpSource),
		"error_code", fc.errorCode,
		"error_message", fc.errorMessage,
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	service.logger.WarnContext(ctx, "runtime stop failed", logArgs...)

	return Result{
		Outcome:      operation.OutcomeFailure,
		ErrorCode:    fc.errorCode,
		ErrorMessage: fc.errorMessage,
	}
}

// releaseLease releases the per-game lease in a fresh background context
// so a canceled request context does not leave the lease pinned for its
// TTL.
func (service *Service) releaseLease(ctx context.Context, gameID, token string) {
	cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
	defer cancel()
	if err := service.leases.Release(cleanupCtx, gameID, token); err != nil {
		service.logger.WarnContext(ctx, "release game lease",
			"game_id", gameID,
			"err", err.Error(),
		)
	}
}

// bestEffortAppend writes one operation_log entry. A failure is logged
// and discarded; the durable runtime record (or its absence) remains
// the source of truth.
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
	if _, err := service.operationLogs.Append(ctx, entry); err != nil {
		service.logger.ErrorContext(ctx, "append operation log",
			"game_id", entry.GameID,
			"op_kind", string(entry.OpKind),
			"outcome", string(entry.Outcome),
			"error_code", entry.ErrorCode,
			"err", err.Error(),
		)
	}
}

// bestEffortPublishHealth emits one health event + snapshot upsert.
// Failures degrade silently per `rtmanager/README.md §Notification
// Contracts`; the runtime record remains the source of truth.
func (service *Service) bestEffortPublishHealth(ctx context.Context, envelope ports.HealthEventEnvelope) {
	if err := service.healthEvents.Publish(ctx, envelope); err != nil {
		service.logger.ErrorContext(ctx, "publish health event",
			"game_id", envelope.GameID,
			"container_id", envelope.ContainerID,
			"event_type", string(envelope.EventType),
			"err", err.Error(),
		)
	}
}

// defaultTokenGenerator returns a function that produces 32-byte
// base64url-encoded tokens. Mirrors the start service: a degraded
// entropy source falls back to a sentinel token so the next TryAcquire
// observes a collision rather than a panic.
func defaultTokenGenerator() func() string {
	return func() string {
		var buf [32]byte
		if _, err := rand.Read(buf[:]); err != nil {
			return "rtmanager-fallback-token"
		}
		return base64.RawURLEncoding.EncodeToString(buf[:])
	}
}

// emptyHealthDetails returns the canonical empty-object payload required
// by the `container_disappeared` AsyncAPI variant.
func emptyHealthDetails() json.RawMessage {
	return json.RawMessage("{}")
}