376 lines
13 KiB
Go
376 lines
13 KiB
Go
// Package adminpatch implements the admin patch service-layer
// orchestrator owned by Game Master. It is driven by Admin Service or
// system administrators through
// `POST /api/v1/internal/runtimes/{game_id}/patch` and tells Runtime
// Manager to recreate the engine container with a new image, then
// rotates `runtime_records.current_image_ref` and
// `runtime_records.current_engine_version` while keeping the runtime in
// `running`.
//
// Lifecycle and failure-mode semantics follow `gamemaster/README.md
// §Lifecycles → Patch`. Design rationale (the dedicated UpdateImage
// port, rejection of deprecated targets, `service_unavailable` mapping
// for RTM failures) is captured in
// `gamemaster/docs/stage17-admin-operations.md`.
|
|
package adminpatch
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"log/slog"
|
|
"strings"
|
|
"time"
|
|
|
|
"galaxy/gamemaster/internal/domain/engineversion"
|
|
"galaxy/gamemaster/internal/domain/operation"
|
|
"galaxy/gamemaster/internal/domain/runtime"
|
|
"galaxy/gamemaster/internal/logging"
|
|
"galaxy/gamemaster/internal/ports"
|
|
"galaxy/gamemaster/internal/telemetry"
|
|
)
|
|
|
|
// Input stores the per-call arguments for one admin patch operation.
|
|
type Input struct {
|
|
// GameID identifies the runtime to patch.
|
|
GameID string
|
|
|
|
// Version stores the target engine version (semver). Must be
|
|
// present in `engine_versions` with `status=active` and a same
|
|
// major.minor as the runtime's current version.
|
|
Version string
|
|
|
|
// OpSource classifies how the request entered Game Master. Used to
|
|
// stamp `operation_log.op_source`. Defaults to `admin_rest` when
|
|
// missing or unrecognised.
|
|
OpSource operation.OpSource
|
|
|
|
// SourceRef stores the optional opaque per-source reference (REST
|
|
// request id, admin user id). Empty when the caller does not
|
|
// provide one.
|
|
SourceRef string
|
|
}
|
|
|
|
// Validate reports whether input carries the structural invariants the
|
|
// service requires before any store is touched.
|
|
func (input Input) Validate() error {
|
|
if strings.TrimSpace(input.GameID) == "" {
|
|
return fmt.Errorf("game id must not be empty")
|
|
}
|
|
if _, err := engineversion.ParseSemver(input.Version); err != nil {
|
|
return fmt.Errorf("version: %w", err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Result stores the deterministic outcome of one Handle call. Business
|
|
// outcomes flow through Result; the Go-level error return is reserved
|
|
// for non-business failures (nil context, nil receiver).
|
|
type Result struct {
|
|
// Record carries the post-rotation runtime record. Populated on
|
|
// success; zero on early-rejection failures.
|
|
Record runtime.RuntimeRecord
|
|
|
|
// Outcome reports whether the operation completed (success) or
|
|
// produced a stable failure code.
|
|
Outcome operation.Outcome
|
|
|
|
// ErrorCode stores the stable error code on failure. Empty on
|
|
// success.
|
|
ErrorCode string
|
|
|
|
// ErrorMessage stores the operator-readable detail on failure.
|
|
// Empty on success.
|
|
ErrorMessage string
|
|
}
|
|
|
|
// IsSuccess reports whether the result represents a successful
|
|
// operation.
|
|
func (result Result) IsSuccess() bool {
|
|
return result.Outcome == operation.OutcomeSuccess
|
|
}
|
|
|
|
// Dependencies groups the collaborators required by Service.
|
|
type Dependencies struct {
|
|
// RuntimeRecords drives the row read plus the post-RTM image
|
|
// rotation under a CAS guard.
|
|
RuntimeRecords ports.RuntimeRecordStore
|
|
|
|
// EngineVersions resolves the target version's image ref and
|
|
// status.
|
|
EngineVersions ports.EngineVersionStore
|
|
|
|
// OperationLogs records the audit entry.
|
|
OperationLogs ports.OperationLogStore
|
|
|
|
// RTM drives the Runtime Manager patch call.
|
|
RTM ports.RTMClient
|
|
|
|
// Telemetry is required by the audit/log path. The Stage 17
|
|
// service does not introduce a dedicated counter; outcome metrics
|
|
// land under the future Admin Service surface.
|
|
Telemetry *telemetry.Runtime
|
|
|
|
// Logger records structured service-level events. Defaults to
|
|
// `slog.Default()` when nil.
|
|
Logger *slog.Logger
|
|
|
|
// Clock supplies the wall-clock used for operation timestamps.
|
|
// Defaults to `time.Now` when nil.
|
|
Clock func() time.Time
|
|
}
|
|
|
|
// Service executes the admin patch lifecycle operation.
|
|
type Service struct {
|
|
runtimeRecords ports.RuntimeRecordStore
|
|
engineVersions ports.EngineVersionStore
|
|
operationLogs ports.OperationLogStore
|
|
rtm ports.RTMClient
|
|
|
|
telemetry *telemetry.Runtime
|
|
logger *slog.Logger
|
|
clock func() time.Time
|
|
}
|
|
|
|
// NewService constructs one Service from deps.
|
|
func NewService(deps Dependencies) (*Service, error) {
|
|
switch {
|
|
case deps.RuntimeRecords == nil:
|
|
return nil, errors.New("new admin patch service: nil runtime records")
|
|
case deps.EngineVersions == nil:
|
|
return nil, errors.New("new admin patch service: nil engine versions")
|
|
case deps.OperationLogs == nil:
|
|
return nil, errors.New("new admin patch service: nil operation logs")
|
|
case deps.RTM == nil:
|
|
return nil, errors.New("new admin patch service: nil rtm client")
|
|
case deps.Telemetry == nil:
|
|
return nil, errors.New("new admin patch service: nil telemetry runtime")
|
|
}
|
|
|
|
clock := deps.Clock
|
|
if clock == nil {
|
|
clock = time.Now
|
|
}
|
|
logger := deps.Logger
|
|
if logger == nil {
|
|
logger = slog.Default()
|
|
}
|
|
logger = logger.With("service", "gamemaster.adminpatch")
|
|
|
|
return &Service{
|
|
runtimeRecords: deps.RuntimeRecords,
|
|
engineVersions: deps.EngineVersions,
|
|
operationLogs: deps.OperationLogs,
|
|
rtm: deps.RTM,
|
|
telemetry: deps.Telemetry,
|
|
logger: logger,
|
|
clock: clock,
|
|
}, nil
|
|
}
|
|
|
|
// Handle executes one admin patch operation end-to-end. The Go-level
|
|
// error return is reserved for non-business failures (nil context, nil
|
|
// receiver). Every business outcome flows through Result.
|
|
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
|
|
if service == nil {
|
|
return Result{}, errors.New("admin patch: nil service")
|
|
}
|
|
if ctx == nil {
|
|
return Result{}, errors.New("admin patch: nil context")
|
|
}
|
|
|
|
opStartedAt := service.clock().UTC()
|
|
|
|
if err := input.Validate(); err != nil {
|
|
return service.recordFailure(ctx, opStartedAt, input,
|
|
ErrorCodeInvalidRequest, err.Error()), nil
|
|
}
|
|
|
|
record, err := service.runtimeRecords.Get(ctx, input.GameID)
|
|
switch {
|
|
case errors.Is(err, runtime.ErrNotFound):
|
|
return service.recordFailure(ctx, opStartedAt, input,
|
|
ErrorCodeRuntimeNotFound, "runtime record does not exist"), nil
|
|
case err != nil:
|
|
return service.recordFailure(ctx, opStartedAt, input,
|
|
ErrorCodeServiceUnavailable, fmt.Sprintf("get runtime record: %s", err.Error())), nil
|
|
}
|
|
if record.Status != runtime.StatusRunning {
|
|
return service.recordFailure(ctx, opStartedAt, input,
|
|
ErrorCodeRuntimeNotRunning,
|
|
fmt.Sprintf("runtime status is %q, expected %q",
|
|
record.Status, runtime.StatusRunning)), nil
|
|
}
|
|
|
|
target, err := service.engineVersions.Get(ctx, input.Version)
|
|
switch {
|
|
case errors.Is(err, engineversion.ErrNotFound):
|
|
return service.recordFailure(ctx, opStartedAt, input,
|
|
ErrorCodeEngineVersionNotFound,
|
|
fmt.Sprintf("engine version %q not found", input.Version)), nil
|
|
case err != nil:
|
|
return service.recordFailure(ctx, opStartedAt, input,
|
|
ErrorCodeServiceUnavailable, fmt.Sprintf("get engine version: %s", err.Error())), nil
|
|
}
|
|
if target.Status != engineversion.StatusActive {
|
|
return service.recordFailure(ctx, opStartedAt, input,
|
|
ErrorCodeEngineVersionNotFound,
|
|
fmt.Sprintf("engine version %q is %q, expected %q",
|
|
input.Version, target.Status, engineversion.StatusActive)), nil
|
|
}
|
|
|
|
patchOK, semErr := engineversion.IsPatchUpgrade(record.CurrentEngineVersion, input.Version)
|
|
if semErr != nil {
|
|
return service.recordFailure(ctx, opStartedAt, input,
|
|
ErrorCodeInvalidRequest, fmt.Sprintf("compare semver: %s", semErr.Error())), nil
|
|
}
|
|
if !patchOK {
|
|
return service.recordFailure(ctx, opStartedAt, input,
|
|
ErrorCodeSemverPatchOnly,
|
|
fmt.Sprintf("target %q is not a same-major.minor patch of %q",
|
|
input.Version, record.CurrentEngineVersion)), nil
|
|
}
|
|
|
|
if err := service.rtm.Patch(ctx, input.GameID, target.ImageRef); err != nil {
|
|
return service.recordFailure(ctx, opStartedAt, input,
|
|
ErrorCodeServiceUnavailable, fmt.Sprintf("rtm patch: %s", err.Error())), nil
|
|
}
|
|
|
|
rotatedAt := service.clock().UTC()
|
|
updateErr := service.runtimeRecords.UpdateImage(ctx, ports.UpdateImageInput{
|
|
GameID: input.GameID,
|
|
ExpectedStatus: runtime.StatusRunning,
|
|
CurrentImageRef: target.ImageRef,
|
|
CurrentEngineVersion: input.Version,
|
|
Now: rotatedAt,
|
|
})
|
|
switch {
|
|
case updateErr == nil:
|
|
case errors.Is(updateErr, runtime.ErrConflict):
|
|
return service.recordFailure(ctx, opStartedAt, input,
|
|
ErrorCodeConflict,
|
|
fmt.Sprintf("runtime status changed during patch: %s", updateErr.Error())), nil
|
|
case errors.Is(updateErr, runtime.ErrNotFound):
|
|
return service.recordFailure(ctx, opStartedAt, input,
|
|
ErrorCodeRuntimeNotFound,
|
|
fmt.Sprintf("runtime record disappeared during patch: %s", updateErr.Error())), nil
|
|
default:
|
|
return service.recordFailure(ctx, opStartedAt, input,
|
|
ErrorCodeServiceUnavailable,
|
|
fmt.Sprintf("update runtime image: %s", updateErr.Error())), nil
|
|
}
|
|
|
|
persisted, reloadErr := service.runtimeRecords.Get(ctx, input.GameID)
|
|
if reloadErr != nil {
|
|
// The image rotation already committed; surface the success
|
|
// outcome with the in-memory projection so the caller still
|
|
// sees the new image_ref / engine_version.
|
|
service.logger.WarnContext(ctx, "reload runtime record after patch",
|
|
"game_id", input.GameID,
|
|
"err", reloadErr.Error(),
|
|
)
|
|
persisted = record
|
|
persisted.CurrentImageRef = target.ImageRef
|
|
persisted.CurrentEngineVersion = input.Version
|
|
persisted.UpdatedAt = rotatedAt
|
|
}
|
|
|
|
service.appendSuccessLog(ctx, opStartedAt, input)
|
|
|
|
logArgs := []any{
|
|
"game_id", input.GameID,
|
|
"new_image_ref", target.ImageRef,
|
|
"new_engine_version", input.Version,
|
|
"previous_engine_version", record.CurrentEngineVersion,
|
|
"op_source", string(fallbackOpSource(input.OpSource)),
|
|
}
|
|
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
|
service.logger.InfoContext(ctx, "runtime patched", logArgs...)
|
|
|
|
return Result{
|
|
Record: persisted,
|
|
Outcome: operation.OutcomeSuccess,
|
|
}, nil
|
|
}
|
|
|
|
// recordFailure assembles the failure Result, appends the
|
|
// operation_log failure entry, and returns the structured outcome.
|
|
func (service *Service) recordFailure(ctx context.Context, opStartedAt time.Time, input Input, errorCode string, errorMessage string) Result {
|
|
service.appendFailureLog(ctx, opStartedAt, input, errorCode, errorMessage)
|
|
|
|
logArgs := []any{
|
|
"game_id", input.GameID,
|
|
"target_version", input.Version,
|
|
"op_source", string(input.OpSource),
|
|
"error_code", errorCode,
|
|
"error_message", errorMessage,
|
|
}
|
|
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
|
service.logger.WarnContext(ctx, "admin patch rejected", logArgs...)
|
|
|
|
return Result{
|
|
Outcome: operation.OutcomeFailure,
|
|
ErrorCode: errorCode,
|
|
ErrorMessage: errorMessage,
|
|
}
|
|
}
|
|
|
|
// appendSuccessLog records the success operation_log entry.
|
|
func (service *Service) appendSuccessLog(ctx context.Context, opStartedAt time.Time, input Input) {
|
|
finishedAt := service.clock().UTC()
|
|
service.bestEffortAppend(ctx, operation.OperationEntry{
|
|
GameID: input.GameID,
|
|
OpKind: operation.OpKindPatch,
|
|
OpSource: fallbackOpSource(input.OpSource),
|
|
SourceRef: input.SourceRef,
|
|
Outcome: operation.OutcomeSuccess,
|
|
StartedAt: opStartedAt,
|
|
FinishedAt: &finishedAt,
|
|
})
|
|
}
|
|
|
|
// appendFailureLog records the failure operation_log entry. Skipped
|
|
// when the input game id is empty so the entry validator does not
|
|
// reject an audit row that adds no value.
|
|
func (service *Service) appendFailureLog(ctx context.Context, opStartedAt time.Time, input Input, errorCode string, errorMessage string) {
|
|
if strings.TrimSpace(input.GameID) == "" {
|
|
return
|
|
}
|
|
finishedAt := service.clock().UTC()
|
|
service.bestEffortAppend(ctx, operation.OperationEntry{
|
|
GameID: input.GameID,
|
|
OpKind: operation.OpKindPatch,
|
|
OpSource: fallbackOpSource(input.OpSource),
|
|
SourceRef: input.SourceRef,
|
|
Outcome: operation.OutcomeFailure,
|
|
ErrorCode: errorCode,
|
|
ErrorMessage: errorMessage,
|
|
StartedAt: opStartedAt,
|
|
FinishedAt: &finishedAt,
|
|
})
|
|
}
|
|
|
|
// bestEffortAppend writes one operation_log entry. A failure is logged
|
|
// and discarded; the runtime row is the source of truth.
|
|
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
|
|
if _, err := service.operationLogs.Append(ctx, entry); err != nil {
|
|
service.logger.ErrorContext(ctx, "append operation log",
|
|
"game_id", entry.GameID,
|
|
"op_kind", string(entry.OpKind),
|
|
"outcome", string(entry.Outcome),
|
|
"error_code", entry.ErrorCode,
|
|
"err", err.Error(),
|
|
)
|
|
}
|
|
}
|
|
|
|
// fallbackOpSource defaults to `admin_rest` when the caller did not
|
|
// supply a known op source. Mirrors `gamemaster/README.md §Trusted
|
|
// Surfaces`.
|
|
func fallbackOpSource(source operation.OpSource) operation.OpSource {
|
|
if source.IsKnown() {
|
|
return source
|
|
}
|
|
return operation.OpSourceAdminRest
|
|
}
|