diplomail (Stage D): language detection + lazy translation cache
Replaces the LangUndetermined placeholder with whatlanggo-backed body detection on every send path, then adds a translation cache keyed on (message_id, target_lang) populated lazily on the per-message read endpoint. The noop translator that ships with Stage D returns engine="noop", which the service treats as "translation unavailable" — wiring a real backend (LibreTranslate HTTP client is the documented next step) is a one-file swap. GetMessage and ListInbox now accept a targetLang argument; the HTTP layer resolves the caller's accounts.preferred_language and forwards it. Inbox uses the cache only (never calls the translator) so bulk reads stay fast under future SaaS backends. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,79 @@
|
||||
// Package detector wraps the body-language detection used by the
|
||||
// diplomail subsystem. The package exposes a narrow `LanguageDetector`
|
||||
// interface so the implementation can be swapped without touching the
|
||||
// callers; the default whatlanggo-backed detector handles 84
|
||||
// natural languages and ships with the embedded statistical profiles.
|
||||
//
|
||||
// Detection happens only on the body. Subjects are short and
|
||||
// frequently template-like ("Re: ..."), so detecting on them adds
|
||||
// noise. The diplomail Service feeds the body, captures the BCP 47
|
||||
// tag returned here, and stores it in `diplomail_messages.body_lang`.
|
||||
package detector
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/abadojack/whatlanggo"
|
||||
)
|
||||
|
||||
// Undetermined is the BCP 47 "und" placeholder returned (and stored)
// whenever detection cannot confidently identify a language: the body
// is empty, the body is too short, or the detector declines to label
// it (for example, mixed scripts).
const Undetermined = "und"
|
||||
|
||||
// LanguageDetector is the narrow, read-only contract diplomail relies
// on to label a message body with a language tag.
//
// Implementations must never panic and have no error return: when a
// language cannot be determined they report `Undetermined` instead.
type LanguageDetector interface {
	// Detect returns the language tag for body, or `Undetermined`
	// when no language can be identified.
	Detect(body string) string
}
|
||||
|
||||
// New returns the package-default detector backed by `whatlanggo`.
|
||||
// The instance is safe for concurrent use; whatlanggo's `Detect`
|
||||
// reads the embedded profiles without state mutation. Callers that
|
||||
// want a fixed allow-list can build their own implementation around
|
||||
// the same interface.
|
||||
func New() LanguageDetector {
|
||||
return &whatlangDetector{}
|
||||
}
|
||||
|
||||
// whatlangDetector is the field-less LanguageDetector implementation
// returned by New; its Detect method delegates to whatlanggo.
type whatlangDetector struct{}
|
||||
|
||||
// minRunes is the minimum body length, in runes, that Detect requires
// before consulting whatlanggo. On shorter bodies whatlanggo can flip
// between closely related languages, so Detect returns `Undetermined`
// for them and lets the noop translator skip the slot.
//
// NOTE(review): the value is meant to match whatlanggo's "stable above
// ~25 runes" guidance — confirm against the upstream documentation.
const minRunes = 25
|
||||
|
||||
// Detect returns the BCP 47 tag for body, or `Undetermined` when the
|
||||
// body is empty / too short / whatlanggo refuses to label it. The
|
||||
// trim is applied so leading whitespace does not bias the script
|
||||
// detector toward Latin. We deliberately do not gate on
|
||||
// `info.IsReliable()` because the gate is too conservative for the
|
||||
// short sentences typical of in-game mail; a misclassification only
|
||||
// hurts the translation cache key, never correctness.
|
||||
func (d *whatlangDetector) Detect(body string) string {
|
||||
body = strings.TrimSpace(body)
|
||||
if body == "" {
|
||||
return Undetermined
|
||||
}
|
||||
if utf8.RuneCountInString(body) < minRunes {
|
||||
return Undetermined
|
||||
}
|
||||
info := whatlanggo.Detect(body)
|
||||
tag := info.Lang.Iso6391()
|
||||
if tag == "" {
|
||||
return Undetermined
|
||||
}
|
||||
return tag
|
||||
}
|
||||
|
||||
// NoopDetector returns the placeholder unconditionally. Used by
|
||||
// tests and by Stage A code paths that predate the real detector.
|
||||
type NoopDetector struct{}
|
||||
|
||||
// Detect always returns `Undetermined` regardless of input.
|
||||
func (NoopDetector) Detect(string) string { return Undetermined }
|
||||
@@ -0,0 +1,49 @@
|
||||
package detector
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestDetectKnownLanguages(t *testing.T) {
|
||||
t.Parallel()
|
||||
d := New()
|
||||
cases := []struct {
|
||||
name string
|
||||
text string
|
||||
want string
|
||||
}{
|
||||
{
|
||||
name: "english paragraph",
|
||||
text: "The trade agreement should be signed before the next turn. " +
|
||||
"I expect a written response by the time the engine generates the next report.",
|
||||
want: "en",
|
||||
},
|
||||
{
|
||||
name: "russian paragraph",
|
||||
text: "Привет! Я предлагаю заключить дипломатическое соглашение и провести " +
|
||||
"совместную операцию по освоению гиперпространственных маршрутов. " +
|
||||
"Жду твоего письменного ответа до конца следующего хода игры, " +
|
||||
"чтобы мы успели согласовать детали и подписать договор вовремя.",
|
||||
want: "ru",
|
||||
},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
tc := tc
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
got := d.Detect(tc.text)
|
||||
if got != tc.want {
|
||||
t.Fatalf("Detect = %q, want %q", got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestDetectShortOrEmpty(t *testing.T) {
|
||||
t.Parallel()
|
||||
d := New()
|
||||
short := []string{"", "hi", " "}
|
||||
for _, s := range short {
|
||||
if got := d.Detect(s); got != Undetermined {
|
||||
t.Errorf("Detect(%q) = %q, want %q", s, got, Undetermined)
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user