// Package detector wraps the body-language detection used by the // diplomail subsystem. The package exposes a narrow `LanguageDetector` // interface so the implementation can be swapped without touching the // callers; the default backed-by-whatlanggo detector handles 84 // natural languages and ships with the embedded statistical profiles. // // Detection happens only on the body. Subjects are short and // frequently template-like ("Re: ..."), so detecting on them adds // noise. The diplomail Service feeds the body, captures the BCP 47 // tag returned here, and stores it in `diplomail_messages.body_lang`. package detector import ( "strings" "unicode/utf8" "github.com/abadojack/whatlanggo" ) // Undetermined is the BCP 47 placeholder stored when detection cannot // confidently identify a language (empty body, too-short body, mixed // scripts the detector refuses to bet on). const Undetermined = "und" // LanguageDetector is the read-only surface diplomail consumes when // it needs to label a message body. Detect must never panic and // must never return an error: detection failure simply yields // `Undetermined`. type LanguageDetector interface { Detect(body string) string } // New returns the package-default detector backed by `whatlanggo`. // The instance is safe for concurrent use; whatlanggo's `Detect` // reads the embedded profiles without state mutation. Callers that // want a fixed allow-list can build their own implementation around // the same interface. func New() LanguageDetector { return &whatlangDetector{} } type whatlangDetector struct{} // minRunes is the lower bound on body length below which whatlanggo // can flip between near-synonyms; for shorter bodies we return // `Undetermined` and let the noop translator skip the slot. The // value matches whatlanggo's documented "stable above ~25 runes" // guidance. const minRunes = 25 // Detect returns the BCP 47 tag for body, or `Undetermined` when the // body is empty / too short / whatlanggo refuses to label it. The // trim is applied so leading whitespace does not bias the script // detector toward Latin. We deliberately do not gate on // `info.IsReliable()` because the gate is too conservative for the // short sentences typical of in-game mail; a misclassification only // hurts the translation cache key, never correctness. func (d *whatlangDetector) Detect(body string) string { body = strings.TrimSpace(body) if body == "" { return Undetermined } if utf8.RuneCountInString(body) < minRunes { return Undetermined } info := whatlanggo.Detect(body) tag := info.Lang.Iso6391() if tag == "" { return Undetermined } return tag } // NoopDetector returns the placeholder unconditionally. Used by // tests and by Stage A code paths that predate the real detector. type NoopDetector struct{} // Detect always returns `Undetermined` regardless of input. func (NoopDetector) Detect(string) string { return Undetermined }