#!/usr/bin/env python3
"""Stage 2 — the "brain" of the Russian Scrabble word-list pipeline.

It reads the Stage-1 base word list (built once by ruwords so the heavy PDF is
not re-parsed) together with the grammatical notes and the singular/variant
structure, runs the whole noun-selection logic in memory, and writes a minimal
result:

    dictprep/russian/scrabble.txt  — the working dictionary (common nouns, nom. sing.)

(dictprep/russian/all.txt is the Stage-1 base.)

Every other bucket — the ambiguous tail left for manual review (undefined),
adjectives, verbs, singulars and the per-word fate map — stays in memory.
Pass --dump to also write undefined.txt, adjectives.txt, verbs.txt,
singulars.txt and fate.tsv; pass --trace WORD to ask how a single word did or
did not reach the dictionary.  The "и" variant pairs are only used in memory
(variant rescue) and are not written out.

Note: all.txt is a plain word list, so the grammatical notes, "ед." singulars
and "и" variants are read from the pdftotext output (orfo_dict_2025.txt) and
the Stage-1 side files (/tmp/ru_singulars.txt, /tmp/ru_variants.txt); the
expensive PDF parse itself runs only once.

Sources, most authoritative first: OpenCorpora (mawo-pymorphy3), libmorph
(libmorph_check), and the orthographic dictionary's own notes.
See dictprep/README.md.

Run: ru-venv/bin/python dictprep/ru_stage2.py [--dump] [--trace WORD]
"""
import argparse
import os
import re
import subprocess

HERE = os.path.dirname(os.path.abspath(__file__))
OUT_DIR = os.path.join(HERE, "russian")
SLOV = os.path.join(OUT_DIR, "orfo_dict_2025.txt")  # committed pdftotext output (source of truth)
WL_FROM, WL_TO = 452, 168808  # 1-based inclusive bounds of the column word-list section
OC_CACHE = "/tmp/oc_nouns.txt"
LIBMORPH_BIN = os.path.join(HERE, "libmorph_check")

ALPHABET = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя"
ORDER = {c: i for i, c in enumerate(ALPHABET)}
PROPER = {"Name", "Surn", "Patr", "Geox", "Orgn", "Trad"}
LIBMORPH_NOUN_CODES = set(range(7, 22)) | {24}  # 7..21 plus 24 (pluralia tantum)
ADJ_END = {"ая", "яя", "ое", "ее", "ье", "ья", "ьи"}
VERB3 = ("ет", "ёт", "ит", "ют", "ут", "ает", "яет", "ует", "уют", "нет", "жет", "чет")
GENPL = ("ов", "ёв", "ев", "ей")


def key(w):
    """Sort key for `w`: alphabet positions per ALPHABET (unknown characters sort last, as 99)."""
    return [ORDER.get(c, 99) for c in w]


def destress(s):
    """Return `s` lowercased with combining grave/acute accents (U+0300, U+0301) removed."""
    return "".join(c for c in s if ord(c) not in (0x0300, 0x0301)).lower()


def cyr_ok(w):
    """True when `w` is 2..15 characters long and consists only of lowercase а..я / ё."""
    return 2 <= len(w) <= 15 and all(("а" <= c <= "я") or c == "ё" for c in w)


def load(p):
    """Return the stripped, non-blank lines of the UTF-8 file `p`, or [] if it does not exist."""
    if not os.path.exists(p):
        return []
    # Context manager so the handle is closed deterministically.
    with open(p, encoding="utf-8") as f:
        return [l.strip() for l in f if l.strip()]


def write(path, words):
    """Write the de-duplicated `words`, sorted by `key`, one per line to `path` (UTF-8).

    The parent directory is created when needed.
    """
    parent = os.path.dirname(path)
    if parent:  # a bare file name has no directory part; makedirs("") would raise
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(sorted(set(words), key=key)) + "\n")


import mawo_pymorphy3  # noqa: E402

# NOTE: the analyzer is built at import time; `_dawg_dict` is a private attribute
# of the analyzer, so it may change between library versions.
M = mawo_pymorphy3.MorphAnalyzer()
D = M._dawg_dict


def oc_noun_lemmas():
    """Every common-noun lemma (nom. sing. / pluralia tantum) in OpenCorpora's words.dawg.

    Returns a set of lemmas that pass `cyr_ok`.
    """
    gp, pt = D.get_paradigm, D.parse_tag_string
    # Per-paradigm and per-(paradigm, index) memo tables: paradigms are shared
    # by many words, so each one is decoded only once.
    para0, tagc = {}, {}

    def g0(pid):
        # (prefix, suffix, grammemes) of form 0 of paradigm `pid`
        # (presumably the lemma form — verify against the dictionary format).
        r = para0.get(pid)
        if r is None:
            suf0, tag0, pre0 = gp(pid, 0)
            _, gr = pt(tag0)
            r = (pre0, suf0, gr)
            para0[pid] = r
        return r

    def gt(pid, idx):
        # (suffix, prefix, POS, grammemes) of form `idx` of paradigm `pid`.
        k = (pid, idx)
        r = tagc.get(k)
        if r is None:
            suf, tag, pre = gp(pid, idx)
            pos, gr = pt(tag)
            r = (suf, pre, pos, gr)
            tagc[k] = r
        return r

    out = set()
    for word, rec in D.words_dawg.iteritems():
        pid, idx = rec
        suf, pre, pos, gr = gt(pid, idx)
        if pos != "NOUN":
            continue
        pre0, suf0, gr0 = g0(pid)
        # Skip proper names, whether marked on this form or on form 0.
        if (PROPER & gr) or (PROPER & gr0):
            continue
        # Strip this form's prefix/suffix, then attach those of form 0.
        stem = word[len(pre):len(word) - len(suf)] if suf else word[len(pre):]
        out.add(pre0 + stem + suf0)
    return {w for w in out if cyr_ok(w)}


def oc_status(word):
    """(is_common_noun, in_dictionary) for word, from OpenCorpora only."""
    parses = D.get_word_parses(word)
    if not parses:
        return False, False
    gp, pt = D.get_paradigm, D.parse_tag_string
    for pid, idx in parses:
        suf, tag, pre = gp(pid, idx)
        pos, gr = pt(tag)
        if pos == "NOUN":
            _, tag0, _ = gp(pid, 0)
            _, gr0 = pt(tag0)
            # A noun parse counts only if neither this form nor form 0 is proper.
            if not (PROPER & gr or PROPER & gr0):
                return True, True
    return False, True


def libmorph_analyze(words):
    """Map each word to (known, noun_lemma, codes) per libmorph.

    noun_lemma is None when the word is not a common noun there.  Returns an
    empty dict when `words` is empty or the helper binary is not built.
    """
    words = list(words)
    if not words or not os.path.exists(LIBMORPH_BIN):
        return {}
    # The helper gets one word per line on stdin; its stdout is paired with the
    # input by position (assumes one output line per input word — TODO confirm).
    proc = subprocess.run([LIBMORPH_BIN], input="\n".join(words), capture_output=True, text=True)
    out = {}
    for w, line in zip(words, proc.stdout.split("\n")):
        # Line layout as parsed here: "<flag>\t<code>:<lemma>\t<code>:<lemma>...".
        fields = line.split("\t")
        known = fields[:1] == ["1"]
        codes, noun_lemmas = set(), []
        for field in fields[1:]:
            code, _, lex = field.partition(":")
            if code.isdigit():
                num = int(code)
                codes.add(num)
                if num in LIBMORPH_NOUN_CODES:
                    noun_lemmas.append(lex)
        # Prefer the word itself when it is one of its own noun lemmas.
        lemma = (w if w in noun_lemmas else noun_lemmas[0]) if noun_lemmas else None
        out[w] = (known, lemma, codes)
    return out


def build_notes():
    """Map each headword (destressed, lowercased) to its grammatical note.

    Reads lines WL_FROM..WL_TO (1-based, inclusive) of SLOV.  The headword is
    the leading run of Cyrillic letters and stress accents; the rest of the
    line is the note.  The first occurrence of a headword wins.
    """
    def is_hw(ch):
        # а..я, А..Я, Ё/ё, and the combining grave/acute accents.
        o = ord(ch)
        return (0x0430 <= o <= 0x044F) or (0x0410 <= o <= 0x042F) or o in (0x0401, 0x0451, 0x0300, 0x0301)

    hmap = {}
    with open(SLOV, encoding="utf-8") as f:
        lines = f.read().split("\n")
    for l in lines[WL_FROM - 1:WL_TO]:
        s = l.lstrip()
        e = 0  # length of the leading headword run
        for ch in s:
            if is_hw(ch):
                e += 1
            else:
                break
        hw = destress(s[:e])
        if hw and hw not in hmap:
            hmap[hw] = destress(s[e:]).strip()
    return hmap


def classify(w, note):
    """Coarse part of speech of an out-of-dictionary word from its PDF note.

    Returns "adj", "verb", "noun" or "amb" (ambiguous).  The checks are ordered;
    the first one that matches decides.
    """
    if note is None:
        return "amb"
    n = re.sub(r"\([^)]*\)", "", note).strip()  # drop domain/etymology parentheticals
    if "кр. ф" in n or "кр.ф" in n or "прич." in n or "прил." in n:
        return "adj"
    ends = re.findall(r"-([а-яё]+)", n)  # inflection endings listed in the note
    if any(e in ADJ_END for e in ends):
        return "adj"
    if "сов." in n or "несов." in n or "безл." in n:
        return "verb"
    if w.endswith("ся"):  # reflexive: no Russian noun ends in -ся
        return "verb"
    if any(e.endswith(VERB3) for e in ends) and not any(m in n for m in ("ед.", "тв.", "род.", "м.", "ж.", "с.")):
        return "verb"
    if n == "" and w.endswith(("ый", "ий", "ой", "ая", "ое", "ые", "ие", "яя", "ее")):
        return "adj"
    if "нескл" in n:
        # Indeclinable: a noun only when a gender/number mark is present.
        return "noun" if any(g in n for g in ("м.", "ж.", "с.", "мн.")) else "amb"
    if ends:
        return "noun"
    if n == "" and w.endswith(("ать", "ять", "еть", "ить", "оть", "уть", "ыть", "ти", "чь")):
        return "verb"
    return "amb"


def singular(w, note):
    """Nominative singular of a noun headword.

    Taken from the PDF note (authoritative) or, for a plural headword without
    an explicit singular, from the mawo lemma; pluralia tantum are kept as is.
    """
    n = note or ""
    # "ед. <word>": the singular is spelled out in full.
    full = re.search(r"ед\.\s+([а-яё]+)", n)
    if full:
        return full.group(1)
    # "ед. -<suffix>": splice the suffix in at the last occurrence of its
    # first letter in the headword.
    suf = re.search(r"ед\.\s+-([а-яё]+)", n)
    if suf:
        s = suf.group(1)
        i = w.rfind(s[0])
        return w[:i] + s if i > 0 else w
    ends = re.findall(r"-([а-яё]+)", re.sub(r"\([^)]*\)", "", n))
    # First listed ending looks like a genitive plural: treat the headword as
    # a plural and take the analyzer's noun lemma.
    if ends and ends[0].endswith(GENPL):
        for p in M.parse(w):
            if str(p.tag.POS) == "NOUN":
                return p.normal_form
        return w
    return w


def build():
    """Run the whole pipeline in memory.

    Returns the result sets plus a `fate` map giving every word's outcome, so a
    word's path can be traced or the buckets dumped.  Keys of the returned
    dict: "oc", "scrabble", "undefined", "adjectives", "verbs", "singulars",
    "fate", "all".
    """
    oc = set(load(OC_CACHE))
    if not oc:
        # Cache missing or empty: rebuild the lemma set and (re)write the cache,
        # so an empty cache file does not force a rebuild on every run.
        oc = oc_noun_lemmas()
        write(OC_CACHE, oc)
    hmap = build_notes()
    all_words = load(os.path.join(OUT_DIR, "all.txt"))
    ed_nouns = set(load("/tmp/ru_singulars.txt"))
    pairs = [tuple(p) for l in load("/tmp/ru_variants.txt") if len(p := l.split("\t")) == 2]
    pdf = [w for w in all_words if cyr_ok(w)]
    lm = libmorph_analyze(pdf)

    def to_singular(w):
        # Fall back to the headword when the derived singular fails cyr_ok.
        s = singular(w, hmap.get(w))
        return s if cyr_ok(s) else w

    fate = {}
    scrabble = set(oc)
    adj, verb, amb = [], [], []
    for w in pdf:
        # 1) OpenCorpora: a common-noun parse settles it; nothing is added here
        #    because `scrabble` already starts from the OpenCorpora lemma set.
        oc_noun, oc_known = oc_status(w)
        if oc_noun:
            fate[w] = "scrabble: сущ. по OpenCorpora"
            continue
        # 2) libmorph: take its noun lemma when it has one.
        lm_known, lm_lemma, _ = lm.get(w, (False, None, frozenset()))
        if lm_lemma is not None:
            s = lm_lemma if cyr_ok(lm_lemma) else to_singular(w)
            scrabble.add(s)
            fate[w] = "scrabble: сущ. по libmorph" + ("" if s == w else f" → {s}")
            continue
        # 3) Known to either dictionary, but not as a common noun: drop.
        if oc_known or lm_known:
            fate[w] = "отброшено: словарь знает как не-существительное"
            continue
        # 4) Listed among the Stage-1 "ед." singulars.
        if w in ed_nouns:
            scrabble.add(w)
            fate[w] = "scrabble: ед.ч. по помете «ед.»"
            continue
        # 5) Last resort: the orthographic dictionary's own note.
        c = classify(w, hmap.get(w))
        if c == "noun":
            s = to_singular(w)
            scrabble.add(s)
            fate[w] = "scrabble: сущ. по помете орфословаря" + ("" if s == w else f" → {s}")
        elif c == "adj":
            adj.append(w)
            fate[w] = "отброшено: прилагательное (помета орфословаря)"
        elif c == "verb":
            verb.append(w)
            fate[w] = "отброшено: глагол (помета орфословаря)"
        else:
            amb.append(w)
            fate[w] = "undefined: неоднозначное (нет в словарях, помета не определяет)"

    # Manual confirmations: nouns the maintainer approved from the undefined tail.
    for w in load(os.path.join(OUT_DIR, "manual_confirm.txt")):
        if cyr_ok(w):
            scrabble.add(w)
            fate[w] = "scrabble: подтверждено вручную (manual_confirm.txt)"

    # Variant rescue: a word joined by "и" to a confirmed noun is itself a noun.
    # Repeat until nothing changes, so chains of variants are resolved too.
    pending = set(amb) - scrabble
    changed = True
    while changed:
        changed = False
        for a, b in pairs:
            for x, y in ((a, b), (b, a)):
                if x in scrabble and y in pending:
                    scrabble.add(y)
                    pending.discard(y)
                    fate[y] = f"scrabble: вариант от «{x}» (через «и»)"
                    changed = True

    undefined = [w for w in amb if w not in scrabble]
    return {
        "oc": oc,
        "scrabble": scrabble,
        "undefined": undefined,
        "adjectives": adj,
        "verbs": verb,
        "singulars": ed_nouns,
        "fate": fate,
        "all": set(all_words),
    }


def trace(word, r):
    """Explain how `word` did or did not reach the dictionary, given `r` from build()."""
    w = destress(word)
    if w in r["fate"]:
        return r["fate"][w]
    if w in r["scrabble"]:
        return "scrabble: лексикон OpenCorpora" if w in r["oc"] else "scrabble: производная/лемма"
    if w not in r["all"]:
        return "нет в russian_all (не извлечено на Stage 1 — нет в .pdf, либо имя собств./дефис/форма)"
    if not cyr_ok(w):
        return "отсеяно: длина или символы вне диапазона (2–15 кириллица)"
    return "не определено"


def main():
    """Command-line entry point: build the dictionary, then trace, write or dump."""
    ap = argparse.ArgumentParser(description="Stage 2 brain: build the noun dictionary, trace a word, or dump buckets.")
    # The help text lists exactly the files written under --dump below.
    ap.add_argument("--dump", action="store_true",
                    help="also write the in-memory buckets (undefined, adjectives, verbs, singulars, fate)")
    ap.add_argument("--trace", metavar="WORD",
                    help="report how WORD did or did not reach the dictionary, then exit")
    args = ap.parse_args()

    r = build()
    if args.trace:
        # Trace mode reports a single word and writes no output files.
        print(f"{args.trace}: {trace(args.trace, r)}")
        return

    write(os.path.join(OUT_DIR, "scrabble.txt"), r["scrabble"])
    print(f"=> dictprep/russian/scrabble.txt {len(r['scrabble'])}")
    print(f" undefined kept in memory: {len(set(r['undefined']))} (use --dump to write it)")

    if args.dump:
        write(os.path.join(OUT_DIR, "undefined.txt"), r["undefined"])
        write(os.path.join(OUT_DIR, "adjectives.txt"), r["adjectives"])
        write(os.path.join(OUT_DIR, "verbs.txt"), r["verbs"])
        write(os.path.join(OUT_DIR, "singulars.txt"), r["singulars"])
        fate_path = os.path.join(OUT_DIR, "fate.tsv")
        os.makedirs(OUT_DIR, exist_ok=True)
        with open(fate_path, "w", encoding="utf-8") as f:
            for w in sorted(r["fate"], key=key):
                f.write(f"{w}\t{r['fate'][w]}\n")
        print(f" dumped: undefined.txt ({len(set(r['undefined']))}), adjectives.txt, verbs.txt, singulars.txt, fate.tsv")


if __name__ == "__main__":
    main()