Files
scrabble-solver/dictprep/ru_stage2.py
T
Ilia Denisov 540ee32178 dictprep: Russian orthographic dictionary → Scrabble noun pipeline
Build a committed Russian common-noun word list (dictprep/russian/scrabble.txt)
from the RAN orthographic dictionary, for the Эрудит ruleset.

- Stage 1 (Go, dictprep/ruwords): orfo_dict_2025.txt -> all.txt; extracts
  headwords, reconstructs "ед." singulars (suppressing plurals), pairs "и" variants.
- Stage 2 (Python brain, dictprep/ru_stage2.py): OpenCorpora (mawo-pymorphy3) +
  libmorph + orthographic notes select common nouns (nom. sing.); --trace explains
  a word's fate, --dump writes the in-memory buckets.
- libmorph C++ bridge (libmorph_check.cpp); manual_confirm.txt is merged in.
- orfo_dict_2025.txt is the committed pdftotext source of truth.
- See dictprep/README.md for methodology and reproducibility.
2026-06-01 23:27:17 +02:00

342 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""Stage 2 — the "brain" of the Russian Scrabble word-list pipeline.
It reads the Stage-1 base word list (built once by ruwords so the heavy PDF is not
re-parsed) together with the grammatical notes and the singular/variant structure, runs
the whole noun-selection logic in memory, and writes a minimal result:
dictprep/russian/scrabble.txt — the working dictionary (common nouns, nom. sing.)
dictprep/russian/undefined.txt — the ambiguous tail, left for manual review (written only with --dump)
(dictprep/russian/all.txt is the Stage-1 base.) Every other bucket — adjectives, verbs,
the merged note-nouns, singulars, variants — stays in memory. Pass --dump to also write
them; pass --trace WORD to ask how a single word did or did not reach the dictionary.
Note: all.txt is a plain word list, so the grammatical notes are re-read from the committed
pdftotext output (russian/orfo_dict_2025.txt), while the "ед." singulars and "и" variants
come from the Stage-1 side files (/tmp/ru_singulars.txt, /tmp/ru_variants.txt); the
expensive PDF parse itself runs only once.
Sources, most authoritative first: OpenCorpora (mawo-pymorphy3), libmorph (libmorph_check),
and the orthographic dictionary's own notes. See dictprep/README.md.
Run: ru-venv/bin/python dictprep/ru_stage2.py [--dump] [--trace WORD]
"""
import argparse
import os
import re
import subprocess
# Directory that holds this script; the repository paths below are derived from it.
HERE = os.path.dirname(os.path.abspath(__file__))
OUT_DIR = os.path.join(HERE, "russian")

# Committed pdftotext output of the orthographic dictionary (source of truth).
SLOV = os.path.join(OUT_DIR, "orfo_dict_2025.txt")

# 1-based inclusive line bounds of the column word-list section inside SLOV.
WL_FROM = 452
WL_TO = 168808

# Cache of the OpenCorpora noun lemmas and the path of the libmorph helper binary.
OC_CACHE = "/tmp/oc_nouns.txt"
LIBMORPH_BIN = os.path.join(HERE, "libmorph_check")

# Russian alphabet in collation order ("ё" right after "е") and each letter's rank.
ALPHABET = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя"
ORDER = dict(zip(ALPHABET, range(len(ALPHABET))))

# Grammemes that disqualify a noun parse as "common noun" when present in its tag.
PROPER = {"Name", "Surn", "Patr", "Geox", "Orgn", "Trad"}

# libmorph class codes treated as nouns: 7..21 plus 24 (pluralia tantum).
LIBMORPH_NOUN_CODES = {*range(7, 22), 24}

# Ending heuristics applied to the orthographic dictionary's grammatical notes.
ADJ_END = {"ая", "яя", "ое", "ее", "ье", "ья", "ьи"}
VERB3 = ("ет", "ёт", "ит", "ют", "ут", "ает", "яет", "ует", "уют", "нет", "жет", "чет")
GENPL = ("ов", "ёв", "ев", "ей")
def key(w):
    """Return the collation key of *w*: one ORDER rank per character.

    Characters missing from ORDER get rank 99, so they sort after every letter.
    """
    return [ORDER[ch] if ch in ORDER else 99 for ch in w]
def destress(s):
    """Return *s* lowercased with the combining accents U+0300 and U+0301 removed."""
    without_accents = s.translate({0x0300: None, 0x0301: None})
    return without_accents.lower()
def cyr_ok(w):
    """Tell whether *w* is 2 to 15 characters long and made only of lowercase а-я or ё."""
    if not 2 <= len(w) <= 15:
        return False
    for ch in w:
        if ch != "ё" and not "а" <= ch <= "я":
            return False
    return True
def load(p):
    """Return the non-blank lines of the UTF-8 text file *p*, each stripped of whitespace.

    A path that does not exist yields an empty list instead of an error, so optional
    inputs can simply be absent.
    """
    if not os.path.exists(p):
        return []
    # The context manager closes the handle deterministically instead of leaving it
    # to garbage collection.
    with open(p, encoding="utf-8") as f:
        return [s for s in (line.strip() for line in f) if s]
def write(path, words):
    """Write the distinct *words* to *path*, one per line, sorted by `key`.

    The parent directory is created when missing. The file is UTF-8 and always ends
    with a newline (an empty *words* produces a single newline).
    """
    parent = os.path.dirname(path)
    if parent:  # a bare file name has no directory part; os.makedirs("") would raise
        os.makedirs(parent, exist_ok=True)
    text = "\n".join(sorted(set(words), key=key)) + "\n"
    # Close (and therefore flush) the file deterministically rather than relying on
    # the handle being garbage-collected.
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)
# Third-party morphological analyzer; the module docstring names it as the OpenCorpora
# source. It is imported after the helper definitions, hence the E402 suppression.
import mawo_pymorphy3  # noqa: E402
# One analyzer instance for the whole run, created at import time.
M = mawo_pymorphy3.MorphAnalyzer()
# The analyzer's `_dawg_dict` attribute, used directly by oc_noun_lemmas() and oc_status().
# NOTE(review): underscore-prefixed, so presumably not public API; confirm it is stable
# across mawo_pymorphy3 versions.
D = M._dawg_dict
def oc_noun_lemmas():
    """Collect every common-noun lemma (nom. sing. / pluralia tantum) from OpenCorpora.

    Walks all entries of the analyzer's words.dawg. For each NOUN form whose own tag and
    whose paradigm's form 0 carry no PROPER grammeme, it rebuilds form 0 of the paradigm
    from the stem. Only lemmas accepted by cyr_ok() are returned, as a set.
    """
    get_paradigm = D.get_paradigm
    parse_tag = D.parse_tag_string
    base_cache = {}   # paradigm id -> (prefix, suffix, grammemes) of form 0
    form_cache = {}   # (paradigm id, form index) -> (suffix, prefix, POS, grammemes)

    def base_form(pid):
        try:
            return base_cache[pid]
        except KeyError:
            suffix0, tag0, prefix0 = get_paradigm(pid, 0)
            _, grammemes0 = parse_tag(tag0)
            entry = base_cache[pid] = (prefix0, suffix0, grammemes0)
            return entry

    def form(pid, idx):
        try:
            return form_cache[(pid, idx)]
        except KeyError:
            suffix, tag, prefix = get_paradigm(pid, idx)
            pos, grammemes = parse_tag(tag)
            entry = form_cache[(pid, idx)] = (suffix, prefix, pos, grammemes)
            return entry

    lemmas = set()
    for word, (pid, idx) in D.words_dawg.iteritems():
        suffix, prefix, pos, grammemes = form(pid, idx)
        if pos != "NOUN":
            continue
        prefix0, suffix0, grammemes0 = base_form(pid)
        if (PROPER & grammemes) or (PROPER & grammemes0):
            continue
        # Strip this form's prefix and suffix to get the stem, then attach form 0's.
        stem_end = len(word) - len(suffix) if suffix else None
        stem = word[len(prefix):stem_end]
        lemmas.add(prefix0 + stem + suffix0)
    return set(filter(cyr_ok, lemmas))
def oc_status(word):
    """Return (is_common_noun, in_dictionary) for *word*, judged by OpenCorpora alone.

    (False, False) means the word has no parses at all. (True, True) means some parse is
    a NOUN with no PROPER grammeme on the form itself or on form 0 of its paradigm.
    (False, True) means the word is known, but never as a common noun.
    """
    parses = D.get_word_parses(word)
    if not parses:
        return False, False
    get_paradigm = D.get_paradigm
    parse_tag = D.parse_tag_string
    for pid, idx in parses:
        _suffix, tag, _prefix = get_paradigm(pid, idx)
        pos, grammemes = parse_tag(tag)
        if pos != "NOUN":
            continue
        _, base_tag, _ = get_paradigm(pid, 0)
        _, base_grammemes = parse_tag(base_tag)
        if PROPER & grammemes or PROPER & base_grammemes:
            continue  # proper noun reading; keep looking for a common one
        return True, True
    return False, True
def libmorph_analyze(words):
    """Map each word to (known, noun_lemma, codes) according to the libmorph helper.

    The words are sent to LIBMORPH_BIN on stdin, one per line. The helper is expected to
    answer one tab-separated line per word, in the same order: the first field is "1"
    when the word is known, every following field is "code:lexeme".

    Returns a dict ``word -> (known, noun_lemma, codes)`` where
      * known      -- bool, first field equals "1";
      * noun_lemma -- the word itself if it is among the lexemes whose code is in
                      LIBMORPH_NOUN_CODES, else the first such lexeme, else None;
      * codes      -- set of all integer codes reported for the word.
    The result is empty when *words* is empty or the helper binary is not built.
    """
    words = list(words)
    if not words or not os.path.exists(LIBMORPH_BIN):
        return {}
    # Pin the pipe encoding: with text=True alone Python uses the locale encoding, which
    # garbles or rejects Cyrillic outside a UTF-8 locale.
    # NOTE(review): the helper's exit status and stderr are not inspected; a crash
    # mid-stream silently leaves the remaining words without an entry.
    proc = subprocess.run(
        [LIBMORPH_BIN],
        input="\n".join(words),
        capture_output=True,
        text=True,
        encoding="utf-8",
    )
    out = {}
    for w, line in zip(words, proc.stdout.split("\n")):
        fields = line.split("\t")
        known = fields[:1] == ["1"]
        codes, noun_lemmas = set(), []
        for field in fields[1:]:
            code, _, lex = field.partition(":")
            # isdecimal() only accepts characters int() can parse (isdigit() also lets
            # through e.g. superscript digits, which make int() raise).
            if not code.isdecimal():
                continue
            num = int(code)
            codes.add(num)
            if num in LIBMORPH_NOUN_CODES:
                noun_lemmas.append(lex)
        if not noun_lemmas:
            lemma = None
        elif w in noun_lemmas:
            lemma = w
        else:
            lemma = noun_lemmas[0]
        out[w] = (known, lemma, codes)
    return out
def build_notes():
    """Map each headword (destressed, lowercased) to its grammatical note.

    Reads lines WL_FROM..WL_TO (1-based, inclusive) of SLOV. On every line the headword
    is the leading run of Cyrillic letters and combining accents; the rest of the line,
    destressed and stripped, is the note. The first occurrence of a headword wins.
    """
    def is_hw(ch):
        # а-я, А-Я, Ё, ё, and the combining grave/acute accents used as stress marks.
        o = ord(ch)
        return (0x0430 <= o <= 0x044F) or (0x0410 <= o <= 0x042F) or o in (0x0401, 0x0451, 0x0300, 0x0301)

    # Close the source file as soon as it is read instead of leaking the handle.
    with open(SLOV, encoding="utf-8") as f:
        # Split on "\n" only: str.splitlines() would also break on form feeds and other
        # separators and shift the line numbers WL_FROM/WL_TO refer to.
        lines = f.read().split("\n")

    hmap = {}
    for l in lines[WL_FROM - 1:WL_TO]:
        s = l.lstrip()
        e = 0  # length of the headword prefix of s
        for ch in s:
            if is_hw(ch):
                e += 1
            else:
                break
        hw = destress(s[:e])
        if hw and hw not in hmap:
            hmap[hw] = destress(s[e:]).strip()
    return hmap
def classify(w, note):
    """Guess the coarse part of speech of an out-of-dictionary word from its PDF note.

    Returns one of "noun", "adj", "verb" or "amb" (undecidable). The rules are applied
    in order and the first one that matches wins.
    """
    if note is None:
        return "amb"
    # Parentheticals hold domain/etymology remarks, not grammar; ignore them.
    text = re.sub(r"\([^)]*\)", "", note).strip()
    bare = text == ""

    if any(mark in text for mark in ("кр. ф", "кр.ф", "прич.", "прил.")):
        return "adj"

    # Inflection endings listed in the note as "-xxx".
    endings = re.findall(r"-([а-яё]+)", text)
    for ending in endings:
        if ending in ADJ_END:
            return "adj"

    if any(mark in text for mark in ("сов.", "несов.", "безл.")):
        return "verb"
    if w.endswith("ся"):  # reflexive: no Russian noun ends in -ся
        return "verb"

    looks_conjugated = any(ending.endswith(VERB3) for ending in endings)
    if looks_conjugated:
        noun_marks = ("ед.", "тв.", "род.", "м.", "ж.", "с.")
        if not any(mark in text for mark in noun_marks):
            return "verb"

    if bare and w.endswith(("ый", "ий", "ой", "ая", "ое", "ые", "ие", "яя", "ее")):
        return "adj"

    if "нескл" in text:
        if any(mark in text for mark in ("м.", "ж.", "с.", "мн.")):
            return "noun"
        return "amb"

    if endings:
        return "noun"
    if bare and w.endswith(("ать", "ять", "еть", "ить", "оть", "уть", "ыть", "ти", "чь")):
        return "verb"
    return "amb"
def singular(w, note):
    """Return the nominative singular for noun headword *w*.

    The PDF note is authoritative: an explicit "ед. <word>" is returned as is, and an
    "ед. -<ending>" is spliced onto *w* at the last occurrence of the ending's first
    letter. Otherwise, when the first listed ending looks like a genitive plural (GENPL),
    the first NOUN parse from the analyzer supplies the lemma. In every other case *w*
    is returned unchanged, so pluralia tantum are kept.
    """
    text = note or ""

    whole = re.search(r"ед\.\s+([а-яё]+)", text)
    if whole is not None:
        return whole.group(1)

    tail = re.search(r"ед\.\s+-([а-яё]+)", text)
    if tail is not None:
        ending = tail.group(1)
        cut = w.rfind(ending[0])
        if cut > 0:
            return w[:cut] + ending
        return w

    endings = re.findall(r"-([а-яё]+)", re.sub(r"\([^)]*\)", "", text))
    if not endings or not endings[0].endswith(GENPL):
        return w
    for parse in M.parse(w):
        if str(parse.tag.POS) == "NOUN":
            return parse.normal_form
    return w
def build():
    """Run the whole pipeline in memory.

    Every Stage-1 word that passes cyr_ok() is judged by the sources in order of
    authority: OpenCorpora, then libmorph, then the Stage-1 "ед." singulars, then the
    orthographic dictionary's own note (classify()). Manual confirmations are merged
    afterwards, and finally "и"-variants of confirmed nouns are rescued from the
    ambiguous tail.

    Returns a dict with the keys "oc", "scrabble", "undefined", "adjectives", "verbs",
    "singulars", "fate" and "all". `fate` maps a processed word to a human-readable
    outcome, so a word's path can be traced or the buckets dumped.
    """
    # OpenCorpora noun lemmas: reuse the cache when it has content, else compute them;
    # the cache file is written only when it does not exist yet.
    oc = set(load(OC_CACHE)) or oc_noun_lemmas()
    if not os.path.exists(OC_CACHE):
        write(OC_CACHE, oc)

    hmap = build_notes()
    all_words = load(os.path.join(OUT_DIR, "all.txt"))
    ed_nouns = set(load("/tmp/ru_singulars.txt"))
    # Variant pairs: keep only well-formed two-column lines.
    pairs = [tuple(p) for l in load("/tmp/ru_variants.txt") if len(p := l.split("\t")) == 2]
    pdf = [w for w in all_words if cyr_ok(w)]
    lm = libmorph_analyze(pdf)

    def to_singular(w):
        # Fall back to the headword itself when the derived singular is not playable.
        s = singular(w, hmap.get(w))
        return s if cyr_ok(s) else w

    fate = {}
    scrabble = set(oc)
    adj, verb, amb = [], [], []
    for w in pdf:
        oc_noun, oc_known = oc_status(w)
        if oc_noun:
            fate[w] = "scrabble: сущ. по OpenCorpora"
            continue
        lm_known, lm_lemma, _ = lm.get(w, (False, None, frozenset()))
        if lm_lemma is not None:
            s = lm_lemma if cyr_ok(lm_lemma) else to_singular(w)
            scrabble.add(s)
            # Separate the derived lemma from the message; it used to be glued on.
            fate[w] = "scrabble: сущ. по libmorph" + ("" if s == w else f" → {s}")
            continue
        if oc_known or lm_known:
            fate[w] = "отброшено: словарь знает как не-существительное"
            continue
        if w in ed_nouns:
            scrabble.add(w)
            fate[w] = "scrabble: ед.ч. по помете «ед.»"
            continue
        c = classify(w, hmap.get(w))
        if c == "noun":
            s = to_singular(w)
            scrabble.add(s)
            fate[w] = "scrabble: сущ. по помете орфословаря" + ("" if s == w else f" → {s}")
        elif c == "adj":
            adj.append(w)
            fate[w] = "отброшено: прилагательное (помета орфословаря)"
        elif c == "verb":
            verb.append(w)
            fate[w] = "отброшено: глагол (помета орфословаря)"
        else:
            amb.append(w)
            fate[w] = "undefined: неоднозначное (нет в словарях, помета не определяет)"

    # Manual confirmations: nouns the maintainer approved from the undefined tail.
    for w in load(os.path.join(OUT_DIR, "manual_confirm.txt")):
        if cyr_ok(w):
            scrabble.add(w)
            fate[w] = "scrabble: подтверждено вручную (manual_confirm.txt)"

    # Variant rescue: a word joined by "и" to a confirmed noun is itself a noun.
    # Iterate to a fixed point so that chains of variants are rescued too.
    pending = set(amb) - scrabble
    changed = True
    while changed:
        changed = False
        for a, b in pairs:
            for x, y in ((a, b), (b, a)):
                if x in scrabble and y in pending:
                    scrabble.add(y)
                    pending.discard(y)
                    fate[y] = f"scrabble: вариант от «{x}» (через «и»)"
                    changed = True

    undefined = [w for w in amb if w not in scrabble]
    return {
        "oc": oc, "scrabble": scrabble, "undefined": undefined,
        "adjectives": adj, "verbs": verb, "singulars": ed_nouns,
        "fate": fate, "all": set(all_words),
    }
def trace(word, r):
    """Explain, in one line, how *word* did or did not reach the dictionary.

    *r* is the result dict returned by build(); *word* is destressed and lowercased
    before lookup. The recorded fate wins; otherwise the answer is derived from set
    membership in r["scrabble"], r["oc"] and r["all"].
    """
    w = destress(word)
    try:
        return r["fate"][w]
    except KeyError:
        pass
    if w in r["scrabble"]:
        if w in r["oc"]:
            return "scrabble: лексикон OpenCorpora"
        return "scrabble: производная/лемма"
    if w not in r["all"]:
        return "нет в russian_all (не извлечено на Stage 1 — нет в .pdf, либо имя собств./дефис/форма)"
    if cyr_ok(w):
        return "не определено"
    return "отсеяно: длина или символы вне диапазона (2–15 кириллица)"
def main():
    """Command-line entry point.

    Always runs build(). With --trace WORD it prints that word's outcome and exits
    without writing anything. Otherwise it writes russian/scrabble.txt and, with --dump,
    also undefined.txt, adjectives.txt, verbs.txt, singulars.txt and fate.tsv.
    """
    ap = argparse.ArgumentParser(description="Stage 2 brain: build the noun dictionary, trace a word, or dump buckets.")
    # The help text lists exactly the buckets the --dump branch below writes.
    ap.add_argument("--dump", action="store_true", help="also write the in-memory buckets (undefined, adjectives, verbs, singulars, fate)")
    ap.add_argument("--trace", metavar="WORD", help="report how WORD did or did not reach the dictionary, then exit")
    args = ap.parse_args()
    r = build()
    if args.trace:
        print(f"{args.trace}: {trace(args.trace, r)}")
        return
    undefined_count = len(set(r["undefined"]))
    write(os.path.join(OUT_DIR, "scrabble.txt"), r["scrabble"])
    print(f"=> dictprep/russian/scrabble.txt {len(r['scrabble'])}")
    print(f" undefined kept in memory: {undefined_count} (use --dump to write it)")
    if args.dump:
        for name in ("undefined", "adjectives", "verbs", "singulars"):
            write(os.path.join(OUT_DIR, f"{name}.txt"), r[name])
        fate_path = os.path.join(OUT_DIR, "fate.tsv")
        os.makedirs(OUT_DIR, exist_ok=True)
        with open(fate_path, "w", encoding="utf-8") as f:
            # One "word<TAB>outcome" row per traced word, in alphabet order.
            for w in sorted(r["fate"], key=key):
                f.write(f"{w}\t{r['fate'][w]}\n")
        print(f" dumped: undefined.txt ({undefined_count}), adjectives.txt, verbs.txt, singulars.txt, fate.tsv")


if __name__ == "__main__":
    main()