dictprep: Russian orthographic dictionary → Scrabble noun pipeline
Build a committed Russian common-noun word list (dictprep/russian/scrabble.txt) from the RAN orthographic dictionary, for the Эрудит ruleset. - Stage 1 (Go, dictprep/ruwords): orfo_dict_2025.txt -> all.txt; extracts headwords, reconstructs "ед." singulars (suppressing plurals), pairs "и" variants. - Stage 2 (Python brain, dictprep/ru_stage2.py): OpenCorpora (mawo-pymorphy3) + libmorph + orthographic notes select common nouns (nom. sing.); --trace explains a word's fate, --dump writes the in-memory buckets. - libmorph C++ bridge (libmorph_check.cpp); manual_confirm.txt is merged in. - orfo_dict_2025.txt is the committed pdftotext source of truth. - See dictprep/README.md for methodology and reproducibility.
This commit is contained in:
@@ -0,0 +1,341 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Stage 2 — the "brain" of the Russian Scrabble word-list pipeline.
|
||||
|
||||
It reads the Stage-1 base word list (built once by ruwords so the heavy PDF is not
|
||||
re-parsed) together with the grammatical notes and the singular/variant structure, runs
|
||||
the whole noun-selection logic in memory, and writes a minimal result:
|
||||
|
||||
dictprep/russian/scrabble.txt — the working dictionary (common nouns, nom. sing.)
|
||||
dictprep/russian/undefined.txt — the ambiguous tail, left for manual review (written only with --dump)
|
||||
|
||||
(dictprep/russian/all.txt is the Stage-1 base.) Every other bucket — adjectives, verbs,
|
||||
the merged note-nouns, singulars, variants — stays in memory. Pass --dump to also write
|
||||
them; pass --trace WORD to ask how a single word did or did not reach the dictionary.
|
||||
|
||||
Note: all.txt is a plain word list, so the grammatical notes, "ед." singulars and "и"
|
||||
variants are read from the pdftotext output (orfo_dict_2025.txt) and the Stage-1 side files; the
|
||||
expensive PDF parse itself runs only once.
|
||||
|
||||
Sources, most authoritative first: OpenCorpora (mawo-pymorphy3), libmorph (libmorph_check),
|
||||
and the orthographic dictionary's own notes. See dictprep/README.md.
|
||||
|
||||
Run: ru-venv/bin/python dictprep/ru_stage2.py [--dump] [--trace WORD]
|
||||
"""
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
|
||||
# --- Locations -------------------------------------------------------------
# Everything is resolved relative to the directory holding this script.
HERE = os.path.dirname(os.path.abspath(__file__))
OUT_DIR = os.path.join(HERE, "russian")
# Committed pdftotext output (source of truth).
SLOV = os.path.join(OUT_DIR, "orfo_dict_2025.txt")
# 1-based inclusive bounds of the column word-list section inside SLOV.
WL_FROM = 452
WL_TO = 168808
# On-disk cache of the OpenCorpora noun lemma set.
OC_CACHE = "/tmp/oc_nouns.txt"
# Helper executable that bridges to libmorph.
LIBMORPH_BIN = os.path.join(HERE, "libmorph_check")

# --- Alphabet and collation ------------------------------------------------
ALPHABET = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя"
# Letter -> position in ALPHABET, used as the sort rank.
ORDER = dict(zip(ALPHABET, range(len(ALPHABET))))

# --- Classification tables -------------------------------------------------
# Grammemes whose presence disqualifies a noun from being "common"
# (presumably OpenCorpora's proper-name markers — confirm against its tag set).
PROPER = {"Name", "Surn", "Patr", "Geox", "Orgn", "Trad"}
# libmorph codes treated as nouns: 7..21 plus 24 (pluralia tantum).
LIBMORPH_NOUN_CODES = {*range(7, 22), 24}
# Ending fragments in a note that indicate an adjective.
ADJ_END = {"ая", "яя", "ое", "ее", "ье", "ья", "ьи"}
# Ending suffixes in a note that indicate a verb form.
VERB3 = ("ет", "ёт", "ит", "ют", "ут", "ает", "яет", "ует", "уют", "нет", "жет", "чет")
# Ending suffixes that suggest the headword is a plural (see singular()).
GENPL = ("ов", "ёв", "ев", "ей")
|
||||
|
||||
|
||||
def key(w):
    """Return the sort key that orders *w* by the Russian alphabet.

    Each character is replaced by its rank in ``ORDER``. A character missing
    from that table gets rank 99, which is larger than any rank in the table.
    """
    ranks = []
    for ch in w:
        ranks.append(ORDER.get(ch, 99))
    return ranks
|
||||
|
||||
|
||||
def destress(s):
    """Return *s* lowercased, with stress marks removed.

    Only the combining grave (U+0300) and combining acute (U+0301) accents are
    dropped; every other character is kept.
    """
    return s.translate({0x0300: None, 0x0301: None}).lower()
|
||||
|
||||
|
||||
def cyr_ok(w):
    """Tell whether *w* is an acceptable dictionary word.

    A word qualifies when it is 2 to 15 characters long and consists solely of
    lowercase letters in the range "а".."я", or "ё".
    """
    if not 2 <= len(w) <= 15:
        return False
    for ch in w:
        if ch != "ё" and not ("а" <= ch <= "я"):
            return False
    return True
|
||||
|
||||
|
||||
def load(p):
    """Return the non-blank lines of the UTF-8 text file *p*, whitespace-stripped.

    A path that does not exist is not an error: an empty list is returned, so
    optional inputs (caches, side files) can simply be absent.
    """
    if not os.path.exists(p):
        return []
    # Context manager so the handle is closed deterministically rather than
    # left for the garbage collector.
    with open(p, encoding="utf-8") as fh:
        return [text for line in fh if (text := line.strip())]
|
||||
|
||||
|
||||
def write(path, words):
    """Write *words* to *path*, de-duplicated and sorted, one per line.

    The words are ordered with ``key`` and the file is UTF-8 with a trailing
    newline. The parent directory is created when *path* has one; a bare
    filename (empty directory part) is written to the current directory.
    """
    parent = os.path.dirname(path)
    if parent:  # os.makedirs("") raises, so skip it for bare filenames
        os.makedirs(parent, exist_ok=True)
    # Build the content first so a failure while sorting cannot leave a
    # truncated file behind.
    content = "\n".join(sorted(set(words), key=key)) + "\n"
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(content)
|
||||
|
||||
|
||||
import mawo_pymorphy3 # noqa: E402
|
||||
|
||||
# Shared morphological analyzer, constructed once at import time.
M = mawo_pymorphy3.MorphAnalyzer()
# NOTE(review): `_dawg_dict` is a private attribute of the analyzer; the code
# below relies on it exposing `words_dawg`, `get_paradigm`, `parse_tag_string`
# and `get_word_parses` — confirm this still holds when upgrading the package.
D = M._dawg_dict
|
||||
|
||||
|
||||
def oc_noun_lemmas():
    """Collect the common-noun lemmas held in OpenCorpora's words.dawg.

    Every ``(word, (paradigm id, form index))`` record of ``D.words_dawg`` is
    visited. A record is skipped when its form is not tagged ``NOUN``, or when
    its own grammemes or those of the paradigm's form 0 intersect ``PROPER``.
    For the remaining records the lemma is rebuilt by replacing the form's
    prefix and suffix with those of form 0 (presumably the nom. sing. /
    pluralia tantum form — confirm against the dictionary layout).

    Returns:
        The set of rebuilt lemmas that satisfy ``cyr_ok``.
    """
    paradigm = D.get_paradigm
    parse_tag = D.parse_tag_string
    base_cache = {}  # paradigm id -> (prefix, suffix, grammemes) of form 0
    form_cache = {}  # (paradigm id, form index) -> (suffix, prefix, POS, grammemes)

    def base_form(para_id):
        # Memoised lookup of a paradigm's form 0.
        try:
            return base_cache[para_id]
        except KeyError:
            suffix0, tag0, prefix0 = paradigm(para_id, 0)
            _, grammemes0 = parse_tag(tag0)
            entry = base_cache[para_id] = (prefix0, suffix0, grammemes0)
            return entry

    def form_info(para_id, form_idx):
        # Memoised lookup of one concrete form of a paradigm.
        cache_key = (para_id, form_idx)
        try:
            return form_cache[cache_key]
        except KeyError:
            suffix, tag, prefix = paradigm(para_id, form_idx)
            pos, grammemes = parse_tag(tag)
            entry = form_cache[cache_key] = (suffix, prefix, pos, grammemes)
            return entry

    lemmas = set()
    for word, (para_id, form_idx) in D.words_dawg.iteritems():
        suffix, prefix, pos, grammemes = form_info(para_id, form_idx)
        if pos != "NOUN":
            continue
        prefix0, suffix0, grammemes0 = base_form(para_id)
        if (PROPER & grammemes) or (PROPER & grammemes0):
            continue
        # Strip this form's affixes to get the stem, then attach form 0's.
        stem_end = len(word) - len(suffix) if suffix else len(word)
        lemmas.add(prefix0 + word[len(prefix):stem_end] + suffix0)
    return {lemma for lemma in lemmas if cyr_ok(lemma)}
|
||||
|
||||
|
||||
def oc_status(word):
    """Return ``(is_common_noun, in_dictionary)`` for *word*, from OpenCorpora only.

    * ``(False, False)`` — the dictionary has no parse for the word.
    * ``(True, True)``   — some parse is a ``NOUN`` whose grammemes, and those
      of its paradigm's form 0, contain none of ``PROPER``.
    * ``(False, True)``  — the word is known, but no parse qualifies.
    """
    analyses = D.get_word_parses(word)
    if not analyses:
        return False, False

    paradigm, parse_tag = D.get_paradigm, D.parse_tag_string
    for para_id, form_idx in analyses:
        _suffix, tag, _prefix = paradigm(para_id, form_idx)
        pos, grammemes = parse_tag(tag)
        if pos != "NOUN":
            continue
        _, base_tag, _ = paradigm(para_id, 0)
        _, base_grammemes = parse_tag(base_tag)
        if (PROPER & grammemes) or (PROPER & base_grammemes):
            continue
        return True, True
    return False, True
|
||||
|
||||
|
||||
def libmorph_analyze(words):
    """Map each word to ``(known, noun_lemma, codes)`` as reported by libmorph.

    The words are sent, one per line, to the ``LIBMORPH_BIN`` helper on stdin.
    Its stdout is read back line by line and paired with the words in order.
    Each output line is tab-separated: the first field is ``"1"`` when the
    word is known, and each later field has the form ``code:lexeme``.

    Args:
        words: iterable of words to analyse.

    Returns:
        ``{word: (known, noun_lemma, codes)}`` where ``codes`` is the set of
        integer codes seen and ``noun_lemma`` is ``None`` when no code is in
        ``LIBMORPH_NOUN_CODES``. Otherwise it is the word itself if it is among
        the noun lexemes, else the first noun lexeme. The result is empty when
        there are no words or the helper binary has not been built.
    """
    words = list(words)
    if not words or not os.path.exists(LIBMORPH_BIN):
        return {}
    # Pin the pipe encoding: with text=True alone Python uses the locale's
    # preferred encoding, which would corrupt Cyrillic on a non-UTF-8 locale.
    # UTF-8 is what the default gives on a UTF-8 locale.
    proc = subprocess.run(
        [LIBMORPH_BIN],
        input="\n".join(words),
        capture_output=True,
        text=True,
        encoding="utf-8",
    )
    out = {}
    # NOTE(review): zip() stops at the shorter sequence, so if the helper
    # prints fewer lines than it was given, the trailing words are silently
    # absent from the result.
    for w, line in zip(words, proc.stdout.split("\n")):
        fields = line.split("\t")
        known = fields[:1] == ["1"]
        codes, noun_lemmas = set(), []
        for field in fields[1:]:
            code, _, lex = field.partition(":")
            if not code.isdigit():
                continue
            number = int(code)
            codes.add(number)
            if number in LIBMORPH_NOUN_CODES:
                noun_lemmas.append(lex)
        if not noun_lemmas:
            lemma = None
        elif w in noun_lemmas:
            lemma = w  # prefer the headword itself when it is a valid lemma
        else:
            lemma = noun_lemmas[0]
        out[w] = (known, lemma, codes)
    return out
|
||||
|
||||
|
||||
def build_notes():
    """Map each headword (destressed, lowercased) to its grammatical note.

    Reads ``SLOV`` and looks only at lines ``WL_FROM``..``WL_TO`` (1-based,
    inclusive). After leading whitespace is stripped, the headword is the
    longest leading run of Cyrillic letters (А–я, Ё, ё) and combining
    grave/acute accents; the rest of the line is the note. Both parts are
    passed through ``destress``. When a headword occurs more than once, the
    first occurrence wins.

    Returns:
        ``{headword: note}``; lines with no headword are ignored.
    """
    def is_hw(ch):
        # Headword characters: а–я, А–Я, Ё, ё and the two stress accents.
        o = ord(ch)
        return (0x0430 <= o <= 0x044F) or (0x0410 <= o <= 0x042F) or o in (0x0401, 0x0451, 0x0300, 0x0301)

    # Close the source file as soon as it has been read.
    with open(SLOV, encoding="utf-8") as fh:
        lines = fh.read().split("\n")

    hmap = {}
    for raw in lines[WL_FROM - 1:WL_TO]:
        s = raw.lstrip()
        # Length of the leading headword run.
        e = 0
        for ch in s:
            if not is_hw(ch):
                break
            e += 1
        hw = destress(s[:e])
        if hw and hw not in hmap:
            hmap[hw] = destress(s[e:]).strip()
    return hmap
|
||||
|
||||
|
||||
def classify(w, note):
    """Guess the coarse part of speech of an out-of-dictionary word from its PDF note.

    Args:
        w: the headword.
        note: its grammatical note, or ``None`` when the word has none.

    Returns:
        One of ``"noun"``, ``"adj"``, ``"verb"`` or ``"amb"`` (undecidable).
        The rules are tried in order and the first match decides.
    """
    if note is None:
        return "amb"

    # Domain/etymology parentheticals carry no grammatical information.
    text = re.sub(r"\([^)]*\)", "", note).strip()

    def mentions(*markers):
        return any(marker in text for marker in markers)

    if mentions("кр. ф", "кр.ф", "прич.", "прил."):
        return "adj"

    # Inflectional endings quoted in the note, e.g. "-а", "-ов".
    endings = re.findall(r"-([а-яё]+)", text)
    for ending in endings:
        if ending in ADJ_END:
            return "adj"

    if mentions("сов.", "несов.", "безл."):
        return "verb"
    if w.endswith("ся"):  # reflexive: no Russian noun ends in -ся
        return "verb"

    verb_like = any(ending.endswith(VERB3) for ending in endings)
    if verb_like and not mentions("ед.", "тв.", "род.", "м.", "ж.", "с."):
        return "verb"

    bare = text == ""
    if bare and w.endswith(("ый", "ий", "ой", "ая", "ое", "ые", "ие", "яя", "ее")):
        return "adj"

    if "нескл" in text:
        # Indeclinable: a noun only when a gender/number marker accompanies it.
        if mentions("м.", "ж.", "с.", "мн."):
            return "noun"
        return "amb"

    if endings:
        return "noun"

    if bare and w.endswith(("ать", "ять", "еть", "ить", "оть", "уть", "ыть", "ти", "чь")):
        return "verb"
    return "amb"
|
||||
|
||||
|
||||
def singular(w, note):
    """Return the nominative singular for noun headword *w*.

    Sources, in order of precedence:

    1. An explicit full singular in the note (``ед. <word>``).
    2. A singular ending in the note (``ед. -<ending>``): the ending is spliced
       onto *w* at the last occurrence of its first letter. *w* is returned
       unchanged when that letter is absent or found only at position 0.
    3. If the first ending quoted in the note (parentheticals removed) ends
       with one of ``GENPL``: the ``normal_form`` of the first ``NOUN`` parse
       from the analyzer ``M``, falling back to *w*.
    4. Otherwise *w* itself.
    """
    text = note or ""

    whole = re.search(r"ед\.\s+([а-яё]+)", text)
    if whole is not None:
        return whole.group(1)

    tail = re.search(r"ед\.\s+-([а-яё]+)", text)
    if tail is not None:
        ending = tail.group(1)
        cut = w.rfind(ending[0])
        if cut > 0:
            return w[:cut] + ending
        return w

    endings = re.findall(r"-([а-яё]+)", re.sub(r"\([^)]*\)", "", text))
    if not (endings and endings[0].endswith(GENPL)):
        return w

    # Plural-looking headword without an explicit singular: ask the analyzer.
    noun_parse = next((p for p in M.parse(w) if str(p.tag.POS) == "NOUN"), None)
    return w if noun_parse is None else noun_parse.normal_form
|
||||
|
||||
|
||||
def build():
    """Run the whole pipeline in memory and return its buckets.

    Inputs read: the OpenCorpora lemma cache (``OC_CACHE``, recomputed with
    ``oc_noun_lemmas`` when empty or missing), the notes map from
    ``build_notes``, ``all.txt`` and ``manual_confirm.txt`` in ``OUT_DIR``,
    and the side files ``/tmp/ru_singulars.txt`` and ``/tmp/ru_variants.txt``.

    Returns:
        A dict with keys ``"oc"``, ``"scrabble"``, ``"undefined"``,
        ``"adjectives"``, ``"verbs"``, ``"singulars"``, ``"fate"`` and
        ``"all"``. ``fate`` maps each processed word to a human-readable
        outcome string, so a word's path can be traced or the buckets dumped.
    """
    # Falls through to a full recompute when the cache is missing OR empty;
    # the cache file itself is only written when it does not exist yet.
    oc = set(load(OC_CACHE)) or oc_noun_lemmas()
    if not os.path.exists(OC_CACHE):
        write(OC_CACHE, oc)
    hmap = build_notes()
    all_words = load(os.path.join(OUT_DIR, "all.txt"))
    ed_nouns = set(load("/tmp/ru_singulars.txt"))
    # One tab-separated pair per line; malformed lines are ignored.
    pairs = [tuple(p) for l in load("/tmp/ru_variants.txt") if len(p := l.split("\t")) == 2]
    # Only words passing cyr_ok are classified at all.
    pdf = [w for w in all_words if cyr_ok(w)]
    lm = libmorph_analyze(pdf)

    def to_singular(w):
        # Singular per the note, unless the result fails cyr_ok.
        s = singular(w, hmap.get(w))
        return s if cyr_ok(s) else w

    fate = {}
    # The dictionary starts out as the full OpenCorpora lemma set.
    scrabble = set(oc)
    adj, verb, amb = [], [], []
    for w in pdf:
        # 1. OpenCorpora. Only the fate is recorded here: the word is not
        #    added to `scrabble`, which already holds the `oc` lemma set.
        oc_noun, oc_known = oc_status(w)
        if oc_noun:
            fate[w] = "scrabble: сущ. по OpenCorpora"
            continue
        # 2. libmorph: use its lemma when it passes cyr_ok, else the note.
        lm_known, lm_lemma, _ = lm.get(w, (False, None, frozenset()))
        if lm_lemma is not None:
            s = lm_lemma if cyr_ok(lm_lemma) else to_singular(w)
            scrabble.add(s)
            fate[w] = "scrabble: сущ. по libmorph" + ("" if s == w else f" → {s}")
            continue
        # 3. Known to either dictionary, but not as a common noun: reject.
        if oc_known or lm_known:
            fate[w] = "отброшено: словарь знает как не-существительное"
            continue
        # 4. Listed in the Stage-1 singulars side file.
        if w in ed_nouns:
            scrabble.add(w)
            fate[w] = "scrabble: ед.ч. по помете «ед.»"
            continue
        # 5. Fall back to the orthographic dictionary's own note.
        c = classify(w, hmap.get(w))
        if c == "noun":
            s = to_singular(w)
            scrabble.add(s)
            fate[w] = "scrabble: сущ. по помете орфословаря" + ("" if s == w else f" → {s}")
        elif c == "adj":
            adj.append(w)
            fate[w] = "отброшено: прилагательное (помета орфословаря)"
        elif c == "verb":
            verb.append(w)
            fate[w] = "отброшено: глагол (помета орфословаря)"
        else:
            amb.append(w)
            fate[w] = "undefined: неоднозначное (нет в словарях, помета не определяет)"

    # Manual confirmations: nouns the maintainer approved from the undefined tail.
    # These overwrite any fate recorded above.
    for w in load(os.path.join(OUT_DIR, "manual_confirm.txt")):
        if cyr_ok(w):
            scrabble.add(w)
            fate[w] = "scrabble: подтверждено вручную (manual_confirm.txt)"

    # Variant rescue: a word joined by "и" to a confirmed noun is itself a noun.
    # Repeated until a full pass adds nothing, so rescues can chain.
    pending = set(amb) - scrabble
    changed = True
    while changed:
        changed = False
        for a, b in pairs:
            for x, y in ((a, b), (b, a)):
                if x in scrabble and y in pending:
                    scrabble.add(y)
                    pending.discard(y)
                    fate[y] = f"scrabble: вариант от «{x}» (через «и»)"
                    changed = True

    # Whatever is still ambiguous after the manual and variant passes.
    undefined = [w for w in amb if w not in scrabble]
    return {
        "oc": oc, "scrabble": scrabble, "undefined": undefined,
        "adjectives": adj, "verbs": verb, "singulars": ed_nouns,
        "fate": fate, "all": set(all_words),
    }
|
||||
|
||||
|
||||
def trace(word, r):
    """Explain how *word* did or did not reach the dictionary.

    Args:
        word: the word to look up; it is normalised with ``destress`` first.
        r: the result dict returned by ``build``.

    Returns:
        The recorded fate when there is one, otherwise a message derived from
        membership in the ``"scrabble"``, ``"oc"`` and ``"all"`` buckets and
        from ``cyr_ok``.
    """
    w = destress(word)

    recorded = r["fate"]
    if w in recorded:
        return recorded[w]

    if w in r["scrabble"]:
        # In the dictionary without a fate: either a raw OpenCorpora lemma or
        # a lemma derived from some other headword.
        if w in r["oc"]:
            return "scrabble: лексикон OpenCorpora"
        return "scrabble: производная/лемма"

    if w not in r["all"]:
        return "нет в russian_all (не извлечено на Stage 1 — нет в .pdf, либо имя собств./дефис/форма)"
    if not cyr_ok(w):
        return "отсеяно: длина или символы вне диапазона (2–15 кириллица)"
    return "не определено"
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: build the dictionary, trace a word, or dump buckets.

    * ``--trace WORD`` runs the pipeline, prints WORD's fate and exits without
      writing anything.
    * Otherwise ``scrabble.txt`` is written to ``OUT_DIR``.
    * ``--dump`` additionally writes ``undefined.txt``, ``adjectives.txt``,
      ``verbs.txt``, ``singulars.txt`` and ``fate.tsv``.
    """
    ap = argparse.ArgumentParser(description="Stage 2 brain: build the noun dictionary, trace a word, or dump buckets.")
    # The help text lists exactly the buckets written below (the variant pairs
    # are not part of build()'s result and are not dumped).
    ap.add_argument("--dump", action="store_true", help="also write the in-memory buckets (undefined, adjectives, verbs, singulars, fate)")
    ap.add_argument("--trace", metavar="WORD", help="report how WORD did or did not reach the dictionary, then exit")
    args = ap.parse_args()

    r = build()
    # `is not None` so that an explicitly passed empty WORD is still traced
    # instead of silently falling through to a full write.
    if args.trace is not None:
        print(f"{args.trace}: {trace(args.trace, r)}")
        return

    undefined_count = len(set(r["undefined"]))
    write(os.path.join(OUT_DIR, "scrabble.txt"), r["scrabble"])
    print(f"=> dictprep/russian/scrabble.txt {len(r['scrabble'])}")
    print(f" undefined kept in memory: {undefined_count} (use --dump to write it)")
    if not args.dump:
        return

    for bucket in ("undefined", "adjectives", "verbs", "singulars"):
        write(os.path.join(OUT_DIR, bucket + ".txt"), r[bucket])
    fate_path = os.path.join(OUT_DIR, "fate.tsv")
    os.makedirs(OUT_DIR, exist_ok=True)
    with open(fate_path, "w", encoding="utf-8") as f:
        # One "word<TAB>fate" row per word, in Russian alphabetical order.
        f.writelines(f"{w}\t{r['fate'][w]}\n" for w in sorted(r["fate"], key=key))
    print(f" dumped: undefined.txt ({undefined_count}), adjectives.txt, verbs.txt, singulars.txt, fate.tsv")
|
||||
|
||||
|
||||
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user