dictprep: Russian orthographic dictionary → Scrabble noun pipeline
Build a committed Russian common-noun word list (dictprep/russian/scrabble.txt) from the RAN orthographic dictionary, for the Эрудит ruleset. - Stage 1 (Go, dictprep/ruwords): orfo_dict_2025.txt -> all.txt; extracts headwords, reconstructs "ед." singulars (suppressing plurals), pairs "и" variants. - Stage 2 (Python brain, dictprep/ru_stage2.py): OpenCorpora (mawo-pymorphy3) + libmorph + orthographic notes select common nouns (nom. sing.); --trace explains a word's fate, --dump writes the in-memory buckets. - libmorph C++ bridge (libmorph_check.cpp); manual_confirm.txt is merged in. - orfo_dict_2025.txt is the committed pdftotext source of truth. - See dictprep/README.md for methodology and reproducibility.
This commit is contained in:
@@ -0,0 +1,434 @@
|
||||
// Command ruwords extracts a clean Cyrillic word list from the plain text of a Russian
|
||||
// orthographic dictionary (the output of `pdftotext`).
|
||||
//
|
||||
// Stage 1 (this tool): from the column word-list section [from, to] it collects, per
|
||||
// entry, the headword (the leading token). When the headword is plural and the entry
|
||||
// gives its singular after "ед." — in full ("ящеры, …, ед. ящер") or as a replacement
|
||||
// suffix ("…, ед. -вец") — only the singular is kept, since a plural that has a singular
|
||||
// is never needed. It drops stress marks, lowercases, keeps ё, and discards proper nouns
|
||||
// (capitalized), hyphenated words, acronyms and non-Cyrillic tokens. The result is
|
||||
// de-duplicated and sorted in Russian alphabetical order (ё right after е), LF-separated.
|
||||
//
|
||||
// It also collects a variant headword joined by "и" when it carries its own grammatical
|
||||
// note (e.g. "аблатив, -а и аблятив, -а"). Suffix-singular reconstruction is heuristic;
|
||||
// Stage 2 (dictprep/ru_stage2.py) re-checks the words against real dictionaries.
|
||||
//
|
||||
// pdftotext dictprep/orfo_dict_2025.pdf /tmp/slov.txt
|
||||
// go run ./dictprep/ruwords -in /tmp/slov.txt -from 452 -to 168808 \
|
||||
// -out russian_all.txt -skip russian_skip.txt
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"flag"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// ruAlphabet lists the 33 lowercase Russian letters in collation order, with ё placed
// immediately after е.
const ruAlphabet = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя"

// ruRank maps every letter of ruAlphabet to its zero-based position in that order. It is
// built once, at package initialization.
var ruRank = func() map[rune]int {
	ranks := map[rune]int{}
	pos := 0
	for _, letter := range ruAlphabet {
		ranks[letter] = pos
		pos++
	}
	return ranks
}()
|
||||
|
||||
// isCyrLetter reports whether r is a Russian letter of either case: one of а–я, А–Я,
// ё or Ё.
func isCyrLetter(r rune) bool {
	switch r {
	case 'ё', 'Ё':
		return true
	}
	if 'а' <= r && r <= 'я' {
		return true
	}
	return 'А' <= r && r <= 'Я'
}
|
||||
|
||||
// isUpperCyr reports whether r is an uppercase Russian letter: one of А–Я or Ё.
func isUpperCyr(r rune) bool {
	if r == 'Ё' {
		return true
	}
	return 'А' <= r && r <= 'Я'
}
|
||||
|
||||
// isStress reports whether r is one of the two combining accents treated as stress
// marks: U+0300 (combining grave) or U+0301 (combining acute).
func isStress(r rune) bool {
	switch r {
	case 0x0300, 0x0301:
		return true
	}
	return false
}
|
||||
|
||||
// cleanWord normalizes a run of letters/stress-marks into a lowercase Cyrillic word, or
|
||||
// returns ok=false for proper nouns (capitalized), hyphenated or non-Cyrillic runs.
|
||||
func cleanWord(run []rune) (string, bool) {
|
||||
if len(run) == 0 || isUpperCyr(run[0]) {
|
||||
return "", false
|
||||
}
|
||||
var b strings.Builder
|
||||
for _, r := range run {
|
||||
switch {
|
||||
case isStress(r), r == '': // drop stress accents and soft hyphens
|
||||
case r == '-': // a real hyphen means a hyphenated word: reject it
|
||||
return "", false
|
||||
default:
|
||||
b.WriteRune(unicode.ToLower(r))
|
||||
}
|
||||
}
|
||||
w := b.String()
|
||||
if w == "" {
|
||||
return "", false
|
||||
}
|
||||
for _, r := range w {
|
||||
if !((r >= 'а' && r <= 'я') || r == 'ё') {
|
||||
return "", false
|
||||
}
|
||||
}
|
||||
return w, true
|
||||
}
|
||||
|
||||
// headword returns the entry's headword: the leading run of letters, stress marks and
|
||||
// hyphens, normalized.
|
||||
func headword(line string) (string, bool) {
|
||||
// Trim leading whitespace, including the form-feed (U+000C) that pdftotext puts at
|
||||
// the top of each page — otherwise the first headword on every page is lost.
|
||||
line = strings.TrimLeftFunc(line, unicode.IsSpace)
|
||||
var run []rune
|
||||
for _, r := range line {
|
||||
if isCyrLetter(r) || isStress(r) || r == '-' || r == '' {
|
||||
run = append(run, r)
|
||||
} else {
|
||||
break
|
||||
}
|
||||
}
|
||||
return cleanWord(run)
|
||||
}
|
||||
|
||||
// embeddedSingulars returns the singular form of a plural headword spelled out after
|
||||
// "ед.", either in full ("ед. ящер") or as a replacement suffix ("ед. -вец",
|
||||
// reconstructed from headword). It skips gender marks ("ед. м") and abbreviations that
|
||||
// merely start with "ед." ("ед. измер.", "ден. ед.").
|
||||
func embeddedSingulars(line, headword string) []string {
|
||||
var out []string
|
||||
for i := 0; ; {
|
||||
j := strings.Index(line[i:], "ед.")
|
||||
if j < 0 {
|
||||
break
|
||||
}
|
||||
i += j + len("ед.")
|
||||
rest := strings.TrimLeft(line[i:], " \t")
|
||||
|
||||
if strings.HasPrefix(rest, "-") { // suffix form: reconstruct from the headword
|
||||
var suf []rune
|
||||
for _, r := range rest[len("-"):] {
|
||||
if isCyrLetter(r) || isStress(r) {
|
||||
suf = append(suf, r)
|
||||
} else {
|
||||
break
|
||||
}
|
||||
}
|
||||
if s, ok := cleanWord(suf); ok && len([]rune(s)) >= 2 {
|
||||
if recon := reconstructSingular(headword, s); recon != "" {
|
||||
out = append(out, recon)
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
var run []rune
|
||||
consumed := 0
|
||||
for _, r := range rest {
|
||||
if isCyrLetter(r) || isStress(r) {
|
||||
run = append(run, r)
|
||||
consumed += len(string(r))
|
||||
} else {
|
||||
break
|
||||
}
|
||||
}
|
||||
if len(run) == 0 {
|
||||
continue
|
||||
}
|
||||
if strings.HasPrefix(rest[consumed:], ".") {
|
||||
continue // an abbreviation like "ед. измер." rather than a singular form
|
||||
}
|
||||
w, ok := cleanWord(run)
|
||||
if !ok || len([]rune(w)) < 2 { // 2+ letters excludes the gender marks м/ж/с
|
||||
continue
|
||||
}
|
||||
out = append(out, w)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// reconstructSingular builds the singular from a plural headword and the replacement
// suffix taken from "ед. -<suffix>".
//
// It looks for the position k in headword where the text starting at k shares the
// longest common prefix with suffix, then returns headword[:k] + suffix. When several
// positions share an equally long prefix the rightmost one wins, because the
// replacement suffix belongs to the tail of the word: "малайцы" + "аец" must give
// "малаец", not "маец". It returns "" when no position matches even the first letter of
// suffix.
//
// This is a heuristic; Stage 2 re-checks the words against real dictionaries.
func reconstructSingular(headword, suffix string) string {
	hw, sf := []rune(headword), []rune(suffix)
	bestK, bestLen := -1, 0
	for k := range hw {
		m := 0
		for k+m < len(hw) && m < len(sf) && hw[k+m] == sf[m] {
			m++
		}
		// ">=" lets a later position replace an earlier one of the same overlap
		// length; m > 0 keeps positions with no overlap at all out of the running.
		if m > 0 && m >= bestLen {
			bestK, bestLen = k, m
		}
	}
	if bestK < 0 {
		return ""
	}
	return string(hw[:bestK]) + suffix
}
|
||||
|
||||
// headwordNotes is the set of grammatical notes (written without the trailing period)
// that mark a parallel headword — a lemma — after "и", as opposed to an inflected form.
// A "-" inflection ending marks one too, but that case is not stored here. Form labels
// such as деепр. (gerund) or сравн. (comparative) are deliberately left out.
var headwordNotes = map[string]bool{
	"нескл":   true,
	"неизм":   true,
	"предлог": true,
	"предл":   true,
	"нареч":   true,
	"нар":     true,
	"прил":    true,
	"союз":    true,
	"частица": true,
	"част":    true,
	"межд":    true,
	"мн":      true,
	"ед":      true,
	"тв":      true,
	"числ":    true,
	"мест":    true,
	"м":       true,
	"ж":       true,
	"с":       true,
	"вводн":   true,
	"сказ":    true,
}
|
||||
|
||||
// variantNoteOK reports whether the note following a candidate variant marks a headword:
|
||||
// a "-" inflection ending or one of headwordNotes (and not a bare inflected word).
|
||||
func variantNoteOK(note string) bool {
|
||||
if strings.HasPrefix(note, "-") {
|
||||
return true
|
||||
}
|
||||
var stem []rune
|
||||
for _, r := range note {
|
||||
if (r >= 'а' && r <= 'я') || r == 'ё' {
|
||||
stem = append(stem, r)
|
||||
} else {
|
||||
break
|
||||
}
|
||||
}
|
||||
return headwordNotes[string(stem)]
|
||||
}
|
||||
|
||||
// variants returns the second (and further) headwords of an entry, written as a parallel
|
||||
// form after " и ", e.g. "аблатив, -а и аблятив, -а" yields "аблятив" and "регги и реггей,
|
||||
// нескл." yields "реггей". Requiring a headword note after the comma keeps this from
|
||||
// matching "и" inside examples or picking up inflected forms.
|
||||
func variants(line string) []string {
|
||||
var out []string
|
||||
const sep = " и "
|
||||
for i := 0; ; {
|
||||
j := strings.Index(line[i:], sep)
|
||||
if j < 0 {
|
||||
break
|
||||
}
|
||||
i += j + len(sep)
|
||||
rest := line[i:]
|
||||
var run []rune
|
||||
consumed := 0
|
||||
for _, r := range rest {
|
||||
if isCyrLetter(r) || isStress(r) {
|
||||
run = append(run, r)
|
||||
consumed += len(string(r))
|
||||
} else {
|
||||
break
|
||||
}
|
||||
}
|
||||
if len(run) == 0 {
|
||||
continue
|
||||
}
|
||||
after := rest[consumed:]
|
||||
if !strings.HasPrefix(after, ", ") || !variantNoteOK(after[len(", "):]) {
|
||||
continue
|
||||
}
|
||||
if w, ok := cleanWord(run); ok && len([]rune(w)) >= 2 {
|
||||
out = append(out, w)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// normToken normalizes any token (a run of letters and stress marks) for the skip set:
|
||||
// lowercase, stress removed, kept only if it is 2+ all-Cyrillic letters. Unlike
|
||||
// cleanWord it does NOT reject capitalized tokens — a lowercased proper noun belongs in
|
||||
// the skip set so it can be re-checked by a morphological analyzer.
|
||||
func normToken(run []rune) (string, bool) {
|
||||
var b strings.Builder
|
||||
for _, r := range run {
|
||||
if isStress(r) {
|
||||
continue
|
||||
}
|
||||
b.WriteRune(unicode.ToLower(r))
|
||||
}
|
||||
w := b.String()
|
||||
if len([]rune(w)) < 2 {
|
||||
return "", false
|
||||
}
|
||||
for _, r := range w {
|
||||
if !((r >= 'а' && r <= 'я') || r == 'ё') {
|
||||
return "", false
|
||||
}
|
||||
}
|
||||
return w, true
|
||||
}
|
||||
|
||||
// tokens returns every maximal run of Cyrillic letters (plus stress marks) in the line,
|
||||
// normalized; runs are split on every other character (so hyphens split a word).
|
||||
func tokens(line string) []string {
|
||||
var out []string
|
||||
var run []rune
|
||||
flush := func() {
|
||||
if len(run) > 0 {
|
||||
if w, ok := normToken(run); ok {
|
||||
out = append(out, w)
|
||||
}
|
||||
run = run[:0]
|
||||
}
|
||||
}
|
||||
for _, r := range line {
|
||||
if isCyrLetter(r) || isStress(r) {
|
||||
run = append(run, r)
|
||||
} else {
|
||||
flush()
|
||||
}
|
||||
}
|
||||
flush()
|
||||
return out
|
||||
}
|
||||
|
||||
func lessRu(a, b string) bool {
|
||||
ra, rb := []rune(a), []rune(b)
|
||||
for i := 0; i < len(ra) && i < len(rb); i++ {
|
||||
if ra[i] != rb[i] {
|
||||
return ruRank[ra[i]] < ruRank[rb[i]]
|
||||
}
|
||||
}
|
||||
return len(ra) < len(rb)
|
||||
}
|
||||
|
||||
func sortedRu(set map[string]struct{}) []string {
|
||||
words := make([]string, 0, len(set))
|
||||
for w := range set {
|
||||
words = append(words, w)
|
||||
}
|
||||
sort.Slice(words, func(i, j int) bool { return lessRu(words[i], words[j]) })
|
||||
return words
|
||||
}
|
||||
|
||||
// writeWords writes words to the file at path, one per line, each terminated by LF. The
// parent directory is created first (mode 0o755) unless path has none, and an existing
// file is truncated. It returns the first error from creating the directory, creating
// the file, flushing the buffered output, or closing the file.
func writeWords(path string, words []string) error {
	switch dir := filepath.Dir(path); dir {
	case "", ".":
		// No parent directory to create.
	default:
		if err := os.MkdirAll(dir, 0o755); err != nil {
			return err
		}
	}
	file, err := os.Create(path)
	if err != nil {
		return err
	}
	buf := bufio.NewWriter(file)
	for i := range words {
		// A bufio.Writer remembers its first write error and Flush reports it, so the
		// individual results are not checked here.
		buf.WriteString(words[i])
		buf.WriteByte('\n')
	}
	flushErr := buf.Flush()
	closeErr := file.Close()
	if flushErr != nil {
		return flushErr
	}
	return closeErr
}
|
||||
|
||||
func main() {
|
||||
in := flag.String("in", "dictprep/russian/orfo_dict_2025.txt", "plain-text dictionary (pdftotext output)")
|
||||
out := flag.String("out", "dictprep/russian/all.txt", "output: the base word list (clean headwords + reconstructed singulars + variants)")
|
||||
skip := flag.String("skip", "/tmp/ru_skip.txt", "output: every other token, for a later morphology re-check")
|
||||
sings := flag.String("singulars", "/tmp/ru_singulars.txt", "output: singulars reconstructed from \"ед.\" (known nouns)")
|
||||
varsOut := flag.String("variants", "/tmp/ru_variants.txt", "output: variant pairs joined by \"и\" (primary<TAB>variant)")
|
||||
from := flag.Int("from", 452, "first line of the word-list section (1-based, inclusive)")
|
||||
to := flag.Int("to", 168808, "last line of the word-list section (inclusive)")
|
||||
flag.Parse()
|
||||
if *in == "" {
|
||||
log.Fatal("ruwords: -in is required")
|
||||
}
|
||||
|
||||
f, err := os.Open(*in)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
all := make(map[string]struct{})
|
||||
allTokens := make(map[string]struct{})
|
||||
singulars := make(map[string]struct{})
|
||||
variantPairs := make(map[string]struct{})
|
||||
entries, fromHead, fromSing, fromVar := 0, 0, 0, 0
|
||||
sc := bufio.NewScanner(f)
|
||||
sc.Buffer(make([]byte, 1<<20), 1<<20)
|
||||
for line := 0; sc.Scan(); {
|
||||
line++
|
||||
if line < *from || line > *to {
|
||||
continue
|
||||
}
|
||||
entries++
|
||||
text := sc.Text()
|
||||
hw, hwOK := headword(text)
|
||||
var sings []string
|
||||
if hwOK {
|
||||
sings = embeddedSingulars(text, hw)
|
||||
}
|
||||
primary := ""
|
||||
if len(sings) > 0 {
|
||||
// the headword is plural and the entry gives its singular: keep only the singular
|
||||
primary = sings[0]
|
||||
for _, w := range sings {
|
||||
if _, seen := all[w]; !seen {
|
||||
fromSing++
|
||||
all[w] = struct{}{}
|
||||
}
|
||||
singulars[w] = struct{}{}
|
||||
}
|
||||
} else if hwOK {
|
||||
primary = hw
|
||||
if _, seen := all[hw]; !seen {
|
||||
fromHead++
|
||||
}
|
||||
all[hw] = struct{}{}
|
||||
}
|
||||
for _, w := range variants(text) {
|
||||
if _, seen := all[w]; !seen {
|
||||
fromVar++
|
||||
all[w] = struct{}{}
|
||||
}
|
||||
if primary != "" && primary != w {
|
||||
variantPairs[primary+"\t"+w] = struct{}{}
|
||||
}
|
||||
}
|
||||
for _, w := range tokens(text) {
|
||||
allTokens[w] = struct{}{}
|
||||
}
|
||||
}
|
||||
if err := sc.Err(); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
skipSet := make(map[string]struct{})
|
||||
for w := range allTokens {
|
||||
if _, ok := all[w]; !ok {
|
||||
skipSet[w] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
allWords := sortedRu(all)
|
||||
skipWords := sortedRu(skipSet)
|
||||
if err := writeWords(*out, allWords); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
if err := writeWords(*skip, skipWords); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
if err := writeWords(*sings, sortedRu(singulars)); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
pairList := make([]string, 0, len(variantPairs))
|
||||
for p := range variantPairs {
|
||||
pairList = append(pairList, p)
|
||||
}
|
||||
sort.Strings(pairList)
|
||||
if err := writeWords(*varsOut, pairList); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
fmt.Printf("scanned %d entries\n", entries)
|
||||
fmt.Printf(" %-20s %7d words (%d headwords + %d embedded singulars + %d variants)\n", *out, len(allWords), fromHead, fromSing, fromVar)
|
||||
fmt.Printf(" %-20s %7d words (tokens not in %s; for a morphology re-check)\n", *skip, len(skipWords), *out)
|
||||
fmt.Printf(" %-20s %7d words (singulars from \"ед.\"; known nouns)\n", *sings, len(singulars))
|
||||
fmt.Printf(" %-20s %7d pairs (variants joined by \"и\")\n", *varsOut, len(variantPairs))
|
||||
}
|
||||
Reference in New Issue
Block a user