Files
scrabble-solver/dictprep/ruwords/main.go
T
Ilia Denisov 540ee32178 dictprep: Russian orthographic dictionary → Scrabble noun pipeline
Build a committed Russian common-noun word list (dictprep/russian/scrabble.txt)
from the RAN orthographic dictionary, for the Эрудит ruleset.

- Stage 1 (Go, dictprep/ruwords): orfo_dict_2025.txt -> all.txt; extracts
  headwords, reconstructs "ед." singulars (suppressing plurals), pairs "и" variants.
- Stage 2 (Python brain, dictprep/ru_stage2.py): OpenCorpora (mawo-pymorphy3) +
  libmorph + orthographic notes select common nouns (nom. sing.); --trace explains
  a word's fate, --dump writes the in-memory buckets.
- libmorph C++ bridge (libmorph_check.cpp); manual_confirm.txt is merged in.
- orfo_dict_2025.txt is the committed pdftotext source of truth.
- See dictprep/README.md for methodology and reproducibility.
2026-06-01 23:27:17 +02:00

435 lines
13 KiB
Go
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// Command ruwords extracts a clean Cyrillic word list from the plain text of a Russian
// orthographic dictionary (the output of `pdftotext`).
//
// Stage 1 (this tool): from the column word-list section [from, to] it collects, per
// entry, the headword (the leading token). When the headword is plural and the entry
// gives its singular after "ед." — in full ("ящеры, …, ед. ящер") or as a replacement
// suffix ("…, ед. -вец") — only the singular is kept, since a plural that has a singular
// is never needed. It drops stress marks, lowercases, keeps ё, and discards proper nouns
// (capitalized), hyphenated words, acronyms and non-Cyrillic tokens. The result is
// de-duplicated and sorted in Russian alphabetical order (ё right after е), LF-separated.
//
// It also collects a variant headword joined by "и" when it carries its own grammatical
// note (e.g. "аблатив, -а и аблятив, -а"). Suffix-singular reconstruction is heuristic;
// Stage 2 (dictprep/ru_stage2.py) re-checks the words against real dictionaries.
//
// pdftotext dictprep/orfo_dict_2025.pdf /tmp/slov.txt
// go run ./dictprep/ruwords -in /tmp/slov.txt -from 452 -to 168808 \
// -out russian_all.txt -skip russian_skip.txt
package main
import (
"bufio"
"flag"
"fmt"
"log"
"os"
"path/filepath"
"sort"
"strings"
"unicode"
)
// ruAlphabet lists the 33 lowercase Russian letters in collation order, with ё placed
// immediately after е.
const ruAlphabet = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя"

// ruRank maps every letter of ruAlphabet to its zero-based position in that order, so
// two letters can be compared by rank rather than by code point.
var ruRank = func() map[rune]int {
	letters := []rune(ruAlphabet)
	rank := make(map[rune]int, len(letters))
	for pos := range letters {
		rank[letters[pos]] = pos
	}
	return rank
}()
// isCyrLetter reports whether r is a Russian Cyrillic letter of either case:
// а–я, А–Я, ё or Ё.
func isCyrLetter(r rune) bool {
	switch {
	case r == 'ё', r == 'Ё':
		return true
	case 'а' <= r && r <= 'я':
		return true
	case 'А' <= r && r <= 'Я':
		return true
	}
	return false
}
// isUpperCyr reports whether r is an uppercase Russian letter (А–Я or Ё).
func isUpperCyr(r rune) bool {
	if r == 'Ё' {
		return true
	}
	return 'А' <= r && r <= 'Я'
}
// isStress reports whether r is one of the two combining accents treated as stress
// marks: U+0300 (grave) or U+0301 (acute).
func isStress(r rune) bool {
	switch r {
	case '\u0300', '\u0301':
		return true
	default:
		return false
	}
}
// cleanWord turns a run of letters and stress marks into a lowercase Cyrillic word.
//
// Stress accents and soft hyphens (U+00AD) are dropped and the remaining runes are
// lowercased. ok is false, and the word is empty, when the run is empty, starts with an
// uppercase Cyrillic letter (a proper noun), contains a real hyphen or any other rune
// that is not а–я/ё after lowercasing, or has no letters left after the marks are
// removed.
func cleanWord(run []rune) (string, bool) {
	if len(run) == 0 {
		return "", false
	}
	if isUpperCyr(run[0]) {
		return "", false // capitalized: treated as a proper noun
	}
	var sb strings.Builder
	for _, r := range run {
		if isStress(r) || r == '\u00ad' {
			continue // stress accents and soft hyphens are not part of the word
		}
		lower := unicode.ToLower(r)
		if lower != 'ё' && (lower < 'а' || lower > 'я') {
			// Covers the real hyphen as well as any non-Cyrillic rune.
			return "", false
		}
		sb.WriteRune(lower)
	}
	if sb.Len() == 0 {
		return "", false
	}
	return sb.String(), true
}
// headword extracts the entry's headword: the leading run of Cyrillic letters, stress
// marks, hyphens and soft hyphens, normalized by cleanWord. ok is false when cleanWord
// rejects that run (including when the line does not start with such a run).
func headword(line string) (string, bool) {
	// Leading whitespace is removed first. This includes the form feed (U+000C) that
	// pdftotext emits at the top of each page; without it the first headword on every
	// page would be lost.
	trimmed := strings.TrimLeftFunc(line, unicode.IsSpace)
	end := len(trimmed)
	for pos, r := range trimmed {
		if !(isCyrLetter(r) || isStress(r) || r == '-' || r == '\u00ad') {
			end = pos
			break
		}
	}
	return cleanWord([]rune(trimmed[:end]))
}
// embeddedSingulars returns the singular forms of a plural headword that the entry
// spells out after the marker "ед.".
//
// Two notations are recognized after the marker (leading spaces, no-break spaces and
// tabs are skipped):
//   - a full word ("ед. ящер"), returned after cleanWord normalization;
//   - a replacement suffix ("ед. -вец"), spliced onto headword by reconstructSingular.
//
// A candidate is skipped when it normalizes to fewer than two letters (this excludes
// the gender marks м/ж/с) or, for the full-word form, when it is directly followed by
// "." (an abbreviation such as "ед. измер." rather than a singular form).
//
// The marker must stand on its own: an "ед." that is merely the tail of a longer
// abbreviation ("мед.", "след.", "пред.") is ignored, otherwise the word following it
// would be mistaken for a singular.
//
// line is the raw dictionary line; headword is its already normalized headword. The
// result is nil when nothing qualifies.
func embeddedSingulars(line, headword string) []string {
	const marker = "ед."

	isWordRune := func(r rune) bool { return isCyrLetter(r) || isStress(r) }
	// leadingWord returns the leading run of letters/stress marks of s together with
	// its length in bytes.
	leadingWord := func(s string) ([]rune, int) {
		end := len(s)
		for pos, r := range s {
			if !isWordRune(r) {
				end = pos
				break
			}
		}
		return []rune(s[:end]), end
	}

	var out []string
	for i := 0; ; {
		j := strings.Index(line[i:], marker)
		if j < 0 {
			break
		}
		start := i + j
		i = start + len(marker)

		// Word-boundary check: if trimming trailing word runes shortens the text
		// before the marker, the marker is glued to a preceding letter.
		if before := line[:start]; len(strings.TrimRightFunc(before, isWordRune)) != len(before) {
			continue
		}

		// The cutset is spelled with escapes so that no invisible character hides in
		// the literal: space, no-break space (U+00A0), tab.
		rest := strings.TrimLeft(line[i:], " \u00a0\t")

		if strings.HasPrefix(rest, "-") { // suffix form: reconstruct from the headword
			suf, _ := leadingWord(rest[len("-"):])
			if s, ok := cleanWord(suf); ok && len([]rune(s)) >= 2 {
				if recon := reconstructSingular(headword, s); recon != "" {
					out = append(out, recon)
				}
			}
			continue
		}

		run, consumed := leadingWord(rest)
		if len(run) == 0 {
			continue
		}
		if strings.HasPrefix(rest[consumed:], ".") {
			continue // an abbreviation like "ед. измер." rather than a singular form
		}
		w, ok := cleanWord(run)
		if !ok || len([]rune(w)) < 2 { // 2+ letters excludes the gender marks м/ж/с
			continue
		}
		out = append(out, w)
	}
	return out
}
// reconstructSingular builds a singular from a plural headword and the replacement
// suffix given as "ед. -<suffix>".
//
// For every start position in headword it measures how many leading runes of suffix
// match the headword from that position on. The headword is cut at the position with
// the longest match and suffix is appended there. When several positions match equally
// well, the leftmost one is used. If no position matches even one rune, the result is
// "".
//
// NOTE(review): the leftmost tie-break mis-splices when the suffix's first letter also
// occurs earlier in the headword (e.g. "нанайцы" + "аец" gives "наец"). This is a
// heuristic; Stage 2 re-checks the words against real dictionaries.
func reconstructSingular(headword, suffix string) string {
	word, tail := []rune(headword), []rune(suffix)

	// overlap counts the leading runes of tail that equal word[from:].
	overlap := func(from int) int {
		n := 0
		for from+n < len(word) && n < len(tail) && word[from+n] == tail[n] {
			n++
		}
		return n
	}

	cut, longest := -1, 0
	for pos := range word {
		if n := overlap(pos); n > longest {
			cut, longest = pos, n
		}
	}
	if cut == -1 {
		return ""
	}
	return string(word[:cut]) + suffix
}
// headwordNotes is the set of grammatical labels (written without the trailing period)
// that mark a parallel headword (a lemma) after "и", as opposed to an inflected form.
// Form labels such as деепр. (gerund) or сравн. (comparative) are deliberately absent.
var headwordNotes = func() map[string]bool {
	labels := []string{
		"нескл", "неизм", "предлог", "предл", "нареч",
		"нар", "прил", "союз", "частица", "част",
		"межд", "мн", "ед", "тв", "числ", "мест",
		"м", "ж", "с", "вводн", "сказ",
	}
	notes := make(map[string]bool, len(labels))
	for _, label := range labels {
		notes[label] = true
	}
	return notes
}()

// variantNoteOK reports whether note, the text that follows a candidate variant and its
// comma, marks that candidate as a headword. That is the case when note starts with "-"
// (an inflection ending) or when its leading run of lowercase Cyrillic letters is one of
// headwordNotes. A bare inflected word, or a note with no leading lowercase letters,
// yields false.
func variantNoteOK(note string) bool {
	if strings.HasPrefix(note, "-") {
		return true
	}
	end := len(note)
	for pos, r := range note {
		if r != 'ё' && (r < 'а' || r > 'я') {
			end = pos
			break
		}
	}
	return headwordNotes[note[:end]]
}
// variants returns the second (and further) headwords of an entry, i.e. parallel forms
// written after " и ". For example "аблатив, -а и аблятив, -а" yields "аблятив", and
// "регги и реггей, нескл." yields "реггей".
//
// A candidate is the run of letters/stress marks right after " и ". It is accepted only
// when it is followed by ", " and a note that variantNoteOK approves, and when it
// normalizes (cleanWord) to at least two letters. Requiring the note keeps "и" inside
// examples, and inflected forms, from being picked up.
func variants(line string) []string {
	const sep = " и "
	var found []string
	for rest := line; ; {
		at := strings.Index(rest, sep)
		if at < 0 {
			break
		}
		// Continue right after the separator; the next search starts here as well.
		rest = rest[at+len(sep):]

		end := len(rest)
		for pos, r := range rest {
			if !(isCyrLetter(r) || isStress(r)) {
				end = pos
				break
			}
		}
		if end == 0 {
			continue
		}
		after := rest[end:]
		if !strings.HasPrefix(after, ", ") {
			continue
		}
		if !variantNoteOK(after[len(", "):]) {
			continue
		}
		w, ok := cleanWord([]rune(rest[:end]))
		if ok && len([]rune(w)) >= 2 {
			found = append(found, w)
		}
	}
	return found
}
// normToken normalizes an arbitrary token (a run of letters and stress marks) for the
// skip set: stress marks are removed and the rest is lowercased. ok is true only when
// the result is at least two letters long and consists solely of а–я/ё.
//
// Unlike cleanWord it does NOT reject capitalized tokens: a lowercased proper noun
// belongs in the skip set so that a morphological analyzer can re-check it.
func normToken(run []rune) (string, bool) {
	var sb strings.Builder
	letters := 0
	for _, r := range run {
		if isStress(r) {
			continue
		}
		lower := unicode.ToLower(r)
		if lower != 'ё' && (lower < 'а' || lower > 'я') {
			return "", false
		}
		sb.WriteRune(lower)
		letters++
	}
	if letters < 2 {
		return "", false
	}
	return sb.String(), true
}
// tokens returns every maximal run of Cyrillic letters (plus stress marks) found in
// line, normalized by normToken; runs that normToken rejects are omitted. Any other
// character ends a run, so a hyphenated word is split into its parts.
func tokens(line string) []string {
	var out []string
	start := -1 // byte offset where the current run began, or -1 outside a run

	// emit closes the run that ends at byte offset end, if one is open.
	emit := func(end int) {
		if start < 0 {
			return
		}
		if w, ok := normToken([]rune(line[start:end])); ok {
			out = append(out, w)
		}
		start = -1
	}

	for pos, r := range line {
		if isCyrLetter(r) || isStress(r) {
			if start < 0 {
				start = pos
			}
			continue
		}
		emit(pos)
	}
	emit(len(line))
	return out
}
// lessRu reports whether a sorts before b in Russian alphabetical order. The strings
// are compared rune by rune using ruRank; at the first differing position the lower
// rank wins. If one string is a prefix of the other, the shorter one sorts first.
//
// Runes missing from ruRank read as rank 0 (the map's zero value), so the function is
// only meaningful for words made of lowercase Russian letters.
func lessRu(a, b string) bool {
	left, right := []rune(a), []rune(b)
	shared := len(left)
	if len(right) < shared {
		shared = len(right)
	}
	for i := 0; i < shared; i++ {
		if x, y := left[i], right[i]; x != y {
			return ruRank[x] < ruRank[y]
		}
	}
	return len(left) < len(right)
}
// sortedRu returns the members of set as a new slice sorted with lessRu (Russian
// alphabetical order).
func sortedRu(set map[string]struct{}) []string {
	words := make([]string, len(set))
	next := 0
	for member := range set {
		words[next] = member
		next++
	}
	sort.Slice(words, func(a, b int) bool {
		return lessRu(words[a], words[b])
	})
	return words
}
// writeWords writes words to the file at path, one per line, each terminated by LF.
// The parent directory is created first when path names one; an existing file is
// truncated. It returns the first error from creating the directory or file, flushing
// the buffered output, or closing the file.
func writeWords(path string, words []string) error {
	switch dir := filepath.Dir(path); dir {
	case "", ".":
		// Nothing to create: the file goes into the current directory.
	default:
		if err := os.MkdirAll(dir, 0o755); err != nil {
			return err
		}
	}

	file, err := os.Create(path)
	if err != nil {
		return err
	}

	buf := bufio.NewWriter(file)
	for i := range words {
		// Write errors are not checked here: a bufio.Writer remembers the first
		// error and reports it again from Flush.
		buf.WriteString(words[i])
		buf.WriteByte('\n')
	}
	if flushErr := buf.Flush(); flushErr != nil {
		file.Close()
		return flushErr
	}
	return file.Close()
}
// main runs Stage 1: it scans lines [from, to] of the input text, collects the base
// word list (headwords, singulars given after "ед.", and "и" variants), and writes four
// files: the base list, the skip list (every other token), the singulars, and the
// variant pairs. A short summary is printed to standard output. Any I/O error is fatal.
func main() {
	inPath := flag.String("in", "dictprep/russian/orfo_dict_2025.txt", "plain-text dictionary (pdftotext output)")
	outPath := flag.String("out", "dictprep/russian/all.txt", "output: the base word list (clean headwords + reconstructed singulars + variants)")
	skipPath := flag.String("skip", "/tmp/ru_skip.txt", "output: every other token, for a later morphology re-check")
	singPath := flag.String("singulars", "/tmp/ru_singulars.txt", "output: singulars reconstructed from \"ед.\" (known nouns)")
	pairPath := flag.String("variants", "/tmp/ru_variants.txt", "output: variant pairs joined by \"и\" (primary<TAB>variant)")
	firstLine := flag.Int("from", 452, "first line of the word-list section (1-based, inclusive)")
	lastLine := flag.Int("to", 168808, "last line of the word-list section (inclusive)")
	flag.Parse()
	if *inPath == "" {
		log.Fatal("ruwords: -in is required")
	}

	src, err := os.Open(*inPath)
	if err != nil {
		log.Fatal(err)
	}
	defer src.Close()

	var (
		all          = make(map[string]struct{}) // the base word list
		allTokens    = make(map[string]struct{}) // every normalized token seen in the section
		singulars    = make(map[string]struct{}) // singulars taken from "ед."
		variantPairs = make(map[string]struct{}) // "primary<TAB>variant" records
	)
	var entries, fromHead, fromSing, fromVar int

	// admit adds w to the base list and reports whether it was not there before.
	admit := func(w string) bool {
		if _, dup := all[w]; dup {
			return false
		}
		all[w] = struct{}{}
		return true
	}

	sc := bufio.NewScanner(src)
	sc.Buffer(make([]byte, 1<<20), 1<<20)
	lineNo := 0
	for sc.Scan() {
		lineNo++
		if lineNo < *firstLine || lineNo > *lastLine {
			continue
		}
		entries++
		text := sc.Text()

		hw, hwOK := headword(text)
		var entrySings []string
		if hwOK {
			entrySings = embeddedSingulars(text, hw)
		}

		primary := ""
		switch {
		case len(entrySings) > 0:
			// The headword is plural and the entry gives its singular: keep only the
			// singular(s); the plural headword itself is not added.
			primary = entrySings[0]
			for _, s := range entrySings {
				if admit(s) {
					fromSing++
				}
				singulars[s] = struct{}{}
			}
		case hwOK:
			primary = hw
			if admit(hw) {
				fromHead++
			}
		}

		for _, v := range variants(text) {
			if admit(v) {
				fromVar++
			}
			if primary != "" && primary != v {
				variantPairs[primary+"\t"+v] = struct{}{}
			}
		}
		for _, tok := range tokens(text) {
			allTokens[tok] = struct{}{}
		}
	}
	if err := sc.Err(); err != nil {
		log.Fatal(err)
	}

	// The skip set is every token that did not make it into the base list.
	skipSet := make(map[string]struct{})
	for tok := range allTokens {
		if _, kept := all[tok]; kept {
			continue
		}
		skipSet[tok] = struct{}{}
	}

	allWords := sortedRu(all)
	skipWords := sortedRu(skipSet)
	pairList := make([]string, 0, len(variantPairs))
	for pair := range variantPairs {
		pairList = append(pairList, pair)
	}
	sort.Strings(pairList)

	must := func(err error) {
		if err != nil {
			log.Fatal(err)
		}
	}
	must(writeWords(*outPath, allWords))
	must(writeWords(*skipPath, skipWords))
	must(writeWords(*singPath, sortedRu(singulars)))
	must(writeWords(*pairPath, pairList))

	fmt.Printf("scanned %d entries\n", entries)
	fmt.Printf(" %-20s %7d words (%d headwords + %d embedded singulars + %d variants)\n", *outPath, len(allWords), fromHead, fromSing, fromVar)
	fmt.Printf(" %-20s %7d words (tokens not in %s; for a morphology re-check)\n", *skipPath, len(skipWords), *outPath)
	fmt.Printf(" %-20s %7d words (singulars from \"ед.\"; known nouns)\n", *singPath, len(singulars))
	fmt.Printf(" %-20s %7d pairs (variants joined by \"и\")\n", *pairPath, len(variantPairs))
}