// Command ruwords extracts a clean Cyrillic word list from the plain text of a Russian
// orthographic dictionary (the output of `pdftotext`).
//
// Stage 1 (this tool): from the column word-list section [from, to] it collects, per
// entry, the headword (the leading token). When the headword is plural and the entry
// gives its singular after "ед." — in full ("ящеры, …, ед. ящер") or as a replacement
// suffix ("…, ед. -вец") — only the singular is kept, since a plural that has a singular
// is never needed. It drops stress marks, lowercases, keeps ё, and discards proper nouns
// (capitalized), hyphenated words, acronyms and non-Cyrillic tokens. The result is
// de-duplicated and sorted in Russian alphabetical order (ё right after е), LF-separated.
//
// It also collects a variant headword joined by "и" when it carries its own grammatical
// note (e.g. "аблатив, -а и аблятив, -а"). Suffix-singular reconstruction is heuristic;
// Stage 2 (dictprep/ru_stage2.py) re-checks the words against real dictionaries.
//
//	pdftotext dictprep/orfo_dict_2025.pdf /tmp/slov.txt
//	go run ./dictprep/ruwords -in /tmp/slov.txt -from 452 -to 168808 \
//	    -out russian_all.txt -skip russian_skip.txt
package main

import (
	"bufio"
	"flag"
	"fmt"
	"log"
	"os"
	"path/filepath"
	"sort"
	"strings"
	"unicode"
)

// ruAlphabet is the Russian alphabet in collation order (ё directly after е).
const ruAlphabet = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя"

// softHyphen is U+00AD, an invisible hyphenation hint that may sit inside a word. It is
// written as an escape so that it stays visible in the source.
const softHyphen = '\u00ad'

// ruRank maps each lowercase Russian letter to its position in ruAlphabet.
var ruRank = func() map[rune]int {
	letters := []rune(ruAlphabet)
	m := make(map[rune]int, len(letters))
	for i, r := range letters {
		m[r] = i
	}
	return m
}()

// isLowerCyr reports whether r is a lowercase Russian letter (а–я or ё).
func isLowerCyr(r rune) bool {
	return (r >= 'а' && r <= 'я') || r == 'ё'
}

// isUpperCyr reports whether r is an uppercase Russian letter (А–Я or Ё).
func isUpperCyr(r rune) bool {
	return (r >= 'А' && r <= 'Я') || r == 'Ё'
}

// isCyrLetter reports whether r is a Russian letter of either case.
func isCyrLetter(r rune) bool {
	return isLowerCyr(r) || isUpperCyr(r)
}

// isStress reports whether r is a combining grave (U+0300) or acute (U+0301) accent,
// the marks treated as stress marks here.
func isStress(r rune) bool {
	return r == 0x0300 || r == 0x0301
}

// isWordRune reports whether r can be part of a written word: a Russian letter, a
// stress mark or a soft hyphen. Every scanner uses it, so that a soft hyphen never
// cuts a word in two in one place while being joined over in another.
func isWordRune(r rune) bool {
	return isCyrLetter(r) || isStress(r) || r == softHyphen
}

// leadingRun returns the longest prefix of s that consists of word runes, together
// with the length of that prefix in bytes.
func leadingRun(s string) ([]rune, int) {
	end := strings.IndexFunc(s, func(r rune) bool { return !isWordRune(r) })
	if end < 0 {
		end = len(s)
	}
	return []rune(s[:end]), end
}

// cleanWord normalizes a run of letters/stress-marks into a lowercase Cyrillic word, or
// returns ok=false for proper nouns (capitalized), hyphenated or non-Cyrillic runs.
func cleanWord(run []rune) (string, bool) {
	if len(run) == 0 || isUpperCyr(run[0]) {
		return "", false
	}
	var b strings.Builder
	for _, r := range run {
		switch {
		case isStress(r), r == softHyphen:
			// Stress accents and soft hyphens are not part of the spelling: drop them.
		case r == '-':
			// A real hyphen means a hyphenated word: reject it.
			return "", false
		default:
			r = unicode.ToLower(r)
			if !isLowerCyr(r) {
				return "", false
			}
			b.WriteRune(r)
		}
	}
	if b.Len() == 0 {
		// The run held nothing but stress marks and soft hyphens.
		return "", false
	}
	return b.String(), true
}

// headword returns the entry's headword: the leading run of letters, stress marks and
// hyphens, normalized by cleanWord.
func headword(line string) (string, bool) {
	// Trim leading whitespace, including the form-feed (U+000C) that pdftotext puts at
	// the top of each page — otherwise the first headword on every page is lost.
	line = strings.TrimLeftFunc(line, unicode.IsSpace)
	var run []rune
	for _, r := range line {
		if !isWordRune(r) && r != '-' {
			break
		}
		run = append(run, r)
	}
	return cleanWord(run)
}

// embeddedSingulars returns the singular form of a plural headword spelled out after
// "ед.", either in full ("ед. ящер") or as a replacement suffix ("ед. -вец",
// reconstructed from headword). It skips gender marks ("ед. м") and abbreviations that
// merely start with "ед." ("ед. измер.", "ден. ед.").
//
// "ед." only counts when it starts at a word boundary, so the tail of a longer
// abbreviation such as "след." is not mistaken for it.
func embeddedSingulars(line, headword string) []string {
	const (
		mark   = "ед."
		blanks = " \u00a0\t" // space, no-break space, tab
	)
	var out []string
	for i := 0; ; {
		j := strings.Index(line[i:], mark)
		if j < 0 {
			break
		}
		start := i + j
		i = start + len(mark)

		// Reject a match that is glued to a preceding letter.
		if before := line[:start]; strings.TrimRightFunc(before, isWordRune) != before {
			continue
		}

		rest := strings.TrimLeft(line[i:], blanks)
		if strings.HasPrefix(rest, "-") {
			// Suffix form: reconstruct the singular from the headword.
			suf, _ := leadingRun(rest[len("-"):])
			if s, ok := cleanWord(suf); ok && len([]rune(s)) >= 2 {
				if recon := reconstructSingular(headword, s); recon != "" {
					out = append(out, recon)
				}
			}
			continue
		}

		run, n := leadingRun(rest)
		if len(run) == 0 {
			continue
		}
		if strings.HasPrefix(rest[n:], ".") {
			continue // an abbreviation like "ед. измер." rather than a singular form
		}
		w, ok := cleanWord(run)
		if !ok || len([]rune(w)) < 2 { // 2+ letters excludes the gender marks м/ж/с
			continue
		}
		out = append(out, w)
	}
	return out
}

// reconstructSingular builds the singular from a plural headword and the replacement
// suffix from "ед. -", splicing where the suffix best overlaps the tail of the
// headword: the position whose following letters share the longest common prefix with
// the suffix. Among equally long overlaps the rightmost one wins, because the suffix
// replaces the end of the word ("индонезийцы" + "иец" → "индонезиец", not "иец").
// It returns "" when the suffix's first letter does not occur in the headword.
// It is a heuristic; Stage 2 re-checks the words against real dictionaries.
func reconstructSingular(headword, suffix string) string {
	hw, sf := []rune(headword), []rune(suffix)
	bestK, bestLen := -1, 0
	for k := range hw {
		m := 0
		for k+m < len(hw) && m < len(sf) && hw[k+m] == sf[m] {
			m++
		}
		// ">=" (rather than ">") lets a later position win a tie.
		if m > 0 && m >= bestLen {
			bestK, bestLen = k, m
		}
	}
	if bestK < 0 {
		return ""
	}
	return string(hw[:bestK]) + suffix
}

// headwordNotes are the grammatical notes that mark a parallel headword (a lemma) after
// "и", as opposed to an inflected form. A "-" ending also marks one; form labels such as
// деепр. (gerund) or сравн. (comparative) deliberately do not.
var headwordNotes = map[string]bool{
	"нескл": true, "неизм": true, "предлог": true, "предл": true,
	"нареч": true, "нар": true, "прил": true, "союз": true,
	"частица": true, "част": true, "межд": true, "мн": true,
	"ед": true, "тв": true, "числ": true, "мест": true,
	"м": true, "ж": true, "с": true, "вводн": true, "сказ": true,
}

// variantNoteOK reports whether the note following a candidate variant marks a headword:
// a "-" inflection ending or one of headwordNotes (and not a bare inflected word).
func variantNoteOK(note string) bool {
	if strings.HasPrefix(note, "-") {
		return true
	}
	// The note's stem is its leading run of lowercase letters ("нескл." → "нескл").
	end := strings.IndexFunc(note, func(r rune) bool { return !isLowerCyr(r) })
	if end < 0 {
		end = len(note)
	}
	return headwordNotes[note[:end]]
}

// variants returns the second (and further) headwords of an entry, written as a parallel
// form after " и ", e.g. "аблатив, -а и аблятив, -а" yields "аблятив" and "регги и реггей,
// нескл." yields "реггей". Requiring a headword note after the comma keeps this from
// matching "и" inside examples or picking up inflected forms.
func variants(line string) []string {
	const (
		sep   = " и "
		comma = ", "
	)
	var out []string
	for i := 0; ; {
		j := strings.Index(line[i:], sep)
		if j < 0 {
			break
		}
		i += j + len(sep)
		rest := line[i:]

		run, n := leadingRun(rest)
		if len(run) == 0 {
			continue
		}
		after := rest[n:]
		if !strings.HasPrefix(after, comma) || !variantNoteOK(after[len(comma):]) {
			continue
		}
		if w, ok := cleanWord(run); ok && len([]rune(w)) >= 2 {
			out = append(out, w)
		}
	}
	return out
}

// normToken normalizes any token (a run of letters and stress marks) for the skip set:
// lowercase, stress marks and soft hyphens removed, kept only if it is 2+ all-Cyrillic
// letters. Unlike cleanWord it does NOT reject capitalized tokens — a lowercased proper
// noun belongs in the skip set so it can be re-checked by a morphological analyzer.
func normToken(run []rune) (string, bool) {
	var b strings.Builder
	letters := 0
	for _, r := range run {
		if isStress(r) || r == softHyphen {
			continue
		}
		r = unicode.ToLower(r)
		if !isLowerCyr(r) {
			return "", false
		}
		b.WriteRune(r)
		letters++
	}
	if letters < 2 {
		return "", false
	}
	return b.String(), true
}

// tokens returns every maximal run of Cyrillic letters (plus stress marks and soft
// hyphens) in the line, normalized by normToken; runs are split on every other
// character (so real hyphens split a word).
func tokens(line string) []string {
	var out []string
	var run []rune
	flush := func() {
		if len(run) == 0 {
			return
		}
		if w, ok := normToken(run); ok {
			out = append(out, w)
		}
		run = run[:0] // normToken copied the runes into a string, so reuse is safe
	}
	for _, r := range line {
		if isWordRune(r) {
			run = append(run, r)
		} else {
			flush()
		}
	}
	flush()
	return out
}

// lessRu reports whether a sorts before b in Russian alphabetical order. Runes are
// compared by their ruRank; when one word is a prefix of the other, the shorter one
// sorts first.
func lessRu(a, b string) bool {
	ra, rb := []rune(a), []rune(b)
	for i := 0; i < len(ra) && i < len(rb); i++ {
		if ra[i] != rb[i] {
			return ruRank[ra[i]] < ruRank[rb[i]]
		}
	}
	return len(ra) < len(rb)
}

// sortedRu returns the members of set sorted with lessRu.
func sortedRu(set map[string]struct{}) []string {
	words := make([]string, 0, len(set))
	for w := range set {
		words = append(words, w)
	}
	sort.Slice(words, func(i, j int) bool { return lessRu(words[i], words[j]) })
	return words
}

// writeWords writes words to path, one per line with LF line endings, creating the
// parent directory if needed. It returns the first error from creating, writing or
// closing the file.
func writeWords(path string, words []string) error {
	if dir := filepath.Dir(path); dir != "." {
		if err := os.MkdirAll(dir, 0o755); err != nil {
			return err
		}
	}
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	w := bufio.NewWriter(f)
	for _, word := range words {
		// bufio.Writer remembers the first write error and Flush reports it, so the
		// individual results need no checking here.
		w.WriteString(word)
		w.WriteByte('\n')
	}
	if err := w.Flush(); err != nil {
		f.Close()
		return err
	}
	return f.Close()
}

// main scans lines [from, to] of the input and writes four files: the base word list
// (-out), every other token seen (-skip), the singulars taken from "ед." (-singulars)
// and the primary/variant pairs (-variants). It then prints a summary of the counts.
func main() {
	in := flag.String("in", "dictprep/russian/orfo_dict_2025.txt", "plain-text dictionary (pdftotext output)")
	out := flag.String("out", "dictprep/russian/all.txt", "output: the base word list (clean headwords + reconstructed singulars + variants)")
	skip := flag.String("skip", "/tmp/ru_skip.txt", "output: every other token, for a later morphology re-check")
	singsOut := flag.String("singulars", "/tmp/ru_singulars.txt", "output: singulars reconstructed from \"ед.\" (known nouns)")
	varsOut := flag.String("variants", "/tmp/ru_variants.txt", "output: variant pairs joined by \"и\" (primary<TAB>variant)")
	from := flag.Int("from", 452, "first line of the word-list section (1-based, inclusive)")
	to := flag.Int("to", 168808, "last line of the word-list section (inclusive)")
	flag.Parse()
	if *in == "" {
		log.Fatal("ruwords: -in is required")
	}

	f, err := os.Open(*in)
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	all := make(map[string]struct{})          // the base word list
	allTokens := make(map[string]struct{})    // every normalized token in the section
	singulars := make(map[string]struct{})    // singulars taken from "ед."
	variantPairs := make(map[string]struct{}) // "primary\tvariant"
	entries, fromHead, fromSing, fromVar := 0, 0, 0, 0

	sc := bufio.NewScanner(f)
	sc.Buffer(make([]byte, 1<<20), 1<<20)
	for line := 0; sc.Scan(); {
		line++
		if line < *from {
			continue
		}
		if line > *to {
			break // nothing after the section is used, so stop reading
		}
		entries++
		text := sc.Text()

		hw, hwOK := headword(text)
		var forms []string
		if hwOK {
			forms = embeddedSingulars(text, hw)
		}

		primary := ""
		switch {
		case len(forms) > 0:
			// The headword is plural and the entry gives its singular: keep only the
			// singular.
			primary = forms[0]
			for _, w := range forms {
				if _, seen := all[w]; !seen {
					fromSing++
					all[w] = struct{}{}
				}
				singulars[w] = struct{}{}
			}
		case hwOK:
			primary = hw
			if _, seen := all[hw]; !seen {
				fromHead++
			}
			all[hw] = struct{}{}
		}

		for _, w := range variants(text) {
			if _, seen := all[w]; !seen {
				fromVar++
				all[w] = struct{}{}
			}
			if primary != "" && primary != w {
				variantPairs[primary+"\t"+w] = struct{}{}
			}
		}
		for _, w := range tokens(text) {
			allTokens[w] = struct{}{}
		}
	}
	if err := sc.Err(); err != nil {
		log.Fatal(err)
	}

	// The skip set is every token that did not make it into the base list.
	skipSet := make(map[string]struct{})
	for w := range allTokens {
		if _, ok := all[w]; !ok {
			skipSet[w] = struct{}{}
		}
	}

	allWords := sortedRu(all)
	skipWords := sortedRu(skipSet)
	if err := writeWords(*out, allWords); err != nil {
		log.Fatal(err)
	}
	if err := writeWords(*skip, skipWords); err != nil {
		log.Fatal(err)
	}
	if err := writeWords(*singsOut, sortedRu(singulars)); err != nil {
		log.Fatal(err)
	}
	pairList := make([]string, 0, len(variantPairs))
	for p := range variantPairs {
		pairList = append(pairList, p)
	}
	sort.Strings(pairList)
	if err := writeWords(*varsOut, pairList); err != nil {
		log.Fatal(err)
	}

	fmt.Printf("scanned %d entries\n", entries)
	fmt.Printf(" %-20s %7d words (%d headwords + %d embedded singulars + %d variants)\n", *out, len(allWords), fromHead, fromSing, fromVar)
	fmt.Printf(" %-20s %7d words (tokens not in %s; for a morphology re-check)\n", *skip, len(skipWords), *out)
	fmt.Printf(" %-20s %7d words (singulars from \"ед.\"; known nouns)\n", *singsOut, len(singulars))
	fmt.Printf(" %-20s %7d pairs (variants joined by \"и\")\n", *varsOut, len(variantPairs))
}