Tidy sources into sources/<variant>/ + tools/
build / dawg (pull_request) Successful in 4m22s

Consolidate the scattered build inputs (dictionaries/english/, dictprep/russian/)
into one sources/ tree keyed by the variant labels (scrabble_en/scrabble_ru/
erudit_ru), and move the Russian prep pipeline to tools/. The dawg outputs and
their filenames are unchanged — rebuilt byte-identical (en_sowpods/ru_scrabble/
ru_erudit) — so the release artifact and the backend are unaffected.

ru_stage2.py OUT_DIR and the ruwords flag defaults are repointed to
sources/scrabble_ru/; Makefile / CI / cmd/builddict default / README updated;
pipeline intermediates git-ignored. Verified: make dawg byte-identical to the
committed baseline, py_compile + go vet of the moved tools. The full Russian
regeneration pipeline (pymorphy3/libmorph/orfo PDF) was not run here.
This commit is contained in:
Ilia Denisov
2026-06-09 12:25:33 +02:00
parent 38ad6d3a19
commit dd61ff1d51
17 changed files with 76 additions and 41 deletions
+3 -3
View File
@@ -34,9 +34,9 @@ jobs:
- name: Build DAWGs - name: Build DAWGs
run: | run: |
mkdir -p dawg mkdir -p dawg
go run ./cmd/builddict -dict dictionaries/english/sowpods.txt -alphabet latin -name en_sowpods -out dawg go run ./cmd/builddict -dict sources/scrabble_en/sowpods.txt -alphabet latin -name en_sowpods -out dawg
go run ./cmd/builddict -dict dictprep/russian/scrabble.txt -alphabet russian -name ru_scrabble -out dawg go run ./cmd/builddict -dict sources/scrabble_ru/scrabble.txt -alphabet russian -name ru_scrabble -out dawg
go run ./cmd/builddict -dict dictprep/russian/erudit.txt -alphabet russian -name ru_erudit -out dawg go run ./cmd/builddict -dict sources/erudit_ru/erudit.txt -alphabet russian -name ru_erudit -out dawg
ls -la dawg/ ls -la dawg/
for f in en_sowpods ru_scrabble ru_erudit; do for f in en_sowpods ru_scrabble ru_erudit; do
test -s "dawg/$f.dawg" || { echo "missing dawg/$f.dawg"; exit 1; } test -s "dawg/$f.dawg" || { echo "missing dawg/$f.dawg"; exit 1; }
+13
View File
@@ -1,3 +1,16 @@
# Built DAWGs are release artifacts (published by CI on a vX.Y.Z tag), not committed. # Built DAWGs are release artifacts (published by CI on a vX.Y.Z tag), not committed.
/dawg/ /dawg/
/scrabble-dawg-*.tar.gz /scrabble-dawg-*.tar.gz
# Russian prep-pipeline intermediates (regenerated locally by tools/; only the curated
# word lists in sources/scrabble_ru/ are committed).
/sources/scrabble_ru/orfo_dict_2025.txt
/sources/scrabble_ru/all.txt
/sources/scrabble_ru/undefined.txt
/sources/scrabble_ru/adjectives.txt
/sources/scrabble_ru/verbs.txt
/sources/scrabble_ru/singulars.txt
/sources/scrabble_ru/fate.tsv
/tools/libmorph_check
/tools/orfo_dict_2025.pdf
__pycache__/
+4 -4
View File
@@ -8,7 +8,7 @@
# ru_erudit.dawg — Эрудит (the Ё→Е folded + de-duped list, committed as russian/erudit.txt) # ru_erudit.dawg — Эрудит (the Ё→Е folded + de-duped list, committed as sources/erudit_ru/erudit.txt)
# #
# CI builds the DAWGs as a validation gate; release artifacts are published from this output # CI builds the DAWGs as a validation gate; release artifacts are published from this output
# (see README.md). Regenerate russian/erudit.txt from scrabble.txt with dictprep/fold_yo.py. # (see README.md). Regenerate sources/erudit_ru/erudit.txt from sources/scrabble_ru/scrabble.txt with tools/fold_yo.py.
export GOPRIVATE := gitea.iliadenisov.ru/* export GOPRIVATE := gitea.iliadenisov.ru/*
@@ -21,13 +21,13 @@ BUILDDICT := $(GO) run ./cmd/builddict
dawg: dawg-en dawg-ru dawg-erudit dawg: dawg-en dawg-ru dawg-erudit
dawg-en: dawg-en:
$(BUILDDICT) -dict dictionaries/english/sowpods.txt -alphabet latin -name en_sowpods -out $(DAWG_DIR) $(BUILDDICT) -dict sources/scrabble_en/sowpods.txt -alphabet latin -name en_sowpods -out $(DAWG_DIR)
dawg-ru: dawg-ru:
$(BUILDDICT) -dict dictprep/russian/scrabble.txt -alphabet russian -name ru_scrabble -out $(DAWG_DIR) $(BUILDDICT) -dict sources/scrabble_ru/scrabble.txt -alphabet russian -name ru_scrabble -out $(DAWG_DIR)
dawg-erudit: dawg-erudit:
$(BUILDDICT) -dict dictprep/russian/erudit.txt -alphabet russian -name ru_erudit -out $(DAWG_DIR) $(BUILDDICT) -dict sources/erudit_ru/erudit.txt -alphabet russian -name ru_erudit -out $(DAWG_DIR)
clean-dawg: clean-dawg:
rm -f $(DAWG_DIR)/*.dawg rm -f $(DAWG_DIR)/*.dawg
+8 -8
View File
@@ -17,9 +17,9 @@ byte-identical to the solver's committed test fixtures.
| file | variant | source | | file | variant | source |
| --- | --- | --- | | --- | --- | --- |
| `en_sowpods.dawg` | English (SOWPODS) | `dictionaries/english/sowpods.txt` | | `en_sowpods.dawg` | English (SOWPODS) | `sources/scrabble_en/sowpods.txt` |
| `ru_scrabble.dawg` | Russian Scrabble | `dictprep/russian/scrabble.txt` | | `ru_scrabble.dawg` | Russian Scrabble | `sources/scrabble_ru/scrabble.txt` |
| `ru_erudit.dawg` | Эрудит | `dictprep/russian/erudit.txt` (Ё→Е folded `scrabble.txt`, via `dictprep/fold_yo.py`) | | `ru_erudit.dawg` | Эрудит | `sources/erudit_ru/erudit.txt` (Ё→Е folded `scrabble.txt`, via `tools/fold_yo.py`) |
The CI (`.gitea/workflows/build.yaml`) rebuilds them on every push/PR as a validation gate The CI (`.gitea/workflows/build.yaml`) rebuilds them on every push/PR as a validation gate
(inlined `go run`, no `make`/`python` needed on the runner). Release artifacts are published per (inlined `go run`, no `make`/`python` needed on the runner). Release artifacts are published per
@@ -30,11 +30,11 @@ a new release, never breaking a running backend).
## Sources / provenance ## Sources / provenance
- **English:** `dictionaries/english/sowpods.txt`, vendored from - **English:** `sources/scrabble_en/sowpods.txt`, vendored from
[`kamilmielnik/scrabble-dictionaries`](https://github.com/kamilmielnik/scrabble-dictionaries). [`kamilmielnik/scrabble-dictionaries`](https://github.com/kamilmielnik/scrabble-dictionaries).
- **Russian:** `dictprep/russian/scrabble.txt`, derived from the Russian academic orthographic - **Russian:** `sources/scrabble_ru/scrabble.txt`, derived from the Russian academic orthographic
dictionary by the tooling under `dictprep/` (see `dictprep/README.md`); `dictprep/russian/erudit.txt` dictionary by the tooling under `tools/` (see `tools/README.md`); `sources/erudit_ru/erudit.txt`
is its Ё→Е folded form (`dictprep/fold_yo.py`). Only the prepared word lists are vendored; the is its Ё→Е folded form (`tools/fold_yo.py`). Only the prepared word lists are vendored; the
heavy upstream source (the orfo PDF/text) is not. heavy upstream source (the orfo PDF/text) is not.
## Build ## Build
@@ -45,7 +45,7 @@ make dawg # -> dawg/{en_sowpods,ru_scrabble,ru_erudit}.dawg
Requires Go (module deps fetched with `GOPRIVATE=gitea.iliadenisov.ru/*`, exported by the Requires Go (module deps fetched with `GOPRIVATE=gitea.iliadenisov.ru/*`, exported by the
Makefile). No `python` is needed for the build — the Ё→Е fold is committed as `erudit.txt`; Makefile). No `python` is needed for the build — the Ё→Е fold is committed as `erudit.txt`;
regenerate it with `python3 dictprep/fold_yo.py dictprep/russian/scrabble.txt > dictprep/russian/erudit.txt`. regenerate it with `python3 tools/fold_yo.py sources/scrabble_ru/scrabble.txt > sources/erudit_ru/erudit.txt`.
## Release ## Release
+1 -1
View File
@@ -17,7 +17,7 @@ import (
) )
func main() { func main() {
dict := flag.String("dict", "dictionaries/english/sowpods.txt", "word list file (one word per line)") dict := flag.String("dict", "sources/scrabble_en/sowpods.txt", "word list file (one word per line)")
out := flag.String("out", "testdata", "output directory") out := flag.String("out", "testdata", "output directory")
name := flag.String("name", "sowpods", "base name for the output file") name := flag.String("name", "sowpods", "base name for the output file")
minLen := flag.Int("min", 2, "minimum word length") minLen := flag.Int("min", 2, "minimum word length")
+6
View File
@@ -0,0 +1,6 @@
# erudit_ru source
`erudit.txt` — the Эрудит word list: the Ё→Е folded and de-duplicated form of
[`../scrabble_ru/scrabble.txt`](../scrabble_ru/scrabble.txt), produced by `tools/fold_yo.py`
(the Эрудит ruleset has no Ё tile and treats Е/Ё as one letter). Built to `dawg/ru_erudit.dawg`
(`make dawg-erudit`).
+5
View File
@@ -0,0 +1,5 @@
# scrabble_en source
`sowpods.txt` — the English SOWPODS word list, vendored from
[`kamilmielnik/scrabble-dictionaries`](https://github.com/kamilmielnik/scrabble-dictionaries).
Built to `dawg/en_sowpods.dawg` (`make dawg-en`).
+9
View File
@@ -0,0 +1,9 @@
# scrabble_ru source
`scrabble.txt` — Russian Scrabble common nouns (nominative singular), produced by the prep
pipeline under [`../../tools/`](../../tools/README.md) from the Russian academic orthographic
dictionary, cross-checked against OpenCorpora and libmorph. `manual_confirm.txt` holds the
hand-reviewed additions the pipeline merges in. Built to `dawg/ru_scrabble.dawg` (`make dawg-ru`).
The pipeline's uncommitted intermediates (`orfo_dict_2025.txt`, `all.txt`, debug dumps) are
regenerated here locally and are git-ignored.
+12 -12
View File
@@ -1,14 +1,14 @@
# Russian word-list preparation (`dictprep`) # Russian word-list preparation (`tools`)
Builds the Russian **noun** word list for the Scrabble/Эрудит solver out of the official Builds the Russian **noun** word list for the Scrabble/Эрудит solver out of the official
Russian academic **orthographic dictionary**, cross-checked against two independent Russian academic **orthographic dictionary**, cross-checked against two independent
morphological dictionaries. morphological dictionaries.
The goal of the pipeline is a list of **common nouns in the nominative singular** The goal of the pipeline is a list of **common nouns in the nominative singular**
(`dictprep/russian/scrabble.txt`), plus an ambiguous tail for manual review. (`sources/scrabble_ru/scrabble.txt`), plus an ambiguous tail for manual review.
> This directory is self-contained tooling for *building* the word list. It is not part > This directory is self-contained tooling for *building* the word list. It is not part
> of the solver library. The committed result lives in `dictprep/russian/`. > of the solver library. The committed result lives in `sources/scrabble_ru/`.
## Source ## Source
@@ -23,7 +23,7 @@ The PDF is git-ignored (large, third-party); place it here as `orfo_dict_2025.pd
pdftotext output is committed as `russian/orfo_dict_2025.txt`, so the word list rebuilds pdftotext output is written to `sources/scrabble_ru/orfo_dict_2025.txt` (git-ignored), so the word list rebuilds
from the text alone — the binary PDF is needed only to regenerate that text. from the text alone — the binary PDF is needed only to regenerate that text.
## Outputs (`dictprep/russian/`) ## Outputs (`sources/scrabble_ru/`)
The committed result is **three** files; every other bucket stays in the Stage-2 The committed result is **three** files; every other bucket stays in the Stage-2
process's memory (dump it with `--dump`, query it with `--trace WORD`). process's memory (dump it with `--dump`, query it with `--trace WORD`).
@@ -56,28 +56,28 @@ ru-venv/bin/pip install mawo-pymorphy3 # bundles OpenCorpora 2025 (wo
# 4. libmorph — the independent morphological dictionary (Stage 2 cross-check) # 4. libmorph — the independent morphological dictionary (Stage 2 cross-check)
sudo apt-get install -y morphrus morphrus-dev moonycode-dev morphapi-dev sudo apt-get install -y morphrus morphrus-dev moonycode-dev morphapi-dev
g++ -std=c++17 -O2 dictprep/libmorph_check.cpp -lmorphrus -lmoonycode -o dictprep/libmorph_check g++ -std=c++17 -O2 tools/libmorph_check.cpp -lmorphrus -lmoonycode -o tools/libmorph_check
``` ```
If `dictprep/libmorph_check` is absent, Stage 2 still runs — it simply drops libmorph from If `tools/libmorph_check` is absent, Stage 2 still runs — it simply drops libmorph from
the stack and reports `libmorph_helper=MISSING`. the stack and reports `libmorph_helper=MISSING`.
## How to run ## How to run
```sh ```sh
# Stage 0 — PDF -> plain text (committed as the source of truth; run once) # Stage 0 — PDF -> plain text (git-ignored intermediate, the pipeline's source of truth; run once)
pdftotext dictprep/orfo_dict_2025.pdf dictprep/russian/orfo_dict_2025.txt pdftotext tools/orfo_dict_2025.pdf sources/scrabble_ru/orfo_dict_2025.txt
# Stage 1 — build the base word list (Go): dictprep/russian/all.txt + /tmp/ru_*.txt # Stage 1 — build the base word list (Go): sources/scrabble_ru/all.txt + /tmp/ru_*.txt
go run ./dictprep/ruwords go run ./tools/ruwords
# Stage 2 — the brain (Python + mawo + libmorph): writes scrabble.txt # Stage 2 — the brain (Python + mawo + libmorph): writes scrabble.txt
ru-venv/bin/python dictprep/ru_stage2.py ru-venv/bin/python tools/ru_stage2.py
# ask how a word did or did not reach the dictionary # ask how a word did or did not reach the dictionary
ru-venv/bin/python dictprep/ru_stage2.py --trace травмпункт ru-venv/bin/python tools/ru_stage2.py --trace травмпункт
# also write the in-memory buckets (undefined, adjectives, verbs, singulars, fate.tsv) # also write the in-memory buckets (undefined, adjectives, verbs, singulars, fate.tsv)
ru-venv/bin/python dictprep/ru_stage2.py --dump ru-venv/bin/python tools/ru_stage2.py --dump
``` ```
`-from`/`-to` (defaulting to 452/168808) bound the column word-list section of `-from`/`-to` (defaulting to 452/168808) bound the column word-list section of
+1 -1
View File
@@ -5,7 +5,7 @@ The Эрудит ruleset has no Ё tile and treats Е/Ё as one letter, so its d
folded before the DAWG is built. Folding merges pairs like ёж/еж, hence the de-dup. Output folded before the DAWG is built. Folding merges pairs like ёж/еж, hence the de-dup. Output
is sorted (Russian order over the 32 folded letters) and LF-separated. is sorted (Russian order over the 32 folded letters) and LF-separated.
Run: python3 dictprep/fold_yo.py dictprep/russian/scrabble.txt > /tmp/ru_erudit_words.txt Run: python3 tools/fold_yo.py sources/scrabble_ru/scrabble.txt > /tmp/ru_erudit_words.txt
""" """
import sys import sys
+9 -7
View File
@@ -5,10 +5,10 @@ It reads the Stage-1 base word list (built once by ruwords so the heavy PDF is n
re-parsed) together with the grammatical notes and the singular/variant structure, runs re-parsed) together with the grammatical notes and the singular/variant structure, runs
the whole noun-selection logic in memory, and writes a minimal result: the whole noun-selection logic in memory, and writes a minimal result:
dictprep/russian/scrabble.txt the working dictionary (common nouns, nom. sing.) sources/scrabble_ru/scrabble.txt the working dictionary (common nouns, nom. sing.)
dictprep/russian/undefined.txt the ambiguous tail, left for manual review sources/scrabble_ru/undefined.txt the ambiguous tail, left for manual review
(dictprep/russian/all.txt is the Stage-1 base.) Every other bucket — adjectives, verbs, (sources/scrabble_ru/all.txt is the Stage-1 base.) Every other bucket — adjectives, verbs,
the merged note-nouns, singulars, variants — stays in memory. Pass --dump to also write the merged note-nouns, singulars, variants — stays in memory. Pass --dump to also write
them; pass --trace WORD to ask how a single word did or did not reach the dictionary. them; pass --trace WORD to ask how a single word did or did not reach the dictionary.
@@ -17,9 +17,9 @@ variants are read from the pdftotext output (slov.txt) and the Stage-1 side file
expensive PDF parse itself runs only once. expensive PDF parse itself runs only once.
Sources, most authoritative first: OpenCorpora (mawo-pymorphy3), libmorph (libmorph_check), Sources, most authoritative first: OpenCorpora (mawo-pymorphy3), libmorph (libmorph_check),
and the orthographic dictionary's own notes. See dictprep/README.md. and the orthographic dictionary's own notes. See tools/README.md.
Run: ru-venv/bin/python dictprep/ru_stage2.py [--dump] [--trace WORD] Run: ru-venv/bin/python tools/ru_stage2.py [--dump] [--trace WORD]
""" """
import argparse import argparse
import os import os
@@ -27,7 +27,9 @@ import re
import subprocess import subprocess
HERE = os.path.dirname(os.path.abspath(__file__)) HERE = os.path.dirname(os.path.abspath(__file__))
OUT_DIR = os.path.join(HERE, "russian") # The curated Russian word lists live in sources/scrabble_ru/ (this tool sits in tools/);
# the uncommitted pipeline intermediates (orfo/all/debug) are regenerated alongside them.
OUT_DIR = os.path.join(HERE, "..", "sources", "scrabble_ru")
SLOV = os.path.join(OUT_DIR, "orfo_dict_2025.txt") # committed pdftotext output (source of truth) SLOV = os.path.join(OUT_DIR, "orfo_dict_2025.txt") # git-ignored pdftotext output (source of truth), regenerated locally
WL_FROM, WL_TO = 452, 168808 # 1-based inclusive bounds of the column word-list section WL_FROM, WL_TO = 452, 168808 # 1-based inclusive bounds of the column word-list section
OC_CACHE = "/tmp/oc_nouns.txt" OC_CACHE = "/tmp/oc_nouns.txt"
@@ -322,7 +324,7 @@ def main():
return return
write(os.path.join(OUT_DIR, "scrabble.txt"), r["scrabble"]) write(os.path.join(OUT_DIR, "scrabble.txt"), r["scrabble"])
print(f"=> dictprep/russian/scrabble.txt {len(r['scrabble'])}") print(f"=> sources/scrabble_ru/scrabble.txt {len(r['scrabble'])}")
print(f" undefined kept in memory: {len(set(r['undefined']))} (use --dump to write it)") print(f" undefined kept in memory: {len(set(r['undefined']))} (use --dump to write it)")
if args.dump: if args.dump:
write(os.path.join(OUT_DIR, "undefined.txt"), r["undefined"]) write(os.path.join(OUT_DIR, "undefined.txt"), r["undefined"])
@@ -11,10 +11,10 @@
// //
// It also collects a variant headword joined by "и" when it carries its own grammatical // It also collects a variant headword joined by "и" when it carries its own grammatical
// note (e.g. "аблатив, -а и аблятив, -а"). Suffix-singular reconstruction is heuristic; // note (e.g. "аблатив, -а и аблятив, -а"). Suffix-singular reconstruction is heuristic;
// Stage 2 (dictprep/ru_stage2.py) re-checks the words against real dictionaries. // Stage 2 (tools/ru_stage2.py) re-checks the words against real dictionaries.
// //
// pdftotext dictprep/orfo_dict_2025.pdf /tmp/slov.txt // pdftotext tools/orfo_dict_2025.pdf /tmp/slov.txt
// go run ./dictprep/ruwords -in /tmp/slov.txt -from 452 -to 168808 \ // go run ./tools/ruwords -in /tmp/slov.txt -from 452 -to 168808 \
// -out russian_all.txt -skip russian_skip.txt // -out russian_all.txt -skip russian_skip.txt
package main package main
@@ -327,8 +327,8 @@ func writeWords(path string, words []string) error {
} }
func main() { func main() {
in := flag.String("in", "dictprep/russian/orfo_dict_2025.txt", "plain-text dictionary (pdftotext output)") in := flag.String("in", "sources/scrabble_ru/orfo_dict_2025.txt", "plain-text dictionary (pdftotext output)")
out := flag.String("out", "dictprep/russian/all.txt", "output: the base word list (clean headwords + reconstructed singulars + variants)") out := flag.String("out", "sources/scrabble_ru/all.txt", "output: the base word list (clean headwords + reconstructed singulars + variants)")
skip := flag.String("skip", "/tmp/ru_skip.txt", "output: every other token, for a later morphology re-check") skip := flag.String("skip", "/tmp/ru_skip.txt", "output: every other token, for a later morphology re-check")
sings := flag.String("singulars", "/tmp/ru_singulars.txt", "output: singulars reconstructed from \"ед.\" (known nouns)") sings := flag.String("singulars", "/tmp/ru_singulars.txt", "output: singulars reconstructed from \"ед.\" (known nouns)")
varsOut := flag.String("variants", "/tmp/ru_variants.txt", "output: variant pairs joined by \"и\" (primary<TAB>variant)") varsOut := flag.String("variants", "/tmp/ru_variants.txt", "output: variant pairs joined by \"и\" (primary<TAB>variant)")