diff --git a/.gitea/workflows/build.yaml b/.gitea/workflows/build.yaml index 5c09646..1192f3b 100644 --- a/.gitea/workflows/build.yaml +++ b/.gitea/workflows/build.yaml @@ -34,9 +34,9 @@ jobs: - name: Build DAWGs run: | mkdir -p dawg - go run ./cmd/builddict -dict dictionaries/english/sowpods.txt -alphabet latin -name en_sowpods -out dawg - go run ./cmd/builddict -dict dictprep/russian/scrabble.txt -alphabet russian -name ru_scrabble -out dawg - go run ./cmd/builddict -dict dictprep/russian/erudit.txt -alphabet russian -name ru_erudit -out dawg + go run ./cmd/builddict -dict sources/scrabble_en/sowpods.txt -alphabet latin -name en_sowpods -out dawg + go run ./cmd/builddict -dict sources/scrabble_ru/scrabble.txt -alphabet russian -name ru_scrabble -out dawg + go run ./cmd/builddict -dict sources/erudit_ru/erudit.txt -alphabet russian -name ru_erudit -out dawg ls -la dawg/ for f in en_sowpods ru_scrabble ru_erudit; do test -s "dawg/$f.dawg" || { echo "missing dawg/$f.dawg"; exit 1; } diff --git a/.gitignore b/.gitignore index 364f267..1a24ac2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,16 @@ # Built DAWGs are release artifacts (published by CI on a vX.Y.Z tag), not committed. /dawg/ /scrabble-dawg-*.tar.gz + +# Russian prep-pipeline intermediates (regenerated locally by tools/; only the curated +# word lists in sources/scrabble_ru/ are committed). +/sources/scrabble_ru/orfo_dict_2025.txt +/sources/scrabble_ru/all.txt +/sources/scrabble_ru/undefined.txt +/sources/scrabble_ru/adjectives.txt +/sources/scrabble_ru/verbs.txt +/sources/scrabble_ru/singulars.txt +/sources/scrabble_ru/fate.tsv +/tools/libmorph_check +/tools/orfo_dict_2025.pdf +__pycache__/ diff --git a/Makefile b/Makefile index 04951de..b808542 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ # ru_erudit.dawg — Эрудит (the Ё→Е folded + de-duped list, committed as russian/erudit.txt) # # CI builds the DAWGs as a validation gate; release artifacts are published from this output -# (see README.md). 
Regenerate russian/erudit.txt from scrabble.txt with dictprep/fold_yo.py. +# (see README.md). Regenerate sources/erudit_ru/erudit.txt from sources/scrabble_ru/scrabble.txt with tools/fold_yo.py. export GOPRIVATE := gitea.iliadenisov.ru/* @@ -21,13 +21,13 @@ BUILDDICT := $(GO) run ./cmd/builddict dawg: dawg-en dawg-ru dawg-erudit dawg-en: - $(BUILDDICT) -dict dictionaries/english/sowpods.txt -alphabet latin -name en_sowpods -out $(DAWG_DIR) + $(BUILDDICT) -dict sources/scrabble_en/sowpods.txt -alphabet latin -name en_sowpods -out $(DAWG_DIR) dawg-ru: - $(BUILDDICT) -dict dictprep/russian/scrabble.txt -alphabet russian -name ru_scrabble -out $(DAWG_DIR) + $(BUILDDICT) -dict sources/scrabble_ru/scrabble.txt -alphabet russian -name ru_scrabble -out $(DAWG_DIR) dawg-erudit: - $(BUILDDICT) -dict dictprep/russian/erudit.txt -alphabet russian -name ru_erudit -out $(DAWG_DIR) + $(BUILDDICT) -dict sources/erudit_ru/erudit.txt -alphabet russian -name ru_erudit -out $(DAWG_DIR) clean-dawg: rm -f $(DAWG_DIR)/*.dawg diff --git a/README.md b/README.md index c746b67..49b205a 100644 --- a/README.md +++ b/README.md @@ -17,9 +17,9 @@ byte-identical to the solver's committed test fixtures. | file | variant | source | | --- | --- | --- | -| `en_sowpods.dawg` | English (SOWPODS) | `dictionaries/english/sowpods.txt` | -| `ru_scrabble.dawg` | Russian Scrabble | `dictprep/russian/scrabble.txt` | -| `ru_erudit.dawg` | Эрудит | `dictprep/russian/erudit.txt` (Ё→Е folded `scrabble.txt`, via `dictprep/fold_yo.py`) | +| `en_sowpods.dawg` | English (SOWPODS) | `sources/scrabble_en/sowpods.txt` | +| `ru_scrabble.dawg` | Russian Scrabble | `sources/scrabble_ru/scrabble.txt` | +| `ru_erudit.dawg` | Эрудит | `sources/erudit_ru/erudit.txt` (Ё→Е folded `scrabble.txt`, via `tools/fold_yo.py`) | The CI (`.gitea/workflows/build.yaml`) rebuilds them on every push/PR as a validation gate (inlined `go run`, no `make`/`python` needed on the runner). 
Release artifacts are published per @@ -30,11 +30,11 @@ a new release, never breaking a running backend). ## Sources / provenance -- **English:** `dictionaries/english/sowpods.txt`, vendored from +- **English:** `sources/scrabble_en/sowpods.txt`, vendored from [`kamilmielnik/scrabble-dictionaries`](https://github.com/kamilmielnik/scrabble-dictionaries). -- **Russian:** `dictprep/russian/scrabble.txt`, derived from the Russian academic orthographic - dictionary by the tooling under `dictprep/` (see `dictprep/README.md`); `dictprep/russian/erudit.txt` - is its Ё→Е folded form (`dictprep/fold_yo.py`). Only the prepared word lists are vendored; the +- **Russian:** `sources/scrabble_ru/scrabble.txt`, derived from the Russian academic orthographic + dictionary by the tooling under `tools/` (see `tools/README.md`); `sources/erudit_ru/erudit.txt` + is its Ё→Е folded form (`tools/fold_yo.py`). Only the prepared word lists are vendored; the heavy upstream source (the orfo PDF/text) is not. ## Build @@ -45,7 +45,7 @@ make dawg # -> dawg/{en_sowpods,ru_scrabble,ru_erudit}.dawg Requires Go (module deps fetched with `GOPRIVATE=gitea.iliadenisov.ru/*`, exported by the Makefile). No `python` is needed for the build — the Ё→Е fold is committed as `erudit.txt`; -regenerate it with `python3 dictprep/fold_yo.py dictprep/russian/scrabble.txt > dictprep/russian/erudit.txt`. +regenerate it with `python3 tools/fold_yo.py sources/scrabble_ru/scrabble.txt > sources/erudit_ru/erudit.txt`. 
## Release diff --git a/cmd/builddict/main.go b/cmd/builddict/main.go index 827eb3b..67a5208 100644 --- a/cmd/builddict/main.go +++ b/cmd/builddict/main.go @@ -17,7 +17,7 @@ import ( ) func main() { - dict := flag.String("dict", "dictionaries/english/sowpods.txt", "word list file (one word per line)") + dict := flag.String("dict", "sources/scrabble_en/sowpods.txt", "word list file (one word per line)") out := flag.String("out", "testdata", "output directory") name := flag.String("name", "sowpods", "base name for the output file") minLen := flag.Int("min", 2, "minimum word length") diff --git a/sources/erudit_ru/README.md b/sources/erudit_ru/README.md new file mode 100644 index 0000000..aa71799 --- /dev/null +++ b/sources/erudit_ru/README.md @@ -0,0 +1,6 @@ +# erudit_ru source + +`erudit.txt` — the Эрудит word list: the Ё→Е folded and de-duplicated form of +[`../scrabble_ru/scrabble.txt`](../scrabble_ru/scrabble.txt), produced by `tools/fold_yo.py` +(the Эрудит ruleset has no Ё tile and treats Е/Ё as one letter). Built to `dawg/ru_erudit.dawg` +(`make dawg-erudit`). diff --git a/dictprep/russian/erudit.txt b/sources/erudit_ru/erudit.txt similarity index 100% rename from dictprep/russian/erudit.txt rename to sources/erudit_ru/erudit.txt diff --git a/sources/scrabble_en/README.md b/sources/scrabble_en/README.md new file mode 100644 index 0000000..8d6c183 --- /dev/null +++ b/sources/scrabble_en/README.md @@ -0,0 +1,5 @@ +# scrabble_en source + +`sowpods.txt` — the English SOWPODS word list, vendored from +[`kamilmielnik/scrabble-dictionaries`](https://github.com/kamilmielnik/scrabble-dictionaries). +Built to `dawg/en_sowpods.dawg` (`make dawg-en`). 
diff --git a/dictionaries/english/sowpods.txt b/sources/scrabble_en/sowpods.txt similarity index 100% rename from dictionaries/english/sowpods.txt rename to sources/scrabble_en/sowpods.txt diff --git a/sources/scrabble_ru/README.md b/sources/scrabble_ru/README.md new file mode 100644 index 0000000..b116ea6 --- /dev/null +++ b/sources/scrabble_ru/README.md @@ -0,0 +1,9 @@ +# scrabble_ru source + +`scrabble.txt` — Russian Scrabble common nouns (nominative singular), produced by the prep +pipeline under [`../../tools/`](../../tools/README.md) from the Russian academic orthographic +dictionary, cross-checked against OpenCorpora and libmorph. `manual_confirm.txt` holds the +hand-reviewed additions the pipeline merges in. Built to `dawg/ru_scrabble.dawg` (`make dawg-ru`). + +The pipeline's uncommitted intermediates (`orfo_dict_2025.txt`, `all.txt`, debug dumps) are +regenerated here locally and are git-ignored. diff --git a/dictprep/russian/manual_confirm.txt b/sources/scrabble_ru/manual_confirm.txt similarity index 100% rename from dictprep/russian/manual_confirm.txt rename to sources/scrabble_ru/manual_confirm.txt diff --git a/dictprep/russian/scrabble.txt b/sources/scrabble_ru/scrabble.txt similarity index 100% rename from dictprep/russian/scrabble.txt rename to sources/scrabble_ru/scrabble.txt diff --git a/dictprep/README.md b/tools/README.md similarity index 91% rename from dictprep/README.md rename to tools/README.md index 31676a9..36a156b 100644 --- a/dictprep/README.md +++ b/tools/README.md @@ -1,14 +1,14 @@ -# Russian word-list preparation (`dictprep`) +# Russian word-list preparation (`tools`) Builds the Russian **noun** word list for the Scrabble/Эрудит solver out of the official Russian academic **orthographic dictionary**, cross-checked against two independent morphological dictionaries. The goal of the pipeline is a list of **common nouns in the nominative singular** -(`dictprep/russian/scrabble.txt`), plus an ambiguous tail for manual review. 
+(`sources/scrabble_ru/scrabble.txt`), plus an ambiguous tail for manual review. > This directory is self-contained tooling for *building* the word list. It is not part -> of the solver library. The committed result lives in `dictprep/russian/`. +> of the solver library. The committed result lives in `sources/scrabble_ru/`. ## Source @@ -23,7 +23,7 @@ The PDF is git-ignored (large, third-party); place it here as `orfo_dict_2025.pd pdftotext output is committed as `russian/orfo_dict_2025.txt`, so the word list rebuilds from the text alone — the binary PDF is needed only to regenerate that text. -## Outputs (`dictprep/russian/`) +## Outputs (`sources/scrabble_ru/`) The committed result is **three** files; every other bucket stays in the Stage-2 process's memory (dump it with `--dump`, query it with `--trace WORD`). @@ -56,28 +56,28 @@ ru-venv/bin/pip install mawo-pymorphy3 # bundles OpenCorpora 2025 (wo # 4. libmorph — the independent morphological dictionary (Stage 2 cross-check) sudo apt-get install -y morphrus morphrus-dev moonycode-dev morphapi-dev -g++ -std=c++17 -O2 dictprep/libmorph_check.cpp -lmorphrus -lmoonycode -o dictprep/libmorph_check +g++ -std=c++17 -O2 tools/libmorph_check.cpp -lmorphrus -lmoonycode -o tools/libmorph_check ``` -If `dictprep/libmorph_check` is absent, Stage 2 still runs — it simply drops libmorph from +If `tools/libmorph_check` is absent, Stage 2 still runs — it simply drops libmorph from the stack and reports `libmorph_helper=MISSING`. 
## How to run ```sh # Stage 0 — PDF -> plain text (committed as the source of truth; run once) -pdftotext dictprep/orfo_dict_2025.pdf dictprep/russian/orfo_dict_2025.txt +pdftotext tools/orfo_dict_2025.pdf sources/scrabble_ru/orfo_dict_2025.txt -# Stage 1 — build the base word list (Go): dictprep/russian/all.txt + /tmp/ru_*.txt -go run ./dictprep/ruwords +# Stage 1 — build the base word list (Go): sources/scrabble_ru/all.txt + /tmp/ru_*.txt +go run ./tools/ruwords # Stage 2 — the brain (Python + mawo + libmorph): writes scrabble.txt -ru-venv/bin/python dictprep/ru_stage2.py +ru-venv/bin/python tools/ru_stage2.py # ask how a word did or did not reach the dictionary -ru-venv/bin/python dictprep/ru_stage2.py --trace травмпункт +ru-venv/bin/python tools/ru_stage2.py --trace травмпункт # also write the in-memory buckets (undefined, adjectives, verbs, singulars, fate.tsv) -ru-venv/bin/python dictprep/ru_stage2.py --dump +ru-venv/bin/python tools/ru_stage2.py --dump ``` `-from`/`-to` (defaulting to 452/168808) bound the column word-list section of diff --git a/dictprep/fold_yo.py b/tools/fold_yo.py similarity index 93% rename from dictprep/fold_yo.py rename to tools/fold_yo.py index c192653..db6625a 100644 --- a/dictprep/fold_yo.py +++ b/tools/fold_yo.py @@ -5,7 +5,7 @@ The Эрудит ruleset has no Ё tile and treats Е/Ё as one letter, so its d folded before the DAWG is built. Folding merges pairs like ёж/еж, hence the de-dup. Output is sorted (Russian order over the 32 folded letters) and LF-separated. 
-Run: python3 dictprep/fold_yo.py dictprep/russian/scrabble.txt > /tmp/ru_erudit_words.txt +Run: python3 tools/fold_yo.py sources/scrabble_ru/scrabble.txt > /tmp/ru_erudit_words.txt """ import sys diff --git a/dictprep/libmorph_check.cpp b/tools/libmorph_check.cpp similarity index 100% rename from dictprep/libmorph_check.cpp rename to tools/libmorph_check.cpp diff --git a/dictprep/ru_stage2.py b/tools/ru_stage2.py similarity index 94% rename from dictprep/ru_stage2.py rename to tools/ru_stage2.py index 5dc6730..938fe6a 100644 --- a/dictprep/ru_stage2.py +++ b/tools/ru_stage2.py @@ -5,10 +5,10 @@ It reads the Stage-1 base word list (built once by ruwords so the heavy PDF is n re-parsed) together with the grammatical notes and the singular/variant structure, runs the whole noun-selection logic in memory, and writes a minimal result: - dictprep/russian/scrabble.txt — the working dictionary (common nouns, nom. sing.) - dictprep/russian/undefined.txt — the ambiguous tail, left for manual review + sources/scrabble_ru/scrabble.txt — the working dictionary (common nouns, nom. sing.) + sources/scrabble_ru/undefined.txt — the ambiguous tail, left for manual review -(dictprep/russian/all.txt is the Stage-1 base.) Every other bucket — adjectives, verbs, +(sources/scrabble_ru/all.txt is the Stage-1 base.) Every other bucket — adjectives, verbs, the merged note-nouns, singulars, variants — stays in memory. Pass --dump to also write them; pass --trace WORD to ask how a single word did or did not reach the dictionary. @@ -17,9 +17,9 @@ variants are read from the pdftotext output (slov.txt) and the Stage-1 side file expensive PDF parse itself runs only once. Sources, most authoritative first: OpenCorpora (mawo-pymorphy3), libmorph (libmorph_check), -and the orthographic dictionary's own notes. See dictprep/README.md. +and the orthographic dictionary's own notes. See tools/README.md. 
-Run: ru-venv/bin/python dictprep/ru_stage2.py [--dump] [--trace WORD] +Run: ru-venv/bin/python tools/ru_stage2.py [--dump] [--trace WORD] """ import argparse import os @@ -27,7 +27,9 @@ import re import subprocess HERE = os.path.dirname(os.path.abspath(__file__)) -OUT_DIR = os.path.join(HERE, "russian") +# The curated Russian word lists live in sources/scrabble_ru/ (this tool sits in tools/); +# the uncommitted pipeline intermediates (orfo/all/debug) are regenerated alongside them. +OUT_DIR = os.path.join(HERE, "..", "sources", "scrabble_ru") SLOV = os.path.join(OUT_DIR, "orfo_dict_2025.txt") # committed pdftotext output (source of truth) WL_FROM, WL_TO = 452, 168808 # 1-based inclusive bounds of the column word-list section OC_CACHE = "/tmp/oc_nouns.txt" @@ -322,7 +324,7 @@ def main(): return write(os.path.join(OUT_DIR, "scrabble.txt"), r["scrabble"]) - print(f"=> dictprep/russian/scrabble.txt {len(r['scrabble'])}") + print(f"=> sources/scrabble_ru/scrabble.txt {len(r['scrabble'])}") print(f" undefined kept in memory: {len(set(r['undefined']))} (use --dump to write it)") if args.dump: write(os.path.join(OUT_DIR, "undefined.txt"), r["undefined"]) diff --git a/dictprep/ruwords/main.go b/tools/ruwords/main.go similarity index 96% rename from dictprep/ruwords/main.go rename to tools/ruwords/main.go index fd93c49..6ad2093 100644 --- a/dictprep/ruwords/main.go +++ b/tools/ruwords/main.go @@ -11,10 +11,10 @@ // // It also collects a variant headword joined by "и" when it carries its own grammatical // note (e.g. "аблатив, -а и аблятив, -а"). Suffix-singular reconstruction is heuristic; -// Stage 2 (dictprep/ru_stage2.py) re-checks the words against real dictionaries. +// Stage 2 (tools/ru_stage2.py) re-checks the words against real dictionaries. 
// -// pdftotext dictprep/orfo_dict_2025.pdf /tmp/slov.txt -// go run ./dictprep/ruwords -in /tmp/slov.txt -from 452 -to 168808 \ +// pdftotext tools/orfo_dict_2025.pdf /tmp/slov.txt +// go run ./tools/ruwords -in /tmp/slov.txt -from 452 -to 168808 \ // -out russian_all.txt -skip russian_skip.txt package main @@ -327,8 +327,8 @@ func writeWords(path string, words []string) error { } func main() { - in := flag.String("in", "dictprep/russian/orfo_dict_2025.txt", "plain-text dictionary (pdftotext output)") - out := flag.String("out", "dictprep/russian/all.txt", "output: the base word list (clean headwords + reconstructed singulars + variants)") + in := flag.String("in", "sources/scrabble_ru/orfo_dict_2025.txt", "plain-text dictionary (pdftotext output)") + out := flag.String("out", "sources/scrabble_ru/all.txt", "output: the base word list (clean headwords + reconstructed singulars + variants)") skip := flag.String("skip", "/tmp/ru_skip.txt", "output: every other token, for a later morphology re-check") sings := flag.String("singulars", "/tmp/ru_singulars.txt", "output: singulars reconstructed from \"ед.\" (known nouns)") varsOut := flag.String("variants", "/tmp/ru_variants.txt", "output: variant pairs joined by \"и\" (primaryvariant)")