2026-06-09 13:48:05 +00:00
17 changed files with 76 additions and 41 deletions
@@ -34,9 +34,9 @@ jobs:
      - name: Build DAWGs
        run: |
          mkdir -p dawg
-          go run ./cmd/builddict -dict dictionaries/english/sowpods.txt -alphabet latin   -name en_sowpods  -out dawg
-          go run ./cmd/builddict -dict dictprep/russian/scrabble.txt    -alphabet russian -name ru_scrabble -out dawg
-          go run ./cmd/builddict -dict dictprep/russian/erudit.txt      -alphabet russian -name ru_erudit   -out dawg
+          go run ./cmd/builddict -dict sources/scrabble_en/sowpods.txt -alphabet latin   -name en_sowpods  -out dawg
+          go run ./cmd/builddict -dict sources/scrabble_ru/scrabble.txt    -alphabet russian -name ru_scrabble -out dawg
+          go run ./cmd/builddict -dict sources/erudit_ru/erudit.txt      -alphabet russian -name ru_erudit   -out dawg
          ls -la dawg/
          for f in en_sowpods ru_scrabble ru_erudit; do
            test -s "dawg/$f.dawg" || { echo "missing dawg/$f.dawg"; exit 1; }
@@ -1,3 +1,16 @@
 # Built DAWGs are release artifacts (published by CI on a vX.Y.Z tag), not committed.
 /dawg/
 /scrabble-dawg-*.tar.gz
+
+# Russian prep-pipeline intermediates (regenerated locally by tools/; only the curated
+# word lists in sources/scrabble_ru/ are committed).
+/sources/scrabble_ru/orfo_dict_2025.txt
+/sources/scrabble_ru/all.txt
+/sources/scrabble_ru/undefined.txt
+/sources/scrabble_ru/adjectives.txt
+/sources/scrabble_ru/verbs.txt
+/sources/scrabble_ru/singulars.txt
+/sources/scrabble_ru/fate.tsv
+/tools/libmorph_check
+/tools/orfo_dict_2025.pdf
+__pycache__/
@@ -8,7 +8,7 @@
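-#   ru_erudit.dawg   — Эрудит (the Ё→Е folded + de-duped list, committed as russian/erudit.txt)
+#   ru_erudit.dawg   — Эрудит (the Ё→Е folded + de-duped list, committed as sources/erudit_ru/erudit.txt)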
 #
 # CI builds the DAWGs as a validation gate; release artifacts are published from this output
-# (see README.md). Regenerate russian/erudit.txt from scrabble.txt with dictprep/fold_yo.py.
+# (see README.md). Regenerate sources/erudit_ru/erudit.txt from sources/scrabble_ru/scrabble.txt
+# with tools/fold_yo.py.

 export GOPRIVATE := gitea.iliadenisov.ru/*

@@ -21,13 +21,13 @@ BUILDDICT := $(GO) run ./cmd/builddict
 dawg: dawg-en dawg-ru dawg-erudit

 dawg-en:
-	$(BUILDDICT) -dict dictionaries/english/sowpods.txt -alphabet latin -name en_sowpods -out $(DAWG_DIR)
+	$(BUILDDICT) -dict sources/scrabble_en/sowpods.txt -alphabet latin -name en_sowpods -out $(DAWG_DIR)

 dawg-ru:
-	$(BUILDDICT) -dict dictprep/russian/scrabble.txt -alphabet russian -name ru_scrabble -out $(DAWG_DIR)
+	$(BUILDDICT) -dict sources/scrabble_ru/scrabble.txt -alphabet russian -name ru_scrabble -out $(DAWG_DIR)

 dawg-erudit:
-	$(BUILDDICT) -dict dictprep/russian/erudit.txt -alphabet russian -name ru_erudit -out $(DAWG_DIR)
+	$(BUILDDICT) -dict sources/erudit_ru/erudit.txt -alphabet russian -name ru_erudit -out $(DAWG_DIR)

 clean-dawg:
 	rm -f $(DAWG_DIR)/*.dawg
@@ -17,9 +17,9 @@ byte-identical to the solver's committed test fixtures.

 | file | variant | source |
 | --- | --- | --- |
-| `en_sowpods.dawg` | English (SOWPODS) | `dictionaries/english/sowpods.txt` |
-| `ru_scrabble.dawg` | Russian Scrabble | `dictprep/russian/scrabble.txt` |
-| `ru_erudit.dawg` | Эрудит | `dictprep/russian/erudit.txt` (Ё→Е folded `scrabble.txt`, via `dictprep/fold_yo.py`) |
+| `en_sowpods.dawg` | English (SOWPODS) | `sources/scrabble_en/sowpods.txt` |
+| `ru_scrabble.dawg` | Russian Scrabble | `sources/scrabble_ru/scrabble.txt` |
+| `ru_erudit.dawg` | Эрудит | `sources/erudit_ru/erudit.txt` (Ё→Е folded `scrabble.txt`, via `tools/fold_yo.py`) |

 The CI (`.gitea/workflows/build.yaml`) rebuilds them on every push/PR as a validation gate
 (inlined `go run`, no `make`/`python` needed on the runner). Release artifacts are published per
@@ -30,11 +30,11 @@ a new release, never breaking a running backend).

 ## Sources / provenance

- **English:** `dictionaries/english/sowpods.txt`, vendored from
+- **English:** `sources/scrabble_en/sowpods.txt`, vendored from
  [`kamilmielnik/scrabble-dictionaries`](https://github.com/kamilmielnik/scrabble-dictionaries).
- **Russian:** `dictprep/russian/scrabble.txt`, derived from the Russian academic orthographic
-  dictionary by the tooling under `dictprep/` (see `dictprep/README.md`); `dictprep/russian/erudit.txt`
-  is its Ё→Е folded form (`dictprep/fold_yo.py`). Only the prepared word lists are vendored; the
+- **Russian:** `sources/scrabble_ru/scrabble.txt`, derived from the Russian academic orthographic
+  dictionary by the tooling under `tools/` (see `tools/README.md`); `sources/erudit_ru/erudit.txt`
+  is its Ё→Е folded form (`tools/fold_yo.py`). Only the prepared word lists are vendored; the
  heavy upstream source (the orfo PDF/text) is not.

 ## Build
@@ -45,7 +45,7 @@ make dawg     # -> dawg/{en_sowpods,ru_scrabble,ru_erudit}.dawg

 Requires Go (module deps fetched with `GOPRIVATE=gitea.iliadenisov.ru/*`, exported by the
 Makefile). No `python` is needed for the build — the Ё→Е fold is committed as `erudit.txt`;
-regenerate it with `python3 dictprep/fold_yo.py dictprep/russian/scrabble.txt > dictprep/russian/erudit.txt`.
+regenerate it with `python3 tools/fold_yo.py sources/scrabble_ru/scrabble.txt > sources/erudit_ru/erudit.txt`.

 ## Release

@@ -17,7 +17,7 @@ import (
 )

 func main() {
-	dict := flag.String("dict", "dictionaries/english/sowpods.txt", "word list file (one word per line)")
+	dict := flag.String("dict", "sources/scrabble_en/sowpods.txt", "word list file (one word per line)")
 	out := flag.String("out", "testdata", "output directory")
 	name := flag.String("name", "sowpods", "base name for the output file")
 	minLen := flag.Int("min", 2, "minimum word length")
@@ -0,0 +1,6 @@
+# erudit_ru source
+
+`erudit.txt` — the Эрудит word list: the Ё→Е folded and de-duplicated form of
+[`../scrabble_ru/scrabble.txt`](../scrabble_ru/scrabble.txt), produced by `tools/fold_yo.py`
+(the Эрудит ruleset has no Ё tile and treats Е/Ё as one letter). Built to `dawg/ru_erudit.dawg`
+(`make dawg-erudit`).
@@ -0,0 +1,5 @@
+# scrabble_en source
+
+`sowpods.txt` — the English SOWPODS word list, vendored from
+[`kamilmielnik/scrabble-dictionaries`](https://github.com/kamilmielnik/scrabble-dictionaries).
+Built to `dawg/en_sowpods.dawg` (`make dawg-en`).
@@ -0,0 +1,9 @@
+# scrabble_ru source
+
+`scrabble.txt` — Russian Scrabble common nouns (nominative singular), produced by the prep
+pipeline under [`../../tools/`](../../tools/README.md) from the Russian academic orthographic
+dictionary, cross-checked against OpenCorpora and libmorph. `manual_confirm.txt` holds the
+hand-reviewed additions the pipeline merges in. Built to `dawg/ru_scrabble.dawg` (`make dawg-ru`).
+
+The pipeline's uncommitted intermediates (`orfo_dict_2025.txt`, `all.txt`, debug dumps) are
+regenerated here locally and are git-ignored.
@@ -1,14 +1,14 @@
-# Russian word-list preparation (`dictprep`)
+# Russian word-list preparation (`tools`)

 Builds the Russian **noun** word list for the Scrabble/Эрудит solver out of the official
 Russian academic **orthographic dictionary**, cross-checked against two independent
 morphological dictionaries.

 The goal of the pipeline is a list of **common nouns in the nominative singular**
-(`dictprep/russian/scrabble.txt`), plus an ambiguous tail for manual review.
+(`sources/scrabble_ru/scrabble.txt`), plus an ambiguous tail for manual review.

 > This directory is self-contained tooling for *building* the word list. It is not part
-> of the solver library. The committed result lives in `dictprep/russian/`.
+> of the solver library. The committed result lives in `sources/scrabble_ru/`.

 ## Source

@@ -23,7 +23,7 @@ The PDF is git-ignored (large, third-party); place it here as `orfo_dict_2025.pd
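-pdftotext output is committed as `russian/orfo_dict_2025.txt`, so the word list rebuilds
-from the text alone — the binary PDF is needed only to regenerate that text.
+pdftotext output is written to `sources/scrabble_ru/orfo_dict_2025.txt` (also git-ignored); the
+word list rebuilds from that text alone — the binary PDF is needed only to regenerate it.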

-## Outputs (`dictprep/russian/`)
+## Outputs (`sources/scrabble_ru/`)

 The committed result is **three** files; every other bucket stays in the Stage-2
 process's memory (dump it with `--dump`, query it with `--trace WORD`).
@@ -56,28 +56,28 @@ ru-venv/bin/pip install mawo-pymorphy3            # bundles OpenCorpora 2025 (wo

 # 4. libmorph — the independent morphological dictionary (Stage 2 cross-check)
 sudo apt-get install -y morphrus morphrus-dev moonycode-dev morphapi-dev
-g++ -std=c++17 -O2 dictprep/libmorph_check.cpp -lmorphrus -lmoonycode -o dictprep/libmorph_check
+g++ -std=c++17 -O2 tools/libmorph_check.cpp -lmorphrus -lmoonycode -o tools/libmorph_check
 ```

-If `dictprep/libmorph_check` is absent, Stage 2 still runs — it simply drops libmorph from
+If `tools/libmorph_check` is absent, Stage 2 still runs — it simply drops libmorph from
 the stack and reports `libmorph_helper=MISSING`.

 ## How to run

 ```sh
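-# Stage 0 — PDF -> plain text (committed as the source of truth; run once)
+# Stage 0 — PDF -> plain text (git-ignored intermediate, the pipeline's source of truth; run once)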
-pdftotext dictprep/orfo_dict_2025.pdf dictprep/russian/orfo_dict_2025.txt
+pdftotext tools/orfo_dict_2025.pdf sources/scrabble_ru/orfo_dict_2025.txt

-# Stage 1 — build the base word list (Go): dictprep/russian/all.txt + /tmp/ru_*.txt
-go run ./dictprep/ruwords
+# Stage 1 — build the base word list (Go): sources/scrabble_ru/all.txt + /tmp/ru_*.txt
+go run ./tools/ruwords

 # Stage 2 — the brain (Python + mawo + libmorph): writes scrabble.txt
-ru-venv/bin/python dictprep/ru_stage2.py
+ru-venv/bin/python tools/ru_stage2.py

 # ask how a word did or did not reach the dictionary
-ru-venv/bin/python dictprep/ru_stage2.py --trace травмпункт
+ru-venv/bin/python tools/ru_stage2.py --trace травмпункт
 # also write the in-memory buckets (undefined, adjectives, verbs, singulars, fate.tsv)
-ru-venv/bin/python dictprep/ru_stage2.py --dump
+ru-venv/bin/python tools/ru_stage2.py --dump
 ```

 `-from`/`-to` (defaulting to 452/168808) bound the column word-list section of
@@ -5,7 +5,7 @@ The Эрудит ruleset has no Ё tile and treats Е/Ё as one letter, so its d
 folded before the DAWG is built. Folding merges pairs like ёж/еж, hence the de-dup. Output
 is sorted (Russian order over the 32 folded letters) and LF-separated.

-Run:  python3 dictprep/fold_yo.py dictprep/russian/scrabble.txt > /tmp/ru_erudit_words.txt
+Run:  python3 tools/fold_yo.py sources/scrabble_ru/scrabble.txt > /tmp/ru_erudit_words.txt
 """
 import sys

@@ -5,10 +5,10 @@ It reads the Stage-1 base word list (built once by ruwords so the heavy PDF is n
 re-parsed) together with the grammatical notes and the singular/variant structure, runs
 the whole noun-selection logic in memory, and writes a minimal result:

-    dictprep/russian/scrabble.txt   — the working dictionary (common nouns, nom. sing.)
-    dictprep/russian/undefined.txt  — the ambiguous tail, left for manual review
+    sources/scrabble_ru/scrabble.txt   — the working dictionary (common nouns, nom. sing.)
+    sources/scrabble_ru/undefined.txt  — the ambiguous tail, left for manual review

-(dictprep/russian/all.txt is the Stage-1 base.) Every other bucket — adjectives, verbs,
+(sources/scrabble_ru/all.txt is the Stage-1 base.) Every other bucket — adjectives, verbs,
 the merged note-nouns, singulars, variants — stays in memory. Pass --dump to also write
 them; pass --trace WORD to ask how a single word did or did not reach the dictionary.

@@ -17,9 +17,9 @@ variants are read from the pdftotext output (slov.txt) and the Stage-1 side file
 expensive PDF parse itself runs only once.

 Sources, most authoritative first: OpenCorpora (mawo-pymorphy3), libmorph (libmorph_check),
-and the orthographic dictionary's own notes. See dictprep/README.md.
+and the orthographic dictionary's own notes. See tools/README.md.

-Run:  ru-venv/bin/python dictprep/ru_stage2.py [--dump] [--trace WORD]
+Run:  ru-venv/bin/python tools/ru_stage2.py [--dump] [--trace WORD]
 """
 import argparse
 import os
@@ -27,7 +27,9 @@ import re
 import subprocess

 HERE = os.path.dirname(os.path.abspath(__file__))
-OUT_DIR = os.path.join(HERE, "russian")
+# The curated Russian word lists live in sources/scrabble_ru/ (this tool sits in tools/);
+# the uncommitted pipeline intermediates (orfo/all/debug) are regenerated alongside them.
+OUT_DIR = os.path.join(HERE, "..", "sources", "scrabble_ru")
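-SLOV = os.path.join(OUT_DIR, "orfo_dict_2025.txt")  # committed pdftotext output (source of truth)
+SLOV = os.path.join(OUT_DIR, "orfo_dict_2025.txt")  # pdftotext output (git-ignored; source of truth)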
 WL_FROM, WL_TO = 452, 168808  # 1-based inclusive bounds of the column word-list section
 OC_CACHE = "/tmp/oc_nouns.txt"
@@ -322,7 +324,7 @@ def main():
        return

    write(os.path.join(OUT_DIR, "scrabble.txt"), r["scrabble"])
-    print(f"=> dictprep/russian/scrabble.txt   {len(r['scrabble'])}")
+    print(f"=> sources/scrabble_ru/scrabble.txt   {len(r['scrabble'])}")
    print(f"   undefined kept in memory: {len(set(r['undefined']))} (use --dump to write it)")
    if args.dump:
        write(os.path.join(OUT_DIR, "undefined.txt"), r["undefined"])
@@ -11,10 +11,10 @@
 //
 // It also collects a variant headword joined by "и" when it carries its own grammatical
 // note (e.g. "аблатив, -а и аблятив, -а"). Suffix-singular reconstruction is heuristic;
-// Stage 2 (dictprep/ru_stage2.py) re-checks the words against real dictionaries.
+// Stage 2 (tools/ru_stage2.py) re-checks the words against real dictionaries.
 //
-//	pdftotext dictprep/orfo_dict_2025.pdf /tmp/slov.txt
-//	go run ./dictprep/ruwords -in /tmp/slov.txt -from 452 -to 168808 \
+//	pdftotext tools/orfo_dict_2025.pdf /tmp/slov.txt
+//	go run ./tools/ruwords -in /tmp/slov.txt -from 452 -to 168808 \
 //	    -out russian_all.txt -skip russian_skip.txt
 package main

@@ -327,8 +327,8 @@ func writeWords(path string, words []string) error {
 }

 func main() {
-	in := flag.String("in", "dictprep/russian/orfo_dict_2025.txt", "plain-text dictionary (pdftotext output)")
-	out := flag.String("out", "dictprep/russian/all.txt", "output: the base word list (clean headwords + reconstructed singulars + variants)")
+	in := flag.String("in", "sources/scrabble_ru/orfo_dict_2025.txt", "plain-text dictionary (pdftotext output)")
+	out := flag.String("out", "sources/scrabble_ru/all.txt", "output: the base word list (clean headwords + reconstructed singulars + variants)")
 	skip := flag.String("skip", "/tmp/ru_skip.txt", "output: every other token, for a later morphology re-check")
 	sings := flag.String("singulars", "/tmp/ru_singulars.txt", "output: singulars reconstructed from \"ед.\" (known nouns)")
 	varsOut := flag.String("variants", "/tmp/ru_variants.txt", "output: variant pairs joined by \"и\" (primary<TAB>variant)")