Tidy dictionary sources into sources/<variant>/ + tools/ #2
@@ -34,9 +34,9 @@ jobs:
|
||||
- name: Build DAWGs
|
||||
run: |
|
||||
mkdir -p dawg
|
||||
go run ./cmd/builddict -dict dictionaries/english/sowpods.txt -alphabet latin -name en_sowpods -out dawg
|
||||
go run ./cmd/builddict -dict dictprep/russian/scrabble.txt -alphabet russian -name ru_scrabble -out dawg
|
||||
go run ./cmd/builddict -dict dictprep/russian/erudit.txt -alphabet russian -name ru_erudit -out dawg
|
||||
go run ./cmd/builddict -dict sources/scrabble_en/sowpods.txt -alphabet latin -name en_sowpods -out dawg
|
||||
go run ./cmd/builddict -dict sources/scrabble_ru/scrabble.txt -alphabet russian -name ru_scrabble -out dawg
|
||||
go run ./cmd/builddict -dict sources/erudit_ru/erudit.txt -alphabet russian -name ru_erudit -out dawg
|
||||
ls -la dawg/
|
||||
for f in en_sowpods ru_scrabble ru_erudit; do
|
||||
test -s "dawg/$f.dawg" || { echo "missing dawg/$f.dawg"; exit 1; }
|
||||
|
||||
+13
@@ -1,3 +1,16 @@
|
||||
# Built DAWGs are release artifacts (published by CI on a vX.Y.Z tag), not committed.
|
||||
/dawg/
|
||||
/scrabble-dawg-*.tar.gz
|
||||
|
||||
# Russian prep-pipeline intermediates (regenerated locally by tools/; only the curated
|
||||
# word lists in sources/scrabble_ru/ are committed).
|
||||
/sources/scrabble_ru/orfo_dict_2025.txt
|
||||
/sources/scrabble_ru/all.txt
|
||||
/sources/scrabble_ru/undefined.txt
|
||||
/sources/scrabble_ru/adjectives.txt
|
||||
/sources/scrabble_ru/verbs.txt
|
||||
/sources/scrabble_ru/singulars.txt
|
||||
/sources/scrabble_ru/fate.tsv
|
||||
/tools/libmorph_check
|
||||
/tools/orfo_dict_2025.pdf
|
||||
__pycache__/
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
#   ru_erudit.dawg   — Эрудит (the Ё→Е folded + de-duped list, committed as sources/erudit_ru/erudit.txt)
|
||||
#
|
||||
# CI builds the DAWGs as a validation gate; release artifacts are published from this output
|
||||
# (see README.md). Regenerate russian/erudit.txt from scrabble.txt with dictprep/fold_yo.py.
|
||||
# (see README.md). Regenerate sources/erudit_ru/erudit.txt from sources/scrabble_ru/scrabble.txt with tools/fold_yo.py.
|
||||
|
||||
export GOPRIVATE := gitea.iliadenisov.ru/*
|
||||
|
||||
@@ -21,13 +21,13 @@ BUILDDICT := $(GO) run ./cmd/builddict
|
||||
dawg: dawg-en dawg-ru dawg-erudit
|
||||
|
||||
dawg-en:
|
||||
$(BUILDDICT) -dict dictionaries/english/sowpods.txt -alphabet latin -name en_sowpods -out $(DAWG_DIR)
|
||||
$(BUILDDICT) -dict sources/scrabble_en/sowpods.txt -alphabet latin -name en_sowpods -out $(DAWG_DIR)
|
||||
|
||||
dawg-ru:
|
||||
$(BUILDDICT) -dict dictprep/russian/scrabble.txt -alphabet russian -name ru_scrabble -out $(DAWG_DIR)
|
||||
$(BUILDDICT) -dict sources/scrabble_ru/scrabble.txt -alphabet russian -name ru_scrabble -out $(DAWG_DIR)
|
||||
|
||||
dawg-erudit:
|
||||
$(BUILDDICT) -dict dictprep/russian/erudit.txt -alphabet russian -name ru_erudit -out $(DAWG_DIR)
|
||||
$(BUILDDICT) -dict sources/erudit_ru/erudit.txt -alphabet russian -name ru_erudit -out $(DAWG_DIR)
|
||||
|
||||
clean-dawg:
|
||||
rm -f $(DAWG_DIR)/*.dawg
|
||||
|
||||
@@ -17,9 +17,9 @@ byte-identical to the solver's committed test fixtures.
|
||||
|
||||
| file | variant | source |
|
||||
| --- | --- | --- |
|
||||
| `en_sowpods.dawg` | English (SOWPODS) | `dictionaries/english/sowpods.txt` |
|
||||
| `ru_scrabble.dawg` | Russian Scrabble | `dictprep/russian/scrabble.txt` |
|
||||
| `ru_erudit.dawg` | Эрудит | `dictprep/russian/erudit.txt` (Ё→Е folded `scrabble.txt`, via `dictprep/fold_yo.py`) |
|
||||
| `en_sowpods.dawg` | English (SOWPODS) | `sources/scrabble_en/sowpods.txt` |
|
||||
| `ru_scrabble.dawg` | Russian Scrabble | `sources/scrabble_ru/scrabble.txt` |
|
||||
| `ru_erudit.dawg` | Эрудит | `sources/erudit_ru/erudit.txt` (Ё→Е folded `scrabble.txt`, via `tools/fold_yo.py`) |
|
||||
|
||||
The CI (`.gitea/workflows/build.yaml`) rebuilds them on every push/PR as a validation gate
|
||||
(inlined `go run`, no `make`/`python` needed on the runner). Release artifacts are published per
|
||||
@@ -30,11 +30,11 @@ a new release, never breaking a running backend).
|
||||
|
||||
## Sources / provenance
|
||||
|
||||
- **English:** `dictionaries/english/sowpods.txt`, vendored from
|
||||
- **English:** `sources/scrabble_en/sowpods.txt`, vendored from
|
||||
[`kamilmielnik/scrabble-dictionaries`](https://github.com/kamilmielnik/scrabble-dictionaries).
|
||||
- **Russian:** `dictprep/russian/scrabble.txt`, derived from the Russian academic orthographic
|
||||
dictionary by the tooling under `dictprep/` (see `dictprep/README.md`); `dictprep/russian/erudit.txt`
|
||||
is its Ё→Е folded form (`dictprep/fold_yo.py`). Only the prepared word lists are vendored; the
|
||||
- **Russian:** `sources/scrabble_ru/scrabble.txt`, derived from the Russian academic orthographic
|
||||
dictionary by the tooling under `tools/` (see `tools/README.md`); `sources/erudit_ru/erudit.txt`
|
||||
is its Ё→Е folded form (`tools/fold_yo.py`). Only the prepared word lists are vendored; the
|
||||
heavy upstream source (the orfo PDF/text) is not.
|
||||
|
||||
## Build
|
||||
@@ -45,7 +45,7 @@ make dawg # -> dawg/{en_sowpods,ru_scrabble,ru_erudit}.dawg
|
||||
|
||||
Requires Go (module deps fetched with `GOPRIVATE=gitea.iliadenisov.ru/*`, exported by the
|
||||
Makefile). No `python` is needed for the build — the Ё→Е fold is committed as `erudit.txt`;
|
||||
regenerate it with `python3 dictprep/fold_yo.py dictprep/russian/scrabble.txt > dictprep/russian/erudit.txt`.
|
||||
regenerate it with `python3 tools/fold_yo.py sources/scrabble_ru/scrabble.txt > sources/erudit_ru/erudit.txt`.
|
||||
|
||||
## Release
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@ import (
|
||||
)
|
||||
|
||||
func main() {
|
||||
dict := flag.String("dict", "dictionaries/english/sowpods.txt", "word list file (one word per line)")
|
||||
dict := flag.String("dict", "sources/scrabble_en/sowpods.txt", "word list file (one word per line)")
|
||||
out := flag.String("out", "testdata", "output directory")
|
||||
name := flag.String("name", "sowpods", "base name for the output file")
|
||||
minLen := flag.Int("min", 2, "minimum word length")
|
||||
|
||||
@@ -0,0 +1,6 @@
|
||||
# erudit_ru source
|
||||
|
||||
`erudit.txt` — the Эрудит word list: the Ё→Е folded and de-duplicated form of
|
||||
[`../scrabble_ru/scrabble.txt`](../scrabble_ru/scrabble.txt), produced by `tools/fold_yo.py`
|
||||
(the Эрудит ruleset has no Ё tile and treats Е/Ё as one letter). Built to `dawg/ru_erudit.dawg`
|
||||
(`make dawg-erudit`).
|
||||
@@ -0,0 +1,5 @@
|
||||
# scrabble_en source
|
||||
|
||||
`sowpods.txt` — the English SOWPODS word list, vendored from
|
||||
[`kamilmielnik/scrabble-dictionaries`](https://github.com/kamilmielnik/scrabble-dictionaries).
|
||||
Built to `dawg/en_sowpods.dawg` (`make dawg-en`).
|
||||
@@ -0,0 +1,9 @@
|
||||
# scrabble_ru source
|
||||
|
||||
`scrabble.txt` — Russian Scrabble common nouns (nominative singular), produced by the prep
|
||||
pipeline under [`../../tools/`](../../tools/README.md) from the Russian academic orthographic
|
||||
dictionary, cross-checked against OpenCorpora and libmorph. `manual_confirm.txt` holds the
|
||||
hand-reviewed additions the pipeline merges in. Built to `dawg/ru_scrabble.dawg` (`make dawg-ru`).
|
||||
|
||||
The pipeline's uncommitted intermediates (`orfo_dict_2025.txt`, `all.txt`, debug dumps) are
|
||||
regenerated here locally and are git-ignored.
|
||||
@@ -1,14 +1,14 @@
|
||||
# Russian word-list preparation (`dictprep`)
|
||||
# Russian word-list preparation (`tools`)
|
||||
|
||||
Builds the Russian **noun** word list for the Scrabble/Эрудит solver out of the official
|
||||
Russian academic **orthographic dictionary**, cross-checked against two independent
|
||||
morphological dictionaries.
|
||||
|
||||
The goal of the pipeline is a list of **common nouns in the nominative singular**
|
||||
(`dictprep/russian/scrabble.txt`), plus an ambiguous tail for manual review.
|
||||
(`sources/scrabble_ru/scrabble.txt`), plus an ambiguous tail for manual review.
|
||||
|
||||
> This directory is self-contained tooling for *building* the word list. It is not part
|
||||
> of the solver library. The committed result lives in `dictprep/russian/`.
|
||||
> of the solver library. The committed result lives in `sources/scrabble_ru/`.
|
||||
|
||||
## Source
|
||||
|
||||
@@ -23,7 +23,7 @@ The PDF is git-ignored (large, third-party); place it here as `orfo_dict_2025.pd
|
||||
pdftotext output is kept locally as `sources/scrabble_ru/orfo_dict_2025.txt` (git-ignored), so the word list rebuilds
|
||||
from the text alone — the binary PDF is needed only to regenerate that text.
|
||||
|
||||
## Outputs (`dictprep/russian/`)
|
||||
## Outputs (`sources/scrabble_ru/`)
|
||||
|
||||
The committed result is **three** files; every other bucket stays in the Stage-2
|
||||
process's memory (dump it with `--dump`, query it with `--trace WORD`).
|
||||
@@ -56,28 +56,28 @@ ru-venv/bin/pip install mawo-pymorphy3 # bundles OpenCorpora 2025 (wo
|
||||
|
||||
# 4. libmorph — the independent morphological dictionary (Stage 2 cross-check)
|
||||
sudo apt-get install -y morphrus morphrus-dev moonycode-dev morphapi-dev
|
||||
g++ -std=c++17 -O2 dictprep/libmorph_check.cpp -lmorphrus -lmoonycode -o dictprep/libmorph_check
|
||||
g++ -std=c++17 -O2 tools/libmorph_check.cpp -lmorphrus -lmoonycode -o tools/libmorph_check
|
||||
```
|
||||
|
||||
If `dictprep/libmorph_check` is absent, Stage 2 still runs — it simply drops libmorph from
|
||||
If `tools/libmorph_check` is absent, Stage 2 still runs — it simply drops libmorph from
|
||||
the stack and reports `libmorph_helper=MISSING`.
|
||||
|
||||
## How to run
|
||||
|
||||
```sh
|
||||
# Stage 0 — PDF -> plain text (git-ignored local intermediate; the pipeline's source of truth; run once)
|
||||
pdftotext dictprep/orfo_dict_2025.pdf dictprep/russian/orfo_dict_2025.txt
|
||||
pdftotext tools/orfo_dict_2025.pdf sources/scrabble_ru/orfo_dict_2025.txt
|
||||
|
||||
# Stage 1 — build the base word list (Go): dictprep/russian/all.txt + /tmp/ru_*.txt
|
||||
go run ./dictprep/ruwords
|
||||
# Stage 1 — build the base word list (Go): sources/scrabble_ru/all.txt + /tmp/ru_*.txt
|
||||
go run ./tools/ruwords
|
||||
|
||||
# Stage 2 — the brain (Python + mawo + libmorph): writes scrabble.txt
|
||||
ru-venv/bin/python dictprep/ru_stage2.py
|
||||
ru-venv/bin/python tools/ru_stage2.py
|
||||
|
||||
# ask how a word did or did not reach the dictionary
|
||||
ru-venv/bin/python dictprep/ru_stage2.py --trace травмпункт
|
||||
ru-venv/bin/python tools/ru_stage2.py --trace травмпункт
|
||||
# also write the in-memory buckets (undefined, adjectives, verbs, singulars, fate.tsv)
|
||||
ru-venv/bin/python dictprep/ru_stage2.py --dump
|
||||
ru-venv/bin/python tools/ru_stage2.py --dump
|
||||
```
|
||||
|
||||
`-from`/`-to` (defaulting to 452/168808) bound the column word-list section of
|
||||
@@ -5,7 +5,7 @@ The Эрудит ruleset has no Ё tile and treats Е/Ё as one letter, so its d
|
||||
folded before the DAWG is built. Folding merges pairs like ёж/еж, hence the de-dup. Output
|
||||
is sorted (Russian order over the 32 folded letters) and LF-separated.
|
||||
|
||||
Run: python3 dictprep/fold_yo.py dictprep/russian/scrabble.txt > /tmp/ru_erudit_words.txt
|
||||
Run: python3 tools/fold_yo.py sources/scrabble_ru/scrabble.txt > /tmp/ru_erudit_words.txt
|
||||
"""
|
||||
import sys
|
||||
|
||||
@@ -5,10 +5,10 @@ It reads the Stage-1 base word list (built once by ruwords so the heavy PDF is n
|
||||
re-parsed) together with the grammatical notes and the singular/variant structure, runs
|
||||
the whole noun-selection logic in memory, and writes a minimal result:
|
||||
|
||||
dictprep/russian/scrabble.txt — the working dictionary (common nouns, nom. sing.)
|
||||
dictprep/russian/undefined.txt — the ambiguous tail, left for manual review
|
||||
sources/scrabble_ru/scrabble.txt — the working dictionary (common nouns, nom. sing.)
|
||||
sources/scrabble_ru/undefined.txt — the ambiguous tail, left for manual review
|
||||
|
||||
(dictprep/russian/all.txt is the Stage-1 base.) Every other bucket — adjectives, verbs,
|
||||
(sources/scrabble_ru/all.txt is the Stage-1 base.) Every other bucket — adjectives, verbs,
|
||||
the merged note-nouns, singulars, variants — stays in memory. Pass --dump to also write
|
||||
them; pass --trace WORD to ask how a single word did or did not reach the dictionary.
|
||||
|
||||
@@ -17,9 +17,9 @@ variants are read from the pdftotext output (slov.txt) and the Stage-1 side file
|
||||
expensive PDF parse itself runs only once.
|
||||
|
||||
Sources, most authoritative first: OpenCorpora (mawo-pymorphy3), libmorph (libmorph_check),
|
||||
and the orthographic dictionary's own notes. See dictprep/README.md.
|
||||
and the orthographic dictionary's own notes. See tools/README.md.
|
||||
|
||||
Run: ru-venv/bin/python dictprep/ru_stage2.py [--dump] [--trace WORD]
|
||||
Run: ru-venv/bin/python tools/ru_stage2.py [--dump] [--trace WORD]
|
||||
"""
|
||||
import argparse
|
||||
import os
|
||||
@@ -27,7 +27,9 @@ import re
|
||||
import subprocess
|
||||
|
||||
HERE = os.path.dirname(os.path.abspath(__file__))
|
||||
OUT_DIR = os.path.join(HERE, "russian")
|
||||
# The curated Russian word lists live in sources/scrabble_ru/ (this tool sits in tools/);
|
||||
# the uncommitted pipeline intermediates (orfo/all/debug) are regenerated alongside them.
|
||||
OUT_DIR = os.path.join(HERE, "..", "sources", "scrabble_ru")
|
||||
SLOV = os.path.join(OUT_DIR, "orfo_dict_2025.txt") # committed pdftotext output (source of truth)
|
||||
WL_FROM, WL_TO = 452, 168808 # 1-based inclusive bounds of the column word-list section
|
||||
OC_CACHE = "/tmp/oc_nouns.txt"
|
||||
@@ -322,7 +324,7 @@ def main():
|
||||
return
|
||||
|
||||
write(os.path.join(OUT_DIR, "scrabble.txt"), r["scrabble"])
|
||||
print(f"=> dictprep/russian/scrabble.txt {len(r['scrabble'])}")
|
||||
print(f"=> sources/scrabble_ru/scrabble.txt {len(r['scrabble'])}")
|
||||
print(f" undefined kept in memory: {len(set(r['undefined']))} (use --dump to write it)")
|
||||
if args.dump:
|
||||
write(os.path.join(OUT_DIR, "undefined.txt"), r["undefined"])
|
||||
@@ -11,10 +11,10 @@
|
||||
//
|
||||
// It also collects a variant headword joined by "и" when it carries its own grammatical
|
||||
// note (e.g. "аблатив, -а и аблятив, -а"). Suffix-singular reconstruction is heuristic;
|
||||
// Stage 2 (dictprep/ru_stage2.py) re-checks the words against real dictionaries.
|
||||
// Stage 2 (tools/ru_stage2.py) re-checks the words against real dictionaries.
|
||||
//
|
||||
// pdftotext dictprep/orfo_dict_2025.pdf /tmp/slov.txt
|
||||
// go run ./dictprep/ruwords -in /tmp/slov.txt -from 452 -to 168808 \
|
||||
// pdftotext tools/orfo_dict_2025.pdf /tmp/slov.txt
|
||||
// go run ./tools/ruwords -in /tmp/slov.txt -from 452 -to 168808 \
|
||||
// -out russian_all.txt -skip russian_skip.txt
|
||||
package main
|
||||
|
||||
@@ -327,8 +327,8 @@ func writeWords(path string, words []string) error {
|
||||
}
|
||||
|
||||
func main() {
|
||||
in := flag.String("in", "dictprep/russian/orfo_dict_2025.txt", "plain-text dictionary (pdftotext output)")
|
||||
out := flag.String("out", "dictprep/russian/all.txt", "output: the base word list (clean headwords + reconstructed singulars + variants)")
|
||||
in := flag.String("in", "sources/scrabble_ru/orfo_dict_2025.txt", "plain-text dictionary (pdftotext output)")
|
||||
out := flag.String("out", "sources/scrabble_ru/all.txt", "output: the base word list (clean headwords + reconstructed singulars + variants)")
|
||||
skip := flag.String("skip", "/tmp/ru_skip.txt", "output: every other token, for a later morphology re-check")
|
||||
sings := flag.String("singulars", "/tmp/ru_singulars.txt", "output: singulars reconstructed from \"ед.\" (known nouns)")
|
||||
varsOut := flag.String("variants", "/tmp/ru_variants.txt", "output: variant pairs joined by \"и\" (primary<TAB>variant)")
|
||||
Reference in New Issue
Block a user