2026-06-01 22:02:56 +00:00
6 changed files with 67 additions and 4 deletions
@@ -0,0 +1,28 @@
 # Scrabble-solver build helpers.
 #
 # `make dawg` (re)builds the committed dictionary DAWGs under dawg/ from their word lists:
 #   en_sowpods.dawg  — English SOWPODS (Latin alphabet)
 #   ru_scrabble.dawg — Russian Scrabble nouns (Cyrillic, 33 letters)
 #   ru_erudit.dawg   — Эрудит (the same list with Ё→Е folded and de-duped)
 # Tool overrides: `make GO=... PYTHON=...` replaces the defaults below.
 GO        ?= go
 PYTHON    ?= python3
 # Directory that receives the generated .dawg files.
 DAWG_DIR  := dawg
 # Russian Scrabble word list: the input for both ru_scrabble and ru_erudit.
 RU_WORDS  := dictprep/russian/scrabble.txt
 # The DAWG builder is invoked from source via `go run`.
 BUILDDICT := $(GO) run ./cmd/builddict
 # Every target here is a command name, not a file it produces.
 .PHONY: dawg dawg-en dawg-ru dawg-erudit clean-dawg
 # Rebuild all three dictionaries.
 dawg: dawg-en dawg-ru dawg-erudit
 dawg-en:
 	$(BUILDDICT) -dict dictionaries/english/sowpods.txt -alphabet latin -name en_sowpods -out $(DAWG_DIR)
 dawg-ru:
 	$(BUILDDICT) -dict $(RU_WORDS) -alphabet russian -name ru_scrabble -out $(DAWG_DIR)
 # The Ё→Е folded list is only an intermediate. It is written into a private
 # `mktemp -d` directory instead of a fixed /tmp path, so concurrent builds cannot
 # clobber each other and nobody can pre-create the file. The recipe is one shell
 # invocation (backslash continuations) so the EXIT trap removes the directory
 # whether the build succeeds or fails. The file keeps its original base name.
 dawg-erudit:
 	tmpdir=$$(mktemp -d) && trap 'rm -rf "$$tmpdir"' EXIT && \
 	$(PYTHON) dictprep/fold_yo.py $(RU_WORDS) > "$$tmpdir/ru_erudit_words.txt" && \
 	$(BUILDDICT) -dict "$$tmpdir/ru_erudit_words.txt" -alphabet russian -name ru_erudit -out $(DAWG_DIR)
 # Remove the generated dictionaries only; the word lists are left alone.
 clean-dawg:
 	rm -f $(DAWG_DIR)/*.dawg
@@ -1,6 +1,5 @@
-// Command builddict converts a word list into a serialized DAWG, cached under testdata
+// Command builddict converts a word list into a serialized DAWG. By default it reads the
-// for the tests and the benchmark. By default it reads the English SOWPODS list from
+// English SOWPODS list (Latin alphabet); pass -alphabet russian for the Cyrillic lists.
 // the dictionaries submodule.
 package main
 import (
@@ -23,9 +22,18 @@ func main() {
 	name := flag.String("name", "sowpods", "base name for the output file")
 	minLen := flag.Int("min", 2, "minimum word length")
 	maxLen := flag.Int("max", 15, "maximum word length")
 	alpha := flag.String("alphabet", "latin", "alphabet: latin (English) or russian")
 	flag.Parse()
-	idx := alphabet.Latin()
+	var idx alphabet.Indexer
 	switch *alpha {
 	case "latin":
 		idx = alphabet.Latin()
 	case "russian":
 		idx = alphabet.Embedded(alphabet.Langs.LangRu)
 	default:
 		log.Fatalf("unknown -alphabet %q (want latin or russian)", *alpha)
 	}
 	t0 := time.Now()
 	words, err := wordlist.Read(*dict, idx, *minLen, *maxLen)
@@ -0,0 +1,27 @@
 #!/usr/bin/env python3
 """Fold Ё/ё → Е/е in a word list and de-duplicate — the dictionary prep for "Эрудит".
 The Эрудит ruleset has no Ё tile and treats Е/Ё as one letter, so its dictionary must be
 folded before the DAWG is built. Folding merges pairs like ёж/еж, hence the de-dup. Output
 is sorted (Russian order over the 32 folded letters) and LF-separated.
 Run:  python3 dictprep/fold_yo.py dictprep/russian/scrabble.txt > /tmp/ru_erudit_words.txt
 """
 import sys
 # Collation rank of every letter in the folded alphabet.
 ORDER = {c: i for i, c in enumerate("абвгдежзийклмнопрстуфхцчшщъыьэюя")}  # 32 letters, no ё
 def key(w):
    """Return the sort key for word ``w``: one rank per character.
    Characters that are not in ORDER (upper-case letters, punctuation, ...) all get
    rank 99, which places them after every letter of the alphabet.
    """
    return [ORDER.get(c, 99) for c in w]
 def main():
    """Fold Ё/ё to Е/е, de-duplicate, sort, and write the word list to stdout.
    Input is the file named by the first command-line argument, or standard input
    when there is none, decoded as UTF-8. Each line is stripped and blank lines are
    dropped. Output is UTF-8 with one word per line, each terminated by a single LF;
    empty input produces empty output.
    """
    # Without an argument, read stdin by file descriptor rather than through the
    # Unix-only "/dev/stdin" path. closefd=False keeps the descriptor open; it must
    # be True when ``src`` is a file name.
    src = sys.argv[1] if len(sys.argv) > 1 else sys.stdin.fileno()
    with open(src, encoding="utf-8", closefd=isinstance(src, str)) as fh:
       words = {line.strip().replace("ё", "е").replace("Ё", "Е") for line in fh}
    words.discard("")
    # sorted() is stable, so pre-sorting by code point fixes the order of words whose
    # keys tie (they differ only in rank-99 characters). Otherwise their order would
    # follow set iteration, which changes from run to run with hash randomization.
    ordered = sorted(sorted(words), key=key)
    text = "".join(word + "\n" for word in ordered)
    # Write bytes so neither the locale encoding nor platform newline translation
    # can alter the output. Fall back to text mode if stdout has no byte buffer.
    raw = getattr(sys.stdout, "buffer", None)
    if raw is None:
       sys.stdout.write(text)
    else:
       sys.stdout.flush()
       raw.write(text.encode("utf-8"))
 if __name__ == "__main__":
    main()