diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..cd13623 --- /dev/null +++ b/Makefile @@ -0,0 +1,28 @@ +# Scrabble-solver build helpers. +# +# `make dawg` (re)builds the committed dictionary DAWGs under dawg/ from their word lists: +# en_sowpods.dawg — English SOWPODS (Latin alphabet) +# ru_scrabble.dawg — Russian Scrabble nouns (Cyrillic, 33 letters) +# ru_erudit.dawg — Эрудит (the same list with Ё→Е folded and de-duped) + +GO ?= go +PYTHON ?= python3 +DAWG_DIR := dawg +BUILDDICT := $(GO) run ./cmd/builddict + +.PHONY: dawg dawg-en dawg-ru dawg-erudit clean-dawg + +dawg: dawg-en dawg-ru dawg-erudit + +dawg-en: + $(BUILDDICT) -dict dictionaries/english/sowpods.txt -alphabet latin -name en_sowpods -out $(DAWG_DIR) + +dawg-ru: + $(BUILDDICT) -dict dictprep/russian/scrabble.txt -alphabet russian -name ru_scrabble -out $(DAWG_DIR) + +dawg-erudit: + $(PYTHON) dictprep/fold_yo.py dictprep/russian/scrabble.txt > /tmp/ru_erudit_words.txt + $(BUILDDICT) -dict /tmp/ru_erudit_words.txt -alphabet russian -name ru_erudit -out $(DAWG_DIR) + +clean-dawg: + rm -f $(DAWG_DIR)/*.dawg diff --git a/cmd/builddict/main.go b/cmd/builddict/main.go index a6a7320..567713d 100644 --- a/cmd/builddict/main.go +++ b/cmd/builddict/main.go @@ -1,6 +1,5 @@ -// Command builddict converts a word list into a serialized DAWG, cached under testdata -// for the tests and the benchmark. By default it reads the English SOWPODS list from -// the dictionaries submodule. +// Command builddict converts a word list into a serialized DAWG. By default it reads the +// English SOWPODS list (Latin alphabet); pass -alphabet russian for the Cyrillic lists. 
package main import ( @@ -23,9 +22,18 @@ func main() { name := flag.String("name", "sowpods", "base name for the output file") minLen := flag.Int("min", 2, "minimum word length") maxLen := flag.Int("max", 15, "maximum word length") + alpha := flag.String("alphabet", "latin", "alphabet: latin (English) or russian") flag.Parse() - idx := alphabet.Latin() + var idx alphabet.Indexer + switch *alpha { + case "latin": + idx = alphabet.Latin() + case "russian": + idx = alphabet.Embedded(alphabet.Langs.LangRu) + default: + log.Fatalf("unknown -alphabet %q (want latin or russian)", *alpha) + } t0 := time.Now() words, err := wordlist.Read(*dict, idx, *minLen, *maxLen) diff --git a/dawg/en_sowpods.dawg b/dawg/en_sowpods.dawg new file mode 100644 index 0000000..d8763c1 Binary files /dev/null and b/dawg/en_sowpods.dawg differ diff --git a/dawg/ru_erudit.dawg b/dawg/ru_erudit.dawg new file mode 100644 index 0000000..ccb3229 Binary files /dev/null and b/dawg/ru_erudit.dawg differ diff --git a/dawg/ru_scrabble.dawg b/dawg/ru_scrabble.dawg new file mode 100644 index 0000000..3342b70 Binary files /dev/null and b/dawg/ru_scrabble.dawg differ diff --git a/dictprep/fold_yo.py b/dictprep/fold_yo.py new file mode 100644 index 0000000..c192653 --- /dev/null +++ b/dictprep/fold_yo.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 +"""Fold Ё/ё → Е/е in a word list and de-duplicate — the dictionary prep for "Эрудит". + +The Эрудит ruleset has no Ё tile and treats Е/Ё as one letter, so its dictionary must be +folded before the DAWG is built. Folding merges pairs like ёж/еж, hence the de-dup. Output +is sorted (Russian order over the 32 folded letters) and LF-separated. 
# Run: python3 dictprep/fold_yo.py dictprep/russian/scrabble.txt > /tmp/ru_erudit_words.txt
import sys

# Collation rank of each lowercase Russian letter, in alphabetical order.
# 32 letters: ё is absent because it is folded into е before sorting.
ORDER = {c: i for i, c in enumerate("абвгдежзийклмнопрстуфхцчшщъыьэюя")}

# Rank for any character missing from ORDER (uppercase letters, hyphens, ...).
# It is larger than every real rank, so such characters sort last.
_UNKNOWN_RANK = 99


def key(w):
    """Return the collation key of ``w``: a list with one rank per character."""
    return [ORDER.get(c, _UNKNOWN_RANK) for c in w]


def _fold_words(lines):
    """Fold Ё/ё into Е/е, drop blanks and duplicates, and return the sorted words.

    ``lines`` is any iterable of strings; surrounding whitespace (including the
    line terminator) is stripped from each one.
    """
    folded = {line.strip().replace("ё", "е").replace("Ё", "Е") for line in lines}
    folded.discard("")
    # Characters outside ORDER all share one rank, so distinct words can tie on
    # key(). Set iteration order varies with the per-process string hash seed;
    # the word itself is the tie-breaker that keeps the output reproducible.
    return sorted(folded, key=lambda w: (key(w), w))


def main():
    """Write the folded, de-duplicated, sorted word list to stdout.

    The list is read as UTF-8 from the file named by ``sys.argv[1]``, or from
    standard input when no argument is given. Each output word is followed by
    a single LF; empty input produces empty output.
    """
    if len(sys.argv) > 1:
        with open(sys.argv[1], encoding="utf-8") as src:
            words = _fold_words(src)
    else:
        # Wrap the stdin descriptor instead of opening "/dev/stdin", which does
        # not exist on every platform. closefd=False leaves the process's real
        # stdin open when the wrapper is closed.
        with open(sys.stdin.fileno(), encoding="utf-8", closefd=False) as src:
            words = _fold_words(src)
    sys.stdout.write("".join(word + "\n" for word in words))


if __name__ == "__main__":
    main()