0d9b998db3
- cmd/builddict: add -alphabet latin|russian (russian = alphabet.Embedded(LangRu)).
- dictprep/fold_yo.py: fold Ё→Е and de-dup, the Эрудит dictionary prep.
- Makefile: `make dawg` rebuilds dawg/{en_sowpods,ru_scrabble,ru_erudit}.dawg.
- dawg/: committed DAWGs verified by enumeration — 267752 / 83385 / 83343 words.
28 lines
984 B
Python
28 lines
984 B
Python
#!/usr/bin/env python3
|
||
"""Fold Ё/ё → Е/е in a word list and de-duplicate — the dictionary prep for "Эрудит".
|
||
|
||
The Эрудит ruleset has no Ё tile and treats Е/Ё as one letter, so its dictionary must be
folded before the DAWG is built. Folding merges pairs like ёж/еж, hence the de-dup. Output
is sorted (Russian order over the 32 folded letters) and LF-separated.

Run: python3 dictprep/fold_yo.py dictprep/russian/scrabble.txt > /tmp/ru_erudit_words.txt
With no path argument the word list is read from standard input.
|
||
"""
|
||
import sys
|
||
|
||
ORDER = {c: i for i, c in enumerate("абвгдежзийклмнопрстуфхцчшщъыьэюя")}  # 32 letters, no ё

# Base rank for characters outside ORDER; larger than every alphabet index (0..31).
_UNKNOWN_BASE = 99


def key(w):
    """Return the sort key for word *w*: one integer rank per character.

    Letters of the folded alphabet get their ORDER index. Any other character
    (uppercase, Latin, punctuation, an unfolded "ё") ranks after every
    alphabet letter, at 99 plus its code point, so two different words never
    share a key. If all unknown characters shared one rank, words differing
    only in those characters would tie; the script sorts a set, so tied words
    would come out in set-iteration order, which is not stable between runs.
    """
    return [ORDER.get(c, _UNKNOWN_BASE + ord(c)) for c in w]
|
||
|
||
|
||
def main():
    """Fold Ё/ё to Е/е in a word list, drop duplicates and print it sorted.

    The list is read as UTF-8 from the path in ``sys.argv[1]``, or from
    standard input when no argument is given. Each line is stripped of
    surrounding whitespace, blank lines are dropped, and the remaining words
    are written to standard output one per line, ordered by ``key``.
    """
    if len(sys.argv) > 1:
        stream = open(sys.argv[1], encoding="utf-8")
    else:
        # Wrap the existing stdin descriptor instead of opening "/dev/stdin",
        # which does not exist on every platform; closefd=False leaves the
        # descriptor itself open when the wrapper is closed.
        stream = open(sys.stdin.fileno(), encoding="utf-8", closefd=False)

    fold = str.maketrans("ёЁ", "еЕ")
    # The with-block closes the input as soon as it has been read.
    with stream:
        words = {line.strip().translate(fold) for line in stream}
    words.discard("")

    # One write per word, so empty input yields empty output instead of a
    # lone blank line.
    sys.stdout.writelines(word + "\n" for word in sorted(words, key=key))
|
||
|
||
|
||
# Run the fold only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|