#!/usr/bin/env python3
"""Fold Ё/ё → Е/е in a word list and de-duplicate — the dictionary prep for "Эрудит".

The Эрудит ruleset has no Ё tile and treats Е/Ё as one letter, so its
dictionary must be folded before the DAWG is built. Folding merges pairs like
ёж/еж, hence the de-dup. Output is sorted (Russian order over the 32 folded
letters) and LF-separated.

Run:
    python3 dictprep/fold_yo.py dictprep/russian/scrabble.txt > /tmp/ru_erudit_words.txt
"""
import sys

ORDER = {c: i for i, c in enumerate("абвгдежзийклмнопрстуфхцчшщъыьэюя")}  # 32 letters, no ё

# One-pass translation table for the Ё/ё → Е/е fold.
_FOLD = str.maketrans("ёЁ", "еЕ")


def key(w: str) -> list:
    """Return the sort key of *w*: one alphabet rank per character.

    Characters that are not in ORDER (uppercase letters, Latin letters,
    punctuation, ...) all get rank 99, so they sort after every folded
    lowercase letter.
    """
    return [ORDER.get(c, 99) for c in w]


def main():
    """Read a word list, fold Ё/ё, drop duplicates and blanks, print it sorted.

    The input path is taken from ``sys.argv[1]``; without an argument the
    words are read from ``/dev/stdin``. Each line is stripped of surrounding
    whitespace before folding. The result is written to standard output, one
    word per line, with a trailing newline.
    """
    src = sys.argv[1] if len(sys.argv) > 1 else "/dev/stdin"
    # "utf-8-sig" decodes plain UTF-8 as before but also drops a leading BOM,
    # which str.strip() would otherwise leave glued to the first word.
    with open(src, encoding="utf-8-sig") as lines:
        words = {line.strip().translate(_FOLD) for line in lines}
    words.discard("")
    # Words containing characters outside ORDER can share the same rank list
    # (every such character is 99). Tie-break on the word itself so the output
    # does not depend on set iteration order, which changes between runs.
    ordered = sorted(words, key=lambda w: (key(w), w))
    sys.stdout.write("\n".join(ordered) + "\n")


if __name__ == "__main__":
    main()