dawg: build committed dictionary DAWGs (en SOWPODS, ru Scrabble, ru Эрудит)
- cmd/builddict: add -alphabet latin|russian (russian = alphabet.Embedded(LangRu)).
- dictprep/fold_yo.py: fold Ё→Е and de-duplicate; this is the prep step for the Эрудит dictionary.
- Makefile: `make dawg` rebuilds dawg/{en_sowpods,ru_scrabble,ru_erudit}.dawg.
- dawg/: committed DAWGs verified by enumeration — 267752 / 83385 / 83343 words (en_sowpods / ru_scrabble / ru_erudit, respectively).
This commit is contained in:
+12
-4
@@ -1,6 +1,5 @@
-// Command builddict converts a word list into a serialized DAWG, cached under testdata
-// for the tests and the benchmark. By default it reads the English SOWPODS list from
-// the dictionaries submodule.
+// Command builddict converts a word list into a serialized DAWG. By default it reads the
+// English SOWPODS list (Latin alphabet); pass -alphabet russian for the Cyrillic lists.
 package main
 
 import (
@@ -23,9 +22,18 @@ func main() {
 	name := flag.String("name", "sowpods", "base name for the output file")
 	minLen := flag.Int("min", 2, "minimum word length")
 	maxLen := flag.Int("max", 15, "maximum word length")
+	alpha := flag.String("alphabet", "latin", "alphabet: latin (English) or russian")
 	flag.Parse()
 
-	idx := alphabet.Latin()
+	var idx alphabet.Indexer
+	switch *alpha {
+	case "latin":
+		idx = alphabet.Latin()
+	case "russian":
+		idx = alphabet.Embedded(alphabet.Langs.LangRu)
+	default:
+		log.Fatalf("unknown -alphabet %q (want latin or russian)", *alpha)
+	}
 
 	t0 := time.Now()
 	words, err := wordlist.Read(*dict, idx, *minLen, *maxLen)
Reference in New Issue
Block a user