0d9b998db3
- cmd/builddict: add -alphabet latin|russian (russian = alphabet.Embedded(LangRu)).
- dictprep/fold_yo.py: fold Ё→Е and de-dup, the Эрудит dictionary prep.
- Makefile: `make dawg` rebuilds dawg/{en_sowpods,ru_scrabble,ru_erudit}.dawg.
- dawg/: committed DAWGs verified by enumeration — 267752 / 83385 / 83343 words.
76 lines
2.0 KiB
Go
76 lines
2.0 KiB
Go
// Command builddict converts a word list into a serialized DAWG. By default it reads the
// English SOWPODS list (Latin alphabet); pass -alphabet russian for the Cyrillic lists.
|
|
package main
|
|
|
|
import (
|
|
"flag"
|
|
"fmt"
|
|
"log"
|
|
"os"
|
|
"path/filepath"
|
|
"time"
|
|
|
|
"github.com/iliadenisov/alphabet"
|
|
|
|
"scrabble-solver/internal/dictdawg"
|
|
"scrabble-solver/internal/wordlist"
|
|
)
|
|
|
|
func main() {
|
|
dict := flag.String("dict", "dictionaries/english/sowpods.txt", "word list file (one word per line)")
|
|
out := flag.String("out", "testdata", "output directory")
|
|
name := flag.String("name", "sowpods", "base name for the output file")
|
|
minLen := flag.Int("min", 2, "minimum word length")
|
|
maxLen := flag.Int("max", 15, "maximum word length")
|
|
alpha := flag.String("alphabet", "latin", "alphabet: latin (English) or russian")
|
|
flag.Parse()
|
|
|
|
var idx alphabet.Indexer
|
|
switch *alpha {
|
|
case "latin":
|
|
idx = alphabet.Latin()
|
|
case "russian":
|
|
idx = alphabet.Embedded(alphabet.Langs.LangRu)
|
|
default:
|
|
log.Fatalf("unknown -alphabet %q (want latin or russian)", *alpha)
|
|
}
|
|
|
|
t0 := time.Now()
|
|
words, err := wordlist.Read(*dict, idx, *minLen, *maxLen)
|
|
if err != nil {
|
|
log.Fatalf("read %s: %v", *dict, err)
|
|
}
|
|
fmt.Printf("loaded %d words from %s in %s\n", len(words), *dict, time.Since(t0).Round(time.Millisecond))
|
|
|
|
if err := os.MkdirAll(*out, 0o755); err != nil {
|
|
log.Fatal(err)
|
|
}
|
|
|
|
t := time.Now()
|
|
f, err := dictdawg.Build(idx, words)
|
|
if err != nil {
|
|
log.Fatalf("build dawg: %v", err)
|
|
}
|
|
path := filepath.Join(*out, *name+".dawg")
|
|
if err := dictdawg.Save(f, path); err != nil {
|
|
log.Fatalf("save: %v", err)
|
|
}
|
|
size := int64(0)
|
|
if fi, err := os.Stat(path); err == nil {
|
|
size = fi.Size()
|
|
}
|
|
fmt.Printf("DAWG %d nodes, %s, built+saved in %s -> %s\n",
|
|
f.NumNodes(), humanBytes(size), time.Since(t).Round(time.Millisecond), path)
|
|
}
|
|
|
|
// humanBytes renders a byte count as a short human-readable string.
// Counts of at least 1<<20 are shown with two decimals and the suffix "MB",
// counts of at least 1<<10 with one decimal and the suffix "KB", and anything
// smaller (including negative values) as a plain integer followed by "B".
// Both units are binary multiples (1024-based).
func humanBytes(n int64) string {
	const (
		kilo = 1 << 10
		mega = 1 << 20
	)

	if n >= mega {
		return fmt.Sprintf("%.2f MB", float64(n)/mega)
	}
	if n >= kilo {
		return fmt.Sprintf("%.1f KB", float64(n)/kilo)
	}
	return fmt.Sprintf("%d B", n)
}
|