// Package wordlist reads dictionaries and encodes them into alphabet-index words, // ready to add to a DAWG. package wordlist import ( "bufio" "bytes" "os" "sort" "strings" "github.com/iliadenisov/alphabet" ) // Encode turns words into alphabet-index slices, keeping only those whose length is in // [minLen, maxLen] and whose characters all belong to idx's alphabet (case-folded). // The result is sorted by index order and de-duplicated, as a DAWG builder requires. func Encode(words []string, idx alphabet.Indexer, minLen, maxLen int) [][]byte { out := make([][]byte, 0, len(words)) for _, w := range words { w = strings.TrimSpace(w) if w == "" { continue } b, err := idx.Encode(strings.ToLower(w)) if err != nil { continue } if len(b) < minLen || len(b) > maxLen { continue } out = append(out, b) } sort.Slice(out, func(i, j int) bool { return bytes.Compare(out[i], out[j]) < 0 }) return Dedupe(out) } // Read is Encode applied to the lines (one word per line) of the file at path. func Read(path string, idx alphabet.Indexer, minLen, maxLen int) ([][]byte, error) { f, err := os.Open(path) if err != nil { return nil, err } defer f.Close() var words []string sc := bufio.NewScanner(f) sc.Buffer(make([]byte, 1<<20), 1<<20) for sc.Scan() { words = append(words, sc.Text()) } if err := sc.Err(); err != nil { return nil, err } return Encode(words, idx, minLen, maxLen), nil } // FoldYo replaces Ё/ё with Е/е. The Russian "Эрудит" variant has no Ё tile and treats // Е and Ё as the same letter, so apply this when preparing an Эрудит dictionary (it is a // dictionary-preparation step, not an engine behaviour). func FoldYo(s string) string { return strings.NewReplacer("ё", "е", "Ё", "Е").Replace(s) } // Dedupe removes adjacent duplicates from a sorted slice of index words in place. func Dedupe(s [][]byte) [][]byte { if len(s) == 0 { return s } out := s[:1] for i := 1; i < len(s); i++ { if !bytes.Equal(s[i], s[i-1]) { out = append(out, s[i]) } } return out }