256999b42c
- Rename module to gitea.iliadenisov.ru/developer/scrabble-solver so it can be consumed as a versioned dependency (no go.work replace / CI clone). - De-internalize wordlist and dictdawg as public packages. - Remove cmd/builddict, dictprep/, the dictionaries submodule and the dawg Makefile: the word-list parsing and DAWG build now live in the separate scrabble-dictionary repository, which publishes the DAWG set as a release artifact. - internal/dict loads the committed dawg/en_sowpods.dawg fixture for cmd/stress. - Update README/CLAUDE docs accordingly.
78 lines
2.0 KiB
Go
78 lines
2.0 KiB
Go
// Package wordlist reads dictionaries and encodes them into alphabet-index words,
|
||
// ready to add to a DAWG.
|
||
package wordlist
|
||
|
||
import (
|
||
"bufio"
|
||
"bytes"
|
||
"os"
|
||
"sort"
|
||
"strings"
|
||
|
||
"github.com/iliadenisov/alphabet"
|
||
)
|
||
|
||
// Encode turns words into alphabet-index slices, keeping only those whose length is in
|
||
// [minLen, maxLen] and whose characters all belong to idx's alphabet (case-folded).
|
||
// The result is sorted by index order and de-duplicated, as a DAWG builder requires.
|
||
func Encode(words []string, idx alphabet.Indexer, minLen, maxLen int) [][]byte {
|
||
out := make([][]byte, 0, len(words))
|
||
for _, w := range words {
|
||
w = strings.TrimSpace(w)
|
||
if w == "" {
|
||
continue
|
||
}
|
||
b, err := idx.Encode(strings.ToLower(w))
|
||
if err != nil {
|
||
continue
|
||
}
|
||
if len(b) < minLen || len(b) > maxLen {
|
||
continue
|
||
}
|
||
out = append(out, b)
|
||
}
|
||
sort.Slice(out, func(i, j int) bool { return bytes.Compare(out[i], out[j]) < 0 })
|
||
return Dedupe(out)
|
||
}
|
||
|
||
// Read is Encode applied to the lines (one word per line) of the file at path.
|
||
func Read(path string, idx alphabet.Indexer, minLen, maxLen int) ([][]byte, error) {
|
||
f, err := os.Open(path)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
defer f.Close()
|
||
|
||
var words []string
|
||
sc := bufio.NewScanner(f)
|
||
sc.Buffer(make([]byte, 1<<20), 1<<20)
|
||
for sc.Scan() {
|
||
words = append(words, sc.Text())
|
||
}
|
||
if err := sc.Err(); err != nil {
|
||
return nil, err
|
||
}
|
||
return Encode(words, idx, minLen, maxLen), nil
|
||
}
|
||
|
||
// yoFolder rewrites Ё/ё to Е/е. It is built once at package initialisation
// instead of on every FoldYo call; a strings.Replacer is safe for concurrent
// use, so sharing one instance is fine.
var yoFolder = strings.NewReplacer("ё", "е", "Ё", "Е")

// FoldYo replaces Ё/ё with Е/е and returns the result; all other characters
// are left untouched. The Russian "Эрудит" variant has no Ё tile and treats
// Е and Ё as the same letter, so apply this when preparing an Эрудит
// dictionary (it is a dictionary-preparation step, not an engine behaviour).
func FoldYo(s string) string {
	return yoFolder.Replace(s)
}
|
||
|
||
// Dedupe collapses each run of adjacent, byte-equal entries of s down to its
// first entry. On a sorted slice this removes every duplicate.
//
// The work is done in place: kept entries are moved towards the front of s's
// backing array and the returned slice is a prefix of s, so the caller must use
// the return value and must not rely on the old contents of s afterwards.
func Dedupe(s [][]byte) [][]byte {
	if len(s) == 0 {
		return s
	}
	kept := 1 // s[:kept] holds the entries retained so far
	prev := s[0]
	for _, cur := range s[1:] {
		if !bytes.Equal(cur, prev) {
			s[kept] = cur
			kept++
		}
		prev = cur
	}
	return s[:kept]
}
|