Files
scrabble-solver/wordlist/wordlist.go
T
Ilia Denisov 256999b42c Publish as versioned Gitea module; move dictionary pipeline out
- Rename module to gitea.iliadenisov.ru/developer/scrabble-solver so it can be
  consumed as a versioned dependency (no go.work replace / CI clone).
- De-internalize wordlist and dictdawg as public packages.
- Remove cmd/builddict, dictprep/, the dictionaries submodule and the dawg
  Makefile: the word-list parsing and DAWG build now live in the separate
  scrabble-dictionary repository, which publishes the DAWG set as a release artifact.
- internal/dict loads the committed dawg/en_sowpods.dawg fixture for cmd/stress.
- Update README/CLAUDE docs accordingly.
2026-06-04 19:11:46 +02:00

78 lines
2.0 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// Package wordlist reads dictionaries and encodes them into alphabet-index words,
// ready to add to a DAWG.
package wordlist
import (
"bufio"
"bytes"
"os"
"sort"
"strings"
"github.com/iliadenisov/alphabet"
)
// Encode converts words into alphabet-index words suitable for feeding a DAWG builder.
//
// Each word is whitespace-trimmed and lower-cased before being handed to idx.Encode.
// A word is dropped when it is empty after trimming, when idx.Encode reports an error
// for it, or when its encoded length falls outside [minLen, maxLen]. The surviving
// index words are returned in ascending byte order with duplicates removed.
func Encode(words []string, idx alphabet.Indexer, minLen, maxLen int) [][]byte {
	encoded := make([][]byte, 0, len(words))
	for _, raw := range words {
		word := strings.TrimSpace(raw)
		if word == "" {
			continue
		}
		code, err := idx.Encode(strings.ToLower(word))
		if err != nil {
			// The indexer rejected the word; skipping it is deliberate, not an error.
			continue
		}
		if n := len(code); n >= minLen && n <= maxLen {
			encoded = append(encoded, code)
		}
	}
	// Dedupe only collapses neighbouring duplicates, so the slice must be sorted first.
	sort.Slice(encoded, func(a, b int) bool {
		return bytes.Compare(encoded[a], encoded[b]) < 0
	})
	return Dedupe(encoded)
}
// Read loads the file at path, treating every line as one word, and returns the
// result of Encode over those lines with the given idx, minLen and maxLen.
//
// An error from opening the file or from scanning it is returned as-is together with
// a nil slice. The scanner is given a 1 MiB line buffer, so a line that does not fit
// makes the scan fail rather than being silently truncated.
func Read(path string, idx alphabet.Indexer, minLen, maxLen int) ([][]byte, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	// Raise the scanner's default token limit so unusually long lines are tolerated.
	const maxLine = 1 << 20
	scanner := bufio.NewScanner(file)
	scanner.Buffer(make([]byte, maxLine), maxLine)

	var lines []string
	for scanner.Scan() {
		lines = append(lines, scanner.Text())
	}
	if scanErr := scanner.Err(); scanErr != nil {
		return nil, scanErr
	}
	return Encode(lines, idx, minLen, maxLen), nil
}
// yoFolder rewrites Ё/ё to Е/е. It is built once instead of on every FoldYo call;
// a strings.Replacer is safe for concurrent use by multiple goroutines.
var yoFolder = strings.NewReplacer("ё", "е", "Ё", "Е")

// FoldYo replaces Ё/ё with Е/е and returns the resulting string; all other
// characters are left untouched. The Russian "Эрудит" variant has no Ё tile and treats
// Е and Ё as the same letter, so apply this when preparing an Эрудит dictionary (it is a
// dictionary-preparation step, not an engine behaviour).
func FoldYo(s string) string {
	return yoFolder.Replace(s)
}
// Dedupe collapses each run of equal neighbouring index words down to its first
// element. The compaction happens in place: the returned slice shares s's backing
// array, so s must not be relied on afterwards. Only adjacent duplicates are removed,
// which makes the result fully de-duplicated when s is sorted.
func Dedupe(s [][]byte) [][]byte {
	if len(s) < 2 {
		// Nothing to compare against; empty and single-element input is returned as-is.
		return s
	}
	kept := 1 // number of unique words written to the front of s so far
	for _, word := range s[1:] {
		if bytes.Equal(word, s[kept-1]) {
			continue
		}
		s[kept] = word
		kept++
	}
	return s[:kept]
}