Publish as versioned Gitea module; move dictionary pipeline out

- Rename module to gitea.iliadenisov.ru/developer/scrabble-solver so it can be
  consumed as a versioned dependency, without a go.work replace or a CI-side clone.
- De-internalize wordlist and dictdawg as public packages.
- Remove cmd/builddict, dictprep/, the dictionaries submodule and the dawg
  Makefile: the word-list parsing and DAWG build now live in the separate
  scrabble-dictionary repository, which publishes the DAWG set as a release artifact.
- internal/dict loads the committed dawg/en_sowpods.dawg fixture for cmd/stress.
- Update README/CLAUDE docs accordingly.
This commit is contained in:
Ilia Denisov
2026-06-04 19:11:46 +02:00
parent 63a7c663bf
commit 256999b42c
41 changed files with 93 additions and 402477 deletions
+13 -43
View File
@@ -1,24 +1,18 @@
// Package dict loads the English test dictionary as a DAWG, preferring the serialized
// cache under testdata and falling back to building from the dictionaries submodule.
// Paths are resolved relative to the repository root so it works both from the repo root
// (commands) and from a package directory (tests).
// Package dict loads the English test dictionary as a DAWG from the committed
// dawg/en_sowpods.dawg fixture, for the cmd/stress benchmark. The dictionary build
// pipeline (word-list parsing and DAWG construction from sources) now lives in the
// separate scrabble-dictionary repository; this package only loads the committed
// artifact. Paths are resolved relative to the repository root so it works both from
// the repo root (commands) and from a package directory (tests).
package dict
import (
"os"
"path/filepath"
"github.com/iliadenisov/alphabet"
dawg "github.com/iliadenisov/dafsa"
"scrabble-solver/internal/dictdawg"
"scrabble-solver/internal/wordlist"
)
// MinLen and MaxLen bound playable word lengths (a 15x15 board holds at most 15).
const (
MinLen = 2
MaxLen = 15
"gitea.iliadenisov.ru/developer/scrabble-solver/dictdawg"
)
// exists reports whether os.Stat succeeds for p. Any Stat error, not only
// "not found", makes it return false.
func exists(p string) bool {
	if _, err := os.Stat(p); err != nil {
		return false
	}
	return true
}
@@ -42,35 +36,11 @@ func Root() string {
}
}
// DAWGCache returns the location of the English DAWG cache file,
// testdata/sowpods.dawg under the repository root.
func DAWGCache() string {
	root := Root()
	return filepath.Join(root, "testdata", "sowpods.dawg")
}

// WordlistPath returns the location of the English source word list,
// dictionaries/english/sowpods.txt under the repository root.
func WordlistPath() string {
	root := Root()
	return filepath.Join(root, "dictionaries", "english", "sowpods.txt")
}
// DAWGCache returns the path of the committed English DAWG,
// dawg/en_sowpods.dawg under the repository root.
func DAWGCache() string {
	root := Root()
	return filepath.Join(root, "dawg", "en_sowpods.dawg")
}
// EnglishAvailable reports whether the English dictionary can be loaded: either the
// DAWG cache file or the source word list exists. The word list is only checked when
// the cache is missing.
func EnglishAvailable() bool {
	if exists(DAWGCache()) {
		return true
	}
	return exists(WordlistPath())
}
// EnglishAvailable reports whether the committed English DAWG is present.
func EnglishAvailable() bool { return exists(DAWGCache()) }
// EnglishWords returns the encoded English word list (from the submodule source).
func EnglishWords() ([][]byte, error) {
return wordlist.Read(WordlistPath(), alphabet.Latin(), MinLen, MaxLen)
}
// EnglishDAWG returns the English DAWG. When the cache file exists it is loaded
// directly; otherwise the DAWG is built from the source word list and then written
// to the cache path.
func EnglishDAWG() (dawg.Finder, error) {
	if !exists(DAWGCache()) {
		encoded, err := EnglishWords()
		if err != nil {
			return nil, err
		}
		finder, err := dictdawg.Build(alphabet.Latin(), encoded)
		if err != nil {
			return nil, err
		}
		// Caching is best effort: a failed save must not fail the caller, which
		// already holds a usable in-memory DAWG.
		_ = dictdawg.Save(finder, DAWGCache())
		return finder, nil
	}
	return dictdawg.Load(DAWGCache())
}
// EnglishDAWG loads the committed English DAWG from the path returned by DAWGCache
// and returns whatever dictdawg.Load reports for it.
func EnglishDAWG() (dawg.Finder, error) {
	path := DAWGCache()
	return dictdawg.Load(path)
}
-30
View File
@@ -1,30 +0,0 @@
// Package dictdawg builds a plain left-to-right DAWG of a dictionary, as used by the
// Appel-Jacobson move generator.
package dictdawg
import (
"github.com/iliadenisov/alphabet"
dawg "github.com/iliadenisov/dafsa"
)
// Build adds every entry of words, in order, to a new DAWG over idx and returns the
// finished Finder. The words must be alphabet-index slices, sorted by index order and
// de-duplicated (see wordlist.Encode). The first error from adding a word aborts the
// build and is returned as is.
func Build(idx alphabet.Indexer, words [][]byte) (dawg.Finder, error) {
	builder := dawg.New(idx)
	for i := range words {
		if err := builder.AddB(words[i]); err != nil {
			return nil, err
		}
	}
	return builder.Finish(), nil
}
// Save serializes the DAWG f into the file named filename, discarding the first
// value f.Save returns and passing its error through. The DAWG needs an embedded
// alphabet (for example alphabet.Latin()) so that Load can reconstruct it.
func Save(f dawg.Finder, filename string) error {
	var saveErr error
	_, saveErr = f.Save(filename)
	return saveErr
}
// Load opens the DAWG stored in filename (as written by Save) and returns the
// result of dawg.Load unchanged.
func Load(filename string) (dawg.Finder, error) {
	finder, err := dawg.Load(filename)
	return finder, err
}
-44
View File
@@ -1,44 +0,0 @@
package dictdawg_test
import (
"path/filepath"
"testing"
"github.com/iliadenisov/alphabet"
"scrabble-solver/internal/dictdawg"
"scrabble-solver/internal/wordlist"
)
// TestBuildAndQuery builds a three-word DAWG, checks word indices on the in-memory
// result, then saves it to a temporary file and checks a lookup on the reloaded copy.
func TestBuildAndQuery(t *testing.T) {
	// Latin-alphabet index encodings of the probe words.
	var (
		care  = []byte{2, 0, 17, 4}
		cares = []byte{2, 0, 17, 4, 18}
		cat   = []byte{2, 0, 19}
		car   = []byte{2, 0, 17} // not in the dictionary
	)

	encoded := wordlist.Encode([]string{"care", "cares", "cat"}, alphabet.Latin(), 2, 15)
	built, err := dictdawg.Build(alphabet.Latin(), encoded)
	if err != nil {
		t.Fatal(err)
	}
	if built.NumAdded() != 3 {
		t.Fatalf("NumAdded = %d, want 3", built.NumAdded())
	}

	if got := built.IndexOfB(care); got != 0 {
		t.Errorf("IndexOf(care) = %d, want 0", got)
	}
	if got := built.IndexOfB(cat); got != 2 {
		t.Errorf("IndexOf(cat) = %d, want 2", got)
	}
	if got := built.IndexOfB(car); got != -1 {
		t.Errorf("IndexOf(car) = %d, want -1", got)
	}

	// Round-trip through a file and query the reloaded DAWG.
	file := filepath.Join(t.TempDir(), "d.dawg")
	if err := dictdawg.Save(built, file); err != nil {
		t.Fatal(err)
	}
	reloaded, err := dictdawg.Load(file)
	if err != nil {
		t.Fatal(err)
	}
	defer reloaded.Close()
	if got := reloaded.IndexOfB(cares); got != 1 {
		t.Errorf("loaded IndexOf(cares) = %d, want 1", got)
	}
}
+1 -1
View File
@@ -6,7 +6,7 @@ import (
"github.com/iliadenisov/alphabet"
dawg "github.com/iliadenisov/dafsa"
"scrabble-solver/internal/graph"
"gitea.iliadenisov.ru/developer/scrabble-solver/internal/graph"
)
// TestSpellSmoke also exercises the go.mod replace => ../dafsa wiring and the new
-77
View File
@@ -1,77 +0,0 @@
// Package wordlist reads dictionaries and encodes them into alphabet-index words,
// ready to add to a DAWG.
package wordlist
import (
"bufio"
"bytes"
"os"
"sort"
"strings"
"github.com/iliadenisov/alphabet"
)
// Encode converts words into alphabet-index slices using idx. Each word is trimmed
// of surrounding whitespace and lower-cased before encoding. A word is dropped when
// it is empty after trimming, when idx.Encode rejects it, or when its encoded length
// falls outside [minLen, maxLen]. The survivors are returned sorted bytewise and
// de-duplicated, as a DAWG builder requires.
func Encode(words []string, idx alphabet.Indexer, minLen, maxLen int) [][]byte {
	encoded := make([][]byte, 0, len(words))
	for _, raw := range words {
		word := strings.TrimSpace(raw)
		if word == "" {
			continue
		}
		enc, err := idx.Encode(strings.ToLower(word))
		if err != nil || len(enc) < minLen || len(enc) > maxLen {
			continue
		}
		encoded = append(encoded, enc)
	}
	sort.Slice(encoded, func(a, b int) bool {
		return bytes.Compare(encoded[a], encoded[b]) < 0
	})
	return Dedupe(encoded)
}
// Read loads the file at path, treating each line as one word, and returns the
// result of Encode over those lines. Errors from opening or scanning the file are
// returned as is.
func Read(path string, idx alphabet.Indexer, minLen, maxLen int) ([][]byte, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	// Scanner buffer size and maximum token size, both 1 MiB.
	const maxLine = 1 << 20
	scanner := bufio.NewScanner(file)
	scanner.Buffer(make([]byte, maxLine), maxLine)

	var lines []string
	for scanner.Scan() {
		lines = append(lines, scanner.Text())
	}
	if scanErr := scanner.Err(); scanErr != nil {
		return nil, scanErr
	}
	return Encode(lines, idx, minLen, maxLen), nil
}
// yoFolder rewrites Ё/ё to Е/е. It is built once at package scope instead of on
// every FoldYo call; a strings.Replacer is safe for concurrent use.
var yoFolder = strings.NewReplacer("ё", "е", "Ё", "Е")

// FoldYo replaces Ё/ё with Е/е and leaves every other character unchanged. The
// Russian "Эрудит" variant has no Ё tile and treats Е and Ё as the same letter, so
// apply this when preparing an Эрудит dictionary (it is a dictionary-preparation
// step, not an engine behaviour).
func FoldYo(s string) string {
	return yoFolder.Replace(s)
}
// Dedupe collapses runs of adjacent equal entries in s, keeping the first of each
// run. It compacts s in place and returns the shortened prefix, so the result shares
// s's backing array. Only adjacent duplicates are removed, so s should be sorted.
func Dedupe(s [][]byte) [][]byte {
	if len(s) == 0 {
		return s
	}
	kept := 1 // s[:kept] holds the entries retained so far
	for i := 1; i < len(s); i++ {
		if bytes.Equal(s[i], s[i-1]) {
			continue
		}
		s[kept] = s[i]
		kept++
	}
	return s[:kept]
}
-37
View File
@@ -1,37 +0,0 @@
package wordlist
import (
"testing"
"github.com/iliadenisov/alphabet"
)
// TestFoldYo checks that FoldYo folds both the lower-case and the upper-case Ё.
func TestFoldYo(t *testing.T) {
	lower := FoldYo("ёлка")
	if lower != "елка" {
		t.Errorf("FoldYo(ёлка) = %q, want елка", lower)
	}

	upper := FoldYo("Ёжик")
	if upper != "Ежик" {
		t.Errorf("FoldYo(Ёжик) = %q, want Ежик", upper)
	}
}
func TestEncodeFilterSortDedupe(t *testing.T) {
got := Encode([]string{
"cat", "CATS", "ab", "b", "abcdefghi", "cat", " do ", "qu1rk",
}, alphabet.Latin(), 2, 8)
want := [][]byte{
{0, 1}, // ab
{2, 0, 19}, // cat
{2, 0, 19, 18}, // cats (from CATS, case-folded)
{3, 14}, // do (trimmed)
}
if len(got) != len(want) {
t.Fatalf("got %d words %v, want %d", len(got), got, len(want))
}
for i := range want {
if string(got[i]) != string(want[i]) {
t.Errorf("word %d = %v, want %v", i, got[i], want[i])
}
}
}