▍ humdrum codex / glint v1.0.2
license AGPL-3.0
2.9 KB raw
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
// Package spell provides a lightweight, pure-Go spellchecker for glint: an
// embedded common-English wordlist for membership tests, a BK-tree for
// edit-distance suggestions, and a hand-editable personal dictionary. It carries
// no cgo dependency so `go build .` and the Homebrew formula stay clean.
package spell

import (
	"bufio"
	"bytes"
	"compress/gzip"
	_ "embed"
	"strings"
)

// wordsGz is the gzip-compressed wordlist, embedded at build time from
// words.txt.gz. Load decompresses it and reads it line by line, one word per
// line.
//
//go:embed words.txt.gz
var wordsGz []byte

// Dict is a loaded spellchecker: an embedded common-word set plus a personal
// dictionary, with a BK-tree over the embedded words for suggestions.
//
// Load populates words, ranked, rank and bk from the embedded wordlist and
// leaves personal empty and personalPath unset. Known lowercases its argument
// before consulting words and personal, so entries in either set are only
// reachable when they are stored in lowercase.
type Dict struct {
	words    map[string]struct{} // embedded words; must be lowercase to be matched by Known
	ranked   []string            // embedded words in wordlist order, de-duplicated (presumably descending frequency — confirm against words.txt.gz)
	rank     map[string]int      // word -> index in ranked (0 = first wordlist entry, i.e. most common if the list is frequency-sorted)
	personal map[string]struct{} // personal-dictionary words; filled outside Load (presumably lowercased by that loader — confirm)
	bk       *bkTree             // suggestion index built by buildBKTree over ranked

	// NOTE(review): Load never sets this field; the setter named below is not
	// visible in this part of the file.
	personalPath string // ~/.config/glint/dict.txt; "" until SetPersonalPath
}

// Load decompresses the embedded wordlist and builds a Dict from it.
//
// Each line is trimmed and lowercased before it is stored, because Known
// lowercases every query: an entry kept with an uppercase letter could never
// match. Blank lines are skipped, and when a word occurs more than once
// (including case variants that collapse to the same lowercase form) only the
// first occurrence is kept, so it retains the better rank. Words are ranked in
// the order they appear, starting at 0, and the BK-tree is built over that
// ranked list.
//
// The personal dictionary is not loaded here; it starts out empty and is
// populated separately once the config path is known.
//
// Load returns an error if the embedded data is not a valid gzip stream or if
// reading the decompressed lines fails. The returned *Dict is nil on error.
func Load() (*Dict, error) {
	// Capacity hint for the maps only; the wordlist may hold any number of words.
	const sizeHint = 60000

	zr, err := gzip.NewReader(bytes.NewReader(wordsGz))
	if err != nil {
		return nil, err
	}
	// The Close error is deliberately dropped: read failures are reported
	// through sc.Err below.
	defer func() { _ = zr.Close() }()

	d := &Dict{
		words:    make(map[string]struct{}, sizeHint),
		rank:     make(map[string]int, sizeHint),
		personal: make(map[string]struct{}),
	}
	sc := bufio.NewScanner(zr)
	for sc.Scan() {
		// Normalise to the same form Known uses for lookups.
		w := strings.ToLower(strings.TrimSpace(sc.Text()))
		if w == "" {
			continue
		}
		if _, dup := d.words[w]; dup {
			continue
		}
		// The rank is the word's index in ranked, assigned before the append.
		d.rank[w] = len(d.ranked)
		d.ranked = append(d.ranked, w)
		d.words[w] = struct{}{}
	}
	if err := sc.Err(); err != nil {
		return nil, err
	}
	d.bk = buildBKTree(d.ranked)
	return d, nil
}

// Known reports whether word counts as correctly spelled. The word is trimmed
// of surrounding whitespace and lowercased first; if nothing is left it is
// treated as known. Otherwise it is known when it appears as-is in the embedded
// or personal dictionary, or when it ends in a possessive suffix and the
// remaining base appears there, so "editor's" rides on "editor".
//
// Plurals and other inflections are not stripped here: "editors" is known only
// if it is itself listed in one of the dictionaries.
func (d *Dict) Known(word string) bool {
	norm := strings.TrimSpace(word)
	norm = strings.ToLower(norm)

	// Nothing to check, or a direct hit in either dictionary.
	if norm == "" || d.has(norm) {
		return true
	}

	// Fall back to the base form with any possessive suffix removed.
	base, trimmed := trimPossessive(norm)
	return trimmed && d.has(base)
}

// has reports whether w is present, exactly as given, in the embedded word set
// or in the personal dictionary. It does no normalisation of its own.
func (d *Dict) has(w string) bool {
	_, embedded := d.words[w]
	if embedded {
		return true
	}
	_, mine := d.personal[w]
	return mine
}

// trimPossessive strips one trailing possessive marker from w: "'s" or a bare
// "'", written with either a straight (') or curly (’) apostrophe. It returns
// the remaining base and true when a marker was removed, or w unchanged and
// false otherwise. The base may be empty when w consists only of the marker.
func trimPossessive(w string) (string, bool) {
	switch {
	case strings.HasSuffix(w, "'s"):
		return w[:len(w)-len("'s")], true
	case strings.HasSuffix(w, "’s"):
		return w[:len(w)-len("’s")], true
	case strings.HasSuffix(w, "'"):
		return w[:len(w)-len("'")], true
	case strings.HasSuffix(w, "’"):
		return w[:len(w)-len("’")], true
	default:
		return w, false
	}
}