1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
|
package spell
import "sort"
// bkTree is a Burkhard-Keller tree over the embedded wordlist: a metric tree
// keyed by Levenshtein distance that answers "all words within edit distance k"
// without scanning the whole dictionary. It is queried only on user trigger
// (the suggestion popup), never per render.
type bkTree struct {
word string
children map[int]*bkTree
}
// buildBKTree inserts words in the given order. Insertion order is descending
// frequency, which keeps common words near the root and lets Suggest break
// distance ties toward more frequent words cheaply.
func buildBKTree(words []string) *bkTree {
if len(words) == 0 {
return nil
}
root := &bkTree{word: words[0], children: map[int]*bkTree{}}
for _, w := range words[1:] {
root.insert(w)
}
return root
}
func (t *bkTree) insert(w string) {
d := levenshtein(t.word, w)
if d == 0 {
return // duplicate
}
if child, ok := t.children[d]; ok {
child.insert(w)
} else {
t.children[d] = &bkTree{word: w, children: map[int]*bkTree{}}
}
}
// candidate is a matched word with its edit distance from the query.
type candidate struct {
word string
dist int
}
// within collects every word within maxDist of query into out.
func (t *bkTree) within(query string, maxDist int, out *[]candidate) {
if t == nil {
return
}
d := levenshtein(t.word, query)
if d <= maxDist {
*out = append(*out, candidate{t.word, d})
}
// Triangle inequality: only children whose edge distance falls in
// [d-maxDist, d+maxDist] can hold matches.
for edge, child := range t.children {
if edge >= d-maxDist && edge <= d+maxDist {
child.within(query, maxDist, out)
}
}
}
// Suggest returns up to max corrections for word within edit distance 2, ranked
// by ascending edit distance and, within a distance, by ascending frequency
// rank (more common first).
func (d *Dict) Suggest(word string, max int) []string {
if d.bk == nil || max <= 0 {
return nil
}
q := lowerASCII(word)
var cands []candidate
// The BK-tree prunes with plain Levenshtein (a true metric, so the triangle
// inequality holds); a radius of 2 also captures single transpositions,
// which Levenshtein scores as 2.
d.bk.within(q, 2, &cands)
// Re-score with the transposition-aware OSA distance so "teh"->"the" ranks as
// a distance-1 fix, then break ties toward more frequent words.
for i := range cands {
cands[i].dist = osaDistance(q, cands[i].word)
}
sort.SliceStable(cands, func(i, j int) bool {
if cands[i].dist != cands[j].dist {
return cands[i].dist < cands[j].dist
}
return d.rank[cands[i].word] < d.rank[cands[j].word]
})
out := make([]string, 0, max)
for _, c := range cands {
if c.dist == 0 {
continue // the query itself is spelled fine; nothing to suggest
}
out = append(out, c.word)
if len(out) >= max {
break
}
}
return out
}
func lowerASCII(s string) string {
b := []byte(s)
for i, c := range b {
if c >= 'A' && c <= 'Z' {
b[i] = c + 32
}
}
return string(b)
}
// levenshtein is the classic two-row edit distance over runes.
func levenshtein(a, b string) int {
ra, rb := []rune(a), []rune(b)
if len(ra) == 0 {
return len(rb)
}
if len(rb) == 0 {
return len(ra)
}
prev := make([]int, len(rb)+1)
curr := make([]int, len(rb)+1)
for j := range prev {
prev[j] = j
}
for i := 1; i <= len(ra); i++ {
curr[0] = i
for j := 1; j <= len(rb); j++ {
cost := 1
if ra[i-1] == rb[j-1] {
cost = 0
}
curr[j] = min3(prev[j]+1, curr[j-1]+1, prev[j-1]+cost)
}
prev, curr = curr, prev
}
return prev[len(rb)]
}
// osaDistance is the optimal string alignment distance: Levenshtein plus
// adjacent transpositions at cost 1 (so "teh" is one edit from "the"). Used only
// to rank suggestion candidates, not for BK-tree pruning.
func osaDistance(a, b string) int {
ra, rb := []rune(a), []rune(b)
n, m := len(ra), len(rb)
if n == 0 {
return m
}
if m == 0 {
return n
}
d := make([][]int, n+1)
for i := range d {
d[i] = make([]int, m+1)
d[i][0] = i
}
for j := 0; j <= m; j++ {
d[0][j] = j
}
for i := 1; i <= n; i++ {
for j := 1; j <= m; j++ {
cost := 1
if ra[i-1] == rb[j-1] {
cost = 0
}
d[i][j] = min3(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]+cost)
if i > 1 && j > 1 && ra[i-1] == rb[j-2] && ra[i-2] == rb[j-1] {
if t := d[i-2][j-2] + 1; t < d[i][j] {
d[i][j] = t
}
}
}
}
return d[n][m]
}
func min3(a, b, c int) int {
if b < a {
a = b
}
if c < a {
a = c
}
return a
}
|