2019-02-17 19:50:26 -05:00
|
|
|
// Copyright (c) 2018 Couchbase, Inc.
|
|
|
|
//
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
//
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
|
2019-11-27 04:23:33 -05:00
|
|
|
package levenshtein
|
2019-02-17 19:50:26 -05:00
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
|
|
|
"sort"
|
|
|
|
"unicode/utf8"
|
|
|
|
)
|
|
|
|
|
|
|
|
// FullCharacteristicVector is a bit vector packed into 32-bit words: bit i of
// the vector is stored in word i/32 at bit position i%32, least significant
// bit first.
type FullCharacteristicVector []uint32

// shiftAndMask returns the 32 bits of the vector that start at bit position
// offset, ANDed with mask.
//
// When offset is not a multiple of 32 the window straddles two words, so the
// word after the one containing offset must exist; otherwise the index
// expression panics. queryChars appends a zero word to every vector it
// builds, presumably to keep that read in range.
func (fcv FullCharacteristicVector) shiftAndMask(offset, mask uint32) uint32 {
	const wordBits = 32

	word := offset / wordBits
	shift := offset % wordBits
	window := fcv[word]
	if shift != 0 {
		// Low bits come from the current word, high bits from the next one.
		window = window>>shift | fcv[word+1]<<(wordBits-shift)
	}
	return window & mask
}
|
|
|
|
|
|
|
|
type tuple struct {
|
|
|
|
char rune
|
|
|
|
fcv FullCharacteristicVector
|
|
|
|
}
|
|
|
|
|
|
|
|
// sortRunes adapts a rune slice to sort.Interface, ordering the runes by
// ascending value.
type sortRunes []rune

// Len reports how many runes the slice holds.
func (rs sortRunes) Len() int { return len(rs) }

// Less reports whether the rune at index i is smaller than the rune at
// index j.
func (rs sortRunes) Less(i, j int) bool { return rs[i] < rs[j] }

// Swap exchanges the runes at indexes i and j.
func (rs sortRunes) Swap(i, j int) { rs[i], rs[j] = rs[j], rs[i] }

// sortRune sorts r in place in ascending rune order and returns the same
// slice, so the caller's slice is modified.
func sortRune(r []rune) []rune {
	ordered := sortRunes(r)
	sort.Sort(ordered)
	return r
}
|
|
|
|
|
|
|
|
type Alphabet struct {
|
|
|
|
charset []tuple
|
|
|
|
index uint32
|
|
|
|
}
|
|
|
|
|
|
|
|
func (a *Alphabet) resetNext() {
|
|
|
|
a.index = 0
|
|
|
|
}
|
|
|
|
|
|
|
|
func (a *Alphabet) next() (rune, FullCharacteristicVector, error) {
|
|
|
|
if int(a.index) >= len(a.charset) {
|
|
|
|
return 0, nil, fmt.Errorf("eof")
|
|
|
|
}
|
|
|
|
|
|
|
|
rv := a.charset[a.index]
|
|
|
|
a.index++
|
|
|
|
return rv.char, rv.fcv, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// dedupe returns in with every repeated rune removed, keeping only the first
// occurrence of each rune and preserving the order of those first
// occurrences.
//
// The input is decoded as UTF-8; each invalid byte decodes to
// utf8.RuneError, so invalid bytes collapse into a single U+FFFD in the
// result.
func dedupe(in string) string {
	seen := make(map[rune]struct{}, len(in))
	// Collect runes in a pre-sized buffer and convert once at the end;
	// appending to a string inside the loop would copy the partial result on
	// every new rune.
	out := make([]rune, 0, len(in))
	for len(in) > 0 {
		r, size := utf8.DecodeRuneInString(in)
		in = in[size:]
		if _, dup := seen[r]; dup {
			continue
		}
		seen[r] = struct{}{}
		out = append(out, r)
	}
	return string(out)
}
|
|
|
|
|
|
|
|
func queryChars(qChars string) Alphabet {
|
|
|
|
chars := dedupe(qChars)
|
|
|
|
inChars := sortRune([]rune(chars))
|
|
|
|
charsets := make([]tuple, 0, len(inChars))
|
|
|
|
|
|
|
|
for _, c := range inChars {
|
|
|
|
tempChars := qChars
|
|
|
|
var bits []uint32
|
|
|
|
for len(tempChars) > 0 {
|
|
|
|
var chunk string
|
|
|
|
if len(tempChars) > 32 {
|
|
|
|
chunk = tempChars[0:32]
|
|
|
|
tempChars = tempChars[32:]
|
|
|
|
} else {
|
|
|
|
chunk = tempChars
|
|
|
|
tempChars = tempChars[:0]
|
|
|
|
}
|
|
|
|
|
|
|
|
chunkBits := uint32(0)
|
|
|
|
bit := uint32(1)
|
|
|
|
for _, chr := range chunk {
|
|
|
|
if chr == c {
|
|
|
|
chunkBits |= bit
|
|
|
|
}
|
|
|
|
bit <<= 1
|
|
|
|
}
|
|
|
|
bits = append(bits, chunkBits)
|
|
|
|
}
|
|
|
|
bits = append(bits, 0)
|
|
|
|
charsets = append(charsets, tuple{char: c, fcv: FullCharacteristicVector(bits)})
|
|
|
|
}
|
|
|
|
return Alphabet{charset: charsets}
|
|
|
|
}
|