2018-06-26 15:57:41 -04:00
|
|
|
package strmatcher
|
|
|
|
|
2018-08-19 15:04:15 -04:00
|
|
|
import (
|
|
|
|
"regexp"
|
|
|
|
)
|
2018-06-26 15:57:41 -04:00
|
|
|
|
2021-03-03 16:39:51 -05:00
|
|
|
// PrimeRK is the prime base used in the Rabin-Karp rolling hash. It is
// multiplied into uint32 accumulators, so products wrap modulo 2^32.
const PrimeRK = 16777619
|
|
|
|
|
2018-08-20 09:39:58 -04:00
|
|
|
// Matcher is the interface to determine whether a string matches a pattern.
type Matcher interface {
	// Match returns true if the given string matches a predefined pattern.
	Match(string) bool
	// String returns a string representation of the matcher.
	String() string
}
|
|
|
|
|
2018-08-20 09:39:58 -04:00
|
|
|
// Type is the type of the matcher.
type Type byte

const (
	// Full is the type of matcher where the input string must exactly equal the pattern.
	Full Type = iota
	// Substr is the type of matcher where the input string must contain the pattern as a sub-string.
	Substr
	// Domain is the type of matcher where the input string must be the pattern itself or a sub-domain of it.
	Domain
	// Regex is the type of matcher where the input string must match the regular-expression pattern.
	Regex
)
|
|
|
|
|
2018-08-20 09:39:58 -04:00
|
|
|
// New creates a new Matcher based on the given pattern.
|
2018-06-26 15:57:41 -04:00
|
|
|
func (t Type) New(pattern string) (Matcher, error) {
|
|
|
|
switch t {
|
|
|
|
case Full:
|
|
|
|
return fullMatcher(pattern), nil
|
|
|
|
case Substr:
|
|
|
|
return substrMatcher(pattern), nil
|
|
|
|
case Domain:
|
|
|
|
return domainMatcher(pattern), nil
|
|
|
|
case Regex:
|
|
|
|
r, err := regexp.Compile(pattern)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
return ®exMatcher{
|
|
|
|
pattern: r,
|
|
|
|
}, nil
|
|
|
|
default:
|
|
|
|
panic("Unknown type")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-08-20 09:39:58 -04:00
|
|
|
// IndexMatcher is the interface for matching with a group of matchers.
type IndexMatcher interface {
	// Match returns the indices of the matchers that match the input.
	// It returns an empty slice if no such matcher exists.
	Match(input string) []uint32
}
|
|
|
|
|
2018-06-26 15:57:41 -04:00
|
|
|
type matcherEntry struct {
|
|
|
|
m Matcher
|
|
|
|
id uint32
|
|
|
|
}
|
|
|
|
|
2021-01-20 02:53:07 -05:00
|
|
|
type ACAutomatonMatcherGroup struct {
|
|
|
|
count uint32
|
|
|
|
ac *ACAutomaton
|
2021-03-03 16:39:51 -05:00
|
|
|
nonSubstrMap map[uint32]string
|
2021-01-20 02:53:07 -05:00
|
|
|
otherMatchers []matcherEntry
|
|
|
|
}
|
|
|
|
|
|
|
|
func NewACAutomatonMatcherGroup() *ACAutomatonMatcherGroup {
|
|
|
|
var g = new(ACAutomatonMatcherGroup)
|
|
|
|
g.count = 1
|
2021-03-03 16:39:51 -05:00
|
|
|
g.nonSubstrMap = map[uint32]string{}
|
2021-01-20 02:53:07 -05:00
|
|
|
return g
|
|
|
|
}
|
|
|
|
|
2021-03-03 16:39:51 -05:00
|
|
|
// Add `full` or `domain` pattern to hashmap
|
|
|
|
func (g *ACAutomatonMatcherGroup) AddFullOrDomainPattern(pattern string, t Type) {
|
|
|
|
h := uint32(0)
|
|
|
|
for i := len(pattern) - 1; i >= 0; i-- {
|
|
|
|
h = h*PrimeRK + uint32(pattern[i])
|
|
|
|
}
|
|
|
|
switch t {
|
|
|
|
case Full:
|
|
|
|
g.nonSubstrMap[h] = pattern
|
|
|
|
case Domain:
|
|
|
|
g.nonSubstrMap[h] = pattern
|
|
|
|
g.nonSubstrMap[h*PrimeRK+uint32('.')] = "." + pattern
|
|
|
|
default:
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-01-20 02:53:07 -05:00
|
|
|
func (g *ACAutomatonMatcherGroup) AddPattern(pattern string, t Type) (uint32, error) {
|
|
|
|
switch t {
|
2021-03-03 16:39:51 -05:00
|
|
|
case Substr:
|
|
|
|
if g.ac == nil {
|
|
|
|
g.ac = NewACAutomaton()
|
|
|
|
}
|
2021-01-20 02:53:07 -05:00
|
|
|
g.ac.Add(pattern, t)
|
2021-03-03 16:39:51 -05:00
|
|
|
case Full, Domain:
|
|
|
|
g.AddFullOrDomainPattern(pattern, t)
|
2021-01-20 02:53:07 -05:00
|
|
|
case Regex:
|
|
|
|
g.count++
|
|
|
|
r, err := regexp.Compile(pattern)
|
|
|
|
if err != nil {
|
|
|
|
return 0, err
|
|
|
|
}
|
|
|
|
g.otherMatchers = append(g.otherMatchers, matcherEntry{
|
|
|
|
m: ®exMatcher{pattern: r},
|
|
|
|
id: g.count,
|
|
|
|
})
|
|
|
|
default:
|
|
|
|
panic("Unknown type")
|
|
|
|
}
|
|
|
|
return g.count, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (g *ACAutomatonMatcherGroup) Build() {
|
2021-03-03 16:39:51 -05:00
|
|
|
if g.ac != nil {
|
|
|
|
g.ac.Build()
|
|
|
|
}
|
2021-01-20 02:53:07 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
// Match implements IndexMatcher.Match.
|
|
|
|
func (g *ACAutomatonMatcherGroup) Match(pattern string) []uint32 {
|
|
|
|
result := []uint32{}
|
2021-03-03 16:39:51 -05:00
|
|
|
hash := uint32(0)
|
|
|
|
for i := len(pattern) - 1; i >= 0; i-- {
|
|
|
|
hash = hash*PrimeRK + uint32(pattern[i])
|
|
|
|
if pattern[i] == '.' {
|
|
|
|
if v, ok := g.nonSubstrMap[hash]; ok && v == pattern[i:] {
|
|
|
|
result = append(result, 1)
|
|
|
|
return result
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if v, ok := g.nonSubstrMap[hash]; ok && v == pattern {
|
|
|
|
result = append(result, 1)
|
|
|
|
return result
|
|
|
|
}
|
|
|
|
if g.ac != nil && g.ac.Match(pattern) {
|
2021-01-20 02:53:07 -05:00
|
|
|
result = append(result, 1)
|
2021-03-03 16:39:51 -05:00
|
|
|
return result
|
2021-01-20 02:53:07 -05:00
|
|
|
}
|
|
|
|
for _, e := range g.otherMatchers {
|
|
|
|
if e.m.Match(pattern) {
|
|
|
|
result = append(result, e.id)
|
2021-03-03 16:39:51 -05:00
|
|
|
return result
|
2021-01-20 02:53:07 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return result
|
|
|
|
}
|
|
|
|
|
2018-08-20 09:39:58 -04:00
|
|
|
// MatcherGroup is an implementation of IndexMatcher.
|
|
|
|
// Empty initialization works.
|
2018-06-26 15:57:41 -04:00
|
|
|
type MatcherGroup struct {
|
|
|
|
count uint32
|
2018-08-20 03:57:06 -04:00
|
|
|
fullMatcher FullMatcherGroup
|
2018-08-19 15:04:15 -04:00
|
|
|
domainMatcher DomainMatcherGroup
|
2018-06-26 15:57:41 -04:00
|
|
|
otherMatchers []matcherEntry
|
|
|
|
}
|
|
|
|
|
2018-08-20 09:39:58 -04:00
|
|
|
// Add adds a new Matcher into the MatcherGroup, and returns its index. The index will never be 0.
|
2018-06-26 15:57:41 -04:00
|
|
|
func (g *MatcherGroup) Add(m Matcher) uint32 {
|
|
|
|
g.count++
|
2018-08-20 03:57:06 -04:00
|
|
|
c := g.count
|
2018-06-26 15:57:41 -04:00
|
|
|
|
2018-08-19 15:04:15 -04:00
|
|
|
switch tm := m.(type) {
|
|
|
|
case fullMatcher:
|
2018-08-20 03:57:06 -04:00
|
|
|
g.fullMatcher.addMatcher(tm, c)
|
2018-08-19 15:04:15 -04:00
|
|
|
case domainMatcher:
|
2018-08-20 03:57:06 -04:00
|
|
|
g.domainMatcher.addMatcher(tm, c)
|
2018-08-19 15:04:15 -04:00
|
|
|
default:
|
2018-06-26 15:57:41 -04:00
|
|
|
g.otherMatchers = append(g.otherMatchers, matcherEntry{
|
|
|
|
m: m,
|
|
|
|
id: c,
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
return c
|
|
|
|
}
|
|
|
|
|
2018-08-20 09:39:58 -04:00
|
|
|
// Match implements IndexMatcher.Match.
|
2020-08-11 01:31:04 -04:00
|
|
|
func (g *MatcherGroup) Match(pattern string) []uint32 {
|
|
|
|
result := []uint32{}
|
|
|
|
result = append(result, g.fullMatcher.Match(pattern)...)
|
|
|
|
result = append(result, g.domainMatcher.Match(pattern)...)
|
2018-06-26 15:57:41 -04:00
|
|
|
for _, e := range g.otherMatchers {
|
|
|
|
if e.m.Match(pattern) {
|
2020-08-11 01:31:04 -04:00
|
|
|
result = append(result, e.id)
|
2018-06-26 15:57:41 -04:00
|
|
|
}
|
|
|
|
}
|
2020-08-11 01:31:04 -04:00
|
|
|
return result
|
2018-06-26 15:57:41 -04:00
|
|
|
}
|
|
|
|
|
2018-08-20 09:39:58 -04:00
|
|
|
// Size returns the number of matchers in the MatcherGroup.
|
2018-06-26 15:57:41 -04:00
|
|
|
func (g *MatcherGroup) Size() uint32 {
|
|
|
|
return g.count
|
|
|
|
}
|