package rubex /* #cgo CFLAGS: -I/usr/local/include #cgo LDFLAGS: -L/usr/local/lib -lonig #include #include #include "chelper.h" */ import "C" import ( "bytes" "errors" "fmt" "io" "log" "runtime" "strconv" "sync" "unicode/utf8" "unsafe" ) type strRange []int const numMatchStartSize = 4 const numReadBufferStartSize = 256 var mutex sync.Mutex type MatchData struct { count int indexes [][]int32 } type NamedGroupInfo map[string]int type Regexp struct { pattern string regex C.OnigRegex region *C.OnigRegion encoding C.OnigEncoding errorInfo *C.OnigErrorInfo errorBuf *C.char matchData *MatchData namedGroupInfo NamedGroupInfo } // NewRegexp creates and initializes a new Regexp with the given pattern and option. func NewRegexp(pattern string, option int) (re *Regexp, err error) { return initRegexp(&Regexp{pattern: pattern, encoding: C.ONIG_ENCODING_UTF8}, option) } // NewRegexpASCII is equivalent to NewRegexp, but with the encoding restricted to ASCII. func NewRegexpASCII(pattern string, option int) (re *Regexp, err error) { return initRegexp(&Regexp{pattern: pattern, encoding: C.ONIG_ENCODING_ASCII}, option) } func initRegexp(re *Regexp, option int) (*Regexp, error) { var err error patternCharPtr := C.CString(re.pattern) defer C.free(unsafe.Pointer(patternCharPtr)) mutex.Lock() defer mutex.Unlock() errorCode := C.NewOnigRegex(patternCharPtr, C.int(len(re.pattern)), C.int(option), &re.regex, &re.region, &re.encoding, &re.errorInfo, &re.errorBuf) if errorCode != C.ONIG_NORMAL { err = errors.New(C.GoString(re.errorBuf)) } else { err = nil numCapturesInPattern := int(C.onig_number_of_captures(re.regex)) + 1 re.matchData = &MatchData{} re.matchData.indexes = make([][]int32, numMatchStartSize) for i := 0; i < numMatchStartSize; i++ { re.matchData.indexes[i] = make([]int32, numCapturesInPattern*2) } re.namedGroupInfo = re.getNamedGroupInfo() runtime.SetFinalizer(re, (*Regexp).Free) } return re, err } func Compile(str string) (*Regexp, error) { return NewRegexp(str, ONIG_OPTION_DEFAULT) } func MustCompile(str string) *Regexp { regexp, error := NewRegexp(str, ONIG_OPTION_DEFAULT) if error != nil { panic("regexp: compiling " + str + ": " + error.Error()) } return regexp } func CompileWithOption(str string, option int) (*Regexp, error) { return NewRegexp(str, option) } func MustCompileWithOption(str string, option int) *Regexp { regexp, error := NewRegexp(str, option) if error != nil { panic("regexp: compiling " + str + ": " + error.Error()) } return regexp } // MustCompileASCII is equivalent to MustCompile, but with the encoding restricted to ASCII. func MustCompileASCII(str string) *Regexp { regexp, error := NewRegexpASCII(str, ONIG_OPTION_DEFAULT) if error != nil { panic("regexp: compiling " + str + ": " + error.Error()) } return regexp } func (re *Regexp) Free() { mutex.Lock() if re.regex != nil { C.onig_free(re.regex) re.regex = nil } if re.region != nil { C.onig_region_free(re.region, 1) re.region = nil } mutex.Unlock() if re.errorInfo != nil { C.free(unsafe.Pointer(re.errorInfo)) re.errorInfo = nil } if re.errorBuf != nil { C.free(unsafe.Pointer(re.errorBuf)) re.errorBuf = nil } } func (re *Regexp) getNamedGroupInfo() (namedGroupInfo NamedGroupInfo) { numNamedGroups := int(C.onig_number_of_names(re.regex)) //when any named capture exisits, there is no numbered capture even if there are unnamed captures if numNamedGroups > 0 { namedGroupInfo = make(map[string]int) //try to get the names bufferSize := len(re.pattern) * 2 nameBuffer := make([]byte, bufferSize) groupNumbers := make([]int32, numNamedGroups) bufferPtr := unsafe.Pointer(&nameBuffer[0]) numbersPtr := unsafe.Pointer(&groupNumbers[0]) length := int(C.GetCaptureNames(re.regex, bufferPtr, (C.int)(bufferSize), (*C.int)(numbersPtr))) if length > 0 { namesAsBytes := bytes.Split(nameBuffer[:length], ([]byte)(";")) if len(namesAsBytes) != numNamedGroups { log.Fatalf("the number of named groups (%d) does not match the number names found (%d)\n", numNamedGroups, len(namesAsBytes)) } for i, nameAsBytes := range namesAsBytes { name := string(nameAsBytes) namedGroupInfo[name] = int(groupNumbers[i]) } } else { log.Fatalf("could not get the capture group names from %q", re.String()) } } return } func (re *Regexp) groupNameToId(name string) (id int) { if re.namedGroupInfo == nil { id = ONIGERR_UNDEFINED_NAME_REFERENCE } else { id = re.namedGroupInfo[name] } return } func (re *Regexp) processMatch(numCaptures int) (match []int32) { if numCaptures <= 0 { panic("cannot have 0 captures when processing a match") } matchData := re.matchData return matchData.indexes[matchData.count][:numCaptures*2] } func (re *Regexp) ClearMatchData() { matchData := re.matchData matchData.count = 0 } func (re *Regexp) find(b []byte, n int, offset int) (match []int) { if n == 0 { b = []byte{0} } ptr := unsafe.Pointer(&b[0]) matchData := re.matchData capturesPtr := unsafe.Pointer(&(matchData.indexes[matchData.count][0])) numCaptures := int32(0) numCapturesPtr := unsafe.Pointer(&numCaptures) pos := int(C.SearchOnigRegex((ptr), C.int(n), C.int(offset), C.int(ONIG_OPTION_DEFAULT), re.regex, re.region, re.errorInfo, (*C.char)(nil), (*C.int)(capturesPtr), (*C.int)(numCapturesPtr))) if pos >= 0 { if numCaptures <= 0 { panic("cannot have 0 captures when processing a match") } match2 := matchData.indexes[matchData.count][:numCaptures*2] match = make([]int, len(match2)) for i := range match2 { match[i] = int(match2[i]) } numCapturesInPattern := int32(C.onig_number_of_captures(re.regex)) + 1 if numCapturesInPattern != numCaptures { log.Fatalf("expected %d captures but got %d\n", numCapturesInPattern, numCaptures) } } return } func getCapture(b []byte, beg int, end int) []byte { if beg < 0 || end < 0 { return nil } return b[beg:end] } func (re *Regexp) match(b []byte, n int, offset int) bool { re.ClearMatchData() if n == 0 { b = []byte{0} } ptr := unsafe.Pointer(&b[0]) pos := int(C.SearchOnigRegex((ptr), C.int(n), C.int(offset), C.int(ONIG_OPTION_DEFAULT), re.regex, re.region, re.errorInfo, (*C.char)(nil), (*C.int)(nil), (*C.int)(nil))) return pos >= 0 } func (re *Regexp) findAll(b []byte, n int) (matches [][]int) { re.ClearMatchData() if n < 0 { n = len(b) } matchData := re.matchData offset := 0 for offset <= n { if matchData.count >= len(matchData.indexes) { length := len(matchData.indexes[0]) matchData.indexes = append(matchData.indexes, make([]int32, length)) } if match := re.find(b, n, offset); len(match) > 0 { matchData.count += 1 //move offset to the ending index of the current match and prepare to find the next non-overlapping match offset = match[1] //if match[0] == match[1], it means the current match does not advance the search. we need to exit the loop to avoid getting stuck here. if match[0] == match[1] { if offset < n && offset >= 0 { //there are more bytes, so move offset by a word _, width := utf8.DecodeRune(b[offset:]) offset += width } else { //search is over, exit loop break } } } else { break } } matches2 := matchData.indexes[:matchData.count] matches = make([][]int, len(matches2)) for i, v := range matches2 { matches[i] = make([]int, len(v)) for j, v2 := range v { matches[i][j] = int(v2) } } return } func (re *Regexp) FindIndex(b []byte) []int { re.ClearMatchData() match := re.find(b, len(b), 0) if len(match) == 0 { return nil } return match[:2] } func (re *Regexp) Find(b []byte) []byte { loc := re.FindIndex(b) if loc == nil { return nil } return getCapture(b, loc[0], loc[1]) } func (re *Regexp) FindString(s string) string { b := []byte(s) mb := re.Find(b) if mb == nil { return "" } return string(mb) } func (re *Regexp) FindStringIndex(s string) []int { b := []byte(s) return re.FindIndex(b) } func (re *Regexp) FindAllIndex(b []byte, n int) [][]int { matches := re.findAll(b, n) if len(matches) == 0 { return nil } return matches } func (re *Regexp) FindAll(b []byte, n int) [][]byte { matches := re.FindAllIndex(b, n) if matches == nil { return nil } matchBytes := make([][]byte, 0, len(matches)) for _, match := range matches { matchBytes = append(matchBytes, getCapture(b, match[0], match[1])) } return matchBytes } func (re *Regexp) FindAllString(s string, n int) []string { b := []byte(s) matches := re.FindAllIndex(b, n) if matches == nil { return nil } matchStrings := make([]string, 0, len(matches)) for _, match := range matches { m := getCapture(b, match[0], match[1]) if m == nil { matchStrings = append(matchStrings, "") } else { matchStrings = append(matchStrings, string(m)) } } return matchStrings } func (re *Regexp) FindAllStringIndex(s string, n int) [][]int { b := []byte(s) return re.FindAllIndex(b, n) } func (re *Regexp) findSubmatchIndex(b []byte) (match []int) { re.ClearMatchData() match = re.find(b, len(b), 0) return } func (re *Regexp) FindSubmatchIndex(b []byte) []int { match := re.findSubmatchIndex(b) if len(match) == 0 { return nil } return match } func (re *Regexp) FindSubmatch(b []byte) [][]byte { match := re.findSubmatchIndex(b) if match == nil { return nil } length := len(match) / 2 if length == 0 { return nil } results := make([][]byte, 0, length) for i := 0; i < length; i++ { results = append(results, getCapture(b, match[2*i], match[2*i+1])) } return results } func (re *Regexp) FindStringSubmatch(s string) []string { b := []byte(s) match := re.findSubmatchIndex(b) if match == nil { return nil } length := len(match) / 2 if length == 0 { return nil } results := make([]string, 0, length) for i := 0; i < length; i++ { cap := getCapture(b, match[2*i], match[2*i+1]) if cap == nil { results = append(results, "") } else { results = append(results, string(cap)) } } return results } func (re *Regexp) FindStringSubmatchIndex(s string) []int { b := []byte(s) return re.FindSubmatchIndex(b) } func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int { matches := re.findAll(b, n) if len(matches) == 0 { return nil } return matches } func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte { matches := re.findAll(b, n) if len(matches) == 0 { return nil } allCapturedBytes := make([][][]byte, 0, len(matches)) for _, match := range matches { length := len(match) / 2 capturedBytes := make([][]byte, 0, length) for i := 0; i < length; i++ { capturedBytes = append(capturedBytes, getCapture(b, match[2*i], match[2*i+1])) } allCapturedBytes = append(allCapturedBytes, capturedBytes) } return allCapturedBytes } func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string { b := []byte(s) matches := re.findAll(b, n) if len(matches) == 0 { return nil } allCapturedStrings := make([][]string, 0, len(matches)) for _, match := range matches { length := len(match) / 2 capturedStrings := make([]string, 0, length) for i := 0; i < length; i++ { cap := getCapture(b, match[2*i], match[2*i+1]) if cap == nil { capturedStrings = append(capturedStrings, "") } else { capturedStrings = append(capturedStrings, string(cap)) } } allCapturedStrings = append(allCapturedStrings, capturedStrings) } return allCapturedStrings } func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int { b := []byte(s) return re.FindAllSubmatchIndex(b, n) } func (re *Regexp) Match(b []byte) bool { return re.match(b, len(b), 0) } func (re *Regexp) MatchString(s string) bool { b := []byte(s) return re.Match(b) } func (re *Regexp) NumSubexp() int { return (int)(C.onig_number_of_captures(re.regex)) } func (re *Regexp) getNamedCapture(name []byte, capturedBytes [][]byte) []byte { nameStr := string(name) capNum := re.groupNameToId(nameStr) if capNum < 0 || capNum >= len(capturedBytes) { panic(fmt.Sprintf("capture group name (%q) has error\n", nameStr)) } return capturedBytes[capNum] } func (re *Regexp) getNumberedCapture(num int, capturedBytes [][]byte) []byte { //when named capture groups exist, numbered capture groups returns "" if re.namedGroupInfo == nil && num <= (len(capturedBytes)-1) && num >= 0 { return capturedBytes[num] } return ([]byte)("") } func fillCapturedValues(repl []byte, _ []byte, capturedBytes map[string][]byte) []byte { replLen := len(repl) newRepl := make([]byte, 0, replLen*3) inEscapeMode := false inGroupNameMode := false groupName := make([]byte, 0, replLen) for index := 0; index < replLen; index += 1 { ch := repl[index] if inGroupNameMode && ch == byte('<') { } else if inGroupNameMode && ch == byte('>') { inGroupNameMode = false groupNameStr := string(groupName) capBytes := capturedBytes[groupNameStr] newRepl = append(newRepl, capBytes...) groupName = groupName[:0] //reset the name } else if inGroupNameMode { groupName = append(groupName, ch) } else if inEscapeMode && ch <= byte('9') && byte('1') <= ch { capNumStr := string(ch) capBytes := capturedBytes[capNumStr] newRepl = append(newRepl, capBytes...) } else if inEscapeMode && ch == byte('k') && (index+1) < replLen && repl[index+1] == byte('<') { inGroupNameMode = true inEscapeMode = false index += 1 //bypass the next char '<' } else if inEscapeMode { newRepl = append(newRepl, '\\') newRepl = append(newRepl, ch) } else if ch != '\\' { newRepl = append(newRepl, ch) } if ch == byte('\\') || inEscapeMode { inEscapeMode = !inEscapeMode } } return newRepl } func (re *Regexp) replaceAll(src, repl []byte, replFunc func([]byte, []byte, map[string][]byte) []byte) []byte { srcLen := len(src) matches := re.findAll(src, srcLen) if len(matches) == 0 { return src } dest := make([]byte, 0, srcLen) for i, match := range matches { length := len(match) / 2 capturedBytes := make(map[string][]byte) if re.namedGroupInfo == nil { for j := 0; j < length; j++ { capturedBytes[strconv.Itoa(j)] = getCapture(src, match[2*j], match[2*j+1]) } } else { for name, j := range re.namedGroupInfo { capturedBytes[name] = getCapture(src, match[2*j], match[2*j+1]) } } matchBytes := getCapture(src, match[0], match[1]) newRepl := replFunc(repl, matchBytes, capturedBytes) prevEnd := 0 if i > 0 { prevMatch := matches[i-1][:2] prevEnd = prevMatch[1] } if match[0] > prevEnd && prevEnd >= 0 && match[0] <= srcLen { dest = append(dest, src[prevEnd:match[0]]...) } dest = append(dest, newRepl...) } lastEnd := matches[len(matches)-1][1] if lastEnd < srcLen && lastEnd >= 0 { dest = append(dest, src[lastEnd:]...) } return dest } func (re *Regexp) ReplaceAll(src, repl []byte) []byte { return re.replaceAll(src, repl, fillCapturedValues) } func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { return re.replaceAll(src, []byte(""), func(_ []byte, matchBytes []byte, _ map[string][]byte) []byte { return repl(matchBytes) }) } func (re *Regexp) ReplaceAllString(src, repl string) string { return string(re.ReplaceAll([]byte(src), []byte(repl))) } func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string { srcB := []byte(src) destB := re.replaceAll(srcB, []byte(""), func(_ []byte, matchBytes []byte, _ map[string][]byte) []byte { return []byte(repl(string(matchBytes))) }) return string(destB) } func (re *Regexp) String() string { return re.pattern } func grow_buffer(b []byte, offset int, n int) []byte { if offset+n > cap(b) { buf := make([]byte, 2*cap(b)+n) copy(buf, b[:offset]) return buf } return b } func fromReader(r io.RuneReader) []byte { b := make([]byte, numReadBufferStartSize) offset := 0 var err error = nil for err == nil { rune, runeWidth, err := r.ReadRune() if err == nil { b = grow_buffer(b, offset, runeWidth) writeWidth := utf8.EncodeRune(b[offset:], rune) if runeWidth != writeWidth { panic("reading rune width not equal to the written rune width") } offset += writeWidth } else { break } } return b[:offset] } func (re *Regexp) FindReaderIndex(r io.RuneReader) []int { b := fromReader(r) return re.FindIndex(b) } func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int { b := fromReader(r) return re.FindSubmatchIndex(b) } func (re *Regexp) MatchReader(r io.RuneReader) bool { b := fromReader(r) return re.Match(b) } func (re *Regexp) LiteralPrefix() (prefix string, complete bool) { //no easy way to implement this return "", false } func MatchString(pattern string, s string) (matched bool, error error) { re, err := Compile(pattern) if err != nil { return false, err } return re.MatchString(s), nil } func (re *Regexp) Gsub(src, repl string) string { srcBytes := ([]byte)(src) replBytes := ([]byte)(repl) replaced := re.replaceAll(srcBytes, replBytes, fillCapturedValues) return string(replaced) } func (re *Regexp) GsubFunc(src string, replFunc func(string, map[string]string) string) string { srcBytes := ([]byte)(src) replaced := re.replaceAll(srcBytes, nil, func(_ []byte, matchBytes []byte, capturedBytes map[string][]byte) []byte { capturedStrings := make(map[string]string) for name, capBytes := range capturedBytes { capturedStrings[name] = string(capBytes) } matchString := string(matchBytes) return ([]byte)(replFunc(matchString, capturedStrings)) }) return string(replaced) }