// Copyright (c) 2014, David Kitchen <david@buro9.com> // // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright notice, this // list of conditions and the following disclaimer. // // * Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // // * Neither the name of the organisation (Microcosm) nor the names of its // contributors may be used to endorse or promote products derived from // this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. package bluemonday import ( "bytes" "io" "net/url" "regexp" "strconv" "strings" "golang.org/x/net/html" "github.com/aymerick/douceur/parser" ) var ( dataAttribute = regexp.MustCompile("^data-.+") dataAttributeXMLPrefix = regexp.MustCompile("^xml.+") dataAttributeInvalidChars = regexp.MustCompile("[A-Z;]+") cssUnicodeChar = regexp.MustCompile(`\\[0-9a-f]{1,6} ?`) ) // Sanitize takes a string that contains a HTML fragment or document and applies // the given policy whitelist. // // It returns a HTML string that has been sanitized by the policy or an empty // string if an error has occurred (most likely as a consequence of extremely // malformed input) func (p *Policy) Sanitize(s string) string { if strings.TrimSpace(s) == "" { return s } return p.sanitize(strings.NewReader(s)).String() } // SanitizeBytes takes a []byte that contains a HTML fragment or document and applies // the given policy whitelist. // // It returns a []byte containing the HTML that has been sanitized by the policy // or an empty []byte if an error has occurred (most likely as a consequence of // extremely malformed input) func (p *Policy) SanitizeBytes(b []byte) []byte { if len(bytes.TrimSpace(b)) == 0 { return b } return p.sanitize(bytes.NewReader(b)).Bytes() } // SanitizeReader takes an io.Reader that contains a HTML fragment or document // and applies the given policy whitelist. // // It returns a bytes.Buffer containing the HTML that has been sanitized by the // policy. Errors during sanitization will merely return an empty result. func (p *Policy) SanitizeReader(r io.Reader) *bytes.Buffer { return p.sanitize(r) } const escapedURLChars = "'<>\"\r" func escapeUrlComponent(val string) string { w := bytes.NewBufferString("") i := strings.IndexAny(val, escapedURLChars) for i != -1 { if _, err := w.WriteString(val[:i]); err != nil { return w.String() } var esc string switch val[i] { case '\'': // "'" is shorter than "'" and apos was not in HTML until HTML5. esc = "'" case '<': esc = "<" case '>': esc = ">" case '"': // """ is shorter than """. esc = """ case '\r': esc = " " default: panic("unrecognized escape character") } val = val[i+1:] if _, err := w.WriteString(esc); err != nil { return w.String() } i = strings.IndexAny(val, escapedURLChars) } w.WriteString(val) return w.String() } // Query represents a query type Query struct { Key string Value string HasValue bool } func parseQuery(query string) (values []Query, err error) { for query != "" { key := query if i := strings.IndexAny(key, "&;"); i >= 0 { key, query = key[:i], key[i+1:] } else { query = "" } if key == "" { continue } value := "" hasValue := false if i := strings.Index(key, "="); i >= 0 { key, value = key[:i], key[i+1:] hasValue = true } key, err1 := url.QueryUnescape(key) if err1 != nil { if err == nil { err = err1 } continue } value, err1 = url.QueryUnescape(value) if err1 != nil { if err == nil { err = err1 } continue } values = append(values, Query{ Key: key, Value: value, HasValue: hasValue, }) } return values, err } func encodeQueries(queries []Query) string { var buff bytes.Buffer for i, query := range queries { buff.WriteString(url.QueryEscape(query.Key)) if query.HasValue { buff.WriteString("=") buff.WriteString(url.QueryEscape(query.Value)) } if i < len(queries)-1 { buff.WriteString("&") } } return buff.String() } func sanitizedURL(val string) (string, error) { u, err := url.Parse(val) if err != nil { return "", err } // we use parseQuery but not u.Query to keep the order not change because // url.Values is a map which has a random order. queryValues, err := parseQuery(u.RawQuery) if err != nil { return "", err } // sanitize the url query params for i, query := range queryValues { queryValues[i].Key = html.EscapeString(query.Key) } u.RawQuery = encodeQueries(queryValues) // u.String() will also sanitize host/scheme/user/pass return u.String(), nil } func (p *Policy) writeLinkableBuf(buff *bytes.Buffer, token *html.Token) { // do not escape multiple query parameters tokenBuff := bytes.NewBufferString("") tokenBuff.WriteString("<") tokenBuff.WriteString(token.Data) for _, attr := range token.Attr { tokenBuff.WriteByte(' ') tokenBuff.WriteString(attr.Key) tokenBuff.WriteString(`="`) switch attr.Key { case "href", "src": u, ok := p.validURL(attr.Val) if !ok { tokenBuff.WriteString(html.EscapeString(attr.Val)) continue } u, err := sanitizedURL(u) if err == nil { tokenBuff.WriteString(u) } else { // fallthrough tokenBuff.WriteString(html.EscapeString(attr.Val)) } default: // re-apply tokenBuff.WriteString(html.EscapeString(attr.Val)) } tokenBuff.WriteByte('"') } if token.Type == html.SelfClosingTagToken { tokenBuff.WriteString("/") } tokenBuff.WriteString(">") buff.WriteString(tokenBuff.String()) } // Performs the actual sanitization process. func (p *Policy) sanitize(r io.Reader) *bytes.Buffer { // It is possible that the developer has created the policy via: // p := bluemonday.Policy{} // rather than: // p := bluemonday.NewPolicy() // If this is the case, and if they haven't yet triggered an action that // would initiliaze the maps, then we need to do that. p.init() var ( buff bytes.Buffer skipElementContent bool skippingElementsCount int64 skipClosingTag bool closingTagToSkipStack []string mostRecentlyStartedToken string ) tokenizer := html.NewTokenizer(r) for { if tokenizer.Next() == html.ErrorToken { err := tokenizer.Err() if err == io.EOF { // End of input means end of processing return &buff } // Raw tokenizer error return &bytes.Buffer{} } token := tokenizer.Token() switch token.Type { case html.DoctypeToken: // DocType is not handled as there is no safe parsing mechanism // provided by golang.org/x/net/html for the content, and this can // be misused to insert HTML tags that are not then sanitized // // One might wish to recursively sanitize here using the same policy // but I will need to do some further testing before considering // this. case html.CommentToken: // Comments are ignored by default case html.StartTagToken: mostRecentlyStartedToken = normaliseElementName(token.Data) aps, ok := p.elsAndAttrs[token.Data] if !ok { aa, matched := p.matchRegex(token.Data) if !matched { if _, ok := p.setOfElementsToSkipContent[token.Data]; ok { skipElementContent = true skippingElementsCount++ } if p.addSpaces { buff.WriteString(" ") } break } aps = aa } if len(token.Attr) != 0 { token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps) } if len(token.Attr) == 0 { if !p.allowNoAttrs(token.Data) { skipClosingTag = true closingTagToSkipStack = append(closingTagToSkipStack, token.Data) if p.addSpaces { buff.WriteString(" ") } break } } if !skipElementContent { // do not escape multiple query parameters if linkable(token.Data) { p.writeLinkableBuf(&buff, &token) } else { buff.WriteString(token.String()) } } case html.EndTagToken: if mostRecentlyStartedToken == normaliseElementName(token.Data) { mostRecentlyStartedToken = "" } if skipClosingTag && closingTagToSkipStack[len(closingTagToSkipStack)-1] == token.Data { closingTagToSkipStack = closingTagToSkipStack[:len(closingTagToSkipStack)-1] if len(closingTagToSkipStack) == 0 { skipClosingTag = false } if p.addSpaces { buff.WriteString(" ") } break } if _, ok := p.elsAndAttrs[token.Data]; !ok { match := false for regex := range p.elsMatchingAndAttrs { if regex.MatchString(token.Data) { skipElementContent = false match = true break } } if _, ok := p.setOfElementsToSkipContent[token.Data]; ok && !match { skippingElementsCount-- if skippingElementsCount == 0 { skipElementContent = false } } if !match { if p.addSpaces { buff.WriteString(" ") } break } } if !skipElementContent { buff.WriteString(token.String()) } case html.SelfClosingTagToken: aps, ok := p.elsAndAttrs[token.Data] if !ok { aa, matched := p.matchRegex(token.Data) if !matched { if p.addSpaces && !matched { buff.WriteString(" ") } break } aps = aa } if len(token.Attr) != 0 { token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps) } if len(token.Attr) == 0 && !p.allowNoAttrs(token.Data) { if p.addSpaces { buff.WriteString(" ") break } } if !skipElementContent { // do not escape multiple query parameters if linkable(token.Data) { p.writeLinkableBuf(&buff, &token) } else { buff.WriteString(token.String()) } } case html.TextToken: if !skipElementContent { switch mostRecentlyStartedToken { case `script`: // not encouraged, but if a policy allows JavaScript we // should not HTML escape it as that would break the output buff.WriteString(token.Data) case `style`: // not encouraged, but if a policy allows CSS styles we // should not HTML escape it as that would break the output buff.WriteString(token.Data) default: // HTML escape the text buff.WriteString(token.String()) } } default: // A token that didn't exist in the html package when we wrote this return &bytes.Buffer{} } } } // sanitizeAttrs takes a set of element attribute policies and the global // attribute policies and applies them to the []html.Attribute returning a set // of html.Attributes that match the policies func (p *Policy) sanitizeAttrs( elementName string, attrs []html.Attribute, aps map[string]attrPolicy, ) []html.Attribute { if len(attrs) == 0 { return attrs } hasStylePolicies := false sps, elementHasStylePolicies := p.elsAndStyles[elementName] if len(p.globalStyles) > 0 || (elementHasStylePolicies && len(sps) > 0) { hasStylePolicies = true } // no specific element policy found, look for a pattern match if !hasStylePolicies { for k, v := range p.elsMatchingAndStyles { if k.MatchString(elementName) { if len(v) > 0 { hasStylePolicies = true break } } } } // Builds a new attribute slice based on the whether the attribute has been // whitelisted explicitly or globally. cleanAttrs := []html.Attribute{} for _, htmlAttr := range attrs { if p.allowDataAttributes { // If we see a data attribute, let it through. if isDataAttribute(htmlAttr.Key) { cleanAttrs = append(cleanAttrs, htmlAttr) continue } } // Is this a "style" attribute, and if so, do we need to sanitize it? if htmlAttr.Key == "style" && hasStylePolicies { htmlAttr = p.sanitizeStyles(htmlAttr, elementName) if htmlAttr.Val == "" { // We've sanitized away any and all styles; don't bother to // output the style attribute (even if it's allowed) continue } else { cleanAttrs = append(cleanAttrs, htmlAttr) continue } } // Is there an element specific attribute policy that applies? if ap, ok := aps[htmlAttr.Key]; ok { if ap.regexp != nil { if ap.regexp.MatchString(htmlAttr.Val) { cleanAttrs = append(cleanAttrs, htmlAttr) continue } } else { cleanAttrs = append(cleanAttrs, htmlAttr) continue } } // Is there a global attribute policy that applies? if ap, ok := p.globalAttrs[htmlAttr.Key]; ok { if ap.regexp != nil { if ap.regexp.MatchString(htmlAttr.Val) { cleanAttrs = append(cleanAttrs, htmlAttr) } } else { cleanAttrs = append(cleanAttrs, htmlAttr) } } } if len(cleanAttrs) == 0 { // If nothing was allowed, let's get out of here return cleanAttrs } // cleanAttrs now contains the attributes that are permitted if linkable(elementName) { if p.requireParseableURLs { // Ensure URLs are parseable: // - a.href // - area.href // - link.href // - blockquote.cite // - q.cite // - img.src // - script.src tmpAttrs := []html.Attribute{} for _, htmlAttr := range cleanAttrs { switch elementName { case "a", "area", "link": if htmlAttr.Key == "href" { if u, ok := p.validURL(htmlAttr.Val); ok { htmlAttr.Val = u tmpAttrs = append(tmpAttrs, htmlAttr) } break } tmpAttrs = append(tmpAttrs, htmlAttr) case "blockquote", "q": if htmlAttr.Key == "cite" { if u, ok := p.validURL(htmlAttr.Val); ok { htmlAttr.Val = u tmpAttrs = append(tmpAttrs, htmlAttr) } break } tmpAttrs = append(tmpAttrs, htmlAttr) case "img", "script": if htmlAttr.Key == "src" { if u, ok := p.validURL(htmlAttr.Val); ok { htmlAttr.Val = u tmpAttrs = append(tmpAttrs, htmlAttr) } break } tmpAttrs = append(tmpAttrs, htmlAttr) default: tmpAttrs = append(tmpAttrs, htmlAttr) } } cleanAttrs = tmpAttrs } if (p.requireNoFollow || p.requireNoFollowFullyQualifiedLinks || p.requireNoReferrer || p.requireNoReferrerFullyQualifiedLinks || p.addTargetBlankToFullyQualifiedLinks) && len(cleanAttrs) > 0 { // Add rel="nofollow" if a "href" exists switch elementName { case "a", "area", "link": var hrefFound bool var externalLink bool for _, htmlAttr := range cleanAttrs { if htmlAttr.Key == "href" { hrefFound = true u, err := url.Parse(htmlAttr.Val) if err != nil { continue } if u.Host != "" { externalLink = true } continue } } if hrefFound { var ( noFollowFound bool noReferrerFound bool targetBlankFound bool ) addNoFollow := (p.requireNoFollow || externalLink && p.requireNoFollowFullyQualifiedLinks) addNoReferrer := (p.requireNoReferrer || externalLink && p.requireNoReferrerFullyQualifiedLinks) addTargetBlank := (externalLink && p.addTargetBlankToFullyQualifiedLinks) tmpAttrs := []html.Attribute{} for _, htmlAttr := range cleanAttrs { var appended bool if htmlAttr.Key == "rel" && (addNoFollow || addNoReferrer) { if addNoFollow && !strings.Contains(htmlAttr.Val, "nofollow") { htmlAttr.Val += " nofollow" } if addNoReferrer && !strings.Contains(htmlAttr.Val, "noreferrer") { htmlAttr.Val += " noreferrer" } noFollowFound = addNoFollow noReferrerFound = addNoReferrer tmpAttrs = append(tmpAttrs, htmlAttr) appended = true } if elementName == "a" && htmlAttr.Key == "target" { if htmlAttr.Val == "_blank" { targetBlankFound = true } if addTargetBlank && !targetBlankFound { htmlAttr.Val = "_blank" targetBlankFound = true tmpAttrs = append(tmpAttrs, htmlAttr) appended = true } } if !appended { tmpAttrs = append(tmpAttrs, htmlAttr) } } if noFollowFound || noReferrerFound || targetBlankFound { cleanAttrs = tmpAttrs } if (addNoFollow && !noFollowFound) || (addNoReferrer && !noReferrerFound) { rel := html.Attribute{} rel.Key = "rel" if addNoFollow { rel.Val = "nofollow" } if addNoReferrer { if rel.Val != "" { rel.Val += " " } rel.Val += "noreferrer" } cleanAttrs = append(cleanAttrs, rel) } if elementName == "a" && addTargetBlank && !targetBlankFound { rel := html.Attribute{} rel.Key = "target" rel.Val = "_blank" targetBlankFound = true cleanAttrs = append(cleanAttrs, rel) } if targetBlankFound { // target="_blank" has a security risk that allows the // opened window/tab to issue JavaScript calls against // window.opener, which in effect allow the destination // of the link to control the source: // https://dev.to/ben/the-targetblank-vulnerability-by-example // // To mitigate this risk, we need to add a specific rel // attribute if it is not already present. // rel="noopener" // // Unfortunately this is processing the rel twice (we // already looked at it earlier ^^) as we cannot be sure // of the ordering of the href and rel, and whether we // have fully satisfied that we need to do this. This // double processing only happens *if* target="_blank" // is true. var noOpenerAdded bool tmpAttrs := []html.Attribute{} for _, htmlAttr := range cleanAttrs { var appended bool if htmlAttr.Key == "rel" { if strings.Contains(htmlAttr.Val, "noopener") { noOpenerAdded = true tmpAttrs = append(tmpAttrs, htmlAttr) } else { htmlAttr.Val += " noopener" noOpenerAdded = true tmpAttrs = append(tmpAttrs, htmlAttr) } appended = true } if !appended { tmpAttrs = append(tmpAttrs, htmlAttr) } } if noOpenerAdded { cleanAttrs = tmpAttrs } else { // rel attr was not found, or else noopener would // have been added already rel := html.Attribute{} rel.Key = "rel" rel.Val = "noopener" cleanAttrs = append(cleanAttrs, rel) } } } default: } } } if p.requireCrossOriginAnonymous && len(cleanAttrs) > 0 { switch elementName { case "audio", "img", "link", "script", "video": var crossOriginFound bool for _, htmlAttr := range cleanAttrs { if htmlAttr.Key == "crossorigin" { crossOriginFound = true htmlAttr.Val = "anonymous" } } if !crossOriginFound { crossOrigin := html.Attribute{} crossOrigin.Key = "crossorigin" crossOrigin.Val = "anonymous" cleanAttrs = append(cleanAttrs, crossOrigin) } } } return cleanAttrs } func (p *Policy) sanitizeStyles(attr html.Attribute, elementName string) html.Attribute { sps := p.elsAndStyles[elementName] if len(sps) == 0 { sps = map[string]stylePolicy{} // check for any matching elements, if we don't already have a policy found // if multiple matches are found they will be overwritten, it's best // to not have overlapping matchers for regex, policies := range p.elsMatchingAndStyles { if regex.MatchString(elementName) { for k, v := range policies { sps[k] = v } } } } //Add semi-colon to end to fix parsing issue if len(attr.Val) > 0 && attr.Val[len(attr.Val)-1] != ';' { attr.Val = attr.Val + ";" } decs, err := parser.ParseDeclarations(attr.Val) if err != nil { attr.Val = "" return attr } clean := []string{} prefixes := []string{"-webkit-", "-moz-", "-ms-", "-o-", "mso-", "-xv-", "-atsc-", "-wap-", "-khtml-", "prince-", "-ah-", "-hp-", "-ro-", "-rim-", "-tc-"} for _, dec := range decs { addedProperty := false tempProperty := strings.ToLower(dec.Property) tempValue := removeUnicode(strings.ToLower(dec.Value)) for _, i := range prefixes { tempProperty = strings.TrimPrefix(tempProperty, i) } if sp, ok := sps[tempProperty]; ok { if sp.handler != nil { if sp.handler(tempValue) { clean = append(clean, dec.Property+": "+dec.Value) addedProperty = true } } else if len(sp.enum) > 0 { if stringInSlice(tempValue, sp.enum) { clean = append(clean, dec.Property+": "+dec.Value) addedProperty = true } } else if sp.regexp != nil { if sp.regexp.MatchString(tempValue) { clean = append(clean, dec.Property+": "+dec.Value) addedProperty = true } continue } } if sp, ok := p.globalStyles[tempProperty]; ok && !addedProperty { if sp.handler != nil { if sp.handler(tempValue) { clean = append(clean, dec.Property+": "+dec.Value) } } else if len(sp.enum) > 0 { if stringInSlice(tempValue, sp.enum) { clean = append(clean, dec.Property+": "+dec.Value) } } else if sp.regexp != nil { if sp.regexp.MatchString(tempValue) { clean = append(clean, dec.Property+": "+dec.Value) } continue } } } if len(clean) > 0 { attr.Val = strings.Join(clean, "; ") } else { attr.Val = "" } return attr } func (p *Policy) allowNoAttrs(elementName string) bool { _, ok := p.setOfElementsAllowedWithoutAttrs[elementName] if !ok { for _, r := range p.setOfElementsMatchingAllowedWithoutAttrs { if r.MatchString(elementName) { ok = true break } } } return ok } func (p *Policy) validURL(rawurl string) (string, bool) { if p.requireParseableURLs { // URLs are valid if when space is trimmed the URL is valid rawurl = strings.TrimSpace(rawurl) // URLs cannot contain whitespace, unless it is a data-uri if (strings.Contains(rawurl, " ") || strings.Contains(rawurl, "\t") || strings.Contains(rawurl, "\n")) && !strings.HasPrefix(rawurl, `data:`) { return "", false } // URLs are valid if they parse u, err := url.Parse(rawurl) if err != nil { return "", false } if u.Scheme != "" { urlPolicy, ok := p.allowURLSchemes[u.Scheme] if !ok { return "", false } if urlPolicy == nil || urlPolicy(u) == true { return u.String(), true } return "", false } if p.allowRelativeURLs { if u.String() != "" { return u.String(), true } } return "", false } return rawurl, true } func linkable(elementName string) bool { switch elementName { case "a", "area", "blockquote", "img", "link", "script": return true default: return false } } // stringInSlice returns true if needle exists in haystack func stringInSlice(needle string, haystack []string) bool { for _, straw := range haystack { if strings.ToLower(straw) == strings.ToLower(needle) { return true } } return false } func isDataAttribute(val string) bool { if !dataAttribute.MatchString(val) { return false } rest := strings.Split(val, "data-") if len(rest) == 1 { return false } // data-xml* is invalid. if dataAttributeXMLPrefix.MatchString(rest[1]) { return false } // no uppercase or semi-colons allowed. if dataAttributeInvalidChars.MatchString(rest[1]) { return false } return true } func removeUnicode(value string) string { substitutedValue := value currentLoc := cssUnicodeChar.FindStringIndex(substitutedValue) for currentLoc != nil { character := substitutedValue[currentLoc[0]+1 : currentLoc[1]] character = strings.TrimSpace(character) if len(character) < 4 { character = strings.Repeat("0", 4-len(character)) + character } else { for len(character) > 4 { if character[0] != '0' { character = "" break } else { character = character[1:] } } } character = "\\u" + character translatedChar, err := strconv.Unquote(`"` + character + `"`) translatedChar = strings.TrimSpace(translatedChar) if err != nil { return "" } substitutedValue = substitutedValue[0:currentLoc[0]] + translatedChar + substitutedValue[currentLoc[1]:] currentLoc = cssUnicodeChar.FindStringIndex(substitutedValue) } return substitutedValue } func (p *Policy) matchRegex(elementName string) (map[string]attrPolicy, bool) { aps := make(map[string]attrPolicy, 0) matched := false for regex, attrs := range p.elsMatchingAndAttrs { if regex.MatchString(elementName) { matched = true for k, v := range attrs { aps[k] = v } } } return aps, matched } // normaliseElementName takes a HTML element like <script> which is user input // and returns a lower case version of it that is immune to UTF-8 to ASCII // conversion tricks (like the use of upper case cyrillic i scrÄ°pt which a // strings.ToLower would convert to script). Instead this func will preserve // all non-ASCII as their escaped equivalent, i.e. \u0130 which reveals the // characters when lower cased func normaliseElementName(str string) string { // that useful QuoteToASCII put quote marks at the start and end // so those are trimmed off return strings.TrimSuffix( strings.TrimPrefix( strings.ToLower( strconv.QuoteToASCII(str), ), `"`), `"`, ) }