[utils] Improve get_elements_text_and_html_by_attribute regex (#2280)

Authored by: zmousm, pukkandan
This commit is contained in:
Zenon Mousmoulas 2022-01-09 20:14:56 +02:00 committed by GitHub
parent a70b71e85a
commit 0254f16274
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 15 additions and 16 deletions

View File

@@ -1659,10 +1659,10 @@ Line 1
html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
self.assertEqual( self.assertEqual(
get_elements_text_and_html_by_attribute('class', 'foo bar', html), list(get_elements_text_and_html_by_attribute('class', 'foo bar', html)),
list(zip(['nice', 'also nice'], self.GET_ELEMENTS_BY_CLASS_RES))) list(zip(['nice', 'also nice'], self.GET_ELEMENTS_BY_CLASS_RES)))
self.assertEqual(get_elements_text_and_html_by_attribute('class', 'foo', html), []) self.assertEqual(list(get_elements_text_and_html_by_attribute('class', 'foo', html)), [])
self.assertEqual(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html), []) self.assertEqual(list(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html)), [])
GET_ELEMENT_BY_TAG_TEST_STRING = ''' GET_ELEMENT_BY_TAG_TEST_STRING = '''
random text lorem ipsum</p> random text lorem ipsum</p>

View File

@@ -473,24 +473,23 @@ def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value
attribute in the passed HTML document attribute in the passed HTML document
""" """
value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
value = re.escape(value) if escape_value else value value = re.escape(value) if escape_value else value
retlist = [] partial_element_re = r'''(?x)
for m in re.finditer(r'''(?xs)
<(?P<tag>[a-zA-Z0-9:._-]+) <(?P<tag>[a-zA-Z0-9:._-]+)
(?:\s+[a-zA-Z0-9_:.-]+(?:=\S*?|\s*=\s*(?:"[^"]*"|'[^']*')|))*? (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
\s+%(attribute)s(?:=%(value)s|\s*=\s*(?P<_q>['"]?)%(value)s(?P=_q)) \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q)
(?:\s+[a-zA-Z0-9_:.-]+(?:=\S*?|\s*=\s*(?:"[^"]*"|'[^']*')|))*? ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional}
\s*>
''' % {'attribute': re.escape(attribute), 'value': value}, html): for m in re.finditer(partial_element_re, html):
content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():]) content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
retlist.append(( yield (
unescapeHTML(re.sub(r'(?s)^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content)), unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
whole, whole
)) )
return retlist
class HTMLBreakOnClosingTagParser(compat_HTMLParser): class HTMLBreakOnClosingTagParser(compat_HTMLParser):