diff --git a/test/test_utils.py b/test/test_utils.py index c3ec798dc..2c8f2c03e 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1659,10 +1659,10 @@ Line 1 html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING self.assertEqual( - get_elements_text_and_html_by_attribute('class', 'foo bar', html), + list(get_elements_text_and_html_by_attribute('class', 'foo bar', html)), list(zip(['nice', 'also nice'], self.GET_ELEMENTS_BY_CLASS_RES))) - self.assertEqual(get_elements_text_and_html_by_attribute('class', 'foo', html), []) - self.assertEqual(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html), []) + self.assertEqual(list(get_elements_text_and_html_by_attribute('class', 'foo', html)), []) + self.assertEqual(list(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html)), []) GET_ELEMENT_BY_TAG_TEST_STRING = ''' random text lorem ipsum
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 826ab5d29..9a66de9f5 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -473,24 +473,23 @@ def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value attribute in the passed HTML document """ + value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?' + value = re.escape(value) if escape_value else value - retlist = [] - for m in re.finditer(r'''(?xs) + partial_element_re = r'''(?x) <(?P["\'])(?P.*)(?P=q)$', r'\g ', content)), - whole, - )) + for m in re.finditer(partial_element_re, html): + content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():]) - return retlist + yield ( + unescapeHTML(re.sub(r'^(?P ["\'])(?P.*)(?P=q)$', r'\g ', content, flags=re.DOTALL)), + whole + ) class HTMLBreakOnClosingTagParser(compat_HTMLParser):