diff --git a/test/test_parsing.py b/test/test_parsing.py index 782a1196df..75ed8ebf34 100644 --- a/test/test_parsing.py +++ b/test/test_parsing.py @@ -1,29 +1,71 @@ import textwrap import unittest -from parsing import ( - FirstMatchingElementParser, - HTMLTagParser, +from yt_dlp.compat import compat_HTMLParseError +from yt_dlp.parsing import ( MatchingElementParser, + HTMLCommentRanges, + HTMLTagParser, ) -from yt_dlp.compat import compat_HTMLParseError - -get_element_by_attribute = FirstMatchingElementParser -get_element_by_class = FirstMatchingElementParser -get_element_html_by_attribute = FirstMatchingElementParser -get_element_html_by_class = FirstMatchingElementParser.get_element_html_by_class -get_element_text_and_html_by_tag = FirstMatchingElementParser.get_element_text_and_html_by_tag -get_elements_by_attribute = MatchingElementParser -get_elements_by_class = MatchingElementParser -get_elements_html_by_attribute = MatchingElementParser -get_elements_html_by_class = FirstMatchingElementParser.get_elements_html_by_class -get_elements_text_and_html_by_attribute = MatchingElementParser +extract_attributes = MatchingElementParser.extract_attributes +get_element_by_attribute = MatchingElementParser.get_element_by_attribute +get_element_by_class = MatchingElementParser.get_element_by_class +get_element_html_by_attribute = MatchingElementParser.get_element_html_by_attribute +get_element_html_by_class = MatchingElementParser.get_element_html_by_class +get_element_text_and_html_by_tag = MatchingElementParser.get_element_text_and_html_by_tag +get_elements_by_attribute = MatchingElementParser.get_elements_by_attribute +get_elements_by_class = MatchingElementParser.get_elements_by_class +get_elements_html_by_attribute = MatchingElementParser.get_elements_html_by_attribute +get_elements_html_by_class = MatchingElementParser.get_elements_html_by_class +get_elements_text_and_html_by_attribute = MatchingElementParser.get_elements_text_and_html_by_attribute +get_elements_text_and_html_by_tag = MatchingElementParser.get_elements_text_and_html_by_tag class TestParsing(unittest.TestCase): + def test_extract_attributes(self): + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(""), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': "a 'b' c"}) + self.assertEqual(extract_attributes(''), {'x': 'a "b" c'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': '&'}) # XML + self.assertEqual(extract_attributes(''), {'x': '"'}) + self.assertEqual(extract_attributes(''), {'x': '£'}) # HTML 3.2 + self.assertEqual(extract_attributes(''), {'x': 'λ'}) # HTML 4.0 + self.assertEqual(extract_attributes(''), {'x': '&foo'}) + self.assertEqual(extract_attributes(''), {'x': "'"}) + self.assertEqual(extract_attributes(''), {'x': '"'}) + self.assertEqual(extract_attributes(''), {'x': None}) + self.assertEqual(extract_attributes(''), {'x': 'y', 'a': None}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'y': '2', 'x': '3'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(""), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': '\ny\n'}) + self.assertEqual(extract_attributes(''), {'caps': 'x'}) # Names lowercased + self.assertEqual(extract_attributes(''), {'x': '2'}) + self.assertEqual(extract_attributes(''), {'x': '2'}) + self.assertEqual(extract_attributes(''), {'_:funny-name1': '1'}) + self.assertEqual(extract_attributes(''), {'x': 'Fáilte 世界 \U0001f600'}) + self.assertEqual(extract_attributes(''), {'x': 'décompose\u0301'}) + # "Narrow" Python builds don't support unicode code points outside BMP. + try: + chr(0x10000) + supports_outside_bmp = True + except ValueError: + supports_outside_bmp = False + if supports_outside_bmp: + self.assertEqual(extract_attributes(''), {'x': 'Smile \U0001f600!'}) + # Malformed HTML should not break attributes extraction on older Python + self.assertEqual(extract_attributes(''), {}) + GET_ELEMENT_BY_CLASS_TEST_STRING = ''' nice +
also nice
''' def test_get_element_by_class(self): @@ -35,7 +77,8 @@ class TestParsing(unittest.TestCase): def test_get_element_html_by_class(self): html = self.GET_ELEMENT_BY_CLASS_TEST_STRING - self.assertEqual(get_element_html_by_class('foo', html), html.strip()) + self.assertEqual(get_element_html_by_class('foo', html), + 'nice') self.assertEqual(get_element_by_class('no-such-class', html), None) GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING = ''' @@ -48,6 +91,7 @@ class TestParsing(unittest.TestCase): self.assertEqual(get_element_by_attribute('class', 'foo bar', html), 'nice') self.assertEqual(get_element_by_attribute('class', 'foo', html), None) self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None) + self.assertEqual(get_element_by_attribute('class', 'foo bar', html, tag='div'), 'also nice') html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING @@ -56,7 +100,8 @@ class TestParsing(unittest.TestCase): def test_get_element_html_by_attribute(self): html = self.GET_ELEMENT_BY_CLASS_TEST_STRING - self.assertEqual(get_element_html_by_attribute('class', 'foo bar', html), html.strip()) + self.assertEqual(get_element_html_by_attribute('class', 'foo bar', html), + 'nice') self.assertEqual(get_element_html_by_attribute('class', 'foo', html), None) self.assertEqual(get_element_html_by_attribute('class', 'no-such-foo', html), None) @@ -110,7 +155,7 @@ class TestParsing(unittest.TestCase): self.assertEqual(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html), []) self.assertEqual(get_elements_text_and_html_by_attribute( - 'class', 'foo', 'nicenice', tag='a'), + 'class', 'foo', 'nicenot nice', tag='a'), [('nice', 'nice')]) def test_get_element_text_and_html_by_tag(self): @@ -138,7 +183,16 @@ class TestParsing(unittest.TestCase): self.assertEqual( get_element_text_and_html_by_tag('span', html), (get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html)) - self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html) + self.assertIsNone(get_element_text_and_html_by_tag('article', html)) + + def test_get_elements_text_and_html_by_tag(self): + test_string = ''' + + + ignore + ''' + items = get_elements_text_and_html_by_tag('img', test_string) + self.assertListEqual(items, [('', ''), ('', '')]) def test_get_element_text_and_html_by_tag_malformed(self): inner_text = 'inner text' @@ -157,10 +211,8 @@ class TestParsing(unittest.TestCase): get_element_text_and_html_by_tag('malnested_b', html), (f'{inner_text}', f'{inner_text}')) - self.assertRaises( - compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}') - self.assertRaises( - compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}') + self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}')) + self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}')) def test_strict_html_parsing(self): class StrictTagParser(HTMLTagParser): @@ -188,14 +240,14 @@ class TestParsing(unittest.TestCase): self.assertEqual(parser.taglist('

', reset=True), []) tags = parser.taglist('

', reset=True) - self.assertEqual(tags, [Tag('div'), Tag('p')]) + self.assertEqual(tags, [Tag('p'), Tag('div')]) tags = parser.taglist('

/p>

', reset=True) self.assertEqual(tags, [Tag('div')]) - tags = parser.taglist('

paragraph

', reset=True) - self.assertEqual(tags, [Tag('p'), Tag('div')]) - self.assertEqual(tags[0].text_and_html(), ('paragraph', '

paragraph

paragraph

', reset=True) + self.assertEqual(tags, [Tag('div'), Tag('p')]) + self.assertEqual(tags[1].text_and_html(), ('paragraph', '

paragraph')) tags = parser.taglist('must be empty', reset=True) self.assertEqual(tags, [Tag('img')]) @@ -216,3 +268,65 @@ class TestParsing(unittest.TestCase): html = '''''' tags = parser.taglist(html, reset=True) self.assertEqual(tags[0].text_and_html(), ('', html)) + + def test_tag_return_order(self): + Tag = HTMLTagParser.Tag + html = ''' + + + + + + + + + + + + + + ''' + parser = HTMLTagParser() + tags = parser.taglist(html, reset=True) + self.assertEqual( + str(tags), str([Tag('t0'), Tag('t1'), Tag('t2'), Tag('t3'), Tag('t4'), + Tag('t5'), Tag('t6'), Tag('t7'), Tag('t8')])) + + tags = parser.taglist(html, reset=True, depth_first=True) + self.assertEqual( + str(tags), str([Tag('t3'), Tag('t4'), Tag('t2'), Tag('t1'), Tag('t6'), + Tag('t5'), Tag('t0'), Tag('t8'), Tag('t7')])) + + # return tags in nested order + tags = parser.taglist(html, reset=True, depth_first=None) + self.assertEqual( + str(tags), str([ + [Tag('t0'), + [Tag('t1'), + [Tag('t2'), Tag('t3'), Tag('t4')]], + [Tag('t5'), Tag('t6')]], + [Tag('t7'), Tag('t8')]])) + + def test_within_html_comment(self): + def mark_comments(_string, char='^', nochar='-'): + cmts = HTMLCommentRanges(_string) + return "".join(char if _idx in cmts else nochar for _idx in range(len(_string))) + + html_string = ''' + no comments in this line + --------------------------------------------------------------------- + + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + before after + -------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^-------- + here is and end + ------------^^^^^^^^^^^^^^^^^^---------^^^^^^^^^^^^^^^^^^^^^^^^------ + this ends here --> and not here + -----^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^---------------------------- + stray --> comment closings --> are ignored ' encountered + note: markers within quotes are not ignored + """ + + def __init__(self, html): + self._range_iter = self.ranges(html) + self._range = next(self._range_iter, None) + self._last_offset = 0 + + @staticmethod + def ranges(string, sopen=''): + assert not (sopen.startswith(sclose) or sclose.startswith(sopen)) + open_iter = iter_find(string, sopen) + close_len = len(sclose) + close_iter = (idx + close_len for idx in iter_find(string, sclose)) + next_open = next(open_iter, None) + next_close = next(close_iter, None) + + while True: + if next_open is None: + return + while next_close is not None and next_open > next_close: + next_close = next(close_iter, None) + yield slice(next_open, next_close) + if next_close is None: + return + while next_open is not None and next_open < next_close: + next_open = next(open_iter, None) + + def __contains__(self, offset): + assert isinstance(offset, int) + assert offset >= self._last_offset, 'offset must be in increasing order' + self._last_offset = offset + while self._range and self._range.stop is not None and offset >= self._range.stop: + self._range = next(self._range_iter, None) + + return not (self._range is None or offset < self._range.start) - # or return a list with all found tag objects - # this is faster by factor 2-5 compared to iteration - for tag_obj in HTMLTagParser(html).taglist(): + +class HTMLTagParser(HTMLParser): + """HTML parser which returns found elements as instances of 'Tag' + when STRICT=True can raise compat_HTMLParseError() on malformed HTML elements + + usage: + parser = HTMLTagParser() + for tag_obj in parser.taglist(html): tag_obj.text_and_html() + """ STRICT = False ANY_TAG_REGEX = re.compile(r'''<(?:"[^"]*"|'[^']*'|[^"'>])*?>''') - CLOSING_TAG_REGEX = re.compile(r']+(?:\s*>)?') VOID_TAGS = { 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr', } class Tag: - __slots__ = 'name', 'string', 'start', 'start_len', 'stop', 'attrs' + __slots__ = 'name', 'string', 'attrs', '_openrange', '_closerange' - def __init__(self, name, *, string='', start=None, stop=None, attrs=()): + def __init__(self, name, *, string='', attrs=()): self.name = name self.string = string - self.start = start - self.start_len = 0 - self.stop = stop self.attrs = tuple(attrs) + self._openrange = None + self._closerange = None def __str__(self): return self.name @@ -55,52 +97,81 @@ class HTMLTagParser(HTMLParser): def __eq__(self, other): return self.name == other + def openrange(self, offset, startlen=0): + if isinstance(offset, slice): + self._openrange = offset + else: + self._openrange = slice(offset, offset + startlen) + + def closerange(self, offset, stoplen=0): + if isinstance(offset, slice): + self._closerange = offset + else: + self._closerange = slice(offset, offset + stoplen) + + def opentag(self): + return self.string[self._openrange] if self._openrange else '' + def html(self): - return self.string[self.start:self.stop] + if not self._openrange: + return '' + if self._closerange: + return self.string[self._openrange.start:self._closerange.stop] + return self.string[self._openrange] + + def text(self): + if self._openrange and self._closerange: + return self.string[self._openrange.stop:self._closerange.start] + return '' def text_and_html(self): - assert isinstance(self.start, int) - if not self.start_len: - match = HTMLTagParser.ANY_TAG_REGEX.match(self.string[self.start:]) - assert match - self.start_len = len(match.group()) - if self.stop is None: - return '', self.string[self.start: self.start + self.start_len] - html = self.html() - cidx = html.rindex('') or tag in self.VOID_TAGS: - if self.callback(obj) is not False: - self.found_tags.append(obj) + self._nestedtags[-1].append(tag_obj) + self.callback(tag_obj) return - else: - obj = None - - self.tagstack.appendleft(obj or tag) + nesting = [] + self._nestedtags[-1].append(nesting) + self._nestedtags.append(nesting) + self.tagstack.appendleft(tag_obj) handle_startendtag = handle_starttag @@ -141,79 +213,150 @@ class HTMLTagParser(HTMLParser): f'malnested closing tag {tag!r}, expected after {open_tags!r}') tag_obj = self.tagstack[idx] self.tagstack.remove(tag) - if not isinstance(tag_obj, str): - # since we landed here we'll always find a closing tag - match = self.CLOSING_TAG_REGEX.match(self.rawdata[self._offset:]) - tag_obj.stop = self._offset + match.end() - if self.callback(tag_obj) is not False: - self.found_tags.append(tag_obj) + if isinstance(tag_obj, self.Tag): + close_idx = self.rawdata.find('>', self._offset) + 1 + tag_obj.closerange(self._offset, close_idx - self._offset) + self._nestedtags.pop().insert(0, tag_obj) + self.callback(tag_obj) except ValueError as exc: if isinstance(exc, compat_HTMLParseError): raise - elif self.STRICT: - raise compat_HTMLParseError(f'stray closing tag {tag!r}') + if self.STRICT: + raise compat_HTMLParseError(f'stray closing tag {tag!r}') from exc -class ClassParser(HTMLTagParser): - def __init__(self, attribute, matchfunc, stop): +class MatchingElementParser(HTMLTagParser): + """ optimized version of HTMLTagParser + """ + def __init__(self, matchfunc): super().__init__() - self.search_attr = attribute self.matchfunc = matchfunc - self.stop = stop - self.processing = 0 + self.found_none = True + + def reset(self): + super().reset() + self.found_none = True + + def callback(self, tag_obj): + raise self.AbortException() def predicate(self, tag, attrs): - if self.processing <= 0 and self.stop is not None and self._offset > self.stop: - self.abort() - string = dict(attrs).get(self.search_attr, '') - if self.matchfunc(string): - self.processing += 1 + if self.found_none and self.matchfunc(tag, attrs): + self.found_none = False return True return False - def callback(self, tag_obj): - if self.stop is None: - self.abort(tag_obj) - self.processing -= 1 + @staticmethod + def class_value_regex(class_name): + return rf'[\w\s\-]*(?"']|"[^"]*"|'[^']*')*)? + \s{re.escape(attribute)}\s*=\s*(?P<_q>['"])(?-x:{value_regex})(?P=_q) + ''' @classmethod - def get_elements_html_by_class(cls, class_name, html): - regex = re.compile(rf'[\w\- ]*\b{re.escape(class_name)}\b') - it = re.finditer(rf'<.+ class=[\'"]{regex.pattern}', html) - start = stop = None - for match in it: - if start is None: - start = match.start() - else: - stop = match.end() - if start is None: - return [] - parser = cls('class', lambda x: regex.match(x), stop) - return [tag.html() for tag in parser.taglist(html[start:])] + def iter_tags(cls, regex, html, *, matchfunc): + comments = HTMLCommentRanges(html) + parser = cls(matchfunc) + for match in re.finditer(regex, html): + if match.start() not in comments: + yield from parser.taglist(html[match.start():], reset=True) + @classmethod + def tags_by_name(cls, tag, html): + def matchfunc(tag_str, _attrs): + return tag_str == tag -class FirstMatchingElementParser(HTMLTagParser): - def __init__(self, matchfunc): - super().__init__() - self.matchfunc = matchfunc - self.found = False + yield from cls.iter_tags(rf'<\s*{re.escape(tag)}[\s>]', html, matchfunc=matchfunc) - def predicate(self, tag, attrs): - if not self.found and self.matchfunc(tag, attrs): - self.found = True - return True - return False + @classmethod + def tags_by_attribute(cls, attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True): + def matchfunc(_tag_str, attrs): + return any(attr == attribute and re.fullmatch(value, value_str) + for attr, value_str in attrs) + + tag_regex = cls.matching_tag_regex(tag, attribute, value, escape_value) + yield from cls.iter_tags(tag_regex, html, matchfunc=matchfunc) + + @classmethod + def extract_attributes(cls, html): + attr_dict = {} + + def matchfunc(_tag, attrs): + attr_dict.update(attrs) + raise cls.AbortException() + + with contextlib.suppress(cls.AbortException): + cls(matchfunc).feed(html) - def callback(self, obj): - self.abort(obj) + return attr_dict + + @classmethod + def get_elements_text_and_html_by_tag(cls, tag, html): + return [tag.text_and_html() for tag in cls.tags_by_name(tag, html)] @classmethod def get_element_text_and_html_by_tag(cls, tag, html): - """ - For the first element with the specified tag in the given HTML document - return its content (text) and the whole element (html) - """ - parser = cls(lambda _tag, _: _tag == tag) - for tag_obj in parser.taglist(html): - return tag_obj.text_and_html() - raise compat_HTMLParseError(f'tag {tag} not found') + tag = next(cls.tags_by_name(tag, html), None) + return tag and tag.text_and_html() + + @classmethod + def get_elements_text_and_html_by_attribute(cls, *args, **kwargs): + return [tag.text_and_html() for tag in cls.tags_by_attribute(*args, **kwargs)] + + @classmethod + def get_elements_by_attribute(cls, *args, **kwargs): + return [tag.text_and_html()[0] for tag in cls.tags_by_attribute(*args, **kwargs)] + + @classmethod + def get_elements_html_by_attribute(cls, *args, **kwargs): + return [tag.html() for tag in cls.tags_by_attribute(*args, **kwargs)] + + @classmethod + def get_element_by_attribute(cls, *args, **kwargs): + tag = next(cls.tags_by_attribute(*args, **kwargs), None) + return tag and tag.text() + + @classmethod + def get_element_html_by_attribute(cls, *args, **kwargs): + tag = next(cls.tags_by_attribute(*args, **kwargs), None) + return tag and tag.html() + + @classmethod + def get_elements_by_class(cls, class_name, html): + value = cls.class_value_regex(class_name) + return [tag.text() for tag + in cls.tags_by_attribute('class', value, html, escape_value=False)] + + @classmethod + def get_elements_html_by_class(cls, class_name, html): + value = cls.class_value_regex(class_name) + return [tag.html() for tag + in cls.tags_by_attribute('class', value, html, escape_value=False)] + + @classmethod + def get_elements_text_and_html_by_class(cls, class_name, html): + value = cls.class_value_regex(class_name) + return [tag.text() for tag + in cls.tags_by_attribute('class', value, html, escape_value=False)] + + @classmethod + def get_element_html_by_class(cls, class_name, html): + value = cls.class_value_regex(class_name) + tag = next(cls.tags_by_attribute('class', value, html, escape_value=False), None) + return tag and tag.html() + + @classmethod + def get_element_by_class(cls, class_name, html): + value = cls.class_value_regex(class_name) + tag = next(cls.tags_by_attribute('class', value, html, escape_value=False), None) + return tag and tag.text()