[parsing] rework interface, implement all get_element(s) functions and extract_attributes() as MatchingElementParser class methods, and improve performance

3 years ago · 176a156c65
parent e092ba9922
commit 176a156c65
2 changed files with 394 additions and 137 deletions
--- a/test/test_parsing.py
+++ b/test/test_parsing.py
@ -1,29 +1,71 @@
 import textwrap
 import unittest

-from parsing import (
-    FirstMatchingElementParser,
-    HTMLTagParser,
+from yt_dlp.compat import compat_HTMLParseError
+from yt_dlp.parsing import (
    MatchingElementParser,
+    HTMLCommentRanges,
+    HTMLTagParser,
 )

-from yt_dlp.compat import compat_HTMLParseError
-
-get_element_by_attribute = FirstMatchingElementParser
-get_element_by_class = FirstMatchingElementParser
-get_element_html_by_attribute = FirstMatchingElementParser
-get_element_html_by_class = FirstMatchingElementParser.get_element_html_by_class
-get_element_text_and_html_by_tag = FirstMatchingElementParser.get_element_text_and_html_by_tag
-get_elements_by_attribute = MatchingElementParser
-get_elements_by_class = MatchingElementParser
-get_elements_html_by_attribute = MatchingElementParser
-get_elements_html_by_class = FirstMatchingElementParser.get_elements_html_by_class
-get_elements_text_and_html_by_attribute = MatchingElementParser
+extract_attributes = MatchingElementParser.extract_attributes
+get_element_by_attribute = MatchingElementParser.get_element_by_attribute
+get_element_by_class = MatchingElementParser.get_element_by_class
+get_element_html_by_attribute = MatchingElementParser.get_element_html_by_attribute
+get_element_html_by_class = MatchingElementParser.get_element_html_by_class
+get_element_text_and_html_by_tag = MatchingElementParser.get_element_text_and_html_by_tag
+get_elements_by_attribute = MatchingElementParser.get_elements_by_attribute
+get_elements_by_class = MatchingElementParser.get_elements_by_class
+get_elements_html_by_attribute = MatchingElementParser.get_elements_html_by_attribute
+get_elements_html_by_class = MatchingElementParser.get_elements_html_by_class
+get_elements_text_and_html_by_attribute = MatchingElementParser.get_elements_text_and_html_by_attribute
+get_elements_text_and_html_by_tag = MatchingElementParser.get_elements_text_and_html_by_tag


 class TestParsing(unittest.TestCase):
+    def test_extract_attributes(self):
+        self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'})
+        self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'})
+        self.assertEqual(extract_attributes('<e x=y>'), {'x': 'y'})
+        self.assertEqual(extract_attributes('<e x="a \'b\' c">'), {'x': "a 'b' c"})
+        self.assertEqual(extract_attributes('<e x=\'a "b" c\'>'), {'x': 'a "b" c'})
+        self.assertEqual(extract_attributes('<e x="&#121;">'), {'x': 'y'})
+        self.assertEqual(extract_attributes('<e x="&#x79;">'), {'x': 'y'})
+        self.assertEqual(extract_attributes('<e x="&amp;">'), {'x': '&'})  # XML
+        self.assertEqual(extract_attributes('<e x="&quot;">'), {'x': '"'})
+        self.assertEqual(extract_attributes('<e x="&pound;">'), {'x': '£'})  # HTML 3.2
+        self.assertEqual(extract_attributes('<e x="&lambda;">'), {'x': 'λ'})  # HTML 4.0
+        self.assertEqual(extract_attributes('<e x="&foo">'), {'x': '&foo'})
+        self.assertEqual(extract_attributes('<e x="\'">'), {'x': "'"})
+        self.assertEqual(extract_attributes('<e x=\'"\'>'), {'x': '"'})
+        self.assertEqual(extract_attributes('<e x >'), {'x': None})
+        self.assertEqual(extract_attributes('<e x=y a>'), {'x': 'y', 'a': None})
+        self.assertEqual(extract_attributes('<e x= y>'), {'x': 'y'})
+        self.assertEqual(extract_attributes('<e x=1 y=2 x=3>'), {'y': '2', 'x': '3'})
+        self.assertEqual(extract_attributes('<e \nx=\ny\n>'), {'x': 'y'})
+        self.assertEqual(extract_attributes('<e \nx=\n"y"\n>'), {'x': 'y'})
+        self.assertEqual(extract_attributes("<e \nx=\n'y'\n>"), {'x': 'y'})
+        self.assertEqual(extract_attributes('<e \nx="\ny\n">'), {'x': '\ny\n'})
+        self.assertEqual(extract_attributes('<e CAPS=x>'), {'caps': 'x'})  # Names lowercased
+        self.assertEqual(extract_attributes('<e x=1 X=2>'), {'x': '2'})
+        self.assertEqual(extract_attributes('<e X=1 x=2>'), {'x': '2'})
+        self.assertEqual(extract_attributes('<e _:funny-name1=1>'), {'_:funny-name1': '1'})
+        self.assertEqual(extract_attributes('<e x="Fáilte 世界 \U0001f600">'), {'x': 'Fáilte 世界 \U0001f600'})
+        self.assertEqual(extract_attributes('<e x="décompose&#769;">'), {'x': 'décompose\u0301'})
+        # "Narrow" Python builds don't support unicode code points outside BMP.
+        try:
+            chr(0x10000)
+            supports_outside_bmp = True
+        except ValueError:
+            supports_outside_bmp = False
+        if supports_outside_bmp:
+            self.assertEqual(extract_attributes('<e x="Smile &#128512;!">'), {'x': 'Smile \U0001f600!'})
+        # Malformed HTML should not break attributes extraction on older Python
+        self.assertEqual(extract_attributes('<mal"formed/>'), {})
+
    GET_ELEMENT_BY_CLASS_TEST_STRING = '''
        <span class="foo bar">nice</span>
+        <div class="foo bar">also nice</div>
    '''

    def test_get_element_by_class(self):
@ -35,7 +77,8 @@ class TestParsing(unittest.TestCase):
    def test_get_element_html_by_class(self):
        html = self.GET_ELEMENT_BY_CLASS_TEST_STRING

-        self.assertEqual(get_element_html_by_class('foo', html), html.strip())
+        self.assertEqual(get_element_html_by_class('foo', html),
+                         '<span class="foo bar">nice</span>')
        self.assertEqual(get_element_by_class('no-such-class', html), None)

    GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING = '''
@ -48,6 +91,7 @@ class TestParsing(unittest.TestCase):
        self.assertEqual(get_element_by_attribute('class', 'foo bar', html), 'nice')
        self.assertEqual(get_element_by_attribute('class', 'foo', html), None)
        self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None)
+        self.assertEqual(get_element_by_attribute('class', 'foo bar', html, tag='div'), 'also nice')

        html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING

@ -56,7 +100,8 @@ class TestParsing(unittest.TestCase):
    def test_get_element_html_by_attribute(self):
        html = self.GET_ELEMENT_BY_CLASS_TEST_STRING

-        self.assertEqual(get_element_html_by_attribute('class', 'foo bar', html), html.strip())
+        self.assertEqual(get_element_html_by_attribute('class', 'foo bar', html),
+                         '<span class="foo bar">nice</span>')
        self.assertEqual(get_element_html_by_attribute('class', 'foo', html), None)
        self.assertEqual(get_element_html_by_attribute('class', 'no-such-foo', html), None)

@ -110,7 +155,7 @@ class TestParsing(unittest.TestCase):
        self.assertEqual(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html), [])

        self.assertEqual(get_elements_text_and_html_by_attribute(
-            'class', 'foo', '<a class="foo">nice</a><span class="foo">nice</span>', tag='a'),
+            'class', 'foo', '<a class="foo">nice</a><span class="foo">not nice</span>', tag='a'),
            [('nice', '<a class="foo">nice</a>')])

    def test_get_element_text_and_html_by_tag(self):
@ -138,7 +183,16 @@ class TestParsing(unittest.TestCase):
        self.assertEqual(
            get_element_text_and_html_by_tag('span', html),
            (get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html))
-        self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html)
+        self.assertIsNone(get_element_text_and_html_by_tag('article', html))
+
+    def test_get_elements_text_and_html_by_tag(self):
+        test_string = '''
+            <img src="a.png">
+            <img src="b.png" />
+            <span>ignore</span>
+        '''
+        items = get_elements_text_and_html_by_tag('img', test_string)
+        self.assertListEqual(items, [('', '<img src="a.png">'), ('', '<img src="b.png" />')])

    def test_get_element_text_and_html_by_tag_malformed(self):
        inner_text = 'inner text'
@ -157,10 +211,8 @@ class TestParsing(unittest.TestCase):
            get_element_text_and_html_by_tag('malnested_b', html),
            (f'{inner_text}</malnested_a>',
             f'<malnested_b>{inner_text}</malnested_a></malnested_b>'))
-        self.assertRaises(
-            compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}</orphan>')
-        self.assertRaises(
-            compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'<orphan>{html}')
+        self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}</orphan>'))
+        self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'<orphan>{html}'))

    def test_strict_html_parsing(self):
        class StrictTagParser(HTMLTagParser):
@ -188,14 +240,14 @@ class TestParsing(unittest.TestCase):
        self.assertEqual(parser.taglist('<div><p>', reset=True), [])

        tags = parser.taglist('<div><p></div></p>', reset=True)
-        self.assertEqual(tags, [Tag('div'), Tag('p')])
+        self.assertEqual(tags, [Tag('p'), Tag('div')])

        tags = parser.taglist('<div><p>/p></div>', reset=True)
        self.assertEqual(tags, [Tag('div')])

-        tags = parser.taglist('<div><p>paragraph</p<ignored /></div>', reset=True)
-        self.assertEqual(tags, [Tag('p'), Tag('div')])
-        self.assertEqual(tags[0].text_and_html(), ('paragraph', '<p>paragraph</p'))
+        tags = parser.taglist('<div><p>paragraph</p<ignored></div>', reset=True)
+        self.assertEqual(tags, [Tag('div'), Tag('p')])
+        self.assertEqual(tags[1].text_and_html(), ('paragraph', '<p>paragraph</p<ignored>'))

        tags = parser.taglist('<img width="300px">must be empty</img>', reset=True)
        self.assertEqual(tags, [Tag('img')])
@ -216,3 +268,65 @@ class TestParsing(unittest.TestCase):
        html = '''<img greater_a='1>0' greater_b="1>0">'''
        tags = parser.taglist(html, reset=True)
        self.assertEqual(tags[0].text_and_html(), ('', html))
+
+    def test_tag_return_order(self):
+        Tag = HTMLTagParser.Tag
+        html = '''
+        <t0>
+            <t1>
+                <t2>
+                    <t3 /> <t4 />
+                </t2>
+            </t1>
+            <t5>
+                <t6 />
+            </t5>
+        </t0>
+        <t7>
+            <t8 />
+        </t7>
+        '''
+        parser = HTMLTagParser()
+        tags = parser.taglist(html, reset=True)
+        self.assertEqual(
+            str(tags), str([Tag('t0'), Tag('t1'), Tag('t2'), Tag('t3'), Tag('t4'),
+                            Tag('t5'), Tag('t6'), Tag('t7'), Tag('t8')]))
+
+        tags = parser.taglist(html, reset=True, depth_first=True)
+        self.assertEqual(
+            str(tags), str([Tag('t3'), Tag('t4'), Tag('t2'), Tag('t1'), Tag('t6'),
+                            Tag('t5'), Tag('t0'), Tag('t8'), Tag('t7')]))
+
+        # return tags in nested order
+        tags = parser.taglist(html, reset=True, depth_first=None)
+        self.assertEqual(
+            str(tags), str([
+                [Tag('t0'),
+                 [Tag('t1'),
+                  [Tag('t2'), Tag('t3'), Tag('t4')]],
+                 [Tag('t5'), Tag('t6')]],
+                [Tag('t7'), Tag('t8')]]))
+
+    def test_within_html_comment(self):
+        def mark_comments(_string, char='^', nochar='-'):
+            cmts = HTMLCommentRanges(_string)
+            return "".join(char if _idx in cmts else nochar for _idx in range(len(_string)))
+
+        html_string = '''
+        no              comments         in            this              line
+        ---------------------------------------------------------------------
+        <!--                 whole line represents a comment              -->
+        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+        before <!--                      comment                  -->   after
+        -------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^--------
+        here   is   <!-- a comment -->   and   <!-- another comment -->   end
+        ------------^^^^^^^^^^^^^^^^^^---------^^^^^^^^^^^^^^^^^^^^^^^^------
+        this <!-- nested  <!--     comment    -->  ends here --> and not here
+        -----^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^----------------------------
+        stray --> comment closings --> are ignored <!-- but not <!-- openings
+        -------------------------------------------^^^^^^^^^^^^^^^^^^^^^^^^^^
+        '''
+
+        lines = textwrap.dedent(html_string).strip().splitlines()
+        for line, marker in zip(lines[0::2], lines[1::2]):
+            self.assertEqual((line, mark_comments(line)), (line, marker))
--- a/yt_dlp/parsing.py
+++ b/yt_dlp/parsing.py
@ -4,47 +4,89 @@ import itertools
 import re
 from html.parser import HTMLParser

+from .compat import compat_HTMLParseError
 from .utils import orderedSet

-from .compat import compat_HTMLParseError

+def iter_find(string, sub: str):
+    """ lazily yield the start offset of every non-overlapping occurrence of 'sub' in 'string'
+
+    offsets are produced in increasing order, the search resumes right behind
+    the end of the previous occurrence
+    an empty 'sub' yields nothing: it would be found at the same offset over
+    and over again, so the generator would never terminate
+    """
+    size = len(sub)
+    if not size:
+        return
+    idx = -size
+    while True:
+        idx = string.find(sub, idx + size)
+        if idx == -1:
+            return
+        yield idx

-class HTMLTagParser(HTMLParser):
-    """HTML parser which acts as iterator
-    returns found elements as instances of Tag
-    nested elements will be returned before its parents

-    strict=True raises compat_HTMLParseError on malformed html
+class HTMLCommentRanges:
+    """computes the offsets of HTML comments

-    two modes of usage:
-        # as an lazy iterator:
-        for tag_obj in HTMLTagParser(html):
-            tag_obj.text_and_html()
+    comments start with '<!--' and end with the first '-->' encountered
+    note: markers within quotes are not ignored
+    """
+
+    def __init__(self, html):
+        self._range_iter = self.ranges(html)
+        self._range = next(self._range_iter, None)
+        self._last_offset = 0
+
+    @staticmethod
+    def ranges(string, sopen='<!--', sclose='-->'):
+        """ lazily yield a slice for every range enclosed by the markers 'sopen' and 'sclose'
+
+        a range starts at the first character of an opening marker and stops
+        behind the last character of the first closing marker that follows
+        opening markers within an open range and stray closing markers are ignored
+        a range which is never closed is yielded with stop=None
+        @param string:  text to search
+        @param sopen:   opening marker
+        @param sclose:  closing marker, must not share a prefix with 'sopen'
+        """
+        assert not (sopen.startswith(sclose) or sclose.startswith(sopen))
+        open_iter = iter_find(string, sopen)
+        close_len = len(sclose)
+        # offsets of the closings point behind the marker (exclusive range end)
+        close_iter = (idx + close_len for idx in iter_find(string, sclose))
+        next_open = next(open_iter, None)
+        next_close = next(close_iter, None)
+
+        while True:
+            if next_open is None:
+                return
+            # skip closings which end at or before the opening
+            # '>=' is required for directly adjacent ranges ('--><!--'): with '>' the
+            # closing of the previous range is reused and an empty slice is yielded forever
+            while next_close is not None and next_open >= next_close:
+                next_close = next(close_iter, None)
+            yield slice(next_open, next_close)
+            if next_close is None:
+                return
+            # skip openings located within the range just yielded
+            while next_open is not None and next_open < next_close:
+                next_open = next(open_iter, None)
+
+    def __contains__(self, offset):
+        """ return True if 'offset' is located within a comment range
+
+        this check is stateful: ranges are consumed lazily, hence offsets
+        must be passed in increasing (non-decreasing) order
+        """
+        assert isinstance(offset, int)
+        assert offset >= self._last_offset, 'offset must be in increasing order'
+        self._last_offset = offset
+        # drop all ranges ending at or before the offset, a range with stop=None never ends
+        while self._range and self._range.stop is not None and offset >= self._range.stop:
+            self._range = next(self._range_iter, None)
+
+        # the current range (if any) ends behind the offset, so only its start is left to check
+        return not (self._range is None or offset < self._range.start)

-        # or return a list with all found tag objects
-        # this is faster by factor 2-5 compared to iteration
-        for tag_obj in HTMLTagParser(html).taglist():
+
+class HTMLTagParser(HTMLParser):
+    """HTML parser which returns found elements as instances of 'Tag'
+    when STRICT=True can raise compat_HTMLParseError() on malformed HTML elements
+
+    usage:
+        parser = HTMLTagParser()
+        for tag_obj in parser.taglist(html):
            tag_obj.text_and_html()
+
    """

    STRICT = False
    ANY_TAG_REGEX = re.compile(r'''<(?:"[^"]*"|'[^']*'|[^"'>])*?>''')
-    CLOSING_TAG_REGEX = re.compile(r'</\s*[^\s<>]+(?:\s*>)?')
    VOID_TAGS = {
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr',
    }

    class Tag:
-        __slots__ = 'name', 'string', 'start', 'start_len', 'stop', 'attrs'
+        __slots__ = 'name', 'string', 'attrs', '_openrange', '_closerange'

-        def __init__(self, name, *, string='', start=None, stop=None, attrs=()):
+        def __init__(self, name, *, string='', attrs=()):
            self.name = name
            self.string = string
-            self.start = start
-            self.start_len = 0
-            self.stop = stop
            self.attrs = tuple(attrs)
+            self._openrange = None
+            self._closerange = None

        def __str__(self):
            return self.name
@ -55,52 +97,81 @@ class HTMLTagParser(HTMLParser):
        def __eq__(self, other):
            return self.name == other

+        def openrange(self, offset, startlen=0):
+            if isinstance(offset, slice):
+                self._openrange = offset
+            else:
+                self._openrange = slice(offset, offset + startlen)
+
+        def closerange(self, offset, stoplen=0):
+            if isinstance(offset, slice):
+                self._closerange = offset
+            else:
+                self._closerange = slice(offset, offset + stoplen)
+
+        def opentag(self):
+            return self.string[self._openrange] if self._openrange else ''
+
        def html(self):
-            return self.string[self.start:self.stop]
+            if not self._openrange:
+                return ''
+            if self._closerange:
+                return self.string[self._openrange.start:self._closerange.stop]
+            return self.string[self._openrange]
+
+        def text(self):
+            if self._openrange and self._closerange:
+                return self.string[self._openrange.stop:self._closerange.start]
+            return ''

        def text_and_html(self):
-            assert isinstance(self.start, int)
-            if not self.start_len:
-                match = HTMLTagParser.ANY_TAG_REGEX.match(self.string[self.start:])
-                assert match
-                self.start_len = len(match.group())
-            if self.stop is None:
-                return '', self.string[self.start: self.start + self.start_len]
-            html = self.html()
-            cidx = html.rindex('</')
-            return html[self.start_len:cidx], html
-
-    class EarlyExitException(Exception):
+            return self.text(), self.html()
+
+    class AbortException(Exception):
        pass

    def __init__(self):
-        super().__init__()
        self.tagstack = collections.deque()
+        self._nestedtags = [[]]
+        super().__init__()
        self._offset = self.offset
-        self.found_tags = []

    def predicate(self, tag, attrs):
+        """ return True for every encountered opening tag that should be processed """
        return True

    def callback(self, tag_obj):
-        pass
-
-    def abort(self, last_tag=None):
-        if last_tag:
-            self.found_tags.append(last_tag)
-        raise HTMLTagParser.EarlyExitException()
+        """ this will be called when the requested tag is closed """
+
+    def reset(self):
+        """ reset the parser state, forget unclosed tags and already collected tags """
+        super().reset()
+        self.tagstack.clear()
+        # taglist() only clears the collected tags after a successful run,
+        # drop leftovers of a run which was terminated by an exception
+        self._nestedtags = [[]]
+
+    def taglist(self, data, reset=True, depth_first=False):
+        """ parse data and return found tag objects
+        @param data:    html string
+        @param reset:   reset state
+        @param depth_first: return order: as opened (False), as closed (True), nested (None)
+        @return: list of Tag objects
+        """
+        def flatten(_list, first=True):
+            rlist = _list if first or not depth_first else itertools.chain(_list[1:], _list[:1])
+            for item in rlist:
+                if isinstance(item, list):
+                    yield from flatten(item, first=False)
+                else:
+                    yield item

-    def taglist(self, data, reset=True):
-        self.found_tags.clear()
        if reset:
            self.reset()
-            self.tagstack.clear()
-        with contextlib.suppress(HTMLTagParser.EarlyExitException):
+        with contextlib.suppress(HTMLTagParser.AbortException):
            self.feed(data)
        if self.STRICT and self.tagstack:
            orphans = ', '.join(map(repr, map(str, orderedSet(self.tagstack, lazy=True))))
            raise compat_HTMLParseError(f'unclosed tag {orphans}')
-        return self.found_tags
+        taglist = self._nestedtags[0] if depth_first is None else list(flatten(self._nestedtags[0]))
+        self._nestedtags = [[]]
+        return taglist

    def updatepos(self, i, j):
        offset = self._offset = super().updatepos(i, j)
@ -108,22 +179,23 @@ class HTMLTagParser(HTMLParser):

    def handle_starttag(self, tag, attrs):
        try:
-            # we use internal variable for performance reason
+            # we use internal variable for performance reasons
            tag_text = getattr(self, '_HTMLParser__starttag_text')
        except AttributeError:
            tag_text = HTMLTagParser.ANY_TAG_REGEX.match(self.rawdata[self._offset:]).group()
+
+        tag_obj = tag
        if self.predicate(tag, attrs):
-            obj = self.Tag(
-                tag, string=self.rawdata, start=self._offset, attrs=attrs)
-            obj.start_len = len(tag_text)
+            tag_obj = self.Tag(tag, string=self.rawdata, attrs=attrs)
+            tag_obj.openrange(self._offset, len(tag_text))
            if tag_text.endswith('/>') or tag in self.VOID_TAGS:
-                if self.callback(obj) is not False:
-                    self.found_tags.append(obj)
+                self._nestedtags[-1].append(tag_obj)
+                self.callback(tag_obj)
                return
-        else:
-            obj = None
-
-        self.tagstack.appendleft(obj or tag)
+            nesting = []
+            self._nestedtags[-1].append(nesting)
+            self._nestedtags.append(nesting)
+        self.tagstack.appendleft(tag_obj)

    handle_startendtag = handle_starttag

@ -141,79 +213,150 @@ class HTMLTagParser(HTMLParser):
                    f'malnested closing tag {tag!r}, expected after {open_tags!r}')
            tag_obj = self.tagstack[idx]
            self.tagstack.remove(tag)
-            if not isinstance(tag_obj, str):
-                # since we landed here we'll always find a closing tag
-                match = self.CLOSING_TAG_REGEX.match(self.rawdata[self._offset:])
-                tag_obj.stop = self._offset + match.end()
-                if self.callback(tag_obj) is not False:
-                    self.found_tags.append(tag_obj)
+            if isinstance(tag_obj, self.Tag):
+                close_idx = self.rawdata.find('>', self._offset) + 1
+                tag_obj.closerange(self._offset, close_idx - self._offset)
+                self._nestedtags.pop().insert(0, tag_obj)
+                self.callback(tag_obj)
        except ValueError as exc:
            if isinstance(exc, compat_HTMLParseError):
                raise
-            elif self.STRICT:
-                raise compat_HTMLParseError(f'stray closing tag {tag!r}')
+            if self.STRICT:
+                raise compat_HTMLParseError(f'stray closing tag {tag!r}') from exc


-class ClassParser(HTMLTagParser):
-    def __init__(self, attribute, matchfunc, stop):
+class MatchingElementParser(HTMLTagParser):
+    """ optimized version of HTMLTagParser
+    """
+    def __init__(self, matchfunc):
        super().__init__()
-        self.search_attr = attribute
        self.matchfunc = matchfunc
-        self.stop = stop
-        self.processing = 0
+        self.found_none = True
+
+    def reset(self):
+        super().reset()
+        self.found_none = True
+
+    def callback(self, tag_obj):
+        raise self.AbortException()

    def predicate(self, tag, attrs):
-        if self.processing <= 0 and self.stop is not None and self._offset > self.stop:
-            self.abort()
-        string = dict(attrs).get(self.search_attr, '')
-        if self.matchfunc(string):
-            self.processing += 1
+        if self.found_none and self.matchfunc(tag, attrs):
+            self.found_none = False
            return True
        return False

-    def callback(self, tag_obj):
-        if self.stop is None:
-            self.abort(tag_obj)
-        self.processing -= 1
+    @staticmethod
+    def class_value_regex(class_name):
+        """ return a regex (string) matching a class attribute value which contains 'class_name'
+
+        'class_name' is escaped and must neither be preceded nor followed by a
+        word character or a hyphen, the remaining value may consist of
+        word characters, whitespace and hyphens only
+        """
+        return rf'[\w\s\-]*(?<![\w\-]){re.escape(class_name)}(?![\w\-])[\w\s\-]*'
+
+    @staticmethod
+    def matching_tag_regex(tag, attribute, value_regex, escape=True):
+        """ return a verbose regex (string) matching the begin of an opening tag
+        up to and including the quoted value of the requested attribute
+
+        @param tag:         regex for the tag name, inserted as is
+        @param attribute:   attribute name, always escaped
+        @param value_regex: attribute value, of a compiled pattern only the
+                            pattern source is used
+        @param escape:      escape 'value_regex' if it is not a compiled pattern
+        note: only values enclosed in single or double quotes are matched
+        """
+        if isinstance(value_regex, re.Pattern):
+            value_regex = value_regex.pattern
+        elif escape:
+            value_regex = re.escape(value_regex)
+
+        # the value is embedded with verbose mode switched off (?-x:...)
+        # to keep whitespace within the value significant
+        return rf'''(?x)
+            <(?:{tag})
+             (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
+             \s{re.escape(attribute)}\s*=\s*(?P<_q>['"])(?-x:{value_regex})(?P=_q)
+            '''

    @classmethod
-    def get_elements_html_by_class(cls, class_name, html):
-        regex = re.compile(rf'[\w\- ]*\b{re.escape(class_name)}\b')
-        it = re.finditer(rf'<.+ class=[\'"]{regex.pattern}', html)
-        start = stop = None
-        for match in it:
-            if start is None:
-                start = match.start()
-            else:
-                stop = match.end()
-        if start is None:
-            return []
-        parser = cls('class', lambda x: regex.match(x), stop)
-        return [tag.html() for tag in parser.taglist(html[start:])]
+    def iter_tags(cls, regex, html, *, matchfunc):
+        """ yield the Tag objects found at the positions where 'regex' matches
+
+        'regex' acts as a fast pre-filter: the parser is only started at the
+        offset of a match, matches starting within HTML comments are skipped
+        @param regex:       regex locating candidate opening tags in 'html'
+        @param html:        html string
+        @param matchfunc:   called as matchfunc(tag, attrs), selects the tag to return
+        note: the yielded Tag objects refer to the slice of 'html' passed to the
+        parser, which starts at the offset of the match
+        """
+        comments = HTMLCommentRanges(html)
+        parser = cls(matchfunc)
+        for match in re.finditer(regex, html):
+            # match offsets are increasing as required by HTMLCommentRanges
+            if match.start() not in comments:
+                yield from parser.taglist(html[match.start():], reset=True)

+    @classmethod
+    def tags_by_name(cls, tag, html):
+        def matchfunc(tag_str, _attrs):
+            return tag_str == tag

-class FirstMatchingElementParser(HTMLTagParser):
-    def __init__(self, matchfunc):
-        super().__init__()
-        self.matchfunc = matchfunc
-        self.found = False
+        yield from cls.iter_tags(rf'<\s*{re.escape(tag)}[\s>]', html, matchfunc=matchfunc)

-    def predicate(self, tag, attrs):
-        if not self.found and self.matchfunc(tag, attrs):
-            self.found = True
-            return True
-        return False
+    @classmethod
+    def tags_by_attribute(cls, attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
+        """ yield the Tag objects of all elements whose 'attribute' matches 'value'
+
+        @param attribute:    name of the attribute
+        @param value:        attribute value, a regex (string) or a compiled pattern
+        @param html:         html string
+        @param tag:          regex for the tag name
+        @param escape_value: treat a string 'value' as literal text instead of a regex
+        """
+        # the parser has to apply the same interpretation of 'value' as the
+        # pre-filter regex, otherwise literal values containing regex
+        # metacharacters (like 'a+b') pass the pre-filter but are never matched
+        if isinstance(value, re.Pattern):
+            value_re = value
+        else:
+            value_re = re.compile(re.escape(value) if escape_value else value)
+
+        def matchfunc(_tag_str, attrs):
+            # attributes without a value are reported as None and can't match
+            return any(attr == attribute and value_str is not None and value_re.fullmatch(value_str)
+                       for attr, value_str in attrs)
+
+        tag_regex = cls.matching_tag_regex(tag, attribute, value, escape_value)
+        yield from cls.iter_tags(tag_regex, html, matchfunc=matchfunc)
+
+    @classmethod
+    def extract_attributes(cls, html):
+        attr_dict = {}
+
+        def matchfunc(_tag, attrs):
+            attr_dict.update(attrs)
+            raise cls.AbortException()
+
+        with contextlib.suppress(cls.AbortException):
+            cls(matchfunc).feed(html)

-    def callback(self, obj):
-        self.abort(obj)
+        return attr_dict
+
+    @classmethod
+    def get_elements_text_and_html_by_tag(cls, tag, html):
+        return [tag.text_and_html() for tag in cls.tags_by_name(tag, html)]

    @classmethod
    def get_element_text_and_html_by_tag(cls, tag, html):
-        """
-        For the first element with the specified tag in the given HTML document
-        return its content (text) and the whole element (html)
-        """
-        parser = cls(lambda _tag, _: _tag == tag)
-        for tag_obj in parser.taglist(html):
-            return tag_obj.text_and_html()
-        raise compat_HTMLParseError(f'tag {tag} not found')
+        tag = next(cls.tags_by_name(tag, html), None)
+        return tag and tag.text_and_html()
+
+    @classmethod
+    def get_elements_text_and_html_by_attribute(cls, *args, **kwargs):
+        return [tag.text_and_html() for tag in cls.tags_by_attribute(*args, **kwargs)]
+
+    @classmethod
+    def get_elements_by_attribute(cls, *args, **kwargs):
+        """ return the content (text) of all elements matching the given attribute value
+        the arguments are passed on to tags_by_attribute()
+        """
+        texts = []
+        for tag_obj in cls.tags_by_attribute(*args, **kwargs):
+            text, _html = tag_obj.text_and_html()
+            texts.append(text)
+        return texts
+
+    @classmethod
+    def get_elements_html_by_attribute(cls, *args, **kwargs):
+        return [tag.html() for tag in cls.tags_by_attribute(*args, **kwargs)]
+
+    @classmethod
+    def get_element_by_attribute(cls, *args, **kwargs):
+        tag = next(cls.tags_by_attribute(*args, **kwargs), None)
+        return tag and tag.text()
+
+    @classmethod
+    def get_element_html_by_attribute(cls, *args, **kwargs):
+        tag = next(cls.tags_by_attribute(*args, **kwargs), None)
+        return tag and tag.html()
+
+    @classmethod
+    def get_elements_by_class(cls, class_name, html):
+        value = cls.class_value_regex(class_name)
+        return [tag.text() for tag
+                in cls.tags_by_attribute('class', value, html, escape_value=False)]
+
+    @classmethod
+    def get_elements_html_by_class(cls, class_name, html):
+        value = cls.class_value_regex(class_name)
+        return [tag.html() for tag
+                in cls.tags_by_attribute('class', value, html, escape_value=False)]
+
+    @classmethod
+    def get_elements_text_and_html_by_class(cls, class_name, html):
+        """ return a list of (text, html) tuples of all elements having the class 'class_name' """
+        value = cls.class_value_regex(class_name)
+        # return both parts as the name promises, not only the text
+        return [tag.text_and_html() for tag
+                in cls.tags_by_attribute('class', value, html, escape_value=False)]
+
+    @classmethod
+    def get_element_html_by_class(cls, class_name, html):
+        value = cls.class_value_regex(class_name)
+        tag = next(cls.tags_by_attribute('class', value, html, escape_value=False), None)
+        return tag and tag.html()
+
+    @classmethod
+    def get_element_by_class(cls, class_name, html):
+        value = cls.class_value_regex(class_name)
+        tag = next(cls.tags_by_attribute('class', value, html, escape_value=False), None)
+        return tag and tag.text()