[utils] Add get_element_by_class

For #9950
pull/8/head
Yen Chi Hsuan 8 years ago
parent ab49d7a9fa
commit 84c237fb8a
No known key found for this signature in database
GPG Key ID: 3FDDD575826C5C30

@@ -33,6 +33,7 @@ from youtube_dl.utils import (
ExtractorError, ExtractorError,
find_xpath_attr, find_xpath_attr,
fix_xml_ampersands, fix_xml_ampersands,
get_element_by_class,
InAdvancePagedList, InAdvancePagedList,
intlist_to_bytes, intlist_to_bytes,
is_html, is_html,
@@ -991,5 +992,13 @@ The first line
self.assertEqual(urshift(3, 1), 1) self.assertEqual(urshift(3, 1), 1)
self.assertEqual(urshift(-3, 1), 2147483646) self.assertEqual(urshift(-3, 1), 2147483646)
def test_get_element_by_class(self):
html = '''
<span class="foo bar">nice</span>
'''
self.assertEqual(get_element_by_class('foo', html), 'nice')
self.assertEqual(get_element_by_class('no-such-class', html), None)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

@@ -310,9 +310,17 @@ def get_element_by_id(id, html):
return get_element_by_attribute('id', id, html) return get_element_by_attribute('id', id, html)
def get_element_by_attribute(attribute, value, html): def get_element_by_class(class_name, html):
return get_element_by_attribute(
'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
html, escape_value=False)
def get_element_by_attribute(attribute, value, html, escape_value=True):
"""Return the content of the tag with the specified attribute in the passed HTML document""" """Return the content of the tag with the specified attribute in the passed HTML document"""
value = re.escape(value) if escape_value else value
m = re.search(r'''(?xs) m = re.search(r'''(?xs)
<([a-zA-Z0-9:._-]+) <([a-zA-Z0-9:._-]+)
(?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*? (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
@@ -321,7 +329,7 @@ def get_element_by_attribute(attribute, value, html):
\s*> \s*>
(?P<content>.*?) (?P<content>.*?)
</\1> </\1>
''' % (re.escape(attribute), re.escape(value)), html) ''' % (re.escape(attribute), value), html)
if not m: if not m:
return None return None

Loading…
Cancel
Save