Use a wrapper around xml.etree.ElementTree.fromstring in python 2.x (#7178)

Attributes aren't unicode objects, so they couldn't be directly used in info_dict fields (for example '--write-description' doesn't work with bytes).
pull/8/head
Jaime Marquínez Ferrándiz 9 years ago
parent 755ff8d22c
commit 36e6f62cd0

@ -13,8 +13,10 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from youtube_dl.utils import get_filesystem_encoding from youtube_dl.utils import get_filesystem_encoding
from youtube_dl.compat import ( from youtube_dl.compat import (
compat_getenv, compat_getenv,
compat_etree_fromstring,
compat_expanduser, compat_expanduser,
compat_shlex_split, compat_shlex_split,
compat_str,
compat_urllib_parse_unquote, compat_urllib_parse_unquote,
compat_urllib_parse_unquote_plus, compat_urllib_parse_unquote_plus,
) )
@ -71,5 +73,10 @@ class TestCompat(unittest.TestCase):
def test_compat_shlex_split(self): def test_compat_shlex_split(self):
self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two']) self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two'])
def test_compat_etree_fromstring(self):
xml = '<el foo="bar"></el>'
doc = compat_etree_fromstring(xml.encode('utf-8'))
self.assertTrue(isinstance(doc.attrib['foo'], compat_str))
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

@ -68,6 +68,9 @@ from youtube_dl.utils import (
cli_valueless_option, cli_valueless_option,
cli_bool_option, cli_bool_option,
) )
from youtube_dl.compat import (
compat_etree_fromstring,
)
class TestUtil(unittest.TestCase): class TestUtil(unittest.TestCase):
@ -242,7 +245,7 @@ class TestUtil(unittest.TestCase):
<node x="b" y="d" /> <node x="b" y="d" />
<node x="" /> <node x="" />
</root>''' </root>'''
doc = xml.etree.ElementTree.fromstring(testxml) doc = compat_etree_fromstring(testxml)
self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n'), None) self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n'), None)
self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n', 'v'), None) self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n', 'v'), None)
@ -263,7 +266,7 @@ class TestUtil(unittest.TestCase):
<url>http://server.com/download.mp3</url> <url>http://server.com/download.mp3</url>
</media:song> </media:song>
</root>''' </root>'''
doc = xml.etree.ElementTree.fromstring(testxml) doc = compat_etree_fromstring(testxml)
find = lambda p: doc.find(xpath_with_ns(p, {'media': 'http://example.com/'})) find = lambda p: doc.find(xpath_with_ns(p, {'media': 'http://example.com/'}))
self.assertTrue(find('media:song') is not None) self.assertTrue(find('media:song') is not None)
self.assertEqual(find('media:song/media:author').text, 'The Author') self.assertEqual(find('media:song/media:author').text, 'The Author')
@ -285,7 +288,7 @@ class TestUtil(unittest.TestCase):
<p>Foo</p> <p>Foo</p>
</div> </div>
</root>''' </root>'''
doc = xml.etree.ElementTree.fromstring(testxml) doc = compat_etree_fromstring(testxml)
self.assertEqual(xpath_text(doc, 'div/p'), 'Foo') self.assertEqual(xpath_text(doc, 'div/p'), 'Foo')
self.assertEqual(xpath_text(doc, 'div/bar', default='default'), 'default') self.assertEqual(xpath_text(doc, 'div/bar', default='default'), 'default')
self.assertTrue(xpath_text(doc, 'div/bar') is None) self.assertTrue(xpath_text(doc, 'div/bar') is None)
@ -297,7 +300,7 @@ class TestUtil(unittest.TestCase):
<p x="a">Foo</p> <p x="a">Foo</p>
</div> </div>
</root>''' </root>'''
doc = xml.etree.ElementTree.fromstring(testxml) doc = compat_etree_fromstring(testxml)
self.assertEqual(xpath_attr(doc, 'div/p', 'x'), 'a') self.assertEqual(xpath_attr(doc, 'div/p', 'x'), 'a')
self.assertEqual(xpath_attr(doc, 'div/bar', 'x'), None) self.assertEqual(xpath_attr(doc, 'div/bar', 'x'), None)
self.assertEqual(xpath_attr(doc, 'div/p', 'y'), None) self.assertEqual(xpath_attr(doc, 'div/p', 'y'), None)

@ -14,6 +14,7 @@ import socket
import subprocess import subprocess
import sys import sys
import itertools import itertools
import xml.etree.ElementTree
try: try:
@ -212,6 +213,29 @@ try:
except ImportError: # Python 2.6 except ImportError: # Python 2.6
from xml.parsers.expat import ExpatError as compat_xml_parse_error from xml.parsers.expat import ExpatError as compat_xml_parse_error
if sys.version_info[0] >= 3:
compat_etree_fromstring = xml.etree.ElementTree.fromstring
else:
# on python 2.x the the attributes of a node are str objects instead of
# unicode
etree = xml.etree.ElementTree
# on 2.6 XML doesn't have a parser argument, function copied from CPython
# 2.7 source
def _XML(text, parser=None):
if not parser:
parser = etree.XMLParser(target=etree.TreeBuilder())
parser.feed(text)
return parser.close()
def _element_factory(*args, **kwargs):
el = etree.Element(*args, **kwargs)
for k, v in el.items():
el.set(k, v.decode('utf-8'))
return el
def compat_etree_fromstring(text):
return _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory)))
try: try:
from urllib.parse import parse_qs as compat_parse_qs from urllib.parse import parse_qs as compat_parse_qs
@ -507,6 +531,7 @@ __all__ = [
'compat_chr', 'compat_chr',
'compat_cookiejar', 'compat_cookiejar',
'compat_cookies', 'compat_cookies',
'compat_etree_fromstring',
'compat_expanduser', 'compat_expanduser',
'compat_get_terminal_size', 'compat_get_terminal_size',
'compat_getenv', 'compat_getenv',

@ -5,10 +5,10 @@ import io
import itertools import itertools
import os import os
import time import time
import xml.etree.ElementTree as etree
from .fragment import FragmentFD from .fragment import FragmentFD
from ..compat import ( from ..compat import (
compat_etree_fromstring,
compat_urlparse, compat_urlparse,
compat_urllib_error, compat_urllib_error,
compat_urllib_parse_urlparse, compat_urllib_parse_urlparse,
@ -290,7 +290,7 @@ class F4mFD(FragmentFD):
man_url = urlh.geturl() man_url = urlh.geturl()
manifest = urlh.read() manifest = urlh.read()
doc = etree.fromstring(manifest) doc = compat_etree_fromstring(manifest)
formats = [(int(f.attrib.get('bitrate', -1)), f) formats = [(int(f.attrib.get('bitrate', -1)), f)
for f in self._get_unencrypted_media(doc)] for f in self._get_unencrypted_media(doc)]
if requested_bitrate is None: if requested_bitrate is None:

@ -2,7 +2,6 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import re import re
import xml.etree.ElementTree
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
@ -14,7 +13,10 @@ from ..utils import (
remove_end, remove_end,
unescapeHTML, unescapeHTML,
) )
from ..compat import compat_HTTPError from ..compat import (
compat_etree_fromstring,
compat_HTTPError,
)
class BBCCoUkIE(InfoExtractor): class BBCCoUkIE(InfoExtractor):
@ -344,7 +346,7 @@ class BBCCoUkIE(InfoExtractor):
url, programme_id, 'Downloading media selection XML') url, programme_id, 'Downloading media selection XML')
except ExtractorError as ee: except ExtractorError as ee:
if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8')) media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8'))
else: else:
raise raise
return self._process_media_selector(media_selection, programme_id) return self._process_media_selector(media_selection, programme_id)

@ -4,9 +4,11 @@ from __future__ import unicode_literals
import re import re
import itertools import itertools
import json import json
import xml.etree.ElementTree as ET
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import (
compat_etree_fromstring,
)
from ..utils import ( from ..utils import (
int_or_none, int_or_none,
unified_strdate, unified_strdate,
@ -88,7 +90,7 @@ class BiliBiliIE(InfoExtractor):
except ValueError: except ValueError:
pass pass
lq_doc = ET.fromstring(lq_page) lq_doc = compat_etree_fromstring(lq_page)
lq_durls = lq_doc.findall('./durl') lq_durls = lq_doc.findall('./durl')
hq_doc = self._download_xml( hq_doc = self._download_xml(

@ -3,10 +3,10 @@ from __future__ import unicode_literals
import re import re
import json import json
import xml.etree.ElementTree
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_etree_fromstring,
compat_parse_qs, compat_parse_qs,
compat_str, compat_str,
compat_urllib_parse, compat_urllib_parse,
@ -119,7 +119,7 @@ class BrightcoveIE(InfoExtractor):
object_str = fix_xml_ampersands(object_str) object_str = fix_xml_ampersands(object_str)
try: try:
object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8')) object_doc = compat_etree_fromstring(object_str.encode('utf-8'))
except compat_xml_parse_error: except compat_xml_parse_error:
return return

@ -10,7 +10,6 @@ import re
import socket import socket
import sys import sys
import time import time
import xml.etree.ElementTree
from ..compat import ( from ..compat import (
compat_cookiejar, compat_cookiejar,
@ -23,6 +22,7 @@ from ..compat import (
compat_urllib_request, compat_urllib_request,
compat_urlparse, compat_urlparse,
compat_str, compat_str,
compat_etree_fromstring,
) )
from ..utils import ( from ..utils import (
NO_DEFAULT, NO_DEFAULT,
@ -461,7 +461,7 @@ class InfoExtractor(object):
return xml_string return xml_string
if transform_source: if transform_source:
xml_string = transform_source(xml_string) xml_string = transform_source(xml_string)
return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) return compat_etree_fromstring(xml_string.encode('utf-8'))
def _download_json(self, url_or_request, video_id, def _download_json(self, url_or_request, video_id,
note='Downloading JSON metadata', note='Downloading JSON metadata',

@ -5,12 +5,12 @@ import re
import json import json
import base64 import base64
import zlib import zlib
import xml.etree.ElementTree
from hashlib import sha1 from hashlib import sha1
from math import pow, sqrt, floor from math import pow, sqrt, floor
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_etree_fromstring,
compat_urllib_parse, compat_urllib_parse,
compat_urllib_parse_unquote, compat_urllib_parse_unquote,
compat_urllib_request, compat_urllib_request,
@ -234,7 +234,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
return output return output
def _extract_subtitles(self, subtitle): def _extract_subtitles(self, subtitle):
sub_root = xml.etree.ElementTree.fromstring(subtitle) sub_root = compat_etree_fromstring(subtitle)
return [{ return [{
'ext': 'srt', 'ext': 'srt',
'data': self._convert_subtitles_to_srt(sub_root), 'data': self._convert_subtitles_to_srt(sub_root),

@ -1,10 +1,10 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import re import re
import xml.etree.ElementTree
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_etree_fromstring,
compat_urllib_request, compat_urllib_request,
) )
from ..utils import ( from ..utils import (
@ -97,7 +97,7 @@ class VevoIE(InfoExtractor):
if last_version['version'] == -1: if last_version['version'] == -1:
raise ExtractorError('Unable to extract last version of the video') raise ExtractorError('Unable to extract last version of the video')
renditions = xml.etree.ElementTree.fromstring(last_version['data']) renditions = compat_etree_fromstring(last_version['data'])
formats = [] formats = []
# Already sorted from worst to best quality # Already sorted from worst to best quality
for rend in renditions.findall('rendition'): for rend in renditions.findall('rendition'):
@ -114,7 +114,7 @@ class VevoIE(InfoExtractor):
def _formats_from_smil(self, smil_xml): def _formats_from_smil(self, smil_xml):
formats = [] formats = []
smil_doc = xml.etree.ElementTree.fromstring(smil_xml.encode('utf-8')) smil_doc = compat_etree_fromstring(smil_xml.encode('utf-8'))
els = smil_doc.findall('.//{http://www.w3.org/2001/SMIL20/Language}video') els = smil_doc.findall('.//{http://www.w3.org/2001/SMIL20/Language}video')
for el in els: for el in els:
src = el.attrib['src'] src = el.attrib['src']

@ -36,6 +36,7 @@ import zlib
from .compat import ( from .compat import (
compat_basestring, compat_basestring,
compat_chr, compat_chr,
compat_etree_fromstring,
compat_html_entities, compat_html_entities,
compat_http_client, compat_http_client,
compat_kwargs, compat_kwargs,
@ -1974,7 +1975,7 @@ def dfxp2srt(dfxp_data):
return out return out
dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8')) dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
out = [] out = []
paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p') paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')

Loading…
Cancel
Save