[extractor/common] Extract f4m and m3u8 formats, subtitles and info

10 years ago · a107193e4b
parent 3f125c8c70
commit a107193e4b
1 changed files with 149 additions and 51 deletions
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@ -18,6 +18,7 @@ from ..compat import (
    compat_HTTPError,
    compat_http_client,
    compat_urllib_error,
+    compat_urllib_parse,
    compat_urllib_parse_urlparse,
    compat_urllib_request,
    compat_urlparse,
@ -37,6 +38,7 @@ from ..utils import (
    RegexNotFoundError,
    sanitize_filename,
    unescapeHTML,
+    url_basename,
 )


@ -978,69 +980,165 @@ class InfoExtractor(object):
        self._sort_formats(formats)
        return formats

-    # TODO: improve extraction
-    def _extract_smil_formats(self, smil_url, video_id, fatal=True):
-        smil = self._download_xml(
-            smil_url, video_id, 'Downloading SMIL file',
-            'Unable to download SMIL file', fatal=fatal)
+    @staticmethod
+    def _xpath_ns(path, namespace=None):
+        """
+        Qualify each step of a '/'-separated element path with a namespace.
+
+        Returns path unchanged when namespace is empty. Otherwise every
+        non-empty step other than '.' is rewritten as '{namespace}step',
+        while empty steps (such as those produced by a '//' separator)
+        and '.' are kept as they are.
+        """
+        if not namespace:
+            return path
+        out = []
+        for c in path.split('/'):
+            # Empty and '.' steps carry no tag name, so there is nothing to qualify
+            if not c or c == '.':
+                out.append(c)
+            else:
+                out.append('{%s}%s' % (namespace, c))
+        return '/'.join(out)
+
+    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
+        """
+        Download the SMIL document at smil_url and return its formats.
+
+        Returns an empty list when the download yields False (which is only
+        expected when fatal is falsy). Otherwise the namespace is taken from
+        the root tag and parsing is delegated to _parse_smil_formats;
+        f4m_params is passed through to it unchanged.
+        """
+        smil = self._download_smil(smil_url, video_id, fatal=fatal)
+
        if smil is False:
            assert not fatal
            return []

-        base = smil.find('./head/meta').get('base')
+        # The root tag looks like '{namespace}smil' for namespaced documents;
+        # namespace stays None when the pattern does not match
+        namespace = self._search_regex(
+            r'{([^}]+)?}smil', smil.tag, 'namespace', default=None)

-        formats = []
-        rtmp_count = 0
-        if smil.findall('./body/seq/video'):
-            video = smil.findall('./body/seq/video')[0]
-            fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
-            formats.extend(fmts)
-        else:
-            for video in smil.findall('./body/switch/video'):
-                fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
-                formats.extend(fmts)
+        return self._parse_smil_formats(
+            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)

-        self._sort_formats(formats)
+    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
+        """
+        Download the SMIL document at smil_url and return the dict built
+        by _parse_smil, or an empty dict when the download yields False.
+        """
+        smil = self._download_smil(smil_url, video_id, fatal=fatal)
+        if smil is False:
+            return {}
+        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)

-        return formats
+    def _download_smil(self, smil_url, video_id, fatal=True):
+        """ Fetch smil_url through _download_xml with SMIL-specific status and error messages """
+        return self._download_xml(
+            smil_url, video_id, 'Downloading SMIL file',
+            'Unable to download SMIL file', fatal=fatal)
+
+    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
+        """
+        Build a dict with 'id', 'title', 'description', 'formats' and
+        'subtitles' from an already parsed SMIL document.
+
+        The video_id argument is only used while extracting formats; the
+        returned 'id' is derived from the basename of smil_url instead.
+        'title' falls back to that id when no title meta element is found.
+        """
+        namespace = self._search_regex(
+            r'{([^}]+)?}smil', smil.tag, 'namespace', default=None)
+
+        formats = self._parse_smil_formats(
+            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
+        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
+
+        # From here on the id is the SMIL file name without its extension,
+        # replacing the video_id passed in by the caller
+        video_id = os.path.splitext(url_basename(smil_url))[0]
+        title = None
+        description = None
+        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
+            name = meta.attrib.get('name')
+            content = meta.attrib.get('content')
+            if not name or not content:
+                continue
+            # The first non-empty match wins for both title and description
+            if not title and name == 'title':
+                title = content
+            elif not description and name in ('description', 'abstract'):
+                description = content
+
+        return {
+            'id': video_id,
+            'title': title or video_id,
+            'description': description,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None):
+        """
+        Collect the formats described by the video elements of a parsed
+        SMIL document and return them sorted with _sort_formats.
+
+        The base URL is smil_url unless a head/meta element provides a
+        'base' or 'httpBase' attribute (the first such element wins).
+        Video elements without 'src' are skipped. Each remaining one is
+        handled by the first matching case:
+          * proto 'rtmp' or a streamer/base starting with 'rtmp':
+            a single RTMP format with 'src' as play path;
+          * proto 'm3u8' or an m3u8 extension: _extract_m3u8_formats;
+          * f4m extension: _extract_f4m_formats, with f4m_params (or a
+            built-in default set when f4m_params is empty) appended to
+            the manifest URL as query parameters;
+          * any other URL starting with 'http': a single direct format.
+        """
+        base = smil_url
+        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
+            b = meta.get('base') or meta.get('httpBase')
+            if b:
+                base = b
+                break
+
+        formats = []
+        rtmp_count = 0
+        http_count = 0

-    def _parse_smil_video(self, video, video_id, base, rtmp_count):
+        videos = smil.findall(self._xpath_ns('.//video', namespace))
+        for video in videos:
            src = video.get('src')
            if not src:
-            return [], rtmp_count
+                continue
+
            bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
+            filesize = int_or_none(video.get('size') or video.get('fileSize'))
            width = int_or_none(video.get('width'))
            height = int_or_none(video.get('height'))
            proto = video.get('proto')
-        if not proto:
-            if base:
-                if base.startswith('rtmp'):
-                    proto = 'rtmp'
-                elif base.startswith('http'):
-                    proto = 'http'
            ext = video.get('ext')
-        if proto == 'm3u8':
-            return self._extract_m3u8_formats(src, video_id, ext), rtmp_count
-        elif proto == 'rtmp':
-            rtmp_count += 1
+            src_ext = determine_ext(src)
            streamer = video.get('streamer') or base
-            return ([{
+
+            if proto == 'rtmp' or streamer.startswith('rtmp'):
+                rtmp_count += 1
+                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
+                    'filesize': filesize,
                    'width': width,
                    'height': height,
-            }], rtmp_count)
-        elif proto.startswith('http'):
-            return ([{
-                'url': base + src,
-                'ext': ext or 'flv',
+                })
+                continue
+
+            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
+
+            if proto == 'm3u8' or src_ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    src_url, video_id, ext or 'mp4', m3u8_id='hls'))
+                continue
+
+            if src_ext == 'f4m':
+                f4m_url = src_url
+                if not f4m_params:
+                    f4m_params = {
+                        'hdcore': '3.2.0',
+                        'plugin': 'flowplayer-3.2.0.1',
+                    }
+                f4m_url += '&' if '?' in f4m_url else '?'
+                # Append the query as text: encoding it to bytes first makes
+                # this concatenation raise TypeError on Python 3 (str + bytes)
+                f4m_url += compat_urllib_parse.urlencode(f4m_params)
+                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
+                continue
+
+            if src_url.startswith('http'):
+                http_count += 1
+                formats.append({
+                    'url': src_url,
+                    'ext': ext or src_ext or 'flv',
+                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
+                    'filesize': filesize,
                    'width': width,
                    'height': height,
-            }], rtmp_count)
+                })
+                continue
+
+        self._sort_formats(formats)
+
+        return formats
+
+    def _parse_smil_subtitles(self, smil, namespace=None):
+        """
+        Return a dict mapping a language to a list of {'url', 'ext'} dicts,
+        one entry per textstream element of the SMIL document that has a
+        'src' attribute.
+
+        The language comes from 'systemLanguage' or 'systemLanguageName'.
+        The extension comes from the 'ext' attribute or from the source
+        URL, falling back to 'srt' for the 'text/srt' type.
+        """
+        subtitles = {}
+        # NOTE(review): num is never used below - confirm enumerate() can be dropped
+        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
+            src = textstream.get('src')
+            if not src:
+                continue
+            ext = textstream.get('ext') or determine_ext(src)
+            if not ext:
+                type_ = textstream.get('type')
+                if type_ == 'text/srt':
+                    ext = 'srt'
+            # NOTE(review): lang looks like it ends up None when neither attribute
+            # is present, making None a key of the result - confirm consumers cope
+            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName')
+            subtitles.setdefault(lang, []).append({
+                'url': src,
+                'ext': ext,
+            })
+        return subtitles

    def _live_title(self, name):
        """ Generate the title for a live video """