diff --git a/yt_dlp/downloader/__init__.py b/yt_dlp/downloader/__init__.py
index c7ba91862..82d7623f6 100644
--- a/yt_dlp/downloader/__init__.py
+++ b/yt_dlp/downloader/__init__.py
@@ -22,6 +22,7 @@ from .http import HttpFD
from .rtmp import RtmpFD
from .rtsp import RtspFD
from .ism import IsmFD
+from .mhtml import MhtmlFD
from .niconico import NiconicoDmcFD
from .youtube_live_chat import YoutubeLiveChatReplayFD
from .external import (
@@ -39,6 +40,7 @@ PROTOCOL_MAP = {
'f4m': F4mFD,
'http_dash_segments': DashSegmentsFD,
'ism': IsmFD,
+ 'mhtml': MhtmlFD,
'niconico_dmc': NiconicoDmcFD,
'youtube_live_chat_replay': YoutubeLiveChatReplayFD,
}
diff --git a/yt_dlp/downloader/mhtml.py b/yt_dlp/downloader/mhtml.py
new file mode 100644
index 000000000..81d95c7cb
--- /dev/null
+++ b/yt_dlp/downloader/mhtml.py
@@ -0,0 +1,202 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import io
+import quopri
+import re
+import uuid
+
+from .fragment import FragmentFD
+from ..utils import (
+ escapeHTML,
+ formatSeconds,
+ srt_subtitles_timecode,
+ urljoin,
+)
+from ..version import __version__ as YT_DLP_VERSION
+
+
+class MhtmlFD(FragmentFD):
+ FD_NAME = 'mhtml'
+
+    # Readable source of the slideshow stylesheet. It is minified right below,
+    # so only the compact form is ever stored on the class.
+    _STYLESHEET = """\
+html, body {
+ margin: 0;
+ padding: 0;
+ height: 100vh;
+}
+
+html {
+ overflow-y: scroll;
+ scroll-snap-type: y mandatory;
+}
+
+body {
+ scroll-snap-type: y mandatory;
+ display: flex;
+ flex-flow: column;
+}
+
+body > figure {
+ max-width: 100vw;
+ max-height: 100vh;
+ scroll-snap-align: center;
+}
+
+body > figure > figcaption {
+ text-align: center;
+ height: 2.5em;
+}
+
+body > figure > img {
+ display: block;
+ margin: auto;
+ max-width: 100%;
+ max-height: calc(100vh - 5em);
+}
+"""
+    # Inner call: fold every whitespace run (newlines included) into one space.
+    # Outer call: delete each space that touches punctuation, so the only
+    # spaces left are those between two word/hyphen characters, such as
+    # "y mandatory" or "100vh - 5em".
+    _STYLESHEET = re.sub(
+        r'\B \B|(?<=[\w\-]) (?=[^\w\-])|(?<=[^\w\-]) (?=[\w\-])', '',
+        re.sub(r'\s+', ' ', _STYLESHEET))
+
+    @staticmethod
+    def _escape_mime(s):
+        """Return *s* as a single RFC 2047 "Q" encoded-word in UTF-8.
+
+        The result is pure ASCII and safe to place in a MIME header such as
+        ``Subject:``. Spaces become ``_``; ASCII letters, digits and
+        ``!*+-/`` are kept literally; every other octet of the UTF-8
+        encoding is written as ``=XX``.
+
+        The encoding is done by hand rather than via ``quopri`` because the
+        header mode of ``quopri`` leaves ``?`` untouched (so a title containing
+        ``?=`` would end the encoded-word early) and inserts soft line breaks
+        into long input, which have no meaning inside an encoded-word.
+
+        The word is not split to honour the 75-character limit that RFC 2047
+        recommends for a single encoded-word.
+        """
+        # Octets RFC 2047 section 5 (3) allows unescaped in any header context.
+        literal = frozenset(
+            b'abcdefghijklmnopqrstuvwxyz'
+            b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+            b'0123456789!*+-/')
+        encoded = ''.join(
+            '_' if octet == 0x20
+            else chr(octet) if octet in literal
+            else '=%02X' % octet
+            for octet in s.encode('utf-8'))
+        return '=?utf-8?Q?' + encoded + '?='
+
+    def _gen_cid(self, i, fragment, frag_boundary):
+        """Return the Content-ID that identifies fragment number *i*.
+
+        The ID combines the fragment index with the file's MIME boundary
+        under a fixed ``.invalid`` host name. *fragment* is accepted but
+        not used.
+        """
+        cid_parts = (i, frag_boundary)
+        return '%u.%s@yt-dlp.github.io.invalid' % cid_parts
+
+    def _gen_stub(self, *, fragments, frag_boundary, title):
+        """Build the HTML page that shows every fragment as one slide.
+
+        Each fragment becomes a ``<figure>`` whose ``<img>`` points at the
+        matching MIME part through a ``cid:`` URL from ``_gen_cid``. When a
+        fragment carries a usable ``duration``, its caption also shows the
+        running start/end time; otherwise only the slide number is shown.
+
+        Args (keyword-only):
+            fragments: sequence of fragment dicts.
+            frag_boundary: MIME boundary string, used to derive Content-IDs.
+            title: page title; HTML-escaped before insertion.
+
+        Returns:
+            The page as a ``str``.
+        """
+        output = io.StringIO()
+
+        # NOTE(review): the markup in this method was rebuilt after its tags
+        # were lost from the patch text; confirm it against the upstream file.
+        output.write((
+            '<!DOCTYPE html>'
+            '<html>'
+            '<head>'
+            ''  '<meta name="generator" content="yt-dlp {version}">'
+            ''  '<title>{title}</title>'
+            ''  '<style>{styles}</style>'
+            '<body>'
+        ).format(
+            version=escapeHTML(YT_DLP_VERSION),
+            styles=self._STYLESHEET,
+            title=escapeHTML(title)
+        ))
+
+        t0 = 0
+        for i, frag in enumerate(fragments):
+            output.write('<figure>')
+            try:
+                t1 = t0 + frag['duration']
+                output.write((
+                    '<figcaption>Slide #{num}: {t0} – {t1} (duration: {duration})</figcaption>'
+                ).format(
+                    num=i + 1,
+                    t0=srt_subtitles_timecode(t0),
+                    t1=srt_subtitles_timecode(t1),
+                    duration=formatSeconds(frag['duration'], msec=True)
+                ))
+            except (KeyError, ValueError, TypeError):
+                # Duration missing or unusable: the timeline is unknown from
+                # here on, so t0 becomes None and later slides also take
+                # this branch (None + number raises TypeError).
+                t1 = None
+                output.write((
+                    '<figcaption>Slide #{num}</figcaption>'
+                ).format(num=i + 1))
+            output.write('<img src="cid:{cid}">'.format(
+                cid=self._gen_cid(i, frag, frag_boundary)))
+            output.write('</figure>')
+            t0 = t1
+
+        return output.getvalue()
+
+    def real_download(self, filename, info_dict):
+        """Download all fragments and assemble them into one MHTML file.
+
+        The output is a ``multipart/related`` MIME message. Its first part is
+        the HTML page from ``_gen_stub``; every following part is one
+        downloaded image, addressed by the Content-ID from ``_gen_cid``.
+
+        Reads ``fragments``, ``title`` and ``webpage_url`` (required) and
+        ``fragment_base_url`` (optional) from *info_dict*. With the ``test``
+        param set, only the first fragment is processed.
+
+        Returns True. A fragment whose download reports failure is skipped.
+        """
+        fragment_base_url = info_dict.get('fragment_base_url')
+        fragments = info_dict['fragments'][:1] if self.params.get(
+            'test', False) else info_dict['fragments']
+        title = info_dict['title']
+        origin = info_dict['webpage_url']
+
+        ctx = {
+            'filename': filename,
+            'total_frags': len(fragments),
+        }
+
+        self._prepare_and_start_frag_download(ctx)
+
+        # Only initialised when ctx has no 'extra_state' yet. Presumably the
+        # base class restores it when resuming, which keeps the boundary
+        # stable across runs -- confirm in FragmentFD.
+        extra_state = ctx.setdefault('extra_state', {
+            'header_written': False,
+            'mime_boundary': str(uuid.uuid4()).replace('-', ''),
+        })
+
+        frag_boundary = extra_state['mime_boundary']
+
+        if not extra_state['header_written']:
+            stub = self._gen_stub(
+                fragments=fragments,
+                frag_boundary=frag_boundary,
+                title=title
+            )
+
+            # NOTE(review): the From/To placeholder address was rebuilt after
+            # being lost from the patch text. It reuses the reserved
+            # ".invalid" domain from _gen_cid; confirm against upstream.
+            ctx['dest_stream'].write((
+                'MIME-Version: 1.0\r\n'
+                'From: <nowhere@yt-dlp.github.io.invalid>\r\n'
+                'To: <nowhere@yt-dlp.github.io.invalid>\r\n'
+                'Subject: {title}\r\n'
+                'Content-type: multipart/related; '
+                ''  'boundary="{boundary}"; '
+                ''  'type="text/html"\r\n'
+                'X.yt-dlp.Origin: {origin}\r\n'
+                '\r\n'
+                '--{boundary}\r\n'
+                'Content-Type: text/html; charset=utf-8\r\n'
+                'Content-Length: {length}\r\n'
+                '\r\n'
+                '{stub}\r\n'
+            ).format(
+                origin=origin,
+                boundary=frag_boundary,
+                # Length in bytes as written to the file, not in characters:
+                # the stub may contain non-ASCII text such as the title.
+                length=len(stub.encode('utf-8')),
+                title=self._escape_mime(title),
+                stub=stub
+            ).encode('utf-8'))
+            extra_state['header_written'] = True
+
+        for i, fragment in enumerate(fragments):
+            # Skip fragments up to ctx['fragment_index'] (presumably the
+            # number already written by an earlier run).
+            if (i + 1) <= ctx['fragment_index']:
+                continue
+
+            fragment_url = urljoin(fragment_base_url, fragment['path'])
+            success, frag_content = self._download_fragment(ctx, fragment_url, info_dict)
+            if not success:
+                continue
+
+            # Sniff the image type from its magic bytes; JPEG is the fallback.
+            # All comparisons must be bytes-to-bytes: a bytes slice never
+            # equals a str.
+            if frag_content.startswith(b'\x89PNG\r\n\x1a\n'):
+                mime_type = b'image/png'
+            elif frag_content.startswith((b'GIF87a', b'GIF89a')):
+                mime_type = b'image/gif'
+            elif frag_content.startswith(b'RIFF') and frag_content[8:12] == b'WEBP':
+                mime_type = b'image/webp'
+            else:
+                mime_type = b'image/jpeg'
+
+            frag_header = io.BytesIO()
+            frag_header.write(
+                b'--%b\r\n' % frag_boundary.encode('us-ascii'))
+            frag_header.write(
+                b'Content-ID: <%b>\r\n' % self._gen_cid(i, fragment, frag_boundary).encode('us-ascii'))
+            frag_header.write(
+                b'Content-type: %b\r\n' % mime_type)
+            frag_header.write(
+                b'Content-length: %u\r\n' % len(frag_content))
+            frag_header.write(
+                b'Content-location: %b\r\n' % fragment_url.encode('us-ascii'))
+            frag_header.write(
+                b'X.yt-dlp.Duration: %f\r\n' % fragment['duration'])
+            frag_header.write(b'\r\n')
+            self._append_fragment(
+                ctx, frag_header.getvalue() + frag_content + b'\r\n')
+
+        # Closing delimiter of the multipart message.
+        ctx['dest_stream'].write(
+            b'--%b--\r\n\r\n' % frag_boundary.encode('us-ascii'))
+        self._finish_frag_download(ctx)
+        return True
diff --git a/yt_dlp/extractor/canvas.py b/yt_dlp/extractor/canvas.py
index 1b7c1d2ff..575f3d25c 100644
--- a/yt_dlp/extractor/canvas.py
+++ b/yt_dlp/extractor/canvas.py
@@ -24,7 +24,7 @@ class CanvasIE(InfoExtractor):
_VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
- 'md5': '68993eda72ef62386a15ea2cf3c93107',
+ 'md5': '37b2b7bb9b3dcaa05b67058dc3a714a9',
'info_dict': {
'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
@@ -32,9 +32,9 @@ class CanvasIE(InfoExtractor):
'title': 'Nachtwacht: De Greystook',
'description': 'Nachtwacht: De Greystook',
'thumbnail': r're:^https?://.*\.jpg$',
- 'duration': 1468.04,
+ 'duration': 1468.02,
},
- 'expected_warnings': ['is not a supported codec', 'Unknown MIME type'],
+ 'expected_warnings': ['is not a supported codec'],
}, {
'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
'only_matching': True,
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index 3603924e4..1524fcb15 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -2126,6 +2126,7 @@ class InfoExtractor(object):
format_id.append(str(format_index))
f = {
'format_id': '-'.join(format_id),
+ 'format_note': name,
'format_index': format_index,
'url': manifest_url,
'manifest_url': m3u8_url,
@@ -2637,7 +2638,7 @@ class InfoExtractor(object):
mime_type = representation_attrib['mimeType']
content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
- if content_type in ('video', 'audio', 'text'):
+ if content_type in ('video', 'audio', 'text') or mime_type == 'image/jpeg':
base_url = ''
for element in (representation, adaptation_set, period, mpd_doc):
base_url_e = element.find(_add_ns('BaseURL'))
@@ -2654,9 +2655,15 @@ class InfoExtractor(object):
url_el = representation.find(_add_ns('BaseURL'))
filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
bandwidth = int_or_none(representation_attrib.get('bandwidth'))
+ if representation_id is not None:
+ format_id = representation_id
+ else:
+ format_id = content_type
+ if mpd_id:
+ format_id = mpd_id + '-' + format_id
if content_type in ('video', 'audio'):
f = {
- 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
+ 'format_id': format_id,
'manifest_url': mpd_url,
'ext': mimetype2ext(mime_type),
'width': int_or_none(representation_attrib.get('width')),
@@ -2676,6 +2683,17 @@ class InfoExtractor(object):
'manifest_url': mpd_url,
'filesize': filesize,
}
+ elif mime_type == 'image/jpeg':
+ # See test case in VikiIE
+ # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
+ f = {
+ 'format_id': format_id,
+ 'ext': 'mhtml',
+ 'manifest_url': mpd_url,
+ 'format_note': 'DASH storyboards (jpeg)',
+ 'acodec': 'none',
+ 'vcodec': 'none',
+ }
representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
def prepare_template(template_name, identifiers):
@@ -2694,7 +2712,8 @@ class InfoExtractor(object):
t += c
# Next, $...$ templates are translated to their
# %(...) counterparts to be used with % operator
- t = t.replace('$RepresentationID$', representation_id)
+ if representation_id is not None:
+ t = t.replace('$RepresentationID$', representation_id)
t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
t.replace('$$', '$')
@@ -2811,7 +2830,7 @@ class InfoExtractor(object):
'url': mpd_url or base_url,
'fragment_base_url': base_url,
'fragments': [],
- 'protocol': 'http_dash_segments',
+ 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
})
if 'initialization_url' in representation_ms_info:
initialization_url = representation_ms_info['initialization_url']
@@ -2822,7 +2841,7 @@ class InfoExtractor(object):
else:
# Assuming direct URL to unfragmented media.
f['url'] = base_url
- if content_type in ('video', 'audio'):
+ if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
formats.append(f)
elif content_type == 'text':
subtitles.setdefault(lang or 'und', []).append(f)
diff --git a/yt_dlp/extractor/viki.py b/yt_dlp/extractor/viki.py
index 98d16f4d1..19bcf1d7b 100644
--- a/yt_dlp/extractor/viki.py
+++ b/yt_dlp/extractor/viki.py
@@ -142,6 +142,7 @@ class VikiIE(VikiBaseIE):
IE_NAME = 'viki'
_VALID_URL = r'%s(?:videos|player)/(?P<id>[0-9]+v)' % VikiBaseIE._VALID_URL_BASE
_TESTS = [{
+ 'note': 'Free non-DRM video with storyboards in MPD',
'url': 'https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1',
'info_dict': {
'id': '1175236v',
@@ -155,7 +156,6 @@ class VikiIE(VikiBaseIE):
'params': {
'format': 'bestvideo',
},
- 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
}, {
'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14',
'info_dict': {
@@ -173,7 +173,6 @@ class VikiIE(VikiBaseIE):
'format': 'bestvideo',
},
'skip': 'Blocked in the US',
- 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
}, {
# clip
'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference',
@@ -225,7 +224,6 @@ class VikiIE(VikiBaseIE):
'params': {
'format': 'bestvideo',
},
- 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
}, {
# youtube external
'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1',
@@ -264,7 +262,6 @@ class VikiIE(VikiBaseIE):
'params': {
'format': 'bestvideo',
},
- 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
}]
def _real_extract(self, url):
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py
index 3cb79b657..59445a1da 100644
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -2244,6 +2244,17 @@ def unescapeHTML(s):
r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
+def escapeHTML(text):
+    """Return *text* with the five HTML-special characters escaped.
+
+    ``&``, ``<``, ``>``, ``"`` and ``'`` are replaced by character references,
+    which makes the result safe in element content and in quoted attribute
+    values. ``&`` is handled first so the references produced by the later
+    replacements are not escaped a second time.
+    """
+    return (
+        text
+        .replace('&', '&amp;')
+        .replace('<', '&lt;')
+        .replace('>', '&gt;')
+        .replace('"', '&quot;')
+        .replace("'", '&#39;')
+    )
+
+
def process_communicate_or_kill(p, *args, **kwargs):
try:
return p.communicate(*args, **kwargs)
@@ -2323,13 +2334,14 @@ def decodeOption(optval):
return optval
-def formatSeconds(secs, delim=':'):
+def formatSeconds(secs, delim=':', msec=False):
+    """Format a duration in seconds as ``H:MM:SS``, ``M:SS`` or ``S``.
+
+    The shortest form is chosen: hours appear only above 3600 seconds and
+    minutes only above 60. *delim* separates the fields. With *msec* true,
+    the fractional part is appended as ``.mmm`` (milliseconds, truncated).
+    """
     if secs > 3600:
-        return '%d%s%02d%s%02d' % (secs // 3600, delim, (secs % 3600) // 60, delim, secs % 60)
+        ret = '%d%s%02d%s%02d' % (secs // 3600, delim, (secs % 3600) // 60, delim, secs % 60)
     elif secs > 60:
-        return '%d%s%02d' % (secs // 60, delim, secs % 60)
+        ret = '%d%s%02d' % (secs // 60, delim, secs % 60)
     else:
-        return '%d' % secs
+        ret = '%d' % secs
+    # secs % 1 is the fraction of a second (below 1); scale it to
+    # milliseconds, otherwise %03d would always print 000.
+    return '%s.%03d' % (ret, (secs % 1) * 1000) if msec else ret
def make_HTTPS_handler(params, **kwargs):