[underline] Add extractor (draft)

3 years ago · ebab01bb73
parent 935bac1e4d
commit ebab01bb73
2 changed files with 104 additions and 0 deletions
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -2027,6 +2027,7 @@ from .dlive import (
 )
 from .drooble import DroobleIE
 from .umg import UMGDeIE
+from .underline import UnderlineIE
 from .unistra import UnistraIE
 from .unity import UnityIE
 from .unscripted import UnscriptedNewsVideoIE
--- a/yt_dlp/extractor/underline.py
+++ b/yt_dlp/extractor/underline.py
@@ -0,0 +1,103 @@
+from .common import InfoExtractor
+
+# Module-level debug switch for this draft extractor. The extractor class below
+# checks it before dumping the parsed page JSON to ./tmp.json and before
+# printing intermediate values.
+# NOTE(review): debugging scaffold; presumably meant to be removed before merge.
+DEBUG_P = False
+if DEBUG_P:
+    # Debug-only helpers, imported only when the flag is set so that normal
+    # runs need neither icecream nor IPython (third-party, not stdlib).
+    import json
+    from icecream import ic
+    from IPython import embed
+
+
+def gen_dict_extract(var, key):
+    """Recursively yield every value stored under ``key`` in a nested structure.
+
+    The structure is walked depth-first in iteration order. Mappings (anything
+    exposing ``.items()``) and lists/tuples are descended into at any nesting
+    depth, including lists nested directly inside other lists. A matching value
+    is yielded first and then searched itself, so nested matches are reported
+    after their parent.
+
+    Args:
+        var: The object to search, typically decoded JSON. Scalars and other
+            non-container values yield nothing.
+        key: The mapping key to look for.
+
+    Yields:
+        Each value found under ``key``, whatever its type.
+    """
+    if isinstance(var, (list, tuple)):
+        for item in var:
+            yield from gen_dict_extract(item, key)
+    elif hasattr(var, "items"):
+        for k, v in var.items():
+            if k == key:
+                yield v
+            # Recurse unconditionally; non-container values simply yield nothing.
+            yield from gen_dict_extract(v, key)
+
+
+class UnderlineIE(InfoExtractor):
+    """Extractor for videos on underline.io event pages (draft).
+
+    The page embeds a Next.js ``__NEXT_DATA__`` JSON blob. The video title and
+    the HLS playlist URL are looked up in that blob by key name ("title" and
+    "playlist"), taking the first string value found for each.
+    """
+
+    # The id is the whole path after /events/, without query string or fragment.
+    _VALID_URL = r"https?://(?:www\.)?underline\.io/events/(?P<id>[^?#]+)"
+
+    _TESTS = [
+        {
+            # No "md5" entry: the download is skipped, so a file checksum
+            # would presumably never be compared.
+            "params": {
+                "skip_download": True,
+            },
+            "url": "https://underline.io/events/342/posters/12863/poster/66463-mbti-personality-prediction-approach-on-persian-twitter?tab=video",
+            "info_dict": {
+                "id": "342/posters/12863/poster/66463-mbti-personality-prediction-approach-on-persian-twitter",
+                "ext": "mp4",
+                "title": "MBTI Personality Prediction Approach on Persian Twitter",
+            },
+        }
+    ]
+
+    def _real_extract(self, url):
+        """Return the info dict (id, title, formats) for an underline.io page.
+
+        ``formats`` is empty when no playlist URL is present in the page data.
+        """
+        # NOTE(review): the draft carried a disabled cookie/login check here;
+        # it is unclear whether some events require login - confirm.
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        next_data = self._search_nextjs_data(webpage, video_id)
+
+        # Only accept string hits: a "title"/"playlist" key may also hold a
+        # nested object, which is not usable as a title or a URL.
+        title = next(
+            (t for t in gen_dict_extract(next_data, "title") if isinstance(t, str) and t),
+            None,
+        ) or self._og_search_title(webpage, default=None)
+
+        m3u8_url = next(
+            (u for u in gen_dict_extract(next_data, "playlist") if isinstance(u, str) and u),
+            None,
+        )
+
+        formats = []
+        if m3u8_url:
+            formats = self._extract_m3u8_formats(
+                m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native')
+
+        return {
+            "id": video_id,
+            "title": title,
+            "formats": formats,
+        }