From febff4c1194de0528c087274bc17e3a8be3296ba Mon Sep 17 00:00:00 2001 From: Bepis <36346617+bbepis@users.noreply.github.com> Date: Sat, 19 Feb 2022 23:00:51 +1100 Subject: [PATCH] [tubitv] Fix/improve TV series extraction (#2829) Authored by: bbepis --- yt_dlp/extractor/tubitv.py | 12 ++++++++++-- yt_dlp/utils.py | 2 ++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/tubitv.py b/yt_dlp/extractor/tubitv.py index 2e9b325ba..e9b66ec77 100644 --- a/yt_dlp/extractor/tubitv.py +++ b/yt_dlp/extractor/tubitv.py @@ -107,6 +107,9 @@ class TubiTvIE(InfoExtractor): 'url': self._proto_relative_url(sub_url), }) + season_number, episode_number, episode_title = self._search_regex( + r'^S(\d+):E(\d+) - (.+)', title, 'episode info', fatal=False, group=(1, 2, 3), default=(None, None, None)) + return { 'id': video_id, 'title': title, @@ -117,6 +120,9 @@ class TubiTvIE(InfoExtractor): 'duration': int_or_none(video_data.get('duration')), 'uploader_id': video_data.get('publisher_id'), 'release_year': int_or_none(video_data.get('year')), + 'season_number': int_or_none(season_number), + 'episode_number': int_or_none(episode_number), + 'episode_title': episode_title } @@ -132,9 +138,11 @@ class TubiTvShowIE(InfoExtractor): def _entries(self, show_url, show_name): show_webpage = self._download_webpage(show_url, show_name) + show_json = self._parse_json(self._search_regex( - r"window\.__data\s*=\s*({.+?});\s*", - show_webpage, 'data',), show_name, transform_source=js_to_json)['video'] + r'window\.__data\s*=\s*({[^<]+});\s*', + show_webpage, 'data'), show_name, transform_source=js_to_json)['video'] + for episode_id in show_json['fullContentById'].keys(): yield self.url_result( 'tubitv:%s' % episode_id, diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index c5489d494..f5cad0e54 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3143,6 +3143,8 @@ def js_to_json(code, vars={}): return '"%s"' % v + code = re.sub(r'new Date\((".+")\)', r'\g<1>', code) + return re.sub(r'''(?sx) "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|