From 2d02bdceeeb8e391d114d32e010f431712a80326 Mon Sep 17 00:00:00 2001
From: Patrick Robertson
Date: Tue, 4 Feb 2025 14:07:08 +0100
Subject: [PATCH 1/6] abstract facebook _extract_metadata method out from
 inside '_extract_from_url'

---
 yt_dlp/extractor/facebook.py | 126 +++++++++++++++++------------------
 1 file changed, 63 insertions(+), 63 deletions(-)

diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py
index 24ecb03505..f4ffa03330 100644
--- a/yt_dlp/extractor/facebook.py
+++ b/yt_dlp/extractor/facebook.py
@@ -477,72 +477,72 @@ class FacebookIE(InfoExtractor):
         except network_exceptions as err:
             self.report_warning(f'unable to log in: {err}')
             return
+        
+    def _extract_metadata(self, webpage, video_id):
+        post_data = [self._parse_json(j, video_id, fatal=False) for j in re.findall(
+            r'data-sjs>({.*?ScheduledServerJS.*?})', webpage)]
+        post = traverse_obj(post_data, (
+            ..., 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or []
+        media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: (
+            k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict)
+        title = get_first(media, ('title', 'text'))
+        description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text'))
+        page_title = title or self._html_search_regex((
+            r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>(?P<content>[^<]*)</h2>',
+            r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(?P<content>.*?)</span>',
+            self._meta_regex('og:title'), self._meta_regex('twitter:title'), r'<title>(?P<content>.+?)</title>',
+        ), webpage, 'title', default=None, group='content')
+        description = description or self._html_search_meta(
+            ['description', 'og:description', 'twitter:description'],
+            webpage, 'description', default=None)
+        uploader_data = (
+            get_first(media, ('owner', {dict}))
+            or get_first(post, ('video', 'creation_story', 'attachments', ..., 'media', lambda k, v: k == 'owner' and v['name']))
+            or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name']))
+            or get_first(post, ('node', 'actors', ..., {dict}))
+            or get_first(post, ('event', 'event_creator', {dict}))
+            or get_first(post, ('video', 'creation_story', 'short_form_video_context', 'video_owner', {dict})) or {})
+        uploader = uploader_data.get('name') or (
+            clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage))
+            or self._search_regex(
+                (r'ownerName\s*:\s*"([^"]+)"', *self._og_regexes('title')), webpage, 'uploader', fatal=False))
+        timestamp = int_or_none(self._search_regex(
+            r'<abbr[^>]+data-utime=["\'](\d+)', webpage,
+            'timestamp', default=None))
+        thumbnail = self._html_search_meta(
+            ['og:image', 'twitter:image'], webpage, 'thumbnail', default=None)
+        # some webpages contain unretrievable thumbnail urls
+        # like https://lookaside.fbsbx.com/lookaside/crawler/media/?media_id=10155168902769113&get_thumbnail=1
+        # in https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/
+        if thumbnail and not re.search(r'\.(?:jpg|png)', thumbnail):
+            thumbnail = None
+        info_dict = {
+            'description': description,
+            'uploader': uploader,
+            'uploader_id': uploader_data.get('id'),
+            'timestamp': timestamp,
+            'thumbnail': thumbnail,
+            'view_count': parse_count(self._search_regex(
+                (r'\bviewCount\s*:\s*["\']([\d,.]+)', r'video_view_count["\']\s*:\s*(\d+)'),
+                webpage, 'view count', default=None)),
+            'concurrent_view_count': get_first(post, (
+                ('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})),
+            **traverse_obj(post, (lambda _, v: video_id in v['url'], 'feedback', {
+                'like_count': ('likers', 'count', {int}),
+                'comment_count': ('total_comment_count', {int}),
+                'repost_count': ('share_count_reduced', {parse_count}),
+            }), get_all=False),
+        }
+
+        info_json_ld = self._search_json_ld(webpage, video_id, default={})
+        info_json_ld['title'] = (re.sub(r'\s*\|\s*Facebook$', '', title or info_json_ld.get('title') or page_title or '')
+                                     or (description or '').replace('\n', ' ') or f'Facebook video #{video_id}')
+        return merge_dicts(info_json_ld, info_dict)
 
     def _extract_from_url(self, url, video_id):
         webpage = self._download_webpage(
             url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
 
-        def extract_metadata(webpage):
-            post_data = [self._parse_json(j, video_id, fatal=False) for j in re.findall(
-                r'data-sjs>({.*?ScheduledServerJS.*?})', webpage)]
-            post = traverse_obj(post_data, (
-                ..., 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or []
-            media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: (
-                k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict)
-            title = get_first(media, ('title', 'text'))
-            description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text'))
-            page_title = title or self._html_search_regex((
-                r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>(?P<content>[^<]*)</h2>',
-                r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(?P<content>.*?)</span>',
-                self._meta_regex('og:title'), self._meta_regex('twitter:title'), r'<title>(?P<content>.+?)</title>',
-            ), webpage, 'title', default=None, group='content')
-            description = description or self._html_search_meta(
-                ['description', 'og:description', 'twitter:description'],
-                webpage, 'description', default=None)
-            uploader_data = (
-                get_first(media, ('owner', {dict}))
-                or get_first(post, ('video', 'creation_story', 'attachments', ..., 'media', lambda k, v: k == 'owner' and v['name']))
-                or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name']))
-                or get_first(post, ('node', 'actors', ..., {dict}))
-                or get_first(post, ('event', 'event_creator', {dict}))
-                or get_first(post, ('video', 'creation_story', 'short_form_video_context', 'video_owner', {dict})) or {})
-            uploader = uploader_data.get('name') or (
-                clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage))
-                or self._search_regex(
-                    (r'ownerName\s*:\s*"([^"]+)"', *self._og_regexes('title')), webpage, 'uploader', fatal=False))
-            timestamp = int_or_none(self._search_regex(
-                r'<abbr[^>]+data-utime=["\'](\d+)', webpage,
-                'timestamp', default=None))
-            thumbnail = self._html_search_meta(
-                ['og:image', 'twitter:image'], webpage, 'thumbnail', default=None)
-            # some webpages contain unretrievable thumbnail urls
-            # like https://lookaside.fbsbx.com/lookaside/crawler/media/?media_id=10155168902769113&get_thumbnail=1
-            # in https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/
-            if thumbnail and not re.search(r'\.(?:jpg|png)', thumbnail):
-                thumbnail = None
-            info_dict = {
-                'description': description,
-                'uploader': uploader,
-                'uploader_id': uploader_data.get('id'),
-                'timestamp': timestamp,
-                'thumbnail': thumbnail,
-                'view_count': parse_count(self._search_regex(
-                    (r'\bviewCount\s*:\s*["\']([\d,.]+)', r'video_view_count["\']\s*:\s*(\d+)'),
-                    webpage, 'view count', default=None)),
-                'concurrent_view_count': get_first(post, (
-                    ('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})),
-                **traverse_obj(post, (lambda _, v: video_id in v['url'], 'feedback', {
-                    'like_count': ('likers', 'count', {int}),
-                    'comment_count': ('total_comment_count', {int}),
-                    'repost_count': ('share_count_reduced', {parse_count}),
-                }), get_all=False),
-            }
-
-            info_json_ld = self._search_json_ld(webpage, video_id, default={})
-            info_json_ld['title'] = (re.sub(r'\s*\|\s*Facebook$', '', title or info_json_ld.get('title') or page_title or '')
-                                     or (description or '').replace('\n', ' ') or f'Facebook video #{video_id}')
-            return merge_dicts(info_json_ld, info_dict)
-
         video_data = None
 
         def extract_video_data(instances):
@@ -753,7 +753,7 @@ class FacebookIE(InfoExtractor):
                     return self.playlist_result(entries, video_id)
 
                 video_info = entries[0] if entries else {'id': video_id}
-                webpage_info = extract_metadata(webpage)
+                webpage_info = self._extract_metadata(webpage, video_id)
                 # honor precise duration in video info
                 if video_info.get('duration'):
                     webpage_info['duration'] = video_info['duration']
@@ -885,7 +885,7 @@ class FacebookIE(InfoExtractor):
             'subtitles': subtitles,
         }
         process_formats(info_dict)
-        info_dict.update(extract_metadata(webpage))
+        info_dict.update(self._extract_metadata(webpage, video_id))
 
         return info_dict
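Since _extract_metadata is now a method rather than a closure inside _extract_from_url, it can be exercised directly against a saved page. A minimal sketch of such a call (not part of the patch; the fixture filename is hypothetical, and the video ID is taken from the thumbnail comment above):

    from yt_dlp import YoutubeDL
    from yt_dlp.extractor.facebook import FacebookIE

    # load a previously saved watch-page HTML fixture (hypothetical file)
    with open('facebook_page.html', encoding='utf-8') as f:
        webpage = f.read()

    ie = FacebookIE(YoutubeDL())  # the extractor needs a downloader attached
    print(ie._extract_metadata(webpage, '1417995061575415'))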
From d047f06673d622b758a97573d79244804b229982 Mon Sep 17 00:00:00 2001
From: Patrick Robertson
Date: Tue, 4 Feb 2025 14:20:24 +0100
Subject: [PATCH 2/6] Fix ruff checks

---
 yt_dlp/extractor/facebook.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py
index f4ffa03330..12d617b840 100644
--- a/yt_dlp/extractor/facebook.py
+++ b/yt_dlp/extractor/facebook.py
@@ -477,7 +477,7 @@ class FacebookIE(InfoExtractor):
         except network_exceptions as err:
             self.report_warning(f'unable to log in: {err}')
             return
-        
+
     def _extract_metadata(self, webpage, video_id):
         post_data = [self._parse_json(j, video_id, fatal=False) for j in re.findall(
             r'data-sjs>({.*?ScheduledServerJS.*?})', webpage)]

From 01c8529ab1f254334e1bcca644b245a1eca73d2f Mon Sep 17 00:00:00 2001
From: Patrick Robertson
Date: Tue, 4 Feb 2025 14:24:40 +0100
Subject: [PATCH 3/6] Fix autopep8 changes

---
 yt_dlp/extractor/facebook.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py
index 12d617b840..3309314e6e 100644
--- a/yt_dlp/extractor/facebook.py
+++ b/yt_dlp/extractor/facebook.py
@@ -536,7 +536,7 @@ class FacebookIE(InfoExtractor):
 
         info_json_ld = self._search_json_ld(webpage, video_id, default={})
         info_json_ld['title'] = (re.sub(r'\s*\|\s*Facebook$', '', title or info_json_ld.get('title') or page_title or '')
-                                     or (description or '').replace('\n', ' ') or f'Facebook video #{video_id}')
+                                 or (description or '').replace('\n', ' ') or f'Facebook video #{video_id}')
         return merge_dicts(info_json_ld, info_dict)
 
     def _extract_from_url(self, url, video_id):
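Patches 2 and 3 are lint-only fixes. Before the larger refactor that follows, an illustration of the traverse_obj path idiom that _extract_metadata (and, later, _extract_relay_prefetched_data) leans on; the nested structure below is a simplified, hypothetical stand-in for Facebook's ScheduledServerJS payloads:

    from yt_dlp.utils import traverse_obj

    post_data = [{'require': [['ScheduledServerJS', 'handle', None, [{'__bbox': {
        'require': [['RelayPrefetchedStreamCache', 'next', [], [{'__bbox': {
            'result': {'data': {'attachments': []}}}}]]]}}]]}]

    post = traverse_obj(post_data, (
        ..., 'require', ..., ..., ..., '__bbox', 'require',
        ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict)
    print(post)  # [{'attachments': []}]

Each `...` fans out over every key or index at that level, strings and None are skipped, and expected_type=dict prunes non-dict leaves, which is how a single path absorbs the several payload shapes Facebook serves.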
From a19df1be3dbaa9ab5d8695c5398b2ac8c8eb67cb Mon Sep 17 00:00:00 2001
From: Patrick Robertson
Date: Thu, 6 Feb 2025 17:59:51 +0100
Subject: [PATCH 4/6] Further tidy-up of the facebook method

---
 yt_dlp/extractor/facebook.py | 276 +++++++++++++++++------------------
 1 file changed, 137 insertions(+), 139 deletions(-)

diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py
index 3309314e6e..84ba31a54f 100644
--- a/yt_dlp/extractor/facebook.py
+++ b/yt_dlp/extractor/facebook.py
@@ -539,58 +539,157 @@ class FacebookIE(InfoExtractor):
                                  or (description or '').replace('\n', ' ') or f'Facebook video #{video_id}')
         return merge_dicts(info_json_ld, info_dict)
 
+    def _extract_video_data(self, instances: list) -> list:
+        video_data = []
+        for item in instances:
+            if try_get(item, lambda x: x[1][0]) == 'VideoConfig':
+                video_item = item[2][0]
+                if video_item.get('video_id'):
+                    video_data.append(video_item['videoData'])
+        return video_data
+
+    def _parse_graphql_video(self, video, video_id, webpage) -> dict:
+        v_id = video.get('videoId') or video.get('id') or video_id
+        reel_info = traverse_obj(
+            video, ('creation_story', 'short_form_video_context', 'playback_video', {dict}))
+        if reel_info:
+            video = video['creation_story']
+            video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner'))
+            video.update(reel_info)
+
+        formats = []
+        q = qualities(['sd', 'hd'])
+
+        # Legacy formats extraction
+        fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
+        for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'),
+                               ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'),
+                               ('browser_native_sd_url', 'sd')):
+            playable_url = fmt_data.get(key)
+            if not playable_url:
+                continue
+            if determine_ext(playable_url) == 'mpd':
+                formats.extend(self._extract_mpd_formats(playable_url, video_id, fatal=False))
+            else:
+                formats.append({
+                    'format_id': format_id,
+                    # sd, hd formats w/o resolution info should be deprioritized below DASH
+                    'quality': q(format_id) - 3,
+                    'url': playable_url,
+                })
+        self._extract_dash_manifest(fmt_data, formats)
+
+        # New videoDeliveryResponse formats extraction
+        fmt_data = traverse_obj(video, ('videoDeliveryResponseFragment', 'videoDeliveryResponseResult'))
+        mpd_urls = traverse_obj(fmt_data, ('dash_manifest_urls', ..., 'manifest_url', {url_or_none}))
+        dash_manifests = traverse_obj(fmt_data, ('dash_manifests', lambda _, v: v['manifest_xml']))
+        for idx, dash_manifest in enumerate(dash_manifests):
+            self._extract_dash_manifest(dash_manifest, formats, mpd_url=traverse_obj(mpd_urls, idx))
+        if not dash_manifests:
+            # Only extract from MPD URLs if the manifests are not already provided
+            for mpd_url in mpd_urls:
+                formats.extend(self._extract_mpd_formats(mpd_url, video_id, fatal=False))
+        for prog_fmt in traverse_obj(fmt_data, ('progressive_urls', lambda _, v: v['progressive_url'])):
+            format_id = traverse_obj(prog_fmt, ('metadata', 'quality', {str.lower}))
+            formats.append({
+                'format_id': format_id,
+                # sd, hd formats w/o resolution info should be deprioritized below DASH
+                'quality': q(format_id) - 3,
+                'url': prog_fmt['progressive_url'],
+            })
+        for m3u8_url in traverse_obj(fmt_data, ('hls_playlist_urls', ..., 'hls_playlist_url', {url_or_none})):
+            formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', fatal=False, m3u8_id='hls'))
+
+        if not formats:
+            # Do not append false positive entry w/o any formats
+            return
+
+        automatic_captions, subtitles = {}, {}
+        is_broadcast = traverse_obj(video, ('is_video_broadcast', {bool}))
+        for caption in traverse_obj(video, (
+                'video_available_captions_locales',
+                {lambda x: sorted(x, key=lambda c: c['locale'])},
+                lambda _, v: url_or_none(v['captions_url']),
+        )):
+            lang = caption.get('localized_language') or 'und'
+            subs = {
+                'url': caption['captions_url'],
+                'name': format_field(caption, 'localized_country', f'{lang} (%s)', default=lang),
+            }
+            if caption.get('localized_creation_method') or is_broadcast:
+                automatic_captions.setdefault(caption['locale'], []).append(subs)
+            else:
+                subtitles.setdefault(caption['locale'], []).append(subs)
+        captions_url = traverse_obj(video, ('captions_url', {url_or_none}))
+        if captions_url and not automatic_captions and not subtitles:
+            locale = self._html_search_meta(
+                ['og:locale', 'twitter:locale'], webpage, 'locale', default='en_US')
+            (automatic_captions if is_broadcast else subtitles)[locale] = [{'url': captions_url}]
+
+        info = {
+            'id': v_id,
+            'formats': formats,
+            'thumbnail': traverse_obj(
+                video, ('thumbnailImage', 'uri'), ('preferred_thumbnail', 'image', 'uri')),
+            'uploader_id': traverse_obj(video, ('owner', 'id', {str_or_none})),
+            'timestamp': traverse_obj(video, 'publish_time', 'creation_time', expected_type=int_or_none),
+            'duration': (float_or_none(video.get('playable_duration_in_ms'), 1000)
+                         or float_or_none(video.get('length_in_second'))),
+            'automatic_captions': automatic_captions,
+            'subtitles': subtitles,
+        }
+        self._process_formats(info)
+        description = try_get(video, lambda x: x['savable_description']['text'])
+        title = video.get('name')
+        if title:
+            info.update({
+                'title': title,
+                'description': description,
+            })
+        else:
+            info['title'] = description or f'Facebook video #{v_id}'
+        return info
+
+    def _extract_dash_manifest(self, vid_data, formats, mpd_url=None):
+        dash_manifest = traverse_obj(
+            vid_data, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', 'manifest_xml', expected_type=str)
+        if dash_manifest:
+            formats.extend(self._parse_mpd_formats(
+                compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)),
+                mpd_url=url_or_none(vid_data.get('dash_manifest_url')) or mpd_url))
+
+    def _process_formats(self, info: dict) -> None:
+        # Downloads with browser's User-Agent are rate limited. Working around
+        # with non-browser User-Agent.
+        for f in info['formats']:
+            f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1'
+            # Formats larger than ~500MB will return error 403 unless chunk size is regulated
+            f.setdefault('downloader_options', {})['http_chunk_size'] = 250 << 20
+
     def _extract_from_url(self, url, video_id):
         webpage = self._download_webpage(
             url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
 
         video_data = None
 
-        def extract_video_data(instances):
-            video_data = []
-            for item in instances:
-                if try_get(item, lambda x: x[1][0]) == 'VideoConfig':
-                    video_item = item[2][0]
-                    if video_item.get('video_id'):
-                        video_data.append(video_item['videoData'])
-            return video_data
-
         server_js_data = self._parse_json(self._search_regex(
             [r'handleServerJS\(({.+})(?:\);|,")', r'\bs\.handle\(({.+?})\);'],
             webpage, 'server js data', default='{}'), video_id, fatal=False)
 
         if server_js_data:
-            video_data = extract_video_data(server_js_data.get('instances', []))
+            video_data = self._extract_video_data(server_js_data.get('instances', []))
 
         def extract_from_jsmods_instances(js_data):
             if js_data:
-                return extract_video_data(try_get(
+                return self._extract_video_data(try_get(
                     js_data, lambda x: x['jsmods']['instances'], list) or [])
 
-        def extract_dash_manifest(vid_data, formats, mpd_url=None):
-            dash_manifest = traverse_obj(
-                vid_data, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', 'manifest_xml', expected_type=str)
-            if dash_manifest:
-                formats.extend(self._parse_mpd_formats(
-                    compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)),
-                    mpd_url=url_or_none(vid_data.get('dash_manifest_url')) or mpd_url))
-
-        def process_formats(info):
-            # Downloads with browser's User-Agent are rate limited. Working around
-            # with non-browser User-Agent.
-            for f in info['formats']:
-                f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1'
-                # Formats larger than ~500MB will return error 403 unless chunk size is regulated
-                f.setdefault('downloader_options', {})['http_chunk_size'] = 250 << 20
-
         def yield_all_relay_data(_filter):
             for relay_data in re.findall(rf'data-sjs>({{.*?{_filter}.*?}})', webpage):
                 yield self._parse_json(relay_data, video_id, fatal=False) or {}
 
-        def extract_relay_data(_filter):
-            return next(filter(None, yield_all_relay_data(_filter)), {})
-
         def extract_relay_prefetched_data(_filter, target_keys=None):
             path = 'data'
             if target_keys is not None:
@@ -614,112 +713,10 @@ class FacebookIE(InfoExtractor):
 
         if data:
             entries = []
 
-                def parse_graphql_video(video):
-                    v_id = video.get('videoId') or video.get('id') or video_id
-                    reel_info = traverse_obj(
-                        video, ('creation_story', 'short_form_video_context', 'playback_video', {dict}))
-                    if reel_info:
-                        video = video['creation_story']
-                        video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner'))
-                        video.update(reel_info)
-
-                    formats = []
-                    q = qualities(['sd', 'hd'])
-
-                    # Legacy formats extraction
-                    fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
-                    for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'),
-                                           ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'),
-                                           ('browser_native_sd_url', 'sd')):
-                        playable_url = fmt_data.get(key)
-                        if not playable_url:
-                            continue
-                        if determine_ext(playable_url) == 'mpd':
-                            formats.extend(self._extract_mpd_formats(playable_url, video_id, fatal=False))
-                        else:
-                            formats.append({
-                                'format_id': format_id,
-                                # sd, hd formats w/o resolution info should be deprioritized below DASH
-                                'quality': q(format_id) - 3,
-                                'url': playable_url,
-                            })
-                    extract_dash_manifest(fmt_data, formats)
-
-                    # New videoDeliveryResponse formats extraction
-                    fmt_data = traverse_obj(video, ('videoDeliveryResponseFragment', 'videoDeliveryResponseResult'))
-                    mpd_urls = traverse_obj(fmt_data, ('dash_manifest_urls', ..., 'manifest_url', {url_or_none}))
-                    dash_manifests = traverse_obj(fmt_data, ('dash_manifests', lambda _, v: v['manifest_xml']))
-                    for idx, dash_manifest in enumerate(dash_manifests):
-                        extract_dash_manifest(dash_manifest, formats, mpd_url=traverse_obj(mpd_urls, idx))
-                    if not dash_manifests:
-                        # Only extract from MPD URLs if the manifests are not already provided
-                        for mpd_url in mpd_urls:
-                            formats.extend(self._extract_mpd_formats(mpd_url, video_id, fatal=False))
-                    for prog_fmt in traverse_obj(fmt_data, ('progressive_urls', lambda _, v: v['progressive_url'])):
-                        format_id = traverse_obj(prog_fmt, ('metadata', 'quality', {str.lower}))
-                        formats.append({
-                            'format_id': format_id,
-                            # sd, hd formats w/o resolution info should be deprioritized below DASH
-                            'quality': q(format_id) - 3,
-                            'url': prog_fmt['progressive_url'],
-                        })
-                    for m3u8_url in traverse_obj(fmt_data, ('hls_playlist_urls', ..., 'hls_playlist_url', {url_or_none})):
-                        formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', fatal=False, m3u8_id='hls'))
-
-                    if not formats:
-                        # Do not append false positive entry w/o any formats
-                        return
-
-                    automatic_captions, subtitles = {}, {}
-                    is_broadcast = traverse_obj(video, ('is_video_broadcast', {bool}))
-                    for caption in traverse_obj(video, (
-                            'video_available_captions_locales',
-                            {lambda x: sorted(x, key=lambda c: c['locale'])},
-                            lambda _, v: url_or_none(v['captions_url']),
-                    )):
-                        lang = caption.get('localized_language') or 'und'
-                        subs = {
-                            'url': caption['captions_url'],
-                            'name': format_field(caption, 'localized_country', f'{lang} (%s)', default=lang),
-                        }
-                        if caption.get('localized_creation_method') or is_broadcast:
-                            automatic_captions.setdefault(caption['locale'], []).append(subs)
-                        else:
-                            subtitles.setdefault(caption['locale'], []).append(subs)
-                    captions_url = traverse_obj(video, ('captions_url', {url_or_none}))
-                    if captions_url and not automatic_captions and not subtitles:
-                        locale = self._html_search_meta(
-                            ['og:locale', 'twitter:locale'], webpage, 'locale', default='en_US')
-                        (automatic_captions if is_broadcast else subtitles)[locale] = [{'url': captions_url}]
-
-                    info = {
-                        'id': v_id,
-                        'formats': formats,
-                        'thumbnail': traverse_obj(
-                            video, ('thumbnailImage', 'uri'), ('preferred_thumbnail', 'image', 'uri')),
-                        'uploader_id': traverse_obj(video, ('owner', 'id', {str_or_none})),
-                        'timestamp': traverse_obj(video, 'publish_time', 'creation_time', expected_type=int_or_none),
-                        'duration': (float_or_none(video.get('playable_duration_in_ms'), 1000)
-                                     or float_or_none(video.get('length_in_second'))),
-                        'automatic_captions': automatic_captions,
-                        'subtitles': subtitles,
-                    }
-                    process_formats(info)
-                    description = try_get(video, lambda x: x['savable_description']['text'])
-                    title = video.get('name')
-                    if title:
-                        info.update({
-                            'title': title,
-                            'description': description,
-                        })
-                    else:
-                        info['title'] = description or f'Facebook video #{v_id}'
-                    entries.append(info)
-
                 def parse_attachment(attachment, key='media'):
                     media = attachment.get(key) or {}
                     if media.get('__typename') == 'Video':
-                        return parse_graphql_video(media)
+                        entries.append(self._parse_graphql_video(media, video_id, webpage))
 
                 nodes = variadic(traverse_obj(data, 'nodes', 'node') or [])
                 attachments = traverse_obj(nodes, (
@@ -747,7 +744,7 @@ class FacebookIE(InfoExtractor):
                 for attachment in attachments:
                     parse_attachment(attachment)
                 if not entries:
-                    parse_graphql_video(video)
+                    entries.append(self._parse_graphql_video(video, video_id, webpage))
 
                 if len(entries) > 1:
                     return self.playlist_result(entries, video_id)
@@ -788,7 +785,8 @@ class FacebookIE(InfoExtractor):
             if lsd:
                 post_data[lsd['name']] = lsd['value']
 
-        relay_data = extract_relay_data(r'\[\s*"RelayAPIConfigDefaults"\s*,')
+        relay_data = next(filter(None, yield_all_relay_data(r'\[\s*"RelayAPIConfigDefaults"\s*,')), {})
+
         for define in (relay_data.get('define') or []):
             if define[0] == 'RelayAPIConfigDefaults':
                 self._api_config = define[2]
@@ -874,7 +872,7 @@ class FacebookIE(InfoExtractor):
                             'quality': preference,
                             'height': 720 if quality == 'hd' else None,
                         })
-                extract_dash_manifest(f[0], formats)
+                self._extract_dash_manifest(f[0], formats)
                 subtitles_src = f[0].get('subtitles_src')
                 if subtitles_src:
                     subtitles.setdefault('en', []).append({'url': subtitles_src})
@@ -884,7 +882,7 @@ class FacebookIE(InfoExtractor):
             'formats': formats,
             'subtitles': subtitles,
         }
-        process_formats(info_dict)
+        self._process_formats(info_dict)
         info_dict.update(self._extract_metadata(webpage, video_id))
 
         return info_dict
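Two mechanics preserved by this move are worth spelling out: the 250 << 20 chunk size in _process_formats is 262,144,000 bytes (250 MiB), and in _parse_graphql_video the q(format_id) - 3 offset is what pushes bare sd/hd URLs below manifest-derived formats in the sort order. A quick check of the real yt_dlp.utils.qualities helper:

    from yt_dlp.utils import qualities

    q = qualities(['sd', 'hd'])
    print(q('sd') - 3, q('hd') - 3)  # -3 -2; unknown ids map to -1, hence -4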
From 0dd8644d27e0c08dfc89798f3f202aef43ec06cd Mon Sep 17 00:00:00 2001
From: Patrick Robertson
Date: Mon, 10 Feb 2025 11:47:18 +0100
Subject: [PATCH 5/6] Further abstraction of methods out of facebook.py

---
 yt_dlp/extractor/facebook.py | 48 +++++++++++++++++++-----------------
 1 file changed, 25 insertions(+), 23 deletions(-)

diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py
index 84ba31a54f..5e9a4d0dc0 100644
--- a/yt_dlp/extractor/facebook.py
+++ b/yt_dlp/extractor/facebook.py
@@ -668,6 +668,24 @@ class FacebookIE(InfoExtractor):
             # Formats larger than ~500MB will return error 403 unless chunk size is regulated
             f.setdefault('downloader_options', {})['http_chunk_size'] = 250 << 20
 
+    def _extract_from_jsmods_instances(self, js_data):
+        if js_data:
+            return self._extract_video_data(try_get(
+                js_data, lambda x: x['jsmods']['instances'], list) or [])
+
+    def _yield_all_relay_data(self, _filter, video_id, webpage):
+        for relay_data in re.findall(rf'data-sjs>({{.*?{_filter}.*?}})', webpage):
+            yield self._parse_json(relay_data, video_id, fatal=False) or {}
+
+    def _extract_relay_prefetched_data(self, _filter, video_id, webpage, target_keys=None):
+        path = 'data'
+        if target_keys is not None:
+            path = lambda k, v: k == 'data' and any(target in v for target in variadic(target_keys))
+        return traverse_obj(self._yield_all_relay_data(_filter, video_id, webpage), (
+            ..., 'require', (None, (..., ..., ..., '__bbox', 'require')),
+            lambda _, v: any(key.startswith('RelayPrefetchedStreamCache') for key in v),
+            ..., ..., '__bbox', 'result', path, {dict}), get_all=False) or {}
+
     def _extract_from_url(self, url, video_id):
         webpage = self._download_webpage(
             url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
@@ -681,34 +699,18 @@ class FacebookIE(InfoExtractor):
         if server_js_data:
             video_data = self._extract_video_data(server_js_data.get('instances', []))
 
-        def extract_from_jsmods_instances(js_data):
-            if js_data:
-                return self._extract_video_data(try_get(
-                    js_data, lambda x: x['jsmods']['instances'], list) or [])
-
-        def yield_all_relay_data(_filter):
-            for relay_data in re.findall(rf'data-sjs>({{.*?{_filter}.*?}})', webpage):
-                yield self._parse_json(relay_data, video_id, fatal=False) or {}
-
-        def extract_relay_prefetched_data(_filter, target_keys=None):
-            path = 'data'
-            if target_keys is not None:
-                path = lambda k, v: k == 'data' and any(target in v for target in variadic(target_keys))
-            return traverse_obj(yield_all_relay_data(_filter), (
-                ..., 'require', (None, (..., ..., ..., '__bbox', 'require')),
-                lambda _, v: any(key.startswith('RelayPrefetchedStreamCache') for key in v),
-                ..., ..., '__bbox', 'result', path, {dict}), get_all=False) or {}
-
         if not video_data:
             server_js_data = self._parse_json(self._search_regex([
                 r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+'
                 + self._SUPPORTED_PAGLETS_REGEX,
                 rf'bigPipe\.onPageletArrive\(({{.*?id\s*:\s*"{self._SUPPORTED_PAGLETS_REGEX}".*?}})\);',
             ], webpage, 'js data', default='{}'), video_id, js_to_json, False)
-            video_data = extract_from_jsmods_instances(server_js_data)
+            video_data = self._extract_from_jsmods_instances(server_js_data)
 
         if not video_data:
-            data = extract_relay_prefetched_data(
+            data = self._extract_relay_prefetched_data(
                 r'"(?:dash_manifest|playable_url(?:_quality_hd)?)',
+                video_id,
+                webpage,
                 target_keys=('video', 'event', 'nodes', 'node', 'mediaset'))
             if data:
                 entries = []
@@ -779,13 +781,13 @@ class FacebookIE(InfoExtractor):
             }),
         }
 
-        prefetched_data = extract_relay_prefetched_data(r'"login_data"\s*:\s*{')
+        prefetched_data = self._extract_relay_prefetched_data(r'"login_data"\s*:\s*{', video_id, webpage)
         if prefetched_data:
             lsd = try_get(prefetched_data, lambda x: x['login_data']['lsd'], dict)
             if lsd:
                 post_data[lsd['name']] = lsd['value']
 
-        relay_data = next(filter(None, yield_all_relay_data(r'\[\s*"RelayAPIConfigDefaults"\s*,')), {})
+        relay_data = next(filter(None, self._yield_all_relay_data(r'\[\s*"RelayAPIConfigDefaults"\s*,', video_id, webpage)), {})
 
         for define in (relay_data.get('define') or []):
             if define[0] == 'RelayAPIConfigDefaults':
@@ -833,7 +835,7 @@ class FacebookIE(InfoExtractor):
                     r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data,
                     'tahoe js data', default='{}'),
                 video_id, fatal=False)
-            video_data = extract_from_jsmods_instances(tahoe_js_data)
+            video_data = self._extract_from_jsmods_instances(tahoe_js_data)
 
         if not video_data:
             raise ExtractorError('Cannot parse data')
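The relay helpers extracted here all scan <script data-sjs> JSON blobs with a keyword-anchored regex. A self-contained illustration of that scan (the page snippet and the define entry contents are hypothetical; real entries carry the API config that _extract_from_url copies into self._api_config):

    import json
    import re

    webpage = '<script data-sjs>{"define":[["RelayAPIConfigDefaults",[],null,123]]}</script>'
    _filter = r'\[\s*"RelayAPIConfigDefaults"\s*,'
    for relay_data in re.findall(rf'data-sjs>({{.*?{_filter}.*?}})', webpage):
        print(json.loads(relay_data)['define'][0][0])  # RelayAPIConfigDefaults

The lazy .*? on both sides keeps a match from swallowing neighbouring blobs but can clip JSON mid-structure, which is why _yield_all_relay_data parses with fatal=False and callers filter out the resulting empty dicts.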
From a34fe2db4c3bad67e1fc39caa61ffca0d5de6f31 Mon Sep 17 00:00:00 2001
From: Patrick Robertson
Date: Mon, 24 Feb 2025 11:14:08 +0000
Subject: [PATCH 6/6] further refactoring, add new unit tests, remove tahoe
 code

---
 yt_dlp/extractor/facebook.py | 50 ++++++++++--------------------------
 1 file changed, 14 insertions(+), 36 deletions(-)

diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py
index 5e9a4d0dc0..c3ffa15edf 100644
--- a/yt_dlp/extractor/facebook.py
+++ b/yt_dlp/extractor/facebook.py
@@ -70,7 +70,6 @@ class FacebookIE(InfoExtractor):
     IE_NAME = 'facebook'
 
     _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s'
-    _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary'
 
     _TESTS = [{
         'url': 'https://www.facebook.com/radiokicksfm/videos/3676516585958356/',
@@ -238,7 +237,7 @@ class FacebookIE(InfoExtractor):
         'info_dict': {
             'id': '1569199726448814',
             'ext': 'mp4',
-            'title': 'Pence MUST GO!',
+            'title': 'Trump/Musk & Vance MUST GO!',
             'description': 'Vickie Gentry shared a memory.',
             'timestamp': 1511548260,
             'upload_date': '20171124',
@@ -413,6 +412,13 @@ class FacebookIE(InfoExtractor):
     }, {
         'url': 'https://www.facebook.com/groups/1513990329015294/posts/d41d8cd9/2013209885760000/?app=fbl',
         'only_matching': True,
+    }, {
+        'url': 'https://www.facebook.com/WatchESLOne/videos/297860117405429/',
+        'info_dict': {
+            'id': '297860117405429',
+        },
+        'playlist_count': 1,
+        'skip': 'URL that previously required tahoe player, but currently not working. More info: https://github.com/ytdl-org/youtube-dl/issues/15441',
     }]
     _SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)'
     _api_config = {
@@ -673,15 +679,15 @@ class FacebookIE(InfoExtractor):
             return self._extract_video_data(try_get(
                 js_data, lambda x: x['jsmods']['instances'], list) or [])
 
-    def _yield_all_relay_data(self, _filter, video_id, webpage):
+    def _yield_all_relay_data(self, _filter, webpage):
         for relay_data in re.findall(rf'data-sjs>({{.*?{_filter}.*?}})', webpage):
-            yield self._parse_json(relay_data, video_id, fatal=False) or {}
+            yield self._parse_json(relay_data, None, fatal=False) or {}
 
-    def _extract_relay_prefetched_data(self, _filter, video_id, webpage, target_keys=None):
+    def _extract_relay_prefetched_data(self, _filter, webpage, target_keys=None):
         path = 'data'
         if target_keys is not None:
             path = lambda k, v: k == 'data' and any(target in v for target in variadic(target_keys))
-        return traverse_obj(self._yield_all_relay_data(_filter, video_id, webpage), (
+        return traverse_obj(self._yield_all_relay_data(_filter, webpage), (
             ..., 'require', (None, (..., ..., ..., '__bbox', 'require')),
             lambda _, v: any(key.startswith('RelayPrefetchedStreamCache') for key in v),
             ..., ..., '__bbox', 'result', path, {dict}), get_all=False) or {}
@@ -709,7 +715,6 @@ class FacebookIE(InfoExtractor):
         if not video_data:
             data = self._extract_relay_prefetched_data(
                 r'"(?:dash_manifest|playable_url(?:_quality_hd)?)',
-                video_id,
                 webpage,
                 target_keys=('video', 'event', 'nodes', 'node', 'mediaset'))
             if data:
@@ -781,13 +786,13 @@ class FacebookIE(InfoExtractor):
             }),
         }
 
-        prefetched_data = self._extract_relay_prefetched_data(r'"login_data"\s*:\s*{', video_id, webpage)
+        prefetched_data = self._extract_relay_prefetched_data(r'"login_data"\s*:\s*{', webpage)
         if prefetched_data:
             lsd = try_get(prefetched_data, lambda x: x['login_data']['lsd'], dict)
             if lsd:
                 post_data[lsd['name']] = lsd['value']
 
-        relay_data = next(filter(None, self._yield_all_relay_data(r'\[\s*"RelayAPIConfigDefaults"\s*,', video_id, webpage)), {})
+        relay_data = next(filter(None, self._yield_all_relay_data(r'\[\s*"RelayAPIConfigDefaults"\s*,', webpage)), {})
 
         for define in (relay_data.get('define') or []):
             if define[0] == 'RelayAPIConfigDefaults':
@@ -810,33 +815,6 @@ class FacebookIE(InfoExtractor):
 
             return self.playlist_result(entries, video_id)
 
-        if not video_data:
-            # Video info not in first request, do a secondary request using
-            # tahoe player specific URL
-            tahoe_data = self._download_webpage(
-                self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id,
-                data=urlencode_postdata({
-                    '__a': 1,
-                    '__pc': self._search_regex(
-                        r'pkg_cohort["\']\s*:\s*["\'](.+?)["\']', webpage,
-                        'pkg cohort', default='PHASED:DEFAULT'),
-                    '__rev': self._search_regex(
-                        r'client_revision["\']\s*:\s*(\d+),', webpage,
-                        'client revision', default='3944515'),
-                    'fb_dtsg': self._search_regex(
-                        r'"DTSGInitialData"\s*,\s*\[\]\s*,\s*{\s*"token"\s*:\s*"([^"]+)"',
-                        webpage, 'dtsg token', default=''),
-                }),
-                headers={
-                    'Content-Type': 'application/x-www-form-urlencoded',
-                })
-            tahoe_js_data = self._parse_json(
-                self._search_regex(
-                    r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data,
-                    'tahoe js data', default='{}'),
-                video_id, fatal=False)
-            video_data = self._extract_from_jsmods_instances(tahoe_js_data)
-
         if not video_data:
             raise ExtractorError('Cannot parse data')
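The 'new unit tests' in this commit message are the _TESTS additions above; yt-dlp's harness runs such entries via `python test/test_download.py TestDownload.test_Facebook`. For a quick end-to-end sanity check of the refactored extractor, something like the following should work (network access required; the URL is the extractor's own first test case):

    from yt_dlp import YoutubeDL

    with YoutubeDL() as ydl:
        info = ydl.extract_info(
            'https://www.facebook.com/radiokicksfm/videos/3676516585958356/',
            download=False)
        print(info['id'], info['title'])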