diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index f46e8b65ed..c604efb3e9 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -132,137 +132,169 @@ class TestInfoExtractor(unittest.TestCase): self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True) def test_search_json_ld_realworld(self): - _TESTS = [ + _TESTS = [( # https://github.com/ytdl-org/youtube-dl/issues/23306 - ( - r'''''', - { - 'title': '1 On 1 With Kleio', - 'description': 'Kleio Valentien', - 'url': 'https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4', - 'timestamp': 1449347075, - 'duration': 743.0, - 'view_count': 1120958, - 'width': 1920, - 'height': 1080, + r''' + ''', - { - 'title': 'md5:91fe569e952e4d146485740ae927662b', - 'categories': ['Κοινωνία'], - 'creators': ['Ant1news'], - 'description': 'md5:16756d0a18f33bf550e683d134a72f3c', - 'modified_timestamp': 1636523573, - 'release_timestamp': 1636523400, - 'tags': 'count:6', - 'thumbnails': [{'url': 'https://ant1media.azureedge.net/imgHandler/1100/a635c968-be71-447c-bf9c-80d843ece21e.jpg'}], - 'uploader': 'Ant1news', + "aggregateRating": { + "@type": "AggregateRating", + "ratingValue": "88", + "ratingCount": "630", + "bestRating": "100", + "worstRating": "0" }, - {'expected_type': 'NewsArticle'}, - ), - ( - r''' + ''', { + 'ext': 'mp4', + 'title': '1 On 1 With Kleio', + 'age_limit': 18, + 'artists': ['1 On 1 With Kleio'], + 'average_rating': 88, + 'description': 'Kleio Valentien', + 'duration': 743, + 'height': 1080, + 'thumbnails': [ + {'url': 'https://static-eu-cdn.eporner.com/thumbs/static4/7/78/780/780814/9_360.jpg'}, + {'url': 'https://imggen.eporner.com/780814/1920/1080/9.jpg'}, + ], + 'timestamp': 1449347075, + 'url': 'https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4', + 'view_count': 1120958, + 'width': 1920, + }, {}, + ), ( + # https://github.com/yt-dlp/yt-dlp/pull/1983 + r''' + + ''', { + 'title': 'Rewilding Sharks - Inside the lucrative trade of shark fishing in Indonesia', + 'creators': ['CNA'], + 'description': 'md5:4ce967a72d546b32935cb98c8722346b', + 'modified_timestamp': 1744085232, + 'release_timestamp': 1742147880, + 'thumbnails': [{ + 'height': 100, + 'url': 'https://dam.mediacorp.sg/image/upload/s--0VYzW7We--/c_fill,g_auto,h_338,w_600/f_auto,q_auto/v1/mediacorp/cna/image/2025/03/17/1742148440-image.jpg?itok=rav-cQ_p', + 'width': 100, + }], + }, {'expected_type': 'NewsArticle'}, + ), ( + # https://github.com/yt-dlp/yt-dlp/pull/2031 + r''' + - ''', - { - 'chapters': [ - {'title': 'Explosie Turnhout', 'start_time': 70, 'end_time': 440}, - {'title': 'Jaarwisseling', 'start_time': 440, 'end_time': 1179}, - {'title': 'Natuurbranden Colorado', 'start_time': 1179, 'end_time': 1263}, - {'title': 'Klimaatverandering', 'start_time': 1263, 'end_time': 1367}, - {'title': 'Zacht weer', 'start_time': 1367, 'end_time': 1383}, - {'title': 'Financiële balans', 'start_time': 1383, 'end_time': 1484}, - {'title': 'Club Brugge', 'start_time': 1484, 'end_time': 1575}, - {'title': 'Mentale gezondheid bij topsporters', 'start_time': 1575, 'end_time': 1728}, - {'title': 'Olympische Winterspelen', 'start_time': 1728, 'end_time': 1873}, - {'title': 'Sober oudjaar in Nederland', 'start_time': 1873, 'end_time': 2079.23}, - ], - 'title': 'Het journaal - Aflevering 365 (Seizoen 2021)', - }, {}, - ), - ( - # test multiple thumbnails in a list - r''' -''', - { - 'thumbnails': [{'url': 'https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg'}], + "partOfSeries":{ + "name":"Het journaal", + "@id":"222831405527", + "@type":"TVSeries" }, - {}, - ), - ( - # test single thumbnail - r''' -''', - { - 'thumbnails': [{'url': 'https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg'}], - }, - {}, - ), - ( - # test thumbnail_url key without URL scheme - r''' -''', - { - 'thumbnails': [{'url': 'https://www.nobelprize.org/images/12693-landscape-medium-gallery.jpg'}], + "partOfSeason":{ + "name":"Seizoen 2021", + "@id":"961809365527", + "@type":"TVSeason" }, - {}, - ), - ( - r''' - -''', - { - 'title': 'md5:3f077843a74f01f768bbf0853c210855', - 'categories': ['Reportages'], - 'creators': ['Sabine Dupont'], - 'description': 'md5:1dc04a3aa56c5228503071baa8b4cc97', - 'modified_timestamp': 1747319520, - 'release_timestamp': 1747319520, - 'tags': 'count:1', - 'timestamp': 1747319520, - 'thumbnails': [{'url': 'https://www.telemb.be/cdn/ff/pKwkkhB7a5GqSf98QdDUcn9WlvGTYyilvXisHO3fHpI/1747320854/public/2025-05/00006554_avc-tmb-093031.jpeg'}], - 'uploader': 'Tele MB', - }, - {}, - ), - ] + "@context":"https://schema.org", + "@id":"961685295527", + "@type":"TVEpisode" + } + + ''', { + 'title': 'Het journaal - Aflevering 365 (Seizoen 2021)', + 'artists': ['Het journaal - Aflevering 365 (Seizoen 2021)'], + 'chapters': [ + {'title': 'Explosie Turnhout', 'start_time': 70, 'end_time': 440}, + {'title': 'Jaarwisseling', 'start_time': 440, 'end_time': 1179}, + {'title': 'Natuurbranden Colorado', 'start_time': 1179, 'end_time': 1263}, + {'title': 'Klimaatverandering', 'start_time': 1263, 'end_time': 1367}, + {'title': 'Zacht weer', 'start_time': 1367, 'end_time': 1383}, + {'title': 'Financiële balans', 'start_time': 1383, 'end_time': 1484}, + {'title': 'Club Brugge', 'start_time': 1484, 'end_time': 1575}, + {'title': 'Mentale gezondheid bij topsporters', 'start_time': 1575, 'end_time': 1728}, + {'title': 'Olympische Winterspelen', 'start_time': 1728, 'end_time': 1873}, + {'title': 'Sober oudjaar in Nederland', 'start_time': 1873, 'end_time': 2079.23}, + ], + 'description': 'Het journaal 19u van vrijdag 31 december 2021. Bekijk aflevering 365 van seizoen 2021 met VRT NU via de site of app.', + 'duration': 2079.23, + 'episode': 'Het journaal 19u', + 'episode_number': 365, + 'genres': ['Nieuws en actua'], + 'season': 'Het journaal 19u', + 'series': 'Het journaal', + 'thumbnails': [{'url': 'https://images.vrt.be/width1280/2021/12/31/80d5ed00-6a64-11ec-b07d-02b7b76bf47f.jpg'}], + 'timestamp': 1640973600, + }, {}, + ), ( + # thumbnailUrl, {str} + r''' + + ''', { + 'thumbnails': [{'url': 'https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg'}], + }, {}, + ), ( + # no scheme URL + # thumbnail_url, {str} + r''' + + ''', { + 'thumbnails': [{'url': 'https://www.nobelprize.org/images/12693-landscape-medium-gallery.jpg'}], + }, {}, + ), ( + # no scheme URL + # thumbnailURL, {str} + r''' + + ''', { + 'thumbnails': [{'url': 'https://images.ctfassets.net/o78em1y1w4i4/2XrBpSdjPK1OJXAAFYU8iw/76ffd0f25465502c9a704dcfc2aa6c64/Teaser-Elsevier-careers-video-thumbnail.jpg'}], + }, {}, + ), ( + # escaped URL + # image, {"url": {str}} + r''' + + ''', { + 'thumbnails': [{ + 'height': 630, + 'url': 'https://assets1.cbsnewsstatic.com/hub/i/r/2025/08/29/597b721d-9c95-424d-8720-05df6b8a4a4e/thumbnail/1200x630/45e846ad5f209972ab225651b40d0b4d/cbsn-fusion-al-gore-on-20-years-since-hurricane-katrina-thumbnail.jpg', + 'width': 1200, + }], + }, {}, + ), ( + # nested width/height + # image, {"url": {str}} + r''' + + ''', { + 'thumbnails': [{ + 'height': 576, + 'url': 'https://ichef.bbci.co.uk/ace/standard/1024/cpsprodpb/398e/live/1cc4dab0-8689-11f0-9cf6-cbf3e73ce2b9.jpg', + 'width': 1024, + }], + }, {}, + ), ( + # image, {"url": [{str}]} + r''' + + ''', { + 'thumbnails': [{ + 'height': 1103, + 'url': 'https://imagenes.elpais.com/resizer/v2/JUFK7HEHNNAIVJXC3WVUXNZT7M.jpg?auth=350ef4714331cf2e29299b840f86da4eb8b0ddde3a3bdb3f4302f0c90a9ae2d6&width=1960&height=1103&smart=true', + 'width': 1960, + }, { + 'height': 1470, + 'url': 'https://imagenes.elpais.com/resizer/v2/JUFK7HEHNNAIVJXC3WVUXNZT7M.jpg?auth=350ef4714331cf2e29299b840f86da4eb8b0ddde3a3bdb3f4302f0c90a9ae2d6&width=1960&height=1470&smart=true', + 'width': 1960, + }, { + 'height': 980, + 'url': 'https://imagenes.elpais.com/resizer/v2/JUFK7HEHNNAIVJXC3WVUXNZT7M.jpg?auth=350ef4714331cf2e29299b840f86da4eb8b0ddde3a3bdb3f4302f0c90a9ae2d6&width=980&height=980&smart=true', + 'width': 980, + }], + }, {}, + ), ( + # image, [{"contentUrl": {str}}] + r''' + + ''', { + 'thumbnails': [{ + 'height': 900, + 'url': 'https://www.planet-wissen.de/sendungen/lebensglueck-frau-sprung-strand-100~_v-gseagaleriexl.jpg', + 'width': 1600, + }, { + 'height': 1152, + 'url': 'https://www.planet-wissen.de/sendungen/lebensglueck-frau-sprung-strand-100~_v-HintergrundL.jpg', + 'width': 1536, + }], + }, {}, + ), ( + # duplicate thumbnails + # image, [{"url": {str}}], + # thumbnail, [{"url": {str}}] + # thumbnailUrl, [{str}] + r''' + + ''', { + 'thumbnails': [{ + 'height': 1080, + 'url': 'https://images.sportschau.de/image/031d1445-9620-4b24-b1f5-9cbbff422f21/AAABmQQq_vA/AAABmKJiLwU/16x9-1920/leverkusen-322.jpg', + 'width': 1920, + }, { + 'height': 1400, + 'url': 'https://images.sportschau.de/image/031d1445-9620-4b24-b1f5-9cbbff422f21/AAABmQQq_vA/AAABmKJibh0/1x1-1400/leverkusen-322.jpg', + 'width': 1400, + }, { + 'height': 960, + 'url': 'https://images.sportschau.de/image/031d1445-9620-4b24-b1f5-9cbbff422f21/AAABmQQq_vA/AAABmKJiwCE/4x3/leverkusen-322.jpg?width=1280', + 'width': 1280, + }], + }, {}, + ), ( + # dateCreated, \d{4} + r''' + + ''', {'release_year': 2025}, {}, + ), ( + # dateCreated, \d{4}-\d{2}-\d{2} + r''' + + ''', {'upload_date': '20250901'}, {}, + ), ( + # dateCreated, ISO 8601 + r''' + + ''', {'timestamp': 1756684800}, {}, + )] for html, expected_dict, search_json_ld_kwargs in _TESTS: expect_dict( self, diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 7d7beca0ea..6b6a83a300 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -72,11 +72,13 @@ from ..utils import ( mimetype2ext, netrc_from_content, orderedSet, + parse_age_limit, parse_bitrate, parse_codecs, parse_duration, parse_iso8601, parse_m3u8_attributes, + parse_qs, parse_resolution, qualities, sanitize_url, @@ -84,13 +86,11 @@ from ..utils import ( str_or_none, str_to_int, strip_or_none, - traverse_obj, truncate_string, try_call, try_get, unescapeHTML, unified_strdate, - unified_timestamp, url_basename, url_or_none, urlhandle_detect_ext, @@ -102,6 +102,7 @@ from ..utils import ( ) from ..utils._utils import _request_dump_filename from ..utils.jslib import devalue +from ..utils.traversal import traverse_obj class InfoExtractor: @@ -1673,38 +1674,76 @@ class InfoExtractor: chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration'] info['chapters'] = chapters + def extract_thumbnail_information(e): + thumbnails = traverse_obj(e, (( + 'image', 'thumbnail', 'thumbnailUrl', 'thumbnailURL', 'thumbnail_url', + ), ( + ({str}, {url_or_none}, {'url': None}, filter), + ({dict}, 'url', {list}, ..., {'url': None}, filter), + (({list}, ({dict}, all)), lambda _, v: + url_or_none(v.get('url')) or url_or_none(v.get('contentUrl'))), + ({list}, ..., {str}, {url_or_none}, {'url': None}, filter), + ), { + 'height': ('height', (None, 'value'), {int_or_none}, any), + 'url': (('contentUrl', 'url'), {str}, {unescapeHTML}, {self._proto_relative_url}, any), + 'width': ('width', (None, 'value'), {int_or_none}, any), + }, all, {orderedSet}, lambda _, v: url_or_none(v['url']))) + + dim_keys, url_table = {'height', 'width'}, {} + for thumbnail in thumbnails: + url = thumbnail['url'] + + query = parse_qs(thumbnail['url']) + for key, alt_keys in ( + ('height', ('height', 'h')), + ('width', ('width', 'w')), + ): + val = traverse_obj(query, (alt_keys, -1, {int_or_none}, any)) + if val is not None: + thumbnail.setdefault(key, val) + + res = parse_resolution(url_basename(url)) + for key in dim_keys: + val = res.get(key) + if val is not None: + thumbnail.setdefault(key, val) + + current = url_table.get(url) + if not current or len(dim_keys & thumbnail.keys()) > len(dim_keys & current.keys()): + url_table[url] = thumbnail + + info['thumbnails'] = list(url_table.values()) or None + def extract_video_object(e): - author = e.get('author') - info.update({ - 'url': url_or_none(e.get('contentUrl')), - 'ext': mimetype2ext(e.get('encodingFormat')), - 'title': unescapeHTML(e.get('name')), - 'description': unescapeHTML(e.get('description')), - 'thumbnails': traverse_obj(e, (('thumbnailUrl', 'thumbnailURL', 'thumbnail_url'), (None, ...), { - 'url': ({str}, {unescapeHTML}, {self._proto_relative_url}, {url_or_none}), - })), - 'duration': parse_duration(e.get('duration')), - 'timestamp': unified_timestamp(e.get('uploadDate')), - # author can be an instance of 'Organization' or 'Person' types. - # both types can have 'name' property(inherited from 'Thing' type). [1] - # however some websites are using 'Text' type instead. - # 1. https://schema.org/VideoObject - 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None, - 'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str), - 'filesize': int_or_none(float_or_none(e.get('contentSize'))), - 'tbr': int_or_none(e.get('bitrate')), - 'width': int_or_none(e.get('width')), - 'height': int_or_none(e.get('height')), - 'view_count': int_or_none(e.get('interactionCount')), - 'tags': try_call(lambda: e.get('keywords').split(',')), - }) + info.update(traverse_obj(e, { + 'ext': ('encodingFormat', {mimetype2ext}), + 'title': ('name', {clean_html}, filter), + 'age_limit': ('isFamilyFriendly', {str}, {lambda x: 18 if x.lower() in ('false', '0') else None}), + 'artists': (('byArtist', 'name'), {clean_html}, filter, all, {orderedSet}, filter), + 'description': ('description', {clean_html}, filter), + 'duration': ('duration', {parse_duration}), + 'filesize': ('contentSize', {float_or_none}, {int_or_none}), + 'genres': ('genre', {clean_html}, filter, all, {orderedSet}, filter), + 'height': ('height', {int_or_none}), + 'is_live': ('publication', 'isLiveBroadcast', {bool}), + 'release_timestamp': ('datePublished', {parse_iso8601}), + 'tags': ('keywords', (None, ...), {clean_html}, + {lambda x: x.split(',')}, ..., {str.strip}, filter, all, {orderedSet}, filter), + 'tbr': ('bitrate', {int_or_none}), + 'timestamp': ('uploadDate', {parse_iso8601}), + 'uploader': ('author', (None, 'name'), {clean_html}, filter, any), + 'url': ('contentUrl', {self._proto_relative_url}, {url_or_none}), + 'view_count': ('interactionCount', {int_or_none}), + 'width': ('width', {int_or_none}), + })) if is_type(e, 'AudioObject'): info.update({ + 'abr': traverse_obj(e, ('bitrate', {int_or_none})), 'vcodec': 'none', - 'abr': int_or_none(e.get('bitrate')), }) extract_interaction_statistic(e) extract_chapter_information(e) + extract_thumbnail_information(e) def traverse_json_ld(json_ld, at_top_level=True): for e in variadic(json_ld): @@ -1717,50 +1756,70 @@ class InfoExtractor: continue if expected_type is not None and not is_type(e, expected_type): continue - rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none) + rating = traverse_obj(e, ('aggregateRating', 'ratingValue', {float_or_none})) if rating is not None: info['average_rating'] = rating if is_type(e, 'TVEpisode', 'Episode', 'PodcastEpisode'): - episode_name = unescapeHTML(e.get('name')) - info.update({ - 'episode': episode_name, - 'episode_number': int_or_none(e.get('episodeNumber')), - 'description': unescapeHTML(e.get('description')), - }) - if not info.get('title') and episode_name: - info['title'] = episode_name + info.update(traverse_obj(e, { + 'id': ('identifier', {str_or_none}), + 'ext': ('encodingFormat', {mimetype2ext}), + 'title': (('title', 'name'), {clean_html}, filter, any), + 'creators': ('productionCompany', {clean_html}, filter, all, {orderedSet}, filter), + 'description': ('description', {clean_html}, filter), + 'duration': ((('duration', {parse_duration}), ('timeRequired', {int_or_none})), any), + 'episode': ('name', {clean_html}, filter), + 'episode_number': ('episodeNumber', {int_or_none}), + 'genres': ('genre', ..., {clean_html}, filter, all, {orderedSet}, filter), + 'release_timestamp': ('datePublished', {parse_iso8601}), + })) + extract_thumbnail_information(e) part_of_season = e.get('partOfSeason') if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'): - info.update({ - 'season': unescapeHTML(part_of_season.get('name')), - 'season_number': int_or_none(part_of_season.get('seasonNumber')), - }) + info.update(traverse_obj(e, { + 'season': ('name', {clean_html}, filter), + 'season_number': ('seasonNumber', {int_or_none}), + })) part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'): - info['series'] = unescapeHTML(part_of_series.get('name')) + info['series'] = traverse_obj(part_of_series, ('name', {clean_html}, filter)) elif is_type(e, 'Movie'): - info.update({ - 'title': unescapeHTML(e.get('name')), - 'description': unescapeHTML(e.get('description')), - 'duration': parse_duration(e.get('duration')), - 'timestamp': unified_timestamp(e.get('dateCreated')), - }) + info.update(traverse_obj(e, { + 'title': ('name', {clean_html}, filter), + 'age_limit': ('contentRating', {parse_age_limit}), + 'cast': ('actor', ..., 'name', {clean_html}, filter, all, {orderedSet}, filter), + 'creators': ('director', (None, ((None, ...), 'name')), {clean_html}, filter, all, {orderedSet}, filter), + 'description': ('description', {clean_html}, filter), + 'duration': ('duration', {parse_duration}), + 'genres': ('genre', (None, ...), {clean_html}, filter, all, {orderedSet}, filter), + 'release_timestamp': ('datePublished', {parse_iso8601}), + })) + extract_thumbnail_information(e) + if date := traverse_obj(e, ('dateCreated', {str_or_none})): + if re.fullmatch(r'\d{4}', date): + info['release_year'] = int_or_none(date) + elif re.fullmatch(r'\d{4}-\d{2}-\d{2}', date): + info['upload_date'] = unified_strdate(date) + else: + info['timestamp'] = parse_iso8601(date) elif is_type(e, 'Article', 'NewsArticle'): - info.update(**traverse_obj(e, { + info.update(traverse_obj(e, { 'title': ('headline', {clean_html}, filter), 'alt_title': ('alternativeHeadline', {clean_html}, filter), - 'categories': ('articleSection', {clean_html}, filter, all, filter), - 'creators': ('author', (None, 'name'), {clean_html}, filter, all, filter), + 'categories': ('articleSection', (None, ...), {clean_html}, filter, all, {orderedSet}, filter), + 'comment_count': ('commentCount', {int_or_none}), + 'creators': ('author', (None, ...), 'name', {clean_html}, filter, all, {orderedSet}, filter), 'description': (('description', 'articleBody'), {clean_html}, filter, any), + 'duration': ('timeRequired', {int_or_none}), + 'genres': ('genre', (None, ...), {clean_html}, filter, all, {orderedSet}, filter), + 'location': ('contentLocation', 'name', {clean_html}, filter), 'modified_timestamp': ('dateModified', {parse_iso8601}), 'release_timestamp': ('datePublished', {parse_iso8601}), - 'tags': ('keywords', {clean_html}, {lambda x: x.split(',')}, ..., {str.strip}, filter, all, filter), - 'thumbnails': ('image', ..., { - 'url': ({str}, {unescapeHTML}, {self._proto_relative_url}, {url_or_none}), - }), + 'tags': ('keywords', (None, ...), {clean_html}, + {lambda x: x.split(',')}, ..., {str.strip}, filter, all, {orderedSet}, filter), 'timestamp': ('dateCreated', {parse_iso8601}), 'uploader': ('publisher', 'name', {clean_html}, filter), })) + extract_thumbnail_information(e) if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'): extract_video_object(e['video'][0]) elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):