pull/13395/merge
doe1080 3 days ago committed by GitHub
commit 774f00daf4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -132,17 +132,20 @@ class TestInfoExtractor(unittest.TestCase):
self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True) self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True)
def test_search_json_ld_realworld(self): def test_search_json_ld_realworld(self):
_TESTS = [ _TESTS = [(
# https://github.com/ytdl-org/youtube-dl/issues/23306 # https://github.com/ytdl-org/youtube-dl/issues/23306
( r'''
r'''<script type="application/ld+json"> <script type="application/ld+json">
{ {
"@context": "http://schema.org/", "@context": "http://schema.org/",
"@type": "VideoObject", "@type": "VideoObject",
"name": "1 On 1 With Kleio", "name": "1 On 1 With Kleio",
"url": "https://www.eporner.com/hd-porn/xN49A1cT3eB/1-On-1-With-Kleio/", "url": "https://www.eporner.com/hd-porn/xN49A1cT3eB/1-On-1-With-Kleio/",
"duration": "PT0H12M23S", "duration": "PT0H12M23S",
"thumbnailUrl": ["https://static-eu-cdn.eporner.com/thumbs/static4/7/78/780/780814/9_360.jpg", "https://imggen.eporner.com/780814/1920/1080/9.jpg"], "thumbnailUrl": [
"https://static-eu-cdn.eporner.com/thumbs/static4/7/78/780/780814/9_360.jpg",
"https://imggen.eporner.com/780814/1920/1080/9.jpg"
],
"contentUrl": "https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4", "contentUrl": "https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4",
"embedUrl": "https://www.eporner.com/embed/xN49A1cT3eB/1-On-1-With-Kleio/", "embedUrl": "https://www.eporner.com/embed/xN49A1cT3eB/1-On-1-With-Kleio/",
"image": "https://static-eu-cdn.eporner.com/thumbs/static4/7/78/780/780814/9_360.jpg", "image": "https://static-eu-cdn.eporner.com/thumbs/static4/7/78/780/780814/9_360.jpg",
@ -155,104 +158,140 @@ class TestInfoExtractor(unittest.TestCase):
"uploadDate": "2015-12-05T21:24:35+01:00", "uploadDate": "2015-12-05T21:24:35+01:00",
"interactionStatistic": { "interactionStatistic": {
"@type": "InteractionCounter", "@type": "InteractionCounter",
"interactionType": { "@type": "http://schema.org/WatchAction" }, "interactionType": {
"@type": "http://schema.org/WatchAction"
},
"userInteractionCount": 1120958 "userInteractionCount": 1120958
}, "aggregateRating": { },
"aggregateRating": {
"@type": "AggregateRating", "@type": "AggregateRating",
"ratingValue": "88", "ratingValue": "88",
"ratingCount": "630", "ratingCount": "630",
"bestRating": "100", "bestRating": "100",
"worstRating": "0" "worstRating": "0"
}, "actor": [{ },
"actor": [{
"@type": "Person", "@type": "Person",
"name": "Kleio Valentien", "name": "Kleio Valentien",
"url": "https://www.eporner.com/pornstar/kleio-valentien/" "url": "https://www.eporner.com/pornstar/kleio-valentien/"
}]} }]
</script>''', }
{ </script>
''', {
'ext': 'mp4',
'title': '1 On 1 With Kleio', 'title': '1 On 1 With Kleio',
'age_limit': 18,
'artists': ['1 On 1 With Kleio'],
'average_rating': 88,
'description': 'Kleio Valentien', 'description': 'Kleio Valentien',
'url': 'https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4', 'duration': 743,
'height': 1080,
'thumbnails': [
{'url': 'https://static-eu-cdn.eporner.com/thumbs/static4/7/78/780/780814/9_360.jpg'},
{'url': 'https://imggen.eporner.com/780814/1920/1080/9.jpg'},
],
'timestamp': 1449347075, 'timestamp': 1449347075,
'duration': 743.0, 'url': 'https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4',
'view_count': 1120958, 'view_count': 1120958,
'width': 1920, 'width': 1920,
'height': 1080, }, {},
}, ), (
{}, # https://github.com/yt-dlp/yt-dlp/pull/1983
), r'''
( <script type="application/ld+json">
r'''<script type="application/ld+json">
{ {
"@context": "https://schema.org", "@context": "https://schema.org",
"@graph": [ "@graph": [{
{
"@type": "NewsArticle", "@type": "NewsArticle",
"mainEntityOfPage": { "name": "Rewilding Sharks - Inside the lucrative trade of shark fishing in Indonesia",
"@type": "WebPage", "headline": "Rewilding Sharks - Inside the lucrative trade of shark fishing in Indonesia",
"@id": "https://www.ant1news.gr/Society/article/620286/symmoria-anilikon-dikigoros-thymaton-ithelan-na-toys-apoteleiosoyn" "description": "“Arif”, an environmental journalist investigating the shark fishing trade in Surabaya, Indonesia, speaks to industry insiders to understand the business — from the port where fishermen sell a wide variety of sharks, to a drying facility where sharks are processed before exporters pick them up. Shark fishing is l",
}, "about": [
"headline": "Συμμορία ανηλίκων δικηγόρος θυμάτων: ήθελαν να τους αποτελειώσουν", "leopard sharks",
"name": "Συμμορία ανηλίκων δικηγόρος θυμάτων: ήθελαν να τους αποτελειώσουν", "Rewilding Sharks",
"description": "Τα παιδιά δέχθηκαν την επίθεση επειδή αρνήθηκαν να γίνουν μέλη της συμμορίας, ανέφερε ο Γ. Ζαχαρόπουλος.", "sustainability",
"animals"
],
"image": { "image": {
"@type": "ImageObject", "@type": "ImageObject",
"url": "https://ant1media.azureedge.net/imgHandler/1100/a635c968-be71-447c-bf9c-80d843ece21e.jpg", "url": "https://dam.mediacorp.sg/image/upload/s--0VYzW7We--/c_fill,g_auto,h_338,w_600/f_auto,q_auto/v1/mediacorp/cna/image/2025/03/17/1742148440-image.jpg?itok=rav-cQ_p",
"width": 1100, "width": "100",
"height": 756 }, "height": "100"
"datePublished": "2021-11-10T08:50:00+03:00", },
"dateModified": "2021-11-10T08:52:53+03:00", "datePublished": "2025-03-17T01:58:00+08:00",
"dateModified": "2025-04-08T12:07:12+08:00",
"author": { "author": {
"@type": "Person", "@type": "Person",
"@id": "https://www.ant1news.gr/", "@id": "https://www.channelnewsasia.com/",
"name": "Ant1news", "name": "CNA",
"image": "https://www.ant1news.gr/images/logo-e5d7e4b3e714c88e8d2eca96130142f6.png", "url": "https://www.channelnewsasia.com/"
"url": "https://www.ant1news.gr/"
}, },
"publisher": { "publisher": {
"@type": "Organization", "@type": "Organization",
"@id": "https://www.ant1news.gr#publisher", "@id": "https://www.channelnewsasia.com/",
"name": "Ant1news", "name": "CNA",
"url": "https://www.ant1news.gr", "url": "https://www.channelnewsasia.com/",
"logo": { "logo": {
"@type": "ImageObject", "@type": "ImageObject",
"url": "https://www.ant1news.gr/images/logo-e5d7e4b3e714c88e8d2eca96130142f6.png", "url": "https://www.channelnewsasia.com/sites/default/themes/mc_cna_theme/images/logo.svg",
"width": 400, "width": "100",
"height": 400 }, "height": "100"
"sameAs": [
"https://www.facebook.com/Ant1news.gr",
"https://twitter.com/antennanews",
"https://www.youtube.com/channel/UC0smvAbfczoN75dP0Hw4Pzw",
"https://www.instagram.com/ant1news/"
]
},
"keywords": "μαχαίρωμα,συμμορία ανηλίκων,ΕΙΔΗΣΕΙΣ,ΕΙΔΗΣΕΙΣ ΣΗΜΕΡΑ,ΝΕΑ,Κοινωνία - Ant1news",
"articleSection": "Κοινωνία"
} }
] },
"mainEntityOfPage": "https://www.channelnewsasia.com/watch/rewilding-sharks/inside-lucrative-trade-shark-fishing-indonesia-5004256"
}, {
"@type": "VideoObject",
"thumbnailUrl": "https://cf-images.ap-southeast-1.prod.boltdns.net/v1/static/6057984932001/b49a7cc0-bbd3-4634-8049-756f0bf3d0c3/3e2f7ea5-0290-4760-889f-b084117a46e8/1280x720/match/image.jpg",
"uploadDate": "2025-04-08T12:07:12+08:00",
"description": "“Arif”, an environmental journalist investigating the shark fishing trade in Surabaya, Indonesia, speaks to industry insiders to understand the business — from the port where fishermen sell a wide variety of sharks, to a drying facility where sharks are processed before exporters pick them up. Shark fishing is legal in Indonesia, and sharks bring in good money. Some species are highly sought after. For example, leopard sharks are prized for their special skin. A fisherman revealed that he gets requests for up to 600kg of leopard shark in a month, worth about 282 million rupiah (US$17,000).",
"name": "Inside the lucrative trade of shark fishing in Indonesia",
"@id": "https://www.channelnewsasia.com/watch/rewilding-sharks/inside-lucrative-trade-shark-fishing-indonesia-5004256",
"duration": "PT472S",
"embedUrl": "https://www.channelnewsasia.com/watch/rewilding-sharks/inside-lucrative-trade-shark-fishing-indonesia-5004256?view=embed",
"contentUrl": "https://www.channelnewsasia.com/watch/rewilding-sharks/inside-lucrative-trade-shark-fishing-indonesia-5004256"
}]
} }
</script>''', </script>
''', {
'title': 'Rewilding Sharks - Inside the lucrative trade of shark fishing in Indonesia',
'creators': ['CNA'],
'description': 'md5:4ce967a72d546b32935cb98c8722346b',
'modified_timestamp': 1744085232,
'release_timestamp': 1742147880,
'thumbnails': [{
'height': 100,
'url': 'https://dam.mediacorp.sg/image/upload/s--0VYzW7We--/c_fill,g_auto,h_338,w_600/f_auto,q_auto/v1/mediacorp/cna/image/2025/03/17/1742148440-image.jpg?itok=rav-cQ_p',
'width': 100,
}],
}, {'expected_type': 'NewsArticle'},
), (
# https://github.com/yt-dlp/yt-dlp/pull/2031
r'''
<script type="application/ld+json">
{ {
'timestamp': 1636523400, "url":"/vrtnu/a-z/het-journaal/2021/het-journaal-het-journaal-19u-20211231/",
'title': 'md5:91fe569e952e4d146485740ae927662b',
},
{'expected_type': 'NewsArticle'},
),
(
r'''<script type="application/ld+json">
{"url":"/vrtnu/a-z/het-journaal/2021/het-journaal-het-journaal-19u-20211231/",
"name":"Het journaal 19u", "name":"Het journaal 19u",
"description":"Het journaal 19u van vrijdag 31 december 2021.", "description":"Het journaal 19u van vrijdag 31 december 2021.",
"potentialAction":{"url":"https://vrtnu.page.link/pfVy6ihgCAJKgHqe8","@type":"ShareAction"}, "potentialAction":{
"mainEntityOfPage":{"@id":"1640092242445","@type":"WebPage"}, "url":"https://vrtnu.page.link/pfVy6ihgCAJKgHqe8",
"@type":"ShareAction"
},
"mainEntityOfPage":{
"@id":"1640092242445",
"@type":"WebPage"
},
"publication":[{ "publication":[{
"startDate":"2021-12-31T19:00:00.000+01:00", "startDate":"2021-12-31T19:00:00.000+01:00",
"endDate":"2022-01-30T23:55:00.000+01:00", "endDate":"2022-01-30T23:55:00.000+01:00",
"publishedBy":{"name":"een","@type":"Organization"}, "publishedBy":{
"publishedOn":{"url":"https://www.vrt.be/vrtnu/","name":"VRT NU","@type":"BroadcastService"}, "name":"een",
"@type":"Organization"
},
"publishedOn":{
"url":"https://www.vrt.be/vrtnu/",
"name":"VRT NU",
"@type":"BroadcastService"
},
"@id":"pbs-pub-3a7ec233-da95-4c1e-9b2b-cf5fdfebcbe8", "@id":"pbs-pub-3a7ec233-da95-4c1e-9b2b-cf5fdfebcbe8",
"@type":"BroadcastEvent" "@type":"BroadcastEvent"
}], }],
@ -280,11 +319,24 @@ class TestInfoExtractor(unittest.TestCase):
}, },
"genre":["Nieuws en actua"], "genre":["Nieuws en actua"],
"episodeNumber":365, "episodeNumber":365,
"partOfSeries":{"name":"Het journaal","@id":"222831405527","@type":"TVSeries"}, "partOfSeries":{
"partOfSeason":{"name":"Seizoen 2021","@id":"961809365527","@type":"TVSeason"}, "name":"Het journaal",
"@context":"https://schema.org","@id":"961685295527","@type":"TVEpisode"}</script> "@id":"222831405527",
''', "@type":"TVSeries"
{ },
"partOfSeason":{
"name":"Seizoen 2021",
"@id":"961809365527",
"@type":"TVSeason"
},
"@context":"https://schema.org",
"@id":"961685295527",
"@type":"TVEpisode"
}
</script>
''', {
'title': 'Het journaal - Aflevering 365 (Seizoen 2021)',
'artists': ['Het journaal - Aflevering 365 (Seizoen 2021)'],
'chapters': [ 'chapters': [
{'title': 'Explosie Turnhout', 'start_time': 70, 'end_time': 440}, {'title': 'Explosie Turnhout', 'start_time': 70, 'end_time': 440},
{'title': 'Jaarwisseling', 'start_time': 440, 'end_time': 1179}, {'title': 'Jaarwisseling', 'start_time': 440, 'end_time': 1179},
@ -297,50 +349,279 @@ class TestInfoExtractor(unittest.TestCase):
{'title': 'Olympische Winterspelen', 'start_time': 1728, 'end_time': 1873}, {'title': 'Olympische Winterspelen', 'start_time': 1728, 'end_time': 1873},
{'title': 'Sober oudjaar in Nederland', 'start_time': 1873, 'end_time': 2079.23}, {'title': 'Sober oudjaar in Nederland', 'start_time': 1873, 'end_time': 2079.23},
], ],
'title': 'Het journaal - Aflevering 365 (Seizoen 2021)', 'description': 'Het journaal 19u van vrijdag 31 december 2021. Bekijk aflevering 365 van seizoen 2021 met VRT NU via de site of app.',
'duration': 2079.23,
'episode': 'Het journaal 19u',
'episode_number': 365,
'genres': ['Nieuws en actua'],
'season': 'Het journaal 19u',
'series': 'Het journaal',
'thumbnails': [{'url': 'https://images.vrt.be/width1280/2021/12/31/80d5ed00-6a64-11ec-b07d-02b7b76bf47f.jpg'}],
'timestamp': 1640973600,
}, {}, }, {},
), ), (
( # thumbnailUrl, {str}
# test multiple thumbnails in a list
r''' r'''
<script type="application/ld+json"> <script type="application/ld+json">
{"@context":"https://schema.org",
"@type":"VideoObject",
"thumbnailUrl":["https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg"]}
</script>''',
{ {
"@context":"https://schema.org",
"@type":"VideoObject",
"thumbnailUrl":"https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg"
}
</script>
''', {
'thumbnails': [{'url': 'https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg'}], 'thumbnails': [{'url': 'https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg'}],
}, }, {},
{}, ), (
), # no scheme URL
( # thumbnail_url, {str}
# test single thumbnail
r''' r'''
<script type="application/ld+json"> <script type="application/ld+json">
{"@context":"https://schema.org", {
"@context": "https://schema.org",
"@type": "VideoObject", "@type": "VideoObject",
"thumbnailUrl":"https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg"} "thumbnail_url": "//www.nobelprize.org/images/12693-landscape-medium-gallery.jpg"
</script>''', }
</script>
''', {
'thumbnails': [{'url': 'https://www.nobelprize.org/images/12693-landscape-medium-gallery.jpg'}],
}, {},
), (
# no scheme URL
# thumbnailURL, {str}
r'''
<script type="application/ld+json">
{ {
'thumbnails': [{'url': 'https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg'}], "@context": "https://schema.org/",
"@type": "VideoObject",
"thumbnailURL": "//images.ctfassets.net/o78em1y1w4i4/2XrBpSdjPK1OJXAAFYU8iw/76ffd0f25465502c9a704dcfc2aa6c64/Teaser-Elsevier-careers-video-thumbnail.jpg"
}
</script>
''', {
'thumbnails': [{'url': 'https://images.ctfassets.net/o78em1y1w4i4/2XrBpSdjPK1OJXAAFYU8iw/76ffd0f25465502c9a704dcfc2aa6c64/Teaser-Elsevier-careers-video-thumbnail.jpg'}],
}, {},
), (
# escaped URL
# image, {"url": {str}}
r'''
<script type="application/ld+json">
{
"@context": "https:\/\/schema.org",
"@type": "NewsArticle",
"image": {
"@context": "https:\/\/schema.org",
"@type": "ImageObject",
"height": 630,
"width": 1200,
"url": "https:\/\/assets1.cbsnewsstatic.com\/hub\/i\/r\/2025\/08\/29\/597b721d-9c95-424d-8720-05df6b8a4a4e\/thumbnail\/1200x630\/45e846ad5f209972ab225651b40d0b4d\/cbsn-fusion-al-gore-on-20-years-since-hurricane-katrina-thumbnail.jpg"
}
}
</script>
''', {
'thumbnails': [{
'height': 630,
'url': 'https://assets1.cbsnewsstatic.com/hub/i/r/2025/08/29/597b721d-9c95-424d-8720-05df6b8a4a4e/thumbnail/1200x630/45e846ad5f209972ab225651b40d0b4d/cbsn-fusion-al-gore-on-20-years-since-hurricane-katrina-thumbnail.jpg',
'width': 1200,
}],
}, {},
), (
# nested width/height
# image, {"url": {str}}
r'''
<script type="application/ld+json">
{
"@context": "http://schema.org",
"@type": "NewsArticle",
"image": {
"@type": "ImageObject",
"width": {
"@type": "QuantitativeValue",
"unitText": "px",
"value": 1024
}, },
{}, "height": {
), "@type": "QuantitativeValue",
( "unitText": "px",
# test thumbnail_url key without URL scheme "value": 576
},
"url": "https://ichef.bbci.co.uk/ace/standard/1024/cpsprodpb/398e/live/1cc4dab0-8689-11f0-9cf6-cbf3e73ce2b9.jpg"
}
}
</script>
''', {
'thumbnails': [{
'height': 576,
'url': 'https://ichef.bbci.co.uk/ace/standard/1024/cpsprodpb/398e/live/1cc4dab0-8689-11f0-9cf6-cbf3e73ce2b9.jpg',
'width': 1024,
}],
}, {},
), (
# image, {"url": [{str}]}
r''' r'''
<script type="application/ld+json"> <script type="application/ld+json">
{ {
"@context": "https://schema.org", "@context": "https://schema.org/",
"@type": [
"NewsArticle",
"Article"
],
"image": {
"@type": "ImageObject",
"url": [
"https://imagenes.elpais.com/resizer/v2/JUFK7HEHNNAIVJXC3WVUXNZT7M.jpg?auth=350ef4714331cf2e29299b840f86da4eb8b0ddde3a3bdb3f4302f0c90a9ae2d6&width=1960&height=1103&smart=true",
"https://imagenes.elpais.com/resizer/v2/JUFK7HEHNNAIVJXC3WVUXNZT7M.jpg?auth=350ef4714331cf2e29299b840f86da4eb8b0ddde3a3bdb3f4302f0c90a9ae2d6&width=1960&height=1470&smart=true",
"https://imagenes.elpais.com/resizer/v2/JUFK7HEHNNAIVJXC3WVUXNZT7M.jpg?auth=350ef4714331cf2e29299b840f86da4eb8b0ddde3a3bdb3f4302f0c90a9ae2d6&width=980&height=980&smart=true"
]
}
}
</script>
''', {
'thumbnails': [{
'height': 1103,
'url': 'https://imagenes.elpais.com/resizer/v2/JUFK7HEHNNAIVJXC3WVUXNZT7M.jpg?auth=350ef4714331cf2e29299b840f86da4eb8b0ddde3a3bdb3f4302f0c90a9ae2d6&width=1960&height=1103&smart=true',
'width': 1960,
}, {
'height': 1470,
'url': 'https://imagenes.elpais.com/resizer/v2/JUFK7HEHNNAIVJXC3WVUXNZT7M.jpg?auth=350ef4714331cf2e29299b840f86da4eb8b0ddde3a3bdb3f4302f0c90a9ae2d6&width=1960&height=1470&smart=true',
'width': 1960,
}, {
'height': 980,
'url': 'https://imagenes.elpais.com/resizer/v2/JUFK7HEHNNAIVJXC3WVUXNZT7M.jpg?auth=350ef4714331cf2e29299b840f86da4eb8b0ddde3a3bdb3f4302f0c90a9ae2d6&width=980&height=980&smart=true',
'width': 980,
}],
}, {},
), (
# image, [{"contentUrl": {str}}]
r'''
<script type="application/ld+json">
{
"@context": "https://schema.org/",
"@type": "VideoObject", "@type": "VideoObject",
"thumbnail_url": "//www.nobelprize.org/images/12693-landscape-medium-gallery.jpg" "image": [
}</script>''',
{ {
'thumbnails': [{'url': 'https://www.nobelprize.org/images/12693-landscape-medium-gallery.jpg'}], "@type": "ImageObject",
"contentUrl": "https://www.planet-wissen.de/sendungen/lebensglueck-frau-sprung-strand-100~_v-gseagaleriexl.jpg",
"height": 900,
"width": 1600
}, },
{}, {
), "@type": "ImageObject",
"contentUrl": "https://www.planet-wissen.de/sendungen/lebensglueck-frau-sprung-strand-100~_v-HintergrundL.jpg",
"height": 1152,
"width": 1536
}
]
}
</script>
''', {
'thumbnails': [{
'height': 900,
'url': 'https://www.planet-wissen.de/sendungen/lebensglueck-frau-sprung-strand-100~_v-gseagaleriexl.jpg',
'width': 1600,
}, {
'height': 1152,
'url': 'https://www.planet-wissen.de/sendungen/lebensglueck-frau-sprung-strand-100~_v-HintergrundL.jpg',
'width': 1536,
}],
}, {},
), (
# duplicate thumbnails
# image, [{"url": {str}}],
# thumbnail, [{"url": {str}}]
# thumbnailUrl, [{str}]
r'''
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "VideoObject",
"image": [{
"@type": "ImageObject",
"url": "https://images.sportschau.de/image/031d1445-9620-4b24-b1f5-9cbbff422f21/AAABmQQq_vA/AAABmKJiLwU/16x9-1920/leverkusen-322.jpg",
"width": 1920,
"height": 1080
}, {
"@type": "ImageObject",
"url": "https://images.sportschau.de/image/031d1445-9620-4b24-b1f5-9cbbff422f21/AAABmQQq_vA/AAABmKJibh0/1x1-1400/leverkusen-322.jpg",
"width": 1400,
"height": 1400
}, {
"@type": "ImageObject",
"url": "https://images.sportschau.de/image/031d1445-9620-4b24-b1f5-9cbbff422f21/AAABmQQq_vA/AAABmKJiwCE/4x3/leverkusen-322.jpg?width=1280",
"width": 1280,
"height": 960
}],
"thumbnail": [{
"@type": "ImageObject",
"url": "https://images.sportschau.de/image/031d1445-9620-4b24-b1f5-9cbbff422f21/AAABmQQq_vA/AAABmKJiLwU/16x9-1920/leverkusen-322.jpg",
"width": 1920,
"height": 1080
}, {
"@type": "ImageObject",
"url": "https://images.sportschau.de/image/031d1445-9620-4b24-b1f5-9cbbff422f21/AAABmQQq_vA/AAABmKJibh0/1x1-1400/leverkusen-322.jpg",
"width": 1400,
"height": 1400
}, {
"@type": "ImageObject",
"url": "https://images.sportschau.de/image/031d1445-9620-4b24-b1f5-9cbbff422f21/AAABmQQq_vA/AAABmKJiwCE/4x3/leverkusen-322.jpg?width=1280",
"width": 1280,
"height": 960
}],
"thumbnailUrl": [
"https://images.sportschau.de/image/031d1445-9620-4b24-b1f5-9cbbff422f21/AAABmQQq_vA/AAABmKJiLwU/16x9-1920/leverkusen-322.jpg",
"https://images.sportschau.de/image/031d1445-9620-4b24-b1f5-9cbbff422f21/AAABmQQq_vA/AAABmKJibh0/1x1-1400/leverkusen-322.jpg",
"https://images.sportschau.de/image/031d1445-9620-4b24-b1f5-9cbbff422f21/AAABmQQq_vA/AAABmKJiwCE/4x3/leverkusen-322.jpg?width=1280"
] ]
}
</script>
''', {
'thumbnails': [{
'height': 1080,
'url': 'https://images.sportschau.de/image/031d1445-9620-4b24-b1f5-9cbbff422f21/AAABmQQq_vA/AAABmKJiLwU/16x9-1920/leverkusen-322.jpg',
'width': 1920,
}, {
'height': 1400,
'url': 'https://images.sportschau.de/image/031d1445-9620-4b24-b1f5-9cbbff422f21/AAABmQQq_vA/AAABmKJibh0/1x1-1400/leverkusen-322.jpg',
'width': 1400,
}, {
'height': 960,
'url': 'https://images.sportschau.de/image/031d1445-9620-4b24-b1f5-9cbbff422f21/AAABmQQq_vA/AAABmKJiwCE/4x3/leverkusen-322.jpg?width=1280',
'width': 1280,
}],
}, {},
), (
# dateCreated, \d{4}
r'''
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Movie",
"dateCreated": "2025"
}
</script>
''', {'release_year': 2025}, {},
), (
# dateCreated, \d{4}-\d{2}-\d{2}
r'''
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Movie",
"dateCreated": "2025-09-01"
}
</script>
''', {'upload_date': '20250901'}, {},
), (
# dateCreated, ISO 8601
r'''
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Movie",
"dateCreated": "2025-09-01T00:00:00Z"
}
</script>
''', {'timestamp': 1756684800}, {},
)]
for html, expected_dict, search_json_ld_kwargs in _TESTS: for html, expected_dict, search_json_ld_kwargs in _TESTS:
expect_dict( expect_dict(
self, self,

@ -72,11 +72,13 @@ from ..utils import (
mimetype2ext, mimetype2ext,
netrc_from_content, netrc_from_content,
orderedSet, orderedSet,
parse_age_limit,
parse_bitrate, parse_bitrate,
parse_codecs, parse_codecs,
parse_duration, parse_duration,
parse_iso8601, parse_iso8601,
parse_m3u8_attributes, parse_m3u8_attributes,
parse_qs,
parse_resolution, parse_resolution,
qualities, qualities,
sanitize_url, sanitize_url,
@ -84,13 +86,11 @@ from ..utils import (
str_or_none, str_or_none,
str_to_int, str_to_int,
strip_or_none, strip_or_none,
traverse_obj,
truncate_string, truncate_string,
try_call, try_call,
try_get, try_get,
unescapeHTML, unescapeHTML,
unified_strdate, unified_strdate,
unified_timestamp,
url_basename, url_basename,
url_or_none, url_or_none,
urlhandle_detect_ext, urlhandle_detect_ext,
@ -102,6 +102,7 @@ from ..utils import (
) )
from ..utils._utils import _request_dump_filename from ..utils._utils import _request_dump_filename
from ..utils.jslib import devalue from ..utils.jslib import devalue
from ..utils.traversal import traverse_obj
class InfoExtractor: class InfoExtractor:
@ -1673,38 +1674,76 @@ class InfoExtractor:
chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration'] chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
info['chapters'] = chapters info['chapters'] = chapters
def extract_thumbnail_information(e):
    """Gather thumbnail entries from JSON-LD element `e` into info['thumbnails'].

    Reads the 'image', 'thumbnail', 'thumbnailUrl', 'thumbnailURL' and
    'thumbnail_url' properties of `e`. Each candidate is normalised to a dict
    with a 'url' and, when available, integer 'height'/'width'. Missing
    dimensions are then filled in from the URL itself, and entries sharing a
    URL are collapsed to a single entry.

    Always assigns info['thumbnails']: a list of dicts, or None when no usable
    thumbnail was found.
    """
    candidates = traverse_obj(e, ((
        'image', 'thumbnail', 'thumbnailUrl', 'thumbnailURL', 'thumbnail_url',
    ), (
        # Accepted input shapes, each reduced to dict(s) carrying a URL:
        # a bare URL string
        ({str}, {url_or_none}, {'url': None}, filter),
        # an object whose "url" is a list of URLs
        ({dict}, 'url', {list}, ..., {'url': None}, filter),
        # an object, or a list of objects, with "url" or "contentUrl"
        (({list}, ({dict}, all)), lambda _, v:
            url_or_none(v.get('url')) or url_or_none(v.get('contentUrl'))),
        # a list of bare URL strings
        ({list}, ..., {str}, {url_or_none}, {'url': None}, filter),
    ), {
        # (None, 'value') accepts both a plain number and a nested
        # object that stores the number under "value"
        'height': ('height', (None, 'value'), {int_or_none}, any),
        'url': (('contentUrl', 'url'), {str}, {unescapeHTML}, {self._proto_relative_url}, any),
        'width': ('width', (None, 'value'), {int_or_none}, any),
    }, all, {orderedSet}, lambda _, v: url_or_none(v['url'])))

    dimensions = ('height', 'width')
    # Query-string parameter names that may carry each dimension
    query_aliases = (
        ('height', ('height', 'h')),
        ('width', ('width', 'w')),
    )

    def known_dimensions(entry):
        # Number of dimension fields present on a thumbnail entry
        return sum(1 for field in dimensions if field in entry)

    best_by_url = {}
    for entry in candidates:
        thumb_url = entry['url']

        # Explicit JSON-LD values win; the URL only fills in what is missing.
        # Query-string hints are applied before file-name hints.
        query_params = parse_qs(thumb_url)
        for field, param_names in query_aliases:
            hinted = traverse_obj(query_params, (param_names, -1, {int_or_none}, any))
            if hinted is not None:
                entry.setdefault(field, hinted)

        # presumably picks up a resolution embedded in the file name - confirm
        # against parse_resolution
        name_hints = parse_resolution(url_basename(thumb_url))
        for field in dimensions:
            hinted = name_hints.get(field)
            if hinted is not None:
                entry.setdefault(field, hinted)

        # Of several entries for one URL, keep the first one seen unless a
        # later one knows strictly more dimensions
        previous = best_by_url.get(thumb_url)
        if previous and known_dimensions(entry) <= known_dimensions(previous):
            continue
        best_by_url[thumb_url] = entry

    info['thumbnails'] = list(best_by_url.values()) or None
def extract_video_object(e):
    """Merge the fields of a JSON-LD media object `e` into `info`.

    Only keys that yield a value are written, so values already present in
    `info` are kept when `e` lacks the corresponding property. When `e` is an
    'AudioObject', 'abr' and 'vcodec' are set as well. Interaction statistics,
    chapters and thumbnails are handled by their dedicated helpers, which are
    invoked last.
    """
    info.update(traverse_obj(e, {
        'ext': ('encodingFormat', {mimetype2ext}),
        'title': ('name', {clean_html}, filter),
        # isFamilyFriendly may be a JSON boolean or its string form;
        # str_or_none lets both `false` and "false"/"0" mark the content as 18+
        'age_limit': ('isFamilyFriendly', {str_or_none}, {lambda x: 18 if x.lower() in ('false', '0') else None}),
        # Follow byArtist -> name (single object or list of objects).
        # Branching over ('byArtist', 'name') as sibling keys would report
        # the work's own name as its artist.
        'artists': ('byArtist', (None, ...), 'name', {clean_html}, filter, all, {orderedSet}, filter),
        'description': ('description', {clean_html}, filter),
        'duration': ('duration', {parse_duration}),
        'filesize': ('contentSize', {float_or_none}, {int_or_none}),
        # genre may be a single string or a list of strings
        'genres': ('genre', (None, ...), {clean_html}, filter, all, {orderedSet}, filter),
        'height': ('height', {int_or_none}),
        'is_live': ('publication', 'isLiveBroadcast', {bool}),
        'release_timestamp': ('datePublished', {parse_iso8601}),
        # keywords may be one comma-separated string or a list of such strings
        'tags': ('keywords', (None, ...), {clean_html},
                 {lambda x: x.split(',')}, ..., {str.strip}, filter, all, {orderedSet}, filter),
        'tbr': ('bitrate', {int_or_none}),
        'timestamp': ('uploadDate', {parse_iso8601}),
        # author is either plain text or an object with a "name"
        'uploader': ('author', (None, 'name'), {clean_html}, filter, any),
        'url': ('contentUrl', {self._proto_relative_url}, {url_or_none}),
        'view_count': ('interactionCount', {int_or_none}),
        'width': ('width', {int_or_none}),
    }))
    if is_type(e, 'AudioObject'):
        info.update({
            'abr': traverse_obj(e, ('bitrate', {int_or_none})),
            'vcodec': 'none',
        })
    extract_interaction_statistic(e)
    extract_chapter_information(e)
    extract_thumbnail_information(e)
def traverse_json_ld(json_ld, at_top_level=True): def traverse_json_ld(json_ld, at_top_level=True):
for e in variadic(json_ld): for e in variadic(json_ld):
@ -1717,40 +1756,70 @@ class InfoExtractor:
continue continue
if expected_type is not None and not is_type(e, expected_type): if expected_type is not None and not is_type(e, expected_type):
continue continue
rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none) rating = traverse_obj(e, ('aggregateRating', 'ratingValue', {float_or_none}))
if rating is not None: if rating is not None:
info['average_rating'] = rating info['average_rating'] = rating
if is_type(e, 'TVEpisode', 'Episode', 'PodcastEpisode'): if is_type(e, 'TVEpisode', 'Episode', 'PodcastEpisode'):
episode_name = unescapeHTML(e.get('name')) info.update(traverse_obj(e, {
info.update({ 'id': ('identifier', {str_or_none}),
'episode': episode_name, 'ext': ('encodingFormat', {mimetype2ext}),
'episode_number': int_or_none(e.get('episodeNumber')), 'title': (('title', 'name'), {clean_html}, filter, any),
'description': unescapeHTML(e.get('description')), 'creators': ('productionCompany', {clean_html}, filter, all, {orderedSet}, filter),
}) 'description': ('description', {clean_html}, filter),
if not info.get('title') and episode_name: 'duration': ((('duration', {parse_duration}), ('timeRequired', {int_or_none})), any),
info['title'] = episode_name 'episode': ('name', {clean_html}, filter),
'episode_number': ('episodeNumber', {int_or_none}),
'genres': ('genre', ..., {clean_html}, filter, all, {orderedSet}, filter),
'release_timestamp': ('datePublished', {parse_iso8601}),
}))
extract_thumbnail_information(e)
part_of_season = e.get('partOfSeason') part_of_season = e.get('partOfSeason')
if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'): if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
info.update({ info.update(traverse_obj(e, {
'season': unescapeHTML(part_of_season.get('name')), 'season': ('name', {clean_html}, filter),
'season_number': int_or_none(part_of_season.get('seasonNumber')), 'season_number': ('seasonNumber', {int_or_none}),
}) }))
part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'): if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
info['series'] = unescapeHTML(part_of_series.get('name')) info['series'] = traverse_obj(part_of_series, ('name', {clean_html}, filter))
elif is_type(e, 'Movie'): elif is_type(e, 'Movie'):
info.update({ info.update(traverse_obj(e, {
'title': unescapeHTML(e.get('name')), 'title': ('name', {clean_html}, filter),
'description': unescapeHTML(e.get('description')), 'age_limit': ('contentRating', {parse_age_limit}),
'duration': parse_duration(e.get('duration')), 'cast': ('actor', ..., 'name', {clean_html}, filter, all, {orderedSet}, filter),
'timestamp': unified_timestamp(e.get('dateCreated')), 'creators': ('director', (None, ((None, ...), 'name')), {clean_html}, filter, all, {orderedSet}, filter),
}) 'description': ('description', {clean_html}, filter),
'duration': ('duration', {parse_duration}),
'genres': ('genre', (None, ...), {clean_html}, filter, all, {orderedSet}, filter),
'release_timestamp': ('datePublished', {parse_iso8601}),
}))
extract_thumbnail_information(e)
if date := traverse_obj(e, ('dateCreated', {str_or_none})):
if re.fullmatch(r'\d{4}', date):
info['release_year'] = int_or_none(date)
elif re.fullmatch(r'\d{4}-\d{2}-\d{2}', date):
info['upload_date'] = unified_strdate(date)
else:
info['timestamp'] = parse_iso8601(date)
elif is_type(e, 'Article', 'NewsArticle'): elif is_type(e, 'Article', 'NewsArticle'):
info.update({ info.update(traverse_obj(e, {
'timestamp': parse_iso8601(e.get('datePublished')), 'title': ('headline', {clean_html}, filter),
'title': unescapeHTML(e.get('headline')), 'alt_title': ('alternativeHeadline', {clean_html}, filter),
'description': unescapeHTML(e.get('articleBody') or e.get('description')), 'categories': ('articleSection', (None, ...), {clean_html}, filter, all, {orderedSet}, filter),
}) 'comment_count': ('commentCount', {int_or_none}),
'creators': ('author', (None, ...), 'name', {clean_html}, filter, all, {orderedSet}, filter),
'description': (('description', 'articleBody'), {clean_html}, filter, any),
'duration': ('timeRequired', {int_or_none}),
'genres': ('genre', (None, ...), {clean_html}, filter, all, {orderedSet}, filter),
'location': ('contentLocation', 'name', {clean_html}, filter),
'modified_timestamp': ('dateModified', {parse_iso8601}),
'release_timestamp': ('datePublished', {parse_iso8601}),
'tags': ('keywords', (None, ...), {clean_html},
{lambda x: x.split(',')}, ..., {str.strip}, filter, all, {orderedSet}, filter),
'timestamp': ('dateCreated', {parse_iso8601}),
'uploader': ('publisher', 'name', {clean_html}, filter),
}))
extract_thumbnail_information(e)
if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'): if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
extract_video_object(e['video'][0]) extract_video_object(e['video'][0])
elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'): elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):

Loading…
Cancel
Save