From 0dab25e4358b1efbbf17430e710380d7611a3d55 Mon Sep 17 00:00:00 2001
From: doe1080 <98906116+doe1080@users.noreply.github.com>
Date: Thu, 5 Jun 2025 10:26:34 +0900
Subject: [PATCH 1/6] [ie] Improve JSON LD metadata extraction
---
test/test_InfoExtractor.py | 57 +++++++++++++++++++++++++++++++++++++-
yt_dlp/extractor/common.py | 19 +++++++++----
2 files changed, 70 insertions(+), 6 deletions(-)
diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py
index bc89b2955e..a16bc16eff 100644
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@@ -224,8 +224,15 @@ class TestInfoExtractor(unittest.TestCase):
}
''',
{
- 'timestamp': 1636523400,
'title': 'md5:91fe569e952e4d146485740ae927662b',
+ 'categories': ['Κοινωνία'],
+ 'creators': ['Ant1news'],
+ 'description': 'md5:16756d0a18f33bf550e683d134a72f3c',
+ 'modified_timestamp': 1636523573,
+ 'release_timestamp': 1636523400,
+ 'tags': ['μαχαίρωμα,συμμορία ανηλίκων,ΕΙΔΗΣΕΙΣ,ΕΙΔΗΣΕΙΣ ΣΗΜΕΡΑ,ΝΕΑ,Κοινωνία - Ant1news'],
+ 'thumbnails': [{'url': 'https://ant1media.azureedge.net/imgHandler/1100/a635c968-be71-447c-bf9c-80d843ece21e.jpg'}],
+ 'uploader': 'Ant1news',
},
{'expected_type': 'NewsArticle'},
),
@@ -328,6 +335,54 @@ class TestInfoExtractor(unittest.TestCase):
},
{},
),
+ (
+ r'''
+
+''',
+ {
+ 'title': 'md5:3f077843a74f01f768bbf0853c210855',
+ 'categories': ['Reportages'],
+ 'creators': ['Sabine Dupont'],
+ 'description': 'md5:40eaf402631e0a77d8d74f66574bb978',
+ 'modified_timestamp': 1747319520,
+ 'release_timestamp': 1747319520,
+ 'tags': ['enseignement secondaire'],
+ 'timestamp': 1747319520,
+ 'thumbnails': [{'url': 'https://www.telemb.be/cdn/ff/pKwkkhB7a5GqSf98QdDUcn9WlvGTYyilvXisHO3fHpI/1747320854/public/2025-05/00006554_avc-tmb-093031.jpeg'}],
+ 'uploader': 'Tele MB',
+ },
+ {},
+ ),
]
for html, expected_dict, search_json_ld_kwargs in _TESTS:
expect_dict(
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index 1174bd4f5e..d3abba036f 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1741,11 +1741,20 @@ class InfoExtractor:
'timestamp': unified_timestamp(e.get('dateCreated')),
})
elif is_type(e, 'Article', 'NewsArticle'):
- info.update({
- 'timestamp': parse_iso8601(e.get('datePublished')),
- 'title': unescapeHTML(e.get('headline')),
- 'description': unescapeHTML(e.get('articleBody') or e.get('description')),
- })
+ info.update(**traverse_obj(e, {
+ 'title': ('headline', {str}, {unescapeHTML}),
+ 'categories': ('articleSection', {str}, {unescapeHTML}, filter, all),
+ 'creators': ('author', (None, 'name'), {str}, {unescapeHTML}, filter, all),
+ 'description': (('description', 'articleBody'), {str}, {unescapeHTML}, any),
+ 'modified_timestamp': ('dateModified', {parse_iso8601}),
+ 'release_timestamp': ('datePublished', {parse_iso8601}),
+ 'tags': ('keywords', {str}, {unescapeHTML}, filter, all),
+ 'thumbnails': ('image', ..., {
+ 'url': ({str}, {unescapeHTML}, {self._proto_relative_url}, {url_or_none}),
+ }),
+ 'timestamp': ('dateCreated', {parse_iso8601}),
+ 'uploader': ('publisher', 'name', {str}, {unescapeHTML}),
+ }))
if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
extract_video_object(e['video'][0])
elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
From b9d2858b205e1e891dba4bd0aa00c98362d2821f Mon Sep 17 00:00:00 2001
From: doe1080 <98906116+doe1080@users.noreply.github.com>
Date: Thu, 5 Jun 2025 10:50:05 +0900
Subject: [PATCH 2/6] fix tags
---
test/test_InfoExtractor.py | 4 ++--
yt_dlp/extractor/common.py | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py
index a16bc16eff..d22b61f621 100644
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@@ -230,7 +230,7 @@ class TestInfoExtractor(unittest.TestCase):
'description': 'md5:16756d0a18f33bf550e683d134a72f3c',
'modified_timestamp': 1636523573,
'release_timestamp': 1636523400,
- 'tags': ['μαχαίρωμα,συμμορία ανηλίκων,ΕΙΔΗΣΕΙΣ,ΕΙΔΗΣΕΙΣ ΣΗΜΕΡΑ,ΝΕΑ,Κοινωνία - Ant1news'],
+ 'tags': 'count:6',
'thumbnails': [{'url': 'https://ant1media.azureedge.net/imgHandler/1100/a635c968-be71-447c-bf9c-80d843ece21e.jpg'}],
'uploader': 'Ant1news',
},
@@ -376,7 +376,7 @@ class TestInfoExtractor(unittest.TestCase):
'description': 'md5:40eaf402631e0a77d8d74f66574bb978',
'modified_timestamp': 1747319520,
'release_timestamp': 1747319520,
- 'tags': ['enseignement secondaire'],
+ 'tags': 'count:1',
'timestamp': 1747319520,
'thumbnails': [{'url': 'https://www.telemb.be/cdn/ff/pKwkkhB7a5GqSf98QdDUcn9WlvGTYyilvXisHO3fHpI/1747320854/public/2025-05/00006554_avc-tmb-093031.jpeg'}],
'uploader': 'Tele MB',
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index d3abba036f..74ed840503 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1748,7 +1748,7 @@ class InfoExtractor:
'description': (('description', 'articleBody'), {str}, {unescapeHTML}, any),
'modified_timestamp': ('dateModified', {parse_iso8601}),
'release_timestamp': ('datePublished', {parse_iso8601}),
- 'tags': ('keywords', {str}, {unescapeHTML}, filter, all),
+ 'tags': ('keywords', {str}, {unescapeHTML}, {lambda x: x.split(',')}, filter),
'thumbnails': ('image', ..., {
'url': ({str}, {unescapeHTML}, {self._proto_relative_url}, {url_or_none}),
}),
From e2bb3a52f227897d223f229c285661279e161a42 Mon Sep 17 00:00:00 2001
From: doe1080 <98906116+doe1080@users.noreply.github.com>
Date: Sun, 15 Jun 2025 17:01:00 +0900
Subject: [PATCH 3/6] filter
---
yt_dlp/extractor/common.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index eec1742a48..ba32664e40 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1744,8 +1744,8 @@ class InfoExtractor:
elif is_type(e, 'Article', 'NewsArticle'):
info.update(**traverse_obj(e, {
'title': ('headline', {str}, {unescapeHTML}),
- 'categories': ('articleSection', {str}, {unescapeHTML}, filter, all),
- 'creators': ('author', (None, 'name'), {str}, {unescapeHTML}, filter, all),
+ 'categories': ('articleSection', {str}, {unescapeHTML}, filter, all, filter),
+ 'creators': ('author', (None, 'name'), {str}, {unescapeHTML}, filter, all, filter),
'description': (('description', 'articleBody'), {str}, {unescapeHTML}, any),
'modified_timestamp': ('dateModified', {parse_iso8601}),
'release_timestamp': ('datePublished', {parse_iso8601}),
From f5091a346a02d2ed17d7f95d2a2bbc85b6eee93d Mon Sep 17 00:00:00 2001
From: doe1080 <98906116+doe1080@users.noreply.github.com>
Date: Sun, 15 Jun 2025 17:08:38 +0900
Subject: [PATCH 4/6] alt_title
---
yt_dlp/extractor/common.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index ba32664e40..e2ed97ea6e 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1744,6 +1744,7 @@ class InfoExtractor:
elif is_type(e, 'Article', 'NewsArticle'):
info.update(**traverse_obj(e, {
'title': ('headline', {str}, {unescapeHTML}),
+ 'alt_title': ('alternativeHeadline', {str}, {unescapeHTML}),
'categories': ('articleSection', {str}, {unescapeHTML}, filter, all, filter),
'creators': ('author', (None, 'name'), {str}, {unescapeHTML}, filter, all, filter),
'description': (('description', 'articleBody'), {str}, {unescapeHTML}, any),
From a59c0bd4775fca77e90d54f3842b0d4171f0bf70 Mon Sep 17 00:00:00 2001
From: doe1080 <98906116+doe1080@users.noreply.github.com>
Date: Thu, 19 Jun 2025 11:53:38 +0900
Subject: [PATCH 5/6] filter
---
test/test_InfoExtractor.py | 2 +-
yt_dlp/extractor/common.py | 14 +++++++-------
2 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py
index 8b367ffff7..4f881eb4bb 100644
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@@ -373,7 +373,7 @@ class TestInfoExtractor(unittest.TestCase):
'title': 'md5:3f077843a74f01f768bbf0853c210855',
'categories': ['Reportages'],
'creators': ['Sabine Dupont'],
- 'description': 'md5:40eaf402631e0a77d8d74f66574bb978',
+ 'description': 'md5:1dc04a3aa56c5228503071baa8b4cc97',
'modified_timestamp': 1747319520,
'release_timestamp': 1747319520,
'tags': 'count:1',
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index e2ed97ea6e..5fde840e5c 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1743,19 +1743,19 @@ class InfoExtractor:
})
elif is_type(e, 'Article', 'NewsArticle'):
info.update(**traverse_obj(e, {
- 'title': ('headline', {str}, {unescapeHTML}),
- 'alt_title': ('alternativeHeadline', {str}, {unescapeHTML}),
- 'categories': ('articleSection', {str}, {unescapeHTML}, filter, all, filter),
- 'creators': ('author', (None, 'name'), {str}, {unescapeHTML}, filter, all, filter),
- 'description': (('description', 'articleBody'), {str}, {unescapeHTML}, any),
+ 'title': ('headline', {clean_html}, filter),
+ 'alt_title': ('alternativeHeadline', {clean_html}, filter),
+ 'categories': ('articleSection', {clean_html}, filter, all, filter),
+ 'creators': ('author', (None, 'name'), {clean_html}, filter, all, filter),
+ 'description': (('description', 'articleBody'), {clean_html}, filter, any),
'modified_timestamp': ('dateModified', {parse_iso8601}),
'release_timestamp': ('datePublished', {parse_iso8601}),
- 'tags': ('keywords', {str}, {unescapeHTML}, {lambda x: x.split(',')}, filter),
+ 'tags': ('keywords', {clean_html}, {lambda x: x.split(',')}, ..., {str.strip}, filter, all, filter),
'thumbnails': ('image', ..., {
'url': ({str}, {unescapeHTML}, {self._proto_relative_url}, {url_or_none}),
}),
'timestamp': ('dateCreated', {parse_iso8601}),
- 'uploader': ('publisher', 'name', {str}, {unescapeHTML}),
+ 'uploader': ('publisher', 'name', {clean_html}, filter),
}))
if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
extract_video_object(e['video'][0])
From 7cdc226150dc5b6bf95ee6b02354221f55354429 Mon Sep 17 00:00:00 2001
From: doe1080 <98906116+doe1080@users.noreply.github.com>
Date: Tue, 9 Sep 2025 05:20:05 +0900
Subject: [PATCH 6/6] rework
---
test/test_InfoExtractor.py | 684 ++++++++++++++++++++++++-------------
yt_dlp/extractor/common.py | 167 ++++++---
2 files changed, 568 insertions(+), 283 deletions(-)
diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py
index f46e8b65ed..c604efb3e9 100644
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@@ -132,137 +132,169 @@ class TestInfoExtractor(unittest.TestCase):
self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True)
def test_search_json_ld_realworld(self):
- _TESTS = [
+ _TESTS = [(
# https://github.com/ytdl-org/youtube-dl/issues/23306
- (
- r'''''',
- {
- 'title': '1 On 1 With Kleio',
- 'description': 'Kleio Valentien',
- 'url': 'https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4',
- 'timestamp': 1449347075,
- 'duration': 743.0,
- 'view_count': 1120958,
- 'width': 1920,
- 'height': 1080,
+ r'''
+ ''',
- {
- 'title': 'md5:91fe569e952e4d146485740ae927662b',
- 'categories': ['Κοινωνία'],
- 'creators': ['Ant1news'],
- 'description': 'md5:16756d0a18f33bf550e683d134a72f3c',
- 'modified_timestamp': 1636523573,
- 'release_timestamp': 1636523400,
- 'tags': 'count:6',
- 'thumbnails': [{'url': 'https://ant1media.azureedge.net/imgHandler/1100/a635c968-be71-447c-bf9c-80d843ece21e.jpg'}],
- 'uploader': 'Ant1news',
+ "aggregateRating": {
+ "@type": "AggregateRating",
+ "ratingValue": "88",
+ "ratingCount": "630",
+ "bestRating": "100",
+ "worstRating": "0"
},
- {'expected_type': 'NewsArticle'},
- ),
- (
- r'''
+ ''', {
+ 'ext': 'mp4',
+ 'title': '1 On 1 With Kleio',
+ 'age_limit': 18,
+ 'artists': ['1 On 1 With Kleio'],
+ 'average_rating': 88,
+ 'description': 'Kleio Valentien',
+ 'duration': 743,
+ 'height': 1080,
+ 'thumbnails': [
+ {'url': 'https://static-eu-cdn.eporner.com/thumbs/static4/7/78/780/780814/9_360.jpg'},
+ {'url': 'https://imggen.eporner.com/780814/1920/1080/9.jpg'},
+ ],
+ 'timestamp': 1449347075,
+ 'url': 'https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4',
+ 'view_count': 1120958,
+ 'width': 1920,
+ }, {},
+ ), (
+ # https://github.com/yt-dlp/yt-dlp/pull/1983
+ r'''
+
+ ''', {
+ 'title': 'Rewilding Sharks - Inside the lucrative trade of shark fishing in Indonesia',
+ 'creators': ['CNA'],
+ 'description': 'md5:4ce967a72d546b32935cb98c8722346b',
+ 'modified_timestamp': 1744085232,
+ 'release_timestamp': 1742147880,
+ 'thumbnails': [{
+ 'height': 100,
+ 'url': 'https://dam.mediacorp.sg/image/upload/s--0VYzW7We--/c_fill,g_auto,h_338,w_600/f_auto,q_auto/v1/mediacorp/cna/image/2025/03/17/1742148440-image.jpg?itok=rav-cQ_p',
+ 'width': 100,
+ }],
+ }, {'expected_type': 'NewsArticle'},
+ ), (
+ # https://github.com/yt-dlp/yt-dlp/pull/2031
+ r'''
+
- ''',
- {
- 'chapters': [
- {'title': 'Explosie Turnhout', 'start_time': 70, 'end_time': 440},
- {'title': 'Jaarwisseling', 'start_time': 440, 'end_time': 1179},
- {'title': 'Natuurbranden Colorado', 'start_time': 1179, 'end_time': 1263},
- {'title': 'Klimaatverandering', 'start_time': 1263, 'end_time': 1367},
- {'title': 'Zacht weer', 'start_time': 1367, 'end_time': 1383},
- {'title': 'Financiële balans', 'start_time': 1383, 'end_time': 1484},
- {'title': 'Club Brugge', 'start_time': 1484, 'end_time': 1575},
- {'title': 'Mentale gezondheid bij topsporters', 'start_time': 1575, 'end_time': 1728},
- {'title': 'Olympische Winterspelen', 'start_time': 1728, 'end_time': 1873},
- {'title': 'Sober oudjaar in Nederland', 'start_time': 1873, 'end_time': 2079.23},
- ],
- 'title': 'Het journaal - Aflevering 365 (Seizoen 2021)',
- }, {},
- ),
- (
- # test multiple thumbnails in a list
- r'''
-''',
- {
- 'thumbnails': [{'url': 'https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg'}],
+ "partOfSeries":{
+ "name":"Het journaal",
+ "@id":"222831405527",
+ "@type":"TVSeries"
},
- {},
- ),
- (
- # test single thumbnail
- r'''
-''',
- {
- 'thumbnails': [{'url': 'https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg'}],
- },
- {},
- ),
- (
- # test thumbnail_url key without URL scheme
- r'''
-''',
- {
- 'thumbnails': [{'url': 'https://www.nobelprize.org/images/12693-landscape-medium-gallery.jpg'}],
+ "partOfSeason":{
+ "name":"Seizoen 2021",
+ "@id":"961809365527",
+ "@type":"TVSeason"
},
- {},
- ),
- (
- r'''
-
-''',
- {
- 'title': 'md5:3f077843a74f01f768bbf0853c210855',
- 'categories': ['Reportages'],
- 'creators': ['Sabine Dupont'],
- 'description': 'md5:1dc04a3aa56c5228503071baa8b4cc97',
- 'modified_timestamp': 1747319520,
- 'release_timestamp': 1747319520,
- 'tags': 'count:1',
- 'timestamp': 1747319520,
- 'thumbnails': [{'url': 'https://www.telemb.be/cdn/ff/pKwkkhB7a5GqSf98QdDUcn9WlvGTYyilvXisHO3fHpI/1747320854/public/2025-05/00006554_avc-tmb-093031.jpeg'}],
- 'uploader': 'Tele MB',
- },
- {},
- ),
- ]
+ "@context":"https://schema.org",
+ "@id":"961685295527",
+ "@type":"TVEpisode"
+ }
+
+ ''', {
+ 'title': 'Het journaal - Aflevering 365 (Seizoen 2021)',
+ 'artists': ['Het journaal - Aflevering 365 (Seizoen 2021)'],
+ 'chapters': [
+ {'title': 'Explosie Turnhout', 'start_time': 70, 'end_time': 440},
+ {'title': 'Jaarwisseling', 'start_time': 440, 'end_time': 1179},
+ {'title': 'Natuurbranden Colorado', 'start_time': 1179, 'end_time': 1263},
+ {'title': 'Klimaatverandering', 'start_time': 1263, 'end_time': 1367},
+ {'title': 'Zacht weer', 'start_time': 1367, 'end_time': 1383},
+ {'title': 'Financiële balans', 'start_time': 1383, 'end_time': 1484},
+ {'title': 'Club Brugge', 'start_time': 1484, 'end_time': 1575},
+ {'title': 'Mentale gezondheid bij topsporters', 'start_time': 1575, 'end_time': 1728},
+ {'title': 'Olympische Winterspelen', 'start_time': 1728, 'end_time': 1873},
+ {'title': 'Sober oudjaar in Nederland', 'start_time': 1873, 'end_time': 2079.23},
+ ],
+ 'description': 'Het journaal 19u van vrijdag 31 december 2021. Bekijk aflevering 365 van seizoen 2021 met VRT NU via de site of app.',
+ 'duration': 2079.23,
+ 'episode': 'Het journaal 19u',
+ 'episode_number': 365,
+ 'genres': ['Nieuws en actua'],
+ 'season': 'Het journaal 19u',
+ 'series': 'Het journaal',
+ 'thumbnails': [{'url': 'https://images.vrt.be/width1280/2021/12/31/80d5ed00-6a64-11ec-b07d-02b7b76bf47f.jpg'}],
+ 'timestamp': 1640973600,
+ }, {},
+ ), (
+ # thumbnailUrl, {str}
+ r'''
+
+ ''', {
+ 'thumbnails': [{'url': 'https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg'}],
+ }, {},
+ ), (
+ # no scheme URL
+ # thumbnail_url, {str}
+ r'''
+
+ ''', {
+ 'thumbnails': [{'url': 'https://www.nobelprize.org/images/12693-landscape-medium-gallery.jpg'}],
+ }, {},
+ ), (
+ # no scheme URL
+ # thumbnailURL, {str}
+ r'''
+
+ ''', {
+ 'thumbnails': [{'url': 'https://images.ctfassets.net/o78em1y1w4i4/2XrBpSdjPK1OJXAAFYU8iw/76ffd0f25465502c9a704dcfc2aa6c64/Teaser-Elsevier-careers-video-thumbnail.jpg'}],
+ }, {},
+ ), (
+ # escaped URL
+ # image, {"url": {str}}
+ r'''
+
+ ''', {
+ 'thumbnails': [{
+ 'height': 630,
+ 'url': 'https://assets1.cbsnewsstatic.com/hub/i/r/2025/08/29/597b721d-9c95-424d-8720-05df6b8a4a4e/thumbnail/1200x630/45e846ad5f209972ab225651b40d0b4d/cbsn-fusion-al-gore-on-20-years-since-hurricane-katrina-thumbnail.jpg',
+ 'width': 1200,
+ }],
+ }, {},
+ ), (
+ # nested width/height
+ # image, {"url": {str}}
+ r'''
+
+ ''', {
+ 'thumbnails': [{
+ 'height': 576,
+ 'url': 'https://ichef.bbci.co.uk/ace/standard/1024/cpsprodpb/398e/live/1cc4dab0-8689-11f0-9cf6-cbf3e73ce2b9.jpg',
+ 'width': 1024,
+ }],
+ }, {},
+ ), (
+ # image, {"url": [{str}]}
+ r'''
+
+ ''', {
+ 'thumbnails': [{
+ 'height': 1103,
+ 'url': 'https://imagenes.elpais.com/resizer/v2/JUFK7HEHNNAIVJXC3WVUXNZT7M.jpg?auth=350ef4714331cf2e29299b840f86da4eb8b0ddde3a3bdb3f4302f0c90a9ae2d6&width=1960&height=1103&smart=true',
+ 'width': 1960,
+ }, {
+ 'height': 1470,
+ 'url': 'https://imagenes.elpais.com/resizer/v2/JUFK7HEHNNAIVJXC3WVUXNZT7M.jpg?auth=350ef4714331cf2e29299b840f86da4eb8b0ddde3a3bdb3f4302f0c90a9ae2d6&width=1960&height=1470&smart=true',
+ 'width': 1960,
+ }, {
+ 'height': 980,
+ 'url': 'https://imagenes.elpais.com/resizer/v2/JUFK7HEHNNAIVJXC3WVUXNZT7M.jpg?auth=350ef4714331cf2e29299b840f86da4eb8b0ddde3a3bdb3f4302f0c90a9ae2d6&width=980&height=980&smart=true',
+ 'width': 980,
+ }],
+ }, {},
+ ), (
+ # image, [{"contentUrl": {str}}]
+ r'''
+
+ ''', {
+ 'thumbnails': [{
+ 'height': 900,
+ 'url': 'https://www.planet-wissen.de/sendungen/lebensglueck-frau-sprung-strand-100~_v-gseagaleriexl.jpg',
+ 'width': 1600,
+ }, {
+ 'height': 1152,
+ 'url': 'https://www.planet-wissen.de/sendungen/lebensglueck-frau-sprung-strand-100~_v-HintergrundL.jpg',
+ 'width': 1536,
+ }],
+ }, {},
+ ), (
+ # duplicate thumbnails
+ # image, [{"url": {str}}],
+ # thumbnail, [{"url": {str}}]
+ # thumbnailUrl, [{str}]
+ r'''
+
+ ''', {
+ 'thumbnails': [{
+ 'height': 1080,
+ 'url': 'https://images.sportschau.de/image/031d1445-9620-4b24-b1f5-9cbbff422f21/AAABmQQq_vA/AAABmKJiLwU/16x9-1920/leverkusen-322.jpg',
+ 'width': 1920,
+ }, {
+ 'height': 1400,
+ 'url': 'https://images.sportschau.de/image/031d1445-9620-4b24-b1f5-9cbbff422f21/AAABmQQq_vA/AAABmKJibh0/1x1-1400/leverkusen-322.jpg',
+ 'width': 1400,
+ }, {
+ 'height': 960,
+ 'url': 'https://images.sportschau.de/image/031d1445-9620-4b24-b1f5-9cbbff422f21/AAABmQQq_vA/AAABmKJiwCE/4x3/leverkusen-322.jpg?width=1280',
+ 'width': 1280,
+ }],
+ }, {},
+ ), (
+ # dateCreated, \d{4}
+ r'''
+
+ ''', {'release_year': 2025}, {},
+ ), (
+ # dateCreated, \d{4}-\d{2}-\d{2}
+ r'''
+
+ ''', {'upload_date': '20250901'}, {},
+ ), (
+ # dateCreated, ISO 8601
+ r'''
+
+ ''', {'timestamp': 1756684800}, {},
+ )]
for html, expected_dict, search_json_ld_kwargs in _TESTS:
expect_dict(
self,
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index 7d7beca0ea..6b6a83a300 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -72,11 +72,13 @@ from ..utils import (
mimetype2ext,
netrc_from_content,
orderedSet,
+ parse_age_limit,
parse_bitrate,
parse_codecs,
parse_duration,
parse_iso8601,
parse_m3u8_attributes,
+ parse_qs,
parse_resolution,
qualities,
sanitize_url,
@@ -84,13 +86,11 @@ from ..utils import (
str_or_none,
str_to_int,
strip_or_none,
- traverse_obj,
truncate_string,
try_call,
try_get,
unescapeHTML,
unified_strdate,
- unified_timestamp,
url_basename,
url_or_none,
urlhandle_detect_ext,
@@ -102,6 +102,7 @@ from ..utils import (
)
from ..utils._utils import _request_dump_filename
from ..utils.jslib import devalue
+from ..utils.traversal import traverse_obj
class InfoExtractor:
@@ -1673,38 +1674,76 @@ class InfoExtractor:
chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
info['chapters'] = chapters
+ def extract_thumbnail_information(e):  # collect thumbnails of JSON-LD entity `e` into info['thumbnails']
+ thumbnails = traverse_obj(e, ((
+ 'image', 'thumbnail', 'thumbnailUrl', 'thumbnailURL', 'thumbnail_url',
+ ), (
+ ({str}, {url_or_none}, {'url': None}, filter),
+ ({dict}, 'url', {list}, ..., {'url': None}, filter),
+ (({list}, ({dict}, all)), lambda _, v:
+ url_or_none(v.get('url')) or url_or_none(v.get('contentUrl'))),
+ ({list}, ..., {str}, {url_or_none}, {'url': None}, filter),
+ ), {
+ 'height': ('height', (None, 'value'), {int_or_none}, any),
+ 'url': (('contentUrl', 'url'), {str}, {unescapeHTML}, {self._proto_relative_url}, any),
+ 'width': ('width', (None, 'value'), {int_or_none}, any),
+ }, all, {orderedSet}, lambda _, v: url_or_none(v['url'])))
+
+ dim_keys, url_table = {'height', 'width'}, {}
+ for thumbnail in thumbnails:
+ url = thumbnail['url']
+ # Dimensions declared in the JSON-LD win (setdefault); otherwise try the URL query string
+ query = parse_qs(url)
+ for key, alt_keys in (
+ ('height', ('height', 'h')),
+ ('width', ('width', 'w')),
+ ):
+ val = traverse_obj(query, (alt_keys, -1, {int_or_none}, any))
+ if val is not None:
+ thumbnail.setdefault(key, val)
+ # ... then fall back to a resolution hint in the URL basename
+ res = parse_resolution(url_basename(url))
+ for key in dim_keys:
+ val = res.get(key)
+ if val is not None:
+ thumbnail.setdefault(key, val)
+ # Keep a single entry per URL, preferring the one with more known dimensions
+ current = url_table.get(url)
+ if not current or len(dim_keys & thumbnail.keys()) > len(dim_keys & current.keys()):
+ url_table[url] = thumbnail
+ # An entity without thumbnails must not discard those found on a previously processed entity
+ info['thumbnails'] = list(url_table.values()) or info.get('thumbnails')
+
def extract_video_object(e):
- author = e.get('author')
- info.update({
- 'url': url_or_none(e.get('contentUrl')),
- 'ext': mimetype2ext(e.get('encodingFormat')),
- 'title': unescapeHTML(e.get('name')),
- 'description': unescapeHTML(e.get('description')),
- 'thumbnails': traverse_obj(e, (('thumbnailUrl', 'thumbnailURL', 'thumbnail_url'), (None, ...), {
- 'url': ({str}, {unescapeHTML}, {self._proto_relative_url}, {url_or_none}),
- })),
- 'duration': parse_duration(e.get('duration')),
- 'timestamp': unified_timestamp(e.get('uploadDate')),
- # author can be an instance of 'Organization' or 'Person' types.
- # both types can have 'name' property(inherited from 'Thing' type). [1]
- # however some websites are using 'Text' type instead.
- # 1. https://schema.org/VideoObject
- 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
- 'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
- 'filesize': int_or_none(float_or_none(e.get('contentSize'))),
- 'tbr': int_or_none(e.get('bitrate')),
- 'width': int_or_none(e.get('width')),
- 'height': int_or_none(e.get('height')),
- 'view_count': int_or_none(e.get('interactionCount')),
- 'tags': try_call(lambda: e.get('keywords').split(',')),
- })
+ info.update(traverse_obj(e, {
+ 'ext': ('encodingFormat', {mimetype2ext}),
+ 'title': ('name', {clean_html}, filter),
+ 'age_limit': ('isFamilyFriendly', {lambda x: 18 if str(x).lower() in ('false', '0') else None}),  # JSON false or 'false'/'0'
+ 'artists': ('byArtist', 'name', {clean_html}, filter, all, {orderedSet}, filter),  # name of byArtist, not the video's own name
+ 'description': ('description', {clean_html}, filter),
+ 'duration': ('duration', {parse_duration}),
+ 'filesize': ('contentSize', {float_or_none}, {int_or_none}),
+ 'genres': ('genre', (None, ...), {clean_html}, filter, all, {orderedSet}, filter),  # single string or list of strings
+ 'height': ('height', {int_or_none}),
+ 'is_live': ('publication', 'isLiveBroadcast', {bool}),
+ 'release_timestamp': ('datePublished', {parse_iso8601}),
+ 'tags': ('keywords', (None, ...), {clean_html},
+ {lambda x: x.split(',')}, ..., {str.strip}, filter, all, {orderedSet}, filter),
+ 'tbr': ('bitrate', {int_or_none}),
+ 'timestamp': ('uploadDate', {parse_iso8601}),
+ 'uploader': ('author', (None, 'name'), {clean_html}, filter, any),  # author is either plain text or an object with 'name'
+ 'url': ('contentUrl', {self._proto_relative_url}, {url_or_none}),
+ 'view_count': ('interactionCount', {int_or_none}),
+ 'width': ('width', {int_or_none}),
+ }))
if is_type(e, 'AudioObject'):
info.update({
+ 'abr': traverse_obj(e, ('bitrate', {int_or_none})),
'vcodec': 'none',
- 'abr': int_or_none(e.get('bitrate')),
})
extract_interaction_statistic(e)
extract_chapter_information(e)
+ extract_thumbnail_information(e)
def traverse_json_ld(json_ld, at_top_level=True):
for e in variadic(json_ld):
@@ -1717,50 +1756,70 @@ class InfoExtractor:
continue
if expected_type is not None and not is_type(e, expected_type):
continue
- rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
+ rating = traverse_obj(e, ('aggregateRating', 'ratingValue', {float_or_none}))
if rating is not None:
info['average_rating'] = rating
if is_type(e, 'TVEpisode', 'Episode', 'PodcastEpisode'):
- episode_name = unescapeHTML(e.get('name'))
- info.update({
- 'episode': episode_name,
- 'episode_number': int_or_none(e.get('episodeNumber')),
- 'description': unescapeHTML(e.get('description')),
- })
- if not info.get('title') and episode_name:
- info['title'] = episode_name
+ info.update(traverse_obj(e, {
+ 'id': ('identifier', {str_or_none}),
+ 'ext': ('encodingFormat', {mimetype2ext}),
+ 'title': (('title', 'name'), {clean_html}, filter, any),
+ 'creators': ('productionCompany', {clean_html}, filter, all, {orderedSet}, filter),
+ 'description': ('description', {clean_html}, filter),
+ 'duration': ((('duration', {parse_duration}), ('timeRequired', {int_or_none})), any),
+ 'episode': ('name', {clean_html}, filter),
+ 'episode_number': ('episodeNumber', {int_or_none}),
+ 'genres': ('genre', (None, ...), {clean_html}, filter, all, {orderedSet}, filter),  # single string or list of strings
+ 'release_timestamp': ('datePublished', {parse_iso8601}),
+ }))
+ extract_thumbnail_information(e)
part_of_season = e.get('partOfSeason')
if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
- info.update({
- 'season': unescapeHTML(part_of_season.get('name')),
- 'season_number': int_or_none(part_of_season.get('seasonNumber')),
- })
+ info.update(traverse_obj(part_of_season, {  # season fields live on the partOfSeason object, not on the episode
+ 'season': ('name', {clean_html}, filter),
+ 'season_number': ('seasonNumber', {int_or_none}),
+ }))
part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
- info['series'] = unescapeHTML(part_of_series.get('name'))
+ info['series'] = traverse_obj(part_of_series, ('name', {clean_html}, filter))
elif is_type(e, 'Movie'):
- info.update({
- 'title': unescapeHTML(e.get('name')),
- 'description': unescapeHTML(e.get('description')),
- 'duration': parse_duration(e.get('duration')),
- 'timestamp': unified_timestamp(e.get('dateCreated')),
- })
+ info.update(traverse_obj(e, {
+ 'title': ('name', {clean_html}, filter),
+ 'age_limit': ('contentRating', {parse_age_limit}),
+ 'cast': ('actor', ..., 'name', {clean_html}, filter, all, {orderedSet}, filter),
+ 'creators': ('director', (None, ((None, ...), 'name')), {clean_html}, filter, all, {orderedSet}, filter),
+ 'description': ('description', {clean_html}, filter),
+ 'duration': ('duration', {parse_duration}),
+ 'genres': ('genre', (None, ...), {clean_html}, filter, all, {orderedSet}, filter),
+ 'release_timestamp': ('datePublished', {parse_iso8601}),
+ }))
+ extract_thumbnail_information(e)
+ if date := traverse_obj(e, ('dateCreated', {str_or_none})):
+ if re.fullmatch(r'\d{4}', date):
+ info['release_year'] = int_or_none(date)
+ elif re.fullmatch(r'\d{4}-\d{2}-\d{2}', date):
+ info['upload_date'] = unified_strdate(date)
+ else:
+ info['timestamp'] = parse_iso8601(date)
elif is_type(e, 'Article', 'NewsArticle'):
- info.update(**traverse_obj(e, {
+ info.update(traverse_obj(e, {
'title': ('headline', {clean_html}, filter),
'alt_title': ('alternativeHeadline', {clean_html}, filter),
- 'categories': ('articleSection', {clean_html}, filter, all, filter),
- 'creators': ('author', (None, 'name'), {clean_html}, filter, all, filter),
+ 'categories': ('articleSection', (None, ...), {clean_html}, filter, all, {orderedSet}, filter),
+ 'comment_count': ('commentCount', {int_or_none}),
+ 'creators': ('author', (None, ...), 'name', {clean_html}, filter, all, {orderedSet}, filter),
'description': (('description', 'articleBody'), {clean_html}, filter, any),
+ 'duration': ('timeRequired', {int_or_none}),
+ 'genres': ('genre', (None, ...), {clean_html}, filter, all, {orderedSet}, filter),
+ 'location': ('contentLocation', 'name', {clean_html}, filter),
'modified_timestamp': ('dateModified', {parse_iso8601}),
'release_timestamp': ('datePublished', {parse_iso8601}),
- 'tags': ('keywords', {clean_html}, {lambda x: x.split(',')}, ..., {str.strip}, filter, all, filter),
- 'thumbnails': ('image', ..., {
- 'url': ({str}, {unescapeHTML}, {self._proto_relative_url}, {url_or_none}),
- }),
+ 'tags': ('keywords', (None, ...), {clean_html},
+ {lambda x: x.split(',')}, ..., {str.strip}, filter, all, {orderedSet}, filter),
'timestamp': ('dateCreated', {parse_iso8601}),
'uploader': ('publisher', 'name', {clean_html}, filter),
}))
+ extract_thumbnail_information(e)
if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
extract_video_object(e['video'][0])
elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):