From 42ca3d601ee10cef89d698f72e2b5d44fab4f013 Mon Sep 17 00:00:00 2001
From: bashonly <88596187+bashonly@users.noreply.github.com>
Date: Wed, 30 Jul 2025 01:11:09 -0500
Subject: [PATCH] [ie/archive.org] Fix metadata extraction (#13880)

Closes #13881
Authored by: bashonly
---
 yt_dlp/extractor/archiveorg.py | 64 +++++++++++++++++++++++-----------
 1 file changed, 43 insertions(+), 21 deletions(-)

diff --git a/yt_dlp/extractor/archiveorg.py b/yt_dlp/extractor/archiveorg.py
index 572bd6bfe..1864ddbfd 100644
--- a/yt_dlp/extractor/archiveorg.py
+++ b/yt_dlp/extractor/archiveorg.py
@@ -33,7 +33,6 @@ from ..utils import (
     unified_timestamp,
     url_or_none,
     urlhandle_detect_ext,
-    variadic,
 )
 
 
@@ -232,6 +231,23 @@ class ArchiveOrgIE(InfoExtractor):
             'release_date': '19950402',
             'timestamp': 1084927901,
         },
+    }, {
+        # metadata['metadata']['description'] is a list of strings instead of str
+        'url': 'https://archive.org/details/pra-KZ1908.02',
+        'info_dict': {
+            'id': 'pra-KZ1908.02',
+            'ext': 'mp3',
+            'display_id': 'KZ1908.02_01.wav',
+            'title': 'Crips and Bloods speak about gang life',
+            'description': 'md5:2b56b35ff021311e3554b47a285e70b3',
+            'uploader': 'jake@archive.org',
+            'duration': 1733.74,
+            'track': 'KZ1908.02 01',
+            'track_number': 1,
+            'timestamp': 1336026026,
+            'upload_date': '20120503',
+            'release_year': 1992,
+        },
     }]
 
     @staticmethod
@@ -274,34 +290,40 @@ class ArchiveOrgIE(InfoExtractor):
         m = metadata['metadata']
         identifier = m['identifier']
 
-        info = {
+        info = traverse_obj(m, {
+            'title': ('title', {str}),
+            'description': ('description', ({str}, (..., all, {' '.join})), {clean_html}, filter, any),
+            'uploader': (('uploader', 'adder'), {str}, any),
+            'creators': ('creator', (None, ...), {str}, filter, all, filter),
+            'license': ('licenseurl', {url_or_none}),
+            'release_date': ('date', {unified_strdate}),
+            'timestamp': (('publicdate', 'addeddate'), {unified_timestamp}, any),
+            'location': ('venue', {str}),
+            'release_year': ('year', {int_or_none}),
+        })
+        info.update({
             'id': identifier,
-            'title': m['title'],
-            'description': clean_html(m.get('description')),
-            'uploader': dict_get(m, ['uploader', 'adder']),
-            'creators': traverse_obj(m, ('creator', {variadic}, {lambda x: x[0] and list(x)})),
-            'license': m.get('licenseurl'),
-            'release_date': unified_strdate(m.get('date')),
-            'timestamp': unified_timestamp(dict_get(m, ['publicdate', 'addeddate'])),
             'webpage_url': f'https://archive.org/details/{identifier}',
-            'location': m.get('venue'),
-            'release_year': int_or_none(m.get('year'))}
+        })
 
         for f in metadata['files']:
             if f['name'] in entries:
                 entries[f['name']] = merge_dicts(entries[f['name']], {
                     'id': identifier + '/' + f['name'],
-                    'title': f.get('title') or f['name'],
-                    'display_id': f['name'],
-                    'description': clean_html(f.get('description')),
-                    'creators': traverse_obj(f, ('creator', {variadic}, {lambda x: x[0] and list(x)})),
-                    'duration': parse_duration(f.get('length')),
-                    'track_number': int_or_none(f.get('track')),
-                    'album': f.get('album'),
-                    'discnumber': int_or_none(f.get('disc')),
-                    'release_year': int_or_none(f.get('year'))})
+                    **traverse_obj(f, {
+                        'title': (('title', 'name'), {str}, any),
+                        'display_id': ('name', {str}),
+                        'description': ('description', ({str}, (..., all, {' '.join})), {clean_html}, filter, any),
+                        'creators': ('creator', (None, ...), {str}, filter, all, filter),
+                        'duration': ('length', {parse_duration}),
+                        'track_number': ('track', {int_or_none}),
+                        'album': ('album', {str}),
+                        'discnumber': ('disc', {int_or_none}),
+                        'release_year': ('year', {int_or_none}),
+                    }),
+                })
                 entry = entries[f['name']]
-            elif traverse_obj(f, 'original', expected_type=str) in entries:
+            elif traverse_obj(f, ('original', {str})) in entries:
                 entry = entries[f['original']]
             else:
                 continue