|
|
@ -1743,19 +1743,19 @@ class InfoExtractor:
|
|
|
|
})
|
|
|
|
})
|
|
|
|
elif is_type(e, 'Article', 'NewsArticle'):
|
|
|
|
elif is_type(e, 'Article', 'NewsArticle'):
|
|
|
|
info.update(**traverse_obj(e, {
|
|
|
|
info.update(**traverse_obj(e, {
|
|
|
|
'title': ('headline', {str}, {unescapeHTML}),
|
|
|
|
'title': ('headline', {clean_html}, filter),
|
|
|
|
'alt_title': ('alternativeHeadline', {str}, {unescapeHTML}),
|
|
|
|
'alt_title': ('alternativeHeadline', {clean_html}, filter),
|
|
|
|
'categories': ('articleSection', {str}, {unescapeHTML}, filter, all, filter),
|
|
|
|
'categories': ('articleSection', {clean_html}, filter, all, filter),
|
|
|
|
'creators': ('author', (None, 'name'), {str}, {unescapeHTML}, filter, all, filter),
|
|
|
|
'creators': ('author', (None, 'name'), {clean_html}, filter, all, filter),
|
|
|
|
'description': (('description', 'articleBody'), {str}, {unescapeHTML}, any),
|
|
|
|
'description': (('description', 'articleBody'), {clean_html}, filter, any),
|
|
|
|
'modified_timestamp': ('dateModified', {parse_iso8601}),
|
|
|
|
'modified_timestamp': ('dateModified', {parse_iso8601}),
|
|
|
|
'release_timestamp': ('datePublished', {parse_iso8601}),
|
|
|
|
'release_timestamp': ('datePublished', {parse_iso8601}),
|
|
|
|
'tags': ('keywords', {str}, {unescapeHTML}, {lambda x: x.split(',')}, filter),
|
|
|
|
'tags': ('keywords', {clean_html}, {lambda x: x.split(',')}, ..., {str.strip}, filter, all, filter),
|
|
|
|
'thumbnails': ('image', ..., {
|
|
|
|
'thumbnails': ('image', ..., {
|
|
|
|
'url': ({str}, {unescapeHTML}, {self._proto_relative_url}, {url_or_none}),
|
|
|
|
'url': ({str}, {unescapeHTML}, {self._proto_relative_url}, {url_or_none}),
|
|
|
|
}),
|
|
|
|
}),
|
|
|
|
'timestamp': ('dateCreated', {parse_iso8601}),
|
|
|
|
'timestamp': ('dateCreated', {parse_iso8601}),
|
|
|
|
'uploader': ('publisher', 'name', {str}, {unescapeHTML}),
|
|
|
|
'uploader': ('publisher', 'name', {clean_html}, filter),
|
|
|
|
}))
|
|
|
|
}))
|
|
|
|
if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
|
|
|
|
if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
|
|
|
|
extract_video_object(e['video'][0])
|
|
|
|
extract_video_object(e['video'][0])
|
|
|
|