|
|
@ -1419,6 +1419,10 @@ class InfoExtractor:
|
|
|
|
'ViewAction': 'view',
|
|
|
|
'ViewAction': 'view',
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def is_type(e, *expected_types):
    """Report whether the '@type' of *e* matches any of *expected_types*.

    The '@type' value is looked up with traverse_obj and passed through
    variadic before the membership test, so one or several declared types
    can be checked the same way.
    NOTE(review): presumably variadic wraps a lone string into a sequence and
    traverse_obj yields None for a missing key or non-dict *e* -- confirm
    against the helpers' definitions.
    """
    declared_types = variadic(traverse_obj(e, '@type'))
    # Stop at the first expected type that is declared on the element.
    for candidate in expected_types:
        if candidate in declared_types:
            return True
    return False
|
|
|
|
|
|
|
|
|
|
|
|
def extract_interaction_type(e):
|
|
|
|
def extract_interaction_type(e):
|
|
|
|
interaction_type = e.get('interactionType')
|
|
|
|
interaction_type = e.get('interactionType')
|
|
|
|
if isinstance(interaction_type, dict):
|
|
|
|
if isinstance(interaction_type, dict):
|
|
|
@ -1432,9 +1436,7 @@ class InfoExtractor:
|
|
|
|
if not isinstance(interaction_statistic, list):
|
|
|
|
if not isinstance(interaction_statistic, list):
|
|
|
|
return
|
|
|
|
return
|
|
|
|
for is_e in interaction_statistic:
|
|
|
|
for is_e in interaction_statistic:
|
|
|
|
if not isinstance(is_e, dict):
|
|
|
|
if not is_type(is_e, 'InteractionCounter'):
|
|
|
|
continue
|
|
|
|
|
|
|
|
if is_e.get('@type') != 'InteractionCounter':
|
|
|
|
|
|
|
|
continue
|
|
|
|
continue
|
|
|
|
interaction_type = extract_interaction_type(is_e)
|
|
|
|
interaction_type = extract_interaction_type(is_e)
|
|
|
|
if not interaction_type:
|
|
|
|
if not interaction_type:
|
|
|
@ -1471,7 +1473,7 @@ class InfoExtractor:
|
|
|
|
info['chapters'] = chapters
|
|
|
|
info['chapters'] = chapters
|
|
|
|
|
|
|
|
|
|
|
|
def extract_video_object(e):
|
|
|
|
def extract_video_object(e):
|
|
|
|
assert e['@type'] == 'VideoObject'
|
|
|
|
assert is_type(e, 'VideoObject')
|
|
|
|
author = e.get('author')
|
|
|
|
author = e.get('author')
|
|
|
|
info.update({
|
|
|
|
info.update({
|
|
|
|
'url': traverse_obj(e, 'contentUrl', 'embedUrl', expected_type=url_or_none),
|
|
|
|
'url': traverse_obj(e, 'contentUrl', 'embedUrl', expected_type=url_or_none),
|
|
|
@ -1503,13 +1505,12 @@ class InfoExtractor:
|
|
|
|
if at_top_level and set(e.keys()) == {'@context', '@graph'}:
|
|
|
|
if at_top_level and set(e.keys()) == {'@context', '@graph'}:
|
|
|
|
traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
|
|
|
|
traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
|
|
|
|
break
|
|
|
|
break
|
|
|
|
item_type = e.get('@type')
|
|
|
|
if expected_type is not None and not is_type(e, expected_type):
|
|
|
|
if expected_type is not None and expected_type != item_type:
|
|
|
|
|
|
|
|
continue
|
|
|
|
continue
|
|
|
|
rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
|
|
|
|
rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
|
|
|
|
if rating is not None:
|
|
|
|
if rating is not None:
|
|
|
|
info['average_rating'] = rating
|
|
|
|
info['average_rating'] = rating
|
|
|
|
if item_type in ('TVEpisode', 'Episode'):
|
|
|
|
if is_type(e, 'TVEpisode', 'Episode'):
|
|
|
|
episode_name = unescapeHTML(e.get('name'))
|
|
|
|
episode_name = unescapeHTML(e.get('name'))
|
|
|
|
info.update({
|
|
|
|
info.update({
|
|
|
|
'episode': episode_name,
|
|
|
|
'episode': episode_name,
|
|
|
@ -1519,39 +1520,39 @@ class InfoExtractor:
|
|
|
|
if not info.get('title') and episode_name:
|
|
|
|
if not info.get('title') and episode_name:
|
|
|
|
info['title'] = episode_name
|
|
|
|
info['title'] = episode_name
|
|
|
|
part_of_season = e.get('partOfSeason')
|
|
|
|
part_of_season = e.get('partOfSeason')
|
|
|
|
if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
|
|
|
|
if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
|
|
|
|
info.update({
|
|
|
|
info.update({
|
|
|
|
'season': unescapeHTML(part_of_season.get('name')),
|
|
|
|
'season': unescapeHTML(part_of_season.get('name')),
|
|
|
|
'season_number': int_or_none(part_of_season.get('seasonNumber')),
|
|
|
|
'season_number': int_or_none(part_of_season.get('seasonNumber')),
|
|
|
|
})
|
|
|
|
})
|
|
|
|
part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
|
|
|
|
part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
|
|
|
|
if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
|
|
|
|
if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
|
|
|
|
info['series'] = unescapeHTML(part_of_series.get('name'))
|
|
|
|
info['series'] = unescapeHTML(part_of_series.get('name'))
|
|
|
|
elif item_type == 'Movie':
|
|
|
|
elif is_type(e, 'Movie'):
|
|
|
|
info.update({
|
|
|
|
info.update({
|
|
|
|
'title': unescapeHTML(e.get('name')),
|
|
|
|
'title': unescapeHTML(e.get('name')),
|
|
|
|
'description': unescapeHTML(e.get('description')),
|
|
|
|
'description': unescapeHTML(e.get('description')),
|
|
|
|
'duration': parse_duration(e.get('duration')),
|
|
|
|
'duration': parse_duration(e.get('duration')),
|
|
|
|
'timestamp': unified_timestamp(e.get('dateCreated')),
|
|
|
|
'timestamp': unified_timestamp(e.get('dateCreated')),
|
|
|
|
})
|
|
|
|
})
|
|
|
|
elif item_type in ('Article', 'NewsArticle'):
|
|
|
|
elif is_type(e, 'Article', 'NewsArticle'):
|
|
|
|
info.update({
|
|
|
|
info.update({
|
|
|
|
'timestamp': parse_iso8601(e.get('datePublished')),
|
|
|
|
'timestamp': parse_iso8601(e.get('datePublished')),
|
|
|
|
'title': unescapeHTML(e.get('headline')),
|
|
|
|
'title': unescapeHTML(e.get('headline')),
|
|
|
|
'description': unescapeHTML(e.get('articleBody') or e.get('description')),
|
|
|
|
'description': unescapeHTML(e.get('articleBody') or e.get('description')),
|
|
|
|
})
|
|
|
|
})
|
|
|
|
if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject':
|
|
|
|
if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
|
|
|
|
extract_video_object(e['video'][0])
|
|
|
|
extract_video_object(e['video'][0])
|
|
|
|
elif traverse_obj(e, ('subjectOf', 0, '@type')) == 'VideoObject':
|
|
|
|
elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
|
|
|
|
extract_video_object(e['subjectOf'][0])
|
|
|
|
extract_video_object(e['subjectOf'][0])
|
|
|
|
elif item_type == 'VideoObject':
|
|
|
|
elif is_type(e, 'VideoObject'):
|
|
|
|
extract_video_object(e)
|
|
|
|
extract_video_object(e)
|
|
|
|
if expected_type is None:
|
|
|
|
if expected_type is None:
|
|
|
|
continue
|
|
|
|
continue
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
break
|
|
|
|
break
|
|
|
|
video = e.get('video')
|
|
|
|
video = e.get('video')
|
|
|
|
if isinstance(video, dict) and video.get('@type') == 'VideoObject':
|
|
|
|
if is_type(video, 'VideoObject'):
|
|
|
|
extract_video_object(video)
|
|
|
|
extract_video_object(video)
|
|
|
|
if expected_type is None:
|
|
|
|
if expected_type is None:
|
|
|
|
continue
|
|
|
|
continue
|
|
|
|