[youtube] Extract comments' approximate timestamp (#221)

Authored by: colethedj
pull/230/head
coletdjnz 4 years ago committed by pukkandan
parent 9e62f283ff
commit d92f5d5a90
No known key found for this signature in database
GPG Key ID: 0F00D95A001F4698

@ -2,6 +2,7 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import calendar
import hashlib import hashlib
import itertools import itertools
import json import json
@ -27,6 +28,7 @@ from ..utils import (
bool_or_none, bool_or_none,
clean_html, clean_html,
dict_get, dict_get,
datetime_from_str,
ExtractorError, ExtractorError,
format_field, format_field,
float_or_none, float_or_none,
@ -46,7 +48,7 @@ from ..utils import (
update_url_query, update_url_query,
url_or_none, url_or_none,
urlencode_postdata, urlencode_postdata,
urljoin, urljoin
) )
@ -1499,6 +1501,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
(r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE), (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE),
regex), webpage, name, default='{}'), video_id, fatal=False) regex), webpage, name, default='{}'), video_id, fatal=False)
@staticmethod
def parse_time_text(time_text):
    """
    Parse the comment time text into an approximate point in time.

    time_text is in the format 'X units ago (edited)'. The first two
    space-separated tokens (the amount and the unit) are turned into a
    'now-<amount><unit>' expression and handed to datetime_from_str with
    precision='auto'.

    Returns whatever datetime_from_str returns for that expression
    (the caller invokes .timetuple() on it, so presumably a datetime),
    or None when time_text is missing/empty or has fewer than three
    space-separated tokens.
    """
    # The joined time text may be None/empty when the comment renderer
    # carries no publishedTimeText runs; there is nothing to parse then.
    if not time_text:
        return None
    time_text_split = time_text.split(' ')
    # Need at least '<amount> <unit> ago' to build the relative expression.
    if len(time_text_split) < 3:
        return None
    return datetime_from_str(
        'now-%s%s' % (time_text_split[0], time_text_split[1]), precision='auto')
@staticmethod @staticmethod
def _join_text_entries(runs): def _join_text_entries(runs):
text = None text = None
@ -1521,7 +1533,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
text = self._join_text_entries(comment_text_runs) or '' text = self._join_text_entries(comment_text_runs) or ''
comment_time_text = try_get(comment_renderer, lambda x: x['publishedTimeText']['runs']) or [] comment_time_text = try_get(comment_renderer, lambda x: x['publishedTimeText']['runs']) or []
time_text = self._join_text_entries(comment_time_text) time_text = self._join_text_entries(comment_time_text)
timestamp = calendar.timegm(self.parse_time_text(time_text).timetuple())
author = try_get(comment_renderer, lambda x: x['authorText']['simpleText'], compat_str) author = try_get(comment_renderer, lambda x: x['authorText']['simpleText'], compat_str)
author_id = try_get(comment_renderer, author_id = try_get(comment_renderer,
lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str) lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str)
@ -1532,11 +1544,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool) author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool)
is_liked = try_get(comment_renderer, lambda x: x['isLiked'], bool) is_liked = try_get(comment_renderer, lambda x: x['isLiked'], bool)
return { return {
'id': comment_id, 'id': comment_id,
'text': text, 'text': text,
# TODO: This should be parsed to timestamp 'timestamp': timestamp,
'time_text': time_text, 'time_text': time_text,
'like_count': votes, 'like_count': votes,
'is_favorited': is_liked, 'is_favorited': is_liked,
@ -1624,12 +1635,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1]) comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1])
if page_num == 0: if page_num == 0:
if first_continuation: if first_continuation:
note_prefix = "Downloading initial comment continuation page" note_prefix = 'Downloading initial comment continuation page'
else: else:
note_prefix = " Downloading comment reply thread %d %s" % (comment_counts[2], comment_prog_str) note_prefix = ' Downloading comment reply thread %d %s' % (comment_counts[2], comment_prog_str)
else: else:
note_prefix = "%sDownloading comment%s page %d %s" % ( note_prefix = '%sDownloading comment%s page %d %s' % (
" " if parent else "", ' ' if parent else '',
' replies' if parent else '', ' replies' if parent else '',
page_num, page_num,
comment_prog_str) comment_prog_str)
@ -1644,13 +1655,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
except ExtractorError as e: except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404, 413): if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404, 413):
if e.cause.code == 413: if e.cause.code == 413:
self.report_warning("Assumed end of comments (received HTTP Error 413)") self.report_warning('Assumed end of comments (received HTTP Error 413)')
return return
# Downloading page may result in intermittent 5xx HTTP error # Downloading page may result in intermittent 5xx HTTP error
# Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289 # Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289
last_error = 'HTTP Error %s' % e.cause.code last_error = 'HTTP Error %s' % e.cause.code
if e.cause.code == 404: if e.cause.code == 404:
last_error = last_error + " (this API is probably deprecated)" last_error = last_error + ' (this API is probably deprecated)'
if count < retries: if count < retries:
continue continue
raise raise
@ -1668,7 +1679,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# YouTube sometimes gives reload: now json if something went wrong (e.g. bad auth) # YouTube sometimes gives reload: now json if something went wrong (e.g. bad auth)
if browse.get('reload'): if browse.get('reload'):
raise ExtractorError("Invalid or missing params in continuation request", expected=False) raise ExtractorError('Invalid or missing params in continuation request', expected=False)
# TODO: not tested, merged from old extractor # TODO: not tested, merged from old extractor
err_msg = browse.get('externalErrorMessage') err_msg = browse.get('externalErrorMessage')
@ -1708,7 +1719,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if expected_comment_count: if expected_comment_count:
comment_counts[1] = str_to_int(expected_comment_count) comment_counts[1] = str_to_int(expected_comment_count)
self.to_screen("Downloading ~%d comments" % str_to_int(expected_comment_count)) self.to_screen('Downloading ~%d comments' % str_to_int(expected_comment_count))
yield comment_counts[1] yield comment_counts[1]
# TODO: cli arg. # TODO: cli arg.
@ -1724,7 +1735,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
continuation = YoutubeTabIE._build_continuation_query( continuation = YoutubeTabIE._build_continuation_query(
continuation=sort_continuation_renderer.get('continuation'), continuation=sort_continuation_renderer.get('continuation'),
ctp=sort_continuation_renderer.get('clickTrackingParams')) ctp=sort_continuation_renderer.get('clickTrackingParams'))
self.to_screen("Sorting comments by %s" % ('popular' if comment_sort_index == 0 else 'newest')) self.to_screen('Sorting comments by %s' % ('popular' if comment_sort_index == 0 else 'newest'))
break break
for entry in known_continuation_renderers[key](continuation_renderer): for entry in known_continuation_renderers[key](continuation_renderer):
@ -1757,7 +1768,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
continue continue
comments.append(comment) comments.append(comment)
break break
self.to_screen("Downloaded %d/%d comments" % (len(comments), estimated_total)) self.to_screen('Downloaded %d/%d comments' % (len(comments), estimated_total))
return { return {
'comments': comments, 'comments': comments,
'comment_count': len(comments), 'comment_count': len(comments),
@ -2979,7 +2990,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
self.report_warning('%s. Retrying ...' % last_error) self.report_warning('%s. Retrying ...' % last_error)
try: try:
response = self._call_api( response = self._call_api(
ep="browse", fatal=True, headers=headers, ep='browse', fatal=True, headers=headers,
video_id='%s page %s' % (item_id, page_num), video_id='%s page %s' % (item_id, page_num),
query={ query={
'continuation': continuation['continuation'], 'continuation': continuation['continuation'],

Loading…
Cancel
Save