[ie/googledrive] Fix subtitles extraction (#14139)

Authored by: zakaryan2004
pull/12126/head
Gegham Zakaryan 1 week ago committed by GitHub
parent 487a90c8ef
commit 18fe696df9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -12,6 +12,7 @@ from ..utils import (
get_element_html_by_id, get_element_html_by_id,
int_or_none, int_or_none,
lowercase_escape, lowercase_escape,
parse_qs,
try_get, try_get,
update_url_query, update_url_query,
) )
@ -111,14 +112,18 @@ class GoogleDriveIE(InfoExtractor):
self._caption_formats_ext.append(f.attrib['fmt_code']) self._caption_formats_ext.append(f.attrib['fmt_code'])
def _get_captions_by_type(self, video_id, subtitles_id, caption_type, def _get_captions_by_type(self, video_id, subtitles_id, caption_type,
origin_lang_code=None): origin_lang_code=None, origin_lang_name=None):
if not subtitles_id or not caption_type: if not subtitles_id or not caption_type:
return return
captions = {} captions = {}
for caption_entry in self._captions_xml.findall( for caption_entry in self._captions_xml.findall(
self._CAPTIONS_ENTRY_TAG[caption_type]): self._CAPTIONS_ENTRY_TAG[caption_type]):
caption_lang_code = caption_entry.attrib.get('lang_code') caption_lang_code = caption_entry.attrib.get('lang_code')
if not caption_lang_code: caption_name = caption_entry.attrib.get('name') or origin_lang_name
if not caption_lang_code or not caption_name:
self.report_warning(f'Missing necessary caption metadata. '
f'Need lang_code and name attributes. '
f'Found: {caption_entry.attrib}')
continue continue
caption_format_data = [] caption_format_data = []
for caption_format in self._caption_formats_ext: for caption_format in self._caption_formats_ext:
@ -129,7 +134,7 @@ class GoogleDriveIE(InfoExtractor):
'lang': (caption_lang_code if origin_lang_code is None 'lang': (caption_lang_code if origin_lang_code is None
else origin_lang_code), else origin_lang_code),
'type': 'track', 'type': 'track',
'name': '', 'name': caption_name,
'kind': '', 'kind': '',
} }
if origin_lang_code is not None: if origin_lang_code is not None:
@ -155,14 +160,15 @@ class GoogleDriveIE(InfoExtractor):
self._download_subtitles_xml(video_id, subtitles_id, hl) self._download_subtitles_xml(video_id, subtitles_id, hl)
if not self._captions_xml: if not self._captions_xml:
return return
track = self._captions_xml.find('track') track = next((t for t in self._captions_xml.findall('track') if t.attrib.get('cantran') == 'true'), None)
if track is None: if track is None:
return return
origin_lang_code = track.attrib.get('lang_code') origin_lang_code = track.attrib.get('lang_code')
if not origin_lang_code: origin_lang_name = track.attrib.get('name')
if not origin_lang_code or not origin_lang_name:
return return
return self._get_captions_by_type( return self._get_captions_by_type(
video_id, subtitles_id, 'automatic_captions', origin_lang_code) video_id, subtitles_id, 'automatic_captions', origin_lang_code, origin_lang_name)
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
@ -268,10 +274,8 @@ class GoogleDriveIE(InfoExtractor):
subtitles_id = None subtitles_id = None
ttsurl = get_value('ttsurl') ttsurl = get_value('ttsurl')
if ttsurl: if ttsurl:
# the video Id for subtitles will be the last value in the ttsurl # the subtitles ID is the vid param of the ttsurl query
# query string subtitles_id = parse_qs(ttsurl).get('vid', [None])[-1]
subtitles_id = ttsurl.encode().decode(
'unicode_escape').split('=')[-1]
self.cookiejar.clear(domain='.google.com', path='/', name='NID') self.cookiejar.clear(domain='.google.com', path='/', name='NID')

Loading…
Cancel
Save