From 18fe696df9d60804a8f5cb8cd74f38111d6eb711 Mon Sep 17 00:00:00 2001 From: Gegham Zakaryan Date: Thu, 28 Aug 2025 05:12:08 +0400 Subject: [PATCH] [ie/googledrive] Fix subtitles extraction (#14139) Authored by: zakaryan2004 --- yt_dlp/extractor/googledrive.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py index dfba2d3ba1..0c84f0b241 100644 --- a/yt_dlp/extractor/googledrive.py +++ b/yt_dlp/extractor/googledrive.py @@ -12,6 +12,7 @@ from ..utils import ( get_element_html_by_id, int_or_none, lowercase_escape, + parse_qs, try_get, update_url_query, ) @@ -111,14 +112,18 @@ class GoogleDriveIE(InfoExtractor): self._caption_formats_ext.append(f.attrib['fmt_code']) def _get_captions_by_type(self, video_id, subtitles_id, caption_type, - origin_lang_code=None): + origin_lang_code=None, origin_lang_name=None): if not subtitles_id or not caption_type: return captions = {} for caption_entry in self._captions_xml.findall( self._CAPTIONS_ENTRY_TAG[caption_type]): caption_lang_code = caption_entry.attrib.get('lang_code') - if not caption_lang_code: + caption_name = caption_entry.attrib.get('name') or origin_lang_name + if not caption_lang_code or not caption_name: + self.report_warning(f'Missing necessary caption metadata. ' + f'Need lang_code and name attributes. ' + f'Found: {caption_entry.attrib}') continue caption_format_data = [] for caption_format in self._caption_formats_ext: @@ -129,7 +134,7 @@ class GoogleDriveIE(InfoExtractor): 'lang': (caption_lang_code if origin_lang_code is None else origin_lang_code), 'type': 'track', - 'name': '', + 'name': caption_name, 'kind': '', } if origin_lang_code is not None: @@ -155,14 +160,15 @@ class GoogleDriveIE(InfoExtractor): self._download_subtitles_xml(video_id, subtitles_id, hl) if not self._captions_xml: return - track = self._captions_xml.find('track') + track = next((t for t in self._captions_xml.findall('track') if t.attrib.get('cantran') == 'true'), None) if track is None: return origin_lang_code = track.attrib.get('lang_code') - if not origin_lang_code: + origin_lang_name = track.attrib.get('name') + if not origin_lang_code or not origin_lang_name: return return self._get_captions_by_type( - video_id, subtitles_id, 'automatic_captions', origin_lang_code) + video_id, subtitles_id, 'automatic_captions', origin_lang_code, origin_lang_name) def _real_extract(self, url): video_id = self._match_id(url) @@ -268,10 +274,8 @@ class GoogleDriveIE(InfoExtractor): subtitles_id = None ttsurl = get_value('ttsurl') if ttsurl: - # the video Id for subtitles will be the last value in the ttsurl - # query string - subtitles_id = ttsurl.encode().decode( - 'unicode_escape').split('=')[-1] + # the subtitles ID is the vid param of the ttsurl query + subtitles_id = parse_qs(ttsurl).get('vid', [None])[-1] self.cookiejar.clear(domain='.google.com', path='/', name='NID')