From a7c901cbbeb138f9a1e4f7d02d9fedeadb69ef4c Mon Sep 17 00:00:00 2001
From: Jody Bruchon
Date: Thu, 12 Nov 2020 11:47:15 -0500
Subject: [PATCH] Pre-check YouTube URLs in the archive before downloading

This tries to stop single video downloads that are in the archive file
from triggering any actual HTTP requests. This became an issue while
playlist downloading was broken by the disable_polymer option; as a
workaround, I put together download batch files using a
regex-manipulated copy of the playlist page DOM, so each batch file
would run a download for each single video in a playlist rather than
having the program fetch the playlist.

The major problem is that even if a video ID is in the archive, the
video info page must still be downloaded before it is ever checked
against the archive for rejection. By checking the video ID before
anything is downloaded, the rejection happens much faster and no
unnecessary HTTP requests are sent out.

This should be extended in the future for other services that do not
require a page download to retrieve the video ID. This uses a regex
match to check for a youtube.com URL and should basically be a no-op
for all other sites.
---
Note: url_archive_precheck below is revised relative to the commit named
in the mail header, so the post-image hash on the index line is stale.

 youtube_dlc/YoutubeDL.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py
index dd55ba0f2..4aafd327a 100644
--- a/youtube_dlc/YoutubeDL.py
+++ b/youtube_dlc/YoutubeDL.py
@@ -827,6 +827,10 @@ class YoutubeDL(object):
                                     'and will probably not work.')
 
             try:
+                reason = self.url_archive_precheck(url)  # Avoid downloading if we can check against the archive beforehand
+                if reason is not None:
+                    self.to_screen(reason)
+                    break
                 ie_result = ie.extract(url)
                 if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
                     break
@@ -2199,6 +2203,26 @@ class YoutubeDL(object):
             archive_file.write(vid_id + '\n')
         self.archive.add(vid_id)
 
+    def url_archive_precheck(self, url):
+        """Return a skip message if url is an already-archived YouTube video.
+
+        Only plain youtube.com watch URLs are examined, because their video ID
+        can be read from the URL itself without any HTTP request. Every other
+        URL yields None so that the normal extraction path handles it.
+        """
+        mobj = re.match(
+            r'https?://(?:[0-9A-Za-z-]+\.)*youtube\.com/watch\?'
+            r'(?:[^#]*&)?v=(?P<id>[0-9A-Za-z_-]{11})(?=[&#]|$)',
+            url)
+        # A list= parameter may make the URL resolve to a whole playlist, so
+        # do not reject it on the strength of a single video ID.
+        if mobj is None or re.search(r'[?&]list=', url):
+            return None
+        video_id = mobj.group('id')
+        if self.in_download_archive({'id': video_id, 'ie_key': "youtube"}):
+            return "[download] [youtube] ID %s has already been recorded in archive" % video_id
+        return None
+
     @staticmethod
     def format_resolution(format, default='unknown'):
         if format.get('vcodec') == 'none':