From a7c901cbbeb138f9a1e4f7d02d9fedeadb69ef4c Mon Sep 17 00:00:00 2001
From: Jody Bruchon <jody@jodybruchon.com>
Date: Thu, 12 Nov 2020 11:47:15 -0500
Subject: [PATCH] Pre-check YouTube URLs in the archive before downloading

This tries to stop single video downloads that are already in the
archive file from triggering any actual HTTP requests. This became
an issue when playlist downloads were broken by the disable_polymer
option; as a workaround, I put together download batch files using
a regex-manipulated copy of the playlist page DOM, so each batch
file would run a download for each single video in a playlist
rather than having the program fetch the playlist.

The major problem is that even if a video ID is in the archive,
the video info page must still be downloaded before the ID is ever
checked against the archive for rejection. By checking the video
ID before anything is downloaded, the rejection happens much
faster and no unnecessary HTTP requests are sent out.

This should be extended in the future for other services that do
not require a page download to retrieve the video ID.

This uses a regex match to check for a youtube.com URL and should
basically be a no-op for all other sites.
---
 youtube_dlc/YoutubeDL.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py
index dd55ba0f2..4aafd327a 100644
--- a/youtube_dlc/YoutubeDL.py
+++ b/youtube_dlc/YoutubeDL.py
@@ -827,6 +827,10 @@ class YoutubeDL(object):
                                     'and will probably not work.')
 
             try:
+                reason = self.url_archive_precheck(url)  # Avoid downloading if we can check against the archive beforehand
+                if reason is not None:
+                    self.to_screen(reason)
+                    break
                 ie_result = ie.extract(url)
                 if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
                     break
@@ -2199,6 +2203,18 @@ class YoutubeDL(object):
             archive_file.write(vid_id + '\n')
         self.archive.add(vid_id)
 
+    def url_archive_precheck(self, url):
+        """Return a skip message if url is a YouTube video already in the archive, else None."""
+        # Inspect only the URL text so that no web page is fetched before the archive check;
+        # the host must be youtube.com or a subdomain, and "v" may be any query parameter
+        mobj = re.match(r'^https?://(?:[a-zA-Z0-9-]+\.)*youtube\.com/[^?#]*\?(?:[^#]*&)?v=([^&#]+)', url)
+        if mobj is None:
+            return None
+        temp_id = mobj.group(1)
+        if self.in_download_archive({'id': temp_id, 'ie_key': "youtube"}):
+            return "[download] [youtube] ID %s has already been recorded in archive" % temp_id
+        return None
+
     @staticmethod
     def format_resolution(format, default='unknown'):
         if format.get('vcodec') == 'none':