[cleanup] Misc fixes

Closes #4027
3 years ago · 56ba69e4c9
parent d05460e5fe
commit 56ba69e4c9
13 changed files with 72 additions and 83 deletions
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -431,7 +431,7 @@ title = self._search_regex(  # correct
    r'<span[^>]+class="title"[^>]*>([^<]+)', webpage, 'title')
 ```
-Or even better:
+which tolerates potential changes in the `style` attribute's value. Or even better:
 ```python
 title = self._search_regex(  # correct
@ -439,7 +439,7 @@ title = self._search_regex(  # correct
    webpage, 'title', group='title')
 ```
-Note how you tolerate potential changes in the `style` attribute's value or switch from using double quotes to single for `class` attribute: 
+which also handles single quotes in addition to double quotes.
 The code definitely should not look like:
--- a/README.md
+++ b/README.md
@ -103,7 +103,7 @@ yt-dlp is a [youtube-dl](https://github.com/ytdl-org/youtube-dl) fork based on t
 * **New and fixed extractors**: Many new extractors have been added and a lot of existing ones have been fixed. See the [changelog](Changelog.md) or the [list of supported sites](supportedsites.md)
-* **New MSOs**: Philo, Spectrum, SlingTV, Cablevision, RCN
+* **New MSOs**: Philo, Spectrum, SlingTV, Cablevision, RCN etc.
 * **Subtitle extraction from manifests**: Subtitles can be extracted from streaming media manifests. See [commit/be6202f](https://github.com/yt-dlp/yt-dlp/commit/be6202f12b97858b9d716e608394b51065d0419f) for details
@ -1710,7 +1710,7 @@ The following extractors use this feature:
 #### youtube
 * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and auto-translated subtitles respectively
-* `player_client`: Clients to extract video data from. The main clients are `web`, `android` and `ios` with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (Eg: `web_embedded`); and `mweb` and `tv_embedded` (agegate bypass) with no variants. By default, `android,web` is used, but tv_embedded and creator variants are added as required for age-gated videos. Similarly the music variants are added for `music.youtube.com` urls. You can use `all` to use all the clients, and `default` for the default clients.
+* `player_client`: Clients to extract video data from. The main clients are `web`, `android` and `ios` with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (Eg: `web_embedded`); and `mweb` and `tv_embedded` (agegate bypass) with no variants. By default, `android,web` is used, but `tv_embedded` and `creator` variants are added as required for age-gated videos. Similarly the music variants are added for `music.youtube.com` urls. You can use `all` to use all the clients, and `default` for the default clients.
 * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details
 * `include_live_dash`: Include live dash formats even without `--live-from-start` (These formats don't download properly)
 * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side)
--- a/setup.py
+++ b/setup.py
@ -140,6 +140,9 @@ setup(
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
        'Programming Language :: Python :: 3.11',
        'Programming Language :: Python :: Implementation',
        'Programming Language :: Python :: Implementation :: CPython',
        'Programming Language :: Python :: Implementation :: PyPy',
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@ -2570,7 +2570,7 @@ class YoutubeDL:
                format['dynamic_range'] = 'SDR'
            if (info_dict.get('duration') and format.get('tbr')
                    and not format.get('filesize') and not format.get('filesize_approx')):
-                format['filesize_approx'] = info_dict['duration'] * format['tbr'] * (1024 / 8)
+                format['filesize_approx'] = int(info_dict['duration'] * format['tbr'] * (1024 / 8))
            # Add HTTP headers, so that external programs can use them from the
            # json output
@ -3059,16 +3059,15 @@ class YoutubeDL:
                    return file
                success = True
-                merger = FFmpegMergerPP(self)
+                merger, fd = FFmpegMergerPP(self), None
                if info_dict.get('url'):
                    fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-')
                    if fd is not FFmpegFD and (
                            info_dict.get('section_start') or info_dict.get('section_end')):
                        msg = ('This format cannot be partially downloaded' if merger.available
                               else 'You have requested downloading the video partially, but ffmpeg is not installed')
-                    if not self.params.get('ignoreerrors'):
+                        self.report_error(f'{msg}. Aborting')
                        self.report_error(f'{msg}. Aborting due to --abort-on-error')
                        return
                    self.report_warning(f'{msg}. The entire video will be downloaded')
                if info_dict.get('requested_formats') is not None:
--- a/yt_dlp/cookies.py
+++ b/yt_dlp/cookies.py
@ -337,14 +337,11 @@ class ChromeCookieDecryptor:
 def get_cookie_decryptor(browser_root, browser_keyring_name, logger, *, keyring=None):
-    if sys.platform in ('linux', 'linux2'):
+    if sys.platform == 'darwin':
-        return LinuxChromeCookieDecryptor(browser_keyring_name, logger, keyring=keyring)
-    elif sys.platform == 'darwin':
        return MacChromeCookieDecryptor(browser_keyring_name, logger)
-    elif sys.platform == 'win32':
+    elif sys.platform in ('win32', 'cygwin'):
        return WindowsChromeCookieDecryptor(browser_root, logger)
-    else:
+    return LinuxChromeCookieDecryptor(browser_keyring_name, logger, keyring=keyring)
-        raise NotImplementedError(f'Chrome cookie decryption is not supported on this platform: {sys.platform}')
 class LinuxChromeCookieDecryptor(ChromeCookieDecryptor):
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@ -1487,7 +1487,7 @@ class InfoExtractor:
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
-                'filesize': float_or_none(e.get('contentSize')),
+                'filesize': int_or_none(float_or_none(e.get('contentSize'))),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
--- a/yt_dlp/extractor/extractors.py
+++ b/yt_dlp/extractor/extractors.py
@ -534,7 +534,6 @@ from .foxnews import (
 )
 from .foxsports import FoxSportsIE
 from .fptplay import FptplayIE
-from .franceculture import FranceCultureIE
 from .franceinter import FranceInterIE
 from .francetv import (
    FranceTVIE,
@ -1348,7 +1347,7 @@ from .radiocanada import (
 from .radiode import RadioDeIE
 from .radiojavan import RadioJavanIE
 from .radiobremen import RadioBremenIE
-from .radiofrance import RadioFranceIE
+from .radiofrance import FranceCultureIE, RadioFranceIE
 from .radiozet import RadioZetPodcastIE
 from .radiokapital import (
    RadioKapitalIE,
--- a/yt_dlp/extractor/franceculture.py
+++ b/yt_dlp/extractor/franceculture.py
@ -1,46 +0,0 @@
 from .common import InfoExtractor
 from ..utils import int_or_none, parse_duration, unified_strdate
 class FranceCultureIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?radiofrance\.fr/franceculture/podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d+)($|[?#])'
    _TESTS = [
        {
            'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487',
            'info_dict': {
                'id': '8440487',
                'display_id': 'la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau',
                'ext': 'mp3',
                'title': 'La physique d’Einstein aiderait-elle à comprendre le cerveau ?',
                'description': 'Existerait-il un pont conceptuel entre la physique de l’espace-temps et les neurosciences ?',
                'thumbnail': 'https://cdn.radiofrance.fr/s3/cruiser-production/2022/05/d184e7a3-4827-4494-bf94-04ed7b120db4/1200x630_gettyimages-200171095-001.jpg',
                'upload_date': '20220514',
                'duration': 2750,
            },
        },
    ]
    def _real_extract(self, url):
        video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
        webpage = self._download_webpage(url, display_id)
        # _search_json_ld doesn't correctly handle this. See https://github.com/yt-dlp/yt-dlp/pull/3874#discussion_r891903846
        video_data = self._search_json('', webpage, 'audio data', display_id, contains_pattern=r'\s*"@type"\s*:\s*"AudioObject"\s*.+')
        return {
            'id': video_id,
            'display_id': display_id,
            'url': video_data['contentUrl'],
            'ext': video_data.get('encodingFormat'),
            'vcodec': 'none' if video_data.get('encodingFormat') == 'mp3' else None,
            'duration': parse_duration(video_data.get('duration')),
            'title': self._html_search_regex(r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>',
                                             webpage, 'title', default=self._og_search_title(webpage)),
            'description': self._html_search_regex(
                r'(?s)<meta name="description"\s*content="([^"]+)', webpage, 'description', default=None),
            'thumbnail': self._og_search_thumbnail(webpage),
            'uploader': self._html_search_regex(
                r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None),
            'upload_date': unified_strdate(self._search_regex(
                r'"datePublished"\s*:\s*"([^"]+)', webpage, 'timestamp', fatal=False))
        }
--- a/yt_dlp/extractor/freetv.py
+++ b/yt_dlp/extractor/freetv.py
@ -2,11 +2,7 @@ import itertools
 import re
 from .common import InfoExtractor
-from ..utils import (
+from ..utils import int_or_none, traverse_obj, urlencode_postdata
-    int_or_none,
-    traverse_obj,
-    urlencode_postdata,
-)
 class FreeTvBaseIE(InfoExtractor):
--- a/yt_dlp/extractor/radiofrance.py
+++ b/yt_dlp/extractor/radiofrance.py
@ -1,6 +1,7 @@
 import re
 from .common import InfoExtractor
 from ..utils import parse_duration, unified_strdate
 class RadioFranceIE(InfoExtractor):
@ -54,3 +55,47 @@ class RadioFranceIE(InfoExtractor):
            'description': description,
            'uploader': uploader,
        }
 class FranceCultureIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?radiofrance\.fr/franceculture/podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d+)($|[?#])'
    _TESTS = [
        {
            'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487',
            'info_dict': {
                'id': '8440487',
                'display_id': 'la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau',
                'ext': 'mp3',
                'title': 'La physique d’Einstein aiderait-elle à comprendre le cerveau ?',
                'description': 'Existerait-il un pont conceptuel entre la physique de l’espace-temps et les neurosciences ?',
                'thumbnail': 'https://cdn.radiofrance.fr/s3/cruiser-production/2022/05/d184e7a3-4827-4494-bf94-04ed7b120db4/1200x630_gettyimages-200171095-001.jpg',
                'upload_date': '20220514',
                'duration': 2750,
            },
        },
    ]
    def _real_extract(self, url):
        video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
        webpage = self._download_webpage(url, display_id)
        # _search_json_ld doesn't correctly handle this. See https://github.com/yt-dlp/yt-dlp/pull/3874#discussion_r891903846
        video_data = self._search_json('', webpage, 'audio data', display_id, contains_pattern=r'\s*"@type"\s*:\s*"AudioObject"\s*.+')
        return {
            'id': video_id,
            'display_id': display_id,
            'url': video_data['contentUrl'],
            'ext': video_data.get('encodingFormat'),
            'vcodec': 'none' if video_data.get('encodingFormat') == 'mp3' else None,
            'duration': parse_duration(video_data.get('duration')),
            'title': self._html_search_regex(r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>',
                                             webpage, 'title', default=self._og_search_title(webpage)),
            'description': self._html_search_regex(
                r'(?s)<meta name="description"\s*content="([^"]+)', webpage, 'description', default=None),
            'thumbnail': self._og_search_thumbnail(webpage),
            'uploader': self._html_search_regex(
                r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None),
            'upload_date': unified_strdate(self._search_regex(
                r'"datePublished"\s*:\s*"([^"]+)', webpage, 'timestamp', fatal=False))
        }
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@ -3674,8 +3674,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        initial_data = None
        if webpage:
-            initial_data = self._search_json(
+            initial_data = self.extract_yt_initial_data(video_id, webpage, fatal=False)
-                self._YT_INITIAL_DATA_RE, webpage, 'yt initial data', video_id, fatal=False)
        if not initial_data:
            query = {'videoId': video_id}
            query.update(self._get_checkok_params())
--- a/yt_dlp/postprocessor/common.py
+++ b/yt_dlp/postprocessor/common.py
@ -45,9 +45,6 @@ class PostProcessor(metaclass=PostProcessorMetaClass):
    an initial argument and then with the returned value of the previous
    PostProcessor.
    The chain will be stopped if one of them ever returns None or the end
    of the chain is reached.
    PostProcessor objects follow a "mutual registration" process similar
    to InfoExtractor objects.
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@ -3498,13 +3498,13 @@ def match_filter_func(filters):
 def download_range_func(chapters, ranges):
    def inner(info_dict, ydl):
        warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
-                   else 'Chapter information is unavailable')
+                   else 'Cannot match chapters since chapter information is unavailable')
        for regex in chapters or []:
            for i, chapter in enumerate(info_dict.get('chapters') or []):
                if re.search(regex, chapter['title']):
                    warning = None
                    yield {**chapter, 'index': i}
-        if warning:
+        if chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
        yield from ({'start_time': start, 'end_time': end} for start, end in ranges or [])
@ -4903,9 +4903,9 @@ def to_high_limit_path(path):
    return path
-def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
+def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=None):
    val = traverse_obj(obj, *variadic(field))
-    if val in ignore:
+    if (not val and val != 0) if ignore is NO_DEFAULT else val in ignore:
        return default
    return template % (func(val) if func else val)