diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index 5df13ad9b5..3b0ef323d7 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -77,3 +77,11 @@ body: render: shell validations: required: true + - type: markdown + attributes: + value: | + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you to download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml index 644c87a7ed..c8702c3569 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -89,3 +89,11 @@ body: render: shell validations: required: true + - type: markdown + attributes: + value: | + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you to download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml index 59d0474c28..5a6d2b0fbd 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml @@ -85,3 +85,11 @@ body: render: shell validations: required: true + - type: markdown + attributes: + value: | + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you to download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. 
diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index e207396737..a17770f614 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -70,3 +70,11 @@ body: render: shell validations: required: true + - type: markdown + attributes: + value: | + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you to download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.yml b/.github/ISSUE_TEMPLATE/5_feature_request.yml index e06db9ccf8..c600a9dcb6 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/5_feature_request.yml @@ -64,3 +64,11 @@ body: [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc render: shell + - type: markdown + attributes: + value: | + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you to download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/ISSUE_TEMPLATE/6_question.yml b/.github/ISSUE_TEMPLATE/6_question.yml index 571223a9c5..57bc9daf51 100644 --- a/.github/ISSUE_TEMPLATE/6_question.yml +++ b/.github/ISSUE_TEMPLATE/6_question.yml @@ -70,3 +70,11 @@ body: [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc render: shell + - type: markdown + attributes: + value: | + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you to download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. 
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 9a1a22e8f5..d062d7720d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -72,7 +72,7 @@ on: default: true type: boolean windows: - description: yt-dlp.exe, yt-dlp_min.exe, yt-dlp_win.zip + description: yt-dlp.exe, yt-dlp_win.zip default: true type: boolean windows32: @@ -199,22 +199,24 @@ jobs: GITHUB_WORKFLOW: build githubToken: ${{ github.token }} # To cache image arch: ${{ matrix.architecture }} - distro: ubuntu18.04 # Standalone executable should be built on minimum supported OS + distro: ubuntu20.04 # Standalone executable should be built on minimum supported OS dockerRunArgs: --volume "${PWD}/repo:/repo" install: | # Installing Python 3.10 from the Deadsnakes repo raises errors apt update - apt -y install zlib1g-dev libffi-dev python3.8 python3.8-dev python3.8-distutils python3-pip - python3.8 -m pip install -U pip setuptools wheel - # Cannot access any files from the repo directory at this stage - python3.8 -m pip install -U Pyinstaller mutagen pycryptodomex websockets brotli certifi secretstorage cffi + apt -y install zlib1g-dev libffi-dev python3.9 python3.9-dev python3.9-distutils python3-pip \ + python3-secretstorage # Cannot build cryptography wheel in virtual armv7 environment + python3.9 -m pip install -U pip wheel 'setuptools>=71.0.2' + # XXX: Keep this in sync with pyproject.toml (it can't be accessed at this stage) and exclude secretstorage + python3.9 -m pip install -U Pyinstaller mutagen pycryptodomex brotli certifi cffi \ + 'requests>=2.32.2,<3' 'urllib3>=1.26.17,<3' 'websockets>=13.0' run: | cd repo - python3.8 devscripts/install_deps.py -o --include build - python3.8 devscripts/install_deps.py --include pyinstaller --include secretstorage # Cached version may be out of date - python3.8 devscripts/update-version.py -c "${{ inputs.channel }}" -r "${{ needs.process.outputs.origin }}" "${{ inputs.version }}" - python3.8 
devscripts/make_lazy_extractors.py - python3.8 -m bundle.pyinstaller + python3.9 devscripts/install_deps.py -o --include build + python3.9 devscripts/install_deps.py --include pyinstaller # Cached versions may be out of date + python3.9 devscripts/update-version.py -c "${{ inputs.channel }}" -r "${{ needs.process.outputs.origin }}" "${{ inputs.version }}" + python3.9 devscripts/make_lazy_extractors.py + python3.9 -m bundle.pyinstaller if ${{ vars.UPDATE_TO_VERIFICATION && 'true' || 'false' }}; then arch="${{ (matrix.architecture == 'armv7' && 'armv7l') || matrix.architecture }}" @@ -237,27 +239,43 @@ jobs: macos: needs: process if: inputs.macos - runs-on: macos-12 + permissions: + contents: read + actions: write # For cleaning up cache + runs-on: macos-13 steps: - uses: actions/checkout@v4 # NB: Building universal2 does not work with python from actions/setup-python + + - name: Restore cached requirements + id: restore-cache + uses: actions/cache/restore@v4 + env: + SEGMENT_DOWNLOAD_TIMEOUT_MINS: 1 + with: + path: | + ~/yt-dlp-build-venv + key: cache-reqs-${{ github.job }} + - name: Install Requirements run: | brew install coreutils - python3 devscripts/install_deps.py --user -o --include build + python3 -m venv ~/yt-dlp-build-venv + source ~/yt-dlp-build-venv/bin/activate + python3 devscripts/install_deps.py -o --include build python3 devscripts/install_deps.py --print --include pyinstaller > requirements.txt # We need to ignore wheels otherwise we break universal2 builds - python3 -m pip install -U --user --no-binary :all: -r requirements.txt + python3 -m pip install -U --no-binary :all: -r requirements.txt # We need to fuse our own universal2 wheels for curl_cffi - python3 -m pip install -U --user delocate + python3 -m pip install -U 'delocate==0.11.0' mkdir curl_cffi_whls curl_cffi_universal2 python3 devscripts/install_deps.py --print -o --include curl-cffi > requirements.txt for platform in "macosx_11_0_arm64" "macosx_11_0_x86_64"; do python3 -m pip download \ 
--only-binary=:all: \ --platform "${platform}" \ - --pre -d curl_cffi_whls \ + -d curl_cffi_whls \ -r requirements.txt done ( # Overwrite x86_64-only libs with fat/universal2 libs or else Pyinstaller will do the opposite @@ -274,9 +292,10 @@ jobs: ) python3 -m delocate.cmd.delocate_fuse curl_cffi_whls/curl_cffi*.whl -w curl_cffi_universal2 python3 -m delocate.cmd.delocate_fuse curl_cffi_whls/cffi*.whl -w curl_cffi_universal2 - cd curl_cffi_universal2 - for wheel in ./*cffi*.whl; do mv -n -- "${wheel}" "${wheel/x86_64/universal2}"; done - python3 -m pip install -U --user ./*cffi*.whl + for wheel in curl_cffi_universal2/*cffi*.whl; do + mv -n -- "${wheel}" "${wheel/x86_64/universal2}" + done + python3 -m pip install --force-reinstall -U curl_cffi_universal2/*cffi*.whl - name: Prepare run: | @@ -284,6 +303,7 @@ jobs: python3 devscripts/make_lazy_extractors.py - name: Build run: | + source ~/yt-dlp-build-venv/bin/activate python3 -m bundle.pyinstaller --target-architecture universal2 --onedir (cd ./dist/yt-dlp_macos && zip -r ../yt-dlp_macos.zip .) 
python3 -m bundle.pyinstaller --target-architecture universal2 @@ -307,10 +327,28 @@ jobs: dist/yt-dlp_macos.zip compression-level: 0 + - name: Cleanup cache + if: steps.restore-cache.outputs.cache-hit == 'true' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + cache_key: cache-reqs-${{ github.job }} + repository: ${{ github.repository }} + branch: ${{ github.ref }} + run: | + gh extension install actions/gh-actions-cache + gh actions-cache delete "${cache_key}" -R "${repository}" -B "${branch}" --confirm + + - name: Cache requirements + uses: actions/cache/save@v4 + with: + path: | + ~/yt-dlp-build-venv + key: cache-reqs-${{ github.job }} + macos_legacy: needs: process if: inputs.macos_legacy - runs-on: macos-12 + runs-on: macos-13 steps: - uses: actions/checkout@v4 @@ -367,13 +405,13 @@ jobs: steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 - with: # 3.8 is used for Win7 support - python-version: "3.8" + with: + python-version: "3.10" - name: Install Requirements run: | # Custom pyinstaller built with https://github.com/yt-dlp/pyinstaller-builds python devscripts/install_deps.py -o --include build python devscripts/install_deps.py --include curl-cffi - python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-6.7.0-py3-none-any.whl" + python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-6.10.0-py3-none-any.whl" - name: Prepare run: | @@ -383,22 +421,12 @@ jobs: run: | python -m bundle.pyinstaller python -m bundle.pyinstaller --onedir - Move-Item ./dist/yt-dlp.exe ./dist/yt-dlp_real.exe Compress-Archive -Path ./dist/yt-dlp/* -DestinationPath ./dist/yt-dlp_win.zip - - name: Install Requirements (py2exe) - run: | - python devscripts/install_deps.py --include py2exe - - name: Build (py2exe) - run: | - python -m bundle.py2exe - Move-Item ./dist/yt-dlp.exe ./dist/yt-dlp_min.exe - Move-Item ./dist/yt-dlp_real.exe ./dist/yt-dlp.exe - - name: Verify --update-to if: 
vars.UPDATE_TO_VERIFICATION run: | - foreach ($name in @("yt-dlp","yt-dlp_min")) { + foreach ($name in @("yt-dlp")) { Copy-Item "./dist/${name}.exe" "./dist/${name}_downgraded.exe" $version = & "./dist/${name}.exe" --version & "./dist/${name}_downgraded.exe" -v --update-to yt-dlp/yt-dlp@2023.03.04 @@ -414,7 +442,6 @@ jobs: name: build-bin-${{ github.job }} path: | dist/yt-dlp.exe - dist/yt-dlp_min.exe dist/yt-dlp_win.zip compression-level: 0 @@ -427,13 +454,13 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: - python-version: "3.8" + python-version: "3.10" architecture: "x86" - name: Install Requirements run: | python devscripts/install_deps.py -o --include build python devscripts/install_deps.py - python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/i686/pyinstaller-6.7.0-py3-none-any.whl" + python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/i686/pyinstaller-6.10.0-py3-none-any.whl" - name: Prepare run: | @@ -489,6 +516,10 @@ jobs: # make sure SHA sums are also printed to stdout sha256sum -- * | tee ../SHA2-256SUMS sha512sum -- * | tee ../SHA2-512SUMS + # also print as permanent annotations to the summary page + while read -r shasum; do + echo "::notice title=${shasum##* }::sha256: ${shasum% *}" + done < ../SHA2-256SUMS - name: Make Update spec run: | @@ -497,13 +528,29 @@ jobs: lock 2022.08.18.36 .+ Python 3\.6 lock 2023.11.16 (?!win_x86_exe).+ Python 3\.7 lock 2023.11.16 win_x86_exe .+ Windows-(?:Vista|2008Server) + lock 2024.10.22 py2exe .+ + lock 2024.10.22 linux_(?:armv7l|aarch64)_exe .+-glibc2\.(?:[12]?\d|30)\b + lock 2024.10.22 (?!\w+_exe).+ Python 3\.8 + lock 2024.10.22 win(?:_x86)?_exe Python 3\.[78].+ Windows-(?:7-|2008ServerR2) lockV2 yt-dlp/yt-dlp 2022.08.18.36 .+ Python 3\.6 lockV2 yt-dlp/yt-dlp 2023.11.16 (?!win_x86_exe).+ Python 3\.7 lockV2 yt-dlp/yt-dlp 2023.11.16 win_x86_exe .+ Windows-(?:Vista|2008Server) + lockV2 yt-dlp/yt-dlp 2024.10.22 py2exe .+ + lockV2 yt-dlp/yt-dlp 2024.10.22 
linux_(?:armv7l|aarch64)_exe .+-glibc2\.(?:[12]?\d|30)\b + lockV2 yt-dlp/yt-dlp 2024.10.22 (?!\w+_exe).+ Python 3\.8 + lockV2 yt-dlp/yt-dlp 2024.10.22 win(?:_x86)?_exe Python 3\.[78].+ Windows-(?:7-|2008ServerR2) lockV2 yt-dlp/yt-dlp-nightly-builds 2023.11.15.232826 (?!win_x86_exe).+ Python 3\.7 lockV2 yt-dlp/yt-dlp-nightly-builds 2023.11.15.232826 win_x86_exe .+ Windows-(?:Vista|2008Server) + lockV2 yt-dlp/yt-dlp-nightly-builds 2024.10.22.051025 py2exe .+ + lockV2 yt-dlp/yt-dlp-nightly-builds 2024.10.22.051025 linux_(?:armv7l|aarch64)_exe .+-glibc2\.(?:[12]?\d|30)\b + lockV2 yt-dlp/yt-dlp-nightly-builds 2024.10.22.051025 (?!\w+_exe).+ Python 3\.8 + lockV2 yt-dlp/yt-dlp-nightly-builds 2024.10.22.051025 win(?:_x86)?_exe Python 3\.[78].+ Windows-(?:7-|2008ServerR2) lockV2 yt-dlp/yt-dlp-master-builds 2023.11.15.232812 (?!win_x86_exe).+ Python 3\.7 lockV2 yt-dlp/yt-dlp-master-builds 2023.11.15.232812 win_x86_exe .+ Windows-(?:Vista|2008Server) + lockV2 yt-dlp/yt-dlp-master-builds 2024.10.22.045052 py2exe .+ + lockV2 yt-dlp/yt-dlp-master-builds 2024.10.22.060347 linux_(?:armv7l|aarch64)_exe .+-glibc2\.(?:[12]?\d|30)\b + lockV2 yt-dlp/yt-dlp-master-builds 2024.10.22.060347 (?!\w+_exe).+ Python 3\.8 + lockV2 yt-dlp/yt-dlp-master-builds 2024.10.22.060347 win(?:_x86)?_exe Python 3\.[78].+ Windows-(?:7-|2008ServerR2) EOF - name: Sign checksum files diff --git a/.github/workflows/core.yml b/.github/workflows/core.yml index fdfdebc65d..9a4342a585 100644 --- a/.github/workflows/core.yml +++ b/.github/workflows/core.yml @@ -36,16 +36,20 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest] - # CPython 3.8 is in quick-test - python-version: ['3.9', '3.10', '3.11', '3.12', pypy-3.8, pypy-3.10] + # CPython 3.9 is in quick-test + python-version: ['3.10', '3.11', '3.12', '3.13', pypy-3.10] include: # atleast one of each CPython/PyPy tests must be in windows - os: windows-latest - python-version: '3.8' + python-version: '3.9' + - os: windows-latest + python-version: '3.10' - os: 
windows-latest python-version: '3.12' - os: windows-latest - python-version: pypy-3.9 + python-version: '3.13' + - os: windows-latest + python-version: pypy-3.10 steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} @@ -55,7 +59,8 @@ jobs: - name: Install test requirements run: python3 ./devscripts/install_deps.py --include test --include curl-cffi - name: Run tests + timeout-minutes: 15 continue-on-error: False run: | python3 -m yt_dlp -v || true # Print debug head - python3 ./devscripts/run_tests.py core + python3 ./devscripts/run_tests.py --pytest-args '--reruns 2 --reruns-delay 3.0' core diff --git a/.github/workflows/download.yml b/.github/workflows/download.yml index 7256804d93..6849fba9b6 100644 --- a/.github/workflows/download.yml +++ b/.github/workflows/download.yml @@ -28,13 +28,13 @@ jobs: fail-fast: true matrix: os: [ubuntu-latest] - python-version: ['3.10', '3.11', '3.12', pypy-3.8, pypy-3.10] + python-version: ['3.10', '3.11', '3.12', '3.13', pypy-3.10] include: # atleast one of each CPython/PyPy tests must be in windows - os: windows-latest - python-version: '3.8' + python-version: '3.9' - os: windows-latest - python-version: pypy-3.9 + python-version: pypy-3.10 steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} diff --git a/.github/workflows/issue-lockdown.yml b/.github/workflows/issue-lockdown.yml new file mode 100644 index 0000000000..4b973e2e61 --- /dev/null +++ b/.github/workflows/issue-lockdown.yml @@ -0,0 +1,21 @@ +name: Issue Lockdown +on: + issues: + types: [opened] + +permissions: + issues: write + +jobs: + lockdown: + name: Issue Lockdown + if: vars.ISSUE_LOCKDOWN + runs-on: ubuntu-latest + steps: + - name: "Lock new issue" + env: + GH_TOKEN: ${{ github.token }} + ISSUE_NUMBER: ${{ github.event.issue.number }} + REPOSITORY: ${{ github.repository }} + run: | + gh issue lock "${ISSUE_NUMBER}" -R "${REPOSITORY}" diff --git a/.github/workflows/quick-test.yml 
b/.github/workflows/quick-test.yml index 3afb51a308..1a32bbfe31 100644 --- a/.github/workflows/quick-test.yml +++ b/.github/workflows/quick-test.yml @@ -10,16 +10,17 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - name: Set up Python 3.8 + - name: Set up Python 3.9 uses: actions/setup-python@v5 with: - python-version: '3.8' + python-version: '3.9' - name: Install test requirements - run: python3 ./devscripts/install_deps.py --include test + run: python3 ./devscripts/install_deps.py -o --include test - name: Run tests + timeout-minutes: 15 run: | python3 -m yt_dlp -v || true - python3 ./devscripts/run_tests.py core + python3 ./devscripts/run_tests.py --pytest-args '--reruns 2 --reruns-delay 3.0' core check: name: Code check if: "!contains(github.event.head_commit.message, 'ci skip all')" @@ -28,7 +29,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: - python-version: '3.8' + python-version: '3.9' - name: Install dev dependencies run: python3 ./devscripts/install_deps.py -o --include static-analysis - name: Make lazy extractors diff --git a/.github/workflows/release-master.yml b/.github/workflows/release-master.yml index a84547580b..c49319b171 100644 --- a/.github/workflows/release-master.yml +++ b/.github/workflows/release-master.yml @@ -24,6 +24,7 @@ jobs: source: master permissions: contents: write - packages: write + packages: write # For package cache + actions: write # For cleaning up cache id-token: write # mandatory for trusted publishing secrets: inherit diff --git a/.github/workflows/release-nightly.yml b/.github/workflows/release-nightly.yml index f459a3a17e..b536c50669 100644 --- a/.github/workflows/release-nightly.yml +++ b/.github/workflows/release-nightly.yml @@ -37,6 +37,7 @@ jobs: source: nightly permissions: contents: write - packages: write + packages: write # For package cache + actions: write # For cleaning up cache id-token: write # mandatory for trusted publishing secrets: inherit diff --git 
a/.github/workflows/release.yml b/.github/workflows/release.yml index 32268b32f3..2bc09c64d0 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -204,7 +204,7 @@ jobs: git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com" git add -u git commit -m "Release ${{ env.version }}" \ - -m "Created by: ${{ github.event.sender.login }}" -m ":ci skip all :ci run dl" + -m "Created by: ${{ github.event.sender.login }}" -m ":ci skip all" git push origin --force ${{ github.event.ref }}:release - name: Get target commitish @@ -228,7 +228,8 @@ jobs: origin: ${{ needs.prepare.outputs.target_repo }} permissions: contents: read - packages: write # For package cache + packages: write # For package cache + actions: write # For cleaning up cache secrets: GPG_SIGNING_KEY: ${{ secrets.GPG_SIGNING_KEY }} @@ -281,6 +282,7 @@ jobs: uses: pypa/gh-action-pypi-publish@release/v1 with: verbose: true + attestations: false # Currently doesn't work w/ reusable workflows (breaks nightly) publish: needs: [prepare, build] @@ -324,7 +326,7 @@ jobs: "(https://github.com/yt-dlp/yt-dlp-master-builds/releases/latest \"Master builds\")"' || '' }} > ./RELEASE_NOTES printf '\n\n' >> ./RELEASE_NOTES cat >> ./RELEASE_NOTES << EOF - #### A description of the various files are in the [README](https://github.com/${{ github.repository }}#release-files) + #### A description of the various files is in the [README](https://github.com/${{ github.repository }}#release-files) --- $(python ./devscripts/make_changelog.py -vv --collapsible) EOF diff --git a/.github/workflows/sanitize-comment.yml b/.github/workflows/sanitize-comment.yml new file mode 100644 index 0000000000..45c87cdd47 --- /dev/null +++ b/.github/workflows/sanitize-comment.yml @@ -0,0 +1,17 @@ +name: Sanitize comment + +on: + issue_comment: + types: [created, edited] + +permissions: + issues: write + +jobs: + sanitize-comment: + name: Sanitize comment + if: vars.SANITIZE_COMMENT && 
!github.event.issue.pull_request + runs-on: ubuntu-latest + steps: + - name: Sanitize comment + uses: yt-dlp/sanitize-comment@v1 diff --git a/.gitignore b/.gitignore index db322c4f08..fdd904f7fe 100644 --- a/.gitignore +++ b/.gitignore @@ -51,7 +51,6 @@ cookies *.srt *.ssa *.swf -*.swp *.tt *.ttml *.url @@ -119,6 +118,7 @@ yt-dlp.zip .vscode *.sublime-* *.code-workspace +*.swp # Lazy extractors */extractor/lazy_extractors.py diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index aeba3c44d1..fd7b0f1210 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -37,14 +37,18 @@ Bugs and suggestions should be reported at: [yt-dlp/yt-dlp/issues](https://githu **Please include the full output of yt-dlp when run with `-vU`**, i.e. **add** `-vU` flag to **your command line**, copy the **whole** output and post it in the issue body wrapped in \`\`\` for better formatting. It should look similar to this: ``` $ yt-dlp -vU -[debug] Command-line config: ['-v', 'demo.com'] -[debug] Encodings: locale UTF-8, fs utf-8, out utf-8, pref UTF-8 -[debug] yt-dlp version 2021.09.25 (zip) -[debug] Python version 3.8.10 (CPython 64bit) - Linux-5.4.0-74-generic-x86_64-with-glibc2.29 -[debug] exe versions: ffmpeg 4.2.4, ffprobe 4.2.4 +[debug] Command-line config: ['-vU', 'https://www.example.com/'] +[debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 +[debug] yt-dlp version nightly@... 
from yt-dlp/yt-dlp-nightly-builds [1a176d874] (win_exe) +[debug] Python 3.10.11 (CPython AMD64 64bit) - Windows-10-10.0.20348-SP0 (OpenSSL 1.1.1t 7 Feb 2023) +[debug] exe versions: ffmpeg 7.0.2 (setts), ffprobe 7.0.2 +[debug] Optional libraries: Cryptodome-3.21.0, brotli-1.1.0, certifi-2024.08.30, curl_cffi-0.5.10, mutagen-1.47.0, requests-2.32.3, sqlite3-3.40.1, urllib3-2.2.3, websockets-13.1 [debug] Proxy map: {} -Current Build Hash 25cc412d1d3c0725a1f2f5b7e4682f6fb40e6d15f7024e96f7afd572e9919535 -yt-dlp is up to date (2021.09.25) +[debug] Request Handlers: urllib, requests, websockets, curl_cffi +[debug] Loaded 1838 extractors +[debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest +Latest version: nightly@... from yt-dlp/yt-dlp-nightly-builds +yt-dlp is up to date (nightly@... from yt-dlp/yt-dlp-nightly-builds) ... ``` **Do not post screenshots of verbose logs; only plain text is acceptable.** @@ -127,7 +131,7 @@ While these steps won't necessarily ensure that no misuse of the account takes p ### Is the website primarily used for piracy? -We follow [youtube-dl's policy](https://github.com/ytdl-org/youtube-dl#can-you-add-support-for-this-anime-video-site-or-site-which-shows-current-movies-for-free) to not support services that is primarily used for infringing copyright. Additionally, it has been decided to not to support porn sites that specialize in fakes. We also cannot support any service that serves only [DRM protected content](https://en.wikipedia.org/wiki/Digital_rights_management). +We follow [youtube-dl's policy](https://github.com/ytdl-org/youtube-dl#can-you-add-support-for-this-anime-video-site-or-site-which-shows-current-movies-for-free) to not support services that are primarily used for infringing copyright. Additionally, it has been decided not to support porn sites that specialize in fakes. 
We also cannot support any service that serves only [DRM protected content](https://en.wikipedia.org/wiki/Digital_rights_management). @@ -215,8 +219,8 @@ After you have ensured this site is distributing its content legally, you can fo ```python from .common import InfoExtractor - - + + class YourExtractorIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P<id>[0-9]+)' _TESTS = [{ @@ -233,7 +237,7 @@ After you have ensured this site is distributing its content legally, you can fo # * MD5 checksum; start the string with 'md5:', e.g. # 'description': 'md5:098f6bcd4621d373cade4e832627b4f6', # * A regular expression; start the string with 're:', e.g. - # 'thumbnail': r're:^https?://.*\.jpg$', + # 'thumbnail': r're:https?://.*\.jpg$', # * A count of elements in a list; start the string with 'count:', e.g. # 'tags': 'count:10', # * Any Python type, e.g. @@ -244,7 +248,7 @@ After you have ensured this site is distributing its content legally, you can fo def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - + # TODO more code goes here, for example ... title = self._html_search_regex(r'

<h1>(.+?)</h1>

', webpage, 'title') @@ -268,7 +272,7 @@ After you have ensured this site is distributing its content legally, you can fo You can use `hatch fmt` to automatically fix problems. Rules that the linter/formatter enforces should not be disabled with `# noqa` unless a maintainer requests it. The only exception allowed is for old/printf-style string formatting in GraphQL query templates (use `# noqa: UP031`). -1. Make sure your code works under all [Python](https://www.python.org/) versions supported by yt-dlp, namely CPython and PyPy for Python 3.8 and above. Backward compatibility is not required for even older versions of Python. +1. Make sure your code works under all [Python](https://www.python.org/) versions supported by yt-dlp, namely CPython >=3.9 and PyPy >=3.10. Backward compatibility is not required for even older versions of Python. 1. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files, [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: ```shell @@ -302,10 +306,9 @@ Extractors are very fragile by nature since they depend on the layout of the sou For extraction to work yt-dlp relies on metadata your extractor extracts and provides to yt-dlp expressed by an [information dictionary](yt_dlp/extractor/common.py#L119-L440) or simply *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by yt-dlp: - `id` (media identifier) - - `title` (media title) - `url` (media download URL) or `formats` -The aforementioned metafields are the critical data that the extraction does not make any sense without and if any of them fail to be extracted then the extractor is considered completely broken. While all extractors must return a `title`, they must also allow it's extraction to be non-fatal. +The aforementioned metadata fields are the critical data without which extraction does not make any sense. 
If any of them fail to be extracted, then the extractor is considered broken. All other metadata extraction should be completely non-fatal. For pornographic sites, appropriate `age_limit` must also be returned. @@ -320,7 +323,7 @@ Say you have some source dictionary `meta` that you've fetched as JSON with HTTP ```python meta = self._download_json(url, video_id) ``` - + Assume at this point `meta`'s layout is: ```python @@ -750,7 +753,7 @@ Use `url_or_none` for safe URL processing. Use `traverse_obj` and `try_call` (superseeds `dict_get` and `try_get`) for safe metadata extraction from parsed JSON. -Use `unified_strdate` for uniform `upload_date` or any `YYYYMMDD` meta field extraction, `unified_timestamp` for uniform `timestamp` extraction, `parse_filesize` for `filesize` extraction, `parse_count` for count meta fields extraction, `parse_resolution`, `parse_duration` for `duration` extraction, `parse_age_limit` for `age_limit` extraction. +Use `unified_strdate` for uniform `upload_date` or any `YYYYMMDD` meta field extraction, `unified_timestamp` for uniform `timestamp` extraction, `parse_filesize` for `filesize` extraction, `parse_count` for count meta fields extraction, `parse_resolution`, `parse_duration` for `duration` extraction, `parse_age_limit` for `age_limit` extraction. Explore [`yt_dlp/utils/`](yt_dlp/utils/) for more useful convenience functions. 
diff --git a/CONTRIBUTORS b/CONTRIBUTORS index e0d1668ee2..949bc89c47 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -631,3 +631,60 @@ voidful vtexier WyohKnott trueauracoral +ASertacAkkaya +axpauls +chilinux +hafeoz +JSubelj +jucor +megumintyan +mgedmin +Niluge-KiWi +peisenwang +TheZ3ro +tippfehlr +varunchopra +DrakoCpp +PatrykMis +DinhHuy2010 +exterrestris +harbhim +LeSuisse +DunnesH +iancmy +mokrueger +luvyana +szantnerb +hugepower +scribblemaniac +Codenade +Demon000 +Deukhoofd +grqz +hibes +Khaoklong51 +kieraneglin +lengzuo +naglis +ndyanx +otovalek +quad +rakslice +sahilsinghss73 +tony-hn +xingchensong +BallzCrasher +coreywright +eric321 +poyhen +tetra-fox +444995 +63427083 +allendema +DarkZeros +DTrombett +imranh2 +KarboniteKream +mikkovedru +pktiuk +rubyevadestaxes diff --git a/Changelog.md b/Changelog.md index 267330208e..0efccadd10 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,396 @@ # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2024.10.22 + +#### Important changes +- **Following this release, yt-dlp's Python dependencies *must* be installed using the `default` group** +If you're installing yt-dlp with pip/pipx or requiring yt-dlp in your own Python project, you'll need to specify `yt-dlp[default]` if you want to also install yt-dlp's optional dependencies (which were previously included by default). [Read more](https://github.com/yt-dlp/yt-dlp/pull/11255) +- **py2exe is no longer supported** +This release's `yt-dlp_min.exe` will be the last, and it's actually a PyInstaller-bundled executable so that yt-dlp users updating their py2exe build with `-U` will be automatically migrated. 
[Read more](https://github.com/yt-dlp/yt-dlp/issues/10087) + +#### Core changes +- [Add extractor helpers](https://github.com/yt-dlp/yt-dlp/commit/d710a6ca7c622705c0c8c8a3615916f531137d5d) ([#10653](https://github.com/yt-dlp/yt-dlp/issues/10653)) by [Grub4K](https://github.com/Grub4K) +- [Add option `--plugin-dirs`](https://github.com/yt-dlp/yt-dlp/commit/0f593dca9fa995d88eb763170a932da61c8f24dc) ([#11277](https://github.com/yt-dlp/yt-dlp/issues/11277)) by [coletdjnz](https://github.com/coletdjnz), [imranh2](https://github.com/imranh2) +- **cookies**: [Fix compatibility for Python <=3.9 in traceback](https://github.com/yt-dlp/yt-dlp/commit/c5f0f58efd8c3930de8202c15a5c53b1b635bd51) by [Grub4K](https://github.com/Grub4K) +- **utils** + - `Popen`: [Reset PyInstaller environment](https://github.com/yt-dlp/yt-dlp/commit/fbc66e3ab35743cc847a21223c67d88bb463cd9c) ([#11258](https://github.com/yt-dlp/yt-dlp/issues/11258)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) + - `sanitize_path`: [Reimplement function](https://github.com/yt-dlp/yt-dlp/commit/85b87c991af25dcb35630fa94580fd418e78ee33) ([#11198](https://github.com/yt-dlp/yt-dlp/issues/11198)) by [Grub4K](https://github.com/Grub4K) + +#### Extractor changes +- **adobepass**: [Use newer user-agent for provider redirect request](https://github.com/yt-dlp/yt-dlp/commit/dcfeea4dd5e5686821350baa6c7767a011944867) ([#11250](https://github.com/yt-dlp/yt-dlp/issues/11250)) by [bashonly](https://github.com/bashonly) +- **afreecatv**: [Adapt extractors to new sooplive.co.kr domain](https://github.com/yt-dlp/yt-dlp/commit/46fe60ff19395698a87113b2944453779e04ab9d) ([#11266](https://github.com/yt-dlp/yt-dlp/issues/11266)) by [63427083](https://github.com/63427083), [bashonly](https://github.com/bashonly) +- **cda**: [Support folders](https://github.com/yt-dlp/yt-dlp/commit/c4d95f67ddc522297bb1fea875255cf94b34d595) ([#10786](https://github.com/yt-dlp/yt-dlp/issues/10786)) by 
[pktiuk](https://github.com/pktiuk) +- **cwtv**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/9d43dcb2c5c38f443f84dfc126cd32720e1a1ad6) ([#11230](https://github.com/yt-dlp/yt-dlp/issues/11230)) by [bashonly](https://github.com/bashonly) +- **drtv**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/f4338714241b11d9d43768ae71a25f5e952f677d) ([#11141](https://github.com/yt-dlp/yt-dlp/issues/11141)) by [444995](https://github.com/444995) +- **funk**: [Extend `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/8de431ec97a4b62b73df8f686b6e21e462775336) ([#11269](https://github.com/yt-dlp/yt-dlp/issues/11269)) by [seproDev](https://github.com/seproDev) +- **gem.cbc.ca**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/40054cb4a7ebbea30d335d444e6f58b298a3baa0) ([#11196](https://github.com/yt-dlp/yt-dlp/issues/11196)) by [DavidSkrundz](https://github.com/DavidSkrundz) +- **generic**: [Impersonate browser by default](https://github.com/yt-dlp/yt-dlp/commit/edfd095b1917701c5046bd51f9542897c17d41a7) ([#11206](https://github.com/yt-dlp/yt-dlp/issues/11206)) by [Grub4K](https://github.com/Grub4K) +- **imgur** + - [Fix thumbnail extraction](https://github.com/yt-dlp/yt-dlp/commit/87408ccfd772ddf31a8323d8151c24f9577cbc9f) ([#11298](https://github.com/yt-dlp/yt-dlp/issues/11298)) by [seproDev](https://github.com/seproDev) + - [Support new URL format](https://github.com/yt-dlp/yt-dlp/commit/5af774d7a36c00bea618c7047c9326532cd3f616) ([#11075](https://github.com/yt-dlp/yt-dlp/issues/11075)) by [Deer-Spangle](https://github.com/Deer-Spangle) +- **patreon**: campaign: [Stricter URL matching](https://github.com/yt-dlp/yt-dlp/commit/babb70960595e2146f06f81affc29c7e713e34e2) ([#11235](https://github.com/yt-dlp/yt-dlp/issues/11235)) by [bashonly](https://github.com/bashonly) +- **reddit**: [Detect and raise when login is required](https://github.com/yt-dlp/yt-dlp/commit/cba7868502f04175fecf9ab3e363296aee7ebec2) 
([#11202](https://github.com/yt-dlp/yt-dlp/issues/11202)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **substack**: [Resolve podcast file extensions](https://github.com/yt-dlp/yt-dlp/commit/3148c1822f66533998278f0a1cf842b9bea1526a) ([#11275](https://github.com/yt-dlp/yt-dlp/issues/11275)) by [bashonly](https://github.com/bashonly) +- **telecinco**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/0b7ec08816fb196cd41d392f8331b4eb8366c4f8) ([#11142](https://github.com/yt-dlp/yt-dlp/issues/11142)) by [bashonly](https://github.com/bashonly), [DarkZeros](https://github.com/DarkZeros) +- **tubitv**: [Strip extra whitespace from titles](https://github.com/yt-dlp/yt-dlp/commit/e68b4c19af122876561a41f2dd8093fae7b417c7) ([#10795](https://github.com/yt-dlp/yt-dlp/issues/10795)) by [allendema](https://github.com/allendema) +- **tver**: [Support series URLs](https://github.com/yt-dlp/yt-dlp/commit/ceaea731b6e314dbbdfb2e358d7677785ed0b4fc) ([#9507](https://github.com/yt-dlp/yt-dlp/issues/9507)) by [pzhlkj6612](https://github.com/pzhlkj6612), [vvto33](https://github.com/vvto33) +- **twitter**: spaces: [Allow extraction when not logged in](https://github.com/yt-dlp/yt-dlp/commit/679c68240a26481ea7c07cc0c014745631ea8481) ([#11289](https://github.com/yt-dlp/yt-dlp/issues/11289)) by [rubyevadestaxes](https://github.com/rubyevadestaxes) +- **weverse**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/5310fa87f6cb7f66bf42e2520878952fbf6b1652) ([#11215](https://github.com/yt-dlp/yt-dlp/issues/11215)) by [bashonly](https://github.com/bashonly) +- **youtube** + - [Fix `comment_count` extraction](https://github.com/yt-dlp/yt-dlp/commit/7af1ddaaf2a6a0a750373a9ab53c7770af4f9fe4) ([#11274](https://github.com/yt-dlp/yt-dlp/issues/11274)) by [bashonly](https://github.com/bashonly) + - [Remove broken `android_producer` client](https://github.com/yt-dlp/yt-dlp/commit/fed53d70bdb7d3e37ef63dd7fcf0ef74356167fd) ([#11297](https://github.com/yt-dlp/yt-dlp/issues/11297)) by 
[bashonly](https://github.com/bashonly) + - [Remove broken age-restriction workaround](https://github.com/yt-dlp/yt-dlp/commit/ec2f4bf0823a13043f98f5bd0bf6677837bf09dc) ([#11297](https://github.com/yt-dlp/yt-dlp/issues/11297)) by [bashonly](https://github.com/bashonly) + - [Support logging in with OAuth](https://github.com/yt-dlp/yt-dlp/commit/b8635c1d4779da195e71aa281f73aaad702c935e) ([#11001](https://github.com/yt-dlp/yt-dlp/issues/11001)) by [coletdjnz](https://github.com/coletdjnz) + +#### Misc. changes +- **build** + - [Migrate `py2exe` builds to `win_exe`](https://github.com/yt-dlp/yt-dlp/commit/a886cf3e900f4a2ec00af705f883539269545609) ([#11256](https://github.com/yt-dlp/yt-dlp/issues/11256)) by [bashonly](https://github.com/bashonly) + - [Use `macos-13` image for macOS builds](https://github.com/yt-dlp/yt-dlp/commit/64d84d75ca8c19ec06558cc7c511f5f4f7a822bc) ([#11236](https://github.com/yt-dlp/yt-dlp/issues/11236)) by [bashonly](https://github.com/bashonly) + - `make_lazy_extractors`: [Force running without plugins](https://github.com/yt-dlp/yt-dlp/commit/1a830394a21a81a3e9918f9e175abc9fbb21f089) ([#11205](https://github.com/yt-dlp/yt-dlp/issues/11205)) by [Grub4K](https://github.com/Grub4K) +- **cleanup**: Miscellaneous: [67adeb7](https://github.com/yt-dlp/yt-dlp/commit/67adeb7bab00662ba55d473e405b301abb42fe61) by [bashonly](https://github.com/bashonly), [DTrombett](https://github.com/DTrombett), [grqz](https://github.com/grqz), [Grub4K](https://github.com/Grub4K), [KarboniteKream](https://github.com/KarboniteKream), [mikkovedru](https://github.com/mikkovedru), [seproDev](https://github.com/seproDev) +- **test**: [Allow running tests explicitly](https://github.com/yt-dlp/yt-dlp/commit/16eb28026a2ddf5608d0a628ef15949b8d3805a9) ([#11203](https://github.com/yt-dlp/yt-dlp/issues/11203)) by [Grub4K](https://github.com/Grub4K) + +### 2024.10.07 + +#### Core changes +- **cookies**: [Fix cookie load error 
handling](https://github.com/yt-dlp/yt-dlp/commit/e59c82a74cda5139eb3928c75b0bd45484dbe7f0) ([#11140](https://github.com/yt-dlp/yt-dlp/issues/11140)) by [Grub4K](https://github.com/Grub4K) + +#### Extractor changes +- **applepodcasts**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/6328e2e67a4e126e08af382e6a387073082d5c5f) ([#10903](https://github.com/yt-dlp/yt-dlp/issues/10903)) by [coreywright](https://github.com/coreywright) +- **cwtv**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/4b7bec66d8100978b82bb24110ed44e2a7749931) ([#11135](https://github.com/yt-dlp/yt-dlp/issues/11135)) by [kclauhk](https://github.com/kclauhk) +- **instagram** + - [Do not hardcode user-agent](https://github.com/yt-dlp/yt-dlp/commit/079a7bc334281d3c13d347770ae5f9f2b7da471a) ([#11155](https://github.com/yt-dlp/yt-dlp/issues/11155)) by [poyhen](https://github.com/poyhen) + - [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/cf85cba5d9496bd2689e1070005b4d1b4cd3dc6d) ([#11156](https://github.com/yt-dlp/yt-dlp/issues/11156)) by [tetra-fox](https://github.com/tetra-fox) +- **noodlemagazine**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/ccb23e1bac9768d1c70535beb744e668ed4a2720) ([#11144](https://github.com/yt-dlp/yt-dlp/issues/11144)) by [BallzCrasher](https://github.com/BallzCrasher) +- **patreon**: [Extract all m3u8 formats for locked posts](https://github.com/yt-dlp/yt-dlp/commit/f91645aceaf13926cf35be2c1dfef61b3aab97fb) ([#11138](https://github.com/yt-dlp/yt-dlp/issues/11138)) by [bashonly](https://github.com/bashonly) +- **youtube**: [Change default player clients to `ios,mweb`](https://github.com/yt-dlp/yt-dlp/commit/de2062753a188060d76f587e45becce61fe399f9) ([#11190](https://github.com/yt-dlp/yt-dlp/issues/11190)) by [seproDev](https://github.com/seproDev) + +#### Postprocessor changes +- **xattrmetadata**: [Try to write each attribute](https://github.com/yt-dlp/yt-dlp/commit/3a193346eeb27ac2959ff30c370adb899ec94732) 
([#11115](https://github.com/yt-dlp/yt-dlp/issues/11115)) by [eric321](https://github.com/eric321) + +#### Misc. changes +- **ci**: [Rerun failed tests](https://github.com/yt-dlp/yt-dlp/commit/b31b81d85f00601710d4fac590c3e4efb4133283) ([#11143](https://github.com/yt-dlp/yt-dlp/issues/11143)) by [Grub4K](https://github.com/Grub4K) +- **cleanup**: Miscellaneous: [1a176d8](https://github.com/yt-dlp/yt-dlp/commit/1a176d874e6772cd898ce507379ea388e96ee3f7) by [bashonly](https://github.com/bashonly) + +### 2024.09.27 + +#### Important changes +- **The minimum *recommended* Python version has been raised to 3.9** +Since Python 3.8 will reach end-of-life in October 2024, support for it will be dropped soon. [Read more](https://github.com/yt-dlp/yt-dlp/issues/10086) + +#### Core changes +- [Allow `none` arg to negate `--convert-subs` and `--convert-thumbnails`](https://github.com/yt-dlp/yt-dlp/commit/c08e0b20b5edd8957b8318716bc14e896d1b96f4) ([#11066](https://github.com/yt-dlp/yt-dlp/issues/11066)) by [kieraneglin](https://github.com/kieraneglin) +- [Fix format sorting bug with vp9.2 vcodec](https://github.com/yt-dlp/yt-dlp/commit/8f4ea14680c7865d8ffac10a9174205d1d84ada7) ([#10884](https://github.com/yt-dlp/yt-dlp/issues/10884)) by [rakslice](https://github.com/rakslice) +- [Raise minimum recommended Python version to 3.9](https://github.com/yt-dlp/yt-dlp/commit/cca534cd9e6850c70244f225a4a1895ef4bcdbec) ([#11098](https://github.com/yt-dlp/yt-dlp/issues/11098)) by [bashonly](https://github.com/bashonly) +- **cookies**: [Improve error message for Windows `--cookies-from-browser chrome` issue](https://github.com/yt-dlp/yt-dlp/commit/b397a64691421ace5df09457c2a764821a2dc6f2) ([#11090](https://github.com/yt-dlp/yt-dlp/issues/11090)) by [seproDev](https://github.com/seproDev) +- **utils**: `mimetype2ext`: [Recognize `aacp` as `aac`](https://github.com/yt-dlp/yt-dlp/commit/cc85596d5b59f0c14e9381b3675f619c1e12e597) ([#10860](https://github.com/yt-dlp/yt-dlp/issues/10860)) by 
[bashonly](https://github.com/bashonly) + +#### Extractor changes +- [Fix JW Player format parsing](https://github.com/yt-dlp/yt-dlp/commit/409f8e9e3b4bde81ef76fc563256f876d2ff8099) ([#10956](https://github.com/yt-dlp/yt-dlp/issues/10956)) by [seproDev](https://github.com/seproDev) +- [Handle decode errors when reading responses](https://github.com/yt-dlp/yt-dlp/commit/325001317d97f4545d66fac44c4ba772c6f45f22) ([#10868](https://github.com/yt-dlp/yt-dlp/issues/10868)) by [bashonly](https://github.com/bashonly) +- **abc.net.au**: iview, showseries: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/7f909046f4dc0fba472b4963145aef6e0d42491b) ([#11101](https://github.com/yt-dlp/yt-dlp/issues/11101)) by [bashonly](https://github.com/bashonly) +- **adn**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/cc88a54bb1ef285154775f8a6a413335ce4c71ce) ([#10749](https://github.com/yt-dlp/yt-dlp/issues/10749)) by [infanf](https://github.com/infanf) +- **asobistage**: [Support redirected URLs](https://github.com/yt-dlp/yt-dlp/commit/a7d3235c84dac57a127cbe0ff38f7f7c2fdd8fa0) ([#10768](https://github.com/yt-dlp/yt-dlp/issues/10768)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **bandcamp**: user: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/5d0176547f16a3642cd71627126e9dfc24981e20) ([#10328](https://github.com/yt-dlp/yt-dlp/issues/10328)) by [bashonly](https://github.com/bashonly), [quad](https://github.com/quad) +- **beacon**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/b4760c778d0c92c6e3f2bc8346cd72c8f08595ae) ([#9901](https://github.com/yt-dlp/yt-dlp/issues/9901)) by [Deukhoofd](https://github.com/Deukhoofd) +- **bilibili** + - [Fix chapters and subtitles extraction](https://github.com/yt-dlp/yt-dlp/commit/a2000bc85730c950351d78bb818493dc39dca3cb) ([#11099](https://github.com/yt-dlp/yt-dlp/issues/11099)) by [bashonly](https://github.com/bashonly) + - [Fix festival URL 
support](https://github.com/yt-dlp/yt-dlp/commit/b43bd864851f2862e26caa85461c5d825d49d463) ([#10740](https://github.com/yt-dlp/yt-dlp/issues/10740)) by [bashonly](https://github.com/bashonly), [grqz](https://github.com/grqz) +- **biliintl**: [Fix referer header](https://github.com/yt-dlp/yt-dlp/commit/a06bb586795ebab87a2356923acfc674d6f0e152) ([#11003](https://github.com/yt-dlp/yt-dlp/issues/11003)) by [Khaoklong51](https://github.com/Khaoklong51) +- **dropbox**: [Fix password-protected video support](https://github.com/yt-dlp/yt-dlp/commit/63da31b3b29af90062d8a72a905ffe4b5e499042) ([#10735](https://github.com/yt-dlp/yt-dlp/issues/10735)) by [ndyanx](https://github.com/ndyanx) +- **ertgr**: [Fix video extraction](https://github.com/yt-dlp/yt-dlp/commit/416686ed0cf792ec44ab059f3b229dd776077e14) ([#11091](https://github.com/yt-dlp/yt-dlp/issues/11091)) by [seproDev](https://github.com/seproDev) +- **eurosport**: [Support local URL variants](https://github.com/yt-dlp/yt-dlp/commit/f0bb28504c8c2b75ee3e5796aed50de2a7f90a1b) ([#10785](https://github.com/yt-dlp/yt-dlp/issues/10785)) by [seproDev](https://github.com/seproDev) +- **facebook** + - ads: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/d62fef7e07d454c0d2ba2d69fb96d691dba1ded0) ([#10704](https://github.com/yt-dlp/yt-dlp/issues/10704)) by [kclauhk](https://github.com/kclauhk) + - reel: [Improve metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/0e1b941c6b2caa688b0d3332e723d16dbafa4311) by [lengzuo](https://github.com/lengzuo) +- **germanupa**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/124f058b546d652a359c67025bb479789bfbef0b) ([#10538](https://github.com/yt-dlp/yt-dlp/issues/10538)) by [grqz](https://github.com/grqz) +- **hgtvde**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/a555389c9bb32e589e00b4664974423fb7b04dcd) ([#10992](https://github.com/yt-dlp/yt-dlp/issues/10992)) by [bashonly](https://github.com/bashonly), [rdamas](https://github.com/rdamas) +- **huya**: 
video: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/25c1cdaa2650563494d3bf00a38f72d0d9486bff) ([#10686](https://github.com/yt-dlp/yt-dlp/issues/10686)) by [hugepower](https://github.com/hugepower) +- **iprima**: [Fix zoom URL support](https://github.com/yt-dlp/yt-dlp/commit/4a27b8f092f7f7c10b7a334d3535c97c2af02f0a) ([#10959](https://github.com/yt-dlp/yt-dlp/issues/10959)) by [otovalek](https://github.com/otovalek) +- **khanacademy**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/0fba08485b6445b72b5b63ae23ca2a73fa5d967f) ([#10913](https://github.com/yt-dlp/yt-dlp/issues/10913)) by [seproDev](https://github.com/seproDev) +- **kick** + - clips: [Support new URL format](https://github.com/yt-dlp/yt-dlp/commit/0aa4426e9a35f7f8e184f1f2082b3b313c1448f7) ([#11107](https://github.com/yt-dlp/yt-dlp/issues/11107)) by [bashonly](https://github.com/bashonly) + - vod: [Support new URL format](https://github.com/yt-dlp/yt-dlp/commit/173d54c151b987409e3eb09552d8d89ed8fc50f7) ([#10988](https://github.com/yt-dlp/yt-dlp/issues/10988)) by [bashonly](https://github.com/bashonly), [grqz](https://github.com/grqz) +- **kika**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/e6f48ca80821939c1fd11ec2a0cdbf2fba9b258a) ([#5788](https://github.com/yt-dlp/yt-dlp/issues/5788)) by [1100101](https://github.com/1100101) +- **lnkgo**: [Remove extractor](https://github.com/yt-dlp/yt-dlp/commit/fa83d0b36bc43d30fe9241c1e923f4614864b758) ([#10904](https://github.com/yt-dlp/yt-dlp/issues/10904)) by [naglis](https://github.com/naglis) +- **loom**: [Fix m3u8 formats extraction](https://github.com/yt-dlp/yt-dlp/commit/7509d692b37a7ec6230ea75bfe1e44a8de5eefce) ([#10760](https://github.com/yt-dlp/yt-dlp/issues/10760)) by [kclauhk](https://github.com/kclauhk) +- **mediaklikk**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/e2b3634e299be9c16a247ece3b1858d83889c324) ([#11083](https://github.com/yt-dlp/yt-dlp/issues/11083)) by [szantnerb](https://github.com/szantnerb) +- 
**mojevideo**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/28b0ecba2af5b4919f198474b3d00a76ef322c31) ([#11019](https://github.com/yt-dlp/yt-dlp/issues/11019)) by [04-pasha-04](https://github.com/04-pasha-04), [pzhlkj6612](https://github.com/pzhlkj6612) +- **niconico**: [Fix m3u8 formats extraction](https://github.com/yt-dlp/yt-dlp/commit/eabb4680fdb09ba1f48d174a700a2e3b43f82add) ([#11103](https://github.com/yt-dlp/yt-dlp/issues/11103)) by [bashonly](https://github.com/bashonly) +- **nzz**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/4a9bc8c3630378bc29f0266126b503f6190c0430) ([#10461](https://github.com/yt-dlp/yt-dlp/issues/10461)) by [1-Byte](https://github.com/1-Byte) +- **patreoncampaign**: [Support API URLs](https://github.com/yt-dlp/yt-dlp/commit/232e6db30c474d1b387e405342f34173ceeaf832) ([#10734](https://github.com/yt-dlp/yt-dlp/issues/10734)) by [bashonly](https://github.com/bashonly), [hibes](https://github.com/hibes) +- **pinterest**: [Extend `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/c8c078fe28b0ffc15ef9646346c00c592fe71a78) ([#10867](https://github.com/yt-dlp/yt-dlp/issues/10867)) by [bashonly](https://github.com/bashonly), [sahilsinghss73](https://github.com/sahilsinghss73) +- **radiko**: [Extract unique `id` values](https://github.com/yt-dlp/yt-dlp/commit/c8d096c5ce111411fbdbe2abb8fed54f317a6182) ([#10726](https://github.com/yt-dlp/yt-dlp/issues/10726)) by [garret1317](https://github.com/garret1317) +- **rtp**: [Support more subpages](https://github.com/yt-dlp/yt-dlp/commit/d02df303d8e49390599db9f34482697e4d1cf5b2) ([#10787](https://github.com/yt-dlp/yt-dlp/issues/10787)) by [Demon000](https://github.com/Demon000) +- **rumblechannel**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/ad0b857f459a6d390fbf124183916218c52f223a) ([#11049](https://github.com/yt-dlp/yt-dlp/issues/11049)) by [tony-hn](https://github.com/tony-hn) +- **rutube**: [Support 
livestreams](https://github.com/yt-dlp/yt-dlp/commit/41be32e78c3845000dbac188ffb90ea3ea7c4dfa) ([#10844](https://github.com/yt-dlp/yt-dlp/issues/10844)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **samplefocus**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/46f4c80bc363ee8116c33d37f65202e6c3470954) ([#10947](https://github.com/yt-dlp/yt-dlp/issues/10947)) by [seproDev](https://github.com/seproDev) +- **screenrec**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/36f9e602ad55679764bc75a4f67f7562b1d6adcf) ([#10917](https://github.com/yt-dlp/yt-dlp/issues/10917)) by [naglis](https://github.com/naglis) +- **sen**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/41a241ca6ffb95b3d9aaf4f42106ca8cba9af1a6) ([#10952](https://github.com/yt-dlp/yt-dlp/issues/10952)) by [seproDev](https://github.com/seproDev) +- **servus**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/300c91274f7ea5b1b0528fc5ee11cf1a61d4079e) ([#10944](https://github.com/yt-dlp/yt-dlp/issues/10944)) by [seproDev](https://github.com/seproDev) +- **snapchatspotlight**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/b37417e4f934fd8909788b493d017777155b0ae5) ([#11030](https://github.com/yt-dlp/yt-dlp/issues/11030)) by [seproDev](https://github.com/seproDev) +- **svtpage**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/5a8a05aebb49693e78e1123015837ed5e961ff76) ([#11010](https://github.com/yt-dlp/yt-dlp/issues/11010)) by [diman8](https://github.com/diman8) +- **tenplay**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/d8d473002b654ab0e7b97ead869f58b4361eeae1) ([#10928](https://github.com/yt-dlp/yt-dlp/issues/10928)) by [aarubui](https://github.com/aarubui) +- **tiktok**: [Fix web formats extraction](https://github.com/yt-dlp/yt-dlp/commit/3ad0b7f422d547204df687b6d0b2d9110fff3990) ([#11074](https://github.com/yt-dlp/yt-dlp/issues/11074)) by [bashonly](https://github.com/bashonly) +- **twitter**: spaces: [Support video 
spaces](https://github.com/yt-dlp/yt-dlp/commit/bef1d4d6fc9493fda7f75e2289c07c507d10092f) ([#10789](https://github.com/yt-dlp/yt-dlp/issues/10789)) by [bashonly](https://github.com/bashonly) +- **vidflex**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/e978c312d6550a6ae4c9df18001afb1b420cb72f) ([#10002](https://github.com/yt-dlp/yt-dlp/issues/10002)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **vimeo** + - [Always try to extract original format](https://github.com/yt-dlp/yt-dlp/commit/4115c24d157c5b5f63089d75c4e0f51d1f8b4489) ([#10721](https://github.com/yt-dlp/yt-dlp/issues/10721)) by [bashonly](https://github.com/bashonly) (With fixes in [e8e6a98](https://github.com/yt-dlp/yt-dlp/commit/e8e6a982a1b659eed434d225d7922f632bac6568) by [seproDev](https://github.com/seproDev)) + - [Fix HLS audio format sorting](https://github.com/yt-dlp/yt-dlp/commit/a1b4ac2b8ed8e6eaa56044d439f1e0d00c2ba218) ([#11082](https://github.com/yt-dlp/yt-dlp/issues/11082)) by [fireattack](https://github.com/fireattack) +- **watchespn**: [Improve auth support](https://github.com/yt-dlp/yt-dlp/commit/7adff8caf152dcf96d03aff69ed8545c0a63567c) ([#10910](https://github.com/yt-dlp/yt-dlp/issues/10910)) by [ischmidt20](https://github.com/ischmidt20) +- **wistia**: [Support password-protected videos](https://github.com/yt-dlp/yt-dlp/commit/9f5c9a90898c5a1e672922d9cd799716c73cee34) ([#11100](https://github.com/yt-dlp/yt-dlp/issues/11100)) by [bashonly](https://github.com/bashonly) +- **ximalaya**: [Add VIP support](https://github.com/yt-dlp/yt-dlp/commit/3dfd720d098b4d49d69cfc77e6376f22bcd90934) ([#10832](https://github.com/yt-dlp/yt-dlp/issues/10832)) by [seproDev](https://github.com/seproDev), [xingchensong](https://github.com/xingchensong) +- **xinpianchang**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/3aa0156e05662923d130ddbc1c82596e38c01a00) ([#10950](https://github.com/yt-dlp/yt-dlp/issues/10950)) by [seproDev](https://github.com/seproDev) +- **yleareena**: [Support 
podcasts](https://github.com/yt-dlp/yt-dlp/commit/48d629d461e05b1b19f5e53dc959bb9ebe95da42) ([#11104](https://github.com/yt-dlp/yt-dlp/issues/11104)) by [bashonly](https://github.com/bashonly) +- **youtube** + - [Add `po_token`, `visitor_data`, `data_sync_id` extractor args](https://github.com/yt-dlp/yt-dlp/commit/3a3bd00037e9908e87da4fa9f2ad772aa34dc60e) ([#10648](https://github.com/yt-dlp/yt-dlp/issues/10648)) by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz), [seproDev](https://github.com/seproDev) (With fixes in [fa2be9a](https://github.com/yt-dlp/yt-dlp/commit/fa2be9a7c63babede07480151363e54eee5702bd) by [bashonly](https://github.com/bashonly)) + - [Support excluding `player_client`s in extractor-arg](https://github.com/yt-dlp/yt-dlp/commit/49f3741a820ed142f6866317c2e7d247b130960e) ([#10710](https://github.com/yt-dlp/yt-dlp/issues/10710)) by [bashonly](https://github.com/bashonly) + - clip: [Prioritize `https` formats](https://github.com/yt-dlp/yt-dlp/commit/1d84b780cf33a1d84756825ac23f990a905703df) ([#11102](https://github.com/yt-dlp/yt-dlp/issues/11102)) by [bashonly](https://github.com/bashonly) + - tab: [Fix shorts tab extraction](https://github.com/yt-dlp/yt-dlp/commit/9431777b4c37129a6093080c77ca59960afbb9d7) ([#10938](https://github.com/yt-dlp/yt-dlp/issues/10938)) by [seproDev](https://github.com/seproDev) + +#### Networking changes +- [Fix handler not being added to RequestError](https://github.com/yt-dlp/yt-dlp/commit/d1c4d88b2d912e8da5e76db455562ca63b1af690) ([#10955](https://github.com/yt-dlp/yt-dlp/issues/10955)) by [coletdjnz](https://github.com/coletdjnz) +- [Pin `curl-cffi` version to < 0.7.2](https://github.com/yt-dlp/yt-dlp/commit/5bb1aa04dafce13ba9de707ea53169fab58b5207) ([#11092](https://github.com/yt-dlp/yt-dlp/issues/11092)) by [bashonly](https://github.com/bashonly) +- **Request Handler**: websockets: [Upgrade websockets to 
13.0](https://github.com/yt-dlp/yt-dlp/commit/6f9e6537434562d513d0c9b68ced8a61ade94a64) ([#10815](https://github.com/yt-dlp/yt-dlp/issues/10815)) by [coletdjnz](https://github.com/coletdjnz) + +#### Misc. changes +- **build** + - [Bump PyInstaller version pin to `>=6.10.0`](https://github.com/yt-dlp/yt-dlp/commit/fb8b7f226d251e521a89b23c415e249e5b788e5c) ([#10709](https://github.com/yt-dlp/yt-dlp/issues/10709)) by [bashonly](https://github.com/bashonly) + - [Pin `delocate` version for `macos`](https://github.com/yt-dlp/yt-dlp/commit/7e41628ff523b3fe373b0981a5db441358980dab) ([#10901](https://github.com/yt-dlp/yt-dlp/issues/10901)) by [bashonly](https://github.com/bashonly) +- **ci** + - [Add comment sanitization workflow](https://github.com/yt-dlp/yt-dlp/commit/b6200bdcf3a9415ae36859188f9a57e3e461c696) ([#10915](https://github.com/yt-dlp/yt-dlp/issues/10915)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) + - [Add issue tracker anti-spam protection](https://github.com/yt-dlp/yt-dlp/commit/ad9a8115aa29a1a95c961b16fcf129a228d98f50) ([#10861](https://github.com/yt-dlp/yt-dlp/issues/10861)) by [bashonly](https://github.com/bashonly) +- **cleanup**: Miscellaneous: [c6387ab](https://github.com/yt-dlp/yt-dlp/commit/c6387abc1af9842bb0541288a5610abba9b1ab51) by [bashonly](https://github.com/bashonly), [Codenade](https://github.com/Codenade), [coletdjnz](https://github.com/coletdjnz), [grqz](https://github.com/grqz), [Grub4K](https://github.com/Grub4K), [pzhlkj6612](https://github.com/pzhlkj6612), [seproDev](https://github.com/seproDev) + +### 2024.08.06 + +#### Core changes +- **jsinterp**: [Improve `slice` implementation](https://github.com/yt-dlp/yt-dlp/commit/bb8bf1db993f59752d20b73b861bd55e40cf0e31) ([#10664](https://github.com/yt-dlp/yt-dlp/issues/10664)) by [seproDev](https://github.com/seproDev) + +#### Extractor changes +- **discoveryplusitaly**: [Support sport and olympics 
URLs](https://github.com/yt-dlp/yt-dlp/commit/e7d73bc4531ee3f91a46b15e218dcc1fbeb6226c) ([#10655](https://github.com/yt-dlp/yt-dlp/issues/10655)) by [bashonly](https://github.com/bashonly) +- **gem.cbc.ca**: live: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/fc5eecfa31c9571b6031cc3968aaa0394be55d7a) ([#10565](https://github.com/yt-dlp/yt-dlp/issues/10565)) by [bashonly](https://github.com/bashonly), [scribblemaniac](https://github.com/scribblemaniac) +- **niconico**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/4d9231208332d4c32364b8cd814bff8b20232cae) ([#10677](https://github.com/yt-dlp/yt-dlp/issues/10677)) by [bashonly](https://github.com/bashonly) +- **olympics**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/919540a9644e55deb78cdd6751757ec8fdaf76f4) ([#10625](https://github.com/yt-dlp/yt-dlp/issues/10625)) by [bashonly](https://github.com/bashonly) +- **youku**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/0088c6de23d832b117061a33e984dc452d992e9c) ([#10626](https://github.com/yt-dlp/yt-dlp/issues/10626)) by [hugepower](https://github.com/hugepower) +- **youtube** + - [Change default player clients to `ios,web_creator`](https://github.com/yt-dlp/yt-dlp/commit/406f4c2e47502fffc1b0c210b4ee6487c89a44cb) ([#10674](https://github.com/yt-dlp/yt-dlp/issues/10674)) by [bashonly](https://github.com/bashonly) + - [Fix `n` function name extraction for player `b12cc44b`](https://github.com/yt-dlp/yt-dlp/commit/c86891eb9434b4d7eec426d38c0c625b5e13cb2f) ([#10668](https://github.com/yt-dlp/yt-dlp/issues/10668)) by [seproDev](https://github.com/seproDev) + +### 2024.08.01 + +#### Core changes +- **utils**: `unified_timestamp`: [Recognize Sunday](https://github.com/yt-dlp/yt-dlp/commit/6daf2c27c0464fba98337be30de0b66d520d0db1) ([#10589](https://github.com/yt-dlp/yt-dlp/issues/10589)) by [bashonly](https://github.com/bashonly) + +#### Extractor changes +- **abematv**: [Fix availability 
extraction](https://github.com/yt-dlp/yt-dlp/commit/ef36d517f9b05785d61abca7691d9ab7d63cc75c) ([#10569](https://github.com/yt-dlp/yt-dlp/issues/10569)) by [middlingphys](https://github.com/middlingphys) +- **cbc.ca**: player: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/94a1c5e642e468cebeb51f74c6c220434cb47d96) ([#10302](https://github.com/yt-dlp/yt-dlp/issues/10302)) by [bashonly](https://github.com/bashonly), [trainman261](https://github.com/trainman261) +- **discoveryplus**: [Support olympics URLs](https://github.com/yt-dlp/yt-dlp/commit/0b7728618417e1aa382722a4d29b916b594d4459) ([#10566](https://github.com/yt-dlp/yt-dlp/issues/10566)) by [bashonly](https://github.com/bashonly) +- **kick**: clips: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/bb3936ae2b3ce96d0b53f9e17cad1082058f032b) ([#10572](https://github.com/yt-dlp/yt-dlp/issues/10572)) by [luvyana](https://github.com/luvyana) +- **learningonscreen**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/fe15d3178e242803ae7a934b90137f13598eba2e) ([#10590](https://github.com/yt-dlp/yt-dlp/issues/10590)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) +- **mediaklikk**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/7e3e4779ad13e4511c9ba3869879e53f0267bd7a) ([#10605](https://github.com/yt-dlp/yt-dlp/issues/10605)) by [szantnerb](https://github.com/szantnerb) +- **mlbtv**: [Fix makeup game extraction](https://github.com/yt-dlp/yt-dlp/commit/4b69e1b53ea21e631cd5dd68ff531e2f1671ec17) ([#10607](https://github.com/yt-dlp/yt-dlp/issues/10607)) by [bashonly](https://github.com/bashonly) +- **olympics**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/2f1ddfe12a2c174bc777264c5c8ffe7ca0922d94) ([#10604](https://github.com/yt-dlp/yt-dlp/issues/10604)) by [bashonly](https://github.com/bashonly) +- **tva**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/28d485714fef88937c82635438afba5db81f9089) 
([#10567](https://github.com/yt-dlp/yt-dlp/issues/10567)) by [bashonly](https://github.com/bashonly) +- **tver**: [Support olympic URLs](https://github.com/yt-dlp/yt-dlp/commit/5260696b1cba77161828941fdb38f09f14ac6c60) ([#10600](https://github.com/yt-dlp/yt-dlp/issues/10600)) by [vvto33](https://github.com/vvto33) +- **vimeo**: review: [Fix password-protected video extraction](https://github.com/yt-dlp/yt-dlp/commit/2b6df93a243bdfb9d6bb5c1e18020625cd02d465) ([#10598](https://github.com/yt-dlp/yt-dlp/issues/10598)) by [bashonly](https://github.com/bashonly) +- **youtube** + - [Change default player clients to `ios,tv`](https://github.com/yt-dlp/yt-dlp/commit/efb42763dec23ccf6a2e3bac3afbfefce8efd012) ([#10457](https://github.com/yt-dlp/yt-dlp/issues/10457)) by [seproDev](https://github.com/seproDev) + - [Fix `n` function name extraction for player `20dfca59`](https://github.com/yt-dlp/yt-dlp/commit/011b4a04db2a636c3ef0a0ad4e2d3ae482c9fd76) ([#10611](https://github.com/yt-dlp/yt-dlp/issues/10611)) by [bashonly](https://github.com/bashonly) + - [Fix age-verification workaround](https://github.com/yt-dlp/yt-dlp/commit/d19fcb934269465fd707e68a87f735ec6983e93d) ([#10610](https://github.com/yt-dlp/yt-dlp/issues/10610)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) + - [Player client maintenance](https://github.com/yt-dlp/yt-dlp/commit/0e539617a41913c7da1edd74fb6543c10ad727b3) ([#10573](https://github.com/yt-dlp/yt-dlp/issues/10573)) by [bashonly](https://github.com/bashonly) + +#### Misc. 
changes +- **cleanup**: Miscellaneous: [ffd7781](https://github.com/yt-dlp/yt-dlp/commit/ffd7781d6588926f820b44a34b9e6e3068fb9f97) by [bashonly](https://github.com/bashonly) + +### 2024.07.25 + +#### Extractor changes +- **abematv**: [Adapt key retrieval to request handler framework](https://github.com/yt-dlp/yt-dlp/commit/a3bab4752a2b3d56e5a59b4e0411bb8f695c010b) ([#10491](https://github.com/yt-dlp/yt-dlp/issues/10491)) by [bashonly](https://github.com/bashonly) +- **facebook**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/1a34a802f44a1dab8f642c79c3cc810e21541d3b) ([#10531](https://github.com/yt-dlp/yt-dlp/issues/10531)) by [bashonly](https://github.com/bashonly) +- **mlbtv**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/f0993391e6052ec8f7aacc286609564f226943b9) ([#10515](https://github.com/yt-dlp/yt-dlp/issues/10515)) by [bashonly](https://github.com/bashonly) +- **tiktok**: [Fix and deprioritize JSON subtitles](https://github.com/yt-dlp/yt-dlp/commit/2f97779f335ac069ecccd9c7bf81abf4a83cfe7a) ([#10516](https://github.com/yt-dlp/yt-dlp/issues/10516)) by [bashonly](https://github.com/bashonly) +- **vimeo**: [Fix chapters extraction](https://github.com/yt-dlp/yt-dlp/commit/a0a1bc3d8d8e3bb9a48a06e835815a0460e90e77) ([#10544](https://github.com/yt-dlp/yt-dlp/issues/10544)) by [bashonly](https://github.com/bashonly) +- **youtube**: [Fix `n` function name extraction for player `3400486c`](https://github.com/yt-dlp/yt-dlp/commit/713b4cd18f00556771af8cfdd9cea6cc1a09e948) ([#10542](https://github.com/yt-dlp/yt-dlp/issues/10542)) by [bashonly](https://github.com/bashonly) + +#### Misc. 
changes +- **build**: [Pin `setuptools` version](https://github.com/yt-dlp/yt-dlp/commit/e046db8a116b1c320d4785daadd48ea0b22a3987) ([#10493](https://github.com/yt-dlp/yt-dlp/issues/10493)) by [bashonly](https://github.com/bashonly) + +### 2024.07.16 + +#### Core changes +- [Fix `noprogress` if `test=True` with `--quiet` and `--verbose`](https://github.com/yt-dlp/yt-dlp/commit/66ce3d76d87af3f81cc9dfec4be4704016cb1cdb) ([#10454](https://github.com/yt-dlp/yt-dlp/issues/10454)) by [Grub4K](https://github.com/Grub4K) +- [Support `auto-tty` and `no_color-tty` for `--color`](https://github.com/yt-dlp/yt-dlp/commit/d9cbced493cae2008508d94a2db5dd98be7c01fc) ([#10453](https://github.com/yt-dlp/yt-dlp/issues/10453)) by [Grub4K](https://github.com/Grub4K) +- **update**: [Fix network error handling](https://github.com/yt-dlp/yt-dlp/commit/ed1b9ed93dd90d2cc960c0d8eaa9d919db224203) ([#10486](https://github.com/yt-dlp/yt-dlp/issues/10486)) by [bashonly](https://github.com/bashonly) +- **utils**: `parse_codecs`: [Fix parsing of mixed case codec strings](https://github.com/yt-dlp/yt-dlp/commit/cc0070f6496e501d77352bad475fb02d6a86846a) by [bashonly](https://github.com/bashonly) + +#### Extractor changes +- **adn**: [Adjust for .com domain change](https://github.com/yt-dlp/yt-dlp/commit/959b7a379b8e5da059d110a63339c964b6265736) ([#10399](https://github.com/yt-dlp/yt-dlp/issues/10399)) by [infanf](https://github.com/infanf) +- **afreecatv**: [Fix login and use `legacy_ssl`](https://github.com/yt-dlp/yt-dlp/commit/4cd41469243624d90b7a2009b95cbe0609343efe) ([#10440](https://github.com/yt-dlp/yt-dlp/issues/10440)) by [bashonly](https://github.com/bashonly) +- **box**: [Support enterprise URLs](https://github.com/yt-dlp/yt-dlp/commit/705f5b84dec75cc7af97f42fd1530e8062735970) ([#10419](https://github.com/yt-dlp/yt-dlp/issues/10419)) by [seproDev](https://github.com/seproDev) +- **digitalconcerthall**: [Extract HEVC and FLAC 
formats](https://github.com/yt-dlp/yt-dlp/commit/e62fa6b0e0186f8c5666c2c5ab64cf191abdafc1) ([#10470](https://github.com/yt-dlp/yt-dlp/issues/10470)) by [bashonly](https://github.com/bashonly) +- **dplay**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/39e6c4cb44b9292e89ac0afec3cd0afc2ae8775f) ([#10471](https://github.com/yt-dlp/yt-dlp/issues/10471)) by [bashonly](https://github.com/bashonly) +- **epidemicsound**: [Support sound effects URLs](https://github.com/yt-dlp/yt-dlp/commit/8531d2b03bac9cc746f2ee8098aaf8f115505f5b) ([#10436](https://github.com/yt-dlp/yt-dlp/issues/10436)) by [iancmy](https://github.com/iancmy) +- **generic**: [Fix direct video link extensions](https://github.com/yt-dlp/yt-dlp/commit/b9afb99e7c34d0eb15ddc6689cd7d20eebfda68e) ([#10468](https://github.com/yt-dlp/yt-dlp/issues/10468)) by [bashonly](https://github.com/bashonly) +- **picarto**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/bacd18b7df08b4995644fd12cee1f8c8e8636bc7) ([#10414](https://github.com/yt-dlp/yt-dlp/issues/10414)) by [Frankgoji](https://github.com/Frankgoji) +- **soundcloud**: permalink, user: [Extract tracks only](https://github.com/yt-dlp/yt-dlp/commit/22870b81bad97dfa6307a7add44753b2dffc76a9) ([#10463](https://github.com/yt-dlp/yt-dlp/issues/10463)) by [DunnesH](https://github.com/DunnesH) +- **tiktok**: live: [Fix room ID extraction](https://github.com/yt-dlp/yt-dlp/commit/d2189d3d36987ebeac426fd70a60a5fe86325a2b) ([#10408](https://github.com/yt-dlp/yt-dlp/issues/10408)) by [mokrueger](https://github.com/mokrueger) +- **tv5monde**: [Support browser impersonation](https://github.com/yt-dlp/yt-dlp/commit/9b95a6765a5f6325af99c4aca961587f0c426e8c) ([#10417](https://github.com/yt-dlp/yt-dlp/issues/10417)) by [bashonly](https://github.com/bashonly) (With fixes in [cc1a309](https://github.com/yt-dlp/yt-dlp/commit/cc1a3098c00995c6aebc2a16bd1050a66bad64db)) +- **youtube** + - [Avoid poToken experiment player 
responses](https://github.com/yt-dlp/yt-dlp/commit/8b8b442cb005a8d85315f301615f83fb736b967a) ([#10456](https://github.com/yt-dlp/yt-dlp/issues/10456)) by [seproDev](https://github.com/seproDev) (With fixes in [16da8ef](https://github.com/yt-dlp/yt-dlp/commit/16da8ef9937ff76632dfef02e5062c5ba99c8ea2)) + - [Invalidate nsig cache from < 2024.07.09](https://github.com/yt-dlp/yt-dlp/commit/04e17ba20a139f1b3e30ec4bafa3fba26888f0b3) ([#10401](https://github.com/yt-dlp/yt-dlp/issues/10401)) by [bashonly](https://github.com/bashonly) + - [Reduce android client priority](https://github.com/yt-dlp/yt-dlp/commit/b85eef0a615a01304f88a3847309c667e09a20df) ([#10467](https://github.com/yt-dlp/yt-dlp/issues/10467)) by [seproDev](https://github.com/seproDev) + +#### Networking changes +- [Add `legacy_ssl` request extension](https://github.com/yt-dlp/yt-dlp/commit/150ecc45d9cacc919550c13b04fd998ac5103a6b) ([#10448](https://github.com/yt-dlp/yt-dlp/issues/10448)) by [coletdjnz](https://github.com/coletdjnz) +- **Request Handler**: curl_cffi: [Support `curl_cffi` 0.7.X](https://github.com/yt-dlp/yt-dlp/commit/42bfca00a6b460fc053514cdd7ac6f5b5daddf0c) by [coletdjnz](https://github.com/coletdjnz) + +#### Misc. 
changes +- **build** + - [Include `curl_cffi` in `yt-dlp_linux`](https://github.com/yt-dlp/yt-dlp/commit/4521f30d1479315cd5c3bf4abdad19391952df98) by [bashonly](https://github.com/bashonly) + - [Pin `curl-cffi` to 0.5.10 for Windows](https://github.com/yt-dlp/yt-dlp/commit/ac30941ae682f71eab010877c9a977736a61d3cf) by [bashonly](https://github.com/bashonly) +- **cleanup**: Miscellaneous: [89a161e](https://github.com/yt-dlp/yt-dlp/commit/89a161e8c62569a662deda1c948664152efcb6b4) by [bashonly](https://github.com/bashonly) + +### 2024.07.09 + +#### Core changes +- [Do not alter default format selection when simulated](https://github.com/yt-dlp/yt-dlp/commit/0b570f2a90ce2363ba06089217514d644e7be2e0) ([#9862](https://github.com/yt-dlp/yt-dlp/issues/9862)) by [seproDev](https://github.com/seproDev) + +#### Extractor changes +- **youtube**: [Remove broken `n` function extraction fallback](https://github.com/yt-dlp/yt-dlp/commit/7ead7332af69422cee931aec3faa277288e9e212) ([#10396](https://github.com/yt-dlp/yt-dlp/issues/10396)) by [pukkandan](https://github.com/pukkandan), [seproDev](https://github.com/seproDev) + +### 2024.07.08 + +#### Core changes +- **jsinterp**: [Implement `Function.prototype` resolving for `call` and `apply`](https://github.com/yt-dlp/yt-dlp/commit/6c056ea7aeb03660281653a9668547f2548f194f) ([#10392](https://github.com/yt-dlp/yt-dlp/issues/10392)) by [Grub4K](https://github.com/Grub4K) + +#### Extractor changes +- **soundcloud**: [Fix rate-limit handling](https://github.com/yt-dlp/yt-dlp/commit/4b50b292cc98534fb8c7cdf0ae5cb85862f7ebfc) ([#10389](https://github.com/yt-dlp/yt-dlp/issues/10389)) by [bashonly](https://github.com/bashonly) +- **youtube**: [Fix JS `n` function name extraction](https://github.com/yt-dlp/yt-dlp/commit/297b0a379282a15c80d82d51f3757c961db2dae1) ([#10390](https://github.com/yt-dlp/yt-dlp/issues/10390)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev) + +### 2024.07.07 + +#### Important changes +- 
Security: [[ie/douyutv] Do not use dangerous javascript source/URL](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-3v33-3wmw-3785) + - A dependency on potentially malicious third-party JavaScript code has been removed from the Douyu extractors + +#### Core changes +- [Address gaps in allowed extensions](https://github.com/yt-dlp/yt-dlp/commit/2469119490d7e0397ebbf5c5ae327316f955eef2) ([#10362](https://github.com/yt-dlp/yt-dlp/issues/10362)) by [bashonly](https://github.com/bashonly) +- [Fix `--ignore-no-formats-error`](https://github.com/yt-dlp/yt-dlp/commit/cc767e9490056efaaa11c186b0d032e4b4969180) ([#10345](https://github.com/yt-dlp/yt-dlp/issues/10345)) by [Grub4K](https://github.com/Grub4K) + +#### Extractor changes +- **abematv**: [Extract availability](https://github.com/yt-dlp/yt-dlp/commit/2a1a1b8e67e864289ac7ba5d05ec63dbb19a639f) ([#10348](https://github.com/yt-dlp/yt-dlp/issues/10348)) by [middlingphys](https://github.com/middlingphys) +- **chzzk**: [Extract with API v3](https://github.com/yt-dlp/yt-dlp/commit/4862a29854d4044120e3f97b52199711ad04bee1) ([#10363](https://github.com/yt-dlp/yt-dlp/issues/10363)) by [hui1601](https://github.com/hui1601) +- **douyutv**: [Do not use dangerous javascript source/URL](https://github.com/yt-dlp/yt-dlp/commit/6075a029dba70a89675ae1250e7cdfd91f0eba41) ([#10347](https://github.com/yt-dlp/yt-dlp/issues/10347)) by [LeSuisse](https://github.com/LeSuisse) +- **jiosaavn**: playlist: [Support featured playlists](https://github.com/yt-dlp/yt-dlp/commit/f0f867f008a1728f5f6ac1224b9e014b5d27f817) ([#10382](https://github.com/yt-dlp/yt-dlp/issues/10382)) by [harbhim](https://github.com/harbhim) +- **vidyard**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/00766ece0c5c7a80781a4ff677198c5fb69d9dc0) ([#10155](https://github.com/yt-dlp/yt-dlp/issues/10155)) by [exterrestris](https://github.com/exterrestris) +- **vimeo**: [Fix password-protected video 
extraction](https://github.com/yt-dlp/yt-dlp/commit/c1c9bb4adb42d0d93a2fb5d93a7de0a87b6ba884) ([#10341](https://github.com/yt-dlp/yt-dlp/issues/10341)) by [bashonly](https://github.com/bashonly) +- **vtv**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/987a1f94c24275f2b0cd82e719956687415dd732) ([#10173](https://github.com/yt-dlp/yt-dlp/issues/10173)) by [DinhHuy2010](https://github.com/DinhHuy2010) +- **yle_areena** + - [Fix metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/4cdc976bd861b5835601ae402bef543eacd88f3d) ([#10380](https://github.com/yt-dlp/yt-dlp/issues/10380)) by [seproDev](https://github.com/seproDev) + - [Fix subtitle extraction](https://github.com/yt-dlp/yt-dlp/commit/0d174e8bed32081eb38ef7f5d1a1282ae154f517) ([#10379](https://github.com/yt-dlp/yt-dlp/issues/10379)) by [Grub4K](https://github.com/Grub4K) + +#### Misc. changes +- **cleanup**: Miscellaneous: [b337d29](https://github.com/yt-dlp/yt-dlp/commit/b337d2989ce0614651d363383f6f743d977248ef) by [bashonly](https://github.com/bashonly) + +### 2024.07.02 + +#### Core changes +- [Fix `--compat-opt allow-unsafe-ext`](https://github.com/yt-dlp/yt-dlp/commit/773bbb181506856ffda95496ab60c1c9603f1f71) ([#10336](https://github.com/yt-dlp/yt-dlp/issues/10336)) by [bashonly](https://github.com/bashonly), [rdamas](https://github.com/rdamas) + +#### Extractor changes +- **banbye**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/7509791385ba88cb7ec0ab17e826681f4af4b66e) ([#10332](https://github.com/yt-dlp/yt-dlp/issues/10332)) by [PatrykMis](https://github.com/PatrykMis), [seproDev](https://github.com/seproDev) +- **murrtube**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/6403530e2dfe259a87afe444708c4f3024cc45b8) ([#9249](https://github.com/yt-dlp/yt-dlp/issues/9249)) by [DrakoCpp](https://github.com/DrakoCpp) +- **zaiko**: [Support JWT video URLs](https://github.com/yt-dlp/yt-dlp/commit/7799e518956387bb3c1064c9beae26eab8d5044a) 
([#10130](https://github.com/yt-dlp/yt-dlp/issues/10130)) by [pzhlkj6612](https://github.com/pzhlkj6612) + +#### Postprocessor changes +- **embedthumbnail**: [Fix embedding with mutagen](https://github.com/yt-dlp/yt-dlp/commit/d502f4c6d95b74896f40070d07229997f0850f31) ([#10337](https://github.com/yt-dlp/yt-dlp/issues/10337)) by [bashonly](https://github.com/bashonly) + +#### Misc. changes +- **cleanup**: Miscellaneous: [93d33cb](https://github.com/yt-dlp/yt-dlp/commit/93d33cb29af9e2e84369ac43589d50ce8e0160ef) by [bashonly](https://github.com/bashonly) + +### 2024.07.01 + +#### Important changes +- Security: [[CVE-2024-38519](https://nvd.nist.gov/vuln/detail/CVE-2024-38519)] [Properly sanitize file-extension to prevent file system modification and RCE](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-79w7-vh3h-8g4j) + - Unsafe extensions are now blocked from being downloaded + +#### Core changes +- [Add `playlist_channel` and `playlist_channel_id` fields](https://github.com/yt-dlp/yt-dlp/commit/55e3e6fd21e741ec5ae3d8624de5e5ea345810eb) ([#10266](https://github.com/yt-dlp/yt-dlp/issues/10266)) by [bashonly](https://github.com/bashonly) +- [Disallow unsafe extensions (CVE-2024-38519)](https://github.com/yt-dlp/yt-dlp/commit/5ce582448ececb8d9c30c8c31f58330090ced03a) by [Grub4K](https://github.com/Grub4K) +- **cookies**: [Fix `--cookies-from-browser` DE detection on Linux](https://github.com/yt-dlp/yt-dlp/commit/a8520244b8642880e4d35925e9e49eff94d548de) ([#10237](https://github.com/yt-dlp/yt-dlp/issues/10237)) by [peisenwang](https://github.com/peisenwang) + +#### Extractor changes +- **afreecatv** + - [Support browser impersonation](https://github.com/yt-dlp/yt-dlp/commit/e8352ad6599de7b5371dc39a1a1edc7890aaedb4) ([#10174](https://github.com/yt-dlp/yt-dlp/issues/10174)) by [hui1601](https://github.com/hui1601) + - catchstory: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/054a3ba7d1293f9fbe21800d62d1e5ddcbded238) 
([#10235](https://github.com/yt-dlp/yt-dlp/issues/10235)) by [hui1601](https://github.com/hui1601) +- **bilibili**: [Support legacy formats](https://github.com/yt-dlp/yt-dlp/commit/1d6ab17d0752ee9cf19e3e63c7dec7b600d3f228) ([#9117](https://github.com/yt-dlp/yt-dlp/issues/9117)) by [c-basalt](https://github.com/c-basalt), [GD-Slime](https://github.com/GD-Slime) +- **bitchute**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/5b1a2aa978d0074cee278e7659f32f52ecc4ab53) ([#10301](https://github.com/yt-dlp/yt-dlp/issues/10301)) by [seproDev](https://github.com/seproDev) +- **brightcove**: [Upgrade requests to HTTPS](https://github.com/yt-dlp/yt-dlp/commit/90c3721a322756bb7f4ca10ceb73744500bee37e) ([#10202](https://github.com/yt-dlp/yt-dlp/issues/10202)) by [bashonly](https://github.com/bashonly) +- **cloudflarestream**: [Fix `_VALID_URL` and embed extraction](https://github.com/yt-dlp/yt-dlp/commit/7aa322c02cec54eb77154a89da7e400194f0bd03) ([#10215](https://github.com/yt-dlp/yt-dlp/issues/10215)) by [bashonly](https://github.com/bashonly) +- **cloudycdn**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/b758877afa225747fba81c8a580e27583a231734) ([#10271](https://github.com/yt-dlp/yt-dlp/issues/10271)) by [Caesim404](https://github.com/Caesim404) +- **digitalconcerthall**: [Rework extractor](https://github.com/yt-dlp/yt-dlp/commit/2a4f2e82dbeeb0c9130883c83dac689d5260c871) ([#10152](https://github.com/yt-dlp/yt-dlp/issues/10152)) by [seproDev](https://github.com/seproDev), [tippfehlr](https://github.com/tippfehlr) +- **facebook**: reel: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/8ca1d57ed08d00efa117820a5a82f763b20e2d1d) ([#10232](https://github.com/yt-dlp/yt-dlp/issues/10232)) by [bashonly](https://github.com/bashonly) +- **francetv** + - [Detect and raise errors for DRM](https://github.com/yt-dlp/yt-dlp/commit/3690c2f59827c79a1bbe388a7c1ae75db7477db2) ([#10165](https://github.com/yt-dlp/yt-dlp/issues/10165)) by 
[bashonly](https://github.com/bashonly) + - [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/081708d6074dfbb907e25af61ba530bba0d4b31d) ([#10177](https://github.com/yt-dlp/yt-dlp/issues/10177)) by [bashonly](https://github.com/bashonly) +- **generic**: [Add `key_query` extractor-arg](https://github.com/yt-dlp/yt-dlp/commit/5dbac313ae4e3e8521dfe2e1a6a048a98ff4b4fe) by [bashonly](https://github.com/bashonly) +- **graspop**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/1d369b4096d79233e0ac2c93762746a64d7a69c8) ([#10268](https://github.com/yt-dlp/yt-dlp/issues/10268)) by [Niluge-KiWi](https://github.com/Niluge-KiWi) +- **jiocinema**: series: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/61714f46956f61612032bba857aed7ad1387eccd) ([#10139](https://github.com/yt-dlp/yt-dlp/issues/10139)) by [varunchopra](https://github.com/varunchopra) +- **khanacademy**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/4093eb1fcc29a0e2aea9adfcba479787d9ae0c0c) ([#9136](https://github.com/yt-dlp/yt-dlp/issues/9136)) by [c-basalt](https://github.com/c-basalt) +- **laracasts**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/b8da8a98f897599095d4ef1644b8c5fd39921118) ([#10055](https://github.com/yt-dlp/yt-dlp/issues/10055)) by [ASertacAkkaya](https://github.com/ASertacAkkaya), [seproDev](https://github.com/seproDev) +- **matchtv**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/f3411af12e209bc5624e1ac31271b8aabe2d3c90) ([#10190](https://github.com/yt-dlp/yt-dlp/issues/10190)) by [megumintyan](https://github.com/megumintyan) +- **mediasite**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/0953209a857c51648aee89d205c086b0e1dd3864) ([#10273](https://github.com/yt-dlp/yt-dlp/issues/10273)) by [bashonly](https://github.com/bashonly) +- **microsoftembed**: [Add extractors for dev materials](https://github.com/yt-dlp/yt-dlp/commit/9200bc70c94546b2191bb6fbfc9cea98a919cc56) ([#9177](https://github.com/yt-dlp/yt-dlp/issues/9177)) by 
[c-basalt](https://github.com/c-basalt) +- **mlbtv**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/61edf57f8f13f6dfd81154174e647eb5fdd26089) ([#10296](https://github.com/yt-dlp/yt-dlp/issues/10296)) by [bashonly](https://github.com/bashonly) +- **neteasemusic**: [Extract more formats from new API](https://github.com/yt-dlp/yt-dlp/commit/7a03f88c40b80d3cf54f68edd9d4bdd6aa527570) ([#10258](https://github.com/yt-dlp/yt-dlp/issues/10258)) by [hafeoz](https://github.com/hafeoz) +- **nhkradiru**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/b8e2a5e0e1030076f833917906e19bb6c7b318f6) ([#10106](https://github.com/yt-dlp/yt-dlp/issues/10106)) by [garret1317](https://github.com/garret1317) +- **nuum**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/aefede25561a06cba398d4f593eee2fbe942693b) ([#10316](https://github.com/yt-dlp/yt-dlp/issues/10316)) by [DmitryScaletta](https://github.com/DmitryScaletta) +- **orf** + - on + - [Add `prefer_segments_playlist` extractor-arg](https://github.com/yt-dlp/yt-dlp/commit/e6a22834df1776ec4e486526f6df2bf53cb7e06f) ([#10314](https://github.com/yt-dlp/yt-dlp/issues/10314)) by [seproDev](https://github.com/seproDev) + - [Support segmented episodes](https://github.com/yt-dlp/yt-dlp/commit/8b46ad4d8b8ee8c5472af0cde863baa89ca3f425) ([#10053](https://github.com/yt-dlp/yt-dlp/issues/10053)) by [seproDev](https://github.com/seproDev) +- **patreoncampaign**: [Fix `campaign_id` extraction](https://github.com/yt-dlp/yt-dlp/commit/2e5a47da400b645aadbda6afd1156bd89c744f48) ([#10070](https://github.com/yt-dlp/yt-dlp/issues/10070)) by [bashonly](https://github.com/bashonly) +- **podbayfm**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/d4b52ce3fcb8d9578ed12365648eaba8718c603e) ([#10195](https://github.com/yt-dlp/yt-dlp/issues/10195)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev) +- **pokergo**: [Make metadata extraction 
non-fatal](https://github.com/yt-dlp/yt-dlp/commit/36e8dd832579b5375a0f6626af4268b86b4eb21a) ([#10319](https://github.com/yt-dlp/yt-dlp/issues/10319)) by [axpauls](https://github.com/axpauls) +- **qqmusic**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/4f5d7be3c5590bb257d8ff521572aee9839ab754) ([#9768](https://github.com/yt-dlp/yt-dlp/issues/9768)) by [c-basalt](https://github.com/c-basalt) +- **rtvslo.si**: show: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/92a1c4abaeeba9a69d611c57b73555cb1a1f00ad) ([#8418](https://github.com/yt-dlp/yt-dlp/issues/8418)) by [JSubelj](https://github.com/JSubelj), [seproDev](https://github.com/seproDev) +- **soundcloud**: [Fix `download` format extraction](https://github.com/yt-dlp/yt-dlp/commit/e53e56b73543799638fa6abb0c78f8b091aa84e1) ([#10125](https://github.com/yt-dlp/yt-dlp/issues/10125)) by [bashonly](https://github.com/bashonly) +- **sproutvideo**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/d6c2c2bc84f1434255be5c73baeb17d893d2c0d4) ([#10098](https://github.com/yt-dlp/yt-dlp/issues/10098)) by [bashonly](https://github.com/bashonly), [TheZ3ro](https://github.com/TheZ3ro) +- **tiktok** + - [Detect and raise when login is required](https://github.com/yt-dlp/yt-dlp/commit/ea88129784fcbb6987161df9ba05909325d8e2e9) ([#10124](https://github.com/yt-dlp/yt-dlp/issues/10124)) by [bashonly](https://github.com/bashonly) + - [Fix API extraction](https://github.com/yt-dlp/yt-dlp/commit/96472d72f29550c25c5dcedcde02c38c192b0011) ([#10216](https://github.com/yt-dlp/yt-dlp/issues/10216)) by [bashonly](https://github.com/bashonly) +- **tubitv** + - [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/bef9a9e5361fd7a72e21d0f1a8c8afb70d89e8c5) ([#9975](https://github.com/yt-dlp/yt-dlp/issues/9975)) by [chilinux](https://github.com/chilinux) + - series: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/d7d861811c15585a4f7ec9d5ae68d2ac28de28a0) ([#10116](https://github.com/yt-dlp/yt-dlp/issues/10116)) by 
[bashonly](https://github.com/bashonly) +- **vimeo**: [Support browser impersonation](https://github.com/yt-dlp/yt-dlp/commit/d4b99a233314bf31f9c842035ea9884673d5313a) ([#10327](https://github.com/yt-dlp/yt-dlp/issues/10327)) by [bashonly](https://github.com/bashonly) +- **youtube** + - [Extract all formats from multi-language m3u8s](https://github.com/yt-dlp/yt-dlp/commit/9bd85019931927a99b0fe0dc58ac51acca9fbe72) ([#9875](https://github.com/yt-dlp/yt-dlp/issues/9875)) by [bashonly](https://github.com/bashonly), [clienthax](https://github.com/clienthax) + - [Skip formats if nsig decoding fails](https://github.com/yt-dlp/yt-dlp/commit/800ec085ccf98420584d8bb38c20a2c079669b09) ([#10223](https://github.com/yt-dlp/yt-dlp/issues/10223)) by [bashonly](https://github.com/bashonly) + - [Suppress "Unavailable videos are hidden" warning](https://github.com/yt-dlp/yt-dlp/commit/24f3097ea9a470a984d0454dc013cafa2325f5f8) ([#10159](https://github.com/yt-dlp/yt-dlp/issues/10159)) by [mgedmin](https://github.com/mgedmin) + - tab: [Fix channel metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/a0d9967f6822fc279e86bce33464194985148727) ([#10071](https://github.com/yt-dlp/yt-dlp/issues/10071)) by [bashonly](https://github.com/bashonly), [shoxie007](https://github.com/shoxie007) + +#### Downloader changes +- **hls**: [Apply `extra_param_to_key_url` from info dict](https://github.com/yt-dlp/yt-dlp/commit/ca8885edd93bdf8912af6c22ee335b6222cb9ba9) by [bashonly](https://github.com/bashonly) + +#### Postprocessor changes +- **embedthumbnail**: [Fix postprocessor](https://github.com/yt-dlp/yt-dlp/commit/f2a4ea1794718e4dc0148bc172cb877f1080903b) ([#10248](https://github.com/yt-dlp/yt-dlp/issues/10248)) by [Grub4K](https://github.com/Grub4K) + +#### Networking changes +- **Request Handler**: requests: [Bump minimum `requests` version to 2.32.2](https://github.com/yt-dlp/yt-dlp/commit/db50f19d76c6870a5a13d0cab9287d684fd7449a) ([#10079](https://github.com/yt-dlp/yt-dlp/issues/10079)) 
by [bashonly](https://github.com/bashonly) + +#### Misc. changes +- **build** + - [Bump Pyinstaller to `>=6.7.0` for all builds](https://github.com/yt-dlp/yt-dlp/commit/5fdd13006a1c5d78642c8d3c4c7df0448273c2ae) ([#10069](https://github.com/yt-dlp/yt-dlp/issues/10069)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev) + - [Cache dependencies for `macos` job](https://github.com/yt-dlp/yt-dlp/commit/46c1b7cfec1d0e6155083ca7e6948674c64ecb97) ([#10088](https://github.com/yt-dlp/yt-dlp/issues/10088)) by [bashonly](https://github.com/bashonly) + - [Use `macos-12` image for `yt-dlp_macos`](https://github.com/yt-dlp/yt-dlp/commit/03334d639d5282cd4107edb32c623ba400262fc4) ([#10063](https://github.com/yt-dlp/yt-dlp/issues/10063)) by [bashonly](https://github.com/bashonly) +- **cleanup** + - [Add more ruff rules](https://github.com/yt-dlp/yt-dlp/commit/add96eb9f84cfffe85682bf2fb85135746994ee8) ([#10149](https://github.com/yt-dlp/yt-dlp/issues/10149)) by [seproDev](https://github.com/seproDev) + - [Bump ruff to 0.5.x](https://github.com/yt-dlp/yt-dlp/commit/7814c50948a2b9a4c746441ecbc509ae563d5d1f) ([#10282](https://github.com/yt-dlp/yt-dlp/issues/10282)) by [seproDev](https://github.com/seproDev) + - Miscellaneous: [6aaf96a](https://github.com/yt-dlp/yt-dlp/commit/6aaf96a3d6e7d0d426e97e11a2fcf52fda00e733) by [bashonly](https://github.com/bashonly), [c-basalt](https://github.com/c-basalt), [jucor](https://github.com/jucor), [seproDev](https://github.com/seproDev) +- **test**: download: [Raise on network errors](https://github.com/yt-dlp/yt-dlp/commit/54a63e80af82791d2f0985bd0176bb182963fd5f) ([#10283](https://github.com/yt-dlp/yt-dlp/issues/10283)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev) + ### 2024.05.27 #### Extractor changes diff --git a/Collaborators.md b/Collaborators.md index 894a853c9b..ee748eb7fd 100644 --- a/Collaborators.md +++ b/Collaborators.md @@ -61,3 +61,10 @@ You can also find lists of 
all [contributors of yt-dlp](CONTRIBUTORS) and [autho * Reworked internals like `traverse_obj`, various core refactors and bug fixes * Implemented proper progress reporting for parallel downloads * Improved/fixed/added Bundestag, crunchyroll, pr0gramm, Twitter, WrestleUniverse etc + + +## [sepro](https://github.com/seproDev) + +* UX improvements: Warn when ffmpeg is missing, warn when double-clicking exe +* Code cleanup: Remove dead extractors, mark extractors as broken, enable/apply ruff rules +* Improved/fixed/added ArdMediathek, DRTV, Floatplane, MagentaMusik, Naver, Nebula, OnDemandKorea, Vbox7 etc diff --git a/Makefile b/Makefile index e1de7f3e91..6c72ead1ef 100644 --- a/Makefile +++ b/Makefile @@ -21,7 +21,7 @@ clean-test: rm -rf test/testdata/sigs/player-*.js tmp/ *.annotations.xml *.aria2 *.description *.dump *.frag \ *.frag.aria2 *.frag.urls *.info.json *.live_chat.json *.meta *.part* *.tmp *.temp *.unknown_video *.ytdl \ *.3gp *.ape *.ass *.avi *.desktop *.f4v *.flac *.flv *.gif *.jpeg *.jpg *.lrc *.m4a *.m4v *.mhtml *.mkv *.mov *.mp3 *.mp4 \ - *.mpg *.mpga *.oga *.ogg *.opus *.png *.sbv *.srt *.ssa *.swf *.swp *.tt *.ttml *.url *.vtt *.wav *.webloc *.webm *.webp + *.mpg *.mpga *.oga *.ogg *.opus *.png *.sbv *.srt *.ssa *.swf *.tt *.ttml *.url *.vtt *.wav *.webloc *.webm *.webp clean-dist: rm -rf yt-dlp.1.temp.md yt-dlp.1 README.txt MANIFEST build/ dist/ .coverage cover/ yt-dlp.tar.gz completions/ \ yt_dlp/extractor/lazy_extractors.py *.spec CONTRIBUTING.md.tmp yt-dlp yt-dlp.exe yt_dlp.egg-info/ AUTHORS diff --git a/README.md b/README.md index 42ffd9b520..418203eea9 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![YT-DLP](https://raw.githubusercontent.com/yt-dlp/yt-dlp/master/.github/banner.svg)](#readme) [![Release version](https://img.shields.io/github/v/release/yt-dlp/yt-dlp?color=brightgreen&label=Download&style=for-the-badge)](#installation "Installation")
-[![PyPi](https://img.shields.io/badge/-PyPi-blue.svg?logo=pypi&labelColor=555555&style=for-the-badge)](https://pypi.org/project/yt-dlp "PyPi") +[![PyPI](https://img.shields.io/badge/-PyPI-blue.svg?logo=pypi&labelColor=555555&style=for-the-badge)](https://pypi.org/project/yt-dlp "PyPI") [![Donate](https://img.shields.io/badge/_-Donate-red.svg?logo=githubsponsors&labelColor=555555&style=for-the-badge)](Collaborators.md#collaborators "Donate") [![Matrix](https://img.shields.io/matrix/yt-dlp:matrix.org?color=brightgreen&labelColor=555555&label=&logo=element&style=for-the-badge)](https://matrix.to/#/#yt-dlp:matrix.org "Matrix") [![Discord](https://img.shields.io/discord/807245652072857610?color=blue&labelColor=555555&label=&logo=discord&style=for-the-badge)](https://discord.gg/H5MNcFW63r "Discord") @@ -81,7 +81,7 @@ yt-dlp is a feature-rich command-line audio/video downloader with support for [t [![Windows](https://img.shields.io/badge/-Windows_x64-blue.svg?style=for-the-badge&logo=windows)](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp.exe) [![Unix](https://img.shields.io/badge/-Linux/BSD-red.svg?style=for-the-badge&logo=linux)](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp) [![MacOS](https://img.shields.io/badge/-MacOS-lightblue.svg?style=for-the-badge&logo=apple)](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_macos) -[![PyPi](https://img.shields.io/badge/-PyPi-blue.svg?logo=pypi&labelColor=555555&style=for-the-badge)](https://pypi.org/project/yt-dlp) +[![PyPI](https://img.shields.io/badge/-PyPI-blue.svg?logo=pypi&labelColor=555555&style=for-the-badge)](https://pypi.org/project/yt-dlp) [![Source Tarball](https://img.shields.io/badge/-Source_tar-green.svg?style=for-the-badge)](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp.tar.gz) [![Other variants](https://img.shields.io/badge/-Other-grey.svg?style=for-the-badge)](#release-files) [![All 
versions](https://img.shields.io/badge/-All_Versions-lightgrey.svg?style=for-the-badge)](https://github.com/yt-dlp/yt-dlp/releases) @@ -98,15 +98,14 @@ You can install yt-dlp using [the binaries](#release-files), [pip](https://pypi. File|Description :---|:--- [yt-dlp](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp)|Platform-independent [zipimport](https://docs.python.org/3/library/zipimport.html) binary. Needs Python (recommended for **Linux/BSD**) -[yt-dlp.exe](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp.exe)|Windows (Win7 SP1+) standalone x64 binary (recommended for **Windows**) +[yt-dlp.exe](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp.exe)|Windows (Win8+) standalone x64 binary (recommended for **Windows**) [yt-dlp_macos](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_macos)|Universal MacOS (10.15+) standalone executable (recommended for **MacOS**) #### Alternatives File|Description :---|:--- -[yt-dlp_x86.exe](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_x86.exe)|Windows (Win7 SP1+) standalone x86 (32-bit) binary -[yt-dlp_min.exe](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_min.exe)|Windows (Win7 SP1+) standalone x64 binary built with `py2exe`
([Not recommended](#standalone-py2exe-builds-windows)) +[yt-dlp_x86.exe](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_x86.exe)|Windows (Win8+) standalone x86 (32-bit) binary [yt-dlp_linux](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_linux)|Linux standalone x64 binary [yt-dlp_linux_armv7l](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_linux_armv7l)|Linux standalone armv7l (32-bit) binary [yt-dlp_linux_aarch64](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_linux_aarch64)|Linux standalone aarch64 (64-bit) binary @@ -141,7 +140,7 @@ You can use `yt-dlp -U` to update if you are using the [release binaries](#relea If you [installed with pip](https://github.com/yt-dlp/yt-dlp/wiki/Installation#with-pip), simply re-run the same command that was used to install the program -For other third-party package managers, see [the wiki](https://github.com/yt-dlp/yt-dlp/wiki/Installation#third-party-package-managers) or refer their documentation +For other third-party package managers, see [the wiki](https://github.com/yt-dlp/yt-dlp/wiki/Installation#third-party-package-managers) or refer to their documentation @@ -173,21 +172,21 @@ python3 -m pip install -U --pre "yt-dlp[default]" ``` ## DEPENDENCIES -Python versions 3.8+ (CPython and PyPy) are supported. Other versions and implementations may or may not work correctly. +Python versions 3.9+ (CPython) and 3.10+ (PyPy) are supported. Other versions and implementations may or may not work correctly. While all the other dependencies are optional, `ffmpeg` and `ffprobe` are highly recommended ### Strongly recommended -* [**ffmpeg** and **ffprobe**](https://www.ffmpeg.org) - Required for [merging separate video and audio files](#format-selection) as well as for various [post-processing](#post-processing-options) tasks. 
License [depends on the build](https://www.ffmpeg.org/legal.html) +* [**ffmpeg** and **ffprobe**](https://www.ffmpeg.org) - Required for [merging separate video and audio files](#format-selection), as well as for various [post-processing](#post-processing-options) tasks. License [depends on the build](https://www.ffmpeg.org/legal.html) There are bugs in ffmpeg that cause various issues when used alongside yt-dlp. Since ffmpeg is such an important dependency, we provide [custom builds](https://github.com/yt-dlp/FFmpeg-Builds#ffmpeg-static-auto-builds) with patches for some of these issues at [yt-dlp/FFmpeg-Builds](https://github.com/yt-dlp/FFmpeg-Builds). See [the readme](https://github.com/yt-dlp/FFmpeg-Builds#patches-applied) for details on the specific issues solved by these builds - + **Important**: What you need is the ffmpeg *binary*, **NOT** [the Python package of the same name](https://pypi.org/project/ffmpeg) ### Networking @@ -198,11 +197,11 @@ While all the other dependencies are optional, `ffmpeg` and `ffprobe` are highly #### Impersonation -The following provide support for impersonating browser requests. This may be required for some sites that employ TLS fingerprinting. +The following provide support for impersonating browser requests. This may be required for some sites that employ TLS fingerprinting. -* [**curl_cffi**](https://github.com/yifeikong/curl_cffi) (recommended) - Python binding for [curl-impersonate](https://github.com/lwthiker/curl-impersonate). Provides impersonation targets for Chrome, Edge and Safari. Licensed under [MIT](https://github.com/yifeikong/curl_cffi/blob/main/LICENSE) +* [**curl_cffi**](https://github.com/lexiforest/curl_cffi) (recommended) - Python binding for [curl-impersonate](https://github.com/lexiforest/curl-impersonate). Provides impersonation targets for Chrome, Edge and Safari. Licensed under [MIT](https://github.com/lexiforest/curl_cffi/blob/main/LICENSE) * Can be installed with the `curl-cffi` group, e.g.
`pip install "yt-dlp[default,curl-cffi]"` - * Currently only included in `yt-dlp.exe` and `yt-dlp_macos` builds + * Currently included in `yt-dlp.exe`, `yt-dlp_linux` and `yt-dlp_macos` builds ### Metadata @@ -254,31 +253,19 @@ On some systems, you may need to use `py` or `python` instead of `python3`. **Important**: Running `pyinstaller` directly **instead of** using `python -m bundle.pyinstaller` is **not** officially supported. This may or may not work correctly. ### Platform-independent Binary (UNIX) -You will need the build tools `python` (3.8+), `zip`, `make` (GNU), `pandoc`\* and `pytest`\*. +You will need the build tools `python` (3.9+), `zip`, `make` (GNU), `pandoc`\* and `pytest`\*. After installing these, simply run `make`. You can also run `make yt-dlp` instead to compile only the binary without updating any of the additional files. (The build tools marked with **\*** are not needed for this) -### Standalone Py2Exe Builds (Windows) - -While we provide the option to build with [py2exe](https://www.py2exe.org), it is recommended to build [using PyInstaller](#standalone-pyinstaller-builds) instead since the py2exe builds **cannot contain `pycryptodomex`/`certifi`/`requests` and need VC++14** on the target computer to run. - -If you wish to build it anyway, install Python (if it is not already installed) and you can run the following commands: - -``` -py devscripts/install_deps.py --include py2exe -py devscripts/make_lazy_extractors.py -py -m bundle.py2exe -``` - ### Related scripts * **`devscripts/install_deps.py`** - Install dependencies for yt-dlp. -* **`devscripts/update-version.py`** - Update the version number based on current date. +* **`devscripts/update-version.py`** - Update the version number based on the current date. * **`devscripts/set-variant.py`** - Set the build variant of the executable. * **`devscripts/make_changelog.py`** - Create a markdown changelog using short commit messages and update `CONTRIBUTORS` file. 
-* **`devscripts/make_lazy_extractors.py`** - Create lazy extractors. Running this before building the binaries (any variant) will improve their startup performance. Set the environment variable `YTDLP_NO_LAZY_EXTRACTORS=1` if you wish to forcefully disable lazy extractor loading. +* **`devscripts/make_lazy_extractors.py`** - Create lazy extractors. Running this before building the binaries (any variant) will improve their startup performance. Set the environment variable `YTDLP_NO_LAZY_EXTRACTORS` to something nonempty to forcefully disable lazy extractor loading. Note: See their `--help` for more info. @@ -348,6 +335,13 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git containing directory ("-" for stdin). Can be used multiple times and inside other configuration files + --plugin-dirs PATH Path to an additional directory to search + for plugins. This option can be used + multiple times to add multiple directories. + Note that this currently only works for + extractor plugins; postprocessor plugins can + only be loaded from the default plugin + directories --flat-playlist Do not extract the videos of a playlist, only list them --no-flat-playlist Fully extract the videos of a playlist @@ -368,7 +362,9 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git stderr) to apply the setting to. Can be one of "always", "auto" (default), "never", or "no_color" (use non color terminal - sequences). Can be used multiple times + sequences). Use "auto-tty" or "no_color-tty" + to decide based on terminal support only. + Can be used multiple times --compat-options OPTS Options that can help keep compatibility with youtube-dl or youtube-dlc configurations by reverting some of the @@ -442,10 +438,10 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git E.g. 
"--date today-2weeks" downloads only videos uploaded on the same day two weeks ago --datebefore DATE Download only videos uploaded on or before - this date. The date formats accepted is the + this date. The date formats accepted are the same as --date --dateafter DATE Download only videos uploaded on or after - this date. The date formats accepted is the + this date. The date formats accepted are the same as --date --match-filters FILTER Generic video filter. Any "OUTPUT TEMPLATE" field can be compared with a number or a @@ -456,18 +452,18 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git is not present, and "&" to check multiple conditions. Use a "\" to escape "&" or quotes if needed. If used multiple times, - the filter matches if atleast one of the - conditions are met. E.g. --match-filter - !is_live --match-filter "like_count>?100 & + the filter matches if at least one of the + conditions is met. E.g. --match-filters + !is_live --match-filters "like_count>?100 & description~='(?i)\bcats \& dogs\b'" matches only videos that are not live OR those that have a like count more than 100 (or the like field is not available) and also has a description that contains the phrase "cats & - dogs" (caseless). Use "--match-filter -" to + dogs" (caseless). 
Use "--match-filters -" to interactively ask whether to download each video - --no-match-filters Do not use any --match-filter (default) + --no-match-filters Do not use any --match-filters (default) --break-match-filters FILTER Same as "--match-filters" but stops the download process when a video is rejected --no-break-match-filters Do not use any --break-match-filters (default) @@ -488,7 +484,7 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git encountering a file that is in the archive (default) --break-per-input Alters --max-downloads, --break-on-existing, - --break-match-filter, and autonumber to + --break-match-filters, and autonumber to reset per input URL --no-break-per-input --break-on-existing and similar options terminates the entire download queue @@ -674,7 +670,7 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git PROFILE to load cookies from, and the CONTAINER name (if Firefox) ("none" for no container) can be given with their - respective seperators. By default, all + respective separators. By default, all containers of the most recently accessed profile are used. Currently supported keyrings are: basictext, gnomekeyring, @@ -730,16 +726,16 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git used. This option can be used multiple times --print-to-file [WHEN:]TEMPLATE FILE Append given template to the file. The - values of WHEN and TEMPLATE are same as that - of --print. FILE uses the same syntax as the - output template. This option can be used - multiple times + values of WHEN and TEMPLATE are the same as + that of --print. FILE uses the same syntax + as the output template. This option can be + used multiple times -j, --dump-json Quiet, but print JSON information for each video. Simulate unless --no-simulate is used. 
See "OUTPUT TEMPLATE" for a description of available keys -J, --dump-single-json Quiet, but print JSON information for each - url or infojson passed. Simulate unless + URL or infojson passed. Simulate unless --no-simulate is used. If the URL refers to a playlist, the whole playlist information is dumped in a single line @@ -814,9 +810,9 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git --no-audio-multistreams Only one audio stream is downloaded for each output file (default) --prefer-free-formats Prefer video formats with free containers - over non-free ones of same quality. Use with - "-S ext" to strictly prefer free containers - irrespective of quality + over non-free ones of the same quality. Use + with "-S ext" to strictly prefer free + containers irrespective of quality --no-prefer-free-formats Don't give any special preference to free containers (default) --check-formats Make sure formats are selected only from @@ -841,15 +837,17 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git (default) (Alias: --no-write-automatic-subs) --list-subs List available subtitles of each video. Simulate unless --no-simulate is used - --sub-format FORMAT Subtitle format; accepts formats preference, - e.g. "srt" or "ass/srt/best" + --sub-format FORMAT Subtitle format; accepts formats preference + separated by "/", e.g. "srt" or "ass/srt/best" --sub-langs LANGS Languages of the subtitles to download (can be regex) or "all" separated by commas, e.g. - --sub-langs "en.*,ja". You can prefix the - language code with a "-" to exclude it from - the requested languages, e.g. --sub-langs - all,-live_chat. Use --list-subs for a list - of available language tags + --sub-langs "en.*,ja" (where "en.*" is a + regex pattern that matches "en" followed by + 0 or more of any character). You can prefix + the language code with a "-" to exclude it + from the requested languages, e.g. --sub- + langs all,-live_chat. 
Use --list-subs for a + list of available language tags ## Authentication Options: -u, --username USERNAME Login with this account ID @@ -897,9 +895,9 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git necessary (currently supported: avi, flv, gif, mkv, mov, mp4, webm, aac, aiff, alac, flac, m4a, mka, mp3, ogg, opus, vorbis, - wav). If target container does not support - the video/audio codec, remuxing will fail. - You can specify multiple rules; e.g. + wav). If the target container does not + support the video/audio codec, remuxing will + fail. You can specify multiple rules; e.g. "aac>m4a/mov>mp4/mkv" will remux aac to m4a, mov to mp4 and anything else to mkv --recode-video FORMAT Re-encode the video into another format if @@ -967,29 +965,29 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git are the same as that of --use-postprocessor (default: pre_process) --xattrs Write metadata to the video file's xattrs - (using dublin core and xdg standards) + (using Dublin Core and XDG standards) --concat-playlist POLICY Concatenate videos in a playlist. One of "never", "always", or "multi_video" (default; only when the videos form a single - show). All the video files must have same - codecs and number of streams to be - concatable. The "pl_video:" prefix can be + show). All the video files must have the + same codecs and number of streams to be + concatenable. The "pl_video:" prefix can be used with "--paths" and "--output" to set the output filename for the concatenated files. See "OUTPUT TEMPLATE" for details --fixup POLICY Automatically correct known faults of the file. 
One of never (do nothing), warn (only emit a warning), detect_or_warn (the - default; fix file if we can, warn - otherwise), force (try fixing even if file - already exists) + default; fix the file if we can, warn + otherwise), force (try fixing even if the + file already exists) --ffmpeg-location PATH Location of the ffmpeg binary; either the path to the binary or its containing directory --exec [WHEN:]CMD Execute a command, optionally prefixed with when to execute it, separated by a ":". Supported values of "WHEN" are the same as that of --use-postprocessor (default: - after_move). Same syntax as the output + after_move). The same syntax as the output template can be used to pass any field as arguments to the command. If no fields are passed, %(filepath,_filename|)q is appended @@ -997,12 +995,16 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git be used multiple times --no-exec Remove any previously defined --exec --convert-subs FORMAT Convert the subtitles to another format - (currently supported: ass, lrc, srt, vtt) - (Alias: --convert-subtitles) + (currently supported: ass, lrc, srt, vtt). + Use "--convert-subs none" to disable + conversion (default) (Alias: --convert- + subtitles) --convert-thumbnails FORMAT Convert the thumbnails to another format (currently supported: jpg, png, webp). You can specify multiple rules using similar - syntax as --remux-video + syntax as "--remux-video". Use "--convert- + thumbnails none" to disable conversion + (default) --split-chapters Split video into multiple files based on internal chapters. 
The "chapter:" prefix can be used with "--paths" and "--output" to set @@ -1023,7 +1025,7 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git --no-force-keyframes-at-cuts Do not force keyframes around the chapters when cutting/splitting (default) --use-postprocessor NAME[:ARGS] - The (case sensitive) name of plugin + The (case-sensitive) name of plugin postprocessors to be enabled, and (optionally) arguments to be passed to it, separated by a colon ":". ARGS are a @@ -1036,8 +1038,8 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git --print/--output), "before_dl" (before each video download), "post_process" (after each video download; default), "after_move" - (after moving video file to it's final - locations), "after_video" (after downloading + (after moving the video file to its final + location), "after_video" (after downloading and processing all formats of a video), or "playlist" (at end of playlist). This option can be used multiple times to add different @@ -1055,7 +1057,7 @@ Make chapter entries for, or remove various segments (sponsor, music_offtopic, poi_highlight, chapter, all and default (=all). You can prefix the category with a "-" to exclude it. See [1] - for description of the categories. E.g. + for descriptions of the categories. E.g. --sponsorblock-mark all,-preview [1] https://wiki.sponsor.ajay.app/w/Segment_Categories --sponsorblock-remove CATS SponsorBlock categories to be removed from @@ -1087,7 +1089,7 @@ Make chapter entries for, or remove various segments (sponsor, (Alias: --no-allow-dynamic-mpd) --hls-split-discontinuity Split HLS playlists to different formats at discontinuities such as ad breaks - --no-hls-split-discontinuity Do not split HLS playlists to different + --no-hls-split-discontinuity Do not split HLS playlists into different formats at discontinuities such as ad breaks (default) --extractor-args IE_KEY:ARGS Pass ARGS arguments to the IE_KEY extractor. 
@@ -1097,7 +1099,7 @@ Make chapter entries for, or remove various segments (sponsor, # CONFIGURATION -You can configure yt-dlp by placing any supported command line option to a configuration file. The configuration is loaded from the following locations: +You can configure yt-dlp by placing any supported command line option in a configuration file. The configuration is loaded from the following locations: 1. **Main Configuration**: * The file given to `--config-location` @@ -1125,7 +1127,7 @@ You can configure yt-dlp by placing any supported command line option to a confi * `/etc/yt-dlp/config` * `/etc/yt-dlp/config.txt` -E.g. with the following configuration file yt-dlp will always extract the audio, not copy the mtime, use a proxy and save all videos under `YouTube` directory in your home directory: +E.g. with the following configuration file, yt-dlp will always extract the audio, not copy the mtime, use a proxy and save all videos under the `YouTube` directory in your home directory: ``` # Lines starting with # are comments @@ -1142,7 +1144,7 @@ E.g. with the following configuration file yt-dlp will always extract the audio, -o ~/YouTube/%(title)s.%(ext)s ``` -**Note**: Options in configuration file are just the same options aka switches used in regular command line calls; thus there **must be no whitespace** after `-` or `--`, e.g. `-o` or `--proxy` but not `- o` or `-- proxy`. They must also be quoted when necessary as-if it were a UNIX shell. +**Note**: Options in a configuration file are just the same options aka switches used in regular command line calls; thus there **must be no whitespace** after `-` or `--`, e.g. `-o` or `--proxy` but not `- o` or `-- proxy`. They must also be quoted when necessary, as if in a UNIX shell. You can use `--ignore-config` if you want to disable all configuration files for a particular yt-dlp run. If `--ignore-config` is found inside any configuration file, no further configuration will be loaded.
For example, having the option in the portable configuration file prevents loading of home, user, and system configurations. Additionally, (for backward compatibility) if `--ignore-config` is found inside the system configuration file, the user configuration is not loaded. @@ -1154,12 +1156,12 @@ If you want your file to be decoded differently, add `# coding: ENCODING` to the ### Authentication with netrc -You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every yt-dlp execution and prevent tracking plain text passwords in the shell command history. You can achieve this using a [`.netrc` file](https://stackoverflow.com/tags/.netrc/info) on a per-extractor basis. For that you will need to create a `.netrc` file in `--netrc-location` and restrict permissions to read/write by only you: +You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every yt-dlp execution and prevent tracking plain text passwords in the shell command history. You can achieve this using a [`.netrc` file](https://stackoverflow.com/tags/.netrc/info) on a per-extractor basis. 
For that, you will need to create a `.netrc` file in `--netrc-location` and restrict permissions to read/write by only you: ``` touch ${HOME}/.netrc chmod a-rwx,u+rw ${HOME}/.netrc ``` -After that you can add credentials for an extractor in the following format, where *extractor* is the name of the extractor in lowercase: +After that, you can add credentials for an extractor in the following format, where *extractor* is the name of the extractor in lowercase: ``` machine <extractor> login <username> password <password> ``` @@ -1176,13 +1178,13 @@ As an alternative to using the `.netrc` file, which has the disadvantage of keep E.g. To use an encrypted `.netrc` file stored as `.authinfo.gpg` ``` -yt-dlp --netrc-cmd 'gpg --decrypt ~/.authinfo.gpg' https://www.youtube.com/watch?v=BaW_jenozKc +yt-dlp --netrc-cmd 'gpg --decrypt ~/.authinfo.gpg' 'https://www.youtube.com/watch?v=BaW_jenozKc' ``` ### Notes about environment variables * Environment variables are normally specified as `${VARIABLE}`/`$VARIABLE` on UNIX and `%VARIABLE%` on Windows; but is always shown as `${VARIABLE}` in this documentation -* yt-dlp also allow using UNIX-style variables on Windows for path-like options; e.g. `--output`, `--config-location` +* yt-dlp also allows using UNIX-style variables on Windows for path-like options; e.g. `--output`, `--config-location` * If unset, `${XDG_CONFIG_HOME}` defaults to `~/.config` and `${XDG_CACHE_HOME}` to `~/.cache` * On Windows, `~` points to `${HOME}` if present; or, `${USERPROFILE}` or `${HOMEDRIVE}${HOMEPATH}` otherwise * On Windows, `${USERPROFILE}` generally points to `C:\Users\<user name>` and `${APPDATA}` to `${USERPROFILE}\AppData\Roaming` @@ -1201,7 +1203,7 @@ It may however also contain special sequences that will be replaced when downloa The field names themselves (the part inside the parenthesis) can also have some special formatting: -1. **Object traversal**: The dictionaries and lists available in metadata can be traversed by using a dot `.` separator; e.g.
`%(tags.0)s`, `%(subtitles.en.-1.ext)s`. You can do Python slicing with colon `:`; E.g. `%(id.3:7:-1)s`, `%(formats.:.format_id)s`. Curly braces `{}` can be used to build dictionaries with only specific keys; e.g. `%(formats.:.{format_id,height})#j`. An empty field name `%()s` refers to the entire infodict; e.g. `%(.{id,title})s`. Note that all the fields that become available using this method are not listed below. Use `-j` to see such fields +1. **Object traversal**: The dictionaries and lists available in metadata can be traversed by using a dot `.` separator; e.g. `%(tags.0)s`, `%(subtitles.en.-1.ext)s`. You can do Python slicing with colon `:`; E.g. `%(id.3:7)s`, `%(id.6:2:-1)s`, `%(formats.:.format_id)s`. Curly braces `{}` can be used to build dictionaries with only specific keys; e.g. `%(formats.:.{format_id,height})#j`. An empty field name `%()s` refers to the entire infodict; e.g. `%(.{id,title})s`. Note that all the fields that become available using this method are not listed below. Use `-j` to see such fields 1. **Arithmetic**: Simple arithmetic can be done on numeric fields using `+`, `-` and `*`. E.g. `%(playlist_index+10)03d`, `%(n_entries+1-playlist_index)d` @@ -1222,7 +1224,7 @@ To summarize, the general syntax for a field is: %(name[.keys][addition][>strf][,alternate][&replacement][|default])[flags][width][.precision][length]type ``` -Additionally, you can set different output templates for the various metadata files separately from the general output template by specifying the type of file followed by the template separated by a colon `:`. The different file types supported are `subtitle`, `thumbnail`, `description`, `annotation` (deprecated), `infojson`, `link`, `pl_thumbnail`, `pl_description`, `pl_infojson`, `chapter`, `pl_video`. E.g. `-o "%(title)s.%(ext)s" -o "thumbnail:%(title)s\%(title)s.%(ext)s"` will put the thumbnails in a folder with the same name as the video. If any of the templates is empty, that type of file will not be written. 
E.g. `--write-thumbnail -o "thumbnail:"` will write thumbnails only for playlists and not for video. +Additionally, you can set different output templates for the various metadata files separately from the general output template by specifying the type of file followed by the template separated by a colon `:`. The different file types supported are `subtitle`, `thumbnail`, `description`, `annotation` (deprecated), `infojson`, `link`, `pl_thumbnail`, `pl_description`, `pl_infojson`, `chapter`, `pl_video`. E.g. `-o "%(title)s.%(ext)s" -o "thumbnail:%(title)s\%(title)s.%(ext)s"` will put the thumbnails in a folder with the same name as the video. If any of the templates is empty, that type of file will not be written. E.g. `--write-thumbnail -o "thumbnail:"` will write thumbnails only for playlists and not for video. @@ -1263,7 +1265,7 @@ The available fields are: - `like_count` (numeric): Number of positive ratings of the video - `dislike_count` (numeric): Number of negative ratings of the video - `repost_count` (numeric): Number of reposts of the video - - `average_rating` (numeric): Average rating give by users, the scale used depends on the webpage + - `average_rating` (numeric): Average rating given by users, the scale used depends on the webpage - `comment_count` (numeric): Number of comments on the video (For some extractors, comments are only downloaded at the end, and so this field cannot be used) - `age_limit` (numeric): Age restriction for the video (years) - `live_status` (string): One of "not_live", "is_live", "is_upcoming", "was_live", "post_live" (was live, but VOD is not yet processed) @@ -1282,16 +1284,18 @@ The available fields are: - `n_entries` (numeric): Total number of extracted items in the playlist - `playlist_id` (string): Identifier of the playlist that contains the video - `playlist_title` (string): Name of the playlist that contains the video - - `playlist` (string): `playlist_id` or `playlist_title` + - `playlist` (string): 
`playlist_title` if available or else `playlist_id` - `playlist_count` (numeric): Total number of items in the playlist. May not be known if entire playlist is not extracted - `playlist_index` (numeric): Index of the video in the playlist padded with leading zeros according the final index - `playlist_autonumber` (numeric): Position of the video in the playlist download queue padded with leading zeros according to the total length of the playlist - `playlist_uploader` (string): Full name of the playlist uploader - `playlist_uploader_id` (string): Nickname or id of the playlist uploader - - `webpage_url` (string): A URL to the video webpage which if given to yt-dlp should allow to get the same result again + - `playlist_channel` (string): Display name of the channel that uploaded the playlist + - `playlist_channel_id` (string): Identifier of the channel that uploaded the playlist + - `webpage_url` (string): A URL to the video webpage which, if given to yt-dlp, should yield the same result again - `webpage_url_basename` (string): The basename of the webpage URL - `webpage_url_domain` (string): The domain of the webpage URL - - `original_url` (string): The URL given by the user (or same as `webpage_url` for playlist entries) + - `original_url` (string): The URL given by the user (or the same as `webpage_url` for playlist entries) - `categories` (list): List of categories the video belongs to - `tags` (list): List of tags assigned to the video - `cast` (list): List of cast members @@ -1304,10 +1308,10 @@ Available for the video that belongs to some logical chapter or section: - `chapter_number` (numeric): Number of the chapter the video belongs to - `chapter_id` (string): Id of the chapter the video belongs to -Available for the video that is an episode of some series or programme: +Available for the video that is an episode of some series or program: - - `series` (string): Title of the series or programme the video episode belongs to - - `series_id` (string): Id of 
the series or programme the video episode belongs to + - `series` (string): Title of the series or program the video episode belongs to + - `series_id` (string): Id of the series or program the video episode belongs to - `season` (string): Title of the season the video episode belongs to - `season_number` (numeric): Number of the season the video episode belongs to - `season_id` (string): Id of the season the video episode belongs to @@ -1347,9 +1351,9 @@ Available only when used in `--print`: - `thumbnails_table` (table): The thumbnail format table as printed by `--list-thumbnails` - `subtitles_table` (table): The subtitle format table as printed by `--list-subs` - `automatic_captions_table` (table): The automatic subtitle format table as printed by `--list-subs` - + Available only after the video is downloaded (`post_process`/`after_move`): - + - `filepath`: Actual path of downloaded video file Available only in `--sponsorblock-chapter-title`: @@ -1364,11 +1368,11 @@ Available only in `--sponsorblock-chapter-title`: Each aforementioned sequence when referenced in an output template will be replaced by the actual value corresponding to the sequence name. E.g. for `-o %(title)s-%(id)s.%(ext)s` and an mp4 video with title `yt-dlp test video` and id `BaW_jenozKc`, this will result in a `yt-dlp test video-BaW_jenozKc.mp4` file created in the current directory. -**Note**: Some of the sequences are not guaranteed to be present since they depend on the metadata obtained by a particular extractor. Such sequences will be replaced with placeholder value provided with `--output-na-placeholder` (`NA` by default). +**Note**: Some of the sequences are not guaranteed to be present, since they depend on the metadata obtained by a particular extractor. Such sequences will be replaced with the placeholder value provided with `--output-na-placeholder` (`NA` by default).
**Tip**: Look at the `-j` output to identify which fields are available for the particular URL -For numeric sequences you can use [numeric related formatting](https://docs.python.org/3/library/stdtypes.html#printf-style-string-formatting); e.g. `%(view_count)05d` will result in a string with view count padded with zeros up to 5 characters, like in `00042`. +For numeric sequences, you can use [numeric related formatting](https://docs.python.org/3/library/stdtypes.html#printf-style-string-formatting); e.g. `%(view_count)05d` will result in a string with view count padded with zeros up to 5 characters, like in `00042`. Output templates can also contain arbitrary hierarchical path, e.g. `-o "%(playlist)s/%(playlist_index)s - %(title)s.%(ext)s"` which will result in downloading each video in a directory corresponding to this path template. Any missing directory will be automatically created for you. @@ -1410,7 +1414,7 @@ $ yt-dlp -P "C:/MyVideos" -o "%(series)s/%(season_number)s - %(season)s/%(episod # Download video as "C:\MyVideos\uploader\title.ext", subtitles as "C:\MyVideos\subs\uploader\title.ext" # and put all temporary files in "C:\MyVideos\tmp" -$ yt-dlp -P "C:/MyVideos" -P "temp:tmp" -P "subtitle:subs" -o "%(uploader)s/%(title)s.%(ext)s" BaW_jenoz --write-subs +$ yt-dlp -P "C:/MyVideos" -P "temp:tmp" -P "subtitle:subs" -o "%(uploader)s/%(title)s.%(ext)s" BaW_jenozKc --write-subs # Download video as "C:\MyVideos\uploader\title.ext" and subtitles as "C:\MyVideos\uploader\subs\title.ext" $ yt-dlp -P "C:/MyVideos" -o "%(uploader)s/%(title)s.%(ext)s" -o "subtitle:%(uploader)s/subs/%(title)s.%(ext)s" BaW_jenozKc --write-subs @@ -1442,7 +1446,7 @@ You can also use special names to select particular edge case formats: - `all`: Select **all formats** separately - `mergeall`: Select and **merge all formats** (Must be used with `--audio-multistreams`, `--video-multistreams` or both) - - `b*`, `best*`: Select the best quality format that **contains either** a video or an 
audio or both (ie; `vcodec!=none or acodec!=none`) + - `b*`, `best*`: Select the best quality format that **contains either** a video or an audio or both (i.e., `vcodec!=none or acodec!=none`) - `b`, `best`: Select the best quality format that **contains both** video and audio. Equivalent to `best*[vcodec!=none][acodec!=none]` - `bv`, `bestvideo`: Select the best quality **video-only** format. Equivalent to `best*[acodec=none]` - `bv*`, `bestvideo*`: Select the best quality format that **contains video**. It may also contain audio. Equivalent to `best*[vcodec!=none]` @@ -1455,7 +1459,7 @@ You can also use special names to select particular edge case formats: - `wa`, `worstaudio`: Select the worst quality audio-only format. Equivalent to `worst*[vcodec=none]` - `wa*`, `worstaudio*`: Select the worst quality format that contains audio. It may also contain video. Equivalent to `worst*[acodec!=none]` -For example, to download the worst quality video-only format you can use `-f worstvideo`. It is however recommended not to use `worst` and related options. When your format selector is `worst`, the format which is worst in all respects is selected. Most of the time, what you actually want is the video with the smallest filesize instead. So it is generally better to use `-S +size` or more rigorously, `-S +size,+br,+res,+fps` instead of `-f worst`. See [Sorting Formats](#sorting-formats) for more details. +For example, to download the worst quality video-only format you can use `-f worstvideo`. It is, however, recommended not to use `worst` and related options. When your format selector is `worst`, the format which is worst in all respects is selected. Most of the time, what you actually want is the video with the smallest filesize instead. So it is generally better to use `-S +size` or more rigorously, `-S +size,+br,+res,+fps` instead of `-f worst`. See [Sorting Formats](#sorting-formats) for more details. You can select the n'th best format of a type by using `best<type>.<n>`.
For example, `best.2` will select the 2nd best combined format. Similarly, `bv*.3` will select the 3rd best format that contains a video stream. @@ -1505,7 +1509,7 @@ Also filtering work for comparisons `=` (equals), `^=` (starts with), `$=` (ends Any string comparison may be prefixed with negation `!` in order to produce an opposite comparison, e.g. `!*=` (does not contain). The comparand of a string comparison needs to be quoted with either double or single quotes if it contains spaces or special characters other than `._-`. -**Note**: None of the aforementioned meta fields are guaranteed to be present since this solely depends on the metadata obtained by particular extractor, i.e. the metadata offered by the website. Any other field made available by the extractor can also be used for filtering. +**Note**: None of the aforementioned meta fields are guaranteed to be present since this solely depends on the metadata obtained by the particular extractor, i.e. the metadata offered by the website. Any other field made available by the extractor can also be used for filtering. Formats for which the value is not known are excluded unless you put a question mark (`?`) after the operator. You can combine format filters, so `-f "bv[height<=?720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 kbps. You can also use the filters with `all` to download all formats that satisfy the filter, e.g. `-f "all[vcodec=none]"` selects all audio-only formats. @@ -1544,14 +1548,14 @@ The available fields are: - `abr`: Average audio bitrate in [kbps](## "1000 bits/sec") - `br`: Average bitrate in [kbps](## "1000 bits/sec"), `tbr`/`vbr`/`abr` - `asr`: Audio sample rate in Hz - + **Deprecation warning**: Many of these fields have (currently undocumented) aliases, that may be removed in a future version. It is recommended to use only the documented field names. 
All fields, unless specified otherwise, are sorted in descending order. To reverse this, prefix the field with a `+`. E.g. `+res` prefers format with the smallest resolution. Additionally, you can suffix a preferred value for the fields, separated by a `:`. E.g. `res:720` prefers larger videos, but no larger than 720p and the smallest video if there are no videos less than 720p. For `codec` and `ext`, you can provide two preferred values, the first for video and the second for audio. E.g. `+codec:avc:m4a` (equivalent to `+vcodec:avc,+acodec:m4a`) sets the video codec preference to `h264` > `h265` > `vp9` > `vp9.2` > `av01` > `vp8` > `h263` > `theora` and audio codec preference to `mp4a` > `aac` > `vorbis` > `opus` > `mp3` > `ac3` > `dts`. You can also make the sorting prefer the nearest values to the provided by using `~` as the delimiter. E.g. `filesize~1G` prefers the format with filesize closest to 1 GiB. -The fields `hasvid` and `ie_pref` are always given highest priority in sorting, irrespective of the user-defined order. This behaviour can be changed by using `--format-sort-force`. Apart from these, the default order used is: `lang,quality,res,fps,hdr:12,vcodec:vp9.2,channels,acodec,size,br,asr,proto,ext,hasaud,source,id`. The extractors may override this default order, but they cannot override the user-provided order. +The fields `hasvid` and `ie_pref` are always given highest priority in sorting, irrespective of the user-defined order. This behavior can be changed by using `--format-sort-force`. Apart from these, the default order used is: `lang,quality,res,fps,hdr:12,vcodec:vp9.2,channels,acodec,size,br,asr,proto,ext,hasaud,source,id`. The extractors may override this default order, but they cannot override the user-provided order. -Note that the default has `vcodec:vp9.2`; i.e. `av1` is not preferred. Similarly, the default for hdr is `hdr:12`; i.e. dolby vision is not preferred. 
These choices are made since DV and AV1 formats are not yet fully compatible with most devices. This may be changed in the future as more devices become capable of smoothly playing back these formats. +Note that the default has `vcodec:vp9.2`; i.e. `av1` is not preferred. Similarly, the default for hdr is `hdr:12`; i.e. Dolby Vision is not preferred. These choices are made since DV and AV1 formats are not yet fully compatible with most devices. This may be changed in the future as more devices become capable of smoothly playing back these formats. If your format selector is `worst`, the last item is selected after sorting. This means it will select the format that is worst in all respects. Most of the time, what you actually want is the video with the smallest filesize instead. So it is generally better to use `-f best -S +size,+br,+res,+fps`. @@ -1628,11 +1632,11 @@ $ yt-dlp -S "res:480" # or the worst video (that also has audio) if there is no video under 50 MB $ yt-dlp -f "b[filesize<50M] / w" -# Download largest video (that also has audio) but no bigger than 50 MB, +# Download the largest video (that also has audio) but no bigger than 50 MB, # or the smallest video (that also has audio) if there is no video under 50 MB $ yt-dlp -f "b" -S "filesize:50M" -# Download best video (that also has audio) that is closest in size to 50 MB +# Download the best video (that also has audio) that is closest in size to 50 MB $ yt-dlp -f "b" -S "filesize~50M" @@ -1688,7 +1692,7 @@ The metadata obtained by the extractors can be modified by using `--parse-metada The general syntax of `--parse-metadata FROM:TO` is to give the name of a field or an [output template](#output-template) to extract data from, and the format to interpret it as, separated by a colon `:`. 
Either a [Python regular expression](https://docs.python.org/3/library/re.html#regular-expression-syntax) with named capture groups, a single field name, or a similar syntax to the [output template](#output-template) (only `%(field)s` formatting is supported) can be used for `TO`. The option can be used multiple times to parse and modify various fields. -Note that these options preserve their relative order, allowing replacements to be made in parsed fields and viceversa. Also, any field thus created can be used in the [output template](#output-template) and will also affect the media file's metadata added when using `--embed-metadata`. +Note that these options preserve their relative order, allowing replacements to be made in parsed fields and vice versa. Also, any field thus created can be used in the [output template](#output-template) and will also affect the media file's metadata added when using `--embed-metadata`. This option also has a few special uses: @@ -1754,7 +1758,7 @@ $ yt-dlp --replace-in-metadata "title,uploader" "[ _]" "-" # EXTRACTOR ARGUMENTS -Some extractors accept additional arguments which can be passed using `--extractor-args KEY:ARGS`. `ARGS` is a `;` (semicolon) separated string of `ARG=VAL1,VAL2`. E.g. `--extractor-args "youtube:player-client=android_embedded,web;formats=incomplete" --extractor-args "funimation:version=uncut"` +Some extractors accept additional arguments which can be passed using `--extractor-args KEY:ARGS`. `ARGS` is a `;` (semicolon) separated string of `ARG=VAL1,VAL2`. E.g. `--extractor-args "youtube:player-client=mediaconnect,web;formats=incomplete" --extractor-args "funimation:version=uncut"` Note: In CLI, `ARG` can use `-` instead of `_`; e.g. `youtube:player-client"` becomes `youtube:player_client"` @@ -1763,7 +1767,7 @@ The following extractors use this feature: #### youtube * `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). 
By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively -* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (e.g. `web_embedded`); and `mweb`, `mweb_embedscreen` and `tv_embedded` (agegate bypass) with no variants. By default, `ios,web` is used, but `tv_embedded` and `creator` variants are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. The `android` clients will always be given lowest priority since their formats are broken. You can use `all` to use all the clients, and `default` for the default clients. +* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music` and `_creator` (e.g. `ios_creator`); and `mweb`, `mediaconnect`, `android_testsuite`, `android_vr`, `web_safari`, `web_embedded`, `tv` and `tv_embedded` with no variants. By default, `ios,mweb` is used, and `web_creator,mediaconnect` is added as needed for age-gated videos when account age verification is required. Similarly, the `_music` variants are added for `music.youtube.com` URLs. Some clients, such as `web` and `android`, require a `po_token` for their formats to be downloadable. Some clients, such as the `_creator` variants, will only work with authentication. You can use `all` to use all the clients, and `default` for the default clients. You can prefix a client with `-` to exclude it, e.g. 
`youtube:player_client=all,-web` * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details * `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp. * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) @@ -1771,18 +1775,23 @@ The following extractors use this feature: * E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total * `formats`: Change the types of formats to return. `dashy` (convert HTTP to DASH), `duplicate` (identical content but different URLs or protocol; includes `dashy`), `incomplete` (cannot be downloaded completely - live dash and post-live m3u8) * `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others -* `innertube_key`: Innertube API key to use for all API requests +* `innertube_key`: Innertube API key to use for all API requests. By default, no API key is used * `raise_incomplete_data`: `Incomplete Data Received` raises an error instead of reporting a warning +* `data_sync_id`: Overrides the account Data Sync ID used in Innertube API requests. This may be needed if you are using an account with `youtube:player_skip=webpage,configs` or `youtubetab:skip=webpage` +* `visitor_data`: Overrides the Visitor Data used in Innertube API requests. This should be used with `player_skip=webpage,configs` and without cookies. 
Note: this may have adverse effects if used improperly. If a session from a browser is wanted, you should pass cookies instead (which contain the Visitor ID) +* `po_token`: Proof of Origin (PO) Token(s) to use for requesting video playback. Comma-separated list of PO Tokens in the format `CLIENT+PO_TOKEN`, e.g. `youtube:po_token=web+XXX,android+YYY` #### youtubetab (YouTube playlists, channels, feeds, etc.) * `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details) * `approximate_date`: Extract approximate `upload_date` and `timestamp` in flat-playlist. This may cause date-based filters to be slightly off #### generic -* `fragment_query`: Passthrough any query in mpd/m3u8 manifest URLs to their fragments if no value is provided, or else apply the query string given as `fragment_query=VALUE`. Does not apply to ffmpeg +* `fragment_query`: Passthrough any query in mpd/m3u8 manifest URLs to their fragments if no value is provided, or else apply the query string given as `fragment_query=VALUE`. Note that if the stream has an HLS AES-128 key, then the query parameters will be passed to the key URI as well, unless the `key_query` extractor-arg is passed, or unless an external key URI is provided via the `hls_key` extractor-arg. Does not apply to ffmpeg * `variant_query`: Passthrough the master m3u8 URL query to its variant playlist URLs if no value is provided, or else apply the query string given as `variant_query=VALUE` +* `key_query`: Passthrough the master m3u8 URL query to its HLS AES-128 decryption key URI if no value is provided, or else apply the query string given as `key_query=VALUE`. Note that this will have no effect if the key URI is provided via the `hls_key` extractor-arg. 
Does not apply to ffmpeg * `hls_key`: An HLS AES-128 key URI *or* key (as hex), and optionally the IV (as hex), in the form of `(URI|KEY)[,IV]`; e.g. `generic:hls_key=ABCDEF1234567980,0xFEDCBA0987654321`. Passing any of these values will force usage of the native HLS downloader and override the corresponding values found in the m3u8 playlist * `is_live`: Bypass live HLS detection and manually set `live_status` - a value of `false` will set `not_live`, any other value (or no value) will set `is_live` +* `impersonate`: Target(s) to try and impersonate with the initial webpage request; e.g. `generic:impersonate=safari,chrome-110`. Use `generic:impersonate` to impersonate any available target, and use `generic:impersonate=false` to disable impersonation (default) #### funimation * `language`: Audio languages to extract, e.g. `funimation:language=english,japanese` @@ -1848,7 +1857,16 @@ The following extractors use this feature: * `cdn`: One or more CDN IDs to use with the API call for stream URLs, e.g. `gcp_cdn`, `gs_cdn_pc_app`, `gs_cdn_mobile_web`, `gs_cdn_pc_web` #### soundcloud -* `formats`: Formats to request from the API. Requested values should be in the format of `{protocol}_{extension}` (omitting the bitrate), e.g. `hls_opus,http_aac`. The `*` character functions as a wildcard, e.g. `*_mp3`, and can passed by itself to request all formats. Known protocols include `http`, `hls` and `hls-aes`; known extensions include `aac`, `opus` and `mp3`. Original `download` formats are always extracted. Default is `http_aac,hls_aac,http_opus,hls_opus,http_mp3,hls_mp3` +* `formats`: Formats to request from the API. Requested values should be in the format of `{protocol}_{extension}` (omitting the bitrate), e.g. `hls_opus,http_aac`. The `*` character functions as a wildcard, e.g. `*_mp3`, and can be passed by itself to request all formats. Known protocols include `http`, `hls` and `hls-aes`; known extensions include `aac`, `opus` and `mp3`. 
Original `download` formats are always extracted. Default is `http_aac,hls_aac,http_opus,hls_opus,http_mp3,hls_mp3` + +#### orfon (orf:on) +* `prefer_segments_playlist`: Prefer a playlist of program segments instead of a single complete video when available. If individual segments are desired, use `--concat-playlist never --extractor-args "orfon:prefer_segments_playlist"` + +#### bilibili +* `prefer_multi_flv`: Prefer extracting flv formats over mp4 for older videos that still provide legacy formats + +#### digitalconcerthall +* `prefer_combined_hls`: Prefer extracting combined/pre-merged video and audio HLS formats. This will exclude 4K/HEVC video and lossless/FLAC audio formats, which are only available as split video/audio HLS formats **Note**: These options may be changed/removed in the future without concern for backward compatibility @@ -1859,16 +1877,16 @@ The following extractors use this feature: Note that **all** plugins are imported even if not invoked, and that **there are no checks** performed on plugin code. **Use plugins at your own risk and only if you trust the code!** -Plugins can be of `<type>`s `extractor` or `postprocessor`. -- Extractor plugins do not need to be enabled from the CLI and are automatically invoked when the input URL is suitable for it. -- Extractor plugins take priority over builtin extractors. +Plugins can be of `<type>`s `extractor` or `postprocessor`. +- Extractor plugins do not need to be enabled from the CLI and are automatically invoked when the input URL is suitable for it. +- Extractor plugins take priority over built-in extractors. - Postprocessor plugins can be invoked using `--use-postprocessor NAME`. Plugins are loaded from the namespace packages `yt_dlp_plugins.extractor` and `yt_dlp_plugins.postprocessor`. 
In other words, the file structure on the disk looks something like: - + yt_dlp_plugins/ extractor/ myplugin.py @@ -1876,6 +1894,7 @@ In other words, the file structure on the disk looks something like: myplugin.py yt-dlp looks for these `yt_dlp_plugins` namespace folders in many locations (see below) and loads in plugins from **all** of them. +Set the environment variable `YTDLP_NO_PLUGINS` to something nonempty to disable loading plugins entirely. See the [wiki for some known plugins](https://github.com/yt-dlp/yt-dlp/wiki/Plugins) @@ -1903,7 +1922,7 @@ Plugins can be installed using various methods and locations. * Plugin packages can be installed and managed using `pip`. See [yt-dlp-sample-plugins](https://github.com/yt-dlp/yt-dlp-sample-plugins) for an example. * Note: plugin files between plugin packages installed with pip must have unique filenames. * Any path in `PYTHONPATH` is searched in for the `yt_dlp_plugins` namespace folder. - * Note: This does not apply for Pyinstaller/py2exe builds. + * Note: This does not apply for Pyinstaller builds. `.zip`, `.egg` and `.whl` archives containing a `yt_dlp_plugins` namespace folder in their root are also supported as plugin packages. @@ -1916,7 +1935,7 @@ Run yt-dlp with `--verbose` to check if the plugin has been loaded. See the [yt-dlp-sample-plugins](https://github.com/yt-dlp/yt-dlp-sample-plugins) repo for a template plugin package and the [Plugin Development](https://github.com/yt-dlp/yt-dlp/wiki/Plugin-Development) section of the wiki for a plugin development guide. -All public classes with a name ending in `IE`/`PP` are imported from each file for extractors and postprocessors repectively. This respects underscore prefix (e.g. `_MyBasePluginIE` is private) and `__all__`. Modules can similarly be excluded by prefixing the module name with an underscore (e.g. `_myplugin.py`). +All public classes with a name ending in `IE`/`PP` are imported from each file for extractors and postprocessors respectively. 
This respects underscore prefix (e.g. `_MyBasePluginIE` is private) and `__all__`. Modules can similarly be excluded by prefixing the module name with an underscore (e.g. `_myplugin.py`). To replace an existing extractor with a subclass of one, set the `plugin_name` class keyword argument (e.g. `class MyPluginIE(ABuiltInIE, plugin_name='myplugin')` will replace `ABuiltInIE` with `MyPluginIE`). Since the extractor replaces the parent, you should exclude the subclass extractor from being imported separately by making it private using one of the methods described above. @@ -1928,7 +1947,7 @@ See the [Developer Instructions](https://github.com/yt-dlp/yt-dlp/blob/master/CO yt-dlp makes the best effort to be a good command-line program, and thus should be callable from any programming language. -Your program should avoid parsing the normal stdout since they may change in future versions. Instead they should use options such as `-J`, `--print`, `--progress-template`, `--exec` etc to create console output that you can reliably reproduce and parse. +Your program should avoid parsing the normal stdout since they may change in future versions. Instead, they should use options such as `-J`, `--print`, `--progress-template`, `--exec` etc to create console output that you can reliably reproduce and parse. 
From a Python program, you can embed yt-dlp in a more powerful fashion, like this: @@ -2137,9 +2156,9 @@ with yt_dlp.YoutubeDL(ydl_opts) as ydl: * **YouTube improvements**: * Supports Clips, Stories (`ytstories:`), Search (including filters)**\***, YouTube Music Search, Channel-specific search, Search prefixes (`ytsearch:`, `ytsearchdate:`)**\***, Mixes, and Feeds (`:ytfav`, `:ytwatchlater`, `:ytsubs`, `:ythistory`, `:ytrec`, `:ytnotif`) * Fix for [n-sig based throttling](https://github.com/ytdl-org/youtube-dl/issues/29326) **\*** - * Supports some (but not all) age-gated content without cookies * Download livestreams from the start using `--live-from-start` (*experimental*) * Channel URLs download all uploads of the channel, including shorts and live + * Support for [logging in with OAuth](https://github.com/yt-dlp/yt-dlp/wiki/Extractors#logging-in-with-oauth) * **Cookies from browser**: Cookies can be automatically extracted from all major web browsers using `--cookies-from-browser BROWSER[+KEYRING][:PROFILE][::CONTAINER]` @@ -2163,9 +2182,9 @@ with yt_dlp.YoutubeDL(ydl_opts) as ydl: * **Output template improvements**: Output templates can now have date-time formatting, numeric offsets, object traversal etc. See [output template](#output-template) for details. 
Even more advanced operations can also be done with the help of `--parse-metadata` and `--replace-in-metadata` -* **Other new options**: Many new options have been added such as `--alias`, `--print`, `--concat-playlist`, `--wait-for-video`, `--retry-sleep`, `--sleep-requests`, `--convert-thumbnails`, `--force-download-archive`, `--force-overwrites`, `--break-match-filter` etc +* **Other new options**: Many new options have been added such as `--alias`, `--print`, `--concat-playlist`, `--wait-for-video`, `--retry-sleep`, `--sleep-requests`, `--convert-thumbnails`, `--force-download-archive`, `--force-overwrites`, `--break-match-filters` etc -* **Improvements**: Regex and other operators in `--format`/`--match-filter`, multiple `--postprocessor-args` and `--downloader-args`, faster archive checking, more [format selection options](#format-selection), merge multi-video/audio, multiple `--config-locations`, `--exec` at different stages, etc +* **Improvements**: Regex and other operators in `--format`/`--match-filters`, multiple `--postprocessor-args` and `--downloader-args`, faster archive checking, more [format selection options](#format-selection), merge multi-video/audio, multiple `--config-locations`, `--exec` at different stages, etc * **Plugins**: Extractors and PostProcessors can be loaded from an external file. 
See [plugins](#plugins) for details @@ -2181,7 +2200,7 @@ Features marked with a **\*** have been back-ported to youtube-dl Some of yt-dlp's default options are different from that of youtube-dl and youtube-dlc: -* yt-dlp supports only [Python 3.8+](## "Windows 7"), and *may* remove support for more versions as they [become EOL](https://devguide.python.org/versions/#python-release-cycle); while [youtube-dl still supports Python 2.6+ and 3.2+](https://github.com/ytdl-org/youtube-dl/issues/30568#issue-1118238743) +* yt-dlp supports only [Python 3.9+](## "Windows 8"), and will remove support for more versions as they [become EOL](https://devguide.python.org/versions/#python-release-cycle); while [youtube-dl still supports Python 2.6+ and 3.2+](https://github.com/ytdl-org/youtube-dl/issues/30568#issue-1118238743) * The options `--auto-number` (`-A`), `--title` (`-t`) and `--literal` (`-l`), no longer work. See [removed options](#Removed) for details * `avconv` is not supported as an alternative to `ffmpeg` * yt-dlp stores config files in slightly different locations to youtube-dl. See [CONFIGURATION](#configuration) for a list of correct locations @@ -2206,20 +2225,29 @@ Some of yt-dlp's default options are different from that of youtube-dl and youtu * `certifi` will be used for SSL root certificates, if installed. If you want to use system certificates (e.g. self-signed), use `--compat-options no-certifi` * yt-dlp's sanitization of invalid characters in filenames is different/smarter than in youtube-dl. You can use `--compat-options filename-sanitization` to revert to youtube-dl's behavior * ~~yt-dlp tries to parse the external downloader outputs into the standard progress output if possible (Currently implemented: [aria2c](https://github.com/yt-dlp/yt-dlp/issues/5931)). You can use `--compat-options no-external-downloader-progress` to get the downloader output as-is~~ -* yt-dlp versions between 2021.09.01 and 2023.01.02 applies `--match-filter` to nested playlists. 
This was an unintentional side-effect of [8f18ac](https://github.com/yt-dlp/yt-dlp/commit/8f18aca8717bb0dd49054555af8d386e5eda3a88) and is fixed in [d7b460](https://github.com/yt-dlp/yt-dlp/commit/d7b460d0e5fc710950582baed2e3fc616ed98a80). Use `--compat-options playlist-match-filter` to revert this +* yt-dlp versions between 2021.09.01 and 2023.01.02 applies `--match-filters` to nested playlists. This was an unintentional side-effect of [8f18ac](https://github.com/yt-dlp/yt-dlp/commit/8f18aca8717bb0dd49054555af8d386e5eda3a88) and is fixed in [d7b460](https://github.com/yt-dlp/yt-dlp/commit/d7b460d0e5fc710950582baed2e3fc616ed98a80). Use `--compat-options playlist-match-filter` to revert this * yt-dlp versions between 2021.11.10 and 2023.06.21 estimated `filesize_approx` values for fragmented/manifest formats. This was added for convenience in [f2fe69](https://github.com/yt-dlp/yt-dlp/commit/f2fe69c7b0d208bdb1f6292b4ae92bc1e1a7444a), but was reverted in [0dff8e](https://github.com/yt-dlp/yt-dlp/commit/0dff8e4d1e6e9fb938f4256ea9af7d81f42fd54f) due to the potentially extreme inaccuracy of the estimated values. Use `--compat-options manifest-filesize-approx` to keep extracting the estimated values * yt-dlp uses modern http client backends such as `requests`. Use `--compat-options prefer-legacy-http-handler` to prefer the legacy http handler (`urllib`) to be used for standard http requests. * The sub-modules `swfinterp`, `casefold` are removed. +* Passing `--simulate` (or calling `extract_info` with `download=False`) no longer alters the default format selection. See [#9843](https://github.com/yt-dlp/yt-dlp/issues/9843) for details. 
For ease of use, a few more compat options are available: -* `--compat-options all`: Use all compat options (Do NOT use) -* `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams,-playlist-match-filter,-manifest-filesize-approx` -* `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect,-playlist-match-filter,-manifest-filesize-approx` +* `--compat-options all`: Use all compat options (**Do NOT use this!**) +* `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams,-playlist-match-filter,-manifest-filesize-approx,-allow-unsafe-ext` +* `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect,-playlist-match-filter,-manifest-filesize-approx,-allow-unsafe-ext` * `--compat-options 2021`: Same as `--compat-options 2022,no-certifi,filename-sanitization,no-youtube-prefer-utc-upload-date` * `--compat-options 2022`: Same as `--compat-options 2023,playlist-match-filter,no-external-downloader-progress,prefer-legacy-http-handler,manifest-filesize-approx` * `--compat-options 2023`: Currently does nothing. Use this to enable all future compat options +The following compat options restore vulnerable behavior from before security patches: + +* `--compat-options allow-unsafe-ext`: Allow files with any extension (including unsafe ones) to be downloaded ([GHSA-79w7-vh3h-8g4j](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-79w7-vh3h-8g4j)) + + > :warning: Only use if a valid file download is rejected because its extension is detected as uncommon + > + > **This option can enable remote code execution! 
Consider [opening an issue]() instead!** + ### Deprecated options These are all the deprecated options and the current alternative to achieve the same effect @@ -2243,13 +2271,13 @@ While these options are redundant, they are still expected to be used due to the --get-thumbnail --print thumbnail -e, --get-title --print title -g, --get-url --print urls - --match-title REGEX --match-filter "title ~= (?i)REGEX" - --reject-title REGEX --match-filter "title !~= (?i)REGEX" - --min-views COUNT --match-filter "view_count >=? COUNT" - --max-views COUNT --match-filter "view_count <=? COUNT" - --break-on-reject Use --break-match-filter - --user-agent UA --add-header "User-Agent:UA" - --referer URL --add-header "Referer:URL" + --match-title REGEX --match-filters "title ~= (?i)REGEX" + --reject-title REGEX --match-filters "title !~= (?i)REGEX" + --min-views COUNT --match-filters "view_count >=? COUNT" + --max-views COUNT --match-filters "view_count <=? COUNT" + --break-on-reject Use --break-match-filters + --user-agent UA --add-headers "User-Agent:UA" + --referer URL --add-headers "Referer:URL" --playlist-start NUMBER -I NUMBER: --playlist-end NUMBER -I :NUMBER --playlist-reverse -I ::-1 diff --git a/bundle/docker/static/entrypoint.sh b/bundle/docker/static/entrypoint.sh index 93d84fa9b7..2202759742 100755 --- a/bundle/docker/static/entrypoint.sh +++ b/bundle/docker/static/entrypoint.sh @@ -2,7 +2,7 @@ set -e source ~/.local/share/pipx/venvs/pyinstaller/bin/activate -python -m devscripts.install_deps --include secretstorage +python -m devscripts.install_deps --include secretstorage --include curl-cffi python -m devscripts.make_lazy_extractors python devscripts/update-version.py -c "${channel}" -r "${origin}" "${version}" python -m bundle.pyinstaller diff --git a/bundle/py2exe.py b/bundle/py2exe.py deleted file mode 100755 index 5b7f4883bc..0000000000 --- a/bundle/py2exe.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 - -# Allow execution from anywhere -import os 
-import sys - -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -import warnings - -from py2exe import freeze - -from devscripts.utils import read_version - -VERSION = read_version() - - -def main(): - warnings.warn( - 'py2exe builds do not support pycryptodomex and needs VC++14 to run. ' - 'It is recommended to run "pyinst.py" to build using pyinstaller instead') - - freeze( - console=[{ - 'script': './yt_dlp/__main__.py', - 'dest_base': 'yt-dlp', - 'icon_resources': [(1, 'devscripts/logo.ico')], - }], - version_info={ - 'version': VERSION, - 'description': 'A feature-rich command-line audio/video downloader', - 'comments': 'Official repository: ', - 'product_name': 'yt-dlp', - 'product_version': VERSION, - }, - options={ - 'bundle_files': 0, - 'compressed': 1, - 'optimize': 2, - 'dist_dir': './dist', - 'excludes': [ - # py2exe cannot import Crypto - 'Crypto', - 'Cryptodome', - # requests >=2.32.0 breaks py2exe builds due to certifi dependency - 'requests', - 'urllib3', - ], - 'dll_excludes': ['w9xpopen.exe', 'crypt32.dll'], - # Modules that are only imported dynamically must be added here - 'includes': ['yt_dlp.compat._legacy', 'yt_dlp.compat._deprecated', - 'yt_dlp.utils._legacy', 'yt_dlp.utils._deprecated'], - }, - zipfile=None, - ) - - -if __name__ == '__main__': - main() diff --git a/devscripts/changelog_override.json b/devscripts/changelog_override.json index 86e8ec2f99..08ea9666ed 100644 --- a/devscripts/changelog_override.json +++ b/devscripts/changelog_override.json @@ -169,5 +169,70 @@ "when": "5c019f6328ad40d66561eac3c4de0b3cd070d0f6", "short": "[cleanup] Misc (#9765)", "authors": ["bashonly", "Grub4K", "seproDev"] + }, + { + "action": "change", + "when": "e6a22834df1776ec4e486526f6df2bf53cb7e06f", + "short": "[ie/orf:on] Add `prefer_segments_playlist` extractor-arg (#10314)", + "authors": ["seproDev"] + }, + { + "action": "add", + "when": "6aaf96a3d6e7d0d426e97e11a2fcf52fda00e733", + "short": "[priority] Security: 
[[CVE-2024-38519](https://nvd.nist.gov/vuln/detail/CVE-2024-38519)] [Properly sanitize file-extension to prevent file system modification and RCE](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-79w7-vh3h-8g4j)\n - Unsafe extensions are now blocked from being downloaded" + }, + { + "action": "add", + "when": "6075a029dba70a89675ae1250e7cdfd91f0eba41", + "short": "[priority] Security: [[ie/douyutv] Do not use dangerous javascript source/URL](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-3v33-3wmw-3785)\n - A dependency on potentially malicious third-party JavaScript code has been removed from the Douyu extractors" + }, + { + "action": "add", + "when": "fb8b7f226d251e521a89b23c415e249e5b788e5c", + "short": "[priority] **The minimum *recommended* Python version has been raised to 3.9**\nSince Python 3.8 will reach end-of-life in October 2024, support for it will be dropped soon. [Read more](https://github.com/yt-dlp/yt-dlp/issues/10086)" + }, + { + "action": "change", + "when": "b31b81d85f00601710d4fac590c3e4efb4133283", + "short": "[ci] Rerun failed tests (#11143)", + "authors": ["Grub4K"] + }, + { + "action": "add", + "when": "a886cf3e900f4a2ec00af705f883539269545609", + "short": "[priority] **py2exe is no longer supported**\nThis release's `yt-dlp_min.exe` will be the last, and it's actually a PyInstaller-bundled executable so that yt-dlp users updating their py2exe build with `-U` will be automatically migrated. [Read more](https://github.com/yt-dlp/yt-dlp/issues/10087)" + }, + { + "action": "add", + "when": "a886cf3e900f4a2ec00af705f883539269545609", + "short": "[priority] **Following this release, yt-dlp's Python dependencies *must* be installed using the `default` group**\nIf you're installing yt-dlp with pip/pipx or requiring yt-dlp in your own Python project, you'll need to specify `yt-dlp[default]` if you want to also install yt-dlp's optional dependencies (which were previously included by default). 
[Read more](https://github.com/yt-dlp/yt-dlp/pull/11255)" + }, + { + "action": "add", + "when": "87884f15580910e4e0fe0e1db73508debc657471", + "short": "[priority] **Beginning with this release, yt-dlp's Python dependencies *must* be installed using the `default` group**\nIf you're installing yt-dlp with pip/pipx or requiring yt-dlp in your own Python project, you'll need to specify `yt-dlp[default]` if you want to also install yt-dlp's optional dependencies (which were previously included by default). [Read more](https://github.com/yt-dlp/yt-dlp/pull/11255)" + }, + { + "action": "add", + "when": "d784464399b600ba9516bbcec6286f11d68974dd", + "short": "[priority] **The minimum *required* Python version has been raised to 3.9**\nPython 3.8 reached its end-of-life on 2024.10.07, and yt-dlp has now removed support for it. As an unfortunate side effect, the official `yt-dlp.exe` and `yt-dlp_x86.exe` binaries are no longer supported on Windows 7. [Read more](https://github.com/yt-dlp/yt-dlp/issues/10086)" + }, + { + "action": "change", + "when": "914af9a0cf51c9a3f74aa88d952bee8334c67511", + "short": "Expand paths in `--plugin-dirs` (#11334)", + "authors": ["bashonly"] + }, + { + "action": "change", + "when": "c29f5a7fae93a08f3cfbb6127b2faa75145b06a0", + "short": "[ie/generic] Do not impersonate by default (#11336)", + "authors": ["bashonly"] + }, + { + "action": "change", + "when": "57212a5f97ce367590aaa5c3e9a135eead8f81f7", + "short": "[ie/vimeo] Fix API retries (#11351)", + "authors": ["bashonly"] } ] diff --git a/devscripts/cli_to_api.py b/devscripts/cli_to_api.py old mode 100644 new mode 100755 index 2aa51eb6e9..9c2710e09f --- a/devscripts/cli_to_api.py +++ b/devscripts/cli_to_api.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Allow direct execution import os import sys diff --git a/devscripts/make_changelog.py b/devscripts/make_changelog.py index 00634fb911..7c876101b4 100644 --- a/devscripts/make_changelog.py +++ b/devscripts/make_changelog.py @@ -71,14 +71,13 @@ 
class CommitGroup(enum.Enum): def get(cls, value: str) -> tuple[CommitGroup | None, str | None]: group, _, subgroup = (group.strip().lower() for group in value.partition('/')) - result = cls.group_lookup().get(group) - if not result: - if subgroup: - return None, value - subgroup = group - result = cls.subgroup_lookup().get(subgroup) + if result := cls.group_lookup().get(group): + return result, subgroup or None - return result, subgroup or None + if subgroup: + return None, value + + return cls.subgroup_lookup().get(group), group or None @dataclass @@ -136,8 +135,7 @@ class Changelog: first = False yield '\n

Changelog

\n' - group = groups[item] - if group: + if group := groups[item]: yield self.format_module(item.value, group) if self._collapsible: @@ -253,7 +251,7 @@ class CommitRange: ''', re.VERBOSE | re.DOTALL) EXTRACTOR_INDICATOR_RE = re.compile(r'(?:Fix|Add)\s+Extractors?', re.IGNORECASE) REVERT_RE = re.compile(r'(?:\[[^\]]+\]\s+)?(?i:Revert)\s+([\da-f]{40})') - FIXES_RE = re.compile(r'(?i:Fix(?:es)?(?:\s+bugs?)?(?:\s+in|\s+for)?|Revert|Improve)\s+([\da-f]{40})') + FIXES_RE = re.compile(r'(?i:(?:bug\s*)?fix(?:es)?(?:\s+bugs?)?(?:\s+in|\s+for)?|Improve)\s+([\da-f]{40})') UPSTREAM_MERGE_RE = re.compile(r'Update to ytdl-commit-([\da-f]+)') def __init__(self, start, end, default_author=None): @@ -287,11 +285,16 @@ class CommitRange: short = next(lines) skip = short.startswith('Release ') or short == '[version] update' + fix_commitish = None + if match := self.FIXES_RE.search(short): + fix_commitish = match.group(1) + authors = [default_author] if default_author else [] for line in iter(lambda: next(lines), self.COMMIT_SEPARATOR): - match = self.AUTHOR_INDICATOR_RE.match(line) - if match: + if match := self.AUTHOR_INDICATOR_RE.match(line): authors = sorted(map(str.strip, line[match.end():].split(',')), key=str.casefold) + if not fix_commitish and (match := self.FIXES_RE.fullmatch(line)): + fix_commitish = match.group(1) commit = Commit(commit_hash, short, authors) if skip and (self._start or not i): @@ -301,21 +304,17 @@ class CommitRange: logger.debug(f'Reached Release commit, breaking: {commit}') break - revert_match = self.REVERT_RE.fullmatch(commit.short) - if revert_match: - reverts[revert_match.group(1)] = commit + if match := self.REVERT_RE.fullmatch(commit.short): + reverts[match.group(1)] = commit continue - fix_match = self.FIXES_RE.search(commit.short) - if fix_match: - commitish = fix_match.group(1) - fixes[commitish].append(commit) + if fix_commitish: + fixes[fix_commitish].append(commit) commits[commit.hash] = commit for commitish, revert_commit in 
reverts.items(): - reverted = commits.pop(commitish, None) - if reverted: + if reverted := commits.pop(commitish, None): logger.debug(f'{commitish} fully reverted {reverted}') else: commits[revert_commit.hash] = revert_commit @@ -461,8 +460,7 @@ def create_changelog(args): logger.info(f'Loaded {len(commits)} commits') - new_contributors = get_new_contributors(args.contributors_path, commits) - if new_contributors: + if new_contributors := get_new_contributors(args.contributors_path, commits): if args.contributors: write_file(args.contributors_path, '\n'.join(new_contributors) + '\n', mode='a') logger.info(f'New contributors: {", ".join(new_contributors)}') diff --git a/devscripts/make_issue_template.py b/devscripts/make_issue_template.py index a5d59f3c03..2a418ddbf7 100644 --- a/devscripts/make_issue_template.py +++ b/devscripts/make_issue_template.py @@ -32,20 +32,29 @@ VERBOSE_TMPL = ''' placeholder: | [debug] Command-line config: ['-vU', 'https://www.youtube.com/watch?v=BaW_jenozKc'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version nightly@... from yt-dlp/yt-dlp [b634ba742] (win_exe) - [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 - [debug] exe versions: ffmpeg N-106550-g072101bd52-20220410 (fdk,setts), ffprobe N-106624-g391ce570c8-20220415, phantomjs 2.1.1 - [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 + [debug] yt-dlp version nightly@... 
from yt-dlp/yt-dlp-nightly-builds [1a176d874] (win_exe) + [debug] Python 3.10.11 (CPython AMD64 64bit) - Windows-10-10.0.20348-SP0 (OpenSSL 1.1.1t 7 Feb 2023) + [debug] exe versions: ffmpeg 7.0.2 (setts), ffprobe 7.0.2 + [debug] Optional libraries: Cryptodome-3.21.0, brotli-1.1.0, certifi-2024.08.30, curl_cffi-0.5.10, mutagen-1.47.0, requests-2.32.3, sqlite3-3.40.1, urllib3-2.2.3, websockets-13.1 [debug] Proxy map: {} - [debug] Request Handlers: urllib, requests - [debug] Loaded 1893 extractors - [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp-nightly-builds/releases/latest + [debug] Request Handlers: urllib, requests, websockets, curl_cffi + [debug] Loaded 1838 extractors + [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest + Latest version: nightly@... from yt-dlp/yt-dlp-nightly-builds yt-dlp is up to date (nightly@... from yt-dlp/yt-dlp-nightly-builds) [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc render: shell validations: required: true + - type: markdown + attributes: + value: | + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. 
'''.strip() NO_SKIP = ''' diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index d74ea202f0..d288d84296 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -2,7 +2,6 @@ # Allow direct execution import os -import shutil import sys sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -34,18 +33,14 @@ MODULE_TEMPLATE = read_file('devscripts/lazy_load_template.py') def main(): - lazy_extractors_filename = get_filename_args(default_outfile='yt_dlp/extractor/lazy_extractors.py') - if os.path.exists(lazy_extractors_filename): - os.remove(lazy_extractors_filename) + os.environ['YTDLP_NO_PLUGINS'] = 'true' + os.environ['YTDLP_NO_LAZY_EXTRACTORS'] = 'true' - _ALL_CLASSES = get_all_ies() # Must be before import + lazy_extractors_filename = get_filename_args(default_outfile='yt_dlp/extractor/lazy_extractors.py') - import yt_dlp.plugins + from yt_dlp.extractor.extractors import _ALL_CLASSES from yt_dlp.extractor.common import InfoExtractor, SearchInfoExtractor - # Filter out plugins - _ALL_CLASSES = [cls for cls in _ALL_CLASSES if not cls.__module__.startswith(f'{yt_dlp.plugins.PACKAGE_NAME}.')] - DummyInfoExtractor = type('InfoExtractor', (InfoExtractor,), {'IE_NAME': NO_ATTR}) module_src = '\n'.join(( MODULE_TEMPLATE, @@ -58,20 +53,6 @@ def main(): write_file(lazy_extractors_filename, f'{module_src}\n') -def get_all_ies(): - PLUGINS_DIRNAME = 'ytdlp_plugins' - BLOCKED_DIRNAME = f'{PLUGINS_DIRNAME}_blocked' - if os.path.exists(PLUGINS_DIRNAME): - # os.rename cannot be used, e.g. in Docker. 
See https://github.com/yt-dlp/yt-dlp/pull/4958 - shutil.move(PLUGINS_DIRNAME, BLOCKED_DIRNAME) - try: - from yt_dlp.extractor.extractors import _ALL_CLASSES - finally: - if os.path.exists(BLOCKED_DIRNAME): - shutil.move(BLOCKED_DIRNAME, PLUGINS_DIRNAME) - return _ALL_CLASSES - - def extra_ie_code(ie, base=None): for var in STATIC_CLASS_PROPERTIES: val = getattr(ie, var) diff --git a/devscripts/run_tests.py b/devscripts/run_tests.py index c605aa62cf..eb614fe591 100755 --- a/devscripts/run_tests.py +++ b/devscripts/run_tests.py @@ -16,7 +16,7 @@ fix_test_name = functools.partial(re.compile(r'IE(_all|_\d+)?$').sub, r'\1') def parse_args(): parser = argparse.ArgumentParser(description='Run selected yt-dlp tests') parser.add_argument( - 'test', help='a extractor tests, or one of "core" or "download"', nargs='*') + 'test', help='an extractor test, test path, or one of "core" or "download"', nargs='*') parser.add_argument( '-k', help='run a test matching EXPRESSION. Same as "pytest -k"', metavar='EXPRESSION') parser.add_argument( @@ -27,7 +27,6 @@ def parse_args(): def run_tests(*tests, pattern=None, ci=False): run_core = 'core' in tests or (not pattern and not tests) run_download = 'download' in tests - tests = list(map(fix_test_name, tests)) pytest_args = args.pytest_args or os.getenv('HATCH_TEST_ARGS', '') arguments = ['pytest', '-Werror', '--tb=short', *shlex.split(pytest_args)] @@ -41,7 +40,9 @@ def run_tests(*tests, pattern=None, ci=False): arguments.extend(['-m', 'download']) else: arguments.extend( - f'test/test_download.py::TestDownload::test_{test}' for test in tests) + test if '/' in test + else f'test/test_download.py::TestDownload::test_{fix_test_name(test)}' + for test in tests) print(f'Running {arguments}', flush=True) try: diff --git a/pyproject.toml b/pyproject.toml index 01162b794c..55bd55bb9e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,10 +9,11 @@ maintainers = [ {name = "Grub4K", email = "contact@grub4k.xyz"}, {name = "bashonly", email = 
"bashonly@protonmail.com"}, {name = "coletdjnz", email = "coletdjnz@protonmail.com"}, + {name = "sepro", email = "sepro@sepr0.com"}, ] description = "A feature-rich command-line audio/video downloader" readme = "README.md" -requires-python = ">=3.8" +requires-python = ">=3.9" keywords = [ "youtube-dl", "video-downloader", @@ -28,11 +29,11 @@ classifiers = [ "Environment :: Console", "Programming Language :: Python", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", "Programming Language :: Python :: Implementation", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", @@ -40,7 +41,10 @@ classifiers = [ "Operating System :: OS Independent", ] dynamic = ["version"] -dependencies = [ +dependencies = [] + +[project.optional-dependencies] +default = [ "brotli; implementation_name=='cpython'", "brotlicffi; implementation_name!='cpython'", "certifi", @@ -48,12 +52,12 @@ dependencies = [ "pycryptodomex", "requests>=2.32.2,<3", "urllib3>=1.26.17,<3", - "websockets>=12.0", + "websockets>=13.0", +] +curl-cffi = [ + "curl-cffi==0.5.10; os_name=='nt' and implementation_name=='cpython'", + "curl-cffi>=0.5.10,!=0.6.*,<0.7.2; os_name!='nt' and implementation_name=='cpython'", ] - -[project.optional-dependencies] -default = [] -curl-cffi = ["curl-cffi==0.5.10; implementation_name=='cpython'"] secretstorage = [ "cffi", "secretstorage", @@ -62,7 +66,7 @@ build = [ "build", "hatchling", "pip", - "setuptools", + "setuptools>=71.0.2", # 71.0.0 broke pyinstaller "wheel", ] dev = [ @@ -72,16 +76,14 @@ dev = [ ] static-analysis = [ "autopep8~=2.0", - "ruff~=0.4.4", + "ruff~=0.7.0", ] test = [ "pytest~=8.1", + "pytest-rerunfailures~=14.0", ] pyinstaller = [ - 
"pyinstaller>=6.7.0", # for compat with setuptools>=70 -] -py2exe = [ - "py2exe>=0.12", + "pyinstaller>=6.10.0", # Windows temp cleanup fixed in 6.10.0 ] [project.urls] @@ -158,7 +160,6 @@ lint-fix = "ruff check --fix {args:.}" features = ["test"] dependencies = [ "pytest-randomly~=3.15", - "pytest-rerunfailures~=14.0", "pytest-xdist[psutil]~=3.5", ] @@ -168,13 +169,11 @@ run-cov = "echo Code coverage not implemented && exit 1" [[tool.hatch.envs.hatch-test.matrix]] python = [ - "3.8", "3.9", "3.10", "3.11", "3.12", - "pypy3.8", - "pypy3.9", + "3.13", "pypy3.10", ] @@ -211,6 +210,7 @@ ignore = [ "TD002", # missing-todo-author "TD003", # missing-todo-link "PLE0604", # invalid-all-object (false positives) + "PLE0643", # potential-index-error (false positives) "PLW0603", # global-statement "PLW1510", # subprocess-run-without-check "PLW2901", # redefined-loop-name @@ -298,7 +298,7 @@ banned-from = [ "string", "sys", "time", - "urllib", + "urllib.parse", "uuid", "xml", ] diff --git a/setup.cfg b/setup.cfg index 340cc3b4d9..20d40cd303 100644 --- a/setup.cfg +++ b/setup.cfg @@ -16,7 +16,7 @@ remove-unused-variables = true [tox:tox] skipsdist = true -envlist = py{38,39,310,311,312},pypy{38,39,310} +envlist = py{39,310,311,312,313},pypy310 skip_missing_interpreters = true [testenv] # tox @@ -29,7 +29,7 @@ setenv = [isort] -py_version = 38 +py_version = 39 multi_line_output = VERTICAL_HANGING_INDENT line_length = 80 reverse_relative = true diff --git a/supportedsites.md b/supportedsites.md index 3873956133..7b22e8c6fa 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -45,9 +45,6 @@ - **aenetworks:collection** - **aenetworks:show** - **AeonCo** - - **afreecatv**: [*afreecatv*](## "netrc machine") afreecatv.com - - **afreecatv:live**: [*afreecatv*](## "netrc machine") afreecatv.com livestreams - - **afreecatv:user** - **AirTV** - **AitubeKZVideo** - **AliExpressLive** @@ -142,6 +139,7 @@ - **BBVTV**: [*bbvtv*](## "netrc machine") - **BBVTVLive**: [*bbvtv*](## "netrc 
machine") - **BBVTVRecordings**: [*bbvtv*](## "netrc machine") + - **BeaconTv** - **BeatBumpPlaylist** - **BeatBumpVideo** - **Beatport** @@ -252,6 +250,7 @@ - **CCMA** - **CCTV**: 央视网 - **CDA**: [*cdapl*](## "netrc machine") + - **CDAFolder** - **Cellebrite** - **CeskaTelevize** - **CGTN** @@ -353,7 +352,6 @@ - **DigitallySpeaking** - **Digiteka** - **DiscogsReleasePlaylist** - - **Discovery** - **DiscoveryLife** - **DiscoveryNetworksDe** - **DiscoveryPlus** @@ -362,7 +360,6 @@ - **DiscoveryPlusItaly** - **DiscoveryPlusItalyShow** - **Disney** - - **DIYNetwork** - **dlf** - **dlf:corpus**: DLF Multi-feed Archives - **dlive:stream** @@ -506,6 +503,7 @@ - **gem.cbc.ca:playlist** - **Genius** - **GeniusLyrics** + - **Germanupa**: germanupa.de - **GetCourseRu**: [*getcourseru*](## "netrc machine") - **GetCourseRuPlayer** - **Gettr** @@ -515,7 +513,6 @@ - **GlattvisionTVLive**: [*glattvisiontv*](## "netrc machine") - **GlattvisionTVRecordings**: [*glattvisiontv*](## "netrc machine") - **Glide**: Glide mobile video messages (glide.me) - - **GlobalCyclingNetworkPlus** - **GlobalPlayerAudio** - **GlobalPlayerAudioEpisode** - **GlobalPlayerLive** @@ -542,6 +539,7 @@ - **Goshgay** - **GoToStage** - **GPUTechConf** + - **Graspop** - **Gronkh** - **gronkh:feed** - **gronkh:vods** @@ -581,6 +579,7 @@ - **HungamaAlbumPlaylist** - **HungamaSong** - **huya:live**: huya.com + - **huya:video**: 虎牙视频 - **Hypem** - **Hytale** - **Icareus** @@ -656,10 +655,12 @@ - **Ketnet** - **khanacademy** - **khanacademy:unit** - - **Kick** + - **kick:clips** + - **kick:live** + - **kick:vod** - **Kicker** - **KickStarter** - - **KickVOD** + - **Kika**: KiKA.de - **kinja:embed** - **KinoPoisk** - **Kommunetv** @@ -678,6 +679,8 @@ - **la7.it** - **la7.it:​pod:episode** - **la7.it:podcast** + - **laracasts** + - **laracasts:series** - **LastFM** - **LastFMPlaylist** - **LastFMUser** @@ -689,6 +692,7 @@ - **Lcp** - **LcpPlay** - **Le**: 乐视网 + - **LearningOnScreen** - **Lecture2Go**: (**Currently 
broken**) - **Lecturio**: [*lecturio*](## "netrc machine") - **LecturioCourse**: [*lecturio*](## "netrc machine") @@ -719,7 +723,6 @@ - **livestream:original** - **Livestreamfails** - **Lnk** - - **LnkGo** - **loc**: Library of Congress - **loom** - **loom:folder** @@ -753,7 +756,7 @@ - **Masters** - **MatchTV** - **MBN**: mbn.co.kr (매일방송) - - **MDR**: MDR.DE and KiKA + - **MDR**: MDR.DE - **MedalTV** - **media.ccc.de** - **media.ccc.de:lists** @@ -775,7 +778,12 @@ - **MelonVOD** - **Metacritic** - **mewatch** + - **MicrosoftBuild** - **MicrosoftEmbed** + - **MicrosoftLearnEpisode** + - **MicrosoftLearnPlaylist** + - **MicrosoftLearnSession** + - **MicrosoftMedius** - **microsoftstream**: Microsoft Stream - **mildom**: Record ongoing live by specific user in Mildom - **mildom:clip**: Clip in Mildom @@ -803,6 +811,7 @@ - **MNetTVLive**: [*mnettv*](## "netrc machine") - **MNetTVRecordings**: [*mnettv*](## "netrc machine") - **MochaVideo** + - **Mojevideo**: mojevideo.sk - **Mojvideo** - **Monstercat** - **MonsterSirenHypergryphMusic** @@ -811,8 +820,6 @@ - **MotherlessGroup** - **MotherlessUploader** - **Motorsport**: motorsport.com (**Currently broken**) - - **MotorTrend** - - **MotorTrendOnDemand** - **MovieFap** - **Moviepilot** - **MoviewPlay** @@ -830,7 +837,7 @@ - **MTVUutisetArticle**: (**Currently broken**) - **MuenchenTV**: münchen.tv (**Currently broken**) - **MujRozhlas** - - **Murrtube**: (**Currently broken**) + - **Murrtube** - **MurrtubeUser**: Murrtube user profile (**Currently broken**) - **MuseAI** - **MuseScore** @@ -838,8 +845,6 @@ - **MusicdexArtist** - **MusicdexPlaylist** - **MusicdexSong** - - **mva**: Microsoft Virtual Academy videos - - **mva:course**: Microsoft Virtual Academy courses - **Mx3** - **Mx3Neo** - **Mx3Volksmusik** @@ -1038,8 +1043,8 @@ - **Parler**: Posts on parler.com - **parliamentlive.tv**: UK parliament videos - **Parlview**: (**Currently broken**) - - **Patreon** - - **PatreonCampaign** + - **patreon** + - 
**patreon:campaign** - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! 
(WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) - **PBSKids** - 
**PearVideo** @@ -1131,13 +1136,13 @@ - **QingTing** - **qqmusic**: QQ音乐 - **qqmusic:album**: QQ音乐 - 专辑 + - **qqmusic:mv**: QQ音乐 - MV - **qqmusic:playlist**: QQ音乐 - 歌单 - **qqmusic:singer**: QQ音乐 - 歌手 - **qqmusic:toplist**: QQ音乐 - 排行榜 - **QuantumTV**: [*quantumtv*](## "netrc machine") - **QuantumTVLive**: [*quantumtv*](## "netrc machine") - **QuantumTVRecordings**: [*quantumtv*](## "netrc machine") - - **Qub** - **R7**: (**Currently broken**) - **R7Article**: (**Currently broken**) - **Radiko** @@ -1237,6 +1242,7 @@ - **rtve.es:television** - **RTVS** - **rtvslo.si** + - **rtvslo.si:show** - **RudoVideo** - **Rule34Video** - **Rumble** @@ -1280,12 +1286,14 @@ - **Screencast** - **Screencastify** - **ScreencastOMatic** + - **ScreenRec** - **ScrippsNetworks** - **scrippsnetworks:watch** - **Scrolller** - **SCTE**: [*scte*](## "netrc machine") (**Currently broken**) - **SCTECourse**: [*scte*](## "netrc machine") (**Currently broken**) - **sejm** + - **Sen** - **SenalColombiaLive**: (**Currently broken**) - **SenateGov** - **SenateISVP** @@ -1322,11 +1330,16 @@ - **SlidesLive** - **Slutload** - **Smotrim** + - **SnapchatSpotlight** - **Snotr** - **Sohu** - **SohuV** - **SonyLIV**: [*sonyliv*](## "netrc machine") - **SonyLIVSeries** + - **soop**: [*afreecatv*](## "netrc machine") sooplive.co.kr + - **soop:catchstory**: [*afreecatv*](## "netrc machine") sooplive.co.kr catch story + - **soop:live**: [*afreecatv*](## "netrc machine") sooplive.co.kr livestreams + - **soop:user**: [*afreecatv*](## "netrc machine") - **soundcloud**: [*soundcloud*](## "netrc machine") - **soundcloud:playlist**: [*soundcloud*](## "netrc machine") - **soundcloud:related**: [*soundcloud*](## "netrc machine") @@ -1360,6 +1373,7 @@ - **SpreakerShowPage** - **SpringboardPlatform** - **Sprout** + - **SproutVideo** - **sr:mediathek**: Saarländischer Rundfunk (**Currently broken**) - **SRGSSR** - **SRGSSRPlay**: srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites @@ -1494,8 +1508,8 @@ - 
**Tube8**: (**Currently broken**) - **TubeTuGraz**: [*tubetugraz*](## "netrc machine") tube.tugraz.at - **TubeTuGrazSeries**: [*tubetugraz*](## "netrc machine") - - **TubiTv**: [*tubitv*](## "netrc machine") - - **TubiTvShow** + - **tubitv**: [*tubitv*](## "netrc machine") + - **tubitv:series** - **Tumblr**: [*tumblr*](## "netrc machine") - **TuneInPodcast** - **TuneInPodcastEpisode** @@ -1512,9 +1526,9 @@ - **tv5unis** - **tv5unis:video** - **tv8.it** - - **TVA** - **TVANouvelles** - **TVANouvellesArticle** + - **tvaplus**: TVA+ - **TVC** - **TVCArticle** - **TVer** @@ -1602,11 +1616,14 @@ - **videomore:season** - **videomore:video** - **VideoPress** + - **Vidflex** - **Vidio**: [*vidio*](## "netrc machine") - **VidioLive**: [*vidio*](## "netrc machine") - **VidioPremier**: [*vidio*](## "netrc machine") - **VidLii** - **Vidly** + - **vids.io** + - **Vidyard** - **viewlift** - **viewlift:embed** - **Viidea** @@ -1654,6 +1671,8 @@ - **VRT**: VRT NWS, Flanders News, Flandern Info and Sporza - **VrtNU**: [*vrtnu*](## "netrc machine") VRT MAX - **VTM**: (**Currently broken**) + - **VTV** + - **VTVGo** - **VTXTV**: [*vtxtv*](## "netrc machine") - **VTXTVLive**: [*vtxtv*](## "netrc machine") - **VTXTVRecordings**: [*vtxtv*](## "netrc machine") @@ -1726,7 +1745,7 @@ - **XiaoHongShu**: 小红书 - **ximalaya**: 喜马拉雅FM - **ximalaya:album**: 喜马拉雅FM 专辑 - - **xinpianchang**: xinpianchang.com (**Currently broken**) + - **Xinpianchang**: 新片场 - **XMinus**: (**Currently broken**) - **XNXX** - **Xstream** @@ -1760,24 +1779,24 @@ - **YouPornStar**: YouPorn Pornstar, with description, sorting and pagination - **YouPornTag**: YouPorn tag (porntags), with sorting, filtering and pagination - **YouPornVideos**: YouPorn video (browse) playlists, with sorting, filtering and pagination - - **youtube**: YouTube - - **youtube:clip** - - **youtube:favorites**: YouTube liked videos; ":ytfav" keyword (requires cookies) - - **youtube:history**: Youtube watch history; ":ythis" keyword (requires cookies) 
- - **youtube:​music:search_url**: YouTube music search URLs with selectable sections, e.g. #songs - - **youtube:notif**: YouTube notifications; ":ytnotif" keyword (requires cookies) - - **youtube:playlist**: YouTube playlists - - **youtube:recommended**: YouTube recommended videos; ":ytrec" keyword - - **youtube:search**: YouTube search; "ytsearch:" prefix - - **youtube:​search:date**: YouTube search, newest videos first; "ytsearchdate:" prefix - - **youtube:search_url**: YouTube search URLs with sorting and filter support - - **youtube:​shorts:pivot:audio**: YouTube Shorts audio pivot (Shorts using audio of a given video) - - **youtube:subscriptions**: YouTube subscriptions feed; ":ytsubs" keyword (requires cookies) - - **youtube:tab**: YouTube Tabs - - **youtube:user**: YouTube user videos; "ytuser:" prefix - - **youtube:watchlater**: Youtube watch later list; ":ytwatchlater" keyword (requires cookies) - - **YoutubeLivestreamEmbed**: YouTube livestream embeds - - **YoutubeYtBe**: youtu.be + - **youtube**: [*youtube*](## "netrc machine") YouTube + - **youtube:clip**: [*youtube*](## "netrc machine") + - **youtube:favorites**: [*youtube*](## "netrc machine") YouTube liked videos; ":ytfav" keyword (requires cookies) + - **youtube:history**: [*youtube*](## "netrc machine") Youtube watch history; ":ythis" keyword (requires cookies) + - **youtube:​music:search_url**: [*youtube*](## "netrc machine") YouTube music search URLs with selectable sections, e.g. 
#songs + - **youtube:notif**: [*youtube*](## "netrc machine") YouTube notifications; ":ytnotif" keyword (requires cookies) + - **youtube:playlist**: [*youtube*](## "netrc machine") YouTube playlists + - **youtube:recommended**: [*youtube*](## "netrc machine") YouTube recommended videos; ":ytrec" keyword + - **youtube:search**: [*youtube*](## "netrc machine") YouTube search; "ytsearch:" prefix + - **youtube:​search:date**: [*youtube*](## "netrc machine") YouTube search, newest videos first; "ytsearchdate:" prefix + - **youtube:search_url**: [*youtube*](## "netrc machine") YouTube search URLs with sorting and filter support + - **youtube:​shorts:pivot:audio**: [*youtube*](## "netrc machine") YouTube Shorts audio pivot (Shorts using audio of a given video) + - **youtube:subscriptions**: [*youtube*](## "netrc machine") YouTube subscriptions feed; ":ytsubs" keyword (requires cookies) + - **youtube:tab**: [*youtube*](## "netrc machine") YouTube Tabs + - **youtube:user**: [*youtube*](## "netrc machine") YouTube user videos; "ytuser:" prefix + - **youtube:watchlater**: [*youtube*](## "netrc machine") Youtube watch later list; ":ytwatchlater" keyword (requires cookies) + - **YoutubeLivestreamEmbed**: [*youtube*](## "netrc machine") YouTube livestream embeds + - **YoutubeYtBe**: [*youtube*](## "netrc machine") youtu.be - **Zaiko** - **ZaikoETicket** - **Zapiks** diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 31e8f82448..54f35ef552 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -53,6 +53,18 @@ class TestInfoExtractor(unittest.TestCase): def test_ie_key(self): self.assertEqual(get_info_extractor(YoutubeIE.ie_key()), YoutubeIE) + def test_get_netrc_login_info(self): + for params in [ + {'usenetrc': True, 'netrc_location': './test/testdata/netrc/netrc'}, + {'netrc_cmd': f'{sys.executable} ./test/testdata/netrc/print_netrc.py'}, + ]: + ie = DummyIE(FakeYDL(params)) + 
self.assertEqual(ie._get_netrc_login_info(netrc_machine='normal_use'), ('user', 'pass')) + self.assertEqual(ie._get_netrc_login_info(netrc_machine='empty_user'), ('', 'pass')) + self.assertEqual(ie._get_netrc_login_info(netrc_machine='empty_pass'), ('user', '')) + self.assertEqual(ie._get_netrc_login_info(netrc_machine='both_empty'), ('', '')) + self.assertEqual(ie._get_netrc_login_info(netrc_machine='nonexistent'), (None, None)) + def test_html_search_regex(self): html = '

Watch this video

' search = lambda re, *args: self.ie._html_search_regex(re, html, *args) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 841ce1af3e..a99e624080 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -4,6 +4,7 @@ import os import sys import unittest +from unittest.mock import patch sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -235,6 +236,35 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'vid-vcodec-dot') + def test_format_selection_by_vcodec_sort(self): + formats = [ + {'format_id': 'av1-format', 'ext': 'mp4', 'vcodec': 'av1', 'acodec': 'none', 'url': TEST_URL}, + {'format_id': 'vp9-hdr-format', 'ext': 'mp4', 'vcodec': 'vp09.02.50.10.01.09.18.09.00', 'acodec': 'none', 'url': TEST_URL}, + {'format_id': 'vp9-sdr-format', 'ext': 'mp4', 'vcodec': 'vp09.00.50.08', 'acodec': 'none', 'url': TEST_URL}, + {'format_id': 'h265-format', 'ext': 'mp4', 'vcodec': 'h265', 'acodec': 'none', 'url': TEST_URL}, + ] + info_dict = _make_result(formats) + + ydl = YDL({'format': 'bestvideo', 'format_sort': ['vcodec:vp9.2']}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'vp9-hdr-format') + + ydl = YDL({'format': 'bestvideo', 'format_sort': ['vcodec:vp9']}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'vp9-sdr-format') + + ydl = YDL({'format': 'bestvideo', 'format_sort': ['+vcodec:vp9.2']}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'vp9-hdr-format') + + ydl = YDL({'format': 'bestvideo', 'format_sort': ['+vcodec:vp9']}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'vp9-sdr-format') + def 
test_format_selection_string_ops(self): formats = [ {'format_id': 'abc-cba', 'ext': 'mp4', 'url': TEST_URL}, @@ -520,7 +550,33 @@ class TestFormatSelection(unittest.TestCase): ydl.process_ie_result(info_dict) self.assertEqual(ydl.downloaded_info_dicts, []) - def test_default_format_spec(self): + @patch('yt_dlp.postprocessor.ffmpeg.FFmpegMergerPP.available', False) + def test_default_format_spec_without_ffmpeg(self): + ydl = YDL({}) + self.assertEqual(ydl._default_format_spec({}), 'best/bestvideo+bestaudio') + + ydl = YDL({'simulate': True}) + self.assertEqual(ydl._default_format_spec({}), 'best/bestvideo+bestaudio') + + ydl = YDL({}) + self.assertEqual(ydl._default_format_spec({'is_live': True}), 'best/bestvideo+bestaudio') + + ydl = YDL({'simulate': True}) + self.assertEqual(ydl._default_format_spec({'is_live': True}), 'best/bestvideo+bestaudio') + + ydl = YDL({'outtmpl': '-'}) + self.assertEqual(ydl._default_format_spec({}), 'best/bestvideo+bestaudio') + + ydl = YDL({}) + self.assertEqual(ydl._default_format_spec({}), 'best/bestvideo+bestaudio') + self.assertEqual(ydl._default_format_spec({'is_live': True}), 'best/bestvideo+bestaudio') + + @patch('yt_dlp.postprocessor.ffmpeg.FFmpegMergerPP.available', True) + @patch('yt_dlp.postprocessor.ffmpeg.FFmpegMergerPP.can_merge', lambda _: True) + def test_default_format_spec_with_ffmpeg(self): + ydl = YDL({}) + self.assertEqual(ydl._default_format_spec({}), 'bestvideo*+bestaudio/best') + ydl = YDL({'simulate': True}) self.assertEqual(ydl._default_format_spec({}), 'bestvideo*+bestaudio/best') @@ -528,13 +584,13 @@ class TestFormatSelection(unittest.TestCase): self.assertEqual(ydl._default_format_spec({'is_live': True}), 'best/bestvideo+bestaudio') ydl = YDL({'simulate': True}) - self.assertEqual(ydl._default_format_spec({'is_live': True}), 'bestvideo*+bestaudio/best') + self.assertEqual(ydl._default_format_spec({'is_live': True}), 'best/bestvideo+bestaudio') ydl = YDL({'outtmpl': '-'}) 
self.assertEqual(ydl._default_format_spec({}), 'best/bestvideo+bestaudio') ydl = YDL({}) - self.assertEqual(ydl._default_format_spec({}, download=False), 'bestvideo*+bestaudio/best') + self.assertEqual(ydl._default_format_spec({}), 'bestvideo*+bestaudio/best') self.assertEqual(ydl._default_format_spec({'is_live': True}), 'best/bestvideo+bestaudio') diff --git a/test/test_cookies.py b/test/test_cookies.py index a682fee1d3..e1271f67eb 100644 --- a/test/test_cookies.py +++ b/test/test_cookies.py @@ -67,6 +67,7 @@ class TestCookies(unittest.TestCase): ({'XDG_CURRENT_DESKTOP': 'GNOME'}, _LinuxDesktopEnvironment.GNOME), ({'XDG_CURRENT_DESKTOP': 'GNOME:GNOME-Classic'}, _LinuxDesktopEnvironment.GNOME), ({'XDG_CURRENT_DESKTOP': 'GNOME : GNOME-Classic'}, _LinuxDesktopEnvironment.GNOME), + ({'XDG_CURRENT_DESKTOP': 'ubuntu:GNOME'}, _LinuxDesktopEnvironment.GNOME), ({'XDG_CURRENT_DESKTOP': 'Unity', 'DESKTOP_SESSION': 'gnome-fallback'}, _LinuxDesktopEnvironment.GNOME), ({'XDG_CURRENT_DESKTOP': 'KDE', 'KDE_SESSION_VERSION': '5'}, _LinuxDesktopEnvironment.KDE5), diff --git a/test/test_download.py b/test/test_download.py index 882d545650..3f36869d9d 100755 --- a/test/test_download.py +++ b/test/test_download.py @@ -20,7 +20,6 @@ from test.helper import ( gettestcases, getwebpagetestcases, is_download_test, - report_warning, try_rm, ) @@ -178,8 +177,7 @@ def generator(test_case, tname): raise if try_num == RETRIES: - report_warning(f'{tname} failed due to network errors, skipping...') - return + raise print(f'Retrying: {try_num} failed tries\n\n##########\n\n') diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 86928a6a02..06840ed85c 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -92,6 +92,7 @@ class TestJSInterpreter(unittest.TestCase): self._test('function f(){return 0 && 1 || 2;}', 2) self._test('function f(){return 0 ?? 
42;}', 0) self._test('function f(){return "life, the universe and everything" < 42;}', False) + self._test('function f(){return 0 - 7 * - 6;}', 42) def test_array_access(self): self._test('function f(){var x = [1,2,3]; x[0] = 4; x[0] = 5; x[2.0] = 7; return x;}', [5, 2, 7]) @@ -375,6 +376,61 @@ class TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter('''function f(p,a,c,k,e,d){while(c--)if(k[c])p=p.replace(new RegExp('\\b'+c.toString(a)+'\\b','g'),k[c]);return p}''') self.assertEqual(jsi.call_function('f', '''h 7=g("1j");7.7h({7g:[{33:"w://7f-7e-7d-7c.v.7b/7a/79/78/77/76.74?t=73&s=2s&e=72&f=2t&71=70.0.0.1&6z=6y&6x=6w"}],6v:"w://32.v.u/6u.31",16:"r%",15:"r%",6t:"6s",6r:"",6q:"l",6p:"l",6o:"6n",6m:\'6l\',6k:"6j",9:[{33:"/2u?b=6i&n=50&6h=w://32.v.u/6g.31",6f:"6e"}],1y:{6d:1,6c:\'#6b\',6a:\'#69\',68:"67",66:30,65:r,},"64":{63:"%62 2m%m%61%5z%5y%5x.u%5w%5v%5u.2y%22 2k%m%1o%22 5t%m%1o%22 5s%m%1o%22 2j%m%5r%22 16%m%5q%22 15%m%5p%22 5o%2z%5n%5m%2z",5l:"w://v.u/d/1k/5k.2y",5j:[]},\'5i\':{"5h":"5g"},5f:"5e",5d:"w://v.u",5c:{},5b:l,1x:[0.25,0.50,0.75,1,1.25,1.5,2]});h 1m,1n,5a;h 59=0,58=0;h 7=g("1j");h 2x=0,57=0,56=0;$.55({54:{\'53-52\':\'2i-51\'}});7.j(\'4z\',6(x){c(5>0&&x.1l>=5&&1n!=1){1n=1;$(\'q.4y\').4x(\'4w\')}});7.j(\'13\',6(x){2x=x.1l});7.j(\'2g\',6(x){2w(x)});7.j(\'4v\',6(){$(\'q.2v\').4u()});6 2w(x){$(\'q.2v\').4t();c(1m)19;1m=1;17=0;c(4s.4r===l){17=1}$.4q(\'/2u?b=4p&2l=1k&4o=2t-4n-4m-2s-4l&4k=&4j=&4i=&17=\'+17,6(2r){$(\'#4h\').4g(2r)});$(\'.3-8-4f-4e:4d("4c")\').2h(6(e){2q();g().4b(0);g().4a(l)});6 2q(){h $14=$("").2p({1l:"49",16:"r%",15:"r%",48:0,2n:0,2o:47,46:"45(10%, 10%, 10%, 0.4)","44-43":"42"});$("<41 />").2p({16:"60%",15:"60%",2o:40,"3z-2n":"3y"}).3x({\'2m\':\'/?b=3w&2l=1k\',\'2k\':\'0\',\'2j\':\'2i\'}).2f($14);$14.2h(6(){$(3v).3u();g().2g()});$14.2f($(\'#1j\'))}g().13(0);}6 3t(){h 9=7.1b(2e);2d.2c(9);c(9.n>1){1r(i=0;i<9.n;i++){c(9[i].1a==2e){2d.2c(\'!!=\'+i);7.1p(i)}}}}7.j(\'3s\',6(){g().1h("/2a/3r.29","3q 10 
28",6(){g().13(g().27()+10)},"2b");$("q[26=2b]").23().21(\'.3-20-1z\');g().1h("/2a/3p.29","3o 10 28",6(){h 12=g().27()-10;c(12<0)12=0;g().13(12)},"24");$("q[26=24]").23().21(\'.3-20-1z\');});6 1i(){}7.j(\'3n\',6(){1i()});7.j(\'3m\',6(){1i()});7.j("k",6(y){h 9=7.1b();c(9.n<2)19;$(\'.3-8-3l-3k\').3j(6(){$(\'#3-8-a-k\').1e(\'3-8-a-z\');$(\'.3-a-k\').p(\'o-1f\',\'11\')});7.1h("/3i/3h.3g","3f 3e",6(){$(\'.3-1w\').3d(\'3-8-1v\');$(\'.3-8-1y, .3-8-1x\').p(\'o-1g\',\'11\');c($(\'.3-1w\').3c(\'3-8-1v\')){$(\'.3-a-k\').p(\'o-1g\',\'l\');$(\'.3-a-k\').p(\'o-1f\',\'l\');$(\'.3-8-a\').1e(\'3-8-a-z\');$(\'.3-8-a:1u\').3b(\'3-8-a-z\')}3a{$(\'.3-a-k\').p(\'o-1g\',\'11\');$(\'.3-a-k\').p(\'o-1f\',\'11\');$(\'.3-8-a:1u\').1e(\'3-8-a-z\')}},"39");7.j("38",6(y){1d.37(\'1c\',y.9[y.36].1a)});c(1d.1t(\'1c\')){35("1s(1d.1t(\'1c\'));",34)}});h 18;6 1s(1q){h 9=7.1b();c(9.n>1){1r(i=0;i<9.n;i++){c(9[i].1a==1q){c(i==18){19}18=i;7.1p(i)}}}}',36,270,'|||jw|||function|player|settings|tracks|submenu||if||||jwplayer|var||on|audioTracks|true|3D|length|aria|attr|div|100|||sx|filemoon|https||event|active||false|tt|seek|dd|height|width|adb|current_audio|return|name|getAudioTracks|default_audio|localStorage|removeClass|expanded|checked|addButton|callMeMaybe|vplayer|0fxcyc2ajhp1|position|vvplay|vvad|220|setCurrentAudioTrack|audio_name|for|audio_set|getItem|last|open|controls|playbackRates|captions|rewind|icon|insertAfter||detach|ff00||button|getPosition|sec|png|player8|ff11|log|console|track_name|appendTo|play|click|no|scrolling|frameborder|file_code|src|top|zIndex|css|showCCform|data|1662367683|383371|dl|video_ad|doPlay|prevt|mp4|3E||jpg|thumbs|file|300|setTimeout|currentTrack|setItem|audioTrackChanged|dualSound|else|addClass|hasClass|toggleClass|Track|Audio|svg|dualy|images|mousedown|buttons|topbar|playAttemptFailed|beforePlay|Rewind|fr|Forward|ff|ready|set_audio_track|remove|this|upload_srt|prop|50px|margin|1000001|iframe|center|align|text|rgba|background|1000000|left|absolute|pause|setCurrentCaptions|
Upload|contains|item|content|html|fviews|referer|prem|embed|3e57249ef633e0d03bf76ceb8d8a4b65|216|83|hash|view|get|TokenZir|window|hide|show|complete|slow|fadeIn|video_ad_fadein|time||cache|Cache|Content|headers|ajaxSetup|v2done|tott|vastdone2|vastdone1|vvbefore|playbackRateControls|cast|aboutlink|FileMoon|abouttext|UHD|1870|qualityLabels|sites|GNOME_POWER|link|2Fiframe|3C|allowfullscreen|22360|22640|22no|marginheight|marginwidth|2FGNOME_POWER|2F0fxcyc2ajhp1|2Fe|2Ffilemoon|2F|3A||22https|3Ciframe|code|sharing|fontOpacity|backgroundOpacity|Tahoma|fontFamily|303030|backgroundColor|FFFFFF|color|userFontScale|thumbnails|kind|0fxcyc2ajhp10000|url|get_slides|start|startparam|none|preload|html5|primary|hlshtml|androidhls|duration|uniform|stretching|0fxcyc2ajhp1_xt|image|2048|sp|6871|asn|127|srv|43200|_g3XlBcu2lmD9oDexD2NLWSmah2Nu3XcDrl93m9PwXY|m3u8||master|0fxcyc2ajhp1_x|00076|01|hls2|to|s01|delivery|storage|moon|sources|setup'''.split('|'))) + def test_join(self): + test_input = list('test') + tests = [ + 'function f(a, b){return a.join(b)}', + 'function f(a, b){return Array.prototype.join.call(a, b)}', + 'function f(a, b){return Array.prototype.join.apply(a, [b])}', + ] + for test in tests: + jsi = JSInterpreter(test) + self._test(jsi, 'test', args=[test_input, '']) + self._test(jsi, 't-e-s-t', args=[test_input, '-']) + self._test(jsi, '', args=[[], '-']) + + def test_split(self): + test_result = list('test') + tests = [ + 'function f(a, b){return a.split(b)}', + 'function f(a, b){return String.prototype.split.call(a, b)}', + 'function f(a, b){return String.prototype.split.apply(a, [b])}', + ] + for test in tests: + jsi = JSInterpreter(test) + self._test(jsi, test_result, args=['test', '']) + self._test(jsi, test_result, args=['t-e-s-t', '-']) + self._test(jsi, [''], args=['', '-']) + self._test(jsi, [], args=['', '']) + + def test_slice(self): + self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice()}', [0, 1, 2, 3, 4, 5, 6, 7, 8]) + self._test('function 
f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice(0)}', [0, 1, 2, 3, 4, 5, 6, 7, 8]) + self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice(5)}', [5, 6, 7, 8]) + self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice(99)}', []) + self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice(-2)}', [7, 8]) + self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice(-99)}', [0, 1, 2, 3, 4, 5, 6, 7, 8]) + self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice(0, 0)}', []) + self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice(1, 0)}', []) + self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice(0, 1)}', [0]) + self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice(3, 6)}', [3, 4, 5]) + self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice(1, -1)}', [1, 2, 3, 4, 5, 6, 7]) + self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice(-1, 1)}', []) + self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice(-3, -1)}', [6, 7]) + self._test('function f(){return "012345678".slice()}', '012345678') + self._test('function f(){return "012345678".slice(0)}', '012345678') + self._test('function f(){return "012345678".slice(5)}', '5678') + self._test('function f(){return "012345678".slice(99)}', '') + self._test('function f(){return "012345678".slice(-2)}', '78') + self._test('function f(){return "012345678".slice(-99)}', '012345678') + self._test('function f(){return "012345678".slice(0, 0)}', '') + self._test('function f(){return "012345678".slice(1, 0)}', '') + self._test('function f(){return "012345678".slice(0, 1)}', '0') + self._test('function f(){return "012345678".slice(3, 6)}', '345') + self._test('function f(){return "012345678".slice(1, -1)}', '1234567') + self._test('function f(){return "012345678".slice(-1, 1)}', '') + self._test('function f(){return "012345678".slice(-3, -1)}', '67') + if __name__ == '__main__': unittest.main() diff --git a/test/test_networking.py 
b/test/test_networking.py index af3ece3b44..d96624af18 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -265,6 +265,11 @@ class HTTPTestRequestHandler(http.server.BaseHTTPRequestHandler): self.end_headers() self.wfile.write(payload) self.finish() + elif self.path == '/get_cookie': + self.send_response(200) + self.send_header('Set-Cookie', 'test=ytdlp; path=/') + self.end_headers() + self.finish() else: self._status(404) @@ -338,6 +343,52 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): validate_and_send(rh, Request(f'https://127.0.0.1:{https_port}/headers')) assert not issubclass(exc_info.type, CertificateVerifyError) + @pytest.mark.skip_handler('CurlCFFI', 'legacy_ssl ignored by CurlCFFI') + def test_legacy_ssl_extension(self, handler): + # HTTPS server with old ciphers + # XXX: is there a better way to test this than to create a new server? + https_httpd = http.server.ThreadingHTTPServer( + ('127.0.0.1', 0), HTTPTestRequestHandler) + sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) + sslctx.maximum_version = ssl.TLSVersion.TLSv1_2 + sslctx.set_ciphers('SHA1:AESCCM:aDSS:eNULL:aNULL') + sslctx.load_cert_chain(os.path.join(TEST_DIR, 'testcert.pem'), None) + https_httpd.socket = sslctx.wrap_socket(https_httpd.socket, server_side=True) + https_port = http_server_port(https_httpd) + https_server_thread = threading.Thread(target=https_httpd.serve_forever) + https_server_thread.daemon = True + https_server_thread.start() + + with handler(verify=False) as rh: + res = validate_and_send(rh, Request(f'https://127.0.0.1:{https_port}/headers', extensions={'legacy_ssl': True})) + assert res.status == 200 + res.close() + + # Ensure only applies to request extension + with pytest.raises(SSLError): + validate_and_send(rh, Request(f'https://127.0.0.1:{https_port}/headers')) + + @pytest.mark.skip_handler('CurlCFFI', 'legacy_ssl ignored by CurlCFFI') + def test_legacy_ssl_support(self, handler): + # HTTPS server with old ciphers + # XXX: is there a better 
way to test this than to create a new server? + https_httpd = http.server.ThreadingHTTPServer( + ('127.0.0.1', 0), HTTPTestRequestHandler) + sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) + sslctx.maximum_version = ssl.TLSVersion.TLSv1_2 + sslctx.set_ciphers('SHA1:AESCCM:aDSS:eNULL:aNULL') + sslctx.load_cert_chain(os.path.join(TEST_DIR, 'testcert.pem'), None) + https_httpd.socket = sslctx.wrap_socket(https_httpd.socket, server_side=True) + https_port = http_server_port(https_httpd) + https_server_thread = threading.Thread(target=https_httpd.serve_forever) + https_server_thread.daemon = True + https_server_thread.start() + + with handler(verify=False, legacy_ssl_support=True) as rh: + res = validate_and_send(rh, Request(f'https://127.0.0.1:{https_port}/headers')) + assert res.status == 200 + res.close() + def test_percent_encode(self, handler): with handler() as rh: # Unicode characters should be encoded with uppercase percent-encoding @@ -490,6 +541,24 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): rh, Request(f'http://127.0.0.1:{self.http_port}/headers', extensions={'cookiejar': cookiejar})).read() assert b'cookie: test=ytdlp' in data.lower() + def test_cookie_sync_only_cookiejar(self, handler): + # Ensure that cookies are ONLY being handled by the cookiejar + with handler() as rh: + validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/get_cookie', extensions={'cookiejar': YoutubeDLCookieJar()})) + data = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/headers', extensions={'cookiejar': YoutubeDLCookieJar()})).read() + assert b'cookie: test=ytdlp' not in data.lower() + + def test_cookie_sync_delete_cookie(self, handler): + # Ensure that cookies are ONLY being handled by the cookiejar + cookiejar = YoutubeDLCookieJar() + with handler(cookiejar=cookiejar) as rh: + validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/get_cookie')) + data = validate_and_send(rh, 
Request(f'http://127.0.0.1:{self.http_port}/headers')).read() + assert b'cookie: test=ytdlp' in data.lower() + cookiejar.clear_session_cookies() + data = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/headers')).read() + assert b'cookie: test=ytdlp' not in data.lower() + def test_headers(self, handler): with handler(headers=HTTPHeaderDict({'test1': 'test', 'test2': 'test2'})) as rh: @@ -753,6 +822,24 @@ class TestRequestHandlerMisc: rh.close() assert len(logging_handlers) == before_count + def test_wrap_request_errors(self): + class TestRequestHandler(RequestHandler): + def _validate(self, request): + if request.headers.get('x-fail'): + raise UnsupportedRequest('test error') + + def _send(self, request: Request): + raise RequestError('test error') + + with TestRequestHandler(logger=FakeLogger()) as rh: + with pytest.raises(UnsupportedRequest, match='test error') as exc_info: + rh.validate(Request('http://example.com', headers={'x-fail': '1'})) + assert exc_info.value.handler is rh + + with pytest.raises(RequestError, match='test error') as exc_info: + rh.send(Request('http://example.com')) + assert exc_info.value.handler is rh + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) class TestUrllibRequestHandler(TestRequestHandlerBase): @@ -914,7 +1001,6 @@ class TestRequestsRequestHandler(TestRequestHandlerBase): class TestCurlCFFIRequestHandler(TestRequestHandlerBase): @pytest.mark.parametrize('params,extensions', [ - ({}, {'impersonate': ImpersonateTarget('chrome')}), ({'impersonate': ImpersonateTarget('chrome', '110')}, {}), ({'impersonate': ImpersonateTarget('chrome', '99')}, {'impersonate': ImpersonateTarget('chrome', '110')}), ]) @@ -1200,6 +1286,9 @@ class TestRequestHandlerValidation: ({'timeout': 1}, False), ({'timeout': 'notatimeout'}, AssertionError), ({'unsupported': 'value'}, UnsupportedRequest), + ({'legacy_ssl': False}, False), + ({'legacy_ssl': True}, False), + ({'legacy_ssl': 'notabool'}, AssertionError), ]), 
('Requests', 'http', [ ({'cookiejar': 'notacookiejar'}, AssertionError), @@ -1207,6 +1296,9 @@ class TestRequestHandlerValidation: ({'timeout': 1}, False), ({'timeout': 'notatimeout'}, AssertionError), ({'unsupported': 'value'}, UnsupportedRequest), + ({'legacy_ssl': False}, False), + ({'legacy_ssl': True}, False), + ({'legacy_ssl': 'notabool'}, AssertionError), ]), ('CurlCFFI', 'http', [ ({'cookiejar': 'notacookiejar'}, AssertionError), @@ -1220,6 +1312,9 @@ class TestRequestHandlerValidation: ({'impersonate': ImpersonateTarget(None, None, None, None)}, False), ({'impersonate': ImpersonateTarget()}, False), ({'impersonate': 'chrome'}, AssertionError), + ({'legacy_ssl': False}, False), + ({'legacy_ssl': True}, False), + ({'legacy_ssl': 'notabool'}, AssertionError), ]), (NoCheckRH, 'http', [ ({'cookiejar': 'notacookiejar'}, False), @@ -1228,6 +1323,9 @@ class TestRequestHandlerValidation: ('Websockets', 'ws', [ ({'cookiejar': YoutubeDLCookieJar()}, False), ({'timeout': 2}, False), + ({'legacy_ssl': False}, False), + ({'legacy_ssl': True}, False), + ({'legacy_ssl': 'notabool'}, AssertionError), ]), ] diff --git a/test/test_plugins.py b/test/test_plugins.py index c82158e9fc..77545d136c 100644 --- a/test/test_plugins.py +++ b/test/test_plugins.py @@ -10,6 +10,7 @@ TEST_DATA_DIR = Path(os.path.dirname(os.path.abspath(__file__)), 'testdata') sys.path.append(str(TEST_DATA_DIR)) importlib.invalidate_caches() +from yt_dlp.utils import Config from yt_dlp.plugins import PACKAGE_NAME, directories, load_plugins @@ -68,6 +69,24 @@ class TestPlugins(unittest.TestCase): os.remove(zip_path) importlib.invalidate_caches() # reset the import caches + def test_plugin_dirs(self): + # Internal plugin dirs hack for CLI --plugin-dirs + # To be replaced with proper system later + custom_plugin_dir = TEST_DATA_DIR / 'plugin_packages' + Config._plugin_dirs = [str(custom_plugin_dir)] + importlib.invalidate_caches() # reset the import caches + + try: + package = 
importlib.import_module(f'{PACKAGE_NAME}.extractor') + self.assertIn(custom_plugin_dir / 'testpackage' / PACKAGE_NAME / 'extractor', map(Path, package.__path__)) + + plugins_ie = load_plugins('extractor', 'IE') + self.assertIn('PackagePluginIE', plugins_ie.keys()) + + finally: + Config._plugin_dirs = [] + importlib.invalidate_caches() # reset the import caches + if __name__ == '__main__': unittest.main() diff --git a/test/test_traversal.py b/test/test_traversal.py index 5d9fbe1d16..f1d123bd6e 100644 --- a/test/test_traversal.py +++ b/test/test_traversal.py @@ -4,8 +4,19 @@ import xml.etree.ElementTree import pytest -from yt_dlp.utils import dict_get, int_or_none, str_or_none -from yt_dlp.utils.traversal import traverse_obj +from yt_dlp.utils import ( + ExtractorError, + determine_ext, + dict_get, + int_or_none, + str_or_none, +) +from yt_dlp.utils.traversal import ( + require, + subs_list_to_dict, + traverse_obj, + trim_str, +) _TEST_DATA = { 100: 100, @@ -420,6 +431,85 @@ class TestTraversal: assert traverse_obj(morsel, [(None,), any]) == morsel, \ 'Morsel should not be implicitly changed to dict on usage' + def test_traversal_filter(self): + data = [None, False, True, 0, 1, 0.0, 1.1, '', 'str', {}, {0: 0}, [], [1]] + + assert traverse_obj(data, [..., filter]) == [True, 1, 1.1, 'str', {0: 0}, [1]], \ + '`filter` should filter falsy values' + + +class TestTraversalHelpers: + def test_traversal_require(self): + with pytest.raises(ExtractorError): + traverse_obj(_TEST_DATA, ['None', {require('value')}]) + assert traverse_obj(_TEST_DATA, ['str', {require('value')}]) == 'str', \ + '`require` should pass through non `None` values' + + def test_subs_list_to_dict(self): + assert traverse_obj([ + {'name': 'de', 'url': 'https://example.com/subs/de.vtt'}, + {'name': 'en', 'url': 'https://example.com/subs/en1.ass'}, + {'name': 'en', 'url': 'https://example.com/subs/en2.ass'}, + ], [..., { + 'id': 'name', + 'url': 'url', + }, all, {subs_list_to_dict}]) == { + 'de': [{'url': 
'https://example.com/subs/de.vtt'}], + 'en': [ + {'url': 'https://example.com/subs/en1.ass'}, + {'url': 'https://example.com/subs/en2.ass'}, + ], + }, 'function should build subtitle dict from list of subtitles' + assert traverse_obj([ + {'name': 'de', 'url': 'https://example.com/subs/de.ass'}, + {'name': 'de'}, + {'name': 'en', 'content': 'content'}, + {'url': 'https://example.com/subs/en'}, + ], [..., { + 'id': 'name', + 'data': 'content', + 'url': 'url', + }, all, {subs_list_to_dict}]) == { + 'de': [{'url': 'https://example.com/subs/de.ass'}], + 'en': [{'data': 'content'}], + }, 'subs with mandatory items missing should be filtered' + assert traverse_obj([ + {'url': 'https://example.com/subs/de.ass', 'name': 'de'}, + {'url': 'https://example.com/subs/en', 'name': 'en'}, + ], [..., { + 'id': 'name', + 'ext': ['url', {lambda x: determine_ext(x, default_ext=None)}], + 'url': 'url', + }, all, {subs_list_to_dict(ext='ext')}]) == { + 'de': [{'url': 'https://example.com/subs/de.ass', 'ext': 'ass'}], + 'en': [{'url': 'https://example.com/subs/en', 'ext': 'ext'}], + }, '`ext` should set default ext but leave existing value untouched' + assert traverse_obj([ + {'name': 'en', 'url': 'https://example.com/subs/en2', 'prio': True}, + {'name': 'en', 'url': 'https://example.com/subs/en1', 'prio': False}, + ], [..., { + 'id': 'name', + 'quality': ['prio', {int}], + 'url': 'url', + }, all, {subs_list_to_dict(ext='ext')}]) == {'en': [ + {'url': 'https://example.com/subs/en1', 'ext': 'ext'}, + {'url': 'https://example.com/subs/en2', 'ext': 'ext'}, + ]}, '`quality` key should sort subtitle list accordingly' + + def test_trim_str(self): + with pytest.raises(TypeError): + trim_str('positional') + + assert callable(trim_str(start='a')) + assert trim_str(start='ab')('abc') == 'c' + assert trim_str(end='bc')('abc') == 'a' + assert trim_str(start='a', end='c')('abc') == 'b' + assert trim_str(start='ab', end='c')('abc') == '' + assert trim_str(start='a', end='bc')('abc') == '' + assert 
trim_str(start='ab', end='bc')('abc') == '' + assert trim_str(start='abc', end='abc')('abc') == '' + assert trim_str(start='', end='')('abc') == 'abc' + class TestDictGet: def test_dict_get(self): diff --git a/test/test_update.py b/test/test_update.py index 63a21e445f..23c12d38c1 100644 --- a/test/test_update.py +++ b/test/test_update.py @@ -82,16 +82,32 @@ TEST_LOCKFILE_V1 = rf'''{TEST_LOCKFILE_COMMENT} lock 2022.08.18.36 .+ Python 3\.6 lock 2023.11.16 (?!win_x86_exe).+ Python 3\.7 lock 2023.11.16 win_x86_exe .+ Windows-(?:Vista|2008Server) +lock 2024.10.22 py2exe .+ +lock 2024.10.22 linux_(?:armv7l|aarch64)_exe .+-glibc2\.(?:[12]?\d|30)\b +lock 2024.10.22 (?!\w+_exe).+ Python 3\.8 +lock 2024.10.22 win(?:_x86)?_exe Python 3\.[78].+ Windows-(?:7-|2008ServerR2) ''' TEST_LOCKFILE_V2_TMPL = r'''%s lockV2 yt-dlp/yt-dlp 2022.08.18.36 .+ Python 3\.6 lockV2 yt-dlp/yt-dlp 2023.11.16 (?!win_x86_exe).+ Python 3\.7 lockV2 yt-dlp/yt-dlp 2023.11.16 win_x86_exe .+ Windows-(?:Vista|2008Server) +lockV2 yt-dlp/yt-dlp 2024.10.22 py2exe .+ +lockV2 yt-dlp/yt-dlp 2024.10.22 linux_(?:armv7l|aarch64)_exe .+-glibc2\.(?:[12]?\d|30)\b +lockV2 yt-dlp/yt-dlp 2024.10.22 (?!\w+_exe).+ Python 3\.8 +lockV2 yt-dlp/yt-dlp 2024.10.22 win(?:_x86)?_exe Python 3\.[78].+ Windows-(?:7-|2008ServerR2) lockV2 yt-dlp/yt-dlp-nightly-builds 2023.11.15.232826 (?!win_x86_exe).+ Python 3\.7 lockV2 yt-dlp/yt-dlp-nightly-builds 2023.11.15.232826 win_x86_exe .+ Windows-(?:Vista|2008Server) +lockV2 yt-dlp/yt-dlp-nightly-builds 2024.10.22.051025 py2exe .+ +lockV2 yt-dlp/yt-dlp-nightly-builds 2024.10.22.051025 linux_(?:armv7l|aarch64)_exe .+-glibc2\.(?:[12]?\d|30)\b +lockV2 yt-dlp/yt-dlp-nightly-builds 2024.10.22.051025 (?!\w+_exe).+ Python 3\.8 +lockV2 yt-dlp/yt-dlp-nightly-builds 2024.10.22.051025 win(?:_x86)?_exe Python 3\.[78].+ Windows-(?:7-|2008ServerR2) lockV2 yt-dlp/yt-dlp-master-builds 2023.11.15.232812 (?!win_x86_exe).+ Python 3\.7 lockV2 yt-dlp/yt-dlp-master-builds 2023.11.15.232812 win_x86_exe .+ 
Windows-(?:Vista|2008Server) +lockV2 yt-dlp/yt-dlp-master-builds 2024.10.22.045052 py2exe .+ +lockV2 yt-dlp/yt-dlp-master-builds 2024.10.22.060347 linux_(?:armv7l|aarch64)_exe .+-glibc2\.(?:[12]?\d|30)\b +lockV2 yt-dlp/yt-dlp-master-builds 2024.10.22.060347 (?!\w+_exe).+ Python 3\.8 +lockV2 yt-dlp/yt-dlp-master-builds 2024.10.22.060347 win(?:_x86)?_exe Python 3\.[78].+ Windows-(?:7-|2008ServerR2) ''' TEST_LOCKFILE_V2 = TEST_LOCKFILE_V2_TMPL % TEST_LOCKFILE_COMMENT @@ -145,43 +161,76 @@ class TestUpdate(unittest.TestCase): for lockfile in (TEST_LOCKFILE_V1, TEST_LOCKFILE_V2, TEST_LOCKFILE_ACTUAL, TEST_LOCKFILE_FORK): # Normal operation test(lockfile, 'zip Python 3.12.0', '2023.12.31', '2023.12.31') - test(lockfile, 'zip stable Python 3.12.0', '2023.12.31', '2023.12.31', exact=True) - # Python 3.6 --update should update only to its lock + test(lockfile, 'zip Python 3.12.0', '2023.12.31', '2023.12.31', exact=True) + # py2exe should never update beyond 2024.10.22 + test(lockfile, 'py2exe Python 3.8', '2025.01.01', '2024.10.22') + test(lockfile, 'py2exe Python 3.8', '2025.01.01', None, exact=True) + # Python 3.6 --update should update only to the py3.6 lock test(lockfile, 'zip Python 3.6.0', '2023.11.16', '2022.08.18.36') - # --update-to an exact version later than the lock should return None - test(lockfile, 'zip stable Python 3.6.0', '2023.11.16', None, exact=True) - # Python 3.7 should be able to update to its lock + # Python 3.6 --update-to an exact version later than the py3.6 lock should return None + test(lockfile, 'zip Python 3.6.0', '2023.11.16', None, exact=True) + # Python 3.7 should be able to update to the py3.7 lock test(lockfile, 'zip Python 3.7.0', '2023.11.16', '2023.11.16') - test(lockfile, 'zip stable Python 3.7.1', '2023.11.16', '2023.11.16', exact=True) - # Non-win_x86_exe builds on py3.7 must be locked + test(lockfile, 'zip Python 3.7.1', '2023.11.16', '2023.11.16', exact=True) + # Non-win_x86_exe builds on py3.7 must be locked at py3.7 lock 
test(lockfile, 'zip Python 3.7.1', '2023.12.31', '2023.11.16') - test(lockfile, 'zip stable Python 3.7.1', '2023.12.31', None, exact=True) - test( # Windows Vista w/ win_x86_exe must be locked - lockfile, 'win_x86_exe stable Python 3.7.9 (CPython x86 32bit) - Windows-Vista-6.0.6003-SP2', + test(lockfile, 'zip Python 3.7.1', '2023.12.31', None, exact=True) + # Python 3.8 should only update to the py3.8 lock + test(lockfile, 'zip Python 3.8.10', '2025.01.01', '2024.10.22') + test(lockfile, 'zip Python 3.8.110', '2025.01.01', None, exact=True) + test( # Windows Vista w/ win_x86_exe must be locked at Vista lock + lockfile, 'win_x86_exe Python 3.7.9 (CPython x86 32bit) - Windows-Vista-6.0.6003-SP2', '2023.12.31', '2023.11.16') - test( # Windows 2008Server w/ win_x86_exe must be locked + test( # Windows 2008Server w/ win_x86_exe must be locked at Vista lock lockfile, 'win_x86_exe Python 3.7.9 (CPython x86 32bit) - Windows-2008Server', '2023.12.31', None, exact=True) - test( # Windows 7 w/ win_x86_exe py3.7 build should be able to update beyond lock - lockfile, 'win_x86_exe stable Python 3.7.9 (CPython x86 32bit) - Windows-7-6.1.7601-SP1', - '2023.12.31', '2023.12.31') - test( # Windows 8.1 w/ '2008Server' in platform string should be able to update beyond lock + test( # Windows 7 w/ win_x86_exe py3.7 build should be able to update beyond py3.7 lock + lockfile, 'win_x86_exe Python 3.7.9 (CPython x86 32bit) - Windows-7-6.1.7601-SP1', + '2023.12.31', '2023.12.31', exact=True) + test( # Windows 7 win_x86_exe should only update to Win7 lock + lockfile, 'win_x86_exe Python 3.7.9 (CPython x86 32bit) - Windows-7-6.1.7601-SP1', + '2025.01.01', '2024.10.22') + test( # Windows 2008ServerR2 win_exe should only update to Win7 lock + lockfile, 'win_exe Python 3.8.10 (CPython x86 32bit) - Windows-2008ServerR2', + '2025.12.31', '2024.10.22') + test( # Windows 8.1 w/ '2008Server' in platform string should be able to update beyond py3.7 lock lockfile, 'win_x86_exe Python 3.7.9 (CPython 
x86 32bit) - Windows-post2008Server-6.2.9200', '2023.12.31', '2023.12.31', exact=True) + test( # win_exe built w/Python 3.8 on Windows>=8 should be able to update beyond py3.8 lock + lockfile, 'win_exe Python 3.8.10 (CPython AMD64 64bit) - Windows-10-10.0.20348-SP0', + '2025.01.01', '2025.01.01', exact=True) + test( # linux_armv7l_exe w/glibc2.7 should only update to glibc<2.31 lock + lockfile, 'linux_armv7l_exe Python 3.8.0 (CPython armv7l 32bit) - Linux-6.5.0-1025-azure-armv7l-with-glibc2.7', + '2025.01.01', '2024.10.22') + test( # linux_armv7l_exe w/Python 3.8 and glibc>=2.31 should be able to update beyond py3.8 and glibc<2.31 locks + lockfile, 'linux_armv7l_exe Python 3.8.0 (CPython armv7l 32bit) - Linux-6.5.0-1025-azure-armv7l-with-glibc2.31', + '2025.01.01', '2025.01.01') + test( # linux_armv7l_exe w/glibc2.30 should only update to glibc<2.31 lock + lockfile, 'linux_armv7l_exe Python 3.8.0 (CPython armv7l 64bit) - Linux-6.5.0-1025-azure-aarch64-with-glibc2.30 (OpenSSL', + '2025.01.01', '2024.10.22') + test( # linux_aarch64_exe w/glibc2.17 should only update to glibc<2.31 lock + lockfile, 'linux_aarch64_exe Python 3.8.0 (CPython aarch64 64bit) - Linux-6.5.0-1025-azure-aarch64-with-glibc2.17', + '2025.01.01', '2024.10.22') + test( # linux_aarch64_exe w/glibc2.40 and glibc>=2.31 should be able to update beyond py3.8 and glibc<2.31 locks + lockfile, 'linux_aarch64_exe Python 3.8.0 (CPython aarch64 64bit) - Linux-6.5.0-1025-azure-aarch64-with-glibc2.40', + '2025.01.01', '2025.01.01') + test( # linux_aarch64_exe w/glibc2.3 should only update to glibc<2.31 lock + lockfile, 'linux_aarch64_exe Python 3.8.0 (CPython aarch64 64bit) - Linux-6.5.0-1025-azure-aarch64-with-glibc2.3 (OpenSSL', + '2025.01.01', '2024.10.22') # Forks can block updates to non-numeric tags rather than lock test(TEST_LOCKFILE_FORK, 'zip Python 3.6.3', 'pr0000', None, repo='fork/yt-dlp') - test(TEST_LOCKFILE_FORK, 'zip stable Python 3.7.4', 'pr0000', 'pr0000', repo='fork/yt-dlp') - 
test(TEST_LOCKFILE_FORK, 'zip stable Python 3.7.4', 'pr1234', None, repo='fork/yt-dlp') + test(TEST_LOCKFILE_FORK, 'zip Python 3.7.4', 'pr0000', 'pr0000', repo='fork/yt-dlp') + test(TEST_LOCKFILE_FORK, 'zip Python 3.7.4', 'pr1234', None, repo='fork/yt-dlp') test(TEST_LOCKFILE_FORK, 'zip Python 3.8.1', 'pr1234', 'pr1234', repo='fork/yt-dlp', exact=True) test( - TEST_LOCKFILE_FORK, 'win_x86_exe stable Python 3.7.9 (CPython x86 32bit) - Windows-Vista-6.0.6003-SP2', + TEST_LOCKFILE_FORK, 'win_x86_exe Python 3.7.9 (CPython x86 32bit) - Windows-Vista-6.0.6003-SP2', 'pr1234', None, repo='fork/yt-dlp') test( - TEST_LOCKFILE_FORK, 'win_x86_exe stable Python 3.7.9 (CPython x86 32bit) - Windows-7-6.1.7601-SP1', + TEST_LOCKFILE_FORK, 'win_x86_exe Python 3.7.9 (CPython x86 32bit) - Windows-7-6.1.7601-SP1', '2023.12.31', '2023.12.31', repo='fork/yt-dlp') test(TEST_LOCKFILE_FORK, 'zip Python 3.11.2', 'pr9999', None, repo='fork/yt-dlp', exact=True) - test(TEST_LOCKFILE_FORK, 'zip stable Python 3.12.0', 'pr9999', 'pr9999', repo='fork/yt-dlp') + test(TEST_LOCKFILE_FORK, 'zip Python 3.12.0', 'pr9999', 'pr9999', repo='fork/yt-dlp') def test_query_update(self): ydl = FakeYDL() diff --git a/test/test_utils.py b/test/test_utils.py index 251739686e..04f91547a4 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -4,6 +4,7 @@ import os import sys import unittest +import unittest.mock import warnings import datetime as dt @@ -71,6 +72,7 @@ from yt_dlp.utils import ( intlist_to_bytes, iri_to_uri, is_html, + join_nonempty, js_to_json, limit_length, locked_file, @@ -130,6 +132,7 @@ from yt_dlp.utils import ( xpath_text, xpath_with_ns, ) +from yt_dlp.utils._utils import _UnsafeExtensionError from yt_dlp.utils.networking import ( HTTPHeaderDict, escape_rfc3986, @@ -220,9 +223,10 @@ class TestUtil(unittest.TestCase): self.assertEqual(sanitize_filename('N0Y__7-UOdI', is_id=True), 'N0Y__7-UOdI') def test_sanitize_path(self): - if sys.platform != 'win32': - return + with 
unittest.mock.patch('sys.platform', 'win32'): + self._test_sanitize_path() + def _test_sanitize_path(self): self.assertEqual(sanitize_path('abc'), 'abc') self.assertEqual(sanitize_path('abc/def'), 'abc\\def') self.assertEqual(sanitize_path('abc\\def'), 'abc\\def') @@ -255,6 +259,11 @@ class TestUtil(unittest.TestCase): self.assertEqual(sanitize_path('./abc'), 'abc') self.assertEqual(sanitize_path('./../abc'), '..\\abc') + self.assertEqual(sanitize_path('\\abc'), '\\abc') + self.assertEqual(sanitize_path('C:abc'), 'C:abc') + self.assertEqual(sanitize_path('C:abc\\..\\'), 'C:..') + self.assertEqual(sanitize_path('C:\\abc:%(title)s.%(ext)s'), 'C:\\abc#%(title)s.%(ext)s') + def test_sanitize_url(self): self.assertEqual(sanitize_url('//foo.bar'), 'http://foo.bar') self.assertEqual(sanitize_url('httpss://foo.bar'), 'https://foo.bar') @@ -281,6 +290,13 @@ class TestUtil(unittest.TestCase): finally: os.environ['HOME'] = old_home or '' + _uncommon_extensions = [ + ('exe', 'abc.exe.ext'), + ('de', 'abc.de.ext'), + ('../.mp4', None), + ('..\\.mp4', None), + ] + def test_prepend_extension(self): self.assertEqual(prepend_extension('abc.ext', 'temp'), 'abc.temp.ext') self.assertEqual(prepend_extension('abc.ext', 'temp', 'ext'), 'abc.temp.ext') @@ -289,6 +305,19 @@ class TestUtil(unittest.TestCase): self.assertEqual(prepend_extension('.abc', 'temp'), '.abc.temp') self.assertEqual(prepend_extension('.abc.ext', 'temp'), '.abc.temp.ext') + # Test uncommon extensions + self.assertEqual(prepend_extension('abc.ext', 'bin'), 'abc.bin.ext') + for ext, result in self._uncommon_extensions: + with self.assertRaises(_UnsafeExtensionError): + prepend_extension('abc', ext) + if result: + self.assertEqual(prepend_extension('abc.ext', ext, 'ext'), result) + else: + with self.assertRaises(_UnsafeExtensionError): + prepend_extension('abc.ext', ext, 'ext') + with self.assertRaises(_UnsafeExtensionError): + prepend_extension('abc.unexpected_ext', ext, 'ext') + def test_replace_extension(self): 
self.assertEqual(replace_extension('abc.ext', 'temp'), 'abc.temp') self.assertEqual(replace_extension('abc.ext', 'temp', 'ext'), 'abc.temp') @@ -297,6 +326,16 @@ class TestUtil(unittest.TestCase): self.assertEqual(replace_extension('.abc', 'temp'), '.abc.temp') self.assertEqual(replace_extension('.abc.ext', 'temp'), '.abc.temp') + # Test uncommon extensions + self.assertEqual(replace_extension('abc.ext', 'bin'), 'abc.unknown_video') + for ext, _ in self._uncommon_extensions: + with self.assertRaises(_UnsafeExtensionError): + replace_extension('abc', ext) + with self.assertRaises(_UnsafeExtensionError): + replace_extension('abc.ext', ext, 'ext') + with self.assertRaises(_UnsafeExtensionError): + replace_extension('abc.unexpected_ext', ext, 'ext') + def test_subtitles_filename(self): self.assertEqual(subtitles_filename('abc.ext', 'en', 'vtt'), 'abc.en.vtt') self.assertEqual(subtitles_filename('abc.ext', 'en', 'vtt', 'ext'), 'abc.en.vtt') @@ -306,11 +345,13 @@ class TestUtil(unittest.TestCase): self.assertEqual(remove_start(None, 'A - '), None) self.assertEqual(remove_start('A - B', 'A - '), 'B') self.assertEqual(remove_start('B - A', 'A - '), 'B - A') + self.assertEqual(remove_start('non-empty', ''), 'non-empty') def test_remove_end(self): self.assertEqual(remove_end(None, ' - B'), None) self.assertEqual(remove_end('A - B', ' - B'), 'A') self.assertEqual(remove_end('B - A', ' - B'), 'B - A') + self.assertEqual(remove_end('non-empty', ''), 'non-empty') def test_remove_quotes(self): self.assertEqual(remove_quotes(None), None) @@ -413,6 +454,8 @@ class TestUtil(unittest.TestCase): self.assertEqual(unified_timestamp('Sep 11, 2013 | 5:49 AM'), 1378878540) self.assertEqual(unified_timestamp('December 15, 2017 at 7:49 am'), 1513324140) self.assertEqual(unified_timestamp('2018-03-14T08:32:43.1493874+00:00'), 1521016363) + self.assertEqual(unified_timestamp('Sunday, 26 Nov 2006, 19:00'), 1164567600) + self.assertEqual(unified_timestamp('wed, aug 16, 2008, 12:00pm'), 
1218931200) self.assertEqual(unified_timestamp('December 31 1969 20:00:01 EDT'), 1) self.assertEqual(unified_timestamp('Wednesday 31 December 1969 18:01:26 MDT'), 86) @@ -888,6 +931,11 @@ class TestUtil(unittest.TestCase): 'acodec': 'none', 'dynamic_range': 'HDR10', }) + self.assertEqual(parse_codecs('vp09.02.50.10.01.09.18.09.00'), { + 'vcodec': 'vp09.02.50.10.01.09.18.09.00', + 'acodec': 'none', + 'dynamic_range': 'HDR10', + }) self.assertEqual(parse_codecs('av01.0.12M.10.0.110.09.16.09.0'), { 'vcodec': 'av01.0.12M.10.0.110.09.16.09.0', 'acodec': 'none', @@ -898,6 +946,11 @@ class TestUtil(unittest.TestCase): 'acodec': 'none', 'dynamic_range': 'DV', }) + self.assertEqual(parse_codecs('fLaC'), { + 'vcodec': 'none', + 'acodec': 'flac', + 'dynamic_range': None, + }) self.assertEqual(parse_codecs('theora, vorbis'), { 'vcodec': 'theora', 'acodec': 'vorbis', @@ -2099,6 +2152,16 @@ Line 1 assert run_shell(args) == expected assert run_shell(shell_quote(args, shell=True)) == expected + def test_partial_application(self): + assert callable(int_or_none(scale=10)), 'missing positional parameter should apply partially' + assert int_or_none(10, scale=0.1) == 100, 'positionally passed argument should call function' + assert int_or_none(v=10) == 10, 'keyword passed positional should call function' + assert int_or_none(scale=0.1)(10) == 100, 'call after partial applicatino should call the function' + + assert callable(join_nonempty(delim=', ')), 'varargs positional should apply partially' + assert callable(join_nonempty()), 'varargs positional should apply partially' + assert join_nonempty(None, delim=', ') == '', 'passed varargs should call the function' + if __name__ == '__main__': unittest.main() diff --git a/test/test_websockets.py b/test/test_websockets.py index 5f101abcc6..06112cc0b8 100644 --- a/test/test_websockets.py +++ b/test/test_websockets.py @@ -61,6 +61,10 @@ def process_request(self, request): return websockets.http11.Response( status.value, status.phrase, 
websockets.datastructures.Headers([('Location', '/')]), b'') return self.protocol.reject(status.value, status.phrase) + elif request.path.startswith('/get_cookie'): + response = self.protocol.accept(request) + response.headers['Set-Cookie'] = 'test=ytdlp' + return response return self.protocol.accept(request) @@ -84,7 +88,7 @@ def create_wss_websocket_server(): certfn = os.path.join(TEST_DIR, 'testcert.pem') sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) sslctx.load_cert_chain(certfn, None) - return create_websocket_server(ssl_context=sslctx) + return create_websocket_server(ssl=sslctx) MTLS_CERT_DIR = os.path.join(TEST_DIR, 'testdata', 'certificate') @@ -99,7 +103,16 @@ def create_mtls_wss_websocket_server(): sslctx.load_verify_locations(cafile=cacertfn) sslctx.load_cert_chain(certfn, None) - return create_websocket_server(ssl_context=sslctx) + return create_websocket_server(ssl=sslctx) + + +def create_legacy_wss_websocket_server(): + certfn = os.path.join(TEST_DIR, 'testcert.pem') + sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) + sslctx.maximum_version = ssl.TLSVersion.TLSv1_2 + sslctx.set_ciphers('SHA1:AESCCM:aDSS:eNULL:aNULL') + sslctx.load_cert_chain(certfn, None) + return create_websocket_server(ssl=sslctx) def ws_validate_and_send(rh, req): @@ -126,12 +139,15 @@ class TestWebsSocketRequestHandlerConformance: cls.wss_thread, cls.wss_port = create_wss_websocket_server() cls.wss_base_url = f'wss://127.0.0.1:{cls.wss_port}' - cls.bad_wss_thread, cls.bad_wss_port = create_websocket_server(ssl_context=ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)) + cls.bad_wss_thread, cls.bad_wss_port = create_websocket_server(ssl=ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)) cls.bad_wss_host = f'wss://127.0.0.1:{cls.bad_wss_port}' cls.mtls_wss_thread, cls.mtls_wss_port = create_mtls_wss_websocket_server() cls.mtls_wss_base_url = f'wss://127.0.0.1:{cls.mtls_wss_port}' + cls.legacy_wss_thread, cls.legacy_wss_port = create_legacy_wss_websocket_server() + cls.legacy_wss_host = 
f'wss://127.0.0.1:{cls.legacy_wss_port}' + def test_basic_websockets(self, handler): with handler() as rh: ws = ws_validate_and_send(rh, Request(self.ws_base_url)) @@ -166,6 +182,22 @@ class TestWebsSocketRequestHandlerConformance: ws_validate_and_send(rh, Request(self.bad_wss_host)) assert not issubclass(exc_info.type, CertificateVerifyError) + def test_legacy_ssl_extension(self, handler): + with handler(verify=False) as rh: + ws = ws_validate_and_send(rh, Request(self.legacy_wss_host, extensions={'legacy_ssl': True})) + assert ws.status == 101 + ws.close() + + # Ensure only applies to request extension + with pytest.raises(SSLError): + ws_validate_and_send(rh, Request(self.legacy_wss_host)) + + def test_legacy_ssl_support(self, handler): + with handler(verify=False, legacy_ssl_support=True) as rh: + ws = ws_validate_and_send(rh, Request(self.legacy_wss_host)) + assert ws.status == 101 + ws.close() + @pytest.mark.parametrize('path,expected', [ # Unicode characters should be encoded with uppercase percent-encoding ('/中文', '/%E4%B8%AD%E6%96%87'), @@ -248,6 +280,32 @@ class TestWebsSocketRequestHandlerConformance: assert json.loads(ws.recv())['cookie'] == 'test=ytdlp' ws.close() + @pytest.mark.skip_handler('Websockets', 'Set-Cookie not supported by websockets') + def test_cookie_sync_only_cookiejar(self, handler): + # Ensure that cookies are ONLY being handled by the cookiejar + with handler() as rh: + ws_validate_and_send(rh, Request(f'{self.ws_base_url}/get_cookie', extensions={'cookiejar': YoutubeDLCookieJar()})) + ws = ws_validate_and_send(rh, Request(self.ws_base_url, extensions={'cookiejar': YoutubeDLCookieJar()})) + ws.send('headers') + assert 'cookie' not in json.loads(ws.recv()) + ws.close() + + @pytest.mark.skip_handler('Websockets', 'Set-Cookie not supported by websockets') + def test_cookie_sync_delete_cookie(self, handler): + # Ensure that cookies are ONLY being handled by the cookiejar + cookiejar = YoutubeDLCookieJar() + with handler(verbose=True, 
cookiejar=cookiejar) as rh: + ws_validate_and_send(rh, Request(f'{self.ws_base_url}/get_cookie')) + ws = ws_validate_and_send(rh, Request(self.ws_base_url)) + ws.send('headers') + assert json.loads(ws.recv())['cookie'] == 'test=ytdlp' + ws.close() + cookiejar.clear_session_cookies() + ws = ws_validate_and_send(rh, Request(self.ws_base_url)) + ws.send('headers') + assert 'cookie' not in json.loads(ws.recv()) + ws.close() + def test_source_address(self, handler): source_address = f'127.0.0.{random.randint(5, 255)}' verify_address_availability(source_address) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index bfaff83a0a..0f7ae34f44 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -163,6 +163,26 @@ _NSIG_TESTS = [ 'https://www.youtube.com/s/player/b7910ca8/player_ias.vflset/en_US/base.js', '_hXMCwMt9qE310D', 'LoZMgkkofRMCZQ', ), + ( + 'https://www.youtube.com/s/player/590f65a6/player_ias.vflset/en_US/base.js', + '1tm7-g_A9zsI8_Lay_', 'xI4Vem4Put_rOg', + ), + ( + 'https://www.youtube.com/s/player/b22ef6e7/player_ias.vflset/en_US/base.js', + 'b6HcntHGkvBLk_FRf', 'kNPW6A7FyP2l8A', + ), + ( + 'https://www.youtube.com/s/player/3400486c/player_ias.vflset/en_US/base.js', + 'lL46g3XifCKUZn1Xfw', 'z767lhet6V2Skl', + ), + ( + 'https://www.youtube.com/s/player/20dfca59/player_ias.vflset/en_US/base.js', + '-fLCxedkAk4LUTK2', 'O8kfRq1y1eyHGw', + ), + ( + 'https://www.youtube.com/s/player/b12cc44b/player_ias.vflset/en_US/base.js', + 'keLa5R2U00sR9SQK', 'N1OGyujjEwMnLw', + ), ] diff --git a/test/testdata/netrc/netrc b/test/testdata/netrc/netrc new file mode 100644 index 0000000000..bafe92fe6a --- /dev/null +++ b/test/testdata/netrc/netrc @@ -0,0 +1,4 @@ +machine normal_use login user password pass +machine empty_user login "" password pass +machine empty_pass login user password "" +machine both_empty login "" password "" diff --git a/test/testdata/netrc/print_netrc.py b/test/testdata/netrc/print_netrc.py new file 
mode 100644 index 0000000000..5c25814f84 --- /dev/null +++ b/test/testdata/netrc/print_netrc.py @@ -0,0 +1,2 @@ +with open('./test/testdata/netrc/netrc', encoding='utf-8') as fp: + print(fp.read()) diff --git a/test/testdata/plugin_packages/testpackage/yt_dlp_plugins/extractor/package.py b/test/testdata/plugin_packages/testpackage/yt_dlp_plugins/extractor/package.py new file mode 100644 index 0000000000..b860300d8d --- /dev/null +++ b/test/testdata/plugin_packages/testpackage/yt_dlp_plugins/extractor/package.py @@ -0,0 +1,5 @@ +from yt_dlp.extractor.common import InfoExtractor + + +class PackagePluginIE(InfoExtractor): + pass diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 5abcb4635c..f08a31afac 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -4,6 +4,7 @@ import copy import datetime as dt import errno import fileinput +import functools import http.cookiejar import io import itertools @@ -24,9 +25,9 @@ import traceback import unicodedata from .cache import Cache -from .compat import functools, urllib # isort: split +from .compat import urllib # isort: split from .compat import compat_os_name, urllib_req_to_req -from .cookies import LenientSimpleCookie, load_cookies +from .cookies import CookieLoadError, LenientSimpleCookie, load_cookies from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name from .downloader.rtmp import rtmpdump_version from .extractor import gen_extractor_classes, get_info_extractor @@ -153,12 +154,11 @@ from .utils import ( try_get, url_basename, variadic, - version_tuple, windows_enable_vt_mode, write_json_file, write_string, ) -from .utils._utils import _YDLLogger +from .utils._utils import _UnsafeExtensionError, _YDLLogger from .utils.networking import ( HTTPHeaderDict, clean_headers, @@ -171,6 +171,20 @@ if compat_os_name == 'nt': import ctypes +def _catch_unsafe_extension_error(func): + @functools.wraps(func) + def wrapper(self, *args, **kwargs): + try: + return func(self, *args, **kwargs) + 
except _UnsafeExtensionError as error: + self.report_error( + f'The extracted extension ({error.extension!r}) is unusual ' + 'and will be skipped for safety reasons. ' + f'If you believe this is an error{bug_reports_message(",")}') + + return wrapper + + class YoutubeDL: """YoutubeDL class. @@ -236,7 +250,7 @@ class YoutubeDL: format_sort_force: Force the given format_sort. see "Sorting Formats" for more details. prefer_free_formats: Whether to prefer video formats with free containers - over non-free ones of same quality. + over non-free ones of the same quality. allow_multiple_video_streams: Allow multiple video streams to be merged into a single file allow_multiple_audio_streams: Allow multiple audio streams to be merged @@ -270,7 +284,7 @@ class YoutubeDL: rejecttitle: Reject downloads for matching titles. logger: Log messages to a logging.Logger instance. logtostderr: Print everything to stderr instead of stdout. - consoletitle: Display progress in console window's titlebar. + consoletitle: Display progress in the console window's titlebar. writedescription: Write the video description to a .description file writeinfojson: Write the video description to a .info.json file clean_infojson: Remove internal metadata from the infojson @@ -437,7 +451,8 @@ class YoutubeDL: Can also just be a single color policy, in which case it applies to all outputs. Valid stream names are 'stdout' and 'stderr'. - Valid color policies are one of 'always', 'auto', 'no_color' or 'never'. + Valid color policies are one of 'always', 'auto', + 'no_color', 'never', 'auto-tty' or 'no_color-tty'. geo_bypass: Bypass geographic restriction via faking X-Forwarded-For HTTP header geo_bypass_country: @@ -453,8 +468,9 @@ class YoutubeDL: Set the value to 'native' to use the native downloader compat_opts: Compatibility options. See "Differences in default behavior". 
The following options do not work when used through the API: - filename, abort-on-error, multistreams, no-live-chat, format-sort - no-clean-infojson, no-playlist-metafiles, no-keep-subs, no-attach-info-json. + filename, abort-on-error, multistreams, no-live-chat, + format-sort, no-clean-infojson, no-playlist-metafiles, + no-keep-subs, no-attach-info-json, allow-unsafe-ext. Refer __init__.py for their implementation progress_template: Dictionary of templates for progress outputs. Allowed keys are 'download', 'postprocess', @@ -496,7 +512,7 @@ class YoutubeDL: The following options are used by the extractors: extractor_retries: Number of times to retry for known errors (default: 3) dynamic_mpd: Whether to process dynamic DASH manifests (default: True) - hls_split_discontinuity: Split HLS playlists to different formats at + hls_split_discontinuity: Split HLS playlists into different formats at discontinuities such as ad breaks (default: False) extractor_args: A dictionary of arguments to be passed to the extractors. See "EXTRACTOR ARGUMENTS" for details. @@ -536,7 +552,7 @@ class YoutubeDL: include_ads: - Doesn't work Download ads as well call_home: - Not implemented - Boolean, true iff we are allowed to contact the + Boolean, true if we are allowed to contact the yt-dlp servers for debugging. 
post_hooks: - Register a custom postprocessor A list of functions that get called as the final step @@ -581,8 +597,9 @@ class YoutubeDL: 'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx', 'rows', 'columns', 'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start', 'is_dash_periods', 'request_data', 'preference', 'language', 'language_preference', 'quality', 'source_preference', 'cookies', - 'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'extra_param_to_segment_url', 'hls_aes', 'downloader_options', - 'page_url', 'app', 'play_path', 'tc_url', 'flash_version', 'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time', + 'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'extra_param_to_segment_url', 'extra_param_to_key_url', + 'hls_aes', 'downloader_options', 'page_url', 'app', 'play_path', 'tc_url', 'flash_version', + 'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time', } _deprecated_multivalue_fields = { 'album_artist': 'album_artists', @@ -642,12 +659,15 @@ class YoutubeDL: self.params['color'] = 'no_color' term_allow_color = os.getenv('TERM', '').lower() != 'dumb' - no_color = bool(os.getenv('NO_COLOR')) + base_no_color = bool(os.getenv('NO_COLOR')) def process_color_policy(stream): stream_name = {sys.stdout: 'stdout', sys.stderr: 'stderr'}[stream] - policy = traverse_obj(self.params, ('color', (stream_name, None), {str}), get_all=False) - if policy in ('auto', None): + policy = traverse_obj(self.params, ('color', (stream_name, None), {str}, any)) or 'auto' + if policy in ('auto', 'auto-tty', 'no_color-tty'): + no_color = base_no_color + if policy.endswith('tty'): + no_color = policy.startswith('no_color') if term_allow_color and supports_terminal_sequences(stream): return 'no_color' if no_color else True return False @@ -1398,6 +1418,7 @@ class YoutubeDL: outtmpl, info_dict = self.prepare_outtmpl(outtmpl, info_dict, *args, **kwargs) return self.escape_outtmpl(outtmpl) % info_dict + 
@_catch_unsafe_extension_error def _prepare_filename(self, info_dict, *, outtmpl=None, tmpl_type=None): assert None in (outtmpl, tmpl_type), 'outtmpl and tmpl_type are mutually exclusive' if outtmpl is None: @@ -1602,7 +1623,7 @@ class YoutubeDL: while True: try: return func(self, *args, **kwargs) - except (DownloadCancelled, LazyList.IndexError, PagedList.IndexError): + except (CookieLoadError, DownloadCancelled, LazyList.IndexError, PagedList.IndexError): raise except ReExtractInfo as e: if e.expected: @@ -1925,6 +1946,8 @@ class YoutubeDL: 'playlist_title': ie_result.get('title'), 'playlist_uploader': ie_result.get('uploader'), 'playlist_uploader_id': ie_result.get('uploader_id'), + 'playlist_channel': ie_result.get('channel'), + 'playlist_channel_id': ie_result.get('channel_id'), **kwargs, } if strict: @@ -2170,9 +2193,8 @@ class YoutubeDL: or all(f.get('acodec') == 'none' for f in formats)), # OR, No formats with audio })) - def _default_format_spec(self, info_dict, download=True): - download = download and not self.params.get('simulate') - prefer_best = download and ( + def _default_format_spec(self, info_dict): + prefer_best = ( self.params['outtmpl']['default'] == '-' or info_dict.get('is_live') and not self.params.get('live_from_start')) @@ -2180,7 +2202,7 @@ class YoutubeDL: merger = FFmpegMergerPP(self) return merger.available and merger.can_merge() - if not prefer_best and download and not can_merge(): + if not prefer_best and not can_merge(): prefer_best = True formats = self._get_formats(info_dict) evaluate_formats = lambda spec: self._select_formats(formats, self.build_format_selector(spec)) @@ -2827,13 +2849,10 @@ class YoutubeDL: sanitize_string_field(fmt, 'format_id') sanitize_numeric_fields(fmt) fmt['url'] = sanitize_url(fmt['url']) - if fmt.get('ext') is None: - fmt['ext'] = determine_ext(fmt['url']).lower() + FormatSorter._fill_sorting_fields(fmt) if fmt['ext'] in ('aac', 'opus', 'mp3', 'flac', 'vorbis'): if fmt.get('acodec') is None: 
fmt['acodec'] = fmt['ext'] - if fmt.get('protocol') is None: - fmt['protocol'] = determine_protocol(fmt) if fmt.get('resolution') is None: fmt['resolution'] = self.format_resolution(fmt, default=None) if fmt.get('dynamic_range') is None and fmt.get('vcodec') != 'none': @@ -2939,7 +2958,7 @@ class YoutubeDL: continue if format_selector is None: - req_format = self._default_format_spec(info_dict, download=download) + req_format = self._default_format_spec(info_dict) self.write_debug(f'Default format spec: {req_format}') format_selector = self.build_format_selector(req_format) @@ -3149,11 +3168,12 @@ class YoutubeDL: if test: verbose = self.params.get('verbose') + quiet = self.params.get('quiet') or not verbose params = { 'test': True, - 'quiet': self.params.get('quiet') or not verbose, + 'quiet': quiet, 'verbose': verbose, - 'noprogress': not verbose, + 'noprogress': quiet, 'nopart': True, 'skip_unavailable_fragments': False, 'keep_fragments': False, @@ -3188,6 +3208,7 @@ class YoutubeDL: os.remove(file) return None + @_catch_unsafe_extension_error def process_info(self, info_dict): """Process a single resolved IE result. 
(Modifies it in-place)""" @@ -3555,6 +3576,8 @@ class YoutubeDL: def wrapper(*args, **kwargs): try: res = func(*args, **kwargs) + except CookieLoadError: + raise except UnavailableVideoError as e: self.report_error(e) except DownloadCancelled as e: @@ -4043,6 +4066,10 @@ class YoutubeDL: write_debug(f'Proxy map: {self.proxies}') write_debug(f'Request Handlers: {", ".join(rh.RH_NAME for rh in self._request_director.handlers.values())}') + if os.environ.get('YTDLP_NO_PLUGINS'): + write_debug('Plugins are forcibly disabled') + return + for plugin_type, plugins in {'Extractor': plugin_ies, 'Post-Processor': plugin_pps}.items(): display_list = ['{}{}'.format( klass.__name__, '' if klass.__name__ == name else f' as {name}') @@ -4058,17 +4085,6 @@ class YoutubeDL: if plugin_dirs: write_debug(f'Plugin directories: {plugin_dirs}') - # Not implemented - if False and self.params.get('call_home'): - ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode() - write_debug(f'Public IP address: {ipaddr}') - latest_version = self.urlopen( - 'https://yt-dl.org/latest/version').read().decode() - if version_tuple(latest_version) > version_tuple(__version__): - self.report_warning( - f'You are using an outdated version (newest version: {latest_version})! 
' - 'See https://yt-dl.org/update if you need help updating.') - @functools.cached_property def proxies(self): """Global proxy configuration""" @@ -4088,8 +4104,14 @@ class YoutubeDL: @functools.cached_property def cookiejar(self): """Global cookiejar instance""" - return load_cookies( - self.params.get('cookiefile'), self.params.get('cookiesfrombrowser'), self) + try: + return load_cookies( + self.params.get('cookiefile'), self.params.get('cookiesfrombrowser'), self) + except CookieLoadError as error: + cause = error.__context__ + # compat: <=py3.9: `traceback.format_exception` has a different signature + self.report_error(str(cause), tb=''.join(traceback.format_exception(None, cause, cause.__traceback__))) + raise @property def _opener(self): diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index c18af75891..9b3bd4acd2 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -1,8 +1,8 @@ import sys -if sys.version_info < (3, 8): +if sys.version_info < (3, 9): raise ImportError( - f'You are using an unsupported version of Python. Only Python versions 3.8 and above are supported by yt-dlp') # noqa: F541 + f'You are using an unsupported version of Python. 
Only Python versions 3.9 and above are supported by yt-dlp') # noqa: F541 __license__ = 'The Unlicense' @@ -15,7 +15,7 @@ import re import traceback from .compat import compat_os_name -from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS +from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS, CookieLoadError from .downloader.external import get_external_downloader from .extractor import list_extractor_classes from .extractor.adobepass import MSO_INFO @@ -34,6 +34,7 @@ from .postprocessor import ( ) from .update import Updater from .utils import ( + Config, NO_DEFAULT, POSTPROCESS_WHEN, DateRange, @@ -64,6 +65,7 @@ from .utils import ( write_string, ) from .utils.networking import std_headers +from .utils._utils import _UnsafeExtensionError from .YoutubeDL import YoutubeDL _IN_CLI = False @@ -234,6 +236,11 @@ def validate_options(opts): validate_regex('format sorting', f, FormatSorter.regex) # Postprocessor formats + if opts.convertsubtitles == 'none': + opts.convertsubtitles = None + if opts.convertthumbnails == 'none': + opts.convertthumbnails = None + validate_regex('merge output format', opts.merge_output_format, r'({0})(/({0}))*'.format('|'.join(map(re.escape, FFmpegMergerPP.SUPPORTED_EXTS)))) validate_regex('audio format', opts.audioformat, FFmpegExtractAudioPP.FORMAT_RE) @@ -467,7 +474,7 @@ def validate_options(opts): default_downloader = ed.get_basename() for policy in opts.color.values(): - if policy not in ('always', 'auto', 'no_color', 'never'): + if policy not in ('always', 'auto', 'auto-tty', 'no_color', 'no_color-tty', 'never'): raise ValueError(f'"{policy}" is not a valid color policy') warnings, deprecation_warnings = [], [] @@ -593,6 +600,13 @@ def validate_options(opts): if opts.ap_username is not None and opts.ap_password is None: opts.ap_password = getpass.getpass('Type TV provider account password and press [Return]: ') + # compat option changes global state destructively; only allow from cli + if 'allow-unsafe-ext' in 
opts.compat_opts: + warnings.append( + 'Using allow-unsafe-ext opens you up to potential attacks. ' + 'Use with great care!') + _UnsafeExtensionError.sanitize_extension = lambda x, prepend=False: x + return warnings, deprecation_warnings @@ -954,6 +968,11 @@ def _real_main(argv=None): parser, opts, all_urls, ydl_opts = parse_options(argv) + # HACK: Set the plugin dirs early on + # TODO(coletdjnz): remove when plugin globals system is implemented + if opts.plugin_dirs is not None: + Config._plugin_dirs = list(map(expand_path, opts.plugin_dirs)) + # Dump user agent if opts.dump_user_agent: ua = traverse_obj(opts.headers, 'User-Agent', casesense=False, default=std_headers['User-Agent']) @@ -1071,7 +1090,7 @@ def main(argv=None): _IN_CLI = True try: _exit(*variadic(_real_main(argv))) - except DownloadError: + except (CookieLoadError, DownloadError): _exit(1) except SameFileError as e: _exit(f'ERROR: {e}') diff --git a/yt_dlp/compat/compat_utils.py b/yt_dlp/compat/compat_utils.py index d62b7d0488..d8b3c45cd3 100644 --- a/yt_dlp/compat/compat_utils.py +++ b/yt_dlp/compat/compat_utils.py @@ -57,7 +57,7 @@ def passthrough_module(parent, child, allowed_attributes=(..., ), *, callback=la callback(attr) return ret - @functools.lru_cache(maxsize=None) + @functools.cache def from_child(attr): nonlocal child if attr not in allowed_attributes: diff --git a/yt_dlp/compat/functools.py b/yt_dlp/compat/functools.py index 96689575f6..c2e9e90279 100644 --- a/yt_dlp/compat/functools.py +++ b/yt_dlp/compat/functools.py @@ -5,8 +5,3 @@ from .compat_utils import passthrough_module passthrough_module(__name__, 'functools') del passthrough_module - -try: - _ = cache # >= 3.9 -except NameError: - cache = lru_cache(maxsize=None) diff --git a/yt_dlp/compat/imghdr.py b/yt_dlp/compat/imghdr.py index 5d64ab07bc..4ae173fdec 100644 --- a/yt_dlp/compat/imghdr.py +++ b/yt_dlp/compat/imghdr.py @@ -1,16 +1,22 @@ -tests = { - 'webp': lambda h: h[0:4] == b'RIFF' and h[8:] == b'WEBP', - 'png': lambda h: 
h[:8] == b'\211PNG\r\n\032\n', - 'jpeg': lambda h: h[6:10] in (b'JFIF', b'Exif'), - 'gif': lambda h: h[:6] in (b'GIF87a', b'GIF89a'), -} - - def what(file=None, h=None): """Detect format of image (Currently supports jpeg, png, webp, gif only) - Ref: https://github.com/python/cpython/blob/3.10/Lib/imghdr.py + Ref: https://github.com/python/cpython/blob/3.11/Lib/imghdr.py + Ref: https://www.w3.org/Graphics/JPEG/itu-t81.pdf """ if h is None: with open(file, 'rb') as f: h = f.read(12) - return next((type_ for type_, test in tests.items() if test(h)), None) + + if h.startswith(b'RIFF') and h.startswith(b'WEBP', 8): + return 'webp' + + if h.startswith(b'\x89PNG'): + return 'png' + + if h.startswith(b'\xFF\xD8\xFF'): + return 'jpeg' + + if h.startswith(b'GIF'): + return 'gif' + + return None diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 0850ad2600..4a69c576be 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -2,7 +2,9 @@ import base64 import collections import contextlib import datetime as dt +import functools import glob +import hashlib import http.cookiejar import http.cookies import io @@ -17,14 +19,12 @@ import tempfile import time import urllib.request from enum import Enum, auto -from hashlib import pbkdf2_hmac from .aes import ( aes_cbc_decrypt_bytes, aes_gcm_decrypt_and_verify_bytes, unpad_pkcs7, ) -from .compat import functools # isort: split from .compat import compat_os_name from .dependencies import ( _SECRETSTORAGE_UNAVAILABLE_REASON, @@ -34,6 +34,7 @@ from .dependencies import ( from .minicurses import MultilinePrinter, QuietMultilinePrinter from .utils import ( DownloadError, + YoutubeDLError, Popen, error_to_str, expand_path, @@ -86,24 +87,31 @@ def _create_progress_bar(logger): return printer +class CookieLoadError(YoutubeDLError): + pass + + def load_cookies(cookie_file, browser_specification, ydl): - cookie_jars = [] - if browser_specification is not None: - browser_name, profile, keyring, container = 
_parse_browser_specification(*browser_specification) - cookie_jars.append( - extract_cookies_from_browser(browser_name, profile, YDLLogger(ydl), keyring=keyring, container=container)) + try: + cookie_jars = [] + if browser_specification is not None: + browser_name, profile, keyring, container = _parse_browser_specification(*browser_specification) + cookie_jars.append( + extract_cookies_from_browser(browser_name, profile, YDLLogger(ydl), keyring=keyring, container=container)) - if cookie_file is not None: - is_filename = is_path_like(cookie_file) - if is_filename: - cookie_file = expand_path(cookie_file) + if cookie_file is not None: + is_filename = is_path_like(cookie_file) + if is_filename: + cookie_file = expand_path(cookie_file) - jar = YoutubeDLCookieJar(cookie_file) - if not is_filename or os.access(cookie_file, os.R_OK): - jar.load() - cookie_jars.append(jar) + jar = YoutubeDLCookieJar(cookie_file) + if not is_filename or os.access(cookie_file, os.R_OK): + jar.load() + cookie_jars.append(jar) - return _merge_cookie_jars(cookie_jars) + return _merge_cookie_jars(cookie_jars) + except Exception: + raise CookieLoadError('failed to load cookies') def extract_cookies_from_browser(browser_name, profile=None, logger=YDLLogger(), *, keyring=None, container=None): @@ -740,40 +748,38 @@ def _get_linux_desktop_environment(env, logger): xdg_current_desktop = env.get('XDG_CURRENT_DESKTOP', None) desktop_session = env.get('DESKTOP_SESSION', None) if xdg_current_desktop is not None: - xdg_current_desktop = xdg_current_desktop.split(':')[0].strip() - - if xdg_current_desktop == 'Unity': - if desktop_session is not None and 'gnome-fallback' in desktop_session: + for part in map(str.strip, xdg_current_desktop.split(':')): + if part == 'Unity': + if desktop_session is not None and 'gnome-fallback' in desktop_session: + return _LinuxDesktopEnvironment.GNOME + else: + return _LinuxDesktopEnvironment.UNITY + elif part == 'Deepin': + return _LinuxDesktopEnvironment.DEEPIN + elif 
part == 'GNOME': return _LinuxDesktopEnvironment.GNOME - else: - return _LinuxDesktopEnvironment.UNITY - elif xdg_current_desktop == 'Deepin': - return _LinuxDesktopEnvironment.DEEPIN - elif xdg_current_desktop == 'GNOME': - return _LinuxDesktopEnvironment.GNOME - elif xdg_current_desktop == 'X-Cinnamon': - return _LinuxDesktopEnvironment.CINNAMON - elif xdg_current_desktop == 'KDE': - kde_version = env.get('KDE_SESSION_VERSION', None) - if kde_version == '5': - return _LinuxDesktopEnvironment.KDE5 - elif kde_version == '6': - return _LinuxDesktopEnvironment.KDE6 - elif kde_version == '4': - return _LinuxDesktopEnvironment.KDE4 - else: - logger.info(f'unknown KDE version: "{kde_version}". Assuming KDE4') - return _LinuxDesktopEnvironment.KDE4 - elif xdg_current_desktop == 'Pantheon': - return _LinuxDesktopEnvironment.PANTHEON - elif xdg_current_desktop == 'XFCE': - return _LinuxDesktopEnvironment.XFCE - elif xdg_current_desktop == 'UKUI': - return _LinuxDesktopEnvironment.UKUI - elif xdg_current_desktop == 'LXQt': - return _LinuxDesktopEnvironment.LXQT - else: - logger.info(f'XDG_CURRENT_DESKTOP is set to an unknown value: "{xdg_current_desktop}"') + elif part == 'X-Cinnamon': + return _LinuxDesktopEnvironment.CINNAMON + elif part == 'KDE': + kde_version = env.get('KDE_SESSION_VERSION', None) + if kde_version == '5': + return _LinuxDesktopEnvironment.KDE5 + elif kde_version == '6': + return _LinuxDesktopEnvironment.KDE6 + elif kde_version == '4': + return _LinuxDesktopEnvironment.KDE4 + else: + logger.info(f'unknown KDE version: "{kde_version}". 
Assuming KDE4') + return _LinuxDesktopEnvironment.KDE4 + elif part == 'Pantheon': + return _LinuxDesktopEnvironment.PANTHEON + elif part == 'XFCE': + return _LinuxDesktopEnvironment.XFCE + elif part == 'UKUI': + return _LinuxDesktopEnvironment.UKUI + elif part == 'LXQt': + return _LinuxDesktopEnvironment.LXQT + logger.info(f'XDG_CURRENT_DESKTOP is set to an unknown value: "{xdg_current_desktop}"') elif desktop_session is not None: if desktop_session == 'deepin': @@ -1001,7 +1007,7 @@ def _get_windows_v10_key(browser_root, logger): def pbkdf2_sha1(password, salt, iterations, key_length): - return pbkdf2_hmac('sha1', password, salt, iterations, key_length) + return hashlib.pbkdf2_hmac('sha1', password, salt, iterations, key_length) def _decrypt_aes_cbc_multi(ciphertext, keys, logger, initialization_vector=b' ' * 16): @@ -1055,8 +1061,9 @@ def _decrypt_windows_dpapi(ciphertext, logger): ctypes.byref(blob_out), # pDataOut ) if not ret: - logger.warning('failed to decrypt with DPAPI', only_once=True) - return None + message = 'Failed to decrypt with DPAPI. 
See https://github.com/yt-dlp/yt-dlp/issues/10927 for more info' + logger.error(message) + raise DownloadError(message) # force exit result = ctypes.string_at(blob_out.pbData, blob_out.cbData) ctypes.windll.kernel32.LocalFree(blob_out.pbData) diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py index 8b45c671a0..6c1ec403c8 100644 --- a/yt_dlp/downloader/external.py +++ b/yt_dlp/downloader/external.py @@ -1,4 +1,5 @@ import enum +import functools import json import os import re @@ -9,7 +10,6 @@ import time import uuid from .fragment import FragmentFD -from ..compat import functools from ..networking import Request from ..postprocessor.ffmpeg import EXT_TO_OUT_FORMATS, FFmpegPostProcessor from ..utils import ( @@ -108,7 +108,7 @@ class ExternalFD(FragmentFD): return all(( not info_dict.get('to_stdout') or Features.TO_STDOUT in cls.SUPPORTED_FEATURES, '+' not in info_dict['protocol'] or Features.MULTIPLE_FORMATS in cls.SUPPORTED_FEATURES, - not traverse_obj(info_dict, ('hls_aes', ...), 'extra_param_to_segment_url'), + not traverse_obj(info_dict, ('hls_aes', ...), 'extra_param_to_segment_url', 'extra_param_to_key_url'), all(proto in cls.SUPPORTED_PROTOCOLS for proto in info_dict['protocol'].split('+')), )) @@ -508,7 +508,7 @@ class FFmpegFD(ExternalFD): env = None proxy = self.params.get('proxy') if proxy: - if not re.match(r'^[\da-zA-Z]+://', proxy): + if not re.match(r'[\da-zA-Z]+://', proxy): proxy = f'http://{proxy}' if proxy.startswith('socks'): @@ -559,7 +559,7 @@ class FFmpegFD(ExternalFD): selected_formats = info_dict.get('requested_formats') or [info_dict] for i, fmt in enumerate(selected_formats): - is_http = re.match(r'^https?://', fmt['url']) + is_http = re.match(r'https?://', fmt['url']) cookies = self.ydl.cookiejar.get_cookies_for_url(fmt['url']) if is_http else [] if cookies: args.extend(['-cookies', ''.join( diff --git a/yt_dlp/downloader/hls.py b/yt_dlp/downloader/hls.py index 9cb4f014c0..0a00d5dabb 100644 --- 
a/yt_dlp/downloader/hls.py +++ b/yt_dlp/downloader/hls.py @@ -160,10 +160,12 @@ class HlsFD(FragmentFD): extra_state = ctx.setdefault('extra_state', {}) format_index = info_dict.get('format_index') - extra_query = None - extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url') - if extra_param_to_segment_url: - extra_query = urllib.parse.parse_qs(extra_param_to_segment_url) + extra_segment_query = None + if extra_param_to_segment_url := info_dict.get('extra_param_to_segment_url'): + extra_segment_query = urllib.parse.parse_qs(extra_param_to_segment_url) + extra_key_query = None + if extra_param_to_key_url := info_dict.get('extra_param_to_key_url'): + extra_key_query = urllib.parse.parse_qs(extra_param_to_key_url) i = 0 media_sequence = 0 decrypt_info = {'METHOD': 'NONE'} @@ -190,8 +192,8 @@ class HlsFD(FragmentFD): if frag_index <= ctx['fragment_index']: continue frag_url = urljoin(man_url, line) - if extra_query: - frag_url = update_url_query(frag_url, extra_query) + if extra_segment_query: + frag_url = update_url_query(frag_url, extra_segment_query) fragments.append({ 'frag_index': frag_index, @@ -212,8 +214,8 @@ class HlsFD(FragmentFD): frag_index += 1 map_info = parse_m3u8_attributes(line[11:]) frag_url = urljoin(man_url, map_info.get('URI')) - if extra_query: - frag_url = update_url_query(frag_url, extra_query) + if extra_segment_query: + frag_url = update_url_query(frag_url, extra_segment_query) if map_info.get('BYTERANGE'): splitted_byte_range = map_info.get('BYTERANGE').split('@') @@ -244,8 +246,10 @@ class HlsFD(FragmentFD): decrypt_info['KEY'] = external_aes_key else: decrypt_info['URI'] = urljoin(man_url, decrypt_info['URI']) - if extra_query: - decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query) + if extra_key_query or extra_segment_query: + # Fall back to extra_segment_query to key for backwards compat + decrypt_info['URI'] = update_url_query( + decrypt_info['URI'], extra_key_query or extra_segment_query) if 
decrypt_url != decrypt_info['URI']: decrypt_info['KEY'] = None diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 9300c8824c..887b096056 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -76,6 +76,7 @@ from .aenetworks import ( ) from .aeonco import AeonCoIE from .afreecatv import ( + AfreecaTVCatchStoryIE, AfreecaTVIE, AfreecaTVLiveIE, AfreecaTVUserIE, @@ -216,6 +217,7 @@ from .bbc import ( BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE, ) +from .beacon import BeaconTvIE from .beatbump import ( BeatBumpPlaylistIE, BeatBumpVideoIE, @@ -361,7 +363,10 @@ from .ccc import ( ) from .ccma import CCMAIE from .cctv import CCTVIE -from .cda import CDAIE +from .cda import ( + CDAIE, + CDAFolderIE, +) from .cellebrite import CellebriteIE from .ceskatelevize import CeskaTelevizeIE from .cgtn import CGTNIE @@ -396,8 +401,6 @@ from .cmt import CMTIE from .cnbc import CNBCVideoIE from .cnn import ( CNNIE, - CNNArticleIE, - CNNBlogsIE, CNNIndonesiaIE, ) from .comedycentral import ( @@ -504,7 +507,6 @@ from .dhm import DHMIE from .digitalconcerthall import DigitalConcertHallIE from .digiteka import DigitekaIE from .discogs import DiscogsReleasePlaylistIE -from .discovery import DiscoveryIE from .disney import DisneyIE from .dispeak import DigitallySpeakingIE from .dlf import ( @@ -532,16 +534,12 @@ from .dplay import ( DiscoveryPlusIndiaShowIE, DiscoveryPlusItalyIE, DiscoveryPlusItalyShowIE, - DIYNetworkIE, DPlayIE, FoodNetworkIE, - GlobalCyclingNetworkPlusIE, GoDiscoveryIE, HGTVDeIE, HGTVUsaIE, InvestigationDiscoveryIE, - MotorTrendIE, - MotorTrendOnDemandIE, ScienceChannelIE, TravelChannelIE, ) @@ -734,6 +732,7 @@ from .genius import ( GeniusIE, GeniusLyricsIE, ) +from .germanupa import GermanupaIE from .getcourseru import ( GetCourseRuIE, GetCourseRuPlayerIE, @@ -780,6 +779,7 @@ from .gopro import GoProIE from .goshgay import GoshgayIE from .gotostage import GoToStageIE from .gputechconf import GPUTechConfIE +from 
.graspop import GraspopIE from .gronkh import ( GronkhFeedIE, GronkhIE, @@ -826,7 +826,10 @@ from .hungama import ( HungamaIE, HungamaSongIE, ) -from .huya import HuyaLiveIE +from .huya import ( + HuyaLiveIE, + HuyaVideoIE, +) from .hypem import HypemIE from .hypergryph import MonsterSirenHypergryphMusicIE from .hytale import HytaleIE @@ -943,11 +946,13 @@ from .khanacademy import ( KhanAcademyUnitIE, ) from .kick import ( + KickClipIE, KickIE, KickVODIE, ) from .kicker import KickerIE from .kickstarter import KickStarterIE +from .kika import KikaIE from .kinja import KinjaEmbedIE from .kinopoisk import KinoPoiskIE from .kommunetv import KommunetvIE @@ -970,6 +975,10 @@ from .la7 import ( LA7PodcastEpisodeIE, LA7PodcastIE, ) +from .laracasts import ( + LaracastsIE, + LaracastsPlaylistIE, +) from .lastfm import ( LastFMIE, LastFMPlaylistIE, @@ -986,6 +995,7 @@ from .lcp import ( LcpIE, LcpPlayIE, ) +from .learningonscreen import LearningOnScreenIE from .lecture2go import Lecture2GoIE from .lecturio import ( LecturioCourseIE, @@ -1034,10 +1044,7 @@ from .livestream import ( LivestreamShortenerIE, ) from .livestreamfails import LivestreamfailsIE -from .lnkgo import ( - LnkGoIE, - LnkIE, -) +from .lnk import LnkIE from .loom import ( LoomFolderIE, LoomIE, @@ -1114,12 +1121,15 @@ from .meipai import MeipaiIE from .melonvod import MelonVODIE from .metacritic import MetacriticIE from .mgtv import MGTVIE -from .microsoftembed import MicrosoftEmbedIE -from .microsoftstream import MicrosoftStreamIE -from .microsoftvirtualacademy import ( - MicrosoftVirtualAcademyCourseIE, - MicrosoftVirtualAcademyIE, +from .microsoftembed import ( + MicrosoftBuildIE, + MicrosoftEmbedIE, + MicrosoftLearnEpisodeIE, + MicrosoftLearnPlaylistIE, + MicrosoftLearnSessionIE, + MicrosoftMediusIE, ) +from .microsoftstream import MicrosoftStreamIE from .mildom import ( MildomClipIE, MildomIE, @@ -1159,6 +1169,7 @@ from .mlb import ( ) from .mlssoccer import MLSSoccerIE from .mocha import MochaVideoIE 
+from .mojevideo import MojevideoIE from .mojvideo import MojvideoIE from .monstercat import MonstercatIE from .motherless import ( @@ -1604,6 +1615,7 @@ from .qqmusic import ( QQMusicPlaylistIE, QQMusicSingerIE, QQMusicToplistIE, + QQMusicVideoIE, ) from .r7 import ( R7IE, @@ -1756,7 +1768,10 @@ from .rtve import ( RTVETelevisionIE, ) from .rtvs import RTVSIE -from .rtvslo import RTVSLOIE +from .rtvslo import ( + RTVSLOIE, + RTVSLOShowIE, +) from .rudovideo import RudoVideoIE from .rule34video import Rule34VideoIE from .rumble import ( @@ -1801,6 +1816,7 @@ from .screen9 import Screen9IE from .screencast import ScreencastIE from .screencastify import ScreencastifyIE from .screencastomatic import ScreencastOMaticIE +from .screenrec import ScreenRecIE from .scrippsnetworks import ( ScrippsNetworksIE, ScrippsNetworksWatchIE, @@ -1811,6 +1827,7 @@ from .scte import ( SCTECourseIE, ) from .sejmpl import SejmIE +from .sen import SenIE from .senalcolombia import SenalColombiaLiveIE from .senategov import ( SenateGovIE, @@ -1866,6 +1883,7 @@ from .slideshare import SlideshareIE from .slideslive import SlidesLiveIE from .slutload import SlutloadIE from .smotrim import SmotrimIE +from .snapchat import SnapchatSpotlightIE from .snotr import SnotrIE from .sohu import ( SohuIE, @@ -1926,6 +1944,10 @@ from .spreaker import ( ) from .springboardplatform import SpringboardPlatformIE from .sprout import SproutIE +from .sproutvideo import ( + SproutVideoIE, + VidsIoIE, +) from .srgssr import ( SRGSSRIE, SRGSSRPlayIE, @@ -2158,10 +2180,7 @@ from .tv5unis import ( TV5UnisVideoIE, ) from .tv24ua import TV24UAVideoIE -from .tva import ( - TVAIE, - QubIE, -) +from .tva import TVAIE from .tvanouvelles import ( TVANouvellesArticleIE, TVANouvellesIE, @@ -2301,6 +2320,7 @@ from .videomore import ( VideomoreVideoIE, ) from .videopress import VideoPressIE +from .vidflex import VidflexIE from .vidio import ( VidioIE, VidioLiveIE, @@ -2308,6 +2328,7 @@ from .vidio import ( ) from .vidlii import 
VidLiiIE from .vidly import VidlyIE +from .vidyard import VidyardIE from .viewlift import ( ViewLiftEmbedIE, ViewLiftIE, @@ -2373,6 +2394,10 @@ from .vrt import ( VrtNUIE, ) from .vtm import VTMIE +from .vtv import ( + VTVIE, + VTVGoIE, +) from .vuclip import VuClipIE from .vvvvid import ( VVVVIDIE, diff --git a/yt_dlp/extractor/abc.py b/yt_dlp/extractor/abc.py index 7518ba6f0d..7296be73b3 100644 --- a/yt_dlp/extractor/abc.py +++ b/yt_dlp/extractor/abc.py @@ -387,17 +387,27 @@ class ABCIViewShowSeriesIE(InfoExtractor): 'thumbnail': r're:^https?://cdn\.iview\.abc\.net\.au/thumbs/.*\.jpg$', }, 'playlist_count': 15, + 'skip': 'This program is not currently available in ABC iview', + }, { + 'url': 'https://iview.abc.net.au/show/inbestigators', + 'info_dict': { + 'id': '175343-1', + 'title': 'Series 1', + 'description': 'md5:b9976935a6450e5b78ce2a940a755685', + 'series': 'The Inbestigators', + 'season': 'Series 1', + 'thumbnail': r're:^https?://cdn\.iview\.abc\.net\.au/thumbs/.+\.jpg', + }, + 'playlist_count': 17, }] def _real_extract(self, url): show_id = self._match_id(url) webpage = self._download_webpage(url, show_id) - webpage_data = self._search_regex( - r'window\.__INITIAL_STATE__\s*=\s*[\'"](.+?)[\'"]\s*;', - webpage, 'initial state') - video_data = self._parse_json( - unescapeHTML(webpage_data).encode().decode('unicode_escape'), show_id) - video_data = video_data['route']['pageData']['_embedded'] + video_data = self._search_json( + r'window\.__INITIAL_STATE__\s*=\s*[\'"]', webpage, 'initial state', show_id, + transform_source=lambda x: x.encode().decode('unicode_escape'), + end_pattern=r'[\'"]\s*;')['route']['pageData']['_embedded'] highlight = try_get(video_data, lambda x: x['highlightVideo']['shareUrl']) if not self._yes_playlist(show_id, bool(highlight), video_label='highlight video'): diff --git a/yt_dlp/extractor/abematv.py b/yt_dlp/extractor/abematv.py index 293a6c40e0..66ab083fe0 100644 --- a/yt_dlp/extractor/abematv.py +++ b/yt_dlp/extractor/abematv.py 
@@ -9,12 +9,12 @@ import re import struct import time import urllib.parse -import urllib.request -import urllib.response import uuid from .common import InfoExtractor from ..aes import aes_ecb_decrypt +from ..networking import RequestHandler, Response +from ..networking.exceptions import TransportError from ..utils import ( ExtractorError, OnDemandPagedList, @@ -26,37 +26,36 @@ from ..utils import ( traverse_obj, update_url_query, ) -from ..utils.networking import clean_proxies - - -def add_opener(ydl, handler): # FIXME: Create proper API in .networking - """Add a handler for opening URLs, like _download_webpage""" - # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426 - # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605 - rh = ydl._request_director.handlers['Urllib'] - if 'abematv-license' in rh._SUPPORTED_URL_SCHEMES: - return - headers = ydl.params['http_headers'].copy() - proxies = ydl.proxies.copy() - clean_proxies(proxies, headers) - opener = rh._get_instance(cookiejar=ydl.cookiejar, proxies=proxies) - assert isinstance(opener, urllib.request.OpenerDirector) - opener.add_handler(handler) - rh._SUPPORTED_URL_SCHEMES = (*rh._SUPPORTED_URL_SCHEMES, 'abematv-license') - - -class AbemaLicenseHandler(urllib.request.BaseHandler): - handler_order = 499 - STRTABLE = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz' - HKEY = b'3AF0298C219469522A313570E8583005A642E73EDD58E3EA2FB7339D3DF1597E' - - def __init__(self, ie: 'AbemaTVIE'): - # the protocol that this should really handle is 'abematv-license://' - # abematv_license_open is just a placeholder for development purposes - # ref. 
https://github.com/python/cpython/blob/f4c03484da59049eb62a9bf7777b963e2267d187/Lib/urllib/request.py#L510 - setattr(self, 'abematv-license_open', getattr(self, 'abematv_license_open', None)) + + +class AbemaLicenseRH(RequestHandler): + _SUPPORTED_URL_SCHEMES = ('abematv-license',) + _SUPPORTED_PROXY_SCHEMES = None + _SUPPORTED_FEATURES = None + RH_NAME = 'abematv_license' + + _STRTABLE = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz' + _HKEY = b'3AF0298C219469522A313570E8583005A642E73EDD58E3EA2FB7339D3DF1597E' + + def __init__(self, *, ie: 'AbemaTVIE', **kwargs): + super().__init__(**kwargs) self.ie = ie + def _send(self, request): + url = request.url + ticket = urllib.parse.urlparse(url).netloc + + try: + response_data = self._get_videokey_from_ticket(ticket) + except ExtractorError as e: + raise TransportError(cause=e.cause) from e + except (IndexError, KeyError, TypeError) as e: + raise TransportError(cause=repr(e)) from e + + return Response( + io.BytesIO(response_data), url, + headers={'Content-Length': str(len(response_data))}) + def _get_videokey_from_ticket(self, ticket): to_show = self.ie.get_param('verbose', False) media_token = self.ie._get_media_token(to_show=to_show) @@ -72,25 +71,17 @@ class AbemaLicenseHandler(urllib.request.BaseHandler): 'Content-Type': 'application/json', }) - res = decode_base_n(license_response['k'], table=self.STRTABLE) + res = decode_base_n(license_response['k'], table=self._STRTABLE) encvideokey = bytes_to_intlist(struct.pack('>QQ', res >> 64, res & 0xffffffffffffffff)) h = hmac.new( - binascii.unhexlify(self.HKEY), + binascii.unhexlify(self._HKEY), (license_response['cid'] + self.ie._DEVICE_ID).encode(), digestmod=hashlib.sha256) enckey = bytes_to_intlist(h.digest()) return intlist_to_bytes(aes_ecb_decrypt(encvideokey, enckey)) - def abematv_license_open(self, url): - url = url.get_full_url() if isinstance(url, urllib.request.Request) else url - ticket = urllib.parse.urlparse(url).netloc - response_data = 
self._get_videokey_from_ticket(ticket) - return urllib.response.addinfourl(io.BytesIO(response_data), headers={ - 'Content-Length': str(len(response_data)), - }, url=url, code=200) - class AbemaTVBaseIE(InfoExtractor): _NETRC_MACHINE = 'abematv' @@ -139,7 +130,7 @@ class AbemaTVBaseIE(InfoExtractor): if self._USERTOKEN: return self._USERTOKEN - add_opener(self._downloader, AbemaLicenseHandler(self)) + self._downloader._request_director.add_handler(AbemaLicenseRH(ie=self, logger=None)) username, _ = self._get_login_info() auth_cache = username and self.cache.load(self._NETRC_MACHINE, username, min_ver='2024.01.19') @@ -368,6 +359,7 @@ class AbemaTVIE(AbemaTVBaseIE): info['episode_number'] = epis if epis < 2000 else None is_live, m3u8_url = False, None + availability = 'public' if video_type == 'now-on-air': is_live = True channel_url = 'https://api.abema.io/v1/channels' @@ -385,10 +377,10 @@ class AbemaTVIE(AbemaTVBaseIE): f'https://api.abema.io/v1/video/programs/{video_id}', video_id, note='Checking playability', headers=headers) - ondemand_types = traverse_obj(api_response, ('terms', ..., 'onDemandType')) - if 3 not in ondemand_types: + if not traverse_obj(api_response, ('label', 'free', {bool})): # cannot acquire decryption key for these streams self.report_warning('This is a premium-only stream') + availability = 'premium_only' info.update(traverse_obj(api_response, { 'series': ('series', 'title'), 'season': ('season', 'name'), @@ -408,6 +400,7 @@ class AbemaTVIE(AbemaTVBaseIE): headers=headers) if not traverse_obj(api_response, ('slot', 'flags', 'timeshiftFree'), default=False): self.report_warning('This is a premium-only stream') + availability = 'premium_only' m3u8_url = f'https://vod-abematv.akamaized.net/slot/{video_id}/playlist.m3u8' else: @@ -425,6 +418,7 @@ class AbemaTVIE(AbemaTVBaseIE): 'description': description, 'formats': formats, 'is_live': is_live, + 'availability': availability, }) return info diff --git a/yt_dlp/extractor/academicearth.py 
b/yt_dlp/extractor/academicearth.py index d9691cb5c6..b997a02885 100644 --- a/yt_dlp/extractor/academicearth.py +++ b/yt_dlp/extractor/academicearth.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class AcademicEarthCourseIE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?academicearth\.org/playlists/(?P[^?#/]+)' + _VALID_URL = r'https?://(?:www\.)?academicearth\.org/playlists/(?P[^?#/]+)' IE_NAME = 'AcademicEarth:Course' _TEST = { 'url': 'http://academicearth.org/playlists/laws-of-nature/', diff --git a/yt_dlp/extractor/adn.py b/yt_dlp/extractor/adn.py index 7be990b9cf..c8a2613754 100644 --- a/yt_dlp/extractor/adn.py +++ b/yt_dlp/extractor/adn.py @@ -16,6 +16,7 @@ from ..utils import ( float_or_none, int_or_none, intlist_to_bytes, + join_nonempty, long_to_bytes, parse_iso8601, pkcs1pad, @@ -48,9 +49,9 @@ class ADNBaseIE(InfoExtractor): class ADNIE(ADNBaseIE): - _VALID_URL = r'https?://(?:www\.)?(?:animation|anime)digitalnetwork\.(?Pfr|de)/video/[^/?#]+/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?animationdigitalnetwork\.com/(?:(?Pde)/)?video/[^/?#]+/(?P\d+)' _TESTS = [{ - 'url': 'https://animationdigitalnetwork.fr/video/fruits-basket/9841-episode-1-a-ce-soir', + 'url': 'https://animationdigitalnetwork.com/video/558-fruits-basket/9841-episode-1-a-ce-soir', 'md5': '1c9ef066ceb302c86f80c2b371615261', 'info_dict': { 'id': '9841', @@ -70,10 +71,7 @@ class ADNIE(ADNBaseIE): }, 'skip': 'Only available in French and German speaking Europe', }, { - 'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites', - 'only_matching': True, - }, { - 'url': 'https://animationdigitalnetwork.de/video/the-eminence-in-shadow/23550-folge-1', + 'url': 'https://animationdigitalnetwork.com/de/video/973-the-eminence-in-shadow/23550-folge-1', 'md5': '5c5651bf5791fa6fcd7906012b9d94e8', 'info_dict': { 'id': '23550', @@ -166,7 +164,7 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' 'username': username, })) or 
{}).get('accessToken') if access_token: - self._HEADERS = {'authorization': 'Bearer ' + access_token} + self._HEADERS['Authorization'] = f'Bearer {access_token}' except ExtractorError as e: message = None if isinstance(e.cause, HTTPError) and e.cause.status == 401: @@ -177,6 +175,7 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' def _real_extract(self, url): lang, video_id = self._match_valid_url(url).group('lang', 'id') + self._HEADERS['X-Target-Distribution'] = lang or 'fr' video_base_url = self._PLAYER_BASE_URL + f'video/{video_id}/' player = self._download_json( video_base_url + 'configuration', video_id, @@ -217,7 +216,6 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' links_data = self._download_json( links_url, video_id, 'Downloading links JSON metadata', headers={ 'X-Player-Token': authorization, - 'X-Target-Distribution': lang, **self._HEADERS, }, query={ 'freeWithAds': 'true', @@ -256,6 +254,7 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' load_balancer_data = self._download_json( load_balancer_url, video_id, f'Downloading {format_id} {quality} JSON metadata', + headers=self._HEADERS, fatal=False) or {} m3u8_url = load_balancer_data.get('location') if not m3u8_url: @@ -276,7 +275,7 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' video = (self._download_json( self._API_BASE_URL + f'video/{video_id}', video_id, - 'Downloading additional video metadata', fatal=False) or {}).get('video') or {} + 'Downloading additional video metadata', fatal=False, headers=self._HEADERS) or {}).get('video') or {} show = video.get('show') or {} return { @@ -298,9 +297,9 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' class ADNSeasonIE(ADNBaseIE): - _VALID_URL = r'https?://(?:www\.)?(?:animation|anime)digitalnetwork\.(?Pfr|de)/video/(?P[^/?#]+)/?(?:$|[#?])' + _VALID_URL = 
r'https?://(?:www\.)?animationdigitalnetwork\.com/(?:(?Pde)/)?video/(?P\d+)[^/?#]*/?(?:$|[#?])' _TESTS = [{ - 'url': 'https://animationdigitalnetwork.fr/video/tokyo-mew-mew-new', + 'url': 'https://animationdigitalnetwork.com/video/911-tokyo-mew-mew-new', 'playlist_count': 12, 'info_dict': { 'id': '911', @@ -311,24 +310,22 @@ class ADNSeasonIE(ADNBaseIE): def _real_extract(self, url): lang, video_show_slug = self._match_valid_url(url).group('lang', 'id') + self._HEADERS['X-Target-Distribution'] = lang or 'fr' show = self._download_json( f'{self._API_BASE_URL}show/{video_show_slug}/', video_show_slug, 'Downloading show JSON metadata', headers=self._HEADERS)['show'] show_id = str(show['id']) episodes = self._download_json( f'{self._API_BASE_URL}video/show/{show_id}', video_show_slug, - 'Downloading episode list', headers={ - 'X-Target-Distribution': lang, - **self._HEADERS, - }, query={ + 'Downloading episode list', headers=self._HEADERS, query={ 'order': 'asc', 'limit': '-1', }) def entries(): for episode_id in traverse_obj(episodes, ('videos', ..., 'id', {str_or_none})): - yield self.url_result( - f'https://animationdigitalnetwork.{lang}/video/{video_show_slug}/{episode_id}', - ADNIE, episode_id) + yield self.url_result(join_nonempty( + 'https://animationdigitalnetwork.com', lang, 'video', + video_show_slug, episode_id, delim='/'), ADNIE, episode_id) return self.playlist_result(entries(), show_id, show.get('title')) diff --git a/yt_dlp/extractor/adobepass.py b/yt_dlp/extractor/adobepass.py index eb7e597e52..7cc15ec7b6 100644 --- a/yt_dlp/extractor/adobepass.py +++ b/yt_dlp/extractor/adobepass.py @@ -1355,6 +1355,7 @@ MSO_INFO = { class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor _SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s' _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0' + _MODERN_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; rv:131.0) 
Gecko/20100101 Firefox/131.0' _MVPD_CACHE = 'ap-mvpd' _DOWNLOADING_LOGIN_PAGE = 'Downloading Provider Login Page' @@ -1454,7 +1455,11 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should en 'no_iframe': 'false', 'domain_name': 'adobe.com', 'redirect_url': url, - }) + }, headers={ + # yt-dlp's default user-agent is usually too old for Comcast_SSO + # See: https://github.com/yt-dlp/yt-dlp/issues/10848 + 'User-Agent': self._MODERN_USER_AGENT, + } if mso_id == 'Comcast_SSO' else None) elif not self._cookies_passed: raise_mvpd_required() diff --git a/yt_dlp/extractor/afreecatv.py b/yt_dlp/extractor/afreecatv.py index bcfb02cb95..83e510d1a2 100644 --- a/yt_dlp/extractor/afreecatv.py +++ b/yt_dlp/extractor/afreecatv.py @@ -1,6 +1,7 @@ import functools from .common import InfoExtractor +from ..networking import Request from ..utils import ( ExtractorError, OnDemandPagedList, @@ -32,21 +33,21 @@ class AfreecaTVBaseIE(InfoExtractor): } response = self._download_json( - 'https://login.afreecatv.com/app/LoginAction.php', None, + 'https://login.sooplive.co.kr/app/LoginAction.php', None, 'Logging in', data=urlencode_postdata(login_form)) _ERRORS = { -4: 'Your account has been suspended due to a violation of our terms and policies.', - -5: 'https://member.afreecatv.com/app/user_delete_progress.php', - -6: 'https://login.afreecatv.com/membership/changeMember.php', - -8: "Hello! AfreecaTV here.\nThe username you have entered belongs to \n an account that requires a legal guardian's consent. \nIf you wish to use our services without restriction, \nplease make sure to go through the necessary verification process.", - -9: 'https://member.afreecatv.com/app/pop_login_block.php', - -11: 'https://login.afreecatv.com/afreeca/second_login.php', - -12: 'https://member.afreecatv.com/app/user_security.php', + -5: 'https://member.sooplive.co.kr/app/user_delete_progress.php', + -6: 'https://login.sooplive.co.kr/membership/changeMember.php', + -8: "Hello! 
Soop here.\nThe username you have entered belongs to \n an account that requires a legal guardian's consent. \nIf you wish to use our services without restriction, \nplease make sure to go through the necessary verification process.", + -9: 'https://member.sooplive.co.kr/app/pop_login_block.php', + -11: 'https://login.sooplive.co.kr/afreeca/second_login.php', + -12: 'https://member.sooplive.co.kr/app/user_security.php', 0: 'The username does not exist or you have entered the wrong password.', -1: 'The username does not exist or you have entered the wrong password.', -3: 'You have entered your username/password incorrectly.', - -7: 'You cannot use your Global AfreecaTV account to access Korean AfreecaTV.', + -7: 'You cannot use your Global Soop account to access Korean Soop.', -10: 'Sorry for the inconvenience. \nYour account has been blocked due to an unauthorized access. \nPlease contact our Help Center for assistance.', -32008: 'You have failed to log in. Please contact our Help Center.', } @@ -58,71 +59,42 @@ class AfreecaTVBaseIE(InfoExtractor): f'Unable to login: {self.IE_NAME} said: {error}', expected=True) + def _call_api(self, endpoint, display_id, data=None, headers=None, query=None): + return self._download_json(Request( + f'https://api.m.sooplive.co.kr/{endpoint}', + data=data, headers=headers, query=query, + extensions={'legacy_ssl': True}), display_id, + 'Downloading API JSON', 'Unable to download API JSON') + class AfreecaTVIE(AfreecaTVBaseIE): - IE_NAME = 'afreecatv' - IE_DESC = 'afreecatv.com' - _VALID_URL = r'''(?x) - https?:// - (?: - (?:(?:live|afbbs|www)\.)?afreeca(?:tv)?\.com(?::\d+)? 
- (?: - /app/(?:index|read_ucc_bbs)\.cgi| - /player/[Pp]layer\.(?:swf|html) - )\?.*?\bnTitleNo=| - vod\.afreecatv\.com/(PLAYER/STATION|player)/ - ) - (?P\d+) - ''' + IE_NAME = 'soop' + IE_DESC = 'sooplive.co.kr' + _VALID_URL = r'https?://vod\.(?:sooplive\.co\.kr|afreecatv\.com)/(?:PLAYER/STATION|player)/(?P\d+)/?(?:$|[?#&])' _TESTS = [{ - 'url': 'http://live.afreecatv.com:8079/app/index.cgi?szType=read_ucc_bbs&szBjId=dailyapril&nStationNo=16711924&nBbsNo=18605867&nTitleNo=36164052&szSkin=', - 'md5': 'f72c89fe7ecc14c1b5ce506c4996046e', + 'url': 'https://vod.sooplive.co.kr/player/96753363', 'info_dict': { - 'id': '36164052', + 'id': '20230108_9FF5BEE1_244432674_1', 'ext': 'mp4', - 'title': '데일리 에이프릴 요정들의 시상식!', - 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', - 'uploader': 'dailyapril', - 'uploader_id': 'dailyapril', - 'upload_date': '20160503', + 'uploader_id': 'rlantnghks', + 'uploader': '페이즈으', + 'duration': 10840, + 'thumbnail': r're:https?://videoimg\.sooplive\.co/.kr/.+', + 'upload_date': '20230108', + 'timestamp': 1673218805, + 'title': '젠지 페이즈', }, - 'skip': 'Video is gone', - }, { - 'url': 'http://afbbs.afreecatv.com:8080/app/read_ucc_bbs.cgi?nStationNo=16711924&nTitleNo=36153164&szBjId=dailyapril&nBbsNo=18605867', - 'info_dict': { - 'id': '36153164', - 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'", - 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', - 'uploader': 'dailyapril', - 'uploader_id': 'dailyapril', + 'params': { + 'skip_download': True, }, - 'playlist_count': 2, - 'playlist': [{ - 'md5': 'd8b7c174568da61d774ef0203159bf97', - 'info_dict': { - 'id': '36153164_1', - 'ext': 'mp4', - 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'", - 'upload_date': '20160502', - }, - }, { - 'md5': '58f2ce7f6044e34439ab2d50612ab02b', - 'info_dict': { - 'id': '36153164_2', - 'ext': 'mp4', - 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'", - 'upload_date': '20160502', - }, - }], - 'skip': 'Video is gone', }, { # non standard key - 'url': 
'http://vod.afreecatv.com/PLAYER/STATION/20515605', + 'url': 'http://vod.sooplive.co.kr/PLAYER/STATION/20515605', 'info_dict': { 'id': '20170411_BE689A0E_190960999_1_2_h', 'ext': 'mp4', 'title': '혼자사는여자집', - 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', + 'thumbnail': r're:https?://(?:video|st)img\.sooplive\.co\.kr/.+', 'uploader': '♥이슬이', 'uploader_id': 'dasl8121', 'upload_date': '20170411', @@ -134,12 +106,12 @@ class AfreecaTVIE(AfreecaTVBaseIE): }, }, { # adult content - 'url': 'https://vod.afreecatv.com/player/97267690', + 'url': 'https://vod.sooplive.co.kr/player/97267690', 'info_dict': { 'id': '20180327_27901457_202289533_1', 'ext': 'mp4', 'title': '[생]빨개요♥ (part 1)', - 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', + 'thumbnail': r're:https?://(?:video|st)img\.sooplive\.co\.kr/.+', 'uploader': '[SA]서아', 'uploader_id': 'bjdyrksu', 'upload_date': '20180327', @@ -149,44 +121,25 @@ class AfreecaTVIE(AfreecaTVBaseIE): 'skip_download': True, }, 'skip': 'The VOD does not exist', - }, { - 'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652', - 'only_matching': True, - }, { - 'url': 'https://vod.afreecatv.com/player/96753363', - 'info_dict': { - 'id': '20230108_9FF5BEE1_244432674_1', - 'ext': 'mp4', - 'uploader_id': 'rlantnghks', - 'uploader': '페이즈으', - 'duration': 10840, - 'thumbnail': r're:https?://videoimg\.afreecatv\.com/.+', - 'upload_date': '20230108', - 'timestamp': 1673218805, - 'title': '젠지 페이즈', - }, - 'params': { - 'skip_download': True, - }, }, { # adult content - 'url': 'https://vod.afreecatv.com/player/70395877', + 'url': 'https://vod.sooplive.co.kr/player/70395877', 'only_matching': True, }, { # subscribers only - 'url': 'https://vod.afreecatv.com/player/104647403', + 'url': 'https://vod.sooplive.co.kr/player/104647403', 'only_matching': True, }, { # private - 'url': 'https://vod.afreecatv.com/player/81669846', + 'url': 
'https://vod.sooplive.co.kr/player/81669846', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - data = self._download_json( - 'https://api.m.afreecatv.com/station/video/a/view', video_id, - headers={'Referer': url}, data=urlencode_postdata({ + data = self._call_api( + 'station/video/a/view', video_id, headers={'Referer': url}, + data=urlencode_postdata({ 'nTitleNo': video_id, 'nApiLevel': 10, }))['data'] @@ -253,12 +206,49 @@ class AfreecaTVIE(AfreecaTVBaseIE): return self.playlist_result(entries, video_id, multi_video=True, **common_info) +class AfreecaTVCatchStoryIE(AfreecaTVBaseIE): + IE_NAME = 'soop:catchstory' + IE_DESC = 'sooplive.co.kr catch story' + _VALID_URL = r'https?://vod\.(?:sooplive\.co\.kr|afreecatv\.com)/player/(?P\d+)/catchstory' + _TESTS = [{ + 'url': 'https://vod.sooplive.co.kr/player/103247/catchstory', + 'info_dict': { + 'id': '103247', + }, + 'playlist_count': 2, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + data = self._call_api( + 'catchstory/a/view', video_id, headers={'Referer': url}, + query={'aStoryListIdx': '', 'nStoryIdx': video_id}) + + return self.playlist_result(self._entries(data), video_id) + + @staticmethod + def _entries(data): + # 'files' is always a list with 1 element + yield from traverse_obj(data, ( + 'data', lambda _, v: v['story_type'] == 'catch', + 'catch_list', lambda _, v: v['files'][0]['file'], { + 'id': ('files', 0, 'file_info_key', {str}), + 'url': ('files', 0, 'file', {url_or_none}), + 'duration': ('files', 0, 'duration', {functools.partial(int_or_none, scale=1000)}), + 'title': ('title', {str}), + 'uploader': ('writer_nick', {str}), + 'uploader_id': ('writer_id', {str}), + 'thumbnail': ('thumb', {url_or_none}), + 'timestamp': ('write_timestamp', {int_or_none}), + })) + + class AfreecaTVLiveIE(AfreecaTVBaseIE): - IE_NAME = 'afreecatv:live' - IE_DESC = 'afreecatv.com livestreams' - _VALID_URL = 
r'https?://play\.afreeca(?:tv)?\.com/(?P[^/]+)(?:/(?P\d+))?' + IE_NAME = 'soop:live' + IE_DESC = 'sooplive.co.kr livestreams' + _VALID_URL = r'https?://play\.(?:sooplive\.co\.kr|afreecatv\.com)/(?P[^/?#]+)(?:/(?P\d+))?' _TESTS = [{ - 'url': 'https://play.afreecatv.com/pyh3646/237852185', + 'url': 'https://play.sooplive.co.kr/pyh3646/237852185', 'info_dict': { 'id': '237852185', 'ext': 'mp4', @@ -270,30 +260,30 @@ class AfreecaTVLiveIE(AfreecaTVBaseIE): }, 'skip': 'Livestream has ended', }, { - 'url': 'https://play.afreecatv.com/pyh3646/237852185', + 'url': 'https://play.sooplive.co.kr/pyh3646/237852185', 'only_matching': True, }, { - 'url': 'https://play.afreecatv.com/pyh3646', + 'url': 'https://play.sooplive.co.kr/pyh3646', 'only_matching': True, }] - _LIVE_API_URL = 'https://live.afreecatv.com/afreeca/player_live_api.php' + _LIVE_API_URL = 'https://live.sooplive.co.kr/afreeca/player_live_api.php' _WORKING_CDNS = [ - 'gcp_cdn', # live-global-cdn-v02.afreecatv.com - 'gs_cdn_pc_app', # pc-app.stream.afreecatv.com - 'gs_cdn_mobile_web', # mobile-web.stream.afreecatv.com - 'gs_cdn_pc_web', # pc-web.stream.afreecatv.com + 'gcp_cdn', # live-global-cdn-v02.sooplive.co.kr + 'gs_cdn_pc_app', # pc-app.stream.sooplive.co.kr + 'gs_cdn_mobile_web', # mobile-web.stream.sooplive.co.kr + 'gs_cdn_pc_web', # pc-web.stream.sooplive.co.kr ] _BAD_CDNS = [ 'gs_cdn', # chromecast.afreeca.gscdn.com (cannot resolve) - 'gs_cdn_chromecast', # chromecast.stream.afreecatv.com (HTTP Error 400) - 'azure_cdn', # live-global-cdn-v01.afreecatv.com (cannot resolve) - 'aws_cf', # live-global-cdn-v03.afreecatv.com (cannot resolve) - 'kt_cdn', # kt.stream.afreecatv.com (HTTP Error 400) + 'gs_cdn_chromecast', # chromecast.stream.sooplive.co.kr (HTTP Error 400) + 'azure_cdn', # live-global-cdn-v01.sooplive.co.kr (cannot resolve) + 'aws_cf', # live-global-cdn-v03.sooplive.co.kr (cannot resolve) + 'kt_cdn', # kt.stream.sooplive.co.kr (HTTP Error 400) ] def _extract_formats(self, channel_info, 
broadcast_no, aid): - stream_base_url = channel_info.get('RMD') or 'https://livestream-manager.afreecatv.com' + stream_base_url = channel_info.get('RMD') or 'https://livestream-manager.sooplive.co.kr' # If user has not passed CDN IDs, try API-provided CDN ID followed by other working CDN IDs default_cdn_ids = orderedSet([ @@ -313,7 +303,7 @@ class AfreecaTVLiveIE(AfreecaTVBaseIE): try: return self._extract_m3u8_formats( m3u8_url, broadcast_no, 'mp4', m3u8_id='hls', query={'aid': aid}, - headers={'Referer': 'https://play.afreecatv.com/'}) + headers={'Referer': 'https://play.sooplive.co.kr/'}) except ExtractorError as e: if attempt == len(cdn_ids): raise @@ -329,7 +319,13 @@ class AfreecaTVLiveIE(AfreecaTVBaseIE): broadcaster_id = channel_info.get('BJID') or broadcaster_id broadcast_no = channel_info.get('BNO') or broadcast_no if not broadcast_no: - raise UserNotLive(video_id=broadcaster_id) + result = channel_info.get('RESULT') + if result == 0: + raise UserNotLive(video_id=broadcaster_id) + elif result == -6: + self.raise_login_required( + 'This channel is streaming for subscribers only', method='password') + raise ExtractorError('Unable to extract broadcast number') password = self.get_param('videopassword') if channel_info.get('BPWD') == 'Y' and password is None: @@ -358,7 +354,7 @@ class AfreecaTVLiveIE(AfreecaTVBaseIE): formats = self._extract_formats(channel_info, broadcast_no, aid) station_info = traverse_obj(self._download_json( - 'https://st.afreecatv.com/api/get_station_status.php', broadcast_no, + 'https://st.sooplive.co.kr/api/get_station_status.php', broadcast_no, 'Downloading channel metadata', 'Unable to download channel metadata', query={'szBjId': broadcaster_id}, fatal=False), {dict}) or {} @@ -374,11 +370,11 @@ class AfreecaTVLiveIE(AfreecaTVBaseIE): } -class AfreecaTVUserIE(InfoExtractor): - IE_NAME = 'afreecatv:user' - _VALID_URL = r'https?://bj\.afreeca(?:tv)?\.com/(?P[^/]+)/vods/?(?P[^/]+)?' 
+class AfreecaTVUserIE(AfreecaTVBaseIE): + IE_NAME = 'soop:user' + _VALID_URL = r'https?://ch\.(?:sooplive\.co\.kr|afreecatv\.com)/(?P[^/?#]+)/vods/?(?P[^/?#]+)?' _TESTS = [{ - 'url': 'https://bj.afreecatv.com/ryuryu24/vods/review', + 'url': 'https://ch.sooplive.co.kr/ryuryu24/vods/review', 'info_dict': { '_type': 'playlist', 'id': 'ryuryu24', @@ -386,7 +382,7 @@ class AfreecaTVUserIE(InfoExtractor): }, 'playlist_count': 218, }, { - 'url': 'https://bj.afreecatv.com/parang1995/vods/highlight', + 'url': 'https://ch.sooplive.co.kr/parang1995/vods/highlight', 'info_dict': { '_type': 'playlist', 'id': 'parang1995', @@ -394,7 +390,7 @@ class AfreecaTVUserIE(InfoExtractor): }, 'playlist_count': 997, }, { - 'url': 'https://bj.afreecatv.com/ryuryu24/vods', + 'url': 'https://ch.sooplive.co.kr/ryuryu24/vods', 'info_dict': { '_type': 'playlist', 'id': 'ryuryu24', @@ -402,7 +398,7 @@ class AfreecaTVUserIE(InfoExtractor): }, 'playlist_count': 221, }, { - 'url': 'https://bj.afreecatv.com/ryuryu24/vods/balloonclip', + 'url': 'https://ch.sooplive.co.kr/ryuryu24/vods/balloonclip', 'info_dict': { '_type': 'playlist', 'id': 'ryuryu24', @@ -414,12 +410,12 @@ class AfreecaTVUserIE(InfoExtractor): def _fetch_page(self, user_id, user_type, page): page += 1 - info = self._download_json(f'https://bjapi.afreecatv.com/api/{user_id}/vods/{user_type}', user_id, + info = self._download_json(f'https://chapi.sooplive.co.kr/api/{user_id}/vods/{user_type}', user_id, query={'page': page, 'per_page': self._PER_PAGE, 'orderby': 'reg_date'}, note=f'Downloading {user_type} video page {page}') for item in info['data']: yield self.url_result( - f'https://vod.afreecatv.com/player/{item["title_no"]}/', AfreecaTVIE, item['title_no']) + f'https://vod.sooplive.co.kr/player/{item["title_no"]}/', AfreecaTVIE, item['title_no']) def _real_extract(self, url): user_id, user_type = self._match_valid_url(url).group('id', 'slug_type') diff --git a/yt_dlp/extractor/applepodcasts.py b/yt_dlp/extractor/applepodcasts.py 
index bd301e904a..b99d24e0eb 100644 --- a/yt_dlp/extractor/applepodcasts.py +++ b/yt_dlp/extractor/applepodcasts.py @@ -1,27 +1,42 @@ from .common import InfoExtractor from ..utils import ( - clean_html, clean_podcast_url, - get_element_by_class, int_or_none, parse_iso8601, - try_get, ) +from ..utils.traversal import traverse_obj class ApplePodcastsIE(InfoExtractor): _VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P\d+)' _TESTS = [{ + 'url': 'https://podcasts.apple.com/us/podcast/ferreck-dawn-to-the-break-of-dawn-117/id1625658232?i=1000665010654', + 'md5': '82cc219b8cc1dcf8bfc5a5e99b23b172', + 'info_dict': { + 'id': '1000665010654', + 'ext': 'mp3', + 'title': 'Ferreck Dawn - To The Break of Dawn 117', + 'episode': 'Ferreck Dawn - To The Break of Dawn 117', + 'description': 'md5:1fc571102f79dbd0a77bfd71ffda23bc', + 'upload_date': '20240812', + 'timestamp': 1723449600, + 'duration': 3596, + 'series': 'Ferreck Dawn - To The Break of Dawn', + 'thumbnail': 're:.+[.](png|jpe?g|webp)', + }, + }, { 'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', - 'md5': '41dc31cd650143e530d9423b6b5a344f', + 'md5': 'baf8a6b8b8aa6062dbb4639ed73d0052', 'info_dict': { 'id': '1000482637777', 'ext': 'mp3', 'title': '207 - Whitney Webb Returns', + 'episode': '207 - Whitney Webb Returns', + 'episode_number': 207, 'description': 'md5:75ef4316031df7b41ced4e7b987f79c6', 'upload_date': '20200705', 'timestamp': 1593932400, - 'duration': 6454, + 'duration': 5369, 'series': 'The Tim Dillon Show', 'thumbnail': 're:.+[.](png|jpe?g|webp)', }, @@ -39,47 +54,24 @@ class ApplePodcastsIE(InfoExtractor): def _real_extract(self, url): episode_id = self._match_id(url) webpage = self._download_webpage(url, episode_id) - episode_data = {} - ember_data = {} - # new page type 2021-11 - amp_data = self._parse_json(self._search_regex( - r'(?s)id="shoebox-media-api-cache-amp-podcasts"[^>]*>\s*({.+?})\s*<', - webpage, 'AMP data', 
default='{}'), episode_id, fatal=False) or {} - amp_data = try_get(amp_data, - lambda a: self._parse_json( - next(a[x] for x in iter(a) if episode_id in x), - episode_id), - dict) or {} - amp_data = amp_data.get('d') or [] - episode_data = try_get( - amp_data, - lambda a: next(x for x in a - if x['type'] == 'podcast-episodes' and x['id'] == episode_id), - dict) - if not episode_data: - # try pre 2021-11 page type: TODO: consider deleting if no longer used - ember_data = self._parse_json(self._search_regex( - r'(?s)id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', - webpage, 'ember data'), episode_id) or {} - ember_data = ember_data.get(episode_id) or ember_data - episode_data = try_get(ember_data, lambda x: x['data'], dict) - episode = episode_data['attributes'] - description = episode.get('description') or {} - - series = None - for inc in (amp_data or ember_data.get('included') or []): - if inc.get('type') == 'media/podcast': - series = try_get(inc, lambda x: x['attributes']['name']) - series = series or clean_html(get_element_by_class('podcast-header__identity', webpage)) + server_data = self._search_json( + r'', - webpage, 'replay data', default='{}'), video_id, fatal=False) or {} + def yield_all_relay_data(_filter): + for relay_data in re.findall(rf'data-sjs>({{.*?{_filter}.*?}})', webpage): + yield self._parse_json(relay_data, video_id, fatal=False) or {} - def extract_relay_prefetched_data(_filter): - return traverse_obj(extract_relay_data(_filter), ( - 'require', (None, (..., ..., ..., '__bbox', 'require')), + def extract_relay_data(_filter): + return next(filter(None, yield_all_relay_data(_filter)), {}) + + def extract_relay_prefetched_data(_filter, target_keys=None): + path = 'data' + if target_keys is not None: + path = lambda k, v: k == 'data' and any(target in v for target in variadic(target_keys)) + return traverse_obj(yield_all_relay_data(_filter), ( + ..., 'require', (None, (..., ..., ..., '__bbox', 'require')), lambda _, v: 
any(key.startswith('RelayPrefetchedStreamCache') for key in v), - ..., ..., '__bbox', 'result', 'data', {dict}), get_all=False) or {} + ..., ..., '__bbox', 'result', path, {dict}), get_all=False) or {} if not video_data: server_js_data = self._parse_json(self._search_regex([ @@ -591,7 +606,8 @@ class FacebookIE(InfoExtractor): if not video_data: data = extract_relay_prefetched_data( - r'"(?:dash_manifest|playable_url(?:_quality_hd)?)') + r'"(?:dash_manifest|playable_url(?:_quality_hd)?)', + target_keys=('video', 'event', 'nodes', 'node', 'mediaset')) if data: entries = [] @@ -603,12 +619,13 @@ class FacebookIE(InfoExtractor): video = video['creation_story'] video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner')) video.update(reel_info) + fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video formats = [] q = qualities(['sd', 'hd']) for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'), ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'), ('browser_native_sd_url', 'sd')): - playable_url = video.get(key) + playable_url = fmt_data.get(key) if not playable_url: continue if determine_ext(playable_url) == 'mpd': @@ -620,7 +637,10 @@ class FacebookIE(InfoExtractor): 'quality': q(format_id) - 3, 'url': playable_url, }) - extract_dash_manifest(video, formats) + extract_dash_manifest(fmt_data, formats) + if not formats: + # Do not append false positive entry w/o any formats + return automatic_captions, subtitles = {}, {} is_broadcast = traverse_obj(video, ('is_video_broadcast', {bool})) @@ -923,18 +943,21 @@ class FacebookReelIE(InfoExtractor): _TESTS = [{ 'url': 'https://www.facebook.com/reel/1195289147628387', - 'md5': 'f13dd37f2633595982db5ed8765474d3', + 'md5': 'a53256d10fc2105441fe0c4212ed8cea', 'info_dict': { 'id': '1195289147628387', 'ext': 'mp4', - 'title': 'md5:b05800b5b1ad56c0ca78bd3807b6a61e', - 'description': 'md5:22f03309b216ac84720183961441d8db', - 'uploader': 
'md5:723e6cb3091241160f20b3c5dc282af1', + 'title': r're:9\.6K views · 355 reactions .+ Let the “Slapathon” commence!! .+ LL COOL J · Mama Said Knock You Out$', + 'description': r're:When your trying to help your partner .+ LL COOL J · Mama Said Knock You Out$', + 'uploader': 'Beast Camp Training', 'uploader_id': '100040874179269', 'duration': 9.579, 'timestamp': 1637502609, 'upload_date': '20211121', 'thumbnail': r're:^https?://.*', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, }] @@ -954,6 +977,7 @@ class FacebookAdsIE(InfoExtractor): 'id': '899206155126718', 'ext': 'mp4', 'title': 'video by Kandao', + 'description': 'md5:0822724069e3aca97cbed5dabbab282e', 'uploader': 'Kandao', 'uploader_id': '774114102743284', 'uploader_url': r're:^https?://.*', @@ -962,6 +986,22 @@ class FacebookAdsIE(InfoExtractor): 'upload_date': '20231214', 'like_count': int, }, + }, { + # key 'watermarked_video_sd_url' missing + 'url': 'https://www.facebook.com/ads/library/?id=501152689226254', + 'info_dict': { + 'id': '501152689226254', + 'ext': 'mp4', + 'title': 'video by mat.nawrocki', + 'description': 'md5:02a446ace7ff8c3c37a2892922492490', + 'uploader': 'mat.nawrocki', + 'uploader_id': '148586968341456', + 'uploader_url': r're:^https?://.*', + 'timestamp': 1723452305, + 'thumbnail': r're:^https?://.*', + 'upload_date': '20240812', + 'like_count': int, + }, }, { 'url': 'https://www.facebook.com/ads/library/?id=893637265423481', 'info_dict': { @@ -1008,34 +1048,42 @@ class FacebookAdsIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - post_data = [self._parse_json(j, video_id, fatal=False) - for j in re.findall(r's\.handle\(({.*})\);requireLazy\(', webpage)] - data = traverse_obj(post_data, ( - ..., 'require', ..., ..., ..., 'props', 'deeplinkAdCard', 'snapshot', {dict}), get_all=False) + post_data = traverse_obj( + re.findall(r'data-sjs>({.*?ScheduledServerJS.*?})', webpage), (..., {json.loads})) + data = 
get_first(post_data, ( + 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., + 'entryPointRoot', 'otherProps', 'deeplinkAdCard', 'snapshot', {dict})) if not data: raise ExtractorError('Unable to extract ad data') title = data.get('title') if not title or title == '{{product.name}}': title = join_nonempty('display_format', 'page_name', delim=' by ', from_dict=data) - - info_dict = traverse_obj(data, { - 'description': ('link_description', {str}, {lambda x: x if x != '{{product.description}}' else None}), + markup_id = traverse_obj(data, ('body', '__m', {str})) + markup = traverse_obj(post_data, ( + ..., 'require', ..., ..., ..., '__bbox', 'markup', lambda _, v: v[0].startswith(markup_id), + ..., '__html', {clean_html}, {lambda x: not x.startswith('{{product.') and x}, any)) + + info_dict = merge_dicts({ + 'title': title, + 'description': markup or None, + }, traverse_obj(data, { + 'description': ('link_description', {lambda x: x if not x.startswith('{{product.') else None}), 'uploader': ('page_name', {str}), 'uploader_id': ('page_id', {str_or_none}), 'uploader_url': ('page_profile_uri', {url_or_none}), 'timestamp': ('creation_time', {int_or_none}), 'like_count': ('page_like_count', {int_or_none}), - }) + })) entries = [] for idx, entry in enumerate(traverse_obj( - data, (('videos', 'cards'), lambda _, v: any(url_or_none(v[f]) for f in self._FORMATS_MAP))), 1, + data, (('videos', 'cards'), lambda _, v: any(url_or_none(v.get(f)) for f in self._FORMATS_MAP))), 1, ): entries.append({ 'id': f'{video_id}_{idx}', 'title': entry.get('title') or title, - 'description': entry.get('link_description') or info_dict.get('description'), + 'description': traverse_obj(entry, 'body', 'link_description') or info_dict.get('description'), 'thumbnail': url_or_none(entry.get('video_preview_image_url')), 'formats': self._extract_formats(entry), }) diff --git a/yt_dlp/extractor/fc2.py b/yt_dlp/extractor/fc2.py index eac70f6a96..f7b883155c 100644 --- a/yt_dlp/extractor/fc2.py +++ 
b/yt_dlp/extractor/fc2.py @@ -14,7 +14,7 @@ from ..utils import ( class FC2IE(InfoExtractor): - _VALID_URL = r'^(?:https?://video\.fc2\.com/(?:[^/]+/)*content/|fc2:)(?P[^/]+)' + _VALID_URL = r'(?:https?://video\.fc2\.com/(?:[^/]+/)*content/|fc2:)(?P[^/]+)' IE_NAME = 'fc2' _NETRC_MACHINE = 'fc2' _TESTS = [{ diff --git a/yt_dlp/extractor/francetv.py b/yt_dlp/extractor/francetv.py index f732d56772..ab08f1c6bf 100644 --- a/yt_dlp/extractor/francetv.py +++ b/yt_dlp/extractor/francetv.py @@ -5,6 +5,7 @@ from .common import InfoExtractor from .dailymotion import DailymotionIE from ..networking import HEADRequest from ..utils import ( + clean_html, determine_ext, filter_dict, format_field, @@ -33,6 +34,7 @@ class FranceTVIE(InfoExtractor): _GEO_BYPASS = False _TESTS = [{ + # tokenized url is in dinfo['video']['token'] 'url': 'francetv:ec217ecc-0733-48cf-ac06-af1347b849d1', 'info_dict': { 'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1', @@ -44,6 +46,19 @@ class FranceTVIE(InfoExtractor): 'upload_date': '20170813', }, 'params': {'skip_download': 'm3u8'}, + }, { + # tokenized url is in dinfo['video']['token']['akamai'] + 'url': 'francetv:c5bda21d-2c6f-4470-8849-3d8327adb2ba', + 'info_dict': { + 'id': 'c5bda21d-2c6f-4470-8849-3d8327adb2ba', + 'ext': 'mp4', + 'title': '13h15, le dimanche... 
- Les mystères de Jésus', + 'timestamp': 1514118300, + 'duration': 2880, + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20171224', + }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'francetv:162311093', 'only_matching': True, @@ -68,6 +83,7 @@ class FranceTVIE(InfoExtractor): def _extract_video(self, video_id, hostname=None): is_live = None videos = [] + drm_formats = False title = None subtitle = None episode_number = None @@ -85,13 +101,12 @@ class FranceTVIE(InfoExtractor): 'device_type': device_type, 'browser': browser, 'domain': hostname, - }), fatal=False) + }), fatal=False, expected_status=422) # 422 json gives detailed error code/message if not dinfo: continue - video = traverse_obj(dinfo, ('video', {dict})) - if video: + if video := traverse_obj(dinfo, ('video', {dict})): videos.append(video) if duration is None: duration = video.get('duration') @@ -99,9 +114,19 @@ class FranceTVIE(InfoExtractor): is_live = video.get('is_live') if spritesheets is None: spritesheets = video.get('spritesheets') + elif code := traverse_obj(dinfo, ('code', {int})): + if code == 2009: + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + elif code in (2015, 2017): + # 2015: L'accès à cette vidéo est impossible. 
(DRM-only) + # 2017: Cette vidéo n'est pas disponible depuis le site web mobile (b/c DRM) + drm_formats = True + continue + self.report_warning( + f'{self.IE_NAME} said: {code} "{clean_html(dinfo.get("message"))}"') + continue - meta = traverse_obj(dinfo, ('meta', {dict})) - if meta: + if meta := traverse_obj(dinfo, ('meta', {dict})): if title is None: title = meta.get('title') # meta['pre_title'] contains season and episode number for series in format "S E" @@ -114,12 +139,15 @@ class FranceTVIE(InfoExtractor): if timestamp is None: timestamp = parse_iso8601(meta.get('broadcasted_at')) + if not videos and drm_formats: + self.report_drm(video_id) + formats, subtitles, video_url = [], {}, None for video in traverse_obj(videos, lambda _, v: url_or_none(v['url'])): video_url = video['url'] format_id = video.get('format') - if token_url := url_or_none(video.get('token')): + if token_url := traverse_obj(video, ('token', (None, 'akamai'), {url_or_none}, any)): tokenized_url = traverse_obj(self._download_json( token_url, video_id, f'Downloading signed {format_id} manifest URL', fatal=False, query={ @@ -225,13 +253,13 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor): _TESTS = [{ 'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html', 'info_dict': { - 'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1', + 'id': 'c5bda21d-2c6f-4470-8849-3d8327adb2ba', 'ext': 'mp4', 'title': '13h15, le dimanche... 
- Les mystères de Jésus', - 'timestamp': 1502623500, - 'duration': 2580, + 'timestamp': 1514118300, + 'duration': 2880, 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20170813', + 'upload_date': '20171224', }, 'params': { 'skip_download': True, diff --git a/yt_dlp/extractor/funk.py b/yt_dlp/extractor/funk.py index 8bdea3fce7..ef8ea72a8c 100644 --- a/yt_dlp/extractor/funk.py +++ b/yt_dlp/extractor/funk.py @@ -3,7 +3,7 @@ from .nexx import NexxIE class FunkIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.|origin\.)?funk\.net/(?:channel|playlist)/[^/]+/(?P[0-9a-z-]+)-(?P\d+)' + _VALID_URL = r'https?://(?:(?:www|origin|play)\.)?funk\.net/(?:channel|playlist)/[^/?#]+/(?P[0-9a-z-]+)-(?P\d+)' _TESTS = [{ 'url': 'https://www.funk.net/channel/ba-793/die-lustigsten-instrumente-aus-dem-internet-teil-2-1155821', 'md5': '8610449476156f338761a75391b0017d', @@ -27,6 +27,9 @@ class FunkIE(InfoExtractor): }, { 'url': 'https://www.funk.net/playlist/neuesteVideos/kameras-auf-dem-fusion-festival-1618699', 'only_matching': True, + }, { + 'url': 'https://play.funk.net/playlist/neuesteVideos/george-floyd-wenn-die-polizei-toetet-der-fall-2004391', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index cc17890e76..320a47772b 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -8,6 +8,9 @@ from .common import InfoExtractor from .commonprotocols import RtmpIE from .youtube import YoutubeIE from ..compat import compat_etree_fromstring +from ..cookies import LenientSimpleCookie +from ..networking.exceptions import HTTPError +from ..networking.impersonate import ImpersonateTarget from ..utils import ( KNOWN_EXTENSIONS, MEDIA_EXTENSIONS, @@ -43,6 +46,7 @@ from ..utils import ( xpath_text, xpath_with_ns, ) +from ..utils._utils import _UnsafeExtensionError class GenericIE(InfoExtractor): @@ -2167,7 +2171,15 @@ class GenericIE(InfoExtractor): 
urllib.parse.urlparse(fragment_query).query or fragment_query or urllib.parse.urlparse(manifest_url).query or None) - hex_or_none = lambda x: x if re.fullmatch(r'(0x)?[\da-f]+', x, re.IGNORECASE) else None + key_query = self._configuration_arg('key_query', [None], casesense=True)[0] + if key_query is not None: + info['extra_param_to_key_url'] = ( + urllib.parse.urlparse(key_query).query or key_query + or urllib.parse.urlparse(manifest_url).query or None) + + def hex_or_none(value): + return value if re.fullmatch(r'(0x)?[\da-f]+', value, re.IGNORECASE) else None + info['hls_aes'] = traverse_obj(self._configuration_arg('hls_key', casesense=True), { 'uri': (0, {url_or_none}), 'key': (0, {hex_or_none}), 'iv': (1, {hex_or_none}), }) or None @@ -2331,7 +2343,7 @@ class GenericIE(InfoExtractor): default_search = 'fixup_error' if default_search in ('auto', 'auto_warning', 'fixup_error'): - if re.match(r'^[^\s/]+\.[^\s/]+/', url): + if re.match(r'[^\s/]+\.[^\s/]+/', url): self.report_warning('The url doesn\'t specify the protocol, trying with http') return self.url_result('http://' + url) elif default_search != 'fixup_error': @@ -2364,6 +2376,11 @@ class GenericIE(InfoExtractor): else: video_id = self._generic_id(url) + # Do not impersonate by default; see https://github.com/yt-dlp/yt-dlp/issues/11335 + impersonate = self._configuration_arg('impersonate', ['false']) + if 'false' in impersonate: + impersonate = None + # Some webservers may serve compressed content of rather big size (e.g. gzipped flac) # making it impossible to download only chunk of the file (yet we need only 512kB to # test whether it's HTML or not). According to yt-dlp default Accept-Encoding @@ -2372,10 +2389,29 @@ class GenericIE(InfoExtractor): # to accept raw bytes and being able to download only a chunk. # It may probably better to solve this by checking Content-Type for application/octet-stream # after a HEAD request, but not sure if we can rely on this. 
- full_response = self._request_webpage(url, video_id, headers=filter_dict({ - 'Accept-Encoding': 'identity', - 'Referer': smuggled_data.get('referer'), - })) + try: + full_response = self._request_webpage(url, video_id, headers=filter_dict({ + 'Accept-Encoding': 'identity', + 'Referer': smuggled_data.get('referer'), + }), impersonate=impersonate) + except ExtractorError as e: + if not (isinstance(e.cause, HTTPError) and e.cause.status == 403 + and e.cause.response.get_header('cf-mitigated') == 'challenge' + and e.cause.response.extensions.get('impersonate') is None): + raise + cf_cookie_domain = traverse_obj( + LenientSimpleCookie(e.cause.response.get_header('set-cookie')), + ('__cf_bm', 'domain')) + if cf_cookie_domain: + self.write_debug(f'Clearing __cf_bm cookie for {cf_cookie_domain}') + self.cookiejar.clear(domain=cf_cookie_domain, path='/', name='__cf_bm') + msg = 'Got HTTP Error 403 caused by Cloudflare anti-bot challenge; ' + if not self._downloader._impersonate_target_available(ImpersonateTarget()): + msg += ('see https://github.com/yt-dlp/yt-dlp#impersonation for ' + 'how to install the required impersonation dependency, and ') + raise ExtractorError( + f'{msg}try again with --extractor-args "generic:impersonate"', expected=True) + new_url = full_response.url if new_url != extract_basic_auth(url)[0]: self.report_following_redirect(new_url) @@ -2391,7 +2427,7 @@ class GenericIE(InfoExtractor): # Check for direct link to a video content_type = full_response.headers.get('Content-Type', '').lower() - m = re.match(r'^(?Paudio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P[^;\s]+)', content_type) + m = re.match(r'(?Paudio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P[^;\s]+)', content_type) if m: self.report_detected('direct video link') headers = filter_dict({'Referer': smuggled_data.get('referer')}) @@ -2438,9 +2474,13 @@ class GenericIE(InfoExtractor): if not is_html(first_bytes): self.report_warning( 'URL could be a 
direct video link, returning it as such.') + ext = determine_ext(url) + if ext not in _UnsafeExtensionError.ALLOWED_EXTENSIONS: + ext = 'unknown_video' info_dict.update({ 'direct': True, 'url': url, + 'ext': ext, }) return info_dict diff --git a/yt_dlp/extractor/germanupa.py b/yt_dlp/extractor/germanupa.py new file mode 100644 index 0000000000..e40f016b2f --- /dev/null +++ b/yt_dlp/extractor/germanupa.py @@ -0,0 +1,91 @@ +from .common import InfoExtractor +from .vimeo import VimeoIE +from ..utils import ( + parse_qs, + traverse_obj, + url_or_none, +) + + +class GermanupaIE(InfoExtractor): + IE_DESC = 'germanupa.de' + _VALID_URL = r'https?://germanupa\.de/mediathek/(?P[\w-]+)' + _TESTS = [{ + 'url': 'https://germanupa.de/mediathek/4-figma-beratung-deine-sprechstunde-fuer-figma-fragen', + 'info_dict': { + 'id': '909179246', + 'title': 'Tutorial: #4 Figma Beratung - Deine Sprechstunde für Figma-Fragen', + 'ext': 'mp4', + 'uploader': 'German UPA', + 'uploader_id': 'germanupa', + 'thumbnail': 'https://i.vimeocdn.com/video/1792564420-7415283ccef8bf8702dab8c6b7515555ceeb7a1c11371ffcc133b8e887dbf70e-d_1280', + 'uploader_url': 'https://vimeo.com/germanupa', + 'duration': 3987, + }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + 'params': {'skip_download': 'm3u8'}, + }, { + 'note': 'audio, uses GenericIE', + 'url': 'https://germanupa.de/mediathek/live-vom-ux-festival-neuigkeiten-von-figma-jobmarkt-agenturszene-interview-zu-sustainable', + 'info_dict': { + 'id': '1867346676', + 'title': 'Live vom UX Festival: Neuigkeiten von Figma, Jobmarkt, Agenturszene & Interview zu Sustainable UX', + 'ext': 'opus', + 'timestamp': 1720545088, + 'upload_date': '20240709', + 'duration': 3910.557, + 'like_count': int, + 'description': 'md5:db2aed5ff131e177a7b33901e9a8db05', + 'uploader': 'German UPA', + 'repost_count': int, + 'genres': ['Science'], + 'license': 'all-rights-reserved', + 'uploader_url': 'https://soundcloud.com/user-80097677', + 'uploader_id': '471579486', + 
'view_count': int, + 'comment_count': int, + 'thumbnail': 'https://i1.sndcdn.com/artworks-oCti2e9GhaZFWBqY-48ybGw-original.jpg', + }, + }, { + 'note': 'Nur für Mitglieder/Just for members', + 'url': 'https://germanupa.de/mediathek/ux-festival-2024-usability-tests-und-ai', + 'info_dict': { + 'id': '986994430', + 'title': 'UX Festival 2024 "Usability Tests und AI" von Lennart Weber', + 'ext': 'mp4', + 'release_date': '20240719', + 'uploader_url': 'https://vimeo.com/germanupa', + 'timestamp': 1721373980, + 'license': 'by-sa', + 'like_count': int, + 'thumbnail': 'https://i.vimeocdn.com/video/1904187064-2a672630c30f9ad787bd390bff3f51d7506a3e8416763ba6dbf465732b165c5c-d_1280', + 'duration': 2146, + 'release_timestamp': 1721373980, + 'uploader': 'German UPA', + 'uploader_id': 'germanupa', + 'upload_date': '20240719', + 'comment_count': int, + }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + 'skip': 'login required', + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + param_url = traverse_obj( + self._search_regex( + r']+data-src\s*?=\s*?([\'"])(?Phttps://germanupa\.de/media/oembed\?url=(?:(?!\1).)+)\1', + webpage, 'embedded video', default=None, group='url'), + ({parse_qs}, 'url', 0, {url_or_none})) + + if not param_url: + if self._search_regex( + r']+class\s*?=\s*?([\'"])(?:(?!\1).)*login-wrapper(?:(?!\1).)*\1', + webpage, 'login wrapper', default=None): + self.raise_login_required('This video is only available for members') + return self.url_result(url, 'Generic') # Fall back to generic to extract audio + + real_url = param_url.replace('https://vimeo.com/', 'https://player.vimeo.com/video/') + return self.url_result(VimeoIE._smuggle_referrer(real_url, url), VimeoIE, video_id) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index 53b881011c..b7581d77e2 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -52,7 
+52,7 @@ class GetCourseRuIE(InfoExtractor): _BASE_URL_RE = rf'https?://(?:(?!player02\.)[^.]+\.getcourse\.(?:ru|io)|{"|".join(map(re.escape, _DOMAINS))})' _VALID_URL = [ rf'{_BASE_URL_RE}/(?!pl/|teach/)(?P[^?#]+)', - rf'{_BASE_URL_RE}/(:?pl/)?teach/control/lesson/view\?(?:[^#]+&)?id=(?P\d+)', + rf'{_BASE_URL_RE}/(?:pl/)?teach/control/lesson/view\?(?:[^#]+&)?id=(?P\d+)', ] _TESTS = [{ 'url': 'http://academymel.online/3video_1', diff --git a/yt_dlp/extractor/go.py b/yt_dlp/extractor/go.py index bbb23ffc0a..83c1979db8 100644 --- a/yt_dlp/extractor/go.py +++ b/yt_dlp/extractor/go.py @@ -5,6 +5,7 @@ from ..utils import ( ExtractorError, determine_ext, int_or_none, + join_nonempty, parse_age_limit, remove_end, remove_start, @@ -287,7 +288,7 @@ class GoIE(AdobePassIE): if mobj: height = int(mobj.group(2)) f.update({ - 'format_id': (f'{format_id}-' if format_id else '') + f'{height}P', + 'format_id': join_nonempty(format_id, f'{height}P'), 'width': int(mobj.group(1)), 'height': height, }) diff --git a/yt_dlp/extractor/golem.py b/yt_dlp/extractor/golem.py index 90d2fe6c26..964bf6519d 100644 --- a/yt_dlp/extractor/golem.py +++ b/yt_dlp/extractor/golem.py @@ -7,7 +7,7 @@ from ..utils import ( class GolemIE(InfoExtractor): - _VALID_URL = r'^https?://video\.golem\.de/.+?/(?P.+?)/' + _VALID_URL = r'https?://video\.golem\.de/.+?/(?P.+?)/' _TEST = { 'url': 'http://video.golem.de/handy/14095/iphone-6-und-6-plus-test.html', 'md5': 'c1a2c0a3c863319651c7c992c5ee29bf', diff --git a/yt_dlp/extractor/graspop.py b/yt_dlp/extractor/graspop.py new file mode 100644 index 0000000000..09371f8c46 --- /dev/null +++ b/yt_dlp/extractor/graspop.py @@ -0,0 +1,32 @@ +from .common import InfoExtractor +from ..utils import update_url, url_or_none +from ..utils.traversal import traverse_obj + + +class GraspopIE(InfoExtractor): + _VALID_URL = r'https?://vod\.graspop\.be/[a-z]{2}/(?P\d+)/' + _TESTS = [{ + 'url': 'https://vod.graspop.be/fr/101556/thy-art-is-murder-concert/', + 'info_dict': { + 'id': 
'101556', + 'ext': 'mp4', + 'title': 'Thy Art Is Murder', + 'thumbnail': r're:https://cdn-mds\.pickx\.be/festivals/v3/global/original/.+\.jpg', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + metadata = self._download_json( + f'https://tv.proximus.be/MWC/videocenter/festivals/{video_id}/stream', video_id) + + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats( + # Downgrade manifest request to avoid incomplete certificate chain error + update_url(metadata['source']['assetUri'], scheme='http'), video_id, 'mp4'), + **traverse_obj(metadata, { + 'title': ('name', {str}), + 'thumbnail': ('source', 'poster', {url_or_none}), + }), + } diff --git a/yt_dlp/extractor/hbo.py b/yt_dlp/extractor/hbo.py index 146d8a23d8..34cff458d8 100644 --- a/yt_dlp/extractor/hbo.py +++ b/yt_dlp/extractor/hbo.py @@ -3,6 +3,7 @@ import re from .common import InfoExtractor from ..utils import ( int_or_none, + join_nonempty, parse_duration, urljoin, xpath_element, @@ -69,7 +70,7 @@ class HBOBaseIE(InfoExtractor): height = format_info.get('height') fmt = { 'url': path, - 'format_id': 'http{}'.format(f'-{height}p' if height else ''), + 'format_id': join_nonempty('http',
height and f'{height}p'), 'width': format_info.get('width'), 'height': height, } diff --git a/yt_dlp/extractor/hketv.py b/yt_dlp/extractor/hketv.py index bffb6afe02..3998abc121 100644 --- a/yt_dlp/extractor/hketv.py +++ b/yt_dlp/extractor/hketv.py @@ -44,9 +44,6 @@ class HKETVIE(InfoExtractor): 'duration': 907, 'subtitles': {}, }, - 'params': { - 'geo_verification_proxy': '', - }, 'skip': 'Geo restricted to HK', }] diff --git a/yt_dlp/extractor/hrfensehen.py b/yt_dlp/extractor/hrfensehen.py index 17673d5b8f..b5a7b14a58 100644 --- a/yt_dlp/extractor/hrfensehen.py +++ b/yt_dlp/extractor/hrfensehen.py @@ -13,7 +13,7 @@ from ..utils import ( class HRFernsehenIE(InfoExtractor): IE_NAME = 'hrfernsehen' - _VALID_URL = r'^https?://www\.(?:hr-fernsehen|hessenschau)\.de/.*,video-(?P[0-9]{6})\.html' + _VALID_URL = r'https?://www\.(?:hr-fernsehen|hessenschau)\.de/.*,video-(?P[0-9]{6})\.html' _TESTS = [{ 'url': 'https://www.hessenschau.de/tv-sendung/hessenschau-vom-26082020,video-130546.html', 'md5': '5c4e0ba94677c516a2f65a84110fc536', diff --git a/yt_dlp/extractor/huya.py b/yt_dlp/extractor/huya.py index 5663a78a37..f79e032e4a 100644 --- a/yt_dlp/extractor/huya.py +++ b/yt_dlp/extractor/huya.py @@ -8,15 +8,19 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, + parse_duration, str_or_none, try_get, unescapeHTML, + unified_strdate, update_url_query, + url_or_none, ) +from ..utils.traversal import traverse_obj class HuyaLiveIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.|m\.)?huya\.com/(?P[^/#?&]+)(?:\D|$)' + _VALID_URL = r'https?://(?:www\.|m\.)?huya\.com/(?!(?:video/play/))(?P[^/#?&]+)(?:\D|$)' IE_NAME = 'huya:live' IE_DESC = 'huya.com' TESTS = [{ @@ -24,6 +28,7 @@ class HuyaLiveIE(InfoExtractor): 'info_dict': { 'id': '572329', 'title': str, + 'ext': 'flv', 'description': str, 'is_live': True, 'view_count': int, @@ -131,3 +136,76 @@ class HuyaLiveIE(InfoExtractor): fm = base64.b64decode(params['fm']).decode().split('_', 1)[0] ss = 
hashlib.md5('|'.join([params['seqid'], params['ctype'], params['t']])) return fm, ss + + +class HuyaVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?huya\.com/video/play/(?P\d+)\.html' + IE_NAME = 'huya:video' + IE_DESC = '虎牙视频' + + _TESTS = [{ + 'url': 'https://www.huya.com/video/play/1002412640.html', + 'info_dict': { + 'id': '1002412640', + 'ext': 'mp4', + 'title': '8月3日', + 'thumbnail': r're:https?://.*\.jpg', + 'duration': 14, + 'uploader': '虎牙-ATS欧卡车队青木', + 'uploader_id': '1564376151', + 'upload_date': '20240803', + 'view_count': int, + 'comment_count': int, + 'like_count': int, + }, + }, + { + 'url': 'https://www.huya.com/video/play/556054543.html', + 'info_dict': { + 'id': '556054543', + 'ext': 'mp4', + 'title': '我不挑事 也不怕事', + 'thumbnail': r're:https?://.*\.jpg', + 'duration': 1864, + 'uploader': '卡尔', + 'uploader_id': '367138632', + 'upload_date': '20210811', + 'view_count': int, + 'comment_count': int, + 'like_count': int, + }, + }] + + def _real_extract(self, url: str): + video_id = self._match_id(url) + video_data = self._download_json( + 'https://liveapi.huya.com/moment/getMomentContent', video_id, + query={'videoId': video_id})['data']['moment']['videoInfo'] + + formats = [] + for definition in traverse_obj(video_data, ('definitions', lambda _, v: url_or_none(v['url']))): + formats.append({ + 'url': definition['url'], + **traverse_obj(definition, { + 'format_id': ('defName', {str}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'filesize': ('size', {int_or_none}), + }), + }) + + return { + 'id': video_id, + 'formats': formats, + **traverse_obj(video_data, { + 'title': ('videoTitle', {str}), + 'thumbnail': ('videoCover', {url_or_none}), + 'duration': ('videoDuration', {parse_duration}), + 'uploader': ('nickName', {str}), + 'uploader_id': ('uid', {str_or_none}), + 'upload_date': ('videoUploadTime', {unified_strdate}), + 'view_count': ('videoPlayNum', {int_or_none}), + 'comment_count': ('videoCommentNum', 
{int_or_none}), + 'like_count': ('favorCount', {int_or_none}), + }), + } diff --git a/yt_dlp/extractor/imgur.py b/yt_dlp/extractor/imgur.py index f0c3419d49..e2644e6a40 100644 --- a/yt_dlp/extractor/imgur.py +++ b/yt_dlp/extractor/imgur.py @@ -37,7 +37,7 @@ class ImgurBaseIE(InfoExtractor): class ImgurIE(ImgurBaseIE): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!(?:a|gallery|t|topic|r)/)(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!(?:a|gallery|t|topic|r)/)(?:[^/?#]+-)?(?P[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'https://imgur.com/A61SaA1', @@ -54,6 +54,22 @@ class ImgurIE(ImgurBaseIE): 'like_count': int, 'thumbnail': 'https://i.imgur.com/A61SaA1h.jpg', }, + }, { + # Test with URL slug + 'url': 'https://imgur.com/mrw-gifv-is-up-running-without-any-bugs-A61SaA1', + 'info_dict': { + 'id': 'A61SaA1', + 'ext': 'mp4', + 'title': 'MRW gifv is up and running without any bugs', + 'timestamp': 1416446068, + 'upload_date': '20141120', + 'dislike_count': int, + 'comment_count': int, + 'release_timestamp': 1416446068, + 'release_date': '20141120', + 'like_count': int, + 'thumbnail': 'https://i.imgur.com/A61SaA1h.jpg', + }, }, { 'url': 'https://i.imgur.com/A61SaA1.gifv', 'only_matching': True, @@ -92,6 +108,7 @@ class ImgurIE(ImgurBaseIE): 'comment_count': int, 'release_timestamp': 1710491255, 'release_date': '20240315', + 'thumbnail': 'https://i.imgur.com/zV03bd5h.jpg', }, }] @@ -208,7 +225,10 @@ class ImgurIE(ImgurBaseIE): }), get_all=False), 'id': video_id, 'formats': formats, - 'thumbnail': url_or_none(search('thumbnailUrl')), + 'thumbnails': [{ + 'url': thumbnail_url, + 'http_headers': {'Accept': '*/*'}, + }] if (thumbnail_url := search(['thumbnailUrl', 'twitter:image', 'og:image'])) else None, 'http_headers': {'Accept': '*/*'}, } @@ -252,17 +272,9 @@ class ImgurGalleryBaseIE(ImgurBaseIE): class ImgurGalleryIE(ImgurGalleryBaseIE): IE_NAME = 'imgur:gallery' - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:gallery|(?:t(?:opic)?|r)/[^/?#]+)/(?P[a-zA-Z0-9]+)' + 
_VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:gallery|(?:t(?:opic)?|r)/[^/?#]+)/(?:[^/?#]+-)?(?P[a-zA-Z0-9]+)' _TESTS = [{ - 'url': 'http://imgur.com/gallery/Q95ko', - 'info_dict': { - 'id': 'Q95ko', - 'title': 'Adding faces make every GIF better', - }, - 'playlist_count': 25, - 'skip': 'Zoinks! You\'ve taken a wrong turn.', - }, { # TODO: static images - replace with animated/video gallery 'url': 'http://imgur.com/topic/Aww/ll5Vk', 'only_matching': True, @@ -280,7 +292,27 @@ class ImgurGalleryIE(ImgurGalleryBaseIE): 'release_timestamp': 1358554297, 'thumbnail': 'https://i.imgur.com/YcAQlkxh.jpg', 'release_date': '20130119', - 'uploader_url': 'https://i.imgur.com/u3R4I2S_d.png?maxwidth=290&fidelity=grand', + 'uploader_url': 'https://i.imgur.com/N5Flb2v_d.png?maxwidth=290&fidelity=grand', + 'comment_count': int, + 'dislike_count': int, + 'like_count': int, + }, + }, { + # Test with slug + 'url': 'https://imgur.com/gallery/classic-steve-carell-gif-cracks-me-up-everytime-repost-downvotes-YcAQlkx', + 'add_ies': ['Imgur'], + 'info_dict': { + 'id': 'YcAQlkx', + 'ext': 'mp4', + 'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....', + 'timestamp': 1358554297, + 'upload_date': '20130119', + 'uploader_id': '1648642', + 'uploader': 'wittyusernamehere', + 'release_timestamp': 1358554297, + 'release_date': '20130119', + 'thumbnail': 'https://i.imgur.com/YcAQlkxh.jpg', + 'uploader_url': 'https://i.imgur.com/N5Flb2v_d.png?maxwidth=290&fidelity=grand', 'comment_count': int, 'dislike_count': int, 'like_count': int, @@ -317,6 +349,13 @@ class ImgurGalleryIE(ImgurGalleryBaseIE): 'title': 'Penguins !', }, 'playlist_count': 3, + }, { + 'url': 'https://imgur.com/t/unmuted/penguins-penguins-6lAn9VQ', + 'info_dict': { + 'id': '6lAn9VQ', + 'title': 'Penguins !', + }, + 'playlist_count': 3, }, { 'url': 'https://imgur.com/t/unmuted/kx2uD3C', 'add_ies': ['Imgur'], @@ -357,7 +396,7 @@ class ImgurGalleryIE(ImgurGalleryBaseIE): class 
ImgurAlbumIE(ImgurGalleryBaseIE): IE_NAME = 'imgur:album' - _VALID_URL = r'https?://(?:i\.)?imgur\.com/a/(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/a/(?:[^/?#]+-)?(?P[a-zA-Z0-9]+)' _GALLERY = False _TESTS = [{ # TODO: only static images - replace with animated/video gallery @@ -372,6 +411,14 @@ class ImgurAlbumIE(ImgurGalleryBaseIE): 'title': 'enen-no-shouboutai', }, 'playlist_count': 2, + }, { + # Test with URL slug + 'url': 'https://imgur.com/a/enen-no-shouboutai-iX265HX', + 'info_dict': { + 'id': 'iX265HX', + 'title': 'enen-no-shouboutai', + }, + 'playlist_count': 2, }, { 'url': 'https://imgur.com/a/8pih2Ed', 'info_dict': { diff --git a/yt_dlp/extractor/instagram.py b/yt_dlp/extractor/instagram.py index c8bf7e9c4a..dee8cb85d5 100644 --- a/yt_dlp/extractor/instagram.py +++ b/yt_dlp/extractor/instagram.py @@ -48,7 +48,6 @@ class InstagramBaseIE(InfoExtractor): 'X-IG-WWW-Claim': '0', 'Origin': 'https://www.instagram.com', 'Accept': '*/*', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36', } def _perform_login(self, username, password): @@ -435,10 +434,10 @@ class InstagramIE(InstagramBaseIE): 'X-Requested-With': 'XMLHttpRequest', 'Referer': url, }, query={ - 'query_hash': '9f8827793ef34641b2fb195d4d41151c', + 'doc_id': '8845758582119845', 'variables': json.dumps(variables, separators=(',', ':')), }) - media.update(traverse_obj(general_info, ('data', 'shortcode_media')) or {}) + media.update(traverse_obj(general_info, ('data', 'xdt_shortcode_media')) or {}) if not general_info: self.report_warning('General metadata extraction failed (some metadata might be missing).', video_id) @@ -453,7 +452,7 @@ class InstagramIE(InstagramBaseIE): else: self.report_warning('Main webpage is locked behind the login page. 
Retrying with embed webpage (some metadata might be missing).') webpage = self._download_webpage( - f'{url}/embed/', video_id, note='Downloading embed webpage', fatal=False) + f'{url}/embed/', video_id, note='Downloading embed webpage', fatal=False) or '' additional_data = self._search_json( r'window\.__additionalDataLoaded\s*\(\s*[^,]+,', webpage, 'additional data', video_id, fatal=False) if not additional_data and not media: diff --git a/yt_dlp/extractor/iprima.py b/yt_dlp/extractor/iprima.py index ab26dc5efe..9b91a454b1 100644 --- a/yt_dlp/extractor/iprima.py +++ b/yt_dlp/extractor/iprima.py @@ -25,9 +25,29 @@ class IPrimaIE(InfoExtractor): 'id': 'p51388', 'ext': 'mp4', 'title': 'Partička (92)', - 'description': 'md5:859d53beae4609e6dd7796413f1b6cac', - 'upload_date': '20201103', - 'timestamp': 1604437480, + 'description': 'md5:57943f6a50d6188288c3a579d2fd5f01', + 'episode': 'Partička (92)', + 'season': 'Partička', + 'series': 'Prima Partička', + 'episode_number': 92, + 'thumbnail': 'https://d31b9s05ygj54s.cloudfront.net/prima-plus/image/video-ef6cf9de-c980-4443-92e4-17fe8bccd45c-16x9.jpeg', + }, + 'params': { + 'skip_download': True, # m3u8 download + }, + }, { + 'url': 'https://zoom.iprima.cz/porady/krasy-kanarskych-ostrovu/tenerife-v-risi-ohne', + 'info_dict': { + 'id': 'p1412199', + 'ext': 'mp4', + 'episode_number': 3, + 'episode': 'Tenerife: V říši ohně', + 'description': 'md5:4b4a05c574b5eaef130e68d4811c3f2c', + 'duration': 3111.0, + 'thumbnail': 'https://d31b9s05ygj54s.cloudfront.net/prima-plus/image/video-f66dd7fb-c1a0-47d1-b3bc-7db328d566c5-16x9-1711636518.jpg/t_16x9_medium_1366_768', + 'title': 'Tenerife: V říši ohně', + 'timestamp': 1711825800, + 'upload_date': '20240330', }, 'params': { 'skip_download': True, # m3u8 download @@ -131,6 +151,7 @@ class IPrimaIE(InfoExtractor): video_id = self._search_regex(( r'productId\s*=\s*([\'"])(?Pp\d+)\1', r'pproduct_id\s*=\s*([\'"])(?Pp\d+)\1', + r'let\s+videos\s*=\s*([\'"])(?Pp\d+)\1', ), webpage, 'real id', 
group='id', default=None) if not video_id: @@ -176,7 +197,7 @@ class IPrimaIE(InfoExtractor): final_result = self._search_json_ld(webpage, video_id, default={}) final_result.update({ 'id': video_id, - 'title': title, + 'title': final_result.get('title') or title, 'thumbnail': self._html_search_meta( ['thumbnail', 'og:image', 'twitter:image'], webpage, 'thumbnail', default=None), diff --git a/yt_dlp/extractor/iwara.py b/yt_dlp/extractor/iwara.py index df2088f9e7..5b5c367ad8 100644 --- a/yt_dlp/extractor/iwara.py +++ b/yt_dlp/extractor/iwara.py @@ -2,7 +2,6 @@ import functools import hashlib import json import time -import urllib.error import urllib.parse from .common import InfoExtractor diff --git a/yt_dlp/extractor/japandiet.py b/yt_dlp/extractor/japandiet.py index 2ef091aff2..994da22ae0 100644 --- a/yt_dlp/extractor/japandiet.py +++ b/yt_dlp/extractor/japandiet.py @@ -194,11 +194,14 @@ class ShugiinItvVodIE(ShugiinItvBaseIE): class SangiinInstructionIE(InfoExtractor): - _VALID_URL = r'^https?://www\.webtv\.sangiin\.go\.jp/webtv/index\.php' + _VALID_URL = r'https?://www\.webtv\.sangiin\.go\.jp/webtv/index\.php' IE_DESC = False # this shouldn't be listed as a supported site def _real_extract(self, url): - raise ExtractorError('Copy the link from the botton below the video description or player, and use the link to download. If there are no button in the frame, get the URL of the frame showing the video.', expected=True) + raise ExtractorError( + 'Copy the link from the button below the video description/player ' + 'and use that link to download. 
If there is no button in the frame, ' + 'get the URL of the frame showing the video.', expected=True) class SangiinIE(InfoExtractor): diff --git a/yt_dlp/extractor/jiocinema.py b/yt_dlp/extractor/jiocinema.py index 5898e1f497..30d98ba796 100644 --- a/yt_dlp/extractor/jiocinema.py +++ b/yt_dlp/extractor/jiocinema.py @@ -364,20 +364,25 @@ class JioCinemaSeriesIE(JioCinemaBaseIE): 'title': 'naagin', }, 'playlist_mincount': 120, + }, { + 'url': 'https://www.jiocinema.com/tv-shows/mtv-splitsvilla-x5/3499820', + 'info_dict': { + 'id': '3499820', + 'title': 'mtv-splitsvilla-x5', + }, + 'playlist_mincount': 310, }] def _entries(self, series_id): - seasons = self._download_json( - f'{self._METADATA_API_BASE}/voot/v1/voot-web/content/generic/season-by-show', series_id, - 'Downloading series metadata JSON', query={ - 'sort': 'season:asc', - 'id': series_id, - 'responseType': 'common', - }) + seasons = traverse_obj(self._download_json( + f'{self._METADATA_API_BASE}/voot/v1/voot-web/view/show/{series_id}', series_id, + 'Downloading series metadata JSON', query={'responseType': 'common'}), ( + 'trays', lambda _, v: v['trayId'] == 'season-by-show-multifilter', + 'trayTabs', lambda _, v: v['id'])) - for season_num, season in enumerate(traverse_obj(seasons, ('result', lambda _, v: v['id'])), 1): + for season_num, season in enumerate(seasons, start=1): season_id = season['id'] - label = season.get('season') or season_num + label = season.get('label') or season_num for page_num in itertools.count(1): episodes = traverse_obj(self._download_json( f'{self._METADATA_API_BASE}/voot/v1/voot-web/content/generic/series-wise-episode', diff --git a/yt_dlp/extractor/jiosaavn.py b/yt_dlp/extractor/jiosaavn.py index 542e41b803..030fe686bd 100644 --- a/yt_dlp/extractor/jiosaavn.py +++ b/yt_dlp/extractor/jiosaavn.py @@ -158,7 +158,7 @@ class JioSaavnAlbumIE(JioSaavnBaseIE): class JioSaavnPlaylistIE(JioSaavnBaseIE): IE_NAME = 'jiosaavn:playlist' - _VALID_URL = 
r'https?://(?:www\.)?(?:jio)?saavn\.com/s/playlist/(?:[^/?#]+/){2}(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?:jio)?saavn\.com/(?:s/playlist/(?:[^/?#]+/){2}|featured/[^/?#]+/)(?P[^/?#]+)' _TESTS = [{ 'url': 'https://www.jiosaavn.com/s/playlist/2279fbe391defa793ad7076929a2f5c9/mood-english/LlJ8ZWT1ibN5084vKHRj2Q__', 'info_dict': { @@ -173,6 +173,13 @@ class JioSaavnPlaylistIE(JioSaavnBaseIE): 'title': 'Mood Hindi', }, 'playlist_mincount': 801, + }, { + 'url': 'https://www.jiosaavn.com/featured/taaza-tunes/Me5RridRfDk_', + 'info_dict': { + 'id': 'Me5RridRfDk_', + 'title': 'Taaza Tunes', + }, + 'playlist_mincount': 301, }] _PAGE_SIZE = 50 diff --git a/yt_dlp/extractor/kaltura.py b/yt_dlp/extractor/kaltura.py index e5737b1e9e..6d51e32f6d 100644 --- a/yt_dlp/extractor/kaltura.py +++ b/yt_dlp/extractor/kaltura.py @@ -22,7 +22,7 @@ class KalturaIE(InfoExtractor): (?: kaltura:(?P\w+):(?P\w+)(?::(?P\w+))?| https?:// - (:?(?:www|cdnapi(?:sec)?)\.)?kaltura\.com(?::\d+)?/ + (?:(?:www|cdnapi(?:sec)?)\.)?kaltura\.com(?::\d+)?/ (?: (?: # flash player diff --git a/yt_dlp/extractor/khanacademy.py b/yt_dlp/extractor/khanacademy.py index 5333036a8b..42eef3c922 100644 --- a/yt_dlp/extractor/khanacademy.py +++ b/yt_dlp/extractor/khanacademy.py @@ -3,43 +3,52 @@ import json from .common import InfoExtractor from ..utils import ( int_or_none, + make_archive_id, parse_iso8601, - try_get, + str_or_none, + traverse_obj, + url_or_none, + urljoin, ) class KhanAcademyBaseIE(InfoExtractor): _VALID_URL_TEMPL = r'https?://(?:www\.)?khanacademy\.org/(?P(?:[^/]+/){%s}%s[^?#/&]+)' + _PUBLISHED_CONTENT_VERSION = 'dc34750f0572c80f5effe7134082fe351143c1e4' + def _parse_video(self, video): return { '_type': 'url_transparent', 'url': video['youtubeId'], - 'id': video.get('slug'), - 'title': video.get('title'), - 'thumbnail': video.get('imageUrl') or video.get('thumbnailUrl'), - 'duration': int_or_none(video.get('duration')), - 'description': video.get('description'), + 'id': video['youtubeId'], 
'ie_key': 'Youtube', + **traverse_obj(video, { + 'display_id': ('id', {str_or_none}), + 'title': ('translatedTitle', {str}), + 'thumbnail': ('thumbnailUrls', ..., 'url', {url_or_none}), + 'duration': ('duration', {int_or_none}), + 'description': ('description', {str}), + }, get_all=False), } def _real_extract(self, url): display_id = self._match_id(url) content = self._download_json( - 'https://www.khanacademy.org/api/internal/graphql/FetchContentData', - display_id, query={ + 'https://www.khanacademy.org/api/internal/graphql/ContentForPath', display_id, + query={ 'fastly_cacheable': 'persist_until_publish', - 'hash': '4134764944', - 'lang': 'en', + 'pcv': self._PUBLISHED_CONTENT_VERSION, + 'hash': '3712657851', 'variables': json.dumps({ 'path': display_id, - 'queryParams': 'lang=en', - 'isModal': False, - 'followRedirects': True, 'countryCode': 'US', + 'kaLocale': 'en', + 'clientPublishedContentVersion': self._PUBLISHED_CONTENT_VERSION, }), - })['data']['contentJson'] - return self._parse_component_props(self._parse_json(content, display_id)['componentProps']) + 'lang': 'en', + })['data']['contentRoute']['listedPathData'] + return self._parse_component_props(content, display_id) class KhanAcademyIE(KhanAcademyBaseIE): @@ -47,64 +56,98 @@ class KhanAcademyIE(KhanAcademyBaseIE): _VALID_URL = KhanAcademyBaseIE._VALID_URL_TEMPL % ('4', 'v/') _TEST = { 'url': 'https://www.khanacademy.org/computing/computer-science/cryptography/crypt/v/one-time-pad', - 'md5': '9c84b7b06f9ebb80d22a5c8dedefb9a0', + 'md5': '1d5c2e70fa6aa29c38eca419f12515ce', 'info_dict': { 'id': 'FlIG3TvQCBQ', 'ext': 'mp4', 'title': 'The one-time pad', 'description': 'The perfect cipher', + 'display_id': '716378217', 'duration': 176, - 'uploader': 'Brit Cruise', - 'uploader_id': 'khanacademy', + 'uploader': 'Khan Academy', + 'uploader_id': '@khanacademy', + 'uploader_url': 'https://www.youtube.com/@khanacademy', 'upload_date': '20120411', 'timestamp': 1334170113, 'license': 'cc-by-nc-sa', + 'live_status': 
'not_live', + 'channel': 'Khan Academy', + 'channel_id': 'UC4a-Gbdw7vOaccHmFo40b9g', + 'channel_url': 'https://www.youtube.com/channel/UC4a-Gbdw7vOaccHmFo40b9g', + 'channel_is_verified': True, + 'playable_in_embed': True, + 'categories': ['Education'], + 'creators': ['Brit Cruise'], + 'tags': [], + 'age_limit': 0, + 'availability': 'public', + 'comment_count': int, + 'channel_follower_count': int, + 'thumbnail': str, + 'view_count': int, + 'like_count': int, + 'heatmap': list, }, 'add_ie': ['Youtube'], } - def _parse_component_props(self, component_props): - video = component_props['tutorialPageData']['contentModel'] - info = self._parse_video(video) - author_names = video.get('authorNames') - info.update({ - 'uploader': ', '.join(author_names) if author_names else None, - 'timestamp': parse_iso8601(video.get('dateAdded')), - 'license': video.get('kaUserLicense'), - }) - return info + def _parse_component_props(self, component_props, display_id): + video = component_props['content'] + return { + **self._parse_video(video), + **traverse_obj(video, { + 'creators': ('authorNames', ..., {str}), + 'timestamp': ('dateAdded', {parse_iso8601}), + 'license': ('kaUserLicense', {str}), + }), + } class KhanAcademyUnitIE(KhanAcademyBaseIE): IE_NAME = 'khanacademy:unit' - _VALID_URL = (KhanAcademyBaseIE._VALID_URL_TEMPL % ('2', '')) + '/?(?:[?#&]|$)' - _TEST = { + _VALID_URL = (KhanAcademyBaseIE._VALID_URL_TEMPL % ('1,2', '')) + '/?(?:[?#&]|$)' + _TESTS = [{ 'url': 'https://www.khanacademy.org/computing/computer-science/cryptography', 'info_dict': { - 'id': 'cryptography', + 'id': 'x48c910b6', 'title': 'Cryptography', 'description': 'How have humans protected their secret messages through history? 
What has changed today?', + 'display_id': 'computing/computer-science/cryptography', + '_old_archive_ids': ['khanacademyunit cryptography'], }, 'playlist_mincount': 31, - } + }, { + 'url': 'https://www.khanacademy.org/computing/computer-science', + 'info_dict': { + 'id': 'x301707a0', + 'title': 'Computer science theory', + 'description': 'md5:4b472a4646e6cf6ec4ccb52c4062f8ba', + 'display_id': 'computing/computer-science', + '_old_archive_ids': ['khanacademyunit computer-science'], + }, + 'playlist_mincount': 50, + }] + + def _parse_component_props(self, component_props, display_id): + course = component_props['course'] + selected_unit = traverse_obj(course, ( + 'unitChildren', lambda _, v: v['relativeUrl'] == f'/{display_id}', any)) or course - def _parse_component_props(self, component_props): - curation = component_props['curation'] + def build_entry(entry): + return self.url_result(urljoin( + 'https://www.khanacademy.org', entry['canonicalUrl']), + KhanAcademyIE, title=entry.get('translatedTitle')) - entries = [] - tutorials = try_get(curation, lambda x: x['tabs'][0]['modules'][0]['tutorials'], list) or [] - for tutorial_number, tutorial in enumerate(tutorials, 1): - chapter_info = { - 'chapter': tutorial.get('title'), - 'chapter_number': tutorial_number, - 'chapter_id': tutorial.get('id'), - } - for content_item in (tutorial.get('contentItems') or []): - if content_item.get('kind') == 'Video': - info = self._parse_video(content_item) - info.update(chapter_info) - entries.append(info) + entries = traverse_obj(selected_unit, ( + (('unitChildren', ...), None), 'allOrderedChildren', ..., 'curatedChildren', + lambda _, v: v['contentKind'] == 'Video' and v['canonicalUrl'], {build_entry})) return self.playlist_result( - entries, curation.get('unit'), curation.get('title'), - curation.get('description')) + entries, + display_id=display_id, + **traverse_obj(selected_unit, { + 'id': ('id', {str}), + 'title': ('translatedTitle', {str}), + 'description': 
('translatedDescription', {str}), + '_old_archive_ids': ('slug', {str}, {lambda x: [make_archive_id(self, x)] if x else None}), + })) diff --git a/yt_dlp/extractor/kick.py b/yt_dlp/extractor/kick.py index 889548f526..bd21e59501 100644 --- a/yt_dlp/extractor/kick.py +++ b/yt_dlp/extractor/kick.py @@ -1,9 +1,14 @@ +import functools + from .common import InfoExtractor from ..networking import HEADRequest from ..utils import ( UserNotLive, + determine_ext, float_or_none, + int_or_none, merge_dicts, + parse_iso8601, str_or_none, traverse_obj, unified_timestamp, @@ -25,104 +30,212 @@ class KickBaseIE(InfoExtractor): def _call_api(self, path, display_id, note='Downloading API JSON', headers={}, **kwargs): return self._download_json( - f'https://kick.com/api/v1/{path}', display_id, note=note, + f'https://kick.com/api/{path}', display_id, note=note, headers=merge_dicts(headers, self._API_HEADERS), impersonate=True, **kwargs) class KickIE(KickBaseIE): + IE_NAME = 'kick:live' _VALID_URL = r'https?://(?:www\.)?kick\.com/(?!(?:video|categories|search|auth)(?:[/?#]|$))(?P[\w-]+)' _TESTS = [{ - 'url': 'https://kick.com/yuppy', + 'url': 'https://kick.com/buddha', 'info_dict': { - 'id': '6cde1-kickrp-joe-flemmingskick-info-heremust-knowmust-see21', + 'id': '92722911-nopixel-40', 'ext': 'mp4', 'title': str, 'description': str, - 'channel': 'yuppy', - 'channel_id': '33538', - 'uploader': 'Yuppy', - 'uploader_id': '33793', - 'upload_date': str, - 'live_status': 'is_live', 'timestamp': int, - 'thumbnail': r're:^https?://.*\.jpg', + 'thumbnail': r're:https?://.+\.jpg', 'categories': list, + 'upload_date': str, + 'channel': 'buddha', + 'channel_id': '32807', + 'uploader': 'Buddha', + 'uploader_id': '33057', + 'live_status': 'is_live', + 'concurrent_view_count': int, + 'release_timestamp': int, + 'age_limit': 18, + 'release_date': str, }, - 'skip': 'livestream', + 'params': {'skip_download': 'livestream'}, + # 'skip': 'livestream', }, { - 'url': 'https://kick.com/kmack710', + 'url': 
'https://kick.com/xqc', 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return False if (KickVODIE.suitable(url) or KickClipIE.suitable(url)) else super().suitable(url) + def _real_extract(self, url): channel = self._match_id(url) - response = self._call_api(f'channels/{channel}', channel) + response = self._call_api(f'v2/channels/{channel}', channel) if not traverse_obj(response, 'livestream', expected_type=dict): raise UserNotLive(video_id=channel) return { - 'id': str(traverse_obj( - response, ('livestream', ('slug', 'id')), get_all=False, default=channel)), - 'formats': self._extract_m3u8_formats( - response['playback_url'], channel, 'mp4', live=True), - 'title': traverse_obj( - response, ('livestream', ('session_title', 'slug')), get_all=False, default=''), - 'description': traverse_obj(response, ('user', 'bio')), 'channel': channel, - 'channel_id': str_or_none(traverse_obj(response, 'id', ('livestream', 'channel_id'))), - 'uploader': traverse_obj(response, 'name', ('user', 'username')), - 'uploader_id': str_or_none(traverse_obj(response, 'user_id', ('user', 'id'))), 'is_live': True, - 'timestamp': unified_timestamp(traverse_obj(response, ('livestream', 'created_at'))), - 'thumbnail': traverse_obj( - response, ('livestream', 'thumbnail', 'url'), expected_type=url_or_none), - 'categories': traverse_obj(response, ('recent_categories', ..., 'name')), + 'formats': self._extract_m3u8_formats(response['playback_url'], channel, 'mp4', live=True), + **traverse_obj(response, { + 'id': ('livestream', 'slug', {str}), + 'title': ('livestream', 'session_title', {str}), + 'description': ('user', 'bio', {str}), + 'channel_id': (('id', ('livestream', 'channel_id')), {int}, {str_or_none}, any), + 'uploader': (('name', ('user', 'username')), {str}, any), + 'uploader_id': (('user_id', ('user', 'id')), {int}, {str_or_none}, any), + 'timestamp': ('livestream', 'created_at', {unified_timestamp}), + 'release_timestamp': ('livestream', 'start_time', 
{unified_timestamp}), + 'thumbnail': ('livestream', 'thumbnail', 'url', {url_or_none}), + 'categories': ('recent_categories', ..., 'name', {str}), + 'concurrent_view_count': ('livestream', 'viewer_count', {int_or_none}), + 'age_limit': ('livestream', 'is_mature', {bool}, {lambda x: 18 if x else 0}), + }), } class KickVODIE(KickBaseIE): - _VALID_URL = r'https?://(?:www\.)?kick\.com/video/(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' + IE_NAME = 'kick:vod' + _VALID_URL = r'https?://(?:www\.)?kick\.com/[\w-]+/videos/(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' _TESTS = [{ - 'url': 'https://kick.com/video/58bac65b-e641-4476-a7ba-3707a35e60e3', + 'url': 'https://kick.com/xqc/videos/8dd97a8d-e17f-48fb-8bc3-565f88dbc9ea', 'md5': '3870f94153e40e7121a6e46c068b70cb', 'info_dict': { - 'id': '58bac65b-e641-4476-a7ba-3707a35e60e3', + 'id': '8dd97a8d-e17f-48fb-8bc3-565f88dbc9ea', 'ext': 'mp4', - 'title': '🤠REBIRTH IS BACK!!!!🤠!stake CODE JAREDFPS 🤠', - 'description': 'md5:02b0c46f9b4197fb545ab09dddb85b1d', - 'channel': 'jaredfps', - 'channel_id': '26608', - 'uploader': 'JaredFPS', - 'uploader_id': '26799', - 'upload_date': '20240402', - 'timestamp': 1712097108, - 'duration': 33859.0, + 'title': '18+ #ad 🛑LIVE🛑CLICK🛑DRAMA🛑NEWS🛑STUFF🛑REACT🛑GET IN HHERE🛑BOP BOP🛑WEEEE WOOOO🛑', + 'description': 'THE BEST AT ABSOLUTELY EVERYTHING. THE JUICER. 
LEADER OF THE JUICERS.', + 'channel': 'xqc', + 'channel_id': '668', + 'uploader': 'xQc', + 'uploader_id': '676', + 'upload_date': '20240909', + 'timestamp': 1725919141, + 'duration': 10155.0, 'thumbnail': r're:^https?://.*\.jpg', - 'categories': ['Call of Duty: Warzone'], - }, - 'params': { - 'skip_download': 'm3u8', + 'view_count': int, + 'categories': ['Just Chatting'], + 'age_limit': 0, }, - 'expected_warnings': [r'impersonation'], + 'params': {'skip_download': 'm3u8'}, }] def _real_extract(self, url): video_id = self._match_id(url) - response = self._call_api(f'video/{video_id}', video_id) + response = self._call_api(f'v1/video/{video_id}', video_id) return { 'id': video_id, 'formats': self._extract_m3u8_formats(response['source'], video_id, 'mp4'), - 'title': traverse_obj( - response, ('livestream', ('session_title', 'slug')), get_all=False, default=''), - 'description': traverse_obj(response, ('livestream', 'channel', 'user', 'bio')), - 'channel': traverse_obj(response, ('livestream', 'channel', 'slug')), - 'channel_id': str_or_none(traverse_obj(response, ('livestream', 'channel', 'id'))), - 'uploader': traverse_obj(response, ('livestream', 'channel', 'user', 'username')), - 'uploader_id': str_or_none(traverse_obj(response, ('livestream', 'channel', 'user_id'))), - 'timestamp': unified_timestamp(response.get('created_at')), - 'duration': float_or_none(traverse_obj(response, ('livestream', 'duration')), scale=1000), - 'thumbnail': traverse_obj( - response, ('livestream', 'thumbnail'), expected_type=url_or_none), - 'categories': traverse_obj(response, ('livestream', 'categories', ..., 'name')), + **traverse_obj(response, { + 'title': ('livestream', ('session_title', 'slug'), {str}, any), + 'description': ('livestream', 'channel', 'user', 'bio', {str}), + 'channel': ('livestream', 'channel', 'slug', {str}), + 'channel_id': ('livestream', 'channel', 'id', {int}, {str_or_none}), + 'uploader': ('livestream', 'channel', 'user', 'username', {str}), + 'uploader_id': 
('livestream', 'channel', 'user_id', {int}, {str_or_none}), + 'timestamp': ('created_at', {parse_iso8601}), + 'duration': ('livestream', 'duration', {functools.partial(float_or_none, scale=1000)}), + 'thumbnail': ('livestream', 'thumbnail', {url_or_none}), + 'categories': ('livestream', 'categories', ..., 'name', {str}), + 'view_count': ('views', {int_or_none}), + 'age_limit': ('livestream', 'is_mature', {bool}, {lambda x: 18 if x else 0}), + }), + } + + +class KickClipIE(KickBaseIE): + IE_NAME = 'kick:clips' + _VALID_URL = r'https?://(?:www\.)?kick\.com/[\w-]+(?:/clips/|/?\?(?:[^#]+&)?clip=)(?Pclip_[\w-]+)' + _TESTS = [{ + 'url': 'https://kick.com/mxddy?clip=clip_01GYXVB5Y8PWAPWCWMSBCFB05X', + 'info_dict': { + 'id': 'clip_01GYXVB5Y8PWAPWCWMSBCFB05X', + 'ext': 'mp4', + 'title': 'Maddy detains Abd D:', + 'channel': 'mxddy', + 'channel_id': '133789', + 'uploader': 'AbdCreates', + 'uploader_id': '3309077', + 'thumbnail': r're:^https?://.*\.jpeg', + 'duration': 35, + 'timestamp': 1682481453, + 'upload_date': '20230426', + 'view_count': int, + 'like_count': int, + 'categories': ['VALORANT'], + 'age_limit': 18, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://kick.com/destiny?clip=clip_01H9SKET879NE7N9RJRRDS98J3', + 'info_dict': { + 'id': 'clip_01H9SKET879NE7N9RJRRDS98J3', + 'title': 'W jews', + 'ext': 'mp4', + 'channel': 'destiny', + 'channel_id': '1772249', + 'uploader': 'punished_furry', + 'uploader_id': '2027722', + 'duration': 49.0, + 'upload_date': '20230908', + 'timestamp': 1694150180, + 'thumbnail': 'https://clips.kick.com/clips/j3/clip_01H9SKET879NE7N9RJRRDS98J3/thumbnail.png', + 'view_count': int, + 'like_count': int, + 'categories': ['Just Chatting'], + 'age_limit': 0, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://kick.com/spreen/clips/clip_01J8RGZRKHXHXXKJEHGRM932A5', + 'info_dict': { + 'id': 'clip_01J8RGZRKHXHXXKJEHGRM932A5', + 'ext': 'mp4', + 'title': 'KLJASLDJKLJKASDLJKDAS', + 'channel': 'spreen', + 'channel_id': 
'5312671', + 'uploader': 'AnormalBarraBaja', + 'uploader_id': '26518262', + 'duration': 43.0, + 'upload_date': '20240927', + 'timestamp': 1727399987, + 'thumbnail': 'https://clips.kick.com/clips/f2/clip_01J8RGZRKHXHXXKJEHGRM932A5/thumbnail.webp', + 'view_count': int, + 'like_count': int, + 'categories': ['Minecraft'], + 'age_limit': 0, + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + clip_id = self._match_id(url) + clip = self._call_api(f'v2/clips/{clip_id}/play', clip_id)['clip'] + clip_url = clip['clip_url'] + + if determine_ext(clip_url) == 'm3u8': + formats = self._extract_m3u8_formats(clip_url, clip_id, 'mp4') + else: + formats = [{'url': clip_url}] + + return { + 'id': clip_id, + 'formats': formats, + **traverse_obj(clip, { + 'title': ('title', {str}), + 'channel': ('channel', 'slug', {str}), + 'channel_id': ('channel', 'id', {int}, {str_or_none}), + 'uploader': ('creator', 'username', {str}), + 'uploader_id': ('creator', 'id', {int}, {str_or_none}), + 'thumbnail': ('thumbnail_url', {url_or_none}), + 'duration': ('duration', {float_or_none}), + 'categories': ('category', 'name', {str}, all), + 'timestamp': ('created_at', {parse_iso8601}), + 'view_count': ('views', {int_or_none}), + 'like_count': ('likes', {int_or_none}), + 'age_limit': ('is_mature', {bool}, {lambda x: 18 if x else 0}), + }), } diff --git a/yt_dlp/extractor/kika.py b/yt_dlp/extractor/kika.py new file mode 100644 index 0000000000..852a4de3f2 --- /dev/null +++ b/yt_dlp/extractor/kika.py @@ -0,0 +1,126 @@ +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + parse_duration, + parse_iso8601, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class KikaIE(InfoExtractor): + IE_DESC = 'KiKA.de' + _VALID_URL = r'https?://(?:www\.)?kika\.de/[\w/-]+/videos/(?P[a-z-]+\d+)' + _GEO_COUNTRIES = ['DE'] + + _TESTS = [{ + 'url': 
'https://www.kika.de/logo/videos/logo-vom-samstag-einunddreissig-august-zweitausendvierundzwanzig-100', + 'md5': 'fbfc8da483719ef06f396e5e5b938c69', + 'info_dict': { + 'id': 'logo-vom-samstag-einunddreissig-august-zweitausendvierundzwanzig-100', + 'ext': 'mp4', + 'upload_date': '20240831', + 'timestamp': 1725126600, + 'season_number': 2024, + 'modified_date': '20240831', + 'episode': 'Episode 476', + 'episode_number': 476, + 'season': 'Season 2024', + 'duration': 634, + 'title': 'logo! vom Samstag, 31. August 2024', + 'modified_timestamp': 1725129983, + }, + }, { + 'url': 'https://www.kika.de/kaltstart/videos/video92498', + 'md5': '710ece827e5055094afeb474beacb7aa', + 'info_dict': { + 'id': 'video92498', + 'ext': 'mp4', + 'title': '7. Wo ist Leo?', + 'description': 'md5:fb48396a5b75068bcac1df74f1524920', + 'duration': 436, + 'timestamp': 1702926876, + 'upload_date': '20231218', + 'episode_number': 7, + 'modified_date': '20240319', + 'modified_timestamp': 1710880610, + 'episode': 'Episode 7', + 'season_number': 1, + 'season': 'Season 1', + }, + }, { + 'url': 'https://www.kika.de/bernd-das-brot/astrobrot/videos/video90088', + 'md5': 'ffd1b700d7de0a6616a1d08544c77294', + 'info_dict': { + 'id': 'video90088', + 'ext': 'mp4', + 'upload_date': '20221102', + 'timestamp': 1667390580, + 'duration': 197, + 'modified_timestamp': 1711093771, + 'episode_number': 8, + 'title': 'Es ist nicht leicht, ein Astrobrot zu sein', + 'modified_date': '20240322', + 'description': 'md5:d3641deaf1b5515a160788b2be4159a9', + 'season_number': 1, + 'episode': 'Episode 8', + 'season': 'Season 1', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + doc = self._download_json(f'https://www.kika.de/_next-api/proxy/v1/videos/{video_id}', video_id) + video_assets = self._download_json(doc['assets']['url'], video_id) + + subtitles = {} + if ttml_resource := url_or_none(video_assets.get('videoSubtitle')): + subtitles['de'] = [{ + 'url': ttml_resource, + 'ext': 'ttml', + }] + 
if webvtt_resource := url_or_none(video_assets.get('webvttUrl')): + subtitles.setdefault('de', []).append({ + 'url': webvtt_resource, + 'ext': 'vtt', + }) + + return { + 'id': video_id, + 'formats': list(self._extract_formats(video_assets, video_id)), + 'subtitles': subtitles, + **traverse_obj(doc, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'timestamp': ('date', {parse_iso8601}), + 'modified_timestamp': ('modificationDate', {parse_iso8601}), + 'duration': (( + ('durationInSeconds', {int_or_none}), + ('duration', {parse_duration})), any), + 'episode_number': ('episodeNumber', {int_or_none}), + 'season_number': ('season', {int_or_none}), + }), + } + + def _extract_formats(self, media_info, video_id): + for media in traverse_obj(media_info, ('assets', lambda _, v: url_or_none(v['url']))): + stream_url = media['url'] + ext = determine_ext(stream_url) + if ext == 'm3u8': + yield from self._extract_m3u8_formats( + stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + else: + yield { + 'url': stream_url, + 'format_id': ext, + **traverse_obj(media, { + 'width': ('frameWidth', {int_or_none}), + 'height': ('frameHeight', {int_or_none}), + # NB: filesize is 0 if unknown, bitrate is -1 if unknown + 'filesize': ('fileSize', {int_or_none}, {lambda x: x or None}), + 'abr': ('bitrateAudio', {int_or_none}, {lambda x: None if x == -1 else x}), + 'vbr': ('bitrateVideo', {int_or_none}, {lambda x: None if x == -1 else x}), + }), + } diff --git a/yt_dlp/extractor/laracasts.py b/yt_dlp/extractor/laracasts.py new file mode 100644 index 0000000000..4494c4b79a --- /dev/null +++ b/yt_dlp/extractor/laracasts.py @@ -0,0 +1,114 @@ +import json + +from .common import InfoExtractor +from .vimeo import VimeoIE +from ..utils import ( + clean_html, + extract_attributes, + get_element_html_by_id, + int_or_none, + parse_duration, + str_or_none, + unified_strdate, + url_or_none, + urljoin, +) +from ..utils.traversal import traverse_obj + + +class 
LaracastsBaseIE(InfoExtractor): + def _get_prop_data(self, url, display_id): + webpage = self._download_webpage(url, display_id) + return traverse_obj( + get_element_html_by_id('app', webpage), + ({extract_attributes}, 'data-page', {json.loads}, 'props')) + + def _parse_episode(self, episode): + if not traverse_obj(episode, 'vimeoId'): + self.raise_login_required('This video is only available for subscribers.') + return self.url_result( + VimeoIE._smuggle_referrer( + f'https://player.vimeo.com/video/{episode["vimeoId"]}', 'https://laracasts.com/'), + VimeoIE, url_transparent=True, + **traverse_obj(episode, { + 'id': ('id', {int}, {str_or_none}), + 'webpage_url': ('path', {lambda x: urljoin('https://laracasts.com', x)}), + 'title': ('title', {clean_html}), + 'season_number': ('chapter', {int_or_none}), + 'episode_number': ('position', {int_or_none}), + 'description': ('body', {clean_html}), + 'thumbnail': ('largeThumbnail', {url_or_none}), + 'duration': ('length', {int_or_none}), + 'date': ('dateSegments', 'published', {unified_strdate}), + })) + + +class LaracastsIE(LaracastsBaseIE): + IE_NAME = 'laracasts' + _VALID_URL = r'https?://(?:www\.)?laracasts\.com/series/(?P[\w-]+/episodes/\d+)/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://laracasts.com/series/30-days-to-learn-laravel-11/episodes/1', + 'md5': 'c8f5e7b02ad0e438ef9280a08c8493dc', + 'info_dict': { + 'id': '922040563', + 'title': 'Hello, Laravel', + 'ext': 'mp4', + 'duration': 519, + 'date': '20240312', + 'thumbnail': 'https://laracasts.s3.amazonaws.com/videos/thumbnails/youtube/30-days-to-learn-laravel-11-1.png', + 'description': 'md5:ddd658bb241975871d236555657e1dd1', + 'season_number': 1, + 'season': 'Season 1', + 'episode_number': 1, + 'episode': 'Episode 1', + 'uploader': 'Laracasts', + 'uploader_id': 'user20182673', + 'uploader_url': 'https://vimeo.com/user20182673', + }, + 'expected_warnings': ['Failed to parse XML'], # TODO: Remove when vimeo extractor is fixed + }] + + def _real_extract(self, url): + 
display_id = self._match_id(url) + return self._parse_episode(self._get_prop_data(url, display_id)['lesson']) + + +class LaracastsPlaylistIE(LaracastsBaseIE): + IE_NAME = 'laracasts:series' + _VALID_URL = r'https?://(?:www\.)?laracasts\.com/series/(?P[\w-]+)/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://laracasts.com/series/30-days-to-learn-laravel-11', + 'info_dict': { + 'title': '30 Days to Learn Laravel', + 'id': '210', + 'thumbnail': 'https://laracasts.s3.amazonaws.com/series/thumbnails/social-cards/30-days-to-learn-laravel-11.png?v=2', + 'duration': 30600.0, + 'modified_date': '20240511', + 'description': 'md5:27c260a1668a450984e8f901579912dd', + 'categories': ['Frameworks'], + 'tags': ['Laravel'], + 'display_id': '30-days-to-learn-laravel-11', + }, + 'playlist_count': 30, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + series = self._get_prop_data(url, display_id)['series'] + + metadata = { + 'display_id': display_id, + **traverse_obj(series, { + 'title': ('title', {str}), + 'id': ('id', {int}, {str_or_none}), + 'description': ('body', {clean_html}), + 'thumbnail': (('large_thumbnail', 'thumbnail'), {url_or_none}, any), + 'duration': ('runTime', {parse_duration}), + 'categories': ('taxonomy', 'name', {str}, {lambda x: x and [x]}), + 'tags': ('topics', ..., 'name', {str}), + 'modified_date': ('lastUpdated', {unified_strdate}), + }), + } + + return self.playlist_result(traverse_obj( + series, ('chapters', ..., 'episodes', lambda _, v: v['vimeoId'], {self._parse_episode})), **metadata) diff --git a/yt_dlp/extractor/lbry.py b/yt_dlp/extractor/lbry.py index c764d49611..322852dd6f 100644 --- a/yt_dlp/extractor/lbry.py +++ b/yt_dlp/extractor/lbry.py @@ -136,6 +136,7 @@ class LBRYBaseIE(InfoExtractor): class LBRYIE(LBRYBaseIE): IE_NAME = 'lbry' + IE_DESC = 'odysee.com' _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + rf''' (?:\$/(?:download|embed)/)? 
(?P @@ -364,6 +365,7 @@ class LBRYIE(LBRYBaseIE): class LBRYChannelIE(LBRYBaseIE): IE_NAME = 'lbry:channel' + IE_DESC = 'odysee.com channels' _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + rf'(?P@{LBRYBaseIE._OPT_CLAIM_ID})/?(?:[?&]|$)' _TESTS = [{ 'url': 'https://lbry.tv/@LBRYFoundation:0', @@ -391,6 +393,7 @@ class LBRYChannelIE(LBRYBaseIE): class LBRYPlaylistIE(LBRYBaseIE): IE_NAME = 'lbry:playlist' + IE_DESC = 'odysee.com playlists' _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + r'\$/(?:play)?list/(?P[0-9a-f-]+)' _TESTS = [{ 'url': 'https://odysee.com/$/playlist/ffef782f27486f0ac138bde8777f72ebdd0548c2', diff --git a/yt_dlp/extractor/learningonscreen.py b/yt_dlp/extractor/learningonscreen.py new file mode 100644 index 0000000000..dcf83144c8 --- /dev/null +++ b/yt_dlp/extractor/learningonscreen.py @@ -0,0 +1,78 @@ +import functools +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + clean_html, + extract_attributes, + get_element_by_class, + get_element_html_by_id, + join_nonempty, + parse_duration, + unified_timestamp, +) +from ..utils.traversal import traverse_obj + + +class LearningOnScreenIE(InfoExtractor): + _VALID_URL = r'https?://learningonscreen\.ac\.uk/ondemand/index\.php/prog/(?P\w+)' + _TESTS = [{ + 'url': 'https://learningonscreen.ac.uk/ondemand/index.php/prog/005D81B2?bcast=22757013', + 'info_dict': { + 'id': '005D81B2', + 'ext': 'mp4', + 'title': 'Planet Earth', + 'duration': 3600.0, + 'timestamp': 1164567600.0, + 'upload_date': '20061126', + 'thumbnail': 'https://stream.learningonscreen.ac.uk/trilt-cover-images/005D81B2-Planet-Earth-2006-11-26T190000Z-BBC4.jpg', + }, + }] + + def _real_initialize(self): + if not self._get_cookies('https://learningonscreen.ac.uk/').get('PHPSESSID-BOB-LIVE'): + self.raise_login_required( + 'Use --cookies for authentication. 
See ' + ' https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp ' + 'for how to manually pass cookies', method=None) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + details = traverse_obj(webpage, ( + {functools.partial(get_element_html_by_id, 'programme-details')}, { + 'title': ({functools.partial(re.search, r'

([^<]+)

')}, 1, {clean_html}), + 'timestamp': ( + {functools.partial(get_element_by_class, 'broadcast-date')}, + {functools.partial(re.match, r'([^<]+)')}, 1, {unified_timestamp}), + 'duration': ( + {functools.partial(get_element_by_class, 'prog-running-time')}, + {clean_html}, {parse_duration}), + })) + + title = details.pop('title', None) or traverse_obj(webpage, ( + {functools.partial(get_element_html_by_id, 'add-to-existing-playlist')}, + {extract_attributes}, 'data-record-title', {clean_html})) + + entries = self._parse_html5_media_entries( + 'https://stream.learningonscreen.ac.uk', webpage, video_id, m3u8_id='hls', mpd_id='dash', + _headers={'Origin': 'https://learningonscreen.ac.uk', 'Referer': 'https://learningonscreen.ac.uk/'}) + if not entries: + raise ExtractorError('No video found') + + if len(entries) > 1: + duration = details.pop('duration', None) + for idx, entry in enumerate(entries, start=1): + entry.update(details) + entry['id'] = join_nonempty(video_id, idx) + entry['title'] = join_nonempty(title, idx) + return self.playlist_result(entries, video_id, title, duration=duration) + + return { + **entries[0], + **details, + 'id': video_id, + 'title': title, + } diff --git a/yt_dlp/extractor/lnkgo.py b/yt_dlp/extractor/lnk.py similarity index 53% rename from yt_dlp/extractor/lnkgo.py rename to yt_dlp/extractor/lnk.py index 31a7cefd82..593f73410d 100644 --- a/yt_dlp/extractor/lnkgo.py +++ b/yt_dlp/extractor/lnk.py @@ -1,86 +1,11 @@ from .common import InfoExtractor from ..utils import ( - clean_html, format_field, int_or_none, - parse_iso8601, unified_strdate, ) -class LnkGoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?lnk(?:go)?\.(?:alfa\.)?lt/(?:visi-video/[^/]+|video)/(?P[A-Za-z0-9-]+)(?:/(?P\d+))?' 
- _TESTS = [{ - 'url': 'http://www.lnkgo.lt/visi-video/aktualai-pratesimas/ziurek-putka-trys-klausimai', - 'info_dict': { - 'id': '10809', - 'ext': 'mp4', - 'title': "Put'ka: Trys Klausimai", - 'upload_date': '20161216', - 'description': 'Seniai matytas Put’ka užduoda tris klausimėlius. Pabandykime surasti atsakymus.', - 'age_limit': 18, - 'duration': 117, - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1481904000, - }, - 'params': { - 'skip_download': True, # HLS download - }, - }, { - 'url': 'http://lnkgo.alfa.lt/visi-video/aktualai-pratesimas/ziurek-nerdas-taiso-kompiuteri-2', - 'info_dict': { - 'id': '10467', - 'ext': 'mp4', - 'title': 'Nėrdas: Kompiuterio Valymas', - 'upload_date': '20150113', - 'description': 'md5:7352d113a242a808676ff17e69db6a69', - 'age_limit': 18, - 'duration': 346, - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1421164800, - }, - 'params': { - 'skip_download': True, # HLS download - }, - }, { - 'url': 'https://lnk.lt/video/neigalieji-tv-bokste/37413', - 'only_matching': True, - }] - _AGE_LIMITS = { - 'N-7': 7, - 'N-14': 14, - 'S': 18, - } - _M3U8_TEMPL = 'https://vod.lnk.lt/lnk_vod/lnk/lnk/%s:%s/playlist.m3u8%s' - - def _real_extract(self, url): - display_id, video_id = self._match_valid_url(url).groups() - - video_info = self._download_json( - 'https://lnk.lt/api/main/video-page/{}/{}/false'.format(display_id, video_id or '0'), - display_id)['videoConfig']['videoInfo'] - - video_id = str(video_info['id']) - title = video_info['title'] - prefix = 'smil' if video_info.get('isQualityChangeAvailable') else 'mp4' - formats = self._extract_m3u8_formats( - self._M3U8_TEMPL % (prefix, video_info['videoUrl'], video_info.get('secureTokenParams') or ''), - video_id, 'mp4', 'm3u8_native') - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'formats': formats, - 'thumbnail': format_field(video_info, 'posterImage', 'https://lnk.lt/all-images/%s'), - 'duration': int_or_none(video_info.get('duration')), - 
'description': clean_html(video_info.get('htmlDescription')), - 'age_limit': self._AGE_LIMITS.get(video_info.get('pgRating'), 0), - 'timestamp': parse_iso8601(video_info.get('airDate')), - 'view_count': int_or_none(video_info.get('viewsCount')), - } - - class LnkIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?lnk\.lt/[^/]+/(?P\d+)' diff --git a/yt_dlp/extractor/loom.py b/yt_dlp/extractor/loom.py index 1191aa17ea..b0878c33e2 100644 --- a/yt_dlp/extractor/loom.py +++ b/yt_dlp/extractor/loom.py @@ -92,9 +92,9 @@ class LoomIE(InfoExtractor): }, 'params': {'videopassword': 'seniorinfants2'}, }, { - # embed, transcoded-url endpoint sends empty JSON response + # embed, transcoded-url endpoint sends empty JSON response, split video and audio HLS formats 'url': 'https://www.loom.com/embed/ddcf1c1ad21f451ea7468b1e33917e4e', - 'md5': '8488817242a0db1cb2ad0ea522553cf6', + 'md5': 'b321d261656848c184a94e3b93eae28d', 'info_dict': { 'id': 'ddcf1c1ad21f451ea7468b1e33917e4e', 'ext': 'mp4', @@ -104,6 +104,7 @@ class LoomIE(InfoExtractor): 'timestamp': 1657216459, 'duration': 181, }, + 'params': {'format': 'bestvideo'}, # Test video-only fixup 'expected_warnings': ['Failed to parse JSON'], }] _WEBPAGE_TESTS = [{ @@ -293,7 +294,11 @@ class LoomIE(InfoExtractor): format_url = format_url.replace('-split.m3u8', '.m3u8') m3u8_formats = self._extract_m3u8_formats( format_url, video_id, 'mp4', m3u8_id=f'hls-{format_id}', fatal=False, quality=quality) + # Sometimes only split video/audio formats are available, need to fixup video-only formats + is_not_premerged = 'none' in traverse_obj(m3u8_formats, (..., 'vcodec')) for fmt in m3u8_formats: + if is_not_premerged and fmt.get('vcodec') != 'none': + fmt['acodec'] = 'none' yield { **fmt, 'url': update_url(fmt['url'], query=query), diff --git a/yt_dlp/extractor/mailru.py b/yt_dlp/extractor/mailru.py index cca678f14a..0496a87f00 100644 --- a/yt_dlp/extractor/mailru.py +++ b/yt_dlp/extractor/mailru.py @@ -126,7 +126,7 @@ class 
MailRuIE(InfoExtractor): video_data = None # fix meta_url if missing the host address - if re.match(r'^\/\+\/', meta_url): + if re.match(r'\/\+\/', meta_url): meta_url = urljoin('https://my.mail.ru', meta_url) if meta_url: diff --git a/yt_dlp/extractor/matchtv.py b/yt_dlp/extractor/matchtv.py index a67fa9fe4c..93799fe859 100644 --- a/yt_dlp/extractor/matchtv.py +++ b/yt_dlp/extractor/matchtv.py @@ -1,51 +1,35 @@ -import random - from .common import InfoExtractor -from ..utils import xpath_text class MatchTVIE(InfoExtractor): - _VALID_URL = r'https?://matchtv\.ru(?:/on-air|/?#live-player)' + _VALID_URL = [ + r'https?://matchtv\.ru/on-air/?(?:$|[?#])', + r'https?://video\.matchtv\.ru/iframe/channel/106/?(?:$|[?#])', + ] _TESTS = [{ - 'url': 'http://matchtv.ru/#live-player', + 'url': 'http://matchtv.ru/on-air/', 'info_dict': { 'id': 'matchtv-live', - 'ext': 'flv', + 'ext': 'mp4', 'title': r're:^Матч ТВ - Прямой эфир \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', - 'is_live': True, + 'live_status': 'is_live', }, 'params': { 'skip_download': True, }, }, { - 'url': 'http://matchtv.ru/on-air/', + 'url': 'https://video.matchtv.ru/iframe/channel/106', 'only_matching': True, }] def _real_extract(self, url): video_id = 'matchtv-live' - video_url = self._download_json( - 'http://player.matchtv.ntvplus.tv/player/smil', video_id, - query={ - 'ts': '', - 'quality': 'SD', - 'contentId': '561d2c0df7159b37178b4567', - 'sign': '', - 'includeHighlights': '0', - 'userId': '', - 'sessionId': random.randint(1, 1000000000), - 'contentType': 'channel', - 'timeShift': '0', - 'platform': 'portal', - }, - headers={ - 'Referer': 'http://player.matchtv.ntvplus.tv/embed-player/NTVEmbedPlayer.swf', - })['data']['videoUrl'] - f4m_url = xpath_text(self._download_xml(video_url, video_id), './to') - formats = self._extract_f4m_formats(f4m_url, video_id) + webpage = self._download_webpage('https://video.matchtv.ru/iframe/channel/106', video_id) + video_url = self._html_search_regex( + 
r'data-config="config=(https?://[^?"]+)[?"]', webpage, 'video URL').replace('/feed/', '/media/') + '.m3u8' return { 'id': video_id, 'title': 'Матч ТВ - Прямой эфир', 'is_live': True, - 'formats': formats, + 'formats': self._extract_m3u8_formats(video_url, video_id, 'mp4', live=True), } diff --git a/yt_dlp/extractor/mdr.py b/yt_dlp/extractor/mdr.py index 46097fa20e..dfda3cc534 100644 --- a/yt_dlp/extractor/mdr.py +++ b/yt_dlp/extractor/mdr.py @@ -13,8 +13,8 @@ from ..utils import ( class MDRIE(InfoExtractor): - IE_DESC = 'MDR.DE and KiKA' - _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z-]+-?(?P\d+)(?:_.+?)?\.html' + IE_DESC = 'MDR.DE' + _VALID_URL = r'https?://(?:www\.)?mdr\.de/(?:.*)/[a-z-]+-?(?P\d+)(?:_.+?)?\.html' _GEO_COUNTRIES = ['DE'] @@ -34,30 +34,6 @@ class MDRIE(InfoExtractor): 'uploader': 'MITTELDEUTSCHER RUNDFUNK', }, 'skip': '404 not found', - }, { - 'url': 'http://www.kika.de/baumhaus/videos/video19636.html', - 'md5': '4930515e36b06c111213e80d1e4aad0e', - 'info_dict': { - 'id': '19636', - 'ext': 'mp4', - 'title': 'Baumhaus vom 30. 
Oktober 2015', - 'duration': 134, - 'uploader': 'KIKA', - }, - 'skip': '404 not found', - }, { - 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html', - 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', - 'info_dict': { - 'id': '8182', - 'ext': 'mp4', - 'title': 'Beutolomäus und der geheime Weihnachtswunsch', - 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', - 'timestamp': 1482541200, - 'upload_date': '20161224', - 'duration': 4628, - 'uploader': 'KIKA', - }, }, { # audio with alternative playerURL pattern 'url': 'http://www.mdr.de/kultur/videos-und-audios/audio-radio/operation-mindfuck-robert-wilson100.html', @@ -68,28 +44,7 @@ class MDRIE(InfoExtractor): 'duration': 3239, 'uploader': 'MITTELDEUTSCHER RUNDFUNK', }, - }, { - # empty bitrateVideo and bitrateAudio - 'url': 'https://www.kika.de/filme/sendung128372_zc-572e3f45_zs-1d9fb70e.html', - 'info_dict': { - 'id': '128372', - 'ext': 'mp4', - 'title': 'Der kleine Wichtel kehrt zurück', - 'description': 'md5:f77fafdff90f7aa1e9dca14f662c052a', - 'duration': 4876, - 'timestamp': 1607823300, - 'upload_date': '20201213', - 'uploader': 'ZDF', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html', - 'only_matching': True, - }, { - 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html', - 'only_matching': True, + 'skip': '404 not found', }, { 'url': 'http://www.mdr.de/mediathek/mdr-videos/a/video-1334.html', 'only_matching': True, diff --git a/yt_dlp/extractor/mediaklikk.py b/yt_dlp/extractor/mediaklikk.py index bd1a27fccc..197e91d1d9 100644 --- a/yt_dlp/extractor/mediaklikk.py +++ b/yt_dlp/extractor/mediaklikk.py @@ -16,6 +16,15 @@ class MediaKlikkIE(InfoExtractor): (?P[^/#?_]+)''' _TESTS = [{ + 'url': 'https://mediaklikk.hu/filmajanlo/cikk/az-ajto/', + 'info_dict': { + 'id': '668177', + 'title': 'Az ajtó', + 'display_id': 'az-ajto', + 'ext': 'mp4', 
+ 'thumbnail': 'https://cdn.cms.mtv.hu/wp-content/uploads/sites/4/2016/01/vlcsnap-2023-07-31-14h18m52s111.jpg', + }, + }, { # (old) mediaklikk. date in html. 'url': 'https://mediaklikk.hu/video/hazajaro-delnyugat-bacska-a-duna-menten-palankatol-doroszloig/', 'info_dict': { @@ -37,6 +46,7 @@ class MediaKlikkIE(InfoExtractor): 'upload_date': '20230903', 'thumbnail': 'https://mediaklikk.hu/wp-content/uploads/sites/4/2014/02/hazajarouj_JO.jpg', }, + 'skip': 'Webpage redirects to 404 page', }, { # (old) m4sport 'url': 'https://m4sport.hu/video/2021/08/30/gyemant-liga-parizs/', @@ -59,6 +69,7 @@ class MediaKlikkIE(InfoExtractor): 'upload_date': '20230908', 'thumbnail': 'https://m4sport.hu/wp-content/uploads/sites/4/2023/09/vlcsnap-2023-09-08-22h43m18s691.jpg', }, + 'skip': 'Webpage redirects to 404 page', }, { # m4sport with *video/ url and no date 'url': 'https://m4sport.hu/bl-video/real-madrid-chelsea-1-1/', @@ -69,6 +80,7 @@ class MediaKlikkIE(InfoExtractor): 'ext': 'mp4', 'thumbnail': 'https://m4sport.hu/wp-content/uploads/sites/4/2021/04/Sequence-01.Still001-1024x576.png', }, + 'skip': 'Webpage redirects to 404 page', }, { # (old) hirado 'url': 'https://hirado.hu/videok/felteteleket-szabott-a-fovaros/', @@ -90,6 +102,7 @@ class MediaKlikkIE(InfoExtractor): 'upload_date': '20230911', 'thumbnail': 'https://hirado.hu/wp-content/uploads/sites/4/2023/09/vlcsnap-2023-09-11-09h16m09s882.jpg', }, + 'skip': 'Webpage redirects to video list page', }, { # (old) petofilive 'url': 'https://petofilive.hu/video/2021/06/07/tha-shudras-az-akusztikban/', @@ -112,6 +125,7 @@ class MediaKlikkIE(InfoExtractor): 'upload_date': '20230909', 'thumbnail': 'https://petofilive.hu/wp-content/uploads/sites/4/2023/09/Clipboard11-2.jpg', }, + 'skip': 'Webpage redirects to video list page', }] def _real_extract(self, url): @@ -133,7 +147,9 @@ class MediaKlikkIE(InfoExtractor): r']+\bclass="article_date">([^<]+)<', webpage, 'upload date', default=None)) player_data['video'] = 
player_data.pop('token') - player_page = self._download_webpage('https://player.mediaklikk.hu/playernew/player.php', video_id, query=player_data) + player_page = self._download_webpage( + 'https://player.mediaklikk.hu/playernew/player.php', video_id, + query=player_data, headers={'Referer': url}) player_json = self._search_json( r'\bpl\.setup\s*\(', player_page, 'player json', video_id, end_pattern=r'\);') playlist_url = traverse_obj( @@ -141,14 +157,14 @@ class MediaKlikkIE(InfoExtractor): if not playlist_url: raise ExtractorError('Unable to extract playlist url') - formats = self._extract_wowza_formats( - playlist_url, video_id, skip_protocols=['f4m', 'smil', 'dash']) + formats, subtitles = self._extract_m3u8_formats_and_subtitles(playlist_url, video_id) return { 'id': video_id, 'title': title, 'display_id': display_id, 'formats': formats, + 'subtitles': subtitles, 'upload_date': upload_date, 'thumbnail': player_data.get('bgImage') or self._og_search_thumbnail(webpage), } diff --git a/yt_dlp/extractor/mediasite.py b/yt_dlp/extractor/mediasite.py index beb12f8a40..ad7ab27e28 100644 --- a/yt_dlp/extractor/mediasite.py +++ b/yt_dlp/extractor/mediasite.py @@ -15,6 +15,7 @@ from ..utils import ( url_or_none, urljoin, ) +from ..utils.traversal import traverse_obj _ID_RE = r'(?:[0-9a-f]{32,34}|[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12,14})' @@ -212,13 +213,14 @@ class MediasiteIE(InfoExtractor): stream_type, 'type%u' % stream_type) stream_formats = [] - for unum, video_url in enumerate(video_urls): - video_url = url_or_none(video_url.get('Location')) + for unum, video in enumerate(video_urls): + video_url = url_or_none(video.get('Location')) if not video_url: continue # XXX: if Stream.get('CanChangeScheme', False), switch scheme to HTTP/HTTPS - media_type = video_url.get('MediaType') + media_type = video.get('MediaType') + ext = mimetype2ext(video.get('MimeType')) if media_type == 'SS': stream_formats.extend(self._extract_ism_formats( video_url, 
resource_id, @@ -229,15 +231,20 @@ class MediasiteIE(InfoExtractor): video_url, resource_id, mpd_id=f'{stream_id}-{snum}.{unum}', fatal=False)) + elif ext in ('m3u', 'm3u8'): + stream_formats.extend(self._extract_m3u8_formats( + video_url, resource_id, + m3u8_id=f'{stream_id}-{snum}.{unum}', + fatal=False)) else: stream_formats.append({ 'format_id': f'{stream_id}-{snum}.{unum}', 'url': video_url, - 'ext': mimetype2ext(video_url.get('MimeType')), + 'ext': ext, }) - if stream.get('HasSlideContent', False): - images = player_options['PlayerLayoutOptions']['Images'] + images = traverse_obj(player_options, ('PlayerLayoutOptions', 'Images', {dict})) + if stream.get('HasSlideContent') and images: stream_formats.append(self.__extract_slides( stream_id=stream_id, snum=snum, diff --git a/yt_dlp/extractor/mgtv.py b/yt_dlp/extractor/mgtv.py index d5dda06f99..c793626fde 100644 --- a/yt_dlp/extractor/mgtv.py +++ b/yt_dlp/extractor/mgtv.py @@ -16,7 +16,7 @@ from ..utils import ( class MGTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:w(?:ww)?\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P\d+)\.html' + _VALID_URL = r'https?://(?:w(?:ww)?\.)?mgtv\.com/[bv]/(?:[^/]+/)*(?P\d+)\.html' IE_DESC = '芒果TV' IE_NAME = 'MangoTV' diff --git a/yt_dlp/extractor/microsoftembed.py b/yt_dlp/extractor/microsoftembed.py index 98d50b18a9..d0135f5a9c 100644 --- a/yt_dlp/extractor/microsoftembed.py +++ b/yt_dlp/extractor/microsoftembed.py @@ -1,5 +1,14 @@ +import re + from .common import InfoExtractor -from ..utils import int_or_none, traverse_obj, unified_timestamp +from ..utils import ( + int_or_none, + parse_iso8601, + traverse_obj, + unified_timestamp, + url_basename, + url_or_none, +) class MicrosoftEmbedIE(InfoExtractor): @@ -63,3 +72,250 @@ class MicrosoftEmbedIE(InfoExtractor): 'subtitles': subtitles, 'thumbnails': thumbnails, } + + +class MicrosoftMediusBaseIE(InfoExtractor): + @staticmethod + def _sub_to_dict(subtitle_list): + subtitles = {} + for sub in subtitle_list: + 
subtitles.setdefault(sub.pop('tag', 'und'), []).append(sub) + return subtitles + + def _extract_ism(self, ism_url, video_id): + formats = self._extract_ism_formats(ism_url, video_id) + for fmt in formats: + if fmt['language'] != 'eng' and 'English' not in fmt['format_id']: + fmt['language_preference'] = -10 + return formats + + +class MicrosoftMediusIE(MicrosoftMediusBaseIE): + _VALID_URL = r'https?://medius\.microsoft\.com/Embed/(?:Video\?id=|video-nc/|VideoDetails/)(?P[\da-f-]+)' + + _TESTS = [{ + 'url': 'https://medius.microsoft.com/Embed/video-nc/9640d86c-f513-4889-959e-5dace86e7d2b', + 'info_dict': { + 'id': '9640d86c-f513-4889-959e-5dace86e7d2b', + 'ext': 'ismv', + 'title': 'Rapidly code, test and ship from secure cloud developer environments', + 'description': 'md5:33c8e4facadc438613476eea24165f71', + 'thumbnail': r're:https://mediusimg\.event\.microsoft\.com/video-\d+/thumbnail\.jpg.*', + 'subtitles': 'count:30', + }, + }, { + 'url': 'https://medius.microsoft.com/Embed/video-nc/81215af5-c813-4dcd-aede-94f4e1a7daa3', + 'info_dict': { + 'id': '81215af5-c813-4dcd-aede-94f4e1a7daa3', + 'ext': 'ismv', + 'title': 'Microsoft Build opening', + 'description': 'md5:43455096141077a1f23144cab8cec1cb', + 'thumbnail': r're:https://mediusimg\.event\.microsoft\.com/video-\d+/thumbnail\.jpg.*', + 'subtitles': 'count:31', + }, + }, { + 'url': 'https://medius.microsoft.com/Embed/VideoDetails/78493569-9b3b-4a85-a409-ee76e789e25c', + 'info_dict': { + 'id': '78493569-9b3b-4a85-a409-ee76e789e25c', + 'ext': 'ismv', + 'title': ' Anomaly Detection & Root cause at Edge', + 'description': 'md5:f8f1ad93d7918649bfb97fa081b03b83', + 'thumbnail': r're:https://mediusdownload.event.microsoft.com/asset.*\.jpg.*', + 'subtitles': 'count:17', + }, + }, { + 'url': 'https://medius.microsoft.com/Embed/Video?id=0dc69bda-079b-4070-a7db-a8da1a06a9c7', + 'only_matching': True, + }, { + 'url': 'https://medius.microsoft.com/Embed/video-nc/fe823a91-959c-465b-96d4-8f4db624f72c', + 'only_matching': True, + 
}] + + def _extract_subtitle(self, webpage, video_id): + captions = traverse_obj( + self._search_json(r'const\s+captionsConfiguration\s*=', webpage, 'captions', video_id, default=None), + ('languageList', lambda _, v: url_or_none(v['src']), { + 'url': 'src', + 'tag': ('srclang', {str}), + 'name': ('kind', {str}), + })) or [{'url': url, 'tag': url_basename(url).split('.vtt')[0].split('_')[-1]} + for url in re.findall(r'var\s+file\s+=\s+\{[^}]+\'(https://[^\']+\.vtt\?[^\']+)', webpage)] + + return self._sub_to_dict(captions) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(f'https://medius.microsoft.com/Embed/video-nc/{video_id}', video_id) + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'formats': self._extract_ism( + self._search_regex(r'StreamUrl\s*=\s*"([^"]+manifest)"', webpage, 'ism url'), video_id), + 'thumbnail': self._og_search_thumbnail(webpage), + 'subtitles': self._extract_subtitle(webpage, video_id), + } + + +class MicrosoftLearnPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://learn\.microsoft\.com/(?:[\w-]+/)?(?Pshows|events)/(?P[\w-]+)/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://learn.microsoft.com/en-us/shows/bash-for-beginners', + 'info_dict': { + 'id': 'bash-for-beginners', + 'title': 'Bash for Beginners', + 'description': 'md5:16a91c07222117d1e00912f0dbc02c2c', + }, + 'playlist_count': 20, + }, { + 'url': 'https://learn.microsoft.com/en-us/events/build-2022', + 'info_dict': { + 'id': 'build-2022', + 'title': 'Microsoft Build 2022 - Events', + 'description': 'md5:c16b43848027df837b22c6fbac7648d3', + }, + 'playlist_count': 201, + }] + + def _entries(self, url_base, video_id): + skip = 0 + while True: + playlist_info = self._download_json(url_base, video_id, f'Downloading entries {skip}', query={ + 'locale': 'en-us', + '$skip': skip, + }) + url_paths = traverse_obj(playlist_info, ('results', ..., 'url', {str})) + 
for url_path in url_paths: + yield self.url_result(f'https://learn.microsoft.com/en-us{url_path}') + skip += len(url_paths) + if skip >= playlist_info.get('count', 0) or not url_paths: + break + + def _real_extract(self, url): + playlist_id, playlist_type = self._match_valid_url(url).group('id', 'type') + webpage = self._download_webpage(url, playlist_id) + + metainfo = { + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + } + sub_type = 'episodes' if playlist_type == 'shows' else 'sessions' + + url_base = f'https://learn.microsoft.com/api/contentbrowser/search/{playlist_type}/{playlist_id}/{sub_type}' + return self.playlist_result(self._entries(url_base, playlist_id), playlist_id, **metainfo) + + +class MicrosoftLearnEpisodeIE(MicrosoftMediusBaseIE): + _VALID_URL = r'https?://learn\.microsoft\.com/(?:[\w-]+/)?shows/[\w-]+/(?P[^?#/]+)' + _TESTS = [{ + 'url': 'https://learn.microsoft.com/en-us/shows/bash-for-beginners/what-is-the-difference-between-a-terminal-and-a-shell-2-of-20-bash-for-beginners/', + 'info_dict': { + 'id': 'd44e1a03-a0e5-45c2-9496-5c9fa08dc94c', + 'ext': 'ismv', + 'title': 'What is the Difference Between a Terminal and a Shell? 
(Part 2 of 20)', + 'description': 'md5:7bbbfb593d21c2cf2babc3715ade6b88', + 'timestamp': 1676339547, + 'upload_date': '20230214', + 'thumbnail': r're:https://learn\.microsoft\.com/video/media/.*\.png', + 'subtitles': 'count:14', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + entry_id = self._html_search_meta('entryId', webpage, 'entryId', fatal=True) + video_info = self._download_json( + f'https://learn.microsoft.com/api/video/public/v1/entries/{entry_id}', video_id) + return { + 'id': entry_id, + 'formats': self._extract_ism(video_info['publicVideo']['adaptiveVideoUrl'], video_id), + 'subtitles': self._sub_to_dict(traverse_obj(video_info, ( + 'publicVideo', 'captions', lambda _, v: url_or_none(v['url']), { + 'tag': ('language', {str}), + 'url': 'url', + }))), + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + **traverse_obj(video_info, { + 'timestamp': ('createTime', {parse_iso8601}), + 'thumbnails': ('publicVideo', 'thumbnailOtherSizes', ..., {'url': {url_or_none}}), + }), + } + + +class MicrosoftLearnSessionIE(InfoExtractor): + _VALID_URL = r'https?://learn\.microsoft\.com/(?:[\w-]+/)?events/[\w-]+/(?P[^?#/]+)' + _TESTS = [{ + 'url': 'https://learn.microsoft.com/en-us/events/build-2022/ts01-rapidly-code-test-ship-from-secure-cloud-developer-environments', + 'info_dict': { + 'id': '9640d86c-f513-4889-959e-5dace86e7d2b', + 'ext': 'ismv', + 'title': 'Rapidly code, test and ship from secure cloud developer environments - Events', + 'description': 'md5:f26c1a85d41c1cffd27a0279254a25c3', + 'timestamp': 1653408600, + 'upload_date': '20220524', + 'thumbnail': r're:https://mediusimg\.event\.microsoft\.com/video-\d+/thumbnail\.jpg.*', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + metainfo = { + 'title': self._og_search_title(webpage), + 'description': 
self._og_search_description(webpage), + 'timestamp': parse_iso8601(self._html_search_meta('startDate', webpage, 'startDate')), + } + + return self.url_result( + self._html_search_meta('externalVideoUrl', webpage, 'videoUrl', fatal=True), + url_transparent=True, ie=MicrosoftMediusIE, **metainfo) + + +class MicrosoftBuildIE(InfoExtractor): + _VALID_URL = [ + r'https?://build\.microsoft\.com/[\w-]+/sessions/(?P[\da-f-]+)', + r'https?://build\.microsoft\.com/[\w-]+/(?Psessions)/?(?:[?#]|$)', + ] + + _TESTS = [{ + 'url': 'https://build.microsoft.com/en-US/sessions/b49feb31-afcd-4217-a538-d3ca1d171198?source=sessions', + 'info_dict': { + 'id': 'aee55fb5-fcf9-4b38-b764-a3527cb57554', + 'ext': 'ismv', + 'title': 'Microsoft Build opening keynote', + 'description': 'md5:d38338f336ef4b6ef9ad2a7466a76655', + 'timestamp': 1716307200, + 'upload_date': '20240521', + 'thumbnail': r're:https://mediusimg\.event\.microsoft\.com/video-\d+/thumbnail\.jpg.*', + }, + }, { + 'url': 'https://build.microsoft.com/en-US/sessions', + 'info_dict': { + 'id': 'sessions', + }, + 'playlist_mincount': 418, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + entries = [ + self.url_result( + video_info['onDemand'], ie=MicrosoftMediusIE, url_transparent=True, **traverse_obj(video_info, { + 'id': ('sessionId', {str}), + 'title': ('title', {str}), + 'description': ('description', {str}), + 'timestamp': ('startDateTime', {parse_iso8601}), + })) + for video_info in self._download_json( + 'https://api-v2.build.microsoft.com/api/session/all/en-US', video_id, 'Downloading video info') + ] + if video_id == 'sessions': + return self.playlist_result(entries, video_id) + else: + return traverse_obj(entries, (lambda _, v: v['id'] == video_id), get_all=False) diff --git a/yt_dlp/extractor/microsoftvirtualacademy.py b/yt_dlp/extractor/microsoftvirtualacademy.py deleted file mode 100644 index e354d8a507..0000000000 --- a/yt_dlp/extractor/microsoftvirtualacademy.py +++ /dev/null @@ -1,188 +0,0 
@@ -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_duration, - smuggle_url, - unsmuggle_url, - xpath_text, -) - - -class MicrosoftVirtualAcademyBaseIE(InfoExtractor): - def _extract_base_url(self, course_id, display_id): - return self._download_json( - f'https://api-mlxprod.microsoft.com/services/products/anonymous/{course_id}', - display_id, 'Downloading course base URL') - - def _extract_chapter_and_title(self, title): - if not title: - return None, None - m = re.search(r'(?P\d+)\s*\|\s*(?P.+)', title) - return (int(m.group('chapter')), m.group('title')) if m else (None, title) - - -class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE): - IE_NAME = 'mva' - IE_DESC = 'Microsoft Virtual Academy videos' - _VALID_URL = rf'(?:{IE_NAME}:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/[^/?#&]+-)(?P<course_id>\d+)(?::|\?l=)(?P<id>[\da-zA-Z]+_\d+)' - - _TESTS = [{ - 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788?l=gfVXISmEB_6804984382', - 'md5': '7826c44fc31678b12ad8db11f6b5abb9', - 'info_dict': { - 'id': 'gfVXISmEB_6804984382', - 'ext': 'mp4', - 'title': 'Course Introduction', - 'formats': 'mincount:3', - 'subtitles': { - 'en': [{ - 'ext': 'ttml', - }], - }, - }, - }, { - 'url': 'mva:11788:gfVXISmEB_6804984382', - 'only_matching': True, - }] - - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - - mobj = self._match_valid_url(url) - course_id = mobj.group('course_id') - video_id = mobj.group('id') - - base_url = smuggled_data.get('base_url') or self._extract_base_url(course_id, video_id) - - settings = self._download_xml( - f'{base_url}/content/content_{video_id}/videosettings.xml?v=1', - video_id, 'Downloading video settings XML') - - _, title = self._extract_chapter_and_title(xpath_text( - settings, './/Title', 'title', fatal=True)) - - formats = [] - - for sources in 
settings.findall('.//MediaSources'): - sources_type = sources.get('videoType') - for source in sources.findall('./MediaSource'): - video_url = source.text - if not video_url or not video_url.startswith('http'): - continue - if sources_type == 'smoothstreaming': - formats.extend(self._extract_ism_formats( - video_url, video_id, 'mss', fatal=False)) - continue - video_mode = source.get('videoMode') - height = int_or_none(self._search_regex( - r'^(\d+)[pP]$', video_mode or '', 'height', default=None)) - codec = source.get('codec') - acodec, vcodec = [None] * 2 - if codec: - codecs = codec.split(',') - if len(codecs) == 2: - acodec, vcodec = codecs - elif len(codecs) == 1: - vcodec = codecs[0] - formats.append({ - 'url': video_url, - 'format_id': video_mode, - 'height': height, - 'acodec': acodec, - 'vcodec': vcodec, - }) - - subtitles = {} - for source in settings.findall('.//MarkerResourceSource'): - subtitle_url = source.text - if not subtitle_url: - continue - subtitles.setdefault('en', []).append({ - 'url': f'{base_url}/{subtitle_url}', - 'ext': source.get('type'), - }) - - return { - 'id': video_id, - 'title': title, - 'subtitles': subtitles, - 'formats': formats, - } - - -class MicrosoftVirtualAcademyCourseIE(MicrosoftVirtualAcademyBaseIE): - IE_NAME = 'mva:course' - IE_DESC = 'Microsoft Virtual Academy courses' - _VALID_URL = rf'(?:{IE_NAME}:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/(?P<display_id>[^/?#&]+)-)(?P<id>\d+)' - - _TESTS = [{ - 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788', - 'info_dict': { - 'id': '11788', - 'title': 'Microsoft Azure Fundamentals: Virtual Machines', - }, - 'playlist_count': 36, - }, { - # with emphasized chapters - 'url': 'https://mva.microsoft.com/en-US/training-courses/developing-windows-10-games-with-construct-2-16335', - 'info_dict': { - 'id': '16335', - 'title': 'Developing Windows 10 Games with Construct 2', - }, - 
'playlist_count': 10, - }, { - 'url': 'https://www.microsoftvirtualacademy.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788', - 'only_matching': True, - }, { - 'url': 'mva:course:11788', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if MicrosoftVirtualAcademyIE.suitable(url) else super().suitable(url) - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - course_id = mobj.group('id') - display_id = mobj.group('display_id') - - base_url = self._extract_base_url(course_id, display_id) - - manifest = self._download_json( - f'{base_url}/imsmanifestlite.json', - display_id, 'Downloading course manifest JSON')['manifest'] - - organization = manifest['organizations']['organization'][0] - - entries = [] - for chapter in organization['item']: - chapter_number, chapter_title = self._extract_chapter_and_title(chapter.get('title')) - chapter_id = chapter.get('@identifier') - for item in chapter.get('item', []): - item_id = item.get('@identifier') - if not item_id: - continue - metadata = item.get('resource', {}).get('metadata') or {} - if metadata.get('learningresourcetype') != 'Video': - continue - _, title = self._extract_chapter_and_title(item.get('title')) - duration = parse_duration(metadata.get('duration')) - description = metadata.get('description') - entries.append({ - '_type': 'url_transparent', - 'url': smuggle_url( - f'mva:{course_id}:{item_id}', {'base_url': base_url}), - 'title': title, - 'description': description, - 'duration': duration, - 'chapter': chapter_title, - 'chapter_number': chapter_number, - 'chapter_id': chapter_id, - }) - - title = organization.get('title') or manifest.get('metadata', {}).get('title') - - return self.playlist_result(entries, course_id, title) diff --git a/yt_dlp/extractor/mit.py b/yt_dlp/extractor/mit.py index e75c540a23..66c3b07936 100644 --- a/yt_dlp/extractor/mit.py +++ b/yt_dlp/extractor/mit.py @@ -65,7 +65,7 @@ class 
TechTVMITIE(InfoExtractor): class OCWMITIE(InfoExtractor): IE_NAME = 'ocw.mit.edu' - _VALID_URL = r'^https?://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)' + _VALID_URL = r'https?://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)' _BASE_URL = 'http://ocw.mit.edu/' _TESTS = [ diff --git a/yt_dlp/extractor/mitele.py b/yt_dlp/extractor/mitele.py index ea29986729..3573a2a3fd 100644 --- a/yt_dlp/extractor/mitele.py +++ b/yt_dlp/extractor/mitele.py @@ -1,14 +1,13 @@ -from .telecinco import TelecincoIE +from .telecinco import TelecincoBaseIE from ..utils import ( int_or_none, parse_iso8601, ) -class MiTeleIE(TelecincoIE): # XXX: Do not subclass from concrete IE +class MiTeleIE(TelecincoBaseIE): IE_DESC = 'mitele.es' _VALID_URL = r'https?://(?:www\.)?mitele\.es/(?:[^/]+/)+(?P<id>[^/]+)/player' - _TESTS = [{ 'url': 'http://www.mitele.es/programas-tv/diario-de/57b0dfb9c715da65618b4afa/player', 'info_dict': { @@ -27,6 +26,7 @@ class MiTeleIE(TelecincoIE): # XXX: Do not subclass from concrete IE 'timestamp': 1471209401, 'upload_date': '20160814', }, + 'skip': 'HTTP Error 404 Not Found', }, { # no explicit title 'url': 'http://www.mitele.es/programas-tv/cuarto-milenio/57b0de3dc915da14058b4876/player', @@ -49,6 +49,26 @@ class MiTeleIE(TelecincoIE): # XXX: Do not subclass from concrete IE 'params': { 'skip_download': True, }, + 'skip': 'HTTP Error 404 Not Found', + }, { + 'url': 'https://www.mitele.es/programas-tv/horizonte/temporada-5/programa-171-40_013480051/player/', + 'info_dict': { + 'id': '7adbe22e-cd41-4787-afa4-36f3da7c2c6f', + 'ext': 'mp4', + 'title': 'Horizonte Temporada 5 Programa 171', + 'description': 'md5:97f1fb712c5ac27e5693a8b3c5c0c6e3', + 'episode': 'Las Zonas de Bajas Emisiones, a debate', + 'episode_number': 171, + 'season': 'Season 5', + 'season_number': 5, + 'series': 'Horizonte', + 'duration': 7012, + 'upload_date': '20240927', + 'timestamp': 1727416450, + 'thumbnail': 'https://album.mediaset.es/eimg/2024/09/27/horizonte-171_9f02.jpg', + 'age_limit': 12, + }, 
+ 'params': {'geo_bypass_country': 'ES'}, }, { 'url': 'http://www.mitele.es/series-online/la-que-se-avecina/57aac5c1c915da951a8b45ed/player', 'only_matching': True, diff --git a/yt_dlp/extractor/mlb.py b/yt_dlp/extractor/mlb.py index 8a693dc0be..935bf85615 100644 --- a/yt_dlp/extractor/mlb.py +++ b/yt_dlp/extractor/mlb.py @@ -1,17 +1,23 @@ +import json import re -import urllib.parse +import time import uuid from .common import InfoExtractor +from ..networking.exceptions import HTTPError from ..utils import ( + ExtractorError, determine_ext, int_or_none, join_nonempty, + jwt_decode_hs256, parse_duration, parse_iso8601, - traverse_obj, try_get, + url_or_none, + urlencode_postdata, ) +from ..utils.traversal import traverse_obj class MLBBaseIE(InfoExtractor): @@ -275,76 +281,225 @@ class MLBVideoIE(MLBBaseIE): class MLBTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?mlb\.com/tv/g(?P<id>\d{6})' _NETRC_MACHINE = 'mlb' - _TESTS = [{ 'url': 'https://www.mlb.com/tv/g661581/vee2eff5f-a7df-4c20-bdb4-7b926fa12638', 'info_dict': { 'id': '661581', 'ext': 'mp4', 'title': '2022-07-02 - St. 
Louis Cardinals @ Philadelphia Phillies', + 'release_date': '20220702', + 'release_timestamp': 1656792300, }, - 'params': { - 'skip_download': True, + 'params': {'skip_download': 'm3u8'}, + }, { + # makeup game: has multiple dates, need to avoid games with 'rescheduleDate' + 'url': 'https://www.mlb.com/tv/g747039/vd22541c4-5a29-45f7-822b-635ec041cf5e', + 'info_dict': { + 'id': '747039', + 'ext': 'mp4', + 'title': '2024-07-29 - Toronto Blue Jays @ Baltimore Orioles', + 'release_date': '20240729', + 'release_timestamp': 1722280200, }, + 'params': {'skip_download': 'm3u8'}, }] + _GRAPHQL_INIT_QUERY = '''\ +mutation initSession($device: InitSessionInput!, $clientType: ClientType!, $experience: ExperienceTypeInput) { + initSession(device: $device, clientType: $clientType, experience: $experience) { + deviceId + sessionId + entitlements { + code + } + location { + countryCode + regionName + zipCode + latitude + longitude + } + clientExperience + features + } + }''' + _GRAPHQL_PLAYBACK_QUERY = '''\ +mutation initPlaybackSession( + $adCapabilities: [AdExperienceType] + $mediaId: String! + $deviceId: String! + $sessionId: String! 
+ $quality: PlaybackQuality + ) { + initPlaybackSession( + adCapabilities: $adCapabilities + mediaId: $mediaId + deviceId: $deviceId + sessionId: $sessionId + quality: $quality + ) { + playbackSessionId + playback { + url + token + expiration + cdn + } + } + }''' + _APP_VERSION = '7.8.2' + _device_id = None + _session_id = None _access_token = None + _token_expiry = 0 + + @property + def _api_headers(self): + if (self._token_expiry - 120) <= time.time(): + self.write_debug('Access token has expired; re-logging in') + self._perform_login(*self._get_login_info()) + return {'Authorization': f'Bearer {self._access_token}'} def _real_initialize(self): if not self._access_token: self.raise_login_required( 'All videos are only available to registered users', method='password') - def _perform_login(self, username, password): - data = f'grant_type=password&username={urllib.parse.quote(username)}&password={urllib.parse.quote(password)}&scope=openid offline_access&client_id=0oa3e1nutA1HLzAKG356' - access_token = self._download_json( - 'https://ids.mlb.com/oauth2/aus1m088yK07noBfh356/v1/token', None, - headers={ - 'User-Agent': 'okhttp/3.12.1', - 'Content-Type': 'application/x-www-form-urlencoded', - }, data=data.encode())['access_token'] + def _set_device_id(self, username): + if not self._device_id: + self._device_id = self.cache.load( + self._NETRC_MACHINE, 'device_ids', default={}).get(username) + if self._device_id: + return + self._device_id = str(uuid.uuid4()) + self.cache.store(self._NETRC_MACHINE, 'device_ids', {username: self._device_id}) - entitlement = self._download_webpage( - f'https://media-entitlement.mlb.com/api/v3/jwt?os=Android&appname=AtBat&did={uuid.uuid4()}', None, - headers={ - 'User-Agent': 'okhttp/3.12.1', - 'Authorization': f'Bearer {access_token}', - }) + def _perform_login(self, username, password): + try: + self._access_token = self._download_json( + 'https://ids.mlb.com/oauth2/aus1m088yK07noBfh356/v1/token', None, + 'Logging in', 'Unable to log 
in', headers={ + 'User-Agent': 'okhttp/3.12.1', + 'Content-Type': 'application/x-www-form-urlencoded', + }, data=urlencode_postdata({ + 'grant_type': 'password', + 'username': username, + 'password': password, + 'scope': 'openid offline_access', + 'client_id': '0oa3e1nutA1HLzAKG356', + }))['access_token'] + except ExtractorError as error: + if isinstance(error.cause, HTTPError) and error.cause.status == 400: + raise ExtractorError('Invalid username or password', expected=True) + raise + + self._token_expiry = traverse_obj(self._access_token, ({jwt_decode_hs256}, 'exp', {int})) or 0 + self._set_device_id(username) + + self._session_id = self._call_api({ + 'operationName': 'initSession', + 'query': self._GRAPHQL_INIT_QUERY, + 'variables': { + 'device': { + 'appVersion': self._APP_VERSION, + 'deviceFamily': 'desktop', + 'knownDeviceId': self._device_id, + 'languagePreference': 'ENGLISH', + 'manufacturer': '', + 'model': '', + 'os': '', + 'osVersion': '', + }, + 'clientType': 'WEB', + }, + }, None, 'session ID')['data']['initSession']['sessionId'] - data = f'grant_type=urn:ietf:params:oauth:grant-type:token-exchange&subject_token={entitlement}&subject_token_type=urn:ietf:params:oauth:token-type:jwt&platform=android-tv' - self._access_token = self._download_json( - 'https://us.edge.bamgrid.com/token', None, + def _call_api(self, data, video_id, description='GraphQL JSON', fatal=True): + return self._download_json( + 'https://media-gateway.mlb.com/graphql', video_id, + f'Downloading {description}', f'Unable to download {description}', fatal=fatal, headers={ + **self._api_headers, 'Accept': 'application/json', - 'Authorization': 'Bearer bWxidHYmYW5kcm9pZCYxLjAuMA.6LZMbH2r--rbXcgEabaDdIslpo4RyZrlVfWZhsAgXIk', - 'Content-Type': 'application/x-www-form-urlencoded', - }, data=data.encode())['access_token'] + 'Content-Type': 'application/json', + 'x-client-name': 'WEB', + 'x-client-version': self._APP_VERSION, + }, data=json.dumps(data, separators=(',', ':')).encode()) + + def 
_extract_formats_and_subtitles(self, broadcast, video_id): + feed = traverse_obj(broadcast, ('homeAway', {str.title})) + medium = traverse_obj(broadcast, ('type', {str})) + language = traverse_obj(broadcast, ('language', {str.lower})) + format_id = join_nonempty(feed, medium, language) + + response = self._call_api({ + 'operationName': 'initPlaybackSession', + 'query': self._GRAPHQL_PLAYBACK_QUERY, + 'variables': { + 'adCapabilities': ['GOOGLE_STANDALONE_AD_PODS'], + 'deviceId': self._device_id, + 'mediaId': broadcast['mediaId'], + 'quality': 'PLACEHOLDER', + 'sessionId': self._session_id, + }, + }, video_id, f'{format_id} broadcast JSON', fatal=False) + + playback = traverse_obj(response, ('data', 'initPlaybackSession', 'playback', {dict})) + m3u8_url = traverse_obj(playback, ('url', {url_or_none})) + token = traverse_obj(playback, ('token', {str})) + + if not (m3u8_url and token): + errors = '; '.join(traverse_obj(response, ('errors', ..., 'message', {str}))) + if 'not entitled' in errors: + raise ExtractorError(errors, expected=True) + elif errors: # Only warn when 'blacked out' since radio formats are available + self.report_warning(f'API returned errors for {format_id}: {errors}') + else: + self.report_warning(f'No formats available for {format_id} broadcast; skipping') + return [], {} + + cdn_headers = {'x-cdn-token': token} + fmts, subs = self._extract_m3u8_formats_and_subtitles( + m3u8_url.replace(f'/{token}/', '/'), video_id, 'mp4', + m3u8_id=format_id, fatal=False, headers=cdn_headers) + for fmt in fmts: + fmt['http_headers'] = cdn_headers + fmt.setdefault('format_note', join_nonempty(feed, medium, delim=' ')) + fmt.setdefault('language', language) + if fmt.get('vcodec') == 'none' and fmt['language'] == 'en': + fmt['source_preference'] = 10 + + return fmts, subs def _real_extract(self, url): video_id = self._match_id(url) - airings = self._download_json( - 
f'https://search-api-mlbtv.mlb.com/svc/search/v2/graphql/persisted/query/core/Airings?variables=%7B%22partnerProgramIds%22%3A%5B%22{video_id}%22%5D%2C%22applyEsniMediaRightsLabels%22%3Atrue%7D', - video_id)['data']['Airings'] + data = self._download_json( + 'https://statsapi.mlb.com/api/v1/schedule', video_id, query={ + 'gamePk': video_id, + 'hydrate': 'broadcasts(all),statusFlags', + }) + metadata = traverse_obj(data, ( + 'dates', ..., 'games', + lambda _, v: str(v['gamePk']) == video_id and not v.get('rescheduleDate'), any)) + + broadcasts = traverse_obj(metadata, ( + 'broadcasts', lambda _, v: v['mediaId'] and v['mediaState']['mediaStateCode'] != 'MEDIA_OFF')) formats, subtitles = [], {} - for airing in airings: - m3u8_url = self._download_json( - airing['playbackUrls'][0]['href'].format(scenario='browser~csai'), video_id, - headers={ - 'Authorization': self._access_token, - 'Accept': 'application/vnd.media-service+json; version=2', - })['stream']['complete'] - f, s = self._extract_m3u8_formats_and_subtitles( - m3u8_url, video_id, 'mp4', m3u8_id=join_nonempty(airing.get('feedType'), airing.get('feedLanguage'))) - formats.extend(f) - self._merge_subtitles(s, target=subtitles) + for broadcast in broadcasts: + fmts, subs = self._extract_formats_and_subtitles(broadcast, video_id) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) return { 'id': video_id, - 'title': traverse_obj(airings, (..., 'titles', 0, 'episodeName'), get_all=False), - 'is_live': traverse_obj(airings, (..., 'mediaConfig', 'productType'), get_all=False) == 'LIVE', + 'title': join_nonempty( + traverse_obj(metadata, ('officialDate', {str})), + traverse_obj(metadata, ('teams', ('away', 'home'), 'team', 'name', {str}, all, {' @ '.join})), + delim=' - '), + 'is_live': traverse_obj(broadcasts, (..., 'mediaState', 'mediaStateCode', {str}, any)) == 'MEDIA_ON', + 'release_timestamp': traverse_obj(metadata, ('gameDate', {parse_iso8601})), 'formats': formats, 'subtitles': subtitles, - 
'http_headers': {'Authorization': f'Bearer {self._access_token}'}, } diff --git a/yt_dlp/extractor/mojevideo.py b/yt_dlp/extractor/mojevideo.py new file mode 100644 index 0000000000..145e306970 --- /dev/null +++ b/yt_dlp/extractor/mojevideo.py @@ -0,0 +1,121 @@ +from .common import InfoExtractor +from ..utils import js_to_json, remove_end, update_url_query + + +class MojevideoIE(InfoExtractor): + IE_DESC = 'mojevideo.sk' + _VALID_URL = r'https?://(?:www\.)?mojevideo\.sk/video/(?P<id>\w+)/(?P<display_id>[\w()]+?)\.html' + + _TESTS = [{ + 'url': 'https://www.mojevideo.sk/video/3d17c/chlapci_dobetonovali_sme_mame_hotovo.html', + 'md5': '384a4628bd2bbd261c5206cf77c38c17', + 'info_dict': { + 'id': '3d17c', + 'ext': 'mp4', + 'title': 'Chlapci dobetónovali sme, máme hotovo!', + 'display_id': 'chlapci_dobetonovali_sme_mame_hotovo', + 'description': 'md5:a0822126044050d304a9ef58c92ddb34', + 'thumbnail': 'https://fs5.mojevideo.sk/imgfb/250236.jpg', + 'duration': 21.0, + 'upload_date': '20230919', + 'timestamp': 1695129706, + 'like_count': int, + 'dislike_count': int, + 'view_count': int, + 'comment_count': int, + }, + }, { + # 720p + 'url': 'https://www.mojevideo.sk/video/14677/den_blbec.html', + 'md5': '517c3e111c53a67d10b429c1f344ba2f', + 'info_dict': { + 'id': '14677', + 'ext': 'mp4', + 'title': 'Deň blbec?', + 'display_id': 'den_blbec', + 'description': 'I maličkosť vám môže zmeniť celý deň. 
Nikdy nezahadzujte žuvačky na zem!', + 'thumbnail': 'https://fs5.mojevideo.sk/imgfb/83575.jpg', + 'duration': 100.0, + 'upload_date': '20120515', + 'timestamp': 1337076481, + 'like_count': int, + 'dislike_count': int, + 'view_count': int, + 'comment_count': int, + }, + }, { + # 1080p + 'url': 'https://www.mojevideo.sk/video/2feb2/band_maid_onset_(instrumental)_live_zepp_tokyo_(full_hd).html', + 'md5': '64599a23d3ac31cf2fe069e4353d8162', + 'info_dict': { + 'id': '2feb2', + 'ext': 'mp4', + 'title': 'BAND-MAID - onset (Instrumental) Live - Zepp Tokyo (Full HD)', + 'display_id': 'band_maid_onset_(instrumental)_live_zepp_tokyo_(full_hd)', + 'description': 'Výborná inštrumentálna skladba od skupiny BAND-MAID.', + 'thumbnail': 'https://fs5.mojevideo.sk/imgfb/196274.jpg', + 'duration': 240.0, + 'upload_date': '20190708', + 'timestamp': 1562576592, + 'like_count': int, + 'dislike_count': int, + 'view_count': int, + 'comment_count': int, + }, + }, { + # 720p + 'url': 'https://www.mojevideo.sk/video/358c8/dva_nissany_skyline_strielaju_v_londyne.html', + 'only_matching': True, + }, { + # 720p + 'url': 'https://www.mojevideo.sk/video/2455d/gopro_hero4_session_nova_sportova_vodotesna_kamera.html', + 'only_matching': True, + }, { + # 1080p + 'url': 'https://www.mojevideo.sk/video/352ee/amd_rx_6800_xt_vs_nvidia_rtx_3080_(test_v_9_hrach).html', + 'only_matching': True, + }, { + # 1080p + 'url': 'https://www.mojevideo.sk/video/2cbeb/trailer_z_avengers_infinity_war.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id, display_id = self._match_valid_url(url).groups() + webpage = self._download_webpage(url, video_id) + + video_id_dec = self._search_regex( + r'\bvId\s*=\s*(\d+)', webpage, 'video id', fatal=False) or str(int(video_id, 16)) + video_exp = self._search_regex(r'\bvEx\s*=\s*["\'](\d+)', webpage, 'video expiry') + video_hashes = self._search_json( + r'\bvHash\s*=', webpage, 'video hashes', video_id, + contains_pattern=r'\[(?s:.+)\]', 
transform_source=js_to_json) + + formats = [] + for video_hash, (suffix, quality, format_note) in zip(video_hashes, [ + ('', 1, 'normálna kvalita'), + ('_lq', 0, 'nízka kvalita'), + ('_hd', 2, 'HD-720p'), + ('_fhd', 3, 'FULL HD-1080p'), + ('_2k', 4, '2K-1440p'), + ]): + formats.append({ + 'format_id': f'mp4-{quality}', + 'quality': quality, + 'format_note': format_note, + 'url': update_url_query( + f'https://cache01.mojevideo.sk/securevideos69/{video_id_dec}{suffix}.mp4', { + 'md5': video_hash, + 'expires': video_exp, + }), + }) + + return { + 'id': video_id, + 'display_id': display_id, + 'formats': formats, + 'title': (self._og_search_title(webpage, default=None) + or remove_end(self._html_extract_title(webpage, 'title'), ' - Mojevideo')), + 'description': self._og_search_description(webpage), + **self._search_json_ld(webpage, video_id, default={}), + } diff --git a/yt_dlp/extractor/murrtube.py b/yt_dlp/extractor/murrtube.py index 3b39a1b9ad..9067b8781e 100644 --- a/yt_dlp/extractor/murrtube.py +++ b/yt_dlp/extractor/murrtube.py @@ -5,114 +5,111 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, OnDemandPagedList, - determine_ext, - int_or_none, - try_get, + clean_html, + extract_attributes, + get_element_by_class, + get_element_html_by_id, + parse_count, + remove_end, + update_url, + urlencode_postdata, ) class MurrtubeIE(InfoExtractor): - _WORKING = False _VALID_URL = r'''(?x) (?: murrtube:| - https?://murrtube\.net/videos/(?P<slug>[a-z0-9\-]+)\- + https?://murrtube\.net/(?:v/|videos/(?P<slug>[a-z0-9-]+?)-) ) - (?P<id>[a-f0-9]{8}\-[a-f0-9]{4}\-[a-f0-9]{4}\-[a-f0-9]{4}\-[a-f0-9]{12}) + (?P<id>[A-Z0-9]{4}|[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}) ''' - _TEST = { + _TESTS = [{ 'url': 'https://murrtube.net/videos/inferno-x-skyler-148b6f2a-fdcc-4902-affe-9c0f41aaaca0', - 'md5': '169f494812d9a90914b42978e73aa690', + 'md5': '70380878a77e8565d4aea7f68b8bbb35', 'info_dict': { - 'id': '148b6f2a-fdcc-4902-affe-9c0f41aaaca0', + 
'id': 'ca885d8456b95de529b6723b158032e11115d', 'ext': 'mp4', 'title': 'Inferno X Skyler', 'description': 'Humping a very good slutty sheppy (roomate)', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 284, 'uploader': 'Inferno Wolf', 'age_limit': 18, + 'thumbnail': 'https://storage.murrtube.net/murrtube-production/ekbs3zcfvuynnqfx72nn2tkokvsd', + 'comment_count': int, + 'view_count': int, + 'like_count': int, + }, + }, { + 'url': 'https://murrtube.net/v/0J2Q', + 'md5': '31262f6ac56f0ca75e5a54a0f3fefcb6', + 'info_dict': { + 'id': '8442998c52134968d9caa36e473e1a6bac6ca', + 'ext': 'mp4', + 'uploader': 'Hayel', + 'title': 'Who\'s in charge now?', + 'description': 'md5:795791e97e5b0f1805ea84573f02a997', + 'age_limit': 18, + 'thumbnail': 'https://storage.murrtube.net/murrtube-production/fb1ojjwiucufp34ya6hxu5vfqi5s', 'comment_count': int, 'view_count': int, 'like_count': int, - 'tags': ['hump', 'breed', 'Fursuit', 'murrsuit', 'bareback'], }, - } + }] - def _download_gql(self, video_id, op, note=None, fatal=True): - result = self._download_json( - 'https://murrtube.net/graphql', - video_id, note, data=json.dumps(op).encode(), fatal=fatal, - headers={'Content-Type': 'application/json'}) - return result['data'] + def _extract_count(self, name, html): + return parse_count(self._search_regex( + rf'([\d,]+)\s+<span[^>]*>{name}</span>', html, name, default=None)) + + def _real_initialize(self): + homepage = self._download_webpage( + 'https://murrtube.net', None, note='Getting session token') + self._request_webpage( + 'https://murrtube.net/accept_age_check', None, 'Setting age cookie', + data=urlencode_postdata(self._hidden_inputs(homepage))) def _real_extract(self, url): video_id = self._match_id(url) - data = self._download_gql(video_id, { - 'operationName': 'Medium', - 'variables': { - 'id': video_id, - }, - 'query': '''\ -query Medium($id: ID!) 
{ - medium(id: $id) { - title - description - key - duration - commentsCount - likesCount - viewsCount - thumbnailKey - tagList - user { - name - __typename - } - __typename - } -}'''}) - meta = data['medium'] - - storage_url = 'https://storage.murrtube.net/murrtube/' - format_url = storage_url + meta.get('key', '') - thumbnail = storage_url + meta.get('thumbnailKey', '') - - if determine_ext(format_url) == 'm3u8': - formats = self._extract_m3u8_formats( - format_url, video_id, 'mp4', entry_protocol='m3u8_native', fatal=False) - else: - formats = [{'url': format_url}] + if video_id.startswith('murrtube:'): + raise ExtractorError('Support for murrtube: prefix URLs is broken') + video_page = self._download_webpage(url, video_id) + video_attrs = extract_attributes(get_element_html_by_id('video', video_page)) + playlist = update_url(video_attrs['data-url'], query=None) + video_id = self._search_regex(r'/([\da-f]+)/index.m3u8', playlist, 'video id') return { 'id': video_id, - 'title': meta.get('title'), - 'description': meta.get('description'), - 'formats': formats, - 'thumbnail': thumbnail, - 'duration': int_or_none(meta.get('duration')), - 'uploader': try_get(meta, lambda x: x['user']['name']), - 'view_count': meta.get('viewsCount'), - 'like_count': meta.get('likesCount'), - 'comment_count': meta.get('commentsCount'), - 'tags': meta.get('tagList'), + 'title': remove_end(self._og_search_title(video_page), ' - Murrtube'), 'age_limit': 18, + 'formats': self._extract_m3u8_formats(playlist, video_id, 'mp4'), + 'description': self._og_search_description(video_page), + 'thumbnail': update_url(self._og_search_thumbnail(video_page, default=''), query=None) or None, + 'uploader': clean_html(get_element_by_class('pl-1 is-size-6 has-text-lighter', video_page)), + 'view_count': self._extract_count('Views', video_page), + 'like_count': self._extract_count('Likes', video_page), + 'comment_count': self._extract_count('Comments', video_page), } -class MurrtubeUserIE(MurrtubeIE): # 
XXX: Do not subclass from concrete IE +class MurrtubeUserIE(InfoExtractor): _WORKING = False IE_DESC = 'Murrtube user profile' _VALID_URL = r'https?://murrtube\.net/(?P<id>[^/]+)$' - _TEST = { + _TESTS = [{ 'url': 'https://murrtube.net/stormy', 'info_dict': { 'id': 'stormy', }, 'playlist_mincount': 27, - } + }] _PAGE_SIZE = 10 + def _download_gql(self, video_id, op, note=None, fatal=True): + result = self._download_json( + 'https://murrtube.net/graphql', + video_id, note, data=json.dumps(op).encode(), fatal=fatal, + headers={'Content-Type': 'application/json'}) + return result['data'] + def _fetch_page(self, username, user_id, page): data = self._download_gql(username, { 'operationName': 'Media', diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py index 1da2cad3d4..8f6fb22b17 100644 --- a/yt_dlp/extractor/nbc.py +++ b/yt_dlp/extractor/nbc.py @@ -16,6 +16,7 @@ from ..utils import ( determine_ext, float_or_none, int_or_none, + join_nonempty, mimetype2ext, parse_age_limit, parse_duration, @@ -498,10 +499,8 @@ class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE m3u8_id=format_id, fatal=False)) continue tbr = int_or_none(va.get('bitrate'), 1000) - if tbr: - format_id += f'-{tbr}' formats.append({ - 'format_id': format_id, + 'format_id': join_nonempty(format_id, tbr), 'url': public_url, 'width': int_or_none(va.get('width')), 'height': int_or_none(va.get('height')), diff --git a/yt_dlp/extractor/neteasemusic.py b/yt_dlp/extractor/neteasemusic.py index dd50efe51a..a759da2147 100644 --- a/yt_dlp/extractor/neteasemusic.py +++ b/yt_dlp/extractor/neteasemusic.py @@ -22,12 +22,22 @@ from ..utils import ( class NetEaseMusicBaseIE(InfoExtractor): - _FORMATS = ['bMusic', 'mMusic', 'hMusic'] + # XXX: _extract_formats logic depends on the order of the levels in each tier + _LEVELS = ( + 'standard', # free tier; 标准; 128kbps mp3 or aac + 'higher', # free tier; 192kbps mp3 or aac + 'exhigh', # free tier; 极高 (HQ); 320kbps mp3 or aac + 'lossless', # VIP 
tier; 无损 (SQ); 48kHz/16bit flac + 'hires', # VIP tier; 高解析度无损 (Hi-Res); 192kHz/24bit flac + 'jyeffect', # VIP tier; 高清臻音 (Spatial Audio); 96kHz/24bit flac + 'jymaster', # SVIP tier; 超清母带 (Master); 192kHz/24bit flac + 'sky', # SVIP tier; 沉浸环绕声 (Surround Audio); flac + ) _API_BASE = 'http://music.163.com/api/' _GEO_BYPASS = False @staticmethod - def kilo_or_none(value): + def _kilo_or_none(value): return int_or_none(value, scale=1000) def _create_eapi_cipher(self, api_path, query_body, cookies): @@ -66,45 +76,43 @@ class NetEaseMusicBaseIE(InfoExtractor): **headers, }, **kwargs) - def _call_player_api(self, song_id, bitrate): + def _call_player_api(self, song_id, level): return self._download_eapi_json( - '/song/enhance/player/url', song_id, {'ids': f'[{song_id}]', 'br': bitrate}, - note=f'Downloading song URL info: bitrate {bitrate}') + '/song/enhance/player/url/v1', song_id, + {'ids': f'[{song_id}]', 'level': level, 'encodeType': 'flac'}, + note=f'Downloading song URL info: level {level}') - def extract_formats(self, info): - err = 0 + def _extract_formats(self, info): formats = [] song_id = info['id'] - for song_format in self._FORMATS: - details = info.get(song_format) - if not details: + for level in self._LEVELS: + song = traverse_obj( + self._call_player_api(song_id, level), ('data', lambda _, v: url_or_none(v['url']), any)) + if not song: + break # Media is not available due to removal or geo-restriction + actual_level = song.get('level') + if actual_level and actual_level != level: + if level in ('lossless', 'jymaster'): + break # We've already extracted the highest level of the user's account tier continue - bitrate = int_or_none(details.get('bitrate')) or 999000 - for song in traverse_obj(self._call_player_api(song_id, bitrate), ('data', lambda _, v: url_or_none(v['url']))): - song_url = song['url'] - if self._is_valid_url(song_url, info['id'], 'song'): - formats.append({ - 'url': song_url, - 'format_id': song_format, - 'asr': traverse_obj(details, ('sr', 
{int_or_none})), - **traverse_obj(song, { - 'ext': ('type', {str}), - 'abr': ('br', {self.kilo_or_none}), - 'filesize': ('size', {int_or_none}), - }), - }) - elif err == 0: - err = traverse_obj(song, ('code', {int})) or 0 - + formats.append({ + 'url': song['url'], + 'format_id': level, + 'vcodec': 'none', + **traverse_obj(song, { + 'ext': ('type', {str}), + 'abr': ('br', {self._kilo_or_none}), + 'filesize': ('size', {int_or_none}), + }), + }) + if not actual_level: + break # Only 1 level is available if API does not return a value (netease:program) if not formats: - if err != 0 and (err < 200 or err >= 400): - raise ExtractorError(f'No media links found (site code {err})', expected=True) - else: - self.raise_geo_restricted( - 'No media links found: probably due to geo restriction.', countries=['CN']) + self.raise_geo_restricted( + 'No media links found; possibly due to geo restriction', countries=['CN']) return formats - def query_api(self, endpoint, video_id, note): + def _query_api(self, endpoint, video_id, note): result = self._download_json( f'{self._API_BASE}{endpoint}', video_id, note, headers={'Referer': self._API_BASE}) code = traverse_obj(result, ('code', {int})) @@ -128,32 +136,29 @@ class NetEaseMusicBaseIE(InfoExtractor): class NetEaseMusicIE(NetEaseMusicBaseIE): IE_NAME = 'netease:song' IE_DESC = '网易云音乐' - _VALID_URL = r'https?://(y\.)?music\.163\.com/(?:[#m]/)?song\?.*?\bid=(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:y\.)?music\.163\.com/(?:[#m]/)?song\?.*?\bid=(?P<id>[0-9]+)' _TESTS = [{ - 'url': 'https://music.163.com/#/song?id=548648087', + 'url': 'https://music.163.com/#/song?id=550136151', 'info_dict': { - 'id': '548648087', + 'id': '550136151', 'ext': 'mp3', - 'title': '戒烟 (Live)', - 'creator': '李荣浩 / 朱正廷 / 陈立农 / 尤长靖 / ONER灵超 / ONER木子洋 / 杨非同 / 陆定昊', + 'title': 'It\'s Ok (Live)', + 'creators': 'count:10', 'timestamp': 1522944000, 'upload_date': '20180405', - 'description': 'md5:3650af9ee22c87e8637cb2dde22a765c', - 'subtitles': {'lyrics': 
[{'ext': 'lrc'}]}, - 'duration': 256, + 'description': 'md5:9fd07059c2ccee3950dc8363429a3135', + 'duration': 197, 'thumbnail': r're:^http.*\.jpg', 'album': '偶像练习生 表演曲目合集', 'average_rating': int, - 'album_artist': '偶像练习生', + 'album_artists': ['偶像练习生'], }, }, { - 'note': 'No lyrics.', 'url': 'http://music.163.com/song?id=17241424', 'info_dict': { 'id': '17241424', 'ext': 'mp3', 'title': 'Opus 28', - 'creator': 'Dustin O\'Halloran', 'upload_date': '20080211', 'timestamp': 1202745600, 'duration': 263, @@ -161,15 +166,18 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'album': 'Piano Solos Vol. 2', 'album_artist': 'Dustin O\'Halloran', 'average_rating': int, + 'description': '[00:05.00]纯音乐,请欣赏\n', + 'album_artists': ['Dustin O\'Halloran'], + 'creators': ['Dustin O\'Halloran'], + 'subtitles': {'lyrics': [{'ext': 'lrc'}]}, }, }, { 'url': 'https://y.music.163.com/m/song?app_version=8.8.45&id=95670&uct2=sKnvS4+0YStsWkqsPhFijw%3D%3D&dlt=0846', - 'md5': '95826c73ea50b1c288b22180ec9e754d', + 'md5': 'b896be78d8d34bd7bb665b26710913ff', 'info_dict': { 'id': '95670', 'ext': 'mp3', 'title': '国际歌', - 'creator': '马备', 'upload_date': '19911130', 'timestamp': 691516800, 'description': 'md5:1ba2f911a2b0aa398479f595224f2141', @@ -180,6 +188,8 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'average_rating': int, 'album': '红色摇滚', 'album_artist': '侯牧人', + 'creators': ['马备'], + 'album_artists': ['侯牧人'], }, }, { 'url': 'http://music.163.com/#/song?id=32102397', @@ -188,7 +198,7 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'id': '32102397', 'ext': 'mp3', 'title': 'Bad Blood', - 'creator': 'Taylor Swift / Kendrick Lamar', + 'creators': ['Taylor Swift', 'Kendrick Lamar'], 'upload_date': '20150516', 'timestamp': 1431792000, 'description': 'md5:21535156efb73d6d1c355f95616e285a', @@ -207,7 +217,7 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'id': '22735043', 'ext': 'mp3', 'title': '소원을 말해봐 (Genie)', - 'creator': '少女时代', + 'creators': ['少女时代'], 'upload_date': '20100127', 'timestamp': 1264608000, 
'description': 'md5:03d1ffebec3139aa4bafe302369269c5', @@ -251,12 +261,12 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): def _real_extract(self, url): song_id = self._match_id(url) - info = self.query_api( + info = self._query_api( f'song/detail?id={song_id}&ids=%5B{song_id}%5D', song_id, 'Downloading song info')['songs'][0] - formats = self.extract_formats(info) + formats = self._extract_formats(info) - lyrics = self._process_lyrics(self.query_api( + lyrics = self._process_lyrics(self._query_api( f'song/lyric?id={song_id}&lv=-1&tv=-1', song_id, 'Downloading lyrics data')) lyric_data = { 'description': traverse_obj(lyrics, (('lyrics_merged', 'lyrics'), 0, 'data'), get_all=False), @@ -267,14 +277,14 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'id': song_id, 'formats': formats, 'alt_title': '/'.join(traverse_obj(info, (('transNames', 'alias'), ...))) or None, - 'creator': ' / '.join(traverse_obj(info, ('artists', ..., 'name'))) or None, - 'album_artist': ' / '.join(traverse_obj(info, ('album', 'artists', ..., 'name'))) or None, + 'creators': traverse_obj(info, ('artists', ..., 'name')) or None, + 'album_artists': traverse_obj(info, ('album', 'artists', ..., 'name')) or None, **lyric_data, **traverse_obj(info, { 'title': ('name', {str}), - 'timestamp': ('album', 'publishTime', {self.kilo_or_none}), + 'timestamp': ('album', 'publishTime', {self._kilo_or_none}), 'thumbnail': ('album', 'picUrl', {url_or_none}), - 'duration': ('duration', {self.kilo_or_none}), + 'duration': ('duration', {self._kilo_or_none}), 'album': ('album', 'name', {str}), 'average_rating': ('score', {int_or_none}), }), @@ -284,7 +294,7 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): class NetEaseMusicAlbumIE(NetEaseMusicBaseIE): IE_NAME = 'netease:album' IE_DESC = '网易云音乐 - 专辑' - _VALID_URL = r'https?://music\.163\.com/(#/)?album\?id=(?P<id>[0-9]+)' + _VALID_URL = r'https?://music\.163\.com/(?:#/)?album\?id=(?P<id>[0-9]+)' _TESTS = [{ 'url': 'https://music.163.com/#/album?id=133153666', 'info_dict': { @@ 
-294,7 +304,7 @@ class NetEaseMusicAlbumIE(NetEaseMusicBaseIE): 'description': '桃几2021年翻唱合集', 'thumbnail': r're:^http.*\.jpg', }, - 'playlist_mincount': 13, + 'playlist_mincount': 12, }, { 'url': 'http://music.163.com/#/album?id=220780', 'info_dict': { @@ -328,7 +338,7 @@ class NetEaseMusicAlbumIE(NetEaseMusicBaseIE): class NetEaseMusicSingerIE(NetEaseMusicBaseIE): IE_NAME = 'netease:singer' IE_DESC = '网易云音乐 - 歌手' - _VALID_URL = r'https?://music\.163\.com/(#/)?artist\?id=(?P<id>[0-9]+)' + _VALID_URL = r'https?://music\.163\.com/(?:#/)?artist\?id=(?P<id>[0-9]+)' _TESTS = [{ 'note': 'Singer has aliases.', 'url': 'http://music.163.com/#/artist?id=10559', @@ -358,7 +368,7 @@ class NetEaseMusicSingerIE(NetEaseMusicBaseIE): def _real_extract(self, url): singer_id = self._match_id(url) - info = self.query_api( + info = self._query_api( f'artist/{singer_id}?id={singer_id}', singer_id, note='Downloading singer data') name = join_nonempty( @@ -372,7 +382,7 @@ class NetEaseMusicSingerIE(NetEaseMusicBaseIE): class NetEaseMusicListIE(NetEaseMusicBaseIE): IE_NAME = 'netease:playlist' IE_DESC = '网易云音乐 - 歌单' - _VALID_URL = r'https?://music\.163\.com/(#/)?(playlist|discover/toplist)\?id=(?P<id>[0-9]+)' + _VALID_URL = r'https?://music\.163\.com/(?:#/)?(?:playlist|discover/toplist)\?id=(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://music.163.com/#/playlist?id=79177352', 'info_dict': { @@ -405,11 +415,15 @@ class NetEaseMusicListIE(NetEaseMusicBaseIE): 'url': 'http://music.163.com/#/discover/toplist?id=3733003', 'info_dict': { 'id': '3733003', - 'title': 're:韩国Melon排行榜周榜 [0-9]{4}-[0-9]{2}-[0-9]{2}', + 'title': 're:韩国Melon排行榜周榜(?: [0-9]{4}-[0-9]{2}-[0-9]{2})?', 'description': 'md5:73ec782a612711cadc7872d9c1e134fc', + 'upload_date': '20200109', + 'uploader_id': '2937386', + 'tags': ['韩语', '榜单'], + 'uploader': 'Melon榜单', + 'timestamp': 1578569373, }, 'playlist_count': 50, - 'skip': 'Blocked outside Mainland China', }] def _real_extract(self, url): @@ -426,7 +440,7 @@ class 
NetEaseMusicListIE(NetEaseMusicBaseIE): 'tags': ('tags', ..., {str}), 'uploader': ('creator', 'nickname', {str}), 'uploader_id': ('creator', 'userId', {str_or_none}), - 'timestamp': ('updateTime', {self.kilo_or_none}), + 'timestamp': ('updateTime', {self._kilo_or_none}), })) if traverse_obj(info, ('playlist', 'specialType')) == 10: metainfo['title'] = f'{metainfo.get("title")} {strftime_or_none(metainfo.get("timestamp"), "%Y-%m-%d")}' @@ -437,7 +451,7 @@ class NetEaseMusicListIE(NetEaseMusicBaseIE): class NetEaseMusicMvIE(NetEaseMusicBaseIE): IE_NAME = 'netease:mv' IE_DESC = '网易云音乐 - MV' - _VALID_URL = r'https?://music\.163\.com/(#/)?mv\?id=(?P<id>[0-9]+)' + _VALID_URL = r'https?://music\.163\.com/(?:#/)?mv\?id=(?P<id>[0-9]+)' _TESTS = [{ 'url': 'https://music.163.com/#/mv?id=10958064', 'info_dict': { @@ -445,7 +459,7 @@ class NetEaseMusicMvIE(NetEaseMusicBaseIE): 'ext': 'mp4', 'title': '交换余生', 'description': 'md5:e845872cff28820642a2b02eda428fea', - 'creator': '林俊杰', + 'creators': ['林俊杰'], 'upload_date': '20200916', 'thumbnail': r're:http.*\.jpg', 'duration': 364, @@ -460,7 +474,7 @@ class NetEaseMusicMvIE(NetEaseMusicBaseIE): 'ext': 'mp4', 'title': '이럴거면 그러지말지', 'description': '白雅言自作曲唱甜蜜爱情', - 'creator': '白娥娟', + 'creators': ['白娥娟'], 'upload_date': '20150520', 'thumbnail': r're:http.*\.jpg', 'duration': 216, @@ -468,12 +482,28 @@ class NetEaseMusicMvIE(NetEaseMusicBaseIE): 'like_count': int, 'comment_count': int, }, + 'skip': 'Blocked outside Mainland China', + }, { + 'note': 'This MV has multiple creators.', + 'url': 'https://music.163.com/#/mv?id=22593543', + 'info_dict': { + 'id': '22593543', + 'ext': 'mp4', + 'title': '老北京杀器', + 'creators': ['秃子2z', '辉子', 'Saber梁维嘉'], + 'duration': 206, + 'upload_date': '20240618', + 'like_count': int, + 'comment_count': int, + 'thumbnail': r're:http.*\.jpg', + 'view_count': int, + }, }] def _real_extract(self, url): mv_id = self._match_id(url) - info = self.query_api( + info = self._query_api( 
f'mv/detail?id={mv_id}&type=mp4', mv_id, 'Downloading mv info')['data'] formats = [ @@ -484,13 +514,13 @@ class NetEaseMusicMvIE(NetEaseMusicBaseIE): return { 'id': mv_id, 'formats': formats, + 'creators': traverse_obj(info, ('artists', ..., 'name')) or [info.get('artistName')], **traverse_obj(info, { 'title': ('name', {str}), 'description': (('desc', 'briefDesc'), {str}, {lambda x: x or None}), - 'creator': ('artistName', {str}), 'upload_date': ('publishTime', {unified_strdate}), 'thumbnail': ('cover', {url_or_none}), - 'duration': ('duration', {self.kilo_or_none}), + 'duration': ('duration', {self._kilo_or_none}), 'view_count': ('playCount', {int_or_none}), 'like_count': ('likeCount', {int_or_none}), 'comment_count': ('commentCount', {int_or_none}), @@ -501,7 +531,7 @@ class NetEaseMusicMvIE(NetEaseMusicBaseIE): class NetEaseMusicProgramIE(NetEaseMusicBaseIE): IE_NAME = 'netease:program' IE_DESC = '网易云音乐 - 电台节目' - _VALID_URL = r'https?://music\.163\.com/(#/?)program\?id=(?P<id>[0-9]+)' + _VALID_URL = r'https?://music\.163\.com/(?:#/)?program\?id=(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://music.163.com/#/program?id=10109055', 'info_dict': { @@ -509,7 +539,7 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE): 'ext': 'mp3', 'title': '不丹足球背后的故事', 'description': '喜马拉雅人的足球梦 ...', - 'creator': '大话西藏', + 'creators': ['大话西藏'], 'timestamp': 1434179287, 'upload_date': '20150613', 'thumbnail': r're:http.*\.jpg', @@ -522,7 +552,7 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE): 'id': '10141022', 'title': '滚滚电台的有声节目', 'description': 'md5:8d594db46cc3e6509107ede70a4aaa3b', - 'creator': '滚滚电台ORZ', + 'creators': ['滚滚电台ORZ'], 'timestamp': 1434450733, 'upload_date': '20150616', 'thumbnail': r're:http.*\.jpg', @@ -536,7 +566,7 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE): 'ext': 'mp3', 'title': '滚滚电台的有声节目', 'description': 'md5:8d594db46cc3e6509107ede70a4aaa3b', - 'creator': '滚滚电台ORZ', + 'creators': ['滚滚电台ORZ'], 'timestamp': 1434450733, 'upload_date': '20150616', 
'thumbnail': r're:http.*\.jpg', @@ -550,7 +580,7 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE): def _real_extract(self, url): program_id = self._match_id(url) - info = self.query_api( + info = self._query_api( f'dj/program/detail?id={program_id}', program_id, note='Downloading program info')['program'] metainfo = traverse_obj(info, { @@ -558,17 +588,17 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE): 'description': ('description', {str}), 'creator': ('dj', 'brand', {str}), 'thumbnail': ('coverUrl', {url_or_none}), - 'timestamp': ('createTime', {self.kilo_or_none}), + 'timestamp': ('createTime', {self._kilo_or_none}), }) if not self._yes_playlist( info['songs'] and program_id, info['mainSong']['id'], playlist_label='program', video_label='song'): - formats = self.extract_formats(info['mainSong']) + formats = self._extract_formats(info['mainSong']) return { 'id': str(info['mainSong']['id']), 'formats': formats, - 'duration': traverse_obj(info, ('mainSong', 'duration', {self.kilo_or_none})), + 'duration': traverse_obj(info, ('mainSong', 'duration', {self._kilo_or_none})), **metainfo, } @@ -579,7 +609,7 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE): class NetEaseMusicDjRadioIE(NetEaseMusicBaseIE): IE_NAME = 'netease:djradio' IE_DESC = '网易云音乐 - 电台' - _VALID_URL = r'https?://music\.163\.com/(#/)?djradio\?id=(?P<id>[0-9]+)' + _VALID_URL = r'https?://music\.163\.com/(?:#/)?djradio\?id=(?P<id>[0-9]+)' _TEST = { 'url': 'http://music.163.com/#/djradio?id=42', 'info_dict': { @@ -597,7 +627,7 @@ class NetEaseMusicDjRadioIE(NetEaseMusicBaseIE): metainfo = {} entries = [] for offset in itertools.count(start=0, step=self._PAGE_SIZE): - info = self.query_api( + info = self._query_api( f'dj/program/byradio?asc=false&limit={self._PAGE_SIZE}&radioId={dj_id}&offset={offset}', dj_id, note=f'Downloading dj programs - {offset}') diff --git a/yt_dlp/extractor/nexx.py b/yt_dlp/extractor/nexx.py index cd32892fa0..ee1bc281c6 100644 --- a/yt_dlp/extractor/nexx.py +++ 
b/yt_dlp/extractor/nexx.py @@ -371,7 +371,7 @@ class NexxIE(InfoExtractor): # not all videos work via arc, e.g. nexx:741:1269984 if not video: # Reverse engineered from JS code (see getDeviceID function) - device_id = f'{random.randint(1, 4)}:{int(time.time())}:{random.randint(1e4, 99999)}{random.randint(1, 9)}' + device_id = f'{random.randint(1, 4)}:{int(time.time())}:{random.randint(10000, 99999)}{random.randint(1, 9)}' result = self._call_api(domain_id, 'session/init', video_id, data={ 'nxp_devh': device_id, diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py index 0ff25a6909..0bd6edfcba 100644 --- a/yt_dlp/extractor/nhk.py +++ b/yt_dlp/extractor/nhk.py @@ -4,6 +4,7 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, clean_html, + filter_dict, get_element_by_class, int_or_none, join_nonempty, @@ -590,21 +591,22 @@ class NhkRadiruIE(InfoExtractor): IE_DESC = 'NHK らじる (Radiru/Rajiru)' _VALID_URL = r'https?://www\.nhk\.or\.jp/radio/(?:player/ondemand|ondemand/detail)\.html\?p=(?P<site>[\da-zA-Z]+)_(?P<corner>[\da-zA-Z]+)(?:_(?P<headline>[\da-zA-Z]+))?' 
_TESTS = [{ - 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_3926210', - 'skip': 'Episode expired on 2024-02-24', + 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_4003239', + 'skip': 'Episode expired on 2024-06-09', 'info_dict': { - 'title': 'ジャズ・トゥナイト シリーズJAZZジャイアンツ 56 ジョニー・ホッジス', - 'id': '0449_01_3926210', + 'title': 'ジャズ・トゥナイト ジャズ「Night and Day」特集', + 'id': '0449_01_4003239', 'ext': 'm4a', + 'uploader': 'NHK FM 東京', + 'description': 'md5:ad05f3c3f3f6e99b2e69f9b5e49551dc', 'series': 'ジャズ・トゥナイト', - 'uploader': 'NHK-FM', - 'channel': 'NHK-FM', + 'channel': 'NHK FM 東京', 'thumbnail': 'https://www.nhk.or.jp/prog/img/449/g449.jpg', - 'release_date': '20240217', - 'description': 'md5:a456ee8e5e59e6dd2a7d32e62386e811', - 'timestamp': 1708185600, - 'release_timestamp': 1708178400, - 'upload_date': '20240217', + 'upload_date': '20240601', + 'series_id': '0449_01', + 'release_date': '20240601', + 'timestamp': 1717257600, + 'release_timestamp': 1717250400, }, }, { # playlist, airs every weekday so it should _hopefully_ be okay forever @@ -613,71 +615,145 @@ class NhkRadiruIE(InfoExtractor): 'id': '0458_01', 'title': 'ベストオブクラシック', 'description': '世界中の上質な演奏会をじっくり堪能する本格派クラシック番組。', - 'channel': 'NHK-FM', - 'uploader': 'NHK-FM', 'thumbnail': 'https://www.nhk.or.jp/prog/img/458/g458.jpg', + 'series_id': '0458_01', + 'uploader': 'NHK FM', + 'channel': 'NHK FM', + 'series': 'ベストオブクラシック', }, 'playlist_mincount': 3, }, { # one with letters in the id - 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F300_06_3738470', - 'note': 'Expires on 2024-03-31', + 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F683_01_3910688', + 'note': 'Expires on 2025-03-31', 'info_dict': { - 'id': 'F300_06_3738470', + 'id': 'F683_01_3910688', 'ext': 'm4a', - 'title': '有島武郎「一房のぶどう」', - 'description': '朗読:川野一宇(ラジオ深夜便アンカー)\r\n\r\n(2016年12月8日放送「ラジオ深夜便『アンカー朗読シリーズ』」より)', - 'channel': 'NHKラジオ第1、NHK-FM', - 'uploader': 'NHKラジオ第1、NHK-FM', - 'timestamp': 
1635757200, - 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F300/img/corner/box_109_thumbnail.jpg', - 'release_date': '20161207', - 'series': 'らじる文庫 by ラジオ深夜便 ', - 'release_timestamp': 1481126700, - 'upload_date': '20211101', + 'title': '夏目漱石「文鳥」第1回', + 'series': '【らじる文庫】夏目漱石「文鳥」(全4回)', + 'series_id': 'F683_01', + 'description': '朗読:浅井理アナウンサー', + 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F683/img/roudoku_05_rod_640.jpg', + 'upload_date': '20240106', + 'release_date': '20240106', + 'uploader': 'NHK R1', + 'release_timestamp': 1704511800, + 'channel': 'NHK R1', + 'timestamp': 1704512700, }, - 'expected_warnings': ['Unable to download JSON metadata', 'Failed to get extended description'], + 'expected_warnings': ['Unable to download JSON metadata', + 'Failed to get extended metadata. API returned Error 1: Invalid parameters'], }, { # news - 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F261_01_3855109', - 'skip': 'Expires on 2023-04-17', + 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F261_01_4012173', 'info_dict': { - 'id': 'F261_01_3855109', + 'id': 'F261_01_4012173', 'ext': 'm4a', 'channel': 'NHKラジオ第1', 'uploader': 'NHKラジオ第1', - 'timestamp': 1681635900, - 'release_date': '20230416', 'series': 'NHKラジオニュース', - 'title': '午後6時のNHKニュース', + 'title': '午前0時のNHKニュース', 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg', - 'upload_date': '20230416', - 'release_timestamp': 1681635600, + 'release_timestamp': 1718290800, + 'release_date': '20240613', + 'timestamp': 1718291400, + 'upload_date': '20240613', }, + }, { + # fallback when extended metadata fails + 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=2834_01_4009298', + 'skip': 'Expires on 2024-06-07', + 'info_dict': { + 'id': '2834_01_4009298', + 'title': 'まち☆キラ!開成町特集', + 'ext': 'm4a', + 'release_date': '20240531', + 'upload_date': '20240531', + 'series': 'はま☆キラ!', + 'thumbnail': 'https://www.nhk.or.jp/prog/img/2834/g2834.jpg', + 
'channel': 'NHK R1,FM', + 'description': '', + 'timestamp': 1717123800, + 'uploader': 'NHK R1,FM', + 'release_timestamp': 1717120800, + 'series_id': '2834_01', + }, + 'expected_warnings': ['Failed to get extended metadata. API returned empty list.'], }] _API_URL_TMPL = None - def _extract_extended_description(self, episode_id, episode): - service, _, area = traverse_obj(episode, ('aa_vinfo2', {str}, {lambda x: (x or '').partition(',')})) - aa_vinfo3 = traverse_obj(episode, ('aa_vinfo3', {str})) + def _extract_extended_metadata(self, episode_id, aa_vinfo): + service, _, area = traverse_obj(aa_vinfo, (2, {str}, {lambda x: (x or '').partition(',')})) detail_url = try_call( - lambda: self._API_URL_TMPL.format(service=service, area=area, dateid=aa_vinfo3)) + lambda: self._API_URL_TMPL.format(area=area, service=service, dateid=aa_vinfo[3])) if not detail_url: - return + return {} + + response = self._download_json( + detail_url, episode_id, 'Downloading extended metadata', + 'Failed to download extended metadata', fatal=False, expected_status=400) + if not response: + return {} + + if error := traverse_obj(response, ('error', {dict})): + self.report_warning( + 'Failed to get extended metadata. API returned ' + f'Error {join_nonempty("code", "message", from_dict=error, delim=": ")}') + return {} + + full_meta = traverse_obj(response, ('list', service, 0, {dict})) + if not full_meta: + self.report_warning('Failed to get extended metadata. 
API returned empty list.') + return {} + + station = ' '.join(traverse_obj(full_meta, (('service', 'area'), 'name', {str}))) or None + thumbnails = [{ + 'id': str(id_), + 'preference': 1 if id_.startswith('thumbnail') else -2 if id_.startswith('logo') else -1, + **traverse_obj(thumb, { + 'url': 'url', + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + } for id_, thumb in traverse_obj(full_meta, ('images', {dict.items}, lambda _, v: v[1]['url']))] + + return filter_dict({ + 'channel': station, + 'uploader': station, + 'description': join_nonempty( + 'subtitle', 'content', 'act', 'music', delim='\n\n', from_dict=full_meta), + 'thumbnails': thumbnails, + **traverse_obj(full_meta, { + 'title': ('title', {str}), + 'timestamp': ('end_time', {unified_timestamp}), + 'release_timestamp': ('start_time', {unified_timestamp}), + }), + }) - full_meta = traverse_obj( - self._download_json(detail_url, episode_id, 'Downloading extended metadata', fatal=False), - ('list', service, 0, {dict})) or {} - return join_nonempty('subtitle', 'content', 'act', 'music', delim='\n\n', from_dict=full_meta) + def _extract_episode_info(self, episode, programme_id, series_meta): + episode_id = f'{programme_id}_{episode["id"]}' + aa_vinfo = traverse_obj(episode, ('aa_contents_id', {lambda x: x.split(';')})) + extended_metadata = self._extract_extended_metadata(episode_id, aa_vinfo) + fallback_start_time, _, fallback_end_time = traverse_obj( + aa_vinfo, (4, {str}, {lambda x: (x or '').partition('_')})) - def _extract_episode_info(self, headline, programme_id, series_meta): + return { + **series_meta, + 'id': episode_id, + 'formats': self._extract_m3u8_formats(episode.get('stream_url'), episode_id, fatal=False), + 'container': 'm4a_dash', # force fixup, AAC-only HLS + 'was_live': True, + 'title': episode.get('program_title'), + 'description': episode.get('program_sub_title'), # fallback + 'timestamp': unified_timestamp(fallback_end_time), + 'release_timestamp': 
unified_timestamp(fallback_start_time), + **extended_metadata, + } + + def _extract_news_info(self, headline, programme_id, series_meta): episode_id = f'{programme_id}_{headline["headline_id"]}' episode = traverse_obj(headline, ('file_list', 0, {dict})) - description = self._extract_extended_description(episode_id, episode) - if not description: - self.report_warning('Failed to get extended description, falling back to summary') - description = traverse_obj(episode, ('file_title_sub', {str})) return { **series_meta, @@ -687,9 +763,9 @@ class NhkRadiruIE(InfoExtractor): 'was_live': True, 'series': series_meta.get('title'), 'thumbnail': url_or_none(headline.get('headline_image')) or series_meta.get('thumbnail'), - 'description': description, **traverse_obj(episode, { - 'title': 'file_title', + 'title': ('file_title', {str}), + 'description': ('file_title_sub', {str}), 'timestamp': ('open_time', {unified_timestamp}), 'release_timestamp': ('aa_vinfo4', {lambda x: x.split('_')[0]}, {unified_timestamp}), }), @@ -706,32 +782,58 @@ class NhkRadiruIE(InfoExtractor): site_id, corner_id, headline_id = self._match_valid_url(url).group('site', 'corner', 'headline') programme_id = f'{site_id}_{corner_id}' - if site_id == 'F261': - json_url = 'https://www.nhk.or.jp/s-media/news/news-site/list/v1/all.json' - else: - json_url = f'https://www.nhk.or.jp/radioondemand/json/{site_id}/bangumi_{programme_id}.json' - - meta = self._download_json(json_url, programme_id)['main'] + if site_id == 'F261': # XXX: News programmes use old API (for now?) 
+ meta = self._download_json( + 'https://www.nhk.or.jp/s-media/news/news-site/list/v1/all.json', programme_id)['main'] + series_meta = traverse_obj(meta, { + 'title': ('program_name', {str}), + 'channel': ('media_name', {str}), + 'uploader': ('media_name', {str}), + 'thumbnail': (('thumbnail_c', 'thumbnail_p'), {url_or_none}), + }, get_all=False) + + if headline_id: + headline = traverse_obj( + meta, ('detail_list', lambda _, v: v['headline_id'] == headline_id, any)) + if not headline: + raise ExtractorError('Content not found; it has most likely expired', expected=True) + return self._extract_news_info(headline, programme_id, series_meta) + + def news_entries(): + for headline in traverse_obj(meta, ('detail_list', ..., {dict})): + yield self._extract_news_info(headline, programme_id, series_meta) + + return self.playlist_result( + news_entries(), programme_id, description=meta.get('site_detail'), **series_meta) + + meta = self._download_json( + 'https://www.nhk.or.jp/radio-api/app/v1/web/ondemand/series', programme_id, query={ + 'site_id': site_id, + 'corner_site_id': corner_id, + }) - series_meta = traverse_obj(meta, { - 'title': 'program_name', - 'channel': 'media_name', - 'uploader': 'media_name', - 'thumbnail': (('thumbnail_c', 'thumbnail_p'), {url_or_none}), - }, get_all=False) + fallback_station = join_nonempty('NHK', traverse_obj(meta, ('radio_broadcast', {str})), delim=' ') + series_meta = { + 'series': join_nonempty('title', 'corner_name', delim=' ', from_dict=meta), + 'series_id': programme_id, + 'thumbnail': traverse_obj(meta, ('thumbnail_url', {url_or_none})), + 'channel': fallback_station, + 'uploader': fallback_station, + } if headline_id: - return self._extract_episode_info( - traverse_obj(meta, ( - 'detail_list', lambda _, v: v['headline_id'] == headline_id), get_all=False), - programme_id, series_meta) + episode = traverse_obj(meta, ('episodes', lambda _, v: v['id'] == int(headline_id), any)) + if not episode: + raise ExtractorError('Content not 
found; it has most likely expired', expected=True) + return self._extract_episode_info(episode, programme_id, series_meta) def entries(): - for headline in traverse_obj(meta, ('detail_list', ..., {dict})): - yield self._extract_episode_info(headline, programme_id, series_meta) + for episode in traverse_obj(meta, ('episodes', ..., {dict})): + yield self._extract_episode_info(episode, programme_id, series_meta) return self.playlist_result( - entries(), programme_id, playlist_description=meta.get('site_detail'), **series_meta) + entries(), programme_id, title=series_meta.get('series'), + description=meta.get('series_description'), **series_meta) class NhkRadioNewsPageIE(InfoExtractor): diff --git a/yt_dlp/extractor/nhl.py b/yt_dlp/extractor/nhl.py index 83dd480cfa..ca47a81211 100644 --- a/yt_dlp/extractor/nhl.py +++ b/yt_dlp/extractor/nhl.py @@ -2,6 +2,7 @@ from .common import InfoExtractor from ..utils import ( determine_ext, int_or_none, + join_nonempty, parse_duration, parse_iso8601, ) @@ -41,7 +42,7 @@ class NHLBaseIE(InfoExtractor): else: height = int_or_none(playback.get('height')) formats.append({ - 'format_id': playback.get('name', 'http' + (f'-{height}p' if height else '')), + 'format_id': playback.get('name') or join_nonempty('http', height and f'{height}p'), 'url': playback_url, 'width': int_or_none(playback.get('width')), 'height': height, diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py index 9d7b010c54..961dd0c5e9 100644 --- a/yt_dlp/extractor/niconico.py +++ b/yt_dlp/extractor/niconico.py @@ -40,7 +40,6 @@ class NiconicoIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.nicovideo.jp/watch/sm22312215', - 'md5': 'd1a75c0823e2f629128c43e1212760f9', 'info_dict': { 'id': 'sm22312215', 'ext': 'mp4', @@ -56,8 +55,8 @@ class NiconicoIE(InfoExtractor): 'comment_count': int, 'genres': ['未設定'], 'tags': [], - 'expected_protocol': str, }, + 'params': {'skip_download': 'm3u8'}, }, { # File downloaded with and without credentials are different, 
so omit # the md5 field @@ -77,8 +76,8 @@ class NiconicoIE(InfoExtractor): 'view_count': int, 'genres': ['音楽・サウンド'], 'tags': ['Translation_Request', 'Kagamine_Rin', 'Rin_Original'], - 'expected_protocol': str, }, + 'params': {'skip_download': 'm3u8'}, }, { # 'video exists but is marked as "deleted" # md5 is unstable @@ -112,7 +111,6 @@ class NiconicoIE(InfoExtractor): }, { # video not available via `getflv`; "old" HTML5 video 'url': 'http://www.nicovideo.jp/watch/sm1151009', - 'md5': 'f95a3d259172667b293530cc2e41ebda', 'info_dict': { 'id': 'sm1151009', 'ext': 'mp4', @@ -128,11 +126,10 @@ class NiconicoIE(InfoExtractor): 'comment_count': int, 'genres': ['ゲーム'], 'tags': [], - 'expected_protocol': str, }, + 'params': {'skip_download': 'm3u8'}, }, { # "New" HTML5 video - # md5 is unstable 'url': 'http://www.nicovideo.jp/watch/sm31464864', 'info_dict': { 'id': 'sm31464864', @@ -149,12 +146,11 @@ class NiconicoIE(InfoExtractor): 'comment_count': int, 'genres': ['アニメ'], 'tags': [], - 'expected_protocol': str, }, + 'params': {'skip_download': 'm3u8'}, }, { # Video without owner 'url': 'http://www.nicovideo.jp/watch/sm18238488', - 'md5': 'd265680a1f92bdcbbd2a507fc9e78a9e', 'info_dict': { 'id': 'sm18238488', 'ext': 'mp4', @@ -168,8 +164,8 @@ class NiconicoIE(InfoExtractor): 'comment_count': int, 'genres': ['エンターテイメント'], 'tags': [], - 'expected_protocol': str, }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'http://sp.nicovideo.jp/watch/sm28964488?ss_pos=1&cp_in=wt_tg', 'only_matching': True, @@ -424,7 +420,7 @@ class NiconicoIE(InfoExtractor): 'x-request-with': 'https://www.nicovideo.jp', })['data']['contentUrl'] # Getting all audio formats results in duplicate video formats which we filter out later - dms_fmts = self._extract_m3u8_formats(dms_m3u8_url, video_id) + dms_fmts = self._extract_m3u8_formats(dms_m3u8_url, video_id, 'mp4') # m3u8 extraction does not provide audio bitrates, so extract from the API data and fix for audio_fmt in traverse_obj(dms_fmts, lambda _, 
v: v['vcodec'] == 'none'): @@ -436,7 +432,6 @@ class NiconicoIE(InfoExtractor): 'asr': ('samplingRate', {int_or_none}), }), get_all=False), 'acodec': 'aac', - 'ext': 'm4a', } # Sort before removing dupes to keep the format dicts with the lowest tbr @@ -458,9 +453,11 @@ class NiconicoIE(InfoExtractor): if video_id.startswith('so'): video_id = self._match_id(handle.url) - api_data = self._parse_json(self._html_search_regex( - 'data-api-data="([^"]+)"', webpage, - 'API data', default='{}'), video_id) + api_data = traverse_obj( + self._parse_json(self._html_search_meta('server-response', webpage) or '', video_id), + ('data', 'response', {dict})) + if not api_data: + raise ExtractorError('Server response data not found') except ExtractorError as e: try: api_data = self._download_json( @@ -872,7 +869,7 @@ class NicovideoTagURLIE(NicovideoSearchBaseIE): class NiconicoUserIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/user/(?P<id>\d+)/?(?:$|[#?])' + _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/user/(?P<id>\d+)(?:/video)?/?(?:$|[#?])' _TEST = { 'url': 'https://www.nicovideo.jp/user/419948', 'info_dict': { @@ -880,7 +877,7 @@ class NiconicoUserIE(InfoExtractor): }, 'playlist_mincount': 101, } - _API_URL = 'https://nvapi.nicovideo.jp/v1/users/%s/videos?sortKey=registeredAt&sortOrder=desc&pageSize=%s&page=%s' + _API_URL = 'https://nvapi.nicovideo.jp/v2/users/%s/videos?sortKey=registeredAt&sortOrder=desc&pageSize=%s&page=%s' _PAGE_SIZE = 100 _API_HEADERS = { @@ -900,12 +897,13 @@ class NiconicoUserIE(InfoExtractor): total_count = int_or_none(json_parsed['data'].get('totalCount')) for entry in json_parsed['data']['items']: count += 1 - yield self.url_result('https://www.nicovideo.jp/watch/{}'.format(entry['id'])) + yield self.url_result( + f'https://www.nicovideo.jp/watch/{entry["essential"]["id"]}', ie=NiconicoIE) page_num += 1 def _real_extract(self, url): list_id = self._match_id(url) - return self.playlist_result(self._entries(list_id), list_id, 
ie=NiconicoIE.ie_key()) + return self.playlist_result(self._entries(list_id), list_id) class NiconicoLiveIE(InfoExtractor): diff --git a/yt_dlp/extractor/noodlemagazine.py b/yt_dlp/extractor/noodlemagazine.py index 6414f46efb..4a73e4779a 100644 --- a/yt_dlp/extractor/noodlemagazine.py +++ b/yt_dlp/extractor/noodlemagazine.py @@ -43,14 +43,8 @@ class NoodleMagazineIE(InfoExtractor): def build_url(url_or_path): return urljoin('https://adult.noodlemagazine.com', url_or_path) - headers = {'Referer': url} - player_path = self._html_search_regex( - r'<iframe[^>]+\bid="iplayer"[^>]+\bsrc="([^"]+)"', webpage, 'player path') - player_iframe = self._download_webpage( - build_url(player_path), video_id, 'Downloading iframe page', headers=headers) - playlist_url = self._search_regex( - r'window\.playlistUrl\s*=\s*["\']([^"\']+)["\']', player_iframe, 'playlist url') - playlist_info = self._download_json(build_url(playlist_url), video_id, headers=headers) + playlist_info = self._search_json( + r'window\.playlist\s*=', webpage, video_id, 'playlist info') formats = [] for source in traverse_obj(playlist_info, ('sources', lambda _, v: v['file'])): diff --git a/yt_dlp/extractor/nuum.py b/yt_dlp/extractor/nuum.py index 3db663ded0..697fc6b32e 100644 --- a/yt_dlp/extractor/nuum.py +++ b/yt_dlp/extractor/nuum.py @@ -43,15 +43,17 @@ class NuumBaseIE(InfoExtractor): is_live = media.get('media_status') == 'RUNNING' formats, subtitles = None, None + headers = {'Referer': 'https://nuum.ru/'} if extract_formats: formats, subtitles = self._extract_m3u8_formats_and_subtitles( - media_url, video_id, 'mp4', live=is_live) + media_url, video_id, 'mp4', live=is_live, headers=headers) return filter_dict({ 'id': video_id, 'is_live': is_live, 'formats': formats, 'subtitles': subtitles, + 'http_headers': headers, **traverse_obj(container, { 'title': ('media_container_name', {str}), 'description': ('media_container_description', {str}), @@ -78,7 +80,7 @@ class NuumMediaIE(NuumBaseIE): 'only_matching': 
True, }, { 'url': 'https://nuum.ru/videos/1567547-toxi-hurtz', - 'md5': 'f1d9118a30403e32b702a204eb03aca3', + 'md5': 'ce28837a5bbffe6952d7bfd3d39811b0', 'info_dict': { 'id': '1567547', 'ext': 'mp4', diff --git a/yt_dlp/extractor/nzonscreen.py b/yt_dlp/extractor/nzonscreen.py index 5fc516daf4..755039804e 100644 --- a/yt_dlp/extractor/nzonscreen.py +++ b/yt_dlp/extractor/nzonscreen.py @@ -10,7 +10,7 @@ from ..utils import ( class NZOnScreenIE(InfoExtractor): - _VALID_URL = r'^https?://www\.nzonscreen\.com/title/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://www\.nzonscreen\.com/title/(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'https://www.nzonscreen.com/title/shoop-shoop-diddy-wop-cumma-cumma-wang-dang-1982', 'info_dict': { diff --git a/yt_dlp/extractor/nzz.py b/yt_dlp/extractor/nzz.py index ac3b73156e..047c4e1ac9 100644 --- a/yt_dlp/extractor/nzz.py +++ b/yt_dlp/extractor/nzz.py @@ -1,9 +1,6 @@ import re from .common import InfoExtractor -from ..utils import ( - extract_attributes, -) class NZZIE(InfoExtractor): @@ -22,19 +19,14 @@ class NZZIE(InfoExtractor): 'playlist_count': 1, }] + def _entries(self, webpage, page_id): + for script in re.findall(r'(?s)<script[^>]* data-hid="jw-video-jw[^>]+>(.+?)</script>', webpage): + settings = self._search_json(r'var\s+settings\s*=[^{]*', script, 'settings', page_id, fatal=False) + if entry := self._parse_jwplayer_data(settings, page_id): + yield entry + def _real_extract(self, url): page_id = self._match_id(url) webpage = self._download_webpage(url, page_id) - entries = [] - for player_element in re.findall( - r'(<[^>]+class="kalturaPlayer[^"]*"[^>]*>)', webpage): - player_params = extract_attributes(player_element) - if player_params.get('data-type') not in ('kaltura_singleArticle',): - self.report_warning('Unsupported player type') - continue - entry_id = player_params['data-id'] - entries.append(self.url_result( - 'kaltura:1750922:' + entry_id, 'Kaltura', entry_id)) - - return self.playlist_result(entries, page_id) + return 
self.playlist_result(self._entries(webpage, page_id), page_id) diff --git a/yt_dlp/extractor/olympics.py b/yt_dlp/extractor/olympics.py index becf052f6f..bbf83e531a 100644 --- a/yt_dlp/extractor/olympics.py +++ b/yt_dlp/extractor/olympics.py @@ -1,9 +1,19 @@ from .common import InfoExtractor -from ..utils import int_or_none, try_get +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, + parse_qs, + try_get, + update_url, + url_or_none, +) +from ..utils.traversal import traverse_obj class OlympicsReplayIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?olympics\.com(?:/tokyo-2020)?/[a-z]{2}/(?:replay|video)/(?P<id>[^/#&?]+)' + _VALID_URL = r'https?://(?:www\.)?olympics\.com/[a-z]{2}/(?:paris-2024/)?(?:replay|videos?|original-series/episode)/(?P<id>[\w-]+)' _TESTS = [{ 'url': 'https://olympics.com/fr/video/men-s-109kg-group-a-weightlifting-tokyo-2020-replays', 'info_dict': { @@ -11,26 +21,105 @@ class OlympicsReplayIE(InfoExtractor): 'ext': 'mp4', 'title': '+109kg (H) Groupe A - Haltérophilie | Replay de Tokyo 2020', 'upload_date': '20210801', - 'timestamp': 1627783200, + 'timestamp': 1627797600, 'description': 'md5:c66af4a5bc7429dbcc43d15845ff03b3', - 'uploader': 'International Olympic Committee', + 'thumbnail': 'https://img.olympics.com/images/image/private/t_1-1_1280/primary/nua4o7zwyaznoaejpbk2', + 'duration': 7017.0, }, - 'params': { - 'skip_download': True, + }, { + 'url': 'https://olympics.com/en/original-series/episode/b-boys-and-b-girls-take-the-spotlight-breaking-life-road-to-paris-2024', + 'info_dict': { + 'id': '32633650-c5ee-4280-8b94-fb6defb6a9b5', + 'ext': 'mp4', + 'title': 'B-girl Nicka - Breaking Life, Road to Paris 2024 | Episode 1', + 'upload_date': '20240517', + 'timestamp': 1715948200, + 'description': 'md5:f63d728a41270ec628f6ac33ce471bb1', + 'thumbnail': 'https://img.olympics.com/images/image/private/t_1-1_1280/primary/a3j96l7j6so3vyfijby1', + 'duration': 1321.0, + }, + }, { 
+ 'url': 'https://olympics.com/en/paris-2024/videos/men-s-preliminaries-gbr-esp-ned-rsa-hockey-olympic-games-paris-2024', + 'info_dict': { + 'id': '3d96db23-8eee-4b7c-8ef5-488a0361026c', + 'ext': 'mp4', + 'title': 'Men\'s Preliminaries GBR-ESP & NED-RSA | Hockey | Olympic Games Paris 2024', + 'upload_date': '20240727', + 'timestamp': 1722066600, }, + 'skip': 'Geo-restricted to RU, BR, BT, NP, TM, BD, TL', }, { - 'url': 'https://olympics.com/tokyo-2020/en/replay/bd242924-4b22-49a5-a846-f1d4c809250d/mens-bronze-medal-match-hun-esp', - 'only_matching': True, + 'url': 'https://olympics.com/en/paris-2024/videos/dnp-suni-lee-i-have-goals-and-i-have-expectations-for-myself-but-i-also-am-trying-to-give-myself-grace', + 'info_dict': { + 'id': 'a42f37ab-8a74-41d0-a7d9-af27b7b02a90', + 'ext': 'mp4', + 'title': 'md5:c7cfbc9918636a98e66400a812e4d407', + 'upload_date': '20240729', + 'timestamp': 1722288600, + }, }] + _GEO_BYPASS = False + + def _extract_from_nextjs_data(self, webpage, video_id): + data = traverse_obj(self._search_nextjs_data(webpage, video_id, default={}), ( + 'props', 'pageProps', 'page', 'items', + lambda _, v: v['name'] == 'videoPlaylist', 'data', 'currentVideo', {dict}, any)) + if not data: + return None + + geo_countries = traverse_obj(data, ('countries', ..., {str})) + if traverse_obj(data, ('geoRestrictedVideo', {bool})): + self.raise_geo_restricted(countries=geo_countries) + + is_live = traverse_obj(data, ('streamingStatus', {str})) == 'LIVE' + m3u8_url = traverse_obj(data, ('videoUrl', {url_or_none})) or data['streamUrl'] + tokenized_url = self._tokenize_url(m3u8_url, data['jwtToken'], is_live, video_id) + + try: + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + tokenized_url, video_id, 'mp4', m3u8_id='hls') + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and 'georestricted' in e.cause.msg: + self.raise_geo_restricted(countries=geo_countries) + raise + + return { + 'formats': formats, + 'subtitles': subtitles, + 
'is_live': is_live, + **traverse_obj(data, { + 'id': ('videoID', {str}), + 'title': ('title', {str}), + 'timestamp': ('contentDate', {parse_iso8601}), + }), + } + + def _tokenize_url(self, url, token, is_live, video_id): + return self._download_json( + 'https://metering.olympics.com/tokengenerator', video_id, + 'Downloading tokenized m3u8 url', query={ + **parse_qs(url), + 'url': update_url(url, query=None), + 'service-id': 'live' if is_live else 'vod', + 'user-auth': token, + })['data']['url'] + + def _legacy_tokenize_url(self, url, video_id): + return self._download_json( + 'https://olympics.com/tokenGenerator', video_id, + 'Downloading legacy tokenized m3u8 url', query={'url': url}) def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + + if info := self._extract_from_nextjs_data(webpage, video_id): + return info + title = self._html_search_meta(('title', 'og:title', 'twitter:title'), webpage) - uuid = self._html_search_meta('episode_uid', webpage) + video_uuid = self._html_search_meta('episode_uid', webpage) m3u8_url = self._html_search_meta('video_url', webpage) - json_ld = self._search_json_ld(webpage, uuid) + json_ld = self._search_json_ld(webpage, video_uuid) thumbnails_list = json_ld.get('image') if not thumbnails_list: thumbnails_list = self._html_search_regex( @@ -48,12 +137,12 @@ class OlympicsReplayIE(InfoExtractor): 'width': width, 'height': int_or_none(try_get(width, lambda x: x * height_a / width_a)), }) - m3u8_url = self._download_json( - f'https://olympics.com/tokenGenerator?url={m3u8_url}', uuid, note='Downloading m3u8 url') - formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, uuid, 'mp4', m3u8_id='hls') + + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + self._legacy_tokenize_url(m3u8_url, video_uuid), video_uuid, 'mp4', m3u8_id='hls') return { - 'id': uuid, + 'id': video_uuid, 'title': title, 'thumbnails': thumbnails, 'formats': formats, diff --git 
a/yt_dlp/extractor/orf.py b/yt_dlp/extractor/orf.py index f1403d9207..9c37a54d62 100644 --- a/yt_dlp/extractor/orf.py +++ b/yt_dlp/extractor/orf.py @@ -550,7 +550,8 @@ class ORFONIE(InfoExtractor): return self._extract_video_info(segment_id, selected_segment) # Even some segmented videos have an unsegmented version available in API response root - if not traverse_obj(api_json, ('sources', ..., ..., 'src', {url_or_none})): + if (self._configuration_arg('prefer_segments_playlist') + or not traverse_obj(api_json, ('sources', ..., ..., 'src', {url_or_none}))): return self.playlist_result( (self._extract_video_info(str(segment['id']), segment) for segment in segments), video_id, **self._parse_metadata(api_json), multi_video=True) diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py index 26ca84ab34..4d668cd37d 100644 --- a/yt_dlp/extractor/patreon.py +++ b/yt_dlp/extractor/patreon.py @@ -1,7 +1,9 @@ +import functools import itertools import urllib.parse from .common import InfoExtractor +from .sproutvideo import VidsIoIE from .vimeo import VimeoIE from ..networking.exceptions import HTTPError from ..utils import ( @@ -12,6 +14,7 @@ from ..utils import ( int_or_none, mimetype2ext, parse_iso8601, + smuggle_url, str_or_none, traverse_obj, url_or_none, @@ -20,13 +23,19 @@ from ..utils import ( class PatreonBaseIE(InfoExtractor): - USER_AGENT = 'Patreon/7.6.28 (Android; Android 11; Scale/2.10)' + @functools.cached_property + def patreon_user_agent(self): + # Patreon mobile UA is needed to avoid triggering Cloudflare anti-bot protection. 
+ # Newer UA yields higher res m3u8 formats for locked posts, but gives 401 if not logged-in + if self._get_cookies('https://www.patreon.com/').get('session_id'): + return 'Patreon/72.2.28 (Android; Android 14; Scale/2.10)' + return 'Patreon/7.6.28 (Android; Android 11; Scale/2.10)' def _call_api(self, ep, item_id, query=None, headers=None, fatal=True, note=None): if headers is None: headers = {} if 'User-Agent' not in headers: - headers['User-Agent'] = self.USER_AGENT + headers['User-Agent'] = self.patreon_user_agent if query: query.update({'json-api-version': 1.0}) @@ -46,6 +55,7 @@ class PatreonBaseIE(InfoExtractor): class PatreonIE(PatreonBaseIE): + IE_NAME = 'patreon' _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?:creation\?hid=|posts/(?:[\w-]+-)?)(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.patreon.com/creation?hid=743933', @@ -109,6 +119,7 @@ class PatreonIE(PatreonBaseIE): 'comment_count': int, 'channel_is_verified': True, 'chapters': 'count:4', + 'timestamp': 1423689666, }, 'params': { 'noplaylist': True, @@ -219,6 +230,7 @@ class PatreonIE(PatreonBaseIE): 'thumbnail': r're:^https?://.+', }, 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # multiple attachments/embeds 'url': 'https://www.patreon.com/posts/holy-wars-solos-100601977', @@ -305,22 +317,33 @@ class PatreonIE(PatreonBaseIE): 'channel_follower_count': ('attributes', 'patron_count', {int_or_none}), })) + # all-lowercase 'referer' so we can smuggle it to Generic, SproutVideo, Vimeo + headers = {'referer': 'https://patreon.com/'} + # handle Vimeo embeds if traverse_obj(attributes, ('embed', 'provider')) == 'Vimeo': v_url = urllib.parse.unquote(self._html_search_regex( r'(https(?:%3A%2F%2F|://)player\.vimeo\.com.+app_id(?:=|%3D)+\d+)', traverse_obj(attributes, ('embed', 'html', {str})), 'vimeo url', fatal=False) or '') if url_or_none(v_url) and self._request_webpage( - v_url, video_id, 'Checking Vimeo embed URL', - headers={'Referer': 
'https://patreon.com/'}, - fatal=False, errnote=False): + v_url, video_id, 'Checking Vimeo embed URL', headers=headers, + fatal=False, errnote=False, expected_status=429): # 429 is TLS fingerprint rejection entries.append(self.url_result( VimeoIE._smuggle_referrer(v_url, 'https://patreon.com/'), VimeoIE, url_transparent=True)) embed_url = traverse_obj(attributes, ('embed', 'url', {url_or_none})) - if embed_url and self._request_webpage(embed_url, video_id, 'Checking embed URL', fatal=False, errnote=False): - entries.append(self.url_result(embed_url)) + if embed_url and (urlh := self._request_webpage( + embed_url, video_id, 'Checking embed URL', headers=headers, + fatal=False, errnote=False, expected_status=403)): + # Vimeo's Cloudflare anti-bot protection will return HTTP status 200 for 404, so we need + # to check for "Sorry, we couldn&rsquo;t find that page" in the meta description tag + meta_description = clean_html(self._html_search_meta( + 'description', self._webpage_read_content(urlh, embed_url, video_id, fatal=False), default=None)) + # Password-protected vids.io embeds return 403 errors w/o --video-password or session cookie + if ((urlh.status != 403 and meta_description != 'Sorry, we couldn’t find that page') + or VidsIoIE.suitable(embed_url)): + entries.append(self.url_result(smuggle_url(embed_url, headers))) post_file = traverse_obj(attributes, ('post_file', {dict})) if post_file: @@ -411,15 +434,19 @@ class PatreonIE(PatreonBaseIE): class PatreonCampaignIE(PatreonBaseIE): - - _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?!rss)(?:(?:m/(?P<campaign_id>\d+))|(?P<vanity>[-\w]+))' + IE_NAME = 'patreon:campaign' + _VALID_URL = r'''(?x) + https?://(?:www\.)?patreon\.com/(?: + (?:m|api/campaigns)/(?P<campaign_id>\d+)| + (?P<vanity>(?!creation[?/]|posts/|rss[?/])[\w-]+) + )(?:/posts)?/?(?:$|[?#])''' _TESTS = [{ 'url': 'https://www.patreon.com/dissonancepod/', 'info_dict': { 'title': 'Cognitive Dissonance Podcast', 'channel_url': 
'https://www.patreon.com/dissonancepod', 'id': '80642', - 'description': 'md5:eb2fa8b83da7ab887adeac34da6b7af7', + 'description': r're:(?s).*We produce a weekly news podcast focusing on stories that deal with skepticism and religion.*', 'channel_id': '80642', 'channel': 'Cognitive Dissonance Podcast', 'age_limit': 0, @@ -434,31 +461,46 @@ class PatreonCampaignIE(PatreonBaseIE): 'url': 'https://www.patreon.com/m/4767637/posts', 'info_dict': { 'title': 'Not Just Bikes', - 'channel_follower_count': int, 'id': '4767637', 'channel_id': '4767637', 'channel_url': 'https://www.patreon.com/notjustbikes', - 'description': 'md5:595c6e7dca76ae615b1d38c298a287a1', + 'description': r're:(?s).*Not Just Bikes started as a way to explain why we chose to live in the Netherlands.*', 'age_limit': 0, 'channel': 'Not Just Bikes', 'uploader_url': 'https://www.patreon.com/notjustbikes', - 'uploader': 'Not Just Bikes', + 'uploader': 'Jason', 'uploader_id': '37306634', 'thumbnail': r're:^https?://.*$', }, 'playlist_mincount': 71, + }, { + 'url': 'https://www.patreon.com/api/campaigns/4243769/posts', + 'info_dict': { + 'title': 'Second Thought', + 'channel_follower_count': int, + 'id': '4243769', + 'channel_id': '4243769', + 'channel_url': 'https://www.patreon.com/secondthought', + 'description': r're:(?s).*Second Thought is an educational YouTube channel.*', + 'age_limit': 0, + 'channel': 'Second Thought', + 'uploader_url': 'https://www.patreon.com/secondthought', + 'uploader': 'JT Chapman', + 'uploader_id': '32718287', + 'thumbnail': r're:^https?://.*$', + }, + 'playlist_mincount': 201, }, { 'url': 'https://www.patreon.com/dissonancepod/posts', 'only_matching': True, }, { 'url': 'https://www.patreon.com/m/5932659', 'only_matching': True, + }, { + 'url': 'https://www.patreon.com/api/campaigns/4243769', + 'only_matching': True, }] - @classmethod - def suitable(cls, url): - return False if PatreonIE.suitable(url) else super().suitable(url) - def _entries(self, campaign_id): cursor = None 
params = { @@ -485,7 +527,7 @@ class PatreonCampaignIE(PatreonBaseIE): campaign_id, vanity = self._match_valid_url(url).group('campaign_id', 'vanity') if campaign_id is None: - webpage = self._download_webpage(url, vanity, headers={'User-Agent': self.USER_AGENT}) + webpage = self._download_webpage(url, vanity, headers={'User-Agent': self.patreon_user_agent}) campaign_id = self._search_nextjs_data( webpage, vanity)['props']['pageProps']['bootstrapEnvelope']['pageBootstrap']['campaign']['data']['id'] diff --git a/yt_dlp/extractor/peloton.py b/yt_dlp/extractor/peloton.py index c0443e9c91..5999d4a6a0 100644 --- a/yt_dlp/extractor/peloton.py +++ b/yt_dlp/extractor/peloton.py @@ -41,7 +41,7 @@ class PelotonIE(InfoExtractor): }, 'params': { 'skip_download': 'm3u8', }, - '_skip': 'Account needed', + 'skip': 'Account needed', }, { 'url': 'https://members.onepeloton.com/classes/player/26603d53d6bb4de1b340514864a6a6a8', 'info_dict': { @@ -61,7 +61,7 @@ class PelotonIE(InfoExtractor): }, 'params': { 'skip_download': 'm3u8', }, - '_skip': 'Account needed', + 'skip': 'Account needed', }] _MANIFEST_URL_TEMPLATE = '%s?hdnea=%s' @@ -199,7 +199,7 @@ class PelotonLiveIE(InfoExtractor): 'params': { 'skip_download': 'm3u8', }, - '_skip': 'Account needed', + 'skip': 'Account needed', } def _real_extract(self, url): diff --git a/yt_dlp/extractor/performgroup.py b/yt_dlp/extractor/performgroup.py index c0d5575912..df726c975b 100644 --- a/yt_dlp/extractor/performgroup.py +++ b/yt_dlp/extractor/performgroup.py @@ -1,5 +1,5 @@ from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import int_or_none, join_nonempty class PerformGroupIE(InfoExtractor): @@ -50,11 +50,8 @@ class PerformGroupIE(InfoExtractor): if not c_url: continue tbr = int_or_none(c.get('bitrate'), 1000) - format_id = 'http' - if tbr: - format_id += f'-{tbr}' formats.append({ - 'format_id': format_id, + 'format_id': join_nonempty('http', tbr), 'url': c_url, 'tbr': tbr, 'width': 
int_or_none(c.get('width')), diff --git a/yt_dlp/extractor/picarto.py b/yt_dlp/extractor/picarto.py index 726fe41425..72e89c31ed 100644 --- a/yt_dlp/extractor/picarto.py +++ b/yt_dlp/extractor/picarto.py @@ -5,6 +5,7 @@ from ..utils import ( ExtractorError, str_or_none, traverse_obj, + update_url, ) @@ -43,15 +44,16 @@ class PicartoIE(InfoExtractor): url } }''' % (channel_id, channel_id), # noqa: UP031 - })['data'] + }, headers={'Accept': '*/*', 'Content-Type': 'application/json'})['data'] metadata = data['channel'] if metadata.get('online') == 0: raise ExtractorError('Stream is offline', expected=True) title = metadata['title'] - cdn_data = self._download_json( - data['getLoadBalancerUrl']['url'] + '/stream/json_' + metadata['stream_name'] + '.js', + cdn_data = self._download_json(''.join(( + update_url(data['getLoadBalancerUrl']['url'], scheme='https'), + '/stream/json_', metadata['stream_name'], '.js')), channel_id, 'Downloading load balancing info') formats = [] @@ -99,10 +101,10 @@ class PicartoVodIE(InfoExtractor): }, 'skip': 'The VOD does not exist', }, { - 'url': 'https://picarto.tv/ArtofZod/videos/772650', - 'md5': '00067a0889f1f6869cc512e3e79c521b', + 'url': 'https://picarto.tv/ArtofZod/videos/771008', + 'md5': 'abef5322f2700d967720c4c6754b2a34', 'info_dict': { - 'id': '772650', + 'id': '771008', 'ext': 'mp4', 'title': 'Art of Zod - Drawing and Painting', 'thumbnail': r're:^https?://.*\.jpg', @@ -131,7 +133,7 @@ class PicartoVodIE(InfoExtractor): }} }} }}''', - })['data']['video'] + }, headers={'Accept': '*/*', 'Content-Type': 'application/json'})['data']['video'] file_name = data['file_name'] netloc = urllib.parse.urlparse(data['video_recording_image_url']).netloc diff --git a/yt_dlp/extractor/pinterest.py b/yt_dlp/extractor/pinterest.py index 07f249498c..f0b38893b2 100644 --- a/yt_dlp/extractor/pinterest.py +++ b/yt_dlp/extractor/pinterest.py @@ -109,7 +109,7 @@ class PinterestBaseIE(InfoExtractor): class PinterestIE(PinterestBaseIE): - _VALID_URL = 
rf'{PinterestBaseIE._VALID_URL_BASE}/pin/(?P<id>\d+)' + _VALID_URL = rf'{PinterestBaseIE._VALID_URL_BASE}/pin/(?:[\w-]+--)?(?P<id>\d+)' _TESTS = [{ # formats found in data['videos'] 'url': 'https://www.pinterest.com/pin/664281013778109217/', @@ -174,6 +174,25 @@ class PinterestIE(PinterestBaseIE): }, { 'url': 'https://co.pinterest.com/pin/824721750502199491/', 'only_matching': True, + }, + { + 'url': 'https://pinterest.com/pin/dive-into-serenity-blue-lagoon-pedi-nails-for-a-tranquil-and-refreshing-spa-experience-video-in-2024--2885187256207927', + 'info_dict': { + 'id': '2885187256207927', + 'ext': 'mp4', + 'title': 'Dive into Serenity: Blue Lagoon Pedi Nails for a Tranquil and Refreshing Spa Experience! 💙💅', + 'description': 'md5:5da41c767d2317e42e49b663b0b2150f', + 'uploader': 'Glamour Artistry |Everyday Outfits, Luxury Fashion & Nail Designs', + 'uploader_id': '1142999717836434688', + 'upload_date': '20240702', + 'timestamp': 1719939156, + 'duration': 7.967, + 'comment_count': int, + 'repost_count': int, + 'categories': 'count:9', + 'tags': ['#BlueLagoonPediNails', '#SpaExperience'], + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/podbayfm.py b/yt_dlp/extractor/podbayfm.py index 2a26fd2b36..0141eca909 100644 --- a/yt_dlp/extractor/podbayfm.py +++ b/yt_dlp/extractor/podbayfm.py @@ -1,28 +1,40 @@ from .common import InfoExtractor -from ..utils import OnDemandPagedList, int_or_none, jwt_decode_hs256, try_call +from ..utils import ( + OnDemandPagedList, + clean_html, + int_or_none, + jwt_decode_hs256, + url_or_none, +) +from ..utils.traversal import traverse_obj -def result_from_props(props, episode_id=None): +def result_from_props(props): return { - 'id': props.get('podcast_id') or episode_id, - 'title': props.get('title'), - 'url': props['mediaURL'], + **traverse_obj(props, { + 'id': ('_id', {str}), + 'title': ('title', {str}), + 'url': ('mediaURL', {url_or_none}), + 'description': 
('description', {clean_html}), + 'thumbnail': ('image', {jwt_decode_hs256}, 'url', {url_or_none}), + 'timestamp': ('timestamp', {int_or_none}), + 'duration': ('duration', {int_or_none}), + }), 'ext': 'mp3', - 'thumbnail': try_call(lambda: jwt_decode_hs256(props['image'])['url']), - 'timestamp': props.get('timestamp'), - 'duration': int_or_none(props.get('duration')), + 'vcodec': 'none', } class PodbayFMIE(InfoExtractor): - _VALID_URL = r'https?://podbay\.fm/p/[^/]*/e/(?P<id>[^/]*)/?(?:[\?#].*)?$' + _VALID_URL = r'https?://podbay\.fm/p/[^/?#]+/e/(?P<id>\d+)' _TESTS = [{ 'url': 'https://podbay.fm/p/behind-the-bastards/e/1647338400', - 'md5': '98b41285dcf7989d105a4ed0404054cf', + 'md5': '895ac8505de349515f5ee8a4a3195c93', 'info_dict': { - 'id': '1647338400', + 'id': '62306451f4a48e58d0c4d6a8', 'title': 'Part One: Kissinger', 'ext': 'mp3', + 'description': r're:^We begin our epic six part series on Henry Kissinger.+', 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1647338400, 'duration': 5001, @@ -34,24 +46,25 @@ class PodbayFMIE(InfoExtractor): episode_id = self._match_id(url) webpage = self._download_webpage(url, episode_id) data = self._search_nextjs_data(webpage, episode_id) - return result_from_props(data['props']['pageProps']['episode'], episode_id) + return result_from_props(data['props']['pageProps']['episode']) class PodbayFMChannelIE(InfoExtractor): - _VALID_URL = r'https?://podbay\.fm/p/(?P<id>[^/]*)/?(?:[\?#].*)?$' + _VALID_URL = r'https?://podbay\.fm/p/(?P<id>[^/?#]+)/?(?:$|[?#])' _TESTS = [{ 'url': 'https://podbay.fm/p/behind-the-bastards', 'info_dict': { 'id': 'behind-the-bastards', 'title': 'Behind the Bastards', }, + 'playlist_mincount': 21, }] _PAGE_SIZE = 10 def _fetch_page(self, channel_id, pagenum): return self._download_json( f'https://podbay.fm/api/podcast?reverse=true&page={pagenum}&slug={channel_id}', - channel_id)['podcast'] + f'Downloading channel JSON page {pagenum + 1}', channel_id)['podcast'] @staticmethod def 
_results_from_page(channel_id, page): diff --git a/yt_dlp/extractor/pokergo.py b/yt_dlp/extractor/pokergo.py index e22348053c..72cbce0a0c 100644 --- a/yt_dlp/extractor/pokergo.py +++ b/yt_dlp/extractor/pokergo.py @@ -5,6 +5,7 @@ from ..utils import ( ExtractorError, try_get, ) +from ..utils.traversal import traverse_obj class PokerGoBaseIE(InfoExtractor): @@ -65,7 +66,7 @@ class PokerGoIE(PokerGoBaseIE): 'width': image.get('width'), 'height': image.get('height'), } for image in data_json.get('images') or [] if image.get('url')] - series_json = next(dct for dct in data_json.get('show_tags') or [] if dct.get('video_id') == video_id) or {} + series_json = traverse_obj(data_json, ('show_tags', lambda _, v: v['video_id'] == video_id, any)) or {} return { '_type': 'url_transparent', diff --git a/yt_dlp/extractor/pornbox.py b/yt_dlp/extractor/pornbox.py index e15244dac0..9b89adbf9d 100644 --- a/yt_dlp/extractor/pornbox.py +++ b/yt_dlp/extractor/pornbox.py @@ -1,5 +1,6 @@ +import functools + from .common import InfoExtractor -from ..compat import functools from ..utils import ( int_or_none, parse_duration, diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py index 679dc63234..e1e9777e8e 100644 --- a/yt_dlp/extractor/pornhub.py +++ b/yt_dlp/extractor/pornhub.py @@ -628,8 +628,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): page_entries = self._extract_entries(webpage, host) if not page_entries: break - for e in page_entries: - yield e + yield from page_entries if not self._has_more(webpage): break diff --git a/yt_dlp/extractor/pr0gramm.py b/yt_dlp/extractor/pr0gramm.py index f2c4e12e66..b0d6475fe4 100644 --- a/yt_dlp/extractor/pr0gramm.py +++ b/yt_dlp/extractor/pr0gramm.py @@ -1,9 +1,9 @@ import datetime as dt +import functools import json import urllib.parse from .common import InfoExtractor -from ..compat import functools from ..utils import ( ExtractorError, float_or_none, diff --git a/yt_dlp/extractor/prosiebensat1.py 
b/yt_dlp/extractor/prosiebensat1.py index 6a3e0971cd..e8a4712051 100644 --- a/yt_dlp/extractor/prosiebensat1.py +++ b/yt_dlp/extractor/prosiebensat1.py @@ -7,6 +7,7 @@ from ..utils import ( determine_ext, float_or_none, int_or_none, + join_nonempty, merge_dicts, unified_strdate, ) @@ -147,13 +148,13 @@ class ProSiebenSat1BaseIE(InfoExtractor): 'page_url': 'http://www.prosieben.de', 'tbr': tbr, 'ext': 'flv', - 'format_id': 'rtmp{}'.format(f'-{tbr}' if tbr else ''), + 'format_id': join_nonempty('rtmp', tbr), }) else: formats.append({ 'url': source_url, 'tbr': tbr, - 'format_id': 'http{}'.format(f'-{tbr}' if tbr else ''), + 'format_id': join_nonempty('http', tbr), }) return { diff --git a/yt_dlp/extractor/qqmusic.py b/yt_dlp/extractor/qqmusic.py index a57dd5fb35..d0238692f6 100644 --- a/yt_dlp/extractor/qqmusic.py +++ b/yt_dlp/extractor/qqmusic.py @@ -1,48 +1,125 @@ +import base64 +import functools +import json import random -import re import time from .common import InfoExtractor from ..utils import ( ExtractorError, + OnDemandPagedList, clean_html, + int_or_none, + join_nonempty, + js_to_json, + str_or_none, strip_jsonp, + traverse_obj, unescapeHTML, + url_or_none, + urljoin, ) -class QQMusicIE(InfoExtractor): +class QQMusicBaseIE(InfoExtractor): + def _get_cookie(self, key, default=None): + return getattr(self._get_cookies('https://y.qq.com').get(key), 'value', default) + + def _get_g_tk(self): + n = 5381 + for c in self._get_cookie('qqmusic_key', ''): + n += (n << 5) + ord(c) + return n & 2147483647 + + def _get_uin(self): + return int_or_none(self._get_cookie('uin')) or 0 + + @property + def is_logged_in(self): + return bool(self._get_uin() and self._get_cookie('fqm_pvqid')) + + # Reference: m_r_GetRUin() in top_player.js + # http://imgcache.gtimg.cn/music/portal_v3/y/top_player.js + @staticmethod + def _m_r_get_ruin(): + cur_ms = int(time.time() * 1000) % 1000 + return int(round(random.random() * 2147483647) * cur_ms % 1E10) + + def _download_init_data(self, 
url, mid, fatal=True): + webpage = self._download_webpage(url, mid, fatal=fatal) + return self._search_json(r'window\.__INITIAL_DATA__\s*=', webpage, + 'init data', mid, transform_source=js_to_json, fatal=fatal) + + def _make_fcu_req(self, req_dict, mid, headers={}, **kwargs): + return self._download_json( + 'https://u.y.qq.com/cgi-bin/musicu.fcg', mid, data=json.dumps({ + 'comm': { + 'cv': 0, + 'ct': 24, + 'format': 'json', + 'uin': self._get_uin(), + }, + **req_dict, + }, separators=(',', ':')).encode(), headers=headers, **kwargs) + + +class QQMusicIE(QQMusicBaseIE): IE_NAME = 'qqmusic' IE_DESC = 'QQ音乐' - _VALID_URL = r'https?://y\.qq\.com/n/yqq/song/(?P<id>[0-9A-Za-z]+)\.html' + _VALID_URL = r'https?://y\.qq\.com/n/ryqq/songDetail/(?P<id>[0-9A-Za-z]+)' _TESTS = [{ - 'url': 'https://y.qq.com/n/yqq/song/004295Et37taLD.html', + 'url': 'https://y.qq.com/n/ryqq/songDetail/004Ti8rT003TaZ', + 'md5': 'd7adc5c438d12e2cb648cca81593fd47', + 'info_dict': { + 'id': '004Ti8rT003TaZ', + 'ext': 'mp3', + 'title': '永夜のパレード (永夜的游行)', + 'album': '幻想遊園郷 -Fantastic Park-', + 'release_date': '20111230', + 'duration': 281, + 'creators': ['ケーキ姫', 'JUMA'], + 'genres': ['Pop'], + 'description': 'md5:b5261f3d595657ae561e9e6aee7eb7d9', + 'size': 4501244, + 'thumbnail': r're:^https?://.*\.jpg(?:$|[#?])', + 'subtitles': 'count:1', + }, + }, { + 'url': 'https://y.qq.com/n/ryqq/songDetail/004295Et37taLD', 'md5': '5f1e6cea39e182857da7ffc5ef5e6bb8', 'info_dict': { 'id': '004295Et37taLD', 'ext': 'mp3', 'title': '可惜没如果', - 'release_date': '20141227', - 'creator': '林俊杰', - 'description': 'md5:d85afb3051952ecc50a1ee8a286d1eac', - 'thumbnail': r're:^https?://.*\.jpg$', + 'album': '新地球 - 人 (Special Edition)', + 'release_date': '20150129', + 'duration': 298, + 'creators': ['林俊杰'], + 'genres': ['Pop'], + 'description': 'md5:f568421ff618d2066e74b65a04149c4e', + 'thumbnail': r're:^https?://.*\.jpg(?:$|[#?])', }, + 'skip': 'premium member only', }, { 'note': 'There is no mp3-320 version of this song.', - 
'url': 'https://y.qq.com/n/yqq/song/004MsGEo3DdNxV.html', - 'md5': 'fa3926f0c585cda0af8fa4f796482e3e', + 'url': 'https://y.qq.com/n/ryqq/songDetail/004MsGEo3DdNxV', + 'md5': '028aaef1ae13d8a9f4861a92614887f9', 'info_dict': { 'id': '004MsGEo3DdNxV', 'ext': 'mp3', 'title': '如果', + 'album': '新传媒电视连续剧金曲系列II', 'release_date': '20050626', - 'creator': '李季美', - 'description': 'md5:46857d5ed62bc4ba84607a805dccf437', - 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 220, + 'creators': ['李季美'], + 'genres': [], + 'description': 'md5:fc711212aa623b28534954dc4bd67385', + 'size': 3535730, + 'thumbnail': r're:^https?://.*\.jpg(?:$|[#?])', }, }, { 'note': 'lyrics not in .lrc format', - 'url': 'https://y.qq.com/n/yqq/song/001JyApY11tIp6.html', + 'url': 'https://y.qq.com/n/ryqq/songDetail/001JyApY11tIp6', 'info_dict': { 'id': '001JyApY11tIp6', 'ext': 'mp3', @@ -50,185 +127,193 @@ class QQMusicIE(InfoExtractor): 'release_date': '19970225', 'creator': 'Dark Funeral', 'description': 'md5:c9b20210587cbcd6836a1c597bab4525', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'params': { - 'skip_download': True, + 'thumbnail': r're:^https?://.*\.jpg(?:$|[#?])', }, + 'params': {'skip_download': True}, + 'skip': 'no longer available', }] _FORMATS = { - 'mp3-320': {'prefix': 'M800', 'ext': 'mp3', 'preference': 40, 'abr': 320}, - 'mp3-128': {'prefix': 'M500', 'ext': 'mp3', 'preference': 30, 'abr': 128}, - 'm4a': {'prefix': 'C200', 'ext': 'm4a', 'preference': 10}, + 'F000': {'name': 'flac', 'prefix': 'F000', 'ext': 'flac', 'preference': 60}, + 'A000': {'name': 'ape', 'prefix': 'A000', 'ext': 'ape', 'preference': 50}, + 'M800': {'name': '320mp3', 'prefix': 'M800', 'ext': 'mp3', 'preference': 40, 'abr': 320}, + 'M500': {'name': '128mp3', 'prefix': 'M500', 'ext': 'mp3', 'preference': 30, 'abr': 128}, + 'C400': {'name': '96aac', 'prefix': 'C400', 'ext': 'm4a', 'preference': 20, 'abr': 96}, + 'C200': {'name': '48aac', 'prefix': 'C200', 'ext': 'm4a', 'preference': 20, 'abr': 48}, } - # Reference: 
m_r_GetRUin() in top_player.js - # http://imgcache.gtimg.cn/music/portal_v3/y/top_player.js - @staticmethod - def m_r_get_ruin(): - cur_ms = int(time.time() * 1000) % 1000 - return int(round(random.random() * 2147483647) * cur_ms % 1E10) - def _real_extract(self, url): mid = self._match_id(url) - detail_info_page = self._download_webpage( - f'http://s.plcloud.music.qq.com/fcgi-bin/fcg_yqq_song_detail_info.fcg?songmid={mid}&play=0', - mid, note='Download song detail info', - errnote='Unable to get song detail info', encoding='gbk') - - song_name = self._html_search_regex( - r"songname:\s*'([^']+)'", detail_info_page, 'song name') - - publish_time = self._html_search_regex( - r'发行时间:(\d{4}-\d{2}-\d{2})', detail_info_page, - 'publish time', default=None) - if publish_time: - publish_time = publish_time.replace('-', '') - - singer = self._html_search_regex( - r"singer:\s*'([^']+)", detail_info_page, 'singer', default=None) - - lrc_content = self._html_search_regex( - r'<div class="content" id="lrc_content"[^<>]*>([^<>]+)</div>', - detail_info_page, 'LRC lyrics', default=None) - if lrc_content: - lrc_content = lrc_content.replace('\\n', '\n') - - thumbnail_url = None - albummid = self._search_regex( - [r'albummid:\'([0-9a-zA-Z]+)\'', r'"albummid":"([0-9a-zA-Z]+)"'], - detail_info_page, 'album mid', default=None) - if albummid: - thumbnail_url = f'http://i.gtimg.cn/music/photo/mid_album_500/{albummid[-2:-1]}/{albummid[-1]}/{albummid}.jpg' - - guid = self.m_r_get_ruin() - - vkey = self._download_json( - f'http://base.music.qq.com/fcgi-bin/fcg_musicexpress.fcg?json=3&guid={guid}', - mid, note='Retrieve vkey', errnote='Unable to get vkey', - transform_source=strip_jsonp)['key'] - + init_data = self._download_init_data(url, mid, fatal=False) + info_data = self._make_fcu_req({'info': { + 'module': 'music.pf_song_detail_svr', + 'method': 'get_song_detail_yqq', + 'param': { + 'song_mid': mid, + 'song_type': 0, + }, + }}, mid, note='Downloading song 
info')['info']['data']['track_info'] + + media_mid = info_data['file']['media_mid'] + + data = self._make_fcu_req({ + 'req_1': { + 'module': 'vkey.GetVkeyServer', + 'method': 'CgiGetVkey', + 'param': { + 'guid': str(self._m_r_get_ruin()), + 'songmid': [mid] * len(self._FORMATS), + 'songtype': [0] * len(self._FORMATS), + 'uin': str(self._get_uin()), + 'loginflag': 1, + 'platform': '20', + 'filename': [f'{f["prefix"]}{media_mid}.{f["ext"]}' for f in self._FORMATS.values()], + }, + }, + 'req_2': { + 'module': 'music.musichallSong.PlayLyricInfo', + 'method': 'GetPlayLyricInfo', + 'param': {'songMID': mid}, + }, + }, mid, note='Downloading formats and lyric', headers=self.geo_verification_headers()) + + code = traverse_obj(data, ('req_1', 'code', {int})) + if code != 0: + raise ExtractorError(f'Failed to download format info, error code {code or "unknown"}') formats = [] - for format_id, details in self._FORMATS.items(): + for media_info in traverse_obj(data, ( + 'req_1', 'data', 'midurlinfo', lambda _, v: v['songmid'] == mid and v['purl']), + ): + format_key = traverse_obj(media_info, ('filename', {str}, {lambda x: x[:4]})) + format_info = self._FORMATS.get(format_key) or {} + format_id = format_info.get('name') formats.append({ - 'url': 'http://cc.stream.qqmusic.qq.com/{}{}.{}?vkey={}&guid={}&fromtag=0'.format( - details['prefix'], mid, details['ext'], vkey, guid), + 'url': urljoin('https://dl.stream.qqmusic.qq.com', media_info['purl']), 'format': format_id, 'format_id': format_id, - 'quality': details['preference'], - 'abr': details.get('abr'), + 'size': traverse_obj(info_data, ('file', f'size_{format_id}', {int_or_none})), + 'quality': format_info.get('preference'), + 'abr': format_info.get('abr'), + 'ext': format_info.get('ext'), + 'vcodec': 'none', }) - self._check_formats(formats, mid) - actual_lrc_lyrics = ''.join( - line + '\n' for line in re.findall( - r'(?m)^(\[[0-9]{2}:[0-9]{2}(?:\.[0-9]{2,})?\][^\n]*|\[[^\]]*\])', lrc_content)) + if not formats and not 
self.is_logged_in: + self.raise_login_required() + + if traverse_obj(data, ('req_2', 'code')): + self.report_warning(f'Failed to download lyric, error {data["req_2"]["code"]!r}') + lrc_content = traverse_obj(data, ('req_2', 'data', 'lyric', {lambda x: base64.b64decode(x).decode('utf-8')})) info_dict = { 'id': mid, 'formats': formats, - 'title': song_name, - 'release_date': publish_time, - 'creator': singer, - 'description': lrc_content, - 'thumbnail': thumbnail_url, + **traverse_obj(info_data, { + 'title': ('title', {str}), + 'album': ('album', 'title', {str}, {lambda x: x or None}), + 'release_date': ('time_public', {lambda x: x.replace('-', '') or None}), + 'creators': ('singer', ..., 'name', {str}), + 'alt_title': ('subtitle', {str}, {lambda x: x or None}), + 'duration': ('interval', {int_or_none}), + }), + **traverse_obj(init_data, ('detail', { + 'thumbnail': ('picurl', {url_or_none}), + 'description': ('info', 'intro', 'content', ..., 'value', {str}), + 'genres': ('info', 'genre', 'content', ..., 'value', {str}, all), + }), get_all=False), } - if actual_lrc_lyrics: - info_dict['subtitles'] = { - 'origin': [{ - 'ext': 'lrc', - 'data': actual_lrc_lyrics, - }], - } + if lrc_content: + info_dict['subtitles'] = {'origin': [{'ext': 'lrc', 'data': lrc_content}]} + info_dict['description'] = join_nonempty(info_dict.get('description'), lrc_content, delim='\n') return info_dict -class QQPlaylistBaseIE(InfoExtractor): - @staticmethod - def qq_static_url(category, mid): - return f'http://y.qq.com/y/static/{category}/{mid[-2]}/{mid[-1]}/{mid}.html' - - def get_singer_all_songs(self, singmid, num): - return self._download_webpage( - r'https://c.y.qq.com/v8/fcg-bin/fcg_v8_singer_track_cp.fcg', singmid, - query={ - 'format': 'json', - 'inCharset': 'utf8', - 'outCharset': 'utf-8', - 'platform': 'yqq', - 'needNewCode': 0, - 'singermid': singmid, - 'order': 'listen', - 'begin': 0, - 'num': num, - 'songstatus': 1, - }) - - def get_entries_from_page(self, singmid): - entries = [] 
- - default_num = 1 - json_text = self.get_singer_all_songs(singmid, default_num) - json_obj_all_songs = self._parse_json(json_text, singmid) - - if json_obj_all_songs['code'] == 0: - total = json_obj_all_songs['data']['total'] - json_text = self.get_singer_all_songs(singmid, total) - json_obj_all_songs = self._parse_json(json_text, singmid) - - for item in json_obj_all_songs['data']['list']: - if item['musicData'].get('songmid') is not None: - songmid = item['musicData']['songmid'] - entries.append(self.url_result( - rf'https://y.qq.com/n/yqq/song/{songmid}.html', 'QQMusic', songmid)) - - return entries - - -class QQMusicSingerIE(QQPlaylistBaseIE): +class QQMusicSingerIE(QQMusicBaseIE): IE_NAME = 'qqmusic:singer' IE_DESC = 'QQ音乐 - 歌手' - _VALID_URL = r'https?://y\.qq\.com/n/yqq/singer/(?P<id>[0-9A-Za-z]+)\.html' - _TEST = { - 'url': 'https://y.qq.com/n/yqq/singer/001BLpXF2DyJe2.html', + _VALID_URL = r'https?://y\.qq\.com/n/ryqq/singer/(?P<id>[0-9A-Za-z]+)' + _TESTS = [{ + 'url': 'https://y.qq.com/n/ryqq/singer/001BLpXF2DyJe2', 'info_dict': { 'id': '001BLpXF2DyJe2', 'title': '林俊杰', - 'description': 'md5:870ec08f7d8547c29c93010899103751', + 'description': 'md5:10624ce73b06fa400bc846f59b0305fa', + 'thumbnail': r're:^https?://.*\.jpg(?:$|[#?])', }, - 'playlist_mincount': 12, - } + 'playlist_mincount': 100, + }, { + 'url': 'https://y.qq.com/n/ryqq/singer/000Q00f213YzNV', + 'info_dict': { + 'id': '000Q00f213YzNV', + 'title': '桃几OvO', + 'description': '小破站小唱见~希望大家喜欢听我唱歌~!', + 'thumbnail': r're:^https?://.*\.jpg(?:$|[#?])', + }, + 'playlist_count': 12, + 'playlist': [{ + 'info_dict': { + 'id': '0016cvsy02mmCl', + 'ext': 'mp3', + 'title': '群青', + 'album': '桃几2021年翻唱集', + 'release_date': '20210913', + 'duration': 248, + 'creators': ['桃几OvO'], + 'genres': ['Pop'], + 'description': 'md5:4296005a04edcb5cdbe0889d5055a7ae', + 'size': 3970822, + 'thumbnail': r're:^https?://.*\.jpg(?:$|[#?])', + }, + }], + }] + + _PAGE_SIZE = 50 + + def _fetch_page(self, mid, page_size, page_num): 
+ data = self._make_fcu_req({'req_1': { + 'module': 'music.web_singer_info_svr', + 'method': 'get_singer_detail_info', + 'param': { + 'sort': 5, + 'singermid': mid, + 'sin': page_num * page_size, + 'num': page_size, + }}}, mid, note=f'Downloading page {page_num}') + yield from traverse_obj(data, ('req_1', 'data', 'songlist', ..., {lambda x: self.url_result( + f'https://y.qq.com/n/ryqq/songDetail/{x["mid"]}', QQMusicIE, x['mid'], x.get('title'))})) def _real_extract(self, url): mid = self._match_id(url) + init_data = self._download_init_data(url, mid, fatal=False) - entries = self.get_entries_from_page(mid) - singer_page = self._download_webpage(url, mid, 'Download singer page') - singer_name = self._html_search_regex( - r"singername\s*:\s*'(.*?)'", singer_page, 'singer name', default=None) - singer_desc = None - - if mid: - singer_desc_page = self._download_xml( - 'http://s.plcloud.music.qq.com/fcgi-bin/fcg_get_singer_desc.fcg', mid, - 'Donwload singer description XML', - query={'utf8': 1, 'outCharset': 'utf-8', 'format': 'xml', 'singermid': mid}, - headers={'Referer': 'https://y.qq.com/n/yqq/singer/'}) + return self.playlist_result( + OnDemandPagedList(functools.partial(self._fetch_page, mid, self._PAGE_SIZE), self._PAGE_SIZE), + mid, **traverse_obj(init_data, ('singerDetail', { + 'title': ('basic_info', 'name', {str}), + 'description': ('ex_info', 'desc', {str}), + 'thumbnail': ('pic', 'pic', {url_or_none}), + }))) - singer_desc = singer_desc_page.find('./data/info/desc').text - return self.playlist_result(entries, mid, singer_name, singer_desc) +class QQPlaylistBaseIE(InfoExtractor): + def _extract_entries(self, info_json, path): + for song in traverse_obj(info_json, path): + song_mid = song.get('songmid') + if not song_mid: + continue + yield self.url_result( + f'https://y.qq.com/n/ryqq/songDetail/{song_mid}', + QQMusicIE, song_mid, song.get('songname')) class QQMusicAlbumIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:album' IE_DESC = 'QQ音乐 - 专辑' - _VALID_URL = 
r'https?://y\.qq\.com/n/yqq/album/(?P<id>[0-9A-Za-z]+)\.html' + _VALID_URL = r'https?://y\.qq\.com/n/ryqq/albumDetail/(?P<id>[0-9A-Za-z]+)' _TESTS = [{ - 'url': 'https://y.qq.com/n/yqq/album/000gXCTb2AhRR1.html', + 'url': 'https://y.qq.com/n/ryqq/albumDetail/000gXCTb2AhRR1', 'info_dict': { 'id': '000gXCTb2AhRR1', 'title': '我们都是这样长大的', @@ -236,10 +321,10 @@ class QQMusicAlbumIE(QQPlaylistBaseIE): }, 'playlist_count': 4, }, { - 'url': 'https://y.qq.com/n/yqq/album/002Y5a3b3AlCu3.html', + 'url': 'https://y.qq.com/n/ryqq/albumDetail/002Y5a3b3AlCu3', 'info_dict': { 'id': '002Y5a3b3AlCu3', - 'title': '그리고...', + 'title': '그리고…', 'description': 'md5:a48823755615508a95080e81b51ba729', }, 'playlist_count': 8, @@ -248,49 +333,45 @@ class QQMusicAlbumIE(QQPlaylistBaseIE): def _real_extract(self, url): mid = self._match_id(url) - album = self._download_json( - f'http://i.y.qq.com/v8/fcg-bin/fcg_v8_album_info_cp.fcg?albummid={mid}&format=json', - mid, 'Download album page')['data'] + album_json = self._download_json( + 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_album_info_cp.fcg', + mid, 'Download album page', + query={'albummid': mid, 'format': 'json'})['data'] - entries = [ - self.url_result( - 'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid'], - ) for song in album['list'] - ] - album_name = album.get('name') - album_detail = album.get('desc') - if album_detail is not None: - album_detail = album_detail.strip() + entries = self._extract_entries(album_json, ('list', ...)) - return self.playlist_result(entries, mid, album_name, album_detail) + return self.playlist_result(entries, mid, **traverse_obj(album_json, { + 'title': ('name', {str}), + 'description': ('desc', {str.strip}), + })) class QQMusicToplistIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:toplist' IE_DESC = 'QQ音乐 - 排行榜' - _VALID_URL = r'https?://y\.qq\.com/n/yqq/toplist/(?P<id>[0-9]+)\.html' + _VALID_URL = r'https?://y\.qq\.com/n/ryqq/toplist/(?P<id>[0-9]+)' _TESTS = [{ - 'url': 
'https://y.qq.com/n/yqq/toplist/123.html', + 'url': 'https://y.qq.com/n/ryqq/toplist/123', 'info_dict': { 'id': '123', - 'title': '美国iTunes榜', - 'description': 'md5:89db2335fdbb10678dee2d43fe9aba08', + 'title': r're:美国热门音乐榜 \d{4}-\d{2}-\d{2}', + 'description': '美国热门音乐榜,每周一更新。', }, - 'playlist_count': 100, + 'playlist_count': 95, }, { - 'url': 'https://y.qq.com/n/yqq/toplist/3.html', + 'url': 'https://y.qq.com/n/ryqq/toplist/3', 'info_dict': { 'id': '3', - 'title': '巅峰榜·欧美', - 'description': 'md5:5a600d42c01696b26b71f8c4d43407da', + 'title': r're:巅峰榜·欧美 \d{4}-\d{2}-\d{2}', + 'description': 'md5:4def03b60d3644be4c9a36f21fd33857', }, 'playlist_count': 100, }, { - 'url': 'https://y.qq.com/n/yqq/toplist/106.html', + 'url': 'https://y.qq.com/n/ryqq/toplist/106', 'info_dict': { 'id': '106', - 'title': '韩国Mnet榜', + 'title': r're:韩国Mnet榜 \d{4}-\d{2}-\d{2}', 'description': 'md5:cb84b325215e1d21708c615cac82a6e7', }, 'playlist_count': 50, @@ -304,33 +385,20 @@ class QQMusicToplistIE(QQPlaylistBaseIE): note='Download toplist page', query={'type': 'toplist', 'topid': list_id, 'format': 'json'}) - entries = [self.url_result( - 'https://y.qq.com/n/yqq/song/' + song['data']['songmid'] + '.html', 'QQMusic', - song['data']['songmid']) - for song in toplist_json['songlist']] - - topinfo = toplist_json.get('topinfo', {}) - list_name = topinfo.get('ListName') - list_description = topinfo.get('info') - return self.playlist_result(entries, list_id, list_name, list_description) + return self.playlist_result( + self._extract_entries(toplist_json, ('songlist', ..., 'data')), list_id, + playlist_title=join_nonempty(*traverse_obj( + toplist_json, ((('topinfo', 'ListName'), 'update_time'), None)), delim=' '), + playlist_description=traverse_obj(toplist_json, ('topinfo', 'info'))) class QQMusicPlaylistIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:playlist' IE_DESC = 'QQ音乐 - 歌单' - _VALID_URL = r'https?://y\.qq\.com/n/yqq/playlist/(?P<id>[0-9]+)\.html' + _VALID_URL = 
r'https?://y\.qq\.com/n/ryqq/playlist/(?P<id>[0-9]+)' _TESTS = [{ - 'url': 'http://y.qq.com/n/yqq/playlist/3462654915.html', - 'info_dict': { - 'id': '3462654915', - 'title': '韩国5月新歌精选下旬', - 'description': 'md5:d2c9d758a96b9888cf4fe82f603121d4', - }, - 'playlist_count': 40, - 'skip': 'playlist gone', - }, { - 'url': 'https://y.qq.com/n/yqq/playlist/1374105607.html', + 'url': 'https://y.qq.com/n/ryqq/playlist/1374105607', 'info_dict': { 'id': '1374105607', 'title': '易入人心的华语民谣', @@ -346,19 +414,83 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE): 'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg', list_id, 'Download list page', query={'type': 1, 'json': 1, 'utf8': 1, 'onlysong': 0, 'disstid': list_id}, - transform_source=strip_jsonp) + transform_source=strip_jsonp, headers={'Referer': url}) if not len(list_json.get('cdlist', [])): - if list_json.get('code'): - raise ExtractorError( - 'QQ Music said: error %d in fetching playlist info' % list_json['code'], - expected=True) - raise ExtractorError('Unable to get playlist info') - - cdlist = list_json['cdlist'][0] - entries = [self.url_result( - 'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid']) - for song in cdlist['songlist']] - - list_name = cdlist.get('dissname') - list_description = clean_html(unescapeHTML(cdlist.get('desc'))) - return self.playlist_result(entries, list_id, list_name, list_description) + raise ExtractorError(join_nonempty( + 'Unable to get playlist info', + join_nonempty('code', 'subcode', from_dict=list_json), + list_json.get('msg'), delim=': ')) + + entries = self._extract_entries(list_json, ('cdlist', 0, 'songlist', ...)) + + return self.playlist_result(entries, list_id, **traverse_obj(list_json, ('cdlist', 0, { + 'title': ('dissname', {str}), + 'description': ('desc', {unescapeHTML}, {clean_html}), + }))) + + +class QQMusicVideoIE(QQMusicBaseIE): + IE_NAME = 'qqmusic:mv' + IE_DESC = 'QQ音乐 - MV' + _VALID_URL = 
r'https?://y\.qq\.com/n/ryqq/mv/(?P<id>[0-9A-Za-z]+)' + + _TESTS = [{ + 'url': 'https://y.qq.com/n/ryqq/mv/002Vsarh3SVU8K', + 'info_dict': { + 'id': '002Vsarh3SVU8K', + 'ext': 'mp4', + 'title': 'The Chant (Extended Mix / Audio)', + 'description': '', + 'thumbnail': r're:^https?://.*\.jpg(?:$|[#?])', + 'release_timestamp': 1688918400, + 'release_date': '20230709', + 'duration': 313, + 'creators': ['Duke Dumont'], + 'view_count': int, + }, + }] + + def _parse_url_formats(self, url_data): + return traverse_obj(url_data, ('mp4', lambda _, v: v['freeflow_url'], { + 'url': ('freeflow_url', 0, {url_or_none}), + 'filesize': ('fileSize', {int_or_none}), + 'format_id': ('newFileType', {str_or_none}), + })) + + def _real_extract(self, url): + video_id = self._match_id(url) + + video_info = self._make_fcu_req({ + 'mvInfo': { + 'module': 'music.video.VideoData', + 'method': 'get_video_info_batch', + 'param': { + 'vidlist': [video_id], + 'required': [ + 'vid', 'type', 'sid', 'cover_pic', 'duration', 'singers', + 'video_pay', 'hint', 'code', 'msg', 'name', 'desc', + 'playcnt', 'pubdate', 'play_forbid_reason'], + }, + }, + 'mvUrl': { + 'module': 'music.stream.MvUrlProxy', + 'method': 'GetMvUrls', + 'param': {'vids': [video_id]}, + }, + }, video_id, headers=self.geo_verification_headers()) + if traverse_obj(video_info, ('mvInfo', 'data', video_id, 'play_forbid_reason')) == 3: + self.raise_geo_restricted() + + return { + 'id': video_id, + 'formats': self._parse_url_formats(traverse_obj(video_info, ('mvUrl', 'data', video_id))), + **traverse_obj(video_info, ('mvInfo', 'data', video_id, { + 'title': ('name', {str}), + 'description': ('desc', {str}), + 'thumbnail': ('cover_pic', {url_or_none}), + 'release_timestamp': ('pubdate', {int_or_none}), + 'duration': ('duration', {int_or_none}), + 'creators': ('singers', ..., 'name', {str}), + 'view_count': ('playcnt', {int_or_none}), + })), + } diff --git a/yt_dlp/extractor/radiko.py b/yt_dlp/extractor/radiko.py index b0b6681c9f..f94d6a3e72 
100644 --- a/yt_dlp/extractor/radiko.py +++ b/yt_dlp/extractor/radiko.py @@ -7,6 +7,7 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, clean_html, + join_nonempty, time_seconds, try_call, unified_timestamp, @@ -167,7 +168,7 @@ class RadikoBaseIE(InfoExtractor): class RadikoIE(RadikoBaseIE): - _VALID_URL = r'https?://(?:www\.)?radiko\.jp/#!/ts/(?P<station>[A-Z0-9-]+)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?radiko\.jp/#!/ts/(?P<station>[A-Z0-9-]+)/(?P<timestring>\d+)' _TESTS = [{ # QRR (文化放送) station provides <desc> @@ -183,8 +184,9 @@ class RadikoIE(RadikoBaseIE): }] def _real_extract(self, url): - station, video_id = self._match_valid_url(url).groups() - vid_int = unified_timestamp(video_id, False) + station, timestring = self._match_valid_url(url).group('station', 'timestring') + video_id = join_nonempty(station, timestring) + vid_int = unified_timestamp(timestring, False) prog, station_program, ft, radio_begin, radio_end = self._find_program(video_id, station, vid_int) auth_token, area_id = self._auth_client() @@ -207,7 +209,7 @@ class RadikoIE(RadikoBaseIE): 'ft': radio_begin, 'end_at': radio_end, 'to': radio_end, - 'seek': video_id, + 'seek': timestring, }, ), } diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py index ff21963541..9d90439841 100644 --- a/yt_dlp/extractor/radiofrance.py +++ b/yt_dlp/extractor/radiofrance.py @@ -16,7 +16,7 @@ from ..utils import ( class RadioFranceIE(InfoExtractor): - _VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)' + _VALID_URL = r'https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)' IE_NAME = 'radiofrance' _TEST = { diff --git a/yt_dlp/extractor/reddit.py b/yt_dlp/extractor/reddit.py index bc3e5f7eee..b633dc48af 100644 --- a/yt_dlp/extractor/reddit.py +++ b/yt_dlp/extractor/reddit.py @@ -1,3 +1,4 @@ +import json import urllib.parse from .common import InfoExtractor @@ -17,7 +18,7 @@ from ..utils import ( class 
RedditIE(InfoExtractor): _NETRC_MACHINE = 'reddit' - _VALID_URL = r'https?://(?P<host>(?:\w+\.)?reddit(?:media)?\.com)/(?P<slug>(?:(?:r|user)/[^/]+/)?comments/(?P<id>[^/?#&]+))' + _VALID_URL = r'https?://(?:\w+\.)?reddit(?:media)?\.com/(?P<slug>(?:(?:r|user)/[^/]+/)?comments/(?P<id>[^/?#&]+))' _TESTS = [{ 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/', 'info_dict': { @@ -251,15 +252,15 @@ class RedditIE(InfoExtractor): return {'en': [{'url': caption_url}]} def _real_extract(self, url): - host, slug, video_id = self._match_valid_url(url).group('host', 'slug', 'id') + slug, video_id = self._match_valid_url(url).group('slug', 'id') - data = self._download_json( - f'https://{host}/{slug}/.json', video_id, fatal=False, expected_status=403) - if not data: - fallback_host = 'old.reddit.com' if host != 'old.reddit.com' else 'www.reddit.com' - self.to_screen(f'{host} request failed, retrying with {fallback_host}') + try: data = self._download_json( - f'https://{fallback_host}/{slug}/.json', video_id, expected_status=403) + f'https://www.reddit.com/{slug}/.json', video_id, expected_status=403) + except ExtractorError as e: + if isinstance(e.cause, json.JSONDecodeError): + self.raise_login_required('Account authentication is required') + raise if traverse_obj(data, 'error') == 403: reason = data.get('reason') diff --git a/yt_dlp/extractor/reverbnation.py b/yt_dlp/extractor/reverbnation.py index ddf8c3753f..f3bcc2c328 100644 --- a/yt_dlp/extractor/reverbnation.py +++ b/yt_dlp/extractor/reverbnation.py @@ -6,7 +6,7 @@ from ..utils import ( class ReverbNationIE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?reverbnation\.com/.*?/song/(?P<id>\d+).*?$' + _VALID_URL = r'https?://(?:www\.)?reverbnation\.com/.*?/song/(?P<id>\d+).*?$' _TESTS = [{ 'url': 'http://www.reverbnation.com/alkilados/song/16965047-mona-lisa', 'md5': 'c0aaf339bcee189495fdf5a8c8ba8645', diff --git a/yt_dlp/extractor/rtp.py b/yt_dlp/extractor/rtp.py index 
944e8636ab..26aec2e4cc 100644 --- a/yt_dlp/extractor/rtp.py +++ b/yt_dlp/extractor/rtp.py @@ -8,7 +8,7 @@ from ..utils import js_to_json class RTPIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P<program_id>[0-9]+)/(?P<id>[^/?#]+)/?' + _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/(?:(?:estudoemcasa|palco|zigzag)/)?p(?P<program_id>[0-9]+)/(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas', 'md5': 'e736ce0c665e459ddb818546220b4ef8', @@ -19,9 +19,25 @@ class RTPIE(InfoExtractor): 'description': 'As paixões musicais de António Cartaxo e António Macedo', 'thumbnail': r're:^https?://.*\.jpg', }, + }, { + 'url': 'https://www.rtp.pt/play/zigzag/p13166/e757904/25-curiosidades-25-de-abril', + 'md5': '9a81ed53f2b2197cfa7ed455b12f8ade', + 'info_dict': { + 'id': 'e757904', + 'ext': 'mp4', + 'title': '25 Curiosidades, 25 de Abril', + 'description': 'Estudar ou não estudar - Em cada um dos episódios descobrimos uma curiosidade acerca de como era viver em Portugal antes da revolução do 25 de abr', + 'thumbnail': r're:^https?://.*\.jpg', + }, }, { 'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas', 'only_matching': True, + }, { + 'url': 'https://www.rtp.pt/play/estudoemcasa/p7776/portugues-1-ano', + 'only_matching': True, + }, { + 'url': 'https://www.rtp.pt/play/palco/p13785/l7nnon', + 'only_matching': True, }] _RX_OBFUSCATION = re.compile(r'''(?xs) @@ -49,17 +65,17 @@ class RTPIE(InfoExtractor): f, config = self._search_regex( r'''(?sx) - var\s+f\s*=\s*(?P<f>".*?"|{[^;]+?});\s* + (?:var\s+f\s*=\s*(?P<f>".*?"|{[^;]+?});\s*)? 
var\s+player1\s+=\s+new\s+RTPPlayer\s*\((?P<config>{(?:(?!\*/).)+?})\);(?!\s*\*/) ''', webpage, 'player config', group=('f', 'config')) - f = self._parse_json( - f, video_id, - lambda data: self.__unobfuscate(data, video_id=video_id)) config = self._parse_json( config, video_id, lambda data: self.__unobfuscate(data, video_id=video_id)) + f = config['file'] if not f else self._parse_json( + f, video_id, + lambda data: self.__unobfuscate(data, video_id=video_id)) formats = [] if isinstance(f, dict): diff --git a/yt_dlp/extractor/rtvslo.py b/yt_dlp/extractor/rtvslo.py index e71d01d1e0..9c2e6fb6b5 100644 --- a/yt_dlp/extractor/rtvslo.py +++ b/yt_dlp/extractor/rtvslo.py @@ -1,3 +1,5 @@ +import re + from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -6,6 +8,7 @@ from ..utils import ( traverse_obj, unified_timestamp, url_or_none, + urljoin, ) @@ -21,75 +24,73 @@ class RTVSLOIE(InfoExtractor): _API_BASE = 'https://api.rtvslo.si/ava/{}/{}?client_id=82013fb3a531d5414f478747c1aca622' SUB_LANGS_MAP = {'Slovenski': 'sl'} - _TESTS = [ - { - 'url': 'https://www.rtvslo.si/rtv365/arhiv/174842550?s=tv', - 'info_dict': { - 'id': '174842550', - 'ext': 'mp4', - 'release_timestamp': 1643140032, - 'upload_date': '20220125', - 'series': 'Dnevnik', - 'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/92/dnevnik_3_wide2.jpg', - 'description': 'md5:76a18692757aeb8f0f51221106277dd2', - 'timestamp': 1643137046, - 'title': 'Dnevnik', - 'series_id': '92', - 'release_date': '20220125', - 'duration': 1789, - }, - }, { - 'url': 'https://365.rtvslo.si/arhiv/utrip/174843754', - 'info_dict': { - 'id': '174843754', - 'ext': 'mp4', - 'series_id': '94', - 'release_date': '20220129', - 'timestamp': 1643484455, - 'title': 'Utrip', - 'duration': 813, - 'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/94/utrip_1_wide2.jpg', - 'description': 'md5:77f2892630c7b17bb7a5bb84319020c9', - 'release_timestamp': 1643485825, - 'upload_date': '20220129', - 'series': 
'Utrip', - }, - }, { - 'url': 'https://365.rtvslo.si/arhiv/il-giornale-della-sera/174844609', - 'info_dict': { - 'id': '174844609', - 'ext': 'mp3', - 'series_id': '106615841', - 'title': 'Il giornale della sera', - 'duration': 1328, - 'series': 'Il giornale della sera', - 'timestamp': 1643743800, - 'release_timestamp': 1643745424, - 'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/il-giornale-della-sera_wide2.jpg', - 'upload_date': '20220201', - 'tbr': 128000, - 'release_date': '20220201', - }, - }, { - 'url': 'https://365.rtvslo.si/arhiv/razred-zase/148350750', - 'info_dict': { - 'id': '148350750', - 'ext': 'mp4', - 'title': 'Prvi šolski dan, mozaična oddaja za mlade', - 'series': 'Razred zase', - 'series_id': '148185730', - 'duration': 1481, - 'upload_date': '20121019', - 'timestamp': 1350672122, - 'release_date': '20121019', - 'release_timestamp': 1350672122, - 'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/148185730/razred_zase_2014_logo_4d_wide2.jpg', - }, - }, { - 'url': 'https://4d.rtvslo.si/arhiv/dnevnik/174842550', - 'only_matching': True, + _TESTS = [{ + 'url': 'https://www.rtvslo.si/rtv365/arhiv/174842550?s=tv', + 'info_dict': { + 'id': '174842550', + 'ext': 'mp4', + 'release_timestamp': 1643140032, + 'upload_date': '20220125', + 'series': 'Dnevnik', + 'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/92/dnevnik_3_wide2.jpg', + 'description': 'md5:76a18692757aeb8f0f51221106277dd2', + 'timestamp': 1643137046, + 'title': 'Dnevnik', + 'series_id': '92', + 'release_date': '20220125', + 'duration': 1789, + }, + }, { + 'url': 'https://365.rtvslo.si/arhiv/utrip/174843754', + 'info_dict': { + 'id': '174843754', + 'ext': 'mp4', + 'series_id': '94', + 'release_date': '20220129', + 'timestamp': 1643484455, + 'title': 'Utrip', + 'duration': 813, + 'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/94/utrip_1_wide2.jpg', + 'description': 'md5:77f2892630c7b17bb7a5bb84319020c9', + 'release_timestamp': 
1643485825, + 'upload_date': '20220129', + 'series': 'Utrip', + }, + }, { + 'url': 'https://365.rtvslo.si/arhiv/il-giornale-della-sera/174844609', + 'info_dict': { + 'id': '174844609', + 'ext': 'mp3', + 'series_id': '106615841', + 'title': 'Il giornale della sera', + 'duration': 1328, + 'series': 'Il giornale della sera', + 'timestamp': 1643743800, + 'release_timestamp': 1643745424, + 'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/il-giornale-della-sera_wide2.jpg', + 'upload_date': '20220201', + 'tbr': 128000, + 'release_date': '20220201', }, - ] + }, { + 'url': 'https://365.rtvslo.si/arhiv/razred-zase/148350750', + 'info_dict': { + 'id': '148350750', + 'ext': 'mp4', + 'title': 'Prvi šolski dan, mozaična oddaja za mlade', + 'series': 'Razred zase', + 'series_id': '148185730', + 'duration': 1481, + 'upload_date': '20121019', + 'timestamp': 1350672122, + 'release_date': '20121019', + 'release_timestamp': 1350672122, + 'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/148185730/razred_zase_2014_logo_4d_wide2.jpg', + }, + }, { + 'url': 'https://4d.rtvslo.si/arhiv/dnevnik/174842550', + 'only_matching': True, + }] def _real_extract(self, url): v_id = self._match_id(url) @@ -164,3 +165,26 @@ class RTVSLOIE(InfoExtractor): 'series': meta.get('showName'), 'series_id': meta.get('showId'), } + + +class RTVSLOShowIE(InfoExtractor): + IE_NAME = 'rtvslo.si:show' + _VALID_URL = r'https?://(?:365|4d)\.rtvslo.si/oddaja/[^/?#&]+/(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://365.rtvslo.si/oddaja/ekipa-bled/173250997', + 'info_dict': { + 'id': '173250997', + 'title': 'Ekipa Bled', + }, + 'playlist_count': 18, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + + return self.playlist_from_matches( + re.findall(r'<a [^>]*\bhref="(/arhiv/[^"]+)"', webpage), + playlist_id, self._html_extract_title(webpage), + getter=lambda x: urljoin('https://365.rtvslo.si', x), ie=RTVSLOIE) diff 
--git a/yt_dlp/extractor/rumble.py b/yt_dlp/extractor/rumble.py index db780a2cf4..74c7e4f176 100644 --- a/yt_dlp/extractor/rumble.py +++ b/yt_dlp/extractor/rumble.py @@ -8,14 +8,17 @@ from ..utils import ( UnsupportedError, clean_html, determine_ext, + extract_attributes, format_field, get_element_by_class, + get_elements_html_by_class, int_or_none, join_nonempty, parse_count, parse_iso8601, traverse_obj, unescapeHTML, + urljoin, ) @@ -382,8 +385,10 @@ class RumbleChannelIE(InfoExtractor): if isinstance(e.cause, HTTPError) and e.cause.status == 404: break raise - for video_url in re.findall(r'class="[^>"]*videostream__link[^>]+href="([^"]+\.html)"', webpage): - yield self.url_result('https://rumble.com' + video_url) + for video_url in traverse_obj( + get_elements_html_by_class('videostream__link', webpage), (..., {extract_attributes}, 'href'), + ): + yield self.url_result(urljoin('https://rumble.com', video_url)) def _real_extract(self, url): url, playlist_id = self._match_valid_url(url).groups() diff --git a/yt_dlp/extractor/rutube.py b/yt_dlp/extractor/rutube.py index d389b32091..2c416811af 100644 --- a/yt_dlp/extractor/rutube.py +++ b/yt_dlp/extractor/rutube.py @@ -6,6 +6,7 @@ from ..utils import ( determine_ext, int_or_none, parse_qs, + traverse_obj, try_get, unified_timestamp, url_or_none, @@ -80,6 +81,8 @@ class RutubeBaseIE(InfoExtractor): 'url': format_url, 'format_id': format_id, }) + for hls_url in traverse_obj(options, ('live_streams', 'hls', ..., 'url', {url_or_none})): + formats.extend(self._extract_m3u8_formats(hls_url, video_id, ext='mp4', fatal=False)) return formats def _download_and_extract_formats(self, video_id, query=None): @@ -90,7 +93,7 @@ class RutubeBaseIE(InfoExtractor): class RutubeIE(RutubeBaseIE): IE_NAME = 'rutube' IE_DESC = 'Rutube videos' - _VALID_URL = r'https?://rutube\.ru/(?:video(?:/private)?|(?:play/)?embed)/(?P<id>[\da-z]{32})' + _VALID_URL = 
r'https?://rutube\.ru/(?:(?:live/)?video(?:/private)?|(?:play/)?embed)/(?P<id>[\da-z]{32})' _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//rutube\.ru/(?:play/)?embed/[\da-z]{32}.*?)\1'] _TESTS = [{ @@ -164,6 +167,29 @@ class RutubeIE(RutubeBaseIE): 'uploader': 'Стас Быков', }, 'expected_warnings': ['Unable to download f4m'], + }, { + 'url': 'https://rutube.ru/live/video/c58f502c7bb34a8fcdd976b221fca292/', + 'info_dict': { + 'id': 'c58f502c7bb34a8fcdd976b221fca292', + 'ext': 'mp4', + 'categories': ['Телепередачи'], + 'description': '', + 'thumbnail': 'http://pic.rutubelist.ru/video/14/19/14190807c0c48b40361aca93ad0867c7.jpg', + 'live_status': 'is_live', + 'age_limit': 0, + 'uploader_id': '23460655', + 'timestamp': 1652972968, + 'view_count': int, + 'upload_date': '20220519', + 'title': r're:Первый канал. Прямой эфир \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'uploader': 'Первый канал', + }, + }, { + 'url': 'https://rutube.ru/video/5ab908fccfac5bb43ef2b1e4182256b0/', + 'only_matching': True, + }, { + 'url': 'https://rutube.ru/live/video/private/c58f502c7bb34a8fcdd976b221fca292/', + 'only_matching': True, }] @classmethod diff --git a/yt_dlp/extractor/samplefocus.py b/yt_dlp/extractor/samplefocus.py index 36ceb0254d..3db3ce1424 100644 --- a/yt_dlp/extractor/samplefocus.py +++ b/yt_dlp/extractor/samplefocus.py @@ -36,7 +36,7 @@ class SampleFocusIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + webpage = self._download_webpage(url, display_id, impersonate=True) sample_id = self._search_regex( r'<input[^>]+id=(["\'])sample_id\1[^>]+value=(?:["\'])(?P<id>\d+)', @@ -82,7 +82,15 @@ class SampleFocusIE(InfoExtractor): return { 'id': sample_id, 'title': title, - 'url': mp3_url, + 'formats': [{ + 'url': mp3_url, + 'ext': 'mp3', + 'vcodec': 'none', + 'acodec': 'mp3', + 'http_headers': { + 'Referer': url, + }, + }], 'display_id': display_id, 'thumbnail': thumbnail, 'uploader': uploader, 
diff --git a/yt_dlp/extractor/screenrec.py b/yt_dlp/extractor/screenrec.py new file mode 100644 index 0000000000..64f8d2494a --- /dev/null +++ b/yt_dlp/extractor/screenrec.py @@ -0,0 +1,33 @@ +from .common import InfoExtractor + + +class ScreenRecIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?screenrec\.com/share/(?P<id>\w{10})' + _TESTS = [{ + 'url': 'https://screenrec.com/share/DasLtbknYo', + 'info_dict': { + 'id': 'DasLtbknYo', + 'ext': 'mp4', + 'title': '02.05.2024_03.01.25_REC', + 'description': 'Recorded with ScreenRec', + 'thumbnail': r're:^https?://.*\.gif$', + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + m3u8_url = self._search_regex( + r'customUrl\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'm3u8 URL', group='url') + + return { + 'id': video_id, + 'title': self._og_search_title(webpage, default=None) or self._html_extract_title(webpage), + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4'), + } diff --git a/yt_dlp/extractor/sen.py b/yt_dlp/extractor/sen.py new file mode 100644 index 0000000000..d8f14ecdc0 --- /dev/null +++ b/yt_dlp/extractor/sen.py @@ -0,0 +1,36 @@ +from .common import InfoExtractor +from ..utils import url_or_none +from ..utils.traversal import traverse_obj + + +class SenIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?sen\.com/video/(?P<id>[0-9a-f-]+)' + _TEST = { + 'url': 'https://www.sen.com/video/eef46eb1-4d79-4e28-be9d-bd937767f8c4', + 'md5': 'ff615aca9691053c94f8f10d96cd7884', + 'info_dict': { + 'id': 'eef46eb1-4d79-4e28-be9d-bd937767f8c4', + 'ext': 'mp4', + 'description': 'Florida, 28 Sep 2022', + 'title': 'Hurricane Ian', + 'tags': ['North America', 'Storm', 'Weather'], + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + api_data = 
self._download_json(f'https://api.sen.com/content/public/video/{video_id}', video_id) + m3u8_url = (traverse_obj(api_data, ( + 'data', 'nodes', lambda _, v: v['id'] == 'player', 'video', 'url', {url_or_none}, any)) + or f'https://vod.sen.com/videos/{video_id}/manifest.m3u8') + + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats(m3u8_url, video_id, 'mp4'), + **traverse_obj(api_data, ('data', 'nodes', lambda _, v: v['id'] == 'details', any, 'content', { + 'title': ('title', 'text', {str}), + 'description': ('descriptions', 0, 'text', {str}), + 'tags': ('badges', ..., 'text', {str}), + })), + } diff --git a/yt_dlp/extractor/servus.py b/yt_dlp/extractor/servus.py index 117f180814..841c7ebf33 100644 --- a/yt_dlp/extractor/servus.py +++ b/yt_dlp/extractor/servus.py @@ -27,7 +27,7 @@ class ServusIE(InfoExtractor): 'info_dict': { 'id': 'AA-28BYCQNH92111', 'ext': 'mp4', - 'title': 'Klettersteige in den Alpen', + 'title': 'Vie Ferrate - Klettersteige in den Alpen', 'description': 'md5:25e47ddd83a009a0f9789ba18f2850ce', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 2823, @@ -38,6 +38,7 @@ class ServusIE(InfoExtractor): 'season_number': 11, 'episode': 'Episode 8 - Vie Ferrate – Klettersteige in den Alpen', 'episode_number': 8, + 'categories': ['Bergwelten'], }, 'params': {'skip_download': 'm3u8'}, }, { @@ -71,8 +72,11 @@ class ServusIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url).upper() + webpage = self._download_webpage(url, video_id) + next_data = self._search_nextjs_data(webpage, video_id, fatal=False) + video = self._download_json( - 'https://api-player.redbull.com/stv/servus-tv?timeZone=Europe/Berlin', + 'https://api-player.redbull.com/stv/servus-tv-playnet', video_id, 'Downloading video JSON', query={'videoId': video_id}) if not video.get('videoUrl'): self._report_errors(video) @@ -89,7 +93,7 @@ class ServusIE(InfoExtractor): return { 'id': video_id, 'title': video.get('title'), - 'description': 
self._get_description(video_id) or video.get('description'), + 'description': self._get_description(next_data) or video.get('description'), 'thumbnail': video.get('poster'), 'duration': float_or_none(video.get('duration')), 'timestamp': unified_timestamp(video.get('currentSunrise')), @@ -100,16 +104,19 @@ class ServusIE(InfoExtractor): 'episode_number': episode_number, 'formats': formats, 'subtitles': subtitles, + **traverse_obj(next_data, ('props', 'pageProps', 'data', { + 'title': ('title', 'rendered', {str}), + 'timestamp': ('stv_date', 'raw', {int}), + 'duration': ('stv_duration', {float_or_none}), + 'categories': ('category_names', ..., {str}), + })), } - def _get_description(self, video_id): - info = self._download_json( - f'https://backend.servustv.com/wp-json/rbmh/v2/media_asset/aa_id/{video_id}?fieldset=page', - video_id, fatal=False) - - return join_nonempty(*traverse_obj(info, ( - ('stv_short_description', 'stv_long_description'), - {lambda x: unescapeHTML(x.replace('\n\n', '\n'))})), delim='\n\n') + def _get_description(self, next_data): + return join_nonempty(*traverse_obj(next_data, ( + 'props', 'pageProps', 'data', + ('stv_short_description', 'stv_long_description'), {str}, + {lambda x: x.replace('\n\n', '\n')}, {unescapeHTML})), delim='\n\n') def _report_errors(self, video): playability_errors = traverse_obj(video, ('playabilityErrors', ...)) diff --git a/yt_dlp/extractor/snapchat.py b/yt_dlp/extractor/snapchat.py new file mode 100644 index 0000000000..732677c190 --- /dev/null +++ b/yt_dlp/extractor/snapchat.py @@ -0,0 +1,76 @@ +from .common import InfoExtractor +from ..utils import float_or_none, int_or_none, url_or_none +from ..utils.traversal import traverse_obj + + +class SnapchatSpotlightIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?snapchat\.com/spotlight/(?P<id>\w+)' + + _TESTS = [{ + 'url': 'https://www.snapchat.com/spotlight/W7_EDlXWTBiXAEEniNoMPwAAYYWtidGhudGZpAX1TKn0JAX1TKnXJAAAAAA', + 'md5': '46c580f63592d0cbb76e974d2f9f0fcc', + 
'info_dict': { + 'id': 'W7_EDlXWTBiXAEEniNoMPwAAYYWtidGhudGZpAX1TKn0JAX1TKnXJAAAAAA', + 'ext': 'mp4', + 'title': 'Views 💕', + 'description': '', + 'thumbnail': r're:https://cf-st\.sc-cdn\.net/d/kKJHIR1QAznRKK9jgYYDq\.256\.IRZXSOY', + 'duration': 4.665, + 'timestamp': 1637777831.369, + 'upload_date': '20211124', + 'repost_count': int, + 'uploader': 'shreypatel57', + 'uploader_url': 'https://www.snapchat.com/add/shreypatel57', + }, + }, { + 'url': 'https://www.snapchat.com/spotlight/W7_EDlXWTBiXAEEniNoMPwAAYcnVjYWdwcGV1AZEaIYn5AZEaIYnrAAAAAQ', + 'md5': '4cd9626458c1a0e3e6dbe72c544a9ec2', + 'info_dict': { + 'id': 'W7_EDlXWTBiXAEEniNoMPwAAYcnVjYWdwcGV1AZEaIYn5AZEaIYnrAAAAAQ', + 'ext': 'mp4', + 'title': 'Spotlight Snap', + 'description': 'How he flirt her teacher🤭🤭🤩😍 #kdrama#cdrama #dramaclips #dramaspotlight', + 'thumbnail': r're:https://cf-st\.sc-cdn\.net/i/ztfr6xFs0FOcFhwVczWfj\.256\.IRZXSOY', + 'duration': 10.91, + 'timestamp': 1722720291.307, + 'upload_date': '20240803', + 'view_count': int, + 'repost_count': int, + 'uploader': 'ganda0535', + 'uploader_url': 'https://www.snapchat.com/add/ganda0535', + 'tags': ['#dramaspotlight', '#dramaclips', '#cdrama', '#kdrama'], + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + page_props = self._search_nextjs_data(webpage, video_id)['props']['pageProps'] + video_data = traverse_obj(page_props, ( + 'spotlightFeed', 'spotlightStories', + lambda _, v: v['story']['storyId']['value'] == video_id, 'metadata', any), None) + + return { + 'id': video_id, + 'ext': 'mp4', + **traverse_obj(video_data, ('videoMetadata', { + 'title': ('name', {str}), + 'description': ('description', {str}), + 'timestamp': ('uploadDateMs', {lambda x: float_or_none(x, 1000)}), + 'view_count': ('viewCount', {int_or_none}, {lambda x: None if x == -1 else x}), + 'repost_count': ('shareCount', {int_or_none}), + 'url': ('contentUrl', {url_or_none}), + 'width': ('width', {int_or_none}), 
+ 'height': ('height', {int_or_none}), + 'duration': ('durationMs', {lambda x: float_or_none(x, 1000)}), + 'thumbnail': ('thumbnailUrl', {url_or_none}), + 'uploader': ('creator', 'personCreator', 'username', {str}), + 'uploader_url': ('creator', 'personCreator', 'url', {url_or_none}), + })), + **traverse_obj(video_data, { + 'description': ('description', {str}), + 'tags': ('hashtags', ..., {str}), + 'view_count': ('engagementStats', 'viewCount', {int_or_none}, {lambda x: None if x == -1 else x}), + 'repost_count': ('engagementStats', 'shareCount', {int_or_none}), + }), + } diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index 0f73684355..f4beab75b7 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -95,7 +95,7 @@ class SoundcloudBaseIE(InfoExtractor): return raise ExtractorError('Unable to extract client id') - def _download_json(self, *args, **kwargs): + def _call_api(self, *args, **kwargs): non_fatal = kwargs.get('fatal') is False if non_fatal: del kwargs['fatal'] @@ -104,7 +104,7 @@ class SoundcloudBaseIE(InfoExtractor): query['client_id'] = self._CLIENT_ID kwargs['query'] = query try: - return super()._download_json(*args, **kwargs) + return self._download_json(*args, **kwargs) except ExtractorError as e: if isinstance(e.cause, HTTPError) and e.cause.status in (401, 403): self._store_client_id(None) @@ -163,7 +163,7 @@ class SoundcloudBaseIE(InfoExtractor): 'user_agent': self._USER_AGENT } - response = self._download_json( + response = self._call_api( self._API_AUTH_URL_PW % (self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID), None, note='Verifying login token...', fatal=False, data=json.dumps(payload).encode()) @@ -208,7 +208,6 @@ class SoundcloudBaseIE(InfoExtractor): def _extract_info_dict(self, info, full_title=None, secret_token=None, extract_flat=False): track_id = str(info['id']) - title = info['title'] format_urls = set() formats = [] @@ -217,12 +216,26 @@ class SoundcloudBaseIE(InfoExtractor): 
query['secret_token'] = secret_token if not extract_flat and info.get('downloadable') and info.get('has_downloads_left'): - download_url = update_url_query( - self._API_V2_BASE + 'tracks/' + track_id + '/download', query) - redirect_url = (self._download_json(download_url, track_id, fatal=False) or {}).get('redirectUri') - if redirect_url: + try: + # Do not use _call_api(); HTTP Error codes have different meanings for this request + download_data = self._download_json( + f'{self._API_V2_BASE}tracks/{track_id}/download', track_id, + 'Downloading original download format info JSON', query=query, headers=self._HEADERS) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + self.report_warning( + 'Original download format is only available ' + f'for registered users. {self._login_hint()}') + elif isinstance(e.cause, HTTPError) and e.cause.status == 403: + self.write_debug('Original download format is not available for this client') + else: + self.report_warning(e.msg) + download_data = None + + if redirect_url := traverse_obj(download_data, ('redirectUri', {url_or_none})): urlh = self._request_webpage( - HEADRequest(redirect_url), track_id, 'Checking for original download format', fatal=False) + HEADRequest(redirect_url), track_id, 'Checking original download format availability', + 'Original download format is not available', fatal=False) if urlh: format_url = urlh.url format_urls.add(format_url) @@ -300,23 +313,11 @@ class SoundcloudBaseIE(InfoExtractor): self.write_debug(f'"{identifier}" is not a requested format, skipping') continue - stream = None - for retry in self.RetryManager(fatal=False): - try: - stream = self._download_json( - format_url, track_id, f'Downloading {identifier} format info JSON', - query=query, headers=self._HEADERS) - except ExtractorError as e: - if isinstance(e.cause, HTTPError) and e.cause.status == 429: - self.report_warning( - 'You have reached the API rate limit, which is ~600 requests per ' - '10 
minutes. Use the --extractor-retries and --retry-sleep options ' - 'to configure an appropriate retry count and wait time', only_once=True) - retry.error = e.cause - else: - self.report_warning(e.msg) - - stream_url = traverse_obj(stream, ('url', {url_or_none})) + # XXX: if not extract_flat, 429 error must be caught where _extract_info_dict is called + stream_url = traverse_obj(self._call_api( + format_url, track_id, f'Downloading {identifier} format info JSON', + query=query, headers=self._HEADERS), ('url', {url_or_none})) + if invalid_url(stream_url): continue format_urls.add(stream_url) @@ -365,7 +366,7 @@ class SoundcloudBaseIE(InfoExtractor): 'uploader_id': str_or_none(user.get('id')) or user.get('permalink'), 'uploader_url': user.get('permalink_url'), 'timestamp': unified_timestamp(info.get('created_at')), - 'title': title, + 'title': info.get('title'), 'description': info.get('description'), 'thumbnails': thumbnails, 'duration': float_or_none(info.get('duration'), 1000), @@ -375,7 +376,8 @@ class SoundcloudBaseIE(InfoExtractor): 'like_count': extract_count('favoritings') or extract_count('likes'), 'comment_count': extract_count('comment'), 'repost_count': extract_count('reposts'), - 'genres': traverse_obj(info, ('genre', {str}, {lambda x: x or None}, all)), + 'genres': traverse_obj(info, ('genre', {str}, filter, all, filter)), + 'artists': traverse_obj(info, ('publisher_metadata', 'artist', {str}, filter, all, filter)), 'formats': formats if not extract_flat else None, } @@ -427,7 +429,6 @@ class SoundcloudIE(SoundcloudBaseIE): 'repost_count': int, 'thumbnail': 'https://i1.sndcdn.com/artworks-000031955188-rwb18x-original.jpg', 'uploader_url': 'https://soundcloud.com/ethmusic', - 'genres': [], }, }, # geo-restricted @@ -451,6 +452,7 @@ class SoundcloudIE(SoundcloudBaseIE): 'uploader_url': 'https://soundcloud.com/the-concept-band', 'thumbnail': 'https://i1.sndcdn.com/artworks-v8bFHhXm7Au6-0-original.jpg', 'genres': ['Alternative'], + 'artists': ['The Royal 
Concept'], }, }, # private link @@ -523,6 +525,7 @@ class SoundcloudIE(SoundcloudBaseIE): 'repost_count': int, 'view_count': int, 'genres': ['Dance & EDM'], + 'artists': ['80M'], }, }, # private link, downloadable format @@ -547,6 +550,7 @@ class SoundcloudIE(SoundcloudBaseIE): 'thumbnail': 'https://i1.sndcdn.com/artworks-000240712245-kedn4p-original.jpg', 'uploader_url': 'https://soundcloud.com/oriuplift', 'genres': ['Trance'], + 'artists': ['Ori Uplift'], }, }, # no album art, use avatar pic for thumbnail @@ -570,7 +574,7 @@ class SoundcloudIE(SoundcloudBaseIE): 'comment_count': int, 'repost_count': int, 'uploader_url': 'https://soundcloud.com/garyvee', - 'genres': [], + 'artists': ['MadReal'], }, 'params': { 'skip_download': True, @@ -630,10 +634,20 @@ class SoundcloudIE(SoundcloudBaseIE): resolve_title += f'/{token}' info_json_url = self._resolv_url(self._BASE_URL + resolve_title) - info = self._download_json( + info = self._call_api( info_json_url, full_title, 'Downloading info JSON', query=query, headers=self._HEADERS) - return self._extract_info_dict(info, full_title, token) + for retry in self.RetryManager(): + try: + return self._extract_info_dict(info, full_title, token) + except ExtractorError as e: + if not isinstance(e.cause, HTTPError) or not e.cause.status == 429: + raise + self.report_warning( + 'You have reached the API rate limit, which is ~600 requests per ' + '10 minutes. 
Use the --extractor-retries and --retry-sleep options ' + 'to configure an appropriate retry count and wait time', only_once=True) + retry.error = e.cause class SoundcloudPlaylistBaseIE(SoundcloudBaseIE): @@ -641,7 +655,7 @@ class SoundcloudPlaylistBaseIE(SoundcloudBaseIE): playlist_id = str(playlist['id']) tracks = playlist.get('tracks') or [] if not all(t.get('permalink_url') for t in tracks) and token: - tracks = self._download_json( + tracks = self._call_api( self._API_V2_BASE + 'tracks', playlist_id, 'Downloading tracks', query={ 'ids': ','.join([str(t['id']) for t in tracks]), @@ -699,7 +713,7 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE): if token: full_title += '/' + token - info = self._download_json(self._resolv_url( + info = self._call_api(self._resolv_url( self._BASE_URL + full_title), full_title, headers=self._HEADERS) if 'errors' in info: @@ -730,7 +744,7 @@ class SoundcloudPagedPlaylistBaseIE(SoundcloudBaseIE): for i in itertools.count(): for retry in self.RetryManager(): try: - response = self._download_json( + response = self._call_api( url, playlist_id, query=query, headers=self._HEADERS, note=f'Downloading track page {i + 1}') break @@ -838,7 +852,7 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE): mobj = self._match_valid_url(url) uploader = mobj.group('user') - user = self._download_json( + user = self._call_api( self._resolv_url(self._BASE_URL + uploader), uploader, 'Downloading user info', headers=self._HEADERS) @@ -859,16 +873,16 @@ class SoundcloudUserPermalinkIE(SoundcloudPagedPlaylistBaseIE): 'id': '30909869', 'title': 'neilcic', }, - 'playlist_mincount': 23, + 'playlist_mincount': 22, }] def _real_extract(self, url): user_id = self._match_id(url) - user = self._download_json( + user = self._call_api( self._resolv_url(url), user_id, 'Downloading user info', headers=self._HEADERS) return self._extract_playlist( - f'{self._API_V2_BASE}stream/users/{user["id"]}', str(user['id']), user.get('username')) + 
f'{self._API_V2_BASE}users/{user["id"]}/tracks', str(user['id']), user.get('username')) class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE): @@ -886,7 +900,7 @@ class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE): def _real_extract(self, url): track_name = self._match_id(url) - track = self._download_json(self._resolv_url(url), track_name, headers=self._HEADERS) + track = self._call_api(self._resolv_url(url), track_name, headers=self._HEADERS) track_id = self._search_regex( r'soundcloud:track-stations:(\d+)', track['id'], 'track id') @@ -930,7 +944,7 @@ class SoundcloudRelatedIE(SoundcloudPagedPlaylistBaseIE): def _real_extract(self, url): slug, relation = self._match_valid_url(url).group('slug', 'relation') - track = self._download_json( + track = self._call_api( self._resolv_url(self._BASE_URL + slug), slug, 'Downloading track info', headers=self._HEADERS) @@ -965,7 +979,7 @@ class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE): if token: query['secret_token'] = token - data = self._download_json( + data = self._call_api( self._API_V2_BASE + 'playlists/' + playlist_id, playlist_id, 'Downloading playlist', query=query, headers=self._HEADERS) @@ -1000,7 +1014,7 @@ class SoundcloudSearchIE(SoundcloudBaseIE, SearchInfoExtractor): next_url = update_url_query(self._API_V2_BASE + endpoint, query) for i in itertools.count(1): - response = self._download_json( + response = self._call_api( next_url, collection_id, f'Downloading page {i}', 'Unable to download API page', headers=self._HEADERS) diff --git a/yt_dlp/extractor/sproutvideo.py b/yt_dlp/extractor/sproutvideo.py new file mode 100644 index 0000000000..c0923594e5 --- /dev/null +++ b/yt_dlp/extractor/sproutvideo.py @@ -0,0 +1,198 @@ +import base64 +import urllib.parse + +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + int_or_none, + qualities, + remove_start, + smuggle_url, + unsmuggle_url, + update_url_query, + url_or_none, 
+ urlencode_postdata, +) +from ..utils.traversal import traverse_obj + + +class SproutVideoIE(InfoExtractor): + _NO_SCHEME_RE = r'//videos\.sproutvideo\.com/embed/(?P<id>[\da-f]+)/[\da-f]+' + _VALID_URL = rf'https?:{_NO_SCHEME_RE}' + _EMBED_REGEX = [rf'<iframe [^>]*\bsrc=["\'](?P<url>(?:https?:)?{_NO_SCHEME_RE}[^"\']*)["\']'] + _TESTS = [{ + 'url': 'https://videos.sproutvideo.com/embed/4c9dddb01910e3c9c4/0fc24387c4f24ee3', + 'md5': '1343ce1a6cb39d67889bfa07c7b02b0e', + 'info_dict': { + 'id': '4c9dddb01910e3c9c4', + 'ext': 'mp4', + 'title': 'Adrien Labaeye : Berlin, des communautés aux communs', + 'duration': 576, + 'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg', + }, + }, { + 'url': 'https://videos.sproutvideo.com/embed/a79fdcb21f1be2c62e/93bf31e41e39ca27', + 'md5': 'cebae5cf558cca83271917cf4ec03f26', + 'info_dict': { + 'id': 'a79fdcb21f1be2c62e', + 'ext': 'mp4', + 'title': 'HS_01_Live Stream 2023-01-14 10:00', + 'duration': 703, + 'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg', + }, + }, { + # http formats 'sd' and 'hd' are available + 'url': 'https://videos.sproutvideo.com/embed/119cd6bc1a18e6cd98/30751a1761ae5b90', + 'md5': 'f368c78df07e78a749508b221528672c', + 'info_dict': { + 'id': '119cd6bc1a18e6cd98', + 'ext': 'mp4', + 'title': '3. 
Updating your Partner details', + 'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg', + 'duration': 60, + }, + 'params': {'format': 'hd'}, + }, { + # subtitles + 'url': 'https://videos.sproutvideo.com/embed/119dd8ba121ee0cc98/4ee50c88a343215d?type=hd', + 'md5': '7f6798f037d7a3e3e07e67959de68fc6', + 'info_dict': { + 'id': '119dd8ba121ee0cc98', + 'ext': 'mp4', + 'title': 'Recipients Setup - Domestic Wire Only', + 'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg', + 'duration': 77, + 'subtitles': {'en': 'count:1'}, + }, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://www.solidarum.org/vivre-ensemble/adrien-labaeye-berlin-des-communautes-aux-communs', + 'info_dict': { + 'id': '4c9dddb01910e3c9c4', + 'ext': 'mp4', + 'title': 'Adrien Labaeye : Berlin, des communautés aux communs', + 'duration': 576, + 'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg', + }, + }] + _M3U8_URL_TMPL = 'https://{base}.videos.sproutvideo.com/{s3_user_hash}/{s3_video_hash}/video/index.m3u8' + _QUALITIES = ('hd', 'uhd', 'source') # Exclude 'sd' to prioritize hls formats above it + + @staticmethod + def _policy_to_qs(policy, signature_key, as_string=False): + query = {} + for key, value in policy['signatures'][signature_key].items(): + query[remove_start(key, 'CloudFront-')] = value + query['sessionID'] = policy['sessionID'] + return urllib.parse.urlencode(query, doseq=True) if as_string else query + + @classmethod + def _extract_embed_urls(cls, url, webpage): + for embed_url in super()._extract_embed_urls(url, webpage): + if embed_url.startswith('//'): + embed_url = f'https:{embed_url}' + yield smuggle_url(embed_url, {'referer': url}) + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + video_id = self._match_id(url) + webpage = self._download_webpage( + url, video_id, headers=traverse_obj(smuggled_data, {'Referer': 'referer'})) + data = self._search_json( + r'var\s+dat\s*=\s*["\']', webpage, 'data', video_id, 
contains_pattern=r'[A-Za-z0-9+/=]+', + end_pattern=r'["\'];', transform_source=lambda x: base64.b64decode(x).decode()) + + formats, subtitles = [], {} + headers = { + 'Accept': '*/*', + 'Origin': 'https://videos.sproutvideo.com', + 'Referer': url, + } + + # HLS extraction is fatal; only attempt it if the JSON data says it's available + if traverse_obj(data, 'hls'): + manifest_query = self._policy_to_qs(data, 'm') + fragment_query = self._policy_to_qs(data, 't', as_string=True) + key_query = self._policy_to_qs(data, 'k', as_string=True) + + formats.extend(self._extract_m3u8_formats( + self._M3U8_URL_TMPL.format(**data), video_id, 'mp4', + m3u8_id='hls', headers=headers, query=manifest_query)) + for fmt in formats: + fmt.update({ + 'url': update_url_query(fmt['url'], manifest_query), + 'extra_param_to_segment_url': fragment_query, + 'extra_param_to_key_url': key_query, + }) + + if downloads := traverse_obj(data, ('downloads', {dict.items}, lambda _, v: url_or_none(v[1]))): + quality = qualities(self._QUALITIES) + acodec = 'none' if data.get('has_audio') is False else None + formats.extend([{ + 'format_id': str(format_id), + 'url': format_url, + 'ext': 'mp4', + 'quality': quality(format_id), + 'acodec': acodec, + } for format_id, format_url in downloads]) + + for sub_data in traverse_obj(data, ('subtitleData', lambda _, v: url_or_none(v['src']))): + subtitles.setdefault(sub_data.get('srclang', 'en'), []).append({ + 'url': sub_data['src'], + }) + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'http_headers': headers, + **traverse_obj(data, { + 'title': ('title', {str}), + 'duration': ('duration', {int_or_none}), + 'thumbnail': ('posterframe_url', {url_or_none}), + }), + } + + +class VidsIoIE(InfoExtractor): + IE_NAME = 'vids.io' + _VALID_URL = r'https?://[\w-]+\.vids\.io/videos/(?P<id>[\da-f]+)/(?P<display_id>[\w-]+)' + _TESTS = [{ + 'url': 'https://how-to-video.vids.io/videos/799cd8b11c10efc1f0/how-to-video-live-streaming', + 'md5': 
'9bbbb2c0c0739eb163b80f87b8d77c9e', + 'info_dict': { + 'id': '799cd8b11c10efc1f0', + 'ext': 'mp4', + 'title': 'How to Video: Live Streaming', + 'duration': 2787, + 'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg', + }, + }] + + def _real_extract(self, url): + video_id, display_id = self._match_valid_url(url).group('id', 'display_id') + webpage, urlh = self._download_webpage_handle(url, display_id, expected_status=403) + + if urlh.status == 403: + password = self.get_param('videopassword') + if not password: + raise ExtractorError( + 'This video is password-protected; use the --video-password option', expected=True) + try: + webpage = self._download_webpage( + url, display_id, 'Submitting video password', + data=urlencode_postdata({ + 'password': password, + **self._hidden_inputs(webpage), + })) + # Requests with user's session cookie `_sproutvideo_session` are now authorized + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: + raise ExtractorError('Incorrect password', expected=True) + raise + + if embed_url := next(SproutVideoIE._extract_embed_urls(url, webpage), None): + return self.url_result(embed_url, SproutVideoIE, video_id) + + raise ExtractorError('Unable to extract any SproutVideo embed url') diff --git a/yt_dlp/extractor/substack.py b/yt_dlp/extractor/substack.py index 30cb322dc2..b70d40f2ca 100644 --- a/yt_dlp/extractor/substack.py +++ b/yt_dlp/extractor/substack.py @@ -2,7 +2,13 @@ import re import urllib.parse from .common import InfoExtractor -from ..utils import js_to_json, str_or_none, traverse_obj +from ..networking import HEADRequest +from ..utils import ( + determine_ext, + js_to_json, + str_or_none, +) +from ..utils.traversal import traverse_obj class SubstackIE(InfoExtractor): @@ -43,6 +49,19 @@ class SubstackIE(InfoExtractor): 'uploader': "Andrew Zimmern's Spilled Milk ", 'uploader_id': '577659', }, + }, { + # Podcast that needs its file extension resolved to mp3 + 'url': 
'https://persuasion1.substack.com/p/summers', + 'md5': '1456a755d46084744facdfac9edf900f', + 'info_dict': { + 'id': '141970405', + 'ext': 'mp3', + 'title': 'Larry Summers on What Went Wrong on Campus', + 'description': 'Yascha Mounk and Larry Summers also discuss the promise and perils of artificial intelligence.', + 'thumbnail': r're:https://substackcdn\.com/image/.+\.jpeg', + 'uploader': 'Persuasion', + 'uploader_id': '61579', + }, }] @classmethod @@ -89,7 +108,15 @@ class SubstackIE(InfoExtractor): post_type = webpage_info['post']['type'] formats, subtitles = [], {} if post_type == 'podcast': - formats, subtitles = [{'url': webpage_info['post']['podcast_url']}], {} + fmt = {'url': webpage_info['post']['podcast_url']} + if not determine_ext(fmt['url'], default_ext=None): + # The redirected format URL expires but the original URL doesn't, + # so we only want to extract the extension from this request + fmt['ext'] = determine_ext(self._request_webpage( + HEADRequest(fmt['url']), display_id, + 'Resolving podcast file extension', + 'Podcast URL is invalid').url) + formats.append(fmt) elif post_type == 'video': formats, subtitles = self._extract_video_formats(webpage_info['post']['videoUpload']['id'], canonical_url) else: diff --git a/yt_dlp/extractor/svt.py b/yt_dlp/extractor/svt.py index 38782abac7..b5df2e1a18 100644 --- a/yt_dlp/extractor/svt.py +++ b/yt_dlp/extractor/svt.py @@ -472,7 +472,7 @@ class SVTPageIE(SVTBaseIE): title = self._og_search_title(webpage) urql_state = self._search_json( - r'window\.svt\.nyh\.urqlState\s*=', webpage, 'json data', display_id) + r'window\.svt\.(?:nyh\.)?urqlState\s*=', webpage, 'json data', display_id) data = traverse_obj(urql_state, (..., 'data', {str}, {json.loads}), get_all=False) or {} diff --git a/yt_dlp/extractor/swearnet.py b/yt_dlp/extractor/swearnet.py index b4835c5adc..2d6fb3eb47 100644 --- a/yt_dlp/extractor/swearnet.py +++ b/yt_dlp/extractor/swearnet.py @@ -1,55 +1,31 @@ -from .common import InfoExtractor -from 
..utils import ExtractorError, int_or_none, traverse_obj +from .vidyard import VidyardBaseIE +from ..utils import ExtractorError, int_or_none, make_archive_id -class SwearnetEpisodeIE(InfoExtractor): +class SwearnetEpisodeIE(VidyardBaseIE): _VALID_URL = r'https?://www\.swearnet\.com/shows/(?P<id>[\w-]+)/seasons/(?P<season_num>\d+)/episodes/(?P<episode_num>\d+)' _TESTS = [{ 'url': 'https://www.swearnet.com/shows/gettin-learnt-with-ricky/seasons/1/episodes/1', 'info_dict': { - 'id': '232819', + 'id': 'wicK2EOzjOdxkUXGDIgcPw', + 'display_id': '232819', 'ext': 'mp4', 'episode_number': 1, 'episode': 'Episode 1', 'duration': 719, - 'description': 'md5:c48ef71440ce466284c07085cd7bd761', + 'description': r're:Are you drunk and high and craving a grilled cheese sandwich.+', 'season': 'Season 1', 'title': 'Episode 1 - Grilled Cheese Sammich', 'season_number': 1, - 'thumbnail': 'https://cdn.vidyard.com/thumbnails/232819/_RX04IKIq60a2V6rIRqq_Q_small.jpg', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/custom/0dd74f9b-388a-452e-b570-b407fb64435b_small.jpg', + 'tags': ['Getting Learnt with Ricky', 'drunk', 'grilled cheese', 'high'], + '_old_archive_ids': ['swearnetepisode 232819'], }, }] - def _get_formats_and_subtitle(self, video_source, video_id): - video_source = video_source or {} - formats, subtitles = [], {} - for key, value in video_source.items(): - if key == 'hls': - for video_hls in value: - fmts, subs = self._extract_m3u8_formats_and_subtitles(video_hls.get('url'), video_id) - formats.extend(fmts) - self._merge_subtitles(subs, target=subtitles) - else: - formats.extend({ - 'url': video_mp4.get('url'), - 'ext': 'mp4', - } for video_mp4 in value) - - return formats, subtitles - - def _get_direct_subtitle(self, caption_json): - subs = {} - for caption in caption_json: - subs.setdefault(caption.get('language') or 'und', []).append({ - 'url': caption.get('vttUrl'), - 'name': caption.get('name'), - }) - - return subs - def _real_extract(self, url): - display_id, 
season_number, episode_number = self._match_valid_url(url).group('id', 'season_num', 'episode_num') - webpage = self._download_webpage(url, display_id) + slug, season_number, episode_number = self._match_valid_url(url).group('id', 'season_num', 'episode_num') + webpage = self._download_webpage(url, slug) try: external_id = self._search_regex(r'externalid\s*=\s*"([^"]+)', webpage, 'externalid') @@ -58,22 +34,12 @@ class SwearnetEpisodeIE(InfoExtractor): self.raise_login_required() raise - json_data = self._download_json( - f'https://play.vidyard.com/player/{external_id}.json', display_id)['payload']['chapters'][0] - - formats, subtitles = self._get_formats_and_subtitle(json_data['sources'], display_id) - self._merge_subtitles(self._get_direct_subtitle(json_data.get('captions')), target=subtitles) + info = self._process_video_json(self._fetch_video_json(external_id)['chapters'][0], external_id) + if info.get('display_id'): + info['_old_archive_ids'] = [make_archive_id(self, info['display_id'])] return { - 'id': str(json_data['videoId']), - 'title': json_data.get('name') or self._html_search_meta(['og:title', 'twitter:title'], webpage), - 'description': (json_data.get('description') - or self._html_search_meta(['og:description', 'twitter:description'], webpage)), - 'duration': int_or_none(json_data.get('seconds')), - 'formats': formats, - 'subtitles': subtitles, + **info, 'season_number': int_or_none(season_number), 'episode_number': int_or_none(episode_number), - 'thumbnails': [{'url': thumbnail_url} - for thumbnail_url in traverse_obj(json_data, ('thumbnailUrls', ...))], } diff --git a/yt_dlp/extractor/tele13.py b/yt_dlp/extractor/tele13.py index c5ca208fb4..0d721773ed 100644 --- a/yt_dlp/extractor/tele13.py +++ b/yt_dlp/extractor/tele13.py @@ -8,7 +8,7 @@ from ..utils import ( class Tele13IE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P<id>[\w-]+)' + _VALID_URL = r'https?://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P<id>[\w-]+)' 
_TESTS = [ { 'url': 'http://www.t13.cl/videos/actualidad/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', diff --git a/yt_dlp/extractor/telecinco.py b/yt_dlp/extractor/telecinco.py index 7a9dcd71c5..9ef621446d 100644 --- a/yt_dlp/extractor/telecinco.py +++ b/yt_dlp/extractor/telecinco.py @@ -2,15 +2,69 @@ import json import re from .common import InfoExtractor +from ..networking.exceptions import HTTPError from ..utils import ( + ExtractorError, clean_html, int_or_none, + join_nonempty, str_or_none, - try_get, + traverse_obj, + update_url, + url_or_none, ) -class TelecincoIE(InfoExtractor): +class TelecincoBaseIE(InfoExtractor): + def _parse_content(self, content, url): + video_id = content['dataMediaId'] + config = self._download_json( + content['dataConfig'], video_id, 'Downloading config JSON') + services = config['services'] + caronte = self._download_json(services['caronte'], video_id) + if traverse_obj(caronte, ('dls', 0, 'drm', {bool})): + self.report_drm(video_id) + + stream = caronte['dls'][0]['stream'] + headers = { + 'Referer': url, + 'Origin': re.match(r'https?://[^/]+', url).group(0), + } + geo_headers = {**headers, **self.geo_verification_headers()} + + try: + cdn = self._download_json( + caronte['cerbero'], video_id, data=json.dumps({ + 'bbx': caronte['bbx'], + 'gbx': self._download_json(services['gbx'], video_id)['gbx'], + }).encode(), headers={ + 'Content-Type': 'application/json', + **geo_headers, + })['tokens']['1']['cdn'] + except ExtractorError as error: + if isinstance(error.cause, HTTPError) and error.cause.status == 403: + error_code = traverse_obj( + self._webpage_read_content(error.cause.response, caronte['cerbero'], video_id, fatal=False), + ({json.loads}, 'code', {int})) + if error_code == 4038: + self.raise_geo_restricted(countries=['ES']) + raise + + formats = self._extract_m3u8_formats( + update_url(stream, query=cdn), video_id, 'mp4', m3u8_id='hls', headers=geo_headers) + + return { + 'id': video_id, + 'title': 
traverse_obj(config, ('info', 'title', {str})), + 'formats': formats, + 'thumbnail': (traverse_obj(content, ('dataPoster', {url_or_none})) + or traverse_obj(config, 'poster', 'imageUrl', expected_type=url_or_none)), + 'duration': traverse_obj(content, ('dataDuration', {int_or_none})), + 'http_headers': headers, + } + + +class TelecincoIE(TelecincoBaseIE): IE_DESC = 'telecinco.es, cuatro.com and mediaset.es' _VALID_URL = r'https?://(?:www\.)?(?:telecinco\.es|cuatro\.com|mediaset\.es)/(?:[^/]+/)+(?P<id>.+?)\.html' @@ -30,6 +84,7 @@ class TelecincoIE(InfoExtractor): 'duration': 662, }, }], + 'skip': 'HTTP Error 410 Gone', }, { 'url': 'http://www.cuatro.com/deportes/futbol/barcelona/Leo_Messi-Champions-Roma_2_2052780128.html', 'md5': 'c86fe0d99e3bdb46b7950d38bf6ef12a', @@ -40,23 +95,24 @@ class TelecincoIE(InfoExtractor): 'description': 'md5:a62ecb5f1934fc787107d7b9a2262805', 'duration': 79, }, + 'skip': 'Redirects to main page', }, { 'url': 'http://www.mediaset.es/12meses/campanas/doylacara/conlatratanohaytrato/Ayudame-dar-cara-trata-trato_2_1986630220.html', - 'md5': 'eddb50291df704ce23c74821b995bcac', + 'md5': '5ce057f43f30b634fbaf0f18c71a140a', 'info_dict': { 'id': 'aywerkD2Sv1vGNqq9b85Q2', 'ext': 'mp4', 'title': '#DOYLACARA. 
Con la trata no hay trato', - 'description': 'md5:2771356ff7bfad9179c5f5cd954f1477', 'duration': 50, + 'thumbnail': 'https://album.mediaset.es/eimg/2017/11/02/1tlQLO5Q3mtKT24f3EaC24.jpg', }, }, { # video in opening's content 'url': 'https://www.telecinco.es/vivalavida/fiorella-sobrina-edmundo-arrocet-entrevista_18_2907195140.html', 'info_dict': { - 'id': '2907195140', + 'id': '1691427', 'title': 'La surrealista entrevista a la sobrina de Edmundo Arrocet: "No puedes venir aquí y tomarnos por tontos"', - 'description': 'md5:73f340a7320143d37ab895375b2bf13a', + 'description': r're:Fiorella, la sobrina de Edmundo Arrocet, concedió .{727}', }, 'playlist': [{ 'md5': 'adb28c37238b675dad0f042292f209a7', @@ -65,6 +121,7 @@ class TelecincoIE(InfoExtractor): 'ext': 'mp4', 'title': 'La surrealista entrevista a la sobrina de Edmundo Arrocet: "No puedes venir aquí y tomarnos por tontos"', 'duration': 1015, + 'thumbnail': 'https://album.mediaset.es/eimg/2020/02/29/5opaC37lUhKlZ7FoDhiVC.jpg', }, }], 'params': { @@ -81,66 +138,29 @@ class TelecincoIE(InfoExtractor): 'only_matching': True, }] - def _parse_content(self, content, url): - video_id = content['dataMediaId'] - config = self._download_json( - content['dataConfig'], video_id, 'Downloading config JSON') - title = config['info']['title'] - services = config['services'] - caronte = self._download_json(services['caronte'], video_id) - stream = caronte['dls'][0]['stream'] - headers = self.geo_verification_headers() - headers.update({ - 'Content-Type': 'application/json;charset=UTF-8', - 'Origin': re.match(r'https?://[^/]+', url).group(0), - }) - cdn = self._download_json( - caronte['cerbero'], video_id, data=json.dumps({ - 'bbx': caronte['bbx'], - 'gbx': self._download_json(services['gbx'], video_id)['gbx'], - }).encode(), headers=headers)['tokens']['1']['cdn'] - formats = self._extract_m3u8_formats( - stream + '?' 
+ cdn, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': content.get('dataPoster') or config.get('poster', {}).get('imageUrl'), - 'duration': int_or_none(content.get('dataDuration')), - } - def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - article = self._parse_json(self._search_regex( - r'window\.\$REACTBASE_STATE\.article(?:_multisite)?\s*=\s*({.+})', - webpage, 'article'), display_id)['article'] - title = article.get('title') - description = clean_html(article.get('leadParagraph')) or '' + article = self._search_json( + r'window\.\$REACTBASE_STATE\.article(?:_multisite)?\s*=', + webpage, 'article', display_id)['article'] + description = traverse_obj(article, ('leadParagraph', {clean_html}, filter)) + if article.get('editorialType') != 'VID': entries = [] - body = [article.get('opening')] - body.extend(try_get(article, lambda x: x['body'], list) or []) - for p in body: - if not isinstance(p, dict): - continue - content = p.get('content') - if not content: - continue + + for p in traverse_obj(article, ((('opening', all), 'body'), lambda _, v: v['content'])): + content = p['content'] type_ = p.get('type') - if type_ == 'paragraph': - content_str = str_or_none(content) - if content_str: - description += content_str - continue - if type_ == 'video' and isinstance(content, dict): + if type_ == 'paragraph' and isinstance(content, str): + description = join_nonempty(description, content, delim='') + elif type_ == 'video' and isinstance(content, dict): entries.append(self._parse_content(content, url)) + return self.playlist_result( - entries, str_or_none(article.get('id')), title, description) - content = article['opening']['content'] - info = self._parse_content(content, url) - info.update({ - 'description': description, - }) + entries, str_or_none(article.get('id')), + traverse_obj(article, ('title', {str})), 
clean_html(description)) + + info = self._parse_content(article['opening']['content'], url) + info['description'] = description return info diff --git a/yt_dlp/extractor/tenplay.py b/yt_dlp/extractor/tenplay.py index d8c556acef..07db583470 100644 --- a/yt_dlp/extractor/tenplay.py +++ b/yt_dlp/extractor/tenplay.py @@ -1,33 +1,31 @@ -import base64 -import datetime as dt import functools import itertools from .common import InfoExtractor from ..networking import HEADRequest -from ..utils import int_or_none, traverse_obj, urlencode_postdata, urljoin +from ..utils import int_or_none, traverse_obj, url_or_none, urljoin class TenPlayIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?10play\.com\.au/(?:[^/]+/)+(?P<id>tpv\d{6}[a-z]{5})' _NETRC_MACHINE = '10play' _TESTS = [{ - 'url': 'https://10play.com.au/neighbours/web-extras/season-39/nathan-borg-is-the-first-aussie-actor-with-a-cochlear-implant-to-join-neighbours/tpv210128qupwd', + 'url': 'https://10play.com.au/neighbours/web-extras/season-41/heres-a-first-look-at-mischa-bartons-neighbours-debut/tpv230911hyxnz', 'info_dict': { - 'id': '6226844312001', + 'id': '6336940246112', 'ext': 'mp4', - 'title': 'Nathan Borg Is The First Aussie Actor With A Cochlear Implant To Join Neighbours', - 'alt_title': 'Nathan Borg Is The First Aussie Actor With A Cochlear Implant To Join Neighbours', - 'description': 'md5:a02d0199c901c2dd4c796f1e7dd0de43', - 'duration': 186, - 'season': 'Season 39', - 'season_number': 39, + 'title': 'Here\'s A First Look At Mischa Barton\'s Neighbours Debut', + 'alt_title': 'Here\'s A First Look At Mischa Barton\'s Neighbours Debut', + 'description': 'Neighbours Premieres Monday, September 18 At 4:30pm On 10 And 10 Play And 6:30pm On 10 Peach', + 'duration': 74, + 'season': 'Season 41', + 'season_number': 41, 'series': 'Neighbours', 'thumbnail': r're:https://.*\.jpg', 'uploader': 'Channel 10', 'age_limit': 15, - 'timestamp': 1611810000, - 'upload_date': '20210128', + 'timestamp': 1694386800, + 
'upload_date': '20230910', 'uploader_id': '2199827728001', }, 'params': { @@ -35,21 +33,30 @@ class TenPlayIE(InfoExtractor): }, 'skip': 'Only available in Australia', }, { - 'url': 'https://10play.com.au/todd-sampsons-body-hack/episodes/season-4/episode-7/tpv200921kvngh', + 'url': 'https://10play.com.au/neighbours/episodes/season-42/episode-9107/tpv240902nzqyp', 'info_dict': { - 'id': '6192880312001', + 'id': '9000000000091177', 'ext': 'mp4', - 'title': "Todd Sampson's Body Hack - S4 Ep. 2", - 'description': 'md5:fa278820ad90f08ea187f9458316ac74', + 'title': 'Neighbours - S42 Ep. 9107', + 'alt_title': 'Thu 05 Sep', + 'description': 'md5:37a1f4271be34b9ee2b533426a5fbaef', + 'duration': 1388, + 'episode': 'Episode 9107', + 'episode_number': 9107, + 'season': 'Season 42', + 'season_number': 42, + 'series': 'Neighbours', + 'thumbnail': r're:https://.*\.jpg', 'age_limit': 15, - 'timestamp': 1600770600, - 'upload_date': '20200922', + 'timestamp': 1725517860, + 'upload_date': '20240905', 'uploader': 'Channel 10', 'uploader_id': '2199827728001', }, 'params': { 'skip_download': True, }, + 'skip': 'Only available in Australia', }, { 'url': 'https://10play.com.au/how-to-stay-married/web-extras/season-1/terrys-talks-ep-1-embracing-change/tpv190915ylupc', 'only_matching': True, @@ -66,55 +73,42 @@ class TenPlayIE(InfoExtractor): 'X': 18, } - def _get_bearer_token(self, video_id): - username, password = self._get_login_info() - if username is None or password is None: - self.raise_login_required('Your 10play account\'s details must be provided with --username and --password.') - _timestamp = dt.datetime.now().strftime('%Y%m%d000000') - _auth_header = base64.b64encode(_timestamp.encode('ascii')).decode('ascii') - data = self._download_json('https://10play.com.au/api/user/auth', video_id, 'Getting bearer token', headers={ - 'X-Network-Ten-Auth': _auth_header, - }, data=urlencode_postdata({ - 'email': username, - 'password': password, - })) - return 'Bearer ' + 
data['jwt']['accessToken'] - def _real_extract(self, url): content_id = self._match_id(url) data = self._download_json( 'https://10play.com.au/api/v1/videos/' + content_id, content_id) - headers = {} - - if data.get('memberGated') is True: - _token = self._get_bearer_token(content_id) - headers = {'Authorization': _token} - _video_url = self._download_json( - data.get('playbackApiEndpoint'), content_id, 'Downloading video JSON', - headers=headers).get('source') - m3u8_url = self._request_webpage(HEADRequest( - _video_url), content_id).url + video_data = self._download_json( + f'https://vod.ten.com.au/api/videos/bcquery?command=find_videos_by_id&video_id={data["altId"]}', + content_id, 'Downloading video JSON') + m3u8_url = self._request_webpage( + HEADRequest(video_data['items'][0]['HLSURL']), + content_id, 'Checking stream URL').url if '10play-not-in-oz' in m3u8_url: self.raise_geo_restricted(countries=['AU']) + # Attempt to get a higher quality stream + m3u8_url = m3u8_url.replace(',150,75,55,0000', ',300,150,75,55,0000') formats = self._extract_m3u8_formats(m3u8_url, content_id, 'mp4') return { + 'id': content_id, 'formats': formats, - 'subtitles': {'en': [{'url': data.get('captionUrl')}]} if data.get('captionUrl') else None, - 'id': data.get('altId') or content_id, - 'duration': data.get('duration'), - 'title': data.get('subtitle'), - 'alt_title': data.get('title'), - 'description': data.get('description'), - 'age_limit': self._AUS_AGES.get(data.get('classification')), - 'series': data.get('tvShow'), - 'season_number': int_or_none(data.get('season')), - 'episode_number': int_or_none(data.get('episode')), - 'timestamp': data.get('published'), - 'thumbnail': data.get('imageUrl'), + 'subtitles': {'en': [{'url': data['captionUrl']}]} if url_or_none(data.get('captionUrl')) else None, 'uploader': 'Channel 10', 'uploader_id': '2199827728001', + **traverse_obj(data, { + 'id': ('altId', {str}), + 'duration': ('duration', {int_or_none}), + 'title': ('subtitle', {str}), + 
'alt_title': ('title', {str}), + 'description': ('description', {str}), + 'age_limit': ('classification', {self._AUS_AGES.get}), + 'series': ('tvShow', {str}), + 'season_number': ('season', {int_or_none}), + 'episode_number': ('episode', {int_or_none}), + 'timestamp': ('published', {int_or_none}), + 'thumbnail': ('imageUrl', {url_or_none}), + }), } diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index dc74d4a1f5..f7e103fe9f 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -23,13 +23,13 @@ from ..utils import ( mimetype2ext, parse_qs, qualities, - remove_start, srt_subtitles_timecode, str_or_none, traverse_obj, try_call, try_get, url_or_none, + urlencode_postdata, ) @@ -43,8 +43,8 @@ class TikTokBaseIE(InfoExtractor): 'iid': None, # TikTok (KR/PH/TW/TH/VN) = trill, TikTok (rest of world) = musical_ly, Douyin = aweme 'app_name': 'musical_ly', - 'app_version': '34.1.2', - 'manifest_app_version': '2023401020', + 'app_version': '35.1.3', + 'manifest_app_version': '2023501030', # "app id": aweme = 1128, trill = 1180, musical_ly = 1233, universal = 0 'aid': '0', } @@ -114,7 +114,7 @@ class TikTokBaseIE(InfoExtractor): 'universal data', display_id, end_pattern=r'</script>', default={}), ('__DEFAULT_SCOPE__', {dict})) or {} - def _call_api_impl(self, ep, query, video_id, fatal=True, + def _call_api_impl(self, ep, video_id, query=None, data=None, headers=None, fatal=True, note='Downloading API JSON', errnote='Unable to download API page'): self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choices('0123456789abcdef', k=160))) webpage_cookies = self._get_cookies(self._WEBPAGE_HOST) @@ -125,7 +125,8 @@ class TikTokBaseIE(InfoExtractor): fatal=fatal, note=note, errnote=errnote, headers={ 'User-Agent': self._APP_USER_AGENT, 'Accept': 'application/json', - }, query=query) + **(headers or {}), + }, query=query, data=data) def _build_api_query(self, query): return filter_dict({ @@ -174,7 +175,7 @@ class 
TikTokBaseIE(InfoExtractor): 'openudid': ''.join(random.choices('0123456789abcdef', k=16)), }) - def _call_api(self, ep, query, video_id, fatal=True, + def _call_api(self, ep, video_id, query=None, data=None, headers=None, fatal=True, note='Downloading API JSON', errnote='Unable to download API page'): if not self._APP_INFO and not self._get_next_app_info(): message = 'No working app info is available' @@ -187,9 +188,11 @@ class TikTokBaseIE(InfoExtractor): max_tries = len(self._APP_INFO_POOL) + 1 # _APP_INFO_POOL + _APP_INFO for count in itertools.count(1): self.write_debug(str(self._APP_INFO)) - real_query = self._build_api_query(query) + real_query = self._build_api_query(query or {}) try: - return self._call_api_impl(ep, real_query, video_id, fatal, note, errnote) + return self._call_api_impl( + ep, video_id, query=real_query, data=data, headers=headers, + fatal=fatal, note=note, errnote=errnote) except ExtractorError as e: if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0: message = str(e.cause or e.msg) @@ -204,17 +207,29 @@ class TikTokBaseIE(InfoExtractor): raise def _extract_aweme_app(self, aweme_id): - feed_list = self._call_api( - 'feed', {'aweme_id': aweme_id}, aweme_id, note='Downloading video feed', - errnote='Unable to download video feed').get('aweme_list') or [] - aweme_detail = next((aweme for aweme in feed_list if str(aweme.get('aweme_id')) == aweme_id), None) + aweme_detail = traverse_obj( + self._call_api('multi/aweme/detail', aweme_id, data=urlencode_postdata({ + 'aweme_ids': f'[{aweme_id}]', + 'request_source': '0', + }), headers={'X-Argus': ''}), ('aweme_details', 0, {dict})) if not aweme_detail: - raise ExtractorError('Unable to find video in feed', video_id=aweme_id) + raise ExtractorError('Unable to extract aweme detail info', video_id=aweme_id) return self._parse_aweme_video_app(aweme_detail) def _extract_web_data_and_status(self, url, video_id, fatal=True): - webpage = self._download_webpage(url, video_id, 
headers={'User-Agent': 'Mozilla/5.0'}, fatal=fatal) or '' - video_data, status = {}, None + video_data, status = {}, -1 + + res = self._download_webpage_handle(url, video_id, fatal=fatal, headers={'User-Agent': 'Mozilla/5.0'}) + if res is False: + return video_data, status + + webpage, urlh = res + if urllib.parse.urlparse(urlh.url).path == '/login': + message = 'TikTok is requiring login for access to this content' + if fatal: + self.raise_login_required(message) + self.report_warning(f'{message}. {self._login_hint()}') + return video_data, status if universal_data := self._get_universal_data(webpage, video_id): self.write_debug('Found universal data for rehydration') @@ -238,7 +253,16 @@ class TikTokBaseIE(InfoExtractor): def _get_subtitles(self, aweme_detail, aweme_id, user_name): # TODO: Extract text positioning info + + EXT_MAP = { # From lowest to highest preference + 'creator_caption': 'json', + 'srt': 'srt', + 'webvtt': 'vtt', + } + preference = qualities(tuple(EXT_MAP.values())) + subtitles = {} + # aweme/detail endpoint subs captions_info = traverse_obj( aweme_detail, ('interaction_stickers', ..., 'auto_video_caption_info', 'auto_captions', ...), expected_type=dict) @@ -262,8 +286,8 @@ class TikTokBaseIE(InfoExtractor): if not caption.get('url'): continue subtitles.setdefault(caption.get('lang') or 'en', []).append({ - 'ext': remove_start(caption.get('caption_format'), 'web'), 'url': caption['url'], + 'ext': EXT_MAP.get(caption.get('Format')), }) # webpage subs if not subtitles: @@ -272,9 +296,14 @@ class TikTokBaseIE(InfoExtractor): self._create_url(user_name, aweme_id), aweme_id, fatal=False) for caption in traverse_obj(aweme_detail, ('video', 'subtitleInfos', lambda _, v: v['Url'])): subtitles.setdefault(caption.get('LanguageCodeName') or 'en', []).append({ - 'ext': remove_start(caption.get('Format'), 'web'), 'url': caption['Url'], + 'ext': EXT_MAP.get(caption.get('Format')), }) + + # Deprioritize creator_caption json since it can't be embedded or used 
by media players + for lang, subs_list in subtitles.items(): + subtitles[lang] = sorted(subs_list, key=lambda x: preference(x['ext'])) + return subtitles def _parse_url_key(self, url_key): @@ -513,16 +542,12 @@ class TikTokBaseIE(InfoExtractor): **COMMON_FORMAT_INFO, 'format_id': 'download', 'url': self._proto_relative_url(download_url), + 'format_note': 'watermarked', + 'preference': -2, }) self._remove_duplicate_formats(formats) - for f in traverse_obj(formats, lambda _, v: 'unwatermarked' not in v['url']): - f.update({ - 'format_note': join_nonempty(f.get('format_note'), 'watermarked', delim=', '), - 'preference': f.get('preference') or -2, - }) - # Is it a slideshow with only audio for download? if not formats and traverse_obj(aweme_detail, ('music', 'playUrl', {url_or_none})): audio_url = aweme_detail['music']['playUrl'] @@ -536,7 +561,8 @@ class TikTokBaseIE(InfoExtractor): 'vcodec': 'none', }) - return formats + # Filter out broken formats, see https://github.com/yt-dlp/yt-dlp/issues/11034 + return [f for f in formats if urllib.parse.urlparse(f['url']).hostname != 'www.tiktok.com'] def _parse_aweme_video_web(self, aweme_detail, webpage_url, video_id, extract_flat=False): author_info = traverse_obj(aweme_detail, (('authorInfo', 'author', None), { @@ -1026,7 +1052,8 @@ class TikTokBaseListIE(TikTokBaseIE): # XXX: Conventionally, base classes shoul for retry in self.RetryManager(): try: post_list = self._call_api( - self._API_ENDPOINT, query, display_id, note=f'Downloading video list page {page}', + self._API_ENDPOINT, display_id, query=query, + note=f'Downloading video list page {page}', errnote='Unable to download video list') except ExtractorError as e: if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0: @@ -1441,9 +1468,11 @@ class TikTokLiveIE(TikTokBaseIE): if webpage: data = self._get_sigi_state(webpage, uploader or room_id) - room_id = (traverse_obj(data, ('UserModule', 'users', ..., 'roomId', {str_or_none}), get_all=False) - or 
self._search_regex(r'snssdk\d*://live\?room_id=(\d+)', webpage, 'room ID', default=None) - or room_id) + room_id = ( + traverse_obj(data, (( + ('LiveRoom', 'liveRoomUserInfo', 'user'), + ('UserModule', 'users', ...)), 'roomId', {str}, any)) + or self._search_regex(r'snssdk\d*://live\?room_id=(\d+)', webpage, 'room ID', default=room_id)) uploader = uploader or traverse_obj( data, ('LiveRoom', 'liveRoomUserInfo', 'user', 'uniqueId'), ('UserModule', 'users', ..., 'uniqueId'), get_all=False, expected_type=str) diff --git a/yt_dlp/extractor/toggle.py b/yt_dlp/extractor/toggle.py index de2e03f178..fbef7cc0f2 100644 --- a/yt_dlp/extractor/toggle.py +++ b/yt_dlp/extractor/toggle.py @@ -28,35 +28,11 @@ class ToggleIE(InfoExtractor): 'skip_download': 'm3u8 download', }, }, { - 'note': 'DRM-protected video', 'url': 'http://www.mewatch.sg/en/movies/dug-s-special-mission/341413', - 'info_dict': { - 'id': '341413', - 'ext': 'wvm', - 'title': 'Dug\'s Special Mission', - 'description': 'md5:e86c6f4458214905c1772398fabc93e0', - 'upload_date': '20150827', - 'timestamp': 1440644006, - }, - 'params': { - 'skip_download': 'DRM-protected wvm download', - }, + 'only_matching': True, }, { - # this also tests correct video id extraction - 'note': 'm3u8 links are geo-restricted, but Android/mp4 is okay', 'url': 'http://www.mewatch.sg/en/series/28th-sea-games-5-show/28th-sea-games-5-show-ep11/332861', - 'info_dict': { - 'id': '332861', - 'ext': 'mp4', - 'title': '28th SEA Games (5 Show) - Episode 11', - 'description': 'md5:3cd4f5f56c7c3b1340c50a863f896faa', - 'upload_date': '20150605', - 'timestamp': 1433480166, - }, - 'params': { - 'skip_download': 'DRM-protected wvm download', - }, - 'skip': 'm3u8 links are geo-restricted', + 'only_matching': True, }, { 'url': 'http://video.toggle.sg/en/clips/seraph-sun-aloysius-will-suddenly-sing-some-old-songs-in-high-pitch-on-set/343331', 'only_matching': True, diff --git a/yt_dlp/extractor/tubetugraz.py b/yt_dlp/extractor/tubetugraz.py index 
e13375f0a1..805e2686f7 100644 --- a/yt_dlp/extractor/tubetugraz.py +++ b/yt_dlp/extractor/tubetugraz.py @@ -21,7 +21,7 @@ class TubeTuGrazBaseIE(InfoExtractor): if not urlh: return - content, urlh = self._download_webpage_handle( + response = self._download_webpage_handle( urlh.url, None, fatal=False, headers={'referer': urlh.url}, note='logging in', errnote='unable to log in', data=urlencode_postdata({ @@ -30,7 +30,11 @@ class TubeTuGrazBaseIE(InfoExtractor): 'j_username': username, 'j_password': password, })) - if not urlh or urlh.url == 'https://tube.tugraz.at/paella/ui/index.html': + if not response: + return + + content, urlh = response + if urlh.url == 'https://tube.tugraz.at/paella/ui/index.html': return if not self._html_search_regex( @@ -39,7 +43,7 @@ class TubeTuGrazBaseIE(InfoExtractor): self.report_warning('unable to login: incorrect password') return - content, urlh = self._download_webpage_handle( + urlh = self._request_webpage( urlh.url, None, fatal=False, headers={'referer': urlh.url}, note='logging in with TFA', errnote='unable to log in with TFA', data=urlencode_postdata({ @@ -232,7 +236,7 @@ class TubeTuGrazSeriesIE(TubeTuGrazBaseIE): }, }, ], - 'min_playlist_count': 4, + 'playlist_mincount': 4, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/tubitv.py b/yt_dlp/extractor/tubitv.py index 9d9ddae720..694a92fcd4 100644 --- a/yt_dlp/extractor/tubitv.py +++ b/yt_dlp/extractor/tubitv.py @@ -6,6 +6,7 @@ from ..utils import ( ExtractorError, int_or_none, js_to_json, + strip_or_none, traverse_obj, url_or_none, urlencode_postdata, @@ -13,6 +14,7 @@ from ..utils import ( class TubiTvIE(InfoExtractor): + IE_NAME = 'tubitv' _VALID_URL = r'https?://(?:www\.)?tubitv\.com/(?P<type>video|movies|tv-shows)/(?P<id>\d+)' _LOGIN_URL = 'http://tubitv.com/login' _NETRC_MACHINE = 'tubitv' @@ -131,12 +133,12 @@ class TubiTvIE(InfoExtractor): return { 'id': video_id, - 'title': title, + 'title': strip_or_none(title), 'formats': formats, 'subtitles': 
subtitles, 'season_number': int_or_none(season_number), 'episode_number': int_or_none(episode_number), - 'episode': episode_title, + 'episode': strip_or_none(episode_title), **traverse_obj(video_data, { 'description': ('description', {str}), 'duration': ('duration', {int_or_none}), @@ -148,30 +150,54 @@ class TubiTvIE(InfoExtractor): class TubiTvShowIE(InfoExtractor): - _WORKING = False - _VALID_URL = r'https?://(?:www\.)?tubitv\.com/series/[0-9]+/(?P<show_name>[^/?#]+)' + IE_NAME = 'tubitv:series' + _VALID_URL = r'https?://(?:www\.)?tubitv\.com/series/\d+/(?P<show_name>[^/?#]+)(?:/season-(?P<season>\d+))?' _TESTS = [{ 'url': 'https://tubitv.com/series/3936/the-joy-of-painting-with-bob-ross?start=true', - 'playlist_mincount': 390, + 'playlist_mincount': 389, 'info_dict': { 'id': 'the-joy-of-painting-with-bob-ross', }, + }, { + 'url': 'https://tubitv.com/series/2311/the-saddle-club/season-1', + 'playlist_count': 26, + 'info_dict': { + 'id': 'the-saddle-club-season-1', + }, + }, { + 'url': 'https://tubitv.com/series/2311/the-saddle-club/season-3', + 'playlist_count': 19, + 'info_dict': { + 'id': 'the-saddle-club-season-3', + }, + }, { + 'url': 'https://tubitv.com/series/2311/the-saddle-club/', + 'playlist_mincount': 71, + 'info_dict': { + 'id': 'the-saddle-club', + }, }] - def _entries(self, show_url, show_name): - show_webpage = self._download_webpage(show_url, show_name) + def _entries(self, show_url, playlist_id, selected_season): + webpage = self._download_webpage(show_url, playlist_id) + + data = self._search_json( + r'window\.__data\s*=', webpage, 'data', playlist_id, + transform_source=js_to_json)['video'] - show_json = self._parse_json(self._search_regex( - r'window\.__data\s*=\s*({[^<]+});\s*</script>', - show_webpage, 'data'), show_name, transform_source=js_to_json)['video'] + # v['number'] is already a decimal string, but stringify to protect against API changes + path = [lambda _, v: str(v['number']) == selected_season] if selected_season else [..., 
{dict}] - for episode_id in show_json['fullContentById']: - if traverse_obj(show_json, ('byId', episode_id, 'type')) == 's': - continue - yield self.url_result( - f'https://tubitv.com/tv-shows/{episode_id}/', - ie=TubiTvIE.ie_key(), video_id=episode_id) + for season in traverse_obj(data, ('byId', lambda _, v: v['type'] == 's', 'seasons', *path)): + season_number = int_or_none(season.get('number')) + for episode in traverse_obj(season, ('episodes', lambda _, v: v['id'])): + episode_id = episode['id'] + yield self.url_result( + f'https://tubitv.com/tv-shows/{episode_id}/', TubiTvIE, episode_id, + season_number=season_number, episode_number=int_or_none(episode.get('num'))) def _real_extract(self, url): - show_name = self._match_valid_url(url).group('show_name') - return self.playlist_result(self._entries(url, show_name), playlist_id=show_name) + playlist_id, selected_season = self._match_valid_url(url).group('show_name', 'season') + if selected_season: + playlist_id = f'{playlist_id}-season-{selected_season}' + return self.playlist_result(self._entries(url, playlist_id, selected_season), playlist_id) diff --git a/yt_dlp/extractor/tv5mondeplus.py b/yt_dlp/extractor/tv5mondeplus.py index 52ff230f2a..953eb77ede 100644 --- a/yt_dlp/extractor/tv5mondeplus.py +++ b/yt_dlp/extractor/tv5mondeplus.py @@ -96,7 +96,7 @@ class TV5MondePlusIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + webpage = self._download_webpage(url, display_id, impersonate=True) if ">Ce programme n'est malheureusement pas disponible pour votre zone géographique.<" in webpage: self.raise_geo_restricted(countries=['FR']) @@ -122,8 +122,9 @@ class TV5MondePlusIE(InfoExtractor): if not token: continue deferred_json = self._download_json( - f'https://api.tv5monde.com/player/asset/{d_param}/resolve?condenseKS=true', display_id, - note='Downloading deferred info', headers={'Authorization': f'Bearer {token}'}, fatal=False) + 
f'https://api.tv5monde.com/player/asset/{d_param}/resolve?condenseKS=true', + display_id, 'Downloading deferred info', fatal=False, impersonate=True, + headers={'Authorization': f'Bearer {token}'}) v_url = traverse_obj(deferred_json, (0, 'url', {url_or_none})) if not v_url: continue diff --git a/yt_dlp/extractor/tva.py b/yt_dlp/extractor/tva.py index e3e10557c2..d702640f33 100644 --- a/yt_dlp/extractor/tva.py +++ b/yt_dlp/extractor/tva.py @@ -1,60 +1,29 @@ import functools import re +from .brightcove import BrightcoveNewIE from .common import InfoExtractor from ..utils import float_or_none, int_or_none, smuggle_url, strip_or_none from ..utils.traversal import traverse_obj class TVAIE(InfoExtractor): - _VALID_URL = r'https?://videos?\.tva\.ca/details/_(?P<id>\d+)' + IE_NAME = 'tvaplus' + IE_DESC = 'TVA+' + _VALID_URL = r'https?://(?:www\.)?tvaplus\.ca/(?:[^/?#]+/)*[\w-]+-(?P<id>\d+)(?:$|[#?])' _TESTS = [{ - 'url': 'https://videos.tva.ca/details/_5596811470001', - 'info_dict': { - 'id': '5596811470001', - 'ext': 'mp4', - 'title': 'Un extrait de l\'épisode du dimanche 8 octobre 2017 !', - 'uploader_id': '5481942443001', - 'upload_date': '20171003', - 'timestamp': 1507064617, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'skip': 'HTTP Error 404: Not Found', - }, { - 'url': 'https://video.tva.ca/details/_5596811470001', - 'only_matching': True, - }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5481942443001/default_default/index.html?videoId=%s' - - def _real_extract(self, url): - video_id = self._match_id(url) - - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {'geo_countries': ['CA']}), - 'ie_key': 'BrightcoveNew', - } - - -class QubIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?qub\.ca/(?:[^/]+/)*[0-9a-z-]+-(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://www.qub.ca/tvaplus/tva/alerte-amber/saison-1/episode-01-1000036619', + 'url': 
'https://www.tvaplus.ca/tva/alerte-amber/saison-1/episode-01-1000036619', 'md5': '949490fd0e7aee11d0543777611fbd53', 'info_dict': { 'id': '6084352463001', 'ext': 'mp4', - 'title': 'Ép 01. Mon dernier jour', + 'title': 'Mon dernier jour', 'uploader_id': '5481942443001', 'upload_date': '20190907', 'timestamp': 1567899756, 'description': 'md5:9c0d7fbb90939420c651fd977df90145', 'thumbnail': r're:https://.+\.jpg', - 'episode': 'Ép 01. Mon dernier jour', + 'episode': 'Mon dernier jour', 'episode_number': 1, 'tags': ['alerte amber', 'alerte amber saison 1', 'surdemande'], 'duration': 2625.963, @@ -64,23 +33,36 @@ class QubIE(InfoExtractor): 'channel': 'TVA', }, }, { - 'url': 'https://www.qub.ca/tele/video/lcn-ca-vous-regarde-rev-30s-ap369664-1009357943', - 'only_matching': True, + 'url': 'https://www.tvaplus.ca/tva/le-baiser-du-barbu/le-baiser-du-barbu-886644190', + 'info_dict': { + 'id': '6354448043112', + 'ext': 'mp4', + 'title': 'Le Baiser du barbu', + 'uploader_id': '5481942443001', + 'upload_date': '20240606', + 'timestamp': 1717694023, + 'description': 'md5:025b1219086c1cbf4bc27e4e034e8b57', + 'thumbnail': r're:https://.+\.jpg', + 'episode': 'Le Baiser du barbu', + 'tags': ['fullepisode', 'films'], + 'duration': 6053.504, + 'series': 'Le Baiser du barbu', + 'channel': 'TVA', + }, }] - # reference_id also works with old account_id(5481942443001) - # BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5813221784001/default_default/index.html?videoId=ref:%s' + _BC_URL_TMPL = 'https://players.brightcove.net/5481942443001/default_default/index.html?videoId={}' def _real_extract(self, url): entity_id = self._match_id(url) webpage = self._download_webpage(url, entity_id) - entity = self._search_nextjs_data(webpage, entity_id)['props']['initialProps']['pageProps']['fallbackData'] + entity = self._search_nextjs_data(webpage, entity_id)['props']['pageProps']['staticEntity'] video_id = entity['videoId'] episode = strip_or_none(entity.get('name')) return { '_type': 
'url_transparent', - 'url': f'https://videos.tva.ca/details/_{video_id}', - 'ie_key': TVAIE.ie_key(), + 'url': smuggle_url(self._BC_URL_TMPL.format(video_id), {'geo_countries': ['CA']}), + 'ie_key': BrightcoveNewIE.ie_key(), 'id': video_id, 'title': episode, 'episode': episode, diff --git a/yt_dlp/extractor/tver.py b/yt_dlp/extractor/tver.py index 8105db41cd..a8865fe649 100644 --- a/yt_dlp/extractor/tver.py +++ b/yt_dlp/extractor/tver.py @@ -6,11 +6,12 @@ from ..utils import ( str_or_none, strip_or_none, traverse_obj, + update_url_query, ) class TVerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?:(?P<type>lp|corner|series|episodes?|feature|tokyo2020/video)/)+(?P<id>[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?:(?P<type>lp|corner|series|episodes?|feature)/)+(?P<id>[a-zA-Z0-9]+)' _TESTS = [{ 'skip': 'videos are only available for 7 days', 'url': 'https://tver.jp/episodes/ep83nf3w4p', @@ -21,6 +22,10 @@ class TVerIE(InfoExtractor): 'episode': '売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!', 'alt_title': '売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!', 'channel': 'テレビ朝日', + 'id': 'ep83nf3w4p', + 'ext': 'mp4', + 'onair_label': '5月3日(火)放送分', + 'ext_title': '家事ヤロウ!!! 売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着! 
テレビ朝日 5月3日(火)放送分', }, 'add_ie': ['BrightcoveNew'], }, { @@ -29,50 +34,103 @@ class TVerIE(InfoExtractor): }, { 'url': 'https://tver.jp/lp/f0033031', 'only_matching': True, + }, { + 'url': 'https://tver.jp/series/srtxft431v', + 'info_dict': { + 'id': 'srtxft431v', + 'title': '名探偵コナン', + }, + 'playlist': [ + { + 'md5': '779ffd97493ed59b0a6277ea726b389e', + 'info_dict': { + 'id': 'ref:conan-1137-241005', + 'ext': 'mp4', + 'title': '名探偵コナン #1137「行列店、味変の秘密」', + 'uploader_id': '5330942432001', + 'tags': [], + 'channel': '読売テレビ', + 'series': '名探偵コナン', + 'description': 'md5:601fccc1d2430d942a2c8068c4b33eb5', + 'episode': '#1137「行列店、味変の秘密」', + 'duration': 1469.077, + 'timestamp': 1728030405, + 'upload_date': '20241004', + 'alt_title': '名探偵コナン #1137「行列店、味変の秘密」 読売テレビ 10月5日(土)放送分', + 'thumbnail': r're:https://.+\.jpg', + }, + }], + }, { + 'url': 'https://tver.jp/series/sru35hwdd2', + 'info_dict': { + 'id': 'sru35hwdd2', + 'title': '神回だけ見せます!', + }, + 'playlist_count': 11, + }, { + 'url': 'https://tver.jp/series/srkq2shp9d', + 'only_matching': True, }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' - _PLATFORM_UID = None - _PLATFORM_TOKEN = None + _HEADERS = {'x-tver-platform-type': 'web'} + _PLATFORM_QUERY = {} def _real_initialize(self): - create_response = self._download_json( - 'https://platform-api.tver.jp/v2/api/platform_users/browser/create', None, - note='Creating session', data=b'device_type=pc', headers={ - 'Origin': 'https://s.tver.jp', - 'Referer': 'https://s.tver.jp/', - 'Content-Type': 'application/x-www-form-urlencoded', + session_info = self._download_json( + 'https://platform-api.tver.jp/v2/api/platform_users/browser/create', + None, 'Creating session', data=b'device_type=pc') + self._PLATFORM_QUERY = traverse_obj(session_info, ('result', { + 'platform_uid': 'platform_uid', + 'platform_token': 'platform_token', + })) + + def _call_platform_api(self, path, video_id, note=None, fatal=True, query=None): + return 
self._download_json( + f'https://platform-api.tver.jp/service/api/{path}', video_id, note, + fatal=fatal, headers=self._HEADERS, query={ + **self._PLATFORM_QUERY, + **(query or {}), }) - self._PLATFORM_UID = traverse_obj(create_response, ('result', 'platform_uid')) - self._PLATFORM_TOKEN = traverse_obj(create_response, ('result', 'platform_token')) + + def _yield_episode_ids_for_series(self, series_id): + seasons_info = self._download_json( + f'https://service-api.tver.jp/api/v1/callSeriesSeasons/{series_id}', + series_id, 'Downloading seasons info', headers=self._HEADERS) + for season_id in traverse_obj( + seasons_info, ('result', 'contents', lambda _, v: v['type'] == 'season', 'content', 'id', {str})): + episodes_info = self._call_platform_api( + f'v1/callSeasonEpisodes/{season_id}', series_id, f'Downloading season {season_id} episodes info') + yield from traverse_obj(episodes_info, ( + 'result', 'contents', lambda _, v: v['type'] == 'episode', 'content', 'id', {str})) def _real_extract(self, url): video_id, video_type = self._match_valid_url(url).group('id', 'type') - if video_type not in {'series', 'episodes'}: + + if video_type == 'series': + series_info = self._call_platform_api( + f'v2/callSeries/{video_id}', video_id, 'Downloading series info') + return self.playlist_from_matches( + self._yield_episode_ids_for_series(video_id), video_id, + traverse_obj(series_info, ('result', 'content', 'content', 'title', {str})), + ie=TVerIE, getter=lambda x: f'https://tver.jp/episodes/{x}') + + if video_type != 'episodes': webpage = self._download_webpage(url, video_id, note='Resolving to new URL') video_id = self._match_id(self._search_regex( (r'canonical"\s*href="(https?://tver\.jp/[^"]+)"', r'&link=(https?://tver\.jp/[^?&]+)[?&]'), webpage, 'url regex')) - episode_info = self._download_json( - f'https://platform-api.tver.jp/service/api/v1/callEpisode/{video_id}?require_data=mylist,later[epefy106ur],good[epefy106ur],resume[epefy106ur]', - video_id, fatal=False, - 
query={ - 'platform_uid': self._PLATFORM_UID, - 'platform_token': self._PLATFORM_TOKEN, - }, headers={ - 'x-tver-platform-type': 'web', + episode_info = self._call_platform_api( + f'v1/callEpisode/{video_id}', video_id, 'Downloading episode info', fatal=False, query={ + 'require_data': 'mylist,later[epefy106ur],good[epefy106ur],resume[epefy106ur]', }) episode_content = traverse_obj( episode_info, ('result', 'episode', 'content')) or {} + version = traverse_obj(episode_content, ('version', {str_or_none}), default='5') video_info = self._download_json( - f'https://statics.tver.jp/content/episode/{video_id}.json', video_id, - query={ - 'v': str_or_none(episode_content.get('version')) or '5', - }, headers={ - 'Origin': 'https://tver.jp', - 'Referer': 'https://tver.jp/', - }) + f'https://statics.tver.jp/content/episode/{video_id}.json', video_id, 'Downloading video info', + query={'v': version}, headers={'Referer': 'https://tver.jp/'}) p_id = video_info['video']['accountID'] r_id = traverse_obj(video_info, ('video', ('videoRefID', 'videoID')), get_all=False) if not r_id: @@ -88,6 +146,23 @@ class TVerIE(InfoExtractor): provider = str_or_none(episode_content.get('productionProviderName')) onair_label = str_or_none(episode_content.get('broadcastDateLabel')) + thumbnails = [ + { + 'id': quality, + 'url': update_url_query( + f'https://statics.tver.jp/images/content/thumbnail/episode/{quality}/{video_id}.jpg', + {'v': version}), + 'width': width, + 'height': height, + } + for quality, width, height in [ + ('small', 480, 270), + ('medium', 640, 360), + ('large', 960, 540), + ('xlarge', 1280, 720), + ] + ] + return { '_type': 'url_transparent', 'title': title, @@ -97,6 +172,7 @@ class TVerIE(InfoExtractor): 'alt_title': join_nonempty(title, provider, onair_label, delim=' '), 'channel': provider, 'description': str_or_none(video_info.get('description')), + 'thumbnails': thumbnails, 'url': smuggle_url( self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id), {'geo_countries': ['JP']}), 
'ie_key': 'BrightcoveNew', diff --git a/yt_dlp/extractor/tvn24.py b/yt_dlp/extractor/tvn24.py index 0dc43a9d47..a0590e4f70 100644 --- a/yt_dlp/extractor/tvn24.py +++ b/yt_dlp/extractor/tvn24.py @@ -8,7 +8,7 @@ from ..utils import ( class TVN24IE(InfoExtractor): _WORKING = False - _VALID_URL = r'https?://(?:(?:[^/]+)\.)?tvn24(?:bis)?\.pl/(?:[^/]+/)*(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:(?!eurosport)[^/]+\.)?tvn24(?:bis)?\.pl/(?:[^/?#]+/)*(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'http://www.tvn24.pl/wiadomosci-z-kraju,3/oredzie-artura-andrusa,702428.html', 'md5': 'fbdec753d7bc29d96036808275f2130c', diff --git a/yt_dlp/extractor/twitcasting.py b/yt_dlp/extractor/twitcasting.py index 53b4084694..bf9c6348cb 100644 --- a/yt_dlp/extractor/twitcasting.py +++ b/yt_dlp/extractor/twitcasting.py @@ -270,7 +270,7 @@ class TwitCastingLiveIE(InfoExtractor): class TwitCastingUserIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/?#]+\.)?twitcasting\.tv/(?P<id>[^/?#]+)/(:?show|archive)/?(?:[#?]|$)' + _VALID_URL = r'https?://(?:[^/?#]+\.)?twitcasting\.tv/(?P<id>[^/?#]+)/(?:show|archive)/?(?:[#?]|$)' _TESTS = [{ 'url': 'https://twitcasting.tv/natsuiromatsuri/archive/', 'info_dict': { diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index 4ed48ec5ac..8196ce6c32 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -14,6 +14,7 @@ from ..utils import ( float_or_none, format_field, int_or_none, + join_nonempty, make_archive_id, remove_end, str_or_none, @@ -107,7 +108,7 @@ class TwitterBaseIE(InfoExtractor): tbr = int_or_none(dict_get(variant, ('bitrate', 'bit_rate')), 1000) or None f = { 'url': variant_url, - 'format_id': 'http' + (f'-{tbr}' if tbr else ''), + 'format_id': join_nonempty('http', tbr), 'tbr': tbr, } self._search_dimensions_in_video_url(f, variant_url) @@ -149,14 +150,6 @@ class TwitterBaseIE(InfoExtractor): def is_logged_in(self): return bool(self._get_cookies(self._API_BASE).get('auth_token')) - # XXX: Temporary workaround 
until twitter.com => x.com migration is completed - def _real_initialize(self): - if self.is_logged_in or not self._get_cookies('https://twitter.com/').get('auth_token'): - return - # User has not yet been migrated to x.com and has passed twitter.com cookies - TwitterBaseIE._API_BASE = 'https://api.twitter.com/1.1/' - TwitterBaseIE._GRAPHQL_API_BASE = 'https://twitter.com/i/api/graphql/' - @functools.cached_property def _selected_api(self): return self._configuration_arg('api', ['graphql'], ie_key='Twitter')[0] @@ -933,14 +926,13 @@ class TwitterIE(TwitterBaseIE): 'uploader_id': 'MoniqueCamarra', 'live_status': 'was_live', 'release_timestamp': 1658417414, - 'description': 'md5:acce559345fd49f129c20dbcda3f1201', + 'description': r're:Twitter Space participated by Sergej Sumlenny.+', 'timestamp': 1658407771, 'release_date': '20220721', 'upload_date': '20220721', }, 'add_ie': ['TwitterSpaces'], 'params': {'skip_download': 'm3u8'}, - 'skip': 'Requires authentication', }, { # URL specifies video number but --yes-playlist 'url': 'https://twitter.com/CTVJLaidlaw/status/1600649710662213632/video/1', @@ -1763,7 +1755,7 @@ class TwitterSpacesIE(TwitterBaseIE): 'release_timestamp': 1659904215, 'release_date': '20220807', }, - 'params': {'skip_download': 'm3u8'}, + 'skip': 'No longer available', }, { # post_live/TimedOut but downloadable 'url': 'https://twitter.com/i/spaces/1vAxRAVQWONJl', @@ -1779,6 +1771,8 @@ class TwitterSpacesIE(TwitterBaseIE): 'upload_date': '20230413', 'release_timestamp': 1681839000, 'release_date': '20230418', + 'protocol': 'm3u8', # ffmpeg is forced + 'container': 'm4a_dash', # audio-only format fixup is applied }, 'params': {'skip_download': 'm3u8'}, }, { @@ -1789,11 +1783,31 @@ class TwitterSpacesIE(TwitterBaseIE): 'ext': 'm4a', 'title': 'あ', 'description': 'Twitter Space participated by nobody yet', - 'uploader': '息根とめる🔪Twitchで復活', + 'uploader': '息根とめる', 'uploader_id': 'tomeru_ikinone', 'live_status': 'was_live', 'timestamp': 1685617198, 
'upload_date': '20230601', + 'protocol': 'm3u8', # ffmpeg is forced + 'container': 'm4a_dash', # audio-only format fixup is applied + }, + 'params': {'skip_download': 'm3u8'}, + }, { + # Video Space + 'url': 'https://x.com/i/spaces/1DXGydznBYWKM', + 'info_dict': { + 'id': '1DXGydznBYWKM', + 'ext': 'mp4', + 'title': 'America and Israel’s “special relationship”', + 'description': 'Twitter Space participated by nobody yet', + 'uploader': 'Candace Owens', + 'uploader_id': 'RealCandaceO', + 'live_status': 'was_live', + 'timestamp': 1723931351, + 'upload_date': '20240817', + 'release_timestamp': 1723932000, + 'release_date': '20240817', + 'protocol': 'm3u8_native', # not ffmpeg, detected as video space }, 'params': {'skip_download': 'm3u8'}, }] @@ -1833,8 +1847,6 @@ class TwitterSpacesIE(TwitterBaseIE): def _real_extract(self, url): space_id = self._match_id(url) - if not self.is_logged_in: - self.raise_login_required('Twitter Spaces require authentication') space_data = self._call_graphql_api('HPEisOmj1epUNLCWTYhUWw/AudioSpaceById', space_id)['audioSpace'] if not space_data: raise ExtractorError('Twitter Space not found', expected=True) @@ -1853,13 +1865,17 @@ class TwitterSpacesIE(TwitterBaseIE): source = traverse_obj( self._call_api(f'live_video_stream/status/{metadata["media_key"]}', metadata['media_key']), ('source', ('noRedirectPlaybackUrl', 'location'), {url_or_none}), get_all=False) - formats = self._extract_m3u8_formats( # XXX: Some Spaces need ffmpeg as downloader - source, metadata['media_key'], 'm4a', entry_protocol='m3u8', live=is_live, - headers=headers, fatal=False) if source else [] - for fmt in formats: - fmt.update({'vcodec': 'none', 'acodec': 'aac'}) - if not is_live: - fmt['container'] = 'm4a_dash' + is_audio_space = source and 'audio-space' in source + formats = self._extract_m3u8_formats( + source, metadata['media_key'], 'm4a' if is_audio_space else 'mp4', + # XXX: Some audio-only Spaces need ffmpeg as downloader + entry_protocol='m3u8' if 
is_audio_space else 'm3u8_native', + live=is_live, headers=headers, fatal=False) if source else [] + if is_audio_space: + for fmt in formats: + fmt.update({'vcodec': 'none', 'acodec': 'aac'}) + if not is_live: + fmt['container'] = 'm4a_dash' participants = ', '.join(traverse_obj( space_data, ('participants', 'speakers', ..., 'display_name'))) or 'nobody yet' diff --git a/yt_dlp/extractor/unsupported.py b/yt_dlp/extractor/unsupported.py index 1e2d118aa6..8b7ec1dd96 100644 --- a/yt_dlp/extractor/unsupported.py +++ b/yt_dlp/extractor/unsupported.py @@ -49,6 +49,7 @@ class KnownDRMIE(UnsupportedInfoExtractor): r'amazon\.(?:\w{2}\.)?\w+/gp/video', r'music\.amazon\.(?:\w{2}\.)?\w+', r'(?:watch|front)\.njpwworld\.com', + r'qub\.ca/vrai', ) _TESTS = [{ @@ -149,6 +150,9 @@ class KnownDRMIE(UnsupportedInfoExtractor): }, { 'url': 'https://front.njpwworld.com/p/s_series_00563_16_bs', 'only_matching': True, + }, { + 'url': 'https://www.qub.ca/vrai/l-effet-bocuse-d-or/saison-1/l-effet-bocuse-d-or-saison-1-bande-annonce-1098225063', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/ustream.py b/yt_dlp/extractor/ustream.py index 33cf8f454d..0fdf8f7484 100644 --- a/yt_dlp/extractor/ustream.py +++ b/yt_dlp/extractor/ustream.py @@ -73,7 +73,7 @@ class UstreamIE(InfoExtractor): def num_to_hex(n): return hex(n)[2:] - rnd = random.randrange + rnd = lambda x: random.randrange(int(x)) if not extra_note: extra_note = '' diff --git a/yt_dlp/extractor/veoh.py b/yt_dlp/extractor/veoh.py index dc1bf96ec6..aac768f3c6 100644 --- a/yt_dlp/extractor/veoh.py +++ b/yt_dlp/extractor/veoh.py @@ -8,7 +8,8 @@ from ..utils import ( int_or_none, parse_duration, qualities, - try_get, + remove_start, + strip_or_none, ) @@ -108,7 +109,7 @@ class VeohIE(InfoExtractor): categories = metadata.get('categoryPath') if not categories: - category = try_get(video, lambda x: x['category'].strip().removeprefix('category_')) + category = 
remove_start(strip_or_none(video.get('category')), 'category_') categories = [category] if category else None tags = video.get('tags') diff --git a/yt_dlp/extractor/vidflex.py b/yt_dlp/extractor/vidflex.py new file mode 100644 index 0000000000..ce0880b472 --- /dev/null +++ b/yt_dlp/extractor/vidflex.py @@ -0,0 +1,148 @@ +import base64 +import json + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + join_nonempty, + mimetype2ext, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class VidflexIE(InfoExtractor): + _DOMAINS_RE = [ + r'[^.]+\.vidflex\.tv', + r'(?:www\.)?acactv\.ca', + r'(?:www\.)?albertalacrossetv\.com', + r'(?:www\.)?cjfltv\.com', + r'(?:www\.)?figureitoutbaseball\.com', + r'(?:www\.)?ocaalive\.com', + r'(?:www\.)?pegasussports\.tv', + r'(?:www\.)?praxisseries\.ca', + r'(?:www\.)?silenticetv\.com', + r'(?:www\.)?tuffhedemantv\.com', + r'(?:www\.)?watchfuntv\.com', + r'live\.ofsaa\.on\.ca', + r'tv\.procoro\.ca', + r'tv\.realcastmedia\.net', + r'tv\.fringetheatre\.ca', + r'video\.haisla\.ca', + r'video\.hockeycanada\.ca', + r'video\.huuayaht\.org', + r'video\.turningpointensemble\.ca', + r'videos\.livingworks\.net', + r'videos\.telusworldofscienceedmonton\.ca', + r'watch\.binghamtonbulldogs\.com', + r'watch\.rekindle\.tv', + r'watch\.wpca\.com', + ] + _VALID_URL = rf'https?://(?:{"|".join(_DOMAINS_RE)})/[a-z]{{2}}(?:-[a-z]{{2}})?/c/[\w-]+\.(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://video.hockeycanada.ca/en/c/nwt-micd-up-with-jamie-lee-rattray.107486', + 'only_matching': True, + }, { + # m3u8 + https + 'url': 'https://video.hockeycanada.ca/en-us/c/nwt-micd-up-with-jamie-lee-rattray.107486', + 'info_dict': { + 'id': '107486', + 'title': 'NWT: Mic’d up with Jamie Lee Rattray', + 'ext': 'mp4', + 'duration': 115, + 'timestamp': 1634310409, + 'upload_date': '20211015', + 'tags': ['English', '2021', "National Women's Team"], + 'description': 'md5:efb1cf6165b48cc3f5555c4262dd5b23', + 'thumbnail': 
r're:^https?://wpmedia01-a\.akamaihd\.net/en/asset/public/image/.+', + }, + 'params': {'skip_download': True}, + }, { + 'url': 'https://video.hockeycanada.ca/en/c/mwc-remembering-the-wild-ride-in-riga.112307', + 'info_dict': { + 'id': '112307', + 'title': 'MWC: Remembering the wild ride in Riga', + 'ext': 'mp4', + 'duration': 322, + 'timestamp': 1716235607, + 'upload_date': '20240520', + 'tags': ['English', '2024', "National Men's Team", 'IIHF World Championship', 'Fan'], + 'description': r're:.+Canada’s National Men’s Team.+', + 'thumbnail': r're:^https?://wpmedia01-a\.akamaihd\.net/en/asset/public/image/.+', + }, + 'params': {'skip_download': True}, + }, { + # the same video in French + 'url': 'https://video.hockeycanada.ca/fr/c/cmm-retour-sur-un-parcours-endiable-a-riga.112304', + 'info_dict': { + 'id': '112304', + 'title': 'CMM : Retour sur un parcours endiablé à Riga', + 'ext': 'mp4', + 'duration': 322, + 'timestamp': 1716235545, + 'upload_date': '20240520', + 'tags': ['French', '2024', "National Men's Team", 'IIHF World Championship', 'Fan'], + 'description': 'md5:cf825222882a3dab1cd62cffcf3b4d1f', + 'thumbnail': r're:^https?://wpmedia01-a\.akamaihd\.net/en/asset/public/image/.+', + }, + 'params': {'skip_download': True}, + }, { + 'url': 'https://myfbcgreenville.vidflex.tv/en/c/may-12th-2024.658', + 'only_matching': True, + }, { + 'url': 'https://www.figureitoutbaseball.com/en/c/fiob-podcast-14-dan-bertolini-ncaa-d1-head-coach-recorded-11-29-2018.1367', + 'only_matching': True, + }, { + 'url': 'https://videos.telusworldofscienceedmonton.ca/en/c/the-aurora-project-timelapse-4.577', + 'only_matching': True, + }, { + 'url': 'https://www.tuffhedemantv.com/en/c/2022-tuff-hedeman-tour-hobbs-nm-january-22.227', + 'only_matching': True, + }, { + 'url': 'https://www.albertalacrossetv.com/en/c/up-floor-ground-balls-one-more.3449', + 'only_matching': True, + }, { + 'url': 'https://www.silenticetv.com/en/c/jp-unlocked-day-in-the-life-of-langley-ha-15u.5197', + 
'only_matching': True, + }, { + 'url': 'https://jphl.vidflex.tv/en/c/jp-unlocked-day-in-the-life-of-langley-ha-15u.5197', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + data_url = self._html_search_regex( + r'content_api:\s*(["\'])(?P<url>https?://(?:(?!\1).)+)\1', webpage, 'content api url', group='url') + media_config = traverse_obj( + self._download_json(data_url, video_id), + ('config', {base64.b64decode}, {bytes.decode}, {json.loads}, {dict})) + + return { + 'id': video_id, + 'formats': list(self._yield_formats(media_config, video_id)), + **self._search_json_ld( + webpage.replace('/*<![CDATA[*/', '').replace('/*]]>*/', ''), video_id), + } + + def _yield_formats(self, media_config, video_id): + for media_source in traverse_obj(media_config, ('media', 'source', lambda _, v: url_or_none(v['src']))): + media_url = media_source['src'] + media_type = mimetype2ext(media_source.get('type')) + + if media_type == 'm3u8': + yield from self._extract_m3u8_formats(media_url, video_id, fatal=False, m3u8_id='hls') + elif media_type == 'mp4': + bitrate = self._search_regex(r'_(\d+)k\.mp4', media_url, 'bitrate', default=None) + yield { + 'format_id': join_nonempty('http', bitrate), + 'url': media_url, + 'ext': 'mp4', + 'tbr': int_or_none(bitrate), + } + else: + yield { + 'url': media_url, + 'ext': media_type, + } diff --git a/yt_dlp/extractor/vidyard.py b/yt_dlp/extractor/vidyard.py new file mode 100644 index 0000000000..20a54b1618 --- /dev/null +++ b/yt_dlp/extractor/vidyard.py @@ -0,0 +1,426 @@ +import functools +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + float_or_none, + int_or_none, + join_nonempty, + mimetype2ext, + parse_resolution, + str_or_none, + unescapeHTML, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class VidyardBaseIE(InfoExtractor): + _HEADERS = {'Referer': 'https://play.vidyard.com/'} + 
+ def _get_formats_and_subtitles(self, sources, video_id): + formats, subtitles = [], {} + + def add_hls_fmts_and_subs(m3u8_url): + fmts, subs = self._extract_m3u8_formats_and_subtitles( + m3u8_url, video_id, 'mp4', m3u8_id='hls', headers=self._HEADERS, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + hls_list = isinstance(sources, dict) and sources.pop('hls', None) + if master_m3u8_url := traverse_obj( + hls_list, (lambda _, v: v['profile'] == 'auto', 'url', {url_or_none}, any)): + add_hls_fmts_and_subs(master_m3u8_url) + if not formats: # These are duplicate and unnecesary requests if we got 'auto' hls fmts + for variant_m3u8_url in traverse_obj(hls_list, (..., 'url', {url_or_none})): + add_hls_fmts_and_subs(variant_m3u8_url) + + for source_type, source_list in traverse_obj(sources, ({dict.items}, ...)): + for source in traverse_obj(source_list, lambda _, v: url_or_none(v['url'])): + profile = source.get('profile') + formats.append({ + 'url': source['url'], + 'ext': mimetype2ext(source.get('mimeType'), default=None), + 'format_id': join_nonempty('http', source_type, profile), + **parse_resolution(profile), + }) + + self._remove_duplicate_formats(formats) + return formats, subtitles + + def _get_direct_subtitles(self, caption_json): + subs = {} + for caption in traverse_obj(caption_json, lambda _, v: url_or_none(v['vttUrl'])): + subs.setdefault(caption.get('language') or 'und', []).append({ + 'url': caption['vttUrl'], + 'name': caption.get('name'), + }) + + return subs + + def _fetch_video_json(self, video_id): + return self._download_json( + f'https://play.vidyard.com/player/{video_id}.json', video_id)['payload'] + + def _process_video_json(self, json_data, video_id): + formats, subtitles = self._get_formats_and_subtitles(json_data['sources'], video_id) + self._merge_subtitles(self._get_direct_subtitles(json_data.get('captions')), target=subtitles) + + return { + **traverse_obj(json_data, { + 'id': ('facadeUuid', {str}), + 
'display_id': ('videoId', {int}, {str_or_none}), + 'title': ('name', {str}), + 'description': ('description', {str}, {unescapeHTML}, {lambda x: x or None}), + 'duration': (( + ('milliseconds', {functools.partial(float_or_none, scale=1000)}), + ('seconds', {int_or_none})), any), + 'thumbnails': ('thumbnailUrls', ('small', 'normal'), {'url': {url_or_none}}), + 'tags': ('tags', ..., 'name', {str}), + }), + 'formats': formats, + 'subtitles': subtitles, + 'http_headers': self._HEADERS, + } + + +class VidyardIE(VidyardBaseIE): + _VALID_URL = [ + r'https?://[\w-]+(?:\.hubs)?\.vidyard\.com/watch/(?P<id>[\w-]+)', + r'https?://(?:embed|share)\.vidyard\.com/share/(?P<id>[\w-]+)', + r'https?://play\.vidyard\.com/(?:player/)?(?P<id>[\w-]+)', + ] + _EMBED_REGEX = [r'<iframe[^>]* src=["\'](?P<url>(?:https?:)?//play\.vidyard\.com/[\w-]+)'] + _TESTS = [{ + 'url': 'https://vyexample03.hubs.vidyard.com/watch/oTDMPlUv--51Th455G5u7Q', + 'info_dict': { + 'id': 'oTDMPlUv--51Th455G5u7Q', + 'display_id': '50347', + 'ext': 'mp4', + 'title': 'Homepage Video', + 'description': 'Look I changed the description.', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/50347/OUPa5LTKV46849sLYngMqQ_small.jpg', + 'duration': 99, + 'tags': ['these', 'are', 'all', 'tags'], + }, + }, { + 'url': 'https://share.vidyard.com/watch/PaQzDAT1h8JqB8ivEu2j6Y?', + 'info_dict': { + 'id': 'PaQzDAT1h8JqB8ivEu2j6Y', + 'display_id': '9281024', + 'ext': 'mp4', + 'title': 'Inline Embed', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/spacer.gif', + 'duration': 41.186, + }, + }, { + 'url': 'https://embed.vidyard.com/share/oTDMPlUv--51Th455G5u7Q', + 'info_dict': { + 'id': 'oTDMPlUv--51Th455G5u7Q', + 'display_id': '50347', + 'ext': 'mp4', + 'title': 'Homepage Video', + 'description': 'Look I changed the description.', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/50347/OUPa5LTKV46849sLYngMqQ_small.jpg', + 'duration': 99, + 'tags': ['these', 'are', 'all', 'tags'], + }, + }, { + # First video from playlist below + 
'url': 'https://embed.vidyard.com/share/SyStyHtYujcBHe5PkZc5DL', + 'info_dict': { + 'id': 'SyStyHtYujcBHe5PkZc5DL', + 'display_id': '41974005', + 'ext': 'mp4', + 'title': 'Prepare the Frame and Track for Palm Beach Polysatin Shutters With BiFold Track', + 'description': r're:In this video, you will learn how to prepare the frame.+', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/41974005/IJw7oCaJcF1h7WWu3OVZ8A_small.png', + 'duration': 258.666, + }, + }, { + # Playlist + 'url': 'https://thelink.hubs.vidyard.com/watch/pwu7pCYWSwAnPxs8nDoFrE', + 'info_dict': { + 'id': 'pwu7pCYWSwAnPxs8nDoFrE', + 'title': 'PLAYLIST - Palm Beach Shutters- Bi-Fold Track System Installation', + 'entries': [{ + 'id': 'SyStyHtYujcBHe5PkZc5DL', + 'display_id': '41974005', + 'ext': 'mp4', + 'title': 'Prepare the Frame and Track for Palm Beach Polysatin Shutters With BiFold Track', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/41974005/IJw7oCaJcF1h7WWu3OVZ8A_small.png', + 'duration': 258.666, + }, { + 'id': '1Fw4B84jZTXLXWqkE71RiM', + 'display_id': '5861113', + 'ext': 'mp4', + 'title': 'Palm Beach - Bi-Fold Track System "Frame Installation"', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/5861113/29CJ54s5g1_aP38zkKLHew_small.jpg', + 'duration': 167.858, + }, { + 'id': 'DqP3wBvLXSpxrcqpT5kEeo', + 'display_id': '41976334', + 'ext': 'mp4', + 'title': 'Install the Track for Palm Beach Polysatin Shutters With BiFold Track', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/5861090/RwG2VaTylUa6KhSTED1r1Q_small.png', + 'duration': 94.229, + }, { + 'id': 'opfybfxpzQArxqtQYB6oBU', + 'display_id': '41976364', + 'ext': 'mp4', + 'title': 'Install the Panel for Palm Beach Polysatin Shutters With BiFold Track', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/5860926/JIOaJR08dM4QgXi_iQ2zGA_small.png', + 'duration': 191.467, + }, { + 'id': 'rWrXvkbTNNaNqD6189HJya', + 'display_id': '41976382', + 'ext': 'mp4', + 'title': 'Adjust the Panels for Palm Beach Polysatin Shutters With BiFold Track', 
+ 'thumbnail': 'https://cdn.vidyard.com/thumbnails/5860687/CwHxBv4UudAhOh43FVB4tw_small.png', + 'duration': 138.155, + }, { + 'id': 'eYPTB521MZ9TPEArSethQ5', + 'display_id': '41976409', + 'ext': 'mp4', + 'title': 'Assemble and Install the Valance for Palm Beach Polysatin Shutters With BiFold Track', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/5861425/0y68qlMU4O5VKU7bJ8i_AA_small.png', + 'duration': 148.224, + }], + }, + 'playlist_count': 6, + }, { + # Non hubs.vidyard.com playlist + 'url': 'https://salesforce.vidyard.com/watch/d4vqPjs7Q5EzVEis5QT3jd', + 'info_dict': { + 'id': 'd4vqPjs7Q5EzVEis5QT3jd', + 'title': 'How To: Service Cloud: Import External Content in Lightning Knowledge', + 'entries': [{ + 'id': 'mcjDpSZir2iSttbvFkx6Rv', + 'display_id': '29479036', + 'ext': 'mp4', + 'title': 'Welcome to this Expert Coaching Series', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/ouyQi9WuwyiOupChUWNmjQ/7170d3485ba602e012df05_small.jpg', + 'duration': 38.205, + }, { + 'id': '84bPYwpg243G6xYEfJdYw9', + 'display_id': '21820704', + 'ext': 'mp4', + 'title': 'Chapter 1 - Title + Agenda', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/HFPN0ZgQq4Ow8BghGcQSow/bfaa30123c8f6601e7d7f2_small.jpg', + 'duration': 98.016, + }, { + 'id': 'nP17fMuvA66buVHUrzqjTi', + 'display_id': '21820707', + 'ext': 'mp4', + 'title': 'Chapter 2 - Import Options', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/rGRIF5nFjPI9OOA2qJ_Dbg/86a8d02bfec9a566845dd4_small.jpg', + 'duration': 199.136, + }, { + 'id': 'm54EcwXdpA5gDBH5rgCYoV', + 'display_id': '21820710', + 'ext': 'mp4', + 'title': 'Chapter 3 - Importing Article Translations', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/IVX4XR8zpSsiNIHx45kz-A/1ccbf8a29a33856d06b3ed_small.jpg', + 'duration': 184.352, + }, { + 'id': 'j4nzS42oq4hE9oRV73w3eQ', + 'display_id': '21820716', + 'ext': 'mp4', + 'title': 'Chapter 4 - Best Practices', + 'thumbnail': 
'https://cdn.vidyard.com/thumbnails/BtrRrQpRDLbA4AT95YQyog/1f1e6b8e7fdc3fa95ec8d3_small.jpg', + 'duration': 296.960, + }, { + 'id': 'y28PYfW5pftvers9PXzisC', + 'display_id': '21820727', + 'ext': 'mp4', + 'title': 'Chapter 5 - Migration Steps', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/K2CdQOXDfLcrVTF60r0bdw/a09239ada28b6ffce12b1f_small.jpg', + 'duration': 620.640, + }, { + 'id': 'YWU1eQxYvhj29SjYoPw5jH', + 'display_id': '21820733', + 'ext': 'mp4', + 'title': 'Chapter 6 - Demo', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/rsmhP-cO8dAa8ilvFGCX0g/7911ef415167cd14032068_small.jpg', + 'duration': 631.456, + }, { + 'id': 'nmEvVqpwdJUgb74zKsLGxn', + 'display_id': '29479037', + 'ext': 'mp4', + 'title': 'Schedule Your Follow-Up', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/Rtwc7X4PEkF4Ae5kHi-Jvw/174ebed3f34227b1ffa1d0_small.jpg', + 'duration': 33.608, + }], + }, + 'playlist_count': 8, + }, { + # URL of iframe embed src + 'url': 'https://play.vidyard.com/iDqTwWGrd36vaLuaCY3nTs.html', + 'info_dict': { + 'id': 'iDqTwWGrd36vaLuaCY3nTs', + 'display_id': '9281009', + 'ext': 'mp4', + 'title': 'Lightbox Embed', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/spacer.gif', + 'duration': 39.035, + }, + }, { + # Player JSON URL + 'url': 'https://play.vidyard.com/player/7GAApnNNbcZZ46k6JqJQSh.json?disable_analytics=0', + 'info_dict': { + 'id': '7GAApnNNbcZZ46k6JqJQSh', + 'display_id': '820026', + 'ext': 'mp4', + 'title': 'The Art of Storytelling: How to Deliver Your Brand Story with Content & Social', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/MhbE-5sEFQu4x3fI6FkNlA/41eb5717c557cd19456910_small.jpg', + 'duration': 2153.013, + 'tags': ['Summit2017'], + }, + }, { + 'url': 'http://share.vidyard.com/share/diYeo6YR2yiGgL8odvS8Ri', + 'only_matching': True, + }, { + 'url': 'https://play.vidyard.com/FFlz3ZpxhIfKQ1fd9DAryA', + 'only_matching': True, + }, { + 'url': 'https://play.vidyard.com/qhMAu5A76GZVrFzOPgSf9A/type/standalone', + 'only_matching': True, + }] + 
_WEBPAGE_TESTS = [{ + # URL containing inline/lightbox embedded video + 'url': 'https://resources.altium.com/p/2-the-extreme-importance-of-pc-board-stack-up', + 'info_dict': { + 'id': 'GDx1oXrFWj4XHbipfoXaMn', + 'display_id': '3225198', + 'ext': 'mp4', + 'title': 'The Extreme Importance of PC Board Stack Up', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/73_Q3_hBexWX7Og1sae6cg/9998fa4faec921439e2c04_small.jpg', + 'duration': 3422.742, + }, + }, { + # <script ... id="vidyard_embed_code_DXx2sW4WaLA6hTdGFz7ja8" src="//play.vidyard.com/DXx2sW4WaLA6hTdGFz7ja8.js? + 'url': 'http://videos.vivint.com/watch/DXx2sW4WaLA6hTdGFz7ja8', + 'info_dict': { + 'id': 'DXx2sW4WaLA6hTdGFz7ja8', + 'display_id': '2746529', + 'ext': 'mp4', + 'title': 'How To Powercycle the Smart Hub Panel', + 'duration': 30.613, + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/_-6cw8xQUJ3qiCs_JENc_A/b21d7a5e47967f49399d30_small.jpg', + }, + }, { + # <script id="vidyard_embed_code_MIBHhiLVTxga7wqLsuoDjQ" src="//embed.vidyard.com/embed/MIBHhiLVTxga7wqLsuoDjQ/inline?v=2.1"> + 'url': 'https://www.babypips.com/learn/forex/introduction-to-metatrader4', + 'info_dict': { + 'id': 'MIBHhiLVTxga7wqLsuoDjQ', + 'display_id': '20291', + 'ext': 'mp4', + 'title': 'Lesson 1 - Opening an MT4 Account', + 'description': 'Never heard of MetaTrader4? Here\'s the 411 on the popular trading platform!', + 'duration': 168, + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/20291/IM-G2WXQR9VBLl2Cmzvftg_small.jpg', + }, + }, { + # <iframe ... 
src="//play.vidyard.com/d61w8EQoZv1LDuPxDkQP2Q/type/background?preview=1" + 'url': 'https://www.avaya.com/en/', + 'info_dict': { + # These values come from the generic extractor and don't matter + 'id': str, + 'title': str, + 'age_limit': 0, + 'upload_date': str, + 'description': str, + 'thumbnail': str, + 'timestamp': float, + }, + 'playlist': [{ + 'info_dict': { + 'id': 'd61w8EQoZv1LDuPxDkQP2Q', + 'display_id': '42456529', + 'ext': 'mp4', + 'title': 'GettyImages-1027', + 'duration': 6.0, + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/42061563/p6bY08d2N4e4IDz-7J4_wkgsPq3-qgcx_small.jpg', + }, + }, { + 'info_dict': { + 'id': 'VAsYDi7eiqZRbHodUA2meC', + 'display_id': '42456569', + 'ext': 'mp4', + 'title': 'GettyImages-1325598833', + 'duration': 6.083, + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/42052358/y3qrbDpn_2quWr_5XBi7yzS3UvEI__ZM_small.jpg', + }, + }], + 'playlist_count': 2, + }, { + # <div class="vidyard-player-embed" data-uuid="vpCWTVHw3qrciLtVY94YkS" + 'url': 'https://www.gogoair.com/', + 'info_dict': { + # These values come from the generic extractor and don't matter + 'id': str, + 'title': str, + 'description': str, + 'age_limit': 0, + }, + 'playlist': [{ + 'info_dict': { + 'id': 'vpCWTVHw3qrciLtVY94YkS', + 'display_id': '40780699', + 'ext': 'mp4', + 'title': 'Upgrade to AVANCE 100% worth it - Jason Talley, Owner and Pilot, Testimonial', + 'description': 'md5:f609824839439a51990cef55ffc472aa', + 'duration': 70.737, + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/40780699/KzjfYZz5MZl2gHF_e-4i2c6ib1cLDweQ_small.jpg', + }, + }, { + 'info_dict': { + 'id': 'xAmV9AsLbnitCw35paLBD8', + 'display_id': '31130867', + 'ext': 'mp4', + 'title': 'Brad Keselowski goes faster with Gogo AVANCE inflight Wi-Fi', + 'duration': 132.565, + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/31130867/HknyDtLdm2Eih9JZ4A5XLjhfBX_6HRw5_small.jpg', + }, + }, { + 'info_dict': { + 'id': 'RkkrFRNxfP79nwCQavecpF', + 'display_id': '39009815', + 'ext': 'mp4', + 'title': 'Live 
Demo of Gogo Galileo', + 'description': 'md5:e2df497236f4e12c3fef8b392b5f23e0', + 'duration': 112.128, + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/38144873/CWLlxfUbJ4Gh0ThbUum89IsEM4yupzMb_small.jpg', + }, + }], + 'playlist_count': 3, + }] + + @classmethod + def _extract_embed_urls(cls, url, webpage): + # Handle protocol-less embed URLs + for embed_url in super()._extract_embed_urls(url, webpage): + if embed_url.startswith('//'): + embed_url = f'https:{embed_url}' + yield embed_url + + # Extract inline/lightbox embeds + for embed_element in re.findall( + r'(<(?:img|div)[^>]* class=(["\'])(?:[^>"\']* )?vidyard-player-embed(?: [^>"\']*)?\2[^>]+>)', webpage): + if video_id := extract_attributes(embed_element[0]).get('data-uuid'): + yield f'https://play.vidyard.com/{video_id}' + + for embed_id in re.findall(r'<script[^>]* id=["\']vidyard_embed_code_([\w-]+)["\']', webpage): + yield f'https://play.vidyard.com/{embed_id}' + + def _real_extract(self, url): + video_id = self._match_id(url) + video_json = self._fetch_video_json(video_id) + + if len(video_json['chapters']) == 1: + return self._process_video_json(video_json['chapters'][0], video_id) + + return self.playlist_result( + [self._process_video_json(chapter, video_id) for chapter in video_json['chapters']], + str(video_json['playerUuid']), video_json.get('name')) diff --git a/yt_dlp/extractor/viewlift.py b/yt_dlp/extractor/viewlift.py index 7ac094f2f0..4a7ba9839e 100644 --- a/yt_dlp/extractor/viewlift.py +++ b/yt_dlp/extractor/viewlift.py @@ -5,6 +5,7 @@ from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, + join_nonempty, parse_age_limit, traverse_obj, ) @@ -120,7 +121,7 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): 'height', default=None)) formats.append({ 'url': video_asset_url, - 'format_id': 'http{}'.format(f'-{bitrate}' if bitrate else ''), + 'format_id': join_nonempty('http', bitrate), 'tbr': bitrate, 'height': height, 'vcodec': video_asset.get('codec'), diff 
--git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index 7e79032f28..0ed7b9ec1f 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -1,6 +1,7 @@ import base64 import functools import itertools +import json import re import urllib.parse @@ -14,11 +15,13 @@ from ..utils import ( determine_ext, get_element_by_class, int_or_none, + join_nonempty, js_to_json, merge_dicts, parse_filesize, parse_iso8601, parse_qs, + qualities, smuggle_url, str_or_none, traverse_obj, @@ -84,29 +87,23 @@ class VimeoBaseInfoExtractor(InfoExtractor): expected=True) return password - def _verify_video_password(self, url, video_id, password, token, vuid): - if url.startswith('http://'): - # vimeo only supports https now, but the user can give an http url - url = url.replace('http://', 'https://') - self._set_vimeo_cookie('vuid', vuid) - return self._download_webpage( - url + '/password', video_id, 'Verifying the password', - 'Wrong password', data=urlencode_postdata({ - 'password': password, - 'token': token, - }), headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - 'Referer': url, - }) - - def _extract_xsrft_and_vuid(self, webpage): - xsrft = self._search_regex( - r'(?:(?P<q1>["\'])xsrft(?P=q1)\s*:|xsrft\s*[=:])\s*(?P<q>["\'])(?P<xsrft>.+?)(?P=q)', - webpage, 'login token', group='xsrft') - vuid = self._search_regex( - r'["\']vuid["\']\s*:\s*(["\'])(?P<vuid>.+?)\1', - webpage, 'vuid', group='vuid') - return xsrft, vuid + def _verify_video_password(self, video_id, password, token): + url = f'https://vimeo.com/{video_id}' + try: + return self._download_webpage( + f'{url}/password', video_id, + 'Submitting video password', data=json.dumps({ + 'password': password, + 'token': token, + }, separators=(',', ':')).encode(), headers={ + 'Accept': '*/*', + 'Content-Type': 'application/json', + 'Referer': url, + }, impersonate=True) + except ExtractorError as error: + if isinstance(error.cause, HTTPError) and error.cause.status == 418: + raise 
ExtractorError('Wrong password', expected=True) + raise def _extract_vimeo_config(self, webpage, video_id, *args, **kwargs): vimeo_config = self._search_regex( @@ -150,6 +147,8 @@ class VimeoBaseInfoExtractor(InfoExtractor): }) # TODO: fix handling of 308 status code returned for live archive manifest requests + QUALITIES = ('low', 'medium', 'high') + quality = qualities(QUALITIES) sep_pattern = r'/sep/video/' for files_type in ('hls', 'dash'): for cdn_name, cdn_data in (try_get(config_files, lambda x: x[files_type]['cdns']) or {}).items(): @@ -170,6 +169,11 @@ class VimeoBaseInfoExtractor(InfoExtractor): m_url, video_id, 'mp4', live=is_live, m3u8_id=f_id, note=f'Downloading {cdn_name} m3u8 information', fatal=False) + # m3u8 doesn't give audio bitrates; need to prioritize based on GROUP-ID + # See: https://github.com/yt-dlp/yt-dlp/issues/10854 + for f in fmts: + if mobj := re.search(rf'audio-({"|".join(QUALITIES)})', f['format_id']): + f['quality'] = quality(mobj.group(1)) formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) elif files_type == 'dash': @@ -216,16 +220,6 @@ class VimeoBaseInfoExtractor(InfoExtractor): owner = video_data.get('owner') or {} video_uploader_url = owner.get('url') - duration = int_or_none(video_data.get('duration')) - chapter_data = try_get(config, lambda x: x['embed']['chapters']) or [] - chapters = [{ - 'title': current_chapter.get('title'), - 'start_time': current_chapter.get('timecode'), - 'end_time': next_chapter.get('timecode'), - } for current_chapter, next_chapter in zip(chapter_data, chapter_data[1:] + [{'timecode': duration}])] - if chapters and chapters[0]['start_time']: # Chapters may not start from 0 - chapters[:0] = [{'title': '<Untitled>', 'start_time': 0, 'end_time': chapters[0]['start_time']}] - return { 'id': str_or_none(video_data.get('id')) or video_id, 'title': video_title, @@ -233,8 +227,12 @@ class VimeoBaseInfoExtractor(InfoExtractor): 'uploader_id': video_uploader_url.split('/')[-1] if 
video_uploader_url else None, 'uploader_url': video_uploader_url, 'thumbnails': thumbnails, - 'duration': duration, - 'chapters': chapters or None, + 'duration': int_or_none(video_data.get('duration')), + 'chapters': sorted(traverse_obj(config, ( + 'embed', 'chapters', lambda _, v: int(v['timecode']) is not None, { + 'title': ('title', {str}), + 'start_time': ('timecode', {int_or_none}), + })), key=lambda c: c['start_time']) or None, 'formats': formats, 'subtitles': subtitles, 'live_status': live_status, @@ -244,13 +242,30 @@ class VimeoBaseInfoExtractor(InfoExtractor): '_format_sort_fields': ('quality', 'res', 'fps', 'hdr:12', 'source'), } - def _extract_original_format(self, url, video_id, unlisted_hash=None): + def _call_videos_api(self, video_id, jwt_token, unlisted_hash=None, **kwargs): + return self._download_json( + join_nonempty(f'https://api.vimeo.com/videos/{video_id}', unlisted_hash, delim=':'), + video_id, 'Downloading API JSON', headers={ + 'Authorization': f'jwt {jwt_token}', + 'Accept': 'application/json', + }, query={ + 'fields': ','.join(( + 'config_url', 'created_time', 'description', 'download', 'license', + 'metadata.connections.comments.total', 'metadata.connections.likes.total', + 'release_time', 'stats.plays')), + }, **kwargs) + + def _extract_original_format(self, url, video_id, unlisted_hash=None, jwt=None, api_data=None): + # Original/source formats are only available when logged in + if not self._get_cookies('https://vimeo.com/').get('vimeo'): + return + query = {'action': 'load_download_config'} if unlisted_hash: query['unlisted_hash'] = unlisted_hash download_data = self._download_json( - url, video_id, fatal=False, query=query, - headers={'X-Requested-With': 'XMLHttpRequest'}, + url, video_id, 'Loading download config JSON', fatal=False, + query=query, headers={'X-Requested-With': 'XMLHttpRequest'}, expected_status=(403, 404)) or {} source_file = download_data.get('source_file') download_url = try_get(source_file, lambda x: 
x['download_url']) @@ -271,15 +286,13 @@ class VimeoBaseInfoExtractor(InfoExtractor): 'quality': 1, } - jwt_response = self._download_json( - 'https://vimeo.com/_rv/viewer', video_id, note='Downloading jwt token', fatal=False) or {} - if not jwt_response.get('jwt'): + jwt = jwt or traverse_obj(self._download_json( + 'https://vimeo.com/_rv/viewer', video_id, 'Downloading jwt token', fatal=False), ('jwt', {str})) + if not jwt: return - headers = {'Authorization': 'jwt {}'.format(jwt_response['jwt']), 'Accept': 'application/json'} - original_response = self._download_json( - f'https://api.vimeo.com/videos/{video_id}', video_id, - headers=headers, fatal=False, expected_status=(403, 404)) or {} - for download_data in original_response.get('download') or []: + original_response = api_data or self._call_videos_api( + video_id, jwt, unlisted_hash, fatal=False, expected_status=(403, 404)) + for download_data in traverse_obj(original_response, ('download', ..., {dict})): download_url = download_data.get('link') if not download_url or download_data.get('quality') != 'source': continue @@ -364,7 +377,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'skip': 'No longer available', }, { - 'url': 'http://player.vimeo.com/video/54469442', + 'url': 'https://player.vimeo.com/video/54469442', 'md5': '619b811a4417aa4abe78dc653becf511', 'note': 'Videos that embed the url in the player page', 'info_dict': { @@ -380,6 +393,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'params': { 'format': 'best[protocol=https]', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'http://vimeo.com/68375962', @@ -389,22 +403,23 @@ class VimeoIE(VimeoBaseInfoExtractor): 'id': '68375962', 'ext': 'mp4', 'title': 'youtube-dl password protected test video', - 'timestamp': 1371200155, + 'timestamp': 1371214555, 'upload_date': '20130614', + 'release_timestamp': 1371214555, + 'release_date': '20130614', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128', 'uploader_id': 
'user18948128', 'uploader': 'Jaime Marquínez Ferrándiz', 'duration': 10, - 'description': 'md5:6173f270cd0c0119f22817204b3eb86c', - 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_1280', - 'view_count': int, 'comment_count': int, 'like_count': int, + 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_1280', }, 'params': { 'format': 'best[protocol=https]', 'videopassword': 'youtube-dl', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'http://vimeo.com/channels/keypeele/75629013', @@ -428,29 +443,38 @@ class VimeoIE(VimeoBaseInfoExtractor): 'like_count': int, }, 'params': {'format': 'http-1080p'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'http://vimeo.com/76979871', 'note': 'Video with subtitles', 'info_dict': { 'id': '76979871', - 'ext': 'mov', + 'ext': 'mp4', 'title': 'The New Vimeo Player (You Know, For Videos)', - 'description': 'md5:2ec900bf97c3f389378a96aee11260ea', - 'timestamp': 1381846109, + 'description': str, # FIXME: Dynamic SEO spam description + 'timestamp': 1381860509, 'upload_date': '20131015', + 'release_timestamp': 1381860509, + 'release_date': '20131015', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/staff', 'uploader_id': 'staff', - 'uploader': 'Vimeo Staff', + 'uploader': 'Vimeo', 'duration': 62, + 'comment_count': int, + 'like_count': int, + 'thumbnail': 'https://i.vimeocdn.com/video/452001751-8216e0571c251a09d7a8387550942d89f7f86f6398f8ed886e639b0dd50d3c90-d_1280', 'subtitles': { - 'de': [{'ext': 'vtt'}], - 'en': [{'ext': 'vtt'}], - 'es': [{'ext': 'vtt'}], - 'fr': [{'ext': 'vtt'}], + 'de': 'count:3', + 'en': 'count:3', + 'es': 'count:3', + 'fr': 'count:3', }, }, - 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'], + 'expected_warnings': [ + 'Ignoring subtitle tracks found in the HLS manifest', + 'Failed to 
parse XML: not well-formed', + ], }, { # from https://www.ouya.tv/game/Pier-Solar-and-the-Great-Architects/ @@ -466,11 +490,12 @@ class VimeoIE(VimeoBaseInfoExtractor): 'duration': 118, 'thumbnail': 'https://i.vimeocdn.com/video/478636036-c18440305ef3df9decfb6bf207a61fe39d2d17fa462a96f6f2d93d30492b037d-d_1280', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { - # contains original format + # contains Original format 'url': 'https://vimeo.com/33951933', - 'md5': '53c688fa95a55bf4b7293d37a89c5c53', + # 'md5': '53c688fa95a55bf4b7293d37a89c5c53', 'info_dict': { 'id': '33951933', 'ext': 'mp4', @@ -486,15 +511,19 @@ class VimeoIE(VimeoBaseInfoExtractor): 'view_count': int, 'thumbnail': 'https://i.vimeocdn.com/video/231174622-dd07f015e9221ff529d451e1cc31c982b5d87bfafa48c4189b1da72824ee289a-d_1280', 'like_count': int, + 'tags': 'count:11', }, + # 'params': {'format': 'Original'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { - 'note': 'Contains original format not accessible in webpage', + 'note': 'Contains source format not accessible in webpage', 'url': 'https://vimeo.com/393756517', - 'md5': 'c464af248b592190a5ffbb5d33f382b0', + # 'md5': 'c464af248b592190a5ffbb5d33f382b0', 'info_dict': { 'id': '393756517', - 'ext': 'mov', + # 'ext': 'mov', + 'ext': 'mp4', 'timestamp': 1582642091, 'uploader_id': 'frameworkla', 'title': 'Straight To Hell - Sabrina: Netflix', @@ -505,6 +534,8 @@ class VimeoIE(VimeoBaseInfoExtractor): 'thumbnail': 'https://i.vimeocdn.com/video/859377297-836494a4ef775e9d4edbace83937d9ad34dc846c688c0c419c0e87f7ab06c4b3-d_1280', 'uploader_url': 'https://vimeo.com/frameworkla', }, + # 'params': {'format': 'source'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # only available via https://vimeo.com/channels/tributes/6213729 and @@ -521,16 +552,18 @@ class VimeoIE(VimeoBaseInfoExtractor): 'channel_id': 'tributes', 'timestamp': 1250886430, 'upload_date': '20090821', - 'description': 
'md5:bdbf314014e58713e6e5b66eb252f4a6', + 'description': str, # FIXME: Dynamic SEO spam description 'duration': 321, 'comment_count': int, 'view_count': int, 'thumbnail': 'https://i.vimeocdn.com/video/22728298-bfc22146f930de7cf497821c7b0b9f168099201ecca39b00b6bd31fcedfca7a6-d_1280', 'like_count': int, + 'tags': ['[the shining', 'vimeohq', 'cv', 'vimeo tribute]'], }, 'params': { 'skip_download': True, }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # redirects to ondemand extractor and should be passed through it @@ -553,28 +586,23 @@ class VimeoIE(VimeoBaseInfoExtractor): 'skip': 'this page is no longer available.', }, { - 'url': 'http://player.vimeo.com/video/68375962', + 'url': 'https://player.vimeo.com/video/68375962', 'md5': 'aaf896bdb7ddd6476df50007a0ac0ae7', 'info_dict': { 'id': '68375962', 'ext': 'mp4', 'title': 'youtube-dl password protected test video', - 'timestamp': 1371200155, - 'upload_date': '20130614', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128', 'uploader_id': 'user18948128', 'uploader': 'Jaime Marquínez Ferrándiz', 'duration': 10, - 'description': 'md5:6173f270cd0c0119f22817204b3eb86c', 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_1280', - 'view_count': int, - 'comment_count': int, - 'like_count': int, }, 'params': { 'format': 'best[protocol=https]', 'videopassword': 'youtube-dl', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'http://vimeo.com/moogaloop.swf?clip_id=2539741', @@ -602,7 +630,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'title': "youtube-dl test video '' ä↭𝕐-BaW jenozKc", 'uploader': 'Philipp Hagemeister', 'uploader_id': 'user20132939', - 'description': 'md5:fa7b6c6d8db0bdc353893df2f111855b', + 'description': str, # FIXME: Dynamic SEO spam description 'upload_date': '20150209', 'timestamp': 1423518307, 'thumbnail': 'https://i.vimeocdn.com/video/default_1280', @@ -616,6 +644,7 @@ 
class VimeoIE(VimeoBaseInfoExtractor): 'format': 'best[protocol=https]', 'videopassword': 'youtube-dl', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # source file returns 403: Forbidden @@ -643,11 +672,13 @@ class VimeoIE(VimeoBaseInfoExtractor): 'release_date': '20160329', }, 'params': {'skip_download': True}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'https://vimeo.com/138909882', 'info_dict': { 'id': '138909882', + # 'ext': 'm4v', 'ext': 'mp4', 'title': 'Eastnor Castle 2015 Firework Champions - The Promo!', 'description': 'md5:5967e090768a831488f6e74b7821b3c1', @@ -655,11 +686,19 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader': 'Firework Champions', 'upload_date': '20150910', 'timestamp': 1441901895, + 'thumbnail': 'https://i.vimeocdn.com/video/534715882-6ff8e4660cbf2fea68282876d8d44f318825dfe572cc4016e73b3266eac8ae3a-d_1280', + 'uploader_url': 'https://vimeo.com/fireworkchampions', + 'tags': 'count:6', + 'duration': 229, + 'view_count': int, + 'like_count': int, + 'comment_count': int, }, 'params': { 'skip_download': True, - 'format': 'Original', + # 'format': 'source', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'https://vimeo.com/channels/staffpicks/143603739', @@ -680,8 +719,10 @@ class VimeoIE(VimeoBaseInfoExtractor): 'like_count': int, 'uploader_url': 'https://vimeo.com/karimhd', 'channel_url': 'https://vimeo.com/channels/staffpicks', + 'tags': 'count:6', }, 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # requires passing unlisted_hash(a52724358e) to load_download_config request @@ -711,6 +752,82 @@ class VimeoIE(VimeoBaseInfoExtractor): 'params': { 'skip_download': True, }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + }, + { + # chapters must be sorted, see: https://github.com/yt-dlp/yt-dlp/issues/5308 + 'url': 'https://player.vimeo.com/video/756714419', + 'info_dict': { 
+ 'id': '756714419', + 'ext': 'mp4', + 'title': 'Dr Arielle Schwartz - Therapeutic yoga for optimum sleep', + 'uploader': 'Alex Howard', + 'uploader_id': 'user54729178', + 'uploader_url': 'https://vimeo.com/user54729178', + 'thumbnail': r're:https://i\.vimeocdn\.com/video/1520099929-[\da-f]+-d_1280', + 'duration': 2636, + 'chapters': [ + {'start_time': 0, 'end_time': 10, 'title': '<Untitled Chapter 1>'}, + {'start_time': 10, 'end_time': 106, 'title': 'Welcoming Dr Arielle Schwartz'}, + {'start_time': 106, 'end_time': 305, 'title': 'What is therapeutic yoga?'}, + {'start_time': 305, 'end_time': 594, 'title': 'Vagal toning practices'}, + {'start_time': 594, 'end_time': 888, 'title': 'Trauma and difficulty letting go'}, + {'start_time': 888, 'end_time': 1059, 'title': "Dr Schwartz' insomnia experience"}, + {'start_time': 1059, 'end_time': 1471, 'title': 'A strategy for helping sleep issues'}, + {'start_time': 1471, 'end_time': 1667, 'title': 'Yoga nidra'}, + {'start_time': 1667, 'end_time': 2121, 'title': 'Wisdom in stillness'}, + {'start_time': 2121, 'end_time': 2386, 'title': 'What helps us be more able to let go?'}, + {'start_time': 2386, 'end_time': 2510, 'title': 'Practical tips to help ourselves'}, + {'start_time': 2510, 'end_time': 2636, 'title': 'Where to find out more'}, + ], + }, + 'params': { + 'http_headers': {'Referer': 'https://sleepsuperconference.com'}, + 'skip_download': 'm3u8', + }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + }, + { + # vimeo.com URL with unlisted hash and Original format + 'url': 'https://vimeo.com/144579403/ec02229140', + # 'md5': '6b662c2884e0373183fbde2a0d15cb78', + 'info_dict': { + 'id': '144579403', + 'ext': 'mp4', + 'title': 'SALESMANSHIP', + 'description': 'md5:4338302f347a1ff8841b4a3aecaa09f0', + 'uploader': 'Off the Picture Pictures', + 'uploader_id': 'offthepicturepictures', + 'uploader_url': 'https://vimeo.com/offthepicturepictures', + 'duration': 669, + 'upload_date': '20151104', + 'timestamp': 
1446607180, + 'release_date': '20151104', + 'release_timestamp': 1446607180, + 'like_count': int, + 'view_count': int, + 'comment_count': int, + 'thumbnail': r're:https://i\.vimeocdn\.com/video/1018638656-[\da-f]+-d_1280', + }, + # 'params': {'format': 'Original'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + }, + { + # player.vimeo.com URL with source format + 'url': 'https://player.vimeo.com/video/859028877', + # 'md5': '19ca3d2463441dee2d2f0671ac2916a2', + 'info_dict': { + 'id': '859028877', + 'ext': 'mp4', + 'title': 'Ariana Grande - Honeymoon Avenue (Live from London)', + 'uploader': 'Raja Virdi', + 'uploader_id': 'rajavirdi', + 'uploader_url': 'https://vimeo.com/rajavirdi', + 'duration': 309, + 'thumbnail': r're:https://i\.vimeocdn\.com/video/1716727772-[\da-f]+-d_1280', + }, + # 'params': {'format': 'source'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # user playlist alias -> https://vimeo.com/258705797 @@ -746,22 +863,31 @@ class VimeoIE(VimeoBaseInfoExtractor): return checked def _extract_from_api(self, video_id, unlisted_hash=None): - token = self._download_json( - 'https://vimeo.com/_rv/jwt', video_id, headers={ - 'X-Requested-With': 'XMLHttpRequest', - })['token'] - api_url = 'https://api.vimeo.com/videos/' + video_id - if unlisted_hash: - api_url += ':' + unlisted_hash - video = self._download_json( - api_url, video_id, headers={ - 'Authorization': 'jwt ' + token, - 'Accept': 'application/json', - }, query={ - 'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays', - }) + viewer = self._download_json( + 'https://vimeo.com/_next/viewer', video_id, 'Downloading viewer info') + + for retry in (False, True): + try: + video = self._call_videos_api(video_id, viewer['jwt'], unlisted_hash) + break + except ExtractorError as e: + if (not retry and isinstance(e.cause, HTTPError) and e.cause.status == 400 + and 
'password' in traverse_obj( + self._webpage_read_content(e.cause.response, e.cause.response.url, video_id, fatal=False), + ({json.loads}, 'invalid_parameters', ..., 'field'), + )): + self._verify_video_password( + video_id, self._get_video_password(), viewer['xsrft']) + continue + raise + info = self._parse_config(self._download_json( video['config_url'], video_id), video_id) + source_format = self._extract_original_format( + f'https://vimeo.com/{video_id}', video_id, unlisted_hash, jwt=viewer['jwt'], api_data=video) + if source_format: + info['formats'].append(source_format) + get_timestamp = lambda x: parse_iso8601(video.get(x + '_time')) info.update({ 'description': video.get('description'), @@ -829,21 +955,33 @@ class VimeoIE(VimeoBaseInfoExtractor): url = 'https://vimeo.com/' + video_id self._try_album_password(url) + is_secure = urllib.parse.urlparse(url).scheme == 'https' try: # Retrieve video webpage to extract further information webpage, urlh = self._download_webpage_handle( - url, video_id, headers=headers) + url, video_id, headers=headers, impersonate=is_secure) redirect_url = urlh.url - except ExtractorError as ee: - if isinstance(ee.cause, HTTPError) and ee.cause.status == 403: - errmsg = ee.cause.response.read() - if b'Because of its privacy settings, this video cannot be played here' in errmsg: - raise ExtractorError( - 'Cannot download embed-only video without embedding ' - 'URL. Please call yt-dlp with the URL of the page ' - 'that embeds this video.', - expected=True) - raise + except ExtractorError as error: + if not isinstance(error.cause, HTTPError) or error.cause.status not in (403, 429): + raise + errmsg = error.cause.response.read() + if b'Because of its privacy settings, this video cannot be played here' in errmsg: + raise ExtractorError( + 'Cannot download embed-only video without embedding URL. 
Please call yt-dlp ' + 'with the URL of the page that embeds this video.', expected=True) + # 403 == vimeo.com TLS fingerprint or DC IP block; 429 == player.vimeo.com TLS FP block + status = error.cause.status + dcip_msg = 'If you are using a data center IP or VPN/proxy, your IP may be blocked' + if target := error.cause.response.extensions.get('impersonate'): + raise ExtractorError( + f'Got HTTP Error {status} when using impersonate target "{target}". {dcip_msg}') + elif not is_secure: + raise ExtractorError(f'Got HTTP Error {status}. {dcip_msg}', expected=True) + raise ExtractorError( + 'This request has been blocked due to its TLS fingerprint. Install a ' + 'required impersonation dependency if possible, or else if you are okay with ' + f'{self._downloader._format_err("compromising your security/cookies", "light red")}, ' + f'try replacing "https:" with "http:" in the input URL. {dcip_msg}.', expected=True) if '://player.vimeo.com/video/' in url: config = self._search_json( @@ -851,13 +989,12 @@ class VimeoIE(VimeoBaseInfoExtractor): if config.get('view') == 4: config = self._verify_player_video_password( redirect_url, video_id, headers) - return self._parse_config(config, video_id) - - if re.search(r'<form[^>]+?id="pw_form"', webpage): - video_password = self._get_video_password() - token, vuid = self._extract_xsrft_and_vuid(webpage) - webpage = self._verify_video_password( - redirect_url, video_id, video_password, token, vuid) + info = self._parse_config(config, video_id) + source_format = self._extract_original_format( + f'https://vimeo.com/{video_id}', video_id, unlisted_hash) + if source_format: + info['formats'].append(source_format) + return info vimeo_config = self._extract_vimeo_config(webpage, video_id, default=None) if vimeo_config: @@ -1225,8 +1362,22 @@ class VimeoGroupsIE(VimeoChannelIE): # XXX: Do not subclass from concrete IE class VimeoReviewIE(VimeoBaseInfoExtractor): IE_NAME = 'vimeo:review' IE_DESC = 'Review pages on vimeo' - _VALID_URL = 
r'(?P<url>https://vimeo\.com/[^/]+/review/(?P<id>[^/]+)/[0-9a-f]{10})' + _VALID_URL = r'https?://vimeo\.com/(?P<user>[^/?#]+)/review/(?P<id>\d+)/(?P<hash>[\da-f]{10})' _TESTS = [{ + 'url': 'https://vimeo.com/user170863801/review/996447483/a316d6ed8d', + 'info_dict': { + 'id': '996447483', + 'ext': 'mp4', + 'title': 'Rodeo day 1-_2', + 'uploader': 'BROADKAST', + 'uploader_id': 'user170863801', + 'uploader_url': 'https://vimeo.com/user170863801', + 'duration': 30, + 'thumbnail': 'https://i.vimeocdn.com/video/1912612821-09a43bd2e75c203d503aed89de7534f28fc4474a48f59c51999716931a246af5-d_1280', + }, + 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Failed to parse XML'], + }, { 'url': 'https://vimeo.com/user21297594/review/75524534/3c257a1b5d', 'md5': 'c507a72f780cacc12b2248bb4006d253', 'info_dict': { @@ -1240,6 +1391,7 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): 'thumbnail': 'https://i.vimeocdn.com/video/450115033-43303819d9ebe24c2630352e18b7056d25197d09b3ae901abdac4c4f1d68de71-d_1280', 'uploader_url': 'https://vimeo.com/user21297594', }, + 'skip': '404 Not Found', }, { 'note': 'video player needs Referer', 'url': 'https://vimeo.com/user22258446/review/91613211/13f927e053', @@ -1271,28 +1423,23 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): }] def _real_extract(self, url): - page_url, video_id = self._match_valid_url(url).groups() - data = self._download_json( - page_url.replace('/review/', '/review/data/'), video_id) + user, video_id, review_hash = self._match_valid_url(url).group('user', 'id', 'hash') + data_url = f'https://vimeo.com/{user}/review/data/{video_id}/{review_hash}' + data = self._download_json(data_url, video_id) + viewer = {} if data.get('isLocked') is True: video_password = self._get_video_password() viewer = self._download_json( 'https://vimeo.com/_rv/viewer', video_id) - webpage = self._verify_video_password( - 'https://vimeo.com/' + video_id, video_id, - video_password, viewer['xsrft'], viewer['vuid']) - clip_page_config = 
self._parse_json(self._search_regex( - r'window\.vimeo\.clip_page_config\s*=\s*({.+?});', - webpage, 'clip page config'), video_id) - config_url = clip_page_config['player']['config_url'] - clip_data = clip_page_config.get('clip') or {} - else: - clip_data = data['clipData'] - config_url = clip_data['configUrl'] + self._verify_video_password(video_id, video_password, viewer['xsrft']) + data = self._download_json(data_url, video_id) + clip_data = data['clipData'] + config_url = clip_data['configUrl'] config = self._download_json(config_url, video_id) info_dict = self._parse_config(config, video_id) source_format = self._extract_original_format( - page_url + '/action', video_id) + f'https://vimeo.com/{user}/review/{video_id}/{review_hash}/action', + video_id, unlisted_hash=clip_data.get('unlistedHash'), jwt=viewer.get('jwt')) if source_format: info_dict['formats'].append(source_format) info_dict['description'] = clean_html(clip_data.get('description')) diff --git a/yt_dlp/extractor/viu.py b/yt_dlp/extractor/viu.py index 01e59352bf..f4ed96bf62 100644 --- a/yt_dlp/extractor/viu.py +++ b/yt_dlp/extractor/viu.py @@ -90,7 +90,7 @@ class ViuIE(ViuBaseIE): formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4') for key, value in video_data.items(): - mobj = re.match(r'^subtitle_(?P<lang>[^_]+)_(?P<ext>(vtt|srt))', key) + mobj = re.match(r'subtitle_(?P<lang>[^_]+)_(?P<ext>(vtt|srt))', key) if not mobj: continue subtitles.setdefault(mobj.group('lang'), []).append({ diff --git a/yt_dlp/extractor/vtv.py b/yt_dlp/extractor/vtv.py new file mode 100644 index 0000000000..97134ee196 --- /dev/null +++ b/yt_dlp/extractor/vtv.py @@ -0,0 +1,108 @@ +from .common import InfoExtractor +from ..utils import extract_attributes, get_element_html_by_class, remove_start + + +class VTVGoIE(InfoExtractor): + _VALID_URL = [ + r'https?://(?:www\.)?vtvgo\.vn/(kho-video|tin-tuc)/[\w.-]*?(?P<id>\d+)(?:\.[a-z]+|/)?(?:$|[?#])', + 
r'https?://(?:www\.)?vtvgo\.vn/digital/detail\.php\?(?:[^#]+&)?content_id=(?P<id>\d+)', + ] + _TESTS = [{ + 'url': 'https://vtvgo.vn/kho-video/bep-vtv-vit-chao-rieng-so-24-888456.html', + 'info_dict': { + 'id': '888456', + 'ext': 'mp4', + 'title': 'Bếp VTV | Vịt chao riềng | Số 24', + 'description': 'md5:2b4e93ec2b954304170d32be288ce2c8', + 'thumbnail': 'https://vtvgo-images.vtvdigital.vn/images/20230201/VIT-CHAO-RIENG_VTV_638108894672812459.jpg', + }, + }, { + 'url': 'https://vtvgo.vn/tin-tuc/hot-search-1-zlife-khong-ngo-toi-phai-khong-862074', + 'info_dict': { + 'id': '862074', + 'ext': 'mp4', + 'title': 'Hot Search #1 | Zlife | Không ngờ tới phải không? ', + 'description': 'md5:e967d0e2efbbebbee8814a55799b4d0f', + 'thumbnail': 'https://vtvgo-images.vtvdigital.vn/images/20220504/6b9a8552-e71c-46ce-bc9d-50c9bb506f9c.jpeg', + }, + }, { + 'url': 'https://vtvgo.vn/kho-video/918311.html', + 'info_dict': { + 'id': '918311', + 'title': 'Cà phê sáng | 05/02/2024 | Tái hiện hình ảnh Hà Nội xưa tại ngôi nhà di sản', + 'ext': 'mp4', + 'thumbnail': 'https://vtvgo-images.vtvdigital.vn/images/20240205/0506_ca_phe_sang_638427226021318322.jpg', + 'description': 'md5:b121c67948f1ce58e6a036042fc14c1b', + }, + }, { + 'url': 'https://vtvgo.vn/digital/detail.php?digital_id=168&content_id=918634', + 'info_dict': { + 'id': '918634', + 'ext': 'mp4', + 'title': 'Gặp nhau cuối năm | Táo quân 2024', + 'description': 'md5:a1c221e78e5954d29d49b2a11c20513c', + 'thumbnail': 'https://vtvgo-images.vtvdigital.vn/images/20240210/d0f73369-8f03-4108-9edd-83d4bc3997b2.png', + }, + }, { + 'url': 'https://vtvgo.vn/digital/detail.php?content_id=919358', + 'info_dict': { + 'id': '919358', + 'ext': 'mp4', + 'title': 'Chúng ta của 8 năm sau | Tập 45 | Dương có bằng chứng, nhân chứng vạch mặt ông Khiêm', + 'description': 'md5:16ff5208cac6585137f554472a4677f3', + 'thumbnail': 'https://vtvgo-images.vtvdigital.vn/images/20240221/550deff9-7736-4a0e-8b5d-33274d97cd7d.jpg', + }, + }, { + 'url': 
'https://vtvgo.vn/kho-video/888456', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + m3u8_url = self._search_regex( + r'(?:var\s+link\s*=\s*|addPlayer\()["\'](https://[^"\']+/index\.m3u8)["\']', webpage, 'm3u8 url') + return { + 'id': video_id, + 'title': self._og_search_title(webpage, default=None), + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'formats': self._extract_m3u8_formats(m3u8_url, video_id, 'mp4'), + } + + +class VTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vtv\.vn/video/[\w-]*?(?P<id>\d+)\.htm' + _TESTS = [{ + 'url': 'https://vtv.vn/video/thoi-su-20h-vtv1-12-6-2024-680411.htm', + 'info_dict': { + 'id': '680411', + 'ext': 'mp4', + 'title': 'Thời sự 20h VTV1 - 12/6/2024 - Video đã phát trên VTV1 | VTV.VN', + 'thumbnail': 'https://cdn-images.vtv.vn/zoom/600_315/66349b6076cb4dee98746cf1/2024/06/12/thumb/1206-ts-20h-02929741475480320806760.mp4/thumb0.jpg', + }, + }, { + 'url': 'https://vtv.vn/video/zlife-1-khong-ngo-toi-phai-khong-vtv24-560248.htm', + 'info_dict': { + 'id': '560248', + 'ext': 'mp4', + 'title': 'ZLife #1: Không ngờ tới phải không? 
| VTV24 - Video đã phát trên VTV-NEWS | VTV.VN', + 'description': 'Ai đứng sau vụ việc thay đổi ảnh đại diện trên các trang mạng xã hội của VTV Digital tối 2/5?', + 'thumbnail': 'https://video-thumbs.mediacdn.vn/zoom/600_315/vtv/2022/5/13/t67s6btf3ji-16524555726231894427334.jpg', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + data_vid = extract_attributes(get_element_html_by_class( + 'VCSortableInPreviewMode', get_element_html_by_class( + 'video-highlight-box', webpage)))['data-vid'] + m3u8_url = f'https://cdn-videos.vtv.vn/{remove_start(data_vid, "vtv.mediacdn.vn/")}/master.m3u8' + return { + 'id': video_id, + 'title': self._og_search_title(webpage, default=None), + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'formats': self._extract_m3u8_formats(m3u8_url, video_id, 'mp4'), + } diff --git a/yt_dlp/extractor/weibo.py b/yt_dlp/extractor/weibo.py index 3e82909825..b5c0e926f8 100644 --- a/yt_dlp/extractor/weibo.py +++ b/yt_dlp/extractor/weibo.py @@ -52,6 +52,7 @@ class WeiboBaseIE(InfoExtractor): }) def _weibo_download_json(self, url, video_id, *args, fatal=True, note='Downloading JSON metadata', **kwargs): + # XXX: Always fatal; _download_webpage_handle only returns False (not a tuple) on error webpage, urlh = self._download_webpage_handle(url, video_id, *args, fatal=fatal, note=note, **kwargs) if urllib.parse.urlparse(urlh.url).netloc == 'passport.weibo.com': self._update_visitor_cookies(urlh.url, video_id) diff --git a/yt_dlp/extractor/weverse.py b/yt_dlp/extractor/weverse.py index c94ca9db97..6f1a8b95d8 100644 --- a/yt_dlp/extractor/weverse.py +++ b/yt_dlp/extractor/weverse.py @@ -27,8 +27,9 @@ from ..utils import ( class WeverseBaseIE(InfoExtractor): _NETRC_MACHINE = 'weverse' - _ACCOUNT_API_BASE = 'https://accountapi.weverse.io/web/api/v2' + _ACCOUNT_API_BASE = 
'https://accountapi.weverse.io/web/api' _API_HEADERS = { + 'Accept': 'application/json', 'Referer': 'https://weverse.io/', 'WEV-device-Id': str(uuid.uuid4()), } @@ -39,14 +40,14 @@ class WeverseBaseIE(InfoExtractor): headers = { 'x-acc-app-secret': '5419526f1c624b38b10787e5c10b2a7a', - 'x-acc-app-version': '2.2.6', + 'x-acc-app-version': '3.3.6', 'x-acc-language': 'en', 'x-acc-service-id': 'weverse', 'x-acc-trace-id': str(uuid.uuid4()), 'x-clog-user-device-id': str(uuid.uuid4()), } valid_username = traverse_obj(self._download_json( - f'{self._ACCOUNT_API_BASE}/signup/email/status', None, note='Checking username', + f'{self._ACCOUNT_API_BASE}/v2/signup/email/status', None, note='Checking username', query={'email': username}, headers=headers, expected_status=(400, 404)), 'hasPassword') if not valid_username: raise ExtractorError('Invalid username provided', expected=True) @@ -54,8 +55,9 @@ class WeverseBaseIE(InfoExtractor): headers['content-type'] = 'application/json' try: auth = self._download_json( - f'{self._ACCOUNT_API_BASE}/auth/token/by-credentials', None, data=json.dumps({ + f'{self._ACCOUNT_API_BASE}/v3/auth/token/by-credentials', None, data=json.dumps({ 'email': username, + 'otpSessionId': 'BY_PASS', 'password': password, }, separators=(',', ':')).encode(), headers=headers, note='Logging in') except ExtractorError as e: @@ -78,8 +80,10 @@ class WeverseBaseIE(InfoExtractor): # From https://ssl.pstatic.net/static/wevweb/2_3_2_11101725/public/static/js/main.e206f7c1.js: key = b'1b9cb6378d959b45714bec49971ade22e6e24e42' api_path = update_url_query(ep, { + # 'gcc': 'US', 'appId': 'be4d79eb8fc7bd008ee82c8ec4ff6fd4', 'language': 'en', + 'os': 'WEB', 'platform': 'WEB', 'wpf': 'pc', }) @@ -152,7 +156,7 @@ class WeverseBaseIE(InfoExtractor): 'description': ((('extension', 'mediaInfo', 'body'), 'body'), {str}), 'uploader': ('author', 'profileName', {str}), 'uploader_id': ('author', 'memberId', {str}), - 'creator': ('community', 'communityName', {str}), + 'creators': 
('community', 'communityName', {str}, all), 'channel_id': (('community', 'author'), 'communityId', {str_or_none}), 'duration': ('extension', 'video', 'playTime', {float_or_none}), 'timestamp': ('publishedAt', {lambda x: int_or_none(x, 1000)}), @@ -196,7 +200,7 @@ class WeverseIE(WeverseBaseIE): 'channel': 'billlie', 'channel_id': '72', 'channel_url': 'https://weverse.io/billlie', - 'creator': 'Billlie', + 'creators': ['Billlie'], 'timestamp': 1666262062, 'upload_date': '20221020', 'release_timestamp': 1666262058, @@ -222,7 +226,7 @@ class WeverseIE(WeverseBaseIE): 'channel': 'lesserafim', 'channel_id': '47', 'channel_url': 'https://weverse.io/lesserafim', - 'creator': 'LE SSERAFIM', + 'creators': ['LE SSERAFIM'], 'timestamp': 1659353400, 'upload_date': '20220801', 'release_timestamp': 1659353400, @@ -286,7 +290,7 @@ class WeverseIE(WeverseBaseIE): elif live_status == 'is_live': video_info = self._call_api( - f'/video/v1.0/lives/{api_video_id}/playInfo?preview.format=json&preview.version=v2', + f'/video/v1.2/lives/{api_video_id}/playInfo?preview.format=json&preview.version=v2', video_id, note='Downloading live JSON') playback = self._parse_json(video_info['lipPlayback'], video_id) m3u8_url = traverse_obj(playback, ( @@ -302,7 +306,7 @@ class WeverseIE(WeverseBaseIE): else: infra_video_id = post['extension']['video']['infraVideoId'] in_key = self._call_api( - f'/video/v1.0/vod/{api_video_id}/inKey?preview=false', video_id, + f'/video/v1.1/vod/{api_video_id}/inKey?preview=false', video_id, data=b'{}', note='Downloading VOD API key')['inKey'] video_info = self._download_json( @@ -347,7 +351,6 @@ class WeverseMediaIE(WeverseBaseIE): _VALID_URL = r'https?://(?:www\.|m\.)?weverse\.io/(?P<artist>[^/?#]+)/media/(?P<id>[\d-]+)' _TESTS = [{ 'url': 'https://weverse.io/billlie/media/4-116372884', - 'md5': '8efc9cfd61b2f25209eb1a5326314d28', 'info_dict': { 'id': 'e-C9wLSQs6o', 'ext': 'mp4', @@ -358,8 +361,9 @@ class WeverseMediaIE(WeverseBaseIE): 'channel_url': 
'https://www.youtube.com/channel/UCyc9sUCxELTDK9vELO5Fzeg', 'uploader': 'Billlie', 'uploader_id': '@Billlie', - 'uploader_url': 'http://www.youtube.com/@Billlie', + 'uploader_url': 'https://www.youtube.com/@Billlie', 'upload_date': '20230403', + 'timestamp': 1680533992, 'duration': 211, 'age_limit': 0, 'playable_in_embed': True, @@ -372,6 +376,8 @@ class WeverseMediaIE(WeverseBaseIE): 'thumbnail': 'https://i.ytimg.com/vi/e-C9wLSQs6o/maxresdefault.jpg', 'categories': ['Entertainment'], 'tags': 'count:7', + 'channel_is_verified': True, + 'heatmap': 'count:100', }, }, { 'url': 'https://weverse.io/billlie/media/3-102914520', @@ -386,7 +392,7 @@ class WeverseMediaIE(WeverseBaseIE): 'channel': 'billlie', 'channel_id': '72', 'channel_url': 'https://weverse.io/billlie', - 'creator': 'Billlie', + 'creators': ['Billlie'], 'timestamp': 1662174000, 'upload_date': '20220903', 'release_timestamp': 1662174000, @@ -432,7 +438,7 @@ class WeverseMomentIE(WeverseBaseIE): 'uploader_id': '66a07e164b56a696ee71c99315ffe27b', 'channel': 'secretnumber', 'channel_id': '56', - 'creator': 'SECRET NUMBER', + 'creators': ['SECRET NUMBER'], 'duration': 10, 'upload_date': '20230405', 'timestamp': 1680653968, @@ -441,7 +447,6 @@ class WeverseMomentIE(WeverseBaseIE): 'comment_count': int, 'availability': 'needs_auth', }, - 'skip': 'Moment has expired', }] def _real_extract(self, url): @@ -571,7 +576,7 @@ class WeverseLiveIE(WeverseBaseIE): 'channel': 'purplekiss', 'channel_id': '35', 'channel_url': 'https://weverse.io/purplekiss', - 'creator': 'PURPLE KISS', + 'creators': ['PURPLE KISS'], 'timestamp': 1680780892, 'upload_date': '20230406', 'release_timestamp': 1680780883, @@ -584,6 +589,31 @@ class WeverseLiveIE(WeverseBaseIE): 'live_status': 'is_live', }, 'skip': 'Livestream has ended', + }, { + 'url': 'https://weverse.io/lesserafim', + 'info_dict': { + 'id': '4-181521628', + 'ext': 'mp4', + 'title': r're:심심해서요', + 'description': '', + 'uploader': '채채🤎', + 'uploader_id': 
'd49b8b06f3cc1d92d655b25ab27ac2e7', + 'channel': 'lesserafim', + 'channel_id': '47', + 'creators': ['LE SSERAFIM'], + 'channel_url': 'https://weverse.io/lesserafim', + 'timestamp': 1728570273, + 'upload_date': '20241010', + 'release_timestamp': 1728570264, + 'release_date': '20241010', + 'thumbnail': r're:https://phinf\.wevpstatic\.net/.+\.png', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'availability': 'needs_auth', + 'live_status': 'is_live', + }, + 'skip': 'Livestream has ended', }, { 'url': 'https://weverse.io/billlie/', 'only_matching': True, diff --git a/yt_dlp/extractor/wistia.py b/yt_dlp/extractor/wistia.py index fb2a8648fd..df7ecb3cdc 100644 --- a/yt_dlp/extractor/wistia.py +++ b/yt_dlp/extractor/wistia.py @@ -8,6 +8,7 @@ from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, determine_ext, + filter_dict, float_or_none, int_or_none, parse_qs, @@ -25,16 +26,25 @@ class WistiaBaseIE(InfoExtractor): def _download_embed_config(self, config_type, config_id, referer): base_url = self._EMBED_BASE_URL + f'{config_type}/{config_id}' + video_password = self.get_param('videopassword') embed_config = self._download_json( base_url + '.json', config_id, headers={ 'Referer': referer if referer.startswith('http') else base_url, # Some videos require this. - }) + }, query=filter_dict({'password': video_password})) error = traverse_obj(embed_config, 'error') if error: raise ExtractorError( f'Error while getting the playlist: {error}', expected=True) + if traverse_obj(embed_config, ( + 'media', ('embed_options', 'embedOptions'), 'plugin', + 'passwordProtectedVideo', 'on', any)) == 'true': + if video_password: + raise ExtractorError('Invalid video password', expected=True) + raise ExtractorError( + 'This content is password-protected. 
Use the --video-password option', expected=True) + return embed_config def _get_real_ext(self, url): diff --git a/yt_dlp/extractor/wsj.py b/yt_dlp/extractor/wsj.py index 7b3f6aa2ae..b6b656f7d3 100644 --- a/yt_dlp/extractor/wsj.py +++ b/yt_dlp/extractor/wsj.py @@ -2,6 +2,7 @@ from .common import InfoExtractor from ..utils import ( float_or_none, int_or_none, + join_nonempty, unified_strdate, ) @@ -76,7 +77,7 @@ class WSJIE(InfoExtractor): tbr = int_or_none(v.get('bitrate')) formats.append({ 'url': mp4_url, - 'format_id': 'http' + (f'-{tbr}' if tbr else ''), + 'format_id': join_nonempty('http', tbr), 'tbr': tbr, 'width': int_or_none(v.get('width')), 'height': int_or_none(v.get('height')), diff --git a/yt_dlp/extractor/ximalaya.py b/yt_dlp/extractor/ximalaya.py index e900a4ad9f..02bf6a7beb 100644 --- a/yt_dlp/extractor/ximalaya.py +++ b/yt_dlp/extractor/ximalaya.py @@ -1,7 +1,17 @@ +import base64 import math +import time from .common import InfoExtractor -from ..utils import InAdvancePagedList, str_or_none, traverse_obj, try_call +from .videa import VideaIE +from ..utils import ( + InAdvancePagedList, + int_or_none, + str_or_none, + traverse_obj, + try_call, + update_url_query, +) class XimalayaBaseIE(InfoExtractor): @@ -11,7 +21,7 @@ class XimalayaBaseIE(InfoExtractor): class XimalayaIE(XimalayaBaseIE): IE_NAME = 'ximalaya' IE_DESC = '喜马拉雅FM' - _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(:?(?P<uid>\d+)/)?sound/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?:(?P<uid>\d+)/)?sound/(?P<id>[0-9]+)' _TESTS = [ { 'url': 'http://www.ximalaya.com/sound/47740352/', @@ -71,23 +81,92 @@ class XimalayaIE(XimalayaBaseIE): 'like_count': int, }, }, + { + # VIP-restricted audio + 'url': 'https://www.ximalaya.com/sound/562111701', + 'only_matching': True, + }, ] + @staticmethod + def _decrypt_filename(file_id, seed): + cgstr = '' + key = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890' + for _ in key: + seed = float(int(211 * 
seed + 30031) % 65536) + r = int(seed / 65536 * len(key)) + cgstr += key[r] + key = key.replace(key[r], '') + parts = file_id.split('*') + filename = ''.join(cgstr[int(part)] for part in parts if part.isdecimal()) + if not filename.startswith('/'): + filename = '/' + filename + return filename + + @staticmethod + def _decrypt_url_params(encrypted_params): + params = VideaIE.rc4( + base64.b64decode(encrypted_params), 'xkt3a41psizxrh9l').split('-') + # sign, token, timestamp + return params[1], params[2], params[3] + def _real_extract(self, url): scheme = 'https' if url.startswith('https') else 'http' audio_id = self._match_id(url) - audio_info_file = f'{scheme}://m.ximalaya.com/tracks/{audio_id}.json' audio_info = self._download_json( - audio_info_file, audio_id, - f'Downloading info json {audio_info_file}', 'Unable to download info file') - - formats = [{ + f'{scheme}://m.ximalaya.com/tracks/{audio_id}.json', audio_id, + 'Downloading info json', 'Unable to download info file') + + formats = [] + # NOTE: VIP-restricted audio + if audio_info.get('is_paid'): + ts = int(time.time()) + vip_info = self._download_json( + f'{scheme}://mpay.ximalaya.com/mobile/track/pay/{audio_id}/{ts}', + audio_id, 'Downloading VIP info json', 'Unable to download VIP info file', + query={'device': 'pc', 'isBackend': 'true', '_': ts}) + filename = self._decrypt_filename(vip_info['fileId'], vip_info['seed']) + sign, token, timestamp = self._decrypt_url_params(vip_info['ep']) + vip_url = update_url_query( + f'{vip_info["domain"]}/download/{vip_info["apiVersion"]}{filename}', { + 'sign': sign, + 'token': token, + 'timestamp': timestamp, + 'buy_key': vip_info['buyKey'], + 'duration': vip_info['duration'], + }) + fmt = { + 'format_id': 'vip', + 'url': vip_url, + 'vcodec': 'none', + } + if '_preview_' in vip_url: + self.report_warning( + f'This tracks requires a VIP account. Using a sample instead. 
{self._login_hint()}') + fmt.update({ + 'format_note': 'Sample', + 'preference': -10, + **traverse_obj(vip_info, { + 'filesize': ('sampleLength', {int_or_none}), + 'duration': ('sampleDuration', {int_or_none}), + }), + }) + else: + fmt.update(traverse_obj(vip_info, { + 'filesize': ('totalLength', {int_or_none}), + 'duration': ('duration', {int_or_none}), + })) + + fmt['abr'] = try_call(lambda: fmt['filesize'] * 8 / fmt['duration'] / 1024) + formats.append(fmt) + + formats.extend([{ 'format_id': f'{bps}k', 'url': audio_info[k], 'abr': bps, 'vcodec': 'none', - } for bps, k in ((24, 'play_path_32'), (64, 'play_path_64')) if audio_info.get(k)] + } for bps, k in ((24, 'play_path_32'), (64, 'play_path_64')) if audio_info.get(k)]) thumbnails = [] for k in audio_info: diff --git a/yt_dlp/extractor/xinpianchang.py b/yt_dlp/extractor/xinpianchang.py index 10849916b8..23ed9270da 100644 --- a/yt_dlp/extractor/xinpianchang.py +++ b/yt_dlp/extractor/xinpianchang.py @@ -3,16 +3,13 @@ from ..utils import ( int_or_none, str_or_none, try_get, - update_url_query, url_or_none, ) class XinpianchangIE(InfoExtractor): - _WORKING = False - _VALID_URL = r'https?://www\.xinpianchang\.com/(?P<id>[^/]+?)(?:\D|$)' - IE_NAME = 'xinpianchang' - IE_DESC = 'xinpianchang.com' + _VALID_URL = r'https?://(www\.)?xinpianchang\.com/(?P<id>a\d+)' + IE_DESC = '新片场' _TESTS = [{ 'url': 'https://www.xinpianchang.com/a11766551', 'info_dict': { @@ -49,11 +46,11 @@ class XinpianchangIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id=video_id) - domain = self.find_value_with_regex(var='requireNewDomain', webpage=webpage) - vid = self.find_value_with_regex(var='vid', webpage=webpage) - app_key = self.find_value_with_regex(var='modeServerAppKey', webpage=webpage) - api = update_url_query(f'{domain}/mod/api/v2/media/{vid}', {'appKey': app_key}) - data = self._download_json(api, video_id=video_id)['data'] + video_data = 
self._search_nextjs_data(webpage, video_id)['props']['pageProps']['detail']['video'] + + data = self._download_json( + f'https://mod-api.xinpianchang.com/mod/api/v2/media/{video_data["vid"]}', video_id, + query={'appKey': video_data['appKey']})['data'] formats, subtitles = [], {} for k, v in data.get('resource').items(): if k in ('dash', 'hls'): @@ -72,6 +69,10 @@ class XinpianchangIE(InfoExtractor): 'width': int_or_none(prog.get('width')), 'height': int_or_none(prog.get('height')), 'ext': 'mp4', + 'http_headers': { + # NB: Server returns 403 without the Range header + 'Range': 'bytes=0-', + }, } for prog in v if prog.get('url') or []]) return { @@ -87,6 +88,3 @@ class XinpianchangIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, } - - def find_value_with_regex(self, var, webpage): - return self._search_regex(rf'var\s{var}\s=\s\"(?P<vid>[^\"]+)\"', webpage, name=var) diff --git a/yt_dlp/extractor/yahoo.py b/yt_dlp/extractor/yahoo.py index f0ba830380..35e71209c5 100644 --- a/yt_dlp/extractor/yahoo.py +++ b/yt_dlp/extractor/yahoo.py @@ -8,6 +8,7 @@ from ..utils import ( ExtractorError, clean_html, int_or_none, + join_nonempty, mimetype2ext, parse_iso8601, traverse_obj, @@ -213,7 +214,7 @@ class YahooIE(InfoExtractor): tbr = int_or_none(s.get('bitrate')) formats.append({ 'url': s_url, - 'format_id': fmt + (f'-{tbr}' if tbr else ''), + 'format_id': join_nonempty(fmt, tbr), 'width': int_or_none(s.get('width')), 'height': int_or_none(s.get('height')), 'tbr': tbr, @@ -371,12 +372,13 @@ class YahooJapanNewsIE(InfoExtractor): url, content_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) else: + bitrate = int_or_none(vid.get('bitrate')) formats.append({ 'url': url, - 'format_id': f'http-{vid.get("bitrate")}', + 'format_id': join_nonempty('http', bitrate), 'height': int_or_none(vid.get('height')), 'width': int_or_none(vid.get('width')), - 'tbr': int_or_none(vid.get('bitrate')), + 'tbr': bitrate, }) self._remove_duplicate_formats(formats) diff --git 
a/yt_dlp/extractor/yandexdisk.py b/yt_dlp/extractor/yandexdisk.py index 56aa792929..3214816701 100644 --- a/yt_dlp/extractor/yandexdisk.py +++ b/yt_dlp/extractor/yandexdisk.py @@ -5,6 +5,7 @@ from ..utils import ( determine_ext, float_or_none, int_or_none, + join_nonempty, mimetype2ext, try_get, urljoin, @@ -116,12 +117,9 @@ class YandexDiskIE(InfoExtractor): else: size = video.get('size') or {} height = int_or_none(size.get('height')) - format_id = 'hls' - if height: - format_id += f'-{height}p' formats.append({ 'ext': 'mp4', - 'format_id': format_id, + 'format_id': join_nonempty('hls', height and f'{height}p'), 'height': height, 'protocol': 'm3u8_native', 'url': format_url, diff --git a/yt_dlp/extractor/yle_areena.py b/yt_dlp/extractor/yle_areena.py index 796f7f3167..c2daddfa6c 100644 --- a/yt_dlp/extractor/yle_areena.py +++ b/yt_dlp/extractor/yle_areena.py @@ -1,61 +1,53 @@ from .common import InfoExtractor from .kaltura import KalturaIE from ..utils import ( + ExtractorError, int_or_none, + parse_iso8601, smuggle_url, - traverse_obj, - unified_strdate, url_or_none, ) +from ..utils.traversal import traverse_obj class YleAreenaIE(InfoExtractor): - _VALID_URL = r'https?://areena\.yle\.fi/(?P<id>[\d-]+)' + _VALID_URL = r'https?://areena\.yle\.fi/(?P<podcast>podcastit/)?(?P<id>[\d-]+)' + _GEO_COUNTRIES = ['FI'] _TESTS = [ { 'url': 'https://areena.yle.fi/1-4371942', - 'md5': '932edda0ecf5dfd6423804182d32f8ac', + 'md5': 'd87e9a1e74e67e009990ddd413e426b4', 'info_dict': { - 'id': '0_a3tjk92c', + 'id': '1-4371942', 'ext': 'mp4', 'title': 'Pouchit', - 'description': 'md5:d487309c3abbe5650265bbd1742d2f82', + 'description': 'md5:01071d7056ceec375f63960f90c35366', 'series': 'Modernit miehet', 'season': 'Season 1', 'season_number': 1, 'episode': 'Episode 2', 'episode_number': 2, - 'thumbnail': 'http://cfvod.kaltura.com/p/1955031/sp/195503100/thumbnail/entry_id/0_a3tjk92c/version/100061', - 'uploader_id': 'ovp@yle.fi', - 'duration': 1435, - 'view_count': int, - 'upload_date': 
'20181204', - 'release_date': '20190106', - 'timestamp': 1543916210, - 'subtitles': {'fin': [{'url': r're:^https?://', 'ext': 'srt'}]}, + 'thumbnail': r're:https://images\.cdn\.yle\.fi/image/upload/.+\.jpg', 'age_limit': 7, - 'webpage_url': 'https://areena.yle.fi/1-4371942', + 'release_date': '20190105', + 'release_timestamp': 1546725660, + 'duration': 1435, }, }, { 'url': 'https://areena.yle.fi/1-2158940', - 'md5': 'cecb603661004e36af8c5188b5212b12', + 'md5': '6369ddc5e07b5fdaeda27a495184143c', 'info_dict': { - 'id': '1_l38iz9ur', + 'id': '1-2158940', 'ext': 'mp4', 'title': 'Albi haluaa vessan', - 'description': 'md5:15236d810c837bed861fae0e88663c33', + 'description': 'Albi haluaa vessan.', 'series': 'Albi Lumiukko', - 'thumbnail': 'http://cfvod.kaltura.com/p/1955031/sp/195503100/thumbnail/entry_id/1_l38iz9ur/version/100021', - 'uploader_id': 'ovp@yle.fi', - 'duration': 319, - 'view_count': int, - 'upload_date': '20211202', - 'release_date': '20211215', - 'timestamp': 1638448202, - 'subtitles': {}, + 'thumbnail': r're:https://images\.cdn\.yle\.fi/image/upload/.+\.jpg', 'age_limit': 0, - 'webpage_url': 'https://areena.yle.fi/1-2158940', + 'release_date': '20211215', + 'release_timestamp': 1639555200, + 'duration': 319, }, }, { @@ -66,69 +58,125 @@ class YleAreenaIE(InfoExtractor): 'title': 'HKO & Mälkki & Tanner', 'description': 'md5:b4f1b1af2c6569b33f75179a86eea156', 'series': 'Helsingin kaupunginorkesterin konsertteja', - 'thumbnail': r're:^https?://.+\.jpg$', + 'thumbnail': r're:https://images\.cdn\.yle\.fi/image/upload/.+\.jpg', 'release_date': '20230120', + 'release_timestamp': 1674242079, + 'duration': 8004, }, 'params': { 'skip_download': 'm3u8', }, }, + { + 'url': 'https://areena.yle.fi/1-72251830', + 'info_dict': { + 'id': '1-72251830', + 'ext': 'mp4', + 'title': r're:Pentulive 2024 | Pentulive \d{4}-\d{2}-\d{2} \d{2}:\d{2}', + 'description': 'md5:1f118707d9093bf894a34fbbc865397b', + 'series': 'Pentulive', + 'thumbnail': 
r're:https://images\.cdn\.yle\.fi/image/upload/.+\.jpg', + 'live_status': 'is_live', + 'release_date': '20241025', + 'release_timestamp': 1729875600, + }, + 'params': { + 'skip_download': 'livestream', + }, + }, + { + 'url': 'https://areena.yle.fi/podcastit/1-71022852', + 'info_dict': { + 'id': '1-71022852', + 'ext': 'mp3', + 'title': 'Värityspäivä', + 'description': 'md5:c3a02b0455ec71d32cbe09d32ec161e2', + 'series': 'Murun ja Paukun ikioma kaupunki', + 'episode': 'Episode 1', + 'episode_number': 1, + 'release_date': '20240607', + 'release_timestamp': 1717736400, + 'duration': 442, + }, + }, ] def _real_extract(self, url): - video_id = self._match_id(url) - info = self._search_json_ld(self._download_webpage(url, video_id), video_id, default={}) + video_id, is_podcast = self._match_valid_url(url).group('id', 'podcast') + json_ld = self._search_json_ld(self._download_webpage(url, video_id), video_id, default={}) video_data = self._download_json( f'https://player.api.yle.fi/v1/preview/{video_id}.json?app_id=player_static_prod&app_key=8930d72170e48303cf5f3867780d549b', video_id, headers={ 'origin': 'https://areena.yle.fi', 'referer': 'https://areena.yle.fi/', 'content-type': 'application/json', - }) + })['data'] # Example title: 'K1, J2: Pouchit | Modernit miehet' - series, season_number, episode_number, episode = self._search_regex( - r'K(?P<season_no>[\d]+),\s*J(?P<episode_no>[\d]+):?\s*\b(?P<episode>[^|]+)\s*|\s*(?P<series>.+)', - info.get('title') or '', 'episode metadata', group=('season_no', 'episode_no', 'episode', 'series'), + season_number, episode_number, episode, series = self._search_regex( + r'K(?P<season_no>\d+),\s*J(?P<episode_no>\d+):?\s*\b(?P<episode>[^|]+)\s*|\s*(?P<series>.+)', + json_ld.get('title') or '', 'episode metadata', group=('season_no', 'episode_no', 'episode', 'series'), default=(None, None, None, None)) - description = traverse_obj(video_data, ('data', 'ongoing_ondemand', 'description', 'fin'), expected_type=str) + description = 
traverse_obj(video_data, ('ongoing_ondemand', 'description', 'fin', {str})) subtitles = {} - for sub in traverse_obj(video_data, ('data', 'ongoing_ondemand', 'subtitles', ...)): - if url_or_none(sub.get('uri')): - subtitles.setdefault(sub.get('language') or 'und', []).append({ - 'url': sub['uri'], - 'ext': 'srt', - 'name': sub.get('kind'), - }) + for sub in traverse_obj(video_data, ('ongoing_ondemand', 'subtitles', lambda _, v: url_or_none(v['uri']))): + subtitles.setdefault(sub.get('language') or 'und', []).append({ + 'url': sub['uri'], + 'ext': 'srt', + 'name': sub.get('kind'), + }) - kaltura_id = traverse_obj(video_data, ('data', 'ongoing_ondemand', 'kaltura', 'id'), expected_type=str) - if kaltura_id: - info_dict = { + info_dict, metadata = {}, {} + if is_podcast and traverse_obj(video_data, ('ongoing_ondemand', 'media_url', {url_or_none})): + metadata = video_data['ongoing_ondemand'] + info_dict['url'] = metadata['media_url'] + elif traverse_obj(video_data, ('ongoing_event', 'manifest_url', {url_or_none})): + metadata = video_data['ongoing_event'] + metadata.pop('duration', None) # Duration is not accurate for livestreams + info_dict['live_status'] = 'is_live' + elif traverse_obj(video_data, ('ongoing_ondemand', 'manifest_url', {url_or_none})): + metadata = video_data['ongoing_ondemand'] + # XXX: Has all externally-hosted Kaltura content been moved to native hosting? 
+ elif kaltura_id := traverse_obj(video_data, ('ongoing_ondemand', 'kaltura', 'id', {str})): + metadata = video_data['ongoing_ondemand'] + info_dict.update({ '_type': 'url_transparent', 'url': smuggle_url(f'kaltura:1955031:{kaltura_id}', {'source_url': url}), 'ie_key': KalturaIE.ie_key(), - } + }) + elif traverse_obj(video_data, ('gone', {dict})): + self.raise_no_formats('The content is no longer available', expected=True, video_id=video_id) + metadata = video_data['gone'] else: - info_dict = { - 'id': video_id, - 'formats': self._extract_m3u8_formats( - video_data['data']['ongoing_ondemand']['manifest_url'], video_id, 'mp4', m3u8_id='hls'), - } + raise ExtractorError('Unable to extract content') + + if not info_dict.get('url') and metadata.get('manifest_url'): + info_dict['formats'], subs = self._extract_m3u8_formats_and_subtitles( + metadata['manifest_url'], video_id, 'mp4', m3u8_id='hls') + self._merge_subtitles(subs, target=subtitles) return { - **info_dict, - 'title': (traverse_obj(video_data, ('data', 'ongoing_ondemand', 'title', 'fin'), expected_type=str) - or episode or info.get('title')), + **traverse_obj(json_ld, { + 'title': 'title', + 'thumbnails': ('thumbnails', ..., {'url': 'url'}), + }), + 'id': video_id, + 'title': episode, 'description': description, - 'series': (traverse_obj(video_data, ('data', 'ongoing_ondemand', 'series', 'title', 'fin'), expected_type=str) - or series), + 'series': series, 'season_number': (int_or_none(self._search_regex(r'Kausi (\d+)', description, 'season number', default=None)) or int_or_none(season_number)), - 'episode_number': (traverse_obj(video_data, ('data', 'ongoing_ondemand', 'episode_number'), expected_type=int_or_none) - or int_or_none(episode_number)), - 'thumbnails': traverse_obj(info, ('thumbnails', ..., {'url': 'url'})), - 'age_limit': traverse_obj(video_data, ('data', 'ongoing_ondemand', 'content_rating', 'age_restriction'), expected_type=int_or_none), - 'subtitles': subtitles, - 'release_date': 
unified_strdate(traverse_obj(video_data, ('data', 'ongoing_ondemand', 'start_time'), expected_type=str)), + 'episode_number': int_or_none(episode_number), + 'subtitles': subtitles or None, + **traverse_obj(metadata, { + 'title': ('title', 'fin', {str}), + 'description': ('description', 'fin', {str}), + 'series': ('series', 'title', 'fin', {str}), + 'episode_number': ('episode_number', {int_or_none}), + 'age_limit': ('content_rating', 'age_restriction', {int_or_none}), + 'release_timestamp': ('start_time', {parse_iso8601}), + 'duration': ('duration', 'duration_in_seconds', {int_or_none}), + }), + **info_dict, } diff --git a/yt_dlp/extractor/youku.py b/yt_dlp/extractor/youku.py index fa6b0539bb..3bdfa6c933 100644 --- a/yt_dlp/extractor/youku.py +++ b/yt_dlp/extractor/youku.py @@ -136,7 +136,7 @@ class YoukuIE(InfoExtractor): # request basic data basic_data_params = { 'vid': video_id, - 'ccode': '0524', + 'ccode': '0564', 'client_ip': '192.168.1.1', 'utid': cna, 'client_ts': time.time() / 1000, diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index a227f24258..99b8bfecc9 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -4,6 +4,7 @@ import collections import copy import datetime as dt import enum +import functools import hashlib import itertools import json @@ -20,9 +21,8 @@ import urllib.parse from .common import InfoExtractor, SearchInfoExtractor from .openload import PhantomJSwrapper -from ..compat import functools from ..jsinterp import JSInterpreter -from ..networking.exceptions import HTTPError, network_exceptions +from ..networking.exceptions import HTTPError, TransportError, network_exceptions from ..utils import ( NO_DEFAULT, ExtractorError, @@ -55,6 +55,7 @@ from ..utils import ( str_or_none, str_to_int, strftime_or_none, + time_seconds, traverse_obj, try_call, try_get, @@ -69,148 +70,182 @@ from ..utils import ( ) STREAMING_DATA_CLIENT_NAME = '__yt_dlp_client' +STREAMING_DATA_PO_TOKEN = '__yt_dlp_po_token' + 
# any clients starting with _ cannot be explicitly requested by the user INNERTUBE_CLIENTS = { 'web': { - 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB', - 'clientVersion': '2.20220801.00.00', + 'clientVersion': '2.20240726.00.00', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 1, + 'REQUIRE_PO_TOKEN': True, + }, + # Safari UA returns pre-merged video+audio 144p/240p/360p/720p/1080p HLS formats + 'web_safari': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'WEB', + 'clientVersion': '2.20240726.00.00', + 'userAgent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15,gzip(gfe)', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1, + 'REQUIRE_PO_TOKEN': True, }, 'web_embedded': { - 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_EMBEDDED_PLAYER', - 'clientVersion': '1.20220731.00.00', + 'clientVersion': '1.20240723.01.00', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 56, }, 'web_music': { - 'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30', 'INNERTUBE_HOST': 'music.youtube.com', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_REMIX', - 'clientVersion': '1.20220727.01.00', + 'clientVersion': '1.20240724.00.00', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 67, }, + # This client now requires sign-in for every video 'web_creator': { - 'INNERTUBE_API_KEY': 'AIzaSyBUPetSUmoZL-OhlxA7wSac5XinrygCqMo', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_CREATOR', - 'clientVersion': '1.20220726.00.00', + 'clientVersion': '1.20240723.03.00', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 62, }, 'android': { - 'INNERTUBE_API_KEY': 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID', - 'clientVersion': '19.09.37', + 'clientVersion': '19.29.37', 'androidSdkVersion': 30, - 'userAgent': 'com.google.android.youtube/19.09.37 
(Linux; U; Android 11) gzip', + 'userAgent': 'com.google.android.youtube/19.29.37 (Linux; U; Android 11) gzip', + 'osName': 'Android', + 'osVersion': '11', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 3, 'REQUIRE_JS_PLAYER': False, - }, - 'android_embedded': { - 'INNERTUBE_API_KEY': 'AIzaSyCjc_pVEDi4qsv5MtC2dMXzpIaDoRFLsxw', - 'INNERTUBE_CONTEXT': { - 'client': { - 'clientName': 'ANDROID_EMBEDDED_PLAYER', - 'clientVersion': '19.09.37', - 'androidSdkVersion': 30, - 'userAgent': 'com.google.android.youtube/19.09.37 (Linux; U; Android 11) gzip', - }, - }, - 'INNERTUBE_CONTEXT_CLIENT_NAME': 55, - 'REQUIRE_JS_PLAYER': False, + 'REQUIRE_PO_TOKEN': True, }, 'android_music': { - 'INNERTUBE_API_KEY': 'AIzaSyAOghZGza2MQSZkY_zfZ370N-PUdXEo8AI', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_MUSIC', - 'clientVersion': '6.42.52', + 'clientVersion': '7.11.50', 'androidSdkVersion': 30, - 'userAgent': 'com.google.android.apps.youtube.music/6.42.52 (Linux; U; Android 11) gzip', + 'userAgent': 'com.google.android.apps.youtube.music/7.11.50 (Linux; U; Android 11) gzip', + 'osName': 'Android', + 'osVersion': '11', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 21, 'REQUIRE_JS_PLAYER': False, + 'REQUIRE_PO_TOKEN': True, }, + # This client now requires sign-in for every video 'android_creator': { - 'INNERTUBE_API_KEY': 'AIzaSyD_qjV8zaaUMehtLkrKFgVeSX_Iqbtyws8', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_CREATOR', - 'clientVersion': '22.30.100', + 'clientVersion': '24.30.100', 'androidSdkVersion': 30, - 'userAgent': 'com.google.android.apps.youtube.creator/22.30.100 (Linux; U; Android 11) gzip', + 'userAgent': 'com.google.android.apps.youtube.creator/24.30.100 (Linux; U; Android 11) gzip', + 'osName': 'Android', + 'osVersion': '11', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 14, 'REQUIRE_JS_PLAYER': False, + 'REQUIRE_PO_TOKEN': True, }, - # iOS clients have HLS live streams. Setting device model to get 60fps formats. 
- # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680#issuecomment-1002724558 - 'ios': { - 'INNERTUBE_API_KEY': 'AIzaSyB-63vPrdThhKuerbB2N_l7Kwwcxj6yUAc', + # YouTube Kids videos aren't returned on this client for some reason + 'android_vr': { 'INNERTUBE_CONTEXT': { 'client': { - 'clientName': 'IOS', - 'clientVersion': '19.09.3', - 'deviceModel': 'iPhone14,3', - 'userAgent': 'com.google.ios.youtube/19.09.3 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)', + 'clientName': 'ANDROID_VR', + 'clientVersion': '1.57.29', + 'deviceMake': 'Oculus', + 'deviceModel': 'Quest 3', + 'androidSdkVersion': 32, + 'userAgent': 'com.google.android.apps.youtube.vr.oculus/1.57.29 (Linux; U; Android 12L; eureka-user Build/SQ3A.220605.009.A1) gzip', + 'osName': 'Android', + 'osVersion': '12L', }, }, - 'INNERTUBE_CONTEXT_CLIENT_NAME': 5, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 28, 'REQUIRE_JS_PLAYER': False, }, - 'ios_embedded': { + 'android_testsuite': { 'INNERTUBE_CONTEXT': { 'client': { - 'clientName': 'IOS_MESSAGES_EXTENSION', - 'clientVersion': '19.09.3', - 'deviceModel': 'iPhone14,3', - 'userAgent': 'com.google.ios.youtube/19.09.3 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)', + 'clientName': 'ANDROID_TESTSUITE', + 'clientVersion': '1.9', + 'androidSdkVersion': 30, + 'userAgent': 'com.google.android.youtube/1.9 (Linux; U; Android 11) gzip', + 'osName': 'Android', + 'osVersion': '11', }, }, - 'INNERTUBE_CONTEXT_CLIENT_NAME': 66, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 30, + 'REQUIRE_JS_PLAYER': False, + 'PLAYER_PARAMS': '2AMB', + }, + # iOS clients have HLS live streams. Setting device model to get 60fps formats. 
+ # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680#issuecomment-1002724558 + 'ios': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'IOS', + 'clientVersion': '19.29.1', + 'deviceMake': 'Apple', + 'deviceModel': 'iPhone16,2', + 'userAgent': 'com.google.ios.youtube/19.29.1 (iPhone16,2; U; CPU iOS 17_5_1 like Mac OS X;)', + 'osName': 'iPhone', + 'osVersion': '17.5.1.21F90', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 5, 'REQUIRE_JS_PLAYER': False, }, 'ios_music': { - 'INNERTUBE_API_KEY': 'AIzaSyBAETezhkwP0ZWA02RsqT1zu78Fpt0bC_s', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_MUSIC', - 'clientVersion': '6.33.3', - 'deviceModel': 'iPhone14,3', - 'userAgent': 'com.google.ios.youtubemusic/6.33.3 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)', + 'clientVersion': '7.08.2', + 'deviceMake': 'Apple', + 'deviceModel': 'iPhone16,2', + 'userAgent': 'com.google.ios.youtubemusic/7.08.2 (iPhone16,2; U; CPU iOS 17_5_1 like Mac OS X;)', + 'osName': 'iPhone', + 'osVersion': '17.5.1.21F90', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 26, 'REQUIRE_JS_PLAYER': False, }, + # This client now requires sign-in for every video 'ios_creator': { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_CREATOR', - 'clientVersion': '22.33.101', - 'deviceModel': 'iPhone14,3', - 'userAgent': 'com.google.ios.ytcreator/22.33.101 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)', + 'clientVersion': '24.30.100', + 'deviceMake': 'Apple', + 'deviceModel': 'iPhone16,2', + 'userAgent': 'com.google.ios.ytcreator/24.30.100 (iPhone16,2; U; CPU iOS 17_5_1 like Mac OS X;)', + 'osName': 'iPhone', + 'osVersion': '17.5.1.21F90', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 15, @@ -219,19 +254,27 @@ INNERTUBE_CLIENTS = { # mweb has 'ultralow' formats # See: https://github.com/yt-dlp/yt-dlp/pull/557 'mweb': { - 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'MWEB', - 'clientVersion': '2.20220801.00.00', + 'clientVersion': 
'2.20240726.01.00', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 2, }, - # This client can access age restricted videos (unless the uploader has disabled the 'allow embedding' option) - # See: https://github.com/zerodytrash/YouTube-Internal-Clients + 'tv': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'TVHTML5', + 'clientVersion': '7.20240724.13.00', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 7, + }, + # This client now requires sign-in for every video + # It was previously an age-gate workaround for videos that were `playable_in_embed` + # It may still be useful if signed into an EU account that is not age-verified 'tv_embedded': { - 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER', @@ -249,6 +292,7 @@ INNERTUBE_CLIENTS = { }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 95, + 'REQUIRE_JS_PLAYER': False, }, } @@ -262,7 +306,7 @@ def _split_innertube_client(client_name): def short_client_name(client_name): - main, *parts = _split_innertube_client(client_name)[0].replace('embedscreen', 'e_s').split('_') + main, *parts = _split_innertube_client(client_name)[0].split('_') return join_nonempty(main[:4], ''.join(x[0] for x in parts)).upper() @@ -270,27 +314,23 @@ def build_innertube_clients(): THIRD_PARTY = { 'embedUrl': 'https://www.youtube.com/', # Can be any valid URL } - BASE_CLIENTS = ('ios', 'android', 'web', 'tv', 'mweb') + BASE_CLIENTS = ('ios', 'web', 'tv', 'mweb', 'android') priority = qualities(BASE_CLIENTS[::-1]) for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()): - ytcfg.setdefault('INNERTUBE_API_KEY', 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8') ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com') ytcfg.setdefault('REQUIRE_JS_PLAYER', True) + ytcfg.setdefault('REQUIRE_PO_TOKEN', False) + ytcfg.setdefault('PLAYER_PARAMS', None) ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en') _, base_client, variant = _split_innertube_client(client) ytcfg['priority'] 
= 10 * priority(base_client) - if not variant: - INNERTUBE_CLIENTS[f'{client}_embedscreen'] = embedscreen = copy.deepcopy(ytcfg) - embedscreen['INNERTUBE_CONTEXT']['client']['clientScreen'] = 'EMBED' - embedscreen['INNERTUBE_CONTEXT']['thirdParty'] = THIRD_PARTY - embedscreen['priority'] -= 3 - elif variant == 'embedded': + if variant == 'embedded': ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = THIRD_PARTY ytcfg['priority'] -= 2 - else: + elif variant: ytcfg['priority'] -= 3 @@ -468,11 +508,16 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'si', 'th', 'lo', 'my', 'ka', 'am', 'km', 'zh-CN', 'zh-TW', 'zh-HK', 'ja', 'ko', ] - _IGNORED_WARNINGS = {'Unavailable videos will be hidden during playback'} + _IGNORED_WARNINGS = { + 'Unavailable videos will be hidden during playback', + 'Unavailable videos are hidden', + } _YT_HANDLE_RE = r'@[\w.-]{3,30}' # https://support.google.com/youtube/answer/11585688?hl=en _YT_CHANNEL_UCID_RE = r'UC[\w-]{22}' + _NETRC_MACHINE = 'youtube' + def ucid_or_none(self, ucid): return self._search_regex(rf'^({self._YT_CHANNEL_UCID_RE})$', ucid, 'UC-id', default=None) @@ -531,9 +576,214 @@ class YoutubeBaseInfoExtractor(InfoExtractor): self._initialize_consent() self._check_login_required() + def _perform_login(self, username, password): + auth_type, _, user = (username or '').partition('+') + + if auth_type != 'oauth': + raise ExtractorError(self._youtube_login_hint, expected=True) + + self._initialize_oauth(user, password) + + ''' + OAuth 2.0 Device Authorization Grant flow, used by the YouTube TV client (youtube.com/tv). 
+ + For more information regarding OAuth 2.0 and the Device Authorization Grant flow in general, see: + - https://developers.google.com/identity/protocols/oauth2/limited-input-device + - https://accounts.google.com/.well-known/openid-configuration + - https://www.rfc-editor.org/rfc/rfc8628 + - https://www.rfc-editor.org/rfc/rfc6749 + + Note: The official client appears to use a proxied version of the oauth2 endpoints on youtube.com/o/oauth2, + which applies some modifications to the response (such as returning errors as 200 OK). + Since the client works with the standard API, we will use that as it is well-documented. + ''' + + _OAUTH_PROFILE = None + _OAUTH_ACCESS_TOKEN_CACHE = {} + _OAUTH_DISPLAY_ID = 'oauth' + + # YouTube TV (TVHTML5) client. You can find these at youtube.com/tv + _OAUTH_CLIENT_ID = '861556708454-d6dlm3lh05idd8npek18k6be8ba3oc68.apps.googleusercontent.com' + _OAUTH_CLIENT_SECRET = 'SboVhoG9s0rNafixCSGGKXAT' + _OAUTH_SCOPE = 'http://gdata.youtube.com https://www.googleapis.com/auth/youtube-paid-content' + + # From https://accounts.google.com/.well-known/openid-configuration + # Technically, these should be fetched dynamically and not hard-coded. + # However, as these endpoints rarely change, we can risk saving an extra request for every invocation. 
+ _OAUTH_DEVICE_AUTHORIZATION_ENDPOINT = 'https://oauth2.googleapis.com/device/code' + _OAUTH_TOKEN_ENDPOINT = 'https://oauth2.googleapis.com/token' + + @property + def _oauth_cache_key(self): + return f'oauth_refresh_token_{self._OAUTH_PROFILE}' + + def _read_oauth_error_response(self, response): + return traverse_obj( + self._webpage_read_content(response, self._OAUTH_TOKEN_ENDPOINT, self._OAUTH_DISPLAY_ID, fatal=False), + ({json.loads}, 'error', {str})) + + def _set_oauth_info(self, token_response): + YoutubeBaseInfoExtractor._OAUTH_ACCESS_TOKEN_CACHE.setdefault(self._OAUTH_PROFILE, {}).update({ + 'access_token': token_response['access_token'], + 'token_type': token_response['token_type'], + 'expiry': time_seconds( + seconds=traverse_obj(token_response, ('expires_in', {float_or_none}), default=300) - 10), + }) + refresh_token = traverse_obj(token_response, ('refresh_token', {str})) + if refresh_token: + self.cache.store(self._NETRC_MACHINE, self._oauth_cache_key, refresh_token) + YoutubeBaseInfoExtractor._OAUTH_ACCESS_TOKEN_CACHE[self._OAUTH_PROFILE]['refresh_token'] = refresh_token + + def _initialize_oauth(self, user, refresh_token): + self._OAUTH_PROFILE = user or 'default' + + if self._OAUTH_PROFILE in YoutubeBaseInfoExtractor._OAUTH_ACCESS_TOKEN_CACHE: + self.write_debug(f'{self._OAUTH_DISPLAY_ID}: Using cached access token for profile "{self._OAUTH_PROFILE}"') + return + + YoutubeBaseInfoExtractor._OAUTH_ACCESS_TOKEN_CACHE[self._OAUTH_PROFILE] = {} + + if refresh_token: + msg = f'{self._OAUTH_DISPLAY_ID}: Using password input as refresh token' + if self.get_param('cachedir') is not False: + msg += ' and caching token to disk; you should supply an empty password next time' + self.to_screen(msg) + self.cache.store(self._NETRC_MACHINE, self._oauth_cache_key, refresh_token) + else: + refresh_token = self.cache.load(self._NETRC_MACHINE, self._oauth_cache_key) + + if refresh_token: + 
YoutubeBaseInfoExtractor._OAUTH_ACCESS_TOKEN_CACHE[self._OAUTH_PROFILE]['refresh_token'] = refresh_token + try: + token_response = self._refresh_token(refresh_token) + except ExtractorError as e: + error_msg = str(e.orig_msg).replace('Failed to refresh access token: ', '') + self.report_warning(f'{self._OAUTH_DISPLAY_ID}: Failed to refresh access token: {error_msg}') + token_response = self._oauth_authorize + else: + token_response = self._oauth_authorize + + self._set_oauth_info(token_response) + self.write_debug(f'{self._OAUTH_DISPLAY_ID}: Logged in using profile "{self._OAUTH_PROFILE}"') + + def _refresh_token(self, refresh_token): + try: + token_response = self._download_json( + self._OAUTH_TOKEN_ENDPOINT, + video_id=self._OAUTH_DISPLAY_ID, + note='Refreshing access token', + data=json.dumps({ + 'client_id': self._OAUTH_CLIENT_ID, + 'client_secret': self._OAUTH_CLIENT_SECRET, + 'refresh_token': refresh_token, + 'grant_type': 'refresh_token', + }).encode(), + headers={'Content-Type': 'application/json'}) + except ExtractorError as e: + if isinstance(e.cause, HTTPError): + error = self._read_oauth_error_response(e.cause.response) + if error == 'invalid_grant': + # RFC6749 § 5.2 + raise ExtractorError( + 'Failed to refresh access token: Refresh token is invalid, revoked, or expired (invalid_grant)', + expected=True, video_id=self._OAUTH_DISPLAY_ID) + raise ExtractorError( + f'Failed to refresh access token: Authorization server returned error {error}', + video_id=self._OAUTH_DISPLAY_ID) + raise + return token_response + + @property + def _oauth_authorize(self): + code_response = self._download_json( + self._OAUTH_DEVICE_AUTHORIZATION_ENDPOINT, + video_id=self._OAUTH_DISPLAY_ID, + note='Initializing authorization flow', + data=json.dumps({ + 'client_id': self._OAUTH_CLIENT_ID, + 'scope': self._OAUTH_SCOPE, + }).encode(), + headers={'Content-Type': 'application/json'}) + + verification_url = traverse_obj(code_response, ('verification_url', {str})) + user_code = 
traverse_obj(code_response, ('user_code', {str})) + if not verification_url or not user_code: + raise ExtractorError( + 'Authorization server did not provide verification_url or user_code', video_id=self._OAUTH_DISPLAY_ID) + + # note: The whitespace is intentional + self.to_screen( + f'{self._OAUTH_DISPLAY_ID}: To give yt-dlp access to your account, ' + f'go to {verification_url} and enter code {user_code}') + + # RFC8628 § 3.5: default poll interval is 5 seconds if not provided + poll_interval = traverse_obj(code_response, ('interval', {int}), default=5) + + for retry in self.RetryManager(): + while True: + try: + token_response = self._download_json( + self._OAUTH_TOKEN_ENDPOINT, + video_id=self._OAUTH_DISPLAY_ID, + note=False, + errnote='Failed to request access token', + data=json.dumps({ + 'client_id': self._OAUTH_CLIENT_ID, + 'client_secret': self._OAUTH_CLIENT_SECRET, + 'device_code': code_response['device_code'], + 'grant_type': 'urn:ietf:params:oauth:grant-type:device_code', + }).encode(), + headers={'Content-Type': 'application/json'}) + except ExtractorError as e: + if isinstance(e.cause, TransportError): + retry.error = e + break + elif isinstance(e.cause, HTTPError): + error = self._read_oauth_error_response(e.cause.response) + if not error: + retry.error = e + break + + if error == 'authorization_pending': + time.sleep(poll_interval) + continue + elif error == 'expired_token': + raise ExtractorError( + 'Authorization timed out', expected=True, video_id=self._OAUTH_DISPLAY_ID) + elif error == 'access_denied': + raise ExtractorError( + 'You denied access to an account', expected=True, video_id=self._OAUTH_DISPLAY_ID) + elif error == 'slow_down': + # RFC8628 § 3.5: add 5 seconds to the poll interval + poll_interval += 5 + time.sleep(poll_interval) + continue + else: + raise ExtractorError( + f'Authorization server returned an error when fetching access token: {error}', + video_id=self._OAUTH_DISPLAY_ID) + raise + + return token_response + + def 
_update_oauth(self): + token = YoutubeBaseInfoExtractor._OAUTH_ACCESS_TOKEN_CACHE.get(self._OAUTH_PROFILE) + if token is None or token['expiry'] > time.time(): + return + + self._set_oauth_info(self._refresh_token(token['refresh_token'])) + + @property + def _youtube_login_hint(self): + return ('Use --username=oauth[+PROFILE] --password="" to log in using oauth, ' + f'or else u{self._login_hint(method="cookies")[1:]}. ' + 'See https://github.com/yt-dlp/yt-dlp/wiki/Extractors#logging-in-with-oauth for more on how to use oauth. ' + 'See https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies for help with cookies') + def _check_login_required(self): - if self._LOGIN_REQUIRED and not self._cookies_passed: - self.raise_login_required('Login details are needed to download this content', method='cookies') + if self._LOGIN_REQUIRED and not self.is_authenticated: + self.raise_login_required( + f'Login details are needed to download this content. {self._youtube_login_hint}', method=None) _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=' _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=' @@ -563,9 +813,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return (self._configuration_arg('innertube_host', [''], ie_key=YoutubeIE.ie_key())[0] or req_api_hostname or self._get_innertube_host(default_client or 'web')) - def _extract_api_key(self, ytcfg=None, default_client='web'): - return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_API_KEY'], str, default_client) - def _extract_context(self, ytcfg=None, default_client='web'): context = get_first( (ytcfg, self._get_default_ytcfg(default_client)), 'INNERTUBE_CONTEXT', expected_type=dict) @@ -611,13 +858,15 @@ class YoutubeBaseInfoExtractor(InfoExtractor): real_headers.update({'content-type': 'application/json'}) if headers: real_headers.update(headers) - api_key = (self._configuration_arg('innertube_key', [''], ie_key=YoutubeIE.ie_key(), casesense=True)[0] 
- or api_key or self._extract_api_key(default_client=default_client)) return self._download_json( f'https://{self._select_api_hostname(api_hostname, default_client)}/youtubei/v1/{ep}', video_id=video_id, fatal=fatal, note=note, errnote=errnote, data=json.dumps(data).encode('utf8'), headers=real_headers, - query={'key': api_key, 'prettyPrint': 'false'}) + query=filter_dict({ + 'key': self._configuration_arg( + 'innertube_key', [api_key], ie_key=YoutubeIE.ie_key(), casesense=True)[0], + 'prettyPrint': 'false', + }, cndn=lambda _, v: v)) def extract_yt_initial_data(self, item_id, webpage, fatal=True): return self._search_json(self._YT_INITIAL_DATA_RE, webpage, 'yt initial data', item_id, fatal=fatal) @@ -633,49 +882,53 @@ class YoutubeBaseInfoExtractor(InfoExtractor): if session_index is not None: return session_index - # Deprecated? - def _extract_identity_token(self, ytcfg=None, webpage=None): - if ytcfg: - token = try_get(ytcfg, lambda x: x['ID_TOKEN'], str) - if token: - return token - if webpage: - return self._search_regex( - r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, - 'identity token', default=None, fatal=False) + def _data_sync_id_to_delegated_session_id(self, data_sync_id): + if not data_sync_id: + return + # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel + # and just "user_syncid||" for primary channel. 
We only want the channel_syncid + channel_syncid, _, user_syncid = data_sync_id.partition('||') + if user_syncid: + return channel_syncid - @staticmethod - def _extract_account_syncid(*args): + def _extract_account_syncid(self, *args): """ - Extract syncId required to download private playlists of secondary channels + Extract current session ID required to download private playlists of secondary channels @params response and/or ytcfg """ - for data in args: - # ytcfg includes channel_syncid if on secondary channel - delegated_sid = try_get(data, lambda x: x['DELEGATED_SESSION_ID'], str) - if delegated_sid: - return delegated_sid - sync_ids = (try_get( - data, (lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'], - lambda x: x['DATASYNC_ID']), str) or '').split('||') - if len(sync_ids) >= 2 and sync_ids[1]: - # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel - # and just "user_syncid||" for primary channel. We only want the channel_syncid - return sync_ids[0] + # ytcfg includes channel_syncid if on secondary channel + if delegated_sid := traverse_obj(args, (..., 'DELEGATED_SESSION_ID', {str}, any)): + return delegated_sid - @staticmethod - def _extract_visitor_data(*args): + data_sync_id = self._extract_data_sync_id(*args) + return self._data_sync_id_to_delegated_session_id(data_sync_id) + + def _extract_data_sync_id(self, *args): + """ + Extract current account dataSyncId. 
+ In the format DELEGATED_SESSION_ID||USER_SESSION_ID or USER_SESSION_ID|| + @params response and/or ytcfg + """ + if data_sync_id := self._configuration_arg('data_sync_id', [None], ie_key=YoutubeIE, casesense=True)[0]: + return data_sync_id + + return traverse_obj( + args, (..., ('DATASYNC_ID', ('responseContext', 'mainAppWebResponseContext', 'datasyncId')), {str}, any)) + + def _extract_visitor_data(self, *args): """ Extracts visitorData from an API response or ytcfg Appears to be used to track session state """ + if visitor_data := self._configuration_arg('visitor_data', [None], ie_key=YoutubeIE, casesense=True)[0]: + return visitor_data return get_first( args, [('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))], expected_type=str) @functools.cached_property def is_authenticated(self): - return bool(self._generate_sapisidhash_header()) + return self._OAUTH_PROFILE or bool(self._generate_sapisidhash_header()) def extract_ytcfg(self, video_id, webpage): if not webpage: @@ -685,9 +938,36 @@ class YoutubeBaseInfoExtractor(InfoExtractor): r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg', default='{}'), video_id, fatal=False) or {} + def _generate_oauth_headers(self): + self._update_oauth() + oauth_token = YoutubeBaseInfoExtractor._OAUTH_ACCESS_TOKEN_CACHE.get(self._OAUTH_PROFILE) + if not oauth_token: + return {} + + return { + 'Authorization': f'{oauth_token["token_type"]} {oauth_token["access_token"]}', + } + + def _generate_cookie_auth_headers(self, *, ytcfg=None, account_syncid=None, session_index=None, origin=None, **kwargs): + headers = {} + account_syncid = account_syncid or self._extract_account_syncid(ytcfg) + if account_syncid: + headers['X-Goog-PageId'] = account_syncid + if session_index is None: + session_index = self._extract_session_index(ytcfg) + if account_syncid or session_index is not None: + headers['X-Goog-AuthUser'] = session_index if session_index is not None else 0 + + auth = 
self._generate_sapisidhash_header(origin) + if auth is not None: + headers['Authorization'] = auth + headers['X-Origin'] = origin + + return headers + def generate_api_headers( self, *, ytcfg=None, account_syncid=None, session_index=None, - visitor_data=None, identity_token=None, api_hostname=None, default_client='web'): + visitor_data=None, api_hostname=None, default_client='web', **kwargs): origin = 'https://' + (self._select_api_hostname(api_hostname, default_client)) headers = { @@ -695,22 +975,16 @@ class YoutubeBaseInfoExtractor(InfoExtractor): self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT_CLIENT_NAME'], default_client=default_client)), 'X-YouTube-Client-Version': self._extract_client_version(ytcfg, default_client), 'Origin': origin, - 'X-Youtube-Identity-Token': identity_token or self._extract_identity_token(ytcfg), - 'X-Goog-PageId': account_syncid or self._extract_account_syncid(ytcfg), 'X-Goog-Visitor-Id': visitor_data or self._extract_visitor_data(ytcfg), 'User-Agent': self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT']['client']['userAgent'], default_client=default_client), + **self._generate_oauth_headers(), + **self._generate_cookie_auth_headers(ytcfg=ytcfg, account_syncid=account_syncid, session_index=session_index, origin=origin), } - if session_index is None: - session_index = self._extract_session_index(ytcfg) - if account_syncid or session_index is not None: - headers['X-Goog-AuthUser'] = session_index if session_index is not None else 0 - - auth = self._generate_sapisidhash_header(origin) - if auth is not None: - headers['Authorization'] = auth - headers['X-Origin'] = origin return filter_dict(headers) + def _generate_webpage_headers(self): + return self._generate_oauth_headers() + def _download_ytcfg(self, client, video_id): url = { 'web': 'https://www.youtube.com', @@ -720,7 +994,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor): if not url: return {} webpage = self._download_webpage( - url, video_id, fatal=False, 
note=f'Downloading {client.replace("_", " ").strip()} client config') + url, video_id, fatal=False, note=f'Downloading {client.replace("_", " ").strip()} client config', + headers=self._generate_webpage_headers()) return self.extract_ytcfg(video_id, webpage) or {} @staticmethod @@ -885,14 +1160,14 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return count @staticmethod - def _extract_thumbnails(data, *path_list): + def _extract_thumbnails(data, *path_list, final_key='thumbnails'): """ Extract thumbnails from thumbnails dict @param path_list: path list to level that contains 'thumbnails' key """ thumbnails = [] for path in path_list or [()]: - for thumbnail in traverse_obj(data, (*variadic(path), 'thumbnails', ...)): + for thumbnail in traverse_obj(data, (*variadic(path), final_key, ...)): thumbnail_url = url_or_none(thumbnail.get('url')) if not thumbnail_url: continue @@ -969,7 +1244,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor): ep=ep, fatal=True, headers=headers, video_id=item_id, query=query, note=note, context=self._extract_context(ytcfg, default_client), - api_key=self._extract_api_key(ytcfg, default_client), api_hostname=api_hostname, default_client=default_client) except ExtractorError as e: if not isinstance(e.cause, network_exceptions): @@ -1291,6 +1565,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '401': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'av01.0.12M.08'}, } _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt') + _DEFAULT_CLIENTS = ('ios', 'mweb') _GEO_BYPASS = False @@ -1458,6 +1733,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'heatmap': 'count:100', 'timestamp': 1401991663, }, + 'skip': 'Age-restricted; requires authentication', }, { 'note': 'Age-gate video with embed allowed in public site', @@ -1488,6 +1764,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'comment_count': int, 'channel_is_verified': True, }, + 'skip': 'Age-restricted; requires authentication', }, { 'note': 'Age-gate video 
embedable only with clientScreen=EMBED', @@ -1518,6 +1795,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': '@ProjektMelody', 'timestamp': 1577508724, }, + 'skip': 'Age-restricted; requires authentication', }, { 'note': 'Non-Agegated non-embeddable video', @@ -2289,6 +2567,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_is_verified': True, 'timestamp': 1405513526, }, + 'skip': 'Age-restricted; requires authentication', }, { # restricted location, https://github.com/ytdl-org/youtube-dl/issues/28685 @@ -2659,6 +2938,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'timestamp': 1577508724, }, 'params': {'extractor_args': {'youtube': {'player_client': ['tv_embedded']}}, 'format': '251-drc'}, + 'skip': 'Age-restricted; requires authentication', }, { 'url': 'https://www.youtube.com/live/qVv6vCqciTM', @@ -2980,7 +3260,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): code = self._download_webpage( player_url, video_id, fatal=fatal, note='Downloading player ' + player_id, - errnote=f'Download of {player_url} failed') + errnote=f'Download of {player_url} failed', + headers=self._generate_webpage_headers()) if code: self._code_cache[player_id] = code return self._code_cache.get(player_id) @@ -3125,11 +3406,42 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.write_debug(f'Decrypted nsig {s} => {ret}') return ret - def _extract_n_function_name(self, jscode): + def _extract_n_function_name(self, jscode, player_url=None): + # Examples (with placeholders nfunc, narray, idx): + # * .get("n"))&&(b=nfunc(b) + # * .get("n"))&&(b=narray[idx](b) + # * b=String.fromCharCode(110),c=a.get(b))&&c=narray[idx](c) + # * a.D&&(b="nn"[+a.D],c=a.get(b))&&(c=narray[idx](c),a.set(b,c),narray.length||nfunc("") + # * a.D&&(PL(a),b=a.j.n||null)&&(b=narray[0](b),a.set("n",b),narray.length||nfunc("") + # * a.D&&(b="nn"[+a.D],vL(a),c=a.j[b]||null)&&(c=narray[idx](c),a.set(b,c),narray.length||nfunc("") funcname, idx = self._search_regex( - 
r'\.get\("n"\)\)&&\(b=(?P<nfunc>[a-zA-Z0-9$]+)(?:\[(?P<idx>\d+)\])?\([a-zA-Z0-9]\)', - jscode, 'Initial JS player n function name', group=('nfunc', 'idx')) - if not idx: + r'''(?x) + (?: + \.get\("n"\)\)&&\(b=| + (?: + b=String\.fromCharCode\(110\)| + (?P<str_idx>[a-zA-Z0-9_$.]+)&&\(b="nn"\[\+(?P=str_idx)\] + ) + (?: + ,[a-zA-Z0-9_$]+\(a\))?,c=a\. + (?: + get\(b\)| + [a-zA-Z0-9_$]+\[b\]\|\|null + )\)&&\(c=| + \b(?P<var>[a-zA-Z0-9_$]+)= + )(?P<nfunc>[a-zA-Z0-9_$]+)(?:\[(?P<idx>\d+)\])?\([a-zA-Z]\) + (?(var),[a-zA-Z0-9_$]+\.set\("n"\,(?P=var)\),(?P=nfunc)\.length)''', + jscode, 'n function name', group=('nfunc', 'idx'), default=(None, None)) + if not funcname: + self.report_warning(join_nonempty( + 'Falling back to generic n function search', + player_url and f' player = {player_url}', delim='\n')) + return self._search_regex( + r'''(?xs) + ;\s*(?P<name>[a-zA-Z0-9_$]+)\s*=\s*function\([a-zA-Z0-9_$]+\) + \s*\{(?:(?!};).)+?["']enhanced_except_''', + jscode, 'Initial JS player n function name', group='name') + elif not idx: return funcname return json.loads(js_to_json(self._search_regex( @@ -3138,26 +3450,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_n_function_code(self, video_id, player_url): player_id = self._extract_player_info(player_url) - func_code = self.cache.load('youtube-nsig', player_id, min_ver='2022.09.1') + func_code = self.cache.load('youtube-nsig', player_id, min_ver='2024.07.09') jscode = func_code or self._load_player(video_id, player_url) jsi = JSInterpreter(jscode) if func_code: return jsi, player_id, func_code - func_name = self._extract_n_function_name(jscode) + func_name = self._extract_n_function_name(jscode, player_url=player_url) - # For redundancy - func_code = self._search_regex( - rf'''(?xs){func_name}\s*=\s*function\s*\((?P<var>[\w$]+)\)\s* - # NB: The end of the regex is intentionally kept strict - {{(?P<code>.+?}}\s*return\ [\w$]+.join\(""\))}};''', - jscode, 'nsig function', group=('var', 'code'), default=None) - if 
func_code: - func_code = ([func_code[0]], func_code[1]) - else: - self.write_debug('Extracting nsig function with jsinterp') - func_code = jsi.extract_function_code(func_name) + func_code = jsi.extract_function_code(func_name) self.cache.store('youtube-nsig', player_id, func_code) return jsi, player_id, func_code @@ -3242,7 +3544,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._download_webpage( url, video_id, f'Marking {label}watched', - 'Unable to mark watched', fatal=False) + 'Unable to mark watched', fatal=False, + headers=self._generate_webpage_headers()) @classmethod def _extract_from_webpage(cls, url, webpage): @@ -3631,6 +3934,54 @@ class YoutubeIE(YoutubeBaseInfoExtractor): **cls._get_checkok_params(), } + def _get_config_po_token(self, client): + po_token_strs = self._configuration_arg('po_token', [], ie_key=YoutubeIE, casesense=True) + for token_str in po_token_strs: + po_token_client, sep, po_token = token_str.partition('+') + if not sep: + self.report_warning( + f'Invalid po_token configuration format. Expected "client+po_token", got "{token_str}"', only_once=True) + continue + if po_token_client == client: + return po_token + + def fetch_po_token(self, client='web', visitor_data=None, data_sync_id=None, player_url=None, **kwargs): + # PO Token is bound to visitor_data / Visitor ID when logged out. Must have visitor_data for it to function. + if not visitor_data and not self.is_authenticated and player_url: + self.report_warning( + f'Unable to fetch PO Token for {client} client: Missing required Visitor Data. ' + f'You may need to pass Visitor Data with --extractor-args "youtube:visitor_data=XXX"') + return + + config_po_token = self._get_config_po_token(client) + if config_po_token: + # PO token is bound to data_sync_id / account Session ID when logged in. However, for the config po_token, + # if using first channel in an account then we don't need the data_sync_id anymore... 
+ if not data_sync_id and self.is_authenticated and player_url: + self.report_warning( + f'Got a PO Token for {client} client, but missing Data Sync ID for account. Formats may not work.' + f'You may need to pass a Data Sync ID with --extractor-args "youtube:data_sync_id=XXX"') + + return config_po_token + + # Require PO Token if logged in for external fetching + if not data_sync_id and self.is_authenticated and player_url: + self.report_warning( + f'Unable to fetch PO Token for {client} client: Missing required Data Sync ID for account. ' + f'You may need to pass a Data Sync ID with --extractor-args "youtube:data_sync_id=XXX"') + return + + return self._fetch_po_token( + client=client, + visitor_data=visitor_data, + data_sync_id=data_sync_id, + player_url=player_url, + **kwargs, + ) + + def _fetch_po_token(self, client, visitor_data=None, data_sync_id=None, player_url=None, **kwargs): + """External PO Token fetch stub""" + @staticmethod def _is_agegated(player_response): if traverse_obj(player_response, ('playabilityStatus', 'desktopLegacyAgeGateReason')): @@ -3647,22 +3998,31 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _is_unplayable(player_response): return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE' - def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, smuggled_data): - - session_index = self._extract_session_index(player_ytcfg, master_ytcfg) - syncid = self._extract_account_syncid(player_ytcfg, master_ytcfg, initial_pr) - sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False) if player_url else None + def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, visitor_data, data_sync_id, po_token): headers = self.generate_api_headers( - ytcfg=player_ytcfg, account_syncid=syncid, session_index=session_index, default_client=client) + ytcfg=player_ytcfg, + default_client=client, + 
visitor_data=visitor_data, + session_index=self._extract_session_index(master_ytcfg, player_ytcfg), + account_syncid=( + self._data_sync_id_to_delegated_session_id(data_sync_id) + or self._extract_account_syncid(master_ytcfg, initial_pr, player_ytcfg) + ), + ) yt_query = { 'videoId': video_id, } - pp_arg = self._configuration_arg('player_params', [None], casesense=True)[0] - if pp_arg: - yt_query['params'] = pp_arg + default_pp = traverse_obj( + INNERTUBE_CLIENTS, (_split_innertube_client(client)[0], 'PLAYER_PARAMS', {str})) + if player_params := self._configuration_arg('player_params', [default_pp], casesense=True)[0]: + yt_query['params'] = player_params + + if po_token: + yt_query['serviceIntegrityDimensions'] = {'poToken': po_token} + sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False) if player_url else None yt_query.update(self._generate_player_context(sts)) return self._extract_response( item_id=video_id, ep='player', query=yt_query, @@ -3673,30 +4033,35 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _get_requested_clients(self, url, smuggled_data): requested_clients = [] - android_clients = [] - default = ['ios', 'web'] + excluded_clients = [] allowed_clients = sorted( (client for client in INNERTUBE_CLIENTS if client[:1] != '_'), key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True) for client in self._configuration_arg('player_client'): if client == 'default': - requested_clients.extend(default) + requested_clients.extend(self._DEFAULT_CLIENTS) elif client == 'all': requested_clients.extend(allowed_clients) + elif client.startswith('-'): + excluded_clients.append(client[1:]) elif client not in allowed_clients: - self.report_warning(f'Skipping unsupported client {client}') - elif client.startswith('android'): - android_clients.append(client) + self.report_warning(f'Skipping unsupported client "{client}"') else: requested_clients.append(client) - # Force deprioritization of broken Android clients for 
format de-duplication - requested_clients.extend(android_clients) if not requested_clients: - requested_clients = default + requested_clients.extend(self._DEFAULT_CLIENTS) + for excluded_client in excluded_clients: + if excluded_client in requested_clients: + requested_clients.remove(excluded_client) + if not requested_clients: + raise ExtractorError('No player clients have been requested', expected=True) if smuggled_data.get('is_music_url') or self.is_music_url(url): - requested_clients.extend( - f'{client}_music' for client in requested_clients if f'{client}_music' in INNERTUBE_CLIENTS) + for requested_client in requested_clients: + _, base_client, variant = _split_innertube_client(requested_client) + music_client = f'{base_client}_music' + if variant != 'music' and music_client in INNERTUBE_CLIENTS: + requested_clients.append(music_client) return orderedSet(requested_clients) @@ -3713,6 +4078,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, 'initial player response', video_id, fatal=False) prs = [] + deprioritized_prs = [] + if initial_pr and not self._invalid_player_response(initial_pr, video_id): # Android player_response does not have microFormats which are needed for # extraction of some data. 
So we return the initial_pr with formats @@ -3734,9 +4101,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return tried_iframe_fallback = False - player_url = None + player_url = visitor_data = data_sync_id = None skipped_clients = {} while clients: + deprioritize_pr = False client, base_client, variant = _split_innertube_client(clients.pop()) player_ytcfg = master_ytcfg if client == 'web' else {} if 'configs' not in self._configuration_arg('player_skip') and client != 'web': @@ -3752,9 +4120,35 @@ class YoutubeIE(YoutubeBaseInfoExtractor): player_url = self._download_player_url(video_id) tried_iframe_fallback = True + visitor_data = visitor_data or self._extract_visitor_data(master_ytcfg, initial_pr, player_ytcfg) + data_sync_id = data_sync_id or self._extract_data_sync_id(master_ytcfg, initial_pr, player_ytcfg) + po_token = self.fetch_po_token( + client=client, visitor_data=visitor_data, + data_sync_id=data_sync_id if self.is_authenticated else None, + player_url=player_url if require_js_player else None, + ) + + require_po_token = self._get_default_ytcfg(client).get('REQUIRE_PO_TOKEN') + if not po_token and require_po_token: + self.report_warning( + f'No PO Token provided for {client} client, ' + f'which is required for working {client} formats. 
' + f'You can manually pass a PO Token for this client with ' + f'--extractor-args "youtube:po_token={client}+XXX"', + only_once=True) + deprioritize_pr = True + + pr = initial_pr if client == 'web' else None try: - pr = initial_pr if client == 'web' and initial_pr else self._extract_player_response( - client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, player_url if require_js_player else None, initial_pr, smuggled_data) + pr = pr or self._extract_player_response( + client, video_id, + master_ytcfg=player_ytcfg or master_ytcfg, + player_ytcfg=player_ytcfg, + player_url=player_url, + initial_pr=initial_pr, + visitor_data=visitor_data, + data_sync_id=data_sync_id, + po_token=po_token) except ExtractorError as e: self.report_warning(e) continue @@ -3763,21 +4157,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor): skipped_clients[client] = pr_id elif pr: # Save client name for introspection later - name = short_client_name(client) sd = traverse_obj(pr, ('streamingData', {dict})) or {} - sd[STREAMING_DATA_CLIENT_NAME] = name + sd[STREAMING_DATA_CLIENT_NAME] = client + sd[STREAMING_DATA_PO_TOKEN] = po_token for f in traverse_obj(sd, (('formats', 'adaptiveFormats'), ..., {dict})): - f[STREAMING_DATA_CLIENT_NAME] = name - prs.append(pr) - - # creator clients can bypass AGE_VERIFICATION_REQUIRED if logged in - if variant == 'embedded' and self._is_unplayable(pr) and self.is_authenticated: - append_client(f'{base_client}_creator') - elif self._is_agegated(pr): - if variant == 'tv_embedded': - append_client(f'{base_client}_embedded') - elif not variant: - append_client(f'tv_embedded.{base_client}', f'{base_client}_embedded') + f[STREAMING_DATA_CLIENT_NAME] = client + f[STREAMING_DATA_PO_TOKEN] = po_token + if deprioritize_pr: + deprioritized_prs.append(pr) + else: + prs.append(pr) + + # EU countries require age-verification for accounts to access age-restricted videos + # If account is not age-verified, _is_agegated() will be truthy for non-embedded clients + if 
self.is_authenticated and self._is_agegated(pr): + self.to_screen( + f'{video_id}: This video is age-restricted and YouTube is requiring ' + 'account age-verification; some formats may be missing', only_once=True) + # web_creator and mediaconnect can work around the age-verification requirement + # _testsuite & _vr variants can also work around age-verification + # tv_embedded may(?) still work around age-verification if the video is embeddable + append_client('web_creator', 'mediaconnect') + + prs.extend(deprioritized_prs) if skipped_clients: self.report_warning( @@ -3797,6 +4199,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, live_status, duration): CHUNK_SIZE = 10 << 20 + PREFERRED_LANG_VALUE = 10 + original_language = None itags, stream_ids = collections.defaultdict(set), [] itag_qualities, res_qualities = {}, {0: None} q = qualities([ @@ -3845,6 +4249,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): itag_qualities[itag] = quality if height: res_qualities[height] = quality + + is_default = audio_track.get('audioIsDefault') + is_descriptive = 'descriptive' in (audio_track.get('displayName') or '').lower() + language_code = audio_track.get('id', '').split('.')[0] + if language_code and is_default: + original_language = language_code + # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment # (adding `&sq=0` to the URL) and parsing emsg box to determine the # number of fragment that would subsequently requested with (`&sq=N`) @@ -3870,7 +4281,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): continue query = parse_qs(fmt_url) - throttled = False if query.get('n'): try: decrypt_nsig = self._cached(self._decrypt_nsig, 'nsig', query['n'][0]) @@ -3884,20 +4294,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): f'to workaround the issue. 
{PhantomJSwrapper.INSTALL_HINT}\n') if player_url: self.report_warning( - f'nsig extraction failed: You may experience throttling for some formats\n{phantomjs_hint}' + f'nsig extraction failed: Some formats may be missing\n{phantomjs_hint}' f' n = {query["n"][0]} ; player = {player_url}', video_id=video_id, only_once=True) self.write_debug(e, only_once=True) else: self.report_warning( - 'Cannot decrypt nsig without player_url: You may experience throttling for some formats', + 'Cannot decrypt nsig without player_url: Some formats may be missing', video_id=video_id, only_once=True) - throttled = True + continue tbr = float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) - language_preference = ( - 10 if audio_track.get('audioIsDefault') and 10 - else -10 if 'descriptive' in (audio_track.get('displayName') or '').lower() and -10 - else -1) format_duration = traverse_obj(fmt, ('approxDurationMs', {lambda x: float_or_none(x, 1000)})) # Some formats may have much smaller duration than others (possibly damaged during encoding) # E.g. 2-nOtRESiUc Ref: https://github.com/yt-dlp/yt-dlp/issues/2823 @@ -3908,14 +4314,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.report_warning( f'{video_id}: Some formats are possibly damaged. 
They will be deprioritized', only_once=True) - client_name = fmt.get(STREAMING_DATA_CLIENT_NAME) - # Android client formats are broken due to integrity check enforcement - # Ref: https://github.com/yt-dlp/yt-dlp/issues/9554 - is_broken = client_name and client_name.startswith(short_client_name('android')) + client_name = fmt[STREAMING_DATA_CLIENT_NAME] + po_token = fmt.get(STREAMING_DATA_PO_TOKEN) + + if po_token: + fmt_url = update_url_query(fmt_url, {'pot': po_token}) + + # Clients that require PO Token return videoplayback URLs that may return 403 + is_broken = (not po_token and self._get_default_ytcfg(client_name).get('REQUIRE_PO_TOKEN')) if is_broken: self.report_warning( - f'{video_id}: Android client formats are broken and may yield HTTP Error 403. ' - 'They will be deprioritized', only_once=True) + f'{video_id}: {client_name} client formats require a PO Token which was not provided. ' + 'They will be deprioritized as they may yield HTTP Error 403', only_once=True) name = fmt.get('qualityLabel') or quality.replace('audio_quality_', '') or '' fps = int_or_none(fmt.get('fps')) or 0 @@ -3924,17 +4334,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'filesize': int_or_none(fmt.get('contentLength')), 'format_id': f'{itag}{"-drc" if fmt.get("isDrc") else ""}', 'format_note': join_nonempty( - join_nonempty(audio_track.get('displayName'), - language_preference > 0 and ' (default)', delim=''), + join_nonempty(audio_track.get('displayName'), is_default and ' (default)', delim=''), name, fmt.get('isDrc') and 'DRC', try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()), try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()), - throttled and 'THROTTLED', is_damaged and 'DAMAGED', is_broken and 'BROKEN', - (self.get_param('verbose') or all_formats) and client_name, + is_damaged and 'DAMAGED', is_broken and 'BROKEN', + (self.get_param('verbose') or all_formats) and short_client_name(client_name), delim=', '), # 
Format 22 is likely to be damaged. See https://github.com/yt-dlp/yt-dlp/issues/3372 - 'source_preference': ((-10 if throttled else -5 if itag == '22' else -1) - + (100 if 'Premium' in name else 0)), + 'source_preference': (-5 if itag == '22' else -1) + (100 if 'Premium' in name else 0), 'fps': fps if fps > 1 else None, # For some formats, fps is wrongly returned as 1 'audio_channels': fmt.get('audioChannels'), 'height': height, @@ -3944,9 +4352,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'filesize_approx': filesize_from_tbr(tbr, format_duration), 'url': fmt_url, 'width': int_or_none(fmt.get('width')), - 'language': join_nonempty(audio_track.get('id', '').split('.')[0], - 'desc' if language_preference < -1 else '') or None, - 'language_preference': language_preference, + 'language': join_nonempty(language_code, 'desc' if is_descriptive else '') or None, + 'language_preference': PREFERRED_LANG_VALUE if is_default else -10 if is_descriptive else -1, # Strictly de-prioritize broken, damaged and 3gp formats 'preference': -20 if is_broken else -10 if is_damaged else -2 if itag == '17' else None, } @@ -3994,12 +4401,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor): elif skip_bad_formats and live_status == 'is_live' and needs_live_processing != 'is_live': skip_manifests.add('dash') - def process_manifest_format(f, proto, client_name, itag): + def process_manifest_format(f, proto, client_name, itag, po_token): key = (proto, f.get('language')) if not all_formats and key in itags[itag]: return False itags[itag].add(key) + if f.get('source_preference') is None: + f['source_preference'] = -1 + + # Clients that require PO Token return videoplayback URLs that may return 403 + # hls does not currently require PO Token + if (not po_token and self._get_default_ytcfg(client_name).get('REQUIRE_PO_TOKEN')) and proto != 'hls': + self.report_warning( + f'{video_id}: {client_name} client {proto} formats require a PO Token which was not provided. 
' + 'They will be deprioritized as they may yield HTTP Error 403', only_once=True) + f['format_note'] = join_nonempty(f.get('format_note'), 'BROKEN', delim=' ') + f['source_preference'] -= 20 + if itag and all_formats: f['format_id'] = f'{itag}-{proto}' elif any(p != proto for p, _ in itags[itag]): @@ -4007,8 +4426,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): elif itag: f['format_id'] = itag - if f.get('source_preference') is None: - f['source_preference'] = -1 + if original_language and f.get('language') == original_language: + f['format_note'] = join_nonempty(f.get('format_note'), '(default)', delim=' ') + f['language_preference'] = PREFERRED_LANG_VALUE if itag in ('616', '235'): f['format_note'] = join_nonempty(f.get('format_note'), 'Premium', delim=' ') @@ -4018,7 +4438,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if f['quality'] == -1 and f.get('height'): f['quality'] = q(res_qualities[min(res_qualities, key=lambda x: abs(x - f['height']))]) if self.get_param('verbose') or all_formats: - f['format_note'] = join_nonempty(f.get('format_note'), client_name, delim=', ') + f['format_note'] = join_nonempty( + f.get('format_note'), short_client_name(client_name), delim=', ') if f.get('fps') and f['fps'] <= 1: del f['fps'] @@ -4029,24 +4450,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor): subtitles = {} for sd in streaming_data: - client_name = sd.get(STREAMING_DATA_CLIENT_NAME) - + client_name = sd[STREAMING_DATA_CLIENT_NAME] + po_token = sd.get(STREAMING_DATA_PO_TOKEN) hls_manifest_url = 'hls' not in skip_manifests and sd.get('hlsManifestUrl') if hls_manifest_url: + if po_token: + hls_manifest_url = hls_manifest_url.rstrip('/') + f'/pot/{po_token}' fmts, subs = self._extract_m3u8_formats_and_subtitles( hls_manifest_url, video_id, 'mp4', fatal=False, live=live_status == 'is_live') subtitles = self._merge_subtitles(subs, subtitles) for f in fmts: if process_manifest_format(f, 'hls', client_name, self._search_regex( - r'/itag/(\d+)', f['url'], 'itag', 
default=None)): + r'/itag/(\d+)', f['url'], 'itag', default=None), po_token): yield f dash_manifest_url = 'dash' not in skip_manifests and sd.get('dashManifestUrl') if dash_manifest_url: + if po_token: + dash_manifest_url = dash_manifest_url.rstrip('/') + f'/pot/{po_token}' formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False) subtitles = self._merge_subtitles(subs, subtitles) # Prioritize HLS subs over DASH for f in formats: - if process_manifest_format(f, 'dash', client_name, f['format_id']): + if process_manifest_format(f, 'dash', client_name, f['format_id'], po_token): f['filesize'] = int_or_none(self._search_regex( r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None)) if needs_live_processing: @@ -4101,7 +4526,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if pp: query['pp'] = pp webpage = self._download_webpage( - webpage_url, video_id, fatal=False, query=query) + webpage_url, video_id, fatal=False, query=query, headers=self._generate_webpage_headers()) master_ytcfg = self.extract_ytcfg(video_id, webpage) or self._get_default_ytcfg() @@ -4351,7 +4776,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'playable_in_embed': get_first(playability_statuses, 'playableInEmbed'), 'live_status': live_status, 'release_timestamp': live_start_time, - '_format_sort_fields': ( # source_preference is lower for throttled/potentially damaged formats + '_format_sort_fields': ( # source_preference is lower for potentially damaged formats 'quality', 'res', 'fps', 'hdr:12', 'source', 'vcodec:vp9.2', 'channels', 'acodec', 'lang', 'proto'), } @@ -4480,11 +4905,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): headers=self.generate_api_headers(ytcfg=master_ytcfg), note='Downloading initial data API JSON') + COMMENTS_SECTION_IDS = ('comment-item-section', 'engagement-panel-comments-section') info['comment_count'] = traverse_obj(initial_data, ( 'contents', 'twoColumnWatchNextResults', 'results', 'results', 
'contents', ..., 'itemSectionRenderer', 'contents', ..., 'commentsEntryPointHeaderRenderer', 'commentCount', ), ( - 'engagementPanels', lambda _, v: v['engagementPanelSectionListRenderer']['panelIdentifier'] == 'comment-item-section', + 'engagementPanels', lambda _, v: v['engagementPanelSectionListRenderer']['panelIdentifier'] in COMMENTS_SECTION_IDS, 'engagementPanelSectionListRenderer', 'header', 'engagementPanelTitleHeaderRenderer', 'contextualInfo', ), expected_type=self._get_count, get_all=False) @@ -4868,7 +5294,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): def _rich_entries(self, rich_grid_renderer): renderer = traverse_obj( rich_grid_renderer, - ('content', ('videoRenderer', 'reelItemRenderer', 'playlistRenderer')), get_all=False) or {} + ('content', ('videoRenderer', 'reelItemRenderer', 'playlistRenderer', 'shortsLockupViewModel'), any)) or {} video_id = renderer.get('videoId') if video_id: yield self._extract_video(renderer) @@ -4880,6 +5306,21 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): ie=YoutubeTabIE.ie_key(), video_id=playlist_id, video_title=self._get_text(renderer, 'title')) return + # shortsLockupViewModel extraction + entity_id = renderer.get('entityId') + if entity_id: + video_id = traverse_obj(renderer, ('onTap', 'innertubeCommand', 'reelWatchEndpoint', 'videoId', {str})) + if not video_id: + return + yield self.url_result( + f'https://www.youtube.com/shorts/{video_id}', + ie=YoutubeIE, video_id=video_id, + **traverse_obj(renderer, ('overlayMetadata', { + 'title': ('primaryText', 'content', {str}), + 'view_count': ('secondaryText', 'content', {parse_count}), + })), + thumbnails=self._extract_thumbnails(renderer, 'thumbnail', final_key='sources')) + return def _video_entry(self, video_renderer): video_id = video_renderer.get('videoId') @@ -5124,6 +5565,10 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): else: metadata_renderer = traverse_obj(data, ('metadata', 'playlistMetadataRenderer'), 
expected_type=dict) + # pageHeaderViewModel slow rollout began April 2024 + page_header_view_model = traverse_obj(data, ( + 'header', 'pageHeaderRenderer', 'content', 'pageHeaderViewModel', {dict})) + # We can get the uncropped banner/avatar by replacing the crop params with '=s0' # See: https://github.com/yt-dlp/yt-dlp/issues/2237#issuecomment-1013694714 def _get_uncropped(url): @@ -5139,8 +5584,10 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): 'preference': 1, }) - channel_banners = self._extract_thumbnails( - data, ('header', ..., ('banner', 'mobileBanner', 'tvBanner'))) + channel_banners = ( + self._extract_thumbnails(data, ('header', ..., ('banner', 'mobileBanner', 'tvBanner'))) + or self._extract_thumbnails( + page_header_view_model, ('banner', 'imageBannerViewModel', 'image'), final_key='sources')) for banner in channel_banners: banner['preference'] = -10 @@ -5167,7 +5614,11 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): or self._get_text(data, ('header', 'hashtagHeaderRenderer', 'hashtag')) or info['id']), 'availability': self._extract_availability(data), - 'channel_follower_count': self._get_count(data, ('header', ..., 'subscriberCountText')), + 'channel_follower_count': ( + self._get_count(data, ('header', ..., 'subscriberCountText')) + or traverse_obj(page_header_view_model, ( + 'metadata', 'contentMetadataViewModel', 'metadataRows', ..., 'metadataParts', + lambda _, v: 'subscribers' in v['text']['content'], 'text', 'content', {parse_count}, any))), 'description': try_get(metadata_renderer, lambda x: x.get('description', '')), 'tags': (traverse_obj(data, ('microformat', 'microformatDataRenderer', 'tags', ..., {str})) or traverse_obj(metadata_renderer, ('keywords', {lambda x: x and shlex.split(x)}, ...))), @@ -5363,7 +5814,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): webpage, data = None, None for retry in self.RetryManager(fatal=fatal): try: - webpage = self._download_webpage(url, item_id, 
note='Downloading webpage') + webpage = self._download_webpage(url, item_id, note='Downloading webpage', headers=self._generate_webpage_headers()) data = self.extract_yt_initial_data(item_id, webpage or '', fatal=fatal) or {} except ExtractorError as e: if isinstance(e.cause, network_exceptions): @@ -6737,7 +7188,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): raise ExtractorError('Unable to recognize tab page') -class YoutubePlaylistIE(InfoExtractor): +class YoutubePlaylistIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube playlists' _VALID_URL = r'''(?x)(?: (?:https?://)? @@ -6851,7 +7302,7 @@ class YoutubePlaylistIE(InfoExtractor): return self.url_result(url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id) -class YoutubeYtBeIE(InfoExtractor): +class YoutubeYtBeIE(YoutubeBaseInfoExtractor): IE_DESC = 'youtu.be' _VALID_URL = rf'https?://youtu\.be/(?P<id>[0-9A-Za-z_-]{{11}})/*?.*?\blist=(?P<playlist_id>{YoutubeBaseInfoExtractor._PLAYLIST_ID_RE})' _TESTS = [{ @@ -6902,7 +7353,7 @@ class YoutubeYtBeIE(InfoExtractor): }), ie=YoutubeTabIE.ie_key(), video_id=playlist_id) -class YoutubeLivestreamEmbedIE(InfoExtractor): +class YoutubeLivestreamEmbedIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube livestream embeds' _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/embed/live_stream/?\?(?:[^#]+&)?channel=(?P<id>[^&#]+)' _TESTS = [{ @@ -6917,7 +7368,7 @@ class YoutubeLivestreamEmbedIE(InfoExtractor): ie=YoutubeTabIE.ie_key(), video_id=channel_id) -class YoutubeYtUserIE(InfoExtractor): +class YoutubeYtUserIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube user videos; "ytuser:" prefix' IE_NAME = 'youtube:user' _VALID_URL = r'ytuser:(?P<id>.+)' @@ -7204,7 +7655,7 @@ class YoutubeMusicSearchURLIE(YoutubeTabBaseInfoExtractor): return self.playlist_result(self._search_results(query, params, default_client='web_music'), title, title) -class YoutubeFeedsInfoExtractor(InfoExtractor): +class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): """ Base class for feed extractors 
Subclasses must re-define the _FEED_NAME property. @@ -7212,9 +7663,6 @@ class YoutubeFeedsInfoExtractor(InfoExtractor): _LOGIN_REQUIRED = True _FEED_NAME = 'feeds' - def _real_initialize(self): - YoutubeBaseInfoExtractor._check_login_required(self) - @classproperty def IE_NAME(cls): return f'youtube:{cls._FEED_NAME}' @@ -7224,7 +7672,7 @@ class YoutubeFeedsInfoExtractor(InfoExtractor): f'https://www.youtube.com/feed/{self._FEED_NAME}', ie=YoutubeTabIE.ie_key()) -class YoutubeWatchLaterIE(InfoExtractor): +class YoutubeWatchLaterIE(YoutubeBaseInfoExtractor): IE_NAME = 'youtube:watchlater' IE_DESC = 'Youtube watch later list; ":ytwatchlater" keyword (requires cookies)' _VALID_URL = r':ytwatchlater' @@ -7278,7 +7726,7 @@ class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): }] -class YoutubeShortsAudioPivotIE(InfoExtractor): +class YoutubeShortsAudioPivotIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube Shorts audio pivot (Shorts using audio of a given video)' IE_NAME = 'youtube:shorts:pivot:audio' _VALID_URL = r'https?://(?:www\.)?youtube\.com/source/(?P<id>[\w-]{11})/shorts' @@ -7302,7 +7750,7 @@ class YoutubeShortsAudioPivotIE(InfoExtractor): ie=YoutubeTabIE) -class YoutubeTruncatedURLIE(InfoExtractor): +class YoutubeTruncatedURLIE(YoutubeBaseInfoExtractor): IE_NAME = 'youtube:truncated_url' IE_DESC = False # Do not list _VALID_URL = r'''(?x) @@ -7345,9 +7793,9 @@ class YoutubeTruncatedURLIE(InfoExtractor): raise ExtractorError( 'Did you forget to quote the URL? 
Remember that & is a meta ' 'character in most shells, so you want to put the URL in quotes, ' - 'like youtube-dl ' + 'like yt-dlp ' '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" ' - ' or simply youtube-dl BaW_jenozKc .', + ' or simply yt-dlp BaW_jenozKc .', expected=True) @@ -7409,6 +7857,8 @@ class YoutubeClipIE(YoutubeTabBaseInfoExtractor): 'id': clip_id, 'section_start': int(clip_data['startTimeMs']) / 1000, 'section_end': int(clip_data['endTimeMs']) / 1000, + '_format_sort_fields': ( # https protocol is prioritized for ffmpeg compatibility + 'proto:https', 'quality', 'res', 'fps', 'hdr:12', 'source', 'vcodec:vp9.2', 'channels', 'acodec', 'lang'), } @@ -7459,7 +7909,7 @@ class YoutubeConsentRedirectIE(YoutubeBaseInfoExtractor): return self.url_result(redirect_url) -class YoutubeTruncatedIDIE(InfoExtractor): +class YoutubeTruncatedIDIE(YoutubeBaseInfoExtractor): IE_NAME = 'youtube:truncated_id' IE_DESC = False # Do not list _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$' diff --git a/yt_dlp/extractor/zaiko.py b/yt_dlp/extractor/zaiko.py index c8c4ec0b87..4563b7ba07 100644 --- a/yt_dlp/extractor/zaiko.py +++ b/yt_dlp/extractor/zaiko.py @@ -66,7 +66,9 @@ class ZaikoIE(ZaikoBaseIE): stream_meta['stream-access']['video_source'], video_id, 'Downloading player page', headers={'referer': 'https://zaiko.io/'}) player_meta = self._parse_vue_element_attr('player', player_page, video_id) - status = traverse_obj(player_meta, ('initial_event_info', 'status', {str})) + initial_event_info = traverse_obj(player_meta, ('initial_event_info', {dict})) or {} + + status = traverse_obj(initial_event_info, ('status', {str})) live_status, msg, expected = { 'vod': ('was_live', 'No VOD stream URL was found', False), 'archiving': ('post_live', 'Event VOD is still being processed', True), @@ -80,14 +82,20 @@ class ZaikoIE(ZaikoBaseIE): 'cancelled': ('not_live', 'Event has been cancelled', True), }.get(status) or ('not_live', f'Unknown 
event status "{status}"', False) - stream_url = traverse_obj(player_meta, ('initial_event_info', 'endpoint', {url_or_none})) + if traverse_obj(initial_event_info, ('is_jwt_protected', {bool})): + stream_url = self._download_json( + initial_event_info['jwt_token_url'], video_id, 'Downloading JWT-protected stream URL', + 'Failed to download JWT-protected stream URL')['playback_url'] + else: + stream_url = traverse_obj(initial_event_info, ('endpoint', {url_or_none})) + formats = self._extract_m3u8_formats( stream_url, video_id, live=True, fatal=False) if stream_url else [] if not formats: self.raise_no_formats(msg, expected=expected) thumbnail_urls = [ - traverse_obj(player_meta, ('initial_event_info', 'poster_url')), + traverse_obj(initial_event_info, ('poster_url', {url_or_none})), self._og_search_thumbnail(self._download_webpage( f'https://zaiko.io/event/{video_id}', video_id, 'Downloading event page', fatal=False) or ''), ] @@ -103,9 +111,7 @@ class ZaikoIE(ZaikoBaseIE): 'release_timestamp': ('stream', 'start', 'timestamp', {int_or_none}), 'categories': ('event', 'genres', ..., {lambda x: x or None}), }), - **traverse_obj(player_meta, ('initial_event_info', { - 'alt_title': ('title', {str}), - })), + 'alt_title': traverse_obj(initial_event_info, ('title', {str})), 'thumbnails': [{'url': url, 'id': url_basename(url)} for url in thumbnail_urls if url_or_none(url)], } diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index 5c82de19ea..ba059babbd 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -636,6 +636,8 @@ class JSInterpreter: raise self.Exception(f'{member} {msg}', expr) def eval_method(): + nonlocal member + if (variable, member) == ('console', 'debug'): if Debugger.ENABLED: Debugger.write(self.interpret_expression(f'[{arg_str}]', local_vars, allow_recursion)) @@ -644,6 +646,7 @@ class JSInterpreter: types = { 'String': str, 'Math': float, + 'Array': list, } obj = local_vars.get(variable, types.get(variable, NO_DEFAULT)) if obj is NO_DEFAULT: @@ 
-667,12 +670,27 @@ class JSInterpreter: self.interpret_expression(v, local_vars, allow_recursion) for v in self._separate(arg_str)] - if obj == str: + # Fixup prototype call + if isinstance(obj, type) and member.startswith('prototype.'): + new_member, _, func_prototype = member.partition('.')[2].partition('.') + assertion(argvals, 'takes one or more arguments') + assertion(isinstance(argvals[0], obj), f'needs binding to type {obj}') + if func_prototype == 'call': + obj, *argvals = argvals + elif func_prototype == 'apply': + assertion(len(argvals) == 2, 'takes two arguments') + obj, argvals = argvals + assertion(isinstance(argvals, list), 'second argument needs to be a list') + else: + raise self.Exception(f'Unsupported Function method {func_prototype}', expr) + member = new_member + + if obj is str: if member == 'fromCharCode': assertion(argvals, 'takes one or more arguments') return ''.join(map(chr, argvals)) raise self.Exception(f'Unsupported String method {member}', expr) - elif obj == float: + elif obj is float: if member == 'pow': assertion(len(argvals) == 2, 'takes two arguments') return argvals[0] ** argvals[1] @@ -691,9 +709,9 @@ class JSInterpreter: obj.reverse() return obj elif member == 'slice': - assertion(isinstance(obj, list), 'must be applied on a list') - assertion(len(argvals) == 1, 'takes exactly one argument') - return obj[argvals[0]:] + assertion(isinstance(obj, (list, str)), 'must be applied on a list or string') + assertion(len(argvals) <= 2, 'takes between 0 and 2 arguments') + return obj[slice(*argvals, None)] elif member == 'splice': assertion(isinstance(obj, list), 'must be applied on a list') assertion(argvals, 'takes one or more arguments') diff --git a/yt_dlp/networking/_curlcffi.py b/yt_dlp/networking/_curlcffi.py index b1f0fb82e8..0643348e7e 100644 --- a/yt_dlp/networking/_curlcffi.py +++ b/yt_dlp/networking/_curlcffi.py @@ -2,6 +2,7 @@ from __future__ import annotations import io import math +import re import urllib.parse from 
._helper import InstanceStoreMixin, select_proxy @@ -27,11 +28,12 @@ from ..utils import int_or_none if curl_cffi is None: raise ImportError('curl_cffi is not installed') -curl_cffi_version = tuple(int_or_none(x, default=0) for x in curl_cffi.__version__.split('.')) -if curl_cffi_version != (0, 5, 10): +curl_cffi_version = tuple(map(int, re.split(r'[^\d]+', curl_cffi.__version__)[:3])) + +if curl_cffi_version != (0, 5, 10) and not ((0, 7, 0) <= curl_cffi_version < (0, 7, 2)): curl_cffi._yt_dlp__version = f'{curl_cffi.__version__} (unsupported)' - raise ImportError('Only curl_cffi 0.5.10 is supported') + raise ImportError('Only curl_cffi versions 0.5.10, 0.7.0 and 0.7.1 are supported') import curl_cffi.requests from curl_cffi.const import CurlECode, CurlOpt @@ -110,6 +112,13 @@ class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin): _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY) _SUPPORTED_PROXY_SCHEMES = ('http', 'https', 'socks4', 'socks4a', 'socks5', 'socks5h') _SUPPORTED_IMPERSONATE_TARGET_MAP = { + **({ + ImpersonateTarget('chrome', '124', 'macos', '14'): curl_cffi.requests.BrowserType.chrome124, + ImpersonateTarget('chrome', '123', 'macos', '14'): curl_cffi.requests.BrowserType.chrome123, + ImpersonateTarget('chrome', '120', 'macos', '14'): curl_cffi.requests.BrowserType.chrome120, + ImpersonateTarget('chrome', '119', 'macos', '14'): curl_cffi.requests.BrowserType.chrome119, + ImpersonateTarget('chrome', '116', 'windows', '10'): curl_cffi.requests.BrowserType.chrome116, + } if curl_cffi_version >= (0, 7, 0) else {}), ImpersonateTarget('chrome', '110', 'windows', '10'): curl_cffi.requests.BrowserType.chrome110, ImpersonateTarget('chrome', '107', 'windows', '10'): curl_cffi.requests.BrowserType.chrome107, ImpersonateTarget('chrome', '104', 'windows', '10'): curl_cffi.requests.BrowserType.chrome104, @@ -118,9 +127,15 @@ class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin): ImpersonateTarget('chrome', '99', 'windows', '10'): 
curl_cffi.requests.BrowserType.chrome99, ImpersonateTarget('edge', '101', 'windows', '10'): curl_cffi.requests.BrowserType.edge101, ImpersonateTarget('edge', '99', 'windows', '10'): curl_cffi.requests.BrowserType.edge99, + **({ + ImpersonateTarget('safari', '17.0', 'macos', '14'): curl_cffi.requests.BrowserType.safari17_0, + } if curl_cffi_version >= (0, 7, 0) else {}), ImpersonateTarget('safari', '15.5', 'macos', '12'): curl_cffi.requests.BrowserType.safari15_5, ImpersonateTarget('safari', '15.3', 'macos', '11'): curl_cffi.requests.BrowserType.safari15_3, ImpersonateTarget('chrome', '99', 'android', '12'): curl_cffi.requests.BrowserType.chrome99_android, + **({ + ImpersonateTarget('safari', '17.2', 'ios', '17.2'): curl_cffi.requests.BrowserType.safari17_2_ios, + } if curl_cffi_version >= (0, 7, 0) else {}), } def _create_instance(self, cookiejar=None): @@ -131,6 +146,9 @@ class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin): extensions.pop('impersonate', None) extensions.pop('cookiejar', None) extensions.pop('timeout', None) + # CurlCFFIRH ignores legacy ssl options currently. + # Impersonation generally uses a looser SSL configuration than urllib/requests. + extensions.pop('legacy_ssl', None) def send(self, request: Request) -> Response: target = self._get_request_target(request) @@ -187,7 +205,7 @@ class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin): timeout = self._calculate_timeout(request) # set CURLOPT_LOW_SPEED_LIMIT and CURLOPT_LOW_SPEED_TIME to act as a read timeout. [1] - # curl_cffi does not currently do this. [2] + # This is required only for 0.5.10 [2] # Note: CURLOPT_LOW_SPEED_TIME is in seconds, so we need to round up to the nearest second. 
[3] # [1] https://unix.stackexchange.com/a/305311 # [2] https://github.com/yifeikong/curl_cffi/issues/156 @@ -203,7 +221,7 @@ class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin): data=request.data, verify=self.verify, max_redirects=5, - timeout=timeout, + timeout=(timeout, timeout), impersonate=self._SUPPORTED_IMPERSONATE_TARGET_MAP.get( self._get_request_target(request)), interface=self.source_address, @@ -222,7 +240,7 @@ class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin): elif ( e.code == CurlECode.PROXY - or (e.code == CurlECode.RECV_ERROR and 'Received HTTP code 407 from proxy after CONNECT' in str(e)) + or (e.code == CurlECode.RECV_ERROR and 'CONNECT' in str(e)) ): raise ProxyError(cause=e) from e else: diff --git a/yt_dlp/networking/_helper.py b/yt_dlp/networking/_helper.py index fe3354ea29..b86d3606d8 100644 --- a/yt_dlp/networking/_helper.py +++ b/yt_dlp/networking/_helper.py @@ -10,7 +10,7 @@ import typing import urllib.parse import urllib.request -from .exceptions import RequestError, UnsupportedRequest +from .exceptions import RequestError from ..dependencies import certifi from ..socks import ProxyType, sockssocket from ..utils import format_field, traverse_obj @@ -206,7 +206,7 @@ def wrap_request_errors(func): def wrapper(self, *args, **kwargs): try: return func(self, *args, **kwargs) - except UnsupportedRequest as e: + except RequestError as e: if e.handler is None: e.handler = self raise diff --git a/yt_dlp/networking/_requests.py b/yt_dlp/networking/_requests.py index c69c54b3a0..7de95ab3bf 100644 --- a/yt_dlp/networking/_requests.py +++ b/yt_dlp/networking/_requests.py @@ -230,9 +230,7 @@ class Urllib3LoggingFilter(logging.Filter): def filter(self, record): # Ignore HTTP request messages since HTTPConnection prints those - if record.msg == '%s://%s:%s "%s %s %s" %s %s': - return False - return True + return record.msg != '%s://%s:%s "%s %s %s" %s %s' class Urllib3LoggingHandler(logging.Handler): @@ -297,11 +295,12 @@ class 
RequestsRH(RequestHandler, InstanceStoreMixin): super()._check_extensions(extensions) extensions.pop('cookiejar', None) extensions.pop('timeout', None) + extensions.pop('legacy_ssl', None) - def _create_instance(self, cookiejar): + def _create_instance(self, cookiejar, legacy_ssl_support=None): session = RequestsSession() http_adapter = RequestsHTTPAdapter( - ssl_context=self._make_sslcontext(), + ssl_context=self._make_sslcontext(legacy_ssl_support=legacy_ssl_support), source_address=self.source_address, max_retries=urllib3.util.retry.Retry(False), ) @@ -320,7 +319,10 @@ class RequestsRH(RequestHandler, InstanceStoreMixin): max_redirects_exceeded = False - session = self._get_instance(cookiejar=self._get_cookiejar(request)) + session = self._get_instance( + cookiejar=self._get_cookiejar(request), + legacy_ssl_support=request.extensions.get('legacy_ssl'), + ) try: requests_res = session.request( diff --git a/yt_dlp/networking/_urllib.py b/yt_dlp/networking/_urllib.py index 62995823bf..510bb2a691 100644 --- a/yt_dlp/networking/_urllib.py +++ b/yt_dlp/networking/_urllib.py @@ -348,14 +348,15 @@ class UrllibRH(RequestHandler, InstanceStoreMixin): super()._check_extensions(extensions) extensions.pop('cookiejar', None) extensions.pop('timeout', None) + extensions.pop('legacy_ssl', None) - def _create_instance(self, proxies, cookiejar): + def _create_instance(self, proxies, cookiejar, legacy_ssl_support=None): opener = urllib.request.OpenerDirector() handlers = [ ProxyHandler(proxies), HTTPHandler( debuglevel=int(bool(self.verbose)), - context=self._make_sslcontext(), + context=self._make_sslcontext(legacy_ssl_support=legacy_ssl_support), source_address=self.source_address), HTTPCookieProcessor(cookiejar), DataHandler(), @@ -391,6 +392,7 @@ class UrllibRH(RequestHandler, InstanceStoreMixin): opener = self._get_instance( proxies=self._get_proxies(request), cookiejar=self._get_cookiejar(request), + legacy_ssl_support=request.extensions.get('legacy_ssl'), ) try: res = 
opener.open(urllib_req, timeout=self._calculate_timeout(request)) diff --git a/yt_dlp/networking/_websockets.py b/yt_dlp/networking/_websockets.py index 2153080a34..ec55567dae 100644 --- a/yt_dlp/networking/_websockets.py +++ b/yt_dlp/networking/_websockets.py @@ -1,6 +1,7 @@ from __future__ import annotations import contextlib +import functools import io import logging import ssl @@ -22,7 +23,6 @@ from .exceptions import ( TransportError, ) from .websocket import WebSocketRequestHandler, WebSocketResponse -from ..compat import functools from ..dependencies import websockets from ..socks import ProxyError as SocksProxyError from ..utils import int_or_none @@ -33,8 +33,8 @@ if not websockets: import websockets.version websockets_version = tuple(map(int_or_none, websockets.version.version.split('.'))) -if websockets_version < (12, 0): - raise ImportError('Only websockets>=12.0 is supported') +if websockets_version < (13, 0): + raise ImportError('Only websockets>=13.0 is supported') import websockets.sync.client from websockets.uri import parse_uri @@ -47,10 +47,7 @@ from websockets.uri import parse_uri # 2: "AttributeError: 'ClientConnection' object has no attribute 'recv_events_exc'. Did you mean: 'recv_events'?" 
import websockets.sync.connection # isort: split with contextlib.suppress(Exception): - # > 12.0 websockets.sync.connection.Connection.recv_exc = None - # 12.0 - websockets.sync.connection.Connection.recv_events_exc = None class WebsocketsResponseAdapter(WebSocketResponse): @@ -118,6 +115,7 @@ class WebsocketsRH(WebSocketRequestHandler): super()._check_extensions(extensions) extensions.pop('timeout', None) extensions.pop('cookiejar', None) + extensions.pop('legacy_ssl', None) def close(self): # Remove the logging handler that contains a reference to our logger @@ -154,13 +152,14 @@ class WebsocketsRH(WebSocketRequestHandler): address=(wsuri.host, wsuri.port), **create_conn_kwargs, ) + ssl_ctx = self._make_sslcontext(legacy_ssl_support=request.extensions.get('legacy_ssl')) conn = websockets.sync.client.connect( sock=sock, uri=request.url, additional_headers=headers, open_timeout=timeout, user_agent_header=None, - ssl_context=self._make_sslcontext() if wsuri.secure else None, + ssl=ssl_ctx if wsuri.secure else None, close_timeout=0, # not ideal, but prevents yt-dlp hanging ) return WebsocketsResponseAdapter(conn, url=request.url) diff --git a/yt_dlp/networking/common.py b/yt_dlp/networking/common.py index a6db167158..e8951c7e7d 100644 --- a/yt_dlp/networking/common.py +++ b/yt_dlp/networking/common.py @@ -205,6 +205,7 @@ class RequestHandler(abc.ABC): The following extensions are defined for RequestHandler: - `cookiejar`: Cookiejar to use for this request. - `timeout`: socket timeout to use for this request. + - `legacy_ssl`: Enable legacy SSL options for this request. See legacy_ssl_support. 
To enable these, add extensions.pop('<extension>', None) to _check_extensions Apart from the url protocol, proxies dict may contain the following keys: @@ -247,10 +248,10 @@ class RequestHandler(abc.ABC): self.legacy_ssl_support = legacy_ssl_support super().__init__() - def _make_sslcontext(self): + def _make_sslcontext(self, legacy_ssl_support=None): return make_ssl_context( verify=self.verify, - legacy_support=self.legacy_ssl_support, + legacy_support=legacy_ssl_support if legacy_ssl_support is not None else self.legacy_ssl_support, use_certifi=not self.prefer_system_certs, **self._client_cert, ) @@ -262,7 +263,8 @@ class RequestHandler(abc.ABC): return float(request.extensions.get('timeout') or self.timeout) def _get_cookiejar(self, request): - return request.extensions.get('cookiejar') or self.cookiejar + cookiejar = request.extensions.get('cookiejar') + return self.cookiejar if cookiejar is None else cookiejar def _get_proxies(self, request): return (request.proxies or self.proxies).copy() @@ -314,6 +316,7 @@ class RequestHandler(abc.ABC): """Check extensions for unsupported extensions. Subclasses should extend this.""" assert isinstance(extensions.get('cookiejar'), (YoutubeDLCookieJar, NoneType)) assert isinstance(extensions.get('timeout'), (float, int, NoneType)) + assert isinstance(extensions.get('legacy_ssl'), (bool, NoneType)) def _validate(self, request): self._check_url_scheme(request) diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 3d4c076610..c4d2a72743 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -408,6 +408,14 @@ def create_parser(): help=( 'Location of the main configuration file; either the path to the config or its containing directory ' '("-" for stdin). Can be used multiple times and inside other configuration files')) + general.add_option( + '--plugin-dirs', + dest='plugin_dirs', metavar='PATH', action='append', + help=( + 'Path to an additional directory to search for plugins. 
' + 'This option can be used multiple times to add multiple directories. ' + 'Note that this currently only works for extractor plugins; ' + 'postprocessor plugins can only be loaded from the default plugin directories')) general.add_option( '--flat-playlist', action='store_const', dest='extract_flat', const='in_playlist', default=False, @@ -462,6 +470,7 @@ def create_parser(): 'the STREAM (stdout or stderr) to apply the setting to. ' 'Can be one of "always", "auto" (default), "never", or ' '"no_color" (use non color terminal sequences). ' + 'Use "auto-tty" or "no_color-tty" to decide based on terminal support only. ' 'Can be used multiple times')) general.add_option( '--compat-options', @@ -474,10 +483,10 @@ def create_parser(): 'no-attach-info-json', 'embed-thumbnail-atomicparsley', 'no-external-downloader-progress', 'embed-metadata', 'seperate-video-versions', 'no-clean-infojson', 'no-keep-subs', 'no-certifi', 'no-youtube-channel-redirect', 'no-youtube-unavailable-videos', 'no-youtube-prefer-utc-upload-date', - 'prefer-legacy-http-handler', 'manifest-filesize-approx', + 'prefer-legacy-http-handler', 'manifest-filesize-approx', 'allow-unsafe-ext', }, 'aliases': { - 'youtube-dl': ['all', '-multistreams', '-playlist-match-filter', '-manifest-filesize-approx'], - 'youtube-dlc': ['all', '-no-youtube-channel-redirect', '-no-live-chat', '-playlist-match-filter', '-manifest-filesize-approx'], + 'youtube-dl': ['all', '-multistreams', '-playlist-match-filter', '-manifest-filesize-approx', '-allow-unsafe-ext'], + 'youtube-dlc': ['all', '-no-youtube-channel-redirect', '-no-live-chat', '-playlist-match-filter', '-manifest-filesize-approx', '-allow-unsafe-ext'], '2021': ['2022', 'no-certifi', 'filename-sanitization'], '2022': ['2023', 'no-external-downloader-progress', 'playlist-match-filter', 'prefer-legacy-http-handler', 'manifest-filesize-approx'], '2023': [], @@ -622,13 +631,13 @@ def create_parser(): metavar='DATE', dest='datebefore', default=None, help=( 'Download only 
videos uploaded on or before this date. ' - 'The date formats accepted is the same as --date')) + 'The date formats accepted are the same as --date')) selection.add_option( '--dateafter', metavar='DATE', dest='dateafter', default=None, help=( 'Download only videos uploaded on or after this date. ' - 'The date formats accepted is the same as --date')) + 'The date formats accepted are the same as --date')) selection.add_option( '--min-views', metavar='COUNT', dest='min_views', default=None, type=int, @@ -646,16 +655,16 @@ def create_parser(): 'You can also simply specify a field to match if the field is present, ' 'use "!field" to check if the field is not present, and "&" to check multiple conditions. ' 'Use a "\\" to escape "&" or quotes if needed. If used multiple times, ' - 'the filter matches if atleast one of the conditions are met. E.g. --match-filter ' - '!is_live --match-filter "like_count>?100 & description~=\'(?i)\\bcats \\& dogs\\b\'" ' + 'the filter matches if at least one of the conditions is met. E.g. --match-filters ' + '!is_live --match-filters "like_count>?100 & description~=\'(?i)\\bcats \\& dogs\\b\'" ' 'matches only videos that are not live OR those that have a like count more than 100 ' '(or the like field is not available) and also has a description ' 'that contains the phrase "cats & dogs" (caseless). 
' - 'Use "--match-filter -" to interactively ask whether to download each video')) + 'Use "--match-filters -" to interactively ask whether to download each video')) selection.add_option( '--no-match-filters', dest='match_filter', action='store_const', const=None, - help='Do not use any --match-filter (default)') + help='Do not use any --match-filters (default)') selection.add_option( '--break-match-filters', metavar='FILTER', dest='breaking_match_filter', action='append', @@ -703,7 +712,7 @@ def create_parser(): selection.add_option( '--break-per-input', action='store_true', dest='break_per_url', default=False, - help='Alters --max-downloads, --break-on-existing, --break-match-filter, and autonumber to reset per input URL') + help='Alters --max-downloads, --break-on-existing, --break-match-filters, and autonumber to reset per input URL') selection.add_option( '--no-break-per-input', action='store_false', dest='break_per_url', @@ -824,7 +833,7 @@ def create_parser(): '--prefer-free-formats', action='store_true', dest='prefer_free_formats', default=False, help=( - 'Prefer video formats with free containers over non-free ones of same quality. ' + 'Prefer video formats with free containers over non-free ones of the same quality. ' 'Use with "-S ext" to strictly prefer free containers irrespective of quality')) video_format.add_option( '--no-prefer-free-formats', @@ -898,13 +907,14 @@ def create_parser(): subtitles.add_option( '--sub-format', action='store', dest='subtitlesformat', metavar='FORMAT', default='best', - help='Subtitle format; accepts formats preference, e.g. "srt" or "ass/srt/best"') + help='Subtitle format; accepts formats preference separated by "/", e.g. "srt" or "ass/srt/best"') subtitles.add_option( '--sub-langs', '--srt-langs', action='callback', dest='subtitleslangs', metavar='LANGS', type='str', default=[], callback=_list_from_options_callback, help=( - 'Languages of the subtitles to download (can be regex) or "all" separated by commas, e.g. 
--sub-langs "en.*,ja". ' + 'Languages of the subtitles to download (can be regex) or "all" separated by commas, e.g. --sub-langs "en.*,ja" ' + '(where "en.*" is a regex pattern that matches "en" followed by 0 or more of any character). ' 'You can prefix the language code with a "-" to exclude it from the requested languages, e.g. --sub-langs all,-live_chat. ' 'Use --list-subs for a list of available language tags')) @@ -1173,7 +1183,7 @@ def create_parser(): '--print-to-file', metavar='[WHEN:]TEMPLATE FILE', dest='print_to_file', nargs=2, **when_prefix('video'), help=( - 'Append given template to the file. The values of WHEN and TEMPLATE are same as that of --print. ' + 'Append given template to the file. The values of WHEN and TEMPLATE are the same as that of --print. ' 'FILE uses the same syntax as the output template. This option can be used multiple times')) verbosity.add_option( '-g', '--get-url', @@ -1217,7 +1227,7 @@ def create_parser(): '-J', '--dump-single-json', action='store_true', dest='dump_single_json', default=False, help=( - 'Quiet, but print JSON information for each url or infojson passed. Simulate unless --no-simulate is used. ' + 'Quiet, but print JSON information for each URL or infojson passed. Simulate unless --no-simulate is used. ' 'If the URL refers to a playlist, the whole playlist information is dumped in a single line')) verbosity.add_option( '--print-json', @@ -1479,7 +1489,7 @@ def create_parser(): 'Optionally, the KEYRING used for decrypting Chromium cookies on Linux, ' 'the name/path of the PROFILE to load cookies from, ' 'and the CONTAINER name (if Firefox) ("none" for no container) ' - 'can be given with their respective seperators. ' + 'can be given with their respective separators. ' 'By default, all containers of the most recently accessed profile are used. 
' f'Currently supported keyrings are: {", ".join(map(str.lower, sorted(SUPPORTED_KEYRINGS)))}')) filesystem.add_option( @@ -1561,7 +1571,7 @@ def create_parser(): help=( 'Remux the video into another container if necessary ' f'(currently supported: {", ".join(FFmpegVideoRemuxerPP.SUPPORTED_EXTS)}). ' - 'If target container does not support the video/audio codec, remuxing will fail. You can specify multiple rules; ' + 'If the target container does not support the video/audio codec, remuxing will fail. You can specify multiple rules; ' 'e.g. "aac>m4a/mov>mp4/mkv" will remux aac to m4a, mov to mp4 and anything else to mkv')) postproc.add_option( '--recode-video', @@ -1667,7 +1677,7 @@ def create_parser(): postproc.add_option( '--xattrs', '--xattr', action='store_true', dest='xattrs', default=False, - help='Write metadata to the video file\'s xattrs (using dublin core and xdg standards)') + help='Write metadata to the video file\'s xattrs (using Dublin Core and XDG standards)') postproc.add_option( '--concat-playlist', metavar='POLICY', dest='concat_playlist', default='multi_video', @@ -1675,7 +1685,7 @@ def create_parser(): help=( 'Concatenate videos in a playlist. One of "never", "always", or ' '"multi_video" (default; only when the videos form a single show). ' - 'All the video files must have same codecs and number of streams to be concatable. ' + 'All the video files must have the same codecs and number of streams to be concatenable. ' 'The "pl_video:" prefix can be used with "--paths" and "--output" to ' 'set the output filename for the concatenated files. See "OUTPUT TEMPLATE" for details')) postproc.add_option( @@ -1685,8 +1695,8 @@ def create_parser(): help=( 'Automatically correct known faults of the file. 
' 'One of never (do nothing), warn (only emit a warning), ' - 'detect_or_warn (the default; fix file if we can, warn otherwise), ' - 'force (try fixing even if file already exists)')) + 'detect_or_warn (the default; fix the file if we can, warn otherwise), ' + 'force (try fixing even if the file already exists)')) postproc.add_option( '--prefer-avconv', '--no-prefer-ffmpeg', action='store_false', dest='prefer_ffmpeg', @@ -1705,7 +1715,7 @@ def create_parser(): help=( 'Execute a command, optionally prefixed with when to execute it, separated by a ":". ' 'Supported values of "WHEN" are the same as that of --use-postprocessor (default: after_move). ' - 'Same syntax as the output template can be used to pass any field as arguments to the command. ' + 'The same syntax as the output template can be used to pass any field as arguments to the command. ' 'If no fields are passed, %(filepath,_filename|)q is appended to the end of the command. ' 'This option can be used multiple times')) postproc.add_option( @@ -1724,15 +1734,17 @@ def create_parser(): '--convert-subs', '--convert-sub', '--convert-subtitles', metavar='FORMAT', dest='convertsubtitles', default=None, help=( - 'Convert the subtitles to another format (currently supported: {}) ' - '(Alias: --convert-subtitles)'.format(', '.join(sorted(FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS))))) + 'Convert the subtitles to another format ' + f'(currently supported: {", ".join(sorted(FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS))}). ' + 'Use "--convert-subs none" to disable conversion (default) (Alias: --convert-subtitles)')) postproc.add_option( '--convert-thumbnails', metavar='FORMAT', dest='convertthumbnails', default=None, help=( 'Convert the thumbnails to another format ' f'(currently supported: {", ".join(sorted(FFmpegThumbnailsConvertorPP.SUPPORTED_EXTS))}). ' - 'You can specify multiple rules using similar syntax as --remux-video')) + 'You can specify multiple rules using similar syntax as "--remux-video". 
' + 'Use "--convert-thumbnails none" to disable conversion (default)')) postproc.add_option( '--split-chapters', '--split-tracks', dest='split_chapters', action='store_true', default=False, @@ -1774,14 +1786,14 @@ def create_parser(): 'delim': None, 'process': lambda val: dict(_postprocessor_opts_parser(*val.split(':', 1))), }, help=( - 'The (case sensitive) name of plugin postprocessors to be enabled, ' + 'The (case-sensitive) name of plugin postprocessors to be enabled, ' 'and (optionally) arguments to be passed to it, separated by a colon ":". ' 'ARGS are a semicolon ";" delimited list of NAME=VALUE. ' 'The "when" argument determines when the postprocessor is invoked. ' 'It can be one of "pre_process" (after video extraction), "after_filter" (after video passes filter), ' '"video" (after --format; before --print/--output), "before_dl" (before each video download), ' '"post_process" (after each video download; default), ' - '"after_move" (after moving video file to it\'s final locations), ' + '"after_move" (after moving the video file to its final location), ' '"after_video" (after downloading and processing all formats of a video), ' 'or "playlist" (at end of playlist). ' 'This option can be used multiple times to add different postprocessors')) @@ -1798,7 +1810,7 @@ def create_parser(): }, help=( 'SponsorBlock categories to create chapters for, separated by commas. ' f'Available categories are {", ".join(SponsorBlockPP.CATEGORIES.keys())}, all and default (=all). ' - 'You can prefix the category with a "-" to exclude it. See [1] for description of the categories. ' + 'You can prefix the category with a "-" to exclude it. See [1] for descriptions of the categories. ' 'E.g. 
--sponsorblock-mark all,-preview [1] https://wiki.sponsor.ajay.app/w/Segment_Categories')) sponsorblock.add_option( '--sponsorblock-remove', metavar='CATS', @@ -1884,7 +1896,7 @@ def create_parser(): extractor.add_option( '--no-hls-split-discontinuity', dest='hls_split_discontinuity', action='store_false', - help='Do not split HLS playlists to different formats at discontinuities such as ad breaks (default)') + help='Do not split HLS playlists into different formats at discontinuities such as ad breaks (default)') _extractor_arg_parser = lambda key, vals='': (key.strip().lower().replace('-', '_'), [ val.replace(r'\,', ',').strip() for val in re.split(r'(?<!\\),', vals)]) extractor.add_option( diff --git a/yt_dlp/plugins.py b/yt_dlp/plugins.py index 3cc879fd7e..2bf55df71e 100644 --- a/yt_dlp/plugins.py +++ b/yt_dlp/plugins.py @@ -1,10 +1,12 @@ import contextlib +import functools import importlib import importlib.abc import importlib.machinery import importlib.util import inspect import itertools +import os import pkgutil import sys import traceback @@ -12,8 +14,8 @@ import zipimport from pathlib import Path from zipfile import ZipFile -from .compat import functools # isort: split from .utils import ( + Config, get_executable_path, get_system_config_dirs, get_user_config_dirs, @@ -83,6 +85,12 @@ class PluginFinder(importlib.abc.MetaPathFinder): with contextlib.suppress(ValueError): # Added when running __main__.py directly candidate_locations.remove(Path(__file__).parent) + # TODO(coletdjnz): remove when plugin globals system is implemented + if Config._plugin_dirs: + candidate_locations.extend(_get_package_paths( + *Config._plugin_dirs, + containing_folder='')) + parts = Path(*fullname.split('.')) for path in orderedSet(candidate_locations, lazy=True): candidate = path / parts @@ -137,6 +145,8 @@ def load_module(module, module_name, suffix): def load_plugins(name, suffix): classes = {} + if os.environ.get('YTDLP_NO_PLUGINS'): + return classes for finder, 
module_name, _ in iter_modules(name): if any(x.startswith('_') for x in module_name.split('.')): diff --git a/yt_dlp/postprocessor/embedthumbnail.py b/yt_dlp/postprocessor/embedthumbnail.py index 673a924685..16c8bcdda7 100644 --- a/yt_dlp/postprocessor/embedthumbnail.py +++ b/yt_dlp/postprocessor/embedthumbnail.py @@ -119,15 +119,22 @@ class EmbedThumbnailPP(FFmpegPostProcessor): if not mutagen or prefer_atomicparsley: success = False else: + self._report_run('mutagen', filename) + f = {'jpeg': MP4Cover.FORMAT_JPEG, 'png': MP4Cover.FORMAT_PNG} try: - self._report_run('mutagen', filename) + with open(thumbnail_filename, 'rb') as thumbfile: + thumb_data = thumbfile.read() + + type_ = imghdr.what(h=thumb_data) + if not type_: + raise ValueError('could not determine image type') + elif type_ not in f: + raise ValueError(f'incompatible image type: {type_}') + meta = MP4(filename) # NOTE: the 'covr' atom is a non-standard MPEG-4 atom, # Apple iTunes 'M4A' files include the 'moov.udta.meta.ilst' atom. 
- f = {'jpeg': MP4Cover.FORMAT_JPEG, 'png': MP4Cover.FORMAT_PNG}[imghdr.what(thumbnail_filename)] - with open(thumbnail_filename, 'rb') as thumbfile: - thumb_data = thumbfile.read() - meta.tags['covr'] = [MP4Cover(data=thumb_data, imageformat=f)] + meta.tags['covr'] = [MP4Cover(data=thumb_data, imageformat=f[type_])] meta.save() temp_filename = filename except Exception as err: @@ -160,9 +167,10 @@ class EmbedThumbnailPP(FFmpegPostProcessor): stdout, stderr, returncode = Popen.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if returncode: self.report_warning(f'Unable to embed thumbnails using AtomicParsley; {stderr.strip()}') + success = False # for formats that don't support thumbnails (like 3gp) AtomicParsley # won't create to the temporary file - if 'No changes' in stdout: + elif 'No changes' in stdout: self.report_warning('The file format doesn\'t support embedding a thumbnail') success = False diff --git a/yt_dlp/postprocessor/ffmpeg.py b/yt_dlp/postprocessor/ffmpeg.py index 1ed37af518..164c46d143 100644 --- a/yt_dlp/postprocessor/ffmpeg.py +++ b/yt_dlp/postprocessor/ffmpeg.py @@ -1,5 +1,6 @@ import collections import contextvars +import functools import itertools import json import os @@ -8,7 +9,7 @@ import subprocess import time from .common import PostProcessor -from ..compat import functools, imghdr +from ..compat import imghdr from ..utils import ( MEDIA_EXTENSIONS, ISO639Utils, diff --git a/yt_dlp/postprocessor/sponsorblock.py b/yt_dlp/postprocessor/sponsorblock.py index 6cf9ab62ea..b3fc8b54a8 100644 --- a/yt_dlp/postprocessor/sponsorblock.py +++ b/yt_dlp/postprocessor/sponsorblock.py @@ -33,7 +33,7 @@ class SponsorBlockPP(FFmpegPostProcessor): def __init__(self, downloader, categories=None, api='https://sponsor.ajay.app'): FFmpegPostProcessor.__init__(self, downloader) self._categories = tuple(categories or self.CATEGORIES.keys()) - self._API_URL = api if re.match('^https?://', api) else 'https://' + api + self._API_URL = api if 
re.match('https?://', api) else 'https://' + api def run(self, info): extractor = info['extractor_key'] diff --git a/yt_dlp/postprocessor/xattrpp.py b/yt_dlp/postprocessor/xattrpp.py index f822eff41c..166aabaf92 100644 --- a/yt_dlp/postprocessor/xattrpp.py +++ b/yt_dlp/postprocessor/xattrpp.py @@ -26,38 +26,40 @@ class XAttrMetadataPP(PostProcessor): XATTR_MAPPING = { 'user.xdg.referrer.url': 'webpage_url', - # 'user.xdg.comment': 'description', 'user.dublincore.title': 'title', 'user.dublincore.date': 'upload_date', - 'user.dublincore.description': 'description', 'user.dublincore.contributor': 'uploader', 'user.dublincore.format': 'format', + # We do this last because it may get us close to the xattr limits + # (e.g., 4kB on ext4), and we don't want to have the other ones fail + 'user.dublincore.description': 'description', + # 'user.xdg.comment': 'description', } def run(self, info): mtime = os.stat(info['filepath']).st_mtime self.to_screen('Writing metadata to file\'s xattrs') - try: - for xattrname, infoname in self.XATTR_MAPPING.items(): + for xattrname, infoname in self.XATTR_MAPPING.items(): + try: value = info.get(infoname) if value: if infoname == 'upload_date': value = hyphenate_date(value) write_xattr(info['filepath'], xattrname, value.encode()) - except XAttrUnavailableError as e: - raise PostProcessingError(str(e)) - except XAttrMetadataError as e: - if e.reason == 'NO_SPACE': - self.report_warning( - 'There\'s no disk space left, disk quota exceeded or filesystem xattr limit exceeded. ' - 'Some extended attributes are not written') - elif e.reason == 'VALUE_TOO_LONG': - self.report_warning('Unable to write extended attributes due to too long values.') - else: - tip = ('You need to use NTFS' if compat_os_name == 'nt' - else 'You may have to enable them in your "/etc/fstab"') - raise PostProcessingError(f'This filesystem doesn\'t support extended attributes. 
{tip}') + except XAttrUnavailableError as e: + raise PostProcessingError(str(e)) + except XAttrMetadataError as e: + if e.reason == 'NO_SPACE': + self.report_warning( + 'There\'s no disk space left, disk quota exceeded or filesystem xattr limit exceeded. ' + f'Extended attribute "{xattrname}" was not written.') + elif e.reason == 'VALUE_TOO_LONG': + self.report_warning(f'Unable to write extended attribute "{xattrname}" due to too long values.') + else: + tip = ('You need to use NTFS' if compat_os_name == 'nt' + else 'You may have to enable them in your "/etc/fstab"') + raise PostProcessingError(f'This filesystem doesn\'t support extended attributes. {tip}') self.try_utime(info['filepath'], mtime, mtime) return [], info diff --git a/yt_dlp/update.py b/yt_dlp/update.py index 8c6790d610..90df2509f0 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -2,6 +2,7 @@ from __future__ import annotations import atexit import contextlib +import functools import hashlib import json import os @@ -12,7 +13,6 @@ import sys from dataclasses import dataclass from zipimport import zipimporter -from .compat import functools # isort: split from .compat import compat_realpath from .networking import Request from .networking.exceptions import HTTPError, network_exceptions @@ -103,7 +103,6 @@ def current_git_head(): _FILE_SUFFIXES = { 'zip': '', - 'py2exe': '_min.exe', 'win_exe': '.exe', 'win_x86_exe': '_x86.exe', 'darwin_exe': '_macos', @@ -117,6 +116,7 @@ _NON_UPDATEABLE_REASONS = { **{variant: None for variant in _FILE_SUFFIXES}, # Updatable **{variant: f'Auto-update is not supported for unpackaged {name} executable; Re-download the latest release' for variant, name in {'win32_dir': 'Windows', 'darwin_dir': 'MacOS', 'linux_dir': 'Linux'}.items()}, + 'py2exe': 'py2exe is no longer supported by yt-dlp; This executable cannot be updated', 'source': 'You cannot update when running from source code; Use git to pull the latest changes', 'unknown': 'You installed yt-dlp from a manual 
build or with a package manager; Use that to update', 'other': 'You are using an unofficial build of yt-dlp; Build the executable again', @@ -135,20 +135,18 @@ def _get_binary_name(): def _get_system_deprecation(): - MIN_SUPPORTED, MIN_RECOMMENDED = (3, 8), (3, 8) + MIN_SUPPORTED, MIN_RECOMMENDED = (3, 9), (3, 9) if sys.version_info > MIN_RECOMMENDED: return None major, minor = sys.version_info[:2] + PYTHON_MSG = f'Please update to Python {".".join(map(str, MIN_RECOMMENDED))} or above' + if sys.version_info < MIN_SUPPORTED: - msg = f'Python version {major}.{minor} is no longer supported' - else: - msg = (f'Support for Python version {major}.{minor} has been deprecated. ' - '\nYou may stop receiving updates on this version at any time') + return f'Python version {major}.{minor} is no longer supported! {PYTHON_MSG}' - major, minor = MIN_RECOMMENDED - return f'{msg}! Please update to Python {major}.{minor} or above' + return f'Support for Python version {major}.{minor} has been deprecated. 
{PYTHON_MSG}' def _sha256_file(path): @@ -310,6 +308,7 @@ class Updater: if isinstance(error, HTTPError) and error.status == 404: continue self._report_network_error(f'fetch update spec: {error}') + return None self._report_error( f'The requested tag {self.requested_tag} does not exist for {self.requested_repo}', True) @@ -339,7 +338,8 @@ class Updater: continue self._report_error( - f'yt-dlp cannot be updated to {resolved_tag} since you are on an older Python version', True) + f'yt-dlp cannot be updated to {resolved_tag} since you are on an older Python version ' + 'or your operating system is not compatible with the requested build', True) return None return resolved_tag @@ -502,7 +502,7 @@ class Updater: return os.rename(old_filename, self.filename) variant = detect_variant() - if variant.startswith('win') or variant == 'py2exe': + if variant.startswith('win'): atexit.register(Popen, f'ping 127.0.0.1 -n 5 -w 1000 & del /F "{old_filename}"', shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) elif old_filename: @@ -557,9 +557,10 @@ class Updater: def _report_network_error(self, action, delim=';', tag=None): if not tag: tag = self.requested_tag + path = tag if tag == 'latest' else f'tag/{tag}' self._report_error( - f'Unable to {action}{delim} visit https://github.com/{self.requested_repo}/releases/' - + tag if tag == 'latest' else f'tag/{tag}', True) + f'Unable to {action}{delim} visit ' + f'https://github.com/{self.requested_repo}/releases/{path}', True) # XXX: Everything below this line in this class is deprecated / for compat only @property diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 664675a099..e30008e931 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -9,6 +9,7 @@ import datetime as dt import email.header import email.utils import errno +import functools import hashlib import hmac import html.entities @@ -44,7 +45,6 @@ import xml.etree.ElementTree from . 
import traversal -from ..compat import functools # isort: split from ..compat import ( compat_etree_fromstring, compat_expanduser, @@ -212,6 +212,23 @@ def write_json_file(obj, fn): raise +def partial_application(func): + sig = inspect.signature(func) + required_args = [ + param.name for param in sig.parameters.values() + if param.kind in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.VAR_POSITIONAL) + if param.default is inspect.Parameter.empty + ] + + @functools.wraps(func) + def wrapped(*args, **kwargs): + if set(required_args[len(args):]).difference(kwargs): + return functools.partial(func, *args, **kwargs) + return func(*args, **kwargs) + + return wrapped + + def find_xpath_attr(node, xpath, key, val=None): """ Find the xpath xpath[@key=val] """ assert re.match(r'^[a-zA-Z_-]+$', key) @@ -664,31 +681,51 @@ def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT): return result +def _sanitize_path_parts(parts): + sanitized_parts = [] + for part in parts: + if not part or part == '.': + continue + elif part == '..': + if sanitized_parts and sanitized_parts[-1] != '..': + sanitized_parts.pop() + sanitized_parts.append('..') + continue + # Replace invalid segments with `#` + # - trailing dots and spaces (`asdf...` => `asdf..#`) + # - invalid chars (`<>` => `##`) + sanitized_part = re.sub(r'[/<>:"\|\\?\*]|[\s.]$', '#', part) + sanitized_parts.append(sanitized_part) + + return sanitized_parts + + def sanitize_path(s, force=False): """Sanitizes and normalizes path on Windows""" - # XXX: this handles drive relative paths (c:sth) incorrectly - if sys.platform == 'win32': - force = False - drive_or_unc, _ = os.path.splitdrive(s) - elif force: - drive_or_unc = '' + if sys.platform != 'win32': + if not force: + return s + root = '/' if s.startswith('/') else '' + return root + '/'.join(_sanitize_path_parts(s.split('/'))) + + normed = s.replace('/', '\\') + + if normed.startswith('\\\\'): + # UNC path (`\\SERVER\SHARE`) or 
device path (`\\.`, `\\?`) + parts = normed.split('\\') + root = '\\'.join(parts[:4]) + '\\' + parts = parts[4:] + elif normed[1:2] == ':': + # absolute path or drive relative path + offset = 3 if normed[2:3] == '\\' else 2 + root = normed[:offset] + parts = normed[offset:].split('\\') else: - return s + # relative/drive root relative path + root = '\\' if normed[:1] == '\\' else '' + parts = normed.split('\\') - norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep) - if drive_or_unc: - norm_path.pop(0) - sanitized_path = [ - path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part) - for path_part in norm_path] - if drive_or_unc: - sanitized_path.insert(0, drive_or_unc + os.path.sep) - elif force and s and s[0] == os.path.sep: - sanitized_path.insert(0, os.path.sep) - # TODO: Fix behavioral differences <3.12 - # The workaround using `normpath` only superficially passes tests - # Ref: https://github.com/python/cpython/pull/100351 - return os.path.normpath(os.path.join(*sanitized_path)) + return root + '\\'.join(_sanitize_path_parts(parts)) def sanitize_url(url, *, scheme='http'): @@ -804,14 +841,18 @@ class Popen(subprocess.Popen): _startupinfo = None @staticmethod - def _fix_pyinstaller_ld_path(env): - """Restore LD_LIBRARY_PATH when using PyInstaller - Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations - https://github.com/yt-dlp/yt-dlp/issues/4573 - """ + def _fix_pyinstaller_issues(env): if not hasattr(sys, '_MEIPASS'): return + # Force spawning independent subprocesses for exes bundled with PyInstaller>=6.10 + # Ref: https://pyinstaller.org/en/v6.10.0/CHANGES.html#incompatible-changes + # https://github.com/yt-dlp/yt-dlp/issues/11259 + env['PYINSTALLER_RESET_ENVIRONMENT'] = '1' + + # Restore LD_LIBRARY_PATH when using PyInstaller + # Ref: 
https://pyinstaller.org/en/v6.10.0/runtime-information.html#ld-library-path-libpath-considerations + # https://github.com/yt-dlp/yt-dlp/issues/4573 def _fix(key): orig = env.get(f'{key}_ORIG') if orig is None: @@ -825,7 +866,7 @@ class Popen(subprocess.Popen): def __init__(self, args, *remaining, env=None, text=False, shell=False, **kwargs): if env is None: env = os.environ.copy() - self._fix_pyinstaller_ld_path(env) + self._fix_pyinstaller_issues(env) self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines') if text is True: @@ -1168,6 +1209,7 @@ def extract_timezone(date_str, default=None): return timezone, date_str +@partial_application def parse_iso8601(date_str, delimiter='T', timezone=None): """ Return a UNIX timestamp from the given date """ @@ -1217,7 +1259,7 @@ def unified_timestamp(date_str, day_first=True): return None date_str = re.sub(r'\s+', ' ', re.sub( - r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str)) + r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?|sun)(day)?', '', date_str)) pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0 timezone, date_str = extract_timezone(date_str) @@ -1245,6 +1287,7 @@ def unified_timestamp(date_str, day_first=True): return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds() +@partial_application def determine_ext(url, default_ext='unknown_video'): if url is None or '.' 
not in url: return default_ext @@ -1920,7 +1963,7 @@ def remove_start(s, start): def remove_end(s, end): - return s[:-len(end)] if s is not None and s.endswith(end) else s + return s[:-len(end)] if s is not None and end and s.endswith(end) else s def remove_quotes(s): @@ -1949,12 +1992,13 @@ def base_url(url): return re.match(r'https?://[^?#]+/', url).group() +@partial_application def urljoin(base, path): if isinstance(path, bytes): path = path.decode() if not isinstance(path, str) or not path: return None - if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path): + if re.match(r'(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path): return path if isinstance(base, bytes): base = base.decode() @@ -1964,11 +2008,15 @@ def urljoin(base, path): return urllib.parse.urljoin(base, path) -def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): +@partial_application +def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1, base=None): if get_attr and v is not None: v = getattr(v, get_attr, None) + if invscale == 1 and scale < 1: + invscale = int(1 / scale) + scale = 1 try: - return int(v) * invscale // scale + return (int(v) if base is None else int(v, base=base)) * invscale // scale except (ValueError, TypeError, OverflowError): return default @@ -1986,9 +2034,13 @@ def str_to_int(int_str): return int_or_none(int_str) +@partial_application def float_or_none(v, scale=1, invscale=1, default=None): if v is None: return default + if invscale == 1 and scale < 1: + invscale = int(1 / scale) + scale = 1 try: return float(v) * invscale / scale except (ValueError, TypeError): @@ -2007,7 +2059,7 @@ def url_or_none(url): if not url or not isinstance(url, str): return None url = url.strip() - return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None + return url if re.match(r'(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None def strftime_or_none(timestamp, date_format='%Y%m%d', default=None): @@ 
-2085,17 +2137,20 @@ def parse_duration(s): (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1))) -def prepend_extension(filename, ext, expected_real_ext=None): +def _change_extension(prepend, filename, ext, expected_real_ext=None): name, real_ext = os.path.splitext(filename) - return ( - f'{name}.{ext}{real_ext}' - if not expected_real_ext or real_ext[1:] == expected_real_ext - else f'{filename}.{ext}') + if not expected_real_ext or real_ext[1:] == expected_real_ext: + filename = name + if prepend and real_ext: + _UnsafeExtensionError.sanitize_extension(ext, prepend=True) + return f'{filename}.{ext}{real_ext}' -def replace_extension(filename, ext, expected_real_ext=None): - name, real_ext = os.path.splitext(filename) - return f'{name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename}.{ext}' + return f'{filename}.{_UnsafeExtensionError.sanitize_extension(ext)}' + + +prepend_extension = functools.partial(_change_extension, True) +replace_extension = functools.partial(_change_extension, False) def check_executable(exe, args=[]): @@ -2533,6 +2588,7 @@ def urlencode_postdata(*args, **kargs): return urllib.parse.urlencode(*args, **kargs).encode('ascii') +@partial_application def update_url(url, *, query_update=None, **kwargs): """Replace URL components specified by kwargs @param url str or parse url tuple @@ -2553,6 +2609,7 @@ def update_url(url, *, query_update=None, **kwargs): return urllib.parse.urlunparse(url._replace(**kwargs)) +@partial_application def update_url_query(url, query): return update_url(url, query_update=query) @@ -2874,6 +2931,7 @@ def error_to_str(err): return f'{type(err).__name__}: {err}' +@partial_application def mimetype2ext(mt, default=NO_DEFAULT): if not isinstance(mt, str): if default is not NO_DEFAULT: @@ -2916,6 +2974,7 @@ def mimetype2ext(mt, default=NO_DEFAULT): 'audio/webm': 'webm', 'audio/x-matroska': 'mka', 'audio/x-mpegurl': 'm3u', + 'aacp': 'aac', 'midi': 'mid', 'ogg': 'ogg', 'wav': 'wav', @@ 
-2981,6 +3040,7 @@ def parse_codecs(codecs_str): str.strip, codecs_str.strip().strip(',').split(',')))) vcodec, acodec, scodec, hdr = None, None, None, None for full_codec in split_codecs: + full_codec = re.sub(r'^([^.]+)', lambda m: m.group(1).lower(), full_codec) parts = re.sub(r'0+(?=\d)', '', full_codec).split('.') if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'): @@ -3108,7 +3168,7 @@ def is_html(first_bytes): while first_bytes.startswith(bom): encoding, first_bytes = enc, first_bytes[len(bom):] - return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace')) + return re.match(r'\s*<', first_bytes.decode(encoding, 'replace')) def determine_protocol(info_dict): @@ -4612,6 +4672,7 @@ def to_high_limit_path(path): return path +@partial_application def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY): val = traversal.traverse_obj(obj, *variadic(field)) if not val if ignore is NO_DEFAULT else val in variadic(ignore): @@ -4776,6 +4837,7 @@ def number_of_digits(number): return len('%d' % number) +@partial_application def join_nonempty(*values, delim='-', from_dict=None): if from_dict is not None: values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values) @@ -4845,6 +4907,10 @@ class Config: filename = None __initialized = False + # Internal only, do not use! 
Hack to enable --plugin-dirs + # TODO(coletdjnz): remove when plugin globals system is implemented + _plugin_dirs = None + def __init__(self, parser, label=None): self.parser, self.label = parser, label self._loaded_paths, self.configs = set(), [] @@ -5023,7 +5089,7 @@ MEDIA_EXTENSIONS = Namespace( common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'), video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'), common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'), - audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'), + audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'), thumbnails=('jpg', 'png', 'webp'), storyboards=('mhtml', ), subtitles=('srt', 'vtt', 'ass', 'lrc'), @@ -5035,6 +5101,137 @@ MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests) +class _UnsafeExtensionError(Exception): + """ + Mitigation exception for uncommon/malicious file extensions + This should be caught in YoutubeDL.py alongside a warning + + Ref: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-79w7-vh3h-8g4j + """ + ALLOWED_EXTENSIONS = frozenset([ + # internal + 'description', + 'json', + 'meta', + 'orig', + 'part', + 'temp', + 'uncut', + 'unknown_video', + 'ytdl', + + # video + *MEDIA_EXTENSIONS.video, + 'asx', + 'ismv', + 'm2t', + 'm2ts', + 'm2v', + 'm4s', + 'mng', + 'mp2v', + 'mp4v', + 'mpe', + 'mpeg', + 'mpeg1', + 'mpeg2', + 'mpeg4', + 'mxf', + 'ogm', + 'qt', + 'rm', + 'swf', + 'ts', + 'vob', + 'vp9', + + # audio + *MEDIA_EXTENSIONS.audio, + '3ga', + 'ac3', + 'adts', + 'aif', + 'au', + 'dts', + 'isma', + 'it', + 'mid', + 'mod', + 'mpga', + 'mp1', + 'mp2', + 'mp4a', + 'mpa', + 'ra', + 'shn', + 'xm', + + # image + *MEDIA_EXTENSIONS.thumbnails, + 'avif', + 'bmp', + 'gif', + 'heic', + 'ico', + 'image', + 'jng', + 'jpe', + 'jpeg', + 
'jxl', + 'svg', + 'tif', + 'tiff', + 'wbmp', + + # subtitle + *MEDIA_EXTENSIONS.subtitles, + 'dfxp', + 'fs', + 'ismt', + 'json3', + 'sami', + 'scc', + 'srv1', + 'srv2', + 'srv3', + 'ssa', + 'tt', + 'ttml', + 'xml', + + # others + *MEDIA_EXTENSIONS.manifests, + *MEDIA_EXTENSIONS.storyboards, + 'desktop', + 'ism', + 'm3u', + 'sbv', + 'url', + 'webloc', + ]) + + def __init__(self, extension, /): + super().__init__(f'unsafe file extension: {extension!r}') + self.extension = extension + + @classmethod + def sanitize_extension(cls, extension, /, *, prepend=False): + if extension is None: + return None + + if '/' in extension or '\\' in extension: + raise cls(extension) + + if not prepend: + _, _, last = extension.rpartition('.') + if last == 'bin': + extension = last = 'unknown_video' + if last.lower() not in cls.ALLOWED_EXTENSIONS: + raise cls(extension) + + return extension + + class RetryManager: """Usage: for retry in RetryManager(...): @@ -5091,6 +5288,7 @@ class RetryManager: time.sleep(delay) +@partial_application def make_archive_id(ie, video_id): ie_key = ie if isinstance(ie, str) else ie.ie_key() return f'{ie_key.lower()} {video_id}' @@ -5146,7 +5344,7 @@ class FormatSorter: settings = { 'vcodec': {'type': 'ordered', 'regex': True, - 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']}, + 'order': ['av0?1', 'vp0?9.0?2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']}, 'acodec': {'type': 'ordered', 'regex': True, 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']}, 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range', @@ -5392,14 +5590,15 @@ class FormatSorter: value = get_value(field) return self._calculate_field_preference_from_value(format_, field, type_, value) - def calculate_preference(self, format): + @staticmethod + def 
_fill_sorting_fields(format): # Determine missing protocol if not format.get('protocol'): format['protocol'] = determine_protocol(format) # Determine missing ext if not format.get('ext') and 'url' in format: - format['ext'] = determine_ext(format['url']) + format['ext'] = determine_ext(format['url']).lower() if format.get('vcodec') == 'none': format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none' format['video_ext'] = 'none' @@ -5427,6 +5626,8 @@ class FormatSorter: if not format.get('tbr'): format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None + def calculate_preference(self, format): + self._fill_sorting_fields(format) return tuple(self._calculate_field_preference(format, field) for field in self._order) diff --git a/yt_dlp/utils/traversal.py b/yt_dlp/utils/traversal.py index 96eb2eddf5..dd9b4690be 100644 --- a/yt_dlp/utils/traversal.py +++ b/yt_dlp/utils/traversal.py @@ -1,18 +1,35 @@ +from __future__ import annotations + +import collections import collections.abc import contextlib +import functools import http.cookies import inspect import itertools import re +import typing import xml.etree.ElementTree from ._utils import ( IDENTITY, NO_DEFAULT, + ExtractorError, LazyList, deprecation_warning, + get_elements_html_by_class, + get_elements_html_by_attribute, + get_elements_by_attribute, + get_element_html_by_attribute, + get_element_by_attribute, + get_element_html_by_id, + get_element_by_id, + get_element_html_by_class, + get_elements_by_class, + get_element_text_and_html_by_tag, is_iterable_like, try_call, + url_or_none, variadic, ) @@ -38,7 +55,7 @@ def traverse_obj( The keys in the path can be one of: - `None`: Return the current object. - `set`: Requires the only item in the set to be a type or function, - like `{type}`/`{type, type, ...}/`{func}`. If a `type`, return only + like `{type}`/`{type, type, ...}`/`{func}`. If a `type`, return only values of this type. If a function, returns `func(obj)`. 
- `str`/`int`: Return `obj[key]`. For `re.Match`, return `obj.group(key)`. - `slice`: Branch out and return all values in `obj[key]`. @@ -54,10 +71,11 @@ def traverse_obj( Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`. - `any`-builtin: Take the first matching object and return it, resetting branching. - `all`-builtin: Take all matching objects and return them as a list, resetting branching. + - `filter`-builtin: Return the value if it is truthy, `None` otherwise. `tuple`, `list`, and `dict` all support nested paths and branches. - @params paths Paths which to traverse by. + @params paths Paths by which to traverse. @param default Value to return if the paths do not match. If the last key in the path is a `dict`, it will apply to each value inside the dict instead, depth first. Try to avoid if using nested `dict` keys. @@ -247,6 +265,10 @@ def traverse_obj( objs = (list(filtered_objs),) continue + if key is filter: + objs = filter(None, objs) + continue + if __debug__ and callable(key): # Verify function signature inspect.signature(key).bind(None, None) @@ -277,13 +299,156 @@ def traverse_obj( return results[0] if results else {} if allow_empty and is_dict else None for index, path in enumerate(paths, 1): - result = _traverse_obj(obj, path, index == len(paths), True) - if result is not None: - return result + is_last = index == len(paths) + try: + result = _traverse_obj(obj, path, is_last, True) + if result is not None: + return result + except _RequiredError as e: + if is_last: + # Reraise to get cleaner stack trace + raise ExtractorError(e.orig_msg, expected=e.expected) from None return None if default is NO_DEFAULT else default +def value(value, /): + return lambda _: value + + +def require(name, /, *, expected=False): + def func(value): + if value is None: + raise _RequiredError(f'Unable to extract {name}', expected=expected) + + return value + + return func + + +class _RequiredError(ExtractorError): + pass + + +@typing.overload +def 
subs_list_to_dict(*, ext: str | None = None) -> collections.abc.Callable[[list[dict]], dict[str, list[dict]]]: ... + + +@typing.overload +def subs_list_to_dict(subs: list[dict] | None, /, *, ext: str | None = None) -> dict[str, list[dict]]: ... + + +def subs_list_to_dict(subs: list[dict] | None = None, /, *, ext=None): + """ + Convert subtitles from a traversal into a subtitle dict. + The path should have an `all` immediately before this function. + + Arguments: + `ext` The default value for `ext` in the subtitle dict + + In the dict you can set the following additional items: + `id` The subtitle id to sort the dict into + `quality` The sort order for each subtitle + """ + if subs is None: + return functools.partial(subs_list_to_dict, ext=ext) + + result = collections.defaultdict(list) + + for sub in subs: + if not url_or_none(sub.get('url')) and not sub.get('data'): + continue + sub_id = sub.pop('id', None) + if sub_id is None: + continue + if ext is not None and not sub.get('ext'): + sub['ext'] = ext + result[sub_id].append(sub) + result = dict(result) + + for subs in result.values(): + subs.sort(key=lambda x: x.pop('quality', 0) or 0) + + return result + + +@typing.overload +def find_element(*, attr: str, value: str, tag: str | None = None, html=False): ... + + +@typing.overload +def find_element(*, cls: str, html=False): ... + + +@typing.overload +def find_element(*, id: str, tag: str | None = None, html=False): ... + + +@typing.overload +def find_element(*, tag: str, html=False): ... 
+ + +def find_element(*, tag=None, id=None, cls=None, attr=None, value=None, html=False): + # deliberately using `id=` and `cls=` for ease of readability + assert tag or id or cls or (attr and value), 'One of tag, id, cls or (attr AND value) is required' + ANY_TAG = r'[\w:.-]+' + + if attr and value: + assert not cls, 'Cannot match both attr and cls' + assert not id, 'Cannot match both attr and id' + func = get_element_html_by_attribute if html else get_element_by_attribute + return functools.partial(func, attr, value, tag=tag or ANY_TAG) + + elif cls: + assert not id, 'Cannot match both cls and id' + assert tag is None, 'Cannot match both cls and tag' + func = get_element_html_by_class if html else get_elements_by_class + return functools.partial(func, cls) + + elif id: + func = get_element_html_by_id if html else get_element_by_id + return functools.partial(func, id, tag=tag or ANY_TAG) + + index = int(bool(html)) + return lambda html: get_element_text_and_html_by_tag(tag, html)[index] + + +@typing.overload +def find_elements(*, cls: str, html=False): ... + + +@typing.overload +def find_elements(*, attr: str, value: str, tag: str | None = None, html=False): ... 
+ + +def find_elements(*, tag=None, cls=None, attr=None, value=None, html=False): + # deliberately using `cls=` for ease of readability + assert cls or (attr and value), 'One of cls or (attr AND value) is required' + + if attr and value: + assert not cls, 'Cannot match both attr and cls' + func = get_elements_html_by_attribute if html else get_elements_by_attribute + return functools.partial(func, attr, value, tag=tag or r'[\w:.-]+') + + assert not tag, 'Cannot match both cls and tag' + func = get_elements_html_by_class if html else get_elements_by_class + return functools.partial(func, cls) + + +def trim_str(*, start=None, end=None): + def trim(s): + if s is None: + return None + start_idx = 0 + if start and s.startswith(start): + start_idx = len(start) + if end and s.endswith(end): + return s[start_idx:-len(end)] + return s[start_idx:] + + return trim + + def get_first(obj, *paths, **kwargs): return traverse_obj(obj, *((..., *variadic(keys)) for keys in paths), **kwargs, get_all=False) diff --git a/yt_dlp/version.py b/yt_dlp/version.py index a90b288c9a..17d7881845 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2024.05.27' +__version__ = '2024.10.22' -RELEASE_GIT_HEAD = '12b248ce60be1aa1362edd839d915bba70dbee4b' +RELEASE_GIT_HEAD = '67adeb7bab00662ba55d473e405b301abb42fe61' VARIANT = None @@ -12,4 +12,4 @@ CHANNEL = 'stable' ORIGIN = 'yt-dlp/yt-dlp' -_pkg_version = '2024.05.27' +_pkg_version = '2024.10.22'