[extractor/bitchute] Simplify extractor (#5066)

* Check alternate domains when a URL does not work
* Obey `--no-check-formats`
* Remove webseeds (don't seem to exist anymore)

Authored by: flashdagger, pukkandan

Co-authored-by: Marcel <flashdagger@googlemail.com>
pull/5066/head
pukkandan 2 years ago
parent 58fb927ebd
commit f72218c199
No known key found for this signature in database
GPG Key ID: 7EEE9E1E817D0A39

@ -4,8 +4,12 @@ import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
GeoRestrictedError, HEADRequest,
clean_html,
get_element_by_class,
int_or_none,
orderedSet, orderedSet,
traverse_obj,
unified_strdate, unified_strdate,
urlencode_postdata, urlencode_postdata,
) )
@ -18,7 +22,7 @@ class BitChuteIE(InfoExtractor):
'url': 'https://www.bitchute.com/video/UGlrF9o9b-Q/', 'url': 'https://www.bitchute.com/video/UGlrF9o9b-Q/',
'md5': '7e427d7ed7af5a75b5855705ec750e2b', 'md5': '7e427d7ed7af5a75b5855705ec750e2b',
'info_dict': { 'info_dict': {
'id': 'szoMrox2JEI', 'id': 'UGlrF9o9b-Q',
'ext': 'mp4', 'ext': 'mp4',
'title': 'This is the first video on #BitChute !', 'title': 'This is the first video on #BitChute !',
'description': 'md5:a0337e7b1fe39e32336974af8173a034', 'description': 'md5:a0337e7b1fe39e32336974af8173a034',
@ -26,6 +30,21 @@ class BitChuteIE(InfoExtractor):
'uploader': 'BitChute', 'uploader': 'BitChute',
'upload_date': '20170103', 'upload_date': '20170103',
}, },
}, {
# video not downloadable in browser, but we can recover it
'url': 'https://www.bitchute.com/video/2s6B3nZjAk7R/',
'md5': '05c12397d5354bf24494885b08d24ed1',
'info_dict': {
'id': '2s6B3nZjAk7R',
'ext': 'mp4',
'filesize': 71537926,
'title': 'STYXHEXENHAMMER666 - Election Fraud, Clinton 2020, EU Armies, and Gun Control',
'description': 'md5:228ee93bd840a24938f536aeac9cf749',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'BitChute',
'upload_date': '20181113',
},
'params': {'check_formats': None},
}, { }, {
'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/', 'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/',
'only_matching': True, 'only_matching': True,
@ -34,67 +53,57 @@ class BitChuteIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
_HEADERS = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36',
'Referer': 'https://www.bitchute.com/',
}
def _check_format(self, video_url, video_id):
    """Probe a video URL and fall back to alternate seed hosts.

    Builds a de-duplicated, ordered list of candidate URLs by swapping the
    ``seedNNN`` host of ``video_url`` and issues a ``HEADRequest`` for each
    one in turn.

    Returns a dict with the first candidate that did not raise
    ``ExtractorError`` under ``'url'`` and its ``Content-Length`` header
    (via ``int_or_none``) under ``'filesize'``; returns ``None`` when every
    candidate fails.
    """
    seed_pattern = r'(^https?://)(seed\d+)(?=\.bitchute\.com)'
    # r'\g<2>' re-inserts the matched seed host, so the unmodified URL is
    # always the first candidate.
    seed_hosts = (r'\g<2>', 'seed150', 'seed151', 'seed152', 'seed153')
    candidates = orderedSet(
        re.sub(seed_pattern, fr'\g<1>{host}', video_url) for host in seed_hosts)

    for url in candidates:
        try:
            response = self._request_webpage(
                HEADRequest(url), video_id=video_id, note=f'Checking {url}', headers=self._HEADERS)
        except ExtractorError as e:
            # This candidate failed; report it and move on to the next one.
            self.to_screen(f'{video_id}: URL is invalid, skipping: {e.cause}')
        else:
            return {
                'url': url,
                'filesize': int_or_none(response.headers.get('Content-Length')),
            }
    return None
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage( webpage = self._download_webpage(
'https://www.bitchute.com/video/%s' % video_id, video_id, headers={ f'https://www.bitchute.com/video/{video_id}', video_id, headers=self._HEADERS)
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36',
})
title = self._html_search_regex( publish_date = clean_html(get_element_by_class('video-publish-date', webpage))
(r'<[^>]+\bid=["\']video-title[^>]+>([^<]+)', r'<title>([^<]+)'), entries = self._parse_html5_media_entries(url, webpage, video_id)
webpage, 'title', default=None) or self._html_search_meta(
'description', webpage, 'title',
default=None) or self._og_search_description(webpage)
format_urls = [] formats = []
for mobj in re.finditer( for format_ in traverse_obj(entries, (0, 'formats', ...)):
r'addWebSeed\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage): if self.get_param('check_formats') is not False:
format_urls.append(mobj.group('url')) format_.update(self._check_format(format_.pop('url'), video_id) or {})
format_urls.extend(re.findall(r'as=(https?://[^&"\']+)', webpage)) if 'url' not in format_:
continue
formats = [ formats.append(format_)
{'url': format_url}
for format_url in orderedSet(format_urls)]
if not formats: if not formats:
entries = self._parse_html5_media_entries( self.raise_no_formats(
url, webpage, video_id) 'Video is unavailable. Please make sure this video is playable in the browser '
if not entries: 'before reporting this issue.', expected=True, video_id=video_id)
error = self._html_search_regex(r'<h1 class="page-title">([^<]+)</h1>', webpage, 'error', default='Cannot find video')
if error == 'Video Unavailable':
raise GeoRestrictedError(error)
raise ExtractorError(error, expected=True)
formats = entries[0]['formats']
self._check_formats(formats, video_id)
if not formats:
raise self.raise_no_formats('Video is unavailable', expected=True, video_id=video_id)
self._sort_formats(formats) self._sort_formats(formats)
description = self._html_search_regex(
r'(?s)<div\b[^>]+\bclass=["\']full hidden[^>]+>(.+?)</div>',
webpage, 'description', fatal=False)
thumbnail = self._og_search_thumbnail(
webpage, default=None) or self._html_search_meta(
'twitter:image:src', webpage, 'thumbnail')
uploader = self._html_search_regex(
(r'(?s)<div class=["\']channel-banner.*?<p\b[^>]+\bclass=["\']name[^>]+>(.+?)</p>',
r'(?s)<p\b[^>]+\bclass=["\']video-author[^>]+>(.+?)</p>'),
webpage, 'uploader', fatal=False)
upload_date = unified_strdate(self._search_regex(
r'class=["\']video-publish-date[^>]+>[^<]+ at \d+:\d+ UTC on (.+?)\.',
webpage, 'upload date', fatal=False))
return { return {
'id': video_id, 'id': video_id,
'title': title, 'title': self._html_extract_title(webpage) or self._og_search_title(webpage),
'description': description, 'description': self._og_search_description(webpage, default=None),
'thumbnail': thumbnail, 'thumbnail': self._og_search_thumbnail(webpage),
'uploader': uploader, 'uploader': clean_html(get_element_by_class('owner', webpage)),
'upload_date': upload_date, 'upload_date': unified_strdate(self._search_regex(
r'at \d+:\d+ UTC on (.+?)\.', publish_date, 'upload date', fatal=False)),
'formats': formats, 'formats': formats,
} }

Loading…
Cancel
Save