[tnaflix] Generalize tnaflix extractors

10 years ago · d16154d163
parent c342041fba
commit d16154d163
4 changed files with 234 additions and 219 deletions
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -144,7 +144,6 @@ from .ellentv import (
 )
 from .elpais import ElPaisIE
 from .embedly import EmbedlyIE
 from .empflix import EMPFlixIE
 from .engadget import EngadgetIE
 from .eporner import EpornerIE
 from .eroprofile import EroProfileIE
@@ -311,7 +310,6 @@ from .morningstar import MorningstarIE
 from .motherless import MotherlessIE
 from .motorsport import MotorsportIE
 from .movieclips import MovieClipsIE
 from .moviefap import MovieFapIE
 from .moviezine import MoviezineIE
 from .movshare import MovShareIE
 from .mtv import (
@@ -578,7 +576,11 @@ from .tmz import (
    TMZIE,
    TMZArticleIE,
 )
-from .tnaflix import TNAFlixIE
+from .tnaflix import (
    TNAFlixIE,
    EMPFlixIE,
    MovieFapIE,
 )
 from .thvideo import (
    THVideoIE,
    THVideoPlaylistIE
--- a/youtube_dl/extractor/empflix.py
+++ b/youtube_dl/extractor/empflix.py
@@ -1,31 +0,0 @@
 from __future__ import unicode_literals
 from .tnaflix import TNAFlixIE
 class EMPFlixIE(TNAFlixIE):
    """Extractor for empflix.com video pages.

    This class defines no methods of its own: it only overrides the URL
    pattern and the page-scraping regexes, so all extraction logic is
    whatever TNAFlixIE provides.
    """

    # display_id is the URL slug, id the trailing digits before ".html".
    _VALID_URL = r'https?://(?:www\.)?empflix\.com/videos/(?P<display_id>.+?)-(?P<id>[0-9]+)\.html'

    # Patterns applied to the watch page; presumably consumed by the
    # inherited extraction code - confirm against TNAFlixIE.
    _TITLE_REGEX = r'name="title" value="(?P<title>[^"]*)"'
    _DESCRIPTION_REGEX = r'name="description" value="([^"]*)"'
    _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"'

    _TESTS = [
        {
            'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html',
            'md5': 'b1bc15b6412d33902d6e5952035fcabc',
            'info_dict': {
                'id': '33051',
                'display_id': 'Amateur-Finger-Fuck',
                'ext': 'mp4',
                'title': 'Amateur Finger Fuck',
                'description': 'Amateur solo finger fucking.',
                # Raw string: "\." is not a valid escape in a plain literal.
                'thumbnail': r're:https?://.*\.jpg$',
                'age_limit': 18,
            }
        },
        {
            # Slug containing square brackets; URL matching only.
            'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html',
            'only_matching': True,
        }
    ]
--- a/youtube_dl/extractor/moviefap.py
+++ b/youtube_dl/extractor/moviefap.py
@@ -1,135 +0,0 @@
 from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
 from ..utils import (
    xpath_text,
    str_to_int
 )
 from ..compat import compat_str
 class MovieFapIE(InfoExtractor):
    """Extractor for moviefap.com video pages.

    The watch page embeds (as ``flashvars.config``) the URL of an XML
    document that lists the media URL(s) and thumbnail data; the remaining
    metadata is scraped from the watch page itself.
    """

    _VALID_URL = r'https?://(?:www\.)?moviefap\.com/videos/(?P<id>[0-9a-f]+)/(?P<name>[a-z-_]+)'
    _TESTS = [{
        # normal, multi-format video
        'url': 'http://www.moviefap.com/videos/be9867c9416c19f54a4a/experienced-milf-amazing-handjob.html',
        'md5': '26624b4e2523051b550067d547615906',
        'info_dict': {
            'id': 'be9867c9416c19f54a4a',
            'ext': 'mp4',
            'title': 'Experienced MILF Amazing Handjob',
            'description': 'Experienced MILF giving an Amazing Handjob',
            'thumbnail': 'http://img.moviefap.com/a16:9w990r/thumbs/be/322032-20l.jpg',
            'uploader_id': 'darvinfred06',
            'display_id': 'experienced-milf-amazing-handjob',
            'categories': ['Amateur', 'Masturbation', 'Mature', 'Flashing']
        }
    }, {
        # quirky single-format case where the extension is given as fid, but the video is really an flv
        'url': 'http://www.moviefap.com/videos/e5da0d3edce5404418f5/jeune-couple-russe.html',
        'md5': 'fa56683e291fc80635907168a743c9ad',
        'info_dict': {
            'id': 'e5da0d3edce5404418f5',
            'ext': 'flv',
            'title': 'Jeune Couple Russe',
            'description': 'Amateur',
            'thumbnail': 'http://pic.moviefap.com/thumbs/e5/949-18l.jpg',
            'uploader_id': 'whiskeyjar',
            'display_id': 'jeune-couple-russe',
            'categories': ['Amateur', 'Teen']
        }
    }]

    @staticmethod
    def __get_thumbnail_data(xml):
        """
        Constructs a list of video thumbnails from timeline preview images.

        :param xml: the information XML document to parse
        :returns: a list of dicts with 'url', 'width' and 'height' keys;
                  empty when the timeline data is missing or incomplete
        """
        timeline = xml.find('timeline')
        if timeline is None:
            # not all videos have the data - ah well
            return []

        # The pattern and both bounds are required to build any URL; treat a
        # missing one like a missing timeline instead of raising.
        pattern = xpath_text(timeline, 'imagePattern')
        first = str_to_int(xpath_text(timeline, 'imageFirst'))
        last = str_to_int(xpath_text(timeline, 'imageLast'))
        if not pattern or first is None or last is None:
            return []

        # The dimensions are optional extras and may end up as None.
        width = str_to_int(xpath_text(timeline, 'imageWidth'))
        height = str_to_int(xpath_text(timeline, 'imageHeight'))

        # '#' in the pattern is the placeholder for the image index.
        return [{
            'url': pattern.replace('#', compat_str(i)),
            'width': width,
            'height': height
        } for i in range(first, last + 1)]

    def _real_extract(self, url):
        """Download the watch page and XML config and build the info dict.

        :param url: a URL matching _VALID_URL
        :returns: the info dict for the video
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        display_id = mobj.group('name')

        webpage = self._download_webpage(url, video_id)

        # find and retrieve the XML document detailing video download URLs
        info_url = self._html_search_regex(
            r'flashvars\.config = escape\("(.+?)"', webpage, 'player parameters')
        xml = self._download_xml(info_url, video_id)

        # find the video container; guess flv when the config does not say
        ext = xpath_text(xml, './videoConfig/type', 'type') or 'flv'

        # work out the video URL(s)
        formats = []
        if xml.find('videoLink') is not None:
            # single format available
            formats.append({
                'url': xpath_text(xml, 'videoLink', 'url', True),
                'ext': ext
            })
        else:
            # multiple formats available; a missing <quality> element simply
            # yields no items instead of an AttributeError
            for item in xml.findall('./quality/item'):
                resolution = xpath_text(item, 'res', 'resolution', True)  # 480p etc.
                height_match = re.search(r'\d+', resolution)
                formats.append({
                    'url': xpath_text(item, 'videoLink', 'url', True),
                    'ext': ext,
                    'resolution': resolution,
                    'height': int(height_match.group()) if height_match else None
                })
            self._sort_formats(formats)

        def search_optional(pattern, name):
            # Optional metadata: a failed match must not abort extraction.
            return self._html_search_regex(pattern, webpage, name, fatal=False)

        # float() rejects None (field missing) and non-numeric text alike.
        rating = search_optional(
            r'Current Rating<br> <strong>(.*?)</strong>', 'average_rating')
        try:
            average_rating = float(rating)
        except (TypeError, ValueError):
            average_rating = None

        categories = search_optional(r'</div>\s*(.*?)\s*<br>', 'categories')

        return {
            'id': video_id,
            'formats': formats,
            'title': self._html_search_regex(
                r'<div id="view_title"><h1>(.*?)</h1>', webpage, 'title'),
            'display_id': display_id,
            'thumbnails': self.__get_thumbnail_data(xml),
            'thumbnail': xpath_text(xml, 'startThumb', 'thumbnail'),
            'description': search_optional(
                r'name="description" value="(.*?)"', 'description'),
            'uploader_id': search_optional(
                r'name="username" value="(.*?)"', 'uploader_id'),
            'view_count': str_to_int(search_optional(
                r'<br>Views <strong>([0-9]+)</strong>', 'view_count')),
            'average_rating': average_rating,
            'comment_count': str_to_int(search_optional(
                r'<span id="comCount">([0-9]+)</span>', 'comment_count')),
            'age_limit': 18,
            'webpage_url': search_optional(
                r'name="link" value="(.*?)"', 'webpage_url'),
            'categories': categories.split(', ') if categories is not None else []
        }
--- a/youtube_dl/extractor/tnaflix.py
+++ b/youtube_dl/extractor/tnaflix.py
@@ -3,39 +3,70 @@ from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
 from ..compat import compat_str
 from ..utils import (
    parse_duration,
    fix_xml_ampersands,
    float_or_none,
    int_or_none,
    parse_duration,
    str_to_int,
    xpath_text,
 )
-class TNAFlixIE(InfoExtractor):
+class TNAFlixNetworkBaseIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)'
+    # May be overridden in descendants if necessary
    _CONFIG_REGEX = [
        r'flashvars\.config\s*=\s*escape\("([^"]+)"',
        r'<input[^>]+name="config\d?" value="([^"]+)"',
    ]
    _TITLE_REGEX = r'<input[^>]+name="title" value="([^"]+)"'
    _DESCRIPTION_REGEX = r'<input[^>]+name="description" value="([^"]+)"'
    _UPLOADER_REGEX = r'<input[^>]+name="username" value="([^"]+)"'
    _VIEW_COUNT_REGEX = None
    _COMMENT_COUNT_REGEX = None
    _AVERAGE_RATING_REGEX = None
    _CATEGORIES_REGEX = r'<li[^>]*>\s*<span[^>]+class="infoTitle"[^>]*>Categories:</span>\s*<span[^>]+class="listView"[^>]*>(.+?)</span>\s*</li>'
-    _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos</title>'
+    def _extract_thumbnails(self, flix_xml):
    _DESCRIPTION_REGEX = r'<h3 itemprop="description">([^<]+)</h3>'
    _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"'
-    _TESTS = [
+        def get_child(elem, names):
-        {
+            for name in names:
-            'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878',
+                child = elem.find(name)
-            'md5': 'ecf3498417d09216374fc5907f9c6ec0',
+                if child is not None:
-            'info_dict': {
+                    return child
-                'id': '553878',
+
-                'display_id': 'Carmella-Decesare-striptease',
+        timeline = get_child(flix_xml, ['timeline', 'rolloverBarImage'])
-                'ext': 'mp4',
+        if timeline is None:
-                'title': 'Carmella Decesare - striptease',
+            return
-                'description': '',
+
-                'thumbnail': 're:https?://.*\.jpg$',
+        pattern_el = get_child(timeline, ['imagePattern', 'pattern'])
-                'duration': 91,
+        if pattern_el is None or not pattern_el.text:
-                'age_limit': 18,
+            return
-            }
+
-        },
+        first_el = get_child(timeline, ['imageFirst', 'first'])
-        {
+        last_el = get_child(timeline, ['imageLast', 'last'])
-            'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632',
+        if first_el is None or last_el is None:
-            'only_matching': True,
+            return
-        }
+
-    ]
+        first_text = first_el.text
        last_text = last_el.text
        if not first_text.isdigit() or not last_text.isdigit():
            return
        first = int(first_text)
        last = int(last_text)
        if first > last:
            return
        width = int_or_none(xpath_text(timeline, './imageWidth', 'thumbnail width'))
        height = int_or_none(xpath_text(timeline, './imageHeight', 'thumbnail height'))
        return [{
            'url': self._proto_relative_url(pattern_el.text.replace('#', compat_str(i)), 'http:'),
            'width': width,
            'height': height,
        } for i in range(first, last + 1)]
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
@@ -44,47 +75,195 @@ class TNAFlixIE(InfoExtractor):
        webpage = self._download_webpage(url, display_id)
        title = self._html_search_regex(
            self._TITLE_REGEX, webpage, 'title') if self._TITLE_REGEX else self._og_search_title(webpage)
        description = self._html_search_regex(
            self._DESCRIPTION_REGEX, webpage, 'description', fatal=False, default='')
        age_limit = self._rta_search(webpage)
        duration = parse_duration(self._html_search_meta(
            'duration', webpage, 'duration', default=None))
        cfg_url = self._proto_relative_url(self._html_search_regex(
            self._CONFIG_REGEX, webpage, 'flashvars.config'), 'http:')
        cfg_xml = self._download_xml(
-            cfg_url, display_id, note='Downloading metadata',
+            cfg_url, display_id, 'Downloading metadata',
            transform_source=fix_xml_ampersands)
        thumbnail = self._proto_relative_url(
            cfg_xml.find('./startThumb').text, 'http:')
        formats = []
        def extract_video_url(vl):
            return re.sub('speed=\d+', 'speed=', vl.text)
        video_link = cfg_xml.find('./videoLink')
        if video_link is not None:
            formats.append({
                'url': extract_video_url(video_link),
                'ext': xpath_text(cfg_xml, './videoConfig/type', 'type', default='flv'),
            })
        for item in cfg_xml.findall('./quality/item'):
-            video_url = re.sub('speed=\d+', 'speed=', item.find('videoLink').text)
+            video_link = item.find('./videoLink')
-            format_id = item.find('res').text
+            if video_link is None:
-            fmt = {
+                continue
-                'url': self._proto_relative_url(video_url, 'http:'),
+            res = item.find('res')
            format_id = None if res is None else res.text
            height = int_or_none(self._search_regex(
                r'^(\d+)[pP]', format_id, 'height', default=None))
            formats.append({
                'url': self._proto_relative_url(extract_video_url(video_link), 'http:'),
                'format_id': format_id,
-            }
+                'height': height,
-            m = re.search(r'^(\d+)', format_id)
+            })
-            if m:
+
                fmt['height'] = int(m.group(1))
            formats.append(fmt)
        self._sort_formats(formats)
        thumbnail = self._proto_relative_url(
            xpath_text(cfg_xml, './startThumb', 'thumbnail'), 'http:')
        thumbnails = self._extract_thumbnails(cfg_xml)
        title = self._html_search_regex(
            self._TITLE_REGEX, webpage, 'title') if self._TITLE_REGEX else self._og_search_title(webpage)
        age_limit = self._rta_search(webpage)
        duration = parse_duration(self._html_search_meta(
            'duration', webpage, 'duration', default=None))
        def extract_field(pattern, name):
            return self._html_search_regex(pattern, webpage, name, default=None) if pattern else None
        description = extract_field(self._DESCRIPTION_REGEX, 'description')
        uploader = extract_field(self._UPLOADER_REGEX, 'uploader')
        view_count = str_to_int(extract_field(self._VIEW_COUNT_REGEX, 'view count'))
        comment_count = str_to_int(extract_field(self._COMMENT_COUNT_REGEX, 'comment count'))
        average_rating = float_or_none(extract_field(self._AVERAGE_RATING_REGEX, 'average rating'))
        categories_str = extract_field(self._CATEGORIES_REGEX, 'categories')
        categories = categories_str.split(', ') if categories_str is not None else []
        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'thumbnails': thumbnails,
            'duration': duration,
            'age_limit': age_limit,
            'uploader': uploader,
            'view_count': view_count,
            'comment_count': comment_count,
            'average_rating': average_rating,
            'categories': categories,
            'formats': formats,
        }
 class TNAFlixIE(TNAFlixNetworkBaseIE):
    """Extractor for tnaflix.com video pages.

    Supplies the tnaflix-specific URL pattern and page regexes; the class
    defines no methods, so extraction itself comes from
    TNAFlixNetworkBaseIE.
    """

    # display_id is the URL slug, id the digits following "video".
    _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)'

    # Site-specific replacements for the base class's default patterns.
    _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos</title>'
    _DESCRIPTION_REGEX = r'<h3 itemprop="description">([^<]+)</h3>'
    _UPLOADER_REGEX = r'(?s)<span[^>]+class="infoTitle"[^>]*>Uploaded By:</span>(.+?)<div'

    _TESTS = [{
        # anonymous uploader, no categories
        'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878',
        'md5': 'ecf3498417d09216374fc5907f9c6ec0',
        'info_dict': {
            'id': '553878',
            'display_id': 'Carmella-Decesare-striptease',
            'ext': 'mp4',
            'title': 'Carmella Decesare - striptease',
            # Raw string: "\." is not a valid escape in a plain literal.
            'thumbnail': r're:https?://.*\.jpg$',
            'duration': 91,
            'age_limit': 18,
            'uploader': 'Anonymous',
            'categories': [],
        }
    }, {
        # non-anonymous uploader, categories
        'url': 'https://www.tnaflix.com/teen-porn/Educational-xxx-video/video6538',
        'md5': '0f5d4d490dbfd117b8607054248a07c0',
        'info_dict': {
            'id': '6538',
            'display_id': 'Educational-xxx-video',
            'ext': 'mp4',
            'title': 'Educational xxx video',
            'description': 'md5:b4fab8f88a8621c8fabd361a173fe5b8',
            'thumbnail': r're:https?://.*\.jpg$',
            'duration': 164,
            'age_limit': 18,
            'uploader': 'bobwhite39',
            'categories': ['Amateur Porn', 'Squirting Videos', 'Teen Girls 18+'],
        }
    }, {
        'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632',
        'only_matching': True,
    }]
 class EMPFlixIE(TNAFlixNetworkBaseIE):
    """Extractor for empflix.com video pages.

    Only the URL pattern and the uploader regex differ from
    TNAFlixNetworkBaseIE; every other pattern and the extraction logic are
    inherited unchanged.
    """

    # display_id is the URL slug, id the trailing digits before ".html".
    _VALID_URL = r'https?://(?:www\.)?empflix\.com/videos/(?P<display_id>.+?)-(?P<id>[0-9]+)\.html'

    # Captures everything between the "Uploaded By:" label and the closing </li>.
    _UPLOADER_REGEX = r'<span[^>]+class="infoTitle"[^>]*>Uploaded By:</span>(.+?)</li>'

    _TESTS = [{
        'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html',
        'md5': 'b1bc15b6412d33902d6e5952035fcabc',
        'info_dict': {
            'id': '33051',
            'display_id': 'Amateur-Finger-Fuck',
            'ext': 'mp4',
            'title': 'Amateur Finger Fuck',
            'description': 'Amateur solo finger fucking.',
            # Raw string: "\." is not a valid escape in a plain literal.
            'thumbnail': r're:https?://.*\.jpg$',
            'duration': 83,
            'age_limit': 18,
            'uploader': 'cwbike',
            'categories': ['Amateur', 'Anal', 'Fisting', 'Home made', 'Solo'],
        }
    }, {
        # Slug containing square brackets; URL matching only.
        'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html',
        'only_matching': True,
    }]
 class MovieFapIE(TNAFlixNetworkBaseIE):
    """Extractor for moviefap.com video pages.

    Enables the counters and rating that TNAFlixNetworkBaseIE leaves
    disabled (its defaults for these patterns are None) and replaces the
    categories pattern; extraction itself is inherited.
    """

    # Unlike the sibling sites, the hex id precedes the slug in the URL.
    _VALID_URL = r'https?://(?:www\.)?moviefap\.com/videos/(?P<id>[0-9a-f]+)/(?P<display_id>[^/]+)\.html'

    # The count patterns accept "," and "." inside the number.
    _VIEW_COUNT_REGEX = r'<br>Views\s*<strong>([\d,.]+)</strong>'
    _COMMENT_COUNT_REGEX = r'<span[^>]+id="comCount"[^>]*>([\d,.]+)</span>'
    _AVERAGE_RATING_REGEX = r'Current Rating\s*<br>\s*<strong>([\d.]+)</strong>'
    # Captures the text between the first inner </div> of #vid_info and the next <br>.
    _CATEGORIES_REGEX = r'(?s)<div[^>]+id="vid_info"[^>]*>\s*<div[^>]*>.+?</div>(.*?)<br>'

    _TESTS = [{
        # normal, multi-format video
        'url': 'http://www.moviefap.com/videos/be9867c9416c19f54a4a/experienced-milf-amazing-handjob.html',
        'md5': '26624b4e2523051b550067d547615906',
        'info_dict': {
            'id': 'be9867c9416c19f54a4a',
            'display_id': 'experienced-milf-amazing-handjob',
            'ext': 'mp4',
            'title': 'Experienced MILF Amazing Handjob',
            'description': 'Experienced MILF giving an Amazing Handjob',
            # Raw string: "\." is not a valid escape in a plain literal.
            'thumbnail': r're:https?://.*\.jpg$',
            'age_limit': 18,
            'uploader': 'darvinfred06',
            'view_count': int,
            'comment_count': int,
            'average_rating': float,
            'categories': ['Amateur', 'Masturbation', 'Mature', 'Flashing'],
        }
    }, {
        # quirky single-format case where the extension is given as fid, but the video is really an flv
        'url': 'http://www.moviefap.com/videos/e5da0d3edce5404418f5/jeune-couple-russe.html',
        'md5': 'fa56683e291fc80635907168a743c9ad',
        'info_dict': {
            'id': 'e5da0d3edce5404418f5',
            'display_id': 'jeune-couple-russe',
            'ext': 'flv',
            'title': 'Jeune Couple Russe',
            'description': 'Amateur',
            'thumbnail': r're:https?://.*\.jpg$',
            'age_limit': 18,
            'uploader': 'whiskeyjar',
            'view_count': int,
            'comment_count': int,
            'average_rating': float,
            'categories': ['Amateur', 'Teen'],
        }
    }]