# youtube-dl/youtube_dl/InfoExtractors.py

import base64
import datetime
import itertools
import netrc
import os
import re
import socket
import time
import email.utils
import xml.etree.ElementTree
import random
import math
import operator
import hashlib
import binascii
import urllib

from .utils import *
from .extractor.common import InfoExtractor, SearchInfoExtractor

from .extractor.ard import ARDIE
from .extractor.arte import ArteTvIE
from .extractor.bandcamp import BandcampIE
from .extractor.bliptv import BlipTVIE, BlipTVUserIE
from .extractor.comedycentral import ComedyCentralIE
from .extractor.collegehumor import CollegeHumorIE
from .extractor.dailymotion import DailymotionIE
from .extractor.depositfiles import DepositFilesIE
from .extractor.eighttracks import EightTracksIE
from .extractor.escapist import EscapistIE
from .extractor.facebook import FacebookIE
from .extractor.flickr import FlickrIE
from .extractor.funnyordie import FunnyOrDieIE
from .extractor.gametrailers import GametrailersIE
from .extractor.generic import GenericIE
from .extractor.googleplus import GooglePlusIE
from .extractor.googlesearch import GoogleSearchIE
from .extractor.howcast import HowcastIE
from .extractor.hypem import HypemIE
from .extractor.ina import InaIE
from .extractor.infoq import InfoQIE
from .extractor.justintv import JustinTVIE
from .extractor.keek import KeekIE
from .extractor.liveleak import LiveLeakIE
from .extractor.metacafe import MetacafeIE
from .extractor.mixcloud import MixcloudIE
from .extractor.mtv import MTVIE
from .extractor.myspass import MySpassIE
from .extractor.myvideo import MyVideoIE
from .extractor.nba import NBAIE
from .extractor.statigram import StatigramIE
from .extractor.photobucket import PhotobucketIE
from .extractor.pornotube import PornotubeIE
from .extractor.rbmaradio import RBMARadioIE
from .extractor.redtube import RedTubeIE
from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
from .extractor.spiegel import SpiegelIE
from .extractor.stanfordoc import StanfordOpenClassroomIE
from .extractor.steam import SteamIE
from .extractor.teamcoco import TeamcocoIE
from .extractor.ted import TEDIE
from .extractor.tumblr import TumblrIE
from .extractor.ustream import UstreamIE
from .extractor.vbox7 import Vbox7IE
from .extractor.vimeo import VimeoIE
from .extractor.vine import VineIE
from .extractor.worldstarhiphop import WorldStarHipHopIE
from .extractor.xnxx import XNXXIE
from .extractor.xvideos import XVideosIE
from .extractor.yahoo import YahooIE, YahooSearchIE
from .extractor.youjizz import YouJizzIE
from .extractor.youku import YoukuIE
from .extractor.youporn import YouPornIE
from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
from .extractor.zdf import ZDFIE


class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after "www" is escaped so that only a literal "www." prefix is
    # accepted (an unescaped "." would match any character).
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Extract the media URL and metadata for a single xHamster movie page.

        url is matched against _VALID_URL to obtain the numeric video id; the
        movie page is then downloaded and scraped with regular expressions.

        Returns a one-element list holding a dict with the keys 'id', 'url',
        'ext', 'title', 'upload_date', 'uploader_id' and 'thumbnail'.

        Raises ExtractorError if url does not match _VALID_URL or if the
        media URL cannot be found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fail with the extractor's own error type rather than an
            # AttributeError on None.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Canonical page address built from the id alone (no title slug).
        page_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(page_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        server = mobj.group('server')
        media_file = mobj.group('file')
        if not server:
            # Empty 'srv': the 'file' value is itself the (percent-encoded) URL.
            video_url = compat_urllib_parse.unquote(media_file)
        else:
            video_url = server + '/key=' + media_file
        # Extension is whatever follows the last dot of the media URL.
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(
            r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # No description is extracted: it could not be found anywhere in the UI.

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Concatenate year, month and day into a YYYYMMDD string.
            video_upload_date = (mobj.group('upload_date_Y')
                                 + mobj.group('upload_date_m')
                                 + mobj.group('upload_date_d'))
        else:
            # A missing upload date is reported but does not abort extraction.
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(
            r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        # NOTE(review): fatal=False presumably makes a missing thumbnail
        # non-fatal -- confirm against InfoExtractor._search_regex.
        video_thumbnail = self._search_regex(
            r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail,
        }]


def gen_extractors():
    """Build one fresh instance of every supported extractor.

    The position in the returned list is significant: the first extractor
    that matches a URL is the one that handles it.
    """
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    # Instantiate in declaration order so the matching priority is preserved.
    return [ie_class() for ie_class in extractor_classes]

def get_info_extractor(ie_name):
    """Look up an info extractor class in this module by its short name.

    The suffix 'IE' is appended to ie_name and the result is used as a key
    into this module's globals; a KeyError propagates if no such name exists.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]
Fix infoQ in Python3 12 years ago			`import base64`
Split code as a package, compiled into an executable zip 13 years ago			`import datetime`
8tracks IE (Closes #652) 12 years ago			`import itertools`
Split code as a package, compiled into an executable zip 13 years ago			`import netrc`
			`import os`
			`import re`
			`import socket`
			`import time`
			`import email.utils`
dropped the support for Python 2.5 let's elaborate the decision: Python 2.5 is a 6 years old release and "under the current release policy, no security issues in Python 2.5 will be fixed anymore" (!!); also, it doesn't support the new zipfile distribution format. 13 years ago			`import xml.etree.ElementTree`
add youku support 13 years ago			`import random`
			`import math`
Switch YTPlaylistIE to API (relevant: #586); fixes #651; fixes #673; fixes #661 12 years ago			`import operator`
MyVideoIE: add rtmp support 12 years ago			`import hashlib`
			`import binascii`
			`import urllib`
Split code as a package, compiled into an executable zip 13 years ago
Use relative imports 12 years ago			`from .utils import *`
Fix generic class move (add all files) 12 years ago			`from .extractor.common import InfoExtractor, SearchInfoExtractor`
Move ARD, Arte, ZDF into their own files 12 years ago
			`from .extractor.ard import ARDIE`
			`from .extractor.arte import ArteTvIE`
[Bandcamp] move into own file 12 years ago			`from .extractor.bandcamp import BandcampIE`
Move blip.tv extractors into their own file 12 years ago			`from .extractor.bliptv import BlipTVIE, BlipTVUserIE`
Move comedycentral into its own file 12 years ago			`from .extractor.comedycentral import ComedyCentralIE`
Move Collegehumor IE into its own file 12 years ago			`from .extractor.collegehumor import CollegeHumorIE`
Move DailyMotion into its own file 12 years ago			`from .extractor.dailymotion import DailymotionIE`
Move DepositFiles into its own IE 12 years ago			`from .extractor.depositfiles import DepositFilesIE`
[8tracks] Move into own file 12 years ago			`from .extractor.eighttracks import EightTracksIE`
Move Escapist into its own file 12 years ago			`from .extractor.escapist import EscapistIE`
Add facebook import 12 years ago			`from .extractor.facebook import FacebookIE`
[flickr] Move into own file 12 years ago			`from .extractor.flickr import FlickrIE`
Move FunnyOrDie into its own file 12 years ago			`from .extractor.funnyordie import FunnyOrDieIE`
Move gametrailers IE into its own file 12 years ago			`from .extractor.gametrailers import GametrailersIE`
Move GenericIE into its own file 12 years ago			`from .extractor.generic import GenericIE`
Move G+ IE into its own file, and move google search into a more descriptive module 12 years ago			`from .extractor.googleplus import GooglePlusIE`
			`from .extractor.googlesearch import GoogleSearchIE`
[howcast] Move into own file 12 years ago			`from .extractor.howcast import HowcastIE`
[hypem] Move into own file 12 years ago			`from .extractor.hypem import HypemIE`
[ina] Move into own file 12 years ago			`from .extractor.ina import InaIE`
Move infoq into its own file 12 years ago			`from .extractor.infoq import InfoQIE`
[justin.tv] move into own file 12 years ago			`from .extractor.justintv import JustinTVIE`
[keek] move into own file 12 years ago			`from .extractor.keek import KeekIE`
[LiveLeak] move into own file 12 years ago			`from .extractor.liveleak import LiveLeakIE`
Move Metacafe and Statigram into their own files, and remove absolute import 12 years ago			`from .extractor.metacafe import MetacafeIE`
Move MixCloud into its own file 12 years ago			`from .extractor.mixcloud import MixcloudIE`
Move MTV IE into its own file 12 years ago			`from .extractor.mtv import MTVIE`
[myspass] Move into own file and default to mp4 ext 12 years ago			`from .extractor.myspass import MySpassIE`
Move MyVideo into its own file 12 years ago			`from .extractor.myvideo import MyVideoIE`
Move NBA IE into its own file 12 years ago			`from .extractor.nba import NBAIE`
Move Metacafe and Statigram into their own files, and remove absolute import 12 years ago			`from .extractor.statigram import StatigramIE`
Move Photobucket into its own file 12 years ago			`from .extractor.photobucket import PhotobucketIE`
[pornotube] move into own file 12 years ago			`from .extractor.pornotube import PornotubeIE`
[RBMARadio] move into own file 12 years ago			`from .extractor.rbmaradio import RBMARadioIE`
[redtube] move into own file 12 years ago			`from .extractor.redtube import RedTubeIE`
Move Soundcloud into its own file 12 years ago			`from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE`
[Spiegel] move into own file 12 years ago			`from .extractor.spiegel import SpiegelIE`
Move StanfordOC IE into its own file 12 years ago			`from .extractor.stanfordoc import StanfordOpenClassroomIE`
Move Steam IE into its own file 12 years ago			`from .extractor.steam import SteamIE`
[Teamcoco] Move into own file 12 years ago			`from .extractor.teamcoco import TeamcocoIE`
Move TED IE into its own file 12 years ago			`from .extractor.ted import TEDIE`
[Tumblr] move into own file 12 years ago			`from .extractor.tumblr import TumblrIE`
[ustream] move into its own file 12 years ago			`from .extractor.ustream import UstreamIE`
[VBox7] move into own file 12 years ago			`from .extractor.vbox7 import Vbox7IE`
Move Vimeo into its own file 12 years ago			`from .extractor.vimeo import VimeoIE`
[Vine] move into own file 12 years ago			`from .extractor.vine import VineIE`
Move WorldStarHipHop into its own file 12 years ago			`from .extractor.worldstarhiphop import WorldStarHipHopIE`
Move Steam IE into its own file 12 years ago			`from .extractor.xnxx import XNXXIE`
Move XVideos IE into its own file (and simplify it a bit) 12 years ago			`from .extractor.xvideos import XVideosIE`
Move YahooSearchIE to youtube_dl.extractor.yahoo 12 years ago			`from .extractor.yahoo import YahooIE, YahooSearchIE`
[youjizz] move into own file 12 years ago			`from .extractor.youjizz import YouJizzIE`
Move Youku IE into its own file 12 years ago			`from .extractor.youku import YoukuIE`
[YouPorn] move into own file 12 years ago			`from .extractor.youporn import YouPornIE`
Move YoutubeSearchIE to the other youtube IEs 12 years ago			`from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE`
Move ARD, Arte, ZDF into their own files 12 years ago			`from .extractor.zdf import ZDFIE`
Add base class SearchInfoExtractor for search queries IEs 12 years ago
_download_webpage helper function 12 years ago
Split code as a package, compiled into an executable zip 13 years ago


added InfoExtractor for XNXX 13 years ago
add youku support 13 years ago
Merge pull request #398 from tempname/master 13 years ago

Merge PR #422 from 'kevinamadeus/master' Add InfoExtractor for Google Plus video (with fixes) 12 years ago

NBA IE (Closes #590) 12 years ago
Preliminary support for twitch.tv and justin.tv 12 years ago
FunnyOrDie IE (Fixes #599) 12 years ago
TweetReel IE 12 years ago
Make ustream IE more robust 12 years ago
Move gen_extractors to InfoExtractors 12 years ago
Support for WorldStarHipHop.com 12 years ago
Move gen_extractors to InfoExtractors 12 years ago
Added extractors for 3 porn sites 12 years ago

Switch YTPlaylistIE to API (relevant: #586); fixes #651; fixes #673; fixes #661 12 years ago
Added extractors for 3 porn sites 12 years ago



Add KeekIE() 12 years ago

added new InfoExtractor for myspass.de 12 years ago
Spiegel IE 12 years ago
Update InfoExtractors.py 12 years ago
add ZDFIE and _download_with_mplayer(mms://,rtsp://) 12 years ago

TumblrIE I haven't found many videos to test, so it may not work for all. 12 years ago
Add BandcampIE (closes #568) 12 years ago
Spiegel IE 12 years ago
Add support for Howcast.com - closes #835 12 years ago
Add support for Vine - closes #845 12 years ago
add support for Flickr videos - closes #261 12 years ago
use search_regex in new IEs 12 years ago
Added support for xhamster in infoextractors 12 years ago			`class XHamsterIE(InfoExtractor):`
			`"""Information Extractor for xHamster"""`
			`_VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'`

			`def _real_extract(self,url):`
			`mobj = re.match(self._VALID_URL, url)`

			`video_id = mobj.group('id')`
use search_regex in new IEs 12 years ago			`mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id`
Added support for xhamster in infoextractors 12 years ago			`webpage = self._download_webpage(mrss_url, video_id)`
use search_regex in new IEs 12 years ago
Added support for xhamster in infoextractors 12 years ago			`mobj = re.search(r'\'srv\': \'(?P<server>[^\'])\',\s\'file\': \'(?P<file>[^\']+)\',', webpage)`
			`if mobj is None:`
			`raise ExtractorError(u'Unable to extract media URL')`
			`if len(mobj.group('server')) == 0:`
			`video_url = compat_urllib_parse.unquote(mobj.group('file'))`
			`else:`
			`video_url = mobj.group('server')+'/key='+mobj.group('file')`
			`video_extension = video_url.split('.')[-1]`

_html_search_regex with clean_html superpowers 12 years ago			`video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',`
use search_regex in new IEs 12 years ago			`webpage, u'title')`
Added support for xhamster in infoextractors 12 years ago
XHamster: Can't see the description anywhere in the UI 12 years ago			`# Can't see the description anywhere in the UI`
_html_search_regex with clean_html superpowers 12 years ago			`# video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',`
XHamster: Can't see the description anywhere in the UI 12 years ago			`# webpage, u'description', fatal=False)`
			`# if video_description: video_description = unescapeHTML(video_description)`
Added support for xhamster in infoextractors 12 years ago
			`mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)`
use search_regex in new IEs 12 years ago			`if mobj:`
			`video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')`
Added support for xhamster in infoextractors 12 years ago			`else:`
use search_regex in new IEs 12 years ago			`video_upload_date = None`
			`self._downloader.report_warning(u'Unable to extract upload date')`
Added support for xhamster in infoextractors 12 years ago
test: extend the reach of info_dict checking * print the info_dict in a format suitable to easy adding to tests.json during tests if un-tested fields are detected * make it possible to put the crc32 in tests.json if the field is too long * complete the "info_dict" fields in existing tests * fixed the bugs catched doing this 12 years ago			`video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',`
use search_regex in new IEs 12 years ago			`webpage, u'uploader id', default=u'anonymous')`

			`video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',`
			`webpage, u'thumbnail', fatal=False)`
Added support for xhamster in infoextractors 12 years ago
			`return [{`
			`'id': video_id,`
			`'url': video_url,`
			`'ext': video_extension,`
			`'title': video_title,`
XHamster: Can't see the description anywhere in the UI 12 years ago			`# 'description': video_description,`
Added support for xhamster in infoextractors 12 years ago			`'upload_date': video_upload_date,`
			`'uploader_id': video_uploader_id,`
			`'thumbnail': video_thumbnail`
			`}]`
add support for Flickr videos - closes #261 12 years ago
added HypemIE rebased, closes PR #871 12 years ago


Add GametrailersIE 12 years ago
Move gen_extractors to InfoExtractors 12 years ago			`def gen_extractors():`
			`""" Return a list of an instance of every supported extractor.`
			`The order does matter; the first extractor matched is the one handling the URL.`
			`"""`
			`return [`
			`YoutubePlaylistIE(),`
			`YoutubeChannelIE(),`
			`YoutubeUserIE(),`
			`YoutubeSearchIE(),`
			`YoutubeIE(),`
			`MetacafeIE(),`
			`DailymotionIE(),`
			`GoogleSearchIE(),`
			`PhotobucketIE(),`
			`YahooIE(),`
			`YahooSearchIE(),`
			`DepositFilesIE(),`
			`FacebookIE(),`
			`BlipTVIE(),`
BlipTV: accept urls in the format http://a.blip.tv/api.swf#{id} (closes #857) Tweak the regex so that BlipTV can be before BlipTVUser. 12 years ago			`BlipTVUserIE(),`
Move gen_extractors to InfoExtractors 12 years ago			`VimeoIE(),`
			`MyVideoIE(),`
			`ComedyCentralIE(),`
			`EscapistIE(),`
			`CollegeHumorIE(),`
			`XVideosIE(),`
SoundcloudSetIE info extractor for soundcloud sets 12 years ago			`SoundcloudSetIE(),`
Move gen_extractors to InfoExtractors 12 years ago			`SoundcloudIE(),`
			`InfoQIE(),`
			`MixcloudIE(),`
			`StanfordOpenClassroomIE(),`
			`MTVIE(),`
			`YoukuIE(),`
			`XNXXIE(),`
oops - didn't remove some reminders 12 years ago			`YouJizzIE(),`
			`PornotubeIE(),`
			`YouPornIE(),`
Move gen_extractors to InfoExtractors 12 years ago			`GooglePlusIE(),`
			`ArteTvIE(),`
			`NBAIE(),`
Support for WorldStarHipHop.com 12 years ago			`WorldStarHipHopIE(),`
Move gen_extractors to InfoExtractors 12 years ago			`JustinTVIE(),`
			`FunnyOrDieIE(),`
			`SteamIE(),`
			`UstreamIE(),`
RBMA IE (Closes #630) 12 years ago			`RBMARadioIE(),`
8tracks IE (Closes #652) 12 years ago			`EightTracksIE(),`
Add KeekIE() 12 years ago			`KeekIE(),`
Basic support for TED 12 years ago			`TEDIE(),`
Add tests to MySpass 12 years ago			`MySpassIE(),`
Spiegel IE 12 years ago			`SpiegelIE(),`
Rebased, fixed and extended LiveLeak.com support close #757 - close #761 12 years ago			`LiveLeakIE(),`
added ARD InfoExtractor (german state television) 12 years ago			`ARDIE(),`
add ZDFIE and _download_with_mplayer(mms://,rtsp://) 12 years ago			`ZDFIE(),`
TumblrIE I haven't found many videos to test, so it may not work for all. 12 years ago			`TumblrIE(),`
Add BandcampIE (closes #568) 12 years ago			`BandcampIE(),`
Simplify RedTube 12 years ago			`RedTubeIE(),`
Clean up InaIE (Closes #823) 12 years ago			`InaIE(),`
Add support for Howcast.com - closes #835 12 years ago			`HowcastIE(),`
Add support for Vine - closes #845 12 years ago			`VineIE(),`
add support for Flickr videos - closes #261 12 years ago			`FlickrIE(),`
Add TeamcocoIE (closes #212) 12 years ago			`TeamcocoIE(),`
Added support for xhamster in infoextractors 12 years ago			`XHamsterIE(),`
added HypemIE rebased, closes PR #871 12 years ago			`HypemIE(),`
Added Vbox7 Infoextractor 12 years ago			`Vbox7IE(),`
Add GametrailersIE 12 years ago			`GametrailersIE(),`
Improve Statigr.am IE 12 years ago			`StatigramIE(),`
Move gen_extractors to InfoExtractors 12 years ago			`GenericIE()`
			`]`
Create a function in InfoExtractors that returns the InfoExtractor class with the given name 12 years ago
			`def get_info_extractor(ie_name):`
			`"""Returns the info extractor class with the given ie_name"""`
			`return globals()[ie_name+'IE']`