yt-dlc/youtube_dl/extractor/gogoanime.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    compat_urllib_parse,
    get_element_by_attribute,
    unescapeHTML
)


class GoGoAnimeIE(InfoExtractor):
    """Extractor for a single GoGoAnime post page.

    The embedded players are taken from the ``<iframe>`` tags found inside
    the page's ``postcontent`` element.  Depending on the page layout the
    result is either a playlist of all embedded players or a single ``url``
    result that delegates to whichever extractor handles the first embed.
    """
    IE_NAME = 'gogoanime'
    IE_DESC = 'GoGoAnime'
    # Dots in the host name are escaped so they only match a literal '.',
    # consistent with GoGoAnimeSearchIE._VALID_URL.
    _VALID_URL = r'http://www\.gogoanime\.com/(?P<id>[A-Za-z0-9-]+)'

    _TEST = {
        'url': 'http://www.gogoanime.com/mahou-shoujo-madoka-magica-movie-1',
        'info_dict': {
            'id': 'mahou-shoujo-madoka-magica-movie-1'
        },
        'playlist_count': 3
    }

    def _real_extract(self, url):
        """Return the info dict (playlist or single url result) for ``url``.

        Raises:
            ExtractorError: if the site reports a missing page, if the
                ``postcontent`` element cannot be located, or if no usable
                embedded player is found for a single-video page.
        """
        video_id = self._match_id(url)
        page = self._download_webpage(url, video_id)

        if 'Oops! Page Not Found</font>' in page:
            raise ExtractorError('Video does not exist', expected=True)

        content = get_element_by_attribute("class", "postcontent", page)
        # NOTE(review): the helper appears to return None when no element
        # matches; guard so we fail with a readable extractor error instead
        # of a TypeError from re.findall.
        if content is None:
            raise ExtractorError('Unable to find post content')

        # Collect iframe sources (only those starting with 'h', i.e.
        # http/https), skipping any embed whose URL mentions "videofun".
        vids = [
            unescapeHTML(compat_urllib_parse.unquote(src))
            for src in re.findall(
                r'<iframe[^>]*?src=[\'"](h[^\'"]+)[\'"]', content)
            if 'videofun' not in src]

        # Pages whose post content opens with "<p><iframe ...></iframe><br />"
        # are returned as a playlist of every embed found.
        if re.search(r'<div class="postcontent">[^<]*<p><iframe src=[\'"][^>]+></iframe><br />', page):
            return self.playlist_result([self.url_result(vid) for vid in vids], video_id)

        title = self._html_search_regex(
            r'<div class="postdesc">[^<]*<h1>([^<]+)</h1>', page, 'title')

        # Without this check an empty list would surface as a bare IndexError.
        if not vids:
            raise ExtractorError('No embedded video found')

        return {
            '_type': 'url',
            'id': video_id,
            'url': vids[0],
            'title': title,
        }


class GoGoAnimeSearchIE(InfoExtractor):
    """Extractor for GoGoAnime search result pages.

    Every link found at the start of a ``postlist`` block on the result page
    becomes one entry of the returned playlist; the playlist id is the raw
    value of the ``s`` query parameter matched by ``_VALID_URL``.
    """
    IE_NAME = 'gogoanime:search'
    IE_DESC = 'GoGoAnime Search'

    _VALID_URL = r'http://www\.gogoanime\.com/.*\?s=(?P<id>[^&]*)'
    _TEST = {
        'url': 'http://www.gogoanime.com/?s=bokusatsu',
        'info_dict': {
            'id': 'bokusatsu'
        },
        'playlist_count': 6
    }

    def _real_extract(self, url):
        """Download the search page and wrap each result link in a playlist."""
        query = self._match_id(url)
        html = self._download_webpage(url, query)

        # Walk the result blocks in document order and turn each link target
        # into a url result handled by whichever extractor matches it.
        entries = []
        for hit in re.finditer(
                r'<div class="postlist">[^<]*<p[^>]*>[^<]*<a href="(?P<url>[^"]+)"',
                html):
            entries.append(self.url_result(hit.group('url')))

        return self.playlist_result(entries, query)
Add various anime sites (Closes #4554) 10 years ago			`from __future__ import unicode_literals`

			`import re`

			`from .common import InfoExtractor`
			`from ..utils import (`
			`ExtractorError,`
			`compat_urllib_parse,`
			`get_element_by_attribute,`
			`unescapeHTML`
			`)`


			`class GoGoAnimeIE(InfoExtractor):`
			`IE_NAME = 'gogoanime'`
			`IE_DESC = 'GoGoAnime'`
			`_VALID_URL = r'http://www.gogoanime.com/(?P<id>[A-Za-z0-9-]+)'`

			`_TEST = {`
			`'url': 'http://www.gogoanime.com/mahou-shoujo-madoka-magica-movie-1',`
			`'info_dict': {`
			`'id': 'mahou-shoujo-madoka-magica-movie-1'`
			`},`
			`'playlist_count': 3`
			`}`

			`def _real_extract(self, url):`
			`video_id = self._match_id(url)`
			`page = self._download_webpage(url, video_id)`

			`if 'Oops! Page Not Found</font>' in page:`
			`raise ExtractorError('Video does not exist', expected=True)`

			`content = get_element_by_attribute("class", "postcontent", page)`
			`vids = re.findall(r'<iframe[^>]*?src=[\'"](h[^\'"]+)[\'"]', content)`
			`vids = [`
			`unescapeHTML(compat_urllib_parse.unquote(x))`
			`for x in vids if not re.search(r".*videofun.*", x)]`

			`if re.search(r'<div class="postcontent">[^<]*<p><iframe src=[\'"][^>]+></iframe><br />', page):`
			`return self.playlist_result([self.url_result(vid) for vid in vids], video_id)`

			`title = self._html_search_regex(`
			`r'<div class="postdesc">[^<]*<h1>([^<]+)</h1>', page, 'title')`

			`return {`
			`'_type': 'url',`
			`'id': video_id,`
			`'url': vids[0],`
			`'title': title,`
			`}`


			`class GoGoAnimeSearchIE(InfoExtractor):`
			`IE_NAME = 'gogoanime:search'`
			`IE_DESC = 'GoGoAnime Search'`

			`_VALID_URL = r'http://www\.gogoanime\.com/.*\?s=(?P<id>[^&]*)'`
			`_TEST = {`
			`'url': 'http://www.gogoanime.com/?s=bokusatsu',`
			`'info_dict': {`
			`'id': 'bokusatsu'`
			`},`
			`'playlist_count': 6`
			`}`

			`def _real_extract(self, url):`
			`playlist_id = self._match_id(url)`
			`webpage = self._download_webpage(url, playlist_id)`

			`posts = re.findall(`
			`r'<div class="postlist">[^<]*<p[^>]*>[^<]*<a href="(?P<url>[^"]+)"',`
			`webpage)`

			`return self.playlist_result(`
			`[self.url_result(p) for p in posts], playlist_id)`