[khanacademy] fix extraction(closes #2887)(closes #26803)

pull/27728/head
Remita Amine 4 years ago
parent 2c337f4e85
commit 61e669acff

@ -526,7 +526,10 @@ from .karaoketv import KaraoketvIE
from .karrierevideos import KarriereVideosIE from .karrierevideos import KarriereVideosIE
from .keezmovies import KeezMoviesIE from .keezmovies import KeezMoviesIE
from .ketnet import KetnetIE from .ketnet import KetnetIE
from .khanacademy import KhanAcademyIE from .khanacademy import (
KhanAcademyIE,
KhanAcademyUnitIE,
)
from .kickstarter import KickStarterIE from .kickstarter import KickStarterIE
from .kinja import KinjaEmbedIE from .kinja import KinjaEmbedIE
from .kinopoisk import KinoPoiskIE from .kinopoisk import KinoPoiskIE

@ -1,82 +1,107 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import re import json
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
unified_strdate, int_or_none,
parse_iso8601,
try_get,
) )
class KhanAcademyIE(InfoExtractor): class KhanAcademyBaseIE(InfoExtractor):
_VALID_URL = r'^https?://(?:(?:www|api)\.)?khanacademy\.org/(?P<key>[^/]+)/(?:[^/]+/){,2}(?P<id>[^?#/]+)(?:$|[?#])' _VALID_URL_TEMPL = r'https?://(?:www\.)?khanacademy\.org/(?P<id>(?:[^/]+/){%s}%s[^?#/&]+)'
IE_NAME = 'KhanAcademy'
_TESTS = [{ def _parse_video(self, video):
'url': 'http://www.khanacademy.org/video/one-time-pad', return {
'md5': '7b391cce85e758fb94f763ddc1bbb979', '_type': 'url_transparent',
'url': video['youtubeId'],
'id': video.get('slug'),
'title': video.get('title'),
'thumbnail': video.get('imageUrl') or video.get('thumbnailUrl'),
'duration': int_or_none(video.get('duration')),
'description': video.get('description'),
'ie_key': 'Youtube',
}
def _real_extract(self, url):
display_id = self._match_id(url)
component_props = self._parse_json(self._download_json(
'https://www.khanacademy.org/api/internal/graphql',
display_id, query={
'hash': 1604303425,
'variables': json.dumps({
'path': display_id,
'queryParams': '',
}),
})['data']['contentJson'], display_id)['componentProps']
return self._parse_component_props(component_props)
class KhanAcademyIE(KhanAcademyBaseIE):
IE_NAME = 'khanacademy'
_VALID_URL = KhanAcademyBaseIE._VALID_URL_TEMPL % ('4', 'v/')
_TEST = {
'url': 'https://www.khanacademy.org/computing/computer-science/cryptography/crypt/v/one-time-pad',
'md5': '9c84b7b06f9ebb80d22a5c8dedefb9a0',
'info_dict': { 'info_dict': {
'id': 'one-time-pad', 'id': 'FlIG3TvQCBQ',
'ext': 'webm', 'ext': 'mp4',
'title': 'The one-time pad', 'title': 'The one-time pad',
'description': 'The perfect cipher', 'description': 'The perfect cipher',
'duration': 176, 'duration': 176,
'uploader': 'Brit Cruise', 'uploader': 'Brit Cruise',
'uploader_id': 'khanacademy', 'uploader_id': 'khanacademy',
'upload_date': '20120411', 'upload_date': '20120411',
'timestamp': 1334170113,
'license': 'cc-by-nc-sa',
}, },
'add_ie': ['Youtube'], 'add_ie': ['Youtube'],
}, { }
'url': 'https://www.khanacademy.org/math/applied-math/cryptography',
def _parse_component_props(self, component_props):
video = component_props['tutorialPageData']['contentModel']
info = self._parse_video(video)
author_names = video.get('authorNames')
info.update({
'uploader': ', '.join(author_names) if author_names else None,
'timestamp': parse_iso8601(video.get('dateAdded')),
'license': video.get('kaUserLicense'),
})
return info
class KhanAcademyUnitIE(KhanAcademyBaseIE):
IE_NAME = 'khanacademy:unit'
_VALID_URL = (KhanAcademyBaseIE._VALID_URL_TEMPL % ('2', '')) + '/?(?:[?#&]|$)'
_TEST = {
'url': 'https://www.khanacademy.org/computing/computer-science/cryptography',
'info_dict': { 'info_dict': {
'id': 'cryptography', 'id': 'cryptography',
'title': 'Journey into cryptography', 'title': 'Cryptography',
'description': 'How have humans protected their secret messages through history? What has changed today?', 'description': 'How have humans protected their secret messages through history? What has changed today?',
}, },
'playlist_mincount': 3, 'playlist_mincount': 31,
}] }
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
video_id = m.group('id')
if m.group('key') == 'video': def _parse_component_props(self, component_props):
data = self._download_json( curation = component_props['curation']
'http://api.khanacademy.org/api/v1/videos/' + video_id,
video_id, 'Downloading video info')
upload_date = unified_strdate(data['date_added']) entries = []
uploader = ', '.join(data['author_names']) tutorials = try_get(curation, lambda x: x['tabs'][0]['modules'][0]['tutorials'], list) or []
return { for tutorial_number, tutorial in enumerate(tutorials, 1):
'_type': 'url_transparent', chapter_info = {
'url': data['url'], 'chapter': tutorial.get('title'),
'id': video_id, 'chapter_number': tutorial_number,
'title': data['title'], 'chapter_id': tutorial.get('id'),
'thumbnail': data['image_url'],
'duration': data['duration'],
'description': data['description'],
'uploader': uploader,
'upload_date': upload_date,
} }
else: for content_item in (tutorial.get('contentItems') or []):
# topic if content_item.get('kind') == 'Video':
data = self._download_json( info = self._parse_video(content_item)
'http://api.khanacademy.org/api/v1/topic/' + video_id, info.update(chapter_info)
video_id, 'Downloading topic info') entries.append(info)
entries = [ return self.playlist_result(
{ entries, curation.get('unit'), curation.get('title'),
'_type': 'url', curation.get('description'))
'url': c['url'],
'id': c['id'],
'title': c['title'],
}
for c in data['children'] if c['kind'] in ('Video', 'Topic')]
return {
'_type': 'playlist',
'id': video_id,
'title': data['title'],
'description': data['description'],
'entries': entries,
}

Loading…
Cancel
Save