Document and test categories (#2923)

pull/2934/head
Philipp Hagemeister 11 years ago
parent 5afa7f8bee
commit ad3bc6acd5

@ -113,6 +113,8 @@ class InfoExtractor(object):
webpage_url: The url to the video webpage, if given to youtube-dl it webpage_url: The url to the video webpage, if given to youtube-dl it
should allow to get the same result again. (It will be set should allow to get the same result again. (It will be set
by YoutubeDL if it's missing) by YoutubeDL if it's missing)
categories: A list of categories that the video falls in, for example
["Sports", "Berlin"]
Unless mentioned otherwise, the fields should be Unicode strings. Unless mentioned otherwise, the fields should be Unicode strings.

@ -242,7 +242,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
u"uploader": u"Philipp Hagemeister", u"uploader": u"Philipp Hagemeister",
u"uploader_id": u"phihag", u"uploader_id": u"phihag",
u"upload_date": u"20121002", u"upload_date": u"20121002",
u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ." u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
u"categories": [u'Science & Technology'],
} }
}, },
{ {
@ -1136,18 +1137,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
# upload date # upload date
upload_date = None upload_date = None
mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL) mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
if mobj is not None: if mobj is not None:
upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
upload_date = unified_strdate(upload_date) upload_date = unified_strdate(upload_date)
video_categories = []
# categories
m_cat_container = get_element_by_id("eow-category", video_webpage) m_cat_container = get_element_by_id("eow-category", video_webpage)
if m_cat_container: if m_cat_container:
video_categories = re.findall(r'<a[^<]+>(.*?)</a>', category = self._html_search_regex(
m_cat_container, re.DOTALL) r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
default=None)
video_categories = None if category is None else [category]
else:
video_categories = None
# description # description
video_description = get_element_by_id("eow-description", video_webpage) video_description = get_element_by_id("eow-description", video_webpage)

Loading…
Cancel
Save