diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 4d13c17e44..fbf40f3ca9 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -191,19 +191,37 @@ class InfoExtractor(object): video_info['title'] = playlist_title return video_info - def _search_regex(self, pattern, text, name, fatal=True, flags=0): - """Extract a field from some text based on regex""" - mobj = re.search(pattern, text, flags) - if mobj is None and fatal: + def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0): + """ + Perform a regex search on the given string, using a single or a list of + patterns returning the first matching group. + In case of failure return a default value or raise a WARNING or a + ExtractorError, depending on fatal, specifying the field name. + """ + if isinstance(pattern, (str, compat_str, compiled_regex_type)): + mobj = re.search(pattern, string, flags) + else: + for p in pattern: + mobj = re.search(p, string, flags) + if mobj: break + + if sys.stderr.isatty() and os.name != 'nt': + _name = u'\033[0;34m%s\033[0m' % name + else: + _name = name + + if mobj: + # return the first matching group + return next(g for g in mobj.groups() if g is not None) + elif default is not None: + return default + elif fatal: raise ExtractorError(u'Unable to extract %s; ' - u'please report this issue on GitHub.' % name) - elif mobj is None: + u'please report this issue on GitHub.' % _name) + else: self._downloader.report_warning(u'unable to extract %s; ' - u'please report this issue on GitHub.' % name) + u'please report this issue on GitHub.' % _name) return None - else: - # return the first matched group - return next(g for g in mobj.groups() if g is not None) class SearchInfoExtractor(InfoExtractor): """ @@ -2820,12 +2838,8 @@ class StanfordOpenClassroomIE(InfoExtractor): note='Downloading course info page', errnote='Unable to download course info page') - # TODO: implement default_value in search_regex - m = re.search('

([^<]+)

', coursepage) - if m: - info['title'] = unescapeHTML(m.group(1)) - else: - info['title'] = info['id'] + info['title'] = self._search_regex('

([^<]+)

', coursepage, 'title', default=info['id']) + info['title'] = unescapeHTML(info['title']) info['description'] = self._search_regex('([^<]+)', coursepage, u'description', fatal=False) @@ -3108,12 +3122,8 @@ class GooglePlusIE(InfoExtractor): # Extract title # Get the first line for title - # TODO: implement default_value in search_regex - video_title = u'NA' - pattern = r'Date: (.*?)', webpage, 'upload_date', fatal=False) + + description = self._search_regex(r'
(.*?)', webpage, 'description', fatal=False) + info = { 'id': shortened_video_id, 'url': video_url, 'ext': 'mp4', 'title': title, - 'uploader_date': _findProp(r'Date: (.*?)
'), - 'description': _findProp(r'
(.*?)'), + 'uploader_date': uploader_date, + 'description': description, } return [info] @@ -3335,13 +3343,9 @@ class FunnyOrDieIE(InfoExtractor): webpage, u'video URL', flags=re.DOTALL) video_url = unescapeHTML(video_url) - # TODO: implement fallbacks in regex_search - m = re.search(r"

(?P.*?)</h1>", webpage, flags=re.DOTALL) - if not m: - m = re.search(r'<title>(?P<title>[^<]+?)', webpage) - if not m: - raise ExtractorError(u'Cannot find video title') - title = clean_html(m.group('title')) + title = self._search_regex((r"

(?P.*?)</h1>", + r'<title>(?P<title>[^<]+?)'), webpage, 'title', flags=re.DOTALL) + title = clean_html(title) video_description = self._search_regex(r'