[ie/blackboardcollaborate] Support more URl formats and cleanup ClassCollaborate and CollaborateUltra extractors

2 months ago · 6f9fae2337
parent b4a3632666
commit 6f9fae2337
1 changed files with 97 additions and 73 deletions
--- a/yt_dlp/extractor/blackboardcollaborate.py
+++ b/yt_dlp/extractor/blackboardcollaborate.py
@ -1,3 +1,5 @@
+import urllib.parse
+
 from .common import InfoExtractor
 from ..utils import (
    UnsupportedError,
@ -13,6 +15,7 @@ from ..utils import (
    unsmuggle_url,
    url_or_none,
    urlencode_postdata,
+    value,
 )
 from ..utils.traversal import traverse_obj

@ -184,7 +187,7 @@ class BlackboardCollaborateLaunchIE(InfoExtractor):


 class BlackboardClassCollaborateIE(InfoExtractor):
-    _VALID_URL = r'https?://(?P<region>[a-z]+)-lti\.bbcollab\.com/collab/ui/scheduler/lti\?token=[\w%\.\-=]+'
+    _VALID_URL = r'https?://(?P<region>[a-z]+)-lti\.bbcollab\.com/collab/ui/scheduler/lti'

    _TESTS = [
        {
@ -193,13 +196,20 @@ class BlackboardClassCollaborateIE(InfoExtractor):
        },
    ]

-    def _call_api(self, region, video_id='', api_call='', token=None, note='Downloading JSON metadata', fatal=True):
-        if video_id == '':
-            channel_id = jwt_decode_hs256(token)['context']
-
-        return self._download_json(f'https://{region}.bbcollab.com/collab/api/csa/recordings/{video_id + api_call}',
-                                   video_id or channel_id, note=note,
-                                   headers={'Authorization': f'Bearer {token}'} if token else '', fatal=fatal)
+    def _call_api(self, region, video_id=None, path=None, token=None, note='Downloading JSON metadata', fatal=True):
+        return self._download_json(
+            f'https://{region}.bbcollab.com/collab/api/csa/recordings/{join_nonempty(video_id, path, delim="/")}',
+            video_id or jwt_decode_hs256(token)['context'], note=note, fatal=fatal,
+            headers={'Authorization': f'Bearer {token}'} if token else '')
+
+    def _entries(self, data, region, token):
+        for item in traverse_obj(data, ('results', ...)):
+            yield self.url_result(self._call_api(region, item['id'], 'url', token)['url'], BlackboardCollaborateLaunchIE, **traverse_obj(item, {
+                'id': ('id', {str_or_none}),
+                'view_count': ('playbackCount', {int_or_none}),
+                'duration': ('duration', {int_or_none(scale=1000)}),
+                'availability': ('publicLinkAllowed', {self._parse_availability}),
+            }))

    def _parse_availability(self, public_link_allowed):
        if public_link_allowed:
@ -210,45 +220,35 @@ class BlackboardClassCollaborateIE(InfoExtractor):
    def _real_extract(self, url):
        url, data = unsmuggle_url(url, {})
        region = self._match_valid_url(url)['region']
-        token = parse_qs(url).get('token')
+        token = parse_qs(url).get('token')[-1]

        playlist_info = self._call_api(region, token=token, note='Downloading playlist information')
-        entries = traverse_obj(playlist_info, ('results', ..., {
-            'id': ('id', {str_or_none}),
-            'view_count': ('playbackCount', {int_or_none}),
-            'duration': ('duration', {int_or_none(scale=1000)}),
-            'availability': ('publicLinkAllowed', {self._parse_availability}),
-        }))
-
-        for i in entries:
-            i['_type'] = 'url'
-            i['url'] = self._call_api(region, i['id'], '/url', token)['url']
-            i['ie_key'] = BlackboardCollaborateLaunchIE.ie_key()

        return self.playlist_result(
-            entries,
-            playlist_count=playlist_info['size'],
-            title=data.get('title'),
-            alt_title=data.get('alt_title'),
-            description=data.get('description'),
-            modified_timestamp=data.get('modified_timestamp'),
-            channel_id=data.get('channel_id'))
+            entries=self._entries(playlist_info, region, token),
+            playlist_count=playlist_info.get('size'),
+            **data,
+        )


 class BlackboardCollaborateUltraSingleCourseIE(InfoExtractor):
-    # Support format of either host/webapps/collab-ultra/tool/collabultra?course_id=course_id or
-    #                          host/webapps/collab-ultra/tool/collabultra/lti/launch?course_id=course_id
-    #                          host/webapps/blackboard/execute/courseMain?course_id=course_id
-    #                          host/ultra/courses/course_id/cl/outline
+    """
+    Match various URL formats including:
+    * host/webapps/collab-ultra/tool/collabultra?course_id=course_id or
+    * host/webapps/collab-ultra/tool/collabultra/lti/launch?course_id=course_id
+    * host/webapps/blackboard/execute/courseMain?course_id=course_id
+    * host/ultra/courses/course_id/cl/outline
+    * host/webapps/blackboard/execute/announcement?course_id=course_id
+    * host/webapps/blackboard/content/listContent.jsp?course_id=course_id
+    """
    _VALID_URL = r'''(?x)
-                        https://(?P<host>[\w\.]+)/(?:
+                        https://[\w\.]+/(?:
                         (?:webapps/
-                                (?:collab-ultra/tool/collabultra(?:/lti/launch)?
-                                |blackboard/execute/courseMain)
-                                    \?[\w\d_=&]*course_id=(?P<course_id>[\d_]+))
-                        |(?:ultra/courses/(?P<course_id2>[\d_]+)/cl/outline))'''
+                           (?:collab-ultra/tool/collabultra(?:/lti/launch)?
+                             |blackboard/(?:execute/(?:courseMain|announcement)|content/listContent.jsp)))
+                        |(?:ultra/courses/(?P<course_id>[\d_]+)/cl/outline))'''

-    _TESTS = [  # All Require a login
+    _TESTS = [
        {
            'url': 'https://umb.umassonline.net/webapps/collab-ultra/tool/collabultra/lti/launch?course_id=_70544_1',
            'only_matching': True,
@ -265,38 +265,52 @@ class BlackboardCollaborateUltraSingleCourseIE(InfoExtractor):
            'url': 'https://blackboard.salford.ac.uk/ultra/courses/_175809_1/cl/outline',
            'only_matching': True,
        },
+        {
+            'url': 'https://blackboard.example.com/webapps/blackboard/execute/announcement?method=search&context=course_entry&course_id=_123456_1',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://vuws.westernsydney.edu.au/webapps/blackboard/content/listContent.jsp?course_id= _41005_1&content_id=_7747469_1',
+            'only_matching': True,
+        },
    ]

    def _real_extract(self, url):
-        mobj = self._match_valid_url(url)
-        course_id = mobj.group('course_id') or mobj.group('course_id2')
-        host = mobj.group('host')
+        course_id = parse_qs(url).get('course_id')[-1] or self._match_valid_url(url)['course_id']
+        host = urllib.parse.urlparse(url).hostname

        course_data = self._download_webpage(
-            f'https://{host}/webapps/collab-ultra/tool/collabultra/lti/launch?course_id={course_id}', course_id, 'Downloading course data')
+            f'https://{host}/webapps/collab-ultra/tool/collabultra/lti/launch?course_id={course_id}',
+            course_id, note='Downloading course data')

        attrs = self._hidden_inputs(course_data)
        endpoint = self._html_search_regex(r'<form[^>]+action="([^"]+)"', course_data, 'form_action')
-
        redirect_url = self._request_webpage(endpoint, course_id, 'Getting authentication token',
                                             data=urlencode_postdata(attrs)).url

-        course_info = self._download_json(f'https://{host}/learn/api/v1/courses/{course_id}',
-                                          course_id, 'Downloading extra metadata', fatal=False)
+        # Ref: https://developer.blackboard.com/portal/displayApi
+        course_info = self._download_json(f'https://{host}/learn/api/v1/courses/{course_id}', course_id,
+                                          note='Downloading extra metadata', fatal=False)

-        return self.url_result(smuggle_url(redirect_url, {
-            'title': course_info.get('displayName'),
-            'alt_title': course_info.get('displayId'),  # Could also use courseId
-            'description': course_info.get('description'),
-            'modified_timestamp': parse_iso8601(course_info.get('modifiedDate')),
-            'channel_id': course_id,
-        }), ie=BlackboardClassCollaborateIE.ie_key(), video_id=None)
+        if self.suitable(redirect_url):
+            raise UnsupportedError(redirect_url)
+
+        return self.url_result(smuggle_url(
+            redirect_url,
+            traverse_obj(course_info, {
+                'title': (('name', 'displayName'), {str}, any),
+                'id': (('id', 'displayId', 'courseId'), {str}, any),
+                'url': (('guestAccessUrl', 'externalAccessUrl', {value(join_nonempty('https://', host, course_info['homePageUrl'], delim=''))}), {url_or_none}, any),
+                'description': ('description', {str}),
+                'modified_timestamp': ('modifiedDate', {parse_iso8601}),
+            }),
+        ), ie=BlackboardClassCollaborateIE, video_id=None)


 class BlackboardCollaborateUltraAllCoursesIE(InfoExtractor):
-    _VALID_URL = r'https://(?P<host>[\w\.]+)/ultra/institution-page'
+    _VALID_URL = r'https://[\w\.]+/ultra/institution-page'

-    _TESTS = [  # All Require a login
+    _TESTS = [
        {
            'url': 'https://umb.umassonline.net/ultra/institution-page',
            'only_matching': True,
@ -315,28 +329,38 @@ class BlackboardCollaborateUltraAllCoursesIE(InfoExtractor):
        },
    ]

+    def _download_course_list(self, host, offset):
+        # Ref: https://developer.blackboard.com/portal/displayApi
+        return self._download_json(
+            f'https://{host}/learn/api/v1/users/me/memberships?fields=course&includeCount=true&offset={offset}',
+            video_id=None, note='Finding courses')
+
+    def _entries(self, data, host):
+        for item in traverse_obj(data, (..., 'course')):
+            if item['isAvailable']:
+                yield self.url_result(
+                    ie=BlackboardCollaborateUltraSingleCourseIE,
+                    **traverse_obj(item, {
+                        'id': ('id', {str}, any),
+                        'display_id': ('courseId', 'displayId', {str}, any),
+                        'description': ('description', {str}),
+                        'title': ('displayName', 'name', {str}, any),
+                        'availability': ('isAllowGuests', {bool}, any),
+                        'url': ('guestAccessUrl', 'externalAccessUrl',
+                                {value(join_nonempty('https://', host, item['homePageUrl'], delim=''))},
+                                {url_or_none}, any),
+                    }))
+
    def _real_extract(self, url):
-        host = self._match_valid_url(url)['host']
-        endpoint = f'https://{host}/learn/api/v1/users/me/memberships?fields=course&includeCount=true'
-        number_of_courses, courses_found, user_id, entries = 1, 0, None, []
+        host = urllib.parse.urlparse(url).hostname
+        first_page = self._download_course_list(host, 0)
+        results = first_page['results']
+        number_of_courses = traverse_obj(first_page, ('paging', 'count'))
+        courses_found = number_of_courses

-        # Number of results per page seems to depend on the host and while it can be changed by '&limit=', each host seems to have a different upperbound, so a loop might be better
        while number_of_courses > courses_found:
-            current_page = self._download_json(f'{endpoint}&offset={courses_found}', user_id,
-                                               'Finding courses')
-            number_of_courses = traverse_obj(current_page, ('paging', 'count'))
-            user_id = traverse_obj(current_page, ('results', '0', 'userId'))
+            current_page = self._download_course_list(host, number_of_courses)
+            results.append(*current_page['results'])
            courses_found += len(current_page['results'])

-            for current_course in traverse_obj(current_page, ('results', ..., 'course')):
-                if current_course['isAvailable']:
-                    entries.append({
-                        'id': current_course.get('id'),
-                        'title': current_course.get('displayName'),
-                        'alt_title': current_course.get('displayId'),
-                        '_type': 'url',
-                        'url': current_course.get('externalAccessUrl') or f"{host}/{current_course.get('homePageUrl')}",
-                        'ie_key': BlackboardCollaborateUltraSingleCourseIE.ie_key(),
-                    })
-
-        return self.playlist_result(entries)
+        return self.playlist_result(self._entries(results, host))