From ed4d9a40c145974abb62b36dabd5e4ab3db6aee9 Mon Sep 17 00:00:00 2001 From: kclauhk <78251477+kclauhk@users.noreply.github.com> Date: Tue, 29 Oct 2024 22:52:24 +0800 Subject: [PATCH 1/3] [ie/mediasite] Extract transcripts --- yt_dlp/extractor/mediasite.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/mediasite.py b/yt_dlp/extractor/mediasite.py index ad7ab27e28..a7bbaa5344 100644 --- a/yt_dlp/extractor/mediasite.py +++ b/yt_dlp/extractor/mediasite.py @@ -5,6 +5,7 @@ import urllib.parse from .common import InfoExtractor from ..utils import ( ExtractorError, + determine_ext, float_or_none, mimetype2ext, smuggle_url, @@ -268,7 +269,29 @@ class MediasiteIE(InfoExtractor): formats.extend(stream_formats) # XXX: Presentation['Presenters'] - # XXX: Presentation['Transcript'] + transcripts = presentation.get('Transcripts', {}) + captions, subtitles = {}, {} + for transcript in transcripts: + lang_code = traverse_obj( + transcript, (('DetailedLanguageCode', 'LanguageCode'), {str}), get_all=False) + lang_name = transcript.get('Language') + t = { + 'url': transcript.get('CaptionsUrl'), + 'name': lang_name, + } + if 'Auto-Generated' in lang_name: + captions.setdefault(lang_code, []).append(t) + else: + subtitles.setdefault(lang_code, []).append(t) + if transcript_url := presentation.get('TranscriptUrl'): + if determine_ext(transcript_url) != 'txt': + if len(transcripts) == 1 and captions: + captions.setdefault(lang_code, []).append({ + 'url': transcript_url, + 'name': lang_name, + }) + else: + subtitles.setdefault('und', []).append({'url': transcript_url}) return { 'id': resource_id, @@ -277,6 +300,8 @@ class MediasiteIE(InfoExtractor): 'duration': float_or_none(presentation.get('Duration'), 1000), 'timestamp': float_or_none(presentation.get('UnixTime'), 1000), 'formats': formats, + 'automatic_captions': captions, + 'subtitles': subtitles, 'thumbnails': thumbnails, } From 1508588107e5003b917f95316bb795d618c9ffb5 Mon Sep 17 00:00:00 2001 From: kclauhk <78251477+kclauhk@users.noreply.github.com> Date: Tue, 24 Dec 2024 17:14:51 +0800 Subject: [PATCH 2/3] Fix grouping --- yt_dlp/extractor/mediasite.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/mediasite.py b/yt_dlp/extractor/mediasite.py index a7bbaa5344..8b7040c2b4 100644 --- a/yt_dlp/extractor/mediasite.py +++ b/yt_dlp/extractor/mediasite.py @@ -285,8 +285,8 @@ class MediasiteIE(InfoExtractor): subtitles.setdefault(lang_code, []).append(t) if transcript_url := presentation.get('TranscriptUrl'): if determine_ext(transcript_url) != 'txt': - if len(transcripts) == 1 and captions: - captions.setdefault(lang_code, []).append({ + if len(transcripts) == 1: + (captions or subtitles).setdefault(lang_code, []).append({ 'url': transcript_url, 'name': lang_name, }) From 370347c3f895b3f4e0f1f1a7dce1dfb6eed5c58d Mon Sep 17 00:00:00 2001 From: kclauhk <78251477+kclauhk@users.noreply.github.com> Date: Fri, 27 Dec 2024 21:52:35 +0800 Subject: [PATCH 3/3] Deprioritize TranscriptUrl --- yt_dlp/extractor/mediasite.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/mediasite.py b/yt_dlp/extractor/mediasite.py index 8b7040c2b4..09f957394e 100644 --- a/yt_dlp/extractor/mediasite.py +++ b/yt_dlp/extractor/mediasite.py @@ -7,6 +7,7 @@ from ..utils import ( ExtractorError, determine_ext, float_or_none, + join_nonempty, mimetype2ext, smuggle_url, str_or_none, @@ -269,11 +270,11 @@ class MediasiteIE(InfoExtractor): formats.extend(stream_formats) # XXX: Presentation['Presenters'] - transcripts = presentation.get('Transcripts', {}) + transcripts = presentation.get('Transcripts', []) captions, subtitles = {}, {} for transcript in transcripts: lang_code = traverse_obj( - transcript, (('DetailedLanguageCode', 'LanguageCode'), {str}), get_all=False) + transcript, (('DetailedLanguageCode', 'LanguageCode'), {str}), get_all=False) or 'und' lang_name = transcript.get('Language') t = { 'url': transcript.get('CaptionsUrl'), @@ -283,15 +284,19 @@ class MediasiteIE(InfoExtractor): captions.setdefault(lang_code, []).append(t) else: subtitles.setdefault(lang_code, []).append(t) - if transcript_url := presentation.get('TranscriptUrl'): + if transcript_url := url_or_none(presentation.get('TranscriptUrl')): + if 'playbackTicket=' not in transcript_url: + transcript_url = join_nonempty( + transcript_url, traverse_obj(presentation, ('Streams', 0, 'SlidePlaybackTicketId', {str})), + delim='?playbackTicket=') if determine_ext(transcript_url) != 'txt': if len(transcripts) == 1: - (captions or subtitles).setdefault(lang_code, []).append({ + (captions or subtitles)[lang_code].insert(0, { 'url': transcript_url, 'name': lang_name, }) else: - subtitles.setdefault('und', []).append({'url': transcript_url}) + subtitles.setdefault('und', []).insert(0, {'url': transcript_url}) return { 'id': resource_id,