[extractor/VIMP] Rework metadata extraction

* Fix extraction for sites with non-standard HLS location
* Improve thumbnail extraction
pull/12738/head
Gregor Düster 4 weeks ago
parent 6f6c1ae6a0
commit 46bd001415
No known key found for this signature in database
GPG Key ID: 1B4181FC97673B9D

@ -2,8 +2,7 @@ import functools
import re
from .common import InfoExtractor
from ..networking.exceptions import HTTPError
from ..utils import ExtractorError, OnDemandPagedList, urlencode_postdata
from ..utils import OnDemandPagedList, traverse_obj, urlencode_postdata
class VideocampusSachsenIE(InfoExtractor):
@ -106,13 +105,48 @@ class VideocampusSachsenIE(InfoExtractor):
)'''.format('|'.join(map(re.escape, _INSTANCES)))
_TESTS = [
# non-standard hls location
{
'url': 'https://video.desy.de/video/vakuum-begreifen-luftwiderstand/d60d3682854de441d0ed092e2c825f6e',
'info_dict': {
'id': 'd60d3682854de441d0ed092e2c825f6e',
'title': 'Vakuum begreifen: Luftwiderstand',
# as the url suggests, the thumbnail image files live in the
# cache, so their urls will change from time to time
'thumbnail': 'https://video.desy.de/cache/c1180148290d5ce154de7f821f432b8e.png',
'display_id': 'vakuum-begreifen-luftwiderstand',
'ext': 'mp4',
},
},
# no hls but mp4 with different qualities
{
'url': 'https://video.hcu-hamburg.de/video/stahl-und-holzbau-2-handout-2/29c45dae21191bce493e4ea18cc929a4',
'info_dict': {
'id': '29c45dae21191bce493e4ea18cc929a4',
'title': 'Stahl- und Holzbau 2 - Handout 2',
'thumbnail': 'https://video.hcu-hamburg.de/cache/7ae482b7ba22c01a0a2426df7da9e854.png',
'display_id': 'stahl-und-holzbau-2-handout-2',
'ext': 'mp4',
},
},
# no player options
{
'url': 'https://video.tu-freiberg.de/video/schauvorlesung-/19ffd2eb8a82b080b7a23c2b70a4c1a1',
'info_dict': {
'id': '19ffd2eb8a82b080b7a23c2b70a4c1a1',
'title': 'Schauvorlesung Fakultät 2 - 2024',
'description': 'md5:229b686fd94e801d7ffef6531c8710fb',
'thumbnail': 'https://video.tu-freiberg.de/cache/42ea37d0223e2e3905a850463bd8d6d5.png',
'display_id': 'schauvorlesung-',
'ext': 'mp4',
},
},
{
'url': 'https://videocampus.sachsen.de/m/e0d6c8ce6e394c188f1342f1ab7c50ed6fc4490b808699801def5cb2e46d76ca7367f622a9f516c542ffb805b24d6b643bd7c81f385acaac4c59081b87a2767b',
'info_dict': {
'id': 'e6b9349905c1628631f175712250f2a1',
'title': 'Konstruktiver Entwicklungsprozess Vorlesung 7',
'description': 'Konstruktiver Entwicklungsprozess Vorlesung 7',
'thumbnail': 'https://videocampus.sachsen.de/cache/1a985379ad3aecba8097a6902c7daa4e.jpg',
'thumbnail': 'https://videocampus.sachsen.de/cache/6730fb25578cf4e00cd6afbdc977585e.png',
'ext': 'mp4',
},
},
@ -121,8 +155,7 @@ class VideocampusSachsenIE(InfoExtractor):
'info_dict': {
'id': 'fc99c527e4205b121cb7c74433469262',
'title': 'Was ist selbstgesteuertes Lernen?',
'description': 'md5:196aa3b0509a526db62f84679522a2f5',
'thumbnail': 'https://videocampus.sachsen.de/cache/6f4a85096ba24cb398e6ce54446b57ae.jpg',
'thumbnail': 'https://videocampus.sachsen.de/cache/a7765658ac3df9e75947a4d06aef7402.png',
'display_id': 'Was-ist-selbstgesteuertes-Lernen',
'ext': 'mp4',
},
@ -132,24 +165,11 @@ class VideocampusSachsenIE(InfoExtractor):
'info_dict': {
'id': '09d4ed029002eb1bdda610f1103dd54c',
'title': 'Tutorial zur Nutzung von Adobe Connect aus Veranstalter-Sicht',
'description': 'md5:3d379ca3cc17b9da6784d7f58cca4d58',
'thumbnail': 'https://videocampus.sachsen.de/cache/2452498fe8c2d5a7dc79a05d30f407b6.jpg',
'thumbnail': 'https://videocampus.sachsen.de/cache/173fc4fe2133cc41b2905ca8976a4760.png',
'display_id': 'Tutorial-zur-Nutzung-von-Adobe-Connect-aus-Veranstalter-Sicht',
'ext': 'mp4',
},
},
{
'url': 'https://www2.univ-sba.dz/video/Presentation-de-la-Faculte-de-droit-et-des-sciences-politiques-Journee-portes-ouvertes-202122/0183356e41af7bfb83d7667b20d9b6a3',
'info_dict': {
'url': 'https://www2.univ-sba.dz/getMedium/0183356e41af7bfb83d7667b20d9b6a3.mp4',
'id': '0183356e41af7bfb83d7667b20d9b6a3',
'title': 'Présentation de la Faculté de droit et des sciences politiques - Journée portes ouvertes 2021/22',
'description': 'md5:508958bd93e0ca002ac731d94182a54f',
'thumbnail': 'https://www2.univ-sba.dz/cache/4d5d4a0b4189271a8cc6cb5328e14769.jpg',
'display_id': 'Presentation-de-la-Faculte-de-droit-et-des-sciences-politiques-Journee-portes-ouvertes-202122',
'ext': 'mp4',
},
},
{
'url': 'https://vimp.weka-fachmedien.de/video/Preisverleihung-Produkte-des-Jahres-2022/c8816f1cc942c12b6cce57c835cffd7c',
'info_dict': {
@ -166,6 +186,7 @@ class VideocampusSachsenIE(InfoExtractor):
'info_dict': {
'id': 'fc99c527e4205b121cb7c74433469262',
'title': 'Was ist selbstgesteuertes Lernen?',
'thumbnail': 'https://videocampus.sachsen.de/cache/a7765658ac3df9e75947a4d06aef7402.png',
'ext': 'mp4',
},
},
@ -176,30 +197,33 @@ class VideocampusSachsenIE(InfoExtractor):
'host', 'id', 'tmp_id', 'display_id', 'embed_id')
webpage = self._download_webpage(url, video_id or tmp_id, fatal=False) or ''
if not video_id:
video_id = embed_id or self._html_search_regex(
rf'src="https?://{host}/media/embed.*(?:\?|&)key=([0-9a-f]+)&?',
webpage, 'video_id')
formats, subtitles = [], {}
title = description = thumbnail = None
metadata = self._search_json(r'var\s+options\s*=\s*', webpage, 'player options', video_id or tmp_id or embed_id, default=None, fatal=False)
if metadata:
for source in metadata.get('sources', []):
if source.get('type') == 'application/x-mpegURL' and source.get('src'):
_formats, _subtitles = self._extract_m3u8_formats_and_subtitles(source.get('src'), video_id or tmp_id or embed_id, fatal=False)
formats.extend(_formats)
subtitles.update(_subtitles)
elif source.get('src'):
formats.append({'url': source.get('src')})
if not (display_id or tmp_id):
# Title, description from embedded page's meta wouldn't be correct
title = self._html_search_regex(r'<video-js[^>]* data-piwik-title="([^"<]+)"', webpage, 'title', fatal=False)
description = None
thumbnail = None
thumbnail = f'https://{host}{metadata.get('poster')}'
video_id = traverse_obj(metadata, ('videojsVimpOptions', 'Mediakey'))
else:
title = self._html_search_meta(('og:title', 'twitter:title', 'title'), webpage, fatal=False)
description = self._html_search_meta(
('og:description', 'twitter:description', 'description'), webpage, fatal=False)
thumbnail = self._html_search_meta(('og:image', 'twitter:image'), webpage, fatal=False)
description = self._html_search_meta(('og:description', 'twitter:description', 'description'), webpage, fatal=False)
title = self._html_search_meta(('og:title', 'twitter:title', 'title'), webpage, fatal=False)
formats, subtitles = [], {}
try:
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
f'https://{host}/media/hlsMedium/key/{video_id}/format/auto/ext/mp4/learning/0/path/m3u8',
video_id, 'mp4', m3u8_id='hls', fatal=True)
except ExtractorError as e:
if not isinstance(e.cause, HTTPError) or e.cause.status not in (404, 500):
raise
if not title:
title = self._html_search_regex(r'<h1[^>]*>([^<]+)</h1>', webpage, 'title', default=None, fatal=False)
if not title:
embed_data = self._download_json(f'https://{host}/media/embedCode', video_id, data=f'key={video_id}'.encode(), fatal=False)
if embed_data:
title = self._html_search_regex(r'title="([^"]+)"', embed_data.get('embedCode', ''), 'title', fatal=False)
formats.append({'url': f'https://{host}/getMedium/{video_id}.mp4'})

Loading…
Cancel
Save