From a0fe51d5623a18eb7c2c460a3d35f916e1752504 Mon Sep 17 00:00:00 2001
From: Teemu Ikonen <tpikonen@gmail.com>
Date: Sat, 7 May 2022 14:24:41 +0300
Subject: [PATCH] [ruutu] Support hs.fi embeds (#3547)

Authored by: tpikonen, pukkandan
---
 yt_dlp/extractor/generic.py | 29 +++++++++++++++++++++---
 yt_dlp/extractor/ruutu.py   | 45 ++++++++++++++++++++++++++++---------
 2 files changed, 61 insertions(+), 13 deletions(-)

diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py
index 8192fbb860..340161a421 100644
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -2517,6 +2517,29 @@ class GenericIE(InfoExtractor):
                 'upload_date': '20220308',
             },
         },
+        {
+            # Multiple Ruutu embeds
+            'url': 'https://www.hs.fi/kotimaa/art-2000008762560.html',
+            'info_dict': {
+                'title': 'Koronavirus | Epidemiahuippu voi olla Suomessa ohi, mutta koronaviruksen poistamista yleisvaarallisten tautien joukosta harkitaan vasta syksyllä',
+                'id': 'art-2000008762560'
+            },
+            'playlist_count': 3
+        },
+        {
+            # Ruutu embed in hs.fi with a single video
+            'url': 'https://www.hs.fi/kotimaa/art-2000008793421.html',
+            'md5': 'f8964e65d8fada6e8a562389bf366bb4',
+            'info_dict': {
+                'id': '4081841',
+                'ext': 'mp4',
+                'title': 'Puolustusvoimat siirsi panssariajoneuvoja harjoituksiin Niinisaloon 2.5.2022',
+                'thumbnail': r're:^https?://.+\.jpg$',
+                'duration': 138,
+                'age_limit': 0,
+                'upload_date': '20220504',
+            },
+        },
     ]
 
     def report_following_redirect(self, new_url):
@@ -3749,9 +3772,9 @@ class GenericIE(InfoExtractor):
             return self.playlist_from_matches(panopto_urls, video_id, video_title)
 
         # Look for Ruutu embeds
-        ruutu_url = RuutuIE._extract_url(webpage)
-        if ruutu_url:
-            return self.url_result(ruutu_url, RuutuIE)
+        ruutu_urls = RuutuIE._extract_urls(webpage)
+        if ruutu_urls:
+            return self.playlist_from_matches(ruutu_urls, video_id, video_title)
 
         # Look for HTML5 media
         entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
diff --git a/yt_dlp/extractor/ruutu.py b/yt_dlp/extractor/ruutu.py
index f5dadf2786..c6d94c1002 100644
--- a/yt_dlp/extractor/ruutu.py
+++ b/yt_dlp/extractor/ruutu.py
@@ -38,6 +38,7 @@ class RuutuIE(InfoExtractor):
                 'thumbnail': r're:^https?://.*\.jpg$',
                 'duration': 114,
                 'age_limit': 0,
+                'upload_date': '20150508',
             },
         },
         {
@@ -51,6 +52,9 @@ class RuutuIE(InfoExtractor):
                 'thumbnail': r're:^https?://.*\.jpg$',
                 'duration': 40,
                 'age_limit': 0,
+                'upload_date': '20150507',
+                'series': 'Superpesis',
+                'categories': ['Urheilu'],
             },
         },
         {
@@ -63,6 +67,8 @@ class RuutuIE(InfoExtractor):
                 'description': 'md5:7d90f358c47542e3072ff65d7b1bcffe',
                 'thumbnail': r're:^https?://.*\.jpg$',
                 'age_limit': 0,
+                'upload_date': '20151012',
+                'series': 'Läpivalaisu',
             },
         },
         # Episode where <SourceFile> is "NOT-USED", but has other
@@ -82,6 +88,9 @@ class RuutuIE(InfoExtractor):
                 'description': 'md5:bbb6963df17dfd0ecd9eb9a61bf14b52',
                 'thumbnail': r're:^https?://.*\.jpg$',
                 'age_limit': 0,
+                'upload_date': '20190320',
+                'series': 'Mysteeritarinat',
+                'duration': 1324,
             },
             'expected_warnings': [
                 'HTTP Error 502: Bad Gateway',
@@ -126,14 +135,30 @@ class RuutuIE(InfoExtractor):
     _API_BASE = 'https://gatling.nelonenmedia.fi'
 
     @classmethod
-    def _extract_url(cls, webpage):
+    def _extract_urls(cls, webpage):  # returns a list of ruutu.fi video URLs, or None when no embed is found
+        # nelonen.fi: the video id is read from the Drupal.settings JSON embedded in the page
         settings = try_call(
             lambda: json.loads(re.search(
                 r'jQuery\.extend\(Drupal\.settings, ({.+?})\);', webpage).group(1), strict=False))
-        video_id = traverse_obj(settings, (
-            'mediaCrossbowSettings', 'file', 'field_crossbow_video_id', 'und', 0, 'value'))
-        if video_id:
-            return f'http://www.ruutu.fi/video/{video_id}'
+        if settings:
+            video_id = traverse_obj(settings, (
+                'mediaCrossbowSettings', 'file', 'field_crossbow_video_id', 'und', 0, 'value'))
+            if video_id:
+                return [f'http://www.ruutu.fi/video/{video_id}']
+        # hs.fi and is.fi: video ids are read from the JSON inside the __NEXT_DATA__ <script> tag
+        settings = try_call(
+            lambda: json.loads(re.search(
+                '(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
+                webpage).group(1), strict=False))
+        if settings:
+            video_ids = dict.fromkeys(traverse_obj(settings, (  # de-duplicate but keep page order (set() would scramble it)
+                'props', 'pageProps', 'page', 'assetData', 'splitBody', ..., 'video', 'sourceId')) or [])
+            if video_ids:
+                return [f'http://www.ruutu.fi/video/{v}' for v in video_ids]
+            video_id = traverse_obj(settings, (  # no videos in the article body: fall back to the main video
+                'props', 'pageProps', 'page', 'assetData', 'mainVideo', 'sourceId'))
+            if video_id:
+                return [f'http://www.ruutu.fi/video/{video_id}']
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -206,10 +231,10 @@ class RuutuIE(InfoExtractor):
         extract_formats(video_xml.find('./Clip'))
 
         def pv(name):
-            node = find_xpath_attr(
-                video_xml, './Clip/PassthroughVariables/variable', 'name', name)
-            if node is not None:
-                return node.get('value')
+            value = try_call(lambda: find_xpath_attr(  # NOTE(review): try_call presumably yields None if the lookup raises (e.g. no matching node) - confirm
+                video_xml, './Clip/PassthroughVariables/variable', 'name', name).get('value'))
+            if value != 'NA':  # an 'NA' value falls through, so pv() returns None for it
+                return value or None  # falsy values such as '' are returned as None
 
         if not formats:
             if (not self.get_param('allow_unplayable_formats')
@@ -234,6 +259,6 @@ class RuutuIE(InfoExtractor):
             'series': pv('series_name'),
             'season_number': int_or_none(pv('season_number')),
             'episode_number': int_or_none(pv('episode_number')),
-            'categories': themes.split(',') if themes else [],
+            'categories': themes.split(',') if themes else None,
             'formats': formats,
         }