[pornhub] Extract `cast`

Closes #406, https://github.com/ytdl-org/youtube-dl/pull/27384
4 years ago · d0fb4bd16f
parent 3fd4c2a543
commit d0fb4bd16f
2 changed files with 5 additions and 1 deletion
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -290,6 +290,7 @@ class InfoExtractor(object):
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
+    cast:           A list of the video cast
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    was_live:       True, False, or None (=unknown). Whether this video was
--- a/yt_dlp/extractor/pornhub.py
+++ b/yt_dlp/extractor/pornhub.py
@@ -14,6 +14,7 @@ from ..compat import (
 )
 from .openload import PhantomJSwrapper
 from ..utils import (
+    clean_html,
    determine_ext,
    ExtractorError,
    int_or_none,
@@ -145,6 +146,7 @@ class PornHubIE(PornHubBaseIE):
            'age_limit': 18,
            'tags': list,
            'categories': list,
+            'cast': list,
        },
    }, {
        # non-ASCII title
@@ -464,7 +466,7 @@ class PornHubIE(PornHubBaseIE):
                r'(?s)<div[^>]+\bclass=["\'].*?\b%sWrapper[^>]*>(.+?)</div>'
                % meta_key, webpage, meta_key, default=None)
            if div:
-                return re.findall(r'<a[^>]+\bhref=[^>]+>([^<]+)', div)
+                return [clean_html(x).strip() for x in re.findall(r'(?s)<a[^>]+\bhref=[^>]+>.+?</a>', div)]

        info = self._search_json_ld(webpage, video_id, default={})
        # description provided in JSON-LD is irrelevant
@@ -485,6 +487,7 @@ class PornHubIE(PornHubBaseIE):
            'age_limit': 18,
            'tags': extract_list('tags'),
            'categories': extract_list('categories'),
+            'cast': extract_list('pornstars'),
            'subtitles': subtitles,
        }, info)