@ -1,8 +1,8 @@
# coding: utf-8
# coding: utf-8
from __future__ import unicode_literals
from __future__ import unicode_literals
import json
import re
import re
from datetime import datetime
from . . utils import float_or_none , try_get , str_to_int , unified_timestamp , merge_dicts
from . . utils import float_or_none , try_get , str_to_int , unified_timestamp
from . . compat import compat_str
from . . compat import compat_str
from . common import InfoExtractor
from . common import InfoExtractor
@ -10,10 +10,8 @@ from .common import InfoExtractor
class PodchaserIE ( InfoExtractor ) :
class PodchaserIE ( InfoExtractor ) :
_VALID_URL = r ''' (?x)
_VALID_URL = r ''' (?x)
https ? : / / ( ? : www \. ) ? podchaser \. com /
https ? : / / ( ? : www \. ) ? podchaser \. com /
( ? : ( ? : podcasts ) | ( ? : creators ) )
/ [ \w - ] + -
( ? :
( ? :
( ? P < creator_id > [ \d ] + [ \w ] + ) | ( ? P < podcast_id > [ \d ] + ) )
( ? : podcasts / [ \w - ] + - ( ? P < podcast_id > [ \d ] + ) ) )
( ? : / episodes / [ \w \- ] + -
( ? : / episodes / [ \w \- ] + -
( ? P < id > [ \d ] + ) ) ? '''
( ? P < id > [ \d ] + ) ) ? '''
@ -26,6 +24,7 @@ class PodchaserIE(InfoExtractor):
' thumbnail ' : r ' re:^https?://.* \ .jpg$ ' ,
' thumbnail ' : r ' re:^https?://.* \ .jpg$ ' ,
' ext ' : ' mp3 ' ,
' ext ' : ' mp3 ' ,
' categories ' : [ ' Comedy ' ] ,
' categories ' : [ ' Comedy ' ] ,
' tags ' : [ ' comedy ' , ' dark humor ' ] ,
' series ' : ' Cum Town ' ,
' series ' : ' Cum Town ' ,
' duration ' : 3708 ,
' duration ' : 3708 ,
' timestamp ' : 1636531259 ,
' timestamp ' : 1636531259 ,
@ -36,63 +35,80 @@ class PodchaserIE(InfoExtractor):
' info_dict ' : {
' info_dict ' : {
' id ' : ' 28853 ' ,
' id ' : ' 28853 ' ,
' title ' : ' The Bone Zone ' ,
' title ' : ' The Bone Zone ' ,
' description ' : ' md5:c39acd897170a8bf3ad94fc45dc25060 ' ,
' description ' : ' Podcast by The Bone Zone ' ,
} ,
} ,
' playlist_count ' : 6
' playlist_count ' : 275
} , {
' url ' : ' https://www.podchaser.com/creators/todd-glass-107ZzkFiEQ ' ,
' info_dict ' : {
' id ' : ' 107ZzkFiEQ ' ,
' title ' : ' Todd Glass ' ,
' description ' : ' md5:0771e81d879f304f11254e5a56a97a58 ' ,
} ,
' playlist_mincount ' : 48
} , {
} , {
' url ' : ' https://www.podchaser.com/podcasts/sean-carrolls-mindscape-scienc-699349/episodes ' ,
' url ' : ' https://www.podchaser.com/podcasts/sean-carrolls-mindscape-scienc-699349/episodes ' ,
' info_dict ' : {
' info_dict ' : {
' id ' : ' 699349 ' ,
' id ' : ' 699349 ' ,
' title ' : " Sean Carroll ' s Mindscape: Science, Society, Philosophy, Culture, Arts, and Ideas " ,
' title ' : " Sean Carroll ' s Mindscape: Science, Society, Philosophy, Culture, Arts, and Ideas " ,
' description ' : ' md5: 8692ce0c50cb900c5e4eb27b437dd67b '
' description ' : ' md5:2cbd8f4749891a84dc8235342e0b5ff1 '
} ,
} ,
' playlist_ count' : 25
' playlist_mincount ' : 199
} ]
} ]
def _real_extract ( self , url ) :
def _real_extract ( self , url ) :
mobj = re . match ( self . _VALID_URL , url )
mobj = re . match ( self . _VALID_URL , url )
audio_id , podcast_id , creator_id = mobj . group ( ' id ' ) , \
audio_id , podcast_id = mobj . group ( ' id ' ) , mobj . group ( ' podcast_id ' )
mobj . group ( ' podcast_id ' ) , mobj . group ( ' creator_id ' )
webpage = self . _download_webpage ( url , audio_id )
page_title = self . _html_search_meta ( [ ' title ' , ' og:title ' , ' twitter:title ' ] , webpage , default = None ) \
or self . _search_regex (
r ' <h1[^>]*>(.+?)</h1> ' , webpage , ' title ' , fatal = False , default = " Podchaser Podcast " )
page_description = self . _html_search_meta ( [ ' description ' , ' og:description ' , ' twitter:description ' ] , webpage , default = None )
data = self . _search_regex (
# If one episode
r ' window.__APP_STATE__ \ s*= \ s*([ " \' ]? { .+?}[ " \' ]?);</script> ' , webpage , ' app state ' )
if audio_id :
episodes = [ self . _download_json ( " https://api.podchaser.com/episodes/ %s " % audio_id , audio_id ) ]
while isinstance ( data , compat_str ) :
# Else get every episode available
data = self . _parse_json ( data , audio_id )
else :
total_episode_count = self . _download_json (
" https://api.podchaser.com/list/episode " , podcast_id ,
headers = { ' Content-Type ' : ' application/json;charset=utf-8 ' } ,
data = json . dumps ( {
" filters " : { " podcast_id " : podcast_id }
} ) . encode ( ) ) . get ( ' total ' )
episodes = [ ]
print ( total_episode_count )
for i in range ( total_episode_count / / 100 + 1 ) :
curr_episodes_data = self . _download_json (
" https://api.podchaser.com/list/episode " , podcast_id ,
headers = { ' Content-Type ' : ' application/json;charset=utf-8 ' } ,
data = json . dumps ( {
" start " : i * 100 ,
" count " : ( i + 1 ) * 100 ,
" sort_order " : " SORT_ORDER_RECENT " ,
" filters " : {
" podcast_id " : podcast_id
} , " options " : { }
} ) . encode ( ) )
curr_episodes = curr_episodes_data . get ( ' entities ' ) or [ ]
if len ( curr_episodes ) + len ( episodes ) < = total_episode_count :
episodes . extend ( curr_episodes )
episodes = try_get ( data , lambda x : x [ ' podcast ' ] [ ' episodes ' ] [ ' entities ' ] , dict ) or { }
podcast_data = merge_dicts (
episode_list = [ ( episodes . get ( episode_id ) , episode_id ) for episode_id in episodes ]
self . _download_json ( " https://api.podchaser.com/podcasts/ %s " % podcast_id , audio_id or podcast_id ) or { } ,
episodes [ 0 ] . get ( ' podcast ' ) or { } if episodes else { } )
entries = [ {
entries = [ {
' id ' : episode_id ,
' id ' : compat_str( episode . get ( ' id ' ) ) ,
' title ' : episode . get ( ' title ' ) ,
' title ' : episode . get ( ' title ' ) ,
' description ' : episode . get ( ' description ' ) ,
' description ' : episode . get ( ' description ' ) ,
' url ' : episode . get ( ' audio_url ' ) ,
' url ' : episode . get ( ' audio_url ' ) ,
' thumbnail ' : try_get ( episode , lambda x : x [ ' podcast ' ] [ ' image_url ' ] ) ,
' thumbnail ' : episode. get ( ' image_url ' ) ,
' duration ' : str_to_int ( episode . get ( ' length ' ) ) ,
' duration ' : str_to_int ( episode . get ( ' length ' ) ) ,
' timestamp ' : unified_timestamp ( episode . get ( ' air_date ' ) ) ,
' timestamp ' : unified_timestamp ( episode . get ( ' air_date ' ) ) ,
' rating ' : float_or_none ( episode . get ( ' rating ' ) ) ,
' rating ' : float_or_none ( episode . get ( ' rating ' ) ) ,
' categories ' : [ x [ ' text ' ] for x in try_get ( episode , lambda x : x [ ' podcast ' ] [ ' categories ' ] , list ) or [ ] ] ,
' categories ' : [
' tags ' : [ tag [ ' text ' ] for tag in episode . get ( ' tags ' ) or [ ] ] ,
x . get ( ' text ' ) for x in
' series ' : try_get ( episode , lambda x : x [ ' podcast ' ] [ ' title ' ] , compat_str ) ,
podcast_data . get ( ' categories ' )
} for episode , episode_id in episode_list ]
or try_get ( podcast_data , lambda x : x [ ' summary ' ] [ ' categories ' ] , list ) or [ ] ] ,
' tags ' : [ tag . get ( ' text ' ) for tag in podcast_data . get ( ' tags ' ) or [ ] ] ,
' series ' : podcast_data . get ( ' title ' ) ,
} for episode in episodes ]
if len ( entries ) > 1 :
if len ( entries ) > 1 :
# Return playlist
return self . playlist_result (
return self . playlist_result (
entries , playlist_id = ( creator_id or podcast_id ) , playlist_title = page_title ,
entries , playlist_id = compat_str ( podcast_data . get ( ' id ' ) ) ,
playlist_description = page_description )
playlist_title = podcast_data . get ( ' title ' ) ,
playlist_description = podcast_data . get ( ' description ' ) )
# Return episode
return entries [ 0 ]
return entries [ 0 ]