[ie/youtube] Make nsig extraction more robust

Authored by: bashonly
pull/12761/head
bashonly 4 months ago
parent 74a2ee7508
commit 0f8bb067e3
No known key found for this signature in database
GPG Key ID: 783F096F253D15B0

@ -327,26 +327,26 @@ def t_factory(name, sig_func, url_pattern):
urllib.request.urlretrieve(url, fn) urllib.request.urlretrieve(url, fn)
with open(fn, encoding='utf-8') as testf: with open(fn, encoding='utf-8') as testf:
jscode = testf.read() jscode = testf.read()
self.assertEqual(sig_func(jscode, sig_input), expected_sig) self.assertEqual(sig_func(jscode, sig_input, url), expected_sig)
test_func.__name__ = f'test_{name}_js_{test_id}' test_func.__name__ = f'test_{name}_js_{test_id}'
setattr(TestSignature, test_func.__name__, test_func) setattr(TestSignature, test_func.__name__, test_func)
return make_tfunc return make_tfunc
def signature(jscode, sig_input): def signature(jscode, sig_input, player_url):
func = YoutubeIE(FakeYDL())._parse_sig_js(jscode) func = YoutubeIE(FakeYDL())._parse_sig_js(jscode, player_url)
src_sig = ( src_sig = (
str(string.printable[:sig_input]) str(string.printable[:sig_input])
if isinstance(sig_input, int) else sig_input) if isinstance(sig_input, int) else sig_input)
return func(src_sig) return func(src_sig)
def n_sig(jscode, sig_input): def n_sig(jscode, sig_input, player_url):
ie = YoutubeIE(FakeYDL()) ie = YoutubeIE(FakeYDL())
funcname = ie._extract_n_function_name(jscode) funcname = ie._extract_n_function_name(jscode, player_url=player_url)
jsi = JSInterpreter(jscode) jsi = JSInterpreter(jscode)
func = jsi.extract_function_from_code(*ie._fixup_n_function_code(*jsi.extract_function_code(funcname), jscode)) func = jsi.extract_function_from_code(*ie._fixup_n_function_code(*jsi.extract_function_code(funcname), jscode, player_url))
return func([sig_input]) return func([sig_input])

@ -34,6 +34,7 @@ from ...utils import (
clean_html, clean_html,
datetime_from_str, datetime_from_str,
filesize_from_tbr, filesize_from_tbr,
filter_dict,
float_or_none, float_or_none,
format_field, format_field,
get_first, get_first,
@ -1991,7 +1992,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if not cache_spec: if not cache_spec:
code = self._load_player(video_id, player_url) code = self._load_player(video_id, player_url)
if code: if code:
res = self._parse_sig_js(code) res = self._parse_sig_js(code, player_url)
test_string = ''.join(map(chr, range(len(example_sig)))) test_string = ''.join(map(chr, range(len(example_sig))))
cache_spec = [ord(c) for c in res(test_string)] cache_spec = [ord(c) for c in res(test_string)]
self.cache.store('youtube-sigfuncs', func_id, cache_spec) self.cache.store('youtube-sigfuncs', func_id, cache_spec)
@ -2039,7 +2040,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
f' return {expr_code}\n') f' return {expr_code}\n')
self.to_screen('Extracted signature function:\n' + code) self.to_screen('Extracted signature function:\n' + code)
def _parse_sig_js(self, jscode): def _parse_sig_js(self, jscode, player_url):
# Examples where `sig` is funcname: # Examples where `sig` is funcname:
# sig=function(a){a=a.split(""); ... ;return a.join("")}; # sig=function(a){a=a.split(""); ... ;return a.join("")};
# ;c&&(c=sig(decodeURIComponent(c)),a.set(b,encodeURIComponent(c)));return a}; # ;c&&(c=sig(decodeURIComponent(c)),a.set(b,encodeURIComponent(c)));return a};
@ -2063,12 +2064,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('), r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
jscode, 'Initial JS player signature function name', group='sig') jscode, 'Initial JS player signature function name', group='sig')
varname, global_list = self._interpret_player_js_global_var(jscode, player_url)
jsi = JSInterpreter(jscode) jsi = JSInterpreter(jscode)
global_var_map = {} initial_function = jsi.extract_function(funcname, filter_dict({varname: global_list}))
_, varname, value = self._extract_player_js_global_var(jscode)
if varname:
global_var_map[varname] = jsi.interpret_expression(value, {}, allow_recursion=100)
initial_function = jsi.extract_function(funcname, global_var_map)
return lambda s: initial_function([s]) return lambda s: initial_function([s])
def _cached(self, func, *cache_id): def _cached(self, func, *cache_id):
@ -2150,20 +2148,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return ret return ret
def _extract_n_function_name(self, jscode, player_url=None): def _extract_n_function_name(self, jscode, player_url=None):
_, varname, global_arr = self._extract_player_js_global_var(jscode) varname, global_list = self._interpret_player_js_global_var(jscode, player_url)
if global_arr: if debug_str := traverse_obj(global_list, (lambda _, v: v.endswith('_w8_'), any)):
jsi = JSInterpreter(global_arr) return self._search_regex(
global_list = jsi.interpret_expression(global_arr, {}, allow_recursion=100)
debug_str = traverse_obj(global_list, (lambda _, v: v.endswith('_w8_'), any))
if debug_str and (funcname := self._search_regex(
r'''(?xs) r'''(?xs)
[;\n](?P<funcname>[a-zA-Z0-9_$]+)\s*=\s*function\s*\([a-zA-Z0-9_$]+\)\s*\{ [;\n](?:
(?:(?!\};\s*[a-zA-Z0-9_$]+\s*=\s*function).)+ (?P<f>function\s+)|
(?:var\s+)?
)(?P<funcname>[a-zA-Z0-9_$]+)\s*(?(f)|=\s*function\s*)
\((?P<argname>[a-zA-Z0-9_$]+)\)\s*\{
(?:(?!\}[;\n]).)+
\}\s*catch\(\s*[a-zA-Z0-9_$]+\s*\)\s* \}\s*catch\(\s*[a-zA-Z0-9_$]+\s*\)\s*
\{\s*return\s+%s\[%i\][^}]+\}\s*return\s+[^};]+\}; \{\s*return\s+%s\[%d\]\s*\+\s*(?P=argname)\s*\}\s*return\s+[^}]+\}[;\n]
''' % (re.escape(varname), global_list.index(debug_str)), ''' % (re.escape(varname), global_list.index(debug_str)),
jscode, 'n function name', group='funcname', default=None)): jscode, 'nsig function name', group='funcname')
return funcname
# Examples (with placeholders nfunc, narray, idx): # Examples (with placeholders nfunc, narray, idx):
# * .get("n"))&&(b=nfunc(b) # * .get("n"))&&(b=nfunc(b)
@ -2207,9 +2205,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
rf'var {re.escape(funcname)}\s*=\s*(\[.+?\])\s*[,;]', jscode, rf'var {re.escape(funcname)}\s*=\s*(\[.+?\])\s*[,;]', jscode,
f'Initial JS player n function list ({funcname}.{idx})')))[int(idx)] f'Initial JS player n function list ({funcname}.{idx})')))[int(idx)]
def _extract_player_js_global_var(self, jscode): def _extract_player_js_global_var(self, jscode, player_url):
"""Returns tuple of strings: variable assignment code, variable name, variable value code""" """Returns tuple of strings: variable assignment code, variable name, variable value code"""
return self._search_regex( extract_global_var = self._cached(self._search_regex, 'js global array', player_url)
varcode, varname, varvalue = extract_global_var(
r'''(?x) r'''(?x)
(?P<q1>["\'])use\s+strict(?P=q1);\s* (?P<q1>["\'])use\s+strict(?P=q1);\s*
(?P<code> (?P<code>
@ -2221,18 +2220,41 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
) )
)[;,] )[;,]
''', jscode, 'global variable', group=('code', 'name', 'value'), default=(None, None, None)) ''', jscode, 'global variable', group=('code', 'name', 'value'), default=(None, None, None))
if not varcode:
def _fixup_n_function_code(self, argnames, code, full_code): self.write_debug(join_nonempty(
global_var, varname, _ = self._extract_player_js_global_var(full_code) 'No global array variable found in player JS',
if global_var: player_url and f' player = {player_url}', delim='\n'), only_once=True)
self.write_debug(f'Prepending n function code with global array variable "{varname}"') return varcode, varname, varvalue
code = global_var + '; ' + code
def _interpret_player_js_global_var(self, jscode, player_url):
    """Evaluate the player's global array variable.

    The variable is located with `_extract_player_js_global_var`, and its
    value code is then evaluated by a `JSInterpreter` built from that same
    value code. The evaluation call is wrapped by `self._cached` using the
    'js global list' key together with `player_url`.

    Returns tuple of: variable name string, variable value list
    """
    # The first element (the full assignment code) is not needed here
    _assignment_code, name, value_code = self._extract_player_js_global_var(jscode, player_url)
    # NOTE(review): the extraction falls back to (None, None, None), so
    # value_code may be None here -- confirm JSInterpreter tolerates that
    evaluate = self._cached(
        JSInterpreter(value_code).interpret_expression, 'js global list', player_url)
    global_list = evaluate(value_code, {}, allow_recursion=1)
    return name, global_list
def _fixup_n_function_code(self, argnames, nsig_code, jscode, player_url):
varcode, varname, _ = self._extract_player_js_global_var(jscode, player_url)
if varcode and varname:
nsig_code = varcode + '; ' + nsig_code
_, global_list = self._interpret_player_js_global_var(jscode, player_url)
else: else:
self.write_debug('No global array variable found in player JS')
varname = 'dlp_wins' varname = 'dlp_wins'
return argnames, re.sub( global_list = []
rf';\s*if\s*\(\s*typeof\s+[a-zA-Z0-9_$]+\s*===?\s*(?:(["\'])undefined\1|{re.escape(varname)}\[\d+\])\s*\)\s*return\s+{re.escape(argnames[0])};',
';', code) undefined_idx = global_list.index('undefined') if 'undefined' in global_list else r'\d+'
fixed_code = re.sub(
rf'''(?x)
;\s*if\s*\(\s*typeof\s+[a-zA-Z0-9_$]+\s*===?\s*(?:
(["\'])undefined\1|
{re.escape(varname)}\[{undefined_idx}\]
)\s*\)\s*return\s+{re.escape(argnames[0])};
''', ';', nsig_code)
if fixed_code == nsig_code:
self.write_debug(join_nonempty(
'No typeof statement found in nsig function code',
player_url and f' player = {player_url}', delim='\n'), only_once=True)
return argnames, fixed_code
def _extract_n_function_code(self, video_id, player_url): def _extract_n_function_code(self, video_id, player_url):
player_id = self._extract_player_info(player_url) player_id = self._extract_player_info(player_url)
@ -2246,7 +2268,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
func_name = self._extract_n_function_name(jscode, player_url=player_url) func_name = self._extract_n_function_name(jscode, player_url=player_url)
# XXX: Workaround for the global array variable and lack of `typeof` implementation # XXX: Workaround for the global array variable and lack of `typeof` implementation
func_code = self._fixup_n_function_code(*jsi.extract_function_code(func_name), jscode) func_code = self._fixup_n_function_code(*jsi.extract_function_code(func_name), jscode, player_url)
return jsi, player_id, func_code return jsi, player_id, func_code

Loading…
Cancel
Save