[ie/youtube] Allow fallback parsing for global nsig helpers

This makes _extract_js_function a true “last-resort” parser while the normal path keeps depending on JSInterpreter.extract_function_code.

  - _fixup_n_function_code now tries to extract helper signatures with JSInterpreter(jscode).extract_function_code(...) first; only when that raises JSInterpreter.Exception do we fall back to the legacy _extract_js_function(...) string parser. Successful inlines keep the existing debug and caching behaviour (yt_dlp/extractor/youtube/_video.py:2416-2440).
  - _extract_js_function is reduced to a pure fallback: it locates function foo(...) {...}, foo = function(...) {...} and var|let|const foo = function(...) {...} definitions with a regex, extracts the body by brace matching, and no longer re-enters JSInterpreter, avoiding duplicated work (yt_dlp/extractor/youtube/_video.py around line 2449).

Global Helper Inline Loop

  We also rely on the while True retry loop inside _fixup_n_function_code to progressively inline missing dependencies:

  1. Execute the candidate nsig implementation via JSInterpreter. If nothing is missing, the loop exits immediately.
  2. When execution raises errors such as “Could not find object CI”, we capture the helper name from the message, extract that helper (CI, OB5, nq5, …), and prepend var helper = function(...) { ... }; to fixed_code.
  3. With the helper injected we retry the interpreter; if another dependency is missing we repeat this process until no more Could not find … errors occur (or an unhandled exception surfaces).
  4. After the loop, any residual names collected in jsi._undefined_varnames are handled by the follow-up "for func_name in global_funcnames:" loop, which inlines whatever helpers the interpreter still flagged; names already inlined by the retry loop are skipped.

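Together these steps make every global function referenced by the nsig routine available before execution, provided its definition can be extracted from the player JS, giving the interpreter a stable environment even when YouTube reshuffles helper definitions. If a helper cannot be extracted, the retry loop stops and the failure is logged.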
pull/14433/head
Yuan-Yi Chang 2 days ago
parent 4429fd0450
commit cf738860b7

@ -2270,6 +2270,31 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
)(?P<nfunc>[a-zA-Z0-9_$]+)(?:\[(?P<idx>\d+)\])?\([a-zA-Z]\) )(?P<nfunc>[a-zA-Z0-9_$]+)(?:\[(?P<idx>\d+)\])?\([a-zA-Z]\)
(?(var),[a-zA-Z0-9_$]+\.set\((?:"n+"|[a-zA-Z0-9_$]+)\,(?P=var)\))''', (?(var),[a-zA-Z0-9_$]+\.set\((?:"n+"|[a-zA-Z0-9_$]+)\,(?P=var)\))''',
jscode, 'n function name', group=('nfunc', 'idx'), default=(None, None)) jscode, 'n function name', group=('nfunc', 'idx'), default=(None, None))
if not funcname:
inline_pattern = block_pattern = None
set_idx = next((i for i, v in enumerate(global_list or []) if v == 'set'), None)
n_idx = next((i for i, v in enumerate(global_list or []) if v == 'n'), None)
if set_idx is not None and n_idx is not None:
inline_pattern = rf'''(?sx)
if\s*\([^)]*\)\s*
(?P<var>[A-Za-z0-9_$]+)\s*=\s*
(?P<array>[A-Za-z0-9_$]+)\[(?P<idx>\d+)\]\(\s*(?P=var)\s*\)
\s*,\s*
[A-Za-z0-9_$]+\[\s*{re.escape(varname)}\[{set_idx}\]\s*\]\(\s*{re.escape(varname)}\[{n_idx}\]\s*,\s*(?P=var)\s*\)
'''
block_pattern = rf'''(?sx)
(?P<prefix>if\s*\([^)]*\)\s*\{{\s*)?
(?P<var>[A-Za-z0-9_$]+)\s*=\s*
(?P<array>[A-Za-z0-9_$]+)\[(?P<idx>\d+)\]\(\s*(?P=var)\s*\)
\s*;\s*
[A-Za-z0-9_$]+\[\s*{re.escape(varname)}\[{set_idx}\]\s*\]\(\s*{re.escape(varname)}\[{n_idx}\]\s*,\s*(?P=var)\s*\)
(?:\s*;?\s*\}})?
'''
for pattern in (inline_pattern, block_pattern):
match = pattern and re.search(pattern, jscode)
if match:
funcname, idx = match.group('array', 'idx')
break
if not funcname: if not funcname:
self.report_warning(join_nonempty( self.report_warning(join_nonempty(
'Falling back to generic n function search', 'Falling back to generic n function search',
@ -2301,6 +2326,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
) )
)[;,] )[;,]
''', jscode, 'global variable', group=('code', 'name', 'value'), default=(None, None, None)) ''', jscode, 'global variable', group=('code', 'name', 'value'), default=(None, None, None))
if not varcode:
# Fallback: search without requiring a nearby 'use strict'
varcode, varname, varvalue = self._search_regex(
r'''(?x)
var\s+(?P<name>[a-zA-Z0-9_$]+)\s*=\s*
(?P<value>
(?P<q2>["\'])(?:(?!(?P=q2)).|\\.)+(?P=q2)
\.split\((?P<q3>["\'])(?:(?!(?P=q3)).)+(?P=q3)\)
|\[\s*(?:(?P<q4>["\'])(?:(?!(?P=q4)).|\\.)*(?P=q4)\s*,?\s*)+\]
)
[;,]
''', jscode, 'global variable (loose)', group=('value', 'name', 'value'), default=(None, None, None))
if varcode:
# Reconstruct minimal code for JSInterpreter context
varcode = f'var {varname}={varcode};'
if not varcode: if not varcode:
self.write_debug(join_nonempty( self.write_debug(join_nonempty(
'No global array variable found in player JS', 'No global array variable found in player JS',
@ -2335,32 +2375,123 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
player_url and f' player = {player_url}', delim='\n'), only_once=True) player_url and f' player = {player_url}', delim='\n'), only_once=True)
# Fixup global funcs # Fixup global funcs
jsi = JSInterpreter(fixed_code)
cache_id = (self._NSIG_FUNC_CACHE_ID, player_url) cache_id = (self._NSIG_FUNC_CACHE_ID, player_url)
inlined_globals = set()
while True:
jsi = JSInterpreter(fixed_code)
try: try:
self._cached( self._cached(
self._extract_n_function_from_code, *cache_id)(jsi, (argnames, fixed_code))(self._DUMMY_STRING) self._extract_n_function_from_code, *cache_id)(jsi, (argnames, fixed_code))(self._DUMMY_STRING)
except JSInterpreter.Exception: except JSInterpreter.Exception as exc:
self._player_cache.pop(cache_id, None) self._player_cache.pop(cache_id, None)
global_funcnames = jsi._undefined_varnames missing_name = None
debug_names = [] msg = getattr(exc, 'orig_msg', None) or str(exc)
preview = fixed_code.replace('\n', ' ')[:400]
self.write_debug(join_nonempty(
'nsig inline execution failed; attempting helper injection',
f' error = {msg}',
f' fixed_code head = {preview}...',
player_url and f' player = {player_url}', delim='\n'))
match = re.search(r'Could not find (?:function "(?P<func>[a-zA-Z0-9_$]+)"|object (?P<obj>[a-zA-Z0-9_$]+))', msg)
if match:
missing_name = match.group('func') or match.group('obj')
if not missing_name or missing_name in inlined_globals:
if not missing_name:
self.write_debug(join_nonempty(
'Helper lookup gave no candidate; aborting inline retry',
player_url and f' player = {player_url}', delim='\n'))
break
try:
func_args, func_code = JSInterpreter(jscode).extract_function_code(missing_name)
except JSInterpreter.Exception:
try:
func_args, func_code = self._extract_js_function(jscode, missing_name)
except JSInterpreter.Exception:
self.write_debug(join_nonempty(
f'Unable to extract helper {missing_name} body from player JS',
player_url and f' player = {player_url}', delim='\n'))
break
except Exception:
self.write_debug(join_nonempty(
f'Unexpected failure extracting helper {missing_name} from player JS',
player_url and f' player = {player_url}', delim='\n'))
break
self.write_debug(join_nonempty(
f'Inlining global nsig helper {missing_name}',
f' body = {func_code}',
player_url and f' player = {player_url}', delim='\n'))
fixed_code = f'var {missing_name} = function({", ".join(func_args)}) {{ {func_code} }}; {fixed_code}'
inlined_globals.add(missing_name)
continue
else:
break
global_funcnames = jsi._undefined_varnames | inlined_globals
debug_names = sorted(inlined_globals)
jsi = JSInterpreter(jscode) jsi = JSInterpreter(jscode)
for func_name in global_funcnames: for func_name in global_funcnames:
if func_name in inlined_globals:
continue
try: try:
func_args, func_code = jsi.extract_function_code(func_name) func_args, func_code = jsi.extract_function_code(func_name)
fixed_code = f'var {func_name} = function({", ".join(func_args)}) {{ {func_code} }}; {fixed_code}' except JSInterpreter.Exception:
debug_names.append(func_name) try:
func_args, func_code = self._extract_js_function(jscode, func_name)
except Exception:
self.report_warning(join_nonempty(
f'Unable to extract global nsig function {func_name} from player JS',
player_url and f' player = {player_url}', delim='\n'), only_once=True)
continue
except Exception: except Exception:
self.report_warning(join_nonempty( self.report_warning(join_nonempty(
f'Unable to extract global nsig function {func_name} from player JS', f'Unable to extract global nsig function {func_name} from player JS',
player_url and f' player = {player_url}', delim='\n'), only_once=True) player_url and f' player = {player_url}', delim='\n'), only_once=True)
continue
fixed_code = f'var {func_name} = function({", ".join(func_args)}) {{ {func_code} }}; {fixed_code}'
debug_names.append(func_name)
if debug_names: if debug_names:
self.write_debug(f'Extracted global nsig functions: {", ".join(debug_names)}') self.write_debug(f'Extracted global nsig functions: {", ".join(debug_names)}')
return argnames, fixed_code return argnames, fixed_code
def _extract_js_function(self, jscode, func_name):
pattern = re.compile(rf'''(?x)
(?P<prefix>
function\s+{re.escape(func_name)}\s*\((?P<args_decl>[^)]*)\)|
(?<![a-zA-Z0-9_$]){re.escape(func_name)}\s*=\s*function\s*\((?P<args_expr>[^)]*)\)|
(?:var|const|let)\s+{re.escape(func_name)}\s*=\s*function\s*\((?P<args_var>[^)]*)\)
)
''')
match = pattern.search(jscode)
if not match:
raise JSInterpreter.Exception(f'Could not parse function {func_name}')
args = next(filter(None, match.group('args_decl', 'args_expr', 'args_var')), '')
brace_start = jscode.find('{', match.end())
if brace_start == -1:
raise JSInterpreter.Exception(f'Could not parse function {func_name}')
depth = 0
for idx in range(brace_start, len(jscode)):
ch = jscode[idx]
if ch == '{':
depth += 1
elif ch == '}':
depth -= 1
if depth == 0:
func_code = jscode[brace_start + 1:idx]
arg_list = [a.strip() for a in args.split(',') if a.strip()]
return arg_list, func_code
raise JSInterpreter.Exception(f'Could not parse function {func_name}')
def _extract_n_function_code(self, video_id, player_url): def _extract_n_function_code(self, video_id, player_url):
player_id = self._extract_player_info(player_url) player_id = self._extract_player_info(player_url)
func_code = self._load_player_data_from_cache('nsig', player_url) func_code = self._load_player_data_from_cache('nsig', player_url)

Loading…
Cancel
Save