From cf738860b77c46d130b736eadf067cbfa862346c Mon Sep 17 00:00:00 2001 From: Yuan-Yi Chang Date: Thu, 25 Sep 2025 08:55:33 +0800 Subject: [PATCH] [ie/youtube] Allow fallback parsing for global nsig helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This makes _extract_js_function a true “last-resort” parser while the normal path keeps depending on JSInterpreter.extract_function_code. - _fixup_n_function_code now tries to extract each helper's argument list and body with JSInterpreter(jscode).extract_function_code(...) first; only when that raises JSInterpreter.Exception do we fall back to the legacy _extract_js_function(...) string parser. Successful inlines keep the existing debug and caching behaviour (yt_dlp/extractor/youtube/_video.py:2416-2440). - _extract_js_function is reduced to a pure fallback: it locates `function foo(...) {...}` declarations and `var|let|const foo = function(...) {...}` assignments with a regex, finds the end of the body by brace matching, and no longer re-enters JSInterpreter, avoiding duplicated work (yt_dlp/extractor/youtube/_video.py around line 2449). Global Helper Inline Loop We also rely on the `while True` retry loop inside _fixup_n_function_code to progressively inline missing dependencies: 1. Execute the candidate nsig implementation via JSInterpreter. If nothing is missing, the loop exits immediately. 2. When execution raises errors such as “Could not find object CI”, we capture the helper name from the message, extract that helper (CI, OB5, nq5, …), and prepend `var helper = function(...) { ... };` to fixed_code. 3. With the helper injected we retry the interpreter; if another dependency is missing we repeat this process until no more “Could not find …” errors occur (or an unhandled exception surfaces). 4. After the loop, any residual names collected in jsi._undefined_varnames are handled by the follow-up `for func_name in global_funcnames:` loop, which inlines whatever helpers the interpreter still flagged. 
Together these steps guarantee that all global functions referenced by the nsig routine are made available before execution, giving the interpreter a stable environment even when YouTube reshuffles helper definitions. --- yt_dlp/extractor/youtube/_video.py | 167 +++++++++++++++++++++++++---- 1 file changed, 149 insertions(+), 18 deletions(-) diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index 6dba724cee..8682248f29 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -2271,14 +2271,39 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?(var),[a-zA-Z0-9_$]+\.set\((?:"n+"|[a-zA-Z0-9_$]+)\,(?P=var)\))''', jscode, 'n function name', group=('nfunc', 'idx'), default=(None, None)) if not funcname: - self.report_warning(join_nonempty( - 'Falling back to generic n function search', - player_url and f' player = {player_url}', delim='\n'), only_once=True) - return self._search_regex( - r'''(?xs) - ;\s*(?P[a-zA-Z0-9_$]+)\s*=\s*function\([a-zA-Z0-9_$]+\) - \s*\{(?:(?!};).)+?return\s*(?P["'])[\w-]+_w8_(?P=q)\s*\+\s*[a-zA-Z0-9_$]+''', - jscode, 'Initial JS player n function name', group='name') + inline_pattern = block_pattern = None + set_idx = next((i for i, v in enumerate(global_list or []) if v == 'set'), None) + n_idx = next((i for i, v in enumerate(global_list or []) if v == 'n'), None) + if set_idx is not None and n_idx is not None: + inline_pattern = rf'''(?sx) + if\s*\([^)]*\)\s* + (?P[A-Za-z0-9_$]+)\s*=\s* + (?P[A-Za-z0-9_$]+)\[(?P\d+)\]\(\s*(?P=var)\s*\) + \s*,\s* + [A-Za-z0-9_$]+\[\s*{re.escape(varname)}\[{set_idx}\]\s*\]\(\s*{re.escape(varname)}\[{n_idx}\]\s*,\s*(?P=var)\s*\) + ''' + block_pattern = rf'''(?sx) + (?Pif\s*\([^)]*\)\s*\{{\s*)? + (?P[A-Za-z0-9_$]+)\s*=\s* + (?P[A-Za-z0-9_$]+)\[(?P\d+)\]\(\s*(?P=var)\s*\) + \s*;\s* + [A-Za-z0-9_$]+\[\s*{re.escape(varname)}\[{set_idx}\]\s*\]\(\s*{re.escape(varname)}\[{n_idx}\]\s*,\s*(?P=var)\s*\) + (?:\s*;?\s*\}})? 
+ ''' + for pattern in (inline_pattern, block_pattern): + match = pattern and re.search(pattern, jscode) + if match: + funcname, idx = match.group('array', 'idx') + break + if not funcname: + self.report_warning(join_nonempty( + 'Falling back to generic n function search', + player_url and f' player = {player_url}', delim='\n'), only_once=True) + return self._search_regex( + r'''(?xs) + ;\s*(?P[a-zA-Z0-9_$]+)\s*=\s*function\([a-zA-Z0-9_$]+\) + \s*\{(?:(?!};).)+?return\s*(?P["'])[\w-]+_w8_(?P=q)\s*\+\s*[a-zA-Z0-9_$]+''', + jscode, 'Initial JS player n function name', group='name') elif not idx: return funcname @@ -2301,6 +2326,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): ) )[;,] ''', jscode, 'global variable', group=('code', 'name', 'value'), default=(None, None, None)) + if not varcode: + # Fallback: search without requiring a nearby 'use strict' + varcode, varname, varvalue = self._search_regex( + r'''(?x) + var\s+(?P[a-zA-Z0-9_$]+)\s*=\s* + (?P + (?P["\'])(?:(?!(?P=q2)).|\\.)+(?P=q2) + \.split\((?P["\'])(?:(?!(?P=q3)).)+(?P=q3)\) + |\[\s*(?:(?P["\'])(?:(?!(?P=q4)).|\\.)*(?P=q4)\s*,?\s*)+\] + ) + [;,] + ''', jscode, 'global variable (loose)', group=('value', 'name', 'value'), default=(None, None, None)) + if varcode: + # Reconstruct minimal code for JSInterpreter context + varcode = f'var {varname}={varcode};' if not varcode: self.write_debug(join_nonempty( 'No global array variable found in player JS', @@ -2335,32 +2375,123 @@ class YoutubeIE(YoutubeBaseInfoExtractor): player_url and f' player = {player_url}', delim='\n'), only_once=True) # Fixup global funcs - jsi = JSInterpreter(fixed_code) cache_id = (self._NSIG_FUNC_CACHE_ID, player_url) - try: - self._cached( - self._extract_n_function_from_code, *cache_id)(jsi, (argnames, fixed_code))(self._DUMMY_STRING) - except JSInterpreter.Exception: - self._player_cache.pop(cache_id, None) + inlined_globals = set() + + while True: + jsi = JSInterpreter(fixed_code) + try: + self._cached( + 
self._extract_n_function_from_code, *cache_id)(jsi, (argnames, fixed_code))(self._DUMMY_STRING) + except JSInterpreter.Exception as exc: + self._player_cache.pop(cache_id, None) + + missing_name = None + msg = getattr(exc, 'orig_msg', None) or str(exc) + preview = fixed_code.replace('\n', ' ')[:400] + self.write_debug(join_nonempty( + 'nsig inline execution failed; attempting helper injection', + f' error = {msg}', + f' fixed_code head = {preview}...', + player_url and f' player = {player_url}', delim='\n')) + match = re.search(r'Could not find (?:function "(?P[a-zA-Z0-9_$]+)"|object (?P[a-zA-Z0-9_$]+))', msg) + if match: + missing_name = match.group('func') or match.group('obj') + + if not missing_name or missing_name in inlined_globals: + if not missing_name: + self.write_debug(join_nonempty( + 'Helper lookup gave no candidate; aborting inline retry', + player_url and f' player = {player_url}', delim='\n')) + break + + try: + func_args, func_code = JSInterpreter(jscode).extract_function_code(missing_name) + except JSInterpreter.Exception: + try: + func_args, func_code = self._extract_js_function(jscode, missing_name) + except JSInterpreter.Exception: + self.write_debug(join_nonempty( + f'Unable to extract helper {missing_name} body from player JS', + player_url and f' player = {player_url}', delim='\n')) + break + except Exception: + self.write_debug(join_nonempty( + f'Unexpected failure extracting helper {missing_name} from player JS', + player_url and f' player = {player_url}', delim='\n')) + break + + self.write_debug(join_nonempty( + f'Inlining global nsig helper {missing_name}', + f' body = {func_code}', + player_url and f' player = {player_url}', delim='\n')) + fixed_code = f'var {missing_name} = function({", ".join(func_args)}) {{ {func_code} }}; {fixed_code}' + inlined_globals.add(missing_name) + continue + else: + break - global_funcnames = jsi._undefined_varnames - debug_names = [] + global_funcnames = jsi._undefined_varnames | inlined_globals + 
debug_names = sorted(inlined_globals) jsi = JSInterpreter(jscode) for func_name in global_funcnames: + if func_name in inlined_globals: + continue try: func_args, func_code = jsi.extract_function_code(func_name) - fixed_code = f'var {func_name} = function({", ".join(func_args)}) {{ {func_code} }}; {fixed_code}' - debug_names.append(func_name) + except JSInterpreter.Exception: + try: + func_args, func_code = self._extract_js_function(jscode, func_name) + except Exception: + self.report_warning(join_nonempty( + f'Unable to extract global nsig function {func_name} from player JS', + player_url and f' player = {player_url}', delim='\n'), only_once=True) + continue except Exception: self.report_warning(join_nonempty( f'Unable to extract global nsig function {func_name} from player JS', player_url and f' player = {player_url}', delim='\n'), only_once=True) + continue + + fixed_code = f'var {func_name} = function({", ".join(func_args)}) {{ {func_code} }}; {fixed_code}' + debug_names.append(func_name) if debug_names: self.write_debug(f'Extracted global nsig functions: {", ".join(debug_names)}') return argnames, fixed_code + def _extract_js_function(self, jscode, func_name): + pattern = re.compile(rf'''(?x) + (?P + function\s+{re.escape(func_name)}\s*\((?P[^)]*)\)| + (?[^)]*)\)| + (?:var|const|let)\s+{re.escape(func_name)}\s*=\s*function\s*\((?P[^)]*)\) + ) + ''') + match = pattern.search(jscode) + if not match: + raise JSInterpreter.Exception(f'Could not parse function {func_name}') + + args = next(filter(None, match.group('args_decl', 'args_expr', 'args_var')), '') + brace_start = jscode.find('{', match.end()) + if brace_start == -1: + raise JSInterpreter.Exception(f'Could not parse function {func_name}') + + depth = 0 + for idx in range(brace_start, len(jscode)): + ch = jscode[idx] + if ch == '{': + depth += 1 + elif ch == '}': + depth -= 1 + if depth == 0: + func_code = jscode[brace_start + 1:idx] + arg_list = [a.strip() for a in args.split(',') if a.strip()] + return 
arg_list, func_code + + raise JSInterpreter.Exception(f'Could not parse function {func_name}') + def _extract_n_function_code(self, video_id, player_url): player_id = self._extract_player_info(player_url) func_code = self._load_player_data_from_cache('nsig', player_url)