From d9d07a95815a992bf5f876a62f25c831eb3f32ac Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 3 May 2023 12:06:34 +0100 Subject: [PATCH] [utils] Improve js_to_json, align with yt-dlp * support variable substitution, from https://github.com/yt-dlp/yt-dlp/pull/#521 etc, thanks ChillingPepper, Grub4k, pukkandan * improve escape handling, from https://github.com/yt-dlp/yt-dlp/pull/#521 thanks Grub4k * support template strings from https://github.com/yt-dlp/yt-dlp/pull/6623 thanks Grub4k * add limited `!` evaluation (eg, !!0 -> false, see tests) --- test/test_utils.py | 103 +++++++++++++++++++++++++++++++++++++-- youtube_dl/utils.py | 114 ++++++++++++++++++++++++++++++++++---------- 2 files changed, 187 insertions(+), 30 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 1b5d170fe..e83977f29 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -905,6 +905,85 @@ class TestUtil(unittest.TestCase): ) self.assertEqual(escape_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0') + def test_js_to_json_vars_strings(self): + self.assertDictEqual( + json.loads(js_to_json( + '''{ + 'null': a, + 'nullStr': b, + 'true': c, + 'trueStr': d, + 'false': e, + 'falseStr': f, + 'unresolvedVar': g, + }''', + { + 'a': 'null', + 'b': '"null"', + 'c': 'true', + 'd': '"true"', + 'e': 'false', + 'f': '"false"', + 'g': 'var', + } + )), + { + 'null': None, + 'nullStr': 'null', + 'true': True, + 'trueStr': 'true', + 'false': False, + 'falseStr': 'false', + 'unresolvedVar': 'var' + } + ) + + self.assertDictEqual( + json.loads(js_to_json( + '''{ + 'int': a, + 'intStr': b, + 'float': c, + 'floatStr': d, + }''', + { + 'a': '123', + 'b': '"123"', + 'c': '1.23', + 'd': '"1.23"', + } + )), + { + 'int': 123, + 'intStr': '123', + 'float': 1.23, + 'floatStr': '1.23', + } + ) + + self.assertDictEqual( + json.loads(js_to_json( + '''{ + 'object': a, + 'objectStr': b, + 'array': c, + 'arrayStr': d, + }''', + { + 'a': '{}', + 'b': '"{}"', + 'c': '[]', + 'd': '"[]"', + } + )), + { + 'object': {}, + 'objectStr': '{}', + 'array': [], + 'arrayStr': '[]', + } + ) + def test_js_to_json_realworld(self): inp = '''{ 'clip':{'provider':'pseudo'} @@ -975,10 +1054,10 @@ class TestUtil(unittest.TestCase): !42: 42 }''') self.assertEqual(json.loads(on), { - 'a': 0, - 'b': 1, - 'c': 0, - 'd': 42.42, + 'a': True, + 'b': False, + 'c': False, + 'd': True, 'e': [], 'f': "abc", 'g': "", @@ -1048,10 +1127,26 @@ class TestUtil(unittest.TestCase): on = js_to_json('{ "040": "040" }') self.assertEqual(json.loads(on), {'040': '040'}) + on = js_to_json('[1,//{},\n2]') + self.assertEqual(json.loads(on), [1, 2]) + + on = js_to_json(r'"\^\$\#"') + self.assertEqual(json.loads(on), R'^$#', msg='Unnecessary escapes should be stripped') + + on = js_to_json('\'"\\""\'') + self.assertEqual(json.loads(on), '"""', msg='Unnecessary quote escape should be escaped') + def test_js_to_json_malformed(self): self.assertEqual(js_to_json('42a1'), '42"a1"') self.assertEqual(js_to_json('42a-1'), '42"a"-1') + def test_js_to_json_template_literal(self): + self.assertEqual(js_to_json('`Hello ${name}`', {'name': '"world"'}), '"Hello world"') + self.assertEqual(js_to_json('`${name}${name}`', {'name': '"X"'}), '"XX"') + self.assertEqual(js_to_json('`${name}${name}`', {'name': '5'}), '"55"') + self.assertEqual(js_to_json('`${name}"${name}"`', {'name': '5'}), '"5\\"5\\""') + self.assertEqual(js_to_json('`${name}`', {}), '"name"') + def test_extract_attributes(self): self.assertEqual(extract_attributes(''), {'x': 'y'}) self.assertEqual(extract_attributes(""), {'x': 'y'}) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index b77a7fb0e..b05f65283 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -4365,46 +4365,108 @@ def strip_jsonp(code): r'\g', code) -def js_to_json(code): - COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*' +def js_to_json(code, *args, **kwargs): + + # vars is a dict of (var, val) pairs to substitute + vars = args[0] if len(args) > 0 else kwargs.get('vars', {}) + strict = kwargs.get('strict', False) + + STRING_QUOTES = '\'"`' + STRING_RE = '|'.join(r'{0}(?:\\.|[^\\{0}])*{0}'.format(q) for q in STRING_QUOTES) + COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n' SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE) INTEGER_TABLE = ( (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16), (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8), + (r'(?s)^(\d+){skip}:?$'.format(skip=SKIP_RE), 10), ) + # compat candidate + JSONDecodeError = json.JSONDecodeError if 'JSONDecodeError' in dir(json) else ValueError + + def process_escape(match): + JSON_PASSTHROUGH_ESCAPES = r'"\bfnrtu' + escape = match.group(1) or match.group(2) + + return ('\\' + escape if escape in JSON_PASSTHROUGH_ESCAPES + else '\\u00' if escape == 'x' + else '' if escape == '\n' + else escape) + + def template_substitute(match): + evaluated = js_to_json(match.group(1), vars, strict=strict) + if evaluated[0] == '"': + return json.loads(evaluated) + return evaluated def fix_kv(m): v = m.group(0) if v in ('true', 'false', 'null'): return v - elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',': - return "" - - if v[0] in ("'", '"'): - v = re.sub(r'(?s)\\.|"', lambda m: { - '"': '\\"', - "\\'": "'", - '\\\n': '', - '\\x': '\\u00', - }.get(m.group(0), m.group(0)), v[1:-1]) - else: - for regex, base in INTEGER_TABLE: - im = re.match(regex, v) - if im: - i = int(im.group(1), base) - return '"%d":' % i if v.endswith(':') else '%d' % i + elif v in ('undefined', 'void 0'): + return 'null' + elif v.startswith('/*') or v.startswith('//') or v == ',': + return '' + + if v[0] in STRING_QUOTES: + v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1] + escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v) + return '"{0}"'.format(escaped) + + inv = IDENTITY + im = re.split(r'^!+', v) + if len(im) > 1 and not im[-1].endswith(':'): + if (len(v) - len(im[1])) % 2 == 1: + inv = lambda x: 'true' if x == 0 else 'false' + else: + inv = lambda x: 'false' if x == 0 else 'true' + if not any(x for x in im): + return + v = im[-1] + + for regex, base in INTEGER_TABLE: + im = re.match(regex, v) + if im: + i = int(im.group(1), base) + return ('"%s":' if v.endswith(':') else '%s') % inv(i) + + if v in vars: + try: + if not strict: + json.loads(vars[v]) + except JSONDecodeError: + return inv(json.dumps(vars[v])) + else: + return inv(vars[v]) + + if not strict: + v = try_call(inv, args=(v,), default=v) + if v in ('true', 'false'): + return v + return '"{0}"'.format(v) + + raise ValueError('Unknown value: ' + v) + + def create_map(mobj): + return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars)))) - return '"%s"' % v + code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code) + if not strict: + code = re.sub(r'new Date\((".+")\)', r'\g<1>', code) + code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code) + code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code) + code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code) return re.sub(r'''(?sx) - "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| - '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'| - {comment}|,(?={skip}[\]}}])| - (?:(?