How to convert raw javascript object to a dictionary?
Question:
When screen-scraping some website, I extract data from <script>
tags.
The data I get is not in standard JSON
format. I cannot use json.loads()
.
# from
js_obj = '{x:1, y:2, z:3}'
# to
py_obj = {'x':1, 'y':2, 'z':3}
Currently, I use regex
to transform the raw data to JSON
format.
But I feel pretty bad when I encounter complicated data structure.
Do you have some better solutions?
Answers:
This will likely not work everywhere, but as a start, here’s a simple regex that should convert the keys into quoted strings so you can pass into json.loads. Or is this what you’re already doing?
In[70] : quote_keys_regex = r'([{\s,])(\w+)(:)'
In[71] : re.sub(quote_keys_regex, r'\1"\2"\3', js_obj)
Out[71]: '{"x":1, "y":2, "z":3}'
In[72] : js_obj_2 = '{x:1, y:2, z:{k:3,j:2}}'
In[73] : re.sub(quote_keys_regex, r'\1"\2"\3', js_obj_2)
Out[73]: '{"x":1, "y":2, "z":{"k":3,"j":2}}'
demjson.decode()
import demjson
# from
js_obj = '{x:1, y:2, z:3}'
# to
py_obj = demjson.decode(js_obj)
chompjs.parse_js_object()
import chompjs
# from
js_obj = '{x:1, y:2, z:3}'
# to
py_obj = chompjs.parse_js_object(js_obj)
jsonnet.evaluate_snippet()
import json, _jsonnet
# from
js_obj = '{x:1, y:2, z:3}'
# to
py_obj = json.loads(_jsonnet.evaluate_snippet('snippet', js_obj))
ast.literal_eval()
import ast
# from
js_obj = "{'x':1, 'y':2, 'z':3}"
# to
py_obj = ast.literal_eval(js_obj)
Not including objects
json.loads()
json.loads()
doesn’t accept undefined, you have to change to null
json.loads()
only accepts double quotes
{"foo": 1, "bar": null}
Use this if you are sure that your JavaScript code uses only double quotes around key names.
import json
import re

json_text = """{"foo": 1, "bar": undefined}"""
# JSON has no `undefined`. Rewrite a bare `undefined` value -- one that follows
# a double-quoted key and is followed by `,` or `}` -- to `null`.
# Both the pattern and the replacement are raw strings so that `\s`, `\1` and
# `\2` reach the regex engine instead of being treated as string escapes.
json_text = re.sub(r'("\s*:\s*)undefined(\s*[,}])', r'\1null\2', json_text)
py_obj = json.loads(json_text)
ast.literal_eval()
ast.literal_eval()
doesn’t accept undefined, you have to change to None
ast.literal_eval()
doesn’t accept null, you have to change to None
ast.literal_eval()
doesn’t accept true, you have to change to True
ast.literal_eval()
doesn’t accept false, you have to change to False
ast.literal_eval()
accepts both single and double quotes
{"foo": 1, "bar": None}
or {'foo': 1, 'bar': None}
import ast
import re

js_obj = """{'foo': 1, 'bar': undefined}"""
# ast.literal_eval only understands Python literals, so each JavaScript
# literal is rewritten to a Python one first. A value is only replaced when it
# directly follows a quoted key and colon and is followed by `,` or `}`.
for js_literal, py_literal in (
    ("undefined", "None"),
    ("null", "None"),
    ("NaN", "None"),
    ("true", "True"),
    ("false", "False"),
):
    js_obj = re.sub(
        r"""(['"]\s*:\s*)""" + js_literal + r"""(\s*[,}])""",
        r"\1" + py_literal + r"\2",
        js_obj,
    )
py_obj = ast.literal_eval(js_obj)
If you have node
available on the system, you can ask it to evaluate the javascript expression for you, and print the stringified result. The resulting JSON can then be fed to json.loads
:
def evaluate_javascript(s):
    """Evaluate a JavaScript expression in node.js and return it as a Python object.

    The expression is interpolated into ``console.log(JSON.stringify(...))``,
    the resulting script is piped to ``node -`` on stdin, and the JSON that
    node prints is decoded with ``json.loads``.

    Args:
        s: Source text of a JavaScript expression, e.g. ``'{x:1, y:2, z:3}'``.

    Returns:
        The Python value decoded from node's JSON output.

    Raises:
        OSError: If the ``node`` executable cannot be started.
        subprocess.CalledProcessError: If node exits with a non-zero status
            (for example, when ``s`` is not valid JavaScript).
        json.JSONDecodeError: If node's output is not valid JSON.
    """
    # Imported locally so the snippet is self-contained.
    import json
    import subprocess

    # SECURITY: `s` is executed as code by node. Only pass text from a source
    # you are willing to run on this machine.
    script = f'console.log(JSON.stringify({s}))'
    # check=True surfaces a failed node run as CalledProcessError instead of
    # letting empty stdout reach json.loads. stderr is left attached to the
    # parent so node's own error message stays visible.
    completed = subprocess.run(
        ['node', '-'],
        input=script.encode('utf8'),
        stdout=subprocess.PIPE,
        check=True,
    )
    return json.loads(completed.stdout.decode('utf8'))
Use json5
import json5
js_obj = '{x:1, y:2, z:3}'
py_obj = json5.loads(js_obj)
print(py_obj)
# output
# {'x': 1, 'y': 2, 'z': 3}
When screen-scraping some website, I extract data from <script>
tags.
The data I get is not in standard JSON
format. I cannot use json.loads()
.
# from
js_obj = '{x:1, y:2, z:3}'
# to
py_obj = {'x':1, 'y':2, 'z':3}
Currently, I use regex
to transform the raw data to JSON
format.
But I feel pretty bad when I encounter complicated data structure.
Do you have some better solutions?
This will likely not work everywhere, but as a start, here’s a simple regex that should convert the keys into quoted strings so you can pass into json.loads. Or is this what you’re already doing?
In[70] : quote_keys_regex = r'([{\s,])(\w+)(:)'
In[71] : re.sub(quote_keys_regex, r'\1"\2"\3', js_obj)
Out[71]: '{"x":1, "y":2, "z":3}'
In[72] : js_obj_2 = '{x:1, y:2, z:{k:3,j:2}}'
In[73] : re.sub(quote_keys_regex, r'\1"\2"\3', js_obj_2)
Out[73]: '{"x":1, "y":2, "z":{"k":3,"j":2}}'
demjson.decode()
import demjson
# from
js_obj = '{x:1, y:2, z:3}'
# to
py_obj = demjson.decode(js_obj)
chompjs.parse_js_object()
import chompjs
# from
js_obj = '{x:1, y:2, z:3}'
# to
py_obj = chompjs.parse_js_object(js_obj)
jsonnet.evaluate_snippet()
import json, _jsonnet
# from
js_obj = '{x:1, y:2, z:3}'
# to
py_obj = json.loads(_jsonnet.evaluate_snippet('snippet', js_obj))
ast.literal_eval()
import ast
# from
js_obj = "{'x':1, 'y':2, 'z':3}"
# to
py_obj = ast.literal_eval(js_obj)
Not including objects
json.loads()
json.loads()
doesn’t accept undefined, you have to change to null
json.loads()
only accepts double quotes
{"foo": 1, "bar": null}
Use this if you are sure that your JavaScript code uses only double quotes around key names.
import json
import re

json_text = """{"foo": 1, "bar": undefined}"""
# JSON has no `undefined`. Rewrite a bare `undefined` value -- one that follows
# a double-quoted key and is followed by `,` or `}` -- to `null`.
# Both the pattern and the replacement are raw strings so that `\s`, `\1` and
# `\2` reach the regex engine instead of being treated as string escapes.
json_text = re.sub(r'("\s*:\s*)undefined(\s*[,}])', r'\1null\2', json_text)
py_obj = json.loads(json_text)
ast.literal_eval()
ast.literal_eval()
doesn’t accept undefined, you have to change to None
ast.literal_eval()
doesn’t accept null, you have to change to None
ast.literal_eval()
doesn’t accept true, you have to change to True
ast.literal_eval()
doesn’t accept false, you have to change to False
ast.literal_eval()
accepts both single and double quotes
{"foo": 1, "bar": None}
or {'foo': 1, 'bar': None}
import ast
import re

js_obj = """{'foo': 1, 'bar': undefined}"""
# ast.literal_eval only understands Python literals, so each JavaScript
# literal is rewritten to a Python one first. A value is only replaced when it
# directly follows a quoted key and colon and is followed by `,` or `}`.
for js_literal, py_literal in (
    ("undefined", "None"),
    ("null", "None"),
    ("NaN", "None"),
    ("true", "True"),
    ("false", "False"),
):
    js_obj = re.sub(
        r"""(['"]\s*:\s*)""" + js_literal + r"""(\s*[,}])""",
        r"\1" + py_literal + r"\2",
        js_obj,
    )
py_obj = ast.literal_eval(js_obj)
If you have node
available on the system, you can ask it to evaluate the javascript expression for you, and print the stringified result. The resulting JSON can then be fed to json.loads
:
def evaluate_javascript(s):
    """Evaluate a JavaScript expression in node.js and return it as a Python object.

    The expression is interpolated into ``console.log(JSON.stringify(...))``,
    the resulting script is piped to ``node -`` on stdin, and the JSON that
    node prints is decoded with ``json.loads``.

    Args:
        s: Source text of a JavaScript expression, e.g. ``'{x:1, y:2, z:3}'``.

    Returns:
        The Python value decoded from node's JSON output.

    Raises:
        OSError: If the ``node`` executable cannot be started.
        subprocess.CalledProcessError: If node exits with a non-zero status
            (for example, when ``s`` is not valid JavaScript).
        json.JSONDecodeError: If node's output is not valid JSON.
    """
    # Imported locally so the snippet is self-contained.
    import json
    import subprocess

    # SECURITY: `s` is executed as code by node. Only pass text from a source
    # you are willing to run on this machine.
    script = f'console.log(JSON.stringify({s}))'
    # check=True surfaces a failed node run as CalledProcessError instead of
    # letting empty stdout reach json.loads. stderr is left attached to the
    # parent so node's own error message stays visible.
    completed = subprocess.run(
        ['node', '-'],
        input=script.encode('utf8'),
        stdout=subprocess.PIPE,
        check=True,
    )
    return json.loads(completed.stdout.decode('utf8'))
Use json5
import json5
js_obj = '{x:1, y:2, z:3}'
py_obj = json5.loads(js_obj)
print(py_obj)
# output
# {'x': 1, 'y': 2, 'z': 3}