From text file to JSON file with python
Question:
Suppose I have a txt file that looks like this (indentation is 4 spaces):
key1=value1
key2
    key2_1=value2_1
    key2_2
        key2_2_1=value2_2_1
    key2_3=value2_3_1,value2_3_2,value2_3_3
key3=value3_1,value3_2,value3_3
I want to convert it into any VALID json, like this one:
{
'key1':'value1',
'key2': {
'key2_1':'value2_1',
'key2_2':{
'key2_2_1':'value2_2_1'
},
'key2_3':['value2_3_1','value2_3_2','value2_3_3']
},
'key3':['value3_1','value3_2','value3_3']
}
I have tried this (which I got from another post):
# helper method to convert equals sign to indentation for easier parsing
def convertIndentation(inputString):
    """Rewrite ``key=value`` lines as a key line followed by an indented value line.

    Args:
        inputString: list of text lines, e.g. the result of ``readlines()``
            (despite the name it is a list, not a single string). The list
            is not modified.

    Returns:
        A single string in which every ``key=value`` line has become ``key``
        on its own line and ``value`` on the next line, indented one space
        deeper than the key. Lines without ``=`` pass through unchanged.
    """
    indentVal = " "
    converted = []
    for eachLine in inputString:
        if "=" not in eachLine:
            converted.append(eachLine)
            continue
        # Split on the first "=" only, so a value may itself contain "=".
        key, value = eachLine.split("=", 1)
        # Measure leading indentation only; spaces inside the key or value
        # must not inflate the depth of the value line.
        prevIndent = len(key) - len(key.lstrip(indentVal))
        # The key line needs a real newline so the value lands on its own line.
        converted.append(key + "\n")
        converted.append(indentVal * (prevIndent + 1) + value)
    return "".join(converted)
# helper class for node usage
class Node:
    """One line of the indented text, placed in a tree by its indentation."""

    def __init__(self, indented_line):
        """Store the stripped text and the indentation depth of *indented_line*."""
        self.children = []
        # Depth is the number of leading whitespace characters.
        self.level = len(indented_line) - len(indented_line.lstrip())
        self.text = indented_line.strip()

    def add_children(self, nodes):
        """Consume nodes from the front of *nodes* and attach them below this node.

        The level of the first node defines the child level. Nodes at that
        level become children, deeper nodes are handed to the most recent
        child, and the first shallower node ends the call.

        Args:
            nodes: list of ``Node`` objects in document order. It is consumed
                in place; nodes that do not belong under this node are left
                in the list for the caller.
        """
        if not nodes:
            # Nothing to attach; avoids indexing an empty list.
            return
        childlevel = nodes[0].level
        while nodes:
            node = nodes[0]
            if node.level == childlevel:
                # Same depth as the first child: a direct child.
                self.children.append(nodes.pop(0))
            elif node.level > childlevel:
                # Deeper: belongs somewhere below the last child added.
                self.children[-1].add_children(nodes)
            else:
                # Shallower than our children: leave it for the caller
                # instead of dropping it.
                return

    def as_dict(self):
        """Return this subtree as nested plain data.

        Several children give ``{text: [child values]}``, a single child
        gives ``{text: child value}``, and a leaf gives its text.
        """
        if len(self.children) > 1:
            return {self.text: [node.as_dict() for node in self.children]}
        if len(self.children) == 1:
            return {self.text: self.children[0].as_dict()}
        return self.text
# process our file here
with open(filename, 'r') as fh:
    fileContent = fh.readlines()

# convert equals signs to indentation
fileParse = convertIndentation(fileContent)

# build the tree from every non-blank line of the converted text
root = Node('root')
root.add_children(
    [Node(line) for line in filter(str.strip, fileParse.splitlines())]
)
d = root.as_dict()['root']

# this variable is storing the json output
jsonOutput = json.dumps(d, indent=4, sort_keys=False)
print(jsonOutput)
which yields the following:
[
{
"key1": "value1"
},
{
"key2": [
{
"key2_1": "value2_1"
},
{
"key2_2": {
"key2_2_1": "value2_2_1"
}
},
{
"key2_3": "value2_3_1,value2_3_2,value2_3_3"
},
]
},
{
"key3": "value3_1,value3_2,value3_3"
}
]
Yet this is still not a valid JSON file.
When I try to open the output file using ‘json’ module, I get this predictable message: "JSONDecodeError: Expecting property name enclosed in double quotes: line 10 column 5 (char 165)".
# NOTE(review): this path looks like it lost its backslash separators - confirm the real location.
with open(
    r'C:UsersnigelOneDriveDocumentsLABleansample_01.02_R00.json',
    'r',
    encoding='utf-8',
) as read_file:
    # Parse the whole file as one JSON document.
    data = json.load(read_file)
output:
JSONDecodeError Traceback (most recent call last)
Input In [2], in <cell line: 1>()
1 with open(r'C:UsersnigelOneDriveDocumentsLABleansample_01.02_R00.json', 'r', encoding='utf-8') as read_file:
----> 2 data = json.load(read_file)
File ~Anaconda3libjson__init__.py:293, in load(fp, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
274 def load(fp, *, cls=None, object_hook=None, parse_float=None,
275 parse_int=None, parse_constant=None, object_pairs_hook=None, **kw):
276 """Deserialize ``fp`` (a ``.read()``-supporting file-like object containing
277 a JSON document) to a Python object.
278
(...)
291 kwarg; otherwise ``JSONDecoder`` is used.
292 """
--> 293 return loads(fp.read(),
294 cls=cls, object_hook=object_hook,
295 parse_float=parse_float, parse_int=parse_int,
296 parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)
File ~Anaconda3libjson__init__.py:346, in loads(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
341 s = s.decode(detect_encoding(s), 'surrogatepass')
343 if (cls is None and object_hook is None and
344 parse_int is None and parse_float is None and
345 parse_constant is None and object_pairs_hook is None and not kw):
--> 346 return _default_decoder.decode(s)
347 if cls is None:
348 cls = JSONDecoder
File ~Anaconda3libjsondecoder.py:337, in JSONDecoder.decode(self, s, _w)
332 def decode(self, s, _w=WHITESPACE.match):
333 """Return the Python representation of ``s`` (a ``str`` instance
334 containing a JSON document).
335
336 """
--> 337 obj, end = self.raw_decode(s, idx=_w(s, 0).end())
338 end = _w(s, end).end()
339 if end != len(s):
File ~Anaconda3libjsondecoder.py:353, in JSONDecoder.raw_decode(self, s, idx)
344 """Decode a JSON document from ``s`` (a ``str`` beginning with
345 a JSON document) and return a 2-tuple of the Python
346 representation and the index in ``s`` where the document ended.
(...)
350
351 """
352 try:
--> 353 obj, end = self.scan_once(s, idx)
354 except StopIteration as err:
355 raise JSONDecodeError("Expecting value", s, err.value) from None
JSONDecodeError: Expecting property name enclosed in double quotes: line 10 column 5 (char 165)
The reason is that JSON expects to find keys (strings enclosed in double quotes) when it actually finds json objects (nested dictionaries) in their places. That is it!
I truly appreciate any comments. Best,
Nigel
Answers:
An aside for users that land on this page: I could not reproduce the error that the OP posted. json.dumps() is very unlikely to output invalid JSON. This answer is merely an attempt to help out the poster.
Splitting The Strings Into Lists
I am assuming, per your comment, that you want to take a line such as
key2_3=value2_3_1,value2_3_2,value2_3_3
and break its values up into "key2_3": ["value2_3_1", "value2_3_2", "value2_3_3"].
To do so, you’d have to make the following adjustment to the code provided to you:
def as_dict(self):
    """Return this subtree as nested plain data, splitting comma lists.

    Several children give ``{text: [child values]}``, a single child gives
    ``{text: child value}``. A leaf gives a list when its text contains a
    comma and the plain text otherwise.
    """
    if len(self.children) > 1:
        return {self.text: [node.as_dict() for node in self.children]}
    elif len(self.children) == 1:
        return {self.text: self.children[0].as_dict()}
    else:
        # was self.text; split only real comma lists so that a single value
        # stays a string instead of becoming a one-element list
        return self.text.split(",") if "," in self.text else self.text
Dictionaries of Dictionaries Instead of Lists
To make the output a dictionary of dictionaries whose leaf values may be lists, i.e. {k1: {k2: [1, 2, 3]}} and the like, we have to make 2 changes:
- Update the as_dict method to use {} instead of [].
- Include a function to compress keys.
When I was doing this, I had a hard time outputting the correct data structure: every key came out doubled, basically like this: {k1: {k1: {k2: {k2: value}}}}. This becomes obvious if you skip the compress step, i.e. use d = root.as_dict()['root'] instead of d = compress(root.as_dict()['root']). So the code went from
def as_dict(self):
    """Return this subtree as nested plain data (children kept as a list).

    A leaf gives a list when its text contains a comma, else the text.
    One child gives ``{text: child value}``; several children give
    ``{text: [child values]}``.
    """
    kids = self.children
    if not kids:
        label = self.text
        return label.split(",") if "," in label else label
    if len(kids) == 1:
        return {self.text: kids[0].as_dict()}
    return {self.text: [kid.as_dict() for kid in kids]}
to
def as_dict(self):
    """Return this subtree as nested plain data (children kept as a dict).

    A leaf gives a list when its text contains a comma, else the text.
    One child gives ``{text: child value}``. Several children give
    ``{text: {child.text: child.as_dict()}}``; a child that has children of
    its own already wraps itself in its text, so its key appears twice.
    """
    kids = self.children
    if not kids:
        label = self.text
        return label.split(",") if "," in label else label
    if len(kids) == 1:
        return {self.text: kids[0].as_dict()}
    by_name = {}
    for kid in kids:
        by_name[kid.text] = kid.as_dict()
    return {self.text: by_name}
, then I included the compress function
# for merging like sub keys and values
def compress(dictionary):
    """Collapse ``{key: {key: value}}`` wrappers into ``{key: value}``, recursively.

    Args:
        dictionary: the structure to clean up. Anything that is not a dict
            is returned untouched.

    Returns:
        The same object that was passed in; dicts are modified in place.
    """
    if not isinstance(dictionary, dict):
        return dictionary
    # Only values of existing keys are replaced, so the dict never changes
    # size while it is being iterated.
    for k, v in dictionary.items():
        if not isinstance(v, dict):
            continue
        # Unwrap only a single-entry {k: ...}; if v holds other keys as well
        # it is real data and unwrapping would throw those keys away.
        if len(v) == 1 and k in v:
            dictionary[k] = v[k]
        compress(dictionary[k])
    return dictionary
Full Code
If you put the code below in a file and run it from the command line, it should work 100%. Otherwise it’s probably a problem with Anaconda or your version of Python (though that doesn’t really seem likely).
from io import StringIO
import json
# for merging like sub keys and values
def compress(dictionary):
    """Collapse ``{key: {key: value}}`` wrappers into ``{key: value}``, recursively.

    Args:
        dictionary: the structure to clean up. Anything that is not a dict
            is returned untouched.

    Returns:
        The same object that was passed in; dicts are modified in place.
    """
    if not isinstance(dictionary, dict):
        return dictionary
    # Only values of existing keys are replaced, so the dict never changes
    # size while it is being iterated.
    for k, v in dictionary.items():
        if not isinstance(v, dict):
            continue
        # Unwrap only a single-entry {k: ...}; if v holds other keys as well
        # it is real data and unwrapping would throw those keys away.
        if len(v) == 1 and k in v:
            dictionary[k] = v[k]
        compress(dictionary[k])
    return dictionary
# helper method to convert equals sign to indentation for easier parsing
def convertIndentation(inputString):
    """Rewrite ``key=value`` lines as a key line followed by an indented value line.

    Args:
        inputString: list of text lines, e.g. the result of ``readlines()``
            (despite the name it is a list, not a single string). The list
            is not modified.

    Returns:
        A single string in which every ``key=value`` line has become ``key``
        on its own line and ``value`` on the next line, indented one space
        deeper than the key. Lines without ``=`` pass through unchanged.
    """
    indentVal = " "
    converted = []
    for eachLine in inputString:
        if "=" not in eachLine:
            converted.append(eachLine)
            continue
        # Split on the first "=" only, so a value may itself contain "=".
        key, value = eachLine.split("=", 1)
        # Measure leading indentation only; spaces inside the key or value
        # must not inflate the depth of the value line.
        prevIndent = len(key) - len(key.lstrip(indentVal))
        # The key line needs a real newline so the value lands on its own line.
        converted.append(key + "\n")
        converted.append(indentVal * (prevIndent + 1) + value)
    return "".join(converted)
# helper class for node usage
class Node:
    """One line of the indented text, placed in a tree by its indentation."""

    def __init__(self, indented_line):
        """Store the stripped text and the indentation depth of *indented_line*."""
        self.children = []
        # Depth is the number of leading whitespace characters.
        self.level = len(indented_line) - len(indented_line.lstrip())
        self.text = indented_line.strip()

    def add_children(self, nodes):
        """Consume nodes from the front of *nodes* and attach them below this node.

        The level of the first node defines the child level. Nodes at that
        level become children, deeper nodes are handed to the most recent
        child, and the first shallower node ends the call.

        Args:
            nodes: list of ``Node`` objects in document order. It is consumed
                in place; nodes that do not belong under this node are left
                in the list for the caller.
        """
        if not nodes:
            # Nothing to attach; avoids indexing an empty list.
            return
        childlevel = nodes[0].level
        while nodes:
            node = nodes[0]
            if node.level == childlevel:
                # Same depth as the first child: a direct child.
                self.children.append(nodes.pop(0))
            elif node.level > childlevel:
                # Deeper: belongs somewhere below the last child added.
                self.children[-1].add_children(nodes)
            else:
                # Shallower than our children: leave it for the caller
                # instead of dropping it.
                return

    def as_dict(self):
        """Return this subtree as nested plain data.

        A leaf gives a list when its text contains a comma, else the text.
        One child gives ``{text: child value}``. Several children give
        ``{text: {child.text: child.as_dict()}}``; a child that has children
        of its own already wraps itself in its text, so its key appears
        twice until the result is passed through ``compress``.
        """
        if len(self.children) > 1:
            return {self.text: {node.text: node.as_dict() for node in self.children}}
        if len(self.children) == 1:
            return {self.text: self.children[0].as_dict()}
        return self.text.split(",") if "," in self.text else self.text
if __name__ == "__main__":
    # Sample input; nesting is expressed with 4 spaces per level.
    s = """
key1=value1
key2
    key2_1=value2_1
    key2_2
        key2_2_1
            key2_2_1_1=value2_2_1_1
    key2_3=value2_3_1,value2_3_2,value2_3_3
key3=value3_1,value3_2,value3_3
"""
    # Treat the sample string as if it were a file on disk.
    fh = StringIO(s)
    fileContent = fh.readlines()
    # convert equals signs to indentation
    fileParse = convertIndentation(fileContent)
    # build the tree from every non-blank line
    root = Node('root')
    root.add_children([Node(line) for line in fileParse.splitlines() if line.strip()])
    # drop the artificial root and collapse the doubled keys
    d = compress(root.as_dict()['root'])
    # this variable is storing the json output
    jsonOutput = json.dumps(d, indent=4, sort_keys=False)
    f = StringIO(jsonOutput)
    # load the "file" to show that the output parses as JSON
    loaded = json.load(f)
    print(s)
    print(jsonOutput)
    print(loaded)
Suppose I have a txt file that looks like this (indentation is 4 spaces):
key1=value1
key2
    key2_1=value2_1
    key2_2
        key2_2_1=value2_2_1
    key2_3=value2_3_1,value2_3_2,value2_3_3
key3=value3_1,value3_2,value3_3
I want to convert it into any VALID json, like this one:
{
'key1':'value1',
'key2': {
'key2_1':'value2_1',
'key2_2':{
'key2_2_1':'value2_2_1'
},
'key2_3':['value2_3_1','value2_3_2','value2_3_3']
},
'key3':['value3_1','value3_2','value3_3']
}
I have tried this (which I got from another post):
# helper method to convert equals sign to indentation for easier parsing
def convertIndentation(inputString):
    """Rewrite ``key=value`` lines as a key line followed by an indented value line.

    Args:
        inputString: list of text lines, e.g. the result of ``readlines()``
            (despite the name it is a list, not a single string). The list
            is not modified.

    Returns:
        A single string in which every ``key=value`` line has become ``key``
        on its own line and ``value`` on the next line, indented one space
        deeper than the key. Lines without ``=`` pass through unchanged.
    """
    indentVal = " "
    converted = []
    for eachLine in inputString:
        if "=" not in eachLine:
            converted.append(eachLine)
            continue
        # Split on the first "=" only, so a value may itself contain "=".
        key, value = eachLine.split("=", 1)
        # Measure leading indentation only; spaces inside the key or value
        # must not inflate the depth of the value line.
        prevIndent = len(key) - len(key.lstrip(indentVal))
        # The key line needs a real newline so the value lands on its own line.
        converted.append(key + "\n")
        converted.append(indentVal * (prevIndent + 1) + value)
    return "".join(converted)
# helper class for node usage
class Node:
    """One line of the indented text, placed in a tree by its indentation."""

    def __init__(self, indented_line):
        """Store the stripped text and the indentation depth of *indented_line*."""
        self.children = []
        # Depth is the number of leading whitespace characters.
        self.level = len(indented_line) - len(indented_line.lstrip())
        self.text = indented_line.strip()

    def add_children(self, nodes):
        """Consume nodes from the front of *nodes* and attach them below this node.

        The level of the first node defines the child level. Nodes at that
        level become children, deeper nodes are handed to the most recent
        child, and the first shallower node ends the call.

        Args:
            nodes: list of ``Node`` objects in document order. It is consumed
                in place; nodes that do not belong under this node are left
                in the list for the caller.
        """
        if not nodes:
            # Nothing to attach; avoids indexing an empty list.
            return
        childlevel = nodes[0].level
        while nodes:
            node = nodes[0]
            if node.level == childlevel:
                # Same depth as the first child: a direct child.
                self.children.append(nodes.pop(0))
            elif node.level > childlevel:
                # Deeper: belongs somewhere below the last child added.
                self.children[-1].add_children(nodes)
            else:
                # Shallower than our children: leave it for the caller
                # instead of dropping it.
                return

    def as_dict(self):
        """Return this subtree as nested plain data.

        Several children give ``{text: [child values]}``, a single child
        gives ``{text: child value}``, and a leaf gives its text.
        """
        if len(self.children) > 1:
            return {self.text: [node.as_dict() for node in self.children]}
        if len(self.children) == 1:
            return {self.text: self.children[0].as_dict()}
        return self.text
# process our file here
with open(filename, 'r') as fh:
    fileContent = fh.readlines()

# convert equals signs to indentation
fileParse = convertIndentation(fileContent)

# build the tree from every non-blank line of the converted text
root = Node('root')
root.add_children(
    [Node(line) for line in filter(str.strip, fileParse.splitlines())]
)
d = root.as_dict()['root']

# this variable is storing the json output
jsonOutput = json.dumps(d, indent=4, sort_keys=False)
print(jsonOutput)
which yields the following:
[
{
"key1": "value1"
},
{
"key2": [
{
"key2_1": "value2_1"
},
{
"key2_2": {
"key2_2_1": "value2_2_1"
}
},
{
"key2_3": "value2_3_1,value2_3_2,value2_3_3"
},
]
},
{
"key3": "value3_1,value3_2,value3_3"
}
]
Yet this is still not a valid JSON file.
When I try to open the output file using ‘json’ module, I get this predictable message: "JSONDecodeError: Expecting property name enclosed in double quotes: line 10 column 5 (char 165)".
# NOTE(review): this path looks like it lost its backslash separators - confirm the real location.
with open(
    r'C:UsersnigelOneDriveDocumentsLABleansample_01.02_R00.json',
    'r',
    encoding='utf-8',
) as read_file:
    # Parse the whole file as one JSON document.
    data = json.load(read_file)
output:
JSONDecodeError Traceback (most recent call last)
Input In [2], in <cell line: 1>()
1 with open(r'C:UsersnigelOneDriveDocumentsLABleansample_01.02_R00.json', 'r', encoding='utf-8') as read_file:
----> 2 data = json.load(read_file)
File ~Anaconda3libjson__init__.py:293, in load(fp, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
274 def load(fp, *, cls=None, object_hook=None, parse_float=None,
275 parse_int=None, parse_constant=None, object_pairs_hook=None, **kw):
276 """Deserialize ``fp`` (a ``.read()``-supporting file-like object containing
277 a JSON document) to a Python object.
278
(...)
291 kwarg; otherwise ``JSONDecoder`` is used.
292 """
--> 293 return loads(fp.read(),
294 cls=cls, object_hook=object_hook,
295 parse_float=parse_float, parse_int=parse_int,
296 parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)
File ~Anaconda3libjson__init__.py:346, in loads(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
341 s = s.decode(detect_encoding(s), 'surrogatepass')
343 if (cls is None and object_hook is None and
344 parse_int is None and parse_float is None and
345 parse_constant is None and object_pairs_hook is None and not kw):
--> 346 return _default_decoder.decode(s)
347 if cls is None:
348 cls = JSONDecoder
File ~Anaconda3libjsondecoder.py:337, in JSONDecoder.decode(self, s, _w)
332 def decode(self, s, _w=WHITESPACE.match):
333 """Return the Python representation of ``s`` (a ``str`` instance
334 containing a JSON document).
335
336 """
--> 337 obj, end = self.raw_decode(s, idx=_w(s, 0).end())
338 end = _w(s, end).end()
339 if end != len(s):
File ~Anaconda3libjsondecoder.py:353, in JSONDecoder.raw_decode(self, s, idx)
344 """Decode a JSON document from ``s`` (a ``str`` beginning with
345 a JSON document) and return a 2-tuple of the Python
346 representation and the index in ``s`` where the document ended.
(...)
350
351 """
352 try:
--> 353 obj, end = self.scan_once(s, idx)
354 except StopIteration as err:
355 raise JSONDecodeError("Expecting value", s, err.value) from None
JSONDecodeError: Expecting property name enclosed in double quotes: line 10 column 5 (char 165)
The reason is that JSON expects to find keys (strings enclosed in double quotes) when it actually finds json objects (nested dictionaries) in their places. That is it!
I truly appreciate any comments. Best,
Nigel
An aside for users that land on this page: I could not reproduce the error that the OP posted. json.dumps() is very unlikely to output invalid JSON. This answer is merely an attempt to help out the poster.
Splitting The Strings Into Lists
I am assuming, per your comment, that you want to take a line such as
key2_3=value2_3_1,value2_3_2,value2_3_3
and break its values up into "key2_3": ["value2_3_1", "value2_3_2", "value2_3_3"].
To do so, you’d have to make the following adjustment to the code provided to you:
def as_dict(self):
    """Return this subtree as nested plain data, splitting comma lists.

    Several children give ``{text: [child values]}``, a single child gives
    ``{text: child value}``. A leaf gives a list when its text contains a
    comma and the plain text otherwise.
    """
    if len(self.children) > 1:
        return {self.text: [node.as_dict() for node in self.children]}
    elif len(self.children) == 1:
        return {self.text: self.children[0].as_dict()}
    else:
        # was self.text; split only real comma lists so that a single value
        # stays a string instead of becoming a one-element list
        return self.text.split(",") if "," in self.text else self.text
Dictionaries of Dictionaries Instead of Lists
To make the output a dictionary of dictionaries whose leaf values may be lists, i.e. {k1: {k2: [1, 2, 3]}} and the like, we have to make 2 changes:
- Update the as_dict method to use {} instead of [].
- Include a function to compress keys.
When I was doing this, I had a hard time outputting the correct data structure: every key came out doubled, basically like this: {k1: {k1: {k2: {k2: value}}}}. This becomes obvious if you skip the compress step, i.e. use d = root.as_dict()['root'] instead of d = compress(root.as_dict()['root']). So the code went from
def as_dict(self):
    """Return this subtree as nested plain data (children kept as a list).

    A leaf gives a list when its text contains a comma, else the text.
    One child gives ``{text: child value}``; several children give
    ``{text: [child values]}``.
    """
    kids = self.children
    if not kids:
        label = self.text
        return label.split(",") if "," in label else label
    if len(kids) == 1:
        return {self.text: kids[0].as_dict()}
    return {self.text: [kid.as_dict() for kid in kids]}
to
def as_dict(self):
    """Return this subtree as nested plain data (children kept as a dict).

    A leaf gives a list when its text contains a comma, else the text.
    One child gives ``{text: child value}``. Several children give
    ``{text: {child.text: child.as_dict()}}``; a child that has children of
    its own already wraps itself in its text, so its key appears twice.
    """
    kids = self.children
    if not kids:
        label = self.text
        return label.split(",") if "," in label else label
    if len(kids) == 1:
        return {self.text: kids[0].as_dict()}
    by_name = {}
    for kid in kids:
        by_name[kid.text] = kid.as_dict()
    return {self.text: by_name}
, then I included the compress function
# for merging like sub keys and values
def compress(dictionary):
    """Collapse ``{key: {key: value}}`` wrappers into ``{key: value}``, recursively.

    Args:
        dictionary: the structure to clean up. Anything that is not a dict
            is returned untouched.

    Returns:
        The same object that was passed in; dicts are modified in place.
    """
    if not isinstance(dictionary, dict):
        return dictionary
    # Only values of existing keys are replaced, so the dict never changes
    # size while it is being iterated.
    for k, v in dictionary.items():
        if not isinstance(v, dict):
            continue
        # Unwrap only a single-entry {k: ...}; if v holds other keys as well
        # it is real data and unwrapping would throw those keys away.
        if len(v) == 1 and k in v:
            dictionary[k] = v[k]
        compress(dictionary[k])
    return dictionary
Full Code
If you put the code below in a file and run it from the command line, it should work 100%. Otherwise it’s probably a problem with Anaconda or your version of Python (though that doesn’t really seem likely).
from io import StringIO
import json
# for merging like sub keys and values
def compress(dictionary):
    """Collapse ``{key: {key: value}}`` wrappers into ``{key: value}``, recursively.

    Args:
        dictionary: the structure to clean up. Anything that is not a dict
            is returned untouched.

    Returns:
        The same object that was passed in; dicts are modified in place.
    """
    if not isinstance(dictionary, dict):
        return dictionary
    # Only values of existing keys are replaced, so the dict never changes
    # size while it is being iterated.
    for k, v in dictionary.items():
        if not isinstance(v, dict):
            continue
        # Unwrap only a single-entry {k: ...}; if v holds other keys as well
        # it is real data and unwrapping would throw those keys away.
        if len(v) == 1 and k in v:
            dictionary[k] = v[k]
        compress(dictionary[k])
    return dictionary
# helper method to convert equals sign to indentation for easier parsing
def convertIndentation(inputString):
    """Rewrite ``key=value`` lines as a key line followed by an indented value line.

    Args:
        inputString: list of text lines, e.g. the result of ``readlines()``
            (despite the name it is a list, not a single string). The list
            is not modified.

    Returns:
        A single string in which every ``key=value`` line has become ``key``
        on its own line and ``value`` on the next line, indented one space
        deeper than the key. Lines without ``=`` pass through unchanged.
    """
    indentVal = " "
    converted = []
    for eachLine in inputString:
        if "=" not in eachLine:
            converted.append(eachLine)
            continue
        # Split on the first "=" only, so a value may itself contain "=".
        key, value = eachLine.split("=", 1)
        # Measure leading indentation only; spaces inside the key or value
        # must not inflate the depth of the value line.
        prevIndent = len(key) - len(key.lstrip(indentVal))
        # The key line needs a real newline so the value lands on its own line.
        converted.append(key + "\n")
        converted.append(indentVal * (prevIndent + 1) + value)
    return "".join(converted)
# helper class for node usage
class Node:
    """One line of the indented text, placed in a tree by its indentation."""

    def __init__(self, indented_line):
        """Store the stripped text and the indentation depth of *indented_line*."""
        self.children = []
        # Depth is the number of leading whitespace characters.
        self.level = len(indented_line) - len(indented_line.lstrip())
        self.text = indented_line.strip()

    def add_children(self, nodes):
        """Consume nodes from the front of *nodes* and attach them below this node.

        The level of the first node defines the child level. Nodes at that
        level become children, deeper nodes are handed to the most recent
        child, and the first shallower node ends the call.

        Args:
            nodes: list of ``Node`` objects in document order. It is consumed
                in place; nodes that do not belong under this node are left
                in the list for the caller.
        """
        if not nodes:
            # Nothing to attach; avoids indexing an empty list.
            return
        childlevel = nodes[0].level
        while nodes:
            node = nodes[0]
            if node.level == childlevel:
                # Same depth as the first child: a direct child.
                self.children.append(nodes.pop(0))
            elif node.level > childlevel:
                # Deeper: belongs somewhere below the last child added.
                self.children[-1].add_children(nodes)
            else:
                # Shallower than our children: leave it for the caller
                # instead of dropping it.
                return

    def as_dict(self):
        """Return this subtree as nested plain data.

        A leaf gives a list when its text contains a comma, else the text.
        One child gives ``{text: child value}``. Several children give
        ``{text: {child.text: child.as_dict()}}``; a child that has children
        of its own already wraps itself in its text, so its key appears
        twice until the result is passed through ``compress``.
        """
        if len(self.children) > 1:
            return {self.text: {node.text: node.as_dict() for node in self.children}}
        if len(self.children) == 1:
            return {self.text: self.children[0].as_dict()}
        return self.text.split(",") if "," in self.text else self.text
if __name__ == "__main__":
    # Sample input; nesting is expressed with 4 spaces per level.
    s = """
key1=value1
key2
    key2_1=value2_1
    key2_2
        key2_2_1
            key2_2_1_1=value2_2_1_1
    key2_3=value2_3_1,value2_3_2,value2_3_3
key3=value3_1,value3_2,value3_3
"""
    # Treat the sample string as if it were a file on disk.
    fh = StringIO(s)
    fileContent = fh.readlines()
    # convert equals signs to indentation
    fileParse = convertIndentation(fileContent)
    # build the tree from every non-blank line
    root = Node('root')
    root.add_children([Node(line) for line in fileParse.splitlines() if line.strip()])
    # drop the artificial root and collapse the doubled keys
    d = compress(root.as_dict()['root'])
    # this variable is storing the json output
    jsonOutput = json.dumps(d, indent=4, sort_keys=False)
    f = StringIO(jsonOutput)
    # load the "file" to show that the output parses as JSON
    loaded = json.load(f)
    print(s)
    print(jsonOutput)
    print(loaded)