From text doc to JSON with python
Question:
Suppose I have multiple txt files that look like this (indentation is 4 spaces):
key1=value1
key2
    key2_1=value2_1
    key2_2
        key2_2_1=value2_2_1
    key2_3=value2_3
key3=value3
How do I convert one (or all) of them into this format:
{
'key1':'value1',
'key2':
{
'key2_1':'value2_1',
'key2_2':
{
'key2_2_1':'value2_2_1'
},
'key2_3':'value2_3'
},
'key3':'value3'
}
or in a flattened dictionary format.
I appreciate any comments. Best,
Nigel
##########################################
New inclusion (as of 2022/10/28):
I am using the code proposed by @jrynes, but with a slight change: I am using pathlib
to find the files in the directory and read them, and then the string method .splitlines()
to split the text into lines:
import json
def convertIndentation(inputString):
    """Rewrite ``key=value`` lines as a key line followed by an indented value line.

    Args:
        inputString: Mutable list of text lines. The list is modified in
            place: every line containing ``=`` is replaced by its key part
            plus a newline, and the value part is inserted as a new element
            directly after it.

    Returns:
        All elements of the modified list concatenated with no separator.
        Apart from the newline appended to each key, no line terminators are
        added, so each element must already end with one for a value line to
        stay separate from the line that follows it.
    """
    indentVal = " "
    for position, eachLine in enumerate(inputString):
        if "=" not in eachLine:
            continue
        strSplit = eachLine.split("=", 1)
        # Every space in the line is counted, so for space-indented input the
        # value gets more leading spaces than its key.
        prevIndent = eachLine.count(indentVal)
        newVal = (indentVal * (prevIndent + 1)) + strSplit[1]
        inputString[position] = strSplit[0] + '\n'
        # The inserted element is visited by the next iteration; it is skipped
        # unless the value itself contains "=".
        inputString.insert(position + 1, newVal)
    return "".join(inputString)
class Node:
    """One line of indented text, placed in a tree according to its indentation."""

    def __init__(self, indented_line):
        """Store the stripped text and the indentation width of a line.

        Args:
            indented_line: A single line of text. The number of leading
                whitespace characters becomes ``level``; the text with
                surrounding whitespace removed becomes ``text``.
        """
        self.children = []
        self.level = len(indented_line) - len(indented_line.lstrip())
        self.text = indented_line.strip()

    def add_children(self, nodes):
        """Move this node's descendants from the front of ``nodes`` into the tree.

        Args:
            nodes: List of ``Node`` objects in document order. It is consumed
                from the front; on return its first element (if any) is the
                first node that does not belong below this node.
        """
        if not nodes:
            # Nothing to attach (e.g. an empty file); nodes[0] would raise.
            return
        childlevel = nodes[0].level
        while nodes:
            node = nodes[0]
            if node.level > childlevel:
                # Deeper than the current children: it belongs below the most
                # recently added child, which consumes it from the shared list.
                self.children[-1].add_children(nodes)
            elif node.level == childlevel or node.level > self.level:
                # A direct child. A node that dedents only part of the way
                # back is kept as a child too (instead of being discarded)
                # and sets the level expected of the children after it.
                childlevel = node.level
                self.children.append(nodes.pop(0))
            else:
                # Sibling or ancestor of this node: leave it for the caller.
                return

    def as_dict(self):
        """Return the subtree as nested plain Python objects.

        Returns:
            ``text`` alone for a leaf, ``{text: child}`` for exactly one
            child, and ``{text: [child, ...]}`` for several children.
        """
        if not self.children:
            return self.text
        if len(self.children) == 1:
            return {self.text: self.children[0].as_dict()}
        return {self.text: [child.as_dict() for child in self.children]}
And then:
from pathlib import Path
def txt_to_json(filename: Path):
    """Read one text file, build the node tree from it and print it as JSON.

    The file is read as UTF-8 and split with ``str.splitlines()``, which
    returns the lines without their line terminators; those lines are passed
    to ``convertIndentation`` unchanged.

    Args:
        filename: Path of the text file to convert.
    """
    raw_lines = filename.read_text(encoding='utf-8').splitlines()
    reindented = convertIndentation(raw_lines)
    tree = Node('root')
    # Blank lines carry no key or value, so they are left out of the tree.
    tree.add_children(
        [Node(row) for row in reindented.splitlines() if row.strip()]
    )
    payload = tree.as_dict()['root']
    print(json.dumps(payload, indent=4, sort_keys=False))
def main():
    """Run ``txt_to_json`` on every ``.txt`` file below the search directory."""
    lab_root = Path.home() / 'OneDrive' / 'Documents' / 'LAB' / 'lean'
    # The "**/" prefix makes glob descend into subdirectories as well.
    for txt_file in lab_root.glob("**/*.txt"):
        txt_to_json(txt_file)


if __name__ == '__main__':
    main()
When I open the file after implementing the above code, I get this:
Observe that python cannot figure out end of line (no ‘\n’). However, when I use @jrynes’ proposed method .readlines()
, I get this:
Observe that with @jrynes’ method Python sees ‘\n’ at the end of each line.
Here is ‘jsonOutput’ when I use pathlib
:
My question is: what is wrong with my method using pathlib
? Why can’t it figure out end of lines?
Answers:
You could try something like below:
# helper method to convert equals sign to indentation for easier parsing
def convertIndentation(inputString):
    """Rewrite ``key=value`` lines as a key line followed by an indented value line.

    Args:
        inputString: Iterable of text lines, with or without trailing
            newlines (so both ``readlines()`` and ``splitlines()`` results
            work). It is not modified.

    Returns:
        A single string in which every ``key=value`` line has become the key
        on one line and the value on the next, indented deeper than the key.
        Every line in the result ends with a newline.
    """
    indentVal = " "
    outputLines = []
    for eachLine in inputString:
        # Make sure each line is terminated; otherwise a value would run
        # straight into the line that follows it once everything is joined.
        if not eachLine.endswith("\n"):
            eachLine += "\n"
        if "=" not in eachLine:
            outputLines.append(eachLine)
            continue
        # Split on the first "=" only, so the value may itself contain "=".
        key, value = eachLine.split("=", 1)
        # get previous indentation (every space in the line is counted, so
        # for space-indented input the value gets more leading spaces than
        # its key)
        prevIndent = eachLine.count(indentVal)
        outputLines.append(key + "\n")
        outputLines.append(indentVal * (prevIndent + 1) + value)
    return "".join(outputLines)
# helper class for node usage
class Node:
    """One line of indented text, placed in a tree according to its indentation."""

    def __init__(self, indented_line):
        """Store the stripped text and the indentation width of a line.

        Args:
            indented_line: A single line of text. The number of leading
                whitespace characters becomes ``level``; the text with
                surrounding whitespace removed becomes ``text``.
        """
        self.children = []
        self.level = len(indented_line) - len(indented_line.lstrip())
        self.text = indented_line.strip()

    def add_children(self, nodes):
        """Move this node's descendants from the front of ``nodes`` into the tree.

        Args:
            nodes: List of ``Node`` objects in document order. It is consumed
                from the front; on return its first element (if any) is the
                first node that does not belong below this node.
        """
        if not nodes:
            # Nothing to attach (e.g. an empty file); nodes[0] would raise.
            return
        childlevel = nodes[0].level
        while nodes:
            node = nodes[0]
            if node.level > childlevel:
                # add nodes as grandchildren of the last child, which consumes
                # them from the shared list
                self.children[-1].add_children(nodes)
            elif node.level == childlevel or node.level > self.level:
                # add node as a child. A node that dedents only part of the
                # way back is kept as a child too (instead of being
                # discarded) and sets the level expected of later children.
                childlevel = node.level
                self.children.append(nodes.pop(0))
            else:
                # this node is a sibling (or higher), no more children
                return

    def as_dict(self):
        """Return the subtree as nested plain Python objects.

        Returns:
            ``text`` alone for a leaf, ``{text: child}`` for exactly one
            child, and ``{text: [child, ...]}`` for several children.
        """
        if not self.children:
            return self.text
        if len(self.children) == 1:
            return {self.text: self.children[0].as_dict()}
        return {self.text: [child.as_dict() for child in self.children]}
# Driver: read the file named by `filename`, which must already be defined.
with open(filename, 'r') as fh:
    fileContent = fh.readlines()

# Turn each "key=value" line into a key line plus an indented value line.
fileParse = convertIndentation(fileContent)

# Build the tree below a synthetic root, leaving out blank lines.
root = Node('root')
root.add_children(
    [Node(line) for line in fileParse.splitlines() if line.strip()]
)

# Only what hangs below the synthetic root is wanted, not the root key.
d = root.as_dict()['root']

# jsonOutput holds the final JSON text.
jsonOutput = json.dumps(d, indent=4, sort_keys=False)
print(jsonOutput)
That should yield some output like below:
[
{
"key1": "value1"
},
{
"key2": [
{
"key2_1": "value2_1"
},
{
"key2_2": {
"key2_2_1": "value2_2_1"
}
},
{
"key2_3": "value2_3"
}
]
},
{
"key3": "value3"
}
]
Suppose I have multiple txt files that look like this (indentation is 4 spaces):
key1=value1
key2
    key2_1=value2_1
    key2_2
        key2_2_1=value2_2_1
    key2_3=value2_3
key3=value3
How do I convert one (or all) of them into this format:
{
'key1':'value1',
'key2':
{
'key2_1':'value2_1',
'key2_2':
{
'key2_2_1':'value2_2_1'
},
'key2_3':'value2_3'
},
'key3':'value3'
}
or in a flattened dictionary format.
I appreciate any comments. Best,
Nigel
##########################################
New inclusion (as of 2022/10/28):
I am using the code proposed by @jrynes, but with a slight change: I am using pathlib
to find the files in the directory and read them, and then the string method .splitlines()
to split the text into lines:
import json
def convertIndentation(inputString):
    """Rewrite ``key=value`` lines as a key line followed by an indented value line.

    Args:
        inputString: Mutable list of text lines. The list is modified in
            place: every line containing ``=`` is replaced by its key part
            plus a newline, and the value part is inserted as a new element
            directly after it.

    Returns:
        All elements of the modified list concatenated with no separator.
        Apart from the newline appended to each key, no line terminators are
        added, so each element must already end with one for a value line to
        stay separate from the line that follows it.
    """
    indentVal = " "
    for position, eachLine in enumerate(inputString):
        if "=" not in eachLine:
            continue
        strSplit = eachLine.split("=", 1)
        # Every space in the line is counted, so for space-indented input the
        # value gets more leading spaces than its key.
        prevIndent = eachLine.count(indentVal)
        newVal = (indentVal * (prevIndent + 1)) + strSplit[1]
        inputString[position] = strSplit[0] + '\n'
        # The inserted element is visited by the next iteration; it is skipped
        # unless the value itself contains "=".
        inputString.insert(position + 1, newVal)
    return "".join(inputString)
class Node:
    """One line of indented text, placed in a tree according to its indentation."""

    def __init__(self, indented_line):
        """Store the stripped text and the indentation width of a line.

        Args:
            indented_line: A single line of text. The number of leading
                whitespace characters becomes ``level``; the text with
                surrounding whitespace removed becomes ``text``.
        """
        self.children = []
        self.level = len(indented_line) - len(indented_line.lstrip())
        self.text = indented_line.strip()

    def add_children(self, nodes):
        """Move this node's descendants from the front of ``nodes`` into the tree.

        Args:
            nodes: List of ``Node`` objects in document order. It is consumed
                from the front; on return its first element (if any) is the
                first node that does not belong below this node.
        """
        if not nodes:
            # Nothing to attach (e.g. an empty file); nodes[0] would raise.
            return
        childlevel = nodes[0].level
        while nodes:
            node = nodes[0]
            if node.level > childlevel:
                # Deeper than the current children: it belongs below the most
                # recently added child, which consumes it from the shared list.
                self.children[-1].add_children(nodes)
            elif node.level == childlevel or node.level > self.level:
                # A direct child. A node that dedents only part of the way
                # back is kept as a child too (instead of being discarded)
                # and sets the level expected of the children after it.
                childlevel = node.level
                self.children.append(nodes.pop(0))
            else:
                # Sibling or ancestor of this node: leave it for the caller.
                return

    def as_dict(self):
        """Return the subtree as nested plain Python objects.

        Returns:
            ``text`` alone for a leaf, ``{text: child}`` for exactly one
            child, and ``{text: [child, ...]}`` for several children.
        """
        if not self.children:
            return self.text
        if len(self.children) == 1:
            return {self.text: self.children[0].as_dict()}
        return {self.text: [child.as_dict() for child in self.children]}
And then:
from pathlib import Path
def txt_to_json(filename: Path):
    """Read one text file, build the node tree from it and print it as JSON.

    The file is read as UTF-8 and split with ``str.splitlines()``, which
    returns the lines without their line terminators; those lines are passed
    to ``convertIndentation`` unchanged.

    Args:
        filename: Path of the text file to convert.
    """
    raw_lines = filename.read_text(encoding='utf-8').splitlines()
    reindented = convertIndentation(raw_lines)
    tree = Node('root')
    # Blank lines carry no key or value, so they are left out of the tree.
    tree.add_children(
        [Node(row) for row in reindented.splitlines() if row.strip()]
    )
    payload = tree.as_dict()['root']
    print(json.dumps(payload, indent=4, sort_keys=False))
def main():
    """Run ``txt_to_json`` on every ``.txt`` file below the search directory."""
    lab_root = Path.home() / 'OneDrive' / 'Documents' / 'LAB' / 'lean'
    # The "**/" prefix makes glob descend into subdirectories as well.
    for txt_file in lab_root.glob("**/*.txt"):
        txt_to_json(txt_file)


if __name__ == '__main__':
    main()
When I open the file after implementing the above code, I get this:
Observe that python cannot figure out end of line (no ‘\n’). However, when I use @jrynes’ proposed method .readlines()
, I get this:
Observe that with @jrynes’ method Python sees ‘\n’ at the end of each line.
Here is ‘jsonOutput’ when I use pathlib
:
My question is: what is wrong with my method using pathlib
? Why can’t it figure out end of lines?
You could try something like below:
# helper method to convert equals sign to indentation for easier parsing
def convertIndentation(inputString):
    """Rewrite ``key=value`` lines as a key line followed by an indented value line.

    Args:
        inputString: Iterable of text lines, with or without trailing
            newlines (so both ``readlines()`` and ``splitlines()`` results
            work). It is not modified.

    Returns:
        A single string in which every ``key=value`` line has become the key
        on one line and the value on the next, indented deeper than the key.
        Every line in the result ends with a newline.
    """
    indentVal = " "
    outputLines = []
    for eachLine in inputString:
        # Make sure each line is terminated; otherwise a value would run
        # straight into the line that follows it once everything is joined.
        if not eachLine.endswith("\n"):
            eachLine += "\n"
        if "=" not in eachLine:
            outputLines.append(eachLine)
            continue
        # Split on the first "=" only, so the value may itself contain "=".
        key, value = eachLine.split("=", 1)
        # get previous indentation (every space in the line is counted, so
        # for space-indented input the value gets more leading spaces than
        # its key)
        prevIndent = eachLine.count(indentVal)
        outputLines.append(key + "\n")
        outputLines.append(indentVal * (prevIndent + 1) + value)
    return "".join(outputLines)
# helper class for node usage
class Node:
    """One line of indented text, placed in a tree according to its indentation."""

    def __init__(self, indented_line):
        """Store the stripped text and the indentation width of a line.

        Args:
            indented_line: A single line of text. The number of leading
                whitespace characters becomes ``level``; the text with
                surrounding whitespace removed becomes ``text``.
        """
        self.children = []
        self.level = len(indented_line) - len(indented_line.lstrip())
        self.text = indented_line.strip()

    def add_children(self, nodes):
        """Move this node's descendants from the front of ``nodes`` into the tree.

        Args:
            nodes: List of ``Node`` objects in document order. It is consumed
                from the front; on return its first element (if any) is the
                first node that does not belong below this node.
        """
        if not nodes:
            # Nothing to attach (e.g. an empty file); nodes[0] would raise.
            return
        childlevel = nodes[0].level
        while nodes:
            node = nodes[0]
            if node.level > childlevel:
                # add nodes as grandchildren of the last child, which consumes
                # them from the shared list
                self.children[-1].add_children(nodes)
            elif node.level == childlevel or node.level > self.level:
                # add node as a child. A node that dedents only part of the
                # way back is kept as a child too (instead of being
                # discarded) and sets the level expected of later children.
                childlevel = node.level
                self.children.append(nodes.pop(0))
            else:
                # this node is a sibling (or higher), no more children
                return

    def as_dict(self):
        """Return the subtree as nested plain Python objects.

        Returns:
            ``text`` alone for a leaf, ``{text: child}`` for exactly one
            child, and ``{text: [child, ...]}`` for several children.
        """
        if not self.children:
            return self.text
        if len(self.children) == 1:
            return {self.text: self.children[0].as_dict()}
        return {self.text: [child.as_dict() for child in self.children]}
# Driver: read the file named by `filename`, which must already be defined.
with open(filename, 'r') as fh:
    fileContent = fh.readlines()

# Turn each "key=value" line into a key line plus an indented value line.
fileParse = convertIndentation(fileContent)

# Build the tree below a synthetic root, leaving out blank lines.
root = Node('root')
root.add_children(
    [Node(line) for line in fileParse.splitlines() if line.strip()]
)

# Only what hangs below the synthetic root is wanted, not the root key.
d = root.as_dict()['root']

# jsonOutput holds the final JSON text.
jsonOutput = json.dumps(d, indent=4, sort_keys=False)
print(jsonOutput)
That should yield some output like below:
[
{
"key1": "value1"
},
{
"key2": [
{
"key2_1": "value2_1"
},
{
"key2_2": {
"key2_2_1": "value2_2_1"
}
},
{
"key2_3": "value2_3"
}
]
},
{
"key3": "value3"
}
]