From text doc to JSON with python

Question

Suppose I have multiple txt files that look like this (indentation is 4 spaces):

key1=value1
key2
    key2_1=value2_1
    key2_2
        key2_2_1=value2_2_1
    key2_3=value2_3
key3=value3

How do I convert one (or all) of them into this format:

{
'key1':'value1',
'key2':
    {
    'key2_1':'value2_1',
    'key2_2':
        {
        'key2_2_1':'value2_2_1'
        },
    'key2_3':'value2_3'
    },
'key3':'value3'
}

or in a flattened dictionary format.

I appreciate any comments. Best,

Nigel

##########################################
New inclusion (as of 2022/10/28):

I am using the code proposed by @jrynes, but with a slight change: I am using pathlib to call files in the directory and then the method .splitlines() from pathlib will split lines:

import json

def convertIndentation(inputString):
    indentCount = 0
    indentVal = "    "
    for position, eachLine in enumerate(inputString):
        if "=" not in eachLine:
            continue
        else:
            strSplit = eachLine.split("=", 1)
            prevIndent = inputString[position].count(indentVal)
            newVal = (indentVal * (prevIndent + 1)) + strSplit[1]
            inputString[position] = strSplit[0] + 'n'
            inputString.insert(position+1, newVal)
    flatList = "".join(inputString)
    return flatList

class Node:
    def __init__(self, indented_line):
        self.children = []
        self.level = len(indented_line) - len(indented_line.lstrip())
        self.text = indented_line.strip()

    def add_children(self, nodes):
        childlevel = nodes[0].level

        while nodes:
            node = nodes.pop(0)
            if node.level == childlevel:
                self.children.append(node)
            elif node.level > childlevel:
                nodes.insert(0,node)
                self.children[-1].add_children(nodes)
            elif node.level <= self.level:
                nodes.insert(0,node)
                return

    def as_dict(self):
        if len(self.children) > 1:
            return {self.text: [node.as_dict() for node in self.children]}
        elif len(self.children) == 1:
            return {self.text: self.children[0].as_dict()}
        else:
            return self.text

And then:

from pathlib import Path
def txt_to_json(filename: Path):
    content = filename.read_text(encoding='utf-8').splitlines()
    fileParse = convertIndentation(content)
    root = Node('root')
    root.add_children([Node(line) for line in fileParse.splitlines() if line.strip()])
    d = root.as_dict()['root']
    jsonOutput = json.dumps(d, indent = 4, sort_keys = False)
    print(jsonOutput)

def main():
    search_directory = Path.home().joinpath('OneDrive', 'Documents', 'LAB', 'lean')
    for txt_file in search_directory.glob("**/*.txt"):
        txt_to_json(txt_file)

if __name__ == '__main__':
    main()

When I open the file after implementing the above code, I get this:

Observe that python cannot figure out end of line (no ‘n’). However, when I use @jrynes’ proposed method .readlines(), I get this:

Observe that with @jrynes’ method Python see ‘n’ in the end of each line.

Here is ‘jsonOutput’ when I use pathlib:

My question is: what is wrong with my method using pathlib? Why can’t it figure out end of lines?

Asked By: NigelBlainey

||

Source

Answer 1

You could try something like below:

# helper method to convert equals sign to indentation for easier parsing
def convertIndentation(inputString):
    indentCount = 0
    indentVal = "    "
    for position, eachLine in enumerate(inputString):
        if "=" not in eachLine:
            continue
        else:
            strSplit = eachLine.split("=", 1)
            #get previous indentation
            prevIndent = inputString[position].count(indentVal)
            newVal = (indentVal * (prevIndent + 1)) + strSplit[1]
            inputString[position] = strSplit[0] + 'n'
            inputString.insert(position+1, newVal)
    flatList = "".join(inputString)
    return flatList

# helper class for node usage
class Node:
    def __init__(self, indented_line):
        self.children = []
        self.level = len(indented_line) - len(indented_line.lstrip())
        self.text = indented_line.strip()

    def add_children(self, nodes):
        childlevel = nodes[0].level

        while nodes:
            node = nodes.pop(0)
            if node.level == childlevel: # add node as a child
                self.children.append(node)
            elif node.level > childlevel: # add nodes as grandchildren of the last child
                nodes.insert(0,node)
                self.children[-1].add_children(nodes)
            elif node.level <= self.level: # this node is a sibling, no more children
                nodes.insert(0,node)
                return

    def as_dict(self):
        if len(self.children) > 1:
            return {self.text: [node.as_dict() for node in self.children]}
        elif len(self.children) == 1:
            return {self.text: self.children[0].as_dict()}
        else:
            return self.text

# process our file here
with open(filename, 'r') as fh:
    fileContent = fh.readlines()
    fileParse = convertIndentation(fileContent)
    # convert equals signs to indentation
    root = Node('root')
    root.add_children([Node(line) for line in fileParse.splitlines() if line.strip()])
    d = root.as_dict()['root']
    # this variable is storing the json output
    jsonOutput = json.dumps(d, indent = 4, sort_keys = False)
    print(jsonOutput)

That should yield some output like below:

[
    {
        "key1": "value1"
    },
    {
        "key2": [
            {
                "key2_1": "value2_1"
            },
            {
                "key2_2": {
                    "key2_2_1": "value2_2_1"
                }
            },
            {
                "key2_3": "value2_3"
            }
        ]
    },
    {
        "key3": "value3"
    }
]

Answered By: jrynes

From text doc to JSON with python

Question:

Answers: