From text file to JSON file with python

Question:

Suppose I have a txt file that looks like this (indentation is 4 spaces):

key1=value1
key2
    key2_1=value2_1
    key2_2
        key2_2_1=value2_2_1
    key2_3=value2_3_1,value2_3_2,value2_3_3
key3=value3_1,value3_2,value3_3

I want to convert it into any VALID json, like this one:

{
'key1':'value1',
'key2': {
    'key2_1':'value2_1',
    'key2_2':{
        'key2_2_1':'value2_2_1'
        },
    'key2_3':['value2_3_1','value2_3_2','value2_3_3']
    },
'key3':['value3_1','value3_2','value3_3']
}

I have tried this (which I got from another post):

# helper method to convert equals sign to indentation for easier parsing
def convertIndentation(inputString):
    """Rewrite ``key=value`` lines as a key line plus an indented value line.

    Args:
        inputString: list of text lines (e.g. from ``readlines()``). The
            list itself is not modified.

    Returns:
        A single string. Lines without ``=`` are passed through unchanged.
        Each line containing ``=`` is split at the first ``=``: the key
        (with its original indentation) is emitted on its own line, and the
        value follows on the next line, four spaces deeper than the key.
    """
    indentVal = "    "
    outputLines = []
    for eachLine in inputString:
        if "=" not in eachLine:
            outputLines.append(eachLine)
            continue
        key, value = eachLine.split("=", 1)
        # Reuse the key's own leading whitespace so the value always ends up
        # exactly one indentation step deeper than its key.
        leading = key[:len(key) - len(key.lstrip())]
        # The key line must end with a real newline; otherwise the key and
        # its value are fused into one line when the pieces are joined.
        outputLines.append(key + "\n")
        # Appending to a separate list (rather than inserting into the list
        # being iterated) keeps a value that itself contains "=" intact.
        outputLines.append(leading + indentVal + value)
    return "".join(outputLines)

# helper class for node usage
class Node:
    """One line of indented text plus the lines nested beneath it."""

    def __init__(self, indented_line):
        """Store the line's indentation width and its stripped text."""
        self.children = []
        # Depth is measured as the number of leading whitespace characters.
        self.level = len(indented_line) - len(indented_line.lstrip())
        self.text = indented_line.strip()

    def add_children(self, nodes):
        """Attach the leading nodes of ``nodes`` that belong under this node.

        ``nodes`` is consumed from the front, in place. The level of the
        first node defines the level of the direct children. Processing
        stops, leaving the remaining nodes in the list, at the first node
        that is below the child level and not indented deeper than this node.
        An empty list is accepted and leaves the node unchanged.
        """
        if not nodes:
            return
        childlevel = nodes[0].level

        while nodes:
            node = nodes[0]
            if node.level > childlevel:
                # Deeper than a direct child: hand the list to the most
                # recent child, which consumes its own descendants.
                self.children[-1].add_children(nodes)
            elif node.level == childlevel or node.level > self.level:
                # A direct child. A line that dedents only part-way (still
                # deeper than this node) is kept as a child instead of being
                # silently discarded.
                self.children.append(nodes.pop(0))
            else:
                # This node is a sibling (or higher): no more children here.
                return

    def as_dict(self):
        """Return the subtree as nested plain Python objects.

        A node without children yields its text; a node with one child
        yields ``{text: child}``; a node with several children yields
        ``{text: [child, child, ...]}``.
        """
        if len(self.children) > 1:
            return {self.text: [node.as_dict() for node in self.children]}
        elif len(self.children) == 1:
            return {self.text: self.children[0].as_dict()}
        else:
            return self.text

# process our file here
with open(filename, 'r') as fh:
    # only the read needs the open handle
    fileContent = fh.readlines()

# convert equals signs to indentation
fileParse = convertIndentation(fileContent)

# hang every non-blank line under a synthetic 'root' node
root = Node('root')
root.add_children([Node(line) for line in fileParse.splitlines() if line.strip()])
d = root.as_dict()['root']

# this variable is storing the json output
jsonOutput = json.dumps(d, indent=4, sort_keys=False)
print(jsonOutput)

which yields the following:

[
    {
        "key1": "value1"
    },
    {
        "key2": [
            {
                "key2_1": "value2_1"
            },
            {
                "key2_2": {
                    "key2_2_1": "value2_2_1"
                }
            },
            {
                "key2_3": "value2_3_1,value2_3_2,value2_3_3"
            },
        ]
    },
    {
        "key3": "value3_1,value3_2,value3_3"
    }
]

Yet this is still not a valid JSON file.

When I try to open the output file using ‘json’ module, I get this predictable message: "JSONDecodeError: Expecting property name enclosed in double quotes: line 10 column 5 (char 165)".

# NOTE(review): the path below contains no directory separators; the
# backslashes of the original Windows path were presumably lost when this
# snippet was copied. Restore them before running.
with open(r'C:UsersnigelOneDriveDocumentsLABleansample_01.02_R00.json', 'r', encoding='utf-8') as read_file:
    # Parse the whole file as a single JSON document.
    data = json.load(read_file)

output:

JSONDecodeError                           Traceback (most recent call last)
Input In [2], in <cell line: 1>()
      1 with open(r'C:UsersnigelOneDriveDocumentsLABleansample_01.02_R00.json', 'r', encoding='utf-8') as read_file:
----> 2     data = json.load(read_file)

File ~\Anaconda3\lib\json\__init__.py:293, in load(fp, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
    274 def load(fp, *, cls=None, object_hook=None, parse_float=None,
    275         parse_int=None, parse_constant=None, object_pairs_hook=None, **kw):
    276     """Deserialize ``fp`` (a ``.read()``-supporting file-like object containing
    277     a JSON document) to a Python object.
    278 
   (...)
    291     kwarg; otherwise ``JSONDecoder`` is used.
    292     """
--> 293     return loads(fp.read(),
    294         cls=cls, object_hook=object_hook,
    295         parse_float=parse_float, parse_int=parse_int,
    296         parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)

File ~\Anaconda3\lib\json\__init__.py:346, in loads(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
    341     s = s.decode(detect_encoding(s), 'surrogatepass')
    343 if (cls is None and object_hook is None and
    344         parse_int is None and parse_float is None and
    345         parse_constant is None and object_pairs_hook is None and not kw):
--> 346     return _default_decoder.decode(s)
    347 if cls is None:
    348     cls = JSONDecoder

File ~\Anaconda3\lib\json\decoder.py:337, in JSONDecoder.decode(self, s, _w)
    332 def decode(self, s, _w=WHITESPACE.match):
    333     """Return the Python representation of ``s`` (a ``str`` instance
    334     containing a JSON document).
    335 
    336     """
--> 337     obj, end = self.raw_decode(s, idx=_w(s, 0).end())
    338     end = _w(s, end).end()
    339     if end != len(s):

File ~\Anaconda3\lib\json\decoder.py:353, in JSONDecoder.raw_decode(self, s, idx)
    344 """Decode a JSON document from ``s`` (a ``str`` beginning with
    345 a JSON document) and return a 2-tuple of the Python
    346 representation and the index in ``s`` where the document ended.
   (...)
    350 
    351 """
    352 try:
--> 353     obj, end = self.scan_once(s, idx)
    354 except StopIteration as err:
    355     raise JSONDecodeError("Expecting value", s, err.value) from None

JSONDecodeError: Expecting property name enclosed in double quotes: line 10 column 5 (char 165)

The reason is that JSON expects to find keys (strings enclosed in double quotes) when it actually finds json objects (nested dictionaries) in their places. That is it!

I truly appreciate any comments. Best,

Nigel

Asked By: NigelBlainey

||

Answers:

An aside for users who land on this page: I could not reproduce the error that the OP posted. json.dumps() is very unlikely to output invalid JSON. This answer is merely an attempt to help the poster.

Splitting The Strings Into Lists

I am assuming per your comment that you mean that you want to take your strings, for example, this line
key2_3=value2_3_1,value2_3_2,value2_3_3
and break these values up into "key2_3": ["value2_3_1", "value2_3_2", "value2_3_3"].

To do so, you’d have to make the following adjustment to the code provided to you:

def as_dict(self):
    """Return this node's subtree as nested plain Python objects.

    A node with several children yields ``{text: [child, ...]}``, a node
    with one child yields ``{text: child}``, and a childless node yields
    its text, split into a list when it holds comma-separated values.
    """
    if len(self.children) > 1:
        return {self.text: [node.as_dict() for node in self.children]}
    elif len(self.children) == 1:
        return {self.text: self.children[0].as_dict()}
    else:
        # Split only when a comma is present, so a single value stays a
        # plain string instead of becoming a one-element list.
        return self.text.split(",") if "," in self.text else self.text # was self.text

Dictionaries of Dictionaries Instead of Lists

To make the output a dictionary of dictionaries whose leaf values may be lists, i.e. {k1: {k2: [1, 2, 3]}}, we have to make 2 changes.

  1. Update the as_dict method to use {} instead of [].
  2. Include a function to compress keys.

When I was doing this, I had a hard time producing the correct data structure: every key was duplicated, so the result looked like {k1: {k1: {k2: {k2: value}}}}. You can see this by skipping the compress step, i.e. by replacing d = compress(root.as_dict()['root']) with d = root.as_dict()['root'] in the code. So the code went from

def as_dict(self):
    """Return the subtree as nested objects, listing multiple children.

    Childless nodes give their text (split on commas when any are
    present); a single child gives ``{text: child}``; several children
    give ``{text: [child, ...]}``.
    """
    kids = self.children
    if not kids:
        label = self.text
        if "," in label:
            return label.split(",")
        return label
    if len(kids) == 1:
        only_child = kids[0]
        return {self.text: only_child.as_dict()}
    return {self.text: [kid.as_dict() for kid in kids]}

to

def as_dict(self):
    """Return the subtree as nested objects, keying multiple children by text.

    Childless nodes give their text (split on commas when any are
    present); a single child gives ``{text: child}``; several children
    give ``{text: {child_text: child, ...}}``.
    """
    kids = self.children
    if not kids:
        label = self.text
        if "," in label:
            return label.split(",")
        return label
    if len(kids) == 1:
        only_child = kids[0]
        return {self.text: only_child.as_dict()}
    by_text = {kid.text: kid.as_dict() for kid in kids}
    return {self.text: by_text}

, then I included the compress function

# for merging like sub keys and values
def compress(dictionary):
    """Collapse ``{k: {k: value}}`` wrappers into ``{k: value}``, recursively.

    Args:
        dictionary: the object to clean up. Anything that is not a dict is
            returned untouched.

    Returns:
        The same object that was passed in; dicts are modified in place.
    """
    if not isinstance(dictionary, dict):
        return dictionary
    for k, v in dictionary.items():
        if not isinstance(v, dict):
            continue
        if k in v:
            # Only the value of an existing key is replaced, so the dict
            # keeps its size and can safely be modified while iterating.
            dictionary[k] = v.pop(k)
        # Recurse into whatever is stored under k now. Keys themselves are
        # never dicts, so there is nothing to compress in them.
        compress(dictionary[k])
    return dictionary

Full Code

If you put the code below in a file and run it from the command line, it should work 100%. Otherwise it's probably a problem with Anaconda or the version of Python (though that doesn’t really seem likely).

from io import StringIO
import json

# for merging like sub keys and values
def compress(dictionary):
    """Collapse ``{k: {k: value}}`` wrappers into ``{k: value}``, recursively.

    Args:
        dictionary: the object to clean up. Anything that is not a dict is
            returned untouched.

    Returns:
        The same object that was passed in; dicts are modified in place.
    """
    if not isinstance(dictionary, dict):
        return dictionary
    for k, v in dictionary.items():
        if not isinstance(v, dict):
            continue
        if k in v:
            # Only the value of an existing key is replaced, so the dict
            # keeps its size and can safely be modified while iterating.
            dictionary[k] = v.pop(k)
        # Recurse into whatever is stored under k now. Keys themselves are
        # never dicts, so there is nothing to compress in them.
        compress(dictionary[k])
    return dictionary

# helper method to convert equals sign to indentation for easier parsing
def convertIndentation(inputString):
    """Rewrite ``key=value`` lines as a key line plus an indented value line.

    Args:
        inputString: list of text lines (e.g. from ``readlines()``). The
            list itself is not modified.

    Returns:
        A single string. Lines without ``=`` are passed through unchanged.
        Each line containing ``=`` is split at the first ``=``: the key
        (with its original indentation) is emitted on its own line, and the
        value follows on the next line, four spaces deeper than the key.
    """
    indentVal = "    "
    outputLines = []
    for eachLine in inputString:
        if "=" not in eachLine:
            outputLines.append(eachLine)
            continue
        key, value = eachLine.split("=", 1)
        # Reuse the key's own leading whitespace so the value always ends up
        # exactly one indentation step deeper than its key.
        leading = key[:len(key) - len(key.lstrip())]
        # The key line must end with a real newline; otherwise the key and
        # its value are fused into one line when the pieces are joined.
        outputLines.append(key + "\n")
        # Appending to a separate list (rather than inserting into the list
        # being iterated) keeps a value that itself contains "=" intact.
        outputLines.append(leading + indentVal + value)
    return "".join(outputLines)



# helper class for node usage
class Node:
    """One line of indented text plus the lines nested beneath it."""

    def __init__(self, indented_line):
        """Store the line's indentation width and its stripped text."""
        self.children = []
        # Depth is measured as the number of leading whitespace characters.
        self.level = len(indented_line) - len(indented_line.lstrip())
        self.text = indented_line.strip()

    def add_children(self, nodes):
        """Attach the leading nodes of ``nodes`` that belong under this node.

        ``nodes`` is consumed from the front, in place. The level of the
        first node defines the level of the direct children. Processing
        stops, leaving the remaining nodes in the list, at the first node
        that is below the child level and not indented deeper than this node.
        An empty list is accepted and leaves the node unchanged.
        """
        if not nodes:
            return
        childlevel = nodes[0].level
        while nodes:
            node = nodes[0]
            if node.level > childlevel:
                # Deeper than a direct child: hand the list to the most
                # recent child, which consumes its own descendants.
                self.children[-1].add_children(nodes)
            elif node.level == childlevel or node.level > self.level:
                # A direct child. A line that dedents only part-way (still
                # deeper than this node) is kept as a child instead of being
                # silently discarded.
                self.children.append(nodes.pop(0))
            else:
                # This node is a sibling (or higher): no more children here.
                return

    def as_dict(self):
        """Return the subtree as nested plain Python objects.

        A childless node yields its text, split into a list when it contains
        commas. A node with one child yields ``{text: child}``. A node with
        several children yields ``{text: {child_text: child, ...}}``; since
        each child's own result is already keyed by its text, keys appear
        twice in a row at those levels.
        """
        if len(self.children) > 1:
            return {self.text: {node.text: node.as_dict() for node in self.children}}
        elif len(self.children) == 1:
            return {self.text: self.children[0].as_dict()}
        else:
            return self.text.split(",") if "," in self.text else self.text

if __name__ == "__main__":

    s = """
        key1=value1
        key2
            key2_1=value2_1
            key2_2
                key2_2_1
                    key2_2_1_1=value2_2_1_1
            key2_3=value2_3_1,value2_3_2,value2_3_3
        key3=value3_1,value3_2,value3_3
    """

    # read the sample text line by line, as if it were a file
    fileContent = StringIO(s).readlines()
    # convert equals signs to indentation
    fileParse = convertIndentation(fileContent)

    # hang every non-blank line under a synthetic 'root' node
    nodes = [Node(line) for line in fileParse.splitlines() if line.strip()]
    root = Node('root')
    root.add_children(nodes)
    d = compress(root.as_dict()['root'])

    # this variable is storing the json output
    jsonOutput = json.dumps(d, indent=4, sort_keys=False)

    # load the "file": parse the output back through json.load
    loaded = json.load(StringIO(jsonOutput))

    for item in (s, jsonOutput, loaded):
        print(item)
Answered By: Shmack
Categories: questions Tags: , , , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.