Python Correctly Parse a Complex Object into a JSON format
Question:
I have the following class, which I’d like to serialize to JSON. The class also holds a list of Item objects.
class Item(JSONEncoder):
    """Tree node with a type, content, number, parent link and child list."""

    def __init__(self):
        # JSONEncoder.__init__ is not called here, so __dict__ contains
        # exactly the five fields assigned below.
        self.Type = ''
        self.Content = ''
        self.N = None
        self.Parent = None
        self.Items = []

    def reprJSON(self):
        """Return a dict with one entry per instance attribute.

        A value that itself has a ``reprJSON`` attribute is replaced by the
        result of calling it; every other value is stored unchanged.  A list
        has no ``reprJSON`` attribute, so ``Items`` is passed through as-is
        and its elements stay raw ``Item`` objects.
        """
        return {
            name: (value.reprJSON() if hasattr(value, "reprJSON") else value)
            for name, value in self.__dict__.items()
        }
So, when I try to serialize an instance of the Item class with root.reprJSON(),
I get the following result.
{'Type': 'root',
'Content': '',
'N': 'root',
'Parent': None,
'Items': [<Item.Item at 0x10575fb3c88>,
<Item.Item at 0x10575fb3e10>,
<Item.Item at 0x10575fb3eb8>,
<Item.Item at 0x10575fbc080>,
<Item.Item at 0x10575fbc2b0>,
<Item.Item at 0x10575fc6a20>,
<Item.Item at 0x10575fc6a58>,
<Item.Item at 0x10575fc6b70>,
<Item.Item at 0x10575fc6be0>,
<Item.Item at 0x10575fc6c50>,
<Item.Item at 0x10575fc6da0>,
<Item.Item at 0x10575fc6fd0>,
<Item.Item at 0x10575fcb128>,
<Item.Item at 0x10575fcb358>,
<Item.Item at 0x10575fcba90>,
<Item.Item at 0x10575fcbb00>,
<Item.Item at 0x10575fcbb70>,
<Item.Item at 0x10575fcbc18>,
<Item.Item at 0x10575fcbda0>,
<Item.Item at 0x10575fcbfd0>,
<Item.Item at 0x10575fd3208>,
<Item.Item at 0x10575fd34a8>,
<Item.Item at 0x10575fd3550>,
<Item.Item at 0x10575fd35c0>,
<Item.Item at 0x10575fd36d8>,
<Item.Item at 0x10575fd37f0>,
<Item.Item at 0x10575fd3898>,
<Item.Item at 0x10575fd3940>,
<Item.Item at 0x10575fd39b0>,
<Item.Item at 0x10575fd3a20>,
<Item.Item at 0x10575fd3ac8>,
<Item.Item at 0x10575fd3b70>,
<Item.Item at 0x10575fd3c88>,
<Item.Item at 0x10575fd3d68>,
<Item.Item at 0x10575fd3dd8>,
<Item.Item at 0x10575fd3e10>,
<Item.Item at 0x10575fd3ef0>,
<Item.Item at 0x10575fdc080>,
<Item.Item at 0x10575fdc0b8>,
<Item.Item at 0x10575fdc128>,
<Item.Item at 0x10575fdc1d0>,
<Item.Item at 0x10575fdc240>,
<Item.Item at 0x10575fdc390>,
<Item.Item at 0x10575fdc438>,
<Item.Item at 0x10575fdc550>,
<Item.Item at 0x10575fdc5c0>,
<Item.Item at 0x10575fdc630>,
<Item.Item at 0x10575fdc6a0>,
<Item.Item at 0x10575fdc6d8>,
<Item.Item at 0x10575fdc780>,
<Item.Item at 0x10575fdc908>,
<Item.Item at 0x10575fdc9e8>,
<Item.Item at 0x10575fdca58>,
<Item.Item at 0x10575fdcac8>,
<Item.Item at 0x10575fdcb00>,
<Item.Item at 0x10575fdcba8>,
<Item.Item at 0x10575fdccc0>,
<Item.Item at 0x10575fdcd30>,
<Item.Item at 0x10575fdcda0>,
<Item.Item at 0x10575fdce48>,
<Item.Item at 0x10575fdceb8>,
<Item.Item at 0x10575fdcf28>,
<Item.Item at 0x10575fe22e8>,
<Item.Item at 0x10575fe2828>,
<Item.Item at 0x10575fe2940>,
<Item.Item at 0x10575fe2b70>,
<Item.Item at 0x10575fe2be0>,
<Item.Item at 0x10575fe2c88>,
<Item.Item at 0x10575fe2cc0>,
<Item.Item at 0x10575fe2cf8>]}
But I’d like to get the values of those items as well, all inside a single JSON object. I don’t know how to do that and would appreciate any help. Thank you.
Edit
The following code creates an instance of the Item class and fills it with data.
def Crawl(parsedPDF):
    """Build an Item tree from parsed markup and return its root.

    The markup is parsed with BeautifulSoup's "html.parser".  Every "head"
    element becomes a heading (via CreateHeading) and every "p" element is
    added under the most recent heading (via AddParagraph).

    Args:
        parsedPDF: Markup handed to BeautifulSoup.

    Returns:
        The root Item, whose Type and N are both "root".
    """
    root = Item()
    root.Type = root.N = "root"
    current_parent = current_head = root

    document = BeautifulSoup(parsedPDF, "html.parser")
    for tag in RemoveEmptyTags(document.body):
        for element in RemoveEmptyChild(tag.contents):
            kind = element.name
            if kind == "head":
                current_head = CreateHeading(root, current_parent, element)
                current_parent = current_head.Parent
            elif kind == "p":
                AddParagraph(current_head, element)
            # "figure", "figdesc", "table" and any other element are
            # currently ignored.
    return root
def AddParagraph(head, element):
    """Append one child Item to ``head`` per line of a paragraph element.

    The paragraph text is split by split_with_AplhabetizeBullets (on
    alphabetized bullet points, per the original comment); each resulting
    line becomes an Item whose Parent is ``head``.

    Args:
        head: Item that receives the new children in its ``Items`` list.
        element: Paragraph element exposing ``text`` and ``name``.
    """
    # NOTE(review): the pattern below looks as if its backslashes were lost
    # (perhaps r'\.\s\((.*?)\s\)') -- confirm against the original code.
    pieces = split_with_AplhabetizeBullets(element.text, '.s((.*?)s)')
    for text in pieces:
        child = Item()
        child.Type = element.name
        child.Content = text
        child.Parent = head
        head.Items.append(child)
def CreateHeading(root, parent, element):
    """Create an Item for a heading element and attach it to the tree.

    The item copies ``element.text`` and ``element.name`` and takes its
    number ``N`` from the element's "n" attribute when that is available.
    It is then appended either to ``parent.Items`` (sub-heading) or to
    ``root.Items`` (top-level heading).  A top-level heading is recorded as
    its own ``Parent``.

    Args:
        root: Item at the top of the tree.
        parent: Item the new heading is compared against.
        element: Heading element exposing ``text``, ``name`` and item access
            (presumably a BeautifulSoup Tag; confirm against the caller).

    Returns:
        The newly created Item.
    """
    item = Item()
    item.Content = element.text
    item.Type = element.name
    item.Parent = parent

    # The "n" attribute is optional: when it cannot be read, item.N keeps the
    # value Item() gave it.
    try:
        item.N = element["n"]
    except (KeyError, TypeError):
        pass

    if item.N is None:
        # Un-numbered heading.  Measure the first parenthesised fragment.
        # The previous code used r'(.*?)' (which only ever matches the empty
        # string) and len(result.group) (the bound method, a TypeError that
        # was silently swallowed), so this length was always 0.
        # NOTE(review): the escaped parentheses are inferred from the
        # variable name -- confirm the intended pattern.
        match = re.search(r'\((.*?)\)', item.Content)
        bracketTextLength = len(match.group()) if match else 0

        item.N = item.Content
        # Decide whether a heading without 'N' is a heading or a subheading.
        if len(item.Content) > 3 and bracketTextLength == 0:
            root.Items.append(item)
            item.Parent = item
        else:
            parent.Items.append(item)
    else:
        if parent.N is None:
            # NOTE(review): with parent rebound to item, the test below is
            # true and the item is appended to its own Items list without
            # ever being attached to root -- confirm this is intended.
            item.Parent = item
            parent = item
        # True when item.N begins with parent.N (a prefix test for strings).
        if parent.N in item.N[:len(parent.N)]:
            parent.Items.append(item)
        else:
            # No matching parent: file the heading directly under root.
            root.Items.append(item)
            item.Parent = item
    return item
Answers:
Looking at your code, you can adapt this demo solution; here I’m storing objects of a Demo class in the Items list. You need to add serialize()
and dumper()
methods to the Item class, and reprJSON
also needs to change so that it iterates over the Items list.
from json import JSONEncoder
class Demo:
    """Minimal sample object with a name and a list payload."""

    def __init__(self):
        self.name, self.demolist = '', []
class Item(JSONEncoder):
    """Tree node that can render itself and its children as plain data.

    ``reprJSON()`` returns a structure made only of dicts, lists and the
    leaf values stored on the objects, so that the result can be handed to
    ``json.dumps`` when those leaves are JSON-compatible.
    """

    def __init__(self):
        # JSONEncoder.__init__ is deliberately not called: it stores encoder
        # settings on the instance, and those would then appear in the
        # reprJSON() output, which walks self.__dict__.
        self.Type = ''
        self.Content = ''
        self.N = None
        self.Parent = None
        self.Items = []

    def reprJSON(self):
        """Return this item as a dict, converting nested values recursively.

        ``Parent`` is a back-reference, so following it would loop forever.
        A non-None parent is therefore represented by its ``N`` attribute
        (``None`` when the parent has no ``N``).  Empty lists are kept as
        ``[]``.
        """
        d = {}
        for a, v in self.__dict__.items():
            if a == 'Parent' and v is not None:
                d[a] = getattr(v, 'N', None)
            else:
                d[a] = self.dumper(v)
        return d

    def serialize(self):
        """Return the same recursive dict form as ``reprJSON()``."""
        return self.reprJSON()

    @staticmethod
    def dumper(obj):
        """Convert ``obj`` into dicts/lists/leaf values.

        Lists and tuples become lists and dicts are copied, with every
        element converted in turn.  Objects offering ``serialize()`` are
        asked to serialize themselves, other objects with a ``__dict__`` are
        converted attribute by attribute, and anything else is returned
        unchanged.
        """
        if isinstance(obj, (list, tuple)):
            return [Item.dumper(i) for i in obj]
        if isinstance(obj, dict):
            return {k: Item.dumper(v) for k, v in obj.items()}
        if hasattr(obj, 'serialize'):
            return obj.serialize()
        if hasattr(obj, '__dict__'):
            return {k: Item.dumper(v) for k, v in vars(obj).items()}
        return obj
from pprint import pprint

# First sample child: demolist holds nested dicts and lists.
d1 = Demo()
d1.name = 'akash'
d1.demolist = [
    {'good': [4, 6, 5], 'yyy': 'why'},
    {'ho': {'ksks': '333'}},
]

# Second sample child: demolist is a flat list of ints.
d2 = Demo()
d2.name = 'heheh'
d2.demolist = [4, 6, 1111]

# Attach both children to an Item and pretty-print its dict form.
itemobj = Item()
itemobj.Items.append(d1)
itemobj.Items.append(d2)
pprint(itemobj.reprJSON())
Output:
{'Content': '',
'Items': [{'demolist': [{'good': [4, 6, 5], 'yyy': 'why'},
{'ho': {'ksks': '333'}}],
'name': 'akash'},
{'demolist': [4, 6, 1111], 'name': 'heheh'}],
'N': None,
'Parent': None,
'Type': ''}
pip install jsonwhatever
from jsonwhatever import jsonwhatever as jw
class Item:
    """Plain record with type, content, number, parent and child fields."""

    def __init__(self):
        self.Type = self.Content = ''
        self.N = None
        # Kept as None instead of referencing the parent object, to avoid
        # infinite recursion.
        self.Parent = None
        # None by default (rather than a list) to stop the recursion; assign
        # a list once there are children.
        self.Items = None
# Child item carrying some sample values.
obj01 = Item()
obj01.Type = '01'
obj01.Content = 'stuff'
obj01.N = 9
obj01.Parent = None

# Parent item whose Items list holds the single child.
list_objects = [obj01]
obj = Item()
obj.Items = list_objects

# Hand the parent to jsonwhatever under the name 'list_of_items' and print
# the result (presumably a JSON string -- see the library's documentation).
json_string = jw.jsonwhatever('list_of_items', obj)
print(json_string)
I have the following class, which I’d like to serialize to JSON. The class also holds a list of Item objects.
class Item(JSONEncoder):
    """Tree node with a type, content, number, parent link and child list."""

    def __init__(self):
        # JSONEncoder.__init__ is not called here, so __dict__ contains
        # exactly the five fields assigned below.
        self.Type = ''
        self.Content = ''
        self.N = None
        self.Parent = None
        self.Items = []

    def reprJSON(self):
        """Return a dict with one entry per instance attribute.

        A value that itself has a ``reprJSON`` attribute is replaced by the
        result of calling it; every other value is stored unchanged.  A list
        has no ``reprJSON`` attribute, so ``Items`` is passed through as-is
        and its elements stay raw ``Item`` objects.
        """
        return {
            name: (value.reprJSON() if hasattr(value, "reprJSON") else value)
            for name, value in self.__dict__.items()
        }
So, when I try to serialize an instance of the Item class with root.reprJSON(),
I get the following result.
{'Type': 'root',
'Content': '',
'N': 'root',
'Parent': None,
'Items': [<Item.Item at 0x10575fb3c88>,
<Item.Item at 0x10575fb3e10>,
<Item.Item at 0x10575fb3eb8>,
<Item.Item at 0x10575fbc080>,
<Item.Item at 0x10575fbc2b0>,
<Item.Item at 0x10575fc6a20>,
<Item.Item at 0x10575fc6a58>,
<Item.Item at 0x10575fc6b70>,
<Item.Item at 0x10575fc6be0>,
<Item.Item at 0x10575fc6c50>,
<Item.Item at 0x10575fc6da0>,
<Item.Item at 0x10575fc6fd0>,
<Item.Item at 0x10575fcb128>,
<Item.Item at 0x10575fcb358>,
<Item.Item at 0x10575fcba90>,
<Item.Item at 0x10575fcbb00>,
<Item.Item at 0x10575fcbb70>,
<Item.Item at 0x10575fcbc18>,
<Item.Item at 0x10575fcbda0>,
<Item.Item at 0x10575fcbfd0>,
<Item.Item at 0x10575fd3208>,
<Item.Item at 0x10575fd34a8>,
<Item.Item at 0x10575fd3550>,
<Item.Item at 0x10575fd35c0>,
<Item.Item at 0x10575fd36d8>,
<Item.Item at 0x10575fd37f0>,
<Item.Item at 0x10575fd3898>,
<Item.Item at 0x10575fd3940>,
<Item.Item at 0x10575fd39b0>,
<Item.Item at 0x10575fd3a20>,
<Item.Item at 0x10575fd3ac8>,
<Item.Item at 0x10575fd3b70>,
<Item.Item at 0x10575fd3c88>,
<Item.Item at 0x10575fd3d68>,
<Item.Item at 0x10575fd3dd8>,
<Item.Item at 0x10575fd3e10>,
<Item.Item at 0x10575fd3ef0>,
<Item.Item at 0x10575fdc080>,
<Item.Item at 0x10575fdc0b8>,
<Item.Item at 0x10575fdc128>,
<Item.Item at 0x10575fdc1d0>,
<Item.Item at 0x10575fdc240>,
<Item.Item at 0x10575fdc390>,
<Item.Item at 0x10575fdc438>,
<Item.Item at 0x10575fdc550>,
<Item.Item at 0x10575fdc5c0>,
<Item.Item at 0x10575fdc630>,
<Item.Item at 0x10575fdc6a0>,
<Item.Item at 0x10575fdc6d8>,
<Item.Item at 0x10575fdc780>,
<Item.Item at 0x10575fdc908>,
<Item.Item at 0x10575fdc9e8>,
<Item.Item at 0x10575fdca58>,
<Item.Item at 0x10575fdcac8>,
<Item.Item at 0x10575fdcb00>,
<Item.Item at 0x10575fdcba8>,
<Item.Item at 0x10575fdccc0>,
<Item.Item at 0x10575fdcd30>,
<Item.Item at 0x10575fdcda0>,
<Item.Item at 0x10575fdce48>,
<Item.Item at 0x10575fdceb8>,
<Item.Item at 0x10575fdcf28>,
<Item.Item at 0x10575fe22e8>,
<Item.Item at 0x10575fe2828>,
<Item.Item at 0x10575fe2940>,
<Item.Item at 0x10575fe2b70>,
<Item.Item at 0x10575fe2be0>,
<Item.Item at 0x10575fe2c88>,
<Item.Item at 0x10575fe2cc0>,
<Item.Item at 0x10575fe2cf8>]}
But I’d like to get the values of those items as well, all inside a single JSON object. I don’t know how to do that and would appreciate any help. Thank you.
Edit
The following code creates an instance of the Item class and fills it with data.
def Crawl(parsedPDF):
    """Build an Item tree from parsed markup and return its root.

    The markup is parsed with BeautifulSoup's "html.parser".  Every "head"
    element becomes a heading (via CreateHeading) and every "p" element is
    added under the most recent heading (via AddParagraph).

    Args:
        parsedPDF: Markup handed to BeautifulSoup.

    Returns:
        The root Item, whose Type and N are both "root".
    """
    root = Item()
    root.Type = root.N = "root"
    current_parent = current_head = root

    document = BeautifulSoup(parsedPDF, "html.parser")
    for tag in RemoveEmptyTags(document.body):
        for element in RemoveEmptyChild(tag.contents):
            kind = element.name
            if kind == "head":
                current_head = CreateHeading(root, current_parent, element)
                current_parent = current_head.Parent
            elif kind == "p":
                AddParagraph(current_head, element)
            # "figure", "figdesc", "table" and any other element are
            # currently ignored.
    return root
def AddParagraph(head, element):
    """Append one child Item to ``head`` per line of a paragraph element.

    The paragraph text is split by split_with_AplhabetizeBullets (on
    alphabetized bullet points, per the original comment); each resulting
    line becomes an Item whose Parent is ``head``.

    Args:
        head: Item that receives the new children in its ``Items`` list.
        element: Paragraph element exposing ``text`` and ``name``.
    """
    # NOTE(review): the pattern below looks as if its backslashes were lost
    # (perhaps r'\.\s\((.*?)\s\)') -- confirm against the original code.
    pieces = split_with_AplhabetizeBullets(element.text, '.s((.*?)s)')
    for text in pieces:
        child = Item()
        child.Type = element.name
        child.Content = text
        child.Parent = head
        head.Items.append(child)
def CreateHeading(root, parent, element):
    """Create an Item for a heading element and attach it to the tree.

    The item copies ``element.text`` and ``element.name`` and takes its
    number ``N`` from the element's "n" attribute when that is available.
    It is then appended either to ``parent.Items`` (sub-heading) or to
    ``root.Items`` (top-level heading).  A top-level heading is recorded as
    its own ``Parent``.

    Args:
        root: Item at the top of the tree.
        parent: Item the new heading is compared against.
        element: Heading element exposing ``text``, ``name`` and item access
            (presumably a BeautifulSoup Tag; confirm against the caller).

    Returns:
        The newly created Item.
    """
    item = Item()
    item.Content = element.text
    item.Type = element.name
    item.Parent = parent

    # The "n" attribute is optional: when it cannot be read, item.N keeps the
    # value Item() gave it.
    try:
        item.N = element["n"]
    except (KeyError, TypeError):
        pass

    if item.N is None:
        # Un-numbered heading.  Measure the first parenthesised fragment.
        # The previous code used r'(.*?)' (which only ever matches the empty
        # string) and len(result.group) (the bound method, a TypeError that
        # was silently swallowed), so this length was always 0.
        # NOTE(review): the escaped parentheses are inferred from the
        # variable name -- confirm the intended pattern.
        match = re.search(r'\((.*?)\)', item.Content)
        bracketTextLength = len(match.group()) if match else 0

        item.N = item.Content
        # Decide whether a heading without 'N' is a heading or a subheading.
        if len(item.Content) > 3 and bracketTextLength == 0:
            root.Items.append(item)
            item.Parent = item
        else:
            parent.Items.append(item)
    else:
        if parent.N is None:
            # NOTE(review): with parent rebound to item, the test below is
            # true and the item is appended to its own Items list without
            # ever being attached to root -- confirm this is intended.
            item.Parent = item
            parent = item
        # True when item.N begins with parent.N (a prefix test for strings).
        if parent.N in item.N[:len(parent.N)]:
            parent.Items.append(item)
        else:
            # No matching parent: file the heading directly under root.
            root.Items.append(item)
            item.Parent = item
    return item
Looking at your code, you can adapt this demo solution; here I’m storing objects of a Demo class in the Items list. You need to add serialize()
and dumper()
methods to the Item class, and reprJSON
also needs to change so that it iterates over the Items list.
from json import JSONEncoder
class Demo:
    """Minimal sample object with a name and a list payload."""

    def __init__(self):
        self.name, self.demolist = '', []
class Item(JSONEncoder):
    """Tree node that can render itself and its children as plain data.

    ``reprJSON()`` returns a structure made only of dicts, lists and the
    leaf values stored on the objects, so that the result can be handed to
    ``json.dumps`` when those leaves are JSON-compatible.
    """

    def __init__(self):
        # JSONEncoder.__init__ is deliberately not called: it stores encoder
        # settings on the instance, and those would then appear in the
        # reprJSON() output, which walks self.__dict__.
        self.Type = ''
        self.Content = ''
        self.N = None
        self.Parent = None
        self.Items = []

    def reprJSON(self):
        """Return this item as a dict, converting nested values recursively.

        ``Parent`` is a back-reference, so following it would loop forever.
        A non-None parent is therefore represented by its ``N`` attribute
        (``None`` when the parent has no ``N``).  Empty lists are kept as
        ``[]``.
        """
        d = {}
        for a, v in self.__dict__.items():
            if a == 'Parent' and v is not None:
                d[a] = getattr(v, 'N', None)
            else:
                d[a] = self.dumper(v)
        return d

    def serialize(self):
        """Return the same recursive dict form as ``reprJSON()``."""
        return self.reprJSON()

    @staticmethod
    def dumper(obj):
        """Convert ``obj`` into dicts/lists/leaf values.

        Lists and tuples become lists and dicts are copied, with every
        element converted in turn.  Objects offering ``serialize()`` are
        asked to serialize themselves, other objects with a ``__dict__`` are
        converted attribute by attribute, and anything else is returned
        unchanged.
        """
        if isinstance(obj, (list, tuple)):
            return [Item.dumper(i) for i in obj]
        if isinstance(obj, dict):
            return {k: Item.dumper(v) for k, v in obj.items()}
        if hasattr(obj, 'serialize'):
            return obj.serialize()
        if hasattr(obj, '__dict__'):
            return {k: Item.dumper(v) for k, v in vars(obj).items()}
        return obj
from pprint import pprint

# First sample child: demolist holds nested dicts and lists.
d1 = Demo()
d1.name = 'akash'
d1.demolist = [
    {'good': [4, 6, 5], 'yyy': 'why'},
    {'ho': {'ksks': '333'}},
]

# Second sample child: demolist is a flat list of ints.
d2 = Demo()
d2.name = 'heheh'
d2.demolist = [4, 6, 1111]

# Attach both children to an Item and pretty-print its dict form.
itemobj = Item()
itemobj.Items.append(d1)
itemobj.Items.append(d2)
pprint(itemobj.reprJSON())
Output:
{'Content': '',
'Items': [{'demolist': [{'good': [4, 6, 5], 'yyy': 'why'},
{'ho': {'ksks': '333'}}],
'name': 'akash'},
{'demolist': [4, 6, 1111], 'name': 'heheh'}],
'N': None,
'Parent': None,
'Type': ''}
pip install jsonwhatever
from jsonwhatever import jsonwhatever as jw
class Item:
    """Plain record with type, content, number, parent and child fields."""

    def __init__(self):
        self.Type = self.Content = ''
        self.N = None
        # Kept as None instead of referencing the parent object, to avoid
        # infinite recursion.
        self.Parent = None
        # None by default (rather than a list) to stop the recursion; assign
        # a list once there are children.
        self.Items = None
# Child item carrying some sample values.
obj01 = Item()
obj01.Type = '01'
obj01.Content = 'stuff'
obj01.N = 9
obj01.Parent = None

# Parent item whose Items list holds the single child.
list_objects = [obj01]
obj = Item()
obj.Items = list_objects

# Hand the parent to jsonwhatever under the name 'list_of_items' and print
# the result (presumably a JSON string -- see the library's documentation).
json_string = jw.jsonwhatever('list_of_items', obj)
print(json_string)