Process the python dictionary to remove undesired elements and retain desired ones
Question:
I have a python dictionary as given below:
# Sample input: maps each PDF name to a mapping of image name -> (label, text).
# The labels used below are "FP", "NP" and "Others".
ip = {
"doc1.pdf": {
"img1.png": ("FP", "text1"),
"img2.png": ("NP", "text2"),
"img3.png": ("FP", "text3"),  # lone FP at the end of the document
},
"doc2.pdf": {
"img1.png": ("FP", "text4"),
"img2.png": ("NP", "text5"),
"img3.png": ("NP", "text6"),
"img4.png": ("NP", "text7"),
"img5.png": ("Others", "text8"),  # "Others" entry between two FP/NP runs
"img6.png": ("FP", "text9"),
"img7.png": ("NP", "text10"),
},
"doc3.pdf": {
"img1.png": ("Others", "text8"),
"img2.png": ("FP", "text9"),
"img3.png": ("Others", "text10"),
"img4.png": ("FP", "text11"),
},
"doc4.pdf": {
"img1.png": ("FP", "text12"),
"img2.png": ("Others", "text13"),
"img3.png": ("Others", "text14"),
"img4.png": ("Others", "text15"),
},
"doc5.pdf": {
"img1.png": ("FP", "text16"),  # FP immediately followed by another FP
"img2.png": ("FP", "text17"),
"img3.png": ("NP", "text18"),
"img4.png": ("NP", "text19"),
},
}
Here the keyword FP means FirstPage, NP is NextPage and Others is OtherPage (which is not a part of the FP or NP). So FP and NP are sequential and hence FP will appear before NP. Now I want to segregate each sequential run of FPs and NPs from the other sequential runs of FPs and NPs.
I want to process the dictionary based on these rules:
- Remove all the elements that contain the keyword Others in the tuple.
- Next, I want to combine the elements that are sequential, i.e. consecutive FPs and NPs, into one dictionary. So if one or more NPs appear after an FP, then that FP and those NPs should be combined into one dictionary.
- If there is a lone FP with no NP following it, or if an FP (1) is followed by another FP (2), then FP (1) needs to be put in a separate dictionary.
Here is what the output would look like for the above input:
# Expected output for the sample input: each PDF name maps to a list of groups,
# where every group is a dict that starts with an "FP" entry followed by the
# "NP" entries that come after it. Entries labelled "Others" are left out.
op = {
    "doc1.pdf": [
        {
            "img1.png": ("FP", "text1"),
            "img2.png": ("NP", "text2"),
        },
        {
            "img3.png": ("FP", "text3"),
        },
    ],
    "doc2.pdf": [
        {
            "img1.png": ("FP", "text4"),
            "img2.png": ("NP", "text5"),
            "img3.png": ("NP", "text6"),
            "img4.png": ("NP", "text7"),
        },
        {
            "img6.png": ("FP", "text9"),
            "img7.png": ("NP", "text10"),
        },
    ],
    "doc3.pdf": [
        {
            "img2.png": ("FP", "text9"),
        },
        {
            "img4.png": ("FP", "text11"),
        },
    ],
    "doc4.pdf": [
        {
            "img1.png": ("FP", "text12"),
        },
    ],
    "doc5.pdf": [
        {
            "img1.png": ("FP", "text16"),
        },
        {
            "img2.png": ("FP", "text17"),
            "img3.png": ("NP", "text18"),
            "img4.png": ("NP", "text19"),
        },
    ],
}
So far I have tried this but it is not working:
def remove_others(ip_dict):
    """Drop "Others" entries and group each document's pages into FP/NP runs.

    Args:
        ip_dict: Mapping of document name to a mapping of image name to a
            ``(label, text)`` tuple.

    Returns:
        A dict mapping each document name to a list of dicts. Every dict holds
        one run of entries: the entry that opened the run plus all "NP"
        entries that followed it. Entries labelled "Others" are omitted.
    """
    op_dict = {}
    for doc, img_dict in ip_dict.items():
        temp_list = []
        current_group = []
        for img, (label, text) in img_dict.items():
            if label == "Others":
                continue
            # An NP extends whatever run is currently open. Checking only that
            # a run exists (rather than that the previous entry is an FP)
            # keeps several consecutive NPs in the same dictionary.
            if label == "NP" and current_group:
                current_group.append((img, (label, text)))
                continue
            # Anything else (an FP, or an NP with no open run) closes the
            # current run and starts a new one.
            if current_group:
                temp_list.append(dict(current_group))
            current_group = [(img, (label, text))]
        # Flush the run that was still open when the document ended.
        if current_group:
            temp_list.append(dict(current_group))
        op_dict[doc] = temp_list
    return op_dict
Any help is appreciated!
Answers:
This appears to do what you asked.
def split_on_FP(list_of_tuples):
    """Split ``(name, (label, text))`` pairs into groups that start at each "FP".

    Args:
        list_of_tuples: Iterable of ``(name, (label, text))`` pairs, in page
            order.

    Returns:
        A list of ``collections.OrderedDict`` objects. Each one maps name to
        ``(label, text)`` for one group; a new group begins at every "FP"
        label. Empty input yields an empty list.
    """
    # Imported locally so the function works without a module-level import.
    import collections

    result = []
    interm = collections.OrderedDict()
    for name, (k, v) in list_of_tuples:
        # An FP closes the group collected so far (if any) and opens a new one.
        if k == "FP" and interm:
            result.append(interm)
            interm = collections.OrderedDict()
        # Key by the image name; keying by the label would discard the names
        # and make consecutive NPs overwrite each other.
        interm[name] = (k, v)
    if interm:
        result.append(interm)
    return result
# Filter out every entry whose tuple contains "Others", group what remains of
# each document with split_on_FP, and print the resulting mapping.
print(
    {
        doc_name: split_on_FP(
            (img_name, page) for img_name, page in pages.items() if "Others" not in page
        )
        for doc_name, pages in ip.items()
    }
)
Instead of checking the label of the last entry in current_group, start a new dictionary whenever you see an FP label, and add the entries with other labels to that dictionary.
def remove_others(ip_dict):
    """Group each document's pages into dicts that start at every "FP" entry.

    Args:
        ip_dict: Mapping of document name to a mapping of image name to a
            ``(label, text)`` tuple.

    Returns:
        A dict mapping each document name to a list of dicts. A new dict is
        started at every "FP" entry and the non-"Others" entries after it are
        added to that dict. Entries labelled "Others" are skipped.
    """
    op_dict = {}
    for doc, img_dict in ip_dict.items():
        current_group = []
        # Reset per document so an entry can never be written into a group
        # that belongs to the previous document.
        current_item = None
        for img, (label, text) in img_dict.items():
            if label == "Others":
                continue
            # Open a new dict on every FP, and also when a non-FP entry shows
            # up before any FP in this document (there is no dict to extend).
            if label == "FP" or current_item is None:
                current_item = {img: (label, text)}
                current_group.append(current_item)
            else:
                current_item[img] = (label, text)
        op_dict[doc] = current_group
    return op_dict
Another solution:
# Replace each document's image mapping in `ip` with a list of page groups,
# then print the rewritten mapping.
for k, v in ip.items():
    out = []
    for img, (pg, text) in v.items():
        if pg == "FP":
            # A first page opens a new group.
            out.append({img: (pg, text)})
        elif pg == "NP":
            # A next page is added to the most recently opened group.
            out[-1][img] = (pg, text)
        # Entries with any other label are dropped.
    ip[k] = out
print(ip)
Prints:
{
"doc1.pdf": [
{"img1.png": ("FP", "text1"), "img2.png": ("NP", "text2")},
{"img3.png": ("FP", "text3")},
],
"doc2.pdf": [
{
"img1.png": ("FP", "text4"),
"img2.png": ("NP", "text5"),
"img3.png": ("NP", "text6"),
"img4.png": ("NP", "text7"),
},
{"img6.png": ("FP", "text9"), "img7.png": ("NP", "text10")},
],
"doc3.pdf": [{"img2.png": ("FP", "text9")}, {"img4.png": ("FP", "text11")}],
"doc4.pdf": [{"img1.png": ("FP", "text12")}],
"doc5.pdf": [
{"img1.png": ("FP", "text16")},
{
"img2.png": ("FP", "text17"),
"img3.png": ("NP", "text18"),
"img4.png": ("NP", "text19"),
},
],
}
I have a python dictionary as given below:
# Sample input: maps each PDF name to a mapping of image name -> (label, text).
# The labels used below are "FP", "NP" and "Others".
ip = {
"doc1.pdf": {
"img1.png": ("FP", "text1"),
"img2.png": ("NP", "text2"),
"img3.png": ("FP", "text3"),  # lone FP at the end of the document
},
"doc2.pdf": {
"img1.png": ("FP", "text4"),
"img2.png": ("NP", "text5"),
"img3.png": ("NP", "text6"),
"img4.png": ("NP", "text7"),
"img5.png": ("Others", "text8"),  # "Others" entry between two FP/NP runs
"img6.png": ("FP", "text9"),
"img7.png": ("NP", "text10"),
},
"doc3.pdf": {
"img1.png": ("Others", "text8"),
"img2.png": ("FP", "text9"),
"img3.png": ("Others", "text10"),
"img4.png": ("FP", "text11"),
},
"doc4.pdf": {
"img1.png": ("FP", "text12"),
"img2.png": ("Others", "text13"),
"img3.png": ("Others", "text14"),
"img4.png": ("Others", "text15"),
},
"doc5.pdf": {
"img1.png": ("FP", "text16"),  # FP immediately followed by another FP
"img2.png": ("FP", "text17"),
"img3.png": ("NP", "text18"),
"img4.png": ("NP", "text19"),
},
}
Here the keyword FP means FirstPage, NP is NextPage and Others is OtherPage (which is not a part of the FP or NP). So FP and NP are sequential and hence FP will appear before NP. Now I want to segregate each sequential run of FPs and NPs from the other sequential runs of FPs and NPs.
I want to process the dictionary based on these rules:
- Remove all the elements that contain the keyword Others in the tuple.
- Next, I want to combine the elements that are sequential, i.e. consecutive FPs and NPs, into one dictionary. So if one or more NPs appear after an FP, then that FP and those NPs should be combined into one dictionary.
- If there is a lone FP with no NP following it, or if an FP (1) is followed by another FP (2), then FP (1) needs to be put in a separate dictionary.
Here is what the output would look like for the above input:
# Expected output for the sample input: each PDF name maps to a list of groups,
# where every group is a dict that starts with an "FP" entry followed by the
# "NP" entries that come after it. Entries labelled "Others" are left out.
op = {
    "doc1.pdf": [
        {
            "img1.png": ("FP", "text1"),
            "img2.png": ("NP", "text2"),
        },
        {
            "img3.png": ("FP", "text3"),
        },
    ],
    "doc2.pdf": [
        {
            "img1.png": ("FP", "text4"),
            "img2.png": ("NP", "text5"),
            "img3.png": ("NP", "text6"),
            "img4.png": ("NP", "text7"),
        },
        {
            "img6.png": ("FP", "text9"),
            "img7.png": ("NP", "text10"),
        },
    ],
    "doc3.pdf": [
        {
            "img2.png": ("FP", "text9"),
        },
        {
            "img4.png": ("FP", "text11"),
        },
    ],
    "doc4.pdf": [
        {
            "img1.png": ("FP", "text12"),
        },
    ],
    "doc5.pdf": [
        {
            "img1.png": ("FP", "text16"),
        },
        {
            "img2.png": ("FP", "text17"),
            "img3.png": ("NP", "text18"),
            "img4.png": ("NP", "text19"),
        },
    ],
}
So far I have tried this but it is not working:
def remove_others(ip_dict):
    """Drop "Others" entries and group each document's pages into FP/NP runs.

    Args:
        ip_dict: Mapping of document name to a mapping of image name to a
            ``(label, text)`` tuple.

    Returns:
        A dict mapping each document name to a list of dicts. Every dict holds
        one run of entries: the entry that opened the run plus all "NP"
        entries that followed it. Entries labelled "Others" are omitted.
    """
    op_dict = {}
    for doc, img_dict in ip_dict.items():
        temp_list = []
        current_group = []
        for img, (label, text) in img_dict.items():
            if label == "Others":
                continue
            # An NP extends whatever run is currently open. Checking only that
            # a run exists (rather than that the previous entry is an FP)
            # keeps several consecutive NPs in the same dictionary.
            if label == "NP" and current_group:
                current_group.append((img, (label, text)))
                continue
            # Anything else (an FP, or an NP with no open run) closes the
            # current run and starts a new one.
            if current_group:
                temp_list.append(dict(current_group))
            current_group = [(img, (label, text))]
        # Flush the run that was still open when the document ended.
        if current_group:
            temp_list.append(dict(current_group))
        op_dict[doc] = temp_list
    return op_dict
Any help is appreciated!
This appears to do what you asked.
def split_on_FP(list_of_tuples):
    """Split ``(name, (label, text))`` pairs into groups that start at each "FP".

    Args:
        list_of_tuples: Iterable of ``(name, (label, text))`` pairs, in page
            order.

    Returns:
        A list of ``collections.OrderedDict`` objects. Each one maps name to
        ``(label, text)`` for one group; a new group begins at every "FP"
        label. Empty input yields an empty list.
    """
    # Imported locally so the function works without a module-level import.
    import collections

    result = []
    interm = collections.OrderedDict()
    for name, (k, v) in list_of_tuples:
        # An FP closes the group collected so far (if any) and opens a new one.
        if k == "FP" and interm:
            result.append(interm)
            interm = collections.OrderedDict()
        # Key by the image name; keying by the label would discard the names
        # and make consecutive NPs overwrite each other.
        interm[name] = (k, v)
    if interm:
        result.append(interm)
    return result
# Filter out every entry whose tuple contains "Others", group what remains of
# each document with split_on_FP, and print the resulting mapping.
print(
    {
        doc_name: split_on_FP(
            (img_name, page) for img_name, page in pages.items() if "Others" not in page
        )
        for doc_name, pages in ip.items()
    }
)
Instead of checking the label of the last entry in current_group, start a new dictionary whenever you see an FP label, and add the entries with other labels to that dictionary.
def remove_others(ip_dict):
    """Group each document's pages into dicts that start at every "FP" entry.

    Args:
        ip_dict: Mapping of document name to a mapping of image name to a
            ``(label, text)`` tuple.

    Returns:
        A dict mapping each document name to a list of dicts. A new dict is
        started at every "FP" entry and the non-"Others" entries after it are
        added to that dict. Entries labelled "Others" are skipped.
    """
    op_dict = {}
    for doc, img_dict in ip_dict.items():
        current_group = []
        # Reset per document so an entry can never be written into a group
        # that belongs to the previous document.
        current_item = None
        for img, (label, text) in img_dict.items():
            if label == "Others":
                continue
            # Open a new dict on every FP, and also when a non-FP entry shows
            # up before any FP in this document (there is no dict to extend).
            if label == "FP" or current_item is None:
                current_item = {img: (label, text)}
                current_group.append(current_item)
            else:
                current_item[img] = (label, text)
        op_dict[doc] = current_group
    return op_dict
Another solution:
# Replace each document's image mapping in `ip` with a list of page groups,
# then print the rewritten mapping.
for k, v in ip.items():
    out = []
    for img, (pg, text) in v.items():
        if pg == "FP":
            # A first page opens a new group.
            out.append({img: (pg, text)})
        elif pg == "NP":
            # A next page is added to the most recently opened group.
            out[-1][img] = (pg, text)
        # Entries with any other label are dropped.
    ip[k] = out
print(ip)
Prints:
{
"doc1.pdf": [
{"img1.png": ("FP", "text1"), "img2.png": ("NP", "text2")},
{"img3.png": ("FP", "text3")},
],
"doc2.pdf": [
{
"img1.png": ("FP", "text4"),
"img2.png": ("NP", "text5"),
"img3.png": ("NP", "text6"),
"img4.png": ("NP", "text7"),
},
{"img6.png": ("FP", "text9"), "img7.png": ("NP", "text10")},
],
"doc3.pdf": [{"img2.png": ("FP", "text9")}, {"img4.png": ("FP", "text11")}],
"doc4.pdf": [{"img1.png": ("FP", "text12")}],
"doc5.pdf": [
{"img1.png": ("FP", "text16")},
{
"img2.png": ("FP", "text17"),
"img3.png": ("NP", "text18"),
"img4.png": ("NP", "text19"),
},
],
}