Process the python dictionary to remove undesired elements and retain desired ones

Question:

I have a python dictionary as given below:

# Sample input: document name -> {image name: (label, text)}.
# Labels used below are "FP", "NP" and "Others".
ip = {
    "doc1.pdf": {
        "img1.png": ("FP", "text1"),
        "img2.png": ("NP", "text2"),
        "img3.png": ("FP", "text3"),
    },
    "doc2.pdf": {
        "img1.png": ("FP", "text4"),
        "img2.png": ("NP", "text5"),
        "img3.png": ("NP", "text6"),
        "img4.png": ("NP", "text7"),
        "img5.png": ("Others", "text8"),
        "img6.png": ("FP", "text9"),
        "img7.png": ("NP", "text10"),
    },
    "doc3.pdf": {
        "img1.png": ("Others", "text8"),
        "img2.png": ("FP", "text9"),
        "img3.png": ("Others", "text10"),
        "img4.png": ("FP", "text11"),
    },
    "doc4.pdf": {
        "img1.png": ("FP", "text12"),
        "img2.png": ("Others", "text13"),
        "img3.png": ("Others", "text14"),
        "img4.png": ("Others", "text15"),
    },
    "doc5.pdf": {
        "img1.png": ("FP", "text16"),
        "img2.png": ("FP", "text17"),
        "img3.png": ("NP", "text18"),
        "img4.png": ("NP", "text19"),
    },
}

Here the keyword FP means FirstPage, NP is NextPage and Others is OtherPage (which is not a part of the FP or NP). FP and NP are sequential, so an FP always appears before the NPs that belong to it. Now I want to separate each sequential run of an FP and its NPs from the other sequential runs of FPs and NPs.

I want to process the dictionary based on these rules:

  1. Remove all the elements that contain the keyword Others in the tuple present.
  2. Next, I want to combine the sequential elements, i.e. consecutive FPs and NPs, into one dictionary. So if one or more NPs appear after an FP, then that FP and those NPs should be combined into one dictionary.
  3. If there is a lone FP with no NP following it, or if an FP (1) is followed by another FP (2) then the (1) FP needs to be put in a separate dictionary.

Here is what the output would look like for the above input:

    op = {
        "doc1.pdf": [
            {
            "img1.png": ("FP", "text1"),
            "img2.png": ("NP", "text2")
            },
            {
            "img3.png": ("FP", "text3")
            }
        ],

        "doc2.pdf": [
            {
            "img1.png": ("FP", "text4"),
            "img2.png": ("NP", "text5"),
            "img3.png": ("NP", "text6"),
            "img4.png": ("NP", "text7")
            },
           {
            "img6.png": ("FP", "text9"),
            "img7.png": ("NP", "text10")
           }
        ],

        "doc3.pdf": [
           {
            "img2.png": ("FP", "text9")
           },
           {
            "img4.png": ("FP", "text11"),
           }
        ],

        "doc4.pdf": [
           {
            "img1.png": ("FP", "text12")
           }
        ],
        
        "doc5.pdf": [
           {
            "img1.png": ("FP", "text16")
           },
           {
            "img2.png": ("FP", "text17"),
            "img3.png": ("NP", "text18"),
            "img4.png": ("NP", "text19")
           }
        ]
    }

So far I have tried this but it is not working:

def remove_others(ip_dict):
    """Drop "Others" entries and group each FP with the NPs that follow it.

    Args:
        ip_dict: Mapping of document name -> {image name: (label, text)}.
            Entries are processed in the mapping's iteration order.

    Returns:
        A dict mapping every document name to a list of dicts. Each inner
        dict holds one group: the entry that opened it plus every "NP"
        entry that followed before the next group started. "Others"
        entries are skipped and do not break a group.
    """
    op_dict = {}
    for doc, img_dict in ip_dict.items():
        groups = []
        current_group = {}

        for img, (label, text) in img_dict.items():
            if label == "Others":
                continue

            # An NP extends the open group whether the previous kept entry
            # was the FP itself or an earlier NP. Comparing against the
            # previous label only (the old check) split FP, NP, NP runs.
            if label == "NP" and current_group:
                current_group[img] = (label, text)
            else:
                # An FP (or an NP with no group open yet) starts a new group.
                current_group = {img: (label, text)}
                groups.append(current_group)

        op_dict[doc] = groups

    return op_dict

Any help is appreciated!

Asked By: lowkey

||

Answers:

This appears to do what you asked.

def split_on_FP(list_of_tuples):
    """Split (name, (label, text)) pairs into groups that each start at an FP.

    Args:
        list_of_tuples: Iterable of ``(name, (label, text))`` pairs, already
            filtered of any entries that should not appear in the output.

    Returns:
        A list of ``OrderedDict`` objects mapping name -> (label, text).
        A new group is opened each time an "FP" label is seen while the
        current group is non-empty; every other entry joins the current
        group.
    """
    # Imported locally so the snippet runs without a module-level import.
    import collections

    result = []
    interm = collections.OrderedDict()
    for name, (k, v) in list_of_tuples:
        if k == "FP" and interm:
            result.append(interm)
            interm = collections.OrderedDict()
        # Key by the image name and keep the whole (label, text) tuple;
        # keying by the label made entries with the same label overwrite
        # each other and dropped the image names entirely.
        interm[name] = (k, v)
    if interm:
        result.append(interm)
    return result

# Filter on the label (first tuple element) only, so an entry whose text
# happens to be "Others" is not dropped by mistake.
print({
    kd: split_on_FP(
        (kx, vx) for kx, vx in doc.items() if vx[0] != "Others"
    )
    for kd, doc in ip.items()
})
Answered By: VoNWooDSoN

Instead of checking the last label of current_group, start a new dictionary whenever you see an FP label, and add the entries with other labels that follow it to that dictionary.

def remove_others(ip_dict):
    """Drop "Others" entries and group each FP with the entries after it.

    Args:
        ip_dict: Mapping of document name -> {image name: (label, text)}.
            Entries are processed in the mapping's iteration order.

    Returns:
        A dict mapping every document name to a list of dicts. A new dict
        is started at each "FP"; every following non-"Others" entry is
        added to it until the next "FP".
    """
    op_dict = {}

    for doc, img_dict in ip_dict.items():
        current_group = []
        # Reset per document: otherwise an NP seen before any FP would be
        # written into the previous document's last group, or raise
        # NameError in the first document.
        current_item = None

        for img, (label, text) in img_dict.items():
            if label == "Others":
                continue
            if label == "FP" or current_item is None:
                # An FP, or a non-FP entry with no group open yet in this
                # document, starts a new group.
                current_item = {img: (label, text)}
                current_group.append(current_item)
            else:
                current_item[img] = (label, text)

        op_dict[doc] = current_group

    return op_dict
Answered By: Barmar

Another solution:

# Regroup every document of `ip` in place: each document's
# {image: (label, text)} mapping is replaced by a list of dicts, one per FP
# together with the NPs that follow it.
for k, v in ip.items():
    out = []
    for img, (pg, text) in v.items():
        if pg == "FP" or (pg == "NP" and not out):
            # An FP starts a new group. So does an NP with no group open
            # yet, which would otherwise hit out[-1] on an empty list.
            out.append({img: (pg, text)})
        elif pg == "NP":
            out[-1][img] = (pg, text)
        # Any other label (e.g. "Others") is dropped.
    # Rebinding an existing key does not change the dict's size, so this
    # is allowed while iterating over ip.items().
    ip[k] = out

print(ip)

Prints:

{
    "doc1.pdf": [
        {"img1.png": ("FP", "text1"), "img2.png": ("NP", "text2")},
        {"img3.png": ("FP", "text3")},
    ],
    "doc2.pdf": [
        {
            "img1.png": ("FP", "text4"),
            "img2.png": ("NP", "text5"),
            "img3.png": ("NP", "text6"),
            "img4.png": ("NP", "text7"),
        },
        {"img6.png": ("FP", "text9"), "img7.png": ("NP", "text10")},
    ],
    "doc3.pdf": [{"img2.png": ("FP", "text9")}, {"img4.png": ("FP", "text11")}],
    "doc4.pdf": [{"img1.png": ("FP", "text12")}],
    "doc5.pdf": [
        {"img1.png": ("FP", "text16")},
        {
            "img2.png": ("FP", "text17"),
            "img3.png": ("NP", "text18"),
            "img4.png": ("NP", "text19"),
        },
    ],
}
Answered By: Andrej Kesely