How to read all XML files in a directory and combine the information into a data frame?

Question:

I wonder how I can run this code on multiple files from one directory instead of a single file:

import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import os

import xml.etree.ElementTree as ET
# Parse one XML export and pull out the three values of interest.
tree = ET.parse('C:/Users/STJ2TW/Desktop/Pliki XML/0204N01323-00_2021082310500900')
root = tree.getroot()

# Default to None so that a file lacking one of the elements still yields a
# row (with an empty cell) instead of a NameError when the frame is built.
PN = GH = PP = None

# If several <Type> elements exist, the last one wins.
for Type in root.iter('Type'):
    PN = Type.get('name')
    print(f"Part number: {PN}")

# Only two named components are relevant; for each, keep the name of its
# (last) <ComponentNo> descendant.
for Component in root.iter('Component'):
    CName = Component.get('name')
    if CName == 'Pos010_GearHousing':
        for ComponentNo in Component.iter('ComponentNo'):
            GH = ComponentNo.get('name')
            print(f"Gear Housing: {GH}")
    elif CName == 'Pos058_PowerPack':
        for ComponentNo in Component.iter('ComponentNo'):
            PP = ComponentNo.get('name')
            print(f"Power Pack: {PP}")


# One-row frame: each value is wrapped in a list so pandas treats it as a
# column of length one.
df = pd.DataFrame(
    {
        "Part number:": [PN],
        "Gear Housing:": [GH],
        "Power Pack:": [PP],
    }
)
df

# Use the part number as the row label and export with ';' as separator.
df = df.set_index('Part number:', drop=True)
df.to_csv("C:/Users/STJ2TW/Desktop/Pliki XML/plik.csv", sep=";")
df.head(10)

I think I should go with this:

# Walk the folder and parse every entry whose name ends in '.xml'.
path = 'C:/Users/STJ2TW/Desktop/Pliki XML/'
for filename in os.listdir(path):
    if filename.endswith('.xml'):
        fullname = os.path.join(path, filename)
        tree = ET.parse(fullname)

But I don’t know how to apply the rest of the code to each file. Would wrapping it in a loop be the right approach?
Thanks in advance.

Asked By: ASAS

||

Answers:

This should work. It’s hard to say for sure without knowing the structure of the xml file, but based on the code you provided it seems like this might work.

import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import os


path = 'C:/Users/STJ2TW/Desktop/Pliki XML/'
columns = ["Part Number", "Gear Housing", "Power Pack"]
master = []  # one dictionary (= one DataFrame row) per XML file

for filename in os.listdir(path):
    # NOTE: entries whose name does not end in '.xml' are skipped.
    if not filename.endswith('.xml'):
        continue
    fullname = os.path.join(path, filename)
    tree = ET.parse(fullname)
    root = tree.getroot()

    # Start the row with every column present so that a file lacking an
    # element yields an empty cell. The row is a single dict per file: the
    # position of a <Component> in the document says nothing about which
    # row it belongs to, so it must not be used as a list index.
    item = dict.fromkeys(columns)

    # If several <Type> elements exist, the last one wins.
    for Type in root.iter('Type'):
        PN = Type.get('name')
        item["Part Number"] = PN
        print(f"Part number: {PN}")

    for Component in root.iter('Component'):
        CName = Component.get('name')
        if CName == 'Pos010_GearHousing':
            for ComponentNo in Component.iter('ComponentNo'):
                GH = ComponentNo.get('name')
                item["Gear Housing"] = GH
                print(f"Gear Housing: {GH}")
        elif CName == 'Pos058_PowerPack':
            for ComponentNo in Component.iter('ComponentNo'):
                PP = ComponentNo.get('name')
                item["Power Pack"] = PP
                print(f"Power Pack: {PP}")

    master.append(item)  # add this file's row to the master list


# Passing `columns` keeps the 'Part Number' column present even when no XML
# file was found, so set_index below does not raise KeyError.
df = pd.DataFrame(master, columns=columns)
df = df.set_index('Part Number', drop=True)
df.to_csv("C:/Users/STJ2TW/Desktop/Pliki XML/plik.csv", sep=";")
df.head(10)
Answered By: Alexander

Thank you!

My colleague and I created a function:

def get_data_from_xml(path):
    """Extract part number, gear housing and power pack names from one XML file.

    Args:
        path: Location of the XML document; passed straight to ``ET.parse``.

    Returns:
        A ``(PN, GH, PP)`` tuple holding the ``name`` attribute of the last
        ``<Type>`` element, and of the last ``<ComponentNo>`` found under the
        components named ``Pos010_GearHousing`` and ``Pos058_PowerPack``.
        Any value whose element is absent from the document is ``None``.

    Raises:
        xml.etree.ElementTree.ParseError: if the document is not well-formed.
    """
    root = ET.parse(path).getroot()

    # If several <Type> elements exist, the last one wins.
    part_number = None
    for type_elem in root.iter('Type'):
        part_number = type_elem.get('name')

    # Maps the component name we care about to the value found so far.
    # Pre-filling with None guarantees a result even when a component is
    # missing, instead of failing on an unbound local at return time.
    found = {'Pos010_GearHousing': None, 'Pos058_PowerPack': None}
    for component in root.iter('Component'):
        component_name = component.get('name')
        if component_name in found:
            for component_no in component.iter('ComponentNo'):
                found[component_name] = component_no.get('name')

    return part_number, found['Pos010_GearHousing'], found['Pos058_PowerPack']

and then created lists:

# Three parallel accumulators: part numbers, gear housings, power packs.
pn_list, gh_list, pp_list = [], [], []

Used the function:

path = 'C:/Users/STJ2TW/Desktop/Pliki XML/'
for filename in os.listdir(path):
    fullname = os.path.join(path, filename)

    # os.listdir also returns sub-directories, which cannot be parsed.
    if not os.path.isfile(fullname):
        continue

    # The folder may hold non-XML files as well (for instance a CSV export
    # saved next to the inputs); report and skip them rather than aborting
    # the whole run.
    try:
        PN, GH, PP = get_data_from_xml(fullname)
    except ET.ParseError as err:
        print(f"Skipping {filename}: not well-formed XML ({err})")
        continue

    pn_list.append(PN)
    gh_list.append(GH)
    pp_list.append(PP)

And put it all to the dataframe:

# Assemble the three parallel lists into one table, one row per parsed file.
df = pd.DataFrame(
    data={
        'Part number': pn_list,
        'Gear Housing': gh_list,
        'Power Pack': pp_list,
    },
)
Answered By: ASAS