Parse XML to CSV when XML tag has child attributes

Question:

I’ve written a small Python app to print some XML tags and selected child attributes. The XML files are for electronic invoicing here in Mexico; here is an example of the XML:

<?xml version="1.0" encoding="UTF-8"?><cfdi:Comprobante >

import pandas as pd
import xml.dom.minidom
import xml.etree.ElementTree as Xet

from tkinter import *
from tkinter import filedialog
from lxml import etree

# Script: let the user pick an invoice XML file, pull a few attribute values
# out of it, and append them as one row to output.csv.

# Create the Tk root window; it is also used below as a holder for the path.
root1 = Tk()

# Open a file-chooser dialog restricted to *.xml and remember the chosen path.
root1.filename = filedialog.askopenfilename(title="Select file", filetypes=[('XML Files','*.xml')])
print (root1.filename)
# Column labels handed to the DataFrame constructor (six labels).
cols = ["Monto","Fecha","Descripcion","RFC_Emisor","Emisor","UUID"]
rows = []
row =[]
# NOTE(review): `doc` is not referenced again in this snippet.
doc = xml.dom.minidom.parse(root1.filename);

# Parse the same file with ElementTree; `root` is the document's root element.
xmlparse = Xet.parse(root1.filename)
root = xmlparse.getroot()

# Each loop selects the elements that carry a given attribute and adds that
# attribute's value(s) to `row`; every matching element contributes, so the
# final length of `row` depends on how many matches the file contains.
# NOTE(review): './/*' searches beneath `root`; an attribute that sits on the
# root element itself is presumably not matched — confirm against the
# ElementTree XPath documentation.
for m in root.findall('.//*[@Total]'):
    # NOTE(review): extend() is given a single string here, so each character
    # of the "Total" value becomes its own item in `row`. The trailing comment
    # is a disabled attempt to also include "Fecha".
    row.extend(m.attrib.get("Total")) #,m.attrib.get("Fecha")))
for d in root.findall('.//*[@Descripcion]'):
    row.append(d.attrib.get('Descripcion'))
for rf in root.findall('.//*[@Rfc]'):
    # Adds two values per match: the "Rfc" and the "Nombre" of that element.
    row.extend((rf.attrib.get("Rfc"),rf.attrib.get("Nombre")))
for u in root.findall('.//*[@UUID]'):
    row.append(u.attrib.get("UUID"))
    
# Build a one-row DataFrame and append it to output.csv without index or header.
rows.append(row)
df= pd.DataFrame(rows,columns=cols)
df.to_csv('output.csv',mode='a', index=False, header=False)

EDIT 2022-11-16

I was able to run the code, but with this XML:

<?xml version="1.0" encoding="utf-8"?><cfdi:Comprobante xmlns_xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns_cfdi="http://www.sat.gob.mx/cfd/3" xsi_schemaLocation="http://www.sat.gob.mx/cfd/3 http://www.sat.gob.mx/sitio_internet/cfd/3/cfdv33.xsd" Version="3.3" Serie="EE" Folio="3963205" Fecha="2022-08-26T11:56:10" Sello="fg7F9EF8YXkWP4UAw96g97gat8D7nzJ10TjtxB/x4t1G10LS7RmRKa/jQ1dcBpJ96ck8FPBnOirF8Ya4IQ7hJgkoWRDkY4cpTI5UChiZsld7frl8x1yz21HIckqWqBtn/xQT4l0iAXda5xIRA6shOf0YErTU6NOkZNLNp4ToNg6hUbaoc4RXTNWcyc25lyXc9nMY6BkYiDNaCgLnIZ/d1jTIrwIPOyAlhAcdmVaPKxyfpMNrUhPBh4FKRy6MW8iNGXw+ZhPSYUncSiuUYA6O7B1qlHGHQSuN4q+dEv8T3C+4gM0PjInYhRt3XSOmwDfXAvjUFy4tyqXIHBHZCnpG+A==" FormaPago="99" NoCertificado="00001000000507261913" Certificado="MIIF6DCCA9CgAwIBAgIUMDAwMDEwMDAwMDA1MDcyNjE5MTMwDQYJKoZIhvcNAQELBQAwggGEMSAwHgYDVQQDDBdBVVRPUklEQUQgQ0VSVElGSUNBRE9SQTEuMCwGA1UECgwlU0VSVklDSU8gREUgQURNSU5JU1RSQUNJT04gVFJJQlVUQVJJQTEaMBgGA1UECwwRU0FULUlFUyBBdXRob3JpdHkxKjAoBgkqhkiG9w0BCQEWG2NvbnRhY3RvLnRlY25pY29Ac2F0LmdvYi5teDEmMCQGA1UECQwdQVYuIEhJREFMR08gNzcsIENPTC4gR1VFUlJFUk8xDjAMBgNVBBEMBTA2MzAwMQswCQYDVQQGEwJNWDEZMBcGA1UECAwQQ0lVREFEIERFIE1FWElDTzETMBEGA1UEBwwKQ1VBVUhURU1PQzEVMBMGA1UELRMMU0FUOTcwNzAxTk4zMVwwWgYJKoZIhvcNAQkCE01yZXNwb25zYWJsZTogQURNSU5JU1RSQUNJT04gQ0VOVFJBTCBERSBTRVJWSUNJT1MgVFJJQlVUQVJJT1MgQUwgQ09OVFJJQlVZRU5URTAeFw0yMTA0MzAxOTMwNTFaFw0yNTA0MzAxOTMwNTFaMIG2MRwwGgYDVQQDExNCQ0QgVFJBVkVMIFNBIERFIENWMRwwGgYDVQQpExNCQ0QgVFJBVkVMIFNBIERFIENWMRwwGgYDVQQKExNCQ0QgVFJBVkVMIFNBIERFIENWMSUwIwYDVQQtExxCVFI3ODAyMjM3VTIgLyBNRVNFNjkwNjEzNTQ3MR4wHAYDVQQFExUgLyBNRVNFNjkwNjEzSERGTk5OMDUxEzARBgNVBAsTCkJDRCBUUkFWRUwwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDcOxH+0iOfuj7fkxJhU3EoUo/aHFcofV2RkJ0XOOTP7H0oL99KE1AofsUbPZZvn4605puQWp2Fu/xg544Fd24fQ3WinhktnFu/bfP4X7O6hFyiL7//Kcme87/sNkFqaO6JUjkGMAASa3XROPUyYrnPszshF4sne+KZZzHg2347l2qPhN6LMMMyqIN0YdS9AsMTdUnSnZYgfgrxHN8RnWrwgmpELGJ6lZBf4mEpltzXYNYOWgm9t2xlnmMXss7MCsvQh6+ctA8iwEe64F2AesQzFNarer48RI8WHhBeUqO6APnom8tgA9K9SlBYxgR7FyRlrR8Q7NWRy122yTD
iqUD3AgMBAAGjHTAbMAwGA1UdEwEB/wQCMAAwCwYDVR0PBAQDAgbAMA0GCSqGSIb3DQEBCwUAA4ICAQAJruwufHGLVpyZ6ReQ8AyrkMtxONRmLhv7C2nY2c8+O+k/emdZU0Zm2iTQViXTPo0K0i2o4scEVMZAKtTbqzJk4NDHTYHn6ESNsH1whLcBAtGn2b+GYt4TYMZVy7zP/ty5mL8rlMBmg89zi2NGQRqLl4l5DoI1KgcSk7wue2hCOj3kIi4noWZQdh8kgACthei7aPscfNQXivZ17tBDTzmdWIBcn4KACIwFvkTCeGl1gsV5i1CIdex5p011qXsmIcPMIF/gAsVZRbfEKpu2afxZcCC9ig/xmB2blpv2E8QSMa6S27w7dxH3i92OpFBRXONpFXNJRtx7r6UvUw6Shq2oqTfImGzvdfC8oOa3LIq2AdoD2XGRjuaOQLqRhWldxyiW0N7jTreMnZoeUi2TiN8yp8aJFMRf8pj5sxGsUVBUiktYp6FYk3H+hk3DPEngZje2pJog0suuUyHJJEWaSMQlyFe30lyEMvnXt/xSpMwjt0PU3I9VOhXHrLW0QeCV8AyQ/timemYdJlgke6ROygIU01xqG/SBfM8REYYgtoKFKrtBuqowSrbjlJOXZHFZzKapAipbQ4dfOWoiBIlxHwgOtmOqfBlGRSf/MfSDxIERRGybZzBTnrMBPbgXPNeqs2qFAyrIx/qiTqp85KMgwoANVHUy4rk6mrc3gxZtp3isxA==" SubTotal="1483.10" Descuento="0.00" Moneda="MXN" Total="1713.48" TipoDeComprobante="I" MetodoPago="PPD" LugarExpedicion="11560"><cfdi:Emisor Rfc="BTR7802237U2" Nombre="BCD Travel S.A. de C.V." RegimenFiscal="601"/><cfdi:Receptor Rfc="IST190806QJ7" Nombre="INDRA SISTEMAS TRANSPORTE Y DEFENSA" UsoCFDI="G03"/><cfdi:Conceptos><cfdi:Concepto ClaveProdServ="90121502" NoIdentificacion="AZOEIG-3969614-15936193-16737975" Cantidad="1" ClaveUnidad="E48" Unidad="Unidad de Servicio" Descripcion="Reservaci&#243;n Hotel ( Tasa 16% )" ValorUnitario="1439.90" Importe="1439.90" Descuento="0.00"><cfdi:Impuestos><cfdi:Traslados><cfdi:Traslado Base="1439.90" Impuesto="002" TipoFactor="Tasa" TasaOCuota="0.160000" Importe="230.38"/></cfdi:Traslados></cfdi:Impuestos></cfdi:Concepto><cfdi:Concepto ClaveProdServ="90121502" NoIdentificacion="AZOEIG-3969614-15936193-16737975" Cantidad="1" ClaveUnidad="E48" Unidad="Unidad de Servicio" Descripcion="Otros Impuestos" ValorUnitario="43.20" Importe="43.20" Descuento="0.00"><cfdi:Impuestos><cfdi:Traslados><cfdi:Traslado Base="43.20" Impuesto="002" TipoFactor="Tasa" TasaOCuota="0.000000" Importe="0.00"/></cfdi:Traslados></cfdi:Impuestos></cfdi:Concepto></cfdi:Conceptos><cfdi:Impuestos 
TotalImpuestosTrasladados="230.38"><cfdi:Traslados><cfdi:Traslado Impuesto="002" TipoFactor="Tasa" TasaOCuota="0.160000" Importe="230.38"/><cfdi:Traslado Impuesto="002" TipoFactor="Tasa" TasaOCuota="0.000000" Importe="0.00"/></cfdi:Traslados></cfdi:Impuestos><cfdi:Complemento><tfd:TimbreFiscalDigital xmlns_tfd="http://www.sat.gob.mx/TimbreFiscalDigital" xsi_schemaLocation="http://www.sat.gob.mx/TimbreFiscalDigital http://www.sat.gob.mx/sitio_internet/cfd/TimbreFiscalDigital/TimbreFiscalDigitalv11.xsd" Version="1.1" UUID="055DC12A-C9F7-4E70-B23B-EB6CA1ABDC4A" FechaTimbrado="2022-08-26T13:10:51" RfcProvCertif="DET080304395" SelloCFD="fg7F9EF8YXkWP4UAw96g97gat8D7nzJ10TjtxB/x4t1G10LS7RmRKa/jQ1dcBpJ96ck8FPBnOirF8Ya4IQ7hJgkoWRDkY4cpTI5UChiZsld7frl8x1yz21HIckqWqBtn/xQT4l0iAXda5xIRA6shOf0YErTU6NOkZNLNp4ToNg6hUbaoc4RXTNWcyc25lyXc9nMY6BkYiDNaCgLnIZ/d1jTIrwIPOyAlhAcdmVaPKxyfpMNrUhPBh4FKRy6MW8iNGXw+ZhPSYUncSiuUYA6O7B1qlHGHQSuN4q+dEv8T3C+4gM0PjInYhRt3XSOmwDfXAvjUFy4tyqXIHBHZCnpG+A==" NoCertificadoSAT="00001000000503726537" SelloSAT="LyWQC2ExMofC25dv/qhchiKH2yVf29BuRzA1WJaPFOGq5+JF+bJL7nPpV2jE6iP1aKbtD7lyPLHRW8/P9KTR47GtGf3iuPpUWddsUA70cVTk1ol6/FJfrfuE1G2CLlUdhhf8MholjYtJNgbZ7hlfdmv0Zrj5vv3waO9FIRr0J/P6fA0uBK0qX0CxGYxNTsxPrwJ3CNkWFa94rVdM4iCfCZeXNGoqTXF+EEe2yPFJUvMR/BcYoiG8w6mKrojzKetDgg3J6bSDhW8XNGvYNt300fwUlU7arvoCo7f36UbI1lh+xGWEIDPy/IO7bccpfOm3T7xf0bfWBT1kl3o8IqxqgQ=="/></cfdi:Complemento><cfdi:Addenda><BCDTravel:AdditionalInformation xmlns_BCDTravel="https://www.bcdtravelmexico.com.mx/Addenda" xsi_schemaLocation="https://www.bcdtravelmexico.com.mx/Addenda https://www.bcdtravelmexico.com.mx/Addenda/BCDTravel.xsd"><BCDTravel:RecordInformation><BCDTravel:Reservacion ClaveReservacion="AZOEIG" NumeroOS="3969614" Pasajero="TEJEDA/EDGAR LEONARDO"/></BCDTravel:RecordInformation><BCDTravel:PaymentInformation><BCDTravel:MetodoPago Metodo="AR" Monto="1713.48"/></BCDTravel:PaymentInformation></BCDTravel:AdditionalInformation></cfdi:Addenda></cfdi:Comprobante>

I'm getting this error:

8 columns passed, passed data had 9 columns

I assume I have to add an exception but I'm not sure how to do it.

Asked By: rwffh

||

Answers:

It looks like your code is way more complicated than necessary.

Try it this way:

from lxml import etree

# Collect selected attribute values from one invoice into a single-row
# DataFrame.
#
# Assumes `doc` is the invoice already parsed with lxml (for example
# `doc = etree.parse(path)`) and that pandas is imported as `pd`; neither is
# defined in this snippet.
cols = ["Monto","Fecha","Descripcion","RFC_Emisor","Emisor","UUID","RFC_Receptor","Receptor"]
rows = []
row = []

# Every loop queries the same parsed document. Each matching element adds its
# values to `row`, so the row grows with the number of matches: an invoice with
# more than one element carrying a "Descripcion" attribute produces more values
# than there are labels in `cols`, and the DataFrame constructor then rejects it.
for t in doc.xpath('//*[@Total]'):
    row.extend((t.attrib.get("Total"), t.attrib.get("Fecha")))
for d in doc.xpath('//*[@Descripcion]'):
    row.append(d.attrib.get('Descripcion'))
for rf in doc.xpath('//*[@Rfc]'):
    # Two values per match: the element's "Rfc" and its "Nombre".
    row.extend((rf.attrib.get("Rfc"), rf.attrib.get("Nombre")))
for u in doc.xpath('//*[@UUID]'):
    row.append(u.attrib.get("UUID"))

# NOTE(review): values are appended in loop order, so the UUID is the last item
# in `row` even though "UUID" is not the last label in `cols`; reorder `cols`
# if the labels must line up with the values.
rows.append(row)
pd.DataFrame(rows, columns=cols)

Output (based on your sample xml):

    Monto   Fecha   Descripcion     RFC_Emisor  Emisor  UUID    RFC_Receptor    Receptor
0   169.00  2022-11-09T18:55:51     PQT. DE ALIMENTOS (CONSUMO: 2022-11-08) FOLIO(...   PRB100802H20    PREMIUM RESTAURANT BRANDS   IST190806QJ7    INDRA SISTEMAS TRANSPORTE Y DEFENSA     67B2DDD8-ABCF-4CD1-B435-C228742542B6

You'll likely have to modify this to fit your actual xml.

Answered By: Jack Fleeting
Categories: questions Tags: , , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.