XML and probably LXML issue
Question:
I have many XML files that look like this
<?xml version="1.0" encoding="utf-8" standalone="no"?>
<reiXmlPrenos>
<Qfl>1808</Qfl>
<fOVE>13.7</fOVE>
<NetoVolumen>613</NetoVolumen>
<Hv>104.2</Hv>
<energenti>
<energent>
<sifra>energy_e</sifra>
<naziv>EE [kWh]</naziv>
<vrednost>238981</vrednost>
</energent>
<energent>
<sifra>energy_to</sifra>
<naziv>Do</naziv>
<vrednost>16359</vrednost>
</energent>
<energent>
<sifra>energy_en</sifra>
<naziv>En</naziv>
<vrednost>0</vrednost>
</energent>
</energenti>
<rei>
<zavetrovanost>2</zavetrovanost>
<cone>
<cona>
<cona_id>1</cona_id>
<cc_si_cona>1110000</cc_si_cona>
<visina_cone>2.7</visina_cone>
<dolzina_cone>14</dolzina_cone>
</cona>
<cona>
<cona_id>2</cona_id>
<cc_si_cona>120000</cc_si_cona>
</cona>
</rei>
</reiXmlPrenos>
I would like to extract certain values from those XML files. So I put together with the help from people here code below that is suppose to work:
import pandas as pd
import glob
import os
from lxml import etree
os.chdir(r'R:...XML-1122_test')
dir_path = glob.glob('*.xml')
xmls = dir_path
#note: For simplicity, I'm using the well formed version of the xml strings in your question; you'll have to use actual file names and paths
energies = ["xml", "energy_ge", "energy_en", "energy_dteu", "energy_dtlb"]
#I just made up some names - you'll have to use actual names, of course; the first one is for the file identifier - see below
rows = []
for xml in xmls:
row = []
id = "xml-"+str(xmls.index(xml)+1)
#this creates the file identifier
row.append(id)
root = etree.XML(xml.encode())
#in real life, you'll have to use the parse() method
for energy in energies[1:]:
#the '[1:]' is used to skip the first "energy"; it's only used as the file identifier
target = root.xpath(f'//energent[./sifra[.="{energy}"]]/vrednost/text()')
#note the use of f-strings
row.extend( target if len(target)>0 else "0" )
rows.append(row)
print(pd.DataFrame(rows,columns=energies))
But in the end I get a warning:
File "<string>", line 1
XMLSyntaxError: Start tag expected, '<' not found, line 1, column 1
Is this an XML issue? Or maybe lxml issue? Does anyone know how to approach this?
Ideally, the result would look like this
xml energy_e energy_en energy_to
xml-1 238981 0 16539
xml-2 ... .. ..
Answers:
import pandas as pd
import glob
import os
from lxml import etree
os.chdir(r'R:...XML-1122_test')
dir_path = glob.glob('*.xml')
xmls = dir_path
energies = ["xml", "energy_e", "energy_en", "energy_to"]
rows = []
for xml in xmls:
row = []
id = "xml-"+str(xmls.index(xml)+1)
row.append(id)
with open(xml, 'r', encoding='utf-8') as f:
xml_string = f.read()
root = etree.XML(xml_string.encode())
for energy in energies[1:]:
target = root.xpath(f'//energent[./sifra="{energy}"]/vrednost/text()')
row.extend(target if len(target)>0 else ["0"])
rows.append(row)
print(pd.DataFrame(rows, columns=energies)
parse below
import pandas as pd
import glob
import os
from lxml import etree
os.chdir(r'R:...XML-1122_test')
dir_path = glob.glob('*.xml')
xmls = dir_path
energies = ["xml", "energy_ge", "energy_en", "energy_dteu", "energy_dtlb"]
rows = []
for xml in xmls:
row = []
id = "xml-"+str(xmls.index(xml)+1)
row.append(id)
root = etree.parse(xml)
for energy in energies[1:]:
target = root.xpath(f'//energent[./sifra[.="{energy}"]]/vrednost/text()')
row.extend( target if len(target)>0 else "0" )
rows.append(row)
print(pd.DataFrame(rows,columns=energies))
Since you’re looking for a dataframe, you can simply use read_xml
from pandas:
df = (
pd.read_xml(xml, xpath=".//energent")
.drop("naziv", axis=1)
.set_index("sifra").T
.rename_axis(None, axis=1)
)
And this is how you can incorporate it in your code :
xmls = glob.glob("*.xml")
list_dfs = []
for idx, xml in enumerate(xmls, start=1):
tmp_df = (
pd.read_xml(xml, xpath=".//energent")
.drop("naziv", axis=1)
.set_index("sifra").T
.rename_axis(None, axis=1)
)
tmp_df.insert(0, "xml", f"{xml}-{idx}")
list_dfs.append(tmp_df)
df = pd.concat(list_dfs, ignore_index=True)
Test/Output (x3 the same xml):
print(df)
xml energy_e energy_to energy_en
0 first.xml-1 238981 16359 0
1 second.xml-2 238981 16359 0
2 third.xml-3 238981 16359 0
I have many XML files that look like this
<?xml version="1.0" encoding="utf-8" standalone="no"?>
<reiXmlPrenos>
<Qfl>1808</Qfl>
<fOVE>13.7</fOVE>
<NetoVolumen>613</NetoVolumen>
<Hv>104.2</Hv>
<energenti>
<energent>
<sifra>energy_e</sifra>
<naziv>EE [kWh]</naziv>
<vrednost>238981</vrednost>
</energent>
<energent>
<sifra>energy_to</sifra>
<naziv>Do</naziv>
<vrednost>16359</vrednost>
</energent>
<energent>
<sifra>energy_en</sifra>
<naziv>En</naziv>
<vrednost>0</vrednost>
</energent>
</energenti>
<rei>
<zavetrovanost>2</zavetrovanost>
<cone>
<cona>
<cona_id>1</cona_id>
<cc_si_cona>1110000</cc_si_cona>
<visina_cone>2.7</visina_cone>
<dolzina_cone>14</dolzina_cone>
</cona>
<cona>
<cona_id>2</cona_id>
<cc_si_cona>120000</cc_si_cona>
</cona>
</rei>
</reiXmlPrenos>
I would like to extract certain values from those XML files. So I put together with the help from people here code below that is suppose to work:
import pandas as pd
import glob
import os
from lxml import etree
os.chdir(r'R:...XML-1122_test')
dir_path = glob.glob('*.xml')
xmls = dir_path
#note: For simplicity, I'm using the well formed version of the xml strings in your question; you'll have to use actual file names and paths
energies = ["xml", "energy_ge", "energy_en", "energy_dteu", "energy_dtlb"]
#I just made up some names - you'll have to use actual names, of course; the first one is for the file identifier - see below
rows = []
for xml in xmls:
row = []
id = "xml-"+str(xmls.index(xml)+1)
#this creates the file identifier
row.append(id)
root = etree.XML(xml.encode())
#in real life, you'll have to use the parse() method
for energy in energies[1:]:
#the '[1:]' is used to skip the first "energy"; it's only used as the file identifier
target = root.xpath(f'//energent[./sifra[.="{energy}"]]/vrednost/text()')
#note the use of f-strings
row.extend( target if len(target)>0 else "0" )
rows.append(row)
print(pd.DataFrame(rows,columns=energies))
But in the end I get a warning:
File "<string>", line 1
XMLSyntaxError: Start tag expected, '<' not found, line 1, column 1
Is this an XML issue? Or maybe lxml issue? Does anyone know how to approach this?
Ideally, the result would look like this
xml energy_e energy_en energy_to
xml-1 238981 0 16539
xml-2 ... .. ..
import pandas as pd
import glob
import os
from lxml import etree
os.chdir(r'R:...XML-1122_test')
dir_path = glob.glob('*.xml')
xmls = dir_path
energies = ["xml", "energy_e", "energy_en", "energy_to"]
rows = []
for xml in xmls:
row = []
id = "xml-"+str(xmls.index(xml)+1)
row.append(id)
with open(xml, 'r', encoding='utf-8') as f:
xml_string = f.read()
root = etree.XML(xml_string.encode())
for energy in energies[1:]:
target = root.xpath(f'//energent[./sifra="{energy}"]/vrednost/text()')
row.extend(target if len(target)>0 else ["0"])
rows.append(row)
print(pd.DataFrame(rows, columns=energies)
parse below
import pandas as pd
import glob
import os
from lxml import etree
os.chdir(r'R:...XML-1122_test')
dir_path = glob.glob('*.xml')
xmls = dir_path
energies = ["xml", "energy_ge", "energy_en", "energy_dteu", "energy_dtlb"]
rows = []
for xml in xmls:
row = []
id = "xml-"+str(xmls.index(xml)+1)
row.append(id)
root = etree.parse(xml)
for energy in energies[1:]:
target = root.xpath(f'//energent[./sifra[.="{energy}"]]/vrednost/text()')
row.extend( target if len(target)>0 else "0" )
rows.append(row)
print(pd.DataFrame(rows,columns=energies))
Since you’re looking for a dataframe, you can simply use read_xml
from pandas:
df = (
pd.read_xml(xml, xpath=".//energent")
.drop("naziv", axis=1)
.set_index("sifra").T
.rename_axis(None, axis=1)
)
And this is how you can incorporate it in your code :
xmls = glob.glob("*.xml")
list_dfs = []
for idx, xml in enumerate(xmls, start=1):
tmp_df = (
pd.read_xml(xml, xpath=".//energent")
.drop("naziv", axis=1)
.set_index("sifra").T
.rename_axis(None, axis=1)
)
tmp_df.insert(0, "xml", f"{xml}-{idx}")
list_dfs.append(tmp_df)
df = pd.concat(list_dfs, ignore_index=True)
Test/Output (x3 the same xml):
print(df)
xml energy_e energy_to energy_en
0 first.xml-1 238981 16359 0
1 second.xml-2 238981 16359 0
2 third.xml-3 238981 16359 0