Remove namespace and prefix from xml in python using lxml
Question:
I have an xml file I need to open and make some changes to, one of those changes is to remove the namespace and prefix and then save to another file.
Here is the xml:
<?xml version='1.0' encoding='UTF-8'?>
<package >
<provider>some data</provider>
<language>en-GB</language>
</package>
I can make the other changes I need, but can’t find out how to remove the namespace and prefix. This is the reusklt xml I need:
<?xml version='1.0' encoding='UTF-8'?>
<package>
<provider>some data</provider>
<language>en-GB</language>
</package>
And here is my script which will open and parse the xml and save it:
metadata = '/Users/user1/Desktop/Python/metadata.xml'
from lxml import etree
parser = etree.XMLParser(remove_blank_text=True)
open(metadata)
tree = etree.parse(metadata, parser)
root = tree.getroot()
tree.write('/Users/user1/Desktop/Python/done.xml', pretty_print = True, xml_declaration = True, encoding = 'UTF-8')
So how would I add code in my script which will remove the namespace and prefix?
Answers:
import xml.etree.ElementTree as ET
def remove_namespace(doc, namespace):
"""Remove namespace in the passed document in place."""
ns = u'{%s}' % namespace
nsl = len(ns)
for elem in doc.getiterator():
if elem.tag.startswith(ns):
elem.tag = elem.tag[nsl:]
metadata = '/Users/user1/Desktop/Python/metadata.xml'
tree = ET.parse(metadata)
root = tree.getroot()
remove_namespace(root, u'http://apple.com/itunes/importer')
tree.write('/Users/user1/Desktop/Python/done.xml',
pretty_print=True, xml_declaration=True, encoding='UTF-8')
Used a snippet of code from here
This method could be easily extended to delete any namespace attributes by searching for tags that begin with ” rel=”nofollow noreferrer”>lxml.objectify.deannotate.
from lxml import etree, objectify
metadata = '/Users/user1/Desktop/Python/metadata.xml'
parser = etree.XMLParser(remove_blank_text=True)
tree = etree.parse(metadata, parser)
root = tree.getroot()
####
for elem in root.getiterator():
if not hasattr(elem.tag, 'find'): continue # guard for Comment tags
i = elem.tag.find('}')
if i >= 0:
elem.tag = elem.tag[i+1:]
objectify.deannotate(root, cleanup_namespaces=True)
####
tree.write('/Users/user1/Desktop/Python/done.xml',
pretty_print=True, xml_declaration=True, encoding='UTF-8')
Note: Some tags like Comment
return a function when accessing tag
attribute. added a guard for that.
all you need to do is:
objectify.deannotate(root, cleanup_namespaces=True)
after you have get the root, by using root = tree.getroot()
Here are two other ways of removing namespaces. The first uses the lxml.etree.QName helper while the second uses regexes. Both functions allow an optional list of namespaces to match against. If no namespace list is supplied then all namespaces are removed. Attribute keys are also cleaned.
from lxml import etree
import re
def remove_namespaces_qname(doc, namespaces=None):
for el in doc.getiterator():
# clean tag
q = etree.QName(el.tag)
if q is not None:
if namespaces is not None:
if q.namespace in namespaces:
el.tag = q.localname
else:
el.tag = q.localname
# clean attributes
for a, v in el.items():
q = etree.QName(a)
if q is not None:
if namespaces is not None:
if q.namespace in namespaces:
del el.attrib[a]
el.attrib[q.localname] = v
else:
del el.attrib[a]
el.attrib[q.localname] = v
return doc
def remove_namespace_re(doc, namespaces=None):
if namespaces is not None:
ns = list(map(lambda n: u'{%s}' % n, namespaces))
for el in doc.getiterator():
# clean tag
m = re.match(r'({.+})(.+)', el.tag)
if m is not None:
if namespaces is not None:
if m.group(1) in ns:
el.tag = m.group(2)
else:
el.tag = m.group(2)
# clean attributes
for a, v in el.items():
m = re.match(r'({.+})(.+)', a)
if m is not None:
if namespaces is not None:
if m.group(1) in ns:
del el.attrib[a]
el.attrib[m.group(2)] = v
else:
del el.attrib[a]
el.attrib[m.group(2)] = v
return doc
We can get the desired output document in two steps:
- Remove namespace URIs from element names
- Remove unused namespace declarations from the XML tree
Example code
from lxml import etree
input_xml = """
<package rel="noreferrer">documentation, we use lxml.etree.QName.localname
to get local names of elements, that is names without namespace URIs. Then we replace the fully qualified names of the elements by their local names.
Some XML elements, such as comments and processing instructions do not have names. So, we have to skip these elements while replacing element names, otherwise a ValueError
will be raised.
Finally, we use lxml.etree.cleanup_namespaces()
to remove unused namespace declarations from the XML tree.
You could also use XSLT to strip the namespaces...
XSLT 1.0 (test.xsl)
<xsl:stylesheet version="1.0" >
<xsl:output indent="yes"/>
<xsl:strip-space elements="*"/>
<xsl:template match="node()">
<xsl:copy>
<xsl:apply-templates select="@*|node()"/>
</xsl:copy>
</xsl:template>
<xsl:template match="*" priority="1">
<xsl:element name="{local-name()}" namespace="">
<xsl:apply-templates select="@*|node()"/>
</xsl:element>
</xsl:template>
<xsl:template match="@*">
<xsl:attribute name="{local-name()}" namespace="">
<xsl:value-of select="."/>
</xsl:attribute>
</xsl:template>
</xsl:stylesheet>
Python
from lxml import etree
tree = etree.parse("metadata.xml")
xslt = etree.parse("test.xsl")
new_tree = tree.xslt(xslt)
print(etree.tostring(new_tree, pretty_print=True, xml_declaration=True,
encoding="UTF-8").decode("UTF-8"))
Output
<?xml version='1.0' encoding='UTF-8'?>
<package>
<provider>some data</provider>
<language>en-GB</language>
</package>
you can try with lxml:
# Remove namespace prefixes
for elem in root.getiterator():
namespace_removed = elem.xpath('local-name()')
So I realize this is an older answer with a highly up-voted and accepted answer, but if you are reading LARGE-FILES and find yourself in the same predicament I did; I hope this helps you out.
The issue with this approach is, in fact, the iteration. Regardless of how fast the parser is, doing anything say... a few 100k times is gonna eat your execution time. With that said, it came down to really thinking about the problem for me and understanding how namespaces work (or are "intended to work", because they are honestly not needed). Now if your xml truly uses namespaces, meaning you see tags that look like this: <xs:table>
, then you'll need to tweak the approach here for your use-case. I'll include the full way of handling, as well.
DISCLAIMER : I cannot, with a good conscience, tell you to use regular expressions when parsing html/xml, go look at SergiyKolesnikov's answer as it WORKS, but I had an edge case so with that said... let's dive into some regex!
Problem: namespace stripping takes forever... and most of the time the namespaces only live inside of the very opening tag, or our "root". So in thinking about how python reads information in, and where our only problem-child is that root node, why not use that to our advantage.
Please NOTE: the file i'm using as my example comes as a raw, horrid, remarkably senseless structure of lulz with the promise of data in there somewhere.
my_file
is the path to the file im using for our example, I cannot share it with you for professional reasons; and it has been cut down way in size just to get through this answer.
import os, sys, subprocess, re, io, json
from lxml import etree
# Your file would be '_biggest_file' if playing along at home
my_file = _biggest_file
meta_stuff = dict(
exists = os.path.exists(_biggest_file),
sizeof = os.path.getsize(_biggest_file),
extension_is_a_real_thing = any(re.findall(".(html|xml)$", my_file, re.I)),
system_thinks_its_a = subprocess.check_output(
["file", "-i", _biggest_file]
).decode().split(":")[-1:][0].strip()
)
print(json.dumps(meta_stuff, indent = 2))
So for starters, decently sized, and system thinks at best it's html; the file extension is neither xml or html either...
{
"exists": true,
"sizeof": 24442371,
"extension_is_a_real_thing": false,
"system_thinks_its_a": "text/html; charset=us-ascii"
}
Approach:
- In order to parse an xml file... it should at the very least be xml, so we'll need to check and add a declarations tag if one doesn't exist
- If I have namespaces.. thats bad because I can't use xpaths, which is what I want to do
- If my file is huge, I should only operate on the smallest imaginable parts that I need to clean before I'm ready to parse it.
Function
def speed_read(file_path):
# We're gonna be low-brow and add our own using this string. It's fine
_xml_dec = '<?xml version="1.0" encoding="utf-8"?>'
# Even worse.. rgx for xml here we go
#
# We'll need to extract the very first node that we find in our document,
# because for our purposes thats the one we know has the namespace uri's
# ie: "attributes"
# FiRsT node : <actual_name >import pandas as pd
safe_times = []
for i in range(0,5):
s = time.time()
safe_read(_biggest_file)
safe_times.append(time.time() - s)
fast_times = []
for i in range(0,5):
s = time.time()
speed_read(_biggest_file)
fast_times.append(time.time() - s)
pd.DataFrame({"safe":safe_times, "fast":fast_times})
Results
safe
fast
2.36
0.61
2.15
0.58
2.47
0.49
2.94
0.60
2.83
0.53
Define and call the following function, right after you parse the XML string:
from lxml import etree
def clean_xml_namespaces(root):
for element in root.getiterator():
if isinstance(element, etree._Comment):
continue
element.tag = etree.QName(element).localname
etree.cleanup_namespaces(root)
Note - comment elements in the XML are ignored, as they should be
Usage:
xml_content = b'''<?xml version="1.0" encoding="UTF-8"?>
<project >accepted solution removes namespaces in node names and not in attributes, i.e. <b:spam c_name="cheese"/>
will be transformed to <spam c_name="cheese"/>
.
An updated version which will give you <spam name="cheese"/>
def remove_namespaces(root):
for elem in root.getiterator():
if not (
isinstance(elem, etree._Comment)
or isinstance(elem, etree._ProcessingInstruction)
):
localname = etree.QName(elem).localname
if elem.tag != localname:
elem.tag = etree.QName(elem).localname
for attr_name in elem.attrib:
local_attr_name = etree.QName(attr_name).localname
if attr_name != local_attr_name:
attr_value = elem.attrib[attr_name]
del elem.attrib[attr_name]
elem.attrib[local_attr_name] = attr_value
deannotate(root, cleanup_namespaces=True)
I have an xml file I need to open and make some changes to, one of those changes is to remove the namespace and prefix and then save to another file.
Here is the xml:
<?xml version='1.0' encoding='UTF-8'?>
<package >
<provider>some data</provider>
<language>en-GB</language>
</package>
I can make the other changes I need, but can’t find out how to remove the namespace and prefix. This is the reusklt xml I need:
<?xml version='1.0' encoding='UTF-8'?>
<package>
<provider>some data</provider>
<language>en-GB</language>
</package>
And here is my script which will open and parse the xml and save it:
metadata = '/Users/user1/Desktop/Python/metadata.xml'
from lxml import etree
parser = etree.XMLParser(remove_blank_text=True)
open(metadata)
tree = etree.parse(metadata, parser)
root = tree.getroot()
tree.write('/Users/user1/Desktop/Python/done.xml', pretty_print = True, xml_declaration = True, encoding = 'UTF-8')
So how would I add code in my script which will remove the namespace and prefix?
import xml.etree.ElementTree as ET
def remove_namespace(doc, namespace):
"""Remove namespace in the passed document in place."""
ns = u'{%s}' % namespace
nsl = len(ns)
for elem in doc.getiterator():
if elem.tag.startswith(ns):
elem.tag = elem.tag[nsl:]
metadata = '/Users/user1/Desktop/Python/metadata.xml'
tree = ET.parse(metadata)
root = tree.getroot()
remove_namespace(root, u'http://apple.com/itunes/importer')
tree.write('/Users/user1/Desktop/Python/done.xml',
pretty_print=True, xml_declaration=True, encoding='UTF-8')
Used a snippet of code from here
This method could be easily extended to delete any namespace attributes by searching for tags that begin with ” rel=”nofollow noreferrer”>lxml.objectify.deannotate.
from lxml import etree, objectify
metadata = '/Users/user1/Desktop/Python/metadata.xml'
parser = etree.XMLParser(remove_blank_text=True)
tree = etree.parse(metadata, parser)
root = tree.getroot()
####
for elem in root.getiterator():
if not hasattr(elem.tag, 'find'): continue # guard for Comment tags
i = elem.tag.find('}')
if i >= 0:
elem.tag = elem.tag[i+1:]
objectify.deannotate(root, cleanup_namespaces=True)
####
tree.write('/Users/user1/Desktop/Python/done.xml',
pretty_print=True, xml_declaration=True, encoding='UTF-8')
Note: Some tags like Comment
return a function when accessing tag
attribute. added a guard for that.
all you need to do is:
objectify.deannotate(root, cleanup_namespaces=True)
after you have get the root, by using root = tree.getroot()
Here are two other ways of removing namespaces. The first uses the lxml.etree.QName helper while the second uses regexes. Both functions allow an optional list of namespaces to match against. If no namespace list is supplied then all namespaces are removed. Attribute keys are also cleaned.
from lxml import etree
import re
def remove_namespaces_qname(doc, namespaces=None):
for el in doc.getiterator():
# clean tag
q = etree.QName(el.tag)
if q is not None:
if namespaces is not None:
if q.namespace in namespaces:
el.tag = q.localname
else:
el.tag = q.localname
# clean attributes
for a, v in el.items():
q = etree.QName(a)
if q is not None:
if namespaces is not None:
if q.namespace in namespaces:
del el.attrib[a]
el.attrib[q.localname] = v
else:
del el.attrib[a]
el.attrib[q.localname] = v
return doc
def remove_namespace_re(doc, namespaces=None):
if namespaces is not None:
ns = list(map(lambda n: u'{%s}' % n, namespaces))
for el in doc.getiterator():
# clean tag
m = re.match(r'({.+})(.+)', el.tag)
if m is not None:
if namespaces is not None:
if m.group(1) in ns:
el.tag = m.group(2)
else:
el.tag = m.group(2)
# clean attributes
for a, v in el.items():
m = re.match(r'({.+})(.+)', a)
if m is not None:
if namespaces is not None:
if m.group(1) in ns:
del el.attrib[a]
el.attrib[m.group(2)] = v
else:
del el.attrib[a]
el.attrib[m.group(2)] = v
return doc
We can get the desired output document in two steps:
- Remove namespace URIs from element names
- Remove unused namespace declarations from the XML tree
Example code
from lxml import etree
input_xml = """
<package rel="noreferrer">documentation, we use lxml.etree.QName.localname
to get local names of elements, that is names without namespace URIs. Then we replace the fully qualified names of the elements by their local names.
Some XML elements, such as comments and processing instructions do not have names. So, we have to skip these elements while replacing element names, otherwise a ValueError
will be raised.
Finally, we use lxml.etree.cleanup_namespaces()
to remove unused namespace declarations from the XML tree.
You could also use XSLT to strip the namespaces...
XSLT 1.0 (test.xsl)
<xsl:stylesheet version="1.0" >
<xsl:output indent="yes"/>
<xsl:strip-space elements="*"/>
<xsl:template match="node()">
<xsl:copy>
<xsl:apply-templates select="@*|node()"/>
</xsl:copy>
</xsl:template>
<xsl:template match="*" priority="1">
<xsl:element name="{local-name()}" namespace="">
<xsl:apply-templates select="@*|node()"/>
</xsl:element>
</xsl:template>
<xsl:template match="@*">
<xsl:attribute name="{local-name()}" namespace="">
<xsl:value-of select="."/>
</xsl:attribute>
</xsl:template>
</xsl:stylesheet>
Python
from lxml import etree
tree = etree.parse("metadata.xml")
xslt = etree.parse("test.xsl")
new_tree = tree.xslt(xslt)
print(etree.tostring(new_tree, pretty_print=True, xml_declaration=True,
encoding="UTF-8").decode("UTF-8"))
Output
<?xml version='1.0' encoding='UTF-8'?>
<package>
<provider>some data</provider>
<language>en-GB</language>
</package>
you can try with lxml:
# Remove namespace prefixes
for elem in root.getiterator():
namespace_removed = elem.xpath('local-name()')
So I realize this is an older answer with a highly up-voted and accepted answer, but if you are reading LARGE-FILES and find yourself in the same predicament I did; I hope this helps you out.
The issue with this approach is, in fact, the iteration. Regardless of how fast the parser is, doing anything say... a few 100k times is gonna eat your execution time. With that said, it came down to really thinking about the problem for me and understanding how namespaces work (or are "intended to work", because they are honestly not needed). Now if your xml truly uses namespaces, meaning you see tags that look like this: <xs:table>
, then you'll need to tweak the approach here for your use-case. I'll include the full way of handling, as well.
DISCLAIMER : I cannot, with a good conscience, tell you to use regular expressions when parsing html/xml, go look at SergiyKolesnikov's answer as it WORKS, but I had an edge case so with that said... let's dive into some regex!
Problem: namespace stripping takes forever... and most of the time the namespaces only live inside of the very opening tag, or our "root". So in thinking about how python reads information in, and where our only problem-child is that root node, why not use that to our advantage.
Please NOTE: the file i'm using as my example comes as a raw, horrid, remarkably senseless structure of lulz with the promise of data in there somewhere.
my_file
is the path to the file im using for our example, I cannot share it with you for professional reasons; and it has been cut down way in size just to get through this answer.
import os, sys, subprocess, re, io, json
from lxml import etree
# Your file would be '_biggest_file' if playing along at home
my_file = _biggest_file
meta_stuff = dict(
exists = os.path.exists(_biggest_file),
sizeof = os.path.getsize(_biggest_file),
extension_is_a_real_thing = any(re.findall(".(html|xml)$", my_file, re.I)),
system_thinks_its_a = subprocess.check_output(
["file", "-i", _biggest_file]
).decode().split(":")[-1:][0].strip()
)
print(json.dumps(meta_stuff, indent = 2))
So for starters, decently sized, and system thinks at best it's html; the file extension is neither xml or html either...
{
"exists": true,
"sizeof": 24442371,
"extension_is_a_real_thing": false,
"system_thinks_its_a": "text/html; charset=us-ascii"
}
Approach:
- In order to parse an xml file... it should at the very least be xml, so we'll need to check and add a declarations tag if one doesn't exist
- If I have namespaces.. thats bad because I can't use xpaths, which is what I want to do
- If my file is huge, I should only operate on the smallest imaginable parts that I need to clean before I'm ready to parse it.
Function
def speed_read(file_path):
# We're gonna be low-brow and add our own using this string. It's fine
_xml_dec = '<?xml version="1.0" encoding="utf-8"?>'
# Even worse.. rgx for xml here we go
#
# We'll need to extract the very first node that we find in our document,
# because for our purposes thats the one we know has the namespace uri's
# ie: "attributes"
# FiRsT node : <actual_name >import pandas as pd
safe_times = []
for i in range(0,5):
s = time.time()
safe_read(_biggest_file)
safe_times.append(time.time() - s)
fast_times = []
for i in range(0,5):
s = time.time()
speed_read(_biggest_file)
fast_times.append(time.time() - s)
pd.DataFrame({"safe":safe_times, "fast":fast_times})
Results
safe | fast |
---|---|
2.36 | 0.61 |
2.15 | 0.58 |
2.47 | 0.49 |
2.94 | 0.60 |
2.83 | 0.53 |
Define and call the following function, right after you parse the XML string:
from lxml import etree
def clean_xml_namespaces(root):
for element in root.getiterator():
if isinstance(element, etree._Comment):
continue
element.tag = etree.QName(element).localname
etree.cleanup_namespaces(root)
Note - comment elements in the XML are ignored, as they should be
Usage:
xml_content = b'''<?xml version="1.0" encoding="UTF-8"?>
<project >accepted solution removes namespaces in node names and not in attributes, i.e. <b:spam c_name="cheese"/>
will be transformed to <spam c_name="cheese"/>
.
An updated version which will give you <spam name="cheese"/>
def remove_namespaces(root):
for elem in root.getiterator():
if not (
isinstance(elem, etree._Comment)
or isinstance(elem, etree._ProcessingInstruction)
):
localname = etree.QName(elem).localname
if elem.tag != localname:
elem.tag = etree.QName(elem).localname
for attr_name in elem.attrib:
local_attr_name = etree.QName(attr_name).localname
if attr_name != local_attr_name:
attr_value = elem.attrib[attr_name]
del elem.attrib[attr_name]
elem.attrib[local_attr_name] = attr_value
deannotate(root, cleanup_namespaces=True)