Compare XML snippets?
Question:
Building on another SO question, how can one check whether two well-formed XML snippets are semantically equal? All I need is “equal” or not, since I’m using this for unit tests.
In the system I want, these would be equal (note the order of ‘start’
and ‘end’):
<?xml version='1.0' encoding='utf-8' standalone='yes'?>
<Stats start="1275955200" end="1276041599">
</Stats>
# Reordered start and end
<?xml version='1.0' encoding='utf-8' standalone='yes'?>
<Stats end="1276041599" start="1275955200" >
</Stats>
I have lxml and other tools at my disposal, and a simple function that only allows reordering of attributes would work fine as well!
Working snippet based on IanB’s answer:
import sys

from formencode.doctest_xml_compare import xml_compare
from lxml import etree

# have to strip these or fromstring carps
# NOTE: on Python 3, lxml rejects str input that carries an encoding
# declaration; use bytes literals (b"""...""") there.
xml1 = """ <?xml version='1.0' encoding='utf-8' standalone='yes'?>
<Stats start="1275955200" end="1276041599"></Stats>"""
xml2 = """ <?xml version='1.0' encoding='utf-8' standalone='yes'?>
<Stats end="1276041599" start="1275955200"></Stats>"""
xml3 = """ <?xml version='1.0' encoding='utf-8' standalone='yes'?>
<Stats start="1275955200"></Stats>"""

tree1 = etree.fromstring(xml1.strip())
tree2 = etree.fromstring(xml2.strip())
tree3 = etree.fromstring(xml3.strip())


def reporter(message):
    """Write one difference message from xml_compare to stdout, on its own line."""
    sys.stdout.write(message + "\n")


# Same attributes in a different order: equal.
assert xml_compare(tree1, tree2, reporter)
# One attribute missing: not equal (the difference is sent to reporter).
assert xml_compare(tree1, tree3, reporter) is False
Answers:
If you take a DOM approach, you can traverse the two trees simultaneously while comparing nodes (node type, text, attributes) as you go.
A recursive solution will be the most elegant – just short-circuit further comparison once a pair of nodes are not “equal” or once you detect a leaf in one tree when it’s a branch in another, etc.
You can use formencode.doctest_xml_compare — the xml_compare function compares two ElementTree or lxml trees.
I had the same problem: two documents I wanted to compare that had the same attributes but in different orders.
It seems that XML Canonicalization (C14N) in lxml works well for this, but I’m definitely not an XML expert. I’m curious to know if somebody else can point out drawbacks to this approach.
from io import BytesIO

from lxml import etree

# xml_string1 / xml_string2 are the two XML documents to compare; they are
# not defined in this snippet and must be supplied by the caller.
parser = etree.XMLParser(remove_blank_text=True)
xml1 = etree.fromstring(xml_string1, parser)
xml2 = etree.fromstring(xml_string2, parser)
print("xml1 == xml2: " + str(xml1 == xml2))

ppxml1 = etree.tostring(xml1, pretty_print=True)
ppxml2 = etree.tostring(xml2, pretty_print=True)
print("pretty(xml1) == pretty(xml2): " + str(ppxml1 == ppxml2))

# write_c14n() emits bytes, so collect the canonical form in a BytesIO.
xml_string_io1 = BytesIO()
xml1.getroottree().write_c14n(xml_string_io1)
cxml1 = xml_string_io1.getvalue()

xml_string_io2 = BytesIO()
xml2.getroottree().write_c14n(xml_string_io2)
cxml2 = xml_string_io2.getvalue()

print("canonicalize(xml1) == canonicalize(xml2): " + str(cxml1 == cxml2))
Running this gives me:
$ python test.py
xml1 == xml2: False
pretty(xml1) == pretty(xml2): False
canonicalize(xml1) == canonicalize(xml2): True
Thinking about this problem, I came up with the following solution that renders XML elements comparable and sortable:
import xml.etree.ElementTree as ET
def cmpElement(x, y):
    """Three-way comparison of two ElementTree elements.

    Elements are compared by type, tag, attributes and stripped text, and
    finally by their children.  Children are sorted before they are compared,
    so the order of siblings does not matter.  Tail text is not looked at.

    Returns -1, 0 or 1, like the Python 2 builtin ``cmp``.
    Assumes string tags (ordinary elements, not comments or processing
    instructions).
    """
    from functools import cmp_to_key

    def three_way(a, b):
        # Portable replacement for the Python 2 builtin cmp().
        return (a > b) - (a < b)

    # compare type (classes are not orderable on Python 3, so order by name)
    if type(x) is not type(y):
        x_name = (type(x).__module__, type(x).__name__)
        y_name = (type(y).__module__, type(y).__name__)
        return three_way(x_name, y_name) or three_way(id(type(x)), id(type(y)))
    # compare tag
    r = three_way(x.tag, y.tag)
    if r:
        return r
    # compare tag attributes; dicts are not orderable on Python 3, so compare
    # their sorted (name, value) pairs instead
    r = three_way(sorted(x.attrib.items()), sorted(y.attrib.items()))
    if r:
        return r
    # compare stripped text content; missing and blank text both count as ''
    r = three_way((x.text or '').strip(), (y.text or '').strip())
    if r:
        return r
    # compare sorted children: the first differing pair decides, otherwise the
    # element with fewer children sorts first
    order = cmp_to_key(cmpElement)
    x_children = sorted(x, key=order)
    y_children = sorted(y, key=order)
    for x_child, y_child in zip(x_children, y_children):
        r = cmpElement(x_child, y_child)
        if r:
            return r
    return three_way(len(x_children), len(y_children))
# Install rich comparison operators on the Element class so that elements can
# be compared and sorted directly; every operator delegates to cmpElement().
# `_ElementInterface` only exists on old Python versions, so fall back to
# ET.Element where it is missing.
_element_class = getattr(ET, "_ElementInterface", ET.Element)
_element_comparisons = (
    ("__lt__", lambda self, other: cmpElement(self, other) < 0),
    ("__gt__", lambda self, other: cmpElement(self, other) > 0),
    ("__le__", lambda self, other: cmpElement(self, other) <= 0),
    ("__ge__", lambda self, other: cmpElement(self, other) >= 0),
    ("__eq__", lambda self, other: cmpElement(self, other) == 0),
    ("__ne__", lambda self, other: cmpElement(self, other) != 0),
)
try:
    for _name, _method in _element_comparisons:
        setattr(_element_class, _name, _method)
except TypeError:
    # The C-implemented Element type refuses new attributes.  Deliberately
    # best-effort: on such interpreters call cmpElement() directly instead.
    pass
The order of the elements can be significant in XML; this may be why most other methods suggested will compare unequal if the order is different… even if the elements have the same attributes and text content.
But I also wanted an order-insensitive comparison, so I came up with this:
from lxml import etree
import xmltodict # pip install xmltodict
def normalise_dict(d):
    """
    Recursively convert dict-like object (eg OrderedDict) into plain dict.
    Sorts list values.

    Dict-like values (anything with an ``items`` method) are converted
    recursively.  Items of a list are normalised *before* the list is sorted,
    so the resulting order does not depend on the ordering inside the items.
    Works on Python 2 and Python 3.

    d: mapping to normalise, e.g. the result of ``xmltodict.parse``.
    Returns a new plain ``dict``; ``d`` is not modified.
    """
    import json

    def canonical(item):
        # Text form used as sort key when the items cannot be ordered directly.
        return json.dumps(item, sort_keys=True)

    def normalise(value):
        if hasattr(value, 'items'):
            return normalise_dict(value)
        if isinstance(value, list):
            items = [
                normalise_dict(item) if hasattr(item, 'items') else item
                for item in value
            ]
            try:
                return sorted(items)
            except TypeError:
                # Python 3 cannot order dicts or None; fall back to a
                # deterministic order based on the JSON text of each item.
                return sorted(items, key=canonical)
        return value

    return {key: normalise(value) for key, value in dict(d).items()}
def xml_compare(a, b):
    """
    Compares two XML documents (as string or etree)
    Does not care about element order

    a, b: XML text (byte or unicode string), or an etree object that
    ``etree.tostring`` can serialise.
    Returns True when both documents normalise to the same dict.
    """
    # Equivalent to Python 2's basestring, but also valid on Python 3.
    text_types = (bytes, type(u''))

    def as_dict(doc):
        # Anything that is not already text is serialised first.
        if not isinstance(doc, text_types):
            doc = etree.tostring(doc)
        return normalise_dict(xmltodict.parse(doc))

    return as_dict(a) == as_dict(b)
Adapting Anentropic’s great answer to Python 3 (basically, change iteritems() to items(), and basestring to str):
from lxml import etree
import xmltodict # pip install xmltodict
def normalise_dict(d):
    """
    Recursively convert dict-like object (eg OrderedDict) into plain dict.
    Sorts list values.

    Dict-like values (anything with an ``items`` method) are converted
    recursively.  Items of a list are normalised *before* the list is sorted,
    so the resulting order does not depend on the ordering inside the items.

    d: mapping to normalise, e.g. the result of ``xmltodict.parse``.
    Returns a new plain ``dict``; ``d`` is not modified.
    """
    import json

    def canonical(item):
        # Text form used as sort key when the items cannot be ordered directly.
        return json.dumps(item, sort_keys=True)

    def normalise(value):
        if hasattr(value, 'items'):
            return normalise_dict(value)
        if isinstance(value, list):
            items = [
                normalise_dict(item) if hasattr(item, 'items') else item
                for item in value
            ]
            try:
                return sorted(items)
            except TypeError:
                # Python 3 cannot order dicts or None; fall back to a
                # deterministic order based on the JSON text of each item.
                return sorted(items, key=canonical)
        return value

    return {key: normalise(value) for key, value in dict(d).items()}
def xml_compare(a, b):
    """
    Compares two XML documents (as string or etree)
    Does not care about element order

    a, b: XML text (``str`` or ``bytes``), or an etree object that
    ``etree.tostring`` can serialise.
    Returns True when both documents normalise to the same dict.
    """
    def as_dict(doc):
        # bytes is already serialised XML too; only serialise real trees.
        if not isinstance(doc, (str, bytes)):
            doc = etree.tostring(doc)
        return normalise_dict(xmltodict.parse(doc))

    return as_dict(a) == as_dict(b)
Since the order of attributes is not significant in XML, you want to ignore differences due to different attribute orderings. XML canonicalization (C14N) deterministically orders attributes, so you can use that method for testing equality:
import io

import lxml.etree

# Three sample documents: xml2 only reorders the attributes of xml1,
# xml3 drops one attribute.
xml1 = b''' <?xml version='1.0' encoding='utf-8' standalone='yes'?>
<Stats start="1275955200" end="1276041599"></Stats>'''
xml2 = b''' <?xml version='1.0' encoding='utf-8' standalone='yes'?>
<Stats end="1276041599" start="1275955200"></Stats>'''
xml3 = b''' <?xml version='1.0' encoding='utf-8' standalone='yes'?>
<Stats start="1275955200"></Stats>'''

tree1, tree2, tree3 = (
    lxml.etree.fromstring(document.strip())
    for document in (xml1, xml2, xml3)
)

# Write the canonical (C14N) form of each tree into its own byte buffer.
b1, b2, b3 = io.BytesIO(), io.BytesIO(), io.BytesIO()
for tree, buffer in ((tree1, b1), (tree2, b2), (tree3, b3)):
    tree.getroottree().write_c14n(buffer)

assert b1.getvalue() == b2.getvalue()
assert b1.getvalue() != b3.getvalue()
Note that this example assumes Python 3. With Python 3, the use of b'''...'''
strings and io.BytesIO
is mandatory, while with Python 2 this method also works with normal strings and io.StringIO
.
Here is a simple solution: convert the XML into dictionaries (with xmltodict) and compare the dictionaries:
import json
import xmltodict
class XmlDiff(object):
    """Compare two XML documents through their xmltodict representation."""

    def __init__(self, xml1, xml2):
        """Parse both documents into plain dicts.

        xml1, xml2: XML input in any form ``xmltodict.parse`` accepts.
        """
        self.dict1 = self._to_plain(xml1)
        self.dict2 = self._to_plain(xml2)

    @staticmethod
    def _to_plain(xml):
        """Parse ``xml`` and return it as built-in dicts, lists and scalars."""
        # The JSON round trip replaces whatever mapping type xmltodict
        # returns (possibly OrderedDict, whose equality is order-sensitive)
        # with plain dicts.
        return json.loads(json.dumps(xmltodict.parse(xml)))

    def equal(self):
        """Return True when both parsed documents are equal."""
        return self.dict1 == self.dict2
unit test
import unittest
class XMLDiffTestCase(unittest.TestCase):
    """Checks XmlDiff on attribute reordering and on a missing attribute."""

    def test_xml_equal(self):
        """Documents that differ only in attribute order compare equal."""
        xml1 = """<?xml version='1.0' encoding='utf-8' standalone='yes'?>
<Stats start="1275955200" end="1276041599">
</Stats>"""
        xml2 = """<?xml version='1.0' encoding='utf-8' standalone='yes'?>
<Stats end="1276041599" start="1275955200" >
</Stats>"""
        self.assertTrue(XmlDiff(xml1, xml2).equal())

    def test_xml_not_equal(self):
        """A document lacking the 'end' attribute does not compare equal."""
        xml1 = """<?xml version='1.0' encoding='utf-8' standalone='yes'?>
<Stats start="1275955200">
</Stats>"""
        xml2 = """<?xml version='1.0' encoding='utf-8' standalone='yes'?>
<Stats end="1276041599" start="1275955200" >
</Stats>"""
        self.assertFalse(XmlDiff(xml1, xml2).equal())
Or as a simple Python function:
import json
import xmltodict
def xml_equal(a, b):
    """
    Compares two XML documents given as XML text (anything
    ``xmltodict.parse`` accepts; etree objects are not converted here).

    Attribute order is ignored because the parsed documents are compared as
    plain dicts.
    NOTE(review): repeated same-name siblings are presumably collected into
    lists by xmltodict, and lists compare in order - confirm before relying
    on this for order-insensitive comparison.

    Returns True when both documents parse to equal structures.
    """
    def to_plain(xml):
        # JSON round trip: replace xmltodict's mapping type with plain dicts.
        return json.loads(json.dumps(xmltodict.parse(xml)))

    return to_plain(a) == to_plain(b)
SimpleTAL uses a custom xml.sax handler to compare xml-documents
https://github.com/janbrohl/SimpleTAL/blob/python2/tests/TALTests/XMLTests/TALAttributeTestCases.py#L47-L112
(the results for getXMLChecksum are compared)
but I prefer generating a list instead of an MD5 hash
What about the following code snippet ? Can be easily enhanced to include attribs as well :
def separator(self):
    """Return the marker placed between an element's path and its text."""
    return "!@#$%^&*" # Very ugly separator


def _traverseXML(self, xmlElem, tags, xpaths):
    """Add one "path + separator + text" entry for xmlElem and each descendant.

    xmlElem: element to visit.
    tags: list of the tag names of the ancestors; used as a stack and left
        unchanged when the call returns.
    xpaths: set that receives the entries.

    Only tags and stripped element text are recorded; attributes and tail
    text are not.
    """
    tags.append(xmlElem.tag)
    for child in xmlElem:
        self._traverseXML(child, tags, xpaths)
    text = ''
    if xmlElem.text:
        text = xmlElem.text.strip()
    xpaths.add("/".join(tags) + self.separator() + text)
    tags.pop()


def _xmlToSet(self, xml):
    """Parse the XML string ``xml`` and return its set of path/text entries."""
    xpaths = set() # output
    tags = list()
    root = ET.fromstring(xml)
    self._traverseXML(root, tags, xpaths)
    return xpaths


def _areXMLsAlike(self, xml1, xml2):
    """Return True when both XML strings produce the same set of entries.

    Because sets are compared, sibling order and repeated identical
    elements do not affect the result.
    """
    xpaths1 = self._xmlToSet(xml1)
    xpaths2 = self._xmlToSet(xml2)
    return xpaths1 == xpaths2
Building on another SO question, how can one check whether two well-formed XML snippets are semantically equal? All I need is “equal” or not, since I’m using this for unit tests.
In the system I want, these would be equal (note the order of ‘start’
and ‘end’):
<?xml version='1.0' encoding='utf-8' standalone='yes'?>
<Stats start="1275955200" end="1276041599">
</Stats>
# Reordered start and end
<?xml version='1.0' encoding='utf-8' standalone='yes'?>
<Stats end="1276041599" start="1275955200" >
</Stats>
I have lxml and other tools at my disposal, and a simple function that only allows reordering of attributes would work fine as well!
Working snippet based on IanB’s answer:
import sys

from formencode.doctest_xml_compare import xml_compare
from lxml import etree

# have to strip these or fromstring carps
# NOTE: on Python 3, lxml rejects str input that carries an encoding
# declaration; use bytes literals (b"""...""") there.
xml1 = """ <?xml version='1.0' encoding='utf-8' standalone='yes'?>
<Stats start="1275955200" end="1276041599"></Stats>"""
xml2 = """ <?xml version='1.0' encoding='utf-8' standalone='yes'?>
<Stats end="1276041599" start="1275955200"></Stats>"""
xml3 = """ <?xml version='1.0' encoding='utf-8' standalone='yes'?>
<Stats start="1275955200"></Stats>"""

tree1 = etree.fromstring(xml1.strip())
tree2 = etree.fromstring(xml2.strip())
tree3 = etree.fromstring(xml3.strip())


def reporter(message):
    """Write one difference message from xml_compare to stdout, on its own line."""
    sys.stdout.write(message + "\n")


# Same attributes in a different order: equal.
assert xml_compare(tree1, tree2, reporter)
# One attribute missing: not equal (the difference is sent to reporter).
assert xml_compare(tree1, tree3, reporter) is False
If you take a DOM approach, you can traverse the two trees simultaneously while comparing nodes (node type, text, attributes) as you go.
A recursive solution will be the most elegant – just short-circuit further comparison once a pair of nodes are not “equal” or once you detect a leaf in one tree when it’s a branch in another, etc.
You can use formencode.doctest_xml_compare — the xml_compare function compares two ElementTree or lxml trees.
I had the same problem: two documents I wanted to compare that had the same attributes but in different orders.
It seems that XML Canonicalization (C14N) in lxml works well for this, but I’m definitely not an XML expert. I’m curious to know if somebody else can point out drawbacks to this approach.
from io import BytesIO

from lxml import etree

# xml_string1 / xml_string2 are the two XML documents to compare; they are
# not defined in this snippet and must be supplied by the caller.
parser = etree.XMLParser(remove_blank_text=True)
xml1 = etree.fromstring(xml_string1, parser)
xml2 = etree.fromstring(xml_string2, parser)
print("xml1 == xml2: " + str(xml1 == xml2))

ppxml1 = etree.tostring(xml1, pretty_print=True)
ppxml2 = etree.tostring(xml2, pretty_print=True)
print("pretty(xml1) == pretty(xml2): " + str(ppxml1 == ppxml2))

# write_c14n() emits bytes, so collect the canonical form in a BytesIO.
xml_string_io1 = BytesIO()
xml1.getroottree().write_c14n(xml_string_io1)
cxml1 = xml_string_io1.getvalue()

xml_string_io2 = BytesIO()
xml2.getroottree().write_c14n(xml_string_io2)
cxml2 = xml_string_io2.getvalue()

print("canonicalize(xml1) == canonicalize(xml2): " + str(cxml1 == cxml2))
Running this gives me:
$ python test.py
xml1 == xml2: False
pretty(xml1) == pretty(xml2): False
canonicalize(xml1) == canonicalize(xml2): True
Thinking about this problem, I came up with the following solution that renders XML elements comparable and sortable:
import xml.etree.ElementTree as ET
def cmpElement(x, y):
    """Three-way comparison of two ElementTree elements.

    Elements are compared by type, tag, attributes and stripped text, and
    finally by their children.  Children are sorted before they are compared,
    so the order of siblings does not matter.  Tail text is not looked at.

    Returns -1, 0 or 1, like the Python 2 builtin ``cmp``.
    Assumes string tags (ordinary elements, not comments or processing
    instructions).
    """
    from functools import cmp_to_key

    def three_way(a, b):
        # Portable replacement for the Python 2 builtin cmp().
        return (a > b) - (a < b)

    # compare type (classes are not orderable on Python 3, so order by name)
    if type(x) is not type(y):
        x_name = (type(x).__module__, type(x).__name__)
        y_name = (type(y).__module__, type(y).__name__)
        return three_way(x_name, y_name) or three_way(id(type(x)), id(type(y)))
    # compare tag
    r = three_way(x.tag, y.tag)
    if r:
        return r
    # compare tag attributes; dicts are not orderable on Python 3, so compare
    # their sorted (name, value) pairs instead
    r = three_way(sorted(x.attrib.items()), sorted(y.attrib.items()))
    if r:
        return r
    # compare stripped text content; missing and blank text both count as ''
    r = three_way((x.text or '').strip(), (y.text or '').strip())
    if r:
        return r
    # compare sorted children: the first differing pair decides, otherwise the
    # element with fewer children sorts first
    order = cmp_to_key(cmpElement)
    x_children = sorted(x, key=order)
    y_children = sorted(y, key=order)
    for x_child, y_child in zip(x_children, y_children):
        r = cmpElement(x_child, y_child)
        if r:
            return r
    return three_way(len(x_children), len(y_children))
# Install rich comparison operators on the Element class so that elements can
# be compared and sorted directly; every operator delegates to cmpElement().
# `_ElementInterface` only exists on old Python versions, so fall back to
# ET.Element where it is missing.
_element_class = getattr(ET, "_ElementInterface", ET.Element)
_element_comparisons = (
    ("__lt__", lambda self, other: cmpElement(self, other) < 0),
    ("__gt__", lambda self, other: cmpElement(self, other) > 0),
    ("__le__", lambda self, other: cmpElement(self, other) <= 0),
    ("__ge__", lambda self, other: cmpElement(self, other) >= 0),
    ("__eq__", lambda self, other: cmpElement(self, other) == 0),
    ("__ne__", lambda self, other: cmpElement(self, other) != 0),
)
try:
    for _name, _method in _element_comparisons:
        setattr(_element_class, _name, _method)
except TypeError:
    # The C-implemented Element type refuses new attributes.  Deliberately
    # best-effort: on such interpreters call cmpElement() directly instead.
    pass
The order of the elements can be significant in XML; this may be why most other methods suggested will compare unequal if the order is different… even if the elements have the same attributes and text content.
But I also wanted an order-insensitive comparison, so I came up with this:
from lxml import etree
import xmltodict # pip install xmltodict
def normalise_dict(d):
    """
    Recursively convert dict-like object (eg OrderedDict) into plain dict.
    Sorts list values.

    Dict-like values (anything with an ``items`` method) are converted
    recursively.  Items of a list are normalised *before* the list is sorted,
    so the resulting order does not depend on the ordering inside the items.
    Works on Python 2 and Python 3.

    d: mapping to normalise, e.g. the result of ``xmltodict.parse``.
    Returns a new plain ``dict``; ``d`` is not modified.
    """
    import json

    def canonical(item):
        # Text form used as sort key when the items cannot be ordered directly.
        return json.dumps(item, sort_keys=True)

    def normalise(value):
        if hasattr(value, 'items'):
            return normalise_dict(value)
        if isinstance(value, list):
            items = [
                normalise_dict(item) if hasattr(item, 'items') else item
                for item in value
            ]
            try:
                return sorted(items)
            except TypeError:
                # Python 3 cannot order dicts or None; fall back to a
                # deterministic order based on the JSON text of each item.
                return sorted(items, key=canonical)
        return value

    return {key: normalise(value) for key, value in dict(d).items()}
def xml_compare(a, b):
    """
    Compares two XML documents (as string or etree)
    Does not care about element order

    a, b: XML text (byte or unicode string), or an etree object that
    ``etree.tostring`` can serialise.
    Returns True when both documents normalise to the same dict.
    """
    # Equivalent to Python 2's basestring, but also valid on Python 3.
    text_types = (bytes, type(u''))

    def as_dict(doc):
        # Anything that is not already text is serialised first.
        if not isinstance(doc, text_types):
            doc = etree.tostring(doc)
        return normalise_dict(xmltodict.parse(doc))

    return as_dict(a) == as_dict(b)
Adapting Anentropic’s great answer to Python 3 (basically, change iteritems() to items(), and basestring to str):
from lxml import etree
import xmltodict # pip install xmltodict
def normalise_dict(d):
    """
    Recursively convert dict-like object (eg OrderedDict) into plain dict.
    Sorts list values.

    Dict-like values (anything with an ``items`` method) are converted
    recursively.  Items of a list are normalised *before* the list is sorted,
    so the resulting order does not depend on the ordering inside the items.

    d: mapping to normalise, e.g. the result of ``xmltodict.parse``.
    Returns a new plain ``dict``; ``d`` is not modified.
    """
    import json

    def canonical(item):
        # Text form used as sort key when the items cannot be ordered directly.
        return json.dumps(item, sort_keys=True)

    def normalise(value):
        if hasattr(value, 'items'):
            return normalise_dict(value)
        if isinstance(value, list):
            items = [
                normalise_dict(item) if hasattr(item, 'items') else item
                for item in value
            ]
            try:
                return sorted(items)
            except TypeError:
                # Python 3 cannot order dicts or None; fall back to a
                # deterministic order based on the JSON text of each item.
                return sorted(items, key=canonical)
        return value

    return {key: normalise(value) for key, value in dict(d).items()}
def xml_compare(a, b):
    """
    Compares two XML documents (as string or etree)
    Does not care about element order

    a, b: XML text (``str`` or ``bytes``), or an etree object that
    ``etree.tostring`` can serialise.
    Returns True when both documents normalise to the same dict.
    """
    def as_dict(doc):
        # bytes is already serialised XML too; only serialise real trees.
        if not isinstance(doc, (str, bytes)):
            doc = etree.tostring(doc)
        return normalise_dict(xmltodict.parse(doc))

    return as_dict(a) == as_dict(b)
Since the order of attributes is not significant in XML, you want to ignore differences due to different attribute orderings. XML canonicalization (C14N) deterministically orders attributes, so you can use that method for testing equality:
import io

import lxml.etree

# Three sample documents: xml2 only reorders the attributes of xml1,
# xml3 drops one attribute.
xml1 = b''' <?xml version='1.0' encoding='utf-8' standalone='yes'?>
<Stats start="1275955200" end="1276041599"></Stats>'''
xml2 = b''' <?xml version='1.0' encoding='utf-8' standalone='yes'?>
<Stats end="1276041599" start="1275955200"></Stats>'''
xml3 = b''' <?xml version='1.0' encoding='utf-8' standalone='yes'?>
<Stats start="1275955200"></Stats>'''

tree1, tree2, tree3 = (
    lxml.etree.fromstring(document.strip())
    for document in (xml1, xml2, xml3)
)

# Write the canonical (C14N) form of each tree into its own byte buffer.
b1, b2, b3 = io.BytesIO(), io.BytesIO(), io.BytesIO()
for tree, buffer in ((tree1, b1), (tree2, b2), (tree3, b3)):
    tree.getroottree().write_c14n(buffer)

assert b1.getvalue() == b2.getvalue()
assert b1.getvalue() != b3.getvalue()
Note that this example assumes Python 3. With Python 3, the use of b'''...'''
strings and io.BytesIO
is mandatory, while with Python 2 this method also works with normal strings and io.StringIO
.
Here is a simple solution: convert the XML into dictionaries (with xmltodict) and compare the dictionaries:
import json
import xmltodict
class XmlDiff(object):
    """Compare two XML documents through their xmltodict representation."""

    def __init__(self, xml1, xml2):
        """Parse both documents into plain dicts.

        xml1, xml2: XML input in any form ``xmltodict.parse`` accepts.
        """
        self.dict1 = self._to_plain(xml1)
        self.dict2 = self._to_plain(xml2)

    @staticmethod
    def _to_plain(xml):
        """Parse ``xml`` and return it as built-in dicts, lists and scalars."""
        # The JSON round trip replaces whatever mapping type xmltodict
        # returns (possibly OrderedDict, whose equality is order-sensitive)
        # with plain dicts.
        return json.loads(json.dumps(xmltodict.parse(xml)))

    def equal(self):
        """Return True when both parsed documents are equal."""
        return self.dict1 == self.dict2
unit test
import unittest
class XMLDiffTestCase(unittest.TestCase):
    """Checks XmlDiff on attribute reordering and on a missing attribute."""

    def test_xml_equal(self):
        """Documents that differ only in attribute order compare equal."""
        xml1 = """<?xml version='1.0' encoding='utf-8' standalone='yes'?>
<Stats start="1275955200" end="1276041599">
</Stats>"""
        xml2 = """<?xml version='1.0' encoding='utf-8' standalone='yes'?>
<Stats end="1276041599" start="1275955200" >
</Stats>"""
        self.assertTrue(XmlDiff(xml1, xml2).equal())

    def test_xml_not_equal(self):
        """A document lacking the 'end' attribute does not compare equal."""
        xml1 = """<?xml version='1.0' encoding='utf-8' standalone='yes'?>
<Stats start="1275955200">
</Stats>"""
        xml2 = """<?xml version='1.0' encoding='utf-8' standalone='yes'?>
<Stats end="1276041599" start="1275955200" >
</Stats>"""
        self.assertFalse(XmlDiff(xml1, xml2).equal())
Or as a simple Python function:
import json
import xmltodict
def xml_equal(a, b):
    """
    Compares two XML documents given as XML text (anything
    ``xmltodict.parse`` accepts; etree objects are not converted here).

    Attribute order is ignored because the parsed documents are compared as
    plain dicts.
    NOTE(review): repeated same-name siblings are presumably collected into
    lists by xmltodict, and lists compare in order - confirm before relying
    on this for order-insensitive comparison.

    Returns True when both documents parse to equal structures.
    """
    def to_plain(xml):
        # JSON round trip: replace xmltodict's mapping type with plain dicts.
        return json.loads(json.dumps(xmltodict.parse(xml)))

    return to_plain(a) == to_plain(b)
SimpleTAL uses a custom xml.sax handler to compare xml-documents
https://github.com/janbrohl/SimpleTAL/blob/python2/tests/TALTests/XMLTests/TALAttributeTestCases.py#L47-L112
(the results for getXMLChecksum are compared)
but I prefer generating a list instead of an MD5 hash
What about the following code snippet ? Can be easily enhanced to include attribs as well :
def separator(self):
    """Return the marker placed between an element's path and its text."""
    return "!@#$%^&*" # Very ugly separator


def _traverseXML(self, xmlElem, tags, xpaths):
    """Add one "path + separator + text" entry for xmlElem and each descendant.

    xmlElem: element to visit.
    tags: list of the tag names of the ancestors; used as a stack and left
        unchanged when the call returns.
    xpaths: set that receives the entries.

    Only tags and stripped element text are recorded; attributes and tail
    text are not.
    """
    tags.append(xmlElem.tag)
    for child in xmlElem:
        self._traverseXML(child, tags, xpaths)
    text = ''
    if xmlElem.text:
        text = xmlElem.text.strip()
    xpaths.add("/".join(tags) + self.separator() + text)
    tags.pop()


def _xmlToSet(self, xml):
    """Parse the XML string ``xml`` and return its set of path/text entries."""
    xpaths = set() # output
    tags = list()
    root = ET.fromstring(xml)
    self._traverseXML(root, tags, xpaths)
    return xpaths


def _areXMLsAlike(self, xml1, xml2):
    """Return True when both XML strings produce the same set of entries.

    Because sets are compared, sibling order and repeated identical
    elements do not affect the result.
    """
    xpaths1 = self._xmlToSet(xml1)
    xpaths2 = self._xmlToSet(xml2)
    return xpaths1 == xpaths2