XML reading.
# blogspot_xml_santizer.py
"""Print the <content> elements of a Blogspot XML export with entities unescaped.

Run as a script, this reads ``EXPORT_PATH``, prints how many ``<content>``
elements the document holds, and then prints each element's pretty-printed
XML after ``xml.sax.saxutils.unescape`` has been applied to it.
"""

from xml.dom import minidom
from xml.sax.saxutils import unescape

# Export file processed when the script is run directly.
EXPORT_PATH = 'blog-09-02-2012.xml'

# Extra replacements handed to ``unescape`` on top of its built-in
# ``&lt;`` / ``&gt;`` / ``&amp;`` handling.
# NOTE(review): the key was garbled in the source (it appeared as a bare
# apostrophe inside the quotes); ``&#39;`` is the presumed original - confirm.
# NOTE(review): ``toprettyxml()`` escapes every ``&`` as ``&amp;`` and
# ``unescape()`` applies custom entities *before* ``&amp;``, so this mapping
# likely never matches the serialized output - verify the intent.
ENTITIES = {r'&#39;': "-"}


def sanitize_contents(xml_source, entities=None):
    """Return the unescaped pretty-printed XML of every ``<content>`` element.

    Args:
        xml_source: The XML document as ``bytes`` or ``str``. Passing bytes
            lets the parser honour the encoding named in the XML declaration.
        entities: Extra ``{entity: replacement}`` pairs for ``unescape``.
            Defaults to the module-level ``ENTITIES``.

    Returns:
        A list with one string per ``<content>`` element, in document order.
        The list is empty when the document has no such element.

    Raises:
        xml.parsers.expat.ExpatError: If ``xml_source`` is not well-formed.
    """
    if entities is None:
        entities = ENTITIES
    document = minidom.parseString(xml_source)
    return [
        unescape(node.toprettyxml(), entities)
        for node in document.getElementsByTagName('content')
    ]


def main(path=EXPORT_PATH):
    """Print the ``<content>`` count of ``path``, then each sanitized element."""
    # Read raw bytes (and close the handle promptly) so decoding is driven by
    # the XML declaration rather than by the platform's locale encoding.
    with open(path, 'rb') as export_file:
        fragments = sanitize_contents(export_file.read())

    print(len(fragments))
    for fragment in fragments:
        print(fragment)


if __name__ == "__main__":
    main()