def encode_for_xml(unicode_data, encoding='ascii'):
    """
    Return unicode_data encoded with `encoding` for embedding in XML or HTML.

    Characters that `encoding` cannot represent are written out as XML
    numeric character references instead.
    """
    try:
        encoded = unicode_data.encode(encoding, 'xmlcharrefreplace')
    except ValueError:
        # Python releases before 2.3 have no 'xmlcharrefreplace' error
        # handler; there, ValueError is raised when the data contains
        # unencodable characters and the handler cannot be found.  Fall
        # back to the hand-written emulation of the handler.
        encoded = _xmlcharref_encode(unicode_data, encoding)
    return encoded


def _xmlcharref_encode(unicode_data, encoding):
    """Emulate Python 2.3's 'xmlcharrefreplace' encoding error handler.

    Each character of unicode_data is encoded on its own with `encoding`.
    A character that cannot be encoded is replaced by its XML numeric
    character reference (``&#NNN;`` with the decimal code point), and that
    reference is itself encoded with `encoding`.

    Returns a byte string (``str`` on Python 2, ``bytes`` on Python 3),
    i.e. the same type ``unicode_data.encode(...)`` produces.

    Raises UnicodeError if the text of a character reference cannot be
    encoded with `encoding` either.
    """
    chunks = []
    # Step through the unicode_data string one character at a time in
    # order to catch unencodable characters:
    for char in unicode_data:
        try:
            chunks.append(char.encode(encoding, 'strict'))
        except UnicodeError:
            # Encode the reference as well so that every chunk is a byte
            # string in the target encoding.  Appending the bare text
            # literal would mix text with encoded data, which makes the
            # join below fail on Python 3 and yields wrong bytes for
            # encodings that are not ASCII-compatible.
            reference = u'&#%i;' % ord(char)
            chunks.append(reference.encode(encoding, 'strict'))
    # ''.encode('ascii') is an empty byte string on both Python 2 and
    # Python 3, spelled without the b'' literal that old interpreters
    # do not accept.
    return ''.encode('ascii').join(chunks)


if __name__ == '__main__':
    # Demo: encode a small HTML document containing accented letters and
    # symbols as ASCII and print the result, so that every non-ASCII
    # character shows up as a numeric character reference.
    data = u'''\
<html>
<head>
<title>Encoding Test</title>
</head>
<body>
<p>accented characters:</p>
<ul>
<li>\xe0 (a + grave)
<li>\xe7 (c + cedilla)
<li>\xe9 (e + acute)
<li>\xee (i + circumflex)
<li>\xf1 (n + tilde)
<li>\xfc (u + umlaut)
</ul>
<p>symbols:</p>
<ul>
<li>\xa3 (British pound)
<li>\xa2 (cent)
<li>\u20ac (Euro)
<li>\u221e (infinity)
<li>\xb0 (degree)
</ul>
</body></html>
'''
    # The parenthesised single-argument form parses as both the Python 2
    # print statement and the Python 3 print function.  The encoded result
    # is pure ASCII, so decoding it prints the markup itself on either
    # version rather than a bytes repr on Python 3.
    print(encode_for_xml(data, 'ascii').decode('ascii'))