DZone Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags / keywords, and share them with the world
Decode HTML Entities
Use like this:
print decode_htmlentities("l&#39;eau")
from htmlentitydefs import name2codepoint as n2cp
import re
def substitute_entity(match):
    """Return the replacement text for one matched HTML entity.

    ``match`` is a regex match whose group 1 is ``"#"`` for a numeric
    character reference (empty otherwise) and whose group 2 is the
    reference body: decimal digits, ``x``/``X`` followed by hex digits,
    or an entity name that is looked up in ``n2cp``.

    Returns the decoded character, or the matched text unchanged when the
    reference is unknown, malformed, or names a code point the running
    interpreter cannot represent.
    """
    try:
        to_char = unichr  # Python 2: builds a unicode character
    except NameError:
        to_char = chr  # Python 3: chr() already returns text

    ent = match.group(2)

    if match.group(1) != "#":
        # Named entity such as &eacute;
        cp = n2cp.get(ent)
        return match.group() if cp is None else to_char(cp)

    # Numeric reference: decimal (&#233;) or hexadecimal (&#xE9;).
    if ent[:1] in ("x", "X"):
        digits, base, allowed = ent[1:], 16, "0123456789abcdefABCDEF"
    else:
        digits, base, allowed = ent, 10, "0123456789"

    # Validate by hand instead of relying on int(): the calling regex lets
    # any word characters follow "#", and newer Pythons' int() accepts
    # underscores, which are not valid in a character reference.
    if not digits or any(ch not in allowed for ch in digits):
        return match.group()

    try:
        return to_char(int(digits, base))
    except (ValueError, OverflowError):
        # Code point outside the range this interpreter supports.
        return match.group()
# Matches "&name;" and "&#digits;"; group 1 is "#" for numeric references,
# group 2 is the name or number. Written as a raw string so that \d and \w
# reach the regex engine without being treated as string escapes.
_ENTITY_RE = re.compile(r"&(#?)(\d{1,5}|\w{1,8});")


def decode_htmlentities(string):
    """Return ``string`` with HTML entities replaced by their characters.

    Every match of the entity pattern is passed to ``substitute_entity``,
    whose return value replaces the matched text.
    """
    return _ENTITY_RE.sub(substitute_entity, string)






Comments
Snippets Manager replied on Wed, 2009/04/01 - 6:27pm
Snippets Manager replied on Wed, 2009/04/01 - 6:27pm
r'&(#?)(x?)(\d{1,5}|\w{1,8});' to r'&(#?)(x?)(\w+);'
Here's my test:
>>> from html_decode import decode_htmlentities
>>> u = u'E tu vivrai nel terrore - L&#39;aldil&agrave; (1981)'
>>> print decode_htmlentities(u).encode('UTF-8')
E tu vivrai nel terrore - L'aldilà (1981)
Snippets Manager replied on Wed, 2009/04/01 - 6:27pm
try:
    from htmlentitydefs import name2codepoint as n2cp  # Python 2
except ImportError:
    from html.entities import name2codepoint as n2cp  # Python 3
import re


def substitute_entity(match):
    """Return the replacement text for one matched HTML entity.

    ``match`` comes from the three-group pattern used by
    ``decode_htmlentities``: group 1 is ``"#"`` for a numeric reference,
    group 2 is an optional leading ``x`` and group 3 is the rest of the
    reference body.

    Returns the decoded character, or the matched text unchanged when the
    reference is unknown, malformed, or names a code point the running
    interpreter cannot represent.
    """
    try:
        to_char = unichr  # Python 2: builds a unicode character
    except NameError:
        to_char = chr  # Python 3: chr() already returns text

    # Rejoin the optional "x" with the rest of the body. The pattern splits
    # it off even for named entities, so "&xi;" arrives as ("x", "i") and
    # must be looked up as "xi", not "i".
    ent = match.group(2) + match.group(3)

    if match.group(1) != "#":
        cp = n2cp.get(ent)
        return match.group() if cp is None else to_char(cp)

    # Numeric reference: decimal (&#233;) or hexadecimal (&#xE9;).
    if ent[:1] in ("x", "X"):
        digits, base, allowed = ent[1:], 16, "0123456789abcdefABCDEF"
    else:
        digits, base, allowed = ent, 10, "0123456789"

    # Validate by hand: the pattern lets arbitrary word characters follow
    # "#", and int() would raise (or accept underscores) on such input.
    if not digits or any(ch not in allowed for ch in digits):
        return match.group()

    try:
        return to_char(int(digits, base))
    except (ValueError, OverflowError):
        # Code point outside the range this interpreter supports.
        return match.group()


def decode_htmlentities(string):
    """Return ``string`` with HTML entities replaced by their characters."""
    entity_re = re.compile(r'&(#?)(x?)(\d{1,5}|\w{1,8});')
    return entity_re.sub(substitute_entity, string)

# Snippets Manager replied on Wed, 2007/10/31 - 11:42am