DZone Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags / keywords, and share them with the world
Python Html2txt
// Convert HTML to plain text: strip comments and tags, turning <p>/<tr> into newlines and <td> into tabs.
import re

# Compiled patterns used by html2txt to map markup onto plain-text layout.

# Opening <p ...> and <tr ...> tags (case-insensitive); html2txt turns these
# into newlines.
# NOTE: the prefix match also catches tags such as <pre> or <track>.
p = re.compile(r'(<p.*?>)|(<tr.*?>)', re.I)

# Opening <td ...> tags (case-insensitive); html2txt turns these into tabs.
t = re.compile(r'<td.*?>', re.I)

# HTML comments. re.S lets '.' cross line breaks so multi-line comments are
# matched; re.M would only change '^'/'$', which this pattern does not use.
comm = re.compile(r'<!--.*?-->', re.S)

# Any tag, including tags whose attributes are spread over several lines.
tags = re.compile(r'<.*?>', re.S)
def html2txt(s, hint='entity', code='ISO-8859-1'):
    """Convert an HTML string to raw text.

    Processing steps, in order:

    - remove every line break from the input
    - remove HTML comments
    - replace <p> and <tr> opening tags with a newline
    - replace <td> opening tags with a tab
    - remove all remaining tags
    - collapse runs of spaces into a single space

    Needs the following module-level compiled regexes:

    - ``p``: matches <p ...> and <tr ...> opening tags
    - ``t``: matches <td ...> opening tags
    - ``comm``: matches HTML comments
    - ``tags``: matches any tag

    Args:
        s: HTML source text.
        hint: Currently unused. Entity handling is not implemented, so
            character references such as ``&amp;`` are returned unchanged.
        code: Currently unused.

    Returns:
        The text of ``s`` with markup removed; the only newlines and tabs
        in the result are the ones substituted for <p>/<tr> and <td>.

    version 0.0.1 20020930
    """
    # Remove line breaks (both '\r' and '\n', so CRLF input leaves nothing
    # behind). TODO: time this against a split/filter/join approach.
    s = s.replace('\r', '').replace('\n', '')
    # Strip comments first so that markup inside a comment (e.g. a
    # commented-out <p>) never contributes newlines or tabs to the output.
    s = comm.sub('', s)
    s = p.sub('\n', s)   # <p> and <tr> become newlines
    s = t.sub('\t', s)   # <td> becomes a tab
    s = tags.sub('', s)  # drop every remaining tag
    # Collapse runs of spaces only; the newlines and tabs inserted above
    # are preserved.
    s = re.sub(' +', ' ', s)
    # TODO: entity handling is not implemented; `hint` and `code` are unused.
    return s




