DZone Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags/keywords, and share them with the world.
Html Table To Wiki Converter
For details on how to call this script from PHP when your server cannot run Python directly, see http://just-tech.blogspot.com/2007/01/python-html-tables-to-mediawiki.html
import re
import sys

try:
    import HTMLParser  # Python 2 module name
except ImportError:
    import html.parser as HTMLParser  # Python 3 home of the same HTMLParser class
class html2wiki(HTMLParser.HTMLParser):
    """Convert the HTML tables in a document into MediaWiki table markup.

    Pass HTML to ``feed()``; the converted text accumulates in ``self.wiki``.
    Only ``<table>``, ``<tr>`` and ``<td>`` tags are interpreted. Every other
    tag is ignored and text outside a ``<td>`` is dropped. Each output line
    is terminated with ``self.linebreak``.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.wiki = ''      # The wiki text produced so far
        self.wikirow = ''   # Cells of the table row currently being built
        self.inTD = 0       # 1 while inside <TD>...</TD>, else 0
        self.inTR = 0       # 1 while inside <TR>...</TR>, else 0
        # Collapses every run of whitespace in a cell to a single space.
        self.re_multiplespaces = re.compile(r'\s+')
        self.rowCount = 0   # Number of rows written to self.wiki
        self.rowspan = ''   # rowspan attribute of the current cell ('' if none)
        self.colspan = ''   # colspan attribute of the current cell ('' if none)
        # Terminator appended to every output line.
        # NOTE(review): presumably '<br>' so the result can be shown in a
        # browser -- confirm before changing it to '\n'.
        self.linebreak = '<br>'
        self.data = ''      # Text collected for the current cell
        self.prop = ''      # Wiki attribute prefix for the current cell

    def handle_starttag(self, tag, attrs):
        """Dispatch the opening tags that matter for table conversion."""
        if tag == 'table':
            self.start_table()
        elif tag == 'tr':
            self.start_tr()
        elif tag == 'td':
            self.start_td(attrs)

    def handle_endtag(self, tag):
        """Dispatch the closing tags that matter for table conversion."""
        if tag == 'table':
            self.end_table()
        elif tag == 'tr':
            self.end_tr()
        elif tag == 'td':
            self.end_td()

    def start_table(self):
        """Emit the wiki table header followed by the first row separator."""
        self.wiki += '{| border=1' + self.linebreak
        self.wiki += '|-' + self.linebreak

    def end_table(self):
        """Flush any row still open, then emit the wiki table terminator."""
        # </TABLE> implies </TR>; without this the last row of a table whose
        # closing </tr> was omitted would never reach the output.
        if self.inTR:
            self.end_tr()
        self.wiki += '|}' + self.linebreak

    def start_tr(self):
        """Begin a new row, closing the previous one if it is still open."""
        if self.inTR:
            self.end_tr()  # <TR> implies </TR>
        self.inTR = 1

    def end_tr(self):
        """Finish the current row and append it to the wiki text."""
        if self.inTD:
            self.end_td()  # </TR> implies </TD>
        self.inTR = 0
        # Rows that produced no cells are not written out.
        if len(self.wikirow) > 0:
            self.wiki += self.wikirow
            self.wiki += '|-' + self.linebreak
            self.wikirow = ''
            self.rowCount += 1

    def start_td(self, attrs):
        """Begin a new cell and record its rowspan/colspan attributes.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by the
        parser; ``value`` is ``None`` for an attribute written without one.
        """
        if self.inTD:
            self.end_td()  # <TD> implies </TD>, so the open cell is kept
        if not self.inTR:
            self.start_tr()  # <TD> implies <TR>
        self.data = ''
        self.rowspan = ''
        self.colspan = ''
        for key, value in attrs:
            # Skip valueless attributes (value is None): they carry no span
            # and would break the string concatenation below.
            if not value:
                continue
            if key == 'rowspan':
                self.rowspan = value
            elif key == 'colspan':
                self.colspan = value
        # Build the prefix here rather than when text arrives, so that an
        # empty cell still keeps its span attributes.
        self.prop = self._span_prefix()
        self.inTD = 1

    def _span_prefix(self):
        """Return the wiki attribute prefix for the current cell ('' if none)."""
        prefix = ''
        if self.rowspan != '':
            prefix = ' rowspan = ' + self.rowspan
        if self.colspan != '':
            prefix += ' colspan = ' + self.colspan
        if prefix:
            prefix += ' | '
        return prefix

    def end_td(self):
        """Append the finished cell to the current row."""
        if self.inTD:
            # NOTE(review): doubling '"' looks like a leftover from a CSV
            # exporter; kept so existing output does not change -- confirm.
            text = (self.data.replace('\t', ' ')
                    .replace(self.linebreak, '')
                    .replace('\r', '')
                    .replace('"', '""'))
            text = self.re_multiplespaces.sub(' ', text)
            self.wikirow += '| ' + self.prop + text + self.linebreak
            self.data = ''
            self.inTD = 0

    def handle_data(self, data):
        """Collect text that appears inside a cell; ignore everything else."""
        if self.inTD:
            self.data += data
if __name__ == '__main__':
    # Command-line entry point: convert the HTML file named by the single
    # argument and print the resulting wiki text to standard output.
    if len(sys.argv) == 2:
        # The context manager closes the file even if reading raises.
        with open(sys.argv[1], "r") as in_file:
            text = in_file.read()
        parser = html2wiki()
        parser.feed(text)
        # Single-argument call form prints identically on Python 2 and 3.
        print(parser.wiki)
    else:
        print('Argument - filename required')






Comments
Snippets Manager replied on Sat, 2008/07/26 - 12:13pm