DZone Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags / keywords, and share them with the world.

Snippets has posted 5883 posts at DZone. View Full User Profile

Html Table To Wiki Converter

01.26.2007
| 7326 views |
  • submit to reddit
        For more details on how to call this script from PHP when your server doesn't support Python, see http://just-tech.blogspot.com/2007/01/python-html-tables-to-mediawiki.html
import HTMLParser, re, sys
class html2wiki(HTMLParser.HTMLParser):
	"""Convert the HTML tables found in a document into MediaWiki table markup.

	Pass HTML text to feed(); the generated markup accumulates in the
	``wiki`` attribute.  Only <table>, <tr>, <td> and <th> tags are
	interpreted, and only text located inside a cell is kept.  Every
	generated line is terminated with ``linebreak`` (the string '<br>'
	unless the attribute is changed before feeding).
	"""

	def __init__(self):
		HTMLParser.HTMLParser.__init__(self)
		self.wiki = ''	  # The Wiki text
		self.wikirow = ''   # The current Wiki row of table being constructed from HTML
		self.inTD = 0	  # Used to track if we are inside or outside a cell (<TD> or <TH>).
		self.inTR = 0	  # Used to track if we are inside or outside a <TR>...</TR> tag.
		self.re_multiplespaces = re.compile(r'\s+')  # regular expression used to remove spaces in excess
		self.rowCount = 0  # incremented on every end_tr() call.
		self.rowspan = ''   # rowspan attribute value of the current cell ('' when absent)
		self.colspan = ''   # colspan attribute value of the current cell ('' when absent)
		self.linebreak = '<br>'  # terminator appended to every generated line
		self.data = ''	  # text collected so far for the current cell
		self.prop = ''	  # attribute prefix of the current cell, e.g. ' colspan = 2 | '
		self.cellmark = '| '  # markup opening the current cell: '| ' for data, '! ' for header

	def handle_starttag(self, tag, attrs):
		"""Dispatch the opening tags that matter for table conversion."""
		if tag == 'table':
			self.start_table()
		elif tag == 'tr':
			self.start_tr()
		elif tag == 'td':
			self.start_td(attrs)
		elif tag == 'th':
			# Header cells follow the same path as data cells; only the
			# opening marker differs.
			self.start_td(attrs, '! ')

	def handle_endtag(self, tag):
		"""Dispatch the closing tags that matter for table conversion."""
		if tag == 'table':
			self.end_table()
		elif tag == 'tr':
			self.end_tr()
		elif tag == 'td' or tag == 'th':
			self.end_td()

	def start_table(self):
		"""Emit the table opening line followed by the first row separator."""
		self.wiki += '{| border=1' + self.linebreak
		self.wiki += '|-' + self.linebreak

	def end_table(self):
		"""Flush any row still open, then emit the table closing line."""
		if self.inTR:
			self.end_tr()  # </TABLE> implies </TR>
		self.wiki += '|}' + self.linebreak

	def start_tr(self):
		"""Open a new row, closing the previous one if it was left open."""
		if self.inTR:
			self.end_tr()  # <TR> implies </TR>
		self.inTR = 1

	def end_tr(self):
		"""Close the current row and append it, plus a separator, to ``wiki``."""
		if self.inTD:
			self.end_td()  # </TR> implies </TD>
		self.inTR = 0
		if len(self.wikirow) > 0:
			self.wiki += self.wikirow
			self.wiki += '|-' + self.linebreak
			self.wikirow = ''
		self.rowCount += 1

	def start_td(self, attrs, mark='| '):
		"""Open a cell and record its rowspan/colspan attributes.

		attrs -- list of (name, value) pairs as supplied by HTMLParser.
		mark  -- markup that opens the cell in the output ('| ' for a data
		         cell, '! ' for a header cell).
		"""
		if self.inTD:
			self.end_td()  # <TD> implies </TD>
		if not self.inTR:
			self.start_tr()  # <TD> implies <TR>
		self.data = ''
		self.cellmark = mark
		self.rowspan = ''
		self.colspan = ''
		for key, value in attrs:
			if not value:
				# A valueless attribute (<td rowspan>) is reported with a
				# value of None; there is nothing usable to copy.
				continue
			if key == 'rowspan':
				self.rowspan = value
			elif key == 'colspan':
				self.colspan = value
		# Build the attribute prefix here, not in handle_data(), so that a
		# spanning cell without any text still keeps its span attributes.
		self.prop = ''
		if self.rowspan != '':
			self.prop = ' rowspan = ' + self.rowspan
		if self.colspan != '':
			self.prop += ' colspan = ' + self.colspan
		if self.prop:
			self.prop += ' | '
		self.inTD = 1

	def end_td(self):
		"""Close the current cell and append its markup to the current row."""
		if self.inTD:
			# Double quotes are kept as they are: doubling them (CSV-style
			# escaping) would alter the text shown in the wiki table.
			text = self.data.replace('\t', ' ').replace(self.linebreak, '').replace('\r', '')
			self.wikirow += self.cellmark + self.prop + self.re_multiplespaces.sub(' ', text) + self.linebreak
			self.data = ''
			self.inTD = 0

	def handle_data(self, data):
		"""Collect non-blank text found inside a cell; ignore everything else."""
		if self.inTD and data.strip() != '':
			self.data += data

if __name__ == '__main__':
	# Command-line entry point: convert the HTML file named by the single
	# argument and print the resulting wiki markup on standard output.
	if len(sys.argv) == 2:
		parser = html2wiki()
		# 'with' closes the file even when reading or parsing raises.
		with open(sys.argv[1], "r") as in_file:
			parser.feed(in_file.read())
		# A parenthesised single-argument print gives the same output on
		# Python 2 and Python 3.
		print(parser.wiki)
	else:
		# The message stays on standard output, as before; the non-zero exit
		# status additionally lets a calling process detect the usage error.
		print('Argument - filename required')
		sys.exit(1)
    

Comments

Snippets Manager replied on Sat, 2008/07/26 - 12:13pm

Cool script. I made it into a handy online utility here