DZone Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags / keywords, and share them with the world.

Snippets has posted 5883 posts at DZone. View Full User Profile

Python - Very Simple Parser

01.09.2007
| 6614 views |
  • submit to reddit
# Very Simple Parser

from sgmllib import SGMLParser

import urllib

class ParserHTML(SGMLParser):
	"""SGML/HTML parser that re-emits the markup it sees, one token per line.

	Start tags, text data and end tags are each written on their own line
	to the file opened by scrivi().  scrivi() must be called before any
	markup is fed to the parser, because the handlers write to self.f.
	"""

	def scrivi(self, path='/tmp/fileOUT.html'):
		"""Open the output file for writing ("scrivi" is Italian for "write").

		path -- destination file; defaults to the original hard-coded
		        location so existing callers keep working.

		A file opened by an earlier call is closed first so that the
		handle is not leaked.
		"""
		previous = getattr(self, 'f', None)
		if previous is not None:
			previous.close()
		self.f = open(path, 'w')

	def close(self):
		"""Let the base parser flush buffered markup, then close the output file."""
		try:
			SGMLParser.close(self)
		finally:
			# scrivi() may never have been called, in which case there
			# is nothing to close.
			out = getattr(self, 'f', None)
			if out is not None:
				out.close()

	def unknown_starttag(self, tag, attrs):
		"""Write a start tag with all attribute values double-quoted.

		tag   -- tag name as supplied by the parser.
		attrs -- sequence of (name, value) pairs.

		An attribute whose value is the same word as its name apart from
		letter case is treated as the continuation of the previous
		attribute's value (e.g. the stray TIMES in face=Times TIMES), and
		is appended to that value after a space.  When there is no
		previous attribute to extend, it is written as an ordinary
		name="value" pair instead of being spliced into the tag name.
		"""
		# Collect [name, value] pairs first so that a continuation word can
		# be appended to the previous value without any string surgery on
		# the partially built tag.
		merged = []
		for name, value in attrs:
			continuation = name.lower() == value.lower() and not name == value
			if continuation and merged:
				merged[-1][1] += ' ' + str(value)
			else:
				merged.append([str(name), str(value)])

		pieces = ['<' + tag]
		for name, value in merged:
			pieces.append(' ' + name + '="' + value + '"')
		pieces.append('>')
		self.f.write(''.join(pieces) + "\n")

	def handle_data(self, data):
		"""Write a run of text data followed by a newline."""
		self.f.write(data + "\n")

	def unknown_endtag(self, tag):
		"""Write the closing tag for `tag` followed by a newline."""
		self.f.write('</' + tag + '>' + "\n")

if __name__ == '__main__':

	# Re-emit /tmp/fileIN.html into the file opened by scrivi(), one
	# token per line.
	p = ParserHTML()
	p.scrivi()
	try:
		# Close the input file as soon as its contents have been read.
		with open('/tmp/fileIN.html', 'r') as source:
			p.feed(source.read())
		# feed() may hold back trailing markup; close() forces the parser
		# to process everything that is still buffered.
		p.close()
	finally:
		# Make sure the output reaches disk even if parsing fails.
		# Closing an already closed file is harmless.
		p.f.close()