faqts : Computers : Programming : Languages : Python : Snippets : Web Programming / Manipulating HTML files

+ Search
Add Entry Alert Manage Folder Edit Entry Add page to http://del.icio.us/
Did You Find This Entry Useful?

11 of 11 people (100%) answered Yes
Recently 5 of 5 people (100%) answered Yes

Entry

Extracting fields from HTML table

Jul 5th, 2000 09:58
Nathan Wallace, unknown unknown, Hans Nowak, Snippet 3, Gary Capell


"""
Packages: text.html
"""

"""
>Any tips on how to extract fields from html tables? Rendering the
>table with lynx sort of works, but field delimiters are lost.

This script opens an HTML file and writes one output file per TD or TH
cell, named 0.html, 1.html, ... in the current directory:
"""

from sgmllib import SGMLParser
import sys

class tableparser(SGMLParser):
	"""SGML parser that splits an HTML document into one file per table cell.

	Every <td> or <th> start tag, and every matching end tag, closes the
	current output file; the next piece of data then opens a new file
	named "<n>.html" (0.html, 1.html, ...) in the current directory.
	<table> and <tr> tags are swallowed; any other tag is written back
	out as markup into the current output file.

	Note: any data seen while no file is open (including text or
	whitespace between cells or outside the table) also opens a new
	numbered file.

	Call close() once all input has been fed so that buffered input is
	processed and the last output file is closed.
	"""

	def __init__(self, verbose=0):
		"""Initialise the parser; verbose is passed on to SGMLParser."""
		SGMLParser.__init__(self, verbose)
		self.cell = 0	# number of output files opened so far
		self.fp = None	# currently open output file, or None between cells

	def close(self):
		"""Process any buffered input, then close the last output file."""
		# Finishing the parse may still deliver data to handle_data, so
		# the output file is closed only afterwards.
		SGMLParser.close(self)
		self._close_cell()

	def _close_cell(self):
		"""Close the current output file, if any; the next data opens a new one."""
		if self.fp:
			self.fp.close()
		self.fp = None

	def handle_data(self, text):
		"""Write text to the current output file, opening the next one on demand."""
		if not self.fp:
			name = '%d.html' % self.cell
			self.cell = self.cell + 1
			self.fp = open(name, 'w')
		self.fp.write(text)

	def unknown_starttag(self, tag, attrs):
		"""Write a start tag that has no dedicated handler back out as markup.

		attrs is a sequence of (name, value) pairs. The emitted form is
		'<tag name="value" >' (with a space before the closing bracket).
		"""
		parts = ['<' + tag + ' ']
		for key, val in attrs:
			parts.append(key + '="' + val + '" ')
		parts.append('>')
		self.handle_data(''.join(parts))

	def unknown_endtag(self, tag):
		"""Write an end tag that has no dedicated handler back out as markup."""
		self.handle_data('</' + tag + '>')

	# Cell boundaries: both the start and the end of a cell finish the
	# current output file.

	def start_td(self, attrs):
		"""Begin a data cell: finish the current output file."""
		self._close_cell()

	def end_td(self):
		"""End a data cell: finish the current output file."""
		self._close_cell()

	def start_th(self, attrs):
		"""Begin a header cell: finish the current output file."""
		self._close_cell()

	def end_th(self):
		"""End a header cell: finish the current output file."""
		self._close_cell()

	# The handlers below are deliberately empty: defining them keeps
	# <table> and <tr> tags from reaching unknown_starttag/unknown_endtag,
	# so they are not copied into the output files.

	def start_table(self, attrs):
		pass

	def end_table(self):
		pass

	def start_tr(self, attrs):
		pass

	def end_tr(self):
		pass

def main(filename=None):
	"""Parse an HTML file and write its table cells to numbered files.

	filename -- path of the HTML file to read. When omitted, the first
	command-line argument is used; if there is none, the placeholder
	name "xxx" from the original snippet is used.
	"""
	if filename is None:
		if len(sys.argv) > 1:
			filename = sys.argv[1]
		else:
			filename = "xxx"   # insert filename here
	# try/finally keeps this runnable on interpreters that predate the
	# 'with' statement while still guaranteeing the input file is closed.
	f = open(filename)
	try:
		data = f.read()
	finally:
		f.close()
	parser = tableparser()
	parser.feed(data)
	# Force processing of any input the parser is still buffering.
	parser.close()

# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
	main()