Entry
Extracting fields from HTML table
Jul 5th, 2000 09:58
Nathan Wallace, unknown unknown, Hans Nowak, Snippet 3, Gary Capell
"""
Packages: text.html
"""
"""
>Any tips on how to extract fields from html tables? Rendering the
>table with lynx sort of works, but field delimiters are lost.
This script opens an HTML file and writes a series of numbered files
(0.html, 1.html, ...) to the current directory, one per TD or TH cell:
"""
from sgmllib import SGMLParser
import sys
class tableparser(SGMLParser):
    """Split an HTML document into numbered files, one per table cell.

    Each TD or TH start tag and each TD or TH end tag closes the current
    output file.  The next piece of content then opens a fresh file named
    "<n>.html" ("0.html", "1.html", ...) in the current working directory,
    where n counts the files opened so far.

    Tags without a dedicated handler (B, A, FONT, ...) are re-serialised
    and written into whichever file is current, so cell markup is kept.
    TABLE and TR tags are consumed silently.

    Content that appears outside any cell (for example whitespace between
    one cell's end tag and the next cell's start tag) is also written to a
    numbered file of its own, so file numbers are not strictly cell indexes.

    Call close() after the last feed() so the final file is closed.
    """

    def __init__(self, verbose=0):
        """Initialise the parser; `verbose` is passed through to SGMLParser."""
        SGMLParser.__init__(self, verbose)
        self.cell = 0    # number of output files opened so far
        self.fp = None   # currently open output file, or None between cells

    def _close_cell(self):
        """Close the current output file, if any, and forget it.

        The next call to handle_data() will then start a new file.
        """
        if self.fp:
            self.fp.close()
        self.fp = None

    def close(self):
        """Flush the parser's buffered input, then close the last output file."""
        try:
            SGMLParser.close(self)
        finally:
            # Close the file even if flushing the remaining input fails.
            self._close_cell()

    def handle_data(self, text):
        """Write `text` to the current file, opening the next one if needed."""
        if not self.fp:
            # str() instead of the backtick repr: same result for an int,
            # and valid syntax on every Python version.
            name = str(self.cell) + '.html'
            self.cell = self.cell + 1
            self.fp = open(name, 'w')
        self.fp.write(text)

    def unknown_starttag(self, tag, attrs):
        """Re-serialise a start tag that has no handler and emit it as data.

        `attrs` is the list of (name, value) pairs supplied by SGMLParser.
        The output format (a space after the tag name and after every
        attribute) is kept identical to the original script.
        """
        pieces = ['<' + tag + ' ']
        for key, val in attrs:
            pieces.append(key + '="' + val + '" ')
        pieces.append('>')
        self.handle_data(''.join(pieces))

    def unknown_endtag(self, tag):
        """Re-serialise an end tag that has no handler and emit it as data."""
        self.handle_data('</' + tag + '>')

    # Cell boundaries: both the start and the end of a cell finish the
    # current file, so the cell's content lands in a file of its own.

    def start_td(self, attrs):
        """Begin a data cell: finish whatever file was being written."""
        self._close_cell()

    def end_td(self):
        """End a data cell: finish the cell's file."""
        self._close_cell()

    def start_th(self, attrs):
        """Begin a header cell; treated exactly like a data cell."""
        self._close_cell()

    def end_th(self):
        """End a header cell; treated exactly like a data cell."""
        self._close_cell()

    # TABLE and TR are handled (as no-ops) so that they are not passed to
    # unknown_starttag()/unknown_endtag() and echoed into the output files.

    def start_table(self, attrs):
        """Ignore the TABLE start tag."""
        pass

    def end_table(self):
        """Ignore the TABLE end tag."""
        pass

    def start_tr(self, attrs):
        """Ignore the TR start tag."""
        pass

    def end_tr(self):
        """Ignore the TR end tag."""
        pass
def main(filename=None):
    """Parse an HTML file and write its table cells to numbered files.

    filename -- path of the HTML file to read.  When omitted, the first
                command-line argument is used; if there is none, the
                original placeholder name "xxx" is used.

    The output files are created by tableparser in the current working
    directory.
    """
    if filename is None:
        if len(sys.argv) > 1:
            filename = sys.argv[1]
        else:
            filename = "xxx"  # insert filename here

    # Read the whole document up front and release the input file at once.
    f = open(filename)
    try:
        data = f.read()
    finally:
        f.close()

    parser = tableparser()
    parser.feed(data)
    # close() makes the parser process any input it is still buffering;
    # without it the tail of the document can be silently dropped.
    parser.close()
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()