Entry
Extracting fields from HTML table
Jul 5th, 2000 09:58
Nathan Wallace, unknown unknown, Hans Nowak, Snippet 4, Nigel O'Brian
"""
Packages: text.html
"""
"""
>> Any tips on how to extract fields from html tables? Rendering the
>> table with lynx sort of works, but field delimiters are lost.
Try this - recursively turn the HTML into a list of lists
according to the nesting of specified tags.
"""
import re
def tagPattern(str):
    """Build compiled regexes for the begin/end forms of an HTML tag.

    'str' is the tag name (e.g. 'td').  It is interpolated into the
    pattern unescaped, so a regex fragment such as 't[dh]' or 'td|th'
    may also be passed to match several tags at once.

    Returns a pair (begin, end) of case-insensitive compiled patterns:
    'begin' matches '<tag ...>' including any attributes, 'end' matches
    '</tag>'.
    """
    # The lookahead forces the tag name to end at whitespace, '/' or '>',
    # so that 'th' does not match '<thead>' and 'tr' does not match
    # '<track>'.  The non-capturing group keeps alternations intact.
    begin = re.compile(r'<(?:%s)(?=[\s/>])[^>]*>' % str, re.IGNORECASE)
    # Whitespace is allowed between the tag name and the closing '>'.
    end = re.compile(r'</(?:%s)\s*>' % str, re.IGNORECASE)
    return begin, end
# Ready-made (begin, end) pattern pairs for the table-related tags.
table = tagPattern('table')
tr = tagPattern('tr')
td = tagPattern('td')
th = tagPattern('th')
def getStrings(str, tags):
    """Return the substrings of 'str' enclosed by a begin/end tag pair.

    'tags' is a pair (begin, end) of compiled re objects.  The text is
    scanned left to right; each begin-tag match is paired with the first
    end-tag match that follows it, and the text between them is
    collected.  Scanning stops at the first begin tag that has no
    following end tag.

    Returns a list of strings (empty if no complete pair is found).
    Nested occurrences of the same tag are not balanced: the first end
    tag after a begin tag closes it.
    """
    bTag, eTag = tags
    found = []
    pos = 0  # scan position; avoids re-slicing the remaining text
    while True:
        bT = bTag.search(str, pos)
        if bT is None:
            return found
        # Look for the end tag only after the begin tag, so a stray end
        # tag earlier in the text cannot produce a bogus empty entry.
        eT = eTag.search(str, bT.end())
        if eT is None:
            return found
        found.append(str[bT.end():eT.start()])
        pos = eT.end()
def findTables(str, Levels):
    """Recursively split HTML text into nested lists, one level per tag.

    'str' is the HTML text.  'Levels' is a sequence of tag-pattern pairs,
    outermost tag first.  With no levels left the text itself is
    returned; otherwise the result is a list with one entry per piece
    that getStrings() extracts for the first level, each entry being the
    result of processing that piece with the remaining levels.
    """
    if Levels:
        outer, inner = Levels[0], Levels[1:]
        return [findTables(piece, inner) for piece in getStrings(str, outer)]
    return str
if __name__ == '__main__':  # do an example
    # Small sample document: one table with a header row and two data rows.
    tableExample = """
<TABLE border=2 cellpadding=4 cellspacing=1 width="80%">
<CAPTION align=top><STRONG>Tutorial Timetable</STRONG></CAPTION>
<TR Align=center bgcolor="#f8a0d0">
<TH>Day</TH>
<TH>Time</TH>
<TH>Room</TH>
<TH>Tutor</TH>
</TR>
<TR Align=left bgcolor="#ffc0e0">
<TD>Wed</TD>
<TD>11:00</TD>
<TD>351</TD>
<TD>Sue</TD>
</TR>
<TR Align=left bgcolor="#ffc0e0">
<TD>Fri</TD>
<TD>12:00</TD>
<TD>250</TD>
<TD>Fred</TD>
</TR>
</TABLE>
<P>
"""
    # Single-argument print(...) produces the same output on Python 2
    # and Python 3, unlike the statement form.
    print(findTables(tableExample, (table, th)))  # headers
    print(findTables(tableExample, (table, tr, td)))  # table body