faqts : Computers : Programming : Languages : Python : Snippets : Web Programming / Manipulating HTML files

+ Search
Add Entry Alert Manage Folder Edit Entry Add page to http://del.icio.us/
Did You Find This Entry Useful?

8 of 8 people (100%) answered Yes
Recently 3 of 3 people (100%) answered Yes

Entry

Extracting fields from HTML table

Jul 5th, 2000 09:58
Nathan Wallace, unknown unknown, Hans Nowak, Snippet 4, Nigel O'Brian


"""
Packages: text.html
"""

"""
>> Any tips on how to extract fields from html tables? Rendering the
>> table with lynx sort of works, but field delimiters are lost.

Try this - recursively turn the HTML into a list of lists
according to the nesting of specified tags. 
"""

import re

def tagPattern(str):
    """Build the opening/closing regex pair for one HTML tag name.

    'str' is interpolated into the patterns as-is (it is not escaped), so
    it may itself be a regex fragment such as 't[dh]'.

    Returns a tuple (begin, end) of case-insensitive compiled patterns:
    'begin' matches the opening tag including any attributes, 'end'
    matches the closing tag.
    """
    # The lookahead forces the tag name to end right here, so that 'th'
    # does not also match <thead> and 'tr' does not match <track>.
    begin = re.compile(r'<(?:%s)(?=[\s/>])[^>]*>' % str, re.IGNORECASE)
    # Whitespace is permitted between the tag name and '>' in an end tag.
    end = re.compile(r'</(?:%s)\s*>' % str, re.IGNORECASE)
    return begin, end

# Precompiled (begin, end) pattern pairs for the table-related tags.
table = tagPattern('table')
tr = tagPattern('tr')
td = tagPattern('td')
th = tagPattern('th')

def getStrings(str, tags):
    """Return the text found between each begin/end tag pair in 'str'.

    str  -- the text to scan.
    tags -- a (begin, end) pair of compiled regular expression objects.

    Returns a list of the substrings lying between each begin match and
    the first end match that follows it, in document order. Scanning
    stops at the first begin tag that has no later end tag. Nested
    occurrences of the same tag are not balanced: the first end tag after
    a begin tag always closes it.
    """
    begin_tag, end_tag = tags
    found = []
    pos = 0
    while True:
        begin = begin_tag.search(str, pos)
        if begin is None:
            return found
        # Look for the end tag only after the begin tag; a stray end tag
        # earlier in the text must not produce a bogus empty entry.
        end = end_tag.search(str, begin.end())
        if end is None:
            return found
        found.append(str[begin.end():end.start()])
        # Resume after this end tag instead of re-slicing the text.
        pos = end.end()

def findTables(str, Levels):
    """Recursively split HTML text into nested lists, one level per tag.

    str    -- the HTML text to split.
    Levels -- sequence of (begin, end) compiled pattern pairs, outermost
              tag first.

    Returns 'str' unchanged when Levels is empty. Otherwise returns a
    list with one entry per region that getStrings finds for Levels[0],
    each entry being the result of applying the remaining levels to that
    region.
    """
    if not Levels:
        return str
    remaining = Levels[1:]
    return [findTables(inner, remaining)
            for inner in getStrings(str, Levels[0])]

if __name__ == '__main__': # do an example
    # Sample table: one header row (TH cells) followed by two data rows.
    tableExample = """
<TABLE border=2 cellpadding=4 cellspacing=1 width="80%">
<CAPTION align=top><STRONG>Tutorial Timetable</STRONG></CAPTION>
<TR Align=center bgcolor="#f8a0d0">
  <TH>Day</TH>
  <TH>Time</TH>
  <TH>Room</TH>
  <TH>Tutor</TH>
</TR>
<TR Align=left bgcolor="#ffc0e0">
    <TD>Wed</TD>
    <TD>11:00</TD>
    <TD>351</TD>
    <TD>Sue</TD>
  </TR>
<TR Align=left bgcolor="#ffc0e0">
    <TD>Fri</TD>
    <TD>12:00</TD>
    <TD>250</TD>
    <TD>Fred</TD>
  </TR>
</TABLE>
<P>
"""
    # print(...) with one parenthesised argument behaves the same on
    # Python 2 and Python 3, unlike the bare print statement.
    # Headers -> [['Day', 'Time', 'Room', 'Tutor']]
    print(findTables(tableExample, (table, th))) # headers
    # Body -> one list per TR; the header row has no TD cells, so it
    # comes out as an empty list ahead of the two data rows.
    print(findTables(tableExample, (table, tr, td))) # table body