Entry
Extracting tagged text from an HTML file
Jul 5th, 2000 09:58
Nathan Wallace, unknown unknown, Hans Nowak, Snippet 6, Fredrik Lundh
"""
Packages: text.html
"""
Subject: Re: extracting tagged text from an HTML file
"""
>Is there an easy way to extract some pieces of raw tagged text when
>parsing an HTML file?
>
>I'm used to HTMLParser (and its colleagues), and I know how to extract
>parsed (thus untagged) text using the parser class. I tried to extract
>raw text by subclassing HTMLParser, but got bogged down. It seems
>impossible for me to extract raw text by subclassing HTMLParser.
Use sgmllib, not htmllib. See the attached example and the
library reference for details.
"""
#
# a simple sgml/html dump utility.
import sys, urllib
import sgmllib
class myParser(sgmllib.SGMLParser):
    """Dump an SGML/HTML document to stdout, one tag per line.

    Start tags and end tags are each written on a line of their own;
    data sections are written verbatim between them.
    """

    def __init__(self):
        # initialize base class
        sgmllib.SGMLParser.__init__(self)
        # non-zero while data has been written since the last forced
        # line break, i.e. the current output line may be unterminated
        self.flag = 0

    def newline(self):
        """Force a line break if data was written on the current line."""
        if self.flag:
            sys.stdout.write("\n")
        self.flag = 0

    def unknown_starttag(self, tag, attrs):
        """Write a start tag and its attributes on a line of its own.

        attrs is a list of (attr, value) tuples; each pair is rendered
        as ' attr=' followed by repr(value).
        """
        # build the attribute string in one pass instead of growing it
        # piece by piece inside a loop
        text = "".join(
            [" " + attr + "=" + repr(value) for attr, value in attrs]
        )
        self.newline()
        sys.stdout.write("<" + tag + text + ">\n")

    def handle_data(self, text):
        """Write a data section verbatim and mark the line as open."""
        sys.stdout.write(text)
        self.flag = 1

    def unknown_endtag(self, tag):
        """Write an end tag, as '</tag>', on a line of its own."""
        # terminate any pending data line first: this keeps the tag from
        # being glued to the preceding text and resets the flag, so the
        # next start tag does not emit a spurious blank line
        self.newline()
        # the leading slash distinguishes end tags from start tags
        sys.stdout.write("</" + tag + ">\n")
# Dump every document named on the command line.
for url in sys.argv[1:]:
    # Try the argument as a local file first; if that fails with
    # IOError, fetch it through urllib instead.
    try:
        source = open(url)
    except IOError:
        source = urllib.urlopen(url)
    # Close the handle even if reading or parsing raises, so a bad
    # document does not leak an open file or connection.
    try:
        parser = myParser()
        parser.feed(source.read())
        parser.close()
    finally:
        source.close()