Skip to main content

Posts

Showing posts from February, 2011

Using HTMLParser to extract links from html files

We can use Python's htmllib.HTMLParser to extract links and their corresponding text from HTML files: import urllib from formatter import * from htmllib import * formatter = AbstractFormatter(NullWriter()) class LinkParser(HTMLParser):     def __init__(self, *sub, **kw):         HTMLParser.__init__(self, *sub, **kw)         self.current_link = self.current_text = None     def handle_starttag(self, tag, method, attrs):         if tag == 'a':             for attr in attrs:                 if attr[0]=='href':                     self.current_link = attr[1]         return HTMLParser.handle_starttag(self, tag, method, attrs)     def handle_endtag(self, tag, method):         if tag == 'a':             if self.current_link and self.current_text:                 print self.current_link, self.current_text             self.current_link = self.current_text = None         return HTMLParser.handle_endtag(self, tag, method)     def handle_data(self, data):