Data Matters

Posts

Showing posts from February, 2011

Using HTMLParser to extract links from HTML files

We can use Python 2's htmllib.HTMLParser to extract each link and its corresponding text from HTML files: import urllib from formatter import * from htmllib import * formatter = AbstractFormatter(NullWriter()) class LinkParser(HTMLParser): def __init__(self, *sub, **kw): HTMLParser.__init__(self, *sub, **kw) self.current_link = self.current_text = None def handle_starttag(self, tag, method, attrs): if tag == 'a': for attr in attrs: if attr[0]=='href': self.current_link = attr[1] return HTMLParser.handle_starttag(self, tag, method, attrs) def handle_endtag(self, tag, method): if tag == 'a': if self.current_link and self.current_text: print self.current_link, self.current_text self.current_link = self.current_text = None return HTMLParser.handle_endtag(self, tag, method) def handle_data(self, data):