We can use Python's htmllib.HTMLParser (Python 2 only; htmllib was removed in Python 3) to extract links and their corresponding text from HTML files: import urllib from formatter import * from htmllib import * formatter = AbstractFormatter(NullWriter()) class LinkParser(HTMLParser): def __init__(self, *sub, **kw): HTMLParser.__init__(self, *sub, **kw) self.current_link = self.current_text = None def handle_starttag(self, tag, method, attrs): if tag == 'a': for attr in attrs: if attr[0]=='href': self.current_link = attr[1] return HTMLParser.handle_starttag(self, tag, method, attrs) def handle_endtag(self, tag, method): if tag ...
The blog talks about data mining, algorithms, NoSQL, Python ...