We can use Python's htmllib.HTMLParser to extract links and their corresponding text from HTML files:
import urllib
from formatter import *
from htmllib import *
# Formatter object that is passed to LinkParser when the parser is created.
# NOTE(review): NullWriter should make the formatter discard all rendered
# output -- confirm against the formatter module documentation.
formatter = AbstractFormatter(NullWriter())
class LinkParser(HTMLParser):
    """Print the target and visible text of each hyperlink that is parsed.

    When the closing tag of an ``<a href=...>`` element is reached and the
    element had non-blank text, one line ``<href> <text>`` is written to
    standard output.  Every event is still forwarded to ``HTMLParser``.

    Attributes:
        current_link: ``href`` value of the anchor being read, or ``None``.
        current_text: Raw text gathered so far for that anchor, or ``None``.
    """

    # NOTE(review): base-class methods are called explicitly rather than via
    # super(); htmllib parsers appear to be old-style classes in Python 2 --
    # confirm before changing this.

    def __init__(self, *sub, **kw):
        """Forward all arguments to ``HTMLParser`` and clear the link state."""
        HTMLParser.__init__(self, *sub, **kw)
        self.current_link = self.current_text = None

    def handle_starttag(self, tag, method, attrs):
        """Record the ``href`` of an opening ``a`` tag, then delegate.

        Args:
            tag: Name of the tag being opened.
            method: Tag handler, passed through unchanged to the base class.
            attrs: Iterable of ``(name, value)`` attribute pairs.

        Returns:
            Whatever ``HTMLParser.handle_starttag`` returns.
        """
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    self.current_link = value
                    # A new link starts with no text, so text left over from
                    # an anchor that was never closed is not attributed to it.
                    self.current_text = None
        return HTMLParser.handle_starttag(self, tag, method, attrs)

    def handle_endtag(self, tag, method):
        """Report the finished link on a closing ``a`` tag, then delegate.

        Args:
            tag: Name of the tag being closed.
            method: Tag handler, passed through unchanged to the base class.

        Returns:
            Whatever ``HTMLParser.handle_endtag`` returns.
        """
        if tag == 'a':
            # Strip once, after all chunks have been joined, so whitespace
            # between chunks is kept and a trailing blank chunk is harmless.
            text = (self.current_text or '').strip()
            if self.current_link and text:
                # A single parenthesised argument prints the same line under
                # the Python 2 print statement as `print link, text` did.
                print('%s %s' % (self.current_link, text))
            self.current_link = self.current_text = None
        return HTMLParser.handle_endtag(self, tag, method)

    def handle_data(self, data):
        """Append a chunk of character data to the current link's text.

        Link text can arrive in several chunks (for example around inline
        tags or entity references), so chunks are accumulated instead of
        each one replacing the previous.

        Args:
            data: Text chunk delivered by the parser.

        Returns:
            Whatever ``HTMLParser.handle_data`` returns.
        """
        if self.current_link:
            self.current_text = (self.current_text or '') + data
        return HTMLParser.handle_data(self, data)
To use the code:
# Create the parser, feed it the downloaded page, then close the parser.
p = LinkParser(formatter)
page = urllib.urlopen('http://www.slashdot.org')
try:
    p.feed(page.read())
finally:
    # Close the HTTP response even if reading or parsing fails.
    page.close()
p.close()
import urllib
from formatter import *
from htmllib import *
# Formatter object that is passed to LinkParser when the parser is created.
# NOTE(review): NullWriter should make the formatter discard all rendered
# output -- confirm against the formatter module documentation.
formatter = AbstractFormatter(NullWriter())
class LinkParser(HTMLParser):
    """Print the target and visible text of each hyperlink that is parsed.

    When the closing tag of an ``<a href=...>`` element is reached and the
    element had non-blank text, one line ``<href> <text>`` is written to
    standard output.  Every event is still forwarded to ``HTMLParser``.

    Attributes:
        current_link: ``href`` value of the anchor being read, or ``None``.
        current_text: Raw text gathered so far for that anchor, or ``None``.
    """

    # NOTE(review): base-class methods are called explicitly rather than via
    # super(); htmllib parsers appear to be old-style classes in Python 2 --
    # confirm before changing this.

    def __init__(self, *sub, **kw):
        """Forward all arguments to ``HTMLParser`` and clear the link state."""
        HTMLParser.__init__(self, *sub, **kw)
        self.current_link = self.current_text = None

    def handle_starttag(self, tag, method, attrs):
        """Record the ``href`` of an opening ``a`` tag, then delegate.

        Args:
            tag: Name of the tag being opened.
            method: Tag handler, passed through unchanged to the base class.
            attrs: Iterable of ``(name, value)`` attribute pairs.

        Returns:
            Whatever ``HTMLParser.handle_starttag`` returns.
        """
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    self.current_link = value
                    # A new link starts with no text, so text left over from
                    # an anchor that was never closed is not attributed to it.
                    self.current_text = None
        return HTMLParser.handle_starttag(self, tag, method, attrs)

    def handle_endtag(self, tag, method):
        """Report the finished link on a closing ``a`` tag, then delegate.

        Args:
            tag: Name of the tag being closed.
            method: Tag handler, passed through unchanged to the base class.

        Returns:
            Whatever ``HTMLParser.handle_endtag`` returns.
        """
        if tag == 'a':
            # Strip once, after all chunks have been joined, so whitespace
            # between chunks is kept and a trailing blank chunk is harmless.
            text = (self.current_text or '').strip()
            if self.current_link and text:
                # A single parenthesised argument prints the same line under
                # the Python 2 print statement as `print link, text` did.
                print('%s %s' % (self.current_link, text))
            self.current_link = self.current_text = None
        return HTMLParser.handle_endtag(self, tag, method)

    def handle_data(self, data):
        """Append a chunk of character data to the current link's text.

        Link text can arrive in several chunks (for example around inline
        tags or entity references), so chunks are accumulated instead of
        each one replacing the previous.

        Args:
            data: Text chunk delivered by the parser.

        Returns:
            Whatever ``HTMLParser.handle_data`` returns.
        """
        if self.current_link:
            self.current_text = (self.current_text or '') + data
        return HTMLParser.handle_data(self, data)
To use the code:
# Create the parser, feed it the downloaded page, then close the parser.
p = LinkParser(formatter)
page = urllib.urlopen('http://www.slashdot.org')
try:
    p.feed(page.read())
finally:
    # Close the HTTP response even if reading or parsing fails.
    page.close()
p.close()
Comments