Using HTMLParser to extract links from HTML files

We can use Python's htmllib.HTMLParser to extract each link and its corresponding text from HTML files (note that htmllib is a Python 2 module; it was removed in Python 3):

import urllib
from formatter import *
from htmllib import *

# Formatter object handed to the parser's constructor. It is built on a
# NullWriter, which (per the `formatter` module docs) takes no action on any
# output call -- only the extracted links matter here, not rendered text.
formatter = AbstractFormatter(NullWriter())
class LinkParser(HTMLParser):
    """Print the target and visible text of every ``<a href=...>`` link.

    The parser tracks the anchor currently being parsed:

    * ``current_link`` -- value of the ``href`` attribute of the open ``<a>``
      tag, or ``None`` when no link is open.
    * ``current_text`` -- whitespace-stripped text collected so far inside
      that anchor, or ``None`` when nothing has been collected.

    When the closing ``</a>`` is reached and both values are non-empty, the
    pair is printed as ``"<href> <text>"`` on one line.

    This targets the Python 2 ``htmllib`` API, whose tag hooks receive the
    bound handler ``method`` as an extra argument.
    """

    def __init__(self, *sub, **kw):
        """Forward all arguments to ``HTMLParser`` and reset link state."""
        # Explicit base-class calls are kept as in the original code.
        HTMLParser.__init__(self, *sub, **kw)
        self.current_link = self.current_text = None
        # Raw (unstripped) text chunks seen inside the current anchor.
        self._text_parts = []

    def handle_starttag(self, tag, method, attrs):
        """Remember the ``href`` of an opening ``<a>`` tag, then delegate.

        ``attrs`` is iterated as a sequence of ``(name, value)`` pairs; if
        ``href`` appears more than once the last occurrence wins.
        """
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    self.current_link = value
                    # A new link starts with a clean text buffer so that text
                    # from an unclosed earlier anchor cannot leak into it.
                    self.current_text = None
                    self._text_parts = []
        return HTMLParser.handle_starttag(self, tag, method, attrs)

    def handle_endtag(self, tag, method):
        """Print the finished link on ``</a>``, reset state, then delegate."""
        if tag == 'a':
            if self.current_link and self.current_text:
                # Formatting into a single string gives the same output as
                # ``print link, text`` and parses with or without the
                # print statement.
                print('%s %s' % (self.current_link, self.current_text))
            self.current_link = self.current_text = None
            self._text_parts = []
        return HTMLParser.handle_endtag(self, tag, method)

    def handle_data(self, data):
        """Accumulate text that appears inside an open link, then delegate.

        Anchor text may arrive in several chunks, so the chunks are collected
        and joined rather than letting each chunk overwrite the previous one.
        Only the joined result is stripped, which preserves spaces between
        chunks.
        """
        if self.current_link:
            self._text_parts.append(str(data))
            self.current_text = ''.join(self._text_parts).strip()
        return HTMLParser.handle_data(self, data)

To use the code:

# Download the page first and make sure the HTTP response is closed even if
# reading fails; the original left the response object open.
response = urllib.urlopen('http://www.slashdot.org')
try:
    html = response.read()
finally:
    response.close()

# Feed the whole document to the parser; links are printed as they are found.
p = LinkParser(formatter)
p.feed(html)
p.close()

Data Matters

Search This Blog

Using HTMLParser to extract links from html files

Labels

Comments

Popular posts from this blog

A simple implementation of DTW(Dynamic Time Warping) in C#/python

Install mysql-python with mariadb

PrefixSpan source code in python