class Tag:
    """A single parsed tag; tags are linked into a tree via
    parent / first_child / next_sibling references (0 means 'none')."""
    def __init__(self):
        self.name = ''
        self.text = ''
        self.first_child = 0
        self.parent = 0
        self.next_sibling = 0
        self.closed = 0
        self.depth = 0

    def get_tag_info_str(self):
        c, p, s = 'none', 'none', 'none'
        if self.first_child != 0:
            c = self.first_child.name
        if self.parent != 0:
            p = self.parent.name
        if self.next_sibling != 0:
            s = self.next_sibling.name
        return ("name = {}, text = {}\n"
                "Parent = {}, First Child = {}, Next Sibling = {}\n"
                "Closed = {}, Depth = {}\n").format(self.name, self.text, p, c, s, self.closed, self.depth)
from HTMLParser import HTMLParser
from htmlentitydefs import name2codepoint

class MyHTMLParser(HTMLParser):
    """HTML parser that records every tag it sees in tag_list and links
    the tags together into a tree of Tag objects."""
    def __init__(self):
        HTMLParser.__init__(self)
        self.tag_list = []
        self.depth = 0
        self.previous_tag = 'none'
        self.mode = 'silent'   # set to anything else to echo parse events

    def handle_starttag(self, tag, attrs):
        if self.mode != 'silent':
            print "Start tag:", tag
            for attr in attrs:
                print "     attr:", attr
        self.depth = self.depth + 1
        t = Tag()
        t.name = tag
        t.depth = self.depth
        if self.previous_tag == 'start':
            # current tag is the first child of the last tag
            t.parent = self.tag_list[-1]
            self.tag_list[-1].first_child = t
        elif self.previous_tag == 'end':
            # current tag is the next sibling of the last tag at this depth
            for x in reversed(self.tag_list):
                if x.depth == self.depth:
                    x.next_sibling = t
                    if t.parent == 0:
                        t.parent = x.parent
                    break
        elif self.previous_tag == 'startend':
            # current tag is the next sibling of the previous (self-closing) tag
            t.parent = self.tag_list[-1].parent
            self.tag_list[-1].next_sibling = t
        self.tag_list.append(t)
        self.previous_tag = 'start'
    def handle_endtag(self, tag):
        if self.mode != 'silent':
            print "End tag  :", tag
        # mark the most recent unclosed tag with this name as closed
        for x in reversed(self.tag_list):
            if x.name == tag and x.closed == 0:
                x.closed = 1
                break
        self.depth = self.depth - 1
        self.previous_tag = 'end'

    def handle_startendtag(self, tag, attrs):
        if self.mode != 'silent':
            print "Start/End tag:", tag
            for attr in attrs:
                print "     attr:", attr
        t = Tag()
        self.depth = self.depth + 1
        t.name = tag
        t.depth = self.depth
        t.closed = 1   # self-closing tags are closed as soon as they are seen
        if self.previous_tag == 'start':
            # current tag is the first child of the last tag
            t.parent = self.tag_list[-1]
            self.tag_list[-1].first_child = t
        elif self.previous_tag == 'startend':
            # current tag is the next sibling of the last tag
            t.parent = self.tag_list[-1].parent
            self.tag_list[-1].next_sibling = t
        elif self.previous_tag == 'end':
            # current tag is the next sibling of a previous tag at this depth
            for x in reversed(self.tag_list):
                if x.depth == self.depth:
                    x.next_sibling = t
                    if t.parent == 0:
                        t.parent = x.parent
                    break
        self.tag_list.append(t)
        self.depth = self.depth - 1
        self.previous_tag = 'startend'

    def handle_data(self, data):
        if self.mode != 'silent':
            print "Data     :", data
        self.depth = self.depth + 1
        # append the data to the last tag whose depth is one less than the
        # current depth, i.e. the tag that contains this text
        for x in reversed(self.tag_list):
            if x.depth == self.depth - 1:
                x.text = (x.text + ' ' + data.strip(' \n\t')).strip(' \n\t')
                break
        self.depth = self.depth - 1
    def handle_comment(self, data):
        if self.mode != 'silent':
            print "Comment  :", data

    def handle_entityref(self, name):
        if self.mode != 'silent':
            c = unichr(name2codepoint[name])
            print "Named ent:", c

    def handle_charref(self, name):
        if self.mode != 'silent':
            if name.startswith(('x', 'X')):
                c = unichr(int(name[1:], 16))
            else:
                c = unichr(int(name))
            print "Num ent  :", c

    def handle_decl(self, data):
        if self.mode != 'silent':
            print "Decl     :", data

    def print_tag_list(self):
        for l in self.tag_list:
            print l.get_tag_info_str()

    def clear_tag_list(self):
        del self.tag_list[:]

    def pretty_print_tags(self):
        # print each tag indented according to its depth in the tree
        for t in self.tag_list:
            print self.get_indent_str(t.depth - 1) + self.get_tag_str(t.name)

    def get_indent_str(self, n):
        return '    ' * n

    def get_tag_str(self, name):
        return '<{}>'.format(name)

    def find_first_tag(self, name):
        # return the first tag with the given name, or 0 if none was found
        r = 0
        for t in self.tag_list:
            if t.name == name:
                r = t
                break
        return r

    def print_first_tag_info(self, name):
        t = self.find_first_tag(name)
        if t == 0:
            print "Tag: {} not found".format(name)
        else:
            print t.get_tag_info_str()
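# --- Example (illustrative only): parsing a small, made-up snippet ---
# This is just a sketch of how MyHTMLParser is used by the crawler below;
# the html string and tag names are invented for demonstration.
example_parser = MyHTMLParser()
example_parser.feed('<html><body><h1>Hello <b>world</b></h1><p>bye</p></body></html>')
example_parser.pretty_print_tags()
example_parser.print_first_tag_info('h1')
example_parser.clear_tag_list()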
import urllib
import socket
socket.setdefaulttimeout(10)
import httplib

class WebCrawler:
    """A simple web crawler"""

    class PageInfo:
        """Per-page record: whether the page has been scraped yet and a
        dictionary of word counts taken from its <h1> tags."""
        def __init__(self):
            self.has_been_scraped = 0
            self.word_dict = {}

    def __init__(self, re_compiled_obj):
        self.link_dict = {}                      # every link seen so far: url -> PageInfo
        self.initial_depth = 0
        self.parser = MyHTMLParser()
        self.re_compiled_obj = re_compiled_obj   # only urls matching this regex are kept
    def get_page(self, url):
        """ loads a webpage into a string """
        page = ''
        try:
            f = urllib.urlopen(url=url)
            page = f.read()
            f.close()
        except IOError:
            print "Error opening {}".format(url)
        except httplib.InvalidURL, e:
            print "{} caused an Invalid URL error.".format(url)
            if hasattr(e, 'reason'):
                print 'We failed to reach a server.'
                print 'Reason: ', e.reason
            elif hasattr(e, 'code'):
                print 'The server couldn\'t fulfill the request.'
                print 'Error code: ', e.code
        return page

    def check_filters(self, url):
        """ returns a match object if the url matches the crawler's regex,
        otherwise returns None so the url is skipped """
        match = self.re_compiled_obj.search(url)
        return match

    def find_h1_tag(self, s, pos):
        """ finds the first <h1> tag at or after pos; returns (start, end)
        indices, with -1 for whichever was not found """
        start = s.find('<h1>', pos)
        end = s.find('</h1>', start)
        return start, end
    def save_tag_text(self, tag, d):
        """ counts each word of the tag's text in dictionary d """
        if tag != 0:
            token_list = tag.text.split(' ')
            for token in token_list:
                if token == '':
                    continue   # skip empty strings produced by splitting
                if token in d:
                    d[token] = d[token] + 1
                else:
                    d[token] = 1
        return d

    def save_page_text(self, page_str):
        """ Save all important text on the page (the text of its <h1> tags) """
        offset = 0
        d = {}
        while offset != -1:
            start, end = self.find_h1_tag(page_str, offset)
            offset = end
            if start != -1 and end != -1:
                # include the closing '</h1>' so the parser sees a complete tag
                h1_tag = page_str[start:end + 5]
                self.parser.clear_tag_list()
                # only feed this fragment of the page into the parser
                self.parser.feed(h1_tag)
                tag = self.parser.find_first_tag('h1')
                # add the words from the tag to the dictionary
                d = self.save_tag_text(tag, d)
        return d
    def save_all_links_on_page(self, page_str, limit=60):
        """ Stores all links found on the current page in a dictionary """
        d = {}
        offset = 0
        i = 0
        num_pages_filtered = 0
        num_duplicate_pages = 0
        while offset != -1:
            if i == limit:
                break
            offset = page_str.find('<a href="http', offset)
            if offset != -1:
                # the link is whatever sits between the quotes after href=
                start = page_str.find('"', offset)
                end = page_str.find('"', start + 1)
                link = page_str[start + 1:end]
                # don't just save every link:
                # only keep the ones that match the crawler's regex
                if self.check_filters(link):
                    if link not in self.link_dict:
                        # add the link to the global dictionary
                        self.link_dict[link] = self.PageInfo()
                        # and to the local dictionary that gets returned
                        d[link] = self.PageInfo()
                    else:
                        num_duplicate_pages = num_duplicate_pages + 1
                else:
                    num_pages_filtered = num_pages_filtered + 1
                offset = offset + 1
                i = i + 1
        print "{} out of {} links were filtered".format(num_pages_filtered, i)
        print "{} out of {} links were duplicates".format(num_duplicate_pages, i)
        return d
    def save_all_links_recursive(self, links, depth):
        """ Recursive function that
        1) downloads each page (link) into a string
        2) stores all links found on that page in a dictionary
        3) recurses into those links until depth reaches 0 """
        d = {}
        print "We are {} levels deep".format(self.initial_depth - depth)
        if depth != 0:
            depth = depth - 1
            urls = links.viewkeys()
            for url in urls:
                print "trying to get {} over the internet".format(url)
                page_str = self.get_page(url)
                print "done getting {} over the internet".format(url)
                self.link_dict[url].word_dict = self.save_page_text(page_str)
                d = self.save_all_links_on_page(page_str)
                self.link_dict[url].has_been_scraped = 1
                # d contains all the links found on the current page
                self.save_all_links_recursive(d, depth)

    def start_crawling(self, seed_pages, depth):
        """ User calls this function to start crawling the web """
        d = {}
        self.link_dict.clear()
        # initialize the global dictionary with the seed page urls passed in
        for page in seed_pages:
            self.link_dict[page] = self.PageInfo()
            d[page] = self.PageInfo()
        self.initial_depth = depth
        # start the recursive crawl; a copy (d) is passed in rather than
        # self.link_dict itself, because link_dict grows during the crawl and
        # iterating over it directly would raise a RuntimeError
        self.save_all_links_recursive(d, depth)
    def print_all_page_text(self):
        """ prints the contents of every page's word dictionary """
        for url, page_info in self.link_dict.items():
            print 'url = {}, has_been_scraped = {}'.format(url, page_info.has_been_scraped)
            for word, count in page_info.word_dict.items():
                print '{} was found {} times'.format(word, count)
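# --- Example (illustrative only): <h1> word counting without the network ---
# save_page_text() only looks at <h1>...</h1> sections of a page; the page
# string and regex below are made up just to exercise it locally.
import re
h1_demo_crawler = WebCrawler(re.compile('cnn[.]com'))
print h1_demo_crawler.save_page_text('<h1>haiti pm resigns</h1><p>skipped</p><h1>pm news</h1>')
# expected counts (dict order may vary): haiti 1, pm 2, resigns 1, news 1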
import re
# (?<=[.]cnn)[.]com matches '.com', but only if, looking backwards from where
# '.com' was found, the four preceding characters are '.cnn' (a look-behind
# assertion), so only cnn.com urls pass the filter
cnn_url_regex = re.compile('(?<=[.]cnn)[.]com')
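# --- Example (illustrative only): what the look-behind keeps ---
# The urls and page fragment below are made up; search() only returns a match
# when '.com' is immediately preceded by '.cnn', so only the cnn links survive
# save_all_links_on_page().
print cnn_url_regex.search('http://www.cnn.com/index.html') is not None      # True
print cnn_url_regex.search('http://www.example.com/index.html') is not None  # False
link_demo_crawler = WebCrawler(cnn_url_regex)
link_demo_page = ('<a href="http://www.cnn.com/a.html">a</a>'
                  '<a href="http://www.cnn.com/a.html">a again</a>'
                  '<a href="http://www.example.com/b.html">b</a>')
print link_demo_crawler.save_all_links_on_page(link_demo_page).keys()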
w = WebCrawler(cnn_url_regex)
w.start_crawling(['http://www.cnn.com/2012/02/24/world/americas/haiti-pm-resigns/index.html?hpt=hp_t3'],1)