

Question

Python web crawler:

import urllib2
from bs4 import BeautifulSoup
import time

def crawl(seeds):
    frontier = [seeds]
    visited_urls = set()

    for crawl_url in frontier:
        print "Crawling:", crawl_url
        visited_urls.add(crawl_url)

        try:
            resp = urllib2.urlopen(crawl_url)
        except:
            print "Could not access ", crawl_url
            continue
        content_type = resp.info().get('Content-Type')
        if not content_type.startswith('text/html'):
            print "Skipping %s content" % content_type
            continue
        contents = resp.read()
        soup = BeautifulSoup(contents)
        discovered_urls = set()
        links = soup('a')    # Get all anchor tags
        for link in links:
            if ('href' in dict(link.attrs)):
                url = urllib2.urlparse.urljoin(crawl_url, link['href'])
                if (url[0:4] == 'http' and url not in visited_urls
                    and url not in discovered_urls and url not in frontier):
                    discovered_urls.add(url)
        frontier += discovered_urls
        time.sleep(2)

Assignment:

•Add an optional parameter limit, with a default of 10, to the crawl() function; it is the maximum number of web pages to download.

•Save each downloaded page to the pages directory, using the MD5 hash of the page's URL as the file name.

import hashlib
filename = 'pages/' + hashlib.md5(url.encode()).hexdigest() + '.html'
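
Inside the crawl loop this might look roughly as follows (a sketch using the crawl_url and contents variables from the code above; binary write mode and an already-existing pages/ directory are assumptions):

import hashlib

# Name the file after the MD5 hash of the URL that was fetched
filename = 'pages/' + hashlib.md5(crawl_url).hexdigest() + '.html'
with open(filename, 'wb') as f:   # 'wb' so the bytes are written exactly as downloaded
    f.write(contents)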

•Only crawl URLs that are in the landmark.edu domain (*.landmark.edu).

–Use a regular expression when examining discovered links.

import re
p = re.compile('ab*')
if p.match('abc'):
    print("yes")

Coded in Python.

Primary data structures:

•Frontier

–Links that have not yet been visited

–Implement as a list to simulate a queue

•Visited

–Links that have been visited

–Implement as a set to quickly check for inclusion

•Discovered

–Links that have been discovered

–Implement as a set to quickly check for inclusion
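
One way to picture how the three structures cooperate (a sketch only, with a hypothetical seed URL; the code above instead iterates over the frontier list while appending to it, which has the same effect):

frontier = ['http://www.landmark.edu/']   # queue: URLs waiting to be visited
visited = set()                           # set: URLs already fetched
while frontier:
    url = frontier.pop(0)                 # dequeue the oldest URL
    if url in visited:
        continue
    visited.add(url)
    discovered = set()                    # set: new links found on this page
    # ... fetch url, parse its links, add acceptable ones to discovered ...
    frontier.extend(discovered)           # enqueue the newly discovered links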

Explanation / Answer

import urllib2
import hashlib
import re
import time
from bs4 import BeautifulSoup

def crawl(seeds, limit=10):
    frontier = [seeds]
    visited_urls = set()
    counter = 0                          # number of pages saved so far
    for crawl_url in frontier:
        if counter >= limit:             # stop once the download limit is reached
            return
        print "Crawling:", crawl_url
        visited_urls.add(crawl_url)

        try:
            resp = urllib2.urlopen(crawl_url)
        except:
            print "Could not access ", crawl_url
            continue
        content_type = resp.info().get('Content-Type')
        if not content_type.startswith('text/html'):
            print "Skipping %s content" % content_type
            continue
        contents = resp.read()

        # Save the page under the MD5 hash of its URL (the pages/ directory must exist)
        filename = 'pages/' + hashlib.md5(crawl_url).hexdigest() + '.html'
        with open(filename, 'wb') as f:
            f.write(contents)
        counter += 1

        soup = BeautifulSoup(contents, 'html.parser')
        discovered_urls = set()
        links = soup('a')    # Get all anchor tags
        for link in links:
            if 'href' in dict(link.attrs):
                url = urllib2.urlparse.urljoin(crawl_url, link['href'])
                if (url[0:4] == 'http' and url not in visited_urls
                        and url not in discovered_urls and url not in frontier):
                    # Only follow links inside the *.landmark.edu domain
                    if re.match(r'https?://([\w-]+\.)*landmark\.edu(/|$)', url):
                        discovered_urls.add(url)
        frontier += discovered_urls
        time.sleep(2)
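
A minimal way to run the crawler (a sketch; the seed URL is only an example, and the pages/ directory has to exist before any file can be saved there):

import os

if not os.path.isdir('pages'):
    os.mkdir('pages')                        # create the output directory if it is missing

crawl('http://www.landmark.edu/', limit=10)  # example seed URL and default limit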