Question
Python web crawler:
import urllib2
from bs4 import BeautifulSoup
import time

def crawl(seeds):
    frontier = [seeds]
    visited_urls = set()
    for crawl_url in frontier:
        print "Crawling:", crawl_url
        visited_urls.add(crawl_url)
        try:
            resp = urllib2.urlopen(crawl_url)
        except:
            print "Could not access ", crawl_url
            continue
        content_type = resp.info().get('Content-Type')
        if not content_type.startswith('text/html'):
            print "Skipping %s content" % content_type
            continue
        contents = resp.read()
        soup = BeautifulSoup(contents)
        discovered_urls = set()
        links = soup('a')  # Get all anchor tags
        for link in links:
            if ('href' in dict(link.attrs)):
                url = urllib2.urlparse.urljoin(crawl_url, link['href'])
                if (url[0:4] == 'http' and url not in visited_urls
                        and url not in discovered_urls and url not in frontier):
                    discovered_urls.add(url)
        frontier += discovered_urls
        time.sleep(2)
Assignment:
•Add an optional parameter limit, with a default of 10, to the crawl() function; it is the maximum number of web pages to download.
•Save files to the pages dir using the MD5 hash of the page's URL:
import hashlib
filename = 'pages/' + hashlib.md5(url.encode()).hexdigest() + '.html'
•Only crawl URLs that are in the landmark.edu domain (*.landmark.edu)
–Use a regular expression when examining discovered links (see the short sketch just below), for example:
import re
p = re.compile('ab*')
if p.match('abc'):
    print("yes")
Code the solution in Python.
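To make those two requirements concrete before looking at a full solution, here is a minimal, self-contained sketch of just the MD5 filename and the domain check. The page_url value and the exact regular expression are illustrative assumptions, not something the assignment specifies:

import hashlib
import re

# Hypothetical URL used only to demonstrate the two requirements
page_url = 'http://www.landmark.edu/academics/index.html'

# MD5-based filename: the hex digest of the URL becomes the file name
filename = 'pages/' + hashlib.md5(page_url.encode()).hexdigest() + '.html'
print(filename)        # pages/<32 hex characters>.html

# Domain check: accept landmark.edu itself and any *.landmark.edu subdomain
domain_pattern = re.compile(r'https?://([\w-]+\.)*landmark\.edu(/|$)')
print(bool(domain_pattern.match(page_url)))   # True

Because re.match only matches at the start of the string, a URL that merely contains landmark.edu somewhere in its path or query string is not accepted.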
Primary data structures (a short illustrative sketch follows this list):
•Frontier
–Links that have not yet been visited
–Implement as a list to simulate a queue
•Visited
–Links that have been visited
–Implement as a set to quickly check for inclusion
•Discovered
–Links that have been discovered
–Implement as a set to quickly check for inclusion
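As a small illustration of why these containers are chosen, the following standalone sketch (with made-up URLs) shows the frontier behaving as a queue while the two sets give quick duplicate checks:

frontier = ['http://www.landmark.edu/']   # list: keeps links in the order they were found
visited_urls = set()                      # set: fast "already crawled?" membership tests
discovered_urls = set()                   # set: de-duplicates links found on the current page

# Visit the first frontier entry and pretend two links were found on that page
crawl_url = frontier[0]
visited_urls.add(crawl_url)
for link in ['http://www.landmark.edu/about', 'http://www.landmark.edu/']:
    if link not in visited_urls and link not in discovered_urls and link not in frontier:
        discovered_urls.add(link)

frontier += discovered_urls               # new links join the back of the queue

print(frontier)   # ['http://www.landmark.edu/', 'http://www.landmark.edu/about']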
Explanation / Answer
import urllib2
from bs4 import BeautifulSoup
import hashlib
import re
import time

def crawl(seeds, limit=10):
    frontier = [seeds]
    visited_urls = set()
    counter = 0  # number of pages downloaded so far
    for crawl_url in frontier:
        if counter >= limit:
            return  # stop once the download limit is reached
        print "Crawling:", crawl_url
        visited_urls.add(crawl_url)
        try:
            resp = urllib2.urlopen(crawl_url)
        except:
            print "Could not access", crawl_url
            continue
        content_type = resp.info().get('Content-Type')
        if not content_type.startswith('text/html'):
            print "Skipping %s content" % content_type
            continue
        contents = resp.read()
        # Save the page under the MD5 hash of its URL
        filename = 'pages/' + hashlib.md5(crawl_url).hexdigest() + '.html'
        with open(filename, 'w') as f:
            f.write(contents)
        counter += 1
        soup = BeautifulSoup(contents)
        discovered_urls = set()
        links = soup('a')  # Get all anchor tags
        for link in links:
            if ('href' in dict(link.attrs)):
                url = urllib2.urlparse.urljoin(crawl_url, link['href'])
                if (url[0:4] == 'http' and url not in visited_urls
                        and url not in discovered_urls and url not in frontier):
                    # Only keep links in the landmark.edu domain
                    if re.match(r'https?://([\w-]+\.)*landmark\.edu(/|$)', url):
                        discovered_urls.add(url)
        frontier += discovered_urls
        time.sleep(2)
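One usage note on the sketch above: it assumes the pages directory already exists, so a run (with an illustrative seed URL) might look like this:

import os

if not os.path.isdir('pages'):
    os.mkdir('pages')   # the crawler writes into pages/, so create it up front

crawl('http://www.landmark.edu/', limit=10)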