

Question

Python web crawler:

import urllib2
from bs4 import BeautifulSoup
import time

def crawl(seeds):
    frontier = [seeds]
    visited_urls = set()

    for crawl_url in frontier:
        print "Crawling:", crawl_url
        visited_urls.add(crawl_url)

        try:
            resp = urllib2.urlopen(crawl_url)
        except:
            print "Could not access ", crawl_url
            continue
        content_type = resp.info().get('Content-Type')
        if not content_type.startswith('text/html'):
            print "Skipping %s content" % content_type
            continue
        contents = resp.read()
        soup = BeautifulSoup(contents)
        discovered_urls = set()
        links = soup('a')    # Get all anchor tags
        for link in links:
            if ('href' in dict(link.attrs)):
                url = urllib2.urlparse.urljoin(crawl_url, link['href'])
                if (url[0:4] == 'http' and url not in visited_urls
                    and url not in discovered_urls and url not in frontier):
                    discovered_urls.add(url)
        frontier += discovered_urls
        time.sleep(2)

Assignment:

•Add an optional parameter limit, with a default of 10, to the crawl() function; it is the maximum number of web pages to download.

•Save each downloaded page to the pages directory, using the MD5 hash of the page's URL as the file name.

import hashlib
filename = 'pages/' + hashlib.md5(url.encode()).hexdigest() + '.html'
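
Inside the crawl loop this might look roughly as follows (a sketch using the crawl_url and contents variables from the code above; binary write mode and an already-existing pages/ directory are assumptions):

import hashlib

# Name the file after the MD5 hash of the URL that was fetched
filename = 'pages/' + hashlib.md5(crawl_url).hexdigest() + '.html'
with open(filename, 'wb') as f:   # 'wb' so the bytes are written exactly as downloaded
    f.write(contents)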

•Only crawl URLs that are in the landmark.edu domain (*.landmark.edu).

–Use a regular expression when examining discovered links.

import re
p = re.compile('ab*')
if p.match('abc'):
    print("yes")

Coded in Python.

Primary data structures:

•Frontier

–Links that have not yet been visited

–Implement as a list to simulate a queue

•Visited

–Links that have been visited

–Implement as a set to quickly check for inclusion

•Discovered

–Links that have been discovered

–Implement as a set to quickly check for inclusion
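
One way to picture how the three structures cooperate (a sketch only, with a hypothetical seed URL; the code above instead iterates over the frontier list while appending to it, which has the same effect):

frontier = ['http://www.landmark.edu/']   # queue: URLs waiting to be visited
visited = set()                           # set: URLs already fetched
while frontier:
    url = frontier.pop(0)                 # dequeue the oldest URL
    if url in visited:
        continue
    visited.add(url)
    discovered = set()                    # set: new links found on this page
    # ... fetch url, parse its links, add acceptable ones to discovered ...
    frontier.extend(discovered)           # enqueue the newly discovered links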

Explanation / Answer

import urllib2
import hashlib
import re
import time
from bs4 import BeautifulSoup

def crawl(seeds, limit=10):
    frontier = [seeds]
    visited_urls = set()
    counter = 0                          # number of pages saved so far
    for crawl_url in frontier:
        if counter >= limit:             # stop once the download limit is reached
            return
        print "Crawling:", crawl_url
        visited_urls.add(crawl_url)

        try:
            resp = urllib2.urlopen(crawl_url)
        except:
            print "Could not access ", crawl_url
            continue
        content_type = resp.info().get('Content-Type')
        if not content_type.startswith('text/html'):
            print "Skipping %s content" % content_type
            continue
        contents = resp.read()

        # Save the page under the MD5 hash of its URL (the pages/ directory must exist)
        filename = 'pages/' + hashlib.md5(crawl_url).hexdigest() + '.html'
        with open(filename, 'wb') as f:
            f.write(contents)
        counter += 1

        soup = BeautifulSoup(contents, 'html.parser')
        discovered_urls = set()
        links = soup('a')    # Get all anchor tags
        for link in links:
            if 'href' in dict(link.attrs):
                url = urllib2.urlparse.urljoin(crawl_url, link['href'])
                if (url[0:4] == 'http' and url not in visited_urls
                        and url not in discovered_urls and url not in frontier):
                    # Only follow links inside the *.landmark.edu domain
                    if re.match(r'https?://([\w-]+\.)*landmark\.edu(/|$)', url):
                        discovered_urls.add(url)
        frontier += discovered_urls
        time.sleep(2)
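
A minimal way to run the crawler (a sketch; the seed URL is only an example, and the pages/ directory has to exist before any file can be saved there):

import os

if not os.path.isdir('pages'):
    os.mkdir('pages')                        # create the output directory if it is missing

crawl('http://www.landmark.edu/', limit=10)  # example seed URL and default limit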