Path: blob/master/4-Crawling-Patterns/crawlExternal.py
164 views
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random

pages = set()
# random.seed() only accepts None, int, float, str, bytes or bytearray;
# passing a datetime object raises TypeError on Python 3.11+, so seed with
# the current time expressed as a float timestamp instead.
random.seed(datetime.datetime.now().timestamp())


def getInternalLinks(bsObj, includeUrl):
    """Retrieve a list of all internal links found on a page.

    Args:
        bsObj: Parsed page (BeautifulSoup object) to search for <a> tags.
        includeUrl: Any URL on the site; only its scheme and network
            location are used to build the site root ("scheme://netloc").

    Returns:
        A list of unique absolute URLs, in document order. Hrefs that start
        with a single "/" are prefixed with the site root; hrefs that
        already contain the site root are returned unchanged.
    """
    parsed = urlparse(includeUrl)
    includeUrl = parsed.scheme + "://" + parsed.netloc
    # Match site-relative paths ("/x") or anything containing the site root.
    # "(?!/)" keeps protocol-relative URLs ("//host/x") from being mistaken
    # for site-relative paths, and re.escape() stops "." in the host name
    # from acting as a regex wildcard.
    pattern = re.compile("^(/(?!/)|.*" + re.escape(includeUrl) + ")")

    internalLinks = []
    seen = set()
    for link in bsObj.find_all("a", href=pattern):
        href = link.attrs.get("href")
        if href is None:
            continue
        absolute = includeUrl + href if href.startswith("/") else href
        # De-duplicate on the URL that is actually stored, so a relative
        # link repeated on the page is only returned once.
        if absolute not in seen:
            seen.add(absolute)
            internalLinks.append(absolute)
    return internalLinks


def getExternalLinks(bsObj, excludeUrl):
    """Retrieve a list of all external links found on a page.

    Args:
        bsObj: Parsed page (BeautifulSoup object) to search for <a> tags.
        excludeUrl: Text (typically the current site's domain) that must
            not appear anywhere in a link for it to count as external.

    Returns:
        A list of unique hrefs, in document order, that start with "http"
        or "www" and do not contain ``excludeUrl``.
    """
    # excludeUrl is matched literally; without re.escape() the dots in a
    # domain name would match any character.
    pattern = re.compile(
        "^(http|www)((?!" + re.escape(excludeUrl) + ").)*$")

    externalLinks = []
    seen = set()
    for link in bsObj.find_all("a", href=pattern):
        href = link.attrs.get("href")
        if href is not None and href not in seen:
            seen.add(href)
            externalLinks.append(href)
    return externalLinks


def getRandomExternalLink(startingPage):
    """Fetch a page and return one randomly chosen external link from it.

    If the page has no external links, a random internal link is followed
    (recursively) until a page with an external link is found.

    Args:
        startingPage: Absolute URL of the page to download.

    Returns:
        The href of a randomly selected external link.

    Raises:
        ValueError: If the page has neither external nor internal links,
            so there is nowhere left to look.
    """
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(startingPage) as html:
        bsObj = BeautifulSoup(html, "lxml")

    parsed = urlparse(startingPage)
    # Pass in the current page object, along with the domain name to exclude
    externalLinks = getExternalLinks(bsObj, parsed.netloc)
    if externalLinks:
        return random.choice(externalLinks)

    print("No external links, looking around the site for one")
    domain = parsed.scheme + "://" + parsed.netloc
    internalLinks = getInternalLinks(bsObj, domain)
    if not internalLinks:
        # Previously this surfaced as an opaque "empty range" ValueError
        # from random.randint(0, -1); keep the type, explain the cause.
        raise ValueError(
            "No internal or external links found on " + startingPage)
    return getRandomExternalLink(random.choice(internalLinks))


def followExternalOnly(startingSite):
    """Hop from site to site forever, following random external links.

    Prints each external link as it is chosen. This function does not
    return; it runs until interrupted or until a fetch/parse error
    propagates from getRandomExternalLink().

    Args:
        startingSite: Absolute URL of the first page to visit.
    """
    currentSite = startingSite
    # Iterate instead of recursing: the crawl is intentionally endless, and
    # unbounded recursion would inevitably end in RecursionError.
    while True:
        currentSite = getRandomExternalLink(currentSite)
        print("Random external link is: " + currentSite)


# Only start crawling when run as a script, so importing this module to
# reuse the helpers does not trigger network access.
if __name__ == "__main__":
    followExternalOnly("http://oreilly.com")