Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
REMitchell
GitHub Repository: REMitchell/python-crawling
Path: blob/master/4-Crawling-Patterns/crawlExternal.py
164 views
1
from urllib.request import urlopen
2
from urllib.parse import urlparse
3
from bs4 import BeautifulSoup
4
import re
5
import datetime
6
import random
7
8
# Module-level set of pages; nothing in the code visible here reads or
# writes it.
pages = set()
# Seed the RNG from the current time.  random.seed() only accepts None, int,
# float, str, bytes or bytearray (a datetime object raises TypeError on
# Python 3.11+), so pass the float timestamp instead of the datetime itself.
random.seed(datetime.datetime.now().timestamp())
10
11
def getInternalLinks(bsObj, includeUrl):
    """Return the internal links found on a parsed page, without duplicates.

    A link counts as internal when its href is root-relative (starts with a
    single "/") or contains the site's "scheme://netloc" prefix.  Root-relative
    hrefs are returned as absolute URLs on that site.

    Args:
        bsObj: Parsed page exposing BeautifulSoup's ``find_all``.
        includeUrl: Any URL on the site; only its scheme and netloc are used.

    Returns:
        A list of URL strings in document order, each appearing once.
    """
    parsed = urlparse(includeUrl)
    siteRoot = parsed.scheme + "://" + parsed.netloc
    # re.escape makes the site root match literally ("." is not a wildcard).
    # "/(?!/)" keeps protocol-relative hrefs ("//other.host/...") out of the
    # root-relative case, since they point at a different host.
    internalPattern = re.compile(
        "^(/(?!/)|.*" + re.escape(siteRoot) + ")")
    internalLinks = []
    for link in bsObj.find_all("a", href=internalPattern):
        href = link.attrs["href"]
        if href.startswith("/") and not href.startswith("//"):
            url = siteRoot + href
        else:
            url = href
        # De-duplicate on the normalized URL that is actually stored, so a
        # repeated "/path" href is only returned once.
        if url not in internalLinks:
            internalLinks.append(url)
    return internalLinks
24
25
def getExternalLinks(bsObj, excludeUrl):
    """Return the external links found on a parsed page, without duplicates.

    A link counts as external when its href starts with "http" or "www" and
    does not contain ``excludeUrl`` anywhere after that prefix.

    Args:
        bsObj: Parsed page exposing BeautifulSoup's ``find_all``.
        excludeUrl: Literal text (typically the current site's netloc) that an
            external href must not contain.

    Returns:
        A list of href strings in document order, each appearing once.
    """
    # After the "http"/"www" prefix, no position may begin the excluded text.
    # re.escape makes excludeUrl match literally, so "example.com" does not
    # also exclude hosts such as "exampleXcom".
    externalPattern = re.compile(
        "^(http|www)((?!" + re.escape(excludeUrl) + ").)*$")
    hrefs = (link.attrs["href"]
             for link in bsObj.find_all("a", href=externalPattern))
    # dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(hrefs))
36
37
def getRandomExternalLink(startingPage):
    """Return a randomly chosen external link reachable from ``startingPage``.

    Fetches and parses the page.  If it has external links, one is returned
    at random.  Otherwise a random internal link is followed and the search
    repeats from that page.

    Args:
        startingPage: Absolute URL of the page to fetch.

    Returns:
        The href string of one external link.

    Raises:
        ValueError: If the page has neither external nor internal links, so
            there is nowhere left to look.
        Errors raised by ``urlopen`` are not caught and propagate.
    """
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(startingPage) as html:
        bsObj = BeautifulSoup(html, "lxml")
    page = urlparse(startingPage)
    # Pass in the current page object, along with the domain name to exclude
    externalLinks = getExternalLinks(bsObj, page.netloc)
    if externalLinks:
        return random.choice(externalLinks)

    print("No external links, looking around the site for one")
    domain = page.scheme + "://" + page.netloc
    internalLinks = getInternalLinks(bsObj, domain)
    if not internalLinks:
        # Previously this surfaced as an opaque ValueError from
        # random.randint(0, -1); keep the exception type, say what happened.
        raise ValueError(
            "No external or internal links found on " + startingPage)
    return getRandomExternalLink(random.choice(internalLinks))
49
50
def followExternalOnly(startingSite):
    """Hop from site to site indefinitely, printing each external link chosen.

    Starting at ``startingSite``, repeatedly asks getRandomExternalLink for a
    link from the current site, prints it, and continues from that link.

    Args:
        startingSite: URL of the first site to crawl.

    This function never returns normally; it ends only when an exception
    raised by getRandomExternalLink propagates.
    """
    # Iterate instead of recursing: each hop reuses the same stack frame, so a
    # long crawl cannot end in RecursionError.
    currentSite = startingSite
    while True:
        currentSite = getRandomExternalLink(currentSite)
        print("Random external link is: " + currentSite)
54
55
# Start the crawl only when this file is executed as a script, so importing
# the module for its helper functions does not kick off an endless crawl.
if __name__ == "__main__":
    followExternalOnly("http://oreilly.com")
56
57