GitHub Repository: REMitchell/python-crawling
Path: blob/master/4-Crawling-Patterns/crawlSite_recursive.py
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup

pages = set()


def formatUrl(url, root):
    # Resolve a root-relative or relative link against the site root
    if url.startswith("/"):
        return root + url
    if url.startswith("http"):
        return url
    return root + "/" + url


# Retrieves a list of all internal links found on a page
def getInternalLinks(bsObj, root):
    internalLinks = []
    parsed_uri = urlparse(root)
    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
    for link in bsObj.findAll("a"):
        if link.has_attr('href'):
            url = link.attrs['href']
            # Check if URL is internal: it either starts with the site's
            # domain or is a relative link (does not start with "http")
            if url is not None and "#" not in url and (url.startswith(domain) or not url.startswith("http")):
                url = formatUrl(url, root)
                if url not in internalLinks:
                    internalLinks.append(url)
    return internalLinks


def getLinks(pageUrl, root):
    global pages
    html = urlopen(pageUrl)
    bsObj = BeautifulSoup(html, "lxml")
    internalLinks = getInternalLinks(bsObj, root)
    print(internalLinks)
    for link in internalLinks:
        if link not in pages:
            # We have encountered a new page
            print("----------------\n" + link)
            pages.add(link)
            getLinks(link, root)


getLinks("http://pythonscraping.com", "http://pythonscraping.com")
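
# Note: the recursion above has no depth limit and no error handling, so a
# link that fails to load raises and stops the crawl, and a very deep site
# could exceed Python's default recursion limit (roughly 1000 frames).
# Below is a minimal defensive sketch of the same pattern; crawlSafely and
# maxDepth are hypothetical names, not part of the original example.
from urllib.error import HTTPError, URLError

def crawlSafely(pageUrl, root, depth=0, maxDepth=10):
    # Stop descending once the depth cap is reached
    if depth > maxDepth:
        return
    try:
        html = urlopen(pageUrl)
    except (HTTPError, URLError):
        return  # skip links that fail to load instead of aborting the crawl
    bsObj = BeautifulSoup(html, "lxml")
    for link in getInternalLinks(bsObj, root):
        if link not in pages:
            pages.add(link)
            crawlSafely(link, root, depth + 1, maxDepth)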