Path: blob/master/4-Crawling-Patterns/crawlSite_recursive.py
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re

pages = set()

def formatUrl(url, root):
    if(url.startswith("/")):
        return root+url
    if(url.startswith("http")):
        return url
    return root+"/"+url

#Retrieves a list of all internal links found on a page
def getInternalLinks(bsObj, root):
    internalLinks = []
    parsed_uri = urlparse(root)
    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
    for link in bsObj.findAll("a"):
        if link.has_attr('href'):
            url = link.attrs['href']
            #Check if URL is internal
            if url is not None and "#" not in url and (url.startswith(domain) or not url.startswith("http")):
                url = formatUrl(url, root)
                if url not in internalLinks:
                    internalLinks.append(url)
    return internalLinks

def getLinks(pageUrl, root):
    global pages
    html = urlopen(pageUrl)
    bsObj = BeautifulSoup(html, "lxml")
    internalLinks = getInternalLinks(bsObj, root)
    print(internalLinks)
    for link in internalLinks:
        if link not in pages:
            #We have encountered a new page
            print("----------------\n"+link)
            pages.add(link)
            getLinks(link, root)

getLinks("http://pythonscraping.com", "http://pythonscraping.com")
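
# --- Optional hardening sketch (not part of the original script). ---
# getLinksSafe is a hypothetical variant of getLinks above: it assumes the
# pages set, getInternalLinks(), urlopen, and BeautifulSoup already defined
# or imported earlier in this file, and simply wraps the fetch in a
# try/except so one unreachable or 404 link does not abort the whole
# recursive crawl.
from urllib.error import HTTPError, URLError

def getLinksSafe(pageUrl, root):
    global pages
    try:
        html = urlopen(pageUrl)
    except (HTTPError, URLError) as e:
        #Skip pages that cannot be fetched instead of crashing the crawl
        print("Skipping " + pageUrl + ": " + str(e))
        return
    bsObj = BeautifulSoup(html, "lxml")
    for link in getInternalLinks(bsObj, root):
        if link not in pages:
            pages.add(link)
            getLinksSafe(link, root)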