GitHub Repository: REMitchell/python-crawling
Path: blob/master/4-Crawling-Patterns/crawlSite_recursive.py
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup

pages = set()


def formatUrl(url, root):
    # Resolve a root-relative or relative link against the site root
    if url.startswith("/"):
        return root + url
    if url.startswith("http"):
        return url
    return root + "/" + url


# Retrieves a list of all internal links found on a page
def getInternalLinks(bsObj, root):
    internalLinks = []
    parsed_uri = urlparse(root)
    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
    for link in bsObj.findAll("a"):
        if link.has_attr('href'):
            url = link.attrs['href']
            # Check if URL is internal: it either starts with the site's
            # domain or is a relative link (does not start with "http")
            if url is not None and "#" not in url and (url.startswith(domain) or not url.startswith("http")):
                url = formatUrl(url, root)
                if url not in internalLinks:
                    internalLinks.append(url)
    return internalLinks


def getLinks(pageUrl, root):
    global pages
    html = urlopen(pageUrl)
    bsObj = BeautifulSoup(html, "lxml")
    internalLinks = getInternalLinks(bsObj, root)
    print(internalLinks)
    for link in internalLinks:
        if link not in pages:
            # We have encountered a new page
            print("----------------\n" + link)
            pages.add(link)
            getLinks(link, root)


getLinks("http://pythonscraping.com", "http://pythonscraping.com")
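
# Note: the recursion above has no depth limit and no error handling, so a
# link that fails to load raises and stops the crawl, and a very deep site
# could exceed Python's default recursion limit (roughly 1000 frames).
# Below is a minimal defensive sketch of the same pattern; crawlSafely and
# maxDepth are hypothetical names, not part of the original example.
from urllib.error import HTTPError, URLError

def crawlSafely(pageUrl, root, depth=0, maxDepth=10):
    # Stop descending once the depth cap is reached
    if depth > maxDepth:
        return
    try:
        html = urlopen(pageUrl)
    except (HTTPError, URLError):
        return  # skip links that fail to load instead of aborting the crawl
    bsObj = BeautifulSoup(html, "lxml")
    for link in getInternalLinks(bsObj, root):
        if link not in pages:
            pages.add(link)
            crawlSafely(link, root, depth + 1, maxDepth)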