Book a Demo!
CoCalc Logo Icon
Store · Features · Docs · Share · Support · News · About · Policies · Sign Up · Sign In
REMitchell
GitHub Repository: REMitchell/python-crawling
Path: blob/master/4-Crawling-Patterns/crawlWiki_recursive.py
164 views
1
from urllib.request import urlopen
2
from bs4 import BeautifulSoup
3
import re
4
5
pages = set()
6
def getLinks(pageUrl):
    """Recursively crawl internal Wikipedia links starting at pageUrl.

    pageUrl: a site-relative path such as "/wiki/Python" (or "" for the
    main page).  For each page it prints the title, the first body
    paragraph, and the edit-link href, then follows every not-yet-seen
    internal article link.

    Side effects: prints to stdout and adds each newly discovered href
    to the module-level `pages` set.

    NOTE(review): recursion depth equals crawl depth, so a real crawl
    will eventually hit Python's recursion limit — an explicit worklist
    (deque) would be the robust fix, left as-is to preserve behavior.
    """
    global pages
    # Use a context manager so the HTTP response/socket is always closed
    # (the original leaked it).
    with urlopen("http://en.wikipedia.org" + pageUrl) as html:
        bsObj = BeautifulSoup(html, "lxml")
    try:
        print(bsObj.h1.get_text())
        # findAll("p")[0] raises IndexError when the content div has no
        # <p>; the original only caught AttributeError and would crash.
        print(bsObj.find(id="mw-content-text").findAll("p")[0])
        print(bsObj.find(id="ca-edit").find("span").find("a").attrs['href'])
    except (AttributeError, IndexError):
        print("This page is missing something! No worries though!")

    # Internal article links only: paths starting with /wiki/ and
    # containing no ":" (which would indicate a namespace page).
    for link in bsObj.findAll("a", href=re.compile("^(/wiki/)((?!:).)*$")):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                # We have encountered a new page
                newPage = link.attrs['href']
                print("----------------\n"+newPage)
                pages.add(newPage)
                getLinks(newPage)
25
getLinks("")
26