Book a Demo!
CoCalc Logo Icon
Store · Features · Docs · Share · Support · News · About · Policies · Sign Up · Sign In
REMitchell
GitHub Repository: REMitchell/python-crawling
Path: blob/master/4-Crawling-Patterns/crawlWiki_randomWalk.py
164 views
1
from urllib.request import urlopen
2
from bs4 import BeautifulSoup
3
import re
4
import random
5
import time
6
7
# Hrefs (e.g. "/wiki/Kevin_Bacon") already visited during this walk.
pages = set()


def getLinks(pageUrl):
    """Perform an endless random walk over English Wikipedia.

    Starting at http://en.wikipedia.org + pageUrl, print each page's title,
    first paragraph, and edit link, then follow a randomly chosen internal
    article link that has not been visited yet, sleeping one second between
    requests. Returns only when a page has no unvisited article links.

    Fixes over the original:
    - iterates instead of recursing, so a long walk no longer dies with
      RecursionError (CPython's default limit is ~1000 frames);
    - chooses uniformly among the *unvisited* links instead of rejection
      sampling, which looped forever once every link on a page had been
      visited and raised IndexError when a page had no article links;
    - also catches IndexError from findAll("p")[0] on paragraph-less pages.
    """
    global pages
    while True:
        html = urlopen("http://en.wikipedia.org" + pageUrl)
        bsObj = BeautifulSoup(html, "lxml")
        try:
            print(bsObj.h1.get_text())
            print(bsObj.find(id="mw-content-text").findAll("p")[0])
            print(bsObj.find(id="ca-edit").find("span").find("a").attrs['href'])
        except (AttributeError, IndexError):
            print("This page is missing something! No worries though!")
        # Internal article links only: href starts with /wiki/ and has no
        # colon (colons mark namespace pages such as Talk:, File:, Category:).
        links = bsObj.findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))
        unvisited = [link for link in links if link.attrs['href'] not in pages]
        if not unvisited:
            # Dead end: every article link here was already crawled.
            print("No unvisited links on this page; ending the walk.")
            return
        # We have encountered a new page
        newPage = random.choice(unvisited).attrs['href']
        print("----------------\n" + newPage)
        pages.add(newPage)
        time.sleep(1)
        pageUrl = newPage  # continue the walk from the new page
30
if __name__ == "__main__":
    # Script entry point: start the random walk at the Wikipedia front page.
    # Guarded so importing this module does not immediately begin crawling.
    getLinks("")