Path: blob/master/4-Crawling-Patterns/crawlWiki_randomWalk.py
164 views
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random
import time

# Hrefs already chosen as a next hop during the walk.
pages = set()

# Internal article links: start with /wiki/ and contain no colon.
_ARTICLE_LINK = re.compile("^(/wiki/)((?!:).)*$")


def getLinks(pageUrl):
    """Random-walk Wikipedia starting from ``pageUrl``.

    For each page visited, print its title, its first paragraph and its
    edit link (when present), then hop to a randomly chosen article link
    whose href has not been picked before. Every chosen href is recorded
    in the module-level ``pages`` set.

    The walk is iterative, so its length is not capped by the interpreter's
    recursion limit. It ends (returning ``None``) when the current page has
    no article link that is still unvisited.

    Args:
        pageUrl: Path appended to ``http://en.wikipedia.org``, e.g.
            ``"/wiki/Python"``; ``""`` starts from the site root.
    """
    while True:
        # Close the HTTP response as soon as the document has been parsed.
        with urlopen("http://en.wikipedia.org" + pageUrl) as html:
            bsObj = BeautifulSoup(html, "lxml")

        try:
            print(bsObj.h1.get_text())
            print(bsObj.find(id="mw-content-text").findAll("p")[0])
            print(bsObj.find(id="ca-edit").find("span").find("a").attrs['href'])
        except (AttributeError, IndexError):
            # A missing tag yields None (AttributeError on the next call);
            # a page without paragraphs makes the [0] lookup fail.
            print("This page is missing something! No worries though!")

        links = bsObj.findAll("a", href=_ARTICLE_LINK)
        # Keep duplicates so a link that appears several times on the page
        # stays proportionally more likely, as with random index selection.
        unvisited = [
            link.attrs['href']
            for link in links
            if link.attrs['href'] not in pages
        ]
        if not unvisited:
            # Dead end: no links at all, or every link already visited.
            print("No unvisited article links on this page; stopping the walk.")
            return

        # We have encountered a new page
        newPage = random.choice(unvisited)
        print("----------------\n" + newPage)
        pages.add(newPage)
        time.sleep(1)  # be polite to the server between requests
        pageUrl = newPage


if __name__ == "__main__":
    getLinks("")