# Path: blob/master/4-Crawling-Patterns/crawlWiki_recursive.py
# 164 views
import re
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Matches hrefs that begin with "/wiki/" and contain no colon anywhere.
# NOTE(review): the colon exclusion presumably filters out non-article
# pages such as "File:" or "Talk:" links -- confirm against the site.
_ARTICLE_HREF = re.compile(r"^(/wiki/)((?!:).)*$")

# Every link path that has already been reported; shared by all
# recursive calls so that no page is visited twice.
pages = set()


def getLinks(pageUrl):
    """Print details of one Wikipedia page, then crawl its unseen links.

    The page at ``"http://en.wikipedia.org" + pageUrl`` is fetched and
    parsed. Its ``<h1>`` text, the first ``<p>`` inside the element with
    id ``mw-content-text`` and the href of the edit link (the ``<a>``
    inside a ``<span>`` inside the element with id ``ca-edit``) are
    printed. If any of those pieces is absent, a single notice is printed
    instead of the remaining details.

    Every ``<a>`` whose href matches ``_ARTICLE_HREF`` and is not yet in
    ``pages`` is then printed, recorded in ``pages`` and crawled by a
    recursive call.

    Args:
        pageUrl: Path appended to the site root, e.g. ``"/wiki/Python"``;
            the empty string fetches the site root itself.

    Raises:
        RecursionError: Each unseen link adds one level of recursion, so
            a sufficiently long chain of new pages exceeds the
            interpreter's recursion limit.

    Errors raised by ``urlopen`` (network or HTTP failures) are not
    caught and propagate to the caller.
    """
    # Parse inside the ``with`` so the connection is released before
    # recursing; otherwise every level of the crawl keeps a socket open.
    with urlopen("http://en.wikipedia.org" + pageUrl) as response:
        bsObj = BeautifulSoup(response, "lxml")

    try:
        print(bsObj.h1.get_text())
        print(bsObj.find(id="mw-content-text").find_all("p")[0])
        print(bsObj.find(id="ca-edit").find("span").find("a").attrs["href"])
    except (AttributeError, IndexError, KeyError):
        # AttributeError: an expected element is missing (find() gave None).
        # IndexError: the content element holds no <p> at all.
        # KeyError: the edit anchor exists but carries no href.
        print("This page is missing something! No worries though!")

    # The href filter guarantees each returned tag has an href attribute.
    for link in bsObj.find_all("a", href=_ARTICLE_HREF):
        newPage = link.attrs["href"]
        if newPage in pages:
            continue
        # We have encountered a new page.
        print("----------------\n" + newPage)
        pages.add(newPage)
        getLinks(newPage)


# Start the crawl from the site root.
getLinks("")