Path: blob/master/2-Basic-Crawler/basicCrawler.py
164 views
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

##################
# This crawler gets the most recent "Technology" articles
# from Reuters, and prints out their title and lede
# (or the first paragraph)
#################


def getArticle(url):
    """Fetch one Reuters article page and print its metadata and text.

    Prints the URL, title, timestamp, location (when present) and the
    full article body. If the page does not look like an article
    (missing title or body element), it is skipped with a notice
    instead of raising AttributeError on a None tag.
    """
    print("URL: "+url)
    html = urlopen(url)
    articleObj = BeautifulSoup(html.read(), "lxml")

    # Get article title. This should have a class name ending in "title".
    # find() returns None when the element is absent, so guard before
    # calling get_text() — otherwise a layout change crashes the crawl.
    titleTag = articleObj.find("h1")
    # Get the main body of the article text
    bodyTag = articleObj.find("span", {"id": "article-text"})
    if titleTag is None or bodyTag is None:
        print("SKIPPED: page does not match expected article layout")
        print("-----------------------------")
        return

    title = titleTag.get_text()
    # Timestamp is optional too; fall back to an empty string.
    timeTag = articleObj.find("span", {"class": "timestamp"})
    timestamp = timeTag.get_text() if timeTag else ""
    # Location line is optional on Reuters article pages.
    location = ""
    locationTag = articleObj.find("span", {"class": "articleLocation"})
    if locationTag:
        location = locationTag.get_text()
    body = bodyTag.get_text()

    print("TITLE: "+title)
    # Fix: this value is the article timestamp, not the author —
    # the original printed it under an "AUTHOR: " label.
    print("TIME: "+timestamp)
    print("LOCATION: "+location)
    print("BODY: "+body)
    print("-----------------------------")


# Walk the first 10 pages of the Technology archive listing and scrape
# every internal (site-relative) article linked from each page.
for i in range(0, 10):
    print("Scraping page: "+str(i)+" of articles")
    url = "http://www.reuters.com/news/archive/technologyNews?view=page&page="+str(i)+"&pageSize=10"
    html = urlopen(url)
    listingObj = BeautifulSoup(html.read(), "lxml")
    # Each headline on the listing page is an <h3 class="story-title">.
    headlines = listingObj.findAll("h3", {"class": "story-title"})
    # Renamed the loop variable (was `url`) so it no longer shadows the
    # listing-page URL string above with a bs4 Tag object.
    for headline in headlines:
        newPage = headline.find("a").attrs['href']
        # Ignore external URLs: only site-relative paths are followed.
        if newPage.startswith("/"):
            getArticle("http://reuters.com"+newPage)