Path: blob/master/2-Basic-Crawler/basicCrawler.py
164 views
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

##################
# This crawler gets the most recent "Technology" articles
# from Reuters, and prints out their title and lede
# (or the first paragraph)
#################


def getArticle(url):
    """Fetch one Reuters article page and print its metadata and text.

    Prints the URL, title, timestamp, location (when present) and the
    full article body. If the page does not look like an article
    (missing title or body element), it is skipped with a notice
    instead of raising AttributeError on a None tag.
    """
    print("URL: "+url)
    html = urlopen(url)
    articleObj = BeautifulSoup(html.read(), "lxml")

    # Get article title. This should have a class name ending in "title".
    # find() returns None when the element is absent, so guard before
    # calling get_text() — otherwise a layout change crashes the crawl.
    titleTag = articleObj.find("h1")
    # Get the main body of the article text
    bodyTag = articleObj.find("span", {"id": "article-text"})
    if titleTag is None or bodyTag is None:
        print("SKIPPED: page does not match expected article layout")
        print("-----------------------------")
        return

    title = titleTag.get_text()
    # Timestamp is optional too; fall back to an empty string.
    timeTag = articleObj.find("span", {"class": "timestamp"})
    timestamp = timeTag.get_text() if timeTag else ""
    # Location line is optional on Reuters article pages.
    location = ""
    locationTag = articleObj.find("span", {"class": "articleLocation"})
    if locationTag:
        location = locationTag.get_text()
    body = bodyTag.get_text()

    print("TITLE: "+title)
    # Fix: this value is the article timestamp, not the author —
    # the original printed it under an "AUTHOR: " label.
    print("TIME: "+timestamp)
    print("LOCATION: "+location)
    print("BODY: "+body)
    print("-----------------------------")


# Walk the first 10 pages of the Technology archive listing and scrape
# every internal (site-relative) article linked from each page.
for i in range(0, 10):
    print("Scraping page: "+str(i)+" of articles")
    url = "http://www.reuters.com/news/archive/technologyNews?view=page&page="+str(i)+"&pageSize=10"
    html = urlopen(url)
    listingObj = BeautifulSoup(html.read(), "lxml")
    # Each headline on the listing page is an <h3 class="story-title">.
    headlines = listingObj.findAll("h3", {"class": "story-title"})
    # Renamed the loop variable (was `url`) so it no longer shadows the
    # listing-page URL string above with a bs4 Tag object.
    for headline in headlines:
        newPage = headline.find("a").attrs['href']
        # Ignore external URLs: only site-relative paths are followed.
        if newPage.startswith("/"):
            getArticle("http://reuters.com"+newPage)