Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
REMitchell
GitHub Repository: REMitchell/python-crawling
Path: blob/master/2-Basic-Crawler/basicCrawler.py
164 views
1
from urllib.request import urlopen
2
from bs4 import BeautifulSoup
3
import re
4
5
##################
# This crawler gets the most recent "Technology" articles
# from Reuters and prints each one's title and lede
# (i.e. its first paragraph).
##################
10
def getArticle(url):
    """Fetch a single Reuters article and print its metadata and text.

    Prints the article's URL, title (from the page's <h1>), timestamp
    (span.timestamp), dateline location (span.articleLocation, if any),
    and body text (span#article-text).

    :param url: Absolute URL of the Reuters article page.
    :raises AttributeError: if the page lacks an <h1>, timestamp span,
        or article-text span (the .find() call returns None).
    """
    print("URL: "+url)
    # Close the HTTP response promptly instead of leaking the socket.
    with urlopen(url) as html:
        articleObj = BeautifulSoup(html.read(), "lxml")
    # Get article title. This should be the page's single <h1>.
    title = articleObj.find("h1").get_text()
    time = articleObj.find("span", {"class": "timestamp"}).get_text()
    # The dateline location is optional; look it up once instead of twice.
    location = ""
    locationTag = articleObj.find("span", {"class": "articleLocation"})
    if locationTag:
        location = locationTag.get_text()
    # Get the main body of the article text
    body = articleObj.find("span", {"id": "article-text"}).get_text()

    print("TITLE: "+title)
    # Fixed label: this value is the article timestamp, not its author.
    print("TIME: "+time)
    print("LOCATION: "+location)
    print("BODY: "+body)
    print("-----------------------------")
30
31
# Walk the first ten pages of the Reuters technology-news archive and
# scrape every internally linked article on each listing page.
for page in range(0, 10):
    print("Scraping page: "+str(page)+" of articles")
    listingUrl = "http://www.reuters.com/news/archive/technologyNews?view=page&page="+str(page)+"&pageSize=10"
    # Close the listing-page response promptly instead of leaking the socket.
    with urlopen(listingUrl) as html:
        listingObj = BeautifulSoup(html.read(), "lxml")
    # Each story headline is an <h3 class="story-title"> wrapping an <a>.
    headlines = listingObj.findAll("h3", {"class": "story-title"})
    for headline in headlines:
        newPage = headline.find("a").attrs['href']
        # Ignore external URLs; internal article links are site-relative.
        if newPage.startswith("/"):
            getArticle("http://reuters.com"+newPage)
42