GitHub Repository: REMitchell/python-crawling
Path: blob/master/3-Advanced-Crawlers/crawler.py

from website import Website
from topic import Topic
from content import Content

import pymysql
import requests
from bs4 import BeautifulSoup
import sys
from io import StringIO
import csv
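
# Illustrative sketch: website.py and topic.py are not included in this file,
# so the commented classes below show one plausible shape inferred from how
# this script uses them (Website is built from the eight columns of sites.csv;
# Topic exposes a name). The constructor argument order is an assumption.
#
# class Website:
#     def __init__(self, name, url, searchUrl, resultListing,
#                  resultUrl, absoluteUrl, pageTitle, pageBody):
#         self.name = name
#         self.url = url
#         self.searchUrl = searchUrl
#         self.resultListing = resultListing
#         self.resultUrl = resultUrl
#         self.absoluteUrl = absoluteUrl
#         self.pageTitle = pageTitle
#         self.pageBody = pageBody
#
# class Topic:
#     def __init__(self, id, name):
#         self.id = id
#         self.name = name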

class Crawler:
    conn = None
    cur = None

    # def __init__(self):
    #     print("Starting!")

    #########
    # Prints content, can be integrated with MySQL to store things
    #########
    def printContent(self, topic, title, body, url):
        print("New article found for: "+topic.name)
        print(title)
        print(body)
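
    ################
    # Illustrative sketch of the MySQL integration mentioned above, using the
    # pymysql import. The database "crawling", the table "content", and its
    # columns are assumptions for illustration, not a schema defined in this
    # repository.
    ################
    def storeContent(self, topic, title, body, url):
        conn = pymysql.connect(host='127.0.0.1', user='root', passwd='',
                               db='crawling', charset='utf8')
        cur = conn.cursor()
        cur.execute("INSERT INTO content (topicName, title, body, url) "
                    "VALUES (%s, %s, %s, %s)", (topic.name, title, body, url))
        conn.commit()
        cur.close()
        conn.close()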


    #########
    # Creates a new topic object from a topic string
    #########
    def getTopicFromName(self, topicName):
        topic = Topic(0, topicName)
        return topic

    ################
    # Utility function used to get a Beautiful Soup object
    # from a given URL
    ################
    def getPage(self, url):
        print("Retrieving URL:\n"+url)
        session = requests.Session()
        headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
                   "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}
        try:
            req = session.get(url, headers=headers)
        except requests.exceptions.RequestException:
            return None
        bsObj = BeautifulSoup(req.text, "lxml")
        return bsObj


    ################
    # Utility function used to get a content string from a Beautiful Soup
    # object and a selector. Returns an empty string if no object
    # is found for the given selector
    ################
    def safeGet(self, pageObj, selector):
        childObj = pageObj.select(selector)
        if childObj is not None and len(childObj) > 0:
            return childObj[0].get_text()
        return ""


    ################
    # Searches a given website for a given topic and records all
    # pages found
    ################
    def search(self, topic, site):
        bsObj = self.getPage(site.searchUrl+topic.name)
        # getPage returns None on a request error; skip this site if the
        # search page itself could not be retrieved
        if bsObj is None:
            print("Something was wrong with that search page or URL. Skipping!")
            return
        searchResults = bsObj.select(site.resultListing)
        for result in searchResults:
            url = result.select(site.resultUrl)[0].attrs["href"]
            # Check to see whether it's a relative or an absolute URL
            if site.absoluteUrl == "TRUE":
                pageObj = self.getPage(url)
            else:
                pageObj = self.getPage(site.url+url)
            if pageObj is None:
                print("Something was wrong with that page or URL. Skipping!")
            else:
                title = self.safeGet(pageObj, site.pageTitle)
                print("Title is "+title)
                body = self.safeGet(pageObj, site.pageBody)
                if title != "" and body != "":
                    self.printContent(topic, title, body, url)

    ################
    # Starts a search of a given website for a given topic
    ################
    def crawl(self, topicStr, targetSite):
        # If using MySQL, this will get any stored details about the topic
        # If not using MySQL, it will essentially do nothing
        topic = self.getTopicFromName(topicStr)
        self.search(topic, targetSite)


#####################################################
##### "User" code, outside the scraper class ########
#####################################################

f = open("topics.txt", 'r')
topicName = f.readline().strip()
crawler = Crawler()
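
# Illustrative sketch: topics.txt is not shown here. Based on the readline()
# calls above and below, it is assumed to hold one search topic per line,
# for example:
#
#   python
#   data science
#   web scraping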

# Get a list of sites to search from the sites.csv file
data = open("sites.csv", 'r').read()
dataFile = StringIO(data)
siteRows = csv.reader(dataFile)

# Skip the header line in the CSV file - the header makes it easy to read,
# but we don't want to use the column titles as actual site data
next(siteRows)

# Build a list of websites to search, from the CSV file
sites = []
for row in siteRows:
    sites.append(Website(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]))
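
# Illustrative sketch: sites.csv is not shown here. Its eight columns are
# assumed to match the Website constructor in this order; the header and
# example row below are illustrative assumptions, not the repository's
# actual data.
#
#   name,url,searchUrl,resultListing,resultUrl,absoluteUrl,pageTitle,pageBody
#   Example Site,http://example.com,http://example.com/search?q=,div.result,a.title,FALSE,h1,div.article-body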

# Crawl every site for every topic listed in topics.txt
while topicName:
    print("GETTING INFO ABOUT: "+topicName)
    for targetSite in sites:
        crawler.crawl(topicName, targetSite)
    topicName = f.readline().strip()

f.close()