GitHub Repository: REMitchell/python-crawling
Path: blob/master/3-Advanced-Crawlers/mysql-optional/articles_mysql.py
from website import Website
from topic import Topic
from content import Content

import pymysql
import requests
from bs4 import BeautifulSoup
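
# Note: getPage below asks BeautifulSoup for the lxml parser, so lxml must be
# installed alongside the packages imported above, e.g.:
#   pip install pymysql requests beautifulsoup4 lxml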

class Crawler:
    conn = None
    cur = None

    def __init__(self):
        global conn
        global cur

    #########
    # Open a MySQL connection. Should be triggered by the caller before running
    # the scraper, if the caller is using MySQL
    #########
    def openCon(self):
        global conn
        global cur
        # Use this line when connecting to MySQL on Linux/Unix/macOS
        conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='root', db='mysql', charset='utf8')
        # Use this line when connecting to MySQL on Windows
        #conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd=None, db='mysql', charset='utf8')

        cur = conn.cursor(pymysql.cursors.DictCursor)
        cur.execute("USE articleCrawler")

    #########
    # Close a MySQL connection. Should be triggered by the caller after running
    # the scraper, if the caller is using MySQL
    #########
    def closeCon(self):
        global conn
        global cur
        conn.close()

    #########
    # Prints and stores content if content does not already exist for that URL and topic
    #########
    def storeContent(self, topic, site, title, body, url):
        global conn
        global cur
        # Optionally, comment out the print statements if you want this to go
        # straight to MySQL without printing
        print("New article found for: "+topic.name)
        print(title)
        print(body)

        # Truncate to fit within the content table's column sizes
        if len(body) > 9999:
            body = body[:9999]
        if len(title) > 999:
            title = title[:999]
        cur.execute("SELECT * FROM content WHERE url = %s AND topicId = %s", (url, int(topic.id)))
        if cur.rowcount == 0:
            try:
                cur.execute("INSERT INTO content (topicId, siteId, title, body, url) VALUES(%s, %s, %s, %s, %s)", (int(topic.id), int(site.id), title, body, url))
            except pymysql.MySQLError:
                print("Could not store article")
            try:
                conn.commit()
            except pymysql.MySQLError:
                conn.rollback()


    #########
    # Builds a Website object from every row in the sites table
    #########
    def getSites(self):
        global conn
        global cur
        cur.execute("SELECT * FROM sites")
        sitesData = cur.fetchall()
        allSiteObjs = []
        for site in sitesData:
            siteObj = Website(site['id'], site['name'], site['url'], site['searchUrl'], site['resultListing'], site['resultUrl'], site['absoluteUrl'], site['pageTitle'], site['pageBody'])
            allSiteObjs.append(siteObj)
        return allSiteObjs

    #########
    # Builds a Topic object from every row in the topics table
    #########
    def getTopics(self):
        global conn
        global cur
        cur.execute("SELECT * FROM topics")
        topicsData = cur.fetchall()
        allTopicObjs = []
        for topic in topicsData:
            topicObj = Topic(topic['id'], topic['name'])
            allTopicObjs.append(topicObj)
        return allTopicObjs
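
    # website.Website and topic.Topic are defined in sibling modules of this
    # repository. A minimal sketch consistent with how they are constructed
    # above (the real classes may differ):
    #
    #   class Website:
    #       def __init__(self, id, name, url, searchUrl, resultListing,
    #                    resultUrl, absoluteUrl, pageTitle, pageBody):
    #           self.id = id
    #           self.name = name
    #           self.url = url
    #           self.searchUrl = searchUrl
    #           self.resultListing = resultListing
    #           self.resultUrl = resultUrl
    #           self.absoluteUrl = absoluteUrl
    #           self.pageTitle = pageTitle
    #           self.pageBody = pageBody
    #
    #   class Topic:
    #       def __init__(self, id, name):
    #           self.id = id
    #           self.name = name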


    ################
    # Utility function used to get a Beautiful Soup object
    # from a given URL
    ################
    def getPage(self, url):
        print("Retrieving URL:\n"+url)
        session = requests.Session()
        # Browser-like headers, to avoid being blocked by sites that reject
        # the default requests User-Agent
        headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}
        try:
            req = session.get(url, headers=headers)
        except requests.exceptions.RequestException:
            return None
        bsObj = BeautifulSoup(req.text, "lxml")
        return bsObj


    ################
    # Utility function used to get a string from a Beautiful Soup
    # object and a selector. Returns an empty string if no object
    # is found for the given selector
    ################
    def safeGet(self, pageObj, selector):
        childObj = pageObj.select(selector)
        # select() returns a list; an empty list means no match
        if childObj is not None and len(childObj) > 0:
            return childObj[0].get_text()
        return ""


    ################
    # Searches a given website for a given topic and records all
    # pages found
    ################
    def search(self, topic, site):
        print(site.searchUrl+topic.name)
        bsObj = self.getPage(site.searchUrl+topic.name)
        if bsObj is None:
            print("Could not retrieve the search results page. Skipping")
            return
        searchResults = bsObj.select(site.resultListing)
        for result in searchResults:
            url = result.select(site.resultUrl)[0].attrs["href"]
            # Check to see whether it's a relative or an absolute URL
            if site.absoluteUrl == "true":
                pageObj = self.getPage(url)
            else:
                pageObj = self.getPage(site.url+url)

            if pageObj is None:
                print("Something is wrong with that page or URL. Skipping")
            else:
                title = self.safeGet(pageObj, site.pageTitle)
                print("Title is "+title)
                body = self.safeGet(pageObj, site.pageBody)
                if title != "" and body != "":
                    self.storeContent(topic, site, title, body, url)


#####################################################
##### "User" code, outside the scraper class ########
#####################################################

crawler = Crawler()
crawler.openCon()
# Build the lists of websites to search and topics to search for, from MySQL
sites = crawler.getSites()
topics = crawler.getTopics()

for topic in topics:
    print("GETTING INFO ABOUT: "+topic.name)
    for targetSite in sites:
        print("FROM SITE: "+targetSite.name)
        crawler.search(topic, targetSite)

crawler.closeCon()
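
# A hypothetical example of a row in the sites table, illustrating the fields a
# Website object expects (the values here are illustrative assumptions, not from
# the repository). resultListing, resultUrl, pageTitle, and pageBody are CSS
# selectors consumed by BeautifulSoup's select(); absoluteUrl is stored as the
# string "true" or "false":
#
#   INSERT INTO sites (name, url, searchUrl, resultListing, resultUrl,
#       absoluteUrl, pageTitle, pageBody)
#   VALUES ('Example News', 'http://example.com', 'http://example.com/search?q=',
#       'div.result', 'a', 'false', 'h1', 'div.article-body');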