GitHub Repository: REMitchell/python-crawling
Path: blob/master/3-Advanced-Crawlers/mysql-optional/articles_mysql.py
from website import Website
from topic import Topic
from content import Content

import pymysql
import requests
from bs4 import BeautifulSoup
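
# Note: getPage below asks BeautifulSoup for the lxml parser, so lxml must be
# installed alongside the packages imported above, e.g.:
#   pip install pymysql requests beautifulsoup4 lxml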

class Crawler:
    conn = None
    cur = None

    def __init__(self):
        global conn
        global cur

    #########
    # Open a MySQL connection. Should be triggered by the caller before running
    # the scraper, if the caller is using MySQL
    #########
    def openCon(self):
        global conn
        global cur
        # Use this line when connecting to MySQL on Linux/Unix/macOS
        conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='root', db='mysql', charset='utf8')
        # Use this line when connecting to MySQL on Windows
        #conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd=None, db='mysql', charset='utf8')

        cur = conn.cursor(pymysql.cursors.DictCursor)
        cur.execute("USE articleCrawler")

    #########
    # Close a MySQL connection. Should be triggered by the caller after running
    # the scraper, if the caller is using MySQL
    #########
    def closeCon(self):
        global conn
        global cur
        conn.close()

    #########
    # Prints and stores content if content does not already exist for that URL and topic
    #########
    def storeContent(self, topic, site, title, body, url):
        global conn
        global cur
        # Optionally, comment out the print statements if you want this to go
        # straight to MySQL without printing
        print("New article found for: "+topic.name)
        print(title)
        print(body)

        # Truncate to fit within the content table's column sizes
        if len(body) > 9999:
            body = body[:9999]
        if len(title) > 999:
            title = title[:999]
        cur.execute("SELECT * FROM content WHERE url = %s AND topicId = %s", (url, int(topic.id)))
        if cur.rowcount == 0:
            try:
                cur.execute("INSERT INTO content (topicId, siteId, title, body, url) VALUES(%s, %s, %s, %s, %s)", (int(topic.id), int(site.id), title, body, url))
            except pymysql.MySQLError:
                print("Could not store article")
            try:
                conn.commit()
            except pymysql.MySQLError:
                conn.rollback()


    #########
    # Builds a Website object from every row in the sites table
    #########
    def getSites(self):
        global conn
        global cur
        cur.execute("SELECT * FROM sites")
        sitesData = cur.fetchall()
        allSiteObjs = []
        for site in sitesData:
            siteObj = Website(site['id'], site['name'], site['url'], site['searchUrl'], site['resultListing'], site['resultUrl'], site['absoluteUrl'], site['pageTitle'], site['pageBody'])
            allSiteObjs.append(siteObj)
        return allSiteObjs

    #########
    # Builds a Topic object from every row in the topics table
    #########
    def getTopics(self):
        global conn
        global cur
        cur.execute("SELECT * FROM topics")
        topicsData = cur.fetchall()
        allTopicObjs = []
        for topic in topicsData:
            topicObj = Topic(topic['id'], topic['name'])
            allTopicObjs.append(topicObj)
        return allTopicObjs
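
    # website.Website and topic.Topic are defined in sibling modules of this
    # repository. A minimal sketch consistent with how they are constructed
    # above (the real classes may differ):
    #
    #   class Website:
    #       def __init__(self, id, name, url, searchUrl, resultListing,
    #                    resultUrl, absoluteUrl, pageTitle, pageBody):
    #           self.id = id
    #           self.name = name
    #           self.url = url
    #           self.searchUrl = searchUrl
    #           self.resultListing = resultListing
    #           self.resultUrl = resultUrl
    #           self.absoluteUrl = absoluteUrl
    #           self.pageTitle = pageTitle
    #           self.pageBody = pageBody
    #
    #   class Topic:
    #       def __init__(self, id, name):
    #           self.id = id
    #           self.name = name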


    ################
    # Utility function used to get a Beautiful Soup object
    # from a given URL
    ################
    def getPage(self, url):
        print("Retrieving URL:\n"+url)
        session = requests.Session()
        # Browser-like headers, to avoid being blocked by sites that reject
        # the default requests User-Agent
        headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}
        try:
            req = session.get(url, headers=headers)
        except requests.exceptions.RequestException:
            return None
        bsObj = BeautifulSoup(req.text, "lxml")
        return bsObj


    ################
    # Utility function used to get a string from a Beautiful Soup
    # object and a selector. Returns an empty string if no object
    # is found for the given selector
    ################
    def safeGet(self, pageObj, selector):
        childObj = pageObj.select(selector)
        # select() returns a list; an empty list means no match
        if childObj is not None and len(childObj) > 0:
            return childObj[0].get_text()
        return ""


    ################
    # Searches a given website for a given topic and records all
    # pages found
    ################
    def search(self, topic, site):
        print(site.searchUrl+topic.name)
        bsObj = self.getPage(site.searchUrl+topic.name)
        if bsObj is None:
            print("Could not retrieve the search results page. Skipping")
            return
        searchResults = bsObj.select(site.resultListing)
        for result in searchResults:
            url = result.select(site.resultUrl)[0].attrs["href"]
            # Check to see whether it's a relative or an absolute URL
            if site.absoluteUrl == "true":
                pageObj = self.getPage(url)
            else:
                pageObj = self.getPage(site.url+url)

            if pageObj is None:
                print("Something is wrong with that page or URL. Skipping")
            else:
                title = self.safeGet(pageObj, site.pageTitle)
                print("Title is "+title)
                body = self.safeGet(pageObj, site.pageBody)
                if title != "" and body != "":
                    self.storeContent(topic, site, title, body, url)


#####################################################
##### "User" code, outside the scraper class ########
#####################################################

crawler = Crawler()
crawler.openCon()
# Build the lists of websites to search and topics to search for, from MySQL
sites = crawler.getSites()
topics = crawler.getTopics()

for topic in topics:
    print("GETTING INFO ABOUT: "+topic.name)
    for targetSite in sites:
        print("FROM SITE: "+targetSite.name)
        crawler.search(topic, targetSite)

crawler.closeCon()
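
# A hypothetical example of a row in the sites table, illustrating the fields a
# Website object expects (the values here are illustrative assumptions, not from
# the repository). resultListing, resultUrl, pageTitle, and pageBody are CSS
# selectors consumed by BeautifulSoup's select(); absoluteUrl is stored as the
# string "true" or "false":
#
#   INSERT INTO sites (name, url, searchUrl, resultListing, resultUrl,
#       absoluteUrl, pageTitle, pageBody)
#   VALUES ('Example News', 'http://example.com', 'http://example.com/search?q=',
#       'div.result', 'a', 'false', 'h1', 'div.article-body');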