GitHub Repository: REMitchell/python-crawling
Path: blob/master/3-Advanced-Crawlers/crawler.py

from website import Website
from topic import Topic
from content import Content

import pymysql
import requests
from bs4 import BeautifulSoup
import sys
from io import StringIO
import csv
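
# Illustrative sketch: website.py and topic.py are not included in this file,
# so the commented classes below show one plausible shape inferred from how
# this script uses them (Website is built from the eight columns of sites.csv;
# Topic exposes a name). The constructor argument order is an assumption.
#
# class Website:
#     def __init__(self, name, url, searchUrl, resultListing,
#                  resultUrl, absoluteUrl, pageTitle, pageBody):
#         self.name = name
#         self.url = url
#         self.searchUrl = searchUrl
#         self.resultListing = resultListing
#         self.resultUrl = resultUrl
#         self.absoluteUrl = absoluteUrl
#         self.pageTitle = pageTitle
#         self.pageBody = pageBody
#
# class Topic:
#     def __init__(self, id, name):
#         self.id = id
#         self.name = name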

class Crawler:
    conn = None
    cur = None

    # def __init__(self):
    #     print("Starting!")

    #########
    # Prints content, can be integrated with MySQL to store things
    #########
    def printContent(self, topic, title, body, url):
        print("New article found for: "+topic.name)
        print(title)
        print(body)
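
    ################
    # Illustrative sketch of the MySQL integration mentioned above, using the
    # pymysql import. The database "crawling", the table "content", and its
    # columns are assumptions for illustration, not a schema defined in this
    # repository.
    ################
    def storeContent(self, topic, title, body, url):
        conn = pymysql.connect(host='127.0.0.1', user='root', passwd='',
                               db='crawling', charset='utf8')
        cur = conn.cursor()
        cur.execute("INSERT INTO content (topicName, title, body, url) "
                    "VALUES (%s, %s, %s, %s)", (topic.name, title, body, url))
        conn.commit()
        cur.close()
        conn.close()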


    #########
    # Creates a new topic object from a topic string
    #########
    def getTopicFromName(self, topicName):
        topic = Topic(0, topicName)
        return topic

    ################
    # Utility function used to get a Beautiful Soup object
    # from a given URL
    ################
    def getPage(self, url):
        print("Retrieving URL:\n"+url)
        session = requests.Session()
        headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
                   "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}
        try:
            req = session.get(url, headers=headers)
        except requests.exceptions.RequestException:
            return None
        bsObj = BeautifulSoup(req.text, "lxml")
        return bsObj


    ################
    # Utility function used to get a content string from a Beautiful Soup
    # object and a selector. Returns an empty string if no object
    # is found for the given selector
    ################
    def safeGet(self, pageObj, selector):
        childObj = pageObj.select(selector)
        if childObj is not None and len(childObj) > 0:
            return childObj[0].get_text()
        return ""


    ################
    # Searches a given website for a given topic and records all
    # pages found
    ################
    def search(self, topic, site):
        bsObj = self.getPage(site.searchUrl+topic.name)
        # getPage returns None on a request error; skip this site if the
        # search page itself could not be retrieved
        if bsObj is None:
            print("Something was wrong with that search page or URL. Skipping!")
            return
        searchResults = bsObj.select(site.resultListing)
        for result in searchResults:
            url = result.select(site.resultUrl)[0].attrs["href"]
            # Check to see whether it's a relative or an absolute URL
            if site.absoluteUrl == "TRUE":
                pageObj = self.getPage(url)
            else:
                pageObj = self.getPage(site.url+url)
            if pageObj is None:
                print("Something was wrong with that page or URL. Skipping!")
            else:
                title = self.safeGet(pageObj, site.pageTitle)
                print("Title is "+title)
                body = self.safeGet(pageObj, site.pageBody)
                if title != "" and body != "":
                    self.printContent(topic, title, body, url)

    ################
    # Starts a search of a given website for a given topic
    ################
    def crawl(self, topicStr, targetSite):
        # If using MySQL, this will get any stored details about the topic
        # If not using MySQL, it will essentially do nothing
        topic = self.getTopicFromName(topicStr)
        self.search(topic, targetSite)


#####################################################
##### "User" code, outside the scraper class ########
#####################################################

f = open("topics.txt", 'r')
topicName = f.readline().strip()
crawler = Crawler()
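
# Illustrative sketch: topics.txt is not shown here. Based on the readline()
# calls above and below, it is assumed to hold one search topic per line,
# for example:
#
#   python
#   data science
#   web scraping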

# Get a list of sites to search from the sites.csv file
data = open("sites.csv", 'r').read()
dataFile = StringIO(data)
siteRows = csv.reader(dataFile)

# Skip the header line in the CSV file - the header makes it easy to read,
# but we don't want to use the column titles as actual site data
next(siteRows)

# Build a list of websites to search, from the CSV file
sites = []
for row in siteRows:
    sites.append(Website(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]))
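
# Illustrative sketch: sites.csv is not shown here. Its eight columns are
# assumed to match the Website constructor in this order; the header and
# example row below are illustrative assumptions, not the repository's
# actual data.
#
#   name,url,searchUrl,resultListing,resultUrl,absoluteUrl,pageTitle,pageBody
#   Example Site,http://example.com,http://example.com/search?q=,div.result,a.title,FALSE,h1,div.article-body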

# Crawl every site for every topic listed in topics.txt
while topicName:
    print("GETTING INFO ABOUT: "+topicName)
    for targetSite in sites:
        crawler.crawl(topicName, targetSite)
    topicName = f.readline().strip()

f.close()