Path: blob/master/3-Advanced-Crawlers/crawler.py
from website import Website
from topic import Topic
from content import Content  # used only by the optional MySQL integration

import pymysql  # used only by the optional MySQL integration
import requests
from bs4 import BeautifulSoup
import csv


class Crawler:
    # Placeholders for a MySQL connection and cursor, if storage is enabled
    conn = None
    cur = None

    # def __init__(self):
    #     print("Starting!")


    #########
    # Prints content; can be integrated with MySQL to store things
    # (see the hedged storage sketch at the bottom of this file)
    #########
    def printContent(self, topic, title, body, url):
        print("New article found for: "+topic.name)
        print(title)
        print(body)


    #########
    # Creates a new Topic object from a topic string
    #########
    def getTopicFromName(self, topicName):
        topic = Topic(0, topicName)
        return topic

    ################
    # Utility function used to get a BeautifulSoup object
    # from a given URL. Returns None if the page can't be retrieved
    ################
    def getPage(self, url):
        print("Retrieving URL:\n"+url)
        session = requests.Session()
        headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
                   "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}
        try:
            req = session.get(url, headers=headers)
        except requests.exceptions.RequestException:
            return None
        bsObj = BeautifulSoup(req.text, "lxml")
        return bsObj


    ################
    # Utility function used to get a content string from a BeautifulSoup
    # object and a selector. Returns an empty string if no object
    # is found for the given selector
    ################
    def safeGet(self, pageObj, selector):
        childObj = pageObj.select(selector)
        if childObj is not None and len(childObj) > 0:
            return childObj[0].get_text()
        return ""


    ################
    # Searches a given website for a given topic and records all
    # pages found
    ################
    def search(self, topic, site):
        bsObj = self.getPage(site.searchUrl+topic.name)
        if bsObj is None:
            print("Could not retrieve the search results page. Skipping site!")
            return
        searchResults = bsObj.select(site.resultListing)
        for result in searchResults:
            links = result.select(site.resultUrl)
            if not links:
                # The listing matched, but it contains no link to follow
                continue
            url = links[0].attrs["href"]
            # Check whether it's a relative or an absolute URL
            if site.absoluteUrl == "TRUE":
                pageObj = self.getPage(url)
            else:
                pageObj = self.getPage(site.url+url)
            if pageObj is None:
                print("Something was wrong with that page or URL. Skipping!")
            else:
                title = self.safeGet(pageObj, site.pageTitle)
                print("Title is "+title)
                body = self.safeGet(pageObj, site.pageBody)
                if title != "" and body != "":
                    self.printContent(topic, title, body, url)

    ################
    # Starts a search of a given website for a given topic
    ################
    def crawl(self, topicStr, targetSite):
        # If using MySQL, this would look up any stored details about the
        # topic; without MySQL it simply wraps the string in a Topic object
        topic = self.getTopicFromName(topicStr)
        self.search(topic, targetSite)

#####################################################
##### "User" code, outside the scraper class ########
#####################################################

crawler = Crawler()

# Get the list of sites to search from the sites.csv file
# (see the example input files sketched at the bottom of this file)
sites = []
with open("sites.csv", newline="") as siteFile:
    siteRows = csv.reader(siteFile)
    # Skip the header line in the CSV file -- the header makes it easy to
    # read, but we don't want to use the column titles as actual site data
    next(siteRows)
    # Build a list of Website objects to search, one per CSV row
    for row in siteRows:
        sites.append(Website(row[0], row[1], row[2], row[3],
                             row[4], row[5], row[6], row[7]))

# Read topics from topics.txt, one per line, and search every site for each
with open("topics.txt", "r") as f:
    topicName = f.readline().strip()
    while topicName:
        print("GETTING INFO ABOUT: "+topicName)
        for targetSite in sites:
            crawler.crawl(topicName, targetSite)
        topicName = f.readline().strip()