Path: blob/master/3-Advanced-Crawlers/mysql-optional/articles_mysql.py
from website import Website
from topic import Topic

import pymysql
import requests
from bs4 import BeautifulSoup


class Crawler:
    conn = None
    cur = None

    def __init__(self):
        global conn
        global cur

    #########
    # Open a MySQL connection. Should be triggered by the caller before running
    # the scraper, if the caller is using MySQL
    #########
    def openCon(self):
        global conn
        global cur
        #Use this line connecting to MySQL on Linux/Unix/Mac OS X
        conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='root', db='mysql', charset='utf8')
        #Use this line connecting to MySQL on Windows
        #conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd=None, db='mysql', charset='utf8')

        cur = conn.cursor(pymysql.cursors.DictCursor)
        cur.execute("USE articleCrawler")

    #########
    # Close a MySQL connection. Should be triggered by the caller after running
    # the scraper, if the caller is using MySQL
    #########
    def closeCon(self):
        global conn
        global cur
        conn.close()

    #########
    # Prints and stores content if content does not already exist for that
    # URL and topic
    #########
    def storeContent(self, topic, site, title, body, url):
        global conn
        global cur
        #Optionally, comment out the print statements if you want this to go
        #straight to MySQL without printing
        print("New article found for: "+topic.name)
        print(title)
        print(body)

        #Truncate to the column sizes used in the content table
        if len(body) > 9999:
            body = body[:9999]
        if len(title) > 999:
            title = title[:999]
        cur.execute("SELECT * FROM content WHERE url = %s AND topicId = %s", (url, int(topic.id)))
        if cur.rowcount == 0:
            try:
                cur.execute("INSERT INTO content (topicId, siteId, title, body, url) VALUES(%s, %s, %s, %s, %s)", (int(topic.id), int(site.id), title, body, url))
            except pymysql.MySQLError:
                print("Could not store article")
            try:
                conn.commit()
            except pymysql.MySQLError:
                conn.rollback()

    #########
    # Builds a list of Website objects from the rows of the sites table
    #########
    def getSites(self):
        global conn
        global cur
        cur.execute("SELECT * FROM sites")
        sitesData = cur.fetchall()
        allSiteObjs = []
        for site in sitesData:
            siteObj = Website(site['id'], site['name'], site['url'], site['searchUrl'], site['resultListing'], site['resultUrl'], site['absoluteUrl'], site['pageTitle'], site['pageBody'])
            allSiteObjs.append(siteObj)
        return allSiteObjs

    #########
    # Builds a list of Topic objects from the rows of the topics table
    #########
    def getTopics(self):
        global conn
        global cur
        cur.execute("SELECT * FROM topics")
        topicsData = cur.fetchall()
        allTopicObjs = []
        for topic in topicsData:
            topicObj = Topic(topic['id'], topic['name'])
            allTopicObjs.append(topicObj)
        return allTopicObjs

    ################
    # Utility function used to get a Beautiful Soup object
    # from a given URL. Returns None if the request fails
    ##############
    def getPage(self, url):
        print("Retrieving URL:\n"+url)
        session = requests.Session()
        headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36", "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}
        try:
            req = session.get(url, headers=headers)
        except requests.exceptions.RequestException:
            return None
        bsObj = BeautifulSoup(req.text, "lxml")
        return bsObj

    ################
    # Utility function used to get a string from a Beautiful Soup
    # object and a selector. Returns an empty string if no object
    # is found for the given selector
    ##############
    def safeGet(self, pageObj, selector):
        childObj = pageObj.select(selector)
        if childObj is not None and len(childObj) > 0:
            return childObj[0].get_text()
        return ""

    ################
    # Searches a given website for a given topic and records all
    # pages found
    ##############
    def search(self, topic, site):
        print(site.searchUrl+topic.name)
        bsObj = self.getPage(site.searchUrl+topic.name)
        if bsObj is None:
            print("Could not retrieve the search results page. Skipping")
            return
        searchResults = bsObj.select(site.resultListing)
        for result in searchResults:
            url = result.select(site.resultUrl)[0].attrs["href"]
            #Check to see whether it's a relative or an absolute URL
            if site.absoluteUrl == "true":
                pageObj = self.getPage(url)
            else:
                pageObj = self.getPage(site.url+url)

            if pageObj is None:
                print("Something is wrong with that page or URL. Skipping")
            else:
                title = self.safeGet(pageObj, site.pageTitle)
                print("Title is "+title)
                body = self.safeGet(pageObj, site.pageBody)
                if title != "" and body != "":
                    self.storeContent(topic, site, title, body, url)


#####################################################
##### "User" code, outside the scraper class ########
#####################################################

crawler = Crawler()
crawler.openCon()
#Build the lists of websites to search and topics to search for,
#from the sites and topics tables in MySQL
sites = crawler.getSites()
topics = crawler.getTopics()

for topic in topics:
    print("GETTING INFO ABOUT: "+topic.name)
    for targetSite in sites:
        print("FROM SITE: "+targetSite.name)
        crawler.search(topic, targetSite)

crawler.closeCon()
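
#########
# One-time setup sketch (an illustrative addition, not part of the original
# script): the articleCrawler schema is never defined in this file, so the
# layout below is inferred from the queries above. Table and column names
# come straight from the SQL strings; the column types and sizes are
# assumptions (title is truncated to 999 characters and body to 9999 in
# storeContent, hence VARCHAR(1000) and VARCHAR(10000)). It is defined at
# the bottom for reference only; in practice you would call createSchema()
# once before the crawler code above runs.
#########
def createSchema():
    setupConn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='root', charset='utf8')
    setupCur = setupConn.cursor()
    setupCur.execute("CREATE DATABASE IF NOT EXISTS articleCrawler")
    setupCur.execute("USE articleCrawler")
    #Sites to search: one row per site, holding its URLs and CSS selectors
    setupCur.execute("""CREATE TABLE IF NOT EXISTS sites (
        id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(200),
        url VARCHAR(500),
        searchUrl VARCHAR(500),
        resultListing VARCHAR(200),
        resultUrl VARCHAR(200),
        absoluteUrl VARCHAR(10),
        pageTitle VARCHAR(200),
        pageBody VARCHAR(200))""")
    #Topics to search for
    setupCur.execute("""CREATE TABLE IF NOT EXISTS topics (
        id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(200))""")
    #Articles found, keyed back to their topic and site
    setupCur.execute("""CREATE TABLE IF NOT EXISTS content (
        id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
        topicId INT,
        siteId INT,
        title VARCHAR(1000),
        body VARCHAR(10000),
        url VARCHAR(1000))""")
    setupConn.commit()
    setupConn.close()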
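
#########
# For reference: Website and Topic live in website.py and topic.py alongside
# this script and are not shown here. A minimal sketch of what they could
# look like, inferred only from the constructor calls in getSites() and
# getTopics() and the attribute accesses in search() (the actual files may
# differ):
#
# class Website:
#     def __init__(self, id, name, url, searchUrl, resultListing, resultUrl,
#                  absoluteUrl, pageTitle, pageBody):
#         self.id = id
#         self.name = name
#         self.url = url
#         self.searchUrl = searchUrl
#         self.resultListing = resultListing
#         self.resultUrl = resultUrl
#         self.absoluteUrl = absoluteUrl
#         self.pageTitle = pageTitle
#         self.pageBody = pageBody
#
# class Topic:
#     def __init__(self, id, name):
#         self.id = id
#         self.name = name
#########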