Path: blob/master/03_alternative_data/01_opentable/opentable/middlewares.py
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
from random import choice
from scrapy.exceptions import NotConfigured


class OpentableSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class OpentableDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class RotateUserAgentMiddleware(object):
    """Rotate user-agent for each request."""

    def __init__(self, user_agents):
        self.enabled = False
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        # Read the user-agent pool from settings; if it is missing or empty,
        # raising NotConfigured disables this middleware.
        user_agents = crawler.settings.get('USER_AGENT_CHOICES', [])

        if not user_agents:
            raise NotConfigured("USER_AGENT_CHOICES not set or empty")

        o = cls(user_agents)
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)

        return o

    def spider_opened(self, spider):
        # Rotation is opt-in: a spider enables it by setting
        # rotate_user_agent = True as an attribute.
        self.enabled = getattr(spider, 'rotate_user_agent', self.enabled)

    def process_request(self, request, spider):
        if not self.enabled or not self.user_agents:
            return

        # Overwrite the User-Agent header with a random choice from the pool.
        request.headers['user-agent'] = choice(self.user_agents)
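
For reference, a minimal sketch of how RotateUserAgentMiddleware is typically wired up. The fragment below would live in the project's settings.py (not part of this file); the middleware path assumes the Scrapy package is named opentable, as the file path above suggests, and the user-agent strings are illustrative placeholders rather than values taken from the project.

# Hypothetical settings.py fragment -- not part of middlewares.py.
USER_AGENT_CHOICES = [
    # Placeholder user-agent strings; substitute a real pool of browser UAs.
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15',
]

DOWNLOADER_MIDDLEWARES = {
    # Path assumes the project package is named 'opentable' (see file path above).
    'opentable.middlewares.RotateUserAgentMiddleware': 400,
}

Because spider_opened() checks a rotate_user_agent attribute, a spider opts in by setting rotate_user_agent = True on its class; otherwise the middleware leaves requests untouched.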