Path: blob/master/03_alternative_data/01_opentable/opentable/extensions.py
2929 views
"""Scrapy extensions that periodically log crawl progress.

``MonitorDownloadsExtension`` logs the number of in-flight downloads and
``DumpStatsExtension`` dumps the full Scrapy stats dict at a configurable
interval, instead of only once at the end of the crawl.

Both are driven by ``_LoopingExtension``, a mixin that ties a Twisted
``LoopingCall`` to the spider's open/close signals.
"""
import logging
import pprint

from twisted.internet.task import LoopingCall
from scrapy import signals
from scrapy.exceptions import NotConfigured

logger = logging.getLogger(__name__)


class _LoopingExtension:
    """Mixin that runs a task repeatedly while the spider is open."""

    def setup_looping_task(self, task, crawler, interval):
        """Schedule *task* to run every *interval* seconds during the crawl.

        The task is started on ``spider_opened`` and stopped on
        ``spider_closed``.

        :param task: zero-argument callable invoked on each tick
        :param crawler: the ``Crawler`` whose signal manager we hook into
        :param interval: period in seconds between invocations
        """
        self._interval = interval
        self._task = LoopingCall(task)
        crawler.signals.connect(self.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self):
        # now=False: wait one full interval before the first invocation
        # rather than firing immediately at spider start.
        self._task.start(self._interval, now=False)

    def spider_closed(self):
        # Guard: the task may never have started (e.g. spider failed
        # before spider_opened fired), and stop() on a non-running
        # LoopingCall raises.
        if self._task.running:
            self._task.stop()


class MonitorDownloadsExtension(_LoopingExtension):
    """
    Enable this extension to periodically log a number of active downloads.
    """

    def __init__(self, crawler, interval):
        self.crawler = crawler
        self.setup_looping_task(self.monitor, crawler, interval)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the extension from crawler settings.

        Reads ``MONITOR_DOWNLOADS_INTERVAL`` (seconds, default 10.0).

        :raises NotConfigured: if the interval is 0 or negative, which
            disables the extension (resolves the original
            "fixme: 0 should mean NotConfigured").
        """
        interval = crawler.settings.getfloat("MONITOR_DOWNLOADS_INTERVAL", 10.0)
        if interval <= 0:
            raise NotConfigured
        return cls(crawler, interval)

    def monitor(self):
        """Log how many downloads the engine currently has in flight."""
        active_downloads = len(self.crawler.engine.downloader.active)
        # Lazy %-args: the message is only formatted if INFO is enabled.
        logger.info("Active downloads: %s", active_downloads)


class DumpStatsExtension(_LoopingExtension):
    """
    Enable this extension to log Scrapy stats periodically, not only
    at the end of the crawl.
    """

    def __init__(self, crawler, interval):
        self.stats = crawler.stats
        self.setup_looping_task(self.print_stats, crawler, interval)

    def print_stats(self):
        """Pretty-print the current stats collector contents at INFO level."""
        stats = self.stats.get_stats()
        logger.info("Scrapy stats:\n%s", pprint.pformat(stats))

    @classmethod
    def from_crawler(cls, crawler):
        """Build the extension from crawler settings.

        Reads ``DUMP_STATS_INTERVAL`` (seconds, default 60.0).

        :raises NotConfigured: if the interval is 0 or negative, which
            disables the extension (resolves the original
            "fixme: 0 should mean NotConfigured").
        """
        interval = crawler.settings.getfloat("DUMP_STATS_INTERVAL", 60.0)
        if interval <= 0:
            raise NotConfigured
        return cls(crawler, interval)