scrapy: Call a function when a spider quits
Question:
Is there a way to trigger a method in a Spider class just before it terminates?
I can terminate the spider myself, like this:
from scrapy.spiders import CrawlSpider
from scrapy.exceptions import CloseSpider

class MySpider(CrawlSpider):
    # Config stuff goes here...

    def quit(self):
        # Do some stuff...
        raise CloseSpider('MySpider is quitting now.')

    def my_parser(self, response):
        if termination_condition:
            self.quit()
        # Parsing stuff goes here...
But I can’t find any information on how to determine when the spider is about to quit naturally.
Answers:
It looks like you can register a signal listener through dispatcher.
I would try something like:
from scrapy import signals
from scrapy.spiders import CrawlSpider
from scrapy.xlib.pydispatch import dispatcher

class MySpider(CrawlSpider):

    def __init__(self, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        # second param is the instance of the spider about to be closed.
        pass
In newer versions of Scrapy, scrapy.xlib.pydispatch is deprecated. Instead you can use from pydispatch import dispatcher.
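A minimal sketch of the same pattern using the standalone pydispatch package (this mirrors the answer above; depending on your Scrapy version, the crawler.signals approach shown further down may be preferable):

from scrapy import signals
from scrapy.spiders import CrawlSpider
from pydispatch import dispatcher  # standalone PyDispatcher package

class MySpider(CrawlSpider):
    name = 'myspider'

    def __init__(self, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        # runs once, just before the spider is closed
        spider.logger.info('Spider closed: %s', spider.name)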
For me the accepted answer did not work / is outdated, at least for Scrapy 0.19.
I got it to work with the following though:
from scrapy import signals
from scrapy.signalmanager import SignalManager
from scrapy.spiders import CrawlSpider
from scrapy.xlib.pydispatch import dispatcher

class MySpider(CrawlSpider):

    def __init__(self, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        SignalManager(dispatcher.Any).connect(
            self.closed_handler, signal=signals.spider_closed)

    def closed_handler(self, spider):
        # do stuff here
        pass
Just to update, you can simply define a closed method like this:
class MySpider(CrawlSpider):

    def closed(self, reason):
        do_something()  # placeholder for whatever you need to run at shutdown
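The reason argument is a short string explaining why the spider was closed; Scrapy passes 'finished' for a crawl that completed normally. A minimal sketch that branches on it:

class MySpider(CrawlSpider):
    name = 'myspider'

    def closed(self, reason):
        if reason == 'finished':
            self.logger.info('Crawl completed normally')
        else:
            self.logger.warning('Crawl stopped early: %s', reason)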
For Scrapy version 1.0.0+ (it may also work for older versions).
from scrapy import signals
from scrapy.spiders import CrawlSpider

class MySpider(CrawlSpider):
    name = 'myspider'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(MySpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_opened, signals.spider_opened)
        crawler.signals.connect(spider.spider_closed, signals.spider_closed)
        return spider

    def spider_opened(self, spider):
        print('Opening {} spider'.format(spider.name))

    def spider_closed(self, spider):
        print('Closing {} spider'.format(spider.name))
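To see both handlers fire, you can run the spider from a script; a minimal sketch using CrawlerProcess (assuming MySpider is the class defined above):

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
process.crawl(MySpider)
process.start()  # blocks until the crawl finishes; spider_closed fires just before this returns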
One good use case is to add a tqdm progress bar to a Scrapy spider.
# -*- coding: utf-8 -*-
from scrapy import signals
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from tqdm import tqdm

class MySpider(CrawlSpider):
    name = 'myspider'
    allowed_domains = ['somedomain.comm']
    start_urls = ['http://www.somedomain.comm/ccid.php']

    rules = (
        Rule(LinkExtractor(allow=r'^http://www.somedomain.comm/ccds.php?id=.*'),
             callback='parse_item',
             ),
        Rule(LinkExtractor(allow=r'^http://www.somedomain.comm/ccid.php$',
                           restrict_xpaths='//table/tr[contains(., "SMTH")]'),
             follow=True),
    )

    def parse_item(self, response):
        self.pbar.update()  # update progress bar by 1
        item = MyItem()
        # parse response
        return item

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(MySpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_opened, signals.spider_opened)
        crawler.signals.connect(spider.spider_closed, signals.spider_closed)
        return spider

    def spider_opened(self, spider):
        self.pbar = tqdm()  # initialize progress bar
        self.pbar.clear()
        self.pbar.write('Opening {} spider'.format(spider.name))

    def spider_closed(self, spider):
        self.pbar.clear()
        self.pbar.write('Closing {} spider'.format(spider.name))
        self.pbar.close()  # close progress bar
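Note that spider_opened and spider_closed receive the spider instance as an argument while the progress bar is stored on self; since from_crawler connects bound methods of this same instance, self and spider refer to the same object here, so either works.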
If you have many spiders and want to do something before each of them closes, it may be convenient to add a stats collector to your project.
In settings:
STATS_CLASS = 'scraper.stats.MyStatsCollector'
And the collector:
from scrapy.statscollectors import StatsCollector

class MyStatsCollector(StatsCollector):

    def _persist_stats(self, stats, spider):
        # do something here
        pass
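For example, a minimal sketch of a collector that writes each spider's final stats to a JSON file (the file name is just an illustration):

import json

from scrapy.statscollectors import StatsCollector

class MyStatsCollector(StatsCollector):

    def _persist_stats(self, stats, spider):
        # called once per spider as it closes; dump the final stats dict
        with open('stats-{}.json'.format(spider.name), 'w') as f:
            json.dump(stats, f, default=str, indent=2)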
For the latest version (v1.7), just define a closed(reason) method in your spider class.

closed(reason):
    Called when the spider closes. This method provides a shortcut to
    signals.connect() for the spider_closed signal.