Python scrapy.spiders Examples

The following are 4 code examples of the scrapy.spiders module. You may also want to check out all available functions/classes of the module scrapy, or try the search function.
Example #1
Source File: url.py    From hoaxy-backend with GNU General Public License v3.0
def start_requests(self):
        """This function generates the initial request of ArchiveSpider.

        See 'http://doc.scrapy.org/en/latest/topics/spiders.html#\
        scrapy.spiders.Spider.start_requests'.

        The most important part of the function is to set a request meta,
        'archive_meta', according to the site's 'archive_rules'. The meta
        is used to parse article URLs from the response and to generate
        the next request.
        """
        for page in self.page_templates:
            url = page.format(p_num=self.p_kw['start'])
            meta = dict(archive_meta=dict(
                last_urls=dict(),
                p_num=self.p_kw['start'],
                next_tries=0,
                max_next_tries=self.p_kw['max_next_tries'],
                page=page))
            logger.debug('Page format meta info:\n%s', pprint.pformat(meta))
            yield scrapy.Request(url, callback=self.parse, meta=meta) 
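For context, here is a minimal, self-contained sketch of how the 'archive_meta' set in start_requests might be consumed by the parse callback to build the next archive-page request. The spider name, page template, and the pagination policy in parse are illustrative assumptions, not code from hoaxy-backend:

import scrapy

class ArchiveSketchSpider(scrapy.Spider):
    # Hypothetical spider illustrating the archive_meta round trip.
    name = 'archive_sketch'
    page_templates = ['http://example.com/archive?page={p_num}']
    p_kw = {'start': 1, 'max_next_tries': 3}

    def start_requests(self):
        for page in self.page_templates:
            url = page.format(p_num=self.p_kw['start'])
            meta = dict(archive_meta=dict(
                last_urls=dict(),
                p_num=self.p_kw['start'],
                next_tries=0,
                max_next_tries=self.p_kw['max_next_tries'],
                page=page))
            yield scrapy.Request(url, callback=self.parse, meta=meta)

    def parse(self, response):
        ameta = response.meta['archive_meta']
        # ... extract article URLs from the response here ...
        # Re-use the stored page template to request the next page,
        # stopping once max_next_tries is exceeded (assumed policy).
        ameta['p_num'] += 1
        ameta['next_tries'] += 1
        if ameta['next_tries'] <= ameta['max_next_tries']:
            next_url = ameta['page'].format(p_num=ameta['p_num'])
            yield scrapy.Request(next_url, callback=self.parse,
                                 meta=dict(archive_meta=ameta))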
Example #2
Source File: xml.py    From invana-bot with MIT License
def is_this_request_from_same_traversal(response, traversal):
        """
        This means the current request came from the same traversal, so
        we can apply the max-pages condition to it; otherwise, for
        different traversals of different spiders, adding max_page does
        not make sense.
        """
        traversal_id = traversal['traversal_id']
        current_request_traversal_id = response.meta.get('current_request_traversal_id', None)
        return current_request_traversal_id == traversal_id 
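As a rough sketch, the check above only works if outgoing requests are tagged with their traversal id in the request meta. The helper below is hypothetical (it is not part of invana-bot) and just shows one way the 'current_request_traversal_id' key might be set and later consumed:

import scrapy

def make_traversal_request(url, traversal, callback):
    # Hypothetical helper: tag the request with the id of the traversal
    # that produced it, so the response can be matched back to it.
    return scrapy.Request(
        url,
        callback=callback,
        meta={'current_request_traversal_id': traversal['traversal_id']})

# In a callback, the tag lets us enforce a per-traversal page limit:
#     if is_this_request_from_same_traversal(response, traversal):
#         pages_crawled += 1  # count only pages of this traversal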
Example #3
Source File: base.py    From invana-bot with MIT License
def is_this_request_from_same_traversal(response, traversal):
        """
        This means the current request came from the same traversal, so
        we can apply the max-pages condition to it; otherwise, for
        different traversals of different spiders, adding max_page does
        not make sense.
        """
        traversal_id = traversal['traversal_id']
        current_request_traversal_id = response.meta.get('current_request_traversal_id', None)
        return current_request_traversal_id == traversal_id 
Example #4
Source File: url.py    From hoaxy-backend with GNU General Public License v3.0
def __init__(self, domains, urls, *args, **kwargs):
        """Constructor for FeedSpider.

        Parameters
        ----------
        domains : list
            A list of domains for the site.
        urls : list
            A list of feed URLs of the site.
        provider : string
            The provider of the RSS feed.
        url_regex : string
            URL pattern regular expression.

        If you use this spider to store items into a database, additional
        keyword arguments are required:

        platform_id : int
            The id of a platform instance.
        session : object
            An SQLAlchemy session instance.

        Other keywords are used to specify how to parse the XML, see
        http://doc.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders\
        .XMLFeedSpider.
        """
        self.platform_id = kwargs.pop('platform_id', None)
        self.session = kwargs.pop('session', None)
        self.url_regex = kwargs.pop('url_regex', None)
        self.provider = kwargs.pop('provider', 'self')
        self.iterator = kwargs.pop('iterator', 'iternodes')
        self.itertag = kwargs.pop('itertag', 'item')
        self.allowed_domains = domains
        self.start_urls = urls
        super(FeedSpider, self).__init__(*args, **kwargs)
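A minimal sketch of how this constructor might be invoked. The import path for FeedSpider, the feed URLs, and the database setup are illustrative assumptions, not taken from hoaxy-backend:

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# Hypothetical setup: an SQLAlchemy session for storing parsed items.
engine = create_engine('sqlite:///:memory:')
session = sessionmaker(bind=engine)()

# Assuming FeedSpider is importable from the project (path not shown):
spider = FeedSpider(
    domains=['example.com'],
    urls=['http://example.com/feed.rss'],
    provider='example-provider',
    url_regex=r'https?://example\.com/\d+',
    platform_id=1,       # required when storing items into the database
    session=session,
    iterator='iternodes',
    itertag='item')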