Python scrapy.utils.request.request_fingerprint() Examples

The following are 30 code examples of scrapy.utils.request.request_fingerprint(), drawn from open-source projects; the project, source file, and license are noted above each example. You may also want to check out the other functions and classes available in the scrapy.utils.request module.
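Before the examples, a minimal usage sketch: request_fingerprint() takes a scrapy.http.Request and returns a 40-character hexadecimal SHA1 digest computed from the request's method, canonicalized URL, and body; headers are ignored unless passed via include_headers. (Later Scrapy releases deprecated this function in favor of request fingerprinter components, but it is what all of the projects below use.)

from scrapy.http import Request
from scrapy.utils.request import request_fingerprint

req = Request('http://example.com/page?a=1&b=2')
fp = request_fingerprint(req)
# fp is a 40-character hex string, stable across runs for the same request
print(fp)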
Example #1
Source File: dupefilter.py    From sozlukcrawler with GNU General Public License v2.0
def request_seen(self, request):
        is_seen = is_request_seen(request)

        if not is_seen:
            log.msg('New URL: %s. Adding it to seen database' % request.url, log.DEBUG)
            seen = Seen(fingerprint=request_fingerprint(request),
                        url=request.url,
                        last_crawl_time=datetime.now())
            try:
                session.add(seen)
                session.commit()
            except:
                session.rollback()
                raise
            finally:
                session.close()
        else:
            log.msg('[seen] "%s" is seen. Skipping.' % request.url, log.INFO)

        return is_seen 
Example #2
Source File: impl.py    From scrapy-qiniu with Apache License 2.0
def _extract_key_info(self, request):
        """
        From the request for the resource to be downloaded, derive the bucket and key used when uploading it to Qiniu
        """
        from scrapy.utils.request import request_fingerprint

        key_generator = request.meta.get('qiniu_key_generator')
        if key_generator:
            tmp = key_generator(request.url)
            bucket = tmp['bucket'] or self.bucket
            key = tmp['key']
        else:
            bucket = self.bucket
            key = '%s%s' % (self.key_prefix, request_fingerprint(request))

        return {'bucket': bucket, 'key': key} 
Example #3
Source File: dupefilter.py    From learn_python3_spider with MIT License
def request_seen(self, request):
        """Returns True if request was already seen.

        Parameters
        ----------
        request : scrapy.http.Request

        Returns
        -------
        bool

        """
        fp = self.request_fingerprint(request)
        # This returns the number of values added, zero if already exists.
        added = self.server.sadd(self.key, fp)
        return added == 0 
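The dupefilter works because Redis SADD reports how many members were actually added: 1 for a new fingerprint, 0 for one already in the set. A quick illustration, assuming a local Redis server and the redis-py client:

import redis  # assumes redis-py installed and a server on localhost:6379

r = redis.StrictRedis()
print(r.sadd('dupefilter:demo', 'somefingerprint'))  # 1 -> newly added
print(r.sadd('dupefilter:demo', 'somefingerprint'))  # 0 -> already seen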
Example #4
Source File: httpcache.py    From learn_python3_spider with MIT License
def _request_key(self, request):
        return to_bytes(request_fingerprint(request)) 
Example #5
Source File: dupefilter.py    From openslack-crawler with Apache License 2.0
def request_seen(self, request):
        fp = request_fingerprint(request)

        added = self.server.basic_publish(
            exchange='',
            routing_key=self.key,
            body=fp
        )

        return not added 
Example #6
Source File: dupefilter.py    From openslack-crawler with Apache License 2.0
def request_seen(self, request):
        fp = request_fingerprint(request)
        # added = self.server.sadd(self.key + ":" + c_id, fp)
        # self.server.expire(self.key + ":" + c_id, self.timeout)
        added = self.server.sadd(self.key, fp)
        return not added 
Example #7
Source File: cache.py    From PyFeeds with GNU Affero General Public License v3.0
def _get_request_path(self, spider, request):
        key = request_fingerprint(request, include_headers=["Cookie"])
        return os.path.join(self.cachedir, spider.name, key[0:2], key) 
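By default the fingerprint ignores headers, so two requests for the same URL with different cookies would collide in the cache; include_headers=["Cookie"] folds the Cookie header into the hash. A small demonstration of the effect (illustrative URLs and cookie values):

from scrapy.http import Request
from scrapy.utils.request import request_fingerprint

plain = Request('http://example.com/')
with_cookie = Request('http://example.com/', headers={'Cookie': 'session=abc'})

# Equal when headers are excluded (the default)...
assert request_fingerprint(plain) == request_fingerprint(with_cookie)
# ...distinct once the Cookie header is hashed in.
assert (request_fingerprint(plain, include_headers=['Cookie'])
        != request_fingerprint(with_cookie, include_headers=['Cookie']))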
Example #8
Source File: spidermiddlewares.py    From PyFeeds with GNU Affero General Public License v3.0
def request_scheduled(self, request, spider):
        try:
            request.meta["fingerprints"] = copy(request.meta["fingerprints"])
        except KeyError:
            request.meta["fingerprints"] = []
        logger.debug(
            "Parent fingerprints for request {}: {}".format(
                request, request.meta["fingerprints"]
            )
        )
        if not request.meta.get("dont_cache", False):
            fingerprint = request_fingerprint(request, include_headers=["Cookie"])
            request.meta["fingerprints"].append(fingerprint)
        else:
            logger.debug("Skipping fingerprinting uncached request {}".format(request)) 
Example #9
Source File: textspider.py    From ARGUS with GNU General Public License v3.0
def parse(self, response):

        #initialize the collector item which stores the website's content and metadata
        loader = ItemLoader(item=Collector())
        loader.add_value("dl_slot", response.request.meta.get('download_slot'))
        loader.add_value("redirect", self.checkRedirectDomain(response))
        loader.add_value("start_page", response.url)
        loader.add_value("start_domain", self.subdomainGetter(response))  
        loader.add_value("scraped_urls", [response.urljoin(response.url)])
        loader.add_value("scrape_counter", 1)
        loader.add_value("scraped_text", [self.extractText(response)])
        loader.add_value("error", "None")
        loader.add_value("ID", response.request.meta["ID"])

        #initialize the fingerprints set which stores all fingerprints of visited websites
        fingerprints = set()
        #add the fingerprint of the start_page
        fingerprints.add(request_fingerprint(response.request))
        
        #if there was an initial redirect, the new domain is added to the allowed domains
        domain = self.subdomainGetter(response)
        if domain not in self.allowed_domains:
            self.allowed_domains.append(domain)
            self.refreshAllowedDomains()

        #extract all urls from the page...
        urls = response.xpath("//a/@href").extract() + response.xpath("//frame/@src").extract() + response.xpath("//frameset/@src").extract()
        #...and save them to a urlstack
        urlstack = [response.urljoin(url) for url in urls]   
            
        #attach the urlstack, the loader, and the fingerprints to the response...        
        response.meta["urlstack"] = urlstack
        response.meta["loader"] = loader
        response.meta["fingerprints"] = fingerprints
        #...and send it over to the processURLstack function
        return self.processURLstack(response)
    
    
##################################################################
# PROCESS URL STACK
################################################################## 
Example #10
Source File: scrapy_pagestorage.py    From scrapy-pagestorage with BSD 3-Clause "New" or "Revised" License
def process_spider_output(self, response, result, spider):
        fp = request_fingerprint(response.request)
        try:
            for r in result:
                if isinstance(r, DictItem):
                    r.fields["_cached_page_id"] = Field()
                    r._values["_cached_page_id"] = fp
                elif isinstance(r, dict):
                    r["_cached_page_id"] = fp
                yield r
        except Exception as exc:
            self.process_spider_exception(response, exc, spider)
            raise 
Example #11
Source File: scrapy_pagestorage.py    From scrapy-pagestorage with BSD 3-Clause "New" or "Revised" License
def save_response(self, response, spider):
        if isinstance(response, TextResponse):
            fp = request_fingerprint(response.request)
            payload = {
                "_key": fp,
                "_jobid": self.hsref.job.key,
                "_type": "_pageitem",
                "_encoding": response.encoding,
                "url": response.url,
            }
            self._set_cookies(payload, response)

            if response.request.method == 'POST':
                payload["postdata"] = dict(parse_qsl(response.request.body.decode()))

            payload["body"] = response.body_as_unicode()
            if self.trim_html:
                payload['body'] = payload['body'].strip(' \r\n\0')

            if len(payload['body']) > self._writer.maxitemsize:
                spider.logger.warning("Page not saved, body too large: <%s>" %
                                      response.url)
                return

            try:
                self._writer.write(payload)
            except ValueTooLarge as exc:
                spider.logger.warning("Page not saved, %s: <%s>" %
                                      (exc, response.url)) 
Example #12
Source File: dupefilter.py    From scrapy-rabbitmq with MIT License
def request_seen(self, request):
        fp = request_fingerprint(request)

        added = self.server.basic_publish(
            exchange='',
            routing_key=self.key,
            body=fp
        )

        return not added 
Example #13
Source File: media.py    From learn_python3_spider with MIT License
def _process_request(self, request, info):
        fp = request_fingerprint(request)
        cb = request.callback or (lambda _: _)
        eb = request.errback
        request.callback = None
        request.errback = None

        # Return cached result if request was already seen
        if fp in info.downloaded:
            return defer_result(info.downloaded[fp]).addCallbacks(cb, eb)

        # Otherwise, wait for result
        wad = Deferred().addCallbacks(cb, eb)
        info.waiting[fp].append(wad)

        # Check if request is downloading right now to avoid doing it twice
        if fp in info.downloading:
            return wad

        # Download request checking media_to_download hook output first
        info.downloading.add(fp)
        dfd = mustbe_deferred(self.media_to_download, request, info)
        dfd.addCallback(self._check_media_to_download, request, info)
        dfd.addBoth(self._cache_result_and_execute_waiters, fp, info)
        dfd.addErrback(lambda f: logger.error(
            f.value, exc_info=failure_to_exc_info(f), extra={'spider': info.spider})
        )
        return dfd.addBoth(lambda _: wad)  # it must return wad in the end
Example #14
Source File: dupefilters.py    From learn_python3_spider with MIT License
def request_fingerprint(self, request):
        return request_fingerprint(request) 
Example #15
Source File: mongodb.py    From invana-bot with MIT License
def _request_key(self, request):
        return to_bytes(request_fingerprint(request)) 
Example #16
Source File: httpcache.py    From learn_python3_spider with MIT License
def _get_request_path(self, spider, request):
        key = request_fingerprint(request)
        return os.path.join(self.cachedir, spider.name, key[0:2], key) 
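The key[0:2] path component, also used by the cache in Example #7, is a common fan-out trick: the first two hex characters of the fingerprint spread cache entries across up to 256 subdirectories, so no single directory accumulates an unwieldy number of files. A sketch of the resulting layout, with hypothetical cachedir and spider names:

import os

from scrapy.http import Request
from scrapy.utils.request import request_fingerprint

key = request_fingerprint(Request('http://example.com/'))
# e.g. /tmp/httpcache/example_spider/<2-char shard>/<full fingerprint>
# (hypothetical cachedir and spider names)
print(os.path.join('/tmp/httpcache', 'example_spider', key[0:2], key))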
Example #17
Source File: dupefilter.py    From learn_python3_spider with MIT License
def request_fingerprint(self, request):
        """Returns a fingerprint for a given request.

        Parameters
        ----------
        request : scrapy.http.Request

        Returns
        -------
        str

        """
        return request_fingerprint(request) 
Example #18
Source File: elasticsearch.py    From invana-bot with MIT License
def _request_key(self, request):
        return to_bytes(request_fingerprint(request)) 
Example #19
Source File: redis_dupefilter.py    From scrapy-cluster with MIT License
def request_seen(self, request):
        fp = request_fingerprint(request)
        c_id = request.meta['crawlid']

        added = self.server.sadd(self.key + ":" + c_id, fp)
        self.server.expire(self.key + ":" + c_id, self.timeout)

        return not added 
Example #20
Source File: utils.py    From sozlukcrawler with GNU General Public License v2.0
def is_request_seen(request):
    return session.query(exists().where(Seen.fingerprint == request_fingerprint(request))).scalar() 
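The Seen model itself is defined elsewhere in sozlukcrawler; judging from this query and the fields used in Example #1, a minimal compatible declaration would look roughly like the following (a hypothetical sketch, not the project's actual schema):

from datetime import datetime

from sqlalchemy import Column, DateTime, Integer, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class Seen(Base):
    # Hypothetical minimal model; the real one lives in the sozlukcrawler project.
    __tablename__ = 'seen'

    id = Column(Integer, primary_key=True)
    fingerprint = Column(String(40), unique=True, index=True)  # SHA1 hex digest
    url = Column(String(2048))
    last_crawl_time = Column(DateTime, default=datetime.now)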
Example #21
Source File: py_my_scrapy_redis_server.py    From vrequest with MIT License
def request_seen(self, request):
        tid = request._plusmeta.get('taskid')
        if tid:
            fp = self.request_fingerprint(request)
            added = self.server.sadd(self.key.format(tid), fp)
            return added == 0 
Example #22
Source File: py_my_scrapy_redis_server.py    From vrequest with MIT License
def request_fingerprint(self, request):
        return request_fingerprint(request) 
Example #23
Source File: test_finger.py    From news_spider with MIT License
def test_request(self):
        """
        Test request fingerprinting
        :return:
        """
        req_01 = Request(url=self.url_01)
        result_01 = request.request_fingerprint(req_01)

        req_02 = Request(url=self.url_02)
        result_02 = request.request_fingerprint(req_02)

        self.assertEqual(result_01, result_02) 
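The assertion holds because request_fingerprint() canonicalizes the URL before hashing, which among other things sorts query parameters; url_01 and url_02 are presumably the same URL with its parameters reordered. A self-contained equivalent with hypothetical URLs:

from scrapy.http import Request
from scrapy.utils.request import request_fingerprint

# Hypothetical url_01 / url_02: same parameters, different order.
fp_01 = request_fingerprint(Request('http://example.com/q?a=1&b=2'))
fp_02 = request_fingerprint(Request('http://example.com/q?b=2&a=1'))
assert fp_01 == fp_02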
Example #24
Source File: url.py    From news_spider with MIT License
def get_request_finger(url):
    """
    Get the url fingerprint (query parameters may appear in any order)
    :param url:
    :return:
    """
    req = Request(url=url)
    return request.request_fingerprint(req) 
Example #25
Source File: httpcache.py    From learn_python3_spider with MIT License
def _request_key(self, request):
        return request_fingerprint(request) 
Example #26
Source File: httpcache.py    From learn_python3_spider with MIT License
def _get_request_path(self, spider, request):
        key = request_fingerprint(request)
        return os.path.join(self.cachedir, spider.name, key[0:2], key) 
Example #27
Source File: httpcache.py    From learn_python3_spider with MIT License
def _request_key(self, request):
        return to_bytes(request_fingerprint(request)) 
Example #28
Source File: dupefilters.py    From learn_python3_spider with MIT License
def request_seen(self, request):
        fp = self.request_fingerprint(request)
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)
        if self.file:
            self.file.write(fp + os.linesep) 
Example #29
Source File: dupefilters.py    From learn_python3_spider with MIT License
def request_fingerprint(self, request):
        return request_fingerprint(request) 
Example #30
Source File: httpcache.py    From learn_python3_spider with MIT License
def _request_key(self, request):
        return request_fingerprint(request)