Python scrapy.utils.request.request_fingerprint() Examples

The following are 30 code examples of scrapy.utils.request.request_fingerprint(), drawn from open-source projects; the project, source file, and license are noted above each example. You may also want to check out the other functions and classes available in the scrapy.utils.request module.
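Before the examples, a minimal usage sketch: request_fingerprint() takes a scrapy.http.Request and returns a 40-character hexadecimal SHA1 digest computed from the request's method, canonicalized URL, and body; headers are ignored unless passed via include_headers. (Later Scrapy releases deprecated this function in favor of request fingerprinter components, but it is what all of the projects below use.)

from scrapy.http import Request
from scrapy.utils.request import request_fingerprint

req = Request('http://example.com/page?a=1&b=2')
fp = request_fingerprint(req)
# fp is a 40-character hex string, stable across runs for the same request
print(fp)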
Example #1
Source File: dupefilter.py    From sozlukcrawler with GNU General Public License v2.0
def request_seen(self, request):
        is_seen = is_request_seen(request)

        if not is_seen:
            log.msg('New URL: %s. Adding it to seen database' % request.url, log.DEBUG)
            seen = Seen(fingerprint=request_fingerprint(request),
                        url=request.url,
                        last_crawl_time=datetime.now())
            try:
                session.add(seen)
                session.commit()
            except:
                session.rollback()
                raise
            finally:
                session.close()
        else:
            log.msg('[seen] "%s" is seen. Skipping.' % request.url, log.INFO)

        return is_seen 
Example #2
Source File: impl.py    From scrapy-qiniu with Apache License 2.0
def _extract_key_info(self, request):
        """
        From the request for the resource to be downloaded, derive the bucket and key used when uploading it to Qiniu
        """
        from scrapy.utils.request import request_fingerprint

        key_generator = request.meta.get('qiniu_key_generator')
        if key_generator:
            tmp = key_generator(request.url)
            bucket = tmp['bucket'] or self.bucket
            key = tmp['key']
        else:
            bucket = self.bucket
            key = '%s%s' % (self.key_prefix, request_fingerprint(request))

        return {'bucket': bucket, 'key': key} 
Example #3
Source File: dupefilter.py    From learn_python3_spider with MIT License
def request_seen(self, request):
        """Returns True if request was already seen.

        Parameters
        ----------
        request : scrapy.http.Request

        Returns
        -------
        bool

        """
        fp = self.request_fingerprint(request)
        # This returns the number of values added, zero if already exists.
        added = self.server.sadd(self.key, fp)
        return added == 0 
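The dupefilter works because Redis SADD reports how many members were actually added: 1 for a new fingerprint, 0 for one already in the set. A quick illustration, assuming a local Redis server and the redis-py client:

import redis  # assumes redis-py installed and a server on localhost:6379

r = redis.StrictRedis()
print(r.sadd('dupefilter:demo', 'somefingerprint'))  # 1 -> newly added
print(r.sadd('dupefilter:demo', 'somefingerprint'))  # 0 -> already seen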
Example #4
Source File: httpcache.py    From learn_python3_spider with MIT License
def _request_key(self, request):
        return to_bytes(request_fingerprint(request)) 
Example #5
Source File: dupefilter.py    From openslack-crawler with Apache License 2.0
def request_seen(self, request):
        fp = request_fingerprint(request)

        added = self.server.basic_publish(
            exchange='',
            routing_key=self.key,
            body=fp
        )

        return not added 
Example #6
Source File: dupefilter.py    From openslack-crawler with Apache License 2.0
def request_seen(self, request):
        fp = request_fingerprint(request)
        # added = self.server.sadd(self.key + ":" + c_id, fp)
        # self.server.expire(self.key + ":" + c_id, self.timeout)
        added = self.server.sadd(self.key, fp)
        return not added 
Example #7
Source File: cache.py    From PyFeeds with GNU Affero General Public License v3.0
def _get_request_path(self, spider, request):
        key = request_fingerprint(request, include_headers=["Cookie"])
        return os.path.join(self.cachedir, spider.name, key[0:2], key) 
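By default the fingerprint ignores headers, so two requests for the same URL with different cookies would collide in the cache; include_headers=["Cookie"] folds the Cookie header into the hash. A small demonstration of the effect (illustrative URLs and cookie values):

from scrapy.http import Request
from scrapy.utils.request import request_fingerprint

plain = Request('http://example.com/')
with_cookie = Request('http://example.com/', headers={'Cookie': 'session=abc'})

# Equal when headers are excluded (the default)...
assert request_fingerprint(plain) == request_fingerprint(with_cookie)
# ...distinct once the Cookie header is hashed in.
assert (request_fingerprint(plain, include_headers=['Cookie'])
        != request_fingerprint(with_cookie, include_headers=['Cookie']))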
Example #8
Source File: spidermiddlewares.py    From PyFeeds with GNU Affero General Public License v3.0
def request_scheduled(self, request, spider):
        try:
            request.meta["fingerprints"] = copy(request.meta["fingerprints"])
        except KeyError:
            request.meta["fingerprints"] = []
        logger.debug(
            "Parent fingerprints for request {}: {}".format(
                request, request.meta["fingerprints"]
            )
        )
        if not request.meta.get("dont_cache", False):
            fingerprint = request_fingerprint(request, include_headers=["Cookie"])
            request.meta["fingerprints"].append(fingerprint)
        else:
            logger.debug("Skipping fingerprinting uncached request {}".format(request)) 
Example #9
Source File: textspider.py    From ARGUS with GNU General Public License v3.0
def parse(self, response):

        #initialize the collector item which stores the website's content and metadata
        loader = ItemLoader(item=Collector())
        loader.add_value("dl_slot", response.request.meta.get('download_slot'))
        loader.add_value("redirect", self.checkRedirectDomain(response))
        loader.add_value("start_page", response.url)
        loader.add_value("start_domain", self.subdomainGetter(response))  
        loader.add_value("scraped_urls", [response.urljoin(response.url)])
        loader.add_value("scrape_counter", 1)
        loader.add_value("scraped_text", [self.extractText(response)])
        loader.add_value("error", "None")
        loader.add_value("ID", response.request.meta["ID"])

        #initialize the fingerprints set which stores all fingerprints of visited websites
        fingerprints = set()
        #add the fingerprint of the start_page
        fingerprints.add(request_fingerprint(response.request))
        
        #if there was an initial redirect, the new domain is added to the allowed domains
        domain = self.subdomainGetter(response)
        if domain not in self.allowed_domains:
            self.allowed_domains.append(domain)
            self.refreshAllowedDomains()

        #extract all urls from the page...
        urls = response.xpath("//a/@href").extract() + response.xpath("//frame/@src").extract() + response.xpath("//frameset/@src").extract()
        #...and save them to a urlstack
        urlstack = [response.urljoin(url) for url in urls]   
            
        #attach the urlstack, the loader, and the fingerprints to the response...        
        response.meta["urlstack"] = urlstack
        response.meta["loader"] = loader
        response.meta["fingerprints"] = fingerprints
        #...and send it over to the processURLstack function
        return self.processURLstack(response)
    
    
##################################################################
# PROCESS URL STACK
################################################################## 
Example #10
Source File: scrapy_pagestorage.py    From scrapy-pagestorage with BSD 3-Clause "New" or "Revised" License
def process_spider_output(self, response, result, spider):
        fp = request_fingerprint(response.request)
        try:
            for r in result:
                if isinstance(r, DictItem):
                    r.fields["_cached_page_id"] = Field()
                    r._values["_cached_page_id"] = fp
                elif isinstance(r, dict):
                    r["_cached_page_id"] = fp
                yield r
        except Exception as exc:
            self.process_spider_exception(response, exc, spider)
            raise 
Example #11
Source File: scrapy_pagestorage.py    From scrapy-pagestorage with BSD 3-Clause "New" or "Revised" License
def save_response(self, response, spider):
        if isinstance(response, TextResponse):
            fp = request_fingerprint(response.request)
            payload = {
                "_key": fp,
                "_jobid": self.hsref.job.key,
                "_type": "_pageitem",
                "_encoding": response.encoding,
                "url": response.url,
            }
            self._set_cookies(payload, response)

            if response.request.method == 'POST':
                payload["postdata"] = dict(parse_qsl(response.request.body.decode()))

            payload["body"] = response.body_as_unicode()
            if self.trim_html:
                payload['body'] = payload['body'].strip(' \r\n\0')

            if len(payload['body']) > self._writer.maxitemsize:
                spider.logger.warning("Page not saved, body too large: <%s>" %
                                      response.url)
                return

            try:
                self._writer.write(payload)
            except ValueTooLarge as exc:
                spider.logger.warning("Page not saved, %s: <%s>" %
                                      (exc, response.url)) 
Example #12
Source File: dupefilter.py    From scrapy-rabbitmq with MIT License
def request_seen(self, request):
        fp = request_fingerprint(request)

        added = self.server.basic_publish(
            exchange='',
            routing_key=self.key,
            body=fp
        )

        return not added 
Example #13
Source File: media.py    From learn_python3_spider with MIT License
def _process_request(self, request, info):
        fp = request_fingerprint(request)
        cb = request.callback or (lambda _: _)
        eb = request.errback
        request.callback = None
        request.errback = None

        # Return cached result if request was already seen
        if fp in info.downloaded:
            return defer_result(info.downloaded[fp]).addCallbacks(cb, eb)

        # Otherwise, wait for result
        wad = Deferred().addCallbacks(cb, eb)
        info.waiting[fp].append(wad)

        # Check if request is downloading right now to avoid doing it twice
        if fp in info.downloading:
            return wad

        # Download request checking media_to_download hook output first
        info.downloading.add(fp)
        dfd = mustbe_deferred(self.media_to_download, request, info)
        dfd.addCallback(self._check_media_to_download, request, info)
        dfd.addBoth(self._cache_result_and_execute_waiters, fp, info)
        dfd.addErrback(lambda f: logger.error(
            f.value, exc_info=failure_to_exc_info(f), extra={'spider': info.spider})
        )
        return dfd.addBoth(lambda _: wad)  # it must return wad in the end
Example #14
Source File: dupefilters.py    From learn_python3_spider with MIT License
def request_fingerprint(self, request):
        return request_fingerprint(request) 
Example #15
Source File: mongodb.py    From invana-bot with MIT License
def _request_key(self, request):
        return to_bytes(request_fingerprint(request)) 
Example #16
Source File: httpcache.py    From learn_python3_spider with MIT License
def _get_request_path(self, spider, request):
        key = request_fingerprint(request)
        return os.path.join(self.cachedir, spider.name, key[0:2], key) 
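The key[0:2] path component, also used by the cache in Example #7, is a common fan-out trick: the first two hex characters of the fingerprint spread cache entries across up to 256 subdirectories, so no single directory accumulates an unwieldy number of files. A sketch of the resulting layout, with hypothetical cachedir and spider names:

import os

from scrapy.http import Request
from scrapy.utils.request import request_fingerprint

key = request_fingerprint(Request('http://example.com/'))
# e.g. /tmp/httpcache/example_spider/<2-char shard>/<full fingerprint>
# (hypothetical cachedir and spider names)
print(os.path.join('/tmp/httpcache', 'example_spider', key[0:2], key))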
Example #17
Source File: dupefilter.py    From learn_python3_spider with MIT License
def request_fingerprint(self, request):
        """Returns a fingerprint for a given request.

        Parameters
        ----------
        request : scrapy.http.Request

        Returns
        -------
        str

        """
        return request_fingerprint(request) 
Example #18
Source File: elasticsearch.py    From invana-bot with MIT License
def _request_key(self, request):
        return to_bytes(request_fingerprint(request)) 
Example #19
Source File: redis_dupefilter.py    From scrapy-cluster with MIT License
def request_seen(self, request):
        fp = request_fingerprint(request)
        c_id = request.meta['crawlid']

        added = self.server.sadd(self.key + ":" + c_id, fp)
        self.server.expire(self.key + ":" + c_id, self.timeout)

        return not added 
Example #20
Source File: utils.py    From sozlukcrawler with GNU General Public License v2.0
def is_request_seen(request):
    return session.query(exists().where(Seen.fingerprint == request_fingerprint(request))).scalar() 
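The Seen model itself is defined elsewhere in sozlukcrawler; judging from this query and the fields used in Example #1, a minimal compatible declaration would look roughly like the following (a hypothetical sketch, not the project's actual schema):

from datetime import datetime

from sqlalchemy import Column, DateTime, Integer, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class Seen(Base):
    # Hypothetical minimal model; the real one lives in the sozlukcrawler project.
    __tablename__ = 'seen'

    id = Column(Integer, primary_key=True)
    fingerprint = Column(String(40), unique=True, index=True)  # SHA1 hex digest
    url = Column(String(2048))
    last_crawl_time = Column(DateTime, default=datetime.now)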
Example #21
Source File: py_my_scrapy_redis_server.py    From vrequest with MIT License
def request_seen(self, request):
        tid = request._plusmeta.get('taskid')
        if tid:
            fp = self.request_fingerprint(request)
            added = self.server.sadd(self.key.format(tid), fp)
            return added == 0 
Example #22
Source File: py_my_scrapy_redis_server.py    From vrequest with MIT License
def request_fingerprint(self, request):
        return request_fingerprint(request) 
Example #23
Source File: test_finger.py    From news_spider with MIT License
def test_request(self):
        """
        Test request fingerprinting
        :return:
        """
        req_01 = Request(url=self.url_01)
        result_01 = request.request_fingerprint(req_01)

        req_02 = Request(url=self.url_02)
        result_02 = request.request_fingerprint(req_02)

        self.assertEqual(result_01, result_02) 
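The assertion holds because request_fingerprint() canonicalizes the URL before hashing, which among other things sorts query parameters; url_01 and url_02 are presumably the same URL with its parameters reordered. A self-contained equivalent with hypothetical URLs:

from scrapy.http import Request
from scrapy.utils.request import request_fingerprint

# Hypothetical url_01 / url_02: same parameters, different order.
fp_01 = request_fingerprint(Request('http://example.com/q?a=1&b=2'))
fp_02 = request_fingerprint(Request('http://example.com/q?b=2&a=1'))
assert fp_01 == fp_02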
Example #24
Source File: url.py    From news_spider with MIT License
def get_request_finger(url):
    """
    Get the url fingerprint (query parameters may appear in any order)
    :param url:
    :return:
    """
    req = Request(url=url)
    return request.request_fingerprint(req) 
Example #25
Source File: httpcache.py    From learn_python3_spider with MIT License
def _request_key(self, request):
        return request_fingerprint(request) 
Example #26
Source File: httpcache.py    From learn_python3_spider with MIT License
def _get_request_path(self, spider, request):
        key = request_fingerprint(request)
        return os.path.join(self.cachedir, spider.name, key[0:2], key) 
Example #27
Source File: httpcache.py    From learn_python3_spider with MIT License
def _request_key(self, request):
        return to_bytes(request_fingerprint(request)) 
Example #28
Source File: dupefilters.py    From learn_python3_spider with MIT License
def request_seen(self, request):
        fp = self.request_fingerprint(request)
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)
        if self.file:
            self.file.write(fp + os.linesep) 
Example #29
Source File: dupefilters.py    From learn_python3_spider with MIT License
def request_fingerprint(self, request):
        return request_fingerprint(request) 
Example #30
Source File: httpcache.py    From learn_python3_spider with MIT License
def _request_key(self, request):
        return request_fingerprint(request)