Python scrapy.utils.request.request_fingerprint() Examples
The following are 30 code examples of scrapy.utils.request.request_fingerprint().
You may also want to check out all available functions and classes of the module scrapy.utils.request.
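Before the examples, a minimal sketch of the call itself (the URL is illustrative): the helper takes a scrapy.http.Request and returns a 40-character hexadecimal SHA1 digest identifying that request. Note that newer Scrapy releases deprecate this helper in favour of pluggable request fingerprinter components; the examples below all target the classic function.

    from scrapy import Request
    from scrapy.utils.request import request_fingerprint

    # Illustrative sketch: fingerprint a simple GET request.
    req = Request(url="https://example.com/page?a=1")
    fp = request_fingerprint(req)   # 40-character hex SHA1 digest
    print(type(fp), len(fp))        # <class 'str'> 40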
Example #1
Source File: dupefilter.py From sozlukcrawler with GNU General Public License v2.0 | 7 votes |
def request_seen(self, request):
    is_seen = is_request_seen(request)
    if not is_seen:
        log.msg('New URL: %s. Adding it to seen database' % request.url, log.DEBUG)
        seen = Seen(fingerprint=request_fingerprint(request),
                    url=request.url,
                    last_crawl_time=datetime.now())
        try:
            session.add(seen)
            session.commit()
        except:
            session.rollback()
            raise
        finally:
            session.close()
    else:
        log.msg('[seen] "%s" is seen. Skipping.' % request.url, log.INFO)
    return is_seen
Example #2
Source File: impl.py From scrapy-qiniu with Apache License 2.0 | 6 votes |
def _extract_key_info(self, request):
    """
    From the request for the resource to be downloaded, obtain the bucket
    and key used when uploading the resource to Qiniu.
    """
    from scrapy.utils.request import request_fingerprint

    key_generator = request.meta.get('qiniu_key_generator')
    if key_generator:
        tmp = key_generator(request.url)
        bucket = tmp['bucket'] or self.bucket
        key = tmp['key']
    else:
        bucket = self.bucket
        key = '%s%s' % (self.key_prefix, request_fingerprint(request))

    return {'bucket': bucket, 'key': key}
Example #3
Source File: dupefilter.py From learn_python3_spider with MIT License | 6 votes |
def request_seen(self, request):
    """Returns True if request was already seen.

    Parameters
    ----------
    request : scrapy.http.Request

    Returns
    -------
    bool

    """
    fp = self.request_fingerprint(request)
    # This returns the number of values added, zero if already exists.
    added = self.server.sadd(self.key, fp)
    return added == 0
Example #4
Source File: httpcache.py From learn_python3_spider with MIT License | 5 votes |
def _request_key(self, request):
    return to_bytes(request_fingerprint(request))
Example #5
Source File: dupefilter.py From openslack-crawler with Apache License 2.0 | 5 votes |
def request_seen(self, request):
    fp = request_fingerprint(request)
    added = self.server.basic_publish(
        exchange='',
        routing_key=self.key,
        body=fp
    )
    return not added
Example #6
Source File: dupefilter.py From openslack-crawler with Apache License 2.0 | 5 votes |
def request_seen(self, request):
    fp = request_fingerprint(request)
    # added = self.server.sadd(self.key + ":" + c_id, fp)
    # self.server.expire(self.key + ":" + c_id, self.timeout)
    added = self.server.sadd(self.key, fp)
    return not added
Example #7
Source File: cache.py From PyFeeds with GNU Affero General Public License v3.0 | 5 votes |
def _get_request_path(self, spider, request):
    key = request_fingerprint(request, include_headers=["Cookie"])
    return os.path.join(self.cachedir, spider.name, key[0:2], key)
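By default request_fingerprint ignores request headers; passing include_headers (as the cache above does with the Cookie header) folds the listed headers into the hash. A small illustrative sketch, with made-up URLs and cookie values:

    from scrapy import Request
    from scrapy.utils.request import request_fingerprint

    req_a = Request("https://example.com/feed", headers={"Cookie": "session=abc"})
    req_b = Request("https://example.com/feed", headers={"Cookie": "session=xyz"})

    # Headers are ignored by default, so the two fingerprints match ...
    assert request_fingerprint(req_a) == request_fingerprint(req_b)

    # ... but including the Cookie header separates the two requests.
    assert (request_fingerprint(req_a, include_headers=["Cookie"])
            != request_fingerprint(req_b, include_headers=["Cookie"]))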
Example #8
Source File: spidermiddlewares.py From PyFeeds with GNU Affero General Public License v3.0 | 5 votes |
def request_scheduled(self, request, spider):
    try:
        request.meta["fingerprints"] = copy(request.meta["fingerprints"])
    except KeyError:
        request.meta["fingerprints"] = []
    logger.debug(
        "Parent fingerprints for request {}: {}".format(
            request, request.meta["fingerprints"]
        )
    )
    if not request.meta.get("dont_cache", False):
        fingerprint = request_fingerprint(request, include_headers=["Cookie"])
        request.meta["fingerprints"].append(fingerprint)
    else:
        logger.debug("Skipping fingerprinting uncached request {}".format(request))
Example #9
Source File: textspider.py From ARGUS with GNU General Public License v3.0 | 5 votes |
def parse(self, response):
    # initialize the collector item which stores the website's content and meta data
    loader = ItemLoader(item=Collector())
    loader.add_value("dl_slot", response.request.meta.get('download_slot'))
    loader.add_value("redirect", self.checkRedirectDomain(response))
    loader.add_value("start_page", response.url)
    loader.add_value("start_domain", self.subdomainGetter(response))
    loader.add_value("scraped_urls", [response.urljoin(response.url)])
    loader.add_value("scrape_counter", 1)
    loader.add_value("scraped_text", [self.extractText(response)])
    loader.add_value("error", "None")
    loader.add_value("ID", response.request.meta["ID"])

    # initialize the fingerprints set which stores all fingerprints of visited websites
    fingerprints = set()
    # add the fingerprint of the start_page
    fingerprints.add(request_fingerprint(response.request))

    # if there was an initial redirect, the new domain is added to the allowed domains
    domain = self.subdomainGetter(response)
    if domain not in self.allowed_domains:
        self.allowed_domains.append(domain)
        self.refreshAllowedDomains()

    # extract all urls from the page...
    urls = (response.xpath("//a/@href").extract()
            + response.xpath("//frame/@src").extract()
            + response.xpath("//frameset/@src").extract())

    # ...and save them to a urlstack
    urlstack = [response.urljoin(url) for url in urls]

    # attach the urlstack, the loader, and the fingerprints to the response...
    response.meta["urlstack"] = urlstack
    response.meta["loader"] = loader
    response.meta["fingerprints"] = fingerprints

    # ...and send it over to the processURLstack function
    return self.processURLstack(response)

##################################################################
# PROCESS URL STACK
##################################################################
Example #10
Source File: scrapy_pagestorage.py From scrapy-pagestorage with BSD 3-Clause "New" or "Revised" License | 5 votes |
def process_spider_output(self, response, result, spider):
    fp = request_fingerprint(response.request)
    try:
        for r in result:
            if isinstance(r, DictItem):
                r.fields["_cached_page_id"] = Field()
                r._values["_cached_page_id"] = fp
            elif isinstance(r, dict):
                r["_cached_page_id"] = fp
            yield r
    except Exception as exc:
        self.process_spider_exception(response, exc, spider)
        raise
Example #11
Source File: scrapy_pagestorage.py From scrapy-pagestorage with BSD 3-Clause "New" or "Revised" License | 5 votes |
def save_response(self, response, spider):
    if isinstance(response, TextResponse):
        fp = request_fingerprint(response.request)
        payload = {
            "_key": fp,
            "_jobid": self.hsref.job.key,
            "_type": "_pageitem",
            "_encoding": response.encoding,
            "url": response.url,
        }
        self._set_cookies(payload, response)

        if response.request.method == 'POST':
            payload["postdata"] = dict(parse_qsl(response.request.body.decode()))

        payload["body"] = response.body_as_unicode()

        if self.trim_html:
            payload['body'] = payload['body'].strip(' \r\n\0')

        if len(payload['body']) > self._writer.maxitemsize:
            spider.logger.warning("Page not saved, body too large: <%s>" % response.url)
            return

        try:
            self._writer.write(payload)
        except ValueTooLarge as exc:
            spider.logger.warning("Page not saved, %s: <%s>" % (exc, response.url))
Example #12
Source File: dupefilter.py From scrapy-rabbitmq with MIT License | 5 votes |
def request_seen(self, request):
    fp = request_fingerprint(request)
    added = self.server.basic_publish(
        exchange='',
        routing_key=self.key,
        body=fp
    )
    return not added
Example #13
Source File: media.py From learn_python3_spider with MIT License | 5 votes |
def _process_request(self, request, info):
    fp = request_fingerprint(request)
    cb = request.callback or (lambda _: _)
    eb = request.errback
    request.callback = None
    request.errback = None

    # Return cached result if request was already seen
    if fp in info.downloaded:
        return defer_result(info.downloaded[fp]).addCallbacks(cb, eb)

    # Otherwise, wait for result
    wad = Deferred().addCallbacks(cb, eb)
    info.waiting[fp].append(wad)

    # Check if request is downloading right now to avoid doing it twice
    if fp in info.downloading:
        return wad

    # Download request checking media_to_download hook output first
    info.downloading.add(fp)
    dfd = mustbe_deferred(self.media_to_download, request, info)
    dfd.addCallback(self._check_media_to_download, request, info)
    dfd.addBoth(self._cache_result_and_execute_waiters, fp, info)
    dfd.addErrback(lambda f: logger.error(
        f.value, exc_info=failure_to_exc_info(f), extra={'spider': info.spider})
    )
    return dfd.addBoth(lambda _: wad)  # it must return wad at last
Example #14
Source File: dupefilters.py From learn_python3_spider with MIT License | 5 votes |
def request_fingerprint(self, request):
    return request_fingerprint(request)
Example #15
Source File: mongodb.py From invana-bot with MIT License | 5 votes |
def _request_key(self, request):
    return to_bytes(request_fingerprint(request))
Example #16
Source File: httpcache.py From learn_python3_spider with MIT License | 5 votes |
def _get_request_path(self, spider, request):
    key = request_fingerprint(request)
    return os.path.join(self.cachedir, spider.name, key[0:2], key)
Example #17
Source File: dupefilter.py From learn_python3_spider with MIT License | 5 votes |
def request_fingerprint(self, request):
    """Returns a fingerprint for a given request.

    Parameters
    ----------
    request : scrapy.http.Request

    Returns
    -------
    str

    """
    return request_fingerprint(request)
Example #18
Source File: elasticsearch.py From invana-bot with MIT License | 5 votes |
def _request_key(self, request):
    return to_bytes(request_fingerprint(request))
Example #19
Source File: redis_dupefilter.py From scrapy-cluster with MIT License | 5 votes |
def request_seen(self, request):
    fp = request_fingerprint(request)
    c_id = request.meta['crawlid']
    added = self.server.sadd(self.key + ":" + c_id, fp)
    self.server.expire(self.key + ":" + c_id, self.timeout)
    return not added
Example #20
Source File: utils.py From sozlukcrawler with GNU General Public License v2.0 | 5 votes |
def is_request_seen(request):
    return session.query(
        exists().where(Seen.fingerprint == request_fingerprint(request))
    ).scalar()
Example #21
Source File: py_my_scrapy_redis_server.py From vrequest with MIT License | 5 votes |
def request_seen(self, request):
    tid = request._plusmeta.get('taskid')
    if tid:
        fp = self.request_fingerprint(request)
        added = self.server.sadd(self.key.format(tid), fp)
        return added == 0
Example #22
Source File: py_my_scrapy_redis_server.py From vrequest with MIT License | 5 votes |
def request_fingerprint(self, request):
    return request_fingerprint(request)
Example #23
Source File: test_finger.py From news_spider with MIT License | 5 votes |
def test_request(self):
    """
    Test that both requests produce the same fingerprint.
    :return:
    """
    req_01 = Request(url=self.url_01)
    result_01 = request.request_fingerprint(req_01)
    req_02 = Request(url=self.url_02)
    result_02 = request.request_fingerprint(req_02)
    self.assertEqual(result_01, result_02)
Example #24
Source File: url.py From news_spider with MIT License | 5 votes |
def get_request_finger(url):
    """
    Get the fingerprint of a URL (query parameters may appear in any order).
    :param url:
    :return:
    """
    req = Request(url=url)
    return request.request_fingerprint(req)
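A possible usage of the helper above (the URLs are illustrative): because request_fingerprint canonicalizes the URL before hashing, query-parameter order does not change the result, which is the behaviour the test in Example #23 verifies.

    # Hypothetical usage of get_request_finger; the URLs are made up.
    fp_1 = get_request_finger('http://example.com/list?page=2&sort=new')
    fp_2 = get_request_finger('http://example.com/list?sort=new&page=2')
    assert fp_1 == fp_2  # parameter order does not affect the fingerprint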
Example #25
Source File: httpcache.py From learn_python3_spider with MIT License | 5 votes |
def _request_key(self, request):
    return request_fingerprint(request)
Example #26
Source File: httpcache.py From learn_python3_spider with MIT License | 5 votes |
def _get_request_path(self, spider, request):
    key = request_fingerprint(request)
    return os.path.join(self.cachedir, spider.name, key[0:2], key)
Example #27
Source File: httpcache.py From learn_python3_spider with MIT License | 5 votes |
def _request_key(self, request):
    return to_bytes(request_fingerprint(request))
Example #28
Source File: dupefilters.py From learn_python3_spider with MIT License | 5 votes |
def request_seen(self, request):
    fp = self.request_fingerprint(request)
    if fp in self.fingerprints:
        return True
    self.fingerprints.add(fp)
    if self.file:
        self.file.write(fp + os.linesep)
Example #29
Source File: dupefilters.py From learn_python3_spider with MIT License | 5 votes |
def request_fingerprint(self, request):
    return request_fingerprint(request)
Example #30
Source File: httpcache.py From learn_python3_spider with MIT License | 5 votes |
def _request_key(self, request):
    return request_fingerprint(request)