Python scrapy.exceptions.DropItem() Examples
The following are 30 code examples of scrapy.exceptions.DropItem(). The originating project and source file are noted above each example, so you can follow them back to the full source. You may also want to check out all available functions and classes of the scrapy.exceptions module.
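As a quick orientation before the examples, here is a minimal sketch of the common DropItem pattern: an item pipeline raises DropItem from process_item() to discard an item, and Scrapy records it as a drop rather than an error. The pipeline name, the 'price' field, and the ITEM_PIPELINES path in the trailing comment are illustrative assumptions, not taken from any of the projects below.

# Hypothetical pipelines.py showing typical DropItem usage (illustrative sketch).
from scrapy.exceptions import DropItem


class RequirePricePipeline:
    """Drop any item that lacks a 'price' value (example pipeline, not from the projects below)."""

    def process_item(self, item, spider):
        if not item.get('price'):
            # Raising DropItem stops further pipeline processing for this item;
            # Scrapy logs it as a dropped item instead of an error.
            raise DropItem("Missing price in %s" % item)
        return item


# Enable the pipeline in settings.py (module path is an assumption for this sketch):
# ITEM_PIPELINES = {'myproject.pipelines.RequirePricePipeline': 300}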
Example #1
Source File: pipelines.py From hoaxy-backend with GNU General Public License v3.0 | 6 votes |
def process_item(self, item, spider): """Main function that process URL item (first phase).""" # validate URL length if len(item['raw']) > MAX_URL_LEN: item['raw'] = item['raw'][:MAX_URL_LEN] logger.error('Raw URL too long, trucate it! %r', item['raw']) # parse raw URL purl = get_parsed_url(item['raw']) if purl is None or purl.hostname is None: raise DropItem('Invalide URL') site_id = belongs_to_site(purl.hostname, self.site_tuples) if site_id is None: raise DropItem('Offsite domain: %s', item) item['site_id'] = site_id # insert URL into table try: get_or_create_murl(spider.session, item, spider.platform_id) except SQLAlchemyError as e: logger.error(e) spider.session.rollback() raise DropItem('Fail to insert database of url: %s', item) return item
Example #2
Source File: scraper.py From learn_python3_spider with MIT License | 6 votes |
def _itemproc_finished(self, output, item, response, spider):
    """ItemProcessor finished for the given ``item`` and returned ``output``
    """
    self.slot.itemproc_size -= 1
    if isinstance(output, Failure):
        ex = output.value
        if isinstance(ex, DropItem):
            logkws = self.logformatter.dropped(item, ex, response, spider)
            logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
            return self.signals.send_catch_log_deferred(
                signal=signals.item_dropped, item=item, response=response,
                spider=spider, exception=output.value)
        else:
            logger.error('Error processing %(item)s', {'item': item},
                         exc_info=failure_to_exc_info(output),
                         extra={'spider': spider})
            return self.signals.send_catch_log_deferred(
                signal=signals.item_error, item=item, response=response,
                spider=spider, failure=output)
    else:
        logkws = self.logformatter.scraped(output, response, spider)
        logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
        return self.signals.send_catch_log_deferred(
            signal=signals.item_scraped, item=output, response=response,
            spider=spider)
Example #3
Source File: scraper.py From learn_python3_spider with MIT License | 6 votes |
def _itemproc_finished(self, output, item, response, spider):
    """ItemProcessor finished for the given ``item`` and returned ``output``
    """
    self.slot.itemproc_size -= 1
    if isinstance(output, Failure):
        ex = output.value
        if isinstance(ex, DropItem):
            logkws = self.logformatter.dropped(item, ex, response, spider)
            logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
            return self.signals.send_catch_log_deferred(
                signal=signals.item_dropped, item=item, response=response,
                spider=spider, exception=output.value)
        else:
            logger.error('Error processing %(item)s', {'item': item},
                         exc_info=failure_to_exc_info(output),
                         extra={'spider': spider})
            return self.signals.send_catch_log_deferred(
                signal=signals.item_error, item=item, response=response,
                spider=spider, failure=output)
    else:
        logkws = self.logformatter.scraped(output, response, spider)
        logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
        return self.signals.send_catch_log_deferred(
            signal=signals.item_scraped, item=output, response=response,
            spider=spider)
Example #4
Source File: pipeline.py From scrapy-jsonschema with BSD 3-Clause "New" or "Revised" License | 6 votes |
def process_item(self, item, spider):
    if not isinstance(item, JsonSchemaItem):
        return item
    errors = list(item.validator.iter_errors(dict(item)))
    paths_messages = []
    for error in errors:
        absolute_path = list(error.absolute_path)
        # error path is not available when required field is not filled
        # so we parse error message. Nasty.
        required_match = self.REQUIRED_RE.search(error.message)
        if required_match:
            absolute_path.append(required_match.group(1))
        path = '.'.join(map(str, absolute_path))
        self.stats.inc_value(self.STAT_FMT.format(field=path))
        paths_messages.append((path, error.message))
    if errors:
        error_msg = ''
        for path, message in paths_messages:
            error_msg += u'{}: {}\n'.format(path, message)
        raise DropItem(u'schema validation failed: \n {}'.format(error_msg))
    return item
Example #5
Source File: pipelines.py From snippet with MIT License | 6 votes |
def item_completed(self, results, item, info):
    result = {}
    for n, r in enumerate(results):
        ok, x = r
        if ok:
            result[x["url"]] = x["path"]
        else:
            result[item[self.URLS_NAME][n]] = x.getErrorMessage()

    # TODO: Save the result
    # file_paths = [x['path'] for ok, x in results if ok]
    # if not file_paths:
    #     raise DropItem("Item contains no files")
    # item['image_paths'] = file_paths
    # return item

    return super(GroupDownPipelineMinix, self).item_completed(results, item, info)
Example #6
Source File: pipelines.py From PyFeeds with GNU Affero General Public License v3.0 | 6 votes |
def process_item(self, item, spider):
    def raise_if_missing(name, item):
        if name not in item:
            raise DropItem(
                'The required field "{}" is missing in: {}.'.format(name, item)
            )

    # Required fields for all items
    for required in ("id", "title", "link"):
        raise_if_missing(required, item)

    # Required fields for FeedEntryItems
    if isinstance(item, FeedEntryItem):
        for required in ("updated",):
            raise_if_missing(required, item)

    return item
Example #7
Source File: pipelines.py From In2ItChicago with GNU General Public License v3.0 | 6 votes |
def process_item(self, item, spider):
    item['organization'] = spider.organization
    if 'event_time' in item:
        item['event_time']['date_format'] = spider.date_format
    loader = EventLoader(**item)
    # see if there is a custom filter for the item
    if not spider.item_filter(item):
        raise DropItem('Custom item filter did not allow this event')

    if 'event_time' in loader.item:
        time = loader.item['event_time']
        if self.time_utils.time_range_is_between(
                time['start_timestamp'], time['end_timestamp'],
                spider.start_timestamp, spider.end_timestamp):
            return loader.item
        else:
            raise DropItem('Event is not in the configured timeframe')
    else:
        return loader.item
Example #8
Source File: assessment_spider.py From assessor-scraper with MIT License | 6 votes |
def parse(self, response):
    """
    Default callback function with response for the crawled url
    https://doc.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders.Spider.parse
    """
    response = response.replace(
        body=re.sub(r"<br\s*[\/]?>", "\n", response.body.decode('utf-8')))
    property_key = response.url.split('=')[1].replace('&', '')
    # logging.debug("Parsing property_key: %s", property_key)
    if 'No Data at this time' in response.text:
        msg = "No data for " + response.url
        logging.warning(msg)
        raise DropItem(msg)
    else:
        property_info = self.parse_property_info(response)
        property_values = self.parse_property_values(response)
        property_sales = self.parse_property_sales(response)
        property_info['sales'] = property_sales
        property_info['values'] = property_values
        property_info['property_key'] = property_key
        yield Property(property_info)
Example #9
Source File: middlewares.py From scrapy-autoextract with BSD 3-Clause "New" or "Revised" License | 6 votes |
def process_exception(self, request, exception, spider):
    if isinstance(exception, (IgnoreRequest, DropItem)):
        return
    if not self._is_enabled_for_request(request):
        return

    autoextract = request.meta.pop(AUTOEXTRACT_META_KEY)
    stop_time = time.time()
    latency = time.time() - autoextract['timing']['start_ts']
    autoextract['timing'].update({'end_ts': stop_time, 'latency': latency})

    # Make sure to log all unknown failures
    logger.warning('AutoExtract failure after %.3fs for %s: %s',
                   latency,
                   autoextract['original_url'],
                   repr(exception),
                   extra={'spider': spider})

    request.meta['autoextract'] = autoextract
    ex_class = global_object_name(exception.__class__)
    self.inc_metric('autoextract/errors/total_count', spider=spider)
    self.inc_metric('autoextract/errors/type_count/%s' % ex_class, spider=spider)
Example #10
Source File: pipelines.py From news-please with Apache License 2.0 | 6 votes |
def process_item(self, item, spider):
    if spider.name in ['RssCrawler', 'GdeltCrawler']:
        # Search the CurrentVersion table for a version of the article
        try:
            self.cursor.execute(self.compare_versions, (item['url'],))
        except (pymysql.err.OperationalError, pymysql.ProgrammingError,
                pymysql.InternalError, pymysql.IntegrityError, TypeError) as error:
            self.log.error("Something went wrong in rss query: %s", error)

        # Save the result of the query. Must be done before the add,
        # otherwise the result will be overwritten in the buffer
        old_version = self.cursor.fetchone()

        if old_version is not None and (datetime.datetime.strptime(
                item['download_date'], "%y-%m-%d %H:%M:%S") - old_version[3]) \
                < datetime.timedelta(hours=self.delta_time):
            # Compare the two download dates. index 3 of old_version
            # corresponds to the download_date attribute in the DB
            raise DropItem("Article in DB too recent. Not saving.")

    return item
Example #11
Source File: pipelines.py From restaurant with MIT License | 6 votes |
def process_item(self, item, spider):
    if spider.name not in ['meituan']:
        return item
    if self.filter_dic.get(item['restaurant_name']) == item['address']:
        print(item['restaurant_name'])
        print(item['address'])
        raise DropItem("Duplicate item found: %s" % item)
    else:
        self.filter_dic[item['restaurant_name']] = item['address']
        try:
            item['lng'], item['lat'] = gaode_to_baidu(float(item['lng']), float(item['lat']))
            item['province_code'] = pinyin.get(item['province'])
            item['city_code'] = pinyin.get(item['city'])
            item['region_code'] = pinyin.get(item['region'])
            item['area_code'] = pinyin.get(item['area'])
        except BaseException as e:
            print(e)
        return item
Example #12
Source File: pipelines.py From scrape with MIT License | 6 votes |
def checkInvalidKeys(self, item):
    """ Checks Keys For Invalid Entries Such as None/Empty """
    allowedKeys = {
        'None': ["image"],
        'Empty': ["image"]
    }
    for key in item:
        try:
            if (item[key] == None or item[key] == "Error") and key not in allowedKeys['None']:
                raise DropItem("Required Key " + str(key) + " is None")
            if(type(item[key]) is str and key not in allowedKeys['Empty']):
                if len(item[key]) == 0:
                    raise DropItem("Required Key " + str(key) + " is Empty")
        except DropItem:
            pass
        except Exception as e:
            logger.error(__name__ + " Exception: " + str(e))
            continue
Example #13
Source File: pipelines.py From SinaWeiboSpider with MIT License | 5 votes |
def process_item(self, item, spider):
    collection_name = item.__class__.__name__
    try:
        self.db[collection_name].insert(dict(item))
    except DuplicateKeyError:
        return DropItem("Duplicate item found: %s" % item)
    else:
        return item
Example #14
Source File: pipelines.py From scrapy_xiuren with Apache License 2.0 | 5 votes |
def item_completed(self, results, item, info):
    image_paths = [x['path'] for ok, x in results if ok]
    if not image_paths:
        print('{}/{}------文件保存失败'.format(item['image_dir'], item['image_url']))
        raise DropItem("Item contains no images")
    else:
        print('{}/{}------文件保存成功'.format(item['image_dir'], item['image_url']))
        return item
Example #15
Source File: pipelines.py From news-please with Apache License 2.0 | 5 votes |
def process_item(self, item, spider):
    # Check if date could be extracted
    if item['article_publish_date'] is None and self.strict_mode:
        raise DropItem('DateFilter: %s: Publishing date is missing and strict mode is enabled.'
                       % item['url'])
    elif item['article_publish_date'] is None:
        return item
    else:
        # Create datetime object
        try:
            publish_date = datetime.datetime.strptime(str(item['article_publish_date']),
                                                      '%Y-%m-%d %H:%M:%S')
        except ValueError as error:
            self.log.warning("DateFilter: Extracted date has the wrong format: %s - %s" %
                             (item['article_publishing_date'], item['url']))
            if self.strict_mode:
                raise DropItem('DateFilter: %s: Dropped due to wrong date format: %s' %
                               (item['url'], item['publish_date']))
            else:
                return item

        # Check interval boundaries
        if self.start_date is not None and self.start_date > publish_date:
            raise DropItem('DateFilter: %s: Article is too old: %s' % (item['url'], publish_date))
        elif self.end_date is not None and self.end_date < publish_date:
            raise DropItem('DateFilter: %s: Article is too young: %s ' % (item['url'], publish_date))
        else:
            return item
Example #16
Source File: pipelines.py From PyFeeds with GNU Affero General Public License v3.0 | 5 votes |
def process_item(self, item, spider): if "id" not in item: if "link" in item: item["id"] = uuid.uuid5(uuid.NAMESPACE_DNS, item["link"]).urn else: raise DropItem( "A link is required to autogenerate the feed " "id for: {}".format(item) ) if "title" not in item: # Having a title is mandatory, so we use an empty string if none # is set. item["title"] = "" if isinstance(item, FeedEntryItem) and "updated" not in item: if "link" in item: item["updated"] = spider.cache.setdefault( spider, key="{}|updated".format(item["id"]), default_obj=datetime.now(timezone.utc), ) else: raise DropItem( "A link is required to autogenerate the updated field " "for: {}".format(item) ) return item
Example #17
Source File: pipelines.py From news-please with Apache License 2.0 | 5 votes |
def process_item(self, item, spider):
    # For the case where something goes wrong
    if item['spider_response'].status != 200:
        # Item is no longer processed in the pipeline
        raise DropItem("%s: Non-200 response" % item['url'])
    else:
        return item
Example #18
Source File: pipelines.py From Agriculture_KnowledgeGraph with GNU General Public License v3.0 | 5 votes |
def process_item(self, item, spider):
    if item['title']:
        line = ""
        if(self.count > 0):
            line += ","
        line += json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        self.count += 1
        print("count: " + str(self.count))
        return item
    else:
        raise DropItem("忽略无title的组件!")
Example #19
Source File: pipelines.py From bulletin-scraper with BSD 2-Clause "Simplified" License | 5 votes |
def get_media_requests(self, item, info):
    url = item['url']
    if not url.lower().endswith('.msu'):
        raise DropItem('Item not an MSU')
    request = scrapy.Request(url)
    request.meta['bulletin'] = item['bulletin']
    yield request
Example #20
Source File: pipelines.py From NewsScrapy with GNU Lesser General Public License v3.0 | 5 votes |
def process_item(self, item, spider):
    item_keywords = judge_key_words(item)  # get the keywords that match this item
    if item_keywords:  # keep only items that contain keywords
        item["keywords"] = item_keywords
        return item
    else:
        logger = logging.getLogger(spider.name)
        logger.info("No keyword in %s" % item["news_url"])
        raise DropItem("No keyword in %s" % item["news_url"])
Example #21
Source File: pipelines.py From Agriculture_KnowledgeGraph with GNU General Public License v3.0 | 5 votes |
def process_item(self, item, spider):
    # 'error' is the title value assigned (by this project) to pages missing from Baike
    if item['title'] != 'error':
        line = ""
        if(self.count > 0):
            line += ","
        line += json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(line)
        self.count += 1
        cur = time.time()
        T = int(cur - self.start)
        print("page count: " + str(self.count) + " time:" + str(int(T/3600)) + "h "
              + str(int(T/60) % 60) + "m " + str(T % 60) + "s......")
        return item
    else:
        raise DropItem("百科中找不到对应页面!")
Example #22
Source File: clean.py From kmanga with GNU General Public License v3.0 | 5 votes |
def _clean_field_list(self, field, cleaner=None, cleaner_params=None,
                      optional=False, exclude=None, drop=False,
                      max_length=None):
    """Generic clean method for list field."""
    if cleaner:
        cleaner_params = cleaner_params if cleaner_params else ()
        value = []
        for e in self._as_list(field):
            try:
                c = cleaner(e, *cleaner_params)
            except DropItem:
                # If the exception created by the cleaner function
                # is DropItem and we are allowed to drop items, we
                # drop it, else we re-raise the exception droping
                # the full item container.
                if not drop:
                    raise
            else:
                value.append(c)
    else:
        value = [e.strip() for e in self._as_list(field)]

    if exclude:
        value = [e for e in value if e not in exclude]

    if max_length:
        value = [e[:max_length] for e in value]

    if not value and not optional:
        raise ValueError('field is not optional'
                         " or can't be converted to a list")
    return value
Example #23
Source File: pipelines.py From corpus-builder with MIT License | 5 votes |
def process_item(self, item, spider):
    if item['body']:
        item['body'] = item['body'].strip()
        return item
    else:
        raise DropItem("Empty Body")
Example #24
Source File: pipelines.py From openslack-crawler with Apache License 2.0 | 5 votes |
def item_completed(self, results, item, info):
    image_paths = [x['path'] for ok, x in results if ok]
    if not image_paths:
        raise DropItem("Item contains no images")
    # print image_paths
    return item
Example #25
Source File: image.py From openslack-crawler with Apache License 2.0 | 5 votes |
def item_completed(self, results, item, info):
    image_paths = [x['path'] for ok, x in results if ok]
    if not image_paths:
        raise DropItem("Item contains no images")
    item['image_paths'] = image_paths
    return item
Example #26
Source File: pipelines.py From hoaxy-backend with GNU General Public License v3.0 | 5 votes |
def process_item(self, item, spider): """Main function that process URL item (second phase).""" # canonicalize expanded URL without considering the status_code # because scrapy crawling not ganrantee the success # we still try to canonicalize the URL if len(item['expanded']) > MAX_URL_LEN: item['expanded'] = item['expanded'][:MAX_URL_LEN] logger.error('Expanded URL too long, trucate it! %r', item['raw']) item['canonical'] = canonicalize(item['expanded']) if item['canonical'] is None: item['status_code'] = U_HTML_ERROR_INVALID_URL # if url could be canonicalized and if site_id is not determined # we infer it from the expanded url if item['status_code'] != U_HTML_ERROR_INVALID_URL\ and item.get('site_id', None) is None: purl = get_parsed_url(item['expanded']) if purl is not None and purl.hostname is not None: if belongs_to_domain(purl.hostname, spider.excluded_domains)\ is not None: item['status_code'] = U_HTML_ERROR_EXCLUDED_DOMAIN else: item['site_id'] = belongs_to_site(purl.hostname, self.site_tuples) else: item['status_code'] = U_HTML_ERROR_INVALID_URL # remove potential NUL byte \x00 in the HTML if 'html' in item: item['html'] = item['html'].replace(b'\x00', b'') try: # update database of url table spider.session.query(Url).filter_by(id=item['id'])\ .update(dict(item), synchronize_session=False) spider.session.commit() logger.debug('Fetched html of url %r with status %i', item['raw'], item['status_code']) except SQLAlchemyError as e: logger.error(e) spider.session.rollback() raise DropItem('Fail to update database of url: %s', item) return item
Example #27
Source File: pipelines.py From ajax_crawler with MIT License | 5 votes |
def ensure_not_empty(self, item, field):
    if field in item:
        if item[field] == []:
            raise DropItem("Empty item found: %s" % item)
Example #28
Source File: pipelines.py From ajax_crawler with MIT License | 5 votes |
def ensure_not_duplicate(self, spider, item, field):
    if field in item:
        if field not in self.duplicates[spider]:
            self.duplicates[spider][field] = set()
        if item[field] and type(item[field]) is list:
            if item[field][0] in self.duplicates[spider][field]:
                raise DropItem("Duplicate item found: %s" % item)
            else:
                self.duplicates[spider][field].add(item[field][0])
Example #29
Source File: pipelines.py From scrape with MIT License | 5 votes |
def item_dropped(self, item, response, exception, spider):
    # Called when a DropItem exception is raised
    spider.urls_dropped += 1
    logger.info(__name__ + " [Dropped] <Spider>: " + spider.name +
                " <Reason>: " + str(exception) +
                " <Link>: " + str(item['link']))
Example #30
Source File: pipelines.py From scrape with MIT License | 5 votes |
def process_item(self, item, spider):
    if not spider.postgres.checkConnection():
        raise CloseSpider("Unable to Establish a Database Connection")
    if spider.postgres.checkUrlExists(item['link']):
        raise DropItem("Url " + item['link'] + " Exists in Database")
    return item