Python scrapy.exceptions.DropItem() Examples

The following are 29 code examples of scrapy.exceptions.DropItem(), taken from open source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module scrapy.exceptions, or try the search function.
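Before the project examples, here is a minimal, generic sketch of the usual pattern: an item pipeline raises DropItem from process_item() to discard an item, and the pipeline is enabled through the ITEM_PIPELINES setting. The module path myproject.pipelines and the price check are hypothetical placeholders, not taken from any of the projects below.

from scrapy.exceptions import DropItem

class PriceValidationPipeline:
    """Illustrative pipeline: discard items that lack a usable price."""

    def process_item(self, item, spider):
        if not item.get('price'):
            # Raising DropItem stops further pipeline processing of this item
            # and fires the item_dropped signal.
            raise DropItem('Missing price in %s' % item)
        return item

# settings.py (hypothetical project layout)
# ITEM_PIPELINES = {'myproject.pipelines.PriceValidationPipeline': 300}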
Example #1
Source File: pipelines.py    From hoaxy-backend with GNU General Public License v3.0
def process_item(self, item, spider):
        """Main function that process URL item (first phase)."""
        # validate URL length
        if len(item['raw']) > MAX_URL_LEN:
            item['raw'] = item['raw'][:MAX_URL_LEN]
            logger.error('Raw URL too long, truncating it! %r', item['raw'])
        # parse raw URL
        purl = get_parsed_url(item['raw'])
        if purl is None or purl.hostname is None:
            raise DropItem('Invalid URL')
        site_id = belongs_to_site(purl.hostname, self.site_tuples)
        if site_id is None:
            raise DropItem('Offsite domain: %s' % item)
        item['site_id'] = site_id
        # insert URL into table
        try:
            get_or_create_murl(spider.session, item, spider.platform_id)
        except SQLAlchemyError as e:
            logger.error(e)
            spider.session.rollback()
            raise DropItem('Failed to insert URL into database: %s' % item)
        return item 
Example #2
Source File: scraper.py    From learn_python3_spider with MIT License
def _itemproc_finished(self, output, item, response, spider):
        """ItemProcessor finished for the given ``item`` and returned ``output``
        """
        self.slot.itemproc_size -= 1
        if isinstance(output, Failure):
            ex = output.value
            if isinstance(ex, DropItem):
                logkws = self.logformatter.dropped(item, ex, response, spider)
                logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
                return self.signals.send_catch_log_deferred(
                    signal=signals.item_dropped, item=item, response=response,
                    spider=spider, exception=output.value)
            else:
                logger.error('Error processing %(item)s', {'item': item},
                             exc_info=failure_to_exc_info(output),
                             extra={'spider': spider})
                return self.signals.send_catch_log_deferred(
                    signal=signals.item_error, item=item, response=response,
                    spider=spider, failure=output)
        else:
            logkws = self.logformatter.scraped(output, response, spider)
            logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
            return self.signals.send_catch_log_deferred(
                signal=signals.item_scraped, item=output, response=response,
                spider=spider) 
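The engine method above fires the item_dropped signal whenever a pipeline raises DropItem. As a rough sketch (not part of the learn_python3_spider project), an extension can subscribe to that signal via crawler.signals.connect; the handler receives the same item, response, spider, and exception arguments that are sent above. The dotted path in the comment is hypothetical.

from scrapy import signals

class DroppedItemLogger:
    """Illustrative extension that counts and logs dropped items."""

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        ext.stats = crawler.stats
        crawler.signals.connect(ext.item_dropped, signal=signals.item_dropped)
        return ext

    def item_dropped(self, item, response, exception, spider):
        self.stats.inc_value('custom/items_dropped')
        spider.logger.info('Dropped %r: %s', item, exception)

# settings.py (hypothetical dotted path)
# EXTENSIONS = {'myproject.extensions.DroppedItemLogger': 500}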
Example #3
Source File: pipeline.py    From scrapy-jsonschema with BSD 3-Clause "New" or "Revised" License
def process_item(self, item, spider):
        if not isinstance(item, JsonSchemaItem):
            return item

        errors = list(item.validator.iter_errors(dict(item)))
        paths_messages = []
        for error in errors:
            absolute_path = list(error.absolute_path)
            # error path is not available when required field is not filled
            # so we parse error message. Nasty.
            required_match = self.REQUIRED_RE.search(error.message)
            if required_match:
                absolute_path.append(required_match.group(1))
            path = '.'.join(map(str, absolute_path))
            self.stats.inc_value(self.STAT_FMT.format(field=path))
            paths_messages.append((path, error.message))
        if errors:
            error_msg = ''
            for path, message in paths_messages:
                error_msg += u'{}: {}\n'.format(path, message)
            raise DropItem(u'schema validation failed: \n {}'.format(error_msg))

        return item 
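The pipeline above leans on the jsonschema package's iter_errors() API; the comment in the code hints at why the required-field case is awkward. A small standalone sketch (schema and data invented for illustration) shows what error.absolute_path and error.message contain, and why a missing required property yields an empty path that has to be recovered from the message text:

from jsonschema import Draft7Validator

schema = {
    'type': 'object',
    'properties': {'name': {'type': 'string'}},
    'required': ['name'],
}
validator = Draft7Validator(schema)

for error in validator.iter_errors({'name': 123}):
    # path is "name", message is "123 is not of type 'string'"
    print('.'.join(map(str, error.absolute_path)), '->', error.message)

for error in validator.iter_errors({}):
    # path is empty here; the message is "'name' is a required property"
    print(list(error.absolute_path), '->', error.message)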
Example #4
Source File: pipelines.py    From snippet with MIT License
def item_completed(self, results, item, info):
        result = {}
        for n, r in enumerate(results):
            ok, x = r
            if ok:
                result[x["url"]] = x["path"]
            else:
                result[item[self.URLS_NAME][n]] = x.getErrorMessage()
        # TODO: Save the result

        # file_paths = [x['path'] for ok, x in results if ok]
        # if not file_paths:
        #     raise DropItem("Item contains no files")
        # item['image_paths'] = file_paths
        # return item

        return super(GroupDownPipelineMinix, self).item_completed(results, item, info) 
Example #5
Source File: pipelines.py    From PyFeeds with GNU Affero General Public License v3.0
def process_item(self, item, spider):
        def raise_if_missing(name, item):
            if name not in item:
                raise DropItem(
                    'The required field "{}" is missing in: {}.'.format(name, item)
                )

        # Required fields for all items
        for required in ("id", "title", "link"):
            raise_if_missing(required, item)

        # Required fields for FeedEntryItems
        if isinstance(item, FeedEntryItem):
            for required in ("updated",):
                raise_if_missing(required, item)

        return item 
Example #6
Source File: pipelines.py    From In2ItChicago with GNU General Public License v3.0
def process_item(self, item, spider):
        item['organization'] = spider.organization
        if 'event_time' in item:
            item['event_time']['date_format'] = spider.date_format
        loader = EventLoader(**item)
        # see if there is a custom filter for the item
        if not spider.item_filter(item):
            raise DropItem('Custom item filter did not allow this event')
        if 'event_time' in loader.item:
            time = loader.item['event_time']
            if self.time_utils.time_range_is_between(time['start_timestamp'], time['end_timestamp'], spider.start_timestamp, spider.end_timestamp):
                return loader.item
            else:
                raise DropItem('Event is not in the configured timeframe')
        else:
            return loader.item 
Example #7
Source File: assessment_spider.py    From assessor-scraper with MIT License
def parse(self, response):
        """
        Default callback function with response for the crawled url
        https://doc.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders.Spider.parse
        """
        response = response.replace(body=re.sub(r"<br\s*[\/]?>", "\n", response.body.decode('utf-8')))
        property_key = response.url.split('=')[1].replace('&', '')
        # logging.debug("Parsing property_key: %s", property_key)
        if 'No Data at this time' in response.text:
            msg = "No data for " + response.url
            logging.warning(msg)
            raise DropItem(msg)
        else:
            property_info = self.parse_property_info(response)
            property_values = self.parse_property_values(response)
            property_sales = self.parse_property_sales(response)
            property_info['sales'] = property_sales
            property_info['values'] = property_values
            property_info['property_key'] = property_key
            yield Property(property_info) 
Example #8
Source File: middlewares.py    From scrapy-autoextract with BSD 3-Clause "New" or "Revised" License
def process_exception(self, request, exception, spider):
        if isinstance(exception, (IgnoreRequest, DropItem)):
            return
        if not self._is_enabled_for_request(request):
            return

        autoextract = request.meta.pop(AUTOEXTRACT_META_KEY)
        stop_time = time.time()
        latency = time.time() - autoextract['timing']['start_ts']
        autoextract['timing'].update({'end_ts': stop_time, 'latency': latency})

        # Make sure to log all unknown failures
        logger.warning('AutoExtract failure after %.3fs for %s: %s',
                       latency,
                       autoextract['original_url'],
                       repr(exception),
                       extra={'spider': spider})

        request.meta['autoextract'] = autoextract
        ex_class = global_object_name(exception.__class__)
        self.inc_metric('autoextract/errors/total_count', spider=spider)
        self.inc_metric('autoextract/errors/type_count/%s' % ex_class, spider=spider) 
Example #9
Source File: pipelines.py    From news-please with Apache License 2.0
def process_item(self, item, spider):
        if spider.name in ['RssCrawler', 'GdeltCrawler']:
            # Search the CurrentVersion table for a version of the article
            try:
                self.cursor.execute(self.compare_versions, (item['url'],))
            except (pymysql.err.OperationalError, pymysql.ProgrammingError, pymysql.InternalError,
                    pymysql.IntegrityError, TypeError) as error:
                self.log.error("Something went wrong in rss query: %s", error)

            # Save the result of the query. Must be done before the add,
            #   otherwise the result will be overwritten in the buffer
            old_version = self.cursor.fetchone()

            if old_version is not None and (datetime.datetime.strptime(
                    item['download_date'], "%y-%m-%d %H:%M:%S") -
                                            old_version[3]) \
                    < datetime.timedelta(hours=self.delta_time):
                # Compare the two download dates. index 3 of old_version
                # corresponds to the download_date attribute in the DB
                raise DropItem("Article in DB too recent. Not saving.")

        return item 
Example #10
Source File: pipelines.py    From restaurant with MIT License
def process_item(self, item, spider):
        if spider.name not in ['meituan']:
            return item
        if self.filter_dic.get(item['restaurant_name']) == item['address']:
            print(item['restaurant_name'])
            print(item['address'])
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.filter_dic[item['restaurant_name']] = item['address']
            try:
                item['lng'], item['lat'] = gaode_to_baidu(float(item['lng']), float(item['lat']))
                item['province_code'] = pinyin.get(item['province'])
                item['city_code'] = pinyin.get(item['city'])
                item['region_code'] = pinyin.get(item['region'])
                item['area_code'] = pinyin.get(item['area'])
            except BaseException as e:
                print(e)
            return item 
Example #11
Source File: pipelines.py    From scrape with MIT License
def checkInvalidKeys(self, item):
        """ Checks Keys For Invalid Entries Such as None/Empty """        
        allowedKeys = {
            'None': ["image"],
            'Empty': ["image"]
        }
        for key in item:
            try:
                if (item[key] == None or item[key] == "Error") and key not in allowedKeys['None']:
                    raise DropItem("Required Key " + str(key) + " is None")

                if(type(item[key]) is str and key not in allowedKeys['Empty']):
                    if len(item[key]) == 0:
                        raise DropItem("Required Key " + str(key) + " is Empty")
            except DropItem:
                pass
            except Exception as e:
                logger.error(__name__ + " Exception: " + str(e))
                continue 
Example #12
Source File: pipelines.py    From SinaWeiboSpider with MIT License
def process_item(self, item, spider):
        collection_name = item.__class__.__name__
        try:
            self.db[collection_name].insert(dict(item))
        except DuplicateKeyError:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            return item 
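The pipeline above assumes a self.db handle is already available. A hedged sketch of how such a handle is usually created and torn down alongside the spider follows; the MONGO_URI and MONGO_DATABASE setting names are assumptions for illustration, not taken from SinaWeiboSpider.

import pymongo

class MongoConnectionMixin:
    """Illustrative open/close hooks for a pymongo-backed pipeline."""

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        pipeline.mongo_uri = crawler.settings.get('MONGO_URI', 'mongodb://localhost:27017')
        pipeline.mongo_db = crawler.settings.get('MONGO_DATABASE', 'scrapy')
        return pipeline

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

Note that recent PyMongo releases have removed Collection.insert() in favor of insert_one(), so the insert call in the example above may need updating on newer installations.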
Example #13
Source File: pipelines.py    From scrapy_xiuren with Apache License 2.0
def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]

        if not image_paths:
            print('{}/{}------failed to save the file'.format(item['image_dir'], item['image_url']))
            raise DropItem("Item contains no images")
        else:
            print('{}/{}------file saved successfully'.format(item['image_dir'], item['image_url']))
        return item 
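item_completed() above runs inside Scrapy's ImagesPipeline, which only produces results when the pipeline is enabled, told where to store files, and told which URLs to fetch. A rough sketch of those surrounding pieces follows; the class name, the single-URL get_media_requests() override (the item here stores its URL under image_url rather than the default image_urls field), and the dotted path are assumptions for illustration.

import scrapy
from scrapy.pipelines.images import ImagesPipeline

class SingleUrlImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # assuming 'image_url' holds a single URL string
        yield scrapy.Request(item['image_url'])

# settings.py (hypothetical dotted path)
# ITEM_PIPELINES = {'myproject.pipelines.SingleUrlImagePipeline': 1}
# IMAGES_STORE = '/tmp/images'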
Example #14
Source File: pipelines.py    From news-please with Apache License 2.0
def process_item(self, item, spider):

        # Check if date could be extracted
        if item['article_publish_date'] is None and self.strict_mode:
            raise DropItem('DateFilter: %s: Publishing date is missing and strict mode is enabled.' % item['url'])
        elif item['article_publish_date'] is None:
            return item
        else:
            # Create datetime object
            try:
                publish_date = datetime.datetime.strptime(str(item['article_publish_date']), '%Y-%m-%d %H:%M:%S')
            except ValueError as error:
                self.log.warning("DateFilter: Extracted date has the wrong format: %s - %s" %
                                 (item['article_publish_date'], item['url']))
                if self.strict_mode:
                    raise DropItem('DateFilter: %s: Dropped due to wrong date format: %s' %
                                   (item['url'], item['article_publish_date']))
                else:
                    return item
            # Check interval boundaries
            if self.start_date is not None and self.start_date > publish_date:
                raise DropItem('DateFilter: %s: Article is too old: %s' % (item['url'], publish_date))
            elif self.end_date is not None and self.end_date < publish_date:
                raise DropItem('DateFilter: %s: Article is too young: %s ' % (item['url'], publish_date))
            else:
                return item 
Example #15
Source File: pipelines.py    From PyFeeds with GNU Affero General Public License v3.0
def process_item(self, item, spider):
        if "id" not in item:
            if "link" in item:
                item["id"] = uuid.uuid5(uuid.NAMESPACE_DNS, item["link"]).urn
            else:
                raise DropItem(
                    "A link is required to autogenerate the feed "
                    "id for: {}".format(item)
                )

        if "title" not in item:
            # Having a title is mandatory, so we use an empty string if none
            # is set.
            item["title"] = ""

        if isinstance(item, FeedEntryItem) and "updated" not in item:
            if "link" in item:
                item["updated"] = spider.cache.setdefault(
                    spider,
                    key="{}|updated".format(item["id"]),
                    default_obj=datetime.now(timezone.utc),
                )
            else:
                raise DropItem(
                    "A link is required to autogenerate the updated field "
                    "for: {}".format(item)
                )

        return item 
Example #16
Source File: pipelines.py    From news-please with Apache License 2.0
def process_item(self, item, spider):
        # For the case where something goes wrong
        if item['spider_response'].status != 200:
            # Item is no longer processed in the pipeline
            raise DropItem("%s: Non-200 response" % item['url'])
        else:
            return item 
Example #17
Source File: pipelines.py    From Agriculture_KnowledgeGraph with GNU General Public License v3.0
def process_item(self, item, spider):
        if item['title']:
            line = ""
            if(self.count > 0):
                line += ","
            line += json.dumps(dict(item),ensure_ascii=False) + "\n"
            self.file.write(line)
            self.count += 1
            print("count: "+str(self.count))
            return item
        else:
            raise DropItem("Ignoring item with no title!")
Example #18
Source File: pipelines.py    From bulletin-scraper with BSD 2-Clause "Simplified" License
def get_media_requests(self, item, info):
        url = item['url']
        if not url.lower().endswith('.msu'):
            raise DropItem('Item not an MSU')
        request = scrapy.Request(url)
        request.meta['bulletin'] = item['bulletin']
        yield request 
Example #19
Source File: pipelines.py    From NewsScrapy with GNU Lesser General Public License v3.0
def process_item(self, item, spider):
        item_keywords = judge_key_words(item)  # get the keywords that match this item
        if item_keywords:   # keep only items that contain a keyword
            item["keywords"] = item_keywords
            return item
        else:
            logger = logging.getLogger(spider.name)
            logger.info("No keyword in %s" % item["news_url"])
            raise DropItem("No keyword in %s" % item["news_url"]) 
Example #20
Source File: pipelines.py    From Agriculture_KnowledgeGraph with GNU General Public License v3.0
def process_item(self, item, spider):
        if item['title'] != 'error':   # 'error' is the title value assigned to pages that do not exist in Baike (our own convention)
            line = ""
            if(self.count > 0):
                line += ","
            line += json.dumps(dict(item),ensure_ascii=False) + '\n'
            self.file.write(line)
            self.count += 1
            cur = time.time()
            T = int(cur-self.start)
            print("page count: " + str(self.count) + "      time:" + str(int(T/3600)) + "h " + str(int(T/60)%60) + "m " + str(T%60) + "s......")
            return item
        else:
            raise DropItem("No matching page found in Baike!")
Example #21
Source File: clean.py    From kmanga with GNU General Public License v3.0
def _clean_field_list(self, field, cleaner=None,
                          cleaner_params=None, optional=False,
                          exclude=None, drop=False, max_length=None):
        """Generic clean method for list field."""
        if cleaner:
            cleaner_params = cleaner_params if cleaner_params else ()
            value = []
            for e in self._as_list(field):
                try:
                    c = cleaner(e, *cleaner_params)
                except DropItem:
                    # If the exception created by the cleaner function
                    # is DropItem and we are allowed to drop items, we
                    # drop it, else we re-raise the exception, dropping
                    # the full item container.
                    if not drop:
                        raise
                else:
                    value.append(c)
        else:
            value = [e.strip() for e in self._as_list(field)]
        if exclude:
            value = [e for e in value if e not in exclude]
        if max_length:
            value = [e[:max_length] for e in value]
        if not value and not optional:
            raise ValueError('field is not optional'
                             " or can't be converted to a list")
        return value 
Example #22
Source File: pipelines.py    From corpus-builder with MIT License
def process_item(self, item, spider):
        if item['body']:
            item['body'] = item['body'].strip()
            return item
        else:
            raise DropItem("Empty Body") 
Example #23
Source File: pipelines.py    From openslack-crawler with Apache License 2.0
def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        # print image_paths
        return item 
Example #24
Source File: image.py    From openslack-crawler with Apache License 2.0
def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item 
Example #25
Source File: pipelines.py    From hoaxy-backend with GNU General Public License v3.0
def process_item(self, item, spider):
        """Main function that process URL item (second phase)."""
        # canonicalize expanded URL without considering the status_code
        # because Scrapy crawling does not guarantee success,
        # we still try to canonicalize the URL
        if len(item['expanded']) > MAX_URL_LEN:
            item['expanded'] = item['expanded'][:MAX_URL_LEN]
            logger.error('Expanded URL too long, truncating it! %r', item['raw'])
        item['canonical'] = canonicalize(item['expanded'])
        if item['canonical'] is None:
            item['status_code'] = U_HTML_ERROR_INVALID_URL

        # if url could be canonicalized and if site_id is not determined
        # we infer it from the expanded url
        if item['status_code'] != U_HTML_ERROR_INVALID_URL\
                and item.get('site_id', None) is None:
            purl = get_parsed_url(item['expanded'])
            if purl is not None and purl.hostname is not None:
                if belongs_to_domain(purl.hostname, spider.excluded_domains)\
                        is not None:
                    item['status_code'] = U_HTML_ERROR_EXCLUDED_DOMAIN
                else:
                    item['site_id'] = belongs_to_site(purl.hostname,
                                                      self.site_tuples)
            else:
                item['status_code'] = U_HTML_ERROR_INVALID_URL
        # remove potential NUL byte \x00 in the HTML
        if 'html' in item:
            item['html'] = item['html'].replace(b'\x00', b'')
        try:
            # update database of url table
            spider.session.query(Url).filter_by(id=item['id'])\
                .update(dict(item), synchronize_session=False)
            spider.session.commit()
            logger.debug('Fetched html of url %r with status %i', item['raw'],
                         item['status_code'])
        except SQLAlchemyError as e:
            logger.error(e)
            spider.session.rollback()
            raise DropItem('Failed to update URL in database: %s' % item)
        return item 
Example #26
Source File: pipelines.py    From ajax_crawler with MIT License
def ensure_not_empty(self, item, field):
        if field in item:
            if item[field] == []:
                raise DropItem("Empty item found: %s" % item) 
Example #27
Source File: pipelines.py    From ajax_crawler with MIT License
def ensure_not_duplicate(self, spider, item, field):
        if field in item:
            if field not in self.duplicates[spider]:
                self.duplicates[spider][field] = set()
            if item[field] and type(item[field]) is list:
                if item[field][0] in self.duplicates[spider][field]:
                    raise DropItem("Duplicate item found: %s" % item)
                else:
                    self.duplicates[spider][field].add(item[field][0]) 
Example #28
Source File: pipelines.py    From scrape with MIT License
def item_dropped(self, item, response, exception, spider):
        # Called when a DropItem exception is raised
        spider.urls_dropped += 1
        logger.info(__name__ + " [Dropped] <Spider>: " + spider.name + " <Reason>: " + str(exception) + " <Link>: " + str(item['link'])) 
Example #29
Source File: pipelines.py    From scrape with MIT License
def process_item(self, item, spider):

        if not spider.postgres.checkConnection():
            raise CloseSpider("Unable to Establish a Database Connection")
        
        if spider.postgres.checkUrlExists(item['link']):
            raise DropItem("Url " + item['link'] + " Exists in Database")
        
        return item