Python scrapy.exceptions.DropItem() Examples

The following are 29 code examples of scrapy.exceptions.DropItem(), taken from open source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module scrapy.exceptions, or try the search function.
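Before the project examples, here is a minimal, generic sketch of the usual pattern: an item pipeline raises DropItem from process_item() to discard an item, and the pipeline is enabled through the ITEM_PIPELINES setting. The module path myproject.pipelines and the price check are hypothetical placeholders, not taken from any of the projects below.

from scrapy.exceptions import DropItem

class PriceValidationPipeline:
    """Illustrative pipeline: discard items that lack a usable price."""

    def process_item(self, item, spider):
        if not item.get('price'):
            # Raising DropItem stops further pipeline processing of this item
            # and fires the item_dropped signal.
            raise DropItem('Missing price in %s' % item)
        return item

# settings.py (hypothetical project layout)
# ITEM_PIPELINES = {'myproject.pipelines.PriceValidationPipeline': 300}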
Example #1
Source File: pipelines.py    From hoaxy-backend with GNU General Public License v3.0
def process_item(self, item, spider):
        """Main function that process URL item (first phase)."""
        # validate URL length
        if len(item['raw']) > MAX_URL_LEN:
            item['raw'] = item['raw'][:MAX_URL_LEN]
            logger.error('Raw URL too long, truncating it! %r', item['raw'])
        # parse raw URL
        purl = get_parsed_url(item['raw'])
        if purl is None or purl.hostname is None:
            raise DropItem('Invalid URL')
        site_id = belongs_to_site(purl.hostname, self.site_tuples)
        if site_id is None:
            raise DropItem('Offsite domain: %s' % item)
        item['site_id'] = site_id
        # insert URL into table
        try:
            get_or_create_murl(spider.session, item, spider.platform_id)
        except SQLAlchemyError as e:
            logger.error(e)
            spider.session.rollback()
            raise DropItem('Failed to insert URL into database: %s' % item)
        return item 
Example #2
Source File: scraper.py    From learn_python3_spider with MIT License
def _itemproc_finished(self, output, item, response, spider):
        """ItemProcessor finished for the given ``item`` and returned ``output``
        """
        self.slot.itemproc_size -= 1
        if isinstance(output, Failure):
            ex = output.value
            if isinstance(ex, DropItem):
                logkws = self.logformatter.dropped(item, ex, response, spider)
                logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
                return self.signals.send_catch_log_deferred(
                    signal=signals.item_dropped, item=item, response=response,
                    spider=spider, exception=output.value)
            else:
                logger.error('Error processing %(item)s', {'item': item},
                             exc_info=failure_to_exc_info(output),
                             extra={'spider': spider})
                return self.signals.send_catch_log_deferred(
                    signal=signals.item_error, item=item, response=response,
                    spider=spider, failure=output)
        else:
            logkws = self.logformatter.scraped(output, response, spider)
            logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
            return self.signals.send_catch_log_deferred(
                signal=signals.item_scraped, item=output, response=response,
                spider=spider) 
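The engine method above fires the item_dropped signal whenever a pipeline raises DropItem. As a rough sketch (not part of the learn_python3_spider project), an extension can subscribe to that signal via crawler.signals.connect; the handler receives the same item, response, spider, and exception arguments that are sent above. The dotted path in the comment is hypothetical.

from scrapy import signals

class DroppedItemLogger:
    """Illustrative extension that counts and logs dropped items."""

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        ext.stats = crawler.stats
        crawler.signals.connect(ext.item_dropped, signal=signals.item_dropped)
        return ext

    def item_dropped(self, item, response, exception, spider):
        self.stats.inc_value('custom/items_dropped')
        spider.logger.info('Dropped %r: %s', item, exception)

# settings.py (hypothetical dotted path)
# EXTENSIONS = {'myproject.extensions.DroppedItemLogger': 500}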
Example #3
Source File: pipeline.py    From scrapy-jsonschema with BSD 3-Clause "New" or "Revised" License
def process_item(self, item, spider):
        if not isinstance(item, JsonSchemaItem):
            return item

        errors = list(item.validator.iter_errors(dict(item)))
        paths_messages = []
        for error in errors:
            absolute_path = list(error.absolute_path)
            # error path is not available when required field is not filled
            # so we parse error message. Nasty.
            required_match = self.REQUIRED_RE.search(error.message)
            if required_match:
                absolute_path.append(required_match.group(1))
            path = '.'.join(map(str, absolute_path))
            self.stats.inc_value(self.STAT_FMT.format(field=path))
            paths_messages.append((path, error.message))
        if errors:
            error_msg = ''
            for path, message in paths_messages:
                error_msg += u'{}: {}\n'.format(path, message)
            raise DropItem(u'schema validation failed: \n {}'.format(error_msg))

        return item 
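The pipeline above leans on the jsonschema package's iter_errors() API; the comment in the code hints at why the required-field case is awkward. A small standalone sketch (schema and data invented for illustration) shows what error.absolute_path and error.message contain, and why a missing required property yields an empty path that has to be recovered from the message text:

from jsonschema import Draft7Validator

schema = {
    'type': 'object',
    'properties': {'name': {'type': 'string'}},
    'required': ['name'],
}
validator = Draft7Validator(schema)

for error in validator.iter_errors({'name': 123}):
    # path is "name", message is "123 is not of type 'string'"
    print('.'.join(map(str, error.absolute_path)), '->', error.message)

for error in validator.iter_errors({}):
    # path is empty here; the message is "'name' is a required property"
    print(list(error.absolute_path), '->', error.message)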
Example #4
Source File: pipelines.py    From snippet with MIT License
def item_completed(self, results, item, info):
        result = {}
        for n, r in enumerate(results):
            ok, x = r
            if ok:
                result[x["url"]] = x["path"]
            else:
                result[item[self.URLS_NAME][n]] = x.getErrorMessage()
        # TODO: Save the result

        # file_paths = [x['path'] for ok, x in results if ok]
        # if not file_paths:
        #     raise DropItem("Item contains no files")
        # item['image_paths'] = file_paths
        # return item

        return super(GroupDownPipelineMinix, self).item_completed(results, item, info) 
Example #5
Source File: pipelines.py    From PyFeeds with GNU Affero General Public License v3.0
def process_item(self, item, spider):
        def raise_if_missing(name, item):
            if name not in item:
                raise DropItem(
                    'The required field "{}" is missing in: {}.'.format(name, item)
                )

        # Required fields for all items
        for required in ("id", "title", "link"):
            raise_if_missing(required, item)

        # Required fields for FeedEntryItems
        if isinstance(item, FeedEntryItem):
            for required in ("updated",):
                raise_if_missing(required, item)

        return item 
Example #6
Source File: pipelines.py    From In2ItChicago with GNU General Public License v3.0
def process_item(self, item, spider):
        item['organization'] = spider.organization
        if 'event_time' in item:
            item['event_time']['date_format'] = spider.date_format
        loader = EventLoader(**item)
        # see if there is a custom filter for the item
        if not spider.item_filter(item):
            raise DropItem('Custom item filter did not allow this event')
        if 'event_time' in loader.item:
            time = loader.item['event_time']
            if self.time_utils.time_range_is_between(time['start_timestamp'], time['end_timestamp'], spider.start_timestamp, spider.end_timestamp):
                return loader.item
            else:
                raise DropItem('Event is not in the configured timeframe')
        else:
            return loader.item 
Example #7
Source File: assessment_spider.py    From assessor-scraper with MIT License
def parse(self, response):
        """
        Default callback function with response for the crawled url
        https://doc.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders.Spider.parse
        """
        response = response.replace(body=re.sub(r"<br\s*[\/]?>", "\n", response.body.decode('utf-8')))
        property_key = response.url.split('=')[1].replace('&', '')
        # logging.debug("Parsing property_key: %s", property_key)
        if 'No Data at this time' in response.text:
            msg = "No data for " + response.url
            logging.warning(msg)
            raise DropItem(msg)
        else:
            property_info = self.parse_property_info(response)
            property_values = self.parse_property_values(response)
            property_sales = self.parse_property_sales(response)
            property_info['sales'] = property_sales
            property_info['values'] = property_values
            property_info['property_key'] = property_key
            yield Property(property_info) 
Example #8
Source File: middlewares.py    From scrapy-autoextract with BSD 3-Clause "New" or "Revised" License
def process_exception(self, request, exception, spider):
        if isinstance(exception, (IgnoreRequest, DropItem)):
            return
        if not self._is_enabled_for_request(request):
            return

        autoextract = request.meta.pop(AUTOEXTRACT_META_KEY)
        stop_time = time.time()
        latency = time.time() - autoextract['timing']['start_ts']
        autoextract['timing'].update({'end_ts': stop_time, 'latency': latency})

        # Make sure to log all unknown failures
        logger.warning('AutoExtract failure after %.3fs for %s: %s',
                       latency,
                       autoextract['original_url'],
                       repr(exception),
                       extra={'spider': spider})

        request.meta['autoextract'] = autoextract
        ex_class = global_object_name(exception.__class__)
        self.inc_metric('autoextract/errors/total_count', spider=spider)
        self.inc_metric('autoextract/errors/type_count/%s' % ex_class, spider=spider) 
Example #9
Source File: pipelines.py    From news-please with Apache License 2.0
def process_item(self, item, spider):
        if spider.name in ['RssCrawler', 'GdeltCrawler']:
            # Search the CurrentVersion table for a version of the article
            try:
                self.cursor.execute(self.compare_versions, (item['url'],))
            except (pymysql.err.OperationalError, pymysql.ProgrammingError, pymysql.InternalError,
                    pymysql.IntegrityError, TypeError) as error:
                self.log.error("Something went wrong in rss query: %s", error)

            # Save the result of the query. Must be done before the add,
            #   otherwise the result will be overwritten in the buffer
            old_version = self.cursor.fetchone()

            if old_version is not None and (datetime.datetime.strptime(
                    item['download_date'], "%y-%m-%d %H:%M:%S") -
                                            old_version[3]) \
                    < datetime.timedelta(hours=self.delta_time):
                # Compare the two download dates. index 3 of old_version
                # corresponds to the download_date attribute in the DB
                raise DropItem("Article in DB too recent. Not saving.")

        return item 
Example #10
Source File: pipelines.py    From restaurant with MIT License
def process_item(self, item, spider):
        if spider.name not in ['meituan']:
            return item
        if self.filter_dic.get(item['restaurant_name']) == item['address']:
            print(item['restaurant_name'])
            print(item['address'])
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.filter_dic[item['restaurant_name']] = item['address']
            try:
                item['lng'], item['lat'] = gaode_to_baidu(float(item['lng']), float(item['lat']))
                item['province_code'] = pinyin.get(item['province'])
                item['city_code'] = pinyin.get(item['city'])
                item['region_code'] = pinyin.get(item['region'])
                item['area_code'] = pinyin.get(item['area'])
            except BaseException as e:
                print(e)
            return item 
Example #11
Source File: pipelines.py    From scrape with MIT License
def checkInvalidKeys(self, item):
        """ Checks Keys For Invalid Entries Such as None/Empty """        
        allowedKeys = {
            'None': ["image"],
            'Empty': ["image"]
        }
        for key in item:
            try:
                if (item[key] == None or item[key] == "Error") and key not in allowedKeys['None']:
                    raise DropItem("Required Key " + str(key) + " is None")

                if(type(item[key]) is str and key not in allowedKeys['Empty']):
                    if len(item[key]) == 0:
                        raise DropItem("Required Key " + str(key) + " is Empty")
            except DropItem:
                pass
            except Exception as e:
                logger.error(__name__ + " Exception: " + str(e))
                continue 
Example #12
Source File: pipelines.py    From SinaWeiboSpider with MIT License
def process_item(self, item, spider):
        collection_name = item.__class__.__name__
        try:
            self.db[collection_name].insert(dict(item))
        except DuplicateKeyError:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            return item 
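The pipeline above assumes a self.db handle is already available. A hedged sketch of how such a handle is usually created and torn down alongside the spider follows; the MONGO_URI and MONGO_DATABASE setting names are assumptions for illustration, not taken from SinaWeiboSpider.

import pymongo

class MongoConnectionMixin:
    """Illustrative open/close hooks for a pymongo-backed pipeline."""

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        pipeline.mongo_uri = crawler.settings.get('MONGO_URI', 'mongodb://localhost:27017')
        pipeline.mongo_db = crawler.settings.get('MONGO_DATABASE', 'scrapy')
        return pipeline

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

Note that recent PyMongo releases have removed Collection.insert() in favor of insert_one(), so the insert call in the example above may need updating on newer installations.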
Example #13
Source File: pipelines.py    From scrapy_xiuren with Apache License 2.0
def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]

        if not image_paths:
            print('{}/{}------failed to save the file'.format(item['image_dir'], item['image_url']))
            raise DropItem("Item contains no images")
        else:
            print('{}/{}------file saved successfully'.format(item['image_dir'], item['image_url']))
        return item 
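item_completed() above runs inside Scrapy's ImagesPipeline, which only produces results when the pipeline is enabled, told where to store files, and told which URLs to fetch. A rough sketch of those surrounding pieces follows; the class name, the single-URL get_media_requests() override (the item here stores its URL under image_url rather than the default image_urls field), and the dotted path are assumptions for illustration.

import scrapy
from scrapy.pipelines.images import ImagesPipeline

class SingleUrlImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # assuming 'image_url' holds a single URL string
        yield scrapy.Request(item['image_url'])

# settings.py (hypothetical dotted path)
# ITEM_PIPELINES = {'myproject.pipelines.SingleUrlImagePipeline': 1}
# IMAGES_STORE = '/tmp/images'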
Example #14
Source File: pipelines.py    From news-please with Apache License 2.0
def process_item(self, item, spider):

        # Check if date could be extracted
        if item['article_publish_date'] is None and self.strict_mode:
            raise DropItem('DateFilter: %s: Publishing date is missing and strict mode is enabled.' % item['url'])
        elif item['article_publish_date'] is None:
            return item
        else:
            # Create datetime object
            try:
                publish_date = datetime.datetime.strptime(str(item['article_publish_date']), '%Y-%m-%d %H:%M:%S')
            except ValueError as error:
                self.log.warning("DateFilter: Extracted date has the wrong format: %s - %s" %
                                 (item['article_publish_date'], item['url']))
                if self.strict_mode:
                    raise DropItem('DateFilter: %s: Dropped due to wrong date format: %s' %
                                   (item['url'], item['article_publish_date']))
                else:
                    return item
            # Check interval boundaries
            if self.start_date is not None and self.start_date > publish_date:
                raise DropItem('DateFilter: %s: Article is too old: %s' % (item['url'], publish_date))
            elif self.end_date is not None and self.end_date < publish_date:
                raise DropItem('DateFilter: %s: Article is too young: %s ' % (item['url'], publish_date))
            else:
                return item 
Example #15
Source File: pipelines.py    From PyFeeds with GNU Affero General Public License v3.0
def process_item(self, item, spider):
        if "id" not in item:
            if "link" in item:
                item["id"] = uuid.uuid5(uuid.NAMESPACE_DNS, item["link"]).urn
            else:
                raise DropItem(
                    "A link is required to autogenerate the feed "
                    "id for: {}".format(item)
                )

        if "title" not in item:
            # Having a title is mandatory, so we use an empty string if none
            # is set.
            item["title"] = ""

        if isinstance(item, FeedEntryItem) and "updated" not in item:
            if "link" in item:
                item["updated"] = spider.cache.setdefault(
                    spider,
                    key="{}|updated".format(item["id"]),
                    default_obj=datetime.now(timezone.utc),
                )
            else:
                raise DropItem(
                    "A link is required to autogenerate the updated field "
                    "for: {}".format(item)
                )

        return item 
Example #16
Source File: pipelines.py    From news-please with Apache License 2.0
def process_item(self, item, spider):
        # For the case where something goes wrong
        if item['spider_response'].status != 200:
            # Item is no longer processed in the pipeline
            raise DropItem("%s: Non-200 response" % item['url'])
        else:
            return item 
Example #17
Source File: pipelines.py    From Agriculture_KnowledgeGraph with GNU General Public License v3.0
def process_item(self, item, spider):
        if item['title']:
            line = ""
            if(self.count > 0):
                line += ","
            line += json.dumps(dict(item),ensure_ascii=False) + "\n"
            self.file.write(line)
            self.count += 1
            print("count: "+str(self.count))
            return item
        else:
            raise DropItem("Ignoring item with no title!")
Example #18
Source File: pipelines.py    From bulletin-scraper with BSD 2-Clause "Simplified" License
def get_media_requests(self, item, info):
        url = item['url']
        if not url.lower().endswith('.msu'):
            raise DropItem('Item not an MSU')
        request = scrapy.Request(url)
        request.meta['bulletin'] = item['bulletin']
        yield request 
Example #19
Source File: pipelines.py    From NewsScrapy with GNU Lesser General Public License v3.0
def process_item(self, item, spider):
        item_keywords = judge_key_words(item)  # get the keywords that match this item
        if item_keywords:   # keep only items that contain a keyword
            item["keywords"] = item_keywords
            return item
        else:
            logger = logging.getLogger(spider.name)
            logger.info("No keyword in %s" % item["news_url"])
            raise DropItem("No keyword in %s" % item["news_url"]) 
Example #20
Source File: pipelines.py    From Agriculture_KnowledgeGraph with GNU General Public License v3.0
def process_item(self, item, spider):
        if item['title'] != 'error':   # 'error' is the title value assigned to pages that do not exist in Baike (our own convention)
            line = ""
            if(self.count > 0):
                line += ","
            line += json.dumps(dict(item),ensure_ascii=False) + '\n'
            self.file.write(line)
            self.count += 1
            cur = time.time()
            T = int(cur-self.start)
            print("page count: " + str(self.count) + "      time:" + str(int(T/3600)) + "h " + str(int(T/60)%60) + "m " + str(T%60) + "s......")
            return item
        else:
            raise DropItem("No matching page found in Baike!")
Example #21
Source File: clean.py    From kmanga with GNU General Public License v3.0
def _clean_field_list(self, field, cleaner=None,
                          cleaner_params=None, optional=False,
                          exclude=None, drop=False, max_length=None):
        """Generic clean method for list field."""
        if cleaner:
            cleaner_params = cleaner_params if cleaner_params else ()
            value = []
            for e in self._as_list(field):
                try:
                    c = cleaner(e, *cleaner_params)
                except DropItem:
                    # If the exception created by the cleaner function
                    # is DropItem and we are allowed to drop items, we
                    # drop it, else we re-raise the exception, dropping
                    # the full item container.
                    if not drop:
                        raise
                else:
                    value.append(c)
        else:
            value = [e.strip() for e in self._as_list(field)]
        if exclude:
            value = [e for e in value if e not in exclude]
        if max_length:
            value = [e[:max_length] for e in value]
        if not value and not optional:
            raise ValueError('field is not optional'
                             " or can't be converted to a list")
        return value 
Example #22
Source File: pipelines.py    From corpus-builder with MIT License
def process_item(self, item, spider):
        if item['body']:
            item['body'] = item['body'].strip()
            return item
        else:
            raise DropItem("Empty Body") 
Example #23
Source File: pipelines.py    From openslack-crawler with Apache License 2.0
def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        # print image_paths
        return item 
Example #24
Source File: image.py    From openslack-crawler with Apache License 2.0
def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item 
Example #25
Source File: pipelines.py    From hoaxy-backend with GNU General Public License v3.0
def process_item(self, item, spider):
        """Main function that process URL item (second phase)."""
        # canonicalize expanded URL without considering the status_code
        # because Scrapy crawling does not guarantee success,
        # we still try to canonicalize the URL
        if len(item['expanded']) > MAX_URL_LEN:
            item['expanded'] = item['expanded'][:MAX_URL_LEN]
            logger.error('Expanded URL too long, truncating it! %r', item['raw'])
        item['canonical'] = canonicalize(item['expanded'])
        if item['canonical'] is None:
            item['status_code'] = U_HTML_ERROR_INVALID_URL

        # if url could be canonicalized and if site_id is not determined
        # we infer it from the expanded url
        if item['status_code'] != U_HTML_ERROR_INVALID_URL\
                and item.get('site_id', None) is None:
            purl = get_parsed_url(item['expanded'])
            if purl is not None and purl.hostname is not None:
                if belongs_to_domain(purl.hostname, spider.excluded_domains)\
                        is not None:
                    item['status_code'] = U_HTML_ERROR_EXCLUDED_DOMAIN
                else:
                    item['site_id'] = belongs_to_site(purl.hostname,
                                                      self.site_tuples)
            else:
                item['status_code'] = U_HTML_ERROR_INVALID_URL
        # remove potential NUL byte \x00 in the HTML
        if 'html' in item:
            item['html'] = item['html'].replace(b'\x00', b'')
        try:
            # update database of url table
            spider.session.query(Url).filter_by(id=item['id'])\
                .update(dict(item), synchronize_session=False)
            spider.session.commit()
            logger.debug('Fetched html of url %r with status %i', item['raw'],
                         item['status_code'])
        except SQLAlchemyError as e:
            logger.error(e)
            spider.session.rollback()
            raise DropItem('Failed to update URL in database: %s' % item)
        return item 
Example #26
Source File: pipelines.py    From ajax_crawler with MIT License
def ensure_not_empty(self, item, field):
        if field in item:
            if item[field] == []:
                raise DropItem("Empty item found: %s" % item) 
Example #27
Source File: pipelines.py    From ajax_crawler with MIT License
def ensure_not_duplicate(self, spider, item, field):
        if field in item:
            if field not in self.duplicates[spider]:
                self.duplicates[spider][field] = set()
            if item[field] and type(item[field]) is list:
                if item[field][0] in self.duplicates[spider][field]:
                    raise DropItem("Duplicate item found: %s" % item)
                else:
                    self.duplicates[spider][field].add(item[field][0]) 
Example #28
Source File: pipelines.py    From scrape with MIT License
def item_dropped(self, item, response, exception, spider):
        # Called when a DropItem exception is raised
        spider.urls_dropped += 1
        logger.info(__name__ + " [Dropped] <Spider>: " + spider.name + " <Reason>: " + str(exception) + " <Link>: " + str(item['link'])) 
Example #29
Source File: pipelines.py    From scrape with MIT License
def process_item(self, item, spider):

        if not spider.postgres.checkConnection():
            raise CloseSpider("Unable to Establish a Database Connection")
        
        if spider.postgres.checkUrlExists(item['link']):
            raise DropItem("Url " + item['link'] + " Exists in Database")
        
        return item