Python scrapy.Item() Examples
The following are 12 code examples of scrapy.Item().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module scrapy, or try the search function.
Example #1
Source File: test_pipelines.py From scrapy-cluster with MIT License | 6 votes |
def test_process_item(self):
    """Verify the pipeline logs known items at INFO level and unknown
    item classes via the warn logger.

    The logger methods are mocked with side effects so that reaching the
    expected logging call raises a sentinel exception we assert on.
    """
    item = self._get_item()
    spider = MagicMock()
    spider.name = "link"

    # Known item: pipeline should log it through logger.info.
    self.pipe.logger.info = MagicMock(side_effect=Exception("info"))
    # FIX: the original used try/except with self.assertFalse(True) as a
    # failure marker; assertRaises states the intent directly and gives a
    # useful failure message when no exception is raised.
    with self.assertRaises(Exception) as ctx:
        self.pipe.process_item(item, spider)
    self.assertEqual(str(ctx.exception), "info")

    # Unknown item class: pipeline should fall back to logger.warn.
    class WeirdItem(Item):
        pass

    item2 = WeirdItem()
    self.pipe.logger.warn = MagicMock(side_effect=Exception("warn"))
    with self.assertRaises(Exception) as ctx:
        self.pipe.process_item(item2, spider)
    self.assertEqual(str(ctx.exception), "warn")
Example #2
Source File: middlewares.py From scrapy-corenlp with BSD 2-Clause "Simplified" License | 6 votes |
def process_spider_output(self, response, result, spider):
    """Run Stanford NER over configured item fields and attach the
    accumulated entities to each item.

    ``self.field_to_process`` may be a list of field names (their values
    are joined with spaces) or a single field name; results that are not
    items/dicts, or items with an unusable field config, pass through
    unchanged.
    """
    for element in result:
        # Non-item results (e.g. Requests) pass through untouched.
        if not isinstance(element, (Item, dict)):
            yield element
            continue

        if isinstance(self.field_to_process, list):
            text = ' '.join(
                [element[field] for field in self.field_to_process]
            )
        elif isinstance(self.field_to_process, string_types):
            text = element[self.field_to_process]
        else:
            # BUG FIX: the original yielded here and then FELL THROUGH to
            # the tagging code below, re-yielding the element and using a
            # stale (or undefined) ``text``. Skip to the next element.
            yield element
            continue

        tagger = StanfordNERTagger(
            model_filename=self.classifier,
            path_to_jar=self.jar_file
        )
        token_entity_pairs = tagger.tag(tokens=self.tokenizer(s=text))
        accumulated = self.accumulate(token_entity_pairs)
        # setdefault: never clobber an already-populated output field.
        element.setdefault(self.output_field, accumulated)
        yield element
Example #3
Source File: middlewares.py From realestate-scraper with MIT License | 6 votes |
def process_spider_output(self, response, result, spider):
    """Stamp scraped items with a scrape date and a parsed update date.

    Called with the results returned from the Spider, after it has
    processed the response. Must return an iterable of Request, dict or
    Item objects.
    """
    ts = datetime.now()
    # Prefer the timestamp recorded when the response was first stored,
    # if earlier middleware put one into the response meta.
    stored_meta = response.meta.get('stored_meta')
    if stored_meta and 'timestamp' in stored_meta:
        ts = datetime.fromtimestamp(stored_meta['timestamp'])

    for i in result:
        if isinstance(i, (dict, Item)):
            # BUG FIX: the original first assigned the raw datetime and
            # then immediately overwrote it with the formatted string —
            # the first assignment was a dead store.
            i['scraped_time'] = ts.strftime('%d/%m/%Y')
            if 'DataAtualizacaoHumanizada' in i:
                # Portuguese humanized date ("há 2 dias"), resolved
                # relative to the scrape timestamp.
                updated = parse(i['DataAtualizacaoHumanizada'],
                                languages=['pt'],
                                settings={'RELATIVE_BASE': ts})
                i['updated_time'] = updated.strftime('%d/%m/%Y')
        yield i
Example #4
Source File: pipelines.py From spidermon with BSD 3-Clause "New" or "Revised" License | 5 votes |
def from_crawler(cls, crawler):
    """Build the validation pipeline from crawler settings.

    Returns a ``PassThroughPipeline`` when spidermon is disabled;
    otherwise loads JSON-schema and schematics validators from the
    settings and constructs the pipeline.

    :raises NotConfigured: on a bad settings type or when no validators
        could be loaded.
    """
    spidermon_enabled = crawler.settings.getbool("SPIDERMON_ENABLED")
    if not spidermon_enabled:
        return PassThroughPipeline()

    validators = defaultdict(list)
    allowed_types = (list, tuple, dict)

    def set_validators(loader, schema):
        # A bare list/tuple of schema paths applies to the generic Item.
        if isinstance(schema, (list, tuple)):
            schema = {Item: schema}
        for obj, paths in schema.items():
            key = obj.__name__
            paths = paths if isinstance(paths, (list, tuple)) else [paths]
            objects = [loader(v) for v in paths]
            validators[key].extend(objects)

    for loader, name in [
        (cls._load_jsonschema_validator, "SPIDERMON_VALIDATION_SCHEMAS"),
        (cls._load_schematics_validator, "SPIDERMON_VALIDATION_MODELS"),
    ]:
        res = crawler.settings.get(name)
        if not res:
            continue
        if not isinstance(res, allowed_types):
            # BUG FIX: the original implicit literal concatenation
            # produced "...list/tupleis required" (missing space).
            raise NotConfigured(
                "Invalid <{}> type for <{}> settings, dict or list/tuple "
                "is required".format(type(res), name)
            )
        set_validators(loader, res)

    if not validators:
        raise NotConfigured("No validators were found")

    return cls(
        validators=validators,
        stats=crawler.stats,
        drop_items_with_errors=crawler.settings.getbool(
            "SPIDERMON_VALIDATION_DROP_ITEMS_WITH_ERRORS"
        ),
        add_errors_to_items=crawler.settings.getbool(
            "SPIDERMON_VALIDATION_ADD_ERRORS_TO_ITEMS"
        ),
        errors_field=crawler.settings.get("SPIDERMON_VALIDATION_ERRORS_FIELD"),
    )
Example #5
Source File: pipelines.py From spidermon with BSD 3-Clause "New" or "Revised" License | 5 votes |
def find_validators(self, item):
    """Return the validators registered for *item*'s class, falling back
    to the validators registered for the generic ``Item`` base class."""
    # PEP 8 (E731) fix: the original bound a lambda to a name; a nested
    # def is the idiomatic equivalent.
    def find(cls):
        return self.validators.get(cls.__name__, [])
    return find(item.__class__) or find(Item)
Example #6
Source File: pipelines.py From spidermon with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _add_errors_to_item(self, item, errors): try: if self.errors_field not in item.__class__.fields: item.__class__.fields[self.errors_field] = Field() if self.errors_field not in item._values: item[self.errors_field] = defaultdict(list) except AttributeError: # The item is just a dict object instead of a Scrapy.Item object if self.errors_field not in item: item[self.errors_field] = defaultdict(list) for field_name, messages in errors.items(): item[self.errors_field][field_name] += messages
Example #7
Source File: test_pipeline.py From scrapy-jsonschema with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_default_item(self):
    """A schema-less bare Item must pass through the pipeline unchanged."""
    stats = self._get_stats_for_docs(valid_docs, True)
    pipeline = JsonSchemaValidatePipeline(stats)
    original = Item()
    processed = pipeline.process_item(original, None)
    assert original == processed
Example #8
Source File: cli.py From scrapy-autounit with BSD 3-Clause "New" or "Revised" License | 5 votes |
def parse_data(self, data):
    """Recursively normalize *data* into serialization-friendly values.

    Mappings (dicts and scrapy Items) and lists are walked recursively;
    bytes are decoded to text, datetimes become ISO-8601 strings, numbers
    pass through unchanged, and anything else is stringified.
    """
    if isinstance(data, (dict, scrapy.Item)):
        parsed = {}
        for key, value in data.items():
            parsed[self.parse_data(key)] = self.parse_data(value)
        return parsed
    if isinstance(data, list):
        return [self.parse_data(entry) for entry in data]
    if isinstance(data, bytes):
        return to_unicode(data)
    if isinstance(data, datetime):
        return data.isoformat()
    if isinstance(data, (int, float)):
        return data
    return str(data)
Example #9
Source File: introspection.py From ws-backend-community with GNU General Public License v3.0 | 5 votes |
def get_scrapy_item_classes():
    """
    Get a list of tuples containing (1) the class name and (2) the class
    for all of the Scrapy item classes defined in the crawling module.
    :return: A list of tuples containing (1) the class name and (2) the
    class for all of the Scrapy item classes defined in the crawling module.
    """
    # Imported inside the function, matching the original, to avoid
    # import cycles at module load time.
    import lib.inspection.web.crawling.item
    import scrapy

    found = IntrospectionHelper.get_all_classes_of_type(
        to_find=scrapy.Item,
        path="lib/inspection/web/crawling",
    )
    # Deduplicate before returning; NOTE set() does not preserve order.
    return list(set(found))
Example #10
Source File: middlewares.py From realestate-scraper with MIT License | 5 votes |
def process_spider_exception(self, response, exception, spider):
    """Spider-middleware hook invoked when the spider or another
    middleware's ``process_spider_input`` raises.

    Should return either None or an iterable of Response, dict or Item
    objects. Returning None lets Scrapy continue with its default
    exception handling; this middleware does not intervene.
    """
    return None
Example #11
Source File: items.py From OpenScraper with MIT License | 5 votes |
def create_item_class(class_name, fields_list):
    """Generic Item class creator populated from a list.

    :param class_name: name for the generated item class.
    :param fields_list: iterable of field names; each becomes a ``Field()``.
    :return: a new ``DictItem`` subclass exposing the requested fields.
    """
    # Idiom: dict comprehension replaces the manual accumulation loop.
    fields_dict = {field_name: Field() for field_name in fields_list}
    # str() guards against a unicode class_name on Python 2.
    return type(str(class_name), (DictItem,), {'fields': fields_dict})
Example #12
Source File: proxy.py From fp-server with MIT License | 5 votes |
def hmset_dict(self, key, item):
    """Store *item* (a dict or scrapy Item) in the redis hash *key*.

    :raises TypeError: if *item* is neither a dict nor an Item.
    :raises ValueError: if *item* is empty.
    :return: the redis client's HMSET result.
    """
    if not isinstance(item, (dict, Item)):
        raise TypeError("Error type: %s" % type(item))
    if not item:
        raise ValueError("item is empty")
    # Flatten {k1: v1, k2: v2} into (k1, v1, k2, v2) for HMSET.
    flattened = chain.from_iterable(item.items())
    return self.cli.hmset(key, *flattened)