Python scrapy.loader.ItemLoader() Examples
The following are 30 code examples of scrapy.loader.ItemLoader(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module scrapy.loader, or try the search function.
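All of the examples below follow roughly the same pattern: build an ItemLoader around an item and a response (or selector), optionally set default input/output processors, populate fields with add_css(), add_xpath(), or add_value(), and finally call load_item(). The minimal sketch below illustrates that pattern in one self-contained spider; the ProductItem fields, the selectors, and the start URL are placeholders rather than parts of any example on this page, and the processors import path is the older scrapy.loader.processors location (newer releases expose the same classes via itemloaders.processors).

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst  # itemloaders.processors in newer Scrapy
from w3lib.html import remove_tags


class ProductItem(scrapy.Item):
    # hypothetical item used only for this sketch
    title = scrapy.Field()
    price = scrapy.Field()
    url = scrapy.Field()


class ProductSpider(scrapy.Spider):
    name = "itemloader_sketch"
    start_urls = ["https://example.com/product"]  # placeholder URL

    def parse(self, response):
        loader = ItemLoader(item=ProductItem(), response=response)
        loader.default_input_processor = MapCompose(remove_tags)  # strip HTML tags from every extracted value
        loader.default_output_processor = TakeFirst()             # keep only the first value per field
        loader.add_css("title", "h1::text")                       # fill a field from a CSS selector
        loader.add_xpath("price", "//p[@class='price']/text()")   # ...or from an XPath expression
        loader.add_value("url", response.url)                     # ...or from a literal value
        yield loader.load_item()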
Example #1
Source File: 1fol_pag2scr.py From scrapy-templates with MIT License | 6 votes |
def populate_item(self, response):
    item_loader = ItemLoader(item=MySpiderItem(), response=response)
    item_loader.default_input_processor = MapCompose(remove_tags)
    # item_loader.add_css("", "")
    yield item_loader.load_item()

# 3. PAGINATION LEVEL 1
Example #2
Source File: post_pass_item.py From scrapy-templates with MIT License | 6 votes |
def parse(self, response):
    item_loader = ItemLoader(item=MyItem(), response=response)
    item_loader.default_input_processor = MapCompose(remove_tags)
    #item_loader.add_css("", "")
    #item_loader.add_css("", "")
    #item_loader.add_css("", "")
    yield FormRequest("POST_URL",
                      formdata={'parameter': 'p'},
                      meta={'item': item_loader.load_item()},
                      callback=self.populate_field)
Example #3
Source File: price_crawler.py From scrapy-templates with MIT License | 6 votes |
def parse(self, response):
    item = PriceItem()
    item_loader = ItemLoader(item=item, response=response)
    item_loader.default_output_processor = TakeFirst()
    item_loader.add_css("price", self.price_css)
    item_loader.add_css("stock", self.stock_css)
    item_loader.add_value("product_id", response.meta.get("product_id"))
    item_loader.add_value("cron_id", self.cron_id)
    item_loader.add_value("shop_id", self.shop_id)
    item_loader.add_value("item_id", str(uuid.uuid1()))
    item_loader.add_value("updated", str(datetime.datetime.now()))
    item_loader.add_value("url", response.url)
    return item_loader.load_item()

# 2. Updating database by calling the backend API
Example #4
Source File: 1fol2fol_pag3scr.py From scrapy-templates with MIT License | 6 votes |
def populate_item(self, response):
    item_loader = ItemLoader(item=MySpiderItem(), response=response)
    item_loader.default_input_processor = MapCompose(remove_tags)
    #item_loader.add_css("field", "")
    yield item_loader.load_item()

# 3. PAGINATION LEVEL 2
Example #5
Source File: 1scr.py From scrapy-templates with MIT License | 6 votes |
def parse(self, response):
    item_loader = ItemLoader(item=MyItem(), response=response)
    item_loader.default_input_processor = MapCompose(remove_tags)
    item_loader.default_output_processor = TakeFirst()
    #
    #item_loader.add_css("my_field", "my_css")
    #item_loader.add_xpath("my_field", "my_xpath")
    #
    return item_loader.load_item()
Example #6
Source File: 1fol2fol3scr.py From scrapy-templates with MIT License | 6 votes |
def populate_item(self, response):
    item_loader = ItemLoader(item=MySpiderItem(), response=response)
    item_loader.default_input_processor = MapCompose(remove_tags)
    #item_loader.add_css("", "")
    #item_loader.add_css("", "")
    yield item_loader.load_item()
Example #7
Source File: 1fol2scr.py From scrapy-templates with MIT License | 6 votes |
def populate_item(self, response):
    item_loader = ItemLoader(item=MySpiderItem(), response=response)
    item_loader.default_input_processor = MapCompose(remove_tags)
    #item_loader.add_css("", "")
    #item_loader.add_css("", "")
    yield item_loader.load_item()
Example #8
Source File: 1fol2scr_pag.py From scrapy-templates with MIT License | 6 votes |
def populate_item(self, response):
    item_loader = ItemLoader(item=MySpiderItem(), response=response)
    item_loader.default_input_processor = MapCompose(remove_tags)
    # item_loader.add_css("")
    # item_loader.add_value("raw", raw)
    # yield the populated item first
    yield item_loader.load_item()
    # then yield the function which paginates to another page that contains data
    yield self.paginate(response)

# 3. PAGINATION LEVEL 2
Example #9
Source File: sitemap_spider.py From scrapy-templates with MIT License | 6 votes |
def scrape_product(self, response):
    item_loader = ItemLoader(item=MyItem(), response=response)
    item_loader.default_input_processor = MapCompose(remove_tags)
    item_loader.default_output_processor = TakeFirst()
    item_loader.add_css("my_field", "selector")
    return item_loader.load_item()
Example #10
Source File: main.py From python-examples with MIT License | 6 votes |
def parse(self, response):
    print('url:', response.url)
    articles = response.xpath('//div[@class="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls"]')
    for article in articles:
        if article.xpath('.//a[@class="button button--smaller button--chromeless u-baseColor--buttonNormal"]/@href').extract_first():
            l = ItemLoader(item=MediumItem(), selector=article)
            l.default_output_processor = scrapy.loader.processors.TakeFirst()
            l.add_css('Title', 'div > h3::text')
            l.add_xpath('Name', './/a[@class="ds-link ds-link--styleSubtle link link--darken link--accent u-accentColor--textNormal u-accentColor--textDarken"]/text()')
            l.add_css('Read', 'span::attr(title)')
            l.add_xpath('Publication', './/a[@class="ds-link ds-link--styleSubtle link--darkenlink--accent u-accentColor--textNormal"]/text()')
            l.add_xpath('Claps', './/button[@class="button button--chromeless u-baseColor--buttonNormal js-multirecommendCountButton u-disablePointerEvents"]/text()')
            l.add_xpath('Responses', './/a[@class="button button--chromeless u-baseColor--buttonNormal"]/text()')
            l.add_value('Page', response.url)
            yield l.load_item()
Example #11
Source File: fbcrawl.py From fbcrawl with Apache License 2.0 | 6 votes |
def parse_post(self, response):
    new = ItemLoader(item=FbcrawlItem(), response=response, parent=response.meta['item'])
    new.context['lang'] = self.lang
    new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()")
    new.add_xpath('shared_from', '//div[contains(@data-ft,"top_level_post_id") and contains(@data-ft,\'"isShare":1\')]/div/div[3]//strong/a/text()')
    # new.add_xpath('date','//div/div/abbr/text()')
    new.add_xpath('text', '//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()')

    #check reactions for old posts
    check_reactions = response.xpath("//a[contains(@href,'reaction/profile')]/div/div/text()").get()
    if not check_reactions:
        yield new.load_item()
    else:
        new.add_xpath('reactions', "//a[contains(@href,'reaction/profile')]/div/div/text()")
        reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href")
        reactions = response.urljoin(reactions[0].extract())
        yield scrapy.Request(reactions, callback=self.parse_reactions, meta={'item': new})
Example #12
Source File: traffic.py From place2live with MIT License | 6 votes |
def parse(self, response):
    """Scrape a list of world rankings, cities, countries, and congestion levels,
    then populate an item with the data and return it."""
    # Base XPath for extracting the needed values
    xpath_selector = "//div[@id='RankingPage-table']//td[{}]"
    world_ranks = response.xpath(xpath_selector.format(1)).getall()
    cities = response.xpath(xpath_selector.format(3)).getall()
    countries = response.xpath(xpath_selector.format(4)).getall()
    congestion_levels = response.xpath(xpath_selector.format(5)).getall()
    for rank, city, country, level in zip(
        world_ranks,
        cities,
        countries,
        congestion_levels,
    ):
        i = ItemLoader(item=TrafficIndexItem())
        i.add_value("world_rank", rank)
        i.add_value("city", city)
        i.add_value("country", country)
        i.add_value("congestion_level", level)
        yield i.load_item()
Example #13
Source File: quotes_spider.py From scrapy-tutorial with MIT License | 6 votes |
def parse(self, response):
    self.logger.info('Parse function called on {}'.format(response.url))
    # quotes = response.xpath("//div[@class='quote']")
    quotes = response.css('div.quote')
    for quote in quotes:
        loader = ItemLoader(item=QuoteItem(), selector=quote)
        # pay attention to the dot .// to use relative xpath
        # loader.add_xpath('quote_content', ".//span[@class='text']/text()")
        loader.add_css('quote_content', '.text::text')
        # loader.add_xpath('author', './/small//text()')
        loader.add_css('tags', '.tag::text')
        quote_item = loader.load_item()
        author_url = quote.css('.author + a::attr(href)').get()
        # go to the author page and pass the current collected quote info
        yield response.follow(author_url, self.parse_author, meta={'quote_item': quote_item})

    # go to Next page
    for a in response.css('li.next a'):
        yield response.follow(a, self.parse)
Example #14
Source File: inflation.py From place2live with MIT License | 6 votes |
def parse(self, response):
    table = bs(response.body, "html.parser").findAll("tbody")[2]
    countries = [
        c["title"] for c in table.findAll("a", href=True, title=True)
    ]
    dateCandidates = table.findAll("td", {"data-sort-value": True})
    dates = [re.findall(r'\d+', yr)[0] for yr in dateCandidates]
    inflations = []
    for td in [i.replace("−", "-") for i in table.findAll("td")]:
        if isinstance(td, float):
            inflations.append(td)
    for inflation, country, date in zip(
        inflations,
        countries,
        dates,
    ):
        i = ItemLoader(item=InflationItem())
        i.add_value("inflation", inflation)
        i.add_value("date", date)
        i.add_value("country", country)
        yield i.load_item()
Example #15
Source File: urls_from_db.py From scrapy-templates with MIT License | 5 votes |
def parse(self, response):
    item_loader = ItemLoader(item=MyItem(), response=response)
    #item_loader.add_css("", "")
    #item_loader.add_value("product_id", response.meta["product_id"])
    return item_loader.load_item()
Example #16
Source File: main-itemloader.py From python-examples with MIT License | 5 votes |
def parse(self, response):
    for sel in response.xpath("//div[@class='list-group']//h3/a"):
        l = ItemLoader(item=ComicscraperItem(), selector=sel)
        l.add_xpath('title', './text()')
        l.add_xpath('link', './@href')
        request = scrapy.Request(sel.xpath('./@href').extract_first(), callback=self.parse_isbn, dont_filter=True)
        request.meta['l'] = l
        yield request
Example #17
Source File: textspider.py From ARGUS with GNU General Public License v3.0 | 5 votes |
def parse(self, response):
    #initialize collector item which stores the website's content and meta data
    loader = ItemLoader(item=Collector())
    loader.add_value("dl_slot", response.request.meta.get('download_slot'))
    loader.add_value("redirect", self.checkRedirectDomain(response))
    loader.add_value("start_page", response.url)
    loader.add_value("start_domain", self.subdomainGetter(response))
    loader.add_value("scraped_urls", [response.urljoin(response.url)])
    loader.add_value("scrape_counter", 1)
    loader.add_value("scraped_text", [self.extractText(response)])
    loader.add_value("error", "None")
    loader.add_value("ID", response.request.meta["ID"])

    #initialize the fingerprints set which stores all fingerprints of visited websites
    fingerprints = set()
    #add the fingerprints of the start_page
    fingerprints.add(request_fingerprint(response.request))

    #if there was an initial redirect, the new domain is added to the allowed domains
    domain = self.subdomainGetter(response)
    if domain not in self.allowed_domains:
        self.allowed_domains.append(domain)
        self.refreshAllowedDomains()

    #extract all urls from the page...
    urls = response.xpath("//a/@href").extract() + response.xpath("//frame/@src").extract() + response.xpath("//frameset/@src").extract()
    #...and save them to a urlstack
    urlstack = [response.urljoin(url) for url in urls]

    #attach the urlstack, the loader, and the fingerprints to the response...
    response.meta["urlstack"] = urlstack
    response.meta["loader"] = loader
    response.meta["fingerprints"] = fingerprints

    #...and send it over to the processURLstack function
    return self.processURLstack(response)

##################################################################
# PROCESS URL STACK
##################################################################
Example #18
Source File: itemloader.py From scrapy-bench with MIT License | 5 votes |
def main():
    total = 0
    time = 0
    tar = tarfile.open("bookfiles.tar.gz")
    for member in tar.getmembers():
        f = tar.extractfile(member)
        html = f.read()
        response = HtmlResponse(url="local", body=html, encoding='utf8')
        for i in xrange(0, 10):
            start = timer()
            loader = ItemLoader(item=ItemloaderItem(), response=response)
            loader.add_xpath('rating', '//*[@id="content_inner"]/article/div[1]/div[2]/p[3]/i[1]')
            loader.add_xpath('title', '//*[@id=("content_inner")]/article/div[1]/div[2]/h1')
            loader.add_xpath('price', '//*[@id=("content_inner")]/article/div[1]/div[2]/p[1]')
            loader.add_css('stock', '.product_main .instock.availability ::text')
            loader.add_css('category', 'ul.breadcrumb li:nth-last-child(2) ::text')
            loader.add_value('name', 'item {}'.format(i))
            loader.add_value('url', 'http://site.com/item{}'.format(i))
            product = loader.load_item()
            end = timer()
            total += 1
            time = time + end - start

    print("\nTotal number of items extracted = {0}".format(total))
    print("Time taken = {0}".format(time))
    click.secho("Rate of link extraction : {0} items/second\n".format(float(total / time)), bold=True)

    with open("Benchmark.txt", 'w') as g:
        g.write(" {0}".format((float(total / time))))
Example #19
Source File: rosi.py From crawlers with GNU General Public License v3.0 | 5 votes |
def parse_item(self, response):
    il = ItemLoader(item=ImageItem(), response=response)
    il.add_css('image_urls', 'img::attr(src)')
    return il.load_item()
Example #20
Source File: fbcrawl.py From fbcrawl with Apache License 2.0 | 5 votes |
def parse_reactions(self, response):
    new = ItemLoader(item=FbcrawlItem(), response=response, parent=response.meta['item'])
    new.context['lang'] = self.lang
    new.add_xpath('likes', "//a[contains(@href,'reaction_type=1')]/span/text()")
    new.add_xpath('ahah', "//a[contains(@href,'reaction_type=4')]/span/text()")
    new.add_xpath('love', "//a[contains(@href,'reaction_type=2')]/span/text()")
    new.add_xpath('wow', "//a[contains(@href,'reaction_type=3')]/span/text()")
    new.add_xpath('sigh', "//a[contains(@href,'reaction_type=7')]/span/text()")
    new.add_xpath('grrr', "//a[contains(@href,'reaction_type=8')]/span/text()")
    yield new.load_item()
Example #21
Source File: profiles.py From fbcrawl with Apache License 2.0 | 5 votes |
def parse_profile(self, response):
    new = ItemLoader(item=ProfileItem(), response=response)
    self.logger.info('Crawling profile info')
    new.add_xpath('name', '//span/div/span/strong/text()')
    new.add_xpath('gender', "//div[@id='basic-info']//div[@title='Gender']//div/text()")
    new.add_xpath('birthday', "//div[@id='basic-info']//div[@title='Birthday']//div/text()")
    new.add_xpath('current_city', "//div[@id='living']//div[@title='Current City']//a/text()")
    new.add_xpath('hometown', "//div[@id='living']//div[@title='Hometown']//a/text()")
    new.add_xpath('work', "//div[@id='work']//a/text()")
    new.add_xpath('education', "//div[@id='education']//a/text()")
    new.add_xpath('interested_in', "//div[@id='interested-in']//div[not(contains(text(),'Interested In'))]/text()")
    new.add_xpath('page', "//div[@id='contact-info']//div[@title='Facebook']//div/text()")
    yield new.load_item()
Example #22
Source File: quotes_spider.py From scrapy-tutorial with MIT License | 5 votes |
def parse_author(self, response):
    quote_item = response.meta['quote_item']
    loader = ItemLoader(item=quote_item, response=response)
    loader.add_css('author_name', '.author-title::text')
    loader.add_css('author_birthday', '.author-born-date::text')
    loader.add_css('author_bornlocation', '.author-born-location::text')
    loader.add_css('author_bio', '.author-description::text')
    yield loader.load_item()
Example #23
Source File: event.py From In2ItChicago with GNU General Public License v3.0 | 5 votes |
def __init__(self, *args, **kwargs):
    item_loader = ItemLoader(item=Event())
    for key, value in kwargs.items():
        try:
            item_loader.add_value(key, value)
        except KeyError:
            raise KeyError(f'{key} is not a valid event field')
    self.item = item_loader.load_item()
Example #24
Source File: textspider.py From ARGUS with GNU General Public License v3.0 | 4 votes |
def errorback(self, failure):
    loader = ItemLoader(item=Collector())
    if failure.check(HttpError):
        response = failure.value.response
        loader.add_value("dl_slot", response.request.meta.get('download_slot'))
        loader.add_value("start_page", "")
        loader.add_value("scraped_urls", "")
        loader.add_value("redirect", [None])
        loader.add_value("scraped_text", "")
        loader.add_value("error", response.status)
        loader.add_value("ID", response.request.meta["ID"])
        yield loader.load_item()
    elif failure.check(DNSLookupError):
        request = failure.request
        loader.add_value("dl_slot", request.meta.get('download_slot'))
        loader.add_value("start_page", "")
        loader.add_value("scraped_urls", "")
        loader.add_value("redirect", [None])
        loader.add_value("scraped_text", "")
        loader.add_value("error", "DNS")
        loader.add_value("ID", request.meta["ID"])
        yield loader.load_item()
    elif failure.check(TimeoutError, TCPTimedOutError):
        request = failure.request
        loader.add_value("dl_slot", request.meta.get('download_slot'))
        loader.add_value("start_page", "")
        loader.add_value("scraped_urls", "")
        loader.add_value("redirect", [None])
        loader.add_value("scraped_text", "")
        loader.add_value("error", "Timeout")
        loader.add_value("ID", request.meta["ID"])
        yield loader.load_item()
    else:
        request = failure.request
        loader.add_value("dl_slot", request.meta.get('download_slot'))
        loader.add_value("start_page", "")
        loader.add_value("scraped_urls", "")
        loader.add_value("redirect", [None])
        loader.add_value("scraped_text", "")
        loader.add_value("error", "other")
        loader.add_value("ID", request.meta["ID"])
        yield loader.load_item()

##################################################################
# MAIN PARSE
##################################################################
Example #25
Source File: linkspider.py From ARGUS with GNU General Public License v3.0 | 4 votes |
def errorback(self, failure):
    loader = ItemLoader(item=LinkCollector())
    if failure.check(HttpError):
        response = failure.value.response
        loader.add_value("dl_slot", response.request.meta.get('download_slot'))
        loader.add_value("start_page", "")
        loader.add_value("scraped_urls", "")
        loader.add_value("redirect", [None])
        loader.add_value("scraped_text", "")
        loader.add_value("error", response.status)
        loader.add_value("ID", response.request.meta["ID"])
        loader.add_value("links", "")
        loader.add_value("alias", "")
        yield loader.load_item()
    elif failure.check(DNSLookupError):
        request = failure.request
        loader.add_value("dl_slot", request.meta.get('download_slot'))
        loader.add_value("start_page", "")
        loader.add_value("scraped_urls", "")
        loader.add_value("redirect", [None])
        loader.add_value("scraped_text", "")
        loader.add_value("error", "DNS")
        loader.add_value("ID", request.meta["ID"])
        loader.add_value("links", "")
        loader.add_value("alias", "")
        yield loader.load_item()
    elif failure.check(TimeoutError, TCPTimedOutError):
        request = failure.request
        loader.add_value("dl_slot", request.meta.get('download_slot'))
        loader.add_value("start_page", "")
        loader.add_value("scraped_urls", "")
        loader.add_value("redirect", [None])
        loader.add_value("scraped_text", "")
        loader.add_value("error", "Timeout")
        loader.add_value("ID", request.meta["ID"])
        loader.add_value("links", "")
        loader.add_value("alias", "")
        yield loader.load_item()
    else:
        request = failure.request
        loader.add_value("dl_slot", request.meta.get('download_slot'))
        loader.add_value("start_page", "")
        loader.add_value("scraped_urls", "")
        loader.add_value("redirect", [None])
        loader.add_value("scraped_text", "")
        loader.add_value("error", "other")
        loader.add_value("ID", request.meta["ID"])
        loader.add_value("links", "")
        loader.add_value("alias", "")
        yield loader.load_item()

##################################################################
# MAIN PARSE
##################################################################
Example #26
Source File: linkspider.py From ARGUS with GNU General Public License v3.0 | 4 votes |
def parse(self, response):
    #initialize collector item which stores the website's content and meta data
    loader = ItemLoader(item=LinkCollector())
    loader.add_value("dl_slot", response.request.meta.get('download_slot'))
    loader.add_value("redirect", self.checkRedirectDomain(response))
    #add alias if there was an initial redirect
    if self.checkRedirectDomain(response):
        loader.add_value("alias", self.subdomainGetter(response).split("www.")[-1])
    else:
        loader.add_value("alias", "")
    loader.add_value("start_page", response.url)
    loader.add_value("start_domain", self.subdomainGetter(response))
    loader.add_value("scraped_urls", [response.urljoin(response.url)])
    loader.add_value("scrape_counter", 1)
    loader.add_value("error", "None")
    loader.add_value("ID", response.request.meta["ID"])
    loader.add_value("links", "")

    #initialize the fingerprints set which stores all fingerprints of visited websites
    fingerprints = set()
    #add the fingerprints of the start_page
    fingerprints.add(request_fingerprint(response.request))

    #if there was an initial redirect, the new domain is added to the allowed domains
    domain = self.subdomainGetter(response)
    if domain not in self.allowed_domains:
        self.allowed_domains.append(domain)
        self.refreshAllowedDomains()

    #extract all urls from the page...
    urls = response.xpath("//a/@href").extract() + response.xpath("//frame/@src").extract() + response.xpath("//frameset/@src").extract()
    #...and save them to a urlstack
    urlstack = [response.urljoin(url) for url in urls]

    #attach the urlstack, the loader, and the fingerprints to the response...
    response.meta["urlstack"] = urlstack
    response.meta["loader"] = loader
    response.meta["fingerprints"] = fingerprints

    #...and send it over to the processURLstack function
    return self.processURLstack(response)

##################################################################
# PROCESS URL STACK
##################################################################
Example #27
Source File: textspider.py From ARGUS with GNU General Public License v3.0 | 4 votes |
def errorback(self, failure):
    loader = ItemLoader(item=Collector())
    if failure.check(HttpError):
        response = failure.value.response
        loader.add_value("dl_slot", response.request.meta.get('download_slot'))
        loader.add_value("start_page", "")
        loader.add_value("scraped_urls", "")
        loader.add_value("redirect", [None])
        loader.add_value("scraped_text", "")
        loader.add_value("title", "")
        loader.add_value("description", "")
        loader.add_value("keywords", "")
        loader.add_value("error", response.status)
        loader.add_value("ID", response.request.meta["ID"])
        yield loader.load_item()
    elif failure.check(DNSLookupError):
        request = failure.request
        loader.add_value("dl_slot", request.meta.get('download_slot'))
        loader.add_value("start_page", "")
        loader.add_value("scraped_urls", "")
        loader.add_value("redirect", [None])
        loader.add_value("scraped_text", "")
        loader.add_value("title", "")
        loader.add_value("description", "")
        loader.add_value("keywords", "")
        loader.add_value("error", "DNS")
        loader.add_value("ID", request.meta["ID"])
        yield loader.load_item()
    elif failure.check(TimeoutError, TCPTimedOutError):
        request = failure.request
        loader.add_value("dl_slot", request.meta.get('download_slot'))
        loader.add_value("start_page", "")
        loader.add_value("scraped_urls", "")
        loader.add_value("redirect", [None])
        loader.add_value("scraped_text", "")
        loader.add_value("title", "")
        loader.add_value("description", "")
        loader.add_value("keywords", "")
        loader.add_value("error", "Timeout")
        loader.add_value("ID", request.meta["ID"])
        yield loader.load_item()
    else:
        request = failure.request
        loader.add_value("dl_slot", request.meta.get('download_slot'))
        loader.add_value("start_page", "")
        loader.add_value("scraped_urls", "")
        loader.add_value("redirect", [None])
        loader.add_value("scraped_text", "")
        loader.add_value("title", "")
        loader.add_value("description", "")
        loader.add_value("keywords", "")
        loader.add_value("error", "other")
        loader.add_value("ID", request.meta["ID"])
        yield loader.load_item()

##################################################################
# MAIN PARSE
##################################################################
Example #28
Source File: linkspider.py From ARGUS with GNU General Public License v3.0 | 4 votes |
def errorback(self, failure):
    loader = ItemLoader(item=LinkCollector())
    if failure.check(HttpError):
        response = failure.value.response
        loader.add_value("dl_slot", response.request.meta.get('download_slot'))
        loader.add_value("start_page", "")
        loader.add_value("scraped_urls", "")
        loader.add_value("redirect", [None])
        loader.add_value("scraped_text", "")
        loader.add_value("error", response.status)
        loader.add_value("ID", response.request.meta["ID"])
        loader.add_value("links", "")
        loader.add_value("alias", "")
        yield loader.load_item()
    elif failure.check(DNSLookupError):
        request = failure.request
        loader.add_value("dl_slot", request.meta.get('download_slot'))
        loader.add_value("start_page", "")
        loader.add_value("scraped_urls", "")
        loader.add_value("redirect", [None])
        loader.add_value("scraped_text", "")
        loader.add_value("error", "DNS")
        loader.add_value("ID", request.meta["ID"])
        loader.add_value("links", "")
        loader.add_value("alias", "")
        yield loader.load_item()
    elif failure.check(TimeoutError, TCPTimedOutError):
        request = failure.request
        loader.add_value("dl_slot", request.meta.get('download_slot'))
        loader.add_value("start_page", "")
        loader.add_value("scraped_urls", "")
        loader.add_value("redirect", [None])
        loader.add_value("scraped_text", "")
        loader.add_value("error", "Timeout")
        loader.add_value("ID", request.meta["ID"])
        loader.add_value("links", "")
        loader.add_value("alias", "")
        yield loader.load_item()
    else:
        request = failure.request
        loader.add_value("dl_slot", request.meta.get('download_slot'))
        loader.add_value("start_page", "")
        loader.add_value("scraped_urls", "")
        loader.add_value("redirect", [None])
        loader.add_value("scraped_text", "")
        loader.add_value("error", "other")
        loader.add_value("ID", request.meta["ID"])
        loader.add_value("links", "")
        loader.add_value("alias", "")
        yield loader.load_item()

##################################################################
# MAIN PARSE
##################################################################
Example #29
Source File: linkspider.py From ARGUS with GNU General Public License v3.0 | 4 votes |
def parse(self, response):
    #initialize collector item which stores the website's content and meta data
    loader = ItemLoader(item=LinkCollector())
    loader.add_value("dl_slot", response.request.meta.get('download_slot'))
    loader.add_value("redirect", self.checkRedirectDomain(response))
    #add alias if there was an initial redirect
    if self.checkRedirectDomain(response):
        loader.add_value("alias", self.subdomainGetter(response).split("www.")[-1])
    else:
        loader.add_value("alias", "")
    loader.add_value("start_page", response.url)
    loader.add_value("start_domain", self.subdomainGetter(response))
    loader.add_value("scraped_urls", [response.urljoin(response.url)])
    loader.add_value("scrape_counter", 1)
    loader.add_value("error", "None")
    loader.add_value("ID", response.request.meta["ID"])
    loader.add_value("links", "")

    #initialize the fingerprints set which stores all fingerprints of visited websites
    fingerprints = set()
    #add the fingerprints of the start_page
    fingerprints.add(request_fingerprint(response.request))

    #if there was an initial redirect, the new domain is added to the allowed domains
    domain = self.subdomainGetter(response)
    if domain not in self.allowed_domains:
        self.allowed_domains.append(domain)
        self.refreshAllowedDomains()

    #extract all urls from the page...
    urls = response.xpath("//a/@href").extract() + response.xpath("//frame/@src").extract() + response.xpath("//frameset/@src").extract()
    #...and save them to a urlstack
    urlstack = [response.urljoin(url) for url in urls]

    #attach the urlstack, the loader, and the fingerprints to the response...
    response.meta["urlstack"] = urlstack
    response.meta["loader"] = loader
    response.meta["fingerprints"] = fingerprints

    #...and send it over to the processURLstack function
    return self.processURLstack(response)

##################################################################
# PROCESS URL STACK
##################################################################
Example #30
Source File: zhihu.py From FunpySpiderSearchEngine with Apache License 2.0 | 4 votes |
def parse_question(self, response):
    # Handle the question page and extract the concrete question item from it
    if "QuestionHeader-title" in response.text:
        # Handle the new page layout
        match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url)
        if match_obj:
            question_id = int(match_obj.group(2))
            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
            item_loader.add_value("url_object_id", get_md5(response.url))
            item_loader.add_value("question_id", question_id)
            item_loader.add_css("title", "h1.QuestionHeader-title::text")
            item_loader.add_xpath("content", "//*[@id='root']/div/main/div/div[1]/div[2]"
                                             "/div[1]/div[1]/div[2]/div/div/div/span/text()")
            item_loader.add_css("topics", ".QuestionHeader-topics .Tag.QuestionTopic .Popover div::text")
            item_loader.add_css("answer_num", ".List-headerText span::text")
            item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
            # watch_user_num here contains both the watch and click counts; they are separated during data cleaning
            item_loader.add_css("watch_user_num", ".NumberBoard-itemValue ::text")
            item_loader.add_value("url", response.url)
            question_item = item_loader.load_item()
    else:
        # Item extraction for the old page layout (the old pages seem to be gone, but this is kept as a fallback)
        match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url)
        if match_obj:
            question_id = int(match_obj.group(2))
            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
            item_loader.add_xpath("title", "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()")
            item_loader.add_css("content", "#zh-question-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", "#zh-question-answer-num::text")
            item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
            item_loader.add_xpath("watch_user_num", "//*[@id='zh-question-side-header-wrap']/text()|"
                                                    "//*[@class='zh-question-followers-sidebar']/div/a/strong/text()")
            item_loader.add_css("topics", ".zm-tag-editor-labels a::text")
            question_item = item_loader.load_item()

    # Send a request to the backend API that returns the concrete answers
    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                         headers=self.headers, callback=self.parse_answer)
    yield question_item