Python scrapy.loader.ItemLoader() Examples

The following are 30 code examples of scrapy.loader.ItemLoader(). The original project and source file for each example are noted on the line above it. You may also want to check out all available functions and classes of the scrapy.loader module.
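Most of the examples below follow the same basic pattern: construct an ItemLoader bound to an item and a response, optionally set default input/output processors, add values through CSS or XPath selectors (or as literals), and finally call load_item(). As a minimal, self-contained sketch of that pattern (the item fields, spider name, start URL, and selectors are all hypothetical):

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst
from w3lib.html import remove_tags


class ProductItem(scrapy.Item):
    # hypothetical item used only for this sketch
    name = scrapy.Field()
    price = scrapy.Field()
    url = scrapy.Field()


class ProductSpider(scrapy.Spider):
    name = "product_sketch"
    start_urls = ["http://example.com/product"]  # placeholder URL

    def parse(self, response):
        loader = ItemLoader(item=ProductItem(), response=response)
        # strip HTML tags from every extracted value, keep only the first match
        loader.default_input_processor = MapCompose(remove_tags)
        loader.default_output_processor = TakeFirst()
        loader.add_css("name", "h1.product-name")                # hypothetical selector
        loader.add_xpath("price", "//p[@class='price']/text()")  # hypothetical selector
        loader.add_value("url", response.url)
        return loader.load_item()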
Example #1
Source File: 1fol_pag2scr.py    From scrapy-templates with MIT License
def populate_item(self, response):
        item_loader = ItemLoader(item=MySpiderItem(), response=response)
        item_loader.default_input_processor = MapCompose(remove_tags)

        # item_loader.add_css("", "")
        yield item_loader.load_item()

    # 3. PAGINATION LEVEL 1 
Example #2
Source File: post_pass_item.py    From scrapy-templates with MIT License
def parse(self, response):
        item_loader = ItemLoader(item=MyItem(), response=response)
        item_loader.default_input_processor = MapCompose(remove_tags)
        #item_loader.add_css("", "")
        #item_loader.add_css("", "")
        #item_loader.add_css("", "")
        yield FormRequest("POST_URL", formdata={'parameter': 'p'},
                                        meta={'item': item_loader.load_item()}, callback=self.populate_field) 
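The populate_field callback referenced above is not part of this excerpt. A hedged sketch, in the same fragment style as these templates (the commented field name and selector are placeholders): the callback recovers the partially loaded item from response.meta, wraps it in a new ItemLoader bound to the POST response, and finishes populating it.

def populate_field(self, response):
    # recover the item that parse() passed along via the request meta
    item = response.meta['item']
    item_loader = ItemLoader(item=item, response=response)
    item_loader.default_input_processor = MapCompose(remove_tags)
    # item_loader.add_css("my_field", "my-css-selector")  # placeholder field/selector
    yield item_loader.load_item()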
Example #3
Source File: price_crawler.py    From scrapy-templates with MIT License
def parse(self, response):
        item = PriceItem()
        item_loader = ItemLoader(item=item, response=response)
        item_loader.default_output_processor = TakeFirst()
        
        item_loader.add_css("price", self.price_css)
        item_loader.add_css("stock", self.stock_css)

        item_loader.add_value("product_id", response.meta.get("product_id"))
        item_loader.add_value("cron_id", self.cron_id)
        item_loader.add_value("shop_id", self.shop_id)
        item_loader.add_value("item_id", str(uuid.uuid1()))
        item_loader.add_value("updated", str(datetime.datetime.now()))
        item_loader.add_value("url", response.url)

        return item_loader.load_item()

    # 2. Updating database by calling the backend API 
Example #4
Source File: 1fol2fol_pag3scr.py    From scrapy-templates with MIT License
def populate_item(self, response):
        item_loader = ItemLoader(item=MySpiderItem(), response=response)
        item_loader.default_input_processor = MapCompose(remove_tags)

        #item_loader.add_css("field", "")
        yield item_loader.load_item()

    # 3. PAGINATION LEVEL 2 
Example #5
Source File: 1scr.py    From scrapy-templates with MIT License
def parse(self, response):
        item_loader = ItemLoader(item=MyItem(), response=response)
        item_loader.default_input_processor = MapCompose(remove_tags)
        item_loader.default_output_processor = TakeFirst()
        #
        #item_loader.add_css("my_field", "my_css")
        #item_loader.add_xpath("my_field", "my_xpath")
        #
        return item_loader.load_item() 
Example #6
Source File: 1fol2fol3scr.py    From scrapy-templates with MIT License
def populate_item(self, response):
        item_loader = ItemLoader(item=MySpiderItem(), response=response)
        item_loader.default_input_processor = MapCompose(remove_tags)

        #item_loader.add_css("", "")
        #item_loader.add_css("", "")

        yield item_loader.load_item() 
Example #7
Source File: 1fol2scr.py    From scrapy-templates with MIT License
def populate_item(self, response):
        item_loader = ItemLoader(item=MySpiderItem(), response=response)
        item_loader.default_input_processor = MapCompose(remove_tags)

        #item_loader.add_css("", "")
        #item_loader.add_css("", "")

        yield item_loader.load_item() 
Example #8
Source File: 1fol2scr_pag.py    From scrapy-templates with MIT License
def populate_item(self, response):
        item_loader = ItemLoader(item=MySpiderItem(), response=response)
        item_loader.default_input_processor = MapCompose(remove_tags)

        # item_loader.add_css("")
        # item_loader.add_value("raw", raw)

        # yield the populated item first
        yield item_loader.load_item()
        # then yield the function which paginates to another page that contains data
        yield self.paginate(response)

    # 3. PAGINATION LEVEL 2 
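The paginate helper yielded above is not shown in this excerpt. A plausible sketch under the same template conventions (the next-page selector is a placeholder): it follows the next-page link, if any, back into populate_item.

def paginate(self, response):
    # follow the next-page link, if present, and scrape it with populate_item
    next_page = response.css("li.next a::attr(href)").extract_first()  # placeholder selector
    if next_page:
        return scrapy.Request(response.urljoin(next_page), callback=self.populate_item)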
Example #9
Source File: sitemap_spider.py    From scrapy-templates with MIT License
def scrape_product(self, response):
        item_loader = ItemLoader(item=MyItem(), response=response)
        item_loader.default_input_processor = MapCompose(remove_tags)
        item_loader.default_output_processor = TakeFirst()

        item_loader.add_css("my_field", "selector")

        return item_loader.load_item() 
Example #10
Source File: main.py    From python-examples with MIT License
def parse(self,response):
        print('url:', response.url)
        articles = response.xpath('//div[@class="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls"]')
        for article in articles:

            if article.xpath('.//a[@class="button button--smaller button--chromeless u-baseColor--buttonNormal"]/@href').extract_first():
                l = ItemLoader(item = MediumItem(), selector = article)
                l.default_output_processor = scrapy.loader.processors.TakeFirst()
                l.add_css('Title','div > h3::text')
                l.add_xpath('Name','.//a[@class="ds-link ds-link--styleSubtle link link--darken link--accent u-accentColor--textNormal u-accentColor--textDarken"]/text()')
                l.add_css('Read','span::attr(title)')
                l.add_xpath('Publication', './/a[@class="ds-link ds-link--styleSubtle link--darken link--accent u-accentColor--textNormal"]/text()')
                l.add_xpath('Claps','.//button[@class="button button--chromeless u-baseColor--buttonNormal js-multirecommendCountButton u-disablePointerEvents"]/text()')
                l.add_xpath('Responses','.//a[@class="button button--chromeless u-baseColor--buttonNormal"]/text()')
                l.add_value('Page', response.url)
                yield l.load_item() 
Example #11
Source File: fbcrawl.py    From fbcrawl with Apache License 2.0
def parse_post(self,response):
        new = ItemLoader(item=FbcrawlItem(),response=response,parent=response.meta['item'])
        new.context['lang'] = self.lang           
        new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()")
        new.add_xpath('shared_from','//div[contains(@data-ft,"top_level_post_id") and contains(@data-ft,\'"isShare":1\')]/div/div[3]//strong/a/text()')
     #   new.add_xpath('date','//div/div/abbr/text()')
        new.add_xpath('text','//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()')
        
        #check reactions for old posts
        check_reactions = response.xpath("//a[contains(@href,'reaction/profile')]/div/div/text()").get()
        if not check_reactions:
            yield new.load_item()       
        else:
            new.add_xpath('reactions',"//a[contains(@href,'reaction/profile')]/div/div/text()")              
            reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href")
            reactions = response.urljoin(reactions[0].extract())
            yield scrapy.Request(reactions, callback=self.parse_reactions, meta={'item':new}) 
Example #12
Source File: traffic.py    From place2live with MIT License
def parse(self, response):
        """Scrape a list of world rankings, cities, countries, and congestion levels"""
        """Then populate an item with the data and return it"""
        # Base XPath used to extract the needed values
        xpath_selector = "//div[@id='RankingPage-table']//td[{}]"
        world_ranks = response.xpath(xpath_selector.format(1)).getall()
        cities = response.xpath(xpath_selector.format(3)).getall()
        countries = response.xpath(xpath_selector.format(4)).getall()
        congestion_levels = response.xpath(xpath_selector.format(5)).getall()
        for rank, city, country, level in zip(
            world_ranks, cities, countries, congestion_levels,
        ):
            i = ItemLoader(item=TrafficIndexItem())
            i.add_value("world_rank", rank)
            i.add_value("city", city)
            i.add_value("country", country)
            i.add_value("congestion_level", level)
            yield i.load_item() 
Example #13
Source File: quotes_spider.py    From scrapy-tutorial with MIT License
def parse(self, response):
        self.logger.info('Parse function called on {}'.format(response.url))
        # quotes = response.xpath("//div[@class='quote']")
        quotes = response.css('div.quote')

        for quote in quotes:
            loader = ItemLoader(item=QuoteItem(), selector=quote)
            # pay attention to the dot .// to use relative xpath
            # loader.add_xpath('quote_content', ".//span[@class='text']/text()")
            loader.add_css('quote_content', '.text::text')
            # loader.add_xpath('author', './/small//text()')
            loader.add_css('tags', '.tag::text')
            quote_item = loader.load_item()
            author_url = quote.css('.author + a::attr(href)').get()
            # go to the author page and pass the current collected quote info
            yield response.follow(author_url, self.parse_author, meta={'quote_item': quote_item})

        # go to Next page
        for a in response.css('li.next a'):
            yield response.follow(a, self.parse) 
Example #14
Source File: inflation.py    From place2live with MIT License
def parse(self, response):
        table = bs(response.body, "html.parser").findAll("tbody")[2]

        countries = [
            c["title"] for c in table.findAll("a", href=True, title=True)
        ]
        dateCandidates = table.findAll("td", {"data-sort-value": True})
        dates = [re.findall(r'\d+', yr)[0] for yr in dateCandidates]
        inflations = []
        for td in [i.replace("−", "-") for i in table.findAll("td")]:
            if isinstance(td, float):
                inflations.append(td)

        for inflation, country, date in zip(
            inflations, countries, dates,
        ):
            i = ItemLoader(item=InflationItem())
            i.add_value("inflation", inflation)
            i.add_value("date", date)
            i.add_value("country", country)
            yield i.load_item() 
Example #15
Source File: urls_from_db.py    From scrapy-templates with MIT License
def parse(self, response):
        item_loader = ItemLoader(item=MyItem(), response=response)

        #item_loader.add_css("", "")
        #item_loader.add_value("product_id", response.meta["product_id"])

        return item_loader.load_item() 
Example #16
Source File: main-itemloader.py    From python-examples with MIT License
def parse(self, response):
        for sel in response.xpath("//div[@class='list-group']//h3/a"):
            l = ItemLoader(item=ComicscraperItem(), selector=sel)
            l.add_xpath('title', './text()')
            l.add_xpath('link', './@href')

            request = scrapy.Request(sel.xpath('./@href').extract_first(), callback=self.parse_isbn, dont_filter=True)
            request.meta['l'] = l
            yield request 
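The parse_isbn callback is not included in this excerpt. A hedged sketch of the continuation (the 'isbn' field and its selector are guesses): it retrieves the loader stored in the request meta, adds the value found on the detail page, and loads the item.

def parse_isbn(self, response):
    # pick up the loader that parse() attached to the request meta
    l = response.meta['l']
    # placeholder selector for the ISBN on the detail page
    l.add_value('isbn', response.xpath("//dt[contains(., 'ISBN')]/following-sibling::dd/text()").extract_first())
    yield l.load_item()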
Example #17
Source File: textspider.py    From ARGUS with GNU General Public License v3.0
def parse(self, response):

        #initialize collector item which stores the website's content and meta data
        loader = ItemLoader(item=Collector())
        loader.add_value("dl_slot", response.request.meta.get('download_slot'))
        loader.add_value("redirect", self.checkRedirectDomain(response))
        loader.add_value("start_page", response.url)
        loader.add_value("start_domain", self.subdomainGetter(response))  
        loader.add_value("scraped_urls", [response.urljoin(response.url)])
        loader.add_value("scrape_counter", 1)
        loader.add_value("scraped_text", [self.extractText(response)])
        loader.add_value("error", "None")
        loader.add_value("ID", response.request.meta["ID"])

        #initialize the fingerprints set which stores all fingerprints of visited websites
        fingerprints = set()
        #add the fingerprints of the start_page
        fingerprints.add(request_fingerprint(response.request))
        
        #if there was an initial redirect, the new domain is added to the allowed domains
        domain = self.subdomainGetter(response)
        if domain not in self.allowed_domains:
            self.allowed_domains.append(domain)
            self.refreshAllowedDomains()

        #extract all urls from the page...
        urls = response.xpath("//a/@href").extract() + response.xpath("//frame/@src").extract() + response.xpath("//frameset/@src").extract()
        #...and save them to a urlstack
        urlstack = [response.urljoin(url) for url in urls]   
            
        #attach the urlstack, the loader, and the fingerprints to the response...        
        response.meta["urlstack"] = urlstack
        response.meta["loader"] = loader
        response.meta["fingerprints"] = fingerprints
        #...and send it over to the processURLstack function
        return self.processURLstack(response)
    
    
##################################################################
# PROCESS URL STACK
################################################################## 
Example #18
Source File: itemloader.py    From scrapy-bench with MIT License
def main():
    total = 0
    time = 0
    tar = tarfile.open("bookfiles.tar.gz")

    for member in tar.getmembers():
        f = tar.extractfile(member)
        html = f.read()
        response = HtmlResponse(url="local", body=html, encoding='utf8')

        for i in xrange(0, 10):
            start = timer()
            loader = ItemLoader(item=ItemloaderItem(), response=response)
            loader.add_xpath(
                'rating', '//*[@id="content_inner"]/article/div[1]/div[2]/p[3]/i[1]')
            loader.add_xpath(
                'title', '//*[@id=("content_inner")]/article/div[1]/div[2]/h1')
            loader.add_xpath(
                'price', '//*[@id=("content_inner")]/article/div[1]/div[2]/p[1]')
            loader.add_css('stock', '.product_main .instock.availability ::text')
            loader.add_css('category', 'ul.breadcrumb li:nth-last-child(2) ::text')
            loader.add_value('name', 'item {}'.format(i))
            loader.add_value('url', 'http://site.com/item{}'.format(i))
            product = loader.load_item()
            end = timer()

            total += 1
            time = time + end - start


    print("\nTotal number of items extracted = {0}".format(total))
    print("Time taken = {0}".format(time))
    click.secho("Rate of link extraction : {0} items/second\n".format(
        float(total / time)), bold=True)

    with open("Benchmark.txt", 'w') as g:
        g.write(" {0}".format((float(total / time)))) 
Example #19
Source File: rosi.py    From crawlers with GNU General Public License v3.0
def parse_item(self, response):
        il = ItemLoader(item=ImageItem(), response=response)
        il.add_css('image_urls', 'img::attr(src)')
        return il.load_item() 
Example #20
Source File: fbcrawl.py    From fbcrawl with Apache License 2.0
def parse_reactions(self,response):
        new = ItemLoader(item=FbcrawlItem(),response=response, parent=response.meta['item'])
        new.context['lang'] = self.lang           
        new.add_xpath('likes',"//a[contains(@href,'reaction_type=1')]/span/text()")
        new.add_xpath('ahah',"//a[contains(@href,'reaction_type=4')]/span/text()")
        new.add_xpath('love',"//a[contains(@href,'reaction_type=2')]/span/text()")
        new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()")
        new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()")
        new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()")     
        yield new.load_item() 
Example #21
Source File: profiles.py    From fbcrawl with Apache License 2.0
def parse_profile(self,response):
        new = ItemLoader(item=ProfileItem(),response=response)
        self.logger.info('Crawling profile info')
        new.add_xpath('name','//span/div/span/strong/text()')
        new.add_xpath('gender',"//div[@id='basic-info']//div[@title='Gender']//div/text()")
        new.add_xpath('birthday',"//div[@id='basic-info']//div[@title='Birthday']//div/text()")
        new.add_xpath('current_city',"//div[@id='living']//div[@title='Current City']//a/text()")
        new.add_xpath('hometown',"//div[@id='living']//div[@title='Hometown']//a/text()")
        new.add_xpath('work',"//div[@id='work']//a/text()")
        new.add_xpath('education',"//div[@id='education']//a/text()")
        new.add_xpath('interested_in',"//div[@id='interested-in']//div[not(contains(text(),'Interested In'))]/text()")
        new.add_xpath('page',"//div[@id='contact-info']//div[@title='Facebook']//div/text()")
        yield new.load_item() 
Example #22
Source File: quotes_spider.py    From scrapy-tutorial with MIT License
def parse_author(self, response):
        quote_item = response.meta['quote_item']
        loader = ItemLoader(item=quote_item, response=response)
        loader.add_css('author_name', '.author-title::text')
        loader.add_css('author_birthday', '.author-born-date::text')
        loader.add_css('author_bornlocation', '.author-born-location::text')
        loader.add_css('author_bio', '.author-description::text')
        yield loader.load_item() 
Example #23
Source File: event.py    From In2ItChicago with GNU General Public License v3.0
def __init__(self, *args, **kwargs):
        item_loader = ItemLoader(item=Event())
        for key, value in kwargs.items():
            try:
                item_loader.add_value(key, value)
            except KeyError:
                raise KeyError(f'{key} is not a valid event field')
        self.item = item_loader.load_item() 
Example #24
Source File: textspider.py    From ARGUS with GNU General Public License v3.0
def errorback(self, failure):
        loader = ItemLoader(item=Collector())
        if failure.check(HttpError):
            response = failure.value.response
            loader.add_value("dl_slot", response.request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", response.status)
            loader.add_value("ID", response.request.meta["ID"])
            yield loader.load_item()
        elif failure.check(DNSLookupError):
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", "DNS")
            loader.add_value("ID", request.meta["ID"])
            yield loader.load_item() 
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", "Timeout")
            loader.add_value("ID", request.meta["ID"])
            yield loader.load_item()
        else:
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", "other")
            loader.add_value("ID", request.meta["ID"])
            yield loader.load_item()


##################################################################
# MAIN PARSE
################################################################## 
Example #25
Source File: linkspider.py    From ARGUS with GNU General Public License v3.0
def errorback(self, failure):
        loader = ItemLoader(item=LinkCollector())
        if failure.check(HttpError):
            response = failure.value.response
            loader.add_value("dl_slot", response.request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", response.status)
            loader.add_value("ID", response.request.meta["ID"])
            loader.add_value("links", "")
            loader.add_value("alias", "")
            yield loader.load_item()
        elif failure.check(DNSLookupError):
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", "DNS")
            loader.add_value("ID", request.meta["ID"])
            loader.add_value("links", "")
            loader.add_value("alias", "")
            yield loader.load_item() 
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", "Timeout")
            loader.add_value("ID", request.meta["ID"])
            loader.add_value("links", "")
            loader.add_value("alias", "")
            yield loader.load_item()
        else:
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", "other")
            loader.add_value("ID", request.meta["ID"])
            loader.add_value("links", "")
            loader.add_value("alias", "")
            yield loader.load_item()


##################################################################
# MAIN PARSE
################################################################## 
Example #26
Source File: linkspider.py    From ARGUS with GNU General Public License v3.0
def parse(self, response):

        #initialize collector item which stores the website's content and meta data
        loader = ItemLoader(item=LinkCollector())
        loader.add_value("dl_slot", response.request.meta.get('download_slot'))
        loader.add_value("redirect", self.checkRedirectDomain(response))
        #add alias if there was an initial redirect
        if self.checkRedirectDomain(response):
            loader.add_value("alias", self.subdomainGetter(response).split("www.")[-1])
        else:
            loader.add_value("alias", "") 
        loader.add_value("start_page", response.url)
        loader.add_value("start_domain", self.subdomainGetter(response))  
        loader.add_value("scraped_urls", [response.urljoin(response.url)])
        loader.add_value("scrape_counter", 1)
        loader.add_value("error", "None")
        loader.add_value("ID", response.request.meta["ID"])
        loader.add_value("links", "")

        #initialize the fingerprints set which stores all fingerprints of visited websites
        fingerprints = set()
        #add the fingerprints of the start_page
        fingerprints.add(request_fingerprint(response.request))
        
        #if there was an initial redirect, the new domain is added to the allowed domains
        domain = self.subdomainGetter(response)
        if domain not in self.allowed_domains:
            self.allowed_domains.append(domain)
            self.refreshAllowedDomains()

        #extract all urls from the page...
        urls = response.xpath("//a/@href").extract() + response.xpath("//frame/@src").extract() + response.xpath("//frameset/@src").extract()
        #...and save them to a urlstack
        urlstack = [response.urljoin(url) for url in urls]
            
        #attach the urlstack, the loader, and the fingerprints to the response...        
        response.meta["urlstack"] = urlstack
        response.meta["loader"] = loader
        response.meta["fingerprints"] = fingerprints
        #...and send it over to the processURLstack function
        return self.processURLstack(response)
    
    
##################################################################
# PROCESS URL STACK
################################################################## 
Example #27
Source File: textspider.py    From ARGUS with GNU General Public License v3.0
def errorback(self, failure):
        loader = ItemLoader(item=Collector())
        if failure.check(HttpError):
            response = failure.value.response
            loader.add_value("dl_slot", response.request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("title", "")
            loader.add_value("description", "")
            loader.add_value("keywords", "")
            loader.add_value("error", response.status)
            loader.add_value("ID", response.request.meta["ID"])
            yield loader.load_item()
        elif failure.check(DNSLookupError):
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("title", "")
            loader.add_value("description", "")
            loader.add_value("keywords", "")
            loader.add_value("error", "DNS")
            loader.add_value("ID", request.meta["ID"])
            yield loader.load_item() 
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("title", "")
            loader.add_value("description", "")
            loader.add_value("keywords", "")
            loader.add_value("error", "Timeout")
            loader.add_value("ID", request.meta["ID"])
            yield loader.load_item()
        else:
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("title", "")
            loader.add_value("description", "")
            loader.add_value("keywords", "")
            loader.add_value("error", "other")
            loader.add_value("ID", request.meta["ID"])
            yield loader.load_item()


##################################################################
# MAIN PARSE
################################################################## 
Example #28
Source File: linkspider.py    From ARGUS with GNU General Public License v3.0
def errorback(self, failure):
        loader = ItemLoader(item=LinkCollector())
        if failure.check(HttpError):
            response = failure.value.response
            loader.add_value("dl_slot", response.request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", response.status)
            loader.add_value("ID", response.request.meta["ID"])
            loader.add_value("links", "")
            loader.add_value("alias", "")
            yield loader.load_item()
        elif failure.check(DNSLookupError):
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", "DNS")
            loader.add_value("ID", request.meta["ID"])
            loader.add_value("links", "")
            loader.add_value("alias", "")
            yield loader.load_item() 
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", "Timeout")
            loader.add_value("ID", request.meta["ID"])
            loader.add_value("links", "")
            loader.add_value("alias", "")
            yield loader.load_item()
        else:
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", "other")
            loader.add_value("ID", request.meta["ID"])
            loader.add_value("links", "")
            loader.add_value("alias", "")
            yield loader.load_item()


##################################################################
# MAIN PARSE
################################################################## 
Example #29
Source File: linkspider.py    From ARGUS with GNU General Public License v3.0
def parse(self, response):

        #initialize collector item which stores the website's content and meta data
        loader = ItemLoader(item=LinkCollector())
        loader.add_value("dl_slot", response.request.meta.get('download_slot'))
        loader.add_value("redirect", self.checkRedirectDomain(response))
        #add alias if there was an initial redirect
        if self.checkRedirectDomain(response):
            loader.add_value("alias", self.subdomainGetter(response).split("www.")[-1])
        else:
            loader.add_value("alias", "") 
        loader.add_value("start_page", response.url)
        loader.add_value("start_domain", self.subdomainGetter(response))  
        loader.add_value("scraped_urls", [response.urljoin(response.url)])
        loader.add_value("scrape_counter", 1)
        loader.add_value("error", "None")
        loader.add_value("ID", response.request.meta["ID"])
        loader.add_value("links", "")

        #initialize the fingerprints set which stores all fingerprints of visited websites
        fingerprints = set()
        #add the fingerprints of the start_page
        fingerprints.add(request_fingerprint(response.request))
        
        #if there was an initial redirect, the new domain is added to the allowed domains
        domain = self.subdomainGetter(response)
        if domain not in self.allowed_domains:
            self.allowed_domains.append(domain)
            self.refreshAllowedDomains()

        #extract all urls from the page...
        urls = response.xpath("//a/@href").extract() + response.xpath("//frame/@src").extract() + response.xpath("//frameset/@src").extract()
        #...and save them to a urlstack
        urlstack = [response.urljoin(url) for url in urls]
            
        #attach the urlstack, the loader, and the fingerprints to the response...        
        response.meta["urlstack"] = urlstack
        response.meta["loader"] = loader
        response.meta["fingerprints"] = fingerprints
        #...and send it over to the processURLstack function
        return self.processURLstack(response)
    
    
##################################################################
# PROCESS URL STACK
################################################################## 
Example #30
Source File: zhihu.py    From FunpySpiderSearchEngine with Apache License 2.0
def parse_question(self, response):
        # handle the question page and extract the specific question item from it
        if "QuestionHeader-title" in response.text:
            # handle the new page layout
            match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url)
            if match_obj:
                question_id = int(match_obj.group(2))

            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)

            item_loader.add_value("url_object_id", get_md5(response.url))
            item_loader.add_value("question_id", question_id)
            item_loader.add_css("title", "h1.QuestionHeader-title::text")
            item_loader.add_xpath("content", "//*[@id='root']/div/main/div/div[1]/div[2]"
                                             "/div[1]/div[1]/div[2]/div/div/div/span/text()")
            item_loader.add_css("topics", ".QuestionHeader-topics .Tag.QuestionTopic .Popover div::text")
            item_loader.add_css("answer_num", ".List-headerText span::text")
            item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
            # watch_user_num here contains both the watch and click counts; they are separated during data cleaning
            item_loader.add_css("watch_user_num", ".NumberBoard-itemValue ::text")
            item_loader.add_value("url", response.url)
            question_item = item_loader.load_item()
        else:
            # item extraction for the old page layout (the old pages seem to be gone; kept here just in case)
            match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url)
            if match_obj:
                question_id = int(match_obj.group(2))

            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
            item_loader.add_xpath("title",
                                  "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()")
            item_loader.add_css("content", "#zh-question-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", "#zh-question-answer-num::text")
            item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
            item_loader.add_xpath("watch_user_num", "//*[@id='zh-question-side-header-wrap']/text()|"
                                                    "//*[@class='zh-question-followers-sidebar']/div/a/strong/text()")
            item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

            question_item = item_loader.load_item()
        # send a request to the backend API for the specific answers
        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers,
                             callback=self.parse_answer)
        yield question_item