Python scrapy.loader Examples

The following are 16 code examples of the scrapy.loader module, drawn from open-source projects. The source file, project, and license are noted above each example.
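
Before the examples: the central class in scrapy.loader is ItemLoader, which collects values via selectors or literal values and runs them through per-field processors when load_item() is called. A minimal sketch of that cycle (the item, spider, and URL below are illustrative, not taken from the examples):

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst

class ArticleItem(scrapy.Item):
    title = scrapy.Field(
        input_processor=MapCompose(str.strip),  # clean each value on the way in
        output_processor=TakeFirst(),           # collapse the list to one value
    )
    url = scrapy.Field(output_processor=TakeFirst())

class ArticleSpider(scrapy.Spider):
    name = 'articles'
    start_urls = ['http://example.com']  # placeholder

    def parse(self, response):
        loader = ItemLoader(item=ArticleItem(), response=response)
        loader.add_css('title', 'h1::text')    # collect via a CSS selector
        loader.add_value('url', response.url)  # collect a literal value
        yield loader.load_item()               # apply processors, build the item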
Example #1
Source File: quotes_spider.py    From scrapy-tutorial with MIT License
def parse(self, response):
        self.logger.info('Parse function called on {}'.format(response.url))
        # quotes = response.xpath("//div[@class='quote']")
        quotes = response.css('div.quote')

        for quote in quotes:
            loader = ItemLoader(item=QuoteItem(), selector=quote)
            # pay attention to the dot .// to use relative xpath
            # loader.add_xpath('quote_content', ".//span[@class='text']/text()")
            loader.add_css('quote_content', '.text::text')
            # loader.add_xpath('author', './/small//text()')
            loader.add_css('tags', '.tag::text')
            quote_item = loader.load_item()
            author_url = quote.css('.author + a::attr(href)').get()
            # go to the author page and pass the current collected quote info
            yield response.follow(author_url, self.parse_author, meta={'quote_item': quote_item})

        # go to Next page
        for a in response.css('li.next a'):
            yield response.follow(a, self.parse) 
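
The loader above assumes a QuoteItem defined in the project's items.py. A plausible minimal sketch of it (the processors here are assumptions, not the tutorial's actual definition):

import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst

class QuoteItem(scrapy.Item):
    quote_content = scrapy.Field(
        input_processor=MapCompose(str.strip),
        output_processor=TakeFirst(),
    )
    tags = scrapy.Field(input_processor=MapCompose(str.strip))  # stays a list
    author_name = scrapy.Field(output_processor=TakeFirst())
    author_birthday = scrapy.Field(output_processor=TakeFirst())
    author_bornlocation = scrapy.Field(output_processor=TakeFirst())
    author_bio = scrapy.Field(output_processor=TakeFirst())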
Example #2
Source File: main.py    From python-examples with MIT License
def parse(self, response):
        print('url:', response.url)
        articles = response.xpath('//div[@class="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls"]')
        for article in articles:

            if article.xpath('.//a[@class="button button--smaller button--chromeless u-baseColor--buttonNormal"]/@href').extract_first():
                l = ItemLoader(item=MediumItem(), selector=article)
                l.default_output_processor = scrapy.loader.processors.TakeFirst()
                l.add_css('Title','div > h3::text')
                l.add_xpath('Name','.//a[@class="ds-link ds-link--styleSubtle link link--darken link--accent u-accentColor--textNormal u-accentColor--textDarken"]/text()')
                l.add_css('Read','span::attr(title)')
                l.add_xpath('Publication', './/a[@class="ds-link ds-link--styleSubtle link--darkenlink--accent u-accentColor--textNormal"]/text()')
                l.add_xpath('Claps','.//button[@class="button button--chromeless u-baseColor--buttonNormal js-multirecommendCountButton u-disablePointerEvents"]/text()')
                l.add_xpath('Responses','.//a[@class="button button--chromeless u-baseColor--buttonNormal"]/text()')
                l.add_value('Page', response.url)
                yield l.load_item() 
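
Assigning default_output_processor per instance works, but the same default can be declared once on an ItemLoader subclass. A brief sketch (MediumLoader is a hypothetical name):

from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst

class MediumLoader(ItemLoader):
    default_output_processor = TakeFirst()  # applies to every field by default

# l = MediumLoader(item=MediumItem(), selector=article)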
Example #3
Source File: main.py    From python-examples with MIT License
def phone_parse(self, response):
        print("[phone_parse] response:", response)

        # get data from `parse_car_page`
        loader = response.meta['loader']

        item = response.xpath('//p/text()').extract()
        print('[phone_parse] item:', type(item), item)
        
        json_data = json.loads(item[0])
        print('[phone_parse] json:', json_data)

        number = json_data["value"].replace(" ","")
        print("'[phone_parse] number:", number)
        
        # add new data to loader
        loader.add_value('number', number)
        
        # send all data 
        yield loader.load_item()

# --- run without project and save in `output.csv` --- 
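
The closing comment refers to running the spider outside a project; one common way is CrawlerProcess with feed settings. A sketch, assuming the spider class is importable (MySpider is a placeholder):

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess({
    'FEED_FORMAT': 'csv',      # legacy feed settings; newer Scrapy uses FEEDS
    'FEED_URI': 'output.csv',
})
process.crawl(MySpider)  # placeholder for the spider class defined in main.py
process.start()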
Example #4
Source File: quotes_spider.py    From scrapy-tutorial with MIT License
def parse_author(self, response):
        quote_item = response.meta['quote_item']
        loader = ItemLoader(item=quote_item, response=response)
        loader.add_css('author_name', '.author-title::text')
        loader.add_css('author_birthday', '.author-born-date::text')
        loader.add_css('author_bornlocation', '.author-born-location::text')
        loader.add_css('author_bio', '.author-description::text')
        yield loader.load_item() 
Example #5
Source File: itemloader.py    From scrapy-bench with MIT License
def main():
    total = 0
    time = 0
    tar = tarfile.open("bookfiles.tar.gz")

    for member in tar.getmembers():
        f = tar.extractfile(member)
        html = f.read()
        response = HtmlResponse(url="local", body=html, encoding='utf8')

        for i in range(0, 10):  # range() instead of Python 2's xrange()
            start = timer()
            loader = ItemLoader(item=ItemloaderItem(), response=response)
            loader.add_xpath(
                'rating', '//*[@id="content_inner"]/article/div[1]/div[2]/p[3]/i[1]')
            loader.add_xpath(
                'title', '//*[@id=("content_inner")]/article/div[1]/div[2]/h1')
            loader.add_xpath(
                'price', '//*[@id=("content_inner")]/article/div[1]/div[2]/p[1]')
            loader.add_css('stock', '.product_main .instock.availability ::text')
            loader.add_css('category', 'ul.breadcrumb li:nth-last-child(2) ::text')
            loader.add_value('name', 'item {}'.format(i))
            loader.add_value('url', 'http://site.com/item{}'.format(i))
            product = loader.load_item()
            end = timer()

            total += 1
            time = time + end - start


    print("\nTotal number of items extracted = {0}".format(total))
    print("Time taken = {0}".format(time))
    click.secho("Rate of item extraction: {0} items/second\n".format(
        float(total / time)), bold=True)

    with open("Benchmark.txt", 'w') as g:
        g.write(" {0}".format((float(total / time)))) 
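
The benchmark loads into an ItemloaderItem that ships with scrapy-bench; judging only from the fields populated above, a hypothetical minimal declaration would be:

import scrapy

class ItemloaderItem(scrapy.Item):
    rating = scrapy.Field()
    title = scrapy.Field()
    price = scrapy.Field()
    stock = scrapy.Field()
    category = scrapy.Field()
    name = scrapy.Field()
    url = scrapy.Field()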
Example #6
Source File: textspider.py    From ARGUS with GNU General Public License v3.0
def parse(self, response):

        #initialize collector item which stores the website's content and meta data
        loader = ItemLoader(item=Collector())
        loader.add_value("dl_slot", response.request.meta.get('download_slot'))
        loader.add_value("redirect", self.checkRedirectDomain(response))
        loader.add_value("start_page", response.url)
        loader.add_value("start_domain", self.subdomainGetter(response))  
        loader.add_value("scraped_urls", [response.urljoin(response.url)])
        loader.add_value("scrape_counter", 1)
        loader.add_value("scraped_text", [self.extractText(response)])
        loader.add_value("error", "None")
        loader.add_value("ID", response.request.meta["ID"])

        #initialize the fingerprints set which stores all fingerprints of visited websites
        fingerprints = set()
        #add the fingerprints of the start_page
        fingerprints.add(request_fingerprint(response.request))
        
        #if there was an initial redirect, the new domain is added to the allowed domains
        domain = self.subdomainGetter(response)
        if domain not in self.allowed_domains:
            self.allowed_domains.append(domain)
            self.refreshAllowedDomains()

        #extract all urls from the page...
        urls = response.xpath("//a/@href").extract() + response.xpath("//frame/@src").extract() + response.xpath("//frameset/@src").extract()
        #...and save them to a urlstack
        urlstack = [response.urljoin(url) for url in urls]   
            
        #attach the urlstack, the loader, and the fingerprints to the response...        
        response.meta["urlstack"] = urlstack
        response.meta["loader"] = loader
        response.meta["fingerprints"] = fingerprints
        #...and send it over to the processURLstack function
        return self.processURLstack(response)
    
    
##################################################################
# PROCESS URL STACK
################################################################## 
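
The Collector item lives in ARGUS's items module; judging only from the fields this spider populates, a hypothetical sketch would be:

import scrapy

class Collector(scrapy.Item):
    dl_slot = scrapy.Field()
    redirect = scrapy.Field()
    start_page = scrapy.Field()
    start_domain = scrapy.Field()
    scraped_urls = scrapy.Field()
    scrape_counter = scrapy.Field()
    scraped_text = scrapy.Field()
    error = scrapy.Field()
    ID = scrapy.Field()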
Example #7
Source File: main.py    From python-examples with MIT License
def parse_car_page(self, response):

        loader = OtomotoCarLoader(OtomotoItem(), response=response)

        property_list_map = {
            'Marka pojazdu': 'brand',
            'Model pojazdu': 'model',
            'Rok produkcji': 'year',
        }

        for params in response.css('.offer-params__item'):

            property_name = params.css('.offer-params__label::text').extract_first().strip()
            
            if property_name in property_list_map:
                css = params.css('div::text').extract_first().strip()
                
                if css == '':
                    css = params.css('a::text').extract_first().strip()

                loader.add_value(property_list_map[property_name], css)

        loader.add_css('features', '.offer-features__item::text')
        loader.add_value('url', response.url)
        
        number_id = self.parse_number(response)
        print('number_id:', len(number_id), '|', number_id)
        
        for phone_id in number_id:  # avoid shadowing the id() builtin
            phone_url = "https://www.otomoto.pl/ajax/misc/contact/multi_phone/" + phone_id + '/0/'
            # use `meta=` to send data to `photo_parse`
            yield scrapy.Request(phone_url, callback=self.phone_parse, meta={'loader': loader}) 
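
OtomotoCarLoader is the project's own ItemLoader subclass. A hypothetical sketch consistent with how it is used above (single values for most fields, a list for features):

from scrapy.loader import ItemLoader
from scrapy.loader.processors import Identity, MapCompose, TakeFirst

class OtomotoCarLoader(ItemLoader):
    default_input_processor = MapCompose(str.strip)  # assumption: trim values
    default_output_processor = TakeFirst()
    features_out = Identity()  # keep the full list of feature strings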
Example #8
Source File: textspider.py    From ARGUS with GNU General Public License v3.0
def errorback(self, failure):
        loader = ItemLoader(item=Collector())
        if failure.check(HttpError):
            response = failure.value.response
            loader.add_value("dl_slot", response.request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", response.status)
            loader.add_value("ID", response.request.meta["ID"])
            yield loader.load_item()
        elif failure.check(DNSLookupError):
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", "DNS")
            loader.add_value("ID", request.meta["ID"])
            yield loader.load_item() 
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", "Timeout")
            loader.add_value("ID", request.meta["ID"])
            yield loader.load_item()
        else:
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", "other")
            loader.add_value("ID", request.meta["ID"])
            yield loader.load_item()


##################################################################
# MAIN PARSE
################################################################## 
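
An errorback like this only fires for requests that register it via errback=. A minimal hook-up sketch (the ID scheme is an assumption):

def start_requests(self):
    for ID, url in enumerate(self.start_urls):
        yield scrapy.Request(url, callback=self.parse,
                             errback=self.errorback,  # route failures here
                             meta={'ID': ID})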
Example #9
Source File: textspider.py    From ARGUS with GNU General Public License v3.0
def processURLstack(self, response):

        #get meta data from response object to revive dragged stuff
        meta = response.request.meta
        loader = meta["loader"]
        urlstack = meta["urlstack"]
        fingerprints = meta["fingerprints"]
        
        #check whether max number of webpages has been scraped for this website
        if self.site_limit != 0:
            if loader.get_collected_values("scrape_counter")[0] >= self.site_limit:
                del urlstack[:]
        
        #reorder the urlstack to scrape the most relevant urls first
        urlstack = self.reorderUrlstack(urlstack, self.language, self.prefer_short_urls)
            
        #check if the next url in the urlstack is valid
        while len(urlstack) > 0:
            #pop non-valid domains
            domain = self.subdomainGetter(urlstack[0])
            if domain not in self.allowed_domains:
                urlstack.pop(0)
            #pop "mailto" urls
            elif re.match(r"mailto", urlstack[0]):
                urlstack.pop(0)
            #pop unwanted filetypes
            elif urlstack[0].split(".")[-1].lower() in self.filetypes:
                urlstack.pop(0)
            #pop visited urls 
            #(potential bottleneck: Request has to be sent to generate fingerprint from)
            elif request_fingerprint(scrapy.Request(urlstack[0], callback=None)) in fingerprints:
                urlstack.pop(0)
            else:
                break

        #if the url was assessed to be valid, send out a request and callback the parse_subpage function
        #errbacks return to processURLstack
        #ALLOW ALL HTTP STATUS:
        #errors must be caught in the callback function, because middleware-caught requests break the sequence and collector items get lost
        if len(urlstack) > 0:
            yield scrapy.Request(urlstack.pop(0), meta={"loader": loader, "urlstack": urlstack, "fingerprints": fingerprints, 'handle_httpstatus_all': True}, dont_filter=True, callback=self.parse_subpage, errback=self.processURLstack)
        #if there are no urls left in the urlstack, the website was scraped completely and the item can be sent to the pipeline
        else:
            yield loader.load_item()
    
    
##################################################################
# PARSE SUB PAGE
################################################################## 
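
The visited-URL check relies on scrapy.utils.request.request_fingerprint, which maps equivalent requests to the same digest. The dedup idea in isolation:

import scrapy
from scrapy.utils.request import request_fingerprint

fingerprints = set()
request = scrapy.Request('http://example.com/page')  # placeholder URL
if request_fingerprint(request) not in fingerprints:
    fingerprints.add(request_fingerprint(request))
    # first visit: keep the URL on the stack / schedule the request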
Example #10
Source File: linkspider.py    From ARGUS with GNU General Public License v3.0
def errorback(self, failure):
        loader = ItemLoader(item=LinkCollector())
        if failure.check(HttpError):
            response = failure.value.response
            loader.add_value("dl_slot", response.request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", response.status)
            loader.add_value("ID", response.request.meta["ID"])
            loader.add_value("links", "")
            loader.add_value("alias", "")
            yield loader.load_item()
        elif failure.check(DNSLookupError):
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", "DNS")
            loader.add_value("ID", request.meta["ID"])
            loader.add_value("links", "")
            loader.add_value("alias", "")
            yield loader.load_item() 
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", "Timeout")
            loader.add_value("ID", request.meta["ID"])
            loader.add_value("links", "")
            loader.add_value("alias", "")
            yield loader.load_item()
        else:
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", "other")
            loader.add_value("ID", request.meta["ID"])
            loader.add_value("links", "")
            loader.add_value("alias", "")
            yield loader.load_item()


##################################################################
# MAIN PARSE
################################################################## 
Example #11
Source File: linkspider.py    From ARGUS with GNU General Public License v3.0
def parse(self, response):

        #initialize collector item which stores the website's content and meta data
        loader = ItemLoader(item=LinkCollector())
        loader.add_value("dl_slot", response.request.meta.get('download_slot'))
        loader.add_value("redirect", self.checkRedirectDomain(response))
        #add alias if there was an initial redirect
        if self.checkRedirectDomain(response):
            loader.add_value("alias", self.subdomainGetter(response).split("www.")[-1])
        else:
            loader.add_value("alias", "") 
        loader.add_value("start_page", response.url)
        loader.add_value("start_domain", self.subdomainGetter(response))  
        loader.add_value("scraped_urls", [response.urljoin(response.url)])
        loader.add_value("scrape_counter", 1)
        loader.add_value("error", "None")
        loader.add_value("ID", response.request.meta["ID"])
        loader.add_value("links", "")

        #initialize the fingerprints set which stores all fingerprints of visited websites
        fingerprints = set()
        #add the fingerprints of the start_page
        fingerprints.add(request_fingerprint(response.request))
        
        #if there was an initial redirect, the new domain is added to the allowed domains
        domain = self.subdomainGetter(response)
        if domain not in self.allowed_domains:
            self.allowed_domains.append(domain)
            self.refreshAllowedDomains()

        #extract all urls from the page...
        urls = response.xpath("//a/@href").extract() + response.xpath("//frame/@src").extract() + response.xpath("//frameset/@src").extract()
        #...and save them to a urlstack
        urlstack = [response.urljoin(url) for url in urls]
            
        #attach the urlstack, the loader, and the fingerprints to the response...        
        response.meta["urlstack"] = urlstack
        response.meta["loader"] = loader
        response.meta["fingerprints"] = fingerprints
        #...and send it over to the processURLstack function
        return self.processURLstack(response)
    
    
##################################################################
# PROCESS URL STACK
################################################################## 
Example #12
Source File: textspider.py    From ARGUS with GNU General Public License v3.0
def errorback(self, failure):
        loader = ItemLoader(item=Collector())
        if failure.check(HttpError):
            response = failure.value.response
            loader.add_value("dl_slot", response.request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("title", "")
            loader.add_value("description", "")
            loader.add_value("keywords", "")
            loader.add_value("error", response.status)
            loader.add_value("ID", response.request.meta["ID"])
            yield loader.load_item()
        elif failure.check(DNSLookupError):
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("title", "")
            loader.add_value("description", "")
            loader.add_value("keywords", "")
            loader.add_value("error", "DNS")
            loader.add_value("ID", request.meta["ID"])
            yield loader.load_item() 
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("title", "")
            loader.add_value("description", "")
            loader.add_value("keywords", "")
            loader.add_value("error", "Timeout")
            loader.add_value("ID", request.meta["ID"])
            yield loader.load_item()
        else:
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("title", "")
            loader.add_value("description", "")
            loader.add_value("keywords", "")
            loader.add_value("error", "other")
            loader.add_value("ID", request.meta["ID"])
            yield loader.load_item()


##################################################################
# MAIN PARSE
################################################################## 
Example #13
Source File: textspider.py    From ARGUS with GNU General Public License v3.0
def parse(self, response):

        #initialize collector item which stores the website's content and meta data
        loader = ItemLoader(item=Collector())
        loader.add_value("dl_slot", response.request.meta.get('download_slot'))
        loader.add_value("redirect", self.checkRedirectDomain(response))
        loader.add_value("start_page", response.url)
        loader.add_value("start_domain", self.subdomainGetter(response))  
        loader.add_value("scraped_urls", [response.urljoin(response.url)])
        loader.add_value("scrape_counter", 1)
        loader.add_value("scraped_text", [self.extractText(response)])
        title, description, keywords = self.extractHeader(response)
        loader.add_value("title", [title])
        loader.add_value("description", [description])
        loader.add_value("keywords", [keywords])
        loader.add_value("error", "None")
        loader.add_value("ID", response.request.meta["ID"])

        #initialize the fingerprints set which stores all fingerprints of visited websites
        fingerprints = set()
        #add the fingerprints of the start_page
        fingerprints.add(request_fingerprint(response.request))
        
        #if there was an initial redirect, the new domain is added to the allowed domains
        domain = self.subdomainGetter(response)
        if domain not in self.allowed_domains:
            self.allowed_domains.append(domain)
            self.refreshAllowedDomains()

        #extract all urls from the page...
        urls = response.xpath("//a/@href").extract() + response.xpath("//frame/@src").extract() + response.xpath("//frameset/@src").extract()
        #...and save them to a urlstack
        urlstack = [response.urljoin(url) for url in urls]
            
        #attach the urlstack, the loader, and the fingerprints to the response...        
        response.meta["urlstack"] = urlstack
        response.meta["loader"] = loader
        response.meta["fingerprints"] = fingerprints
        #...and send it over to the processURLstack function
        return self.processURLstack(response)
    
    
##################################################################
# PROCESS URL STACK
################################################################## 
Example #14
Source File: textspider.py    From ARGUS with GNU General Public License v3.0
def processURLstack(self, response):

        #get meta data from response object to revive dragged stuff
        meta = response.request.meta
        loader = meta["loader"]
        urlstack = meta["urlstack"]
        fingerprints = meta["fingerprints"]
        
        #check whether max number of webpages has been scraped for this website
        if self.site_limit != 0:
            if loader.get_collected_values("scrape_counter")[0] >= self.site_limit:
                del urlstack[:]
        
        #reorder the urlstack to scrape the most relevant urls first
        urlstack = self.reorderUrlstack(urlstack, self.language, self.prefer_short_urls)
            
        #check if the next url in the urlstack is valid
        while len(urlstack) > 0:
            #pop non-valid domains
            domain = self.subdomainGetter(urlstack[0])
            if domain not in self.allowed_domains:
                urlstack.pop(0)
            #pop "mailto" urls
            elif re.match(r"mailto", urlstack[0]):
                urlstack.pop(0)
            #pop unwanted filetypes
            elif urlstack[0].split(".")[-1].lower() in self.filetypes:
                urlstack.pop(0)
            #pop visited urls 
            #(potential bottleneck: Request has to be sent to generate fingerprint from)
            elif request_fingerprint(scrapy.Request(urlstack[0], callback=None)) in fingerprints:
                urlstack.pop(0)
            else:
                break

        #if the url was assessed to be valid, send out a request and callback the parse_subpage function
        #errbacks return to processURLstack
        #ALLOW ALL HTTP STATUS: 
        #errors must be caught in the callback function, because middleware-caught requests break the sequence and collector items get lost
        if len(urlstack) > 0:
            yield scrapy.Request(urlstack.pop(0), meta={"loader": loader, "urlstack": urlstack, "fingerprints": fingerprints, 'handle_httpstatus_all': True}, dont_filter=True, callback=self.parse_subpage, errback=self.processURLstack)
        #if there are no urls left in the urlstack, the website was scraped completely and the item can be sent to the pipeline
        else:
            yield loader.load_item()
    
    
##################################################################
# PARSE SUB PAGE
################################################################## 
Example #15
Source File: linkspider.py    From ARGUS with GNU General Public License v3.0
def errorback(self, failure):
        loader = ItemLoader(item=LinkCollector())
        if failure.check(HttpError):
            response = failure.value.response
            loader.add_value("dl_slot", response.request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", response.status)
            loader.add_value("ID", response.request.meta["ID"])
            loader.add_value("links", "")
            loader.add_value("alias", "")
            yield loader.load_item()
        elif failure.check(DNSLookupError):
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", "DNS")
            loader.add_value("ID", request.meta["ID"])
            loader.add_value("links", "")
            loader.add_value("alias", "")
            yield loader.load_item() 
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", "Timeout")
            loader.add_value("ID", request.meta["ID"])
            loader.add_value("links", "")
            loader.add_value("alias", "")
            yield loader.load_item()
        else:
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", "other")
            loader.add_value("ID", request.meta["ID"])
            loader.add_value("links", "")
            loader.add_value("alias", "")
            yield loader.load_item()


##################################################################
# MAIN PARSE
################################################################## 
Example #16
Source File: linkspider.py    From ARGUS with GNU General Public License v3.0
def parse(self, response):

        #initialize collector item which stores the website's content and meta data
        loader = ItemLoader(item=LinkCollector())
        loader.add_value("dl_slot", response.request.meta.get('download_slot'))
        loader.add_value("redirect", self.checkRedirectDomain(response))
        #add alias if there was an initial redirect
        if self.checkRedirectDomain(response):
            loader.add_value("alias", self.subdomainGetter(response).split("www.")[-1])
        else:
            loader.add_value("alias", "") 
        loader.add_value("start_page", response.url)
        loader.add_value("start_domain", self.subdomainGetter(response))  
        loader.add_value("scraped_urls", [response.urljoin(response.url)])
        loader.add_value("scrape_counter", 1)
        loader.add_value("error", "None")
        loader.add_value("ID", response.request.meta["ID"])
        loader.add_value("links", "")

        #initialize the fingerprints set which stores all fingerprints of visited websites
        fingerprints = set()
        #add the fingerprints of the start_page
        fingerprints.add(request_fingerprint(response.request))
        
        #if there was an initial redirect, the new domain is added to the allowed domains
        domain = self.subdomainGetter(response)
        if domain not in self.allowed_domains:
            self.allowed_domains.append(domain)
            self.refreshAllowedDomains()

        #extract all urls from the page...
        urls = response.xpath("//a/@href").extract() + response.xpath("//frame/@src").extract() + response.xpath("//frameset/@src").extract()
        #...and save them to a urlstack
        urlstack = [response.urljoin(url) for url in urls]
            
        #attach the urlstack, the loader, and the fingerprints to the response...        
        response.meta["urlstack"] = urlstack
        response.meta["loader"] = loader
        response.meta["fingerprints"] = fingerprints
        #...and send it over to the processURLstack function
        return self.processURLstack(response)
    
    
##################################################################
# PROCESS URL STACK
##################################################################