Python scrapy.loader.processors.MapCompose() Examples

The following are 15 code examples of scrapy.loader.processors.MapCompose(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module scrapy.loader.processors, or try the search function.
Example #1
Source File: sitemap_spider.py    From scrapy-templates with MIT License 6 votes vote down vote up
def scrape_product(self, response):
    """Load a ``MyItem`` from ``response`` and return the populated item.

    Every field uses ``MapCompose(remove_tags)`` as its default input
    processor and ``TakeFirst()`` as its default output processor.
    """
    loader = ItemLoader(response=response, item=MyItem())

    # Loader-wide defaults, applied to any field without its own processors.
    loader.default_input_processor = MapCompose(remove_tags)
    loader.default_output_processor = TakeFirst()

    # Template placeholder: field name and CSS selector are meant to be edited.
    loader.add_css("my_field", "selector")

    return loader.load_item()
Example #2
Source File: 1fol_pag2scr.py    From scrapy-templates with MIT License 6 votes vote down vote up
def populate_item(self, response):
    """Yield a ``MySpiderItem`` loaded from ``response``.

    ``MapCompose(remove_tags)`` is installed as the loader's default input
    processor before the item is built.
    """
    loader = ItemLoader(response=response, item=MySpiderItem())
    loader.default_input_processor = MapCompose(remove_tags)

    # Template hook: register fields here, e.g. loader.add_css("", "").
    yield loader.load_item()

    # 3. PAGINATION LEVEL 1 
Example #3
Source File: 1fol2scr_pag.py    From scrapy-templates with MIT License 6 votes vote down vote up
def populate_item(self, response):
    """Yield the loaded ``MySpiderItem``, then the result of ``self.paginate``.

    The item is yielded first; the value returned by
    ``self.paginate(response)`` is yielded second.
    """
    loader = ItemLoader(response=response, item=MySpiderItem())
    loader.default_input_processor = MapCompose(remove_tags)

    # Template hooks: register fields here, e.g. loader.add_css("")
    # or loader.add_value("raw", raw).

    # 1) the populated item
    yield loader.load_item()
    # 2) whatever self.paginate produces for the page that holds more data
    yield self.paginate(response)

    # 3. PAGINATION LEVEL 2 
Example #4
Source File: 1fol2scr.py    From scrapy-templates with MIT License 6 votes vote down vote up
def populate_item(self, response):
    """Yield a ``MySpiderItem`` loaded from ``response``.

    The loader's default input processor is ``MapCompose(remove_tags)``.
    """
    loader = ItemLoader(response=response, item=MySpiderItem())
    loader.default_input_processor = MapCompose(remove_tags)

    # Template hooks: one loader.add_css("", "") call per field goes here.

    yield loader.load_item()
Example #5
Source File: 1fol2fol3scr.py    From scrapy-templates with MIT License 6 votes vote down vote up
def populate_item(self, response):
    """Build a ``MySpiderItem`` from ``response`` and yield it.

    ``MapCompose(remove_tags)`` is set as the default input processor on the
    loader before loading.
    """
    loader = ItemLoader(response=response, item=MySpiderItem())
    loader.default_input_processor = MapCompose(remove_tags)

    # Template hooks: add loader.add_css("", "") lines for each field.

    yield loader.load_item()
Example #6
Source File: 1scr.py    From scrapy-templates with MIT License 6 votes vote down vote up
def parse(self, response):
    """Load a ``MyItem`` from ``response`` and return it.

    Default processors: ``MapCompose(remove_tags)`` on input and
    ``TakeFirst()`` on output.
    """
    loader = ItemLoader(response=response, item=MyItem())
    loader.default_input_processor = MapCompose(remove_tags)
    loader.default_output_processor = TakeFirst()

    # Template hooks, to be filled in per spider:
    #   loader.add_css("my_field", "my_css")
    #   loader.add_xpath("my_field", "my_xpath")

    return loader.load_item()
Example #7
Source File: 1fol2fol_pag3scr.py    From scrapy-templates with MIT License 6 votes vote down vote up
def populate_item(self, response):
    """Yield a ``MySpiderItem`` built from ``response``.

    Uses ``MapCompose(remove_tags)`` as the loader's default input processor.
    """
    loader = ItemLoader(response=response, item=MySpiderItem())
    loader.default_input_processor = MapCompose(remove_tags)

    # Template hook: e.g. loader.add_css("field", "")
    yield loader.load_item()

    # 3. PAGINATION LEVEL 2 
Example #8
Source File: post_pass_item.py    From scrapy-templates with MIT License 6 votes vote down vote up
def parse(self, response):
    """Yield a POST ``FormRequest`` that carries the item loaded so far.

    The item is stored under ``meta['item']`` and the request's callback is
    ``self.populate_field``.
    """
    loader = ItemLoader(response=response, item=MyItem())
    loader.default_input_processor = MapCompose(remove_tags)
    # Template hooks: loader.add_css("", "") calls for this page's fields.

    item_so_far = loader.load_item()
    yield FormRequest(
        "POST_URL",
        formdata={'parameter': 'p'},
        meta={'item': item_so_far},
        callback=self.populate_field,
    )
Example #9
Source File: post_pass_item.py    From scrapy-templates with MIT License 6 votes vote down vote up
def populate_field(self, response):
    """Continue loading the item found in ``response.meta["item"]``.

    A new loader is wrapped around that item, with
    ``MapCompose(remove_tags)`` as its default input processor, and the
    loaded item is returned.
    """
    carried_item = response.meta["item"]
    loader = ItemLoader(item=carried_item, response=response)
    loader.default_input_processor = MapCompose(remove_tags)
    # Template hook: e.g. loader.add_css("field", "")
    return loader.load_item()
Example #10
Source File: falter_at.py    From PyFeeds with GNU Affero General Public License v3.0 6 votes vote down vote up
def parse_blog_article(self, response):
    """Load a feed entry from a blog article response and return the item.

    Reads ``self.name`` to build the loader's ``base_url`` and
    ``response.meta["blog"]`` to build the entry's ``path`` value.
    """
    loader = FeedEntryItemLoader(
        response=response,
        remove_elems=[".ad-component", ".wp-caption-text"],
        base_url="https://cms.{}".format(self.name),
        timezone="Europe/Vienna",
        dayfirst=True,
        yearfirst=False,
    )

    # Selectors are added in the same order as before, since values for one
    # field are collected in call order.
    for content_selector in ("article > h2", ".storycontent-article"):
        loader.add_css("content_html", content_selector)

    # Each author selector gets its own MapCompose(str.title) processor.
    for author_selector in (
        ".falter-heading ::text",
        ".thinktank-meta > span ::text",
    ):
        loader.add_css("author_name", author_selector, MapCompose(str.title))

    # The regex captures a "NN.NN.NNNN" digit group from the label text.
    loader.add_css(
        "updated",
        ".post > .text-label ::text",
        re=r"(\d{2}\.\d{2}\.\d{4})",
    )
    loader.add_value("link", response.url)
    loader.add_value("path", "blog_{}".format(response.meta["blog"]))
    loader.add_css("title", "article > h1 ::text")
    return loader.load_item()
Example #11
Source File: event.py    From In2ItChicago with GNU General Public License v3.0 5 votes vote down vote up
def custom_field():
    """Return a ``scrapy.Field`` with an HTML-removing input and joined output.

    Input values are mapped through ``DataUtils.remove_html``; the output
    processor is ``Join()``.
    """
    html_stripper = MapCompose(DataUtils.remove_html)
    joiner = Join()
    return scrapy.Field(input_processor=html_stripper, output_processor=joiner)
Example #12
Source File: event.py    From In2ItChicago with GNU General Public License v3.0 5 votes vote down vote up
def numeric_field():
    """Return a ``scrapy.Field`` that removes HTML and keeps the first value.

    Input values are mapped through ``DataUtils.remove_html``; the output
    processor is ``TakeFirst()``.
    """
    html_stripper = MapCompose(DataUtils.remove_html)
    first_value = TakeFirst()
    return scrapy.Field(
        input_processor=html_stripper,
        output_processor=first_value,
    )
Example #13
Source File: event.py    From In2ItChicago with GNU General Public License v3.0 5 votes vote down vote up
def price_field():
    """Return a ``scrapy.Field`` that turns scraped price text into a float.

    The input processor runs three steps on each value, in order:

    1. remove every ``'$'`` character when the value is a string,
    2. ``DataUtils.remove_html``,
    3. ``float``.

    The output processor is ``TakeFirst()``.
    """
    def strip_dollar_sign(value):
        # isinstance() instead of an exact ``type(value) == str`` comparison,
        # so str subclasses are cleaned too; non-strings pass through as-is.
        if isinstance(value, str):
            return value.replace('$', '')
        return value

    return scrapy.Field(
        input_processor=MapCompose(
            strip_dollar_sign,
            DataUtils.remove_html,
            float,
        ),
        output_processor=TakeFirst(),
    )
Example #14
Source File: event.py    From In2ItChicago with GNU General Public License v3.0 5 votes vote down vote up
def url_field():
    """Return a ``scrapy.Field`` that removes HTML and tidies slashes in URLs.

    Each input value goes through ``DataUtils.remove_html`` and then through
    a slash clean-up step; the output processor is ``Join()``.
    """
    def tidy_slashes(value):
        # Collapse every "//" to "/"; this also breaks the scheme separator,
        # so the next two replacements restore "https://" and "http://".
        # NOTE(review): runs of three or more slashes come out wrong, e.g.
        # "https://a///b" keeps a "//", and "https:///a" ends up unchanged
        # because the scheme fix-up re-adds a slash.
        collapsed = value.replace('//', '/')
        with_https = collapsed.replace('https:/', 'https://')
        with_http = with_https.replace('http:/', 'http://')
        # Drop any trailing slashes.
        return with_http.rstrip('/')

    return scrapy.Field(
        input_processor=MapCompose(DataUtils.remove_html, tidy_slashes),
        output_processor=Join(),
    )
Example #15
Source File: event.py    From In2ItChicago with GNU General Public License v3.0 5 votes vote down vote up
def address_field():
    """Return a ``scrapy.Field`` that fills in a missing Chicago city/state.

    Each input value goes through ``DataUtils.remove_html`` and then
    ``parse_address``; the output processor is ``Join()``.
    """
    def parse_address(value):
        # Indexed below as (text, label) entries.
        # NOTE(review): assumes usaddress.parse returns such pairs - confirm.
        tagged = usaddress.parse(value)

        def has_label(label):
            # True when some entry with this label has a truthy first element.
            return any(entry[0] for entry in tagged if entry[1] == label)

        # Append a default only for the parts the parser did not find.
        city_append = '' if has_label("PlaceName") else " Chicago, "
        state_append = '' if has_label("StateName") else "IL"
        # NOTE(review): when a city is found but no state, "IL" is appended
        # directly after the value with no separator - confirm this is wanted.
        return f'{value}{city_append}{state_append}'

    return scrapy.Field(
        input_processor=MapCompose(DataUtils.remove_html, parse_address),
        output_processor=Join(),
    )