Python w3lib.html.remove_tags() Examples

The following are 17 code examples of w3lib.html.remove_tags(), collected from open-source projects. The source file, project, and license for each example are listed above it. You may also want to check out the other functions and classes available in the w3lib.html module.
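Before the project examples, a minimal standalone sketch of the function itself may help (the HTML snippet is made up for illustration). remove_tags(text, which_ones=(), keep=(), encoding=None) strips markup while preserving the text inside the tags: which_ones removes only the named tags, and keep removes every tag except the named ones (the two arguments cannot be combined).

from w3lib.html import remove_tags

doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
remove_tags(doc)                        # 'This is a link: example'
remove_tags(doc, which_ones=('b',))     # strips only <b>; the other tags remain
remove_tags(doc, keep=('div',))         # '<div>This is a link: example</div>'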
Example #1
Source File: 1scr.py    From scrapy-templates with MIT License
def parse(self, response):
        item_loader = ItemLoader(item=MyItem(), response=response)
        item_loader.default_input_processor = MapCompose(remove_tags)
        item_loader.default_output_processor = TakeFirst()
        #
        #item_loader.add_css("my_field", "my_css")
        #item_loader.add_xpath("my_field", "my_xpath")
        #
        return item_loader.load_item() 
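This default-processor pattern recurs throughout the templates below, so here is a minimal sketch of what MapCompose(remove_tags) actually does to extracted values outside a spider (the sample strings are made up; in recent Scrapy, MapCompose lives in the itemloaders package, while older versions import it from scrapy.loader.processors):

from itemloaders.processors import MapCompose  # scrapy.loader.processors in older Scrapy
from w3lib.html import remove_tags

clean = MapCompose(remove_tags, str.strip)
clean(['<p> 10.5 </p>', '<b>stars</b>'])  # -> ['10.5', 'stars']

Each function in the chain is applied to every value the selector extracts, which is why a single default_input_processor can clean every field the loader populates.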
Example #2
Source File: jobbole_Item.py    From FunpySpiderSearchEngine with Apache License 2.0
def save_to_es(self):
        """保存伯乐在线文章到es中"""
        self.clean_data()
        blog = JobboleBlogIndex()
        blog.title = self['title']
        blog.create_date = self["create_date"]
        blog.content = remove_tags(self["content"])
        blog.front_image_url = self["front_image_url"]
        blog.praise_nums = self["praise_nums"]
        blog.fav_nums = self["fav_nums"]
        blog.comment_nums = self["comment_nums"]
        blog.url = self["url"]
        blog.tags = self["tags"]
        blog.meta.id = self["url_object_id"]
        # suggest must be provided when saving the document
        blog.suggest = generate_suggests(es_jobbole_blog,
                                         ((blog.title, 10), (blog.tags, 6), (blog.content, 4)))
        real_time_count('jobbole_blog_count', JOBBOLE_COUNT_INIT)
        blog.save() 
Example #3
Source File: zhihu_item.py    From FunpySpiderSearchEngine with Apache License 2.0
def clean_data(self):
        try:
            self["praise_num"] = extract_num("".join(self["praise_num"]))
        except BaseException:
            self["praise_num"] = 0
        self["comments_num"] = extract_num("".join(self["comments_num"]))

        self["create_time"] = datetime.datetime.fromtimestamp(
            self["create_time"]).strftime(SQL_DATETIME_FORMAT)
        try:
            self["update_time"] = datetime.datetime.fromtimestamp(
                self["update_time"]).strftime(SQL_DATETIME_FORMAT)
        except BaseException:
            self["update_time"] = self["create_time"]

        self["crawl_time"] = self["crawl_time"].strftime(SQL_DATETIME_FORMAT)
        self["content"] = remove_tags(self["content"]) 
Example #4
Source File: post_pass_item.py    From scrapy-templates with MIT License
def populate_field(self, response):
        item_loader = ItemLoader(item=response.meta["item"], response=response)
        item_loader.default_input_processor = MapCompose(remove_tags)
        #item_loader.add_css("field", "")
        return item_loader.load_item() 
Example #5
Source File: post_pass_item.py    From scrapy-templates with MIT License
def parse(self, response):
        item_loader = ItemLoader(item=MyItem(), response=response)
        item_loader.default_input_processor = MapCompose(remove_tags)
        #item_loader.add_css("", "")
        #item_loader.add_css("", "")
        #item_loader.add_css("", "")
        yield FormRequest("POST_URL", formdata={'parameter': 'p'},
                                        meta={'item': item_loader.load_item()}, callback=self.populate_field) 
Example #6
Source File: 1fol2fol_pag3scr.py    From scrapy-templates with MIT License
def populate_item(self, response):
        item_loader = ItemLoader(item=MySpiderItem(), response=response)
        item_loader.default_input_processor = MapCompose(remove_tags)

        #item_loader.add_css("field", "")
        yield item_loader.load_item()

    # 3. PAGINATION LEVEL 2 
Example #7
Source File: regex.py    From learn_python3_spider with MIT License
def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
        def clean_text(text):
            return replace_escape_chars(remove_tags(text.decode(response_encoding))).strip()

        def clean_url(url):
            clean_url = ''
            try:
                clean_url = urljoin(base_url, replace_entities(clean_link(url.decode(response_encoding))))
            except ValueError:
                pass
            return clean_url

        if base_url is None:
            base_url = get_base_url(response_text, response_url, response_encoding)

        links_text = linkre.findall(response_text)
        return [Link(clean_url(url).encode(response_encoding),
                     clean_text(text))
                for url, _, text in links_text] 
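The clean_text() helper above chains two w3lib functions: remove_tags() strips the markup and replace_escape_chars() deletes \n, \t, and \r (its default replace_by='' removes them outright; passing replace_by=' ' substitutes spaces instead). A minimal sketch with made-up input:

from w3lib.html import remove_tags, replace_escape_chars

raw = b'<a href="page.html">Some\tlink text\r\n</a>'
text = replace_escape_chars(remove_tags(raw.decode('utf-8')), replace_by=' ').strip()
# -> 'Some link text'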
Example #8
Source File: 1fol2fol3scr.py    From scrapy-templates with MIT License
def populate_item(self, response):
        item_loader = ItemLoader(item=MySpiderItem(), response=response)
        item_loader.default_input_processor = MapCompose(remove_tags)

        #item_loader.add_css("", "")
        #item_loader.add_css("", "")

        yield item_loader.load_item() 
Example #9
Source File: 1fol2scr.py    From scrapy-templates with MIT License
def populate_item(self, response):
        item_loader = ItemLoader(item=MySpiderItem(), response=response)
        item_loader.default_input_processor = MapCompose(remove_tags)

        #item_loader.add_css("", "")
        #item_loader.add_css("", "")

        yield item_loader.load_item() 
Example #10
Source File: 1fol2scr_pag.py    From scrapy-templates with MIT License
def populate_item(self, response):
        item_loader = ItemLoader(item=MySpiderItem(), response=response)
        item_loader.default_input_processor = MapCompose(remove_tags)

        # item_loader.add_css("")
        # item_loader.add_value("raw", raw)

        # yield the populated item first
        yield item_loader.load_item()
        # then yield the function which paginates to another page that contains data
        yield self.paginate(response)

    # 3. PAGINATION LEVEL 2 
Example #11
Source File: 1fol_pag2scr.py    From scrapy-templates with MIT License
def populate_item(self, response):
        item_loader = ItemLoader(item=MySpiderItem(), response=response)
        item_loader.default_input_processor = MapCompose(remove_tags)

        # item_loader.add_css("", "")
        yield item_loader.load_item()

    # 3. PAGINATION LEVEL 1 
Example #12
Source File: sitemap_spider.py    From scrapy-templates with MIT License
def scrape_product(self, response):
        item_loader = ItemLoader(item=MyItem(), response=response)
        item_loader.default_input_processor = MapCompose(remove_tags)
        item_loader.default_output_processor = TakeFirst()

        item_loader.add_css("my_field", "selector")

        return item_loader.load_item() 
Example #13
Source File: baidu_search.py    From ChatBotCourse with MIT License
def parse_url(self, response):
        print("url:", response.url)
        print("title:", response.meta['title'])
        print("abstract:", response.meta['abstract'])
        content = remove_tags(response.selector.xpath('//body').extract()[0])
        print("content_len:", len(content))
Example #14
Source File: mg.py    From scrape-news with MIT License
def parse(self, response):
        canonical_url = response.xpath('//link[@rel="canonical"]/@href').extract_first()

        ## Skip excluded sections
        section = response.css('a.section').xpath('text()').extract_first()
        if section and section.lower() in IGNORE_SECTIONS:
            self.logger.info("Skipping %s because section is %s", canonical_url, section)
            return

        ## Skip syndicated content
        body_html = "".join(response.css("#body_content p").extract())
        body_text = remove_tags(body_html)

        for string in SKIP_STRINGS:
            suffix = body_text[-20:]
            if string in suffix:
                self.logger.info("Skipping %s because suffix %r contains %r",
                                 canonical_url,
                                 suffix,
                                 string)
                return

        publication_date_str = response.xpath('//meta[@name="publicationdate"]/@content').extract_first()
        publication_date = datetime.strptime(publication_date_str, '%d/%m/%Y')
        publication_date = SAST.localize(publication_date)


        item = ScrapenewsItem()
        item['body_html'] = response.css("#body_content").extract_first()
        item['title'] = response.xpath('//meta[@name="title"]/@content').extract_first()
        item['byline'] = response.xpath('//meta[@name="author"]/@content').extract_first()
        item['published_at'] = publication_date.isoformat()
        item['retrieved_at'] = datetime.utcnow().isoformat()
        item['url'] = canonical_url
        item['file_name'] = response.url.split('/')[-1]
        item['spider_name'] = self.name
        item['publication_name'] = self.publication_name

        yield item 
Example #15
Source File: lagou_Item.py    From FunpySpiderSearchEngine with Apache License 2.0
def save_to_es(self):
        self.clean_data()
        job = LagouJobIndex()
        job.title = self["title"]
        job.url = self["url"]
        job.meta.id = self["url_object_id"]
        job.salary_min = self["salary_min"]
        job.salary_max = self["salary_max"]
        job.job_city = self["job_city"]
        job.work_years_min = self["work_years_min"]
        job.work_years_max = self["work_years_max"]
        job.degree_need = self["degree_need"]
        job.job_desc = remove_tags(self["job_desc"]).strip().replace("\r\n", "").replace("\t", "")
        job.job_advantage = self["job_advantage"]
        job.tags = self["tags"]
        job.job_type = self["job_type"]
        job.publish_time = self["publish_time"]
        job.job_addr = self["job_addr"]
        job.company_name = self["company_name"]
        job.company_url = self["company_url"]
        job.crawl_time = self['crawl_time']

        job.suggest = generate_suggests(es_lagou_job,
                                        ((job.title, 10), (job.tags, 7), (job.job_advantage, 6), (job.job_desc, 3),
                                         (job.job_addr, 5), (job.company_name, 8), (job.degree_need, 4),
                                         (job.job_city, 9)))
        real_time_count('lagou_job_count', JOB_COUNT_INIT)
        job.save() 
Example #16
Source File: zhihu_item.py    From FunpySpiderSearchEngine with Apache License 2.0
def clean_data(self):
        self["question_id"] = self["question_id"][0]
        self["topics"] = ",".join(self["topics"])
        self["url"] = self["url"][0]
        self["title"] = "".join(self["title"])
        try:
            self["content"] = "".join(self["content"])
            self["content"] = remove_tags(self["content"])
        except BaseException:
            self["content"] = "无"
        try:
            self["answer_num"] = extract_num("".join(self["answer_num"]))
        except BaseException:
            self["answer_num"] = 0
        self["comments_num"] = extract_num("".join(self["comments_num"]))

        if len(self["watch_user_num"]) == 2:
            watch_user_num_click = self["watch_user_num"]
            self["watch_user_num"] = extract_num_include_dot(watch_user_num_click[0])
            self["click_num"] = extract_num_include_dot(watch_user_num_click[1])
        else:
            watch_user_num_click = self["watch_user_num"]
            self["watch_user_num"] = extract_num_include_dot(watch_user_num_click[0])
            self["click_num"] = 0

        self["crawl_time"] = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT) 
Example #17
Source File: baidu_search.py    From ChatBotCourse with MIT License
def parse(self, response):
        containers = response.selector.xpath('//div[contains(@class, "c-container")]')
        for container in containers:
            href = container.xpath('h3/a/@href').extract()[0]
            title = remove_tags(container.xpath('h3/a').extract()[0])
            c_abstract = container.xpath('div/div/div[contains(@class, "c-abstract")]').extract()
            abstract = ""
            if len(c_abstract) > 0:
                abstract = remove_tags(c_abstract[0])
            request = scrapy.Request(href, callback=self.parse_url)
            request.meta['title'] = title
            request.meta['abstract'] = abstract
            yield request