Python w3lib.html.remove_tags() Examples
The following are 18 code examples of w3lib.html.remove_tags(). Each example notes the project and source file it was taken from, so you can trace it back to the original code. You may also want to check out the other functions and classes available in the w3lib.html module.
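Before the project examples, here is a minimal sketch of what remove_tags() itself does. The which_ones and keep keyword arguments follow the w3lib documentation; the sample HTML string is invented purely for illustration.

from w3lib.html import remove_tags

html = '<div><p><b>Hello</b> world</p></div>'

# Strip every tag and keep only the text content.
print(remove_tags(html))                     # Hello world

# Remove only the listed tags (here <b>); other markup is left intact.
print(remove_tags(html, which_ones=('b',)))  # <div><p>Hello world</p></div>

# Remove everything except the tags listed in keep.
print(remove_tags(html, keep=('p',)))        # <p>Hello world</p>

Most of the Scrapy examples below pass remove_tags through MapCompose as an ItemLoader input processor, so every extracted field is stripped of markup before it is stored.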
Example #1
Source File: 1scr.py From scrapy-templates with MIT License
def parse(self, response):
    item_loader = ItemLoader(item=MyItem(), response=response)
    item_loader.default_input_processor = MapCompose(remove_tags)
    item_loader.default_output_processor = TakeFirst()
    #
    #item_loader.add_css("my_field", "my_css")
    #item_loader.add_xpath("my_field", "my_xpath")
    #
    return item_loader.load_item()
Example #2
Source File: jobbole_Item.py From FunpySpiderSearchEngine with Apache License 2.0
def save_to_es(self):
    """Save a Jobbole blog article to Elasticsearch."""
    self.clean_data()
    blog = JobboleBlogIndex()
    blog.title = self['title']
    blog.create_date = self["create_date"]
    blog.content = remove_tags(self["content"])
    blog.front_image_url = self["front_image_url"]
    blog.praise_nums = self["praise_nums"]
    blog.fav_nums = self["fav_nums"]
    blog.comment_nums = self["comment_nums"]
    blog.url = self["url"]
    blog.tags = self["tags"]
    blog.meta.id = self["url_object_id"]
    # A suggest field must be supplied when saving the document
    blog.suggest = generate_suggests(es_jobbole_blog,
                                     ((blog.title, 10), (blog.tags, 6), (blog.content, 4)))
    real_time_count('jobbole_blog_count', JOBBOLE_COUNT_INIT)
    blog.save()
Example #3
Source File: zhihu_item.py From FunpySpiderSearchEngine with Apache License 2.0
def clean_data(self):
    try:
        self["praise_num"] = extract_num("".join(self["praise_num"]))
    except BaseException:
        self["praise_num"] = 0
    self["comments_num"] = extract_num("".join(self["comments_num"]))
    self["create_time"] = datetime.datetime.fromtimestamp(
        self["create_time"]).strftime(SQL_DATETIME_FORMAT)
    try:
        self["update_time"] = datetime.datetime.fromtimestamp(
            self["update_time"]).strftime(SQL_DATETIME_FORMAT)
    except:
        self["update_time"] = self["create_time"]
    self["crawl_time"] = self["crawl_time"].strftime(SQL_DATETIME_FORMAT)
    self["content"] = remove_tags(self["content"])
Example #4
Source File: post_pass_item.py From scrapy-templates with MIT License
def populate_field(self, response):
    item_loader = ItemLoader(item=response.meta["item"], response=response)
    item_loader.default_input_processor = MapCompose(remove_tags)
    #item_loader.add_css("field", "")
    return item_loader.load_item()
Example #5
Source File: post_pass_item.py From scrapy-templates with MIT License
def parse(self, response):
    item_loader = ItemLoader(item=MyItem(), response=response)
    item_loader.default_input_processor = MapCompose(remove_tags)
    #item_loader.add_css("", "")
    #item_loader.add_css("", "")
    #item_loader.add_css("", "")
    yield FormRequest("POST_URL",
                      formdata={'parameter': 'p'},
                      meta={'item': item_loader.load_item()},
                      callback=self.populate_field)
Example #6
Source File: 1fol2fol_pag3scr.py From scrapy-templates with MIT License
def populate_item(self, response):
    item_loader = ItemLoader(item=MySpiderItem(), response=response)
    item_loader.default_input_processor = MapCompose(remove_tags)
    #item_loader.add_css("field", "")
    yield item_loader.load_item()

# 3. PAGINATION LEVEL 2
Example #7
Source File: regex.py From learn_python3_spider with MIT License
def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
    def clean_text(text):
        return replace_escape_chars(remove_tags(text.decode(response_encoding))).strip()

    def clean_url(url):
        clean_url = ''
        try:
            clean_url = urljoin(base_url, replace_entities(clean_link(url.decode(response_encoding))))
        except ValueError:
            pass
        return clean_url

    if base_url is None:
        base_url = get_base_url(response_text, response_url, response_encoding)

    links_text = linkre.findall(response_text)
    return [Link(clean_url(url).encode(response_encoding), clean_text(text))
            for url, _, text in links_text]
Example #8
Source File: 1fol2fol3scr.py From scrapy-templates with MIT License
def populate_item(self, response):
    item_loader = ItemLoader(item=MySpiderItem(), response=response)
    item_loader.default_input_processor = MapCompose(remove_tags)
    #item_loader.add_css("", "")
    #item_loader.add_css("", "")
    yield item_loader.load_item()
Example #9
Source File: 1fol2scr.py From scrapy-templates with MIT License
def populate_item(self, response):
    item_loader = ItemLoader(item=MySpiderItem(), response=response)
    item_loader.default_input_processor = MapCompose(remove_tags)
    #item_loader.add_css("", "")
    #item_loader.add_css("", "")
    yield item_loader.load_item()
Example #10
Source File: 1fol2scr_pag.py From scrapy-templates with MIT License
def populate_item(self, response):
    item_loader = ItemLoader(item=MySpiderItem(), response=response)
    item_loader.default_input_processor = MapCompose(remove_tags)
    # item_loader.add_css("")
    # item_loader.add_value("raw", raw)

    # yield the populated item first
    yield item_loader.load_item()
    # then yield the function which paginates to another page that contains data
    yield self.paginate(response)

# 3. PAGINATION LEVEL 2
Example #11
Source File: 1fol_pag2scr.py From scrapy-templates with MIT License
def populate_item(self, response):
    item_loader = ItemLoader(item=MySpiderItem(), response=response)
    item_loader.default_input_processor = MapCompose(remove_tags)
    # item_loader.add_css("", "")
    yield item_loader.load_item()

# 3. PAGINATION LEVEL 1
Example #12
Source File: sitemap_spider.py From scrapy-templates with MIT License
def scrape_product(self, response):
    item_loader = ItemLoader(item=MyItem(), response=response)
    item_loader.default_input_processor = MapCompose(remove_tags)
    item_loader.default_output_processor = TakeFirst()
    item_loader.add_css("my_field", "selector")
    return item_loader.load_item()
Example #13
Source File: regex.py From learn_python3_spider with MIT License
def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
    def clean_text(text):
        return replace_escape_chars(remove_tags(text.decode(response_encoding))).strip()

    def clean_url(url):
        clean_url = ''
        try:
            clean_url = urljoin(base_url, replace_entities(clean_link(url.decode(response_encoding))))
        except ValueError:
            pass
        return clean_url

    if base_url is None:
        base_url = get_base_url(response_text, response_url, response_encoding)

    links_text = linkre.findall(response_text)
    return [Link(clean_url(url).encode(response_encoding), clean_text(text))
            for url, _, text in links_text]
Example #14
Source File: baidu_search.py From ChatBotCourse with MIT License
def parse_url(self, response):
    print "url:", response.url
    print "title:", response.meta['title']
    print "abstract:", response.meta['abstract']
    content = remove_tags(response.selector.xpath('//body').extract()[0])
    print "content_len:", len(content)
Example #15
Source File: mg.py From scrape-news with MIT License
def parse(self, response):
    canonical_url = response.xpath('//link[@rel="canonical"]/@href').extract_first()

    ## Skip excluded sections
    section = response.css('a.section').xpath('text()').extract_first()
    if section and section.lower() in IGNORE_SECTIONS:
        self.logger.info("Skipping %s because section is %s", canonical_url, section)
        return

    ## Skip syndicated content
    body_html = "".join(response.css("#body_content p").extract())
    body_text = remove_tags(body_html)
    for string in SKIP_STRINGS:
        suffix = body_text[-20:]
        if string in suffix:
            self.logger.info("Skipping %s because suffix %r contains %r", canonical_url, suffix, string)
            return

    publication_date_str = response.xpath('//meta[@name="publicationdate"]/@content').extract_first()
    publication_date = datetime.strptime(publication_date_str, '%d/%m/%Y')
    publication_date = SAST.localize(publication_date)

    item = ScrapenewsItem()
    item['body_html'] = response.css("#body_content").extract_first()
    item['title'] = response.xpath('//meta[@name="title"]/@content').extract_first()
    item['byline'] = response.xpath('//meta[@name="author"]/@content').extract_first()
    item['published_at'] = publication_date.isoformat()
    item['retrieved_at'] = datetime.utcnow().isoformat()
    item['url'] = canonical_url
    item['file_name'] = response.url.split('/')[-1]
    item['spider_name'] = self.name
    item['publication_name'] = self.publication_name

    yield item
Example #16
Source File: lagou_Item.py From FunpySpiderSearchEngine with Apache License 2.0
def save_to_es(self):
    self.clean_data()
    job = LagouJobIndex()
    job.title = self["title"]
    job.url = self["url"]
    job.meta.id = self["url_object_id"]
    job.salary_min = self["salary_min"]
    job.salary_max = self["salary_max"]
    job.job_city = self["job_city"]
    job.work_years_min = self["work_years_min"]
    job.work_years_max = self["work_years_max"]
    job.degree_need = self["degree_need"]
    job.job_desc = remove_tags(self["job_desc"]).strip().replace("\r\n", "").replace("\t", "")
    job.job_advantage = self["job_advantage"]
    job.tags = self["tags"]
    job.job_type = self["job_type"]
    job.publish_time = self["publish_time"]
    job.job_addr = self["job_addr"]
    job.company_name = self["company_name"]
    job.company_url = self["company_url"]
    job.crawl_time = self['crawl_time']

    job.suggest = generate_suggests(es_lagou_job,
                                    ((job.title, 10), (job.tags, 7), (job.job_advantage, 6),
                                     (job.job_desc, 3), (job.job_addr, 5), (job.company_name, 8),
                                     (job.degree_need, 4), (job.job_city, 9)))

    real_time_count('lagou_job_count', JOB_COUNT_INIT)
    job.save()
Example #17
Source File: zhihu_item.py From FunpySpiderSearchEngine with Apache License 2.0
def clean_data(self): self["question_id"] = self["question_id"][0] self["topics"] = ",".join(self["topics"]) self["url"] = self["url"][0] self["title"] = "".join(self["title"]) try: self["content"] = "".join(self["content"]) self["content"] = remove_tags(self["content"]) except BaseException: self["content"] = "无" try: self["answer_num"] = extract_num("".join(self["answer_num"])) except BaseException: self["answer_num"] = 0 self["comments_num"] = extract_num("".join(self["comments_num"])) if len(self["watch_user_num"]) == 2: watch_user_num_click = self["watch_user_num"] self["watch_user_num"] = extract_num_include_dot(watch_user_num_click[0]) self["click_num"] = extract_num_include_dot(watch_user_num_click[1]) else: watch_user_num_click = self["watch_user_num"] self["watch_user_num"] = extract_num_include_dot(watch_user_num_click[0]) self["click_num"] = 0 self["crawl_time"] = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)
Example #18
Source File: baidu_search.py From ChatBotCourse with MIT License
def parse(self, response):
    hrefs = response.selector.xpath('//div[contains(@class, "c-container")]/h3/a/@href').extract()
    containers = response.selector.xpath('//div[contains(@class, "c-container")]')
    for container in containers:
        href = container.xpath('h3/a/@href').extract()[0]
        title = remove_tags(container.xpath('h3/a').extract()[0])
        c_abstract = container.xpath('div/div/div[contains(@class, "c-abstract")]').extract()
        abstract = ""
        if len(c_abstract) > 0:
            abstract = remove_tags(c_abstract[0])
        request = scrapy.Request(href, callback=self.parse_url)
        request.meta['title'] = title
        request.meta['abstract'] = abstract
        yield request