Python bs4.Comment() Examples

The following are 28 code examples of bs4.Comment(), collected from open-source projects. The original project, source file, and license are noted above each example. You may also want to check out all other available functions and classes of the bs4 module.
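Most of the examples below share one idiom: bs4.Comment is a subclass of NavigableString, so comments are located by filtering the document's strings with isinstance() and removed with extract(). Here is a minimal, self-contained sketch of that idiom; the HTML snippet is made up for illustration.

from bs4 import BeautifulSoup, Comment

html = "<html><body><!-- hidden note --><p>visible text</p></body></html>"
soup = BeautifulSoup(html, "html.parser")

# Comments are string nodes, not tags, so filter the document's strings.
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
    comment.extract()  # detach the comment node from the tree

print(soup)  # <html><body><p>visible text</p></body></html>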
Example #1
Source File: web_import.py From anki-search-inside-add-card with GNU Affero General Public License v3.0

def _fetch(url: str) -> BeautifulSoup:
    html = ""
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(req) as response:
        html = response.read()
    page = BeautifulSoup(html, "html.parser")
    for ignored_tag in ["script", "img", "input", "button", "style", "font",
                        "iframe", "object", "embed"]:
        for tag in page.find_all(ignored_tag):
            tag.decompose()
    for tag in page.find_all(recursive=True):
        for attribute in ["class", "id", "name", "style", "role", "lang", "dir", "href", "src"]:
            del tag[attribute]
        for attribute in list(tag.attrs):
            if attribute.startswith("data-"):
                del tag.attrs[attribute]
    for node in page.find_all(text=lambda s: isinstance(s, Comment)):
        node.extract()
    return page
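A small usage note on the filter above: current Beautiful Soup releases spell this argument string= (as in page.find_all(string=lambda s: isinstance(s, Comment))); the text= form used in this and several later examples is the older alias and still works.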
Example #2
Source File: crawler.py From lightnovel-crawler with Apache License 2.0

def clean_contents(self, div):
    if not div:
        return div
    # end if
    div.attrs = {}
    for tag in div.findAll(True):
        if isinstance(tag, Comment):
            tag.extract()   # Remove comments
        elif tag.name == 'br':
            next_tag = getattr(tag, 'next_sibling')
            if next_tag and getattr(next_tag, 'name') == 'br':
                tag.extract()
            # end if
        elif tag.name in self.bad_tags:
            tag.extract()   # Remove bad tags
        elif not tag.text.strip():
            tag.extract()   # Remove empty tags
        elif self.is_blacklisted(tag.text):
            tag.extract()   # Remove blacklisted contents
        elif hasattr(tag, 'attrs'):
            tag.attrs = {}  # Remove attributes
        # end if
    # end for
    return div
# end def
Example #3
Source File: wordpress.py From CMSsc4n with GNU General Public License v3.0

def wordpressFuncXml(data):
    cms = False
    comment = ""
    version_match = None
    try:
        soup = BeautifulSoup(data.text, 'lxml')
        comments = soup.findAll(text=lambda text: isinstance(text, Comment))
        if len(comments) > 0:
            cms = True
            version_match = re.findall(r'(?:(\d+\.[.\d]*\d+))', comments[0])
            if len(version_match) > 0:
                version_match = version_match[0]
            if version_match != WORDPRESS_LAST_CMS_VERSION:
                print "The version wordpress is outdated or not identified"
            else:
                print "The version wordpress is updated"
    except Exception as e:
        print e
        version_match = None
    finally:
        return cms, version_match
Example #4
Source File: parser.py From OrgNote with GNU General Public License v2.0

def duosuo(self):
    if not self.duoshuo_shortname:
        return """
        """
    else:
        return """
    <!-- Duoshuo Comment BEGIN -->
    <div class="ds-thread"></div>
    <script type="text/javascript">
    var duoshuoQuery = {short_name:"%s"};
    (function() {
        var ds = document.createElement('script');
        ds.type = 'text/javascript';ds.async = true;
        ds.src = 'http://static.duoshuo.com/embed.js';
        ds.charset = 'UTF-8';
        (document.getElementsByTagName('head')[0]
         || document.getElementsByTagName('body')[0]).appendChild(ds);
    })();
    </script>
    <!-- Duoshuo Comment END -->
    """ % self.duoshuo_shortname
Example #5
Source File: owasp_suite.py From adapt with Apache License 2.0

def find_comments_in_html_by_urls(self, urls):
    res = []
    for url in urls:
        path = urlparse(url).path
        host = urlparse(url).hostname
        scheme = urlparse(url).scheme
        req = "GET {0} {1}/1.1\r\nhost: {2}\r\n\r\n".format(path, scheme, host)
        try:
            r = self.zap.send_request(req)
            html = str(r['responseBody'])
        except Exception as e:
            r = requests.get(url)
            html = r.text
        if (html):
            soup = BeautifulSoup(html, 'html.parser')
            comments = soup.findAll(text=lambda text: isinstance(text, Comment))
            comment_list = []
            for comment in comments:
                str1 = str(comment)
                comment_list.append(str1)
            c = {
                "method": "GET",
                "url": url,
                "resp": r.text,
                "request": "GET " + url,
                "data": comment_list
            }
            res.append(c)
    return res
Example #6
Source File: standings.py From pybaseball with MIT License

def standings(season=None):
    # get most recent standings if date not specified
    if(season is None):
        season = int(datetime.datetime.today().strftime("%Y"))
    if season < 1871:
        raise ValueError("This query currently only returns standings until the 1871 season. Try looking at years from 1871 to present.")
    # retrieve html from baseball reference
    soup = get_soup(season)
    if season >= 1969:
        tables = get_tables(soup, season)
    else:
        t = soup.find_all(string=lambda text: isinstance(text, Comment))
        # list of seasons whose table placement breaks the site's usual pattern
        exceptions = [1884, 1885, 1886, 1887, 1888, 1889, 1890, 1892, 1903]
        if (season > 1904 or season in exceptions):
            code = BeautifulSoup(t[16], "lxml")
        elif season <= 1904:
            code = BeautifulSoup(t[15], "lxml")
        tables = get_tables(code, season)
    tables = [pd.DataFrame(table) for table in tables]
    for idx in range(len(tables)):
        tables[idx] = tables[idx].rename(columns=tables[idx].iloc[0])
        tables[idx] = tables[idx].reindex(tables[idx].index.drop(0))
    return tables
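Context for the else branch above: for pre-1969 seasons the site appears to serve the standings tables inside HTML comments, so the code picks the right Comment out of the list and re-parses its text as a document of its own. A minimal sketch of that re-parsing step, with made-up HTML:

from bs4 import BeautifulSoup, Comment

html = "<div><!-- <table><tr><td>W</td><td>L</td></tr></table> --></div>"
soup = BeautifulSoup(html, "html.parser")

comment = soup.find(string=lambda text: isinstance(text, Comment))
inner = BeautifulSoup(comment, "html.parser")  # a Comment is a str, so it can be parsed directly
print(inner.find("td").text)  # W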
Example #7
Source File: dactyl_style_checker.py From dactyl with MIT License

def get_overrides(self, soup):
    """
    Look for overrides in the text to make exceptions for
    specific style rules. Returns a set of rule strings to
    ignore for this block.
    """
    overrides = set()
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    for comment in comments:
        m = re.match(OVERRIDE_COMMENT_REGEX, comment)
        if m:
            new_overrides = m.group(1).split(",")
            new_overrides = {o.strip() for o in new_overrides}
            logger.info("Overrides found: %s" % new_overrides)
            overrides |= new_overrides
    return overrides
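The example above reads HTML comments as in-band directives for the style checker. A hedged sketch of the same idea — the dactyl_override: format and regex below are invented for illustration, not dactyl's actual OVERRIDE_COMMENT_REGEX:

import re
from bs4 import BeautifulSoup, Comment

html = "<p>some text</p><!-- dactyl_override: rule-a, rule-b -->"
soup = BeautifulSoup(html, "html.parser")

overrides = set()
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
    m = re.match(r"\s*dactyl_override:\s*(.*)", comment)
    if m:
        overrides |= {o.strip() for o in m.group(1).split(",")}

print(sorted(overrides))  # ['rule-a', 'rule-b']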
Example #8
Source File: importer.py From incremental-reading with ISC License

def _fetchWebpage(self, url):
    if isMac:
        context = _create_unverified_context()
        html = urlopen(url, context=context).read()
    else:
        headers = {'User-Agent': self.settings['userAgent']}
        html = get(url, headers=headers).content
    webpage = BeautifulSoup(html, 'html.parser')
    for tagName in self.settings['badTags']:
        for tag in webpage.find_all(tagName):
            tag.decompose()
    for c in webpage.find_all(text=lambda s: isinstance(s, Comment)):
        c.extract()
    return webpage
Example #9
Source File: __init__.py From uoft-scrapers with MIT License

def normalize_text_sections(div):
    paragraph = ''
    for content in div.contents:
        text = ''
        if type(content) == NavigableString:
            text = content
        elif type(content) == Comment:
            pass
        elif content.name == 'li':
            text = content.text
        else:
            text = content.text
        text = text.strip()
        paragraph += text.strip() + ' '
    paragraph = paragraph.strip()
    paragraph = paragraph.replace('\r', '')
    paragraph = paragraph.replace('\n', ', ')
    paragraph = paragraph.strip()
    return paragraph
Example #10
Source File: __init__.py From uoft-scrapers with MIT License

def normalize_text_sections(div):
    paragraph = ''
    for content in div.contents:
        text = ''
        if type(content) == NavigableString:
            text = content
        elif type(content) == Comment:
            pass
        elif content.name == 'li':
            text = content.text
        else:
            text = content.text
        text = text.strip()
        paragraph += text.strip() + ' '
    paragraph = paragraph.strip()
    paragraph = paragraph.replace('\r', '')
    paragraph = paragraph.replace('\n', ', ')
    paragraph = paragraph.replace('  ', ' ')
    paragraph = paragraph.strip()
    return paragraph
Example #11
Source File: doc_utils.py From axcell with Apache License 2.0

def transform(el):
    if isinstance(el, Tag):
        # for f in _transforms_el:
        #     r = f(el)
        #     if r is not None:
        #         return transform(r)
        return el.get_text()
    elif not isinstance(el, Comment):
        return str(el)
    return ''
Example #12
Source File: test_hints_html.py From qutebrowser with GNU General Public License v3.0

def _parse_file(test_name):
    """Parse the given HTML file."""
    file_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                             'data', 'hints', 'html', test_name)
    with open(file_path, 'r', encoding='utf-8') as html:
        soup = bs4.BeautifulSoup(html, 'html.parser')

    comment = str(soup.find(text=lambda text: isinstance(text, bs4.Comment)))

    if comment is None:
        raise InvalidFile(test_name, "no comment found")

    data = utils.yaml_load(comment)
    if not isinstance(data, dict):
        raise InvalidFile(test_name, "expected yaml dict but got {}".format(
            type(data).__name__))

    allowed_keys = {'target', 'qtwebengine_todo'}
    if not set(data.keys()).issubset(allowed_keys):
        raise InvalidFile(test_name, "expected keys {} but found {}".format(
            ', '.join(allowed_keys),
            ', '.join(set(data.keys()))))

    if 'target' not in data:
        raise InvalidFile(test_name, "'target' key not found")

    qtwebengine_todo = data.get('qtwebengine_todo', None)
    return ParsedFile(target=data['target'], qtwebengine_todo=qtwebengine_todo)
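One detail worth noticing in this example: soup.find() returns None when no comment matches, but the result is wrapped in str() before the check, so a file with no comment produces the string "None" rather than None and the "no comment found" branch can never be taken; the failure would surface later, at the yaml dict check, instead.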
Example #13
Source File: doc_utils.py From axcell with Apache License 2.0

def content_in_section(header, names=['h2', 'h3'], skip_comments=True):
    for el in header.next_siblings:
        if getattr(el, 'name', '') in names:
            break
        if skip_comments and isinstance(el, Comment):
            continue
        yield el
Example #14
Source File: blog_parser.py From QzoneExporter with GNU General Public License v3.0

def export(self):
    with open(self._blog_filename, "w", encoding="utf-8") as f:
        self._bs_obj.title.string = self._blog_info.title

        # remove script and style tags
        delete_labels = ["script", "style"]
        for delete_label in delete_labels:
            for t in self._bs_obj.find_all(delete_label):
                if filter_blog_script(t.text):
                    continue
                t.extract()

        # remove comments
        for comment in self._bs_obj.find_all(
                text=lambda text: isinstance(text, Comment)):
            comment.extract()

        pubtime = self._bs_obj.find("span", {"id": "pubTime"})
        pubtime.string = time.strftime(
            "%Y-%m-%d %H:%M:%S", time.localtime(self._blog_info.blog_id))

        readnum = self._bs_obj.find("span", {"id": "readNum"})
        readnum.string = "阅读(%d)\t评论(%d)" % (
            self._read, self._blog_info.comment_num)

        f.write(self._bs_obj.prettify())
Example #15
Source File: thu_learn.py From thu_learn with MIT License

def files(self):
    """
    get all files in course
    :return: File generator
    """
    def file_size_M(s):
        digitals = s[:-1]
        if s.endswith('K'):
            return float(digitals) / 1024
        elif s.endswith('M'):
            return float(digitals)
        else:
            return 1024 * float(digitals)

    url = _PREF_FILES + self.id
    soup = make_soup(url)
    for j in soup.find_all('tr', class_=['tr1', 'tr2']):
        name = re.search(
            r'getfilelink=([^&]+)&',
            str(j.find(text=lambda text: isinstance(text, Comment)))).group(1)
        a = j.find('a')
        url = 'http://learn.tsinghua.edu.cn/kejian/data/%s/download/%s' % (self.id, name)
        title = re.sub(r'[\n\r\t ]', '', a.contents[0])
        name = re.sub(r'_[^_]+\.', '.', name)
        size = file_size_M(j.find_all('td')[-3].text)  # unit: MB
        yield File(size=size, name=name, url=url)
    pass
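The interesting step above is pulling a filename out of a comment's text with a regular expression. A standalone sketch of just that step, with an invented comment payload:

import re
from bs4 import BeautifulSoup, Comment

row = BeautifulSoup("<tr><!-- getfilelink=report_v2.pdf&id=7 --></tr>", "html.parser")
comment = row.find(string=lambda text: isinstance(text, Comment))
name = re.search(r'getfilelink=([^&]+)&', str(comment)).group(1)
print(name)  # report_v2.pdf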
Example #16
Source File: css_match.py From plugin.git.browser with GNU General Public License v3.0

def is_comment(obj):
    """Is comment."""
    import bs4
    return isinstance(obj, bs4.Comment)
Example #17
Source File: css_match.py From plugin.git.browser with GNU General Public License v3.0

def is_special_string(obj):
    """Is special string."""
    import bs4
    return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction))
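Context for this pair of helpers: every text node in bs4 is a NavigableString, and markup that is not ordinary text is represented by its subclasses — Comment, CData, ProcessingInstruction, Declaration, and Doctype. Checking isinstance() against that tuple is how this code decides whether a string node counts as element text; the bazarr, Tautulli, and soupsieve variants later in the list extend the tuple with bs4.Doctype.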
Example #18
Source File: owasp_suite.py From adapt with Apache License 2.0

def find_comments_in_html(self, html):
    # Sometimes there is a dict type that gets passed to this function
    if(type(html) != str):
        return []
    soup = BeautifulSoup(html, 'html.parser')
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    return comments
Example #19
Source File: css_match.py From bazarr with GNU General Public License v3.0

def is_comment(obj):
    """Is comment."""
    import bs4
    return isinstance(obj, bs4.Comment)
Example #20
Source File: css_match.py From bazarr with GNU General Public License v3.0

def is_special_string(obj):
    """Is special string."""
    import bs4
    return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))
Example #21
Source File: css_match.py From Tautulli with GNU General Public License v3.0

def is_comment(obj):
    """Is comment."""
    import bs4
    return isinstance(obj, bs4.Comment)
Example #22
Source File: css_match.py From Tautulli with GNU General Public License v3.0

def is_special_string(obj):
    """Is special string."""
    import bs4
    return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))
Example #23
Source File: yukinovel.py From lightnovel-crawler with Apache License 2.0

def download_chapter_body(self, chapter):
    '''Download body of a single chapter and return as clean html format.'''
    logger.info('Downloading %s', chapter['url'])
    soup = self.get_soup(chapter['url'])

    contents = soup.select_one('div.entry-content.cl')
    for d in contents.findAll('div'):
        d.decompose()
    # end for

    for comment in contents.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()
    # end for

    if contents.findAll('p')[0].text.strip().startswith('Bab'):
        chapter['title'] = contents.findAll('p')[0].text.strip()
        contents.findAll('p')[0].extract()
    else:
        chapter['title'] = chapter['title']
    # end if
    logger.debug(chapter['title'])

    return str(contents)
# end def
# end class
Example #24
Source File: importbasics.py From SchoolIdolAPI with Apache License 2.0

def remove_all_comments(soup):
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    [comment.extract() for comment in comments]
    return soup
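A quick usage sketch for the helper above (the HTML is made up). Note that the list comprehension is used purely for its side effect; a plain for loop would express the same thing more idiomatically:

from bs4 import BeautifulSoup, Comment

soup = BeautifulSoup("<p><!-- a --><!-- b -->kept</p>", "html.parser")
soup = remove_all_comments(soup)
print(soup)  # <p>kept</p>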
Example #25
Source File: whitelist.py From wagtail with BSD 3-Clause "New" or "Revised" License

def clean_string_node(self, doc, node):
    # Remove comments
    if isinstance(node, Comment):
        node.extract()
        return

    # by default, nothing needs to be done to whitelist string nodes
    pass
Example #26
Source File: amazon_invoice_sanitize.py From beancount-import with GNU General Public License v2.0

def sanitize_invoice(input_path: str, output_path: str,
                     credit_card_digits: str):
    with open(input_path, 'rb') as fb:
        soup = bs4.BeautifulSoup(fb.read(), 'lxml')
    comments = soup.find_all(text=lambda text: isinstance(text, bs4.Comment))
    remove_tag(soup, 'script')
    remove_tag(soup, 'style')
    remove_tag(soup, 'link')
    remove_tag(soup, 'noscript')
    remove_tag(soup, 'img')
    remove_tag(soup, 'input')
    for x in soup.find_all('a'):
        if 'href' in x.attrs and '/dp/' not in x.attrs['href']:
            del x['href']
    for x in comments:
        x.extract()
    new_output, order_id_replacements = sanitize_order_ids(str(soup))
    # new_output = sanitize_other_ids(new_output)
    new_output = sanitize_credit_card(new_output, credit_card_digits)
    new_output = sanitize_address(new_output)
    if os.path.isdir(output_path):
        output_name, _ = sanitize_order_ids(
            os.path.basename(input_path), order_id_replacements)
        output_path = os.path.join(output_path, output_name)
    with open(output_path, 'w', encoding='utf-8', newline='\n') as f:
        f.write(new_output)
Example #27
Source File: core_parser.py From getbook with GNU Affero General Public License v3.0

def clean_soup(dom):
    # clean comments
    for el in dom.find_all(text=lambda text: isinstance(text, Comment)):
        el.extract()

    for el in dom.find_all(KILL_TAGS):
        el.extract()
Example #28
Source File: css_match.py From soupsieve with MIT License

def is_special_string(obj):
    """Is special string."""
    return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))