Python bs4.Comment() Examples

The following are 28 code examples of bs4.Comment(), collected from open source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module bs4, or try the search function.
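Most of the examples below share one pattern: find Comment nodes with a string/text filter on the parse tree, then call extract() to remove them. As a minimal, self-contained sketch of that pattern (illustrative markup, not taken from any project below):

from bs4 import BeautifulSoup, Comment

soup = BeautifulSoup("<p>text<!-- hidden note --></p>", "html.parser")
for comment in soup.find_all(string=lambda s: isinstance(s, Comment)):
    print(repr(comment))  # ' hidden note '
    comment.extract()     # detach the Comment from the tree
print(soup)               # <p>text</p>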
Example #1
Source File: web_import.py    From anki-search-inside-add-card with GNU Affero General Public License v3.0
def _fetch(url: str) -> BeautifulSoup:
    html    = ""
    req     = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'}) 

    with urllib.request.urlopen(req) as response:
        html = response.read()

    page    = BeautifulSoup(html, "html.parser")

    for ignored_tag in ["script", "img", "input", "button", "style", "font", "iframe", "object", "embed"]:
        for tag in page.find_all(ignored_tag):
            tag.decompose()

    for tag in page.find_all(recursive=True):
        for attribute in ["class", "id", "name", "style", "role", "lang", "dir", "href", "src"]:
            del tag[attribute]
        for attribute in list(tag.attrs):
            if attribute.startswith("data-"):
                del tag.attrs[attribute]

    for node in page.find_all(text=lambda s: isinstance(s, Comment)):
        node.extract()

    return page 
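Recent Beautiful Soup releases prefer the string= keyword over text= in find()/find_all() (the two are treated as equivalent, with text= kept as a legacy alias), so the comment-stripping loop above can equally be written as:

for node in page.find_all(string=lambda s: isinstance(s, Comment)):
    node.extract()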
Example #2
Source File: crawler.py    From lightnovel-crawler with Apache License 2.0
def clean_contents(self, div):
        if not div:
            return div
        # end if
        div.attrs = {}
        # Note: findAll(True) yields only Tag objects, so Comment nodes
        # have to be removed in a separate pass.
        for comment in div.findAll(string=lambda s: isinstance(s, Comment)):
            comment.extract()   # Remove comments
        # end for
        for tag in div.findAll(True):
            if tag.name == 'br':
                next_tag = tag.next_sibling
                if next_tag and getattr(next_tag, 'name', None) == 'br':
                    tag.extract()
                # end if
            elif tag.name in self.bad_tags:
                tag.extract()   # Remove bad tags
            elif not tag.text.strip():
                tag.extract()   # Remove empty tags
            elif self.is_blacklisted(tag.text):
                tag.extract()   # Remove blacklisted contents
            elif hasattr(tag, 'attrs'):
                tag.attrs = {}    # Remove attributes
            # end if
        # end for
        return div
    # end def 
Example #3
Source File: wordpress.py    From CMSsc4n with GNU General Public License v3.0
def wordpressFuncXml(data):
	cms = False
	comment = ""
	version_match = None
	try:

		soup = BeautifulSoup(data.text, 'lxml')
		comments = soup.findAll(text=lambda text:isinstance(text, Comment))

		if len(comments) > 0:
			cms = True	
			version_match = re.findall(r'(?:(\d+\.[.\d]*\d+))',comments[0])
			if len(version_match) > 0:
				version_match = version_match[0]
		if version_match != WORDPRESS_LAST_CMS_VERSION:
			print("The WordPress version is outdated or could not be identified")
		else:
			print("The WordPress version is up to date")

	except Exception as e:
		print(e)
		version_match = None

	finally:
		return cms,version_match 
Example #4
Source File: parser.py    From OrgNote with GNU General Public License v2.0
def duosuo(self):
        if not self.duoshuo_shortname:
            return """
            """
        else:
            return """
            <!-- Duoshuo Comment BEGIN -->
            <div class="ds-thread"></div>
            <script type="text/javascript">
            var duoshuoQuery = {short_name:"%s"};
            (function() {
            var ds = document.createElement('script');
            ds.type = 'text/javascript';ds.async = true;
            ds.src = 'http://static.duoshuo.com/embed.js';
            ds.charset = 'UTF-8';
            (document.getElementsByTagName('head')[0]
            || document.getElementsByTagName('body')[0]).appendChild(ds);
            })();
            </script>
            <!-- Duoshuo Comment END -->
            """ % self.duoshuo_shortname 
Example #5
Source File: owasp_suite.py    From adapt with Apache License 2.0
def find_comments_in_html_by_urls(self, urls):
		res = []
		for url in urls:
			path = urlparse(url).path
			host = urlparse(url).hostname
			# the request line needs a literal "HTTP/1.1", not the URL scheme
			req = "GET {0} HTTP/1.1\r\nhost: {1}\r\n\r\n".format(path, host)
			try:
				r = self.zap.send_request(req)
				html = str(r['responseBody'])
			except Exception as e:
				r = requests.get(url)
				html = r.text
			if html:
				soup = BeautifulSoup(html, 'html.parser')
				comments = soup.findAll(text=lambda text: isinstance(text, Comment))
				comment_list = [str(comment) for comment in comments]
				# append one result per URL rather than one per comment
				c = {"method": "GET", "url": url, "resp": r.text, "request": "GET " + url, "data": comment_list}
				res.append(c)
		return res 
Example #6
Source File: standings.py    From pybaseball with MIT License
def standings(season=None):
    # get most recent standings if date not specified
    if(season is None):
        season = int(datetime.datetime.today().strftime("%Y"))
    if season < 1871:
        raise ValueError("This query currently only returns standings dating back to the 1871 season; try a year from 1871 to the present.")
    # retrieve html from baseball reference
    soup = get_soup(season)
    if season>=1969:
        tables = get_tables(soup, season)
    else:
        t = soup.find_all(string=lambda text:isinstance(text,Comment))
        # list of seasons whose table placement breaks the site's usual pattern
        exceptions = [1884, 1885, 1886, 1887, 1888, 1889, 1890, 1892, 1903]
        if (season>1904 or season in exceptions): code = BeautifulSoup(t[16], "lxml")
        elif season<=1904: code = BeautifulSoup(t[15], "lxml")
        tables = get_tables(code, season)
    tables = [pd.DataFrame(table) for table in tables]
    for idx in range(len(tables)):
        tables[idx] = tables[idx].rename(columns=tables[idx].iloc[0])
        tables[idx] = tables[idx].reindex(tables[idx].index.drop(0))
    return tables 
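The Comment round-trip here exists because baseball-reference serves the pre-1969 standings tables inside HTML comments; re-parsing a comment's text with BeautifulSoup recovers the hidden table. Roughly, with hypothetical markup:

from bs4 import BeautifulSoup, Comment

html = '<div><!-- <table><tr><td>1871</td></tr></table> --></div>'
hidden = BeautifulSoup(html, "lxml").find(string=lambda t: isinstance(t, Comment))
table = BeautifulSoup(hidden, "lxml").table  # the commented-out table, parsed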
Example #7
Source File: dactyl_style_checker.py    From dactyl with MIT License
def get_overrides(self, soup):
        """
        Look for overrides in the text to make exceptions for specific style
        rules. Returns a set of rule strings to ignore for this block.
        """

        overrides = set()
        comments = soup.find_all(string=lambda text:isinstance(text,Comment))
        for comment in comments:
            m = re.match(OVERRIDE_COMMENT_REGEX, comment)
            if m:
                new_overrides = m.group(1).split(",")
                new_overrides = {o.strip() for o in new_overrides}
                logger.info("Overrides found: %s" % new_overrides)
                overrides |= new_overrides

        return overrides 
Example #8
Source File: importer.py    From incremental-reading with ISC License
def _fetchWebpage(self, url):
        if isMac:
            context = _create_unverified_context()
            html = urlopen(url, context=context).read()
        else:
            headers = {'User-Agent': self.settings['userAgent']}
            html = get(url, headers=headers).content

        webpage = BeautifulSoup(html, 'html.parser')

        for tagName in self.settings['badTags']:
            for tag in webpage.find_all(tagName):
                tag.decompose()

        for c in webpage.find_all(text=lambda s: isinstance(s, Comment)):
            c.extract()

        return webpage 
Example #9
Source File: __init__.py    From uoft-scrapers with MIT License
def normalize_text_sections(div):
        paragraph = ''
        for content in div.contents:
            text = ''
            if type(content) == NavigableString:
                text = content
            elif type(content) == Comment:
                pass
            elif content.name == 'li':
                text = content.text
            else:
                text = content.text
            text = text.strip()
            paragraph += text.strip() + ' '
        paragraph = paragraph.strip()
        paragraph = paragraph.replace('\r', '')
        paragraph = paragraph.replace('\n', ', ')
        paragraph = paragraph.strip()
        return paragraph 
Example #10
Source File: __init__.py    From uoft-scrapers with MIT License
def normalize_text_sections(div):
        paragraph = ''
        for content in div.contents:
            text = ''
            if type(content) == NavigableString:
                text = content
            elif type(content) == Comment:
                pass
            elif content.name == 'li':
                text = content.text
            else:
                text = content.text
            text = text.strip()
            paragraph += text.strip() + ' '
        paragraph = paragraph.strip()
        paragraph = paragraph.replace('\r', '')
        paragraph = paragraph.replace('\n', ', ')
        paragraph = paragraph.replace('  ', ' ')
        paragraph = paragraph.strip()
        return paragraph 
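The only difference from Example #9's version is the extra replace('  ', ' '). It matters because the loop appends a trailing space even for skipped Comment nodes, which leaves double spaces behind. A quick sketch with illustrative input:

from bs4 import BeautifulSoup

div = BeautifulSoup("<div>Hello<!-- c --><li>world</li></div>", "html.parser").div
print(normalize_text_sections(div))  # "Hello world" (the comment's stray space is collapsed)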
Example #11
Source File: doc_utils.py    From axcell with Apache License 2.0
def transform(el):
    if isinstance(el, Tag):
#        for f in _transforms_el:
#            r = f(el)
#            if r is not None:
#                return transform(r)
        return el.get_text()
    elif not isinstance(el, Comment):
        return str(el)
    return '' 
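A rough illustration of how transform() flattens mixed children (hypothetical markup; Tag and Comment come from bs4):

from bs4 import BeautifulSoup

soup = BeautifulSoup("<div>plain<b>bold</b><!-- note --></div>", "html.parser")
print([transform(el) for el in soup.div.contents])
# ['plain', 'bold', ''] ; the comment collapses to an empty string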
Example #12
Source File: test_hints_html.py    From qutebrowser with GNU General Public License v3.0
def _parse_file(test_name):
    """Parse the given HTML file."""
    file_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                             'data', 'hints', 'html', test_name)
    with open(file_path, 'r', encoding='utf-8') as html:
        soup = bs4.BeautifulSoup(html, 'html.parser')

    # find() returns None when there is no comment, so check before
    # converting to a string for yaml_load()
    comment = soup.find(text=lambda text: isinstance(text, bs4.Comment))

    if comment is None:
        raise InvalidFile(test_name, "no comment found")

    data = utils.yaml_load(str(comment))

    if not isinstance(data, dict):
        raise InvalidFile(test_name, "expected yaml dict but got {}".format(
            type(data).__name__))

    allowed_keys = {'target', 'qtwebengine_todo'}
    if not set(data.keys()).issubset(allowed_keys):
        raise InvalidFile(test_name, "expected keys {} but found {}".format(
            ', '.join(allowed_keys),
            ', '.join(set(data.keys()))))

    if 'target' not in data:
        raise InvalidFile(test_name, "'target' key not found")

    qtwebengine_todo = data.get('qtwebengine_todo', None)

    return ParsedFile(target=data['target'], qtwebengine_todo=qtwebengine_todo) 
Example #13
Source File: doc_utils.py    From axcell with Apache License 2.0
def content_in_section(header, names=['h2', 'h3'], skip_comments=True):
    for el in header.next_siblings:
        if getattr(el, 'name', '') in names:
            break
        if skip_comments and isinstance(el, Comment):
            continue
        yield el 
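For example, this would gather everything between the first h2 and the next h2/h3, skipping comments along the way (illustrative markup):

from bs4 import BeautifulSoup

soup = BeautifulSoup(
    "<h2>Intro</h2><p>one</p><!-- skip --><p>two</p><h2>Next</h2>",
    "html.parser")
section = list(content_in_section(soup.h2))
# [<p>one</p>, <p>two</p>] ; the comment is skipped and iteration
# stops at the next <h2>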
Example #14
Source File: blog_parser.py    From QzoneExporter with GNU General Public License v3.0
def export(self):
        with open(self._blog_filename, "w", encoding="utf-8") as f:

            self._bs_obj.title.string = self._blog_info.title

            # remove script and style tags
            delete_labels = ["script", "style"]
            for delete_label in delete_labels:
                for t in self._bs_obj.find_all(delete_label):
                    if filter_blog_script(t.text):
                        continue
                    t.extract()

            # remove comments
            for comment in self._bs_obj.find_all(text=lambda text: isinstance(text, Comment)):
                comment.extract()

            pubtime = self._bs_obj.find("span", {"id": "pubTime"})
            pubtime.string = time.strftime(
                "%Y-%m-%d %H:%M:%S", time.localtime(self._blog_info.blog_id))

            readnum = self._bs_obj.find("span", {"id": "readNum"})
            readnum.string = "阅读(%d)\t评论(%d)" % (
                self._read, self._blog_info.comment_num)

            f.write(self._bs_obj.prettify()) 
Example #15
Source File: thu_learn.py    From thu_learn with MIT License
def files(self):
        """
        get all files in course
        :return: File generator
        """

        def file_size_M(s):
            digitals = s[:-1]
            if s.endswith('K'):
                return float(digitals) / 1024
            elif s.endswith('M'):
                return float(digitals)
            else:
                return 1024 * float(digitals)

        url = _PREF_FILES + self.id
        soup = make_soup(url)
        for j in soup.find_all('tr', class_=['tr1', 'tr2']):
            name = re.search(r'getfilelink=([^&]+)&', str(j.find(text=lambda text: isinstance(text, Comment)))).group(1)
            a = j.find('a')
            url = 'http://learn.tsinghua.edu.cn/kejian/data/%s/download/%s' % (self.id, name)
            title = re.sub(r'[\n\r\t ]', '', a.contents[0])
            name = re.sub(r'_[^_]+\.', '.', name)
            size = file_size_M(j.find_all('td')[-3].text)  # unit: MB
            yield File(size=size, name=name, url=url)
Example #16
Source File: css_match.py    From plugin.git.browser with GNU General Public License v3.0
def is_comment(obj):
        """Is comment."""

        import bs4
        return isinstance(obj, bs4.Comment) 
Example #17
Source File: css_match.py    From plugin.git.browser with GNU General Public License v3.0
def is_special_string(obj):
        """Is special string."""

        import bs4
        return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction)) 
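Both helpers lean on the fact that Comment, Declaration, CData and ProcessingInstruction are all subclasses of NavigableString, so a plain string check would also match comments; a small demonstration:

import bs4

node = bs4.BeautifulSoup("<!-- x -->", "html.parser").contents[0]
print(isinstance(node, bs4.NavigableString))  # True
print(isinstance(node, bs4.Comment))          # True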
Example #18
Source File: owasp_suite.py    From adapt with Apache License 2.0
def find_comments_in_html(self, html):
		# sometimes a dict is passed to this function instead of a string
		if not isinstance(html, str):
			return []
		soup = BeautifulSoup(html, 'html.parser')
		comments = soup.findAll(text=lambda text: isinstance(text, Comment))
		return comments 
Example #19
Source File: css_match.py    From bazarr with GNU General Public License v3.0
def is_comment(obj):
        """Is comment."""

        import bs4
        return isinstance(obj, bs4.Comment) 
Example #20
Source File: css_match.py    From bazarr with GNU General Public License v3.0
def is_special_string(obj):
        """Is special string."""

        import bs4
        return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype)) 
Example #21
Source File: css_match.py    From Tautulli with GNU General Public License v3.0
def is_comment(obj):
        """Is comment."""

        import bs4
        return isinstance(obj, bs4.Comment) 
Example #22
Source File: css_match.py    From Tautulli with GNU General Public License v3.0
def is_special_string(obj):
        """Is special string."""

        import bs4
        return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype)) 
Example #23
Source File: yukinovel.py    From lightnovel-crawler with Apache License 2.0
def download_chapter_body(self, chapter):
        '''Download body of a single chapter and return as clean html format.'''
        logger.info('Downloading %s', chapter['url'])
        soup = self.get_soup(chapter['url'])

        contents = soup.select_one('div.entry-content.cl')

        for d in contents.findAll('div'):
            d.decompose()
        # end for

        for comment in contents.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()
        # end for

        paragraphs = contents.findAll('p')
        if paragraphs and paragraphs[0].text.strip().startswith('Bab'):
            chapter['title'] = paragraphs[0].text.strip()
            paragraphs[0].extract()
        # end if

        logger.debug(chapter['title'])

        return str(contents)
    # end def
# end class 
Example #24
Source File: importbasics.py    From SchoolIdolAPI with Apache License 2.0
def remove_all_comments(soup):
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()
    return soup 
Example #25
Source File: whitelist.py    From wagtail with BSD 3-Clause "New" or "Revised" License
def clean_string_node(self, doc, node):
        # Remove comments
        if isinstance(node, Comment):
            node.extract()
            return

        # by default, nothing needs to be done to whitelist string nodes
        pass 
Example #26
Source File: amazon_invoice_sanitize.py    From beancount-import with GNU General Public License v2.0
def sanitize_invoice(input_path: str, output_path: str,
                     credit_card_digits: str):
    with open(input_path, 'rb') as fb:
        soup = bs4.BeautifulSoup(fb.read(), 'lxml')
    comments = soup.find_all(text=lambda text: isinstance(text, bs4.Comment))
    remove_tag(soup, 'script')
    remove_tag(soup, 'style')
    remove_tag(soup, 'link')
    remove_tag(soup, 'noscript')
    remove_tag(soup, 'img')
    remove_tag(soup, 'input')
    for x in soup.find_all('a'):
        if 'href' in x.attrs and '/dp/' not in x.attrs['href']:
            del x['href']
    for x in comments:
        x.extract()

    new_output, order_id_replacements = sanitize_order_ids(str(soup))
    # new_output = sanitize_other_ids(new_output)
    new_output = sanitize_credit_card(new_output, credit_card_digits)
    new_output = sanitize_address(new_output)
    if os.path.isdir(output_path):
        output_name, _ = sanitize_order_ids(
            os.path.basename(input_path), order_id_replacements)
        output_path = os.path.join(output_path, output_name)
    with open(output_path, 'w', encoding='utf-8', newline='\n') as f:
        f.write(new_output) 
Example #27
Source File: core_parser.py    From getbook with GNU Affero General Public License v3.0
def clean_soup(dom):
    # clean comments
    for el in dom.find_all(text=lambda text: isinstance(text, Comment)):
        el.extract()

    for el in dom.find_all(KILL_TAGS):
        el.extract() 
Example #28
Source File: css_match.py    From soupsieve with MIT License
def is_special_string(obj):
        """Is special string."""
        return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))