Python bs4.Comment() Examples

The following are 28 code examples of bs4.Comment(), collected from open source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module bs4, or try the search function.
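Most of the examples below share one pattern: find Comment nodes with a string/text filter on the parse tree, then call extract() to remove them. As a minimal, self-contained sketch of that pattern (illustrative markup, not taken from any project below):

from bs4 import BeautifulSoup, Comment

soup = BeautifulSoup("<p>text<!-- hidden note --></p>", "html.parser")
for comment in soup.find_all(string=lambda s: isinstance(s, Comment)):
    print(repr(comment))  # ' hidden note '
    comment.extract()     # detach the Comment from the tree
print(soup)               # <p>text</p>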
Example #1
Source File: web_import.py    From anki-search-inside-add-card with GNU Affero General Public License v3.0
def _fetch(url: str) -> BeautifulSoup:
    html    = ""
    req     = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'}) 

    with urllib.request.urlopen(req) as response:
        html = response.read()

    page    = BeautifulSoup(html, "html.parser")

    for ignored_tag in ["script", "img", "input", "button", "style", "font", "iframe", "object", "embed"]:
        for tag in page.find_all(ignored_tag):
            tag.decompose()

    for tag in page.find_all(recursive=True):
        for attribute in ["class", "id", "name", "style", "role", "lang", "dir", "href", "src"]:
            del tag[attribute]
        for attribute in list(tag.attrs):
            if attribute.startswith("data-"):
                del tag.attrs[attribute]

    for node in page.find_all(text=lambda s: isinstance(s, Comment)):
        node.extract()

    return page 
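Recent Beautiful Soup releases prefer the string= keyword over text= in find()/find_all() (the two are treated as equivalent, with text= kept as a legacy alias), so the comment-stripping loop above can equally be written as:

for node in page.find_all(string=lambda s: isinstance(s, Comment)):
    node.extract()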
Example #2
Source File: crawler.py    From lightnovel-crawler with Apache License 2.0
def clean_contents(self, div):
        if not div:
            return div
        # end if
        div.attrs = {}
        # Note: findAll(True) yields only Tag objects, so Comment nodes
        # have to be removed in a separate pass.
        for comment in div.findAll(string=lambda s: isinstance(s, Comment)):
            comment.extract()   # Remove comments
        # end for
        for tag in div.findAll(True):
            if tag.name == 'br':
                next_tag = tag.next_sibling
                if next_tag and getattr(next_tag, 'name', None) == 'br':
                    tag.extract()
                # end if
            elif tag.name in self.bad_tags:
                tag.extract()   # Remove bad tags
            elif not tag.text.strip():
                tag.extract()   # Remove empty tags
            elif self.is_blacklisted(tag.text):
                tag.extract()   # Remove blacklisted contents
            elif hasattr(tag, 'attrs'):
                tag.attrs = {}    # Remove attributes
            # end if
        # end for
        return div
    # end def 
Example #3
Source File: wordpress.py    From CMSsc4n with GNU General Public License v3.0
def wordpressFuncXml(data):
	cms = False
	comment = ""
	version_match = None
	try:

		soup = BeautifulSoup(data.text, 'lxml')
		comments = soup.findAll(text=lambda text:isinstance(text, Comment))

		if len(comments) > 0:
			cms = True	
			version_match = re.findall(r'(?:(\d+\.[.\d]*\d+))',comments[0])
			if len(version_match) > 0:
				version_match = version_match[0]
		if version_match != WORDPRESS_LAST_CMS_VERSION:
			print("The WordPress version is outdated or could not be identified")
		else:
			print("The WordPress version is up to date")

	except Exception as e:
		print(e)
		version_match = None

	finally:
		return cms,version_match 
Example #4
Source File: parser.py    From OrgNote with GNU General Public License v2.0
def duosuo(self):
        if not self.duoshuo_shortname:
            return """
            """
        else:
            return """
            <!-- Duoshuo Comment BEGIN -->
            <div class="ds-thread"></div>
            <script type="text/javascript">
            var duoshuoQuery = {short_name:"%s"};
            (function() {
            var ds = document.createElement('script');
            ds.type = 'text/javascript';ds.async = true;
            ds.src = 'http://static.duoshuo.com/embed.js';
            ds.charset = 'UTF-8';
            (document.getElementsByTagName('head')[0]
            || document.getElementsByTagName('body')[0]).appendChild(ds);
            })();
            </script>
            <!-- Duoshuo Comment END -->
            """ % self.duoshuo_shortname 
Example #5
Source File: owasp_suite.py    From adapt with Apache License 2.0
def find_comments_in_html_by_urls(self, urls):
		res = []
		for url in urls:
			path = urlparse(url).path
			host = urlparse(url).hostname
			# the request line needs a literal "HTTP/1.1", not the URL scheme
			req = "GET {0} HTTP/1.1\r\nhost: {1}\r\n\r\n".format(path, host)
			try:
				r = self.zap.send_request(req)
				html = str(r['responseBody'])
			except Exception as e:
				r = requests.get(url)
				html = r.text
			if html:
				soup = BeautifulSoup(html, 'html.parser')
				comments = soup.findAll(text=lambda text: isinstance(text, Comment))
				comment_list = [str(comment) for comment in comments]
				# append one result per URL rather than one per comment
				c = {"method": "GET", "url": url, "resp": r.text, "request": "GET " + url, "data": comment_list}
				res.append(c)
		return res 
Example #6
Source File: standings.py    From pybaseball with MIT License
def standings(season=None):
    # get most recent standings if date not specified
    if(season is None):
        season = int(datetime.datetime.today().strftime("%Y"))
    if season < 1871:
        raise ValueError("This query currently only returns standings dating back to the 1871 season; try a year from 1871 to the present.")
    # retrieve html from baseball reference
    soup = get_soup(season)
    if season>=1969:
        tables = get_tables(soup, season)
    else:
        t = soup.find_all(string=lambda text:isinstance(text,Comment))
        # list of seasons whose table placement breaks the site's usual pattern
        exceptions = [1884, 1885, 1886, 1887, 1888, 1889, 1890, 1892, 1903]
        if (season>1904 or season in exceptions): code = BeautifulSoup(t[16], "lxml")
        elif season<=1904: code = BeautifulSoup(t[15], "lxml")
        tables = get_tables(code, season)
    tables = [pd.DataFrame(table) for table in tables]
    for idx in range(len(tables)):
        tables[idx] = tables[idx].rename(columns=tables[idx].iloc[0])
        tables[idx] = tables[idx].reindex(tables[idx].index.drop(0))
    return tables 
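The Comment round-trip here exists because baseball-reference serves the pre-1969 standings tables inside HTML comments; re-parsing a comment's text with BeautifulSoup recovers the hidden table. Roughly, with hypothetical markup:

from bs4 import BeautifulSoup, Comment

html = '<div><!-- <table><tr><td>1871</td></tr></table> --></div>'
hidden = BeautifulSoup(html, "lxml").find(string=lambda t: isinstance(t, Comment))
table = BeautifulSoup(hidden, "lxml").table  # the commented-out table, parsed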
Example #7
Source File: dactyl_style_checker.py    From dactyl with MIT License
def get_overrides(self, soup):
        """
        Look for overrides in the text to make exceptions for specific style
        rules. Returns a set of rule strings to ignore for this block.
        """

        overrides = set()
        comments = soup.find_all(string=lambda text:isinstance(text,Comment))
        for comment in comments:
            m = re.match(OVERRIDE_COMMENT_REGEX, comment)
            if m:
                new_overrides = m.group(1).split(",")
                new_overrides = {o.strip() for o in new_overrides}
                logger.info("Overrides found: %s" % new_overrides)
                overrides |= new_overrides

        return overrides 
Example #8
Source File: importer.py    From incremental-reading with ISC License
def _fetchWebpage(self, url):
        if isMac:
            context = _create_unverified_context()
            html = urlopen(url, context=context).read()
        else:
            headers = {'User-Agent': self.settings['userAgent']}
            html = get(url, headers=headers).content

        webpage = BeautifulSoup(html, 'html.parser')

        for tagName in self.settings['badTags']:
            for tag in webpage.find_all(tagName):
                tag.decompose()

        for c in webpage.find_all(text=lambda s: isinstance(s, Comment)):
            c.extract()

        return webpage 
Example #9
Source File: __init__.py    From uoft-scrapers with MIT License
def normalize_text_sections(div):
        paragraph = ''
        for content in div.contents:
            text = ''
            if type(content) == NavigableString:
                text = content
            elif type(content) == Comment:
                pass
            elif content.name == 'li':
                text = content.text
            else:
                text = content.text
            text = text.strip()
            paragraph += text.strip() + ' '
        paragraph = paragraph.strip()
        paragraph = paragraph.replace('\r', '')
        paragraph = paragraph.replace('\n', ', ')
        paragraph = paragraph.strip()
        return paragraph 
Example #10
Source File: __init__.py    From uoft-scrapers with MIT License
def normalize_text_sections(div):
        paragraph = ''
        for content in div.contents:
            text = ''
            if type(content) == NavigableString:
                text = content
            elif type(content) == Comment:
                pass
            elif content.name == 'li':
                text = content.text
            else:
                text = content.text
            text = text.strip()
            paragraph += text.strip() + ' '
        paragraph = paragraph.strip()
        paragraph = paragraph.replace('\r', '')
        paragraph = paragraph.replace('\n', ', ')
        paragraph = paragraph.replace('  ', ' ')
        paragraph = paragraph.strip()
        return paragraph 
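The only difference from Example #9's version is the extra replace('  ', ' '). It matters because the loop appends a trailing space even for skipped Comment nodes, which leaves double spaces behind. A quick sketch with illustrative input:

from bs4 import BeautifulSoup

div = BeautifulSoup("<div>Hello<!-- c --><li>world</li></div>", "html.parser").div
print(normalize_text_sections(div))  # "Hello world" (the comment's stray space is collapsed)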
Example #11
Source File: doc_utils.py    From axcell with Apache License 2.0
def transform(el):
    if isinstance(el, Tag):
#        for f in _transforms_el:
#            r = f(el)
#            if r is not None:
#                return transform(r)
        return el.get_text()
    elif not isinstance(el, Comment):
        return str(el)
    return '' 
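A rough illustration of how transform() flattens mixed children (hypothetical markup; Tag and Comment come from bs4):

from bs4 import BeautifulSoup

soup = BeautifulSoup("<div>plain<b>bold</b><!-- note --></div>", "html.parser")
print([transform(el) for el in soup.div.contents])
# ['plain', 'bold', ''] ; the comment collapses to an empty string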
Example #12
Source File: test_hints_html.py    From qutebrowser with GNU General Public License v3.0
def _parse_file(test_name):
    """Parse the given HTML file."""
    file_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                             'data', 'hints', 'html', test_name)
    with open(file_path, 'r', encoding='utf-8') as html:
        soup = bs4.BeautifulSoup(html, 'html.parser')

    # find() returns None when there is no comment, so check before
    # converting to a string for yaml_load()
    comment = soup.find(text=lambda text: isinstance(text, bs4.Comment))

    if comment is None:
        raise InvalidFile(test_name, "no comment found")

    data = utils.yaml_load(str(comment))

    if not isinstance(data, dict):
        raise InvalidFile(test_name, "expected yaml dict but got {}".format(
            type(data).__name__))

    allowed_keys = {'target', 'qtwebengine_todo'}
    if not set(data.keys()).issubset(allowed_keys):
        raise InvalidFile(test_name, "expected keys {} but found {}".format(
            ', '.join(allowed_keys),
            ', '.join(set(data.keys()))))

    if 'target' not in data:
        raise InvalidFile(test_name, "'target' key not found")

    qtwebengine_todo = data.get('qtwebengine_todo', None)

    return ParsedFile(target=data['target'], qtwebengine_todo=qtwebengine_todo) 
Example #13
Source File: doc_utils.py    From axcell with Apache License 2.0
def content_in_section(header, names=['h2', 'h3'], skip_comments=True):
    for el in header.next_siblings:
        if getattr(el, 'name', '') in names:
            break
        if skip_comments and isinstance(el, Comment):
            continue
        yield el 
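For example, this would gather everything between the first h2 and the next h2/h3, skipping comments along the way (illustrative markup):

from bs4 import BeautifulSoup

soup = BeautifulSoup(
    "<h2>Intro</h2><p>one</p><!-- skip --><p>two</p><h2>Next</h2>",
    "html.parser")
section = list(content_in_section(soup.h2))
# [<p>one</p>, <p>two</p>] ; the comment is skipped and iteration
# stops at the next <h2>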
Example #14
Source File: blog_parser.py    From QzoneExporter with GNU General Public License v3.0
def export(self):
        with open(self._blog_filename, "w", encoding="utf-8") as f:

            self._bs_obj.title.string = self._blog_info.title

            # remove script and style tags
            delete_labels = ["script", "style"]
            for delete_label in delete_labels:
                for t in self._bs_obj.find_all(delete_label):
                    if filter_blog_script(t.text):
                        continue
                    t.extract()

            # remove comments
            for comment in self._bs_obj.find_all(text=lambda text: isinstance(text, Comment)):
                comment.extract()

            pubtime = self._bs_obj.find("span", {"id": "pubTime"})
            pubtime.string = time.strftime(
                "%Y-%m-%d %H:%M:%S", time.localtime(self._blog_info.blog_id))

            readnum = self._bs_obj.find("span", {"id": "readNum"})
            readnum.string = "阅读(%d)\t评论(%d)" % (
                self._read, self._blog_info.comment_num)

            f.write(self._bs_obj.prettify()) 
Example #15
Source File: thu_learn.py    From thu_learn with MIT License
def files(self):
        """
        get all files in course
        :return: File generator
        """

        def file_size_M(s):
            digitals = s[:-1]
            if s.endswith('K'):
                return float(digitals) / 1024
            elif s.endswith('M'):
                return float(digitals)
            else:
                return 1024 * float(digitals)

        url = _PREF_FILES + self.id
        soup = make_soup(url)
        for j in soup.find_all('tr', class_=['tr1', 'tr2']):
            name = re.search(r'getfilelink=([^&]+)&', str(j.find(text=lambda text: isinstance(text, Comment)))).group(1)
            a = j.find('a')
            url = 'http://learn.tsinghua.edu.cn/kejian/data/%s/download/%s' % (self.id, name)
            title = re.sub(r'[\n\r\t ]', '', a.contents[0])
            name = re.sub(r'_[^_]+\.', '.', name)
            size = file_size_M(j.find_all('td')[-3].text)  # unit: MB
            yield File(size=size, name=name, url=url)
Example #16
Source File: css_match.py    From plugin.git.browser with GNU General Public License v3.0
def is_comment(obj):
        """Is comment."""

        import bs4
        return isinstance(obj, bs4.Comment) 
Example #17
Source File: css_match.py    From plugin.git.browser with GNU General Public License v3.0
def is_special_string(obj):
        """Is special string."""

        import bs4
        return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction)) 
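Both helpers lean on the fact that Comment, Declaration, CData and ProcessingInstruction are all subclasses of NavigableString, so a plain string check would also match comments; a small demonstration:

import bs4

node = bs4.BeautifulSoup("<!-- x -->", "html.parser").contents[0]
print(isinstance(node, bs4.NavigableString))  # True
print(isinstance(node, bs4.Comment))          # True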
Example #18
Source File: owasp_suite.py    From adapt with Apache License 2.0
def find_comments_in_html(self, html):
		# sometimes a dict is passed to this function instead of a string
		if not isinstance(html, str):
			return []
		soup = BeautifulSoup(html, 'html.parser')
		comments = soup.findAll(text=lambda text: isinstance(text, Comment))
		return comments 
Example #19
Source File: css_match.py    From bazarr with GNU General Public License v3.0
def is_comment(obj):
        """Is comment."""

        import bs4
        return isinstance(obj, bs4.Comment) 
Example #20
Source File: css_match.py    From bazarr with GNU General Public License v3.0
def is_special_string(obj):
        """Is special string."""

        import bs4
        return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype)) 
Example #21
Source File: css_match.py    From Tautulli with GNU General Public License v3.0
def is_comment(obj):
        """Is comment."""

        import bs4
        return isinstance(obj, bs4.Comment) 
Example #22
Source File: css_match.py    From Tautulli with GNU General Public License v3.0
def is_special_string(obj):
        """Is special string."""

        import bs4
        return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype)) 
Example #23
Source File: yukinovel.py    From lightnovel-crawler with Apache License 2.0
def download_chapter_body(self, chapter):
        '''Download body of a single chapter and return as clean html format.'''
        logger.info('Downloading %s', chapter['url'])
        soup = self.get_soup(chapter['url'])

        contents = soup.select_one('div.entry-content.cl')

        for d in contents.findAll('div'):
            d.decompose()
        # end for

        for comment in contents.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()
        # end for

        paragraphs = contents.findAll('p')
        if paragraphs and paragraphs[0].text.strip().startswith('Bab'):
            chapter['title'] = paragraphs[0].text.strip()
            paragraphs[0].extract()
        # end if

        logger.debug(chapter['title'])

        return str(contents)
    # end def
# end class 
Example #24
Source File: importbasics.py    From SchoolIdolAPI with Apache License 2.0
def remove_all_comments(soup):
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()
    return soup 
Example #25
Source File: whitelist.py    From wagtail with BSD 3-Clause "New" or "Revised" License
def clean_string_node(self, doc, node):
        # Remove comments
        if isinstance(node, Comment):
            node.extract()
            return

        # by default, nothing needs to be done to whitelist string nodes
        pass 
Example #26
Source File: amazon_invoice_sanitize.py    From beancount-import with GNU General Public License v2.0
def sanitize_invoice(input_path: str, output_path: str,
                     credit_card_digits: str):
    with open(input_path, 'rb') as fb:
        soup = bs4.BeautifulSoup(fb.read(), 'lxml')
    comments = soup.find_all(text=lambda text: isinstance(text, bs4.Comment))
    remove_tag(soup, 'script')
    remove_tag(soup, 'style')
    remove_tag(soup, 'link')
    remove_tag(soup, 'noscript')
    remove_tag(soup, 'img')
    remove_tag(soup, 'input')
    for x in soup.find_all('a'):
        if 'href' in x.attrs and '/dp/' not in x.attrs['href']:
            del x['href']
    for x in comments:
        x.extract()

    new_output, order_id_replacements = sanitize_order_ids(str(soup))
    # new_output = sanitize_other_ids(new_output)
    new_output = sanitize_credit_card(new_output, credit_card_digits)
    new_output = sanitize_address(new_output)
    if os.path.isdir(output_path):
        output_name, _ = sanitize_order_ids(
            os.path.basename(input_path), order_id_replacements)
        output_path = os.path.join(output_path, output_name)
    with open(output_path, 'w', encoding='utf-8', newline='\n') as f:
        f.write(new_output) 
Example #27
Source File: core_parser.py    From getbook with GNU Affero General Public License v3.0
def clean_soup(dom):
    # clean comments
    for el in dom.find_all(text=lambda text: isinstance(text, Comment)):
        el.extract()

    for el in dom.find_all(KILL_TAGS):
        el.extract() 
Example #28
Source File: css_match.py    From soupsieve with MIT License
def is_special_string(obj):
        """Is special string."""
        return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))