Python bs4.Doctype() Examples

The following are 9 code examples of bs4.Doctype(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module bs4 , or try the search function .
Example #1
Source File: html.py    From PyBloqs with GNU Lesser General Public License v2.1 6 votes vote down vote up
def root(tag_name="html", doctype=None, **kwargs):
    """
    Creates a new soup with the given root element.

    :param tag_name: Root element tag name.
    :param doctype: Optional doctype tag to add.
    :param kwargs: Optional parameters passed down to soup.new_tag()
    :return: Soup.
    """
    soup = parse("")
    if doctype is not None:
        soup.append(bs4.Doctype(doctype))
    tag = soup.new_tag(tag_name, **kwargs)
    tag.soup = soup
    soup.append(tag)
    return tag 
Example #2
Source File: HtmlProcessor.py    From ReadableWebProxy with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def cleanHtmlPage(self, soup, url=None):

		soup = self.relink(soup)

		title = self.extractTitle(soup, url)


		if isinstance(self.stripTitle, (list, set)):
			for stripTitle in self.stripTitle:
				title = title.replace(stripTitle, "")
		else:
			title = title.replace(self.stripTitle, "")

		title = title.strip()

		if soup.head:
			soup.head.decompose()

		# Since the content we're extracting will be embedded into another page, we want to
		# strip out the <body> and <html> tags. `unwrap()`  replaces the soup with the contents of the
		# tag it's called on. We end up with just the contents of the <body> tag.
		while soup.body:
			# print("Unwrapping body tag")
			soup.body.unwrap()

		while soup.html:
			# print("Unwrapping html tag")
			soup.html.unwrap()

		for item in soup.children:
			if isinstance(item, bs4.Doctype):
				# print("decomposing doctype")
				item.extract()

		contents = soup.prettify()

		for item in common.global_constants.GLOBAL_INLINE_BULLSHIT:
			contents = contents.replace(item, "")

		return title, contents 
Example #3
Source File: css_match.py    From soupsieve with MIT License 5 votes vote down vote up
def is_special_string(obj):
        """Is special string."""
        return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype)) 
Example #4
Source File: css_match.py    From bazarr with GNU General Public License v3.0 5 votes vote down vote up
def is_special_string(obj):
        """Is special string."""

        import bs4
        return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype)) 
Example #5
Source File: css_match.py    From Tautulli with GNU General Public License v3.0 5 votes vote down vote up
def is_special_string(obj):
        """Is special string."""

        import bs4
        return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype)) 
Example #6
Source File: parse_html_deps.py    From Jandroid with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
def GenerateHTML(self, controller, minify=False, prettify=False):
    soup = _CreateSoupWithoutHeadOrBody(unicode(self._soup))

    # Remove declaration.
    for x in soup.contents:
      if isinstance(x, bs4.Doctype):
        x.extract()

    # Remove declaration.
    for x in soup.contents:
      if isinstance(x, bs4.Declaration):
        x.extract()

    # Remove all imports.
    imports = soup.findAll('link', rel='import')
    for imp in imports:
      imp.extract()

    # Remove all script links.
    scripts_external = soup.findAll('script', src=True)
    for script in scripts_external:
      script.extract()

    # Remove all in-line scripts.
    scripts_external = soup.findAll('script', src=None)
    for script in scripts_external:
      script.extract()

    # Process all in-line styles.
    inline_styles = soup.findAll('style')
    for style in inline_styles:
      html = controller.GetHTMLForInlineStylesheet(unicode(style.string))
      if html:
        ns = soup.new_tag('style')
        ns.append(bs4.NavigableString(html))
        style.replaceWith(ns)
      else:
        style.extract()

    # Rewrite all external stylesheet hrefs or remove, as needed.
    stylesheet_links = soup.findAll('link', rel='stylesheet')
    for stylesheet_link in stylesheet_links:
      html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
      if html:
        tmp = bs4.BeautifulSoup(html, 'html5lib').findAll('style')
        assert len(tmp) == 1
        stylesheet_link.replaceWith(tmp[0])
      else:
        stylesheet_link.extract()

    # Remove comments if minifying.
    if minify:
      comments = soup.findAll(
          text=lambda text: isinstance(text, bs4.Comment))
      for comment in comments:
        comment.extract()
    if prettify:
      return soup.prettify('utf-8').strip()

    # We are done.
    return unicode(soup).strip() 
Example #7
Source File: parse_html_deps.py    From Jandroid with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
def GenerateHTML(self, controller, minify=False, prettify=False):
    soup = _CreateSoupWithoutHeadOrBody(unicode(self._soup))

    # Remove declaration.
    for x in soup.contents:
      if isinstance(x, bs4.Doctype):
        x.extract()

    # Remove declaration.
    for x in soup.contents:
      if isinstance(x, bs4.Declaration):
        x.extract()

    # Remove all imports.
    imports = soup.findAll('link', rel='import')
    for imp in imports:
      imp.extract()

    # Remove all script links.
    scripts_external = soup.findAll('script', src=True)
    for script in scripts_external:
      script.extract()

    # Remove all in-line scripts.
    scripts_external = soup.findAll('script', src=None)
    for script in scripts_external:
      script.extract()

    # Process all in-line styles.
    inline_styles = soup.findAll('style')
    for style in inline_styles:
      html = controller.GetHTMLForInlineStylesheet(unicode(style.string))
      if html:
        ns = soup.new_tag('style')
        ns.append(bs4.NavigableString(html))
        style.replaceWith(ns)
      else:
        style.extract()

    # Rewrite all external stylesheet hrefs or remove, as needed.
    stylesheet_links = soup.findAll('link', rel='stylesheet')
    for stylesheet_link in stylesheet_links:
      html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
      if html:
        tmp = bs4.BeautifulSoup(html, 'html5lib').findAll('style')
        assert len(tmp) == 1
        stylesheet_link.replaceWith(tmp[0])
      else:
        stylesheet_link.extract()

    # Remove comments if minifying.
    if minify:
      comments = soup.findAll(
          text=lambda text: isinstance(text, bs4.Comment))
      for comment in comments:
        comment.extract()
    if prettify:
      return soup.prettify('utf-8').strip()

    # We are done.
    return unicode(soup).strip() 
Example #8
Source File: parse_html_deps.py    From Jandroid with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
def GenerateHTML(self, controller, minify=False, prettify=False):
    soup = _CreateSoupWithoutHeadOrBody(unicode(self._soup))

    # Remove declaration.
    for x in soup.contents:
      if isinstance(x, bs4.Doctype):
        x.extract()

    # Remove declaration.
    for x in soup.contents:
      if isinstance(x, bs4.Declaration):
        x.extract()

    # Remove all imports.
    imports = soup.findAll('link', rel='import')
    for imp in imports:
      imp.extract()

    # Remove all script links.
    scripts_external = soup.findAll('script', src=True)
    for script in scripts_external:
      script.extract()

    # Remove all in-line scripts.
    scripts_external = soup.findAll('script', src=None)
    for script in scripts_external:
      script.extract()

    # Process all in-line styles.
    inline_styles = soup.findAll('style')
    for style in inline_styles:
      html = controller.GetHTMLForInlineStylesheet(unicode(style.string))
      if html:
        ns = soup.new_tag('style')
        ns.append(bs4.NavigableString(html))
        style.replaceWith(ns)
      else:
        style.extract()

    # Rewrite all external stylesheet hrefs or remove, as needed.
    stylesheet_links = soup.findAll('link', rel='stylesheet')
    for stylesheet_link in stylesheet_links:
      html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
      if html:
        tmp = bs4.BeautifulSoup(html, 'html5lib').findAll('style')
        assert len(tmp) == 1
        stylesheet_link.replaceWith(tmp[0])
      else:
        stylesheet_link.extract()

    # Remove comments if minifying.
    if minify:
      comments = soup.findAll(
          text=lambda text: isinstance(text, bs4.Comment))
      for comment in comments:
        comment.extract()
    if prettify:
      return soup.prettify('utf-8').strip()

    # We are done.
    return unicode(soup).strip() 
Example #9
Source File: css_match.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International 4 votes vote down vote up
def is_special_string(obj):
        """Is special string."""

        import bs4
        return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))