Python Examples of bs4.Doctype

Source File: html.py From PyBloqs with GNU Lesser General Public License v2.1

6 votes

def root(tag_name="html", doctype=None, **kwargs):
    """
    Build an empty soup and attach a single root element to it.

    :param tag_name: Name of the root element to create.
    :param doctype: Optional doctype string; when given, a ``bs4.Doctype``
        node is appended to the soup ahead of the root element.
    :param kwargs: Extra keyword arguments forwarded to ``soup.new_tag()``.
    :return: The newly created root tag. The owning soup is reachable
        through the tag's ``soup`` attribute.
    """
    document = parse("")

    if doctype is not None:
        declaration = bs4.Doctype(doctype)
        document.append(declaration)

    root_tag = document.new_tag(tag_name, **kwargs)
    # Keep a back-reference so callers holding only the tag can reach the soup.
    root_tag.soup = document
    document.append(root_tag)
    return root_tag

Source File: HtmlProcessor.py From ReadableWebProxy with BSD 3-Clause "New" or "Revised" License

5 votes

def cleanHtmlPage(self, soup, url=None):
		"""
		Reduce a parsed page to a title and an embeddable HTML fragment.

		The soup is first passed through ``self.relink()``; the title is then
		obtained from ``self.extractTitle()`` and every string configured in
		``self.stripTitle`` is removed from it. The ``<head>`` element, the
		``<body>``/``<html>`` wrappers and any top-level doctype nodes are
		removed from the soup before it is serialised.

		:param soup: Parsed document to clean. It is modified in place.
		:param url: Optional page URL, forwarded to ``self.extractTitle()``.
		:return: Tuple ``(title, contents)`` where ``contents`` is the
			prettified markup with every entry of
			``common.global_constants.GLOBAL_INLINE_BULLSHIT`` removed.
		"""

		soup = self.relink(soup)

		title = self.extractTitle(soup, url)

		# `stripTitle` may be either a single string or a collection of strings.
		if isinstance(self.stripTitle, (list, set)):
			for stripTitle in self.stripTitle:
				title = title.replace(stripTitle, "")
		else:
			title = title.replace(self.stripTitle, "")

		title = title.strip()

		if soup.head:
			soup.head.decompose()

		# Since the content we're extracting will be embedded into another page, we want to
		# strip out the <body> and <html> tags. `unwrap()` replaces the tag it's called on
		# with that tag's contents. We end up with just the contents of the <body> tag.
		while soup.body:
			soup.body.unwrap()

		while soup.html:
			soup.html.unwrap()

		# Iterate over a snapshot: extract() removes the node from the soup's
		# child list, and mutating that list while iterating it directly would
		# skip the sibling that follows each removed doctype.
		for item in list(soup.children):
			if isinstance(item, bs4.Doctype):
				item.extract()

		contents = soup.prettify()

		for item in common.global_constants.GLOBAL_INLINE_BULLSHIT:
			contents = contents.replace(item, "")

		return title, contents

Source File: css_match.py From soupsieve with MIT License

5 votes

def is_special_string(obj):
        """Report whether `obj` is one of Beautiful Soup's special string node types."""
        special_types = (
            bs4.Comment,
            bs4.Declaration,
            bs4.CData,
            bs4.ProcessingInstruction,
            bs4.Doctype,
        )
        return isinstance(obj, special_types)

Source File: css_match.py From bazarr with GNU General Public License v3.0

5 votes

def is_special_string(obj):
        """Report whether `obj` is one of Beautiful Soup's special string node types."""

        # Imported at call time, as in the original, rather than at module load.
        import bs4

        special_types = (
            bs4.Comment,
            bs4.Declaration,
            bs4.CData,
            bs4.ProcessingInstruction,
            bs4.Doctype,
        )
        return isinstance(obj, special_types)

Source File: css_match.py From Tautulli with GNU General Public License v3.0

5 votes

def is_special_string(obj):
        """Tell whether `obj` is a comment, declaration, CDATA, processing instruction or doctype node."""

        # Imported at call time, as in the original, rather than at module load.
        import bs4

        special_types = (
            bs4.Comment,
            bs4.Declaration,
            bs4.CData,
            bs4.ProcessingInstruction,
            bs4.Doctype,
        )
        return isinstance(obj, special_types)

Source File: parse_html_deps.py From Jandroid with BSD 3-Clause "New" or "Revised" License

4 votes

def GenerateHTML(self, controller, minify=False, prettify=False):
    """Serialise this module's HTML with imports, scripts and styles processed.

    Works on a fresh soup built from ``self._soup`` via
    ``_CreateSoupWithoutHeadOrBody``; ``self._soup`` itself is not modified
    here.

    Args:
      controller: Object providing ``GetHTMLForInlineStylesheet(text)`` and
          ``GetHTMLForStylesheetHRef(href)``. A falsy return value from
          either causes the corresponding element to be removed.
      minify: When true, comment nodes are removed from the output.
      prettify: When true, return ``soup.prettify('utf-8')`` stripped of
          surrounding whitespace instead of the plain unicode rendering.

    Returns:
      The processed markup, stripped of leading/trailing whitespace.
    """
    soup = _CreateSoupWithoutHeadOrBody(unicode(self._soup))

    # Remove doctype and declaration nodes. Iterate over a snapshot because
    # extract() mutates soup.contents; iterating the live list would skip the
    # node that follows each removed one.
    for x in list(soup.contents):
      if isinstance(x, (bs4.Doctype, bs4.Declaration)):
        x.extract()

    # Remove all imports.
    imports = soup.findAll('link', rel='import')
    for imp in imports:
      imp.extract()

    # Remove all script links.
    scripts_external = soup.findAll('script', src=True)
    for script in scripts_external:
      script.extract()

    # Remove all in-line scripts.
    scripts_inline = soup.findAll('script', src=None)
    for script in scripts_inline:
      script.extract()

    # Process all in-line styles.
    inline_styles = soup.findAll('style')
    for style in inline_styles:
      html = controller.GetHTMLForInlineStylesheet(unicode(style.string))
      if html:
        ns = soup.new_tag('style')
        ns.append(bs4.NavigableString(html))
        style.replaceWith(ns)
      else:
        style.extract()

    # Rewrite all external stylesheet hrefs or remove, as needed.
    stylesheet_links = soup.findAll('link', rel='stylesheet')
    for stylesheet_link in stylesheet_links:
      html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
      if html:
        # The controller is expected to hand back exactly one <style> element.
        tmp = bs4.BeautifulSoup(html, 'html5lib').findAll('style')
        assert len(tmp) == 1
        stylesheet_link.replaceWith(tmp[0])
      else:
        stylesheet_link.extract()

    # Remove comments if minifying.
    if minify:
      comments = soup.findAll(
          text=lambda text: isinstance(text, bs4.Comment))
      for comment in comments:
        comment.extract()
    if prettify:
      return soup.prettify('utf-8').strip()

    # We are done.
    return unicode(soup).strip()

Source File: parse_html_deps.py From Jandroid with BSD 3-Clause "New" or "Revised" License

4 votes

def GenerateHTML(self, controller, minify=False, prettify=False):
    """Serialise this module's HTML with imports, scripts and styles processed.

    Works on a fresh soup built from ``self._soup`` via
    ``_CreateSoupWithoutHeadOrBody``; ``self._soup`` itself is not modified
    here.

    Args:
      controller: Object providing ``GetHTMLForInlineStylesheet(text)`` and
          ``GetHTMLForStylesheetHRef(href)``. A falsy return value from
          either causes the corresponding element to be removed.
      minify: When true, comment nodes are removed from the output.
      prettify: When true, return ``soup.prettify('utf-8')`` stripped of
          surrounding whitespace instead of the plain unicode rendering.

    Returns:
      The processed markup, stripped of leading/trailing whitespace.
    """
    soup = _CreateSoupWithoutHeadOrBody(unicode(self._soup))

    # Remove doctype and declaration nodes. Iterate over a snapshot because
    # extract() mutates soup.contents; iterating the live list would skip the
    # node that follows each removed one.
    for x in list(soup.contents):
      if isinstance(x, (bs4.Doctype, bs4.Declaration)):
        x.extract()

    # Remove all imports.
    imports = soup.findAll('link', rel='import')
    for imp in imports:
      imp.extract()

    # Remove all script links.
    scripts_external = soup.findAll('script', src=True)
    for script in scripts_external:
      script.extract()

    # Remove all in-line scripts.
    scripts_inline = soup.findAll('script', src=None)
    for script in scripts_inline:
      script.extract()

    # Process all in-line styles.
    inline_styles = soup.findAll('style')
    for style in inline_styles:
      html = controller.GetHTMLForInlineStylesheet(unicode(style.string))
      if html:
        ns = soup.new_tag('style')
        ns.append(bs4.NavigableString(html))
        style.replaceWith(ns)
      else:
        style.extract()

    # Rewrite all external stylesheet hrefs or remove, as needed.
    stylesheet_links = soup.findAll('link', rel='stylesheet')
    for stylesheet_link in stylesheet_links:
      html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
      if html:
        # The controller is expected to hand back exactly one <style> element.
        tmp = bs4.BeautifulSoup(html, 'html5lib').findAll('style')
        assert len(tmp) == 1
        stylesheet_link.replaceWith(tmp[0])
      else:
        stylesheet_link.extract()

    # Remove comments if minifying.
    if minify:
      comments = soup.findAll(
          text=lambda text: isinstance(text, bs4.Comment))
      for comment in comments:
        comment.extract()
    if prettify:
      return soup.prettify('utf-8').strip()

    # We are done.
    return unicode(soup).strip()

Source File: parse_html_deps.py From Jandroid with BSD 3-Clause "New" or "Revised" License

4 votes

def GenerateHTML(self, controller, minify=False, prettify=False):
    """Serialise this module's HTML with imports, scripts and styles processed.

    Works on a fresh soup built from ``self._soup`` via
    ``_CreateSoupWithoutHeadOrBody``; ``self._soup`` itself is not modified
    here.

    Args:
      controller: Object providing ``GetHTMLForInlineStylesheet(text)`` and
          ``GetHTMLForStylesheetHRef(href)``. A falsy return value from
          either causes the corresponding element to be removed.
      minify: When true, comment nodes are removed from the output.
      prettify: When true, return ``soup.prettify('utf-8')`` stripped of
          surrounding whitespace instead of the plain unicode rendering.

    Returns:
      The processed markup, stripped of leading/trailing whitespace.
    """
    soup = _CreateSoupWithoutHeadOrBody(unicode(self._soup))

    # Remove doctype and declaration nodes. Iterate over a snapshot because
    # extract() mutates soup.contents; iterating the live list would skip the
    # node that follows each removed one.
    for x in list(soup.contents):
      if isinstance(x, (bs4.Doctype, bs4.Declaration)):
        x.extract()

    # Remove all imports.
    imports = soup.findAll('link', rel='import')
    for imp in imports:
      imp.extract()

    # Remove all script links.
    scripts_external = soup.findAll('script', src=True)
    for script in scripts_external:
      script.extract()

    # Remove all in-line scripts.
    scripts_inline = soup.findAll('script', src=None)
    for script in scripts_inline:
      script.extract()

    # Process all in-line styles.
    inline_styles = soup.findAll('style')
    for style in inline_styles:
      html = controller.GetHTMLForInlineStylesheet(unicode(style.string))
      if html:
        ns = soup.new_tag('style')
        ns.append(bs4.NavigableString(html))
        style.replaceWith(ns)
      else:
        style.extract()

    # Rewrite all external stylesheet hrefs or remove, as needed.
    stylesheet_links = soup.findAll('link', rel='stylesheet')
    for stylesheet_link in stylesheet_links:
      html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
      if html:
        # The controller is expected to hand back exactly one <style> element.
        tmp = bs4.BeautifulSoup(html, 'html5lib').findAll('style')
        assert len(tmp) == 1
        stylesheet_link.replaceWith(tmp[0])
      else:
        stylesheet_link.extract()

    # Remove comments if minifying.
    if minify:
      comments = soup.findAll(
          text=lambda text: isinstance(text, bs4.Comment))
      for comment in comments:
        comment.extract()
    if prettify:
      return soup.prettify('utf-8').strip()

    # We are done.
    return unicode(soup).strip()

Source File: css_match.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International

4 votes

def is_special_string(obj):
        """Check `obj` against the Beautiful Soup node classes treated as special strings."""

        # Imported at call time, as in the original, rather than at module load.
        import bs4

        special_types = (
            bs4.Comment,
            bs4.Declaration,
            bs4.CData,
            bs4.ProcessingInstruction,
            bs4.Doctype,
        )
        return isinstance(obj, special_types)

Python bs4.Doctype() Examples