Python bs4.Doctype() Examples
The following are 9 code examples of bs4.Doctype().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module bs4, or try the search function.
Example #1
Source File: html.py From PyBloqs with GNU Lesser General Public License v2.1 | 6 votes |
def root(tag_name="html", doctype=None, **kwargs):
    """
    Build a fresh soup and attach a single root element to it.

    :param tag_name: Name of the root element to create.
    :param doctype: Optional doctype value; when it is not None a
        ``bs4.Doctype`` built from it is appended before the root element.
    :param kwargs: Extra keyword arguments forwarded to ``soup.new_tag()``.
    :return: The newly created root tag. The soup it was appended to is
        stored on the tag's ``soup`` attribute.
    """
    document = parse("")

    if doctype is not None:
        declaration = bs4.Doctype(doctype)
        document.append(declaration)

    root_tag = document.new_tag(tag_name, **kwargs)
    # Keep a back-reference so callers holding only the tag can reach the soup.
    root_tag.soup = document
    document.append(root_tag)
    return root_tag
Example #2
Source File: HtmlProcessor.py From ReadableWebProxy with BSD 3-Clause "New" or "Revised" License | 5 votes |
def cleanHtmlPage(self, soup, url=None):
    """
    Reduce a parsed page to a cleaned title and an embeddable HTML fragment.

    The soup is first passed through ``self.relink`` and the title is
    obtained from ``self.extractTitle``. Every entry of ``self.stripTitle``
    (a single string, or a list/set of strings) is removed from the title.
    The ``<head>`` is then dropped, ``<body>``/``<html>`` wrappers are
    unwrapped, top-level doctype nodes are removed, and every entry of
    ``common.global_constants.GLOBAL_INLINE_BULLSHIT`` is stripped from the
    prettified markup.

    :param soup: Parsed document to clean (modified in place after relinking).
    :param url: Optional URL, passed through to ``self.extractTitle``.
    :return: Tuple ``(title, contents)`` of the stripped title string and
        the cleaned, prettified markup string.
    """
    soup = self.relink(soup)

    title = self.extractTitle(soup, url)
    if isinstance(self.stripTitle, (list, set)):
        for stripTitle in self.stripTitle:
            title = title.replace(stripTitle, "")
    else:
        title = title.replace(self.stripTitle, "")
    title = title.strip()

    if soup.head:
        soup.head.decompose()

    # Since the content we're extracting will be embedded into another page,
    # we want to strip out the <body> and <html> tags. `unwrap()` replaces
    # the tag it's called on with that tag's contents, so we end up with
    # just the contents of the <body> tag.
    while soup.body:
        soup.body.unwrap()

    while soup.html:
        soup.html.unwrap()

    # Iterate over a snapshot: extract() removes the node from the parent's
    # child list, and mutating that list while iterating it directly would
    # skip the sibling that follows each removed doctype.
    for item in list(soup.children):
        if isinstance(item, bs4.Doctype):
            item.extract()

    contents = soup.prettify()
    for item in common.global_constants.GLOBAL_INLINE_BULLSHIT:
        contents = contents.replace(item, "")

    return title, contents
Example #3
Source File: css_match.py From soupsieve with MIT License | 5 votes |
def is_special_string(obj):
    """Report whether `obj` is one of the special bs4 string node types."""
    special_types = (
        bs4.Comment,
        bs4.Declaration,
        bs4.CData,
        bs4.ProcessingInstruction,
        bs4.Doctype,
    )
    return isinstance(obj, special_types)
Example #4
Source File: css_match.py From bazarr with GNU General Public License v3.0 | 5 votes |
def is_special_string(obj):
    """Report whether `obj` is one of the special bs4 string node types."""
    # Imported locally, as in the original, rather than at module level.
    import bs4

    special_types = (
        bs4.Comment,
        bs4.Declaration,
        bs4.CData,
        bs4.ProcessingInstruction,
        bs4.Doctype,
    )
    return isinstance(obj, special_types)
Example #5
Source File: css_match.py From Tautulli with GNU General Public License v3.0 | 5 votes |
def is_special_string(obj):
    """Tell whether `obj` is a comment, declaration, CDATA, PI or doctype node."""
    # Imported locally, as in the original, rather than at module level.
    import bs4

    non_text_kinds = (
        bs4.Comment,
        bs4.Declaration,
        bs4.CData,
        bs4.ProcessingInstruction,
        bs4.Doctype,
    )
    return isinstance(obj, non_text_kinds)
Example #6
Source File: parse_html_deps.py From Jandroid with BSD 3-Clause "New" or "Revised" License | 4 votes |
def GenerateHTML(self, controller, minify=False, prettify=False):
    """Render this module's HTML with declarations, imports and scripts removed.

    A new soup is built from ``self._soup`` via
    ``_CreateSoupWithoutHeadOrBody``. Top-level doctype/declaration nodes,
    ``<link rel="import">`` elements and all ``<script>`` elements are
    removed. Inline ``<style>`` elements and ``<link rel="stylesheet">``
    elements are replaced by whatever the controller returns for them, or
    removed when the controller returns a falsy value.

    Args:
      controller: Object providing ``GetHTMLForInlineStylesheet(text)`` and
          ``GetHTMLForStylesheetHRef(href)``.
      minify: When true, comment nodes are removed as well.
      prettify: When true, return ``soup.prettify('utf-8')`` stripped
          instead of the plain unicode rendering.

    Returns:
      The stripped rendering of the processed soup.
    """
    soup = _CreateSoupWithoutHeadOrBody(unicode(self._soup))

    # Remove doctype and declaration nodes. Iterate over a copy: extract()
    # deletes the node from soup.contents, and mutating that list while
    # iterating it directly skips the node following each removed one.
    for x in list(soup.contents):
        if isinstance(x, (bs4.Doctype, bs4.Declaration)):
            x.extract()

    # Remove all imports.
    imports = soup.findAll('link', rel='import')
    for imp in imports:
        imp.extract()

    # Remove all script links.
    scripts_external = soup.findAll('script', src=True)
    for script in scripts_external:
        script.extract()

    # Remove all in-line scripts.
    scripts_inline = soup.findAll('script', src=None)
    for script in scripts_inline:
        script.extract()

    # Process all in-line styles.
    inline_styles = soup.findAll('style')
    for style in inline_styles:
        html = controller.GetHTMLForInlineStylesheet(unicode(style.string))
        if html:
            ns = soup.new_tag('style')
            ns.append(bs4.NavigableString(html))
            style.replaceWith(ns)
        else:
            style.extract()

    # Rewrite all external stylesheet hrefs or remove, as needed.
    stylesheet_links = soup.findAll('link', rel='stylesheet')
    for stylesheet_link in stylesheet_links:
        html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
        if html:
            # The controller's HTML must contain exactly one <style> element.
            tmp = bs4.BeautifulSoup(html, 'html5lib').findAll('style')
            assert len(tmp) == 1
            stylesheet_link.replaceWith(tmp[0])
        else:
            stylesheet_link.extract()

    # Remove comments if minifying.
    if minify:
        comments = soup.findAll(
            text=lambda text: isinstance(text, bs4.Comment))
        for comment in comments:
            comment.extract()

    if prettify:
        return soup.prettify('utf-8').strip()

    # We are done.
    return unicode(soup).strip()
Example #7
Source File: parse_html_deps.py From Jandroid with BSD 3-Clause "New" or "Revised" License | 4 votes |
def GenerateHTML(self, controller, minify=False, prettify=False):
    """Render this module's HTML with declarations, imports and scripts removed.

    A new soup is built from ``self._soup`` via
    ``_CreateSoupWithoutHeadOrBody``. Top-level doctype/declaration nodes,
    ``<link rel="import">`` elements and all ``<script>`` elements are
    removed. Inline ``<style>`` elements and ``<link rel="stylesheet">``
    elements are replaced by whatever the controller returns for them, or
    removed when the controller returns a falsy value.

    Args:
      controller: Object providing ``GetHTMLForInlineStylesheet(text)`` and
          ``GetHTMLForStylesheetHRef(href)``.
      minify: When true, comment nodes are removed as well.
      prettify: When true, return ``soup.prettify('utf-8')`` stripped
          instead of the plain unicode rendering.

    Returns:
      The stripped rendering of the processed soup.
    """
    soup = _CreateSoupWithoutHeadOrBody(unicode(self._soup))

    # Remove doctype and declaration nodes. Iterate over a copy: extract()
    # deletes the node from soup.contents, and mutating that list while
    # iterating it directly skips the node following each removed one.
    for x in list(soup.contents):
        if isinstance(x, (bs4.Doctype, bs4.Declaration)):
            x.extract()

    # Remove all imports.
    imports = soup.findAll('link', rel='import')
    for imp in imports:
        imp.extract()

    # Remove all script links.
    scripts_external = soup.findAll('script', src=True)
    for script in scripts_external:
        script.extract()

    # Remove all in-line scripts.
    scripts_inline = soup.findAll('script', src=None)
    for script in scripts_inline:
        script.extract()

    # Process all in-line styles.
    inline_styles = soup.findAll('style')
    for style in inline_styles:
        html = controller.GetHTMLForInlineStylesheet(unicode(style.string))
        if html:
            ns = soup.new_tag('style')
            ns.append(bs4.NavigableString(html))
            style.replaceWith(ns)
        else:
            style.extract()

    # Rewrite all external stylesheet hrefs or remove, as needed.
    stylesheet_links = soup.findAll('link', rel='stylesheet')
    for stylesheet_link in stylesheet_links:
        html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
        if html:
            # The controller's HTML must contain exactly one <style> element.
            tmp = bs4.BeautifulSoup(html, 'html5lib').findAll('style')
            assert len(tmp) == 1
            stylesheet_link.replaceWith(tmp[0])
        else:
            stylesheet_link.extract()

    # Remove comments if minifying.
    if minify:
        comments = soup.findAll(
            text=lambda text: isinstance(text, bs4.Comment))
        for comment in comments:
            comment.extract()

    if prettify:
        return soup.prettify('utf-8').strip()

    # We are done.
    return unicode(soup).strip()
Example #8
Source File: parse_html_deps.py From Jandroid with BSD 3-Clause "New" or "Revised" License | 4 votes |
def GenerateHTML(self, controller, minify=False, prettify=False):
    """Render this module's HTML with declarations, imports and scripts removed.

    A new soup is built from ``self._soup`` via
    ``_CreateSoupWithoutHeadOrBody``. Top-level doctype/declaration nodes,
    ``<link rel="import">`` elements and all ``<script>`` elements are
    removed. Inline ``<style>`` elements and ``<link rel="stylesheet">``
    elements are replaced by whatever the controller returns for them, or
    removed when the controller returns a falsy value.

    Args:
      controller: Object providing ``GetHTMLForInlineStylesheet(text)`` and
          ``GetHTMLForStylesheetHRef(href)``.
      minify: When true, comment nodes are removed as well.
      prettify: When true, return ``soup.prettify('utf-8')`` stripped
          instead of the plain unicode rendering.

    Returns:
      The stripped rendering of the processed soup.
    """
    soup = _CreateSoupWithoutHeadOrBody(unicode(self._soup))

    # Remove doctype and declaration nodes. Iterate over a copy: extract()
    # deletes the node from soup.contents, and mutating that list while
    # iterating it directly skips the node following each removed one.
    for x in list(soup.contents):
        if isinstance(x, (bs4.Doctype, bs4.Declaration)):
            x.extract()

    # Remove all imports.
    imports = soup.findAll('link', rel='import')
    for imp in imports:
        imp.extract()

    # Remove all script links.
    scripts_external = soup.findAll('script', src=True)
    for script in scripts_external:
        script.extract()

    # Remove all in-line scripts.
    scripts_inline = soup.findAll('script', src=None)
    for script in scripts_inline:
        script.extract()

    # Process all in-line styles.
    inline_styles = soup.findAll('style')
    for style in inline_styles:
        html = controller.GetHTMLForInlineStylesheet(unicode(style.string))
        if html:
            ns = soup.new_tag('style')
            ns.append(bs4.NavigableString(html))
            style.replaceWith(ns)
        else:
            style.extract()

    # Rewrite all external stylesheet hrefs or remove, as needed.
    stylesheet_links = soup.findAll('link', rel='stylesheet')
    for stylesheet_link in stylesheet_links:
        html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
        if html:
            # The controller's HTML must contain exactly one <style> element.
            tmp = bs4.BeautifulSoup(html, 'html5lib').findAll('style')
            assert len(tmp) == 1
            stylesheet_link.replaceWith(tmp[0])
        else:
            stylesheet_link.extract()

    # Remove comments if minifying.
    if minify:
        comments = soup.findAll(
            text=lambda text: isinstance(text, bs4.Comment))
        for comment in comments:
            comment.extract()

    if prettify:
        return soup.prettify('utf-8').strip()

    # We are done.
    return unicode(soup).strip()
Example #9
Source File: css_match.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 4 votes |
def is_special_string(obj):
    """Check `obj` against the bs4 node classes treated as special strings."""
    # Imported locally, as in the original, rather than at module level.
    import bs4

    special_classes = (
        bs4.Comment,
        bs4.Declaration,
        bs4.CData,
        bs4.ProcessingInstruction,
        bs4.Doctype,
    )
    return isinstance(obj, special_classes)