Python html5lib.parseFragment() Examples
The following are 12 code examples of html5lib.parseFragment().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module html5lib, or try the search function.
Example #1
Source File: sanitizer.py From bazarr with GNU General Public License v3.0 | 6 votes |
def runtest(self):
    """Sanitize the test's input fragment and compare it to the expected output.

    Reads ``self.test["input"]`` and ``self.test["output"]``, parses the
    input with ``parseFragment``, serializes it with ``sanitize=True`` and
    asserts that the serialized text equals the expected text. On failure
    the assertion message lists the input, expected and received strings.
    """
    source = self.test["input"]
    wanted = self.test["output"]

    # Serializer settings are fixed so that the output is deterministic.
    serializer_options = dict(
        sanitize=True,
        omit_optional_tags=False,
        use_trailing_solidus=True,
        space_before_trailing_solidus=False,
        quote_attr_values="always",
        quote_char="'",
        alphabetical_attributes=True,
    )
    produced = serialize(parseFragment(source), **serializer_options)

    report_lines = [
        "\n\nInput:", source,
        "\nExpected:", wanted,
        "\nReceived:", produced,
    ]
    report = "\n".join(report_lines)
    assert wanted == produced, report
Example #2
Source File: chunk.py From budou with Apache License 2.0 | 5 votes |
def span_serialize(self, attributes, max_length=None):
    """Returns concatenated HTML code with SPAN tag.

    Every chunk for which ``has_cjk()`` is true (and whose word is not longer
    than ``max_length``, when one is given) is wrapped in its own SPAN
    element carrying ``attributes``. All other chunks are appended as plain
    text. The assembled markup is then re-parsed and serialized by html5lib
    with sanitization enabled.

    Args:
      attributes (dict): A map of name-value pairs for attributes of output
          SPAN tags.
      max_length (int, optional): Maximum length of span enclosed chunk.

    Returns:
      The organized HTML code. (str)
    """
    doc = ET.Element('span')
    for chunk in self:
        too_long = max_length and len(chunk.word) > max_length
        if chunk.has_cjk() and not too_long:
            ele = ET.Element('span')
            ele.text = chunk.word
            for key, val in attributes.items():
                ele.attrib[key] = val
            doc.append(ele)
        elif len(doc):
            # Plain text that follows an element belongs in that element's
            # tail. Children are reached through len()/indexing because
            # Element.getchildren() is not available on Python 3.9+.
            last = doc[-1]
            if last.tail is None:
                last.tail = chunk.word
            else:
                last.tail += chunk.word
        else:
            # No child element yet: the text goes directly into the root.
            if doc.text is None:
                doc.text = chunk.word
            else:
                doc.text += chunk.word
    result = ET.tostring(doc, encoding='utf-8').decode('utf-8')
    result = html5lib.serialize(
        html5lib.parseFragment(result), sanitize=True,
        quote_attr_values='always')
    return result
Example #3
Source File: chunk.py From budou with Apache License 2.0 | 5 votes |
def wbr_serialize(self):
    """Returns concatenated HTML code with WBR tag. This is still experimental.

    Builds a SPAN with ``style="word-break: keep-all"``. A WBR element is
    inserted before each chunk for which ``has_cjk()`` is true, once the
    root already has leading text. The result is rendered through the
    html5lib sanitizer filter, extended to allow the ``wbr`` element and the
    ``word-break`` CSS property.

    Returns:
      The organized HTML code. (str)
    """
    doc = ET.Element('span')
    doc.attrib['style'] = 'word-break: keep-all'
    for chunk in self:
        if chunk.has_cjk() and doc.text:
            # The word follows the break opportunity, so it is stored as the
            # tail of the freshly appended WBR element.
            ele = ET.Element('wbr')
            ele.tail = chunk.word
            doc.append(ele)
        elif len(doc):
            # Children are reached through len()/indexing because
            # Element.getchildren() is not available on Python 3.9+.
            last = doc[-1]
            if last.tail is None:
                last.tail = chunk.word
            else:
                last.tail += chunk.word
        else:
            # No child element yet: the text goes directly into the root.
            if doc.text is None:
                doc.text = chunk.word
            else:
                doc.text += chunk.word
    content = ET.tostring(doc, encoding='utf-8').decode('utf-8')
    dom = html5lib.parseFragment(content)
    treewalker = getTreeWalker('etree')
    stream = treewalker(dom)
    serializer = html5lib.serializer.HTMLSerializer(
        quote_attr_values='always')
    # Start from the sanitizer's allow-lists and add what this markup needs.
    allowed_elements = set(sanitizer.allowed_elements)
    allowed_elements.add((namespaces['html'], 'wbr'))
    allowed_css_properties = set(sanitizer.allowed_css_properties)
    allowed_css_properties.add('word-break')
    result = serializer.render(sanitizer.Filter(
        stream,
        allowed_elements=allowed_elements,
        allowed_css_properties=allowed_css_properties,
    ))
    return result
Example #4
Source File: parser.py From budou with Apache License 2.0 | 5 votes |
def preprocess(source):
    """Removes unnecessary line breaks and white spaces.

    The input is parsed as an HTML fragment and reduced to its text content
    before newlines are dropped and whitespace runs are collapsed.

    Args:
      source (str): Input sentence.

    Returns:
      Preprocessed sentence. (str)
    """
    fragment = html5lib.parseFragment(source)
    raw_text = ET.tostring(fragment, encoding='utf-8', method='text')
    flattened = raw_text.decode('utf-8').replace(u'\n', u'').strip()
    # Collapse any run of two or more whitespace characters to one space.
    return re.sub(r'\s\s+', u' ', flattened)
Example #5
Source File: ckeditor_tags.py From adhocracy4 with GNU Affero General Public License v3.0 | 5 votes |
def transform_collapsibles(text):
    """Find simple collapsible elements and transform them to full html.

    Each top-level ``div.collapsible-item`` that contains both a title and a
    body div is replaced by the rendered
    ``a4ckeditor/collapsible_fragment.html`` template. Items missing either
    part are left untouched. Returns the serialized tree.
    """
    root = parseFragment(text, container='div', treebuilder='etree',
                         namespaceHTMLElements=False)

    # Digits of the current timestamp make the generated ids distinct.
    stamp = ''.join(ch for ch in str(time.time()) if ch.isdigit())

    items = root.findall('./div[@class="collapsible-item"]')
    for index, item in enumerate(items):
        heading = item.find('./div[@class="collapsible-item-title"]')
        content = item.find('./div[@class="collapsible-item-body"]')
        if heading is None or content is None:
            continue

        # Strip the marker classes before the parts are serialized.
        heading.tag = 'span'
        del heading.attrib['class']
        content.tag = 'div'
        del content.attrib['class']

        context = dict(
            id='a4ckeditor-collapsible-{}_{}'.format(stamp, index),
            title=serialize(heading),
            body=serialize(content),
        )
        rendered = render_to_string(
            'a4ckeditor/collapsible_fragment.html', context)

        # Swap the item's old contents for the rendered fragment.
        item.clear()
        item.append(parseFragment(rendered, treebuilder='etree',
                                  namespaceHTMLElements=False))

    return serialize(root)
Example #6
Source File: test_utils.py From allura with Apache License 2.0 | 5 votes |
def walker_from_text(self, text):
    """Parse ``text`` as an HTML fragment and return an etree tree walker over it."""
    fragment = html5lib.parseFragment(text)
    walker_cls = html5lib.treewalkers.getTreeWalker("etree")
    return walker_cls(fragment)
Example #7
Source File: markdown_extensions.py From allura with Apache License 2.0 | 5 votes |
def run(self, text):
    """Parse ``text`` as an HTML fragment and render it through the Forge sanitizer."""
    fragment = html5lib.parseFragment(text)
    # html5lib.serialize(fragment, sanitize=True) would cover all of this in
    # one call, but only with the standard sanitizer. The same steps are
    # spelled out here so that ForgeHTMLSanitizerFilter can be used instead.
    walker_cls = html5lib.treewalkers.getTreeWalker("etree")
    token_stream = walker_cls(fragment)
    sanitized_stream = ForgeHTMLSanitizerFilter(token_stream)  # the custom step
    return html5lib.serializer.HTMLSerializer().render(sanitized_stream)
Example #8
Source File: test_parser2.py From bazarr with GNU General Public License v3.0 | 5 votes |
def test_no_duplicate_clone():
    """The mis-nested fragment below must parse to a fragment of length 2."""
    markup = "<b><em><foo><foob><fooc><aside></b></em>"
    fragment = parseFragment(markup)
    assert len(fragment) == 2
Example #9
Source File: test_parser2.py From bazarr with GNU General Public License v3.0 | 5 votes |
def test_self_closing_col():
    """Parsing a self-closing ``<col />`` inside a colgroup must record no errors."""
    markup = '<table><colgroup><col /></colgroup></table>'
    html_parser = HTMLParser()
    html_parser.parseFragment(markup)
    assert not html_parser.errors
Example #10
Source File: test_sanitizer.py From bazarr with GNU General Public License v3.0 | 5 votes |
def runSanitizerTest(_, expected, input):
    """Assert that sanitizing ``input`` yields the normalized form of ``expected``.

    ``expected`` is round-tripped through ``parseFragment``/``serialize``
    (without sanitizing) using the same serializer settings as
    ``sanitize_html``. The first positional argument is ignored.
    """
    serializer_options = dict(
        omit_optional_tags=False,
        use_trailing_solidus=True,
        space_before_trailing_solidus=False,
        quote_attr_values="always",
        quote_char='"',
        alphabetical_attributes=True,
    )
    normalized = serialize(parseFragment(expected), **serializer_options)
    assert normalized == sanitize_html(input)
Example #11
Source File: test_sanitizer.py From bazarr with GNU General Public License v3.0 | 5 votes |
def sanitize_html(stream):
    """Parse ``stream`` as an HTML fragment and return its sanitized serialization."""
    fragment = parseFragment(stream)
    return serialize(
        fragment,
        sanitize=True,
        omit_optional_tags=False,
        use_trailing_solidus=True,
        space_before_trailing_solidus=False,
        quote_attr_values="always",
        quote_char='"',
        alphabetical_attributes=True,
    )
Example #12
Source File: html5lib.py From canvas with BSD 3-Clause "New" or "Revised" License | 5 votes |
def html(self):
    """Parse ``self.content`` into an html5lib fragment tree.

    html5lib is imported lazily here, and the imported module is stored on
    ``self.html5lib``.

    Returns:
        The result of ``html5lib.parseFragment(self.content)``.

    Raises:
        ImproperlyConfigured: If an ImportError is raised while importing
            html5lib or parsing the content.
    """
    try:
        import html5lib
        self.html5lib = html5lib
        return html5lib.parseFragment(self.content)
    # "except X as name" is valid on Python 2.6+ and Python 3; the older
    # "except X, name" form is a syntax error on Python 3.
    except ImportError as err:
        raise ImproperlyConfigured("Error while importing html5lib: %s" % err)