Python lxml.html.fragment_fromstring() Examples

The following are 12 code examples of lxml.html.fragment_fromstring(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module lxml.html, or try the search function.
Example #1
Source File: diff.py    From aws-lambda-lxml with GNU General Public License v3.0 5 votes vote down vote up
def parse_html(html, cleanup=True):
    """
    Turn an HTML fragment into an lxml element.

    The fragment is parsed with ``create_parent=True``, so the returned
    element is a wrapping <div> that was not present in the input.

    When ``cleanup`` is true the markup is first run through
    ``cleanup_html`` to strip extra structure such as <head> or <body>
    and to remove any <ins> and <del> tags.
    """
    # Only pre-process the markup when the caller asked for it.
    source = cleanup_html(html) if cleanup else html
    return fragment_fromstring(source, create_parent=True)
Example #2
Source File: diff.py    From aws-lambda-lxml with GNU General Public License v3.0 5 votes vote down vote up
def parse_html(html, cleanup=True):
    """
    Turn an HTML fragment into an lxml element.

    The fragment is parsed with ``create_parent=True``, so the returned
    element is a wrapping <div> that was not present in the input.

    When ``cleanup`` is true the markup is first run through
    ``cleanup_html`` to strip extra structure such as <head> or <body>
    and to remove any <ins> and <del> tags.
    """
    # Only pre-process the markup when the caller asked for it.
    source = cleanup_html(html) if cleanup else html
    return fragment_fromstring(source, create_parent=True)
Example #3
Source File: diff.py    From aws-lambda-lxml with GNU General Public License v3.0 5 votes vote down vote up
def parse_html(html, cleanup=True):
    """
    Turn an HTML fragment into an lxml element.

    The fragment is parsed with ``create_parent=True``, so the returned
    element is a wrapping <div> that was not present in the input.

    When ``cleanup`` is true the markup is first run through
    ``cleanup_html`` to strip extra structure such as <head> or <body>
    and to remove any <ins> and <del> tags.
    """
    # Only pre-process the markup when the caller asked for it.
    source = cleanup_html(html) if cleanup else html
    return fragment_fromstring(source, create_parent=True)
Example #4
Source File: diff.py    From aws-lambda-lxml with GNU General Public License v3.0 5 votes vote down vote up
def parse_html(html, cleanup=True):
    """
    Turn an HTML fragment into an lxml element.

    The fragment is parsed with ``create_parent=True``, so the returned
    element is a wrapping <div> that was not present in the input.

    When ``cleanup`` is true the markup is first run through
    ``cleanup_html`` to strip extra structure such as <head> or <body>
    and to remove any <ins> and <del> tags.
    """
    # Only pre-process the markup when the caller asked for it.
    source = cleanup_html(html) if cleanup else html
    return fragment_fromstring(source, create_parent=True)
Example #5
Source File: diff.py    From learn_python3_spider with MIT License 5 votes vote down vote up
def parse_html(html, cleanup=True):
    """
    Turn an HTML fragment into an lxml element.

    The fragment is parsed with ``create_parent=True``, so the returned
    element is a wrapping <div> that was not present in the input.

    When ``cleanup`` is true the markup is first run through
    ``cleanup_html`` to strip extra structure such as <head> or <body>
    and to remove any <ins> and <del> tags.
    """
    # Only pre-process the markup when the caller asked for it.
    source = cleanup_html(html) if cleanup else html
    return fragment_fromstring(source, create_parent=True)
Example #6
Source File: diff.py    From stopstalk-deployment with MIT License 5 votes vote down vote up
def parse_html(html, cleanup=True):
    """
    Turn an HTML fragment into an lxml element.

    The fragment is parsed with ``create_parent=True``, so the returned
    element is a wrapping <div> that was not present in the input.

    When ``cleanup`` is true the markup is first run through
    ``cleanup_html`` to strip extra structure such as <head> or <body>
    and to remove any <ins> and <del> tags.
    """
    # Only pre-process the markup when the caller asked for it.
    source = cleanup_html(html) if cleanup else html
    return fragment_fromstring(source, create_parent=True)
Example #7
Source File: html_to_telegraph.py    From html-telegraph-poster with MIT License 5 votes vote down vote up
def convert_json_to_html(elements):
    """
    Render a sequence of JSON node descriptions as an HTML string.

    Each item of ``elements`` is converted with ``_recursive_convert_json``
    and appended to a temporary <div>. Links are made absolute, <span>
    tags are dropped (via ``drop_tag``), line breaks are rewritten by
    ``replace_line_breaks_except_pre`` and the temporary <div> wrapper is
    sliced off the serialized result.

    :param elements: iterable of items accepted by ``_recursive_convert_json``
    :return: the serialized markup without the surrounding wrapper
    """
    wrapper = html.fragment_fromstring('<div></div>')
    for node in elements:
        wrapper.append(_recursive_convert_json(node))

    # NOTE(review): ``base_url`` is not a parameter here; presumably it is a
    # module-level name -- confirm against the full source file.
    wrapper.make_links_absolute(base_url=base_url)

    for span in wrapper.xpath('.//span'):
        span.drop_tag()

    markup = replace_line_breaks_except_pre(
        html.tostring(wrapper, encoding='unicode'), '<br/>'
    )
    # Cut the leading '<div>' (5 chars) and the trailing '</div>' (6 chars).
    return markup[5:-6]
Example #8
Source File: cleaner.py    From wanish with MIT License 5 votes vote down vote up
def transform_misused_divs_into_paragraphs(self):
    """
    Retag <div>s that hold no block-level markup as <p>, then wrap loose text inside the
    remaining <div>s in freshly created <p> elements and drop their <br> children.
    """
    def _paragraph(text):
        # Build a standalone <p> element carrying the given text.
        para = fragment_fromstring('<p/>')
        para.text = text
        return para

    # First pass: a <div> whose serialized markup does not match the
    # 'divToPElementsRe' pattern (presumably the block-level tags -- confirm
    # against REGEXES) is turned into a <p>.
    # FIXME: The current implementation ignores all descendants that are not direct children of elem
    # This results in incorrect results in case there is an <img> buried within an <a> for example
    for div in self.tags(self._html, 'div'):
        serialized = tostring(div).decode()
        if not REGEXES['divToPElementsRe'].search(serialized):
            div.tag = "p"

    # Second pass over the elements that are still <div>s.
    for div in self.tags(self._html, 'div'):
        leading = div.text
        if leading and leading.strip():
            # Move the text that precedes the first child into its own <p>.
            para = _paragraph(leading)
            div.text = None
            div.insert(0, para)

        # Walk a snapshot of the children from last to first so that the
        # insertions below do not shift the positions still to be visited.
        children = list(div)
        for index in reversed(range(len(children))):
            node = children[index]
            trailing = node.tail
            if trailing and trailing.strip():
                # Move the text that follows this child into a <p> right after it.
                para = _paragraph(trailing)
                node.tail = None
                div.insert(index + 1, para)

            if node.tag == 'br':
                node.drop_tree()
Example #9
Source File: cleaner.py    From wanish with MIT License 5 votes vote down vote up
def initial_output(html_partial=False):
    """
    Build the empty starting element for the output.

    :param html_partial: when true, parse ``<div/>`` as a fragment; otherwise parse it as a document
    :return: the element returned by the chosen parser
    """
    if html_partial:
        return fragment_fromstring('<div/>')
    return document_fromstring('<div/>')
Example #10
Source File: translate.py    From odoo13-x64 with GNU General Public License v3.0 5 votes vote down vote up
def parse_html(text):
    """Parse ``text`` as an HTML fragment using the module's ``_HTML_PARSER``."""
    fragment = html.fragment_fromstring(text, parser=_HTML_PARSER)
    return fragment
Example #11
Source File: diff.py    From lambda-text-extractor with Apache License 2.0 5 votes vote down vote up
def parse_html(html, cleanup=True):
    """
    Turn an HTML fragment into an lxml element.

    The fragment is parsed with ``create_parent=True``, so the returned
    element is a wrapping <div> that was not present in the input.

    When ``cleanup`` is true the markup is first run through
    ``cleanup_html`` to strip extra structure such as <head> or <body>
    and to remove any <ins> and <del> tags.
    """
    # Only pre-process the markup when the caller asked for it.
    source = cleanup_html(html) if cleanup else html
    return fragment_fromstring(source, create_parent=True)
Example #12
Source File: diff.py    From lambda-text-extractor with Apache License 2.0 5 votes vote down vote up
def parse_html(html, cleanup=True):
    """
    Turn an HTML fragment into an lxml element.

    The fragment is parsed with ``create_parent=True``, so the returned
    element is a wrapping <div> that was not present in the input.

    When ``cleanup`` is true the markup is first run through
    ``cleanup_html`` to strip extra structure such as <head> or <body>
    and to remove any <ins> and <del> tags.
    """
    # Only pre-process the markup when the caller asked for it.
    source = cleanup_html(html) if cleanup else html
    return fragment_fromstring(source, create_parent=True)