Python lxml.etree.iselement() Examples

The following are 30 code examples of lxml.etree.iselement(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module lxml.etree, or try the search function.
Example #1
Source File: diff.py    From stopstalk-deployment with MIT License 6 votes vote down vote up
def tokenize(html, include_hrefs=True):
    """
    Convert HTML into token objects (words with their attached tags).

    ``html`` may already be an element (as judged by ``etree.iselement``),
    in which case it is used directly; anything else is handed to
    ``parse_html(html, cleanup=True)`` first.  Only page content is
    considered: the head is ignored, and the <head> and <body> elements
    are optional.  Parsing goes through lxml, which yields a valid
    document (though lxml may guess wrongly when the markup is
    particularly bad).

    <ins> and <del> tags are dropped from the document, since keeping
    them gets confusing.

    When include_hrefs is true, the href attribute of <a> tags is
    emitted as a special kind of diffable token."""
    # Reuse an element as-is; parse everything else.
    body_el = html if etree.iselement(html) else parse_html(html, cleanup=True)
    # Flatten into per-tag / per-word / per-end-tag text chunks, then
    # re-join those chunks into token objects.
    return fixup_chunks(
        flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs))
Example #2
Source File: diff.py    From lambda-text-extractor with Apache License 2.0 6 votes vote down vote up
def tokenize(html, include_hrefs=True):
    """
    Convert HTML into token objects (words with their attached tags).

    ``html`` may already be an element (as judged by ``etree.iselement``),
    in which case it is used directly; anything else is handed to
    ``parse_html(html, cleanup=True)`` first.  Only page content is
    considered: the head is ignored, and the <head> and <body> elements
    are optional.  Parsing goes through lxml, which yields a valid
    document (though lxml may guess wrongly when the markup is
    particularly bad).

    <ins> and <del> tags are dropped from the document, since keeping
    them gets confusing.

    When include_hrefs is true, the href attribute of <a> tags is
    emitted as a special kind of diffable token."""
    # Reuse an element as-is; parse everything else.
    body_el = html if etree.iselement(html) else parse_html(html, cleanup=True)
    # Flatten into per-tag / per-word / per-end-tag text chunks, then
    # re-join those chunks into token objects.
    return fixup_chunks(
        flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs))
Example #3
Source File: diff.py    From lambda-text-extractor with Apache License 2.0 6 votes vote down vote up
def tokenize(html, include_hrefs=True):
    """
    Convert HTML into token objects (words with their attached tags).

    ``html`` may already be an element (as judged by ``etree.iselement``),
    in which case it is used directly; anything else is handed to
    ``parse_html(html, cleanup=True)`` first.  Only page content is
    considered: the head is ignored, and the <head> and <body> elements
    are optional.  Parsing goes through lxml, which yields a valid
    document (though lxml may guess wrongly when the markup is
    particularly bad).

    <ins> and <del> tags are dropped from the document, since keeping
    them gets confusing.

    When include_hrefs is true, the href attribute of <a> tags is
    emitted as a special kind of diffable token."""
    # Reuse an element as-is; parse everything else.
    body_el = html if etree.iselement(html) else parse_html(html, cleanup=True)
    # Flatten into per-tag / per-word / per-end-tag text chunks, then
    # re-join those chunks into token objects.
    return fixup_chunks(
        flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs))
Example #4
Source File: service.py    From gdata-python3 with Apache License 2.0 6 votes vote down vote up
def CalculateDataLength(data):
    """Try to work out the length of the data that is about to be sent.

    Args:
      data: object The payload whose length is wanted.

    Returns:
      ``len(data)`` for a string; the length of
      ``ElementTree.tostring(data)`` for an ElementTree element; ``None``
      for a list or for any object exposing a ``read`` attribute; and
      ``len(str(data))`` for everything else.
    """
    if isinstance(data, str):
        return len(data)
    if isinstance(data, list):
        return None
    if ElementTree.iselement(data):
        serialized = ElementTree.tostring(data)
        return len(serialized)
    if hasattr(data, 'read'):
        # File-like object: do not try to guess how much it will yield.
        return None
    # Fall back to the length of the object's string form.
    return len(str(data))
Example #5
Source File: service.py    From gdata-python3 with Apache License 2.0 6 votes vote down vote up
def __SendDataPart(data, connection):
    """This method is deprecated, use atom.http._send_data_part

    Sends ``data`` over ``connection`` using ``connection.send``.

    Args:
      data: A string (sent as-is), an ElementTree element (sent as
          ``ElementTree.tostring(data)``), an object with a ``read``
          method (read and sent in chunks of up to 100000), or any other
          object (sent as ``str(data)``).
      connection: Object exposing a ``send`` method.
    """
    deprecated('call to deprecated function __SendDataPart')
    if isinstance(data, str):
        # TODO add handling for unicode.
        connection.send(data)
        return
    elif ElementTree.iselement(data):
        connection.send(ElementTree.tostring(data))
        return
    # Check to see if data is a file-like object that has a read method.
    elif hasattr(data, 'read'):
        # Read the file and send it a chunk at a time.
        while True:
            binarydata = data.read(100000)
            # An empty chunk marks end-of-file. Use truthiness rather than
            # comparing with '' so that b'' from a binary-mode stream also
            # terminates the loop instead of spinning forever.
            if not binarydata:
                break
            connection.send(binarydata)
        return
    else:
        # The data object was not a file.
        # Try to convert to a string and send the data.
        connection.send(str(data))
        return
Example #6
Source File: __init__.py    From gdata-python3 with Apache License 2.0 6 votes vote down vote up
def SetXmlBlob(self, blob):
        """Store an XML blob as the only child node of the extendedProperty.

        The extendedProperty may hold just one child element as its XML
        blob, so any extension elements already present on this object are
        discarded before the new one is added.

        Args:
          blob: str, ElementTree Element or atom.ExtensionElement representing
                the XML blob stored in the extendedProperty.
        """
        # Reset first: this clears the child nodes of the extendedProperty
        # even if converting the blob below fails.
        self.extension_elements = []
        if isinstance(blob, atom.ExtensionElement):
            converted = blob
        elif ElementTree.iselement(blob):
            converted = atom._ExtensionElementFromElementTree(blob)
        else:
            converted = atom.ExtensionElementFromString(blob)
        self.extension_elements.append(converted)
Example #7
Source File: client.py    From pyvas with MIT License 6 votes vote down vote up
def _send_request(self, request):
        """Send XML data to OpenVAS Manager and get results

        An element request is written to ``self.socket`` as a UTF-8 encoded
        tree; any other request is sent with ``self.socket.send`` (text is
        UTF-8 encoded first).  The reply is fed to a tree builder in blocks
        and the builder's result is returned.
        """
        chunk_size = 1024

        if etree.iselement(request):
            etree.ElementTree(request).write(self.socket, encoding="utf-8")
        else:
            payload = request
            if isinstance(payload, six.text_type):
                payload = payload.encode("utf-8")
            self.socket.send(payload)

        builder = etree.XMLTreeBuilder()

        # NOTE(review): reading stops at the first block shorter than
        # chunk_size, so a short read in the middle of a reply would end
        # the loop early -- confirm the peer's framing guarantees.
        received = self.socket.recv(chunk_size)
        builder.feed(received)
        while len(received) >= chunk_size:
            received = self.socket.recv(chunk_size)
            builder.feed(received)

        return builder.close()
Example #8
Source File: xml.py    From peach with Mozilla Public License 2.0 6 votes vote down vote up
def handleElement(self, node, parent):
        """
        Handle an XML element, children and attributes. Returns an XmlElement object.

        The new XmlElement is created with ``parent`` as its parent and takes
        its namespace (when present) and name from ``split_ns(node.tag)``.
        Attributes, the node's leading text, each child and each child's tail
        text are then processed in document order.
        """
        # Element
        element = XmlElement(None, parent)
        ns, tag = split_ns(node.tag)
        if ns is not None:
            element.xmlNamespace = ns
        element.elementName = tag
        # Element attributes
        for attrib in node.keys():
            attribElement = self.handleAttribute(attrib, node.get(attrib), element)
            element.append(attribElement)
        # Element children
        self._handleText(node.text, element)
        for child in node.iterchildren():
            if etree.iselement(child):  # TODO: skip comments
                childElement = self.handleElement(child, element)
                element.append(childElement)
            # Text following the child belongs to this element, not the child.
            self._handleText(child.tail, element)
        return element
Example #9
Source File: xml.py    From peach with Mozilla Public License 2.0 6 votes vote down vote up
def handleElement(self, node, parent):
        """
        Handle an XML element, children and attributes. Returns an XmlElement object.

        Returns None straight away when ``parent`` is None.  Otherwise an
        ``XmlElement`` etree element describing ``node`` is appended to
        ``parent``, filled from the node's attributes, text and children,
        and returned.
        """
        if parent is None:
            return None
        # Element
        namespace, local_name = split_ns(node.tag)
        wrapper = etree.Element("XmlElement")
        wrapper.set("elementName", local_name)
        if namespace is not None:
            wrapper.set("ns", namespace)
        parent.append(wrapper)
        # Element attributes
        for attr_name in node.keys():
            wrapper.append(
                self.handleAttribute(attr_name, node.get(attr_name), wrapper))
        # Element children
        self._handleText(node.text, wrapper)
        for sub in node.iterchildren():
            if etree.iselement(sub):  # TODO: skip comments
                self.handleElement(sub, wrapper)
            # Tail text of the child is attached to this wrapper.
            self._handleText(sub.tail, wrapper)
        return wrapper
Example #10
Source File: xml.py    From python-gvm with GNU General Public License v3.0 6 votes vote down vote up
def pretty_print(xml):
    """Print XML in a readable, indented form.

    Accepts a string containing XML, a list of items, or a single lxml
    element.  List items that are elements are pretty-printed; other list
    items are printed unchanged.  Input of any other type prints nothing.

    Arguments:
        xml (str, List[lxml.etree.Element] or lxml.etree.Element):
            xml as string,
            List[lxml.etree.Element] or directly a lxml element.

    """
    def _emit(element):
        # Serialize with indentation and print the decoded text.
        print(etree.tostring(element, pretty_print=True).decode("utf-8"))

    if isinstance(xml, list):
        for entry in xml:
            if etree.iselement(entry):
                _emit(entry)
            else:
                print(entry)
        return

    if etree.iselement(xml):
        _emit(xml)
        return

    if isinstance(xml, str):
        # Strings are parsed first, then printed like any other element.
        _emit(secET.fromstring(xml))
Example #11
Source File: diff.py    From learn_python3_spider with MIT License 6 votes vote down vote up
def tokenize(html, include_hrefs=True):
    """
    Convert HTML into token objects (words with their attached tags).

    ``html`` may already be an element (as judged by ``etree.iselement``),
    in which case it is used directly; anything else is handed to
    ``parse_html(html, cleanup=True)`` first.  Only page content is
    considered: the head is ignored, and the <head> and <body> elements
    are optional.  Parsing goes through lxml, which yields a valid
    document (though lxml may guess wrongly when the markup is
    particularly bad).

    <ins> and <del> tags are dropped from the document, since keeping
    them gets confusing.

    When include_hrefs is true, the href attribute of <a> tags is
    emitted as a special kind of diffable token."""
    # Reuse an element as-is; parse everything else.
    body_el = html if etree.iselement(html) else parse_html(html, cleanup=True)
    # Flatten into per-tag / per-word / per-end-tag text chunks, then
    # re-join those chunks into token objects.
    return fixup_chunks(
        flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs))
Example #12
Source File: diff.py    From aws-lambda-lxml with GNU General Public License v3.0 6 votes vote down vote up
def tokenize(html, include_hrefs=True):
    """
    Convert HTML into token objects (words with their attached tags).

    ``html`` may already be an element (as judged by ``etree.iselement``),
    in which case it is used directly; anything else is handed to
    ``parse_html(html, cleanup=True)`` first.  Only page content is
    considered: the head is ignored, and the <head> and <body> elements
    are optional.  Parsing goes through lxml, which yields a valid
    document (though lxml may guess wrongly when the markup is
    particularly bad).

    <ins> and <del> tags are dropped from the document, since keeping
    them gets confusing.

    When include_hrefs is true, the href attribute of <a> tags is
    emitted as a special kind of diffable token."""
    # Reuse an element as-is; parse everything else.
    body_el = html if etree.iselement(html) else parse_html(html, cleanup=True)
    # Flatten into per-tag / per-word / per-end-tag text chunks, then
    # re-join those chunks into token objects.
    return fixup_chunks(
        flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs))
Example #13
Source File: diff.py    From xmldiff with MIT License 6 votes vote down vote up
def set_trees(self, left, right):
        """Reset this object and store the two trees to compare.

        ``etree._ElementTree`` arguments are replaced by their root element.
        Raises TypeError unless both resulting values are elements.  A deep
        copy of ``left`` is stored on ``self.left``; ``right`` is stored
        as-is on ``self.right``.
        """
        self.clear()

        # Make sure we were passed two lxml elements:
        left, right = (
            tree.getroot() if isinstance(tree, etree._ElementTree) else tree
            for tree in (left, right)
        )

        if not etree.iselement(left) or not etree.iselement(right):
            raise TypeError("The 'left' and 'right' parameters must be "
                            "lxml Elements.")

        # Left gets modified as a part of the diff, deepcopy it first.
        self.left = deepcopy(left)
        self.right = right
Example #14
Source File: diff.py    From aws-lambda-lxml with GNU General Public License v3.0 6 votes vote down vote up
def tokenize(html, include_hrefs=True):
    """
    Convert HTML into token objects (words with their attached tags).

    ``html`` may already be an element (as judged by ``etree.iselement``),
    in which case it is used directly; anything else is handed to
    ``parse_html(html, cleanup=True)`` first.  Only page content is
    considered: the head is ignored, and the <head> and <body> elements
    are optional.  Parsing goes through lxml, which yields a valid
    document (though lxml may guess wrongly when the markup is
    particularly bad).

    <ins> and <del> tags are dropped from the document, since keeping
    them gets confusing.

    When include_hrefs is true, the href attribute of <a> tags is
    emitted as a special kind of diffable token."""
    # Reuse an element as-is; parse everything else.
    body_el = html if etree.iselement(html) else parse_html(html, cleanup=True)
    # Flatten into per-tag / per-word / per-end-tag text chunks, then
    # re-join those chunks into token objects.
    return fixup_chunks(
        flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs))
Example #15
Source File: diff.py    From aws-lambda-lxml with GNU General Public License v3.0 6 votes vote down vote up
def tokenize(html, include_hrefs=True):
    """
    Convert HTML into token objects (words with their attached tags).

    ``html`` may already be an element (as judged by ``etree.iselement``),
    in which case it is used directly; anything else is handed to
    ``parse_html(html, cleanup=True)`` first.  Only page content is
    considered: the head is ignored, and the <head> and <body> elements
    are optional.  Parsing goes through lxml, which yields a valid
    document (though lxml may guess wrongly when the markup is
    particularly bad).

    <ins> and <del> tags are dropped from the document, since keeping
    them gets confusing.

    When include_hrefs is true, the href attribute of <a> tags is
    emitted as a special kind of diffable token."""
    # Reuse an element as-is; parse everything else.
    body_el = html if etree.iselement(html) else parse_html(html, cleanup=True)
    # Flatten into per-tag / per-word / per-end-tag text chunks, then
    # re-join those chunks into token objects.
    return fixup_chunks(
        flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs))
Example #16
Source File: test_response.py    From pyvas with MIT License 5 votes vote down vote up
def test_response_init(response):
    """Check the attributes and dict-style access of a freshly built response."""
    # attributes
    assert response.ok
    # Compare by value: identity checks against an int literal depend on
    # interpreter caching and raise a SyntaxWarning on newer Pythons.
    assert response.status_code == 200
    assert response.command == "test"
    assert iselement(response.xml)
    # data dict elements
    assert response["@test_id"] == "1234"
    assert response["child"]["@id"] == "1234"
Example #17
Source File: test_etree_check_command_transform.py    From python-gvm with GNU General Public License v3.0 5 votes vote down vote up
def test_success_response(self):
        """A response with status 200 is returned as an element with tag and status intact."""
        source = etree.Element('foo_response')
        source.set('status', '200')
        xml_text = etree.tostring(source).decode('utf-8')

        transform = EtreeCheckCommandTransform()
        parsed = transform(xml_text)

        self.assertTrue(etree.iselement(parsed))
        self.assertEqual(parsed.tag, 'foo_response')
        self.assertEqual(parsed.get('status'), '200')
Example #18
Source File: builder.py    From lambda-text-extractor with Apache License 2.0 5 votes vote down vote up
def __call__(self, tag, *children, **attrib):
        """Create an element for ``tag`` and populate it from ``children``.

        The configured namespace is prefixed to ``tag`` unless the tag
        already starts with ``{``.  Keyword attributes go through the
        typemap's ``dict`` handler.  Each child is called first if it is
        callable, then dispatched to the handler registered for its exact
        type.  With no exact match, elements are appended directly and
        other values are matched against their type's MRO.

        Raises:
            TypeError: no handler is registered for a child's type or any
                of its base types.
        """
        registry = self._typemap

        namespace = self._namespace
        if namespace is not None and tag[0] != '{':
            tag = namespace + tag
        elem = self._makeelement(tag, nsmap=self._nsmap)
        if attrib:
            registry[dict](elem, attrib)

        for child in children:
            value = child() if callable(child) else child
            handler = registry.get(type(value))
            if handler is None:
                if ET.iselement(value):
                    elem.append(value)
                    continue
                # See if the typemap knows of any of this type's bases.
                handler = next(
                    (found for found in map(registry.get, type(value).__mro__)
                     if found is not None),
                    None)
                if handler is None:
                    raise TypeError("bad argument type: %s(%r)" %
                                    (type(value).__name__, value))
            produced = handler(elem, value)
            if produced:
                # A truthy result is itself dispatched by its own type.
                registry.get(type(produced))(elem, produced)

        return elem
Example #19
Source File: test_etree_transform.py    From python-gvm with GNU General Public License v3.0 5 votes vote down vote up
def test_transform_response(self):
        """Transforming a minimal XML document yields an element."""
        transform = EtreeTransform()
        # The input must be well-formed XML: the closing '>' is required.
        result = transform('<foo/>')

        self.assertTrue(etree.iselement(result))
Example #20
Source File: builder.py    From lambda-text-extractor with Apache License 2.0 5 votes vote down vote up
def __call__(self, tag, *children, **attrib):
        """Create an element for ``tag`` and populate it from ``children``.

        The configured namespace is prefixed to ``tag`` unless the tag
        already starts with ``{``.  Keyword attributes go through the
        typemap's ``dict`` handler.  Each child is called first if it is
        callable, then dispatched to the handler registered for its exact
        type.  With no exact match, elements are appended directly and
        other values are matched against their type's MRO.

        Raises:
            TypeError: no handler is registered for a child's type or any
                of its base types.
        """
        registry = self._typemap

        namespace = self._namespace
        if namespace is not None and tag[0] != '{':
            tag = namespace + tag
        elem = self._makeelement(tag, nsmap=self._nsmap)
        if attrib:
            registry[dict](elem, attrib)

        for child in children:
            value = child() if callable(child) else child
            handler = registry.get(type(value))
            if handler is None:
                if ET.iselement(value):
                    elem.append(value)
                    continue
                # See if the typemap knows of any of this type's bases.
                handler = next(
                    (found for found in map(registry.get, type(value).__mro__)
                     if found is not None),
                    None)
                if handler is None:
                    raise TypeError("bad argument type: %s(%r)" %
                                    (type(value).__name__, value))
            produced = handler(elem, value)
            if produced:
                # A truthy result is itself dispatched by its own type.
                registry.get(type(produced))(elem, produced)

        return elem
Example #21
Source File: test_etree_transform.py    From python-gvm with GNU General Public License v3.0 5 votes vote down vote up
def test_transform_more_complex_response(self):
        """A document with an attribute and two children keeps its tag, attribute and child count."""
        parsed = EtreeTransform()('<foo id="bar"><lorem/><ipsum/></foo>')

        self.assertTrue(etree.iselement(parsed))
        self.assertEqual(parsed.tag, 'foo')
        self.assertEqual(parsed.get('id'), 'bar')
        self.assertEqual(len(parsed), 2)
Example #22
Source File: builder.py    From aws-lambda-lxml with GNU General Public License v3.0 5 votes vote down vote up
def __call__(self, tag, *children, **attrib):
        """Create an element for ``tag`` and populate it from ``children``.

        The configured namespace is prefixed to ``tag`` unless the tag
        already starts with ``{``.  Keyword attributes go through the
        typemap's ``dict`` handler.  Each child is called first if it is
        callable, then dispatched to the handler registered for its exact
        type.  With no exact match, elements are appended directly and
        other values are matched against their type's MRO.

        Raises:
            TypeError: no handler is registered for a child's type or any
                of its base types.
        """
        lookup = self._typemap.get

        namespace = self._namespace
        if namespace is not None and tag[0] != '{':
            tag = namespace + tag
        elem = self._makeelement(tag, nsmap=self._nsmap)
        if attrib:
            lookup(dict)(elem, attrib)

        for child in children:
            value = child() if callable(child) else child
            handler = lookup(type(value))
            if handler is None:
                if ET.iselement(value):
                    elem.append(value)
                    continue
                # See if the typemap knows of any of this type's bases.
                handler = next(
                    (found for found in map(lookup, type(value).__mro__)
                     if found is not None),
                    None)
                if handler is None:
                    raise TypeError("bad argument type: %s(%r)" %
                                    (type(value).__name__, value))
            produced = handler(elem, value)
            if produced:
                # A truthy result is itself dispatched by its own type.
                lookup(type(produced))(elem, produced)

        return elem
Example #23
Source File: test_client.py    From pyvas with MIT License 5 votes vote down vote up
def test_download_report_with_xml_format(self, client, report):
        """Downloading a report by id returns an element carrying that same id."""
        report_id = report["@id"]
        downloaded = client.download_report(uuid=report_id)
        assert etree.iselement(downloaded)
        assert downloaded.attrib["id"] == report["@id"]
Example #24
Source File: test_client.py    From pyvas with MIT License 5 votes vote down vote up
def test_client_send_request(client):
    """Sending a raw XML string through the client yields an element."""
    reply = client._send_request("<describe_auth/>")
    is_element = etree.iselement(reply)
    assert is_element
Example #25
Source File: parser.py    From peach with Mozilla Public License 2.0 5 votes vote down vote up
def StripComments(self, node):
        """Recursively delete every child of ``node`` that is not an element.

        Children for which ``etree.iselement`` is false are removed in place;
        the remaining children are processed the same way.
        """
        # Walk backwards so deleting an entry never shifts the positions
        # still waiting to be visited.
        for position in range(len(node) - 1, -1, -1):
            child = node[position]
            if etree.iselement(child):
                self.StripComments(child)
            else:
                del node[position]  # may not preserve text, don't care
Example #26
Source File: builder.py    From aws-lambda-lxml with GNU General Public License v3.0 5 votes vote down vote up
def __call__(self, tag, *children, **attrib):
        """Create an element for ``tag`` and populate it from ``children``.

        The configured namespace is prefixed to ``tag`` unless the tag
        already starts with ``{``.  Keyword attributes go through the
        typemap's ``dict`` handler.  Each child is called first if it is
        callable, then dispatched to the handler registered for its exact
        type.  With no exact match, elements are appended directly and
        other values are matched against their type's MRO.

        Raises:
            TypeError: no handler is registered for a child's type or any
                of its base types.
        """
        lookup = self._typemap.get

        namespace = self._namespace
        if namespace is not None and tag[0] != '{':
            tag = namespace + tag
        elem = self._makeelement(tag, nsmap=self._nsmap)
        if attrib:
            lookup(dict)(elem, attrib)

        for child in children:
            value = child() if callable(child) else child
            handler = lookup(type(value))
            if handler is None:
                if ET.iselement(value):
                    elem.append(value)
                    continue
                # See if the typemap knows of any of this type's bases.
                handler = next(
                    (found for found in map(lookup, type(value).__mro__)
                     if found is not None),
                    None)
                if handler is None:
                    raise TypeError("bad argument type: %s(%r)" %
                                    (type(value).__name__, value))
            produced = handler(elem, value)
            if produced:
                # A truthy result is itself dispatched by its own type.
                lookup(type(produced))(elem, produced)

        return elem
Example #27
Source File: builder.py    From aws-lambda-lxml with GNU General Public License v3.0 5 votes vote down vote up
def __call__(self, tag, *children, **attrib):
        """Create an element for ``tag`` and populate it from ``children``.

        The configured namespace is prefixed to ``tag`` unless the tag
        already starts with ``{``.  Keyword attributes go through the
        typemap's ``dict`` handler.  Each child is called first if it is
        callable, then dispatched to the handler registered for its exact
        type.  With no exact match, elements are appended directly and
        other values are matched against their type's MRO.

        Raises:
            TypeError: no handler is registered for a child's type or any
                of its base types.
        """
        lookup = self._typemap.get

        namespace = self._namespace
        if namespace is not None and tag[0] != '{':
            tag = namespace + tag
        elem = self._makeelement(tag, nsmap=self._nsmap)
        if attrib:
            lookup(dict)(elem, attrib)

        for child in children:
            value = child() if callable(child) else child
            handler = lookup(type(value))
            if handler is None:
                if ET.iselement(value):
                    elem.append(value)
                    continue
                # See if the typemap knows of any of this type's bases.
                handler = next(
                    (found for found in map(lookup, type(value).__mro__)
                     if found is not None),
                    None)
                if handler is None:
                    raise TypeError("bad argument type: %s(%r)" %
                                    (type(value).__name__, value))
            produced = handler(elem, value)
            if produced:
                # A truthy result is itself dispatched by its own type.
                lookup(type(produced))(elem, produced)

        return elem
Example #28
Source File: builder.py    From stopstalk-deployment with MIT License 5 votes vote down vote up
def __call__(self, tag, *children, **attrib):
        """Create an element for ``tag`` and populate it from ``children``.

        The configured namespace is prefixed to ``tag`` unless the tag
        already starts with ``{``.  Keyword attributes go through the
        typemap's ``dict`` handler.  Each child is called first if it is
        callable, then dispatched to the handler registered for its exact
        type.  With no exact match, elements are appended directly and
        other values are matched against their type's MRO.

        Raises:
            TypeError: no handler is registered for a child's type or any
                of its base types.
        """
        lookup = self._typemap.get

        namespace = self._namespace
        if namespace is not None and tag[0] != '{':
            tag = namespace + tag
        elem = self._makeelement(tag, nsmap=self._nsmap)
        if attrib:
            lookup(dict)(elem, attrib)

        for child in children:
            value = child() if callable(child) else child
            handler = lookup(type(value))
            if handler is None:
                if ET.iselement(value):
                    elem.append(value)
                    continue
                # See if the typemap knows of any of this type's bases.
                handler = next(
                    (found for found in map(lookup, type(value).__mro__)
                     if found is not None),
                    None)
                if handler is None:
                    raise TypeError("bad argument type: %s(%r)" %
                                    (type(value).__name__, value))
            produced = handler(elem, value)
            if produced:
                # A truthy result is itself dispatched by its own type.
                lookup(type(produced))(elem, produced)

        return elem
Example #29
Source File: builder.py    From aws-lambda-lxml with GNU General Public License v3.0 5 votes vote down vote up
def __call__(self, tag, *children, **attrib):
        """Create an element for ``tag`` and populate it from ``children``.

        The configured namespace is prefixed to ``tag`` unless the tag
        already starts with ``{``.  Keyword attributes go through the
        typemap's ``dict`` handler.  Each child is called first if it is
        callable, then dispatched to the handler registered for its exact
        type.  With no exact match, elements are appended directly and
        other values are matched against their type's MRO.

        Raises:
            TypeError: no handler is registered for a child's type or any
                of its base types.
        """
        lookup = self._typemap.get

        namespace = self._namespace
        if namespace is not None and tag[0] != '{':
            tag = namespace + tag
        elem = self._makeelement(tag, nsmap=self._nsmap)
        if attrib:
            lookup(dict)(elem, attrib)

        for child in children:
            value = child() if callable(child) else child
            handler = lookup(type(value))
            if handler is None:
                if ET.iselement(value):
                    elem.append(value)
                    continue
                # See if the typemap knows of any of this type's bases.
                handler = next(
                    (found for found in map(lookup, type(value).__mro__)
                     if found is not None),
                    None)
                if handler is None:
                    raise TypeError("bad argument type: %s(%r)" %
                                    (type(value).__name__, value))
            produced = handler(elem, value)
            if produced:
                # A truthy result is itself dispatched by its own type.
                lookup(type(produced))(elem, produced)

        return elem
Example #30
Source File: builder.py    From learn_python3_spider with MIT License 5 votes vote down vote up
def __call__(self, tag, *children, **attrib):
        """Create an element for ``tag`` and populate it from ``children``.

        The configured namespace is prefixed to ``tag`` unless the tag
        already starts with ``{``.  Keyword attributes go through the
        typemap's ``dict`` handler.  Each child is called first if it is
        callable, then dispatched to the handler registered for its exact
        type.  With no exact match, elements are appended directly and
        other values are matched against their type's MRO.

        Raises:
            TypeError: no handler is registered for a child's type or any
                of its base types.
        """
        registry = self._typemap

        namespace = self._namespace
        if namespace is not None and tag[0] != '{':
            tag = namespace + tag
        elem = self._makeelement(tag, nsmap=self._nsmap)
        if attrib:
            registry[dict](elem, attrib)

        for child in children:
            value = child() if callable(child) else child
            handler = registry.get(type(value))
            if handler is None:
                if ET.iselement(value):
                    elem.append(value)
                    continue
                # See if the typemap knows of any of this type's bases.
                handler = next(
                    (found for found in map(registry.get, type(value).__mro__)
                     if found is not None),
                    None)
                if handler is None:
                    raise TypeError("bad argument type: %s(%r)" %
                                    (type(value).__name__, value))
            produced = handler(elem, value)
            if produced:
                # A truthy result is itself dispatched by its own type.
                registry.get(type(produced))(elem, produced)

        return elem