Python Examples of xml.etree.ElementTree.XMLParser

Source File: read.py From typhon with MIT License

6 votes

def parse(source, binaryfp=None):
    """Read an ArtsXML document and return its element tree.

    A fresh copy of the ``ARTSElement`` class is built on every call so that
    ``binaryfp`` can be attached as a class attribute of the copy; the copy
    is then used as the element factory of the tree builder.

    Args:
        source (str): Filename or file pointer.
        binaryfp: Value stored as the ``binaryfp`` attribute of the element
            class used for this parse. Defaults to None.

    Returns:
        xml.etree.ElementTree: XML Tree of the ARTS data file.

    """
    element_cls = type('ARTSElementBinaryFP',
                       ARTSElement.__bases__,
                       dict(ARTSElement.__dict__))
    element_cls.binaryfp = binaryfp

    builder = ElementTree.TreeBuilder(element_factory=element_cls)
    xml_parser = ElementTree.XMLParser(target=builder)
    return ElementTree.parse(source, parser=xml_parser)

Source File: main.py From meshio with MIT License

6 votes

def read(self):
        """Parse ``self.filename`` as XDMF and dispatch on its major version.

        Returns:
            The result of ``self.read_xdmf2(root)`` for version 2.x files or
            of ``self.read_xdmf3(root)`` for version 3.x files.

        Raises:
            ReadError: If the root tag is not ``Xdmf`` or the major version
                is neither 2 nor 3.
        """
        root = ET.parse(self.filename, ET.XMLParser()).getroot()
        if root.tag != "Xdmf":
            raise ReadError()

        version = root.get("Version")
        # Only the part before the first dot decides which reader is used.
        major = version.split(".")[0]

        if major == "2":
            return self.read_xdmf2(root)
        if major == "3":
            return self.read_xdmf3(root)
        raise ReadError("Unknown XDMF version {}.".format(version))

Source File: test_xml_etree.py From gcblue with BSD 3-Clause "New" or "Revised" License

6 votes

def bug_200708_close():
    """Doctest for the value returned by ``XMLParser.close()``.

    The function body is only this docstring; the examples below are the
    test. ``summarize`` is not defined in this block.

    Test default builder.
    >>> parser = ET.XMLParser() # default
    >>> parser.feed("<element>some text</element>")
    >>> summarize(parser.close())
    'element'

    Test custom builder.
    The target's own ``close()`` result is what ``parser.close()`` returns.
    >>> class EchoTarget:
    ...     def close(self):
    ...         return ET.Element("element") # simulate root
    >>> parser = ET.XMLParser(EchoTarget())
    >>> parser.feed("<element>some text</element>")
    >>> summarize(parser.close())
    'element'

    """

Source File: parser.py From fbchat-archive-parser with MIT License

6 votes

def parse_impl(self):
        """
        Walk the HTML in ``self.handle`` as a stream of start/end events
        instead of loading the whole document, saving every thread found.

        The text of an ``h1`` element sets ``self.user`` when it is not
        already set; each ``div`` whose class contains "thread" is handed to
        ``parse_thread`` and the result passed to ``save_thread``.
        """

        # str() keeps the encoding name a byte string under Python 2, which
        # the parser requires there.
        parser = XMLParser(encoding=str('UTF-8'))
        events = ET.iterparse(self.handle, events=("start", "end"), parser=parser)
        for kind, node in events:
            tag, class_attr = _tag_and_class_attr(node)
            if tag == "h1" and kind == "end":
                if not self.user:
                    self.user = node.text.strip()
                continue
            if tag == "div" and "thread" in class_attr and kind == "start":
                members = self.parse_participants(node)
                # parse_thread receives the same iterator being looped over.
                thread = self.parse_thread(members, events, True)
                self.save_thread(thread)

Source File: document_processor.py From mma-dexter with Apache License 2.0

6 votes

def fetch_daily_feeds(self, day):
        """ Fetch the feed for +day+ and return the parsed root element.

        Raises ValueError when FEED_PASSWORD is unset; HTTP error statuses
        are raised by ``raise_for_status()``.
        """
        from xml.etree import ElementTree
        from htmlentitydefs import name2codepoint

        if self.FEED_PASSWORD is None:
            raise ValueError("%s.FEED_PASSWORD must be set." % self.__class__.__name__)

        feed_url = self.FEED_URL % day.strftime('%d-%m-%Y')
        # NOTE(review): verify=False turns off TLS certificate checks while
        # credentials are being sent -- confirm this is intentional.
        response = requests.get(feed_url,
                                auth=(self.FEED_USER, self.FEED_PASSWORD),
                                verify=False,
                                timeout=60)
        response.raise_for_status()

        parser = ElementTree.XMLParser()
        # Presumably needed so undefined entities are looked up in
        # parser.entity instead of failing the parse -- TODO confirm.
        parser.parser.UseForeignDTD(True)
        # Register every named HTML entity with its unicode character.
        for name, codepoint in name2codepoint.iteritems():
            parser.entity[name] = unichr(codepoint)

        return ElementTree.fromstring(response.text.encode('utf-8'), parser=parser)

Source File: document_processor.py From mma-dexter with Apache License 2.0

6 votes

def fetch_filtered_daily_feeds(self, day, filter_parm):
        """ Fetch the filtered feed for +day+ and return the parsed root element.

        +filter_parm+ is substituted into FEED_FILTER_URL after the date.
        Raises ValueError when FEED_PASSWORD is unset; HTTP error statuses
        are raised by ``raise_for_status()``.
        """
        from xml.etree import ElementTree
        from htmlentitydefs import name2codepoint

        if self.FEED_PASSWORD is None:
            raise ValueError("%s.FEED_PASSWORD must be set." % self.__class__.__name__)

        feed_url = self.FEED_FILTER_URL % (day.strftime('%d-%m-%Y'), filter_parm)
        # NOTE(review): verify=False turns off TLS certificate checks while
        # credentials are being sent -- confirm this is intentional.
        response = requests.get(feed_url,
                                auth=(self.FEED_USER, self.FEED_PASSWORD),
                                verify=False,
                                timeout=60)
        response.raise_for_status()

        parser = ElementTree.XMLParser()
        # Presumably needed so undefined entities are looked up in
        # parser.entity instead of failing the parse -- TODO confirm.
        parser.parser.UseForeignDTD(True)
        # Register every named HTML entity with its unicode character.
        for name, codepoint in name2codepoint.iteritems():
            parser.entity[name] = unichr(codepoint)

        return ElementTree.fromstring(response.text.encode('utf-8'), parser=parser)

Source File: omexml.py From aicsimageio with BSD 3-Clause "New" or "Revised" License

6 votes

def __init__(self, xml=None, rootnode=None):
        """Set up the DOM, namespaces and UUID of the OME-XML document.

        ``rootnode``, when given, is used as the DOM directly. Otherwise
        ``xml`` (or ``default_xml`` when ``xml`` is None) is parsed, using
        ISO-8859-1 on Windows platforms and UTF-8 elsewhere.
        """
        if rootnode is not None:
            self.dom = rootnode
        else:
            if xml is None:
                xml = default_xml
            enc = 'ISO-8859-1' if sys.platform.startswith('win') else 'UTF-8'
            self.dom = ElementTree.fromstring(xml, ElementTree.XMLParser(encoding=enc))

        # determine OME namespaces
        self.ns = get_namespaces(self.dom)
        # The format check only fires when this module is run as a script.
        if __name__ == '__main__' and self.ns['ome'] is None:
            raise Exception("Error: String not in OME-XML format")

        # generate a uuid if there is none
        # < OME UUID = "urn:uuid:ef8af211-b6c1-44d4-97de-daca46f16346"
        root = self.dom
        if not root.get('UUID'):
            root.set('UUID', 'urn:uuid:'+str(uuid.uuid4()))
        self.uuidStr = root.get('UUID')

Source File: test_xml_etree.py From oss-ftp with MIT License

6 votes

def bug_200708_close():
    """Doctest for the value returned by ``XMLParser.close()``.

    The function body is only this docstring; the examples below are the
    test. ``summarize`` is not defined in this block.

    Test default builder.
    >>> parser = ET.XMLParser() # default
    >>> parser.feed("<element>some text</element>")
    >>> summarize(parser.close())
    'element'

    Test custom builder.
    The target's own ``close()`` result is what ``parser.close()`` returns.
    >>> class EchoTarget:
    ...     def close(self):
    ...         return ET.Element("element") # simulate root
    >>> parser = ET.XMLParser(EchoTarget())
    >>> parser.feed("<element>some text</element>")
    >>> summarize(parser.close())
    'element'

    """

Source File: test_xml_etree.py From BinderFilter with MIT License

6 votes

def bug_200708_close():
    """Doctest for the value returned by ``XMLParser.close()``.

    The function body is only this docstring; the examples below are the
    test. ``summarize`` is not defined in this block.

    Test default builder.
    >>> parser = ET.XMLParser() # default
    >>> parser.feed("<element>some text</element>")
    >>> summarize(parser.close())
    'element'

    Test custom builder.
    The target's own ``close()`` result is what ``parser.close()`` returns.
    >>> class EchoTarget:
    ...     def close(self):
    ...         return ET.Element("element") # simulate root
    >>> parser = ET.XMLParser(EchoTarget())
    >>> parser.feed("<element>some text</element>")
    >>> summarize(parser.close())
    'element'

    """

Source File: test_xml_etree.py From oss-ftp with MIT License

5 votes

def entity():
    """
    Test entity handling.

    The function body is only this docstring; the examples below are the
    test. ``serialize`` and ``ENTITY_XML`` are not defined in this block.

    1) good entities

    A hexadecimal character reference in an attribute is written back by
    serialize() as the equivalent decimal reference (0x8230 == 33328).

    >>> e = ET.XML("<document title='&#x8230;'>test</document>")
    >>> serialize(e)
    '<document title="&#33328;">test</document>'

    2) bad entities

    An entity without a definition makes parsing fail with ParseError.

    >>> ET.XML("<document>&entity;</document>")
    Traceback (most recent call last):
    ParseError: undefined entity: line 1, column 10

    >>> ET.XML(ENTITY_XML)
    Traceback (most recent call last):
    ParseError: undefined entity &entity;: line 5, column 10

    3) custom entity

    A name registered in ``parser.entity`` is replaced by its value, so the
    document that failed above now parses.

    >>> parser = ET.XMLParser()
    >>> parser.entity["entity"] = "text"
    >>> parser.feed(ENTITY_XML)
    >>> root = parser.close()
    >>> serialize(root)
    '<document>text</document>'
    """

Source File: __init__.py From zim-desktop-wiki with GNU General Public License v2.0

5 votes

def new_parsetree_from_xml(xml):
	'''Parse XML text and wrap the resulting root element in a ParseTree.'''
	# For some reason this does not work with cElementTree.XMLBuilder ...
	from xml.etree.ElementTree import XMLParser
	from zim.formats import ParseTree
	parser = XMLParser()
	parser.feed(xml)
	return ParseTree(parser.close())

Source File: test_xml_etree.py From gcblue with BSD 3-Clause "New" or "Revised" License

5 votes

def entity():
    """
    Test entity handling.

    The function body is only this docstring; the examples below are the
    test. ``serialize`` and ``ENTITY_XML`` are not defined in this block.

    1) good entities

    A hexadecimal character reference in an attribute is written back by
    serialize() as the equivalent decimal reference (0x8230 == 33328).

    >>> e = ET.XML("<document title='&#x8230;'>test</document>")
    >>> serialize(e)
    '<document title="&#33328;">test</document>'

    2) bad entities

    An entity without a definition makes parsing fail with ParseError.

    >>> ET.XML("<document>&entity;</document>")
    Traceback (most recent call last):
    ParseError: undefined entity: line 1, column 10

    >>> ET.XML(ENTITY_XML)
    Traceback (most recent call last):
    ParseError: undefined entity &entity;: line 5, column 10

    3) custom entity

    A name registered in ``parser.entity`` is replaced by its value, so the
    document that failed above now parses.

    >>> parser = ET.XMLParser()
    >>> parser.entity["entity"] = "text"
    >>> parser.feed(ENTITY_XML)
    >>> root = parser.close()
    >>> serialize(root)
    '<document>text</document>'
    """

Source File: __init__.py From zim-desktop-wiki with GNU General Public License v2.0

5 votes

def fromstring(self, string):
		'''Replace the root of this tree with the one parsed from XML text.

		Returns self, which allows chaining such as ParseTree().fromstring(..)
		'''
		xml_parser = ElementTreeModule.XMLParser()
		xml_parser.feed(string)
		self._etree._setroot(xml_parser.close())
		return self

Source File: parser.py From fbchat-archive-parser with MIT License

5 votes

def _get_manifest_data(self):
        """Scan the manifest in ``self.handle`` for the user and thread links.

        Anchors are ignored until the start of the first ``div`` whose class
        contains "content"; every anchor after that contributes one entry.

        Returns:
            tuple: ``(user, thread_references)``. ``user`` is the stripped
            text of an ``h1`` element (or None), and ``thread_references``
            is a list of ``(participants, path)`` pairs where ``path`` is the
            anchor's href joined onto ``self.root``.

        Raises:
            UnsuitableParserError: If no anchor was seen after the content
                div started.
        """

        user, thread_references = None, []

        ignore_anchors = True
        saw_anchor = False

        # Cast to str to ensure not unicode under Python 2, as the parser
        # doesn't like that.
        parser = XMLParser(encoding=str('UTF-8'))
        element_iter = ET.iterparse(self.handle, events=("start", "end"), parser=parser)
        for pos, element in element_iter:
            tag, class_attr = _tag_and_class_attr(element)
            if tag == "h1" and pos == "end":
                # NOTE(review): the guard reads self.user but the assignment
                # targets the local `user` -- confirm which one is intended.
                if not self.user:
                    user = element.text.strip()
            elif tag == "div" and "content" in class_attr and pos == "start":
                ignore_anchors = False
            elif tag == "a" and pos == "start":
                if ignore_anchors:
                    continue
                saw_anchor = True
                participants = self.parse_participants(element)
                # The dots are escaped so that only a literal leading "../"
                # is removed; unescaped, the pattern also stripped any two
                # leading characters followed by a slash (e.g. "ab/").
                thread_path = re.sub(r'^\.\./', '', element.attrib['href'])
                if using_windows():
                    thread_path = thread_path.replace('/', '\\')
                thread_references.append(
                    (participants, os.path.join(self.root, thread_path)))

        if not saw_anchor:
            # Indicator of a `messages.htm` file that is probably in the legacy format.
            raise UnsuitableParserError

        return user, thread_references

Source File: parser.py From fbchat-archive-parser with MIT License

5 votes

def process_thread(self, participants, thread_path):
        """Parse the thread file at ``thread_path`` (relative to ``self.root``)
        and save the resulting thread.

        Raises MissingReferenceError, carrying the full path, when a
        FileNotFoundError occurs while opening or parsing the file.
        """

        file_path = os.path.join(self.root, thread_path)

        try:
            with io.open(file_path, 'rt', encoding='utf8') as thread_file:
                xml_parser = XMLParser(encoding=str('UTF-8'))
                stream = SafeXMLStream(thread_file)
                events = ET.iterparse(stream, events=("start", "end"), parser=xml_parser)
                thread = self.parse_thread(participants, events, False)
        except FileNotFoundError:
            raise MissingReferenceError(file_path)
        self.save_thread(thread)

Source File: en_fr_to_raw.py From AmusingPythonCodes with MIT License

5 votes

def read_and_write_xml_data(filenames, output_path):
    """Collect the text of every ``seg`` element and write one per line.

    For each file, the ``srcset`` child of the root is searched: every
    direct ``seg`` child of each of its children contributes its stripped
    text. The texts are joined with newlines and written to ``output_path``
    as UTF-8.
    """
    segments = []
    for path in filenames:
        tree = etree.parse(path, parser=etree.XMLParser(encoding='utf-8'))
        srcset = tree.getroot().find('srcset')
        segments.extend(
            seg.text.strip() for doc in srcset for seg in doc.findall('seg'))
    with codecs.open(output_path, 'w', encoding='utf-8') as out:
        out.write('\n'.join(segments))

Source File: xform2json.py From pyxform with BSD 2-Clause "Simplified" License

5 votes

def __init__(self, root):
        """Build the dictionary form of an XForm given as a string.

        A string ``root`` is parsed via ``_try_parse`` and converted into
        ``self._dict``, keyed by the root tag. Anything that is neither a
        string nor an ``ETree.Element`` raises TypeError.
        """
        if not isinstance(root, basestring):
            if not isinstance(root, ETree.Element):
                raise TypeError("Expected ElementTree.Element or file path string")
            # NOTE(review): an Element is accepted here but neither _root nor
            # _dict is set for it -- confirm this is intended.
            return

        parser = ETree.XMLParser(encoding="UTF-8")
        self._root = _try_parse(root, parser)
        converted = _convert_xml_to_dict_recurse(self._root, XmlDictObject)
        self._dict = XmlDictObject({self._root.tag: converted})

Source File: rest_server_class.py From warriorframework with Apache License 2.0

5 votes

def verify_xml(self, incoming_xml, respond_obj, file=False):
        """
            Check incoming_xml against the expectations in respond_obj, either
            a. by comparing with whole xml files, or
            b. by comparing tag/text pairs
            :param:
                incoming_xml: an xml string
                respond_obj: holds "request_verify_data" (file mode) or
                             "request_verify" (pair mode) from the datafile
                file: True to compare whole files, False to compare pairs
            :return:
                file mode: the status of the comparison with the last
                           listed file (False when none are listed)
                pair mode: True if all pairs match, False otherwise
        """
        if file:
            status = False
            # NOTE(review): each comparison overwrites the previous status,
            # so only the last file decides the result -- confirm intended.
            for expected in respond_obj["request_verify_data"]:
                expected_path = getAbsPath(expected, getDirName(self.datafile))
                status, _, _, _ = compare_xml(
                    incoming_xml, expected_path,
                    output_file=False, sorted_json=False, remove_namespaces=True)
            return status

        root = ET.fromstring(incoming_xml, parser=ET.XMLParser(encoding="utf-8"))
        for pair in respond_obj["request_verify"]:
            fields = pair.split(",")
            # presumably strips "tag=" and "value=" prefixes -- verify
            # against the datafile format
            xpath = fields[0][4:]
            expected_text = fields[1][6:]
            found = getChildElementWithSpecificXpath(root, xpath)
            if found is None or expected_text != found.text:
                return False

        return True

Source File: base.py From syntribos with Apache License 2.0

5 votes

def _xml_to_obj(cls, serialized_str, encoding="iso-8859-2"):
        """Parse ``serialized_str`` with the given encoding, pass the root
        through ``cls._remove_xml_namespaces`` and return what
        ``cls._xml_ele_to_obj`` makes of the result.
        """
        root = ET.fromstring(serialized_str, parser=ET.XMLParser(encoding=encoding))
        stripped = cls._remove_xml_namespaces(root)
        return cls._xml_ele_to_obj(stripped)

Source File: test_xml_etree.py From BinderFilter with MIT License

5 votes

def entity():
    """
    Test entity handling.

    The function body is only this docstring; the examples below are the
    test. ``serialize`` and ``ENTITY_XML`` are not defined in this block.

    1) good entities

    A hexadecimal character reference in an attribute is written back by
    serialize() as the equivalent decimal reference (0x8230 == 33328).

    >>> e = ET.XML("<document title='&#x8230;'>test</document>")
    >>> serialize(e)
    '<document title="&#33328;">test</document>'

    2) bad entities

    An entity without a definition makes parsing fail with ParseError.

    >>> ET.XML("<document>&entity;</document>")
    Traceback (most recent call last):
    ParseError: undefined entity: line 1, column 10

    >>> ET.XML(ENTITY_XML)
    Traceback (most recent call last):
    ParseError: undefined entity &entity;: line 5, column 10

    3) custom entity

    A name registered in ``parser.entity`` is replaced by its value, so the
    document that failed above now parses.

    >>> parser = ET.XMLParser()
    >>> parser.entity["entity"] = "text"
    >>> parser.feed(ENTITY_XML)
    >>> root = parser.close()
    >>> serialize(root)
    '<document>text</document>'
    """

Source File: document_processor.py From mma-dexter with Apache License 2.0

5 votes

def fetch_daily_feeds(self, day):
        """ Fetch the feed for +day+ and return the parsed root element.

        The feed user and password are sent as PHP_AUTH_USER / PHP_AUTH_PW
        request headers. Raises ValueError when FEED_PASSWORD is unset; HTTP
        error statuses are raised by ``raise_for_status()``.
        """
        from xml.etree import ElementTree
        from htmlentitydefs import name2codepoint

        if self.FEED_PASSWORD is None:
            raise ValueError("%s.FEED_PASSWORD must be set." % self.__class__.__name__)

        auth_headers = {'PHP_AUTH_USER': self.FEED_USER, 'PHP_AUTH_PW': self.FEED_PASSWORD}
        feed_url = self.FEED_URL % day.strftime('%d-%m-%Y')

        # NOTE(review): verify=False turns off TLS certificate checks while
        # credentials are being sent -- confirm this is intentional.
        response = requests.get(feed_url,
                                headers=auth_headers,
                                verify=False,
                                timeout=60)
        response.raise_for_status()

        parser = ElementTree.XMLParser()
        # Presumably needed so undefined entities are looked up in
        # parser.entity instead of failing the parse -- TODO confirm.
        parser.parser.UseForeignDTD(True)
        # Register every named HTML entity with its unicode character.
        for name, codepoint in name2codepoint.iteritems():
            parser.entity[name] = unichr(codepoint)

        return ElementTree.fromstring(response.text.encode('utf-8'), parser=parser)

Source File: xmlparser.py From edl with MIT License

5 votes

def getlog(self,input):
        """Return the ``value`` attributes of all ``log`` elements in ``input``.

        ``input`` is a byte string holding one or more concatenated XML
        documents, each starting with ``<?xml``. Every document is parsed on
        its own, and the direct ``log`` children of its root that carry a
        ``value`` attribute contribute that value, in document order.
        """
        values = []
        for chunk in input.split(b"<?xml"):
            if chunk == b'':
                continue
            # split() consumed the declaration prefix; put it back.
            document = b"<?xml" + chunk
            root = ET.fromstring(document, parser=ET.XMLParser(encoding="utf-8"))
            values.extend(
                node.attrib['value']
                for node in root.findall('log')
                if 'value' in node.attrib)
        return values

Source File: xmlparser.py From edl with MIT License

5 votes

def getresponse(self,input):
        """Merge the attributes of all ``response`` elements in ``input``.

        ``input`` is a byte string holding one or more concatenated XML
        documents, each starting with ``<?xml``. Every document is parsed on
        its own, and the attributes of the direct ``response`` children of
        its root are merged into one dict; later values replace earlier
        ones with the same name.
        """
        merged = {}
        for chunk in input.split(b"<?xml"):
            if chunk == b'':
                continue
            # split() consumed the declaration prefix; put it back.
            document = b"<?xml" + chunk
            root = ET.fromstring(document, parser=ET.XMLParser(encoding="utf-8"))
            for node in root.findall('response'):
                merged.update(node.attrib)
        return merged

Source File: confluence.py From confluence-publisher with MIT License

5 votes

def _init_parser(self):
        """Create the wrapped XMLParser from the stored constructor arguments
        and add ``self.known_entity`` to its entity table.
        """
        parser = etree.XMLParser(*self.args, **self.kwargs)
        self._original_parser = parser
        parser.entity.update(self.known_entity)

Source File: advancedsettings.py From script.artwork.beef with MIT License

5 votes

def read_xml():
    """Return the root element of the settings file.

    When FILENAME does not exist a new, empty ROOT_TAG element is returned.
    When the file cannot be parsed a warning is logged and None is returned.
    """
    if not xbmcvfs.exists(FILENAME):
        return ET.Element(ROOT_TAG)

    parser = ET.XMLParser(target=CommentedTreeBuilder())
    with closing(xbmcvfs.File(FILENAME)) as as_xml:
        try:
            tree = ET.parse(as_xml, parser)
        except ET.ParseError:
            log("Can't parse advancedsettings.xml", xbmc.LOGWARNING)
            return None
        return tree.getroot()

Source File: xml_util.py From pyxcli with Apache License 2.0

5 votes

def __init__(self):
        """Create the tree builder and the XML parser that targets it."""
        builder = _TerminationDetectingTreeBuilder()
        self.tree_builder = builder
        self.xml_tree_builder = et.XMLParser(target=builder)

Source File: util.py From sphinxcontrib-needs with MIT License

5 votes

def extract_needs_from_html(html):
    """Return an HtmlNeed for every table whose class contains 'need'.

    The HTML text is parsed as XML after removing the ``&copy;`` and
    ``&amp;`` entities and registering ``nbsp`` with the parser.
    """
    # Drop these entities from the text before parsing.
    # NOTE(review): &amp; is removed rather than decoded, so ampersands
    # vanish from the parsed text -- confirm this is intended.
    for entity in ('&copy;', '&amp;'):
        html = html.replace(entity, '')

    # Python 2.x gets the text encoded to UTF-8; Python 3 takes it as is.
    if sys.version_info >= (3, 0):
        source = StringIO(html)
    else:
        source = StringIO(html.encode("utf-8"))
    parser = ElementTree.XMLParser(encoding="utf-8")

    # nbsp comes from HTML and is unknown to XML, so define it here.
    parser.entity["nbsp"] = ' '

    document = ElementTree.ElementTree().parse(source, parser=parser)
    tables = document.findall(".//html:table", NS)

    # Sphinx <3.0 start html-code with:
    #    <html xmlns="http://www.w3.org/1999/xhtml">
    # Sphinx >= 3.0 starts it with:
    #    <html>
    # so when the namespaced search finds nothing, retry without namespace.
    if not tables:
        tables = document.findall(".//html:table", {'html': ''})

    return [HtmlNeed(table) for table in tables if 'need' in table.get('class', '')]

Source File: util.py From aamo with MIT License

5 votes

def load_xml(file_name):  # Load an XML file
    """Parse the XML file ``file_name`` located under ``base_dir()``.

    Raises ``e.FileNotFound`` when the IOError has errno 2, and
    ``e.LoadFileException`` for any other IOError.
    """
    try:
        return ET.parse(base_dir() + file_name,
                        parser=ET.XMLParser(encoding="utf-8"))
    except IOError as ex:
        # errno 2 is "No such file or directory"
        if ex.errno != 2:
            raise e.LoadFileException(str(ex)+'\nUnable to load XML ' + base_dir() + file_name)
        raise e.FileNotFound

Source File: data_process_i2b2.py From MedicalRelationExtraction with MIT License

5 votes

def data_process(inDIR, outFile):
    """Write one tab-separated line per labelled TLINK found under ``inDIR``.

    Every file name returned by ``file_name(inDIR)`` is read, its ampersands
    are neutralised so the text parses as XML, and each ``TLINK`` child of
    ``TAGS`` with a non-empty ``type`` produces a line of the form
    ``id<TAB>fromText toText<TAB>document text<TAB>LABEL``.

    Args:
        inDIR: Directory prefix; file names are appended to it directly, so
            it is expected to end with a path separator.
        outFile: Path of the output file, which is overwritten.

    Both files are now opened in ``with`` blocks: the output file used to be
    left open (and possibly unflushed) when the function returned.
    """
    fileList = file_name(inDIR)
    print(len(fileList))
    lableType = set()
    with open(outFile, "w") as out:
        for f in fileList:
            print(f, end=' ')
            linkNO = 0
            with open(inDIR + f, "r") as inFile:
                # " & " is dropped and any other "&" spelled out.
                xmlString = "".join(
                    line.replace(" & ", " ").replace("&", " and ")
                    for line in inFile)

            parser = ET.XMLParser(encoding="utf-8")
            root = ET.fromstring(xmlString, parser=parser)
            text = root.find("TEXT").text.replace("\n", " ").strip()
            tags = root.find("TAGS")
            for tlink in tags.findall("TLINK"):
                # f[:-4] presumably drops a 4-character extension such as
                # ".xml" -- verify against the input file names.
                link_id = f[:-4] + "_" + str(tlink.attrib['id'])
                target = tlink.attrib['fromText'] + " " + tlink.attrib['toText']
                label = tlink.attrib['type'].upper()
                if label == '':
                    continue
                lableType.add(label)
                out.write("\t".join([link_id, target, text, label]) + "\n")
                linkNO += 1
            print("linkNO = " + str(linkNO))
    print("*"*80)

Source File: DoxygenDB.py From CodeAtlasSublime with Eclipse Public License 1.0

5 votes

def _getXmlDocumentItem(self, fileName):
		"""Return the XmlDocItem for ``<dbFolder>/<fileName>.xml``, using the cache.

		A truthy entry in ``self.xmlCache`` is returned as is. Otherwise the
		file is parsed, trying 'utf-8' and then 'iso-8859-5'; the first
		successful parse is wrapped in an XmlDocItem and cached. When every
		attempt fails, ``XmlDocItem(None)`` is returned and nothing is cached.
		"""
		filePath = '%s/%s.xml' % (self._dbFolder, fileName)
		xmlDoc = self.xmlCache.get(filePath)
		if xmlDoc:
			return xmlDoc
		doc = None

		# try different encoding and configurations
		for docEncoding in ('utf-8', 'iso-8859-5'):
			try:
				doc = ET.parse(filePath, parser=ET.XMLParser(encoding=docEncoding))
			except Exception:
				# Narrower than a bare except, so KeyboardInterrupt and
				# SystemExit are no longer swallowed here.
				print('parse %s failed. encoding = %s'% (fileName, docEncoding))
				continue
			if doc is not None:
				print('parse %s success. encoding = %s' % (fileName, docEncoding))
				break
		if doc is None:
			print('parse %s failed'% (fileName, ))
			return XmlDocItem(None)

		xmlDoc = XmlDocItem(doc)
		self.xmlCache[filePath] = xmlDoc
		return xmlDoc

Python xml.etree.ElementTree.XMLParser() Examples