Python xml.etree.ElementTree.XMLParser() Examples

The following are 30 code examples of xml.etree.ElementTree.XMLParser(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module xml.etree.ElementTree, or try the search function.
Example #1
Source File: read.py    From typhon with MIT License 6 votes vote down vote up
def parse(source, binaryfp=None):
    """Parse an ArtsXML file and return its element tree.

    A per-call copy of ``ARTSElement`` (same bases, copied class namespace)
    is used as the element factory, so ``binaryfp`` can be attached as a
    class attribute without modifying ``ARTSElement`` itself.

    Args:
        source (str): Filename or file pointer.
        binaryfp: Value stored as the ``binaryfp`` class attribute of the
            per-call element class. Defaults to None.

    Returns:
        xml.etree.ElementTree: XML Tree of the ARTS data file.

    """
    element_cls = type('ARTSElementBinaryFP',
                       ARTSElement.__bases__,
                       dict(ARTSElement.__dict__))
    element_cls.binaryfp = binaryfp

    # Every element of the resulting tree is built through element_cls.
    builder = ElementTree.TreeBuilder(element_factory=element_cls)
    xml_parser = ElementTree.XMLParser(target=builder)
    return ElementTree.parse(source, parser=xml_parser)
Example #2
Source File: main.py    From meshio with MIT License 6 votes vote down vote up
def read(self):
        """Parse ``self.filename`` as XDMF and dispatch on its major version.

        Returns:
            The result of ``self.read_xdmf2(root)`` for version 2.x files or
            ``self.read_xdmf3(root)`` for version 3.x files.

        Raises:
            ReadError: If the root tag is not ``Xdmf``, the ``Version``
                attribute is missing, or the major version is neither 2
                nor 3.
        """
        parser = ET.XMLParser()
        tree = ET.parse(self.filename, parser)
        root = tree.getroot()

        if root.tag != "Xdmf":
            raise ReadError()

        version = root.get("Version")
        if version is None:
            # Without this guard the split below would fail with an
            # AttributeError instead of the reader's own error type.
            raise ReadError("Missing XDMF version attribute.")

        major = version.split(".")[0]

        if major == "2":
            return self.read_xdmf2(root)

        if major != "3":
            raise ReadError("Unknown XDMF version {}.".format(version))

        return self.read_xdmf3(root)
Example #3
Source File: test_xml_etree.py    From gcblue with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def bug_200708_close():
    """
    Doctest checking what XMLParser.close() returns.

    With the default builder the parsed root is returned; with a custom
    target the parser hands back whatever the target's own close()
    produces. ``summarize`` is a helper that is not defined in this
    function.

    Test default builder.
    >>> parser = ET.XMLParser() # default
    >>> parser.feed("<element>some text</element>")
    >>> summarize(parser.close())
    'element'

    Test custom builder.
    >>> class EchoTarget:
    ...     def close(self):
    ...         return ET.Element("element") # simulate root
    >>> parser = ET.XMLParser(EchoTarget())
    >>> parser.feed("<element>some text</element>")
    >>> summarize(parser.close())
    'element'

    """
Example #4
Source File: parser.py    From fbchat-archive-parser with MIT License 6 votes vote down vote up
def parse_impl(self):
        """
        Walk the HTML content as a stream of start/end events instead of
        loading the whole document into memory (as BeautifulSoup does),
        which keeps memory use far lower.

        The first ``h1`` text seen is stored in ``self.user`` (only when it
        is not already set); every ``div`` whose class contains "thread"
        is parsed into a thread and saved.
        """

        # str() keeps the encoding name a non-unicode string under
        # Python 2, which the parser requires.
        xml_parser = XMLParser(encoding=str('UTF-8'))
        events = ET.iterparse(self.handle, events=("start", "end"), parser=xml_parser)
        for event, node in events:
            tag, class_attr = _tag_and_class_attr(node)
            if tag == "h1" and event == "end":
                if not self.user:
                    self.user = node.text.strip()
                continue
            if tag == "div" and "thread" in class_attr and event == "start":
                # parse_thread receives the live event iterator as well.
                members = self.parse_participants(node)
                self.save_thread(self.parse_thread(members, events, True))
Example #5
Source File: document_processor.py    From mma-dexter with Apache License 2.0 6 votes vote down vote up
def fetch_daily_feeds(self, day):
        """ Download the feed for +day+ and return the root element parsed from it. """
        from xml.etree import ElementTree
        from htmlentitydefs import name2codepoint

        if self.FEED_PASSWORD is None:
            raise ValueError("%s.FEED_PASSWORD must be set." % self.__class__.__name__)

        # NOTE(review): verify=False turns off TLS certificate verification.
        feed_url = self.FEED_URL % day.strftime('%d-%m-%Y')
        response = requests.get(feed_url,
                                auth=(self.FEED_USER, self.FEED_PASSWORD),
                                verify=False,
                                timeout=60)
        response.raise_for_status()

        # Switch the underlying expat parser to foreign-DTD mode and register
        # every HTML named entity in the parser's entity table; presumably
        # the feed uses HTML entities without declaring them.
        xml_parser = ElementTree.XMLParser()
        xml_parser.parser.UseForeignDTD(True)
        for name, codepoint in name2codepoint.iteritems():
            xml_parser.entity[name] = unichr(codepoint)

        return ElementTree.fromstring(response.text.encode('utf-8'), parser=xml_parser)
Example #6
Source File: document_processor.py    From mma-dexter with Apache License 2.0 6 votes vote down vote up
def fetch_filtered_daily_feeds(self, day, filter_parm):
        """ Download the feed for +day+ filtered by +filter_parm+ and return the root element parsed from it. """
        from xml.etree import ElementTree
        from htmlentitydefs import name2codepoint

        if self.FEED_PASSWORD is None:
            raise ValueError("%s.FEED_PASSWORD must be set." % self.__class__.__name__)

        # NOTE(review): verify=False turns off TLS certificate verification.
        feed_url = self.FEED_FILTER_URL % (day.strftime('%d-%m-%Y'), filter_parm)
        response = requests.get(feed_url,
                                auth=(self.FEED_USER, self.FEED_PASSWORD),
                                verify=False,
                                timeout=60)
        response.raise_for_status()

        # Switch the underlying expat parser to foreign-DTD mode and register
        # every HTML named entity in the parser's entity table; presumably
        # the feed uses HTML entities without declaring them.
        xml_parser = ElementTree.XMLParser()
        xml_parser.parser.UseForeignDTD(True)
        for name, codepoint in name2codepoint.iteritems():
            xml_parser.entity[name] = unichr(codepoint)

        return ElementTree.fromstring(response.text.encode('utf-8'), parser=xml_parser)
Example #7
Source File: omexml.py    From aicsimageio with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def __init__(self, xml=None, rootnode=None):
        """Wrap an OME-XML document given as XML text or as a parsed root.

        ``rootnode`` wins when supplied; otherwise ``xml`` (or the module's
        ``default_xml`` when ``xml`` is also None) is parsed. A UUID
        attribute is generated on the root when it has none.
        """
        if rootnode is not None:
            self.dom = rootnode
        else:
            if xml is None:
                xml = default_xml
            # The parser encoding is chosen by platform: ISO-8859-1 on
            # Windows, UTF-8 everywhere else.
            enc = 'ISO-8859-1' if sys.platform.startswith('win') else 'UTF-8'
            self.dom = ElementTree.fromstring(xml, ElementTree.XMLParser(encoding=enc))

        # determine OME namespaces
        self.ns = get_namespaces(self.dom)
        # The format check only runs when this module is executed as a script.
        if __name__ == '__main__' and self.ns['ome'] is None:
            raise Exception("Error: String not in OME-XML format")

        # generate a uuid if there is none
        # < OME UUID = "urn:uuid:ef8af211-b6c1-44d4-97de-daca46f16346"
        root = self.dom
        if not root.get('UUID'):
            root.set('UUID', 'urn:uuid:'+str(uuid.uuid4()))
        self.uuidStr = root.get('UUID')
Example #8
Source File: test_xml_etree.py    From oss-ftp with MIT License 6 votes vote down vote up
def bug_200708_close():
    """
    Doctest checking what XMLParser.close() returns.

    With the default builder the parsed root is returned; with a custom
    target the parser hands back whatever the target's own close()
    produces. ``summarize`` is a helper that is not defined in this
    function.

    Test default builder.
    >>> parser = ET.XMLParser() # default
    >>> parser.feed("<element>some text</element>")
    >>> summarize(parser.close())
    'element'

    Test custom builder.
    >>> class EchoTarget:
    ...     def close(self):
    ...         return ET.Element("element") # simulate root
    >>> parser = ET.XMLParser(EchoTarget())
    >>> parser.feed("<element>some text</element>")
    >>> summarize(parser.close())
    'element'

    """
Example #9
Source File: test_xml_etree.py    From BinderFilter with MIT License 6 votes vote down vote up
def bug_200708_close():
    """
    Doctest checking what XMLParser.close() returns.

    With the default builder the parsed root is returned; with a custom
    target the parser hands back whatever the target's own close()
    produces. ``summarize`` is a helper that is not defined in this
    function.

    Test default builder.
    >>> parser = ET.XMLParser() # default
    >>> parser.feed("<element>some text</element>")
    >>> summarize(parser.close())
    'element'

    Test custom builder.
    >>> class EchoTarget:
    ...     def close(self):
    ...         return ET.Element("element") # simulate root
    >>> parser = ET.XMLParser(EchoTarget())
    >>> parser.feed("<element>some text</element>")
    >>> summarize(parser.close())
    'element'

    """
Example #10
Source File: test_xml_etree.py    From oss-ftp with MIT License 5 votes vote down vote up
def entity():
    """
    Test entity handling.

    Doctest in three parts: a numeric character reference is parsed and
    re-serialized in decimal form, undefined named entities raise
    ParseError, and a name registered in ``parser.entity`` is expanded.
    ``ENTITY_XML`` and ``serialize`` are not defined in this function.

    1) good entities

    >>> e = ET.XML("<document title='&#x8230;'>test</document>")
    >>> serialize(e)
    '<document title="&#33328;">test</document>'

    2) bad entities

    >>> ET.XML("<document>&entity;</document>")
    Traceback (most recent call last):
    ParseError: undefined entity: line 1, column 10

    >>> ET.XML(ENTITY_XML)
    Traceback (most recent call last):
    ParseError: undefined entity &entity;: line 5, column 10

    3) custom entity

    >>> parser = ET.XMLParser()
    >>> parser.entity["entity"] = "text"
    >>> parser.feed(ENTITY_XML)
    >>> root = parser.close()
    >>> serialize(root)
    '<document>text</document>'
    """
Example #11
Source File: __init__.py    From zim-desktop-wiki with GNU General Public License v2.0 5 votes vote down vote up
def new_parsetree_from_xml(xml):
	"""Parse the XML text ``xml`` and wrap its root element in a ParseTree."""
	from zim.formats import ParseTree
	# The pure ElementTree parser is used on purpose: the original author
	# noted that cElementTree.XMLBuilder does not work here.
	from xml.etree.ElementTree import XMLParser

	xml_parser = XMLParser()
	xml_parser.feed(xml)
	return ParseTree(xml_parser.close())
Example #12
Source File: test_xml_etree.py    From gcblue with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def entity():
    """
    Test entity handling.

    Doctest in three parts: a numeric character reference is parsed and
    re-serialized in decimal form, undefined named entities raise
    ParseError, and a name registered in ``parser.entity`` is expanded.
    ``ENTITY_XML`` and ``serialize`` are not defined in this function.

    1) good entities

    >>> e = ET.XML("<document title='&#x8230;'>test</document>")
    >>> serialize(e)
    '<document title="&#33328;">test</document>'

    2) bad entities

    >>> ET.XML("<document>&entity;</document>")
    Traceback (most recent call last):
    ParseError: undefined entity: line 1, column 10

    >>> ET.XML(ENTITY_XML)
    Traceback (most recent call last):
    ParseError: undefined entity &entity;: line 5, column 10

    3) custom entity

    >>> parser = ET.XMLParser()
    >>> parser.entity["entity"] = "text"
    >>> parser.feed(ENTITY_XML)
    >>> root = parser.close()
    >>> serialize(root)
    '<document>text</document>'
    """
Example #13
Source File: __init__.py    From zim-desktop-wiki with GNU General Public License v2.0 5 votes vote down vote up
def fromstring(self, string):
		'''Replace this tree's root with the one parsed from the XML text
		in C{string} and return C{self}.
		'''
		xml_parser = ElementTreeModule.XMLParser()
		xml_parser.feed(string)
		self._etree._setroot(xml_parser.close())
		# Returning self allows the ParseTree().fromstring(..) idiom
		return self
Example #14
Source File: parser.py    From fbchat-archive-parser with MIT License 5 votes vote down vote up
def _get_manifest_data(self):
        """Scan the manifest in ``self.handle`` for the user and thread links.

        Anchors are ignored until a ``div`` whose class contains "content"
        has started; each later ``a`` element contributes one
        ``(participants, thread_file_path)`` pair.

        Returns:
            tuple: ``(user, thread_references)`` where ``user`` is the
            stripped ``h1`` text (or None) and ``thread_references`` is a
            list of ``(participants, path)`` tuples.

        Raises:
            UnsuitableParserError: If no usable anchor was found, which
                indicates a ``messages.htm`` file that is probably in the
                legacy format.
        """

        user, thread_references = None, []

        ignore_anchors = True
        saw_anchor = False

        # Cast to str to ensure not unicode under Python 2, as the parser
        # doesn't like that.
        parser = XMLParser(encoding=str('UTF-8'))
        element_iter = ET.iterparse(self.handle, events=("start", "end"), parser=parser)
        for pos, element in element_iter:
            tag, class_attr = _tag_and_class_attr(element)
            if tag == "h1" and pos == "end":
                # NOTE(review): the guard reads self.user while the result is
                # stored in the local `user` -- confirm which one is intended.
                if not self.user:
                    user = element.text.strip()
            elif tag == "div" and "content" in class_attr and pos == "start":
                ignore_anchors = False
            elif tag == "a" and pos == "start":
                if ignore_anchors:
                    continue
                saw_anchor = True
                participants = self.parse_participants(element)
                # The dots are escaped so that only a literal leading "../"
                # is stripped; unescaped, the pattern would remove any two
                # characters followed by a slash.
                thread_path = re.sub(r'^\.\./', '', element.attrib['href'])
                if using_windows():
                    thread_path = thread_path.replace('/', '\\')
                thread_references.append(
                    (participants, os.path.join(self.root, thread_path)))

        if not saw_anchor:
            # Indicator of a `messages.htm` file that is probably in the legacy format.
            raise UnsuitableParserError

        return user, thread_references
Example #15
Source File: parser.py    From fbchat-archive-parser with MIT License 5 votes vote down vote up
def process_thread(self, participants, thread_path):
        """Parse the thread file at ``thread_path`` (relative to
        ``self.root``) and save the resulting thread.

        Raises MissingReferenceError, carrying the full path, when a
        FileNotFoundError occurs while opening or parsing the file.
        """

        file_path = os.path.join(self.root, thread_path)

        try:
            with io.open(file_path, 'rt', encoding='utf8') as thread_file:
                xml_parser = XMLParser(encoding=str('UTF-8'))
                events = ET.iterparse(
                    SafeXMLStream(thread_file),
                    events=("start", "end"),
                    parser=xml_parser)
                thread = self.parse_thread(participants, events, False)
        except FileNotFoundError:
            raise MissingReferenceError(file_path)
        else:
            self.save_thread(thread)
Example #16
Source File: en_fr_to_raw.py    From AmusingPythonCodes with MIT License 5 votes vote down vote up
def read_and_write_xml_data(filenames, output_path):
    """Collect the stripped text of every ``seg`` element and write them out.

    For each file in ``filenames`` the ``srcset`` child of the root is
    located, and the ``seg`` children of each of its child elements are
    read in document order. The texts are written to ``output_path`` as
    UTF-8, one per line, without a trailing newline.
    """
    segments = []
    for path in filenames:
        # A fresh parser is created for every file.
        xml_parser = etree.XMLParser(encoding='utf-8')
        srcset = etree.parse(path, parser=xml_parser).getroot().find('srcset')
        segments.extend(
            seg.text.strip()
            for doc in srcset
            for seg in doc.findall('seg')
        )
    with codecs.open(output_path, 'w', encoding='utf-8') as out:
        out.write('\n'.join(segments))
Example #17
Source File: xform2json.py    From pyxform with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def __init__(self, root):
        """Build the dictionary form of an XML document given as a string.

        Raises TypeError when ``root`` is neither a string nor an
        ``ETree.Element``.
        """
        if not isinstance(root, basestring):
            if not isinstance(root, ETree.Element):
                raise TypeError("Expected ElementTree.Element or file path string")
            # NOTE(review): an Element argument is accepted, but nothing is
            # stored on self for it in this method.
            return

        xml_parser = ETree.XMLParser(encoding="UTF-8")
        self._root = _try_parse(root, xml_parser)
        # The resulting mapping has a single key: the root element's tag.
        converted = _convert_xml_to_dict_recurse(self._root, XmlDictObject)
        self._dict = XmlDictObject({self._root.tag: converted})
Example #18
Source File: rest_server_class.py    From warriorframework with Apache License 2.0 5 votes vote down vote up
def verify_xml(self, incoming_xml, respond_obj, file=False):
        """
            Check incoming_xml against the expectations in respond_obj,
            either
            a. against whole xml files (file=True), or
            b. against tag/text pairs (file=False)
            :param:
                incoming_xml: an xml string
                respond_obj: contains the verification detail from datafile
                file: indicate if comparing whole file or just pairs
            :return:
                file=True: the status reported by compare_xml for the last
                    listed file (False when no file is listed)
                file=False: True if all pairs match, False at the first
                    pair that does not
        """
        if file:
            status = False
            for expected_path in respond_obj["request_verify_data"]:
                expected_path = getAbsPath(expected_path, getDirName(self.datafile))
                comparison = compare_xml(incoming_xml, expected_path,
                    output_file=False, sorted_json=False, remove_namespaces=True)
                status, _, _, _ = comparison
            return status

        parsed = ET.fromstring(incoming_xml, parser=ET.XMLParser(encoding="utf-8"))
        for element_pair in respond_obj["request_verify"]:
            fields = element_pair.split(",")
            # The first 4 / 6 characters of each field are dropped --
            # presumably key prefixes; TODO confirm against the datafile format.
            xpath = fields[0][4:]
            value = fields[1][6:]
            found = getChildElementWithSpecificXpath(parsed, xpath)
            if found is None or value != found.text:
                return False

        return True
Example #19
Source File: base.py    From syntribos with Apache License 2.0 5 votes vote down vote up
def _xml_to_obj(cls, serialized_str, encoding="iso-8859-2"):
        """Parse ``serialized_str`` as XML and convert it into an object.

        The parsed root is first passed through
        ``cls._remove_xml_namespaces`` and the result is handed to
        ``cls._xml_ele_to_obj``, whose return value is returned.
        """
        xml_parser = ET.XMLParser(encoding=encoding)
        root = ET.fromstring(serialized_str, parser=xml_parser)
        stripped = cls._remove_xml_namespaces(root)
        return cls._xml_ele_to_obj(stripped)
Example #20
Source File: test_xml_etree.py    From BinderFilter with MIT License 5 votes vote down vote up
def entity():
    """
    Test entity handling.

    Doctest in three parts: a numeric character reference is parsed and
    re-serialized in decimal form, undefined named entities raise
    ParseError, and a name registered in ``parser.entity`` is expanded.
    ``ENTITY_XML`` and ``serialize`` are not defined in this function.

    1) good entities

    >>> e = ET.XML("<document title='&#x8230;'>test</document>")
    >>> serialize(e)
    '<document title="&#33328;">test</document>'

    2) bad entities

    >>> ET.XML("<document>&entity;</document>")
    Traceback (most recent call last):
    ParseError: undefined entity: line 1, column 10

    >>> ET.XML(ENTITY_XML)
    Traceback (most recent call last):
    ParseError: undefined entity &entity;: line 5, column 10

    3) custom entity

    >>> parser = ET.XMLParser()
    >>> parser.entity["entity"] = "text"
    >>> parser.feed(ENTITY_XML)
    >>> root = parser.close()
    >>> serialize(root)
    '<document>text</document>'
    """
Example #21
Source File: document_processor.py    From mma-dexter with Apache License 2.0 5 votes vote down vote up
def fetch_daily_feeds(self, day):
        """ Download the feed for +day+ and return the root element parsed from it. """
        from xml.etree import ElementTree
        from htmlentitydefs import name2codepoint

        if self.FEED_PASSWORD is None:
            raise ValueError("%s.FEED_PASSWORD must be set." % self.__class__.__name__)

        # The credentials are sent as PHP_AUTH_USER / PHP_AUTH_PW request
        # headers rather than through the `auth=` argument.
        auth_headers = {'PHP_AUTH_USER': self.FEED_USER, 'PHP_AUTH_PW': self.FEED_PASSWORD}

        # NOTE(review): verify=False turns off TLS certificate verification.
        feed_url = self.FEED_URL % day.strftime('%d-%m-%Y')
        response = requests.get(feed_url,
                                headers=auth_headers,
                                verify=False,
                                timeout=60)
        response.raise_for_status()

        # Switch the underlying expat parser to foreign-DTD mode and register
        # every HTML named entity in the parser's entity table; presumably
        # the feed uses HTML entities without declaring them.
        xml_parser = ElementTree.XMLParser()
        xml_parser.parser.UseForeignDTD(True)
        for name, codepoint in name2codepoint.iteritems():
            xml_parser.entity[name] = unichr(codepoint)

        return ElementTree.fromstring(response.text.encode('utf-8'), parser=xml_parser)
Example #22
Source File: xmlparser.py    From edl with MIT License 5 votes vote down vote up
def getlog(self,input):
        """Return the ``value`` attributes of all ``log`` elements in ``input``.

        ``input`` is a bytes buffer that may hold several XML documents
        back to back; it is split on the ``<?xml`` marker and every piece
        is parsed on its own. Only ``log`` elements that are direct
        children of a document root and carry a ``value`` attribute
        contribute, in document order.
        """
        values = []
        for chunk in input.split(b"<?xml"):
            if chunk == b'':
                continue
            # Put back the marker that split() consumed.
            document = b"<?xml" + chunk
            xml_parser = ET.XMLParser(encoding="utf-8")
            root = ET.fromstring(document, parser=xml_parser)
            values.extend(
                node.attrib['value']
                for node in root.findall('log')
                if 'value' in node.attrib
            )
        return values
Example #23
Source File: xmlparser.py    From edl with MIT License 5 votes vote down vote up
def getresponse(self,input):
        """Merge the attributes of all ``response`` elements in ``input``.

        ``input`` is a bytes buffer that may hold several XML documents
        back to back; it is split on the ``<?xml`` marker and every piece
        is parsed on its own. Attributes of ``response`` elements that are
        direct children of a document root are merged into one dict; when
        a name occurs more than once the last value wins.
        """
        merged = {}
        for chunk in input.split(b"<?xml"):
            if chunk == b'':
                continue
            # Put back the marker that split() consumed.
            document = b"<?xml" + chunk
            xml_parser = ET.XMLParser(encoding="utf-8")
            root = ET.fromstring(document, parser=xml_parser)
            for node in root.findall('response'):
                merged.update(node.attrib)
        return merged
Example #24
Source File: confluence.py    From confluence-publisher with MIT License 5 votes vote down vote up
def _init_parser(self):
        """Create the wrapped XMLParser from the stored constructor
        arguments and register the known entities on it.
        """
        wrapped = self._original_parser = etree.XMLParser(*self.args, **self.kwargs)
        wrapped.entity.update(self.known_entity)
Example #25
Source File: advancedsettings.py    From script.artwork.beef with MIT License 5 votes vote down vote up
def read_xml():
    """Return the root element of the settings file at FILENAME.

    When the file does not exist a new, empty ROOT_TAG element is
    returned. When the file cannot be parsed a warning is logged and None
    is returned.
    """
    if not xbmcvfs.exists(FILENAME):
        return ET.Element(ROOT_TAG)

    tree_parser = ET.XMLParser(target=CommentedTreeBuilder())
    with closing(xbmcvfs.File(FILENAME)) as settings_file:
        try:
            tree = ET.parse(settings_file, tree_parser)
        except ET.ParseError:
            log("Can't parse advancedsettings.xml", xbmc.LOGWARNING)
            return None
        return tree.getroot()
Example #26
Source File: xml_util.py    From pyxcli with Apache License 2.0 5 votes vote down vote up
def __init__(self):
        """Create the tree builder and the XML parser that targets it."""
        builder = _TerminationDetectingTreeBuilder()
        self.tree_builder = builder
        # The parser uses the builder as its target.
        self.xml_tree_builder = et.XMLParser(target=builder)
Example #27
Source File: util.py    From sphinxcontrib-needs with MIT License 5 votes vote down vote up
def extract_needs_from_html(html):
    """Return an HtmlNeed for every table in ``html`` whose class contains 'need'.

    The markup is parsed as XML, so the ``&copy;`` and ``&amp;`` entities
    are removed from the text beforehand and ``nbsp`` is registered on
    the parser.
    """
    # Drop the entities before parsing
    for entity in ('&copy;', '&amp;'):
        html = html.replace(entity, '')

    # Python 3 feeds text to the parser, Python 2 feeds UTF-8 bytes.
    is_py3 = sys.version_info >= (3, 0)
    source = StringIO(html if is_py3 else html.encode("utf-8"))
    parser = ElementTree.XMLParser(encoding="utf-8")

    # nbsp comes from HTML and is unknown to XML, so it is defined here.
    parser.entity["nbsp"] = ' '

    document = ElementTree.ElementTree().parse(source, parser=parser)
    tables = document.findall(".//html:table", NS)

    # Sphinx <3.0 start html-code with:
    #    <html xmlns="http://www.w3.org/1999/xhtml">
    # Sphinx >= 3.0 starts it with:
    #    <html>
    # so the namespaced search finds nothing there and is retried with an
    # empty namespace.
    if not tables:
        tables = document.findall(".//html:table", {'html': ''})

    return [HtmlNeed(node) for node in tables if 'need' in node.get('class', '')]
Example #28
Source File: util.py    From aamo with MIT License 5 votes vote down vote up
def load_xml(file_name):  # Load an XML file
    """Parse the XML file ``file_name`` located under ``base_dir()``.

    An IOError with errno 2 is re-raised as ``e.FileNotFound``; any other
    IOError becomes ``e.LoadFileException`` with the original message and
    the full path appended.
    """
    try:
        xml_parser = ET.XMLParser(encoding="utf-8")
        return ET.parse(base_dir() + file_name, parser=xml_parser)
    except IOError as ex:
        if ex.errno != 2:
            raise e.LoadFileException(str(ex)+'\nUnable to load XML ' + base_dir() + file_name)
        raise e.FileNotFound
Example #29
Source File: data_process_i2b2.py    From MedicalRelationExtraction with MIT License 5 votes vote down vote up
def data_process(inDIR, outFile):
    """Extract the TLINK relations of every file in ``inDIR`` into a TSV file.

    For each file name returned by ``file_name(inDIR)`` the XML is read,
    ampersands are neutralised, and one line
    ``id<TAB>fromText toText<TAB>text<TAB>LABEL`` is written per TLINK
    element whose ``type`` attribute is non-empty. Progress is printed to
    stdout.

    Args:
        inDIR: Directory prefix; it is concatenated directly with each file
            name, so it presumably has to end with a path separator.
        outFile: Path of the output file, which is overwritten.
    """
    fileList = file_name(inDIR)
    print(len(fileList))
    # Both files are handled with context managers so that they are flushed
    # and closed even when parsing fails part-way through.
    with open(outFile, "w") as out:
        for f in fileList:
            print(f, end=' ')
            linkNO = 0
            with open(inDIR + f, "r") as inFile:
                # Bare ampersands are replaced before the text is handed to
                # the XML parser.
                xmlString = inFile.read().replace(" & ", " ").replace("&", " and ")

            parser = ET.XMLParser(encoding="utf-8")
            root = ET.fromstring(xmlString, parser=parser)
            text = root.find("TEXT").text.replace("\n", " ").strip()
            tags = root.find("TAGS")
            for tlink in tags.findall("TLINK"):
                # f[:-4] drops the last four characters of the file name
                # (presumably the ".xml" extension).
                link_id = f[:-4] + "_" + str(tlink.attrib['id'])
                target = tlink.attrib['fromText'] + " " + tlink.attrib['toText']
                label = tlink.attrib['type'].upper()
                if label == '':
                    continue
                out.write(link_id + "\t" + target + "\t" + text + "\t" + label + "\n")
                linkNO += 1
            print("linkNO = " + str(linkNO))
    print("*"*80)
Example #30
Source File: DoxygenDB.py    From CodeAtlasSublime with Eclipse Public License 1.0 5 votes vote down vote up
def _getXmlDocumentItem(self, fileName):
		"""Return the XmlDocItem for ``<dbFolder>/<fileName>.xml``.

		A truthy entry in ``self.xmlCache`` is returned as is. Otherwise
		the file is parsed, trying each candidate encoding in turn; the
		first successful parse is wrapped, cached and returned. When every
		attempt fails, ``XmlDocItem(None)`` is returned and nothing is
		cached.
		"""
		filePath = '%s/%s.xml' % (self._dbFolder, fileName)
		xmlDoc = self.xmlCache.get(filePath)
		if xmlDoc:
			return xmlDoc
		doc = None

		# try different encoding and configurations
		for docEncoding in ('utf-8', 'iso-8859-5'):
			try:
				doc = ET.parse(filePath, parser=ET.XMLParser(encoding=docEncoding))
			except Exception:
				# Exception rather than a bare except, so that
				# KeyboardInterrupt and SystemExit still propagate.
				print('parse %s failed. encoding = %s'% (fileName, docEncoding))
				continue
			print('parse %s success. encoding = %s' % (fileName, docEncoding))
			break
		if doc is None:
			print('parse %s failed'% (fileName, ))
			return XmlDocItem(None)

		xmlDoc = XmlDocItem(doc)
		self.xmlCache[filePath] = xmlDoc
		return xmlDoc