Python xml.etree.ElementTree.iterparse() Examples

The following are 30 code examples of xml.etree.ElementTree.iterparse(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module xml.etree.ElementTree, or try the search function.
Example #1
Source File: xlsx.py    From lambda-text-extractor with Apache License 2.0 6 votes vote down vote up
def process_stream_iterparse(self, stream, heading=None):
        """Append the text of every ``si`` element in *stream* to the shared-string table.

        The stream is parsed incrementally; each element whose tag equals
        ``U_SSML12 + 'si'`` is converted with ``get_text_from_si_or_is`` and the
        result is appended to ``self.bk._sharedstrings``.  Diagnostic output is
        written to ``self.logfile`` depending on ``self.verbosity``.

        :param stream: source handed to ``ET.iterparse``
        :param heading: optional title logged first when verbosity >= 2
        """
        if self.verbosity >= 2 and heading is not None:
            fprintf(self.logfile, "\n=== %s ===\n", heading)
        wanted_tag = U_SSML12 + 'si'
        shared_strings = self.bk._sharedstrings
        seen = 0  # number of matching elements handled so far
        for _event, node in ET.iterparse(stream):
            if node.tag == wanted_tag:
                if self.verbosity >= 3:
                    fprintf(self.logfile, "element #%d\n", seen)
                    self.dump_elem(node)
                shared_strings.append(get_text_from_si_or_is(self, node))
                # The text has been extracted, so the children can be dropped.
                node.clear()
                seen += 1
        if self.verbosity >= 2:
            self.dumpout('Entries in SST: %d', len(shared_strings))
        if self.verbosity >= 3:
            for index, text in enumerate(shared_strings):
                fprintf(self.logfile, "SST x=%d s=%r\n", index, text)
Example #2
Source File: OMIA.py    From dipper with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def process_associations(self, limit):
        """
        Stream the gzipped XML data file and offer every parsed element to
        each association-table row processor in turn (article-breed,
        article-phene, breed-phene, LIDA links, phene-gene, group-MPO).

        :param limit: passed through unchanged to ``process_xml_table``
        :return: None
        """

        myfile = '/'.join((self.rawdir, self.files['data']['file']))
        with gzip.open(myfile, 'rb') as readbin:
            filereader = io.TextIOWrapper(readbin, newline="")
            filereader.readline()  # remove the xml declaration line
            # (table name, row handler) pairs, tried in this order per element
            table_handlers = (
                ('Article_Breed', self._process_article_breed_row),
                ('Article_Phene', self._process_article_phene_row),
                ('Breed_Phene', self._process_breed_phene_row),
                ('Lida_Links', self._process_lida_links_row),
                ('Phene_Gene', self._process_phene_gene_row),
                ('Group_MPO', self._process_group_mpo_row),
            )
            for _event, elem in ET.iterparse(filereader):
                for table_name, row_handler in table_handlers:
                    self.process_xml_table(elem, table_name, row_handler, limit)

    # ############ INDIVIDUAL TABLE-LEVEL PROCESSING FUNCTIONS ################ 
Example #3
Source File: __init__.py    From streamlink with BSD 2-Clause "Simplified" License 6 votes vote down vote up
def _parse_xml(data, strip_ns=False):
    """Parse *data* as XML and return the root element.

    Text input is converted to UTF-8 bytes before parsing.  When *strip_ns*
    is true, the ``{namespace}`` prefix is removed from every element tag.

    Raises ValueError, carrying the underlying error and a short snippet of
    the input, if anything goes wrong while parsing.
    """
    if six.PY2 and isinstance(data, six.text_type):
        data = data.encode("utf8")
    elif six.PY3:
        data = bytearray(data, "utf8")
    try:
        parser = ET.iterparse(BytesIO(data))
        for _event, node in parser:
            if strip_ns and '}' in node.tag:
                # keep only the part after the namespace
                node.tag = node.tag.split('}', 1)[1]
        return parser.root
    except Exception as err:
        # Keep the message short: show at most 35 characters of the input.
        preview = repr(data)
        if len(preview) > 35:
            preview = preview[:35] + " ..."

        raise ValueError("Unable to parse XML: {0} ({1})".format(err, preview))
Example #4
Source File: path-flowmon-parse-result.py    From ns3-load-balance with GNU General Public License v2.0 6 votes vote down vote up
def main(argv):
    file_obj = open(argv[1])
    print "Reading XML file ",

    sys.stdout.flush()
    level = 0
    sim_list = []
    for event, elem in ElementTree.iterparse(file_obj, events=("start", "end")):
        if event == "start":
            level += 1
        if event == "end":
            level -= 1
            if level == 0 and elem.tag == 'FlowMonitor':
                sim = Simulation(elem)
                sim_list.append(sim)
                elem.clear() # won't need this any more
                sys.stdout.write(".")
                sys.stdout.flush()
    print " done."

    for sim in sim_list:
        for flow in sim.flows:
            print "FlowID: %i" % flow.flowId,
	    print flow.paths 
Example #5
Source File: __init__.py    From zulip with Apache License 2.0 6 votes vote down vote up
def handleMatch(self, match: Match[str]) -> Element:
        """Turn the ``body`` group of *match* into an element via ``render_tex``.

        Returns the root element parsed from the rendered markup, or a
        ``span`` with class ``tex-error`` holding the original ``$$...$$``
        source when ``render_tex`` returns None.
        """
        rendered = render_tex(match.group('body'), is_inline=True)
        if rendered is None:  # Something went wrong while rendering
            span = Element('span')
            span.set('class', 'tex-error')
            span.text = '$$' + match.group('body') + '$$'
            return span

        # We need to give Python-Markdown an ElementTree object, but if we
        # give it one with correctly stored XML namespaces, it will mangle
        # everything when serializing it.  So xmlns is smuggled through the
        # parser under another name and restored as a plain attribute.
        assert ' zulip-xmlns="' not in rendered
        rendered = rendered.replace(' xmlns="', ' zulip-xmlns="')
        for _event, node in etree.iterparse(StringIO(rendered)):
            if 'zulip-xmlns' in node.attrib:
                node.attrib['xmlns'] = node.attrib.pop('zulip-xmlns')
            # The last element to close is the document root.
            root = node
        return root
Example #6
Source File: asf_template.py    From esa_sentinel with MIT License 6 votes vote down vote up
def process_metalink(self, ml_file):
        """Collect the download URLs listed in a metalink file.

        For every child of the document's ``files`` element, the text of its
        ``resources/url`` element is taken.

        :param ml_file: path of the metalink file to read
        :return: list of URL strings, or None when no entries were found
        """
        print("Processing metalink file: {0}".format(ml_file))
        with open(ml_file, 'r') as handle:
            contents = handle.read()

        # Drop the "{namespace}" prefix from every tag so that the plain
        # element names can be used in the lookups below.
        parser = ET.iterparse(StringIO(contents))
        for _event, node in parser:
            if '}' in node.tag:
                node.tag = node.tag.split('}', 1)[1]
        tree_root = parser.root

        dl_urls = [
            entry.find('resources').find('url').text
            for entry in tree_root.find('files')
        ]
        return dl_urls if dl_urls else None
    
    # Get download urls from a csv file 
Example #7
Source File: Objects.py    From IFIscripts with MIT License 6 votes vote down vote up
def parse(filename):
    """Return the DFXMLObject assembled from the events produced for *filename*.

    Volume objects are appended to the document as they start; file objects
    are appended, as they end, to whichever container is current (the open
    volume, or the document itself outside of a volume).
    """
    document = None
    container = None  # object that currently receives finished files
    for (event, obj) in iterparse(filename):
        if event == "start":
            if isinstance(obj, DFXMLObject):
                document = obj
                container = obj
            elif isinstance(obj, VolumeObject):
                document.append(obj)
                container = obj
        elif event == "end":
            if isinstance(obj, DFXMLObject):
                # Covers streams that never reported the document's start.
                if document is None:
                    document = obj
                container = obj
            if isinstance(obj, VolumeObject):
                # Leaving a volume: later files belong to the document.
                container = document
            elif isinstance(obj, FileObject):
                container.append(obj)
    return document
Example #8
Source File: xlsx.py    From InternationalizationScript-iOS with MIT License 6 votes vote down vote up
def process_stream_iterparse(self, stream, heading=None):
        """Append the text of every ``si`` element in *stream* to the shared-string table.

        The stream is parsed incrementally; each element whose tag equals
        ``U_SSML12 + 'si'`` is converted with ``get_text_from_si_or_is`` and the
        result is appended to ``self.bk._sharedstrings``.  Diagnostic output is
        written to ``self.logfile`` depending on ``self.verbosity``.

        :param stream: source handed to ``ET.iterparse``
        :param heading: optional title logged first when verbosity >= 2
        """
        if self.verbosity >= 2 and heading is not None:
            fprintf(self.logfile, "\n=== %s ===\n", heading)
        wanted_tag = U_SSML12 + 'si'
        shared_strings = self.bk._sharedstrings
        seen = 0  # number of matching elements handled so far
        for _event, node in ET.iterparse(stream):
            if node.tag == wanted_tag:
                if self.verbosity >= 3:
                    fprintf(self.logfile, "element #%d\n", seen)
                    self.dump_elem(node)
                shared_strings.append(get_text_from_si_or_is(self, node))
                # The text has been extracted, so the children can be dropped.
                node.clear()
                seen += 1
        if self.verbosity >= 2:
            self.dumpout('Entries in SST: %d', len(shared_strings))
        if self.verbosity >= 3:
            for index, text in enumerate(shared_strings):
                fprintf(self.logfile, "SST x=%d s=%r\n", index, text)
Example #9
Source File: xlsx.py    From InternationalizationScript-iOS with MIT License 6 votes vote down vote up
def process_stream_iterparse(self, stream, heading=None):
        """Append the text of every ``si`` element in *stream* to the shared-string table.

        The stream is parsed incrementally; each element whose tag equals
        ``U_SSML12 + 'si'`` is converted with ``get_text_from_si_or_is`` and the
        result is appended to ``self.bk._sharedstrings``.  Diagnostic output is
        written to ``self.logfile`` depending on ``self.verbosity``.

        :param stream: source handed to ``ET.iterparse``
        :param heading: optional title logged first when verbosity >= 2
        """
        if self.verbosity >= 2 and heading is not None:
            fprintf(self.logfile, "\n=== %s ===\n", heading)
        wanted_tag = U_SSML12 + 'si'
        shared_strings = self.bk._sharedstrings
        seen = 0  # number of matching elements handled so far
        for _event, node in ET.iterparse(stream):
            if node.tag == wanted_tag:
                if self.verbosity >= 3:
                    fprintf(self.logfile, "element #%d\n", seen)
                    self.dump_elem(node)
                shared_strings.append(get_text_from_si_or_is(self, node))
                # The text has been extracted, so the children can be dropped.
                node.clear()
                seen += 1
        if self.verbosity >= 2:
            self.dumpout('Entries in SST: %d', len(shared_strings))
        if self.verbosity >= 3:
            for index, text in enumerate(shared_strings):
                fprintf(self.logfile, "SST x=%d s=%r\n", index, text)
Example #10
Source File: parser.py    From estnltk with GNU General Public License v2.0 6 votes vote down vote up
def parse_and_remove(filename, path):
    """Incrementally parse *filename*, yielding data as elements close.

    For every closing element a ``(tag, text)`` tuple is yielded.  When the
    chain of currently open tags (root tag included) equals
    ``path.split('/')``, the element itself is yielded as well and is then
    detached from its parent so the tree does not keep growing.

    :param filename: file name or file object accepted by ``iterparse``
    :param path: slash-separated tag path, starting with the root tag
    """
    path_parts = path.split('/')
    doc = iterparse(filename, ('start', 'end'))
    tag_stack = []   # tags of the elements that are currently open
    elem_stack = []  # the matching element objects
    for event, elem in doc:
        # Plain equality on purpose: writing ``event == 'start' in elem.tag``
        # is a chained comparison that additionally requires the tag to
        # contain the text 'start', which leaves both stacks empty.
        if event == 'start':
            tag_stack.append(elem.tag)
            elem_stack.append(elem)
        elif event == 'end':
            yield elem.tag, elem.text

            if tag_stack == path_parts:
                yield elem
                # A match on the root element has no parent to detach from.
                if len(elem_stack) > 1:
                    elem_stack[-2].remove(elem)
            tag_stack.pop()
            elem_stack.pop()
Example #11
Source File: xlsx.py    From pyRevit with GNU General Public License v3.0 6 votes vote down vote up
def process_stream_iterparse(self, stream, heading=None):
        """Append the text of every ``si`` element in *stream* to the shared-string table.

        The stream is parsed incrementally; each element whose tag equals
        ``U_SSML12 + 'si'`` is converted with ``get_text_from_si_or_is`` and the
        result is appended to ``self.bk._sharedstrings``.  Diagnostic output is
        written to ``self.logfile`` depending on ``self.verbosity``.

        :param stream: source handed to ``ET.iterparse``
        :param heading: optional title logged first when verbosity >= 2
        """
        if self.verbosity >= 2 and heading is not None:
            fprintf(self.logfile, "\n=== %s ===\n", heading)
        wanted_tag = U_SSML12 + 'si'
        shared_strings = self.bk._sharedstrings
        seen = 0  # number of matching elements handled so far
        for _event, node in ET.iterparse(stream):
            if node.tag == wanted_tag:
                if self.verbosity >= 3:
                    fprintf(self.logfile, "element #%d\n", seen)
                    self.dump_elem(node)
                shared_strings.append(get_text_from_si_or_is(self, node))
                # The text has been extracted, so the children can be dropped.
                node.clear()
                seen += 1
        if self.verbosity >= 2:
            self.dumpout('Entries in SST: %d', len(shared_strings))
        if self.verbosity >= 3:
            for index, text in enumerate(shared_strings):
                fprintf(self.logfile, "SST x=%d s=%r\n", index, text)
Example #12
Source File: create-corpus.py    From tinysearch with MIT License 6 votes vote down vote up
def articles():
    """Yield an ``Article`` for each usable page in ``articles.xml.bz2``.

    A page is used when it has a title without a ``:`` and a revision with
    non-empty text.  Articles are numbered consecutively from 0.  The
    ``mediawiki`` element is cleared as elements close so the parsed tree
    does not accumulate.
    """
    ns = '{http://www.mediawiki.org/xml/export-0.10/}'
    mediawiki_tag = ns + 'mediawiki'
    page_tag = ns + 'page'
    title_tag = ns + 'title'
    revision_tag = ns + 'revision'
    text_tag = ns + 'text'

    n = 0
    with bz2.BZ2File("articles.xml.bz2", 'r') as infile:
        for event, elem in iterparse(infile, events=("start", "end")):
            if event == 'start':
                if elem.tag == mediawiki_tag:
                    root = elem
            elif event == 'end':
                if elem.tag == page_tag:
                    title_elem = elem.find(title_tag)
                    if title_elem is None:
                        continue
                    title = title_elem.text
                    if title is None or ':' in title:
                        continue
                    revision = elem.find(revision_tag)
                    if revision is None:
                        continue
                    text_elem = revision.find(text_tag)
                    if text_elem is None:
                        continue
                    text = text_elem.text
                    if text is None:
                        continue

                    yield Article(n, title, text)
                    n += 1
                # Reached on every "end" event that was not skipped above.
                root.clear()
Example #13
Source File: sentinel.py    From sarpy with MIT License 6 votes vote down vote up
def isa(filename):
    """Return ``Reader`` when *filename* is a SENTINEL-1 SLC manifest.safe file.

    The platform family name must be ``SENTINEL-1`` and the product type
    must be ``SLC``.  Any error while reading or inspecting the file is
    swallowed, and None is returned in every non-matching case.
    """
    platform_path = ('./metadataSection/metadataObject[@ID="platform"]/'
                     'metadataWrap/xmlData/safe:platform/safe:familyName')
    product_path = ('./metadataSection/metadataObject[@ID="generalProductInformation"]/'
                    'metadataWrap/xmlData/s1sarl1:standAloneProductInformation/'
                    's1sarl1:productType')
    try:
        # Namespace prefix -> URI map gathered from the "start-ns" events.
        ns = dict(node for _, node in ET.iterparse(filename, events=['start-ns']))
        # Parse everything else
        root_node = ET.parse(filename).getroot()
        if root_node.find(platform_path, ns).text != 'SENTINEL-1':
            return None
        if root_node.find(product_path, ns).text != 'SLC':
            return None
        return Reader
    except Exception:
        # Best-effort format sniffing: any failure means "not this format".
        return None
Example #14
Source File: Bootstrapper.py    From discograph with MIT License 6 votes vote down vote up
def iterparse(source, tag):
        """Yield each outermost *tag* element of *source* as soon as it closes.

        The first event is consumed to obtain the root element; the root is
        cleared after every yielded element, so each element must be used
        before the generator is advanced again.  Elements named *tag* nested
        inside another *tag* element are not yielded separately.
        """
        events = iter(ElementTree.iterparse(source, events=('start', 'end',)))
        _, root = next(events)
        nesting = 0  # how many *tag* elements are currently open
        for event, element in events:
            if element.tag != tag:
                continue
            if event == 'start':
                nesting += 1
                continue
            nesting -= 1
            if nesting == 0:
                yield element
                root.clear()
Example #15
Source File: parser.py    From fbchat-archive-parser with MIT License 6 votes vote down vote up
def parse_impl(self):
        """
        Stream-parse ``self.handle``, recording the user name and saving
        every thread found.

        Parsing the content as a stream is far less memory intensive than
        loading the entire HTML file into memory, like BeautifulSoup does.
        """

        # str() keeps the encoding name a native string under Python 2,
        # where the parser does not accept unicode.
        parser = XMLParser(encoding=str('UTF-8'))
        events = ET.iterparse(self.handle, events=("start", "end"), parser=parser)
        for pos, element in events:
            tag, class_attr = _tag_and_class_attr(element)
            if tag == "h1" and pos == "end":
                if not self.user:
                    self.user = element.text.strip()
                continue
            if tag == "div" and "thread" in class_attr and pos == "start":
                participants = self.parse_participants(element)
                # The thread parser is handed the same event iterator.
                self.save_thread(self.parse_thread(participants, events, True))
Example #16
Source File: lidcXmlHelper.py    From LIDC-IDRI-processing with MIT License 6 votes vote down vote up
def create_xml_tree(filepath):
    """
    Parse *filepath* and return its root element with namespaces removed.

    ElementTree prefixes tag and attribute names with ``{namespace}``; the
    namespaces used in the LIDC-IDRI dataset are not consistent, so the
    prefix is stripped from every tag and every attribute name.
    Solution based on https://stackoverflow.com/questions/13412496/python-elementtree-module-how-to-ignore-the-namespace-of-xml-files-to-locate-ma

    Use instead of ET.fromstring(xml).

    :param filepath: file name or file object accepted by ``ET.iterparse``
    :return: the root element of the parsed document
    """
    it = ET.iterparse(filepath)
    for _, el in it:
        if '}' in el.tag:
            el.tag = el.tag.split('}', 1)[1]  # strip all namespaces
        # Iterate over a snapshot of the keys: the dict is modified inside
        # the loop, which is an error while iterating over it directly.
        for at in list(el.attrib):
            if '}' in at:
                el.attrib[at.split('}', 1)[1]] = el.attrib.pop(at)
    return it.root
Example #17
Source File: primary.py    From vmaas with GNU General Public License v2.0 6 votes vote down vote up
def __init__(self, filename):
        """Read the package count and all rpm package records from *filename*.

        The count comes from the ``packages`` attribute of the ``metadata``
        element; each ``package`` element of type ``rpm`` becomes one dict in
        ``self.packages``.
        """
        self.package_count = 0
        self.packages = []
        root = None
        events = eT.iterparse(filename, events=("start", "end"))
        metadata_tag = "{%s}metadata" % NS["primary"]
        package_tag = "{%s}package" % NS["primary"]
        for event, elem in events:
            if event == "start" and elem.tag == metadata_tag:
                root = elem
                self.package_count = int(elem.get("packages"))
            elif event == "end" and elem.tag == package_tag:
                if elem.get("type") != "rpm":
                    continue
                name = text_strip(elem.find("primary:name", NS))
                evr = elem.find("primary:version", NS)
                self.packages.append({
                    "name": name,
                    "epoch": evr.get("epoch"),
                    "ver": evr.get("ver"),
                    "rel": evr.get("rel"),
                    "arch": text_strip(elem.find("primary:arch", NS)),
                    "summary": text_strip(elem.find("primary:summary", NS)),
                    "description": text_strip(elem.find("primary:description", NS)),
                    "srpm": elem.find("primary:format", NS).find("rpm:sourcerpm", NS).text,
                })
                # Clear the XML tree continuously
                root.clear()
Example #18
Source File: OMIA.py    From dipper with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def process_species(self, limit):
        """
        Stream the gzipped XML data file and offer every parsed element to
        the ``Species_gb`` table processor, whose rows are handled by
        ``_process_species_table_row``.

        :param limit: passed through unchanged to ``process_xml_table``
        :return: None
        """
        myfile = '/'.join((self.rawdir, self.files['data']['file']))
        with gzip.open(myfile, 'rb') as readbin:
            filereader = io.TextIOWrapper(readbin, newline="")
            filereader.readline()  # remove the xml declaration line
            for _event, node in ET.iterparse(filereader):
                # Species ids are == NCBITaxon ids
                self.process_xml_table(
                    node,
                    'Species_gb',
                    self._process_species_table_row,
                    limit,
                )
Example #19
Source File: Bootstrapper.py    From discograph with MIT License 5 votes vote down vote up
def get_iterator(tag):
        """Return a cleaned element iterator over the gzipped XML file for *tag*.

        The file path comes from ``Bootstrapper.get_xml_path``; elements are
        produced by ``Bootstrapper.iterparse`` and passed through
        ``Bootstrapper.clean_elements``.
        """
        xml_stream = gzip.GzipFile(Bootstrapper.get_xml_path(tag), 'r')
        elements = Bootstrapper.iterparse(xml_stream, tag)
        return Bootstrapper.clean_elements(elements)
Example #20
Source File: flowmon-parse-results.py    From ns3-ecn-sharp with GNU General Public License v2.0 5 votes vote down vote up
def main(argv):
    file_obj = open(argv[1])
    print "Reading XML file ",
 
    sys.stdout.flush()        
    level = 0
    sim_list = []
    for event, elem in ElementTree.iterparse(file_obj, events=("start", "end")):
        if event == "start":
            level += 1
        if event == "end":
            level -= 1
            if level == 0 and elem.tag == 'FlowMonitor':
                sim = Simulation(elem)
                sim_list.append(sim)
                elem.clear() # won't need this any more
                sys.stdout.write(".")
                sys.stdout.flush()
    print " done."


    for sim in sim_list:
        for flow in sim.flows:
            t = flow.fiveTuple
            proto = {6: 'TCP', 17: 'UDP'} [t.protocol]
            print "FlowID: %i (%s %s/%s --> %s/%i)" % \
                (flow.flowId, proto, t.sourceAddress, t.sourcePort, t.destinationAddress, t.destinationPort)
            print "\tTX bitrate: %.2f kbit/s" % (flow.txBitrate*1e-3,)
            print "\tRX bitrate: %.2f kbit/s" % (flow.rxBitrate*1e-3,)
            print "\tMean Delay: %.2f ms" % (flow.delayMean*1e3,)
            print "\tPacket Loss Ratio: %.2f %%" % (flow.packetLossRatio*100) 
Example #21
Source File: xlsx.py    From pyRevit with GNU General Public License v3.0 5 votes vote down vote up
def ensure_elementtree_imported(verbosity, logfile):
    """Bind the module-level ``ET`` to an ElementTree implementation, once.

    Does nothing when ``ET`` is already set.  Otherwise the first importable
    implementation is bound to ``ET`` and two capability flags are updated:
    ``ET_has_iterparse`` (set to True only when a trial ``iterparse`` call
    does not raise NotImplementedError) and ``Element_has_iter``.

    :param verbosity: when truthy, a line describing the chosen
        implementation is printed to *logfile*
    :param logfile: file object receiving that diagnostic line
    :raises Exception: if no ElementTree implementation can be imported
    """
    global ET, ET_has_iterparse, Element_has_iter
    if ET is not None:
        # Already initialised by an earlier call.
        return
    if "IronPython" in sys.version:
        import xml.etree.ElementTree as ET
        #### 2.7.2.1: fails later with
        #### NotImplementedError: iterparse is not supported on IronPython. (CP #31923)
    else:
        # Try each candidate in order of preference; the first one that
        # imports wins.
        try: import xml.etree.cElementTree as ET
        except ImportError:
            try: import cElementTree as ET
            except ImportError:
                try: import lxml.etree as ET
                except ImportError:
                    try: import xml.etree.ElementTree as ET
                    except ImportError:
                        try: import elementtree.ElementTree as ET
                        except ImportError:
                            raise Exception("Failed to import an ElementTree implementation")
    if hasattr(ET, 'iterparse'):
        # Having the attribute is not enough (see the IronPython note above),
        # so probe it with an empty stream.
        _dummy_stream = BYTES_IO('')
        try:
            ET.iterparse(_dummy_stream)
            ET_has_iterparse = True
        except NotImplementedError:
            # Leave ET_has_iterparse at whatever value it already had.
            pass
    Element_has_iter = hasattr(ET.ElementTree, 'iter')
    if verbosity:
        # Collect every module attribute whose name, ignoring case and
        # underscores, is "version".
        etree_version = repr([
            (item, getattr(ET, item))
            for item in ET.__dict__.keys()
            if item.lower().replace('_', '') == 'version'
            ])
        print(ET.__file__, ET.__name__, etree_version, ET_has_iterparse, file=logfile) 
Example #22
Source File: element_iterator.py    From python-mediawiki-utilities with MIT License 5 votes vote down vote up
def from_file(cls, f):
        """Build an ``EventPointer`` over the start/end parse events of *f*."""
        events = etree.iterparse(f, events=("start", "end"))
        return EventPointer(events)
Example #23
Source File: proxy_QuickRank.py    From rankeval with Mozilla Public License 2.0 5 votes vote down vote up
def _count_nodes(file_path):
        """
        Count the total number of nodes (both split and leaf nodes)
        in the model identified by file_path.

        Parameters
        ----------
        file_path : str
            The path to the filename where the model has been saved

        Returns
        -------
        tuple(n_trees, n_nodes) : tuple(int, int)
            The total number of trees and nodes (both split and leaf nodes)
            in the model identified by file_path.
        """
        # get an iterable
        context = etree.iterparse(file_path, events=("end",))

        # get the root element
        _, root = next(context)

        n_nodes = 0
        n_trees = 0
        for _, elem in context:
            if elem.tag == 'tree':
                n_trees += 1
            elif elem.tag == 'feature' or elem.tag == 'output':
                n_nodes += 1

            elem.clear()    # discard the element
            root.clear()    # remove root reference to the child

        return n_trees, n_nodes 
Example #24
Source File: factory.py    From network_tech with Apache License 2.0 5 votes vote down vote up
def _strip_namespaces(xml):
    it = ElementTree.iterparse(StringIO(xml))
    for _, el in it:
        if '}' in el.tag:
            el.tag = el.tag.split('}', 1)[1]  # strip all namespaces
    return it.root 
Example #25
Source File: cvemap.py    From vmaas with GNU General Public License v2.0 5 votes vote down vote up
def __init__(self, filename, lastmodified):
        """Load CVE records from the cvemap XML file *filename*.

        ``self.cves`` maps each ``Vulnerability`` name to a dict of its
        details.  The ``updated`` attribute of the ``cvemap`` element is used
        as the modified date of every record.

        :param filename: source handed to ``eT.iterparse``
        :param lastmodified: stored unchanged as ``self.lastmodified``
        """
        self.lastmodified = lastmodified
        self.cves = {}
        root = None
        updated = None
        for event, elem in eT.iterparse(filename, events=("start", "end")):
            if elem.tag == "cvemap" and event == "start":
                root = elem
                updated = parse_datetime(elem.get('updated'))
                continue
            if not (elem.tag == "Vulnerability" and event == "end"):
                continue

            name = elem.get('name')
            self.cves[name] = {
                'impact': text_strip(elem.find('ThreatSeverity')),
                'published_date': parse_datetime(text_strip(elem.find('PublicDate'))),
                'modified_date': updated,
                'cvss2_score': text_strip(elem.find('CVSS/CVSSBaseScore')),
                'cvss2_metrics': text_strip(elem.find('CVSS/CVSSScoringVector')),
                'cvss3_score': text_strip(elem.find('CVSS3/CVSS3BaseScore')),
                'cvss3_metrics': text_strip(elem.find('CVSS3/CVSS3ScoringVector')),
                'cwe_list': self._cwe_list(text_strip(elem.find('CWE'))),
                'description': self._cve_description(elem.findall('Details[@{%s}lang="en:us"]' % NS)),
                'iava': text_strip(elem.find('IAVA')),
                'redhat_url': "https://access.redhat.com/security/cve/" + str.lower(name),
                'secondary_url': text_strip(elem.find('References'))
            }

            # Clear the XML tree continuously
            root.clear()
Example #26
Source File: test_regressions.py    From ironpython2 with Apache License 2.0 5 votes vote down vote up
def test_gh370(self):
        """https://github.com/IronLanguages/ironpython2/issues/370"""
        from xml.etree import ElementTree as ET
        from StringIO import StringIO
        # The first (and only) event of a one-element document must be the
        # end of its root element.
        events = ET.iterparse(StringIO('<root/>'))
        event, element = next(events)
        self.assertTrue(event == 'end' and element.tag == 'root')
Example #27
Source File: xml.py    From dpdata with GNU Lesser General Public License v3.0 5 votes vote down vote up
def analyze(fname, type_idx_zero=False, begin=0, step=1):
    """
    Collect atom information and per-calculation data from an xml file.

    Can deal with a broken xml file: a parse error ends the scan and
    whatever was collected up to that point is returned.

    :param fname: source handed to ``ET.iterparse``
    :param type_idx_zero: if true, 1 is subtracted from every type index
    :param begin: index of the first ``calculation`` element to keep
    :param step: keep every *step*-th calculation counted from *begin*
    :return: ``(eles, types, cells, positions, energies, forces, stresses)``
        with the five per-calculation sequences converted to numpy arrays
    """
    all_posi, all_cell, all_ener, all_forc, all_strs = [], [], [], [], []
    frame_idx = 0  # number of calculation elements seen so far
    try:
        for _event, elem in ET.iterparse(fname):
            if elem.tag == 'atominfo':
                eles, types = analyze_atominfo(elem)
                types = np.array(types, dtype=int)
                if type_idx_zero:
                    types = types - 1
            if elem.tag == 'calculation':
                posi, cell, ener, forc, strs = analyze_calculation(elem)
                selected = frame_idx >= begin and (frame_idx - begin) % step == 0
                if selected:
                    all_posi.append(posi)
                    all_cell.append(cell)
                    all_ener.append(ener)
                    all_forc.append(forc)
                    if strs is not None:
                        all_strs.append(strs)
                frame_idx += 1
    except ET.ParseError:
        # Broken file: fall through and return what was read so far.
        pass
    return (eles, types, np.array(all_cell), np.array(all_posi),
            np.array(all_ener), np.array(all_forc), np.array(all_strs))
Example #28
Source File: update_resources.py    From indra with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def update_hmdb_chebi_map():
    """Rebuild ``hmdb_to_chebi.tsv`` from the HMDB metabolites XML dump.

    Reads ``hmdb_metabolites.xml`` out of ``hmdb_metabolites.zip`` in
    ``path`` (the download call itself is commented out, so the archive must
    already be there), pairs the first accession of each metabolite with its
    ChEBI id and writes the pairs, sorted by HMDB id and preceded by a
    header row, as a tab-separated file.
    """
    logger.info('--Updating HMDB to ChEBI entries----')
    ns = {'hmdb': 'http://www.hmdb.ca'}
    url = 'http://www.hmdb.ca/system/downloads/current/hmdb_metabolites.zip'
    fname = os.path.join(path, 'hmdb_metabolites.zip')
    logger.info('Downloading %s' % url)
    #urlretrieve(url, fname)
    metabolite_tag = '{%s}metabolite' % ns['hmdb']
    accession_tag = '{%s}accession' % ns['hmdb']
    chebi_tag = '{%s}chebi_id' % ns['hmdb']
    mappings = []
    hmdb_id = None
    chebi_id = None
    with ZipFile(fname) as input_zip:
        with input_zip.open('hmdb_metabolites.xml') as fh:
            for event, elem in ET.iterparse(fh, events=('start', 'end')):
                if event == 'start':
                    # A new metabolite begins: forget the previous ids.
                    if elem.tag == metabolite_tag:
                        hmdb_id = None
                        chebi_id = None
                    continue
                # Element text is only read on "end" events; at a "start"
                # event iterparse does not guarantee it has been parsed yet.
                if elem.tag == accession_tag:
                    # Important: we only look at accession if there's no HMDB
                    # ID yet, otherwise we pick up secondary accession tags
                    if not hmdb_id:
                        hmdb_id = elem.text
                elif elem.tag == chebi_tag:
                    chebi_id = elem.text
                elif elem.tag == metabolite_tag:
                    if hmdb_id and chebi_id:
                        print(hmdb_id, chebi_id)
                        mappings.append([hmdb_id, chebi_id])
                # The element is complete and consumed; free its content.
                elem.clear()
    fname = os.path.join(path, 'hmdb_to_chebi.tsv')
    mappings = [['HMDB_ID', 'CHEBI_ID']] + sorted(mappings, key=lambda x: x[0])
    write_unicode_csv(fname, mappings, delimiter='\t')
Example #29
Source File: parser.py    From fbchat-archive-parser with MIT License 5 votes vote down vote up
def _get_manifest_data(self):
        """Stream-parse ``self.handle`` for the user name and thread links.

        Anchors are ignored until a ``div`` whose class contains ``content``
        has started.  Every later anchor contributes one
        ``(participants, thread file path)`` entry, the path being the
        anchor's ``href`` (minus a leading ``../``) joined onto ``self.root``.

        :return: tuple ``(user, thread_references)``
        :raises UnsuitableParserError: if no anchor was seen in the content
        """

        user, thread_references = None, []

        ignore_anchors = True
        saw_anchor = False

        # Cast to str to ensure not unicode under Python 2, as the parser
        # doesn't like that.
        parser = XMLParser(encoding=str('UTF-8'))
        element_iter = ET.iterparse(self.handle, events=("start", "end"), parser=parser)
        for pos, element in element_iter:
            tag, class_attr = _tag_and_class_attr(element)
            if tag == "h1" and pos == "end":
                # NOTE(review): the guard reads self.user while the result is
                # stored in the local variable -- confirm this is intended.
                if not self.user:
                    user = element.text.strip()
            elif tag == "div" and "content" in class_attr and pos == "start":
                ignore_anchors = False
            elif tag == "a" and pos == "start":
                if ignore_anchors:
                    continue
                saw_anchor = True
                participants = self.parse_participants(element)
                # The dots are escaped so that only a literal "../" prefix is
                # removed, not any two characters followed by a slash.
                thread_path = re.sub(r'^\.\./', '', element.attrib['href'])
                if using_windows():
                    thread_path = thread_path.replace('/', '\\')
                thread_references.append(
                    (participants, os.path.join(self.root, thread_path)))

        if not saw_anchor:
            # Indicator of a `messages.htm` file that is probably in the legacy format.
            raise UnsuitableParserError

        return user, thread_references
Example #30
Source File: xlsx.py    From pyRevit with GNU General Public License v3.0 5 votes vote down vote up
def own_process_stream(self, stream, heading=None):
        """Parse *stream* incrementally and dispatch rows, dimension and merged cells.

        ``row`` elements go to ``self.do_row`` and are cleared afterwards,
        ``dimension`` elements go to ``self.do_dimension`` and ``mergeCell``
        elements to ``self.do_merge_cell``.  ``self.finish_off()`` is called
        once the stream is exhausted.

        :param stream: source handed to ``ET.iterparse``
        :param heading: optional title logged first when verbosity >= 2
        """
        if self.verbosity >= 2 and heading is not None:
            fprintf(self.logfile, "\n=== %s ===\n", heading)
        getmethod = self.tag2meth.get  # looked up but not used below
        row_tag = U_SSML12 + "row"
        dimension_tag = U_SSML12 + "dimension"
        merge_cell_tag = U_SSML12 + "mergeCell"
        handle_row = self.do_row
        for _event, node in ET.iterparse(stream):
            tag = node.tag
            if tag == row_tag:
                handle_row(node)
                node.clear()  # destroy all child elements (cells)
            elif tag == dimension_tag:
                self.do_dimension(node)
            elif tag == merge_cell_tag:
                self.do_merge_cell(node)
        self.finish_off()