Python Examples of xml.etree.ElementTree.iterparse

Source File: xlsx.py From lambda-text-extractor with Apache License 2.0

6 votes

def process_stream_iterparse(self, stream, heading=None):
        if self.verbosity >= 2 and heading is not None:
            fprintf(self.logfile, "\n=== %s ===\n", heading)
        si_tag = U_SSML12 + 'si'
        elemno = -1
        sst = self.bk._sharedstrings
        for event, elem in ET.iterparse(stream):
            if elem.tag != si_tag: continue
            elemno = elemno + 1
            if self.verbosity >= 3:
                fprintf(self.logfile, "element #%d\n", elemno)
                self.dump_elem(elem)
            result = get_text_from_si_or_is(self, elem)
            sst.append(result)
            elem.clear() # destroy all child elements
        if self.verbosity >= 2:
            self.dumpout('Entries in SST: %d', len(sst))
        if self.verbosity >= 3:
            for x, s in enumerate(sst):
                fprintf(self.logfile, "SST x=%d s=%r\n", x, s)

Source File: OMIA.py From dipper with BSD 3-Clause "New" or "Revised" License

6 votes

def process_associations(self, limit):
        """
        Loop through the xml file and process the article-breed, article-phene,
        breed-phene, phene-gene associations, and the external links to LIDA.

        :param limit:
        :return:

        """

        myfile = '/'.join((self.rawdir, self.files['data']['file']))
        with gzip.open(myfile, 'rb') as readbin:
            filereader = io.TextIOWrapper(readbin, newline="")
            filereader.readline()  # remove the xml declaration line
            for event, elem in ET.iterparse(filereader):  # iterparse is not deprecated
                self.process_xml_table(
                    elem, 'Article_Breed', self._process_article_breed_row, limit)
                self.process_xml_table(
                    elem, 'Article_Phene', self._process_article_phene_row, limit)
                self.process_xml_table(
                    elem, 'Breed_Phene', self._process_breed_phene_row, limit)
                self.process_xml_table(
                    elem, 'Lida_Links', self._process_lida_links_row, limit)
                self.process_xml_table(
                    elem, 'Phene_Gene', self._process_phene_gene_row, limit)
                self.process_xml_table(
                    elem, 'Group_MPO', self._process_group_mpo_row, limit)

    # ############ INDIVIDUAL TABLE-LEVEL PROCESSING FUNCTIONS ################

Source File: __init__.py From streamlink with BSD 2-Clause "Simplified" License

6 votes

def _parse_xml(data, strip_ns=False):
    if six.PY2 and isinstance(data, six.text_type):
        data = data.encode("utf8")
    elif six.PY3:
        data = bytearray(data, "utf8")
    try:
        it = ET.iterparse(BytesIO(data))
        for _, el in it:
            if '}' in el.tag and strip_ns:
                # strip all namespaces
                el.tag = el.tag.split('}', 1)[1]
        return it.root
    except Exception as err:
        snippet = repr(data)
        if len(snippet) > 35:
            snippet = snippet[:35] + " ..."

        raise ValueError("Unable to parse XML: {0} ({1})".format(err, snippet))

Source File: path-flowmon-parse-result.py From ns3-load-balance with GNU General Public License v2.0

6 votes

def main(argv):
    file_obj = open(argv[1])
    print "Reading XML file ",

    sys.stdout.flush()
    level = 0
    sim_list = []
    for event, elem in ElementTree.iterparse(file_obj, events=("start", "end")):
        if event == "start":
            level += 1
        if event == "end":
            level -= 1
            if level == 0 and elem.tag == 'FlowMonitor':
                sim = Simulation(elem)
                sim_list.append(sim)
                elem.clear() # won't need this any more
                sys.stdout.write(".")
                sys.stdout.flush()
    print " done."

    for sim in sim_list:
        for flow in sim.flows:
            print "FlowID: %i" % flow.flowId,
	    print flow.paths

Source File: __init__.py From zulip with Apache License 2.0

6 votes

def handleMatch(self, match: Match[str]) -> Element:
        rendered = render_tex(match.group('body'), is_inline=True)
        if rendered is not None:
            # We need to give Python-Markdown an ElementTree object, but if we
            # give it one with correctly stored XML namespaces, it will mangle
            # everything when serializing it.  So we play this stupid game to
            # store xmlns as a normal attribute.  :-[
            assert ' zulip-xmlns="' not in rendered
            rendered = rendered.replace(' xmlns="', ' zulip-xmlns="')
            parsed = etree.iterparse(StringIO(rendered))
            for event, elem in parsed:
                if 'zulip-xmlns' in elem.attrib:
                    elem.attrib['xmlns'] = elem.attrib.pop('zulip-xmlns')
                root = elem
            return root
        else:  # Something went wrong while rendering
            span = Element('span')
            span.set('class', 'tex-error')
            span.text = '$$' + match.group('body') + '$$'
            return span

Source File: asf_template.py From esa_sentinel with MIT License

6 votes

def process_metalink(self, ml_file):
        print("Processing metalink file: {0}".format(ml_file))
        with open(ml_file, 'r') as ml:
            xml = ml.read()
        
        # Hack to remove annoying namespace
        it = ET.iterparse(StringIO(xml))
        for _, el in it:
            if '}' in el.tag:
                el.tag = el.tag.split('}', 1)[1]  # strip all namespaces
        root = it.root
        
        dl_urls = []
        ml_files = root.find('files')
        for dl in ml_files:
            dl_urls.append(dl.find('resources').find('url').text)
        
        if len(dl_urls) > 0:
            return dl_urls
        else:
            return None
    
    # Get download urls from a csv file

Source File: Objects.py From IFIscripts with MIT License

6 votes

def parse(filename):
    """Returns a DFXMLObject populated from the contents of the (string) filename argument."""
    retval = None
    appender = None
    for (event, obj) in iterparse(filename):
        if event == "start":
            if isinstance(obj, DFXMLObject):
                retval = obj
                appender = obj
            elif isinstance(obj, VolumeObject):
                retval.append(obj)
                appender = obj
        elif event == "end":
            if isinstance(obj, DFXMLObject):
                if retval is None:
                    retval = obj
                appender = obj
            if isinstance(obj, VolumeObject):
                appender = retval
            elif isinstance(obj, FileObject):
                appender.append(obj)
    return retval

Source File: xlsx.py From InternationalizationScript-iOS with MIT License

6 votes

def process_stream_iterparse(self, stream, heading=None):
        if self.verbosity >= 2 and heading is not None:
            fprintf(self.logfile, "\n=== %s ===\n", heading)
        si_tag = U_SSML12 + 'si'
        elemno = -1
        sst = self.bk._sharedstrings
        for event, elem in ET.iterparse(stream):
            if elem.tag != si_tag: continue
            elemno = elemno + 1
            if self.verbosity >= 3:
                fprintf(self.logfile, "element #%d\n", elemno)
                self.dump_elem(elem)
            result = get_text_from_si_or_is(self, elem)
            sst.append(result)                
            elem.clear() # destroy all child elements
        if self.verbosity >= 2:
            self.dumpout('Entries in SST: %d', len(sst))
        if self.verbosity >= 3:
            for x, s in enumerate(sst):
                fprintf(self.logfile, "SST x=%d s=%r\n", x, s)

Source File: xlsx.py From InternationalizationScript-iOS with MIT License

6 votes

def process_stream_iterparse(self, stream, heading=None):
        if self.verbosity >= 2 and heading is not None:
            fprintf(self.logfile, "\n=== %s ===\n", heading)
        si_tag = U_SSML12 + 'si'
        elemno = -1
        sst = self.bk._sharedstrings
        for event, elem in ET.iterparse(stream):
            if elem.tag != si_tag: continue
            elemno = elemno + 1
            if self.verbosity >= 3:
                fprintf(self.logfile, "element #%d\n", elemno)
                self.dump_elem(elem)
            result = get_text_from_si_or_is(self, elem)
            sst.append(result)                
            elem.clear() # destroy all child elements
        if self.verbosity >= 2:
            self.dumpout('Entries in SST: %d', len(sst))
        if self.verbosity >= 3:
            for x, s in enumerate(sst):
                fprintf(self.logfile, "SST x=%d s=%r\n", x, s)

Source File: parser.py From estnltk with GNU General Public License v2.0

6 votes

def parse_and_remove(filename, path):
    path_parts = path.split('/')
    doc = iterparse(filename, ('start', 'end'))
    tag_stack = []
    elem_stack = []
    for event, elem in doc:
        if event == 'start' in elem.tag:
            tag_stack.append(elem.tag)
            elem_stack.append(elem)
        elif event == 'end':
            eletag = elem.tag
            elemtext = elem.text
            yield eletag, elemtext

            if tag_stack == path_parts:
                yield elem
                elem_stack[-2].remove(elem)
            try:
                tag_stack.pop()
                elem_stack.pop()
            except IndexError:
                pass

Source File: xlsx.py From pyRevit with GNU General Public License v3.0

6 votes

def process_stream_iterparse(self, stream, heading=None):
        if self.verbosity >= 2 and heading is not None:
            fprintf(self.logfile, "\n=== %s ===\n", heading)
        si_tag = U_SSML12 + 'si'
        elemno = -1
        sst = self.bk._sharedstrings
        for event, elem in ET.iterparse(stream):
            if elem.tag != si_tag: continue
            elemno = elemno + 1
            if self.verbosity >= 3:
                fprintf(self.logfile, "element #%d\n", elemno)
                self.dump_elem(elem)
            result = get_text_from_si_or_is(self, elem)
            sst.append(result)
            elem.clear() # destroy all child elements
        if self.verbosity >= 2:
            self.dumpout('Entries in SST: %d', len(sst))
        if self.verbosity >= 3:
            for x, s in enumerate(sst):
                fprintf(self.logfile, "SST x=%d s=%r\n", x, s)

Source File: create-corpus.py From tinysearch with MIT License

6 votes

def articles():
    n = 0
    with bz2.BZ2File("articles.xml.bz2", 'r') as infile:
        for event, elem in iterparse(infile, events=("start", "end")):
            if event == 'start':
                if elem.tag == '{http://www.mediawiki.org/xml/export-0.10/}mediawiki':
                    root = elem
            elif event == 'end':
                if elem.tag == '{http://www.mediawiki.org/xml/export-0.10/}page':
                    title_elem = elem.find('{http://www.mediawiki.org/xml/export-0.10/}title')
                    if title_elem is None: continue
                    title = title_elem.text
                    if title is None or ':' in title: continue
                    revision = elem.find('{http://www.mediawiki.org/xml/export-0.10/}revision')
                    if revision is None: continue
                    text_elem = revision.find('{http://www.mediawiki.org/xml/export-0.10/}text')
                    if text_elem is None: continue
                    text = text_elem.text
                    if text is None: continue

                    yield Article(n, title, text)
                    n += 1
                    #if title == 'Zhang Heng':
                    #    break
                root.clear()

Source File: sentinel.py From sarpy with MIT License

6 votes

def isa(filename):
    # Test to see if file is a manifest.safe file
    try:
        ns = dict([node for _, node in ET.iterparse(filename, events=['start-ns'])])
        # Parse everything else
        root_node = ET.parse(filename).getroot()
        if ((root_node.find('./metadataSection/metadataObject[@ID="platform"]/' +
                            'metadataWrap/xmlData/safe:platform/safe:familyName', ns).text ==
             'SENTINEL-1') and
            (root_node.find('./metadataSection/metadataObject[@ID="generalProductInformation"]/' +
                            'metadataWrap/xmlData/s1sarl1:standAloneProductInformation/' +
                            's1sarl1:productType', ns).text ==
             'SLC')):
            return Reader
    except Exception:
        pass

Source File: Bootstrapper.py From discograph with MIT License

6 votes

def iterparse(source, tag):
        context = ElementTree.iterparse(
            source,
            events=('start', 'end',),
            )
        context = iter(context)
        _, root = next(context)
        depth = 0
        for event, element in context:
            if element.tag == tag:
                if event == 'start':
                    depth += 1
                else:
                    depth -= 1
                    if depth == 0:
                        yield element
                        root.clear()

Source File: parser.py From fbchat-archive-parser with MIT License

6 votes

def parse_impl(self):
        """
        Parses the HTML content as a stream. This is far less memory
        intensive than loading the entire HTML file into memory, like
        BeautifulSoup does.
        """

        # Cast to str to ensure not unicode under Python 2, as the parser
        # doesn't like that.
        parser = XMLParser(encoding=str('UTF-8'))
        element_iter = ET.iterparse(self.handle, events=("start", "end"), parser=parser)
        for pos, element in element_iter:
            tag, class_attr = _tag_and_class_attr(element)
            if tag == "h1" and pos == "end":
                if not self.user:
                    self.user = element.text.strip()
            elif tag == "div" and "thread" in class_attr and pos == "start":
                participants = self.parse_participants(element)
                thread = self.parse_thread(participants, element_iter, True)
                self.save_thread(thread)

Source File: lidcXmlHelper.py From LIDC-IDRI-processing with MIT License

6 votes

def create_xml_tree(filepath):
    """
    Method to ignore the namespaces if ElementTree is used. 
    Necessary becauseElementTree, by default, extend
    Tag names by the name space, but the namespaces used in the
    LIDC-IDRI dataset are not consistent. 
    Solution based on https://stackoverflow.com/questions/13412496/python-elementtree-module-how-to-ignore-the-namespace-of-xml-files-to-locate-ma
    
    instead of ET.fromstring(xml)
    """
    it = ET.iterparse(filepath)
    for _, el in it:
        if '}' in el.tag:
            el.tag = el.tag.split('}', 1)[1]  # strip all namespaces
        for at in el.attrib.keys(): # strip namespaces of attributes too
            if '}' in at:
                newat = at.split('}', 1)[1]
                el.attrib[newat] = el.attrib[at]
                del el.attrib[at]
    return it.root

Source File: primary.py From vmaas with GNU General Public License v2.0

6 votes

def __init__(self, filename):
        self.package_count = 0
        self.packages = []
        root = None
        for event, elem in eT.iterparse(filename, events=("start", "end")):
            if elem.tag == "{%s}metadata" % NS["primary"] and event == "start":
                root = elem
                self.package_count = int(elem.get("packages"))
            elif elem.tag == "{%s}package" % NS["primary"] and event == "end":
                if elem.get("type") == "rpm":
                    package = {}
                    package["name"] = text_strip(elem.find("primary:name", NS))
                    evr = elem.find("primary:version", NS)
                    package["epoch"] = evr.get("epoch")
                    package["ver"] = evr.get("ver")
                    package["rel"] = evr.get("rel")
                    package["arch"] = text_strip(elem.find("primary:arch", NS))
                    package["summary"] = text_strip(elem.find("primary:summary", NS))
                    package["description"] = text_strip(elem.find("primary:description", NS))
                    package["srpm"] = elem.find("primary:format", NS).find("rpm:sourcerpm", NS).text
                    self.packages.append(package)
                    # Clear the XML tree continuously
                    root.clear()

Source File: OMIA.py From dipper with BSD 3-Clause "New" or "Revised" License

6 votes

def process_species(self, limit):
        """
        Loop through the xml file and process the species.
        We add elements to the graph, and store the
        id-to-label in the label_hash dict.
        :param limit:
        :return:
        """
        myfile = '/'.join((self.rawdir, self.files['data']['file']))
        with gzip.open(myfile, 'rb') as readbin:
            filereader = io.TextIOWrapper(readbin, newline="")
            filereader.readline()  # remove the xml declaration line
            for event, elem in ET.iterparse(filereader):
                # Species ids are == NCBITaxon ids
                self.process_xml_table(
                    elem, 'Species_gb', self._process_species_table_row, limit)

Source File: Bootstrapper.py From discograph with MIT License

5 votes

def get_iterator(tag):
        file_path = Bootstrapper.get_xml_path(tag)
        file_pointer = gzip.GzipFile(file_path, 'r')
        iterator = Bootstrapper.iterparse(file_pointer, tag)
        iterator = Bootstrapper.clean_elements(iterator)
        return iterator

Source File: flowmon-parse-results.py From ns3-ecn-sharp with GNU General Public License v2.0

5 votes

def main(argv):
    file_obj = open(argv[1])
    print "Reading XML file ",
 
    sys.stdout.flush()        
    level = 0
    sim_list = []
    for event, elem in ElementTree.iterparse(file_obj, events=("start", "end")):
        if event == "start":
            level += 1
        if event == "end":
            level -= 1
            if level == 0 and elem.tag == 'FlowMonitor':
                sim = Simulation(elem)
                sim_list.append(sim)
                elem.clear() # won't need this any more
                sys.stdout.write(".")
                sys.stdout.flush()
    print " done."


    for sim in sim_list:
        for flow in sim.flows:
            t = flow.fiveTuple
            proto = {6: 'TCP', 17: 'UDP'} [t.protocol]
            print "FlowID: %i (%s %s/%s --> %s/%i)" % \
                (flow.flowId, proto, t.sourceAddress, t.sourcePort, t.destinationAddress, t.destinationPort)
            print "\tTX bitrate: %.2f kbit/s" % (flow.txBitrate*1e-3,)
            print "\tRX bitrate: %.2f kbit/s" % (flow.rxBitrate*1e-3,)
            print "\tMean Delay: %.2f ms" % (flow.delayMean*1e3,)
            print "\tPacket Loss Ratio: %.2f %%" % (flow.packetLossRatio*100)

Source File: xlsx.py From pyRevit with GNU General Public License v3.0

5 votes

def ensure_elementtree_imported(verbosity, logfile):
    global ET, ET_has_iterparse, Element_has_iter
    if ET is not None:
        return
    if "IronPython" in sys.version:
        import xml.etree.ElementTree as ET
        #### 2.7.2.1: fails later with
        #### NotImplementedError: iterparse is not supported on IronPython. (CP #31923)
    else:
        try: import xml.etree.cElementTree as ET
        except ImportError:
            try: import cElementTree as ET
            except ImportError:
                try: import lxml.etree as ET
                except ImportError:
                    try: import xml.etree.ElementTree as ET
                    except ImportError:
                        try: import elementtree.ElementTree as ET
                        except ImportError:
                            raise Exception("Failed to import an ElementTree implementation")
    if hasattr(ET, 'iterparse'):
        _dummy_stream = BYTES_IO('')
        try:
            ET.iterparse(_dummy_stream)
            ET_has_iterparse = True
        except NotImplementedError:
            pass
    Element_has_iter = hasattr(ET.ElementTree, 'iter')
    if verbosity:
        etree_version = repr([
            (item, getattr(ET, item))
            for item in ET.__dict__.keys()
            if item.lower().replace('_', '') == 'version'
            ])
        print(ET.__file__, ET.__name__, etree_version, ET_has_iterparse, file=logfile)

Source File: element_iterator.py From python-mediawiki-utilities with MIT License

5 votes

def from_file(cls, f):
        return EventPointer(etree.iterparse(f, events=("start", "end")))

Source File: proxy_QuickRank.py From rankeval with Mozilla Public License 2.0

5 votes

def _count_nodes(file_path):
        """
        Count the total number of nodes (both split and leaf nodes)
        in the model identified by file_path.

        Parameters
        ----------
        file_path : str
            The path to the filename where the model has been saved

        Returns
        -------
        tuple(n_trees, n_nodes) : tuple(int, int)
            The total number of trees and nodes (both split and leaf nodes)
            in the model identified by file_path.
        """
        # get an iterable
        context = etree.iterparse(file_path, events=("end",))

        # get the root element
        _, root = next(context)

        n_nodes = 0
        n_trees = 0
        for _, elem in context:
            if elem.tag == 'tree':
                n_trees += 1
            elif elem.tag == 'feature' or elem.tag == 'output':
                n_nodes += 1

            elem.clear()    # discard the element
            root.clear()    # remove root reference to the child

        return n_trees, n_nodes

Source File: factory.py From network_tech with Apache License 2.0

5 votes

def _strip_namespaces(xml):
    it = ElementTree.iterparse(StringIO(xml))
    for _, el in it:
        if '}' in el.tag:
            el.tag = el.tag.split('}', 1)[1]  # strip all namespaces
    return it.root

Source File: cvemap.py From vmaas with GNU General Public License v2.0

5 votes

def __init__(self, filename, lastmodified):
        self.lastmodified = lastmodified
        self.cves = {}
        root = None
        updated = None
        for event, elem in eT.iterparse(filename, events=("start", "end")):
            if elem.tag == "cvemap" and event == "start":
                root = elem
                updated = parse_datetime(elem.get('updated'))
            elif elem.tag == "Vulnerability" and event == "end":
                name = elem.get('name')
                self.cves[name] = {
                    'impact': text_strip(elem.find('ThreatSeverity')),
                    'published_date': parse_datetime(text_strip(elem.find('PublicDate'))),
                    'modified_date': updated,
                    'cvss2_score': text_strip(elem.find('CVSS/CVSSBaseScore')),
                    'cvss2_metrics': text_strip(elem.find('CVSS/CVSSScoringVector')),
                    'cvss3_score': text_strip(elem.find('CVSS3/CVSS3BaseScore')),
                    'cvss3_metrics': text_strip(elem.find('CVSS3/CVSS3ScoringVector')),
                    'cwe_list': self._cwe_list(text_strip(elem.find('CWE'))),
                    'description': self._cve_description(elem.findall('Details[@{%s}lang="en:us"]' % NS)),
                    'iava': text_strip(elem.find('IAVA')),
                    'redhat_url': "https://access.redhat.com/security/cve/" + str.lower(name),
                    'secondary_url': text_strip(elem.find('References'))
                }

                # Clear the XML tree continuously
                root.clear()

Source File: test_regressions.py From ironpython2 with Apache License 2.0

5 votes

def test_gh370(self):
        """https://github.com/IronLanguages/ironpython2/issues/370"""
        from xml.etree import ElementTree as ET
        from StringIO import StringIO
        x = ET.iterparse(StringIO('<root/>'))
        y = next(x)
        self.assertTrue(y[0] == 'end' and y[1].tag == 'root')

Source File: xml.py From dpdata with GNU Lesser General Public License v3.0

5 votes

def analyze (fname, type_idx_zero = False, begin = 0, step = 1) :
    """
    can deal with broken xml file
    """
    all_posi = []
    all_cell = []
    all_ener = []
    all_forc = []
    all_strs = []
    cc = 0
    try:
        for event, elem in ET.iterparse(fname):
            if elem.tag == 'atominfo' :
                eles, types = analyze_atominfo(elem)
                types = np.array(types, dtype = int)
                if type_idx_zero :
                    types = types - 1
            if elem.tag == 'calculation' :
                posi, cell, ener, forc, strs = analyze_calculation(elem)
                if cc >= begin and (cc - begin) % step == 0 :
                    all_posi.append(posi)
                    all_cell.append(cell)
                    all_ener.append(ener)
                    all_forc.append(forc)
                    if strs is not None :
                        all_strs.append(strs)                
                cc += 1
    except ET.ParseError:
        return eles, types, np.array(all_cell), np.array(all_posi), np.array(all_ener), np.array(all_forc), np.array(all_strs)
    return eles, types, np.array(all_cell), np.array(all_posi), np.array(all_ener), np.array(all_forc), np.array(all_strs)

Source File: update_resources.py From indra with BSD 2-Clause "Simplified" License

5 votes

def update_hmdb_chebi_map():
    logger.info('--Updating HMDB to ChEBI entries----')
    ns = {'hmdb': 'http://www.hmdb.ca'}
    url = 'http://www.hmdb.ca/system/downloads/current/hmdb_metabolites.zip'
    fname = os.path.join(path, 'hmdb_metabolites.zip')
    logger.info('Downloading %s' % url)
    #urlretrieve(url, fname)
    mappings = []
    with ZipFile(fname) as input_zip:
        with input_zip.open('hmdb_metabolites.xml') as fh:
            for event, elem in ET.iterparse(fh, events=('start', 'end')):
                #print(elem.tag)
                if event == 'start' and \
                        elem.tag == '{%s}metabolite' % ns['hmdb']:
                    hmdb_id = None
                    chebi_id = None
                # Important: we only look at accession if there's no HMDB
                # ID yet, otherwise we pick up secondary accession tags
                elif event == 'start' and \
                        elem.tag == '{%s}accession' % ns['hmdb'] and \
                        not hmdb_id:
                    hmdb_id = elem.text
                elif event == 'start' and \
                        elem.tag == '{%s}chebi_id' % ns['hmdb']:
                    chebi_id = elem.text
                elif event == 'end' and \
                        elem.tag == '{%s}metabolite' % ns['hmdb']:
                    if hmdb_id and chebi_id:
                        print(hmdb_id, chebi_id)
                        mappings.append([hmdb_id, chebi_id])
                elem.clear()
    fname = os.path.join(path, 'hmdb_to_chebi.tsv')
    mappings = [['HMDB_ID', 'CHEBI_ID']] + sorted(mappings, key=lambda x: x[0])
    write_unicode_csv(fname, mappings, delimiter='\t')

Source File: parser.py From fbchat-archive-parser with MIT License

5 votes

def _get_manifest_data(self):

        user, thread_references = None, []

        ignore_anchors = True
        saw_anchor = False

        # Cast to str to ensure not unicode under Python 2, as the parser
        # doesn't like that.
        parser = XMLParser(encoding=str('UTF-8'))
        element_iter = ET.iterparse(self.handle, events=("start", "end"), parser=parser)
        for pos, element in element_iter:
            tag, class_attr = _tag_and_class_attr(element)
            if tag == "h1" and pos == "end":
                if not self.user:
                    user = element.text.strip()
            elif tag == "div" and "content" in class_attr and pos == "start":
                ignore_anchors = False
            elif tag == "a" and pos == "start":
                if ignore_anchors:
                    continue
                saw_anchor = True
                participants = self.parse_participants(element)
                thread_path = re.sub(r'^../', '', element.attrib['href'])
                if using_windows():
                    thread_path = thread_path.replace('/', '\\')
                thread_references += [(participants, os.path.join(self.root, thread_path))]

        if not saw_anchor:
            # Indicator of a `messages.htm` file that is probably in the legacy format.
            raise UnsuitableParserError

        return user, thread_references

Source File: xlsx.py From pyRevit with GNU General Public License v3.0

5 votes

def own_process_stream(self, stream, heading=None):
        if self.verbosity >= 2 and heading is not None:
            fprintf(self.logfile, "\n=== %s ===\n", heading)
        getmethod = self.tag2meth.get
        row_tag = U_SSML12 + "row"
        self_do_row = self.do_row
        for event, elem in ET.iterparse(stream):
            if elem.tag == row_tag:
                self_do_row(elem)
                elem.clear() # destroy all child elements (cells)
            elif elem.tag == U_SSML12 + "dimension":
                self.do_dimension(elem)
            elif elem.tag == U_SSML12 + "mergeCell":
                self.do_merge_cell(elem)
        self.finish_off()

Python xml.etree.ElementTree.iterparse() Examples