Python xml.etree.ElementTree.iterparse() Examples
The following are 30 code examples of xml.etree.ElementTree.iterparse().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module xml.etree.ElementTree, or try the search function.
Example #1
Source File: xlsx.py From lambda-text-extractor with Apache License 2.0 | 6 votes |
def process_stream_iterparse(self, stream, heading=None):
    """Stream-parse ``stream`` and collect the text of every ``si`` element.

    Each element whose tag equals ``U_SSML12 + 'si'`` is converted with
    ``get_text_from_si_or_is`` and appended to ``self.bk._sharedstrings``.
    Diagnostic output is written to ``self.logfile`` depending on
    ``self.verbosity`` (>= 2: heading and summary, >= 3: per-element dump).

    :param stream: source handed to ``ET.iterparse``
    :param heading: optional title logged before parsing
    """
    if heading is not None and self.verbosity >= 2:
        fprintf(self.logfile, "\n=== %s ===\n", heading)
    shared_string_tag = U_SSML12 + 'si'
    shared_strings = self.bk._sharedstrings
    index = -1
    for _event, node in ET.iterparse(stream):
        if node.tag != shared_string_tag:
            continue
        index += 1
        if self.verbosity >= 3:
            fprintf(self.logfile, "element #%d\n", index)
            self.dump_elem(node)
        shared_strings.append(get_text_from_si_or_is(self, node))
        # The text has been extracted; drop the element's children.
        node.clear()
    if self.verbosity >= 2:
        self.dumpout('Entries in SST: %d', len(shared_strings))
    if self.verbosity >= 3:
        for position, text in enumerate(shared_strings):
            fprintf(self.logfile, "SST x=%d s=%r\n", position, text)
Example #2
Source File: OMIA.py From dipper with BSD 3-Clause "New" or "Revised" License | 6 votes |
def process_associations(self, limit):
    """
    Stream the gzipped XML data file and offer every parsed element to each
    association-table processor.

    For each element, ``self.process_xml_table`` is called once per table, in
    this order: Article_Breed, Article_Phene, Breed_Phene, Lida_Links,
    Phene_Gene, Group_MPO.

    :param limit: passed through unchanged to ``process_xml_table``
    :return: None
    """
    data_path = '/'.join((self.rawdir, self.files['data']['file']))
    with gzip.open(data_path, 'rb') as readbin:
        filereader = io.TextIOWrapper(readbin, newline="")
        filereader.readline()  # remove the xml declaration line
        # (table name, row handler) pairs, in the order they must be applied
        table_handlers = (
            ('Article_Breed', self._process_article_breed_row),
            ('Article_Phene', self._process_article_phene_row),
            ('Breed_Phene', self._process_breed_phene_row),
            ('Lida_Links', self._process_lida_links_row),
            ('Phene_Gene', self._process_phene_gene_row),
            ('Group_MPO', self._process_group_mpo_row),
        )
        for _event, elem in ET.iterparse(filereader):  # iterparse is not deprecated
            for table_name, row_handler in table_handlers:
                self.process_xml_table(elem, table_name, row_handler, limit)

# ############ INDIVIDUAL TABLE-LEVEL PROCESSING FUNCTIONS ################
Example #3
Source File: __init__.py From streamlink with BSD 2-Clause "Simplified" License | 6 votes |
def _parse_xml(data, strip_ns=False):
    """Parse ``data`` as XML and return the root element.

    On Python 2, text input is encoded to UTF-8; on Python 3 it is converted
    to a UTF-8 ``bytearray``. When ``strip_ns`` is true, the ``{namespace}``
    prefix is removed from every element tag.

    :raises ValueError: for any failure while parsing; the message contains
        the underlying error and the first 35 characters of ``repr(data)``.
    """
    if six.PY2 and isinstance(data, six.text_type):
        data = data.encode("utf8")
    elif six.PY3:
        data = bytearray(data, "utf8")

    try:
        parse_events = ET.iterparse(BytesIO(data))
        for _event, node in parse_events:
            if '}' in node.tag and strip_ns:
                # strip all namespaces
                node.tag = node.tag.split('}', 1)[1]
        return parse_events.root
    except Exception as err:
        full_repr = repr(data)
        snippet = full_repr[:35] + " ..." if len(full_repr) > 35 else full_repr
        raise ValueError("Unable to parse XML: {0} ({1})".format(err, snippet))
Example #4
Source File: path-flowmon-parse-result.py From ns3-load-balance with GNU General Public License v2.0 | 6 votes |
def main(argv): file_obj = open(argv[1]) print "Reading XML file ", sys.stdout.flush() level = 0 sim_list = [] for event, elem in ElementTree.iterparse(file_obj, events=("start", "end")): if event == "start": level += 1 if event == "end": level -= 1 if level == 0 and elem.tag == 'FlowMonitor': sim = Simulation(elem) sim_list.append(sim) elem.clear() # won't need this any more sys.stdout.write(".") sys.stdout.flush() print " done." for sim in sim_list: for flow in sim.flows: print "FlowID: %i" % flow.flowId, print flow.paths
Example #5
Source File: __init__.py From zulip with Apache License 2.0 | 6 votes |
def handleMatch(self, match: Match[str]) -> Element:
    """Render the matched TeX body and return it as an element.

    If ``render_tex`` returns ``None``, a ``<span class="tex-error">``
    holding the original ``$$body$$`` text is returned instead.
    """
    rendered = render_tex(match.group('body'), is_inline=True)
    if rendered is None:
        # Something went wrong while rendering
        error_span = Element('span')
        error_span.set('class', 'tex-error')
        error_span.text = '$$' + match.group('body') + '$$'
        return error_span

    # We need to give Python-Markdown an ElementTree object, but if we
    # give it one with correctly stored XML namespaces, it will mangle
    # everything when serializing it. So we play this stupid game to
    # store xmlns as a normal attribute. :-[
    assert ' zulip-xmlns="' not in rendered
    rendered = rendered.replace(' xmlns="', ' zulip-xmlns="')
    for _event, elem in etree.iterparse(StringIO(rendered)):
        if 'zulip-xmlns' in elem.attrib:
            elem.attrib['xmlns'] = elem.attrib.pop('zulip-xmlns')
        # the last element seen is the one returned
        root = elem
    return root
Example #6
Source File: asf_template.py From esa_sentinel with MIT License | 6 votes |
def process_metalink(self, ml_file):
    """Return the download URLs listed in a metalink file.

    The file is parsed with namespaces stripped from every tag. For each
    child of the root's ``files`` element, the text of its
    ``resources/url`` element is collected.

    :param ml_file: path of the metalink file to read
    :return: list of URL strings, or ``None`` when no URL was found
    """
    print("Processing metalink file: {0}".format(ml_file))
    with open(ml_file, 'r') as handle:
        contents = handle.read()

    # Hack to remove annoying namespace
    parsed = ET.iterparse(StringIO(contents))
    for _event, node in parsed:
        if '}' in node.tag:
            node.tag = node.tag.partition('}')[2]  # strip all namespaces

    file_entries = parsed.root.find('files')
    urls = [entry.find('resources').find('url').text for entry in file_entries]
    return urls if urls else None

# Get download urls from a csv file
Example #7
Source File: Objects.py From IFIscripts with MIT License | 6 votes |
def parse(filename):
    """Returns a DFXMLObject populated from the contents of the (string) filename argument.

    Objects produced by ``iterparse`` are attached as they arrive: volume
    objects are appended to the document object, and file objects are
    appended to whichever container is current (the open volume, or the
    document when no volume is open).
    """
    document = None
    container = None  # object that receives the next FileObject
    for (event, obj) in iterparse(filename):
        if event == "start":
            if isinstance(obj, DFXMLObject):
                document = container = obj
            elif isinstance(obj, VolumeObject):
                document.append(obj)
                container = obj
            continue
        if event != "end":
            continue
        if isinstance(obj, DFXMLObject) and document is None:
            document = container = obj
        if isinstance(obj, VolumeObject):
            # volume closed: later files attach to the document again
            container = document
        elif isinstance(obj, FileObject):
            container.append(obj)
    return document
Example #8
Source File: xlsx.py From InternationalizationScript-iOS with MIT License | 6 votes |
def process_stream_iterparse(self, stream, heading=None):
    """Collect the text of every ``si`` element found while parsing ``stream``.

    The result of ``get_text_from_si_or_is`` for each element tagged
    ``U_SSML12 + 'si'`` is appended to ``self.bk._sharedstrings``. Logging to
    ``self.logfile`` is controlled by ``self.verbosity``.

    :param stream: source handed to ``ET.iterparse``
    :param heading: optional title logged when verbosity is at least 2
    """
    if heading is not None and self.verbosity >= 2:
        fprintf(self.logfile, "\n=== %s ===\n", heading)
    wanted_tag = U_SSML12 + 'si'
    table = self.bk._sharedstrings
    seen = -1
    for _event, item in ET.iterparse(stream):
        if item.tag != wanted_tag:
            continue
        seen += 1
        if self.verbosity >= 3:
            fprintf(self.logfile, "element #%d\n", seen)
            self.dump_elem(item)
        table.append(get_text_from_si_or_is(self, item))
        item.clear()  # destroy all child elements
    if self.verbosity >= 2:
        self.dumpout('Entries in SST: %d', len(table))
    if self.verbosity >= 3:
        for slot, value in enumerate(table):
            fprintf(self.logfile, "SST x=%d s=%r\n", slot, value)
Example #9
Source File: xlsx.py From InternationalizationScript-iOS with MIT License | 6 votes |
def process_stream_iterparse(self, stream, heading=None):
    """Parse ``stream`` incrementally and record each ``si`` element's text.

    Elements tagged ``U_SSML12 + 'si'`` are converted with
    ``get_text_from_si_or_is`` and appended to ``self.bk._sharedstrings``;
    all other elements are ignored. ``self.verbosity`` selects how much is
    logged to ``self.logfile``.

    :param stream: source handed to ``ET.iterparse``
    :param heading: optional title logged when verbosity is at least 2
    """
    if heading is not None and self.verbosity >= 2:
        fprintf(self.logfile, "\n=== %s ===\n", heading)
    target_tag = U_SSML12 + 'si'
    strings = self.bk._sharedstrings
    counter = -1
    for _event, entry in ET.iterparse(stream):
        if entry.tag != target_tag:
            continue
        counter += 1
        if self.verbosity >= 3:
            fprintf(self.logfile, "element #%d\n", counter)
            self.dump_elem(entry)
        strings.append(get_text_from_si_or_is(self, entry))
        entry.clear()  # destroy all child elements
    if self.verbosity >= 2:
        self.dumpout('Entries in SST: %d', len(strings))
    if self.verbosity >= 3:
        for idx, string in enumerate(strings):
            fprintf(self.logfile, "SST x=%d s=%r\n", idx, string)
Example #10
Source File: parser.py From estnltk with GNU General Public License v2.0 | 6 votes |
def parse_and_remove(filename, path):
    """Incrementally parse ``filename``, yielding data as elements close.

    For every closed element a ``(tag, text)`` tuple is yielded. In addition,
    when the chain of open tags (root included) equals ``path.split('/')``,
    the element itself is yielded and then detached from its parent.

    :param filename: source handed to ``iterparse``
    :param path: slash-separated tag path, starting at the root tag
    """
    path_parts = path.split('/')
    doc = iterparse(filename, ('start', 'end'))
    tag_stack = []
    elem_stack = []
    for event, elem in doc:
        # Was ``event == 'start' in elem.tag``: a chained comparison that also
        # required the tag to contain "start", so the stacks stayed empty.
        if event == 'start':
            tag_stack.append(elem.tag)
            elem_stack.append(elem)
        elif event == 'end':
            yield elem.tag, elem.text
            if tag_stack == path_parts:
                yield elem
                # The root has no parent on the stack to detach it from.
                if len(elem_stack) > 1:
                    elem_stack[-2].remove(elem)
            try:
                tag_stack.pop()
                elem_stack.pop()
            except IndexError:
                pass
Example #11
Source File: xlsx.py From pyRevit with GNU General Public License v3.0 | 6 votes |
def process_stream_iterparse(self, stream, heading=None):
    """Gather the shared strings from ``stream`` using incremental parsing.

    Every element tagged ``U_SSML12 + 'si'`` is passed to
    ``get_text_from_si_or_is`` and the result appended to
    ``self.bk._sharedstrings``. Output to ``self.logfile`` depends on
    ``self.verbosity``.

    :param stream: source handed to ``ET.iterparse``
    :param heading: optional title logged when verbosity is at least 2
    """
    if heading is not None and self.verbosity >= 2:
        fprintf(self.logfile, "\n=== %s ===\n", heading)
    string_item_tag = U_SSML12 + 'si'
    sst_entries = self.bk._sharedstrings
    element_number = -1
    for _event, element in ET.iterparse(stream):
        if element.tag != string_item_tag:
            continue
        element_number += 1
        if self.verbosity >= 3:
            fprintf(self.logfile, "element #%d\n", element_number)
            self.dump_elem(element)
        sst_entries.append(get_text_from_si_or_is(self, element))
        element.clear()  # destroy all child elements
    if self.verbosity >= 2:
        self.dumpout('Entries in SST: %d', len(sst_entries))
    if self.verbosity >= 3:
        for number, entry in enumerate(sst_entries):
            fprintf(self.logfile, "SST x=%d s=%r\n", number, entry)
Example #12
Source File: create-corpus.py From tinysearch with MIT License | 6 votes |
def articles():
    """Yield an ``Article`` for each usable page in ``articles.xml.bz2``.

    A page is skipped when it has no title, when its title contains ``':'``,
    or when its revision, text element, or text content is missing.
    Articles are numbered consecutively from 0 in the order they are yielded.
    """
    mediawiki_tag = '{http://www.mediawiki.org/xml/export-0.10/}mediawiki'
    page_tag = '{http://www.mediawiki.org/xml/export-0.10/}page'
    title_tag = '{http://www.mediawiki.org/xml/export-0.10/}title'
    revision_tag = '{http://www.mediawiki.org/xml/export-0.10/}revision'
    text_tag = '{http://www.mediawiki.org/xml/export-0.10/}text'

    count = 0
    with bz2.BZ2File("articles.xml.bz2", 'r') as infile:
        for event, elem in iterparse(infile, events=("start", "end")):
            if event == 'start':
                if elem.tag == mediawiki_tag:
                    root = elem
                continue
            if event != 'end' or elem.tag != page_tag:
                continue

            title_elem = elem.find(title_tag)
            if title_elem is None:
                continue
            title = title_elem.text
            if title is None or ':' in title:
                continue
            revision = elem.find(revision_tag)
            if revision is None:
                continue
            text_elem = revision.find(text_tag)
            if text_elem is None:
                continue
            text = text_elem.text
            if text is None:
                continue

            yield Article(count, title, text)
            count += 1
            #if title == 'Zhang Heng':
            #    break
            # drop the pages accumulated under the root so far
            root.clear()
Example #13
Source File: sentinel.py From sarpy with MIT License | 6 votes |
def isa(filename):
    """Return ``Reader`` if ``filename`` is a SENTINEL-1 SLC manifest, else ``None``.

    Any exception raised while reading or inspecting the file is swallowed
    and treated as "not a match".
    """
    # Test to see if file is a manifest.safe file
    try:
        # First pass: collect the prefix -> URI namespace declarations
        ns = {prefix: uri
              for _, (prefix, uri) in ET.iterparse(filename, events=['start-ns'])}
        # Parse everything else
        root_node = ET.parse(filename).getroot()
        family_name = root_node.find(
            './metadataSection/metadataObject[@ID="platform"]/' +
            'metadataWrap/xmlData/safe:platform/safe:familyName', ns).text
        if family_name != 'SENTINEL-1':
            return None
        product_type = root_node.find(
            './metadataSection/metadataObject[@ID="generalProductInformation"]/' +
            'metadataWrap/xmlData/s1sarl1:standAloneProductInformation/' +
            's1sarl1:productType', ns).text
        if product_type == 'SLC':
            return Reader
    except Exception:
        pass
    return None
Example #14
Source File: Bootstrapper.py From discograph with MIT License | 6 votes |
def iterparse(source, tag):
    """Yield each outermost ``tag`` element of ``source`` once it has closed.

    Nested elements with the same tag are not yielded separately. After each
    yield the first element seen in the stream is cleared.

    :param source: source handed to ``ElementTree.iterparse``
    :param tag: tag name of the elements to yield
    """
    events = iter(ElementTree.iterparse(source, events=('start', 'end',)))
    # the first event is the "start" of the document root
    _, root = next(events)
    nesting = 0
    for kind, node in events:
        if node.tag != tag:
            continue
        if kind == 'start':
            nesting += 1
            continue
        nesting -= 1
        if nesting == 0:
            yield node
            root.clear()
Example #15
Source File: parser.py From fbchat-archive-parser with MIT License | 6 votes |
def parse_impl(self):
    """
    Parses the HTML content as a stream. This is far less memory
    intensive than loading the entire HTML file into memory, like
    BeautifulSoup does.

    The text of a closing ``h1`` becomes ``self.user`` when no user is set
    yet. Every opening ``div`` whose class attribute contains "thread" is
    parsed into a thread and passed to ``self.save_thread``.
    """
    # Cast to str to ensure not unicode under Python 2, as the parser
    # doesn't like that.
    parser = XMLParser(encoding=str('UTF-8'))
    element_iter = ET.iterparse(self.handle, events=("start", "end"), parser=parser)
    for pos, element in element_iter:
        tag, class_attr = _tag_and_class_attr(element)
        if tag == "h1" and pos == "end":
            if not self.user:
                self.user = element.text.strip()
            continue
        opens_thread = tag == "div" and "thread" in class_attr and pos == "start"
        if not opens_thread:
            continue
        participants = self.parse_participants(element)
        # parse_thread receives the same iterator this loop is consuming
        self.save_thread(self.parse_thread(participants, element_iter, True))
Example #16
Source File: lidcXmlHelper.py From LIDC-IDRI-processing with MIT License | 6 votes |
def create_xml_tree(filepath):
    """
    Method to ignore the namespaces if ElementTree is used.
    Necessary because ElementTree, by default, extends tag names by the
    namespace, but the namespaces used in the LIDC-IDRI dataset are not
    consistent.
    Solution based on
    https://stackoverflow.com/questions/13412496/python-elementtree-module-how-to-ignore-the-namespace-of-xml-files-to-locate-ma
    instead of ET.fromstring(xml)

    :param filepath: source handed to ``ET.iterparse``
    :return: root element, with the ``{namespace}`` prefix removed from all
        element tags and attribute names
    """
    it = ET.iterparse(filepath)
    for _, el in it:
        if '}' in el.tag:
            el.tag = el.tag.split('}', 1)[1]  # strip all namespaces
        # Iterate over a snapshot of the keys: the loop body inserts and
        # deletes entries, and mutating a dict while iterating its live key
        # view raises RuntimeError on Python 3.
        for at in list(el.attrib):  # strip namespaces of attributes too
            if '}' in at:
                newat = at.split('}', 1)[1]
                el.attrib[newat] = el.attrib.pop(at)
    return it.root
Example #17
Source File: primary.py From vmaas with GNU General Public License v2.0 | 6 votes |
def __init__(self, filename):
    """Load the ``rpm`` package entries from a ``primary`` metadata XML file.

    ``self.package_count`` is taken from the ``packages`` attribute of the
    ``metadata`` element. ``self.packages`` receives one dict per package of
    type ``rpm`` with the keys name, epoch, ver, rel, arch, summary,
    description and srpm.

    :param filename: source handed to ``eT.iterparse``
    """
    self.package_count = 0
    self.packages = []
    metadata_tag = "{%s}metadata" % NS["primary"]
    package_tag = "{%s}package" % NS["primary"]
    root = None
    for event, elem in eT.iterparse(filename, events=("start", "end")):
        if elem.tag == metadata_tag and event == "start":
            root = elem
            self.package_count = int(elem.get("packages"))
        elif elem.tag == package_tag and event == "end":
            if elem.get("type") != "rpm":
                continue
            evr = elem.find("primary:version", NS)
            self.packages.append({
                "name": text_strip(elem.find("primary:name", NS)),
                "epoch": evr.get("epoch"),
                "ver": evr.get("ver"),
                "rel": evr.get("rel"),
                "arch": text_strip(elem.find("primary:arch", NS)),
                "summary": text_strip(elem.find("primary:summary", NS)),
                "description": text_strip(elem.find("primary:description", NS)),
                "srpm": elem.find("primary:format", NS).find("rpm:sourcerpm", NS).text,
            })
            # Clear the XML tree continuously
            root.clear()
Example #18
Source File: OMIA.py From dipper with BSD 3-Clause "New" or "Revised" License | 6 votes |
def process_species(self, limit):
    """
    Stream the gzipped XML data file and pass every parsed element to
    ``self.process_xml_table`` for the ``Species_gb`` table, using
    ``self._process_species_table_row`` as the row handler.

    :param limit: passed through unchanged to ``process_xml_table``
    :return: None
    """
    data_path = '/'.join((self.rawdir, self.files['data']['file']))
    with gzip.open(data_path, 'rb') as readbin:
        text_stream = io.TextIOWrapper(readbin, newline="")
        text_stream.readline()  # remove the xml declaration line
        for _event, node in ET.iterparse(text_stream):
            # Species ids are == NCBITaxon ids
            self.process_xml_table(
                node, 'Species_gb', self._process_species_table_row, limit)
Example #19
Source File: Bootstrapper.py From discograph with MIT License | 5 votes |
def get_iterator(tag):
    """Return the cleaned element iterator for the gzipped XML file of ``tag``.

    The path comes from ``Bootstrapper.get_xml_path``; the opened gzip file
    is passed to ``Bootstrapper.iterparse`` and the result is wrapped with
    ``Bootstrapper.clean_elements``.
    """
    xml_path = Bootstrapper.get_xml_path(tag)
    # NOTE(review): the gzip handle is never closed here; presumably the
    # returned iterator still reads from it - confirm before adding a close.
    compressed = gzip.GzipFile(xml_path, 'r')
    elements = Bootstrapper.iterparse(compressed, tag)
    return Bootstrapper.clean_elements(elements)
Example #20
Source File: flowmon-parse-results.py From ns3-ecn-sharp with GNU General Public License v2.0 | 5 votes |
def main(argv): file_obj = open(argv[1]) print "Reading XML file ", sys.stdout.flush() level = 0 sim_list = [] for event, elem in ElementTree.iterparse(file_obj, events=("start", "end")): if event == "start": level += 1 if event == "end": level -= 1 if level == 0 and elem.tag == 'FlowMonitor': sim = Simulation(elem) sim_list.append(sim) elem.clear() # won't need this any more sys.stdout.write(".") sys.stdout.flush() print " done." for sim in sim_list: for flow in sim.flows: t = flow.fiveTuple proto = {6: 'TCP', 17: 'UDP'} [t.protocol] print "FlowID: %i (%s %s/%s --> %s/%i)" % \ (flow.flowId, proto, t.sourceAddress, t.sourcePort, t.destinationAddress, t.destinationPort) print "\tTX bitrate: %.2f kbit/s" % (flow.txBitrate*1e-3,) print "\tRX bitrate: %.2f kbit/s" % (flow.rxBitrate*1e-3,) print "\tMean Delay: %.2f ms" % (flow.delayMean*1e3,) print "\tPacket Loss Ratio: %.2f %%" % (flow.packetLossRatio*100)
Example #21
Source File: xlsx.py From pyRevit with GNU General Public License v3.0 | 5 votes |
def ensure_elementtree_imported(verbosity, logfile):
    """Bind the module-level ``ET`` to an ElementTree implementation, once.

    Does nothing if ``ET`` is already set. Otherwise the candidates are tried
    in order (plain ``xml.etree.ElementTree`` on IronPython; elsewhere
    cElementTree variants, lxml, then the pure-Python modules). The globals
    ``ET_has_iterparse`` and ``Element_has_iter`` are then updated to reflect
    what the chosen module supports.

    :param verbosity: when truthy, a line describing the chosen module is
        printed to ``logfile``
    :param logfile: file object receiving that line
    :raises Exception: if no ElementTree implementation can be imported
    """
    global ET, ET_has_iterparse, Element_has_iter
    if ET is not None:
        return
    if "IronPython" in sys.version:
        import xml.etree.ElementTree as ET
        #### 2.7.2.1: fails later with
        #### NotImplementedError: iterparse is not supported on IronPython. (CP #31923)
    else:
        try:
            import xml.etree.cElementTree as ET
        except ImportError:
            try:
                import cElementTree as ET
            except ImportError:
                try:
                    import lxml.etree as ET
                except ImportError:
                    try:
                        import xml.etree.ElementTree as ET
                    except ImportError:
                        try:
                            import elementtree.ElementTree as ET
                        except ImportError:
                            raise Exception("Failed to import an ElementTree implementation")
    if hasattr(ET, 'iterparse'):
        # Bytes literal: a text '' makes a bytes stream raise TypeError on
        # Python 3, while b'' is accepted on both Python 2 and 3.
        _dummy_stream = BYTES_IO(b'')
        try:
            # Probe only: some platforms expose iterparse but raise when called.
            ET.iterparse(_dummy_stream)
            ET_has_iterparse = True
        except NotImplementedError:
            pass
    Element_has_iter = hasattr(ET.ElementTree, 'iter')
    if verbosity:
        etree_version = repr([
            (item, getattr(ET, item))
            for item in ET.__dict__.keys()
            if item.lower().replace('_', '') == 'version'
        ])
        print(ET.__file__, ET.__name__, etree_version, ET_has_iterparse, file=logfile)
Example #22
Source File: element_iterator.py From python-mediawiki-utilities with MIT License | 5 votes |
def from_file(cls, f):
    """Return an ``EventPointer`` over the start/end parse events of ``f``."""
    parse_events = etree.iterparse(f, events=("start", "end"))
    return EventPointer(parse_events)
Example #23
Source File: proxy_QuickRank.py From rankeval with Mozilla Public License 2.0 | 5 votes |
def _count_nodes(file_path): """ Count the total number of nodes (both split and leaf nodes) in the model identified by file_path. Parameters ---------- file_path : str The path to the filename where the model has been saved Returns ------- tuple(n_trees, n_nodes) : tuple(int, int) The total number of trees and nodes (both split and leaf nodes) in the model identified by file_path. """ # get an iterable context = etree.iterparse(file_path, events=("end",)) # get the root element _, root = next(context) n_nodes = 0 n_trees = 0 for _, elem in context: if elem.tag == 'tree': n_trees += 1 elif elem.tag == 'feature' or elem.tag == 'output': n_nodes += 1 elem.clear() # discard the element root.clear() # remove root reference to the child return n_trees, n_nodes
Example #24
Source File: factory.py From network_tech with Apache License 2.0 | 5 votes |
def _strip_namespaces(xml): it = ElementTree.iterparse(StringIO(xml)) for _, el in it: if '}' in el.tag: el.tag = el.tag.split('}', 1)[1] # strip all namespaces return it.root
Example #25
Source File: cvemap.py From vmaas with GNU General Public License v2.0 | 5 votes |
def __init__(self, filename, lastmodified):
    """Load CVE records from a ``cvemap`` XML file into ``self.cves``.

    ``self.cves`` maps each ``Vulnerability`` name to a dict of its fields.
    The ``updated`` attribute of the ``cvemap`` element is used as the
    ``modified_date`` of every record.

    :param filename: source handed to ``eT.iterparse``
    :param lastmodified: stored unchanged as ``self.lastmodified``
    """
    self.lastmodified = lastmodified
    self.cves = {}
    root = None
    updated = None
    for event, elem in eT.iterparse(filename, events=("start", "end")):
        if elem.tag == "cvemap" and event == "start":
            root = elem
            updated = parse_datetime(elem.get('updated'))
            continue
        if not (elem.tag == "Vulnerability" and event == "end"):
            continue
        name = elem.get('name')
        record = {
            'impact': text_strip(elem.find('ThreatSeverity')),
            'published_date': parse_datetime(text_strip(elem.find('PublicDate'))),
            'modified_date': updated,
            'cvss2_score': text_strip(elem.find('CVSS/CVSSBaseScore')),
            'cvss2_metrics': text_strip(elem.find('CVSS/CVSSScoringVector')),
            'cvss3_score': text_strip(elem.find('CVSS3/CVSS3BaseScore')),
            'cvss3_metrics': text_strip(elem.find('CVSS3/CVSS3ScoringVector')),
            'cwe_list': self._cwe_list(text_strip(elem.find('CWE'))),
            'description': self._cve_description(elem.findall('Details[@{%s}lang="en:us"]' % NS)),
            'iava': text_strip(elem.find('IAVA')),
            'redhat_url': "https://access.redhat.com/security/cve/" + str.lower(name),
            'secondary_url': text_strip(elem.find('References'))
        }
        self.cves[name] = record
        # Clear the XML tree continuously
        root.clear()
Example #26
Source File: test_regressions.py From ironpython2 with Apache License 2.0 | 5 votes |
def test_gh370(self):
    """https://github.com/IronLanguages/ironpython2/issues/370"""
    from xml.etree import ElementTree as ET
    from StringIO import StringIO
    # The first event of a one-element document must be that element's end.
    events = ET.iterparse(StringIO('<root/>'))
    event, element = next(events)
    self.assertTrue(event == 'end' and element.tag == 'root')
Example #27
Source File: xml.py From dpdata with GNU Lesser General Public License v3.0 | 5 votes |
def analyze(fname, type_idx_zero=False, begin=0, step=1):
    """
    can deal with broken xml file

    Parses ``fname`` incrementally. The ``atominfo`` element provides the
    element names and type indices; each ``calculation`` element is one
    frame. Frames are kept starting at index ``begin`` and then every
    ``step``-th one. If the file is truncated (``ET.ParseError``), whatever
    was collected before the error is returned.

    :param fname: source handed to ``ET.iterparse``
    :param type_idx_zero: when true, 1 is subtracted from every type index
    :param begin: index of the first frame to keep
    :param step: stride between kept frames
    :return: ``(eles, types, cells, positions, energies, forces, stresses)``
    """
    all_posi, all_cell, all_ener, all_forc, all_strs = [], [], [], [], []
    frame_index = 0
    try:
        for _event, elem in ET.iterparse(fname):
            if elem.tag == 'atominfo':
                eles, types = analyze_atominfo(elem)
                types = np.array(types, dtype=int)
                if type_idx_zero:
                    types = types - 1
            if elem.tag == 'calculation':
                posi, cell, ener, forc, strs = analyze_calculation(elem)
                keep_frame = frame_index >= begin and (frame_index - begin) % step == 0
                if keep_frame:
                    all_posi.append(posi)
                    all_cell.append(cell)
                    all_ener.append(ener)
                    all_forc.append(forc)
                    if strs is not None:
                        all_strs.append(strs)
                frame_index += 1
    except ET.ParseError:
        # broken file: fall through and return what was read so far
        pass
    return (eles, types, np.array(all_cell), np.array(all_posi),
            np.array(all_ener), np.array(all_forc), np.array(all_strs))
Example #28
Source File: update_resources.py From indra with BSD 2-Clause "Simplified" License | 5 votes |
def update_hmdb_chebi_map():
    """Rebuild ``hmdb_to_chebi.tsv`` from the HMDB metabolites archive.

    Reads ``hmdb_metabolites.xml`` from ``hmdb_metabolites.zip`` under
    ``path``. For every metabolite with both an accession and a ChEBI ID,
    the pair is recorded. The pairs are written sorted by HMDB ID, below a
    header row, as a tab-separated file.
    """
    logger.info('--Updating HMDB to ChEBI entries----')
    ns = {'hmdb': 'http://www.hmdb.ca'}
    url = 'http://www.hmdb.ca/system/downloads/current/hmdb_metabolites.zip'
    fname = os.path.join(path, 'hmdb_metabolites.zip')
    logger.info('Downloading %s' % url)
    #urlretrieve(url, fname)
    metabolite_tag = '{%s}metabolite' % ns['hmdb']
    accession_tag = '{%s}accession' % ns['hmdb']
    chebi_tag = '{%s}chebi_id' % ns['hmdb']
    mappings = []
    hmdb_id = None
    chebi_id = None
    with ZipFile(fname) as input_zip:
        with input_zip.open('hmdb_metabolites.xml') as fh:
            for event, elem in ET.iterparse(fh, events=('start', 'end')):
                if event == 'start':
                    # A new metabolite begins: forget the previous IDs.
                    if elem.tag == metabolite_tag:
                        hmdb_id = None
                        chebi_id = None
                    continue
                # Text is only read on 'end' events: at 'start' the parser
                # does not guarantee that elem.text has been filled in yet.
                if elem.tag == accession_tag:
                    # Important: we only look at accession if there's no HMDB
                    # ID yet, otherwise we pick up secondary accession tags
                    if not hmdb_id:
                        hmdb_id = elem.text
                elif elem.tag == chebi_tag:
                    chebi_id = elem.text
                elif elem.tag == metabolite_tag:
                    if hmdb_id and chebi_id:
                        print(hmdb_id, chebi_id)
                        mappings.append([hmdb_id, chebi_id])
                # The element is complete and its data captured; free it.
                elem.clear()
    fname = os.path.join(path, 'hmdb_to_chebi.tsv')
    mappings = [['HMDB_ID', 'CHEBI_ID']] + sorted(mappings, key=lambda x: x[0])
    write_unicode_csv(fname, mappings, delimiter='\t')
Example #29
Source File: parser.py From fbchat-archive-parser with MIT License | 5 votes |
def _get_manifest_data(self):
    """Extract the user name and thread references from the manifest.

    Anchors are ignored until a ``div`` whose class attribute contains
    "content" has opened. For each later anchor, the participants and the
    thread file path (``href`` resolved against ``self.root``) are recorded.

    :return: ``(user, thread_references)``, where ``thread_references`` is a
        list of ``(participants, path)`` tuples
    :raises UnsuitableParserError: if no anchor was seen after the content div
    """
    user, thread_references = None, []
    ignore_anchors = True
    saw_anchor = False
    # Cast to str to ensure not unicode under Python 2, as the parser
    # doesn't like that.
    parser = XMLParser(encoding=str('UTF-8'))
    element_iter = ET.iterparse(self.handle, events=("start", "end"), parser=parser)
    for pos, element in element_iter:
        tag, class_attr = _tag_and_class_attr(element)
        if tag == "h1" and pos == "end":
            # NOTE(review): tests self.user but assigns the local `user` -
            # confirm this is intended.
            if not self.user:
                user = element.text.strip()
        elif tag == "div" and "content" in class_attr and pos == "start":
            ignore_anchors = False
        elif tag == "a" and pos == "start":
            if ignore_anchors:
                continue
            saw_anchor = True
            participants = self.parse_participants(element)
            # Dots escaped: the unescaped pattern '^../' matched ANY two
            # leading characters followed by '/', not just a literal '../'.
            thread_path = re.sub(r'^\.\./', '', element.attrib['href'])
            if using_windows():
                thread_path = thread_path.replace('/', '\\')
            thread_references.append(
                (participants, os.path.join(self.root, thread_path)))
    if not saw_anchor:
        # Indicator of a `messages.htm` file that is probably in the legacy format.
        raise UnsuitableParserError
    return user, thread_references
Example #30
Source File: xlsx.py From pyRevit with GNU General Public License v3.0 | 5 votes |
def own_process_stream(self, stream, heading=None):
    """Stream-parse ``stream`` and dispatch row, dimension and mergeCell elements.

    ``row`` elements go to ``self.do_row`` and are then cleared;
    ``dimension`` and ``mergeCell`` elements go to ``self.do_dimension`` and
    ``self.do_merge_cell``. ``self.finish_off()`` is called after parsing.

    :param stream: source handed to ``ET.iterparse``
    :param heading: optional title logged when ``self.verbosity`` >= 2
    """
    if self.verbosity >= 2 and heading is not None:
        fprintf(self.logfile, "\n=== %s ===\n", heading)
    # Tag names are loop-invariant, so build them once up front.
    # (The unused ``getmethod`` local was dropped.)
    row_tag = U_SSML12 + "row"
    dimension_tag = U_SSML12 + "dimension"
    merge_cell_tag = U_SSML12 + "mergeCell"
    self_do_row = self.do_row
    for event, elem in ET.iterparse(stream):
        if elem.tag == row_tag:
            self_do_row(elem)
            elem.clear()  # destroy all child elements (cells)
        elif elem.tag == dimension_tag:
            self.do_dimension(elem)
        elif elem.tag == merge_cell_tag:
            self.do_merge_cell(elem)
    self.finish_off()