Python Examples of lxml.etree.XMLParser

Source File: sim_files.py From simLAB with GNU General Public License v2.0

6 votes

def readXml(self, simType):
        """Load (or create) the XML tree describing the SIM files.

        The XML file is looked up in the directory of this module:
        ``sim_files_3g.xml`` when ``simType`` equals ``types.TYPE_USIM``,
        ``sim_files_2g.xml`` otherwise.  When that file does not exist, a new
        empty root element (``sim_3G`` / ``sim_2G``) is created instead.

        Returns:
            A ``(path, root)`` tuple: the path of the XML file and the root
            element, either parsed from disk or newly created.
        """
        is_usim = simType == types.TYPE_USIM
        file_name = "sim_files_3g.xml" if is_usim else "sim_files_2g.xml"
        path = os.path.join(os.path.dirname(__file__), file_name)
        if not os.path.exists(path):
            logging.warning("File %s not exists", path)
            logging.info("Create xml")
            root = etree.Element('sim_3G' if is_usim else 'sim_2G')
        else:
            # remove_blank_text drops whitespace-only text between elements.
            parser = etree.XMLParser(remove_blank_text=True)
            root = etree.parse(path, parser).getroot()
        return path, root

Source File: scan.py From report-ng with GNU General Public License v2.0

6 votes

def __init__(self, filename, requests_and_responses=False):
        """Load a scan report from ``filename``.

        The format is chosen from the file name: ``.json`` files are parsed
        with ``json.loads``, ``.yaml`` files with ``yaml_load``; anything
        else is parsed as XML and handed to an importer selected by the
        root tag (``Sessions``, ``issues`` or ``items``).

        Raises:
            Exception: the XML root tag is none of the known formats.
        """
        self._filename = filename
        self._requests_and_responses = requests_and_responses
        if filename.endswith('.json'):
            # Close the file deterministically instead of leaking the handle.
            with open(filename) as scan_file:
                raw = scan_file.read()
            # NOTE(review): ``.decode`` on the result of ``read()`` only works
            # where text-mode reads return byte strings (Python 2) -- confirm
            # the targeted interpreter before porting.
            self._scan = json.loads(raw.decode('utf-8-sig'), object_pairs_hook=UnsortableOrderedDict)
        elif filename.endswith('.yaml'):
            with open(filename) as scan_file:
                raw = scan_file.read()
            self._scan = yaml_load(raw, yaml.SafeLoader, UnsortableOrderedDict)
        else:
            # xml
            etree_parser = etree.XMLParser(huge_tree=True)
            self._xml = etree.parse(filename, parser=etree_parser)
            root = self._xml.getroot()
            if root.tag == 'Sessions':
                self._webinspect_import()
            elif root.tag == 'issues':
                self._burp_import()
            elif root.tag == 'items':
                self._burp_items_import()
            else:
                raise Exception('Unknown scan format!')

Source File: sso.py From vsphere-automation-sdk-python with MIT License

6 votes

def _canonicalize(xml_string):
    '''
    Produce the exclusive canonical form of an XML string, as defined by
    U{http://www.w3.org/2001/10/xml-exc-c14n#}. Comments are not included
    in the output.

    @type  xml_string: C{str}
    @param xml_string: The XML text to canonicalize.

    @rtype: C{str}
    @return: The canonical form, decoded to Unicode.
    '''
    blank_stripper = etree.XMLParser(remove_blank_text=True)
    root = etree.fromstring(xml_string, parser=blank_stripper)
    document = root.getroottree()
    buf = BytesIO()
    document.write_c14n(buf, exclusive=True, with_comments=False)
    canonical_bytes = buf.getvalue()
    return canonical_bytes.decode(UTF_8)

Source File: views.py From MobileSF with GNU General Public License v3.0

6 votes

def findBodyType(request):
    """Guess the type of a request body.

    ``request["body"]`` is probed in turn as JSON, as XML and as a query
    string; the last probe that succeeds wins, so a body that also parses
    as a query string is reported as ``"form"``.

    Returns:
        ``"json"``, ``"xml"``, ``"form"`` or ``"none"`` (empty body, or no
        probe matched).  If an unexpected error occurs it is reported via
        ``PrintException`` and ``None`` is returned implicitly.
    """
    bd_typ = "none"
    try:
        body = request["body"]
        if body:
            # Each probe is best-effort: a parse failure only means "not this
            # format".  ``except Exception`` (rather than a bare ``except``)
            # lets KeyboardInterrupt / SystemExit propagate.
            try:
                json.loads(body)
                bd_typ = "json"
            except Exception:
                pass
            try:
                # Prevent Entity Expansion Attacks against the Framework
                config = etree.XMLParser(remove_blank_text=True, resolve_entities=False)
                etree.fromstring(body, config)
                bd_typ = "xml"
            except Exception:
                pass
            qs = parse_qs(body)
            if qs:
                bd_typ = "form"
        return bd_typ
    except Exception:
        PrintException("[ERROR] Finding Request Body type")

Source File: site.py From shareplum with MIT License

6 votes

def get_site_templates(self, lcid="1033"):
        """Send a ``GetSiteTemplates`` SOAP request to the ``Sites`` service.

        ``lcid`` is sent as the ``LCID`` request parameter.  The string form
        of the request is stored in ``self.last_request``.

        As written, the value returned is whatever ``post`` returned; the
        envelope parsing further down is never executed (see note below).
        """

        # Build Request
        soap_request = Soap("GetSiteTemplates")
        soap_request.add_parameter("LCID", lcid)
        self.last_request = str(soap_request)

        # Send Request
        response = post(self._session,
                        url=self._url("Sites"),
                        headers=self._headers("GetSiteTemplates"),
                        data=str(soap_request).encode("utf-8"),
                        verify=self._verify_ssl,
                        timeout=self.timeout)

        # NOTE(review): this early return makes everything below unreachable,
        # so callers get the raw response instead of a list of attribute
        # dicts.  It looks like a debugging leftover -- confirm the intended
        # return value before removing either the return or the dead code,
        # since existing callers may rely on the raw response.
        return response
        envelope = etree.fromstring(response.text.encode("utf-8"),
                                    parser=etree.XMLParser(huge_tree=self.huge_tree,
                                    recover=True))
        lists = envelope[0][0][1]
        data = []
        for _list in lists:
            data.append({k: v for (k, v) in _list.items()})

        return data

Source File: site.py From shareplum with MIT License

6 votes

def get_list_templates(self):
        """Send a ``GetListTemplates`` SOAP request to the ``Webs`` service.

        The string form of the request is stored in ``self.last_request``.
        The response body is parsed leniently (``recover=True``) and the
        attributes of every child of ``envelope[0][0][0][0]`` are collected.

        Returns:
            A list with one ``{attribute: value}`` dict per child element.
        """
        # Build Request
        request = Soap("GetListTemplates")
        request.add_parameter("GetListTemplates")
        self.last_request = str(request)

        # Send Request
        response = post(
            self._session,
            url=self._url("Webs"),
            headers=self._headers("GetListTemplates"),
            data=str(request).encode("utf-8"),
            verify=self._verify_ssl,
            timeout=self.timeout,
        )

        # Parse Response
        payload = response.text.encode("utf-8")
        lenient_parser = etree.XMLParser(huge_tree=self.huge_tree, recover=True)
        envelope = etree.fromstring(payload, parser=lenient_parser)
        templates = envelope[0][0][0][0]
        return [dict(template.items()) for template in templates]

Source File: site.py From shareplum with MIT License

6 votes

def get_site(self):
        """Send a ``GetSite`` SOAP request for ``self.site_url`` to the ``Sites`` service.

        The string form of the request is stored in ``self.last_request``.
        The response body is parsed leniently (``recover=True``).

        Returns:
            The text of the element at ``envelope[0][0][0]``.
        """
        # Build Request
        request = Soap("GetSite")
        request.add_parameter("SiteUrl", self.site_url)
        self.last_request = str(request)

        # Send Request
        response = post(
            self._session,
            url=self._url("Sites"),
            headers=self._headers("GetSite"),
            data=str(request).encode("utf-8"),
            verify=self._verify_ssl,
            timeout=self.timeout,
        )

        # Parse Response
        payload = response.text.encode("utf-8")
        lenient_parser = etree.XMLParser(huge_tree=self.huge_tree, recover=True)
        envelope = etree.fromstring(payload, parser=lenient_parser)
        result_node = envelope[0][0][0]

        # TODO: Not sure what to do with this, so just return the text
        return result_node.text

Source File: romeo.py From dissemin with GNU Affero General Public License v3.0

6 votes

def perform_romeo_query(self, search_terms):
        """Query the RoMEO endpoint at ``self.base_url`` and parse the XML reply.

        A copy of ``search_terms`` is sent as query parameters, with
        ``self.api_key`` added under ``'ak'`` when it is set.  The caller's
        dict is left untouched.

        Returns:
            The parsed document, as returned by ``ET.parse``.

        Raises:
            MetadataSourceException: the HTTP request failed or the
                response could not be parsed as XML.
        """
        params = search_terms.copy()
        if self.api_key:
            params['ak'] = self.api_key

        def describe(headline, error):
            # Shared layout for both failure messages.
            return ''.join([
                headline,
                'URL was: ', self.base_url, '\n',
                'Parameters were: ', str(params), '\n',
                'Error is: ', str(error),
            ])

        # Perform the query
        try:
            response = requests.get(self.base_url, params=params, timeout=20)
        except requests.exceptions.RequestException as e:
            raise MetadataSourceException(
                describe('Error while querying RoMEO.\n', e))

        # Parse it
        try:
            latin1_parser = ET.XMLParser(encoding='ISO-8859-1')
            document = ET.parse(BytesIO(response.content), latin1_parser)
        except ET.ParseError as e:
            raise MetadataSourceException(
                describe('RoMEO returned an invalid XML response.\n', e))

        return document

Source File: youtube.py From xblock-video with GNU General Public License v3.0

6 votes

def download_default_transcript(self, url=None, language_code=None):  # pylint: disable=unused-argument
        """
        Fetch the default transcript from ``url`` and return it as WebVTT-like unicode.

        Every child of the downloaded XML root is formatted with
        `format_transcript_element()` (numbered from 1) and the pieces are
        concatenated. A "WEBVTT" header is prepended unless the text already
        contains "WEBVTT".

        Reference to `get_transcripts_from_youtube()`:
            https://github.com/edx/edx-platform/blob/ecc3473d36b3c7a360e260f8962e21cb01eb1c39/common/lib/xmodule/xmodule/video_module/transcripts_utils.py#L122
        """
        if url is None:
            raise VideoXBlockException(_('`url` parameter is required.'))
        parser = etree.XMLParser(encoding='utf-8')
        response = requests.get(url)
        root = etree.fromstring(response.content, parser=parser)
        pieces = []
        for position, node in enumerate(root, 1):
            pieces.append(self.format_transcript_element(node, position))
        body = "".join(pieces)
        if "WEBVTT" in body:
            return unicode(body)
        return u"WEBVTT\n\n" + unicode(body)

Source File: generic.py From n6 with GNU Affero General Public License v3.0

6 votes

def iter_entry(self, data):
        """
        Parse the raw data body into an XML element tree.

        Args:
            `data` (dict):
                As returned by prepare_data(); its 'raw' item holds the
                raw data body.

        Returns:
            The root element produced by parsing the raw body (namespace
            declarations cleaned, blank text removed).
        """
        body = StringIO(data['raw']).getvalue()
        xml_parser = etree.XMLParser(ns_clean=True, remove_blank_text=True)
        return etree.fromstring(str(body), xml_parser)

Source File: protocol.py From dissemin with GNU Affero General Public License v3.0

6 votes

def get_new_status(self, identifier):
        """
        Fetch the current status of a deposit by its ID (e.g. hal-0001234)
        and translate it.

        Returns 'deleted' when the server answers HTTP 400. Otherwise the
        text of the receipt's ``status`` element is mapped: accept/replace
        -> 'published', verify/update -> 'pending', delete -> 'refused';
        any other value yields None. Other HTTP errors are raised by
        ``raise_for_status()``.
        """
        deposit_url = '%s%s' % (self.api_url, identifier)
        credentials = requests.auth.HTTPBasicAuth(self.username, self.password)
        response = requests.get(deposit_url, auth=credentials)
        if response.status_code == 400:
            return 'deleted'
        response.raise_for_status()

        utf8_parser = etree.XMLParser(encoding='utf-8')
        payload = BytesIO(response.text.encode('utf-8'))
        receipt_root = etree.parse(payload, utf8_parser).getroot()

        status_map = {
            'accept': 'published',
            'replace': 'published',
            'verify': 'pending',
            'update': 'pending',
            'delete': 'refused',
        }
        return status_map.get(receipt_root.find('status').text)

Source File: files.py From janeway with GNU Affero General Public License v3.0

6 votes

def transform_with_xsl(xml_path, xsl_path, recover=False):
    """Apply the XSL stylesheet at ``xsl_path`` to the XML file at ``xml_path``.

    Args:
        xml_path: path of the XML document to transform.
        xsl_path: path of the XSLT stylesheet.
        recover: when true, a syntactically broken XML file is re-parsed
            with a recovering parser, and a failing transformation is
            logged instead of raised.

    Returns:
        The transformed document, or ``None`` when the transformation
        failed and ``recover`` is true.

    Raises:
        etree.XMLSyntaxError: the XML is malformed and ``recover`` is false.
        Exception: whatever the transformation raised, when ``recover`` is
            false.
    """
    try:
        xml_dom = etree.parse(xml_path)
    except etree.XMLSyntaxError as e:
        if not recover:
            raise
        logger.error(e)
        parser = etree.XMLParser(recover=True)
        xml_dom = etree.parse(xml_path, parser=parser)

    xsl_transform = etree.XSLT(etree.parse(xsl_path))
    # Pre-bind the result so the recover path returns None instead of
    # hitting an UnboundLocalError at the return statement.
    transformed_dom = None
    try:
        transformed_dom = xsl_transform(xml_dom)
    except Exception as err:
        logger.error(err)
        for xsl_error in xsl_transform.error_log:
            logger.error(xsl_error)
        if not recover:
            raise

    return transformed_dom

Source File: intellij_set_default_inspection_profile.py From ansible-role-intellij with MIT License

5 votes

def pretty_print(elem):
    """Return ``elem`` serialized as indented ISO-8859-1 XML, without an XML declaration.

    The element is serialized and re-parsed with blank text removed before
    the final pretty-printed serialization.
    """
    charset = 'iso-8859-1'
    serialized = etree.tostring(elem, encoding=charset)
    blank_stripper = etree.XMLParser(remove_blank_text=True)
    reparsed = etree.fromstring(serialized, blank_stripper)
    return etree.tostring(
        reparsed,
        encoding=charset,
        pretty_print=True,
        xml_declaration=False,
    )

Source File: pokerstars.py From poker with MIT License

5 votes

def __init__(self, notes: str):
        """Store the raw notes text and parse it into ``self.root``.

        Parsing uses a recovering parser that does not resolve entities.
        """
        lenient_parser = etree.XMLParser(recover=True, resolve_entities=False)
        encoded_notes = notes.encode()
        self.raw = notes
        self.root = etree.XML(encoded_notes, lenient_parser)

Source File: intellij_configure_jdk.py From ansible-role-intellij with MIT License

5 votes

def pretty_print(elem):
    """Return ``elem`` serialized as indented ISO-8859-1 XML, without an XML declaration.

    The element is serialized and re-parsed with blank text removed before
    the final pretty-printed serialization.
    """
    charset = 'iso-8859-1'
    serialized = etree.tostring(elem, encoding=charset)
    blank_stripper = etree.XMLParser(remove_blank_text=True)
    reparsed = etree.fromstring(serialized, blank_stripper)
    return etree.tostring(
        reparsed,
        encoding=charset,
        pretty_print=True,
        xml_declaration=False,
    )

Source File: intellij_set_default_jdk.py From ansible-role-intellij with MIT License

5 votes

def pretty_print(elem):
    """Return ``elem`` serialized as indented ISO-8859-1 XML, without an XML declaration.

    The element is serialized and re-parsed with blank text removed before
    the final pretty-printed serialization.
    """
    charset = 'iso-8859-1'
    serialized = etree.tostring(elem, encoding=charset)
    blank_stripper = etree.XMLParser(remove_blank_text=True)
    reparsed = etree.fromstring(serialized, blank_stripper)
    return etree.tostring(
        reparsed,
        encoding=charset,
        pretty_print=True,
        xml_declaration=False,
    )

Source File: selector.py From ChemDataExtractor with MIT License

5 votes

def from_xml_text(cls, text, base_url=None, namespaces=None, encoding=None):
        """Build an instance from XML text.

        Delegates to ``cls.from_text`` with the XML parser, the CSS-to-XML
        translator and ``fmt='xml'``; the remaining arguments are passed
        through unchanged.
        """
        return cls.from_text(
            text,
            base_url=base_url,
            parser=XMLParser,
            translator=CssXmlTranslator,
            fmt='xml',
            namespaces=namespaces,
            encoding=encoding,
        )

Source File: intellij_set_default_maven.py From ansible-role-intellij with MIT License

5 votes

def pretty_print(elem):
    """Return ``elem`` serialized as indented ISO-8859-1 XML, without an XML declaration.

    The element is serialized and re-parsed with blank text removed before
    the final pretty-printed serialization.
    """
    charset = 'iso-8859-1'
    serialized = etree.tostring(elem, encoding=charset)
    blank_stripper = etree.XMLParser(remove_blank_text=True)
    reparsed = etree.fromstring(serialized, blank_stripper)
    return etree.tostring(
        reparsed,
        encoding=charset,
        pretty_print=True,
        xml_declaration=False,
    )

Source File: main.py From sysmon-config-bypass-finder with GNU General Public License v3.0

5 votes

def _read_config_to_json(sysmon_config):
    """Convert the ``EventFiltering`` section of a Sysmon XML config to plain data.

    The file is parsed with comments removed.  Every child of the
    ``EventFiltering`` element becomes one rule dict, and every child of a
    rule becomes one condition dict.

    Args:
        sysmon_config: source accepted by ``objectify.parse`` for the
            Sysmon configuration.

    Returns:
        A list of dicts with keys ``rule_type`` (rule tag), ``on_match``
        (``onmatch`` attribute) and ``conditions``; each condition is a dict
        with keys ``operator`` (``condition`` attribute), ``content``
        (element text) and ``condition_type`` (element tag).
    """
    parser = etree.XMLParser(remove_comments=True)
    root = objectify.parse(sysmon_config, parser=parser).getroot()
    event_filtering = root.find('EventFiltering')

    configuration = []
    # iterchildren() replaces the deprecated getchildren() and matches the
    # inner loop.  Plain iteration is avoided on purpose: objectify elements
    # iterate over same-tag siblings, not over their children.
    for rule in event_filtering.iterchildren():
        conditions = [
            {
                'operator': condition.get('condition'),
                'content': condition.text,
                'condition_type': condition.tag,
            }
            for condition in rule.iterchildren()
        ]
        configuration.append({
            'rule_type': rule.tag,
            'on_match': rule.get('onmatch'),
            'conditions': conditions,
        })
    return configuration

Source File: parser.py From avacity-2.0 with MIT License

5 votes

def __init__(self):
        """Create the comment-stripping XML parser and the ``apprnc_map`` key list."""
        self.apprnc_map = [
            "sc", "et", "brt", "at",
            "ht", "bt", "sh", "rg",
            "ss", "pt", "fat", "fft",
        ]
        self.parser = etree.XMLParser(remove_comments=True)

Source File: xml.py From ansible-xml with GNU General Public License v3.0

5 votes

def child_to_element(module, child, in_type):
    """Turn one child description into an element.

    * ``in_type == 'xml'``: ``child`` is XML text; it is parsed and its root
      element returned.
    * ``in_type == 'yaml'``: ``child`` is either a string (used as the tag
      of an empty element) or a single-key mapping ``{tag: value}``.  A
      mapping ``value`` supplies the attributes, with the optional ``'_'``
      key (popped from the mapping) holding a list of nested children;
      any other ``value`` becomes the element's text.

    Every invalid input is reported through ``module.fail_json``.
    """
    if in_type == 'xml':
        source = BytesIO(to_bytes(child, errors='surrogate_or_strict'))

        try:
            return etree.parse(source, etree.XMLParser()).getroot()
        except etree.XMLSyntaxError as e:
            module.fail_json(msg="Error while parsing child element: %s" % e)
    elif in_type == 'yaml':
        if isinstance(child, string_types):
            return etree.Element(child)
        elif isinstance(child, MutableMapping):
            if len(child) > 1:
                module.fail_json(msg="Can only create children from hashes with one key")

            (tag, payload) = next(iteritems(child))
            if not isinstance(payload, MutableMapping):
                # Scalar payload: it is the text content of the new element.
                element = etree.Element(tag)
                element.text = payload
                return element

            # Mapping payload: '_' carries nested children, the rest are attributes.
            nested = payload.pop('_', None)
            element = etree.Element(tag, payload)
            if nested is not None:
                if not isinstance(nested, list):
                    module.fail_json(msg="Invalid children type: %s, must be list." % type(nested))

                element.extend(children_to_nodes(module, nested))
            return element
        else:
            module.fail_json(msg="Invalid child type: %s. Children must be either strings or hashes." % type(child))
    else:
        module.fail_json(msg="Invalid child input type: %s. Type must be either xml or yaml." % in_type)

Source File: _lxml.py From nzb-subliminal with GNU General Public License v3.0

5 votes

def default_parser(self, encoding):
        """Return the parser to use for the given encoding.

        A parser preset in ``self._default_parser`` (an object, or a class
        that will be instantiated with default arguments) takes precedence;
        otherwise a recovering ``etree.XMLParser`` targeting this object is
        built, with CDATA kept.
        """
        preset = self._default_parser
        if preset is None:
            return etree.XMLParser(
                target=self,
                strip_cdata=False,
                recover=True,
                encoding=encoding,
            )
        return preset

Source File: _lxml.py From B.E.N.J.I. with MIT License

5 votes

def default_parser(self, encoding):
        """Return the parser to use for the given encoding.

        A parser preset in ``self._default_parser`` (an object, or a class
        that will be instantiated with default arguments) takes precedence;
        otherwise a recovering ``etree.XMLParser`` targeting this object is
        built, with CDATA kept.
        """
        preset = self._default_parser
        if preset is None:
            return etree.XMLParser(
                target=self,
                strip_cdata=False,
                recover=True,
                encoding=encoding,
            )
        return preset

Source File: conftest.py From pycon with MIT License

5 votes

def sample_invoice_xml():
    """Load the sample invoice XML with all element text stripped.

    The file ``../data/IT01234567890_FPA01.xml`` (relative to this module)
    is parsed with blank text removed; afterwards the text of every element
    that has any is stripped of surrounding whitespace.

    Returns the parsed document.
    """
    fixture_path = os.path.join(
        os.path.dirname(__file__), "../data/IT01234567890_FPA01.xml"
    )
    blank_stripper = etree.XMLParser(remove_blank_text=True)
    document = etree.parse(fixture_path, parser=blank_stripper)

    for node in document.iter("*"):
        content = node.text
        if content is None:
            continue
        node.text = content.strip()

    return document

Source File: client.py From nsxramlclient with MIT License

5 votes

def get_xml_example_by_displayname(self, display_name, method, remove_content=None, remove_comments=None):
        """Return the XML body example of a RAML resource as an element tree.

        Args:
            display_name: displayName of the resource to look up.
            method: one of 'read', 'create', 'delete', 'update' (mapped to
                the HTTP methods get/post/delete/put).
            remove_content: when true (the default, also used for ``None``),
                the text and tail of every element yielded by
                ``self._iterparent`` are cleared.  Pass ``False`` to keep
                the example content.
            remove_comments: when true (the default, also used for
                ``None``), XML comments are dropped while parsing.  Pass
                ``False`` to keep them.

        Returns:
            The root element of the parsed example.

        Raises:
            AssertionError: the resource is not found, does not support the
                method, or has no body for it.
            KeyError: ``method`` is not one of the four supported names.
            Exception: the example is not well-formed XML.
        """
        # Only fall back to the default when the argument was omitted; the
        # previous ``if not ...`` test also overrode an explicit False.
        if remove_content is None:
            remove_content = True
        if remove_comments is None:
            remove_comments = True
        method_options = {'read': 'get', 'create': 'post', 'delete': 'delete', 'update': 'put'}
        matched_resource = self.find_resource_recursively(display_name)

        assert matched_resource, 'The searched displayName could not be found in RAML File'
        assert method_options[method] in matched_resource[1].methods, 'the resource does not support ' \
                                                                      'the {} method'.format(method)
        assert matched_resource[1].methods[method_options[method]].body, 'the resource does not have a ' \
                                                                         'body schema in the RAML File'

        matched_resource_body = matched_resource[1].methods[method_options[method]].body
        example = matched_resource_body['application/xml'].example
        try:
            parser = et.XMLParser(remove_comments=remove_comments)
            example_et = et.fromstring(example, parser=parser)
        except et.XMLSyntaxError as e:
            raise Exception('The parsing of the body example XML failed, please check the format in the RAML file,'
                            'the execption is:\n{}'.format(e))

        if remove_content:
            for _parent, child in self._iterparent(example_et):
                child.text = None
                child.tail = None

        return example_et

Source File: conftest.py From dissemin with GNU Affero General Public License v3.0

5 votes

def dissemin_xml_1_0():
    '''
    Parse ``schema/test_data/dissemin_v1.0.xml`` (located next to this
    module) with blank text removed and return its root element, ready to
    be manipulated and validated.
    '''
    here = os.path.dirname(os.path.abspath(__file__))
    sample_path = os.path.join(here, 'schema', 'test_data', 'dissemin_v1.0.xml')
    blank_stripper = etree.XMLParser(remove_blank_text=True)
    document = etree.parse(sample_path, blank_stripper)
    return document.getroot()

Source File: test_romeo.py From dissemin with GNU Affero General Public License v3.0

5 votes

def perform_romeo_query(self, search_terms):
        """Answer a RoMEO query from a file in ``self.datadir``, fetching it on a miss.

        The file name is built from the sorted ``key-value`` pairs of
        ``search_terms`` (spaces in values replaced by underscores), joined
        with ``_`` and suffixed with ``.xml``.  When the file cannot be
        read, the parent implementation performs the query and its result
        is written to that file before being returned.
        """
        pairs = sorted(
            '{}-{}'.format(key, val.replace(' ', '_'))
            for key, val in search_terms.items()
        )
        cache_path = os.path.join(self.datadir, '_'.join(pairs) + '.xml')
        try:
            with open(cache_path, 'rb') as cached:
                latin1_parser = etree.XMLParser(encoding='ISO-8859-1')
                return etree.parse(cached, latin1_parser)
        except IOError:
            fetched = super(RomeoAPIStub, self).perform_romeo_query(search_terms)
            with open(cache_path, 'wb') as cached:
                fetched.write(cached)
            return fetched

# SHERPA/RoMEO interface

Source File: romeo.py From dissemin with GNU Affero General Public License v3.0

5 votes

def get_romeo_latest_update_date(self):
        """
        Download the RoMEO "download dates" document and extract the dates
        of the latest updates.

        Returns a dict with the keys 'publishers' and 'journals', each
        holding the value produced by `_get_romeo_date()` for the
        corresponding `latestupdate` element.
        """
        query = {'ak': self.api_key, 'format': 'xml'}
        response = requests.get(
            'http://www.sherpa.ac.uk/downloads/download-dates.php', query)
        latin1_parser = ET.XMLParser(encoding='ISO-8859-1')
        document = ET.parse(BytesIO(response.content), latin1_parser)
        locations = (
            ('publishers', './publisherspolicies/latestupdate'),
            ('journals', './journals/latestupdate'),
        )
        return {
            key: self._get_romeo_date(document, xpath)
            for key, xpath in locations
        }

Source File: cctop.py From ssbio with MIT License

5 votes

def parse_cctop_full(infile):
    """Parse a CCTOP XML results file and return a list of the consensus TM domains in the format::

            [(1, inside_outside_or_tm),
             (2, inside_outside_or_tm),
             ...]

    Where the first value of a tuple is the sequence residue number, and the second is the predicted location with the
    values 'I' (inside), 'O' (outside), or 'M' (membrane).

    Args:
        infile (str): Path to CCTOP XML file

    Returns:
        list: List of tuples in the format described above; empty when the file has no ``Topology`` element

    """
    parser = etree.XMLParser(ns_clean=True)
    # Read raw bytes: lxml refuses text (str) input that carries an XML
    # encoding declaration, and decodes bytes itself using that declaration.
    with open(infile, 'rb') as f:
        tree = etree.fromstring(f.read(), parser)

    topology = tree.find('Topology')
    if topology is None:
        return []

    all_info = []
    for r in topology.findall('Region'):
        region_start = int(r.attrib['from'])
        region_end = int(r.attrib['to'])
        region = r.attrib['loc']
        # Both ends of the region are inclusive.
        all_info.extend((i, region) for i in range(region_start, region_end + 1))

    return all_info

Source File: XnatUtils.py From dax with MIT License

5 votes

def get_resource_lastdate_modified(intf, resource_obj):
    """
    Compute the most recent created/modified time of a resource on XNAT.
     (NOT WORKING: bug on XNAT side for version<1.6.5)

    The resource XML is fetched from ``<resource uri>?format=xml`` and the
    createdTime/modifiedTime attributes of its catalog entries are read
    (``cat:Catalog`` first, ``cat:DCMCatalog`` as fallback). When no time
    is found at all, the current local time is used.

    :param intf: pyxnat.Interface object
    :param resource_obj: resource pyxnat Eobject
    :return: date of last modified data with the format %Y%m%d%H%M%S
    """
    # Fetch and parse the XML description of the resource
    xml_text = intf._exec('%s?format=xml' % (resource_obj._uri), 'GET')
    root = etree.fromstring(xml_text, parser=etree.XMLParser(huge_tree=True))

    def entry_times(attribute):
        # Prefer the plain catalog; fall back to the DICOM catalog when empty.
        plain = "/cat:Catalog/cat:entries/cat:entry/@%s" % attribute
        dicom = "/cat:DCMCatalog/cat:entries/cat:entry/@%s" % attribute
        return (root.xpath(plain, namespaces=root.nsmap) or
                root.xpath(dicom, namespaces=root.nsmap))

    created = entry_times("createdTime")
    modified = entry_times("modifiedTime")

    # Find the most recent time
    candidates = created + modified
    if not candidates:
        return '{:%Y%m%d%H%M%S}'.format(datetime.now())

    # Drop the fractional seconds, then squeeze out the '-' and ':' separators.
    stamp = max(candidates).split('.')[0]
    pieces = stamp.split('T')
    return pieces[0].replace('-', '') + pieces[1].replace(':', '')

Python lxml.etree.XMLParser() Examples