Python lxml.etree.XMLParser() Examples

The following are 30 code examples of lxml.etree.XMLParser(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module lxml.etree, or try the search function.
Example #1
Source File: sim_files.py    From simLAB with GNU General Public License v2.0 6 votes vote down vote up
def readXml(self, simType):
        """Locate and load the SIM file description for ``simType``.

        The XML file is looked up next to this module: ``sim_files_3g.xml``
        when ``simType`` equals ``types.TYPE_USIM``, ``sim_files_2g.xml``
        otherwise.  When the file does not exist, a warning is logged and a
        new empty root element (``sim_3G`` or ``sim_2G``) is created instead;
        nothing is written to disk here.

        Returns:
            A ``(path, root)`` tuple: the path of the XML file and the root
            element, either parsed from that file or newly created.
        """
        is_usim = simType == types.TYPE_USIM
        file_name = "sim_files_3g.xml" if is_usim else "sim_files_2g.xml"
        path = os.path.join(os.path.dirname(__file__), file_name)

        if os.path.exists(path):
            # Drop whitespace-only text nodes while parsing.
            parser = etree.XMLParser(remove_blank_text=True)
            root = etree.parse(path, parser).getroot()
        else:
            logging.warning("File %s not exists", path)
            logging.info("Create xml")
            root = etree.Element('sim_3G' if is_usim else 'sim_2G')
        return path, root
Example #2
Source File: scan.py    From report-ng with GNU General Public License v2.0 6 votes vote down vote up
def __init__(self, filename, requests_and_responses=False):
        """Load a scan report from ``filename``, dispatching on its format.

        * ``*.json`` -- decoded as UTF-8 (a leading BOM is stripped) and
          loaded into ``self._scan`` with ``UnsortableOrderedDict`` pairs.
        * ``*.yaml`` -- loaded into ``self._scan`` through ``yaml_load``.
        * anything else -- parsed as XML into ``self._xml``; the root tag
          selects the importer (``Sessions``, ``issues`` or ``items``).

        Args:
            filename: Path of the scan file to load.
            requests_and_responses: Stored on the instance as
                ``self._requests_and_responses``.

        Raises:
            Exception: If the XML root tag is not one of the known formats.
        """
        self._filename = filename
        self._requests_and_responses = requests_and_responses
        if filename.endswith('.json'):
            # Read bytes and decode explicitly: this works on both Python 2
            # and 3, and the context manager closes the file handle.
            with open(filename, 'rb') as scan_file:
                content = scan_file.read().decode('utf-8-sig')
            self._scan = json.loads(content, object_pairs_hook=UnsortableOrderedDict)
        elif filename.endswith('.yaml'):
            with open(filename) as scan_file:
                content = scan_file.read()
            self._scan = yaml_load(content, yaml.SafeLoader, UnsortableOrderedDict)
        else:
            # xml
            etree_parser = etree.XMLParser(huge_tree=True)
            self._xml = etree.parse(filename, parser=etree_parser)
            root = self._xml.getroot()
            if root.tag == 'Sessions':
                self._webinspect_import()
            elif root.tag == 'issues':
                self._burp_import()
            elif root.tag == 'items':
                self._burp_items_import()
            else:
                raise Exception('Unknown scan format!')
Example #3
Source File: sso.py    From vsphere-automation-sdk-python with MIT License 6 votes vote down vote up
def _canonicalize(xml_string):
    '''
    Return the exclusive canonical form of an XML string, per
    U{http://www.w3.org/2001/10/xml-exc-c14n#}

    The document is parsed with C{remove_blank_text=True} and written back
    without comments.

    @type  xml_string: C{str}
    @param xml_string: The XML document to canonicalize.

    @rtype: C{str}
    @return: The canonicalized document, decoded with C{UTF_8}.
    '''
    blank_stripping_parser = etree.XMLParser(remove_blank_text=True)
    root = etree.fromstring(xml_string, parser=blank_stripping_parser)
    buf = BytesIO()
    root.getroottree().write_c14n(buf, exclusive=True, with_comments=False)
    return buf.getvalue().decode(UTF_8)
Example #4
Source File: views.py    From MobileSF with GNU General Public License v3.0 6 votes vote down vote up
def findBodyType(request):
    """Guess the body type of a request: "none", "json", "xml" or "form".

    The checks run in a fixed order and a later match overrides an earlier
    one: JSON first, then XML, then URL-encoded form data.

    Args:
        request: Mapping with a ``"body"`` entry.

    Returns:
        One of ``"none"``, ``"json"``, ``"xml"`` or ``"form"``.  Returns
        ``None`` (after reporting through ``PrintException``) when an
        unexpected error occurs, e.g. the ``"body"`` key is missing.
    """
    bd_typ = "none"
    try:
        body = request["body"]
        if body:
            # Best-effort probes: a parse failure just means "not this type".
            # ``except Exception`` (not a bare ``except``) lets
            # KeyboardInterrupt/SystemExit propagate.
            try:
                json.loads(body)
                bd_typ = "json"
            except Exception:
                pass
            try:
                # Prevent Entity Expansion Attacks against the Framework
                config = etree.XMLParser(remove_blank_text=True, resolve_entities=False)
                etree.fromstring(body, config)
                bd_typ = "xml"
            except Exception:
                pass
            qs = parse_qs(body)
            if qs:
                bd_typ = "form"
        return bd_typ
    except Exception:
        PrintException("[ERROR] Finding Request Body type")
Example #5
Source File: site.py    From shareplum with MIT License 6 votes vote down vote up
def get_site_templates(self, lcid="1033"):
        """Send a ``GetSiteTemplates`` SOAP request and return the response.

        The request carries ``lcid`` as its ``LCID`` parameter and is posted
        to the URL built by ``self._url("Sites")``.  The string form of the
        request is kept in ``self.last_request``.

        Returns the object returned by ``post(...)`` unchanged.

        NOTE(review): every statement after ``return response`` is
        unreachable, so the envelope is never parsed and the list of
        attribute dicts is never built.  Confirm whether the early return is
        intentional before removing either it or the dead code.
        """

        # Build Request
        soap_request = Soap("GetSiteTemplates")
        soap_request.add_parameter("LCID", lcid)
        self.last_request = str(soap_request)

        # Send Request
        response = post(self._session,
                        url=self._url("Sites"),
                        headers=self._headers("GetSiteTemplates"),
                        data=str(soap_request).encode("utf-8"),
                        verify=self._verify_ssl,
                        timeout=self.timeout)

        return response
        # Unreachable from here on (see NOTE in the docstring).
        envelope = etree.fromstring(response.text.encode("utf-8"),
                                    parser=etree.XMLParser(huge_tree=self.huge_tree,
                                    recover=True))
        lists = envelope[0][0][1]
        data = []
        for _list in lists:
            data.append({k: v for (k, v) in _list.items()})

        return data
Example #6
Source File: site.py    From shareplum with MIT License 6 votes vote down vote up
def get_list_templates(self):
        """Send a ``GetListTemplates`` SOAP request and return the templates.

        The string form of the request is kept in ``self.last_request``.  The
        response text is parsed leniently (``recover=True``) and each child
        of ``envelope[0][0][0][0]`` is converted to a dict of its attributes.

        Returns:
            A list of dicts, one per template element.
        """
        # Build Request
        request = Soap("GetListTemplates")
        request.add_parameter("GetListTemplates")
        self.last_request = str(request)

        # Send Request
        response = post(
            self._session,
            url=self._url("Webs"),
            headers=self._headers("GetListTemplates"),
            data=str(request).encode("utf-8"),
            verify=self._verify_ssl,
            timeout=self.timeout,
        )

        raw_xml = response.text.encode("utf-8")
        lenient_parser = etree.XMLParser(huge_tree=self.huge_tree, recover=True)
        envelope = etree.fromstring(raw_xml, parser=lenient_parser)

        templates = envelope[0][0][0][0]
        return [dict(template.items()) for template in templates]
Example #7
Source File: site.py    From shareplum with MIT License 6 votes vote down vote up
def get_site(self):
        """Send a ``GetSite`` SOAP request for ``self.site_url``.

        The string form of the request is kept in ``self.last_request``.  The
        response text is parsed leniently (``recover=True``).

        Returns:
            The ``.text`` of the element at ``envelope[0][0][0]``.
        """
        # Build Request
        request = Soap("GetSite")
        request.add_parameter("SiteUrl", self.site_url)
        self.last_request = str(request)

        # Send Request
        response = post(
            self._session,
            url=self._url("Sites"),
            headers=self._headers("GetSite"),
            data=str(request).encode("utf-8"),
            verify=self._verify_ssl,
            timeout=self.timeout,
        )

        raw_xml = response.text.encode("utf-8")
        lenient_parser = etree.XMLParser(huge_tree=self.huge_tree, recover=True)
        envelope = etree.fromstring(raw_xml, parser=lenient_parser)

        # TODO: Not sure what to do with this, so just return the text
        return envelope[0][0][0].text
Example #8
Source File: romeo.py    From dissemin with GNU Affero General Public License v3.0 6 votes vote down vote up
def perform_romeo_query(self, search_terms):
        """Query ``self.base_url`` with ``search_terms`` and parse the XML reply.

        A copy of ``search_terms`` is sent, so the caller's dict is not
        modified; ``self.api_key`` is added to the copy as ``ak`` when set.

        Returns:
            The parsed XML tree of the response body.

        Raises:
            MetadataSourceException: If the HTTP request fails or the
                response is not valid XML.
        """
        params = search_terms.copy()
        if self.api_key:
            params['ak'] = self.api_key

        def failure_report(headline, error):
            # Shared layout of both error messages.
            return (headline +
                    'URL was: '+self.base_url+'\n' +
                    'Parameters were: '+str(params)+'\n' +
                    'Error is: '+str(error))

        # Perform the query
        try:
            response = requests.get(self.base_url, params=params, timeout=20)
        except requests.exceptions.RequestException as e:
            raise MetadataSourceException(
                failure_report('Error while querying RoMEO.\n', e))

        # Parse it
        try:
            latin1_parser = ET.XMLParser(encoding='ISO-8859-1')
            return ET.parse(BytesIO(response.content), latin1_parser)
        except ET.ParseError as e:
            raise MetadataSourceException(
                failure_report('RoMEO returned an invalid XML response.\n', e))
Example #9
Source File: youtube.py    From xblock-video with GNU General Public License v3.0 6 votes vote down vote up
def download_default_transcript(self, url=None, language_code=None):  # pylint: disable=unused-argument
        """
        Fetch the default transcript from ``url`` and return it as WebVTT-like unicode.

        Each child of the downloaded XML root is rendered with
        `format_transcript_element()` (numbered from 1) and the pieces are
        concatenated.  A "WEBVTT" header is prepended unless the text already
        contains "WEBVTT".

        Reference to `get_transcripts_from_youtube()`:
            https://github.com/edx/edx-platform/blob/ecc3473d36b3c7a360e260f8962e21cb01eb1c39/common/lib/xmodule/xmodule/video_module/transcripts_utils.py#L122
        """
        if url is None:
            raise VideoXBlockException(_('`url` parameter is required.'))
        utf8_parser = etree.XMLParser(encoding='utf-8')
        response = requests.get(url)
        root = etree.fromstring(response.content, parser=utf8_parser)
        cues = "".join(
            self.format_transcript_element(node, index)
            for index, node in enumerate(root, 1)
        )
        if "WEBVTT" not in cues:
            return u"WEBVTT\n\n" + unicode(cues)
        return unicode(cues)
Example #10
Source File: generic.py    From n6 with GNU Affero General Public License v3.0 6 votes vote down vote up
def iter_entry(self, data):
        """
        Parse the raw data body as XML and return the resulting root element.

        Args:
            `data` (dict):
                As returned by prepare_data() (especially, its 'raw' item
                contains the raw data body).

        Returns:
            The root element produced by `etree.fromstring()`, parsed with
            `ns_clean=True` and `remove_blank_text=True`.
        """
        raw_body = StringIO(data['raw']).getvalue()
        xml_parser = etree.XMLParser(ns_clean=True, remove_blank_text=True)
        return etree.fromstring(str(raw_body), xml_parser)
Example #11
Source File: protocol.py    From dissemin with GNU Affero General Public License v3.0 6 votes vote down vote up
def get_new_status(self, identifier):
        """
        Unconditionally fetch the new status of a deposit, by ID (e.g.
        hal-0001234).

        Returns 'deleted' on an HTTP 400 response.  Otherwise the text of the
        ``status`` element of the receipt is mapped to 'published',
        'pending' or 'refused'; any other status yields ``None``.
        """
        deposit_url = '%s%s' % (self.api_url, identifier)
        credentials = requests.auth.HTTPBasicAuth(self.username, self.password)
        response = requests.get(deposit_url, auth=credentials)
        if response.status_code == 400:
            return 'deleted'
        response.raise_for_status()

        utf8_parser = etree.XMLParser(encoding='utf-8')
        payload = BytesIO(response.text.encode('utf-8'))
        receipt_root = etree.parse(payload, utf8_parser).getroot()

        hal_status = receipt_root.find('status').text
        if hal_status in ('accept', 'replace'):
            return 'published'
        if hal_status in ('verify', 'update'):
            return 'pending'
        if hal_status == 'delete':
            return 'refused'
        return None
Example #12
Source File: files.py    From janeway with GNU Affero General Public License v3.0 6 votes vote down vote up
def transform_with_xsl(xml_path, xsl_path, recover=False):
    """Apply the XSLT stylesheet at ``xsl_path`` to the XML file at ``xml_path``.

    Args:
        xml_path: Path of the XML document to transform.
        xsl_path: Path of the XSLT stylesheet.
        recover: When true, a syntax error in the XML is logged and the
            document is re-parsed with a recovering parser; a failing
            transformation is logged instead of raised.

    Returns:
        The transformed document, or ``None`` when the transformation failed
        and ``recover`` is true.

    Raises:
        etree.XMLSyntaxError: If the XML is malformed and ``recover`` is false.
        Exception: Whatever the transformation raised, when ``recover`` is
            false.
    """
    try:
        xml_dom = etree.parse(xml_path)
    except etree.XMLSyntaxError as e:
        if not recover:
            raise
        logger.error(e)
        xml_dom = etree.parse(xml_path, parser=etree.XMLParser(recover=True))

    xsl_transform = etree.XSLT(etree.parse(xsl_path))
    # Bind the result up front: in recover mode a failed transform used to
    # leave the name unbound and the final return raised UnboundLocalError.
    transformed_dom = None
    try:
        transformed_dom = xsl_transform(xml_dom)
    except Exception as err:
        logger.error(err)
        for xsl_error in xsl_transform.error_log:
            logger.error(xsl_error)
        if not recover:
            raise

    return transformed_dom
Example #13
Source File: intellij_set_default_inspection_profile.py    From ansible-role-intellij with MIT License 5 votes vote down vote up
def pretty_print(elem):
    """Return ``elem`` serialized as pretty-printed ISO-8859-1 XML.

    The element is serialized and re-parsed with ``remove_blank_text=True``
    before the final pretty-printed serialization; no XML declaration is
    emitted.
    """
    serialized = etree.tostring(elem, encoding='iso-8859-1')
    reparsed = etree.fromstring(
        serialized, etree.XMLParser(remove_blank_text=True))
    return etree.tostring(
        reparsed, pretty_print=True, xml_declaration=False, encoding='iso-8859-1')
Example #14
Source File: pokerstars.py    From poker with MIT License 5 votes vote down vote up
def __init__(self, notes: str):
        """Keep the raw notes text and parse it into an XML tree.

        ``notes`` is stored in ``self.raw``; its encoded form is parsed with
        a recovering parser that does not resolve entities, and the result is
        stored in ``self.root``.
        """
        self.raw = notes
        lenient_parser = etree.XMLParser(resolve_entities=False, recover=True)
        encoded_notes = notes.encode()
        self.root = etree.XML(encoded_notes, lenient_parser)
Example #15
Source File: intellij_configure_jdk.py    From ansible-role-intellij with MIT License 5 votes vote down vote up
def pretty_print(elem):
    """Return ``elem`` serialized as pretty-printed ISO-8859-1 XML.

    The element is serialized and re-parsed with ``remove_blank_text=True``
    before the final pretty-printed serialization; no XML declaration is
    emitted.
    """
    serialized = etree.tostring(elem, encoding='iso-8859-1')
    reparsed = etree.fromstring(
        serialized, etree.XMLParser(remove_blank_text=True))
    return etree.tostring(
        reparsed, pretty_print=True, xml_declaration=False, encoding='iso-8859-1')
Example #16
Source File: intellij_set_default_jdk.py    From ansible-role-intellij with MIT License 5 votes vote down vote up
def pretty_print(elem):
    """Return ``elem`` serialized as pretty-printed ISO-8859-1 XML.

    The element is serialized and re-parsed with ``remove_blank_text=True``
    before the final pretty-printed serialization; no XML declaration is
    emitted.
    """
    serialized = etree.tostring(elem, encoding='iso-8859-1')
    reparsed = etree.fromstring(
        serialized, etree.XMLParser(remove_blank_text=True))
    return etree.tostring(
        reparsed, pretty_print=True, xml_declaration=False, encoding='iso-8859-1')
Example #17
Source File: selector.py    From ChemDataExtractor with MIT License 5 votes vote down vote up
def from_xml_text(cls, text, base_url=None, namespaces=None, encoding=None):
        """Build an instance from XML text by delegating to ``cls.from_text``.

        Forwards ``text``, ``base_url``, ``namespaces`` and ``encoding``
        unchanged and fixes the XML-specific options: ``XMLParser`` as the
        parser, ``CssXmlTranslator`` as the translator and ``'xml'`` as the
        format.
        """
        xml_options = dict(
            base_url=base_url,
            parser=XMLParser,
            translator=CssXmlTranslator,
            fmt='xml',
            namespaces=namespaces,
            encoding=encoding,
        )
        return cls.from_text(text, **xml_options)
Example #18
Source File: intellij_set_default_maven.py    From ansible-role-intellij with MIT License 5 votes vote down vote up
def pretty_print(elem):
    """Return ``elem`` serialized as pretty-printed ISO-8859-1 XML.

    The element is serialized and re-parsed with ``remove_blank_text=True``
    before the final pretty-printed serialization; no XML declaration is
    emitted.
    """
    serialized = etree.tostring(elem, encoding='iso-8859-1')
    reparsed = etree.fromstring(
        serialized, etree.XMLParser(remove_blank_text=True))
    return etree.tostring(
        reparsed, pretty_print=True, xml_declaration=False, encoding='iso-8859-1')
Example #19
Source File: main.py    From sysmon-config-bypass-finder with GNU General Public License v3.0 5 votes vote down vote up
def _read_config_to_json(sysmon_config):
    """Convert the ``EventFiltering`` section of a config file to plain dicts.

    The file is parsed with comments removed.  Each child of the
    ``EventFiltering`` element becomes one rule dict with the keys
    ``rule_type`` (the tag), ``on_match`` (the ``onmatch`` attribute) and
    ``conditions``; each condition is a dict with ``operator`` (the
    ``condition`` attribute), ``content`` (the text) and ``condition_type``
    (the tag).

    Returns:
        A list of rule dicts in document order.
    """
    comment_free_parser = etree.XMLParser(remove_comments=True)
    root = objectify.parse(sysmon_config, parser=comment_free_parser).getroot()
    event_filtering = root.find('EventFiltering')

    configuration = []
    for rule in event_filtering.getchildren():
        rule_entry = {
            'rule_type': rule.tag,
            'on_match': rule.get('onmatch'),
            'conditions': [],
        }
        rule_entry['conditions'].extend(
            {
                'operator': condition.get('condition'),
                'content': condition.text,
                'condition_type': condition.tag,
            }
            for condition in rule.iterchildren()
        )
        configuration.append(rule_entry)
    return configuration
Example #20
Source File: parser.py    From avacity-2.0 with MIT License 5 votes vote down vote up
def __init__(self):
        """Create the comment-stripping XML parser and the ``apprnc_map`` key list."""
        self.parser = etree.XMLParser(remove_comments=True)
        self.apprnc_map = [
            "sc", "et", "brt", "at", "ht", "bt",
            "sh", "rg", "ss", "pt", "fat", "fft",
        ]
Example #21
Source File: xml.py    From ansible-xml with GNU General Public License v3.0 5 votes vote down vote up
def child_to_element(module, child, in_type):
    """Build a single element from a child specification.

    Args:
        module: Object whose ``fail_json(msg=...)`` is called to report every
            error (presumably the Ansible module instance, whose
            ``fail_json`` does not return -- TODO confirm).
        child: The child description; how it is read depends on ``in_type``.
        in_type: ``'xml'`` to parse ``child`` as an XML document and return
            its root, or ``'yaml'`` to build the element from a string (tag
            name) or a one-key mapping (tag name -> text or attributes).

    Returns:
        The created element.
    """
    if in_type == 'xml':
        infile = BytesIO(to_bytes(child, errors='surrogate_or_strict'))

        try:
            parser = etree.XMLParser()
            node = etree.parse(infile, parser)
            return node.getroot()
        except etree.XMLSyntaxError as e:
            module.fail_json(msg="Error while parsing child element: %s" % e)
    elif in_type == 'yaml':
        if isinstance(child, string_types):
            # A bare string is the tag name of an empty element.
            return etree.Element(child)
        elif isinstance(child, MutableMapping):
            if len(child) > 1:
                module.fail_json(msg="Can only create children from hashes with one key")

            # The single key is the tag name; the value is either a mapping
            # of attributes or the element text.
            (key, value) = next(iteritems(child))
            if isinstance(value, MutableMapping):
                # The '_' entry holds nested children and is removed from the
                # mapping (mutating the caller's data) before the rest is
                # used as the attribute dict.
                children = value.pop('_', None)

                node = etree.Element(key, value)

                if children is not None:
                    if not isinstance(children, list):
                        module.fail_json(msg="Invalid children type: %s, must be list." % type(children))

                    subnodes = children_to_nodes(module, children)
                    node.extend(subnodes)
            else:
                node = etree.Element(key)
                node.text = value
            return node
        else:
            module.fail_json(msg="Invalid child type: %s. Children must be either strings or hashes." % type(child))
    else:
        module.fail_json(msg="Invalid child input type: %s. Type must be either xml or yaml." % in_type)
Example #22
Source File: _lxml.py    From nzb-subliminal with GNU General Public License v3.0 5 votes vote down vote up
def default_parser(self, encoding):
        """Return the parser to use for documents in ``encoding``.

        A parser stored in ``self._default_parser`` takes precedence.
        Otherwise a new recovering ``etree.XMLParser`` is built that targets
        this object, keeps CDATA sections and uses ``encoding``.
        """
        # This can either return a parser object or a class, which
        # will be instantiated with default arguments.
        preset = self._default_parser
        if preset is None:
            return etree.XMLParser(
                target=self, strip_cdata=False, recover=True, encoding=encoding)
        return preset
Example #23
Source File: _lxml.py    From B.E.N.J.I. with MIT License 5 votes vote down vote up
def default_parser(self, encoding):
        """Return the parser to use for documents in ``encoding``.

        A parser stored in ``self._default_parser`` takes precedence.
        Otherwise a new recovering ``etree.XMLParser`` is built that targets
        this object, keeps CDATA sections and uses ``encoding``.
        """
        # This can either return a parser object or a class, which
        # will be instantiated with default arguments.
        preset = self._default_parser
        if preset is None:
            return etree.XMLParser(
                target=self, strip_cdata=False, recover=True, encoding=encoding)
        return preset
Example #24
Source File: conftest.py    From pycon with MIT License 5 votes vote down vote up
def sample_invoice_xml():
    """Load the sample invoice XML and strip whitespace around element text.

    The file ``../data/IT01234567890_FPA01.xml`` is resolved relative to this
    module's directory and parsed with ``remove_blank_text=True``; every
    element whose text is not ``None`` then has that text stripped.

    Returns:
        The parsed tree (the object returned by ``etree.parse``).
    """
    blank_stripping_parser = etree.XMLParser(remove_blank_text=True)
    data_file = os.path.join(
        os.path.dirname(__file__), "../data/IT01234567890_FPA01.xml")
    tree = etree.parse(data_file, parser=blank_stripping_parser)

    for node in tree.iter("*"):
        text = node.text
        if text is None:
            continue
        node.text = text.strip()

    return tree
Example #25
Source File: client.py    From nsxramlclient with MIT License 5 votes vote down vote up
def get_xml_example_by_displayname(self, display_name, method, remove_content=None, remove_comments=None):
        """Return the parsed XML body example of a RAML resource.

        Args:
            display_name: displayName of the resource, looked up with
                ``find_resource_recursively``.
            method: One of ``'read'``, ``'create'``, ``'delete'`` or
                ``'update'``; mapped to the HTTP verb of the resource.
            remove_content: When true, the text and tail of every child
                element are cleared.  Defaults to ``True`` when omitted.
            remove_comments: Passed to the XML parser as ``remove_comments``.
                Defaults to ``True`` when omitted.

        Returns:
            The root element parsed from the ``application/xml`` example.

        Raises:
            AssertionError: If the resource is not found, does not support
                the method, or has no body schema.
            KeyError: If ``method`` is not one of the supported names.
            Exception: If the example is not well-formed XML.
        """
        # Only an omitted flag (None) takes the default.  The previous
        # ``if not flag`` test also turned an explicit False back into True,
        # so neither option could ever be switched off.
        if remove_content is None:
            remove_content = True
        if remove_comments is None:
            remove_comments = True
        method_options = {'read': 'get', 'create': 'post', 'delete': 'delete', 'update': 'put'}
        matched_resource = self.find_resource_recursively(display_name)

        assert matched_resource, 'The searched displayName could not be found in RAML File'
        assert method_options[method] in matched_resource[1].methods, 'the resource does not support ' \
                                                                      'the {} method'.format(method)
        assert matched_resource[1].methods[method_options[method]].body, 'the resource does not have a ' \
                                                                         'body schema in the RAML File'

        matched_resource_body = matched_resource[1].methods[method_options[method]].body
        example = matched_resource_body['application/xml'].example
        try:
            parser = et.XMLParser(remove_comments=remove_comments)
            example_et = et.fromstring(example, parser=parser)
        except et.XMLSyntaxError as e:
            raise Exception('The parsing of the body example XML failed, please check the format in the RAML file,'
                            'the execption is:\n{}'.format(e))

        if remove_content:
            for _parent, child in self._iterparent(example_et):
                child.text = None
                child.tail = None

        return example_et
Example #26
Source File: conftest.py    From dissemin with GNU Affero General Public License v3.0 5 votes vote down vote up
def dissemin_xml_1_0():
    '''
    Load the dissemin v1.0 sample XML document and return its root element,
    ready to be manipulated and validated.

    The file ``schema/test_data/dissemin_v1.0.xml`` is resolved relative to
    this module's directory and parsed with ``remove_blank_text=True``.
    '''
    here = os.path.dirname(os.path.abspath(__file__))
    blank_stripping_parser = etree.XMLParser(remove_blank_text=True)
    sample_path = os.path.join(here, 'schema', 'test_data', 'dissemin_v1.0.xml')
    tree = etree.parse(sample_path, blank_stripping_parser)
    return tree.getroot()
Example #27
Source File: test_romeo.py    From dissemin with GNU Affero General Public License v3.0 5 votes vote down vote up
def perform_romeo_query(self, search_terms):
        """Serve a RoMEO query from a file in ``self.datadir`` when possible.

        The file name is derived from the sorted ``key-value`` pairs of
        ``search_terms`` (spaces in values replaced by underscores).  If the
        file can be opened, its parsed content is returned.  On ``IOError``
        the parent implementation is called, its result is written to that
        file and then returned.
        """
        name_parts = sorted(
            '{}-{}'.format(key, val.replace(' ', '_'))
            for key, val in search_terms.items()
        )
        filename = '_'.join(name_parts) + '.xml'
        cache_path = os.path.join(self.datadir, filename)
        try:
            with open(cache_path, 'rb') as cached:
                latin1_parser = etree.XMLParser(encoding='ISO-8859-1')
                return etree.parse(cached, latin1_parser)
        except IOError:
            fetched = super(RomeoAPIStub, self).perform_romeo_query(search_terms)
            with open(cache_path, 'wb') as cached:
                fetched.write(cached)
            return fetched

# SHERPA/RoMEO interface 
Example #28
Source File: romeo.py    From dissemin with GNU Affero General Public License v3.0 5 votes vote down vote up
def get_romeo_latest_update_date(self):
        """
        Fetches the dates of the latest updates on the RoMEO service.
        This returns a dict: the dates can be accessed via the 'publishers' and 'journals'
        keys.

        The request uses the same 20 second timeout as `perform_romeo_query`,
        so an unresponsive server raises a `requests` timeout error instead
        of blocking forever.
        """
        r = requests.get('http://www.sherpa.ac.uk/downloads/download-dates.php',
                         params={'ak': self.api_key, 'format': 'xml'},
                         timeout=20)
        parser = ET.XMLParser(encoding='ISO-8859-1')
        root = ET.parse(BytesIO(r.content), parser)
        return {
            'publishers': self._get_romeo_date(root, './publisherspolicies/latestupdate'),
            'journals': self._get_romeo_date(root, './journals/latestupdate')
        }
Example #29
Source File: cctop.py    From ssbio with MIT License 5 votes vote down vote up
def parse_cctop_full(infile):
    """Parse a CCTOP XML results file and return a list of the consensus TM domains in the format::

            [(1, inside_outside_or_tm),
             (2, inside_outside_or_tm),
             ...]

    Where the first value of a tuple is the sequence residue number, and the second is the predicted location with the
    values 'I' (inside), 'O' (outside), or 'M' (membrane).

    Args:
        infile (str): Path to CCTOP XML file

    Returns:
        list: List of tuples in the format described above; empty when the file has no ``Topology`` element

    """
    parser = etree.XMLParser(ns_clean=True)
    # Read bytes, not text: on Python 3 lxml rejects a str that carries an
    # XML encoding declaration, and the parser handles decoding itself.
    with open(infile, 'rb') as f:
        tree = etree.fromstring(f.read(), parser)

    topology = tree.find('Topology')
    if topology is None:
        return []

    all_info = []
    for r in topology.findall('Region'):
        region_start = int(r.attrib['from'])
        region_end = int(r.attrib['to'])
        region = r.attrib['loc']
        # 'from' and 'to' are both inclusive residue numbers.
        all_info.extend((i, region) for i in range(region_start, region_end + 1))

    return all_info
Example #30
Source File: XnatUtils.py    From dax with MIT License 5 votes vote down vote up
def get_resource_lastdate_modified(intf, resource_obj):
    """
    Get the most recent created/modified time of a resource on XNAT.
     (NOT WORKING: bug on XNAT side for version<1.6.5)

    The resource XML is fetched from ``<resource uri>?format=xml`` and the
    ``createdTime`` / ``modifiedTime`` attributes of the catalog entries are
    collected (``cat:Catalog`` first, ``cat:DCMCatalog`` as fallback).  When
    no time is found at all, the current local time is used.

    :param intf: pyxnat.Interface object
    :param resource_obj: resource pyxnat Eobject
    :return: date of last modified data with the format %Y%m%d%H%M%S
    """
    # Fetch and parse the XML describing the resource
    res_xml_uri = '%s?format=xml' % (resource_obj._uri)
    xmlstr = intf._exec(res_xml_uri, 'GET')
    root = etree.fromstring(xmlstr, parser=etree.XMLParser(huge_tree=True))

    def entry_times(catalog_xpath, dcm_catalog_xpath):
        # Prefer the plain catalog entries; fall back to the DICOM catalog.
        found = root.xpath(catalog_xpath, namespaces=root.nsmap)
        if not found:
            found = root.xpath(dcm_catalog_xpath, namespaces=root.nsmap)
        return found

    create_times = entry_times(
        "/cat:Catalog/cat:entries/cat:entry/@createdTime",
        "/cat:DCMCatalog/cat:entries/cat:entry/@createdTime")
    mod_times = entry_times(
        "/cat:Catalog/cat:entries/cat:entry/@modifiedTime",
        "/cat:DCMCatalog/cat:entries/cat:entry/@modifiedTime")

    # Find the most recent time
    all_times = create_times + mod_times
    if not all_times:
        return '{:%Y%m%d%H%M%S}'.format(datetime.now())

    # Drop the fractional seconds, then the date and time separators
    date = max(all_times).split('.')[0]
    pieces = date.split('T')
    return pieces[0].replace('-', '') + pieces[1].replace(':', '')