Python Examples of lxml.etree.XMLSyntaxError

Source File: doctestcompare.py From learn_python3_spider with MIT License

6 votes

def check_output(self, want, got, optionflags):
    """Compare ``want`` and ``got`` as parsed documents when a parser applies.

    If ``self._temp_override_self`` is set, that object does the work and
    ``self._temp_call_super_check_output`` is the plain-text fallback;
    otherwise ``OutputChecker.check_output`` is the fallback.

    Returns the fallback's result when ``get_parser`` yields nothing,
    ``False`` when either text raises ``etree.XMLSyntaxError`` while being
    parsed, and the result of ``compare_docs`` otherwise.
    """
    override = getattr(self, '_temp_override_self', None)
    if override is None:
        checker = self
        fallback = OutputChecker.check_output
    else:
        checker = override
        fallback = self._temp_call_super_check_output

    parse = checker.get_parser(want, got, optionflags)
    if not parse:
        return fallback(checker, want, got, optionflags)

    try:
        expected_doc = parse(want)
        actual_doc = parse(got)
    except etree.XMLSyntaxError:
        return False
    return checker.compare_docs(expected_doc, actual_doc)

Source File: test_tesseract.py From nidaba with GNU General Public License v2.0

6 votes

def test_capi_file_output_jpg(self):
    """
    Test that tesseract CAPI calls create hocr output for jpgs.

    The test is skipped when ``libtesseract.so.3`` cannot be loaded.
    """

    try:
        ctypes.cdll.LoadLibrary('libtesseract.so.3')
    except OSError:
        # ctypes reports a missing/unloadable shared library as OSError.
        raise unittest.SkipTest

    self.tesseract.setup(tessdata=tessdata, implementation='capi')
    ocr = self.tesseract.ocr_tesseract.run(('test', 'segmentation_jpg.xml'),
                                           languages=['eng'],
                                           extended=False)
    outpath = os.path.join(self.storage_path, *ocr)
    self.assertTrue(os.path.isfile(outpath), msg='Tesseract did not '
                    'output a file!')
    try:
        # Use a context manager so the file handle is closed even when
        # parsing fails.
        with open(outpath) as fp:
            etree.parse(fp)
    except etree.XMLSyntaxError:
        self.fail(msg='The output was not valid html/xml!')

Source File: base.py From daf-recipes with GNU General Public License v3.0

6 votes

def _validate_document(self, document_string, harvest_object, validator=None):
    '''
    Validates an XML document with the default, or if present, the
    provided validators.

    It will create a HarvestObjectError for each validation error found,
    so they can be shown properly on the frontend.

    Returns a tuple, with a boolean showing whether the validation passed
    or not, the profile used and a list of errors (tuples with error
    message and error lines if present).

    When the document cannot be parsed, an object error is saved and
    ``(False, None, [])`` is returned.
    '''
    if not validator:
        validator = self._get_validator()

    # Drop any "<?xml ... ?>" processing instruction before parsing
    # (presumably so a declared encoding does not clash with an already
    # decoded string -- TODO confirm). The match is non-greedy so that a
    # document written on a single line does not lose everything up to its
    # last "?>".
    document_string = re.sub(r'<\?xml(.*?)\?>', '', document_string)

    try:
        xml = etree.fromstring(document_string)
    except etree.XMLSyntaxError as e:
        self._save_object_error('Could not parse XML file: {0}'.format(str(e)), harvest_object, 'Import')
        return False, None, []
    # NOTE(review): the visible snippet ends here; neither `xml` nor
    # `validator` is used past this point and the success path returns
    # None -- confirm against the full source.

Source File: test_tesseract.py From nidaba with GNU General Public License v2.0

6 votes

def test_direct_file_output_png(self):
    """
    Test that direct tesseract calls create hocr output for pngs.

    The test is skipped when no ``tesseract`` executable is found.
    """

    if not spawn.find_executable('tesseract'):
        raise unittest.SkipTest

    self.tesseract.setup(tessdata=tessdata, implementation='direct')
    ocr = self.tesseract.ocr_tesseract.run(('test', 'segmentation_png.xml'),
                                           languages=['eng'],
                                           extended=False)
    outpath = os.path.join(self.storage_path, *ocr)
    self.assertTrue(os.path.isfile(outpath), msg='Tesseract did not '
                    'output a file!')
    try:
        # Use a context manager so the file handle is closed even when
        # parsing fails.
        with open(outpath) as fp:
            etree.parse(fp)
    except etree.XMLSyntaxError:
        self.fail(msg='The output was not valid html/xml!')

Source File: test_tesseract.py From nidaba with GNU General Public License v2.0

6 votes

def test_direct_file_output_tiff(self):
    """
    Test that direct tesseract calls create hocr output for tiffs.

    The test is skipped when no ``tesseract`` executable is found.
    """
    if not spawn.find_executable('tesseract'):
        raise unittest.SkipTest

    self.tesseract.setup(tessdata=tessdata, implementation='direct')
    ocr = self.tesseract.ocr_tesseract.run(('test', 'segmentation_tiff.xml'),
                                           languages=['eng'],
                                           extended=False)
    outpath = os.path.join(self.storage_path, *ocr)
    self.assertTrue(os.path.isfile(outpath), msg='Tesseract did not '
                    'output a file!')
    try:
        # Parse through a managed handle; the previous bare open() was
        # never closed.
        with open(outpath) as fp:
            etree.parse(fp)
    except etree.XMLSyntaxError:
        self.fail(msg='The output was not valid html/xml!')

Source File: test_tesseract.py From nidaba with GNU General Public License v2.0

6 votes

def test_direct_file_output_jpg(self):
    """
    Test that direct tesseract calls create hocr output for jpgs.

    The test is skipped when no ``tesseract`` executable is found.
    """
    if not spawn.find_executable('tesseract'):
        raise unittest.SkipTest

    self.tesseract.setup(tessdata=tessdata, implementation='direct')
    ocr = self.tesseract.ocr_tesseract.run(('test', 'segmentation_jpg.xml'),
                                           languages=['eng'],
                                           extended=False)
    outpath = os.path.join(self.storage_path, *ocr)
    self.assertTrue(os.path.isfile(outpath), msg='Tesseract did not '
                    'output a file!')
    try:
        # Close the output file deterministically instead of leaking the
        # handle returned by open().
        with open(outpath) as fp:
            etree.parse(fp)
    except etree.XMLSyntaxError:
        self.fail(msg='The output was not valid html/xml!')

Source File: ironclaw_class.py From warriorframework with Apache License 2.0

6 votes

def xml_to_xsd_validation(file_xml, file_xsd):
    """ Verify that the XML complies with the XSD
    Arguments:
        1. file_xml: Input xml file
        2. file_xsd: xsd file against which the xml is validated
    Return:
        True when both files parse and the schema assertion passes,
        False when parsing raises XMLSyntaxError or the schema assertion
        raises AssertionError (the error is printed in both cases)
    """
    try:
        print_info("Validating:{0}".format(file_xml))
        print_info("xsd_file:{0}".format(file_xsd))
        xml_doc = parse(file_xml)
        xsd_doc = parse(file_xsd)
        xmlschema = XMLSchema(xsd_doc)
        xmlschema.assert_(xml_doc)
        return True

    except XMLSyntaxError as err:
        print_error("PARSING ERROR:{0}".format(err))
        return False

    # "except X as err" works on Python 2.6+ and Python 3, unlike the
    # comma form.
    except AssertionError as err:
        print_error("Incorrect XML schema: {0}".format(err))
        return False

Source File: pyreact.py From pypath with GNU General Public License v3.0

6 votes

def init_etree(self):
    """
    Set up the ``lxml.etree.iterparse`` iterator over ``self._biopax``.

    The iterator is stored in ``self.bp`` and the element of its first
    event in ``self.root``. If an ``XMLSyntaxError`` is raised, ``self.bp``
    is reset to ``None``. ``self.used_elements`` is always reset to an
    empty list afterwards.

    This method should not be called directly,
    ``BioPaxReader.process()`` calls it.
    """
    try:
        events = etree.iterparse(self._biopax, events=('start', 'end'))
        self.bp = events
        # The first item is an (event, element) pair; keep the element.
        first_event = next(events)
        _, self.root = first_event
    except etree.XMLSyntaxError:
        self.bp = None

    self.used_elements = []

Source File: __init__.py From python-gvm with GNU General Public License v3.0

6 votes

def import_config(self, config: str) -> Any:
    """Import a scan config from XML

    Arguments:
        config: Scan Config XML as string to import. This XML must
            contain a :code:`<get_configs_response>` root element.

    Returns:
        The response. See :py:meth:`send_command` for details.

    Raises:
        RequiredArgument: if ``config`` is empty.
        InvalidArgument: if ``config`` is rejected with an XML syntax error.
    """
    if not config:
        raise RequiredArgument(
            function=self.import_config.__name__, argument='config'
        )

    create_cmd = XmlCommand("create_config")
    try:
        create_cmd.append_xml_str(config)
    except etree.XMLSyntaxError:
        raise InvalidArgument(
            function=self.import_config.__name__, argument='config'
        )
    else:
        return self._send_xml_command(create_cmd)

Source File: test_tesseract.py From nidaba with GNU General Public License v2.0

6 votes

def test_capi_file_output_png(self):
    """
    Test that tesseract CAPI calls create hocr output for pngs.

    The test is skipped when ``libtesseract.so.3`` cannot be loaded.
    """

    try:
        ctypes.cdll.LoadLibrary('libtesseract.so.3')
    except OSError:
        # ctypes reports a missing/unloadable shared library as OSError.
        raise unittest.SkipTest

    self.tesseract.setup(tessdata=tessdata, implementation='capi')
    ocr = self.tesseract.ocr_tesseract.run(('test', 'segmentation_png.xml'),
                                           languages=['eng'],
                                           extended=False)
    outpath = os.path.join(self.storage_path, *ocr)
    self.assertTrue(os.path.isfile(outpath), msg='Tesseract did not '
                    'output a file!')
    try:
        # Use a context manager so the file handle is closed even when
        # parsing fails.
        with open(outpath) as fp:
            etree.parse(fp)
    except etree.XMLSyntaxError:
        self.fail(msg='The output was not valid html/xml!')

Source File: test_tesseract.py From nidaba with GNU General Public License v2.0

6 votes

def test_capi_extended(self):
    """
    Test that the CAPI extended output contains character cuts in each
    ocr_line and character confidences in each ocrx_word.

    The test is skipped when ``libtesseract.so.3`` cannot be loaded.
    """

    try:
        ctypes.cdll.LoadLibrary('libtesseract.so.3')
    except OSError:
        # ctypes reports a missing/unloadable shared library as OSError.
        raise unittest.SkipTest
    self.tesseract.setup(tessdata=tessdata, implementation='capi')
    ocr = self.tesseract.ocr_tesseract.run(('test', 'segmentation_tiff.xml'),
                                           languages=['eng'],
                                           extended=True)
    outpath = os.path.join(self.storage_path, *ocr)
    self.assertTrue(os.path.isfile(outpath), msg='Tesseract did not '
                    'output a file!')

    try:
        # Parse through a managed handle so the file is closed afterwards.
        with open(outpath) as fp:
            h = etree.parse(fp)
    except etree.XMLSyntaxError:
        self.fail(msg='The output was not valid html/xml!')
    # NOTE(review): findall() returns a list, so these assertIsNotNone
    # checks pass even when no element matches. Confirm the expected
    # element names (and namespaces) before tightening them.
    self.assertIsNotNone(h.findall(".//line"), msg='Tesseract did not write lines.')
    self.assertIsNotNone(h.findall(".//seg"), msg='Tesseract did not write segments.')
    self.assertIsNotNone(h.findall(".//g"), msg='Tesseract did not write graphemes.')

Source File: test_tesseract.py From nidaba with GNU General Public License v2.0

6 votes

def test_direct_multiple(self):
    """
    Test that direct tesseract calls create hocr output for multiple
    languages.

    The test is skipped when no ``tesseract`` executable is found.
    """
    if not spawn.find_executable('tesseract'):
        raise unittest.SkipTest

    self.tesseract.setup(tessdata=tessdata, implementation='direct')
    ocr = self.tesseract.ocr_tesseract.run(('test', 'segmentation_tiff.xml'),
                                           languages=['grc', 'eng'],
                                           extended=False)
    outpath = os.path.join(self.storage_path, *ocr)
    self.assertTrue(os.path.isfile(outpath), msg='Tesseract did not '
                    'output a file!')
    try:
        # Use a context manager so the file handle is closed even when
        # parsing fails.
        with open(outpath) as fp:
            etree.parse(fp)
    except etree.XMLSyntaxError:
        self.fail(msg='The output was not valid html/xml!')

Source File: xml.py From kraken with Apache License 2.0

6 votes

def parse_xml(filename):
    """
    Parse a PageXML or ALTO file, picking the format from the root tag.

    The file is first parsed to inspect its root element; a root tag
    ending in ``alto`` is handed to ``parse_alto`` and one ending in
    ``PcGts`` to ``parse_page``.

    Args:
        filename (str): path to an XML file.

    Returns:
        A dict {'image': impath, lines: [{'boundary': [[x0, y0], ...],
        'baseline': [[x0, y0], ...]}, {...], 'text': 'apdjfqpf', 'script':
        'script_type'}, regions: {'region_type_0': [[[x0, y0], ...], ...],
        ...}}

    Raises:
        KrakenInputException: if the file is not well-formed XML or its
            root tag matches neither format.
    """
    with open(filename, 'rb') as fp:
        try:
            tree = etree.parse(fp)
        except etree.XMLSyntaxError as e:
            raise KrakenInputException(f'Parsing {filename} failed: {e}')

    if tree.getroot().tag.endswith('alto'):
        return parse_alto(filename)
    if tree.getroot().tag.endswith('PcGts'):
        return parse_page(filename)
    raise KrakenInputException(f'Unknown XML format in {filename}')

Source File: main.py From parserator with MIT License

6 votes

def __call__(self, parser, namespace, string, option_string):
    """Store the given path on ``namespace`` together with its XML root.

    ``namespace.<self.dest>`` receives ``string`` and ``namespace.xml``
    receives the root element parsed from that file. ``xml`` is ``None``
    when the file cannot be opened or when the parser reports an empty
    document; any other XML syntax error is turned into an
    ``argparse.ArgumentError``.
    """
    xml = None
    try:
        with open(string, 'r') as handle:
            document = etree.parse(handle)
            xml = document.getroot()
    except (OSError, IOError):
        # Unreadable file: leave xml as None.
        pass
    except etree.XMLSyntaxError as e:
        is_empty_document = 'Document is empty' in str(e)
        if not is_empty_document:
            raise argparse.ArgumentError(self,
                                         "%s does not seem to be a valid xml file"
                                         % string)

    setattr(namespace, self.dest, string)
    setattr(namespace, 'xml', xml)

Source File: files.py From janeway with GNU Affero General Public License v3.0

6 votes

def transform_with_xsl(xml_path, xsl_path, recover=False):
    """Apply the XSLT stylesheet at ``xsl_path`` to the XML at ``xml_path``.

    :param xml_path: source passed to ``etree.parse`` for the XML document
    :param xsl_path: source passed to ``etree.parse`` for the stylesheet
    :param recover: when True, a document with syntax errors is re-parsed
        with a recovering parser, and a failing transformation is logged
        instead of raised
    :return: the transformed document, or ``None`` when the transformation
        failed and ``recover`` is True
    :raises etree.XMLSyntaxError: if the XML is malformed and ``recover``
        is False
    """
    try:
        xml_dom = etree.parse(xml_path)
    except etree.XMLSyntaxError as e:
        if recover:
            logger.error(e)
            parser = etree.XMLParser(recover=True)
            xml_dom = etree.parse(xml_path, parser=parser)
        else:
            raise
    xsl_transform = etree.XSLT(etree.parse(xsl_path))
    # Pre-bind the result: previously a failed transform with recover=True
    # reached the return statement with the name unbound and raised
    # UnboundLocalError.
    transformed_dom = None
    try:
        transformed_dom = xsl_transform(xml_dom)
    except Exception as err:
        logger.error(err)
        for xsl_error in xsl_transform.error_log:
            logger.error(xsl_error)
        if not recover:
            raise

    return transformed_dom

Source File: nexpose.py From nexpose-client-python with BSD 3-Clause "New" or "Revised" License

6 votes

def Open(self):
    """
    Open a session to the nexpose appliance by logging in.
    This function will raise an exception on error or if the session is
    already open.
    """
    if self._session_id:
        raise SessionIsNotClosedException("Please close the session first!")

    try:
        login_reply = self._Execute_APIv1d1(self._login_request)
    except NexposeConnectionException as ex:
        # A reply that is not XML is reported with a more specific error.
        if not isinstance(ex.inner_exception, etree.XMLSyntaxError):
            raise ex
        raise NexposeException("Unexpected error! Is the Nexpose appliance activated?")

    login_succeeded = (login_reply.tag == "LoginResponse"
                       and login_reply.attrib["success"] == "1")
    if login_succeeded:
        self._session_id = login_reply.attrib["session-id"]
    if not self._session_id:
        raise NexposeFailureException("Login failure!")

Source File: doctestcompare.py From aws-lambda-lxml with GNU General Public License v3.0

6 votes

def check_output(self, want, got, optionflags):
    """Check ``got`` against ``want``, comparing parsed documents if possible.

    The object in ``self._temp_override_self`` (when present) replaces
    ``self`` for parsing and comparing, with
    ``self._temp_call_super_check_output`` as the textual fallback;
    otherwise ``OutputChecker.check_output`` is the fallback.

    Returns the fallback's verdict when no parser is available, ``False``
    when either text raises ``etree.XMLSyntaxError``, else the result of
    ``compare_docs``.
    """
    checker = getattr(self, '_temp_override_self', None)
    if checker is None:
        checker, delegate = self, OutputChecker.check_output
    else:
        delegate = self._temp_call_super_check_output

    parse = checker.get_parser(want, got, optionflags)
    if not parse:
        return delegate(checker, want, got, optionflags)

    docs = []
    for text in (want, got):
        try:
            docs.append(parse(text))
        except etree.XMLSyntaxError:
            return False
    return checker.compare_docs(*docs)

Source File: doctestcompare.py From aws-lambda-lxml with GNU General Public License v3.0

6 votes

def check_output(self, want, got, optionflags):
    """Return whether ``got`` matches ``want``, using a document parser if any.

    When ``self._temp_override_self`` is set it takes over as the acting
    checker and ``self._temp_call_super_check_output`` serves as the
    fallback; without it ``OutputChecker.check_output`` is the fallback.
    The fallback is used whenever ``get_parser`` returns a falsy value.
    An ``etree.XMLSyntaxError`` from parsing either text yields ``False``.
    """
    override = getattr(self, '_temp_override_self', None)
    if override is None:
        checker = self
        fallback = OutputChecker.check_output
    else:
        checker = override
        fallback = self._temp_call_super_check_output

    parse = checker.get_parser(want, got, optionflags)
    if not parse:
        return fallback(checker, want, got, optionflags)

    try:
        expected_doc = parse(want)
        actual_doc = parse(got)
    except etree.XMLSyntaxError:
        return False
    return checker.compare_docs(expected_doc, actual_doc)

Source File: doctestcompare.py From aws-lambda-lxml with GNU General Public License v3.0

6 votes

def check_output(self, want, got, optionflags):
    """Decide whether ``got`` is acceptable for ``want``.

    The acting checker is ``self._temp_override_self`` when that attribute
    is set (fallback: ``self._temp_call_super_check_output``), else
    ``self`` (fallback: ``OutputChecker.check_output``). If the acting
    checker provides a parser, both texts are parsed and handed to
    ``compare_docs``; a text that raises ``etree.XMLSyntaxError`` makes
    the check fail. Without a parser the fallback decides.
    """
    checker = getattr(self, '_temp_override_self', None)
    if checker is None:
        checker, delegate = self, OutputChecker.check_output
    else:
        delegate = self._temp_call_super_check_output

    parse = checker.get_parser(want, got, optionflags)
    if not parse:
        return delegate(checker, want, got, optionflags)

    docs = []
    for text in (want, got):
        try:
            docs.append(parse(text))
        except etree.XMLSyntaxError:
            return False
    return checker.compare_docs(*docs)

Source File: doctestcompare.py From stopstalk-deployment with MIT License

6 votes

def check_output(self, want, got, optionflags):
    """Compare expected and actual output, as documents when a parser exists.

    ``self._temp_override_self``, if set, becomes the acting checker and
    ``self._temp_call_super_check_output`` its fallback; otherwise the
    fallback is ``OutputChecker.check_output``. A falsy result from
    ``get_parser`` routes the comparison to the fallback. Otherwise both
    texts are parsed (``etree.XMLSyntaxError`` gives ``False``) and
    ``compare_docs`` decides.
    """
    override = getattr(self, '_temp_override_self', None)
    if override is None:
        checker = self
        fallback = OutputChecker.check_output
    else:
        checker = override
        fallback = self._temp_call_super_check_output

    parse = checker.get_parser(want, got, optionflags)
    if not parse:
        return fallback(checker, want, got, optionflags)

    try:
        expected_doc = parse(want)
        actual_doc = parse(got)
    except etree.XMLSyntaxError:
        return False
    return checker.compare_docs(expected_doc, actual_doc)

Source File: doctestcompare.py From aws-lambda-lxml with GNU General Public License v3.0

6 votes

def check_output(self, want, got, optionflags):
    """Check doctest output, preferring a parsed-document comparison.

    Uses ``self._temp_override_self`` as the acting checker when it is set
    (with ``self._temp_call_super_check_output`` as fallback) and ``self``
    otherwise (with ``OutputChecker.check_output`` as fallback).

    Returns the fallback's result if ``get_parser`` gives nothing,
    ``False`` if parsing ``want`` or ``got`` raises
    ``etree.XMLSyntaxError``, and ``compare_docs`` of the two parsed
    documents otherwise.
    """
    checker = getattr(self, '_temp_override_self', None)
    if checker is None:
        checker, delegate = self, OutputChecker.check_output
    else:
        delegate = self._temp_call_super_check_output

    parse = checker.get_parser(want, got, optionflags)
    if not parse:
        return delegate(checker, want, got, optionflags)

    docs = []
    for text in (want, got):
        try:
            docs.append(parse(text))
        except etree.XMLSyntaxError:
            return False
    return checker.compare_docs(*docs)

Source File: get_ui.py From adbui with MIT License

6 votes

def get_uis_by_xpath(self, xpath, is_update=True):
    """
    Find nodes by xpath.

    When ``is_update`` is true the XML is dumped and re-initialised first,
    retrying up to five times if an ``etree.XMLSyntaxError`` is raised.

    :param xpath: xpath expression evaluated against ``self.xml``
    :param is_update: whether to refresh the XML before searching
    :return: list with one ``get_ui_by_element`` result per matching element
    """
    if is_update:
        xml_str = None
        for _attempt in range(5):
            try:
                xml_str = self.adb_ext.dump()  # fetch the XML dump
                self.__init_xml(xml_str)
            except etree.XMLSyntaxError:
                logging.error('etree.XMLSyntaxError:\n')
                if xml_str:
                    logging.error('xml str:{}'.format(xml_str))
            else:
                break

    if sys.version_info[0] < 3:
        xpath = xpath.decode('utf-8')
    return [self.get_ui_by_element(node) for node in self.xml.xpath(xpath)]

Source File: test_backends.py From xblock-video with GNU General Public License v3.0

6 votes

def test_download_default_transcript(self, backend, download_transcript_mock, params):
    """
    Check default transcript is downloaded from a video platform API.

    For every outcome of the mock, the download is attempted with the
    matching entry of ``params``; the returned transcript is compared with
    the first expected value and the resulting message (empty on success)
    must contain the last expected value.
    """
    player_cls = self.player[backend]
    outcomes = download_transcript_mock.get_outcomes()
    for index, event in enumerate(outcomes):
        outcome_mock = download_transcript_mock(event=event)
        self.mocked_objects = outcome_mock.apply_mock(self.mocked_objects)
        try:
            transcript = player_cls(self.xblock).download_default_transcript(**params[index])
            message = ''
            expected_transcript = outcome_mock.expected_value[0]
            self.assertIsInstance(transcript, unicode)
            self.assertEqual(transcript, expected_transcript)
        except VideoXBlockException as ex:
            message = ex.message
        except etree.XMLSyntaxError:
            message = 'XMLSyntaxError exception'
        self.assertIn(outcome_mock.expected_value[-1], message)
        self.restore_mocked()

Source File: test_tesseract.py From nidaba with GNU General Public License v2.0

6 votes

def test_capi_multiple(self):
    """
    Test that tesseract CAPI calls create hocr output for multiple
    languages.

    The test is skipped when ``libtesseract.so.3`` cannot be loaded.
    """
    try:
        ctypes.cdll.LoadLibrary('libtesseract.so.3')
    except OSError:
        # ctypes reports a missing/unloadable shared library as OSError.
        raise unittest.SkipTest
    self.tesseract.setup(tessdata=tessdata, implementation='capi')
    ocr = self.tesseract.ocr_tesseract.run(('test', 'segmentation_tiff.xml'),
                                           languages=['grc', 'eng'],
                                           extended=False)
    outpath = os.path.join(self.storage_path, *ocr)
    self.assertTrue(os.path.isfile(outpath), msg='Tesseract did not '
                    'output a file!')
    try:
        # Only well-formedness is checked, so the parsed tree is discarded;
        # the context manager closes the file handle.
        with open(outpath) as fp:
            etree.parse(fp)
    except etree.XMLSyntaxError:
        self.fail(msg='The output was not valid html/xml!')

Source File: process_forest.py From process-forest with Apache License 2.0

5 votes

def get_entries(evtx):
    """
    Yield an Entry for each (xml, record) pair of the file header's XML view.

    Pairs for which an ``etree.XMLSyntaxError`` is raised are skipped.

    @rtype: generator of Entry
    """
    records = evtx_file_xml_view(evtx.get_file_header())
    for xml, record in records:
        try:
            yield Entry(xml, record)
        except etree.XMLSyntaxError:
            # Nothing follows in the loop body, so this just moves on to
            # the next pair.
            pass

Source File: doctestcompare.py From stopstalk-deployment with MIT License

5 votes

def output_difference(self, example, got, optionflags):
    """Describe how ``got`` differs from ``example.want``.

    Without a parser, or when either text raises ``etree.XMLSyntaxError``,
    the plain ``OutputChecker.output_difference`` text is returned
    (preceded by one line per parse error, if any). Otherwise the report
    contains the formatted expected document, the formatted actual
    document and their collected diff.
    """
    want = example.want
    parser = self.get_parser(want, got, optionflags)
    errors = []
    if parser is not None:
        try:
            want_doc = parser(want)
        except etree.XMLSyntaxError:
            errors.append('In example: %s' % sys.exc_info()[1])
        try:
            got_doc = parser(got)
        except etree.XMLSyntaxError:
            errors.append('In actual output: %s' % sys.exc_info()[1])

    if parser is None or errors:
        plain_report = OutputChecker.output_difference(
            self, example, got, optionflags)
        if not errors:
            return plain_report
        errors.append(plain_report)
        return '\n'.join(errors)

    html = parser is html_fromstring
    # Entries are evaluated in order: expected doc, actual doc, then diff.
    report = [
        'Expected:',
        self.format_doc(want_doc, html, 2),
        'Got:',
        self.format_doc(got_doc, html, 2),
        'Diff:',
        self.collect_diff(want_doc, got_doc, html, 2),
    ]
    return '\n'.join(report)

Source File: test_ocropus.py From nidaba with GNU General Public License v2.0

5 votes

def test_file_outpath_jpg(self):
    """
    Test that ocropus creates hocr output for jpgs.
    """
    ocr = self.ocropus.ocr_ocropus.run((('test', 'segmentation.xml'),
                                        ('test', 'image_jpg.jpg')),
                                       model='ocropus')
    try:
        parser = etree.HTMLParser()
        # Use a context manager so the file handle is closed even when
        # parsing fails.
        with open(os.path.join(self.storage_path, *ocr)) as fp:
            etree.parse(fp, parser)
    except etree.XMLSyntaxError:
        self.fail(msg='The output was not valid html/xml!')

Source File: ooyala_player.py From xblock-ooyala with GNU Affero General Public License v3.0

5 votes

def studio_submit(self, submissions, suffix=''):
    """Validate the submitted XML config and, if valid, store all settings.

    ``submissions['xml_config']`` is parsed first. On an XML syntax error
    nothing is stored and an error response carrying the parser message is
    returned; otherwise every submitted setting is copied onto the
    instance and a success response is returned.

    Returns a dict with a ``result`` key (``'error'`` or ``'success'``)
    and, on error, a ``message`` key.
    """

    xml_config = submissions['xml_config']
    try:
        etree.parse(StringIO(xml_config))
    except etree.XMLSyntaxError as e:
        response = {
            'result': 'error',
            # Exceptions have no `.message` attribute on Python 3; fall
            # back to the string form so the error path does not raise
            # AttributeError.
            'message': getattr(e, 'message', None) or str(e)
        }
    else:
        response = {
            'result': 'success',
        }

        self.xml_config = xml_config
        self.display_name = submissions['display_name']
        self.content_id = submissions['content_id'].strip()
        self.transcript_file_id = submissions['transcript_file_id'].strip()
        self.enable_player_token = submissions['enable_player_token']
        self.partner_code = submissions['partner_code']
        self.api_key = submissions['api_key']
        self.api_secret_key = submissions['api_secret_key']
        self.api_key_3play = submissions['api_key_3play']
        self.expiration_time = submissions['expiration_time']
        self.width = submissions['width']
        self.height = submissions['height']
        self.disable_cc_and_translations = submissions['cc_disable']

    return response

Source File: test_metadata.py From pikepdf with Mozilla Public License 2.0

5 votes

def test_truncated_xml(resources, idx):
    """Check metadata handling when the XMP stream is cut off at ``idx`` bytes.

    Opening the truncated metadata in strict mode may raise
    ``XMLSyntaxError`` or ``AssertionError``, which is tolerated; opening
    and editing it in non-strict mode must not raise.
    """
    pdf = Pdf.open(resources / 'sandwich.pdf')
    original_xmp = pdf.Root.Metadata.read_bytes()
    assume(idx < len(original_xmp))

    truncated_xmp = original_xmp[:idx]
    pdf.Root.Metadata = pdf.make_stream(truncated_xmp)
    try:
        with pdf.open_metadata(strict=True) as strict_meta:
            strict_meta['pdfaid:part'] = '5'
    except (XMLSyntaxError, AssertionError):
        pass

    with pdf.open_metadata(strict=False) as lenient_meta:
        lenient_meta['pdfaid:part'] = '7'

Source File: _fc.py From pypowervm with Apache License 2.0

5 votes

def _parse_pg83_xml(xml_resp):
    """Parse LUARecovery XML response, looking for pg83 descriptor.

    :param xml_resp: XML output of the LUARecovery Job. The code splits it
                     on the ``<?xml version="1.0"?>`` declaration, so it
                     must be a string.
                     NOTE(review): earlier documentation described a tuple
                     of OutputXML and StdOut -- confirm with the caller.
    :return: pg83 descriptor text, or None if not found.
    """
    # QUERY_INVENTORY response may contain more than one element.  Each will be
    # delimited by its own <?xml?> tag.  etree will only parse one at a time.
    for chunk in xml_resp.split('<?xml version="1.0"?>'):
        if not chunk:
            continue
        try:
            parsed = etree.fromstring(chunk)
        except etree.XMLSyntaxError as e:
            LOG.warning(_('QUERY_INVENTORY produced invalid chunk of XML '
                          '(%(chunk)s).  Error: %(err)s'),
                        {'chunk': chunk, 'err': e.args[0]})
            continue
        # iter() is the supported replacement for the deprecated
        # getiterator(); it walks the same elements in document order.
        for elem in parsed.iter():
            if (etree.QName(elem.tag).localname == 'PhysicalVolume_base' and
                    elem.attrib.get('desType') == "NAA"):
                return elem.attrib.get('descriptor')
    LOG.warning(_('Failed to find pg83 descriptor in XML output:\n%s'),
                xml_resp)
    return None

Python lxml.etree.XMLSyntaxError() Examples