Python lxml.html.parse() Examples

The following are 30 code examples of lxml.html.parse(), drawn from open-source projects. The source file, originating project, and license are noted above each example. You may also want to check out the other functions and classes available in the lxml.html module.
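
Before the project examples, here is a minimal, self-contained sketch of typical usage. It is illustrative only; the URL is a placeholder rather than one taken from the examples below.

from lxml.html import parse

# parse() accepts a filename, URL, or file-like object and returns an
# lxml ElementTree; getroot() yields the root element for XPath queries.
tree = parse("http://example.com/")
root = tree.getroot()
print(root.xpath("//title/text()"))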
Example #1
Source File: html.py    From recruit with Apache License 2.0
def _parse_tables(self, doc, match, attrs):
        """
        Return all tables from the parsed DOM.

        Parameters
        ----------
        doc : tree-like
            The DOM from which to parse the table element.

        match : str or regular expression
            The text to search for in the DOM tree.

        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

        Raises
        ------
        ValueError : `match` does not match any text in the document.

        Returns
        -------
        list of node-like
            HTML <table> elements to be parsed into raw data.
        """
        raise AbstractMethodError(self) 
Example #2
Source File: html.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def _parse_tables(self, doc, match, attrs):
        """
        Return all tables from the parsed DOM.

        Parameters
        ----------
        doc : tree-like
            The DOM from which to parse the table element.

        match : str or regular expression
            The text to search for in the DOM tree.

        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

        Raises
        ------
        ValueError : `match` does not match any text in the document.

        Returns
        -------
        list of node-like
            HTML <table> elements to be parsed into raw data.
        """
        raise AbstractMethodError(self) 
Example #3
Source File: html.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
    flavor = _validate_flavor(flavor)
    compiled_match = re.compile(match)  # you can pass a compiled regex here

    # hack around python 3 deleting the exception variable
    retained = None
    for flav in flavor:
        parser = _parser_dispatch(flav)
        p = parser(io, compiled_match, attrs, encoding, displayed_only)

        try:
            tables = p.parse_tables()
        except Exception as caught:
            # if `io` is an io-like object, check if it's seekable
            # and try to rewind it before trying the next parser
            if hasattr(io, 'seekable') and io.seekable():
                io.seek(0)
            elif hasattr(io, 'seekable') and not io.seekable():
                # if we couldn't rewind it, let the user know
                raise ValueError('The flavor {} failed to parse your input. '
                                 'Since you passed a non-rewindable file '
                                 'object, we can\'t rewind it to try '
                                 'another parser. Try read_html() with a '
                                 'different flavor.'.format(flav))

            retained = caught
        else:
            break
    else:
        raise_with_traceback(retained)

    ret = []
    for table in tables:
        try:
            ret.append(_data_to_frame(data=table, **kwargs))
        except EmptyDataError:  # empty table
            continue
    return ret 
Example #4
Source File: html.py    From recruit with Apache License 2.0
def _build_doc(self):
        """
        Return a tree-like object that can be used to iterate over the DOM.

        Returns
        -------
        node-like
            The DOM from which to parse the table element.
        """
        raise AbstractMethodError(self) 
Example #5
Source File: gifbot.py    From SnapchatBot with MIT License
def grab_trending_gif_urls():
    doc = parse("http://giphy.com").getroot()
    els = doc.cssselect(".gif-link img")[:10]
    ret = []
    for el in els:
        ret.append("http:" + re.sub(r"\/([^./])*\.gif", "/giphy.gif", el.attrib['src']))
    return ret 
Example #6
Source File: core.py    From libextract with MIT License
def parse_html(fileobj, encoding):
    """
    Given a file object *fileobj* and an *encoding*, return an
    ElementTree instance parsed with lxml's HTMLParser.
    """
    parser = HTMLParser(encoding=encoding, remove_blank_text=True)
    return parse(fileobj, parser) 
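
A hedged usage sketch for the helper above; the file name is hypothetical:

# parse_html() takes a binary file object and an encoding name
with open("page.html", "rb") as fileobj:
    tree = parse_html(fileobj, "utf-8")
    root = tree.getroot()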
Example #7
Source File: tutorial.py    From wextracto with BSD 3-Clause "New" or "Revised" License
def extract(response):
    tree = parse(response)
    return tree.xpath('//h1/text()') 
Example #8
Source File: tutorial.py    From wextracto with BSD 3-Clause "New" or "Revised" License
def extract(response):
    tree = parse(response)
    yield "name", text(tree.xpath('//h1'))
    yield "country", text(tree.xpath('//dd[@id="country"]'))
    yield "region", text(tree.xpath('//dd[@id="region"]')) 
Example #9
Source File: tutorial.py    From wextracto with BSD 3-Clause "New" or "Revised" License
def extract(response):
    tree = parse(response)
    return text(tree.xpath('//h1/text()')) 
Example #10
Source File: rvl_cdip.py    From unilm with MIT License
def read_hocr_file(self, data_dir, file):
        hocr_file = os.path.join(data_dir, "images", file[:-4] + ".xml")
        text_buffer = []
        bbox_buffer = []
        try:
            doc = html.parse(hocr_file)
        except AssertionError:
            logger.warning(
                "%s is empty or its format is unacceptable. Skipped.", hocr_file
            )
            return [], []
        for page in doc.xpath("//*[@class='ocr_page']"):
            page_bbox = [int(x) for x in get_prop(page, "bbox").split()]
            width, height = page_bbox[2], page_bbox[3]
            # scope the query to this page so multi-page files are not double-counted
            for word in page.xpath(".//*[@class='ocrx_word']"):
                textnodes = word.xpath(".//text()")
                s = "".join([text for text in textnodes])
                text = re.sub(r"\s+", " ", s).strip()
                if text:
                    text_buffer.append(text)
                    bbox = [int(x) for x in get_prop(word, "bbox").split()]
                    bbox = [
                        bbox[0] / width,
                        bbox[1] / height,
                        bbox[2] / width,
                        bbox[3] / height,
                    ]
                    bbox = [int(x * 1000) for x in bbox]
                    bbox_buffer.append(bbox)
        return text_buffer, bbox_buffer 
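
Note on the example above: each word box is divided by the page width/height and rescaled to integers in the 0-1000 range, the normalized coordinate convention that the LayoutLM-style models in this repository appear to expect.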
Example #11
Source File: html.py    From Splunking-Crime with GNU Affero General Public License v3.0
def _parse_tables(self, doc, match, attrs):
        """Return all tables from the parsed DOM.

        Parameters
        ----------
        doc : tree-like
            The DOM from which to parse the table element.

        match : str or regular expression
            The text to search for in the DOM tree.

        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

        Raises
        ------
        ValueError
            * If `match` does not match any text in the document.

        Returns
        -------
        tables : list of node-like
            A list of <table> elements to be parsed into raw data.
        """
        raise AbstractMethodError(self) 
Example #12
Source File: make.py    From facebook-friends-map with MIT License
def index_friends():
    friends = utils.db_read(db_index)
    already_parsed = []
    for d in friends:
        already_parsed.append(d['id'])
    print('Loading saved friends list...')

    file_path = os.getcwd() + '/' + friends_html
    x = html.parse(file_path).xpath
    base = '(//*[@data-sigil="undoable-action"])'
    num_items = len(x(base))
    if num_items == 0:
        print("\nWasn't able to parse friends index. This probably means that Facebook updated their template. \nPlease raise issue on Github and I will try to update the script. \nOr if you can code, please submit a pull request instead :)\n")
        sys.exit()
    for i in range(1, num_items + 1):
        b = base + '[' + str(i) + ']/'
        info = json.loads(x(b + '/div[3]/div/div/div[3]')[0].get('data-store'))
        stdout.flush()
        stdout.write("\rScanning friend list... (%d / %d)" % (i, num_items))
        if info['id'] not in already_parsed:
            name = x(b + '/div[2]//a')[0].text
            alias = '' if info['is_deactivated'] else x(b + '/div[2]//a')[0].get('href')[1:]
            d = {
                'id': info['id'],
                'name': name,
                'active': 0 if int(info['is_deactivated']) else 1,
                'alias': alias
            }
            utils.db_write(db_index, d)

    print('\n>> Saved friends list (%s) to %s' % (num_items,db_index))

# Download profile pages 
Example #13
Source File: test_html.py    From elasticintel with GNU General Public License v3.0
def get_lxml_elements(url, element):
    _skip_if_no('lxml')
    from lxml.html import parse
    doc = parse(url)
    return doc.xpath('.//{0}'.format(element)) 
Example #14
Source File: html.py    From elasticintel with GNU General Public License v3.0
def _parse_tables(self, doc, match, attrs):
        """Return all tables from the parsed DOM.

        Parameters
        ----------
        doc : tree-like
            The DOM from which to parse the table element.

        match : str or regular expression
            The text to search for in the DOM tree.

        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

        Raises
        ------
        ValueError
            * If `match` does not match any text in the document.

        Returns
        -------
        tables : list of node-like
            A list of <table> elements to be parsed into raw data.
        """
        raise AbstractMethodError(self) 
Example #15
Source File: http.py    From memorious with MIT License
def html(self):
        if not hasattr(self, '_html'):
            self._html = None
            if self.content_type in NON_HTML:
                return
            if self.raw is None or not len(self.raw):
                return
            try:
                self._html = html.fromstring(self.text)
            except ValueError as ve:
                if 'encoding declaration' in str(ve):
                    self._html = html.parse(self.file_path.as_posix())
            except (etree.ParserError, etree.ParseError):
                pass
        return self._html 
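
The fallback above works around an lxml behavior: fromstring() raises a ValueError for unicode strings that carry an XML encoding declaration, so in that case the content is re-parsed from the file on disk via html.parse().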
Example #16
Source File: http.py    From memorious with MIT License
def xml(self):
        if not hasattr(self, '_xml'):
            parser = etree.XMLParser(
                ns_clean=True,
                recover=True,
                resolve_entities=False,
                no_network=True
            )
            self._xml = etree.parse(self.file_path.as_posix(), parser=parser)
        return self._xml 
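
Design note: recover=True keeps the parser going through malformed markup, while resolve_entities=False and no_network=True reduce exposure to entity-expansion (XXE) and network-fetch attacks when parsing untrusted input.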
Example #17
Source File: http.py    From memorious with MIT License
def json(self):
        if not hasattr(self, '_json'):
            if self.file_path is None:
                raise ParseError("Cannot parse failed download.")
            with open(self.file_path, 'r') as fh:
                self._json = json.load(fh)
        return self._json 
Example #18
Source File: models.py    From jorvik with GNU General Public License v3.0
def corpo_body(self):
        """
        Tries to extract the body of the page (body).
        :return:
        """
        if not self.corpo:
            return ""
        doc = html.document_fromstring(self.corpo)
        body = doc.xpath('//body')[0]
        body.tag = 'div'
        #try:
        return html.tostring(body)
        #except:
        #    return self.corpo
        #print html.parse('http://someurl.at.domain').xpath('//body')[0].text_content() 
Example #19
Source File: html.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def _build_doc(self):
        """
        Return a tree-like object that can be used to iterate over the DOM.

        Returns
        -------
        node-like
            The DOM from which to parse the table element.
        """
        raise AbstractMethodError(self) 
Example #20
Source File: parsers.py    From riko with MIT License
def xml2etree(f, xml=True, html5=False):
    if xml:
        element_tree = etree.parse(f)
    elif html5 and html5parser:
        element_tree = html5parser.parse(f)
    elif html5parser:
        element_tree = html.parse(f)
    else:
        # html5lib's parser returns an Element, so we must convert it into an
        # ElementTree
        element_tree = ElementTree(html.parse(f))

    return element_tree 
Example #21
Source File: parsers.py    From riko with MIT License
def parse_rss(url=None, **kwargs):
    try:
        f = fetch(decode(url), **kwargs)
    except (ValueError, URLError):
        parsed = rssparser.parse(url)
    else:
        content = f.read() if speedparser else f

        try:
            parsed = rssparser.parse(content)
        finally:
            f.close()

    return parsed 
Example #22
Source File: test_html.py    From Computable with MIT License
def get_lxml_elements(url, element):
    _skip_if_no('lxml')
    from lxml.html import parse
    doc = parse(url)
    return doc.xpath('.//{0}'.format(element)) 
Example #23
Source File: html.py    From recruit with Apache License 2.0
def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
    flavor = _validate_flavor(flavor)
    compiled_match = re.compile(match)  # you can pass a compiled regex here

    # hack around python 3 deleting the exception variable
    retained = None
    for flav in flavor:
        parser = _parser_dispatch(flav)
        p = parser(io, compiled_match, attrs, encoding, displayed_only)

        try:
            tables = p.parse_tables()
        except Exception as caught:
            # if `io` is an io-like object, check if it's seekable
            # and try to rewind it before trying the next parser
            if hasattr(io, 'seekable') and io.seekable():
                io.seek(0)
            elif hasattr(io, 'seekable') and not io.seekable():
                # if we couldn't rewind it, let the user know
                raise ValueError('The flavor {} failed to parse your input. '
                                 'Since you passed a non-rewindable file '
                                 'object, we can\'t rewind it to try '
                                 'another parser. Try read_html() with a '
                                 'different flavor.'.format(flav))

            retained = caught
        else:
            break
    else:
        raise_with_traceback(retained)

    ret = []
    for table in tables:
        try:
            ret.append(_data_to_frame(data=table, **kwargs))
        except EmptyDataError:  # empty table
            continue
    return ret 
Example #24
Source File: html.py    From Computable with MIT License
def _parse_tables(self, doc, match, attrs):
        """Return all tables from the parsed DOM.

        Parameters
        ----------
        doc : tree-like
            The DOM from which to parse the table element.

        match : str or regular expression
            The text to search for in the DOM tree.

        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

        Raises
        ------
        ValueError
            * If `match` does not match any text in the document.

        Returns
        -------
        tables : list of node-like
            A list of <table> elements to be parsed into raw data.
        """
        raise NotImplementedError 
Example #25
Source File: check_https.py    From httpswatch with MIT License
def fetch_through_redirects(url):
    tree = None
    while True:
        cont = False
        resp = requests.get(
            url,
            verify=certifi.where(),
            headers={"User-Agent": USER_AGENT},
            timeout=10,
            stream=True,
        )
        try:
            if resp.status_code != 200:
                raise Not200(resp.status_code)
            # Convince urllib3 to decode gzipped pages.
            resp.raw.decode_content = True
            tree = html.parse(resp.raw)
        finally:
            resp.close()
        # Check for sneaky <meta> redirects.
        for meta in META_XPATH(tree):
            m = re.match(r"0;\s*url=['\"](.+?)['\"]", meta.get("content"))
            if m is not None:
                url = m.groups()[0]
                cont = True
                break
        if not cont:
            break
    return resp, tree 
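
META_XPATH is defined elsewhere in this module; it selects candidate <meta> elements, and the regular expression then recognizes refresh redirects whose content attribute has the form 0; url='...'.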
Example #26
Source File: html.py    From vnpy_crypto with MIT License
def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
    flavor = _validate_flavor(flavor)
    compiled_match = re.compile(match)  # you can pass a compiled regex here

    # hack around python 3 deleting the exception variable
    retained = None
    for flav in flavor:
        parser = _parser_dispatch(flav)
        p = parser(io, compiled_match, attrs, encoding, displayed_only)

        try:
            tables = p.parse_tables()
        except Exception as caught:
            # if `io` is an io-like object, check if it's seekable
            # and try to rewind it before trying the next parser
            if hasattr(io, 'seekable') and io.seekable():
                io.seek(0)
            elif hasattr(io, 'seekable') and not io.seekable():
                # if we couldn't rewind it, let the user know
                raise ValueError('The flavor {} failed to parse your input. '
                                 'Since you passed a non-rewindable file '
                                 'object, we can\'t rewind it to try '
                                 'another parser. Try read_html() with a '
                                 'different flavor.'.format(flav))

            retained = caught
        else:
            break
    else:
        raise_with_traceback(retained)

    ret = []
    for table in tables:
        try:
            ret.append(_data_to_frame(data=table, **kwargs))
        except EmptyDataError:  # empty table
            continue
    return ret 
Example #27
Source File: MangaList.py    From MangaScrapper with Apache License 2.0
def main():
    with open("mangalist.csv", "w") as f:
        tree = parse("http://www.mangapanda.com/alphabetical")
        manga_name_list = tree.xpath("//ul[@class='series_alpha']/li/a/text()")
        manga_url_list = tree.xpath("//ul[@class='series_alpha']/li/a/@href")
        f.write("\"Manga Name\", URL\n")

        for i in range(len(manga_name_list)):
            f.write("\"{0}\", http://www.mangapanda.com{1}\n".format(manga_name_list[i].replace("\"", ""), manga_url_list[i])) 
Example #28
Source File: html.py    From vnpy_crypto with MIT License
def _parse_tables(self, doc, match, attrs):
        """Return all tables from the parsed DOM.

        Parameters
        ----------
        doc : tree-like
            The DOM from which to parse the table element.

        match : str or regular expression
            The text to search for in the DOM tree.

        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

        Raises
        ------
        ValueError
            * If `match` does not match any text in the document.

        Returns
        -------
        tables : list of node-like
            A list of <table> elements to be parsed into raw data.
        """
        raise com.AbstractMethodError(self) 
Example #29
Source File: html.py    From recruit with Apache License 2.0
def _build_doc(self):
        """
        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.html import parse, fromstring, HTMLParser
        from lxml.etree import XMLSyntaxError
        parser = HTMLParser(recover=True, encoding=self.encoding)

        try:
            if _is_url(self.io):
                with urlopen(self.io) as f:
                    r = parse(f, parser=parser)
            else:
                # try to parse the input in the simplest way
                r = parse(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        except (UnicodeDecodeError, IOError) as e:
            # if the input is a blob of html goop
            if not _is_url(self.io):
                r = fromstring(self.io, parser=parser)

                try:
                    r = r.getroot()
                except AttributeError:
                    pass
            else:
                raise e
        else:
            if not hasattr(r, 'text_content'):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
        return r 
Example #30
Source File: html.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def _build_doc(self):
        """
        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.html import parse, fromstring, HTMLParser
        from lxml.etree import XMLSyntaxError
        parser = HTMLParser(recover=True, encoding=self.encoding)

        try:
            if _is_url(self.io):
                with urlopen(self.io) as f:
                    r = parse(f, parser=parser)
            else:
                # try to parse the input in the simplest way
                r = parse(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        except (UnicodeDecodeError, IOError) as e:
            # if the input is a blob of html goop
            if not _is_url(self.io):
                r = fromstring(self.io, parser=parser)

                try:
                    r = r.getroot()
                except AttributeError:
                    pass
            else:
                raise e
        else:
            if not hasattr(r, 'text_content'):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
        return r
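
The pandas internals in Examples #29 and #30 sit behind the public pandas.read_html() entry point. A hedged end-user sketch (the URL is a placeholder):

import pandas as pd

# read_html() drives the _build_doc()/_parse_tables() machinery shown above;
# flavor="lxml" selects the lxml-backed parser.
tables = pd.read_html("http://example.com/tables.html", flavor="lxml")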