Python lxml.html.parse() Examples

The following are 30 code examples of lxml.html.parse(), drawn from open-source projects. The source file, originating project, and license are noted above each example. You may also want to check out the other functions and classes available in the lxml.html module.
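
Before the project examples, here is a minimal, self-contained sketch of typical usage. It is illustrative only; the URL is a placeholder rather than one taken from the examples below.

from lxml.html import parse

# parse() accepts a filename, URL, or file-like object and returns an
# lxml ElementTree; getroot() yields the root element for XPath queries.
tree = parse("http://example.com/")
root = tree.getroot()
print(root.xpath("//title/text()"))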
Example #1
Source File: html.py    From recruit with Apache License 2.0
def _parse_tables(self, doc, match, attrs):
        """
        Return all tables from the parsed DOM.

        Parameters
        ----------
        doc : tree-like
            The DOM from which to parse the table element.

        match : str or regular expression
            The text to search for in the DOM tree.

        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

        Raises
        ------
        ValueError : `match` does not match any text in the document.

        Returns
        -------
        list of node-like
            HTML <table> elements to be parsed into raw data.
        """
        raise AbstractMethodError(self) 
Example #2
Source File: html.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def _parse_tables(self, doc, match, attrs):
        """
        Return all tables from the parsed DOM.

        Parameters
        ----------
        doc : tree-like
            The DOM from which to parse the table element.

        match : str or regular expression
            The text to search for in the DOM tree.

        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

        Raises
        ------
        ValueError : `match` does not match any text in the document.

        Returns
        -------
        list of node-like
            HTML <table> elements to be parsed into raw data.
        """
        raise AbstractMethodError(self) 
Example #3
Source File: html.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
    flavor = _validate_flavor(flavor)
    compiled_match = re.compile(match)  # you can pass a compiled regex here

    # hack around python 3 deleting the exception variable
    retained = None
    for flav in flavor:
        parser = _parser_dispatch(flav)
        p = parser(io, compiled_match, attrs, encoding, displayed_only)

        try:
            tables = p.parse_tables()
        except Exception as caught:
            # if `io` is an io-like object, check if it's seekable
            # and try to rewind it before trying the next parser
            if hasattr(io, 'seekable') and io.seekable():
                io.seek(0)
            elif hasattr(io, 'seekable') and not io.seekable():
                # if we couldn't rewind it, let the user know
                raise ValueError('The flavor {} failed to parse your input. '
                                 'Since you passed a non-rewindable file '
                                 'object, we can\'t rewind it to try '
                                 'another parser. Try read_html() with a '
                                 'different flavor.'.format(flav))

            retained = caught
        else:
            break
    else:
        raise_with_traceback(retained)

    ret = []
    for table in tables:
        try:
            ret.append(_data_to_frame(data=table, **kwargs))
        except EmptyDataError:  # empty table
            continue
    return ret 
Example #4
Source File: html.py    From recruit with Apache License 2.0
def _build_doc(self):
        """
        Return a tree-like object that can be used to iterate over the DOM.

        Returns
        -------
        node-like
            The DOM from which to parse the table element.
        """
        raise AbstractMethodError(self) 
Example #5
Source File: gifbot.py    From SnapchatBot with MIT License
def grab_trending_gif_urls():
    doc = parse("http://giphy.com").getroot()
    els = doc.cssselect(".gif-link img")[:10]
    ret = []
    for el in els:
        ret.append("http:" + re.sub(r"\/([^./])*\.gif", "/giphy.gif", el.attrib['src']))
    return ret 
Example #6
Source File: core.py    From libextract with MIT License
def parse_html(fileobj, encoding):
    """
    Given a file object *fileobj* and an *encoding*, return an
    ElementTree instance parsed with lxml's HTMLParser.
    """
    parser = HTMLParser(encoding=encoding, remove_blank_text=True)
    return parse(fileobj, parser) 
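
A hedged usage sketch for the helper above; the file name is hypothetical:

# parse_html() takes a binary file object and an encoding name
with open("page.html", "rb") as fileobj:
    tree = parse_html(fileobj, "utf-8")
    root = tree.getroot()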
Example #7
Source File: tutorial.py    From wextracto with BSD 3-Clause "New" or "Revised" License
def extract(response):
    tree = parse(response)
    return tree.xpath('//h1/text()') 
Example #8
Source File: tutorial.py    From wextracto with BSD 3-Clause "New" or "Revised" License
def extract(response):
    tree = parse(response)
    yield "name", text(tree.xpath('//h1'))
    yield "country", text(tree.xpath('//dd[@id="country"]'))
    yield "region", text(tree.xpath('//dd[@id="region"]')) 
Example #9
Source File: tutorial.py    From wextracto with BSD 3-Clause "New" or "Revised" License
def extract(response):
    tree = parse(response)
    return text(tree.xpath('//h1/text()')) 
Example #10
Source File: rvl_cdip.py    From unilm with MIT License
def read_hocr_file(self, data_dir, file):
        hocr_file = os.path.join(data_dir, "images", file[:-4] + ".xml")
        text_buffer = []
        bbox_buffer = []
        try:
            doc = html.parse(hocr_file)
        except AssertionError:
            logger.warning(
                "%s is empty or its format is unacceptable. Skipped.", hocr_file
            )
            return [], []
        for page in doc.xpath("//*[@class='ocr_page']"):
            page_bbox = [int(x) for x in get_prop(page, "bbox").split()]
            width, height = page_bbox[2], page_bbox[3]
            # scope the query to this page so multi-page files are not double-counted
            for word in page.xpath(".//*[@class='ocrx_word']"):
                textnodes = word.xpath(".//text()")
                s = "".join([text for text in textnodes])
                text = re.sub(r"\s+", " ", s).strip()
                if text:
                    text_buffer.append(text)
                    bbox = [int(x) for x in get_prop(word, "bbox").split()]
                    bbox = [
                        bbox[0] / width,
                        bbox[1] / height,
                        bbox[2] / width,
                        bbox[3] / height,
                    ]
                    bbox = [int(x * 1000) for x in bbox]
                    bbox_buffer.append(bbox)
        return text_buffer, bbox_buffer 
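
Note on the example above: each word box is divided by the page width/height and rescaled to integers in the 0-1000 range, the normalized coordinate convention that the LayoutLM-style models in this repository appear to expect.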
Example #11
Source File: html.py    From Splunking-Crime with GNU Affero General Public License v3.0
def _parse_tables(self, doc, match, attrs):
        """Return all tables from the parsed DOM.

        Parameters
        ----------
        doc : tree-like
            The DOM from which to parse the table element.

        match : str or regular expression
            The text to search for in the DOM tree.

        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

        Raises
        ------
        ValueError
            * If `match` does not match any text in the document.

        Returns
        -------
        tables : list of node-like
            A list of <table> elements to be parsed into raw data.
        """
        raise AbstractMethodError(self) 
Example #12
Source File: make.py    From facebook-friends-map with MIT License
def index_friends():
    friends = utils.db_read(db_index)
    already_parsed = []
    for d in friends:
        already_parsed.append(d['id'])
    print('Loading saved friends list...')

    file_path = os.getcwd() + '/' + friends_html
    x = html.parse(file_path).xpath
    base = '(//*[@data-sigil="undoable-action"])'
    num_items = len(x(base))
    if num_items == 0:
        print("\nWasn't able to parse friends index. This probably means that Facebook updated their template. \nPlease raise issue on Github and I will try to update the script. \nOr if you can code, please submit a pull request instead :)\n")
        sys.exit()
    for i in range(1, num_items + 1):
        b = base + '[' + str(i) + ']/'
        info = json.loads(x(b + '/div[3]/div/div/div[3]')[0].get('data-store'))
        stdout.flush()
        stdout.write("\rScanning friend list... (%d / %d)" % (i, num_items))
        if info['id'] not in already_parsed:
            name = x(b + '/div[2]//a')[0].text
            alias = '' if info['is_deactivated'] else x(b + '/div[2]//a')[0].get('href')[1:]
            d = {
                'id': info['id'],
                'name': name,
                'active': 0 if int(info['is_deactivated']) else 1,
                'alias': alias
            }
            utils.db_write(db_index, d)

    print('\n>> Saved friends list (%s) to %s' % (num_items,db_index))

# Download profile pages 
Example #13
Source File: test_html.py    From elasticintel with GNU General Public License v3.0
def get_lxml_elements(url, element):
    _skip_if_no('lxml')
    from lxml.html import parse
    doc = parse(url)
    return doc.xpath('.//{0}'.format(element)) 
Example #14
Source File: html.py    From elasticintel with GNU General Public License v3.0
def _parse_tables(self, doc, match, attrs):
        """Return all tables from the parsed DOM.

        Parameters
        ----------
        doc : tree-like
            The DOM from which to parse the table element.

        match : str or regular expression
            The text to search for in the DOM tree.

        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

        Raises
        ------
        ValueError
            * If `match` does not match any text in the document.

        Returns
        -------
        tables : list of node-like
            A list of <table> elements to be parsed into raw data.
        """
        raise AbstractMethodError(self) 
Example #15
Source File: http.py    From memorious with MIT License
def html(self):
        if not hasattr(self, '_html'):
            self._html = None
            if self.content_type in NON_HTML:
                return
            if self.raw is None or not len(self.raw):
                return
            try:
                self._html = html.fromstring(self.text)
            except ValueError as ve:
                if 'encoding declaration' in str(ve):
                    self._html = html.parse(self.file_path.as_posix())
            except (etree.ParserError, etree.ParseError):
                pass
        return self._html 
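
The fallback above works around an lxml behavior: fromstring() raises a ValueError for unicode strings that carry an XML encoding declaration, so in that case the content is re-parsed from the file on disk via html.parse().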
Example #16
Source File: http.py    From memorious with MIT License
def xml(self):
        if not hasattr(self, '_xml'):
            parser = etree.XMLParser(
                ns_clean=True,
                recover=True,
                resolve_entities=False,
                no_network=True
            )
            self._xml = etree.parse(self.file_path.as_posix(), parser=parser)
        return self._xml 
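
Design note: recover=True keeps the parser going through malformed markup, while resolve_entities=False and no_network=True reduce exposure to entity-expansion (XXE) and network-fetch attacks when parsing untrusted input.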
Example #17
Source File: http.py    From memorious with MIT License
def json(self):
        if not hasattr(self, '_json'):
            if self.file_path is None:
                raise ParseError("Cannot parse failed download.")
            with open(self.file_path, 'r') as fh:
                self._json = json.load(fh)
        return self._json 
Example #18
Source File: models.py    From jorvik with GNU General Public License v3.0
def corpo_body(self):
        """
        Tries to extract the body of the page (body).
        :return:
        """
        if not self.corpo:
            return ""
        doc = html.document_fromstring(self.corpo)
        body = doc.xpath('//body')[0]
        body.tag = 'div'
        #try:
        return html.tostring(body)
        #except:
        #    return self.corpo
        #print html.parse('http://someurl.at.domain').xpath('//body')[0].text_content() 
Example #19
Source File: html.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def _build_doc(self):
        """
        Return a tree-like object that can be used to iterate over the DOM.

        Returns
        -------
        node-like
            The DOM from which to parse the table element.
        """
        raise AbstractMethodError(self) 
Example #20
Source File: parsers.py    From riko with MIT License
def xml2etree(f, xml=True, html5=False):
    if xml:
        element_tree = etree.parse(f)
    elif html5 and html5parser:
        element_tree = html5parser.parse(f)
    elif html5parser:
        element_tree = html.parse(f)
    else:
        # html5lib's parser returns an Element, so we must convert it into an
        # ElementTree
        element_tree = ElementTree(html.parse(f))

    return element_tree 
Example #21
Source File: parsers.py    From riko with MIT License
def parse_rss(url=None, **kwargs):
    try:
        f = fetch(decode(url), **kwargs)
    except (ValueError, URLError):
        parsed = rssparser.parse(url)
    else:
        content = f.read() if speedparser else f

        try:
            parsed = rssparser.parse(content)
        finally:
            f.close()

    return parsed 
Example #22
Source File: test_html.py    From Computable with MIT License
def get_lxml_elements(url, element):
    _skip_if_no('lxml')
    from lxml.html import parse
    doc = parse(url)
    return doc.xpath('.//{0}'.format(element)) 
Example #23
Source File: html.py    From recruit with Apache License 2.0
def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
    flavor = _validate_flavor(flavor)
    compiled_match = re.compile(match)  # you can pass a compiled regex here

    # hack around python 3 deleting the exception variable
    retained = None
    for flav in flavor:
        parser = _parser_dispatch(flav)
        p = parser(io, compiled_match, attrs, encoding, displayed_only)

        try:
            tables = p.parse_tables()
        except Exception as caught:
            # if `io` is an io-like object, check if it's seekable
            # and try to rewind it before trying the next parser
            if hasattr(io, 'seekable') and io.seekable():
                io.seek(0)
            elif hasattr(io, 'seekable') and not io.seekable():
                # if we couldn't rewind it, let the user know
                raise ValueError('The flavor {} failed to parse your input. '
                                 'Since you passed a non-rewindable file '
                                 'object, we can\'t rewind it to try '
                                 'another parser. Try read_html() with a '
                                 'different flavor.'.format(flav))

            retained = caught
        else:
            break
    else:
        raise_with_traceback(retained)

    ret = []
    for table in tables:
        try:
            ret.append(_data_to_frame(data=table, **kwargs))
        except EmptyDataError:  # empty table
            continue
    return ret 
Example #24
Source File: html.py    From Computable with MIT License
def _parse_tables(self, doc, match, attrs):
        """Return all tables from the parsed DOM.

        Parameters
        ----------
        doc : tree-like
            The DOM from which to parse the table element.

        match : str or regular expression
            The text to search for in the DOM tree.

        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

        Raises
        ------
        ValueError
            * If `match` does not match any text in the document.

        Returns
        -------
        tables : list of node-like
            A list of <table> elements to be parsed into raw data.
        """
        raise NotImplementedError 
Example #25
Source File: check_https.py    From httpswatch with MIT License
def fetch_through_redirects(url):
    tree = None
    while True:
        cont = False
        resp = requests.get(
            url,
            verify=certifi.where(),
            headers={"User-Agent": USER_AGENT},
            timeout=10,
            stream=True,
        )
        try:
            if resp.status_code != 200:
                raise Not200(resp.status_code)
            # Convince urllib3 to decode gzipped pages.
            resp.raw.decode_content = True
            tree = html.parse(resp.raw)
        finally:
            resp.close()
        # Check for sneaky <meta> redirects.
        for meta in META_XPATH(tree):
            m = re.match(r"0;\s*url=['\"](.+?)['\"]", meta.get("content"))
            if m is not None:
                url = m.groups()[0]
                cont = True
                break
        if not cont:
            break
    return resp, tree 
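
META_XPATH is defined elsewhere in this module; it selects candidate <meta> elements, and the regular expression then recognizes refresh redirects whose content attribute has the form 0; url='...'.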
Example #26
Source File: html.py    From vnpy_crypto with MIT License
def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
    flavor = _validate_flavor(flavor)
    compiled_match = re.compile(match)  # you can pass a compiled regex here

    # hack around python 3 deleting the exception variable
    retained = None
    for flav in flavor:
        parser = _parser_dispatch(flav)
        p = parser(io, compiled_match, attrs, encoding, displayed_only)

        try:
            tables = p.parse_tables()
        except Exception as caught:
            # if `io` is an io-like object, check if it's seekable
            # and try to rewind it before trying the next parser
            if hasattr(io, 'seekable') and io.seekable():
                io.seek(0)
            elif hasattr(io, 'seekable') and not io.seekable():
                # if we couldn't rewind it, let the user know
                raise ValueError('The flavor {} failed to parse your input. '
                                 'Since you passed a non-rewindable file '
                                 'object, we can\'t rewind it to try '
                                 'another parser. Try read_html() with a '
                                 'different flavor.'.format(flav))

            retained = caught
        else:
            break
    else:
        raise_with_traceback(retained)

    ret = []
    for table in tables:
        try:
            ret.append(_data_to_frame(data=table, **kwargs))
        except EmptyDataError:  # empty table
            continue
    return ret 
Example #27
Source File: MangaList.py    From MangaScrapper with Apache License 2.0
def main():
    with open("mangalist.csv", "w") as f:
        tree = parse("http://www.mangapanda.com/alphabetical")
        manga_name_list = tree.xpath("//ul[@class='series_alpha']/li/a/text()")
        manga_url_list = tree.xpath("//ul[@class='series_alpha']/li/a/@href")
        f.write("\"Manga Name\", URL\n")

        for i in range(len(manga_name_list)):
            f.write("\"{0}\", http://www.mangapanda.com{1}\n".format(manga_name_list[i].replace("\"", ""), manga_url_list[i])) 
Example #28
Source File: html.py    From vnpy_crypto with MIT License
def _parse_tables(self, doc, match, attrs):
        """Return all tables from the parsed DOM.

        Parameters
        ----------
        doc : tree-like
            The DOM from which to parse the table element.

        match : str or regular expression
            The text to search for in the DOM tree.

        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

        Raises
        ------
        ValueError
            * If `match` does not match any text in the document.

        Returns
        -------
        tables : list of node-like
            A list of <table> elements to be parsed into raw data.
        """
        raise com.AbstractMethodError(self) 
Example #29
Source File: html.py    From recruit with Apache License 2.0
def _build_doc(self):
        """
        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.html import parse, fromstring, HTMLParser
        from lxml.etree import XMLSyntaxError
        parser = HTMLParser(recover=True, encoding=self.encoding)

        try:
            if _is_url(self.io):
                with urlopen(self.io) as f:
                    r = parse(f, parser=parser)
            else:
                # try to parse the input in the simplest way
                r = parse(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        except (UnicodeDecodeError, IOError) as e:
            # if the input is a blob of html goop
            if not _is_url(self.io):
                r = fromstring(self.io, parser=parser)

                try:
                    r = r.getroot()
                except AttributeError:
                    pass
            else:
                raise e
        else:
            if not hasattr(r, 'text_content'):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
        return r 
Example #30
Source File: html.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def _build_doc(self):
        """
        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.html import parse, fromstring, HTMLParser
        from lxml.etree import XMLSyntaxError
        parser = HTMLParser(recover=True, encoding=self.encoding)

        try:
            if _is_url(self.io):
                with urlopen(self.io) as f:
                    r = parse(f, parser=parser)
            else:
                # try to parse the input in the simplest way
                r = parse(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        except (UnicodeDecodeError, IOError) as e:
            # if the input is a blob of html goop
            if not _is_url(self.io):
                r = fromstring(self.io, parser=parser)

                try:
                    r = r.getroot()
                except AttributeError:
                    pass
            else:
                raise e
        else:
            if not hasattr(r, 'text_content'):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
        return r
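
The pandas internals in Examples #29 and #30 sit behind the public pandas.read_html() entry point. A hedged end-user sketch (the URL is a placeholder):

import pandas as pd

# read_html() drives the _build_doc()/_parse_tables() machinery shown above;
# flavor="lxml" selects the lxml-backed parser.
tables = pd.read_html("http://example.com/tables.html", flavor="lxml")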