Python lxml.html.HtmlElement() Examples

The following are 30 code examples of lxml.html.HtmlElement(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module lxml.html, or try the search function.
Example #1
Source File: scraper.py    From scrape-linkedin with MIT License 6 votes vote down vote up
def test_scores(self):
        """Return a list of dictionaries with test scores.

        :return: one dict per score entry with keys 'name', 'score',
            'description' and 'date'; an empty list when the profile has
            no test-scores section.
        """
        test_scores = []
        if isinstance(self.xp_test_scores, html.HtmlElement):
            # Number of score <div>s under the background-test-scores section.
            count = int(self.get_clean_xpath(
                'count(//div[@id="background-test-scores"]/div[contains(@id, "scores-")])'))
            for _ in range(1, count + 1):
                # NOTE(review): every iteration queries the same
                # self.xp_test_scores root and the loop index is unused, so
                # all dicts come out identical. Kept as-is pending
                # confirmation of get_xp semantics.
                data = {}
                data['name'] = extract_one(
                    self.get_xp(self.xp_test_scores, './/h4//text()'))
                data['score'] = extract_one(
                    self.get_xp(self.xp_test_scores, './/h5//text()'))
                data['description'] = ' '.join(self.get_xp(
                    self.xp_test_scores, './/p[contains(@class,"description")]//text()'))
                data['date'] = extract_one(self.get_xp(
                    self.xp_test_scores, './/span[@class = "date-range"]/time[1]/text()'))
                test_scores.append(data)
        return test_scores
Example #2
Source File: html_to_telegraph.py    From html-telegraph-poster with MIT License 6 votes vote down vote up
def _fragments_from_string(html_string):
    """Parse *html_string* into a list of top-level lxml fragments.

    A leading bare text node is either wrapped in a <p> (when it is the
    only fragment) or converted into a <p> element inserted before the
    first real element. XML processing instructions are stripped from the
    first element's document.

    :param html_string: raw HTML markup
    :return: list of HtmlElement fragments (possibly empty)
    """
    fragments = html.fragments_fromstring(html_string)
    if not fragments:
        return []
    # convert and append text node before starting tag
    if not isinstance(fragments[0], html.HtmlElement):
        if fragments[0].strip():
            if len(fragments) == 1:
                # only a text node: re-parse it wrapped in a paragraph
                return html.fragments_fromstring('<p>%s</p>' % fragments[0])
            paragraph = _create_element('p')
            paragraph.text = fragments[0]
            fragments[1].addprevious(paragraph)
            fragments.insert(1, paragraph)

        fragments.pop(0)
        if not fragments:
            return []

    # remove xml instructions (if cleaning is disabled)
    for instruction in fragments[0].xpath('//processing-instruction()'):
        instruction.drop_tag()

    return fragments
Example #3
Source File: parser.py    From fonduer with MIT License 6 votes vote down vote up
def _parse_node(
        self, node: HtmlElement, state: Dict[str, Any]
    ) -> Iterator[Sentence]:
        """Entry point for parsing all node types.

        Each ``_parse_*`` helper inspects the node's tag and either
        augments and returns the shared *state* or returns it untouched,
        so the calls below thread state through in a fixed order.

        :param node: The lxml HTML node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        :return: a *generator* of Sentences
        """
        # Processing on entry of node
        state = self._parse_section(node, state)

        state = self._parse_figure(node, state)

        # Tables are only materialized when tabular parsing is enabled.
        if self.tabular:
            state = self._parse_table(node, state)

        state = self._parse_caption(node, state)

        # Paragraph parsing is what actually yields Sentence objects.
        yield from self._parse_paragraph(node, state) 
Example #4
Source File: utils.py    From GeneralNewsExtractor with MIT License 5 votes vote down vote up
def drop_tag(node: HtmlElement):
    """Remove *node*'s tag while merging its text into the parent.

    No-op when the node is detached (has no parent).
    :param node: element whose tag should be dropped
    :return: None
    """
    if node.getparent() is None:
        return
    node.drop_tag()
Example #5
Source File: jav321.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def get_title(lx: html.HtmlElement) -> str:
    """Return the stripped title text from a jav321 detail page."""
    matches = lx.xpath("/html/body/div[2]/div[1]/div[1]/div[1]/h3/text()")
    return matches[0].strip()
Example #6
Source File: requests_html.py    From requests-html with MIT License 5 votes vote down vote up
def lxml(self) -> HtmlElement:
        """`lxml <http://lxml.de>`_ representation of the
        :class:`Element <Element>` or :class:`HTML <HTML>`.

        Lazily parsed on first access and cached on the instance.
        """
        if self._lxml is not None:
            return self._lxml
        # Prefer the BeautifulSoup-backed parser; fall back to plain
        # lxml parsing of the raw bytes when it rejects the input.
        try:
            self._lxml = soup_parse(self.html, features='html.parser')
        except ValueError:
            self._lxml = lxml.html.fromstring(self.raw_html)
        return self._lxml
Example #7
Source File: test_http.py    From memorious with MIT License 5 votes vote down vote up
def test_html(self, http):
        """Fetching an HTML endpoint should yield a parsed lxml element."""
        req = Request("GET", "https://httpbin.org/html")
        resp = ContextHttpResponse(http, req)
        assert isinstance(resp.html, html.HtmlElement)
Example #8
Source File: jav321.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def get_cover(lx: html.HtmlElement) -> str:
    """Return the cover image URL from a jav321 detail page."""
    matches = lx.xpath("/html/body/div[2]/div[2]/div[1]/p/a/img/@src")
    return matches[0]
Example #9
Source File: html_to_telegraph.py    From html-telegraph-poster with MIT License 5 votes vote down vote up
def preprocess_media_tags(element):
    """Normalize media-related elements in place before posting.

    - strips whitespace text in/after list tags (ol/ul/li)
    - rewrites known video iframes (YouTube/Vimeo/Telegram) to telegraph
      embed URLs and wraps them in a <figure>; drops other iframes
    - replaces twitter-tweet blockquotes with a twitter embed iframe

    :param element: parsed fragment; non-elements are ignored
    """
    if isinstance(element, html.HtmlElement):
        if element.tag in ['ol', 'ul']:
            # ignore any spaces between <ul> and <li>
            element.text = ''
        elif element.tag == 'li':
            # ignore spaces after </li>
            element.tail = ''
        elif element.tag == 'iframe':
            iframe_src = element.get('src')
            # Bug fix: an iframe without a src attribute used to crash the
            # regex match with a TypeError; treat it like any other
            # unrecognized iframe and drop the tag.
            if not iframe_src:
                element.drop_tag()
                return

            youtube = youtube_re.match(iframe_src)
            vimeo = vimeo_re.match(iframe_src)
            telegram = re.match(telegram_embed_iframe_re, iframe_src)
            if youtube or vimeo or telegram:
                element.text = ''  # ignore any legacy text
                if youtube:
                    yt_id = urlparse(iframe_src).path.replace('/embed/', '')
                    element.set('src', '/embed/youtube?url=' + quote_plus('https://www.youtube.com/watch?v=' + yt_id))
                elif vimeo:
                    element.set('src', '/embed/vimeo?url=' + quote_plus('https://vimeo.com/' + vimeo.group(2)))
                elif telegram:
                    element.set('src', '/embed/telegram?url=' + quote_plus(iframe_src))
                # Embeds must live inside a <figure> wrapper.
                if not len(element.xpath('./ancestor::figure')):
                    _wrap_figure(element)
            else:
                element.drop_tag()

        elif element.tag == 'blockquote' and element.get('class') == 'twitter-tweet':
            twitter_links = element.xpath('.//a[@href]')
            for tw_link in twitter_links:
                if twitter_re.match(tw_link.get('href')):
                    # Replace the whole blockquote with an embed iframe
                    # built from the first tweet link found.
                    twitter_frame = html.HtmlElement()
                    twitter_frame.tag = 'iframe'
                    twitter_frame.set('src', '/embed/twitter?url=' + quote_plus(tw_link.get('href')))
                    element.addprevious(twitter_frame)
                    _wrap_figure(twitter_frame)
                    element.drop_tree()
                    break
Example #10
Source File: html_to_telegraph.py    From html-telegraph-poster with MIT License 5 votes vote down vote up
def _create_element(element, text=None):
    """Build a detached lxml element (no document tree, no parents).

    :param element: tag name for the new element
    :param text: optional text content
    :return: the new HtmlElement
    """
    node = html.HtmlElement()
    node.tag = element
    if text:
        node.text = text
    return node
Example #11
Source File: test_markdown.py    From online-judge with GNU Affero General Public License v3.0 5 votes vote down vote up
def test_text_prefix(self):
        """A leading bare text node becomes the tree root's .text."""
        markup = 'z<p>a</p><p>b</p>'
        tree = fragments_to_tree(markup)
        self.assertIsInstance(tree, html.HtmlElement)
        self.assertEqual(len(tree.getchildren()), 2)
        self.assertEqual(tree.text, 'z')

        # round-trip back to markup
        self.assertHTMLEqual(fragment_tree_to_str(tree), markup)
Example #12
Source File: test_markdown.py    From online-judge with GNU Affero General Public License v3.0 5 votes vote down vote up
def test_simple(self):
        """Two sibling paragraphs round-trip through the fragment tree."""
        tree = fragments_to_tree('<p>a</p><p>b</p>')
        self.assertIsInstance(tree, html.HtmlElement)
        self.assertEqual(len(tree.getchildren()), 2)

        # each child is a <p> carrying the expected text, in order
        for child, expected_text in zip(tree, ['a', 'b']):
            self.assertIsInstance(child, html.HtmlElement)
            self.assertEqual(child.tag, 'p')
            self.assertEqual(child.text, expected_text)

        self.assertHTMLEqual(fragment_tree_to_str(tree), '<p>a</p><p>b</p>')
Example #13
Source File: structural.py    From fonduer with MIT License 5 votes vote down vote up
def _get_node(sentence: Sentence) -> HtmlElement:
    """Resolve a Sentence back to its lxml node via its stored xpath."""
    # _get_etree_for_text caches parses, so repeated lookups are cheap.
    document_tree = _get_etree_for_text(sentence.document.text)
    return document_tree.xpath(sentence.xpath)[0]
Example #14
Source File: utils.py    From scrape with MIT License 5 votes vote down vote up
def get_resp(url):
    """Get webpage response as an lxml.html.HtmlElement object."""
    try:
        headers = {"User-Agent": random.choice(USER_AGENTS)}
        try:
            response = requests.get(url, headers=headers, proxies=get_proxies())
        except MissingSchema:
            # URL lacked a scheme; add one and retry once.
            url = add_protocol(url)
            response = requests.get(url, headers=headers, proxies=get_proxies())
        content = response.text.encode("utf-8") if PY2 else response.text
        return lh.fromstring(content)
    except Exception:
        sys.stderr.write("Failed to retrieve {0}.\n".format(url))
        raise
Example #15
Source File: parser.py    From fonduer with MIT License 5 votes vote down vote up
def _parse_section(
        self, node: HtmlElement, state: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Parse a Section of the node.

        Note that this implementation currently creates a Section at the
        beginning of the document and creates Section based on tag of node.

        :param node: The lxml node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        :return: the (possibly updated) parsing state
        """
        if node.tag not in ["html", "section"]:
            return state

        # Add a Section; stable_id format: "<doc name>::section:<idx>"
        stable_id = (
            f"{state['document'].name}::section:{state['section']['idx']}"
        )

        # Optional name attribute for the Section
        name = node.attrib.get("name")

        state["context"][node] = Section(
            document=state["document"],
            name=name,
            stable_id=stable_id,
            position=state["section"]["idx"],
        )
        state["section"]["idx"] += 1

        return state
Example #16
Source File: utils.py    From GeneralNewsExtractor with MIT License 5 votes vote down vote up
def remove_node(node: HtmlElement):
    """Detach *node* from its parent in place.

    No-op for a node that has no parent; nothing to return.
    :param node: element to remove
    :return: None
    """
    parent = node.getparent()
    if parent is None:
        return
    parent.remove(node)
Example #17
Source File: utils.py    From GeneralNewsExtractor with MIT License 5 votes vote down vote up
def iter_node(element: HtmlElement):
    # Yield *element* and then every HtmlElement descendant in pre-order.
    # Non-element children (comments, processing instructions) are skipped
    # along with their subtrees.
    # NOTE(review): callers (e.g. normalize_node) remove nodes while this
    # generator is still iterating, so the lazy child iteration here is
    # load-bearing — do not snapshot children.
    yield element
    for sub_element in element:
        if isinstance(sub_element, HtmlElement):
            yield from iter_node(sub_element) 
Example #18
Source File: utils.py    From GeneralNewsExtractor with MIT License 5 votes vote down vote up
def normalize_node(element: HtmlElement):
    # Clean *element*'s subtree in place: strip useless tags, drop empty
    # nodes, promote childless div/span tags to p, and remove nodes whose
    # class attribute contains a blacklisted token.
    # NOTE(review): nodes are removed while iter_node is still walking the
    # tree; this relies on lxml's live-iteration behavior — verify before
    # restructuring.
    etree.strip_elements(element, *USELESS_TAG)
    for node in iter_node(element):
        # inspired by readability.
        if node.tag.lower() in TAGS_CAN_BE_REMOVE_IF_EMPTY and is_empty_element(node):
            remove_node(node)

        # merge text in span or strong to parent p tag
        if node.tag.lower() == 'p':
            etree.strip_tags(node, 'span')
            etree.strip_tags(node, 'strong')

        # if a div tag does not contain any sub node, it could be converted to p node.
        if node.tag.lower() == 'div' and not node.getchildren():
            node.tag = 'p'

        if node.tag.lower() == 'span' and not node.getchildren():
            node.tag = 'p'

        # remove empty p tag, but keep paragraphs that wrap images
        if node.tag.lower() == 'p' and not node.xpath('.//img'):
            if not (node.text and node.text.strip()):
                drop_tag(node)

        # drop nodes whose class attribute contains a blacklisted token
        class_name = node.get('class')
        if class_name:
            for attribute in USELESS_ATTR:
                if attribute in class_name:
                    remove_node(node)
                    break
Example #19
Source File: TitleExtractor.py    From GeneralNewsExtractor with MIT License 5 votes vote down vote up
def extract(self, element: HtmlElement, title_xpath: str = '') -> str:
        """Extract the page title, preferring a user-supplied xpath and
        then falling back through the h-tag/<title> heuristics.

        :param element: parsed page root
        :param title_xpath: optional caller-supplied xpath override
        :return: stripped title text
        """
        title_xpath = title_xpath or config.get('title', {}).get('xpath')
        # Strategies are tried lazily, most specific first.
        title = self.extract_by_xpath(element, title_xpath)
        if not title:
            title = self.extract_by_htag_and_title(element)
        if not title:
            title = self.extract_by_title(element)
        if not title:
            title = self.extract_by_htag(element)
        return title.strip()
Example #20
Source File: AuthorExtractor.py    From GeneralNewsExtractor with MIT License 5 votes vote down vote up
def extractor(self, element: HtmlElement, author_xpath=''):
        """Extract the article author, via a configured xpath when one is
        available, otherwise by regex over the page text.

        :param element: parsed page root
        :param author_xpath: optional caller-supplied xpath override
        :return: author string, or '' when nothing matches
        """
        author_xpath = author_xpath or config.get('author', {}).get('xpath')
        if author_xpath:
            return ''.join(element.xpath(author_xpath))
        full_text = ''.join(element.xpath('.//text()'))
        for pattern in self.author_pattern:
            match = re.search(pattern, full_text)
            if match:
                return match.group(1)
        return ''
Example #21
Source File: TimeExtractor.py    From GeneralNewsExtractor with MIT License 5 votes vote down vote up
def extract_from_meta(self, element: HtmlElement) -> str:
        """Well-maintained news sites put the publish time in META tags,
        so those should be checked first.

        :param element: DOM tree of the page source
        :return: joined META value, or '' when no xpath matches
        """
        for meta_xpath in PUBLISH_TIME_META:
            matched = element.xpath(meta_xpath)
            if matched:
                return ''.join(matched)
        return ''
Example #22
Source File: TimeExtractor.py    From GeneralNewsExtractor with MIT License 5 votes vote down vote up
def extract_from_text(self, element: HtmlElement) -> str:
        """Scan the page's full text for the first datetime pattern match.

        :param element: DOM tree of the page source
        :return: the matched datetime string, or '' when nothing matches
        """
        text = ''.join(element.xpath('.//text()'))
        for dt in self.time_pattern:
            dt_obj = re.search(dt, text)
            if dt_obj:
                return dt_obj.group(1)
        # The original used a misleading for/else here; since the loop
        # body returns on match, a plain fallthrough return is equivalent.
        return ''
Example #23
Source File: TimeExtractor.py    From GeneralNewsExtractor with MIT License 5 votes vote down vote up
def extract_from_user_xpath(self, publish_time_xpath: str, element: HtmlElement) -> str:
        """Extract the publish time via a caller-supplied xpath.

        :param publish_time_xpath: user-provided xpath; may be empty
        :param element: DOM tree of the page source
        :return: joined xpath result, or '' when no xpath was given
        """
        if not publish_time_xpath:
            return ''
        return ''.join(element.xpath(publish_time_xpath))
Example #24
Source File: TimeExtractor.py    From GeneralNewsExtractor with MIT License 5 votes vote down vote up
def extractor(self, element: HtmlElement, publish_time_xpath: str = '') -> str:
        """Resolve the publish time, trying sources in priority order.

        :param element: DOM tree of the page source
        :param publish_time_xpath: optional caller-supplied xpath override
        :return: publish time string, or '' when every source fails
        """
        publish_time_xpath = publish_time_xpath or config.get('publish_time', {}).get('xpath')
        # 1) user-supplied xpath has the highest priority
        publish_time = self.extract_from_user_xpath(publish_time_xpath, element)
        # 2) then META tags
        if not publish_time:
            publish_time = self.extract_from_meta(element)
        # 3) worst case: regex over the body text
        if not publish_time:
            publish_time = self.extract_from_text(element)
        return publish_time
Example #25
Source File: jav321.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def get_outline(lx: html.HtmlElement) -> str:
    """Return the outline/description text from a jav321 detail page."""
    matches = lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[3]/div/text()")
    return matches[0]
Example #26
Source File: jav321.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def get_series2(lx: html.HtmlElement) -> str:
    """Return the series text (alternate location) from a jav321 page."""
    matches = lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[1]/div[2]/a[11]/text()")
    return matches[0]
Example #27
Source File: utils.py    From scrape with MIT License 5 votes vote down vote up
def parse_html(infile, xpath):
    """Filter HTML using XPath.

    :param infile: an HtmlElement, or raw markup to parse first
    :param xpath: XPath expression to evaluate
    :raises ValueError: when the expression matches nothing
    """
    tree = infile if isinstance(infile, lh.HtmlElement) else lh.fromstring(infile)
    results = tree.xpath(xpath)
    if not results:
        raise ValueError("XPath {0} returned no results.".format(xpath))
    return results


# URL processing functions
# 
Example #28
Source File: javlib.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def get_from_xpath(lx: html.HtmlElement, xpath: str) -> str:
    """Return the stripped text of the first node matching *xpath*."""
    matches = lx.xpath(xpath)
    return matches[0].strip()
Example #29
Source File: parser.py    From fonduer with MIT License 4 votes vote down vote up
def _parse_caption(
        self, node: HtmlElement, state: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Parse a Caption of the node.

        :param node: The lxml node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        :return: the (possibly updated) parsing state
        :raises NotImplementedError: when the caption's parent context is
            neither a Table nor a Figure.
        """
        if node.tag not in ["caption", "figcaption"]:  # captions used in Tables
            return state

        # Add a Caption; stable_id format: "<doc name>::caption:<idx>"
        parent = state["parent"][node]
        stable_id = (
            f"{state['document'].name}::caption:{state['caption']['idx']}"
        )

        # Optional name attribute for the Caption (the original comment
        # said "Section", which was a copy-paste slip).
        name = node.attrib.get("name")

        # A Caption belongs to exactly one of a Table or a Figure.
        if isinstance(parent, Table):
            table, figure = parent, None
        elif isinstance(parent, Figure):
            table, figure = None, parent
        else:
            raise NotImplementedError("Caption must be a child of Table or Figure.")
        state["context"][node] = Caption(
            document=state["document"],
            table=table,
            figure=figure,
            stable_id=stable_id,
            name=name,
            position=state["caption"]["idx"],
        )
        state["caption"]["idx"] += 1

        return state
Example #30
Source File: utils.py    From trafilatura with GNU General Public License v3.0 4 votes vote down vote up
def load_html(htmlobject):
    """Load object given as input and validate its type
    (accepted: LXML tree, bytestring and string)

    :param htmlobject: an already-parsed lxml tree, bytes, or str
    :return: an lxml tree, or None when parsing fails or the input type
        is unsupported
    """
    # use tree directly
    if isinstance(htmlobject, (etree._ElementTree, html.HtmlElement)):
        return htmlobject
    tree = None
    check_flag = False
    # try to detect encoding and convert to string
    if isinstance(htmlobject, bytes):
        # heuristic: flag inputs whose head doesn't look like HTML for a
        # later sanity check on the parse result
        if 'html' not in htmlobject[:50].decode(encoding='ascii', errors='ignore'):
            check_flag = True
        guessed_encoding = detect_encoding(htmlobject)
        if guessed_encoding is not None:
            if guessed_encoding == 'UTF-8':
                tree = html.fromstring(htmlobject, parser=HTML_PARSER)
            else:
                try:
                    htmlobject = htmlobject.decode(guessed_encoding)
                    tree = html.fromstring(htmlobject, parser=HTML_PARSER)
                except UnicodeDecodeError:
                    LOGGER.warning('encoding issue: %s', guessed_encoding)
                    tree = html.fromstring(htmlobject, parser=RECOVERY_PARSER)
        else:
            # unknown encoding: let the recovery parser do its best
            tree = html.fromstring(htmlobject, parser=RECOVERY_PARSER)
    # use string if applicable
    elif isinstance(htmlobject, str):
        if 'html' not in htmlobject[:50]:
            check_flag = True
        try:
            tree = html.fromstring(htmlobject, parser=HTML_PARSER)
        except ValueError:
            # try to parse a bytestring (presumably the str carried an
            # encoding declaration lxml rejects — confirm)
            try:
                tree = html.fromstring(htmlobject.encode('utf8'), parser=HTML_PARSER)
            except Exception as err:
                LOGGER.error('parser bytestring %s', err)
        except Exception as err:
            LOGGER.error('parsing failed: %s', err)
    # default to None
    else:
        LOGGER.error('this type cannot be processed: %s', type(htmlobject))
    # sanity check: a flagged input that produced a near-empty tree is
    # likely not HTML at all
    if tree is not None and check_flag:
        if len(tree) < 2:
            LOGGER.error('Parse tree empty: not valid HTML')
            tree = None
    return tree