Python lxml.html.HtmlElement() Examples

The following are 30 code examples of lxml.html.HtmlElement(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module lxml.html, or try the search function.
Example #1
Source File: scraper.py    From scrape-linkedin with MIT License 6 votes vote down vote up
def test_scores(self):
        """Return a list of dictionaries with test scores.

        :return: one dict per score entry with keys 'name', 'score',
            'description' and 'date'; an empty list when the profile has
            no test-scores section.
        """
        test_scores = []
        if isinstance(self.xp_test_scores, html.HtmlElement):
            # Number of score <div>s under the background-test-scores section.
            count = int(self.get_clean_xpath(
                'count(//div[@id="background-test-scores"]/div[contains(@id, "scores-")])'))
            for _ in range(1, count + 1):
                # NOTE(review): every iteration queries the same
                # self.xp_test_scores root and the loop index is unused, so
                # all dicts come out identical. Kept as-is pending
                # confirmation of get_xp semantics.
                data = {}
                data['name'] = extract_one(
                    self.get_xp(self.xp_test_scores, './/h4//text()'))
                data['score'] = extract_one(
                    self.get_xp(self.xp_test_scores, './/h5//text()'))
                data['description'] = ' '.join(self.get_xp(
                    self.xp_test_scores, './/p[contains(@class,"description")]//text()'))
                data['date'] = extract_one(self.get_xp(
                    self.xp_test_scores, './/span[@class = "date-range"]/time[1]/text()'))
                test_scores.append(data)
        return test_scores
Example #2
Source File: html_to_telegraph.py    From html-telegraph-poster with MIT License 6 votes vote down vote up
def _fragments_from_string(html_string):
    """Parse *html_string* into a list of top-level lxml fragments.

    A leading bare text node is either wrapped in a <p> (when it is the
    only fragment) or converted into a <p> element inserted before the
    first real element. XML processing instructions are stripped from the
    first element's document.

    :param html_string: raw HTML markup
    :return: list of HtmlElement fragments (possibly empty)
    """
    fragments = html.fragments_fromstring(html_string)
    if not fragments:
        return []
    # convert and append text node before starting tag
    if not isinstance(fragments[0], html.HtmlElement):
        if fragments[0].strip():
            if len(fragments) == 1:
                # only a text node: re-parse it wrapped in a paragraph
                return html.fragments_fromstring('<p>%s</p>' % fragments[0])
            paragraph = _create_element('p')
            paragraph.text = fragments[0]
            fragments[1].addprevious(paragraph)
            fragments.insert(1, paragraph)

        fragments.pop(0)
        if not fragments:
            return []

    # remove xml instructions (if cleaning is disabled)
    for instruction in fragments[0].xpath('//processing-instruction()'):
        instruction.drop_tag()

    return fragments
Example #3
Source File: parser.py    From fonduer with MIT License 6 votes vote down vote up
def _parse_node(
        self, node: HtmlElement, state: Dict[str, Any]
    ) -> Iterator[Sentence]:
        """Entry point for parsing all node types.

        Each ``_parse_*`` helper inspects the node's tag and either
        augments and returns the shared *state* or returns it untouched,
        so the calls below thread state through in a fixed order.

        :param node: The lxml HTML node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        :return: a *generator* of Sentences
        """
        # Processing on entry of node
        state = self._parse_section(node, state)

        state = self._parse_figure(node, state)

        # Tables are only materialized when tabular parsing is enabled.
        if self.tabular:
            state = self._parse_table(node, state)

        state = self._parse_caption(node, state)

        # Paragraph parsing is what actually yields Sentence objects.
        yield from self._parse_paragraph(node, state) 
Example #4
Source File: utils.py    From GeneralNewsExtractor with MIT License 5 votes vote down vote up
def drop_tag(node: HtmlElement):
    """Remove *node*'s tag while merging its text into the parent.

    No-op when the node is detached (has no parent).
    :param node: element whose tag should be dropped
    :return: None
    """
    if node.getparent() is None:
        return
    node.drop_tag()
Example #5
Source File: jav321.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def get_title(lx: html.HtmlElement) -> str:
    """Return the stripped title text from a jav321 detail page."""
    matches = lx.xpath("/html/body/div[2]/div[1]/div[1]/div[1]/h3/text()")
    return matches[0].strip()
Example #6
Source File: requests_html.py    From requests-html with MIT License 5 votes vote down vote up
def lxml(self) -> HtmlElement:
        """`lxml <http://lxml.de>`_ representation of the
        :class:`Element <Element>` or :class:`HTML <HTML>`.

        Lazily parsed on first access and cached on the instance.
        """
        if self._lxml is not None:
            return self._lxml
        # Prefer the BeautifulSoup-backed parser; fall back to plain
        # lxml parsing of the raw bytes when it rejects the input.
        try:
            self._lxml = soup_parse(self.html, features='html.parser')
        except ValueError:
            self._lxml = lxml.html.fromstring(self.raw_html)
        return self._lxml
Example #7
Source File: test_http.py    From memorious with MIT License 5 votes vote down vote up
def test_html(self, http):
        """Fetching an HTML endpoint should yield a parsed lxml element."""
        req = Request("GET", "https://httpbin.org/html")
        resp = ContextHttpResponse(http, req)
        assert isinstance(resp.html, html.HtmlElement)
Example #8
Source File: jav321.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def get_cover(lx: html.HtmlElement) -> str:
    """Return the cover image URL from a jav321 detail page."""
    matches = lx.xpath("/html/body/div[2]/div[2]/div[1]/p/a/img/@src")
    return matches[0]
Example #9
Source File: html_to_telegraph.py    From html-telegraph-poster with MIT License 5 votes vote down vote up
def preprocess_media_tags(element):
    """Normalize media-related elements in place before posting.

    - strips whitespace text in/after list tags (ol/ul/li)
    - rewrites known video iframes (YouTube/Vimeo/Telegram) to telegraph
      embed URLs and wraps them in a <figure>; drops other iframes
    - replaces twitter-tweet blockquotes with a twitter embed iframe

    :param element: parsed fragment; non-elements are ignored
    """
    if isinstance(element, html.HtmlElement):
        if element.tag in ['ol', 'ul']:
            # ignore any spaces between <ul> and <li>
            element.text = ''
        elif element.tag == 'li':
            # ignore spaces after </li>
            element.tail = ''
        elif element.tag == 'iframe':
            iframe_src = element.get('src')
            # Bug fix: an iframe without a src attribute used to crash the
            # regex match with a TypeError; treat it like any other
            # unrecognized iframe and drop the tag.
            if not iframe_src:
                element.drop_tag()
                return

            youtube = youtube_re.match(iframe_src)
            vimeo = vimeo_re.match(iframe_src)
            telegram = re.match(telegram_embed_iframe_re, iframe_src)
            if youtube or vimeo or telegram:
                element.text = ''  # ignore any legacy text
                if youtube:
                    yt_id = urlparse(iframe_src).path.replace('/embed/', '')
                    element.set('src', '/embed/youtube?url=' + quote_plus('https://www.youtube.com/watch?v=' + yt_id))
                elif vimeo:
                    element.set('src', '/embed/vimeo?url=' + quote_plus('https://vimeo.com/' + vimeo.group(2)))
                elif telegram:
                    element.set('src', '/embed/telegram?url=' + quote_plus(iframe_src))
                # Embeds must live inside a <figure> wrapper.
                if not len(element.xpath('./ancestor::figure')):
                    _wrap_figure(element)
            else:
                element.drop_tag()

        elif element.tag == 'blockquote' and element.get('class') == 'twitter-tweet':
            twitter_links = element.xpath('.//a[@href]')
            for tw_link in twitter_links:
                if twitter_re.match(tw_link.get('href')):
                    # Replace the whole blockquote with an embed iframe
                    # built from the first tweet link found.
                    twitter_frame = html.HtmlElement()
                    twitter_frame.tag = 'iframe'
                    twitter_frame.set('src', '/embed/twitter?url=' + quote_plus(tw_link.get('href')))
                    element.addprevious(twitter_frame)
                    _wrap_figure(twitter_frame)
                    element.drop_tree()
                    break
Example #10
Source File: html_to_telegraph.py    From html-telegraph-poster with MIT License 5 votes vote down vote up
def _create_element(element, text=None):
    """Build a detached lxml element (no document tree, no parents).

    :param element: tag name for the new element
    :param text: optional text content
    :return: the new HtmlElement
    """
    node = html.HtmlElement()
    node.tag = element
    if text:
        node.text = text
    return node
Example #11
Source File: test_markdown.py    From online-judge with GNU Affero General Public License v3.0 5 votes vote down vote up
def test_text_prefix(self):
        """A leading bare text node becomes the tree root's .text."""
        markup = 'z<p>a</p><p>b</p>'
        tree = fragments_to_tree(markup)
        self.assertIsInstance(tree, html.HtmlElement)
        self.assertEqual(len(tree.getchildren()), 2)
        self.assertEqual(tree.text, 'z')

        # round-trip back to markup
        self.assertHTMLEqual(fragment_tree_to_str(tree), markup)
Example #12
Source File: test_markdown.py    From online-judge with GNU Affero General Public License v3.0 5 votes vote down vote up
def test_simple(self):
        """Two sibling paragraphs round-trip through the fragment tree."""
        tree = fragments_to_tree('<p>a</p><p>b</p>')
        self.assertIsInstance(tree, html.HtmlElement)
        self.assertEqual(len(tree.getchildren()), 2)

        # each child is a <p> carrying the expected text, in order
        for child, expected_text in zip(tree, ['a', 'b']):
            self.assertIsInstance(child, html.HtmlElement)
            self.assertEqual(child.tag, 'p')
            self.assertEqual(child.text, expected_text)

        self.assertHTMLEqual(fragment_tree_to_str(tree), '<p>a</p><p>b</p>')
Example #13
Source File: structural.py    From fonduer with MIT License 5 votes vote down vote up
def _get_node(sentence: Sentence) -> HtmlElement:
    """Resolve a Sentence back to its lxml node via its stored xpath."""
    # _get_etree_for_text caches parses, so repeated lookups are cheap.
    document_tree = _get_etree_for_text(sentence.document.text)
    return document_tree.xpath(sentence.xpath)[0]
Example #14
Source File: utils.py    From scrape with MIT License 5 votes vote down vote up
def get_resp(url):
    """Get webpage response as an lxml.html.HtmlElement object."""
    try:
        headers = {"User-Agent": random.choice(USER_AGENTS)}
        try:
            response = requests.get(url, headers=headers, proxies=get_proxies())
        except MissingSchema:
            # URL lacked a scheme; add one and retry once.
            url = add_protocol(url)
            response = requests.get(url, headers=headers, proxies=get_proxies())
        content = response.text.encode("utf-8") if PY2 else response.text
        return lh.fromstring(content)
    except Exception:
        sys.stderr.write("Failed to retrieve {0}.\n".format(url))
        raise
Example #15
Source File: parser.py    From fonduer with MIT License 5 votes vote down vote up
def _parse_section(
        self, node: HtmlElement, state: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Parse a Section of the node.

        Note that this implementation currently creates a Section at the
        beginning of the document and creates Section based on tag of node.

        :param node: The lxml node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        :return: the (possibly updated) parsing state
        """
        if node.tag not in ["html", "section"]:
            return state

        # Add a Section; stable_id format: "<doc name>::section:<idx>"
        stable_id = (
            f"{state['document'].name}::section:{state['section']['idx']}"
        )

        # Optional name attribute for the Section
        name = node.attrib.get("name")

        state["context"][node] = Section(
            document=state["document"],
            name=name,
            stable_id=stable_id,
            position=state["section"]["idx"],
        )
        state["section"]["idx"] += 1

        return state
Example #16
Source File: utils.py    From GeneralNewsExtractor with MIT License 5 votes vote down vote up
def remove_node(node: HtmlElement):
    """Detach *node* from its parent in place.

    No-op for a node that has no parent; nothing to return.
    :param node: element to remove
    :return: None
    """
    parent = node.getparent()
    if parent is None:
        return
    parent.remove(node)
Example #17
Source File: utils.py    From GeneralNewsExtractor with MIT License 5 votes vote down vote up
def iter_node(element: HtmlElement):
    # Yield *element* and then every HtmlElement descendant in pre-order.
    # Non-element children (comments, processing instructions) are skipped
    # along with their subtrees.
    # NOTE(review): callers (e.g. normalize_node) remove nodes while this
    # generator is still iterating, so the lazy child iteration here is
    # load-bearing — do not snapshot children.
    yield element
    for sub_element in element:
        if isinstance(sub_element, HtmlElement):
            yield from iter_node(sub_element) 
Example #18
Source File: utils.py    From GeneralNewsExtractor with MIT License 5 votes vote down vote up
def normalize_node(element: HtmlElement):
    # Clean *element*'s subtree in place: strip useless tags, drop empty
    # nodes, promote childless div/span tags to p, and remove nodes whose
    # class attribute contains a blacklisted token.
    # NOTE(review): nodes are removed while iter_node is still walking the
    # tree; this relies on lxml's live-iteration behavior — verify before
    # restructuring.
    etree.strip_elements(element, *USELESS_TAG)
    for node in iter_node(element):
        # inspired by readability.
        if node.tag.lower() in TAGS_CAN_BE_REMOVE_IF_EMPTY and is_empty_element(node):
            remove_node(node)

        # merge text in span or strong to parent p tag
        if node.tag.lower() == 'p':
            etree.strip_tags(node, 'span')
            etree.strip_tags(node, 'strong')

        # if a div tag does not contain any sub node, it could be converted to p node.
        if node.tag.lower() == 'div' and not node.getchildren():
            node.tag = 'p'

        if node.tag.lower() == 'span' and not node.getchildren():
            node.tag = 'p'

        # remove empty p tag, but keep paragraphs that wrap images
        if node.tag.lower() == 'p' and not node.xpath('.//img'):
            if not (node.text and node.text.strip()):
                drop_tag(node)

        # drop nodes whose class attribute contains a blacklisted token
        class_name = node.get('class')
        if class_name:
            for attribute in USELESS_ATTR:
                if attribute in class_name:
                    remove_node(node)
                    break
Example #19
Source File: TitleExtractor.py    From GeneralNewsExtractor with MIT License 5 votes vote down vote up
def extract(self, element: HtmlElement, title_xpath: str = '') -> str:
        """Extract the page title, preferring a user-supplied xpath and
        then falling back through the h-tag/<title> heuristics.

        :param element: parsed page root
        :param title_xpath: optional caller-supplied xpath override
        :return: stripped title text
        """
        title_xpath = title_xpath or config.get('title', {}).get('xpath')
        # Strategies are tried lazily, most specific first.
        title = self.extract_by_xpath(element, title_xpath)
        if not title:
            title = self.extract_by_htag_and_title(element)
        if not title:
            title = self.extract_by_title(element)
        if not title:
            title = self.extract_by_htag(element)
        return title.strip()
Example #20
Source File: AuthorExtractor.py    From GeneralNewsExtractor with MIT License 5 votes vote down vote up
def extractor(self, element: HtmlElement, author_xpath=''):
        """Extract the article author, via a configured xpath when one is
        available, otherwise by regex over the page text.

        :param element: parsed page root
        :param author_xpath: optional caller-supplied xpath override
        :return: author string, or '' when nothing matches
        """
        author_xpath = author_xpath or config.get('author', {}).get('xpath')
        if author_xpath:
            return ''.join(element.xpath(author_xpath))
        full_text = ''.join(element.xpath('.//text()'))
        for pattern in self.author_pattern:
            match = re.search(pattern, full_text)
            if match:
                return match.group(1)
        return ''
Example #21
Source File: TimeExtractor.py    From GeneralNewsExtractor with MIT License 5 votes vote down vote up
def extract_from_meta(self, element: HtmlElement) -> str:
        """Well-maintained news sites put the publish time in META tags,
        so those should be checked first.

        :param element: DOM tree of the page source
        :return: joined META value, or '' when no xpath matches
        """
        for meta_xpath in PUBLISH_TIME_META:
            matched = element.xpath(meta_xpath)
            if matched:
                return ''.join(matched)
        return ''
Example #22
Source File: TimeExtractor.py    From GeneralNewsExtractor with MIT License 5 votes vote down vote up
def extract_from_text(self, element: HtmlElement) -> str:
        """Scan the page's full text for the first datetime pattern match.

        :param element: DOM tree of the page source
        :return: the matched datetime string, or '' when nothing matches
        """
        text = ''.join(element.xpath('.//text()'))
        for dt in self.time_pattern:
            dt_obj = re.search(dt, text)
            if dt_obj:
                return dt_obj.group(1)
        # The original used a misleading for/else here; since the loop
        # body returns on match, a plain fallthrough return is equivalent.
        return ''
Example #23
Source File: TimeExtractor.py    From GeneralNewsExtractor with MIT License 5 votes vote down vote up
def extract_from_user_xpath(self, publish_time_xpath: str, element: HtmlElement) -> str:
        """Extract the publish time via a caller-supplied xpath.

        :param publish_time_xpath: user-provided xpath; may be empty
        :param element: DOM tree of the page source
        :return: joined xpath result, or '' when no xpath was given
        """
        if not publish_time_xpath:
            return ''
        return ''.join(element.xpath(publish_time_xpath))
Example #24
Source File: TimeExtractor.py    From GeneralNewsExtractor with MIT License 5 votes vote down vote up
def extractor(self, element: HtmlElement, publish_time_xpath: str = '') -> str:
        """Resolve the publish time, trying sources in priority order.

        :param element: DOM tree of the page source
        :param publish_time_xpath: optional caller-supplied xpath override
        :return: publish time string, or '' when every source fails
        """
        publish_time_xpath = publish_time_xpath or config.get('publish_time', {}).get('xpath')
        # 1) user-supplied xpath has the highest priority
        publish_time = self.extract_from_user_xpath(publish_time_xpath, element)
        # 2) then META tags
        if not publish_time:
            publish_time = self.extract_from_meta(element)
        # 3) worst case: regex over the body text
        if not publish_time:
            publish_time = self.extract_from_text(element)
        return publish_time
Example #25
Source File: jav321.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def get_outline(lx: html.HtmlElement) -> str:
    """Return the outline/description text from a jav321 detail page."""
    matches = lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[3]/div/text()")
    return matches[0]
Example #26
Source File: jav321.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def get_series2(lx: html.HtmlElement) -> str:
    """Return the series text (alternate location) from a jav321 page."""
    matches = lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[1]/div[2]/a[11]/text()")
    return matches[0]
Example #27
Source File: utils.py    From scrape with MIT License 5 votes vote down vote up
def parse_html(infile, xpath):
    """Filter HTML using XPath.

    :param infile: an HtmlElement, or raw markup to parse first
    :param xpath: XPath expression to evaluate
    :raises ValueError: when the expression matches nothing
    """
    tree = infile if isinstance(infile, lh.HtmlElement) else lh.fromstring(infile)
    results = tree.xpath(xpath)
    if not results:
        raise ValueError("XPath {0} returned no results.".format(xpath))
    return results


# URL processing functions
# 
Example #28
Source File: javlib.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def get_from_xpath(lx: html.HtmlElement, xpath: str) -> str:
    """Return the stripped text of the first node matching *xpath*."""
    matches = lx.xpath(xpath)
    return matches[0].strip()
Example #29
Source File: parser.py    From fonduer with MIT License 4 votes vote down vote up
def _parse_caption(
        self, node: HtmlElement, state: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Parse a Caption of the node.

        :param node: The lxml node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        :return: the (possibly updated) parsing state
        :raises NotImplementedError: when the caption's parent context is
            neither a Table nor a Figure.
        """
        if node.tag not in ["caption", "figcaption"]:  # captions used in Tables
            return state

        # Add a Caption; stable_id format: "<doc name>::caption:<idx>"
        parent = state["parent"][node]
        stable_id = (
            f"{state['document'].name}::caption:{state['caption']['idx']}"
        )

        # Optional name attribute for the Caption (the original comment
        # said "Section", which was a copy-paste slip).
        name = node.attrib.get("name")

        # A Caption belongs to exactly one of a Table or a Figure.
        if isinstance(parent, Table):
            table, figure = parent, None
        elif isinstance(parent, Figure):
            table, figure = None, parent
        else:
            raise NotImplementedError("Caption must be a child of Table or Figure.")
        state["context"][node] = Caption(
            document=state["document"],
            table=table,
            figure=figure,
            stable_id=stable_id,
            name=name,
            position=state["caption"]["idx"],
        )
        state["caption"]["idx"] += 1

        return state
Example #30
Source File: utils.py    From trafilatura with GNU General Public License v3.0 4 votes vote down vote up
def load_html(htmlobject):
    """Load object given as input and validate its type
    (accepted: LXML tree, bytestring and string)

    :param htmlobject: an already-parsed lxml tree, bytes, or str
    :return: an lxml tree, or None when parsing fails or the input type
        is unsupported
    """
    # use tree directly
    if isinstance(htmlobject, (etree._ElementTree, html.HtmlElement)):
        return htmlobject
    tree = None
    check_flag = False
    # try to detect encoding and convert to string
    if isinstance(htmlobject, bytes):
        # heuristic: flag inputs whose head doesn't look like HTML for a
        # later sanity check on the parse result
        if 'html' not in htmlobject[:50].decode(encoding='ascii', errors='ignore'):
            check_flag = True
        guessed_encoding = detect_encoding(htmlobject)
        if guessed_encoding is not None:
            if guessed_encoding == 'UTF-8':
                tree = html.fromstring(htmlobject, parser=HTML_PARSER)
            else:
                try:
                    htmlobject = htmlobject.decode(guessed_encoding)
                    tree = html.fromstring(htmlobject, parser=HTML_PARSER)
                except UnicodeDecodeError:
                    LOGGER.warning('encoding issue: %s', guessed_encoding)
                    tree = html.fromstring(htmlobject, parser=RECOVERY_PARSER)
        else:
            # unknown encoding: let the recovery parser do its best
            tree = html.fromstring(htmlobject, parser=RECOVERY_PARSER)
    # use string if applicable
    elif isinstance(htmlobject, str):
        if 'html' not in htmlobject[:50]:
            check_flag = True
        try:
            tree = html.fromstring(htmlobject, parser=HTML_PARSER)
        except ValueError:
            # try to parse a bytestring (presumably the str carried an
            # encoding declaration lxml rejects — confirm)
            try:
                tree = html.fromstring(htmlobject.encode('utf8'), parser=HTML_PARSER)
            except Exception as err:
                LOGGER.error('parser bytestring %s', err)
        except Exception as err:
            LOGGER.error('parsing failed: %s', err)
    # default to None
    else:
        LOGGER.error('this type cannot be processed: %s', type(htmlobject))
    # sanity check: a flagged input that produced a near-empty tree is
    # likely not HTML at all
    if tree is not None and check_flag:
        if len(tree) < 2:
            LOGGER.error('Parse tree empty: not valid HTML')
            tree = None
    return tree