Python lxml.html.HtmlElement() Examples
The following are 30 code examples of lxml.html.HtmlElement(). You can go to the original project or source file by following the link above each example. You may also want to check out all available functions and classes of the lxml.html module.
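Before the examples, a minimal orientation sketch: HtmlElement is the element class returned by lxml.html's parsing functions, so in practice an instance is usually obtained via html.fromstring() rather than by calling the constructor directly.

from lxml import html

# Parsing with lxml.html yields HtmlElement instances (not plain etree elements).
root = html.fromstring('<div><p class="intro">Hello</p></div>')
print(isinstance(root, html.HtmlElement))        # True
print(root.xpath('//p[@class="intro"]/text()'))  # ['Hello']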
Example #1
Source File: scraper.py From scrape-linkedin with MIT License
def test_scores(self):
    """Return a list of dictionaries with test scores."""
    if isinstance(self.xp_test_scores, html.HtmlElement):
        count = int(self.get_clean_xpath(
            'count(//div[@id="background-test-scores"]/div[contains(@id, "scores-")])'))
        test_scores = []
        for i in range(1, count + 1):
            data = {}
            data['name'] = extract_one(
                self.get_xp(self.xp_test_scores, './/h4//text()'))
            data['score'] = extract_one(
                self.get_xp(self.xp_test_scores, './/h5//text()'))
            data['description'] = ' '.join(self.get_xp(
                self.xp_test_scores, './/p[contains(@class,"description")]//text()'))
            data['date'] = extract_one(self.get_xp(
                self.xp_test_scores, './/span[@class = "date-range"]/time[1]/text()'))
            test_scores.append(data)
    else:
        test_scores = []
    return test_scores
Example #2
Source File: html_to_telegraph.py From html-telegraph-poster with MIT License
def _fragments_from_string(html_string):
    fragments = html.fragments_fromstring(html_string)
    if not len(fragments):
        return []
    # convert and append text node before starting tag
    if not isinstance(fragments[0], html.HtmlElement):
        if len(fragments[0].strip()) > 0:
            if len(fragments) == 1:
                return html.fragments_fromstring('<p>%s</p>' % fragments[0])
            else:
                paragraph = _create_element('p')
                paragraph.text = fragments[0]
                fragments[1].addprevious(paragraph)
                fragments.insert(1, paragraph)
        fragments.pop(0)
        if not len(fragments):
            return []
    # remove xml instructions (if cleaning is disabled)
    for instruction in fragments[0].xpath('//processing-instruction()'):
        instruction.drop_tag()
    return fragments
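For context on the isinstance check above: lxml's fragments_fromstring() returns any leading text as a plain string rather than an HtmlElement, which is exactly the case this function normalizes. A minimal sketch:

from lxml import html

fragments = html.fragments_fromstring('leading text<p>first</p><p>second</p>')
print(fragments[0])                                # 'leading text' (a str)
print(isinstance(fragments[0], html.HtmlElement))  # False
print([f.tag for f in fragments[1:]])              # ['p', 'p']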
Example #3
Source File: parser.py From fonduer with MIT License
def _parse_node(
    self, node: HtmlElement, state: Dict[str, Any]
) -> Iterator[Sentence]:
    """Entry point for parsing all node types.

    :param node: The lxml HTML node to parse
    :param state: The global state necessary to place the node in context
        of the document as a whole.
    :return: a *generator* of Sentences
    """
    # Processing on entry of node
    state = self._parse_section(node, state)
    state = self._parse_figure(node, state)
    if self.tabular:
        state = self._parse_table(node, state)
    state = self._parse_caption(node, state)
    yield from self._parse_paragraph(node, state)
Example #4
Source File: utils.py From GeneralNewsExtractor with MIT License
def drop_tag(node: HtmlElement):
    """Delete only the tag itself, merging its text into the parent.

    :param node:
    :return:
    """
    parent = node.getparent()
    if parent is not None:
        node.drop_tag()
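The drop_tag() method used here is part of lxml.html's public API: it removes the element's own tag while splicing its text and children into the parent. A quick illustration:

from lxml import html

root = html.fromstring('<p>Hello <b>brave</b> world</p>')
root.find('b').drop_tag()
print(html.tostring(root))  # b'<p>Hello brave world</p>'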
Example #5
Source File: jav321.py From AV_Data_Capture with GNU General Public License v3.0
def get_title(lx: html.HtmlElement) -> str:
    return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[1]/h3/text()")[0].strip()
Example #6
Source File: requests_html.py From requests-html with MIT License
def lxml(self) -> HtmlElement:
    """`lxml <http://lxml.de>`_ representation of the
    :class:`Element <Element>` or :class:`HTML <HTML>`.
    """
    if self._lxml is None:
        try:
            self._lxml = soup_parse(self.html, features='html.parser')
        except ValueError:
            self._lxml = lxml.html.fromstring(self.raw_html)
    return self._lxml
Example #7
Source File: test_http.py From memorious with MIT License
def test_html(self, http):
    request = Request("GET", "https://httpbin.org/html")
    context_http_response = ContextHttpResponse(http, request)
    assert isinstance(context_http_response.html, html.HtmlElement)
Example #8
Source File: jav321.py From AV_Data_Capture with GNU General Public License v3.0
def get_cover(lx: html.HtmlElement) -> str:
    return lx.xpath("/html/body/div[2]/div[2]/div[1]/p/a/img/@src")[0]
Example #9
Source File: html_to_telegraph.py From html-telegraph-poster with MIT License
def preprocess_media_tags(element):
    if isinstance(element, html.HtmlElement):
        if element.tag in ['ol', 'ul']:
            # ignore any spaces between <ul> and <li>
            element.text = ''
        elif element.tag == 'li':
            # ignore spaces after </li>
            element.tail = ''
        elif element.tag == 'iframe':
            iframe_src = element.get('src')
            youtube = youtube_re.match(iframe_src)
            vimeo = vimeo_re.match(iframe_src)
            telegram = re.match(telegram_embed_iframe_re, iframe_src)
            if youtube or vimeo or telegram:
                element.text = ''  # ignore any legacy text
                if youtube:
                    yt_id = urlparse(iframe_src).path.replace('/embed/', '')
                    element.set('src', '/embed/youtube?url=' +
                                quote_plus('https://www.youtube.com/watch?v=' + yt_id))
                elif vimeo:
                    element.set('src', '/embed/vimeo?url=' +
                                quote_plus('https://vimeo.com/' + vimeo.group(2)))
                elif telegram:
                    element.set('src', '/embed/telegram?url=' + quote_plus(iframe_src))
                if not len(element.xpath('./ancestor::figure')):
                    _wrap_figure(element)
            else:
                element.drop_tag()
        elif element.tag == 'blockquote' and element.get('class') == 'twitter-tweet':
            twitter_links = element.xpath('.//a[@href]')
            for tw_link in twitter_links:
                if twitter_re.match(tw_link.get('href')):
                    twitter_frame = html.HtmlElement()
                    twitter_frame.tag = 'iframe'
                    twitter_frame.set('src', '/embed/twitter?url=' + quote_plus(tw_link.get('href')))
                    element.addprevious(twitter_frame)
                    _wrap_figure(twitter_frame)
                    element.drop_tree()
                    break
Example #10
Source File: html_to_telegraph.py From html-telegraph-poster with MIT License
def _create_element(element, text=None):
    # creates lxml element without document tree (no body, no parents)
    new_element = html.HtmlElement()
    new_element.tag = element
    if text:
        new_element.text = text
    return new_element
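A hypothetical usage of this helper (assuming _create_element is imported from the module above). The resulting element has no parent document, which is why Example #9 can freely attach elements built the same way with addprevious():

from lxml import html

paragraph = _create_element('p', text='standalone paragraph')
print(paragraph.getparent())     # None: no surrounding document tree
print(html.tostring(paragraph))  # b'<p>standalone paragraph</p>'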
Example #11
Source File: test_markdown.py From online-judge with GNU Affero General Public License v3.0
def test_text_prefix(self):
    tree = fragments_to_tree('z<p>a</p><p>b</p>')
    self.assertIsInstance(tree, html.HtmlElement)
    self.assertEqual(len(tree.getchildren()), 2)
    self.assertEqual(tree.text, 'z')
    self.assertHTMLEqual(fragment_tree_to_str(tree), 'z<p>a</p><p>b</p>')
Example #12
Source File: test_markdown.py From online-judge with GNU Affero General Public License v3.0
def test_simple(self):
    tree = fragments_to_tree('<p>a</p><p>b</p>')
    self.assertIsInstance(tree, html.HtmlElement)
    self.assertEqual(len(tree.getchildren()), 2)
    self.assertIsInstance(tree[0], html.HtmlElement)
    self.assertEqual(tree[0].tag, 'p')
    self.assertEqual(tree[0].text, 'a')
    self.assertIsInstance(tree[1], html.HtmlElement)
    self.assertEqual(tree[1].tag, 'p')
    self.assertEqual(tree[1].text, 'b')
    self.assertHTMLEqual(fragment_tree_to_str(tree), '<p>a</p><p>b</p>')
Example #13
Source File: structural.py From fonduer with MIT License
def _get_node(sentence: Sentence) -> HtmlElement:
    # Use caching to speed up the retrieval process
    doc_etree = _get_etree_for_text(sentence.document.text)
    return doc_etree.xpath(sentence.xpath)[0]
Example #14
Source File: utils.py From scrape with MIT License
def get_resp(url):
    """Get webpage response as an lxml.html.HtmlElement object."""
    try:
        headers = {"User-Agent": random.choice(USER_AGENTS)}
        try:
            request = requests.get(url, headers=headers, proxies=get_proxies())
        except MissingSchema:
            url = add_protocol(url)
            request = requests.get(url, headers=headers, proxies=get_proxies())
        return lh.fromstring(request.text.encode("utf-8") if PY2 else request.text)
    except Exception:
        sys.stderr.write("Failed to retrieve {0}.\n".format(url))
        raise
Example #15
Source File: parser.py From fonduer with MIT License
def _parse_section(
    self, node: HtmlElement, state: Dict[str, Any]
) -> Dict[str, Any]:
    """Parse a Section of the node.

    Note that this implementation currently creates a Section at the
    beginning of the document and creates a Section based on the tag of
    the node.

    :param node: The lxml node to parse
    :param state: The global state necessary to place the node in context
        of the document as a whole.
    """
    if node.tag not in ["html", "section"]:
        return state

    # Add a Section
    stable_id = (
        f"{state['document'].name}"
        f"::"
        f"{'section'}"
        f":"
        f"{state['section']['idx']}"
    )
    # Set name for Section
    name = node.attrib["name"] if "name" in node.attrib else None
    state["context"][node] = Section(
        document=state["document"],
        name=name,
        stable_id=stable_id,
        position=state["section"]["idx"],
    )
    state["section"]["idx"] += 1
    return state
Example #16
Source File: utils.py From GeneralNewsExtractor with MIT License
def remove_node(node: HtmlElement):
    """This is an in-place operation; nothing needs to be returned.

    :param node:
    :return:
    """
    parent = node.getparent()
    if parent is not None:
        parent.remove(node)
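One detail worth knowing about the underlying call: lxml's Element.remove() discards the removed element's tail text along with it. A small demonstration, assuming remove_node is imported from the module above:

from lxml import html

root = html.fromstring('<div><span>ad</span> trailing text</div>')
remove_node(root.find('span'))
print(html.tostring(root))  # b'<div></div>': the tail ' trailing text' is gone too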
Example #17
Source File: utils.py From GeneralNewsExtractor with MIT License
def iter_node(element: HtmlElement):
    yield element
    for sub_element in element:
        if isinstance(sub_element, HtmlElement):
            yield from iter_node(sub_element)
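The isinstance check in this depth-first generator also skips comment and processing-instruction nodes, which lxml yields during child iteration but which are not HtmlElement instances. A usage sketch:

from lxml import html

root = html.fromstring('<div><p>a<b>c</b></p><!-- ad --><p>b</p></div>')
print([node.tag for node in iter_node(root)])  # ['div', 'p', 'b', 'p']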
Example #18
Source File: utils.py From GeneralNewsExtractor with MIT License
def normalize_node(element: HtmlElement):
    etree.strip_elements(element, *USELESS_TAG)
    for node in iter_node(element):
        # inspired by readability
        if node.tag.lower() in TAGS_CAN_BE_REMOVE_IF_EMPTY and is_empty_element(node):
            remove_node(node)

        # merge text in <span> or <strong> into the parent <p> tag
        if node.tag.lower() == 'p':
            etree.strip_tags(node, 'span')
            etree.strip_tags(node, 'strong')

        # a <div> or <span> without any child element can be converted to a <p>
        if node.tag.lower() == 'div' and not node.getchildren():
            node.tag = 'p'
        if node.tag.lower() == 'span' and not node.getchildren():
            node.tag = 'p'

        # remove empty <p> tags
        if node.tag.lower() == 'p' and not node.xpath('.//img'):
            if not (node.text and node.text.strip()):
                drop_tag(node)

        class_name = node.get('class')
        if class_name:
            for attribute in USELESS_ATTR:
                if attribute in class_name:
                    remove_node(node)
                    break
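The strip_tags calls above rely on lxml.etree.strip_tags(), which deletes the named tags but merges their text and children into the surrounding content. A minimal illustration of that building block:

from lxml import etree, html

node = html.fromstring('<p>one <span>two</span> <strong>three</strong></p>')
etree.strip_tags(node, 'span', 'strong')
print(html.tostring(node))  # b'<p>one two three</p>'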
Example #19
Source File: TitleExtractor.py From GeneralNewsExtractor with MIT License
def extract(self, element: HtmlElement, title_xpath: str = '') -> str:
    title_xpath = title_xpath or config.get('title', {}).get('xpath')
    title = (self.extract_by_xpath(element, title_xpath)
             or self.extract_by_htag_and_title(element)
             or self.extract_by_title(element)
             or self.extract_by_htag(element)
             )
    return title.strip()
Example #20
Source File: AuthorExtractor.py From GeneralNewsExtractor with MIT License
def extractor(self, element: HtmlElement, author_xpath=''):
    author_xpath = author_xpath or config.get('author', {}).get('xpath')
    if author_xpath:
        author = ''.join(element.xpath(author_xpath))
        return author
    text = ''.join(element.xpath('.//text()'))
    for pattern in self.author_pattern:
        author_obj = re.search(pattern, text)
        if author_obj:
            return author_obj.group(1)
    return ''
Example #21
Source File: TimeExtractor.py From GeneralNewsExtractor with MIT License
def extract_from_meta(self, element: HtmlElement) -> str:
    """Well-structured news sites often put the publish time in META tags,
    so the META data should be checked first.

    :param element: the DOM tree parsed from the page source
    :return: str
    """
    for xpath in PUBLISH_TIME_META:
        publish_time = element.xpath(xpath)
        if publish_time:
            return ''.join(publish_time)
    return ''
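PUBLISH_TIME_META is a module-level list of XPath expressions. The entries below are hypothetical, shown only to illustrate the kind of meta-tag lookups such a list contains; the project's actual entries may differ:

# Hypothetical entries, for illustration only
PUBLISH_TIME_META = [
    '//meta[@property="article:published_time"]/@content',
    '//meta[@name="pubdate"]/@content',
    '//meta[@itemprop="datePublished"]/@content',
]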
Example #22
Source File: TimeExtractor.py From GeneralNewsExtractor with MIT License
def extract_from_text(self, element: HtmlElement) -> str:
    text = ''.join(element.xpath('.//text()'))
    for dt in self.time_pattern:
        dt_obj = re.search(dt, text)
        if dt_obj:
            return dt_obj.group(1)
    return ''
Example #23
Source File: TimeExtractor.py From GeneralNewsExtractor with MIT License
def extract_from_user_xpath(self, publish_time_xpath: str, element: HtmlElement) -> str:
    if publish_time_xpath:
        publish_time = ''.join(element.xpath(publish_time_xpath))
        return publish_time
    return ''
Example #24
Source File: TimeExtractor.py From GeneralNewsExtractor with MIT License
def extractor(self, element: HtmlElement, publish_time_xpath: str = '') -> str:
    publish_time_xpath = publish_time_xpath or config.get('publish_time', {}).get('xpath')
    publish_time = (self.extract_from_user_xpath(publish_time_xpath, element)  # a user-supplied XPath has top priority
                    or self.extract_from_meta(element)   # second priority: extract from META tags
                    or self.extract_from_text(element))  # worst case: extract from the body text
    return publish_time
Example #25
Source File: jav321.py From AV_Data_Capture with GNU General Public License v3.0
def get_outline(lx: html.HtmlElement) -> str:
    return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[3]/div/text()")[0]
Example #26
Source File: jav321.py From AV_Data_Capture with GNU General Public License v3.0
def get_series2(lx: html.HtmlElement) -> str:
    return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[1]/div[2]/a[11]/text()")[0]
Example #27
Source File: utils.py From scrape with MIT License
def parse_html(infile, xpath):
    """Filter HTML using XPath."""
    if not isinstance(infile, lh.HtmlElement):
        infile = lh.fromstring(infile)
    infile = infile.xpath(xpath)
    if not infile:
        raise ValueError("XPath {0} returned no results.".format(xpath))
    return infile


# URL processing functions #
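A quick usage sketch: parse_html() accepts either raw markup or an already-parsed HtmlElement, so it composes directly with get_resp() from Example #14.

links = parse_html('<div><a href="/a">A</a><a href="/b">B</a></div>', '//a/@href')
print(links)  # ['/a', '/b']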
Example #28
Source File: javlib.py From AV_Data_Capture with GNU General Public License v3.0
def get_from_xpath(lx: html.HtmlElement, xpath: str) -> str:
    return lx.xpath(xpath)[0].strip()
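A hypothetical call, assuming lx was produced by html.fromstring(); note that because of the [0].strip(), a non-matching XPath raises IndexError rather than returning an empty string:

from lxml import html

lx = html.fromstring('<html><head><title>  Some Title  </title></head></html>')
print(get_from_xpath(lx, '//title/text()'))  # 'Some Title'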
Example #29
Source File: parser.py From fonduer with MIT License
def _parse_caption(
    self, node: HtmlElement, state: Dict[str, Any]
) -> Dict[str, Any]:
    """Parse a Caption of the node.

    :param node: The lxml node to parse
    :param state: The global state necessary to place the node in context
        of the document as a whole.
    """
    if node.tag not in ["caption", "figcaption"]:  # captions used in Tables
        return state

    # Add a Caption
    parent = state["parent"][node]
    stable_id = (
        f"{state['document'].name}"
        f"::"
        f"{'caption'}"
        f":"
        f"{state['caption']['idx']}"
    )
    # Set name for Caption
    name = node.attrib["name"] if "name" in node.attrib else None
    if isinstance(parent, Table):
        state["context"][node] = Caption(
            document=state["document"],
            table=parent,
            figure=None,
            stable_id=stable_id,
            name=name,
            position=state["caption"]["idx"],
        )
    elif isinstance(parent, Figure):
        state["context"][node] = Caption(
            document=state["document"],
            table=None,
            figure=parent,
            stable_id=stable_id,
            name=name,
            position=state["caption"]["idx"],
        )
    else:
        raise NotImplementedError("Caption must be a child of Table or Figure.")
    state["caption"]["idx"] += 1
    return state
Example #30
Source File: utils.py From trafilatura with GNU General Public License v3.0
def load_html(htmlobject):
    """Load the object given as input and validate its type
    (accepted: lxml tree, bytestring and string).
    """
    # use the tree directly
    if isinstance(htmlobject, (etree._ElementTree, html.HtmlElement)):
        return htmlobject
    tree = None
    check_flag = False
    # try to detect encoding and convert to string
    if isinstance(htmlobject, bytes):
        # test
        if 'html' not in htmlobject[:50].decode(encoding='ascii', errors='ignore'):
            check_flag = True
        guessed_encoding = detect_encoding(htmlobject)
        if guessed_encoding is not None:
            if guessed_encoding == 'UTF-8':
                tree = html.fromstring(htmlobject, parser=HTML_PARSER)
            else:
                try:
                    htmlobject = htmlobject.decode(guessed_encoding)
                    tree = html.fromstring(htmlobject, parser=HTML_PARSER)
                except UnicodeDecodeError:
                    LOGGER.warning('encoding issue: %s', guessed_encoding)
                    tree = html.fromstring(htmlobject, parser=RECOVERY_PARSER)
        else:
            tree = html.fromstring(htmlobject, parser=RECOVERY_PARSER)
    # use string if applicable
    elif isinstance(htmlobject, str):
        # test
        if 'html' not in htmlobject[:50]:
            check_flag = True
        try:
            tree = html.fromstring(htmlobject, parser=HTML_PARSER)
        except ValueError:
            # try to parse a bytestring
            try:
                tree = html.fromstring(htmlobject.encode('utf8'), parser=HTML_PARSER)
            except Exception as err:
                LOGGER.error('parser bytestring %s', err)
        except Exception as err:
            LOGGER.error('parsing failed: %s', err)
    # default to None
    else:
        LOGGER.error('this type cannot be processed: %s', type(htmlobject))
    # further test: check if it's actually HTML
    if tree is not None and check_flag is True:
        if len(tree) < 2:
            LOGGER.error('Parse tree empty: not valid HTML')
            tree = None
    # if tree is None:
    #     if isinstance(htmlobject, bytes) or isinstance(htmlobject, str):
    #         # more robust parsing
    #         tree = fromsoup(htmlobject)
    return tree
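A usage sketch (HTML_PARSER, RECOVERY_PARSER, detect_encoding, and LOGGER are module-level names in the same file): load_html() normalizes all three accepted input types to a single parsed tree.

from lxml import html

markup = '<html><body><p>content</p></body></html>'
tree_from_str = load_html(markup)                    # str input
tree_from_bytes = load_html(markup.encode('utf-8'))  # bytes input, encoding detected
same_tree = load_html(tree_from_str)                 # an existing tree is returned as-is
print(isinstance(tree_from_str, html.HtmlElement))   # True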