Python lxml.html.Element() Examples
The following are 14
code examples of lxml.html.Element().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
lxml.html
, or try the search function
.
Example #1
Source File: reference.py From online-judge with GNU Affero General Public License v3.0 | 6 votes |
def get_user_rating(username, data): if not data: element = Element('span') element.text = username return element rating = data[1] element = Element('a', {'class': 'rate-group', 'href': reverse('user_page', args=[username])}) if rating: rating_css = rating_class(rating) rate_box = Element('span', {'class': 'rate-box ' + rating_css}) rate_box.append(Element('span', {'style': 'height: %3.fem' % rating_progress(rating)})) user = Element('span', {'class': 'rating ' + rating_css}) user.text = username element.append(rate_box) element.append(user) else: element.text = username return element
Example #2
Source File: lxml_tree.py From online-judge with GNU Affero General Public License v3.0 | 5 votes |
def __init__(self, str): try: self._tree = html.fromstring(str, parser=html.HTMLParser(recover=True)) except (XMLSyntaxError, ParserError) as e: if str and (not isinstance(e, ParserError) or e.args[0] != 'Document is empty'): logger.exception('Failed to parse HTML string') self._tree = html.Element('div')
Example #3
Source File: reference.py From online-judge with GNU Affero General Public License v3.0 | 5 votes |
def get_user(username, data): if not data: element = Element('span') element.text = username return element element = Element('span', {'class': Profile.get_user_css_class(*data)}) link = Element('a', {'href': reverse('user_page', args=[username])}) link.text = username element.append(link) return element
Example #4
Source File: lazy_load.py From online-judge with GNU Affero General Public License v3.0 | 5 votes |
def lazy_load(tree): blank = static('blank.gif') for img in tree.xpath('.//img'): src = img.get('src', '') if src.startswith('data') or '-math' in img.get('class', ''): continue noscript = html.Element('noscript') copy = deepcopy(img) copy.tail = '' noscript.append(copy) img.addprevious(noscript) img.set('data-src', src) img.set('src', blank) img.set('class', img.get('class') + ' unveil' if img.get('class') else 'unveil')
Example #5
Source File: __init__.py From online-judge with GNU Affero General Public License v3.0 | 5 votes |
def fragments_to_tree(fragment): tree = html.Element('div') try: parsed = html.fragments_fromstring(fragment, parser=html.HTMLParser(recover=True)) except (XMLSyntaxError, ParserError) as e: if fragment and (not isinstance(e, ParserError) or e.args[0] != 'Document is empty'): logger.exception('Failed to parse HTML string') return tree if parsed and isinstance(parsed[0], str): tree.text = parsed[0] parsed = parsed[1:] tree.extend(parsed) return tree
Example #6
Source File: html5parser.py From aws-lambda-lxml with GNU General Public License v3.0 | 5 votes |
def fragment_fromstring(html, create_parent=False, guess_charset=False, parser=None): """Parses a single HTML element; it is an error if there is more than one element, or if anything but whitespace precedes or follows the element. If create_parent is true (or is a tag name) then a parent node will be created to encapsulate the HTML in a single element. In this case, leading or trailing text is allowed. """ if not isinstance(html, _strings): raise TypeError('string required') accept_leading_text = bool(create_parent) elements = fragments_fromstring( html, guess_charset=guess_charset, parser=parser, no_leading_text=not accept_leading_text) if create_parent: if not isinstance(create_parent, _strings): create_parent = 'div' new_root = Element(create_parent) if elements: if isinstance(elements[0], _strings): new_root.text = elements[0] del elements[0] new_root.extend(elements) return new_root if not elements: raise etree.ParserError('No elements found') if len(elements) > 1: raise etree.ParserError('Multiple elements found') result = elements[0] if result.tail and result.tail.strip(): raise etree.ParserError('Element followed by text: %r' % result.tail) result.tail = None return result
Example #7
Source File: html5parser.py From aws-lambda-lxml with GNU General Public License v3.0 | 5 votes |
def fragment_fromstring(html, create_parent=False, guess_charset=False, parser=None): """Parses a single HTML element; it is an error if there is more than one element, or if anything but whitespace precedes or follows the element. If create_parent is true (or is a tag name) then a parent node will be created to encapsulate the HTML in a single element. In this case, leading or trailing text is allowed. """ if not isinstance(html, _strings): raise TypeError('string required') accept_leading_text = bool(create_parent) elements = fragments_fromstring( html, guess_charset=guess_charset, parser=parser, no_leading_text=not accept_leading_text) if create_parent: if not isinstance(create_parent, _strings): create_parent = 'div' new_root = Element(create_parent) if elements: if isinstance(elements[0], _strings): new_root.text = elements[0] del elements[0] new_root.extend(elements) return new_root if not elements: raise etree.ParserError('No elements found') if len(elements) > 1: raise etree.ParserError('Multiple elements found') result = elements[0] if result.tail and result.tail.strip(): raise etree.ParserError('Element followed by text: %r' % result.tail) result.tail = None return result
Example #8
Source File: html5parser.py From aws-lambda-lxml with GNU General Public License v3.0 | 5 votes |
def fragment_fromstring(html, create_parent=False, guess_charset=False, parser=None): """Parses a single HTML element; it is an error if there is more than one element, or if anything but whitespace precedes or follows the element. If create_parent is true (or is a tag name) then a parent node will be created to encapsulate the HTML in a single element. In this case, leading or trailing text is allowed. """ if not isinstance(html, _strings): raise TypeError('string required') accept_leading_text = bool(create_parent) elements = fragments_fromstring( html, guess_charset=guess_charset, parser=parser, no_leading_text=not accept_leading_text) if create_parent: if not isinstance(create_parent, _strings): create_parent = 'div' new_root = Element(create_parent) if elements: if isinstance(elements[0], _strings): new_root.text = elements[0] del elements[0] new_root.extend(elements) return new_root if not elements: raise etree.ParserError('No elements found') if len(elements) > 1: raise etree.ParserError('Multiple elements found') result = elements[0] if result.tail and result.tail.strip(): raise etree.ParserError('Element followed by text: %r' % result.tail) result.tail = None return result
Example #9
Source File: html5parser.py From aws-lambda-lxml with GNU General Public License v3.0 | 5 votes |
def fragment_fromstring(html, create_parent=False, guess_charset=False, parser=None): """Parses a single HTML element; it is an error if there is more than one element, or if anything but whitespace precedes or follows the element. If create_parent is true (or is a tag name) then a parent node will be created to encapsulate the HTML in a single element. In this case, leading or trailing text is allowed. """ if not isinstance(html, _strings): raise TypeError('string required') accept_leading_text = bool(create_parent) elements = fragments_fromstring( html, guess_charset=guess_charset, parser=parser, no_leading_text=not accept_leading_text) if create_parent: if not isinstance(create_parent, _strings): create_parent = 'div' new_root = Element(create_parent) if elements: if isinstance(elements[0], _strings): new_root.text = elements[0] del elements[0] new_root.extend(elements) return new_root if not elements: raise etree.ParserError('No elements found') if len(elements) > 1: raise etree.ParserError('Multiple elements found') result = elements[0] if result.tail and result.tail.strip(): raise etree.ParserError('Element followed by text: %r' % result.tail) result.tail = None return result
Example #10
Source File: html5parser.py From stopstalk-deployment with MIT License | 5 votes |
def fragment_fromstring(html, create_parent=False, guess_charset=False, parser=None): """Parses a single HTML element; it is an error if there is more than one element, or if anything but whitespace precedes or follows the element. If create_parent is true (or is a tag name) then a parent node will be created to encapsulate the HTML in a single element. In this case, leading or trailing text is allowed. """ if not isinstance(html, _strings): raise TypeError('string required') accept_leading_text = bool(create_parent) elements = fragments_fromstring( html, guess_charset=guess_charset, parser=parser, no_leading_text=not accept_leading_text) if create_parent: if not isinstance(create_parent, _strings): create_parent = 'div' new_root = Element(create_parent) if elements: if isinstance(elements[0], _strings): new_root.text = elements[0] del elements[0] new_root.extend(elements) return new_root if not elements: raise etree.ParserError('No elements found') if len(elements) > 1: raise etree.ParserError('Multiple elements found') result = elements[0] if result.tail and result.tail.strip(): raise etree.ParserError('Element followed by text: %r' % result.tail) result.tail = None return result
Example #11
Source File: html5parser.py From learn_python3_spider with MIT License | 4 votes |
def fragment_fromstring(html, create_parent=False, guess_charset=None, parser=None): """Parses a single HTML element; it is an error if there is more than one element, or if anything but whitespace precedes or follows the element. If 'create_parent' is true (or is a tag name) then a parent node will be created to encapsulate the HTML in a single element. In this case, leading or trailing text is allowed. If `guess_charset` is true, the `chardet` library will perform charset guessing on the string. """ if not isinstance(html, _strings): raise TypeError('string required') accept_leading_text = bool(create_parent) elements = fragments_fromstring( html, guess_charset=guess_charset, parser=parser, no_leading_text=not accept_leading_text) if create_parent: if not isinstance(create_parent, _strings): create_parent = 'div' new_root = Element(create_parent) if elements: if isinstance(elements[0], _strings): new_root.text = elements[0] del elements[0] new_root.extend(elements) return new_root if not elements: raise etree.ParserError('No elements found') if len(elements) > 1: raise etree.ParserError('Multiple elements found') result = elements[0] if result.tail and result.tail.strip(): raise etree.ParserError('Element followed by text: %r' % result.tail) result.tail = None return result
Example #12
Source File: rsc.py From ChemDataExtractor with MIT License | 4 votes |
def parse_rsc_html(htmlstring): """Messy RSC HTML needs this special parser to fix problems before creating selector.""" converted = UnicodeDammit(htmlstring) if not converted.unicode_markup: raise UnicodeDecodeError('Failed to detect encoding, tried [%s]') root = fromstring(htmlstring, parser=HTMLParser(recover=True, encoding=converted.original_encoding)) # Add p.otherpara tags around orphan text newp = None for child in root.get_element_by_id('wrapper'): if newp is not None: if child.tag in BLOCK_ELEMENTS or child.get('id', '').startswith('sect') or child.getnext() is None: child.addprevious(newp) newp = None else: newp.append(child) if newp is None and child.tag in BLOCK_ELEMENTS and child.tail and child.tail.strip(): newp = Element('p', **{'class': 'otherpara'}) newp.text = child.tail child.tail = '' return root
Example #13
Source File: html5parser.py From lambda-text-extractor with Apache License 2.0 | 4 votes |
def fragment_fromstring(html, create_parent=False, guess_charset=None, parser=None): """Parses a single HTML element; it is an error if there is more than one element, or if anything but whitespace precedes or follows the element. If 'create_parent' is true (or is a tag name) then a parent node will be created to encapsulate the HTML in a single element. In this case, leading or trailing text is allowed. If `guess_charset` is true, the `chardet` library will perform charset guessing on the string. """ if not isinstance(html, _strings): raise TypeError('string required') accept_leading_text = bool(create_parent) elements = fragments_fromstring( html, guess_charset=guess_charset, parser=parser, no_leading_text=not accept_leading_text) if create_parent: if not isinstance(create_parent, _strings): create_parent = 'div' new_root = Element(create_parent) if elements: if isinstance(elements[0], _strings): new_root.text = elements[0] del elements[0] new_root.extend(elements) return new_root if not elements: raise etree.ParserError('No elements found') if len(elements) > 1: raise etree.ParserError('Multiple elements found') result = elements[0] if result.tail and result.tail.strip(): raise etree.ParserError('Element followed by text: %r' % result.tail) result.tail = None return result
Example #14
Source File: html5parser.py From lambda-text-extractor with Apache License 2.0 | 4 votes |
def fragment_fromstring(html, create_parent=False, guess_charset=None, parser=None): """Parses a single HTML element; it is an error if there is more than one element, or if anything but whitespace precedes or follows the element. If 'create_parent' is true (or is a tag name) then a parent node will be created to encapsulate the HTML in a single element. In this case, leading or trailing text is allowed. If `guess_charset` is true, the `chardet` library will perform charset guessing on the string. """ if not isinstance(html, _strings): raise TypeError('string required') accept_leading_text = bool(create_parent) elements = fragments_fromstring( html, guess_charset=guess_charset, parser=parser, no_leading_text=not accept_leading_text) if create_parent: if not isinstance(create_parent, _strings): create_parent = 'div' new_root = Element(create_parent) if elements: if isinstance(elements[0], _strings): new_root.text = elements[0] del elements[0] new_root.extend(elements) return new_root if not elements: raise etree.ParserError('No elements found') if len(elements) > 1: raise etree.ParserError('Multiple elements found') result = elements[0] if result.tail and result.tail.strip(): raise etree.ParserError('Element followed by text: %r' % result.tail) result.tail = None return result