Python lxml.html.parse() Examples
The following are 30 code examples of lxml.html.parse().
The source file and project for each example are noted above it. You may also want to check out all the other available functions and classes of the lxml.html module.
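
Before the examples, here is a minimal sketch of the typical call pattern. lxml.html.parse() accepts a filename, URL, or open file-like object and returns an ElementTree whose root element supports XPath; the local file name page.html below is a placeholder, not taken from any example.

from lxml import html

# parse() returns an lxml.etree.ElementTree for the whole document.
tree = html.parse("page.html")   # "page.html" is a hypothetical local file
root = tree.getroot()            # the <html> element

# XPath queries work on the tree as well as on the root element.
titles = root.xpath("//title/text()")
print(titles[0] if titles else "no <title> found")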
Example #1
Source File: html.py From recruit with Apache License 2.0
def _parse_tables(self, doc, match, attrs):
    """
    Return all tables from the parsed DOM.

    Parameters
    ----------
    doc : the DOM from which to parse the table element.

    match : str or regular expression
        The text to search for in the DOM tree.

    attrs : dict
        A dictionary of table attributes that can be used to
        disambiguate multiple tables on a page.

    Raises
    ------
    ValueError : `match` does not match any text in the document.

    Returns
    -------
    list of node-like
        HTML <table> elements to be parsed into raw data.
    """
    raise AbstractMethodError(self)
Example #2
Source File: html.py From predictive-maintenance-using-machine-learning with Apache License 2.0
def _parse_tables(self, doc, match, attrs):
    """
    Return all tables from the parsed DOM.

    Parameters
    ----------
    doc : the DOM from which to parse the table element.

    match : str or regular expression
        The text to search for in the DOM tree.

    attrs : dict
        A dictionary of table attributes that can be used to
        disambiguate multiple tables on a page.

    Raises
    ------
    ValueError : `match` does not match any text in the document.

    Returns
    -------
    list of node-like
        HTML <table> elements to be parsed into raw data.
    """
    raise AbstractMethodError(self)
Example #3
Source File: html.py From predictive-maintenance-using-machine-learning with Apache License 2.0
def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
    flavor = _validate_flavor(flavor)
    compiled_match = re.compile(match)  # you can pass a compiled regex here

    # hack around python 3 deleting the exception variable
    retained = None
    for flav in flavor:
        parser = _parser_dispatch(flav)
        p = parser(io, compiled_match, attrs, encoding, displayed_only)

        try:
            tables = p.parse_tables()
        except Exception as caught:
            # if `io` is an io-like object, check if it's seekable
            # and try to rewind it before trying the next parser
            if hasattr(io, 'seekable') and io.seekable():
                io.seek(0)
            elif hasattr(io, 'seekable') and not io.seekable():
                # if we couldn't rewind it, let the user know
                raise ValueError('The flavor {} failed to parse your input. '
                                 'Since you passed a non-rewindable file '
                                 'object, we can\'t rewind it to try '
                                 'another parser. Try read_html() with a '
                                 'different flavor.'.format(flav))

            retained = caught
        else:
            break
    else:
        raise_with_traceback(retained)

    ret = []
    for table in tables:
        try:
            ret.append(_data_to_frame(data=table, **kwargs))
        except EmptyDataError:  # empty table
            continue
    return ret
Example #4
Source File: html.py From recruit with Apache License 2.0
def _build_doc(self):
    """
    Return a tree-like object that can be used to iterate over the DOM.

    Returns
    -------
    node-like
        The DOM from which to parse the table element.
    """
    raise AbstractMethodError(self)
Example #5
Source File: gifbot.py From SnapchatBot with MIT License
def grab_trending_gif_urls():
    doc = parse("http://giphy.com").getroot()
    els = doc.cssselect(".gif-link img")[:10]
    ret = []
    for el in els:
        ret.append("http:" + re.sub(r"\/([^./])*\.gif", "/giphy.gif", el.attrib['src']))
    return ret
Example #6
Source File: core.py From libextract with MIT License
def parse_html(fileobj, encoding):
    """
    Given a file object *fileobj*, get an ElementTree instance.
    The *encoding* is assumed to be utf8.
    """
    parser = HTMLParser(encoding=encoding, remove_blank_text=True)
    return parse(fileobj, parser)
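
A possible invocation of the helper above, assuming its imports (HTMLParser and parse) are in scope and a UTF-8 file exists on disk; the file name is hypothetical:

with open("page.html", "rb") as fileobj:   # hypothetical file
    tree = parse_html(fileobj, "utf-8")
    print(tree.getroot().tag)              # usually "html"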
Example #7
Source File: tutorial.py From wextracto with BSD 3-Clause "New" or "Revised" License
def extract(response):
    tree = parse(response)
    return tree.xpath('//h1/text()')
Example #8
Source File: tutorial.py From wextracto with BSD 3-Clause "New" or "Revised" License
def extract(response):
    tree = parse(response)
    yield "name", text(tree.xpath('//h1'))
    yield "country", text(tree.xpath('//dd[@id="country"]'))
    yield "region", text(tree.xpath('//dd[@id="region"]'))
Example #9
Source File: tutorial.py From wextracto with BSD 3-Clause "New" or "Revised" License
def extract(response):
    tree = parse(response)
    return text(tree.xpath('//h1/text()'))
Example #10
Source File: rvl_cdip.py From unilm with MIT License
def read_hocr_file(self, data_dir, file):
    hocr_file = os.path.join(data_dir, "images", file[:-4] + ".xml")
    text_buffer = []
    bbox_buffer = []
    try:
        doc = html.parse(hocr_file)
    except AssertionError:
        logger.warning(
            "%s is empty or its format is unacceptable. Skipped.", hocr_file
        )
        return [], []
    for page in doc.xpath("//*[@class='ocr_page']"):
        page_bbox = [int(x) for x in get_prop(page, "bbox").split()]
        width, height = page_bbox[2], page_bbox[3]
        for word in doc.xpath("//*[@class='ocrx_word']"):
            textnodes = word.xpath(".//text()")
            s = "".join([text for text in textnodes])
            text = re.sub(r"\s+", " ", s).strip()
            if text:
                text_buffer.append(text)
                bbox = [int(x) for x in get_prop(word, "bbox").split()]
                bbox = [
                    bbox[0] / width,
                    bbox[1] / height,
                    bbox[2] / width,
                    bbox[3] / height,
                ]
                bbox = [int(x * 1000) for x in bbox]
                bbox_buffer.append(bbox)
    return text_buffer, bbox_buffer
Example #11
Source File: html.py From Splunking-Crime with GNU Affero General Public License v3.0
def _parse_tables(self, doc, match, attrs):
    """Return all tables from the parsed DOM.

    Parameters
    ----------
    doc : tree-like
        The DOM from which to parse the table element.

    match : str or regular expression
        The text to search for in the DOM tree.

    attrs : dict
        A dictionary of table attributes that can be used to
        disambiguate multiple tables on a page.

    Raises
    ------
    ValueError
        * If `match` does not match any text in the document.

    Returns
    -------
    tables : list of node-like
        A list of <table> elements to be parsed into raw data.
    """
    raise AbstractMethodError(self)
Example #12
Source File: make.py From facebook-friends-map with MIT License
def index_friends():
    friends = utils.db_read(db_index)
    already_parsed = []
    for i, d in enumerate(friends):
        already_parsed.append(d['id'])
    print('Loading saved friends list...')
    file_path = os.getcwd() + '/' + friends_html
    x = html.parse(file_path).xpath
    base = '(//*[@data-sigil="undoable-action"])'
    num_items = len(x(base))
    if num_items == 0:
        print("\nWasn't able to parse friends index. This probably means that Facebook updated their template. \nPlease raise issue on Github and I will try to update the script. \nOr if you can code, please submit a pull request instead :)\n")
        sys.exit()
    for i in range(1, num_items + 1):
        b = base + '[' + str(i) + ']/'
        info = json.loads(x(b + '/div[3]/div/div/div[3]')[0].get('data-store'))
        stdout.flush()
        stdout.write("\rScanning friend list... (%d / %d)" % (i, num_items))
        if not info['id'] in already_parsed:
            name = x(b + '/div[2]//a')[0].text
            alias = '' if info['is_deactivated'] else x(b + '/div[2]//a')[0].get('href')[1:]
            d = {
                'id': info['id'],
                'name': name,
                'active': 0 if int(info['is_deactivated']) else 1,
                'alias': alias
            }
            utils.db_write(db_index, d)
    print('\n>> Saved friends list (%s) to %s' % (num_items, db_index))

# Download profile pages
Example #13
Source File: test_html.py From elasticintel with GNU General Public License v3.0
def get_lxml_elements(url, element):
    _skip_if_no('lxml')
    from lxml.html import parse
    doc = parse(url)
    return doc.xpath('.//{0}'.format(element))
Example #14
Source File: html.py From elasticintel with GNU General Public License v3.0
def _parse_tables(self, doc, match, attrs):
    """Return all tables from the parsed DOM.

    Parameters
    ----------
    doc : tree-like
        The DOM from which to parse the table element.

    match : str or regular expression
        The text to search for in the DOM tree.

    attrs : dict
        A dictionary of table attributes that can be used to
        disambiguate multiple tables on a page.

    Raises
    ------
    ValueError
        * If `match` does not match any text in the document.

    Returns
    -------
    tables : list of node-like
        A list of <table> elements to be parsed into raw data.
    """
    raise AbstractMethodError(self)
Example #15
Source File: http.py From memorious with MIT License
def html(self):
    if not hasattr(self, '_html'):
        self._html = None
        if self.content_type in NON_HTML:
            return
        if self.raw is None or not len(self.raw):
            return

        try:
            self._html = html.fromstring(self.text)
        except ValueError as ve:
            if 'encoding declaration' in str(ve):
                self._html = html.parse(self.file_path.as_posix())
        except (etree.ParserError, etree.ParseError):
            pass
    return self._html
Example #16
Source File: http.py From memorious with MIT License
def xml(self):
    if not hasattr(self, '_xml'):
        parser = etree.XMLParser(
            ns_clean=True,
            recover=True,
            resolve_entities=False,
            no_network=True
        )
        self._xml = etree.parse(self.file_path.as_posix(), parser=parser)
    return self._xml
Example #17
Source File: http.py From memorious with MIT License
def json(self):
    if not hasattr(self, '_json'):
        if self.file_path is None:
            raise ParseError("Cannot parse failed download.")
        with open(self.file_path, 'r') as fh:
            self._json = json.load(fh)
    return self._json
Example #18
Source File: models.py From jorvik with GNU General Public License v3.0
def corpo_body(self):
    """
    Try to extract the body of the page (body).
    :return:
    """
    if not self.corpo:
        return ""
    doc = html.document_fromstring(self.corpo)
    body = doc.xpath('//body')[0]
    body.tag = 'div'
    #try:
    return html.tostring(body)
    #except:
    #    return self.corpo
    #print html.parse('http://someurl.at.domain').xpath('//body')[0].text_content()
Example #19
Source File: html.py From predictive-maintenance-using-machine-learning with Apache License 2.0
def _build_doc(self):
    """
    Return a tree-like object that can be used to iterate over the DOM.

    Returns
    -------
    node-like
        The DOM from which to parse the table element.
    """
    raise AbstractMethodError(self)
Example #20
Source File: parsers.py From riko with MIT License
def xml2etree(f, xml=True, html5=False):
    if xml:
        element_tree = etree.parse(f)
    elif html5 and html5parser:
        element_tree = html5parser.parse(f)
    elif html5parser:
        element_tree = html.parse(f)
    else:
        # html5lib's parser returns an Element, so we must convert it into an
        # ElementTree
        element_tree = ElementTree(html.parse(f))

    return element_tree
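
A possible call to the dispatcher above, assuming riko's module-level imports (etree, html, html5parser, ElementTree) are available; the file name is hypothetical:

with open("feed.xml", "rb") as f:          # hypothetical file
    element_tree = xml2etree(f, xml=True)  # XML branch: etree.parse
print(element_tree.getroot().tag)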
Example #21
Source File: parsers.py From riko with MIT License
def parse_rss(url=None, **kwargs):
    try:
        f = fetch(decode(url), **kwargs)
    except (ValueError, URLError):
        parsed = rssparser.parse(url)
    else:
        content = f.read() if speedparser else f

        try:
            parsed = rssparser.parse(content)
        finally:
            f.close()

    return parsed
Example #22
Source File: test_html.py From Computable with MIT License
def get_lxml_elements(url, element):
    _skip_if_no('lxml')
    from lxml.html import parse
    doc = parse(url)
    return doc.xpath('.//{0}'.format(element))
Example #23
Source File: html.py From recruit with Apache License 2.0
def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
    flavor = _validate_flavor(flavor)
    compiled_match = re.compile(match)  # you can pass a compiled regex here

    # hack around python 3 deleting the exception variable
    retained = None
    for flav in flavor:
        parser = _parser_dispatch(flav)
        p = parser(io, compiled_match, attrs, encoding, displayed_only)

        try:
            tables = p.parse_tables()
        except Exception as caught:
            # if `io` is an io-like object, check if it's seekable
            # and try to rewind it before trying the next parser
            if hasattr(io, 'seekable') and io.seekable():
                io.seek(0)
            elif hasattr(io, 'seekable') and not io.seekable():
                # if we couldn't rewind it, let the user know
                raise ValueError('The flavor {} failed to parse your input. '
                                 'Since you passed a non-rewindable file '
                                 'object, we can\'t rewind it to try '
                                 'another parser. Try read_html() with a '
                                 'different flavor.'.format(flav))

            retained = caught
        else:
            break
    else:
        raise_with_traceback(retained)

    ret = []
    for table in tables:
        try:
            ret.append(_data_to_frame(data=table, **kwargs))
        except EmptyDataError:  # empty table
            continue
    return ret
Example #24
Source File: html.py From Computable with MIT License
def _parse_tables(self, doc, match, attrs):
    """Return all tables from the parsed DOM.

    Parameters
    ----------
    doc : tree-like
        The DOM from which to parse the table element.

    match : str or regular expression
        The text to search for in the DOM tree.

    attrs : dict
        A dictionary of table attributes that can be used to
        disambiguate multiple tables on a page.

    Raises
    ------
    ValueError
        * If `match` does not match any text in the document.

    Returns
    -------
    tables : list of node-like
        A list of <table> elements to be parsed into raw data.
    """
    raise NotImplementedError
Example #25
Source File: check_https.py From httpswatch with MIT License
def fetch_through_redirects(url):
    tree = None
    while True:
        cont = False
        resp = requests.get(
            url,
            verify=certifi.where(),
            headers={"User-Agent": USER_AGENT},
            timeout=10,
            stream=True,
        )
        try:
            if resp.status_code != 200:
                raise Not200(resp.status_code)
            # Convince urllib3 to decode gzipped pages.
            resp.raw.decode_content = True
            tree = html.parse(resp.raw)
        finally:
            resp.close()
        # Check for sneaky <meta> redirects.
        for meta in META_XPATH(tree):
            m = re.match(r"0;\s*url=['\"](.+?)['\"]", meta.get("content"))
            if m is not None:
                url = m.groups()[0]
                cont = True
                break
        if not cont:
            break
    return resp, tree
Example #26
Source File: html.py From vnpy_crypto with MIT License
def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
    flavor = _validate_flavor(flavor)
    compiled_match = re.compile(match)  # you can pass a compiled regex here

    # hack around python 3 deleting the exception variable
    retained = None
    for flav in flavor:
        parser = _parser_dispatch(flav)
        p = parser(io, compiled_match, attrs, encoding, displayed_only)

        try:
            tables = p.parse_tables()
        except Exception as caught:
            # if `io` is an io-like object, check if it's seekable
            # and try to rewind it before trying the next parser
            if hasattr(io, 'seekable') and io.seekable():
                io.seek(0)
            elif hasattr(io, 'seekable') and not io.seekable():
                # if we couldn't rewind it, let the user know
                raise ValueError('The flavor {} failed to parse your input. '
                                 'Since you passed a non-rewindable file '
                                 'object, we can\'t rewind it to try '
                                 'another parser. Try read_html() with a '
                                 'different flavor.'.format(flav))

            retained = caught
        else:
            break
    else:
        raise_with_traceback(retained)

    ret = []
    for table in tables:
        try:
            ret.append(_data_to_frame(data=table, **kwargs))
        except EmptyDataError:  # empty table
            continue
    return ret
Example #27
Source File: MangaList.py From MangaScrapper with Apache License 2.0
def main():
    with open("mangalist.csv", "w") as f:
        tree = parse("http://www.mangapanda.com/alphabetical")
        manga_name_list = tree.xpath("//ul[@class='series_alpha']/li/a/text()")
        manga_url_list = tree.xpath("//ul[@class='series_alpha']/li/a/@href")
        f.write("\"Manga Name\", URL\n")

        for i in range(len(manga_name_list)):
            f.write("\"{0}\", http://www.mangapanda.com{1}\n".format(manga_name_list[i].replace("\"", ""), manga_url_list[i]))
Example #28
Source File: html.py From vnpy_crypto with MIT License
def _parse_tables(self, doc, match, attrs):
    """Return all tables from the parsed DOM.

    Parameters
    ----------
    doc : tree-like
        The DOM from which to parse the table element.

    match : str or regular expression
        The text to search for in the DOM tree.

    attrs : dict
        A dictionary of table attributes that can be used to
        disambiguate multiple tables on a page.

    Raises
    ------
    ValueError
        * If `match` does not match any text in the document.

    Returns
    -------
    tables : list of node-like
        A list of <table> elements to be parsed into raw data.
    """
    raise com.AbstractMethodError(self)
Example #29
Source File: html.py From recruit with Apache License 2.0
def _build_doc(self):
    """
    Raises
    ------
    ValueError
        * If a URL that lxml cannot parse is passed.

    Exception
        * Any other ``Exception`` thrown. For example, trying to parse a
          URL that is syntactically correct on a machine with no internet
          connection will fail.

    See Also
    --------
    pandas.io.html._HtmlFrameParser._build_doc
    """
    from lxml.html import parse, fromstring, HTMLParser
    from lxml.etree import XMLSyntaxError
    parser = HTMLParser(recover=True, encoding=self.encoding)

    try:
        if _is_url(self.io):
            with urlopen(self.io) as f:
                r = parse(f, parser=parser)
        else:
            # try to parse the input in the simplest way
            r = parse(self.io, parser=parser)
        try:
            r = r.getroot()
        except AttributeError:
            pass
    except (UnicodeDecodeError, IOError) as e:
        # if the input is a blob of html goop
        if not _is_url(self.io):
            r = fromstring(self.io, parser=parser)

            try:
                r = r.getroot()
            except AttributeError:
                pass
        else:
            raise e
    else:
        if not hasattr(r, 'text_content'):
            raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
    return r
Example #30
Source File: html.py From predictive-maintenance-using-machine-learning with Apache License 2.0
def _build_doc(self):
    """
    Raises
    ------
    ValueError
        * If a URL that lxml cannot parse is passed.

    Exception
        * Any other ``Exception`` thrown. For example, trying to parse a
          URL that is syntactically correct on a machine with no internet
          connection will fail.

    See Also
    --------
    pandas.io.html._HtmlFrameParser._build_doc
    """
    from lxml.html import parse, fromstring, HTMLParser
    from lxml.etree import XMLSyntaxError
    parser = HTMLParser(recover=True, encoding=self.encoding)

    try:
        if _is_url(self.io):
            with urlopen(self.io) as f:
                r = parse(f, parser=parser)
        else:
            # try to parse the input in the simplest way
            r = parse(self.io, parser=parser)
        try:
            r = r.getroot()
        except AttributeError:
            pass
    except (UnicodeDecodeError, IOError) as e:
        # if the input is a blob of html goop
        if not _is_url(self.io):
            r = fromstring(self.io, parser=parser)

            try:
                r = r.getroot()
            except AttributeError:
                pass
        else:
            raise e
    else:
        if not hasattr(r, 'text_content'):
            raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
    return r