Python lxml.html.tostring() Examples
The following are 30 code examples of lxml.html.tostring(), drawn from open-source projects. You can go to the original project or source file by following the link above each example. You may also want to check out all available functions and classes of the lxml.html module.
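As a quick orientation before the project examples, here is a minimal standalone sketch of the basic call (the sample markup is invented for illustration): tostring() returns bytes by default, returns a str when encoding='unicode' is passed, and pretty_print=True adds indentation and newlines.

from lxml import html

# Parse a fragment and serialize it back to markup.
doc = html.fromstring('<div><p>Hello <b>world</b></p></div>')

# Default output is bytes.
as_bytes = html.tostring(doc)
# Pass encoding='unicode' to get a str instead.
as_text = html.tostring(doc, encoding='unicode')
# pretty_print=True re-indents the serialized markup.
print(html.tostring(doc, pretty_print=True, encoding='unicode'))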
Example #1
Source File: parsers.py From crestify with BSD 3-Clause "New" or "Revised" License
def __init__(self, file_name, user_id):
    with open(file_name, 'r') as self.opened_file:
        # So Instapaper doesn't close <li> tags
        # This was causing infinite recursion when using BS directly
        # Hence why the stuff below is being done, so that the <li> tags get closed
        self.html = html.document_fromstring(self.opened_file.read())
        self.html = html.tostring(self.html)
    self.soup = BeautifulSoup4(self.html)
    self.user = user_id
    self.urls = dict()
    self.check_duplicates = dict()
    self.check_duplicates_query = Bookmark.query.filter(Bookmark.user == self.user,
                                                        Bookmark.deleted == False).all()
    for bmark in self.check_duplicates_query:
        self.check_duplicates[bmark.main_url] = bmark
    self.tags_dict = dict()
    self.tags_set = set()
    self.valid_url = re.compile(
        r'^(?:[a-z0-9\.\-]*)://'
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
        r'(?::\d+)?'
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
Example #2
Source File: messaging.py From okcupyd with MIT License
def content(self):
    """
    :returns: The text body of the message.
    """
    # The code that follows is obviously pretty disgusting.
    # It seems like it might be impossible to completely replicate
    # the text of the original message if it has trailing whitespace
    message = self._content_xpb.one_(self._message_element)
    first_line = message.text
    if message.text[:2] == '  ':
        first_line = message.text[2:]
    else:
        log.debug("message did not have expected leading whitespace")
    subsequent_lines = ''.join([
        html.tostring(child, encoding='unicode').replace('<br>', '\n')
        for child in message.iterchildren()
    ])
    message_text = first_line + subsequent_lines
    if len(message_text) > 0 and message_text[-1] == ' ':
        message_text = message_text[:-1]
    else:
        log.debug("message did not have expected trailing whitespace")
    return message_text
Example #3
Source File: views.py From openprescribing with MIT License
def gdoc_view(request, doc_id):
    try:
        gdoc_id = settings.GDOC_DOCS[doc_id]
    except KeyError:
        raise Http404("No doc named %s" % doc_id)
    url = "https://docs.google.com/document/d/%s/pub?embedded=true" % gdoc_id
    page = requests.get(url)
    tree = html.fromstring(page.text)
    content = (
        "<style>"
        + "".join(
            [
                html.tostring(child).decode("utf8")
                for child in tree.head.xpath("//style")
            ]
        )
        + "</style>"
    )
    content += "".join([html.tostring(child).decode("utf8") for child in tree.body])
    context = {"content": content}
    return render(request, "gdoc.html", context)
Example #4
Source File: server.py From autologin with Apache License 2.0
def download_page(url, cookie_jar):
    """ Request page using authenticated cookies (cookiejar).
    Download html source and save in browser directory,
    to be used in show_in_browser().
    """
    browser_dir = os.path.join(server_path, 'static/browser')
    delete_directory_files(browser_dir)
    filename = '{}.html'.format(uuid.uuid4())
    filepath = os.path.join(browser_dir, filename)
    try:
        response = cookie_request(url, cookie_jar)
    except requests.RequestException as e:
        return e, None
    doc = html.document_fromstring(response.text)
    with open(filepath, 'wb') as f:
        f.write(html.tostring(doc))
    return None, filename
Example #5
Source File: clean.py From memorious with MIT License
def clean_html(context, data):
    """Clean an HTML DOM and store the changed version."""
    doc = _get_html_document(context, data)
    if doc is None:
        context.emit(data=data)
        return
    remove_paths = context.params.get('remove_paths')
    for path in ensure_list(remove_paths):
        for el in doc.xpath(path):
            el.drop_tree()
    html_text = html.tostring(doc, pretty_print=True)
    content_hash = context.store_data(html_text)
    data['content_hash'] = content_hash
    context.emit(data=data)
Example #6
Source File: html.py From mailur with GNU General Public License v3.0
def from_text(txt):
    def replace(match):
        txt = match.group()
        if '\n' in txt:
            return '<br>' * txt.count('\n')
        else:
            return '&nbsp;' * txt.count(' ')

    tpl = '<p>%s</p>'
    htm = escape(txt)
    htm = fromstring(tpl % htm)
    fix_links(htm)
    htm = tostring(htm, encoding='unicode')
    htm = htm[3:-4]
    htm = re.sub('(?m)((\r?\n)+| [ ]+|^ )', replace, htm)
    htm = tpl % htm
    return htm
Example #7
Source File: xml.py From ingestors with MIT License
def ingest(self, file_path):
    """Ingestor implementation."""
    file_size = self.result.size or os.path.getsize(file_path)
    if file_size > self.MAX_SIZE:
        raise ProcessingException("XML file is too large.")
    try:
        doc = etree.parse(file_path)
    except (ParserError, ParseError):
        raise ProcessingException("XML could not be parsed.")
    text = self.extract_html_text(doc.getroot())
    transform = etree.XSLT(self.XSLT)
    html_doc = transform(doc)
    html_body = html.tostring(html_doc, encoding=str, pretty_print=True)
    self.result.flag(self.result.FLAG_HTML)
    self.result.emit_html_body(html_body, text)
Example #8
Source File: external.py From trafilatura with GNU General Public License v3.0
def try_justext(tree, url, target_language):
    '''Second safety net: try with the generic algorithm justext'''
    result_body = etree.Element('body')
    justtextstring = html.tostring(tree, pretty_print=False, encoding='utf-8')
    # determine language
    if target_language is not None and target_language in JUSTEXT_LANGUAGES:
        langsetting = JUSTEXT_LANGUAGES[target_language]
        justext_stoplist = justext.get_stoplist(langsetting)
    else:
        #justext_stoplist = justext.get_stoplist(JUSTEXT_DEFAULT)
        justext_stoplist = JT_STOPLIST
    # extract
    try:
        paragraphs = justext.justext(justtextstring, justext_stoplist, 50, 200, 0.1, 0.2, 0.2, 200, True)
    except ValueError as err:  # not an XML element: HtmlComment
        LOGGER.error('justext %s %s', err, url)
        result_body = None
    else:
        for paragraph in paragraphs:
            if not paragraph.is_boilerplate:
                #if duplicate_test(paragraph) is not True:
                elem = etree.Element('p')
                elem.text = paragraph.text
                result_body.append(elem)
    return result_body
Example #9
Source File: requests_html.py From requests-html with MIT License
def xpath(self, selector: str, *, clean: bool = False, first: bool = False, _encoding: str = None) -> _XPath:
    """Given an XPath selector, returns a list of
    :class:`Element <Element>` objects or a single one.

    :param selector: XPath Selector to use.
    :param clean: Whether or not to sanitize the found HTML of ``<script>`` and ``<style>`` tags.
    :param first: Whether or not to return just the first result.
    :param _encoding: The encoding format.

    If a sub-selector is specified (e.g. ``//a/@href``), a simple
    list of results is returned.

    See W3School's `XPath Examples
    <https://www.w3schools.com/xml/xpath_examples.asp>`_
    for more details.

    If ``first`` is ``True``, only returns the first
    :class:`Element <Element>` found.
    """
    selected = self.lxml.xpath(selector)

    elements = [
        Element(element=selection, url=self.url, default_encoding=_encoding or self.encoding)
        if not isinstance(selection, etree._ElementUnicodeResult) else str(selection)
        for selection in selected
    ]

    # Sanitize the found HTML.
    if clean:
        elements_copy = elements.copy()
        elements = []

        for element in elements_copy:
            element.raw_html = lxml_html_tostring(cleaner.clean_html(element.lxml))
            elements.append(element)

    return _get_first_or_list(elements, first)
Example #10
Source File: models.py From jorvik with GNU General Public License v3.0
def processa_link(self):
    """ Checks the relative links in the e-mail and makes them absolute. """
    doc = html.document_fromstring(self.corpo)
    links = doc.xpath('//a')
    for el in links:
        try:
            url = el.attrib['href']
            if '://' not in url:
                el.attrib['href'] = "https://gaia.cri.it%s" % (url,)
        except KeyError:
            continue
    self.corpo = html.tostring(doc, pretty_print=True).decode('UTF-8')
Example #11
Source File: models.py From jorvik with GNU General Public License v3.0
def corpo_body(self):
    """ Tries to extract the body of the page (body).
    :return:
    """
    if not self.corpo:
        return ""
    doc = html.document_fromstring(self.corpo)
    body = doc.xpath('//body')[0]
    body.tag = 'div'
    #try:
    return html.tostring(body)
    #except:
    #    return self.corpo
    #print html.parse('http://someurl.at.domain').xpath('//body')[0].text_content()
Example #12
Source File: utils.py From jorvik with GNU General Public License v3.0
def get_drive_file(file):
    req = urllib.request.Request("https://docs.google.com/feeds/download/documents/export/Export?id=%s&exportFormat=html" % (file,))
    str = urllib.request.urlopen(req).read().decode('UTF-8')
    doc = html.document_fromstring(str)
    head = doc.xpath('//head')[0]
    head.tag = 'div'
    body = doc.xpath('//body')[0]
    body.tag = 'div'
    str = html.tostring(head) + html.tostring(body)
    return str
Example #13
Source File: EntityLinking.py From ClusType with GNU General Public License v3.0
def run(self):
    print "Start DBpediaSpotlight"
    g = open('tmp/temp' + str(self.offset) + '.txt', 'w')
    index = 0
    while 1:
        did = str(index + self.offset)
        if did in self.docList:
            try:
                doc = self.docList[did]
                url = "http://spotlight.sztaki.hu:2222/rest/annotate"
                #url = "http://localhost:2222/rest/annotate"
                data = {"confidence": self.confidence}
                data["support"] = "20"
                data["text"] = doc
                data = urllib.urlencode(data)
                req = urllib2.Request(url)
                req.add_header('Accept', 'application/json')  #text/xml')
                # print did
                page = html.fromstring(urllib2.urlopen(req, data, timeout=100).read())
                docJson = html.tostring(page)[3:-4]
                #print docJson
                validEntities = extractAnnotations(docJson)
                for entity in validEntities:
                    linkToFreebase(entity)
                    if (entity['@URI'] != None):
                        g.write(str(index + self.offset) + '\t' + entity['@surfaceForm'] + '\t' + entity['@URI'] + '\t' + entity['@similarityScore'] + '\t' + entity['@percentageOfSecondRank'] + '\n')
                index += threadNum
            except:
                index += threadNum
                print 'noresult'
        else:
            break
    g.close()
Example #14
Source File: lxml_toolkit_object.py From enaml-web with MIT License
def render(self, method='html', encoding='unicode', **kwargs):
    """ Render the widget tree into a string """
    return tostring(self.widget, method=method, encoding=encoding, **kwargs)
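The method keyword forwarded by render() above selects the serialization style; a small standalone sketch with invented markup (not tied to enaml-web):

from lxml import html

frag = html.fromstring('<div><br><p>text</p></div>')

# HTML serialization leaves void elements such as <br> unclosed.
print(html.tostring(frag, method='html', encoding='unicode'))  # <div><br><p>text</p></div>
# XML serialization produces well-formed XML with self-closing tags.
print(html.tostring(frag, method='xml', encoding='unicode'))   # <div><br/><p>text</p></div>
# Text serialization drops the markup and keeps only the text content.
print(html.tostring(frag, method='text', encoding='unicode'))  # text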
Example #15
Source File: metadata.py From trafilatura with GNU General Public License v3.0
def extract_url(tree, default_url=None):
    '''Extract the URL from the canonical link'''
    # https://www.tutorialrepublic.com/html-reference/html-base-tag.php
    # default url as fallback
    url = default_url
    # try canonical link first
    element = tree.find('.//head//link[@rel="canonical"]')
    if element is not None:
        url = element.attrib['href']
    # try default language link
    else:
        for element in tree.xpath('//head//link[@rel="alternate"]'):
            if 'hreflang' in element.attrib and element.attrib['hreflang'] is not None and element.attrib['hreflang'] == 'x-default':
                LOGGER.debug(html.tostring(element, pretty_print=False, encoding='unicode').strip())
                url = element.attrib['href']
    # add domain name if it's missing
    if url is not None and url.startswith('/'):
        for element in tree.xpath('//head//meta[@content]'):
            if 'name' in element.attrib:
                attrtype = element.attrib['name']
            elif 'property' in element.attrib:
                attrtype = element.attrib['property']
            else:
                continue
            if attrtype.startswith('og:') or attrtype.startswith('twitter:'):
                domain_match = re.match(r'https?://[^/]+', element.attrib['content'])
                if domain_match:
                    # prepend URL
                    url = domain_match.group(0) + url
                    break
    return url
Example #16
Source File: core.py From trafilatura with GNU General Public License v3.0
def determine_returnstring(docmeta, postbody, commentsbody, output_format, tei_validation, record_id):
    '''Convert XML tree to chosen format, clean the result and output it as a string'''
    # XML (TEI) steps
    if 'xml' in output_format:
        # last cleaning
        for element in postbody.iter():
            if len(element) == 0 and not element.text and not element.tail:
                parent = element.getparent()
                if parent is not None:
                    parent.remove(element)
        # build output trees
        if output_format == 'xml':
            output = build_xml_output(postbody, commentsbody)
            output = add_xml_meta(output, docmeta)
        elif output_format == 'xmltei':
            output = build_tei_output(postbody, commentsbody, docmeta)
        # can be improved
        control_string = etree.tostring(output, encoding='unicode')
        control_string = sanitize(control_string)
        # necessary for cleaning
        control_parser = etree.XMLParser(remove_blank_text=True)
        output_tree = etree.fromstring(control_string, control_parser)
        # validate
        if output_format == 'xmltei' and tei_validation is True:
            result = validate_tei(output_tree)
            LOGGER.info('TEI validation result: %s %s %s', result, record_id, docmeta.url)
        # output as string
        returnstring = etree.tostring(output_tree, pretty_print=True, encoding='unicode').strip()
    # CSV + TXT output
    else:
        if output_format == 'csv':
            posttext = xmltotxt(postbody)
            if commentsbody is not None:
                commentstext = xmltotxt(commentsbody)
            else:
                commentstext = ''
            returnstring = txttocsv(posttext, commentstext, docmeta)
        else:
            output = build_xml_output(postbody, commentsbody)
            returnstring = xmltotxt(output)
    return returnstring
Example #17
Source File: course_info_model.py From ANALYSE with GNU Affero General Public License v3.0
def _course_info_content(html_parsed):
    """
    Constructs the HTML for the course info update, not including the header.
    """
    if len(html_parsed) == 1:
        # could enforce that update[0].tag == 'h2'
        content = html_parsed[0].tail
    else:
        content = html_parsed[0].tail if html_parsed[0].tail is not None else ""
        content += "\n".join([html.tostring(ele) for ele in html_parsed[1:]])
    return content
Example #18
Source File: html_to_telegraph.py From html-telegraph-poster with MIT License
def convert_html_to_telegraph_format(html_string, clean_html=True, output_format="json_string"):
    if clean_html:
        html_string = clean_article_html(html_string)

        body = preprocess_fragments(
            _fragments_from_string(html_string)
        )
        if body is not None:
            desc = [x for x in body.iterdescendants()]
            for tag in desc:
                preprocess_media_tags(tag)
            move_to_top(body)
            post_process(body)
    else:
        fragments = _fragments_from_string(html_string)
        body = fragments[0].getparent() if len(fragments) else None

    content = []
    if body is not None:
        content = [_recursive_convert(x) for x in body.iterchildren()]

    if output_format == 'json_string':
        return json.dumps(content, ensure_ascii=False)
    elif output_format == 'python_list':
        return content
    elif output_format == 'html_string':
        return html.tostring(body, encoding='unicode')
Example #19
Source File: html_to_telegraph.py From html-telegraph-poster with MIT License
def convert_json_to_html(elements):
    content = html.fragment_fromstring('<div></div>')
    for element in elements:
        content.append(_recursive_convert_json(element))
    content.make_links_absolute(base_url=base_url)
    for x in content.xpath('.//span'):
        x.drop_tag()
    html_string = html.tostring(content, encoding='unicode')
    html_string = replace_line_breaks_except_pre(html_string, '<br/>')
    html_string = html_string[5:-6]
    return html_string
Example #20
Source File: hearth.py From hearthstats with GNU General Public License v2.0
def get_deck_list(deckid):
    """ For a given HearthPwn deck ID, return a list of Cards that
    belong to that deck.

    Parameters:
    - 'deckid' - a HearthPwn deck ID
    """
    # http://www.hearthpwn.com/decks/listing/ + deckid + /neutral or /class
    url = 'http://www.hearthpwn.com/decks/listing/'
    css = '#cards > tbody > tr > td.col-name'

    deck = []

    # Class Cards
    htmlelement = get_htmlelement_from_url(url + str(deckid) + '/class')
    cardelements = htmlelement.cssselect(css)

    # Neutral Cards
    htmlelement = get_htmlelement_from_url(url + str(deckid) + '/neutral')
    cardelements += htmlelement.cssselect(css)

    regex = re.compile('×\s+(\d+)')

    for element in cardelements:
        # cssselect always returns an array, but in our case the result is
        # always just one element.
        cardname = element.cssselect('a')[0].text.strip()
        elementtext = html.tostring(element).decode('UTF-8')

        # There's probably a better way to get the amount, but we currently
        # look for the "x #" in the raw text of the element
        match = re.search(regex, elementtext)
        if match:
            amount = int(match.group(1))
        else:
            print('ERROR: Unable to get amount for card ' + cardname)
            # This shouldn't happen, but when it does, just continue on after
            # logging an error.
            amount = 0

        deck.append(Card(cardname, amount))

    return deck
Example #21
Source File: html.py From mailur with GNU General Public License v3.0
def fix_privacy(htm, only_proxy=False):
    if not htm.strip():
        return htm

    use_proxy = conf['USE_PROXY']
    if only_proxy and not use_proxy:
        return htm

    htm = fromstring(htm)
    for img in htm.xpath('//img[@src]'):
        src = img.attrib['src']
        if re.match('^(https?://|//).*', src):
            if src.startswith('//'):
                src = 'https:' + src
            if use_proxy:
                src = '/proxy?url=' + src
            if only_proxy:
                img.attrib['src'] = src
            else:
                img.attrib['data-src'] = src
                del img.attrib['src']

    if not only_proxy:
        # style could contain "background-image", etc.
        for el in htm.xpath('//*[@style]'):
            el.attrib['data-style'] = el.attrib['style']
            del el.attrib['style']

    htm = tostring(htm, encoding='unicode').strip()
    htm = re.sub('(^<div>|</div>$)', '', htm)
    return htm
Example #22
Source File: cleanhtml.py From zing with GNU General Public License v3.0
def url_trim(html):
    """Trims anchor texts that are longer than 70 chars."""
    fragment = fromstring(html)
    for el, attrib_, link_, pos_ in fragment.iterlinks():
        new_link_text = trim_url(el.text_content())
        el.text = new_link_text
    return mark_safe(tostring(fragment, encoding="unicode"))
Example #23
Source File: __init__.py From online-judge with GNU Affero General Public License v3.0
def fragment_tree_to_str(tree):
    return html.tostring(tree, encoding='unicode')[len('<div>'):-len('</div>')]
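This helper assumes the tree was parsed with a <div> wrapper around the fragment; a minimal sketch of that pattern with invented markup (how the online-judge code actually builds the tree is not shown here):

from lxml import html

# Wrap loose markup in a parent element so multiple top-level nodes parse cleanly,
# then strip the wrapper off the serialized result.
tree = html.fragment_fromstring('<p>one</p><p>two</p>', create_parent='div')
out = html.tostring(tree, encoding='unicode')[len('<div>'):-len('</div>')]
print(out)  # <p>one</p><p>two</p>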
Example #24
Source File: lxml_tree.py From online-judge with GNU Affero General Public License v3.0
def __str__(self):
    return mark_safe(html.tostring(self._tree, encoding='unicode'))
Example #25
Source File: test_items.py From ant_nest with GNU Lesser General Public License v3.0
def test_extract_item():
    with open("./tests/test.html", "rb") as f:
        response = httpx.Response(
            200, request=httpx.Request("Get", "https://test.com"), content=f.read()
        )

    class Item:
        pass

    # extract item with xpath and regex
    item_extractor = Extractor(Item)
    item_extractor.add_extractor(
        "paragraph",
        lambda x: html.fromstring(x.text).xpath("/html/body/div/p/text()")[0],
    )
    item_extractor.add_extractor(
        "title", lambda x: re.findall(r"<title>([A-Z a-z]+)</title>", x.text)[0]
    )
    item = item_extractor.extract(response)
    assert item.paragraph == "test"
    assert item.title == "Test html"

    # extract with jpath
    response = httpx.Response(
        200,
        request=httpx.Request("Get", "https://test.com"),
        content=b'{"a": {"b": {"c": 1}}, "d": null}',
    )
    item_extractor = Extractor(Item)
    item_extractor.add_extractor(
        "author", lambda x: jpath.get_all("a.b.c", x.json())[0]
    )
    item_extractor.add_extractor("freedom", lambda x: jpath.get_all("d", x.json())[0])
    item = item_extractor.extract(response)
    assert item.author == 1
    assert item.freedom is None

    # ItemNestExtractor tests
    with open("./tests/test.html", "rb") as f:
        response = httpx.Response(
            200, request=httpx.Request("Get", "https://test.com"), content=f.read()
        )
    item_nest_extractor = NestExtractor(
        Item, lambda x: html.fromstring(x.text).xpath('//div[@id="nest"]/div')
    )
    item_nest_extractor.add_extractor("xpath_key", lambda x: x.xpath("./p/text()")[0])
    item_nest_extractor.add_extractor(
        "regex_key",
        lambda x: re.findall(r"regex(\d+)</", html.tostring(x, encoding="unicode"))[0],
    )
    temp = 1
    for item in item_nest_extractor.extract_items(response):
        assert item.xpath_key == str(temp)
        assert item.regex_key == str(temp)
        temp += 1
Example #26
Source File: core.py From trafilatura with GNU General Public License v3.0
def extract_content(tree, include_tables=False):
    '''Find the main content of a page using a set of XPath expressions,
       then extract relevant elements, strip them of unwanted subparts and
       convert them'''
    sure_thing = False
    result_body = etree.Element('body')
    # iterate
    for expr in BODY_XPATH:
        # select tree if the expression has been found
        subtree = tree.xpath(expr)
        if not subtree:
            continue
        subtree = subtree[0]
        # prune
        subtree = discard_unwanted(subtree)
        # remove elements by link density
        for elem in subtree.iter('list'):
            if link_density_test(elem) is True:
                elem.getparent().remove(elem)
                continue
            elem.attrib.clear()
            #for subelem in elem.iter('item'):
            #    subelem.attrib.clear()
        etree.strip_tags(subtree, 'a', 'link', 'span')
        # define iteration strategy
        potential_tags = set(TAG_CATALOG)  # + 'span'?
        if include_tables is True:
            potential_tags.add('table')
        # no paragraphs containing text
        if not subtree.xpath('//p//text()'):
            potential_tags.add('div')
        LOGGER.debug(sorted(potential_tags))
        # etree.strip_tags(subtree, 'lb')  # BoingBoing-Bug
        # print(html.tostring(subtree, pretty_print=True, encoding='unicode'))
        # extract content
        processed_elems = [handle_textelem(e, potential_tags) for e in subtree.xpath('.//*')]
        # list(filter(None.__ne__, processed_elems))
        result_body.extend([e for e in processed_elems if e is not None])
        # exit the loop if the result has children
        if len(result_body) > 0:
            sure_thing = True
            LOGGER.debug(expr)
            break
    # try parsing wild <p> elements if nothing found or text too short
    temp_text = trim(' '.join(result_body.itertext()))
    len_text = len(temp_text)
    if len(result_body) == 0 or len_text < MIN_EXTRACTED_SIZE:
        result_body = recover_wild_paragraphs(tree, result_body)
        #search_tree = discard_unwanted(tree)
        #search_tree = prune_html(search_tree)
        #result_body, _, _ = baseline(search_tree)
        temp_text = trim(' '.join(result_body.itertext()))
        len_text = len(temp_text)
    # filter output
    etree.strip_elements(result_body, 'done')
    etree.strip_tags(result_body, 'div')
    # return
    return result_body, temp_text, len_text, sure_thing
Example #27
Source File: html.py From mailur with GNU General Public License v3.0
def clean(htm, embeds=None):
    htm = re.sub(r'^\s*<\?xml.*?\?>', '', htm).strip()
    if not htm:
        return '', {}

    htm = htm.replace('\r\n', '\n')
    cleaner = Cleaner(
        links=False,
        style=True,
        inline_style=False,
        kill_tags=['head'],
        remove_tags=['html', 'base'],
        safe_attrs=list(set(Cleaner.safe_attrs) - {'class'}) + ['style'],
    )
    htm = fromstring(htm)
    htm = cleaner.clean_html(htm)

    ext_images = 0
    embeds = embeds or {}
    for img in htm.xpath('//img[@src]'):
        src = img.attrib.get('src')
        cid = re.match('^cid:(.*)', src)
        url = cid and embeds.get('<%s>' % cid.group(1))
        if url:
            img.attrib['src'] = url
        elif re.match('^data:image/.*', src):
            pass
        elif re.match('^(https?://|//).*', src):
            ext_images += 1
        else:
            del img.attrib['src']

    styles = False
    for el in htm.xpath('//*[@style]'):
        styles = True
        break

    fix_links(htm)

    richer = (('styles', styles), ('ext_images', ext_images))
    richer = {k: v for k, v in richer if v}

    htm = tostring(htm, encoding='unicode').strip()
    htm = re.sub('(^<div>|</div>$)', '', htm)
    return htm, richer
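The Cleaner instantiated above is lxml's HTML sanitizer (lxml.html.clean.Cleaner, distributed as the separate lxml_html_clean package in recent lxml releases). A minimal sketch of the general pattern, with invented markup and simpler flags than mailur's configuration:

from lxml import html
from lxml.html.clean import Cleaner  # or: from lxml_html_clean import Cleaner

cleaner = Cleaner(scripts=True, style=True)  # drop <script> and <style> content
doc = html.fromstring('<div><script>alert(1)</script><p>kept</p></div>')
print(html.tostring(cleaner.clean_html(doc), encoding='unicode'))
# <div><p>kept</p></div>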
Example #28
Source File: weibo.py From news_spider with MIT License
def parse_article_detail_js(self, response):
    """
    Parse article details (JS version)
    :param response:
    :return:
    """
    article_detail_body = response.body_as_unicode()
    article_detail_rule = r'<script>FM.view\({"ns":.*?"html":"(.*?)"}\)</script>'
    article_detail_re_parse = re.compile(article_detail_rule, re.S).findall(article_detail_body)
    if not article_detail_re_parse:
        return
    article_detail_html = ''.join(article_detail_re_parse)
    # Handle escape characters
    article_detail_html = article_detail_html.replace('\\r', '')
    article_detail_html = article_detail_html.replace('\\t', '')
    article_detail_html = article_detail_html.replace('\\n', '')
    article_detail_html = article_detail_html.replace('\\"', '"')
    article_detail_html = article_detail_html.replace('\\/', '/')
    article_detail_doc = fromstring(article_detail_html)
    article_title_parse = article_detail_doc.xpath('//h1[@class="title"]/text()')
    article_title = article_title_parse[0].strip() if article_title_parse else ''
    article_pub_time_parse = article_detail_doc.xpath('//span[@class="time"]/text()')
    article_pub_time = self.trans_time(article_pub_time_parse[0].strip()) if article_pub_time_parse else time.strftime('%Y-%m-%d %H:%M:%S')
    article_content_parse = article_detail_doc.xpath('//div[@class="WBA_content"]')
    article_content = tostring(article_content_parse[0], encoding='unicode').strip() if article_content_parse else ''

    fetch_result_item = FetchResultItem()
    fetch_result_item['task_id'] = response.meta['task_id']
    fetch_result_item['platform_id'] = response.meta['platform_id']
    fetch_result_item['platform_name'] = platform_name_map.get(response.meta['platform_id'], '')
    fetch_result_item['channel_id'] = response.meta['channel_id']
    fetch_result_item['channel_name'] = channel_name_map.get(response.meta['channel_id'], '')
    fetch_result_item['article_id'] = response.meta['article_id']
    fetch_result_item['article_title'] = article_title
    fetch_result_item['article_author_id'] = response.meta['follow_id']
    fetch_result_item['article_author_name'] = response.meta['follow_name']
    fetch_result_item['article_pub_time'] = time_local_to_utc(article_pub_time).strftime('%Y-%m-%d %H:%M:%S')
    fetch_result_item['article_url'] = response.url
    fetch_result_item['article_tags'] = ''
    fetch_result_item['article_abstract'] = response.meta['article_abstract']
    fetch_result_item['article_content'] = article_content
    yield fetch_result_item
Example #29
Source File: utils.py From scrape with MIT License
def write_part_file(args, url, raw_html, html=None, part_num=None):
    """Write PART.html file(s) to disk, images in PART_files directory.

    Keyword arguments:
    args -- program arguments (dict)
    raw_html -- unparsed HTML file content (list)
    html -- parsed HTML file content (lxml.html.HtmlElement) (default: None)
    part_num -- PART(#).html file number (int) (default: None)
    """
    if part_num is None:
        part_num = get_num_part_files() + 1
    filename = "PART{0}.html".format(part_num)

    # Decode bytes to string in Python 3 versions
    if not PY2 and isinstance(raw_html, bytes):
        raw_html = raw_html.decode("ascii", "ignore")

    # Convert html to an lh.HtmlElement object for parsing/saving images
    if html is None:
        html = lh.fromstring(raw_html)

    # Parse HTML if XPath entered
    if args["xpath"]:
        raw_html = parse_html(html, args["xpath"])
        if isinstance(raw_html, list):
            if not isinstance(raw_html[0], lh.HtmlElement):
                raise ValueError("XPath should return an HtmlElement object.")
        else:
            if not isinstance(raw_html, lh.HtmlElement):
                raise ValueError("XPath should return an HtmlElement object.")

    # Write HTML and possibly images to disk
    if raw_html:
        if not args["no_images"] and (args["pdf"] or args["html"]):
            raw_html = write_part_images(url, raw_html, html, filename)
        with open(filename, "w") as part:
            if not isinstance(raw_html, list):
                raw_html = [raw_html]
            if isinstance(raw_html[0], lh.HtmlElement):
                for elem in raw_html:
                    part.write(lh.tostring(elem))
            else:
                for line in raw_html:
                    part.write(line)
Example #30
Source File: test_parsers.py From crestify with BSD 3-Clause "New" or "Revised" License
def setUp(self):
    query_user = User.query.filter_by(email='instapaper@example.com').first()
    if query_user:
        query_bookmarks = Bookmark.query.filter_by(user=query_user.id)
        for bmark in query_bookmarks:
            db.session.delete(bmark)
        db.session.commit()
        db.session.delete(query_user)
        db.session.commit()
    create_user = User()
    create_user.first_name = 'Instapaper'
    create_user.last_name = 'Test'
    create_user.email = 'instapaper@example.com'
    create_user.password = 'instapaper_pass'
    create_user.active = True
    create_user.confirmed_at = datetime.datetime.utcnow()
    db.session.add(create_user)
    db.session.commit()
    self.user = create_user
    with open('Instapaper.html') as json_file:
        create_file = open(os.path.join(app.config['CRESTIFY_UPLOAD_DIRECTORY'], 'test_instapaper.html'), 'w+')
        self.data = html.document_fromstring(json_file.read())
        self.data = html.tostring(self.data)
        self.html_data = BeautifulSoup4(self.data)
        self.bookmarks = {}
        for tag in self.html_data.find_all('h1'):
            parent_elem = tag.find_next_sibling('ol')
            links = parent_elem.find_all('a')
            for link in links:
                title = link.text
                url = link['href']
                tags = [tag.text]
                tags.append('Imported')  # Thanks Instapaper for not adding timestamps
                self.bookmarks[url] = {
                    'href': url,
                    'title': title,
                    'tags': tags
                }
        create_file.write(self.data)
        self.file_path = create_file.name
        create_file.close()
    init_parser = InstapaperParser(self.file_path, self.user.id)
    init_parser.process()
    init_parser.add_to_database()
    self.query = Bookmark.query.filter_by(user=self.user.id).all()
    self.html_parser = HTMLParser()