Python html2text.HTML2Text() Examples
The following are 30 code examples of html2text.HTML2Text(), drawn from open-source projects. Each example notes the project and source file it comes from. You may also want to check out all available functions and classes of the html2text module.
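Before the examples, the common pattern is worth spelling out: instantiate html2text.HTML2Text(), set options as attributes, then call handle() on an HTML string to get Markdown-flavoured text. A minimal sketch follows; the option values are illustrative choices (all of them appear in the examples below), not library defaults:

import html2text

h = html2text.HTML2Text()
h.ignore_links = True   # drop hyperlinks from the output
h.ignore_images = True  # drop images as well
h.body_width = 0        # disable hard line wrapping

text = h.handle("<h1>Title</h1><p>Some <b>bold</b> text.</p>")
print(text)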
Example #1
Source File: html2md.py From FengTools with MIT License
def doelse(url):
    headers = {'User-Agent': random.choice(useragents)}
    res = requests.get(url=url, headers=headers)  # fetch the whole HTML page
    h = html2text.HTML2Text()
    h.ignore_links = False
    soup = BeautifulSoup(res.text, 'html5lib')
    title = soup.title.text  # get the title
    html = str(soup.body)
    article = h.handle(html)
    pwd = os.getcwd()  # get the current working directory
    dirpath = pwd + '/Else/'
    if not os.path.exists(dirpath):  # create the directory if it does not exist
        os.makedirs(dirpath)
    # write to file
    write2md(dirpath, title, article)
Example #2
Source File: gapps.py From danforth-east with MIT License
def send_email(to_address, to_name, subject, body_html):
    """Sends an email from the configured address.

    Does not check for address validity.
    """
    if config.ALLOWED_EMAIL_TO_ADDRESSES is not None and \
            to_address not in config.ALLOWED_EMAIL_TO_ADDRESSES:
        # Not allowed to send to this address
        logging.info('send_email: not allowed to send to: %s' % to_address)
        return

    full_to_address = '%s <%s>' % (to_name, to_address)

    h2t = html2text.HTML2Text()
    h2t.body_width = 0
    body_text = h2t.handle(body_html)

    message = mail.EmailMessage(sender=config.MASTER_EMAIL_SEND_ADDRESS,
                                subject=subject,
                                to=full_to_address,
                                body=body_text,
                                html=body_html)
    message.send()
Example #3
Source File: utils.py From OpenCryptoBot with GNU Affero General Public License v3.0
def remove_html_links(text):
    import html2text

    h = html2text.HTML2Text()
    h.ignore_links = True

    start = "<a href="
    end = "</a>"

    while start in text and end in text:
        s_index = text.find(start)
        e_index = text.find(end) + len(end)

        html_link = text[s_index:e_index]
        title = h.handle(html_link).strip()
        text = text.replace(html_link, title)

    return text.strip()
Example #4
Source File: sayurl.py From Fox-V3 with GNU Affero General Public License v3.0
async def sayurl(self, ctx: commands.Context, url):
    """
    Converts a URL to something readable

    Works better on smaller websites
    """
    h = html2text.HTML2Text()
    h.ignore_links = True
    # h.ignore_images = True
    h.images_to_alt = True
    h.escape_snob = True
    h.skip_internal_links = True
    h.ignore_tables = True
    h.single_line_break = True
    h.mark_code = True
    h.wrap_links = True
    h.ul_item_mark = "-"

    async with aiohttp.ClientSession() as session:
        site = await fetch_url(session, url)

    for page in pagify(h.handle(site)):
        await ctx.send(page)
Example #5
Source File: xzl.py From xzl with MIT License
def get_xs_detail(href, title, path):
    url = xzl + href
    print('Start fetching details for ' + title + ', chapter URL: ' + url + '\n')
    text_maker = ht.HTML2Text()
    response = close_session().get(url=url, headers=headers)
    selector = Selector(text=response.text)
    html = selector.css(u'.cata-book-content').extract_first()
    file_name = title
    if markdown:
        md = text_maker.handle(html)
        with open(path + file_name + '.md', 'w') as f:
            f.write(md)
    else:
        if not xs_pdf:
            # declare the encoding in the HTML, otherwise Chinese text is garbled
            html = "<html><head><meta charset='utf-8'></head> " + html + "</html>"
            pdfkit.from_string(html, path + file_name + '.pdf')
        else:
            return html

# fetch the column list
Example #6
Source File: xzl.py From xzl with MIT License
def get_zl_detail(url, path, name):
    response = close_session().get(url=url, headers=headers)
    selector = Selector(text=response.text)
    text_maker = ht.HTML2Text()
    create_time = selector.css(u'.time abbr::attr(title)').extract_first()
    html = selector.css(u'.xzl-topic-body-content').extract_first()
    file_name = name
    if hasTime:
        file_name = create_time + ' ' + name
    if markdown:
        md = text_maker.handle(html)
        with open(path + file_name + '.md', 'w') as f:
            f.write(md)
    else:
        # declare the encoding in the HTML, otherwise Chinese text is garbled
        html = "<html><head><meta charset='utf-8'></head> " + html + "</html>"
        pdfkit.from_string(html, path + file_name + '.pdf')

# close redundant connections
Example #7
Source File: sql_email.py From modoboa-amavis with MIT License
def body(self):
    if self._body is None:
        super(SQLemail, self).body
        self._body = fix_utf8_encoding(self._body)
        # if there's no plain text version available attempt to make one by
        # sanitising the html version. The output isn't always pretty but it
        # is readable, better than a blank screen and helps the user decide
        # if the message is spam or ham.
        if self.dformat == "plain" and not self.contents["plain"] \
                and self.contents["html"]:
            h = HTML2Text()
            h.ignore_tables = True
            h.images_to_alt = True
            mail_text = h.handle(self.contents["html"])
            self.contents["plain"] = self._post_process_plain(
                smart_text(mail_text))
            self._body = self.viewmail_plain()
            self._body = fix_utf8_encoding(self._body)
    return self._body
Example #8
Source File: html2text.py From lexpredict-contraxsuite with GNU Affero General Public License v3.0
def export_html_to_text_html2text(input_buffer, encoding="utf-8"):
    """
    Export HTML to text via html2text.
    :param input_buffer: input HTML buffer
    :param encoding: default encoding
    :return:
    """
    # Ensure we have a decoded string
    if isinstance(input_buffer, bytes):
        input_buffer = input_buffer.decode(encoding)

    # Process and return
    parser = html2text.HTML2Text()
    parser.ignore_emphasis = True
    parser.ignore_links = True
    parser.ignore_images = True
    html_buffer = html.unescape(parser.handle(input_buffer))

    return html_buffer
Example #9
Source File: reader.py From reader with MIT License
def main(result, body_width):
    """Convert Mercury parse result dict to Markdown and plain-text

    result: a mercury-parser result (as a Python dict)
    """
    text = HTML2Text()
    text.body_width = body_width
    text.ignore_emphasis = True
    text.ignore_images = True
    text.ignore_links = True
    text.convert_charrefs = True

    markdown = HTML2Text()
    markdown.body_width = body_width
    markdown.convert_charrefs = True

    result['content'] = {
        'html': result['content'],
        'markdown': unescape(markdown.handle(result['content'])),
        'text': unescape(text.handle(result['content']))
    }
    return result
Example #10
Source File: html2md.py From FengTools with MIT License
def write2md(dirpath, title, article):
    # create the converter
    h2md = html2text.HTML2Text()
    h2md.ignore_links = False
    # convert the document
    article = h2md.handle(article)
    # write to file
    if not os.path.exists(dirpath):  # create the directory if it does not exist
        os.makedirs(dirpath)
    # create the md file
    with open(dirpath + title + '.md', 'w', encoding="utf8") as f:
        lines = article.splitlines()
        for line in lines:
            if line.endswith('-'):
                f.write(line)
            else:
                f.write(line + "\n")
    print(title + " download finished....")
Example #11
Source File: MdownloadRecipes.py From extract_recipe with Apache License 2.0
def get_url_markdown(baseurl, start, increment):
    try:
        '''
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0')]
        try:
            j = opener.open(baseurl)
        except:
            return None
        data = j.read()
        '''
        urlHandler = urllib2.urlopen(baseurl)
        data = urlHandler.read()
        '''
        os.system('wget -O temp' + str(start)+"_"+str(increment) + ' ' + baseurl)
        data = open('temp' + str(start)+"_"+str(increment),'rU').read()
        '''
        h = html2text.HTML2Text()
        h.ignore_links = True
        h.ignore_images = True
        h.body_width = 10000
        data = h.handle(unidecode(unicode(data, errors='ignore')))
        return unidecode(data)
    except:
        return None
Example #12
Source File: downloadRecipes.py From extract_recipe with Apache License 2.0
def get_url_markdown(baseurl, start, increment):
    '''
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0')]
    try:
        j = opener.open(baseurl)
    except:
        return None
    data = j.read()
    '''
    urlHandler = urllib2.urlopen(baseurl)
    data = urlHandler.read()
    '''
    os.system('wget -O temp' + str(start)+"_"+str(increment) + ' ' + baseurl)
    data = open('temp' + str(start)+"_"+str(increment),'rU').read()
    '''
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True
    h.body_width = 10000
    data = h.handle(unidecode(unicode(data, errors='ignore')))
    return unidecode(data)
Example #13
Source File: doc.py From Greynir with GNU General Public License v3.0
def extract_text(self):
    html = self.data.decode(DEFAULT_TEXT_ENCODING)
    h = html2text.HTML2Text()

    # See https://github.com/Alir3z4/html2text/blob/master/html2text/cli.py
    h.ignore_links = True
    h.ignore_emphasis = True
    h.ignore_images = True
    h.unicode_snob = True
    h.ignore_tables = True
    h.decode_errors = "ignore"
    h.body_width = 0

    text = h.handle(html)

    return self.remove_header_prefixes(text)
Example #14
Source File: feed.py From reader with MIT License
def get_article(article_id, links=False, url=URL):
    # type: (str, bool, str) -> str
    """Get article from feed with the given ID"""
    articles = _feed(url).entries
    try:
        article = articles[int(article_id)]
    except (IndexError, ValueError):
        max_id = len(articles) - 1
        msg = "Unknown article ID, use ID from 0 to {}".format(max_id)
        raise SystemExit("Error: {}".format(msg))

    # Get article as HTML
    try:
        html = article.content[0].value
    except AttributeError:
        html = article.summary

    # Convert HTML to plain text
    to_text = html2text.HTML2Text()
    to_text.ignore_links = not links
    text = to_text.handle(html)

    return u"# {}\n\n{}".format(article.title, text)
Example #15
Source File: rssfeed.py From Rss-Atom-Feed-Integration-for-Mattermost with MIT License
def jointext(self):
    text = ''
    h = html2text.HTML2Text()
    h.ignore_links = True
    self.Description = h.handle(self.Description)
    if self.ShowName == True:
        text += "_" + self.Name + '_\n'
    if self.ShowTitle == True:
        text += '### [' + self.NewTitle + '](' + urllib.quote(self.ArticleUrl, safe=';/?:@&=+$,') + ')\n'
    if self.ShowDescription == True:
        text += self.Description + '\n'
    if self.ShowUrl == True:
        text += self.ArticleUrl
    return text
Example #16
Source File: feeds.py From openstax-cms with GNU Affero General Public License v3.0
def item_description(self, item):
    h = html2text.HTML2Text()
    excerpt = h.handle(str(item.body)).split('\n\n')[0]
    return excerpt + "..."
Example #17
Source File: zhihu_answer_wordcloud.py From Python-tools with MIT License
def get_flow():
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True
    h.ignore_emphasis = True
    h.ignore_tables = True
    filename = "{}.txt".format(question_id)
    f = open(filename, 'w', encoding='utf-8')
    for i in range(15):
        limit = "10"
        offset = str(i * 10)
        print("getting the {} answer...".format(offset))
        api_base_url = "https://www.zhihu.com/api/v4/questions/{}/answers?include=data%5B%2A%5D.is_normal%2Ccontent%2Ceditable_content%2Cvoteup_count&limit={}&offset={}&platform=desktop&sort_by=default".format(question_id, limit, offset)
        answer_flow = requests.get(api_base_url, headers=headers)
        if answer_flow.status_code == 200:
            answer_data = answer_flow.json()["data"]
            question.append(answer_data[0]["question"]["title"])
            answer_count = answer_flow.json()["paging"]["totals"]
            if int(answer_count) < int(offset) + 10:
                return 0
            for single_answer in answer_data:
                single_content = single_answer["content"]
                single_vote = single_answer["voteup_count"]
                converted_content = h.handle(single_content)
                f.write(converted_content)
                f.write('\n')
    f.close()
Example #18
Source File: rssfeeds.py From allura with Apache License 2.0
def process_entry(self, e, appid):
    title = e.title
    allura_base.log.info(" ...entry '%s'", title)
    parsed_content = [_f for _f in e.get('content') or [e.get('summary_detail')] if _f]
    if parsed_content:
        content = ''
        for ct in parsed_content:
            if ct.type != 'text/html':
                content += plain2markdown(ct.value)
            else:
                html2md = html2text.HTML2Text(baseurl=e.link)
                html2md.escape_snob = True
                markdown_content = html2md.handle(ct.value)
                content += markdown_content
    else:
        content = plain2markdown(getattr(e, 'summary',
                                         getattr(e, 'subtitle',
                                                 getattr(e, 'title'))))

    content += ' [link](%s)' % e.link
    updated = datetime.utcfromtimestamp(calendar.timegm(e.updated_parsed))

    base_slug = BM.BlogPost.make_base_slug(title, updated)
    b_count = BM.BlogPost.query.find(
        dict(slug=base_slug, app_config_id=appid)).count()
    if b_count == 0:
        post = BM.BlogPost(title=title, text=content, timestamp=updated,
                           app_config_id=appid, state='published')
        post.neighborhood_id = c.project.neighborhood_id
        post.make_slug()
        post.commit()
Example #19
Source File: utils.py From openprescribing with MIT License
def email_as_text(html):
    text_maker = html2text.HTML2Text()
    text_maker.images_to_alt = True
    text_maker.asterisk_emphasis = True
    text_maker.wrap_links = False
    text_maker.pad_tables = True
    text_maker.ignore_images = True
    text = text_maker.handle(html)
    return text
Example #20
Source File: html.py From paper2remarkable with MIT License
def retrieve_pdf(self, pdf_url, filename):
    """Turn the HTML article into a clean pdf file"""
    # Steps
    # 1. Pull the HTML page using requests
    # 2. Extract the article part of the page using readability
    # 3. Convert the article HTML to markdown using html2text
    # 4. Convert the markdown back to HTML (this is done to sanitize HTML)
    # 5. Convert the HTML to PDF, pulling in images where needed
    # 6. Save the PDF to the specified filename.
    request_text = get_page_with_retry(pdf_url, return_text=True)
    doc = readability.Document(request_text)
    title = doc.title()
    raw_html = doc.summary(html_partial=True)

    h2t = html2text.HTML2Text()
    h2t.wrap_links = False
    text = h2t.handle(raw_html)

    # Add the title back to the document
    article = "# {title}\n\n{text}".format(title=title, text=text)

    # Convert to html, fixing relative image urls.
    md = markdown.Markdown()
    md.treeprocessors.register(ImgProcessor(pdf_url), "img", 10)
    html_article = md.convert(article)

    if self.debug:
        with open("./paper.html", "w") as fp:
            fp.write(html_article)

    font_config = weasyprint.fonts.FontConfiguration()
    html = weasyprint.HTML(string=html_article, url_fetcher=url_fetcher)
    css = weasyprint.CSS(string=CSS, font_config=font_config)
    html.write_pdf(filename, stylesheets=[css], font_config=font_config)
Example #21
Source File: speak.py From pythonista-scripts with MIT License
def main():
    speech.stop()
    if not appex.is_running_extension():
        console.hud_alert('Reading clipboard')
        text = clipboard.get()
        url = None
    else:
        text = appex.get_text()
        url = appex.get_url()
    if url == None:
        try:
            url = [mgroups[0] for mgroups in GRUBER_URLINTEXT_PAT.findall(text)][0]
        except:
            pass
    if url != None:
        console.hud_alert('Reading: ' + url)
        h = html2text.HTML2Text()
        try:
            r = requests.get(
                url=url,
                headers={"User-agent": "Mozilla/5.0{0:06}".format(random.randrange(999999))})
        except requests.ConnectionError as e:
            console.alert('Unable to connect to url.')
            return True
        html_content = r.text.decode('utf-8')
        text = html2text.html2text(html_content)
    else:
        console.hud_alert('Reading text: ' + str(text))
    if text:
        speech.say(text)
        stop = console.alert('Done?', hide_cancel_button=True, button1='OK')
        speech.stop()
    else:
        console.hud_alert('No text found.')
Example #22
Source File: url2md.py From pythonista-scripts with MIT License
def main():
    if appex.is_running_extension():
        url = appex.get_url()
        if url == None:
            text = appex.get_text()
            url = [mgroups[0] for mgroups in GRUBER_URLINTEXT_PAT.findall(text)][0]
    else:
        text = clipboard.get().strip()
        url = [mgroups[0] for mgroups in GRUBER_URLINTEXT_PAT.findall(text)][0]
    if not "http" in url:
        url = "http://"
    try:
        url = console.input_alert("URL", "", url)
    except:
        return True
    console.hud_alert('URL: %s' % url)
    h = html2text.HTML2Text()
    try:
        r = requests.get(
            url=url,
            headers={"User-agent": "Mozilla/5.0{0:06}".format(random.randrange(999999))})
    except Exception as e:
        raise (e.message)
        return True
    html_content = r.text.decode('utf-8')
    rendered_content = html2text.html2text(html_content)
    clipboard.set(rendered_content)
    launch_e = console.alert('Markdown copied to clipboard. Launch Evernote?',
                             button1='Yes', button2='No', hide_cancel_button=True)
    if launch_e == 1:
        _eurl = "evernote://x-callback-url/new-note?type=clipboard&title=DRAFT&text="
        app = UIApplication.sharedApplication()
        eurl = nsurl(_eurl)
        app.openURL_(eurl)
    appex.finish()
Example #23
Source File: service.py From service.subtitles.subdivx with GNU General Public License v2.0
def cleanup_subdivx_comment(comment):
    """Convert the subtitle comment HTML to plain text."""
    parser = html2text.HTML2Text()
    parser.unicode_snob = True
    parser.ignore_emphasis = True
    parser.ignore_tables = True
    parser.ignore_links = True
    parser.body_width = 1000
    clean_text = parser.handle(comment)
    # Remove new lines manually
    clean_text = re.sub('\n', ' ', clean_text)
    return clean_text.rstrip(' \t')
Example #24
Source File: analysis.py From BruteforceHTTP with GNU General Public License v3.0
def get_response_diff(first_content, current_content):
    """
    Analyse differences in response data

    :param first_content: string = body of the server's HTML response the first time
    :param current_content: string = current body of the server's HTML response
    :return:
        source_diff: string = new text appearing in the HTML source
        text_diff: string = new text appearing in the HTML view [html2text]
    """
    import html2text
    convert = html2text.HTML2Text()
    text_diff, source_diff = "", ""

    # 2 loops: fix bug lines(source) > lines(text)
    for src_line in current_content.split("\n"):
        source_diff += src_line if src_line not in first_content else ""

    for line in convert.handle(current_content).split("\n"):
        text_diff += line if line not in convert.handle(first_content) else ""

    if sys.version_info[0] == 3:
        return text_diff, source_diff
    else:
        return text_diff.encode('utf-8'), source_diff.encode('utf-8')
Example #25
Source File: linksys_0.py From DLink_Harvester with GNU General Public License v3.0
def dom2text(dom, ignore_images=True, ignore_emphasis=True, ignore_tables=True):
    from lxml import etree
    import html2text
    htt = html2text.HTML2Text()
    htt.body_width = 0
    htt.ignore_images = ignore_images
    htt.ignore_emphasis = ignore_emphasis
    htt.ignore_tables = ignore_tables
    return htt.handle(etree.tostring(dom).decode())
Example #26
Source File: html.py From yeti with Apache License 2.0
def import_html(results, content):
    content = Document(content)
    converter = HTML2Text()
    converter.body_width = 0

    body = content.summary()
    text = BeautifulSoup(body).get_text(" ")

    results.investigation.update(
        name=content.short_title(),
        import_md=converter.handle(body),
        import_text=text)
Example #27
Source File: utils.py From Servo with BSD 2-Clause "Simplified" License
def html_to_text(s, ignore_images=False):
    h = html2text.HTML2Text()
    h.ignore_images = ignore_images
    return h.handle(s)
Example #28
Source File: email_utils.py From karrot-backend with GNU Affero General Public License v3.0
def generate_plaintext_from_html(html):
    # always create a fresh instance, as it keeps state inside it
    # and would otherwise create ever-incrementing link references
    h = html2text.HTML2Text()
    h.ignore_tables = True
    h.inline_links = False
    h.ignore_images = True
    h.wrap_links = False
    return h.handle(html)
Example #29
Source File: context_extractor.py From extract_recipe with Apache License 2.0
def get_url_markdown(baseurl):
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0')]
    j = opener.open(baseurl)
    data = j.read()
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True
    h.body_width = 10000
    return h.handle(data.decode('utf8'))
Example #30
Source File: downloadRecipes.py From extract_recipe with Apache License 2.0
def get_url_markdown(baseurl):
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0')]
    j = opener.open(baseurl)
    data = j.read()
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True
    h.body_width = 10000
    data = h.handle(unidecode(unicode(data, errors='ignore')))
    return unidecode(data)