Python html2text.HTML2Text() Examples

The following are 30 code examples of html2text.HTML2Text(). Each example links to its original project and source file, and together they show the converter's most commonly used configuration options. You may also want to check out all available functions/classes of the module html2text.
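Before the examples, a minimal usage sketch (the HTML string is made up for illustration): create a converter, set options, then call handle() to get Markdown.

import html2text

h = html2text.HTML2Text()
h.ignore_links = True  # drop hyperlinks from the output
h.body_width = 0       # disable hard line wrapping
markdown = h.handle("<h1>Title</h1><p>Some <b>bold</b> text.</p>")
print(markdown)  # "# Title" followed by "Some **bold** text."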
Example #1
Source File: html2md.py    From FengTools with MIT License
def doelse(url):
    headers = {
        'User-Agent': random.choice(useragents)
    }
    res = requests.get(url=url, headers=headers)  # fetch the entire HTML page

    h = html2text.HTML2Text()
    h.ignore_links = False
    soup = BeautifulSoup(res.text, 'html5lib')
    title = soup.title.text  # extract the page title
    html = str(soup.body)
    article = h.handle(html)

    pwd = os.getcwd()  # get the current working directory
    dirpath = pwd + '/Else/'
    if not os.path.exists(dirpath):  # create the directory if it does not exist
        os.makedirs(dirpath)
    ## write the file
    write2md(dirpath, title, article)
Example #2
Source File: gapps.py    From danforth-east with MIT License
def send_email(to_address, to_name, subject, body_html):
    """Sends an email from the configured address.
    Does not check for address validity.
    """

    if config.ALLOWED_EMAIL_TO_ADDRESSES is not None and \
       to_address not in config.ALLOWED_EMAIL_TO_ADDRESSES:
        # Not allowed to send to this address
        logging.info('send_email: not allowed to send to: %s' % to_address)
        return

    full_to_address = '%s <%s>' % (to_name, to_address)

    h2t = html2text.HTML2Text()
    h2t.body_width = 0
    body_text = h2t.handle(body_html)

    message = mail.EmailMessage(sender=config.MASTER_EMAIL_SEND_ADDRESS,
                                subject=subject,
                                to=full_to_address,
                                body=body_text,
                                html=body_html)

    message.send() 
Example #3
Source File: utils.py    From OpenCryptoBot with GNU Affero General Public License v3.0
def remove_html_links(text):
    import html2text

    h = html2text.HTML2Text()
    h.ignore_links = True

    start = "<a href="
    end = "</a>"

    while start in text and end in text:
        s_index = text.find(start)
        e_index = text.find(end) + len(end)

        html_link = text[s_index:e_index]
        title = h.handle(html_link).strip()
        text = text.replace(html_link, title)

    return text.strip() 
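A hypothetical call (the sentence and link are made up) illustrates the intended behaviour: inline anchors collapse to their visible text.

text = remove_html_links('Price is up, see <a href="https://example.com">the chart</a> now')
# -> 'Price is up, see the chart now'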
Example #4
Source File: sayurl.py    From Fox-V3 with GNU Affero General Public License v3.0
def sayurl(self, ctx: commands.Context, url):
        """
        Converts a URL to something readable

        Works better on smaller websites
        """

        h = html2text.HTML2Text()
        h.ignore_links = True
        # h.ignore_images = True
        h.images_to_alt = True

        h.escape_snob = True
        h.skip_internal_links = True
        h.ignore_tables = True
        h.single_line_break = True
        h.mark_code = True
        h.wrap_links = True
        h.ul_item_mark = "-"

        async with aiohttp.ClientSession() as session:
            site = await fetch_url(session, url)

        for page in pagify(h.handle(site)):
            await ctx.send(page) 
Example #5
Source File: xzl.py    From xzl with MIT License
def get_xs_detail(href, title, path):
    url = xzl + href
    print('Start crawling details for ' + title + ', chapter URL: ' + url + '\n')
    text_maker = ht.HTML2Text()
    response = close_session().get(url=url, headers=headers)
    selector = Selector(text=response.text)
    html = selector.css(u'.cata-book-content').extract_first()
    file_name = title
    if markdown:
        md = text_maker.handle(html)
        with open(path + file_name + '.md', 'w') as f:
            f.write(md)
    else:
        if not xs_pdf:
            # add a charset declaration to the html, otherwise Chinese text is garbled
            html = "<html><head><meta charset='utf-8'></head> " + html + "</html>"
            pdfkit.from_string(html, path + file_name + '.pdf')
        else:
            return html


# crawl the column list
Example #6
Source File: xzl.py    From xzl with MIT License
def get_zl_detail(url, path, name):
    response = close_session().get(url=url, headers=headers)
    selector = Selector(text=response.text)
    text_maker = ht.HTML2Text()
    create_time = selector.css(u'.time abbr::attr(title)').extract_first()
    html = selector.css(u'.xzl-topic-body-content').extract_first()
    file_name = name
    if hasTime:
        file_name = create_time+' '+name
    if markdown:
        md = text_maker.handle(html)
        with open(path + file_name + '.md', 'w') as f:
            f.write(md)
    else:
        # add a charset declaration to the html, otherwise Chinese text is garbled
        html = "<html><head><meta charset='utf-8'></head> " + html + "</html>"
        pdfkit.from_string(html, path + file_name + '.pdf')


# close redundant connections
Example #7
Source File: sql_email.py    From modoboa-amavis with MIT License
def body(self):
        if self._body is None:
            super(SQLemail, self).body
            self._body = fix_utf8_encoding(self._body)

        # If no plain-text version is available, attempt to make one by
        # sanitising the HTML version. The output isn't always pretty, but it
        # is readable, better than a blank screen, and helps the user decide
        # whether the message is spam or ham.
        if self.dformat == "plain" and not self.contents["plain"] \
                and self.contents["html"]:
            h = HTML2Text()
            h.ignore_tables = True
            h.images_to_alt = True
            mail_text = h.handle(self.contents["html"])
            self.contents["plain"] = self._post_process_plain(
                smart_text(mail_text))
            self._body = self.viewmail_plain()
            self._body = fix_utf8_encoding(self._body)

        return self._body 
Example #8
Source File: html2text.py    From lexpredict-contraxsuite with GNU Affero General Public License v3.0
def export_html_to_text_html2text(input_buffer, encoding="utf-8"):
    """
    Export HTML to text via html2text.
    :param input_buffer: input HTML buffer
    :param encoding: default encoding
    :return:
    """
    # Ensure we have a decoded string
    if isinstance(input_buffer, bytes):
        input_buffer = input_buffer.decode(encoding)

    # Process and return
    parser = html2text.HTML2Text()
    parser.ignore_emphasis = True
    parser.ignore_links = True
    parser.ignore_images = True
    html_buffer = html.unescape(parser.handle(input_buffer))
    return html_buffer 
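A brief usage sketch (the byte string is illustrative): bytes input is decoded with the given encoding before conversion, and HTML entities are unescaped in the result.

text = export_html_to_text_html2text(b"<p>Tom &amp; Jerry</p>")
# -> 'Tom & Jerry\n\n'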
Example #9
Source File: reader.py    From reader with MIT License
def main(result, body_width):
    """Convert Mercury parse result dict to Markdown and plain-text
    
    result: a mercury-parser result (as a Python dict)
    """
    text = HTML2Text()
    text.body_width = body_width
    text.ignore_emphasis = True
    text.ignore_images = True
    text.ignore_links = True
    text.convert_charrefs = True
    markdown = HTML2Text()
    markdown.body_width = body_width
    markdown.convert_charrefs = True
    result['content'] = {
        'html': result['content'],
        'markdown': unescape(markdown.handle(result['content'])),
        'text': unescape(text.handle(result['content']))
    }
    return result 
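For illustration, a made-up Mercury result dict and the shape of the return value:

result = {'content': '<p>Hello <a href="https://example.com">world</a></p>'}
result = main(result, body_width=0)
# result['content'] is now a dict with the keys 'html', 'markdown' and 'text';
# 'text' has links, images and emphasis stripped, while 'markdown' keeps them.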
Example #10
Source File: html2md.py    From FengTools with MIT License
def write2md(dirpath,title,article):
    ## create the converter
    h2md = html2text.HTML2Text()
    h2md.ignore_links = False
    ## convert the document
    article = h2md.handle(article)
    ## write the result to a file
    if not os.path.exists(dirpath):  # create the directory if it does not exist
        os.makedirs(dirpath)
    # create the md file
    with open(dirpath + title + '.md', 'w', encoding="utf8") as f:
        lines = article.splitlines()
        for line in lines:
            if line.endswith('-'):
                f.write(line)
            else:
                f.write(line+"\n")
    print(title + " finished downloading....")
Example #11
Source File: MdownloadRecipes.py    From extract_recipe with Apache License 2.0
def get_url_markdown(baseurl,start,increment):
  try:
    '''
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0')]
    try:
      j = opener.open(baseurl)
    except:
      return None
    data = j.read()
    '''
    urlHandler = urllib2.urlopen(baseurl)
    data = urlHandler.read()
    '''
    os.system('wget -O temp' + str(start)+"_"+str(increment) + ' ' + baseurl)
    data = open('temp' + str(start)+"_"+str(increment),'rU').read()
    '''
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True
    h.body_width = 10000
    data = h.handle(unidecode(unicode(data,errors='ignore')))
    return unidecode(data)
  except:
    return None 
Example #12
Source File: downloadRecipes.py    From extract_recipe with Apache License 2.0
def get_url_markdown(baseurl,start,increment):
  '''
  opener = urllib2.build_opener()
  opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0')]
  try:
    j = opener.open(baseurl)
  except:
    return None
  data = j.read()
  '''
  urlHandler = urllib2.urlopen(baseurl)
  data = urlHandler.read()
  '''
  os.system('wget -O temp' + str(start)+"_"+str(increment) + ' ' + baseurl)
  data = open('temp' + str(start)+"_"+str(increment),'rU').read()
  '''
  h = html2text.HTML2Text()
  h.ignore_links = True
  h.ignore_images = True
  h.body_width = 10000
  data = h.handle(unidecode(unicode(data,errors='ignore')))
  return unidecode(data) 
Example #13
Source File: doc.py    From Greynir with GNU General Public License v3.0
def extract_text(self):
        html = self.data.decode(DEFAULT_TEXT_ENCODING)

        h = html2text.HTML2Text()
        # See https://github.com/Alir3z4/html2text/blob/master/html2text/cli.py
        h.ignore_links = True
        h.ignore_emphasis = True
        h.ignore_images = True
        h.unicode_snob = True
        h.ignore_tables = True
        h.decode_errors = "ignore"
        h.body_width = 0

        text = h.handle(html)

        return self.remove_header_prefixes(text) 
Example #14
Source File: feed.py    From reader with MIT License
def get_article(article_id, links=False, url=URL):
    # type: (str, bool, str) -> str
    """Get article from feed with the given ID"""
    articles = _feed(url).entries
    try:
        article = articles[int(article_id)]
    except (IndexError, ValueError):
        max_id = len(articles) - 1
        msg = "Unknown article ID, use ID from 0 to {}".format(max_id)
        raise SystemExit("Error: {}".format(msg))

    # Get article as HTML
    try:
        html = article.content[0].value
    except AttributeError:
        html = article.summary

    # Convert HTML to plain text
    to_text = html2text.HTML2Text()
    to_text.ignore_links = not links
    text = to_text.handle(html)

    return u"# {}\n\n{}".format(article.title, text) 
Example #15
Source File: rssfeed.py    From Rss-Atom-Feed-Integration-for-Mattermost with MIT License
def jointext(self):
        text = ''
        h = html2text.HTML2Text()
        h.ignore_links = True
        self.Description = h.handle(self.Description)
        if self.ShowName == True:
            text += "_" + self.Name + '_\n'
        if self.ShowTitle == True:
            text += '### [' + self.NewTitle + '](' + urllib.quote(self.ArticleUrl, safe=';/?:@&=+$,') + ')\n'
        if self.ShowDescription == True:
            text += self.Description + '\n'
        if self.ShowUrl == True:
            text += self.ArticleUrl
        return text 
Example #16
Source File: feeds.py    From openstax-cms with GNU Affero General Public License v3.0
def item_description(self, item):
        h = html2text.HTML2Text()
        excerpt = h.handle(str(item.body)).split('\n\n')[0]
        return excerpt + "..." 
Example #17
Source File: zhihu_answer_wordcloud.py    From Python-tools with MIT License
def get_flow():
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True
    h.ignore_emphasis = True
    h.ignore_tables = True
    filename = "{}.txt".format(question_id)
    f = open(filename,'w',encoding='utf-8')
    for i in range(15):
        limit = "10"
        offset = str(i*10)
        print("getting the {} answer...".format(offset))
        api_base_url = "https://www.zhihu.com/api/v4/questions/{}/answers?include=data%5B%2A%5D.is_normal%2Ccontent%2Ceditable_content%2Cvoteup_count&limit={}&offset={}&platform=desktop&sort_by=default".format(question_id,limit,offset)
        answer_flow = requests.get(api_base_url,headers=headers)
        if answer_flow.status_code == 200:
            answer_data = answer_flow.json()["data"]
            question.append(answer_data[0]["question"]["title"])
            answer_count = answer_flow.json()["paging"]["totals"]
            if int(answer_count) < int(offset) + 10:
                return 0
            for single_answer in answer_data:
                single_content = single_answer["content"]
                single_vote = single_answer["voteup_count"]
                converted_content = h.handle(single_content)
                f.write(converted_content)
                f.write('\n')
    f.close() 
Example #18
Source File: rssfeeds.py    From allura with Apache License 2.0
def process_entry(self, e, appid):
        title = e.title
        allura_base.log.info(" ...entry '%s'", title)
        parsed_content = [_f for _f in e.get('content') or [e.get('summary_detail')] if _f]
        if parsed_content:
            content = ''
            for ct in parsed_content:
                if ct.type != 'text/html':
                    content += plain2markdown(ct.value)
                else:
                    html2md = html2text.HTML2Text(baseurl=e.link)
                    html2md.escape_snob = True
                    markdown_content = html2md.handle(ct.value)
                    content += markdown_content
        else:
            content = plain2markdown(getattr(e, 'summary',
                                             getattr(e, 'subtitle',
                                                     getattr(e, 'title'))))

        content += ' [link](%s)' % e.link
        updated = datetime.utcfromtimestamp(calendar.timegm(e.updated_parsed))

        base_slug = BM.BlogPost.make_base_slug(title, updated)
        b_count = BM.BlogPost.query.find(
            dict(slug=base_slug, app_config_id=appid)).count()
        if b_count == 0:
            post = BM.BlogPost(title=title, text=content, timestamp=updated,
                               app_config_id=appid,
                               state='published')
            post.neighborhood_id = c.project.neighborhood_id
            post.make_slug()
            post.commit() 
Example #19
Source File: utils.py    From openprescribing with MIT License
def email_as_text(html):
    text_maker = html2text.HTML2Text()
    text_maker.images_to_alt = True
    text_maker.asterisk_emphasis = True
    text_maker.wrap_links = False
    text_maker.pad_tables = True
    text_maker.ignore_images = True
    text = text_maker.handle(html)
    return text 
Example #20
Source File: html.py    From paper2remarkable with MIT License
def retrieve_pdf(self, pdf_url, filename):
        """Turn the HTML article in a clean pdf file"""
        # Steps
        # 1. Pull the HTML page using requests
        # 2. Extract the article part of the page using readability
        # 3. Convert the article HTML to markdown using html2text
        # 4. Convert the markdown back to HTML (this is done to sanitize HTML)
        # 5. Convert the HTML to PDF, pulling in images where needed
        # 6. Save the PDF to the specified filename.
        request_text = get_page_with_retry(pdf_url, return_text=True)
        doc = readability.Document(request_text)
        title = doc.title()
        raw_html = doc.summary(html_partial=True)

        h2t = html2text.HTML2Text()
        h2t.wrap_links = False
        text = h2t.handle(raw_html)

        # Add the title back to the document
        article = "# {title}\n\n{text}".format(title=title, text=text)

        # Convert to html, fixing relative image urls.
        md = markdown.Markdown()
        md.treeprocessors.register(ImgProcessor(pdf_url), "img", 10)
        html_article = md.convert(article)

        if self.debug:
            with open("./paper.html", "w") as fp:
                fp.write(html_article)

        font_config = weasyprint.fonts.FontConfiguration()
        html = weasyprint.HTML(string=html_article, url_fetcher=url_fetcher)
        css = weasyprint.CSS(string=CSS, font_config=font_config)

        html.write_pdf(filename, stylesheets=[css], font_config=font_config) 
Example #21
Source File: speak.py    From pythonista-scripts with MIT License
def main():
    speech.stop()
    if not appex.is_running_extension():
        console.hud_alert('Reading clipboard')
        text = clipboard.get()
        url = None
    else:
        text = appex.get_text()
        url = appex.get_url()

    if url is None:
        try:
            url = [ mgroups[0] for mgroups in GRUBER_URLINTEXT_PAT.findall(text) ][0]
        except IndexError:
            pass

    if url is not None:
        console.hud_alert('Reading: ' + url)
        h = html2text.HTML2Text()
        try:
            r = requests.get(
                url=url,
                headers={"User-agent": "Mozilla/5.0{0:06}".format(random.randrange(999999))})
        except requests.ConnectionError as e:
            console.alert('Unable to connect to url.')
            return True
        html_content = r.text  # requests already decodes the response body to text
        text = html2text.html2text(html_content)
    else:
        console.hud_alert('Reading text: ' + str(text))

    if text:
        speech.say(text)
        stop = console.alert('Done?', hide_cancel_button=True, button1='OK')
        speech.stop()
    else:
        console.hud_alert('No text found.') 
Example #22
Source File: url2md.py    From pythonista-scripts with MIT License
def main():
    if appex.is_running_extension():
        url = appex.get_url()
        if url is None:
            text = appex.get_text()
            url = [ mgroups[0] for mgroups in GRUBER_URLINTEXT_PAT.findall(text) ][0]
    else:
        text = clipboard.get().strip()
        url = [ mgroups[0] for mgroups in GRUBER_URLINTEXT_PAT.findall(text) ][0]
        if not "http" in url:
            url = "http://"
        try:
            url = console.input_alert("URL", "", url)
        except:
            return True

    console.hud_alert('URL: %s' % url)

    h = html2text.HTML2Text()
    try:
        r = requests.get(
            url=url,
            headers={"User-agent": "Mozilla/5.0{0:06}".format(random.randrange(999999))}
        )
    except Exception as e:
        console.alert('Unable to connect to url: ' + str(e))
        return True

    html_content = r.text  # requests already decodes the response body to text
    rendered_content = html2text.html2text(html_content)
    clipboard.set(rendered_content)

    launch_e = console.alert('Markdown copied to clipboard. Launch Evernote?', button1='Yes', button2='No', hide_cancel_button=True)
    if launch_e == 1:
        _eurl = "evernote://x-callback-url/new-note?type=clipboard&title=DRAFT&text="
        app = UIApplication.sharedApplication()
        eurl = nsurl(_eurl)
        app.openURL_(eurl)
    appex.finish() 
Example #23
Source File: service.py    From service.subtitles.subdivx with GNU General Public License v2.0
def cleanup_subdivx_comment(comment):
    """Convert the subtitle comment HTML to plain text."""
    parser = html2text.HTML2Text()
    parser.unicode_snob = True
    parser.ignore_emphasis = True
    parser.ignore_tables = True
    parser.ignore_links = True
    parser.body_width = 1000
    clean_text = parser.handle(comment)
    # Remove new lines manually
    clean_text = re.sub('\n', ' ', clean_text)
    return clean_text.rstrip(' \t') 
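A hypothetical comment (made up for illustration) shows the effect:

cleanup_subdivx_comment('<b>Sincroniza</b> con la versi&oacute;n <i>WEB-DL</i>')
# -> 'Sincroniza con la versión WEB-DL'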
Example #24
Source File: analysis.py    From BruteforceHTTP with GNU General Public License v3.0
def get_response_diff(first_content, current_content):
	"""
	Analysis different in response data
	:param first_content: string = body of server html responses in first time
	:param current_content: string = current body of server html response
	:return:
		source_diff: string = New text appears in html source
		text_diff: string = New text appears in html view [html2text]
	"""
	import html2text
	convert = html2text.HTML2Text()

	text_diff, source_diff = "", ""
	
	# two separate passes, since the raw source can contain more lines than the rendered text
	
	for src_line in current_content.split("\n"):
		source_diff += src_line if src_line not in first_content else ""
	
	for line in convert.handle(current_content).split("\n"):
		text_diff += line if line not in convert.handle(first_content) else ""
	
	if sys.version_info[0] == 3:
		return text_diff, source_diff
	else:
		return text_diff.encode('utf-8'), source_diff.encode('utf-8') 
Example #25
Source File: linksys_0.py    From DLink_Harvester with GNU General Public License v3.0
def dom2text(dom, ignore_images=True, ignore_emphasis=True, ignore_tables=True):
    from lxml import etree
    import html2text
    htt = html2text.HTML2Text()
    htt.body_width = 0
    htt.ignore_images = ignore_images
    htt.ignore_emphasis = ignore_emphasis
    htt.ignore_tables = ignore_tables
    return htt.handle(etree.tostring(dom).decode()) 
Example #26
Source File: html.py    From yeti with Apache License 2.0
def import_html(results, content):
    content = Document(content)

    converter = HTML2Text()
    converter.body_width = 0

    body = content.summary()
    text = BeautifulSoup(body).get_text(" ")

    results.investigation.update(
        name=content.short_title(),
        import_md=converter.handle(body),
        import_text=text) 
Example #27
Source File: utils.py    From Servo with BSD 2-Clause "Simplified" License
def html_to_text(s, ignore_images=False):
    h = html2text.HTML2Text()
    h.ignore_images = ignore_images
    return h.handle(s) 
Example #28
Source File: email_utils.py    From karrot-backend with GNU Affero General Public License v3.0
def generate_plaintext_from_html(html):
    # always create an instance as it keeps state inside it
    # and will create ever increment link references otherwise
    h = html2text.HTML2Text()
    h.ignore_tables = True
    h.inline_links = False
    h.ignore_images = True
    h.wrap_links = False
    return h.handle(html) 
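A short sketch of why the comment above matters (assumed output shape, since inline_links = False produces numbered reference-style links):

h = html2text.HTML2Text()
h.inline_links = False
h.handle('<a href="https://a.example">a</a>')  # emits link reference [1]
h.handle('<a href="https://b.example">b</a>')  # the same instance continues at [2]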
Example #29
Source File: context_extractor.py    From extract_recipe with Apache License 2.0
def get_url_markdown(baseurl):
  opener = urllib2.build_opener()
  opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0')]
  j = opener.open(baseurl)
  data = j.read()

  h = html2text.HTML2Text()
  h.ignore_links = True
  h.ignore_images = True
  h.body_width = 10000
  return h.handle(data.decode('utf8')) 
Example #30
Source File: downloadRecipes.py    From extract_recipe with Apache License 2.0
def get_url_markdown(baseurl):
  opener = urllib2.build_opener()
  opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0')]
  j = opener.open(baseurl)
  data = j.read()

  h = html2text.HTML2Text()
  h.ignore_links = True
  h.ignore_images = True
  h.body_width = 10000
  data = h.handle(unidecode(unicode(data,errors='ignore')))
  return unidecode(data)