Python html2text.HTML2Text() Examples

The following are 30 code examples of html2text.HTML2Text(). Each example links to its original project and source file, and together they show the converter's most commonly used configuration options. You may also want to check out all available functions/classes of the module html2text.
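Before the examples, a minimal usage sketch (the HTML string is made up for illustration): create a converter, set options, then call handle() to get Markdown.

import html2text

h = html2text.HTML2Text()
h.ignore_links = True  # drop hyperlinks from the output
h.body_width = 0       # disable hard line wrapping
markdown = h.handle("<h1>Title</h1><p>Some <b>bold</b> text.</p>")
print(markdown)  # "# Title" followed by "Some **bold** text."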
Example #1
Source File: html2md.py    From FengTools with MIT License
def doelse(url):
    headers = {
        'User-Agent': random.choice(useragents)
    }
    res = requests.get(url=url, headers=headers)  # fetch the entire HTML page

    h = html2text.HTML2Text()
    h.ignore_links = False
    soup = BeautifulSoup(res.text, 'html5lib')
    title = soup.title.text  # extract the page title
    html = str(soup.body)
    article = h.handle(html)

    pwd = os.getcwd()  # get the current working directory
    dirpath = pwd + '/Else/'
    if not os.path.exists(dirpath):  # create the directory if it does not exist
        os.makedirs(dirpath)
    ## write the file
    write2md(dirpath, title, article)
Example #2
Source File: gapps.py    From danforth-east with MIT License
def send_email(to_address, to_name, subject, body_html):
    """Sends an email from the configured address.
    Does not check for address validity.
    """

    if config.ALLOWED_EMAIL_TO_ADDRESSES is not None and \
       to_address not in config.ALLOWED_EMAIL_TO_ADDRESSES:
        # Not allowed to send to this address
        logging.info('send_email: not allowed to send to: %s' % to_address)
        return

    full_to_address = '%s <%s>' % (to_name, to_address)

    h2t = html2text.HTML2Text()
    h2t.body_width = 0
    body_text = h2t.handle(body_html)

    message = mail.EmailMessage(sender=config.MASTER_EMAIL_SEND_ADDRESS,
                                subject=subject,
                                to=full_to_address,
                                body=body_text,
                                html=body_html)

    message.send() 
Example #3
Source File: utils.py    From OpenCryptoBot with GNU Affero General Public License v3.0
def remove_html_links(text):
    import html2text

    h = html2text.HTML2Text()
    h.ignore_links = True

    start = "<a href="
    end = "</a>"

    while start in text and end in text:
        s_index = text.find(start)
        e_index = text.find(end) + len(end)

        html_link = text[s_index:e_index]
        title = h.handle(html_link).strip()
        text = text.replace(html_link, title)

    return text.strip() 
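A hypothetical call (the sentence and link are made up) illustrates the intended behaviour: inline anchors collapse to their visible text.

text = remove_html_links('Price is up, see <a href="https://example.com">the chart</a> now')
# -> 'Price is up, see the chart now'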
Example #4
Source File: sayurl.py    From Fox-V3 with GNU Affero General Public License v3.0
def sayurl(self, ctx: commands.Context, url):
        """
        Converts a URL to something readable

        Works better on smaller websites
        """

        h = html2text.HTML2Text()
        h.ignore_links = True
        # h.ignore_images = True
        h.images_to_alt = True

        h.escape_snob = True
        h.skip_internal_links = True
        h.ignore_tables = True
        h.single_line_break = True
        h.mark_code = True
        h.wrap_links = True
        h.ul_item_mark = "-"

        async with aiohttp.ClientSession() as session:
            site = await fetch_url(session, url)

        for page in pagify(h.handle(site)):
            await ctx.send(page) 
Example #5
Source File: xzl.py    From xzl with MIT License
def get_xs_detail(href, title, path):
    url = xzl + href
    print('Start crawling details for ' + title + ', chapter URL: ' + url + '\n')
    text_maker = ht.HTML2Text()
    response = close_session().get(url=url, headers=headers)
    selector = Selector(text=response.text)
    html = selector.css(u'.cata-book-content').extract_first()
    file_name = title
    if markdown:
        md = text_maker.handle(html)
        with open(path + file_name + '.md', 'w') as f:
            f.write(md)
    else:
        if not xs_pdf:
            # add a charset declaration to the html, otherwise Chinese text is garbled
            html = "<html><head><meta charset='utf-8'></head> " + html + "</html>"
            pdfkit.from_string(html, path + file_name + '.pdf')
        else:
            return html


# crawl the column list
Example #6
Source File: xzl.py    From xzl with MIT License
def get_zl_detail(url, path, name):
    response = close_session().get(url=url, headers=headers)
    selector = Selector(text=response.text)
    text_maker = ht.HTML2Text()
    create_time = selector.css(u'.time abbr::attr(title)').extract_first()
    html = selector.css(u'.xzl-topic-body-content').extract_first()
    file_name = name
    if hasTime:
        file_name = create_time+' '+name
    if markdown:
        md = text_maker.handle(html)
        with open(path + file_name + '.md', 'w') as f:
            f.write(md)
    else:
        # add a charset declaration to the html, otherwise Chinese text is garbled
        html = "<html><head><meta charset='utf-8'></head> " + html + "</html>"
        pdfkit.from_string(html, path + file_name + '.pdf')


# close redundant connections
Example #7
Source File: sql_email.py    From modoboa-amavis with MIT License
def body(self):
        if self._body is None:
            super(SQLemail, self).body
            self._body = fix_utf8_encoding(self._body)

        # If no plain-text version is available, attempt to make one by
        # sanitising the HTML version. The output isn't always pretty, but it
        # is readable, better than a blank screen, and helps the user decide
        # whether the message is spam or ham.
        if self.dformat == "plain" and not self.contents["plain"] \
                and self.contents["html"]:
            h = HTML2Text()
            h.ignore_tables = True
            h.images_to_alt = True
            mail_text = h.handle(self.contents["html"])
            self.contents["plain"] = self._post_process_plain(
                smart_text(mail_text))
            self._body = self.viewmail_plain()
            self._body = fix_utf8_encoding(self._body)

        return self._body 
Example #8
Source File: html2text.py    From lexpredict-contraxsuite with GNU Affero General Public License v3.0
def export_html_to_text_html2text(input_buffer, encoding="utf-8"):
    """
    Export HTML to text via html2text.
    :param input_buffer: input HTML buffer
    :param encoding: default encoding
    :return:
    """
    # Ensure we have a decoded string
    if isinstance(input_buffer, bytes):
        input_buffer = input_buffer.decode(encoding)

    # Process and return
    parser = html2text.HTML2Text()
    parser.ignore_emphasis = True
    parser.ignore_links = True
    parser.ignore_images = True
    html_buffer = html.unescape(parser.handle(input_buffer))
    return html_buffer 
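A brief usage sketch (the byte string is illustrative): bytes input is decoded with the given encoding before conversion, and HTML entities are unescaped in the result.

text = export_html_to_text_html2text(b"<p>Tom &amp; Jerry</p>")
# -> 'Tom & Jerry\n\n'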
Example #9
Source File: reader.py    From reader with MIT License
def main(result, body_width):
    """Convert Mercury parse result dict to Markdown and plain-text
    
    result: a mercury-parser result (as a Python dict)
    """
    text = HTML2Text()
    text.body_width = body_width
    text.ignore_emphasis = True
    text.ignore_images = True
    text.ignore_links = True
    text.convert_charrefs = True
    markdown = HTML2Text()
    markdown.body_width = body_width
    markdown.convert_charrefs = True
    result['content'] = {
        'html': result['content'],
        'markdown': unescape(markdown.handle(result['content'])),
        'text': unescape(text.handle(result['content']))
    }
    return result 
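For illustration, a made-up Mercury result dict and the shape of the return value:

result = {'content': '<p>Hello <a href="https://example.com">world</a></p>'}
result = main(result, body_width=0)
# result['content'] is now a dict with the keys 'html', 'markdown' and 'text';
# 'text' has links, images and emphasis stripped, while 'markdown' keeps them.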
Example #10
Source File: html2md.py    From FengTools with MIT License
def write2md(dirpath,title,article):
    ## create the converter
    h2md = html2text.HTML2Text()
    h2md.ignore_links = False
    ## convert the document
    article = h2md.handle(article)
    ## write the result to a file
    if not os.path.exists(dirpath):  # create the directory if it does not exist
        os.makedirs(dirpath)
    # create the md file
    with open(dirpath + title + '.md', 'w', encoding="utf8") as f:
        lines = article.splitlines()
        for line in lines:
            if line.endswith('-'):
                f.write(line)
            else:
                f.write(line+"\n")
    print(title + " finished downloading....")
Example #11
Source File: MdownloadRecipes.py    From extract_recipe with Apache License 2.0
def get_url_markdown(baseurl,start,increment):
  try:
    '''
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0')]
    try:
      j = opener.open(baseurl)
    except:
      return None
    data = j.read()
    '''
    urlHandler = urllib2.urlopen(baseurl)
    data = urlHandler.read()
    '''
    os.system('wget -O temp' + str(start)+"_"+str(increment) + ' ' + baseurl)
    data = open('temp' + str(start)+"_"+str(increment),'rU').read()
    '''
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True
    h.body_width = 10000
    data = h.handle(unidecode(unicode(data,errors='ignore')))
    return unidecode(data)
  except:
    return None 
Example #12
Source File: downloadRecipes.py    From extract_recipe with Apache License 2.0
def get_url_markdown(baseurl,start,increment):
  '''
  opener = urllib2.build_opener()
  opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0')]
  try:
    j = opener.open(baseurl)
  except:
    return None
  data = j.read()
  '''
  urlHandler = urllib2.urlopen(baseurl)
  data = urlHandler.read()
  '''
  os.system('wget -O temp' + str(start)+"_"+str(increment) + ' ' + baseurl)
  data = open('temp' + str(start)+"_"+str(increment),'rU').read()
  '''
  h = html2text.HTML2Text()
  h.ignore_links = True
  h.ignore_images = True
  h.body_width = 10000
  data = h.handle(unidecode(unicode(data,errors='ignore')))
  return unidecode(data) 
Example #13
Source File: doc.py    From Greynir with GNU General Public License v3.0
def extract_text(self):
        html = self.data.decode(DEFAULT_TEXT_ENCODING)

        h = html2text.HTML2Text()
        # See https://github.com/Alir3z4/html2text/blob/master/html2text/cli.py
        h.ignore_links = True
        h.ignore_emphasis = True
        h.ignore_images = True
        h.unicode_snob = True
        h.ignore_tables = True
        h.decode_errors = "ignore"
        h.body_width = 0

        text = h.handle(html)

        return self.remove_header_prefixes(text) 
Example #14
Source File: feed.py    From reader with MIT License
def get_article(article_id, links=False, url=URL):
    # type: (str, bool, str) -> str
    """Get article from feed with the given ID"""
    articles = _feed(url).entries
    try:
        article = articles[int(article_id)]
    except (IndexError, ValueError):
        max_id = len(articles) - 1
        msg = "Unknown article ID, use ID from 0 to {}".format(max_id)
        raise SystemExit("Error: {}".format(msg))

    # Get article as HTML
    try:
        html = article.content[0].value
    except AttributeError:
        html = article.summary

    # Convert HTML to plain text
    to_text = html2text.HTML2Text()
    to_text.ignore_links = not links
    text = to_text.handle(html)

    return u"# {}\n\n{}".format(article.title, text) 
Example #15
Source File: rssfeed.py    From Rss-Atom-Feed-Integration-for-Mattermost with MIT License
def jointext(self):
        text = ''
        h = html2text.HTML2Text()
        h.ignore_links = True
        self.Description = h.handle(self.Description)
        if self.ShowName == True:
            text += "_" + self.Name + '_\n'
        if self.ShowTitle == True:
            text += '### [' + self.NewTitle + '](' + urllib.quote(self.ArticleUrl, safe=';/?:@&=+$,') + ')\n'
        if self.ShowDescription == True:
            text += self.Description + '\n'
        if self.ShowUrl == True:
            text += self.ArticleUrl
        return text 
Example #16
Source File: feeds.py    From openstax-cms with GNU Affero General Public License v3.0
def item_description(self, item):
        h = html2text.HTML2Text()
        excerpt = h.handle(str(item.body)).split('\n\n')[0]
        return excerpt + "..." 
Example #17
Source File: zhihu_answer_wordcloud.py    From Python-tools with MIT License
def get_flow():
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True
    h.ignore_emphasis = True
    h.ignore_tables = True
    filename = "{}.txt".format(question_id)
    f = open(filename,'w',encoding='utf-8')
    for i in range(15):
        limit = "10"
        offset = str(i*10)
        print("getting the {} answer...".format(offset))
        api_base_url = "https://www.zhihu.com/api/v4/questions/{}/answers?include=data%5B%2A%5D.is_normal%2Ccontent%2Ceditable_content%2Cvoteup_count&limit={}&offset={}&platform=desktop&sort_by=default".format(question_id,limit,offset)
        answer_flow = requests.get(api_base_url,headers=headers)
        if answer_flow.status_code == 200:
            answer_data = answer_flow.json()["data"]
            question.append(answer_data[0]["question"]["title"])
            answer_count = answer_flow.json()["paging"]["totals"]
            if int(answer_count) < int(offset) + 10:
                return 0
            for single_answer in answer_data:
                single_content = single_answer["content"]
                single_vote = single_answer["voteup_count"]
                converted_content = h.handle(single_content)
                f.write(converted_content)
                f.write('\n')
    f.close() 
Example #18
Source File: rssfeeds.py    From allura with Apache License 2.0
def process_entry(self, e, appid):
        title = e.title
        allura_base.log.info(" ...entry '%s'", title)
        parsed_content = [_f for _f in e.get('content') or [e.get('summary_detail')] if _f]
        if parsed_content:
            content = ''
            for ct in parsed_content:
                if ct.type != 'text/html':
                    content += plain2markdown(ct.value)
                else:
                    html2md = html2text.HTML2Text(baseurl=e.link)
                    html2md.escape_snob = True
                    markdown_content = html2md.handle(ct.value)
                    content += markdown_content
        else:
            content = plain2markdown(getattr(e, 'summary',
                                             getattr(e, 'subtitle',
                                                     getattr(e, 'title'))))

        content += ' [link](%s)' % e.link
        updated = datetime.utcfromtimestamp(calendar.timegm(e.updated_parsed))

        base_slug = BM.BlogPost.make_base_slug(title, updated)
        b_count = BM.BlogPost.query.find(
            dict(slug=base_slug, app_config_id=appid)).count()
        if b_count == 0:
            post = BM.BlogPost(title=title, text=content, timestamp=updated,
                               app_config_id=appid,
                               state='published')
            post.neighborhood_id = c.project.neighborhood_id
            post.make_slug()
            post.commit() 
Example #19
Source File: utils.py    From openprescribing with MIT License
def email_as_text(html):
    text_maker = html2text.HTML2Text()
    text_maker.images_to_alt = True
    text_maker.asterisk_emphasis = True
    text_maker.wrap_links = False
    text_maker.pad_tables = True
    text_maker.ignore_images = True
    text = text_maker.handle(html)
    return text 
Example #20
Source File: html.py    From paper2remarkable with MIT License
def retrieve_pdf(self, pdf_url, filename):
        """Turn the HTML article in a clean pdf file"""
        # Steps
        # 1. Pull the HTML page using requests
        # 2. Extract the article part of the page using readability
        # 3. Convert the article HTML to markdown using html2text
        # 4. Convert the markdown back to HTML (this is done to sanitize HTML)
        # 5. Convert the HTML to PDF, pulling in images where needed
        # 6. Save the PDF to the specified filename.
        request_text = get_page_with_retry(pdf_url, return_text=True)
        doc = readability.Document(request_text)
        title = doc.title()
        raw_html = doc.summary(html_partial=True)

        h2t = html2text.HTML2Text()
        h2t.wrap_links = False
        text = h2t.handle(raw_html)

        # Add the title back to the document
        article = "# {title}\n\n{text}".format(title=title, text=text)

        # Convert to html, fixing relative image urls.
        md = markdown.Markdown()
        md.treeprocessors.register(ImgProcessor(pdf_url), "img", 10)
        html_article = md.convert(article)

        if self.debug:
            with open("./paper.html", "w") as fp:
                fp.write(html_article)

        font_config = weasyprint.fonts.FontConfiguration()
        html = weasyprint.HTML(string=html_article, url_fetcher=url_fetcher)
        css = weasyprint.CSS(string=CSS, font_config=font_config)

        html.write_pdf(filename, stylesheets=[css], font_config=font_config) 
Example #21
Source File: speak.py    From pythonista-scripts with MIT License
def main():
    speech.stop()
    if not appex.is_running_extension():
        console.hud_alert('Reading clipboard')
        text = clipboard.get()
        url = None
    else:
        text = appex.get_text()
        url = appex.get_url()

    if url is None:
        try:
            url = [ mgroups[0] for mgroups in GRUBER_URLINTEXT_PAT.findall(text) ][0]
        except IndexError:
            pass

    if url is not None:
        console.hud_alert('Reading: ' + url)
        h = html2text.HTML2Text()
        try:
            r = requests.get(
                url=url,
                headers={"User-agent": "Mozilla/5.0{0:06}".format(random.randrange(999999))})
        except requests.ConnectionError as e:
            console.alert('Unable to connect to url.')
            return True
        html_content = r.text  # requests already decodes the response body to text
        text = html2text.html2text(html_content)
    else:
        console.hud_alert('Reading text: ' + str(text))

    if text:
        speech.say(text)
        stop = console.alert('Done?', hide_cancel_button=True, button1='OK')
        speech.stop()
    else:
        console.hud_alert('No text found.') 
Example #22
Source File: url2md.py    From pythonista-scripts with MIT License
def main():
    if appex.is_running_extension():
        url = appex.get_url()
        if url is None:
            text = appex.get_text()
            url = [ mgroups[0] for mgroups in GRUBER_URLINTEXT_PAT.findall(text) ][0]
    else:
        text = clipboard.get().strip()
        url = [ mgroups[0] for mgroups in GRUBER_URLINTEXT_PAT.findall(text) ][0]
        if not "http" in url:
            url = "http://"
        try:
            url = console.input_alert("URL", "", url)
        except:
            return True

    console.hud_alert('URL: %s' % url)

    h = html2text.HTML2Text()
    try:
        r = requests.get(
            url=url,
            headers={"User-agent": "Mozilla/5.0{0:06}".format(random.randrange(999999))}
        )
    except Exception as e:
        console.alert('Unable to connect to url: ' + str(e))
        return True

    html_content = r.text  # requests already decodes the response body to text
    rendered_content = html2text.html2text(html_content)
    clipboard.set(rendered_content)

    launch_e = console.alert('Markdown copied to clipboard. Launch Evernote?', button1='Yes', button2='No', hide_cancel_button=True)
    if launch_e == 1:
        _eurl = "evernote://x-callback-url/new-note?type=clipboard&title=DRAFT&text="
        app = UIApplication.sharedApplication()
        eurl = nsurl(_eurl)
        app.openURL_(eurl)
    appex.finish() 
Example #23
Source File: service.py    From service.subtitles.subdivx with GNU General Public License v2.0
def cleanup_subdivx_comment(comment):
    """Convert the subtitle comment HTML to plain text."""
    parser = html2text.HTML2Text()
    parser.unicode_snob = True
    parser.ignore_emphasis = True
    parser.ignore_tables = True
    parser.ignore_links = True
    parser.body_width = 1000
    clean_text = parser.handle(comment)
    # Remove new lines manually
    clean_text = re.sub('\n', ' ', clean_text)
    return clean_text.rstrip(' \t') 
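A hypothetical comment (made up for illustration) shows the effect:

cleanup_subdivx_comment('<b>Sincroniza</b> con la versi&oacute;n <i>WEB-DL</i>')
# -> 'Sincroniza con la versión WEB-DL'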
Example #24
Source File: analysis.py    From BruteforceHTTP with GNU General Public License v3.0
def get_response_diff(first_content, current_content):
	"""
	Analysis different in response data
	:param first_content: string = body of server html responses in first time
	:param current_content: string = current body of server html response
	:return:
		source_diff: string = New text appears in html source
		text_diff: string = New text appears in html view [html2text]
	"""
	import html2text
	convert = html2text.HTML2Text()

	text_diff, source_diff = "", ""
	
	# two separate passes, since the raw source can contain more lines than the rendered text
	
	for src_line in current_content.split("\n"):
		source_diff += src_line if src_line not in first_content else ""
	
	for line in convert.handle(current_content).split("\n"):
		text_diff += line if line not in convert.handle(first_content) else ""
	
	if sys.version_info[0] == 3:
		return text_diff, source_diff
	else:
		return text_diff.encode('utf-8'), source_diff.encode('utf-8') 
Example #25
Source File: linksys_0.py    From DLink_Harvester with GNU General Public License v3.0
def dom2text(dom, ignore_images=True, ignore_emphasis=True, ignore_tables=True):
    from lxml import etree
    import html2text
    htt = html2text.HTML2Text()
    htt.body_width = 0
    htt.ignore_images = ignore_images
    htt.ignore_emphasis = ignore_emphasis
    htt.ignore_tables = ignore_tables
    return htt.handle(etree.tostring(dom).decode()) 
Example #26
Source File: html.py    From yeti with Apache License 2.0
def import_html(results, content):
    content = Document(content)

    converter = HTML2Text()
    converter.body_width = 0

    body = content.summary()
    text = BeautifulSoup(body).get_text(" ")

    results.investigation.update(
        name=content.short_title(),
        import_md=converter.handle(body),
        import_text=text) 
Example #27
Source File: utils.py    From Servo with BSD 2-Clause "Simplified" License
def html_to_text(s, ignore_images=False):
    h = html2text.HTML2Text()
    h.ignore_images = ignore_images
    return h.handle(s) 
Example #28
Source File: email_utils.py    From karrot-backend with GNU Affero General Public License v3.0
def generate_plaintext_from_html(html):
    # always create an instance as it keeps state inside it
    # and will create ever increment link references otherwise
    h = html2text.HTML2Text()
    h.ignore_tables = True
    h.inline_links = False
    h.ignore_images = True
    h.wrap_links = False
    return h.handle(html) 
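A short sketch of why the comment above matters (assumed output shape, since inline_links = False produces numbered reference-style links):

h = html2text.HTML2Text()
h.inline_links = False
h.handle('<a href="https://a.example">a</a>')  # emits link reference [1]
h.handle('<a href="https://b.example">b</a>')  # the same instance continues at [2]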
Example #29
Source File: context_extractor.py    From extract_recipe with Apache License 2.0
def get_url_markdown(baseurl):
  opener = urllib2.build_opener()
  opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0')]
  j = opener.open(baseurl)
  data = j.read()

  h = html2text.HTML2Text()
  h.ignore_links = True
  h.ignore_images = True
  h.body_width = 10000
  return h.handle(data.decode('utf8')) 
Example #30
Source File: downloadRecipes.py    From extract_recipe with Apache License 2.0
def get_url_markdown(baseurl):
  opener = urllib2.build_opener()
  opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0')]
  j = opener.open(baseurl)
  data = j.read()

  h = html2text.HTML2Text()
  h.ignore_links = True
  h.ignore_images = True
  h.body_width = 10000
  data = h.handle(unidecode(unicode(data,errors='ignore')))
  return unidecode(data)