Python Examples of html2text.html2text

Source File: email_service.py From python-for-entrepreneurs-course-demos with MIT License

6 votes

def send_email(to_address, subject, html_body):
        """
        Send an HTML email with a plain-text alternative derived via html2text.

        The message is only handed to the SMTP server when
        ``EmailService.__is_debug_mode`` is falsy; otherwise the send is
        skipped. Any exception raised while building or sending the message
        is printed rather than propagated.
        """
        try:
            server = EmailService.create_smtp_server()
            mail = mailer.Message(From=EmailService.__from_address,
                                  To=to_address,
                                  charset='utf-8')
            mail.Subject = subject
            mail.Html = html_body
            # Plain-text body is rendered from the same HTML.
            mail.Body = html2text.html2text(html_body)

            if EmailService.__is_debug_mode:
                print("Skipping send, email is in dev mode.")
            else:
                print("Sending message (live!)")
                server.send(mail)
        except Exception as err:
            print("Error sending mail: {}".format(err))

Source File: converters.py From allura with Apache License 2.0

6 votes

def mediawiki2markdown(source):
    """
    Convert MediaWiki markup to Markdown text.

    The source is rendered to HTML with ``wiki2html``, passed through
    ``_convert_toc``, converted with ``html2text``, and finally has any
    remaining ``<`` / ``>`` characters replaced by HTML entities.

    Raises ImportError if either optional library cannot be imported.
    """
    try:
        import html2text
        from mediawiki import wiki2html
    except ImportError:
        raise ImportError("""This operation requires GPL libraries:
        "mediawiki" (https://pypi.org/project/mediawiki2html/)
        "html2text" (https://pypi.org/project/html2text/)""")

    # NOTE(review): newer html2text releases appear to read this setting from
    # html2text.config -- confirm this assignment still takes effect there.
    html2text.BODY_WIDTH = 0

    html = _convert_toc(wiki2html(source, True))
    markdown = html2text.html2text(html)
    return markdown.replace('<', '&lt;').replace('>', '&gt;')

Source File: run_whoosh.py From TorCMS with MIT License

6 votes

def do_for_wiki(rand=True, doc_type=''):
    """
    Index wiki records into ``TOR_IDX``.

    :param rand: when truthy, index 10 random records; otherwise the 2 most
        recent ones (both queried with ``kind='1'``).
    :param doc_type: value stored in the ``type`` field of each document.
    """
    records = (MWiki.query_random(num=10, kind='1') if rand
               else MWiki.query_recent(num=2, kind='1'))

    for record in records:
        # Indexed content is the title followed by the plain-text body.
        plain_body = html2text.html2text(tornado.escape.xhtml_unescape(record.cnt_html))
        content = record.title + ',' + plain_body

        # A fresh writer is opened and committed for every record.
        idx_writer = TOR_IDX.writer()
        idx_writer.update_document(title=record.title,
                                   catid='sid1',
                                   type=doc_type,
                                   link='/wiki/{0}'.format(record.title),
                                   content=content)
        idx_writer.commit()

Source File: run_whoosh.py From TorCMS with MIT License

6 votes

def do_for_post(rand=True, doc_type=''):
    """
    Index post records into ``TOR_IDX``.

    :param rand: when truthy, index 10 random records; otherwise the 2 most
        recent ones (both queried with ``kind='1'``).
    :param doc_type: value stored in the ``type`` field of each document.
    """
    records = (MPost.query_random(num=10, kind='1') if rand
               else MPost.query_recent(num=2, kind='1'))

    for record in records:
        # Indexed content is the title followed by the plain-text body.
        plain_body = html2text.html2text(tornado.escape.xhtml_unescape(record.cnt_html))
        content = record.title + ',' + plain_body

        # A fresh writer is opened and committed for every record.
        idx_writer = TOR_IDX.writer()
        idx_writer.update_document(title=record.title,
                                   catid='sid1',
                                   type=doc_type,
                                   link='/post/{0}'.format(record.uid),
                                   content=content)
        idx_writer.commit()

Source File: models.py From colossus with MIT License

6 votes

def send(self, to: str, context: dict = None):
        """
        Render this template and email it to a single subscriber.

        The rendered template is attached as the ``text/html`` alternative,
        and an html2text rendering of it (``bodywidth=2000``) is used as the
        plain-text body.

        :param to: Subscriber email address
        :param context: Extra context to be used during email rendering
        :raises FormTemplateIsNotEmail: if ``self.is_email`` is falsy
        """
        if not self.is_email:
            raise FormTemplateIsNotEmail

        html_message = self.render_template(context)
        text_message = html2text.html2text(html_message, bodywidth=2000)

        outgoing = EmailMultiAlternatives(subject=self.subject,
                                          body=text_message,
                                          from_email=self.get_from_email(),
                                          to=[to])
        outgoing.attach_alternative(html_message, 'text/html')
        outgoing.send()

Source File: filter_handler.py From TorCMS with MIT License

6 votes

def echo_html_list_str(self, catid, infos):
        '''
        Render the HTML for the info list of the given category.

        The template ``autogen/infolist/infolist_<catid>.html`` receives the
        same dict as both ``kwd`` and ``widget_info``, plus the ``html2text``
        name so the template can call it.
        '''
        kwd = {
            'imgname': 'fixed/zhanwei.png',
            'zhiding': '',
            'tuiguang': '',
        }
        template_path = 'autogen/infolist/infolist_{0}.html'.format(catid)

        self.render(template_path,
                    userinfo=self.userinfo,
                    kwd=kwd,
                    html2text=html2text,
                    post_infos=infos,
                    widget_info=kwd)

Source File: email_service.py From cookiecutter-course with GNU General Public License v2.0

6 votes

def send_email(to_address, subject, html_body):
        """
        Send an HTML email with a plain-text alternative derived via html2text.

        The message is only handed to the SMTP server when
        ``EmailService.__is_debug_mode`` is falsy; otherwise the send is
        skipped. Any exception raised while building or sending the message
        is printed rather than propagated.
        """
        try:
            server = EmailService.create_smtp_server()
            mail = mailer.Message(From=EmailService.__from_address,
                                  To=to_address,
                                  charset='utf-8')
            mail.Subject = subject
            mail.Html = html_body
            # Plain-text body is rendered from the same HTML.
            mail.Body = html2text.html2text(html_body)

            if EmailService.__is_debug_mode:
                print("Skipping send, email is in dev mode.")
            else:
                print("Sending message (live!)")
                server.send(mail)
        except Exception as err:
            print("Error sending mail: {}".format(err))

Source File: email_service.py From python-for-entrepreneurs-course-demos with MIT License

6 votes

def send_email(to_address, subject, html_body):
        """
        Send an HTML email with a plain-text alternative derived via html2text.

        The message is only handed to the SMTP server when
        ``EmailService.__is_debug_mode`` is falsy; otherwise the send is
        skipped. Any exception raised while building or sending the message
        is printed rather than propagated.
        """
        try:
            server = EmailService.create_smtp_server()
            mail = mailer.Message(From=EmailService.__from_address,
                                  To=to_address,
                                  charset='utf-8')
            mail.Subject = subject
            mail.Html = html_body
            # Plain-text body is rendered from the same HTML.
            mail.Body = html2text.html2text(html_body)

            if EmailService.__is_debug_mode:
                print("Skipping send, email is in dev mode.")
            else:
                print("Sending message (live!)")
                server.send(mail)
        except Exception as err:
            print("Error sending mail: {}".format(err))

Source File: email_service.py From python-for-entrepreneurs-course-demos with MIT License

6 votes

def send_email(to_address, subject, html_body):
        """
        Send an HTML email with a plain-text alternative derived via html2text.

        The message is only handed to the SMTP server when
        ``EmailService.__is_debug_mode`` is falsy; otherwise the send is
        skipped. Any exception raised while building or sending the message
        is printed rather than propagated.
        """
        try:
            server = EmailService.create_smtp_server()
            mail = mailer.Message(From=EmailService.__from_address,
                                  To=to_address,
                                  charset='utf-8')
            mail.Subject = subject
            mail.Html = html_body
            # Plain-text body is rendered from the same HTML.
            mail.Body = html2text.html2text(html_body)

            if EmailService.__is_debug_mode:
                print("Skipping send, email is in dev mode.")
            else:
                print("Sending message (live!)")
                server.send(mail)
        except Exception as err:
            print("Error sending mail: {}".format(err))

Source File: email_service.py From python-for-entrepreneurs-course-demos with MIT License

6 votes

def send_email(to_address, subject, html_body):
        """
        Send an HTML email with a plain-text alternative derived via html2text.

        The message is only handed to the SMTP server when
        ``EmailService.__is_debug_mode`` is falsy; otherwise the send is
        skipped. Any exception raised while building or sending the message
        is printed rather than propagated.
        """
        try:
            server = EmailService.create_smtp_server()
            mail = mailer.Message(From=EmailService.__from_address,
                                  To=to_address,
                                  charset='utf-8')
            mail.Subject = subject
            mail.Html = html_body
            # Plain-text body is rendered from the same HTML.
            mail.Body = html2text.html2text(html_body)

            if EmailService.__is_debug_mode:
                print("Skipping send, email is in dev mode.")
            else:
                print("Sending message (live!)")
                server.send(mail)
        except Exception as err:
            print("Error sending mail: {}".format(err))

Source File: email_service.py From python-for-entrepreneurs-course-demos with MIT License

6 votes

def send_email(to_address, subject, html_body):
        """
        Send an HTML email with a plain-text alternative derived via html2text.

        The message is only handed to the SMTP server when
        ``EmailService.__is_debug_mode`` is falsy; otherwise the send is
        skipped. Any exception raised while building or sending the message
        is printed rather than propagated.
        """
        try:
            server = EmailService.create_smtp_server()
            mail = mailer.Message(From=EmailService.__from_address,
                                  To=to_address,
                                  charset='utf-8')
            mail.Subject = subject
            mail.Html = html_body
            # Plain-text body is rendered from the same HTML.
            mail.Body = html2text.html2text(html_body)

            if EmailService.__is_debug_mode:
                print("Skipping send, email is in dev mode.")
            else:
                print("Sending message (live!)")
                server.send(mail)
        except Exception as err:
            print("Error sending mail: {}".format(err))

Source File: email_service.py From cookiecutter-pyramid-talk-python-starter with MIT License

6 votes

def send_email(to_address, subject, html_body):
        """
        Send an HTML email with a plain-text alternative derived via html2text.

        The message is only handed to the SMTP server when
        ``EmailService.__is_debug_mode`` is falsy; otherwise the send is
        skipped. Any exception raised while building or sending the message
        is printed rather than propagated.
        """
        try:
            server = EmailService.create_smtp_server()
            mail = mailer.Message(From=EmailService.__from_address,
                                  To=to_address,
                                  charset='utf-8')
            mail.Subject = subject
            mail.Html = html_body
            # Plain-text body is rendered from the same HTML.
            mail.Body = html2text.html2text(html_body)

            if EmailService.__is_debug_mode:
                print("Skipping send, email is in dev mode.")
            else:
                print("Sending message (live!)")
                server.send(mail)
        except Exception as err:
            print("Error sending mail: {}".format(err))

Source File: export.py From patzilla with GNU Affero General Public License v3.0

5 votes

def get_fulltext(payload, what):
        """
        Extract one full-text section from a full-text XML payload as text.

        :param payload: XML document as bytes (it is wrapped in ``BytesIO``)
        :param what: local name of the ``ftxt:`` element to extract; it is
            interpolated into the XPath expression
        :return: html2text rendering of the first matching element, or
            ``None`` when no element matches
        """
        namespaces = {'ops': 'http://ops.epo.org', 'ftxt': 'http://www.epo.org/fulltext'}
        xpath_content = '/ops:world-patent-data/ftxt:fulltext-documents/ftxt:fulltext-document/ftxt:{what}'.format(what=what)

        tree = ET.parse(BytesIO(payload))
        elements = tree.xpath(xpath_content, namespaces=namespaces)
        if not elements:
            return None

        # NOTE(review): tostring() may return bytes on Python 3 -- confirm
        # that html2text accepts the value produced here.
        return html2text.html2text(ET.tostring(elements[0]))

Source File: integration_test.py From microblog.pub with GNU Affero General Public License v3.0

5 votes

def resp2plaintext(resp):
    """Render the body of a requests response as plain text, for making basic assertions."""
    body_html = resp.text
    return html2text(body_html)

Source File: federation_test.py From microblog.pub with GNU Affero General Public License v3.0

5 votes

def resp2plaintext(resp):
    """Render the body of a requests response as plain text, for making basic assertions."""
    body_html = resp.text
    return html2text(body_html)

Source File: mail.py From django-userena-ce with BSD 3-Clause "New" or "Revised" License

5 votes

def send_mail(
    subject,
    message_plain,
    message_html,
    email_from,
    email_to,
    custom_headers=None,
    attachments=(),
):
    """
    Build the email as a multipart message containing
    a multipart alternative for text (plain, HTML) plus
    all the attached files.

    :param subject: subject of the message
    :param message_plain: plain-text body; when empty it is generated from
        ``message_html`` with html2text
    :param message_html: HTML body; attached as a ``text/html`` alternative
        when non-empty
    :param email_from: passed as ``from_email``
    :param email_to: passed as ``to``
    :param custom_headers: optional mapping of extra headers; only passed on
        when non-empty (defaults to ``None`` instead of a shared mutable dict)
    :param attachments: optional attachments; only passed on when non-empty
    :raises ValueError: if both ``message_plain`` and ``message_html`` are empty
    """
    if not message_plain and not message_html:
        raise ValueError(_("Either message_plain or message_html should be not None"))

    if not message_plain:
        message_plain = html2text(message_html)

    message = {
        "subject": subject,
        "body": message_plain,
        "from_email": email_from,
        "to": email_to,
    }
    if attachments:
        message["attachments"] = attachments
    if custom_headers:
        message["headers"] = custom_headers

    msg = EmailMultiAlternatives(**message)
    if message_html:
        msg.attach_alternative(message_html, "text/html")
    msg.send()

Source File: comparison.py From trafilatura with GNU General Public License v3.0

5 votes

def run_html2text(htmlstring):
    '''Convert the HTML string with the html2text module and return the raw result (no sanitizing).'''
    return html2text.html2text(htmlstring)

Source File: feed.py From microblog.pub with GNU Affero General Public License v3.0

5 votes

def json_feed(path: str) -> Dict[str, Any]:
    """Build the JSON Feed (https://jsonfeed.org/) document.

    Items are the 10 public, non-deleted "Create" activities from the outbox
    with the highest ``_id``; each item carries both the HTML content and an
    html2text rendering of it. ``path`` is appended to ``ID`` to form the
    feed URL.
    """
    query = {
        "box": Box.OUTBOX.value,
        "type": "Create",
        "meta.deleted": False,
        "meta.public": True,
    }
    cursor = DB.activities.find(query, limit=10).sort("_id", -1)

    items = []
    for doc in cursor:
        activity = doc["activity"]
        entry = {"id": activity["id"]}
        obj = activity["object"]
        entry["url"] = obj.get("url")
        entry["content_html"] = obj["content"]
        entry["content_text"] = html2text(obj["content"])
        entry["date_published"] = obj.get("published")
        items.append(entry)

    user_comment = (
        "This is a microblog feed. You can add this to your feed reader using the following URL: "
        + ID
        + path
    )
    author = {
        "name": USERNAME,
        "url": ID,
        "avatar": ME.get("icon", {}).get("url"),
    }
    return {
        "version": "https://jsonfeed.org/version/1",
        "user_comment": user_comment,
        "title": USERNAME,
        "home_page_url": ID,
        "feed_url": ID + path,
        "author": author,
        "items": items,
    }

Source File: define.py From python-zulip-api with Apache License 2.0

5 votes

def get_bot_define_response(self, original_content: str) -> str:
        """
        Build the bot's reply for a "define" request.

        Returns one of the error messages when the input contains a space,
        contains characters other than ASCII letters, or is empty. Otherwise
        the definition API is queried and each definition is listed with its
        type and example; any failure while fetching or formatting appends
        the request error message and is logged.
        """
        words = original_content.split(' ')
        # More than one word (a phrase) is rejected.
        if len(words) > 1:
            return DefineHandler.PHRASE_ERROR_MESSAGE

        word = words[0].strip()
        word_lower = word.lower()

        # Anything outside a-z is rejected.
        if set(word_lower) - set(string.ascii_lowercase):
            return self.SYMBOLS_PRESENT_ERROR_MESSAGE

        # No word was entered.
        if not word_lower:
            return self.EMPTY_WORD_REQUEST_ERROR_MESSAGE

        reply = '**{}**:\n'.format(word)
        try:
            # Fetch the definitions and decode the JSON payload.
            api_result = requests.get(self.DEFINITION_API_URL.format(word_lower))
            definitions = api_result.json()

            if definitions:
                # One bullet per definition.
                for entry in definitions:
                    example = entry['example'] or '*No example available.*'
                    reply += '\n' + '* (**{}**) {}\n&nbsp;&nbsp;{}'.format(entry['type'], entry['definition'], html2text.html2text(example))
            else:
                # Could not fetch definitions for the given word.
                reply += self.REQUEST_ERROR_MESSAGE
        except Exception:
            reply += self.REQUEST_ERROR_MESSAGE
            logging.exception("")

        return reply

Source File: trac_export.py From allura with Apache License 2.0

5 votes

def parse_ticket(self, id):
        """
        Fetch one ticket and return its remapped fields, description and comments.

        Ticket fields come from the CSV export (first row only); description
        and comments are scraped from the HTML page and converted with
        html2text.
        """
        # Use CSV export to get ticket fields
        csv_url = self.full_url(self.TICKET_URL % id, 'csv')
        csv_file = self.csvopen(csv_url)
        fields = next(csv.DictReader(csv_file))
        fields['class'] = 'ARTIFACT'
        ticket = self.remap_fields(fields)

        # Use HTML export to get ticket description and comments
        import html2text
        html2text.BODY_WIDTH = 0
        page_url = self.full_url(self.TICKET_URL % id)
        self.log_url(page_url)
        page = BeautifulSoup(urlopen(page_url))
        self.clean_missing_wiki_links(page)

        desc = page.find('div', 'description').find('div', 'searchable')
        if desc:
            ticket['description'] = html2text.html2text(
                desc.renderContents('utf8').decode('utf8'))
        else:
            ticket['description'] = ''

        comments = []
        relative_base_url = six.moves.urllib.parse.urlparse(self.full_url(self.TICKET_URL % '')).path
        for form in page.findAll('form', action='#comment'):
            submitter = re.sub(
                r'.* by ', '', form.find('h3', 'change').text).strip()
            date = self.trac2z_date(
                form.find('a', 'timeline')['title'].replace(' in Timeline', ''))
            changes = six.text_type(form.find('ul', 'changes') or '')
            body_tag = form.find('div', 'comment')
            body = body_tag.renderContents('utf8').decode('utf8') if body_tag else ''
            # crude way to rewrite ticket links
            body = body.replace('href="{}'.format(relative_base_url), 'href="')
            comments.append({
                'submitter': submitter,
                'date': date,
                'comment': html2text.html2text(changes + body),
                'class': 'COMMENT',
            })
        ticket['comments'] = comments
        return ticket

Source File: leetcode-crawler.py From leetcode-crawler with MIT License

5 votes

def generate_question_markdown(self, question, path, has_get_code):
        """
        Write a README.md for one question into its own directory under ``path``.

        The directory is named from the question's ``frontedId`` and ``slug``
        and is created if missing. The README contains the title, the
        description converted from HTML with html2text, tags and difficulty,
        and, when logged in and ``has_get_code`` is truthy, the first
        submission row returned for the question's slug.

        :param question: mapping with the keys ``frontedId``, ``slug``,
            ``title``, ``content``, ``tags`` and ``difficulty``
        :param path: parent directory for the question directory
        :param has_get_code: whether to look up and embed a stored submission
        """
        text_path = os.path.join(path, "{:0>3d}-{}".format(question['frontedId'], question['slug']))
        if not os.path.isdir(text_path):
            os.mkdir(text_path)
        with open(os.path.join(text_path, "README.md"), 'w', encoding='utf-8') as f:
            f.write("# [{}][title]\n".format(question['title']))
            f.write("\n## Description\n\n")
            text = question['content']

            content = html2text.html2text(text).replace("**Input:**", "Input:").replace("**Output:**", "Output:").replace('**Explanation:**', 'Explanation:').replace('\n    ', '    ')
            f.write(content)

            f.write("\n**Tags:** {}\n".format(question['tags']))
            f.write("\n**Difficulty:** {}\n".format(question['difficulty']))
            f.write("\n## 思路\n")

            if self.is_login and has_get_code:
                sql = "SELECT code, language FROM last_ac_submission_record WHERE question_slug = ? ORDER BY timestamp"
                cursor = self.conn.cursor()
                try:
                    cursor.execute(sql, (question['slug'],))
                    submission = cursor.fetchone()
                finally:
                    # Close the cursor even when the query raises.
                    cursor.close()

                if submission is not None:
                    f.write("\n``` %s\n" %(submission[1]))
                    # The stored code has its backslash escapes decoded before writing.
                    f.write(submission[0].encode('utf-8').decode('unicode_escape'))
                    f.write("\n```\n")

            f.write("\n[title]: https://leetcode.com/problems/{}\n".format(question['slug']))

Source File: RSSReader.py From RedditBots with MIT License

5 votes

def get_new_articles(source):
	"""
	Fetch an RSS feed and return its items as tuples.

	:param source: URL of the feed, passed to ``urllib.request.urlopen``
	:return: list of ``(title, link, description, guid)`` tuples. A missing
		title or guid falls back to the link; the description is converted
		with html2text and is ``None`` when the element is missing or empty.
		On ``URLError`` the reason is printed and the items collected so
		far (possibly none) are returned.
	"""
	articles = []
	try:
		# Context manager makes sure the HTTP response is closed.
		with urllib.request.urlopen(source) as response:
			orig_rss = response.read().decode("utf-8")
		rss = ET.fromstring(orig_rss)
		channel = rss.find("channel")

		for item in channel.findall("item"):
			link = item.find("link").text

			title = item.find("title")
			if title is not None:
				title = title.text
			if title is None:
				print("found no title, will use link")
				title = link

			description = item.find("description")
			if description is not None:
				# An empty <description/> has text None, which html2text cannot handle.
				if description.text is not None:
					description = html2text.html2text(description.text)
				else:
					description = None

			guid = item.find("guid")
			if guid is not None:
				guid = guid.text
			if guid is None:
				guid = link

			articles.append((title, link, description, guid))

	except URLError as e:
		print("Error:", e.reason)

	return articles

Source File: speak.py From pythonista-scripts with MIT License

5 votes

def main():
    """
    Speak the text of a shared URL, a URL found in the text, or the text itself.

    Input comes from the app extension when running as one, otherwise from
    the clipboard. When a URL is available, its page is fetched and
    converted to text with html2text before being spoken.

    :return: ``True`` when the URL could not be reached, otherwise ``None``
    """
    speech.stop()
    if not appex.is_running_extension():
        console.hud_alert('Reading clipboard')
        text = clipboard.get()
        url = None
    else:
        text = appex.get_text()
        url = appex.get_url()

    if url is None:
        try:
            url = [mgroups[0] for mgroups in GRUBER_URLINTEXT_PAT.findall(text)][0]
        except (IndexError, TypeError):
            # No URL in the text (or no text at all): fall back to the text.
            pass

    if url is not None:
        console.hud_alert('Reading: ' + url)
        try:
            r = requests.get(
                url=url,
                headers={"User-agent": "Mozilla/5.0{0:06}".format(random.randrange(999999))})
        except requests.ConnectionError:
            console.alert('Unable to connect to url.')
            return True
        # Response.text is already decoded text; it must not be decoded again.
        text = html2text.html2text(r.text)
    else:
        console.hud_alert('Reading text: ' + str(text))

    if text:
        speech.say(text)
        console.alert('Done?', hide_cancel_button=True, button1='OK')
        speech.stop()
    else:
        console.hud_alert('No text found.')

Source File: url2md.py From pythonista-scripts with MIT License

5 votes

def main():
    """
    Fetch a URL, convert the page to Markdown and copy it to the clipboard.

    As an app extension the URL comes from the shared URL or from the first
    URL found in the shared text. Otherwise the first URL in the clipboard
    is offered in an input dialog for confirmation. After copying, the user
    may open Evernote to create a note from the clipboard.

    :return: ``True`` when no URL is available, the dialog is cancelled or
        the page cannot be fetched; otherwise ``None``
    """
    if appex.is_running_extension():
        url = appex.get_url()
        if url is None:
            url = _first_url_in(appex.get_text())
            if url is None:
                console.hud_alert('No URL found.')
                return True
    else:
        url = _first_url_in(clipboard.get().strip()) or ''
        if "http" not in url:
            url = "http://"
        try:
            url = console.input_alert("URL", "", url)
        except KeyboardInterrupt:
            # Pythonista signals a cancelled dialog with KeyboardInterrupt.
            return True

    console.hud_alert('URL: %s' % url)

    try:
        r = requests.get(
            url=url,
            headers={"User-agent": "Mozilla/5.0{0:06}".format(random.randrange(999999))}
        )
    except requests.RequestException as e:
        console.hud_alert('Unable to fetch URL: %s' % e)
        return True

    # Response.text is already decoded text; it must not be decoded again.
    rendered_content = html2text.html2text(r.text)
    clipboard.set(rendered_content)

    launch_e = console.alert('Markdown copied to clipboard. Launch Evernote?', button1='Yes', button2='No', hide_cancel_button=True)
    if launch_e == 1:
        _eurl = "evernote://x-callback-url/new-note?type=clipboard&title=DRAFT&text="
        app = UIApplication.sharedApplication()
        eurl = nsurl(_eurl)
        app.openURL_(eurl)
    appex.finish()


def _first_url_in(text):
    """Return the first URL matched by GRUBER_URLINTEXT_PAT in text, or None."""
    if not text:
        return None
    matches = GRUBER_URLINTEXT_PAT.findall(text)
    return matches[0][0] if matches else None

Source File: ical.py From sync-engine with GNU Affero General Public License v3.0

5 votes

def generate_invite_message(ical_txt, event, account, invite_type='request'):
    """
    Build the MIME message for a calendar invite.

    The message is ``multipart/mixed`` wrapping a ``multipart/alternative``
    with plain-text, HTML and calendar parts. 'request' and 'update' use the
    REQUEST calendar method, 'cancel' uses CANCEL; the subject prefix
    depends on ``invite_type``.
    """
    assert invite_type in ['request', 'update', 'cancel']

    html_body = event.description or ''
    text_body = html2text(html_body)

    msg = mime.create.multipart('mixed')
    body = mime.create.multipart('alternative')

    calendar_types = {
        'request': 'calendar; method=REQUEST',
        'update': 'calendar; method=REQUEST',
        'cancel': 'calendar; method=CANCEL',
    }
    calendar_type = calendar_types.get(invite_type)
    if calendar_type is not None:
        body.append(
            mime.create.text('plain', text_body),
            mime.create.text('html', html_body),
            mime.create.text(calendar_type, ical_txt, charset='utf8'))
        msg.append(body)

    # From should match our mailsend provider (mailgun) so it doesn't confuse
    # spam filters
    msg.headers['From'] = "automated@notifications.nylas.com"
    msg.headers['Reply-To'] = account.email_address

    subject_formats = {
        'request': u'Invitation: {}',
        'update': u'Updated Invitation: {}',
        'cancel': u'Cancelled: {}',
    }
    subject_format = subject_formats.get(invite_type)
    if subject_format is not None:
        msg.headers['Subject'] = subject_format.format(event.title)

    return msg

Source File: doc_spell_checker.py From SNIPER-mxnet with Apache License 2.0

5 votes

def check_grammar(self, file_name):
        """Check the grammar of the specified file

           The file is read, converted to text with html2text and stripped
           of control characters before being handed to the grammar
           checker; the result is stored on the instance.

           Parameters
           -----------
           file_name: name of the file to be checked
        """
        # Context manager closes the file handle once it has been read.
        with open(file_name) as doc_file:
            file_content = html2text.html2text(doc_file.read())
        file_content = re.sub(u"[\x00-\x08\x0b-\x0c\x0e-\x1f]+", u"", file_content)
        self.__grammar_check_res = self.__grammar_checker.check(file_content)

Source File: app.py From activitypub with Mozilla Public License 2.0

5 votes

def html2plaintext(self, body, *args, **kwargs):
    """Return ``body`` converted with html2text; extra arguments are accepted and ignored."""
    plain = html2text(body)
    return plain

Source File: send_email.py From loaner with Apache License 2.0

5 votes

def send_shelf_audit_email(shelf):
  """Sends a shelf audit email.

  The 'shelf_audit_request' template is rendered with the shelf's friendly
  name, location and whole hours since its last audit. The message goes to
  the address configured under 'shelf_audit_email_to', with an html2text
  rendering of the HTML as the plain-text body.

  Args:
    shelf: shelf_model.Shelf object for location details.

  Raises:
    SendEmailError: if the data pertaining to the audit is incomplete
      (not raised in this function itself; presumably by a callee).
  """
  since_audit = datetime.datetime.utcnow() - shelf.last_audit_time
  title, body = constants.TEMPLATE_LOADER.render(
      'shelf_audit_request',
      {
          'friendly_name': shelf.friendly_name,
          'hours_since_audit': int(since_audit.total_seconds() / 3600),
          'location': shelf.location,
          'origin': constants.ORIGIN,
      })

  recipient = config_model.Config.get('shelf_audit_email_to')
  plain_body = html2text.html2text(body)

  logging.info(
      'Sending email to %s\nSubject: %s.', shelf.responsible_for_audit, title)
  _send_email(to=recipient, subject=title, body=plain_body, html=body)

Source File: doc_spell_checker.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0

5 votes

def check_grammar(self, file_name):
        """Check the grammar of the specified file

           The file is read, converted to text with html2text and stripped
           of control characters before being handed to the grammar
           checker; the result is stored on the instance.

           Parameters
           -----------
           file_name: name of the file to be checked
        """
        # Context manager closes the file handle once it has been read.
        with open(file_name) as doc_file:
            file_content = html2text.html2text(doc_file.read())
        file_content = re.sub(u"[\x00-\x08\x0b-\x0c\x0e-\x1f]+", u"", file_content)
        self.__grammar_check_res = self.__grammar_checker.check(file_content)

Source File: book.py From JARVIS-on-Messenger with MIT License

5 votes

def process(input, entities):
    """
    Look up a book on Goodreads and build the reply message.

    :param input: the original user input, echoed back in the result
    :param entities: mapping whose ``['book'][0]['value']`` is the book title
    :return: dict with ``success`` set; on success also ``input`` and
        ``output`` (a button template with title, author, description,
        rating and a Goodreads link), on any failure ``error_msg``
    """
    output = {}
    try:
        book_title = entities['book'][0]['value']

        with requests_cache.enabled('book_cache', backend='sqlite', expire_after=86400):
            # Passing the query as params lets requests URL-encode the title.
            response = requests.get(
                'https://www.goodreads.com/book/title.xml',
                params={'key': GOODREADS_ACCESS_TOKEN, 'title': book_title})
            data = ElementTree.fromstring(response.content)

        book_node = data.find('book')
        author = book_node.find('authors').find('author').find('name').text
        title = book_node.find('title').text
        description = html2text(book_node.find('description').text)
        average_rating = book_node.find('average_rating').text
        link = book_node.find('link').text
        goodreads_attribution = '- Powered by Goodreads'

        template = TextTemplate()
        template.set_text('Title: ' + title + '\nAuthor: ' + author + '\nDescription: ' + description)
        template.set_post_text('\nAverage Rating: ' + average_rating + ' / 5' + '\n' + goodreads_attribution)

        text = template.get_text()
        template = ButtonTemplate(text)
        template.add_web_url('Goodreads Link', link)

        output['input'] = input
        output['output'] = template.get_message()
        output['success'] = True
    except Exception:
        # Any lookup or parsing failure becomes a friendly error message;
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        error_message = 'I couldn\'t find any book matching your query.'
        error_message += '\nPlease ask me something else, like:'
        error_message += '\n  - book timeline'
        error_message += '\n  - harry potter book plot'
        error_message += '\n  - little women book rating'
        output['error_msg'] = TextTemplate(error_message).get_message()
        output['success'] = False
    return output

Python html2text.html2text() Examples