Python html.unescape() Examples
The following are 30
code examples of html.unescape().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module html, or try the search function.
Example #1
Source File: functions.py From mstdn-ebooks with Mozilla Public License 2.0 | 6 votes |
def extract_toot(toot):
    """Convert the HTML body of a toot into plain text.

    HTML entities are decoded, ``<br>`` and ``<p>`` become line breaks,
    hashtag links are reduced to their text, and every other link that has
    an ``href`` is replaced by its target URL. Profile URLs are then
    rewritten into ``@user@instance`` mentions.

    :param toot: HTML string of the toot.
    :return: plain text without trailing newlines.
    """
    toot = html.unescape(toot)  # convert HTML escape codes to text
    soup = BeautifulSoup(toot, "html.parser")

    for lb in soup.select("br"):  # replace <br> with linebreak
        lb.replace_with("\n")

    for p in soup.select("p"):
        # Keep the paragraph's content and terminate it with a newline.
        # Replacing the whole tag would throw away the text inside it.
        p.insert_after("\n")
        p.unwrap()

    for ht in soup.select("a.hashtag"):  # convert hashtags from links to text
        ht.unwrap()

    # convert <a href='https://example.com'>example.com</a> to just
    # https://example.com
    for link in soup.select("a"):
        # Not every <a> carries an href. `'href' in link` would test the
        # tag's *children*, so the attribute is checked with has_attr().
        if link.has_attr("href"):
            link.replace_with(link["href"])

    text = soup.get_text()
    # put mastodon-style mentions back in
    text = re.sub(r"https://([^/]+)/(@[^\s]+)", r"\2@\1", text)
    # put pleroma-style mentions back in
    text = re.sub(r"https://([^/]+)/users/([^\s/]+)", r"@\2@\1", text)
    text = text.rstrip("\n")  # remove trailing newline(s)
    return text
Example #2
Source File: test_views_lti_document.py From marsha with MIT License | 6 votes |
def test_views_lti_document_post_error(self, mock_verify, mock_logger):
    """Validate the response returned in case of an LTI exception."""
    payload = {
        "resource_link_id": "123",
        "roles": random.choice(["instructor", "student"]),
        "context_id": "abc",
    }
    url = "/lti/documents/{!s}".format(uuid.uuid4())

    response = self.client.post(url, payload)
    self.assertEqual(response.status_code, 200)
    self.assertContains(response, "<html>")
    page = response.content.decode("utf-8")
    mock_logger.assert_called_once_with("lti error")

    # The context is embedded HTML-escaped in a data attribute of the page.
    found = re.search(
        '<div id="marsha-frontend-data" data-context="(.*)">', page
    )
    context = json.loads(html.unescape(found.group(1)))
    self.assertEqual(context.get("state"), "error")
    self.assertIsNone(context.get("resource"))
Example #3
Source File: test_views_lti_video.py From marsha with MIT License | 6 votes |
def test_views_lti_video_post_error(self, mock_verify, mock_logger):
    """Validate the response returned in case of an LTI exception."""
    payload = {
        "resource_link_id": "123",
        "roles": random.choice(["instructor", "student"]),
        "context_id": "abc",
    }
    url = "/lti/videos/{!s}".format(uuid.uuid4())

    response = self.client.post(url, payload)
    self.assertEqual(response.status_code, 200)
    self.assertContains(response, "<html>")
    page = response.content.decode("utf-8")
    mock_logger.assert_called_once_with("lti error")

    # The context is embedded HTML-escaped in a data attribute of the page.
    found = re.search(
        '<div id="marsha-frontend-data" data-context="(.*)">', page
    )
    context = json.loads(unescape(found.group(1)))
    self.assertEqual(context.get("state"), "error")
    self.assertIsNone(context.get("resource"))
    self.assertEqual(context.get("modelName"), "videos")
Example #4
Source File: keyboard.py From kle_render with MIT License | 6 votes |
def get_labels(key, fa_subs, kb_subs):
    """Turn a key legend into plain-text labels, or report its image.

    ``key`` is split on newlines into one label per part. Each label is
    parsed as an HTML fragment; elements with class ``fa`` / ``kb`` whose
    ``fa-*`` / ``kb-*`` class is listed in ``fa_subs`` / ``kb_subs`` get the
    mapped hexadecimal code point as their text.

    :return: ``(labels, False)`` with the text of every label, or
        ``(src, True)`` as soon as one label contains an ``<img>``, where
        ``src`` is the xpath result for the first image's ``src``.
    """
    # split into labels for each part of key
    labels = key.split('\n')
    icon_sets = (
        ('fa', r'fa-\S+', fa_subs),
        ('kb', r'kb-\S+', kb_subs),
    )
    for idx, markup in enumerate(labels):
        tree = lxml.html.fragment_fromstring(markup, create_parent=True)

        # an image short-circuits everything: hand back its url instead
        image_src = tree.xpath('//img[1]/@src')
        if image_src:
            return (image_src, True)

        # replace icons with unicode characters
        for css_class, pattern, subs in icon_sets:
            for icon in tree.find_class(css_class):
                found = re.search(pattern, icon.get('class'))
                if found and found.group(0) in subs:
                    icon.text = chr(int(subs[found.group(0)], 16))

        # replace breaks with newlines and remove html entities
        for br in tree.xpath('//br'):
            br.text = '\n'
        labels[idx] = html.unescape(tree.text_content())
    return (labels, False)
Example #5
Source File: annotation.py From vadnet with GNU Lesser General Public License v3.0 | 6 votes |
def write_voiceactivity(path, subs):
    """Write a discrete "voiceactivity" annotation for the given subtitles.

    Two files are produced: ``path + '~'`` holds one ``start;end;0;1.0``
    line per spoken caption, and ``path`` holds the XML header. Captions
    that start with a ``*...*`` or ``[...]`` span are treated as non-speech
    and are left out.

    :param path: target file name; ``.annotation`` is appended if missing.
    :param subs: iterable of subtitle objects providing ``format_start()``,
        ``format_end()`` and ``get_text()``; falsy entries are ignored.
    """
    if not path.endswith('.annotation'):
        path += '.annotation'

    # Raw strings: '\s' and '\*' are invalid escapes in a normal literal.
    starred = re.compile(r'\s*\*.*\*\s*')
    bracketed = re.compile(r'\s*\[.*\]\s*')

    count = 0
    with open(path + '~', 'w', encoding='latin-1') as fp:
        for sub in subs:
            if not sub:
                continue
            start = convert_timestamp_to_s(sub.format_start())
            end = convert_timestamp_to_s(sub.format_end())
            cap = html.unescape(sub.get_text().replace('\n', ' ').replace(';', ','))
            if starred.match(cap) or bracketed.match(cap):
                continue
            fp.write('{};{};0;1.0\n'.format(start, end))
            # Count only rows actually written so the header's size matches
            # the data file. NOTE(review): assumes `size` means the number
            # of data rows - confirm against the annotation reader.
            count += 1

    with open(path, 'w', encoding='latin-1') as fp:
        fp.write('<?xml version="1.0" ?>\n<annotation ssi-v="3">\n\t<info ftype="ASCII" size="{}" />\n\t<meta role="subtitles" annotator="system" />\n\t<scheme name="voiceactivity" type="DISCRETE" color="#FFDDD9C3">\n\t\t<item name="VOICE" id="0" color="#FF494429" />\n\t</scheme>\n</annotation>\n'.format(count))
Example #6
Source File: epr.py From epr with MIT License | 6 votes |
def handle_data(self, raw):
    """Append a chunk of character data to the line currently being built.

    Nothing happens for empty data or while ``self.ishidden`` is set.
    Leading whitespace is dropped when the current line is still empty, and
    whitespace runs are collapsed to one space unless ``self.ispref`` is
    set. The index of the current line is then recorded in the set that
    matches the active flag (head, bullet, indent or preformatted).
    """
    if not raw or self.ishidden:
        return

    chunk = raw.lstrip() if self.text[-1] == "" else raw
    if not self.ispref:
        chunk = re.sub(r"\s+", " ", chunk)
    self.text[-1] += unescape(chunk)

    current = len(self.text) - 1
    if self.ishead:
        self.idhead.add(current)
    elif self.isbull:
        self.idbull.add(current)
    elif self.isinde:
        self.idinde.add(current)
    elif self.ispref:
        self.idpref.add(current)
Example #7
Source File: annotation.py From vadnet with GNU Lesser General Public License v3.0 | 6 votes |
def write_transcription(path, subs):
    """Write a free-text "transcription" annotation for the given subtitles.

    ``path + '~'`` receives one ``start;end;caption;1.0`` line per subtitle
    and ``path`` receives the XML header with the number of lines written.
    Newlines in a caption become spaces and semicolons become commas
    before HTML entities are decoded.

    :param path: target file name; ``.annotation`` is appended if missing.
    :param subs: iterable of subtitle objects providing ``format_start()``,
        ``format_end()`` and ``get_text()``; falsy entries are ignored.
    """
    header = '<?xml version="1.0" ?>\n<annotation ssi-v="3">\n\t<info ftype="ASCII" size="{}" />\n\t<meta role="youtube" annotator="system" />\n\t<scheme name="transcription" type="FREE"/>\n</annotation>\n'
    if not path.endswith('.annotation'):
        path = path + '.annotation'

    written = 0
    with open(path + '~', 'w', encoding='latin-1') as data_file:
        for entry in subs:
            if not entry:
                continue
            written += 1
            begin = convert_timestamp_to_s(entry.format_start())
            finish = convert_timestamp_to_s(entry.format_end())
            flattened = entry.get_text().replace('\n', ' ').replace(';', ',')
            caption = html.unescape(flattened)
            data_file.write('{};{};{};1.0\n'.format(begin, finish, caption))

    with open(path, 'w', encoding='latin-1') as header_file:
        header_file.write(header.format(written))
Example #8
Source File: backtracking.py From PT-help with MIT License | 6 votes |
def backtracking_id(site):
    """Fetch a range of torrent pages of one site and record each title.

    Ids from ``site['start_torrent']`` up to and including
    ``site['end_torrent'] + 1`` are requested with the site's cookies. When
    ``site['search_ptn']`` matches a page, the escaped title and publish
    time are passed to ``wrap_insert``; otherwise a "No torrent." line is
    printed.

    :param site: mapping with the keys ``cookies``, ``start_torrent``,
        ``end_torrent``, ``torrent_url``, ``search_ptn`` and ``name``.
    """
    jar = cookies_raw2jar(site['cookies'])
    first_id = site['start_torrent']
    stop_id = site['end_torrent'] + 2
    for _tid in range(first_id, stop_id):
        t0 = time.time()
        _link = site['torrent_url'].format(_tid)
        page = requests.get(_link, cookies=jar, headers=headers)
        found = re.search(site['search_ptn'], page.text)
        if not found:
            print("ID: {}, Cost: {:.5f} s, No torrent.".format(_tid, time.time() - t0))
        else:
            _title = pymysql.escape_string(unescape(found.group("title")))
            published = re.search("发布于(.+?)<", page.text).group(1)
            parsed = time.strptime(published, "%Y-%m-%d %H:%M:%S")
            _timestamp = time.mktime(parsed)
            wrap_insert(site=site['name'], sid=_tid, title=_title, link=_link, pubdate=_timestamp, t=t0)
        # pause between two requests
        time.sleep(2)
Example #9
Source File: models.py From Pytition with BSD 3-Clause "New" or "Revised" License | 5 votes |
def raw_text(self):
    """Return ``self.text`` sanitized, stripped of tags and entity-decoded."""
    sanitized = sanitize_html(self.text)
    without_tags = strip_tags(sanitized)
    return html.unescape(mark_safe(without_tags))
Example #10
Source File: familyhandler.py From gprime with GNU General Public License v2.0 | 5 votes |
def post(self, path):
    """Handle a POST for a family: re-render an edit or save the form.

    ``path`` is ``"<handle>"`` or ``"<handle>/<action>"``; the action
    defaults to ``"view"``. The family is rebuilt from the HTML-escaped
    JSON in the ``json_data`` argument. With an ``update_json`` argument
    the update is applied and ``family.html`` is rendered; without it the
    form is saved and the request is redirected to the family's URL.
    """
    _ = self.app.get_translate_func(self.current_user)
    page = int(self.get_argument("page", 1) or 1)
    search = self.get_argument("search", "")
    handle, action = path.split("/") if "/" in path else (path, "view")

    raw_json = html.unescape(self.get_argument("json_data"))
    instance = Family.from_struct(json.loads(raw_json))
    update_json = self.get_argument("update_json", None)

    if not update_json:
        # no pending edit: save the form and go to the record's page
        self.send_message("Updated family. <a href=\"FIXME\">Undo</a>")
        form = FamilyForm(self, instance=instance)
        form.save()
        handle = instance.handle
        self.redirect(self.app.make_url("/family/%(handle)s" % {"handle": handle}))
        return

    # edit the instance
    self.update_instance(instance, update_json)
    form = FamilyForm(self, instance=instance)
    form.load_data()
    template_args = self.get_template_dict(tview=_("family detail"), action=action, page=page, search=search, form=form)
    self.render("family.html", **template_args)
Example #11
Source File: models.py From Pytition with BSD 3-Clause "New" or "Revised" License | 5 votes |
def __str__(self):
    """Return ``[petition id:status] first last`` with entities decoded.

    The status is ``OK`` when ``self.confirmed`` is truthy, ``..`` otherwise.
    """
    petition_id = self.petition.id
    status = "OK" if self.confirmed else ".."
    label = "[{}:{}] {} {}".format(petition_id, status, self.first_name, self.last_name)
    return html.unescape(label)
Example #12
Source File: forms.py From Pytition with BSD 3-Clause "New" or "Revised" License | 5 votes |
def clean_title(self):
    """Validate that the owner has no other petition with the same title.

    The owner is the organization with slug ``self.orgslugname`` when
    ``self.owned_by_org`` is set, otherwise the user named
    ``self.username``. A duplicate adds an ``invalid`` error on ``title``.

    :return: the title taken from ``cleaned_data``.
    """
    title = self.cleaned_data.get('title')

    if self.owned_by_org:
        owner = {'org': Organization.objects.get(slugname=self.orgslugname)}
    else:
        owner = {'user': PytitionUser.objects.get(user__username=self.username)}

    duplicates = Petition.objects.filter(title=title, **owner)
    if duplicates.count() > 0:
        duplicate_error = ValidationError(_("There is already a petition with this title"), code="invalid")
        self.add_error('title', duplicate_error)
    return title
Example #13
Source File: BlogSpider.py From SourceCodeOfBook with MIT License | 5 votes |
def parse_detail(self, response):
    """Fill the item carried in ``response.meta`` with detail-page data.

    Sets ``post_time``, ``category`` and ``detail`` (the serialized,
    entity-decoded ``post-body`` div) on the item and yields it.
    """
    entry = response.meta['item']

    created = response.xpath('//time[@title="Post created"]/@datetime').extract_first()
    about = response.xpath('//span[@itemprop="about"]/a/span/text()').extract_first()
    body_nodes = response.xpath('//div[@class="post-body"]')
    # serialize the first matching div back to markup, then decode entities
    serialized = etree.tostring(body_nodes[0]._root).decode()
    detail = unescape(serialized)

    entry['post_time'] = created
    entry['category'] = about
    entry['detail'] = detail
    yield entry
Example #14
Source File: placehandler.py From gprime with GNU General Public License v2.0 | 5 votes |
def post(self, path):
    """Handle a POST for a place: re-render an edit or save the form.

    ``path`` is ``"<handle>"`` or ``"<handle>/<action>"``; the action
    defaults to ``"view"``. The place is rebuilt from the HTML-escaped JSON
    in the ``json_data`` argument. With an ``update_json`` argument the
    update is applied and ``place.html`` is rendered; without it the form
    is saved and the request is redirected to the place's URL.
    """
    _ = self.app.get_translate_func(self.current_user)
    handle, action = path.split("/") if "/" in path else (path, "view")
    page = int(self.get_argument("page", 1) or 1)
    search = self.get_argument("search", "")

    raw_json = html.unescape(self.get_argument("json_data"))
    instance = Place.from_struct(json.loads(raw_json))
    update_json = self.get_argument("update_json", None)

    if not update_json:
        # no pending edit: save the form and go to the record's page
        form = PlaceForm(self, instance=instance)
        form.save()
        handle = instance.handle
        self.redirect(self.app.make_url("/place/%(handle)s" % {"handle": handle}))
        return

    # edit the instance
    self.update_instance(instance, update_json)
    form = PlaceForm(self, instance=instance)
    form.load_data()
    template_args = self.get_template_dict(tview=_("place detail"), action=action, page=page, search=search, form=form)
    self.render("place.html", **template_args)
Example #15
Source File: mediahandler.py From gprime with GNU General Public License v2.0 | 5 votes |
def post(self, path):
    """Handle a POST for a media object: re-render an edit or save the form.

    ``path`` is ``"<handle>"`` or ``"<handle>/<action>"``; the action
    defaults to ``"view"``. The media object is rebuilt from the
    HTML-escaped JSON in the ``json_data`` argument. With an
    ``update_json`` argument the update is applied and ``media.html`` is
    rendered; without it the form is saved and the request is redirected
    to the media object's URL.
    """
    _ = self.app.get_translate_func(self.current_user)
    page = int(self.get_argument("page", 1) or 1)
    search = self.get_argument("search", "")
    handle, action = path.split("/") if "/" in path else (path, "view")

    raw_json = html.unescape(self.get_argument("json_data"))
    instance = Media.from_struct(json.loads(raw_json))
    update_json = self.get_argument("update_json", None)

    if not update_json:
        # no pending edit: save the form and go to the record's page
        self.send_message("Updated media. <a href=\"FIXME\">Undo</a>")
        form = MediaForm(self, instance=instance)
        form.save()
        handle = instance.handle
        self.redirect(self.app.make_url("/media/%(handle)s" % {"handle": handle}))
        return

    # edit the instance
    self.update_instance(instance, update_json)
    form = MediaForm(self, instance=instance)
    form.load_data()
    template_args = self.get_template_dict(tview=_("media detail"), action=action, page=page, search=search, form=form)
    self.render("media.html", **template_args)
Example #16
Source File: notehandler.py From gprime with GNU General Public License v2.0 | 5 votes |
def post(self, path):
    """Handle a POST for a note: re-render an edit or save the form.

    ``path`` is ``"<handle>"`` or ``"<handle>/<action>"``; the action
    defaults to ``"view"``. The note is rebuilt from the HTML-escaped JSON
    in the ``json_data`` argument. With an ``update_json`` argument the
    update is applied and ``note.html`` is rendered; without it the form is
    saved and the request is redirected to the note's URL.
    """
    _ = self.app.get_translate_func(self.current_user)
    page = int(self.get_argument("page", 1) or 1)
    search = self.get_argument("search", "")
    handle, action = path.split("/") if "/" in path else (path, "view")

    raw_json = html.unescape(self.get_argument("json_data"))
    instance = Note.from_struct(json.loads(raw_json))
    update_json = self.get_argument("update_json", None)

    if not update_json:
        # no pending edit: save the form and go to the record's page
        self.send_message("Updated note. <a href=\"FIXME\">Undo</a>")
        form = NoteForm(self, instance=instance)
        form.save()
        handle = instance.handle
        self.redirect(self.app.make_url("/note/%(handle)s" % {"handle": handle}))
        return

    # edit the instance
    self.update_instance(instance, update_json)
    form = NoteForm(self, instance=instance)
    form.load_data()
    template_args = self.get_template_dict(tview=_("note detail"), action=action, page=page, search=search, form=form)
    self.render("note.html", **template_args)
Example #17
Source File: taghandler.py From gprime with GNU General Public License v2.0 | 5 votes |
def post(self, path):
    """Handle a POST for a tag: re-render an edit or save the form.

    ``path`` is ``"<handle>"`` or ``"<handle>/<action>"``; the action
    defaults to ``"view"``. The tag is rebuilt from the HTML-escaped JSON
    in the ``json_data`` argument. With an ``update_json`` argument the
    update is applied and ``tag.html`` is rendered; without it the form is
    saved and the request is redirected to the tag's URL.
    """
    _ = self.app.get_translate_func(self.current_user)
    page = int(self.get_argument("page", 1) or 1)
    search = self.get_argument("search", "")
    handle, action = path.split("/") if "/" in path else (path, "view")

    raw_json = html.unescape(self.get_argument("json_data"))
    instance = Tag.from_struct(json.loads(raw_json))
    update_json = self.get_argument("update_json", None)

    if not update_json:
        # no pending edit: save the form and go to the record's page
        self.send_message("Updated tag. <a href=\"FIXME\">Undo</a>")
        form = TagForm(self, instance=instance)
        form.save()
        handle = instance.handle
        self.redirect(self.app.make_url("/tag/%(handle)s" % {"handle": handle}))
        return

    # edit the instance
    self.update_instance(instance, update_json)
    form = TagForm(self, instance=instance)
    form.load_data()
    template_args = self.get_template_dict(tview=_("tag detail"), action=action, page=page, search=search, form=form)
    self.render("tag.html", **template_args)
Example #18
Source File: sourcehandler.py From gprime with GNU General Public License v2.0 | 5 votes |
def post(self, path):
    """Handle a POST for a source: re-render an edit or save the form.

    ``path`` is ``"<handle>"`` or ``"<handle>/<action>"``; the action
    defaults to ``"view"``. The source is rebuilt from the HTML-escaped
    JSON in the ``json_data`` argument. With an ``update_json`` argument
    the update is applied and ``source.html`` is rendered; without it the
    form is saved and the request is redirected to the source's URL.
    """
    _ = self.app.get_translate_func(self.current_user)
    page = int(self.get_argument("page", 1) or 1)
    search = self.get_argument("search", "")
    handle, action = path.split("/") if "/" in path else (path, "view")

    raw_json = html.unescape(self.get_argument("json_data"))
    instance = Source.from_struct(json.loads(raw_json))
    update_json = self.get_argument("update_json", None)

    if not update_json:
        # no pending edit: save the form and go to the record's page
        self.send_message("Updated source. <a href=\"FIXME\">Undo</a>")
        form = SourceForm(self, instance=instance)
        form.save()
        handle = instance.handle
        self.redirect(self.app.make_url("/source/%(handle)s" % {"handle": handle}))
        return

    # edit the instance
    self.update_instance(instance, update_json)
    form = SourceForm(self, instance=instance)
    form.load_data()
    template_args = self.get_template_dict(tview=_("source detail"), action=action, page=page, search=search, form=form)
    self.render("source.html", **template_args)
Example #19
Source File: models.py From Pytition with BSD 3-Clause "New" or "Revised" License | 5 votes |
def raw_twitter_description(self):
    """Return ``self.twitter_description`` sanitized, stripped of tags and entity-decoded."""
    sanitized = sanitize_html(self.twitter_description)
    without_tags = strip_tags(sanitized)
    return html.unescape(mark_safe(without_tags))
Example #20
Source File: 0006_auto_20181015_0851.py From Pytition with BSD 3-Clause "New" or "Revised" License | 5 votes |
def myunescape(html_text):
    """Decode HTML entities in the text nodes of ``html_text``.

    The markup is parsed with ``html.parser``; every string found inside a
    tag is replaced by its unescaped form and the document is serialized
    again.

    :param html_text: HTML markup as a string.
    :return: the re-serialized markup.
    """
    soup = BeautifulSoup(html_text, 'html.parser')

    # `tag.strings` also yields the strings of nested tags, so walking every
    # tag meets the same string once per ancestor. Collect each only once.
    seen = set()
    strings = []
    for tag in soup.find_all():
        for s in tag.strings:
            if id(s) not in seen:
                seen.add(id(s))
                strings.append(s)

    for s in strings:
        try:
            s.replace_with(unescape(s))
        except ValueError:
            # Raised for a node that is no longer part of the tree; such a
            # string is skipped rather than aborting the conversion.
            continue
    return str(soup)
Example #21
Source File: 0007_auto_20190807_2221.py From Pytition with BSD 3-Clause "New" or "Revised" License | 5 votes |
def slugify_petitions(apps, schema_editor):
    """Create a slug for every petition that has none yet.

    The slug is built from the first 200 characters of the petition title
    after tags are stripped, whitespace trimmed and entities decoded.
    """
    petition_model = apps.get_model('petition', 'Petition')
    slug_model = apps.get_model('petition', 'SlugModel')

    for petition in petition_model.objects.all():
        if petition.slugmodel_set.count() != 0:
            continue  # already has at least one slug
        stripped = strip_tags(petition.title).strip()
        raw_title = html.unescape(mark_safe(stripped))
        slug_model.objects.create(slug=slugify(raw_title[:200]), petition=petition)
Example #22
Source File: data_preparation.py From Deep-Learning-with-TensorFlow-Second-Edition with MIT License | 5 votes |
def __clean_samples(self, samples):
    """
    Cleans samples.

    Each sample is HTML-unescaped, stripped of words starting with ``@`` or
    ``http://``, lower-cased, has punctuation replaced by spaces and runs of
    three or more identical letters reduced to one. Stopwords are removed
    only when a stopwords file is configured.

    :param samples: Samples to be cleaned
    :return: cleaned samples
    """
    print('Cleaning samples ...')
    # Prepare regex patterns
    reg_punct = re.compile('[' + re.escape(string.punctuation) + ']')
    reg_repeat = re.compile(r'([a-z])\1{2,}')
    # Stays None without a stopwords file, so the loop below can test it
    # safely instead of reading a variable that was never assigned.
    sw_pattern = None
    if self._stopwords_file is not None:
        stopwords = self.__read_stopwords()
        sw_pattern = re.compile(r'\b(' + '|'.join(stopwords) + r')\b')

    # Clean each sample
    ret = []
    for sample in samples:
        # Restore HTML characters
        text = html.unescape(sample)
        # Remove @users and urls
        # NOTE(review): 'https://' links are kept - confirm this is intended.
        words = [word for word in text.split()
                 if not word.startswith(('@', 'http://'))]
        text = ' '.join(words)
        # Transform to lowercase
        text = text.lower()
        # Remove punctuation symbols
        text = reg_punct.sub(' ', text)
        # Replace CC(C+) (a character occurring more than twice in a row) for C
        text = reg_repeat.sub(r'\1', text)
        # Remove stopwords
        if sw_pattern is not None:
            text = sw_pattern.sub('', text)
        ret.append(text)
    return ret
Example #23
Source File: search_pchome.py From web-crawler-tutorial with MIT License | 5 votes |
def get_items(json_dict):
    """Build a list of product dicts from a search response.

    Every entry of ``json_dict['prods']`` is turned into a dict with the
    keys ``name`` (HTML-unescaped), ``price`` (int), ``describe``,
    ``img_url`` and ``url``. Entries that lack a field or hold a value of
    the wrong type are skipped.

    :param json_dict: decoded JSON response containing a ``prods`` list.
    :return: list of product dicts, in input order.
    """
    item_list = []
    for item_obj in json_dict['prods']:
        try:
            item = {
                'name': html.unescape(item_obj['name']),
                'price': int(item_obj['price']),
                'describe': item_obj['describe'],
                'img_url': 'http://ec1img.pchome.com.tw/' + item_obj['picB'],
                'url': 'http://24h.pchome.com.tw/prod/' + item_obj['Id'],
            }
        except (KeyError, TypeError, ValueError):
            # Malformed record: skip it. Only the errors a bad record can
            # raise are caught, so unrelated bugs are no longer hidden.
            continue
        item_list.append(item)
    return item_list
Example #24
Source File: monitors.py From chirp with MIT License | 5 votes |
def get_monitor_details():
    """Return one monitor and its collected articles as JSON.

    The monitor is looked up by the cleaned ``id`` query argument. An
    unknown id yields ``{'success': False, 'error': ...}``. Otherwise the
    articles whose ``feed_source`` equals the monitor's RSS link are
    returned newest first, each with an unescaped ``title`` and a ``date``
    taken from the first ten characters of ``collected``.
    """
    monitor_id = paranoid_clean(request.args.get('id'))
    monitor_store = mongo.db[app.config['MONITORS_COLLECTION']]
    monitor = monitor_store.find_one({'hashed': monitor_id}, {'_id': 0})
    if not monitor:
        return jsonify({'success': False, 'error': 'Monitor was not found.'})

    article_store = mongo.db[app.config['ARTICLES_COLLECTION']]
    link = monitor['metadata']['rss_link']
    articles = list(article_store.find({'feed_source': link}, {'_id': 0}))
    for article in articles:
        article['title'] = html.unescape(article['title'])
        article['date'] = article['collected'][:10]
    articles.sort(key=lambda x: x['collected'], reverse=True)
    return jsonify({'success': True, 'monitor': monitor, 'articles': articles})
Example #25
Source File: advisory.py From arch-security-tracker with MIT License | 5 votes |
def advisory_fetch_from_mailman(url):
    """Fetch an advisory page and return the advisory text.

    Anchor tags are removed and entities decoded. The text returned runs
    from just after the ``<PRE>`` that precedes "Arch Linux Security
    Advisory" up to the "next part" separator, stripped of surrounding
    whitespace.

    :return: the advisory text, or ``None`` on a non-200 response or any
        error while fetching or extracting.
    """
    try:
        response = get(url)
        if response.status_code != 200:
            return None

        page = unescape(sub('</?A[^<]*?>', '', response.text))
        start = '<PRE>'
        start_marker = '{}Arch Linux Security Advisory'.format(start)
        end = '\n-------------- next part --------------'
        # begin after the <PRE> itself, keeping the advisory headline
        begin_at = page.index(start_marker) + len(start)
        end_at = page.index(end)
        return page[begin_at:end_at].strip()
    except Exception:
        return None
Example #26
Source File: cloudscraper.py From a4kScrapers with MIT License | 5 votes |
def unescape(html_text):
    """Decode HTML entities in ``html_text`` on any supported interpreter.

    Python 3.4 and later use ``html.unescape``; everything older uses
    ``HTMLParser().unescape``.
    """
    if sys.version_info >= (3, 4):
        return html.unescape(html_text)
    return HTMLParser().unescape(html_text)


# ------------------------------------------------------------------------------- #
# Decode Brotli on older versions of urllib3 manually
# ------------------------------------------------------------------------------- #
Example #27
Source File: utils.py From a4kScrapers with MIT License | 5 votes |
def normalize(string):
    """Decode entities and percent-escapes, then NFKD-normalize the text.

    Newlines are removed from the result.
    """
    decoded = unquote(unescape(string))
    return unicodedata.normalize("NFKD", decoded).replace('\n', '')
Example #28
Source File: views.py From ACE with Apache License 2.0 | 5 votes |
def query_message_ids():
    """Search every email archive section for a set of message ids.

    The ids come from the ``message_id`` observables of the alerts listed
    in the JSON ``alert_uuids`` request value, or directly from the JSON
    ``message_ids`` request value. They are HTML-unescaped before the
    search. The response is a JSON object mapping each archive section to
    the ``.json`` value of every search result.
    """
    if 'alert_uuids' in request.values:
        # a JSON formatted list of alert_uuids was passed: compute the
        # message_ids from the alerts
        alert_uuids = json.loads(request.values['alert_uuids'])
        placeholders = ','.join(['%s' for _ in alert_uuids])
        query = """SELECT o.value FROM observables o JOIN observable_mapping om ON o.id = om.observable_id JOIN alerts a ON om.alert_id = a.id WHERE o.type = 'message_id' AND a.uuid IN ( {} )""".format(placeholders)
        message_ids = []
        with get_db_connection() as db:
            c = db.cursor()
            c.execute(query, tuple(alert_uuids))
            for row in c:
                message_ids.append(row[0].decode(errors='ignore'))
    else:
        # otherwise we expect a JSON formatted list of message_ids
        message_ids = json.loads(request.values['message_ids'])

    import html
    message_ids = [html.unescape(_) for _ in message_ids]

    result = {}
    for source in get_email_archive_sections():
        hits = search_archive(source, message_ids, excluded_emails=saq.CONFIG['remediation']['excluded_emails'].split(','))
        result[source] = hits
        # keep only the JSON form of every archive hit
        for archive_id in hits.keys():
            hits[archive_id] = hits[archive_id].json

    response = make_response(json.dumps(result))
    response.mimetype = 'application/json'
    return response
Example #29
Source File: _simple.py From dephell with MIT License | 5 votes |
def _get_links(self, name: str) -> List[Dict[str, str]]:
    """Yield the archive links listed on the simple-index page of ``name``.

    Cached links are yielded when present. Otherwise the page is fetched
    and parsed; every ``<a>`` whose path ends with an archive extension is
    yielded as a dict with ``url``, ``name``, ``python`` and ``digest``,
    and the full list is written to the cache at the end.

    :raises PackageNotFoundError: when the index answers with 404.
    """
    cache = JSONCache(
        'warehouse-simple', urlparse(self.url).hostname, 'links', name,
        ttl=config['cache']['ttl'],
    )
    links = cache.load()
    if links is not None:
        yield from links
        return

    dep_url = posixpath.join(self.url, quote(name)) + '/'
    with requests_session() as session:
        logger.debug('getting dep info from simple repo', extra=dict(url=dep_url))
        response = session.get(dep_url, auth=self.auth)
    if response.status_code == 404:
        raise PackageNotFoundError(package=name, url=dep_url)
    response.raise_for_status()
    document = html5lib.parse(response.text, namespaceHTMLElements=False)

    links = []
    for anchor in document.findall('.//a'):
        href = anchor.get('href')
        if not href:
            continue
        parsed = urlparse(href)
        if not parsed.path.endswith(ARCHIVE_EXTENSIONS):
            continue

        requires_python = anchor.get('data-requires-python')
        fragment = parse_qs(parsed.fragment)
        if 'sha256' in fragment:
            digest = fragment['sha256'][0]
        else:
            digest = None
        link = {
            'url': urljoin(dep_url, href),
            'name': parsed.path.strip('/').split('/')[-1],
            'python': html.unescape(requires_python) if requires_python else '*',
            'digest': digest,
        }
        links.append(link)
        yield link

    cache.dump(links)
    return links
Example #30
Source File: strawpoll.py From lrrbot with Apache License 2.0 | 5 votes |
async def new_poll(lrrbot, conn, event, respond_to, multi, timeout, poll_id, title, options, tag=None):
    """
    Command: !poll N https://www.strawpoll.me/ID
    Command: !poll N TITLE: OPTION1; OPTION2
    Command: !multipoll N TITLE: OPTION1; OPTION2
    Section: misc

    Start a new Strawpoll poll. Post results in N seconds. Multiple polls can be active at the
    same time.
    """
    # Declared `async def` because the body awaits the event server call;
    # `await` inside a plain `def` is a SyntaxError.
    if poll_id is not None:
        # An existing poll was given: only its title has to be looked up.
        url = "https://www.strawpoll.me/api/v2/polls/%s" % poll_id
        data = json.loads(common.http.request(url))
        title = html.unescape(data["title"])
    else:
        if title is None:
            title = "LoadingReadyLive poll"
        # Options are separated by ';', else by ',', else by whitespace.
        if ';' in options:
            options = [option.strip() for option in options.split(';')]
        elif ',' in options:
            options = [option.strip() for option in options.split(',')]
        else:
            options = options.split()
        data = json.dumps({"options": options, "title": title, "multi": multi is not None})
        data = json.loads(common.http.request(
            "https://www.strawpoll.me/api/v2/polls", data, "POST",
            headers={"Content-Type": "application/json"}))
        poll_id = data["id"]

    if timeout is not None:
        timeout = int(timeout)
    else:
        timeout = DEFAULT_TIMEOUT
    end = time.time() + int(timeout)

    # NB: need to assign to lrrbot.polls, rather than using lrrbot.polls.append,
    # so that the state change gets saved properly
    lrrbot.polls = lrrbot.polls + [(end, title, poll_id, respond_to, tag)]
    conn.privmsg(respond_to, "New poll: %s (https://www.strawpoll.me/%s%s): %s from now" % (title, poll_id, space.SPACE, common.time.nice_duration(timeout, 1)))

    if tag is not None:
        data['tag'] = tag
    await common.rpc.eventserver.event('strawpoll-add', data)