Python bs4.dammit.unicode_markup() Examples
The following are 30
code examples of bs4.dammit.unicode_markup().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
bs4.dammit
, or try the search function
.
Example #1
Source File: test_soup.py From Gank-Alfred-Workflow with MIT License | 5 votes |
def test_smart_quotes_to_ascii(self): markup = b"<foo>\x91\x92\x93\x94</foo>" dammit = UnicodeDammit(markup, smart_quotes_to="ascii") self.assertEqual( dammit.unicode_markup, """<foo>''""</foo>""")
Example #2
Source File: test_soup.py From stopstalk-deployment with MIT License | 5 votes |
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None bs4.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue(u"\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
Example #3
Source File: test_soup.py From stopstalk-deployment with MIT License | 5 votes |
def test_smart_quotes_to_unicode(self): markup = b"<foo>\x91\x92\x93\x94</foo>" dammit = UnicodeDammit(markup) self.assertEqual( dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")
Example #4
Source File: test_soup.py From nbaplus-server with Apache License 2.0 | 5 votes |
def test_unicode_input(self): markup = u"I'm already Unicode! \N{SNOWMAN}" dammit = UnicodeDammit(markup) self.assertEqual(dammit.unicode_markup, markup)
Example #5
Source File: test_soup.py From nbaplus-server with Apache License 2.0 | 5 votes |
def test_smart_quotes_to_unicode(self): markup = b"<foo>\x91\x92\x93\x94</foo>" dammit = UnicodeDammit(markup) self.assertEqual( dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")
Example #6
Source File: test_soup.py From stopstalk-deployment with MIT License | 5 votes |
def test_byte_order_mark_removed(self): # A document written in UTF-16LE will have its byte order marker stripped. data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' dammit = UnicodeDammit(data) self.assertEqual(u"<a>áé</a>", dammit.unicode_markup) self.assertEqual("utf-16le", dammit.original_encoding)
Example #7
Source File: test_soup.py From stopstalk-deployment with MIT License | 5 votes |
def test_convert_hebrew(self): hebrew = b"\xed\xe5\xec\xf9" dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')
Example #8
Source File: test_soup.py From stopstalk-deployment with MIT License | 5 votes |
def test_smart_quotes_to_ascii(self): markup = b"<foo>\x91\x92\x93\x94</foo>" dammit = UnicodeDammit(markup, smart_quotes_to="ascii") self.assertEqual( dammit.unicode_markup, """<foo>''""</foo>""")
Example #9
Source File: test_soup.py From stopstalk-deployment with MIT License | 5 votes |
def test_smart_quotes_to_html_entities(self): markup = b"<foo>\x91\x92\x93\x94</foo>" dammit = UnicodeDammit(markup, smart_quotes_to="html") self.assertEqual( dammit.unicode_markup, "<foo>‘’“”</foo>")
Example #10
Source File: test_soup.py From stopstalk-deployment with MIT License | 5 votes |
def test_smart_quotes_to_xml_entities(self): markup = b"<foo>\x91\x92\x93\x94</foo>" dammit = UnicodeDammit(markup, smart_quotes_to="xml") self.assertEqual( dammit.unicode_markup, "<foo>‘’“”</foo>")
Example #11
Source File: test_soup.py From stopstalk-deployment with MIT License | 5 votes |
def test_dont_see_smart_quotes_where_there_are_none(self): utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" dammit = UnicodeDammit(utf_8) self.assertEqual(dammit.original_encoding.lower(), 'utf-8') self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
Example #12
Source File: test_soup.py From Gank-Alfred-Workflow with MIT License | 5 votes |
def test_sniffed_xml_encoding(self): # A document written in UTF-16LE will be converted by a different # code path that sniffs the byte order markers. data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' dammit = UnicodeDammit(data) self.assertEqual(u"<a>áé</a>", dammit.unicode_markup) self.assertEqual("utf-16le", dammit.original_encoding)
Example #13
Source File: test_soup.py From Gank-Alfred-Workflow with MIT License | 5 votes |
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = bs4.dammit.chardet try: bs4.dammit.chardet = None with warnings.catch_warnings(record=True) as w: dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue(u"\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) msg = w[0].message self.assertTrue(isinstance(msg, UnicodeWarning)) self.assertTrue("Some characters could not be decoded" in str(msg)) finally: bs4.dammit.chardet = chardet
Example #14
Source File: test_soup.py From Gank-Alfred-Workflow with MIT License | 5 votes |
def test_dont_see_smart_quotes_where_there_are_none(self): utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" dammit = UnicodeDammit(utf_8) self.assertEqual(dammit.original_encoding, 'utf-8') self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
Example #15
Source File: test_soup.py From Gank-Alfred-Workflow with MIT License | 5 votes |
def test_detect_utf8(self): utf8 = b"\xc3\xa9" dammit = UnicodeDammit(utf8) self.assertEqual(dammit.unicode_markup, u'\xe9') self.assertEqual(dammit.original_encoding, 'utf-8')
Example #16
Source File: test_soup.py From B.E.N.J.I. with MIT License | 5 votes |
def test_unicode_input(self): markup = "I'm already Unicode! \N{SNOWMAN}" dammit = UnicodeDammit(markup) self.assertEqual(dammit.unicode_markup, markup)
Example #17
Source File: test_soup.py From Gank-Alfred-Workflow with MIT License | 5 votes |
def test_smart_quotes_to_html_entities(self): markup = b"<foo>\x91\x92\x93\x94</foo>" dammit = UnicodeDammit(markup, smart_quotes_to="html") self.assertEqual( dammit.unicode_markup, "<foo>‘’“”</foo>")
Example #18
Source File: test_soup.py From Gank-Alfred-Workflow with MIT License | 5 votes |
def test_smart_quotes_to_xml_entities(self): markup = b"<foo>\x91\x92\x93\x94</foo>" dammit = UnicodeDammit(markup, smart_quotes_to="xml") self.assertEqual( dammit.unicode_markup, "<foo>‘’“”</foo>")
Example #19
Source File: test_soup.py From Gank-Alfred-Workflow with MIT License | 5 votes |
def test_smart_quotes_to_unicode(self): markup = b"<foo>\x91\x92\x93\x94</foo>" dammit = UnicodeDammit(markup) self.assertEqual( dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")
Example #20
Source File: test_soup.py From fuzzdb-collect with GNU General Public License v3.0 | 5 votes |
def test_byte_order_mark_removed(self): # A document written in UTF-16LE will have its byte order marker stripped. data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' dammit = UnicodeDammit(data) self.assertEqual(u"<a>áé</a>", dammit.unicode_markup) self.assertEqual("utf-16le", dammit.original_encoding)
Example #21
Source File: test_soup.py From fuzzdb-collect with GNU General Public License v3.0 | 5 votes |
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None bs4.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue(u"\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
Example #22
Source File: test_soup.py From fuzzdb-collect with GNU General Public License v3.0 | 5 votes |
def test_dont_see_smart_quotes_where_there_are_none(self): utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" dammit = UnicodeDammit(utf_8) self.assertEqual(dammit.original_encoding.lower(), 'utf-8') self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
Example #23
Source File: test_soup.py From fuzzdb-collect with GNU General Public License v3.0 | 5 votes |
def test_convert_hebrew(self): hebrew = b"\xed\xe5\xec\xf9" dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')
Example #24
Source File: test_soup.py From fuzzdb-collect with GNU General Public License v3.0 | 5 votes |
def test_smart_quotes_to_ascii(self): markup = b"<foo>\x91\x92\x93\x94</foo>" dammit = UnicodeDammit(markup, smart_quotes_to="ascii") self.assertEqual( dammit.unicode_markup, """<foo>''""</foo>""")
Example #25
Source File: test_soup.py From fuzzdb-collect with GNU General Public License v3.0 | 5 votes |
def test_smart_quotes_to_html_entities(self): markup = b"<foo>\x91\x92\x93\x94</foo>" dammit = UnicodeDammit(markup, smart_quotes_to="html") self.assertEqual( dammit.unicode_markup, "<foo>‘’“”</foo>")
Example #26
Source File: test_soup.py From fuzzdb-collect with GNU General Public License v3.0 | 5 votes |
def test_smart_quotes_to_xml_entities(self): markup = b"<foo>\x91\x92\x93\x94</foo>" dammit = UnicodeDammit(markup, smart_quotes_to="xml") self.assertEqual( dammit.unicode_markup, "<foo>‘’“”</foo>")
Example #27
Source File: test_soup.py From fuzzdb-collect with GNU General Public License v3.0 | 5 votes |
def test_smart_quotes_to_unicode(self): markup = b"<foo>\x91\x92\x93\x94</foo>" dammit = UnicodeDammit(markup) self.assertEqual( dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")
Example #28
Source File: test_soup.py From fuzzdb-collect with GNU General Public License v3.0 | 5 votes |
def test_unicode_input(self): markup = u"I'm already Unicode! \N{SNOWMAN}" dammit = UnicodeDammit(markup) self.assertEqual(dammit.unicode_markup, markup)
Example #29
Source File: test_soup.py From locality-sensitive-hashing with MIT License | 5 votes |
def test_byte_order_mark_removed(self): # A document written in UTF-16LE will have its byte order marker stripped. data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' dammit = UnicodeDammit(data) self.assertEqual(u"<a>áé</a>", dammit.unicode_markup) self.assertEqual("utf-16le", dammit.original_encoding)
Example #30
Source File: test_soup.py From locality-sensitive-hashing with MIT License | 5 votes |
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None bs4.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue(u"\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet