Python bs4.dammit.contains_replacement_characters() Examples
The following are 21
code examples of bs4.dammit.contains_replacement_characters().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
bs4.dammit
, or try the search function
.
Example #1
Source File: test_soup.py From CrisisMappingToolkit with Apache License 2.0 | 5 votes |
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None bs4.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue(u"\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
Example #2
Source File: test_soup.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None bs4.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue("\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
Example #3
Source File: test_soup.py From moviegrabber with GNU General Public License v3.0 | 5 votes |
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None bs4.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue(u"\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
Example #4
Source File: test_soup.py From Tautulli with GNU General Public License v3.0 | 5 votes |
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None bs4.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue("\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
Example #5
Source File: test_soup.py From bazarr with GNU General Public License v3.0 | 5 votes |
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None bs4.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue("\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
Example #6
Source File: test_soup.py From bazarr with GNU General Public License v3.0 | 5 votes |
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None bs4.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue(u"\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
Example #7
Source File: test_soup.py From MIA-Dictionary-Addon with GNU General Public License v3.0 | 5 votes |
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None bs4.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue("\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
Example #8
Source File: test_soup.py From POC-EXP with GNU General Public License v3.0 | 5 votes |
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None bs4.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue(u"\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
Example #9
Source File: test_soup.py From python-for-android with Apache License 2.0 | 5 votes |
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = bs4.dammit.chardet try: bs4.dammit.chardet = None with warnings.catch_warnings(record=True) as w: dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue("\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) msg = w[0].message self.assertTrue(isinstance(msg, UnicodeWarning)) self.assertTrue("Some characters could not be decoded" in str(msg)) finally: bs4.dammit.chardet = chardet
Example #10
Source File: test_soup.py From ru with GNU General Public License v2.0 | 5 votes |
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None bs4.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue(u"\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
Example #11
Source File: test_soup.py From MARA_Framework with GNU Lesser General Public License v3.0 | 5 votes |
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None bs4.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue(u"\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
Example #12
Source File: test_soup.py From ServerlessCrawler-VancouverRealState with MIT License | 5 votes |
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None bs4.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue(u"\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
Example #13
Source File: test_soup.py From nbaplus-server with Apache License 2.0 | 5 votes |
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None bs4.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue(u"\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
Example #14
Source File: test_soup.py From stopstalk-deployment with MIT License | 5 votes |
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None bs4.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue(u"\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
Example #15
Source File: test_soup.py From B.E.N.J.I. with MIT License | 5 votes |
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None bs4.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue("\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
Example #16
Source File: test_soup.py From Gank-Alfred-Workflow with MIT License | 5 votes |
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = bs4.dammit.chardet try: bs4.dammit.chardet = None with warnings.catch_warnings(record=True) as w: dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue(u"\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) msg = w[0].message self.assertTrue(isinstance(msg, UnicodeWarning)) self.assertTrue("Some characters could not be decoded" in str(msg)) finally: bs4.dammit.chardet = chardet
Example #17
Source File: test_soup.py From fuzzdb-collect with GNU General Public License v3.0 | 5 votes |
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None bs4.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue(u"\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
Example #18
Source File: test_soup.py From locality-sensitive-hashing with MIT License | 5 votes |
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None bs4.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue(u"\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
Example #19
Source File: test_soup.py From pledgeservice with Apache License 2.0 | 5 votes |
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None bs4.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue(u"\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
Example #20
Source File: test_soup.py From ServerlessCrawler-VancouverRealState with MIT License | 5 votes |
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None bs4.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue(u"\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
Example #21
Source File: test_soup.py From ServerlessCrawler-VancouverRealState with MIT License | 5 votes |
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None bs4.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue(u"\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet