Python bs4.dammit.original_encoding() Examples
The following are 30
code examples of bs4.dammit.original_encoding().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
bs4.dammit
, or try the search function
.
Example #1
Source File: test_soup.py From ServerlessCrawler-VancouverRealState with MIT License | 6 votes |
def test_ascii_in_unicode_out(self): # ASCII input is converted to Unicode. The original_encoding # attribute is set to 'utf-8', a superset of ASCII. chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None # Disable chardet, which will realize that the ASCII is ASCII. bs4.dammit.chardet_dammit = noop ascii = b"<foo>a</foo>" soup_from_ascii = self.soup(ascii) unicode_output = soup_from_ascii.decode() self.assertTrue(isinstance(unicode_output, unicode)) self.assertEqual(unicode_output, self.document_for(ascii.decode())) self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
Example #2
Source File: test_soup.py From locality-sensitive-hashing with MIT License | 6 votes |
def test_ascii_in_unicode_out(self): # ASCII input is converted to Unicode. The original_encoding # attribute is set to 'utf-8', a superset of ASCII. chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None # Disable chardet, which will realize that the ASCII is ASCII. bs4.dammit.chardet_dammit = noop ascii = b"<foo>a</foo>" soup_from_ascii = self.soup(ascii) unicode_output = soup_from_ascii.decode() self.assertTrue(isinstance(unicode_output, unicode)) self.assertEqual(unicode_output, self.document_for(ascii.decode())) self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
Example #3
Source File: test_soup.py From fuzzdb-collect with GNU General Public License v3.0 | 6 votes |
def test_ascii_in_unicode_out(self): # ASCII input is converted to Unicode. The original_encoding # attribute is set to 'utf-8', a superset of ASCII. chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None # Disable chardet, which will realize that the ASCII is ASCII. bs4.dammit.chardet_dammit = noop ascii = b"<foo>a</foo>" soup_from_ascii = self.soup(ascii) unicode_output = soup_from_ascii.decode() self.assertTrue(isinstance(unicode_output, unicode)) self.assertEqual(unicode_output, self.document_for(ascii.decode())) self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
Example #4
Source File: test_soup.py From B.E.N.J.I. with MIT License | 6 votes |
def test_ascii_in_unicode_out(self): # ASCII input is converted to Unicode. The original_encoding # attribute is set to 'utf-8', a superset of ASCII. chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None # Disable chardet, which will realize that the ASCII is ASCII. bs4.dammit.chardet_dammit = noop ascii = b"<foo>a</foo>" soup_from_ascii = self.soup(ascii) unicode_output = soup_from_ascii.decode() self.assertTrue(isinstance(unicode_output, str)) self.assertEqual(unicode_output, self.document_for(ascii.decode())) self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
Example #5
Source File: test_soup.py From ServerlessCrawler-VancouverRealState with MIT License | 6 votes |
def test_ascii_in_unicode_out(self): # ASCII input is converted to Unicode. The original_encoding # attribute is set to 'utf-8', a superset of ASCII. chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None # Disable chardet, which will realize that the ASCII is ASCII. bs4.dammit.chardet_dammit = noop ascii = b"<foo>a</foo>" soup_from_ascii = self.soup(ascii) unicode_output = soup_from_ascii.decode() self.assertTrue(isinstance(unicode_output, unicode)) self.assertEqual(unicode_output, self.document_for(ascii.decode())) self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
Example #6
Source File: test_soup.py From ServerlessCrawler-VancouverRealState with MIT License | 6 votes |
def test_ascii_in_unicode_out(self): # ASCII input is converted to Unicode. The original_encoding # attribute is set to 'utf-8', a superset of ASCII. chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None # Disable chardet, which will realize that the ASCII is ASCII. bs4.dammit.chardet_dammit = noop ascii = b"<foo>a</foo>" soup_from_ascii = self.soup(ascii) unicode_output = soup_from_ascii.decode() self.assertTrue(isinstance(unicode_output, unicode)) self.assertEqual(unicode_output, self.document_for(ascii.decode())) self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
Example #7
Source File: test_soup.py From pledgeservice with Apache License 2.0 | 6 votes |
def test_ascii_in_unicode_out(self): # ASCII input is converted to Unicode. The original_encoding # attribute is set to 'utf-8', a superset of ASCII. chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None # Disable chardet, which will realize that the ASCII is ASCII. bs4.dammit.chardet_dammit = noop ascii = b"<foo>a</foo>" soup_from_ascii = self.soup(ascii) unicode_output = soup_from_ascii.decode() self.assertTrue(isinstance(unicode_output, unicode)) self.assertEqual(unicode_output, self.document_for(ascii.decode())) self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
Example #8
Source File: test_soup.py From Gank-Alfred-Workflow with MIT License | 5 votes |
def test_ignore_invalid_codecs(self): utf8_data = u"Räksmörgås".encode("utf-8") for bad_encoding in ['.utf8', '...', 'utF---16.!']: dammit = UnicodeDammit(utf8_data, [bad_encoding]) self.assertEqual(dammit.original_encoding, 'utf-8')
Example #9
Source File: test_soup.py From fuzzdb-collect with GNU General Public License v3.0 | 5 votes |
def test_detect_utf8(self): utf8 = b"\xc3\xa9" dammit = UnicodeDammit(utf8) self.assertEqual(dammit.unicode_markup, u'\xe9') self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
Example #10
Source File: test_soup.py From fuzzdb-collect with GNU General Public License v3.0 | 5 votes |
def test_dont_see_smart_quotes_where_there_are_none(self): utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" dammit = UnicodeDammit(utf_8) self.assertEqual(dammit.original_encoding.lower(), 'utf-8') self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
Example #11
Source File: test_soup.py From fuzzdb-collect with GNU General Public License v3.0 | 5 votes |
def test_ignore_inappropriate_codecs(self): utf8_data = u"Räksmörgås".encode("utf-8") dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
Example #12
Source File: test_soup.py From B.E.N.J.I. with MIT License | 5 votes |
def test_fromEncoding_renamed_to_from_encoding(self): with warnings.catch_warnings(record=True) as w: utf8 = b"\xc3\xa9" soup = self.soup(utf8, fromEncoding="utf8") msg = str(w[0].message) self.assertTrue("fromEncoding" in msg) self.assertTrue("from_encoding" in msg) self.assertEqual("utf8", soup.original_encoding)
Example #13
Source File: test_soup.py From fuzzdb-collect with GNU General Public License v3.0 | 5 votes |
def test_fromEncoding_renamed_to_from_encoding(self): with warnings.catch_warnings(record=True) as w: utf8 = b"\xc3\xa9" soup = self.soup(utf8, fromEncoding="utf8") msg = str(w[0].message) self.assertTrue("fromEncoding" in msg) self.assertTrue("from_encoding" in msg) self.assertEqual("utf8", soup.original_encoding)
Example #14
Source File: test_soup.py From Gank-Alfred-Workflow with MIT License | 5 votes |
def test_detect_html5_style_meta_tag(self): for data in ( b'<html><meta charset="euc-jp" /></html>', b"<html><meta charset='euc-jp' /></html>", b"<html><meta charset=euc-jp /></html>", b"<html><meta charset=euc-jp/></html>"): dammit = UnicodeDammit(data, is_html=True) self.assertEqual( "euc-jp", dammit.original_encoding)
Example #15
Source File: test_soup.py From Gank-Alfred-Workflow with MIT License | 5 votes |
def test_sniffed_xml_encoding(self): # A document written in UTF-16LE will be converted by a different # code path that sniffs the byte order markers. data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' dammit = UnicodeDammit(data) self.assertEqual(u"<a>áé</a>", dammit.unicode_markup) self.assertEqual("utf-16le", dammit.original_encoding)
Example #16
Source File: test_soup.py From B.E.N.J.I. with MIT License | 5 votes |
def test_exclude_encodings(self): utf8_data = "Räksmörgås".encode("utf-8") soup = self.soup(utf8_data, exclude_encodings=["utf-8"]) self.assertEqual("windows-1252", soup.original_encoding)
Example #17
Source File: test_soup.py From B.E.N.J.I. with MIT License | 5 votes |
def test_utf8_in_unicode_out(self): # UTF-8 input is converted to Unicode. The original_encoding # attribute is set. soup_from_utf8 = self.soup(self.utf8_data) self.assertEqual(soup_from_utf8.decode(), self.unicode_data) self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')
Example #18
Source File: test_soup.py From B.E.N.J.I. with MIT License | 5 votes |
def test_dont_see_smart_quotes_where_there_are_none(self): utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" dammit = UnicodeDammit(utf_8) self.assertEqual(dammit.original_encoding.lower(), 'utf-8') self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
Example #19
Source File: test_soup.py From fuzzdb-collect with GNU General Public License v3.0 | 5 votes |
def test_utf8_in_unicode_out(self): # UTF-8 input is converted to Unicode. The original_encoding # attribute is set. soup_from_utf8 = self.soup(self.utf8_data) self.assertEqual(soup_from_utf8.decode(), self.unicode_data) self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!')
Example #20
Source File: test_soup.py From fuzzdb-collect with GNU General Public License v3.0 | 5 votes |
def test_unicode_in_unicode_out(self): # Unicode input is left alone. The original_encoding attribute # is not set. soup_from_unicode = self.soup(self.unicode_data) self.assertEqual(soup_from_unicode.decode(), self.unicode_data) self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!') self.assertEqual(soup_from_unicode.original_encoding, None)
Example #21
Source File: test_soup.py From fuzzdb-collect with GNU General Public License v3.0 | 5 votes |
def test_detect_html5_style_meta_tag(self): for data in ( b'<html><meta charset="euc-jp" /></html>', b"<html><meta charset='euc-jp' /></html>", b"<html><meta charset=euc-jp /></html>", b"<html><meta charset=euc-jp/></html>"): dammit = UnicodeDammit(data, is_html=True) self.assertEqual( "euc-jp", dammit.original_encoding)
Example #22
Source File: test_soup.py From locality-sensitive-hashing with MIT License | 5 votes |
def test_byte_order_mark_removed(self): # A document written in UTF-16LE will have its byte order marker stripped. data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' dammit = UnicodeDammit(data) self.assertEqual(u"<a>áé</a>", dammit.unicode_markup) self.assertEqual("utf-16le", dammit.original_encoding)
Example #23
Source File: test_soup.py From locality-sensitive-hashing with MIT License | 5 votes |
def test_detect_html5_style_meta_tag(self): for data in ( b'<html><meta charset="euc-jp" /></html>', b"<html><meta charset='euc-jp' /></html>", b"<html><meta charset=euc-jp /></html>", b"<html><meta charset=euc-jp/></html>"): dammit = UnicodeDammit(data, is_html=True) self.assertEqual( "euc-jp", dammit.original_encoding)
Example #24
Source File: test_soup.py From locality-sensitive-hashing with MIT License | 5 votes |
def test_ignore_invalid_codecs(self): utf8_data = u"Räksmörgås".encode("utf-8") for bad_encoding in ['.utf8', '...', 'utF---16.!']: dammit = UnicodeDammit(utf8_data, [bad_encoding]) self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
Example #25
Source File: test_soup.py From locality-sensitive-hashing with MIT License | 5 votes |
def test_ignore_inappropriate_codecs(self): utf8_data = u"Räksmörgås".encode("utf-8") dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
Example #26
Source File: test_soup.py From locality-sensitive-hashing with MIT License | 5 votes |
def test_dont_see_smart_quotes_where_there_are_none(self): utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" dammit = UnicodeDammit(utf_8) self.assertEqual(dammit.original_encoding.lower(), 'utf-8') self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
Example #27
Source File: test_soup.py From locality-sensitive-hashing with MIT License | 5 votes |
def test_detect_utf8(self): utf8 = b"\xc3\xa9" dammit = UnicodeDammit(utf8) self.assertEqual(dammit.unicode_markup, u'\xe9') self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
Example #28
Source File: test_soup.py From locality-sensitive-hashing with MIT License | 5 votes |
def test_utf8_in_unicode_out(self): # UTF-8 input is converted to Unicode. The original_encoding # attribute is set. soup_from_utf8 = self.soup(self.utf8_data) self.assertEqual(soup_from_utf8.decode(), self.unicode_data) self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!')
Example #29
Source File: test_soup.py From locality-sensitive-hashing with MIT License | 5 votes |
def test_unicode_in_unicode_out(self): # Unicode input is left alone. The original_encoding attribute # is not set. soup_from_unicode = self.soup(self.unicode_data) self.assertEqual(soup_from_unicode.decode(), self.unicode_data) self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!') self.assertEqual(soup_from_unicode.original_encoding, None)
Example #30
Source File: test_soup.py From locality-sensitive-hashing with MIT License | 5 votes |
def test_fromEncoding_renamed_to_from_encoding(self): with warnings.catch_warnings(record=True) as w: utf8 = b"\xc3\xa9" soup = self.soup(utf8, fromEncoding="utf8") msg = str(w[0].message) self.assertTrue("fromEncoding" in msg) self.assertTrue("from_encoding" in msg) self.assertEqual("utf8", soup.original_encoding)