Python bs4.dammit.UnicodeDammit() Examples
The following are 30
code examples of bs4.dammit.UnicodeDammit().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
bs4.dammit
, or try the search function
.
Example #1
Source File: _htmlparser.py From ServerlessCrawler-VancouverRealState with MIT License | 6 votes |
def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None, exclude_encodings=None): """ :return: A 4-tuple (markup, original encoding, encoding declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). """ if isinstance(markup, unicode): yield (markup, None, None, False) return try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True, exclude_encodings=exclude_encodings) yield (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters)
Example #2
Source File: _htmlparser.py From POC-EXP with GNU General Public License v3.0 | 6 votes |
def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None): """ :return: A 4-tuple (markup, original encoding, encoding declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). """ if isinstance(markup, unicode): yield (markup, None, None, False) return try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True) yield (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters)
Example #3
Source File: _htmlparser.py From MIA-Dictionary-Addon with GNU General Public License v3.0 | 6 votes |
def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None, exclude_encodings=None): """ :return: A 4-tuple (markup, original encoding, encoding declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). """ if isinstance(markup, str): yield (markup, None, None, False) return try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True, exclude_encodings=exclude_encodings) yield (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters)
Example #4
Source File: _htmlparser.py From bazarr with GNU General Public License v3.0 | 6 votes |
def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None, exclude_encodings=None): """ :return: A 4-tuple (markup, original encoding, encoding declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). """ if isinstance(markup, unicode): yield (markup, None, None, False) return try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True, exclude_encodings=exclude_encodings) yield (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters)
Example #5
Source File: _htmlparser.py From ru with GNU General Public License v2.0 | 6 votes |
def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None): """ :return: A 4-tuple (markup, original encoding, encoding declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). """ if isinstance(markup, unicode): yield (markup, None, None, False) return try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True) yield (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters)
Example #6
Source File: _htmlparser.py From MARA_Framework with GNU Lesser General Public License v3.0 | 6 votes |
def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None): """ :return: A 4-tuple (markup, original encoding, encoding declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). """ if isinstance(markup, unicode): yield (markup, None, None, False) return try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True) yield (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters)
Example #7
Source File: _htmlparser.py From CrisisMappingToolkit with Apache License 2.0 | 6 votes |
def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None, exclude_encodings=None): """ :return: A 4-tuple (markup, original encoding, encoding declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). """ if isinstance(markup, unicode): yield (markup, None, None, False) return try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True, exclude_encodings=exclude_encodings) yield (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters)
Example #8
Source File: _htmlparser.py From nbaplus-server with Apache License 2.0 | 6 votes |
def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None): """ :return: A 4-tuple (markup, original encoding, encoding declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). """ if isinstance(markup, unicode): yield (markup, None, None, False) return try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True) yield (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters)
Example #9
Source File: _htmlparser.py From stopstalk-deployment with MIT License | 6 votes |
def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None, exclude_encodings=None): """ :return: A 4-tuple (markup, original encoding, encoding declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). """ if isinstance(markup, unicode): yield (markup, None, None, False) return try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True, exclude_encodings=exclude_encodings) yield (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters)
Example #10
Source File: _htmlparser.py From nzb-subliminal with GNU General Public License v3.0 | 6 votes |
def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None): """ :return: A 4-tuple (markup, original encoding, encoding declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). """ if isinstance(markup, unicode): yield (markup, None, None, False) return try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True) yield (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters)
Example #11
Source File: _htmlparser.py From B.E.N.J.I. with MIT License | 6 votes |
def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None, exclude_encodings=None): """ :return: A 4-tuple (markup, original encoding, encoding declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). """ if isinstance(markup, str): yield (markup, None, None, False) return try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True, exclude_encodings=exclude_encodings) yield (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters)
Example #12
Source File: _htmlparser.py From bazarr with GNU General Public License v3.0 | 6 votes |
def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None, exclude_encodings=None): """ :return: A 4-tuple (markup, original encoding, encoding declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). """ if isinstance(markup, str): yield (markup, None, None, False) return try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True, exclude_encodings=exclude_encodings) yield (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters)
Example #13
Source File: _htmlparser.py From Tautulli with GNU General Public License v3.0 | 6 votes |
def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None, exclude_encodings=None): """ :return: A 4-tuple (markup, original encoding, encoding declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). """ if isinstance(markup, str): yield (markup, None, None, False) return try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True, exclude_encodings=exclude_encodings) yield (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters)
Example #14
Source File: _htmlparser.py From svg-animation-tools with MIT License | 6 votes |
def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None, exclude_encodings=None): """ :return: A 4-tuple (markup, original encoding, encoding declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). """ if isinstance(markup, unicode): yield (markup, None, None, False) return try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True, exclude_encodings=exclude_encodings) yield (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters)
Example #15
Source File: _htmlparser.py From svg-animation-tools with MIT License | 6 votes |
def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None, exclude_encodings=None): """ :return: A 4-tuple (markup, original encoding, encoding declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). """ if isinstance(markup, unicode): yield (markup, None, None, False) return try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True, exclude_encodings=exclude_encodings) yield (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters)
Example #16
Source File: _htmlparser.py From moviegrabber with GNU General Public License v3.0 | 6 votes |
def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None): """ :return: A 4-tuple (markup, original encoding, encoding declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). """ if isinstance(markup, unicode): yield (markup, None, None, False) return try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True) yield (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters)
Example #17
Source File: _htmlparser.py From ServerlessCrawler-VancouverRealState with MIT License | 6 votes |
def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None, exclude_encodings=None): """ :return: A 4-tuple (markup, original encoding, encoding declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). """ if isinstance(markup, unicode): yield (markup, None, None, False) return try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True, exclude_encodings=exclude_encodings) yield (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters)
Example #18
Source File: _htmlparser.py From fuzzdb-collect with GNU General Public License v3.0 | 6 votes |
def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None): """ :return: A 4-tuple (markup, original encoding, encoding declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). """ if isinstance(markup, unicode): yield (markup, None, None, False) return try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True) yield (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters)
Example #19
Source File: _htmlparser.py From locality-sensitive-hashing with MIT License | 6 votes |
def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None): """ :return: A 4-tuple (markup, original encoding, encoding declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). """ if isinstance(markup, unicode): yield (markup, None, None, False) return try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True) yield (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters)
Example #20
Source File: _htmlparser.py From pledgeservice with Apache License 2.0 | 6 votes |
def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None): """ :return: A 4-tuple (markup, original encoding, encoding declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). """ if isinstance(markup, unicode): yield (markup, None, None, False) return try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True) yield (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters)
Example #21
Source File: _htmlparser.py From ServerlessCrawler-VancouverRealState with MIT License | 6 votes |
def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None, exclude_encodings=None): """ :return: A 4-tuple (markup, original encoding, encoding declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). """ if isinstance(markup, unicode): yield (markup, None, None, False) return try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True, exclude_encodings=exclude_encodings) yield (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters)
Example #22
Source File: _lxml.py From Crunchyroll-XML-Decoder with GNU General Public License v2.0 | 5 votes |
def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None): """ :return: A 3-tuple (markup, original encoding, encoding declared within markup). """ if isinstance(markup, unicode): return markup, None, None, False try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True) return (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters)
Example #23
Source File: _htmlparser.py From Crunchyroll-XML-Decoder with GNU General Public License v2.0 | 5 votes |
def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None): """ :return: A 4-tuple (markup, original encoding, encoding declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). """ if isinstance(markup, unicode): return markup, None, None, False try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True) return (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters)
Example #24
Source File: _htmlparser.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None, exclude_encodings=None): """Run any preliminary steps necessary to make incoming markup acceptable to the parser. :param markup: Some markup -- probably a bytestring. :param user_specified_encoding: The user asked to try this encoding. :param document_declared_encoding: The markup itself claims to be in this encoding. :param exclude_encodings: The user asked _not_ to try any of these encodings. :yield: A series of 4-tuples: (markup, encoding, declared encoding, has undergone character replacement) Each 4-tuple represents a strategy for converting the document to Unicode and parsing it. Each strategy will be tried in turn. """ if isinstance(markup, str): # Parse Unicode as-is. yield (markup, None, None, False) return # Ask UnicodeDammit to sniff the most likely encoding. try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True, exclude_encodings=exclude_encodings) yield (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters)
Example #25
Source File: _lxml.py From python-for-android with Apache License 2.0 | 5 votes |
def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None): """ :return: A 3-tuple (markup, original encoding, encoding declared within markup). """ if isinstance(markup, str): return markup, None, None, False try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True) return (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters)
Example #26
Source File: test_soup.py From python-for-android with Apache License 2.0 | 5 votes |
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = bs4.dammit.chardet try: bs4.dammit.chardet = None with warnings.catch_warnings(record=True) as w: dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue("\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) msg = w[0].message self.assertTrue(isinstance(msg, UnicodeWarning)) self.assertTrue("Some characters could not be decoded" in str(msg)) finally: bs4.dammit.chardet = chardet
Example #27
Source File: test_soup.py From python-for-android with Apache License 2.0 | 5 votes |
def test_detect_html5_style_meta_tag(self): for data in ( b'<html><meta charset="euc-jp" /></html>', b"<html><meta charset='euc-jp' /></html>", b"<html><meta charset=euc-jp /></html>", b"<html><meta charset=euc-jp/></html>"): dammit = UnicodeDammit(data, is_html=True) self.assertEqual( "euc-jp", dammit.original_encoding)
Example #28
Source File: test_soup.py From python-for-android with Apache License 2.0 | 5 votes |
def test_ignore_invalid_codecs(self): utf8_data = "Räksmörgås".encode("utf-8") for bad_encoding in ['.utf8', '...', 'utF---16.!']: dammit = UnicodeDammit(utf8_data, [bad_encoding]) self.assertEqual(dammit.original_encoding, 'utf-8')
Example #29
Source File: test_soup.py From python-for-android with Apache License 2.0 | 5 votes |
def test_ignore_inappropriate_codecs(self): utf8_data = "Räksmörgås".encode("utf-8") dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) self.assertEqual(dammit.original_encoding, 'utf-8')
Example #30
Source File: test_soup.py From python-for-android with Apache License 2.0 | 5 votes |
def test_dont_see_smart_quotes_where_there_are_none(self): utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" dammit = UnicodeDammit(utf_8) self.assertEqual(dammit.original_encoding, 'utf-8') self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)