Python codecs.BOM_UTF8 Examples
The following are 30
code examples of codecs.BOM_UTF8().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
codecs
, or try the search function
.
Example #1
Source File: xmldocs.py From razzy-spinner with GNU General Public License v3.0 | 7 votes |
def _detect_encoding(self, fileid): if isinstance(fileid, PathPointer): s = fileid.open().readline() else: with open(fileid, 'rb') as infile: s = infile.readline() if s.startswith(codecs.BOM_UTF16_BE): return 'utf-16-be' if s.startswith(codecs.BOM_UTF16_LE): return 'utf-16-le' if s.startswith(codecs.BOM_UTF32_BE): return 'utf-32-be' if s.startswith(codecs.BOM_UTF32_LE): return 'utf-32-le' if s.startswith(codecs.BOM_UTF8): return 'utf-8' m = re.match(br'\s*<\?xml\b.*\bencoding="([^"]+)"', s) if m: return m.group(1).decode() m = re.match(br"\s*<\?xml\b.*\bencoding='([^']+)'", s) if m: return m.group(1).decode() # No encoding found -- what should the default be? return 'utf-8'
Example #2
Source File: _utils.py From azure-cli-extensions with MIT License | 6 votes |
def _load_gitignore_file(source_location): # reference: https://git-scm.com/docs/gitignore git_ignore_file = os.path.join(source_location, ".gitignore") if not os.path.exists(git_ignore_file): return None, 0 encoding = "utf-8" header = open(git_ignore_file, "rb").read(len(codecs.BOM_UTF8)) if header.startswith(codecs.BOM_UTF8): encoding = "utf-8-sig" ignore_list = [] for line in open(git_ignore_file, 'r', encoding=encoding).readlines(): rule = line.rstrip() # skip empty line and comment if not rule or rule.startswith('#'): continue # the ignore rule at the end has higher priority ignore_list = [IgnoreRule(rule)] + ignore_list return ignore_list, len(ignore_list)
Example #3
Source File: utf_8_sig.py From kobo-predict with BSD 2-Clause "Simplified" License | 6 votes |
def _buffer_decode(self, input, errors, final): if self.first: if len(input) < 3: if codecs.BOM_UTF8.startswith(input): # not enough data to decide if this really is a BOM # => try again on the next call return ("", 0) else: self.first = 0 else: self.first = 0 if input[:3] == codecs.BOM_UTF8: (output, consumed) = \ codecs.utf_8_decode(input[3:], errors, final) return (output, consumed+3) return codecs.utf_8_decode(input, errors, final)
Example #4
Source File: wallet_manager.py From ontology-python-sdk with GNU Lesser General Public License v3.0 | 6 votes |
def load_file(self): with open(self.__wallet_path, 'rb') as f: content = f.read() if content.startswith(codecs.BOM_UTF8): content = content[len(codecs.BOM_UTF8):] content = content.decode('utf-8') wallet_dict = json.loads(content) create_time = wallet_dict.get('createTime', '') default_id = wallet_dict.get('defaultOntid', '') default_address = wallet_dict.get('defaultAccountAddress', '') identities = wallet_dict.get('identities', list()) try: scrypt_dict = wallet_dict['scrypt'] scrypt_obj = Scrypt(scrypt_dict.get('n', 16384), scrypt_dict.get('r', 8), scrypt_dict.get('p', 8), scrypt_dict.get('dk_len', 64)) wallet = WalletData(wallet_dict['name'], wallet_dict['version'], create_time, default_id, default_address, scrypt_obj, identities, wallet_dict['accounts']) except KeyError as e: raise SDKException(ErrorCode.param_err(f'wallet file format error: {e}.')) return wallet
Example #5
Source File: test_codecs.py From ironpython2 with Apache License 2.0 | 6 votes |
def test_all(self): api = ( "encode", "decode", "register", "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup", "getencoder", "getdecoder", "getincrementalencoder", "getincrementaldecoder", "getreader", "getwriter", "register_error", "lookup_error", "strict_errors", "replace_errors", "ignore_errors", "xmlcharrefreplace_errors", "backslashreplace_errors", "open", "EncodedFile", "iterencode", "iterdecode", "BOM", "BOM_BE", "BOM_LE", "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE", "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", # Undocumented "StreamReaderWriter", "StreamRecoder", ) self.assertEqual(sorted(api), sorted(codecs.__all__)) for api in codecs.__all__: getattr(codecs, api)
Example #6
Source File: test_codecs.py From ironpython2 with Apache License 2.0 | 6 votes |
def test_stream_bom(self): unistring = u"ABC\u00A1\u2200XYZ" bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ" reader = codecs.getreader("utf-8-sig") for sizehint in [None] + range(1, 11) + \ [64, 128, 256, 512, 1024]: istream = reader(StringIO.StringIO(bytestring)) ostream = StringIO.StringIO() while 1: if sizehint is not None: data = istream.read(sizehint) else: data = istream.read() if not data: break ostream.write(data) got = ostream.getvalue() self.assertEqual(got, unistring)
Example #7
Source File: utf_8_sig.py From kobo-predict with BSD 2-Clause "Simplified" License | 5 votes |
def decode(input, errors='strict'): prefix = 0 if input[:3] == codecs.BOM_UTF8: input = input[3:] prefix = 3 (output, consumed) = codecs.utf_8_decode(input, errors, True) return (output, consumed+prefix)
Example #8
Source File: utils.py From splunk-aws-project-trumpet with MIT License | 5 votes |
def guess_json_utf(data): # JSON always starts with two ASCII characters, so detection is as # easy as counting the nulls and from their location and count # determine the encoding. Also detect a BOM, if present. sample = data[:4] if sample in (codecs.BOM_UTF32_LE, codecs.BOM32_BE): return 'utf-32' # BOM included if sample[:3] == codecs.BOM_UTF8: return 'utf-8-sig' # BOM included, MS style (discouraged) if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): return 'utf-16' # BOM included nullcount = sample.count(_null) if nullcount == 0: return 'utf-8' if nullcount == 2: if sample[::2] == _null2: # 1st and 3rd are null return 'utf-16-be' if sample[1::2] == _null2: # 2nd and 4th are null return 'utf-16-le' # Did not detect 2 valid UTF-16 ascii-range characters if nullcount == 3: if sample[:3] == _null3: return 'utf-32-be' if sample[1:] == _null3: return 'utf-32-le' # Did not detect a valid UTF-32 ascii-range character return None
Example #9
Source File: utf_8_sig.py From kobo-predict with BSD 2-Clause "Simplified" License | 5 votes |
def encode(self, input, final=False): if self.first: self.first = 0 return codecs.BOM_UTF8 + \ codecs.utf_8_encode(input, self.errors)[0] else: return codecs.utf_8_encode(input, self.errors)[0]
Example #10
Source File: utils.py From vnpy_crypto with MIT License | 5 votes |
def guess_json_utf(data): """ :rtype: str """ # JSON always starts with two ASCII characters, so detection is as # easy as counting the nulls and from their location and count # determine the encoding. Also detect a BOM, if present. sample = data[:4] if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE): return 'utf-32' # BOM included if sample[:3] == codecs.BOM_UTF8: return 'utf-8-sig' # BOM included, MS style (discouraged) if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): return 'utf-16' # BOM included nullcount = sample.count(_null) if nullcount == 0: return 'utf-8' if nullcount == 2: if sample[::2] == _null2: # 1st and 3rd are null return 'utf-16-be' if sample[1::2] == _null2: # 2nd and 4th are null return 'utf-16-le' # Did not detect 2 valid UTF-16 ascii-range characters if nullcount == 3: if sample[:3] == _null3: return 'utf-32-be' if sample[1:] == _null3: return 'utf-32-le' # Did not detect a valid UTF-32 ascii-range character return None
Example #11
Source File: utils.py From splunk-aws-project-trumpet with MIT License | 5 votes |
def guess_json_utf(data): # JSON always starts with two ASCII characters, so detection is as # easy as counting the nulls and from their location and count # determine the encoding. Also detect a BOM, if present. sample = data[:4] if sample in (codecs.BOM_UTF32_LE, codecs.BOM32_BE): return 'utf-32' # BOM included if sample[:3] == codecs.BOM_UTF8: return 'utf-8-sig' # BOM included, MS style (discouraged) if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): return 'utf-16' # BOM included nullcount = sample.count(_null) if nullcount == 0: return 'utf-8' if nullcount == 2: if sample[::2] == _null2: # 1st and 3rd are null return 'utf-16-be' if sample[1::2] == _null2: # 2nd and 4th are null return 'utf-16-le' # Did not detect 2 valid UTF-16 ascii-range characters if nullcount == 3: if sample[:3] == _null3: return 'utf-32-be' if sample[1:] == _null3: return 'utf-32-le' # Did not detect a valid UTF-32 ascii-range character return None
Example #12
Source File: utils.py From splunk-aws-project-trumpet with MIT License | 5 votes |
def guess_json_utf(data): # JSON always starts with two ASCII characters, so detection is as # easy as counting the nulls and from their location and count # determine the encoding. Also detect a BOM, if present. sample = data[:4] if sample in (codecs.BOM_UTF32_LE, codecs.BOM32_BE): return 'utf-32' # BOM included if sample[:3] == codecs.BOM_UTF8: return 'utf-8-sig' # BOM included, MS style (discouraged) if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): return 'utf-16' # BOM included nullcount = sample.count(_null) if nullcount == 0: return 'utf-8' if nullcount == 2: if sample[::2] == _null2: # 1st and 3rd are null return 'utf-16-be' if sample[1::2] == _null2: # 2nd and 4th are null return 'utf-16-le' # Did not detect 2 valid UTF-16 ascii-range characters if nullcount == 3: if sample[:3] == _null3: return 'utf-32-be' if sample[1:] == _null3: return 'utf-32-le' # Did not detect a valid UTF-32 ascii-range character return None
Example #13
Source File: _inputstream.py From vnpy_crypto with MIT License | 5 votes |
def detectBOM(self): """Attempts to detect at BOM at the start of the stream. If an encoding can be determined from the BOM return the name of the encoding otherwise return None""" bomDict = { codecs.BOM_UTF8: 'utf-8', codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be', codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be' } # Go to beginning of file and read in 4 bytes string = self.rawStream.read(4) assert isinstance(string, bytes) # Try detecting the BOM using bytes from the string encoding = bomDict.get(string[:3]) # UTF-8 seek = 3 if not encoding: # Need to detect UTF-32 before UTF-16 encoding = bomDict.get(string) # UTF-32 seek = 4 if not encoding: encoding = bomDict.get(string[:2]) # UTF-16 seek = 2 # Set the read position past the BOM if one was found, otherwise # set it to the start of the stream if encoding: self.rawStream.seek(seek) return lookupEncoding(encoding) else: self.rawStream.seek(0) return None
Example #14
Source File: utils.py From vnpy_crypto with MIT License | 5 votes |
def guess_json_utf(data): """ :rtype: str """ # JSON always starts with two ASCII characters, so detection is as # easy as counting the nulls and from their location and count # determine the encoding. Also detect a BOM, if present. sample = data[:4] if sample in (codecs.BOM_UTF32_LE, codecs.BOM32_BE): return 'utf-32' # BOM included if sample[:3] == codecs.BOM_UTF8: return 'utf-8-sig' # BOM included, MS style (discouraged) if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): return 'utf-16' # BOM included nullcount = sample.count(_null) if nullcount == 0: return 'utf-8' if nullcount == 2: if sample[::2] == _null2: # 1st and 3rd are null return 'utf-16-be' if sample[1::2] == _null2: # 2nd and 4th are null return 'utf-16-le' # Did not detect 2 valid UTF-16 ascii-range characters if nullcount == 3: if sample[:3] == _null3: return 'utf-32-be' if sample[1:] == _null3: return 'utf-32-le' # Did not detect a valid UTF-32 ascii-range character return None
Example #15
Source File: utils.py From Yuki-Chan-The-Auto-Pentest with MIT License | 5 votes |
def guess_json_utf(data): # JSON always starts with two ASCII characters, so detection is as # easy as counting the nulls and from their location and count # determine the encoding. Also detect a BOM, if present. sample = data[:4] if sample in (codecs.BOM_UTF32_LE, codecs.BOM32_BE): return 'utf-32' # BOM included if sample[:3] == codecs.BOM_UTF8: return 'utf-8-sig' # BOM included, MS style (discouraged) if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): return 'utf-16' # BOM included nullcount = sample.count(_null) if nullcount == 0: return 'utf-8' if nullcount == 2: if sample[::2] == _null2: # 1st and 3rd are null return 'utf-16-be' if sample[1::2] == _null2: # 2nd and 4th are null return 'utf-16-le' # Did not detect 2 valid UTF-16 ascii-range characters if nullcount == 3: if sample[:3] == _null3: return 'utf-32-be' if sample[1:] == _null3: return 'utf-32-le' # Did not detect a valid UTF-32 ascii-range character return None
Example #16
Source File: utils.py From jawfish with MIT License | 5 votes |
def guess_json_utf(data): # JSON always starts with two ASCII characters, so detection is as # easy as counting the nulls and from their location and count # determine the encoding. Also detect a BOM, if present. sample = data[:4] if sample in (codecs.BOM_UTF32_LE, codecs.BOM32_BE): return 'utf-32' # BOM included if sample[:3] == codecs.BOM_UTF8: return 'utf-8-sig' # BOM included, MS style (discouraged) if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): return 'utf-16' # BOM included nullcount = sample.count(_null) if nullcount == 0: return 'utf-8' if nullcount == 2: if sample[::2] == _null2: # 1st and 3rd are null return 'utf-16-be' if sample[1::2] == _null2: # 2nd and 4th are null return 'utf-16-le' # Did not detect 2 valid UTF-16 ascii-range characters if nullcount == 3: if sample[:3] == _null3: return 'utf-32-be' if sample[1:] == _null3: return 'utf-32-le' # Did not detect a valid UTF-32 ascii-range character return None
Example #17
Source File: _inputstream.py From FuYiSpider with Apache License 2.0 | 5 votes |
def detectBOM(self): """Attempts to detect at BOM at the start of the stream. If an encoding can be determined from the BOM return the name of the encoding otherwise return None""" bomDict = { codecs.BOM_UTF8: 'utf-8', codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be', codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be' } # Go to beginning of file and read in 4 bytes string = self.rawStream.read(4) assert isinstance(string, bytes) # Try detecting the BOM using bytes from the string encoding = bomDict.get(string[:3]) # UTF-8 seek = 3 if not encoding: # Need to detect UTF-32 before UTF-16 encoding = bomDict.get(string) # UTF-32 seek = 4 if not encoding: encoding = bomDict.get(string[:2]) # UTF-16 seek = 2 # Set the read position past the BOM if one was found, otherwise # set it to the start of the stream if encoding: self.rawStream.seek(seek) return lookupEncoding(encoding) else: self.rawStream.seek(0) return None
Example #18
Source File: utils.py From pledgeservice with Apache License 2.0 | 5 votes |
def guess_json_utf(data): # JSON always starts with two ASCII characters, so detection is as # easy as counting the nulls and from their location and count # determine the encoding. Also detect a BOM, if present. sample = data[:4] if sample in (codecs.BOM_UTF32_LE, codecs.BOM32_BE): return 'utf-32' # BOM included if sample[:3] == codecs.BOM_UTF8: return 'utf-8-sig' # BOM included, MS style (discouraged) if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): return 'utf-16' # BOM included nullcount = sample.count(_null) if nullcount == 0: return 'utf-8' if nullcount == 2: if sample[::2] == _null2: # 1st and 3rd are null return 'utf-16-be' if sample[1::2] == _null2: # 2nd and 4th are null return 'utf-16-le' # Did not detect 2 valid UTF-16 ascii-range characters if nullcount == 3: if sample[:3] == _null3: return 'utf-32-be' if sample[1:] == _null3: return 'utf-32-le' # Did not detect a valid UTF-32 ascii-range character return None
Example #19
Source File: utf_8_sig.py From kobo-predict with BSD 2-Clause "Simplified" License | 5 votes |
def encode(input, errors='strict'): return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0], len(input))
Example #20
Source File: utils.py From Python24 with MIT License | 5 votes |
def guess_json_utf(data): """ :rtype: str """ # JSON always starts with two ASCII characters, so detection is as # easy as counting the nulls and from their location and count # determine the encoding. Also detect a BOM, if present. sample = data[:4] if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE): return 'utf-32' # BOM included if sample[:3] == codecs.BOM_UTF8: return 'utf-8-sig' # BOM included, MS style (discouraged) if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): return 'utf-16' # BOM included nullcount = sample.count(_null) if nullcount == 0: return 'utf-8' if nullcount == 2: if sample[::2] == _null2: # 1st and 3rd are null return 'utf-16-be' if sample[1::2] == _null2: # 2nd and 4th are null return 'utf-16-le' # Did not detect 2 valid UTF-16 ascii-range characters if nullcount == 3: if sample[:3] == _null3: return 'utf-32-be' if sample[1:] == _null3: return 'utf-32-le' # Did not detect a valid UTF-32 ascii-range character return None
Example #21
Source File: _inputstream.py From Python24 with MIT License | 5 votes |
def detectBOM(self): """Attempts to detect at BOM at the start of the stream. If an encoding can be determined from the BOM return the name of the encoding otherwise return None""" bomDict = { codecs.BOM_UTF8: 'utf-8', codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be', codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be' } # Go to beginning of file and read in 4 bytes string = self.rawStream.read(4) assert isinstance(string, bytes) # Try detecting the BOM using bytes from the string encoding = bomDict.get(string[:3]) # UTF-8 seek = 3 if not encoding: # Need to detect UTF-32 before UTF-16 encoding = bomDict.get(string) # UTF-32 seek = 4 if not encoding: encoding = bomDict.get(string[:2]) # UTF-16 seek = 2 # Set the read position past the BOM if one was found, otherwise # set it to the start of the stream if encoding: self.rawStream.seek(seek) return lookupEncoding(encoding) else: self.rawStream.seek(0) return None
Example #22
Source File: utils.py From SalesforceXyTools with Apache License 2.0 | 5 votes |
def guess_json_utf(data): # JSON always starts with two ASCII characters, so detection is as # easy as counting the nulls and from their location and count # determine the encoding. Also detect a BOM, if present. sample = data[:4] if sample in (codecs.BOM_UTF32_LE, codecs.BOM32_BE): return 'utf-32' # BOM included if sample[:3] == codecs.BOM_UTF8: return 'utf-8-sig' # BOM included, MS style (discouraged) if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): return 'utf-16' # BOM included nullcount = sample.count(_null) if nullcount == 0: return 'utf-8' if nullcount == 2: if sample[::2] == _null2: # 1st and 3rd are null return 'utf-16-be' if sample[1::2] == _null2: # 2nd and 4th are null return 'utf-16-le' # Did not detect 2 valid UTF-16 ascii-range characters if nullcount == 3: if sample[:3] == _null3: return 'utf-32-be' if sample[1:] == _null3: return 'utf-32-le' # Did not detect a valid UTF-32 ascii-range character return None
Example #23
Source File: utils.py From lambda-chef-node-cleanup with Apache License 2.0 | 5 votes |
def guess_json_utf(data): # JSON always starts with two ASCII characters, so detection is as # easy as counting the nulls and from their location and count # determine the encoding. Also detect a BOM, if present. sample = data[:4] if sample in (codecs.BOM_UTF32_LE, codecs.BOM32_BE): return 'utf-32' # BOM included if sample[:3] == codecs.BOM_UTF8: return 'utf-8-sig' # BOM included, MS style (discouraged) if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): return 'utf-16' # BOM included nullcount = sample.count(_null) if nullcount == 0: return 'utf-8' if nullcount == 2: if sample[::2] == _null2: # 1st and 3rd are null return 'utf-16-be' if sample[1::2] == _null2: # 2nd and 4th are null return 'utf-16-le' # Did not detect 2 valid UTF-16 ascii-range characters if nullcount == 3: if sample[:3] == _null3: return 'utf-32-be' if sample[1:] == _null3: return 'utf-32-le' # Did not detect a valid UTF-32 ascii-range character return None
Example #24
Source File: utf_8_sig.py From ironpython2 with Apache License 2.0 | 5 votes |
def _buffer_decode(self, input, errors, final): if self.first: if len(input) < 3: if codecs.BOM_UTF8.startswith(input): # not enough data to decide if this really is a BOM # => try again on the next call return (u"", 0) else: self.first = None else: self.first = None if input[:3] == codecs.BOM_UTF8: (output, consumed) = codecs.utf_8_decode(input[3:], errors, final) return (output, consumed+3) return codecs.utf_8_decode(input, errors, final)
Example #25
Source File: utf_8_sig.py From ironpython2 with Apache License 2.0 | 5 votes |
def encode(self, input, final=False): if self.first: self.first = 0 return codecs.BOM_UTF8 + codecs.utf_8_encode(input, self.errors)[0] else: return codecs.utf_8_encode(input, self.errors)[0]
Example #26
Source File: utf_8_sig.py From ironpython2 with Apache License 2.0 | 5 votes |
def decode(input, errors='strict'): prefix = 0 if input[:3] == codecs.BOM_UTF8: input = input[3:] prefix = 3 (output, consumed) = codecs.utf_8_decode(input, errors, True) return (output, consumed+prefix)
Example #27
Source File: utf_8_sig.py From ironpython2 with Apache License 2.0 | 5 votes |
def encode(input, errors='strict'): return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0], len(input))
Example #28
Source File: test_refactor.py From ironpython2 with Apache License 2.0 | 5 votes |
def test_bom(self): fn = os.path.join(TEST_DATA_DIR, "bom.py") data = self.check_file_refactoring(fn) self.assertTrue(data.startswith(codecs.BOM_UTF8))
Example #29
Source File: inputstream.py From lambda-packs with MIT License | 5 votes |
def detectBOM(self): """Attempts to detect at BOM at the start of the stream. If an encoding can be determined from the BOM return the name of the encoding otherwise return None""" bomDict = { codecs.BOM_UTF8: 'utf-8', codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be', codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be' } # Go to beginning of file and read in 4 bytes string = self.rawStream.read(4) assert isinstance(string, bytes) # Try detecting the BOM using bytes from the string encoding = bomDict.get(string[:3]) # UTF-8 seek = 3 if not encoding: # Need to detect UTF-32 before UTF-16 encoding = bomDict.get(string) # UTF-32 seek = 4 if not encoding: encoding = bomDict.get(string[:2]) # UTF-16 seek = 2 # Set the read position past the BOM if one was found, otherwise # set it to the start of the stream self.rawStream.seek(encoding and seek or 0) return encoding
Example #30
Source File: utils.py From FuYiSpider with Apache License 2.0 | 5 votes |
def guess_json_utf(data): """ :rtype: str """ # JSON always starts with two ASCII characters, so detection is as # easy as counting the nulls and from their location and count # determine the encoding. Also detect a BOM, if present. sample = data[:4] if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE): return 'utf-32' # BOM included if sample[:3] == codecs.BOM_UTF8: return 'utf-8-sig' # BOM included, MS style (discouraged) if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): return 'utf-16' # BOM included nullcount = sample.count(_null) if nullcount == 0: return 'utf-8' if nullcount == 2: if sample[::2] == _null2: # 1st and 3rd are null return 'utf-16-be' if sample[1::2] == _null2: # 2nd and 4th are null return 'utf-16-le' # Did not detect 2 valid UTF-16 ascii-range characters if nullcount == 3: if sample[:3] == _null3: return 'utf-32-be' if sample[1:] == _null3: return 'utf-32-le' # Did not detect a valid UTF-32 ascii-range character return None