Python codecs.BOM_UTF16_BE Examples
The following are 30
code examples of codecs.BOM_UTF16_BE().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
codecs
, or try the search function
.
Example #1
Source File: xmldocs.py From razzy-spinner with GNU General Public License v3.0 | 7 votes |
def _detect_encoding(self, fileid): if isinstance(fileid, PathPointer): s = fileid.open().readline() else: with open(fileid, 'rb') as infile: s = infile.readline() if s.startswith(codecs.BOM_UTF16_BE): return 'utf-16-be' if s.startswith(codecs.BOM_UTF16_LE): return 'utf-16-le' if s.startswith(codecs.BOM_UTF32_BE): return 'utf-32-be' if s.startswith(codecs.BOM_UTF32_LE): return 'utf-32-le' if s.startswith(codecs.BOM_UTF8): return 'utf-8' m = re.match(br'\s*<\?xml\b.*\bencoding="([^"]+)"', s) if m: return m.group(1).decode() m = re.match(br"\s*<\?xml\b.*\bencoding='([^']+)'", s) if m: return m.group(1).decode() # No encoding found -- what should the default be? return 'utf-8'
Example #2
Source File: FS.py From pivy with ISC License | 7 votes |
def get_text_contents(self): contents = self.get_contents() # The behavior of various decode() methods and functions # w.r.t. the initial BOM bytes is different for different # encodings and/or Python versions. ('utf-8' does not strip # them, but has a 'utf-8-sig' which does; 'utf-16' seems to # strip them; etc.) Just side step all the complication by # explicitly stripping the BOM before we decode(). if contents.startswith(codecs.BOM_UTF8): contents = contents[len(codecs.BOM_UTF8):] # TODO(2.2): Remove when 2.3 becomes floor. #contents = contents.decode('utf-8') contents = my_decode(contents, 'utf-8') elif contents.startswith(codecs.BOM_UTF16_LE): contents = contents[len(codecs.BOM_UTF16_LE):] # TODO(2.2): Remove when 2.3 becomes floor. #contents = contents.decode('utf-16-le') contents = my_decode(contents, 'utf-16-le') elif contents.startswith(codecs.BOM_UTF16_BE): contents = contents[len(codecs.BOM_UTF16_BE):] # TODO(2.2): Remove when 2.3 becomes floor. #contents = contents.decode('utf-16-be') contents = my_decode(contents, 'utf-16-be') return contents
Example #3
Source File: utils.py From pex with Apache License 2.0 | 5 votes |
def guess_json_utf(data): """ :rtype: str """ # JSON always starts with two ASCII characters, so detection is as # easy as counting the nulls and from their location and count # determine the encoding. Also detect a BOM, if present. sample = data[:4] if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE): return 'utf-32' # BOM included if sample[:3] == codecs.BOM_UTF8: return 'utf-8-sig' # BOM included, MS style (discouraged) if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): return 'utf-16' # BOM included nullcount = sample.count(_null) if nullcount == 0: return 'utf-8' if nullcount == 2: if sample[::2] == _null2: # 1st and 3rd are null return 'utf-16-be' if sample[1::2] == _null2: # 2nd and 4th are null return 'utf-16-le' # Did not detect 2 valid UTF-16 ascii-range characters if nullcount == 3: if sample[:3] == _null3: return 'utf-32-be' if sample[1:] == _null3: return 'utf-32-le' # Did not detect a valid UTF-32 ascii-range character return None
Example #4
Source File: utils.py From splunk-aws-project-trumpet with MIT License | 5 votes |
def guess_json_utf(data): # JSON always starts with two ASCII characters, so detection is as # easy as counting the nulls and from their location and count # determine the encoding. Also detect a BOM, if present. sample = data[:4] if sample in (codecs.BOM_UTF32_LE, codecs.BOM32_BE): return 'utf-32' # BOM included if sample[:3] == codecs.BOM_UTF8: return 'utf-8-sig' # BOM included, MS style (discouraged) if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): return 'utf-16' # BOM included nullcount = sample.count(_null) if nullcount == 0: return 'utf-8' if nullcount == 2: if sample[::2] == _null2: # 1st and 3rd are null return 'utf-16-be' if sample[1::2] == _null2: # 2nd and 4th are null return 'utf-16-le' # Did not detect 2 valid UTF-16 ascii-range characters if nullcount == 3: if sample[:3] == _null3: return 'utf-32-be' if sample[1:] == _null3: return 'utf-32-le' # Did not detect a valid UTF-32 ascii-range character return None
Example #5
Source File: utils.py From deepWordBug with Apache License 2.0 | 5 votes |
def guess_json_utf(data): """ :rtype: str """ # JSON always starts with two ASCII characters, so detection is as # easy as counting the nulls and from their location and count # determine the encoding. Also detect a BOM, if present. sample = data[:4] if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE): return 'utf-32' # BOM included if sample[:3] == codecs.BOM_UTF8: return 'utf-8-sig' # BOM included, MS style (discouraged) if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): return 'utf-16' # BOM included nullcount = sample.count(_null) if nullcount == 0: return 'utf-8' if nullcount == 2: if sample[::2] == _null2: # 1st and 3rd are null return 'utf-16-be' if sample[1::2] == _null2: # 2nd and 4th are null return 'utf-16-le' # Did not detect 2 valid UTF-16 ascii-range characters if nullcount == 3: if sample[:3] == _null3: return 'utf-32-be' if sample[1:] == _null3: return 'utf-32-le' # Did not detect a valid UTF-32 ascii-range character return None
Example #6
Source File: utils.py From jawfish with MIT License | 5 votes |
def guess_json_utf(data): # JSON always starts with two ASCII characters, so detection is as # easy as counting the nulls and from their location and count # determine the encoding. Also detect a BOM, if present. sample = data[:4] if sample in (codecs.BOM_UTF32_LE, codecs.BOM32_BE): return 'utf-32' # BOM included if sample[:3] == codecs.BOM_UTF8: return 'utf-8-sig' # BOM included, MS style (discouraged) if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): return 'utf-16' # BOM included nullcount = sample.count(_null) if nullcount == 0: return 'utf-8' if nullcount == 2: if sample[::2] == _null2: # 1st and 3rd are null return 'utf-16-be' if sample[1::2] == _null2: # 2nd and 4th are null return 'utf-16-le' # Did not detect 2 valid UTF-16 ascii-range characters if nullcount == 3: if sample[:3] == _null3: return 'utf-32-be' if sample[1:] == _null3: return 'utf-32-le' # Did not detect a valid UTF-32 ascii-range character return None
Example #7
Source File: _inputstream.py From deepWordBug with Apache License 2.0 | 5 votes |
def detectBOM(self): """Attempts to detect at BOM at the start of the stream. If an encoding can be determined from the BOM return the name of the encoding otherwise return None""" bomDict = { codecs.BOM_UTF8: 'utf-8', codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be', codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be' } # Go to beginning of file and read in 4 bytes string = self.rawStream.read(4) assert isinstance(string, bytes) # Try detecting the BOM using bytes from the string encoding = bomDict.get(string[:3]) # UTF-8 seek = 3 if not encoding: # Need to detect UTF-32 before UTF-16 encoding = bomDict.get(string) # UTF-32 seek = 4 if not encoding: encoding = bomDict.get(string[:2]) # UTF-16 seek = 2 # Set the read position past the BOM if one was found, otherwise # set it to the start of the stream if encoding: self.rawStream.seek(seek) return lookupEncoding(encoding) else: self.rawStream.seek(0) return None
Example #8
Source File: PdfParser.py From teleport with Apache License 2.0 | 5 votes |
def decode_text(b): if b[: len(codecs.BOM_UTF16_BE)] == codecs.BOM_UTF16_BE: return b[len(codecs.BOM_UTF16_BE) :].decode("utf_16_be") else: return "".join(PDFDocEncoding.get(byte, chr(byte)) for byte in b)
Example #9
Source File: PdfParser.py From teleport with Apache License 2.0 | 5 votes |
def encode_text(s): return codecs.BOM_UTF16_BE + s.encode("utf_16_be")
Example #10
Source File: PdfParser.py From teleport with Apache License 2.0 | 5 votes |
def decode_text(b): if b[:len(codecs.BOM_UTF16_BE)] == codecs.BOM_UTF16_BE: return b[len(codecs.BOM_UTF16_BE):].decode("utf_16_be") elif py3: # Python 3.x return "".join(PDFDocEncoding.get(byte, chr(byte)) for byte in b) else: # Python 2.x return u"".join(PDFDocEncoding.get(ord(byte), byte) for byte in b)
Example #11
Source File: PdfParser.py From teleport with Apache License 2.0 | 5 votes |
def encode_text(s): return codecs.BOM_UTF16_BE + s.encode("utf_16_be")
Example #12
Source File: PdfParser.py From teleport with Apache License 2.0 | 5 votes |
def encode_text(s): return codecs.BOM_UTF16_BE + s.encode("utf_16_be")
Example #13
Source File: utils.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def guess_json_utf(data): """ :rtype: str """ # JSON always starts with two ASCII characters, so detection is as # easy as counting the nulls and from their location and count # determine the encoding. Also detect a BOM, if present. sample = data[:4] if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE): return 'utf-32' # BOM included if sample[:3] == codecs.BOM_UTF8: return 'utf-8-sig' # BOM included, MS style (discouraged) if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): return 'utf-16' # BOM included nullcount = sample.count(_null) if nullcount == 0: return 'utf-8' if nullcount == 2: if sample[::2] == _null2: # 1st and 3rd are null return 'utf-16-be' if sample[1::2] == _null2: # 2nd and 4th are null return 'utf-16-le' # Did not detect 2 valid UTF-16 ascii-range characters if nullcount == 3: if sample[:3] == _null3: return 'utf-32-be' if sample[1:] == _null3: return 'utf-32-le' # Did not detect a valid UTF-32 ascii-range character return None
Example #14
Source File: _inputstream.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def detectBOM(self): """Attempts to detect at BOM at the start of the stream. If an encoding can be determined from the BOM return the name of the encoding otherwise return None""" bomDict = { codecs.BOM_UTF8: 'utf-8', codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be', codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be' } # Go to beginning of file and read in 4 bytes string = self.rawStream.read(4) assert isinstance(string, bytes) # Try detecting the BOM using bytes from the string encoding = bomDict.get(string[:3]) # UTF-8 seek = 3 if not encoding: # Need to detect UTF-32 before UTF-16 encoding = bomDict.get(string) # UTF-32 seek = 4 if not encoding: encoding = bomDict.get(string[:2]) # UTF-16 seek = 2 # Set the read position past the BOM if one was found, otherwise # set it to the start of the stream if encoding: self.rawStream.seek(seek) return lookupEncoding(encoding) else: self.rawStream.seek(0) return None
Example #15
Source File: utils.py From crunchy-xml-decoder with GNU General Public License v2.0 | 5 votes |
def guess_json_utf(data): # JSON always starts with two ASCII characters, so detection is as # easy as counting the nulls and from their location and count # determine the encoding. Also detect a BOM, if present. sample = data[:4] if sample in (codecs.BOM_UTF32_LE, codecs.BOM32_BE): return 'utf-32' # BOM included if sample[:3] == codecs.BOM_UTF8: return 'utf-8-sig' # BOM included, MS style (discouraged) if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): return 'utf-16' # BOM included nullcount = sample.count(_null) if nullcount == 0: return 'utf-8' if nullcount == 2: if sample[::2] == _null2: # 1st and 3rd are null return 'utf-16-be' if sample[1::2] == _null2: # 2nd and 4th are null return 'utf-16-le' # Did not detect 2 valid UTF-16 ascii-range characters if nullcount == 3: if sample[:3] == _null3: return 'utf-32-be' if sample[1:] == _null3: return 'utf-32-le' # Did not detect a valid UTF-32 ascii-range character return None
Example #16
Source File: utils.py From oss-ftp with MIT License | 5 votes |
def guess_json_utf(data): # JSON always starts with two ASCII characters, so detection is as # easy as counting the nulls and from their location and count # determine the encoding. Also detect a BOM, if present. sample = data[:4] if sample in (codecs.BOM_UTF32_LE, codecs.BOM32_BE): return 'utf-32' # BOM included if sample[:3] == codecs.BOM_UTF8: return 'utf-8-sig' # BOM included, MS style (discouraged) if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): return 'utf-16' # BOM included nullcount = sample.count(_null) if nullcount == 0: return 'utf-8' if nullcount == 2: if sample[::2] == _null2: # 1st and 3rd are null return 'utf-16-be' if sample[1::2] == _null2: # 2nd and 4th are null return 'utf-16-le' # Did not detect 2 valid UTF-16 ascii-range characters if nullcount == 3: if sample[:3] == _null3: return 'utf-32-be' if sample[1:] == _null3: return 'utf-32-le' # Did not detect a valid UTF-32 ascii-range character return None
Example #17
Source File: utils.py From oss-ftp with MIT License | 5 votes |
def guess_json_utf(data): # JSON always starts with two ASCII characters, so detection is as # easy as counting the nulls and from their location and count # determine the encoding. Also detect a BOM, if present. sample = data[:4] if sample in (codecs.BOM_UTF32_LE, codecs.BOM32_BE): return 'utf-32' # BOM included if sample[:3] == codecs.BOM_UTF8: return 'utf-8-sig' # BOM included, MS style (discouraged) if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): return 'utf-16' # BOM included nullcount = sample.count(_null) if nullcount == 0: return 'utf-8' if nullcount == 2: if sample[::2] == _null2: # 1st and 3rd are null return 'utf-16-be' if sample[1::2] == _null2: # 2nd and 4th are null return 'utf-16-le' # Did not detect 2 valid UTF-16 ascii-range characters if nullcount == 3: if sample[:3] == _null3: return 'utf-32-be' if sample[1:] == _null3: return 'utf-32-le' # Did not detect a valid UTF-32 ascii-range character return None
Example #18
Source File: inputstream.py From oss-ftp with MIT License | 5 votes |
def detectBOM(self): """Attempts to detect at BOM at the start of the stream. If an encoding can be determined from the BOM return the name of the encoding otherwise return None""" bomDict = { codecs.BOM_UTF8: 'utf-8', codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be', codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be' } # Go to beginning of file and read in 4 bytes string = self.rawStream.read(4) assert isinstance(string, bytes) # Try detecting the BOM using bytes from the string encoding = bomDict.get(string[:3]) # UTF-8 seek = 3 if not encoding: # Need to detect UTF-32 before UTF-16 encoding = bomDict.get(string) # UTF-32 seek = 4 if not encoding: encoding = bomDict.get(string[:2]) # UTF-16 seek = 2 # Set the read position past the BOM if one was found, otherwise # set it to the start of the stream self.rawStream.seek(encoding and seek or 0) return encoding
Example #19
Source File: utils.py From oss-ftp with MIT License | 5 votes |
def guess_json_utf(data): # JSON always starts with two ASCII characters, so detection is as # easy as counting the nulls and from their location and count # determine the encoding. Also detect a BOM, if present. sample = data[:4] if sample in (codecs.BOM_UTF32_LE, codecs.BOM32_BE): return 'utf-32' # BOM included if sample[:3] == codecs.BOM_UTF8: return 'utf-8-sig' # BOM included, MS style (discouraged) if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): return 'utf-16' # BOM included nullcount = sample.count(_null) if nullcount == 0: return 'utf-8' if nullcount == 2: if sample[::2] == _null2: # 1st and 3rd are null return 'utf-16-be' if sample[1::2] == _null2: # 2nd and 4th are null return 'utf-16-le' # Did not detect 2 valid UTF-16 ascii-range characters if nullcount == 3: if sample[:3] == _null3: return 'utf-32-be' if sample[1:] == _null3: return 'utf-32-le' # Did not detect a valid UTF-32 ascii-range character return None
Example #20
Source File: utils.py From wow-addon-updater with GNU General Public License v3.0 | 5 votes |
def guess_json_utf(data): """ :rtype: str """ # JSON always starts with two ASCII characters, so detection is as # easy as counting the nulls and from their location and count # determine the encoding. Also detect a BOM, if present. sample = data[:4] if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE): return 'utf-32' # BOM included if sample[:3] == codecs.BOM_UTF8: return 'utf-8-sig' # BOM included, MS style (discouraged) if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): return 'utf-16' # BOM included nullcount = sample.count(_null) if nullcount == 0: return 'utf-8' if nullcount == 2: if sample[::2] == _null2: # 1st and 3rd are null return 'utf-16-be' if sample[1::2] == _null2: # 2nd and 4th are null return 'utf-16-le' # Did not detect 2 valid UTF-16 ascii-range characters if nullcount == 3: if sample[:3] == _null3: return 'utf-32-be' if sample[1:] == _null3: return 'utf-32-le' # Did not detect a valid UTF-32 ascii-range character return None
Example #21
Source File: utils.py From FuYiSpider with Apache License 2.0 | 5 votes |
def guess_json_utf(data): """ :rtype: str """ # JSON always starts with two ASCII characters, so detection is as # easy as counting the nulls and from their location and count # determine the encoding. Also detect a BOM, if present. sample = data[:4] if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE): return 'utf-32' # BOM included if sample[:3] == codecs.BOM_UTF8: return 'utf-8-sig' # BOM included, MS style (discouraged) if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): return 'utf-16' # BOM included nullcount = sample.count(_null) if nullcount == 0: return 'utf-8' if nullcount == 2: if sample[::2] == _null2: # 1st and 3rd are null return 'utf-16-be' if sample[1::2] == _null2: # 2nd and 4th are null return 'utf-16-le' # Did not detect 2 valid UTF-16 ascii-range characters if nullcount == 3: if sample[:3] == _null3: return 'utf-32-be' if sample[1:] == _null3: return 'utf-32-le' # Did not detect a valid UTF-32 ascii-range character return None
Example #22
Source File: _inputstream.py From FuYiSpider with Apache License 2.0 | 5 votes |
def detectBOM(self): """Attempts to detect at BOM at the start of the stream. If an encoding can be determined from the BOM return the name of the encoding otherwise return None""" bomDict = { codecs.BOM_UTF8: 'utf-8', codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be', codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be' } # Go to beginning of file and read in 4 bytes string = self.rawStream.read(4) assert isinstance(string, bytes) # Try detecting the BOM using bytes from the string encoding = bomDict.get(string[:3]) # UTF-8 seek = 3 if not encoding: # Need to detect UTF-32 before UTF-16 encoding = bomDict.get(string) # UTF-32 seek = 4 if not encoding: encoding = bomDict.get(string[:2]) # UTF-16 seek = 2 # Set the read position past the BOM if one was found, otherwise # set it to the start of the stream if encoding: self.rawStream.seek(seek) return lookupEncoding(encoding) else: self.rawStream.seek(0) return None
Example #23
Source File: utils.py From FuYiSpider with Apache License 2.0 | 5 votes |
def guess_json_utf(data): """ :rtype: str """ # JSON always starts with two ASCII characters, so detection is as # easy as counting the nulls and from their location and count # determine the encoding. Also detect a BOM, if present. sample = data[:4] if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE): return 'utf-32' # BOM included if sample[:3] == codecs.BOM_UTF8: return 'utf-8-sig' # BOM included, MS style (discouraged) if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): return 'utf-16' # BOM included nullcount = sample.count(_null) if nullcount == 0: return 'utf-8' if nullcount == 2: if sample[::2] == _null2: # 1st and 3rd are null return 'utf-16-be' if sample[1::2] == _null2: # 2nd and 4th are null return 'utf-16-le' # Did not detect 2 valid UTF-16 ascii-range characters if nullcount == 3: if sample[:3] == _null3: return 'utf-32-be' if sample[1:] == _null3: return 'utf-32-le' # Did not detect a valid UTF-32 ascii-range character return None
Example #24
Source File: _inputstream.py From FuYiSpider with Apache License 2.0 | 5 votes |
def detectBOM(self): """Attempts to detect at BOM at the start of the stream. If an encoding can be determined from the BOM return the name of the encoding otherwise return None""" bomDict = { codecs.BOM_UTF8: 'utf-8', codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be', codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be' } # Go to beginning of file and read in 4 bytes string = self.rawStream.read(4) assert isinstance(string, bytes) # Try detecting the BOM using bytes from the string encoding = bomDict.get(string[:3]) # UTF-8 seek = 3 if not encoding: # Need to detect UTF-32 before UTF-16 encoding = bomDict.get(string) # UTF-32 seek = 4 if not encoding: encoding = bomDict.get(string[:2]) # UTF-16 seek = 2 # Set the read position past the BOM if one was found, otherwise # set it to the start of the stream if encoding: self.rawStream.seek(seek) return lookupEncoding(encoding) else: self.rawStream.seek(0) return None
Example #25
Source File: utils.py From Yuki-Chan-The-Auto-Pentest with MIT License | 5 votes |
def guess_json_utf(data): # JSON always starts with two ASCII characters, so detection is as # easy as counting the nulls and from their location and count # determine the encoding. Also detect a BOM, if present. sample = data[:4] if sample in (codecs.BOM_UTF32_LE, codecs.BOM32_BE): return 'utf-32' # BOM included if sample[:3] == codecs.BOM_UTF8: return 'utf-8-sig' # BOM included, MS style (discouraged) if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): return 'utf-16' # BOM included nullcount = sample.count(_null) if nullcount == 0: return 'utf-8' if nullcount == 2: if sample[::2] == _null2: # 1st and 3rd are null return 'utf-16-be' if sample[1::2] == _null2: # 2nd and 4th are null return 'utf-16-le' # Did not detect 2 valid UTF-16 ascii-range characters if nullcount == 3: if sample[:3] == _null3: return 'utf-32-be' if sample[1:] == _null3: return 'utf-32-le' # Did not detect a valid UTF-32 ascii-range character return None
Example #26
Source File: utils.py From vnpy_crypto with MIT License | 5 votes |
def guess_json_utf(data): """ :rtype: str """ # JSON always starts with two ASCII characters, so detection is as # easy as counting the nulls and from their location and count # determine the encoding. Also detect a BOM, if present. sample = data[:4] if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE): return 'utf-32' # BOM included if sample[:3] == codecs.BOM_UTF8: return 'utf-8-sig' # BOM included, MS style (discouraged) if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): return 'utf-16' # BOM included nullcount = sample.count(_null) if nullcount == 0: return 'utf-8' if nullcount == 2: if sample[::2] == _null2: # 1st and 3rd are null return 'utf-16-be' if sample[1::2] == _null2: # 2nd and 4th are null return 'utf-16-le' # Did not detect 2 valid UTF-16 ascii-range characters if nullcount == 3: if sample[:3] == _null3: return 'utf-32-be' if sample[1:] == _null3: return 'utf-32-le' # Did not detect a valid UTF-32 ascii-range character return None
Example #27
Source File: utils.py From vnpy_crypto with MIT License | 5 votes |
def guess_json_utf(data): """ :rtype: str """ # JSON always starts with two ASCII characters, so detection is as # easy as counting the nulls and from their location and count # determine the encoding. Also detect a BOM, if present. sample = data[:4] if sample in (codecs.BOM_UTF32_LE, codecs.BOM32_BE): return 'utf-32' # BOM included if sample[:3] == codecs.BOM_UTF8: return 'utf-8-sig' # BOM included, MS style (discouraged) if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): return 'utf-16' # BOM included nullcount = sample.count(_null) if nullcount == 0: return 'utf-8' if nullcount == 2: if sample[::2] == _null2: # 1st and 3rd are null return 'utf-16-be' if sample[1::2] == _null2: # 2nd and 4th are null return 'utf-16-le' # Did not detect 2 valid UTF-16 ascii-range characters if nullcount == 3: if sample[:3] == _null3: return 'utf-32-be' if sample[1:] == _null3: return 'utf-32-le' # Did not detect a valid UTF-32 ascii-range character return None
Example #28
Source File: _inputstream.py From vnpy_crypto with MIT License | 5 votes |
def detectBOM(self): """Attempts to detect at BOM at the start of the stream. If an encoding can be determined from the BOM return the name of the encoding otherwise return None""" bomDict = { codecs.BOM_UTF8: 'utf-8', codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be', codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be' } # Go to beginning of file and read in 4 bytes string = self.rawStream.read(4) assert isinstance(string, bytes) # Try detecting the BOM using bytes from the string encoding = bomDict.get(string[:3]) # UTF-8 seek = 3 if not encoding: # Need to detect UTF-32 before UTF-16 encoding = bomDict.get(string) # UTF-32 seek = 4 if not encoding: encoding = bomDict.get(string[:2]) # UTF-16 seek = 2 # Set the read position past the BOM if one was found, otherwise # set it to the start of the stream if encoding: self.rawStream.seek(seek) return lookupEncoding(encoding) else: self.rawStream.seek(0) return None
Example #29
Source File: utils.py From splunk-aws-project-trumpet with MIT License | 5 votes |
def guess_json_utf(data): # JSON always starts with two ASCII characters, so detection is as # easy as counting the nulls and from their location and count # determine the encoding. Also detect a BOM, if present. sample = data[:4] if sample in (codecs.BOM_UTF32_LE, codecs.BOM32_BE): return 'utf-32' # BOM included if sample[:3] == codecs.BOM_UTF8: return 'utf-8-sig' # BOM included, MS style (discouraged) if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): return 'utf-16' # BOM included nullcount = sample.count(_null) if nullcount == 0: return 'utf-8' if nullcount == 2: if sample[::2] == _null2: # 1st and 3rd are null return 'utf-16-be' if sample[1::2] == _null2: # 2nd and 4th are null return 'utf-16-le' # Did not detect 2 valid UTF-16 ascii-range characters if nullcount == 3: if sample[:3] == _null3: return 'utf-32-be' if sample[1:] == _null3: return 'utf-32-le' # Did not detect a valid UTF-32 ascii-range character return None
Example #30
Source File: utils.py From deepWordBug with Apache License 2.0 | 5 votes |
def guess_json_utf(data): # JSON always starts with two ASCII characters, so detection is as # easy as counting the nulls and from their location and count # determine the encoding. Also detect a BOM, if present. sample = data[:4] if sample in (codecs.BOM_UTF32_LE, codecs.BOM32_BE): return 'utf-32' # BOM included if sample[:3] == codecs.BOM_UTF8: return 'utf-8-sig' # BOM included, MS style (discouraged) if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): return 'utf-16' # BOM included nullcount = sample.count(_null) if nullcount == 0: return 'utf-8' if nullcount == 2: if sample[::2] == _null2: # 1st and 3rd are null return 'utf-16-be' if sample[1::2] == _null2: # 2nd and 4th are null return 'utf-16-le' # Did not detect 2 valid UTF-16 ascii-range characters if nullcount == 3: if sample[:3] == _null3: return 'utf-32-be' if sample[1:] == _null3: return 'utf-32-le' # Did not detect a valid UTF-32 ascii-range character return None