Java Code Examples for org.mozilla.universalchardet.UniversalDetector#reset()
The following examples show how to use
org.mozilla.universalchardet.UniversalDetector#reset().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: CrawlUtils.java From Asqatasun with GNU Affero General Public License v3.0 | 7 votes |
/**
 * Detects the character set of HTML source by feeding its bytes to a
 * {@code UniversalDetector}.
 *
 * <p>Falls back to {@code DEFAULT_CHARSET} when nothing is detected or when
 * {@code CrawlUtils.isValidCharset} rejects the detected name.
 *
 * @param is the stream to read the HTML source from; it is not closed here
 * @return the detected charset name, or {@code DEFAULT_CHARSET}
 * @throws java.io.IOException if reading from {@code is} fails
 */
public static String extractCharset(InputStream is) throws java.io.IOException {
    final byte[] chunk = new byte[4096];
    final UniversalDetector charsetDetector = new UniversalDetector(null);

    // Feed chunks until the stream is exhausted or the detector is confident.
    int count = is.read(chunk);
    while (count > 0 && !charsetDetector.isDone()) {
        charsetDetector.handleData(chunk, 0, count);
        count = is.read(chunk);
    }
    charsetDetector.dataEnd();

    final String detected = charsetDetector.getDetectedCharset();
    if (detected == null) {
        LOGGER.debug("No encoding detected.");
    } else {
        LOGGER.debug("Detected encoding = " + detected);
    }
    charsetDetector.reset();

    if (detected == null || !CrawlUtils.isValidCharset(detected)) {
        return DEFAULT_CHARSET;
    }
    return detected;
}
Example 2
Source File: CsvImporter.java From fingen with Apache License 2.0 | 6 votes |
/**
 * Guesses the character encoding of the file named by {@code mFileName}.
 *
 * <p>Detection is best-effort: any exception raised while reading or detecting
 * is printed and the default {@code "UTF-8"} is returned instead.
 *
 * @return the detected charset name, or {@code "UTF-8"} when none was detected
 *         or an error occurred
 * @throws IOException declared in the signature; failures inside the method
 *         body are caught and printed rather than propagated
 */
private String detectCharset() throws IOException {
    String result = "UTF-8";
    UniversalDetector detector = new UniversalDetector(null);
    byte[] buf = new byte[4096];
    try {
        // try-with-resources guarantees the stream is closed even when
        // read() or the detector throws part-way through the file.
        try (FileInputStream fis = new FileInputStream(mFileName)) {
            int nread;
            while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
                detector.handleData(buf, 0, nread);
            }
        }
        detector.dataEnd();
        String encoding = detector.getDetectedCharset();
        if (encoding != null) {
            result = encoding;
        }
        detector.reset();
    } catch (Exception e) {
        // Deliberately broad: callers always get a usable charset name back.
        e.printStackTrace();
    }
    return result;
}
Example 3
Source File: Charset.java From dualsub with GNU General Public License v3.0 | 6 votes |
public static String detect(InputStream inputStream) throws IOException { UniversalDetector detector = Charset.getSingleton() .getCharsetDetector(); byte[] buf = new byte[4096]; int nread; while ((nread = inputStream.read(buf)) > 0 && !detector.isDone()) { detector.handleData(buf, 0, nread); } detector.dataEnd(); String encoding = detector.getDetectedCharset(); detector.reset(); inputStream.close(); if (encoding == null) { // If none encoding is detected, we assume UTF-8 encoding = UTF8; } return encoding; }
Example 4
Source File: ByteUtils.java From DouBiNovel with Apache License 2.0 | 5 votes |
/** * 获取文件编码类型 * * @param bytes 文件bytes数组 * @return 编码类型 */ public static String getEncoding(byte[] bytes) { String defaultEncoding = "UTF-8"; UniversalDetector detector = new UniversalDetector(null); detector.handleData(bytes, 0, bytes.length); detector.dataEnd(); String encoding = detector.getDetectedCharset(); detector.reset(); // log.info("字符编码是:{}", encoding); if (encoding == null) { encoding = defaultEncoding; } return encoding; }
Example 5
Source File: LyricView.java From MusicPlayer_XiangDa with GNU General Public License v3.0 | 5 votes |
/**
 * Loads a lyric file, detecting its character encoding first.
 *
 * <p>A {@code null} or missing file calls {@code reset()} and clears the
 * remembered path. A file whose path equals the remembered path is ignored.
 * Otherwise the path is remembered, {@code reset()} is called, and the file is
 * handed to {@code setLyricFile(File, String)} with the detected encoding, or
 * {@code "UTF-8"} when none is detected.
 *
 * @param file the lyric file to load; may be {@code null}
 */
public void setLyricFile(File file) {
    if (file == null || !file.exists()) {
        reset();
        mCurrentLyricFilePath = "";
        return;
    }
    if (file.getPath().equals(mCurrentLyricFilePath)) {
        return;
    }
    mCurrentLyricFilePath = file.getPath();
    reset();

    // try-with-resources closes the stream even if reading or loading throws.
    try (FileInputStream fis = new FileInputStream(file)) {
        byte[] buf = new byte[1024];
        UniversalDetector detector = new UniversalDetector(null);
        int nread;
        while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
            detector.handleData(buf, 0, nread);
        }
        detector.dataEnd();
        String encoding = detector.getDetectedCharset();
        if (encoding != null) {
            setLyricFile(file, encoding);
        } else {
            setLyricFile(file, "UTF-8");
        }
        detector.reset();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
Example 6
Source File: EncodingDetector.java From lizzie with GNU General Public License v3.0 | 5 votes |
/**
 * Reads a stream fully and decodes it with its detected character encoding.
 *
 * <p>The stream is always closed, including when reading fails. Decoding uses
 * the detected charset, or {@code "UTF-8"} when none is detected.
 *
 * @param is the stream to read; closed before returning
 * @return the decoded text, or {@code ""} when the stream is empty or an
 *         {@link IOException} occurs (including an unsupported charset name)
 */
public static String toString(InputStream is) {
    try (InputStream in = is) {
        byte[] buf = new byte[4096];
        ByteArrayOutputStream output = new ByteArrayOutputStream();
        int nread;
        while ((nread = in.read(buf)) > 0) {
            output.write(buf, 0, nread);
        }
        if (output.size() == 0) {
            return "";
        }

        byte[] data = output.toByteArray();
        UniversalDetector detector = new UniversalDetector(null);
        detector.handleData(data, 0, data.length);
        detector.dataEnd();
        String detect = detector.getDetectedCharset();
        detector.reset();

        String encoding = (detect != null) ? detect : "UTF-8";
        return new String(data, encoding);
    } catch (IOException ignored) {
        // Best-effort conversion: callers receive an empty string on failure.
        return "";
    }
}
Example 7
Source File: LyricView.java From RetroMusicPlayer with GNU General Public License v3.0 | 5 votes |
/**
 * Switches the view to a new lyric file after detecting its encoding.
 *
 * <p>When {@code file} is {@code null} or does not exist, {@code reset()} is
 * called and the remembered path is cleared. When the path matches the
 * remembered path nothing happens. Otherwise the new path is stored,
 * {@code reset()} is called, and {@code setLyricFile(File, String)} is invoked
 * with the detected encoding, falling back to {@code "UTF-8"}.
 *
 * @param file the lyric file to display; may be {@code null}
 */
public void setLyricFile(File file) {
    if (file == null || !file.exists()) {
        reset();
        mCurrentLyricFilePath = "";
        return;
    }
    if (file.getPath().equals(mCurrentLyricFilePath)) {
        return;
    }
    mCurrentLyricFilePath = file.getPath();
    reset();

    // The stream is a managed resource so it is released on every exit path,
    // not only when the whole body completes normally.
    try (FileInputStream fis = new FileInputStream(file)) {
        byte[] buf = new byte[1024];
        UniversalDetector detector = new UniversalDetector(null);
        int nread;
        while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
            detector.handleData(buf, 0, nread);
        }
        detector.dataEnd();
        String encoding = detector.getDetectedCharset();
        if (encoding != null) {
            setLyricFile(file, encoding);
        } else {
            setLyricFile(file, "UTF-8");
        }
        detector.reset();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
Example 8
Source File: LocalDocReader.java From TranskribusCore with GNU General Public License v3.0 | 5 votes |
public static String readTextFromFile(File txtFile) throws IOException { byte[] buf = new byte[4096]; java.io.FileInputStream fis = new FileInputStream(txtFile); // (1) UniversalDetector detector = new UniversalDetector(null); // (2) int nread; while ((nread = fis.read(buf)) > 0 && !detector.isDone()) { detector.handleData(buf, 0, nread); } fis.close(); // (3) detector.dataEnd(); // (4) String encoding = detector.getDetectedCharset(); if (encoding != null) { logger.debug("Detected encoding = " + encoding); } else { logger.debug("No encoding detected - use utf-8"); encoding = "utf-8"; } // (5) detector.reset(); String text = FileUtils.readFileToString(txtFile, encoding); //String text = FileUtils.readFileToString(txtFile, "ISO-8859-1"); //logger.debug("text = "+text); return text; }
Example 9
Source File: FileInfoReader.java From editorconfig-netbeans with MIT License | 5 votes |
/**
 * Guesses the charset of a file object's content.
 *
 * <p>When the detector finds nothing, {@code "ISO-8859-1"} is used. When
 * reading fails or the charset name cannot be resolved, the exception is
 * reported through {@code Exceptions.printStackTrace} and the value assigned
 * so far (initially UTF-8) is returned.
 *
 * @param fo the file object whose input stream is inspected
 * @return the guessed charset
 */
protected static Charset guessCharset(FileObject fo) {
    Charset guessed = StandardCharsets.UTF_8;
    final byte[] chunk = new byte[4096];

    try (InputStream in = fo.getInputStream()) {
        final UniversalDetector detector = new UniversalDetector(null);

        for (int n = in.read(chunk); n > 0 && !detector.isDone(); n = in.read(chunk)) {
            detector.handleData(chunk, 0, n);
        }
        detector.dataEnd();

        final String detectedName = detector.getDetectedCharset();
        detector.reset();

        guessed = Charset.forName(detectedName == null ? "ISO-8859-1" : detectedName);
    } catch (IllegalArgumentException | IOException ex) {
        Exceptions.printStackTrace(ex);
    }

    return guessed;
}
Example 10
Source File: CharsetDetector.java From WebCollector with GNU General Public License v3.0 | 5 votes |
/**
 * Guesses the likely character set of a byte array; returns UTF-8 when
 * detection fails.
 *
 * @param bytes the byte array to inspect
 * @return the likely character set, or {@code "UTF-8"} when none is detected
 */
public static String guessEncodingByMozilla(byte[] bytes) {
    final UniversalDetector mozillaDetector = new UniversalDetector(null);
    mozillaDetector.handleData(bytes, 0, bytes.length);
    mozillaDetector.dataEnd();

    final String guessed = mozillaDetector.getDetectedCharset();
    mozillaDetector.reset();

    if (guessed != null) {
        return guessed;
    }
    return "UTF-8";
}
Example 11
Source File: FileStorable.java From Readily with MIT License | 5 votes |
/**
 * Guesses the character encoding of a stream.
 *
 * @param is the stream to inspect; it is not closed here
 * @return the detected charset name, or {@code Constants.DEFAULT_ENCODING}
 *         when none is detected
 * @throws IOException if reading from {@code is} fails
 */
public static String guessCharset(InputStream is) throws IOException {
    final UniversalDetector charsetDetector = new UniversalDetector(null);
    final byte[] chunk = new byte[Constants.ENCODING_HELPER_BUFFER_SIZE];

    // Stop as soon as the stream ends or the detector has made up its mind.
    for (int n = is.read(chunk); n > 0 && !charsetDetector.isDone(); n = is.read(chunk)) {
        charsetDetector.handleData(chunk, 0, n);
    }
    charsetDetector.dataEnd();

    final String detected = charsetDetector.getDetectedCharset();
    charsetDetector.reset();

    return detected == null ? Constants.DEFAULT_ENCODING : detected;
}
Example 12
Source File: CharsetDetector.java From webarchive-commons with Apache License 2.0 | 5 votes |
/**
 * Attempts to determine a document's character set from its leading bytes
 * using the juniversalchardet library.
 *
 * @param buffer the bytes to inspect
 * @param len the number of bytes of {@code buffer}, starting at offset 0, to
 *        hand to the detector
 * @return the result of {@code mapCharset} for the detected name, or
 *         {@code null} when {@code isCharsetSupported} rejects it
 * @throws IOException declared in the signature
 */
protected String getCharsetFromBytes(byte buffer[], int len)
        throws IOException {
    final UniversalDetector detector = new UniversalDetector(null);
    detector.handleData(buffer, 0, len);
    detector.dataEnd();

    final String detected = detector.getDetectedCharset();
    detector.reset();

    if (!isCharsetSupported(detected)) {
        return null;
    }
    return mapCharset(detected);
}
Example 13
Source File: Track.java From IdealMedia with Apache License 2.0 | 4 votes |
/**
 * Builds a {@code Track} from the audio file that a URI points to.
 *
 * <p>Tags are read as raw bytes once per entry in {@code FORMATS}, decoded
 * with the charset the detector reports for those bytes. If no tag string was
 * obtained that way, the tags are read again through the {@code %UTF8(...)}
 * expressions. The tag string is split on {@code '@'}; artist is field 0 and
 * title is field 3.
 *
 * @param data the URI of the audio file
 * @return the populated track, or {@code null} when no tags could be read or
 *         the tag string has four or fewer fields (see the review note in the
 *         loop for an additional {@code null} path)
 */
public static Track fromUri(Uri data) {
    Track t = new Track();
    String path = data.getPath();
    UniversalDetector detector = new UniversalDetector(null);
    int chan = BASS.BASS_StreamCreateFile(path, 0, 0, 0);

    String tags = null;
    for (int format = 0; format < FORMATS.length; format++) {
        final ByteBuffer byteBuffer = TAGS.TAGS_ReadExByte(chan, "%ARTI@%YEAR@%TRCK@%TITL@%ALBM@%COMP" + " ", FORMATS[format]);
        final int bufferSize = byteBuffer.capacity();

        if (bufferSize < 10) {
            continue;
        }

        // Copy into a heap buffer so the bytes are reachable through array().
        final ByteBuffer frameBuf = ByteBuffer.allocate(bufferSize);
        frameBuf.put(byteBuffer);

        detector.handleData(frameBuf.array(), 0, bufferSize);
        detector.dataEnd();
        final String encoding = detector.getDetectedCharset();

        boolean wrongencoding = false;
        try {
            // A null or unknown charset name makes forName throw, which is
            // treated the same as a wrong encoding.
            tags = new String(frameBuf.array(), 0, bufferSize, Charset.forName(encoding));
        } catch (Exception e) {
            wrongencoding = true;
        } finally {
            // The detector is reused on the next iteration.
            detector.reset();
        }

        if (wrongencoding) {
            continue;
        }

        if (!TextUtils.isEmpty(tags)) {
            // NOTE(review): returning null on a tag string with >= 4 fields
            // looks like it may have been meant as `break` - confirm intent.
            if (tags.split("@").length >= 4) {
                return null;
            }
        }
    }

    if (TextUtils.isEmpty(tags)) {
        tags = TAGS.TAGS_Read(chan, "%UTF8(%ARTI)@%YEAR@%TRCK@%UTF8(%TITL)@%UTF8(%ALBM)@%UTF8(%COMP)" + " ");
    }
    if (TextUtils.isEmpty(tags)) {
        return null;
    }

    String[] tagsArray = tags.split("@");
    if (tagsArray.length <= 4) {
        return null;
    }

    int duration = (int) (0.5d + BASS.BASS_ChannelBytes2Seconds(chan, BASS.BASS_ChannelGetLength(chan, BASS.BASS_POS_BYTE)));

    t.artist = tagsArray[0];
    t.title = tagsArray[3];
    t.duration = duration;
    t.path = path;

    // Compare by content: `== ""` only tests reference identity and can miss
    // an empty title.
    if (TextUtils.isEmpty(t.title)) {
        t.setTitle(data.getLastPathSegment());
    }

    return t;
}