com.ibm.icu.text.CharsetDetector Java Examples
The following examples show how to use
com.ibm.icu.text.CharsetDetector.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: CharsetIdentification.java From storm-crawler with Apache License 2.0 | 6 votes |
/** * Use a third party library as last resort to guess the charset from the * bytes. */ private static String getCharsetFromText(byte[] content, String declaredCharset, int maxLengthCharsetDetection) { String charset = null; // filter HTML tags CharsetDetector charsetDetector = new CharsetDetector(); charsetDetector.enableInputFilter(true); // give it a hint if (declaredCharset != null) charsetDetector.setDeclaredEncoding(declaredCharset); // trim the content of the text for the detection byte[] subContent = content; if (maxLengthCharsetDetection != -1 && content.length > maxLengthCharsetDetection) { subContent = Arrays.copyOfRange(content, 0, maxLengthCharsetDetection); } charsetDetector.setText(subContent); try { CharsetMatch charsetMatch = charsetDetector.detect(); charset = validateCharset(charsetMatch.getName()); } catch (Exception e) { charset = null; } return charset; }
Example #2
Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License | 6 votes |
public static CharsetMatch checkCharset(InputStream input) { // BufferedInputStream bis = new BufferedInputStream(input); CharsetDetector cd = new CharsetDetector(); try { cd.setText(input); } catch (IOException e) { try { input.close(); } catch (IOException e1) { e1.printStackTrace(); } e.printStackTrace(); } CharsetMatch cm = cd.detect(); // if (cm != null) { // //reader = cm.getReader(); // return cm.getName(); // } else { // throw new UnsupportedCharsetException(null); // } return cm; }
Example #3
Source File: StreamDecoder.java From batfish with Apache License 2.0 | 6 votes |
/** * Automatically detects charset of the input stream, reads it, decodes it, and returns the * resulting string with a newline appended if the original stream is non-empty. Does not close * the provided input stream. * * @throws IOException if there is an error */ @SuppressWarnings("PMD.CloseResource") // PMD does not understand Closer. static @Nonnull String decodeStreamAndAppendNewline(@Nonnull InputStream inputStream) throws IOException { byte[] rawBytes = IOUtils.toByteArray(inputStream); Charset cs = Charset.forName(new CharsetDetector().setText(rawBytes).detect().getName()); try (Closer closer = Closer.create()) { InputStream inputByteStream = closer.register(bomInputStream(new ByteArrayInputStream(rawBytes))); InputStream finalInputStream = closer.register( rawBytes.length > 0 ? new SequenceInputStream( inputByteStream, closer.register(bomInputStream(new ByteArrayInputStream("\n".getBytes(cs))))) : inputByteStream); return new String(IOUtils.toByteArray(finalInputStream), cs); } }
Example #4
Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License | 6 votes |
/**
 * Checks the character set of the data supplied by the given stream.
 *
 * <p>{@code CharsetDetector.setText(InputStream)} requires a stream that supports mark/reset.
 * A stream without that support is therefore wrapped in a {@code BufferedInputStream} before
 * it is handed to the detector; in that case the caller's stream is left advanced by whatever
 * the wrapper buffered.
 *
 * <p>If reading fails, the stream is closed, the errors are printed, and {@code detect()} is
 * still invoked on the detector.
 *
 * @param input stream whose leading bytes are sampled
 * @return CharsetMatch reported by {@code CharsetDetector.detect()}, which may be {@code null}
 */
public static CharsetMatch checkCharset(InputStream input) {
    // Without mark/reset the detector cannot rewind the stream after sampling it.
    InputStream source = input.markSupported() ? input : new java.io.BufferedInputStream(input);
    CharsetDetector cd = new CharsetDetector();
    try {
        cd.setText(source);
    } catch (IOException e) {
        try {
            input.close();
        } catch (IOException e1) {
            e1.printStackTrace();
        }
        e.printStackTrace();
    }
    return cd.detect();
}
Example #5
Source File: MyCharsetUtils.java From spring-boot with Apache License 2.0 | 6 votes |
/** * 利用 icu4j 探测输入流编码,只能探测文本类型的输入流 * - * 抛弃 juniversalchardet * * @param in * @return * @throws IOException */ public static Charset detectEncoding(InputStream in) throws IOException { final CharsetDetector detector = new CharsetDetector(); detector.setText(in); final CharsetMatch charsetMatch = detector.detect(); if (charsetMatch == null) { log.info("Cannot detect source charset."); return null; } //This is an integer from 0 to 100. The higher the value, the more confidence //探测的相似度在 1~100 之间,相似度越高结果越准确。 int confidence = charsetMatch.getConfidence(); final String name = charsetMatch.getName(); log.info("CharsetMatch: {} ({}% 相似度,相似度小于 50% 时,可能编码无法判断。)", name, confidence); //打印该文本编码,所有可能性 // CharsetMatch[] matches = detector.detectAll(); // System.out.println("All possibilities : " + Arrays.asList(matches)); return Charset.forName(name); }
Example #6
Source File: RegisteredExtractors.java From document-management-system with GNU General Public License v2.0 | 6 votes |
/**
 * Extracts the text to be indexed from the given content.
 *
 * <p>When the mime type starts with {@code text/} and no encoding is supplied, the encoding is
 * detected from the content. If the detector reports no match, the encoding stays {@code null}
 * and is passed to the extractor as such.
 *
 * <p>The buffered wrapper around {@code isContent} is always closed before returning, including
 * when the mime type is unsupported or the extractor throws.
 *
 * @param mimeType mime type used to look up the text extractor
 * @param encoding content encoding, or {@code null} if unknown
 * @param isContent the content to extract text from
 * @return the text produced by the extractor registered for {@code mimeType}
 * @throws IOException if no extractor is registered for {@code mimeType} or reading fails
 */
public static String getText(String mimeType, String encoding, InputStream isContent) throws IOException {
    BufferedInputStream bis = new BufferedInputStream(isContent);
    try {
        TextExtractor te = engine.get(mimeType);
        if (te == null) {
            throw new IOException("Full text indexing of '" + mimeType + "' is not supported");
        }

        if (mimeType.startsWith("text/") && encoding == null) {
            CharsetDetector detector = new CharsetDetector();
            detector.setText(bis);
            CharsetMatch cm = detector.detect();
            if (cm != null) {
                encoding = cm.getName();
            }
        }

        return te.extractText(bis, mimeType, encoding);
    } finally {
        // Close on every path so the stream is not leaked when an exception is thrown.
        IOUtils.closeQuietly(bis);
    }
}
Example #7
Source File: IcuDetectorSniffer.java From caja with Apache License 2.0 | 6 votes |
/**
 * Runs the ICU charset detector over this stream and maps the result to an {@code Encoding}.
 *
 * @return the detected encoding (replaced by its actual HTML encoding when one is defined), or
 *     {@code null} when the result is windows-1252, is not an ASCII superset, or any exception
 *     occurs during detection
 */
public Encoding sniff() throws IOException {
    try {
        CharsetDetector icuDetector = new CharsetDetector();
        icuDetector.setText(this);
        CharsetMatch best = icuDetector.detect();

        Encoding candidate = Encoding.forName(best.getName());
        Encoding htmlEncoding = candidate.getActualHtmlEncoding();
        if (htmlEncoding != null) {
            candidate = htmlEncoding;
        }

        if (candidate == Encoding.WINDOWS1252 || !candidate.isAsciiSuperset()) {
            return null;
        }
        return candidate;
    } catch (Exception e) {
        return null;
    }
}
Example #8
Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License | 6 votes |
public static CharsetMatch checkCharset(InputStream input) { // BufferedInputStream bis = new BufferedInputStream(input); CharsetDetector cd = new CharsetDetector(); try { cd.setText(input); } catch (IOException e) { try { input.close(); } catch (IOException e1) { e1.printStackTrace(); } e.printStackTrace(); } CharsetMatch cm = cd.detect(); // if (cm != null) { // //reader = cm.getReader(); // return cm.getName(); // } else { // throw new UnsupportedCharsetException(null); // } return cm; }
Example #9
Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License | 6 votes |
/**
 * Checks the character set of the data supplied by the given stream.
 *
 * <p>{@code CharsetDetector.setText(InputStream)} requires a stream that supports mark/reset.
 * A stream without that support is therefore wrapped in a {@code BufferedInputStream} before
 * it is handed to the detector; in that case the caller's stream is left advanced by whatever
 * the wrapper buffered.
 *
 * <p>If reading fails, the stream is closed, the errors are printed, and {@code detect()} is
 * still invoked on the detector.
 *
 * @param input stream whose leading bytes are sampled
 * @return CharsetMatch reported by {@code CharsetDetector.detect()}, which may be {@code null}
 */
public static CharsetMatch checkCharset(InputStream input) {
    // Without mark/reset the detector cannot rewind the stream after sampling it.
    InputStream source = input.markSupported() ? input : new java.io.BufferedInputStream(input);
    CharsetDetector cd = new CharsetDetector();
    try {
        cd.setText(source);
    } catch (IOException e) {
        try {
            input.close();
        } catch (IOException e1) {
            e1.printStackTrace();
        }
        e.printStackTrace();
    }
    return cd.detect();
}
Example #10
Source File: FeedUtils.java From commafeed with Apache License 2.0 | 5 votes |
/**
 * Detects the encoding by analyzing the bytes in the array.
 *
 * <p>UTF-8 is used when the detector reports no match, and also when the detected charset is
 * not available in the running JVM (the detector can report names the JVM does not provide).
 * A detected ISO-8859-1 is reported as windows-1252.
 *
 * @param bytes the bytes to analyse
 * @return the detected charset, or UTF-8 as the fallback
 */
public static Charset detectEncoding(byte[] bytes) {
    String encoding = "UTF-8";

    CharsetDetector detector = new CharsetDetector();
    detector.setText(bytes);
    CharsetMatch match = detector.detect();
    if (match != null) {
        encoding = match.getName();
    }

    if (encoding.equalsIgnoreCase("ISO-8859-1")) {
        encoding = "windows-1252";
    }

    // Charset.forName would throw UnsupportedCharsetException for a name this JVM lacks.
    if (!Charset.isSupported(encoding)) {
        encoding = "UTF-8";
    }
    return Charset.forName(encoding);
}
Example #11
Source File: IOHelper.java From AsciidocFX with Apache License 2.0 | 5 votes |
/**
 * Detects the charset of the given bytes, accepting only confident matches.
 *
 * @param bytes the bytes to analyse; may be {@code null}
 * @return the detected charset name when the match confidence is above 70, otherwise
 *     {@code null} (also for {@code null} input, no match, or a detector failure)
 */
private static String detectCharset(byte[] bytes) {
    if (bytes == null) {
        return null;
    }
    try {
        CharsetMatch charsetMatch = new CharsetDetector().setText(bytes).detect();
        if (charsetMatch != null && charsetMatch.getConfidence() > 70) {
            return charsetMatch.getName();
        }
    } catch (RuntimeException ignored) {
        // Detection is best effort: a detector failure is treated the same as "no match".
    }
    return null;
}
Example #12
Source File: IcuDetectorSniffer.java From caja with Apache License 2.0 | 5 votes |
/**
 * Prints the name of every charset the ICU detector can detect, one per line.
 *
 * @param args unused
 */
public static void main(String[] args) {
    for (String charsetName : CharsetDetector.getAllDetectableCharsets()) {
        System.out.println(charsetName);
    }
}
Example #13
Source File: ChardetSniffer.java From caja with Apache License 2.0 | 5 votes |
/**
 * Lists all charsets detectable by {@code CharsetDetector} on standard output, one per line.
 *
 * @param args unused
 */
public static void main(String[] args) {
    String[] names = CharsetDetector.getAllDetectableCharsets();
    for (String name : names) {
        System.out.println(name);
    }
}
Example #14
Source File: Source.java From tablesaw with Apache License 2.0 | 5 votes |
/**
 * Returns the likely charset for the given byte[], if it can be determined. A confidence score
 * is calculated. If the score is less than 60 (on a 0 to 100 scale), if no match is found, or if
 * the detected charset is not available in the running JVM, the system default charset is
 * returned instead.
 *
 * @param buffer The byte array to evaluate
 * @return The likely charset, or the system default charset
 */
private static Charset getCharSet(byte[] buffer) {
    CharsetDetector detector = new CharsetDetector();
    detector.setText(buffer);
    CharsetMatch match = detector.detect();
    if (match == null || match.getConfidence() < 60) {
        return Charset.defaultCharset();
    }

    String name = match.getName();
    // Charset.forName would throw UnsupportedCharsetException for a name this JVM lacks;
    // honour the documented fallback instead.
    if (!Charset.isSupported(name)) {
        return Charset.defaultCharset();
    }
    return Charset.forName(name);
}
Example #15
Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License | 5 votes |
/**
 * Checks the character set of the given bytes.
 *
 * @param input the bytes to analyse
 * @return CharsetMatch reported by the detector
 */
public static CharsetMatch checkCharset(byte[] input) {
    CharsetDetector detector = new CharsetDetector();
    detector.setText(input);
    return detector.detect();
}
Example #16
Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License | 5 votes |
public static CharsetMatch checkCharset(byte[] input) { // BufferedInputStream bis = new BufferedInputStream(input); CharsetDetector cd = new CharsetDetector(); cd.setText(input); CharsetMatch cm = cd.detect(); // if (cm != null) { // //reader = cm.getReader(); // return cm.getName(); // } else { // throw new UnsupportedCharsetException(null); // } return cm; }
Example #17
Source File: Source.java From tablesaw with Apache License 2.0 | 5 votes |
/**
 * Returns the likely charset for the given byte[], if it can be determined. A confidence score
 * is calculated. If the score is less than 60 (on a 0 to 100 scale), if no match is found, or if
 * the detected charset is not available in the running JVM, the system default charset is
 * returned instead.
 *
 * @param buffer The byte array to evaluate
 * @return The likely charset, or the system default charset
 */
private static Charset getCharSet(byte[] buffer) {
    CharsetDetector detector = new CharsetDetector();
    detector.setText(buffer);
    CharsetMatch match = detector.detect();
    if (match == null || match.getConfidence() < 60) {
        return Charset.defaultCharset();
    }

    String name = match.getName();
    // Charset.forName would throw UnsupportedCharsetException for a name this JVM lacks;
    // honour the documented fallback instead.
    if (!Charset.isSupported(name)) {
        return Charset.defaultCharset();
    }
    return Charset.forName(name);
}
Example #18
Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License | 5 votes |
/**
 * Checks the character set of the given bytes.
 *
 * @param input the bytes to analyse
 * @return CharsetMatch reported by the detector
 */
public static CharsetMatch checkCharset(byte[] input) {
    CharsetDetector detector = new CharsetDetector();
    detector.setText(input);
    return detector.detect();
}
Example #19
Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License | 5 votes |
public static CharsetMatch checkCharset(byte[] input) { // BufferedInputStream bis = new BufferedInputStream(input); CharsetDetector cd = new CharsetDetector(); cd.setText(input); CharsetMatch cm = cd.detect(); // if (cm != null) { // //reader = cm.getReader(); // return cm.getName(); // } else { // throw new UnsupportedCharsetException(null); // } return cm; }
Example #20
Source File: DetectCharsetStrategy.java From obevo with Apache License 2.0 | 5 votes |
/**
 * Determines the charset of the given bytes by feeding them to the detector as a stream.
 *
 * @param bytes the bytes to analyse
 * @return the charset named by the detector's match, or {@code null} if an
 *     {@code IOException} occurs while the detector reads the stream
 */
@Override
public Charset determineCharset(byte[] bytes) {
    try (ByteArrayInputStream byteStream = new ByteArrayInputStream(bytes)) {
        CharsetDetector detector = new CharsetDetector();
        detector.setText(byteStream);
        CharsetMatch best = detector.detect();
        String detectedName = best.getName();
        return Charset.forName(detectedName);
    } catch (IOException e) {
        return null;
    }
}
Example #21
Source File: EncodingDetector.java From anthelion with Apache License 2.0 | 4 votes |
/**
 * Creates a detector with a fresh ICU {@code CharsetDetector} and an empty clue list.
 *
 * @param conf configuration supplying the minimum confidence under {@code MIN_CONFIDENCE_KEY};
 *     {@code -1} is used when the key is absent
 */
public EncodingDetector(Configuration conf) {
    this.detector = new CharsetDetector();
    this.clues = new ArrayList<EncodingClue>();
    this.minConfidence = conf.getInt(MIN_CONFIDENCE_KEY, -1);
}
Example #22
Source File: CommonUtil.java From batfish with Apache License 2.0 | 4 votes |
/**
 * Detects the charset of the given bytes and resolves it to a {@code Charset}.
 *
 * @param bytes the bytes to analyse
 * @return the charset named by the detector's match
 */
public static @Nonnull Charset detectCharset(byte[] bytes) {
  String detectedName = new CharsetDetector().setText(bytes).detect().getName();
  return Charset.forName(detectedName);
}
Example #23
Source File: EncodingDetector.java From nutch-htmlunit with Apache License 2.0 | 4 votes |
/**
 * Creates a detector with a fresh ICU {@code CharsetDetector} and an empty clue list.
 *
 * @param conf configuration supplying the minimum confidence under {@code MIN_CONFIDENCE_KEY};
 *     {@code -1} is used when the key is absent
 */
public EncodingDetector(Configuration conf) {
    this.detector = new CharsetDetector();
    this.clues = new ArrayList<EncodingClue>();
    this.minConfidence = conf.getInt(MIN_CONFIDENCE_KEY, -1);
}
Example #24
Source File: CharSet.java From knife with MIT License | 4 votes |
public static String getResponseCharset(byte[] response){ Getter getter = new Getter(BurpExtender.callbacks.getHelpers()); String contentType = getter.getHeaderValueOf(false,response,"Content-Type"); String body = new String(getter.getBody(false,response)); String tmpcharSet = null; if (contentType != null){//1、尝试从contentTpye中获取 if (contentType.toLowerCase().contains("charset=")) { tmpcharSet = contentType.toLowerCase().split("charset=")[1]; } } if (tmpcharSet == null){//2、尝试从body中获取 Pattern pDomainNameOnly = Pattern.compile("charset=(.*?)>"); Matcher matcher = pDomainNameOnly.matcher(body); if (matcher.find()) { tmpcharSet = matcher.group(0).toLowerCase(); // tmpcharSet = tmpcharSet.replace("\"",""); // tmpcharSet = tmpcharSet.replace(">",""); // tmpcharSet = tmpcharSet.replace("/",""); // tmpcharSet = tmpcharSet.replace("charset=",""); } } if (tmpcharSet == null){//3、尝试使用ICU4J进行编码的检测 CharsetDetector detector = new CharsetDetector(); detector.setText(response); CharsetMatch cm = detector.detect(); tmpcharSet = cm.getName(); } tmpcharSet = tmpcharSet.toLowerCase().trim(); if (tmpcharSet.contains("utf8")){ tmpcharSet = "utf-8"; }else { //常见的编码格式有ASCII、ANSI、GBK、GB2312、UTF-8、GB18030和UNICODE等。 List<String> commonCharSet = Arrays.asList("ASCII,ANSI,GBK,GB2312,UTF-8,GB18030,UNICODE,ISO-8859-1".toLowerCase().split(",")); for (String item:commonCharSet) { if (tmpcharSet.contains(item)) { tmpcharSet = item; } } } return tmpcharSet; }