com.ibm.icu.text.CharsetMatch Java Examples
The following examples show how to use
com.ibm.icu.text.CharsetMatch.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: CharsetIdentification.java From storm-crawler with Apache License 2.0 | 6 votes |
/** * Use a third party library as last resort to guess the charset from the * bytes. */ private static String getCharsetFromText(byte[] content, String declaredCharset, int maxLengthCharsetDetection) { String charset = null; // filter HTML tags CharsetDetector charsetDetector = new CharsetDetector(); charsetDetector.enableInputFilter(true); // give it a hint if (declaredCharset != null) charsetDetector.setDeclaredEncoding(declaredCharset); // trim the content of the text for the detection byte[] subContent = content; if (maxLengthCharsetDetection != -1 && content.length > maxLengthCharsetDetection) { subContent = Arrays.copyOfRange(content, 0, maxLengthCharsetDetection); } charsetDetector.setText(subContent); try { CharsetMatch charsetMatch = charsetDetector.detect(); charset = validateCharset(charsetMatch.getName()); } catch (Exception e) { charset = null; } return charset; }
Example #2
Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License | 6 votes |
public static CharsetMatch checkCharset(InputStream input) { // BufferedInputStream bis = new BufferedInputStream(input); CharsetDetector cd = new CharsetDetector(); try { cd.setText(input); } catch (IOException e) { try { input.close(); } catch (IOException e1) { e1.printStackTrace(); } e.printStackTrace(); } CharsetMatch cm = cd.detect(); // if (cm != null) { // //reader = cm.getReader(); // return cm.getName(); // } else { // throw new UnsupportedCharsetException(null); // } return cm; }
Example #3
Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License | 6 votes |
/**
 * Checks the character set of a stream.
 *
 * <p>When the detector cannot read the stream, the stream is closed and the
 * failures are printed; detection is attempted regardless.
 *
 * @param input the stream whose character set should be detected
 * @return the CharsetMatch reported by the detector
 */
public static CharsetMatch checkCharset(InputStream input) {
    final CharsetDetector detector = new CharsetDetector();
    try {
        detector.setText(input);
    } catch (IOException readFailure) {
        try {
            input.close();
        } catch (IOException closeFailure) {
            closeFailure.printStackTrace();
        }
        readFailure.printStackTrace();
    }
    return detector.detect();
}
Example #4
Source File: MyCharsetUtils.java From spring-boot with Apache License 2.0 | 6 votes |
/** * 利用 icu4j 探测输入流编码,只能探测文本类型的输入流 * - * 抛弃 juniversalchardet * * @param in * @return * @throws IOException */ public static Charset detectEncoding(InputStream in) throws IOException { final CharsetDetector detector = new CharsetDetector(); detector.setText(in); final CharsetMatch charsetMatch = detector.detect(); if (charsetMatch == null) { log.info("Cannot detect source charset."); return null; } //This is an integer from 0 to 100. The higher the value, the more confidence //探测的相似度在 1~100 之间,相似度越高结果越准确。 int confidence = charsetMatch.getConfidence(); final String name = charsetMatch.getName(); log.info("CharsetMatch: {} ({}% 相似度,相似度小于 50% 时,可能编码无法判断。)", name, confidence); //打印该文本编码,所有可能性 // CharsetMatch[] matches = detector.detectAll(); // System.out.println("All possibilities : " + Arrays.asList(matches)); return Charset.forName(name); }
Example #5
Source File: RegisteredExtractors.java From document-management-system with GNU General Public License v2.0 | 6 votes |
/**
 * Extracts the text to be indexed from a document stream.
 *
 * <p>For {@code text/*} content with no encoding supplied, the encoding is
 * detected from the stream content. The stream is always closed before this
 * method returns, whether it succeeds or throws.
 *
 * @param mimeType the document MIME type, used to look up the extractor
 * @param encoding the character encoding, or {@code null} to detect it for text content
 * @param isContent the document content; closed by this method
 * @return the extracted text
 * @throws IOException if no extractor is registered for {@code mimeType} or
 *         reading the content fails
 */
public static String getText(String mimeType, String encoding, InputStream isContent) throws IOException {
    BufferedInputStream bis = new BufferedInputStream(isContent);

    try {
        TextExtractor te = engine.get(mimeType);

        if (te == null) {
            throw new IOException("Full text indexing of '" + mimeType + "' is not supported");
        }

        if (mimeType.startsWith("text/") && encoding == null) {
            CharsetDetector detector = new CharsetDetector();
            detector.setText(bis);
            CharsetMatch cm = detector.detect();

            // detect() yields null when no charset fits; leave the encoding
            // unset in that case instead of failing with a NullPointerException.
            if (cm != null) {
                encoding = cm.getName();
            }
        }

        return te.extractText(bis, mimeType, encoding);
    } finally {
        // Close on every path, including the unsupported-type and failure paths.
        IOUtils.closeQuietly(bis);
    }
}
Example #6
Source File: IcuDetectorSniffer.java From caja with Apache License 2.0 | 6 votes |
/**
 * Sniffs the encoding of this stream using the ICU4J detector.
 *
 * @return the detected encoding (mapped to its actual HTML encoding when one
 *         exists) if it is an ASCII superset other than windows-1252;
 *         {@code null} otherwise, including when detection throws
 */
public Encoding sniff() throws IOException {
    try {
        CharsetMatch match = new CharsetDetector().setText(this).detect();
        Encoding detected = Encoding.forName(match.getName());
        Encoding htmlEncoding = detected.getActualHtmlEncoding();
        Encoding candidate = (htmlEncoding != null) ? htmlEncoding : detected;
        boolean usable = candidate != Encoding.WINDOWS1252 && candidate.isAsciiSuperset();
        return usable ? candidate : null;
    } catch (Exception e) {
        return null;
    }
}
Example #7
Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License | 6 votes |
public static CharsetMatch checkCharset(InputStream input) { // BufferedInputStream bis = new BufferedInputStream(input); CharsetDetector cd = new CharsetDetector(); try { cd.setText(input); } catch (IOException e) { try { input.close(); } catch (IOException e1) { e1.printStackTrace(); } e.printStackTrace(); } CharsetMatch cm = cd.detect(); // if (cm != null) { // //reader = cm.getReader(); // return cm.getName(); // } else { // throw new UnsupportedCharsetException(null); // } return cm; }
Example #8
Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License | 6 votes |
/**
 * Checks the character set of a stream.
 *
 * <p>When the detector cannot read the stream, the stream is closed and the
 * failures are printed; detection is attempted regardless.
 *
 * @param input the stream whose character set should be detected
 * @return the CharsetMatch reported by the detector
 */
public static CharsetMatch checkCharset(InputStream input) {
    final CharsetDetector detector = new CharsetDetector();
    try {
        detector.setText(input);
    } catch (IOException readFailure) {
        try {
            input.close();
        } catch (IOException closeFailure) {
            closeFailure.printStackTrace();
        }
        readFailure.printStackTrace();
    }
    return detector.detect();
}
Example #9
Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License | 5 votes |
public static CharsetMatch checkCharset(byte[] input) { // BufferedInputStream bis = new BufferedInputStream(input); CharsetDetector cd = new CharsetDetector(); cd.setText(input); CharsetMatch cm = cd.detect(); // if (cm != null) { // //reader = cm.getReader(); // return cm.getName(); // } else { // throw new UnsupportedCharsetException(null); // } return cm; }
Example #10
Source File: FeedUtils.java From commafeed with Apache License 2.0 | 5 votes |
/**
 * Detects the encoding by analyzing the characters in the array.
 *
 * <p>Falls back to UTF-8 when the detector finds no match or reports a charset
 * this JVM cannot provide. A detected ISO-8859-1 is reported as windows-1252.
 *
 * @param bytes the bytes to analyze
 * @return the detected charset, never {@code null}
 */
public static Charset detectEncoding(byte[] bytes) {
    String encoding = "UTF-8";

    CharsetDetector detector = new CharsetDetector();
    detector.setText(bytes);
    CharsetMatch match = detector.detect();
    if (match != null) {
        encoding = match.getName();
    }
    if (encoding.equalsIgnoreCase("ISO-8859-1")) {
        encoding = "windows-1252";
    }

    try {
        return Charset.forName(encoding);
    } catch (IllegalArgumentException e) {
        // Covers UnsupportedCharsetException and IllegalCharsetNameException:
        // the detector can name charsets the JVM does not ship.
        return Charset.forName("UTF-8");
    }
}
Example #11
Source File: EncodingDetector.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
public void autoDetectClues(Content content, boolean filter) { byte[] data = content.getContent(); if (minConfidence >= 0 && DETECTABLES.contains(content.getContentType()) && data.length > MIN_LENGTH) { CharsetMatch[] matches = null; // do all these in a try/catch; setText and detect/detectAll // will sometimes throw exceptions try { detector.enableInputFilter(filter); if (data.length > MIN_LENGTH) { detector.setText(data); matches = detector.detectAll(); } } catch (Exception e) { LOG.debug("Exception from ICU4J (ignoring): ", e); } if (matches != null) { for (CharsetMatch match : matches) { addClue(match.getName(), "detect", match.getConfidence()); } } } // add character encoding coming from HTTP response header addClue(parseCharacterEncoding( content.getMetadata().get(Response.CONTENT_TYPE)), "header"); }
Example #12
Source File: IOHelper.java From AsciidocFX with Apache License 2.0 | 5 votes |
/**
 * Guesses the charset of the given bytes.
 *
 * @param bytes the bytes to inspect
 * @return the detected charset name when the detector's confidence is above 70;
 *         {@code null} when there is no match, the confidence is too low, or
 *         detection fails
 */
private static String detectCharset(byte[] bytes) {
    try {
        CharsetMatch charsetMatch = new CharsetDetector().setText(bytes).detect();
        // detect() yields null when nothing matches; check it explicitly
        // rather than relying on a swallowed NullPointerException.
        if (charsetMatch != null && charsetMatch.getConfidence() > 70) {
            return charsetMatch.getName();
        }
    } catch (Exception ignored) {
        // Detection is best effort: any failure is reported as "unknown".
    }
    return null;
}
Example #13
Source File: Source.java From tablesaw with Apache License 2.0 | 5 votes |
/**
 * Returns the likely charset for the given byte[], if it can be determined. A confidence score is
 * calculated. If the score is less than 60 (on a 0 to 100 scale), or the detected charset is not
 * available on this JVM, the system default charset is returned instead.
 *
 * @param buffer The byte array to evaluate
 * @return The likely charset, or the system default charset
 */
private static Charset getCharSet(byte[] buffer) {
    CharsetDetector detector = new CharsetDetector();
    detector.setText(buffer);
    CharsetMatch match = detector.detect();
    if (match == null || match.getConfidence() < 60) {
        return Charset.defaultCharset();
    }
    try {
        return Charset.forName(match.getName());
    } catch (IllegalArgumentException e) {
        // Covers UnsupportedCharsetException and IllegalCharsetNameException:
        // the detector can report names the JVM has no Charset for.
        return Charset.defaultCharset();
    }
}
Example #14
Source File: EncodingDetector.java From anthelion with Apache License 2.0 | 5 votes |
public void autoDetectClues(Content content, boolean filter) { byte[] data = content.getContent(); if (minConfidence >= 0 && DETECTABLES.contains(content.getContentType()) && data.length > MIN_LENGTH) { CharsetMatch[] matches = null; // do all these in a try/catch; setText and detect/detectAll // will sometimes throw exceptions try { detector.enableInputFilter(filter); if (data.length > MIN_LENGTH) { detector.setText(data); matches = detector.detectAll(); } } catch (Exception e) { LOG.debug("Exception from ICU4J (ignoring): ", e); } if (matches != null) { for (CharsetMatch match : matches) { addClue(match.getName(), "detect", match.getConfidence()); } } } // add character encoding coming from HTTP response header addClue(parseCharacterEncoding( content.getMetadata().get(Response.CONTENT_TYPE)), "header"); }
Example #15
Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License | 5 votes |
/**
 * Checks the character set of a byte array.
 *
 * @param input the bytes whose character set should be detected
 * @return the CharsetMatch reported by the detector
 */
public static CharsetMatch checkCharset(byte[] input) {
    return new CharsetDetector().setText(input).detect();
}
Example #16
Source File: Source.java From tablesaw with Apache License 2.0 | 5 votes |
/**
 * Returns the likely charset for the given byte[], if it can be determined. A confidence score is
 * calculated. If the score is less than 60 (on a 0 to 100 scale), or the detected charset is not
 * available on this JVM, the system default charset is returned instead.
 *
 * @param buffer The byte array to evaluate
 * @return The likely charset, or the system default charset
 */
private static Charset getCharSet(byte[] buffer) {
    CharsetDetector detector = new CharsetDetector();
    detector.setText(buffer);
    CharsetMatch match = detector.detect();
    if (match == null || match.getConfidence() < 60) {
        return Charset.defaultCharset();
    }
    try {
        return Charset.forName(match.getName());
    } catch (IllegalArgumentException e) {
        // Covers UnsupportedCharsetException and IllegalCharsetNameException:
        // the detector can report names the JVM has no Charset for.
        return Charset.defaultCharset();
    }
}
Example #17
Source File: ICUCharsetDetectorWrapper.java From dkpro-c4corpus with Apache License 2.0 | 5 votes |
@Override public Charset detectCharset(byte[] bytes, String declaredCharset) { // prepare fallback first Charset result = FALLBACK_CHARSET; // truncate to 8k bytes max if (bytes.length <= SUFFICIENT_BYTE_ARRAY_SIZE) { charsetDetector.setText(bytes); } else { charsetDetector.setText(Arrays.copyOf(bytes, SUFFICIENT_BYTE_ARRAY_SIZE)); } if (declaredCharset != null) { charsetDetector.setDeclaredEncoding(declaredCharset); } CharsetMatch charsetMatch = charsetDetector.detect(); if (charsetMatch != null) { try { result = Charset.forName(charsetMatch.getName()); } catch (UnsupportedCharsetException ex) { // fallback to default } } return result; }
Example #18
Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License | 5 votes |
/**
 * Checks the character set of a byte array.
 *
 * @param input the bytes whose character set should be detected
 * @return the CharsetMatch reported by the detector
 */
public static CharsetMatch checkCharset(byte[] input) {
    return new CharsetDetector().setText(input).detect();
}
Example #19
Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License | 5 votes |
public static CharsetMatch checkCharset(byte[] input) { // BufferedInputStream bis = new BufferedInputStream(input); CharsetDetector cd = new CharsetDetector(); cd.setText(input); CharsetMatch cm = cd.detect(); // if (cm != null) { // //reader = cm.getReader(); // return cm.getName(); // } else { // throw new UnsupportedCharsetException(null); // } return cm; }
Example #20
Source File: DetectCharsetStrategy.java From obevo with Apache License 2.0 | 5 votes |
/**
 * Determines the charset of the given bytes using the ICU4J detector.
 *
 * @param bytes the bytes to inspect
 * @return the detected charset, or {@code null} when the detector finds no
 *         match or reading the bytes fails
 */
@Override
public Charset determineCharset(byte[] bytes) {
    try (ByteArrayInputStream input = new ByteArrayInputStream(bytes)) {
        CharsetDetector cd = new CharsetDetector();
        cd.setText(input);
        CharsetMatch cm = cd.detect();
        if (cm == null) {
            // No candidate charset: report "undetermined" the same way the
            // IOException path does, instead of throwing NullPointerException.
            return null;
        }
        return Charset.forName(cm.getName());
    } catch (IOException e) {
        return null;
    }
}
Example #21
Source File: CharSet.java From knife with MIT License | 4 votes |
public static String getResponseCharset(byte[] response){ Getter getter = new Getter(BurpExtender.callbacks.getHelpers()); String contentType = getter.getHeaderValueOf(false,response,"Content-Type"); String body = new String(getter.getBody(false,response)); String tmpcharSet = null; if (contentType != null){//1、尝试从contentTpye中获取 if (contentType.toLowerCase().contains("charset=")) { tmpcharSet = contentType.toLowerCase().split("charset=")[1]; } } if (tmpcharSet == null){//2、尝试从body中获取 Pattern pDomainNameOnly = Pattern.compile("charset=(.*?)>"); Matcher matcher = pDomainNameOnly.matcher(body); if (matcher.find()) { tmpcharSet = matcher.group(0).toLowerCase(); // tmpcharSet = tmpcharSet.replace("\"",""); // tmpcharSet = tmpcharSet.replace(">",""); // tmpcharSet = tmpcharSet.replace("/",""); // tmpcharSet = tmpcharSet.replace("charset=",""); } } if (tmpcharSet == null){//3、尝试使用ICU4J进行编码的检测 CharsetDetector detector = new CharsetDetector(); detector.setText(response); CharsetMatch cm = detector.detect(); tmpcharSet = cm.getName(); } tmpcharSet = tmpcharSet.toLowerCase().trim(); if (tmpcharSet.contains("utf8")){ tmpcharSet = "utf-8"; }else { //常见的编码格式有ASCII、ANSI、GBK、GB2312、UTF-8、GB18030和UNICODE等。 List<String> commonCharSet = Arrays.asList("ASCII,ANSI,GBK,GB2312,UTF-8,GB18030,UNICODE,ISO-8859-1".toLowerCase().split(",")); for (String item:commonCharSet) { if (tmpcharSet.contains(item)) { tmpcharSet = item; } } } return tmpcharSet; }