com.ibm.icu.text.CharsetDetector Java Examples

Source File: CharsetIdentification.java From storm-crawler with Apache License 2.0

6 votes

/**
 * Uses a third party library (ICU's charset detector) as a last resort to
 * guess the charset from the bytes.
 *
 * @param content
 *            raw bytes to analyse
 * @param declaredCharset
 *            charset announced by the source, handed to the detector as a
 *            hint; ignored when {@code null}
 * @param maxLengthCharsetDetection
 *            maximum number of leading bytes given to the detector; any
 *            negative value (conventionally {@code -1}) means "no limit"
 * @return the detected charset name after it has been passed through
 *         {@code validateCharset}, or {@code null} when nothing was detected
 *         or detection failed
 */
private static String getCharsetFromText(byte[] content,
        String declaredCharset, int maxLengthCharsetDetection) {
    CharsetDetector charsetDetector = new CharsetDetector();
    // filter HTML tags
    charsetDetector.enableInputFilter(true);
    // give it a hint
    if (declaredCharset != null) {
        charsetDetector.setDeclaredEncoding(declaredCharset);
    }
    // trim the content of the text for the detection; every negative limit
    // is treated as "unlimited" so that a value such as -2 cannot reach
    // Arrays.copyOfRange and fail with an IllegalArgumentException
    byte[] subContent = content;
    if (maxLengthCharsetDetection >= 0
            && content.length > maxLengthCharsetDetection) {
        subContent = Arrays.copyOfRange(content, 0,
                maxLengthCharsetDetection);
    }
    charsetDetector.setText(subContent);
    try {
        CharsetMatch charsetMatch = charsetDetector.detect();
        if (charsetMatch == null) {
            // no candidate at all: report "unknown" explicitly instead of
            // relying on a caught NullPointerException
            return null;
        }
        return validateCharset(charsetMatch.getName());
    } catch (Exception e) {
        // detection is best effort: any failure means "unknown charset"
        return null;
    }
}

Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License

6 votes

/**
 * Runs ICU charset detection on the given stream.
 * <p>
 * If handing the stream to the detector fails with an {@link IOException},
 * the stream is closed, both failures are printed to standard error, and
 * detection is still attempted on whatever the detector holds.
 *
 * @param input stream whose content should be analysed
 * @return the result of {@code CharsetDetector.detect()}
 */
public static CharsetMatch checkCharset(InputStream input) {
	final CharsetDetector detector = new CharsetDetector();
	try {
		detector.setText(input);
	} catch (IOException readFailure) {
		// close the stream first, then report the original failure
		try {
			input.close();
		} catch (IOException closeFailure) {
			closeFailure.printStackTrace();
		}
		readFailure.printStackTrace();
	}
	return detector.detect();
}

Source File: StreamDecoder.java From batfish with Apache License 2.0

6 votes

/**
 * Reads the whole input stream, detects its charset, decodes it, and returns the decoded text.
 * When the stream contained at least one byte, a newline (encoded in the detected charset) is
 * appended before decoding. The provided input stream is not closed by this method.
 *
 * @throws IOException if there is an error
 */
@SuppressWarnings("PMD.CloseResource") // PMD does not understand Closer.
static @Nonnull String decodeStreamAndAppendNewline(@Nonnull InputStream inputStream)
    throws IOException {
  byte[] rawBytes = IOUtils.toByteArray(inputStream);
  String detectedName = new CharsetDetector().setText(rawBytes).detect().getName();
  Charset cs = Charset.forName(detectedName);
  try (Closer closer = Closer.create()) {
    InputStream contentStream =
        closer.register(bomInputStream(new ByteArrayInputStream(rawBytes)));
    InputStream decodableStream;
    if (rawBytes.length > 0) {
      // Non-empty input: chain the content with a single encoded newline.
      InputStream newlineStream =
          closer.register(bomInputStream(new ByteArrayInputStream("\n".getBytes(cs))));
      decodableStream = closer.register(new SequenceInputStream(contentStream, newlineStream));
    } else {
      decodableStream = closer.register(contentStream);
    }
    byte[] decodableBytes = IOUtils.toByteArray(decodableStream);
    return new String(decodableBytes, cs);
  }
}

Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License

6 votes

/**
 * Checks the character set of the given stream using ICU charset detection.
 * <p>
 * If handing the stream to the detector fails with an {@link IOException},
 * the stream is closed, both failures are printed to standard error, and
 * detection is still attempted.
 *
 * @param input stream whose content should be analysed
 * @return CharsetMatch as returned by {@code CharsetDetector.detect()}
 */
public static CharsetMatch checkCharset(InputStream input) {
	final CharsetDetector detector = new CharsetDetector();
	try {
		detector.setText(input);
	} catch (IOException readFailure) {
		// close the stream first, then report the original failure
		try {
			input.close();
		} catch (IOException closeFailure) {
			closeFailure.printStackTrace();
		}
		readFailure.printStackTrace();
	}
	return detector.detect();
}

Source File: MyCharsetUtils.java From spring-boot with Apache License 2.0

6 votes

/**
     * Detects the encoding of an input stream with icu4j. Only streams that
     * carry text can be detected.
     * -
     * (Replaces the earlier juniversalchardet based detection.)
     *
     * @param in stream to analyse
     * @return the detected charset, or {@code null} when the detector reports no match
     * @throws IOException if the detector fails to read the stream
     */
    public static Charset detectEncoding(InputStream in) throws IOException {
        final CharsetMatch bestMatch = new CharsetDetector().setText(in).detect();
        if (bestMatch != null) {
            // Confidence is an integer; the higher the value, the more
            // reliable the detected encoding.
            final int confidence = bestMatch.getConfidence();
            final String name = bestMatch.getName();
            log.info("CharsetMatch: {} ({}% 相似度，相似度小于 50% 时，可能编码无法判断。)", name, confidence);
            return Charset.forName(name);
        }
        log.info("Cannot detect source charset.");
        return null;
    }

Source File: RegisteredExtractors.java From document-management-system with GNU General Public License v2.0

6 votes

/**
 * Extract text to be indexed.
 * <p>
 * The content stream is wrapped in a {@link BufferedInputStream}, which is
 * always closed quietly before this method returns or throws. For
 * {@code text/*} content without a declared encoding, the encoding is
 * guessed with ICU's charset detector before extraction.
 *
 * @param mimeType MIME type used to look up the text extractor
 * @param encoding declared encoding, or {@code null} if unknown
 * @param isContent stream holding the document content
 * @return the text produced by the extractor registered for {@code mimeType}
 * @throws IOException if no extractor is registered for {@code mimeType},
 *         or if detection or extraction fails
 */
public static String getText(String mimeType, String encoding, InputStream isContent) throws IOException {
	BufferedInputStream bis = new BufferedInputStream(isContent);

	try {
		TextExtractor te = engine.get(mimeType);

		if (te == null) {
			throw new IOException("Full text indexing of '" + mimeType + "' is not supported");
		}

		if (mimeType.startsWith("text/") && encoding == null) {
			CharsetDetector detector = new CharsetDetector();
			detector.setText(bis);
			CharsetMatch cm = detector.detect();

			// detect() may yield no match; keep the encoding unknown in
			// that case instead of failing with a NullPointerException
			if (cm != null) {
				encoding = cm.getName();
			}
		}

		return te.extractText(bis, mimeType, encoding);
	} finally {
		// previously skipped when the extractor was missing or threw
		IOUtils.closeQuietly(bis);
	}
}

Source File: IcuDetectorSniffer.java From caja with Apache License 2.0

6 votes

/**
 * Sniffs the encoding of this stream with ICU's charset detector.
 *
 * @return the detected encoding (mapped to its actual HTML encoding when
 *         one is defined) if it is an ASCII superset other than
 *         windows-1252; {@code null} otherwise, including when detection
 *         fails for any reason
 */
public Encoding sniff() throws IOException {
    try {
        CharsetMatch bestMatch = new CharsetDetector().setText(this).detect();
        Encoding detected = Encoding.forName(bestMatch.getName());
        Encoding htmlEncoding = detected.getActualHtmlEncoding();
        Encoding candidate = (htmlEncoding != null) ? htmlEncoding : detected;
        // Reject windows-1252 and anything that is not an ASCII superset.
        if (candidate == Encoding.WINDOWS1252 || !candidate.isAsciiSuperset()) {
            return null;
        }
        return candidate;
    } catch (Exception e) {
        return null;
    }
}

Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License

6 votes

/**
 * Runs ICU charset detection on the given stream.
 * <p>
 * If handing the stream to the detector fails with an {@link IOException},
 * the stream is closed, both failures are printed to standard error, and
 * detection is still attempted on whatever the detector holds.
 *
 * @param input stream whose content should be analysed
 * @return the result of {@code CharsetDetector.detect()}
 */
public static CharsetMatch checkCharset(InputStream input) {
	final CharsetDetector detector = new CharsetDetector();
	try {
		detector.setText(input);
	} catch (IOException readFailure) {
		// close the stream first, then report the original failure
		try {
			input.close();
		} catch (IOException closeFailure) {
			closeFailure.printStackTrace();
		}
		readFailure.printStackTrace();
	}
	return detector.detect();
}

Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License

6 votes

/**
 * Checks the character set of the given stream using ICU charset detection.
 * <p>
 * If handing the stream to the detector fails with an {@link IOException},
 * the stream is closed, both failures are printed to standard error, and
 * detection is still attempted.
 *
 * @param input stream whose content should be analysed
 * @return CharsetMatch as returned by {@code CharsetDetector.detect()}
 */
public static CharsetMatch checkCharset(InputStream input) {
	final CharsetDetector detector = new CharsetDetector();
	try {
		detector.setText(input);
	} catch (IOException readFailure) {
		// close the stream first, then report the original failure
		try {
			input.close();
		} catch (IOException closeFailure) {
			closeFailure.printStackTrace();
		}
		readFailure.printStackTrace();
	}
	return detector.detect();
}

Source File: FeedUtils.java From commafeed with Apache License 2.0

5 votes

/**
 * Guesses the encoding of a byte array by running ICU charset detection on it.
 * <p>
 * Falls back to UTF-8 when the detector reports no match, and substitutes
 * windows-1252 whenever ISO-8859-1 is detected.
 *
 * @param bytes the bytes to analyse
 * @return the charset resolved from the detected (or fallback) name
 */
public static Charset detectEncoding(byte[] bytes) {
	CharsetMatch bestMatch = new CharsetDetector().setText(bytes).detect();

	String charsetName = (bestMatch == null) ? "UTF-8" : bestMatch.getName();
	boolean detectedLatin1 = charsetName.equalsIgnoreCase("ISO-8859-1");

	return Charset.forName(detectedLatin1 ? "windows-1252" : charsetName);
}

Source File: IOHelper.java From AsciidocFX with Apache License 2.0

5 votes

/**
 * Guesses the charset of the given bytes with ICU's charset detector.
 *
 * @param bytes the bytes to analyse
 * @return the detected charset name when the detector's confidence is above
 *         70, otherwise {@code null} (also when there is no match or
 *         detection fails)
 */
private static String detectCharset(byte[] bytes) {
    try {
        CharsetMatch charsetMatch = new CharsetDetector().setText(bytes).detect();
        // A missing match is an expected outcome, so test for it explicitly
        // rather than letting a NullPointerException signal it.
        if (charsetMatch != null && charsetMatch.getConfidence() > 70) {
            return charsetMatch.getName();
        }
    } catch (RuntimeException ignored) {
        // Detection is best effort: any failure is reported as "unknown charset".
    }
    return null;
}

Source File: IcuDetectorSniffer.java From caja with Apache License 2.0

5 votes

/**
 * Prints every charset name the ICU detector can recognise, one per line.
 *
 * @param args unused
 */
public static void main(String[] args) {
    for (String charsetName : CharsetDetector.getAllDetectableCharsets()) {
        System.out.println(charsetName);
    }
}

Source File: ChardetSniffer.java From caja with Apache License 2.0

5 votes

/**
 * Prints every charset name the ICU detector can recognise, one per line.
 *
 * @param args unused
 */
public static void main(String[] args) {
    for (String charsetName : CharsetDetector.getAllDetectableCharsets()) {
        System.out.println(charsetName);
    }
}

Source File: Source.java From tablesaw with Apache License 2.0

5 votes

/**
 * Returns the likely charset for the given byte[], if it can be determined. A confidence score is
 * calculated by the detector. The system default charset is returned instead when there is no
 * match, when the score is less than 60, or when the detected charset name is not one this JVM
 * can provide.
 *
 * @param buffer The byte array to evaluate
 * @return The likely charset, or the system default charset
 */
private static Charset getCharSet(byte[] buffer) {
  CharsetDetector detector = new CharsetDetector();
  detector.setText(buffer);
  CharsetMatch match = detector.detect();
  if (match == null || match.getConfidence() < 60) {
    return Charset.defaultCharset();
  }
  try {
    return Charset.forName(match.getName());
  } catch (IllegalArgumentException unsupported) {
    // Charset.forName signals illegal or unsupported names with subclasses of
    // IllegalArgumentException; honour the documented fallback instead of propagating.
    return Charset.defaultCharset();
  }
}

Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License

5 votes

/**
 * Checks the character set of the given bytes using ICU charset detection.
 *
 * @param input bytes to analyse
 * @return CharsetMatch as returned by {@code CharsetDetector.detect()}
 */
public static CharsetMatch checkCharset(byte[] input) {
	return new CharsetDetector().setText(input).detect();
}

Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License

5 votes

/**
 * Runs ICU charset detection on the given bytes.
 *
 * @param input bytes to analyse
 * @return the result of {@code CharsetDetector.detect()}
 */
public static CharsetMatch checkCharset(byte[] input) {
	return new CharsetDetector().setText(input).detect();
}

Source File: Source.java From tablesaw with Apache License 2.0

5 votes

/**
 * Returns the likely charset for the given byte[], if it can be determined. A confidence score is
 * calculated by the detector. The system default charset is returned instead when there is no
 * match, when the score is less than 60, or when the detected charset name is not one this JVM
 * can provide.
 *
 * @param buffer The byte array to evaluate
 * @return The likely charset, or the system default charset
 */
private static Charset getCharSet(byte[] buffer) {
  CharsetDetector detector = new CharsetDetector();
  detector.setText(buffer);
  CharsetMatch match = detector.detect();
  if (match == null || match.getConfidence() < 60) {
    return Charset.defaultCharset();
  }
  try {
    return Charset.forName(match.getName());
  } catch (IllegalArgumentException unsupported) {
    // Charset.forName signals illegal or unsupported names with subclasses of
    // IllegalArgumentException; honour the documented fallback instead of propagating.
    return Charset.defaultCharset();
  }
}

Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License

5 votes

/**
 * Checks the character set of the given bytes using ICU charset detection.
 *
 * @param input bytes to analyse
 * @return CharsetMatch as returned by {@code CharsetDetector.detect()}
 */
public static CharsetMatch checkCharset(byte[] input) {
	return new CharsetDetector().setText(input).detect();
}

Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License

5 votes

/**
 * Runs ICU charset detection on the given bytes.
 *
 * @param input bytes to analyse
 * @return the result of {@code CharsetDetector.detect()}
 */
public static CharsetMatch checkCharset(byte[] input) {
	return new CharsetDetector().setText(input).detect();
}

Source File: DetectCharsetStrategy.java From obevo with Apache License 2.0

5 votes

/**
 * Determines the charset of the given bytes with ICU's charset detector.
 *
 * @param bytes the bytes to analyse
 * @return the detected charset, or {@code null} when the detector reports no
 *         match or reading the bytes fails
 */
@Override
public Charset determineCharset(byte[] bytes) {
    try (ByteArrayInputStream input = new ByteArrayInputStream(bytes)) {
        CharsetDetector cd = new CharsetDetector();
        cd.setText(input);
        CharsetMatch cm = cd.detect();
        if (cm == null) {
            // No candidate: use the same "unknown" result as the I/O failure
            // path instead of failing with a NullPointerException.
            return null;
        }
        return Charset.forName(cm.getName());
    } catch (IOException e) {
        return null;
    }
}

Source File: EncodingDetector.java From anthelion with Apache License 2.0

4 votes

/**
 * Creates a detector with a fresh ICU {@code CharsetDetector} and an empty clue list.
 *
 * @param conf configuration supplying the minimum confidence under
 *        {@code MIN_CONFIDENCE_KEY}; {@code -1} is used when the key is absent
 */
public EncodingDetector(Configuration conf) {
  this.minConfidence = conf.getInt(MIN_CONFIDENCE_KEY, -1);
  this.detector = new CharsetDetector();
  this.clues = new ArrayList<EncodingClue>();
}

Source File: CommonUtil.java From batfish with Apache License 2.0

4 votes

/**
 * Detects the charset of the given bytes with ICU's charset detector and resolves it to a
 * {@link Charset}.
 *
 * @param bytes the bytes to analyse
 * @return the charset named by the detector's best match
 */
public static @Nonnull Charset detectCharset(byte[] bytes) {
  String detectedName = new CharsetDetector().setText(bytes).detect().getName();
  return Charset.forName(detectedName);
}

Source File: EncodingDetector.java From nutch-htmlunit with Apache License 2.0

4 votes

/**
 * Creates a detector with a fresh ICU {@code CharsetDetector} and an empty clue list.
 *
 * @param conf configuration supplying the minimum confidence under
 *        {@code MIN_CONFIDENCE_KEY}; {@code -1} is used when the key is absent
 */
public EncodingDetector(Configuration conf) {
  this.minConfidence = conf.getInt(MIN_CONFIDENCE_KEY, -1);
  this.detector = new CharsetDetector();
  this.clues = new ArrayList<EncodingClue>();
}

Source File: CharSet.java From knife with MIT License

4 votes

/**
 * Determines the charset of an HTTP response, trying in order: the
 * Content-Type header, a {@code charset=} declaration in the body, and
 * finally ICU charset detection on the raw response bytes.
 * <p>
 * The result is lower-cased; "utf8" spellings are normalised to "utf-8", and
 * a value containing one of a few common charset names is reduced to that
 * name.
 *
 * @param response raw HTTP response bytes
 * @return the lower-case charset name, or {@code null} when none of the three
 *         strategies yields one
 */
public static String getResponseCharset(byte[] response){
	Getter getter = new Getter(BurpExtender.callbacks.getHelpers());
	String contentType = getter.getHeaderValueOf(false,response,"Content-Type");
	String body = new String(getter.getBody(false,response));
	String tmpcharSet = null;

	if (contentType != null){//1. try the Content-Type header
		String loweredType = contentType.toLowerCase();
		int start = loweredType.indexOf("charset=");
		if (start >= 0) {
			// substring instead of split(...)[1]: a header ending in
			// "charset=" used to throw ArrayIndexOutOfBoundsException
			tmpcharSet = extractCharsetToken(loweredType.substring(start + "charset=".length()));
		}
	}

	if (tmpcharSet == null){//2. try a charset declaration in the body
		Pattern pCharsetDeclaration = Pattern.compile("charset=(.*?)>");
		Matcher matcher = pCharsetDeclaration.matcher(body);
		if (matcher.find()) {
			// group(1) is the captured value; group(0) would also include
			// the "charset=" prefix and the closing '>'
			tmpcharSet = extractCharsetToken(matcher.group(1).toLowerCase());
		}
	}

	if (tmpcharSet == null){//3. fall back to ICU4J detection
		CharsetDetector detector = new CharsetDetector();
		detector.setText(response);
		CharsetMatch cm = detector.detect();
		if (cm == null) {
			// nothing declared and nothing detected
			return null;
		}
		tmpcharSet = cm.getName();
	}

	tmpcharSet = tmpcharSet.toLowerCase().trim();
	if (tmpcharSet.contains("utf8")){
		tmpcharSet = "utf-8";
	}else {
		// Common encodings include ASCII, ANSI, GBK, GB2312, UTF-8, GB18030 and UNICODE.
		List<String> commonCharSet = Arrays.asList("ASCII,ANSI,GBK,GB2312,UTF-8,GB18030,UNICODE,ISO-8859-1".toLowerCase().split(","));
		for (String item:commonCharSet) {
			if (tmpcharSet.contains(item)) {
				tmpcharSet = item;
				break;
			}
		}
	}
	return tmpcharSet;
}

/**
 * Reduces the text that follows "charset=" to a bare charset token: leading
 * quotes and whitespace are dropped and the value is cut at the first quote,
 * whitespace, ';', '/' or '>'.
 *
 * @param raw text following "charset="
 * @return the bare token, or {@code null} when nothing remains
 */
private static String extractCharsetToken(String raw){
	String token = raw.replaceFirst("^[\"'\\s]+", "").split("[\"'\\s;/>]", 2)[0];
	return token.isEmpty() ? null : token;
}

com.ibm.icu.text.CharsetDetector Java Examples