com.ibm.icu.text.CharsetMatch Java Examples

Source File: CharsetIdentification.java From storm-crawler with Apache License 2.0

6 votes

/**
 * Last-resort charset guess: hands the raw bytes to a third party detector
 * and validates the name it reports.
 *
 * @param content
 *            raw bytes to analyse
 * @param declaredCharset
 *            charset announced elsewhere, given to the detector as a hint;
 *            may be null
 * @param maxLengthCharsetDetection
 *            maximum number of leading bytes handed to the detector, or -1
 *            for no limit
 * @return the outcome of validateCharset on the detected name, or null if
 *         detection or validation threw
 */
private static String getCharsetFromText(byte[] content,
        String declaredCharset, int maxLengthCharsetDetection) {
    final CharsetDetector detector = new CharsetDetector();
    // have the detector filter out HTML tags
    detector.enableInputFilter(true);
    // pass the declared charset along as a hint
    if (declaredCharset != null) {
        detector.setDeclaredEncoding(declaredCharset);
    }

    // only the leading bytes are analysed when a limit is configured
    final boolean truncate = maxLengthCharsetDetection != -1
            && content.length > maxLengthCharsetDetection;
    detector.setText(truncate
            ? Arrays.copyOfRange(content, 0, maxLengthCharsetDetection)
            : content);

    try {
        return validateCharset(detector.detect().getName());
    } catch (Exception e) {
        // any failure here simply means "charset unknown"
        return null;
    }
}

Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License

6 votes

/**
 * Runs a CharsetDetector over the given stream and returns its match.
 * If handing the stream to the detector raises an IOException, the stream
 * is closed, the stack traces are printed and detection is still attempted.
 *
 * @param input stream to examine
 * @return whatever the detector's detect() call yields
 */
public static CharsetMatch checkCharset(InputStream input) {
	final CharsetDetector detector = new CharsetDetector();
	try {
		detector.setText(input);
	} catch (IOException readFailure) {
		try {
			input.close();
		} catch (IOException closeFailure) {
			closeFailure.printStackTrace();
		}
		readFailure.printStackTrace();
	}
	return detector.detect();
}

Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License

6 votes

/**
 * Checks the character set of a stream.
 * <p>
 * If handing the stream to the detector raises an IOException, the stream
 * is closed, the stack traces are printed and detection is still attempted.
 *
 * @param input stream to examine
 * @return CharsetMatch produced by the detector's detect() call
 */
public static CharsetMatch checkCharset(InputStream input) {
	final CharsetDetector detector = new CharsetDetector();
	try {
		detector.setText(input);
	} catch (IOException readFailure) {
		try {
			input.close();
		} catch (IOException closeFailure) {
			closeFailure.printStackTrace();
		}
		readFailure.printStackTrace();
	}
	return detector.detect();
}

Source File: MyCharsetUtils.java From spring-boot with Apache License 2.0

6 votes

/**
     * Detects the encoding of an input stream with icu4j; only text streams
     * can be detected.
     * -
     * juniversalchardet was dropped in favour of this.
     *
     * @param in stream whose encoding should be detected
     * @return the detected charset, or null when the detector reports no match
     * @throws IOException if the detector fails to read the stream
     */
    public static Charset detectEncoding(InputStream in) throws IOException {
        final CharsetDetector icuDetector = new CharsetDetector();
        icuDetector.setText(in);

        final CharsetMatch best = icuDetector.detect();
        if (best == null) {
            log.info("Cannot detect source charset.");
            return null;
        }

        // Confidence is an integer from 0 to 100; the higher the value, the
        // more reliable the detected charset.
        final String detectedName = best.getName();
        final int score = best.getConfidence();
        log.info("CharsetMatch: {} ({}% 相似度，相似度小于 50% 时，可能编码无法判断。)", detectedName, score);
        // detector.detectAll() would list every candidate charset if needed.
        return Charset.forName(detectedName);
    }

Source File: RegisteredExtractors.java From document-management-system with GNU General Public License v2.0

6 votes

/**
 * Extract text to be indexed.
 * <p>
 * The content stream is wrapped in a BufferedInputStream which is always
 * closed before this method returns, whether it succeeds or throws.
 *
 * @param mimeType MIME type used to look up the text extractor
 * @param encoding character encoding of the content; when null and the MIME
 *        type starts with "text/", the encoding is guessed from the content
 * @param isContent stream holding the document content
 * @return the text produced by the extractor
 * @throws IOException if no extractor is registered for the MIME type, or
 *         if detection or extraction fails
 */
public static String getText(String mimeType, String encoding, InputStream isContent) throws IOException {
	BufferedInputStream bis = new BufferedInputStream(isContent);

	try {
		TextExtractor te = engine.get(mimeType);

		if (te == null) {
			throw new IOException("Full text indexing of '" + mimeType + "' is not supported");
		}

		if (mimeType.startsWith("text/") && encoding == null) {
			CharsetDetector detector = new CharsetDetector();
			detector.setText(bis);
			CharsetMatch cm = detector.detect();

			// detect() may yield null when nothing matches; keep encoding unset then
			if (cm != null) {
				encoding = cm.getName();
			}
		}

		return te.extractText(bis, mimeType, encoding);
	} finally {
		// release the stream on every exit path, including failures
		IOUtils.closeQuietly(bis);
	}
}

Source File: IcuDetectorSniffer.java From caja with Apache License 2.0

6 votes

/**
 * Asks a CharsetDetector to guess the encoding of this stream.
 *
 * @return the detected encoding (replaced by its actual HTML encoding when
 *         one is reported) if it is an ASCII superset other than
 *         windows-1252; null otherwise, or if anything throws
 */
public Encoding sniff() throws IOException {
    try {
        CharsetDetector icu = new CharsetDetector();
        icu.setText(this);
        Encoding detected = Encoding.forName(icu.detect().getName());
        Encoding htmlEncoding = detected.getActualHtmlEncoding();
        Encoding effective = (htmlEncoding != null) ? htmlEncoding : detected;
        boolean usable = effective != Encoding.WINDOWS1252
                && effective.isAsciiSuperset();
        return usable ? effective : null;
    } catch (Exception e) {
        // detection is best effort: any failure means "no opinion"
        return null;
    }
}

Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License

6 votes

/**
 * Runs a CharsetDetector over the given stream and returns its match.
 * If handing the stream to the detector raises an IOException, the stream
 * is closed, the stack traces are printed and detection is still attempted.
 *
 * @param input stream to examine
 * @return whatever the detector's detect() call yields
 */
public static CharsetMatch checkCharset(InputStream input) {
	final CharsetDetector detector = new CharsetDetector();
	try {
		detector.setText(input);
	} catch (IOException readFailure) {
		try {
			input.close();
		} catch (IOException closeFailure) {
			closeFailure.printStackTrace();
		}
		readFailure.printStackTrace();
	}
	return detector.detect();
}

Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License

6 votes

/**
 * Checks the character set of a stream.
 * <p>
 * If handing the stream to the detector raises an IOException, the stream
 * is closed, the stack traces are printed and detection is still attempted.
 *
 * @param input stream to examine
 * @return CharsetMatch produced by the detector's detect() call
 */
public static CharsetMatch checkCharset(InputStream input) {
	final CharsetDetector detector = new CharsetDetector();
	try {
		detector.setText(input);
	} catch (IOException readFailure) {
		try {
			input.close();
		} catch (IOException closeFailure) {
			closeFailure.printStackTrace();
		}
		readFailure.printStackTrace();
	}
	return detector.detect();
}

Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License

5 votes

/**
 * Runs a CharsetDetector over the given bytes.
 *
 * @param input bytes to examine
 * @return whatever the detector's detect() call yields
 */
public static CharsetMatch checkCharset(byte[] input) {
	final CharsetDetector detector = new CharsetDetector();
	detector.setText(input);
	return detector.detect();
}

Source File: FeedUtils.java From commafeed with Apache License 2.0

5 votes

/**
 * Detect encoding by analyzing characters in the array.
 * <p>
 * UTF-8 is assumed when the detector reports no match, and also when the
 * resulting charset name is not usable on this JVM. A detected ISO-8859-1
 * is reported as windows-1252.
 *
 * @param bytes content to analyse
 * @return the detected charset, never null
 */
public static Charset detectEncoding(byte[] bytes) {
	String encoding = "UTF-8";

	CharsetDetector detector = new CharsetDetector();
	detector.setText(bytes);
	CharsetMatch match = detector.detect();
	if (match != null) {
		encoding = match.getName();
	}
	if (encoding.equalsIgnoreCase("ISO-8859-1")) {
		encoding = "windows-1252";
	}
	try {
		return Charset.forName(encoding);
	} catch (IllegalArgumentException e) {
		// Covers IllegalCharsetNameException and UnsupportedCharsetException:
		// the detector may name a charset this JVM cannot provide.
		return Charset.forName("UTF-8");
	}
}

Source File: EncodingDetector.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Collects charset clues for the given content: one clue per detector match
 * (when detection applies) followed by one clue taken from the HTTP
 * Content-Type response header.
 *
 * @param content content whose bytes, content type and metadata are inspected
 * @param filter passed to the detector's enableInputFilter
 */
public void autoDetectClues(Content content, boolean filter) {
  final byte[] raw = content.getContent();

  final boolean detectable = minConfidence >= 0
      && DETECTABLES.contains(content.getContentType())
      && raw.length > MIN_LENGTH;

  if (detectable) {
    CharsetMatch[] candidates = null;

    // the detector calls stay inside try/catch because setText and
    // detectAll will sometimes throw exceptions
    try {
      detector.enableInputFilter(filter);
      // the length was already checked above, so no second check is needed
      detector.setText(raw);
      candidates = detector.detectAll();
    } catch (Exception e) {
      LOG.debug("Exception from ICU4J (ignoring): ", e);
    }

    if (candidates != null) {
      for (CharsetMatch candidate : candidates) {
        addClue(candidate.getName(), "detect", candidate.getConfidence());
      }
    }
  }

  // the encoding named in the HTTP response header is always added
  addClue(parseCharacterEncoding(
      content.getMetadata().get(Response.CONTENT_TYPE)), "header");
}

Source File: IOHelper.java From AsciidocFX with Apache License 2.0

5 votes

/**
 * Guesses the charset of the given bytes.
 *
 * @param bytes content to analyse
 * @return the detected charset name when the detector's confidence exceeds
 *         70; null otherwise or when detection throws
 */
private static String detectCharset(byte[] bytes) {
    try {
        CharsetMatch best = new CharsetDetector().setText(bytes).detect();
        return best.getConfidence() > 70 ? best.getName() : null;
    } catch (Exception e) {
        // best effort: a failed detection is reported as "unknown"
        return null;
    }
}

Source File: Source.java From tablesaw with Apache License 2.0

5 votes

/**
 * Returns the likely charset for the given byte[], if it can be determined. A confidence score is
 * calculated. If the score is less than 60 (on a 0 to 100 scale), or the detected charset is not
 * available on this JVM, the system default charset is returned instead.
 *
 * @param buffer The byte array to evaluate
 * @return The likely charset, or the system default charset
 */
private static Charset getCharSet(byte[] buffer) {
  CharsetDetector detector = new CharsetDetector();
  detector.setText(buffer);
  CharsetMatch match = detector.detect();
  if (match == null || match.getConfidence() < 60) {
    return Charset.defaultCharset();
  }
  try {
    return Charset.forName(match.getName());
  } catch (IllegalArgumentException e) {
    // Covers IllegalCharsetNameException and UnsupportedCharsetException: the detector may
    // name a charset this JVM cannot provide.
    return Charset.defaultCharset();
  }
}

Source File: EncodingDetector.java From anthelion with Apache License 2.0

5 votes

/**
 * Collects charset clues for the given content: one clue per detector match
 * (when detection applies) followed by one clue taken from the HTTP
 * Content-Type response header.
 *
 * @param content content whose bytes, content type and metadata are inspected
 * @param filter passed to the detector's enableInputFilter
 */
public void autoDetectClues(Content content, boolean filter) {
  final byte[] raw = content.getContent();

  final boolean detectable = minConfidence >= 0
      && DETECTABLES.contains(content.getContentType())
      && raw.length > MIN_LENGTH;

  if (detectable) {
    CharsetMatch[] candidates = null;

    // the detector calls stay inside try/catch because setText and
    // detectAll will sometimes throw exceptions
    try {
      detector.enableInputFilter(filter);
      // the length was already checked above, so no second check is needed
      detector.setText(raw);
      candidates = detector.detectAll();
    } catch (Exception e) {
      LOG.debug("Exception from ICU4J (ignoring): ", e);
    }

    if (candidates != null) {
      for (CharsetMatch candidate : candidates) {
        addClue(candidate.getName(), "detect", candidate.getConfidence());
      }
    }
  }

  // the encoding named in the HTTP response header is always added
  addClue(parseCharacterEncoding(
      content.getMetadata().get(Response.CONTENT_TYPE)), "header");
}

Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License

5 votes

/**
 * Checks the character set of a byte array.
 *
 * @param input bytes to examine
 * @return CharsetMatch produced by the detector's detect() call
 */
public static CharsetMatch checkCharset(byte[] input) {
	final CharsetDetector detector = new CharsetDetector();
	detector.setText(input);
	return detector.detect();
}

Source File: Source.java From tablesaw with Apache License 2.0

5 votes

/**
 * Returns the likely charset for the given byte[], if it can be determined. A confidence score is
 * calculated. If the score is less than 60 (on a 0 to 100 scale), or the detected charset is not
 * available on this JVM, the system default charset is returned instead.
 *
 * @param buffer The byte array to evaluate
 * @return The likely charset, or the system default charset
 */
private static Charset getCharSet(byte[] buffer) {
  CharsetDetector detector = new CharsetDetector();
  detector.setText(buffer);
  CharsetMatch match = detector.detect();
  if (match == null || match.getConfidence() < 60) {
    return Charset.defaultCharset();
  }
  try {
    return Charset.forName(match.getName());
  } catch (IllegalArgumentException e) {
    // Covers IllegalCharsetNameException and UnsupportedCharsetException: the detector may
    // name a charset this JVM cannot provide.
    return Charset.defaultCharset();
  }
}

Source File: ICUCharsetDetectorWrapper.java From dkpro-c4corpus with Apache License 2.0

5 votes

/**
 * Detects the charset of the given bytes using the shared detector.
 *
 * @param bytes content to analyse; only the first SUFFICIENT_BYTE_ARRAY_SIZE
 *        bytes are handed to the detector
 * @param declaredCharset charset hint passed to the detector; may be null
 * @return the detected charset, or FALLBACK_CHARSET when the detector has no
 *         match or names a charset that is not supported
 */
@Override
public Charset detectCharset(byte[] bytes, String declaredCharset)
{
    // longer inputs are cut down to a leading sample
    byte[] sample = bytes.length > SUFFICIENT_BYTE_ARRAY_SIZE
            ? Arrays.copyOf(bytes, SUFFICIENT_BYTE_ARRAY_SIZE)
            : bytes;
    charsetDetector.setText(sample);

    if (declaredCharset != null) {
        charsetDetector.setDeclaredEncoding(declaredCharset);
    }

    CharsetMatch best = charsetDetector.detect();
    if (best == null) {
        return FALLBACK_CHARSET;
    }

    try {
        return Charset.forName(best.getName());
    }
    catch (UnsupportedCharsetException ex) {
        // detected charset is unavailable here: use the fallback
        return FALLBACK_CHARSET;
    }
}

Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License

5 votes

/**
 * Checks the character set of a byte array.
 *
 * @param input bytes to examine
 * @return CharsetMatch produced by the detector's detect() call
 */
public static CharsetMatch checkCharset(byte[] input) {
	final CharsetDetector detector = new CharsetDetector();
	detector.setText(input);
	return detector.detect();
}

Source File: FileUtil.java From SDA with BSD 2-Clause "Simplified" License

5 votes

/**
 * Runs a CharsetDetector over the given bytes.
 *
 * @param input bytes to examine
 * @return whatever the detector's detect() call yields
 */
public static CharsetMatch checkCharset(byte[] input) {
	final CharsetDetector detector = new CharsetDetector();
	detector.setText(input);
	return detector.detect();
}

Source File: DetectCharsetStrategy.java From obevo with Apache License 2.0

5 votes

/**
 * Determines the charset of the given bytes with a CharsetDetector.
 *
 * @param bytes content to analyse
 * @return the detected charset, or null when the detector reports no match
 *         or reading the bytes fails
 */
@Override
public Charset determineCharset(byte[] bytes) {
    try (ByteArrayInputStream input = new ByteArrayInputStream(bytes)) {
        CharsetDetector cd = new CharsetDetector();
        cd.setText(input);
        CharsetMatch cm = cd.detect();
        // detect() may yield null when nothing matches; report that as
        // "undetermined" instead of failing with a NullPointerException
        if (cm == null) {
            return null;
        }
        return Charset.forName(cm.getName());
    } catch (IOException e) {
        return null;
    }
}

Source File: CharSet.java From knife with MIT License

4 votes

/**
 * Works out the charset of an HTTP response. Three sources are tried in
 * order: the "charset=" value of the Content-Type header, the first
 * "charset=...>" occurrence in the body, and finally ICU4J detection on the
 * raw response bytes. The result is lower-cased and, when it contains one
 * of a few common charset names, reduced to that name.
 *
 * @param response raw HTTP response bytes
 * @return the lower-case charset name, or null if no source yields one
 */
public static String getResponseCharset(byte[] response){
	Getter getter = new Getter(BurpExtender.callbacks.getHelpers());
	String contentType = getter.getHeaderValueOf(false,response,"Content-Type");
	String body = new String(getter.getBody(false,response));
	String tmpcharSet = null;

	if (contentType != null){//1. try the Content-Type header
		String lowerContentType = contentType.toLowerCase();
		int marker = lowerContentType.indexOf("charset=");
		if (marker >= 0) {
			// an empty value (header ending in "charset=") is treated as absent
			String value = lowerContentType.substring(marker + "charset=".length()).trim();
			if (!value.isEmpty()) {
				tmpcharSet = value;
			}
		}
	}

	if (tmpcharSet == null){//2. try the body
		Pattern charsetPattern = Pattern.compile("charset=(.*?)>");
		Matcher matcher = charsetPattern.matcher(body);
		if (matcher.find()) {
			// keep only the captured value and strip the quote and slash
			// characters that surround it inside a meta tag
			String value = matcher.group(1).toLowerCase()
					.replace("\"", "")
					.replace("'", "")
					.replace("/", "")
					.trim();
			if (!value.isEmpty()) {
				tmpcharSet = value;
			}
		}
	}

	if (tmpcharSet == null){//3. fall back to ICU4J detection
		CharsetDetector detector = new CharsetDetector();
		detector.setText(response);
		CharsetMatch cm = detector.detect();
		if (cm == null) {
			// nothing left to try
			return null;
		}
		tmpcharSet = cm.getName();
	}

	tmpcharSet = tmpcharSet.toLowerCase().trim();
	if (tmpcharSet.contains("utf8")){
		tmpcharSet = "utf-8";
	}else {
		// common charsets include ASCII, ANSI, GBK, GB2312, UTF-8, GB18030 and UNICODE
		List<String> commonCharSet = Arrays.asList("ASCII,ANSI,GBK,GB2312,UTF-8,GB18030,UNICODE,ISO-8859-1".toLowerCase().split(","));
		for (String item:commonCharSet) {
			if (tmpcharSet.contains(item)) {
				tmpcharSet = item;
			}
		}
	}
	return tmpcharSet;
}

com.ibm.icu.text.CharsetMatch Java Examples