com.ibm.icu.text.CharsetMatch Java Examples

The following examples show how to use com.ibm.icu.text.CharsetMatch. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: CharsetIdentification.java    From storm-crawler with Apache License 2.0 6 votes vote down vote up
/**
 * Use a third party library (ICU4J's {@code CharsetDetector}) as last resort
 * to guess the charset from the bytes.
 *
 * @param content raw bytes to analyse
 * @param declaredCharset charset announced elsewhere, passed to the detector
 *        as a hint; may be {@code null}
 * @param maxLengthCharsetDetection maximum number of leading bytes handed to
 *        the detector; {@code -1} (or any other negative value) disables the
 *        truncation
 * @return the result of {@code validateCharset} applied to the detected
 *         name, or {@code null} when nothing was detected or detection failed
 */
private static String getCharsetFromText(byte[] content,
        String declaredCharset, int maxLengthCharsetDetection) {
    CharsetDetector charsetDetector = new CharsetDetector();
    // filter HTML tags
    charsetDetector.enableInputFilter(true);
    // give it a hint
    if (declaredCharset != null) {
        charsetDetector.setDeclaredEncoding(declaredCharset);
    }
    // trim the content of the text for the detection; a negative limit means
    // "no limit" (previously any negative value other than -1 made the copy
    // throw IllegalArgumentException)
    byte[] subContent = content;
    if (maxLengthCharsetDetection >= 0
            && content.length > maxLengthCharsetDetection) {
        subContent = Arrays.copyOf(content, maxLengthCharsetDetection);
    }
    charsetDetector.setText(subContent);
    try {
        CharsetMatch charsetMatch = charsetDetector.detect();
        // no match is a normal outcome, not an exceptional one
        if (charsetMatch == null) {
            return null;
        }
        return validateCharset(charsetMatch.getName());
    } catch (Exception e) {
        // detection is best effort: any failure is reported as "unknown"
        return null;
    }
}
 
Example #2
Source File: FileUtil.java    From SDA with BSD 2-Clause "Simplified" License 6 votes vote down vote up
/**
 * Feeds the stream to an ICU4J {@code CharsetDetector} and returns the
 * result of {@code detect()}.
 * <p>
 * When reading the stream fails, the stream is closed, both the close
 * failure (if any) and the read failure are printed, and detection is still
 * attempted.
 *
 * @param input stream whose charset should be detected
 * @return the match reported by the detector
 */
public static CharsetMatch checkCharset(InputStream input) {
	final CharsetDetector detector = new CharsetDetector();
	try {
		detector.setText(input);
	} catch (IOException readFailure) {
		// the stream is unusable: release it, then report the original problem
		try {
			input.close();
		} catch (IOException closeFailure) {
			closeFailure.printStackTrace();
		}
		readFailure.printStackTrace();
	}
	// NOTE(review): this also runs after a failed setText(); confirm that
	// callers can cope with whatever the detector reports in that case.
	return detector.detect();
}
 
Example #3
Source File: FileUtil.java    From SDA with BSD 2-Clause "Simplified" License 6 votes vote down vote up
/**
 * Checks the character set of a stream.
 * <p>
 * If the detector cannot read the stream, the stream is closed, the
 * failures are printed, and detection is attempted anyway.
 *
 * @param input stream to inspect
 * @return CharsetMatch reported by the detector
 */
public static CharsetMatch checkCharset(InputStream input) {
	final CharsetDetector detector = new CharsetDetector();
	try {
		detector.setText(input);
	} catch (IOException readFailure) {
		// give up on the stream first, then report why reading failed
		try {
			input.close();
		} catch (IOException closeFailure) {
			closeFailure.printStackTrace();
		}
		readFailure.printStackTrace();
	}
	// NOTE(review): reached even when setText() failed; verify that callers
	// handle the resulting match.
	return detector.detect();
}
 
Example #4
Source File: MyCharsetUtils.java    From spring-boot with Apache License 2.0 6 votes vote down vote up
/**
     * Detects the encoding of an input stream with icu4j. Only streams that
     * carry text can be detected.
     * <p>
     * juniversalchardet was dropped in favour of icu4j.
     * <p>
     * NOTE(review): ICU4J documents that the stream passed to
     * {@code setText} must support mark/reset; confirm callers pass such a
     * stream.
     *
     * @param in stream to inspect
     * @return the detected charset, or {@code null} when nothing could be
     *         detected or the detected name is not usable on this JVM
     * @throws IOException if the detector fails to read the stream
     */
    public static Charset detectEncoding(InputStream in) throws IOException {
        final CharsetDetector detector = new CharsetDetector();
        detector.setText(in);

        final CharsetMatch charsetMatch = detector.detect();
        if (charsetMatch == null) {
            log.info("Cannot detect source charset.");
            return null;
        }
        // Confidence is an integer from 0 to 100; the higher the value, the
        // more reliable the detection result.
        final int confidence = charsetMatch.getConfidence();
        final String name = charsetMatch.getName();
        log.info("CharsetMatch: {} ({}% 相似度,相似度小于 50% 时,可能编码无法判断。)", name, confidence);
        try {
            return Charset.forName(name);
        } catch (IllegalArgumentException e) {
            // Charset.forName throws IllegalCharsetNameException or
            // UnsupportedCharsetException (both IllegalArgumentException) when
            // the name is not usable on this JVM; report that like any other
            // "could not detect" outcome instead of failing the caller.
            log.info("Detected charset {} is not supported by this JVM.", name);
            return null;
        }
    }
 
Example #5
Source File: RegisteredExtractors.java    From document-management-system with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Extract text to be indexed.
 * <p>
 * For {@code text/*} content without a given encoding, the encoding is
 * guessed with ICU4J before extraction. The content stream is always closed
 * before this method returns, whether extraction succeeds or fails.
 *
 * @param mimeType MIME type used to look up the extractor
 * @param encoding encoding of the content, or {@code null} to have it
 *        detected for {@code text/*} content
 * @param isContent content stream; closed by this method
 * @return the text produced by the extractor registered for the MIME type
 * @throws IOException if no extractor is registered for the MIME type, or
 *         if reading the content fails
 */
public static String getText(String mimeType, String encoding, InputStream isContent) throws IOException {
	BufferedInputStream bis = new BufferedInputStream(isContent);

	try {
		TextExtractor te = engine.get(mimeType);

		if (te == null) {
			throw new IOException("Full text indexing of '" + mimeType + "' is not supported");
		}

		if (mimeType.startsWith("text/") && encoding == null) {
			CharsetDetector detector = new CharsetDetector();
			detector.setText(bis);
			CharsetMatch cm = detector.detect();

			// The detector may find no match; leave the encoding unset then
			// instead of failing with a NullPointerException.
			// NOTE(review): confirm the extractors accept a null encoding.
			if (cm != null) {
				encoding = cm.getName();
			}
		}

		return te.extractText(bis, mimeType, encoding);
	} finally {
		// previously skipped whenever an exception was thrown above
		IOUtils.closeQuietly(bis);
	}
}
 
Example #6
Source File: IcuDetectorSniffer.java    From caja with Apache License 2.0 6 votes vote down vote up
/**
 * Guesses the encoding of this stream with ICU4J.
 * <p>
 * The detected name is mapped to an {@code Encoding}; if that encoding
 * reports an actual HTML encoding, the latter is used instead.
 *
 * @return the resolved encoding when it is not {@code WINDOWS1252} and is an
 *         ASCII superset; {@code null} otherwise, including when detection
 *         fails for any reason
 */
public Encoding sniff() throws IOException {
    try {
        CharsetDetector icuDetector = new CharsetDetector();
        icuDetector.setText(this);
        CharsetMatch best = icuDetector.detect();

        Encoding candidate = Encoding.forName(best.getName());
        Encoding htmlEncoding = candidate.getActualHtmlEncoding();
        Encoding resolved = (htmlEncoding != null) ? htmlEncoding : candidate;

        boolean usable = resolved != Encoding.WINDOWS1252
                && resolved.isAsciiSuperset();
        return usable ? resolved : null;
    } catch (Exception e) {
        // any failure while sniffing simply means "no answer"
        return null;
    }
}
 
Example #7
Source File: FileUtil.java    From SDA with BSD 2-Clause "Simplified" License 6 votes vote down vote up
/**
 * Feeds the stream to an ICU4J {@code CharsetDetector} and returns the
 * result of {@code detect()}.
 * <p>
 * When reading the stream fails, the stream is closed, both the close
 * failure (if any) and the read failure are printed, and detection is still
 * attempted.
 *
 * @param input stream whose charset should be detected
 * @return the match reported by the detector
 */
public static CharsetMatch checkCharset(InputStream input) {
	final CharsetDetector detector = new CharsetDetector();
	try {
		detector.setText(input);
	} catch (IOException readFailure) {
		// the stream is unusable: release it, then report the original problem
		try {
			input.close();
		} catch (IOException closeFailure) {
			closeFailure.printStackTrace();
		}
		readFailure.printStackTrace();
	}
	// NOTE(review): this also runs after a failed setText(); confirm that
	// callers can cope with whatever the detector reports in that case.
	return detector.detect();
}
 
Example #8
Source File: FileUtil.java    From SDA with BSD 2-Clause "Simplified" License 6 votes vote down vote up
/**
 * Checks the character set of a stream.
 * <p>
 * If the detector cannot read the stream, the stream is closed, the
 * failures are printed, and detection is attempted anyway.
 *
 * @param input stream to inspect
 * @return CharsetMatch reported by the detector
 */
public static CharsetMatch checkCharset(InputStream input) {
	final CharsetDetector detector = new CharsetDetector();
	try {
		detector.setText(input);
	} catch (IOException readFailure) {
		// give up on the stream first, then report why reading failed
		try {
			input.close();
		} catch (IOException closeFailure) {
			closeFailure.printStackTrace();
		}
		readFailure.printStackTrace();
	}
	// NOTE(review): reached even when setText() failed; verify that callers
	// handle the resulting match.
	return detector.detect();
}
 
Example #9
Source File: FileUtil.java    From SDA with BSD 2-Clause "Simplified" License 5 votes vote down vote up
/**
 * Runs ICU4J charset detection over a byte array.
 *
 * @param input bytes to inspect
 * @return the match reported by {@code CharsetDetector.detect()}
 */
public static CharsetMatch checkCharset(byte[] input) {
	// a fresh detector per call; setText() hands the detector back for chaining
	return new CharsetDetector().setText(input).detect();
}
 
Example #10
Source File: FeedUtils.java    From commafeed with Apache License 2.0 5 votes vote down vote up
/**
 * Detect encoding by analyzing characters in the array.
 * <p>
 * UTF-8 is assumed when the detector reports no match or reports a charset
 * name this JVM cannot provide. A detected ISO-8859-1 is replaced by
 * windows-1252.
 *
 * @param bytes content to analyse
 * @return the detected charset, or UTF-8 as fallback
 */
public static Charset detectEncoding(byte[] bytes) {
	String encoding = "UTF-8";

	CharsetDetector detector = new CharsetDetector();
	detector.setText(bytes);
	CharsetMatch match = detector.detect();
	if (match != null) {
		encoding = match.getName();
	}
	if (encoding.equalsIgnoreCase("ISO-8859-1")) {
		encoding = "windows-1252";
	}
	try {
		return Charset.forName(encoding);
	} catch (IllegalArgumentException e) {
		// IllegalCharsetNameException / UnsupportedCharsetException: the name
		// is not usable on this JVM, so use the same default as when nothing
		// was detected.
		return Charset.forName("UTF-8");
	}
}
 
Example #11
Source File: EncodingDetector.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Collects encoding clues for the given content.
 * <p>
 * When {@code minConfidence} is non-negative, the content type is listed in
 * {@code DETECTABLES} and the content is longer than {@code MIN_LENGTH}
 * bytes, every match reported by the ICU4J detector is added as a "detect"
 * clue together with its confidence. Independently of that, the character
 * encoding parsed from the Content-Type metadata is added as a "header"
 * clue.
 *
 * @param content content to analyse
 * @param filter whether the detector's input filter is enabled
 */
public void autoDetectClues(Content content, boolean filter) {
  byte[] data = content.getContent();

  boolean detectable = minConfidence >= 0
      && DETECTABLES.contains(content.getContentType())
      && data.length > MIN_LENGTH;

  if (detectable) {
    CharsetMatch[] candidates = null;

    // the detector calls are known to throw occasionally, so all of them
    // are guarded together
    try {
      detector.enableInputFilter(filter);
      detector.setText(data);
      candidates = detector.detectAll();
    } catch (Exception e) {
      LOG.debug("Exception from ICU4J (ignoring): ", e);
    }

    if (candidates != null) {
      for (int i = 0; i < candidates.length; i++) {
        CharsetMatch candidate = candidates[i];
        addClue(candidate.getName(), "detect", candidate.getConfidence());
      }
    }
  }

  // the encoding announced by the HTTP response header is always recorded
  String contentTypeHeader = content.getMetadata().get(Response.CONTENT_TYPE);
  addClue(parseCharacterEncoding(contentTypeHeader), "header");
}
 
Example #12
Source File: IOHelper.java    From AsciidocFX with Apache License 2.0 5 votes vote down vote up
/**
 * Guesses the charset of the given bytes with ICU4J.
 *
 * @param bytes content to analyse
 * @return the detected charset name when the detector's confidence is above
 *         70, otherwise {@code null} (also when nothing is detected or the
 *         detector fails)
 */
private static String detectCharset(byte[] bytes) {
    try {
        CharsetMatch charsetMatch = new CharsetDetector().setText(bytes).detect();
        // "no match" is checked explicitly instead of relying on the catch
        // below to absorb a NullPointerException
        if (charsetMatch != null && charsetMatch.getConfidence() > 70) {
            return charsetMatch.getName();
        }
    } catch (Exception ignored) {
        // detection is best effort: a detector failure means "unknown"
    }
    return null;
}
 
Example #13
Source File: Source.java    From tablesaw with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the likely charset for the given byte[], if it can be determined. A confidence score is
 * calculated. If the score is less than 60 (on a 1 to 100 interval) the system default charset is
 * returned instead. The system default is also returned when nothing is detected or when the
 * detected charset name is not usable on this JVM.
 *
 * @param buffer The byte array to evaluate
 * @return The likely charset, or the system default charset
 */
private static Charset getCharSet(byte[] buffer) {
  CharsetDetector detector = new CharsetDetector();
  detector.setText(buffer);
  CharsetMatch match = detector.detect();
  if (match == null || match.getConfidence() < 60) {
    return Charset.defaultCharset();
  }
  try {
    return Charset.forName(match.getName());
  } catch (IllegalArgumentException e) {
    // IllegalCharsetNameException / UnsupportedCharsetException: honour the documented fallback
    // instead of propagating an unchecked exception
    return Charset.defaultCharset();
  }
}
 
Example #14
Source File: EncodingDetector.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Collects encoding clues for the given content.
 * <p>
 * When {@code minConfidence} is non-negative, the content type is listed in
 * {@code DETECTABLES} and the content is longer than {@code MIN_LENGTH}
 * bytes, every match reported by the ICU4J detector is added as a "detect"
 * clue together with its confidence. Independently of that, the character
 * encoding parsed from the Content-Type metadata is added as a "header"
 * clue.
 *
 * @param content content to analyse
 * @param filter whether the detector's input filter is enabled
 */
public void autoDetectClues(Content content, boolean filter) {
  byte[] data = content.getContent();

  boolean detectable = minConfidence >= 0
      && DETECTABLES.contains(content.getContentType())
      && data.length > MIN_LENGTH;

  if (detectable) {
    CharsetMatch[] candidates = null;

    // the detector calls are known to throw occasionally, so all of them
    // are guarded together
    try {
      detector.enableInputFilter(filter);
      detector.setText(data);
      candidates = detector.detectAll();
    } catch (Exception e) {
      LOG.debug("Exception from ICU4J (ignoring): ", e);
    }

    if (candidates != null) {
      for (int i = 0; i < candidates.length; i++) {
        CharsetMatch candidate = candidates[i];
        addClue(candidate.getName(), "detect", candidate.getConfidence());
      }
    }
  }

  // the encoding announced by the HTTP response header is always recorded
  String contentTypeHeader = content.getMetadata().get(Response.CONTENT_TYPE);
  addClue(parseCharacterEncoding(contentTypeHeader), "header");
}
 
Example #15
Source File: FileUtil.java    From SDA with BSD 2-Clause "Simplified" License 5 votes vote down vote up
/**
 * Checks the character set of a byte array.
 *
 * @param input bytes to inspect
 * @return CharsetMatch reported by the detector
 */
public static CharsetMatch checkCharset(byte[] input) {
	// one throw-away detector per call; setText() returns the detector itself
	return new CharsetDetector().setText(input).detect();
}
 
Example #16
Source File: Source.java    From tablesaw with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the likely charset for the given byte[], if it can be determined. A confidence score is
 * calculated. If the score is less than 60 (on a 1 to 100 interval) the system default charset is
 * returned instead. The system default is also returned when nothing is detected or when the
 * detected charset name is not usable on this JVM.
 *
 * @param buffer The byte array to evaluate
 * @return The likely charset, or the system default charset
 */
private static Charset getCharSet(byte[] buffer) {
  CharsetDetector detector = new CharsetDetector();
  detector.setText(buffer);
  CharsetMatch match = detector.detect();
  if (match == null || match.getConfidence() < 60) {
    return Charset.defaultCharset();
  }
  try {
    return Charset.forName(match.getName());
  } catch (IllegalArgumentException e) {
    // IllegalCharsetNameException / UnsupportedCharsetException: honour the documented fallback
    // instead of propagating an unchecked exception
    return Charset.defaultCharset();
  }
}
 
Example #17
Source File: ICUCharsetDetectorWrapper.java    From dkpro-c4corpus with Apache License 2.0 5 votes vote down vote up
/**
 * Detects the charset of the given bytes with the shared ICU4J detector.
 * <p>
 * At most {@code SUFFICIENT_BYTE_ARRAY_SIZE} leading bytes are analysed.
 *
 * @param bytes content to analyse
 * @param declaredCharset charset declared for the content, passed to the
 *        detector as a hint; may be {@code null}
 * @return the detected charset, or {@code FALLBACK_CHARSET} when nothing is
 *         detected or the detected name is not usable on this JVM
 */
@Override
public Charset detectCharset(byte[] bytes, String declaredCharset)
{
    // truncate the input handed to the detector
    byte[] sample = bytes.length <= SUFFICIENT_BYTE_ARRAY_SIZE
            ? bytes
            : Arrays.copyOf(bytes, SUFFICIENT_BYTE_ARRAY_SIZE);
    charsetDetector.setText(sample);

    // NOTE(review): the detector is shared between calls and the hint is only
    // set when non-null; confirm a hint from an earlier call cannot leak into
    // this one.
    if (declaredCharset != null) {
        charsetDetector.setDeclaredEncoding(declaredCharset);
    }

    CharsetMatch charsetMatch = charsetDetector.detect();
    if (charsetMatch == null) {
        return FALLBACK_CHARSET;
    }

    try {
        return Charset.forName(charsetMatch.getName());
    }
    catch (IllegalArgumentException ex) {
        // covers UnsupportedCharsetException and IllegalCharsetNameException;
        // the latter previously escaped the fallback
        return FALLBACK_CHARSET;
    }
}
 
Example #18
Source File: FileUtil.java    From SDA with BSD 2-Clause "Simplified" License 5 votes vote down vote up
/**
 * Checks the character set of a byte array.
 *
 * @param input bytes to inspect
 * @return CharsetMatch reported by the detector
 */
public static CharsetMatch checkCharset(byte[] input) {
	// one throw-away detector per call; setText() returns the detector itself
	return new CharsetDetector().setText(input).detect();
}
 
Example #19
Source File: FileUtil.java    From SDA with BSD 2-Clause "Simplified" License 5 votes vote down vote up
/**
 * Runs ICU4J charset detection over a byte array.
 *
 * @param input bytes to inspect
 * @return the match reported by {@code CharsetDetector.detect()}
 */
public static CharsetMatch checkCharset(byte[] input) {
	// a fresh detector per call; setText() hands the detector back for chaining
	return new CharsetDetector().setText(input).detect();
}
 
Example #20
Source File: DetectCharsetStrategy.java    From obevo with Apache License 2.0 5 votes vote down vote up
/**
 * Determines the charset of the given bytes with ICU4J.
 *
 * @param bytes content to analyse
 * @return the detected charset, or {@code null} when the bytes cannot be
 *         read, nothing is detected, or the detected name is not usable on
 *         this JVM
 */
@Override
public Charset determineCharset(byte[] bytes) {
    try (ByteArrayInputStream input = new ByteArrayInputStream(bytes)) {
        CharsetDetector cd = new CharsetDetector();
        cd.setText(input);
        CharsetMatch cm = cd.detect();
        // the detector may find no match; report that like a read failure
        if (cm == null) {
            return null;
        }
        try {
            return Charset.forName(cm.getName());
        } catch (IllegalArgumentException e) {
            // IllegalCharsetNameException / UnsupportedCharsetException:
            // the name is not usable on this JVM
            return null;
        }
    } catch (IOException e) {
        return null;
    }
}
 
Example #21
Source File: CharSet.java    From knife with MIT License 4 votes vote down vote up
/**
 * Determines the charset name of an HTTP response.
 * <p>
 * Sources are tried in this order: the {@code charset=} part of the
 * Content-Type header, a {@code charset=...>} declaration in the body, and
 * finally ICU4J detection on the raw bytes. The name is lower-cased and
 * trimmed, and mapped onto a common charset name when it contains one.
 *
 * @param response raw HTTP response bytes
 * @return the lower-cased charset name, or {@code null} when neither the
 *         header, the body nor ICU4J yields one
 */
public static String getResponseCharset(byte[] response){
	Getter getter = new Getter(BurpExtender.callbacks.getHelpers());
	String contentType = getter.getHeaderValueOf(false,response,"Content-Type");
	String body = new String(getter.getBody(false,response));
	String tmpcharSet = null;

	if (contentType != null){// 1. try the Content-Type header
		String[] parts = contentType.toLowerCase().split("charset=");
		// split() drops trailing empty strings, so a header that ends with
		// "charset=" yields a single element; indexing [1] blindly used to
		// throw ArrayIndexOutOfBoundsException
		if (parts.length > 1 && !parts[1].trim().isEmpty()) {
			tmpcharSet = parts[1];
		}
	}

	if (tmpcharSet == null){// 2. try a declaration inside the body
		Pattern pCharsetDeclaration = Pattern.compile("charset=(.*?)>");
		Matcher matcher = pCharsetDeclaration.matcher(body);
		if (matcher.find()) {
			// group(1) is the declared value; group(0) would also carry the
			// "charset=" prefix and the closing '>'. Quotes and a
			// self-closing '/' are markup, not part of the name.
			String declared = matcher.group(1).toLowerCase()
					.replace("\"", "")
					.replace("'", "")
					.replace("/", "")
					.trim();
			if (!declared.isEmpty()) {
				tmpcharSet = declared;
			}
		}
	}

	if (tmpcharSet == null){// 3. fall back to ICU4J detection
		CharsetDetector detector = new CharsetDetector();
		detector.setText(response);
		CharsetMatch cm = detector.detect();
		// the detector may find no match; report "unknown" instead of
		// failing with a NullPointerException
		if (cm == null) {
			return null;
		}
		tmpcharSet = cm.getName();
	}

	tmpcharSet = tmpcharSet.toLowerCase().trim();
	if (tmpcharSet.contains("utf8")){
		tmpcharSet = "utf-8";
	}else {
		// Common encodings include ASCII, ANSI, GBK, GB2312, UTF-8, GB18030 and UNICODE.
		List<String> commonCharSet = Arrays.asList("ASCII,ANSI,GBK,GB2312,UTF-8,GB18030,UNICODE,ISO-8859-1".toLowerCase().split(","));
		for (String item:commonCharSet) {
			// no break: when several names are contained, the last one in the list wins
			if (tmpcharSet.contains(item)) {
				tmpcharSet = item;
			}
		}
	}
	return tmpcharSet;
}