Java Code Examples for com.cybozu.labs.langdetect.Detector#detect()

The following examples show how to use com.cybozu.labs.langdetect.Detector#detect(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. You can also check out the related API usage in the sidebar.
Example 1
Source File: LangDetectProcessor.java    From elasticsearch-ingest-langdetect with Apache License 2.0 6 votes vote down vote up
/**
 * Runs language detection on the configured source field and writes the
 * detected language into the configured target field.
 *
 * <p>The document is returned unchanged when the source value is empty, or
 * when reading the field raises {@link IllegalArgumentException} and
 * {@code ignoreMissing} is set.
 *
 * @param ingestDocument the document being processed
 * @return the same document instance, possibly with the target field set
 * @throws Exception if the detector cannot be created or detection fails, or
 *         the field lookup fails while {@code ignoreMissing} is off
 */
@Override
public IngestDocument execute(IngestDocument ingestDocument) throws Exception {
    Detector languageDetector = DetectorFactory.create();
    languageDetector.setMaxTextLength(maxLength.bytesAsInt());

    String text;
    try {
        text = ingestDocument.getFieldValue(field, String.class);
    } catch (IllegalArgumentException lookupFailure) {
        // Only tolerate the lookup failure when explicitly configured to do so.
        if (!ignoreMissing) {
            throw lookupFailure;
        }
        return ingestDocument;
    }

    if (!Strings.isEmpty(text)) {
        languageDetector.append(text);
        ingestDocument.setFieldValue(targetField, languageDetector.detect());
    }
    return ingestDocument;
}
 
Example 2
Source File: CybozuLanguageIdentifier.java    From dkpro-c4corpus with Apache License 2.0 5 votes vote down vote up
/**
 * Identifies the language of an HTML page from its plain text.
 *
 * <p>The markup is stripped with Jsoup and the remaining text is passed to
 * the language detector. The detected language is returned only when the
 * top-ranked candidate's probability exceeds {@code PROBABILITY_THRESHOLD}.
 *
 * @param html the HTML source to analyse
 * @return the detected language, or {@code UNKNOWN_LANGUAGE} when the page has
 *         no text, no candidate is reported, the best candidate is not
 *         probable enough, or detection fails
 * @throws IOException declared by the signature; not thrown by this body
 */
@Override
public String identifyLanguage(String html)
        throws IOException
{
    // extracting plain html text
    Document doc = Jsoup.parse(html);
    String text = doc.text();

    // we might have removed everything -> no lang
    if (text.isEmpty()) {
        return UNKNOWN_LANGUAGE;
    }

    try {
        Detector detector = DetectorFactory.create();
        detector.append(text);
        String detectedLang = detector.detect();

        ArrayList<Language> detectedProbabilities = detector.getProbabilities();

        // The candidate list can be empty when no language is ranked; without
        // this guard get(0) would throw IndexOutOfBoundsException, which the
        // LangDetectException handler below does not cover.
        if (detectedProbabilities.isEmpty()) {
            return UNKNOWN_LANGUAGE;
        }

        if (detectedProbabilities.get(0).prob > PROBABILITY_THRESHOLD) {
            return detectedLang;
        }
        else {
            return UNKNOWN_LANGUAGE;
        }
    }
    catch (LangDetectException e) {
        // Detection failure is treated as "language unknown" by design.
        return UNKNOWN_LANGUAGE;
    }
}
 
Example 3
Source File: LanguageDetectionFilter.java    From weslang with Apache License 2.0 5 votes vote down vote up
/**
 * {@inheritDoc}
 *
 * <p>Adds a {@code "lang"} field to the document. The value comes from the
 * parse metadata when present; otherwise it is detected from the page title
 * and text, falling back to {@code "unknown"}.
 *
 * @throws IndexingException if the filter is not configured, its
 *         initialization failed, or language detection fails
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
		CrawlDatum datum, Inlinks inlinks) throws IndexingException {
	if (conf == null) {
		throw new IndexingException("Not Yet Initialization.");
	}
	if (cause != null) {
		throw new IndexingException("Initialization Failed.", cause);
	}

	String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);
	if (language == null) {
		// No language in the metadata: detect it from title + body text.
		String content = parse.getData().getTitle() + " " + parse.getText();
		try {
			Detector langDetector = DetectorFactory.create();
			langDetector.setMaxTextLength(textsize_upper_limit);
			langDetector.append(content);
			language = langDetector.detect();
		} catch (LangDetectException detectionFailure) {
			throw new IndexingException("Detection failed.", detectionFailure);
		}
	}

	doc.add("lang", language == null ? "unknown" : language);
	return doc;
}
 
Example 4
Source File: LangDetectTest.java    From language-detection with Apache License 2.0 5 votes vote down vote up
/**
 * Sample run of the language detector: times factory initialization and a
 * single detection, then prints the detected language and the candidate
 * list returned by {@code getProbabilities()}.
 *
 * <p>If detection fails, the error is reported on stderr and the method
 * still prints the default language {@code "none"} with no candidates.
 *
 * <p>NOTE(review): this method is both {@code @Test} and {@code static};
 * some test frameworks reject static test methods — confirm against the
 * runner in use.
 */
@Test
public static void langDetectSample() {

    long startTime;
    String lang = "none";
    // Start with an empty list so the print loop below is safe even when
    // detection throws before the probabilities are assigned.
    ArrayList<Language> langlist = new ArrayList<Language>();

    try {

        // Initialize
        startTime = System.currentTimeMillis();
        DetectorFactory.create();
        System.out.println("Initialization finished in " + (System.currentTimeMillis() - startTime) + " ms");

        // Detect
        startTime = System.currentTimeMillis();
        Detector detector = DetectorFactory.create();
        detector.append("The quick brown fox jumps over the lazy dog.");
        lang = detector.detect();
        System.out.println("Detection finished in " + (System.currentTimeMillis() - startTime) + " ms");

        // Get probabilities
        langlist = detector.getProbabilities();

    } catch (LangDetectException e) {
        System.err.println("Detection failed");
        e.printStackTrace();
    }

    System.out.println("Detected language: " + lang);
    for (Language s : langlist) {
        System.out.println(s);
    }

}
 
Example 5
Source File: LanguageDetectionFilter.java    From language-detection with Apache License 2.0 5 votes vote down vote up
/**
 * {@inheritDoc}
 *
 * <p>Stores the document language under the {@code "lang"} field. A language
 * already present in the parse metadata is used as is; otherwise the title
 * and text of the page are run through the detector. When no language is
 * available the value {@code "unknown"} is stored.
 *
 * @throws IndexingException if the filter has no configuration, its
 *         initialization recorded a failure, or detection throws
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
		CrawlDatum datum, Inlinks inlinks) throws IndexingException {
	if (conf == null) {
		throw new IndexingException("Not Yet Initialization.");
	}
	if (cause != null) {
		throw new IndexingException("Initialization Failed.", cause);
	}

	String detected = parse.getData().getParseMeta().get(Metadata.LANGUAGE);
	if (detected == null) {
		// Build the detection input as "<title> <text>".
		StringBuilder buffer = new StringBuilder();
		buffer.append(parse.getData().getTitle());
		buffer.append(" ");
		buffer.append(parse.getText());
		try {
			Detector langDetector = DetectorFactory.create();
			langDetector.setMaxTextLength(textsize_upper_limit);
			langDetector.append(buffer.toString());
			detected = langDetector.detect();
		} catch (LangDetectException failure) {
			throw new IndexingException("Detection failed.", failure);
		}
	}
	if (detected == null) {
		detected = "unknown";
	}

	doc.add("lang", detected);
	return doc;
}
 
Example 6
Source File: LanguageDetectionAnnotator.java    From bluima with Apache License 2.0 3 votes vote down vote up
/**
 * Detects the language of the given text.
 *
 * <p>A fresh detector is created for every call via
 * {@code DetectorFactory.create(0.5)}.
 *
 * @param text the text to analyse
 * @return the value returned by the detector's {@code detect()} call
 * @throws LangDetectException if the detector cannot be created or
 *         detection fails
 */
public static String detect(String text) throws LangDetectException {
    final Detector langDetector = DetectorFactory.create(0.5);
    langDetector.append(text);
    final String language = langDetector.detect();
    return language;
}