com.cybozu.labs.langdetect.Language Java Examples

The following examples show how to use com.cybozu.labs.langdetect.Language. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: AbstractQParser.java    From SearchServices with GNU Lesser General Public License v3.0 6 votes vote down vote up
/**
 * Detects candidate languages for the given text.
 *
 * <p>Each candidate reported by the detector is kept only when
 * {@code autoDetectQueryLocales} is empty or contains the candidate's
 * language code.
 *
 * @param content the text to analyse; must not be {@code null}
 * @return the accepted candidates with their probabilities, or an empty list
 *         when the text is blank or detection fails
 */
private List<DetectedLanguage> detectLanguage(String content) {
	if (content.trim().isEmpty()) { // to be consistent with the tika impl?
		log.debug("No input text to detect language from, returning empty list");
		return Collections.emptyList();
	}

	try {
		Detector detector = DetectorFactory.create();
		detector.append(content);
		ArrayList<Language> candidates = detector.getProbabilities();
		ArrayList<DetectedLanguage> accepted = new ArrayList<>();
		for (Language candidate : candidates) {
			// Skip candidates outside the configured locale filter (an empty filter accepts all).
			if (autoDetectQueryLocales.size() != 0 && !autoDetectQueryLocales.contains(candidate.lang)) {
				continue;
			}
			accepted.add(new DetectedLanguage(candidate.lang, candidate.prob));
		}
		return accepted;
	} catch (LangDetectException e) {
		log.debug("Could not determine language, returning empty list: ", e);
		return Collections.emptyList();
	}
}
 
Example #2
Source File: LanguageDetector.java    From Asqatasun with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Perform the detection.
 *
 * <p>The text is lower-cased before being handed to the detector, and the
 * candidate with the highest probability is selected as the detected language.
 * The result is flagged as multi-language when the detector reports more than
 * one candidate.
 *
 * @param text to test
 * @return the detected language, or {@code null} when detection fails or the
 *         detector reports no candidate
 */
public LanguageDetectionResult detectLanguage(String text) {
    try {
        Detector detector = DetectorFactory.create(0.15);
        // issue#47 correction
        detector.append(text.toLowerCase());
        ArrayList<Language> languages = detector.getProbabilities();
        Language detectedLanguage =
                extractLangWithHighestProbability(languages);
        if (detectedLanguage == null) {
            // No candidate was reported: building a result from a null language
            // would fail with a NullPointerException, so report "no result" the
            // same way the failure path below does.
            return null;
        }
        return new LanguageDetectionResult(detectedLanguage, text, languages.size() > 1);
    } catch (LangDetectException ex) {
        LOGGER.warn(ex);
    }
    return null;
}
 
Example #3
Source File: DetectionServiceImplLanguageDetection.java    From weslang with Apache License 2.0 6 votes vote down vote up
/**
 * Detects the language of the given text.
 *
 * @param text the text to analyse
 * @return the first candidate reported by the detector together with its
 *         probability, or {@code UNKNOWN} when no detector could be created
 *         or no candidate was reported
 */
@Override
public DetectionResult detect(String text) {
  Detector detector;
  try {
    detector = DetectorFactory.create();
  } catch (LangDetectException e) {
    // TODO(skreft): log the reason
    return UNKNOWN;
  }

  detector.append(text);
  List<Language> candidates = detector.getProbabilities();
  if (candidates.isEmpty()) {
    return UNKNOWN;
  }

  Language top = candidates.get(0);
  return new DetectionResult(top.lang, top.prob);
}
 
Example #4
Source File: Detector.java    From language-detection with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the list of language candidates whose probability exceeds
 * {@code PROB_THRESHOLD}, ordered by descending probability.
 *
 * <p>Each accepted entry is inserted in front of the first existing entry
 * with a strictly lower probability, so candidates with equal probability
 * keep their original index order.
 *
 * @param prob probabilities, indexed in step with {@code langlist}
 * @return language candidates ordered by descending probability
 */
private ArrayList<Language> sortProbability(double[] prob) {
    ArrayList<Language> ranked = new ArrayList<Language>();
    for (int idx = 0; idx < prob.length; ++idx) {
        double p = prob[idx];
        if (!(p > PROB_THRESHOLD)) {
            continue;
        }
        // Walk past every entry that is not strictly smaller than p.
        int pos = 0;
        while (pos < ranked.size() && !(ranked.get(pos).prob < p)) {
            ++pos;
        }
        ranked.add(pos, new Language(langlist.get(idx), p));
    }
    return ranked;
}
 
Example #5
Source File: CybozuLanguageIdentifier.java    From dkpro-c4corpus with Apache License 2.0 5 votes vote down vote up
/**
 * Identifies the language of the text contained in an HTML document.
 *
 * <p>The HTML is parsed and reduced to its plain text, which is then passed
 * to the language detector. The detected language is returned only when the
 * first reported candidate has a probability above
 * {@code PROBABILITY_THRESHOLD}.
 *
 * @param html the HTML markup to analyse
 * @return the detected language, or {@code UNKNOWN_LANGUAGE} when the document
 *         has no text, no candidate is reported, the top candidate is not
 *         confident enough, or detection fails
 * @throws IOException declared by the overridden method
 */
@Override
public String identifyLanguage(String html)
        throws IOException
{
    // extracting plain html text
    Document doc = Jsoup.parse(html);
    String text = doc.text();

    // we might have removed everything -> no lang
    if (text.isEmpty()) {
        return UNKNOWN_LANGUAGE;
    }

    try {
        Detector detector = DetectorFactory.create();
        detector.append(text);
        String detectedLang = detector.detect();

        ArrayList<Language> detectedProbabilities = detector.getProbabilities();

        // An empty candidate list would make get(0) throw IndexOutOfBoundsException.
        if (detectedProbabilities.isEmpty()) {
            return UNKNOWN_LANGUAGE;
        }

        if (detectedProbabilities.get(0).prob > PROBABILITY_THRESHOLD) {
            return detectedLang;
        }
        else {
            return UNKNOWN_LANGUAGE;
        }
    }
    catch (LangDetectException e) {
        return UNKNOWN_LANGUAGE;
    }
}
 
Example #6
Source File: LanguageDetector.java    From Asqatasun with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Selects the candidate with the highest probability from the detector's
 * results.
 *
 * <p>When several candidates share the highest probability, the first one in
 * the list is kept.
 *
 * @param languages the candidates reported by the detector
 * @return the language with the highest probability, or {@code null} when the
 *         list is empty
 */
private Language extractLangWithHighestProbability(ArrayList<Language> languages) {
    Language winner = null;
    double highest = -1;
    for (Language candidate : languages) {
        boolean beatsCurrentBest = candidate.prob > highest;
        if (!beatsCurrentBest) {
            continue;
        }
        highest = candidate.prob;
        winner = candidate;
    }
    return winner;
}
 
Example #7
Source File: LanguageDetectionResult.java    From Asqatasun with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Creates a result from a detected language and the text it was detected in.
 *
 * <p>The language code and probability are copied from {@code language}, and
 * {@code computeNumberOfWords} is then invoked on the tested text.
 *
 * @param language the detected language; must not be {@code null}
 * @param testedText the text the detection was run on
 * @param isMultipleLanguage whether more than one language was reported
 */
public LanguageDetectionResult(Language language, String testedText, boolean isMultipleLanguage) {
    this.isMultipleLanguage = isMultipleLanguage;
    this.detectedLanguage = language.lang;
    this.probability = language.prob;
    computeNumberOfWords(testedText);
}
 
Example #8
Source File: LangDetection.java    From ache with Apache License 2.0 5 votes vote down vote up
/**
 * Checks whether English is among the languages detected in the given text.
 *
 * @param content the text to analyse; may be {@code null}
 * @return true if the detector reports English ("en") as one of the
 *         candidates; false otherwise, including for null or empty text and
 *         when detection fails
 */
public Boolean isEnglish(String content) {
    if (content == null || content.isEmpty()) {
        return false;
    }

    try {
        Detector detector = DetectorFactory.create();
        detector.append(content);
        ArrayList<Language> candidates = detector.getProbabilities();

        // An empty candidate list simply skips the loop and yields false.
        for (int i = 0; i < candidates.size(); i++) {
            Language candidate = candidates.get(i);
            if (candidate.lang.equals("en")) {
                return true;
            }
        }
        return false;
    } catch (Exception ex) {
        logger.warn("Problem while detecting language in text: " + content, ex);
        return false;
    }
}
 
Example #9
Source File: Detector.java    From weslang with Apache License 2.0 5 votes vote down vote up
/**
 * Detects the language of the target text.
 *
 * @return the language name of the first candidate returned by
 *         {@code getProbabilities()}, or {@code UNKNOWN_LANG} when there is
 *         no candidate
 */
public String detect() {
    List<Language> ranked = getProbabilities();
    return ranked.isEmpty() ? UNKNOWN_LANG : ranked.get(0).lang;
}
 
Example #10
Source File: Detector.java    From weslang with Apache License 2.0 5 votes vote down vote up
/**
 * Collects every language whose probability exceeds {@code PROB_THRESHOLD}
 * and sorts the result with {@code languageComparator}.
 *
 * @param prob probabilities, indexed in step with {@code langlist}
 * @return the language candidates above the threshold, in comparator order
 */
private List<Language> sortProbability(double[] prob) {
    List<Language> candidates = new ArrayList<Language>(prob.length);
    int index = 0;
    for (double p : prob) {
        if (p > PROB_THRESHOLD) {
            candidates.add(new Language(langlist.get(index), p));
        }
        ++index;
    }
    Collections.sort(candidates, languageComparator);

    return candidates;
}
 
Example #11
Source File: Detector.java    From language-detection with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the language candidates for the target text.
 *
 * <p>Detection is run first if no probabilities have been computed yet.
 *
 * @return the candidates produced by {@code sortProbability} for the current
 *         probabilities
 * @throws LangDetectException
 *  code = ErrorCode.CantDetectError : Can't detect because of no valid features in text
 */
public ArrayList<Language> getProbabilities() throws LangDetectException {
    if (langprob == null) {
        detectBlock();
    }
    return sortProbability(langprob);
}
 
Example #12
Source File: LangDetectTest.java    From language-detection with Apache License 2.0 5 votes vote down vote up
/**
 * Sample run of the detector: times detector creation and detection of a
 * fixed English sentence, then prints the detected language and every
 * reported candidate.
 *
 * <p>If detection fails, the failure is reported on standard error and the
 * method still prints the default language ("none") with no candidates.
 */
@Test
public static void langDetectSample() {

    long startTime;
    String lang = "none";
    // Start with an empty list so the reporting loop below is safe even when
    // an exception is thrown before the probabilities are fetched.
    ArrayList<Language> langlist = new ArrayList<Language>();

    try {

        // Initialize
        startTime = System.currentTimeMillis();
        DetectorFactory.create();
        System.out.println("Initialization finished in " + (System.currentTimeMillis() - startTime) + " ms");

        // Detect
        startTime = System.currentTimeMillis();
        Detector detector = DetectorFactory.create();
        detector.append("The quick brown fox jumps over the lazy dog.");
        lang = detector.detect();
        System.out.println("Detection finished in " + (System.currentTimeMillis() - startTime) + " ms");

        // Get probabilities
        langlist = detector.getProbabilities();

    } catch (LangDetectException e) {
        System.err.println("Detection failed");
        e.printStackTrace();
    }

    System.out.println("Detected language: " + lang);
    for (Language s : langlist) {
        System.out.println(s);
    }

}
 
Example #13
Source File: Detector.java    From weslang with Apache License 2.0 4 votes vote down vote up
/**
 * Orders languages by descending probability.
 *
 * @param o1 the first language
 * @param o2 the second language
 * @return the result of comparing {@code o2}'s probability with {@code o1}'s
 */
@Override
public int compare(Language o1, Language o2) {
    // The second argument is compared against the first to get decreasing order.
    final double later = o2.prob;
    final double earlier = o1.prob;
    return Double.compare(later, earlier);
}
 
Example #14
Source File: Detector.java    From weslang with Apache License 2.0 2 votes vote down vote up
/**
 * Returns the language candidates for the target text.
 *
 * <p>Detection is run first if no probabilities have been computed yet.
 *
 * @return the candidates produced by {@code sortProbability} for the current
 *         probabilities
 */
public List<Language> getProbabilities() {
    if (langprob == null) {
        detectBlock();
    }
    List<Language> ranked = sortProbability(langprob);
    return ranked;
}
 
Example #15
Source File: Detector.java    From language-detection with Apache License 2.0 2 votes vote down vote up
/**
 * Detects the language of the target text.
 *
 * @return the language name of the first candidate returned by
 *         {@code getProbabilities()}, or {@code UNKNOWN_LANG} when there is
 *         no candidate
 * @throws LangDetectException
 *  code = ErrorCode.CantDetectError : Can't detect because of no valid features in text
 */
public String detect() throws LangDetectException {
    ArrayList<Language> ranked = getProbabilities();
    if (ranked.isEmpty()) {
        return UNKNOWN_LANG;
    }
    return ranked.get(0).lang;
}