com.optimaize.langdetect.DetectedLanguage Java Examples

The following examples show how to use com.optimaize.langdetect.DetectedLanguage. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: AbstractOptimaizeFilter.java    From modernmt with Apache License 2.0 6 votes vote down vote up
/**
 * Guesses the language of the given text.
 *
 * <p>The candidate list comes from {@code getLanguageDetector()}; only its
 * first entry is considered (presumably the most probable one - the
 * detector's ordering is not visible here, confirm against the library).
 *
 * @param text           the text to analyse
 * @param largeText      when {@code true} the large-text factory is used;
 *                       otherwise a factory with only the URL text filter
 * @param minProbability threshold applied to the first candidate, but only
 *                       when more than one candidate was reported
 * @return the language of the first candidate's locale, or {@code null}
 *         when there is no candidate or the first candidate is below the
 *         threshold while other candidates exist
 */
protected String guessLanguage(CharSequence text, boolean largeText, float minProbability) {
    LanguageDetector languageDetector = getLanguageDetector();

    TextObjectFactory textObjectFactory = largeText
            ? CommonTextObjectFactories.forDetectingOnLargeText()
            : new TextObjectFactoryBuilder()
                    .withTextFilter(UrlTextFilter.getInstance())
                    .build();

    TextObject input = textObjectFactory.create().append(text);
    List<DetectedLanguage> candidates = languageDetector.getProbabilities(input);
    if (candidates.isEmpty()) {
        return null;
    }

    DetectedLanguage first = candidates.get(0);
    boolean belowThreshold = first.getProbability() < minProbability;
    boolean hasAlternatives = candidates.size() > 1;
    // A sole candidate is accepted regardless of its probability.
    if (belowThreshold && hasAlternatives) {
        return null;
    }

    return first.getLocale().getLanguage();
}
 
Example #2
Source File: CommandLineInterface.java    From jstarcraft-nlp with Apache License 2.0 5 votes vote down vote up
/**
 * Language detection test for each file (--detectlang option).
 *
 * <pre>
 * usage: --detectlang -d [profile directory] -a [alpha] -s [seed] [test file(s)]
 * </pre>
 *
 * Each entry of {@code arglist} is opened as a file, decoded as "utf-8",
 * and the detector's probabilities are printed to standard output as
 * {@code filename:probabilities}.
 *
 * @throws IOException declared by the method; propagated from opening or
 *                     reading a file
 */
public void detectLang() throws IOException {
    LanguageDetector detector = makeDetector();
    TextObjectFactory factory = CommonTextObjectFactories.forDetectingOnLargeText();

    for (String path : arglist) {
        // try-with-resources closes the reader (and the streams it wraps) per file.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(path), "utf-8"))) {
            TextObject content = factory.create().append(reader);
            List<DetectedLanguage> detected = detector.getProbabilities(content);
            String report = path + ":" + detected;
            System.out.println(report);
        }
    }
}
 
Example #3
Source File: DetectedLanguageTest.java    From jstarcraft-nlp with Apache License 2.0 5 votes vote down vote up
/**
 * Checks the accessors and {@code toString()} of a {@code DetectedLanguage}
 * built for locale "en" with probability 1.0.
 */
@Test
public final void basic() {
    DetectedLanguage lang = new DetectedLanguage(Locale.forLanguageTag("en"), 1.0);
    // assertEquals takes (expected, actual); keeping that order makes
    // failure messages report the right values.
    assertEquals("en", lang.getLocale().getLanguage());
    assertEquals(1.0, lang.getProbability(), 0.0001);
    assertEquals("DetectedLanguage[en:1.0]", lang.toString());
}
 
Example #4
Source File: DetectedLanguageTest.java    From jstarcraft-nlp with Apache License 2.0 5 votes vote down vote up
/**
 * Checks the natural ordering of {@code DetectedLanguage} via
 * {@link Collections#sort(List)}: the two entries with probability 1.0 come
 * first in alphabetical order, and the 0.9 entry comes last.
 */
@Test
public final void comparable() {
    List<DetectedLanguage> list = new ArrayList<>();
    list.add(new DetectedLanguage(Locale.forLanguageTag("en"), 1.0));
    list.add(new DetectedLanguage(Locale.forLanguageTag("de"), 1.0));
    list.add(new DetectedLanguage(Locale.forLanguageTag("fr"), 0.9));
    Collections.sort(list);
    // assertEquals takes (expected, actual); keeping that order makes
    // failure messages report the right values.
    assertEquals("de", list.get(0).getLocale().getLanguage()); // equal probability: "de" sorts before "en"
    assertEquals("en", list.get(1).getLocale().getLanguage());
    assertEquals("fr", list.get(2).getLocale().getLanguage()); // probability 0.9 sorts last
}
 
Example #5
Source File: CommandLineInterface.java    From language-detector with Apache License 2.0 5 votes vote down vote up
/**
 * Language detection test for each file (--detectlang option).
 *
 * <pre>
 * usage: --detectlang -d [profile directory] -a [alpha] -s [seed] [test file(s)]
 * </pre>
 *
 * For every file name in {@code arglist}, the file is read with the "utf-8"
 * charset and one line of the form {@code filename:probabilities} is
 * written to standard output.
 *
 * @throws IOException declared by the method; propagated from opening or
 *                     reading a file
 */
public void detectLang() throws IOException {
    LanguageDetector detector = makeDetector();
    TextObjectFactory largeTextFactory = CommonTextObjectFactories.forDetectingOnLargeText();

    for (String fileName : arglist) {
        // The reader is scoped to a single file and closed before the next one is opened.
        try (BufferedReader input = new BufferedReader(
                new InputStreamReader(new FileInputStream(fileName), "utf-8"))) {
            TextObject fileText = largeTextFactory.create().append(input);
            List<DetectedLanguage> ranked = detector.getProbabilities(fileText);
            String line = fileName + ":" + ranked;
            System.out.println(line);
        }
    }
}
 
Example #6
Source File: DetectedLanguageTest.java    From language-detector with Apache License 2.0 5 votes vote down vote up
/**
 * Checks the accessors and {@code toString()} of a {@code DetectedLanguage}
 * built from {@code LdLocale.fromString("en")} with probability 1.0.
 */
@Test
public final void basic() {
    DetectedLanguage lang = new DetectedLanguage(LdLocale.fromString("en"), 1.0);
    // assertEquals takes (expected, actual); keeping that order makes
    // failure messages report the right values.
    assertEquals("en", lang.getLocale().getLanguage());
    assertEquals(1.0, lang.getProbability(), 0.0001);
    assertEquals("DetectedLanguage[en:1.0]", lang.toString());
}
 
Example #7
Source File: DetectedLanguageTest.java    From language-detector with Apache License 2.0 5 votes vote down vote up
/**
 * Checks the natural ordering of {@code DetectedLanguage} via
 * {@link Collections#sort(List)}: the two entries with probability 1.0 come
 * first in alphabetical order, and the 0.9 entry comes last.
 */
@Test
public final void comparable() {
    List<DetectedLanguage> list = new ArrayList<>();
    list.add(new DetectedLanguage(LdLocale.fromString("en"), 1.0));
    list.add(new DetectedLanguage(LdLocale.fromString("de"), 1.0));
    list.add(new DetectedLanguage(LdLocale.fromString("fr"), 0.9));
    Collections.sort(list);
    // assertEquals takes (expected, actual); keeping that order makes
    // failure messages report the right values.
    assertEquals("de", list.get(0).getLocale().getLanguage()); // equal probability: "de" sorts before "en"
    assertEquals("en", list.get(1).getLocale().getLanguage());
    assertEquals("fr", list.get(2).getLocale().getLanguage()); // probability 0.9 sorts last
}
 
Example #8
Source File: DetectedLanguageTest.java    From jstarcraft-nlp with Apache License 2.0 4 votes vote down vote up
/**
 * Expects the {@code DetectedLanguage} constructor to throw
 * {@link IllegalArgumentException} when given a probability of 1.1.
 */
@Test
public final void invalidProbability() {
    final Locale english = Locale.forLanguageTag("en");
    final double aboveOne = 1.1;
    Assertions.assertThrows(IllegalArgumentException.class, () -> {
        new DetectedLanguage(english, aboveOne);
    });
}
 
Example #9
Source File: DetectedLanguageTest.java    From language-detector with Apache License 2.0 4 votes vote down vote up
/**
 * Expects an {@link IllegalArgumentException} when a {@code DetectedLanguage}
 * is constructed with a probability of 1.1.
 */
@Test(expected = IllegalArgumentException.class)
public final void invalidProbability() {
    final LdLocale english = LdLocale.fromString("en");
    final double aboveOne = 1.1;
    new DetectedLanguage(english, aboveOne);
}
 
Example #10
Source File: LanguageID.java    From storm-crawler with Apache License 2.0 4 votes vote down vote up
/**
 * Records language information for the document at {@code url} under the
 * metadata key {@code mdKey}.
 *
 * <p>If the metadata already holds a non-blank value of at least two
 * characters under {@code extractedKeyName}, its first two characters are
 * lower-cased and stored, and detection is skipped. Otherwise the parsed
 * text (cut to {@code maxTextLength} characters) is passed to the language
 * detector, and every reported language whose probability is at least
 * {@code minProb} is added to the metadata.
 *
 * @param url     key used to look up the document in {@code parse}
 * @param content not read by this method
 * @param doc     not read by this method
 * @param parse   parse result whose metadata and text are read and updated
 */
@Override
public void filter(String url, byte[] content, DocumentFragment doc,
        ParseResult parse) {

    // Prefer a language value already present in the metadata: normalise it
    // to a two-character lower-case code and stop.
    Metadata metadata = parse.get(url).getMetadata();
    String declared = metadata.getFirstValue(extractedKeyName);
    boolean hasDeclaredLang = StringUtils.isNotBlank(declared)
            && declared.length() > 1;
    if (hasDeclaredLang) {
        String normalised = declared.substring(0, 2)
                .toLowerCase(Locale.ENGLISH);
        LOG.info("Lang: {} extracted from page for {}", normalised,
                url);
        metadata.setValue(mdKey, normalised);
        return;
    }

    String pageText = parse.get(url).getText();
    if (StringUtils.isBlank(pageText)) {
        return;
    }

    // Only the leading maxTextLength characters are handed to the detector.
    String sample = pageText.length() > maxTextLength
            ? pageText.substring(0, maxTextLength)
            : pageText;

    TextObject textObject = textObjectFactory.forText(sample);
    // Detection and the resulting metadata writes happen while holding the
    // detector's monitor.
    synchronized (languageDetector) {
        List<DetectedLanguage> candidates = languageDetector
                .getProbabilities(textObject);
        if (candidates != null && !candidates.isEmpty()) {
            for (DetectedLanguage candidate : candidates) {
                if (candidate.getProbability() >= minProb) {
                    String languageCode = candidate.getLocale().getLanguage();
                    parse.get(url).getMetadata().addValue(mdKey, languageCode);
                }
            }
        }
    }
}