com.optimaize.langdetect.DetectedLanguage Java Examples
The following examples show how to use
com.optimaize.langdetect.DetectedLanguage.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: AbstractOptimaizeFilter.java From modernmt with Apache License 2.0 | 6 votes |
/**
 * Guesses the language of {@code text} using the detector returned by
 * {@code getLanguageDetector()}.
 *
 * @param text           the text to analyse
 * @param largeText      when {@code true} the text object is built with the
 *                       large-text factory; otherwise a factory that only
 *                       applies the URL text filter is used
 * @param minProbability threshold compared against the probability of the
 *                       first candidate
 * @return the language of the first candidate's locale, or {@code null} when
 *         the detector yields no candidates, or when the first candidate is
 *         below {@code minProbability} and more than one candidate was returned
 */
protected String guessLanguage(CharSequence text, boolean largeText, float minProbability) {
    LanguageDetector languageDetector = getLanguageDetector();

    TextObjectFactory textObjectFactory = largeText
            ? CommonTextObjectFactories.forDetectingOnLargeText()
            : new TextObjectFactoryBuilder()
                    .withTextFilter(UrlTextFilter.getInstance())
                    .build();

    TextObject input = textObjectFactory.create().append(text);
    List<DetectedLanguage> candidates = languageDetector.getProbabilities(input);
    if (candidates.isEmpty()) {
        return null;
    }

    DetectedLanguage first = candidates.get(0);
    // A lone candidate is accepted even below the threshold; the threshold
    // only rejects the first candidate when alternatives exist.
    boolean hasAlternatives = candidates.size() > 1;
    if (first.getProbability() < minProbability && hasAlternatives) {
        return null;
    }
    return first.getLocale().getLanguage();
}
Example #2
Source File: CommandLineInterface.java From jstarcraft-nlp with Apache License 2.0 | 5 votes |
/**
 * Runs language detection on every file in {@code arglist} and prints the
 * result for each one (--detectlang option).
 *
 * <pre>
 * usage: --detectlang -d [profile directory] -a [alpha] -s [seed] [test file(s)]
 * </pre>
 *
 * @throws IOException if a file cannot be opened or read
 */
public void detectLang() throws IOException {
    LanguageDetector detector = makeDetector();
    TextObjectFactory factory = CommonTextObjectFactories.forDetectingOnLargeText();

    for (String filename : arglist) {
        // The reader is closed automatically once the file has been processed.
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(new FileInputStream(filename), "utf-8"))) {
            TextObject fileText = factory.create().append(reader);
            List<DetectedLanguage> detected = detector.getProbabilities(fileText);
            System.out.println(filename + ":" + detected);
        }
    }
}
Example #3
Source File: DetectedLanguageTest.java From jstarcraft-nlp with Apache License 2.0 | 5 votes |
/**
 * Checks the accessors and the string form of a {@code DetectedLanguage}
 * built for "en" with probability 1.0.
 */
@Test
public final void basic() {
    DetectedLanguage lang = new DetectedLanguage(Locale.forLanguageTag("en"), 1.0);

    // JUnit's assertEquals takes (expected, actual); keeping that order makes
    // failure messages report the right value as "expected".
    assertEquals("en", lang.getLocale().getLanguage());
    assertEquals(1.0, lang.getProbability(), 0.0001);
    assertEquals("DetectedLanguage[en:1.0]", lang.toString());
}
Example #4
Source File: DetectedLanguageTest.java From jstarcraft-nlp with Apache License 2.0 | 5 votes |
@Test public final void comparable() { List<DetectedLanguage> list = new ArrayList<>(); list.add(new DetectedLanguage(Locale.forLanguageTag("en"), 1.0)); list.add(new DetectedLanguage(Locale.forLanguageTag("de"), 1.0)); list.add(new DetectedLanguage(Locale.forLanguageTag("fr"), 0.9)); Collections.sort(list); assertEquals(list.get(0).getLocale().getLanguage(), "de"); // alphabetical de before en assertEquals(list.get(1).getLocale().getLanguage(), "en"); assertEquals(list.get(2).getLocale().getLanguage(), "fr"); // points 0.9 the last }
Example #5
Source File: CommandLineInterface.java From language-detector with Apache License 2.0 | 5 votes |
/**
 * Runs language detection on every file in {@code arglist} and prints the
 * result for each one (--detectlang option).
 *
 * <pre>
 * usage: --detectlang -d [profile directory] -a [alpha] -s [seed] [test file(s)]
 * </pre>
 *
 * @throws IOException if a file cannot be opened or read
 */
public void detectLang() throws IOException {
    LanguageDetector detector = makeDetector();
    TextObjectFactory factory = CommonTextObjectFactories.forDetectingOnLargeText();

    for (String filename : arglist) {
        // The reader is closed automatically once the file has been processed.
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(new FileInputStream(filename), "utf-8"))) {
            TextObject fileText = factory.create().append(reader);
            List<DetectedLanguage> detected = detector.getProbabilities(fileText);
            System.out.println(filename + ":" + detected);
        }
    }
}
Example #6
Source File: DetectedLanguageTest.java From language-detector with Apache License 2.0 | 5 votes |
/**
 * Checks the accessors and the string form of a {@code DetectedLanguage}
 * built for "en" with probability 1.0.
 */
@Test
public final void basic() {
    DetectedLanguage lang = new DetectedLanguage(LdLocale.fromString("en"), 1.0);

    // JUnit's assertEquals takes (expected, actual); keeping that order makes
    // failure messages report the right value as "expected".
    assertEquals("en", lang.getLocale().getLanguage());
    assertEquals(1.0, lang.getProbability(), 0.0001);
    assertEquals("DetectedLanguage[en:1.0]", lang.toString());
}
Example #7
Source File: DetectedLanguageTest.java From language-detector with Apache License 2.0 | 5 votes |
@Test public final void comparable() { List<DetectedLanguage> list = new ArrayList<>(); list.add(new DetectedLanguage(LdLocale.fromString("en"), 1.0)); list.add(new DetectedLanguage(LdLocale.fromString("de"), 1.0)); list.add(new DetectedLanguage(LdLocale.fromString("fr"), 0.9)); Collections.sort(list); assertEquals(list.get(0).getLocale().getLanguage(), "de"); //alphabetical de before en assertEquals(list.get(1).getLocale().getLanguage(), "en"); assertEquals(list.get(2).getLocale().getLanguage(), "fr"); //points 0.9 the last }
Example #8
Source File: DetectedLanguageTest.java From jstarcraft-nlp with Apache License 2.0 | 4 votes |
/**
 * Expects the {@code DetectedLanguage} constructor to throw
 * {@code IllegalArgumentException} when given the probability 1.1.
 */
@Test
public final void invalidProbability() {
    Assertions.assertThrows(
            IllegalArgumentException.class,
            () -> new DetectedLanguage(Locale.forLanguageTag("en"), 1.1));
}
Example #9
Source File: DetectedLanguageTest.java From language-detector with Apache License 2.0 | 4 votes |
/**
 * Expects an {@code IllegalArgumentException} when a {@code DetectedLanguage}
 * is constructed for "en" with the probability 1.1.
 */
@Test(expected = IllegalArgumentException.class)
public final void invalidProbability() {
    LdLocale english = LdLocale.fromString("en");
    new DetectedLanguage(english, 1.1);
}
Example #10
Source File: LanguageID.java From storm-crawler with Apache License 2.0 | 4 votes |
@Override public void filter(String url, byte[] content, DocumentFragment doc, ParseResult parse) { // check whether the metadata already contains a lang value // in which case we normalise its value and use it Metadata m = parse.get(url).getMetadata(); String extractedValue = m.getFirstValue(extractedKeyName); if (StringUtils.isNotBlank(extractedValue) && extractedValue.length() > 1) { extractedValue = extractedValue.substring(0, 2) .toLowerCase(Locale.ENGLISH); LOG.info("Lang: {} extracted from page for {}", extractedValue, url); m.setValue(mdKey, extractedValue); return; } String text = parse.get(url).getText(); if (StringUtils.isBlank(text)) { return; } if (text.length() > maxTextLength) { text = text.substring(0, maxTextLength); } TextObject textObject = textObjectFactory.forText(text); synchronized (languageDetector) { List<DetectedLanguage> probs = languageDetector .getProbabilities(textObject); if (probs == null || probs.size() == 0) { return; } for (DetectedLanguage lang : probs) { if (lang.getProbability() >= minProb) { String code = lang.getLocale().getLanguage(); parse.get(url).getMetadata().addValue(mdKey, code); } } } }