com.cybozu.labs.langdetect.Detector Java Examples
The following examples show how to use
com.cybozu.labs.langdetect.Detector.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: AbstractQParser.java From SearchServices with GNU Lesser General Public License v3.0 | 6 votes |
private List<DetectedLanguage> detectLanguage(String content) { if (content.trim().length() == 0) { // to be consistent with the tika impl? log.debug("No input text to detect language from, returning empty list"); return Collections.emptyList(); } try { Detector detector = DetectorFactory.create(); detector.append(content); ArrayList<Language> langlist = detector.getProbabilities(); ArrayList<DetectedLanguage> solrLangList = new ArrayList<>(); for (Language l: langlist) { if((autoDetectQueryLocales.size() == 0) || (autoDetectQueryLocales.contains(l.lang))) { solrLangList.add(new DetectedLanguage(l.lang, l.prob)); } } return solrLangList; } catch (LangDetectException e) { log.debug("Could not determine language, returning empty list: ", e); return Collections.emptyList(); } }
Example #2
Source File: LanguageDetectionService.java From mojito with Apache License 2.0 | 6 votes |
/**
 * Gets a customized detector for a given language.
 *
 * <p>The detector's prior map gets one entry per supported language: {@code 0.8} for the
 * requested language, {@code 0.5} for {@code "en"} when it is not the requested language, and
 * {@code 0.1} for every other language.
 *
 * TODO(P1) Adding priority on the language seems to be relatively useless.
 * To be reviewed.
 *
 * @param language the language code that should receive the highest priority
 * @return a {@link Detector} customized for that language
 * @throws LangDetectException if the detector cannot be created or rejects the prior map
 */
private Detector getDetectorForLanguage(String language) throws LangDetectException {
    Detector detector = DetectorFactory.create();

    // Diamond operator instead of the raw HashMap type, which triggered an unchecked warning.
    HashMap<String, Double> priorityMap = new HashMap<>();

    for (String supportedLanguage : getSupportedLanguages()) {
        double priority;
        if (supportedLanguage.equals(language)) {
            priority = 0.8;
        } else if ("en".equals(supportedLanguage)) {
            // Reaching this branch already implies the requested language is not "en",
            // so no additional check on `language` is needed.
            priority = 0.5;
        } else {
            priority = 0.1;
        }
        priorityMap.put(supportedLanguage, priority);
    }

    detector.setPriorMap(priorityMap);
    return detector;
}
Example #3
Source File: LangDetectProcessor.java From elasticsearch-ingest-langdetect with Apache License 2.0 | 6 votes |
/**
 * Detects the language of the configured source field and stores it in the target field.
 *
 * <p>The document is returned untouched when the source field cannot be read and
 * {@code ignoreMissing} is set, or when the field value is empty.
 *
 * @param ingestDocument the document being processed
 * @return the same document instance, possibly with {@code targetField} set
 * @throws Exception if the field cannot be read and {@code ignoreMissing} is not set, or if
 *     language detection fails
 */
@Override
public IngestDocument execute(IngestDocument ingestDocument) throws Exception {
    final Detector languageDetector = DetectorFactory.create();
    languageDetector.setMaxTextLength(maxLength.bytesAsInt());

    final String fieldText;
    try {
        fieldText = ingestDocument.getFieldValue(field, String.class);
    } catch (IllegalArgumentException e) {
        if (!ignoreMissing) {
            throw e;
        }
        return ingestDocument;
    }

    if (!Strings.isEmpty(fieldText)) {
        languageDetector.append(fieldText);
        ingestDocument.setFieldValue(targetField, languageDetector.detect());
    }
    return ingestDocument;
}
Example #4
Source File: LanguageDetector.java From Asqatasun with GNU Affero General Public License v3.0 | 6 votes |
/** * Perform the detection * * @param text to test * @return the detected language */ public LanguageDetectionResult detectLanguage(String text) { try { Detector detector = DetectorFactory.create(0.15); // issue#47 correction detector.append(text.toLowerCase()); ArrayList<Language> languages = detector.getProbabilities(); Language detectedLanguage = extractLangWithHighestProbability(languages); return new LanguageDetectionResult(detectedLanguage, text, languages.size()>1); } catch (LangDetectException ex) { LOGGER.warn(ex); } return null; }
Example #5
Source File: DetectionServiceImplLanguageDetection.java From weslang with Apache License 2.0 | 6 votes |
@Override public DetectionResult detect(String text) { Detector detector; try { detector = DetectorFactory.create(); } catch (LangDetectException e) { // TODO(skreft): log the reason return UNKNOWN; } detector.append(text); List<Language> results = detector.getProbabilities(); if (!results.isEmpty()) { Language bestLang = results.get(0); return new DetectionResult(bestLang.lang, bestLang.prob); } return UNKNOWN; }
Example #6
Source File: CybozuLanguageIdentifier.java From dkpro-c4corpus with Apache License 2.0 | 5 votes |
@Override public String identifyLanguage(String html) throws IOException { // extracting plain html text Document doc = Jsoup.parse(html); String text = doc.text(); // we might have removed everything -> no lang if (text.isEmpty()) { return UNKNOWN_LANGUAGE; } try { Detector detector = DetectorFactory.create(); detector.append(text); String detectedLang = detector.detect(); ArrayList<Language> detectedProbabilities = detector.getProbabilities(); if (detectedProbabilities.get(0).prob > PROBABILITY_THRESHOLD) { return detectedLang; } else { return UNKNOWN_LANGUAGE; } } catch (LangDetectException e) { return UNKNOWN_LANGUAGE; } }
Example #7
Source File: LangDetection.java From ache with Apache License 2.0 | 5 votes |
/**
 * Checks whether English is among the languages detected for the given text.
 *
 * <p>Any failure during detection is logged as a warning and treated as "not English".
 *
 * @param content the text to analyse; may be {@code null}
 * @return {@code true} if any language reported by the detector has the code {@code "en"};
 *     {@code false} for {@code null} or empty input, when no such language is reported, or
 *     when detection throws
 */
public Boolean isEnglish(String content) {
    if (content == null || content.isEmpty()) {
        return false;
    }

    try {
        Detector detector = DetectorFactory.create();
        detector.append(content);

        // An empty result list simply falls through the loop to the final `false`.
        ArrayList<Language> candidates = detector.getProbabilities();
        for (Language candidate : candidates) {
            if (candidate.lang.equals("en")) {
                return true;
            }
        }
        return false;
    } catch (Exception ex) {
        logger.warn("Problem while detecting language in text: " + content, ex);
        return false;
    }
}
Example #8
Source File: Detector.java From weslang with Apache License 2.0 | 5 votes |
/**
 * Creates a detector that uses the word/language probability map and language list held by
 * the given factory.
 *
 * <p>Instances are normally obtained through {@link DetectorFactory#create()}. The text buffer
 * starts empty and {@code weight} is derived from the current {@code alpha} divided by
 * {@code BASE_FREQ}.
 *
 * @param factory {@link DetectorFactory} instance (only DetectorFactory inside)
 */
public Detector(DetectorFactory factory) {
    // State owned by this detector.
    text = new StringBuilder();
    weight = alpha / BASE_FREQ;

    // References taken from the factory.
    langlist = factory.langlist;
    wordLangProbMap = factory.wordLangProbMap;
}
Example #9
Source File: LanguageDetectionFilter.java From weslang with Apache License 2.0 | 5 votes |
/**
 * {@inheritDoc}
 *
 * <p>Adds a {@code "lang"} field to the document. The language stored in the parse metadata
 * under {@code Metadata.LANGUAGE} is used when present; otherwise it is detected from the page
 * title and text, limited by {@code textsize_upper_limit}. {@code "unknown"} is stored when no
 * language is available.
 *
 * @throws IndexingException if the filter has not been configured, if its initialization
 *     failed, or if language detection fails
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
        CrawlDatum datum, Inlinks inlinks) throws IndexingException {
    if (conf == null) {
        throw new IndexingException("Not Yet Initialization.");
    }
    if (cause != null) {
        throw new IndexingException("Initialization Failed.", cause);
    }

    String lang = parse.getData().getParseMeta().get(Metadata.LANGUAGE);
    if (lang == null) {
        // No language in the metadata: detect it from "<title> <text>".
        String sample = parse.getData().getTitle() + " " + parse.getText();
        try {
            Detector detector = DetectorFactory.create();
            detector.setMaxTextLength(textsize_upper_limit);
            detector.append(sample);
            lang = detector.detect();
        } catch (LangDetectException e) {
            throw new IndexingException("Detection failed.", e);
        }
    }

    doc.add("lang", lang != null ? lang : "unknown");
    return doc;
}
Example #10
Source File: Detector.java From language-detection with Apache License 2.0 | 5 votes |
/**
 * Creates a detector that uses the word/language probability map, language list and seed held
 * by the given factory.
 *
 * <p>Instances are normally obtained through {@link DetectorFactory#create()}. The text buffer
 * starts empty.
 *
 * @param factory {@link DetectorFactory} instance (only DetectorFactory inside)
 */
public Detector(DetectorFactory factory) {
    // State owned by this detector.
    text = new StringBuffer();

    // Values taken from the factory.
    seed = factory.seed;
    langlist = factory.langlist;
    wordLangProbMap = factory.wordLangProbMap;
}
Example #11
Source File: LangDetectTest.java From language-detection with Apache License 2.0 | 5 votes |
@Test public static void langDetectSample() { long startTime; String lang = "none"; ArrayList<Language> langlist = null; try { // Initialize startTime = System.currentTimeMillis(); DetectorFactory.create(); System.out.println("Initialization finished in " + (System.currentTimeMillis() - startTime) + " ms"); // Detect startTime = System.currentTimeMillis(); Detector detector = DetectorFactory.create(); detector.append("The quick brown fox jumps over the lazy dog."); lang = detector.detect(); System.out.println("Detection finished in " + (System.currentTimeMillis() - startTime) + " ms"); // Get probabilities langlist = detector.getProbabilities(); } catch (LangDetectException e) { System.err.println("Detection failed"); e.printStackTrace(); } System.out.println("Detected language: " + lang); for (Language s : langlist) { System.out.println(s); } }
Example #12
Source File: LanguageDetectionFilter.java From language-detection with Apache License 2.0 | 5 votes |
/**
 * {@inheritDoc}
 *
 * <p>Adds a {@code "lang"} field to the document. The language stored in the parse metadata
 * under {@code Metadata.LANGUAGE} is used when present; otherwise it is detected from the page
 * title and text, limited by {@code textsize_upper_limit}. {@code "unknown"} is stored when no
 * language is available.
 *
 * @throws IndexingException if the filter has not been configured, if its initialization
 *     failed, or if language detection fails
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
        CrawlDatum datum, Inlinks inlinks) throws IndexingException {
    if (conf == null) {
        throw new IndexingException("Not Yet Initialization.");
    }
    if (cause != null) {
        throw new IndexingException("Initialization Failed.", cause);
    }

    String lang = parse.getData().getParseMeta().get(Metadata.LANGUAGE);
    if (lang == null) {
        // No language in the metadata: detect it from "<title> <text>".
        String sample = parse.getData().getTitle() + " " + parse.getText();
        try {
            Detector detector = DetectorFactory.create();
            detector.setMaxTextLength(textsize_upper_limit);
            detector.append(sample);
            lang = detector.detect();
        } catch (LangDetectException e) {
            throw new IndexingException("Detection failed.", e);
        }
    }

    doc.add("lang", lang != null ? lang : "unknown");
    return doc;
}
Example #13
Source File: LanguageDetectionResult.java From mojito with Apache License 2.0 | 4 votes |
/**
 * Returns the detector held by this result.
 *
 * @return the current value of the {@code detector} field
 */
public Detector getDetector() {
    return this.detector;
}
Example #14
Source File: LanguageDetectionResult.java From mojito with Apache License 2.0 | 4 votes |
/**
 * Stores the detector on this result.
 *
 * @param detector the value to assign to the {@code detector} field
 */
public void setDetector(Detector detector) {
    this.detector = detector;
}
Example #15
Source File: LanguageDetectionAnnotator.java From bluima with Apache License 2.0 | 3 votes |
/**
 * Detects the language of the given text with a freshly created detector.
 *
 * <p>The detector is created with {@code DetectorFactory.create(0.5)}.
 * NOTE(review): 0.5 is presumably the detector's alpha (smoothing) parameter; confirm against
 * the library documentation.
 *
 * @param text the text to analyse
 * @return the language reported by {@link Detector#detect()}
 * @throws LangDetectException if the detector cannot be created or detection fails
 */
public static String detect(String text) throws LangDetectException {
    final Detector languageDetector = DetectorFactory.create(0.5);
    languageDetector.append(text);
    return languageDetector.detect();
}