Java Code Examples for org.apache.uima.jcas.JCas#getDocumentLanguage()
The following examples show how to use org.apache.uima.jcas.JCas#getDocumentLanguage().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: Document.java From ambiverse-nlu with Apache License 2.0 | 5 votes |
/**
 * Copies this document's settings into the given JCas as an {@code AidaDocumentSettings}
 * annotation and adds a {@code DocumentMetaData} annotation when
 * {@code exists(jcas, DocumentMetaData.class)} reports that none is present.
 *
 * <p>When a language is configured on this document it is written to both the settings
 * annotation and the JCas. A JCas language that is already set to something other than
 * {@code "x-unspecified"} must match the configured language.
 *
 * @param jcas the CAS that receives the settings
 * @throws IOException declared by the original signature; the visible code does not show
 *         which delegate raises it
 * @throws IllegalArgumentException if the JCas already carries a different language
 */
public void addSettingstoJcas(JCas jcas) throws IOException {
  AidaDocumentSettings ads = new AidaDocumentSettings(jcas);

  if (this.getLanguage() != null) {
    String configuredLanguage = this.getLanguage().toString();
    String casLanguage = jcas.getDocumentLanguage();
    boolean casLanguageIsSet = casLanguage != null && !casLanguage.equals("x-unspecified");
    if (casLanguageIsSet && !casLanguage.equals(configuredLanguage)) {
      throw new IllegalArgumentException("Language in JCas and language in settings are different");
    }
    ads.setLanguage(configuredLanguage);
    jcas.setDocumentLanguage(ads.getLanguage());
  }

  if (this.getDocChunkStrategy() != null) {
    ads.setDocChunkStrategy(this.getDocChunkStrategy().toString());
  }

  ads.setDocumentId(this.getDocumentId());

  // Guard on the value that is actually dereferenced. The previous check looked at the
  // freshly created annotation instead, so the configured input format was either never
  // copied or triggered a NullPointerException when it was absent on this document.
  if (this.getDocumentInputFormat() != null) {
    ads.setDocumentInputFormat(this.getDocumentInputFormat().toString());
  }

  ads.setEncoding(this.getEncoding());

  if (disambiguationSettings != null) {
    disambiguationSettings.addToJCas(ads, jcas);
  }
  ads.addToIndexes();

  if (annotations != null) {
    annotations.addMentionsToJCas(jcas);
  }

  if (!exists(jcas, DocumentMetaData.class)) {
    DocumentMetaData md = new DocumentMetaData(jcas);
    md.setDocumentId(ads.getDocumentId());
    md.addToIndexes();
  }
}
Example 2
Source File: ClassTypeProbabilityBmeow.java From ambiverse-nlu with Apache License 2.0 | 5 votes |
/**
 * Produces sixteen features for the given unit: four values each for the person,
 * organisation, location and miscellaneous distributions.
 *
 * <p>The lookup key is the value of the first covered {@code Lemma} when
 * {@code KnowNERLanguage.requiresLemma} is true for the document language, otherwise the
 * unit's covered text. If the key has no entry in the per-language table, every feature
 * is emitted with the value {@code 0.0}.
 *
 * @param jcas the CAS holding the unit
 * @param unit the classification target to describe
 * @return the extracted features
 * @throws TextClassificationException declared by the overridden method
 */
@Override
public Set<Feature> extract(JCas jcas, TextClassificationTarget unit) throws TextClassificationException {
    final Set<Feature> result = new HashSet<>();
    final String docLanguage = jcas.getDocumentLanguage();

    final String lookupKey;
    if (KnowNERLanguage.requiresLemma(docLanguage)) {
        lookupKey = JCasUtil.selectCovered(jcas, Lemma.class, unit).get(0).getValue();
    } else {
        lookupKey = unit.getCoveredText();
    }

    final Map<String, ClassProbabilityDistributionBmeow> perToken = classTypeProbabilities.get(docLanguage);
    final String namePrefix = FEATURE_NAME + "_";

    if (!perToken.containsKey(lookupKey)) {
        // Unknown key: emit the same sixteen feature names with a neutral value.
        for (int pos = 0; pos < 4; pos++) {
            result.add(new Feature(namePrefix + DIMENSIONS[pos], 0.0));
            result.add(new Feature(namePrefix + DIMENSIONS[pos + 4], 0.0));
            result.add(new Feature(namePrefix + DIMENSIONS[pos + 8], 0.0));
            result.add(new Feature(namePrefix + DIMENSIONS[pos + 12], 0.0));
        }
        return result;
    }

    final ClassProbabilityDistributionBmeow dist = perToken.get(lookupKey);
    for (int pos = 0; pos < 4; pos++) {
        result.add(new Feature(namePrefix + DIMENSIONS[pos], dist.getPers()[pos]));
        result.add(new Feature(namePrefix + DIMENSIONS[pos + 4], dist.getOrg()[pos]));
        result.add(new Feature(namePrefix + DIMENSIONS[pos + 8], dist.getLoc()[pos]));
        result.add(new Feature(namePrefix + DIMENSIONS[pos + 12], dist.getMisc()[pos]));
    }
    return result;
}
Example 3
Source File: BlueCasUtil.java From bluima with Apache License 2.0 | 5 votes |
/** * Whether this document should be kept for analysis, based on: * <ul> * <li>language == en</li> * <li>OOV < 0.4 (see {@link TooMuchOOVFilterAnnotator})</li> * <li>Enough tokens per page (see {@link TooFewTokensFilterAnnotator})</li> * </ul> */ public static boolean keepDoc(JCas jCas) { String lang = jCas.getDocumentLanguage(); if (lang.equals("x-unspecified")) { LOG.warn("document language needed to decide whether to keepDoc(), but document language is not set, pmId" + getHeaderDocId(jCas)); } if (!(lang.equals("en") || lang.equals("x-unspecified")) || // exists(jCas, TooFewTokensPerPage.class) || // exists(jCas, TooManyOOV.class)) { return false; } return true; }
Example 4
Source File: LanguageDetectionAnnotatorTest.java From bluima with Apache License 2.0 | 5 votes |
/**
 * Runs a {@link LanguageDetectionAnnotator}, configured with
 * {@code MIN_TEXT_LENGTH} set to 1, over a test CAS built from the given sentence and
 * returns the document language found on the CAS afterwards.
 *
 * @param testSentence the text to analyse
 * @return the document language of the processed CAS
 * @throws UIMAException if building or running the engine fails
 */
private Object getLang(String testSentence) throws UIMAException {
    final JCas cas = getTestCas(testSentence);
    final AnalysisEngine detector = createEngine(
            LanguageDetectionAnnotator.class,
            LanguageDetectionAnnotator.MIN_TEXT_LENGTH, 1);
    detector.process(cas);
    final String detectedLanguage = cas.getDocumentLanguage();
    return detectedLanguage;
}
Example 5
Source File: TextLineWriter.java From newsleak with GNU Affero General Public License v3.0 | 4 votes |
@Override public void process(JCas jcas) throws AnalysisEngineProcessException { String docText = jcas.getDocumentText(); // Language String outputText = jcas.getDocumentLanguage() + "\t"; // n sentencs Collection<Sentence> sentences = JCasUtil.selectCovered(jcas, Sentence.class, 0, jcas.getDocumentText().length()); outputText += sentences.size() + "\t"; // n tokens Collection<Token> tokens = JCasUtil.selectCovered(jcas, Token.class, 0, jcas.getDocumentText().length()); outputText += tokens.size() + "\t"; // pos String firstPOS = tokens.iterator().next().getPos(); outputText += firstPOS + "\t"; // text outputText += docText.replaceAll("\n", " "); // linewriter.append(outputText); Metadata metadata = (Metadata) jcas.getAnnotationIndex(Metadata.type).iterator().next(); langStats.put(metadata.getDocId(), jcas.getDocumentLanguage()); if (sampleIdHash.contains(metadata.getDocId())) { int i = 0; for (Sentence s : sentences) { i++; String sOut = metadata.getDocId() + "\t" + i + "\t"; String tOut = ""; for (Token t : JCasUtil.selectCovered(jcas, Token.class, s.getBegin(), s.getEnd())) { tOut += t.getCoveredText() + " "; } sOut += tOut.trim(); linewriter.append(sOut); } } }
Example 6
Source File: CpePipelineTest.java From uima-uimafit with Apache License 2.0 | 4 votes |
/**
 * Stores the document language of the processed CAS in {@code MARKER_SEEN}.
 *
 * @param jCas the CAS handed to this component
 * @throws AnalysisEngineProcessException declared by the overridden method
 */
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
    final String seenLanguage = jCas.getDocumentLanguage();
    MARKER_SEEN = seenLanguage;
}