Java Code Examples for org.apache.uima.jcas.JCas#getDocumentLanguage()

The following examples show how to use org.apache.uima.jcas.JCas#getDocumentLanguage(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example 1
Source File: Document.java    From ambiverse-nlu with Apache License 2.0 5 votes vote down vote up
/**
 * Copies this document's settings into the given JCas as an {@code AidaDocumentSettings}
 * feature structure and adds it to the indexes.
 *
 * <p>When this document has a language, it is written to both the settings object and the
 * JCas. Disambiguation settings and mention annotations are added when present, and a
 * {@code DocumentMetaData} carrying the document id is created if the JCas has none.
 *
 * @param jcas the JCas to populate
 * @throws IOException declared by the signature; the visible body does not show which call
 *         raises it
 * @throws IllegalArgumentException if the JCas already carries a language other than
 *         {@code "x-unspecified"} that differs from this document's language
 */
public void addSettingstoJcas(JCas jcas) throws IOException {
  AidaDocumentSettings ads = new AidaDocumentSettings(jcas);
  if (this.getLanguage() != null) {
    String settingsLanguage = this.getLanguage().toString();
    String casLanguage = jcas.getDocumentLanguage();
    // A language already set on the JCas must agree with the one in the settings.
    if (casLanguage != null && !casLanguage.equals("x-unspecified") && !casLanguage.equals(settingsLanguage)) {
      throw new IllegalArgumentException("Language in JCas and language in settings are different");
    }
    ads.setLanguage(settingsLanguage);
    jcas.setDocumentLanguage(ads.getLanguage());
  }
  if (this.getDocChunkStrategy() != null) {
    ads.setDocChunkStrategy(this.getDocChunkStrategy().toString());
  }
  ads.setDocumentId(this.getDocumentId());
  // Guard on this document's own input format: the value being dereferenced below is
  // this.getDocumentInputFormat(), not the one on the freshly created settings object.
  if (this.getDocumentInputFormat() != null) {
    ads.setDocumentInputFormat(this.getDocumentInputFormat().toString());
  }
  ads.setEncoding(this.getEncoding());
  if (disambiguationSettings != null) {
    disambiguationSettings.addToJCas(ads, jcas);
  }
  ads.addToIndexes();
  if (annotations != null) {
    annotations.addMentionsToJCas(jcas);
  }
  if (!exists(jcas, DocumentMetaData.class)) {
    DocumentMetaData md = new DocumentMetaData(jcas);
    md.setDocumentId(ads.getDocumentId());
    md.addToIndexes();
  }
}
 
Example 2
Source File: ClassTypeProbabilityBmeow.java    From ambiverse-nlu with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the class-type probability features for one classification target.
 *
 * <p>The lookup key is the value of the first {@code Lemma} covered by the target when
 * {@code KnowNERLanguage.requiresLemma} says so for the document language, otherwise the
 * target's covered text. If the per-language table contains the key, the features carry the
 * person/organisation/location/misc values of its distribution; otherwise every feature
 * value is {@code 0.0}.
 *
 * @param jcas the JCas holding the document
 * @param unit the classification target to extract features for
 * @return the set of features added for the target
 * @throws TextClassificationException declared by the overridden method
 */
@Override
public Set<Feature> extract(JCas jcas, TextClassificationTarget unit) throws TextClassificationException {
	final Set<Feature> result = new HashSet<>();
	final String docLanguage = jcas.getDocumentLanguage();

	final String lookupKey;
	if (KnowNERLanguage.requiresLemma(docLanguage)) {
		lookupKey = JCasUtil.selectCovered(jcas, Lemma.class, unit).get(0).getValue();
	} else {
		lookupKey = unit.getCoveredText();
	}

	final Map<String, ClassProbabilityDistributionBmeow> byToken = classTypeProbabilities.get(docLanguage);

	// Unknown key: emit the same feature names with zero values.
	if (!byToken.containsKey(lookupKey)) {
		for (int k = 0; k < 4; k++) {
			result.add(new Feature(FEATURE_NAME + "_" + DIMENSIONS[k], 0.0));
			result.add(new Feature(FEATURE_NAME + "_" + DIMENSIONS[k + 4], 0.0));
			result.add(new Feature(FEATURE_NAME + "_" + DIMENSIONS[k + 8], 0.0));
			result.add(new Feature(FEATURE_NAME + "_" + DIMENSIONS[k + 12], 0.0));
		}
		return result;
	}

	final ClassProbabilityDistributionBmeow dist = byToken.get(lookupKey);
	for (int k = 0; k < 4; k++) {
		result.add(new Feature(FEATURE_NAME + "_" + DIMENSIONS[k], dist.getPers()[k]));
		result.add(new Feature(FEATURE_NAME + "_" + DIMENSIONS[k + 4], dist.getOrg()[k]));
		result.add(new Feature(FEATURE_NAME + "_" + DIMENSIONS[k + 8], dist.getLoc()[k]));
		result.add(new Feature(FEATURE_NAME + "_" + DIMENSIONS[k + 12], dist.getMisc()[k]));
	}
	return result;
}
 
Example 3
Source File: BlueCasUtil.java    From bluima with Apache License 2.0 5 votes vote down vote up
/**
 * Decides whether a document should be kept for analysis. A document is kept when:
 * <ul>
 * <li>its language is {@code en} or {@code x-unspecified} (the latter is logged as a
 * warning),</li>
 * <li>it carries no {@code TooManyOOV} annotation, i.e. OOV &lt; 0.4 (see
 * {@link TooMuchOOVFilterAnnotator}), and</li>
 * <li>it carries no {@code TooFewTokensPerPage} annotation, i.e. it has enough tokens per
 * page (see {@link TooFewTokensFilterAnnotator}).</li>
 * </ul>
 *
 * @param jCas the document to inspect
 * @return {@code true} if the document passes all checks, {@code false} otherwise
 */
public static boolean keepDoc(JCas jCas) {
    final String language = jCas.getDocumentLanguage();
    final boolean unspecified = language.equals("x-unspecified");
    if (unspecified) {
        LOG.warn("document language needed to decide whether to keepDoc(), but document language is not set, pmId"
                + getHeaderDocId(jCas));
    }

    // Reject on language first so the annotation lookups are skipped for foreign documents.
    if (!unspecified && !language.equals("en")) {
        return false;
    }
    return !exists(jCas, TooFewTokensPerPage.class) && !exists(jCas, TooManyOOV.class);
}
 
Example 4
Source File: LanguageDetectionAnnotatorTest.java    From bluima with Apache License 2.0 5 votes vote down vote up
/**
 * Runs a {@code LanguageDetectionAnnotator} engine, created with
 * {@code MIN_TEXT_LENGTH} set to 1, over a test CAS built from the given sentence.
 *
 * @param testSentence the text to build the test CAS from
 * @return the document language of the CAS after processing
 * @throws UIMAException declared for the CAS/engine creation and processing calls
 */
private Object getLang(String testSentence) throws UIMAException {
    final JCas cas = getTestCas(testSentence);
    createEngine(LanguageDetectionAnnotator.class, LanguageDetectionAnnotator.MIN_TEXT_LENGTH, 1)
            .process(cas);
    return cas.getDocumentLanguage();
}
 
Example 5
Source File: TextLineWriter.java    From newsleak with GNU Affero General Public License v3.0 4 votes vote down vote up
/**
 * Records the document language under the document id in {@code langStats} and, for
 * documents whose id is in {@code sampleIdHash}, appends one line per sentence to
 * {@code linewriter} in the form {@code docId <TAB> sentenceNumber <TAB> tokens}, where
 * tokens are the covered texts of the sentence's tokens separated by single spaces.
 *
 * <p>The per-document summary line (language, sentence/token counts, first POS, text) that
 * was previously assembled here was never written anywhere, and assembling it failed with
 * {@code NoSuchElementException} on documents without tokens; it is no longer computed.
 *
 * @param jcas the document to process; the first {@code Metadata} annotation in its index
 *        supplies the document id
 * @throws AnalysisEngineProcessException declared by the overridden method
 */
@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {

	Metadata metadata = (Metadata) jcas.getAnnotationIndex(Metadata.type).iterator().next();
	langStats.put(metadata.getDocId(), jcas.getDocumentLanguage());

	if (sampleIdHash.contains(metadata.getDocId())) {
		// Sentences are only needed for sampled documents, so select them here.
		Collection<Sentence> sentences = JCasUtil.selectCovered(jcas, Sentence.class, 0,
				jcas.getDocumentText().length());
		final String linePrefix = metadata.getDocId() + "\t";
		int sentenceNumber = 0;
		for (Sentence s : sentences) {
			sentenceNumber++;
			StringBuilder tokenText = new StringBuilder();
			for (Token t : JCasUtil.selectCovered(jcas, Token.class, s.getBegin(), s.getEnd())) {
				tokenText.append(t.getCoveredText()).append(' ');
			}
			// trim() drops the trailing separator added after the last token.
			linewriter.append(linePrefix + sentenceNumber + "\t" + tokenText.toString().trim());
		}
	}

}
 
Example 6
Source File: CpePipelineTest.java    From uima-uimafit with Apache License 2.0 4 votes vote down vote up
/**
 * Stores the document language of the processed CAS in {@code MARKER_SEEN}.
 *
 * @param jCas the CAS being processed
 * @throws AnalysisEngineProcessException declared by the overridden method
 */
@Override
public void process(JCas jCas)
	throws AnalysisEngineProcessException
{
	final String documentLanguage = jCas.getDocumentLanguage();
	MARKER_SEEN = documentLanguage;
}