edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation Java Examples
The following examples show how to use
edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: CoreNlpTokenizer.java From jstarcraft-nlp with Apache License 2.0 | 5 votes |
@Override public boolean incrementToken() { clearAttributes(); while (tokens == null || !tokens.hasNext()) if (!getNextSentence()) return false; CoreLabel token = tokens.next(); // Use the lemmatized word: String word = token.get(LemmaAnnotation.class); if (word == null) { // Fallback when no lemmatization happens. word = token.get(TextAnnotation.class); } termAttribute.setLength(0); termAttribute.append(word); // NER or part of speech annotation String pos = token.get(NamedEntityTagAnnotation.class); pos = (pos == null || "O".equals(pos)) ? token.get(PartOfSpeechAnnotation.class) : pos; typeAttribute.setType(pos != null ? pos : TypeAttribute.DEFAULT_TYPE); // Token character offsets int be = token.get(CharacterOffsetBeginAnnotation.class).intValue(); int en = token.get(CharacterOffsetEndAnnotation.class).intValue(); offsetAttribute.setOffset(be, en); // Token in-document position increment: positionAttribute.setPositionIncrement(1 + skippedTokens); skippedTokens = 0; return true; }
Example #2
Source File: CoreNLPTokenizer.java From Heracles with GNU General Public License v3.0 | 5 votes |
/** * Process the Dataset in chunks, as defined by the <code>spanType</code> parameter. * The Spans denoted by spanType must provide an annotation of type "text". * This spanType does not have to be textual unit. */ @Override public void validatedProcess(Dataset dataset, String spanTypeOfTextUnit){ if (dataset.getPerformedNLPTasks().contains(getTask())){ Framework.error("This dataset has already been tokenized."); return; } Properties prop1 = new Properties(); prop1.setProperty("annotators", "tokenize"); //prop1.setProperty("options", "splitHyphenated=true"); StanfordCoreNLP pipeline = new StanfordCoreNLP(prop1, false); for (Span span : dataset.getSpans(spanTypeOfTextUnit)){ Span textualUnit = span.getTextualUnit(); String originalText = span.getAnnotation("text", String.class); Annotation a = new Annotation(originalText); pipeline.annotate(a); List<CoreLabel> tokenAnnotations = a.get(TokensAnnotation.class); Word previousWord = null; if (!textualUnit.isEmpty()) previousWord = textualUnit.last(); for (CoreLabel token : tokenAnnotations){ String word = token.get(OriginalTextAnnotation.class); int startOffset = token.get(CharacterOffsetBeginAnnotation.class); // int endOffset = token.get(CharacterOffsetEndAnnotation.class); // System.out.println(word + "\t" + startOffset + "\t" + endOffset); if (previousWord == null){ previousWord = new Word(word, startOffset, textualUnit, dataset); } else { previousWord = new Word(word, startOffset, previousWord); } //and add the new word to the sentence span. If span=textualSpan than this has no effect if (!textualUnit.equals(span)) span.add(previousWord); } } }
Example #3
Source File: CoreNLPNamedEntityRecognizer.java From Heracles with GNU General Public License v3.0 | 5 votes |
/** * Process the Dataset in chunks, as defined by the <code>spanType</code> parameter. * The Spans denoted by spanType must each contain Words belonging to a single sentence. * */ @Override public void validatedProcess(Dataset dataset, String spanTypeOfSentenceUnit){ Properties prop1 = new Properties(); prop1.setProperty("annotators", "ner"); StanfordCoreNLP pipeline = new StanfordCoreNLP(prop1, false); for (Span span : dataset.getSpans(spanTypeOfSentenceUnit)){ HashMap<Integer, Word> wordIndex = new HashMap<>(); Annotation a = CoreNLPHelper.reconstructStanfordAnnotations(span, wordIndex); if (a == null){ System.out.println(a); } pipeline.annotate(a); List<CoreMap> sentenceAnnotations = a.get(SentencesAnnotation.class); for (CoreMap sentence : sentenceAnnotations){ for (CoreLabel token: sentence.get(TokensAnnotation.class)) { Word w = wordIndex.get(token.get(CharacterOffsetBeginAnnotation.class)); String ner = token.get(NamedEntityTagAnnotation.class); String nerValue = token.get(NormalizedNamedEntityTagAnnotation.class); if (ner!=null) w.putAnnotation("nerLabel", ner); if (nerValue!=null) w.putAnnotation("nerValue", nerValue); // System.out.println(w.getAnnotations()); } } } }
Example #4
Source File: CoreNLPLemmatizer.java From Heracles with GNU General Public License v3.0 | 5 votes |
/** * Process the Dataset in chunks, as defined by the <code>spanType</code> parameter. * The Spans denoted by spanType must each contain Words belonging to a single sentence. * */ @Override public void validatedProcess(Dataset dataset, String spanTypeOfSentenceUnit){ Properties prop1 = new Properties(); prop1.setProperty("annotators", "lemma"); StanfordCoreNLP pipeline = new StanfordCoreNLP(prop1, false); for (Span span : dataset.getSpans(spanTypeOfSentenceUnit)){ HashMap<Integer, Word> wordIndex = new HashMap<>(); Annotation a = CoreNLPHelper.reconstructStanfordAnnotations(span, wordIndex); if (a == null){ System.out.println(a); } pipeline.annotate(a); List<CoreMap> sentenceAnnotations = a.get(SentencesAnnotation.class); for (CoreMap sentence : sentenceAnnotations){ for (CoreLabel token: sentence.get(TokensAnnotation.class)) { Word w = wordIndex.get(token.get(CharacterOffsetBeginAnnotation.class)); String tempLemma = token.get(LemmaAnnotation.class); w.putAnnotation("lemma", tempLemma.toLowerCase()); // System.out.println(w.getAnnotations()); } } } }
Example #5
Source File: CoreNLPHelper.java From Heracles with GNU General Public License v3.0 | 4 votes |
/**
 * Rebuilds a Stanford {@code Annotation} (one single sentence) from a sentence-level
 * Span of Words, so that downstream annotators (pos, lemma, ner, ...) can run on
 * previously processed data. Any "pos"/"lemma"/"nerLabel"/"nerValue" annotations
 * already present on the Words are copied onto the CoreLabels.
 *
 * Side effect: fills {@code wordIndex} so callers can map annotator output back
 * to Words — keyed by word order when {@code useWordOrderInsteadOfOffset} is
 * true, otherwise by start character offset.
 *
 * NOTE(review): assumes sentenceSpan is non-empty — first()/last() would fail
 * otherwise; confirm with callers.
 */
public static Annotation reconstructStanfordAnnotations(Span sentenceSpan, HashMap<Integer, Word> wordIndex, boolean useWordOrderInsteadOfOffset){
    String originalText = sentenceSpan.getAnnotation("text", String.class);
    Annotation a = new Annotation(originalText);
    a.set(TextAnnotation.class, originalText);
    //a.set(DocIDAnnotation.class, "document");

    // Document-level containers: one sentence list and one flat token list.
    List<CoreMap> sentenceAnnotations = new ArrayList<CoreMap>();
    a.set(SentencesAnnotation.class, sentenceAnnotations);
    List<CoreLabel> tokenAnnotations = new ArrayList<CoreLabel>();
    a.set(TokensAnnotation.class, tokenAnnotations);

    // The single reconstructed sentence.
    ArrayCoreMap sentenceAnnotation = new ArrayCoreMap();
    sentenceAnnotations.add(sentenceAnnotation);

    // int startOffset = sentenceSpan.first().getStartOffset();

    for (Word w : sentenceSpan){
        // One CoreLabel per Word, mirroring what the CoreNLP tokenizer would set.
        CoreLabel c = new CoreLabel();
        c.set(TextAnnotation.class, w.getWord());
        c.set(OriginalTextAnnotation.class, w.getWord());
        c.set(ValueAnnotation.class, w.getWord());
        c.set(CharacterOffsetBeginAnnotation.class, w.getStartOffset());
        c.set(CharacterOffsetEndAnnotation.class, w.getEndOffset());
        // CoreNLP token indices are 1-based, hence the +1.
        c.set(IndexAnnotation.class, w.getOrder()+1);
        // c.setIndex(w.getOrder());
        c.set(SentenceIndexAnnotation.class, 0);
        // c.setSentIndex(0);
        c.set(DocIDAnnotation.class, "document");
        c.setDocID("document");

        // Carry over any annotations produced by earlier pipeline components.
        if (w.hasAnnotation("pos"))
            c.set(PartOfSpeechAnnotation.class, w.getAnnotation("pos",String.class));
        if (w.hasAnnotation("lemma"))
            c.set(LemmaAnnotation.class, w.getAnnotation("lemma", String.class));
        if (w.hasAnnotation("nerLabel"))
            c.set(NamedEntityTagAnnotation.class, w.getAnnotation("nerLabel", String.class));
        if (w.hasAnnotation("nerValue"))
            c.set(NormalizedNamedEntityTagAnnotation.class, w.getAnnotation("nerValue", String.class));

        tokenAnnotations.add(c);

        // Index the Word so annotator output can be mapped back later.
        if (useWordOrderInsteadOfOffset){
            wordIndex.put(w.getOrder(), w);
        } else {
            wordIndex.put(w.getStartOffset(), w);
        }
    }

    //essential sentence annotation: TokensAnnotation
    sentenceAnnotation.set(TokensAnnotation.class, tokenAnnotations);
    //essential sentence annotation: TextAnnotation
    sentenceAnnotation.set(TextAnnotation.class, originalText);
    //essential sentence annotation: SentenceIndexAnnotation
    sentenceAnnotation.set(SentenceIndexAnnotation.class, 0);

    sentenceAnnotation.set(CharacterOffsetBeginAnnotation.class, 0);
    sentenceAnnotation.set(CharacterOffsetEndAnnotation.class, sentenceSpan.last().getEndOffset());
    sentenceAnnotation.set(TokenBeginAnnotation.class, 0);
    sentenceAnnotation.set(TokenEndAnnotation.class, sentenceSpan.last().getOrder());

    return a;
}
Example #6
Source File: CoreNLPPosTagger.java From Heracles with GNU General Public License v3.0 | 4 votes |
/** * Process the Dataset in chunks, as defined by the <code>spanType</code> parameter. * The Spans denoted by spanType must each contain Words belonging to a single sentence. * */ @Override public void validatedProcess(Dataset dataset, String spanTypeOfSentenceUnit){ // if (dataset.getPerformedNLPTasks().contains(getTask())){ // Framework.error("This dataset has already been tagged with POS."); // return; // } //check if prerequisites are satisfied if (!dataset.getPerformedNLPTasks().containsAll(prerequisites)){ HashSet<NLPTask> missingTasks = new HashSet<>(); missingTasks.addAll(prerequisites); missingTasks.removeAll(dataset.getPerformedNLPTasks()); Framework.error("This dataset does not meet the requirements to use this component! Missing tasks: " + missingTasks); return; } Properties prop1 = new Properties(); prop1.setProperty("annotators", "pos"); StanfordCoreNLP pipeline = new StanfordCoreNLP(prop1, false); for (Span span : dataset.getSpans(spanTypeOfSentenceUnit)){ HashMap<Integer, Word> wordIndex = new HashMap<>(); Annotation a = CoreNLPHelper.reconstructStanfordAnnotations(span, wordIndex); if (a == null){ System.out.println(a); } pipeline.annotate(a); List<CoreMap> sentenceAnnotations = a.get(SentencesAnnotation.class); for (CoreMap sentence : sentenceAnnotations){ for (CoreLabel token: sentence.get(TokensAnnotation.class)) { Word w = wordIndex.get(token.get(CharacterOffsetBeginAnnotation.class)); String tempPos = token.get(PartOfSpeechAnnotation.class); if (w.hasAnnotation("URI")){ w.putAnnotation("pos", "NNP"); } else { w.putAnnotation("pos", tempPos); } // System.out.println(w.getAnnotations()); } } } }
Example #7
Source File: JsonPipeline.java From tac2015-event-detection with GNU General Public License v3.0 | 4 votes |
@SuppressWarnings({ "rawtypes", "unchecked" }) static void addEntityMentions(Map<String,Object> sent_info, CoreMap sentence) { List<CoreMap> coreMentions = sentence.get(MentionsAnnotation.class); List<Map> jsonMentions = new ArrayList<>(); /* trying to figure out the keys in each mention. here's a printout from one. MENTION August 2014 class edu.stanford.nlp.ling.CoreAnnotations$TextAnnotation August 2014 class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetBeginAnnotation 3 class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetEndAnnotation 14 class edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation [August-2, 2014-3] class edu.stanford.nlp.ling.CoreAnnotations$TokenBeginAnnotation 1 class edu.stanford.nlp.ling.CoreAnnotations$TokenEndAnnotation 3 class edu.stanford.nlp.ling.CoreAnnotations$NamedEntityTagAnnotation DATE class edu.stanford.nlp.ling.CoreAnnotations$NormalizedNamedEntityTagAnnotation 2014-08 class edu.stanford.nlp.ling.CoreAnnotations$EntityTypeAnnotation DATE class edu.stanford.nlp.ling.CoreAnnotations$SentenceIndexAnnotation 0 class edu.stanford.nlp.time.TimeAnnotations$TimexAnnotation <TIMEX3 tid="t1" type="DATE" value="2014-08">August 2014</TIMEX3> MENTION Barack Obama class edu.stanford.nlp.ling.CoreAnnotations$TextAnnotation Barack Obama class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetBeginAnnotation 17 class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetEndAnnotation 29 class edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation [Barack-5, Obama-6] class edu.stanford.nlp.ling.CoreAnnotations$TokenBeginAnnotation 4 class edu.stanford.nlp.ling.CoreAnnotations$TokenEndAnnotation 6 class edu.stanford.nlp.ling.CoreAnnotations$NamedEntityTagAnnotation PERSON class edu.stanford.nlp.ling.CoreAnnotations$EntityTypeAnnotation PERSON class edu.stanford.nlp.ling.CoreAnnotations$SentenceIndexAnnotation 0 MENTION Paris class edu.stanford.nlp.ling.CoreAnnotations$TextAnnotation Paris class 
edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetBeginAnnotation 66 class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetEndAnnotation 71 class edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation [Paris-5] class edu.stanford.nlp.ling.CoreAnnotations$TokenBeginAnnotation 14 class edu.stanford.nlp.ling.CoreAnnotations$TokenEndAnnotation 15 class edu.stanford.nlp.ling.CoreAnnotations$NamedEntityTagAnnotation LOCATION class edu.stanford.nlp.ling.CoreAnnotations$EntityTypeAnnotation LOCATION class edu.stanford.nlp.ling.CoreAnnotations$SentenceIndexAnnotation 1 */ for (CoreMap mention : coreMentions) { // U.p("MENTION " + mention); // for (Class k : mention.keySet()) { // U.pf("%s\t%s\n", k, mention.get(k)); // } Map m = new HashMap<String, Object>(); m.put("tokspan", Lists.newArrayList( mention.get(TokenBeginAnnotation.class).intValue(), mention.get(TokenEndAnnotation.class).intValue())); m.put("charspan", Lists.newArrayList( mention.get(CharacterOffsetBeginAnnotation.class).intValue(), mention.get(CharacterOffsetEndAnnotation.class).intValue())); m.put("sentence", mention.get(SentenceIndexAnnotation.class).intValue()); String entityType = mention.get(EntityTypeAnnotation.class); m.put("type", entityType); if (mention.containsKey(NormalizedNamedEntityTagAnnotation.class)) { m.put("normalized", mention.get(NormalizedNamedEntityTagAnnotation.class)); } if (mention.containsKey(TimexAnnotation.class)) { m.put("timex_xml", mention.get(TimexAnnotation.class).toString()); } jsonMentions.add(m); } sent_info.put("entitymentions", jsonMentions); }
Example #8
Source File: CRFPostprocessorFeatureFactory.java From phrasal with GNU General Public License v3.0 | 4 votes |
/**
 * Builds the clique-C feature set for the character at position {@code loc}
 * in the padded character sequence: character-class features over a 5-character
 * window, token-level features for the current token, and left/right token
 * context features.
 *
 * NOTE(review): assumes PaddedList returns pad CoreLabels (with null
 * CharAnnotation) for out-of-range indices — confirm against PaddedList docs.
 */
protected Collection<String> featuresC(PaddedList<? extends CoreLabel> cInfo, int loc) {
  Collection<String> features = new ArrayList<>();
  // 5-character window centered on loc: p2, p, c, n, n2.
  CoreLabel c = cInfo.get(loc);
  CoreLabel n = cInfo.get(loc + 1);
  CoreLabel n2 = cInfo.get(loc + 2);
  CoreLabel p = cInfo.get(loc - 1);
  CoreLabel p2 = cInfo.get(loc - 2);
  String charc = c.get(CoreAnnotations.CharAnnotation.class);
  String charn = n.get(CoreAnnotations.CharAnnotation.class);
  String charn2 = n2.get(CoreAnnotations.CharAnnotation.class);
  String charp = p.get(CoreAnnotations.CharAnnotation.class);
  String charp2 = p2.get(CoreAnnotations.CharAnnotation.class);

  // Default feature set...a 5 character window
  // Adding actual characters causes overfitting.
  // features.add(charc +"-c");
  // features.add(charn + "-n1");
  // features.add(charn2 + "-n2" );
  // features.add(charp + "-p");
  // features.add(charp2 + "-p2");

  // Sequence start indicator
  if (loc == 0) features.add("seq-start");

  // Character-class features
  addCharacterClassFeatures(features, charp2, "-p2");
  addCharacterClassFeatures(features, charp, "-p");
  addCharacterClassFeatures(features, charn, "-n");
  addCharacterClassFeatures(features, charn2, "-n2");
  addCharacterClassFeatures(features, charc, "-c");

  // Token features — only for non-whitespace positions.
  if (charc != null && ! charc.equals(ProcessorTools.WHITESPACE)) {
    // Current token
    String cToken = tokenClass(c.get(ParentAnnotation.class));
    features.add(cToken + "-cword");

    // Character position in the current token.
    int cPosition = c.get(CharacterOffsetBeginAnnotation.class);
    if (cPosition == 0) {
      features.add("char-start");
      features.add("start-" + cToken);
    } else {
      // features.add("char-inside");
      features.add("inside-" + cToken);
    }

    // Left context: scan backwards for the first whitespace and take the
    // token class of the character just before it; "<S>" if none found.
    String leftToken = "<S>";
    for (int i = loc-1; i > 0; --i) {
      String leftC = cInfo.get(i).get(CoreAnnotations.CharAnnotation.class);
      if (leftC != null && leftC.equals(ProcessorTools.WHITESPACE)) {
        String left = tokenClass(cInfo.get(i-1).get(CoreAnnotations.ParentAnnotation.class));
        if (left != null) {
          leftToken = left;
        }
        break;
      }
    }
    features.add(leftToken + "-lcontext");

    // Right context unigram: scan forwards for the first whitespace and take
    // the token class of the character just after it; "</S>" if none found.
    String rightToken = "</S>";
    for (int i = loc+1; i < cInfo.size()-1; ++i) {
      String rightC = cInfo.get(i).get(CoreAnnotations.CharAnnotation.class);
      if (rightC != null && rightC.equals(ProcessorTools.WHITESPACE)) {
        String right = tokenClass(cInfo.get(i+1).get(CoreAnnotations.ParentAnnotation.class));
        if (right != null) {
          rightToken = right;
        }
        break;
      }
    }
    features.add(rightToken + "-rcontext");

    // Context n-grams, mainly for the truecasing feature
    // Left context bigram
    features.add(leftToken + "-" + cToken + "-lbigram");
  }

  // Indicator transition feature
  features.add("cliqueC");
  return features;
}