edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetEndAnnotation Java Examples

Source File: CoreNlpTokenizer.java From jstarcraft-nlp with Apache License 2.0

5 votes

/**
 * Advances to the next CoreNLP token and copies its data into the stream attributes.
 *
 * <p>The term text is the token's lemma, or its plain text when no lemma is set.
 * The type is the named-entity tag unless that tag is missing or {@code "O"}, in
 * which case the part-of-speech tag is used; if that is missing as well,
 * {@link TypeAttribute#DEFAULT_TYPE} is used.
 *
 * @return {@code true} when a token was emitted; {@code false} when
 *         {@code getNextSentence()} reports that nothing more can be read
 */
@Override
public boolean incrementToken() {
    clearAttributes();

    // Keep fetching sentences until the token iterator has something to hand out.
    while (tokens == null || !tokens.hasNext()) {
        if (!getNextSentence()) {
            return false;
        }
    }
    final CoreLabel current = tokens.next();

    // Term text: lemma when available, otherwise the token text.
    String term = current.get(LemmaAnnotation.class);
    if (term == null) {
        term = current.get(TextAnnotation.class);
    }
    termAttribute.setLength(0);
    termAttribute.append(term);

    // Token type: NER tag first, then part of speech, then the default type.
    String type = current.get(NamedEntityTagAnnotation.class);
    if (type == null || "O".equals(type)) {
        type = current.get(PartOfSpeechAnnotation.class);
    }
    if (type == null) {
        type = TypeAttribute.DEFAULT_TYPE;
    }
    typeAttribute.setType(type);

    // Character offsets as recorded on the token.
    final int startOffset = current.get(CharacterOffsetBeginAnnotation.class).intValue();
    final int endOffset = current.get(CharacterOffsetEndAnnotation.class).intValue();
    offsetAttribute.setOffset(startOffset, endOffset);

    // Position increment includes the skippedTokens counter, which is reset
    // once it has been reported.
    positionAttribute.setPositionIncrement(skippedTokens + 1);
    skippedTokens = 0;
    return true;
}

Source File: CoreNLPHelper.java From Heracles with GNU General Public License v3.0

4 votes

/**
 * Builds a Stanford CoreNLP {@code Annotation} containing exactly one sentence from the
 * words of {@code sentenceSpan}.
 *
 * <p>Each word becomes a {@code CoreLabel} carrying its text, character offsets and index,
 * plus the "pos", "lemma", "nerLabel" and "nerValue" annotations when the word has them.
 * The same token list is attached to both the document and the sentence.
 *
 * @param sentenceSpan span providing the "text" annotation and the words to convert
 * @param wordIndex map that is filled with every word of the span, keyed as selected by
 *        {@code useWordOrderInsteadOfOffset}
 * @param useWordOrderInsteadOfOffset if {@code true}, key {@code wordIndex} by
 *        {@code getOrder()}; otherwise by {@code getStartOffset()}
 * @return the reconstructed annotation
 */
public static Annotation reconstructStanfordAnnotations(Span sentenceSpan, HashMap<Integer, Word> wordIndex, boolean useWordOrderInsteadOfOffset){
	final String sentenceText = sentenceSpan.getAnnotation("text", String.class);

	// Document-level annotation: text, sentence list and token list.
	Annotation document = new Annotation(sentenceText);
	document.set(TextAnnotation.class, sentenceText);

	List<CoreMap> sentences = new ArrayList<>();
	document.set(SentencesAnnotation.class, sentences);
	List<CoreLabel> tokens = new ArrayList<>();
	document.set(TokensAnnotation.class, tokens);

	// The whole span is treated as one sentence.
	ArrayCoreMap sentence = new ArrayCoreMap();
	sentences.add(sentence);

	for (Word word : sentenceSpan) {
		final String surface = word.getWord();

		CoreLabel label = new CoreLabel();
		label.set(TextAnnotation.class, surface);
		label.set(OriginalTextAnnotation.class, surface);
		label.set(ValueAnnotation.class, surface);
		label.set(CharacterOffsetBeginAnnotation.class, word.getStartOffset());
		label.set(CharacterOffsetEndAnnotation.class, word.getEndOffset());

		// Token index is the word order shifted by one.
		label.set(IndexAnnotation.class, word.getOrder() + 1);
		label.set(SentenceIndexAnnotation.class, 0);

		label.set(DocIDAnnotation.class, "document");
		label.setDocID("document");

		// Optional linguistic annotations are copied only when the word has them.
		if (word.hasAnnotation("pos")) {
			label.set(PartOfSpeechAnnotation.class, word.getAnnotation("pos", String.class));
		}
		if (word.hasAnnotation("lemma")) {
			label.set(LemmaAnnotation.class, word.getAnnotation("lemma", String.class));
		}
		if (word.hasAnnotation("nerLabel")) {
			label.set(NamedEntityTagAnnotation.class, word.getAnnotation("nerLabel", String.class));
		}
		if (word.hasAnnotation("nerValue")) {
			label.set(NormalizedNamedEntityTagAnnotation.class, word.getAnnotation("nerValue", String.class));
		}
		tokens.add(label);

		// Record the word so callers can map tokens back to it.
		if (useWordOrderInsteadOfOffset) {
			wordIndex.put(word.getOrder(), word);
		} else {
			wordIndex.put(word.getStartOffset(), word);
		}
	}

	// Essential sentence annotations: tokens, text and sentence index.
	sentence.set(TokensAnnotation.class, tokens);
	sentence.set(TextAnnotation.class, sentenceText);
	sentence.set(SentenceIndexAnnotation.class, 0);

	// NOTE(review): the sentence begin offset is fixed at 0 while the tokens use
	// word.getStartOffset() — presumably spans always start at offset 0; confirm.
	sentence.set(CharacterOffsetBeginAnnotation.class, 0);
	sentence.set(CharacterOffsetEndAnnotation.class, sentenceSpan.last().getEndOffset());
	sentence.set(TokenBeginAnnotation.class, 0);
	// NOTE(review): the token end is the last word's order, whereas token indices above
	// are order + 1 — verify whether an exclusive end was intended here.
	sentence.set(TokenEndAnnotation.class, sentenceSpan.last().getOrder());

	return document;
}

Source File: JsonPipeline.java From tac2015-event-detection with GNU General Public License v3.0

4 votes

/**
 * Converts the entity mentions of {@code sentence} into JSON-friendly maps and stores the
 * resulting list in {@code sent_info} under the key {@code "entitymentions"}.
 *
 * <p>Each mention map contains:
 * <ul>
 *   <li>{@code "tokspan"} – list of the token begin and token end values</li>
 *   <li>{@code "charspan"} – list of the character begin and character end offsets</li>
 *   <li>{@code "sentence"} – the sentence index</li>
 *   <li>{@code "type"} – the entity type</li>
 *   <li>{@code "normalized"} – the normalized NER tag, only when the mention has one</li>
 *   <li>{@code "timex_xml"} – the string form of the Timex annotation, only when present</li>
 * </ul>
 *
 * <p>If the sentence has no mentions annotation, an empty list is stored instead of
 * failing with a {@code NullPointerException}.
 *
 * @param sent_info output map for the sentence; receives the {@code "entitymentions"} entry
 * @param sentence sentence whose mentions annotation is read
 */
static void addEntityMentions(Map<String,Object> sent_info, CoreMap sentence) {
    // get() yields null when the sentence carries no mentions annotation.
    List<CoreMap> coreMentions = sentence.get(MentionsAnnotation.class);
    List<Map<String, Object>> jsonMentions = new ArrayList<>();
    /* Reference printout of the keys found on three sample mentions:
MENTION August 2014
class edu.stanford.nlp.ling.CoreAnnotations$TextAnnotation	August 2014
class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetBeginAnnotation	3
class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetEndAnnotation	14
class edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation	[August-2, 2014-3]
class edu.stanford.nlp.ling.CoreAnnotations$TokenBeginAnnotation	1
class edu.stanford.nlp.ling.CoreAnnotations$TokenEndAnnotation	3
class edu.stanford.nlp.ling.CoreAnnotations$NamedEntityTagAnnotation	DATE
class edu.stanford.nlp.ling.CoreAnnotations$NormalizedNamedEntityTagAnnotation	2014-08
class edu.stanford.nlp.ling.CoreAnnotations$EntityTypeAnnotation	DATE
class edu.stanford.nlp.ling.CoreAnnotations$SentenceIndexAnnotation	0
class edu.stanford.nlp.time.TimeAnnotations$TimexAnnotation	<TIMEX3 tid="t1" type="DATE" value="2014-08">August 2014</TIMEX3>
MENTION Barack Obama
class edu.stanford.nlp.ling.CoreAnnotations$TextAnnotation	Barack Obama
class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetBeginAnnotation	17
class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetEndAnnotation	29
class edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation	[Barack-5, Obama-6]
class edu.stanford.nlp.ling.CoreAnnotations$TokenBeginAnnotation	4
class edu.stanford.nlp.ling.CoreAnnotations$TokenEndAnnotation	6
class edu.stanford.nlp.ling.CoreAnnotations$NamedEntityTagAnnotation	PERSON
class edu.stanford.nlp.ling.CoreAnnotations$EntityTypeAnnotation	PERSON
class edu.stanford.nlp.ling.CoreAnnotations$SentenceIndexAnnotation	0
MENTION Paris
class edu.stanford.nlp.ling.CoreAnnotations$TextAnnotation	Paris
class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetBeginAnnotation	66
class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetEndAnnotation	71
class edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation	[Paris-5]
class edu.stanford.nlp.ling.CoreAnnotations$TokenBeginAnnotation	14
class edu.stanford.nlp.ling.CoreAnnotations$TokenEndAnnotation	15
class edu.stanford.nlp.ling.CoreAnnotations$NamedEntityTagAnnotation	LOCATION
class edu.stanford.nlp.ling.CoreAnnotations$EntityTypeAnnotation	LOCATION
class edu.stanford.nlp.ling.CoreAnnotations$SentenceIndexAnnotation	1
     */
    if (coreMentions != null) {
        for (CoreMap mention : coreMentions) {
            Map<String, Object> m = new HashMap<>();
            m.put("tokspan", Lists.newArrayList(
                    mention.get(TokenBeginAnnotation.class).intValue(),
                    mention.get(TokenEndAnnotation.class).intValue()));
            m.put("charspan", Lists.newArrayList(
                    mention.get(CharacterOffsetBeginAnnotation.class).intValue(),
                    mention.get(CharacterOffsetEndAnnotation.class).intValue()));
            m.put("sentence", mention.get(SentenceIndexAnnotation.class).intValue());
            String entityType = mention.get(EntityTypeAnnotation.class);
            m.put("type", entityType);
            // Optional keys: only some mentions carry them (see printout above).
            if (mention.containsKey(NormalizedNamedEntityTagAnnotation.class)) {
                m.put("normalized", mention.get(NormalizedNamedEntityTagAnnotation.class));
            }
            if (mention.containsKey(TimexAnnotation.class)) {
                m.put("timex_xml", mention.get(TimexAnnotation.class).toString());
            }
            jsonMentions.add(m);
        }
    }
    sent_info.put("entitymentions", jsonMentions);
}

edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetEndAnnotation Java Examples