edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation Java Examples

Source File: CoreNlpTokenizer.java From jstarcraft-nlp with Apache License 2.0

5 votes

/**
 * Moves the stream to the next CoreNLP token and publishes it through the
 * term, type, offset and position-increment attributes.
 *
 * @return {@code true} when a token was emitted; {@code false} as soon as
 *         {@code getNextSentence()} reports that nothing more is available
 */
@Override
public boolean incrementToken() {
    clearAttributes();

    // Keep fetching sentences until the token iterator has something to hand out.
    while (tokens == null || !tokens.hasNext()) {
        if (!getNextSentence()) {
            return false;
        }
    }
    final CoreLabel current = tokens.next();

    // Term text: the lemma when present, the surface text otherwise.
    final String lemma = current.get(LemmaAnnotation.class);
    final String term = (lemma != null) ? lemma : current.get(TextAnnotation.class);
    termAttribute.setLength(0);
    termAttribute.append(term);

    // Token type: the NER tag, unless it is missing or the "O" tag, in which
    // case the part-of-speech tag is used; the default type is the last resort.
    String type = current.get(NamedEntityTagAnnotation.class);
    if (type == null || "O".equals(type)) {
        type = current.get(PartOfSpeechAnnotation.class);
    }
    typeAttribute.setType(type == null ? TypeAttribute.DEFAULT_TYPE : type);

    // Character offsets as recorded on the token.
    final int startOffset = current.get(CharacterOffsetBeginAnnotation.class).intValue();
    final int endOffset = current.get(CharacterOffsetEndAnnotation.class).intValue();
    offsetAttribute.setOffset(startOffset, endOffset);

    // Position increment accounts for tokens skipped since the last emitted one.
    positionAttribute.setPositionIncrement(skippedTokens + 1);
    skippedTokens = 0;
    return true;
}

Source File: CoreNLPNamedEntityRecognizer.java From Heracles with GNU General Public License v3.0

5 votes

/**
	 * Process the Dataset in chunks, as defined by the <code>spanTypeOfSentenceUnit</code> parameter.
	 * The Spans denoted by that span type must each contain Words belonging to a single sentence.
	 * <p>
	 * For every token the NER pipeline produces, the matching Word (looked up by its
	 * character start offset) receives a "nerLabel" annotation and, when available,
	 * a "nerValue" annotation.
	 * <p>
	 * Spans for which no Stanford annotation could be reconstructed are skipped, as are
	 * tokens that cannot be mapped back to a Word.
	 *
	 * @param dataset the dataset whose spans are annotated in place
	 * @param spanTypeOfSentenceUnit the span type that selects the sentence-level spans
	 */
	@Override
	public void validatedProcess(Dataset dataset, String spanTypeOfSentenceUnit){
		Properties nerProperties = new Properties();
		nerProperties.setProperty("annotators", "ner");
		StanfordCoreNLP pipeline = new StanfordCoreNLP(nerProperties, false);

		for (Span span : dataset.getSpans(spanTypeOfSentenceUnit)){
			HashMap<Integer, Word> wordIndex = new HashMap<>();
			Annotation a = CoreNLPHelper.reconstructStanfordAnnotations(span, wordIndex);
			if (a == null){
				// Previously this printed "null" and then passed the null on to the pipeline.
				System.err.println("Skipping span: no Stanford annotation could be reconstructed");
				continue;
			}
			pipeline.annotate(a);

			List<CoreMap> sentenceAnnotations = a.get(SentencesAnnotation.class);
			if (sentenceAnnotations == null){
				continue;
			}
			for (CoreMap sentence : sentenceAnnotations){
				for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
					// Words are indexed by character start offset for this overload's callers.
					Word w = wordIndex.get(token.get(CharacterOffsetBeginAnnotation.class));
					if (w == null){
						// No Word registered at this offset; nothing to attach the labels to.
						continue;
					}
					String ner = token.get(NamedEntityTagAnnotation.class);
					String nerValue = token.get(NormalizedNamedEntityTagAnnotation.class);
					if (ner != null){
						w.putAnnotation("nerLabel", ner);
					}
					if (nerValue != null){
						w.putAnnotation("nerValue", nerValue);
					}
				}
			}
		}
	}

Source File: Entity.java From NLIWOD with GNU Affero General Public License v3.0

5 votes

/**
 * Checks if there is an entity of the specified type in the question.
 * <p>
 * The entity type is upper-cased with {@link java.util.Locale#ROOT} before it is
 * compared to each token's NER tag, so the result does not depend on the JVM's
 * default locale. The search stops at the first matching token.
 *
 * @param entityType an entity type: Date, Location, Organization, Person, Percent, or Money
 * @param question the question text to annotate and scan
 * @return the name of the type if an entity of that type is present, otherwise "No" + the name of the type
 */
protected String recognizeEntity(String entityType, String question){
	String notFound = "No" + entityType;
	// Computed once; a locale-sensitive toUpperCase() would break e.g. "Location" under a Turkish locale.
	String expectedTag = (entityType == null) ? null : entityType.toUpperCase(java.util.Locale.ROOT);

	Annotation annotation = new Annotation(question);
	pipeline.annotate(annotation);
	List<CoreMap> sentences = annotation.get(SentencesAnnotation.class);
	if (sentences == null || expectedTag == null) {
		return notFound;
	}
	for (CoreMap sentence : sentences) {
		for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
			String ne = token.get(NamedEntityTagAnnotation.class);
			if (expectedTag.equals(ne)) {
				return entityType;
			}
		}
	}
	return notFound;
}

Source File: WiseOwlStanfordFilter.java From wiseowl with MIT License

4 votes

/**
 * Demo entry point: runs a CoreNLP pipeline (plus a SUTime annotator) over a
 * fixed sample text, prints the document text, and then prints the word,
 * part-of-speech tag and NER tag of every token.
 *
 * @param args ignored
 */
public static void main(String args[])
{
	final Properties config = new Properties();
	config.setProperty("annotators", "tokenize, cleanxml, ssplit,pos,lemma,ner");

	final StanfordCoreNLP nlp = new StanfordCoreNLP(config);
	nlp.addAnnotator(new TimeAnnotator("sutime", config));

	final String sample = "<mydata> refeer</mydata>today is 12 jan 2016. what is tommorow? Who is Avtar? Does he work at Apple or Google? Sumit was born on 13 feb,2011.";
	final Annotation doc = new Annotation(sample);
	nlp.annotate(doc);
	System.out.println(doc.get(CoreAnnotations.TextAnnotation.class));

	// Build a TimeData per recognised time expression; the objects are not kept.
	final List<CoreMap> timeExpressions = doc.get(TimeAnnotations.TimexAnnotations.class);
	for (CoreMap timeExpression : timeExpressions) {
		final List<CoreLabel> exprTokens = timeExpression.get(CoreAnnotations.TokensAnnotation.class);
		final TimeData timeData = new TimeData();
		timeData.setTime(timeExpression.get(TimeExpression.Annotation.class).getTemporal().toISOString());
		final CoreLabel firstToken = exprTokens.get(0);
		timeData.setStart(firstToken.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
		final CoreLabel lastToken = exprTokens.get(exprTokens.size() - 1);
		timeData.setEnd(lastToken.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
	}

	// Dump text / POS / NER for each token of each sentence.
	for (CoreMap sentence : doc.get(SentencesAnnotation.class)) {
		System.out.println("in sent");
		for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
			System.out.println("in token");
			System.out.println("word : " + token.get(TextAnnotation.class)
					+ " pos: " + token.get(PartOfSpeechAnnotation.class)
					+ " ner: " + token.get(NamedEntityTagAnnotation.class));
		}
	}
}

Source File: WiseOwlStanfordFilter.java From wiseowl with MIT License

4 votes

/**
 * Reads the next term from the wrapped input, annotates it with the CoreNLP
 * pipeline, and queues the results.
 * <p>
 * Every recognised time expression is appended to {@code timeQueue} as a
 * {@code TimeData} (temporal value plus begin/end character offsets), and every
 * token is appended to {@code tokenQueue} as a {@code TokenData} (text, POS tag,
 * NER tag). The queues are not cleared by this method. Afterwards
 * {@code itr_cpy}, {@code tokenOffset}, {@code start} and {@code end} are reset.
 *
 * @return an iterator over {@code tokenQueue}, or {@code null} when the input
 *         has no further token
 * @throws IOException if advancing the input stream fails
 */
public Iterator findTokens() throws IOException
{
	if (!input.incrementToken()) return null;
	String text = input.getAttribute(CharTermAttribute.class).toString();

	// A CoreMap is essentially a Map that uses class objects as keys and has values with custom types.
	Annotation document = new Annotation(text);
	pipeline.annotate(document);

	// The annotation is absent (null) if no annotator populated it; treat that as "no time expressions".
	List<CoreMap> timexAnnsAll = document.get(TimeAnnotations.TimexAnnotations.class);
	if (timexAnnsAll != null) {
		for (CoreMap cm : timexAnnsAll) {
			List<CoreLabel> tokens = cm.get(CoreAnnotations.TokensAnnotation.class);
			if (tokens == null || tokens.isEmpty()) {
				// Without tokens there are no offsets to record for this expression.
				continue;
			}
			TimeData td = new TimeData();
			td.setTime(cm.get(TimeExpression.Annotation.class).getTemporal().toString());
			td.setStart(tokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
			td.setEnd(tokens.get(tokens.size() - 1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
			timeQueue.add(td);
		}
	}

	List<CoreMap> sentences = document.get(SentencesAnnotation.class);
	if (sentences != null) {
		for (CoreMap sentence : sentences) {
			// A CoreLabel is a CoreMap with additional token-specific methods.
			for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
				TokenData tok = new TokenData();
				tok.setNER(token.get(NamedEntityTagAnnotation.class));
				tok.setToken(token.get(TextAnnotation.class));
				tok.setPOS(token.get(PartOfSpeechAnnotation.class));
				tokenQueue.add(tok);
			}
		}
	}

	Iterator<TokenData> it = tokenQueue.iterator();
	itr_cpy = tokenQueue.iterator();
	tokenOffset = 0;
	start = 0;
	end = 0;
	return it;
}

Source File: CoreNLPHelper.java From Heracles with GNU General Public License v3.0

4 votes

/**
 * Rebuilds a single-sentence Stanford {@code Annotation} from a sentence span.
 * <p>
 * One {@code CoreLabel} is created per Word, carrying its text, character offsets,
 * index ({@code getOrder()+1}), sentence index 0, document id "document" and any
 * existing "pos", "lemma", "nerLabel" and "nerValue" annotations. Each Word is also
 * registered in {@code wordIndex}.
 *
 * @param sentenceSpan the span to convert; its "text" annotation supplies the sentence text
 * @param wordIndex filled by this method: Word order or start offset mapped to the Word
 * @param useWordOrderInsteadOfOffset if true, key {@code wordIndex} by {@code getOrder()};
 *        otherwise by {@code getStartOffset()}
 * @return the reconstructed annotation holding exactly one sentence
 */
public static Annotation reconstructStanfordAnnotations(Span sentenceSpan, HashMap<Integer, Word> wordIndex, boolean useWordOrderInsteadOfOffset){
		final String text = sentenceSpan.getAnnotation("text", String.class);
		final Annotation document = new Annotation(text);
		final ArrayCoreMap sentence = new ArrayCoreMap();
		final List<CoreMap> sentences = new ArrayList<CoreMap>();
		final List<CoreLabel> labels = new ArrayList<CoreLabel>();
		sentences.add(sentence);

		for (Word word : sentenceSpan){
			final CoreLabel label = new CoreLabel();

			// The same surface string serves as text, original text and value.
			final String surface = word.getWord();
			label.set(TextAnnotation.class, surface);
			label.set(OriginalTextAnnotation.class, surface);
			label.set(ValueAnnotation.class, surface);

			// Positional information.
			label.set(CharacterOffsetBeginAnnotation.class, word.getStartOffset());
			label.set(CharacterOffsetEndAnnotation.class, word.getEndOffset());
			label.set(IndexAnnotation.class, word.getOrder()+1);
			label.set(SentenceIndexAnnotation.class, 0);

			label.set(DocIDAnnotation.class, "document");
			label.setDocID("document");

			// Carry over annotations that earlier processing steps already produced.
			if (word.hasAnnotation("pos")){
				label.set(PartOfSpeechAnnotation.class, word.getAnnotation("pos",String.class));
			}
			if (word.hasAnnotation("lemma")){
				label.set(LemmaAnnotation.class, word.getAnnotation("lemma", String.class));
			}
			if (word.hasAnnotation("nerLabel")){
				label.set(NamedEntityTagAnnotation.class, word.getAnnotation("nerLabel", String.class));
			}
			if (word.hasAnnotation("nerValue")){
				label.set(NormalizedNamedEntityTagAnnotation.class, word.getAnnotation("nerValue", String.class));
			}
			labels.add(label);

			// Register the Word so callers can map tokens back to it.
			if (!useWordOrderInsteadOfOffset){
				wordIndex.put(word.getStartOffset(), word);
			} else {
				wordIndex.put(word.getOrder(), word);
			}
		}

		// Sentence-level annotations: tokens, text, sentence index, character and token extents.
		sentence.set(TokensAnnotation.class, labels);
		sentence.set(TextAnnotation.class, text);
		sentence.set(SentenceIndexAnnotation.class, 0);
		sentence.set(CharacterOffsetBeginAnnotation.class, 0);
		sentence.set(CharacterOffsetEndAnnotation.class, sentenceSpan.last().getEndOffset());
		sentence.set(TokenBeginAnnotation.class, 0);
		sentence.set(TokenEndAnnotation.class, sentenceSpan.last().getOrder());

		// Document-level annotations.
		document.set(TextAnnotation.class, text);
		document.set(SentencesAnnotation.class, sentences);
		document.set(TokensAnnotation.class, labels);
		return document;
	}

Source File: JsonPipeline.java From tac2015-event-detection with GNU General Public License v3.0

4 votes

/**
 * Copies the output of one annotator for {@code sentence} into {@code sent_info}.
 * "annotator" is a Stanford CoreNLP notion (the annotator's name).
 *
 * @param sent_info the per-sentence map that receives the extracted data
 * @param sentence the annotated sentence to read from
 * @param annotator the CoreNLP annotator name
 * @throws RuntimeException for annotators that are not implemented yet ("TODO")
 *         and for annotator names this method does not know
 */
void addAnnoToSentenceObject(Map<String,Object> sent_info, CoreMap sentence, String annotator) {
	switch (annotator) {
	// Annotators with nothing to export at the sentence level.
	case "tokenize":
	case "cleanxml":
	case "ssplit":
	case "dcoref":
		return;

	// Per-token annotations.
	case "pos":
		addTokenAnno(sent_info,sentence, "pos", PartOfSpeechAnnotation.class);
		return;
	case "lemma":
		addTokenAnno(sent_info,sentence, "lemmas", LemmaAnnotation.class);
		return;
	case "ner":
		addTokenAnno(sent_info, sentence, "ner", NamedEntityTagAnnotation.class);
		addTokenAnno(sent_info, sentence, "normner", NormalizedNamedEntityTagAnnotation.class);
		return;
	case "regexner":
		addTokenAnno(sent_info, sentence, "ner", NamedEntityTagAnnotation.class);
		return;

	// Structural annotations.
	case "parse":
		addParseTree(sent_info,sentence);
		addDepsCC(sent_info,sentence);
		addDepsBasic(sent_info,sentence);
		return;
	case "depparse":
		addDepsCC(sent_info,sentence);
		addDepsBasic(sent_info,sentence);
		return;
	case "entitymentions":
		addEntityMentions(sent_info, sentence);
		return;

	// Known annotators that are not supported yet.
	case "sentiment":
	case "truecase":
	case "relation":
	case "natlog":
	case "quote":
		throw new RuntimeException("TODO");

	default:
		throw new RuntimeException("don't know how to handle annotator " + annotator);
	}
}

Source File: CoreNLPToJSON.java From phrasal with GNU General Public License v3.0

4 votes

/**
 * Process an English text file, one sentence per line, and print a JSON
 * document with tokens, POS tags, NER tags, chunk vector and layout
 * information for every line to standard output.
 * <p>
 * The input reader is closed even when processing aborts with an exception.
 *
 * @param args the input file, optionally followed by an input-properties string
 * @throws IOException if the input file cannot be read
 * @throws RuntimeException if a line is not split into exactly one sentence, or
 *         if CoreNLP and the preprocessor disagree on the number of tokens
 */
public static void main(String[] args) throws IOException {
  if (args.length < 1) {
    System.err.printf("Usage: java %s file [inputproperties_str] > json_output%n", CoreNLPToJSON.class.getName());
    System.exit(-1);
  }
  String textFile = args[0];
  InputProperties inputProperties = args.length > 1 ? InputProperties.fromString(args[1]) : new InputProperties();

  StanfordCoreNLP coreNLP = new StanfordCoreNLP(properties);
  
  // Configure tokenizer
  EnglishPreprocessor preprocessor = new EnglishPreprocessor(true);
  
  // Use a map with ordered keys so that the output is ordered by segmentId.
  Map<Integer,SourceSegment> annotations = new TreeMap<Integer,SourceSegment>();
  LineNumberReader reader = IOTools.getReaderFromFile(textFile);
  try {
    for (String line; (line = reader.readLine()) != null;) {
      Annotation annotation = coreNLP.process(line);
      List<CoreMap> sentences = annotation.get(SentencesAnnotation.class);
      if (sentences.size() != 1) {
        throw new RuntimeException("Sentence splitting on line: " + reader.getLineNumber());
      }
      CoreMap sentence = sentences.get(0);
      Tree tree = sentence.get(TreeAnnotation.class);
      tree.indexLeaves();
      int[] chunkVector = getChunkVector(tree);
      List<CoreLabel> tokens = sentence.get(TokensAnnotation.class);
      int numTokens = tokens.size();
      // Both tokenizations must line up token-for-token for the layout spec to be valid.
      SymmetricalWordAlignment alignment = preprocessor.processAndAlign(line);
      if (alignment.e().size() != numTokens) {
        throw new RuntimeException(String.format("Tokenizer configurations differ: %d/%d", alignment.e().size(), numTokens));
      }
      SourceSegment segment = new SourceSegment(numTokens);
      segment.layoutSpec.addAll(makeLayoutSpec(alignment));
      segment.inputProperties = inputProperties.toString();
      for (int j = 0; j < numTokens; ++j) {
        CoreLabel token = tokens.get(j);
        String word = token.get(TextAnnotation.class);
        segment.tokens.add(unescape(word));
        String pos = mapPOS(token.get(PartOfSpeechAnnotation.class));
        segment.pos.add(pos);
        String ne = token.get(NamedEntityTagAnnotation.class);
        segment.ner.add(ne);
        segment.chunkVector[j] = chunkVector[j];
      }
      // getLineNumber() already counts the line just read, hence the -1 for a zero-based segment id.
      annotations.put(reader.getLineNumber()-1, segment);
    }
  } finally {
    // Release the file handle on every exit path, including the RuntimeExceptions above.
    reader.close();
  }
  System.err.printf("Processed %d sentences%n", reader.getLineNumber());
  
  final SourceDocument jsonDocument = new SourceDocument(textFile, annotations);
  
  // Convert to json
  Gson gson = new Gson();
  String json = gson.toJson(jsonDocument);
  System.out.println(json);
}

edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation Java Examples