edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation Java Exaples

Source File: Chapter5.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License

6 votes

private static void usingStanfordPOSTagger() {
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, pos");
    props.put("pos.model", "C:\\Current Books in Progress\\NLP and Java\\Models\\english-caseless-left3words-distsim.tagger");
    props.put("pos.maxlen", 10);
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation document = new Annotation(theSentence);
    pipeline.annotate(document);

    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            String word = token.get(TextAnnotation.class);
            String pos = token.get(PartOfSpeechAnnotation.class);
            System.out.print(word + "/" + pos + " ");
        }
        System.out.println();

        try {
            pipeline.xmlPrint(document, System.out);
            pipeline.prettyPrint(document, System.out);
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }
}

Source File: ReconTool.java From Criteria2Query with Apache License 2.0

6 votes

public boolean isCEE(String text){
	text = text.replace("/", " / ");
	Annotation annotation = new Annotation(text);
	pipeline.annotate(annotation);
	List<CoreMap> sentences = annotation.get(SentencesAnnotation.class);
	boolean flag=false;
	for (CoreMap sentence : sentences) {
		for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
			String word = token.get(TextAnnotation.class);//token.get(LemmaAnnotation.class);//TextAnnotation.class
			String pos = token.get(PartOfSpeechAnnotation.class);
			//String lemma = token.get(LemmaAnnotation.class);
			boolean f = false;
			if ((word.equals("and") || word.equals(",") || word.equals("/") || word.equals("or"))) {
				flag = true;
				break;
			}
			
		}
	}
	
	return flag;
}

Source File: CoreNLPLemmatizer.java From Heracles with GNU General Public License v3.0

5 votes

/**
	 * Process the Dataset in chunks, as defined by the <code>spanType</code> parameter.
	 * The Spans denoted by spanType must each contain Words belonging to a single sentence.
	 * 
	 */
	@Override
	public void validatedProcess(Dataset dataset, String spanTypeOfSentenceUnit){
		
		
		
		Properties prop1 = new Properties();
		prop1.setProperty("annotators", "lemma");
		StanfordCoreNLP pipeline = new StanfordCoreNLP(prop1, false);
		
		for (Span span : dataset.getSpans(spanTypeOfSentenceUnit)){

			
			HashMap<Integer, Word> wordIndex = new HashMap<>();
			Annotation a = CoreNLPHelper.reconstructStanfordAnnotations(span, wordIndex);
			if (a == null){
				System.out.println(a);
			}
			pipeline.annotate(a);
			List<CoreMap> sentenceAnnotations = a.get(SentencesAnnotation.class);
			for (CoreMap sentence : sentenceAnnotations){
				for (CoreLabel token: sentence.get(TokensAnnotation.class)) {
					
					Word w = wordIndex.get(token.get(CharacterOffsetBeginAnnotation.class));
					String tempLemma = token.get(LemmaAnnotation.class);
					w.putAnnotation("lemma", tempLemma.toLowerCase());
//					System.out.println(w.getAnnotations());
				}
			

				
				
			}
		}		
	}

Source File: StopwordAnnotator.java From coreNlp with Apache License 2.0

5 votes

@Override
public void annotate(Annotation annotation) {
    if (stopwords != null && stopwords.size() > 0 && annotation.containsKey(TokensAnnotation.class)) {
        List<CoreLabel> tokens = annotation.get(TokensAnnotation.class);
        for (CoreLabel token : tokens) {
            boolean isWordStopword = stopwords.contains(token.word().toLowerCase());
            boolean isLemmaStopword = checkLemma ? stopwords.contains(token.word().toLowerCase()) : false;
            Pair<Boolean, Boolean> pair = Pair.makePair(isWordStopword, isLemmaStopword);
            token.set(StopwordAnnotator.class, pair);
        }
    }
}

Source File: Stemming.java From AGDISTIS with GNU Affero General Public License v3.0

5 votes

public String stemming(String documentText) {
	List<String> lemmas = new LinkedList<String>();
	String label = null;
	LancasterStemmer stem = new LancasterStemmer();
	// Create an empty Annotation just with the given text
	Annotation document = new Annotation(documentText);
	// run all Annotators on this text
	this.pipeline.annotate(document);
	// Iterate over all of the sentences found
	List<CoreMap> sentences = document.get(SentencesAnnotation.class);
	for (CoreMap sentence : sentences) {
		// Iterate over all tokens in a sentence
		for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
			// Retrieve and add the lemma for each word into the
			// list of lemmas
			// lemmas.add(token.get(LemmaAnnotation.class));
			// lemmas.add(morpho.stem(token.word()));
			lemmas.add(stem.stem(token.get(LemmaAnnotation.class)));
		}
	}

	label = lemmas.toString();
	Pattern p = Pattern.compile("[,.;!?(){}\\[\\]<>%]");
	label = p.matcher(label).replaceAll("");

	return label;
}

Source File: Phrase.java From uncc2014watsonsim with GNU General Public License v2.0

5 votes

private static List<String> _lemmas(Phrase p) {
	return p.memo(Phrase.sentences)
			.stream()
			.flatMap(s -> s.get(TokensAnnotation.class).stream())
			.map( t -> t.get(LemmaAnnotation.class))
			.collect(toList());
}

Source File: JsonPipeline.java From tac2015-event-detection with GNU General Public License v3.0

5 votes

@SuppressWarnings({ "rawtypes", "unchecked" })
static void addTokenAnno(Map<String,Object> sent_info, CoreMap sentence,
		String keyname, Class annoClass) {
	List<String> tokenAnnos = Lists.newArrayList();
	for (CoreLabel token: sentence.get(TokensAnnotation.class)) {
		tokenAnnos.add(token.getString(annoClass));
	}
	sent_info.put(keyname, (Object) tokenAnnos);
}

Source File: JsonPipeline.java From tac2015-event-detection with GNU General Public License v3.0

5 votes

static void addTokenBasics(Map<String,Object> sent_info, CoreMap sentence) {
	List<List<Integer>> tokenSpans = Lists.newArrayList();
	List<String> tokenTexts = Lists.newArrayList();
	for (CoreLabel token: sentence.get(TokensAnnotation.class)) {
		List<Integer> span = Lists.newArrayList(token.beginPosition(), token.endPosition());
		tokenSpans.add(span);
		tokenTexts.add(token.value());
	}
	sent_info.put("tokens", (Object) tokenTexts);
	sent_info.put("char_offsets", (Object) tokenSpans);
}

Source File: Entity.java From NLIWOD with GNU Affero General Public License v3.0

5 votes

/***
 * Checks if there is an entity of the specified type in the question.
 * @param entityType an entity type: Date, Location, Organization, Person, Percent, or Money
 * @param question
 * @return if a entity of that type is present returns the name of the type otherwise "No" + the name of the type 
 */
protected String recognizeEntity(String entityType, String question){
	String result = "No" + entityType;
	Annotation annotation = new Annotation(question);
	pipeline.annotate(annotation);
	List<CoreMap> sentences = annotation.get(SentencesAnnotation.class);
	for (CoreMap sentence : sentences)
	for (CoreLabel token: sentence.get(TokensAnnotation.class)) {
        String ne = token.get(NamedEntityTagAnnotation.class); 
        if(entityType.toUpperCase().equals(ne))
        	result = entityType;
       }
	return result;
}

Source File: CoreNlpTokenizer.java From jstarcraft-nlp with Apache License 2.0

5 votes

private boolean getNextSentence() {
    if (sentences == null)
        processInput();
    if (!sentences.hasNext())
        return false; // No more text
    tokens = sentences.next().get(TokensAnnotation.class).iterator();
    skippedTokens += SENTENCE_GAP;
    return true;
}

Source File: CoreNLPNamedEntityRecognizer.java From Heracles with GNU General Public License v3.0

5 votes

/**
	 * Process the Dataset in chunks, as defined by the <code>spanType</code> parameter.
	 * The Spans denoted by spanType must each contain Words belonging to a single sentence.
	 * 
	 */
	@Override
	public void validatedProcess(Dataset dataset, String spanTypeOfSentenceUnit){
		
		
		
		Properties prop1 = new Properties();
		prop1.setProperty("annotators", "ner");
		StanfordCoreNLP pipeline = new StanfordCoreNLP(prop1, false);
		
		for (Span span : dataset.getSpans(spanTypeOfSentenceUnit)){

			
			HashMap<Integer, Word> wordIndex = new HashMap<>();
			Annotation a = CoreNLPHelper.reconstructStanfordAnnotations(span, wordIndex);
			if (a == null){
				System.out.println(a);
			}
			pipeline.annotate(a);
			List<CoreMap> sentenceAnnotations = a.get(SentencesAnnotation.class);
			for (CoreMap sentence : sentenceAnnotations){
				for (CoreLabel token: sentence.get(TokensAnnotation.class)) {
					
					Word w = wordIndex.get(token.get(CharacterOffsetBeginAnnotation.class));
					String ner = token.get(NamedEntityTagAnnotation.class);
					String nerValue = token.get(NormalizedNamedEntityTagAnnotation.class);
					if (ner!=null)
						w.putAnnotation("nerLabel", ner);
					if (nerValue!=null)
						w.putAnnotation("nerValue", nerValue);
					
//					System.out.println(w.getAnnotations());
				}
			

				
				
			}
		}		
	}

Source File: CoreNLPTokenizer.java From Heracles with GNU General Public License v3.0

5 votes

/**
	 * Process the Dataset in chunks, as defined by the <code>spanType</code> parameter.
	 * The Spans denoted by spanType must provide an annotation of type "text".
	 * This spanType does not have to be textual unit.
	 */
	@Override
	public void validatedProcess(Dataset dataset, String spanTypeOfTextUnit){
		if (dataset.getPerformedNLPTasks().contains(getTask())){
			Framework.error("This dataset has already been tokenized.");
			return;
		}
		
		Properties prop1 = new Properties();
		prop1.setProperty("annotators", "tokenize");
		//prop1.setProperty("options", "splitHyphenated=true");
		StanfordCoreNLP pipeline = new StanfordCoreNLP(prop1, false);
		
		
		for (Span span : dataset.getSpans(spanTypeOfTextUnit)){
			Span textualUnit = span.getTextualUnit();
			String originalText = span.getAnnotation("text", String.class); 
			Annotation a = new Annotation(originalText);
			pipeline.annotate(a);
			List<CoreLabel> tokenAnnotations = a.get(TokensAnnotation.class);
			Word previousWord = null;
			if (!textualUnit.isEmpty())
				previousWord = textualUnit.last();
				
			for (CoreLabel token : tokenAnnotations){
				String word = token.get(OriginalTextAnnotation.class);
				int startOffset = token.get(CharacterOffsetBeginAnnotation.class);
//				int endOffset = token.get(CharacterOffsetEndAnnotation.class);
//				System.out.println(word + "\t" + startOffset + "\t" + endOffset);
				if (previousWord == null){
					previousWord = new Word(word, startOffset, textualUnit, dataset);
				} else {
					previousWord = new Word(word, startOffset, previousWord);
				}
				//and add the new word to the sentence span. If span=textualSpan than this has no effect
				if (!textualUnit.equals(span))
					span.add(previousWord);
			}
		}		
	}

Source File: CoreNLP.java From gAnswer with BSD 3-Clause "New" or "Revised" License

5 votes

public Word[] getTaggedWords (String sentence) {
	CoreMap taggedSentence = getPOS(sentence);
	Word[] ret = new Word[taggedSentence.get(TokensAnnotation.class).size()];
	int count = 0;
	for (CoreLabel token : taggedSentence.get(TokensAnnotation.class)) {
		// this is the text of the token
		String word = token.get(TextAnnotation.class);
		// this is the POS tag of the token
		String pos = token.get(PartOfSpeechAnnotation.class);
		//System.out.println(word+"["+pos+"]");
		ret[count] = new Word(getBaseFormOfPattern(word.toLowerCase()), word, pos, count+1);
		count ++;
	}
	return ret;
}

Source File: CoreNLP.java From gAnswer with BSD 3-Clause "New" or "Revised" License

5 votes

public String getBaseFormOfPattern (String text) {
	String ret = new String("");
	
    // create an empty Annotation just with the given text
    Annotation document = new Annotation(text);
    // run all Annotators on this text
    pipeline_lemma.annotate(document);


    // these are all the sentences in this document
    // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    
    int count = 0;
    for(CoreMap sentence: sentences) {
      // traversing the words in the current sentence
      // a CoreLabel is a CoreMap with additional token-specific methods
      for (CoreLabel token: sentence.get(TokensAnnotation.class)) {
        // this is the base form (lemma) of the token
        String lemma = token.getString(LemmaAnnotation.class);
        ret += lemma;
        ret += " ";
      }
      count ++;
      if (count % 100 == 0) {
    	  System.out.println(count);
      }
    }
    
    return ret.substring(0, ret.length()-1);
}

Source File: Chapter8.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License

5 votes

private static void usingStanfordPipelineParallel() {
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
    String path = "C:\\Current Books\\NLP and Java\\Downloads\\stanford-ner-2014-10-26\\classifiers";
    props.put("ner.model", path + "/english.muc.7class.distsim.crf.ser.gz");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation annotation1 = new Annotation("The robber took the cash and ran.");
    Annotation annotation2 = new Annotation("The policeman chased him down the street.");
    Annotation annotation3 = new Annotation("A passerby, watching the action, tripped the thief as he passed by.");
    Annotation annotation4 = new Annotation("They all lived happily everafter, except for the thief of course.");
    ArrayList<Annotation> list = new ArrayList();
    list.add(annotation1);
    list.add(annotation2);
    list.add(annotation3);
    list.add(annotation4);
    Iterable<Annotation> iterable = list;

    pipeline.annotate(iterable);

    System.out.println("Total time: " + pipeline.timingInformation());
    List<CoreMap> sentences = annotation2.get(SentencesAnnotation.class);

    for (CoreMap sentence : sentences) {
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            String word = token.get(TextAnnotation.class);
            String pos = token.get(PartOfSpeechAnnotation.class);
            System.out.println("Word: " + word + " POS Tag: " + pos);
        }
    }
}

Source File: CoreNLPHelper.java From Heracles with GNU General Public License v3.0

4 votes

public static Annotation reconstructStanfordAnnotations(Span sentenceSpan, HashMap<Integer, Word> wordIndex, boolean useWordOrderInsteadOfOffset){
		String originalText = sentenceSpan.getAnnotation("text", String.class); 
		Annotation a = new Annotation(originalText);
		a.set(TextAnnotation.class, originalText);
		
		//a.set(DocIDAnnotation.class, "document");
		
		List<CoreMap> sentenceAnnotations = new ArrayList<CoreMap>();
		a.set(SentencesAnnotation.class, sentenceAnnotations);
		List<CoreLabel> tokenAnnotations = new ArrayList<CoreLabel>();
		a.set(TokensAnnotation.class, tokenAnnotations);
		
		ArrayCoreMap sentenceAnnotation = new ArrayCoreMap();
		sentenceAnnotations.add(sentenceAnnotation);
		
//		int startOffset = sentenceSpan.first().getStartOffset();
		
		for (Word w : sentenceSpan){
			CoreLabel c = new CoreLabel();
			c.set(TextAnnotation.class, w.getWord());
			c.set(OriginalTextAnnotation.class, w.getWord());
			c.set(ValueAnnotation.class, w.getWord());
			c.set(CharacterOffsetBeginAnnotation.class, w.getStartOffset());
			c.set(CharacterOffsetEndAnnotation.class, w.getEndOffset());
			
			
			c.set(IndexAnnotation.class, w.getOrder()+1);
//			c.setIndex(w.getOrder());
			
			c.set(SentenceIndexAnnotation.class, 0);
//			c.setSentIndex(0);
			
			c.set(DocIDAnnotation.class, "document");
			c.setDocID("document");
			
			if (w.hasAnnotation("pos"))
				c.set(PartOfSpeechAnnotation.class, w.getAnnotation("pos",String.class));
			
			if (w.hasAnnotation("lemma"))
				c.set(LemmaAnnotation.class, w.getAnnotation("lemma", String.class));
			
			if (w.hasAnnotation("nerLabel"))
				c.set(NamedEntityTagAnnotation.class, w.getAnnotation("nerLabel", String.class));
			
			if (w.hasAnnotation("nerValue"))
				c.set(NormalizedNamedEntityTagAnnotation.class, w.getAnnotation("nerValue", String.class));
			
			tokenAnnotations.add(c);
			if (useWordOrderInsteadOfOffset){
				wordIndex.put(w.getOrder(), w);
			} else {
				wordIndex.put(w.getStartOffset(), w);
			}
		}
		//essential sentence annotation: TokensAnnotation
		sentenceAnnotation.set(TokensAnnotation.class, tokenAnnotations);
		//essential sentence annotation: TextAnnotation
		sentenceAnnotation.set(TextAnnotation.class, originalText);
		//essential sentence annotation: SentenceIndexAnnotation
		sentenceAnnotation.set(SentenceIndexAnnotation.class, 0);
		
		sentenceAnnotation.set(CharacterOffsetBeginAnnotation.class, 0);
		sentenceAnnotation.set(CharacterOffsetEndAnnotation.class, sentenceSpan.last().getEndOffset());
		sentenceAnnotation.set(TokenBeginAnnotation.class, 0);
		sentenceAnnotation.set(TokenEndAnnotation.class, sentenceSpan.last().getOrder());
		
		return a;
	}

Source File: CoreNLPPosTagger.java From Heracles with GNU General Public License v3.0

4 votes

/**
	 * Process the Dataset in chunks, as defined by the <code>spanType</code> parameter.
	 * The Spans denoted by spanType must each contain Words belonging to a single sentence.
	 * 
	 */
	@Override
	public void validatedProcess(Dataset dataset, String spanTypeOfSentenceUnit){
//		if (dataset.getPerformedNLPTasks().contains(getTask())){
//			Framework.error("This dataset has already been tagged with POS.");
//			return;
//		}
		//check if prerequisites are satisfied
		if (!dataset.getPerformedNLPTasks().containsAll(prerequisites)){
			HashSet<NLPTask> missingTasks = new HashSet<>();
			missingTasks.addAll(prerequisites);
			missingTasks.removeAll(dataset.getPerformedNLPTasks());
			Framework.error("This dataset does not meet the requirements to use this component! Missing tasks: " + missingTasks);
			return;
		}
		
		Properties prop1 = new Properties();
		prop1.setProperty("annotators", "pos");
		StanfordCoreNLP pipeline = new StanfordCoreNLP(prop1, false);
		
		for (Span span : dataset.getSpans(spanTypeOfSentenceUnit)){

			
			HashMap<Integer, Word> wordIndex = new HashMap<>();
			Annotation a = CoreNLPHelper.reconstructStanfordAnnotations(span, wordIndex);
			if (a == null){
				System.out.println(a);
			}
			pipeline.annotate(a);
			List<CoreMap> sentenceAnnotations = a.get(SentencesAnnotation.class);
			for (CoreMap sentence : sentenceAnnotations){
				for (CoreLabel token: sentence.get(TokensAnnotation.class)) {
					
					Word w = wordIndex.get(token.get(CharacterOffsetBeginAnnotation.class));
					String tempPos = token.get(PartOfSpeechAnnotation.class);
					if (w.hasAnnotation("URI")){
						w.putAnnotation("pos", "NNP");
					} else {
						w.putAnnotation("pos", tempPos);
					}
//					System.out.println(w.getAnnotations());
				}
			

				
				
			}
		}		
	}

Source File: WiseOwlStanfordFilter.java From wiseowl with MIT License

4 votes

public Iterator findTokens() throws IOException
{
	/*char[] c = new char[256];
    int sz = 0;
    StringBuilder b = new StringBuilder();
    
    while ((sz = input.read(c)) >= 0) {
      b.append(c, 0, sz);
    }*/
    //String text = b.toString();
	if (!input.incrementToken()) return null;
    String text;
    text = input.getAttribute(CharTermAttribute.class).toString();
	// read some text in the text variable
	//System.out.println("before annotation");
	Annotation document = new Annotation(text);
	// these are all the sentences in this document
	// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
	pipeline.annotate(document);
	List<CoreMap> timexAnnsAll = document.get(TimeAnnotations.TimexAnnotations.class);
    for (CoreMap cm : timexAnnsAll) {
    List<CoreLabel> tokens = cm.get(CoreAnnotations.TokensAnnotation.class);
    TimeData td=new TimeData();
    td.setTime(cm.get(TimeExpression.Annotation.class).getTemporal().toString());
    td.setStart(tokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
    td.setEnd(tokens.get(tokens.size() - 1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
    timeQueue.add(td);
    }
	List<CoreMap> sentences = document.get(SentencesAnnotation.class);
	
	//System.out.println("after annotation and sentence getting"+sentences.size());
	for(CoreMap sentence: sentences) {
	  // traversing the words in the current sentence
	  // a CoreLabel is a CoreMap with additional token-specific methods
	  for (CoreLabel token: sentence.get(TokensAnnotation.class)) {
	    // this is the text of the token
		  System.out.println("in token");
	    String word = token.get(TextAnnotation.class);
	    // this is the POS tag of the token
	    String pos = token.get(PartOfSpeechAnnotation.class);
	    // this is the NER label of the token
	    String ne = token.get(NamedEntityTagAnnotation.class);
	   // System.out.println("word : "+word+" pos: "+pos+" ner: "+ne);
	    TokenData tok=new TokenData();
	    tok.setNER(ne);
	    tok.setToken(word);
	    tok.setPOS(pos);
	    tokenQueue.add(tok);
	  }

	}
	Iterator<TokenData> it=tokenQueue.iterator();
	itr_cpy=tokenQueue.iterator();
	tokenOffset=0;
	start=0;
	end=0;
	return it;
}

Source File: WiseOwlStanfordFilter.java From wiseowl with MIT License

4 votes

public static void main(String args[])
{
	Properties props = new Properties();
	props.setProperty("annotators", "tokenize, cleanxml, ssplit,pos,lemma,ner");
	
	StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
	pipeline.addAnnotator(new TimeAnnotator("sutime", props));
	String text = "<mydata> refeer</mydata>today is 12 jan 2016. what is tommorow? Who is Avtar? Does he work at Apple or Google? Sumit was born on 13 feb,2011.";

	Annotation document = new Annotation(text);
	pipeline.annotate(document);
    System.out.println(document.get(CoreAnnotations.TextAnnotation.class));
    List<CoreMap> timexAnnsAll = document.get(TimeAnnotations.TimexAnnotations.class);
    for (CoreMap cm : timexAnnsAll) {
    List<CoreLabel> tokens = cm.get(CoreAnnotations.TokensAnnotation.class);
    TimeData td=new TimeData();
    td.setTime(cm.get(TimeExpression.Annotation.class).getTemporal().toISOString());
    td.setStart(tokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
    td.setEnd(tokens.get(tokens.size() - 1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
    }
 
	List<CoreMap> sentences = document.get(SentencesAnnotation.class);
	for(CoreMap sentence: sentences) {
	  // traversing the words in the current sentence
	  // a CoreLabel is a CoreMap with additional token-specific methods
		System.out.println("in sent");
	  for (CoreLabel token: sentence.get(TokensAnnotation.class)) {
	    // this is the text of the token
		  System.out.println("in token");
	    String word = token.get(TextAnnotation.class);
	    // this is the POS tag of the token
	    String pos = token.get(PartOfSpeechAnnotation.class);
	    // this is the NER label of the token
	    String ne = token.get(NamedEntityTagAnnotation.class);
	    System.out.println("word : "+word+" pos: "+pos+" ner: "+ne);
	    
	  }

	}

}

Source File: CoreNLPToJSON.java From phrasal with GNU General Public License v3.0

4 votes

/**
 * Process an English text file.
 * 
 * @param args
 * @throws IOException 
 */
public static void main(String[] args) throws IOException {
  if (args.length < 1) {
    System.err.printf("Usage: java %s file [inputproperties_str] > json_output%n", CoreNLPToJSON.class.getName());
    System.exit(-1);
  }
  String textFile = args[0];
  InputProperties inputProperties = args.length > 1 ? InputProperties.fromString(args[1]) : new InputProperties();

  StanfordCoreNLP coreNLP = new StanfordCoreNLP(properties);
  
  // Configure tokenizer
  EnglishPreprocessor preprocessor = new EnglishPreprocessor(true);
  
  // Use a map with ordered keys so that the output is ordered by segmentId.
  Map<Integer,SourceSegment> annotations = new TreeMap<Integer,SourceSegment>();
  LineNumberReader reader = IOTools.getReaderFromFile(textFile);
  for (String line; (line = reader.readLine()) != null;) {
    Annotation annotation = coreNLP.process(line);
    List<CoreMap> sentences = annotation.get(SentencesAnnotation.class);
    if (sentences.size() != 1) {
      throw new RuntimeException("Sentence splitting on line: " + String.valueOf(reader.getLineNumber()));
    }
    CoreMap sentence = sentences.get(0);
    Tree tree = sentence.get(TreeAnnotation.class);
    tree.indexLeaves();
    int[] chunkVector = getChunkVector(tree);
    List<CoreLabel> tokens = sentence.get(TokensAnnotation.class);
    int numTokens = tokens.size();
    SymmetricalWordAlignment alignment = preprocessor.processAndAlign(line);
    if (alignment.e().size() != numTokens) {
      throw new RuntimeException(String.format("Tokenizer configurations differ: %d/%d", alignment.e().size(), numTokens));
    }
    SourceSegment segment = new SourceSegment(numTokens);
    segment.layoutSpec.addAll(makeLayoutSpec(alignment));
    segment.inputProperties = inputProperties.toString();
    for (int j = 0; j < numTokens; ++j) {
      CoreLabel token = tokens.get(j);
      String word = token.get(TextAnnotation.class);
      segment.tokens.add(unescape(word));
      String pos = mapPOS(token.get(PartOfSpeechAnnotation.class));
      segment.pos.add(pos);
      String ne = token.get(NamedEntityTagAnnotation.class);
      segment.ner.add(ne);
      segment.chunkVector[j] = chunkVector[j];
    }
    annotations.put(reader.getLineNumber()-1, segment);
  }
  reader.close();
  System.err.printf("Processed %d sentences%n", reader.getLineNumber());
  
  final SourceDocument jsonDocument = new SourceDocument(textFile, annotations);
  
  // Convert to json
  Gson gson = new Gson();
  String json = gson.toJson(jsonDocument);
  System.out.println(json);
}

Source File: FramatDemo.java From mateplus with GNU General Public License v2.0

4 votes

private Sentence preparse(CoreMap sentence) {
	StringBuffer posOutput = new StringBuffer();
	for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
		if (posOutput.length() > 0) {
			posOutput.append(" ");
		}
		posOutput.append(token.word());
		posOutput.append("_");
		posOutput.append(token.tag());
	}
	
	String parse = ExternalProcesses.runProcess(mstparser, posOutput.toString());
	parse = parse.replaceAll("-\t-", "_\t_\n@#").replaceAll("@#\t", "")
			.replaceAll("@#", "");

	String[] lines = parse.split("\n");
	String[] words = new String[lines.length + 1];
	String[] lemmas = new String[lines.length + 1];
	String[] tags = new String[lines.length + 1];
	String[] morphs = new String[lines.length + 1];
	int[] heads = new int[lines.length];
	String[] deprels = new String[lines.length];

	for (int i = 1; i < words.length; i++) {
		String[] parts = lines[i - 1].split("\t");
		words[i] = sentence.get(TokensAnnotation.class).get(i - 1)
				.word();
		tags[i] = sentence.get(TokensAnnotation.class).get(i - 1).tag();
		lemmas[i] = sentence.get(TokensAnnotation.class).get(i - 1)
				.lemma();
		morphs[i] = "_";
		heads[i - 1] = Integer.parseInt(parts[6]);
		deprels[i - 1] = parts[7];
	}
	Sentence s = new Sentence(words, lemmas, tags, morphs);
	s.setHeadsAndDeprels(heads, deprels);

	/* add labeled predicates from SEMAFOR */
	String json = ExternalProcesses.runProcess(semafor, parse);
	Pattern pred_frame = Pattern
			.compile("\\{\"target\":\\{\"name\":\"([A-Za-z_]*)\",\"spans\":\\[\\{\"start\":([0-9]*),\"");
	Matcher m = pred_frame.matcher(json);
	while (m.find()) {
		String frame = m.group(1);
		int index = Integer.parseInt(m.group(2));
		System.out.println(index + "\t" + frame);

		s.makePredicate(index + 1);
		((Predicate) s.get(index + 1)).setSense(frame);
	}
	return s;
}

edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation Java Examples