edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation Java Examples
The following examples show how to use
edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation.
Each example notes its original project and source file above the code.
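All of these examples follow the same basic pattern: build a StanfordCoreNLP pipeline with at least the tokenize and ssplit annotators, annotate an Annotation (the document), and then read the sentence list back with SentencesAnnotation. Here is a minimal, self-contained sketch of that pattern (the class name and input string are placeholders; it assumes Stanford CoreNLP and its English models are on the classpath):

import java.util.List;
import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class SentencesAnnotationDemo {
    public static void main(String[] args) {
        // tokenize + ssplit are the minimum annotators needed to populate SentencesAnnotation
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        // create an empty Annotation just with the given text, then run the pipeline on it
        Annotation document = new Annotation("This is the first sentence. Here is the second.");
        pipeline.annotate(document);

        // SentencesAnnotation keys a List<CoreMap>, one CoreMap per detected sentence
        List<CoreMap> sentences = document.get(SentencesAnnotation.class);
        for (CoreMap sentence : sentences) {
            System.out.println(sentence.get(TextAnnotation.class));
        }
    }
}

The examples below extend this skeleton with further annotators (pos, lemma, ner, parse, depparse, coref, sentiment) and read the corresponding per-sentence or per-token annotations off each CoreMap.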
Example #1
Source File: Chapter5.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License
private static void usingStanfordPOSTagger() {
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, pos");
    props.put("pos.model", "C:\\Current Books in Progress\\NLP and Java\\Models\\english-caseless-left3words-distsim.tagger");
    // Properties values must be Strings, or CoreNLP's getProperty lookup will not see them
    props.put("pos.maxlen", "10");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation document = new Annotation(theSentence);
    pipeline.annotate(document);

    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            String word = token.get(TextAnnotation.class);
            String pos = token.get(PartOfSpeechAnnotation.class);
            System.out.print(word + "/" + pos + " ");
        }
        System.out.println();
    }
    // print the whole document once, rather than once per sentence
    try {
        pipeline.xmlPrint(document, System.out);
        pipeline.prettyPrint(document, System.out);
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}
Example #2
Source File: Trees.java From uncc2014watsonsim with GNU General Public License v2.0
public static List<CoreMap> parse(String text) {
    // create an empty Annotation just with the given text
    Annotation document = new Annotation(text);
    // run all Annotators on this text
    pipeline.annotate(document);
    // these are all the sentences in this document
    // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    List<Tree> trees = new ArrayList<>();
    for (CoreMap sentence : sentences) {
        // this is the parse tree of the current sentence
        Tree t = sentence.get(TreeAnnotation.class);
        SemanticGraph graph = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);
        trees.add(t);
    }
    // note: the collected trees and dependency graphs go unused; the method returns the raw sentences
    return sentences;
}
Example #3
Source File: CoreNLP.java From gAnswer with BSD 3-Clause "New" or "Revised" License
public SemanticGraph getBasicDependencies(String s) {
    // create an empty Annotation just with the given text
    Annotation document = new Annotation(s);
    // run all Annotators on this text
    pipeline_lemma.annotate(document);
    // these are all the sentences in this document
    // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        // this is the Stanford dependency graph of the current sentence
        SemanticGraph dependencies = sentence.get(BasicDependenciesAnnotation.class);
        return dependencies;
    }
    return null;
}
Example #4
Source File: CoreNLP.java From gAnswer with BSD 3-Clause "New" or "Revised" License
public Tree getParseTree(String text) {
    // create an empty Annotation just with the given text
    Annotation document = new Annotation(text);
    // run all Annotators on this text
    pipeline_lemma.annotate(document);
    // these are all the sentences in this document
    // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        // this is the parse tree of the current sentence
        return sentence.get(TreeAnnotation.class);
    }
    return null;
}
Example #5
Source File: CoreNLP.java From gAnswer with BSD 3-Clause "New" or "Revised" License
/**
 * How to use:
 * for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
 *     // this is the text of the token
 *     String word = token.get(TextAnnotation.class);
 *     // this is the POS tag of the token
 *     String pos = token.get(PartOfSpeechAnnotation.class);
 * }
 * @param s
 * @return
 */
public CoreMap getPOS(String s) {
    // create an empty Annotation just with the given text
    Annotation document = new Annotation(s);
    // run all Annotators on this text
    pipeline_lemma.annotate(document);
    // these are all the sentences in this document
    // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        // this is the sentence with POS Tags
        return sentence;
    }
    return null;
}
Example #6
Source File: ReconTool.java From Criteria2Query with Apache License 2.0
public boolean isCEE(String text) {
    text = text.replace("/", " / ");
    Annotation annotation = new Annotation(text);
    pipeline.annotate(annotation);
    List<CoreMap> sentences = annotation.get(SentencesAnnotation.class);
    boolean flag = false;
    for (CoreMap sentence : sentences) {
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            String word = token.get(TextAnnotation.class);
            String pos = token.get(PartOfSpeechAnnotation.class);
            //String lemma = token.get(LemmaAnnotation.class);
            if (word.equals("and") || word.equals(",") || word.equals("/") || word.equals("or")) {
                flag = true;
                break;
            }
        }
    }
    return flag;
}
Example #7
Source File: CorefTool.java From Criteria2Query with Apache License 2.0
public void extractCoref() {
    String s = "Subjects with hypothyroidism who are on stable treatment for 3 months prior to screening are required to have TSH and free thyroxine (FT4) obtained. If the TSH value is out of range, but FT4 is normal, such cases should be discussed directly with the JRD responsible safety physician before the subject is enrolled. If the FT4 value is out of range, the subject is not eligible.";
    Annotation document = new Annotation(s);
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,parse,mention,coref");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    pipeline.annotate(document);
    System.out.println("---");
    System.out.println("coref chains");
    for (CorefChain cc : document.get(CorefCoreAnnotations.CorefChainAnnotation.class).values()) {
        System.out.println("\t" + cc);
    }
    for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
        System.out.println("---");
        System.out.println("mentions");
        for (Mention m : sentence.get(CorefCoreAnnotations.CorefMentionsAnnotation.class)) {
            System.out.println("\t" + m);
        }
    }
}
Example #8
Source File: CoreNLPParser.java From Heracles with GNU General Public License v3.0
@Override
public void validatedProcess(Dataset dataset, String spanTypeOfSentenceUnit) {
    Properties prop1 = new Properties();
    prop1.setProperty("annotators", "parse");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(prop1, false);

    for (Span span : dataset.getSpans(spanTypeOfSentenceUnit)) {
        HashMap<Integer, Word> wordIndex = new HashMap<>();
        Annotation a = CoreNLPHelper.reconstructStanfordAnnotations(span, wordIndex);
        // Annotation a = new Annotation((String)span.getAnnotations().get("text"));
        if (a == null) {
            System.out.println(a);
        }
        pipeline.annotate(a);
        for (CoreMap sentence : a.get(SentencesAnnotation.class)) {
            // per sentence, check the syntax tree
            Tree tree = sentence.get(TreeAnnotation.class);
            // tree.percolateHeadAnnotations(headFinder);
            // tree.indentedListPrint();
            try {
                analyzeTree(tree, span, wordIndex);
            } catch (IllegalSpanException e) {
                e.printStackTrace();
            }
        }
    }
}
Example #9
Source File: Stemming.java From AGDISTIS with GNU Affero General Public License v3.0
public String stemming(String documentText) {
    List<String> lemmas = new LinkedList<String>();
    String label = null;
    LancasterStemmer stem = new LancasterStemmer();

    // Create an empty Annotation just with the given text
    Annotation document = new Annotation(documentText);
    // run all Annotators on this text
    this.pipeline.annotate(document);
    // Iterate over all of the sentences found
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        // Iterate over all tokens in a sentence
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            // Retrieve and add the stemmed lemma for each word into the list of lemmas
            // lemmas.add(token.get(LemmaAnnotation.class));
            // lemmas.add(morpho.stem(token.word()));
            lemmas.add(stem.stem(token.get(LemmaAnnotation.class)));
        }
    }
    label = lemmas.toString();
    Pattern p = Pattern.compile("[,.;!?(){}\\[\\]<>%]");
    label = p.matcher(label).replaceAll("");
    return label;
}
Example #10
Source File: Phrase.java From uncc2014watsonsim with GNU General Public License v2.0
private static List<CoreMap> _sentences(Phrase p) {
    return Optional.ofNullable(
            p.memo(Phrase.coreNLP)
             .get(SentencesAnnotation.class))
        .orElse(Collections.emptyList());
}
Example #11
Source File: JsonPipeline.java From tac2015-event-detection with GNU General Public License v3.0
/** runs the corenlp pipeline with all options, and returns all results as a JSON object. */
@SuppressWarnings({ "rawtypes", "unchecked" })
JsonNode processTextDocument(String doctext) {
    if (startMilli == -1) startMilli = System.currentTimeMillis();
    numDocs++;
    numChars += doctext.length();

    Annotation document = new Annotation(doctext);
    pipeline.annotate(document);

    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    List<Map> outSentences = Lists.newArrayList();
    for (CoreMap sentence : sentences) {
        Map<String, Object> sent_info = Maps.newHashMap();
        addTokenBasics(sent_info, sentence);
        numTokens += ((List) sent_info.get("tokens")).size();
        for (String annotator : annotators()) {
            addAnnoToSentenceObject(sent_info, sentence, annotator);
        }
        outSentences.add(sent_info);
    }

    ImmutableMap.Builder b = new ImmutableMap.Builder();
    // b.put("text", doctext);
    b.put("sentences", outSentences);
    if (Lists.newArrayList(annotators()).contains("dcoref")) {
        List outCoref = getCorefInfo(document);
        b.put("entities", outCoref);
    }
    Map outDoc = b.build();
    return JsonUtil.toJson(outDoc);
}
Example #12
Source File: Entity.java From NLIWOD with GNU Affero General Public License v3.0
/***
 * Checks if there is an entity of the specified type in the question.
 * @param entityType an entity type: Date, Location, Organization, Person, Percent, or Money
 * @param question
 * @return if an entity of that type is present, returns the name of the type; otherwise "No" + the name of the type
 */
protected String recognizeEntity(String entityType, String question) {
    String result = "No" + entityType;
    Annotation annotation = new Annotation(question);
    pipeline.annotate(annotation);
    List<CoreMap> sentences = annotation.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            String ne = token.get(NamedEntityTagAnnotation.class);
            if (entityType.toUpperCase().equals(ne)) {
                result = entityType;
            }
        }
    }
    return result;
}
Example #13
Source File: CoreNLPLemmatizer.java From Heracles with GNU General Public License v3.0
/**
 * Process the Dataset in chunks, as defined by the <code>spanType</code> parameter.
 * The Spans denoted by spanType must each contain Words belonging to a single sentence.
 */
@Override
public void validatedProcess(Dataset dataset, String spanTypeOfSentenceUnit) {
    Properties prop1 = new Properties();
    prop1.setProperty("annotators", "lemma");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(prop1, false);

    for (Span span : dataset.getSpans(spanTypeOfSentenceUnit)) {
        HashMap<Integer, Word> wordIndex = new HashMap<>();
        Annotation a = CoreNLPHelper.reconstructStanfordAnnotations(span, wordIndex);
        if (a == null) {
            System.out.println(a);
        }
        pipeline.annotate(a);
        List<CoreMap> sentenceAnnotations = a.get(SentencesAnnotation.class);
        for (CoreMap sentence : sentenceAnnotations) {
            for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
                Word w = wordIndex.get(token.get(CharacterOffsetBeginAnnotation.class));
                String tempLemma = token.get(LemmaAnnotation.class);
                w.putAnnotation("lemma", tempLemma.toLowerCase());
                // System.out.println(w.getAnnotations());
            }
        }
    }
}
Example #14
Source File: CoreNLPNamedEntityRecognizer.java From Heracles with GNU General Public License v3.0
/**
 * Process the Dataset in chunks, as defined by the <code>spanType</code> parameter.
 * The Spans denoted by spanType must each contain Words belonging to a single sentence.
 */
@Override
public void validatedProcess(Dataset dataset, String spanTypeOfSentenceUnit) {
    Properties prop1 = new Properties();
    prop1.setProperty("annotators", "ner");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(prop1, false);

    for (Span span : dataset.getSpans(spanTypeOfSentenceUnit)) {
        HashMap<Integer, Word> wordIndex = new HashMap<>();
        Annotation a = CoreNLPHelper.reconstructStanfordAnnotations(span, wordIndex);
        if (a == null) {
            System.out.println(a);
        }
        pipeline.annotate(a);
        List<CoreMap> sentenceAnnotations = a.get(SentencesAnnotation.class);
        for (CoreMap sentence : sentenceAnnotations) {
            for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
                Word w = wordIndex.get(token.get(CharacterOffsetBeginAnnotation.class));
                String ner = token.get(NamedEntityTagAnnotation.class);
                String nerValue = token.get(NormalizedNamedEntityTagAnnotation.class);
                if (ner != null)
                    w.putAnnotation("nerLabel", ner);
                if (nerValue != null)
                    w.putAnnotation("nerValue", nerValue);
                // System.out.println(w.getAnnotations());
            }
        }
    }
}
Example #15
Source File: CoreNLPUtils.java From minie with GNU General Public License v3.0
/**
 * Given a CoreNLP pipeline and an input sentence, generate dependency parse for the sentence and return
 * the SemanticGraph object as a result
 * @param pipeline - CoreNLP pipeline
 * @param snt - input sentence
 * @return dependency parse in SemanticGraph object
 */
public static SemanticGraph parse(StanfordCoreNLP pipeline, String snt) {
    Annotation document = new Annotation(snt);
    pipeline.annotate(document);

    // A CoreMap is a sentence with annotations
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    SemanticGraph semanticGraph = null;
    for (CoreMap sentence : sentences) {
        semanticGraph = sentence.get(BasicDependenciesAnnotation.class);
    }
    return semanticGraphUniversalEnglishToEnglish(semanticGraph);
}
Example #16
Source File: CoreNLP.java From gAnswer with BSD 3-Clause "New" or "Revised" License
public String getBaseFormOfPattern(String text) {
    String ret = "";
    // create an empty Annotation just with the given text
    Annotation document = new Annotation(text);
    // run all Annotators on this text
    pipeline_lemma.annotate(document);
    // these are all the sentences in this document
    // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    int count = 0;
    for (CoreMap sentence : sentences) {
        // traversing the words in the current sentence
        // a CoreLabel is a CoreMap with additional token-specific methods
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            // this is the base form (lemma) of the token
            String lemma = token.getString(LemmaAnnotation.class);
            ret += lemma;
            ret += " ";
        }
        count++;
        if (count % 100 == 0) {
            System.out.println(count);
        }
    }
    // trim the trailing space; guard against empty input
    return ret.isEmpty() ? ret : ret.substring(0, ret.length() - 1);
}
Example #17
Source File: Chapter8.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License
private static void usingStanfordPipelineParallel() {
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
    String path = "C:\\Current Books\\NLP and Java\\Downloads\\stanford-ner-2014-10-26\\classifiers";
    props.put("ner.model", path + "/english.muc.7class.distsim.crf.ser.gz");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation annotation1 = new Annotation("The robber took the cash and ran.");
    Annotation annotation2 = new Annotation("The policeman chased him down the street.");
    Annotation annotation3 = new Annotation("A passerby, watching the action, tripped the thief as he passed by.");
    Annotation annotation4 = new Annotation("They all lived happily everafter, except for the thief of course.");
    ArrayList<Annotation> list = new ArrayList<>();
    list.add(annotation1);
    list.add(annotation2);
    list.add(annotation3);
    list.add(annotation4);
    Iterable<Annotation> iterable = list;

    pipeline.annotate(iterable);
    System.out.println("Total time: " + pipeline.timingInformation());

    List<CoreMap> sentences = annotation2.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            String word = token.get(TextAnnotation.class);
            String pos = token.get(PartOfSpeechAnnotation.class);
            System.out.println("Word: " + word + " POS Tag: " + pos);
        }
    }
}
Example #18
Source File: CoreNLPDependencyParser.java From Heracles with GNU General Public License v3.0
@Override
public void validatedProcess(Dataset dataset, String spanTypeOfSentenceUnit) {
    Properties prop1 = new Properties();
    prop1.setProperty("annotators", "depparse");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(prop1, false);

    for (Span span : dataset.getSpans(spanTypeOfSentenceUnit)) {
        HashMap<Integer, Word> wordIndex = new HashMap<>();
        Annotation a = CoreNLPHelper.reconstructStanfordAnnotations(span, wordIndex);
        pipeline.annotate(a);
        for (CoreMap sentence : a.get(SentencesAnnotation.class)) {
            // per sentence, get the dependencies
            SemanticGraph dependencies = sentence.get(EnhancedPlusPlusDependenciesAnnotation.class);
            for (TypedDependency td : dependencies.typedDependencies()) {
                String relationType = td.reln().getLongName();
                Word dep = wordIndex.get(td.dep().beginPosition());
                DataEntity gov = wordIndex.get(td.gov().beginPosition());
                if (gov == null) {
                    // this is the root, link to sentence
                    gov = span;
                }
                if (dep == null || gov == null) {
                    Framework.debug(td.toString());
                    Framework.debug(td.dep().beginPosition() + "\t" + td.gov().beginPosition());
                    Framework.debug(wordIndex.toString());
                }
                Relation rel = new Relation("deps", gov, dep);
                rel.putAnnotation("relationLongName", td.reln().getLongName());
                if (td.reln().getParent() != null)
                    rel.putAnnotation("relationParentShortName", td.reln().getParent().getShortName());
                rel.putAnnotation("relationShortName", td.reln().getShortName());
                // rel.putAnnotation("relationSpecific", td.reln().getSpecific());
                dep.getRelations().addRelationToParent(rel);
                gov.getRelations().addRelationToChild(rel);
            }
            // dependencies.prettyPrint();
        }
    }
}
Example #19
Source File: CoreNLPSentimentAnnotator.java From Heracles with GNU General Public License v3.0
@Override
public void validatedProcess(Dataset dataset, String spanTypeOfSentenceUnit) {
    Properties prop1 = new Properties();
    prop1.setProperty("annotators", "parse sentiment");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(prop1, false);

    for (Span span : dataset.getSpans(spanTypeOfSentenceUnit)) {
        HashMap<Integer, Word> wordIndex = new HashMap<>();
        Annotation a = CoreNLPHelper.reconstructStanfordAnnotations(span, wordIndex, true);
        pipeline.annotate(a);
        for (CoreMap sentence : a.get(SentencesAnnotation.class)) {
            Tree sentimentTree = sentence.get(SentimentAnnotatedTree.class);
            sentimentTree.setSpans();
            sentimentTree.indexLeaves();
            sentimentTree.indexSpans();
            sentimentTree.percolateHeadIndices();

            SimpleMatrix sm = RNNCoreAnnotations.getPredictions(sentimentTree);
            assignSentiment(span, sm, "phraseSentiment");

            try {
                analyzeTree(sentimentTree, span, wordIndex, 0);
            } catch (IllegalSpanException e) {
                e.printStackTrace();
            }
        }
    }
}
Example #20
Source File: CoreNlpTokenizer.java From jstarcraft-nlp with Apache License 2.0
private boolean processInput() {
    Annotation annotation = new Annotation(IOUtils.slurpReader(input));
    pipeline.annotate(annotation);
    sentences = annotation.get(SentencesAnnotation.class).iterator();
    return true;
}
Example #21
Source File: WiseOwlStanfordFilter.java From wiseowl with MIT License
public Iterator findTokens() throws IOException {
    /*char[] c = new char[256];
    int sz = 0;
    StringBuilder b = new StringBuilder();
    while ((sz = input.read(c)) >= 0) {
        b.append(c, 0, sz);
    }*/
    //String text = b.toString();
    if (!input.incrementToken())
        return null;
    String text;
    text = input.getAttribute(CharTermAttribute.class).toString();
    // read some text in the text variable
    Annotation document = new Annotation(text);
    // these are all the sentences in this document
    // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
    pipeline.annotate(document);
    List<CoreMap> timexAnnsAll = document.get(TimeAnnotations.TimexAnnotations.class);
    for (CoreMap cm : timexAnnsAll) {
        List<CoreLabel> tokens = cm.get(CoreAnnotations.TokensAnnotation.class);
        TimeData td = new TimeData();
        td.setTime(cm.get(TimeExpression.Annotation.class).getTemporal().toString());
        td.setStart(tokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
        td.setEnd(tokens.get(tokens.size() - 1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
        timeQueue.add(td);
    }
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        // traversing the words in the current sentence
        // a CoreLabel is a CoreMap with additional token-specific methods
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            // this is the text of the token
            String word = token.get(TextAnnotation.class);
            // this is the POS tag of the token
            String pos = token.get(PartOfSpeechAnnotation.class);
            // this is the NER label of the token
            String ne = token.get(NamedEntityTagAnnotation.class);
            TokenData tok = new TokenData();
            tok.setNER(ne);
            tok.setToken(word);
            tok.setPOS(pos);
            tokenQueue.add(tok);
        }
    }
    Iterator<TokenData> it = tokenQueue.iterator();
    itr_cpy = tokenQueue.iterator();
    tokenOffset = 0;
    start = 0;
    end = 0;
    return it;
}
Example #22
Source File: CoreNLPHelper.java From Heracles with GNU General Public License v3.0
public static Annotation reconstructStanfordAnnotations(Span sentenceSpan, HashMap<Integer, Word> wordIndex, boolean useWordOrderInsteadOfOffset) {
    String originalText = sentenceSpan.getAnnotation("text", String.class);
    Annotation a = new Annotation(originalText);
    a.set(TextAnnotation.class, originalText);
    //a.set(DocIDAnnotation.class, "document");

    List<CoreMap> sentenceAnnotations = new ArrayList<CoreMap>();
    a.set(SentencesAnnotation.class, sentenceAnnotations);
    List<CoreLabel> tokenAnnotations = new ArrayList<CoreLabel>();
    a.set(TokensAnnotation.class, tokenAnnotations);

    ArrayCoreMap sentenceAnnotation = new ArrayCoreMap();
    sentenceAnnotations.add(sentenceAnnotation);

    // int startOffset = sentenceSpan.first().getStartOffset();

    for (Word w : sentenceSpan) {
        CoreLabel c = new CoreLabel();
        c.set(TextAnnotation.class, w.getWord());
        c.set(OriginalTextAnnotation.class, w.getWord());
        c.set(ValueAnnotation.class, w.getWord());
        c.set(CharacterOffsetBeginAnnotation.class, w.getStartOffset());
        c.set(CharacterOffsetEndAnnotation.class, w.getEndOffset());
        c.set(IndexAnnotation.class, w.getOrder() + 1);
        // c.setIndex(w.getOrder());
        c.set(SentenceIndexAnnotation.class, 0);
        // c.setSentIndex(0);
        c.set(DocIDAnnotation.class, "document");
        c.setDocID("document");
        if (w.hasAnnotation("pos"))
            c.set(PartOfSpeechAnnotation.class, w.getAnnotation("pos", String.class));
        if (w.hasAnnotation("lemma"))
            c.set(LemmaAnnotation.class, w.getAnnotation("lemma", String.class));
        if (w.hasAnnotation("nerLabel"))
            c.set(NamedEntityTagAnnotation.class, w.getAnnotation("nerLabel", String.class));
        if (w.hasAnnotation("nerValue"))
            c.set(NormalizedNamedEntityTagAnnotation.class, w.getAnnotation("nerValue", String.class));
        tokenAnnotations.add(c);
        if (useWordOrderInsteadOfOffset) {
            wordIndex.put(w.getOrder(), w);
        } else {
            wordIndex.put(w.getStartOffset(), w);
        }
    }

    // essential sentence annotation: TokensAnnotation
    sentenceAnnotation.set(TokensAnnotation.class, tokenAnnotations);
    // essential sentence annotation: TextAnnotation
    sentenceAnnotation.set(TextAnnotation.class, originalText);
    // essential sentence annotation: SentenceIndexAnnotation
    sentenceAnnotation.set(SentenceIndexAnnotation.class, 0);
    sentenceAnnotation.set(CharacterOffsetBeginAnnotation.class, 0);
    sentenceAnnotation.set(CharacterOffsetEndAnnotation.class, sentenceSpan.last().getEndOffset());
    sentenceAnnotation.set(TokenBeginAnnotation.class, 0);
    sentenceAnnotation.set(TokenEndAnnotation.class, sentenceSpan.last().getOrder());

    return a;
}
Example #23
Source File: CoreNLPPosTagger.java From Heracles with GNU General Public License v3.0
/**
 * Process the Dataset in chunks, as defined by the <code>spanType</code> parameter.
 * The Spans denoted by spanType must each contain Words belonging to a single sentence.
 */
@Override
public void validatedProcess(Dataset dataset, String spanTypeOfSentenceUnit) {
    // if (dataset.getPerformedNLPTasks().contains(getTask())){
    //     Framework.error("This dataset has already been tagged with POS.");
    //     return;
    // }
    // check if prerequisites are satisfied
    if (!dataset.getPerformedNLPTasks().containsAll(prerequisites)) {
        HashSet<NLPTask> missingTasks = new HashSet<>();
        missingTasks.addAll(prerequisites);
        missingTasks.removeAll(dataset.getPerformedNLPTasks());
        Framework.error("This dataset does not meet the requirements to use this component! Missing tasks: " + missingTasks);
        return;
    }

    Properties prop1 = new Properties();
    prop1.setProperty("annotators", "pos");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(prop1, false);

    for (Span span : dataset.getSpans(spanTypeOfSentenceUnit)) {
        HashMap<Integer, Word> wordIndex = new HashMap<>();
        Annotation a = CoreNLPHelper.reconstructStanfordAnnotations(span, wordIndex);
        if (a == null) {
            System.out.println(a);
        }
        pipeline.annotate(a);
        List<CoreMap> sentenceAnnotations = a.get(SentencesAnnotation.class);
        for (CoreMap sentence : sentenceAnnotations) {
            for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
                Word w = wordIndex.get(token.get(CharacterOffsetBeginAnnotation.class));
                String tempPos = token.get(PartOfSpeechAnnotation.class);
                if (w.hasAnnotation("URI")) {
                    w.putAnnotation("pos", "NNP");
                } else {
                    w.putAnnotation("pos", tempPos);
                }
                // System.out.println(w.getAnnotations());
            }
        }
    }
}
Example #24
Source File: WiseOwlStanfordFilter.java From wiseowl with MIT License
public static void main(String args[]) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, cleanxml, ssplit,pos,lemma,ner");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    pipeline.addAnnotator(new TimeAnnotator("sutime", props));
    String text = "<mydata> refeer</mydata>today is 12 jan 2016. what is tommorow? Who is Avtar? Does he work at Apple or Google? Sumit was born on 13 feb,2011.";
    Annotation document = new Annotation(text);
    pipeline.annotate(document);
    System.out.println(document.get(CoreAnnotations.TextAnnotation.class));
    List<CoreMap> timexAnnsAll = document.get(TimeAnnotations.TimexAnnotations.class);
    for (CoreMap cm : timexAnnsAll) {
        List<CoreLabel> tokens = cm.get(CoreAnnotations.TokensAnnotation.class);
        TimeData td = new TimeData();
        td.setTime(cm.get(TimeExpression.Annotation.class).getTemporal().toISOString());
        td.setStart(tokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
        td.setEnd(tokens.get(tokens.size() - 1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
    }
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        // traversing the words in the current sentence
        // a CoreLabel is a CoreMap with additional token-specific methods
        System.out.println("in sent");
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            // this is the text of the token
            System.out.println("in token");
            String word = token.get(TextAnnotation.class);
            // this is the POS tag of the token
            String pos = token.get(PartOfSpeechAnnotation.class);
            // this is the NER label of the token
            String ne = token.get(NamedEntityTagAnnotation.class);
            System.out.println("word : " + word + " pos: " + pos + " ner: " + ne);
        }
    }
}
Example #25
Source File: CoreNLPToJSON.java From phrasal with GNU General Public License v3.0
/**
 * Process an English text file.
 *
 * @param args
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    if (args.length < 1) {
        System.err.printf("Usage: java %s file [inputproperties_str] > json_output%n", CoreNLPToJSON.class.getName());
        System.exit(-1);
    }
    String textFile = args[0];
    InputProperties inputProperties = args.length > 1 ? InputProperties.fromString(args[1]) : new InputProperties();

    StanfordCoreNLP coreNLP = new StanfordCoreNLP(properties);

    // Configure tokenizer
    EnglishPreprocessor preprocessor = new EnglishPreprocessor(true);

    // Use a map with ordered keys so that the output is ordered by segmentId.
    Map<Integer, SourceSegment> annotations = new TreeMap<Integer, SourceSegment>();

    LineNumberReader reader = IOTools.getReaderFromFile(textFile);
    for (String line; (line = reader.readLine()) != null;) {
        Annotation annotation = coreNLP.process(line);
        List<CoreMap> sentences = annotation.get(SentencesAnnotation.class);
        if (sentences.size() != 1) {
            throw new RuntimeException("Sentence splitting on line: " + String.valueOf(reader.getLineNumber()));
        }
        CoreMap sentence = sentences.get(0);
        Tree tree = sentence.get(TreeAnnotation.class);
        tree.indexLeaves();
        int[] chunkVector = getChunkVector(tree);
        List<CoreLabel> tokens = sentence.get(TokensAnnotation.class);
        int numTokens = tokens.size();
        SymmetricalWordAlignment alignment = preprocessor.processAndAlign(line);
        if (alignment.e().size() != numTokens) {
            throw new RuntimeException(String.format("Tokenizer configurations differ: %d/%d", alignment.e().size(), numTokens));
        }
        SourceSegment segment = new SourceSegment(numTokens);
        segment.layoutSpec.addAll(makeLayoutSpec(alignment));
        segment.inputProperties = inputProperties.toString();
        for (int j = 0; j < numTokens; ++j) {
            CoreLabel token = tokens.get(j);
            String word = token.get(TextAnnotation.class);
            segment.tokens.add(unescape(word));
            String pos = mapPOS(token.get(PartOfSpeechAnnotation.class));
            segment.pos.add(pos);
            String ne = token.get(NamedEntityTagAnnotation.class);
            segment.ner.add(ne);
            segment.chunkVector[j] = chunkVector[j];
        }
        annotations.put(reader.getLineNumber() - 1, segment);
    }
    reader.close();
    System.err.printf("Processed %d sentences%n", reader.getLineNumber());

    final SourceDocument jsonDocument = new SourceDocument(textFile, annotations);

    // Convert to json
    Gson gson = new Gson();
    String json = gson.toJson(jsonDocument);
    System.out.println(json);
}
Example #26
Source File: LogicAnalysisTool.java From Criteria2Query with Apache License 2.0
public List<LinkedHashSet<Integer>> ddep(String text, List<Term> terms) {
    Annotation annotation = new Annotation(text);
    pipeline.annotate(annotation);
    List<LinkedHashSet<Integer>> conj_or = new ArrayList<LinkedHashSet<Integer>>();
    for (CoreMap sentence : annotation.get(SentencesAnnotation.class)) {
        List<SemanticGraphEdge> sges = sentence
                .get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class).edgeListSorted();
        int entity1_median = 0;
        int entity2_median = 0;
        for (SemanticGraphEdge sge : sges) {
            //System.out.println(
            //        sge.getRelation().getSpecific() + "\t" + sge.getDependent() + "\t" + sge.getGovernor());
            if (sge.getRelation().getSpecific() != null && sge.getRelation().getSpecific().equals("or")) {
                entity1_median = (sge.getDependent().beginPosition() + sge.getDependent().endPosition()) / 2;
                entity2_median = (sge.getGovernor().beginPosition() + sge.getGovernor().endPosition()) / 2;
                LinkedHashSet<Integer> conj_or_group_1 = searchGroup(conj_or, entity1_median);
                LinkedHashSet<Integer> conj_or_group_2 = searchGroup(conj_or, entity2_median);
                if (conj_or_group_1 == null && conj_or_group_2 == null) {
                    LinkedHashSet<Integer> conj_or_group = new LinkedHashSet<Integer>();
                    conj_or_group.add(entity1_median);
                    conj_or_group.add(entity2_median);
                    conj_or.add(conj_or_group);
                } else if (conj_or_group_1 != null && conj_or_group_2 == null) {
                    conj_or.remove(conj_or_group_1);
                    conj_or_group_1.add(entity2_median);
                    conj_or.add(conj_or_group_1);
                } else if (conj_or_group_1 == null && conj_or_group_2 != null) {
                    conj_or.remove(conj_or_group_2);
                    conj_or_group_2.add(entity1_median);
                    conj_or.add(conj_or_group_2);
                }
            }
        }
    }
    List<LinkedHashSet<Integer>> entity_group = new ArrayList<LinkedHashSet<Integer>>();
    for (int i = 0; i < conj_or.size(); i++) {
        LinkedHashSet<Integer> entities = new LinkedHashSet<Integer>();
        for (Integer b : conj_or.get(i)) {
            for (Term t : terms) {
                if (b >= t.getStart_index() && b <= t.getEnd_index()) {
                    entities.add(t.getTermId());
                }
            }
        }
        entity_group.add(entities);
    }
    return entity_group;
}