edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation Java Examples
The following examples show how to use
edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation.
You can vote up the ones you like or vote down the ones you don't like,
and you can go to the original project or source file by following the links above each example. You can also check out the related API usage on the sidebar.
Example #1
Source File: TestCustomLemmaAnnotator.java From blog-codes with Apache License 2.0 | 6 votes |
@Test public void test() { Properties props = new Properties(); props.setProperty("annotators", "tokenize,ssplit,pos,custom.lemma"); props.setProperty("customAnnotatorClass.custom.lemma", "com.fancyerii.blog.stanfordnlp.CustomLemmaAnnotator"); props.setProperty("custom.lemma.lemmaFile", "custom-lemmas.txt"); // set up pipeline StanfordCoreNLP pipeline = new StanfordCoreNLP(props); CoreDocument exampleDocument = new CoreDocument("Some many goods there."); // annotate document pipeline.annotate(exampleDocument); // access tokens from a CoreDocument // a token is represented by a CoreLabel List<CoreLabel> firstSentenceTokens = exampleDocument.sentences().get(0).tokens(); // this for loop will print out all of the tokens and the character offset info for (CoreLabel token : firstSentenceTokens) { System.out.println(token.word()+"/"+token.getString(LemmaAnnotation.class) + "\t" + token.beginPosition() + "\t" + token.endPosition()); } }
Example #2
Source File: CoreNlpTokenizer.java From jstarcraft-nlp with Apache License 2.0 | 5 votes |
@Override public boolean incrementToken() { clearAttributes(); while (tokens == null || !tokens.hasNext()) if (!getNextSentence()) return false; CoreLabel token = tokens.next(); // Use the lemmatized word: String word = token.get(LemmaAnnotation.class); if (word == null) { // Fallback when no lemmatization happens. word = token.get(TextAnnotation.class); } termAttribute.setLength(0); termAttribute.append(word); // NER or part of speech annotation String pos = token.get(NamedEntityTagAnnotation.class); pos = (pos == null || "O".equals(pos)) ? token.get(PartOfSpeechAnnotation.class) : pos; typeAttribute.setType(pos != null ? pos : TypeAttribute.DEFAULT_TYPE); // Token character offsets int be = token.get(CharacterOffsetBeginAnnotation.class).intValue(); int en = token.get(CharacterOffsetEndAnnotation.class).intValue(); offsetAttribute.setOffset(be, en); // Token in-document position increment: positionAttribute.setPositionIncrement(1 + skippedTokens); skippedTokens = 0; return true; }
Example #3
Source File: CoreNLP.java From gAnswer with BSD 3-Clause "New" or "Revised" License | 5 votes |
public String getBaseFormOfPattern (String text) { String ret = new String(""); // create an empty Annotation just with the given text Annotation document = new Annotation(text); // run all Annotators on this text pipeline_lemma.annotate(document); // these are all the sentences in this document // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types List<CoreMap> sentences = document.get(SentencesAnnotation.class); int count = 0; for(CoreMap sentence: sentences) { // traversing the words in the current sentence // a CoreLabel is a CoreMap with additional token-specific methods for (CoreLabel token: sentence.get(TokensAnnotation.class)) { // this is the base form (lemma) of the token String lemma = token.getString(LemmaAnnotation.class); ret += lemma; ret += " "; } count ++; if (count % 100 == 0) { System.out.println(count); } } return ret.substring(0, ret.length()-1); }
Example #4
Source File: CoreNLPLemmatizer.java From Heracles with GNU General Public License v3.0 | 5 votes |
/** * Process the Dataset in chunks, as defined by the <code>spanType</code> parameter. * The Spans denoted by spanType must each contain Words belonging to a single sentence. * */ @Override public void validatedProcess(Dataset dataset, String spanTypeOfSentenceUnit){ Properties prop1 = new Properties(); prop1.setProperty("annotators", "lemma"); StanfordCoreNLP pipeline = new StanfordCoreNLP(prop1, false); for (Span span : dataset.getSpans(spanTypeOfSentenceUnit)){ HashMap<Integer, Word> wordIndex = new HashMap<>(); Annotation a = CoreNLPHelper.reconstructStanfordAnnotations(span, wordIndex); if (a == null){ System.out.println(a); } pipeline.annotate(a); List<CoreMap> sentenceAnnotations = a.get(SentencesAnnotation.class); for (CoreMap sentence : sentenceAnnotations){ for (CoreLabel token: sentence.get(TokensAnnotation.class)) { Word w = wordIndex.get(token.get(CharacterOffsetBeginAnnotation.class)); String tempLemma = token.get(LemmaAnnotation.class); w.putAnnotation("lemma", tempLemma.toLowerCase()); // System.out.println(w.getAnnotations()); } } } }
Example #5
Source File: Phrase.java From uncc2014watsonsim with GNU General Public License v2.0 | 5 votes |
private static List<String> _lemmas(Phrase p) { return p.memo(Phrase.sentences) .stream() .flatMap(s -> s.get(TokensAnnotation.class).stream()) .map( t -> t.get(LemmaAnnotation.class)) .collect(toList()); }
Example #6
Source File: Stemming.java From AGDISTIS with GNU Affero General Public License v3.0 | 5 votes |
public String stemming(String documentText) { List<String> lemmas = new LinkedList<String>(); String label = null; LancasterStemmer stem = new LancasterStemmer(); // Create an empty Annotation just with the given text Annotation document = new Annotation(documentText); // run all Annotators on this text this.pipeline.annotate(document); // Iterate over all of the sentences found List<CoreMap> sentences = document.get(SentencesAnnotation.class); for (CoreMap sentence : sentences) { // Iterate over all tokens in a sentence for (CoreLabel token : sentence.get(TokensAnnotation.class)) { // Retrieve and add the lemma for each word into the // list of lemmas // lemmas.add(token.get(LemmaAnnotation.class)); // lemmas.add(morpho.stem(token.word())); lemmas.add(stem.stem(token.get(LemmaAnnotation.class))); } } label = lemmas.toString(); Pattern p = Pattern.compile("[,.;!?(){}\\[\\]<>%]"); label = p.matcher(label).replaceAll(""); return label; }
Example #7
Source File: CoreNLPHelper.java From Heracles with GNU General Public License v3.0 | 4 votes |
/**
 * Rebuilds a Stanford CoreNLP {@code Annotation} (document + one sentence +
 * tokens) from this framework's {@code Span}/{@code Word} representation, so
 * CoreNLP annotators can be run on text that was tokenized elsewhere. Also
 * fills {@code wordIndex} so callers can map CoreNLP tokens back to Words.
 *
 * @param sentenceSpan span whose Words all belong to a single sentence; must
 *        carry a "text" annotation with the original sentence text
 * @param wordIndex out-parameter mapping either word order or character start
 *        offset (see flag) to the corresponding Word
 * @param useWordOrderInsteadOfOffset if true, key {@code wordIndex} by word
 *        order; otherwise by character start offset
 * @return the reconstructed Annotation, ready for pipeline.annotate(...)
 */
public static Annotation reconstructStanfordAnnotations(Span sentenceSpan, HashMap<Integer, Word> wordIndex, boolean useWordOrderInsteadOfOffset){
	String originalText = sentenceSpan.getAnnotation("text", String.class);
	Annotation a = new Annotation(originalText);
	a.set(TextAnnotation.class, originalText);
	//a.set(DocIDAnnotation.class, "document");
	// Document-level sentence and token lists expected by CoreNLP annotators.
	List<CoreMap> sentenceAnnotations = new ArrayList<CoreMap>();
	a.set(SentencesAnnotation.class, sentenceAnnotations);
	List<CoreLabel> tokenAnnotations = new ArrayList<CoreLabel>();
	a.set(TokensAnnotation.class, tokenAnnotations);
	// The single sentence this span represents.
	ArrayCoreMap sentenceAnnotation = new ArrayCoreMap();
	sentenceAnnotations.add(sentenceAnnotation);
	// int startOffset = sentenceSpan.first().getStartOffset();

	for (Word w : sentenceSpan){
		// One CoreLabel per Word, mirroring its text and character offsets.
		CoreLabel c = new CoreLabel();
		c.set(TextAnnotation.class, w.getWord());
		c.set(OriginalTextAnnotation.class, w.getWord());
		c.set(ValueAnnotation.class, w.getWord());
		c.set(CharacterOffsetBeginAnnotation.class, w.getStartOffset());
		c.set(CharacterOffsetEndAnnotation.class, w.getEndOffset());
		// CoreNLP token indexes are 1-based, hence the +1.
		c.set(IndexAnnotation.class, w.getOrder()+1);
		// c.setIndex(w.getOrder());
		c.set(SentenceIndexAnnotation.class, 0);
		// c.setSentIndex(0);
		c.set(DocIDAnnotation.class, "document");
		c.setDocID("document");
		// Copy over any annotations this framework has already produced.
		if (w.hasAnnotation("pos"))
			c.set(PartOfSpeechAnnotation.class, w.getAnnotation("pos",String.class));
		if (w.hasAnnotation("lemma"))
			c.set(LemmaAnnotation.class, w.getAnnotation("lemma", String.class));
		if (w.hasAnnotation("nerLabel"))
			c.set(NamedEntityTagAnnotation.class, w.getAnnotation("nerLabel", String.class));
		if (w.hasAnnotation("nerValue"))
			c.set(NormalizedNamedEntityTagAnnotation.class, w.getAnnotation("nerValue", String.class));
		tokenAnnotations.add(c);
		if (useWordOrderInsteadOfOffset){
			wordIndex.put(w.getOrder(), w);
		} else {
			wordIndex.put(w.getStartOffset(), w);
		}
	}
	//essential sentence annotation: TokensAnnotation
	sentenceAnnotation.set(TokensAnnotation.class, tokenAnnotations);
	//essential sentence annotation: TextAnnotation
	sentenceAnnotation.set(TextAnnotation.class, originalText);
	//essential sentence annotation: SentenceIndexAnnotation
	sentenceAnnotation.set(SentenceIndexAnnotation.class, 0);
	sentenceAnnotation.set(CharacterOffsetBeginAnnotation.class, 0);
	sentenceAnnotation.set(CharacterOffsetEndAnnotation.class, sentenceSpan.last().getEndOffset());
	sentenceAnnotation.set(TokenBeginAnnotation.class, 0);
	sentenceAnnotation.set(TokenEndAnnotation.class, sentenceSpan.last().getOrder());
	return a;
}
Example #8
Source File: JsonPipeline.java From tac2015-event-detection with GNU General Public License v3.0 | 4 votes |
/** annotator is a stanford corenlp notion. */ void addAnnoToSentenceObject(Map<String,Object> sent_info, CoreMap sentence, String annotator) { switch(annotator) { case "tokenize": case "cleanxml": case "ssplit": break; case "pos": addTokenAnno(sent_info,sentence, "pos", PartOfSpeechAnnotation.class); break; case "lemma": addTokenAnno(sent_info,sentence, "lemmas", LemmaAnnotation.class); break; case "ner": addTokenAnno(sent_info, sentence, "ner", NamedEntityTagAnnotation.class); addTokenAnno(sent_info, sentence, "normner", NormalizedNamedEntityTagAnnotation.class); break; case "regexner": addTokenAnno(sent_info, sentence, "ner", NamedEntityTagAnnotation.class); break; case "sentiment": throw new RuntimeException("TODO"); case "truecase": throw new RuntimeException("TODO"); case "parse": addParseTree(sent_info,sentence); addDepsCC(sent_info,sentence); addDepsBasic(sent_info,sentence); break; case "depparse": addDepsCC(sent_info,sentence); addDepsBasic(sent_info,sentence); break; case "dcoref": break; case "relation": throw new RuntimeException("TODO"); case "natlog": throw new RuntimeException("TODO"); case "quote": throw new RuntimeException("TODO"); case "entitymentions": addEntityMentions(sent_info, sentence); break; default: throw new RuntimeException("don't know how to handle annotator " + annotator); } }