edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation Java Examples
The following examples show how to use
edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: CoreNlpTokenizer.java From jstarcraft-nlp with Apache License 2.0 | 5 votes |
@Override public boolean incrementToken() { clearAttributes(); while (tokens == null || !tokens.hasNext()) if (!getNextSentence()) return false; CoreLabel token = tokens.next(); // Use the lemmatized word: String word = token.get(LemmaAnnotation.class); if (word == null) { // Fallback when no lemmatization happens. word = token.get(TextAnnotation.class); } termAttribute.setLength(0); termAttribute.append(word); // NER or part of speech annotation String pos = token.get(NamedEntityTagAnnotation.class); pos = (pos == null || "O".equals(pos)) ? token.get(PartOfSpeechAnnotation.class) : pos; typeAttribute.setType(pos != null ? pos : TypeAttribute.DEFAULT_TYPE); // Token character offsets int be = token.get(CharacterOffsetBeginAnnotation.class).intValue(); int en = token.get(CharacterOffsetEndAnnotation.class).intValue(); offsetAttribute.setOffset(be, en); // Token in-document position increment: positionAttribute.setPositionIncrement(1 + skippedTokens); skippedTokens = 0; return true; }
Example #2
Source File: CoreNLPTokenizer.java From Heracles with GNU General Public License v3.0 | 5 votes |
/** * Process the Dataset in chunks, as defined by the <code>spanType</code> parameter. * The Spans denoted by spanType must provide an annotation of type "text". * This spanType does not have to be textual unit. */ @Override public void validatedProcess(Dataset dataset, String spanTypeOfTextUnit){ if (dataset.getPerformedNLPTasks().contains(getTask())){ Framework.error("This dataset has already been tokenized."); return; } Properties prop1 = new Properties(); prop1.setProperty("annotators", "tokenize"); //prop1.setProperty("options", "splitHyphenated=true"); StanfordCoreNLP pipeline = new StanfordCoreNLP(prop1, false); for (Span span : dataset.getSpans(spanTypeOfTextUnit)){ Span textualUnit = span.getTextualUnit(); String originalText = span.getAnnotation("text", String.class); Annotation a = new Annotation(originalText); pipeline.annotate(a); List<CoreLabel> tokenAnnotations = a.get(TokensAnnotation.class); Word previousWord = null; if (!textualUnit.isEmpty()) previousWord = textualUnit.last(); for (CoreLabel token : tokenAnnotations){ String word = token.get(OriginalTextAnnotation.class); int startOffset = token.get(CharacterOffsetBeginAnnotation.class); // int endOffset = token.get(CharacterOffsetEndAnnotation.class); // System.out.println(word + "\t" + startOffset + "\t" + endOffset); if (previousWord == null){ previousWord = new Word(word, startOffset, textualUnit, dataset); } else { previousWord = new Word(word, startOffset, previousWord); } //and add the new word to the sentence span. If span=textualSpan than this has no effect if (!textualUnit.equals(span)) span.add(previousWord); } } }
Example #3
Source File: CoreNLPNamedEntityRecognizer.java From Heracles with GNU General Public License v3.0 | 5 votes |
/** * Process the Dataset in chunks, as defined by the <code>spanType</code> parameter. * The Spans denoted by spanType must each contain Words belonging to a single sentence. * */ @Override public void validatedProcess(Dataset dataset, String spanTypeOfSentenceUnit){ Properties prop1 = new Properties(); prop1.setProperty("annotators", "ner"); StanfordCoreNLP pipeline = new StanfordCoreNLP(prop1, false); for (Span span : dataset.getSpans(spanTypeOfSentenceUnit)){ HashMap<Integer, Word> wordIndex = new HashMap<>(); Annotation a = CoreNLPHelper.reconstructStanfordAnnotations(span, wordIndex); if (a == null){ System.out.println(a); } pipeline.annotate(a); List<CoreMap> sentenceAnnotations = a.get(SentencesAnnotation.class); for (CoreMap sentence : sentenceAnnotations){ for (CoreLabel token: sentence.get(TokensAnnotation.class)) { Word w = wordIndex.get(token.get(CharacterOffsetBeginAnnotation.class)); String ner = token.get(NamedEntityTagAnnotation.class); String nerValue = token.get(NormalizedNamedEntityTagAnnotation.class); if (ner!=null) w.putAnnotation("nerLabel", ner); if (nerValue!=null) w.putAnnotation("nerValue", nerValue); // System.out.println(w.getAnnotations()); } } } }
Example #4
Source File: CoreNLPLemmatizer.java From Heracles with GNU General Public License v3.0 | 5 votes |
/** * Process the Dataset in chunks, as defined by the <code>spanType</code> parameter. * The Spans denoted by spanType must each contain Words belonging to a single sentence. * */ @Override public void validatedProcess(Dataset dataset, String spanTypeOfSentenceUnit){ Properties prop1 = new Properties(); prop1.setProperty("annotators", "lemma"); StanfordCoreNLP pipeline = new StanfordCoreNLP(prop1, false); for (Span span : dataset.getSpans(spanTypeOfSentenceUnit)){ HashMap<Integer, Word> wordIndex = new HashMap<>(); Annotation a = CoreNLPHelper.reconstructStanfordAnnotations(span, wordIndex); if (a == null){ System.out.println(a); } pipeline.annotate(a); List<CoreMap> sentenceAnnotations = a.get(SentencesAnnotation.class); for (CoreMap sentence : sentenceAnnotations){ for (CoreLabel token: sentence.get(TokensAnnotation.class)) { Word w = wordIndex.get(token.get(CharacterOffsetBeginAnnotation.class)); String tempLemma = token.get(LemmaAnnotation.class); w.putAnnotation("lemma", tempLemma.toLowerCase()); // System.out.println(w.getAnnotations()); } } } }
Example #5
Source File: CoreNLPHelper.java From Heracles with GNU General Public License v3.0 | 4 votes |
/**
 * Rebuilds a Stanford {@code Annotation} (one single sentence) from a sentence-level
 * Span of Words, so that downstream annotators (pos, lemma, ner, ...) can run on
 * previously processed data. Any "pos"/"lemma"/"nerLabel"/"nerValue" annotations
 * already present on the Words are copied onto the CoreLabels.
 *
 * Side effect: fills {@code wordIndex} so callers can map annotator output back
 * to Words — keyed by word order when {@code useWordOrderInsteadOfOffset} is
 * true, otherwise by start character offset.
 *
 * NOTE(review): assumes sentenceSpan is non-empty — first()/last() would fail
 * otherwise; confirm with callers.
 */
public static Annotation reconstructStanfordAnnotations(Span sentenceSpan, HashMap<Integer, Word> wordIndex, boolean useWordOrderInsteadOfOffset){
    String originalText = sentenceSpan.getAnnotation("text", String.class);
    Annotation a = new Annotation(originalText);
    a.set(TextAnnotation.class, originalText);
    //a.set(DocIDAnnotation.class, "document");

    // Document-level containers: one sentence list and one flat token list.
    List<CoreMap> sentenceAnnotations = new ArrayList<CoreMap>();
    a.set(SentencesAnnotation.class, sentenceAnnotations);
    List<CoreLabel> tokenAnnotations = new ArrayList<CoreLabel>();
    a.set(TokensAnnotation.class, tokenAnnotations);

    // The single reconstructed sentence.
    ArrayCoreMap sentenceAnnotation = new ArrayCoreMap();
    sentenceAnnotations.add(sentenceAnnotation);

    // int startOffset = sentenceSpan.first().getStartOffset();

    for (Word w : sentenceSpan){
        // One CoreLabel per Word, mirroring what the CoreNLP tokenizer would set.
        CoreLabel c = new CoreLabel();
        c.set(TextAnnotation.class, w.getWord());
        c.set(OriginalTextAnnotation.class, w.getWord());
        c.set(ValueAnnotation.class, w.getWord());
        c.set(CharacterOffsetBeginAnnotation.class, w.getStartOffset());
        c.set(CharacterOffsetEndAnnotation.class, w.getEndOffset());
        // CoreNLP token indices are 1-based, hence the +1.
        c.set(IndexAnnotation.class, w.getOrder()+1);
        // c.setIndex(w.getOrder());
        c.set(SentenceIndexAnnotation.class, 0);
        // c.setSentIndex(0);
        c.set(DocIDAnnotation.class, "document");
        c.setDocID("document");

        // Carry over any annotations produced by earlier pipeline components.
        if (w.hasAnnotation("pos"))
            c.set(PartOfSpeechAnnotation.class, w.getAnnotation("pos",String.class));
        if (w.hasAnnotation("lemma"))
            c.set(LemmaAnnotation.class, w.getAnnotation("lemma", String.class));
        if (w.hasAnnotation("nerLabel"))
            c.set(NamedEntityTagAnnotation.class, w.getAnnotation("nerLabel", String.class));
        if (w.hasAnnotation("nerValue"))
            c.set(NormalizedNamedEntityTagAnnotation.class, w.getAnnotation("nerValue", String.class));

        tokenAnnotations.add(c);

        // Index the Word so annotator output can be mapped back later.
        if (useWordOrderInsteadOfOffset){
            wordIndex.put(w.getOrder(), w);
        } else {
            wordIndex.put(w.getStartOffset(), w);
        }
    }

    //essential sentence annotation: TokensAnnotation
    sentenceAnnotation.set(TokensAnnotation.class, tokenAnnotations);
    //essential sentence annotation: TextAnnotation
    sentenceAnnotation.set(TextAnnotation.class, originalText);
    //essential sentence annotation: SentenceIndexAnnotation
    sentenceAnnotation.set(SentenceIndexAnnotation.class, 0);

    sentenceAnnotation.set(CharacterOffsetBeginAnnotation.class, 0);
    sentenceAnnotation.set(CharacterOffsetEndAnnotation.class, sentenceSpan.last().getEndOffset());
    sentenceAnnotation.set(TokenBeginAnnotation.class, 0);
    sentenceAnnotation.set(TokenEndAnnotation.class, sentenceSpan.last().getOrder());

    return a;
}
Example #6
Source File: CoreNLPPosTagger.java From Heracles with GNU General Public License v3.0 | 4 votes |
/** * Process the Dataset in chunks, as defined by the <code>spanType</code> parameter. * The Spans denoted by spanType must each contain Words belonging to a single sentence. * */ @Override public void validatedProcess(Dataset dataset, String spanTypeOfSentenceUnit){ // if (dataset.getPerformedNLPTasks().contains(getTask())){ // Framework.error("This dataset has already been tagged with POS."); // return; // } //check if prerequisites are satisfied if (!dataset.getPerformedNLPTasks().containsAll(prerequisites)){ HashSet<NLPTask> missingTasks = new HashSet<>(); missingTasks.addAll(prerequisites); missingTasks.removeAll(dataset.getPerformedNLPTasks()); Framework.error("This dataset does not meet the requirements to use this component! Missing tasks: " + missingTasks); return; } Properties prop1 = new Properties(); prop1.setProperty("annotators", "pos"); StanfordCoreNLP pipeline = new StanfordCoreNLP(prop1, false); for (Span span : dataset.getSpans(spanTypeOfSentenceUnit)){ HashMap<Integer, Word> wordIndex = new HashMap<>(); Annotation a = CoreNLPHelper.reconstructStanfordAnnotations(span, wordIndex); if (a == null){ System.out.println(a); } pipeline.annotate(a); List<CoreMap> sentenceAnnotations = a.get(SentencesAnnotation.class); for (CoreMap sentence : sentenceAnnotations){ for (CoreLabel token: sentence.get(TokensAnnotation.class)) { Word w = wordIndex.get(token.get(CharacterOffsetBeginAnnotation.class)); String tempPos = token.get(PartOfSpeechAnnotation.class); if (w.hasAnnotation("URI")){ w.putAnnotation("pos", "NNP"); } else { w.putAnnotation("pos", tempPos); } // System.out.println(w.getAnnotations()); } } } }
Example #7
Source File: JsonPipeline.java From tac2015-event-detection with GNU General Public License v3.0 | 4 votes |
@SuppressWarnings({ "rawtypes", "unchecked" }) static void addEntityMentions(Map<String,Object> sent_info, CoreMap sentence) { List<CoreMap> coreMentions = sentence.get(MentionsAnnotation.class); List<Map> jsonMentions = new ArrayList<>(); /* trying to figure out the keys in each mention. here's a printout from one. MENTION August 2014 class edu.stanford.nlp.ling.CoreAnnotations$TextAnnotation August 2014 class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetBeginAnnotation 3 class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetEndAnnotation 14 class edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation [August-2, 2014-3] class edu.stanford.nlp.ling.CoreAnnotations$TokenBeginAnnotation 1 class edu.stanford.nlp.ling.CoreAnnotations$TokenEndAnnotation 3 class edu.stanford.nlp.ling.CoreAnnotations$NamedEntityTagAnnotation DATE class edu.stanford.nlp.ling.CoreAnnotations$NormalizedNamedEntityTagAnnotation 2014-08 class edu.stanford.nlp.ling.CoreAnnotations$EntityTypeAnnotation DATE class edu.stanford.nlp.ling.CoreAnnotations$SentenceIndexAnnotation 0 class edu.stanford.nlp.time.TimeAnnotations$TimexAnnotation <TIMEX3 tid="t1" type="DATE" value="2014-08">August 2014</TIMEX3> MENTION Barack Obama class edu.stanford.nlp.ling.CoreAnnotations$TextAnnotation Barack Obama class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetBeginAnnotation 17 class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetEndAnnotation 29 class edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation [Barack-5, Obama-6] class edu.stanford.nlp.ling.CoreAnnotations$TokenBeginAnnotation 4 class edu.stanford.nlp.ling.CoreAnnotations$TokenEndAnnotation 6 class edu.stanford.nlp.ling.CoreAnnotations$NamedEntityTagAnnotation PERSON class edu.stanford.nlp.ling.CoreAnnotations$EntityTypeAnnotation PERSON class edu.stanford.nlp.ling.CoreAnnotations$SentenceIndexAnnotation 0 MENTION Paris class edu.stanford.nlp.ling.CoreAnnotations$TextAnnotation Paris class 
edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetBeginAnnotation 66 class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetEndAnnotation 71 class edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation [Paris-5] class edu.stanford.nlp.ling.CoreAnnotations$TokenBeginAnnotation 14 class edu.stanford.nlp.ling.CoreAnnotations$TokenEndAnnotation 15 class edu.stanford.nlp.ling.CoreAnnotations$NamedEntityTagAnnotation LOCATION class edu.stanford.nlp.ling.CoreAnnotations$EntityTypeAnnotation LOCATION class edu.stanford.nlp.ling.CoreAnnotations$SentenceIndexAnnotation 1 */ for (CoreMap mention : coreMentions) { // U.p("MENTION " + mention); // for (Class k : mention.keySet()) { // U.pf("%s\t%s\n", k, mention.get(k)); // } Map m = new HashMap<String, Object>(); m.put("tokspan", Lists.newArrayList( mention.get(TokenBeginAnnotation.class).intValue(), mention.get(TokenEndAnnotation.class).intValue())); m.put("charspan", Lists.newArrayList( mention.get(CharacterOffsetBeginAnnotation.class).intValue(), mention.get(CharacterOffsetEndAnnotation.class).intValue())); m.put("sentence", mention.get(SentenceIndexAnnotation.class).intValue()); String entityType = mention.get(EntityTypeAnnotation.class); m.put("type", entityType); if (mention.containsKey(NormalizedNamedEntityTagAnnotation.class)) { m.put("normalized", mention.get(NormalizedNamedEntityTagAnnotation.class)); } if (mention.containsKey(TimexAnnotation.class)) { m.put("timex_xml", mention.get(TimexAnnotation.class).toString()); } jsonMentions.add(m); } sent_info.put("entitymentions", jsonMentions); }
Example #8
Source File: CRFPostprocessorFeatureFactory.java From phrasal with GNU General Public License v3.0 | 4 votes |
/**
 * Builds the clique-C feature set for the character at position {@code loc}
 * in the padded character sequence: character-class features over a 5-character
 * window, token-level features for the current token, and left/right token
 * context features.
 *
 * NOTE(review): assumes PaddedList returns pad CoreLabels (with null
 * CharAnnotation) for out-of-range indices — confirm against PaddedList docs.
 */
protected Collection<String> featuresC(PaddedList<? extends CoreLabel> cInfo, int loc) {
  Collection<String> features = new ArrayList<>();
  // 5-character window centered on loc: p2, p, c, n, n2.
  CoreLabel c = cInfo.get(loc);
  CoreLabel n = cInfo.get(loc + 1);
  CoreLabel n2 = cInfo.get(loc + 2);
  CoreLabel p = cInfo.get(loc - 1);
  CoreLabel p2 = cInfo.get(loc - 2);
  String charc = c.get(CoreAnnotations.CharAnnotation.class);
  String charn = n.get(CoreAnnotations.CharAnnotation.class);
  String charn2 = n2.get(CoreAnnotations.CharAnnotation.class);
  String charp = p.get(CoreAnnotations.CharAnnotation.class);
  String charp2 = p2.get(CoreAnnotations.CharAnnotation.class);

  // Default feature set...a 5 character window
  // Adding actual characters causes overfitting.
  // features.add(charc +"-c");
  // features.add(charn + "-n1");
  // features.add(charn2 + "-n2" );
  // features.add(charp + "-p");
  // features.add(charp2 + "-p2");

  // Sequence start indicator
  if (loc == 0) features.add("seq-start");

  // Character-class features
  addCharacterClassFeatures(features, charp2, "-p2");
  addCharacterClassFeatures(features, charp, "-p");
  addCharacterClassFeatures(features, charn, "-n");
  addCharacterClassFeatures(features, charn2, "-n2");
  addCharacterClassFeatures(features, charc, "-c");

  // Token features — only for non-whitespace positions.
  if (charc != null && ! charc.equals(ProcessorTools.WHITESPACE)) {
    // Current token
    String cToken = tokenClass(c.get(ParentAnnotation.class));
    features.add(cToken + "-cword");

    // Character position in the current token.
    int cPosition = c.get(CharacterOffsetBeginAnnotation.class);
    if (cPosition == 0) {
      features.add("char-start");
      features.add("start-" + cToken);
    } else {
      // features.add("char-inside");
      features.add("inside-" + cToken);
    }

    // Left context: scan backwards for the first whitespace and take the
    // token class of the character just before it; "<S>" if none found.
    String leftToken = "<S>";
    for (int i = loc-1; i > 0; --i) {
      String leftC = cInfo.get(i).get(CoreAnnotations.CharAnnotation.class);
      if (leftC != null && leftC.equals(ProcessorTools.WHITESPACE)) {
        String left = tokenClass(cInfo.get(i-1).get(CoreAnnotations.ParentAnnotation.class));
        if (left != null) {
          leftToken = left;
        }
        break;
      }
    }
    features.add(leftToken + "-lcontext");

    // Right context unigram: scan forwards for the first whitespace and take
    // the token class of the character just after it; "</S>" if none found.
    String rightToken = "</S>";
    for (int i = loc+1; i < cInfo.size()-1; ++i) {
      String rightC = cInfo.get(i).get(CoreAnnotations.CharAnnotation.class);
      if (rightC != null && rightC.equals(ProcessorTools.WHITESPACE)) {
        String right = tokenClass(cInfo.get(i+1).get(CoreAnnotations.ParentAnnotation.class));
        if (right != null) {
          rightToken = right;
        }
        break;
      }
    }
    features.add(rightToken + "-rcontext");

    // Context n-grams, mainly for the truecasing feature
    // Left context bigram
    features.add(leftToken + "-" + cToken + "-lbigram");
  }

  // Indicator transition feature
  features.add("cliqueC");
  return features;
}