edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation Java Examples
The following examples show how to use
edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: CoreNlpTokenizer.java From jstarcraft-nlp with Apache License 2.0 | 5 votes |
@Override public boolean incrementToken() { clearAttributes(); while (tokens == null || !tokens.hasNext()) if (!getNextSentence()) return false; CoreLabel token = tokens.next(); // Use the lemmatized word: String word = token.get(LemmaAnnotation.class); if (word == null) { // Fallback when no lemmatization happens. word = token.get(TextAnnotation.class); } termAttribute.setLength(0); termAttribute.append(word); // NER or part of speech annotation String pos = token.get(NamedEntityTagAnnotation.class); pos = (pos == null || "O".equals(pos)) ? token.get(PartOfSpeechAnnotation.class) : pos; typeAttribute.setType(pos != null ? pos : TypeAttribute.DEFAULT_TYPE); // Token character offsets int be = token.get(CharacterOffsetBeginAnnotation.class).intValue(); int en = token.get(CharacterOffsetEndAnnotation.class).intValue(); offsetAttribute.setOffset(be, en); // Token in-document position increment: positionAttribute.setPositionIncrement(1 + skippedTokens); skippedTokens = 0; return true; }
Example #2
Source File: CoreNLPNamedEntityRecognizer.java From Heracles with GNU General Public License v3.0 | 5 votes |
/** * Process the Dataset in chunks, as defined by the <code>spanType</code> parameter. * The Spans denoted by spanType must each contain Words belonging to a single sentence. * */ @Override public void validatedProcess(Dataset dataset, String spanTypeOfSentenceUnit){ Properties prop1 = new Properties(); prop1.setProperty("annotators", "ner"); StanfordCoreNLP pipeline = new StanfordCoreNLP(prop1, false); for (Span span : dataset.getSpans(spanTypeOfSentenceUnit)){ HashMap<Integer, Word> wordIndex = new HashMap<>(); Annotation a = CoreNLPHelper.reconstructStanfordAnnotations(span, wordIndex); if (a == null){ System.out.println(a); } pipeline.annotate(a); List<CoreMap> sentenceAnnotations = a.get(SentencesAnnotation.class); for (CoreMap sentence : sentenceAnnotations){ for (CoreLabel token: sentence.get(TokensAnnotation.class)) { Word w = wordIndex.get(token.get(CharacterOffsetBeginAnnotation.class)); String ner = token.get(NamedEntityTagAnnotation.class); String nerValue = token.get(NormalizedNamedEntityTagAnnotation.class); if (ner!=null) w.putAnnotation("nerLabel", ner); if (nerValue!=null) w.putAnnotation("nerValue", nerValue); // System.out.println(w.getAnnotations()); } } } }
Example #3
Source File: Entity.java From NLIWOD with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Checks if there is an entity of the specified type in the question.
 *
 * @param entityType an entity type: Date, Location, Organization, Person, Percent, or Money
 * @param question the question text to annotate and inspect
 * @return the name of the type if an entity of that type is present,
 *         otherwise "No" + the name of the type
 */
protected String recognizeEntity(String entityType, String question){
    // Upper-case once, and with a fixed locale: the default-locale variant can yield
    // a string that never equals the NER tag (e.g. dotted capital I under a Turkish locale).
    // Fully qualified so the block does not rely on an import being present.
    final String wantedTag = entityType.toUpperCase(java.util.Locale.ROOT);

    Annotation annotation = new Annotation(question);
    pipeline.annotate(annotation);

    List<CoreMap> sentences = annotation.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            String ne = token.get(NamedEntityTagAnnotation.class);
            if (wantedTag.equals(ne)) {
                // One match decides the result; no need to scan the remaining tokens.
                return entityType;
            }
        }
    }
    return "No" + entityType;
}
Example #4
Source File: WiseOwlStanfordFilter.java From wiseowl with MIT License | 4 votes |
public static void main(String args[]) { Properties props = new Properties(); props.setProperty("annotators", "tokenize, cleanxml, ssplit,pos,lemma,ner"); StanfordCoreNLP pipeline = new StanfordCoreNLP(props); pipeline.addAnnotator(new TimeAnnotator("sutime", props)); String text = "<mydata> refeer</mydata>today is 12 jan 2016. what is tommorow? Who is Avtar? Does he work at Apple or Google? Sumit was born on 13 feb,2011."; Annotation document = new Annotation(text); pipeline.annotate(document); System.out.println(document.get(CoreAnnotations.TextAnnotation.class)); List<CoreMap> timexAnnsAll = document.get(TimeAnnotations.TimexAnnotations.class); for (CoreMap cm : timexAnnsAll) { List<CoreLabel> tokens = cm.get(CoreAnnotations.TokensAnnotation.class); TimeData td=new TimeData(); td.setTime(cm.get(TimeExpression.Annotation.class).getTemporal().toISOString()); td.setStart(tokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class)); td.setEnd(tokens.get(tokens.size() - 1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class)); } List<CoreMap> sentences = document.get(SentencesAnnotation.class); for(CoreMap sentence: sentences) { // traversing the words in the current sentence // a CoreLabel is a CoreMap with additional token-specific methods System.out.println("in sent"); for (CoreLabel token: sentence.get(TokensAnnotation.class)) { // this is the text of the token System.out.println("in token"); String word = token.get(TextAnnotation.class); // this is the POS tag of the token String pos = token.get(PartOfSpeechAnnotation.class); // this is the NER label of the token String ne = token.get(NamedEntityTagAnnotation.class); System.out.println("word : "+word+" pos: "+pos+" ner: "+ne); } } }
Example #5
Source File: WiseOwlStanfordFilter.java From wiseowl with MIT License | 4 votes |
/**
 * Reads the next term from {@code input}, runs it through {@code pipeline}, and queues the results:
 * one {@code TimeData} per time expression into {@code timeQueue} and one {@code TokenData}
 * per token into {@code tokenQueue}. Afterwards {@code itr_cpy} is re-created and
 * {@code tokenOffset}, {@code start} and {@code end} are reset to 0.
 *
 * @return an iterator over {@code tokenQueue}, or {@code null} when {@code input} has no further token
 * @throws IOException if advancing {@code input} fails
 */
public Iterator findTokens() throws IOException {
    if (!input.incrementToken()) {
        return null;
    }

    // The current term of the wrapped stream is the text to annotate.
    String text = input.getAttribute(CharTermAttribute.class).toString();
    Annotation document = new Annotation(text);
    pipeline.annotate(document);

    // Queue every time expression together with its character span.
    List<CoreMap> timeExpressions = document.get(TimeAnnotations.TimexAnnotations.class);
    for (CoreMap timeExpression : timeExpressions) {
        List<CoreLabel> timexTokens = timeExpression.get(CoreAnnotations.TokensAnnotation.class);
        TimeData td = new TimeData();
        td.setTime(timeExpression.get(TimeExpression.Annotation.class).getTemporal().toString());
        td.setStart(timexTokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
        td.setEnd(timexTokens.get(timexTokens.size() - 1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
        timeQueue.add(td);
    }

    // Queue every token with its text, POS tag and NER label.
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        List<CoreLabel> sentenceTokens = sentence.get(TokensAnnotation.class);
        for (CoreLabel token : sentenceTokens) {
            System.out.println("in token");
            String word = token.get(TextAnnotation.class);
            String pos = token.get(PartOfSpeechAnnotation.class);
            String ne = token.get(NamedEntityTagAnnotation.class);

            TokenData tok = new TokenData();
            tok.setNER(ne);
            tok.setToken(word);
            tok.setPOS(pos);
            tokenQueue.add(tok);
        }
    }

    // Hand out one iterator and keep a second, independent one in itr_cpy.
    Iterator<TokenData> result = tokenQueue.iterator();
    itr_cpy = tokenQueue.iterator();
    tokenOffset = 0;
    start = 0;
    end = 0;
    return result;
}
Example #6
Source File: CoreNLPHelper.java From Heracles with GNU General Public License v3.0 | 4 votes |
public static Annotation reconstructStanfordAnnotations(Span sentenceSpan, HashMap<Integer, Word> wordIndex, boolean useWordOrderInsteadOfOffset){ String originalText = sentenceSpan.getAnnotation("text", String.class); Annotation a = new Annotation(originalText); a.set(TextAnnotation.class, originalText); //a.set(DocIDAnnotation.class, "document"); List<CoreMap> sentenceAnnotations = new ArrayList<CoreMap>(); a.set(SentencesAnnotation.class, sentenceAnnotations); List<CoreLabel> tokenAnnotations = new ArrayList<CoreLabel>(); a.set(TokensAnnotation.class, tokenAnnotations); ArrayCoreMap sentenceAnnotation = new ArrayCoreMap(); sentenceAnnotations.add(sentenceAnnotation); // int startOffset = sentenceSpan.first().getStartOffset(); for (Word w : sentenceSpan){ CoreLabel c = new CoreLabel(); c.set(TextAnnotation.class, w.getWord()); c.set(OriginalTextAnnotation.class, w.getWord()); c.set(ValueAnnotation.class, w.getWord()); c.set(CharacterOffsetBeginAnnotation.class, w.getStartOffset()); c.set(CharacterOffsetEndAnnotation.class, w.getEndOffset()); c.set(IndexAnnotation.class, w.getOrder()+1); // c.setIndex(w.getOrder()); c.set(SentenceIndexAnnotation.class, 0); // c.setSentIndex(0); c.set(DocIDAnnotation.class, "document"); c.setDocID("document"); if (w.hasAnnotation("pos")) c.set(PartOfSpeechAnnotation.class, w.getAnnotation("pos",String.class)); if (w.hasAnnotation("lemma")) c.set(LemmaAnnotation.class, w.getAnnotation("lemma", String.class)); if (w.hasAnnotation("nerLabel")) c.set(NamedEntityTagAnnotation.class, w.getAnnotation("nerLabel", String.class)); if (w.hasAnnotation("nerValue")) c.set(NormalizedNamedEntityTagAnnotation.class, w.getAnnotation("nerValue", String.class)); tokenAnnotations.add(c); if (useWordOrderInsteadOfOffset){ wordIndex.put(w.getOrder(), w); } else { wordIndex.put(w.getStartOffset(), w); } } //essential sentence annotation: TokensAnnotation sentenceAnnotation.set(TokensAnnotation.class, tokenAnnotations); //essential sentence annotation: 
TextAnnotation sentenceAnnotation.set(TextAnnotation.class, originalText); //essential sentence annotation: SentenceIndexAnnotation sentenceAnnotation.set(SentenceIndexAnnotation.class, 0); sentenceAnnotation.set(CharacterOffsetBeginAnnotation.class, 0); sentenceAnnotation.set(CharacterOffsetEndAnnotation.class, sentenceSpan.last().getEndOffset()); sentenceAnnotation.set(TokenBeginAnnotation.class, 0); sentenceAnnotation.set(TokenEndAnnotation.class, sentenceSpan.last().getOrder()); return a; }
Example #7
Source File: JsonPipeline.java From tac2015-event-detection with GNU General Public License v3.0 | 4 votes |
/**
 * Adds the output of one annotator to the sentence object.
 * "annotator" is a Stanford CoreNLP notion.
 *
 * @param sent_info the map receiving the annotator output for this sentence
 * @param sentence the annotated sentence to read from
 * @param annotator the annotator name
 * @throws RuntimeException for annotators that are not implemented yet ("TODO") or not known
 */
void addAnnoToSentenceObject(Map<String,Object> sent_info, CoreMap sentence, String annotator) {
    switch (annotator) {
        // These annotators add nothing to the sentence object here.
        case "tokenize":
        case "cleanxml":
        case "ssplit":
        case "dcoref":
            break;

        case "pos":
            addTokenAnno(sent_info, sentence, "pos", PartOfSpeechAnnotation.class);
            break;
        case "lemma":
            addTokenAnno(sent_info, sentence, "lemmas", LemmaAnnotation.class);
            break;
        case "ner":
            addTokenAnno(sent_info, sentence, "ner", NamedEntityTagAnnotation.class);
            addTokenAnno(sent_info, sentence, "normner", NormalizedNamedEntityTagAnnotation.class);
            break;
        case "regexner":
            addTokenAnno(sent_info, sentence, "ner", NamedEntityTagAnnotation.class);
            break;

        case "parse":
            addParseTree(sent_info, sentence);
            addDepsCC(sent_info, sentence);
            addDepsBasic(sent_info, sentence);
            break;
        case "depparse":
            addDepsCC(sent_info, sentence);
            addDepsBasic(sent_info, sentence);
            break;
        case "entitymentions":
            addEntityMentions(sent_info, sentence);
            break;

        // Not implemented yet.
        case "sentiment":
        case "truecase":
        case "relation":
        case "natlog":
        case "quote":
            throw new RuntimeException("TODO");

        default:
            throw new RuntimeException("don't know how to handle annotator " + annotator);
    }
}
Example #8
Source File: CoreNLPToJSON.java From phrasal with GNU General Public License v3.0 | 4 votes |
/** * Process an English text file. * * @param args * @throws IOException */ public static void main(String[] args) throws IOException { if (args.length < 1) { System.err.printf("Usage: java %s file [inputproperties_str] > json_output%n", CoreNLPToJSON.class.getName()); System.exit(-1); } String textFile = args[0]; InputProperties inputProperties = args.length > 1 ? InputProperties.fromString(args[1]) : new InputProperties(); StanfordCoreNLP coreNLP = new StanfordCoreNLP(properties); // Configure tokenizer EnglishPreprocessor preprocessor = new EnglishPreprocessor(true); // Use a map with ordered keys so that the output is ordered by segmentId. Map<Integer,SourceSegment> annotations = new TreeMap<Integer,SourceSegment>(); LineNumberReader reader = IOTools.getReaderFromFile(textFile); for (String line; (line = reader.readLine()) != null;) { Annotation annotation = coreNLP.process(line); List<CoreMap> sentences = annotation.get(SentencesAnnotation.class); if (sentences.size() != 1) { throw new RuntimeException("Sentence splitting on line: " + String.valueOf(reader.getLineNumber())); } CoreMap sentence = sentences.get(0); Tree tree = sentence.get(TreeAnnotation.class); tree.indexLeaves(); int[] chunkVector = getChunkVector(tree); List<CoreLabel> tokens = sentence.get(TokensAnnotation.class); int numTokens = tokens.size(); SymmetricalWordAlignment alignment = preprocessor.processAndAlign(line); if (alignment.e().size() != numTokens) { throw new RuntimeException(String.format("Tokenizer configurations differ: %d/%d", alignment.e().size(), numTokens)); } SourceSegment segment = new SourceSegment(numTokens); segment.layoutSpec.addAll(makeLayoutSpec(alignment)); segment.inputProperties = inputProperties.toString(); for (int j = 0; j < numTokens; ++j) { CoreLabel token = tokens.get(j); String word = token.get(TextAnnotation.class); segment.tokens.add(unescape(word)); String pos = mapPOS(token.get(PartOfSpeechAnnotation.class)); segment.pos.add(pos); String ne = 
token.get(NamedEntityTagAnnotation.class); segment.ner.add(ne); segment.chunkVector[j] = chunkVector[j]; } annotations.put(reader.getLineNumber()-1, segment); } reader.close(); System.err.printf("Processed %d sentences%n", reader.getLineNumber()); final SourceDocument jsonDocument = new SourceDocument(textFile, annotations); // Convert to json Gson gson = new Gson(); String json = gson.toJson(jsonDocument); System.out.println(json); }