Java Code Examples for edu.stanford.nlp.pipeline.Annotation#get()
The following examples show how to use edu.stanford.nlp.pipeline.Annotation#get().
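Before the examples, here is a minimal, self-contained sketch of the pattern they all share: an Annotation is a typesafe map, and Annotation#get() retrieves a value by its annotation class key. The pipeline configuration and input sentence below are illustrative only, not taken from any of the projects listed.

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

import java.util.List;
import java.util.Properties;

public class AnnotationGetDemo {
    public static void main(String[] args) {
        // Build a small pipeline; the annotator list here is illustrative.
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit, pos");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        // Create an Annotation from raw text and run the pipeline on it.
        Annotation document = new Annotation("Stanford University is located in California.");
        pipeline.annotate(document);

        // Annotation#get() is keyed by an annotation class: here, the sentence list.
        List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
        for (CoreMap sentence : sentences) {
            // Each sentence is itself a CoreMap, queried with the same get() idiom.
            for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
                System.out.println(token.word() + "/"
                        + token.get(CoreAnnotations.PartOfSpeechAnnotation.class));
            }
        }
    }
}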
Example 1
Source File: StanfordCoref.java From Graphene with GNU General Public License v3.0
@Override
public CoreferenceContent doCoreferenceResolution(String text) {
    Annotation document = new Annotation(text);
    PIPELINE.annotate(document);

    // extract sentences
    List<Sentence> sentences = new ArrayList<>();
    for (CoreMap coreMap : document.get(CoreAnnotations.SentencesAnnotation.class)) {
        Sentence sentence = new Sentence();
        for (CoreLabel coreLabel : coreMap.get(CoreAnnotations.TokensAnnotation.class)) {
            sentence.addWord(coreLabel.word());
        }
        sentences.add(sentence);
    }

    // replace coreferences
    for (CorefChain cc : document.get(CorefCoreAnnotations.CorefChainAnnotation.class).values()) {
        String coreMention = cc.getRepresentativeMention().mentionSpan;
        for (CorefChain.CorefMention corefMention : cc.getMentionsInTextualOrder()) {
            sentences.get(corefMention.sentNum - 1).replaceWords(
                    corefMention.startIndex - 1,
                    corefMention.endIndex - 1,
                    getReplacement(corefMention.mentionSpan, coreMention));
        }
    }

    return new CoreferenceContent(text,
            sentences.stream().map(s -> s.toString()).collect(Collectors.joining(" ")));
}
Example 2
Source File: StanfordTokenizer.java From ambiverse-nlu with Apache License 2.0
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
    String text = aJCas.getDocumentText();
    Annotation document = new Annotation(text);

    StanfordCoreNLP stanfordCoreNLP;
    if (!languageMap.containsKey(aJCas.getDocumentLanguage())) {
        throw new AnalysisEngineProcessException(new LanguageNotSupportedException("Language Not Supported"));
    }
    stanfordCoreNLP = stanfordCoreNLPs[languageMap.get(aJCas.getDocumentLanguage())];
    stanfordCoreNLP.annotate(document);

    List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        int sstart = sentence.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
        int ssend = sentence.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
        Sentence jsentence = new Sentence(aJCas, sstart, ssend);
        jsentence.addToIndexes();

        for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
            Token casToken = new Token(aJCas,
                    token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class),
                    token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
            casToken.addToIndexes();
        }
    }
}
Example 3
Source File: CoreNLP.java From gAnswer with BSD 3-Clause "New" or "Revised" License
/**
 * How to use:
 * for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
 *     // this is the text of the token
 *     String word = token.get(TextAnnotation.class);
 *     // this is the POS tag of the token
 *     String pos = token.get(PartOfSpeechAnnotation.class);
 * }
 * @param s
 * @return
 */
public CoreMap getPOS(String s) {
    // create an empty Annotation just with the given text
    Annotation document = new Annotation(s);
    // run all Annotators on this text
    pipeline_lemma.annotate(document);
    // these are all the sentences in this document
    // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        // this is the sentence with POS tags
        return sentence;
    }
    return null;
}
Example 4
Source File: CoreNLP.java From gAnswer with BSD 3-Clause "New" or "Revised" License
public Tree getParseTree(String text) {
    // create an empty Annotation just with the given text
    Annotation document = new Annotation(text);
    // run all Annotators on this text
    pipeline_lemma.annotate(document);
    // these are all the sentences in this document
    // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        // this is the parse tree of the current sentence
        return sentence.get(TreeAnnotation.class);
    }
    return null;
}
Example 5
Source File: Chapter5.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License
private static void usingStanfordPOSTagger() {
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, pos");
    props.put("pos.model", "C:\\Current Books in Progress\\NLP and Java\\Models\\english-caseless-left3words-distsim.tagger");
    props.put("pos.maxlen", 10);
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation document = new Annotation(theSentence);
    pipeline.annotate(document);

    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            String word = token.get(TextAnnotation.class);
            String pos = token.get(PartOfSpeechAnnotation.class);
            System.out.print(word + "/" + pos + " ");
        }
        System.out.println();

        try {
            pipeline.xmlPrint(document, System.out);
            pipeline.prettyPrint(document, System.out);
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }
}
Example 6
Source File: StopwordAnnotatorTest.java From coreNlp with Apache License 2.0
/**
 * Test to validate that stopwords are properly annotated in the token list
 * @throws Exception
 */
@org.junit.Test
public void testLuceneStopwordList() throws Exception {
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, stopword");
    props.setProperty("customAnnotatorClass.stopword", "intoxicant.analytics.coreNlp.StopwordAnnotator");

    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation document = new Annotation(example);
    pipeline.annotate(document);
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);

    // get the standard lucene stopword set
    Set<?> stopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    for (CoreLabel token : tokens) {
        // get the stopword annotation
        Pair<Boolean, Boolean> stopword = token.get(StopwordAnnotator.class);

        String word = token.word().toLowerCase();
        if (stopWords.contains(word)) {
            assertTrue(stopword.first());
        } else {
            assertFalse(stopword.first());
        }

        // not checking lemma, so always false
        assertFalse(stopword.second());
    }
}
Example 7
Source File: ItalianTokenizerAnnotator.java From tint with GNU General Public License v3.0
/**
 * Given an Annotation, perform a task on this Annotation.
 *
 * @param annotation
 */
@Override
public void annotate(Annotation annotation) {
    String text = annotation.get(CoreAnnotations.TextAnnotation.class);
    List<List<CoreLabel>> sTokens = tokenizer
            .parse(text, newlineIsSentenceBreak, tokenizeOnlyOnSpace, ssplitOnlyOnNewLine);
    Utils.addBasicAnnotations(annotation, sTokens, text);
}
Example 8
Source File: CoreNLPLemmatizer.java From Heracles with GNU General Public License v3.0
/**
 * Process the Dataset in chunks, as defined by the <code>spanType</code> parameter.
 * The Spans denoted by spanType must each contain Words belonging to a single sentence.
 */
@Override
public void validatedProcess(Dataset dataset, String spanTypeOfSentenceUnit) {
    Properties prop1 = new Properties();
    prop1.setProperty("annotators", "lemma");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(prop1, false);

    for (Span span : dataset.getSpans(spanTypeOfSentenceUnit)) {
        HashMap<Integer, Word> wordIndex = new HashMap<>();
        Annotation a = CoreNLPHelper.reconstructStanfordAnnotations(span, wordIndex);
        if (a == null) {
            System.out.println(a);
        }
        pipeline.annotate(a);

        List<CoreMap> sentenceAnnotations = a.get(SentencesAnnotation.class);
        for (CoreMap sentence : sentenceAnnotations) {
            for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
                Word w = wordIndex.get(token.get(CharacterOffsetBeginAnnotation.class));
                String tempLemma = token.get(LemmaAnnotation.class);
                w.putAnnotation("lemma", tempLemma.toLowerCase());
                // System.out.println(w.getAnnotations());
            }
        }
    }
}
Example 9
Source File: CoreNlpExample.java From core-nlp-example with MIT License
public static void main(String[] args) {
    // creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    // read some text in the text variable
    String text = "What is the Weather in Bangalore right now?";

    // create an empty Annotation just with the given text
    Annotation document = new Annotation(text);

    // run all Annotators on this text
    pipeline.annotate(document);

    List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        // traversing the words in the current sentence
        // a CoreLabel is a CoreMap with additional token-specific methods
        for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
            // this is the text of the token
            String word = token.get(CoreAnnotations.TextAnnotation.class);
            // this is the POS tag of the token
            String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
            // this is the NER label of the token
            String ne = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
            System.out.println(String.format("Print: word: [%s] pos: [%s] ne: [%s]", word, pos, ne));
        }
    }
}
Example 10
Source File: StopwordAnnotatorTest.java From coreNlp with Apache License 2.0
/**
 * Test to validate that the custom stopword list works
 * @throws Exception
 */
@org.junit.Test
public void testCustomStopwordList() throws Exception {
    // setup coreNlp properties for stopwords. Note the custom stopword list property
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, stopword");
    props.setProperty("customAnnotatorClass.stopword", "intoxicant.analytics.coreNlp.StopwordAnnotator");
    props.setProperty(StopwordAnnotator.STOPWORDS_LIST, customStopWordList);

    // get the custom stopword set
    Set<?> stopWords = StopwordAnnotator.getStopWordList(Version.LUCENE_36, customStopWordList, true);

    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation document = new Annotation(example);
    pipeline.annotate(document);
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);

    for (CoreLabel token : tokens) {
        // get the stopword annotation
        Pair<Boolean, Boolean> stopword = token.get(StopwordAnnotator.class);

        String word = token.word().toLowerCase();
        if (stopWords.contains(word)) {
            assertTrue(stopword.first());
        } else {
            assertFalse(stopword.first());
        }

        // not checking lemma, so always false
        assertFalse(stopword.second());
    }
}
Example 11
Source File: SentimentAnalyzer.java From computoser with GNU Affero General Public License v3.0
/**
 * Synchronized method to obtain the sentiment of the set of documents.
 * Synchronization is fine, because the method is invoked via a scheduled job
 * and only one execution at a time is permitted.
 * That also allows the loading of the model to be optimized.
 * @param documents
 * @return
 */
public synchronized SentimentResult getSentiment(Set<String> documents, TimelineMusic meta) {
    double sentimentSum = 0;
    for (String document : documents) {
        int mainSentiment = 0;
        if (document != null && document.length() > 0) {
            int longest = 0;
            try {
                Annotation annotation = pipeline.process(document);
                // mainSentiment is the sentiment of the whole document. We find
                // the whole document by comparing the length of individual
                // annotated "fragments"
                for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
                    Tree tree = sentence.get(SentimentCoreAnnotations.AnnotatedTree.class);
                    int sentiment = RNNCoreAnnotations.getPredictedClass(tree);
                    String partText = sentence.toString();
                    if (partText.length() > longest) {
                        mainSentiment = sentiment;
                        longest = partText.length();
                    }
                }
            } catch (Exception ex) {
                logger.error("Problem analyzing document sentiment. " + document, ex);
                continue;
            }
        }
        sentimentSum += mainSentiment;
    }

    double average = sentimentSum / documents.size();
    meta.setAverageSentiment(average);

    if (average >= 2.25) {
        return SentimentResult.POSITIVE;
    } else if (average <= 1.75) {
        return SentimentResult.NEGATIVE;
    }
    return SentimentResult.NEUTRAL;
}
Example 12
Source File: SplitSentences.java From tint with GNU General Public License v3.0
public static void main(String[] args) {
    try {
        final CommandLine cmd = CommandLine
                .parser()
                .withName("./annotate-sentences")
                .withHeader("Annotate sentences")
                .withOption("i", "input", "Input file", "FILE", CommandLine.Type.FILE_EXISTING, true, false, true)
                .withOption("o", "output", "Output file", "FILE", CommandLine.Type.FILE_EXISTING, true, false, true)
                .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);

        File input = cmd.getOptionValue("input", File.class);
        File output = cmd.getOptionValue("output", File.class);

        String text = new String(Files.readAllBytes(input.toPath()), Charsets.UTF_8);
        BufferedWriter writer = new BufferedWriter(new FileWriter(output));

        Properties props = new Properties();
        props.setProperty("annotators", "ita_toksent");
        props.setProperty("customAnnotatorClass.ita_toksent",
                "eu.fbk.dh.tint.tokenizer.annotators.ItalianTokenizerAnnotator");

        StanfordCoreNLP ITApipeline = new StanfordCoreNLP(props);
        Annotation annotation = new Annotation(text);
        ITApipeline.annotate(annotation);

        List<CoreMap> sents = annotation.get(CoreAnnotations.SentencesAnnotation.class);
        for (CoreMap thisSent : sents) {
            writer.append(thisSent.get(CoreAnnotations.TextAnnotation.class)).append("\n");
        }
        writer.close();
    } catch (Exception e) {
        CommandLine.fail(e);
    }
}
Example 13
Source File: UPosAnnotator.java From tint with GNU General Public License v3.0
@Override
public void annotate(Annotation annotation) {
    for (CoreLabel token : annotation.get(CoreAnnotations.TokensAnnotation.class)) {
        String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
        String[] parts = pos.split("\\+");

        StringBuffer upos = new StringBuffer();
        for (String part : parts) {
            String thisPos = uposMap.getOrDefault(part, DEFAULT_UPOS);
            upos.append("+").append(thisPos);
        }

        token.set(CustomAnnotations.UPosAnnotation.class, upos.substring(1));
    }
}
Example 14
Source File: CoreNlpTokenizer.java From jstarcraft-nlp with Apache License 2.0
@Override
public Iterable<CoreNlpToken> tokenize(CharSequence text) {
    Iterable<CoreLabel> iterator;
    if (StringUtility.isBlank(text)) {
        // blank input needs no tokenization
        iterator = Collections.EMPTY_LIST;
    } else {
        Annotation annotation = new Annotation(text.toString());
        annotator.annotate(annotation);
        iterator = annotation.get(CoreAnnotations.TokensAnnotation.class);
    }
    CoreNlpToken iterable = new CoreNlpToken(iterator.iterator());
    return iterable;
}
Example 15
Source File: WiseOwlStanfordFilter.java From wiseowl with MIT License
public Iterator findTokens() throws IOException {
    /*
    char[] c = new char[256];
    int sz = 0;
    StringBuilder b = new StringBuilder();
    while ((sz = input.read(c)) >= 0) {
        b.append(c, 0, sz);
    }
    */
    //String text = b.toString();
    if (!input.incrementToken()) {
        return null;
    }
    String text;
    text = input.getAttribute(CharTermAttribute.class).toString();
    // read some text in the text variable
    //System.out.println("before annotation");
    Annotation document = new Annotation(text);
    // these are all the sentences in this document
    // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
    pipeline.annotate(document);

    List<CoreMap> timexAnnsAll = document.get(TimeAnnotations.TimexAnnotations.class);
    for (CoreMap cm : timexAnnsAll) {
        List<CoreLabel> tokens = cm.get(CoreAnnotations.TokensAnnotation.class);
        TimeData td = new TimeData();
        td.setTime(cm.get(TimeExpression.Annotation.class).getTemporal().toString());
        td.setStart(tokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
        td.setEnd(tokens.get(tokens.size() - 1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
        timeQueue.add(td);
    }

    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    //System.out.println("after annotation and sentence getting" + sentences.size());
    for (CoreMap sentence : sentences) {
        // traversing the words in the current sentence
        // a CoreLabel is a CoreMap with additional token-specific methods
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            // this is the text of the token
            System.out.println("in token");
            String word = token.get(TextAnnotation.class);
            // this is the POS tag of the token
            String pos = token.get(PartOfSpeechAnnotation.class);
            // this is the NER label of the token
            String ne = token.get(NamedEntityTagAnnotation.class);
            // System.out.println("word : " + word + " pos: " + pos + " ner: " + ne);
            TokenData tok = new TokenData();
            tok.setNER(ne);
            tok.setToken(word);
            tok.setPOS(pos);
            tokenQueue.add(tok);
        }
    }

    Iterator<TokenData> it = tokenQueue.iterator();
    itr_cpy = tokenQueue.iterator();
    tokenOffset = 0;
    start = 0;
    end = 0;
    return it;
}
Example 16
Source File: QueryAnswerTypeAnalyzer.java From NLIWOD with GNU Affero General Public License v3.0
@Override
public Object analyze(String q) {
    log.debug("String question: " + q);

    // some cases are resolved through the first word of the question
    if (q.startsWith("Where ") || q.startsWith("In ")) return "DBpedia:Place";
    if (q.startsWith("How ")) return "Number";
    if (q.startsWith("When ")) return "Schema:Date";
    if (q.startsWith("Who ")) return "DBpedia:Person";
    if (QuestionTypeAnalyzer.isASKQuestion(q)) return "Boolean";

    Annotation annotation = new Annotation(q);
    PIPELINE.annotate(annotation);
    List<CoreMap> question = annotation.get(CoreAnnotations.SentencesAnnotation.class);

    // get all nouns, verbs, adjectives
    List<String> verbs = getWords(question, "V");
    List<String> nouns = getWords(question, "N");
    List<String> adjectives = getWords(question, "JJ");

    // get all properties for the nouns, verbs, adjectives
    Map<String, List<String>> properties = new LinkedHashMap<>();
    getProperties(properties, verbs);
    getProperties(properties, nouns);
    getProperties(properties, adjectives);

    // query all ranges for the properties and put them in a list
    ArrayList<String> ranges = new ArrayList<String>();
    for (String key : properties.keySet()) {
        for (String r : properties.get(key)) {
            String answer = queryRange(r);
            ranges.add(answer);
        }
    }

    // find the most common range
    String range = mostCommon(ranges);

    // set the answertype depending on the uri (xml schema, ontology etc.)
    if (range.contains("http://dbpedia.org/ontology/")) {
        return range.replace("http://dbpedia.org/ontology/", "DBpedia:");
    } else if (range.contains("http://www.w3.org/2001/XMLSchema#")) {
        if (range.toLowerCase().contains("double") || range.toLowerCase().contains("integer")) {
            return "Number";
        }
        range = range.replace("http://www.w3.org/2001/XMLSchema#", "");
        range = range.substring(0, 1).toUpperCase() + range.substring(1);
        return "Schema:" + range;
    } else if (range.contains("http://www.w3.org/1999/02/22-rdf-syntax-ns#langString")) {
        return "Schema:String";
    }
    return "Misc";
}
Example 17
Source File: CoreNLPDependencyParser.java From Heracles with GNU General Public License v3.0
@Override
public void validatedProcess(Dataset dataset, String spanTypeOfSentenceUnit) {
    Properties prop1 = new Properties();
    prop1.setProperty("annotators", "depparse");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(prop1, false);

    for (Span span : dataset.getSpans(spanTypeOfSentenceUnit)) {
        HashMap<Integer, Word> wordIndex = new HashMap<>();
        Annotation a = CoreNLPHelper.reconstructStanfordAnnotations(span, wordIndex);
        // Main.debug(span.toString());
        pipeline.annotate(a);

        for (CoreMap sentence : a.get(SentencesAnnotation.class)) {
            // per sentence, get the dependencies
            SemanticGraph dependencies = sentence.get(EnhancedPlusPlusDependenciesAnnotation.class);
            for (TypedDependency td : dependencies.typedDependencies()) {
                // Main.debug(td.toString());
                String relationType = td.reln().getLongName();
                Word dep = wordIndex.get(td.dep().beginPosition());
                DataEntity gov = wordIndex.get(td.gov().beginPosition());
                if (gov == null) {
                    // this is the root, link to sentence
                    gov = span;
                }
                if (dep == null || gov == null) {
                    Framework.debug(td.toString());
                    Framework.debug(td.dep().beginPosition() + "\t" + td.gov().beginPosition());
                    Framework.debug(wordIndex.toString());
                }
                Relation rel = new Relation("deps", gov, dep);
                rel.putAnnotation("relationLongName", td.reln().getLongName());
                if (td.reln().getParent() != null)
                    rel.putAnnotation("relationParentShortName", td.reln().getParent().getShortName());
                rel.putAnnotation("relationShortName", td.reln().getShortName());
                // rel.putAnnotation("relationSpecific", td.reln().getSpecific());
                dep.getRelations().addRelationToParent(rel);
                gov.getRelations().addRelationToChild(rel);
            }
            // dependencies.prettyPrint();
        }
    }
}
Example 18
Source File: Chapter6.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License
private static void usingStanfordSentimentAnalysis() {
    String review = "An overly sentimental film with a somewhat "
            + "problematic message, but its sweetness and charm "
            + "are occasionally enough to approximate true depth "
            + "and grace. ";
    String sam = "Sam was an odd sort of fellow. Not prone to angry and "
            + "not prone to merriment. Overall, an odd fellow.";
    String mary = "Mary thought that custard pie was the best pie in the "
            + "world. However, she loathed chocolate pie.";

    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, parse, sentiment");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation annotation = new Annotation(review);
    pipeline.annotate(annotation);

    System.out.println("---sentimentText");
    String[] sentimentText = {"Very Negative", "Negative", "Neutral", "Positive", "Very Positive"};
    for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        Tree tree = sentence.get(SentimentCoreAnnotations.AnnotatedTree.class);
        System.out.println("---Number of children: " + tree.numChildren());
        System.out.println("[" + tree.getChild(0) + "][" + tree.getChild(1) + "]");
        tree.printLocalTree();
        int score = RNNCoreAnnotations.getPredictedClass(tree);
        System.out.println(sentimentText[score]);
    }

    // Classifier
    CRFClassifier crf = CRFClassifier.getClassifierNoExceptions(
            "C:/Current Books in Progress/NLP and Java/Models"
                    + "/english.all.3class.distsim.crf.ser.gz");
    String S1 = "Good afternoon Rajat Raina, how are you today?";
    String S2 = "I go to school at Stanford University, which is located in California.";
    System.out.println(crf.classifyToString(S1));
    System.out.println(crf.classifyWithInlineXML(S2));
    System.out.println(crf.classifyToString(S2, "xml", true));

    Object[] classification = crf.classify(S2).toArray();
    for (int i = 0; i < classification.length; i++) {
        System.out.println(classification[i]);
    }
}
Example 19
Source File: AnnotateLemma.java From tint with GNU General Public License v3.0
public static void main(String[] args) {
    try {
        final CommandLine cmd = CommandLine
                .parser()
                .withName("./annotate-lemmas")
                .withHeader("Annotate lemmas")
                .withOption("i", "input", "Input file", "FILE", CommandLine.Type.FILE_EXISTING, true, false, true)
                .withOption("o", "output", "Output file", "FILE", CommandLine.Type.FILE_EXISTING, true, false, true)
                .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);

        File input = cmd.getOptionValue("input", File.class);
        File output = cmd.getOptionValue("output", File.class);

        String text = new String(Files.readAllBytes(input.toPath()), Charsets.UTF_8);
        BufferedWriter writer = new BufferedWriter(new FileWriter(output));

        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit, pos, ita_morpho, ita_lemma");
        props.setProperty("tokenize.whitespace", "true");
        props.setProperty("ssplit.eolonly", "true");
        // props.setProperty("ita_toksent.newlineIsSentenceBreak", "1");
        props.setProperty("pos.model", "/Users/alessio/Documents/Resources/ita-models/italian5.tagger");
        props.setProperty("customAnnotatorClass.ita_toksent",
                "eu.fbk.dkm.pikes.tintop.ita.annotators.ItalianTokenizerAnnotator");
        props.setProperty("customAnnotatorClass.ita_lemma",
                "eu.fbk.dh.digimorph.annotator.DigiLemmaAnnotator");
        props.setProperty("customAnnotatorClass.ita_morpho",
                "eu.fbk.dh.digimorph.annotator.DigiMorphAnnotator");
        props.setProperty("ita_morpho.model", "/Users/alessio/Documents/Resources/ita-models/italian.db");

        StanfordCoreNLP ITApipeline = new StanfordCoreNLP(props);
        Annotation annotation = new Annotation(text);
        ITApipeline.annotate(annotation);
        System.out.println(ITApipeline.timingInformation());

        List<CoreMap> sents = annotation.get(CoreAnnotations.SentencesAnnotation.class);
        for (CoreMap thisSent : sents) {
            List<CoreLabel> tokens = thisSent.get(CoreAnnotations.TokensAnnotation.class);
            for (CoreLabel token : tokens) {
                writer.append(token.originalText().replaceAll("\\s+", ""))
                        .append("\t")
                        .append(token.get(CoreAnnotations.PartOfSpeechAnnotation.class))
                        .append("\t")
                        .append(token.get(CoreAnnotations.LemmaAnnotation.class))
                        .append("\n");
            }
            writer.append("\n");
        }
        writer.close();
    } catch (Exception e) {
        CommandLine.fail(e);
    }
}
Example 20
Source File: CoreNLPToJSON.java From phrasal with GNU General Public License v3.0
/**
 * Process an English text file.
 *
 * @param args
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    if (args.length < 1) {
        System.err.printf("Usage: java %s file [inputproperties_str] > json_output%n", CoreNLPToJSON.class.getName());
        System.exit(-1);
    }
    String textFile = args[0];
    InputProperties inputProperties = args.length > 1 ? InputProperties.fromString(args[1]) : new InputProperties();

    StanfordCoreNLP coreNLP = new StanfordCoreNLP(properties);

    // Configure tokenizer
    EnglishPreprocessor preprocessor = new EnglishPreprocessor(true);

    // Use a map with ordered keys so that the output is ordered by segmentId.
    Map<Integer, SourceSegment> annotations = new TreeMap<Integer, SourceSegment>();
    LineNumberReader reader = IOTools.getReaderFromFile(textFile);
    for (String line; (line = reader.readLine()) != null; ) {
        Annotation annotation = coreNLP.process(line);
        List<CoreMap> sentences = annotation.get(SentencesAnnotation.class);
        if (sentences.size() != 1) {
            throw new RuntimeException("Sentence splitting on line: " + String.valueOf(reader.getLineNumber()));
        }
        CoreMap sentence = sentences.get(0);
        Tree tree = sentence.get(TreeAnnotation.class);
        tree.indexLeaves();
        int[] chunkVector = getChunkVector(tree);
        List<CoreLabel> tokens = sentence.get(TokensAnnotation.class);
        int numTokens = tokens.size();
        SymmetricalWordAlignment alignment = preprocessor.processAndAlign(line);
        if (alignment.e().size() != numTokens) {
            throw new RuntimeException(String.format("Tokenizer configurations differ: %d/%d",
                    alignment.e().size(), numTokens));
        }
        SourceSegment segment = new SourceSegment(numTokens);
        segment.layoutSpec.addAll(makeLayoutSpec(alignment));
        segment.inputProperties = inputProperties.toString();
        for (int j = 0; j < numTokens; ++j) {
            CoreLabel token = tokens.get(j);
            String word = token.get(TextAnnotation.class);
            segment.tokens.add(unescape(word));
            String pos = mapPOS(token.get(PartOfSpeechAnnotation.class));
            segment.pos.add(pos);
            String ne = token.get(NamedEntityTagAnnotation.class);
            segment.ner.add(ne);
            segment.chunkVector[j] = chunkVector[j];
        }
        annotations.put(reader.getLineNumber() - 1, segment);
    }
    reader.close();
    System.err.printf("Processed %d sentences%n", reader.getLineNumber());

    final SourceDocument jsonDocument = new SourceDocument(textFile, annotations);

    // Convert to json
    Gson gson = new Gson();
    String json = gson.toJson(jsonDocument);
    System.out.println(json);
}