edu.stanford.nlp.ling.CoreLabel Java Examples
The following examples show how to use
edu.stanford.nlp.ling.CoreLabel.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: JavaClient.java From blog-codes with Apache License 2.0 | 6 votes |
/**
 * Demo client: sends a Chinese sentence to a CoreNLP server on localhost:9000
 * and prints each token of the first sentence with its character offsets.
 */
public static void main(String[] args) {
    // NOTE(review): the original comment claimed lemmatization and coreference, but
    // the annotators actually requested are: tokenize, ssplit, pos, ner, depparse, openie.
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,ner,depparse,openie");
    // Multi-language client; "zh" selects the Chinese models.
    // NOTE(review): 2 is presumably the number of concurrent connections and the two
    // nulls the API key/secret — confirm against MultiLangsStanfordCoreNLPClient.
    MultiLangsStanfordCoreNLPClient pipeline = new MultiLangsStanfordCoreNLPClient(props, "http://localhost", 9000, 2, null, null, "zh");
    // read some text in the text variable
    String text = "今天天气很好。";
    // create an empty Annotation just with the given text
    Annotation document = new Annotation(text);
    // run all Annotators on this text (server-side)
    pipeline.annotate(document);
    CoreMap firstSentence = document.get(CoreAnnotations.SentencesAnnotation.class).get(0);
    // this for loop will print out all of the tokens and the character offset info
    for (CoreLabel token : firstSentence.get(CoreAnnotations.TokensAnnotation.class)) {
        System.out.println(token.word() + "\t" + token.beginPosition() + "\t" + token.endPosition());
    }
}
Example #2
Source File: NERSearcher.java From Stargraph with MIT License | 6 votes |
/**
 * Post-processes raw NER output: merges consecutive entity tokens, optionally
 * reverses each entity's value, and hands non-empty results to the linker.
 */
private List<LinkedNamedEntity> postProcessFoundNamedEntities(List<List<CoreLabel>> sentences) {
    final List<List<LinkedNamedEntity>> merged = mergeConsecutiveNamedEntities(sentences);
    // Reverse each entity's value when configured to do so.
    if (this.reverseNameOrder) {
        for (List<LinkedNamedEntity> sentence : merged) {
            for (LinkedNamedEntity entity : sentence) {
                entity.reverseValue();
            }
        }
    }
    boolean nothingFound = merged.isEmpty() || (merged.size() == 1 && merged.get(0).isEmpty());
    if (nothingFound) {
        logger.trace(marker, "No Entities detected.");
        return Collections.emptyList();
    }
    return linkNamedEntities(merged);
}
Example #3
Source File: IntelKBPAnnotator.java From InformationExtraction with GNU General Public License v3.0 | 6 votes |
/**
 * Annotate all the pronominal mentions in the document.
 *
 * @param ann The document.
 * @return The list of pronominal mentions in the document.
 */
private static List<CoreMap> annotatePronominalMentions(Annotation ann) {
    List<CoreMap> pronouns = new ArrayList<>();
    List<CoreMap> sentences = ann.get(CoreAnnotations.SentencesAnnotation.class);
    for (int sentenceIndex = 0; sentenceIndex < sentences.size(); sentenceIndex++) {
        CoreMap sentence = sentences.get(sentenceIndex);
        // Document-level index of the sentence's first token; defaults to 0 when absent.
        Integer annoTokenBegin = sentence.get(CoreAnnotations.TokenBeginAnnotation.class);
        if (annoTokenBegin == null) { annoTokenBegin = 0; }
        List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
        for (int tokenIndex = 0; tokenIndex < tokens.size(); tokenIndex++) {
            CoreLabel token = tokens.get(tokenIndex);
            if (kbpIsPronominalMention(token)) {
                // Build a single-token chunk [tokenIndex, tokenIndex + 1) as a mention.
                CoreMap pronoun = ChunkAnnotationUtils.getAnnotatedChunk(tokens, tokenIndex, tokenIndex + 1, annoTokenBegin, null, CoreAnnotations.TextAnnotation.class, null);
                pronoun.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex);
                // NOTE(review): assumes the sentence already carries a MentionsAnnotation
                // list (i.e. a mentions annotator ran earlier); otherwise this NPEs — confirm.
                sentence.get(CoreAnnotations.MentionsAnnotation.class).add(pronoun);
                pronouns.add(pronoun);
            }
        }
    }
    return pronouns;
}
Example #4
Source File: MainTest.java From dependensee with GNU General Public License v2.0 | 6 votes |
/** Test of writeImage method, of class Main. */
@Test
public void testWriteImage() throws Exception {
    final String sentence = "A quick brown fox jumped over the lazy dog.";
    // Set up the parser and the grammatical-structure machinery.
    TreebankLanguagePack langPack = new PennTreebankLanguagePack();
    GrammaticalStructureFactory structureFactory = langPack.grammaticalStructureFactory();
    LexicalizedParser parser = LexicalizedParser.loadModel();
    parser.setOptionFlags(new String[]{"-maxLength", "500", "-retainTmpSubcategories"});
    // Tokenize, parse, and extract collapsed typed dependencies.
    TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> tokens = tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
    Tree parseTree = parser.apply(tokens);
    GrammaticalStructure grammaticalStructure = structureFactory.newGrammaticalStructure(parseTree);
    Collection<TypedDependency> dependencies = grammaticalStructure.typedDependenciesCollapsed();
    // Render and verify the output file exists.
    Main.writeImage(dependencies, "image.png", 3);
    assert (new File("image.png").exists());
}
Example #5
Source File: RegexNerTest.java From InformationExtraction with GNU General Public License v3.0 | 6 votes |
public static List<String> extractNER(String doc){ Annotation document = new Annotation(doc); pipeline.annotate(document); List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class); List<String> result = new ArrayList<String>(); for(CoreMap sentence: sentences) { // traversing the words in the current sentence // a CoreLabel is a CoreMap with additional token-specific methods for (CoreLabel token: sentence.get(CoreAnnotations.TokensAnnotation.class)) { // this is the text of the token String word = token.get(CoreAnnotations.TextAnnotation.class); // this is the POS tag of the token String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class); // this is the NER label of the token String ne = token.get(CoreAnnotations.NamedEntityTagAnnotation.class); result.add(ne); } } return result; }
Example #6
Source File: Chapter5.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 6 votes |
/**
 * Tags {@code theSentence} with a caseless POS model and prints word/POS pairs,
 * then dumps the annotated document as XML and pretty text.
 */
private static void usingStanfordPOSTagger() {
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, pos");
    props.put("pos.model", "C:\\Current Books in Progress\\NLP and Java\\Models\\english-caseless-left3words-distsim.tagger");
    // FIX: Properties values must be Strings — a boxed Integer is invisible to
    // getProperty(), so the original maxlen setting was silently ignored.
    props.put("pos.maxlen", "10");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation document = new Annotation(theSentence);
    pipeline.annotate(document);
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            String word = token.get(TextAnnotation.class);
            String pos = token.get(PartOfSpeechAnnotation.class);
            System.out.print(word + "/" + pos + " ");
        }
        System.out.println();
    }
    // FIX: moved out of the sentence loop — the original re-printed the entire
    // document once per sentence.
    try {
        pipeline.xmlPrint(document, System.out);
        pipeline.prettyPrint(document, System.out);
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}
Example #7
Source File: Chapter4.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 6 votes |
private static void usingStanfordNER() { String model = getModelDir() + "\\english.conll.4class.distsim.crf.ser.gz"; CRFClassifier<CoreLabel> classifier = CRFClassifier.getClassifierNoExceptions(model); String sentence = ""; for (String element : sentences) { sentence += element; } List<List<CoreLabel>> entityList = classifier.classify(sentence); for (List<CoreLabel> internalList : entityList) { for (CoreLabel coreLabel : internalList) { String word = coreLabel.word(); String category = coreLabel.get(CoreAnnotations.AnswerAnnotation.class); // System.out.println(word + ":" + category); if (!"O".equals(category)) { System.out.println(word + ":" + category); } } } }
Example #8
Source File: GalicianReadability.java From tint with GNU General Public License v3.0 | 6 votes |
/**
 * Assigns a difficulty level to a content word: 4 by default, overridden by
 * 3/2/1 when the lemma appears in the corresponding level list (level 1 wins
 * last). Also bumps the per-level word counters.
 */
@Override
public void addingContentWord(CoreLabel token) {
    super.addingContentWord(token);
    // Default: hardest level unless the lemma is found in an easier list below.
    token.set(ReadabilityAnnotations.DifficultyLevelAnnotation.class, 4);
    String tokenLemma = token.lemma();
    // Checks run from level 3 down to level 1, so the easiest matching level
    // ends up as the final annotation value. Counters are independent: a lemma
    // present in several lists increments each of them.
    if (model.getLevel3Lemmas().contains(tokenLemma)) {
        level3WordSize++;
        token.set(ReadabilityAnnotations.DifficultyLevelAnnotation.class, 3);
    }
    if (model.getLevel2Lemmas().contains(tokenLemma)) {
        level2WordSize++;
        token.set(ReadabilityAnnotations.DifficultyLevelAnnotation.class, 2);
    }
    if (model.getLevel1Lemmas().contains(tokenLemma)) {
        level1WordSize++;
        token.set(ReadabilityAnnotations.DifficultyLevelAnnotation.class, 1);
    }
}
Example #9
Source File: Postprocess.java From phrases with Apache License 2.0 | 6 votes |
/**
 * Runs a sentiment pipeline over each pattern's sentences.
 *
 * NOTE(review): this method looks unfinished — the predicted sentiment class
 * and per-token lemmas are computed but never stored or returned, and the
 * method always returns null instead of the processed patterns. Confirm intent
 * before relying on it.
 */
public List<Pattern> run(List<Pattern> patterns) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, parse, sentiment");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    for (Pattern pattern : patterns) {
        Annotation annotation = pipeline.process(pattern.toSentences());
        for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
            // Sentiment tree for the sentence; the predicted class is unused below.
            Tree tree = sentence.get(SentimentCoreAnnotations.AnnotatedTree.class);
            int sentiment = RNNCoreAnnotations.getPredictedClass(tree);
            for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
                // Lemma is fetched but discarded (see review note above).
                String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
            }
        }
    }
    return null;
}
Example #10
Source File: Main.java From dependensee with GNU General Public License v2.0 | 6 votes |
/**
 * Parses {@code sentence} with the bundled English PCFG model and renders the
 * resulting tree to {@code outFile} at the given scale. Prints an error and
 * returns silently if the model cannot be loaded.
 */
public static void writeImage(String sentence, String outFile, int scale) throws Exception {
    LexicalizedParser parser;
    try {
        parser = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    } catch (Exception e) {
        System.err.println("Could not load file englishPCFG.ser.gz. Try placing this file in the same directory as Dependencee.jar");
        return;
    }
    parser.setOptionFlags(new String[]{"-maxLength", "500", "-retainTmpSubcategories"});
    TokenizerFactory<CoreLabel> factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> tokens = factory.getTokenizer(new StringReader(sentence)).tokenize();
    writeImage(parser.apply(tokens), outFile, scale);
}
Example #11
Source File: Tokens.java From ambiverse-nlu with Apache License 2.0 | 6 votes |
public static Tokens getTokensFromJCas(JCas jCas) { Tokens tokens = new Tokens(); int s_number = 0; //DKPro does not give sentence index???????? int t_number = 0; for (Sentence sentence : select(jCas, Sentence.class)) { List<de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token> dktokens = selectCovered(jCas, de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token.class, sentence); for (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token t : dktokens) { CoreLabel taggedWord = CoreNlpUtils.tokenToWord( t); //This step should be avoided. Transform directly from DKPRO to AIDA TOKEN. Problem POS mappings. AIDA works with Stanford tags Token aidaToken = new Token(t_number, t.getCoveredText(), t.getBegin(), t.getEnd(), 0); aidaToken.setPOS(taggedWord.get(CoreAnnotations.PartOfSpeechAnnotation.class)); aidaToken.setSentence(s_number); tokens.addToken(aidaToken); t_number++; } s_number++; } return tokens; }
Example #12
Source File: ProcessorTools.java From phrasal with GNU General Public License v3.0 | 6 votes |
/**
 * Convert a raw/preprocessed String pair to a labeled sequence appropriate for training
 * the CRF-based post-processor.
 *
 * The SymmetricalWordAlignment is created by a Preprocessor. Source is the raw input, target is
 * the tokenized/pre-processed output.
 *
 * @return the character-level labeled sequence; unaligned source tokens are
 *         skipped with a warning on stderr
 */
public static List<CoreLabel> alignedPairToLabeledSequence(SymmetricalWordAlignment alignment) {
    // Initial capacity is a heuristic (target size * 7 characters per token).
    List<CoreLabel> sequence = new ArrayList<>(alignment.eSize() * 7);
    for (int i = 0; i < alignment.fSize(); ++i) {
        // Insert an explicit whitespace datum between consecutive source tokens
        // (but not before the first one).
        if (sequence.size() > 0) sequence.add(createDatum(WHITESPACE, Operation.Whitespace.toString(), sequence.size(), WHITESPACE, 0));
        String token = alignment.f().get(i).toString();
        Set<Integer> eAlignments = alignment.f2e(i);
        if (eAlignments.size() == 0) {
            // Unaligned source tokens cannot be labeled; warn and drop them.
            System.err.printf("%s: WARNING: discarding unaligned token (%s)%n", ProcessorTools.class.getName(), token);
        } else {
            // Collect all target tokens aligned to this source token, then expand
            // the pair into a per-character labeled sub-sequence.
            List<String> eTokens = new ArrayList<>(eAlignments.size());
            for (int j : eAlignments) {
                eTokens.add(alignment.e().get(j).toString());
            }
            List<CoreLabel> charSequence = toSequence(token, eTokens, sequence.size());
            sequence.addAll(charSequence);
        }
    }
    return sequence;
}
Example #13
Source File: TestCustomLemmaAnnotator.java From blog-codes with Apache License 2.0 | 6 votes |
@Test public void test() { Properties props = new Properties(); props.setProperty("annotators", "tokenize,ssplit,pos,custom.lemma"); props.setProperty("customAnnotatorClass.custom.lemma", "com.fancyerii.blog.stanfordnlp.CustomLemmaAnnotator"); props.setProperty("custom.lemma.lemmaFile", "custom-lemmas.txt"); // set up pipeline StanfordCoreNLP pipeline = new StanfordCoreNLP(props); CoreDocument exampleDocument = new CoreDocument("Some many goods there."); // annotate document pipeline.annotate(exampleDocument); // access tokens from a CoreDocument // a token is represented by a CoreLabel List<CoreLabel> firstSentenceTokens = exampleDocument.sentences().get(0).tokens(); // this for loop will print out all of the tokens and the character offset info for (CoreLabel token : firstSentenceTokens) { System.out.println(token.word()+"/"+token.getString(LemmaAnnotation.class) + "\t" + token.beginPosition() + "\t" + token.endPosition()); } }
Example #14
Source File: TokenizerDemo.java From blog-codes with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws IOException { for (String arg : args) { // option #1: By sentence. DocumentPreprocessor dp = new DocumentPreprocessor(arg); for (List<HasWord> sentence : dp) { System.out.println(sentence); } // option #2: By token PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(new FileReader(arg), new CoreLabelTokenFactory(), ""); while (ptbt.hasNext()) { CoreLabel label = ptbt.next(); System.out.println(label); } } }
Example #15
Source File: Summarizer.java From wiseowl with MIT License | 5 votes |
/** Counts how often each token's surface text occurs across all sentences. */
private static Counter<String> getTermFrequencies(List<CoreMap> sentences) {
    Counter<String> frequencies = new ClassicCounter<String>();
    for (CoreMap sentence : sentences) {
        for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
            frequencies.incrementCount(token.get(CoreAnnotations.TextAnnotation.class));
        }
    }
    return frequencies;
}
Example #16
Source File: Main.java From dependensee with GNU General Public License v2.0 | 5 votes |
/**
 * Tokenizes and parses {@code sentence} with the supplied parser and renders
 * the parse tree to {@code outFile}.
 *
 * @param sentence raw sentence text
 * @param outFile  output image path
 * @param lp       parser to use
 * @throws Exception if tokenization, parsing, or rendering fails
 */
public static void writeImage(String sentence, String outFile, LexicalizedParser lp) throws Exception {
    // FIX: removed a no-op try/catch that only rethrew the caught exception —
    // the method already declares `throws Exception`.
    TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
    Tree parse = lp.apply(wordList);
    writeImage(parse, outFile);
}
Example #17
Source File: NERTool.java From Criteria2Query with Apache License 2.0 | 5 votes |
public static void train(String traindatapath,String targetpath){ long startTime = System.nanoTime(); /* Step 1: learn the classifier from the training data */ String trainFile = traindatapath; /* Learn the classifier from the training data */ String serializeFileLoc =targetpath; // properties: https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/ie/NERFeatureFactory.html Properties props = new Properties(); props.put("trainFile", trainFile); // To train with multiple files, a comma separated list props.put("map", "word=0,answer=1"); props.put("useClassFeature", "true"); props.put("useNGrams", "true"); props.put("noMidNGrams", "true"); props.put("maxNGramLeng", "6"); props.put("useDisjunctive", "true"); props.put("usePrev", "true"); props.put("useNext", "true"); props.put("useSequences", "true"); props.put("usePrevSequences", "true"); props.put("maxLeft", "1"); props.put("useTypeSeqs", "true"); props.put("useTypeSeqs2", "true"); props.put("useTypeySequences", "true"); props.put("wordShape", "chris2useLC"); // props.put("printFeatures", "true"); // This feature can be turned off in recent versions with the flag -useKnownLCWords false // https://nlp.stanford.edu/software/crf-faq.html question 13 SeqClassifierFlags flags = new SeqClassifierFlags(props); CRFClassifier<CoreLabel> crf = new CRFClassifier<CoreLabel>(flags); crf.train(); crf.serializeClassifier(serializeFileLoc); }
Example #18
Source File: CoreNLPUtils.java From minie with GNU General Public License v3.0 | 5 votes |
/**
 * Given a semantic graph of a whole sentence (sg) and a "local root" node, get the subgraph from 'sg' which has
 * 'localRoot' as a root.
 * @param sg: semantic graph of the whole sentence
 * @param localRoot: the root of the subgraph
 * @return semantic graph object which is the subgraph from 'sg'
 */
public static SemanticGraph getSubgraph(SemanticGraph sg, IndexedWord localRoot){
    // Collect the typed dependencies reachable from localRoot (recursive helper).
    ObjectArrayList<TypedDependency> subGraphDependencies = getSubgraphTypedDependencies(sg, localRoot, new ObjectArrayList<TypedDependency>());
    // Rebuild a grammatical structure rooted at localRoot, then regenerate an
    // uncollapsed SemanticGraph from it.
    // NOTE(review): uses EnglishGrammaticalStructure, so this helper is
    // English-specific even though the signature is language-neutral.
    TreeGraphNode rootTGN = new TreeGraphNode(new CoreLabel(localRoot));
    EnglishGrammaticalStructure gs = new EnglishGrammaticalStructure(subGraphDependencies, rootTGN);
    return SemanticGraphFactory.generateUncollapsedDependencies(gs);
}
Example #19
Source File: CRFPostprocessor.java From phrasal with GNU General Public License v3.0 | 5 votes |
/**
 * Train a model given a preprocessor.
 *
 * @param preProcessor supplies the raw/processed alignment used to build training examples
 */
protected void train(Preprocessor preProcessor) {
    DocumentReaderAndWriter<CoreLabel> readerWriter =
            new ProcessorTools.PostprocessorDocumentReaderAndWriter(preProcessor);
    ObjectBank<List<CoreLabel>> trainingLines =
            classifier.makeObjectBankFromFile(flags.trainFile, readerWriter);
    classifier.train(trainingLines, readerWriter);
    System.err.println("Finished training.");
}
Example #20
Source File: Phrase.java From minie with GNU General Public License v3.0 | 5 votes |
/** Remove a set of words represented as core labels from the list of indexed words **/
public void removeCoreLabelWordsFromList(List<CoreMap> cmWords){
    // Convert each CoreMap into an IndexedWord, then delegate to the
    // IndexedWord-based removal.
    ObjectArrayList<IndexedWord> toRemove = new ObjectArrayList<>();
    for (CoreMap coreMap : cmWords) {
        toRemove.add(new IndexedWord(new CoreLabel(coreMap)));
    }
    this.removeWordsFromList(toRemove);
}
Example #21
Source File: Summarizer.java From wiseowl with MIT License | 5 votes |
/**
 * Sums the tf-idf weights of the noun tokens in a sentence.
 *
 * @param sentence sentence with TokensAnnotation and POS tags present
 * @return total tf-idf weight over noun tokens
 */
private double tfIDFWeights(CoreMap sentence) {
    double total = 0;
    for (CoreLabel cl : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
        String pos = cl.get(CoreAnnotations.PartOfSpeechAnnotation.class);
        // FIX: Penn Treebank noun tags (NN, NNS, NNP, NNPS) are upper-case, so the
        // original startsWith("n") never matched and the weight was always 0.
        // Match either case to stay safe against lower-cased tag sets, and guard
        // against tokens with no POS annotation.
        if (pos != null && (pos.startsWith("N") || pos.startsWith("n"))) {
            total += tfIDFWeight(cl.get(CoreAnnotations.TextAnnotation.class));
        }
    }
    return total;
}
Example #22
Source File: CoreNLPUtils.java From minie with GNU General Public License v3.0 | 5 votes |
/**
 * Given a list of words (as core maps), return the phrase of words as a list of indexed word objects
 * @param cmList: list of words (e.g. [She, is, pretty])
 * @return list of words (as IndexedWord)
 */
public static ObjectArrayList<IndexedWord> listOfCoreMapWordsToIndexedWordList(List<CoreMap> cmList){
    ObjectArrayList<IndexedWord> result = new ObjectArrayList<>();
    for (CoreMap coreMap : cmList) {
        result.add(new IndexedWord(new CoreLabel(coreMap)));
    }
    return result;
}
Example #23
Source File: CoreNLPUtils.java From minie with GNU General Public License v3.0 | 5 votes |
/** Converts a list of IndexedWord objects into a list of CoreLabel objects. */
public static ObjectArrayList<CoreLabel> getCoreLabelListFromIndexedWordList(ObjectArrayList<IndexedWord> words) {
    ObjectArrayList<CoreLabel> labels = new ObjectArrayList<>();
    for (IndexedWord word : words) {
        labels.add(new CoreLabel(word));
    }
    return labels;
}
Example #24
Source File: CoreNLPUtils.java From minie with GNU General Public License v3.0 | 5 votes |
/** Converts a list of CoreMap words into a list of IndexedWord objects. */
public static ObjectArrayList<IndexedWord> getWordListFromCoreMapList(List<CoreMap> coreMapList){
    // (Renamed the accumulator — the original called it "coreLabelList" although
    // it holds IndexedWord instances.)
    ObjectArrayList<IndexedWord> indexedWords = new ObjectArrayList<>();
    for (CoreMap coreMap : coreMapList) {
        indexedWords.add(new IndexedWord(new CoreLabel(coreMap)));
    }
    return indexedWords;
}
Example #25
Source File: UPosAnnotator.java From tint with GNU General Public License v3.0 | 5 votes |
/**
 * Maps each token's (possibly compound, "+"-joined) POS tag to universal POS
 * tags and stores the result in the UPosAnnotation.
 */
@Override
public void annotate(Annotation annotation) {
    for (CoreLabel token : annotation.get(CoreAnnotations.TokensAnnotation.class)) {
        String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
        // Compound tags use "+" as separator; map each part independently.
        String[] parts = pos.split("\\+");
        // FIX: StringBuilder instead of StringBuffer — no synchronization is
        // needed for a method-local accumulator.
        StringBuilder upos = new StringBuilder();
        for (String part : parts) {
            upos.append("+").append(uposMap.getOrDefault(part, DEFAULT_UPOS));
        }
        // substring(1) drops the leading "+" separator.
        token.set(CustomAnnotations.UPosAnnotation.class, upos.substring(1));
    }
}
Example #26
Source File: CoreNLPUtils.java From minie with GNU General Public License v3.0 | 5 votes |
/** Converts a list of CoreMap words into a hash set of IndexedWord objects. */
public static ObjectOpenHashSet<IndexedWord> getWordSetFromCoreMapList(List<CoreMap> coreMapList){
    ObjectOpenHashSet<IndexedWord> wordSet = new ObjectOpenHashSet<>();
    for (CoreMap coreMap : coreMapList) {
        wordSet.add(new IndexedWord(new CoreLabel(coreMap)));
    }
    return wordSet;
}
Example #27
Source File: Minimization.java From minie with GNU General Public License v3.0 | 5 votes |
/** Given a phrase, if it contains a verb phrase, make a verb phrase safe minimization **/
public void verbPhraseSafeMinimization(List<CoreMap> remWords, List<CoreMap> matchWords){
    // Flags for checking certain conditions
    boolean isAdverb;
    boolean isNotNER;
    boolean containsNEG;
    // If the relation starts with a RB+ VB+, drop RB+
    // NOTE(review): tPattern/tMatcher are instance fields, so this method is not
    // reentrant; each call overwrites the previous matcher state.
    this.tPattern = TokenSequencePattern.compile(REGEX.T_RB_VB);
    this.tMatcher = tPattern.getMatcher(this.phrase.getWordCoreLabelList());
    while (this.tMatcher.find()){
        // NOTE(review): the matchWords parameter is overwritten here — the caller's
        // argument value is never read, only the group nodes of each match.
        matchWords = tMatcher.groupNodes();
        for (CoreMap cm: matchWords){
            CoreLabel cl = new CoreLabel(cm);
            // Fall back to the surface form when no lemma annotation is present.
            if (cl.lemma() == null) cl.setLemma(cl.word());
            isAdverb = CoreNLPUtils.isAdverb(cl.tag());
            // NOTE(review): assumes cl.ner() is non-null (i.e. the NER annotator ran);
            // otherwise this line NPEs — confirm upstream pipeline.
            isNotNER = cl.ner().equals(NE_TYPE.NO_NER);
            containsNEG = Polarity.NEG_WORDS.contains(cl.lemma().toLowerCase());
            // Check if the word is RB which is not a NER
            // (negation adverbs like "not" are kept — dropping them would flip meaning)
            if (isAdverb && isNotNER && !containsNEG){
                remWords.add(cm);
            }
        }
        this.dropWords(remWords, matchWords);
    }
}
Example #28
Source File: CoreNLPTokenizer.java From Heracles with GNU General Public License v3.0 | 5 votes |
/**
 * Process the Dataset in chunks, as defined by the <code>spanType</code> parameter.
 * The Spans denoted by spanType must provide an annotation of type "text".
 * This spanType does not have to be textual unit.
 */
@Override
public void validatedProcess(Dataset dataset, String spanTypeOfTextUnit){
    // Guard: refuse to tokenize a dataset twice.
    if (dataset.getPerformedNLPTasks().contains(getTask())){
        Framework.error("This dataset has already been tokenized.");
        return;
    }
    // Tokenize-only pipeline; `false` suppresses annotator loading messages.
    // NOTE(review): confirm the boolean's meaning against this CoreNLP version's
    // StanfordCoreNLP(Properties, boolean) constructor.
    Properties prop1 = new Properties();
    prop1.setProperty("annotators", "tokenize");
    //prop1.setProperty("options", "splitHyphenated=true");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(prop1, false);
    for (Span span : dataset.getSpans(spanTypeOfTextUnit)){
        Span textualUnit = span.getTextualUnit();
        String originalText = span.getAnnotation("text", String.class);
        Annotation a = new Annotation(originalText);
        pipeline.annotate(a);
        List<CoreLabel> tokenAnnotations = a.get(TokensAnnotation.class);
        // Words are linked into a chain; continue from the textual unit's last
        // word when it already has content, otherwise start a fresh chain.
        Word previousWord = null;
        if (!textualUnit.isEmpty())
            previousWord = textualUnit.last();
        for (CoreLabel token : tokenAnnotations){
            String word = token.get(OriginalTextAnnotation.class);
            int startOffset = token.get(CharacterOffsetBeginAnnotation.class);
            // int endOffset = token.get(CharacterOffsetEndAnnotation.class);
            // System.out.println(word + "\t" + startOffset + "\t" + endOffset);
            if (previousWord == null){
                // First word of the textual unit.
                previousWord = new Word(word, startOffset, textualUnit, dataset);
            } else {
                // Chain onto the previous word.
                previousWord = new Word(word, startOffset, previousWord);
            }
            //and add the new word to the sentence span. If span=textualSpan than this has no effect
            if (!textualUnit.equals(span))
                span.add(previousWord);
        }
    }
}
Example #29
Source File: MinIE.java From minie with GNU General Public License v3.0 | 5 votes |
/**
 * Given an object phrase, check if it has infinitive verbs modifying a noun phrase or a named entity.
 * If yes, then return "true", else -> "false"
 * @param object: the object phrase
 * @return true when a "to VB NP/NER" match starts at the phrase's first word
 */
public boolean pushInfinitiveVerb(Phrase object){
    TokenSequencePattern pattern = TokenSequencePattern.compile(REGEX.T_TO_VB_NP_NER);
    TokenSequenceMatcher matcher = pattern.getMatcher(object.getWordCoreLabelList());
    while (matcher.find()) {
        // Only matches anchored at the very first word of the phrase count.
        CoreLabel matchStart = new CoreLabel(matcher.groupNodes().get(0));
        if (matchStart.index() == object.getWordList().get(0).index()) {
            return true;
        }
    }
    return false;
}
Example #30
Source File: Minimization.java From minie with GNU General Public License v3.0 | 5 votes |
/** * Given a list of matched core maps (a phrase) and a list of words which are candidates for dropping ('remWords'), * check if some of them form sub-constituents of 'matchCoreMaps' which are found in the dictionary. * If there are, remove them from 'remWords'. The words left in 'remWords' are the ones that couldn't be matched * with a sub-constituent found in the dictionary, i.e. those are the ones that we drop. * @param matchCoreMaps: list of words as a list of CoreMap object (a phrase) * @param remWords: list of candidates to be dropped (each word in 'remWord' can also be found in 'matchCoreMaps') */ public void dropWordsNotFoundInDict(List<CoreMap> matchCoreMaps, List<CoreMap> remWords){ // Get all the sub-constituents ObjectArrayList<IndexedWord> words = CoreNLPUtils.listOfCoreMapWordsToIndexedWordList(matchCoreMaps); SubConstituent sc = new SubConstituent(this.sg, CoreNLPUtils.getRootFromWordList(this.sg, words), words); sc.generateSubConstituentsFromLeft(); ObjectOpenHashSet<String> subconstituents = sc.getStringSubConstituents(); // Sub-constituents' strings found in the dictionary ObjectArrayList<String> scStringsInDict = new ObjectArrayList<>(); for (String s: subconstituents){ if (this.mwe.contains(s)){ scStringsInDict.add(s); } } // If sub-constituents's strings are found in the dictionary, detect the words associated with them // and remove them. if (scStringsInDict.size() > 0){ Iterator<CoreMap> iter = remWords.iterator(); for (String stInDict: scStringsInDict){ while (iter.hasNext()){ CoreMap cm = iter.next(); CoreLabel cl = new CoreLabel(cm); if (stInDict.contains(cl.lemma().toLowerCase())){ iter.remove(); } } } } // Drop the words not found in frequent/collocation sub-constituents this.dropWords(remWords, matchCoreMaps); }