edu.stanford.nlp.ling.CoreLabel Java Examples
The following examples show how to use
edu.stanford.nlp.ling.CoreLabel.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: JavaClient.java From blog-codes with Apache License 2.0 | 6 votes |
/**
 * Demo client: sends a Chinese sentence to a CoreNLP server on localhost:9000
 * and prints each token of the first sentence with its character offsets.
 */
public static void main(String[] args) {
    // NOTE(review): the original comment claimed lemmatization and coreference, but
    // the annotators actually requested are: tokenize, ssplit, pos, ner, depparse, openie.
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,ner,depparse,openie");
    // Multi-language client; "zh" selects the Chinese models.
    // NOTE(review): 2 is presumably the number of concurrent connections and the two
    // nulls the API key/secret — confirm against MultiLangsStanfordCoreNLPClient.
    MultiLangsStanfordCoreNLPClient pipeline = new MultiLangsStanfordCoreNLPClient(props, "http://localhost", 9000, 2, null, null, "zh");
    // read some text in the text variable
    String text = "今天天气很好。";
    // create an empty Annotation just with the given text
    Annotation document = new Annotation(text);
    // run all Annotators on this text (server-side)
    pipeline.annotate(document);
    CoreMap firstSentence = document.get(CoreAnnotations.SentencesAnnotation.class).get(0);
    // this for loop will print out all of the tokens and the character offset info
    for (CoreLabel token : firstSentence.get(CoreAnnotations.TokensAnnotation.class)) {
        System.out.println(token.word() + "\t" + token.beginPosition() + "\t" + token.endPosition());
    }
}
Example #2
Source File: NERSearcher.java From Stargraph with MIT License | 6 votes |
/**
 * Post-processes raw NER output: merges consecutive entity tokens, optionally
 * reverses each entity's value, and hands non-empty results to the linker.
 */
private List<LinkedNamedEntity> postProcessFoundNamedEntities(List<List<CoreLabel>> sentences) {
    final List<List<LinkedNamedEntity>> merged = mergeConsecutiveNamedEntities(sentences);
    // Reverse each entity's value when configured to do so.
    if (this.reverseNameOrder) {
        for (List<LinkedNamedEntity> sentence : merged) {
            for (LinkedNamedEntity entity : sentence) {
                entity.reverseValue();
            }
        }
    }
    boolean nothingFound = merged.isEmpty() || (merged.size() == 1 && merged.get(0).isEmpty());
    if (nothingFound) {
        logger.trace(marker, "No Entities detected.");
        return Collections.emptyList();
    }
    return linkNamedEntities(merged);
}
Example #3
Source File: IntelKBPAnnotator.java From InformationExtraction with GNU General Public License v3.0 | 6 votes |
/**
 * Annotate all the pronominal mentions in the document.
 *
 * @param ann The document.
 * @return The list of pronominal mentions in the document.
 */
private static List<CoreMap> annotatePronominalMentions(Annotation ann) {
    List<CoreMap> pronouns = new ArrayList<>();
    List<CoreMap> sentences = ann.get(CoreAnnotations.SentencesAnnotation.class);
    for (int sentenceIndex = 0; sentenceIndex < sentences.size(); sentenceIndex++) {
        CoreMap sentence = sentences.get(sentenceIndex);
        // Document-level index of the sentence's first token; defaults to 0 when absent.
        Integer annoTokenBegin = sentence.get(CoreAnnotations.TokenBeginAnnotation.class);
        if (annoTokenBegin == null) { annoTokenBegin = 0; }
        List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
        for (int tokenIndex = 0; tokenIndex < tokens.size(); tokenIndex++) {
            CoreLabel token = tokens.get(tokenIndex);
            if (kbpIsPronominalMention(token)) {
                // Build a single-token chunk [tokenIndex, tokenIndex + 1) as a mention.
                CoreMap pronoun = ChunkAnnotationUtils.getAnnotatedChunk(tokens, tokenIndex, tokenIndex + 1, annoTokenBegin, null, CoreAnnotations.TextAnnotation.class, null);
                pronoun.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex);
                // NOTE(review): assumes the sentence already carries a MentionsAnnotation
                // list (i.e. a mentions annotator ran earlier); otherwise this NPEs — confirm.
                sentence.get(CoreAnnotations.MentionsAnnotation.class).add(pronoun);
                pronouns.add(pronoun);
            }
        }
    }
    return pronouns;
}
Example #4
Source File: MainTest.java From dependensee with GNU General Public License v2.0 | 6 votes |
/** Test of writeImage method, of class Main. */
@Test
public void testWriteImage() throws Exception {
    final String sentence = "A quick brown fox jumped over the lazy dog.";
    // Set up the parser and the grammatical-structure machinery.
    TreebankLanguagePack langPack = new PennTreebankLanguagePack();
    GrammaticalStructureFactory structureFactory = langPack.grammaticalStructureFactory();
    LexicalizedParser parser = LexicalizedParser.loadModel();
    parser.setOptionFlags(new String[]{"-maxLength", "500", "-retainTmpSubcategories"});
    // Tokenize, parse, and extract collapsed typed dependencies.
    TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> tokens = tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
    Tree parseTree = parser.apply(tokens);
    GrammaticalStructure grammaticalStructure = structureFactory.newGrammaticalStructure(parseTree);
    Collection<TypedDependency> dependencies = grammaticalStructure.typedDependenciesCollapsed();
    // Render and verify the output file exists.
    Main.writeImage(dependencies, "image.png", 3);
    assert (new File("image.png").exists());
}
Example #5
Source File: RegexNerTest.java From InformationExtraction with GNU General Public License v3.0 | 6 votes |
public static List<String> extractNER(String doc){ Annotation document = new Annotation(doc); pipeline.annotate(document); List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class); List<String> result = new ArrayList<String>(); for(CoreMap sentence: sentences) { // traversing the words in the current sentence // a CoreLabel is a CoreMap with additional token-specific methods for (CoreLabel token: sentence.get(CoreAnnotations.TokensAnnotation.class)) { // this is the text of the token String word = token.get(CoreAnnotations.TextAnnotation.class); // this is the POS tag of the token String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class); // this is the NER label of the token String ne = token.get(CoreAnnotations.NamedEntityTagAnnotation.class); result.add(ne); } } return result; }
Example #6
Source File: Chapter5.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 6 votes |
/**
 * Tags {@code theSentence} with a caseless POS model and prints word/POS pairs,
 * then dumps the annotated document as XML and pretty text.
 */
private static void usingStanfordPOSTagger() {
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, pos");
    props.put("pos.model", "C:\\Current Books in Progress\\NLP and Java\\Models\\english-caseless-left3words-distsim.tagger");
    // FIX: Properties values must be Strings — a boxed Integer is invisible to
    // getProperty(), so the original maxlen setting was silently ignored.
    props.put("pos.maxlen", "10");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation document = new Annotation(theSentence);
    pipeline.annotate(document);
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            String word = token.get(TextAnnotation.class);
            String pos = token.get(PartOfSpeechAnnotation.class);
            System.out.print(word + "/" + pos + " ");
        }
        System.out.println();
    }
    // FIX: moved out of the sentence loop — the original re-printed the entire
    // document once per sentence.
    try {
        pipeline.xmlPrint(document, System.out);
        pipeline.prettyPrint(document, System.out);
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}
Example #7
Source File: Chapter4.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 6 votes |
private static void usingStanfordNER() { String model = getModelDir() + "\\english.conll.4class.distsim.crf.ser.gz"; CRFClassifier<CoreLabel> classifier = CRFClassifier.getClassifierNoExceptions(model); String sentence = ""; for (String element : sentences) { sentence += element; } List<List<CoreLabel>> entityList = classifier.classify(sentence); for (List<CoreLabel> internalList : entityList) { for (CoreLabel coreLabel : internalList) { String word = coreLabel.word(); String category = coreLabel.get(CoreAnnotations.AnswerAnnotation.class); // System.out.println(word + ":" + category); if (!"O".equals(category)) { System.out.println(word + ":" + category); } } } }
Example #8
Source File: GalicianReadability.java From tint with GNU General Public License v3.0 | 6 votes |
/**
 * Assigns a difficulty level to a content word: 4 by default, overridden by
 * 3/2/1 when the lemma appears in the corresponding level list (level 1 wins
 * last). Also bumps the per-level word counters.
 */
@Override
public void addingContentWord(CoreLabel token) {
    super.addingContentWord(token);
    // Default: hardest level unless the lemma is found in an easier list below.
    token.set(ReadabilityAnnotations.DifficultyLevelAnnotation.class, 4);
    String tokenLemma = token.lemma();
    // Checks run from level 3 down to level 1, so the easiest matching level
    // ends up as the final annotation value. Counters are independent: a lemma
    // present in several lists increments each of them.
    if (model.getLevel3Lemmas().contains(tokenLemma)) {
        level3WordSize++;
        token.set(ReadabilityAnnotations.DifficultyLevelAnnotation.class, 3);
    }
    if (model.getLevel2Lemmas().contains(tokenLemma)) {
        level2WordSize++;
        token.set(ReadabilityAnnotations.DifficultyLevelAnnotation.class, 2);
    }
    if (model.getLevel1Lemmas().contains(tokenLemma)) {
        level1WordSize++;
        token.set(ReadabilityAnnotations.DifficultyLevelAnnotation.class, 1);
    }
}
Example #9
Source File: Postprocess.java From phrases with Apache License 2.0 | 6 votes |
/**
 * Runs a sentiment pipeline over each pattern's sentences.
 *
 * NOTE(review): this method looks unfinished — the predicted sentiment class
 * and per-token lemmas are computed but never stored or returned, and the
 * method always returns null instead of the processed patterns. Confirm intent
 * before relying on it.
 */
public List<Pattern> run(List<Pattern> patterns) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, parse, sentiment");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    for (Pattern pattern : patterns) {
        Annotation annotation = pipeline.process(pattern.toSentences());
        for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
            // Sentiment tree for the sentence; the predicted class is unused below.
            Tree tree = sentence.get(SentimentCoreAnnotations.AnnotatedTree.class);
            int sentiment = RNNCoreAnnotations.getPredictedClass(tree);
            for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
                // Lemma is fetched but discarded (see review note above).
                String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
            }
        }
    }
    return null;
}
Example #10
Source File: Main.java From dependensee with GNU General Public License v2.0 | 6 votes |
/**
 * Parses {@code sentence} with the bundled English PCFG model and renders the
 * resulting tree to {@code outFile} at the given scale. Prints an error and
 * returns silently if the model cannot be loaded.
 */
public static void writeImage(String sentence, String outFile, int scale) throws Exception {
    LexicalizedParser parser;
    try {
        parser = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    } catch (Exception e) {
        System.err.println("Could not load file englishPCFG.ser.gz. Try placing this file in the same directory as Dependencee.jar");
        return;
    }
    parser.setOptionFlags(new String[]{"-maxLength", "500", "-retainTmpSubcategories"});
    TokenizerFactory<CoreLabel> factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> tokens = factory.getTokenizer(new StringReader(sentence)).tokenize();
    writeImage(parser.apply(tokens), outFile, scale);
}
Example #11
Source File: Tokens.java From ambiverse-nlu with Apache License 2.0 | 6 votes |
public static Tokens getTokensFromJCas(JCas jCas) { Tokens tokens = new Tokens(); int s_number = 0; //DKPro does not give sentence index???????? int t_number = 0; for (Sentence sentence : select(jCas, Sentence.class)) { List<de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token> dktokens = selectCovered(jCas, de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token.class, sentence); for (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token t : dktokens) { CoreLabel taggedWord = CoreNlpUtils.tokenToWord( t); //This step should be avoided. Transform directly from DKPRO to AIDA TOKEN. Problem POS mappings. AIDA works with Stanford tags Token aidaToken = new Token(t_number, t.getCoveredText(), t.getBegin(), t.getEnd(), 0); aidaToken.setPOS(taggedWord.get(CoreAnnotations.PartOfSpeechAnnotation.class)); aidaToken.setSentence(s_number); tokens.addToken(aidaToken); t_number++; } s_number++; } return tokens; }
Example #12
Source File: ProcessorTools.java From phrasal with GNU General Public License v3.0 | 6 votes |
/**
 * Convert a raw/preprocessed String pair to a labeled sequence appropriate for training
 * the CRF-based post-processor.
 *
 * The SymmetricalWordAlignment is created by a Preprocessor. Source is the raw input, target is
 * the tokenized/pre-processed output.
 *
 * @return the character-level labeled sequence; unaligned source tokens are
 *         skipped with a warning on stderr
 */
public static List<CoreLabel> alignedPairToLabeledSequence(SymmetricalWordAlignment alignment) {
    // Initial capacity is a heuristic (target size * 7 characters per token).
    List<CoreLabel> sequence = new ArrayList<>(alignment.eSize() * 7);
    for (int i = 0; i < alignment.fSize(); ++i) {
        // Insert an explicit whitespace datum between consecutive source tokens
        // (but not before the first one).
        if (sequence.size() > 0) sequence.add(createDatum(WHITESPACE, Operation.Whitespace.toString(), sequence.size(), WHITESPACE, 0));
        String token = alignment.f().get(i).toString();
        Set<Integer> eAlignments = alignment.f2e(i);
        if (eAlignments.size() == 0) {
            // Unaligned source tokens cannot be labeled; warn and drop them.
            System.err.printf("%s: WARNING: discarding unaligned token (%s)%n", ProcessorTools.class.getName(), token);
        } else {
            // Collect all target tokens aligned to this source token, then expand
            // the pair into a per-character labeled sub-sequence.
            List<String> eTokens = new ArrayList<>(eAlignments.size());
            for (int j : eAlignments) {
                eTokens.add(alignment.e().get(j).toString());
            }
            List<CoreLabel> charSequence = toSequence(token, eTokens, sequence.size());
            sequence.addAll(charSequence);
        }
    }
    return sequence;
}
Example #13
Source File: TestCustomLemmaAnnotator.java From blog-codes with Apache License 2.0 | 6 votes |
@Test public void test() { Properties props = new Properties(); props.setProperty("annotators", "tokenize,ssplit,pos,custom.lemma"); props.setProperty("customAnnotatorClass.custom.lemma", "com.fancyerii.blog.stanfordnlp.CustomLemmaAnnotator"); props.setProperty("custom.lemma.lemmaFile", "custom-lemmas.txt"); // set up pipeline StanfordCoreNLP pipeline = new StanfordCoreNLP(props); CoreDocument exampleDocument = new CoreDocument("Some many goods there."); // annotate document pipeline.annotate(exampleDocument); // access tokens from a CoreDocument // a token is represented by a CoreLabel List<CoreLabel> firstSentenceTokens = exampleDocument.sentences().get(0).tokens(); // this for loop will print out all of the tokens and the character offset info for (CoreLabel token : firstSentenceTokens) { System.out.println(token.word()+"/"+token.getString(LemmaAnnotation.class) + "\t" + token.beginPosition() + "\t" + token.endPosition()); } }
Example #14
Source File: TokenizerDemo.java From blog-codes with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws IOException { for (String arg : args) { // option #1: By sentence. DocumentPreprocessor dp = new DocumentPreprocessor(arg); for (List<HasWord> sentence : dp) { System.out.println(sentence); } // option #2: By token PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(new FileReader(arg), new CoreLabelTokenFactory(), ""); while (ptbt.hasNext()) { CoreLabel label = ptbt.next(); System.out.println(label); } } }
Example #15
Source File: Summarizer.java From wiseowl with MIT License | 5 votes |
/** Counts how often each token's surface text occurs across all sentences. */
private static Counter<String> getTermFrequencies(List<CoreMap> sentences) {
    Counter<String> frequencies = new ClassicCounter<String>();
    for (CoreMap sentence : sentences) {
        for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
            frequencies.incrementCount(token.get(CoreAnnotations.TextAnnotation.class));
        }
    }
    return frequencies;
}
Example #16
Source File: Main.java From dependensee with GNU General Public License v2.0 | 5 votes |
/**
 * Tokenizes and parses {@code sentence} with the supplied parser and renders
 * the parse tree to {@code outFile}.
 *
 * @param sentence raw sentence text
 * @param outFile  output image path
 * @param lp       parser to use
 * @throws Exception if tokenization, parsing, or rendering fails
 */
public static void writeImage(String sentence, String outFile, LexicalizedParser lp) throws Exception {
    // FIX: removed a no-op try/catch that only rethrew the caught exception —
    // the method already declares `throws Exception`.
    TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
    Tree parse = lp.apply(wordList);
    writeImage(parse, outFile);
}
Example #17
Source File: NERTool.java From Criteria2Query with Apache License 2.0 | 5 votes |
public static void train(String traindatapath,String targetpath){ long startTime = System.nanoTime(); /* Step 1: learn the classifier from the training data */ String trainFile = traindatapath; /* Learn the classifier from the training data */ String serializeFileLoc =targetpath; // properties: https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/ie/NERFeatureFactory.html Properties props = new Properties(); props.put("trainFile", trainFile); // To train with multiple files, a comma separated list props.put("map", "word=0,answer=1"); props.put("useClassFeature", "true"); props.put("useNGrams", "true"); props.put("noMidNGrams", "true"); props.put("maxNGramLeng", "6"); props.put("useDisjunctive", "true"); props.put("usePrev", "true"); props.put("useNext", "true"); props.put("useSequences", "true"); props.put("usePrevSequences", "true"); props.put("maxLeft", "1"); props.put("useTypeSeqs", "true"); props.put("useTypeSeqs2", "true"); props.put("useTypeySequences", "true"); props.put("wordShape", "chris2useLC"); // props.put("printFeatures", "true"); // This feature can be turned off in recent versions with the flag -useKnownLCWords false // https://nlp.stanford.edu/software/crf-faq.html question 13 SeqClassifierFlags flags = new SeqClassifierFlags(props); CRFClassifier<CoreLabel> crf = new CRFClassifier<CoreLabel>(flags); crf.train(); crf.serializeClassifier(serializeFileLoc); }
Example #18
Source File: CoreNLPUtils.java From minie with GNU General Public License v3.0 | 5 votes |
/**
 * Given a semantic graph of a whole sentence (sg) and a "local root" node, get the subgraph from 'sg' which has
 * 'localRoot' as a root.
 * @param sg: semantic graph of the whole sentence
 * @param localRoot: the root of the subgraph
 * @return semantic graph object which is the subgraph from 'sg'
 */
public static SemanticGraph getSubgraph(SemanticGraph sg, IndexedWord localRoot){
    // Collect the typed dependencies reachable from localRoot (recursive helper).
    ObjectArrayList<TypedDependency> subGraphDependencies = getSubgraphTypedDependencies(sg, localRoot, new ObjectArrayList<TypedDependency>());
    // Rebuild a grammatical structure rooted at localRoot, then regenerate an
    // uncollapsed SemanticGraph from it.
    // NOTE(review): uses EnglishGrammaticalStructure, so this helper is
    // English-specific even though the signature is language-neutral.
    TreeGraphNode rootTGN = new TreeGraphNode(new CoreLabel(localRoot));
    EnglishGrammaticalStructure gs = new EnglishGrammaticalStructure(subGraphDependencies, rootTGN);
    return SemanticGraphFactory.generateUncollapsedDependencies(gs);
}
Example #19
Source File: CRFPostprocessor.java From phrasal with GNU General Public License v3.0 | 5 votes |
/**
 * Train a model given a preprocessor.
 *
 * @param preProcessor supplies the raw/processed alignment used to build training examples
 */
protected void train(Preprocessor preProcessor) {
    DocumentReaderAndWriter<CoreLabel> readerWriter =
            new ProcessorTools.PostprocessorDocumentReaderAndWriter(preProcessor);
    ObjectBank<List<CoreLabel>> trainingLines =
            classifier.makeObjectBankFromFile(flags.trainFile, readerWriter);
    classifier.train(trainingLines, readerWriter);
    System.err.println("Finished training.");
}
Example #20
Source File: Phrase.java From minie with GNU General Public License v3.0 | 5 votes |
/** Remove a set of words represented as core labels from the list of indexed words **/
public void removeCoreLabelWordsFromList(List<CoreMap> cmWords){
    // Convert each CoreMap into an IndexedWord, then delegate to the
    // IndexedWord-based removal.
    ObjectArrayList<IndexedWord> toRemove = new ObjectArrayList<>();
    for (CoreMap coreMap : cmWords) {
        toRemove.add(new IndexedWord(new CoreLabel(coreMap)));
    }
    this.removeWordsFromList(toRemove);
}
Example #21
Source File: Summarizer.java From wiseowl with MIT License | 5 votes |
/**
 * Sums the tf-idf weights of the noun tokens in a sentence.
 *
 * @param sentence sentence with TokensAnnotation and POS tags present
 * @return total tf-idf weight over noun tokens
 */
private double tfIDFWeights(CoreMap sentence) {
    double total = 0;
    for (CoreLabel cl : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
        String pos = cl.get(CoreAnnotations.PartOfSpeechAnnotation.class);
        // FIX: Penn Treebank noun tags (NN, NNS, NNP, NNPS) are upper-case, so the
        // original startsWith("n") never matched and the weight was always 0.
        // Match either case to stay safe against lower-cased tag sets, and guard
        // against tokens with no POS annotation.
        if (pos != null && (pos.startsWith("N") || pos.startsWith("n"))) {
            total += tfIDFWeight(cl.get(CoreAnnotations.TextAnnotation.class));
        }
    }
    return total;
}
Example #22
Source File: CoreNLPUtils.java From minie with GNU General Public License v3.0 | 5 votes |
/**
 * Given a list of words (as core maps), return the phrase of words as a list of indexed word objects
 * @param cmList: list of words (e.g. [She, is, pretty])
 * @return list of words (as IndexedWord)
 */
public static ObjectArrayList<IndexedWord> listOfCoreMapWordsToIndexedWordList(List<CoreMap> cmList){
    ObjectArrayList<IndexedWord> result = new ObjectArrayList<>();
    for (CoreMap coreMap : cmList) {
        result.add(new IndexedWord(new CoreLabel(coreMap)));
    }
    return result;
}
Example #23
Source File: CoreNLPUtils.java From minie with GNU General Public License v3.0 | 5 votes |
/** Converts a list of IndexedWord objects into a list of CoreLabel objects. */
public static ObjectArrayList<CoreLabel> getCoreLabelListFromIndexedWordList(ObjectArrayList<IndexedWord> words) {
    ObjectArrayList<CoreLabel> labels = new ObjectArrayList<>();
    for (IndexedWord word : words) {
        labels.add(new CoreLabel(word));
    }
    return labels;
}
Example #24
Source File: CoreNLPUtils.java From minie with GNU General Public License v3.0 | 5 votes |
/** Converts a list of CoreMap words into a list of IndexedWord objects. */
public static ObjectArrayList<IndexedWord> getWordListFromCoreMapList(List<CoreMap> coreMapList){
    // (Renamed the accumulator — the original called it "coreLabelList" although
    // it holds IndexedWord instances.)
    ObjectArrayList<IndexedWord> indexedWords = new ObjectArrayList<>();
    for (CoreMap coreMap : coreMapList) {
        indexedWords.add(new IndexedWord(new CoreLabel(coreMap)));
    }
    return indexedWords;
}
Example #25
Source File: UPosAnnotator.java From tint with GNU General Public License v3.0 | 5 votes |
/**
 * Maps each token's (possibly compound, "+"-joined) POS tag to universal POS
 * tags and stores the result in the UPosAnnotation.
 */
@Override
public void annotate(Annotation annotation) {
    for (CoreLabel token : annotation.get(CoreAnnotations.TokensAnnotation.class)) {
        String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
        // Compound tags use "+" as separator; map each part independently.
        String[] parts = pos.split("\\+");
        // FIX: StringBuilder instead of StringBuffer — no synchronization is
        // needed for a method-local accumulator.
        StringBuilder upos = new StringBuilder();
        for (String part : parts) {
            upos.append("+").append(uposMap.getOrDefault(part, DEFAULT_UPOS));
        }
        // substring(1) drops the leading "+" separator.
        token.set(CustomAnnotations.UPosAnnotation.class, upos.substring(1));
    }
}
Example #26
Source File: CoreNLPUtils.java From minie with GNU General Public License v3.0 | 5 votes |
/** Converts a list of CoreMap words into a hash set of IndexedWord objects. */
public static ObjectOpenHashSet<IndexedWord> getWordSetFromCoreMapList(List<CoreMap> coreMapList){
    ObjectOpenHashSet<IndexedWord> wordSet = new ObjectOpenHashSet<>();
    for (CoreMap coreMap : coreMapList) {
        wordSet.add(new IndexedWord(new CoreLabel(coreMap)));
    }
    return wordSet;
}
Example #27
Source File: Minimization.java From minie with GNU General Public License v3.0 | 5 votes |
/** Given a phrase, if it contains a verb phrase, make a verb phrase safe minimization **/
public void verbPhraseSafeMinimization(List<CoreMap> remWords, List<CoreMap> matchWords){
    // Flags for checking certain conditions
    boolean isAdverb;
    boolean isNotNER;
    boolean containsNEG;
    // If the relation starts with a RB+ VB+, drop RB+
    // NOTE(review): tPattern/tMatcher are instance fields, so this method is not
    // reentrant; each call overwrites the previous matcher state.
    this.tPattern = TokenSequencePattern.compile(REGEX.T_RB_VB);
    this.tMatcher = tPattern.getMatcher(this.phrase.getWordCoreLabelList());
    while (this.tMatcher.find()){
        // NOTE(review): the matchWords parameter is overwritten here — the caller's
        // argument value is never read, only the group nodes of each match.
        matchWords = tMatcher.groupNodes();
        for (CoreMap cm: matchWords){
            CoreLabel cl = new CoreLabel(cm);
            // Fall back to the surface form when no lemma annotation is present.
            if (cl.lemma() == null) cl.setLemma(cl.word());
            isAdverb = CoreNLPUtils.isAdverb(cl.tag());
            // NOTE(review): assumes cl.ner() is non-null (i.e. the NER annotator ran);
            // otherwise this line NPEs — confirm upstream pipeline.
            isNotNER = cl.ner().equals(NE_TYPE.NO_NER);
            containsNEG = Polarity.NEG_WORDS.contains(cl.lemma().toLowerCase());
            // Check if the word is RB which is not a NER
            // (negation adverbs like "not" are kept — dropping them would flip meaning)
            if (isAdverb && isNotNER && !containsNEG){
                remWords.add(cm);
            }
        }
        this.dropWords(remWords, matchWords);
    }
}
Example #28
Source File: CoreNLPTokenizer.java From Heracles with GNU General Public License v3.0 | 5 votes |
/**
 * Process the Dataset in chunks, as defined by the <code>spanType</code> parameter.
 * The Spans denoted by spanType must provide an annotation of type "text".
 * This spanType does not have to be textual unit.
 */
@Override
public void validatedProcess(Dataset dataset, String spanTypeOfTextUnit){
    // Guard: refuse to tokenize a dataset twice.
    if (dataset.getPerformedNLPTasks().contains(getTask())){
        Framework.error("This dataset has already been tokenized.");
        return;
    }
    // Tokenize-only pipeline; `false` suppresses annotator loading messages.
    // NOTE(review): confirm the boolean's meaning against this CoreNLP version's
    // StanfordCoreNLP(Properties, boolean) constructor.
    Properties prop1 = new Properties();
    prop1.setProperty("annotators", "tokenize");
    //prop1.setProperty("options", "splitHyphenated=true");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(prop1, false);
    for (Span span : dataset.getSpans(spanTypeOfTextUnit)){
        Span textualUnit = span.getTextualUnit();
        String originalText = span.getAnnotation("text", String.class);
        Annotation a = new Annotation(originalText);
        pipeline.annotate(a);
        List<CoreLabel> tokenAnnotations = a.get(TokensAnnotation.class);
        // Words are linked into a chain; continue from the textual unit's last
        // word when it already has content, otherwise start a fresh chain.
        Word previousWord = null;
        if (!textualUnit.isEmpty())
            previousWord = textualUnit.last();
        for (CoreLabel token : tokenAnnotations){
            String word = token.get(OriginalTextAnnotation.class);
            int startOffset = token.get(CharacterOffsetBeginAnnotation.class);
            // int endOffset = token.get(CharacterOffsetEndAnnotation.class);
            // System.out.println(word + "\t" + startOffset + "\t" + endOffset);
            if (previousWord == null){
                // First word of the textual unit.
                previousWord = new Word(word, startOffset, textualUnit, dataset);
            } else {
                // Chain onto the previous word.
                previousWord = new Word(word, startOffset, previousWord);
            }
            //and add the new word to the sentence span. If span=textualSpan than this has no effect
            if (!textualUnit.equals(span))
                span.add(previousWord);
        }
    }
}
Example #29
Source File: MinIE.java From minie with GNU General Public License v3.0 | 5 votes |
/**
 * Given an object phrase, check if it has infinitive verbs modifying a noun phrase or a named entity.
 * If yes, then return "true", else -> "false"
 * @param object: the object phrase
 * @return true when a "to VB NP/NER" match starts at the phrase's first word
 */
public boolean pushInfinitiveVerb(Phrase object){
    TokenSequencePattern pattern = TokenSequencePattern.compile(REGEX.T_TO_VB_NP_NER);
    TokenSequenceMatcher matcher = pattern.getMatcher(object.getWordCoreLabelList());
    while (matcher.find()) {
        // Only matches anchored at the very first word of the phrase count.
        CoreLabel matchStart = new CoreLabel(matcher.groupNodes().get(0));
        if (matchStart.index() == object.getWordList().get(0).index()) {
            return true;
        }
    }
    return false;
}
Example #30
Source File: Minimization.java From minie with GNU General Public License v3.0 | 5 votes |
/** * Given a list of matched core maps (a phrase) and a list of words which are candidates for dropping ('remWords'), * check if some of them form sub-constituents of 'matchCoreMaps' which are found in the dictionary. * If there are, remove them from 'remWords'. The words left in 'remWords' are the ones that couldn't be matched * with a sub-constituent found in the dictionary, i.e. those are the ones that we drop. * @param matchCoreMaps: list of words as a list of CoreMap object (a phrase) * @param remWords: list of candidates to be dropped (each word in 'remWord' can also be found in 'matchCoreMaps') */ public void dropWordsNotFoundInDict(List<CoreMap> matchCoreMaps, List<CoreMap> remWords){ // Get all the sub-constituents ObjectArrayList<IndexedWord> words = CoreNLPUtils.listOfCoreMapWordsToIndexedWordList(matchCoreMaps); SubConstituent sc = new SubConstituent(this.sg, CoreNLPUtils.getRootFromWordList(this.sg, words), words); sc.generateSubConstituentsFromLeft(); ObjectOpenHashSet<String> subconstituents = sc.getStringSubConstituents(); // Sub-constituents' strings found in the dictionary ObjectArrayList<String> scStringsInDict = new ObjectArrayList<>(); for (String s: subconstituents){ if (this.mwe.contains(s)){ scStringsInDict.add(s); } } // If sub-constituents's strings are found in the dictionary, detect the words associated with them // and remove them. if (scStringsInDict.size() > 0){ Iterator<CoreMap> iter = remWords.iterator(); for (String stInDict: scStringsInDict){ while (iter.hasNext()){ CoreMap cm = iter.next(); CoreLabel cl = new CoreLabel(cm); if (stInDict.contains(cl.lemma().toLowerCase())){ iter.remove(); } } } } // Drop the words not found in frequent/collocation sub-constituents this.dropWords(remWords, matchCoreMaps); }