Java Code Examples for edu.stanford.nlp.ling.CoreLabel#beginPosition()

The following examples show how to use edu.stanford.nlp.ling.CoreLabel#beginPosition(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also check out related API usage in the sidebar.

Example 1

Source File: CoreNlpToken.java From jstarcraft-nlp with Apache License 2.0

5 votes

@Override
public CoreNlpToken next() {
    CoreLabel label = iterator.next();
    text = label.get(CoreAnnotations.TextAnnotation.class);
    nature = label.get(CoreAnnotations.PartOfSpeechAnnotation.class);
    begin = label.beginPosition();
    end = label.endPosition();
    return this;
}

Example 2

Source File: NERSearcher.java From Stargraph with MIT License

4 votes

/**
 * Receives a list of CoreLabel (from CoreNLP package) and merges two consecutive named entities with
 * the same label into a single one.
 *
 * Example: "Barack/PERSON Obama/PERSON" becomes "Barack Obama/PERSON"
 *
 * @param sentences List of lists of CoreLabels
 * @return List of ScoredNamedEntities where consecutive named entities are combined
 */
private static List<List<LinkedNamedEntity>> mergeConsecutiveNamedEntities(List<List<CoreLabel>> sentences) {
    final List<List<LinkedNamedEntity>> sentenceList = new ArrayList<>();

    for (List<CoreLabel> sentence : sentences) {

        List<LinkedNamedEntity> namedEntities = new ArrayList<>();
        String previousCat = null;
        LinkedNamedEntity currentNamedEntity = null;

        /*
            A named entity is composed of multiple words, most of the time.
            Two consecutive words belong to one named entity if they have the same label.
            This method does not differentiate two different named entities when they are not
            divided by a different label.
            CoreNLP labels words that are not a named entity with "O", so we remove these from the output.
         */
        for (CoreLabel coreLabel : sentence) {

            String currentWord = coreLabel.originalText();

            String currentCat = coreLabel.get(CoreAnnotations.AnswerAnnotation.class);

            if (currentNamedEntity == null) {
                currentNamedEntity = new LinkedNamedEntity(currentWord, currentCat, coreLabel.beginPosition(), coreLabel.endPosition());
            } else if (currentCat.equals(previousCat)) {
                currentNamedEntity.merge(currentNamedEntity.getValue() + " " + currentWord, coreLabel.endPosition());
            } else {
                namedEntities.add(currentNamedEntity);
                currentNamedEntity = new LinkedNamedEntity(currentWord, currentCat, coreLabel.beginPosition(), coreLabel.endPosition());
            }

            previousCat = currentCat;
        }

        /* Add last NE when not already added.
           This happens, when the last token in a sentence belongs to a named entity.
         */
        if (!namedEntities.contains(currentNamedEntity)) {
            namedEntities.add(currentNamedEntity);
        }

        // ignore NamedEntities with label "O", they are not NamedEntities
        sentenceList.add(namedEntities
                .stream()
                .filter(s -> !s.getCat().equals("O"))
                .collect(Collectors.toList()));
    }

    return sentenceList.stream().filter(s -> s.size() > 0).collect(Collectors.toList());
}

Example 3

Source File: Readability.java From tint with GNU General Public License v3.0

4 votes

public void addWord(CoreLabel token) {
        token.set(ReadabilityAnnotations.ContentWord.class, false);
        token.set(ReadabilityAnnotations.LiteralWord.class, false);

        String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
//        String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
        String word = token.word();

        addingToken(token);

        if (isWordPos(pos)) {
            addingWord(token);
            wordCount++;
            docLenLettersOnly += token.endPosition() - token.beginPosition();

            word = flattenToAscii(word);
            Hyphenation hyphenation = hyphenator.hyphenate(word);

            boolean done = false;
            if (hyphenation != null) {
                try {
                    String h = hyphenation.toString();
                    incrementHyphenCount(hyphenation.length() + 1);
                    token.set(ReadabilityAnnotations.HyphenationAnnotation.class, h);
                    done = true;
                    hyphenWordCount++;
                } catch (Exception e) {
                    // ignored
                }
            }

            if (!done && word.length() < 5) {
                incrementHyphenCount(1);
                hyphenWordCount++;
            }

            if (isContentPos(pos)) {
                contentWordSize++;
                addingContentWord(token);
            }
            if (isEasyPos(pos)) {
                contentEasyWordSize++;
                addingEasyWord(token);
            }
        }
        if (token.get(ReadabilityAnnotations.HyphenationAnnotation.class) == null) {
            token.set(ReadabilityAnnotations.HyphenationAnnotation.class, token.originalText());
        }

        String genericPos = getGenericPos(pos);
        posStats.add(pos);
        genericPosStats.add(genericPos);
    }