org.grobid.core.layout.LayoutToken Java Examples
The following examples show how to use
org.grobid.core.layout.LayoutToken.
You can vote up the examples you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example #1
Source File: ProcessTextTest.java From entity-fishing with Apache License 2.0 | 6 votes |
@Test
public void testAcronymsTokens() {
    // Multi-line figure caption containing exactly one acronym definition:
    // "Canonical Correspondence Analysis (CCA)".
    String input = "Figure 4. \n" +
            "Canonical Correspondence Analysis (CCA) diagram showing the ordination of anopheline species along the\n" +
            "first two axes and their correlation with environmental variables. The first axis is horizontal, second vertical. Direction\n" +
            "and length of arrows shows the degree of correlation between mosquito larvae and the variables.";

    List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input, new Language("en", 1.0));
    // Map: acronym mention (key) -> expanded base mention (value).
    Map<Mention, Mention> acronyms = processText.acronymCandidates(tokens);
    assertNotNull(acronyms);

    for (Map.Entry<Mention, Mention> entry : acronyms.entrySet()) {
        Mention base = entry.getValue();   // the expanded form
        Mention acronym = entry.getKey();  // the short form
        // The acronym surface must be "CCA" and map back to the full phrase.
        assertEquals(input.substring(acronym.getOffsetStart(), acronym.getOffsetEnd()).trim(), "CCA");
        assertEquals(base.getRawName(), "Canonical Correspondence Analysis");
        // Offsets are character positions within the original input string.
        assertThat(acronym.getOffsetStart(), is(46));
        assertThat(acronym.getOffsetEnd(), is(49));
    }
}
Example #2
Source File: NLPLeaderboardFigParser.java From science-result-extractor with Apache License 2.0 | 6 votes |
/** * The processing here is called from the full text parser in cascade. * Start and end position in the higher level tokenization are indicated in * the resulting Figure object. */ public Figure processing(List<LayoutToken> tokenizationFigure, String featureVector) { String res; try { res = label(featureVector); } catch (Exception e) { throw new GrobidException("CRF labeling in ReferenceSegmenter fails.", e); } if (res == null) { return null; } // List<Pair<String, String>> labeled = GenericTaggerUtils.getTokensAndLabels(res); // System.out.println(Joiner.on("\n").join(labeled)); // System.out.println("----------------------"); // System.out.println("----------------------"); // return getExtractionResult(tokenizationFigure, labeled); return getExtractionResult(tokenizationFigure, res); }
Example #3
Source File: NERParsers.java From grobid-ner with Apache License 2.0 | 6 votes |
/**
 * Extract all occurrences of named entity from a list of LayoutToken and a given language.
 *
 * @param tokens the tokenized input to analyse
 * @param lang   the input language; when null, language identification is
 *               delegated to the single-argument overload
 * @return the recognized entities, or null when the token list is null or empty
 * @throws GrobidResourceException when no parser exists for the given language
 */
public List<Entity> extractNE(List<LayoutToken> tokens, Language lang) throws GrobidResourceException {
    // NOTE(review): returns null (not an empty list) for empty input — callers must null-check.
    if ((tokens == null) || (tokens.size() == 0))
        return null;

    //text = text.replace("\n", " ");

    if (lang == null) {
        // No language provided: fall back to the overload that runs language identification.
        return extractNE(tokens);
    }

    // One dedicated parser instance per supported language.
    NERParser parser = parsers.get(lang.getLang());
    if (parser == null) {
        throw new GrobidResourceException("The automatically identified language is currently not supported by grobid-ner: " + lang.getLang());
    }

    return parser.extractNE(tokens);
}
Example #4
Source File: NERParserCommonTest.java From grobid-ner with Apache License 2.0 | 6 votes |
@Test
public void testresultExtraction_clusteror_simple2() throws Exception {
    final String input = "Austria Hungary fought the enemies with Germany.";

    // Pre-labeled sequence-model output: one tab-separated feature line per
    // token, with the label in the last column. "Austria Hungary" is one
    // multi-token LOCATION (B-LOCATION followed by LOCATION).
    String result = "Austria\taustria\tA\tAu\tAus\tAust\tAustr\ta\tia\tria\ttria\tstria\tINITCAP\tNODIGIT\t0\t0\t0\t0\t1\t0\t0\t1\t1\t1\t1\tXxxx\tXx\t0\tB-LOCATION\n" +
            "Hungary\thungary\tA\tAu\tAus\tAust\tAustr\ta\tia\tria\ttria\tstria\tINITCAP\tNODIGIT\t0\t0\t0\t0\t1\t0\t0\t1\t1\t1\t1\tXxxx\tXx\t0\tLOCATION\n" +
            "fought\tfought\tf\tfo\tfou\tfoug\tfough\tt\tht\tght\tught\tought\tNOCAPS\tNODIGIT\t0\t1\t0\t0\t0\t0\t0\t0\t0\t0\t0\txxxx\tx\t0\tO\n" +
            "the\tthe\tt\tth\tthe\tthe\tthe\te\the\tthe\tthe\tthe\tNOCAPS\tNODIGIT\t0\t1\t0\t0\t0\t0\t0\t0\t0\t0\t0\txxx\tx\t0\tO\n" +
            "enemies\tenemies\te\ten\tene\tenem\tenemi\ts\tes\ties\tmies\temies\tNOCAPS\tNODIGIT\t0\t1\t0\t0\t0\t0\t0\t0\t0\t0\t0\txxxx\tx\t0\tO\n" +
            "with\twith\tw\twi\twit\twith\twith\th\tth\tith\twith\twith\tNOCAPS\tNODIGIT\t0\t1\t0\t0\t0\t0\t0\t0\t0\t0\t0\txxxx\tx\t0\tO\n" +
            "Germany\tgermany\tG\tGe\tGer\tGerm\tGerma\ty\tny\tany\tmany\trmany\tINITCAP\tNODIGIT\t0\t0\t0\t0\t1\t0\t0\t0\t0\t0\t0\tXxxx\tXx\t0\tB-LOCATION\n" +
            ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tALLCAPS\tNODIGIT\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t.\t.\t0\tO";

    List<LayoutToken> tokenisation = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
    final List<Entity> entities = target.resultExtraction(GrobidModels.ENTITIES_NER, result, tokenisation);

    assertThat(entities, hasSize(2));
    // The B-LOCATION + LOCATION pair must be merged into one entity.
    assertThat(entities.get(0).getRawName(), is("Austria Hungary"));
    assertThat(entities.get(0).getType(), is(LOCATION));
    // Offsets are character positions in the input string.
    assertThat(entities.get(0).getOffsetStart(), is(0));
    assertThat(entities.get(0).getOffsetEnd(), is(15));
    assertThat(input.substring(entities.get(0).getOffsetStart(), entities.get(0).getOffsetEnd()), is("Austria Hungary"));
}
Example #5
Source File: NERFrParser.java From grobid-ner with Apache License 2.0 | 6 votes |
/** * Extract all occurrences of named entities from a list of LayoutToken * coming from a document with fixed/preserved layout, e.g. PDF. * The positions of the recognized entities are given with coordinates in * the input document. */ public List<Entity> extractNE(List<LayoutToken> tokens) { if (tokens == null) return null; LexiconPositionsIndexes positionsIndexes = new LexiconPositionsIndexes(lexicon); positionsIndexes.computeIndexes(tokens); String res = NERParserCommon.toFeatureVectorLayout(tokens, positionsIndexes); String result = label(res); //List<Pair<String, String>> labeled = GenericTaggerUtils.getTokensAndLabels(result); //String text = LayoutTokensUtil.toText(tokens); List<Entity> entities = nerParserCommon.resultExtraction(GrobidModels.ENTITIES_NERFR, result, tokens); // we use now the sense tagger for the recognized named entity //List<Sense> senses = senseTagger.extractSenses(labeled, tokens, positionsIndexes); //NERParserCommon.merge(entities, senses); return entities; }
Example #6
Source File: SimilarityScorer.java From entity-fishing with Apache License 2.0 | 6 votes |
public float getCentroidScore(NerdCandidate candidate, List<LayoutToken> tokens, String lang) { if (candidate.getWikidataId() == null) return 0.0F; CentroidEntityScorer scorer = centroidScorers.get(lang); if (scorer != null) { List<String> terms = toStringEmbeddings(tokens, lang); //System.out.println("\n"+candidate.toString()); //System.out.println(terms.toString()); float score = scorer.score(candidate.getWikidataId(), terms); //System.out.println("score: " + score); if (score < 0.0F) score = 0.0F; return score; } else { LOGGER.warn(lang + " centroid scorer is null!"); return 0.0F; } }
Example #7
Source File: NEREnParser.java From grobid-ner with Apache License 2.0 | 6 votes |
/** * Extract all occurrences of named entities from a list of LayoutToken * coming from a document with fixed/preserved layout, e.g. PDF. * The positions of the recognized entities are given with coordinates in * the input document. */ public List<Entity> extractNE(List<LayoutToken> tokens) { if (tokens == null) return null; LexiconPositionsIndexes positionsIndexes = new LexiconPositionsIndexes(lexicon); positionsIndexes.computeIndexes(tokens); String res = NERParserCommon.toFeatureVectorLayout(tokens, positionsIndexes); String result = label(res); //List<Pair<String, String>> labeled = GenericTaggerUtils.getTokensAndLabels(result); //String text = LayoutTokensUtil.toText(tokens); List<Entity> entities = nerParserCommon.resultExtraction(GrobidModels.ENTITIES_NER, result, tokens); // we use now the sense tagger for the recognized named entity //List<Sense> senses = senseTagger.extractSenses(labeled, tokens, positionsIndexes); //NERParserCommon.merge(entities, senses); return entities; }
Example #8
Source File: ProcessText.java From entity-fishing with Apache License 2.0 | 6 votes |
/**
 * Entry point for a NerdQuery to have its textual content processed.
 * Generates the list of mentions recognized by the mention recognition
 * modules listed in the query's 'mention' field, applied sequentially in
 * the given order.
 *
 * @param nerdQuery the NERD query to be processed
 * @return the list of identified mentions (empty when there is no content)
 */
public List<Mention> process(NerdQuery nerdQuery) throws NerdException {
    String text = nerdQuery.getTextOrShortText();
    List<LayoutToken> tokens = nerdQuery.getTokens();

    // Nothing to do when neither raw text nor layout tokens are present.
    if (isBlank(text) && isEmpty(tokens)) {
        LOGGER.warn("No content to process.");
        return new ArrayList<>();
    }

    // Raw text takes precedence over the token sequence.
    return isNotBlank(text) ? processText(nerdQuery) : processTokens(nerdQuery);
}
Example #9
Source File: ProcessText.java From entity-fishing with Apache License 2.0 | 6 votes |
/**
 * Apply every requested mention recognition method to the query's token
 * sequence and aggregate the results.
 * Precondition: list of LayoutToken in the query object is not empty.
 *
 * @param nerdQuery the query carrying tokens, language and requested methods
 * @return the aggregated mentions from all requested methods
 * @throws NerdException wrapping any failure during mention extraction
 */
private List<Mention> processTokens(NerdQuery nerdQuery) throws NerdException {
    List<LayoutToken> tokens = nerdQuery.getTokens();
    List<Mention> results = new ArrayList<>();
    Language language = nerdQuery.getLanguage();

    // get the list of requested mention types
    List<ProcessText.MentionMethod> mentionTypes = nerdQuery.getMentions();

    // we process the whole text, sentence info does not apply to layout documents
    try {
        for (ProcessText.MentionMethod mentionType : mentionTypes) {
            List<Mention> localResults = getMentions(tokens, language, mentionType);
            results.addAll(localResults);
        }
    } catch (Exception e) {
        throw new NerdException("NERD error when processing text.", e);
    }

    return results;
}
Example #10
Source File: ProcessText.java From entity-fishing with Apache License 2.0 | 6 votes |
/**
 * Utility method to process a list of layout tokens and return the NER mentions.
 *
 * @param tokens   the layout tokens to analyse
 * @param language the language of the tokens; only "en" and "fr" are handled
 * @return the NER-based mentions; empty when the input is empty or the
 *         language is unsupported
 */
private List<Mention> extractNER(List<LayoutToken> tokens, Language language) {
    List<Mention> results = new ArrayList<>();
    if (isEmpty(tokens)) {
        LOGGER.warn("Trying to extract NE mention from empty content. Returning empty list.");
        return results;
    }

    // grobid-ner models exist only for English and French.
    String lang = language.getLang();
    if ((lang == null) || (!lang.equals("en") && !lang.equals("fr")))
        return new ArrayList<>();

    try {
        List<Entity> entityResults = nerParsers.extractNE(tokens, language);
        // Wrap each recognized entity into a Mention tagged with its source method.
        for (Entity entityResult : entityResults) {
            Mention mention = new Mention(entityResult);
            mention.setSource(MentionMethod.ner);
            results.add(mention);
        }
    } catch (Exception e) {
        // Best-effort: a failed NER run yields whatever was collected so far (usually none).
        LOGGER.error("NER extraction failed", e);
    }

    return results;
}
Example #11
Source File: ProcessTextTest.java From entity-fishing with Apache License 2.0 | 6 votes |
@Test
public void testNGram_LayoutTokens_oneGram_shouldWork() throws Exception {
    final String input = "this is it.";

    final List<LayoutToken> inputLayoutTokens = GrobidAnalyzer.getInstance()
            .tokenizeWithLayoutToken(input, new Language("en"));
    final List<StringPos> result = processText.ngrams(inputLayoutTokens, 1);
    System.out.println(result);

    // 1-grams include whitespace tokens; six in total, the first four are
    // asserted with their character offsets.
    assertThat(result, hasSize(6));
    assertThat(result.get(0), is(new StringPos("this", 0)));
    assertThat(result.get(1), is(new StringPos(" ", 4)));
    assertThat(result.get(2), is(new StringPos("is", 5)));
    assertThat(result.get(3), is(new StringPos(" ", 7)));
}
Example #12
Source File: ProcessTextTest.java From entity-fishing with Apache License 2.0 | 6 votes |
@Test
public void testParagraphSegmentation() {
    // create a dummy super long text to be segmented
    List<LayoutToken> tokens = new ArrayList<>();
    for (int i = 0; i < 1000; i++) {
        if (i == 250) {
            // single newline at iteration 250
            tokens.add(new LayoutToken("\n"));
        }
        if (i == 500) {
            // double newline at iteration 500: an explicit paragraph break
            tokens.add(new LayoutToken("\n"));
            tokens.add(new LayoutToken("\n"));
        }
        tokens.add(new LayoutToken("blabla"));
        tokens.add(new LayoutToken(" "));
    }

    List<List<LayoutToken>> segments = ProcessText.segmentInParagraphs(tokens);
    // newline heuristics plus arbitrary splitting of over-long blocks
    // are expected to yield 5 segments for this input
    assertThat(segments, hasSize(5));
}
Example #13
Source File: ProcessText.java From entity-fishing with Apache License 2.0 | 6 votes |
public static List<List<LayoutToken>> segmentInParagraphs(List<LayoutToken> tokens) { // heuristics: double end of line, if not simple end of line (not aligned with // previous line), and if still not we segment arbitrarly the monolithic block List<List<LayoutToken>> result = new ArrayList<>(); result.add(tokens); // we recursively segment too large segments, starting with one unique segment // which is the whole text while (true) { result = subSsegmentInParagraphs(result); if (!containsTooLargeSegment(result)) break; } return result; }
Example #14
Source File: ProcessTextTest.java From entity-fishing with Apache License 2.0 | 6 votes |
@Test
public void testAcronymsTokensMixedCase() {
    // Two acronym definitions: "(CS)" attached with a hyphen, and "(COPD)"
    // whose expansion is entirely lower case.
    String input = "Cigarette smoke (CS)-induced airway epithelial senescence has been implicated in " +
            "the pathogenesis of chronic obstructive pulmonary disease (COPD).";

    List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input, new Language("en", 1.0));
    // Map: acronym mention (key) -> expanded base mention (value).
    Map<Mention, Mention> acronyms = processText.acronymCandidates(tokens);
    assertNotNull(acronyms);

    for (Map.Entry<Mention, Mention> entry : acronyms.entrySet()) {
        Mention base = entry.getValue();
        Mention acronym = entry.getKey();
        //System.out.println("acronym: " + input.substring(acronym.start, acronym.end) + " / base: " + base.getRawName());
        // Iteration order over the map is unspecified, so branch on the surface form.
        if (input.substring(acronym.getOffsetStart(), acronym.getOffsetEnd()).trim().equals("CS")) {
            assertEquals(base.getRawName(), "Cigarette smoke");
        } else {
            assertEquals(input.substring(acronym.getOffsetStart(), acronym.getOffsetEnd()).trim(), "COPD");
            assertEquals(base.getRawName(), "chronic obstructive pulmonary disease");
        }
    }
}
Example #15
Source File: Utilities.java From entity-fishing with Apache License 2.0 | 6 votes |
/**
 * Build a window of LayoutTokens surrounding the entity located at character
 * offsets [start, end] in the token stream.
 *
 * @param start  entity start offset (characters)
 * @param end    entity end offset (characters)
 * @param tokens the full token sequence
 * @param size   number of tokens to take on each side of the entity position
 * @param lang   language code — NOTE(review): currently unused in this implementation
 * @return up to 2*size surrounding tokens, excluding the entity token itself
 */
public static List<LayoutToken> getWindow(int start, int end, List<LayoutToken> tokens, int size, String lang) {
    List<LayoutToken> subTokens = new ArrayList<LayoutToken>();

    // first locate the entity in the token list: the first token whose
    // character span is fully contained in [start, end]
    int pos = 0;
    for(LayoutToken token : tokens) {
        if ( (token.getOffset() >= start) && ((token.getOffset()+token.getText().length()) <= end) )
            break;
        pos++;
    }
    // NOTE(review): when no token matches, pos == tokens.size() and the window
    // degenerates to the tail of the list — confirm this is intended.

    // clamp the window to the list boundaries
    int posStart = pos - size;
    if (posStart < 0)
        posStart = 0;
    int posEnd = pos + size;
    if (posEnd >= tokens.size())
        posEnd = tokens.size()-1;

    // collect the window, skipping the entity position itself
    for(int p = posStart; p <= posEnd; p++) {
        if (p != pos) {
            subTokens.add(tokens.get(p));
        }
    }

    return subTokens;
}
Example #16
Source File: NERParsers.java From grobid-ner with Apache License 2.0 | 5 votes |
/**
 * Extract all occurrences of named entity from list of LayoutToken of unknown language.
 * A language identifier is used to determine the language, and the token sequence is
 * processed if the identified language is supported.
 *
 * @param tokens the tokenized input of unknown language
 * @return the recognized entities (see the two-argument overload for null semantics)
 * @throws GrobidResourceException when the identified language is unsupported
 */
public List<Entity> extractNE(List<LayoutToken> tokens) throws GrobidResourceException {
    // run language identifier
    LanguageUtilities languageIdentifier = LanguageUtilities.getInstance();

    Language resultLang = null;
    // The identifier is a shared singleton; the synchronized block suggests it is
    // not thread-safe. The 2000 argument presumably caps the amount of text used
    // for identification — TODO confirm against runLanguageId.
    synchronized (languageIdentifier) {
        resultLang = languageIdentifier.runLanguageId(LayoutTokensUtil.toText(tokens), 2000);
    }

    return extractNE(tokens, resultLang);
}
Example #17
Source File: ProcessTextTest.java From entity-fishing with Apache License 2.0 | 5 votes |
@Test
@Ignore("This test is not testing anything")
public void extractMentionsWikipedia() throws Exception {
    final String input = "this is it.";
    final Language language = new Language("en");

    final List<LayoutToken> inputLayoutTokens = GrobidAnalyzer.getInstance()
            .tokenizeWithLayoutToken(input, language);

    // Only prints the output of both overloads (token-based vs raw-text); no assertions.
    System.out.println(processText.extractMentionsWikipedia(inputLayoutTokens, language));
    System.out.println(processText.extractMentionsWikipedia(input, language));
}
Example #18
Source File: NLPLeaderboardTable.java From science-result-extractor with Apache License 2.0 | 5 votes |
/**
 * Build the textual content of each column-associated table cell.
 *
 * @return one space-joined, trimmed string per cell, in the iteration order
 *         of {@code associatedTags_column}
 */
public List<String> getAssociatedTagsStr_column() {
    // Parameterized list (original used a raw ArrayList) and StringBuilder
    // instead of O(n^2) String concatenation inside the loop.
    List<String> columns = new ArrayList<>();
    for (TableCell cell : associatedTags_column.values()) {
        StringBuilder sb = new StringBuilder();
        for (LayoutToken lt : cell.lt) {
            sb.append(' ').append(lt.t());
        }
        columns.add(sb.toString().trim());
    }
    return columns;
}
Example #19
Source File: NerdRestProcessFile.java From entity-fishing with Apache License 2.0 | 5 votes |
/**
 * Decide whether a token sequence is a title: every non-delimiter token must
 * carry the HEADER_TITLE label; delimiter tokens are ignored.
 *
 * @param layoutTokens the token sequence to inspect
 * @return true when all considered tokens are labeled as header title
 *         (note: an empty or all-delimiter sequence yields true, since 0 == 0)
 */
protected boolean isTitle(List<LayoutToken> layoutTokens) {
    int labeledAsTitle = 0;
    int considered = 0;
    for (LayoutToken token : layoutTokens) {
        // Skip pure delimiter tokens.
        if (TextUtilities.delimiters.contains(token.getText())) {
            continue;
        }
        if (token.getLabels().contains(TaggingLabels.HEADER_TITLE)) {
            labeledAsTitle++;
        }
        considered++;
    }
    return labeledAsTitle == considered;
}
Example #20
Source File: ProcessText.java From entity-fishing with Apache License 2.0 | 5 votes |
/**
 * NER processing of a sequence of LayoutTokens. Generate list of named entity
 * mentions.
 *
 * @param tokens   the sequence of LayoutToken objects
 * @param language the language of the tokens
 * @return the list of identified mentions, de-duplicated and language-validated
 * @throws NerdException on processing failure
 */
public List<Mention> processNER(List<LayoutToken> tokens, Language language) throws NerdException {
    List<Mention> results = extractNER(tokens, language);

    // sort so downstream de-duplication is deterministic
    Collections.sort(results);

    // associate bounding boxes to identified mentions
    List<Mention> finalResults = new ArrayList<>();
    for (Mention entity : results) {
        // synchronize layout token with the selected n-grams
        List<LayoutToken> entityTokens = entity.getLayoutTokens();

        if (entityTokens != null)
            entity.setBoundingBoxes(BoundingBoxCalculator.calculate(entityTokens));
        else
            LOGGER.warn("processNER: LayoutToken sequence not found for mention: " + entity.getRawName());

        // we have an additional check of validity based on language,
        // and skip duplicates already collected
        if (validEntity(entity, language.getLang())) {
            if (!finalResults.contains(entity)) {
                finalResults.add(entity);
            }
        }
    }

    return finalResults;
}
Example #21
Source File: NLPLeaderboardTable.java From science-result-extractor with Apache License 2.0 | 5 votes |
/**
 * Build the textual content of each row-associated table cell.
 *
 * @return one space-joined, trimmed string per cell, in the iteration order
 *         of {@code associatedTags_row}
 */
public List<String> getAssociatedTagsStr_row() {
    // Parameterized list (original used a raw ArrayList) and StringBuilder
    // instead of O(n^2) String concatenation inside the loop.
    List<String> rows = new ArrayList<>();
    for (TableCell cell : associatedTags_row.values()) {
        StringBuilder sb = new StringBuilder();
        for (LayoutToken lt : cell.lt) {
            sb.append(' ').append(lt.t());
        }
        rows.add(sb.toString().trim());
    }
    return rows;
}
Example #22
Source File: ProcessText.java From entity-fishing with Apache License 2.0 | 5 votes |
/**
 * Processing of some raw text by extracting all non-trivial ngrams.
 * Generate a list of entity mentions that will be instantiated by
 * Wikipedia labels (anchors and titles).
 *
 * @param tokens the sequence of tokens to be parsed
 * @param lang   the language of the tokens
 * @return the list of identified entities, or null when the input is empty
 * @throws NerdException wrapping any failure during extraction
 */
public List<Mention> processWikipedia(List<LayoutToken> tokens, Language lang) throws NerdException {
    if ((tokens == null) || (tokens.size() == 0)) {
        // NOTE(review): returns null for empty input while the happy path returns
        // a list — callers must null-check.
        //System.out.println("Content to be processed is empty.");
        LOGGER.error("Content to be processed is empty.");
        return null;
    }

    List<Mention> results = new ArrayList<>();
    try {
        List<Mention> subPool = extractMentionsWikipedia(tokens, lang);
        Collections.sort(subPool);

        for (Mention candidate : subPool) {
            // attach bounding boxes computed from the candidate's layout tokens
            List<LayoutToken> entityTokens = candidate.getLayoutTokens();

            if (entityTokens != null)
                candidate.setBoundingBoxes(BoundingBoxCalculator.calculate(entityTokens));
            else
                LOGGER.warn("processWikipedia: LayoutToken sequence not found for mention: " + candidate.rawName);

            // we have an additional check of validity based on language,
            // and skip duplicates already collected
            if (validEntity(candidate, lang.getLang())) {
                if (!results.contains(candidate))
                    results.add(candidate);
            }
        }
    } catch (Exception e) {
        throw new NerdException("NERD error when processing text.", e);
    }

    return results;
}
Example #23
Source File: ProcessTextTest.java From entity-fishing with Apache License 2.0 | 5 votes |
@Test
public void testParagraphSegmentationMonolithic() {
    // create a dummy super long text with no newline at all
    List<LayoutToken> tokens = new ArrayList<>();
    for (int i = 0; i < 1000; i++) {
        tokens.add(new LayoutToken("blabla"));
        tokens.add(new LayoutToken(" "));
    }

    List<List<LayoutToken>> segments = ProcessText.segmentInParagraphs(tokens);
    // with no paragraph marks, the monolithic block is expected to be
    // split arbitrarily into 4 segments
    assertThat(segments, hasSize(4));
}
Example #24
Source File: ProcessTextTest.java From entity-fishing with Apache License 2.0 | 5 votes |
@Test
public void testGetSequenceMatch_multiTokenAcronym_shouldWork() throws Exception {
    String text = "We are proving that the P.C.T. is working fine. P.C.T. will work just fine.";
    final List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);

    // Build the acronym "P.C.T." as six single-character tokens with the
    // character offsets of its first occurrence in the text.
    final LayoutToken acronymLayoutToken1 = new LayoutToken("P");
    acronymLayoutToken1.setOffset(24);
    final LayoutToken acronymLayoutToken2 = new LayoutToken(".");
    acronymLayoutToken2.setOffset(25);
    final LayoutToken acronymLayoutToken3 = new LayoutToken("C");
    acronymLayoutToken3.setOffset(26);
    final LayoutToken acronymLayoutToken4 = new LayoutToken(".");
    acronymLayoutToken4.setOffset(27);
    final LayoutToken acronymLayoutToken5 = new LayoutToken("T");
    acronymLayoutToken5.setOffset(28);
    final LayoutToken acronymLayoutToken6 = new LayoutToken(".");
    acronymLayoutToken6.setOffset(29);

    List<LayoutToken> layoutTokenAcronym = Arrays.asList(acronymLayoutToken1, acronymLayoutToken2, acronymLayoutToken3, acronymLayoutToken4, acronymLayoutToken5, acronymLayoutToken6);

    // Match starting at token index 24 of the tokenized text.
    final List<LayoutToken> sequenceMatch = processText.getSequenceMatch(tokens, 24, layoutTokenAcronym);

    assertThat(sequenceMatch, hasSize(6));
    assertThat(sequenceMatch.get(0), is(tokens.get(24)));
}
Example #25
Source File: ProcessTextTest.java From entity-fishing with Apache License 2.0 | 5 votes |
@Test
public void testGetSequenceMatch_singleTokenAcronym_shouldWork() throws Exception {
    String text = "We are proving that the PCT is working fine. PCT will work just fine.";
    final List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);

    // A single-token acronym "PCT" with the character offset of its first occurrence.
    final LayoutToken pct = new LayoutToken("PCT");
    pct.setOffset(24);

    // Match starting at token index 19 of the tokenized text.
    final List<LayoutToken> sequenceMatch = processText.getSequenceMatch(tokens, 19, Arrays.asList(pct));

    assertThat(sequenceMatch, hasSize(1));
    assertThat(sequenceMatch.get(0), is(tokens.get(19)));
}
Example #26
Source File: TaggingTokenCluster.java From science-result-extractor with Apache License 2.0 | 5 votes |
/**
 * Flatten the layout tokens of all labeled token containers, preserving
 * container order, into a single mutable list.
 *
 * @return a new ArrayList with every container's tokens concatenated
 */
public List<LayoutToken> concatTokens() {
    // Same Guava transform/concat pipeline as before, expressed with a
    // method reference instead of an anonymous Function class.
    Iterable<LayoutToken> flattened = Iterables.concat(
            Iterables.transform(labeledTokensContainers, LabeledTokensContainer::getLayoutTokens));
    return Lists.newArrayList(flattened);
}
Example #27
Source File: ProcessText.java From entity-fishing with Apache License 2.0 | 5 votes |
/**
 * Check whether any segment exceeds the maximal paragraph length.
 *
 * @param segments the candidate paragraph segments
 * @return true when at least one segment is larger than MAXIMAL_PARAGRAPH_LENGTH
 */
private static boolean containsTooLargeSegment(List<List<LayoutToken>> segments) {
    return segments.stream()
            .anyMatch(segment -> segment.size() > MAXIMAL_PARAGRAPH_LENGTH);
}
Example #28
Source File: NERParserCommonTest.java From grobid-ner with Apache License 2.0 | 5 votes |
@Test
public void testresultExtraction_clusteror_simple() throws Exception {
    final String input = "Austria fought the enemies with Germany.";

    // Pre-labeled sequence-model output: one tab-separated feature line per
    // token, label in the last column. "Austria" is B-UNKNOWN, "Germany" is
    // B-LOCATION; both should come back as entities.
    String result = "Austria\taustria\tA\tAu\tAus\tAust\tAustr\ta\tia\tria\ttria\tstria\tINITCAP\tNODIGIT\t0\t0\t0\t0\t1\t0\t0\t1\t1\t1\t1\tXxxx\tXx\t0\tB-UNKNOWN\n" +
            "fought\tfought\tf\tfo\tfou\tfoug\tfough\tt\tht\tght\tught\tought\tNOCAPS\tNODIGIT\t0\t1\t0\t0\t0\t0\t0\t0\t0\t0\t0\txxxx\tx\t0\tO\n" +
            "the\tthe\tt\tth\tthe\tthe\tthe\te\the\tthe\tthe\tthe\tNOCAPS\tNODIGIT\t0\t1\t0\t0\t0\t0\t0\t0\t0\t0\t0\txxx\tx\t0\tO\n" +
            "enemies\tenemies\te\ten\tene\tenem\tenemi\ts\tes\ties\tmies\temies\tNOCAPS\tNODIGIT\t0\t1\t0\t0\t0\t0\t0\t0\t0\t0\t0\txxxx\tx\t0\tO\n" +
            "with\twith\tw\twi\twit\twith\twith\th\tth\tith\twith\twith\tNOCAPS\tNODIGIT\t0\t1\t0\t0\t0\t0\t0\t0\t0\t0\t0\txxxx\tx\t0\tO\n" +
            "Germany\tgermany\tG\tGe\tGer\tGerm\tGerma\ty\tny\tany\tmany\trmany\tINITCAP\tNODIGIT\t0\t0\t0\t0\t1\t0\t0\t0\t0\t0\t0\tXxxx\tXx\t0\tB-LOCATION\n" +
            ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tALLCAPS\tNODIGIT\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t.\t.\t0\tO";

    List<LayoutToken> tokenisation = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
    final List<Entity> entities = target.resultExtraction(GrobidModels.ENTITIES_NER, result, tokenisation);

    assertThat(entities, hasSize(2));

    // Offsets are character positions in the input string.
    final Entity entity0 = entities.get(0);
    assertThat(entity0.getRawName(), is("Austria"));
    assertThat(entity0.getOffsetStart(), is(0));
    assertThat(entity0.getOffsetEnd(), is(7));

    final Entity entity1 = entities.get(1);
    assertThat(entity1.getRawName(), is("Germany"));
    assertThat(entity1.getOffsetStart(), is(32));
    assertThat(entity1.getOffsetEnd(), is(39));
}
Example #29
Source File: ProcessTextTest.java From entity-fishing with Apache License 2.0 | 5 votes |
@Test
public void testAcronymsTokensAllLower() {
    // The base expansion "probabilistic graphical model" is entirely lower case.
    String input = "A graphical model or probabilistic graphical model (PGM) is a probabilistic model.";

    List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input, new Language("en", 1.0));
    // Map: acronym mention (key) -> expanded base mention (value).
    Map<Mention, Mention> acronyms = processText.acronymCandidates(tokens);
    assertThat(acronyms.entrySet(), hasSize(1));

    final ArrayList<Mention> keys = new ArrayList<>(acronyms.keySet());
    final Mention shortAcronym = keys.get(0);
    final Mention extendedAcronym = acronyms.get(shortAcronym);

    assertThat(extendedAcronym.getRawName(), is("probabilistic graphical model"));
    assertThat(input.substring(shortAcronym.getOffsetStart(), shortAcronym.getOffsetEnd()), is("PGM"));
}
Example #30
Source File: ProcessText.java From entity-fishing with Apache License 2.0 | 5 votes |
/**
 * Dispatch to the recognizer matching the requested mention method.
 *
 * @param tokens      the token sequence to analyse
 * @param language    the language of the tokens
 * @param mentionType the requested mention recognition method
 * @return the mentions found by that method; an empty list for methods with
 *         no active recognizer (e.g. species, currently disabled)
 */
private List<Mention> getMentions(List<LayoutToken> tokens, Language language, MentionMethod mentionType) {
    switch (mentionType) {
        case ner:
            return processNER(tokens, language);
        case wikipedia:
            return processWikipedia(tokens, language);
        default:
            return new ArrayList<>();
    }
}