Java Code Examples for org.grobid.core.layout.LayoutToken#setOffset()

The following examples show how to use org.grobid.core.layout.LayoutToken#setOffset(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ProcessTextTest.java    From entity-fishing with Apache License 2.0 5 votes vote down vote up
@Test
public void testGetSequenceMatch_singleTokenAcronym_shouldWork() throws Exception {
    // The sentence contains "PCT" twice; the first occurrence starts at character offset 24.
    final String sentence = "We are proving that the PCT is working fine. PCT will work just fine.";
    final List<LayoutToken> sentenceTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(sentence);

    // The acronym to look for: a single layout token placed on the first "PCT".
    final LayoutToken acronymToken = new LayoutToken("PCT");
    acronymToken.setOffset(24);
    final List<LayoutToken> acronymTokens = Arrays.asList(acronymToken);

    // Matching starts at token index 19; the assertions expect exactly that token back.
    final int startIndex = 19;
    final List<LayoutToken> matched = processText.getSequenceMatch(sentenceTokens, startIndex, acronymTokens);

    assertThat(matched, hasSize(1));
    assertThat(matched.get(0), is(sentenceTokens.get(startIndex)));
}
 
Example 2
Source File: ProcessTextTest.java    From entity-fishing with Apache License 2.0 5 votes vote down vote up
@Test
public void testGetSequenceMatch_multiTokenAcronym_shouldWork() throws Exception {
    final String sentence = "We are proving that the P.C.T. is working fine. P.C.T. will work just fine.";
    final List<LayoutToken> sentenceTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(sentence);

    // The acronym "P.C.T." expressed as six one-character layout tokens sitting at the
    // consecutive character offsets 24..29 (the first occurrence in the sentence).
    final String[] pieces = {"P", ".", "C", ".", "T", "."};
    final LayoutToken[] acronymTokens = new LayoutToken[pieces.length];
    for (int i = 0; i < pieces.length; i++) {
        final LayoutToken piece = new LayoutToken(pieces[i]);
        piece.setOffset(24 + i);
        acronymTokens[i] = piece;
    }
    final List<LayoutToken> acronym = Arrays.asList(acronymTokens);

    // Matching starts at token index 24; the assertions expect a six-token match
    // whose first element is exactly that token.
    final int startIndex = 24;
    final List<LayoutToken> matched = processText.getSequenceMatch(sentenceTokens, startIndex, acronym);

    assertThat(matched, hasSize(6));
    assertThat(matched.get(0), is(sentenceTokens.get(startIndex)));
}
 
Example 3
Source File: ProcessTextTest.java    From entity-fishing with Apache License 2.0 4 votes vote down vote up
@Test
public void testPropagateAcronyms_textSyncronisedWithLayoutTokens_shouldWork() {
    // "PCT" is introduced in parentheses in the first sentence and appears again, without
    // its expansion, in the last sentence. The assertions expect exactly one propagated
    // mention, located on that later occurrence.
    String input = "The Pulse Covariant Transmission (PCT) is a great deal. We are going to make it great again.\n " +
            "We propose a new methodology where the PCT results are improving in the gamma ray action matter.";
    final Language language = new Language("en");
    List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input, language);

    NerdQuery aQuery = new NerdQuery();
    aQuery.setText(input);
    aQuery.setTokens(tokens);

    // Expanded form: characters [4, 32) of the input.
    // The unused per-word layout tokens that used to be built here were never attached to
    // the mention (and the last one carried an inconsistent offset), so they were removed.
    Mention base = new Mention("Pulse Covariant Transmission");
    base.setOffsetStart(4);
    base.setOffsetEnd(32);

    // Acronym where it is first defined: characters [34, 37), carried by one layout token.
    Mention acronym = new Mention("PCT");
    acronym.setNormalisedName("Pulse Covariant Transmission");
    acronym.setOffsetStart(34);
    acronym.setOffsetEnd(37);
    acronym.setIsAcronym(true);
    final LayoutToken acronymLayoutToken = new LayoutToken("PCT");
    acronymLayoutToken.setOffset(34);
    acronym.setLayoutTokens(Arrays.asList(acronymLayoutToken));

    // The context maps each acronym mention to its expanded (base) mention.
    final HashMap<Mention, Mention> acronyms = new HashMap<>();
    acronyms.put(acronym, base);

    final NerdContext nerdContext = new NerdContext();
    nerdContext.setAcronyms(acronyms);
    aQuery.setContext(nerdContext);

    final List<Mention> mentions = processText.propagateAcronyms(aQuery);

    assertThat(mentions, hasSize(1));
    assertThat(mentions.get(0).getRawName(), is("PCT"));
    assertThat(mentions.get(0).getOffsetStart(), is(133));
    assertThat(mentions.get(0).getOffsetEnd(), is(136));
    assertThat(mentions.get(0).getLayoutTokens(), is(Arrays.asList(tokens.get(53))));
    // Bounding boxes are not checked: this test sets no layout coordinates on the tokens.
}
 
Example 4
Source File: ProcessTextTest.java    From entity-fishing with Apache License 2.0 4 votes vote down vote up
@Test
public void testPropagateAcronyms_textNotSyncronisedWithLayoutTokens_shouldWork() {
    String input = "The Pulse Covariant Transmission (PCT) is a great deal. We are going to make it great again.\n " +
            "We propose a new methodology where the PCT results are improving in the gamma ray action matter.";
    final Language language = new Language("en");
    final List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input, language);

    // Shift every token offset by 10 so that token offsets no longer line up with the
    // character positions of the raw text, and give each token coordinates and a size so
    // that bounding boxes can be asserted below. A plain loop is used because the tokens
    // are mutated in place; no new elements are produced.
    for (LayoutToken layoutToken : tokens) {
        layoutToken.setOffset(layoutToken.getOffset() + 10);
        layoutToken.setX(22.3);
        layoutToken.setY(22.3);
        layoutToken.setWidth(10);
        layoutToken.setHeight(30);
    }

    NerdQuery aQuery = new NerdQuery();
    aQuery.setText(input);
    aQuery.setTokens(tokens);

    // Expanded form, expressed in the shifted offsets: [14, 42).
    // The unused per-word layout tokens that used to be built here were never attached to
    // the mention, so they were removed.
    Mention base = new Mention("Pulse Covariant Transmission");
    base.setOffsetStart(14);
    base.setOffsetEnd(42);

    // Acronym where it is first defined, in the shifted offsets: [44, 47).
    Mention acronym = new Mention("PCT");
    acronym.setNormalisedName("Pulse Covariant Transmission");
    acronym.setOffsetStart(44);
    acronym.setOffsetEnd(47);
    acronym.setIsAcronym(true);
    final LayoutToken acronymLayoutToken = new LayoutToken("PCT");
    acronymLayoutToken.setOffset(44);
    acronym.setLayoutTokens(Arrays.asList(acronymLayoutToken));

    // The context maps each acronym mention to its expanded (base) mention.
    final HashMap<Mention, Mention> acronyms = new HashMap<>();
    acronyms.put(acronym, base);

    final NerdContext nerdContext = new NerdContext();
    nerdContext.setAcronyms(acronyms);
    aQuery.setContext(nerdContext);

    final List<Mention> mentions = processText.propagateAcronyms(aQuery);

    assertThat(mentions, hasSize(1));
    assertThat(mentions.get(0).getRawName(), is("PCT"));
    assertThat(mentions.get(0).getOffsetStart(), is(143));
    assertThat(mentions.get(0).getOffsetEnd(), is(146));
    assertThat(mentions.get(0).getBoundingBoxes(), hasSize(greaterThan(0)));
    assertThat(mentions.get(0).getLayoutTokens(), is(Arrays.asList(tokens.get(53))));
}
 
Example 5
Source File: ProcessTextTest.java    From entity-fishing with Apache License 2.0 4 votes vote down vote up
@Test
public void testPropagateAcronyms_textNotSyncronisedWithLayoutTokens2_shouldWork() {
    // "P.C.T." is introduced in parentheses and then appears twice more; the assertions
    // expect those two later occurrences back as propagated mentions of six tokens each.
    String input = "The Pulse Covariant Transmission (P.C.T.) is a great deal. We are going to make it great again.\n " +
            "We propose a new methodology where the P.C.T. results are improving in the gamma ray action matter. " +
            "P.C.T. is good for you";
    final Language language = new Language("en");
    final List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input, language);

    // Shift every token offset by 10 so that token offsets no longer line up with the
    // character positions of the raw text. A plain loop is used because the tokens are
    // mutated in place; no new elements are produced.
    for (LayoutToken layoutToken : tokens) {
        layoutToken.setOffset(layoutToken.getOffset() + 10);
    }

    NerdQuery aQuery = new NerdQuery();
    aQuery.setText(input);
    aQuery.setTokens(tokens);

    // Expanded form, expressed in the shifted offsets: [14, 42).
    // The unused per-word layout tokens that used to be built here were never attached to
    // the mention, so they were removed.
    Mention base = new Mention("Pulse Covariant Transmission");
    base.setOffsetStart(14);
    base.setOffsetEnd(42);

    Mention acronym = new Mention("P.C.T.");
    acronym.setNormalisedName("Pulse Covariant Transmission");
    acronym.setOffsetStart(44);
    // NOTE(review): the raw name "P.C.T." is 6 characters long and its layout tokens below
    // cover offsets 44..49, so an end offset of 50 would be expected; 47 is kept unchanged
    // from the original test -- confirm whether propagateAcronyms depends on this value.
    acronym.setOffsetEnd(47);
    acronym.setIsAcronym(true);

    // The acronym as six one-character layout tokens at consecutive shifted offsets 44..49.
    final String[] acronymPieces = {"P", ".", "C", ".", "T", "."};
    final LayoutToken[] acronymLayoutTokens = new LayoutToken[acronymPieces.length];
    for (int i = 0; i < acronymPieces.length; i++) {
        final LayoutToken piece = new LayoutToken(acronymPieces[i]);
        piece.setOffset(44 + i);
        acronymLayoutTokens[i] = piece;
    }
    acronym.setLayoutTokens(Arrays.asList(acronymLayoutTokens));

    // The context maps each acronym mention to its expanded (base) mention.
    final HashMap<Mention, Mention> acronyms = new HashMap<>();
    acronyms.put(acronym, base);

    final NerdContext nerdContext = new NerdContext();
    nerdContext.setAcronyms(acronyms);
    aQuery.setContext(nerdContext);

    final List<Mention> mentions = processText.propagateAcronyms(aQuery);

    // Bounding boxes are not checked for either mention: this test sets no layout
    // coordinates on the tokens.
    assertThat(mentions, hasSize(2));

    assertThat(mentions.get(0).getRawName(), is("P.C.T."));
    assertThat(mentions.get(0).getOffsetStart(), is(146));
    assertThat(mentions.get(0).getOffsetEnd(), is(152));
    assertThat(mentions.get(0).getLayoutTokens(), hasSize(6));

    assertThat(mentions.get(1).getRawName(), is("P.C.T."));
    assertThat(mentions.get(1).getOffsetStart(), is(207));
    assertThat(mentions.get(1).getOffsetEnd(), is(213));
    assertThat(mentions.get(1).getLayoutTokens(), hasSize(6));
}