org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory#create

Source File: JapaneseTokenizerTest.java From deeplearning4j with Apache License 2.0

6 votes

/**
 * Walks a tokenizer created by {@link JapaneseTokenizerFactory} to the end and checks that
 * the tokens match {@code expect}, that the tokenizer then reports no more tokens, and that
 * {@code countTokens()} and {@code getTokens()} still report the full token count afterwards.
 */
@Test
public void testGetTokens() throws Exception {
    TokenizerFactory tf = new JapaneseTokenizerFactory();

    Tokenizer tokenizer = tf.create(toTokenize);

    // Exhaust iterator.
    assertEquals(expect.length, tokenizer.countTokens());
    for (int i = 0; i < tokenizer.countTokens(); ++i) {
        // Expected value goes first so that a failure message labels the two sides correctly.
        assertEquals(expect[i], tokenizer.nextToken());
    }

    // Ensure exhausted.
    assertEquals(false, tokenizer.hasMoreTokens());

    // Count doesn't change.
    assertEquals(expect.length, tokenizer.countTokens());

    // getTokens still returns everything.
    List<String> tokens = tokenizer.getTokens();
    assertEquals(expect.length, tokens.size());
}

Source File: Windows.java From deeplearning4j with Apache License 2.0

6 votes

/**
 * Tokenizes the given text, keeps only the tokens for which a word vector is available,
 * and builds windows of the requested size from them.
 * Note that padding for each window is created as well.
 *
 * @param words the words to tokenize and construct windows from
 * @param tokenizerFactory tokenizer factory to use
 * @param windowSize the window size to generate
 * @param vectors word vectors used as a filter: a token is kept only when
 *                {@code vectors.getWordVectorMatrix(token)} is not null
 * @return the list of windows for the tokenized string
 * @throws IllegalStateException if no token is left after filtering
 */
public static List<Window> windows(String words, @NonNull TokenizerFactory tokenizerFactory, int windowSize,
                WordVectors vectors) {
    final List<String> knownTokens = new ArrayList<>();

    for (Tokenizer source = tokenizerFactory.create(words); source.hasMoreTokens();) {
        final String candidate = source.nextToken();

        // without an UNK word defined there is no vector for this token, so it is skipped
        if (vectors.getWordVectorMatrix(candidate) == null) {
            continue;
        }
        knownTokens.add(candidate);
    }

    if (!knownTokens.isEmpty()) {
        return windows(knownTokens, windowSize);
    }

    throw new IllegalStateException("No tokens found for windows");
}

Source File: BertWordPieceTokenizerTests.java From deeplearning4j with Apache License 2.0

6 votes

/**
 * Tokenizes two long compound words separated by a space, once from a String and once from
 * an InputStream, and checks that both yield the expected word pieces and that the pieces
 * can be reconstructed into the original text.
 */
@Test
@Ignore("AB 2019/05/24 - Disabled until dev branch merged - see issue #7657")
public void testBertWordPieceTokenizer5() throws Exception {
    // Longest Token in Vocab is 22 chars long, so make sure splits on the edge are properly handled
    String toTokenize = "Donaudampfschifffahrts Kapitänsmützeninnenfuttersaum";
    TokenizerFactory t = new BertWordPieceTokenizerFactory(pathToVocab, false, false, c);
    Tokenizer tokenizer = t.create(toTokenize);
    // The text contains non-ASCII characters, so the bytes must not depend on the platform
    // default charset.
    // NOTE(review): assumes `c` is the charset the factory uses to decode streams - confirm.
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes(c)));

    final List<String> expected = Arrays.asList("Donau", "##dam", "##pf", "##schiff", "##fahrt", "##s", "Kapitän", "##sm", "##ützen", "##innen", "##fu", "##tter", "##sa", "##um");
    assertEquals(expected, tokenizer.getTokens());
    assertEquals(expected, tokenizer2.getTokens());

    String s2 = BertWordPiecePreProcessor.reconstructFromTokens(tokenizer.getTokens());
    assertEquals(toTokenize, s2);
}

Source File: DefaultDocumentIteratorTest.java From deeplearning4j with Apache License 2.0

6 votes

/**
 * Reads the first document from the {@code /reuters/5250} classpath resource through a
 * {@link FileDocumentIterator} and checks that its leading tokens are
 * "PEARSON CONCENTRATES ON FOUR SECTORS".
 */
@Test
public void testDocumentIterator() throws Exception {
    ClassPathResource reuters5250 = new ClassPathResource("/reuters/5250");
    File f = reuters5250.getFile();

    DocumentIterator iter = new FileDocumentIterator(f.getAbsolutePath());

    // try-with-resources closes the stream even when one of the assertions below fails
    try (InputStream doc = iter.nextDocument()) {
        TokenizerFactory t = new DefaultTokenizerFactory();
        Tokenizer next = t.create(doc);
        String[] list = "PEARSON CONCENTRATES ON FOUR SECTORS".split(" ");
        ///PEARSON CONCENTRATES ON FOUR SECTORS
        int count = 0;
        // compare only as many tokens as there are expected words
        while (next.hasMoreTokens() && count < list.length) {
            String token = next.nextToken();
            assertEquals(list[count++], token);
        }
    }
}

Source File: DefaulTokenizerTests.java From deeplearning4j with Apache License 2.0

6 votes

/**
 * Compares, token by token, a tokenizer built from a String with one built from a stream of
 * the same text, then checks that the token counts for the {@code reuters/5250} resource
 * differ by less than 2 between String and stream input.
 */
@Test
public void testDefaultTokenizer1() throws Exception {
    final String sentence = "Mary had a little lamb.";
    final TokenizerFactory factory = new DefaultTokenizerFactory();
    final Tokenizer fromString = factory.create(sentence);
    final Tokenizer fromStream = factory.create(new ByteArrayInputStream(sentence.getBytes()));

    // the stream-backed tokenizer drives the loop
    for (int position = 1; fromStream.hasMoreTokens(); position++) {
        final String tok1 = fromString.nextToken();
        final String tok2 = fromStream.nextToken();
        log.info("Position: [" + position + "], token1: '" + tok1 + "', token 2: '" + tok2 + "'");
        assertEquals(tok1, tok2);
    }

    final ClassPathResource corpus = new ClassPathResource("reuters/5250");
    final String corpusText = FileUtils.readFileToString(corpus.getFile());
    final int countFromString = factory.create(corpusText).countTokens();
    final int countFromStream = factory.create(corpus.getInputStream()).countTokens();
    final int difference = Math.abs(countFromString - countFromStream);
    assertTrue(difference < 2);
}

Source File: DefaulTokenizerTests.java From deeplearning4j with Apache License 2.0

6 votes

/**
 * Same comparison as the String-vs-stream token check, but calls {@code countTokens()} on
 * the stream tokenizer before iterating and lets the String tokenizer drive the loop.
 * Afterwards the token counts for the {@code reuters/5250} resource must differ by less
 * than 2 between String and stream input.
 */
@Test
public void testDefaultTokenizer2() throws Exception {
    final String sentence = "Mary had a little lamb.";
    final TokenizerFactory factory = new DefaultTokenizerFactory();
    final Tokenizer fromString = factory.create(sentence);
    final Tokenizer fromStream = factory.create(new ByteArrayInputStream(sentence.getBytes()));

    // count first, then iterate: the tokens must still come out one by one
    fromStream.countTokens();
    while (fromString.hasMoreTokens()) {
        final String left = fromString.nextToken();
        final String right = fromStream.nextToken();
        assertEquals(left, right);
    }

    System.out.println("-----------------------------------------------");

    final ClassPathResource corpus = new ClassPathResource("reuters/5250");
    final String corpusText = FileUtils.readFileToString(corpus.getFile());
    final int stringCount = factory.create(corpusText).countTokens();
    final int streamCount = factory.create(corpus.getInputStream()).countTokens();

    log.info("String tok: [" + stringCount + "], Stream tok: [" + streamCount + "], Difference: "
                    + Math.abs(stringCount - streamCount));

    final boolean closeEnough = Math.abs(stringCount - streamCount) < 2;
    assertTrue(closeEnough);
}

Source File: DefaulTokenizerTests.java From deeplearning4j with Apache License 2.0

6 votes

/**
 * Checks that a stream-backed tokenizer reports 5 tokens for the sample sentence and that
 * iterating it also produces exactly 5 tokens.
 */
@Test
public void testDefaultStreamTokenizer() throws Exception {
    final String sentence = "Mary had a little lamb.";
    final TokenizerFactory factory = new DefaultTokenizerFactory();
    final Tokenizer streamTokenizer = factory.create(new ByteArrayInputStream(sentence.getBytes()));

    assertEquals(5, streamTokenizer.countTokens());

    // log every token and count how many were produced
    int seen = 0;
    for (; streamTokenizer.hasMoreTokens(); seen++) {
        log.info(streamTokenizer.nextToken());
    }

    assertEquals(5, seen);
}

Source File: BertWordPieceTokenizerTests.java From deeplearning4j with Apache License 2.0

6 votes

/**
 * Compares a String-backed and a stream-backed BERT word-piece tokenizer token by token and,
 * on every step, checks that the String tokenizer's token list reconstructs the input.
 */
@Test
public void testBertWordPieceTokenizer1() throws Exception {
    final String sentence = "I saw a girl with a telescope.";
    final TokenizerFactory factory = new BertWordPieceTokenizerFactory(pathToVocab, false, false, c);
    final Tokenizer fromString = factory.create(sentence);
    final Tokenizer fromStream = factory.create(new ByteArrayInputStream(sentence.getBytes()));

    // the stream-backed tokenizer drives the loop
    for (int position = 1; fromStream.hasMoreTokens();) {
        final String tok1 = fromString.nextToken();
        final String tok2 = fromStream.nextToken();
        log.info("Position: [" + position + "], token1: '" + tok1 + "', token 2: '" + tok2 + "'");
        position++;
        assertEquals(tok1, tok2);

        final String rebuilt = BertWordPiecePreProcessor.reconstructFromTokens(fromString.getTokens());
        assertEquals(sentence, rebuilt);
    }
}

Source File: DefaulTokenizerTests.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Checks that a tokenizer built from a String and one built from a stream of the same text
 * produce equal tokens at every position.
 */
@Test
public void testDefaultTokenizer3() throws Exception {
    final String sentence = "Mary had a little lamb.";
    final TokenizerFactory factory = new DefaultTokenizerFactory();
    final Tokenizer fromString = factory.create(sentence);
    final Tokenizer fromStream = factory.create(new ByteArrayInputStream(sentence.getBytes()));

    // the stream-backed tokenizer drives the loop
    for (int position = 1; fromStream.hasMoreTokens(); position++) {
        final String tok1 = fromString.nextToken();
        final String tok2 = fromStream.nextToken();
        log.info("Position: [" + position + "], token1: '" + tok1 + "', token 2: '" + tok2 + "'");
        assertEquals(tok1, tok2);
    }
}

Source File: JapaneseTokenizerTest.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Checks the first token produced for {@code toTokenize} and for {@code baseString} by a
 * {@link JapaneseTokenizerFactory} constructed with {@code true}
 * (presumably the base-form switch, going by the test name - confirm against the factory).
 */
@Test
public void testBaseForm() throws Exception {
    final TokenizerFactory factory = new JapaneseTokenizerFactory(true);

    final Tokenizer first = factory.create(toTokenize);
    final Tokenizer second = factory.create(baseString);

    final String firstHead = first.nextToken();
    assertEquals("黒い", firstHead);

    final String secondHead = second.nextToken();
    assertEquals("驚く", secondHead);
}

Source File: JapaneseTokenizerTest.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Checks that a tokenizer from {@link JapaneseTokenizerFactory} reports as many tokens as
 * {@code expect} holds and returns them in the same order.
 */
@Test
public void testJapaneseTokenizer() throws Exception {
    TokenizerFactory t = new JapaneseTokenizerFactory();
    Tokenizer tokenizer = t.create(toTokenize);

    assertEquals(expect.length, tokenizer.countTokens());
    for (int i = 0; i < tokenizer.countTokens(); ++i) {
        // Expected value goes first so that a failure message labels the two sides correctly.
        assertEquals(expect[i], tokenizer.nextToken());
    }
}

Source File: BertWordPieceTokenizerTests.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Checks the full word-piece list for the sample sentence, for both String and stream input,
 * and that the pieces reconstruct the original sentence.
 */
@Test
public void testBertWordPieceTokenizer4() throws Exception {
    final String sentence = "I saw a girl with a telescope.";
    final TokenizerFactory factory = new BertWordPieceTokenizerFactory(pathToVocab, false, false, c);
    final Tokenizer fromString = factory.create(sentence);
    final Tokenizer fromStream = factory.create(new ByteArrayInputStream(sentence.getBytes()));

    final List<String> expected = Arrays.asList("I", "saw", "a", "girl", "with", "a", "tele", "##scope", ".");

    final List<String> stringPieces = fromString.getTokens();
    assertEquals(expected, stringPieces);

    final List<String> streamPieces = fromStream.getTokens();
    assertEquals(expected, streamPieces);

    // fetch the token list again for the round trip, as a separate call
    final String rebuilt = BertWordPiecePreProcessor.reconstructFromTokens(fromString.getTokens());
    assertEquals(sentence, rebuilt);
}

Source File: BertWordPieceTokenizerTests.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Tokenizes one very long compound word, once from a String and once from an InputStream,
 * and checks that both yield the expected word pieces and that the pieces can be
 * reconstructed into the original text.
 */
@Test
public void testBertWordPieceTokenizer3() throws Exception {
    String toTokenize = "Donaudampfschifffahrtskapitänsmützeninnenfuttersaum";
    TokenizerFactory t = new BertWordPieceTokenizerFactory(pathToVocab, false, false, c);
    Tokenizer tokenizer = t.create(toTokenize);
    // The text contains non-ASCII characters, so the bytes must not depend on the platform
    // default charset.
    // NOTE(review): assumes `c` is the charset the factory uses to decode streams - confirm.
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes(c)));

    final List<String> expected = Arrays.asList("Donau", "##dam", "##pf", "##schiff", "##fahrt", "##skap", "##itä", "##ns", "##m", "##ützen", "##innen", "##fu", "##tter", "##sa", "##um");
    assertEquals(expected, tokenizer.getTokens());
    assertEquals(expected, tokenizer2.getTokens());

    String s2 = BertWordPiecePreProcessor.reconstructFromTokens(tokenizer.getTokens());
    assertEquals(toTokenize, s2);
}

Source File: KoreanTokenizerTest.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Checks that a tokenizer from {@link KoreanTokenizerFactory} splits the sample sentence
 * into the expected tokens, in order.
 */
@Test
public void testKoreanTokenizer() throws Exception {
    String toTokenize = "세계 최초의 상용 수준 오픈소스 딥러닝 라이브러리입니다";
    TokenizerFactory t = new KoreanTokenizerFactory();
    Tokenizer tokenizer = t.create(toTokenize);
    String[] expect = {"세계", "최초", "의", "상용", "수준", "오픈소스", "딥", "러닝", "라이브러리", "입니", "다"};

    assertEquals(expect.length, tokenizer.countTokens());

    for (int i = 0; i < tokenizer.countTokens(); ++i) {
        // Expected value goes first so that a failure message labels the two sides correctly.
        assertEquals(expect[i], tokenizer.nextToken());
    }
}

Source File: Windows.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Tokenizes the given text and builds windows from all of its tokens, using a fixed
 * window size of 5.
 * Note that padding for each window is created as well.
 *
 * @param words the words to tokenize and construct windows from
 * @param tokenizerFactory tokenizer factory to use
 * @return the list of windows for the tokenized string
 */
public static List<Window> windows(String words, TokenizerFactory tokenizerFactory) {
    final List<String> tokens = new ArrayList<>();

    // drain the tokenizer into a plain list
    for (Tokenizer source = tokenizerFactory.create(words); source.hasMoreTokens();) {
        tokens.add(source.nextToken());
    }

    return windows(tokens, 5);
}

Source File: Windows.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Tokenizes the content of the given stream and builds windows of the requested size from
 * all of its tokens.
 * Note that padding for each window is created as well.
 *
 * @param words stream with the words to tokenize and construct windows from
 * @param tokenizerFactory tokenizer factory to use
 * @param windowSize the window size to generate
 * @return the list of windows for the tokenized input
 * @throws IllegalStateException if the tokenizer produced no tokens
 */
public static List<Window> windows(InputStream words, TokenizerFactory tokenizerFactory, int windowSize) {
    final List<String> tokens = new ArrayList<>();

    // drain the tokenizer into a plain list
    for (Tokenizer source = tokenizerFactory.create(words); source.hasMoreTokens();) {
        tokens.add(source.nextToken());
    }

    if (!tokens.isEmpty()) {
        return windows(tokens, windowSize);
    }

    throw new IllegalStateException("No tokens found for windows");
}

Source File: ChineseTokenizerTest.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Checks that a tokenizer from {@link ChineseTokenizerFactory} reports as many tokens as
 * {@code expect} holds and returns them in the same order.
 */
@Test
public void testChineseTokenizer() {
    TokenizerFactory tokenizerFactory = new ChineseTokenizerFactory();
    Tokenizer tokenizer = tokenizerFactory.create(toTokenize);
    assertEquals(expect.length, tokenizer.countTokens());
    for (int i = 0; i < tokenizer.countTokens(); ++i) {
        // Expected value goes first so that a failure message labels the two sides correctly.
        assertEquals(expect[i], tokenizer.nextToken());
    }
}

Source File: ContextLabelRetriever.java From deeplearning4j with Apache License 2.0

4 votes

/**
 * Returns a stripped sentence with the indices of words
 * with certain kinds of labels.
 * <p>
 * Tokens matching {@code BEGIN_LABEL} or {@code END_LABEL} are left out of the returned
 * sentence. The tokens between a begin label and its end label form one segment recorded
 * under that label (with the characters {@code <}, {@code >} and {@code /} removed);
 * tokens outside of a label pair form segments recorded as "NONE" (or "none" for the
 * trailing segment). Segments are joined with single spaces, and each segment's
 * [begin, end) character span in the joined sentence is stored in the returned map.
 *
 * @param sentence the sentence to process
 * @param tokenizerFactory factory used to create the tokenizer that splits the sentence
 * @return a pair of a post processed sentence
 * with labels stripped and the spans of
 * the labels
 * @throws IllegalStateException if a begin label is found while an end label is still
 *         pending, or an end label is found without a preceding begin label
 */
public static Pair<String, MultiDimensionalMap<Integer, Integer, String>> stringWithLabels(String sentence,
                                                               TokenizerFactory tokenizerFactory) {
    MultiDimensionalMap<Integer, Integer, String> map = MultiDimensionalMap.newHashBackedMap();
    Tokenizer t = tokenizerFactory.create(sentence);
    List<String> currTokens = new ArrayList<>();
    String currLabel = null;
    String endLabel = null;
    List<Pair<String, List<String>>> tokensWithSameLabel = new ArrayList<>();
    while (t.hasMoreTokens()) {
        String token = t.nextToken();
        if (token.matches(BEGIN_LABEL)) {
            if (endLabel != null)
                throw new IllegalStateException(
                                "Tried parsing sentence; found an end label when the begin label has not been cleared");
            currLabel = token;

            //no labels; add these as NONE and begin the new label
            if (!currTokens.isEmpty()) {
                tokensWithSameLabel.add(new Pair<>("NONE", (List<String>) new ArrayList<>(currTokens)));
                currTokens.clear();

            }

        } else if (token.matches(END_LABEL)) {
            if (currLabel == null)
                throw new IllegalStateException("Found an ending label with no matching begin label");
            endLabel = token;
        } else
            currTokens.add(token);

        // a complete begin/end pair closes the current segment
        if (currLabel != null && endLabel != null) {
            currLabel = currLabel.replaceAll("[<>/]", "");
            endLabel = endLabel.replaceAll("[<>/]", "");
            Preconditions.checkState(!currLabel.isEmpty(), "Current label is empty!");
            Preconditions.checkState(!endLabel.isEmpty(), "End label is empty!");
            Preconditions.checkState(currLabel.equals(endLabel), "Current label begin and end did not match for the parse. Was: %s ending with %s", currLabel, endLabel);

            tokensWithSameLabel.add(new Pair<>(currLabel, (List<String>) new ArrayList<>(currTokens)));
            currTokens.clear();

            //clear out the tokens
            currLabel = null;
            endLabel = null;
        }
    }

    //trailing tokens after the last label pair
    //NOTE(review): this segment is labelled "none" while the unlabelled segments above use
    //"NONE" - confirm which spelling callers expect before unifying them
    if (!currTokens.isEmpty()) {
        tokensWithSameLabel.add(new Pair<>("none", (List<String>) new ArrayList<>(currTokens)));
        currTokens.clear();

    }

    //now join the output
    StringBuilder strippedSentence = new StringBuilder();
    for (Pair<String, List<String>> tokensWithLabel : tokensWithSameLabel) {
        String joinedSentence = StringUtils.join(tokensWithLabel.getSecond(), " ");
        //spaces between separate parts of the sentence
        if (strippedSentence.length() > 0)
            strippedSentence.append(" ");
        //the segment starts where the builder currently ends; searching for the text with
        //indexOf would return its FIRST occurrence and give repeated segments the wrong span
        int begin = strippedSentence.length();
        strippedSentence.append(joinedSentence);
        int end = strippedSentence.length();
        map.put(begin, end, tokensWithLabel.getFirst());
    }

    return new Pair<>(strippedSentence.toString(), map);
}

Java Code Examples for org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory#create()