org.datavec.nlp.tokenization.tokenizer.Tokenizer Java Examples

Source File: VasttextTextVectorizer.java From scava with Eclipse Public License 2.0

6 votes

/**
 * Fits the vectorizer on every record supplied by the reader.
 * <p>
 * For each record the first writable is converted to text and tokenized, the cache's
 * document counter is incremented, and the tokens are handed to {@code doWithTokens}
 * (or {@code doWithTokensStopWords} when stop words are configured). When
 * {@code maxNgrams} is greater than one, the n-grams generated from the retained tokens
 * are processed as well.
 *
 * @param reader   source of the records to fit on
 * @param callBack callback invoked with each record after it has been processed;
 *                 skipped when {@code null}
 */
@Override
public void fit(RecordReader reader, RecordCallBack callBack) {
    while (reader.hasNext()) {
        Record record = reader.nextRecord();
        String s = record.getRecord().get(0).toString();
        Tokenizer tokenizer = tokenizerFactory.create(s);
        cache.incrementNumDocs(1);
        // These tokens may differ from the tokenizer's output when stop words are filtered out.
        List<String> tokens = (stopWords == null)
                ? doWithTokens(tokenizer)
                : doWithTokensStopWords(tokenizer);
        if (maxNgrams > 1) {
            doWithNgram(ngramsGenerator(tokens));
        }
        if (callBack != null) {
            callBack.onRecord(record);
        }
    }
}

Source File: VasttextTextVectorizer.java From scava with Eclipse Public License 2.0

5 votes

/**
 * Parses the textual form of a writable into a list of integers, one per token.
 *
 * @param writable the writable whose string form holds the tokens to parse
 * @return the integers parsed from the tokens, in the order the tokenizer yields them
 * @throws NumberFormatException if a token is not a parsable integer
 */
private List<Integer> featuresWeightsFromRecord(Writable writable)
{
    Tokenizer weightTokens = tokenizerFactory.create(writable.toString());
    List<Integer> parsed = new ArrayList<Integer>();
    while (weightTokens.hasMoreTokens()) {
        String rawWeight = weightTokens.nextToken();
        parsed.add(Integer.valueOf(rawWeight));
    }
    return parsed;
}

Source File: Windows.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Builds windows of the requested size over the tokens read from a stream.
 * Padding for each window is created as well.
 *
 * @param words      stream supplying the text to tokenize and build windows from
 * @param windowSize the size of each generated window
 * @return the list of windows for the tokenized stream content
 */
public static List<Window> windows(InputStream words, int windowSize) {
    Tokenizer streamTokenizer = new DefaultStreamTokenizer(words);
    List<String> tokens = new ArrayList<>();
    while (streamTokenizer.hasMoreTokens()) {
        String next = streamTokenizer.nextToken();
        tokens.add(next);
    }
    return windows(tokens, windowSize);
}

Source File: Windows.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Builds windows of the requested size over the tokens that the supplied factory's
 * tokenizer reads from a stream. Padding for each window is created as well.
 *
 * @param words            stream supplying the text to tokenize and build windows from
 * @param tokenizerFactory factory used to create the tokenizer for the stream
 * @param windowSize       the size of each generated window
 * @return the list of windows for the tokenized stream content
 * @throws IllegalStateException if the tokenizer yields no tokens
 */
public static List<Window> windows(InputStream words, TokenizerFactory tokenizerFactory, int windowSize) {
    Tokenizer source = tokenizerFactory.create(words);
    List<String> tokens = new ArrayList<>();
    while (source.hasMoreTokens()) {
        tokens.add(source.nextToken());
    }

    if (!tokens.isEmpty()) {
        return windows(tokens, windowSize);
    }
    throw new IllegalStateException("No tokens found for windows");
}

Source File: TextVectorizer.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Counts how often each token occurs in the text form of a record.
 *
 * @param record the writables making up the record
 * @return a counter incremented by 1.0 for every token the tokenizer yields
 */
protected Counter<String> wordFrequenciesForRecord(Collection<Writable> record) {
    Tokenizer recordTokens = tokenizerFactory.create(toString(record));
    Counter<String> frequencies = new Counter<>();
    while (recordTokens.hasMoreTokens()) {
        String word = recordTokens.nextToken();
        frequencies.incrementCount(word, 1.0);
    }
    return frequencies;
}

Source File: TextVectorizer.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Fits the vectorizer on every record supplied by the reader.
 * <p>
 * Each record is converted to text, tokenized and passed to {@code doWithTokens};
 * the optional callback is then notified and the cache's document counter is incremented.
 *
 * @param reader   source of the records to fit on
 * @param callBack callback invoked with each processed record; skipped when {@code null}
 */
@Override
public void fit(RecordReader reader, RecordCallBack callBack) {
    final boolean notifyCaller = callBack != null;
    while (reader.hasNext()) {
        Record current = reader.nextRecord();
        String text = toString(current.getRecord());
        doWithTokens(tokenizerFactory.create(text));
        if (notifyCaller) {
            callBack.onRecord(current);
        }
        cache.incrementNumDocs(1);
    }
}

Source File: AbstractTfidfVectorizer.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Updates the cache with the tokens of a single document.
 * <p>
 * Tokens contained in {@code stopWords} are skipped. Every other token increments its
 * count in the cache on each occurrence, and its document count only the first time it
 * is seen during this call.
 *
 * @param tokenizer tokenizer positioned over the document's tokens
 */
@Override
public void doWithTokens(Tokenizer tokenizer) {
    Set<String> countedInDocument = new HashSet<>();
    while (tokenizer.hasMoreTokens()) {
        String word = tokenizer.nextToken();
        if (stopWords.contains(word)) {
            continue;
        }
        cache.incrementCount(word);
        // Set.add reports true only for a token not yet seen in this document.
        if (countedInDocument.add(word)) {
            cache.incrementDocCount(word);
        }
    }
}

Source File: Windows.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Builds windows of the requested size over the tokens of a string, using a tokenizer
 * created by the supplied factory. Padding for each window is created as well.
 *
 * @param words            the text to tokenize and build windows from
 * @param tokenizerFactory factory used to create the tokenizer for the text
 * @param windowSize       the size of each generated window
 * @return the list of windows for the tokenized string
 * @throws IllegalStateException if the tokenizer yields no tokens
 */
public static List<Window> windows(String words, TokenizerFactory tokenizerFactory, int windowSize) {
    Tokenizer source = tokenizerFactory.create(words);
    List<String> tokens = new ArrayList<>();
    while (source.hasMoreTokens()) {
        tokens.add(source.nextToken());
    }

    if (!tokens.isEmpty()) {
        return windows(tokens, windowSize);
    }
    throw new IllegalStateException("No tokens found for windows");
}

Source File: Windows.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Builds windows of size 5 over the tokens of a string, using a tokenizer created by
 * the supplied factory. Padding for each window is created as well.
 *
 * @param words            the text to tokenize and build windows from
 * @param tokenizerFactory factory used to create the tokenizer for the text
 * @return the list of windows for the tokenized string
 */
public static List<Window> windows(String words, TokenizerFactory tokenizerFactory) {
    Tokenizer source = tokenizerFactory.create(words);
    List<String> tokens = new ArrayList<>();
    while (source.hasMoreTokens()) {
        String next = source.nextToken();
        tokens.add(next);
    }
    return windows(tokens, 5);
}

Source File: UimaTokenizerFactory.java From DataVec with Apache License 2.0

5 votes

/**
 * Creates a {@code UimaTokenizer} over the given text, built with this factory's
 * {@code uimaResource} and {@code checkForLabel} settings, and applies the factory's
 * token pre-processor to it.
 *
 * @param toTokenize the text to tokenize; must be neither {@code null} nor empty
 * @return a tokenizer for the text
 * @throws IllegalArgumentException if {@code toTokenize} is {@code null} or empty
 */
@Override
public Tokenizer create(String toTokenize) {
    if (toTokenize == null || toTokenize.isEmpty()) {
        throw new IllegalArgumentException("Unable to proceed; no sentence to tokenize");
    }
    Tokenizer ret = new UimaTokenizer(toTokenize, uimaResource, checkForLabel);
    ret.setTokenPreProcessor(preProcess);
    return ret;
}

Source File: UimaTokenizerFactory.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Creates a {@code UimaTokenizer} over the given text, built with this factory's
 * {@code uimaResource} and {@code checkForLabel} settings, and applies the factory's
 * token pre-processor to it.
 *
 * @param toTokenize the text to tokenize; must be neither {@code null} nor empty
 * @return a tokenizer for the text
 * @throws IllegalArgumentException if {@code toTokenize} is {@code null} or empty
 */
@Override
public Tokenizer create(String toTokenize) {
    if (toTokenize == null || toTokenize.isEmpty()) {
        throw new IllegalArgumentException("Unable to proceed; no sentence to tokenize");
    }
    Tokenizer ret = new UimaTokenizer(toTokenize, uimaResource, checkForLabel);
    ret.setTokenPreProcessor(preProcess);
    return ret;
}

Source File: Windows.java From DataVec with Apache License 2.0

5 votes

/**
 * Builds windows of size 5 over the tokens of a string, using a tokenizer created by
 * the supplied factory. Padding for each window is created as well.
 *
 * @param words            the text to tokenize and build windows from
 * @param tokenizerFactory factory used to create the tokenizer for the text
 * @return the list of windows for the tokenized string
 */
public static List<Window> windows(String words, TokenizerFactory tokenizerFactory) {
    Tokenizer source = tokenizerFactory.create(words);
    List<String> tokens = new ArrayList<>();
    while (source.hasMoreTokens()) {
        String next = source.nextToken();
        tokens.add(next);
    }
    return windows(tokens, 5);
}

Source File: Windows.java From DataVec with Apache License 2.0

5 votes

/**
 * Builds windows of the requested size over the tokens of a string, using a tokenizer
 * created by the supplied factory. Padding for each window is created as well.
 *
 * @param words            the text to tokenize and build windows from
 * @param tokenizerFactory factory used to create the tokenizer for the text
 * @param windowSize       the size of each generated window
 * @return the list of windows for the tokenized string
 * @throws IllegalStateException if the tokenizer yields no tokens
 */
public static List<Window> windows(String words, TokenizerFactory tokenizerFactory, int windowSize) {
    Tokenizer source = tokenizerFactory.create(words);
    List<String> tokens = new ArrayList<>();
    while (source.hasMoreTokens()) {
        tokens.add(source.nextToken());
    }

    if (!tokens.isEmpty()) {
        return windows(tokens, windowSize);
    }
    throw new IllegalStateException("No tokens found for windows");
}

Source File: Windows.java From DataVec with Apache License 2.0

5 votes

/**
 * Builds windows of the requested size over the tokens that the supplied factory's
 * tokenizer reads from a stream. Padding for each window is created as well.
 *
 * @param words            stream supplying the text to tokenize and build windows from
 * @param tokenizerFactory factory used to create the tokenizer for the stream
 * @param windowSize       the size of each generated window
 * @return the list of windows for the tokenized stream content
 * @throws IllegalStateException if the tokenizer yields no tokens
 */
public static List<Window> windows(InputStream words, TokenizerFactory tokenizerFactory, int windowSize) {
    Tokenizer source = tokenizerFactory.create(words);
    List<String> tokens = new ArrayList<>();
    while (source.hasMoreTokens()) {
        tokens.add(source.nextToken());
    }

    if (!tokens.isEmpty()) {
        return windows(tokens, windowSize);
    }
    throw new IllegalStateException("No tokens found for windows");
}

Source File: Windows.java From DataVec with Apache License 2.0

5 votes

/**
 * Builds windows of the requested size over the tokens read from a stream.
 * Padding for each window is created as well.
 *
 * @param words      stream supplying the text to tokenize and build windows from
 * @param windowSize the size of each generated window
 * @return the list of windows for the tokenized stream content
 */
public static List<Window> windows(InputStream words, int windowSize) {
    Tokenizer streamTokenizer = new DefaultStreamTokenizer(words);
    List<String> tokens = new ArrayList<>();
    while (streamTokenizer.hasMoreTokens()) {
        String next = streamTokenizer.nextToken();
        tokens.add(next);
    }
    return windows(tokens, windowSize);
}

Source File: TextVectorizer.java From DataVec with Apache License 2.0

5 votes

/**
 * Counts how often each token occurs in the text form of a record.
 *
 * @param record the writables making up the record
 * @return a counter incremented by 1.0 for every token the tokenizer yields
 */
protected Counter<String> wordFrequenciesForRecord(Collection<Writable> record) {
    Tokenizer recordTokens = tokenizerFactory.create(toString(record));
    Counter<String> frequencies = new Counter<>();
    while (recordTokens.hasMoreTokens()) {
        String word = recordTokens.nextToken();
        frequencies.incrementCount(word, 1.0);
    }
    return frequencies;
}

Source File: TextVectorizer.java From DataVec with Apache License 2.0

5 votes

/**
 * Fits the vectorizer on every record supplied by the reader.
 * <p>
 * Each record is converted to text and tokenized; the cache's document counter is
 * incremented, the tokens are passed to {@code doWithTokens}, and finally the optional
 * callback is notified.
 *
 * @param reader   source of the records to fit on
 * @param callBack callback invoked with each processed record; skipped when {@code null}
 */
@Override
public void fit(RecordReader reader, RecordCallBack callBack) {
    final boolean notifyCaller = callBack != null;
    while (reader.hasNext()) {
        Record current = reader.nextRecord();
        String text = toString(current.getRecord());
        Tokenizer recordTokens = tokenizerFactory.create(text);
        cache.incrementNumDocs(1);
        doWithTokens(recordTokens);
        if (notifyCaller) {
            callBack.onRecord(current);
        }
    }
}

Source File: AbstractTfidfVectorizer.java From DataVec with Apache License 2.0

5 votes

/**
 * Updates the cache with the tokens of a single document.
 * <p>
 * Tokens contained in {@code stopWords} are skipped. Every other token increments its
 * count in the cache on each occurrence, and its document count only the first time it
 * is seen during this call.
 *
 * @param tokenizer tokenizer positioned over the document's tokens
 */
@Override
public void doWithTokens(Tokenizer tokenizer) {
    Set<String> countedInDocument = new HashSet<>();
    while (tokenizer.hasMoreTokens()) {
        String word = tokenizer.nextToken();
        if (stopWords.contains(word)) {
            continue;
        }
        cache.incrementCount(word);
        // Set.add reports true only for a token not yet seen in this document.
        if (countedInDocument.add(word)) {
            cache.incrementDocCount(word);
        }
    }
}

Source File: VasttextTextVectorizer.java From scava with Eclipse Public License 2.0

5 votes

/**
 * Drains the tokenizer, dropping stop words, and feeds each remaining token to
 * {@code doWithNgram}.
 *
 * @param tokenizer tokenizer positioned over the text's tokens
 * @return the tokens that are not contained in {@code stopWords}, in tokenizer order
 */
private List<String> doWithTokensStopWords(Tokenizer tokenizer)
{
    List<String> kept = new ArrayList<String>();
    while (tokenizer.hasMoreTokens()) {
        String candidate = tokenizer.nextToken();
        if (stopWords.contains(candidate)) {
            continue;
        }
        kept.add(candidate);
        doWithNgram(candidate);
    }
    return kept;
}

Source File: VasttextTextVectorizer.java From scava with Eclipse Public License 2.0

5 votes

/**
 * Drains the tokenizer and feeds every token to {@code doWithNgram}.
 *
 * @param tokenizer tokenizer positioned over the text's tokens
 * @return all tokens the tokenizer yielded, in order
 */
private List<String> doWithTokens(Tokenizer tokenizer)
{
    List<String> collected = new ArrayList<String>();
    while (tokenizer.hasMoreTokens()) {
        String current = tokenizer.nextToken();
        collected.add(current);
        doWithNgram(current);
    }
    return collected;
}

Source File: VasttextTextVectorizer.java From scava with Eclipse Public License 2.0

5 votes

/**
 * Tokenizes the string form of a writable.
 *
 * @param writable the writable whose text is tokenized
 * @return the tokens produced by the factory's tokenizer, in order
 */
protected List<String> tokensFromRecord(Writable writable)
{
    Tokenizer source = tokenizerFactory.create(writable.toString());
    List<String> collected = new ArrayList<String>();
    while (source.hasMoreTokens()) {
        collected.add(source.nextToken());
    }
    return collected;
}

Source File: DefaultTokenizerFactory.java From deeplearning4j with Apache License 2.0

4 votes

/**
 * Creates a {@code DefaultTokenizer} over the given text and applies this factory's
 * token pre-processor to it.
 *
 * @param toTokenize the text to tokenize
 * @return the configured tokenizer
 */
@Override
public Tokenizer create(String toTokenize) {
    DefaultTokenizer stringTokenizer = new DefaultTokenizer(toTokenize);
    stringTokenizer.setTokenPreProcessor(tokenPreProcess);
    return stringTokenizer;
}

Source File: UimaTokenizerFactory.java From deeplearning4j with Apache License 2.0

4 votes

/**
 * Stream input is not supported by this factory.
 *
 * @param toTokenize ignored
 * @return never returns normally
 * @throws UnsupportedOperationException always
 */
@Override
public Tokenizer create(InputStream toTokenize) {
    throw new UnsupportedOperationException(
            "UimaTokenizerFactory cannot tokenize an InputStream; use create(String) instead");
}

Source File: PosUimaTokenizerFactory.java From deeplearning4j with Apache License 2.0

4 votes

/**
 * Stream input is not supported by this factory.
 *
 * @param toTokenize ignored
 * @return never returns normally
 * @throws UnsupportedOperationException always
 */
@Override
public Tokenizer create(InputStream toTokenize) {
    throw new UnsupportedOperationException(
            "PosUimaTokenizerFactory cannot tokenize an InputStream; use create(String) instead");
}

Source File: PosUimaTokenizerFactory.java From deeplearning4j with Apache License 2.0

4 votes

/**
 * Creates a {@code PosUimaTokenizer} over the given text, built with this factory's
 * {@code tokenizer} and {@code allowedPoSTags}, and applies the factory's token
 * pre-processor to it.
 *
 * @param toTokenize the text to tokenize
 * @return the configured tokenizer
 */
@Override
public Tokenizer create(String toTokenize) {
    PosUimaTokenizer posTokenizer = new PosUimaTokenizer(toTokenize, tokenizer, allowedPoSTags);
    posTokenizer.setTokenPreProcessor(tokenPreProcess);
    return posTokenizer;
}

Source File: DefaultTokenizerFactory.java From deeplearning4j with Apache License 2.0

4 votes

/**
 * Creates a {@code DefaultStreamTokenizer} over the given stream and applies this
 * factory's token pre-processor to it.
 *
 * @param toTokenize the stream supplying the text to tokenize
 * @return the configured tokenizer
 */
@Override
public Tokenizer create(InputStream toTokenize) {
    Tokenizer streamTokenizer = new DefaultStreamTokenizer(toTokenize);
    streamTokenizer.setTokenPreProcessor(tokenPreProcess);
    return streamTokenizer;
}

Source File: ContextLabelRetriever.java From deeplearning4j with Apache License 2.0

4 votes

/**
 * Returns a stripped sentence with the indices of words
 * with certain kinds of labels.
 * <p>
 * Tokens matching {@code BEGIN_LABEL} / {@code END_LABEL} delimit labelled spans and are
 * removed from the output. Tokens outside any completed span are grouped under a
 * "NONE" label (or "none" for the trailing group). The groups are joined with single
 * spaces, and for each group the map records the begin (inclusive) and end (exclusive)
 * character offsets of that group inside the stripped sentence.
 *
 * @param sentence the sentence to process
 * @param tokenizerFactory factory used to create the tokenizer for the sentence
 * @return a pair of a post processed sentence
 * with labels stripped and the spans of
 * the labels
 * @throws IllegalStateException if an end label is found with no preceding begin label
 */
public static Pair<String, MultiDimensionalMap<Integer, Integer, String>> stringWithLabels(String sentence,
                                                   TokenizerFactory tokenizerFactory) {
    MultiDimensionalMap<Integer, Integer, String> map = MultiDimensionalMap.newHashBackedMap();
    Tokenizer t = tokenizerFactory.create(sentence);
    List<String> currTokens = new ArrayList<>();
    String currLabel = null;
    String endLabel = null;
    List<Pair<String, List<String>>> tokensWithSameLabel = new ArrayList<>();
    while (t.hasMoreTokens()) {
        String token = t.nextToken();
        if (token.matches(BEGIN_LABEL)) {
            currLabel = token;

            // Tokens collected before this begin label carry no label: flush them as NONE.
            if (!currTokens.isEmpty()) {
                tokensWithSameLabel.add(new Pair<>("NONE", (List<String>) new ArrayList<>(currTokens)));
                currTokens.clear();
            }

        } else if (token.matches(END_LABEL)) {
            if (currLabel == null) {
                throw new IllegalStateException("Found an ending label with no matching begin label");
            }
            endLabel = token;
        } else {
            currTokens.add(token);
        }

        // A begin/end pair has been seen: close the labelled span.
        if (currLabel != null && endLabel != null) {
            currLabel = currLabel.replaceAll("[<>/]", "");
            endLabel = endLabel.replaceAll("[<>/]", "");
            Preconditions.checkState(!currLabel.isEmpty(), "Current label is empty!");
            Preconditions.checkState(!endLabel.isEmpty(), "End label is empty!");
            Preconditions.checkState(currLabel.equals(endLabel), "Current label begin and end did not match for the parse. Was: %s ending with %s",
                    currLabel, endLabel);

            tokensWithSameLabel.add(new Pair<>(currLabel, (List<String>) new ArrayList<>(currTokens)));
            currTokens.clear();

            // Reset the label state for the next span.
            currLabel = null;
            endLabel = null;
        }
    }

    // Trailing tokens after the last completed span. The lowercase "none" (unlike the
    // "NONE" used above) is kept as-is because callers may depend on it.
    if (!currTokens.isEmpty()) {
        tokensWithSameLabel.add(new Pair<>("none", (List<String>) new ArrayList<>(currTokens)));
        currTokens.clear();
    }

    // Join the groups and record where each one lands in the stripped sentence.
    StringBuilder strippedSentence = new StringBuilder();
    for (Pair<String, List<String>> tokensWithLabel : tokensWithSameLabel) {
        String joinedSentence = StringUtils.join(tokensWithLabel.getSecond(), " ");
        // Single space between separate parts of the sentence.
        if (strippedSentence.length() > 0) {
            strippedSentence.append(" ");
        }
        // Take the offset from the builder itself. Searching with indexOf would return the
        // FIRST occurrence of the text, giving the wrong span when a phrase repeats.
        int begin = strippedSentence.length();
        strippedSentence.append(joinedSentence);
        int end = strippedSentence.length();
        map.put(begin, end, tokensWithLabel.getFirst());
    }

    return new Pair<>(strippedSentence.toString(), map);
}

Source File: DefaultTokenizerFactory.java From DataVec with Apache License 2.0

4 votes

/**
 * Creates a {@code DefaultStreamTokenizer} over the given stream and applies this
 * factory's token pre-processor to it.
 *
 * @param toTokenize the stream supplying the text to tokenize
 * @return the configured tokenizer
 */
@Override
public Tokenizer create(InputStream toTokenize) {
    Tokenizer streamTokenizer = new DefaultStreamTokenizer(toTokenize);
    streamTokenizer.setTokenPreProcessor(tokenPreProcess);
    return streamTokenizer;
}

Source File: DefaultTokenizerFactory.java From DataVec with Apache License 2.0

4 votes

/**
 * Creates a {@code DefaultTokenizer} over the given text and applies this factory's
 * token pre-processor to it.
 *
 * @param toTokenize the text to tokenize
 * @return the configured tokenizer
 */
@Override
public Tokenizer create(String toTokenize) {
    DefaultTokenizer stringTokenizer = new DefaultTokenizer(toTokenize);
    stringTokenizer.setTokenPreProcessor(tokenPreProcess);
    return stringTokenizer;
}

Source File: UimaTokenizerFactory.java From DataVec with Apache License 2.0

4 votes

/**
 * Stream input is not supported by this factory.
 *
 * @param toTokenize ignored
 * @return never returns normally
 * @throws UnsupportedOperationException always
 */
@Override
public Tokenizer create(InputStream toTokenize) {
    throw new UnsupportedOperationException(
            "UimaTokenizerFactory cannot tokenize an InputStream; use create(String) instead");
}

org.datavec.nlp.tokenization.tokenizer.Tokenizer Java Examples