org.datavec.nlp.tokenization.tokenizer.Tokenizer Java Examples

The following examples show how to use org.datavec.nlp.tokenization.tokenizer.Tokenizer. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: VasttextTextVectorizer.java    From scava with Eclipse Public License 2.0 6 votes vote down vote up
@Override
public void fit(RecordReader reader, RecordCallBack callBack) {
	// Builds the vocabulary / ngram caches by tokenizing every record from the reader.
	// Each record counts as one document for the cache's document statistics.
	while (reader.hasNext()) {
		Record record = reader.nextRecord();
		String text = record.getRecord().get(0).toString();
		Tokenizer tokenizer = tokenizerFactory.create(text);
		cache.incrementNumDocs(1);
		// The stop-word path may keep fewer tokens than the tokenizer produced.
		// (Original pre-allocated a throwaway ArrayList that was always overwritten; removed.)
		List<String> tokens = (stopWords == null)
				? doWithTokens(tokenizer)
				: doWithTokensStopWords(tokenizer);
		// Ngram generation is only needed when ngrams beyond unigrams were requested.
		if (maxNgrams > 1)
			doWithNgram(ngramsGenerator(tokens));
		if (callBack != null)
			callBack.onRecord(record);
	}
}
 
Example #2
Source File: VasttextTextVectorizer.java    From scava with Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Parses the string form of the given writable as a tokenized list of
 * integer feature weights.
 *
 * @param writable record field whose text holds the weights
 * @return the parsed integer weights, in token order
 */
private List<Integer> featuresWeightsFromRecord(Writable writable)
{
	String text = writable.toString();
	Tokenizer tok = tokenizerFactory.create(text);
	List<Integer> weights = new ArrayList<Integer>();
	while (tok.hasMoreTokens()) {
		weights.add(Integer.valueOf(tok.nextToken()));
	}
	return weights;
}
 
Example #3
Source File: Windows.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Constructs a list of window of size windowSize.
 * Note that padding for each window is created as well.
 * @param words the words to tokenize and construct windows from
 * @param windowSize the window size to generate
 * @return the list of windows for the tokenized string
 */
public static List<Window> windows(InputStream words, int windowSize) {
    // Drain the stream tokenizer into a token list, then delegate windowing.
    Tokenizer tok = new DefaultStreamTokenizer(words);
    List<String> tokens = new ArrayList<>();
    while (tok.hasMoreTokens()) {
        tokens.add(tok.nextToken());
    }
    return windows(tokens, windowSize);
}
 
Example #4
Source File: Windows.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Constructs a list of window of size windowSize.
 * Note that padding for each window is created as well.
 * @param words the words to tokenize and construct windows from
 * @param tokenizerFactory tokenizer factory to use
 * @param windowSize the window size to generate
 * @return the list of windows for the tokenized string
 */
public static List<Window> windows(InputStream words, TokenizerFactory tokenizerFactory, int windowSize) {
    List<String> tokens = new ArrayList<>();
    Tokenizer tok = tokenizerFactory.create(words);
    while (tok.hasMoreTokens()) {
        tokens.add(tok.nextToken());
    }

    // An empty token list means the stream produced nothing usable.
    if (tokens.isEmpty()) {
        throw new IllegalStateException("No tokens found for windows");
    }

    return windows(tokens, windowSize);
}
 
Example #5
Source File: TextVectorizer.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Counts how often each token occurs in the given record.
 *
 * @param record the record's writable fields, joined into a single string
 * @return a counter mapping each token to its occurrence count
 */
protected Counter<String> wordFrequenciesForRecord(Collection<Writable> record) {
    Counter<String> frequencies = new Counter<>();
    Tokenizer tok = tokenizerFactory.create(toString(record));
    while (tok.hasMoreTokens()) {
        frequencies.incrementCount(tok.nextToken(), 1.0);
    }
    return frequencies;
}
 
Example #6
Source File: TextVectorizer.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Override
public void fit(RecordReader reader, RecordCallBack callBack) {
    // Tokenize each record, hand the tokens to the subclass hook,
    // notify the optional callback, and bump the document count.
    while (reader.hasNext()) {
        Record record = reader.nextRecord();
        Tokenizer tok = tokenizerFactory.create(toString(record.getRecord()));
        doWithTokens(tok);
        if (callBack != null) {
            callBack.onRecord(record);
        }
        cache.incrementNumDocs(1);
    }
}
 
Example #7
Source File: AbstractTfidfVectorizer.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Override
public void doWithTokens(Tokenizer tokenizer) {
    // Tracks tokens already observed in this document so the document-frequency
    // counter is incremented at most once per distinct token.
    Set<String> seen = new HashSet<>();
    while (tokenizer.hasMoreTokens()) {
        String token = tokenizer.nextToken();
        if (stopWords.contains(token)) {
            continue;
        }
        cache.incrementCount(token);
        // Set.add returns true only on first insertion, i.e. the first
        // occurrence of this token in the current document.
        if (seen.add(token)) {
            cache.incrementDocCount(token);
        }
    }
}
 
Example #8
Source File: Windows.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Constructs a list of window of size windowSize.
 * Note that padding for each window is created as well.
 * @param words the words to tokenize and construct windows from
 * @param tokenizerFactory tokenizer factory to use
 * @param windowSize the window size to generate
 * @return the list of windows for the tokenized string
 */
public static List<Window> windows(String words, TokenizerFactory tokenizerFactory, int windowSize) {
    List<String> tokens = new ArrayList<>();
    Tokenizer tok = tokenizerFactory.create(words);
    while (tok.hasMoreTokens()) {
        tokens.add(tok.nextToken());
    }

    // Fail loudly when tokenization yields nothing to window over.
    if (tokens.isEmpty()) {
        throw new IllegalStateException("No tokens found for windows");
    }

    return windows(tokens, windowSize);
}
 
Example #9
Source File: Windows.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Constructs a list of window of size windowSize.
 * Note that padding for each window is created as well.
 * @param words the words to tokenize and construct windows from
 * @param tokenizerFactory tokenizer factory to use
 * @return the list of windows for the tokenized string
 */
public static List<Window> windows(String words, TokenizerFactory tokenizerFactory) {
    List<String> tokens = new ArrayList<>();
    Tokenizer tok = tokenizerFactory.create(words);
    while (tok.hasMoreTokens()) {
        tokens.add(tok.nextToken());
    }
    // Default window size of 5 when the caller does not specify one.
    return windows(tokens, 5);
}
 
Example #10
Source File: UimaTokenizerFactory.java    From DataVec with Apache License 2.0 5 votes vote down vote up
@Override
public Tokenizer create(String toTokenize) {
    if (toTokenize == null || toTokenize.isEmpty())
        throw new IllegalArgumentException("Unable to proceed; on sentence to tokenize");
    Tokenizer ret = new UimaTokenizer(toTokenize, uimaResource, checkForLabel);
    ret.setTokenPreProcessor(preProcess);
    return ret;
}
 
Example #11
Source File: UimaTokenizerFactory.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Override
public Tokenizer create(String toTokenize) {
    if (toTokenize == null || toTokenize.isEmpty())
        throw new IllegalArgumentException("Unable to proceed; on sentence to tokenize");
    Tokenizer ret = new UimaTokenizer(toTokenize, uimaResource, checkForLabel);
    ret.setTokenPreProcessor(preProcess);
    return ret;
}
 
Example #12
Source File: Windows.java    From DataVec with Apache License 2.0 5 votes vote down vote up
/**
 * Constructs a list of window of size windowSize.
 * Note that padding for each window is created as well.
 * @param words the words to tokenize and construct windows from
 * @param tokenizerFactory tokenizer factory to use
 * @return the list of windows for the tokenized string
 */
public static List<Window> windows(String words, TokenizerFactory tokenizerFactory) {
    List<String> tokens = new ArrayList<>();
    Tokenizer tok = tokenizerFactory.create(words);
    while (tok.hasMoreTokens()) {
        tokens.add(tok.nextToken());
    }
    // Default window size of 5 when none is supplied by the caller.
    return windows(tokens, 5);
}
 
Example #13
Source File: Windows.java    From DataVec with Apache License 2.0 5 votes vote down vote up
/**
 * Constructs a list of window of size windowSize.
 * Note that padding for each window is created as well.
 * @param words the words to tokenize and construct windows from
 * @param tokenizerFactory tokenizer factory to use
 * @param windowSize the window size to generate
 * @return the list of windows for the tokenized string
 */
public static List<Window> windows(String words, TokenizerFactory tokenizerFactory, int windowSize) {
    List<String> tokens = new ArrayList<>();
    Tokenizer tok = tokenizerFactory.create(words);
    while (tok.hasMoreTokens()) {
        tokens.add(tok.nextToken());
    }

    // Fail loudly when tokenization yields nothing to window over.
    if (tokens.isEmpty()) {
        throw new IllegalStateException("No tokens found for windows");
    }

    return windows(tokens, windowSize);
}
 
Example #14
Source File: Windows.java    From DataVec with Apache License 2.0 5 votes vote down vote up
/**
 * Constructs a list of window of size windowSize.
 * Note that padding for each window is created as well.
 * @param words the words to tokenize and construct windows from
 * @param tokenizerFactory tokenizer factory to use
 * @param windowSize the window size to generate
 * @return the list of windows for the tokenized string
 */
public static List<Window> windows(InputStream words, TokenizerFactory tokenizerFactory, int windowSize) {
    List<String> tokens = new ArrayList<>();
    Tokenizer tok = tokenizerFactory.create(words);
    while (tok.hasMoreTokens()) {
        tokens.add(tok.nextToken());
    }

    // An empty token list means the stream produced nothing usable.
    if (tokens.isEmpty()) {
        throw new IllegalStateException("No tokens found for windows");
    }

    return windows(tokens, windowSize);
}
 
Example #15
Source File: Windows.java    From DataVec with Apache License 2.0 5 votes vote down vote up
/**
 * Constructs a list of window of size windowSize.
 * Note that padding for each window is created as well.
 * @param words the words to tokenize and construct windows from
 * @param windowSize the window size to generate
 * @return the list of windows for the tokenized string
 */
public static List<Window> windows(InputStream words, int windowSize) {
    // Drain the stream tokenizer into a token list, then delegate windowing.
    Tokenizer tok = new DefaultStreamTokenizer(words);
    List<String> tokens = new ArrayList<>();
    while (tok.hasMoreTokens()) {
        tokens.add(tok.nextToken());
    }
    return windows(tokens, windowSize);
}
 
Example #16
Source File: TextVectorizer.java    From DataVec with Apache License 2.0 5 votes vote down vote up
protected Counter<String> wordFrequenciesForRecord(Collection<Writable> record) {
    String s = toString(record);
    Tokenizer tokenizer = tokenizerFactory.create(s);
    Counter<String> ret = new Counter<>();
    while (tokenizer.hasMoreTokens())
        ret.incrementCount(tokenizer.nextToken(), 1.0);
    return ret;
}
 
Example #17
Source File: TextVectorizer.java    From DataVec with Apache License 2.0 5 votes vote down vote up
@Override
public void fit(RecordReader reader, RecordCallBack callBack) {
    // Tokenize each record, bump the document count, hand the tokens to the
    // subclass hook, and notify the optional callback.
    while (reader.hasNext()) {
        Record record = reader.nextRecord();
        Tokenizer tok = tokenizerFactory.create(toString(record.getRecord()));
        cache.incrementNumDocs(1);
        doWithTokens(tok);
        if (callBack != null) {
            callBack.onRecord(record);
        }
    }
}
 
Example #18
Source File: AbstractTfidfVectorizer.java    From DataVec with Apache License 2.0 5 votes vote down vote up
@Override
public void doWithTokens(Tokenizer tokenizer) {
    // Tracks tokens already observed in this document so the document-frequency
    // counter is incremented at most once per distinct token.
    Set<String> seen = new HashSet<>();
    while (tokenizer.hasMoreTokens()) {
        String token = tokenizer.nextToken();
        if (stopWords.contains(token)) {
            continue;
        }
        cache.incrementCount(token);
        // Set.add returns true only on first insertion, i.e. the first
        // occurrence of this token in the current document.
        if (seen.add(token)) {
            cache.incrementDocCount(token);
        }
    }
}
 
Example #19
Source File: VasttextTextVectorizer.java    From scava with Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Collects the tokenizer's tokens, skipping configured stop words, and feeds
 * every kept token into the ngram pipeline.
 *
 * @param tokenizer source of tokens for the current document
 * @return the kept (non-stop-word) tokens, in order
 */
private List<String> doWithTokensStopWords(Tokenizer tokenizer)
{
	List<String> kept = new ArrayList<String>();
	while (tokenizer.hasMoreTokens()) {
		String token = tokenizer.nextToken();
		if (stopWords.contains(token)) {
			continue;
		}
		kept.add(token);
		doWithNgram(token);
	}
	return kept;
}
 
Example #20
Source File: VasttextTextVectorizer.java    From scava with Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Collects all of the tokenizer's tokens and feeds each one into the
 * ngram pipeline.
 *
 * @param tokenizer source of tokens for the current document
 * @return all tokens, in order
 */
private List<String> doWithTokens(Tokenizer tokenizer)
{
	List<String> collected = new ArrayList<String>();
	while (tokenizer.hasMoreTokens()) {
		String token = tokenizer.nextToken();
		collected.add(token);
		doWithNgram(token);
	}
	return collected;
}
 
Example #21
Source File: VasttextTextVectorizer.java    From scava with Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Tokenizes the string form of the given writable into an ordered token list.
 *
 * @param writable the record field to tokenize
 * @return the tokens of the field's text, in order
 */
protected List<String> tokensFromRecord(Writable writable)
{
	Tokenizer tok = tokenizerFactory.create(writable.toString());
	List<String> tokens = new ArrayList<String>();
	while (tok.hasMoreTokens()) {
		tokens.add(tok.nextToken());
	}
	return tokens;
}
 
Example #22
Source File: DefaultTokenizerFactory.java    From deeplearning4j with Apache License 2.0 4 votes vote down vote up
@Override
public Tokenizer create(String toTokenize) {
    // Wrap the sentence in a DefaultTokenizer and attach the configured preprocessor.
    DefaultTokenizer tokenizer = new DefaultTokenizer(toTokenize);
    tokenizer.setTokenPreProcessor(tokenPreProcess);
    return tokenizer;
}
 
Example #23
Source File: UimaTokenizerFactory.java    From deeplearning4j with Apache License 2.0 4 votes vote down vote up
@Override
public Tokenizer create(InputStream toTokenize) {
    // Stream-based tokenization is not implemented for this factory.
    // A descriptive message replaces the original bare exception so callers
    // are pointed at the supported overload.
    throw new UnsupportedOperationException(
            "InputStream tokenization is not supported; use create(String) instead");
}
 
Example #24
Source File: PosUimaTokenizerFactory.java    From deeplearning4j with Apache License 2.0 4 votes vote down vote up
@Override
public Tokenizer create(InputStream toTokenize) {
    // Stream-based tokenization is not implemented for this factory.
    // A descriptive message replaces the original bare exception so callers
    // are pointed at the supported overload.
    throw new UnsupportedOperationException(
            "InputStream tokenization is not supported; use create(String) instead");
}
 
Example #25
Source File: PosUimaTokenizerFactory.java    From deeplearning4j with Apache License 2.0 4 votes vote down vote up
@Override
public Tokenizer create(String toTokenize) {
    // Build a PoS-filtering UIMA tokenizer and attach the configured preprocessor.
    PosUimaTokenizer posTokenizer = new PosUimaTokenizer(toTokenize, tokenizer, allowedPoSTags);
    posTokenizer.setTokenPreProcessor(tokenPreProcess);
    return posTokenizer;
}
 
Example #26
Source File: DefaultTokenizerFactory.java    From deeplearning4j with Apache License 2.0 4 votes vote down vote up
@Override
public Tokenizer create(InputStream toTokenize) {
    // Stream input goes through the stream tokenizer, with the same preprocessor attached.
    Tokenizer streamTokenizer = new DefaultStreamTokenizer(toTokenize);
    streamTokenizer.setTokenPreProcessor(tokenPreProcess);
    return streamTokenizer;
}
 
Example #27
Source File: ContextLabelRetriever.java    From deeplearning4j with Apache License 2.0 4 votes vote down vote up
/**
 * Returns a stripped sentence with the indices of words
 * with certain kinds of labels.
 * <p>
 * Tokens between a begin marker (matching {@code BEGIN_LABEL}) and its end
 * marker (matching {@code END_LABEL}) are grouped under that label; tokens
 * outside any marker pair are grouped under "NONE" (or "none" for a trailing
 * run). The markers are removed from the output, and the returned map records
 * each group's [begin, end) character span within the stripped sentence.
 *
 * @param sentence the sentence to process
 * @return a pair of a post processed sentence
 * with labels stripped and the spans of
 * the labels
 */
public static Pair<String, MultiDimensionalMap<Integer, Integer, String>> stringWithLabels(String sentence,
                                                   TokenizerFactory tokenizerFactory) {
    MultiDimensionalMap<Integer, Integer, String> map = MultiDimensionalMap.newHashBackedMap();
    Tokenizer t = tokenizerFactory.create(sentence);
    // currTokens buffers the tokens of the run being built; currLabel/endLabel
    // hold the open begin/end markers until both are seen.
    List<String> currTokens = new ArrayList<>();
    String currLabel = null;
    String endLabel = null;
    // Each entry pairs a label with the run of tokens it covers, in sentence order.
    List<Pair<String, List<String>>> tokensWithSameLabel = new ArrayList<>();
    while (t.hasMoreTokens()) {
        String token = t.nextToken();
        if (token.matches(BEGIN_LABEL)) {
            currLabel = token;

            //no labels; add these as NONE and begin the new label
            if (!currTokens.isEmpty()) {
                tokensWithSameLabel.add(new Pair<>("NONE", (List<String>) new ArrayList<>(currTokens)));
                currTokens.clear();

            }

        } else if (token.matches(END_LABEL)) {
            if (currLabel == null)
                throw new IllegalStateException("Found an ending label with no matching begin label");
            endLabel = token;
        } else
            currTokens.add(token);

        // A begin/end marker pair has closed: validate it and flush the buffered run.
        if (currLabel != null && endLabel != null) {
            // Strip markup characters to get the bare label name (e.g. "<x>" -> "x").
            currLabel = currLabel.replaceAll("[<>/]", "");
            endLabel = endLabel.replaceAll("[<>/]", "");
            Preconditions.checkState(!currLabel.isEmpty(), "Current label is empty!");
            Preconditions.checkState(!endLabel.isEmpty(), "End label is empty!");
            Preconditions.checkState(currLabel.equals(endLabel), "Current label begin and end did not match for the parse. Was: %s ending with %s",
                    currLabel, endLabel);

            tokensWithSameLabel.add(new Pair<>(currLabel, (List<String>) new ArrayList<>(currTokens)));
            currTokens.clear();


            //clear out the tokens
            currLabel = null;
            endLabel = null;
        }


    }

    //no labels; add these as NONE and begin the new label
    // NOTE(review): a trailing unlabeled run is tagged lowercase "none" while
    // interior ones get "NONE" — presumably intentional; confirm with callers.
    if (!currTokens.isEmpty()) {
        tokensWithSameLabel.add(new Pair<>("none", (List<String>) new ArrayList<>(currTokens)));
        currTokens.clear();

    }

    //now join the output
    StringBuilder strippedSentence = new StringBuilder();
    for (Pair<String, List<String>> tokensWithLabel : tokensWithSameLabel) {
        String joinedSentence = StringUtils.join(tokensWithLabel.getSecond(), " ");
        //spaces between separate parts of the sentence
        if (!(strippedSentence.length() < 1))
            strippedSentence.append(" ");
        strippedSentence.append(joinedSentence);
        // NOTE(review): indexOf finds the FIRST occurrence, so the recorded span
        // may point at an earlier duplicate of this run's text — verify.
        int begin = strippedSentence.toString().indexOf(joinedSentence);
        int end = begin + joinedSentence.length();
        map.put(begin, end, tokensWithLabel.getFirst());
    }


    return new Pair<>(strippedSentence.toString(), map);
}
 
Example #28
Source File: DefaultTokenizerFactory.java    From DataVec with Apache License 2.0 4 votes vote down vote up
@Override
public Tokenizer create(InputStream toTokenize) {
    // Stream input goes through the stream tokenizer, with the same preprocessor attached.
    Tokenizer streamTokenizer = new DefaultStreamTokenizer(toTokenize);
    streamTokenizer.setTokenPreProcessor(tokenPreProcess);
    return streamTokenizer;
}
 
Example #29
Source File: DefaultTokenizerFactory.java    From DataVec with Apache License 2.0 4 votes vote down vote up
@Override
public Tokenizer create(String toTokenize) {
    // Wrap the sentence in a DefaultTokenizer and attach the configured preprocessor.
    DefaultTokenizer tokenizer = new DefaultTokenizer(toTokenize);
    tokenizer.setTokenPreProcessor(tokenPreProcess);
    return tokenizer;
}
 
Example #30
Source File: UimaTokenizerFactory.java    From DataVec with Apache License 2.0 4 votes vote down vote up
@Override
public Tokenizer create(InputStream toTokenize) {
    // Stream-based tokenization is not implemented for this factory.
    // A descriptive message replaces the original bare exception so callers
    // are pointed at the supported overload.
    throw new UnsupportedOperationException(
            "InputStream tokenization is not supported; use create(String) instead");
}