Java Code Examples for org.datavec.nlp.tokenization.tokenizer.Tokenizer#nextToken()
The following examples show how to use
org.datavec.nlp.tokenization.tokenizer.Tokenizer#nextToken() .
You can vote up the examples you like or vote down the ones you don't like, and follow the links above each example to go to the original project or source file. You may also check out the related API usage on the sidebar.
Example 1
Source File: VasttextTextVectorizer.java From scava with Eclipse Public License 2.0 | 5 votes |
/**
 * Drains the tokenizer, collecting every token in order while also routing
 * each one through the n-gram handler.
 *
 * @param tokenizer source of tokens; consumed fully by this call
 * @return all tokens produced by the tokenizer, in encounter order
 */
private List<String> doWithTokens(Tokenizer tokenizer) {
    List<String> collected = new ArrayList<>();
    while (tokenizer.hasMoreTokens()) {
        String current = tokenizer.nextToken();
        collected.add(current);
        doWithNgram(current);
    }
    return collected;
}
Example 2
Source File: VasttextTextVectorizer.java From scava with Eclipse Public License 2.0 | 5 votes |
/**
 * Drains the tokenizer, keeping only tokens that are not in the stop-word
 * set; each surviving token is also routed through the n-gram handler.
 *
 * @param tokenizer source of tokens; consumed fully by this call
 * @return the non-stop-word tokens, in encounter order
 */
private List<String> doWithTokensStopWords(Tokenizer tokenizer) {
    List<String> kept = new ArrayList<>();
    while (tokenizer.hasMoreTokens()) {
        String candidate = tokenizer.nextToken();
        if (stopWords.contains(candidate)) {
            continue; // skip stop words entirely: not collected, no n-gram update
        }
        kept.add(candidate);
        doWithNgram(candidate);
    }
    return kept;
}
Example 3
Source File: AbstractTfidfVectorizer.java From DataVec with Apache License 2.0 | 5 votes |
/**
 * Updates the vocab cache term counts from one document's tokens, skipping
 * stop words. The document-frequency counter is bumped at most once per
 * distinct token, tracked via the per-call {@code seenInDoc} set.
 *
 * @param tokenizer tokens of a single document; consumed fully by this call
 */
@Override
public void doWithTokens(Tokenizer tokenizer) {
    Set<String> seenInDoc = new HashSet<>();
    while (tokenizer.hasMoreTokens()) {
        String tok = tokenizer.nextToken();
        if (stopWords.contains(tok)) {
            continue;
        }
        cache.incrementCount(tok);
        // Set.add returns true only on first insertion, i.e. first occurrence
        // of this token in the document — equivalent to the contains/add pair.
        if (seenInDoc.add(tok)) {
            cache.incrementDocCount(tok);
        }
    }
}
Example 4
Source File: AbstractTfidfVectorizer.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * Folds one document's tokens into the vocab cache: every non-stop-word
 * occurrence increments the term count, and the first occurrence of each
 * distinct token additionally increments the document count.
 *
 * @param tokenizer tokens of a single document; consumed fully by this call
 */
@Override
public void doWithTokens(Tokenizer tokenizer) {
    Set<String> countedThisDoc = new HashSet<>();
    while (tokenizer.hasMoreTokens()) {
        String word = tokenizer.nextToken();
        boolean isStopWord = stopWords.contains(word);
        if (!isStopWord) {
            cache.incrementCount(word);
            // add(...) is true exactly when this is the token's first
            // appearance in the document.
            boolean firstOccurrence = countedThisDoc.add(word);
            if (firstOccurrence) {
                cache.incrementDocCount(word);
            }
        }
    }
}
Example 5
Source File: ContextLabelRetriever.java From deeplearning4j with Apache License 2.0 | 4 votes |
/**
 * Returns a stripped sentence with the indices of words
 * with certain kinds of labels.
 *
 * @param sentence the sentence to process
 * @return a pair of a post processed sentence
 * with labels stripped and the spans of
 * the labels
 */
public static Pair<String, MultiDimensionalMap<Integer, Integer, String>> stringWithLabels(String sentence,
                TokenizerFactory tokenizerFactory) {
    // (begin, end) character span in the stripped sentence -> label name for that span.
    MultiDimensionalMap<Integer, Integer, String> map = MultiDimensionalMap.newHashBackedMap();
    Tokenizer t = tokenizerFactory.create(sentence);
    // Tokens accumulated since the last flushed segment (labeled or unlabeled).
    List<String> currTokens = new ArrayList<>();
    String currLabel = null;
    String endLabel = null;
    // Each entry: (label name — "NONE"/"none" for unlabeled runs, token list for that segment).
    List<Pair<String, List<String>>> tokensWithSameLabel = new ArrayList<>();
    while (t.hasMoreTokens()) {
        String token = t.nextToken();
        if (token.matches(BEGIN_LABEL)) {
            currLabel = token;

            //no labels; add these as NONE and begin the new label
            if (!currTokens.isEmpty()) {
                tokensWithSameLabel.add(new Pair<>("NONE", (List<String>) new ArrayList<>(currTokens)));
                currTokens.clear();
            }
        } else if (token.matches(END_LABEL)) {
            if (currLabel == null)
                throw new IllegalStateException("Found an ending label with no matching begin label");
            endLabel = token;
        } else
            currTokens.add(token);

        // A complete begin/end label pair has been seen: flush the buffered
        // tokens as one labeled segment and reset the label state.
        if (currLabel != null && endLabel != null) {
            // Strip the markup characters (< > /) so only the bare label name remains.
            currLabel = currLabel.replaceAll("[<>/]", "");
            endLabel = endLabel.replaceAll("[<>/]", "");
            Preconditions.checkState(!currLabel.isEmpty(), "Current label is empty!");
            Preconditions.checkState(!endLabel.isEmpty(), "End label is empty!");
            Preconditions.checkState(currLabel.equals(endLabel),
                            "Current label begin and end did not match for the parse. Was: %s ending with %s",
                            currLabel, endLabel);
            tokensWithSameLabel.add(new Pair<>(currLabel, (List<String>) new ArrayList<>(currTokens)));
            currTokens.clear(); //clear out the tokens
            currLabel = null;
            endLabel = null;
        }
    }

    //no labels; add these as NONE and begin the new label
    if (!currTokens.isEmpty()) {
        tokensWithSameLabel.add(new Pair<>("none", (List<String>) new ArrayList<>(currTokens)));
        currTokens.clear();
    }

    //now join the output
    StringBuilder strippedSentence = new StringBuilder();
    for (Pair<String, List<String>> tokensWithLabel : tokensWithSameLabel) {
        String joinedSentence = StringUtils.join(tokensWithLabel.getSecond(), " ");
        //spaces between separate parts of the sentence
        if (!(strippedSentence.length() < 1))
            strippedSentence.append(" ");
        strippedSentence.append(joinedSentence);
        // NOTE(review): indexOf locates the FIRST occurrence of the joined segment;
        // if an earlier part of the sentence contains identical text, the recorded
        // span may point at that earlier occurrence rather than this segment.
        int begin = strippedSentence.toString().indexOf(joinedSentence);
        int end = begin + joinedSentence.length();
        map.put(begin, end, tokensWithLabel.getFirst());
    }
    return new Pair<>(strippedSentence.toString(), map);
}