Java Code Examples for weka.filters.unsupervised.attribute.StringToWordVector#setWordsToKeep()
The following examples show how to use
weka.filters.unsupervised.attribute.StringToWordVector#setWordsToKeep().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: Trainer.java From sentiment-analysis with Apache License 2.0 | 6 votes |
/**
 * Returns the text-based representations.
 *
 * <p>Loads the dataset from {@code fileText}, sets the class index to 1, and converts it with a
 * {@link StringToWordVector} filter configured for lower-cased tokens, a minimum term frequency
 * of 1, no stoplist, no TF/IDF transforms, and an n-gram tokenizer whose minimum and maximum
 * sizes are both 2.
 *
 * @param fileText location of the dataset, handed to {@code DataSource}
 * @return the instances produced by applying the filter to the loaded dataset
 * @throws Exception propagated from loading the dataset or applying the filter
 */
private Instances getText(String fileText) throws Exception {
    DataSource ds = new DataSource(fileText);
    Instances data = ds.getDataSet();
    data.setClassIndex(1);

    NGramTokenizer tokenizer = new NGramTokenizer();
    tokenizer.setNGramMinSize(2);
    tokenizer.setNGramMaxSize(2);

    StringToWordVector filter = new StringToWordVector();
    filter.setLowerCaseTokens(true);
    filter.setMinTermFreq(1);
    filter.setUseStoplist(false);
    filter.setTFTransform(false);
    filter.setIDFTransform(false);
    // Very large limit; presumably intended so that no words are discarded.
    filter.setWordsToKeep(1000000000);
    filter.setTokenizer(tokenizer);

    // Weka expects setInputFormat to be the last call before the filter is applied,
    // i.e. after every option has been set.
    filter.setInputFormat(data);

    return weka.filters.Filter.useFilter(data, filter);
}
Example 2
Source File: Trainer.java From sentiment-analysis with Apache License 2.0 | 6 votes |
/**
 * Returns the feature-based representations.
 *
 * <p>Loads the dataset from {@code fileFeature}, sets the class index to 1, and converts it with
 * a {@link StringToWordVector} filter configured for lower-cased tokens, a minimum term
 * frequency of 1, no stoplist, no TF/IDF transforms, and an n-gram tokenizer whose minimum and
 * maximum sizes are both 1.
 *
 * @param fileFeature location of the dataset, handed to {@code DataSource}
 * @return the instances produced by applying the filter to the loaded dataset
 * @throws Exception propagated from loading the dataset or applying the filter
 */
private Instances getFeature(String fileFeature) throws Exception {
    DataSource ds = new DataSource(fileFeature);
    Instances data = ds.getDataSet();
    data.setClassIndex(1);

    NGramTokenizer tokenizer = new NGramTokenizer();
    tokenizer.setNGramMinSize(1);
    tokenizer.setNGramMaxSize(1);

    StringToWordVector filter = new StringToWordVector();
    filter.setLowerCaseTokens(true);
    filter.setMinTermFreq(1);
    filter.setUseStoplist(false);
    filter.setTFTransform(false);
    filter.setIDFTransform(false);
    // Very large limit; presumably intended so that no words are discarded.
    filter.setWordsToKeep(1000000000);
    filter.setTokenizer(tokenizer);

    // Weka expects setInputFormat to be the last call before the filter is applied,
    // i.e. after every option has been set.
    filter.setInputFormat(data);

    return weka.filters.Filter.useFilter(data, filter);
}
Example 3
Source File: Trainer.java From sentiment-analysis with Apache License 2.0 | 6 votes |
/**
 * Returns the combined (text+POS) representations.
 *
 * <p>Loads the dataset from {@code fileComplex}, sets the class index to 1, and converts it with
 * a {@link StringToWordVector} filter configured for lower-cased tokens, a minimum term
 * frequency of 1, no stoplist, no TF/IDF transforms, and an n-gram tokenizer whose minimum and
 * maximum sizes are both 2.
 *
 * @param fileComplex location of the dataset, handed to {@code DataSource}
 * @return the instances produced by applying the filter to the loaded dataset
 * @throws Exception propagated from loading the dataset or applying the filter
 */
private Instances getComplex(String fileComplex) throws Exception {
    DataSource ds = new DataSource(fileComplex);
    Instances data = ds.getDataSet();
    data.setClassIndex(1);

    NGramTokenizer tokenizer = new NGramTokenizer();
    tokenizer.setNGramMinSize(2);
    tokenizer.setNGramMaxSize(2);

    StringToWordVector filter = new StringToWordVector();
    filter.setLowerCaseTokens(true);
    filter.setMinTermFreq(1);
    filter.setUseStoplist(false);
    filter.setTFTransform(false);
    filter.setIDFTransform(false);
    // Very large limit; presumably intended so that no words are discarded.
    filter.setWordsToKeep(1000000000);
    filter.setTokenizer(tokenizer);

    // Weka expects setInputFormat to be the last call before the filter is applied,
    // i.e. after every option has been set.
    filter.setInputFormat(data);

    return weka.filters.Filter.useFilter(data, filter);
}
Example 4
Source File: SentimentAnalyser.java From sentiment-analysis with Apache License 2.0 | 5 votes |
/**
 * Creates and configures the {@code stwv} StringToWordVector filter.
 *
 * <p>The filter lower-cases tokens, uses a minimum term frequency of 1, disables the stoplist
 * and the TF/IDF transforms, tokenizes with n-grams whose minimum and maximum sizes are both 2,
 * and is restricted to the attribute range {@code "first"}.
 */
private void initializeFilter() {
    // Tokenizer with identical lower and upper n-gram bounds.
    NGramTokenizer bigrams = new NGramTokenizer();
    bigrams.setNGramMinSize(2);
    bigrams.setNGramMaxSize(2);

    stwv = new StringToWordVector();
    stwv.setAttributeIndices("first");
    stwv.setTokenizer(bigrams);

    // Token normalisation and term weighting.
    stwv.setLowerCaseTokens(true);
    stwv.setUseStoplist(false);
    stwv.setTFTransform(false);
    stwv.setIDFTransform(false);

    // Vocabulary limits.
    stwv.setMinTermFreq(1);
    stwv.setWordsToKeep(1000000000);
}
Example 5
Source File: PolarityClassifier.java From sentiment-analysis with Apache License 2.0 | 5 votes |
/**
 * Initializes the {@code stwv} StringToWordVector filter to be used in the representations.
 *
 * <p>The filter lower-cases tokens, uses a minimum term frequency of 1, disables the stoplist
 * and the TF/IDF transforms, and tokenizes with n-grams whose minimum and maximum sizes are
 * both 2.
 */
private void initialiseTextFilter() {
    // Tokenizer with identical lower and upper n-gram bounds.
    NGramTokenizer bigrams = new NGramTokenizer();
    bigrams.setNGramMinSize(2);
    bigrams.setNGramMaxSize(2);

    stwv = new StringToWordVector();
    stwv.setTokenizer(bigrams);

    // Token normalisation and term weighting.
    stwv.setLowerCaseTokens(true);
    stwv.setUseStoplist(false);
    stwv.setTFTransform(false);
    stwv.setIDFTransform(false);

    // Vocabulary limits.
    stwv.setMinTermFreq(1);
    stwv.setWordsToKeep(1000000000);
}