Java Code Examples for weka.filters.unsupervised.attribute.StringToWordVector#setWordsToKeep()

The following examples show how to use weka.filters.unsupervised.attribute.StringToWordVector#setWordsToKeep(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: Trainer.java    From sentiment-analysis with Apache License 2.0 6 votes vote down vote up
/**
 * Returns the text-based representation of a dataset.
 *
 * <p>Loads the dataset from the given location, marks the attribute at index 1
 * as the class, and converts string attributes into word-vector attributes using
 * a {@code StringToWordVector} filter tokenized into 2-grams only.
 *
 * @param fileText location of the dataset, passed as-is to {@code DataSource}
 * @return the dataset after the word-vector filter has been applied
 * @throws Exception if the dataset cannot be loaded or the filter fails
 */
private Instances getText(String fileText) throws Exception{
	DataSource ds = new DataSource(fileText);
	Instances data = ds.getDataSet();
	data.setClassIndex(1);

	// Fixed-size n-grams: min == max == 2.
	NGramTokenizer tokenizer = new NGramTokenizer();
	tokenizer.setNGramMinSize(2);
	tokenizer.setNGramMaxSize(2);

	StringToWordVector filter = new StringToWordVector();
	filter.setLowerCaseTokens(true);
	filter.setMinTermFreq(1);
	filter.setUseStoplist(false);
	filter.setTFTransform(false);
	filter.setIDFTransform(false);
	// Very large limit; presumably intended to keep the whole vocabulary.
	filter.setWordsToKeep(1000000000);
	filter.setTokenizer(tokenizer);

	// Weka expects setInputFormat to be the last call before the filter is
	// applied, so that every option set above is taken into account.
	filter.setInputFormat(data);

	return weka.filters.Filter.useFilter(data, filter);
}
 
Example 2
Source File: Trainer.java    From sentiment-analysis with Apache License 2.0 6 votes vote down vote up
/**
 * Returns the feature-based representation of a dataset.
 *
 * <p>Loads the dataset from the given location, marks the attribute at index 1
 * as the class, and converts string attributes into word-vector attributes using
 * a {@code StringToWordVector} filter tokenized into 1-grams only.
 *
 * @param fileFeature location of the dataset, passed as-is to {@code DataSource}
 * @return the dataset after the word-vector filter has been applied
 * @throws Exception if the dataset cannot be loaded or the filter fails
 */
private Instances getFeature(String fileFeature) throws Exception{
	DataSource ds = new DataSource(fileFeature);
	Instances data = ds.getDataSet();
	data.setClassIndex(1);

	// Fixed-size n-grams: min == max == 1, i.e. single tokens.
	NGramTokenizer tokenizer = new NGramTokenizer();
	tokenizer.setNGramMinSize(1);
	tokenizer.setNGramMaxSize(1);

	StringToWordVector filter = new StringToWordVector();
	filter.setLowerCaseTokens(true);
	filter.setMinTermFreq(1);
	filter.setUseStoplist(false);
	filter.setTFTransform(false);
	filter.setIDFTransform(false);
	// Very large limit; presumably intended to keep the whole vocabulary.
	filter.setWordsToKeep(1000000000);
	filter.setTokenizer(tokenizer);

	// Weka expects setInputFormat to be the last call before the filter is
	// applied, so that every option set above is taken into account.
	filter.setInputFormat(data);

	return weka.filters.Filter.useFilter(data, filter);
}
 
Example 3
Source File: Trainer.java    From sentiment-analysis with Apache License 2.0 6 votes vote down vote up
/**
 * Returns the combined (text+POS) representation of a dataset.
 *
 * <p>Loads the dataset from the given location, marks the attribute at index 1
 * as the class, and converts string attributes into word-vector attributes using
 * a {@code StringToWordVector} filter tokenized into 2-grams only.
 *
 * @param fileComplex location of the dataset, passed as-is to {@code DataSource}
 * @return the dataset after the word-vector filter has been applied
 * @throws Exception if the dataset cannot be loaded or the filter fails
 */
private Instances getComplex(String fileComplex) throws Exception{
	DataSource ds = new DataSource(fileComplex);
	Instances data = ds.getDataSet();
	data.setClassIndex(1);

	// Fixed-size n-grams: min == max == 2.
	NGramTokenizer tokenizer = new NGramTokenizer();
	tokenizer.setNGramMinSize(2);
	tokenizer.setNGramMaxSize(2);

	StringToWordVector filter = new StringToWordVector();
	filter.setLowerCaseTokens(true);
	filter.setMinTermFreq(1);
	filter.setUseStoplist(false);
	filter.setTFTransform(false);
	filter.setIDFTransform(false);
	// Very large limit; presumably intended to keep the whole vocabulary.
	filter.setWordsToKeep(1000000000);
	filter.setTokenizer(tokenizer);

	// Weka expects setInputFormat to be the last call before the filter is
	// applied, so that every option set above is taken into account.
	filter.setInputFormat(data);

	return weka.filters.Filter.useFilter(data, filter);
}
 
Example 4
Source File: SentimentAnalyser.java    From sentiment-analysis with Apache License 2.0 5 votes vote down vote up
/**
 * Creates and configures the {@code StringToWordVector} filter held in {@code stwv}.
 *
 * <p>The filter lower-cases tokens, uses a minimum term frequency of 1, disables
 * the stoplist and the TF/IDF transforms, tokenizes into 2-grams only, and is
 * restricted to the attribute range {@code "first"}.
 */
private void initializeFilter(){
	// Tokenizer producing n-grams of exactly size 2.
	NGramTokenizer bigrams = new NGramTokenizer();
	bigrams.setNGramMaxSize(2);
	bigrams.setNGramMinSize(2);

	stwv = new StringToWordVector();
	stwv.setAttributeIndices("first");
	stwv.setTokenizer(bigrams);
	stwv.setWordsToKeep(1000000000);
	stwv.setMinTermFreq(1);
	stwv.setLowerCaseTokens(true);
	stwv.setUseStoplist(false);
	stwv.setIDFTransform(false);
	stwv.setTFTransform(false);
}
 
Example 5
Source File: PolarityClassifier.java    From sentiment-analysis with Apache License 2.0 5 votes vote down vote up
/**
 * Creates and configures the {@code StringToWordVector} filter held in {@code stwv}
 * for use in the representations.
 *
 * <p>The filter lower-cases tokens, uses a minimum term frequency of 1, disables
 * the stoplist and the TF/IDF transforms, and tokenizes into 2-grams only.
 */
private void initialiseTextFilter(){
	// Tokenizer producing n-grams of exactly size 2.
	NGramTokenizer bigrams = new NGramTokenizer();
	bigrams.setNGramMaxSize(2);
	bigrams.setNGramMinSize(2);

	stwv = new StringToWordVector();
	stwv.setTokenizer(bigrams);
	stwv.setWordsToKeep(1000000000);
	stwv.setMinTermFreq(1);
	stwv.setLowerCaseTokens(true);
	stwv.setUseStoplist(false);
	stwv.setIDFTransform(false);
	stwv.setTFTransform(false);
}