org.apache.spark.ml.feature.CountVectorizer Java Examples
The following examples show how to use org.apache.spark.ml.feature.CountVectorizer.
Example #1
Source File: CMM.java from vn.vitk (GNU General Public License v3.0)
/**
 * Creates a processing pipeline.
 * @return a pipeline
 */
private Pipeline createPipeline() {
    Tokenizer tokenizer = new Tokenizer()
        .setInputCol("featureStrings")
        .setOutputCol("tokens");
    CountVectorizer countVectorizer = new CountVectorizer()
        .setInputCol("tokens")
        .setOutputCol("features")
        .setMinDF((Double) params.getOrDefault(params.getMinFF()))
        .setVocabSize((Integer) params.getOrDefault(params.getNumFeatures()));
    StringIndexer tagIndexer = new StringIndexer()
        .setInputCol("tag")
        .setOutputCol("label");
    Pipeline pipeline = new Pipeline().setStages(
        new PipelineStage[]{tokenizer, countVectorizer, tagIndexer});
    return pipeline;
}
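The method above only constructs the pipeline; fitting happens elsewhere in the project. A caller might use it roughly as follows (a minimal sketch; the training DataFrame and variable names are assumptions, not part of the original source):

// Hypothetical usage of the pipeline returned by createPipeline().
// "training" is an assumed DataFrame with "featureStrings" and "tag" columns.
Pipeline pipeline = createPipeline();
PipelineModel model = pipeline.fit(training);      // learns the vocabulary and the label index
Dataset<Row> prepared = model.transform(training); // adds "tokens", "features", "label" columns
prepared.select("features", "label").show(false);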
Example #2
Source File: TransitionClassifier.java from vn.vitk (GNU General Public License v3.0)
/**
 * Creates a processing pipeline.
 * @return a pipeline
 */
protected Pipeline createPipeline() {
    Tokenizer tokenizer = new Tokenizer()
        .setInputCol("text")
        .setOutputCol("tokens");
    CountVectorizer countVectorizer = new CountVectorizer()
        .setInputCol("tokens")
        .setOutputCol("features")
        .setMinDF((Double) params.getOrDefault(params.getMinFF()))
        .setVocabSize((Integer) params.getOrDefault(params.getNumFeatures()));
    StringIndexer transitionIndexer = new StringIndexer()
        .setInputCol("transition")
        .setOutputCol("label");
    Pipeline pipeline = new Pipeline().setStages(
        new PipelineStage[]{tokenizer, countVectorizer, transitionIndexer});
    return pipeline;
}
Example #3
Source File: JavaCountVectorizerExample.java from SparkDemo (MIT License)
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaCountVectorizerExample")
        .getOrCreate();

    // $example on$
    // Input data: Each row is a bag of words from a sentence or document.
    List<Row> data = Arrays.asList(
        RowFactory.create(Arrays.asList("a", "b", "c")),
        RowFactory.create(Arrays.asList("a", "b", "b", "c", "a"))
    );
    StructType schema = new StructType(new StructField[]{
        new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
    });
    Dataset<Row> df = spark.createDataFrame(data, schema);

    // fit a CountVectorizerModel from the corpus
    CountVectorizerModel cvModel = new CountVectorizer()
        .setInputCol("text")
        .setOutputCol("feature")
        .setVocabSize(3)
        .setMinDF(2)
        .fit(df);

    // alternatively, define CountVectorizerModel with a-priori vocabulary
    CountVectorizerModel cvm = new CountVectorizerModel(new String[]{"a", "b", "c"})
        .setInputCol("text")
        .setOutputCol("feature");

    cvModel.transform(df).show(false);
    // $example off$

    spark.stop();
}
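Note that the example constructs cvm, the model built from an a-priori vocabulary, but only shows the output of the fitted cvModel. The pre-defined model could be applied in exactly the same way (a one-line sketch, not part of the original file):

// Apply the CountVectorizerModel built from the fixed vocabulary {"a", "b", "c"}.
cvm.transform(df).show(false);

Either call appends a sparse count vector per row; for instance, the row [a, b, b, c, a] maps to counts of 2, 2, and 1 over the three vocabulary terms.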
Example #4
Source File: CountVectorizerBridgeTest.java from spark-transformers (Apache License 2.0)
@Test
public void testCountVectorizer() {
    final List<String[]> input = new ArrayList<>();
    input.add(new String[]{"a", "b", "c"});
    input.add(new String[]{"a", "b", "b", "c", "a"});

    //prepare data
    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
        RowFactory.create(1, input.get(0)),
        RowFactory.create(2, input.get(1))
    ));
    StructType schema = new StructType(new StructField[]{
        new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
        new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
    });
    Dataset<Row> df = spark.createDataFrame(jrdd, schema);

    //train model in spark
    CountVectorizerModel sparkModel = new CountVectorizer()
        .setInputCol("text")
        .setOutputCol("feature")
        .setVocabSize(3)
        .setMinDF(2)
        .fit(df);

    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkModel);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //compare predictions
    List<Row> sparkOutput = sparkModel.transform(df).orderBy("id").select("feature").collectAsList();
    for (int i = 0; i < 2; i++) {
        String[] words = input.get(i);

        Map<String, Object> data = new HashMap<String, Object>();
        data.put(sparkModel.getInputCol(), words);
        transformer.transform(data);
        double[] transformedOp = (double[]) data.get(sparkModel.getOutputCol());

        double[] sparkOp = ((Vector) sparkOutput.get(i).get(0)).toArray();
        assertArrayEquals(transformedOp, sparkOp, 0.01);
    }
}
Example #5
Source File: CountVectorizerBridgeTest.java from spark-transformers (Apache License 2.0)
@Test
public void testCountVectorizer() {
    final List<List<String>> input = new ArrayList<>();
    input.add(Arrays.<String>asList("a", "b", "c"));
    input.add(Arrays.<String>asList("a", "b", "b", "c", "a"));

    //prepare data
    JavaRDD<Row> jrdd = sc.parallelize(Arrays.asList(
        RowFactory.create(1, input.get(0)),
        RowFactory.create(2, input.get(1))
    ));
    StructType schema = new StructType(new StructField[]{
        new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
        new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
    });
    DataFrame df = sqlContext.createDataFrame(jrdd, schema);

    //train model in spark
    CountVectorizerModel sparkModel = new CountVectorizer()
        .setInputCol("text")
        .setOutputCol("feature")
        .setVocabSize(3)
        .setMinDF(2)
        .fit(df);

    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkModel, df);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //compare predictions
    Row[] sparkOutput = sparkModel.transform(df).orderBy("id").select("feature").collect();
    for (int i = 0; i < 2; i++) {
        Object[] words = input.get(i).toArray();

        Map<String, Object> data = new HashMap<String, Object>();
        data.put(sparkModel.getInputCol(), words);
        transformer.transform(data);
        double[] transformedOp = (double[]) data.get(sparkModel.getOutputCol());

        double[] sparkOp = ((Vector) sparkOutput[i].get(0)).toArray();
        assertArrayEquals(transformedOp, sparkOp, EPSILON);
    }
}