org.apache.spark.ml.feature.CountVectorizer Java Examples

The following examples show how to use org.apache.spark.ml.feature.CountVectorizer, an Estimator that learns a vocabulary from a corpus of tokenized documents and converts each document into a sparse vector of token counts. The source project and license are noted above each example.
Example #1
Source File: CMM.java    From vn.vitk with GNU General Public License v3.0
/**
 * Creates a processing pipeline.
 * @return a pipeline
 */
private Pipeline createPipeline() {
	// lowercase each feature string and split it on whitespace into tokens
	Tokenizer tokenizer = new Tokenizer()
		.setInputCol("featureStrings")
		.setOutputCol("tokens");
	// turn each token array into a sparse vector of token counts
	CountVectorizer countVectorizer = new CountVectorizer()
		.setInputCol("tokens")
		.setOutputCol("features")
		.setMinDF((Double)params.getOrDefault(params.getMinFF()))
		.setVocabSize((Integer)params.getOrDefault(params.getNumFeatures()));
	// map each tag string to a numeric class label
	StringIndexer tagIndexer = new StringIndexer()
		.setInputCol("tag")
		.setOutputCol("label");
	
	Pipeline pipeline = new Pipeline().setStages(new PipelineStage[]{tokenizer, countVectorizer, tagIndexer});
	return pipeline;
}
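As a usage sketch (not part of the original source): the pipeline returned here would typically be fit to a training set and the fitted model applied to data. The Dataset<Row> trainingData below is a hypothetical placeholder holding the "featureStrings" and "tag" columns the stages expect; Example #2 would be used the same way.

Pipeline pipeline = createPipeline();
PipelineModel model = pipeline.fit(trainingData);     // trainingData is hypothetical
Dataset<Row> labeled = model.transform(trainingData); // adds tokens, features, label columns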
 
Example #2
Source File: TransitionClassifier.java    From vn.vitk with GNU General Public License v3.0
/**
 * Creates a processing pipeline.
 * @return a pipeline
 */
protected Pipeline createPipeline() {
	Tokenizer tokenizer = new Tokenizer()
		.setInputCol("text")
		.setOutputCol("tokens");
	// same vectorization settings as in Example #1
	CountVectorizer countVectorizer = new CountVectorizer()
		.setInputCol("tokens")
		.setOutputCol("features")
		.setMinDF((Double)params.getOrDefault(params.getMinFF()))
		.setVocabSize((Integer)params.getOrDefault(params.getNumFeatures()));
	// map each transition name to a numeric class label
	StringIndexer transitionIndexer = new StringIndexer()
		.setInputCol("transition")
		.setOutputCol("label");
	
	Pipeline pipeline = new Pipeline().setStages(new PipelineStage[]{tokenizer, countVectorizer, transitionIndexer});
	return pipeline;
}
 
Example #3
Source File: JavaCountVectorizerExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaCountVectorizerExample")
    .getOrCreate();

  // $example on$
  // Input data: Each row is a bag of words from a sentence or document.
  List<Row> data = Arrays.asList(
    RowFactory.create(Arrays.asList("a", "b", "c")),
    RowFactory.create(Arrays.asList("a", "b", "b", "c", "a"))
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
  });
  Dataset<Row> df = spark.createDataFrame(data, schema);

  // fit a CountVectorizerModel from the corpus
  CountVectorizerModel cvModel = new CountVectorizer()
    .setInputCol("text")
    .setOutputCol("feature")
    .setVocabSize(3)
    .setMinDF(2)
    .fit(df);

  // alternatively, define CountVectorizerModel with a-priori vocabulary
  CountVectorizerModel cvm = new CountVectorizerModel(new String[]{"a", "b", "c"})
    .setInputCol("text")
    .setOutputCol("feature");

  cvModel.transform(df).show(false);
  // $example off$

  spark.stop();
}
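For the two rows above, cvModel.transform(df).show(false) prints one sparse count vector per document over the learned three-term vocabulary. CountVectorizer orders the vocabulary by descending corpus frequency; "a" and "b" tie here, but both documents contain them equally often, so the printed values come out the same regardless of how the tie is broken:

+---------------+-------------------------+
|text           |feature                  |
+---------------+-------------------------+
|[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---------------+-------------------------+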
 
Example #4
Source File: CountVectorizerBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testCountVectorizer() {

    final List<String[]> input = new ArrayList<>();
    input.add(new String[]{"a", "b", "c"});
    input.add(new String[]{"a", "b", "b", "c", "a"});

    //prepare data
    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
            RowFactory.create(1, input.get(0)),
            RowFactory.create(2, input.get(1))
    ));
    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
            new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
    });
    Dataset<Row> df = spark.createDataFrame(jrdd, schema);

    //train model in spark
    CountVectorizerModel sparkModel = new CountVectorizer()
            .setInputCol("text")
            .setOutputCol("feature")
            .setVocabSize(3)
            .setMinDF(2)
            .fit(df);
    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkModel);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //compare predictions
    List<Row> sparkOutput = sparkModel.transform(df).orderBy("id").select("feature").collectAsList();
    for (int i = 0; i < 2; i++) {
        String[] words = input.get(i);

        Map<String, Object> data = new HashMap<>();
        data.put(sparkModel.getInputCol(), words);
        transformer.transform(data);
        double[] transformedOp = (double[]) data.get(sparkModel.getOutputCol());

        double[] sparkOp = ((Vector) sparkOutput.get(i).get(0)).toArray();
        assertArrayEquals(transformedOp, sparkOp, 0.01);
    }
}
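The round trip under test: ModelExporter.export serializes the CountVectorizerModel fit in Spark to a byte array, ModelImporter.importAndGetTransformer rebuilds it as a standalone Transformer that runs without a Spark context, and the loop checks that the standalone transformer's double[] output for each document matches Spark's own sparse vector (densified via toArray) within a tolerance of 0.01.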
 
Example #5
Source File: CountVectorizerBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testCountVectorizer() {

    final List<List<String>> input = new ArrayList<>();
    input.add(Arrays.<String>asList("a", "b", "c"));
    input.add(Arrays.<String>asList("a", "b", "b", "c", "a"));

    //prepare data
    JavaRDD<Row> jrdd = sc.parallelize(Arrays.asList(
            RowFactory.create(1, input.get(0)),
            RowFactory.create(2, input.get(1))
    ));
    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
            new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
    });
    DataFrame df = sqlContext.createDataFrame(jrdd, schema);

    //train model in spark
    CountVectorizerModel sparkModel = new CountVectorizer()
            .setInputCol("text")
            .setOutputCol("feature")
            .setVocabSize(3)
            .setMinDF(2)
            .fit(df);
    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkModel, df);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //compare predictions
    Row[] sparkOutput = sparkModel.transform(df).orderBy("id").select("feature").collect();
    for (int i = 0; i < 2; i++) {
        Object[] words = input.get(i).toArray();

        Map<String, Object> data = new HashMap<>();
        data.put(sparkModel.getInputCol(), words);
        transformer.transform(data);
        double[] transformedOp = (double[]) data.get(sparkModel.getOutputCol());

        double[] sparkOp = ((Vector) sparkOutput[i].get(0)).toArray();
        assertArrayEquals(transformedOp, sparkOp, EPSILON);
    }
}
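This second test is the same scenario written against the pre-2.0 Spark API, which accounts for the differences from Example #4: the data is built into a DataFrame through sqlContext rather than a Dataset<Row> through a SparkSession, results are collected as a Row[] via collect() instead of collectAsList(), and ModelExporter.export in that version of spark-transformers also takes the DataFrame alongside the model. EPSILON is presumably a tolerance constant defined elsewhere in the test class.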