org.apache.spark.ml.feature.HashingTF Java Examples
The following examples show how to use
org.apache.spark.ml.feature.HashingTF.
Each example notes its source file, originating project, and license above the code, so you can refer back to the original context.
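For orientation before the full examples: HashingTF is a Transformer that maps a column of token arrays to fixed-length term-frequency vectors using the hashing trick. The sketch below shows its most basic use in isolation; it is a minimal illustration (not taken from any of the projects listed here) and assumes an already-running SparkSession named spark, with illustrative column names.

    import java.util.Arrays;
    import java.util.List;
    import org.apache.spark.ml.feature.HashingTF;
    import org.apache.spark.ml.feature.Tokenizer;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.RowFactory;
    import org.apache.spark.sql.types.DataTypes;
    import org.apache.spark.sql.types.Metadata;
    import org.apache.spark.sql.types.StructField;
    import org.apache.spark.sql.types.StructType;

    // Assumes a SparkSession named "spark" already exists (hypothetical setup).
    List<Row> docs = Arrays.asList(RowFactory.create("spark hashing tf example"));
    StructType schema = new StructType(new StructField[]{
        new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
    });
    Dataset<Row> df = spark.createDataFrame(docs, schema);

    // Split each sentence into words, then hash the words into a
    // 1000-dimensional term-frequency vector in the "features" column.
    Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
    HashingTF hashingTF = new HashingTF()
        .setInputCol("words")
        .setOutputCol("features")
        .setNumFeatures(1000);
    Dataset<Row> features = hashingTF.transform(tokenizer.transform(df));
    features.show(false);

The examples that follow show the same transformer embedded in real pipelines, adapters, and tests.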
Example #1
Source File: JavaTfIdfExample.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) { SparkSession spark = SparkSession .builder() .appName("JavaTfIdfExample") .getOrCreate(); // $example on$ List<Row> data = Arrays.asList( RowFactory.create(0.0, "Hi I heard about Spark"), RowFactory.create(0.0, "I wish Java could use case classes"), RowFactory.create(1.0, "Logistic regression models are neat") ); StructType schema = new StructType(new StructField[]{ new StructField("label", DataTypes.DoubleType, false, Metadata.empty()), new StructField("sentence", DataTypes.StringType, false, Metadata.empty()) }); Dataset<Row> sentenceData = spark.createDataFrame(data, schema); Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words"); Dataset<Row> wordsData = tokenizer.transform(sentenceData); int numFeatures = 20; HashingTF hashingTF = new HashingTF() .setInputCol("words") .setOutputCol("rawFeatures") .setNumFeatures(numFeatures); Dataset<Row> featurizedData = hashingTF.transform(wordsData); // alternatively, CountVectorizer can also be used to get term frequency vectors IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features"); IDFModel idfModel = idf.fit(featurizedData); Dataset<Row> rescaledData = idfModel.transform(featurizedData); rescaledData.select("label", "features").show(); // $example off$ spark.stop(); }
Example #2
Source File: TestSparkMLDeriver.java From envelope with Apache License 2.0 | 5 votes |
private void generateAndSaveModel(String savePath) throws IOException {
  // Sourced from the Spark ML documentation and examples

  StructType trainingSchema = DataTypes.createStructType(Lists.newArrayList(
      DataTypes.createStructField("id", DataTypes.LongType, false),
      DataTypes.createStructField("text", DataTypes.StringType, false),
      DataTypes.createStructField("label", DataTypes.DoubleType, false)
  ));
  Dataset<Row> training = Contexts.getSparkSession().createDataFrame(Lists.newArrayList(
      RowFactory.create(0L, "a b c d e spark", 1.0),
      RowFactory.create(1L, "b d", 0.0),
      RowFactory.create(2L, "spark f g h", 1.0),
      RowFactory.create(3L, "hadoop mapreduce", 0.0)
  ), trainingSchema);

  Tokenizer tokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("words");
  HashingTF hashingTF = new HashingTF()
      .setNumFeatures(1000)
      .setInputCol(tokenizer.getOutputCol())
      .setOutputCol("features");
  LogisticRegression lr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.001);
  Pipeline pipeline = new Pipeline()
      .setStages(new PipelineStage[] {tokenizer, hashingTF, lr});

  PipelineModel model = pipeline.fit(training);

  model.write().overwrite().save(savePath);
}
Example #3
Source File: HashingTFModelInfoAdapter.java From spark-transformers with Apache License 2.0 | 5 votes |
@Override
public HashingTFModelInfo getModelInfo(final HashingTF from) {
  final HashingTFModelInfo modelInfo = new HashingTFModelInfo();
  modelInfo.setNumFeatures(from.getNumFeatures());

  Set<String> inputKeys = new LinkedHashSet<String>();
  inputKeys.add(from.getInputCol());
  modelInfo.setInputKeys(inputKeys);

  Set<String> outputKeys = new LinkedHashSet<String>();
  outputKeys.add(from.getOutputCol());
  modelInfo.setOutputKeys(outputKeys);

  return modelInfo;
}
Example #4
Source File: HashingTFModelInfoAdapter.java From spark-transformers with Apache License 2.0 | 5 votes |
@Override
public HashingTFModelInfo getModelInfo(final HashingTF from, DataFrame df) {
  final HashingTFModelInfo modelInfo = new HashingTFModelInfo();
  modelInfo.setNumFeatures(from.getNumFeatures());

  Set<String> inputKeys = new LinkedHashSet<String>();
  inputKeys.add(from.getInputCol());
  modelInfo.setInputKeys(inputKeys);

  Set<String> outputKeys = new LinkedHashSet<String>();
  outputKeys.add(from.getOutputCol());
  modelInfo.setOutputKeys(outputKeys);

  return modelInfo;
}
Example #5
Source File: JavaPipelineExample.java From SparkDemo with MIT License | 4 votes |
public static void main(String[] args) { SparkSession spark = SparkSession .builder() .appName("JavaPipelineExample") .getOrCreate(); // $example on$ // Prepare training documents, which are labeled. Dataset<Row> training = spark.createDataFrame(Arrays.asList( new JavaLabeledDocument(0L, "a b c d e spark", 1.0), new JavaLabeledDocument(1L, "b d", 0.0), new JavaLabeledDocument(2L, "spark f g h", 1.0), new JavaLabeledDocument(3L, "hadoop mapreduce", 0.0) ), JavaLabeledDocument.class); // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. Tokenizer tokenizer = new Tokenizer() .setInputCol("text") .setOutputCol("words"); HashingTF hashingTF = new HashingTF() .setNumFeatures(1000) .setInputCol(tokenizer.getOutputCol()) .setOutputCol("features"); LogisticRegression lr = new LogisticRegression() .setMaxIter(10) .setRegParam(0.001); Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[] {tokenizer, hashingTF, lr}); // Fit the pipeline to training documents. PipelineModel model = pipeline.fit(training); // Prepare test documents, which are unlabeled. Dataset<Row> test = spark.createDataFrame(Arrays.asList( new JavaDocument(4L, "spark i j k"), new JavaDocument(5L, "l m n"), new JavaDocument(6L, "spark hadoop spark"), new JavaDocument(7L, "apache hadoop") ), JavaDocument.class); // Make predictions on test documents. Dataset<Row> predictions = model.transform(test); for (Row r : predictions.select("id", "text", "probability", "prediction").collectAsList()) { System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2) + ", prediction=" + r.get(3)); } // $example off$ spark.stop(); }
Example #6
Source File: JavaModelSelectionViaCrossValidationExample.java From SparkDemo with MIT License | 4 votes |
public static void main(String[] args) { SparkSession spark = SparkSession .builder() .appName("JavaModelSelectionViaCrossValidationExample") .getOrCreate(); // $example on$ // Prepare training documents, which are labeled. Dataset<Row> training = spark.createDataFrame(Arrays.asList( new JavaLabeledDocument(0L, "a b c d e spark", 1.0), new JavaLabeledDocument(1L, "b d", 0.0), new JavaLabeledDocument(2L,"spark f g h", 1.0), new JavaLabeledDocument(3L, "hadoop mapreduce", 0.0), new JavaLabeledDocument(4L, "b spark who", 1.0), new JavaLabeledDocument(5L, "g d a y", 0.0), new JavaLabeledDocument(6L, "spark fly", 1.0), new JavaLabeledDocument(7L, "was mapreduce", 0.0), new JavaLabeledDocument(8L, "e spark program", 1.0), new JavaLabeledDocument(9L, "a e c l", 0.0), new JavaLabeledDocument(10L, "spark compile", 1.0), new JavaLabeledDocument(11L, "hadoop software", 0.0) ), JavaLabeledDocument.class); // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. Tokenizer tokenizer = new Tokenizer() .setInputCol("text") .setOutputCol("words"); HashingTF hashingTF = new HashingTF() .setNumFeatures(1000) .setInputCol(tokenizer.getOutputCol()) .setOutputCol("features"); LogisticRegression lr = new LogisticRegression() .setMaxIter(10) .setRegParam(0.01); Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[] {tokenizer, hashingTF, lr}); // We use a ParamGridBuilder to construct a grid of parameters to search over. // With 3 values for hashingTF.numFeatures and 2 values for lr.regParam, // this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from. ParamMap[] paramGrid = new ParamGridBuilder() .addGrid(hashingTF.numFeatures(), new int[] {10, 100, 1000}) .addGrid(lr.regParam(), new double[] {0.1, 0.01}) .build(); // We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance. // This will allow us to jointly choose parameters for all Pipeline stages. // A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator. // Note that the evaluator here is a BinaryClassificationEvaluator and its default metric // is areaUnderROC. CrossValidator cv = new CrossValidator() .setEstimator(pipeline) .setEvaluator(new BinaryClassificationEvaluator()) .setEstimatorParamMaps(paramGrid).setNumFolds(2); // Use 3+ in practice // Run cross-validation, and choose the best set of parameters. CrossValidatorModel cvModel = cv.fit(training); // Prepare test documents, which are unlabeled. Dataset<Row> test = spark.createDataFrame(Arrays.asList( new JavaDocument(4L, "spark i j k"), new JavaDocument(5L, "l m n"), new JavaDocument(6L, "mapreduce spark"), new JavaDocument(7L, "apache hadoop") ), JavaDocument.class); // Make predictions on test documents. cvModel uses the best model found (lrModel). Dataset<Row> predictions = cvModel.transform(test); for (Row r : predictions.select("id", "text", "probability", "prediction").collectAsList()) { System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2) + ", prediction=" + r.get(3)); } // $example off$ spark.stop(); }
Example #7
Source File: WhitespaceClassifier.java From vn.vitk with GNU General Public License v3.0 | 4 votes |
/**
 * Trains a whitespace classifier model and saves the resulting pipeline model
 * to an external file.
 * @param sentences a list of tokenized sentences.
 * @param pipelineModelFileName
 * @param numFeatures
 */
public void train(List<String> sentences, String pipelineModelFileName, int numFeatures) {
  List<WhitespaceContext> contexts = new ArrayList<WhitespaceContext>(sentences.size());
  int id = 0;
  for (String sentence : sentences) {
    sentence = sentence.trim();
    for (int j = 0; j < sentence.length(); j++) {
      char c = sentence.charAt(j);
      if (c == ' ' || c == '_') {
        WhitespaceContext context = new WhitespaceContext();
        context.setId(id++);
        context.setContext(extractContext(sentence, j));
        context.setLabel(c == ' ' ? 0d : 1d);
        contexts.add(context);
      }
    }
  }
  JavaRDD<WhitespaceContext> jrdd = jsc.parallelize(contexts);
  DataFrame df = sqlContext.createDataFrame(jrdd, WhitespaceContext.class);
  df.show(false);
  System.out.println("N = " + df.count());
  df.groupBy("label").count().show();

  org.apache.spark.ml.feature.Tokenizer tokenizer = new Tokenizer()
      .setInputCol("context").setOutputCol("words");
  HashingTF hashingTF = new HashingTF().setNumFeatures(numFeatures)
      .setInputCol(tokenizer.getOutputCol()).setOutputCol("features");
  LogisticRegression lr = new LogisticRegression().setMaxIter(100)
      .setRegParam(0.01);
  Pipeline pipeline = new Pipeline().setStages(new PipelineStage[] {
      tokenizer, hashingTF, lr });
  model = pipeline.fit(df);

  try {
    model.write().overwrite().save(pipelineModelFileName);
  } catch (IOException e) {
    e.printStackTrace();
  }

  DataFrame predictions = model.transform(df);
  predictions.show();
  MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
      .setMetricName("precision");
  double accuracy = evaluator.evaluate(predictions);
  System.out.println("training accuracy = " + accuracy);

  LogisticRegressionModel lrModel = (LogisticRegressionModel) model.stages()[2];
  LogisticRegressionTrainingSummary trainingSummary = lrModel.summary();
  double[] objectiveHistory = trainingSummary.objectiveHistory();
  System.out.println("#(iterations) = " + objectiveHistory.length);
  for (double lossPerIteration : objectiveHistory) {
    System.out.println(lossPerIteration);
  }
}
Example #8
Source File: HashingTFModelInfoAdapter.java From spark-transformers with Apache License 2.0 | 4 votes |
@Override
public Class<HashingTF> getSource() {
  return HashingTF.class;
}
Example #9
Source File: PipelineBridgeTest.java From spark-transformers with Apache License 2.0 | 4 votes |
@Test public void testPipeline() { // Prepare training documents, which are labeled. StructType schema = createStructType(new StructField[]{ createStructField("id", LongType, false), createStructField("text", StringType, false), createStructField("label", DoubleType, false) }); Dataset<Row> trainingData = spark.createDataFrame(Arrays.asList( cr(0L, "a b c d e spark", 1.0), cr(1L, "b d", 0.0), cr(2L, "spark f g h", 1.0), cr(3L, "hadoop mapreduce", 0.0) ), schema); // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and LogisticRegression. RegexTokenizer tokenizer = new RegexTokenizer() .setInputCol("text") .setOutputCol("words") .setPattern("\\s") .setGaps(true) .setToLowercase(false); HashingTF hashingTF = new HashingTF() .setNumFeatures(1000) .setInputCol(tokenizer.getOutputCol()) .setOutputCol("features"); LogisticRegression lr = new LogisticRegression() .setMaxIter(10) .setRegParam(0.01); Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[]{tokenizer, hashingTF, lr}); // Fit the pipeline to training documents. PipelineModel sparkPipelineModel = pipeline.fit(trainingData); //Export this model byte[] exportedModel = ModelExporter.export(sparkPipelineModel); System.out.println(new String(exportedModel)); //Import and get Transformer Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel); //prepare test data StructType testSchema = createStructType(new StructField[]{ createStructField("id", LongType, false), createStructField("text", StringType, false), }); Dataset<Row> testData = spark.createDataFrame(Arrays.asList( cr(4L, "spark i j k"), cr(5L, "l m n"), cr(6L, "mapreduce spark"), cr(7L, "apache hadoop") ), testSchema); //verify that predictions for spark pipeline and exported pipeline are the same List<Row> predictions = sparkPipelineModel.transform(testData).select("id", "text", "probability", "prediction").collectAsList(); for (Row r : predictions) { System.out.println(r); double sparkPipelineOp = r.getDouble(3); Map<String, Object> data = new HashMap<String, Object>(); data.put("text", r.getString(1)); transformer.transform(data); double exportedPipelineOp = (double) data.get("prediction"); double exportedPipelineProb = (double) data.get("probability"); assertEquals(sparkPipelineOp, exportedPipelineOp, 0.01); } }
Example #10
Source File: HashingTFModelInfoAdapter.java From spark-transformers with Apache License 2.0 | 4 votes |
@Override
public Class<HashingTF> getSource() {
  return HashingTF.class;
}
Example #11
Source File: HashingTFBridgeTest.java From spark-transformers with Apache License 2.0 | 4 votes |
@Test
public void testHashingTF() {
  // prepare data
  List<Row> trainingData = Arrays.asList(
      RowFactory.create(1, 0.0, "Hi I heard about Spark"),
      RowFactory.create(2, 0.0, "I wish Java could use case classes"),
      RowFactory.create(3, 1.0, "Logistic regression models are neat")
  );
  StructType schema = new StructType(new StructField[]{
      createStructField("id", IntegerType, false),
      createStructField("label", DoubleType, false),
      createStructField("sentence", StringType, false),
  });
  DataFrame sentenceData = sqlContext.createDataFrame(trainingData, schema);

  Tokenizer tokenizer = new Tokenizer()
      .setInputCol("sentence")
      .setOutputCol("words");
  DataFrame wordsData = tokenizer.transform(sentenceData);

  // train model in spark
  int numFeatures = 20;
  HashingTF sparkModel = new HashingTF()
      .setInputCol("words")
      .setOutputCol("rawFeatures")
      .setNumFeatures(numFeatures);

  // Export this model
  byte[] exportedModel = ModelExporter.export(sparkModel, sentenceData);

  // Import and get Transformer
  Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

  // compare predictions
  Row[] sparkOutput = sparkModel.transform(wordsData).orderBy("id")
      .select("id", "sentence", "words", "rawFeatures").collect();
  for (Row row : sparkOutput) {
    String[] words = ((String) row.get(1)).toLowerCase().split(" ");

    Map<String, Object> data = new HashMap<String, Object>();
    data.put(sparkModel.getInputCol(), words);
    transformer.transform(data);

    double[] transformedOp = (double[]) data.get(sparkModel.getOutputCol());
    double[] sparkOp = ((Vector) row.get(3)).toArray();
    assertArrayEquals(transformedOp, sparkOp, EPSILON);
  }
}
Example #12
Source File: PipelineBridgeTest.java From spark-transformers with Apache License 2.0 | 4 votes |
@Test public void testPipeline() { // Prepare training documents, which are labeled. StructType schema = createStructType(new StructField[]{ createStructField("id", LongType, false), createStructField("text", StringType, false), createStructField("label", DoubleType, false) }); DataFrame trainingData = sqlContext.createDataFrame(Arrays.asList( cr(0L, "a b c d e spark", 1.0), cr(1L, "b d", 0.0), cr(2L, "spark f g h", 1.0), cr(3L, "hadoop mapreduce", 0.0) ), schema); // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and LogisticRegression. RegexTokenizer tokenizer = new RegexTokenizer() .setInputCol("text") .setOutputCol("words") .setPattern("\\s") .setGaps(true) .setToLowercase(false); HashingTF hashingTF = new HashingTF() .setNumFeatures(1000) .setInputCol(tokenizer.getOutputCol()) .setOutputCol("features"); LogisticRegression lr = new LogisticRegression() .setMaxIter(10) .setRegParam(0.01); Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[]{tokenizer, hashingTF, lr}); // Fit the pipeline to training documents. PipelineModel sparkPipelineModel = pipeline.fit(trainingData); //Export this model byte[] exportedModel = ModelExporter.export(sparkPipelineModel, trainingData); System.out.println(new String(exportedModel)); //Import and get Transformer Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel); //prepare test data StructType testSchema = createStructType(new StructField[]{ createStructField("id", LongType, false), createStructField("text", StringType, false), }); DataFrame testData = sqlContext.createDataFrame(Arrays.asList( cr(4L, "spark i j k"), cr(5L, "l m n"), cr(6L, "mapreduce spark"), cr(7L, "apache hadoop") ), testSchema); //verify that predictions for spark pipeline and exported pipeline are the same Row[] predictions = sparkPipelineModel.transform(testData).select("id", "text", "probability", "prediction").collect(); for (Row r : predictions) { System.out.println(r); double sparkPipelineOp = r.getDouble(3); Map<String, Object> data = new HashMap<String, Object>(); data.put("text", r.getString(1)); transformer.transform(data); double exportedPipelineOp = (double) data.get("prediction"); double exportedPipelineProb = (double) data.get("probability"); assertEquals(sparkPipelineOp, exportedPipelineOp, EPSILON); } }