org.apache.spark.ml.feature.HashingTF Java Examples
The following examples show how to use
org.apache.spark.ml.feature.HashingTF.
Each example notes its source file, originating project, and license above the code, so you can refer back to the original context.
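For orientation before the full examples: HashingTF is a Transformer that maps a column of token arrays to fixed-length term-frequency vectors using the hashing trick. The sketch below shows its most basic use in isolation; it is a minimal illustration (not taken from any of the projects listed here) and assumes an already-running SparkSession named spark, with illustrative column names.

    import java.util.Arrays;
    import java.util.List;
    import org.apache.spark.ml.feature.HashingTF;
    import org.apache.spark.ml.feature.Tokenizer;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.RowFactory;
    import org.apache.spark.sql.types.DataTypes;
    import org.apache.spark.sql.types.Metadata;
    import org.apache.spark.sql.types.StructField;
    import org.apache.spark.sql.types.StructType;

    // Assumes a SparkSession named "spark" already exists (hypothetical setup).
    List<Row> docs = Arrays.asList(RowFactory.create("spark hashing tf example"));
    StructType schema = new StructType(new StructField[]{
        new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
    });
    Dataset<Row> df = spark.createDataFrame(docs, schema);

    // Split each sentence into words, then hash the words into a
    // 1000-dimensional term-frequency vector in the "features" column.
    Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
    HashingTF hashingTF = new HashingTF()
        .setInputCol("words")
        .setOutputCol("features")
        .setNumFeatures(1000);
    Dataset<Row> features = hashingTF.transform(tokenizer.transform(df));
    features.show(false);

The examples that follow show the same transformer embedded in real pipelines, adapters, and tests.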
Example #1
Source File: JavaTfIdfExample.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) { SparkSession spark = SparkSession .builder() .appName("JavaTfIdfExample") .getOrCreate(); // $example on$ List<Row> data = Arrays.asList( RowFactory.create(0.0, "Hi I heard about Spark"), RowFactory.create(0.0, "I wish Java could use case classes"), RowFactory.create(1.0, "Logistic regression models are neat") ); StructType schema = new StructType(new StructField[]{ new StructField("label", DataTypes.DoubleType, false, Metadata.empty()), new StructField("sentence", DataTypes.StringType, false, Metadata.empty()) }); Dataset<Row> sentenceData = spark.createDataFrame(data, schema); Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words"); Dataset<Row> wordsData = tokenizer.transform(sentenceData); int numFeatures = 20; HashingTF hashingTF = new HashingTF() .setInputCol("words") .setOutputCol("rawFeatures") .setNumFeatures(numFeatures); Dataset<Row> featurizedData = hashingTF.transform(wordsData); // alternatively, CountVectorizer can also be used to get term frequency vectors IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features"); IDFModel idfModel = idf.fit(featurizedData); Dataset<Row> rescaledData = idfModel.transform(featurizedData); rescaledData.select("label", "features").show(); // $example off$ spark.stop(); }
Example #2
Source File: TestSparkMLDeriver.java From envelope with Apache License 2.0 | 5 votes |
private void generateAndSaveModel(String savePath) throws IOException {
  // Sourced from the Spark ML documentation and examples

  StructType trainingSchema = DataTypes.createStructType(Lists.newArrayList(
      DataTypes.createStructField("id", DataTypes.LongType, false),
      DataTypes.createStructField("text", DataTypes.StringType, false),
      DataTypes.createStructField("label", DataTypes.DoubleType, false)
  ));
  Dataset<Row> training = Contexts.getSparkSession().createDataFrame(Lists.newArrayList(
      RowFactory.create(0L, "a b c d e spark", 1.0),
      RowFactory.create(1L, "b d", 0.0),
      RowFactory.create(2L, "spark f g h", 1.0),
      RowFactory.create(3L, "hadoop mapreduce", 0.0)
  ), trainingSchema);

  Tokenizer tokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("words");
  HashingTF hashingTF = new HashingTF()
      .setNumFeatures(1000)
      .setInputCol(tokenizer.getOutputCol())
      .setOutputCol("features");
  LogisticRegression lr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.001);
  Pipeline pipeline = new Pipeline()
      .setStages(new PipelineStage[] {tokenizer, hashingTF, lr});

  PipelineModel model = pipeline.fit(training);

  model.write().overwrite().save(savePath);
}
Example #3
Source File: HashingTFModelInfoAdapter.java From spark-transformers with Apache License 2.0 | 5 votes |
@Override
public HashingTFModelInfo getModelInfo(final HashingTF from) {
  final HashingTFModelInfo modelInfo = new HashingTFModelInfo();
  modelInfo.setNumFeatures(from.getNumFeatures());

  Set<String> inputKeys = new LinkedHashSet<String>();
  inputKeys.add(from.getInputCol());
  modelInfo.setInputKeys(inputKeys);

  Set<String> outputKeys = new LinkedHashSet<String>();
  outputKeys.add(from.getOutputCol());
  modelInfo.setOutputKeys(outputKeys);

  return modelInfo;
}
Example #4
Source File: HashingTFModelInfoAdapter.java From spark-transformers with Apache License 2.0 | 5 votes |
@Override
public HashingTFModelInfo getModelInfo(final HashingTF from, DataFrame df) {
  final HashingTFModelInfo modelInfo = new HashingTFModelInfo();
  modelInfo.setNumFeatures(from.getNumFeatures());

  Set<String> inputKeys = new LinkedHashSet<String>();
  inputKeys.add(from.getInputCol());
  modelInfo.setInputKeys(inputKeys);

  Set<String> outputKeys = new LinkedHashSet<String>();
  outputKeys.add(from.getOutputCol());
  modelInfo.setOutputKeys(outputKeys);

  return modelInfo;
}
Example #5
Source File: JavaPipelineExample.java From SparkDemo with MIT License | 4 votes |
public static void main(String[] args) { SparkSession spark = SparkSession .builder() .appName("JavaPipelineExample") .getOrCreate(); // $example on$ // Prepare training documents, which are labeled. Dataset<Row> training = spark.createDataFrame(Arrays.asList( new JavaLabeledDocument(0L, "a b c d e spark", 1.0), new JavaLabeledDocument(1L, "b d", 0.0), new JavaLabeledDocument(2L, "spark f g h", 1.0), new JavaLabeledDocument(3L, "hadoop mapreduce", 0.0) ), JavaLabeledDocument.class); // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. Tokenizer tokenizer = new Tokenizer() .setInputCol("text") .setOutputCol("words"); HashingTF hashingTF = new HashingTF() .setNumFeatures(1000) .setInputCol(tokenizer.getOutputCol()) .setOutputCol("features"); LogisticRegression lr = new LogisticRegression() .setMaxIter(10) .setRegParam(0.001); Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[] {tokenizer, hashingTF, lr}); // Fit the pipeline to training documents. PipelineModel model = pipeline.fit(training); // Prepare test documents, which are unlabeled. Dataset<Row> test = spark.createDataFrame(Arrays.asList( new JavaDocument(4L, "spark i j k"), new JavaDocument(5L, "l m n"), new JavaDocument(6L, "spark hadoop spark"), new JavaDocument(7L, "apache hadoop") ), JavaDocument.class); // Make predictions on test documents. Dataset<Row> predictions = model.transform(test); for (Row r : predictions.select("id", "text", "probability", "prediction").collectAsList()) { System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2) + ", prediction=" + r.get(3)); } // $example off$ spark.stop(); }
Example #6
Source File: JavaModelSelectionViaCrossValidationExample.java From SparkDemo with MIT License | 4 votes |
public static void main(String[] args) { SparkSession spark = SparkSession .builder() .appName("JavaModelSelectionViaCrossValidationExample") .getOrCreate(); // $example on$ // Prepare training documents, which are labeled. Dataset<Row> training = spark.createDataFrame(Arrays.asList( new JavaLabeledDocument(0L, "a b c d e spark", 1.0), new JavaLabeledDocument(1L, "b d", 0.0), new JavaLabeledDocument(2L,"spark f g h", 1.0), new JavaLabeledDocument(3L, "hadoop mapreduce", 0.0), new JavaLabeledDocument(4L, "b spark who", 1.0), new JavaLabeledDocument(5L, "g d a y", 0.0), new JavaLabeledDocument(6L, "spark fly", 1.0), new JavaLabeledDocument(7L, "was mapreduce", 0.0), new JavaLabeledDocument(8L, "e spark program", 1.0), new JavaLabeledDocument(9L, "a e c l", 0.0), new JavaLabeledDocument(10L, "spark compile", 1.0), new JavaLabeledDocument(11L, "hadoop software", 0.0) ), JavaLabeledDocument.class); // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. Tokenizer tokenizer = new Tokenizer() .setInputCol("text") .setOutputCol("words"); HashingTF hashingTF = new HashingTF() .setNumFeatures(1000) .setInputCol(tokenizer.getOutputCol()) .setOutputCol("features"); LogisticRegression lr = new LogisticRegression() .setMaxIter(10) .setRegParam(0.01); Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[] {tokenizer, hashingTF, lr}); // We use a ParamGridBuilder to construct a grid of parameters to search over. // With 3 values for hashingTF.numFeatures and 2 values for lr.regParam, // this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from. ParamMap[] paramGrid = new ParamGridBuilder() .addGrid(hashingTF.numFeatures(), new int[] {10, 100, 1000}) .addGrid(lr.regParam(), new double[] {0.1, 0.01}) .build(); // We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance. // This will allow us to jointly choose parameters for all Pipeline stages. // A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator. // Note that the evaluator here is a BinaryClassificationEvaluator and its default metric // is areaUnderROC. CrossValidator cv = new CrossValidator() .setEstimator(pipeline) .setEvaluator(new BinaryClassificationEvaluator()) .setEstimatorParamMaps(paramGrid).setNumFolds(2); // Use 3+ in practice // Run cross-validation, and choose the best set of parameters. CrossValidatorModel cvModel = cv.fit(training); // Prepare test documents, which are unlabeled. Dataset<Row> test = spark.createDataFrame(Arrays.asList( new JavaDocument(4L, "spark i j k"), new JavaDocument(5L, "l m n"), new JavaDocument(6L, "mapreduce spark"), new JavaDocument(7L, "apache hadoop") ), JavaDocument.class); // Make predictions on test documents. cvModel uses the best model found (lrModel). Dataset<Row> predictions = cvModel.transform(test); for (Row r : predictions.select("id", "text", "probability", "prediction").collectAsList()) { System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2) + ", prediction=" + r.get(3)); } // $example off$ spark.stop(); }
Example #7
Source File: WhitespaceClassifier.java From vn.vitk with GNU General Public License v3.0 | 4 votes |
/**
 * Trains a whitespace classifier model and saves the resulting pipeline model
 * to an external file.
 * @param sentences a list of tokenized sentences.
 * @param pipelineModelFileName
 * @param numFeatures
 */
public void train(List<String> sentences, String pipelineModelFileName, int numFeatures) {
  List<WhitespaceContext> contexts = new ArrayList<WhitespaceContext>(sentences.size());
  int id = 0;
  for (String sentence : sentences) {
    sentence = sentence.trim();
    for (int j = 0; j < sentence.length(); j++) {
      char c = sentence.charAt(j);
      if (c == ' ' || c == '_') {
        WhitespaceContext context = new WhitespaceContext();
        context.setId(id++);
        context.setContext(extractContext(sentence, j));
        context.setLabel(c == ' ' ? 0d : 1d);
        contexts.add(context);
      }
    }
  }
  JavaRDD<WhitespaceContext> jrdd = jsc.parallelize(contexts);
  DataFrame df = sqlContext.createDataFrame(jrdd, WhitespaceContext.class);
  df.show(false);
  System.out.println("N = " + df.count());
  df.groupBy("label").count().show();

  org.apache.spark.ml.feature.Tokenizer tokenizer = new Tokenizer()
      .setInputCol("context").setOutputCol("words");
  HashingTF hashingTF = new HashingTF().setNumFeatures(numFeatures)
      .setInputCol(tokenizer.getOutputCol()).setOutputCol("features");
  LogisticRegression lr = new LogisticRegression().setMaxIter(100)
      .setRegParam(0.01);
  Pipeline pipeline = new Pipeline().setStages(new PipelineStage[] {
      tokenizer, hashingTF, lr });
  model = pipeline.fit(df);

  try {
    model.write().overwrite().save(pipelineModelFileName);
  } catch (IOException e) {
    e.printStackTrace();
  }

  DataFrame predictions = model.transform(df);
  predictions.show();
  MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
      .setMetricName("precision");
  double accuracy = evaluator.evaluate(predictions);
  System.out.println("training accuracy = " + accuracy);

  LogisticRegressionModel lrModel = (LogisticRegressionModel) model.stages()[2];
  LogisticRegressionTrainingSummary trainingSummary = lrModel.summary();
  double[] objectiveHistory = trainingSummary.objectiveHistory();
  System.out.println("#(iterations) = " + objectiveHistory.length);
  for (double lossPerIteration : objectiveHistory) {
    System.out.println(lossPerIteration);
  }
}
Example #8
Source File: HashingTFModelInfoAdapter.java From spark-transformers with Apache License 2.0 | 4 votes |
@Override
public Class<HashingTF> getSource() {
  return HashingTF.class;
}
Example #9
Source File: PipelineBridgeTest.java From spark-transformers with Apache License 2.0 | 4 votes |
@Test public void testPipeline() { // Prepare training documents, which are labeled. StructType schema = createStructType(new StructField[]{ createStructField("id", LongType, false), createStructField("text", StringType, false), createStructField("label", DoubleType, false) }); Dataset<Row> trainingData = spark.createDataFrame(Arrays.asList( cr(0L, "a b c d e spark", 1.0), cr(1L, "b d", 0.0), cr(2L, "spark f g h", 1.0), cr(3L, "hadoop mapreduce", 0.0) ), schema); // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and LogisticRegression. RegexTokenizer tokenizer = new RegexTokenizer() .setInputCol("text") .setOutputCol("words") .setPattern("\\s") .setGaps(true) .setToLowercase(false); HashingTF hashingTF = new HashingTF() .setNumFeatures(1000) .setInputCol(tokenizer.getOutputCol()) .setOutputCol("features"); LogisticRegression lr = new LogisticRegression() .setMaxIter(10) .setRegParam(0.01); Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[]{tokenizer, hashingTF, lr}); // Fit the pipeline to training documents. PipelineModel sparkPipelineModel = pipeline.fit(trainingData); //Export this model byte[] exportedModel = ModelExporter.export(sparkPipelineModel); System.out.println(new String(exportedModel)); //Import and get Transformer Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel); //prepare test data StructType testSchema = createStructType(new StructField[]{ createStructField("id", LongType, false), createStructField("text", StringType, false), }); Dataset<Row> testData = spark.createDataFrame(Arrays.asList( cr(4L, "spark i j k"), cr(5L, "l m n"), cr(6L, "mapreduce spark"), cr(7L, "apache hadoop") ), testSchema); //verify that predictions for spark pipeline and exported pipeline are the same List<Row> predictions = sparkPipelineModel.transform(testData).select("id", "text", "probability", "prediction").collectAsList(); for (Row r : predictions) { System.out.println(r); double sparkPipelineOp = r.getDouble(3); Map<String, Object> data = new HashMap<String, Object>(); data.put("text", r.getString(1)); transformer.transform(data); double exportedPipelineOp = (double) data.get("prediction"); double exportedPipelineProb = (double) data.get("probability"); assertEquals(sparkPipelineOp, exportedPipelineOp, 0.01); } }
Example #10
Source File: HashingTFModelInfoAdapter.java From spark-transformers with Apache License 2.0 | 4 votes |
@Override
public Class<HashingTF> getSource() {
  return HashingTF.class;
}
Example #11
Source File: HashingTFBridgeTest.java From spark-transformers with Apache License 2.0 | 4 votes |
@Test
public void testHashingTF() {
  // prepare data
  List<Row> trainingData = Arrays.asList(
      RowFactory.create(1, 0.0, "Hi I heard about Spark"),
      RowFactory.create(2, 0.0, "I wish Java could use case classes"),
      RowFactory.create(3, 1.0, "Logistic regression models are neat")
  );
  StructType schema = new StructType(new StructField[]{
      createStructField("id", IntegerType, false),
      createStructField("label", DoubleType, false),
      createStructField("sentence", StringType, false),
  });
  DataFrame sentenceData = sqlContext.createDataFrame(trainingData, schema);

  Tokenizer tokenizer = new Tokenizer()
      .setInputCol("sentence")
      .setOutputCol("words");
  DataFrame wordsData = tokenizer.transform(sentenceData);

  // train model in spark
  int numFeatures = 20;
  HashingTF sparkModel = new HashingTF()
      .setInputCol("words")
      .setOutputCol("rawFeatures")
      .setNumFeatures(numFeatures);

  // Export this model
  byte[] exportedModel = ModelExporter.export(sparkModel, sentenceData);

  // Import and get Transformer
  Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

  // compare predictions
  Row[] sparkOutput = sparkModel.transform(wordsData).orderBy("id")
      .select("id", "sentence", "words", "rawFeatures").collect();
  for (Row row : sparkOutput) {
    String[] words = ((String) row.get(1)).toLowerCase().split(" ");

    Map<String, Object> data = new HashMap<String, Object>();
    data.put(sparkModel.getInputCol(), words);
    transformer.transform(data);

    double[] transformedOp = (double[]) data.get(sparkModel.getOutputCol());
    double[] sparkOp = ((Vector) row.get(3)).toArray();
    assertArrayEquals(transformedOp, sparkOp, EPSILON);
  }
}
Example #12
Source File: PipelineBridgeTest.java From spark-transformers with Apache License 2.0 | 4 votes |
@Test public void testPipeline() { // Prepare training documents, which are labeled. StructType schema = createStructType(new StructField[]{ createStructField("id", LongType, false), createStructField("text", StringType, false), createStructField("label", DoubleType, false) }); DataFrame trainingData = sqlContext.createDataFrame(Arrays.asList( cr(0L, "a b c d e spark", 1.0), cr(1L, "b d", 0.0), cr(2L, "spark f g h", 1.0), cr(3L, "hadoop mapreduce", 0.0) ), schema); // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and LogisticRegression. RegexTokenizer tokenizer = new RegexTokenizer() .setInputCol("text") .setOutputCol("words") .setPattern("\\s") .setGaps(true) .setToLowercase(false); HashingTF hashingTF = new HashingTF() .setNumFeatures(1000) .setInputCol(tokenizer.getOutputCol()) .setOutputCol("features"); LogisticRegression lr = new LogisticRegression() .setMaxIter(10) .setRegParam(0.01); Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[]{tokenizer, hashingTF, lr}); // Fit the pipeline to training documents. PipelineModel sparkPipelineModel = pipeline.fit(trainingData); //Export this model byte[] exportedModel = ModelExporter.export(sparkPipelineModel, trainingData); System.out.println(new String(exportedModel)); //Import and get Transformer Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel); //prepare test data StructType testSchema = createStructType(new StructField[]{ createStructField("id", LongType, false), createStructField("text", StringType, false), }); DataFrame testData = sqlContext.createDataFrame(Arrays.asList( cr(4L, "spark i j k"), cr(5L, "l m n"), cr(6L, "mapreduce spark"), cr(7L, "apache hadoop") ), testSchema); //verify that predictions for spark pipeline and exported pipeline are the same Row[] predictions = sparkPipelineModel.transform(testData).select("id", "text", "probability", "prediction").collect(); for (Row r : predictions) { System.out.println(r); double sparkPipelineOp = r.getDouble(3); Map<String, Object> data = new HashMap<String, Object>(); data.put("text", r.getString(1)); transformer.transform(data); double exportedPipelineOp = (double) data.get("prediction"); double exportedPipelineProb = (double) data.get("probability"); assertEquals(sparkPipelineOp, exportedPipelineOp, EPSILON); } }