org.apache.spark.mllib.regression.LabeledPoint Java Examples
The following examples show how to use
org.apache.spark.mllib.regression.LabeledPoint.
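Before the examples: a LabeledPoint simply pairs a double label with an MLlib feature Vector. As a minimal, self-contained sketch using only the standard MLlib API (nothing project-specific), points can be constructed directly from dense or sparse vectors:

import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;

public class LabeledPointSketch {
    public static void main(String[] args) {
        // A positive example with a dense feature vector.
        LabeledPoint pos = new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0));
        // A negative example with a sparse vector of size 3,
        // non-zero entries at indices 0 and 2.
        LabeledPoint neg = new LabeledPoint(0.0,
                Vectors.sparse(3, new int[]{0, 2}, new double[]{1.0, 3.0}));
        System.out.println(pos.label() + " -> " + pos.features());
        System.out.println(neg.label() + " -> " + neg.features());
    }
}

Most of the examples below instead load points from libsvm-format files via MLUtils.loadLibSVMFile.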
Example #1
Source File: LogisticRegressionExporterTest.java From spark-transformers with Apache License 2.0
@Test
public void shouldExportAndImportCorrectly() {
    String datapath = "src/test/resources/binary_classification_test.libsvm";
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD();

    // Train model in Spark
    LogisticRegressionModel lrmodel = new LogisticRegressionWithSGD().run(data.rdd());

    // Export this model
    byte[] exportedModel = ModelExporter.export(lrmodel, null);

    // Import it back
    LogisticRegressionModelInfo importedModel =
            (LogisticRegressionModelInfo) ModelImporter.importModelInfo(exportedModel);

    // Check that the models are equal with respect to their fields;
    // there may be edge cases, e.g. the order of elements in a list could change.
    assertEquals(lrmodel.intercept(), importedModel.getIntercept(), EPSILON);
    assertEquals(lrmodel.numClasses(), importedModel.getNumClasses(), EPSILON);
    assertEquals(lrmodel.numFeatures(), importedModel.getNumFeatures(), EPSILON);
    assertEquals((double) lrmodel.getThreshold().get(), importedModel.getThreshold(), EPSILON);
    for (int i = 0; i < importedModel.getNumFeatures(); i++) {
        assertEquals(lrmodel.weights().toArray()[i], importedModel.getWeights()[i], EPSILON);
    }
}
Example #2
Source File: MLLibUtil.java From deeplearning4j with Apache License 2.0
/**
 * Returns a labeled point of the writables,
 * where the final item is the label and the rest of the items are features.
 * @param writables the writables
 * @return the labeled point
 */
public static LabeledPoint pointOf(Collection<Writable> writables) {
    double[] ret = new double[writables.size() - 1];
    int count = 0;
    double target = 0;
    for (Writable w : writables) {
        // Note: values are parsed via Float.parseFloat, which narrows double-precision input.
        if (count < writables.size() - 1)
            ret[count++] = Float.parseFloat(w.toString());
        else
            target = Float.parseFloat(w.toString());
    }
    if (target < 0)
        throw new IllegalStateException("Target must be >= 0");
    return new LabeledPoint(target, Vectors.dense(ret));
}
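A hypothetical usage sketch for pointOf, assuming DataVec's DoubleWritable as the Writable implementation (the names row and lp are illustrative):

// Hypothetical usage: two features (0.5, 1.5); the final value 2.0 becomes the label.
List<Writable> row = Arrays.asList(
        new DoubleWritable(0.5), new DoubleWritable(1.5), new DoubleWritable(2.0));
LabeledPoint lp = MLLibUtil.pointOf(row);
// lp.features() = [0.5, 1.5], lp.label() = 2.0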
Example #3
Source File: MLLibUtil.java From deeplearning4j with Apache License 2.0
/**
 * Convert a traditional sc.binaryFiles
 * into something usable for machine learning.
 * @param binaryFiles the binary files to convert
 * @param reader the reader to use
 * @return the labeled points based on the given rdd
 */
public static JavaRDD<LabeledPoint> fromBinary(JavaPairRDD<String, PortableDataStream> binaryFiles,
                final RecordReader reader) {
    JavaRDD<Collection<Writable>> records =
            binaryFiles.map(new Function<Tuple2<String, PortableDataStream>, Collection<Writable>>() {
                @Override
                public Collection<Writable> call(Tuple2<String, PortableDataStream> stringPortableDataStreamTuple2)
                        throws Exception {
                    reader.initialize(new InputStreamInputSplit(stringPortableDataStreamTuple2._2().open(),
                            stringPortableDataStreamTuple2._1()));
                    return reader.next();
                }
            });

    JavaRDD<LabeledPoint> ret = records.map(new Function<Collection<Writable>, LabeledPoint>() {
        @Override
        public LabeledPoint call(Collection<Writable> writables) throws Exception {
            return pointOf(writables);
        }
    });
    return ret;
}
Example #4
Source File: RDFUpdate.java From oryx with Apache License 2.0
/**
 * @param trainPointData data to run down trees
 * @param model random decision forest model to count on
 * @return map of predictor index to the number of training examples that reached a
 *  node whose decision is based on that feature. The index is among predictors, not all
 *  features, since there are fewer predictors than features. That is, the index will
 *  match the one used in the {@link RandomForestModel}.
 */
private static IntLongHashMap predictorExampleCounts(JavaRDD<? extends LabeledPoint> trainPointData,
                                                     RandomForestModel model) {
    return trainPointData.mapPartitions(data -> {
        IntLongHashMap featureIndexCount = new IntLongHashMap();
        data.forEachRemaining(datum -> {
            double[] featureVector = datum.features().toArray();
            for (DecisionTreeModel tree : model.trees()) {
                org.apache.spark.mllib.tree.model.Node node = tree.topNode();
                // This logic cloned from Node.predict:
                while (!node.isLeaf()) {
                    Split split = node.split().get();
                    int featureIndex = split.feature();
                    // Count feature
                    featureIndexCount.addToValue(featureIndex, 1);
                    node = nextNode(featureVector, node, split, featureIndex);
                }
            }
        });
        return Collections.singleton(featureIndexCount).iterator();
    }).reduce(RDFUpdate::merge);
}
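The merge helper referenced as RDFUpdate::merge is not shown in this excerpt. A plausible sketch, assuming it sums the two Eclipse Collections IntLongHashMap counters key by key:

// Hypothetical sketch: fold b's counts into a and return a.
private static IntLongHashMap merge(IntLongHashMap a, IntLongHashMap b) {
    b.forEachKeyValue((key, count) -> a.addToValue(key, count));
    return a;
}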
Example #5
Source File: LogisticRegressionExporterTest.java From spark-transformers with Apache License 2.0
@Test
public void shouldExportAndImportCorrectly() {
    String datapath = "src/test/resources/binary_classification_test.libsvm";
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();

    // Train model in Spark
    LogisticRegressionModel lrmodel = new LogisticRegressionWithSGD().run(data.rdd());

    // Export this model
    byte[] exportedModel = ModelExporter.export(lrmodel);

    // Import it back
    LogisticRegressionModelInfo importedModel =
            (LogisticRegressionModelInfo) ModelImporter.importModelInfo(exportedModel);

    // Check that the models are equal with respect to their fields;
    // there may be edge cases, e.g. the order of elements in a list could change.
    assertEquals(lrmodel.intercept(), importedModel.getIntercept(), 0.01);
    assertEquals(lrmodel.numClasses(), importedModel.getNumClasses(), 0.01);
    assertEquals(lrmodel.numFeatures(), importedModel.getNumFeatures(), 0.01);
    assertEquals((double) lrmodel.getThreshold().get(), importedModel.getThreshold(), 0.01);
    for (int i = 0; i < importedModel.getNumFeatures(); i++) {
        assertEquals(lrmodel.weights().toArray()[i], importedModel.getWeights()[i], 0.01);
    }
}
Example #6
Source File: MLLibUtil.java From deeplearning4j with Apache License 2.0
/**
 * Convert an rdd of data sets into labeled points.
 * @param data the dataset to convert
 * @param preCache boolean pre-cache rdd before operation
 * @return an rdd of labeled point
 */
public static JavaRDD<LabeledPoint> fromDataSet(JavaRDD<DataSet> data, boolean preCache) {
    if (preCache && !data.getStorageLevel().useMemory()) {
        data.cache();
    }
    return data.map(new Function<DataSet, LabeledPoint>() {
        @Override
        public LabeledPoint call(DataSet dataSet) {
            return toLabeledPoint(dataSet);
        }
    });
}
Example #7
Source File: TestSparkMultiLayerParameterAveraging.java From deeplearning4j with Apache License 2.0
@Test
public void testFromSvmLight() throws Exception {
    JavaRDD<LabeledPoint> data = MLUtils
            .loadLibSVMFile(sc.sc(),
                    new ClassPathResource("svmLight/iris_svmLight_0.txt").getTempFileFromArchive()
                            .getAbsolutePath())
            .toJavaRDD().map(new TestFn());

    MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().seed(123)
            .updater(new Adam(1e-6))
            .weightInit(WeightInit.XAVIER)
            .list()
            .layer(new BatchNormalization.Builder().nIn(4).nOut(4).build())
            .layer(new DenseLayer.Builder().nIn(4).nOut(32).activation(Activation.RELU).build())
            .layer(new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                    .nIn(32).nOut(3)
                    .activation(Activation.SOFTMAX).build())
            .build();

    MultiLayerNetwork network = new MultiLayerNetwork(conf);
    network.init();
    System.out.println("Initializing network");

    SparkDl4jMultiLayer master = new SparkDl4jMultiLayer(sc, getBasicConf(),
            new ParameterAveragingTrainingMaster(true, numExecutors(), 1, 5, 1, 0));
    master.fitLabeledPoint(data);
}
Example #8
Source File: TestSparkMultiLayerParameterAveraging.java From deeplearning4j with Apache License 2.0
@Test
public void testFromSvmLightBackprop() throws Exception {
    JavaRDD<LabeledPoint> data = MLUtils
            .loadLibSVMFile(sc.sc(),
                    new ClassPathResource("svmLight/iris_svmLight_0.txt").getTempFileFromArchive()
                            .getAbsolutePath())
            .toJavaRDD().map(new TestFn());

    DataSet d = new IrisDataSetIterator(150, 150).next();

    MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().seed(123)
            .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).list()
            .layer(0, new DenseLayer.Builder().nIn(4).nOut(100).weightInit(WeightInit.XAVIER)
                    .activation(Activation.RELU).build())
            .layer(1, new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(
                    LossFunctions.LossFunction.MCXENT).nIn(100).nOut(3)
                    .activation(Activation.SOFTMAX).weightInit(WeightInit.XAVIER)
                    .build())
            .build();

    MultiLayerNetwork network = new MultiLayerNetwork(conf);
    network.init();
    System.out.println("Initializing network");

    SparkDl4jMultiLayer master = new SparkDl4jMultiLayer(sc, conf,
            new ParameterAveragingTrainingMaster(true, numExecutors(), 1, 5, 1, 0));

    MultiLayerNetwork network2 = master.fitLabeledPoint(data);
}
Example #9
Source File: MLLIbUtilTest.java From deeplearning4j with Apache License 2.0
@Test
public void testMlLibTest() {
    DataSet dataSet = new IrisDataSetIterator(150, 150).next();
    List<DataSet> list = dataSet.asList();
    JavaRDD<DataSet> data = sc.parallelize(list);
    JavaRDD<LabeledPoint> mllLibData = MLLibUtil.fromDataSet(sc, data);
}
Example #10
Source File: MLLibUtil.java From deeplearning4j with Apache License 2.0
/**
 * @param point the labeled point to convert
 * @param numPossibleLabels the number of possible labels
 * @return {@link DataSet}
 */
private static DataSet fromLabeledPoint(LabeledPoint point, long numPossibleLabels) {
    Vector features = point.features();
    double label = point.label();
    // FIXME: int cast
    double[] fArr = features.toArray();
    return new DataSet(Nd4j.create(fArr, new long[]{1, fArr.length}),
            FeatureUtil.toOutcomeVector((int) label, (int) numPossibleLabels));
}
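For reference, FeatureUtil.toOutcomeVector one-hot encodes the class index: with numPossibleLabels = 3 and label = 2.0, the resulting label row is [0.0, 0.0, 1.0], paired with the 1 x n feature row.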
Example #11
Source File: MLLibUtil.java From deeplearning4j with Apache License 2.0
/**
 * @param labeledPoints the labeled points to convert
 * @param numPossibleLabels the number of possible labels
 * @return List of {@link DataSet}
 */
private static List<DataSet> fromLabeledPoint(List<LabeledPoint> labeledPoints, long numPossibleLabels) {
    List<DataSet> ret = new ArrayList<>();
    for (LabeledPoint point : labeledPoints) {
        ret.add(fromLabeledPoint(point, numPossibleLabels));
    }
    return ret;
}
Example #12
Source File: MLLibUtil.java From deeplearning4j with Apache License 2.0
/**
 * Convert a list of datasets into a list of labeled points.
 * @param labeledPoints the datasets to convert
 * @return the labeled point list
 */
private static List<LabeledPoint> toLabeledPoint(List<DataSet> labeledPoints) {
    List<LabeledPoint> ret = new ArrayList<>();
    for (DataSet point : labeledPoints) {
        ret.add(toLabeledPoint(point));
    }
    return ret;
}
Example #13
Source File: MLLibUtil.java From deeplearning4j with Apache License 2.0
/**
 * Converts a continuous JavaRDD of LabeledPoint to a JavaRDD of DataSet.
 * @param data JavaRDD of LabeledPoint
 * @param preCache boolean pre-cache rdd before operation
 * @return a JavaRDD of DataSet with continuous labels
 */
public static JavaRDD<DataSet> fromContinuousLabeledPoint(JavaRDD<LabeledPoint> data, boolean preCache) {
    if (preCache && !data.getStorageLevel().useMemory()) {
        data.cache();
    }
    return data.map(new Function<LabeledPoint, DataSet>() {
        @Override
        public DataSet call(LabeledPoint lp) {
            return convertToDataset(lp);
        }
    });
}
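The convertToDataset helper is not part of this excerpt. A sketch of what it plausibly does, under the assumption that the continuous label is kept as a 1 x 1 regression target rather than one-hot encoded:

// Hypothetical sketch: features as a 1 x n row, the continuous label as 1 x 1.
private static DataSet convertToDataset(LabeledPoint lp) {
    double[] fArr = lp.features().toArray();
    return new DataSet(Nd4j.create(fArr, new long[]{1, fArr.length}),
            Nd4j.create(new double[]{lp.label()}, new long[]{1, 1}));
}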
Example #14
Source File: MLLibUtil.java From deeplearning4j with Apache License 2.0
/**
 * Convert an rdd of data sets into labeled points.
 * @param sc the spark context to use
 * @param data the dataset to convert
 * @return an rdd of labeled point
 * @deprecated Use {@link #fromDataSet(JavaRDD)}
 */
@Deprecated
public static JavaRDD<LabeledPoint> fromDataSet(JavaSparkContext sc, JavaRDD<DataSet> data) {
    return data.map(new Function<DataSet, LabeledPoint>() {
        @Override
        public LabeledPoint call(DataSet pt) {
            return toLabeledPoint(pt);
        }
    });
}
Example #15
Source File: MLLibUtil.java From deeplearning4j with Apache License 2.0
/**
 * Convert rdd labeled points to an rdd dataset with continuous features.
 * @param data the java rdd labeled points ready to convert
 * @return a JavaRDD<Dataset> with a continuous label
 * @deprecated Use {@link #fromContinuousLabeledPoint(JavaRDD)}
 */
@Deprecated
public static JavaRDD<DataSet> fromContinuousLabeledPoint(JavaSparkContext sc, JavaRDD<LabeledPoint> data) {
    return data.map(new Function<LabeledPoint, DataSet>() {
        @Override
        public DataSet call(LabeledPoint lp) {
            return convertToDataset(lp);
        }
    });
}
Example #16
Source File: MLLibUtil.java From deeplearning4j with Apache License 2.0
/**
 * Convert a dataset (feature vector) to a labeled point.
 * @param point the point to convert
 * @return the labeled point derived from this dataset
 */
private static LabeledPoint toLabeledPoint(DataSet point) {
    if (!point.getFeatures().isVector()) {
        throw new IllegalArgumentException("Feature matrix must be a vector");
    }

    Vector features = toVector(point.getFeatures().dup());
    // iamax returns the index of the max-magnitude element, so a one-hot
    // label vector collapses to its class index.
    double label = Nd4j.getBlasWrapper().iamax(point.getLabels());
    return new LabeledPoint(label, features);
}
Example #17
Source File: JavaNaiveBayesExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("JavaNaiveBayesExample");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    // $example on$
    String path = "data/mllib/sample_libsvm_data.txt";
    JavaRDD<LabeledPoint> inputData = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();
    JavaRDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[]{0.6, 0.4});
    JavaRDD<LabeledPoint> training = tmp[0]; // training set
    JavaRDD<LabeledPoint> test = tmp[1]; // test set

    final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);

    JavaPairRDD<Double, Double> predictionAndLabel =
            test.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
                @Override
                public Tuple2<Double, Double> call(LabeledPoint p) {
                    return new Tuple2<>(model.predict(p.features()), p.label());
                }
            });
    double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
        @Override
        public Boolean call(Tuple2<Double, Double> pl) {
            return pl._1().equals(pl._2());
        }
    }).count() / (double) test.count();

    // Save and load model
    model.save(jsc.sc(), "target/tmp/myNaiveBayesModel");
    NaiveBayesModel sameModel = NaiveBayesModel.load(jsc.sc(), "target/tmp/myNaiveBayesModel");
    // $example off$

    jsc.stop();
}
Example #18
Source File: JavaLogisticRegressionWithLBFGSExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaLogisticRegressionWithLBFGSExample");
    SparkContext sc = new SparkContext(conf);
    // $example on$
    String path = "data/mllib/sample_libsvm_data.txt";
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();

    // Split initial RDD into two... [60% training data, 40% testing data].
    JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[] {0.6, 0.4}, 11L);
    JavaRDD<LabeledPoint> training = splits[0].cache();
    JavaRDD<LabeledPoint> test = splits[1];

    // Run training algorithm to build the model.
    final LogisticRegressionModel model = new LogisticRegressionWithLBFGS()
            .setNumClasses(10)
            .run(training.rdd());

    // Compute raw scores on the test set.
    JavaRDD<Tuple2<Object, Object>> predictionAndLabels = test.map(
            new Function<LabeledPoint, Tuple2<Object, Object>>() {
                public Tuple2<Object, Object> call(LabeledPoint p) {
                    Double prediction = model.predict(p.features());
                    return new Tuple2<Object, Object>(prediction, p.label());
                }
            }
    );

    // Get evaluation metrics.
    MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd());
    double accuracy = metrics.accuracy();
    System.out.println("Accuracy = " + accuracy);

    // Save and load model
    model.save(sc, "target/tmp/javaLogisticRegressionWithLBFGSModel");
    LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc,
            "target/tmp/javaLogisticRegressionWithLBFGSModel");
    // $example off$

    sc.stop();
}
Example #19
Source File: MLLibUtil.java From deeplearning4j with Apache License 2.0
/**
 * Converts a JavaRDD of LabeledPoint to a JavaRDD of DataSet.
 * @param data JavaRDD of LabeledPoint
 * @param numPossibleLabels number of possible labels
 * @param preCache boolean pre-cache rdd before operation
 * @return a JavaRDD of DataSet
 */
public static JavaRDD<DataSet> fromLabeledPoint(JavaRDD<LabeledPoint> data, final long numPossibleLabels,
                boolean preCache) {
    if (preCache && !data.getStorageLevel().useMemory()) {
        data.cache();
    }
    return data.map(new Function<LabeledPoint, DataSet>() {
        @Override
        public DataSet call(LabeledPoint lp) {
            return fromLabeledPoint(lp, numPossibleLabels);
        }
    });
}
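A hypothetical call, assuming an existing JavaRDD<LabeledPoint> named points over a three-class problem:

// Hypothetical usage: one-hot labels over 3 classes, caching the input RDD first.
JavaRDD<DataSet> dataSets = MLLibUtil.fromLabeledPoint(points, 3, true);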
Example #20
Source File: MLLibUtil.java From deeplearning4j with Apache License 2.0
/**
 * Convert an rdd of labeled points into data sets,
 * repartitioned according to the specified batch size.
 * @param data the data to convert
 * @param numPossibleLabels the number of possible labels
 * @param batchSize the batch size
 * @return the new rdd
 */
public static JavaRDD<DataSet> fromLabeledPoint(JavaRDD<LabeledPoint> data, final long numPossibleLabels,
                long batchSize) {
    JavaRDD<DataSet> mappedData = data.map(new Function<LabeledPoint, DataSet>() {
        @Override
        public DataSet call(LabeledPoint lp) {
            return fromLabeledPoint(lp, numPossibleLabels);
        }
    });
    // Note: count() triggers a Spark job here to size the repartition.
    return mappedData.repartition((int) (mappedData.count() / batchSize));
}
Example #21
Source File: RDFUpdate.java From oryx with Apache License 2.0
/**
 * @param trainPointData data to run down trees
 * @param model random decision forest model to count on
 * @return maps of node IDs to the count of training examples that reached that node, one
 *  per tree in the model
 * @see #predictorExampleCounts(JavaRDD,RandomForestModel)
 */
private static List<IntLongHashMap> treeNodeExampleCounts(JavaRDD<? extends LabeledPoint> trainPointData,
                                                          RandomForestModel model) {
    return trainPointData.mapPartitions(data -> {
        DecisionTreeModel[] trees = model.trees();
        List<IntLongHashMap> treeNodeIDCounts = IntStream.range(0, trees.length)
                .mapToObj(i -> new IntLongHashMap()).collect(Collectors.toList());
        data.forEachRemaining(datum -> {
            double[] featureVector = datum.features().toArray();
            for (int i = 0; i < trees.length; i++) {
                DecisionTreeModel tree = trees[i];
                IntLongHashMap nodeIDCount = treeNodeIDCounts.get(i);
                org.apache.spark.mllib.tree.model.Node node = tree.topNode();
                // This logic cloned from Node.predict:
                while (!node.isLeaf()) {
                    // Count node ID
                    nodeIDCount.addToValue(node.id(), 1);
                    Split split = node.split().get();
                    int featureIndex = split.feature();
                    node = nextNode(featureVector, node, split, featureIndex);
                }
                nodeIDCount.addToValue(node.id(), 1);
            }
        });
        return Collections.singleton(treeNodeIDCounts).iterator();
    }).reduce((a, b) -> {
        Preconditions.checkArgument(a.size() == b.size());
        for (int i = 0; i < a.size(); i++) {
            merge(a.get(i), b.get(i));
        }
        return a;
    });
}
Example #22
Source File: RDFUpdate.java From oryx with Apache License 2.0
private JavaRDD<LabeledPoint> parseToLabeledPointRDD(JavaRDD<String[]> parsedRDD,
                                                     CategoricalValueEncodings categoricalValueEncodings) {
    return parsedRDD.map(data -> {
        try {
            double[] features = new double[inputSchema.getNumPredictors()];
            double target = Double.NaN;
            for (int featureIndex = 0; featureIndex < data.length; featureIndex++) {
                double encoded;
                if (inputSchema.isNumeric(featureIndex)) {
                    encoded = Double.parseDouble(data[featureIndex]);
                } else if (inputSchema.isCategorical(featureIndex)) {
                    Map<String,Integer> valueEncoding =
                            categoricalValueEncodings.getValueEncodingMap(featureIndex);
                    encoded = valueEncoding.get(data[featureIndex]);
                } else {
                    continue;
                }
                if (inputSchema.isTarget(featureIndex)) {
                    target = encoded;
                } else {
                    features[inputSchema.featureToPredictorIndex(featureIndex)] = encoded;
                }
            }
            Preconditions.checkState(!Double.isNaN(target));
            return new LabeledPoint(target, Vectors.dense(features));
        } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
            log.warn("Bad input: {}", Arrays.toString(data));
            throw e;
        }
    });
}
Example #23
Source File: LogisticRegressionBridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testLogisticRegression() {
    // Prepare data
    String datapath = "src/test/resources/binary_classification_test.libsvm";
    JavaRDD<LabeledPoint> trainingData = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();

    // Train model in Spark
    LogisticRegressionModel lrmodel = new LogisticRegressionWithSGD().run(trainingData.rdd());

    // Export this model
    byte[] exportedModel = ModelExporter.export(lrmodel);

    // Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    // Validate predictions
    List<LabeledPoint> testPoints = trainingData.collect();
    for (LabeledPoint i : testPoints) {
        Vector v = i.features();
        double actual = lrmodel.predict(v);

        Map<String, Object> data = new HashMap<String, Object>();
        data.put("features", v.toArray());
        transformer.transform(data);
        double predicted = (double) data.get("prediction");

        assertEquals(actual, predicted, 0.01);
    }
}
Example #24
Source File: LogisticRegression1BridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testLogisticRegression() {
    // Prepare data
    String datapath = "src/test/resources/binary_classification_test.libsvm";
    Dataset<Row> trainingData = spark.read().format("libsvm").load(datapath);

    // Train model in Spark
    LogisticRegressionModel lrmodel = new LogisticRegression().fit(trainingData);

    // Export this model
    byte[] exportedModel = ModelExporter.export(lrmodel);

    // Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    // Validate predictions
    List<LabeledPoint> testPoints = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD().collect();
    for (LabeledPoint i : testPoints) {
        Vector v = i.features().asML();
        double actual = lrmodel.predict(v);

        Map<String, Object> data = new HashMap<String, Object>();
        data.put("features", v.toArray());
        transformer.transform(data);
        double predicted = (double) data.get("prediction");

        assertEquals(actual, predicted, 0.01);
    }
}
Example #25
Source File: MinMaxScalerBridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testStandardScaler() {
    // Prepare data
    List<LabeledPoint> localTraining = Arrays.asList(
            new LabeledPoint(1.0, Vectors.dense(data[0])),
            new LabeledPoint(2.0, Vectors.dense(data[1])),
            new LabeledPoint(3.0, Vectors.dense(data[2])),
            new LabeledPoint(3.0, Vectors.dense(data[3])));
    DataFrame df = sqlContext.createDataFrame(sc.parallelize(localTraining), LabeledPoint.class);

    // Train model in Spark
    MinMaxScalerModel sparkModel = new MinMaxScaler()
            .setInputCol("features")
            .setOutputCol("scaled")
            .setMin(-5)
            .setMax(5)
            .fit(df);

    // Export model, import it back and get transformer
    byte[] exportedModel = ModelExporter.export(sparkModel, df);
    final Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    // Compare predictions
    Row[] sparkOutput = sparkModel.transform(df).orderBy("label").select("features", "scaled").collect();
    assertCorrectness(sparkOutput, expected, transformer);
}
Example #26
Source File: Log1PScalerBridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testCustomScalerDenseVector() {
    final double[][] precomputedAns = new double[3][3];
    // Precompute expected answers: element-wise log1p.
    for (int j = 0; j < 3; j++)
        for (int k = 0; k < 3; k++)
            precomputedAns[j][k] = Math.log1p(data[j][k]);

    // Prepare data
    List<LabeledPoint> localTraining = Arrays.asList(
            new LabeledPoint(1.0, Vectors.dense(data[0])),
            new LabeledPoint(2.0, Vectors.dense(data[1])),
            new LabeledPoint(3.0, Vectors.dense(data[2])));
    DataFrame df = sqlContext.createDataFrame(sc.parallelize(localTraining), LabeledPoint.class);

    for (int i = 0; i < 2; i++) {
        // Train model in Spark
        Log1PScaler sparkModel = new Log1PScaler()
                .setInputCol("features")
                .setOutputCol("scaledOutput");

        // Export model, import it back and get transformer
        byte[] exportedModel = ModelExporter.export(sparkModel, df);
        final Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

        // Compare predictions
        Row[] sparkOutput = sparkModel.transform(df).orderBy("label").select("features", "scaledOutput").collect();
        assertCorrectness(sparkOutput, precomputedAns, transformer);
    }
}
Example #27
Source File: LogisticRegressionBridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testLogisticRegression() {
    // Prepare data
    String datapath = "src/test/resources/binary_classification_test.libsvm";
    JavaRDD<LabeledPoint> trainingData = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD();

    // Train model in Spark
    LogisticRegressionModel lrmodel = new LogisticRegressionWithSGD().run(trainingData.rdd());

    // Export this model
    byte[] exportedModel = ModelExporter.export(lrmodel, null);

    // Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    // Validate predictions
    List<LabeledPoint> testPoints = trainingData.collect();
    for (LabeledPoint i : testPoints) {
        Vector v = i.features();
        double actual = lrmodel.predict(v);

        Map<String, Object> data = new HashMap<String, Object>();
        data.put("features", v.toArray());
        transformer.transform(data);
        double predicted = (double) data.get("prediction");

        assertEquals(actual, predicted, EPSILON);
    }
}
Example #28
Source File: LogisticRegression1BridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testLogisticRegression() {
    // Prepare data
    String datapath = "src/test/resources/binary_classification_test.libsvm";
    DataFrame trainingData = sqlContext.read().format("libsvm").load(datapath);

    // Train model in Spark
    LogisticRegressionModel lrmodel = new LogisticRegression().fit(trainingData);

    // Export this model
    byte[] exportedModel = ModelExporter.export(lrmodel, trainingData);

    // Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    // Validate predictions
    List<LabeledPoint> testPoints = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD().collect();
    for (LabeledPoint i : testPoints) {
        Vector v = i.features();
        double actual = lrmodel.predict(v);

        Map<String, Object> data = new HashMap<String, Object>();
        data.put("features", v.toArray());
        transformer.transform(data);
        double predicted = (double) data.get("prediction");

        assertEquals(actual, predicted, EPSILON);
    }
}
Example #29
Source File: MLSupporter.java From DDF with Apache License 2.0
/**
 * Override this to return the appropriate DDF representation matching that specified in
 * {@link ParamInfo}. The base implementation simply returns the DDF.
 *
 * @param paramInfo
 * @return
 */
@SuppressWarnings("unchecked")
@Override
protected Object convertDDF(ParamInfo paramInfo) throws DDFException {
    mLog.info(">>>> Running ConvertDDF of io.ddf.spark.ml.MLSupporter");

    if (paramInfo.argMatches(RDD.class)) {
        // Yay, our target data format is an RDD!
        RDD<?> rdd = null;

        if (paramInfo.paramMatches(LabeledPoint.class)) {
            rdd = (RDD<LabeledPoint>) this.getDDF().getRepresentationHandler().get(RDD.class, LabeledPoint.class);
        } else if (paramInfo.paramMatches(Vector.class)) {
            rdd = (RDD<Vector>) this.getDDF().getRepresentationHandler().get(RDD.class, Vector.class);
        } else if (paramInfo.paramMatches(double[].class)) {
            rdd = (RDD<double[]>) this.getDDF().getRepresentationHandler().get(RDD.class, double[].class);
        } else if (paramInfo.paramMatches(io.ddf.types.Vector.class)) {
            rdd = (RDD<io.ddf.types.Vector>) this.getDDF().getRepresentationHandler()
                    .get(RDD.class, io.ddf.types.Vector.class);
        } else if (paramInfo.paramMatches(TupleMatrixVector.class)) {
            rdd = (RDD<TupleMatrixVector>) this.getDDF().getRepresentationHandler()
                    .get(RDD.class, TupleMatrixVector.class);
        } else if (paramInfo.paramMatches(Rating.class)) {
            rdd = (RDD<Rating>) this.getDDF().getRepresentationHandler().get(RDD.class, Rating.class);
        }
        // else if (paramInfo.paramMatches(TablePartition.class)) {
        //     rdd = (RDD<TablePartition>) this.getDDF().getRepresentationHandler().get(RDD.class, TablePartition.class);
        // }
        else if (paramInfo.paramMatches(Object.class)) {
            rdd = (RDD<Object[]>) this.getDDF().getRepresentationHandler().get(RDD.class, Object[].class);
        }

        return rdd;
    } else {
        return super.convertDDF(paramInfo);
    }
}
Example #30
Source File: MLMetricsSupporter.java From DDF with Apache License 2.0
/*
 * Input expected: RDD[double[][]]
 * (non-Javadoc)
 * @see io.ddf.ml.AMLMetricsSupporter#roc(io.ddf.DDF, int)
 */
@Override
public RocMetric roc(DDF predictionDDF, int alpha_length) throws DDFException {
    RDD<LabeledPoint> rddLabeledPoint = (RDD<LabeledPoint>) predictionDDF.getRepresentationHandler()
            .get(RDD.class, LabeledPoint.class);
    ROCComputer rc = new ROCComputer();
    return (rc.ROC(rddLabeledPoint, alpha_length));
}