org.apache.spark.mllib.regression.LabeledPoint Java Examples
The following examples show how to use
org.apache.spark.mllib.regression.LabeledPoint.
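Before the examples: a LabeledPoint simply pairs a double label with an MLlib feature Vector. As a minimal, self-contained sketch using only the standard MLlib API (nothing project-specific), points can be constructed directly from dense or sparse vectors:

import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;

public class LabeledPointSketch {
    public static void main(String[] args) {
        // A positive example with a dense feature vector.
        LabeledPoint pos = new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0));
        // A negative example with a sparse vector of size 3,
        // non-zero entries at indices 0 and 2.
        LabeledPoint neg = new LabeledPoint(0.0,
                Vectors.sparse(3, new int[]{0, 2}, new double[]{1.0, 3.0}));
        System.out.println(pos.label() + " -> " + pos.features());
        System.out.println(neg.label() + " -> " + neg.features());
    }
}

Most of the examples below instead load points from libsvm-format files via MLUtils.loadLibSVMFile.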
Example #1
Source File: LogisticRegressionExporterTest.java From spark-transformers with Apache License 2.0
@Test
public void shouldExportAndImportCorrectly() {
    String datapath = "src/test/resources/binary_classification_test.libsvm";
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD();

    // Train model in Spark
    LogisticRegressionModel lrmodel = new LogisticRegressionWithSGD().run(data.rdd());

    // Export this model
    byte[] exportedModel = ModelExporter.export(lrmodel, null);

    // Import it back
    LogisticRegressionModelInfo importedModel =
            (LogisticRegressionModelInfo) ModelImporter.importModelInfo(exportedModel);

    // Check that the models are equal with respect to their fields;
    // there may be edge cases, e.g. the order of elements in a list could change.
    assertEquals(lrmodel.intercept(), importedModel.getIntercept(), EPSILON);
    assertEquals(lrmodel.numClasses(), importedModel.getNumClasses(), EPSILON);
    assertEquals(lrmodel.numFeatures(), importedModel.getNumFeatures(), EPSILON);
    assertEquals((double) lrmodel.getThreshold().get(), importedModel.getThreshold(), EPSILON);
    for (int i = 0; i < importedModel.getNumFeatures(); i++) {
        assertEquals(lrmodel.weights().toArray()[i], importedModel.getWeights()[i], EPSILON);
    }
}
Example #2
Source File: MLLibUtil.java From deeplearning4j with Apache License 2.0
/**
 * Returns a labeled point of the writables,
 * where the final item is the label and the rest of the items are features.
 * @param writables the writables
 * @return the labeled point
 */
public static LabeledPoint pointOf(Collection<Writable> writables) {
    double[] ret = new double[writables.size() - 1];
    int count = 0;
    double target = 0;
    for (Writable w : writables) {
        // Note: values are parsed via Float.parseFloat, which narrows double-precision input.
        if (count < writables.size() - 1)
            ret[count++] = Float.parseFloat(w.toString());
        else
            target = Float.parseFloat(w.toString());
    }
    if (target < 0)
        throw new IllegalStateException("Target must be >= 0");
    return new LabeledPoint(target, Vectors.dense(ret));
}
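A hypothetical usage sketch for pointOf, assuming DataVec's DoubleWritable as the Writable implementation (the names row and lp are illustrative):

// Hypothetical usage: two features (0.5, 1.5); the final value 2.0 becomes the label.
List<Writable> row = Arrays.asList(
        new DoubleWritable(0.5), new DoubleWritable(1.5), new DoubleWritable(2.0));
LabeledPoint lp = MLLibUtil.pointOf(row);
// lp.features() = [0.5, 1.5], lp.label() = 2.0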
Example #3
Source File: MLLibUtil.java From deeplearning4j with Apache License 2.0
/**
 * Convert a traditional sc.binaryFiles
 * into something usable for machine learning.
 * @param binaryFiles the binary files to convert
 * @param reader the reader to use
 * @return the labeled points based on the given rdd
 */
public static JavaRDD<LabeledPoint> fromBinary(JavaPairRDD<String, PortableDataStream> binaryFiles,
                final RecordReader reader) {
    JavaRDD<Collection<Writable>> records =
            binaryFiles.map(new Function<Tuple2<String, PortableDataStream>, Collection<Writable>>() {
                @Override
                public Collection<Writable> call(Tuple2<String, PortableDataStream> stringPortableDataStreamTuple2)
                        throws Exception {
                    reader.initialize(new InputStreamInputSplit(stringPortableDataStreamTuple2._2().open(),
                            stringPortableDataStreamTuple2._1()));
                    return reader.next();
                }
            });

    JavaRDD<LabeledPoint> ret = records.map(new Function<Collection<Writable>, LabeledPoint>() {
        @Override
        public LabeledPoint call(Collection<Writable> writables) throws Exception {
            return pointOf(writables);
        }
    });
    return ret;
}
Example #4
Source File: RDFUpdate.java From oryx with Apache License 2.0
/**
 * @param trainPointData data to run down trees
 * @param model random decision forest model to count on
 * @return map of predictor index to the number of training examples that reached a
 *  node whose decision is based on that feature. The index is among predictors, not all
 *  features, since there are fewer predictors than features. That is, the index will
 *  match the one used in the {@link RandomForestModel}.
 */
private static IntLongHashMap predictorExampleCounts(JavaRDD<? extends LabeledPoint> trainPointData,
                                                     RandomForestModel model) {
    return trainPointData.mapPartitions(data -> {
        IntLongHashMap featureIndexCount = new IntLongHashMap();
        data.forEachRemaining(datum -> {
            double[] featureVector = datum.features().toArray();
            for (DecisionTreeModel tree : model.trees()) {
                org.apache.spark.mllib.tree.model.Node node = tree.topNode();
                // This logic cloned from Node.predict:
                while (!node.isLeaf()) {
                    Split split = node.split().get();
                    int featureIndex = split.feature();
                    // Count feature
                    featureIndexCount.addToValue(featureIndex, 1);
                    node = nextNode(featureVector, node, split, featureIndex);
                }
            }
        });
        return Collections.singleton(featureIndexCount).iterator();
    }).reduce(RDFUpdate::merge);
}
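The merge helper referenced as RDFUpdate::merge is not shown in this excerpt. A plausible sketch, assuming it sums the two Eclipse Collections IntLongHashMap counters key by key:

// Hypothetical sketch: fold b's counts into a and return a.
private static IntLongHashMap merge(IntLongHashMap a, IntLongHashMap b) {
    b.forEachKeyValue((key, count) -> a.addToValue(key, count));
    return a;
}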
Example #5
Source File: LogisticRegressionExporterTest.java From spark-transformers with Apache License 2.0
@Test
public void shouldExportAndImportCorrectly() {
    String datapath = "src/test/resources/binary_classification_test.libsvm";
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();

    // Train model in Spark
    LogisticRegressionModel lrmodel = new LogisticRegressionWithSGD().run(data.rdd());

    // Export this model
    byte[] exportedModel = ModelExporter.export(lrmodel);

    // Import it back
    LogisticRegressionModelInfo importedModel =
            (LogisticRegressionModelInfo) ModelImporter.importModelInfo(exportedModel);

    // Check that the models are equal with respect to their fields;
    // there may be edge cases, e.g. the order of elements in a list could change.
    assertEquals(lrmodel.intercept(), importedModel.getIntercept(), 0.01);
    assertEquals(lrmodel.numClasses(), importedModel.getNumClasses(), 0.01);
    assertEquals(lrmodel.numFeatures(), importedModel.getNumFeatures(), 0.01);
    assertEquals((double) lrmodel.getThreshold().get(), importedModel.getThreshold(), 0.01);
    for (int i = 0; i < importedModel.getNumFeatures(); i++) {
        assertEquals(lrmodel.weights().toArray()[i], importedModel.getWeights()[i], 0.01);
    }
}
Example #6
Source File: MLLibUtil.java From deeplearning4j with Apache License 2.0
/**
 * Convert an rdd of data sets into labeled points.
 * @param data the dataset to convert
 * @param preCache boolean pre-cache rdd before operation
 * @return an rdd of labeled point
 */
public static JavaRDD<LabeledPoint> fromDataSet(JavaRDD<DataSet> data, boolean preCache) {
    if (preCache && !data.getStorageLevel().useMemory()) {
        data.cache();
    }
    return data.map(new Function<DataSet, LabeledPoint>() {
        @Override
        public LabeledPoint call(DataSet dataSet) {
            return toLabeledPoint(dataSet);
        }
    });
}
Example #7
Source File: TestSparkMultiLayerParameterAveraging.java From deeplearning4j with Apache License 2.0
@Test
public void testFromSvmLight() throws Exception {
    JavaRDD<LabeledPoint> data = MLUtils
            .loadLibSVMFile(sc.sc(),
                    new ClassPathResource("svmLight/iris_svmLight_0.txt").getTempFileFromArchive()
                            .getAbsolutePath())
            .toJavaRDD().map(new TestFn());

    MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().seed(123)
            .updater(new Adam(1e-6))
            .weightInit(WeightInit.XAVIER)
            .list()
            .layer(new BatchNormalization.Builder().nIn(4).nOut(4).build())
            .layer(new DenseLayer.Builder().nIn(4).nOut(32).activation(Activation.RELU).build())
            .layer(new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                    .nIn(32).nOut(3)
                    .activation(Activation.SOFTMAX).build())
            .build();

    MultiLayerNetwork network = new MultiLayerNetwork(conf);
    network.init();
    System.out.println("Initializing network");

    SparkDl4jMultiLayer master = new SparkDl4jMultiLayer(sc, getBasicConf(),
            new ParameterAveragingTrainingMaster(true, numExecutors(), 1, 5, 1, 0));
    master.fitLabeledPoint(data);
}
Example #8
Source File: TestSparkMultiLayerParameterAveraging.java From deeplearning4j with Apache License 2.0
@Test
public void testFromSvmLightBackprop() throws Exception {
    JavaRDD<LabeledPoint> data = MLUtils
            .loadLibSVMFile(sc.sc(),
                    new ClassPathResource("svmLight/iris_svmLight_0.txt").getTempFileFromArchive()
                            .getAbsolutePath())
            .toJavaRDD().map(new TestFn());

    DataSet d = new IrisDataSetIterator(150, 150).next();

    MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().seed(123)
            .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).list()
            .layer(0, new DenseLayer.Builder().nIn(4).nOut(100).weightInit(WeightInit.XAVIER)
                    .activation(Activation.RELU).build())
            .layer(1, new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(
                    LossFunctions.LossFunction.MCXENT).nIn(100).nOut(3)
                    .activation(Activation.SOFTMAX).weightInit(WeightInit.XAVIER)
                    .build())
            .build();

    MultiLayerNetwork network = new MultiLayerNetwork(conf);
    network.init();
    System.out.println("Initializing network");

    SparkDl4jMultiLayer master = new SparkDl4jMultiLayer(sc, conf,
            new ParameterAveragingTrainingMaster(true, numExecutors(), 1, 5, 1, 0));

    MultiLayerNetwork network2 = master.fitLabeledPoint(data);
}
Example #9
Source File: MLLIbUtilTest.java From deeplearning4j with Apache License 2.0
@Test
public void testMlLibTest() {
    DataSet dataSet = new IrisDataSetIterator(150, 150).next();
    List<DataSet> list = dataSet.asList();
    JavaRDD<DataSet> data = sc.parallelize(list);
    JavaRDD<LabeledPoint> mllLibData = MLLibUtil.fromDataSet(sc, data);
}
Example #10
Source File: MLLibUtil.java From deeplearning4j with Apache License 2.0
/**
 * @param point the labeled point to convert
 * @param numPossibleLabels the number of possible labels
 * @return {@link DataSet}
 */
private static DataSet fromLabeledPoint(LabeledPoint point, long numPossibleLabels) {
    Vector features = point.features();
    double label = point.label();
    // FIXME: int cast
    double[] fArr = features.toArray();
    return new DataSet(Nd4j.create(fArr, new long[]{1, fArr.length}),
            FeatureUtil.toOutcomeVector((int) label, (int) numPossibleLabels));
}
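For reference, FeatureUtil.toOutcomeVector one-hot encodes the class index: with numPossibleLabels = 3 and label = 2.0, the resulting label row is [0.0, 0.0, 1.0], paired with the 1 x n feature row.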
Example #11
Source File: MLLibUtil.java From deeplearning4j with Apache License 2.0
/**
 * @param labeledPoints the labeled points to convert
 * @param numPossibleLabels the number of possible labels
 * @return List of {@link DataSet}
 */
private static List<DataSet> fromLabeledPoint(List<LabeledPoint> labeledPoints, long numPossibleLabels) {
    List<DataSet> ret = new ArrayList<>();
    for (LabeledPoint point : labeledPoints) {
        ret.add(fromLabeledPoint(point, numPossibleLabels));
    }
    return ret;
}
Example #12
Source File: MLLibUtil.java From deeplearning4j with Apache License 2.0
/**
 * Convert a list of datasets into a list of labeled points.
 * @param labeledPoints the datasets to convert
 * @return the labeled point list
 */
private static List<LabeledPoint> toLabeledPoint(List<DataSet> labeledPoints) {
    List<LabeledPoint> ret = new ArrayList<>();
    for (DataSet point : labeledPoints) {
        ret.add(toLabeledPoint(point));
    }
    return ret;
}
Example #13
Source File: MLLibUtil.java From deeplearning4j with Apache License 2.0
/**
 * Converts a continuous JavaRDD of LabeledPoint to a JavaRDD of DataSet.
 * @param data JavaRDD of LabeledPoint
 * @param preCache boolean pre-cache rdd before operation
 * @return a JavaRDD of DataSet with continuous labels
 */
public static JavaRDD<DataSet> fromContinuousLabeledPoint(JavaRDD<LabeledPoint> data, boolean preCache) {
    if (preCache && !data.getStorageLevel().useMemory()) {
        data.cache();
    }
    return data.map(new Function<LabeledPoint, DataSet>() {
        @Override
        public DataSet call(LabeledPoint lp) {
            return convertToDataset(lp);
        }
    });
}
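The convertToDataset helper is not part of this excerpt. A sketch of what it plausibly does, under the assumption that the continuous label is kept as a 1 x 1 regression target rather than one-hot encoded:

// Hypothetical sketch: features as a 1 x n row, the continuous label as 1 x 1.
private static DataSet convertToDataset(LabeledPoint lp) {
    double[] fArr = lp.features().toArray();
    return new DataSet(Nd4j.create(fArr, new long[]{1, fArr.length}),
            Nd4j.create(new double[]{lp.label()}, new long[]{1, 1}));
}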
Example #14
Source File: MLLibUtil.java From deeplearning4j with Apache License 2.0
/**
 * Convert an rdd of data sets into labeled points.
 * @param sc the spark context to use
 * @param data the dataset to convert
 * @return an rdd of labeled point
 * @deprecated Use {@link #fromDataSet(JavaRDD)}
 */
@Deprecated
public static JavaRDD<LabeledPoint> fromDataSet(JavaSparkContext sc, JavaRDD<DataSet> data) {
    return data.map(new Function<DataSet, LabeledPoint>() {
        @Override
        public LabeledPoint call(DataSet pt) {
            return toLabeledPoint(pt);
        }
    });
}
Example #15
Source File: MLLibUtil.java From deeplearning4j with Apache License 2.0
/**
 * Convert rdd labeled points to an rdd dataset with continuous features.
 * @param data the java rdd labeled points ready to convert
 * @return a JavaRDD<Dataset> with a continuous label
 * @deprecated Use {@link #fromContinuousLabeledPoint(JavaRDD)}
 */
@Deprecated
public static JavaRDD<DataSet> fromContinuousLabeledPoint(JavaSparkContext sc, JavaRDD<LabeledPoint> data) {
    return data.map(new Function<LabeledPoint, DataSet>() {
        @Override
        public DataSet call(LabeledPoint lp) {
            return convertToDataset(lp);
        }
    });
}
Example #16
Source File: MLLibUtil.java From deeplearning4j with Apache License 2.0
/**
 * Convert a dataset (feature vector) to a labeled point.
 * @param point the point to convert
 * @return the labeled point derived from this dataset
 */
private static LabeledPoint toLabeledPoint(DataSet point) {
    if (!point.getFeatures().isVector()) {
        throw new IllegalArgumentException("Feature matrix must be a vector");
    }

    Vector features = toVector(point.getFeatures().dup());
    // iamax returns the index of the max-magnitude element, so a one-hot
    // label vector collapses to its class index.
    double label = Nd4j.getBlasWrapper().iamax(point.getLabels());
    return new LabeledPoint(label, features);
}
Example #17
Source File: JavaNaiveBayesExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("JavaNaiveBayesExample");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    // $example on$
    String path = "data/mllib/sample_libsvm_data.txt";
    JavaRDD<LabeledPoint> inputData = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();
    JavaRDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[]{0.6, 0.4});
    JavaRDD<LabeledPoint> training = tmp[0]; // training set
    JavaRDD<LabeledPoint> test = tmp[1]; // test set

    final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);

    JavaPairRDD<Double, Double> predictionAndLabel =
            test.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
                @Override
                public Tuple2<Double, Double> call(LabeledPoint p) {
                    return new Tuple2<>(model.predict(p.features()), p.label());
                }
            });
    double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
        @Override
        public Boolean call(Tuple2<Double, Double> pl) {
            return pl._1().equals(pl._2());
        }
    }).count() / (double) test.count();

    // Save and load model
    model.save(jsc.sc(), "target/tmp/myNaiveBayesModel");
    NaiveBayesModel sameModel = NaiveBayesModel.load(jsc.sc(), "target/tmp/myNaiveBayesModel");
    // $example off$

    jsc.stop();
}
Example #18
Source File: JavaLogisticRegressionWithLBFGSExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaLogisticRegressionWithLBFGSExample");
    SparkContext sc = new SparkContext(conf);
    // $example on$
    String path = "data/mllib/sample_libsvm_data.txt";
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();

    // Split initial RDD into two... [60% training data, 40% testing data].
    JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[] {0.6, 0.4}, 11L);
    JavaRDD<LabeledPoint> training = splits[0].cache();
    JavaRDD<LabeledPoint> test = splits[1];

    // Run training algorithm to build the model.
    final LogisticRegressionModel model = new LogisticRegressionWithLBFGS()
            .setNumClasses(10)
            .run(training.rdd());

    // Compute raw scores on the test set.
    JavaRDD<Tuple2<Object, Object>> predictionAndLabels = test.map(
            new Function<LabeledPoint, Tuple2<Object, Object>>() {
                public Tuple2<Object, Object> call(LabeledPoint p) {
                    Double prediction = model.predict(p.features());
                    return new Tuple2<Object, Object>(prediction, p.label());
                }
            }
    );

    // Get evaluation metrics.
    MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd());
    double accuracy = metrics.accuracy();
    System.out.println("Accuracy = " + accuracy);

    // Save and load model
    model.save(sc, "target/tmp/javaLogisticRegressionWithLBFGSModel");
    LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc,
            "target/tmp/javaLogisticRegressionWithLBFGSModel");
    // $example off$

    sc.stop();
}
Example #19
Source File: MLLibUtil.java From deeplearning4j with Apache License 2.0
/**
 * Converts a JavaRDD of LabeledPoint to a JavaRDD of DataSet.
 * @param data JavaRDD of LabeledPoint
 * @param numPossibleLabels number of possible labels
 * @param preCache boolean pre-cache rdd before operation
 * @return a JavaRDD of DataSet
 */
public static JavaRDD<DataSet> fromLabeledPoint(JavaRDD<LabeledPoint> data, final long numPossibleLabels,
                boolean preCache) {
    if (preCache && !data.getStorageLevel().useMemory()) {
        data.cache();
    }
    return data.map(new Function<LabeledPoint, DataSet>() {
        @Override
        public DataSet call(LabeledPoint lp) {
            return fromLabeledPoint(lp, numPossibleLabels);
        }
    });
}
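A hypothetical call, assuming an existing JavaRDD<LabeledPoint> named points over a three-class problem:

// Hypothetical usage: one-hot labels over 3 classes, caching the input RDD first.
JavaRDD<DataSet> dataSets = MLLibUtil.fromLabeledPoint(points, 3, true);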
Example #20
Source File: MLLibUtil.java From deeplearning4j with Apache License 2.0
/**
 * Convert an rdd of labeled points into data sets,
 * repartitioned according to the specified batch size.
 * @param data the data to convert
 * @param numPossibleLabels the number of possible labels
 * @param batchSize the batch size
 * @return the new rdd
 */
public static JavaRDD<DataSet> fromLabeledPoint(JavaRDD<LabeledPoint> data, final long numPossibleLabels,
                long batchSize) {
    JavaRDD<DataSet> mappedData = data.map(new Function<LabeledPoint, DataSet>() {
        @Override
        public DataSet call(LabeledPoint lp) {
            return fromLabeledPoint(lp, numPossibleLabels);
        }
    });
    // Note: count() triggers a Spark job here to size the repartition.
    return mappedData.repartition((int) (mappedData.count() / batchSize));
}
Example #21
Source File: RDFUpdate.java From oryx with Apache License 2.0
/**
 * @param trainPointData data to run down trees
 * @param model random decision forest model to count on
 * @return maps of node IDs to the count of training examples that reached that node, one
 *  per tree in the model
 * @see #predictorExampleCounts(JavaRDD,RandomForestModel)
 */
private static List<IntLongHashMap> treeNodeExampleCounts(JavaRDD<? extends LabeledPoint> trainPointData,
                                                          RandomForestModel model) {
    return trainPointData.mapPartitions(data -> {
        DecisionTreeModel[] trees = model.trees();
        List<IntLongHashMap> treeNodeIDCounts = IntStream.range(0, trees.length)
                .mapToObj(i -> new IntLongHashMap()).collect(Collectors.toList());
        data.forEachRemaining(datum -> {
            double[] featureVector = datum.features().toArray();
            for (int i = 0; i < trees.length; i++) {
                DecisionTreeModel tree = trees[i];
                IntLongHashMap nodeIDCount = treeNodeIDCounts.get(i);
                org.apache.spark.mllib.tree.model.Node node = tree.topNode();
                // This logic cloned from Node.predict:
                while (!node.isLeaf()) {
                    // Count node ID
                    nodeIDCount.addToValue(node.id(), 1);
                    Split split = node.split().get();
                    int featureIndex = split.feature();
                    node = nextNode(featureVector, node, split, featureIndex);
                }
                nodeIDCount.addToValue(node.id(), 1);
            }
        });
        return Collections.singleton(treeNodeIDCounts).iterator();
    }).reduce((a, b) -> {
        Preconditions.checkArgument(a.size() == b.size());
        for (int i = 0; i < a.size(); i++) {
            merge(a.get(i), b.get(i));
        }
        return a;
    });
}
Example #22
Source File: RDFUpdate.java From oryx with Apache License 2.0
private JavaRDD<LabeledPoint> parseToLabeledPointRDD(JavaRDD<String[]> parsedRDD,
                                                     CategoricalValueEncodings categoricalValueEncodings) {
    return parsedRDD.map(data -> {
        try {
            double[] features = new double[inputSchema.getNumPredictors()];
            double target = Double.NaN;
            for (int featureIndex = 0; featureIndex < data.length; featureIndex++) {
                double encoded;
                if (inputSchema.isNumeric(featureIndex)) {
                    encoded = Double.parseDouble(data[featureIndex]);
                } else if (inputSchema.isCategorical(featureIndex)) {
                    Map<String,Integer> valueEncoding =
                            categoricalValueEncodings.getValueEncodingMap(featureIndex);
                    encoded = valueEncoding.get(data[featureIndex]);
                } else {
                    continue;
                }
                if (inputSchema.isTarget(featureIndex)) {
                    target = encoded;
                } else {
                    features[inputSchema.featureToPredictorIndex(featureIndex)] = encoded;
                }
            }
            Preconditions.checkState(!Double.isNaN(target));
            return new LabeledPoint(target, Vectors.dense(features));
        } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
            log.warn("Bad input: {}", Arrays.toString(data));
            throw e;
        }
    });
}
Example #23
Source File: LogisticRegressionBridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testLogisticRegression() {
    // Prepare data
    String datapath = "src/test/resources/binary_classification_test.libsvm";
    JavaRDD<LabeledPoint> trainingData = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();

    // Train model in Spark
    LogisticRegressionModel lrmodel = new LogisticRegressionWithSGD().run(trainingData.rdd());

    // Export this model
    byte[] exportedModel = ModelExporter.export(lrmodel);

    // Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    // Validate predictions
    List<LabeledPoint> testPoints = trainingData.collect();
    for (LabeledPoint i : testPoints) {
        Vector v = i.features();
        double actual = lrmodel.predict(v);

        Map<String, Object> data = new HashMap<String, Object>();
        data.put("features", v.toArray());
        transformer.transform(data);
        double predicted = (double) data.get("prediction");

        assertEquals(actual, predicted, 0.01);
    }
}
Example #24
Source File: LogisticRegression1BridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testLogisticRegression() {
    // Prepare data
    String datapath = "src/test/resources/binary_classification_test.libsvm";
    Dataset<Row> trainingData = spark.read().format("libsvm").load(datapath);

    // Train model in Spark
    LogisticRegressionModel lrmodel = new LogisticRegression().fit(trainingData);

    // Export this model
    byte[] exportedModel = ModelExporter.export(lrmodel);

    // Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    // Validate predictions
    List<LabeledPoint> testPoints = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD().collect();
    for (LabeledPoint i : testPoints) {
        Vector v = i.features().asML();
        double actual = lrmodel.predict(v);

        Map<String, Object> data = new HashMap<String, Object>();
        data.put("features", v.toArray());
        transformer.transform(data);
        double predicted = (double) data.get("prediction");

        assertEquals(actual, predicted, 0.01);
    }
}
Example #25
Source File: MinMaxScalerBridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testStandardScaler() {
    // Prepare data
    List<LabeledPoint> localTraining = Arrays.asList(
            new LabeledPoint(1.0, Vectors.dense(data[0])),
            new LabeledPoint(2.0, Vectors.dense(data[1])),
            new LabeledPoint(3.0, Vectors.dense(data[2])),
            new LabeledPoint(3.0, Vectors.dense(data[3])));
    DataFrame df = sqlContext.createDataFrame(sc.parallelize(localTraining), LabeledPoint.class);

    // Train model in Spark
    MinMaxScalerModel sparkModel = new MinMaxScaler()
            .setInputCol("features")
            .setOutputCol("scaled")
            .setMin(-5)
            .setMax(5)
            .fit(df);

    // Export model, import it back and get transformer
    byte[] exportedModel = ModelExporter.export(sparkModel, df);
    final Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    // Compare predictions
    Row[] sparkOutput = sparkModel.transform(df).orderBy("label").select("features", "scaled").collect();
    assertCorrectness(sparkOutput, expected, transformer);
}
Example #26
Source File: Log1PScalerBridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testCustomScalerDenseVector() {
    final double[][] precomputedAns = new double[3][3];
    // Precompute expected answers: element-wise log1p.
    for (int j = 0; j < 3; j++)
        for (int k = 0; k < 3; k++)
            precomputedAns[j][k] = Math.log1p(data[j][k]);

    // Prepare data
    List<LabeledPoint> localTraining = Arrays.asList(
            new LabeledPoint(1.0, Vectors.dense(data[0])),
            new LabeledPoint(2.0, Vectors.dense(data[1])),
            new LabeledPoint(3.0, Vectors.dense(data[2])));
    DataFrame df = sqlContext.createDataFrame(sc.parallelize(localTraining), LabeledPoint.class);

    for (int i = 0; i < 2; i++) {
        // Train model in Spark
        Log1PScaler sparkModel = new Log1PScaler()
                .setInputCol("features")
                .setOutputCol("scaledOutput");

        // Export model, import it back and get transformer
        byte[] exportedModel = ModelExporter.export(sparkModel, df);
        final Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

        // Compare predictions
        Row[] sparkOutput = sparkModel.transform(df).orderBy("label").select("features", "scaledOutput").collect();
        assertCorrectness(sparkOutput, precomputedAns, transformer);
    }
}
Example #27
Source File: LogisticRegressionBridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testLogisticRegression() {
    // Prepare data
    String datapath = "src/test/resources/binary_classification_test.libsvm";
    JavaRDD<LabeledPoint> trainingData = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD();

    // Train model in Spark
    LogisticRegressionModel lrmodel = new LogisticRegressionWithSGD().run(trainingData.rdd());

    // Export this model
    byte[] exportedModel = ModelExporter.export(lrmodel, null);

    // Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    // Validate predictions
    List<LabeledPoint> testPoints = trainingData.collect();
    for (LabeledPoint i : testPoints) {
        Vector v = i.features();
        double actual = lrmodel.predict(v);

        Map<String, Object> data = new HashMap<String, Object>();
        data.put("features", v.toArray());
        transformer.transform(data);
        double predicted = (double) data.get("prediction");

        assertEquals(actual, predicted, EPSILON);
    }
}
Example #28
Source File: LogisticRegression1BridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testLogisticRegression() {
    // Prepare data
    String datapath = "src/test/resources/binary_classification_test.libsvm";
    DataFrame trainingData = sqlContext.read().format("libsvm").load(datapath);

    // Train model in Spark
    LogisticRegressionModel lrmodel = new LogisticRegression().fit(trainingData);

    // Export this model
    byte[] exportedModel = ModelExporter.export(lrmodel, trainingData);

    // Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    // Validate predictions
    List<LabeledPoint> testPoints = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD().collect();
    for (LabeledPoint i : testPoints) {
        Vector v = i.features();
        double actual = lrmodel.predict(v);

        Map<String, Object> data = new HashMap<String, Object>();
        data.put("features", v.toArray());
        transformer.transform(data);
        double predicted = (double) data.get("prediction");

        assertEquals(actual, predicted, EPSILON);
    }
}
Example #29
Source File: MLSupporter.java From DDF with Apache License 2.0
/**
 * Override this to return the appropriate DDF representation matching that specified in
 * {@link ParamInfo}. The base implementation simply returns the DDF.
 *
 * @param paramInfo
 * @return
 */
@SuppressWarnings("unchecked")
@Override
protected Object convertDDF(ParamInfo paramInfo) throws DDFException {
    mLog.info(">>>> Running ConvertDDF of io.ddf.spark.ml.MLSupporter");

    if (paramInfo.argMatches(RDD.class)) {
        // Yay, our target data format is an RDD!
        RDD<?> rdd = null;

        if (paramInfo.paramMatches(LabeledPoint.class)) {
            rdd = (RDD<LabeledPoint>) this.getDDF().getRepresentationHandler().get(RDD.class, LabeledPoint.class);
        } else if (paramInfo.paramMatches(Vector.class)) {
            rdd = (RDD<Vector>) this.getDDF().getRepresentationHandler().get(RDD.class, Vector.class);
        } else if (paramInfo.paramMatches(double[].class)) {
            rdd = (RDD<double[]>) this.getDDF().getRepresentationHandler().get(RDD.class, double[].class);
        } else if (paramInfo.paramMatches(io.ddf.types.Vector.class)) {
            rdd = (RDD<io.ddf.types.Vector>) this.getDDF().getRepresentationHandler()
                    .get(RDD.class, io.ddf.types.Vector.class);
        } else if (paramInfo.paramMatches(TupleMatrixVector.class)) {
            rdd = (RDD<TupleMatrixVector>) this.getDDF().getRepresentationHandler()
                    .get(RDD.class, TupleMatrixVector.class);
        } else if (paramInfo.paramMatches(Rating.class)) {
            rdd = (RDD<Rating>) this.getDDF().getRepresentationHandler().get(RDD.class, Rating.class);
        }
        // else if (paramInfo.paramMatches(TablePartition.class)) {
        //     rdd = (RDD<TablePartition>) this.getDDF().getRepresentationHandler().get(RDD.class, TablePartition.class);
        // }
        else if (paramInfo.paramMatches(Object.class)) {
            rdd = (RDD<Object[]>) this.getDDF().getRepresentationHandler().get(RDD.class, Object[].class);
        }

        return rdd;
    } else {
        return super.convertDDF(paramInfo);
    }
}
Example #30
Source File: MLMetricsSupporter.java From DDF with Apache License 2.0
/*
 * Input expected: RDD[double[][]]
 * (non-Javadoc)
 * @see io.ddf.ml.AMLMetricsSupporter#roc(io.ddf.DDF, int)
 */
@Override
public RocMetric roc(DDF predictionDDF, int alpha_length) throws DDFException {
    RDD<LabeledPoint> rddLabeledPoint = (RDD<LabeledPoint>) predictionDDF.getRepresentationHandler()
            .get(RDD.class, LabeledPoint.class);
    ROCComputer rc = new ROCComputer();
    return (rc.ROC(rddLabeledPoint, alpha_length));
}