org.apache.spark.mllib.stat.MultivariateStatisticalSummary Java Examples

The following examples show how to use org.apache.spark.mllib.stat.MultivariateStatisticalSummary. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: JavaSummaryStatisticsExample.java    From SparkDemo with MIT License 6 votes vote down vote up
/**
 * Builds a small RDD of three dense vectors, computes column-wise summary
 * statistics with {@code Statistics.colStats}, and prints the mean, variance
 * and non-zero count of each column.
 *
 * @param args command-line arguments (not read)
 */
public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaSummaryStatisticsExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // Stop the context in a finally block so it is released even when the
    // job below throws; previously stop() was only reached on success.
    try {
        // $example on$
        JavaRDD<Vector> mat = jsc.parallelize(
          Arrays.asList(
            Vectors.dense(1.0, 10.0, 100.0),
            Vectors.dense(2.0, 20.0, 200.0),
            Vectors.dense(3.0, 30.0, 300.0)
          )
        ); // an RDD of Vectors

        // Compute column summary statistics.
        MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd());
        System.out.println(summary.mean());  // a dense vector containing the mean value for each column
        System.out.println(summary.variance());  // column-wise variance
        System.out.println(summary.numNonzeros());  // number of nonzeros in each column
        // $example off$
    } finally {
        jsc.stop();
    }
}
 
Example #2
Source File: SparkStatistics.java    From spliceengine with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Executes {@code statement} on the default connection, computes a column
 * statistics summary over the query result, and publishes the resulting
 * statistics rows through {@code resultSets[0]}.
 *
 * @param statement  SQL text to prepare and execute
 * @param resultSets output holder; element 0 is assigned the statistics result set
 * @throws SQLException if a JDBC call fails, or wrapping the root cause of a
 *                      {@code StandardException} raised while building the statistics
 */
public static void getStatementStatistics(String statement, ResultSet[] resultSets) throws SQLException {
    try {
        // Execute the caller's query on the default connection.
        Connection defaultConnection = DriverManager.getConnection("jdbc:default:connection");
        PreparedStatement query = defaultConnection.prepareStatement(statement);
        ResultSet queryResults = query.executeQuery();

        // Turn the JDBC result into an RDD of rows.
        JavaRDD<ExecRow> rowRDD = ResultSetToRDD(queryResults);

        // Pick the fields to analyse and summarise them.
        int[] fieldsToConvert = getFieldsToConvert(query);
        MultivariateStatisticalSummary summary = getColumnStatisticsSummary(rowRDD, fieldsToConvert);

        // Cast before building the rows, matching the original evaluation order.
        EmbedConnection embedded = (EmbedConnection) defaultConnection;
        Iterable<ExecRow> statisticsRows = getColumnStatistics(query, summary, fieldsToConvert);
        IteratorNoPutResultSet wrapped = wrapResults(embedded, statisticsRows);
        resultSets[0] = new EmbedResultSet40(embedded, wrapped, false, null, true);
    } catch (StandardException e) {
        throw new SQLException(Throwables.getRootCause(e));
    }
}
 
Example #3
Source File: SparkStatistics.java    From spliceengine with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Builds one nine-column row per entry of {@code fieldsToConvert}.
 *
 * <p>Row layout: 1 = column name, 2 = min, 3 = max, 4 = number of non-zeros,
 * 5 = variance, 6 = mean, 7 = L1 norm, 8 = L2 norm, 9 = total row count.
 * Each statistic is read from the summary arrays at index
 * {@code columnPosition - 1}, where {@code columnPosition} is the value
 * taken from {@code fieldsToConvert}.
 *
 * @param ps              prepared statement whose metadata supplies the column names
 * @param summary         column statistics to report
 * @param fieldsToConvert column positions to emit rows for
 * @return the statistics rows, in the order of {@code fieldsToConvert}
 * @throws StandardException created from the localized message of any exception raised here
 */
private static Iterable<ExecRow> getColumnStatistics(PreparedStatement ps,
                                                     MultivariateStatisticalSummary summary,
                                                     int[] fieldsToConvert) throws StandardException {
    try {
        List<ExecRow> statisticsRows = Lists.newArrayList();
        ResultSetMetaData columnInfo = ps.getMetaData();

        // Pull every per-column statistic out as a plain array up front.
        double[] minima = summary.min().toArray();
        double[] maxima = summary.max().toArray();
        double[] means = summary.mean().toArray();
        double[] nonZeroCounts = summary.numNonzeros().toArray();
        double[] variances = summary.variance().toArray();
        double[] l1Norms = summary.normL1().toArray();
        double[] l2Norms = summary.normL2().toArray();
        long rowCount = summary.count();

        for (int field = 0; field < fieldsToConvert.length; field++) {
            int columnPosition = fieldsToConvert[field];
            String columnName = columnInfo.getColumnName(columnPosition);
            // The statistics arrays are addressed with the position minus one.
            int offset = columnPosition - 1;

            ExecRow statisticsRow = new ValueRow(9);
            statisticsRow.setColumn(1, new SQLVarchar(columnName));
            statisticsRow.setColumn(2, new SQLDouble(minima[offset]));
            statisticsRow.setColumn(3, new SQLDouble(maxima[offset]));
            statisticsRow.setColumn(4, new SQLDouble(nonZeroCounts[offset]));
            statisticsRow.setColumn(5, new SQLDouble(variances[offset]));
            statisticsRow.setColumn(6, new SQLDouble(means[offset]));
            statisticsRow.setColumn(7, new SQLDouble(l1Norms[offset]));
            statisticsRow.setColumn(8, new SQLDouble(l2Norms[offset]));
            statisticsRow.setColumn(9, new SQLLongint(rowCount));
            statisticsRows.add(statisticsRow);
        }
        return statisticsRows;
    } catch (Exception e) {
        throw StandardException.newException(e.getLocalizedMessage());
    }
}
 
Example #4
Source File: SparkStatistics.java    From spliceengine with GNU Affero General Public License v3.0 4 votes vote down vote up
/**
 * Converts the row RDD to an RDD of vectors via
 * {@code SparkMLibUtils.locatedRowRDDToVectorRDD} and returns the result of
 * {@code Statistics.colStats} over it.
 *
 * @param resultSetRDD    rows to summarise
 * @param fieldsToConvert fields passed through to the row-to-vector conversion
 * @return the column statistics summary for the converted vectors
 * @throws StandardException if the row-to-vector conversion fails
 */
private static MultivariateStatisticalSummary getColumnStatisticsSummary(JavaRDD<ExecRow> resultSetRDD,
                                                                 int[] fieldsToConvert) throws StandardException {
    return Statistics.colStats(
            SparkMLibUtils.locatedRowRDDToVectorRDD(resultSetRDD, fieldsToConvert).rdd());
}