Java Code Examples for org.apache.spark.api.java.JavaRDD#rdd()
The following examples show how to use org.apache.spark.api.java.JavaRDD#rdd().
You can go to the original project or source file by following the links above each example.
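JavaRDD is the Java-friendly wrapper around Spark's Scala RDD, and rdd() returns the wrapped org.apache.spark.rdd.RDD so it can be handed to Scala-oriented APIs (for example the RowMatrix constructor or MulticlassMetrics used in the examples below). Before the project examples, here is a minimal, self-contained sketch of the call in isolation; the application name, the local master, and the sample data are placeholders rather than values from any of the listed projects.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.rdd.RDD;

public class JavaRddToRddSketch {
  public static void main(String[] args) {
    // Placeholder app name and local master; configure as appropriate for your cluster.
    SparkConf conf = new SparkConf().setAppName("JavaRDD#rdd() sketch").setMaster("local[*]");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // A small Java-side RDD with sample data.
    JavaRDD<Integer> javaRdd = jsc.parallelize(Arrays.asList(1, 2, 3));

    // rdd() unwraps the JavaRDD and returns the underlying Scala RDD,
    // which Scala-facing APIs expect.
    RDD<Integer> scalaRdd = javaRdd.rdd();
    System.out.println("Underlying RDD has " + scalaRdd.count() + " elements");

    jsc.stop();
  }
}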
Example 1
Source File: JavaPCAExample.java From SparkDemo with MIT License | 6 votes |
public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("PCA Example"); SparkContext sc = new SparkContext(conf); // $example on$ double[][] array = {{1.12, 2.05, 3.12}, {5.56, 6.28, 8.94}, {10.2, 8.0, 20.5}}; LinkedList<Vector> rowsList = new LinkedList<>(); for (int i = 0; i < array.length; i++) { Vector currentRow = Vectors.dense(array[i]); rowsList.add(currentRow); } JavaRDD<Vector> rows = JavaSparkContext.fromSparkContext(sc).parallelize(rowsList); // Create a RowMatrix from JavaRDD<Vector>. RowMatrix mat = new RowMatrix(rows.rdd()); // Compute the top 3 principal components. Matrix pc = mat.computePrincipalComponents(3); RowMatrix projected = mat.multiply(pc); // $example off$ Vector[] collectPartitions = (Vector[])projected.rows().collect(); System.out.println("Projected vector of principal component:"); for (Vector vector : collectPartitions) { System.out.println("\t" + vector); } }
Example 2
Source File: SubStringCounterRelation.java From net.jgp.labs.spark with Apache License 2.0 | 6 votes |
@Override
public RDD<Row> buildScan() {
  log.debug("-> buildScan()");

  // I have isolated the work to a method to keep the plumbing code as simple
  // as possible.
  List<List<Integer>> table = collectData();

  @SuppressWarnings("resource") // cannot be closed here, done elsewhere
  JavaSparkContext sparkContext = new JavaSparkContext(sqlContext.sparkContext());
  JavaRDD<Row> rowRDD = sparkContext.parallelize(table)
      .map(row -> RowFactory.create(row.toArray()));

  return rowRDD.rdd();
}
Example 3
Source File: SparkConverter.java From gatk-protected with BSD 3-Clause "New" or "Revised" License | 6 votes |
/**
 * Create a distributed matrix given an Apache Commons RealMatrix.
 *
 * @param sc Never {@code null}
 * @param realMat Apache Commons RealMatrix. Never {@code null}
 * @return A distributed Spark matrix
 */
public static RowMatrix convertRealMatrixToSparkRowMatrix(JavaSparkContext sc, RealMatrix realMat, int numSlices) {
  logger.info("Converting matrix to distributed Spark matrix...");
  final double[][] dataArray = realMat.getData();
  final LinkedList<Vector> rowsList = new LinkedList<>();
  for (final double[] i : dataArray) {
    final Vector currentRow = Vectors.dense(i);
    rowsList.add(currentRow);
  }

  // We may want to swap out this static value for something dynamic (as shown below),
  // but this seems to slow it down.
  // final int totalSpace = realMat.getColumnDimension() * realMat.getRowDimension() * Double.BYTES;
  // // Want the partitions to be ~100KB of space
  // final int slices = totalSpace/100000;

  final JavaRDD<Vector> rows = sc.parallelize(rowsList, numSlices);

  // Create a RowMatrix from JavaRDD<Vector>.
  final RowMatrix mat = new RowMatrix(rows.rdd());
  logger.info("Done converting matrix to distributed Spark matrix...");

  return mat;
}
Example 4
Source File: MLMetricsSupporter.java From DDF with Apache License 2.0 | 6 votes |
@Override
public DDF residuals() throws DDFException {
  SparkDDF predictionDDF = (SparkDDF) this.getDDF();
  JavaRDD<double[]> predictionRDD = predictionDDF.getJavaRDD(double[].class);

  JavaRDD<double[]> result = predictionRDD.map(new MetricsMapperResiduals());

  if (result == null) mLog.error(">> javaRDD result of MetricMapper residuals is null");
  if (predictionDDF.getManager() == null) mLog.error(">> predictionDDF.getManager() is null");
  if (result.rdd() == null) mLog.error(">> result.rdd() is null");
  if (predictionDDF.getSchema() == null) mLog.error(">> predictionDDF.getSchema() is null");
  if (predictionDDF.getName() == null) mLog.error(">> predictionDDF.getName() is null");

  Schema schema = new Schema("residuals double");
  DDFManager manager = this.getDDF().getManager();
  DDF residualDDF = manager
      .newDDF(manager, result.rdd(), new Class<?>[] { RDD.class, double[].class }, null, schema);

  if (residualDDF == null) mLog.error(">>>>>>>>>>>.residualDDF is null");

  return residualDDF;
}
Example 5
Source File: SparkConverter.java From gatk with BSD 3-Clause "New" or "Revised" License | 6 votes |
/**
 * Create a distributed matrix given an Apache Commons RealMatrix.
 *
 * @param sc Never {@code null}
 * @param realMat Apache Commons RealMatrix. Never {@code null}
 * @return A distributed Spark matrix
 */
public static RowMatrix convertRealMatrixToSparkRowMatrix(JavaSparkContext sc, RealMatrix realMat, int numSlices) {
  logger.info("Converting matrix to distributed Spark matrix...");
  final double[][] dataArray = realMat.getData();
  final LinkedList<Vector> rowsList = new LinkedList<>();
  for (final double[] i : dataArray) {
    final Vector currentRow = Vectors.dense(i);
    rowsList.add(currentRow);
  }

  // We may want to swap out this static value for something dynamic (as shown below),
  // but this seems to slow it down.
  // final int totalSpace = realMat.getColumnDimension() * realMat.getRowDimension() * Double.BYTES;
  // // Want the partitions to be ~100KB of space
  // final int slices = totalSpace/100000;

  final JavaRDD<Vector> rows = sc.parallelize(rowsList, numSlices);

  // Create a RowMatrix from JavaRDD<Vector>.
  final RowMatrix mat = new RowMatrix(rows.rdd());
  logger.info("Done converting matrix to distributed Spark matrix...");

  return mat;
}
Example 6
Source File: RankConverter.java From spork with Apache License 2.0 | 6 votes |
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, PORank poRank)
    throws IOException {
  SparkUtil.assertPredecessorSize(predecessors, poRank, 1);
  RDD<Tuple> rdd = predecessors.get(0);
  JavaPairRDD<Integer, Long> javaPairRdd = rdd.toJavaRDD()
      .mapToPair(new ToPairRdd());
  JavaPairRDD<Integer, Iterable<Long>> groupedByIndex = javaPairRdd
      .groupByKey();
  JavaPairRDD<Integer, Long> countsByIndex = groupedByIndex
      .mapToPair(new IndexCounters());
  JavaPairRDD<Integer, Long> sortedCountsByIndex = countsByIndex
      .sortByKey(true);
  Map<Integer, Long> counts = sortedCountsByIndex.collectAsMap();
  JavaRDD<Tuple> finalRdd = rdd.toJavaRDD()
      .map(new RankFunction(new HashMap<Integer, Long>(counts)));
  return finalRdd.rdd();
}
Example 7
Source File: StoreConverter.java From spork with Apache License 2.0 | 6 votes |
@Override
public RDD<Tuple2<Text, Tuple>> convert(List<RDD<Tuple>> predecessors,
    POStore physicalOperator) throws IOException {
  SparkUtil.assertPredecessorSize(predecessors, physicalOperator, 1);
  RDD<Tuple> rdd = predecessors.get(0);

  // convert back to KV pairs
  JavaRDD<Tuple2<Text, Tuple>> rddPairs = rdd.toJavaRDD().map(FROM_TUPLE_FUNCTION);
  PairRDDFunctions<Text, Tuple> pairRDDFunctions = new PairRDDFunctions<Text, Tuple>(
      rddPairs.rdd(), SparkUtil.getManifest(Text.class),
      SparkUtil.getManifest(Tuple.class), null);

  JobConf storeJobConf = SparkUtil.newJobConf(pigContext);
  POStore poStore = configureStorer(storeJobConf, physicalOperator);

  pairRDDFunctions.saveAsNewAPIHadoopFile(poStore.getSFile().getFileName(),
      Text.class, Tuple.class, PigOutputFormat.class, storeJobConf);

  return rddPairs.rdd();
}
Example 8
Source File: SortConverter.java From spork with Apache License 2.0 | 6 votes |
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, POSort sortOperator)
    throws IOException {
  SparkUtil.assertPredecessorSize(predecessors, sortOperator, 1);
  RDD<Tuple> rdd = predecessors.get(0);
  RDD<Tuple2<Tuple, Object>> rddPair = rdd.map(new ToKeyValueFunction(),
      SparkUtil.<Tuple, Object> getTuple2Manifest());

  JavaPairRDD<Tuple, Object> r = new JavaPairRDD<Tuple, Object>(rddPair,
      SparkUtil.getManifest(Tuple.class),
      SparkUtil.getManifest(Object.class));

  JavaPairRDD<Tuple, Object> sorted = r.sortByKey(
      sortOperator.new SortComparator(), true);
  JavaRDD<Tuple> mapped = sorted.mapPartitions(TO_VALUE_FUNCTION);

  return mapped.rdd();
}
Example 9
Source File: JavaSVDExample.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("SVD Example"); SparkContext sc = new SparkContext(conf); JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc); // $example on$ double[][] array = {{1.12, 2.05, 3.12}, {5.56, 6.28, 8.94}, {10.2, 8.0, 20.5}}; LinkedList<Vector> rowsList = new LinkedList<>(); for (int i = 0; i < array.length; i++) { Vector currentRow = Vectors.dense(array[i]); rowsList.add(currentRow); } JavaRDD<Vector> rows = jsc.parallelize(rowsList); // Create a RowMatrix from JavaRDD<Vector>. RowMatrix mat = new RowMatrix(rows.rdd()); // Compute the top 3 singular values and corresponding singular vectors. SingularValueDecomposition<RowMatrix, Matrix> svd = mat.computeSVD(3, true, 1.0E-9d); RowMatrix U = svd.U(); Vector s = svd.s(); Matrix V = svd.V(); // $example off$ Vector[] collectPartitions = (Vector[]) U.rows().collect(); System.out.println("U factor is:"); for (Vector vector : collectPartitions) { System.out.println("\t" + vector); } System.out.println("Singular values are: " + s); System.out.println("V factor is:\n" + V); jsc.stop(); }
Example 10
Source File: JavaLogisticRegressionWithLBFGSExample.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("JavaLogisticRegressionWithLBFGSExample"); SparkContext sc = new SparkContext(conf); // $example on$ String path = "data/mllib/sample_libsvm_data.txt"; JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); // Split initial RDD into two... [60% training data, 40% testing data]. JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[] {0.6, 0.4}, 11L); JavaRDD<LabeledPoint> training = splits[0].cache(); JavaRDD<LabeledPoint> test = splits[1]; // Run training algorithm to build the model. final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() .setNumClasses(10) .run(training.rdd()); // Compute raw scores on the test set. JavaRDD<Tuple2<Object, Object>> predictionAndLabels = test.map( new Function<LabeledPoint, Tuple2<Object, Object>>() { public Tuple2<Object, Object> call(LabeledPoint p) { Double prediction = model.predict(p.features()); return new Tuple2<Object, Object>(prediction, p.label()); } } ); // Get evaluation metrics. MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd()); double accuracy = metrics.accuracy(); System.out.println("Accuracy = " + accuracy); // Save and load model model.save(sc, "target/tmp/javaLogisticRegressionWithLBFGSModel"); LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "target/tmp/javaLogisticRegressionWithLBFGSModel"); // $example off$ sc.stop(); }
Example 11
Source File: SparkFileInputStream.java From incubator-retired-mrql with Apache License 2.0 | 5 votes |
@Override
public Option<RDD<MRData>> compute ( Time validTime ) {
  JavaRDD<MRData> rdd = null;
  for ( String file: new_files() )
    if (rdd == null)
      rdd = hadoopFile(file);
    else rdd = rdd.union(hadoopFile(file));
  if (rdd == null)
    rdd = SparkEvaluator.spark_context.emptyRDD();
  return new Some<RDD<MRData>>(rdd.rdd());
}
Example 12
Source File: CounterConverter.java From spork with Apache License 2.0 | 5 votes |
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors,
    POCounter poCounter) throws IOException {
  SparkUtil.assertPredecessorSize(predecessors, poCounter, 1);
  RDD<Tuple> rdd = predecessors.get(0);
  CounterConverterFunction f = new CounterConverterFunction(poCounter);
  JavaRDD<Tuple> jRdd = rdd.toJavaRDD().mapPartitionsWithIndex(f, true);
  // jRdd = jRdd.cache();
  return jRdd.rdd();
}
Example 13
Source File: ChronixRDD.java From chronix.spark with Apache License 2.0 | 4 votes |
public ChronixRDD(JavaRDD<MetricTimeSeries> tsRdd) {
  super(tsRdd.rdd(), MTS_TYPE);
}
Example 14
Source File: DeepSparkContext.java From deep-spark with Apache License 2.0 | 4 votes |
private RDD<Cells> createRDDFromFilePath(String filePath, TextFileDataTable textFileDataTable) {
  RDD<String> result = this.sc().textFile(filePath.toString(), 1);
  JavaRDD<Cells> resultCells = result.toJavaRDD().map(new MapSchemaFromLines(textFileDataTable));
  return resultCells.rdd();
}
Example 15
Source File: ALSUpdate.java From oryx with Apache License 2.0 | 4 votes |
@Override
public PMML buildModel(JavaSparkContext sparkContext,
                       JavaRDD<String> trainData,
                       List<?> hyperParameters,
                       Path candidatePath) {
  int features = (Integer) hyperParameters.get(0);
  double lambda = (Double) hyperParameters.get(1);
  double alpha = (Double) hyperParameters.get(2);
  double epsilon = Double.NaN;
  if (logStrength) {
    epsilon = (Double) hyperParameters.get(3);
  }
  Preconditions.checkArgument(features > 0);
  Preconditions.checkArgument(lambda >= 0.0);
  Preconditions.checkArgument(alpha > 0.0);
  if (logStrength) {
    Preconditions.checkArgument(epsilon > 0.0);
  }

  JavaRDD<String[]> parsedRDD = trainData.map(MLFunctions.PARSE_FN);
  parsedRDD.cache();

  Map<String,Integer> userIDIndexMap = buildIDIndexMapping(parsedRDD, true);
  Map<String,Integer> itemIDIndexMap = buildIDIndexMapping(parsedRDD, false);

  log.info("Broadcasting ID-index mappings for {} users, {} items",
           userIDIndexMap.size(), itemIDIndexMap.size());

  Broadcast<Map<String,Integer>> bUserIDToIndex = sparkContext.broadcast(userIDIndexMap);
  Broadcast<Map<String,Integer>> bItemIDToIndex = sparkContext.broadcast(itemIDIndexMap);

  JavaRDD<Rating> trainRatingData = parsedToRatingRDD(parsedRDD, bUserIDToIndex, bItemIDToIndex);
  trainRatingData = aggregateScores(trainRatingData, epsilon);
  ALS als = new ALS()
      .setRank(features)
      .setIterations(iterations)
      .setLambda(lambda)
      .setCheckpointInterval(5);
  if (implicit) {
    als = als.setImplicitPrefs(true).setAlpha(alpha);
  }

  RDD<Rating> trainingRatingDataRDD = trainRatingData.rdd();
  trainingRatingDataRDD.cache();
  MatrixFactorizationModel model = als.run(trainingRatingDataRDD);
  trainingRatingDataRDD.unpersist(false);

  bUserIDToIndex.unpersist();
  bItemIDToIndex.unpersist();
  parsedRDD.unpersist();

  Broadcast<Map<Integer,String>> bUserIndexToID = sparkContext.broadcast(invertMap(userIDIndexMap));
  Broadcast<Map<Integer,String>> bItemIndexToID = sparkContext.broadcast(invertMap(itemIDIndexMap));

  PMML pmml = mfModelToPMML(model, features, lambda, alpha, epsilon, implicit, logStrength,
      candidatePath, bUserIndexToID, bItemIndexToID);
  unpersist(model);

  bUserIndexToID.unpersist();
  bItemIndexToID.unpersist();

  return pmml;
}
Example 16
Source File: SkewedJoinConverter.java From spork with Apache License 2.0 | 4 votes |
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors,
    POSkewedJoin poSkewedJoin) throws IOException {

  SparkUtil.assertPredecessorSize(predecessors, poSkewedJoin, 2);
  LRs = new POLocalRearrange[2];
  this.poSkewedJoin = poSkewedJoin;

  createJoinPlans(poSkewedJoin.getJoinPlans());

  // extract the two RDDs
  RDD<Tuple> rdd1 = predecessors.get(0);
  RDD<Tuple> rdd2 = predecessors.get(1);

  // make (key, value) pairs, key has type Object, value has type Tuple
  RDD<Tuple2<Object, Tuple>> rdd1Pair = rdd1.map(new ExtractKeyFunction(
      this, 0), SparkUtil.<Object, Tuple>getTuple2Manifest());
  RDD<Tuple2<Object, Tuple>> rdd2Pair = rdd2.map(new ExtractKeyFunction(
      this, 1), SparkUtil.<Object, Tuple>getTuple2Manifest());

  // join fn is present in JavaPairRDD class ..
  JavaPairRDD<Object, Tuple> rdd1Pair_javaRDD = new JavaPairRDD<Object, Tuple>(
      rdd1Pair, SparkUtil.getManifest(Object.class),
      SparkUtil.getManifest(Tuple.class));
  JavaPairRDD<Object, Tuple> rdd2Pair_javaRDD = new JavaPairRDD<Object, Tuple>(
      rdd2Pair, SparkUtil.getManifest(Object.class),
      SparkUtil.getManifest(Tuple.class));

  // do the join
  JavaPairRDD<Object, Tuple2<Tuple, Tuple>> result_KeyValue = rdd1Pair_javaRDD
      .join(rdd2Pair_javaRDD);

  // map to get RDD<Tuple> from RDD<Object, Tuple2<Tuple, Tuple>> by
  // ignoring the key (of type Object) and appending the values (the
  // Tuples)
  JavaRDD<Tuple> result = result_KeyValue
      .mapPartitions(new ToValueFunction());

  // return type is RDD<Tuple>, so take it from JavaRDD<Tuple>
  return result.rdd();
}