Java Code Examples for org.apache.spark.rdd.RDD#map()
The following examples show how to use org.apache.spark.rdd.RDD#map().
You can go to the original project or source file by following the links above each example.
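Unlike the Scala API, calling RDD#map() from Java requires an explicit scala.Function1 for the mapping logic and a scala.reflect.ClassTag describing the result element type, which is why every example below threads a manifest/ClassTag argument through the call. The following is a minimal, self-contained sketch of that pattern; it is not taken from any of the projects below, and the class name RddMapSketch, the StringLengthFunction helper, and the input path are illustrative assumptions only.

import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.rdd.RDD;
import scala.reflect.ClassTag;
import scala.reflect.ClassTag$;
import scala.runtime.AbstractFunction1;

public class RddMapSketch {

    // Function1 implementation mapping each line to its length; it must be
    // serializable so Spark can ship it to executors
    static class StringLengthFunction extends AbstractFunction1<String, Integer>
            implements java.io.Serializable {
        @Override
        public Integer apply(String line) {
            return line.length();
        }
    }

    public static void main(String[] args) {
        SparkContext sc = new SparkContext(
                new SparkConf().setAppName("rdd-map-sketch").setMaster("local[*]"));

        // textFile returns a Scala RDD<String>; the second argument is minPartitions
        RDD<String> lines = sc.textFile("input.txt", 1);

        // RDD#map in Java needs an explicit ClassTag for the result element type
        ClassTag<Integer> intTag = ClassTag$.MODULE$.apply(Integer.class);
        RDD<Integer> lengths = lines.map(new StringLengthFunction(), intTag);

        System.out.println("number of lines: " + lengths.count());
        sc.stop();
    }
}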
Example 1
Source File: DistinctConverter.java From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors,
        PODistinct poDistinct) throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, poDistinct, 1);
    RDD<Tuple> rdd = predecessors.get(0);

    ClassTag<Tuple2<Tuple, Object>> tuple2ClassManifest = SparkUtil
            .<Tuple, Object> getTuple2Manifest();

    // map each Tuple to a (Tuple, value) pair so duplicates share the same key
    RDD<Tuple2<Tuple, Object>> rddPairs = rdd.map(TO_KEY_VALUE_FUNCTION,
            tuple2ClassManifest);
    PairRDDFunctions<Tuple, Object> pairRDDFunctions = new PairRDDFunctions<Tuple, Object>(
            rddPairs, SparkUtil.getManifest(Tuple.class),
            SparkUtil.getManifest(Object.class), null);
    int parallelism = SparkUtil.getParallelism(predecessors, poDistinct);
    // collapse duplicate keys, then map back to plain Tuples
    return pairRDDFunctions.reduceByKey(MERGE_VALUES_FUNCTION, parallelism)
            .map(TO_VALUE_FUNCTION, SparkUtil.getManifest(Tuple.class));
}
Example 2
Source File: LoadConverter.java From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessorRdds, POLoad poLoad)
        throws IOException {
    // if (predecessors.size() != 0) {
    //     throw new RuntimeException(
    //             "Should not have predecessors for Load. Got : " + predecessors);
    // }

    JobConf loadJobConf = SparkUtil.newJobConf(pigContext);
    configureLoader(physicalPlan, poLoad, loadJobConf);

    // don't know why but just doing this cast for now
    RDD<Tuple2<Text, Tuple>> hadoopRDD = sparkContext.newAPIHadoopFile(
            poLoad.getLFile().getFileName(), PigInputFormatSpark.class,
            Text.class, Tuple.class, loadJobConf);

    registerUdfFiles();

    // map to get just RDD<Tuple>
    return hadoopRDD.map(TO_TUPLE_FUNCTION, SparkUtil.getManifest(Tuple.class));
}
Example 3
Source File: SortConverter.java From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, POSort sortOperator)
        throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, sortOperator, 1);
    RDD<Tuple> rdd = predecessors.get(0);

    // map to (key, value) pairs so the rows can be sorted by key
    RDD<Tuple2<Tuple, Object>> rddPair = rdd.map(new ToKeyValueFunction(),
            SparkUtil.<Tuple, Object> getTuple2Manifest());

    JavaPairRDD<Tuple, Object> r = new JavaPairRDD<Tuple, Object>(rddPair,
            SparkUtil.getManifest(Tuple.class),
            SparkUtil.getManifest(Object.class));

    // sort by key using the comparator defined by the POSort operator
    JavaPairRDD<Tuple, Object> sorted = r.sortByKey(
            sortOperator.new SortComparator(), true);

    // drop the keys to get back a plain RDD<Tuple>
    JavaRDD<Tuple> mapped = sorted.mapPartitions(TO_VALUE_FUNCTION);

    return mapped.rdd();
}
Example 4
Source File: CassandraUtils.java From deep-spark with Apache License 2.0
public static <W> void doCql3SaveToCassandra(RDD<W> rdd,
        ICassandraDeepJobConfig<W> writeConfig,
        Function1<W, Tuple2<Cells, Cells>> transformer) {
    if (!writeConfig.getIsWriteConfig()) {
        throw new IllegalArgumentException(
                "Provided configuration object is not suitable for writing");
    }

    Tuple2<Map<String, ByteBuffer>, Map<String, ByteBuffer>> tuple =
            new Tuple2<>(null, null);

    // map each element to a (keys, values) pair of Cells using the supplied transformer
    RDD<Tuple2<Cells, Cells>> mappedRDD = rdd.map(transformer,
            ClassTag$.MODULE$.<Tuple2<Cells, Cells>>apply(tuple.getClass()));

    ((CassandraDeepJobConfig) writeConfig).createOutputTableIfNeeded(mappedRDD.first());

    final int pageSize = writeConfig.getBatchSize();
    int offset = 0;

    List<Tuple2<Cells, Cells>> elements = Arrays.asList(
            (Tuple2<Cells, Cells>[]) mappedRDD.collect());
    List<Tuple2<Cells, Cells>> split;
    do {
        // write the collected elements to Cassandra in batches of pageSize inserts
        split = elements.subList(pageSize * (offset++),
                Math.min(pageSize * offset, elements.size()));

        Batch batch = QueryBuilder.batch();

        for (Tuple2<Cells, Cells> t : split) {
            Tuple2<String[], Object[]> bindVars = Utils.prepareTuple4CqlDriver(t);

            Insert insert = QueryBuilder
                    .insertInto(quote(writeConfig.getKeyspace()), quote(writeConfig.getTable()))
                    .values(bindVars._1(), bindVars._2());

            batch.add(insert);
        }

        writeConfig.getSession().execute(batch);

    } while (!split.isEmpty() && split.size() == pageSize);
}
Example 5
Source File: LocalRearrangeConverter.java From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors,
        POLocalRearrange physicalOperator) throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, physicalOperator, 1);
    RDD<Tuple> rdd = predecessors.get(0);
    // call local rearrange to get key and value
    return rdd.map(new LocalRearrangeFunction(physicalOperator),
            SparkUtil.getManifest(Tuple.class));
}
Example 6
Source File: PackageConverter.java From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors,
        POPackage physicalOperator) throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, physicalOperator, 1);
    RDD<Tuple> rdd = predecessors.get(0);
    // package will generate the group from the result of the local rearrange
    return rdd.map(new PackageFunction(physicalOperator, this.confBytes),
            SparkUtil.getManifest(Tuple.class));
}
Example 7
Source File: SkewedJoinConverter.java From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors,
        POSkewedJoin poSkewedJoin) throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, poSkewedJoin, 2);
    LRs = new POLocalRearrange[2];
    this.poSkewedJoin = poSkewedJoin;

    createJoinPlans(poSkewedJoin.getJoinPlans());

    // extract the two RDDs
    RDD<Tuple> rdd1 = predecessors.get(0);
    RDD<Tuple> rdd2 = predecessors.get(1);

    // make (key, value) pairs, key has type Object, value has type Tuple
    RDD<Tuple2<Object, Tuple>> rdd1Pair = rdd1.map(new ExtractKeyFunction(
            this, 0), SparkUtil.<Object, Tuple>getTuple2Manifest());
    RDD<Tuple2<Object, Tuple>> rdd2Pair = rdd2.map(new ExtractKeyFunction(
            this, 1), SparkUtil.<Object, Tuple>getTuple2Manifest());

    // join fn is present in JavaPairRDD class ..
    JavaPairRDD<Object, Tuple> rdd1Pair_javaRDD = new JavaPairRDD<Object, Tuple>(
            rdd1Pair, SparkUtil.getManifest(Object.class),
            SparkUtil.getManifest(Tuple.class));
    JavaPairRDD<Object, Tuple> rdd2Pair_javaRDD = new JavaPairRDD<Object, Tuple>(
            rdd2Pair, SparkUtil.getManifest(Object.class),
            SparkUtil.getManifest(Tuple.class));

    // do the join
    JavaPairRDD<Object, Tuple2<Tuple, Tuple>> result_KeyValue = rdd1Pair_javaRDD
            .join(rdd2Pair_javaRDD);

    // map to get RDD<Tuple> from RDD<Object, Tuple2<Tuple, Tuple>> by
    // ignoring the key (of type Object) and appending the values (the Tuples)
    JavaRDD<Tuple> result = result_KeyValue
            .mapPartitions(new ToValueFunction());

    // return type is RDD<Tuple>, so take it from JavaRDD<Tuple>
    return result.rdd();
}