Java Code Examples for org.apache.spark.rdd.RDD#map()

The following examples show how to use org.apache.spark.rdd.RDD#map(). Each example is taken from an open source project; the source file and license are noted above the snippet.
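Before the project examples, here is a minimal, self-contained sketch of calling RDD#map() from Java. It is not taken from any of the projects below, and names such as RddMapSketch and ToLabel are illustrative only. Because map() belongs to the Scala API, the Java caller supplies both a scala.Function1 implementation and a ClassTag describing the result element type:

import java.io.Serializable;
import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.rdd.RDD;

import scala.reflect.ClassTag;
import scala.reflect.ClassTag$;
import scala.runtime.AbstractFunction1;

public class RddMapSketch {

    // Spark serializes the function and ships it to executors, so it must be Serializable.
    private static class ToLabel extends AbstractFunction1<Integer, String> implements Serializable {
        @Override
        public String apply(Integer value) {
            return "value-" + value;
        }
    }

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("rdd-map-sketch").setMaster("local[*]");
        JavaSparkContext jsc = new JavaSparkContext(conf);

        // Build a JavaRDD, then drop down to the underlying Scala RDD.
        RDD<Integer> numbers = jsc.parallelize(Arrays.asList(1, 2, 3)).rdd();

        // RDD#map() takes the Function1 plus a ClassTag for the result element type.
        ClassTag<String> stringTag = ClassTag$.MODULE$.<String>apply(String.class);
        RDD<String> labels = numbers.map(new ToLabel(), stringTag);

        System.out.println(labels.count()); // prints 3
        jsc.stop();
    }
}

The examples below follow the same pattern, usually hiding the ClassTag construction behind helpers such as SparkUtil.getManifest(...) or building one directly via ClassTag$.MODULE$.apply(...).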
Example 1
Source File: DistinctConverter.java    From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors,
        PODistinct poDistinct) throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, poDistinct, 1);
    RDD<Tuple> rdd = predecessors.get(0);

    ClassTag<Tuple2<Tuple, Object>> tuple2ClassManifest = SparkUtil
            .<Tuple, Object> getTuple2Manifest();

    // key each input tuple by itself so duplicate tuples collide on the same key
    RDD<Tuple2<Tuple, Object>> rddPairs = rdd.map(TO_KEY_VALUE_FUNCTION,
            tuple2ClassManifest);
    PairRDDFunctions<Tuple, Object> pairRDDFunctions
      = new PairRDDFunctions<Tuple, Object>(
            rddPairs, SparkUtil.getManifest(Tuple.class),
            SparkUtil.getManifest(Object.class), null);
    int parallelism = SparkUtil.getParallelism(predecessors, poDistinct);
    // collapse duplicate keys with reduceByKey, then map the pairs back to plain tuples
    return pairRDDFunctions.reduceByKey(MERGE_VALUES_FUNCTION, parallelism)
            .map(TO_VALUE_FUNCTION, SparkUtil.getManifest(Tuple.class));
}
 
Example 2
Source File: LoadConverter.java    From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessorRdds, POLoad poLoad)
        throws IOException {
    // if (predecessors.size()!=0) {
    // throw new
    // RuntimeException("Should not have predecessors for Load. Got : "+predecessors);
    // }

    JobConf loadJobConf = SparkUtil.newJobConf(pigContext);
    configureLoader(physicalPlan, poLoad, loadJobConf);

    // don't know why but just doing this cast for now
    RDD<Tuple2<Text, Tuple>> hadoopRDD = sparkContext.newAPIHadoopFile(
            poLoad.getLFile().getFileName(), PigInputFormatSpark.class,
            Text.class, Tuple.class, loadJobConf);

    registerUdfFiles();
    // map to get just RDD<Tuple>
    return hadoopRDD.map(TO_TUPLE_FUNCTION,
            SparkUtil.getManifest(Tuple.class));
}
 
Example 3
Source File: SortConverter.java    From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, POSort sortOperator)
        throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, sortOperator, 1);
    RDD<Tuple> rdd = predecessors.get(0);
    RDD<Tuple2<Tuple, Object>> rddPair = rdd.map(new ToKeyValueFunction(),
            SparkUtil.<Tuple, Object> getTuple2Manifest());

    JavaPairRDD<Tuple, Object> r = new JavaPairRDD<Tuple, Object>(rddPair,
            SparkUtil.getManifest(Tuple.class),
            SparkUtil.getManifest(Object.class));

    JavaPairRDD<Tuple, Object> sorted = r.sortByKey(
            sortOperator.new SortComparator(), true);
    JavaRDD<Tuple> mapped = sorted.mapPartitions(TO_VALUE_FUNCTION);

    return mapped.rdd();
}
 
Example 4
Source File: CassandraUtils.java    From deep-spark with Apache License 2.0
public static <W> void doCql3SaveToCassandra(RDD<W> rdd, ICassandraDeepJobConfig<W> writeConfig,
                                             Function1<W, Tuple2<Cells, Cells>> transformer) {
    if (!writeConfig.getIsWriteConfig()) {
        throw new IllegalArgumentException("Provided configuration object is not suitable for writing");
    }
    Tuple2<Map<String, ByteBuffer>, Map<String, ByteBuffer>> tuple = new Tuple2<>(null, null);

    RDD<Tuple2<Cells, Cells>> mappedRDD = rdd.map(transformer,
            ClassTag$.MODULE$.<Tuple2<Cells, Cells>>apply(tuple.getClass()));

    ((CassandraDeepJobConfig) writeConfig).createOutputTableIfNeeded(mappedRDD.first());

    final int pageSize = writeConfig.getBatchSize();
    int offset = 0;

    List<Tuple2<Cells, Cells>> elements = Arrays.asList((Tuple2<Cells, Cells>[]) mappedRDD.collect());
    List<Tuple2<Cells, Cells>> split;
    do {
        split = elements.subList(pageSize * (offset++), Math.min(pageSize * offset, elements.size()));

        Batch batch = QueryBuilder.batch();

        for (Tuple2<Cells, Cells> t : split) {
            Tuple2<String[], Object[]> bindVars = Utils.prepareTuple4CqlDriver(t);

            Insert insert = QueryBuilder
                    .insertInto(quote(writeConfig.getKeyspace()), quote(writeConfig.getTable()))
                    .values(bindVars._1(), bindVars._2());

            batch.add(insert);
        }
        writeConfig.getSession().execute(batch);

    } while (!split.isEmpty() && split.size() == pageSize);
}
 
Example 5
Source File: LocalRearrangeConverter.java    From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors,
        POLocalRearrange physicalOperator) throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, physicalOperator, 1);
    RDD<Tuple> rdd = predecessors.get(0);
    // call local rearrange to get key and value
    return rdd.map(new LocalRearrangeFunction(physicalOperator),
            SparkUtil.getManifest(Tuple.class));

}
 
Example 6
Source File: PackageConverter.java    From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors,
        POPackage physicalOperator) throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, physicalOperator, 1);
    RDD<Tuple> rdd = predecessors.get(0);
    // package will generate the group from the result of the local
    // rearrange
    return rdd.map(new PackageFunction(physicalOperator, this.confBytes),
            SparkUtil.getManifest(Tuple.class));
}
 
Example 7
Source File: SkewedJoinConverter.java    From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors,
                          POSkewedJoin poSkewedJoin) throws IOException {

    SparkUtil.assertPredecessorSize(predecessors, poSkewedJoin, 2);
    LRs = new POLocalRearrange[2];
    this.poSkewedJoin = poSkewedJoin;

    createJoinPlans(poSkewedJoin.getJoinPlans());

    // extract the two RDDs
    RDD<Tuple> rdd1 = predecessors.get(0);
    RDD<Tuple> rdd2 = predecessors.get(1);

    // make (key, value) pairs, key has type Object, value has type Tuple
    RDD<Tuple2<Object, Tuple>> rdd1Pair = rdd1.map(new ExtractKeyFunction(
            this, 0), SparkUtil.<Object, Tuple>getTuple2Manifest());
    RDD<Tuple2<Object, Tuple>> rdd2Pair = rdd2.map(new ExtractKeyFunction(
            this, 1), SparkUtil.<Object, Tuple>getTuple2Manifest());

    // join fn is present in JavaPairRDD class ..
    JavaPairRDD<Object, Tuple> rdd1Pair_javaRDD = new JavaPairRDD<Object, Tuple>(
            rdd1Pair, SparkUtil.getManifest(Object.class),
            SparkUtil.getManifest(Tuple.class));
    JavaPairRDD<Object, Tuple> rdd2Pair_javaRDD = new JavaPairRDD<Object, Tuple>(
            rdd2Pair, SparkUtil.getManifest(Object.class),
            SparkUtil.getManifest(Tuple.class));

    // do the join
    JavaPairRDD<Object, Tuple2<Tuple, Tuple>> result_KeyValue = rdd1Pair_javaRDD
            .join(rdd2Pair_javaRDD);

    // map to get RDD<Tuple> from RDD<Object, Tuple2<Tuple, Tuple>> by
    // ignoring the key (of type Object) and appending the values (the
    // Tuples)
    JavaRDD<Tuple> result = result_KeyValue
            .mapPartitions(new ToValueFunction());

    // return type is RDD<Tuple>, so take it from JavaRDD<Tuple>
    return result.rdd();
}