Java Code Examples for org.apache.spark.api.java.JavaPairRDD#mapToPair()
The following examples show how to use org.apache.spark.api.java.JavaPairRDD#mapToPair(). You can vote up the examples you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
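Before the project examples, here is a minimal word-count sketch of the call itself: mapToPair() maps each element of a JavaRDD (or each entry of a JavaPairRDD) to a scala.Tuple2 key/value pair and returns a JavaPairRDD. The input data, class name, and local Spark session below are illustrative assumptions, not taken from any of the projects that follow.

import java.util.Arrays;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class MapToPairSketch {
    public static void main(String[] args) {
        // Local session for illustration only; real jobs usually get their context from the framework.
        try (JavaSparkContext sc = new JavaSparkContext("local[*]", "mapToPair-sketch")) {
            JavaRDD<String> words = sc.parallelize(Arrays.asList("a", "b", "a"));
            // mapToPair: one input element -> one (key, value) pair, expressed as a scala.Tuple2
            JavaPairRDD<String, Integer> counts = words
                    .mapToPair(w -> new Tuple2<>(w, 1))
                    .reduceByKey(Integer::sum);
            counts.collect().forEach(t -> System.out.println(t._1() + " -> " + t._2()));
        }
    }
}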
Example 1
Source File: AppendGAlignedSPInstruction.java From systemds with Apache License 2.0 | 6 votes |
@Override
public void processInstruction(ExecutionContext ec) {
    // general case append (map-extend, aggregate)
    SparkExecutionContext sec = (SparkExecutionContext)ec;
    checkBinaryAppendInputCharacteristics(sec, _cbind, false, true);
    DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());

    JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
    JavaPairRDD<MatrixIndexes,MatrixBlock> in2 = sec.getBinaryMatrixBlockRDDHandleForVariable( input2.getName() );
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = null;

    // Simple changing of matrix indexes of RHS
    long shiftBy = _cbind ? mc1.getNumColBlocks() : mc1.getNumRowBlocks();
    out = in2.mapToPair(new ShiftColumnIndex(shiftBy, _cbind));
    out = in1.union( out );

    //put output RDD handle into symbol table
    updateBinaryAppendOutputDataCharacteristics(sec, _cbind);
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.addLineageRDD(output.getName(), input2.getName());
}
Example 2
Source File: HoodieReadClient.java From hudi with Apache License 2.0 | 6 votes |
/**
 * Given a bunch of hoodie keys, fetches all the individual records out as a data frame.
 *
 * @return a dataframe
 */
public Dataset<Row> readROView(JavaRDD<HoodieKey> hoodieKeys, int parallelism) {
  assertSqlContext();
  JavaPairRDD<HoodieKey, Option<Pair<String, String>>> lookupResultRDD =
      index.fetchRecordLocation(hoodieKeys, jsc, hoodieTable);
  JavaPairRDD<HoodieKey, Option<String>> keyToFileRDD =
      lookupResultRDD.mapToPair(r -> new Tuple2<>(r._1, convertToDataFilePath(r._2)));
  List<String> paths = keyToFileRDD.filter(keyFileTuple -> keyFileTuple._2().isPresent())
      .map(keyFileTuple -> keyFileTuple._2().get()).collect();

  // record locations might be same for multiple keys, so need a unique list
  Set<String> uniquePaths = new HashSet<>(paths);
  Dataset<Row> originalDF = sqlContextOpt.get().read().parquet(uniquePaths.toArray(new String[uniquePaths.size()]));
  StructType schema = originalDF.schema();
  JavaPairRDD<HoodieKey, Row> keyRowRDD = originalDF.javaRDD().mapToPair(row -> {
    HoodieKey key = new HoodieKey(row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD),
        row.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD));
    return new Tuple2<>(key, row);
  });

  // Now, we need to further filter out, for only rows that match the supplied hoodie keys
  JavaRDD<Row> rowRDD = keyRowRDD.join(keyToFileRDD, parallelism).map(tuple -> tuple._2()._1());
  return sqlContextOpt.get().createDataFrame(rowRDD, schema);
}
Example 3
Source File: UnaryFrameSPInstruction.java From systemds with Apache License 2.0 | 5 votes |
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;
    //get input
    JavaPairRDD<Long, FrameBlock> in = sec.getFrameBinaryBlockRDDHandleForVariable(input1.getName());
    JavaPairRDD<Long, FrameBlock> out = in.mapToPair(new DetectSchemaUsingRows());
    FrameBlock outFrame = out.values().reduce(new MergeFrame());
    sec.setFrameOutput(output.getName(), outFrame);
}
Example 4
Source File: CumulativeOffsetSPInstruction.java From systemds with Apache License 2.0 | 5 votes |
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;
    DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());
    DataCharacteristics mc2 = sec.getDataCharacteristics(input2.getName());
    long rlen = mc2.getRows();
    int blen = mc2.getBlocksize();

    //get and join inputs
    JavaPairRDD<MatrixIndexes,MatrixBlock> inData = sec.getBinaryMatrixBlockRDDHandleForVariable(input1.getName());
    JavaPairRDD<MatrixIndexes,Tuple2<MatrixBlock,MatrixBlock>> joined = null;
    boolean broadcast = _broadcast && !SparkUtils.isHashPartitioned(inData);

    if( broadcast ) {
        //broadcast offsets and broadcast join with data
        PartitionedBroadcast<MatrixBlock> inAgg = sec.getBroadcastForVariable(input2.getName());
        joined = inData.mapToPair(new RDDCumSplitLookupFunction(inAgg, _initValue, rlen, blen));
    }
    else {
        //prepare aggregates (cumsplit of offsets) and repartition join with data
        joined = inData.join(sec
            .getBinaryMatrixBlockRDDHandleForVariable(input2.getName())
            .flatMapToPair(new RDDCumSplitFunction(_initValue, rlen, blen)));
    }

    //execute cumulative offset (apply cumulative op w/ offsets)
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = joined
        .mapValues(new RDDCumOffsetFunction(_uop, _cumsumprod));

    //put output handle in symbol table
    if( _cumsumprod )
        sec.getDataCharacteristics(output.getName())
            .set(mc1.getRows(), 1, mc1.getBlocksize(), mc1.getBlocksize());
    else //general case
        updateUnaryOutputDataCharacteristics(sec);
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.addLineage(output.getName(), input2.getName(), broadcast);
}
Example 5
Source File: SparkStorageUtils.java From DataVec with Apache License 2.0 | 5 votes |
/**
 * Restore a {@code JavaPairRDD<Long,List<Writable>>} previously saved with {@link #saveMapFile(String, JavaRDD)}<br>
 * Note that if the keys are not required, simply use {@code restoreMapFile(...).values()}
 *
 * @param path Path of the MapFile
 * @param sc   Spark context
 * @return The restored RDD, with their unique indices as the key
 */
public static JavaPairRDD<Long, List<Writable>> restoreMapFile(String path, JavaSparkContext sc) {
    Configuration c = new Configuration();
    c.set(FileInputFormat.INPUT_DIR, FilenameUtils.normalize(path, true));
    JavaPairRDD<LongWritable, RecordWritable> pairRDD =
            sc.newAPIHadoopRDD(c, SequenceFileInputFormat.class, LongWritable.class, RecordWritable.class);
    return pairRDD.mapToPair(new RecordLoadPairFunction());
}
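A minimal usage sketch for the method above; the path is a hypothetical placeholder, and an existing JavaSparkContext sc plus a MapFile previously written with saveMapFile(...) are assumed.

// Hypothetical path; `sc` is an existing JavaSparkContext and the MapFile was
// written earlier with SparkStorageUtils.saveMapFile(...).
JavaPairRDD<Long, List<Writable>> restored = SparkStorageUtils.restoreMapFile("/data/records-mapfile", sc);
// If the Long keys are not needed, keep only the values, as the Javadoc suggests:
JavaRDD<List<Writable>> values = restored.values();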
Example 6
Source File: FrameAppendRSPInstruction.java From systemds with Apache License 2.0 | 5 votes |
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;
    JavaPairRDD<Long,FrameBlock> in1 = sec.getFrameBinaryBlockRDDHandleForVariable( input1.getName() );
    JavaPairRDD<Long,FrameBlock> in2 = sec.getFrameBinaryBlockRDDHandleForVariable( input2.getName() );
    JavaPairRDD<Long,FrameBlock> out = null;
    long leftRows = sec.getDataCharacteristics(input1.getName()).getRows();

    if(_cbind) {
        JavaPairRDD<Long,FrameBlock> in1Aligned = in1.mapToPair(new ReduceSideAppendAlignFunction(leftRows));
        in1Aligned = FrameRDDAggregateUtils.mergeByKey(in1Aligned);
        JavaPairRDD<Long,FrameBlock> in2Aligned = in2.mapToPair(new ReduceSideAppendAlignFunction(leftRows));
        in2Aligned = FrameRDDAggregateUtils.mergeByKey(in2Aligned);
        out = in1Aligned.join(in2Aligned).mapValues(new ReduceSideColumnsFunction(_cbind));
    }
    else { //rbind
        JavaPairRDD<Long,FrameBlock> right = in2.mapToPair( new ReduceSideAppendRowsFunction(leftRows));
        out = in1.union(right);
    }

    //put output RDD handle into symbol table
    updateBinaryAppendOutputDataCharacteristics(sec, _cbind);
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.addLineageRDD(output.getName(), input2.getName());

    //update schema of output with merged input schemas
    sec.getFrameObject(output.getName()).setSchema(
        sec.getFrameObject(input1.getName()).mergeSchemas(
        sec.getFrameObject(input2.getName())));
}
Example 7
Source File: HoodieGlobalSimpleIndex.java From hudi with Apache License 2.0 | 5 votes |
/**
 * Tag records with right {@link HoodieRecordLocation}.
 *
 * @param incomingRecords incoming {@link HoodieRecord}s
 * @param existingRecords existing records with {@link HoodieRecordLocation}s
 * @return {@link JavaRDD} of {@link HoodieRecord}s with tagged {@link HoodieRecordLocation}s
 */
private JavaRDD<HoodieRecord<T>> getTaggedRecords(JavaPairRDD<String, HoodieRecord<T>> incomingRecords,
                                                  JavaPairRDD<HoodieKey, HoodieRecordLocation> existingRecords) {
  JavaPairRDD<String, Pair<String, HoodieRecordLocation>> existingRecordByRecordKey = existingRecords
      .mapToPair(entry -> new Tuple2<>(entry._1.getRecordKey(), Pair.of(entry._1.getPartitionPath(), entry._2)));

  return incomingRecords.leftOuterJoin(existingRecordByRecordKey).values()
      .flatMap(entry -> {
        HoodieRecord<T> inputRecord = entry._1;
        Option<Pair<String, HoodieRecordLocation>> partitionPathLocationPair = Option.ofNullable(entry._2.orNull());
        List<HoodieRecord<T>> taggedRecords;
        if (partitionPathLocationPair.isPresent()) {
          String partitionPath = partitionPathLocationPair.get().getKey();
          HoodieRecordLocation location = partitionPathLocationPair.get().getRight();
          if (config.getGlobalSimpleIndexUpdatePartitionPath() && !(inputRecord.getPartitionPath().equals(partitionPath))) {
            // Create an empty record to delete the record in the old partition
            HoodieRecord<T> emptyRecord = new HoodieRecord(new HoodieKey(inputRecord.getRecordKey(), partitionPath),
                new EmptyHoodieRecordPayload());
            // Tag the incoming record for inserting to the new partition
            HoodieRecord<T> taggedRecord = (HoodieRecord<T>) HoodieIndexUtils.getTaggedRecord(inputRecord, Option.empty());
            taggedRecords = Arrays.asList(emptyRecord, taggedRecord);
          } else {
            // Ignore the incoming record's partition, regardless of whether it differs from its old partition or not.
            // When it differs, the record will still be updated at its old partition.
            HoodieRecord<T> newRecord = new HoodieRecord<>(new HoodieKey(inputRecord.getRecordKey(), partitionPath),
                inputRecord.getData());
            taggedRecords = Collections.singletonList((HoodieRecord<T>) HoodieIndexUtils.getTaggedRecord(newRecord, Option.ofNullable(location)));
          }
        } else {
          taggedRecords = Collections.singletonList((HoodieRecord<T>) HoodieIndexUtils.getTaggedRecord(inputRecord, Option.empty()));
        }
        return taggedRecords.iterator();
      });
}
Example 8
Source File: CumulativeAggregateSPInstruction.java From systemds with Apache License 2.0 | 5 votes |
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;
    DataCharacteristics mc = sec.getDataCharacteristics(input1.getName());
    DataCharacteristics mcOut = new MatrixCharacteristics(mc);
    long rlen = mc.getRows();
    int blen = mc.getBlocksize();
    mcOut.setRows((long)(Math.ceil((double)rlen/blen)));

    //get input
    JavaPairRDD<MatrixIndexes,MatrixBlock> in = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );

    //execute unary aggregate (w/ implicit drop correction)
    AggregateUnaryOperator auop = (AggregateUnaryOperator) _optr;
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = in.mapToPair(new RDDCumAggFunction(auop, rlen, blen));

    //merge partial aggregates, adjusting for correct number of partitions
    //as size can significantly shrink (1K) but also grow (sparse-dense)
    int numParts = SparkUtils.getNumPreferredPartitions(mcOut);
    int minPar = (int)Math.min(SparkExecutionContext.getDefaultParallelism(true), mcOut.getNumBlocks());
    out = RDDAggregateUtils.mergeByKey(out, Math.max(numParts, minPar), false);

    //put output handle in symbol table
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.getDataCharacteristics(output.getName()).set(mcOut);
}
Example 9
Source File: FrameRDDConverterUtils.java From systemds with Apache License 2.0 | 5 votes |
public static JavaPairRDD<Long, FrameBlock> textCellToBinaryBlock(JavaSparkContext sc,
        JavaPairRDD<LongWritable, Text> in, DataCharacteristics mcOut, ValueType[] schema) {
    //convert input rdd to serializable long/frame block
    JavaPairRDD<Long,Text> input = in.mapToPair(new LongWritableTextToLongTextFunction());

    //do actual conversion
    return textCellToBinaryBlockLongIndex(sc, input, mcOut, schema);
}
Example 10
Source File: SparkStorageUtils.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * Save a {@code JavaRDD<List<List<Writable>>>} to a Hadoop {@link org.apache.hadoop.io.SequenceFile}. Each record
 * is given a unique (but noncontiguous) {@link LongWritable} key, and values are stored as {@link SequenceRecordWritable} instances.
 * <p>
 * Use {@link #restoreSequenceFileSequences(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the sequence file
 * @param rdd            RDD to save
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified size (number of partitions)
 *                       to limit the maximum number of output sequence files
 * @see #saveSequenceFile(String, JavaRDD)
 * @see #saveMapFileSequences(String, JavaRDD)
 */
public static void saveSequenceFileSequences(String path, JavaRDD<List<List<Writable>>> rdd, Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    JavaPairRDD<List<List<Writable>>, Long> dataIndexPairs = rdd.zipWithUniqueId(); //Note: Long values are unique + NOT contiguous; more efficient than zipWithIndex
    JavaPairRDD<LongWritable, SequenceRecordWritable> keyedByIndex =
            dataIndexPairs.mapToPair(new SequenceRecordSavePrepPairFunction());

    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, SequenceRecordWritable.class,
            SequenceFileOutputFormat.class);
}
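A minimal usage sketch for the method above; the output path and the sequences RDD are illustrative assumptions.

// Hypothetical path; `sequences` is assumed to be an existing JavaRDD<List<List<Writable>>> of DataVec sequences.
SparkStorageUtils.saveSequenceFileSequences("/data/sequences-out", sequences, 8); // coalesce to at most 8 output files
// Passing null instead of 8 skips the coalesce step and keeps the current partitioning.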
Example 11
Source File: MLContextConversionUtil.java From systemds with Apache License 2.0 | 5 votes |
/**
 * Convert a {@code JavaRDD<String>} in CSV format to a {@code MatrixObject}
 *
 * @param javaRDD        the Java RDD of strings
 * @param matrixMetadata matrix metadata
 * @return the {@code JavaRDD<String>} converted to a {@code MatrixObject}
 */
public static MatrixObject javaRDDStringCSVToMatrixObject(JavaRDD<String> javaRDD,
        MatrixMetadata matrixMetadata) {
    JavaPairRDD<LongWritable, Text> javaPairRDD = javaRDD.mapToPair(new ConvertStringToLongTextPair());
    DataCharacteristics mc = (matrixMetadata != null) ?
        matrixMetadata.asMatrixCharacteristics() : new MatrixCharacteristics();

    MatrixObject matrixObject = new MatrixObject(ValueType.FP64, OptimizerUtils.getUniqueTempFileName(),
        new MetaDataFormat(mc, OutputInfo.CSVOutputInfo, InputInfo.CSVInputInfo));
    JavaPairRDD<LongWritable, Text> javaPairRDD2 = javaPairRDD.mapToPair(new CopyTextInputFunction());
    matrixObject.setRDDHandle(new RDDObject(javaPairRDD2));
    return matrixObject;
}
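A minimal usage sketch for the conversion above; the CSV lines and the two-argument MatrixMetadata(rows, columns) constructor are illustrative assumptions.

// Hypothetical data; `sc` is an existing JavaSparkContext.
JavaRDD<String> csv = sc.parallelize(Arrays.asList("1.0,2.0", "3.0,4.0"));
MatrixMetadata meta = new MatrixMetadata(2, 2); // 2 rows, 2 columns (assumed constructor)
MatrixObject mo = MLContextConversionUtil.javaRDDStringCSVToMatrixObject(csv, meta);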
Example 12
Source File: GroupCombineFunctions.java From beam with Apache License 2.0 | 4 votes |
/**
 * Apply a composite {@link org.apache.beam.sdk.transforms.Combine.PerKey} transformation.
 *
 * <p>This aggregation will apply Beam's {@link org.apache.beam.sdk.transforms.Combine.CombineFn}
 * via Spark's {@link JavaPairRDD#combineByKey(Function, Function2, Function2)} aggregation. For
 * streaming, this will be called from within a serialized context (DStream's transform callback),
 * so passed arguments need to be Serializable.
 */
public static <K, V, AccumT>
    JavaPairRDD<K, SparkCombineFn.WindowedAccumulator<KV<K, V>, V, AccumT, ?>> combinePerKey(
        JavaRDD<WindowedValue<KV<K, V>>> rdd,
        final SparkCombineFn<KV<K, V>, V, AccumT, ?> sparkCombineFn,
        final Coder<K> keyCoder,
        final Coder<V> valueCoder,
        final Coder<AccumT> aCoder,
        final WindowingStrategy<?, ?> windowingStrategy) {
  boolean mustBringWindowToKey = sparkCombineFn.mustBringWindowToKey();
  @SuppressWarnings("unchecked")
  Coder<BoundedWindow> windowCoder = (Coder) windowingStrategy.getWindowFn().windowCoder();
  final SparkCombineFn.WindowedAccumulatorCoder<KV<K, V>, V, AccumT> waCoder =
      sparkCombineFn.accumulatorCoder(windowCoder, aCoder, windowingStrategy);

  // We need to duplicate K as both the key of the JavaPairRDD as well as inside the value,
  // since the functions passed to combineByKey don't receive the associated key of each
  // value, and we need to map back into methods in Combine.KeyedCombineFn, which each
  // require the key in addition to the InputT's and AccumT's being merged/accumulated.
  // Once Spark provides a way to include keys in the arguments of combine/merge functions,
  // we won't need to duplicate the keys anymore.
  // Key has to be windowed in order to group by window as well.
  final JavaPairRDD<ByteArray, WindowedValue<KV<K, V>>> inRddDuplicatedKeyPair;
  if (!mustBringWindowToKey) {
    inRddDuplicatedKeyPair = rdd.mapToPair(TranslationUtils.toPairByKeyInWindowedValue(keyCoder));
  } else {
    inRddDuplicatedKeyPair =
        GroupNonMergingWindowsFunctions.bringWindowToKey(rdd, keyCoder, windowCoder);
  }

  JavaPairRDD<
          ByteArray,
          ValueAndCoderLazySerializable<
              SparkCombineFn.WindowedAccumulator<KV<K, V>, V, AccumT, ?>>>
      accumulatedResult =
          inRddDuplicatedKeyPair.combineByKey(
              input ->
                  ValueAndCoderLazySerializable.of(sparkCombineFn.createCombiner(input), waCoder),
              (acc, input) ->
                  ValueAndCoderLazySerializable.of(
                      sparkCombineFn.mergeValue(acc.getOrDecode(waCoder), input), waCoder),
              (acc1, acc2) ->
                  ValueAndCoderLazySerializable.of(
                      sparkCombineFn.mergeCombiners(
                          acc1.getOrDecode(waCoder), acc2.getOrDecode(waCoder)),
                      waCoder));

  return accumulatedResult.mapToPair(
      i ->
          new Tuple2<>(
              CoderHelpers.fromByteArray(i._1.getValue(), keyCoder), i._2.getOrDecode(waCoder)));
}
Example 13
Source File: MultiReturnParameterizedBuiltinSPInstruction.java From systemds with Apache License 2.0 | 4 votes |
@Override
@SuppressWarnings("unchecked")
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;

    try {
        //get input RDD and meta data
        FrameObject fo = sec.getFrameObject(input1.getName());
        FrameObject fometa = sec.getFrameObject(_outputs.get(1).getName());
        JavaPairRDD<Long,FrameBlock> in = (JavaPairRDD<Long,FrameBlock>)
            sec.getRDDHandleForFrameObject(fo, InputInfo.BinaryBlockInputInfo);
        String spec = ec.getScalarInput(input2).getStringValue();
        DataCharacteristics mcIn = sec.getDataCharacteristics(input1.getName());
        DataCharacteristics mcOut = sec.getDataCharacteristics(output.getName());
        String[] colnames = !TfMetaUtils.isIDSpec(spec) ?
            in.lookup(1L).get(0).getColumnNames() : null;

        //step 1: build transform meta data
        Encoder encoderBuild = EncoderFactory.createEncoder(spec, colnames,
            fo.getSchema(), (int)fo.getNumColumns(), null);

        MaxLongAccumulator accMax = registerMaxLongAccumulator(sec.getSparkContext());
        JavaRDD<String> rcMaps = in
            .mapPartitionsToPair(new TransformEncodeBuildFunction(encoderBuild))
            .distinct().groupByKey()
            .flatMap(new TransformEncodeGroupFunction(accMax));
        if( containsMVImputeEncoder(encoderBuild) ) {
            EncoderMVImpute mva = getMVImputeEncoder(encoderBuild);
            rcMaps = rcMaps.union(
                in.mapPartitionsToPair(new TransformEncodeBuild2Function(mva))
                  .groupByKey().flatMap(new TransformEncodeGroup2Function(mva)) );
        }
        rcMaps.saveAsTextFile(fometa.getFileName()); //trigger eval

        //consolidate meta data frame (reuse multi-threaded reader, special handling missing values)
        FrameReader reader = FrameReaderFactory.createFrameReader(InputInfo.TextCellInputInfo);
        FrameBlock meta = reader.readFrameFromHDFS(fometa.getFileName(), accMax.value(), fo.getNumColumns());
        meta.recomputeColumnCardinality(); //recompute num distinct items per column
        meta.setColumnNames((colnames!=null)?colnames:meta.getColumnNames());

        //step 2: transform apply (similar to spark transformapply)
        //compute omit offset map for block shifts
        TfOffsetMap omap = null;
        if( TfMetaUtils.containsOmitSpec(spec, colnames) ) {
            omap = new TfOffsetMap(SparkUtils.toIndexedLong(in.mapToPair(
                new RDDTransformApplyOffsetFunction(spec, colnames)).collect()));
        }

        //create encoder broadcast (avoiding replication per task)
        Encoder encoder = EncoderFactory.createEncoder(spec, colnames,
            fo.getSchema(), (int)fo.getNumColumns(), meta);
        mcOut.setDimension(mcIn.getRows()-((omap!=null)?omap.getNumRmRows():0), encoder.getNumCols());
        Broadcast<Encoder> bmeta = sec.getSparkContext().broadcast(encoder);
        Broadcast<TfOffsetMap> bomap = (omap!=null) ? sec.getSparkContext().broadcast(omap) : null;

        //execute transform apply
        JavaPairRDD<Long,FrameBlock> tmp = in
            .mapToPair(new RDDTransformApplyFunction(bmeta, bomap));
        JavaPairRDD<MatrixIndexes,MatrixBlock> out = FrameRDDConverterUtils
            .binaryBlockToMatrixBlock(tmp, mcOut, mcOut);

        //set output and maintain lineage/output characteristics
        sec.setRDDHandleForVariable(_outputs.get(0).getName(), out);
        sec.addLineageRDD(_outputs.get(0).getName(), input1.getName());
        sec.setFrameOutput(_outputs.get(1).getName(), meta);
    }
    catch(IOException ex) {
        throw new RuntimeException(ex);
    }
}
Example 14
Source File: MultiReturnParameterizedBuiltinSPInstruction.java From systemds with Apache License 2.0 | 4 votes |
@Override
@SuppressWarnings("unchecked")
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;

    try {
        //get input RDD and meta data
        FrameObject fo = sec.getFrameObject(input1.getName());
        FrameObject fometa = sec.getFrameObject(_outputs.get(1).getName());
        JavaPairRDD<Long,FrameBlock> in = (JavaPairRDD<Long,FrameBlock>)
            sec.getRDDHandleForFrameObject(fo, FileFormat.BINARY);
        String spec = ec.getScalarInput(input2).getStringValue();
        DataCharacteristics mcIn = sec.getDataCharacteristics(input1.getName());
        DataCharacteristics mcOut = sec.getDataCharacteristics(output.getName());
        String[] colnames = !TfMetaUtils.isIDSpec(spec) ?
            in.lookup(1L).get(0).getColumnNames() : null;

        //step 1: build transform meta data
        Encoder encoderBuild = EncoderFactory.createEncoder(spec, colnames,
            fo.getSchema(), (int)fo.getNumColumns(), null);

        MaxLongAccumulator accMax = registerMaxLongAccumulator(sec.getSparkContext());
        JavaRDD<String> rcMaps = in
            .mapPartitionsToPair(new TransformEncodeBuildFunction(encoderBuild))
            .distinct().groupByKey()
            .flatMap(new TransformEncodeGroupFunction(accMax));
        if( containsMVImputeEncoder(encoderBuild) ) {
            EncoderMVImpute mva = getMVImputeEncoder(encoderBuild);
            rcMaps = rcMaps.union(
                in.mapPartitionsToPair(new TransformEncodeBuild2Function(mva))
                  .groupByKey().flatMap(new TransformEncodeGroup2Function(mva)) );
        }
        rcMaps.saveAsTextFile(fometa.getFileName()); //trigger eval

        //consolidate meta data frame (reuse multi-threaded reader, special handling missing values)
        FrameReader reader = FrameReaderFactory.createFrameReader(FileFormat.TEXT);
        FrameBlock meta = reader.readFrameFromHDFS(fometa.getFileName(), accMax.value(), fo.getNumColumns());
        meta.recomputeColumnCardinality(); //recompute num distinct items per column
        meta.setColumnNames((colnames!=null)?colnames:meta.getColumnNames());

        //step 2: transform apply (similar to spark transformapply)
        //compute omit offset map for block shifts
        TfOffsetMap omap = null;
        if( TfMetaUtils.containsOmitSpec(spec, colnames) ) {
            omap = new TfOffsetMap(SparkUtils.toIndexedLong(in.mapToPair(
                new RDDTransformApplyOffsetFunction(spec, colnames)).collect()));
        }

        //create encoder broadcast (avoiding replication per task)
        Encoder encoder = EncoderFactory.createEncoder(spec, colnames,
            fo.getSchema(), (int)fo.getNumColumns(), meta);
        mcOut.setDimension(mcIn.getRows()-((omap!=null)?omap.getNumRmRows():0), encoder.getNumCols());
        Broadcast<Encoder> bmeta = sec.getSparkContext().broadcast(encoder);
        Broadcast<TfOffsetMap> bomap = (omap!=null) ? sec.getSparkContext().broadcast(omap) : null;

        //execute transform apply
        JavaPairRDD<Long,FrameBlock> tmp = in
            .mapToPair(new RDDTransformApplyFunction(bmeta, bomap));
        JavaPairRDD<MatrixIndexes,MatrixBlock> out = FrameRDDConverterUtils
            .binaryBlockToMatrixBlock(tmp, mcOut, mcOut);

        //set output and maintain lineage/output characteristics
        sec.setRDDHandleForVariable(_outputs.get(0).getName(), out);
        sec.addLineageRDD(_outputs.get(0).getName(), input1.getName());
        sec.setFrameOutput(_outputs.get(1).getName(), meta);
    }
    catch(IOException ex) {
        throw new RuntimeException(ex);
    }
}
Example 15
Source File: TestSequenceRecordReaderBytesFunction.java From DataVec with Apache License 2.0 | 4 votes |
@Test
public void testRecordReaderBytesFunction() throws Exception {
    //Local file path
    ClassPathResource cpr = new ClassPathResource("/video/shapes_0.mp4");
    String path = cpr.getFile().getAbsolutePath();
    String folder = path.substring(0, path.length() - 12);
    path = folder + "*";

    //Load binary data from local file system, convert to a sequence file:
    //Load and convert
    JavaPairRDD<String, PortableDataStream> origData = sc.binaryFiles(path);
    JavaPairRDD<Text, BytesWritable> filesAsBytes = origData.mapToPair(new FilesAsBytesFunction());
    //Write the sequence file:
    Path p = Files.createTempDirectory("dl4j_rrbytesTest");
    p.toFile().deleteOnExit();
    String outPath = p.toString() + "/out";
    filesAsBytes.saveAsNewAPIHadoopFile(outPath, Text.class, BytesWritable.class, SequenceFileOutputFormat.class);

    //Load data from sequence file, parse via SequenceRecordReader:
    JavaPairRDD<Text, BytesWritable> fromSeqFile = sc.sequenceFile(outPath, Text.class, BytesWritable.class);
    SequenceRecordReader seqRR = new CodecRecordReader();
    Configuration conf = new Configuration();
    conf.set(CodecRecordReader.RAVEL, "true");
    conf.set(CodecRecordReader.START_FRAME, "0");
    conf.set(CodecRecordReader.TOTAL_FRAMES, "25");
    conf.set(CodecRecordReader.ROWS, "64");
    conf.set(CodecRecordReader.COLUMNS, "64");
    Configuration confCopy = new Configuration(conf);
    seqRR.setConf(conf);
    JavaRDD<List<List<Writable>>> dataVecData = fromSeqFile.map(new SequenceRecordReaderBytesFunction(seqRR));

    //Next: do the same thing locally, and compare the results
    InputSplit is = new FileSplit(new File(folder), new String[] {"mp4"}, true);
    SequenceRecordReader srr = new CodecRecordReader();
    srr.initialize(is);
    srr.setConf(confCopy);

    List<List<List<Writable>>> list = new ArrayList<>(4);
    while (srr.hasNext()) {
        list.add(srr.sequenceRecord());
    }
    assertEquals(4, list.size());

    List<List<List<Writable>>> fromSequenceFile = dataVecData.collect();

    assertEquals(4, list.size());
    assertEquals(4, fromSequenceFile.size());

    boolean[] found = new boolean[4];
    for (int i = 0; i < 4; i++) {
        int foundIndex = -1;
        List<List<Writable>> collection = fromSequenceFile.get(i);
        for (int j = 0; j < 4; j++) {
            if (collection.equals(list.get(j))) {
                if (foundIndex != -1)
                    fail(); //Already found this value -> suggests this spark value equals two or more of local version? (Shouldn't happen)
                foundIndex = j;
                if (found[foundIndex])
                    fail(); //One of the other spark values was equal to this one -> suggests duplicates in Spark list
                found[foundIndex] = true; //mark this one as seen before
            }
        }
    }
    int count = 0;
    for (boolean b : found)
        if (b)
            count++;
    assertEquals(4, count); //Expect all 4 and exactly 4 pairwise matches between spark and local versions
}
Example 16
Source File: SparkStorageUtils.java From deeplearning4j with Apache License 2.0 | 3 votes |
/**
 * Save a {@code JavaRDD<List<Writable>>} to a Hadoop {@link org.apache.hadoop.io.MapFile}. Each record is
 * given a <i>unique and contiguous</i> {@link LongWritable} key, and values are stored as
 * {@link RecordWritable} instances.<br>
 * <b>Note</b>: If contiguous keys are not required, using a sequence file instead is preferable from a performance
 * point of view. Contiguous keys are often only required for non-Spark use cases, such as with
 * {@link org.datavec.hadoop.records.reader.mapfile.MapFileRecordReader}
 * <p>
 * Use {@link #restoreMapFileSequences(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the MapFile
 * @param rdd            RDD to save
 * @param c              Configuration object, used to customise options for the map file
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified size (number of partitions)
 *                       to limit the maximum number of output map files
 * @see #saveMapFileSequences(String, JavaRDD)
 * @see #saveSequenceFile(String, JavaRDD)
 */
public static void saveMapFile(String path, JavaRDD<List<Writable>> rdd, Configuration c, Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    JavaPairRDD<List<Writable>, Long> dataIndexPairs = rdd.zipWithIndex(); //Note: Long values are unique + contiguous, but requires a count
    JavaPairRDD<LongWritable, RecordWritable> keyedByIndex =
            dataIndexPairs.mapToPair(new RecordSavePrepPairFunction());

    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, RecordWritable.class, MapFileOutputFormat.class, c);
}
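A minimal usage sketch for the method above; the path, the Configuration, and the records RDD are illustrative assumptions.

// Hypothetical path; `records` is assumed to be an existing JavaRDD<List<Writable>>.
Configuration conf = new Configuration();
SparkStorageUtils.saveMapFile("/data/records-mapfile", records, conf, null); // null: no coalesce before writing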
Example 17
Source File: ThresholdClusterer.java From ensemble-clustering with MIT License | 2 votes |
@Override
public SparkClusterResult doCluster(DataSet ds) {
    // SparkDataSet needs to be passed in
    SparkDataSet rdd = (SparkDataSet)ds;

    // cache dataset in memory
    // rdd.getRDD().cache();

    distFunc = new DistanceFunction(this.typeDefs);
    ClusterFactory clusterFactory = new ClusterFactory(this.typeDefs, this.onlineUpdate);

    log.info("Starting threshold clusterer with threshold {}", threshold);

    // TODO look at using a reduce function
    // Idea is the first step is a map<Instance, List<Instance>> that converts each instance to a single "cluster"
    // second step is a reduce where input is a List<Instances> and produces a List<Instances>
    // this step would merge clusters within threshold

    JavaPairRDD<String, Instance> instances = rdd.getRDD();
    instances.cache();

    // convert each instance into a singleton cluster
    JavaRDD<Map<String, Instance>> singletons = rdd.getRDD().map( new InstanceToClusterFunction(clusterFactory) );
    //singletons.cache();

    log.info("Generated initial singleton clusters");

    // merge clusters together
    Map<String, Instance> clusters = singletons.reduce( new AggregateClusterFunction(distFunc, threshold) );

    log.info("Merging clusters completed with {} clusters", clusters.size());

    // find the best cluster for each instance
    JavaPairRDD<String, Instance> bestCluster = instances.mapToPair( new BestClusterFunction(distFunc, clusters) );

    log.info("Output results");

    if (clusters != null && centroidsPath != null)
        rdd.getContext().parallelize(new ArrayList<Instance>(clusters.values())).saveAsTextFile(centroidsPath);

    if (bestCluster != null && clustersPath != null)
        bestCluster.saveAsTextFile(clustersPath);

    log.info("Threshold clusterer completed");

    // return the cluster membership rdd
    return new SparkClusterResult(bestCluster);
}