Java Code Examples for org.apache.spark.api.java.JavaPairRDD#mapToPair()
The following examples show how to use org.apache.spark.api.java.JavaPairRDD#mapToPair(). You can vote up the examples you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
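Before the project examples, here is a minimal word-count sketch of the call itself: mapToPair() maps each element of a JavaRDD (or each entry of a JavaPairRDD) to a scala.Tuple2 key/value pair and returns a JavaPairRDD. The input data, class name, and local Spark session below are illustrative assumptions, not taken from any of the projects that follow.

import java.util.Arrays;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class MapToPairSketch {
    public static void main(String[] args) {
        // Local session for illustration only; real jobs usually get their context from the framework.
        try (JavaSparkContext sc = new JavaSparkContext("local[*]", "mapToPair-sketch")) {
            JavaRDD<String> words = sc.parallelize(Arrays.asList("a", "b", "a"));
            // mapToPair: one input element -> one (key, value) pair, expressed as a scala.Tuple2
            JavaPairRDD<String, Integer> counts = words
                    .mapToPair(w -> new Tuple2<>(w, 1))
                    .reduceByKey(Integer::sum);
            counts.collect().forEach(t -> System.out.println(t._1() + " -> " + t._2()));
        }
    }
}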
Example 1
Source File: AppendGAlignedSPInstruction.java From systemds with Apache License 2.0 | 6 votes |
@Override
public void processInstruction(ExecutionContext ec) {
    // general case append (map-extend, aggregate)
    SparkExecutionContext sec = (SparkExecutionContext)ec;
    checkBinaryAppendInputCharacteristics(sec, _cbind, false, true);
    DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());

    JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
    JavaPairRDD<MatrixIndexes,MatrixBlock> in2 = sec.getBinaryMatrixBlockRDDHandleForVariable( input2.getName() );
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = null;

    // Simple changing of matrix indexes of RHS
    long shiftBy = _cbind ? mc1.getNumColBlocks() : mc1.getNumRowBlocks();
    out = in2.mapToPair(new ShiftColumnIndex(shiftBy, _cbind));
    out = in1.union( out );

    //put output RDD handle into symbol table
    updateBinaryAppendOutputDataCharacteristics(sec, _cbind);
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.addLineageRDD(output.getName(), input2.getName());
}
Example 2
Source File: HoodieReadClient.java From hudi with Apache License 2.0 | 6 votes |
/**
 * Given a bunch of hoodie keys, fetches all the individual records out as a data frame.
 *
 * @return a dataframe
 */
public Dataset<Row> readROView(JavaRDD<HoodieKey> hoodieKeys, int parallelism) {
  assertSqlContext();
  JavaPairRDD<HoodieKey, Option<Pair<String, String>>> lookupResultRDD =
      index.fetchRecordLocation(hoodieKeys, jsc, hoodieTable);
  JavaPairRDD<HoodieKey, Option<String>> keyToFileRDD =
      lookupResultRDD.mapToPair(r -> new Tuple2<>(r._1, convertToDataFilePath(r._2)));
  List<String> paths = keyToFileRDD.filter(keyFileTuple -> keyFileTuple._2().isPresent())
      .map(keyFileTuple -> keyFileTuple._2().get()).collect();

  // record locations might be same for multiple keys, so need a unique list
  Set<String> uniquePaths = new HashSet<>(paths);
  Dataset<Row> originalDF = sqlContextOpt.get().read().parquet(uniquePaths.toArray(new String[uniquePaths.size()]));
  StructType schema = originalDF.schema();
  JavaPairRDD<HoodieKey, Row> keyRowRDD = originalDF.javaRDD().mapToPair(row -> {
    HoodieKey key = new HoodieKey(row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD),
        row.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD));
    return new Tuple2<>(key, row);
  });

  // Now, we need to further filter out, for only rows that match the supplied hoodie keys
  JavaRDD<Row> rowRDD = keyRowRDD.join(keyToFileRDD, parallelism).map(tuple -> tuple._2()._1());
  return sqlContextOpt.get().createDataFrame(rowRDD, schema);
}
Example 3
Source File: UnaryFrameSPInstruction.java From systemds with Apache License 2.0 | 5 votes |
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;
    //get input
    JavaPairRDD<Long, FrameBlock> in = sec.getFrameBinaryBlockRDDHandleForVariable(input1.getName());
    JavaPairRDD<Long, FrameBlock> out = in.mapToPair(new DetectSchemaUsingRows());
    FrameBlock outFrame = out.values().reduce(new MergeFrame());
    sec.setFrameOutput(output.getName(), outFrame);
}
Example 4
Source File: CumulativeOffsetSPInstruction.java From systemds with Apache License 2.0 | 5 votes |
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;
    DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());
    DataCharacteristics mc2 = sec.getDataCharacteristics(input2.getName());
    long rlen = mc2.getRows();
    int blen = mc2.getBlocksize();

    //get and join inputs
    JavaPairRDD<MatrixIndexes,MatrixBlock> inData = sec.getBinaryMatrixBlockRDDHandleForVariable(input1.getName());
    JavaPairRDD<MatrixIndexes,Tuple2<MatrixBlock,MatrixBlock>> joined = null;
    boolean broadcast = _broadcast && !SparkUtils.isHashPartitioned(inData);

    if( broadcast ) {
        //broadcast offsets and broadcast join with data
        PartitionedBroadcast<MatrixBlock> inAgg = sec.getBroadcastForVariable(input2.getName());
        joined = inData.mapToPair(new RDDCumSplitLookupFunction(inAgg, _initValue, rlen, blen));
    }
    else {
        //prepare aggregates (cumsplit of offsets) and repartition join with data
        joined = inData.join(sec
            .getBinaryMatrixBlockRDDHandleForVariable(input2.getName())
            .flatMapToPair(new RDDCumSplitFunction(_initValue, rlen, blen)));
    }

    //execute cumulative offset (apply cumulative op w/ offsets)
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = joined
        .mapValues(new RDDCumOffsetFunction(_uop, _cumsumprod));

    //put output handle in symbol table
    if( _cumsumprod )
        sec.getDataCharacteristics(output.getName())
            .set(mc1.getRows(), 1, mc1.getBlocksize(), mc1.getBlocksize());
    else //general case
        updateUnaryOutputDataCharacteristics(sec);
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.addLineage(output.getName(), input2.getName(), broadcast);
}
Example 5
Source File: SparkStorageUtils.java From DataVec with Apache License 2.0 | 5 votes |
/**
 * Restore a {@code JavaPairRDD<Long,List<Writable>>} previously saved with {@link #saveMapFile(String, JavaRDD)}<br>
 * Note that if the keys are not required, simply use {@code restoreMapFile(...).values()}
 *
 * @param path Path of the MapFile
 * @param sc   Spark context
 * @return The restored RDD, with their unique indices as the key
 */
public static JavaPairRDD<Long, List<Writable>> restoreMapFile(String path, JavaSparkContext sc) {
    Configuration c = new Configuration();
    c.set(FileInputFormat.INPUT_DIR, FilenameUtils.normalize(path, true));
    JavaPairRDD<LongWritable, RecordWritable> pairRDD =
            sc.newAPIHadoopRDD(c, SequenceFileInputFormat.class, LongWritable.class, RecordWritable.class);
    return pairRDD.mapToPair(new RecordLoadPairFunction());
}
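A minimal usage sketch for the method above; the path is a hypothetical placeholder, and an existing JavaSparkContext sc plus a MapFile previously written with saveMapFile(...) are assumed.

// Hypothetical path; `sc` is an existing JavaSparkContext and the MapFile was
// written earlier with SparkStorageUtils.saveMapFile(...).
JavaPairRDD<Long, List<Writable>> restored = SparkStorageUtils.restoreMapFile("/data/records-mapfile", sc);
// If the Long keys are not needed, keep only the values, as the Javadoc suggests:
JavaRDD<List<Writable>> values = restored.values();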
Example 6
Source File: FrameAppendRSPInstruction.java From systemds with Apache License 2.0 | 5 votes |
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;
    JavaPairRDD<Long,FrameBlock> in1 = sec.getFrameBinaryBlockRDDHandleForVariable( input1.getName() );
    JavaPairRDD<Long,FrameBlock> in2 = sec.getFrameBinaryBlockRDDHandleForVariable( input2.getName() );
    JavaPairRDD<Long,FrameBlock> out = null;
    long leftRows = sec.getDataCharacteristics(input1.getName()).getRows();

    if(_cbind) {
        JavaPairRDD<Long,FrameBlock> in1Aligned = in1.mapToPair(new ReduceSideAppendAlignFunction(leftRows));
        in1Aligned = FrameRDDAggregateUtils.mergeByKey(in1Aligned);
        JavaPairRDD<Long,FrameBlock> in2Aligned = in2.mapToPair(new ReduceSideAppendAlignFunction(leftRows));
        in2Aligned = FrameRDDAggregateUtils.mergeByKey(in2Aligned);
        out = in1Aligned.join(in2Aligned).mapValues(new ReduceSideColumnsFunction(_cbind));
    }
    else { //rbind
        JavaPairRDD<Long,FrameBlock> right = in2.mapToPair( new ReduceSideAppendRowsFunction(leftRows));
        out = in1.union(right);
    }

    //put output RDD handle into symbol table
    updateBinaryAppendOutputDataCharacteristics(sec, _cbind);
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.addLineageRDD(output.getName(), input2.getName());

    //update schema of output with merged input schemas
    sec.getFrameObject(output.getName()).setSchema(
        sec.getFrameObject(input1.getName()).mergeSchemas(
        sec.getFrameObject(input2.getName())));
}
Example 7
Source File: HoodieGlobalSimpleIndex.java From hudi with Apache License 2.0 | 5 votes |
/**
 * Tag records with right {@link HoodieRecordLocation}.
 *
 * @param incomingRecords incoming {@link HoodieRecord}s
 * @param existingRecords existing records with {@link HoodieRecordLocation}s
 * @return {@link JavaRDD} of {@link HoodieRecord}s with tagged {@link HoodieRecordLocation}s
 */
private JavaRDD<HoodieRecord<T>> getTaggedRecords(JavaPairRDD<String, HoodieRecord<T>> incomingRecords,
                                                  JavaPairRDD<HoodieKey, HoodieRecordLocation> existingRecords) {
  JavaPairRDD<String, Pair<String, HoodieRecordLocation>> existingRecordByRecordKey = existingRecords
      .mapToPair(entry -> new Tuple2<>(entry._1.getRecordKey(), Pair.of(entry._1.getPartitionPath(), entry._2)));

  return incomingRecords.leftOuterJoin(existingRecordByRecordKey).values()
      .flatMap(entry -> {
        HoodieRecord<T> inputRecord = entry._1;
        Option<Pair<String, HoodieRecordLocation>> partitionPathLocationPair = Option.ofNullable(entry._2.orNull());
        List<HoodieRecord<T>> taggedRecords;
        if (partitionPathLocationPair.isPresent()) {
          String partitionPath = partitionPathLocationPair.get().getKey();
          HoodieRecordLocation location = partitionPathLocationPair.get().getRight();
          if (config.getGlobalSimpleIndexUpdatePartitionPath() && !(inputRecord.getPartitionPath().equals(partitionPath))) {
            // Create an empty record to delete the record in the old partition
            HoodieRecord<T> emptyRecord = new HoodieRecord(new HoodieKey(inputRecord.getRecordKey(), partitionPath),
                new EmptyHoodieRecordPayload());
            // Tag the incoming record for inserting to the new partition
            HoodieRecord<T> taggedRecord = (HoodieRecord<T>) HoodieIndexUtils.getTaggedRecord(inputRecord, Option.empty());
            taggedRecords = Arrays.asList(emptyRecord, taggedRecord);
          } else {
            // Ignore the incoming record's partition, regardless of whether it differs from its old partition or not.
            // When it differs, the record will still be updated at its old partition.
            HoodieRecord<T> newRecord = new HoodieRecord<>(new HoodieKey(inputRecord.getRecordKey(), partitionPath),
                inputRecord.getData());
            taggedRecords = Collections.singletonList((HoodieRecord<T>) HoodieIndexUtils.getTaggedRecord(newRecord, Option.ofNullable(location)));
          }
        } else {
          taggedRecords = Collections.singletonList((HoodieRecord<T>) HoodieIndexUtils.getTaggedRecord(inputRecord, Option.empty()));
        }
        return taggedRecords.iterator();
      });
}
Example 8
Source File: CumulativeAggregateSPInstruction.java From systemds with Apache License 2.0 | 5 votes |
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;
    DataCharacteristics mc = sec.getDataCharacteristics(input1.getName());
    DataCharacteristics mcOut = new MatrixCharacteristics(mc);
    long rlen = mc.getRows();
    int blen = mc.getBlocksize();
    mcOut.setRows((long)(Math.ceil((double)rlen/blen)));

    //get input
    JavaPairRDD<MatrixIndexes,MatrixBlock> in = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );

    //execute unary aggregate (w/ implicit drop correction)
    AggregateUnaryOperator auop = (AggregateUnaryOperator) _optr;
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = in.mapToPair(new RDDCumAggFunction(auop, rlen, blen));

    //merge partial aggregates, adjusting for correct number of partitions
    //as size can significantly shrink (1K) but also grow (sparse-dense)
    int numParts = SparkUtils.getNumPreferredPartitions(mcOut);
    int minPar = (int)Math.min(SparkExecutionContext.getDefaultParallelism(true), mcOut.getNumBlocks());
    out = RDDAggregateUtils.mergeByKey(out, Math.max(numParts, minPar), false);

    //put output handle in symbol table
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.getDataCharacteristics(output.getName()).set(mcOut);
}
Example 9
Source File: FrameRDDConverterUtils.java From systemds with Apache License 2.0 | 5 votes |
public static JavaPairRDD<Long, FrameBlock> textCellToBinaryBlock(JavaSparkContext sc,
        JavaPairRDD<LongWritable, Text> in, DataCharacteristics mcOut, ValueType[] schema) {
    //convert input rdd to serializable long/frame block
    JavaPairRDD<Long,Text> input = in.mapToPair(new LongWritableTextToLongTextFunction());

    //do actual conversion
    return textCellToBinaryBlockLongIndex(sc, input, mcOut, schema);
}
Example 10
Source File: SparkStorageUtils.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * Save a {@code JavaRDD<List<List<Writable>>>} to a Hadoop {@link org.apache.hadoop.io.SequenceFile}. Each record
 * is given a unique (but noncontiguous) {@link LongWritable} key, and values are stored as {@link SequenceRecordWritable} instances.
 * <p>
 * Use {@link #restoreSequenceFileSequences(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the sequence file
 * @param rdd            RDD to save
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified size (number of partitions)
 *                       to limit the maximum number of output sequence files
 * @see #saveSequenceFile(String, JavaRDD)
 * @see #saveMapFileSequences(String, JavaRDD)
 */
public static void saveSequenceFileSequences(String path, JavaRDD<List<List<Writable>>> rdd, Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    JavaPairRDD<List<List<Writable>>, Long> dataIndexPairs = rdd.zipWithUniqueId(); //Note: Long values are unique + NOT contiguous; more efficient than zipWithIndex
    JavaPairRDD<LongWritable, SequenceRecordWritable> keyedByIndex =
            dataIndexPairs.mapToPair(new SequenceRecordSavePrepPairFunction());

    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, SequenceRecordWritable.class,
            SequenceFileOutputFormat.class);
}
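A minimal usage sketch for the method above; the output path and the sequences RDD are illustrative assumptions.

// Hypothetical path; `sequences` is assumed to be an existing JavaRDD<List<List<Writable>>> of DataVec sequences.
SparkStorageUtils.saveSequenceFileSequences("/data/sequences-out", sequences, 8); // coalesce to at most 8 output files
// Passing null instead of 8 skips the coalesce step and keeps the current partitioning.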
Example 11
Source File: MLContextConversionUtil.java From systemds with Apache License 2.0 | 5 votes |
/**
 * Convert a {@code JavaRDD<String>} in CSV format to a {@code MatrixObject}
 *
 * @param javaRDD        the Java RDD of strings
 * @param matrixMetadata matrix metadata
 * @return the {@code JavaRDD<String>} converted to a {@code MatrixObject}
 */
public static MatrixObject javaRDDStringCSVToMatrixObject(JavaRDD<String> javaRDD,
        MatrixMetadata matrixMetadata) {
    JavaPairRDD<LongWritable, Text> javaPairRDD = javaRDD.mapToPair(new ConvertStringToLongTextPair());
    DataCharacteristics mc = (matrixMetadata != null) ?
        matrixMetadata.asMatrixCharacteristics() : new MatrixCharacteristics();

    MatrixObject matrixObject = new MatrixObject(ValueType.FP64, OptimizerUtils.getUniqueTempFileName(),
        new MetaDataFormat(mc, OutputInfo.CSVOutputInfo, InputInfo.CSVInputInfo));
    JavaPairRDD<LongWritable, Text> javaPairRDD2 = javaPairRDD.mapToPair(new CopyTextInputFunction());
    matrixObject.setRDDHandle(new RDDObject(javaPairRDD2));
    return matrixObject;
}
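A minimal usage sketch for the conversion above; the CSV lines and the two-argument MatrixMetadata(rows, columns) constructor are illustrative assumptions.

// Hypothetical data; `sc` is an existing JavaSparkContext.
JavaRDD<String> csv = sc.parallelize(Arrays.asList("1.0,2.0", "3.0,4.0"));
MatrixMetadata meta = new MatrixMetadata(2, 2); // 2 rows, 2 columns (assumed constructor)
MatrixObject mo = MLContextConversionUtil.javaRDDStringCSVToMatrixObject(csv, meta);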
Example 12
Source File: GroupCombineFunctions.java From beam with Apache License 2.0 | 4 votes |
/**
 * Apply a composite {@link org.apache.beam.sdk.transforms.Combine.PerKey} transformation.
 *
 * <p>This aggregation will apply Beam's {@link org.apache.beam.sdk.transforms.Combine.CombineFn}
 * via Spark's {@link JavaPairRDD#combineByKey(Function, Function2, Function2)} aggregation. For
 * streaming, this will be called from within a serialized context (DStream's transform callback),
 * so passed arguments need to be Serializable.
 */
public static <K, V, AccumT>
    JavaPairRDD<K, SparkCombineFn.WindowedAccumulator<KV<K, V>, V, AccumT, ?>> combinePerKey(
        JavaRDD<WindowedValue<KV<K, V>>> rdd,
        final SparkCombineFn<KV<K, V>, V, AccumT, ?> sparkCombineFn,
        final Coder<K> keyCoder,
        final Coder<V> valueCoder,
        final Coder<AccumT> aCoder,
        final WindowingStrategy<?, ?> windowingStrategy) {
  boolean mustBringWindowToKey = sparkCombineFn.mustBringWindowToKey();
  @SuppressWarnings("unchecked")
  Coder<BoundedWindow> windowCoder = (Coder) windowingStrategy.getWindowFn().windowCoder();
  final SparkCombineFn.WindowedAccumulatorCoder<KV<K, V>, V, AccumT> waCoder =
      sparkCombineFn.accumulatorCoder(windowCoder, aCoder, windowingStrategy);

  // We need to duplicate K as both the key of the JavaPairRDD as well as inside the value,
  // since the functions passed to combineByKey don't receive the associated key of each
  // value, and we need to map back into methods in Combine.KeyedCombineFn, which each
  // require the key in addition to the InputT's and AccumT's being merged/accumulated.
  // Once Spark provides a way to include keys in the arguments of combine/merge functions,
  // we won't need to duplicate the keys anymore.
  // Key has to be windowed in order to group by window as well.
  final JavaPairRDD<ByteArray, WindowedValue<KV<K, V>>> inRddDuplicatedKeyPair;
  if (!mustBringWindowToKey) {
    inRddDuplicatedKeyPair = rdd.mapToPair(TranslationUtils.toPairByKeyInWindowedValue(keyCoder));
  } else {
    inRddDuplicatedKeyPair =
        GroupNonMergingWindowsFunctions.bringWindowToKey(rdd, keyCoder, windowCoder);
  }

  JavaPairRDD<
          ByteArray,
          ValueAndCoderLazySerializable<
              SparkCombineFn.WindowedAccumulator<KV<K, V>, V, AccumT, ?>>>
      accumulatedResult =
          inRddDuplicatedKeyPair.combineByKey(
              input ->
                  ValueAndCoderLazySerializable.of(sparkCombineFn.createCombiner(input), waCoder),
              (acc, input) ->
                  ValueAndCoderLazySerializable.of(
                      sparkCombineFn.mergeValue(acc.getOrDecode(waCoder), input), waCoder),
              (acc1, acc2) ->
                  ValueAndCoderLazySerializable.of(
                      sparkCombineFn.mergeCombiners(
                          acc1.getOrDecode(waCoder), acc2.getOrDecode(waCoder)),
                      waCoder));

  return accumulatedResult.mapToPair(
      i ->
          new Tuple2<>(
              CoderHelpers.fromByteArray(i._1.getValue(), keyCoder), i._2.getOrDecode(waCoder)));
}
Example 13
Source File: MultiReturnParameterizedBuiltinSPInstruction.java From systemds with Apache License 2.0 | 4 votes |
@Override
@SuppressWarnings("unchecked")
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;

    try {
        //get input RDD and meta data
        FrameObject fo = sec.getFrameObject(input1.getName());
        FrameObject fometa = sec.getFrameObject(_outputs.get(1).getName());
        JavaPairRDD<Long,FrameBlock> in = (JavaPairRDD<Long,FrameBlock>)
            sec.getRDDHandleForFrameObject(fo, InputInfo.BinaryBlockInputInfo);
        String spec = ec.getScalarInput(input2).getStringValue();
        DataCharacteristics mcIn = sec.getDataCharacteristics(input1.getName());
        DataCharacteristics mcOut = sec.getDataCharacteristics(output.getName());
        String[] colnames = !TfMetaUtils.isIDSpec(spec) ?
            in.lookup(1L).get(0).getColumnNames() : null;

        //step 1: build transform meta data
        Encoder encoderBuild = EncoderFactory.createEncoder(spec, colnames,
            fo.getSchema(), (int)fo.getNumColumns(), null);

        MaxLongAccumulator accMax = registerMaxLongAccumulator(sec.getSparkContext());
        JavaRDD<String> rcMaps = in
            .mapPartitionsToPair(new TransformEncodeBuildFunction(encoderBuild))
            .distinct().groupByKey()
            .flatMap(new TransformEncodeGroupFunction(accMax));
        if( containsMVImputeEncoder(encoderBuild) ) {
            EncoderMVImpute mva = getMVImputeEncoder(encoderBuild);
            rcMaps = rcMaps.union(
                in.mapPartitionsToPair(new TransformEncodeBuild2Function(mva))
                  .groupByKey().flatMap(new TransformEncodeGroup2Function(mva)) );
        }
        rcMaps.saveAsTextFile(fometa.getFileName()); //trigger eval

        //consolidate meta data frame (reuse multi-threaded reader, special handling missing values)
        FrameReader reader = FrameReaderFactory.createFrameReader(InputInfo.TextCellInputInfo);
        FrameBlock meta = reader.readFrameFromHDFS(fometa.getFileName(), accMax.value(), fo.getNumColumns());
        meta.recomputeColumnCardinality(); //recompute num distinct items per column
        meta.setColumnNames((colnames!=null)?colnames:meta.getColumnNames());

        //step 2: transform apply (similar to spark transformapply)
        //compute omit offset map for block shifts
        TfOffsetMap omap = null;
        if( TfMetaUtils.containsOmitSpec(spec, colnames) ) {
            omap = new TfOffsetMap(SparkUtils.toIndexedLong(in.mapToPair(
                new RDDTransformApplyOffsetFunction(spec, colnames)).collect()));
        }

        //create encoder broadcast (avoiding replication per task)
        Encoder encoder = EncoderFactory.createEncoder(spec, colnames,
            fo.getSchema(), (int)fo.getNumColumns(), meta);
        mcOut.setDimension(mcIn.getRows()-((omap!=null)?omap.getNumRmRows():0), encoder.getNumCols());
        Broadcast<Encoder> bmeta = sec.getSparkContext().broadcast(encoder);
        Broadcast<TfOffsetMap> bomap = (omap!=null) ? sec.getSparkContext().broadcast(omap) : null;

        //execute transform apply
        JavaPairRDD<Long,FrameBlock> tmp = in
            .mapToPair(new RDDTransformApplyFunction(bmeta, bomap));
        JavaPairRDD<MatrixIndexes,MatrixBlock> out = FrameRDDConverterUtils
            .binaryBlockToMatrixBlock(tmp, mcOut, mcOut);

        //set output and maintain lineage/output characteristics
        sec.setRDDHandleForVariable(_outputs.get(0).getName(), out);
        sec.addLineageRDD(_outputs.get(0).getName(), input1.getName());
        sec.setFrameOutput(_outputs.get(1).getName(), meta);
    }
    catch(IOException ex) {
        throw new RuntimeException(ex);
    }
}
Example 14
Source File: MultiReturnParameterizedBuiltinSPInstruction.java From systemds with Apache License 2.0 | 4 votes |
@Override
@SuppressWarnings("unchecked")
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;

    try {
        //get input RDD and meta data
        FrameObject fo = sec.getFrameObject(input1.getName());
        FrameObject fometa = sec.getFrameObject(_outputs.get(1).getName());
        JavaPairRDD<Long,FrameBlock> in = (JavaPairRDD<Long,FrameBlock>)
            sec.getRDDHandleForFrameObject(fo, FileFormat.BINARY);
        String spec = ec.getScalarInput(input2).getStringValue();
        DataCharacteristics mcIn = sec.getDataCharacteristics(input1.getName());
        DataCharacteristics mcOut = sec.getDataCharacteristics(output.getName());
        String[] colnames = !TfMetaUtils.isIDSpec(spec) ?
            in.lookup(1L).get(0).getColumnNames() : null;

        //step 1: build transform meta data
        Encoder encoderBuild = EncoderFactory.createEncoder(spec, colnames,
            fo.getSchema(), (int)fo.getNumColumns(), null);

        MaxLongAccumulator accMax = registerMaxLongAccumulator(sec.getSparkContext());
        JavaRDD<String> rcMaps = in
            .mapPartitionsToPair(new TransformEncodeBuildFunction(encoderBuild))
            .distinct().groupByKey()
            .flatMap(new TransformEncodeGroupFunction(accMax));
        if( containsMVImputeEncoder(encoderBuild) ) {
            EncoderMVImpute mva = getMVImputeEncoder(encoderBuild);
            rcMaps = rcMaps.union(
                in.mapPartitionsToPair(new TransformEncodeBuild2Function(mva))
                  .groupByKey().flatMap(new TransformEncodeGroup2Function(mva)) );
        }
        rcMaps.saveAsTextFile(fometa.getFileName()); //trigger eval

        //consolidate meta data frame (reuse multi-threaded reader, special handling missing values)
        FrameReader reader = FrameReaderFactory.createFrameReader(FileFormat.TEXT);
        FrameBlock meta = reader.readFrameFromHDFS(fometa.getFileName(), accMax.value(), fo.getNumColumns());
        meta.recomputeColumnCardinality(); //recompute num distinct items per column
        meta.setColumnNames((colnames!=null)?colnames:meta.getColumnNames());

        //step 2: transform apply (similar to spark transformapply)
        //compute omit offset map for block shifts
        TfOffsetMap omap = null;
        if( TfMetaUtils.containsOmitSpec(spec, colnames) ) {
            omap = new TfOffsetMap(SparkUtils.toIndexedLong(in.mapToPair(
                new RDDTransformApplyOffsetFunction(spec, colnames)).collect()));
        }

        //create encoder broadcast (avoiding replication per task)
        Encoder encoder = EncoderFactory.createEncoder(spec, colnames,
            fo.getSchema(), (int)fo.getNumColumns(), meta);
        mcOut.setDimension(mcIn.getRows()-((omap!=null)?omap.getNumRmRows():0), encoder.getNumCols());
        Broadcast<Encoder> bmeta = sec.getSparkContext().broadcast(encoder);
        Broadcast<TfOffsetMap> bomap = (omap!=null) ? sec.getSparkContext().broadcast(omap) : null;

        //execute transform apply
        JavaPairRDD<Long,FrameBlock> tmp = in
            .mapToPair(new RDDTransformApplyFunction(bmeta, bomap));
        JavaPairRDD<MatrixIndexes,MatrixBlock> out = FrameRDDConverterUtils
            .binaryBlockToMatrixBlock(tmp, mcOut, mcOut);

        //set output and maintain lineage/output characteristics
        sec.setRDDHandleForVariable(_outputs.get(0).getName(), out);
        sec.addLineageRDD(_outputs.get(0).getName(), input1.getName());
        sec.setFrameOutput(_outputs.get(1).getName(), meta);
    }
    catch(IOException ex) {
        throw new RuntimeException(ex);
    }
}
Example 15
Source File: TestSequenceRecordReaderBytesFunction.java From DataVec with Apache License 2.0 | 4 votes |
@Test
public void testRecordReaderBytesFunction() throws Exception {
    //Local file path
    ClassPathResource cpr = new ClassPathResource("/video/shapes_0.mp4");
    String path = cpr.getFile().getAbsolutePath();
    String folder = path.substring(0, path.length() - 12);
    path = folder + "*";

    //Load binary data from local file system, convert to a sequence file:
    //Load and convert
    JavaPairRDD<String, PortableDataStream> origData = sc.binaryFiles(path);
    JavaPairRDD<Text, BytesWritable> filesAsBytes = origData.mapToPair(new FilesAsBytesFunction());
    //Write the sequence file:
    Path p = Files.createTempDirectory("dl4j_rrbytesTest");
    p.toFile().deleteOnExit();
    String outPath = p.toString() + "/out";
    filesAsBytes.saveAsNewAPIHadoopFile(outPath, Text.class, BytesWritable.class, SequenceFileOutputFormat.class);

    //Load data from sequence file, parse via SequenceRecordReader:
    JavaPairRDD<Text, BytesWritable> fromSeqFile = sc.sequenceFile(outPath, Text.class, BytesWritable.class);
    SequenceRecordReader seqRR = new CodecRecordReader();
    Configuration conf = new Configuration();
    conf.set(CodecRecordReader.RAVEL, "true");
    conf.set(CodecRecordReader.START_FRAME, "0");
    conf.set(CodecRecordReader.TOTAL_FRAMES, "25");
    conf.set(CodecRecordReader.ROWS, "64");
    conf.set(CodecRecordReader.COLUMNS, "64");
    Configuration confCopy = new Configuration(conf);
    seqRR.setConf(conf);
    JavaRDD<List<List<Writable>>> dataVecData = fromSeqFile.map(new SequenceRecordReaderBytesFunction(seqRR));

    //Next: do the same thing locally, and compare the results
    InputSplit is = new FileSplit(new File(folder), new String[] {"mp4"}, true);
    SequenceRecordReader srr = new CodecRecordReader();
    srr.initialize(is);
    srr.setConf(confCopy);

    List<List<List<Writable>>> list = new ArrayList<>(4);
    while (srr.hasNext()) {
        list.add(srr.sequenceRecord());
    }
    assertEquals(4, list.size());

    List<List<List<Writable>>> fromSequenceFile = dataVecData.collect();

    assertEquals(4, list.size());
    assertEquals(4, fromSequenceFile.size());

    boolean[] found = new boolean[4];
    for (int i = 0; i < 4; i++) {
        int foundIndex = -1;
        List<List<Writable>> collection = fromSequenceFile.get(i);
        for (int j = 0; j < 4; j++) {
            if (collection.equals(list.get(j))) {
                if (foundIndex != -1)
                    fail(); //Already found this value -> suggests this spark value equals two or more of local version? (Shouldn't happen)
                foundIndex = j;
                if (found[foundIndex])
                    fail(); //One of the other spark values was equal to this one -> suggests duplicates in Spark list
                found[foundIndex] = true; //mark this one as seen before
            }
        }
    }
    int count = 0;
    for (boolean b : found)
        if (b)
            count++;
    assertEquals(4, count); //Expect all 4 and exactly 4 pairwise matches between spark and local versions
}
Example 16
Source File: SparkStorageUtils.java From deeplearning4j with Apache License 2.0 | 3 votes |
/**
 * Save a {@code JavaRDD<List<Writable>>} to a Hadoop {@link org.apache.hadoop.io.MapFile}. Each record is
 * given a <i>unique and contiguous</i> {@link LongWritable} key, and values are stored as
 * {@link RecordWritable} instances.<br>
 * <b>Note</b>: If contiguous keys are not required, using a sequence file instead is preferable from a performance
 * point of view. Contiguous keys are often only required for non-Spark use cases, such as with
 * {@link org.datavec.hadoop.records.reader.mapfile.MapFileRecordReader}
 * <p>
 * Use {@link #restoreMapFileSequences(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the MapFile
 * @param rdd            RDD to save
 * @param c              Configuration object, used to customise options for the map file
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified size (number of partitions)
 *                       to limit the maximum number of output map files
 * @see #saveMapFileSequences(String, JavaRDD)
 * @see #saveSequenceFile(String, JavaRDD)
 */
public static void saveMapFile(String path, JavaRDD<List<Writable>> rdd, Configuration c, Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    JavaPairRDD<List<Writable>, Long> dataIndexPairs = rdd.zipWithIndex(); //Note: Long values are unique + contiguous, but requires a count
    JavaPairRDD<LongWritable, RecordWritable> keyedByIndex =
            dataIndexPairs.mapToPair(new RecordSavePrepPairFunction());

    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, RecordWritable.class, MapFileOutputFormat.class, c);
}
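A minimal usage sketch for the method above; the path, the Configuration, and the records RDD are illustrative assumptions.

// Hypothetical path; `records` is assumed to be an existing JavaRDD<List<Writable>>.
Configuration conf = new Configuration();
SparkStorageUtils.saveMapFile("/data/records-mapfile", records, conf, null); // null: no coalesce before writing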
Example 17
Source File: ThresholdClusterer.java From ensemble-clustering with MIT License | 2 votes |
@Override
public SparkClusterResult doCluster(DataSet ds) {
    // SparkDataSet needs to be passed in
    SparkDataSet rdd = (SparkDataSet)ds;

    // cache dataset in memory
    // rdd.getRDD().cache();

    distFunc = new DistanceFunction(this.typeDefs);
    ClusterFactory clusterFactory = new ClusterFactory(this.typeDefs, this.onlineUpdate);

    log.info("Starting threshold clusterer with threshold {}", threshold);

    // TODO look at using a reduce function
    // Idea is the first step is a map<Instance, List<Instance>> that converts each instance to a single "cluster"
    // second step is a reduce where input is a List<Instances> and produces a List<Instances>
    // this step would merge clusters within threshold

    JavaPairRDD<String, Instance> instances = rdd.getRDD();
    instances.cache();

    // convert each instance into a singleton cluster
    JavaRDD<Map<String, Instance>> singletons = rdd.getRDD().map( new InstanceToClusterFunction(clusterFactory) );
    //singletons.cache();

    log.info("Generated initial singleton clusters");

    // merge clusters together
    Map<String, Instance> clusters = singletons.reduce( new AggregateClusterFunction(distFunc, threshold) );

    log.info("Merging clusters completed with {} clusters", clusters.size());

    // find the best cluster for each instance
    JavaPairRDD<String, Instance> bestCluster = instances.mapToPair( new BestClusterFunction(distFunc, clusters) );

    log.info("Output results");

    if (clusters != null && centroidsPath != null)
        rdd.getContext().parallelize(new ArrayList<Instance>(clusters.values())).saveAsTextFile(centroidsPath);

    if (bestCluster != null && clustersPath != null)
        bestCluster.saveAsTextFile(clustersPath);

    log.info("Threshold clusterer completed");

    // return the cluster membership rdd
    return new SparkClusterResult(bestCluster);
}