Java Code Examples for org.apache.spark.api.java.JavaRDD#sortBy()
The following examples show how to use org.apache.spark.api.java.JavaRDD#sortBy().
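For reference, the method has the signature JavaRDD#sortBy(f, ascending, numPartitions): it returns a new RDD sorted by the key that the function f extracts from each element. Below is a minimal, self-contained sketch of a plain sortBy call (the class name and sample data are illustrative, not taken from any of the projects below):

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class SortByExample {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("sortBy-example").setMaster("local[*]");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
      JavaRDD<String> words = sc.parallelize(Arrays.asList("pear", "fig", "apple", "banana"));
      // sortBy(keyFunc, ascending, numPartitions): sorts by the extracted key,
      // here the word length, in ascending order, into 2 output partitions.
      JavaRDD<String> byLength = words.sortBy(String::length, true, 2);
      System.out.println(byLength.collect()); // [fig, pear, apple, banana]
    }
  }
}

Note that sortBy triggers a shuffle; the numPartitions argument controls the partitioning of the sorted output, which is why several of the examples below compute it from the expected data size rather than hard-coding it.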
Example 1
Source File: RDDSortUtils.java From systemds with Apache License 2.0
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sortByVal( JavaPairRDD<MatrixIndexes, MatrixBlock> in,
        long rlen, int blen )
{
    //create value-index rdd from inputs
    JavaRDD<Double> dvals = in.values()
        .flatMap(new ExtractDoubleValuesFunction());

    //sort (creates sorted range per partition)
    long hdfsBlocksize = InfrastructureAnalyzer.getHDFSBlockSize();
    int numPartitions = (int)Math.ceil(((double)rlen*8)/hdfsBlocksize);
    JavaRDD<Double> sdvals = dvals
        .sortBy(new CreateDoubleKeyFunction(), true, numPartitions);

    //create binary block output
    JavaPairRDD<MatrixIndexes, MatrixBlock> ret = sdvals
        .zipWithIndex()
        .mapPartitionsToPair(new ConvertToBinaryBlockFunction(rlen, blen));
    ret = RDDAggregateUtils.mergeByKey(ret, false);

    return ret;
}
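Examples 1 through 3 (all from systemds' RDDSortUtils) follow the same pattern: sortBy produces a globally ordered RDD, zipWithIndex attaches each value's global position, and a mapPartitionsToPair function uses that position to write values back into matrix blocks. Note also how numPartitions is derived here: rlen values at 8 bytes per double, divided by the HDFS block size, so each sorted partition holds roughly one HDFS block. A stripped-down sketch of just the sort-then-index step, with the systemds-specific block conversion omitted (class name and sample data are illustrative):

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class SortThenIndex {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("sort-then-index").setMaster("local[*]");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
      JavaRDD<Double> dvals = sc.parallelize(Arrays.asList(3.0, 1.0, 2.0));
      // Global sort; the identity function plays the role of CreateDoubleKeyFunction.
      JavaRDD<Double> sorted = dvals.sortBy(v -> v, true, 2);
      // Attach each sorted value's global position (0-based).
      JavaPairRDD<Double, Long> indexed = sorted.zipWithIndex();
      System.out.println(indexed.collect()); // [(1.0,0), (2.0,1), (3.0,2)]
    }
  }
}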
Example 2
Source File: RDDSortUtils.java From systemds with Apache License 2.0
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sortByVal( JavaPairRDD<MatrixIndexes, MatrixBlock> in,
        JavaPairRDD<MatrixIndexes, MatrixBlock> in2, long rlen, int blen )
{
    //create value-index rdd from inputs
    JavaRDD<DoublePair> dvals = in.join(in2).values()
        .flatMap(new ExtractDoubleValuesFunction2());

    //sort (creates sorted range per partition)
    long hdfsBlocksize = InfrastructureAnalyzer.getHDFSBlockSize();
    int numPartitions = (int)Math.ceil(((double)rlen*8)/hdfsBlocksize);
    JavaRDD<DoublePair> sdvals = dvals
        .sortBy(new CreateDoubleKeyFunction2(), true, numPartitions);

    //create binary block output
    JavaPairRDD<MatrixIndexes, MatrixBlock> ret = sdvals
        .zipWithIndex()
        .mapPartitionsToPair(new ConvertToBinaryBlockFunction2(rlen, blen));
    ret = RDDAggregateUtils.mergeByKey(ret, false);

    return ret;
}
Example 3
Source File: RDDSortUtils.java From systemds with Apache License 2.0
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sortByVals( JavaPairRDD<MatrixIndexes, MatrixBlock> in,
        long rlen, long clen, int blen )
{
    //create value-index rdd from inputs
    JavaRDD<MatrixBlock> dvals = in.values()
        .flatMap(new ExtractRowsFunction());

    //sort (creates sorted range per partition)
    int numPartitions = SparkUtils.getNumPreferredPartitions(
        new MatrixCharacteristics(rlen, clen, blen, blen), in);
    JavaRDD<MatrixBlock> sdvals = dvals
        .sortBy(new CreateDoubleKeysFunction(), true, numPartitions);

    //create binary block output
    JavaPairRDD<MatrixIndexes, MatrixBlock> ret = sdvals
        .zipWithIndex()
        .mapPartitionsToPair(new ConvertToBinaryBlockFunction5(rlen, blen));
    ret = RDDAggregateUtils.mergeByKey(ret, false);

    return ret;
}
Example 4
Source File: HoodieBloomIndex.java From hudi with Apache License 2.0
/**
 * Find out <RowKey, filename> pair. All workload grouped by file-level.
 * <p>
 * Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) & then repartition such that each RDD
 * partition is a file, then for each file, we do (1) load bloom filter, (2) load rowKeys, (3) Tag rowKey
 * <p>
 * Make sure the parallelism is at least the groupBy parallelism for tagging location.
 */
JavaPairRDD<HoodieKey, HoodieRecordLocation> findMatchingFilesForRecordKeys(
    final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
    JavaPairRDD<String, String> partitionRecordKeyPairRDD, int shuffleParallelism, HoodieTable hoodieTable,
    Map<String, Long> fileGroupToComparisons) {
  JavaRDD<Tuple2<String, HoodieKey>> fileComparisonsRDD =
      explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD);

  if (config.useBloomIndexBucketizedChecking()) {
    Partitioner partitioner = new BucketizedBloomCheckPartitioner(shuffleParallelism, fileGroupToComparisons,
        config.getBloomIndexKeysPerBucket());

    fileComparisonsRDD = fileComparisonsRDD.mapToPair(t -> new Tuple2<>(Pair.of(t._1, t._2.getRecordKey()), t))
        .repartitionAndSortWithinPartitions(partitioner).map(Tuple2::_2);
  } else {
    fileComparisonsRDD = fileComparisonsRDD.sortBy(Tuple2::_1, true, shuffleParallelism);
  }

  return fileComparisonsRDD.mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(hoodieTable, config), true)
      .flatMap(List::iterator).filter(lr -> lr.getMatchingRecordKeys().size() > 0)
      .flatMapToPair(lookupResult -> lookupResult.getMatchingRecordKeys().stream()
          .map(recordKey -> new Tuple2<>(new HoodieKey(recordKey, lookupResult.getPartitionPath()),
              new HoodieRecordLocation(lookupResult.getBaseInstantTime(), lookupResult.getFileId())))
          .collect(Collectors.toList()).iterator());
}
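This example chooses between two sort strategies: with bucketized checking enabled, repartitionAndSortWithinPartitions sorts only within the partitions defined by a custom partitioner (no global order needed), while the fallback branch uses sortBy to globally sort the (fileName, key) tuples so that all lookups against one file are adjacent. A reduced sketch of that fallback branch (the class name and sample data are made up; this is not Hudi's code):

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class SortComparisonsByFile {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("sort-by-file").setMaster("local[*]");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
      JavaRDD<Tuple2<String, String>> comparisons = sc.parallelize(Arrays.asList(
          new Tuple2<>("file-2", "key-a"),
          new Tuple2<>("file-1", "key-b"),
          new Tuple2<>("file-1", "key-c")));
      // Same call shape as the else-branch above: sort by the tuple's first element,
      // so all comparisons against the same file end up next to each other.
      JavaRDD<Tuple2<String, String>> sorted = comparisons.sortBy(Tuple2::_1, true, 2);
      System.out.println(sorted.collect()); // file-1 pairs first, then file-2
    }
  }
}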
Example 5
Source File: ALSUpdate.java From oryx with Apache License 2.0
private static JavaPairRDD<String,Collection<String>> knownsRDD(JavaRDD<String[]> allData,
                                                                boolean knownItems) {
  JavaRDD<String[]> sorted = allData.sortBy(datum -> Long.valueOf(datum[3]), true, allData.partitions().size());

  JavaPairRDD<String,Tuple2<String,Boolean>> tuples = sorted.mapToPair(datum -> {
      String user = datum[0];
      String item = datum[1];
      Boolean delete = datum[2].isEmpty();
      return knownItems ?
          new Tuple2<>(user, new Tuple2<>(item, delete)) :
          new Tuple2<>(item, new Tuple2<>(user, delete));
    });

  // TODO likely need to figure out a way to avoid groupByKey but collectByKey
  // won't work here -- doesn't guarantee enough about ordering

  return tuples.groupByKey().mapValues(idDeletes -> {
      Collection<String> ids = new HashSet<>();
      for (Tuple2<String,Boolean> idDelete : idDeletes) {
        if (idDelete._2()) {
          ids.remove(idDelete._1());
        } else {
          ids.add(idDelete._1());
        }
      }
      return ids;
    });
}
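Here sortBy orders the raw events by the timestamp in field 3 before grouping, so that within each group the add/delete events are folded in (approximately) chronological order; the TODO acknowledges that groupByKey does not strictly guarantee this ordering is preserved. A reduced sketch of just the sort step (the field layout [user, item, value, timestamp] mirrors the example, where an empty value marks a delete; the sample data is invented):

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class SortEventsByTimestamp {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("sort-events").setMaster("local[*]");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
      JavaRDD<String[]> allData = sc.parallelize(Arrays.asList(
          new String[] {"u1", "i1", "", "1700000002"},
          new String[] {"u1", "i1", "1", "1700000001"}));
      // Parse the timestamp field as the numeric sort key, keeping the
      // existing partition count, just as in the example above.
      JavaRDD<String[]> sorted =
          allData.sortBy(datum -> Long.valueOf(datum[3]), true, allData.partitions().size());
      sorted.collect().forEach(d -> System.out.println(String.join(",", d)));
      // the 1700000001 event prints before the 1700000002 event
    }
  }
}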
Example 6
Source File: BulkInsertHelper.java From hudi with Apache License 2.0
public static <T extends HoodieRecordPayload<T>> HoodieWriteMetadata bulkInsert(
    JavaRDD<HoodieRecord<T>> inputRecords, String instantTime,
    HoodieTable<T> table, HoodieWriteConfig config,
    CommitActionExecutor<T> executor, boolean performDedupe,
    Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) {
  HoodieWriteMetadata result = new HoodieWriteMetadata();

  // De-dupe/merge if needed
  JavaRDD<HoodieRecord<T>> dedupedRecords = inputRecords;

  if (performDedupe) {
    dedupedRecords = WriteHelper.combineOnCondition(config.shouldCombineBeforeInsert(), inputRecords,
        config.getInsertShuffleParallelism(), ((HoodieTable<T>)table));
  }

  final JavaRDD<HoodieRecord<T>> repartitionedRecords;
  final int parallelism = config.getBulkInsertShuffleParallelism();
  if (bulkInsertPartitioner.isPresent()) {
    repartitionedRecords = bulkInsertPartitioner.get().repartitionRecords(dedupedRecords, parallelism);
  } else {
    // Now, sort the records and line them up nicely for loading.
    repartitionedRecords = dedupedRecords.sortBy(record -> {
      // Let's use "partitionPath + key" as the sort key. Spark will ensure
      // the records split evenly across RDD partitions, such that small partitions fit
      // into 1 RDD partition, while big ones spread evenly across multiple RDD partitions
      return String.format("%s+%s", record.getPartitionPath(), record.getRecordKey());
    }, true, parallelism);
  }

  // generate new file ID prefixes for each output partition
  final List<String> fileIDPrefixes =
      IntStream.range(0, parallelism).mapToObj(i -> FSUtils.createNewFileIdPfx()).collect(Collectors.toList());

  table.getActiveTimeline().transitionRequestedToInflight(new HoodieInstant(State.REQUESTED,
      table.getMetaClient().getCommitActionType(), instantTime), Option.empty(),
      config.shouldAllowMultiWriteOnSameInstant());

  JavaRDD<WriteStatus> writeStatusRDD = repartitionedRecords
      .mapPartitionsWithIndex(new BulkInsertMapFunction<T>(instantTime, config, table, fileIDPrefixes), true)
      .flatMap(List::iterator);

  executor.updateIndexAndCommitIfNeeded(writeStatusRDD, result);
  return result;
}
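When no custom partitioner is supplied, this example sorts by the concatenation of partition path and record key, which clusters records of the same storage partition into contiguous RDD partitions before writing. A minimal sketch of that composite-key sort, with a hypothetical Record class standing in for HoodieRecord (names and data are illustrative, not Hudi's types):

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class CompositeKeySort {
  // Hypothetical stand-in for HoodieRecord; must be Serializable for the shuffle.
  static class Record implements java.io.Serializable {
    final String partitionPath, recordKey;
    Record(String p, String k) { partitionPath = p; recordKey = k; }
    @Override public String toString() { return partitionPath + "/" + recordKey; }
  }

  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("composite-sort").setMaster("local[*]");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
      JavaRDD<Record> records = sc.parallelize(Arrays.asList(
          new Record("2020/01/02", "k1"),
          new Record("2020/01/01", "k2"),
          new Record("2020/01/01", "k1")));
      // Sort by "partitionPath + key" as in the else-branch above, so records
      // for the same partition path land next to each other in the output.
      JavaRDD<Record> sorted = records.sortBy(
          r -> String.format("%s+%s", r.partitionPath, r.recordKey), true, 2);
      System.out.println(sorted.collect());
      // [2020/01/01/k1, 2020/01/01/k2, 2020/01/02/k1]
    }
  }
}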