Java Code Examples for org.apache.spark.api.java.JavaSparkContext#emptyRDD()
The following examples show how to use org.apache.spark.api.java.JavaSparkContext#emptyRDD().
The original project and source file are noted above each example.
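Before the project examples, here is a minimal, self-contained sketch of the pattern both examples rely on: return jsc.emptyRDD() instead of null when there is no input, so callers can chain transformations and actions without null checks. The class and helper names below are illustrative, not taken from either project.

    import java.util.Arrays;
    import java.util.List;

    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;

    public class EmptyRddSketch {

      // Hypothetical helper: returns an empty RDD rather than null
      // when there is nothing to load, so callers never need a null check.
      static JavaRDD<String> loadLines(JavaSparkContext jsc, List<String> lines) {
        if (lines == null || lines.isEmpty()) {
          return jsc.emptyRDD();
        }
        return jsc.parallelize(lines);
      }

      public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("emptyRDD-sketch").setMaster("local[*]");
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
          System.out.println(loadLines(jsc, null).count());                    // prints 0
          System.out.println(loadLines(jsc, Arrays.asList("a", "b")).count()); // prints 2
        }
      }
    }

Calling count() on the empty RDD returns 0, and any transformations chained onto it are simply no-ops.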
Example 1
Source File: HoodieMergeOnReadTableCompactor.java (from hudi, Apache License 2.0)
@Override
public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, HoodieCompactionPlan compactionPlan,
    HoodieTable hoodieTable, HoodieWriteConfig config, String compactionInstantTime) throws IOException {
  if (compactionPlan == null || (compactionPlan.getOperations() == null)
      || (compactionPlan.getOperations().isEmpty())) {
    return jsc.emptyRDD();
  }
  HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
  // Compacting is very similar to applying updates to existing file
  HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc.hadoopConfiguration(), metaClient);
  List<CompactionOperation> operations = compactionPlan.getOperations().stream()
      .map(CompactionOperation::convertFromAvroRecordInstance).collect(toList());
  LOG.info("Compactor compacting " + operations + " files");
  return jsc.parallelize(operations, operations.size())
      .map(s -> compact(table, metaClient, config, s, compactionInstantTime))
      .flatMap(List::iterator);
}
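Note the guard at the top of the method: when the compaction plan is missing or contains no operations, the compactor returns jsc.emptyRDD() so callers always receive a valid (possibly empty) JavaRDD<WriteStatus> rather than null.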
Example 2
Source File: TransformTranslator.java (from beam, Apache License 2.0)
private static <InputT, AccumT, OutputT>
    TransformEvaluator<Combine.Globally<InputT, OutputT>> combineGlobally() {
  return new TransformEvaluator<Combine.Globally<InputT, OutputT>>() {

    @Override
    public void evaluate(Combine.Globally<InputT, OutputT> transform, EvaluationContext context) {
      final PCollection<InputT> input = context.getInput(transform);
      final Coder<InputT> iCoder = context.getInput(transform).getCoder();
      final Coder<OutputT> oCoder = context.getOutput(transform).getCoder();
      final WindowingStrategy<?, ?> windowingStrategy = input.getWindowingStrategy();
      @SuppressWarnings("unchecked")
      final CombineWithContext.CombineFnWithContext<InputT, AccumT, OutputT> combineFn =
          (CombineWithContext.CombineFnWithContext<InputT, AccumT, OutputT>)
              CombineFnUtil.toFnWithContext(transform.getFn());
      final WindowedValue.FullWindowedValueCoder<OutputT> wvoCoder =
          WindowedValue.FullWindowedValueCoder.of(
              oCoder, windowingStrategy.getWindowFn().windowCoder());
      final boolean hasDefault = transform.isInsertDefault();

      final SparkCombineFn<InputT, InputT, AccumT, OutputT> sparkCombineFn =
          SparkCombineFn.globally(
              combineFn,
              context.getSerializableOptions(),
              TranslationUtils.getSideInputs(transform.getSideInputs(), context),
              windowingStrategy);

      final Coder<AccumT> aCoder;
      try {
        aCoder = combineFn.getAccumulatorCoder(context.getPipeline().getCoderRegistry(), iCoder);
      } catch (CannotProvideCoderException e) {
        throw new IllegalStateException("Could not determine coder for accumulator", e);
      }

      @SuppressWarnings("unchecked")
      JavaRDD<WindowedValue<InputT>> inRdd =
          ((BoundedDataset<InputT>) context.borrowDataset(transform)).getRDD();

      JavaRDD<WindowedValue<OutputT>> outRdd;
      SparkCombineFn.WindowedAccumulator<InputT, InputT, AccumT, ?> accumulated =
          GroupCombineFunctions.combineGlobally(inRdd, sparkCombineFn, aCoder, windowingStrategy);
      if (!accumulated.isEmpty()) {
        Iterable<WindowedValue<OutputT>> output = sparkCombineFn.extractOutput(accumulated);
        outRdd =
            context
                .getSparkContext()
                .parallelize(CoderHelpers.toByteArrays(output, wvoCoder))
                .map(CoderHelpers.fromByteFunction(wvoCoder));
      } else {
        // handle empty input RDD, which will naturally skip the entire execution
        // as Spark will not run on empty RDDs.
        JavaSparkContext jsc = new JavaSparkContext(inRdd.context());
        if (hasDefault) {
          OutputT defaultValue = combineFn.defaultValue();
          outRdd =
              jsc.parallelize(Lists.newArrayList(CoderHelpers.toByteArray(defaultValue, oCoder)))
                  .map(CoderHelpers.fromByteFunction(oCoder))
                  .map(WindowedValue::valueInGlobalWindow);
        } else {
          outRdd = jsc.emptyRDD();
        }
      }
      context.putDataset(transform, new BoundedDataset<>(outRdd));
    }

    @Override
    public String toNativeString() {
      return "aggregate(..., new <fn>(), ...)";
    }
  };
}
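Here emptyRDD() covers the case where the input RDD is empty and the Combine.Globally has no default value: as the inline comment notes, Spark will not run on empty RDDs, so the translator must still register an explicitly empty output dataset for downstream transforms instead of leaving the output undefined.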