Java Code Examples for org.apache.spark.api.java.JavaRDD#union()
The following examples show how to use org.apache.spark.api.java.JavaRDD#union().
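Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) showing the basic behavior of union(): it concatenates two JavaRDDs of the same element type and does not remove duplicates. The class and variable names are illustrative only.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class UnionSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("union-sketch");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<String> first = sc.parallelize(Arrays.asList("a", "b", "c"));
            JavaRDD<String> second = sc.parallelize(Arrays.asList("c", "d"));

            // union() simply concatenates the two RDDs; the duplicate "c" is kept.
            JavaRDD<String> combined = first.union(second);

            System.out.println(combined.collect()); // e.g. [a, b, c, c, d]
        }
    }
}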
Example 1
Source File: RemoveOrphanFilesAction.java From iceberg with Apache License 2.0
private Dataset<Row> buildActualFileDF() {
  List<String> subDirs = Lists.newArrayList();
  List<String> matchingFiles = Lists.newArrayList();

  Predicate<FileStatus> predicate = file -> file.getModificationTime() < olderThanTimestamp;

  // list at most 3 levels and only dirs that have less than 10 direct sub dirs on the driver
  listDirRecursively(location, predicate, hadoopConf.value(), 3, 10, subDirs, matchingFiles);

  JavaRDD<String> matchingFileRDD = sparkContext.parallelize(matchingFiles, 1);

  if (subDirs.isEmpty()) {
    return spark.createDataset(matchingFileRDD.rdd(), Encoders.STRING()).toDF("file_path");
  }

  int parallelism = Math.min(subDirs.size(), partitionDiscoveryParallelism);
  JavaRDD<String> subDirRDD = sparkContext.parallelize(subDirs, parallelism);

  Broadcast<SerializableConfiguration> conf = sparkContext.broadcast(hadoopConf);
  JavaRDD<String> matchingLeafFileRDD = subDirRDD.mapPartitions(listDirsRecursively(conf, olderThanTimestamp));

  JavaRDD<String> completeMatchingFileRDD = matchingFileRDD.union(matchingLeafFileRDD);
  return spark.createDataset(completeMatchingFileRDD.rdd(), Encoders.STRING()).toDF("file_path");
}
Example 2
Source File: Union.java From SparkDemo with MIT License
static void union(JavaSparkContext sc) {
  List<String> datas1 = Arrays.asList("张三", "李四");
  List<String> datas2 = Arrays.asList("tom", "gim");

  JavaRDD<String> data1RDD = sc.parallelize(datas1);
  JavaRDD<String> data2RDD = sc.parallelize(datas2);

  /**
   * ====================================================================
   * | Merges two RDDs without removing duplicates; both RDDs must have |
   * | elements of the same type.                                       |
   * ====================================================================
   */
  JavaRDD<String> unionRDD = data1RDD.union(data2RDD);

  unionRDD.foreach(new VoidFunction<String>() {
    @Override
    public void call(String t) throws Exception {
      System.out.println(t);
    }
  });

  sc.close();
}
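As the comment in this example notes, union() keeps duplicates. A hypothetical variant (not part of the SparkDemo project) that chains distinct() after the union gives set semantics instead:

// Hypothetical variant of the example above: union() followed by distinct()
// drops the duplicate elements that union() alone would keep.
JavaRDD<String> deduplicated = data1RDD.union(data2RDD).distinct();
deduplicated.foreach(t -> System.out.println(t));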
Example 3
Source File: SparkOperatorProfiler.java From rheem with Apache License 2.0
/**
 * Helper method to generate data quanta and provide them as a cached {@link JavaRDD}.
 */
protected <T> JavaRDD<T> prepareInputRddInDriver(long cardinality, int inputIndex) {
  @SuppressWarnings("unchecked")
  final Supplier<T> supplier = (Supplier<T>) this.dataQuantumGenerators.get(inputIndex);
  JavaRDD<T> finalInputRdd = null;

  // Create batches, parallelize them, and union them.
  long remainder = cardinality;
  do {
    int batchSize = (int) Math.min(remainder, this.dataQuantumGeneratorBatchSize);
    List<T> batch = new ArrayList<>(batchSize);
    while (batch.size() < batchSize) {
      batch.add(supplier.get());
    }
    final JavaRDD<T> batchRdd = this.sparkExecutor.sc.parallelize(batch);
    finalInputRdd = finalInputRdd == null ? batchRdd : finalInputRdd.union(batchRdd);
    remainder -= batchSize;
  } while (remainder > 0);

  // Shuffle and cache the RDD.
  final JavaRDD<T> cachedInputRdd = this.partition(finalInputRdd).cache();
  cachedInputRdd.foreach(dataQuantum -> { });

  return cachedInputRdd;
}
Example 4
Source File: GATKSparkTool.java From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Loads the reads into a {@link JavaRDD} using the intervals specified, and returns them
 * without applying any filtering.
 *
 * If no intervals were specified, returns all the reads (both mapped and unmapped).
 *
 * @return all reads from our reads input(s) as a {@link JavaRDD}, bounded by intervals if specified, and unfiltered.
 */
public JavaRDD<GATKRead> getUnfilteredReads() {
  final TraversalParameters traversalParameters;
  if ( hasUserSuppliedIntervals() ) { // intervals may have been supplied by editIntervals
    final boolean traverseUnmapped;
    if (intervalArgumentCollection.intervalsSpecified()) {
      traverseUnmapped = intervalArgumentCollection.getTraversalParameters(getHeaderForReads().getSequenceDictionary()).traverseUnmappedReads();
    } else {
      traverseUnmapped = false;
    }
    traversalParameters = new TraversalParameters(getIntervals(), traverseUnmapped);
  } else {
    traversalParameters = null;
  }

  JavaRDD<GATKRead> output = null;
  ReadsSparkSource source = readsSource;
  for (final GATKPath inputPathSpecifier : readInputs.keySet()) {
    if (output == null) {
      output = getGatkReadJavaRDD(traversalParameters, source, inputPathSpecifier);
    } else {
      output = output.union(getGatkReadJavaRDD(traversalParameters, source, inputPathSpecifier));
    }
  }
  return output;
}
Example 5
Source File: PSScorer.java From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Moves reads from the same read template into an Iterable.
 * Paired reads must be queryname-sorted, and no pair of reads can be split across partitions.
 */
static JavaRDD<Iterable<GATKRead>> groupReadsIntoPairs(final JavaRDD<GATKRead> pairedReads,
                                                       final JavaRDD<GATKRead> unpairedReads,
                                                       final int readsPerPartitionGuess) {
  JavaRDD<Iterable<GATKRead>> groupedReads;
  if (pairedReads != null) {
    groupedReads = pairedReads.mapPartitions(iter -> groupPairedReadsPartition(iter, readsPerPartitionGuess));
    if (unpairedReads != null) {
      groupedReads = groupedReads.union(unpairedReads.map(Collections::singletonList));
    }
  } else if (unpairedReads != null) {
    groupedReads = unpairedReads.map(Collections::singletonList);
  } else {
    throw new UserException.BadInput("No reads were loaded. Ensure --paired-input and/or --unpaired-input are set and valid.");
  }
  return groupedReads;
}
Example 6
Source File: MLUpdate.java From oryx with Apache License 2.0
private Pair<JavaRDD<M>,JavaRDD<M>> splitTrainTest(JavaRDD<M> newData, JavaRDD<M> pastData) {
  Objects.requireNonNull(newData);
  if (testFraction <= 0.0) {
    return new Pair<>(pastData == null ? newData : newData.union(pastData), null);
  }
  if (testFraction >= 1.0) {
    return new Pair<>(pastData, newData);
  }
  if (empty(newData)) {
    return new Pair<>(pastData, null);
  }
  Pair<JavaRDD<M>,JavaRDD<M>> newTrainTest = splitNewDataToTrainTest(newData);
  JavaRDD<M> newTrainData = newTrainTest.getFirst();
  return new Pair<>(pastData == null ? newTrainData : newTrainData.union(pastData),
      newTrainTest.getSecond());
}
Example 7
Source File: TransformationRDD.java From hui-bigdata-spark with Apache License 2.0
/**
 * Set union.
 * Purpose of this demo: find all records whose entry station is Guangzhou South Railway Station or Tianhe Coach Terminal.
 *
 * @since hui_project 1.0.0
 */
public void testUnionAndFilter() {
  SparkConf sparkConf = new SparkConf().setMaster("local[4]").setAppName("test");
  JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
  JavaRDD<String> textRDD = sparkContext.textFile(FILE_PATH);
  JavaRDD<String> result = textRDD.filter(x -> x.contains("广州南站"));
  JavaRDD<String> result1 = textRDD.filter(x -> x.contains("天河客运站"));
  JavaRDD<String> union = result.union(result1);
  System.out.println("-------" + union.count() + "-------");
  checkResult(union.collect());
}
Example 8
Source File: TransformationRDDTest.java From hui-bigdata-spark with Apache License 2.0
/**
 * Set union.
 * Purpose of this demo: find all records whose entry station is Guangzhou South Railway Station or Tianhe Coach Terminal.
 *
 * @since hui_project 1.0.0
 */
@Test
public void testUnionAndFilter() {
  JavaRDD<String> textRDD = sparkContext.textFile(FILE_PATH);
  JavaRDD<String> result = textRDD.filter(x -> x.contains("广州南站"));
  JavaRDD<String> result1 = textRDD.filter(x -> x.contains("天河客运站"));
  JavaRDD<String> union = result.union(result1);
  System.out.println("-------" + union.count() + "-------");
  checkResult(union.collect());
}
Example 9
Source File: QuadUtils.java From rdf2x with Apache License 2.0
/**
 * Get resources related to specified resources, computed by querying an in-memory set of subjects
 *
 * @param quads       RDD of quads to filter
 * @param subjectURIs set of requested subject URIs to grow from
 * @param directed    whether to use both directions of relations
 * @return URIs of resources related to specified resources
 */
public static JavaRDD<String> getNeighborResources(JavaRDD<Quad> quads, Set<String> subjectURIs, boolean directed) {
  JavaRDD<String> neighbors = filterQuadsByAllowedSubjects(quads, subjectURIs)
      .filter(quad -> quad.getObject().isURI())
      .map(quad -> quad.getObject().getURI());
  if (!directed) {
    neighbors = neighbors.union(filterQuadsByObjects(quads, subjectURIs)
        .filter(quad -> quad.getSubject().isURI())
        .map(quad -> quad.getSubject().getURI()));
  }
  return neighbors;
}
Example 10
Source File: SparkFileInputStream.java From incubator-retired-mrql with Apache License 2.0
@Override
public Option<RDD<MRData>> compute ( Time validTime ) {
  JavaRDD<MRData> rdd = null;
  for ( String file: new_files() )
    if (rdd == null)
      rdd = hadoopFile(file);
    else
      rdd = rdd.union(hadoopFile(file));
  if (rdd == null)
    rdd = SparkEvaluator.spark_context.emptyRDD();
  return new Some<RDD<MRData>>(rdd.rdd());
}
Example 11
Source File: MultiReturnParameterizedBuiltinSPInstruction.java From systemds with Apache License 2.0
@Override @SuppressWarnings("unchecked") public void processInstruction(ExecutionContext ec) { SparkExecutionContext sec = (SparkExecutionContext) ec; try { //get input RDD and meta data FrameObject fo = sec.getFrameObject(input1.getName()); FrameObject fometa = sec.getFrameObject(_outputs.get(1).getName()); JavaPairRDD<Long,FrameBlock> in = (JavaPairRDD<Long,FrameBlock>) sec.getRDDHandleForFrameObject(fo, InputInfo.BinaryBlockInputInfo); String spec = ec.getScalarInput(input2).getStringValue(); DataCharacteristics mcIn = sec.getDataCharacteristics(input1.getName()); DataCharacteristics mcOut = sec.getDataCharacteristics(output.getName()); String[] colnames = !TfMetaUtils.isIDSpec(spec) ? in.lookup(1L).get(0).getColumnNames() : null; //step 1: build transform meta data Encoder encoderBuild = EncoderFactory.createEncoder(spec, colnames, fo.getSchema(), (int)fo.getNumColumns(), null); MaxLongAccumulator accMax = registerMaxLongAccumulator(sec.getSparkContext()); JavaRDD<String> rcMaps = in .mapPartitionsToPair(new TransformEncodeBuildFunction(encoderBuild)) .distinct().groupByKey() .flatMap(new TransformEncodeGroupFunction(accMax)); if( containsMVImputeEncoder(encoderBuild) ) { EncoderMVImpute mva = getMVImputeEncoder(encoderBuild); rcMaps = rcMaps.union( in.mapPartitionsToPair(new TransformEncodeBuild2Function(mva)) .groupByKey().flatMap(new TransformEncodeGroup2Function(mva)) ); } rcMaps.saveAsTextFile(fometa.getFileName()); //trigger eval //consolidate meta data frame (reuse multi-threaded reader, special handling missing values) FrameReader reader = FrameReaderFactory.createFrameReader(InputInfo.TextCellInputInfo); FrameBlock meta = reader.readFrameFromHDFS(fometa.getFileName(), accMax.value(), fo.getNumColumns()); meta.recomputeColumnCardinality(); //recompute num distinct items per column meta.setColumnNames((colnames!=null)?colnames:meta.getColumnNames()); //step 2: transform apply (similar to spark transformapply) //compute omit offset map for block shifts TfOffsetMap omap = null; if( TfMetaUtils.containsOmitSpec(spec, colnames) ) { omap = new TfOffsetMap(SparkUtils.toIndexedLong(in.mapToPair( new RDDTransformApplyOffsetFunction(spec, colnames)).collect())); } //create encoder broadcast (avoiding replication per task) Encoder encoder = EncoderFactory.createEncoder(spec, colnames, fo.getSchema(), (int)fo.getNumColumns(), meta); mcOut.setDimension(mcIn.getRows()-((omap!=null)?omap.getNumRmRows():0), encoder.getNumCols()); Broadcast<Encoder> bmeta = sec.getSparkContext().broadcast(encoder); Broadcast<TfOffsetMap> bomap = (omap!=null) ? sec.getSparkContext().broadcast(omap) : null; //execute transform apply JavaPairRDD<Long,FrameBlock> tmp = in .mapToPair(new RDDTransformApplyFunction(bmeta, bomap)); JavaPairRDD<MatrixIndexes,MatrixBlock> out = FrameRDDConverterUtils .binaryBlockToMatrixBlock(tmp, mcOut, mcOut); //set output and maintain lineage/output characteristics sec.setRDDHandleForVariable(_outputs.get(0).getName(), out); sec.addLineageRDD(_outputs.get(0).getName(), input1.getName()); sec.setFrameOutput(_outputs.get(1).getName(), meta); } catch(IOException ex) { throw new RuntimeException(ex); } }
Example 12
Source File: MultiReturnParameterizedBuiltinSPInstruction.java From systemds with Apache License 2.0
@Override @SuppressWarnings("unchecked") public void processInstruction(ExecutionContext ec) { SparkExecutionContext sec = (SparkExecutionContext) ec; try { //get input RDD and meta data FrameObject fo = sec.getFrameObject(input1.getName()); FrameObject fometa = sec.getFrameObject(_outputs.get(1).getName()); JavaPairRDD<Long,FrameBlock> in = (JavaPairRDD<Long,FrameBlock>) sec.getRDDHandleForFrameObject(fo, FileFormat.BINARY); String spec = ec.getScalarInput(input2).getStringValue(); DataCharacteristics mcIn = sec.getDataCharacteristics(input1.getName()); DataCharacteristics mcOut = sec.getDataCharacteristics(output.getName()); String[] colnames = !TfMetaUtils.isIDSpec(spec) ? in.lookup(1L).get(0).getColumnNames() : null; //step 1: build transform meta data Encoder encoderBuild = EncoderFactory.createEncoder(spec, colnames, fo.getSchema(), (int)fo.getNumColumns(), null); MaxLongAccumulator accMax = registerMaxLongAccumulator(sec.getSparkContext()); JavaRDD<String> rcMaps = in .mapPartitionsToPair(new TransformEncodeBuildFunction(encoderBuild)) .distinct().groupByKey() .flatMap(new TransformEncodeGroupFunction(accMax)); if( containsMVImputeEncoder(encoderBuild) ) { EncoderMVImpute mva = getMVImputeEncoder(encoderBuild); rcMaps = rcMaps.union( in.mapPartitionsToPair(new TransformEncodeBuild2Function(mva)) .groupByKey().flatMap(new TransformEncodeGroup2Function(mva)) ); } rcMaps.saveAsTextFile(fometa.getFileName()); //trigger eval //consolidate meta data frame (reuse multi-threaded reader, special handling missing values) FrameReader reader = FrameReaderFactory.createFrameReader(FileFormat.TEXT); FrameBlock meta = reader.readFrameFromHDFS(fometa.getFileName(), accMax.value(), fo.getNumColumns()); meta.recomputeColumnCardinality(); //recompute num distinct items per column meta.setColumnNames((colnames!=null)?colnames:meta.getColumnNames()); //step 2: transform apply (similar to spark transformapply) //compute omit offset map for block shifts TfOffsetMap omap = null; if( TfMetaUtils.containsOmitSpec(spec, colnames) ) { omap = new TfOffsetMap(SparkUtils.toIndexedLong(in.mapToPair( new RDDTransformApplyOffsetFunction(spec, colnames)).collect())); } //create encoder broadcast (avoiding replication per task) Encoder encoder = EncoderFactory.createEncoder(spec, colnames, fo.getSchema(), (int)fo.getNumColumns(), meta); mcOut.setDimension(mcIn.getRows()-((omap!=null)?omap.getNumRmRows():0), encoder.getNumCols()); Broadcast<Encoder> bmeta = sec.getSparkContext().broadcast(encoder); Broadcast<TfOffsetMap> bomap = (omap!=null) ? sec.getSparkContext().broadcast(omap) : null; //execute transform apply JavaPairRDD<Long,FrameBlock> tmp = in .mapToPair(new RDDTransformApplyFunction(bmeta, bomap)); JavaPairRDD<MatrixIndexes,MatrixBlock> out = FrameRDDConverterUtils .binaryBlockToMatrixBlock(tmp, mcOut, mcOut); //set output and maintain lineage/output characteristics sec.setRDDHandleForVariable(_outputs.get(0).getName(), out); sec.addLineageRDD(_outputs.get(0).getName(), input1.getName()); sec.setFrameOutput(_outputs.get(1).getName(), meta); } catch(IOException ex) { throw new RuntimeException(ex); } }