Java Code Examples for org.apache.spark.api.java.JavaPairRDD#flatMap()
The following examples show how to use org.apache.spark.api.java.JavaPairRDD#flatMap().
The original project, source file, and license are noted above each example.
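Before the project examples, here is a minimal, self-contained sketch (hypothetical class and data, assuming Spark 2.x, where FlatMapFunction#call returns an Iterator) of what JavaPairRDD#flatMap does: each Tuple2<K, V> element is passed to the function, which may emit zero or more output records, and the per-tuple results are flattened into a plain JavaRDD.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import scala.Tuple2;

public class JavaPairRddFlatMapSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("JavaPairRddFlatMapSketch").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            // a pair RDD of (id, comma-separated values)
            JavaPairRDD<Integer, String> pairs = sc.parallelizePairs(Arrays.asList(
                    new Tuple2<>(1, "a,b,c"),
                    new Tuple2<>(2, "d,e")));

            // flatMap receives the whole Tuple2 and returns an Iterator of output elements;
            // the per-tuple iterators are flattened into a single JavaRDD<String>
            JavaRDD<String> values = pairs.flatMap(
                    (FlatMapFunction<Tuple2<Integer, String>, String>) t ->
                            Arrays.asList(t._2().split(",")).iterator());

            System.out.println(values.collect()); // [a, b, c, d, e]
        }
    }
}

The project examples below use the same signature, but typically pass a named FlatMapFunction implementation (e.g., BinaryBlockToCSVFunction) instead of a lambda.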
Example 1
Source File: RDDConverterUtils.java From systemds with Apache License 2.0 | 7 votes |
public static JavaRDD<String> binaryBlockToCsv(JavaPairRDD<MatrixIndexes,MatrixBlock> in,
    DataCharacteristics mcIn, FileFormatPropertiesCSV props, boolean strict)
{
    JavaPairRDD<MatrixIndexes,MatrixBlock> input = in;

    //fast path without, general case with shuffle
    if( mcIn.getCols()>mcIn.getBlocksize() ) {
        //create row partitioned matrix
        input = input
            .flatMapToPair(new SliceBinaryBlockToRowsFunction(mcIn.getBlocksize()))
            .groupByKey()
            .mapToPair(new ConcatenateBlocksFunction(mcIn.getCols(), mcIn.getBlocksize()));
    }

    //sort if required (on blocks/rows)
    if( strict ) {
        input = input.sortByKey(true);
    }

    //convert binary block to csv (from blocks/rows)
    JavaRDD<String> out = input
        .flatMap(new BinaryBlockToCSVFunction(props));

    return out;
}
Example 2
Source File: FrameRDDConverterUtils.java From systemds with Apache License 2.0 | 7 votes |
public static Dataset<Row> binaryBlockToDataFrame(SparkSession sparkSession,
    JavaPairRDD<Long,FrameBlock> in, DataCharacteristics mc, ValueType[] schema)
{
    if( !mc.colsKnown() )
        throw new RuntimeException("Number of columns needed to convert binary block to data frame.");

    //convert binary block to rows rdd
    JavaRDD<Row> rowRDD = in.flatMap(
            new BinaryBlockToDataFrameFunction());

    //create data frame schema
    if( schema == null )
        schema = UtilFunctions.nCopies((int)mc.getCols(), ValueType.STRING);
    StructType dfSchema = convertFrameSchemaToDFSchema(schema, true);

    //rdd to data frame conversion
    return sparkSession.createDataFrame(rowRDD, dfSchema);
}
Example 3
Source File: InteractionFingerprinter.java From mmtf-spark with Apache License 2.0 | 7 votes |
/**
 * Returns a dataset of ligand - macromolecule interacting residues.
 *
 * <p>The dataset contains the following columns:
 * <pre>
 *    structureChainId - pdbId.chainName of chain that interacts with ligand
 *    queryLigandId - id of ligand from PDB chemical component dictionary
 *    queryLigandNumber - group number of ligand including insertion code
 *    queryLigandChainId - chain name of ligand
 *    targetChainId - name of chain for which the interaction data are listed
 *    groupNumbers - array of residue numbers of interacting groups including insertion code (e.g., 101A)
 *    sequenceIndices - array of zero-based index of interaction groups (residues) mapped onto target sequence
 *    sequence - interacting polymer sequence
 *    interactingChains - total number of chains that interact with ligand
 * </pre>
 *
 * @param structures a set of PDB structures
 * @param filter interaction criteria
 * @return dataset with interacting residue information
 */
public static Dataset<Row> getLigandPolymerInteractions(JavaPairRDD<String, StructureDataInterface> structures, InteractionFilter filter) {
    // find all interactions
    JavaRDD<Row> rows = structures.flatMap(new LigandInteractionFingerprint(filter));

    // convert RDD to a Dataset with the following columns
    boolean nullable = false;
    StructField[] fields = {
            DataTypes.createStructField("structureChainId", DataTypes.StringType, nullable),
            DataTypes.createStructField("queryLigandId", DataTypes.StringType, nullable),
            DataTypes.createStructField("queryLigandNumber", DataTypes.StringType, nullable),
            DataTypes.createStructField("queryLigandChainId", DataTypes.StringType, nullable),
            DataTypes.createStructField("targetChainId", DataTypes.StringType, nullable),
            DataTypes.createStructField("groupNumbers", DataTypes.createArrayType(DataTypes.StringType), nullable),
            DataTypes.createStructField("sequenceIndices", DataTypes.createArrayType(DataTypes.IntegerType), nullable),
            DataTypes.createStructField("sequence", DataTypes.StringType, nullable),
            DataTypes.createStructField("interactingChains", DataTypes.IntegerType, nullable)
    };

    SparkSession spark = SparkSession.builder().getOrCreate();
    return spark.createDataFrame(rows, new StructType(fields));
}
Example 4
Source File: SparkTransformExecutor.java From deeplearning4j with Apache License 2.0 | 7 votes |
/**
 * Execute a join on the specified data
 *
 * @param join  Join to execute
 * @param left  Left data for join
 * @param right Right data for join
 * @return Joined data
 */
public static JavaRDD<List<Writable>> executeJoin(Join join, JavaRDD<List<Writable>> left,
                                                  JavaRDD<List<Writable>> right) {

    String[] leftColumnNames = join.getJoinColumnsLeft();
    int[] leftColumnIndexes = new int[leftColumnNames.length];
    for (int i = 0; i < leftColumnNames.length; i++) {
        leftColumnIndexes[i] = join.getLeftSchema().getIndexOfColumn(leftColumnNames[i]);
    }
    JavaPairRDD<List<Writable>, List<Writable>> leftJV = left.mapToPair(new ExtractKeysFunction(leftColumnIndexes));

    String[] rightColumnNames = join.getJoinColumnsRight();
    int[] rightColumnIndexes = new int[rightColumnNames.length];
    for (int i = 0; i < rightColumnNames.length; i++) {
        rightColumnIndexes[i] = join.getRightSchema().getIndexOfColumn(rightColumnNames[i]);
    }
    JavaPairRDD<List<Writable>, List<Writable>> rightJV = right.mapToPair(new ExtractKeysFunction(rightColumnIndexes));

    JavaPairRDD<List<Writable>, Tuple2<Iterable<List<Writable>>, Iterable<List<Writable>>>> cogroupedJV =
            leftJV.cogroup(rightJV);

    return cogroupedJV.flatMap(new ExecuteJoinFromCoGroupFlatMapFunction(join));
}
Example 5
Source File: FrameRDDConverterUtils.java From systemds with Apache License 2.0 | 6 votes |
public static Dataset<Row> binaryBlockToDataFrame(SparkSession sparkSession,
    JavaPairRDD<Long,FrameBlock> in, DataCharacteristics mc, ValueType[] schema)
{
    if( !mc.colsKnown() )
        throw new RuntimeException("Number of columns needed to convert binary block to data frame.");

    //convert binary block to rows rdd
    JavaRDD<Row> rowRDD = in.flatMap(
            new BinaryBlockToDataFrameFunction());

    //create data frame schema
    if( schema == null )
        schema = UtilFunctions.nCopies((int)mc.getCols(), ValueType.STRING);
    StructType dfSchema = convertFrameSchemaToDFSchema(schema, true);

    //rdd to data frame conversion
    return sparkSession.createDataFrame(rowRDD, dfSchema);
}
Example 6
Source File: DataStep.java From envelope with Apache License 2.0 | 6 votes |
private JavaRDD<Row> planMutationsByKey(Dataset<Row> arriving, List<String> keyFieldNames,
                                        Config plannerConfig, Config outputConfig) {
    JavaPairRDD<Row, Row> keyedArriving =
        arriving.javaRDD().keyBy(new ExtractKeyFunction(keyFieldNames, accumulators));

    JavaPairRDD<Row, Iterable<Row>> arrivingByKey =
        keyedArriving.groupByKey(getPartitioner(keyedArriving));

    JavaPairRDD<Row, Tuple2<Iterable<Row>, Iterable<Row>>> arrivingAndExistingByKey =
        arrivingByKey.mapPartitionsToPair(new JoinExistingForKeysFunction(outputConfig, keyFieldNames, accumulators));

    JavaRDD<Row> planned =
        arrivingAndExistingByKey.flatMap(new PlanForKeyFunction(plannerConfig, accumulators));

    return planned;
}
Example 7
Source File: PageOneStepConvertRateSpark.java From BigDataPlatform with GNU General Public License v3.0 | 5 votes |
/**
 * Get the page view (pv) count of the start page in the page flow
 * @param taskParam
 * @param sessionid2actionsRDD
 * @return
 */
private static Long getStartPagePv(JSONObject taskParam,
        JavaPairRDD<String, Iterable<Row>> sessionid2actionsRDD) {
    String targetPageFlow = ParamUtils.getParam(taskParam,
            Constants.PARAM_TARGET_PAGE_FLOW);
    final Long startPageId = Long.valueOf(targetPageFlow.split(",")[0]);

    JavaRDD<Long> startPageRDD = sessionid2actionsRDD.flatMap(

            new FlatMapFunction<Tuple2<String,Iterable<Row>>, Long>() {

                private static final long serialVersionUID = 1L;

                @Override
                public Iterator<Long> call(
                        Tuple2<String, Iterable<Row>> tuple) throws Exception {
                    List<Long> list = new ArrayList<Long>();

                    Iterator<Row> iterator = tuple._2.iterator();

                    while(iterator.hasNext()) {
                        Row row = iterator.next();
                        // use a primitive long so the comparison below is numeric,
                        // not a boxed-Long reference comparison
                        long pageid = row.getLong(3);

                        if(pageid == startPageId) {
                            list.add(pageid);
                        }
                    }

                    return list.iterator();
                }

            });

    return startPageRDD.count();
}
Example 8
Source File: FrameRDDConverterUtils.java From systemds with Apache License 2.0 | 5 votes |
public static JavaRDD<String> binaryBlockToCsv(JavaPairRDD<Long,FrameBlock> in,
    DataCharacteristics mcIn, FileFormatPropertiesCSV props, boolean strict)
{
    JavaPairRDD<Long,FrameBlock> input = in;

    //sort if required (on blocks/rows)
    if( strict && !isSorted(input) ) {
        input = input.sortByKey(true);
    }

    //convert binary block to csv (from blocks/rows)
    return input.flatMap(
            new BinaryBlockToCSVFunction(props));
}
Example 9
Source File: GroupInteractionExtractor.java From mmtf-spark with Apache License 2.0 | 5 votes |
/**
 * Returns a Dataset of pairwise interactions that satisfy the criteria of
 * the {@link InteractionFilter}. Each atom, its interacting neighbor atom, and
 * the interaction distance is represented as a row.
 *
 * @param structures a set of PDB structures
 * @param filter criteria for determining noncovalent interactions
 * @return dataset of pairwise interactions
 * @see edu.sdsc.mmtf.spark.interactions.InteractionFilter
 */
public static Dataset<Row> getPairInteractions(JavaPairRDD<String, StructureDataInterface> structures, InteractionFilter filter) {
    SparkSession spark = SparkSession.builder().getOrCreate();
    @SuppressWarnings("resource") // sc cannot be closed here, it's still required elsewhere
    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

    // calculate interactions
    boolean pairwise = true;
    JavaRDD<Row> rows = structures.flatMap(new StructureToAtomInteractions(sc.broadcast(filter), pairwise));

    // convert JavaRDD to Dataset
    return spark.createDataFrame(rows, AtomInteraction.getPairInteractionSchema());
}
Example 10
Source File: GroupInteractionExtractor.java From mmtf-spark with Apache License 2.0 | 5 votes |
/**
 * Returns a dataset of interactions that satisfy the criteria of
 * the {@link InteractionFilter}. Each atom and its interacting neighbor atoms
 * are represented as a row in a Dataset. In addition, geometric features
 * of the interactions, such as distances, angles, and orientational order
 * parameters are returned in each row (see {@link edu.sdsc.mm.dev.utils.CoordinationGeometry}).
 *
 * @param structures a set of PDB structures
 * @param filter criteria for determining noncovalent interactions
 * @return dataset of interactions
 * @see edu.sdsc.mmtf.spark.interactions.InteractionFilter
 * @see edu.sdsc.mm.dev.utils.CoordinationGeometry
 */
public static Dataset<Row> getInteractions(JavaPairRDD<String, StructureDataInterface> structures, InteractionFilter filter) {
    SparkSession spark = SparkSession.builder().getOrCreate();
    @SuppressWarnings("resource") // sc cannot be closed here, it's still required elsewhere
    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

    // calculate interactions
    boolean pairwise = false;
    JavaRDD<Row> rows = structures.flatMap(new StructureToAtomInteractions(sc.broadcast(filter), pairwise));

    // convert JavaRDD to Dataset
    return spark.createDataFrame(rows, AtomInteraction.getSchema(filter.getMaxInteractions()));
}
Example 11
Source File: GroupInteractionExtractor.java From mmtf-spark with Apache License 2.0 | 5 votes |
/**
 * Returns a dataset of residues that interact with the specified group within
 * a specified cutoff distance.
 *
 * @param structures a set of PDB structures
 * @return dataset with interacting residue and atom information
 */
public Dataset<Row> getDataset(JavaPairRDD<String, StructureDataInterface> structures) {
    // create a list of all residues with a threshold distance
    JavaRDD<Row> rows = structures.flatMap(new StructureToAllInteractions(groupName, distance));

    // convert to a dataset
    return JavaRDDToDataset.getDataset(rows, "structureId","residue1","atom1","element1","index1",
            "residue2","atom2","element2","index2","distance");
}
Example 12
Source File: QuaternaryStructureDataset.java From mmtf-spark with Apache License 2.0 | 5 votes |
/**
 * Returns a dataset with quaternary structure info
 *
 * @param structure
 * @return dataset with quaternary structure info
 */
public static Dataset<Row> getDataset(JavaPairRDD<String, StructureDataInterface> structure) {
    JavaRDD<Row> rows = structure.flatMap(t -> getQuaternaryStructure(t));

    StructType schema = new StructType(new StructField[]{
            new StructField("structureId", DataTypes.StringType, false, Metadata.empty()),
            new StructField("bioAssemblyId", DataTypes.StringType, false, Metadata.empty()),
            new StructField("proteinStoichiometry", DataTypes.StringType, true, Metadata.empty()),
            new StructField("dnaStoichiometry", DataTypes.StringType, true, Metadata.empty()),
            new StructField("rnaStoichiometry", DataTypes.StringType, true, Metadata.empty()),
    });

    SparkSession spark = SparkSession.builder().getOrCreate();
    return spark.createDataFrame(rows, schema);
}
Example 13
Source File: FrameRDDConverterUtils.java From systemds with Apache License 2.0 | 5 votes |
public static JavaRDD<String> binaryBlockToCsv(JavaPairRDD<Long,FrameBlock> in,
    DataCharacteristics mcIn, FileFormatPropertiesCSV props, boolean strict)
{
    JavaPairRDD<Long,FrameBlock> input = in;

    //sort if required (on blocks/rows)
    if( strict && !isSorted(input) ) {
        input = input.sortByKey(true);
    }

    //convert binary block to csv (from blocks/rows)
    return input.flatMap(
            new BinaryBlockToCSVFunction(props));
}
Example 14
Source File: FindAssemblyRegionsSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 4 votes |
/**
 * Get an RDD of assembly regions for the given reads and intervals using the <i>strict</i> algorithm (looks for
 * assembly regions in each contig in parallel).
 * @param ctx the Spark context
 * @param reads the coordinate-sorted reads
 * @param header the header for the reads
 * @param sequenceDictionary the sequence dictionary for the reads
 * @param referenceFileName the file name for the reference
 * @param features source of arbitrary features (may be null)
 * @param intervalShards the sharded intervals to find assembly regions for
 * @param assemblyRegionEvaluatorSupplierBroadcast evaluator used to determine whether a locus is active
 * @param shardingArgs the arguments for sharding reads
 * @param assemblyRegionArgs the arguments for finding assembly regions
 * @param shuffle whether to use a shuffle or not when sharding reads
 * @return an RDD of assembly regions
 */
public static JavaRDD<AssemblyRegionWalkerContext> getAssemblyRegionsStrict(
        final JavaSparkContext ctx,
        final JavaRDD<GATKRead> reads,
        final SAMFileHeader header,
        final SAMSequenceDictionary sequenceDictionary,
        final String referenceFileName,
        final FeatureManager features,
        final List<ShardBoundary> intervalShards,
        final Broadcast<Supplier<AssemblyRegionEvaluator>> assemblyRegionEvaluatorSupplierBroadcast,
        final AssemblyRegionReadShardArgumentCollection shardingArgs,
        final AssemblyRegionArgumentCollection assemblyRegionArgs,
        final boolean shuffle) {
    JavaRDD<Shard<GATKRead>> shardedReads = SparkSharder.shard(ctx, reads, GATKRead.class, sequenceDictionary, intervalShards, shardingArgs.readShardSize, shuffle);
    Broadcast<FeatureManager> bFeatureManager = features == null ? null : ctx.broadcast(features);

    // 1. Calculate activity for each locus in the desired intervals, in parallel.
    JavaRDD<ActivityProfileStateRange> activityProfileStates = shardedReads.mapPartitions(getActivityProfileStatesFunction(referenceFileName, bFeatureManager, header,
            assemblyRegionEvaluatorSupplierBroadcast, assemblyRegionArgs));

    // 2. Group by contig. We need to do this so we can perform the band pass filter over the whole contig, so we
    // produce assembly regions that are identical to those produced by AssemblyRegionWalker.
    // This step requires a shuffle, but the amount of data in the ActivityProfileStateRange should be small, so it
    // should not be prohibitive.
    JavaPairRDD<String, Iterable<ActivityProfileStateRange>> contigToGroupedStates = activityProfileStates
            .keyBy((Function<ActivityProfileStateRange, String>) range -> range.getContig())
            .groupByKey();

    // 3. Run the band pass filter to find AssemblyRegions. The filtering is fairly cheap, so should be fast
    // even though it has to scan a whole contig. Note that we *don't* fill in reads here, since after we have found
    // the assembly regions we want to do assembly using the full resources of the cluster. So if we have
    // very small assembly region objects, then we can repartition them for redistribution across the cluster,
    // at which point the reads can be filled in. (See next step.)
    JavaRDD<ReadlessAssemblyRegion> readlessAssemblyRegions = contigToGroupedStates
            .flatMap(getReadlessAssemblyRegionsFunction(header, assemblyRegionArgs));
    // repartition to distribute the data evenly across the cluster again
    readlessAssemblyRegions = readlessAssemblyRegions.repartition(readlessAssemblyRegions.getNumPartitions());

    // 4. Fill in the reads. Each shard is an assembly region, with its overlapping reads.
    JavaRDD<Shard<GATKRead>> assemblyRegionShardedReads = SparkSharder.shard(ctx, reads, GATKRead.class, header.getSequenceDictionary(), readlessAssemblyRegions, shardingArgs.readShardSize);

    // 5. Convert shards to assembly regions. Reads downsampling is done again here. Note it will only be
    // consistent with the downsampling done in step 1 when https://github.com/broadinstitute/gatk/issues/5437 is in.
    JavaRDD<AssemblyRegion> assemblyRegions = assemblyRegionShardedReads.mapPartitions((FlatMapFunction<Iterator<Shard<GATKRead>>, AssemblyRegion>) shardedReadIterator -> {
        final ReadsDownsampler readsDownsampler = assemblyRegionArgs.maxReadsPerAlignmentStart > 0 ?
                new PositionalDownsampler(assemblyRegionArgs.maxReadsPerAlignmentStart, header) : null;
        return Utils.stream(shardedReadIterator)
                .map(shardedRead -> toAssemblyRegion(shardedRead, header, readsDownsampler)).iterator();
    });

    // 6. Add reference and feature context.
    return assemblyRegions.mapPartitions(getAssemblyRegionWalkerContextFunction(referenceFileName, bFeatureManager));
}
Example 15
Source File: FrameRDDConverterUtils.java From systemds with Apache License 2.0 | 4 votes |
public static JavaRDD<String> binaryBlockToTextCell(JavaPairRDD<Long, FrameBlock> input, DataCharacteristics mcIn) {
    //convert frame blocks to ijv string triples
    return input.flatMap(new ConvertFrameBlockToIJVLines());
}
Example 16
Source File: AssemblyContigAlignmentsConfigPicker.java From gatk with BSD 3-Clause "New" or "Revised" License | 4 votes |
/**
 * Filters input alignments of single-ended long reads, e.g. local assembly contigs,
 * with the objective of
 * choosing a set of alignments that provide "optimal coverage" of the assembly contig.
 *
 * Currently "optimality" is defined based on an heuristic scoring scheme
 * {@link #computeScoreOfConfiguration(List, Set, int)}.
 *
 * <p>
 *     It goes through four major steps:
 *     <ul>
 *         <li>
 *             first group the alignments of each local assembly contig together
 *         </li>
 *         <li>
 *             then exhaustively iterate through all possible combinations (named configuration) of the alignments,
 *             score them, and pick the best ones with the "optimal coverage"; note that sometimes
 *             there are multiple "optimal" configurations, and when this happens, all of them are returned,
 *             with each of them having the field {@link AssemblyContigWithFineTunedAlignments#hasEquallyGoodAlnConfigurations}
 *             set to true.
 *         </li>
 *         <li>
 *             alignments containing large gaps (i.e. insertions and deletions)
 *             are finally split at gap start and end locations.
 *             (note, see warning in {@link #splitGaps(GoodAndBadMappings, boolean)})
 *         </li>
 *         <li>
 *             A final round of pruning, in order to further remove uninformative alignments.
 *         </li>
 *     </ul>
 * </p>
 *
 * @param assemblyAlignments   long read alignments
 * @param header               header for the long reads
 * @param canonicalChromosomes a set of chromosome names that are defined as canonical, e.g. for Human, chr1-chr22, and chrX and chrY
 * @param scoreDiffTolerance   a tolerance where if two configurations' scores differ by less than or equal to this amount, they are considered equally good
 * @param toolLogger           logger for outputting summary and debugging
 *
 * @return contigs with alignments filtered and custom formatted as {@link AlignmentInterval}
 */
public static JavaRDD<AssemblyContigWithFineTunedAlignments> createOptimalCoverageAlignmentSetsForContigs(final JavaRDD<GATKRead> assemblyAlignments,
                                                                                                          final SAMFileHeader header,
                                                                                                          final Set<String> canonicalChromosomes,
                                                                                                          final Double scoreDiffTolerance,
                                                                                                          final Logger toolLogger) {

    final JavaRDD<AlignedContig> parsedContigAlignments =
            convertRawAlignmentsToAlignedContigAndFilterByQuality(assemblyAlignments, header, toolLogger);

    final JavaPairRDD<Tuple2<String, byte[]>, List<GoodAndBadMappings>> assemblyContigWithPickedConfigurations =
            gatherBestConfigurationsForOneContig(parsedContigAlignments, canonicalChromosomes, scoreDiffTolerance);

    return assemblyContigWithPickedConfigurations
            .flatMap(AssemblyContigAlignmentsConfigPicker::reConstructContigFromPickedConfiguration);
}
Example 17
Source File: RDDConverterUtils.java From systemds with Apache License 2.0 | 4 votes |
public static JavaRDD<String> binaryBlockToTextCell(JavaPairRDD<MatrixIndexes, MatrixBlock> in, DataCharacteristics mc) {
    return in.flatMap(new ConvertMatrixBlockToIJVLines(mc.getBlocksize()));
}
Example 18
Source File: InteractionFingerprinter.java From mmtf-spark with Apache License 2.0 | 4 votes |
/**
 * Returns a dataset of ligand - macromolecule interaction information.
 *
 * Criteria to select interactions are specified by the
 * {@link InteractionFilter}
 *
 * <p>The dataset contains the following columns:
 * <pre>
 *    structureChainId - pdbId.chainName for which the interaction data are listed
 *    queryChainId - name of chain that interacts with target chain
 *    targetChainId - name of chain for which the interaction data are listed
 *    groupNumbers - array of residue numbers of interacting groups including insertion code (e.g., 101A)
 *    sequenceIndices - array of zero-based index of interaction groups (residues) mapped onto target sequence
 *    sequence - target polymer sequence
 * </pre>
 *
 * @param structures a set of PDB structures
 * @param filter interaction criteria
 * @return dataset with interacting residue information
 */
public static Dataset<Row> getPolymerInteractions(JavaPairRDD<String, StructureDataInterface> structures, InteractionFilter filter) {
    // find all interactions
    JavaRDD<Row> rows = structures.flatMap(new PolymerInteractionFingerprint(filter));

    // convert RDD to a Dataset with the following columns
    boolean nullable = false;
    StructField[] fields = {
            DataTypes.createStructField("structureChainId", DataTypes.StringType, nullable),
            DataTypes.createStructField("queryChainId", DataTypes.StringType, nullable),
            DataTypes.createStructField("targetChainId", DataTypes.StringType, nullable),
            DataTypes.createStructField("groupNumbers", DataTypes.createArrayType(DataTypes.StringType), nullable),
            DataTypes.createStructField("sequenceIndices", DataTypes.createArrayType(DataTypes.IntegerType), nullable),
            DataTypes.createStructField("sequence", DataTypes.StringType, nullable)
    };

    SparkSession spark = SparkSession.builder().getOrCreate();
    return spark.createDataFrame(rows, new StructType(fields));
}
Example 19
Source File: FrameRDDConverterUtils.java From systemds with Apache License 2.0 | 4 votes |
public static JavaRDD<String> binaryBlockToTextCell(JavaPairRDD<Long, FrameBlock> input, DataCharacteristics mcIn) {
    //convert frame blocks to ijv string triples
    return input.flatMap(new ConvertFrameBlockToIJVLines());
}
Example 20
Source File: RDDConverterUtils.java From systemds with Apache License 2.0 | 4 votes |
public static JavaRDD<String> binaryBlockToTextCell(JavaPairRDD<MatrixIndexes, MatrixBlock> in, DataCharacteristics mc) {
    return in.flatMap(new ConvertMatrixBlockToIJVLines(mc.getBlocksize()));
}