Java Code Examples for org.apache.spark.api.java.JavaPairRDD#map()
The following examples show how to use org.apache.spark.api.java.JavaPairRDD#map(). Each example is extracted from an open-source project; the source file, project, and license are listed above the code.
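All of the examples share one pattern: JavaPairRDD#map() applies a function to each Tuple2<K, V> element and returns a plain JavaRDD of whatever that function produces, rather than another pair RDD. The following is a minimal, self-contained sketch of that pattern; the PairMapSketch class name, the local[*] master, and the sample data are illustrative assumptions, not taken from any of the projects below.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class PairMapSketch {
    public static void main(String[] args) {
        // Hypothetical app name and local master, only to make the sketch runnable on its own.
        SparkConf conf = new SparkConf().setAppName("PairMapSketch").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            // Illustrative (word, count) pairs; any key/value types would do.
            JavaPairRDD<String, Integer> counts = sc.parallelizePairs(Arrays.asList(
                    new Tuple2<>("spark", 3),
                    new Tuple2<>("rdd", 5)));

            // map() receives each Tuple2 and returns a plain JavaRDD of the lambda's results.
            JavaRDD<String> lines = counts.map(kv -> kv._1() + "=" + kv._2());

            lines.collect().forEach(System.out::println); // prints spark=3 and rdd=5
        }
    }
}

When the result should stay keyed, the examples below use mapToPair() or mapValues() instead; map() is the right call when the output is no longer a pair RDD.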
Example 1
Source File: ChronixRDD.java From chronix.spark with Apache License 2.0 | 6 votes |
/**
 * Transformation: Joins the time series according their identity.
 *
 * @return joined time series
 */
public ChronixRDD joinChunks() {
    JavaPairRDD<MetricTimeSeriesKey, Iterable<MetricTimeSeries>> groupRdd
            = this.groupBy(MetricTimeSeriesKey::new);

    JavaPairRDD<MetricTimeSeriesKey, MetricTimeSeries> joinedRdd
            = groupRdd.mapValues((Function<Iterable<MetricTimeSeries>, MetricTimeSeries>) mtsIt -> {
        MetricTimeSeriesOrdering ordering = new MetricTimeSeriesOrdering();
        List<MetricTimeSeries> orderedChunks = ordering.immutableSortedCopy(mtsIt);
        MetricTimeSeries result = null;
        for (MetricTimeSeries mts : orderedChunks) {
            if (result == null) {
                result = new MetricTimeSeries
                        .Builder(mts.getMetric())
                        .attributes(mts.attributes()).build();
            }
            result.addAll(mts.getTimestampsAsArray(), mts.getValuesAsArray());
        }
        return result;
    });

    JavaRDD<MetricTimeSeries> resultJavaRdd =
            joinedRdd.map((Tuple2<MetricTimeSeriesKey, MetricTimeSeries> mtTuple) -> mtTuple._2);

    return new ChronixRDD(resultJavaRdd);
}
Example 2
Source File: TsmmSPInstruction.java From systemds with Apache License 2.0 | 6 votes |
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;

    //get input
    JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );

    //execute tsmm instruction (always produce exactly one output block)
    //(this formulation with values() requires --conf spark.driver.maxResultSize=0)
    JavaRDD<MatrixBlock> tmp = in.map(new RDDTSMMFunction(_type));
    MatrixBlock out = RDDAggregateUtils.sumStable(tmp);

    //put output block into symbol table (no lineage because single block)
    //this also includes implicit maintenance of matrix characteristics
    sec.setMatrixOutput(output.getName(), out);
}
Example 3
Source File: BatchTrafficDataProcessor.java From lambda-arch with Apache License 2.0 | 6 votes |
/**
 * Method to get the vehicles which are in radius of POI and their distance from POI.
 *
 * @param nonFilteredIotDataStream original IoT data stream
 * @param broadcastPOIValues       variable containing POI coordinates, route and vehicle types to monitor.
 */
public void processPOIData(
        JavaRDD<IoTData> nonFilteredIotDataStream,
        Broadcast<Tuple3<POIData, String, String>> broadcastPOIValues
) {
    // Filter by routeId, vehicleType and in POI range
    JavaRDD<IoTData> iotDataStreamFiltered = filterVehicleInPOIRange(nonFilteredIotDataStream, broadcastPOIValues);

    // pair with poi
    JavaPairRDD<IoTData, POIData> poiDStreamPair = iotDataStreamFiltered.mapToPair(
            iot -> new Tuple2<>(iot, broadcastPOIValues.value()._1())
    );

    // Transform to dstream of POITrafficData
    JavaRDD<POITrafficData> trafficDStream = poiDStreamPair.map(poiTrafficDataFunc);

    persistPOI(trafficDStream);
}
Example 4
Source File: VariantsSparkSink.java From gatk with BSD 3-Clause "New" or "Revised" License | 6 votes |
private static JavaRDD<VariantContext> sortVariants(final JavaRDD<VariantContext> variants, final VCFHeader header, final int numReducers) {
    // Turn into key-value pairs so we can sort (by key). Values are null so there is no overhead in the amount
    // of data going through the shuffle.
    final JavaPairRDD<VariantContext, Void> rddVariantPairs = variants.mapToPair(variant -> new Tuple2<>(variant, (Void) null));

    // do a total sort so that all the records in partition i are less than those in partition i+1
    final Comparator<VariantContext> comparator = header.getVCFRecordComparator();
    final JavaPairRDD<VariantContext, Void> variantVoidPairs;
    if (comparator == null) {
        variantVoidPairs = rddVariantPairs; //no sort
    } else if (numReducers > 0) {
        variantVoidPairs = rddVariantPairs.sortByKey(comparator, true, numReducers);
    } else {
        variantVoidPairs = rddVariantPairs.sortByKey(comparator);
    }

    return variantVoidPairs.map(Tuple2::_1);
}
Example 5
Source File: CopybookSparkExample.java From CopybookInputFormat with Apache License 2.0 | 5 votes |
public static void main(String[] args) {
    if (args.length == 0) {
        System.out.println("CopybookSparkExample {master} {copybookInputPath} {dataFileInputPath} {outputFolder}");
        return;
    }

    String master = args[0];
    String copybookInputPath = args[1];
    String dataFileInputPath = args[2];
    String outputPath = args[3];

    JavaSparkContext jsc = new JavaSparkContext(master, "UniqueSeqGenerator", null, "SparkCopybookExample.jar");

    Configuration config = new Configuration();
    config.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml"));
    config.addResource(new Path("/etc/hadoop/conf/mapred-site.xml"));
    config.addResource(new Path("/etc/hadoop/conf/yarn-site.xml"));
    config.addResource(new Path("/etc/hadoop/conf/core-site.xml"));

    CopybookInputFormat.setCopybookHdfsPath(config, copybookInputPath);

    JavaPairRDD<LongWritable, Text> rdd = jsc.newAPIHadoopFile(dataFileInputPath, CopybookInputFormat.class, LongWritable.class, Text.class, config);
    JavaRDD<String> pipeDelimiter = rdd.map(new MapFunction());

    pipeDelimiter.saveAsTextFile(outputPath);
}
Example 6
Source File: ALSUpdate.java From oryx with Apache License 2.0 | 5 votes |
/**
 * Combines {@link Rating}s with the same user/item into one, with score as the sum of
 * all of the scores.
 */
private JavaRDD<Rating> aggregateScores(JavaRDD<? extends Rating> original, double epsilon) {
    JavaPairRDD<Tuple2<Integer,Integer>,Double> tuples =
            original.mapToPair(rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating()));

    JavaPairRDD<Tuple2<Integer,Integer>,Double> aggregated;
    if (implicit) {
        // TODO can we avoid groupByKey? reduce, combine, fold don't seem viable since
        // they don't guarantee the delete elements are properly handled
        aggregated = tuples.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN);
    } else {
        // For non-implicit, last wins.
        aggregated = tuples.foldByKey(Double.NaN, (current, next) -> next);
    }

    JavaPairRDD<Tuple2<Integer,Integer>,Double> noNaN =
            aggregated.filter(kv -> !Double.isNaN(kv._2()));

    if (logStrength) {
        return noNaN.map(userProductScore -> new Rating(
                userProductScore._1()._1(),
                userProductScore._1()._2(),
                Math.log1p(userProductScore._2() / epsilon)));
    } else {
        return noNaN.map(userProductScore -> new Rating(
                userProductScore._1()._1(),
                userProductScore._1()._2(),
                userProductScore._2()));
    }
}
Example 7
Source File: FileSystemInput.java From envelope with Apache License 2.0 | 5 votes |
private Dataset<Row> getEncodedRowsFromInputFormat(String path, Class<? extends InputFormat> inputFormatClass) {
    JavaSparkContext context = new JavaSparkContext(Contexts.getSparkSession().sparkContext());
    JavaPairRDD rawRDD = context.newAPIHadoopFile(
            path, inputFormatClass, convertToClass(getKeyDataType()), convertToClass(getValueDataType()), new Configuration());

    boolean useKey = getKeyDataType() != null;
    JavaRDD<Row> encodedRDD = rawRDD.map(new EncodeRecordAsKeyValueFunction(useKey));

    return Contexts.getSparkSession().createDataFrame(encodedRDD, getProvidingSchema());
}
Example 8
Source File: BatchHeatMapProcessor.java From lambda-arch with Apache License 2.0 | 5 votes |
private JavaRDD<HeatMapData> getCountInArea(JavaPairRDD<Coordinate, Integer> tuples, Date day) throws IOException {
    // Convert each (coordinate, count) pair into a HeatMapData row for the given day
    JavaRDD<HeatMapData> map = tuples.map(tuple -> {
        Coordinate coordinate = tuple._1();
        return new HeatMapData(coordinate.getLatitude(), coordinate.getLongitude(), tuple._2(), day);
    });
    return map;
}
Example 9
Source File: AvroDataSupplier.java From tablasco with Apache License 2.0 | 5 votes |
@Override
public DistributedTable get() {
    JavaPairRDD<AvroWrapper, NullWritable> avroRdd = this.sparkContext.hadoopFile(
            this.dataPath.toString(), AvroInputFormat.class, AvroWrapper.class, NullWritable.class);
    LOGGER.info("data location: {}", this.dataPath);

    List<String> headers = avroRdd.keys().map(new AvroHeadersFunction()).first();
    LOGGER.info("data headers: {}", headers);

    JavaRDD<List<Object>> rows = avroRdd.map(new AvroRowsFunction(headers));
    return new DistributedTable(headers, rows);
}
Example 10
Source File: SQLQueryBAM.java From ViraPipe with MIT License | 4 votes |
public static void main(String[] args) throws IOException { SparkConf conf = new SparkConf().setAppName("SQLQueryBAM"); JavaSparkContext sc = new JavaSparkContext(conf); SQLContext sqlContext = new HiveContext(sc.sc()); Options options = new Options(); Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." ); Option queryOpt = new Option( "query", true, "SQL query string." ); Option baminOpt = new Option( "in", true, "" ); options.addOption( opOpt ); options.addOption( queryOpt ); options.addOption( baminOpt ); CommandLineParser parser = new BasicParser(); CommandLine cmd = null; try { cmd = parser.parse( options, args ); } catch( ParseException exp ) { System.err.println( "Parsing failed. Reason: " + exp.getMessage() ); } String bwaOutDir = (cmd.hasOption("out")==true)? cmd.getOptionValue("out"):null; String query = (cmd.hasOption("query")==true)? cmd.getOptionValue("query"):null; String bamin = (cmd.hasOption("in")==true)? cmd.getOptionValue("in"):null; sc.hadoopConfiguration().setBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, true); //Read BAM/SAM from HDFS JavaPairRDD<LongWritable, SAMRecordWritable> bamPairRDD = sc.newAPIHadoopFile(bamin, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, sc.hadoopConfiguration()); //Map to SAMRecord RDD JavaRDD<SAMRecord> samRDD = bamPairRDD.map(v1 -> v1._2().get()); JavaRDD<MyAlignment> rdd = samRDD.map(bam -> new MyAlignment(bam.getReadName(), bam.getStart(), bam.getReferenceName(), bam.getReadLength(), new String(bam.getReadBases(), StandardCharsets.UTF_8), bam.getCigarString(), bam.getReadUnmappedFlag(), bam.getDuplicateReadFlag())); Dataset<Row> samDF = sqlContext.createDataFrame(rdd, MyAlignment.class); samDF.registerTempTable(tablename); if(query!=null) { //Save as parquet file Dataset df2 = sqlContext.sql(query); df2.show(100,false); if(bwaOutDir!=null) df2.write().parquet(bwaOutDir); }else{ if(bwaOutDir!=null) samDF.write().parquet(bwaOutDir); } sc.stop(); }
Example 11
Source File: AggregateUnarySPInstruction.java From systemds with Apache License 2.0 | 4 votes |
private void processTensorAggregate(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;

    //get input
    // TODO support DataTensor
    JavaPairRDD<TensorIndexes, TensorBlock> in = sec.getBinaryTensorBlockRDDHandleForVariable( input1.getName() );
    JavaPairRDD<TensorIndexes, TensorBlock> out = in;

    // TODO: filter input blocks for trace
    //execute unary aggregate operation
    AggregateUnaryOperator auop = (AggregateUnaryOperator)_optr;
    AggregateOperator aggop = _aop;

    //perform aggregation if necessary and put output into symbol table
    if( _aggtype == SparkAggType.SINGLE_BLOCK ) {
        // TODO filter non empty blocks if sparse safe
        JavaRDD<TensorBlock> out2 = out.map(new RDDUTensorAggFunction2(auop));
        TensorBlock out3 = RDDAggregateUtils.aggStableTensor(out2, aggop);

        //put output block into symbol table (no lineage because single block)
        //this also includes implicit maintenance of data characteristics
        // TODO generalize to drop depending on location of correction
        // TODO support DataTensor
        TensorBlock out4 = new TensorBlock(out3.getValueType(), new int[]{1, 1});
        out4.set(0, 0, out3.get(0, 0));
        sec.setTensorOutput(output.getName(), out4);
    }
    else //MULTI_BLOCK or NONE
    {
        if( _aggtype == SparkAggType.NONE ) {
            //in case of no block aggregation, we always drop the correction as well as
            //use a partitioning-preserving mapvalues
            out = out.mapValues(new RDDUTensorAggValueFunction(auop));
        }
        else if( _aggtype == SparkAggType.MULTI_BLOCK ) {
            // TODO MULTI_BLOCK
            throw new DMLRuntimeException("Multi block spark aggregations are not supported for tensors yet.");
            /*
            //in case of multi-block aggregation, we always keep the correction
            out = out.mapToPair(new RDDUTensorAggFunction(auop, dc.getBlocksize(), dc.getBlocksize()));
            out = RDDAggregateUtils.aggByKeyStable(out, aggop, false);

            //drop correction after aggregation if required (aggbykey creates
            //partitioning, drop correction via partitioning-preserving mapvalues)
            if( auop.aggOp.correctionExists )
                out = out.mapValues( new AggregateDropCorrectionFunction(aggop) );
            */
        }

        //put output RDD handle into symbol table
        updateUnaryAggOutputDataCharacteristics(sec, auop.indexFn);
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), input1.getName());
    }
}
Example 12
Source File: SecondaryStructureExtractor.java From mmtf-spark with Apache License 2.0 | 4 votes |
public static JavaRDD<Row> getJavaRDD(JavaPairRDD<String, StructureDataInterface> structure) {
    // Map each (structureId, StructureDataInterface) pair to a Row of secondary structure fractions
    return structure.map(t -> getSecStructFractions(t));
}
Example 13
Source File: TestSequenceRecordReaderBytesFunction.java From deeplearning4j with Apache License 2.0 | 4 votes |
@Test
public void testRecordReaderBytesFunction() throws Exception {
    //Local file path
    File f = testDir.newFolder();
    new ClassPathResource("datavec-spark/video/").copyDirectory(f);
    String path = f.getAbsolutePath() + "/*";

    //Load binary data from local file system, convert to a sequence file:
    //Load and convert
    JavaPairRDD<String, PortableDataStream> origData = sc.binaryFiles(path);
    JavaPairRDD<Text, BytesWritable> filesAsBytes = origData.mapToPair(new FilesAsBytesFunction());
    //Write the sequence file:
    Path p = Files.createTempDirectory("dl4j_rrbytesTest");
    p.toFile().deleteOnExit();
    String outPath = p.toString() + "/out";
    filesAsBytes.saveAsNewAPIHadoopFile(outPath, Text.class, BytesWritable.class, SequenceFileOutputFormat.class);

    //Load data from sequence file, parse via SequenceRecordReader:
    JavaPairRDD<Text, BytesWritable> fromSeqFile = sc.sequenceFile(outPath, Text.class, BytesWritable.class);
    SequenceRecordReader seqRR = new CodecRecordReader();
    Configuration conf = new Configuration();
    conf.set(CodecRecordReader.RAVEL, "true");
    conf.set(CodecRecordReader.START_FRAME, "0");
    conf.set(CodecRecordReader.TOTAL_FRAMES, "25");
    conf.set(CodecRecordReader.ROWS, "64");
    conf.set(CodecRecordReader.COLUMNS, "64");
    Configuration confCopy = new Configuration(conf);
    seqRR.setConf(conf);
    JavaRDD<List<List<Writable>>> dataVecData = fromSeqFile.map(new SequenceRecordReaderBytesFunction(seqRR));

    //Next: do the same thing locally, and compare the results
    InputSplit is = new FileSplit(f, new String[] {"mp4"}, true);
    SequenceRecordReader srr = new CodecRecordReader();
    srr.initialize(is);
    srr.setConf(confCopy);

    List<List<List<Writable>>> list = new ArrayList<>(4);
    while (srr.hasNext()) {
        list.add(srr.sequenceRecord());
    }
    assertEquals(4, list.size());

    List<List<List<Writable>>> fromSequenceFile = dataVecData.collect();

    assertEquals(4, list.size());
    assertEquals(4, fromSequenceFile.size());

    boolean[] found = new boolean[4];
    for (int i = 0; i < 4; i++) {
        int foundIndex = -1;
        List<List<Writable>> collection = fromSequenceFile.get(i);
        for (int j = 0; j < 4; j++) {
            if (collection.equals(list.get(j))) {
                if (foundIndex != -1)
                    fail(); //Already found this value -> suggests this spark value equals two or more of local version? (Shouldn't happen)
                foundIndex = j;
                if (found[foundIndex])
                    fail(); //One of the other spark values was equal to this one -> suggests duplicates in Spark list
                found[foundIndex] = true; //mark this one as seen before
            }
        }
    }
    int count = 0;
    for (boolean b : found)
        if (b)
            count++;
    assertEquals(4, count); //Expect all 4 and exactly 4 pairwise matches between spark and local versions
}
Example 14
Source File: AreaTop3ProductSpark.java From BigDataPlatform with GNU General Public License v3.0 | 4 votes |
/**
 * Generate the temporary table of basic clicked-product information
 * @param sqlContext
 * @param cityid2clickActionRDD
 * @param cityid2cityInfoRDD
 */
private static void generateTempClickProductBasicTable(
        SQLContext sqlContext,
        JavaPairRDD<Long, Row> cityid2clickActionRDD,
        JavaPairRDD<Long, Row> cityid2cityInfoRDD) {
    // Perform a join to associate the click-action data with the city data
    JavaPairRDD<Long, Tuple2<Row, Row>> joinedRDD =
            cityid2clickActionRDD.join(cityid2cityInfoRDD);

    // Convert the JavaPairRDD above into a JavaRDD<Row> (required before the RDD can become a Dataset<Row>)
    JavaRDD<Row> mappedRDD = joinedRDD.map(
            new Function<Tuple2<Long,Tuple2<Row,Row>>, Row>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Row call(Tuple2<Long, Tuple2<Row, Row>> tuple) throws Exception {
                    Long cityid = tuple._1;
                    Row clickAction = tuple._2._1;
                    Row cityInfo = tuple._2._2;

                    Long productid = clickAction.getLong(1);
                    String cityName = cityInfo.getString(1);
                    String area = cityInfo.getString(2);

                    return RowFactory.create(cityid, cityName, area, productid);
                }
            });

    // With the rows in JavaRDD<Row> form, they can be converted into a Dataset<Row>
    List<StructField> structFields = new ArrayList<StructField>();
    structFields.add(DataTypes.createStructField("city_id", DataTypes.LongType, true));
    structFields.add(DataTypes.createStructField("city_name", DataTypes.StringType, true));
    structFields.add(DataTypes.createStructField("area", DataTypes.StringType, true));
    structFields.add(DataTypes.createStructField("product_id", DataTypes.LongType, true));

    // Example rows: 1 Beijing / 2 Shanghai / 1 Beijing
    // group by area, product_id
    // City ids map to names: 1: Beijing, 2: Shanghai
    // Two helper functions:
    // UDF: concat2() concatenates two fields with a given separator
    // UDAF: group_concat_distinct() concatenates the values within a group with commas, removing duplicates

    StructType schema = DataTypes.createStructType(structFields);

    Dataset<Row> df = sqlContext.createDataFrame(mappedRDD, schema);
    System.out.println("tmp_click_product_basic: " + df.count());

    // Register the data in the Dataset<Row> as a temporary table (tmp_click_product_basic)
    df.registerTempTable("tmp_click_product_basic");
}
Example 15
Source File: SQLQueryFastq.java From ViraPipe with MIT License | 4 votes |
public static void main(String[] args) throws IOException { SparkConf conf = new SparkConf().setAppName("SQLQueryFastq"); JavaSparkContext sc = new JavaSparkContext(conf); SQLContext sqlContext = new SQLContext(sc); Options options = new Options(); Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." ); Option queryOpt = new Option( "query", true, "SQL query string." ); Option samOpt = new Option( "format", true, "parquet or fastq" ); Option baminOpt = new Option( "in", true, "" ); options.addOption( new Option( "tablename", true, "Default sql table name is 'records'")); options.addOption( opOpt ); options.addOption( queryOpt ); options.addOption( samOpt ); options.addOption( baminOpt ); CommandLineParser parser = new BasicParser(); CommandLine cmd = null; try { // parse the command line arguments cmd = parser.parse( options, args ); } catch( ParseException exp ) { // oops, something went wrong System.err.println( "Parsing failed. Reason: " + exp.getMessage() ); } String outDir = (cmd.hasOption("out")==true)? cmd.getOptionValue("out"):null; String query = (cmd.hasOption("query")==true)? cmd.getOptionValue("query"):null; String format = (cmd.hasOption("format")==true)? cmd.getOptionValue("format"):"fastq"; String in = (cmd.hasOption("in")==true)? cmd.getOptionValue("in"):null; tablename = (cmd.hasOption("tablename")==true)? cmd.getOptionValue("tablename"):"records"; sc.hadoopConfiguration().setBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, true); JavaPairRDD<Text, SequencedFragment> fastqRDD = sc.newAPIHadoopFile(in, FastqInputFormat.class, Text.class, SequencedFragment.class, sc.hadoopConfiguration()); JavaRDD<MyRead> rdd = fastqRDD.map(record -> { MyRead read = new MyRead(); read.setKey(record._1.toString()); read.setSequence(record._2.getSequence().toString()); read.setRead(record._2.getRead()); read.setQuality(record._2.getQuality().toString()); read.setTile(record._2.getTile()); read.setXpos(record._2.getXpos()); read.setYpos(record._2.getYpos()); read.setRunNumber(record._2.getRunNumber()); read.setInstrument(record._2.getInstrument()); read.setFlowcellId(record._2.getFlowcellId()); read.setLane(record._2.getLane()); read.setControlNumber(record._2.getControlNumber()); read.setFilterPassed(record._2.getFilterPassed()); return read; }); Dataset df = sqlContext.createDataFrame(rdd, MyRead.class); df.registerTempTable(tablename); //eq. count duplicates "SELECT count(DISTINCT(sequence)) FROM records" //"SELECT key,LEN(sequence) as l FROM records where l<100;" if(query!=null) { //JavaRDD<MyAlignment> rdd = samRDD.map(bam -> new MyAlignment(bam.getReadName(), bam.getStart(), bam.getReferenceName(), bam.getReadLength(), new String(bam.getReadBases(), StandardCharsets.UTF_8), bam.getCigarString(), bam.getReadUnmappedFlag(), bam.getDuplicateReadFlag(), bam)); //Save as parquet file Dataset<Row> resultDF = sqlContext.sql(query); resultDF.show(100, false); if(outDir!=null){ if(format.equals("fastq")){ JavaPairRDD<Text, SequencedFragment> resultRDD = dfToFastqRDD(resultDF); resultRDD.saveAsNewAPIHadoopFile(outDir, Text.class, SequencedFragment.class, FastqOutputFormat.class, sc.hadoopConfiguration()); } else resultDF.write().parquet(outDir); } } sc.stop(); }
Example 16
Source File: SamToFastq.java From ViraPipe with MIT License | 4 votes |
public static void main(String[] args) throws IOException { SparkConf conf = new SparkConf().setAppName("SamToFastq"); sc = new JavaSparkContext(conf); String in = args[0]; String out = args[1]; JavaPairRDD<LongWritable, SAMRecordWritable> bamPairRDD = sc.newAPIHadoopFile(in, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, sc.hadoopConfiguration()); //Map to SAMRecord RDD JavaRDD<SAMRecord> samRDD = bamPairRDD.map(v1 -> v1._2().get()); JavaPairRDD<Text, SequencedFragment> fastqrdd = mapSAMRecordsToFastq(samRDD); fastqrdd.saveAsNewAPIHadoopFile(out, Text.class, SequencedFragment.class, FastqOutputFormat.class, sc.hadoopConfiguration()); sc.stop(); }
Example 17
Source File: SparkUtils.java From deeplearning4j with Apache License 2.0 | 4 votes |
public static <T> JavaRDD<T> repartitionApproximateBalance(JavaRDD<T> rdd, Repartition repartition, int numPartitions) {
    int origNumPartitions = rdd.partitions().size();
    switch (repartition) {
        case Never:
            return rdd;
        case NumPartitionsWorkersDiffers:
            if (origNumPartitions == numPartitions)
                return rdd;
        case Always:
            // Count each partition...
            List<Integer> partitionCounts =
                    rdd.mapPartitionsWithIndex(new Function2<Integer, Iterator<T>, Iterator<Integer>>() {
                        @Override
                        public Iterator<Integer> call(Integer integer, Iterator<T> tIterator) throws Exception {
                            int count = 0;
                            while (tIterator.hasNext()) {
                                tIterator.next();
                                count++;
                            }
                            return Collections.singletonList(count).iterator();
                        }
                    }, true).collect();

            Integer totalCount = 0;
            for (Integer i : partitionCounts)
                totalCount += i;

            List<Double> partitionWeights = new ArrayList<>(Math.max(numPartitions, origNumPartitions));
            Double ideal = (double) totalCount / numPartitions;
            // partitions in the initial set and not in the final one get -1 => elements always jump
            // partitions in the final set not in the initial one get 0 => aim to receive the average amount
            for (int i = 0; i < Math.min(origNumPartitions, numPartitions); i++) {
                partitionWeights.add((double) partitionCounts.get(i) / ideal);
            }
            for (int i = Math.min(origNumPartitions, numPartitions); i < Math.max(origNumPartitions, numPartitions); i++) {
                // we shrink the # of partitions
                if (i >= numPartitions)
                    partitionWeights.add(-1D);
                // we enlarge the # of partitions
                else
                    partitionWeights.add(0D);
            }

            // this method won't trigger a spark job, which is different from {@link org.apache.spark.rdd.RDD#zipWithIndex}
            JavaPairRDD<Tuple2<Long, Integer>, T> indexedRDD = rdd.zipWithUniqueId()
                    .mapToPair(new PairFunction<Tuple2<T, Long>, Tuple2<Long, Integer>, T>() {
                        @Override
                        public Tuple2<Tuple2<Long, Integer>, T> call(Tuple2<T, Long> tLongTuple2) {
                            return new Tuple2<>(new Tuple2<Long, Integer>(tLongTuple2._2(), 0), tLongTuple2._1());
                        }
                    });

            HashingBalancedPartitioner hbp =
                    new HashingBalancedPartitioner(Collections.singletonList(partitionWeights));
            JavaPairRDD<Tuple2<Long, Integer>, T> partitionedRDD = indexedRDD.partitionBy(hbp);

            return partitionedRDD.map(new Function<Tuple2<Tuple2<Long, Integer>, T>, T>() {
                @Override
                public T call(Tuple2<Tuple2<Long, Integer>, T> indexNPayload) {
                    return indexNPayload._2();
                }
            });
        default:
            throw new RuntimeException("Unknown setting for repartition: " + repartition);
    }
}
Example 18
Source File: AggregateUnarySPInstruction.java From systemds with Apache License 2.0 | 4 votes |
private void processMatrixAggregate(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;
    DataCharacteristics mc = sec.getDataCharacteristics(input1.getName());

    //get input
    JavaPairRDD<MatrixIndexes,MatrixBlock> in = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = in;

    //filter input blocks for trace
    if( getOpcode().equalsIgnoreCase("uaktrace") )
        out = out.filter(new FilterDiagMatrixBlocksFunction());

    //execute unary aggregate operation
    AggregateUnaryOperator auop = (AggregateUnaryOperator)_optr;
    AggregateOperator aggop = _aop;

    //perform aggregation if necessary and put output into symbol table
    if( _aggtype == SparkAggType.SINGLE_BLOCK ) {
        if( auop.sparseSafe )
            out = out.filter(new FilterNonEmptyBlocksFunction());

        JavaRDD<MatrixBlock> out2 = out.map(new RDDUAggFunction2(auop, mc.getBlocksize()));
        MatrixBlock out3 = RDDAggregateUtils.aggStable(out2, aggop);

        //drop correction after aggregation
        out3.dropLastRowsOrColumns(aggop.correction);

        //put output block into symbol table (no lineage because single block)
        //this also includes implicit maintenance of matrix characteristics
        sec.setMatrixOutput(output.getName(), out3);
    }
    else //MULTI_BLOCK or NONE
    {
        if( _aggtype == SparkAggType.NONE ) {
            //in case of no block aggregation, we always drop the correction as well as
            //use a partitioning-preserving mapvalues
            out = out.mapValues(new RDDUAggValueFunction(auop, mc.getBlocksize()));
        }
        else if( _aggtype == SparkAggType.MULTI_BLOCK ) {
            //in case of multi-block aggregation, we always keep the correction
            out = out.mapToPair(new RDDUAggFunction(auop, mc.getBlocksize()));
            out = RDDAggregateUtils.aggByKeyStable(out, aggop, false);

            //drop correction after aggregation if required (aggbykey creates
            //partitioning, drop correction via partitioning-preserving mapvalues)
            if( auop.aggOp.existsCorrection() )
                out = out.mapValues( new AggregateDropCorrectionFunction(aggop) );
        }

        //put output RDD handle into symbol table
        updateUnaryAggOutputDataCharacteristics(sec, auop.indexFn);
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), input1.getName());
    }
}