Java Code Examples for org.apache.spark.api.java.JavaPairRDD#map()
The following examples show how to use org.apache.spark.api.java.JavaPairRDD#map(). Each example is extracted from an open-source project; the source file, project, and license are listed above the code.
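All of the examples share one pattern: JavaPairRDD#map() applies a function to each Tuple2<K, V> element and returns a plain JavaRDD of whatever that function produces, rather than another pair RDD. The following is a minimal, self-contained sketch of that pattern; the PairMapSketch class name, the local[*] master, and the sample data are illustrative assumptions, not taken from any of the projects below.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class PairMapSketch {
    public static void main(String[] args) {
        // Hypothetical app name and local master, only to make the sketch runnable on its own.
        SparkConf conf = new SparkConf().setAppName("PairMapSketch").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            // Illustrative (word, count) pairs; any key/value types would do.
            JavaPairRDD<String, Integer> counts = sc.parallelizePairs(Arrays.asList(
                    new Tuple2<>("spark", 3),
                    new Tuple2<>("rdd", 5)));

            // map() receives each Tuple2 and returns a plain JavaRDD of the lambda's results.
            JavaRDD<String> lines = counts.map(kv -> kv._1() + "=" + kv._2());

            lines.collect().forEach(System.out::println); // prints spark=3 and rdd=5
        }
    }
}

When the result should stay keyed, the examples below use mapToPair() or mapValues() instead; map() is the right call when the output is no longer a pair RDD.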
Example 1
Source File: ChronixRDD.java From chronix.spark with Apache License 2.0 | 6 votes |
/**
 * Transformation: Joins the time series according their identity.
 *
 * @return joined time series
 */
public ChronixRDD joinChunks() {
    JavaPairRDD<MetricTimeSeriesKey, Iterable<MetricTimeSeries>> groupRdd
            = this.groupBy(MetricTimeSeriesKey::new);

    JavaPairRDD<MetricTimeSeriesKey, MetricTimeSeries> joinedRdd
            = groupRdd.mapValues((Function<Iterable<MetricTimeSeries>, MetricTimeSeries>) mtsIt -> {
        MetricTimeSeriesOrdering ordering = new MetricTimeSeriesOrdering();
        List<MetricTimeSeries> orderedChunks = ordering.immutableSortedCopy(mtsIt);
        MetricTimeSeries result = null;
        for (MetricTimeSeries mts : orderedChunks) {
            if (result == null) {
                result = new MetricTimeSeries
                        .Builder(mts.getMetric())
                        .attributes(mts.attributes()).build();
            }
            result.addAll(mts.getTimestampsAsArray(), mts.getValuesAsArray());
        }
        return result;
    });

    JavaRDD<MetricTimeSeries> resultJavaRdd =
            joinedRdd.map((Tuple2<MetricTimeSeriesKey, MetricTimeSeries> mtTuple) -> mtTuple._2);

    return new ChronixRDD(resultJavaRdd);
}
Example 2
Source File: TsmmSPInstruction.java From systemds with Apache License 2.0 | 6 votes |
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;

    //get input
    JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );

    //execute tsmm instruction (always produce exactly one output block)
    //(this formulation with values() requires --conf spark.driver.maxResultSize=0)
    JavaRDD<MatrixBlock> tmp = in.map(new RDDTSMMFunction(_type));
    MatrixBlock out = RDDAggregateUtils.sumStable(tmp);

    //put output block into symbol table (no lineage because single block)
    //this also includes implicit maintenance of matrix characteristics
    sec.setMatrixOutput(output.getName(), out);
}
Example 3
Source File: BatchTrafficDataProcessor.java From lambda-arch with Apache License 2.0 | 6 votes |
/**
 * Method to get the vehicles which are in radius of POI and their distance from POI.
 *
 * @param nonFilteredIotDataStream original IoT data stream
 * @param broadcastPOIValues       variable containing POI coordinates, route and vehicle types to monitor.
 */
public void processPOIData(
        JavaRDD<IoTData> nonFilteredIotDataStream,
        Broadcast<Tuple3<POIData, String, String>> broadcastPOIValues
) {
    // Filter by routeId, vehicleType and in POI range
    JavaRDD<IoTData> iotDataStreamFiltered = filterVehicleInPOIRange(nonFilteredIotDataStream, broadcastPOIValues);

    // pair with poi
    JavaPairRDD<IoTData, POIData> poiDStreamPair = iotDataStreamFiltered.mapToPair(
            iot -> new Tuple2<>(iot, broadcastPOIValues.value()._1())
    );

    // Transform to dstream of POITrafficData
    JavaRDD<POITrafficData> trafficDStream = poiDStreamPair.map(poiTrafficDataFunc);

    persistPOI(trafficDStream);
}
Example 4
Source File: VariantsSparkSink.java From gatk with BSD 3-Clause "New" or "Revised" License | 6 votes |
private static JavaRDD<VariantContext> sortVariants(final JavaRDD<VariantContext> variants, final VCFHeader header, final int numReducers) {
    // Turn into key-value pairs so we can sort (by key). Values are null so there is no overhead in the amount
    // of data going through the shuffle.
    final JavaPairRDD<VariantContext, Void> rddVariantPairs = variants.mapToPair(variant -> new Tuple2<>(variant, (Void) null));

    // do a total sort so that all the records in partition i are less than those in partition i+1
    final Comparator<VariantContext> comparator = header.getVCFRecordComparator();
    final JavaPairRDD<VariantContext, Void> variantVoidPairs;
    if (comparator == null) {
        variantVoidPairs = rddVariantPairs; //no sort
    } else if (numReducers > 0) {
        variantVoidPairs = rddVariantPairs.sortByKey(comparator, true, numReducers);
    } else {
        variantVoidPairs = rddVariantPairs.sortByKey(comparator);
    }

    return variantVoidPairs.map(Tuple2::_1);
}
Example 5
Source File: CopybookSparkExample.java From CopybookInputFormat with Apache License 2.0 | 5 votes |
public static void main(String[] args) {
    if (args.length == 0) {
        System.out.println("CopybookSparkExample {master} {copybookInputPath} {dataFileInputPath} {outputFolder}");
        return;
    }

    String master = args[0];
    String copybookInputPath = args[1];
    String dataFileInputPath = args[2];
    String outputPath = args[3];

    JavaSparkContext jsc = new JavaSparkContext(master, "UniqueSeqGenerator", null, "SparkCopybookExample.jar");

    Configuration config = new Configuration();
    config.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml"));
    config.addResource(new Path("/etc/hadoop/conf/mapred-site.xml"));
    config.addResource(new Path("/etc/hadoop/conf/yarn-site.xml"));
    config.addResource(new Path("/etc/hadoop/conf/core-site.xml"));

    CopybookInputFormat.setCopybookHdfsPath(config, copybookInputPath);

    JavaPairRDD<LongWritable, Text> rdd = jsc.newAPIHadoopFile(dataFileInputPath, CopybookInputFormat.class, LongWritable.class, Text.class, config);
    JavaRDD<String> pipeDelimiter = rdd.map(new MapFunction());

    pipeDelimiter.saveAsTextFile(outputPath);
}
Example 6
Source File: ALSUpdate.java From oryx with Apache License 2.0 | 5 votes |
/**
 * Combines {@link Rating}s with the same user/item into one, with score as the sum of
 * all of the scores.
 */
private JavaRDD<Rating> aggregateScores(JavaRDD<? extends Rating> original, double epsilon) {
    JavaPairRDD<Tuple2<Integer,Integer>,Double> tuples =
            original.mapToPair(rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating()));

    JavaPairRDD<Tuple2<Integer,Integer>,Double> aggregated;
    if (implicit) {
        // TODO can we avoid groupByKey? reduce, combine, fold don't seem viable since
        // they don't guarantee the delete elements are properly handled
        aggregated = tuples.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN);
    } else {
        // For non-implicit, last wins.
        aggregated = tuples.foldByKey(Double.NaN, (current, next) -> next);
    }

    JavaPairRDD<Tuple2<Integer,Integer>,Double> noNaN =
            aggregated.filter(kv -> !Double.isNaN(kv._2()));

    if (logStrength) {
        return noNaN.map(userProductScore -> new Rating(
                userProductScore._1()._1(),
                userProductScore._1()._2(),
                Math.log1p(userProductScore._2() / epsilon)));
    } else {
        return noNaN.map(userProductScore -> new Rating(
                userProductScore._1()._1(),
                userProductScore._1()._2(),
                userProductScore._2()));
    }
}
Example 7
Source File: FileSystemInput.java From envelope with Apache License 2.0 | 5 votes |
private Dataset<Row> getEncodedRowsFromInputFormat(String path, Class<? extends InputFormat> inputFormatClass) {
    JavaSparkContext context = new JavaSparkContext(Contexts.getSparkSession().sparkContext());
    JavaPairRDD rawRDD = context.newAPIHadoopFile(
            path, inputFormatClass, convertToClass(getKeyDataType()), convertToClass(getValueDataType()), new Configuration());

    boolean useKey = getKeyDataType() != null;
    JavaRDD<Row> encodedRDD = rawRDD.map(new EncodeRecordAsKeyValueFunction(useKey));

    return Contexts.getSparkSession().createDataFrame(encodedRDD, getProvidingSchema());
}
Example 8
Source File: BatchHeatMapProcessor.java From lambda-arch with Apache License 2.0 | 5 votes |
private JavaRDD<HeatMapData> getCountInArea(JavaPairRDD<Coordinate, Integer> tuples, Date day) throws IOException {
    // Convert each (coordinate, count) pair into a HeatMapData row for the given day
    JavaRDD<HeatMapData> map = tuples.map(tuple -> {
        Coordinate coordinate = tuple._1();
        return new HeatMapData(coordinate.getLatitude(), coordinate.getLongitude(), tuple._2(), day);
    });
    return map;
}
Example 9
Source File: AvroDataSupplier.java From tablasco with Apache License 2.0 | 5 votes |
@Override
public DistributedTable get() {
    JavaPairRDD<AvroWrapper, NullWritable> avroRdd = this.sparkContext.hadoopFile(
            this.dataPath.toString(), AvroInputFormat.class, AvroWrapper.class, NullWritable.class);
    LOGGER.info("data location: {}", this.dataPath);

    List<String> headers = avroRdd.keys().map(new AvroHeadersFunction()).first();
    LOGGER.info("data headers: {}", headers);

    JavaRDD<List<Object>> rows = avroRdd.map(new AvroRowsFunction(headers));
    return new DistributedTable(headers, rows);
}
Example 10
Source File: SQLQueryBAM.java From ViraPipe with MIT License | 4 votes |
public static void main(String[] args) throws IOException { SparkConf conf = new SparkConf().setAppName("SQLQueryBAM"); JavaSparkContext sc = new JavaSparkContext(conf); SQLContext sqlContext = new HiveContext(sc.sc()); Options options = new Options(); Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." ); Option queryOpt = new Option( "query", true, "SQL query string." ); Option baminOpt = new Option( "in", true, "" ); options.addOption( opOpt ); options.addOption( queryOpt ); options.addOption( baminOpt ); CommandLineParser parser = new BasicParser(); CommandLine cmd = null; try { cmd = parser.parse( options, args ); } catch( ParseException exp ) { System.err.println( "Parsing failed. Reason: " + exp.getMessage() ); } String bwaOutDir = (cmd.hasOption("out")==true)? cmd.getOptionValue("out"):null; String query = (cmd.hasOption("query")==true)? cmd.getOptionValue("query"):null; String bamin = (cmd.hasOption("in")==true)? cmd.getOptionValue("in"):null; sc.hadoopConfiguration().setBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, true); //Read BAM/SAM from HDFS JavaPairRDD<LongWritable, SAMRecordWritable> bamPairRDD = sc.newAPIHadoopFile(bamin, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, sc.hadoopConfiguration()); //Map to SAMRecord RDD JavaRDD<SAMRecord> samRDD = bamPairRDD.map(v1 -> v1._2().get()); JavaRDD<MyAlignment> rdd = samRDD.map(bam -> new MyAlignment(bam.getReadName(), bam.getStart(), bam.getReferenceName(), bam.getReadLength(), new String(bam.getReadBases(), StandardCharsets.UTF_8), bam.getCigarString(), bam.getReadUnmappedFlag(), bam.getDuplicateReadFlag())); Dataset<Row> samDF = sqlContext.createDataFrame(rdd, MyAlignment.class); samDF.registerTempTable(tablename); if(query!=null) { //Save as parquet file Dataset df2 = sqlContext.sql(query); df2.show(100,false); if(bwaOutDir!=null) df2.write().parquet(bwaOutDir); }else{ if(bwaOutDir!=null) samDF.write().parquet(bwaOutDir); } sc.stop(); }
Example 11
Source File: AggregateUnarySPInstruction.java From systemds with Apache License 2.0 | 4 votes |
private void processTensorAggregate(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;

    //get input
    // TODO support DataTensor
    JavaPairRDD<TensorIndexes, TensorBlock> in = sec.getBinaryTensorBlockRDDHandleForVariable( input1.getName() );
    JavaPairRDD<TensorIndexes, TensorBlock> out = in;

    // TODO: filter input blocks for trace
    //execute unary aggregate operation
    AggregateUnaryOperator auop = (AggregateUnaryOperator)_optr;
    AggregateOperator aggop = _aop;

    //perform aggregation if necessary and put output into symbol table
    if( _aggtype == SparkAggType.SINGLE_BLOCK ) {
        // TODO filter non empty blocks if sparse safe
        JavaRDD<TensorBlock> out2 = out.map(new RDDUTensorAggFunction2(auop));
        TensorBlock out3 = RDDAggregateUtils.aggStableTensor(out2, aggop);

        //put output block into symbol table (no lineage because single block)
        //this also includes implicit maintenance of data characteristics
        // TODO generalize to drop depending on location of correction
        // TODO support DataTensor
        TensorBlock out4 = new TensorBlock(out3.getValueType(), new int[]{1, 1});
        out4.set(0, 0, out3.get(0, 0));
        sec.setTensorOutput(output.getName(), out4);
    }
    else //MULTI_BLOCK or NONE
    {
        if( _aggtype == SparkAggType.NONE ) {
            //in case of no block aggregation, we always drop the correction as well as
            //use a partitioning-preserving mapvalues
            out = out.mapValues(new RDDUTensorAggValueFunction(auop));
        }
        else if( _aggtype == SparkAggType.MULTI_BLOCK ) {
            // TODO MULTI_BLOCK
            throw new DMLRuntimeException("Multi block spark aggregations are not supported for tensors yet.");
            /*
            //in case of multi-block aggregation, we always keep the correction
            out = out.mapToPair(new RDDUTensorAggFunction(auop, dc.getBlocksize(), dc.getBlocksize()));
            out = RDDAggregateUtils.aggByKeyStable(out, aggop, false);

            //drop correction after aggregation if required (aggbykey creates
            //partitioning, drop correction via partitioning-preserving mapvalues)
            if( auop.aggOp.correctionExists )
                out = out.mapValues( new AggregateDropCorrectionFunction(aggop) );
            */
        }

        //put output RDD handle into symbol table
        updateUnaryAggOutputDataCharacteristics(sec, auop.indexFn);
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), input1.getName());
    }
}
Example 12
Source File: SecondaryStructureExtractor.java From mmtf-spark with Apache License 2.0 | 4 votes |
public static JavaRDD<Row> getJavaRDD(JavaPairRDD<String, StructureDataInterface> structure) {
    // Map each (structureId, StructureDataInterface) pair to a Row of secondary structure fractions
    return structure.map(t -> getSecStructFractions(t));
}
Example 13
Source File: TestSequenceRecordReaderBytesFunction.java From deeplearning4j with Apache License 2.0 | 4 votes |
@Test
public void testRecordReaderBytesFunction() throws Exception {
    //Local file path
    File f = testDir.newFolder();
    new ClassPathResource("datavec-spark/video/").copyDirectory(f);
    String path = f.getAbsolutePath() + "/*";

    //Load binary data from local file system, convert to a sequence file:
    //Load and convert
    JavaPairRDD<String, PortableDataStream> origData = sc.binaryFiles(path);
    JavaPairRDD<Text, BytesWritable> filesAsBytes = origData.mapToPair(new FilesAsBytesFunction());
    //Write the sequence file:
    Path p = Files.createTempDirectory("dl4j_rrbytesTest");
    p.toFile().deleteOnExit();
    String outPath = p.toString() + "/out";
    filesAsBytes.saveAsNewAPIHadoopFile(outPath, Text.class, BytesWritable.class, SequenceFileOutputFormat.class);

    //Load data from sequence file, parse via SequenceRecordReader:
    JavaPairRDD<Text, BytesWritable> fromSeqFile = sc.sequenceFile(outPath, Text.class, BytesWritable.class);
    SequenceRecordReader seqRR = new CodecRecordReader();
    Configuration conf = new Configuration();
    conf.set(CodecRecordReader.RAVEL, "true");
    conf.set(CodecRecordReader.START_FRAME, "0");
    conf.set(CodecRecordReader.TOTAL_FRAMES, "25");
    conf.set(CodecRecordReader.ROWS, "64");
    conf.set(CodecRecordReader.COLUMNS, "64");
    Configuration confCopy = new Configuration(conf);
    seqRR.setConf(conf);
    JavaRDD<List<List<Writable>>> dataVecData = fromSeqFile.map(new SequenceRecordReaderBytesFunction(seqRR));

    //Next: do the same thing locally, and compare the results
    InputSplit is = new FileSplit(f, new String[] {"mp4"}, true);
    SequenceRecordReader srr = new CodecRecordReader();
    srr.initialize(is);
    srr.setConf(confCopy);

    List<List<List<Writable>>> list = new ArrayList<>(4);
    while (srr.hasNext()) {
        list.add(srr.sequenceRecord());
    }
    assertEquals(4, list.size());

    List<List<List<Writable>>> fromSequenceFile = dataVecData.collect();

    assertEquals(4, list.size());
    assertEquals(4, fromSequenceFile.size());

    boolean[] found = new boolean[4];
    for (int i = 0; i < 4; i++) {
        int foundIndex = -1;
        List<List<Writable>> collection = fromSequenceFile.get(i);
        for (int j = 0; j < 4; j++) {
            if (collection.equals(list.get(j))) {
                if (foundIndex != -1)
                    fail(); //Already found this value -> suggests this spark value equals two or more of local version? (Shouldn't happen)
                foundIndex = j;
                if (found[foundIndex])
                    fail(); //One of the other spark values was equal to this one -> suggests duplicates in Spark list
                found[foundIndex] = true; //mark this one as seen before
            }
        }
    }
    int count = 0;
    for (boolean b : found)
        if (b)
            count++;
    assertEquals(4, count); //Expect all 4 and exactly 4 pairwise matches between spark and local versions
}
Example 14
Source File: AreaTop3ProductSpark.java From BigDataPlatform with GNU General Public License v3.0 | 4 votes |
/**
 * Generate the temporary table of basic clicked-product information
 * @param sqlContext
 * @param cityid2clickActionRDD
 * @param cityid2cityInfoRDD
 */
private static void generateTempClickProductBasicTable(
        SQLContext sqlContext,
        JavaPairRDD<Long, Row> cityid2clickActionRDD,
        JavaPairRDD<Long, Row> cityid2cityInfoRDD) {
    // Perform a join to associate the click-action data with the city data
    JavaPairRDD<Long, Tuple2<Row, Row>> joinedRDD =
            cityid2clickActionRDD.join(cityid2cityInfoRDD);

    // Convert the JavaPairRDD above into a JavaRDD<Row> (required before the RDD can become a Dataset<Row>)
    JavaRDD<Row> mappedRDD = joinedRDD.map(
            new Function<Tuple2<Long,Tuple2<Row,Row>>, Row>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Row call(Tuple2<Long, Tuple2<Row, Row>> tuple) throws Exception {
                    Long cityid = tuple._1;
                    Row clickAction = tuple._2._1;
                    Row cityInfo = tuple._2._2;

                    Long productid = clickAction.getLong(1);
                    String cityName = cityInfo.getString(1);
                    String area = cityInfo.getString(2);

                    return RowFactory.create(cityid, cityName, area, productid);
                }
            });

    // With the rows in JavaRDD<Row> form, they can be converted into a Dataset<Row>
    List<StructField> structFields = new ArrayList<StructField>();
    structFields.add(DataTypes.createStructField("city_id", DataTypes.LongType, true));
    structFields.add(DataTypes.createStructField("city_name", DataTypes.StringType, true));
    structFields.add(DataTypes.createStructField("area", DataTypes.StringType, true));
    structFields.add(DataTypes.createStructField("product_id", DataTypes.LongType, true));

    // Example rows: 1 Beijing / 2 Shanghai / 1 Beijing
    // group by area, product_id
    // City ids map to names: 1: Beijing, 2: Shanghai
    // Two helper functions:
    // UDF: concat2() concatenates two fields with a given separator
    // UDAF: group_concat_distinct() concatenates the values within a group with commas, removing duplicates

    StructType schema = DataTypes.createStructType(structFields);

    Dataset<Row> df = sqlContext.createDataFrame(mappedRDD, schema);
    System.out.println("tmp_click_product_basic: " + df.count());

    // Register the data in the Dataset<Row> as a temporary table (tmp_click_product_basic)
    df.registerTempTable("tmp_click_product_basic");
}
Example 15
Source File: SQLQueryFastq.java From ViraPipe with MIT License | 4 votes |
public static void main(String[] args) throws IOException { SparkConf conf = new SparkConf().setAppName("SQLQueryFastq"); JavaSparkContext sc = new JavaSparkContext(conf); SQLContext sqlContext = new SQLContext(sc); Options options = new Options(); Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." ); Option queryOpt = new Option( "query", true, "SQL query string." ); Option samOpt = new Option( "format", true, "parquet or fastq" ); Option baminOpt = new Option( "in", true, "" ); options.addOption( new Option( "tablename", true, "Default sql table name is 'records'")); options.addOption( opOpt ); options.addOption( queryOpt ); options.addOption( samOpt ); options.addOption( baminOpt ); CommandLineParser parser = new BasicParser(); CommandLine cmd = null; try { // parse the command line arguments cmd = parser.parse( options, args ); } catch( ParseException exp ) { // oops, something went wrong System.err.println( "Parsing failed. Reason: " + exp.getMessage() ); } String outDir = (cmd.hasOption("out")==true)? cmd.getOptionValue("out"):null; String query = (cmd.hasOption("query")==true)? cmd.getOptionValue("query"):null; String format = (cmd.hasOption("format")==true)? cmd.getOptionValue("format"):"fastq"; String in = (cmd.hasOption("in")==true)? cmd.getOptionValue("in"):null; tablename = (cmd.hasOption("tablename")==true)? cmd.getOptionValue("tablename"):"records"; sc.hadoopConfiguration().setBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, true); JavaPairRDD<Text, SequencedFragment> fastqRDD = sc.newAPIHadoopFile(in, FastqInputFormat.class, Text.class, SequencedFragment.class, sc.hadoopConfiguration()); JavaRDD<MyRead> rdd = fastqRDD.map(record -> { MyRead read = new MyRead(); read.setKey(record._1.toString()); read.setSequence(record._2.getSequence().toString()); read.setRead(record._2.getRead()); read.setQuality(record._2.getQuality().toString()); read.setTile(record._2.getTile()); read.setXpos(record._2.getXpos()); read.setYpos(record._2.getYpos()); read.setRunNumber(record._2.getRunNumber()); read.setInstrument(record._2.getInstrument()); read.setFlowcellId(record._2.getFlowcellId()); read.setLane(record._2.getLane()); read.setControlNumber(record._2.getControlNumber()); read.setFilterPassed(record._2.getFilterPassed()); return read; }); Dataset df = sqlContext.createDataFrame(rdd, MyRead.class); df.registerTempTable(tablename); //eq. count duplicates "SELECT count(DISTINCT(sequence)) FROM records" //"SELECT key,LEN(sequence) as l FROM records where l<100;" if(query!=null) { //JavaRDD<MyAlignment> rdd = samRDD.map(bam -> new MyAlignment(bam.getReadName(), bam.getStart(), bam.getReferenceName(), bam.getReadLength(), new String(bam.getReadBases(), StandardCharsets.UTF_8), bam.getCigarString(), bam.getReadUnmappedFlag(), bam.getDuplicateReadFlag(), bam)); //Save as parquet file Dataset<Row> resultDF = sqlContext.sql(query); resultDF.show(100, false); if(outDir!=null){ if(format.equals("fastq")){ JavaPairRDD<Text, SequencedFragment> resultRDD = dfToFastqRDD(resultDF); resultRDD.saveAsNewAPIHadoopFile(outDir, Text.class, SequencedFragment.class, FastqOutputFormat.class, sc.hadoopConfiguration()); } else resultDF.write().parquet(outDir); } } sc.stop(); }
Example 16
Source File: SamToFastq.java From ViraPipe with MIT License | 4 votes |
public static void main(String[] args) throws IOException { SparkConf conf = new SparkConf().setAppName("SamToFastq"); sc = new JavaSparkContext(conf); String in = args[0]; String out = args[1]; JavaPairRDD<LongWritable, SAMRecordWritable> bamPairRDD = sc.newAPIHadoopFile(in, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, sc.hadoopConfiguration()); //Map to SAMRecord RDD JavaRDD<SAMRecord> samRDD = bamPairRDD.map(v1 -> v1._2().get()); JavaPairRDD<Text, SequencedFragment> fastqrdd = mapSAMRecordsToFastq(samRDD); fastqrdd.saveAsNewAPIHadoopFile(out, Text.class, SequencedFragment.class, FastqOutputFormat.class, sc.hadoopConfiguration()); sc.stop(); }
Example 17
Source File: SparkUtils.java From deeplearning4j with Apache License 2.0 | 4 votes |
public static <T> JavaRDD<T> repartitionApproximateBalance(JavaRDD<T> rdd, Repartition repartition, int numPartitions) {
    int origNumPartitions = rdd.partitions().size();
    switch (repartition) {
        case Never:
            return rdd;
        case NumPartitionsWorkersDiffers:
            if (origNumPartitions == numPartitions)
                return rdd;
        case Always:
            // Count each partition...
            List<Integer> partitionCounts =
                    rdd.mapPartitionsWithIndex(new Function2<Integer, Iterator<T>, Iterator<Integer>>() {
                        @Override
                        public Iterator<Integer> call(Integer integer, Iterator<T> tIterator) throws Exception {
                            int count = 0;
                            while (tIterator.hasNext()) {
                                tIterator.next();
                                count++;
                            }
                            return Collections.singletonList(count).iterator();
                        }
                    }, true).collect();

            Integer totalCount = 0;
            for (Integer i : partitionCounts)
                totalCount += i;

            List<Double> partitionWeights = new ArrayList<>(Math.max(numPartitions, origNumPartitions));
            Double ideal = (double) totalCount / numPartitions;
            // partitions in the initial set and not in the final one get -1 => elements always jump
            // partitions in the final set not in the initial one get 0 => aim to receive the average amount
            for (int i = 0; i < Math.min(origNumPartitions, numPartitions); i++) {
                partitionWeights.add((double) partitionCounts.get(i) / ideal);
            }
            for (int i = Math.min(origNumPartitions, numPartitions); i < Math.max(origNumPartitions, numPartitions); i++) {
                // we shrink the # of partitions
                if (i >= numPartitions)
                    partitionWeights.add(-1D);
                // we enlarge the # of partitions
                else
                    partitionWeights.add(0D);
            }

            // this method won't trigger a spark job, which is different from {@link org.apache.spark.rdd.RDD#zipWithIndex}
            JavaPairRDD<Tuple2<Long, Integer>, T> indexedRDD = rdd.zipWithUniqueId()
                    .mapToPair(new PairFunction<Tuple2<T, Long>, Tuple2<Long, Integer>, T>() {
                        @Override
                        public Tuple2<Tuple2<Long, Integer>, T> call(Tuple2<T, Long> tLongTuple2) {
                            return new Tuple2<>(new Tuple2<Long, Integer>(tLongTuple2._2(), 0), tLongTuple2._1());
                        }
                    });

            HashingBalancedPartitioner hbp =
                    new HashingBalancedPartitioner(Collections.singletonList(partitionWeights));
            JavaPairRDD<Tuple2<Long, Integer>, T> partitionedRDD = indexedRDD.partitionBy(hbp);

            return partitionedRDD.map(new Function<Tuple2<Tuple2<Long, Integer>, T>, T>() {
                @Override
                public T call(Tuple2<Tuple2<Long, Integer>, T> indexNPayload) {
                    return indexNPayload._2();
                }
            });
        default:
            throw new RuntimeException("Unknown setting for repartition: " + repartition);
    }
}
Example 18
Source File: AggregateUnarySPInstruction.java From systemds with Apache License 2.0 | 4 votes |
private void processMatrixAggregate(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;
    DataCharacteristics mc = sec.getDataCharacteristics(input1.getName());

    //get input
    JavaPairRDD<MatrixIndexes,MatrixBlock> in = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = in;

    //filter input blocks for trace
    if( getOpcode().equalsIgnoreCase("uaktrace") )
        out = out.filter(new FilterDiagMatrixBlocksFunction());

    //execute unary aggregate operation
    AggregateUnaryOperator auop = (AggregateUnaryOperator)_optr;
    AggregateOperator aggop = _aop;

    //perform aggregation if necessary and put output into symbol table
    if( _aggtype == SparkAggType.SINGLE_BLOCK ) {
        if( auop.sparseSafe )
            out = out.filter(new FilterNonEmptyBlocksFunction());

        JavaRDD<MatrixBlock> out2 = out.map(new RDDUAggFunction2(auop, mc.getBlocksize()));
        MatrixBlock out3 = RDDAggregateUtils.aggStable(out2, aggop);

        //drop correction after aggregation
        out3.dropLastRowsOrColumns(aggop.correction);

        //put output block into symbol table (no lineage because single block)
        //this also includes implicit maintenance of matrix characteristics
        sec.setMatrixOutput(output.getName(), out3);
    }
    else //MULTI_BLOCK or NONE
    {
        if( _aggtype == SparkAggType.NONE ) {
            //in case of no block aggregation, we always drop the correction as well as
            //use a partitioning-preserving mapvalues
            out = out.mapValues(new RDDUAggValueFunction(auop, mc.getBlocksize()));
        }
        else if( _aggtype == SparkAggType.MULTI_BLOCK ) {
            //in case of multi-block aggregation, we always keep the correction
            out = out.mapToPair(new RDDUAggFunction(auop, mc.getBlocksize()));
            out = RDDAggregateUtils.aggByKeyStable(out, aggop, false);

            //drop correction after aggregation if required (aggbykey creates
            //partitioning, drop correction via partitioning-preserving mapvalues)
            if( auop.aggOp.existsCorrection() )
                out = out.mapValues( new AggregateDropCorrectionFunction(aggop) );
        }

        //put output RDD handle into symbol table
        updateUnaryAggOutputDataCharacteristics(sec, auop.indexFn);
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), input1.getName());
    }
}