Java Code Examples for org.apache.spark.api.java.JavaSparkContext#newAPIHadoopFile()
The following examples show how to use org.apache.spark.api.java.JavaSparkContext#newAPIHadoopFile(). Each example is taken from an open-source project; the source file and license are listed above the corresponding snippet.
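Before the project examples, here is a minimal, self-contained sketch of the general call pattern (not taken from any of the projects below). It assumes a plain text file at the hypothetical path hdfs:///tmp/input.txt and uses Hadoop's new-API TextInputFormat, which yields LongWritable byte offsets as keys and Text lines as values; the class name NewAPIHadoopFileSketch is made up for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class NewAPIHadoopFileSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("NewAPIHadoopFileSketch").setMaster("local[2]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // newAPIHadoopFile(path, inputFormatClass, keyClass, valueClass, hadoopConf)
        // returns a JavaPairRDD<K, V> backed by the given new-API (mapreduce) InputFormat.
        JavaPairRDD<LongWritable, Text> lines = sc.newAPIHadoopFile(
            "hdfs:///tmp/input.txt",   // hypothetical input path
            TextInputFormat.class,     // new-API input format (org.apache.hadoop.mapreduce)
            LongWritable.class,        // key type produced by the format (byte offset)
            Text.class,                // value type produced by the format (line contents)
            new Configuration());      // Hadoop configuration; sc.hadoopConfiguration() also works

        System.out.println("records: " + lines.count());
        sc.stop();
    }
}

The project examples below follow the same pattern, swapping in custom input formats (video, copybook, FASTQ, BAM/SAM) and their key/value types.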
Example 1
Source File: InputFormatTest.java From HadoopCV with Apache License 2.0
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("VideoInput").setMaster("local[2]");
    JavaSparkContext sc = new JavaSparkContext(conf);

    Configuration hc = new org.apache.hadoop.conf.Configuration();
    JavaPairRDD<Text, HBMat> video = sc.newAPIHadoopFile("data/bike.avi",
        VideoInputFormat.class, Text.class, HBMat.class, hc);

    video.foreach(new VoidFunction<Tuple2<Text, HBMat>>() {
        @Override
        public void call(Tuple2<Text, HBMat> tuple) throws Exception {
            HBMat image = (HBMat) tuple._2;
            System.out.print(image.getBmat().dump());
        }
    });

    System.out.print(video.count());
}
Example 2
Source File: FileSystemInput.java From envelope with Apache License 2.0
private Dataset<Row> getEncodedRowsFromInputFormat(String path, Class<? extends InputFormat> inputFormatClass) {
    JavaSparkContext context = new JavaSparkContext(Contexts.getSparkSession().sparkContext());

    JavaPairRDD rawRDD = context.newAPIHadoopFile(
        path, inputFormatClass, convertToClass(getKeyDataType()), convertToClass(getValueDataType()), new Configuration());

    boolean useKey = getKeyDataType() != null;

    JavaRDD<Row> encodedRDD = rawRDD.map(new EncodeRecordAsKeyValueFunction(useKey));

    return Contexts.getSparkSession().createDataFrame(encodedRDD, getProvidingSchema());
}
Example 3
Source File: CopybookSparkExample.java From CopybookInputFormat with Apache License 2.0
public static void main(String[] args) {
    if (args.length < 4) {
        System.out.println("CopybookSparkExample {master} {copybookInputPath} {dataFileInputPath} {outputFolder}");
        return;
    }

    String master = args[0];
    String copybookInputPath = args[1];
    String dataFileInputPath = args[2];
    String outputPath = args[3];

    JavaSparkContext jsc = new JavaSparkContext(master, "UniqueSeqGenerator", null, "SparkCopybookExample.jar");

    Configuration config = new Configuration();
    config.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml"));
    config.addResource(new Path("/etc/hadoop/conf/mapred-site.xml"));
    config.addResource(new Path("/etc/hadoop/conf/yarn-site.xml"));
    config.addResource(new Path("/etc/hadoop/conf/core-site.xml"));

    CopybookInputFormat.setCopybookHdfsPath(config, copybookInputPath);

    JavaPairRDD<LongWritable, Text> rdd = jsc.newAPIHadoopFile(dataFileInputPath,
        CopybookInputFormat.class, LongWritable.class, Text.class, config);
    JavaRDD<String> pipeDelimiter = rdd.map(new MapFunction());
    pipeDelimiter.saveAsTextFile(outputPath);
}
Example 4
Source File: RepartitionFastq.java From ViraPipe with MIT License
public static void main(String[] args) throws IOException {
    if (args.length < 3) {
        System.err.println("Usage: RepartitionFastq <input path> <output path> <number of partitions>");
        System.exit(1);
    }

    SparkConf conf = new SparkConf().setAppName("RepartitionFastq");
    //conf.set("spark.default.parallelism", String.valueOf(args[2]));
    JavaSparkContext sc = new JavaSparkContext(conf);

    JavaPairRDD<Text, SequencedFragment> fastqRDD = sc.newAPIHadoopFile(args[0],
        FastqInputFormat.class, Text.class, SequencedFragment.class, sc.hadoopConfiguration());

    JavaPairRDD<Text, SequencedFragment> repartitioned = fastqRDD.repartition(Integer.valueOf(args[2]));

    repartitioned.saveAsNewAPIHadoopFile(args[1], Text.class, SequencedFragment.class,
        FastqOutputFormat.class, sc.hadoopConfiguration());

    sc.stop();
}
Example 5
Source File: SamToFastq.java From ViraPipe with MIT License
public static void main(String[] args) throws IOException { SparkConf conf = new SparkConf().setAppName("SamToFastq"); sc = new JavaSparkContext(conf); String in = args[0]; String out = args[1]; JavaPairRDD<LongWritable, SAMRecordWritable> bamPairRDD = sc.newAPIHadoopFile(in, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, sc.hadoopConfiguration()); //Map to SAMRecord RDD JavaRDD<SAMRecord> samRDD = bamPairRDD.map(v1 -> v1._2().get()); JavaPairRDD<Text, SequencedFragment> fastqrdd = mapSAMRecordsToFastq(samRDD); fastqrdd.saveAsNewAPIHadoopFile(out, Text.class, SequencedFragment.class, FastqOutputFormat.class, sc.hadoopConfiguration()); sc.stop(); }
Example 6
Source File: MergeFastq.java From ViraPipe with MIT License
public static void main(String[] args) throws IOException {
    if (args.length < 3) {
        System.err.println("Usage: MergeFastq <input path> <output path> <number of partitions>");
        System.exit(1);
    }

    SparkConf conf = new SparkConf().setAppName("MergeFastq");
    JavaSparkContext sc = new JavaSparkContext(conf);

    JavaPairRDD<Text, SequencedFragment> fastqRDD = sc.newAPIHadoopFile(args[0],
        FastqInputFormat.class, Text.class, SequencedFragment.class, sc.hadoopConfiguration());

    JavaPairRDD<Text, SequencedFragment> coalesced = fastqRDD.coalesce(Integer.valueOf(args[2]));

    coalesced.saveAsNewAPIHadoopFile(args[1], Text.class, SequencedFragment.class,
        FastqOutputFormat.class, sc.hadoopConfiguration());

    sc.stop();
}
Example 7
Source File: SQLQueryFastq.java From ViraPipe with MIT License
public static void main(String[] args) throws IOException { SparkConf conf = new SparkConf().setAppName("SQLQueryFastq"); JavaSparkContext sc = new JavaSparkContext(conf); SQLContext sqlContext = new SQLContext(sc); Options options = new Options(); Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." ); Option queryOpt = new Option( "query", true, "SQL query string." ); Option samOpt = new Option( "format", true, "parquet or fastq" ); Option baminOpt = new Option( "in", true, "" ); options.addOption( new Option( "tablename", true, "Default sql table name is 'records'")); options.addOption( opOpt ); options.addOption( queryOpt ); options.addOption( samOpt ); options.addOption( baminOpt ); CommandLineParser parser = new BasicParser(); CommandLine cmd = null; try { // parse the command line arguments cmd = parser.parse( options, args ); } catch( ParseException exp ) { // oops, something went wrong System.err.println( "Parsing failed. Reason: " + exp.getMessage() ); } String outDir = (cmd.hasOption("out")==true)? cmd.getOptionValue("out"):null; String query = (cmd.hasOption("query")==true)? cmd.getOptionValue("query"):null; String format = (cmd.hasOption("format")==true)? cmd.getOptionValue("format"):"fastq"; String in = (cmd.hasOption("in")==true)? cmd.getOptionValue("in"):null; tablename = (cmd.hasOption("tablename")==true)? cmd.getOptionValue("tablename"):"records"; sc.hadoopConfiguration().setBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, true); JavaPairRDD<Text, SequencedFragment> fastqRDD = sc.newAPIHadoopFile(in, FastqInputFormat.class, Text.class, SequencedFragment.class, sc.hadoopConfiguration()); JavaRDD<MyRead> rdd = fastqRDD.map(record -> { MyRead read = new MyRead(); read.setKey(record._1.toString()); read.setSequence(record._2.getSequence().toString()); read.setRead(record._2.getRead()); read.setQuality(record._2.getQuality().toString()); read.setTile(record._2.getTile()); read.setXpos(record._2.getXpos()); read.setYpos(record._2.getYpos()); read.setRunNumber(record._2.getRunNumber()); read.setInstrument(record._2.getInstrument()); read.setFlowcellId(record._2.getFlowcellId()); read.setLane(record._2.getLane()); read.setControlNumber(record._2.getControlNumber()); read.setFilterPassed(record._2.getFilterPassed()); return read; }); Dataset df = sqlContext.createDataFrame(rdd, MyRead.class); df.registerTempTable(tablename); //eq. count duplicates "SELECT count(DISTINCT(sequence)) FROM records" //"SELECT key,LEN(sequence) as l FROM records where l<100;" if(query!=null) { //JavaRDD<MyAlignment> rdd = samRDD.map(bam -> new MyAlignment(bam.getReadName(), bam.getStart(), bam.getReferenceName(), bam.getReadLength(), new String(bam.getReadBases(), StandardCharsets.UTF_8), bam.getCigarString(), bam.getReadUnmappedFlag(), bam.getDuplicateReadFlag(), bam)); //Save as parquet file Dataset<Row> resultDF = sqlContext.sql(query); resultDF.show(100, false); if(outDir!=null){ if(format.equals("fastq")){ JavaPairRDD<Text, SequencedFragment> resultRDD = dfToFastqRDD(resultDF); resultRDD.saveAsNewAPIHadoopFile(outDir, Text.class, SequencedFragment.class, FastqOutputFormat.class, sc.hadoopConfiguration()); } else resultDF.write().parquet(outDir); } } sc.stop(); }
Example 8
Source File: SQLQueryBAM.java From ViraPipe with MIT License
public static void main(String[] args) throws IOException { SparkConf conf = new SparkConf().setAppName("SQLQueryBAM"); JavaSparkContext sc = new JavaSparkContext(conf); SQLContext sqlContext = new HiveContext(sc.sc()); Options options = new Options(); Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." ); Option queryOpt = new Option( "query", true, "SQL query string." ); Option baminOpt = new Option( "in", true, "" ); options.addOption( opOpt ); options.addOption( queryOpt ); options.addOption( baminOpt ); CommandLineParser parser = new BasicParser(); CommandLine cmd = null; try { cmd = parser.parse( options, args ); } catch( ParseException exp ) { System.err.println( "Parsing failed. Reason: " + exp.getMessage() ); } String bwaOutDir = (cmd.hasOption("out")==true)? cmd.getOptionValue("out"):null; String query = (cmd.hasOption("query")==true)? cmd.getOptionValue("query"):null; String bamin = (cmd.hasOption("in")==true)? cmd.getOptionValue("in"):null; sc.hadoopConfiguration().setBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, true); //Read BAM/SAM from HDFS JavaPairRDD<LongWritable, SAMRecordWritable> bamPairRDD = sc.newAPIHadoopFile(bamin, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, sc.hadoopConfiguration()); //Map to SAMRecord RDD JavaRDD<SAMRecord> samRDD = bamPairRDD.map(v1 -> v1._2().get()); JavaRDD<MyAlignment> rdd = samRDD.map(bam -> new MyAlignment(bam.getReadName(), bam.getStart(), bam.getReferenceName(), bam.getReadLength(), new String(bam.getReadBases(), StandardCharsets.UTF_8), bam.getCigarString(), bam.getReadUnmappedFlag(), bam.getDuplicateReadFlag())); Dataset<Row> samDF = sqlContext.createDataFrame(rdd, MyAlignment.class); samDF.registerTempTable(tablename); if(query!=null) { //Save as parquet file Dataset df2 = sqlContext.sql(query); df2.show(100,false); if(bwaOutDir!=null) df2.write().parquet(bwaOutDir); }else{ if(bwaOutDir!=null) samDF.write().parquet(bwaOutDir); } sc.stop(); }