Java Code Examples for org.apache.spark.api.java.JavaSparkContext#newAPIHadoopFile()
The following examples show how to use org.apache.spark.api.java.JavaSparkContext#newAPIHadoopFile(). Each example is taken from an open-source project; the source file and license are listed above the corresponding snippet.
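Before the project examples, here is a minimal, self-contained sketch of the general call pattern (not taken from any of the projects below). It assumes a plain text file at the hypothetical path hdfs:///tmp/input.txt and uses Hadoop's new-API TextInputFormat, which yields LongWritable byte offsets as keys and Text lines as values; the class name NewAPIHadoopFileSketch is made up for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class NewAPIHadoopFileSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("NewAPIHadoopFileSketch").setMaster("local[2]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // newAPIHadoopFile(path, inputFormatClass, keyClass, valueClass, hadoopConf)
        // returns a JavaPairRDD<K, V> backed by the given new-API (mapreduce) InputFormat.
        JavaPairRDD<LongWritable, Text> lines = sc.newAPIHadoopFile(
            "hdfs:///tmp/input.txt",   // hypothetical input path
            TextInputFormat.class,     // new-API input format (org.apache.hadoop.mapreduce)
            LongWritable.class,        // key type produced by the format (byte offset)
            Text.class,                // value type produced by the format (line contents)
            new Configuration());      // Hadoop configuration; sc.hadoopConfiguration() also works

        System.out.println("records: " + lines.count());
        sc.stop();
    }
}

The project examples below follow the same pattern, swapping in custom input formats (video, copybook, FASTQ, BAM/SAM) and their key/value types.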
Example 1
Source File: InputFormatTest.java From HadoopCV with Apache License 2.0
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("VideoInput").setMaster("local[2]");
    JavaSparkContext sc = new JavaSparkContext(conf);

    Configuration hc = new org.apache.hadoop.conf.Configuration();
    JavaPairRDD<Text, HBMat> video = sc.newAPIHadoopFile("data/bike.avi",
        VideoInputFormat.class, Text.class, HBMat.class, hc);

    video.foreach(new VoidFunction<Tuple2<Text, HBMat>>() {
        @Override
        public void call(Tuple2<Text, HBMat> tuple) throws Exception {
            HBMat image = (HBMat) tuple._2;
            System.out.print(image.getBmat().dump());
        }
    });

    System.out.print(video.count());
}
Example 2
Source File: FileSystemInput.java From envelope with Apache License 2.0
private Dataset<Row> getEncodedRowsFromInputFormat(String path, Class<? extends InputFormat> inputFormatClass) {
    JavaSparkContext context = new JavaSparkContext(Contexts.getSparkSession().sparkContext());

    JavaPairRDD rawRDD = context.newAPIHadoopFile(
        path, inputFormatClass, convertToClass(getKeyDataType()), convertToClass(getValueDataType()), new Configuration());

    boolean useKey = getKeyDataType() != null;

    JavaRDD<Row> encodedRDD = rawRDD.map(new EncodeRecordAsKeyValueFunction(useKey));

    return Contexts.getSparkSession().createDataFrame(encodedRDD, getProvidingSchema());
}
Example 3
Source File: CopybookSparkExample.java From CopybookInputFormat with Apache License 2.0
public static void main(String[] args) {
    if (args.length < 4) {
        System.out.println("CopybookSparkExample {master} {copybookInputPath} {dataFileInputPath} {outputFolder}");
        return;
    }

    String master = args[0];
    String copybookInputPath = args[1];
    String dataFileInputPath = args[2];
    String outputPath = args[3];

    JavaSparkContext jsc = new JavaSparkContext(master, "UniqueSeqGenerator", null, "SparkCopybookExample.jar");

    Configuration config = new Configuration();
    config.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml"));
    config.addResource(new Path("/etc/hadoop/conf/mapred-site.xml"));
    config.addResource(new Path("/etc/hadoop/conf/yarn-site.xml"));
    config.addResource(new Path("/etc/hadoop/conf/core-site.xml"));

    CopybookInputFormat.setCopybookHdfsPath(config, copybookInputPath);

    JavaPairRDD<LongWritable, Text> rdd = jsc.newAPIHadoopFile(dataFileInputPath,
        CopybookInputFormat.class, LongWritable.class, Text.class, config);
    JavaRDD<String> pipeDelimiter = rdd.map(new MapFunction());
    pipeDelimiter.saveAsTextFile(outputPath);
}
Example 4
Source File: RepartitionFastq.java From ViraPipe with MIT License
public static void main(String[] args) throws IOException {
    if (args.length < 3) {
        System.err.println("Usage: RepartitionFastq <input path> <output path> <number of partitions>");
        System.exit(1);
    }

    SparkConf conf = new SparkConf().setAppName("RepartitionFastq");
    //conf.set("spark.default.parallelism", String.valueOf(args[2]));
    JavaSparkContext sc = new JavaSparkContext(conf);

    JavaPairRDD<Text, SequencedFragment> fastqRDD = sc.newAPIHadoopFile(args[0],
        FastqInputFormat.class, Text.class, SequencedFragment.class, sc.hadoopConfiguration());

    JavaPairRDD<Text, SequencedFragment> repartitioned = fastqRDD.repartition(Integer.valueOf(args[2]));

    repartitioned.saveAsNewAPIHadoopFile(args[1], Text.class, SequencedFragment.class,
        FastqOutputFormat.class, sc.hadoopConfiguration());

    sc.stop();
}
Example 5
Source File: SamToFastq.java From ViraPipe with MIT License
public static void main(String[] args) throws IOException { SparkConf conf = new SparkConf().setAppName("SamToFastq"); sc = new JavaSparkContext(conf); String in = args[0]; String out = args[1]; JavaPairRDD<LongWritable, SAMRecordWritable> bamPairRDD = sc.newAPIHadoopFile(in, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, sc.hadoopConfiguration()); //Map to SAMRecord RDD JavaRDD<SAMRecord> samRDD = bamPairRDD.map(v1 -> v1._2().get()); JavaPairRDD<Text, SequencedFragment> fastqrdd = mapSAMRecordsToFastq(samRDD); fastqrdd.saveAsNewAPIHadoopFile(out, Text.class, SequencedFragment.class, FastqOutputFormat.class, sc.hadoopConfiguration()); sc.stop(); }
Example 6
Source File: MergeFastq.java From ViraPipe with MIT License
public static void main(String[] args) throws IOException {
    if (args.length < 3) {
        System.err.println("Usage: MergeFastq <input path> <output path> <number of partitions>");
        System.exit(1);
    }

    SparkConf conf = new SparkConf().setAppName("MergeFastq");
    JavaSparkContext sc = new JavaSparkContext(conf);

    JavaPairRDD<Text, SequencedFragment> fastqRDD = sc.newAPIHadoopFile(args[0],
        FastqInputFormat.class, Text.class, SequencedFragment.class, sc.hadoopConfiguration());

    JavaPairRDD<Text, SequencedFragment> coalesced = fastqRDD.coalesce(Integer.valueOf(args[2]));

    coalesced.saveAsNewAPIHadoopFile(args[1], Text.class, SequencedFragment.class,
        FastqOutputFormat.class, sc.hadoopConfiguration());

    sc.stop();
}
Example 7
Source File: SQLQueryFastq.java From ViraPipe with MIT License
public static void main(String[] args) throws IOException { SparkConf conf = new SparkConf().setAppName("SQLQueryFastq"); JavaSparkContext sc = new JavaSparkContext(conf); SQLContext sqlContext = new SQLContext(sc); Options options = new Options(); Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." ); Option queryOpt = new Option( "query", true, "SQL query string." ); Option samOpt = new Option( "format", true, "parquet or fastq" ); Option baminOpt = new Option( "in", true, "" ); options.addOption( new Option( "tablename", true, "Default sql table name is 'records'")); options.addOption( opOpt ); options.addOption( queryOpt ); options.addOption( samOpt ); options.addOption( baminOpt ); CommandLineParser parser = new BasicParser(); CommandLine cmd = null; try { // parse the command line arguments cmd = parser.parse( options, args ); } catch( ParseException exp ) { // oops, something went wrong System.err.println( "Parsing failed. Reason: " + exp.getMessage() ); } String outDir = (cmd.hasOption("out")==true)? cmd.getOptionValue("out"):null; String query = (cmd.hasOption("query")==true)? cmd.getOptionValue("query"):null; String format = (cmd.hasOption("format")==true)? cmd.getOptionValue("format"):"fastq"; String in = (cmd.hasOption("in")==true)? cmd.getOptionValue("in"):null; tablename = (cmd.hasOption("tablename")==true)? cmd.getOptionValue("tablename"):"records"; sc.hadoopConfiguration().setBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, true); JavaPairRDD<Text, SequencedFragment> fastqRDD = sc.newAPIHadoopFile(in, FastqInputFormat.class, Text.class, SequencedFragment.class, sc.hadoopConfiguration()); JavaRDD<MyRead> rdd = fastqRDD.map(record -> { MyRead read = new MyRead(); read.setKey(record._1.toString()); read.setSequence(record._2.getSequence().toString()); read.setRead(record._2.getRead()); read.setQuality(record._2.getQuality().toString()); read.setTile(record._2.getTile()); read.setXpos(record._2.getXpos()); read.setYpos(record._2.getYpos()); read.setRunNumber(record._2.getRunNumber()); read.setInstrument(record._2.getInstrument()); read.setFlowcellId(record._2.getFlowcellId()); read.setLane(record._2.getLane()); read.setControlNumber(record._2.getControlNumber()); read.setFilterPassed(record._2.getFilterPassed()); return read; }); Dataset df = sqlContext.createDataFrame(rdd, MyRead.class); df.registerTempTable(tablename); //eq. count duplicates "SELECT count(DISTINCT(sequence)) FROM records" //"SELECT key,LEN(sequence) as l FROM records where l<100;" if(query!=null) { //JavaRDD<MyAlignment> rdd = samRDD.map(bam -> new MyAlignment(bam.getReadName(), bam.getStart(), bam.getReferenceName(), bam.getReadLength(), new String(bam.getReadBases(), StandardCharsets.UTF_8), bam.getCigarString(), bam.getReadUnmappedFlag(), bam.getDuplicateReadFlag(), bam)); //Save as parquet file Dataset<Row> resultDF = sqlContext.sql(query); resultDF.show(100, false); if(outDir!=null){ if(format.equals("fastq")){ JavaPairRDD<Text, SequencedFragment> resultRDD = dfToFastqRDD(resultDF); resultRDD.saveAsNewAPIHadoopFile(outDir, Text.class, SequencedFragment.class, FastqOutputFormat.class, sc.hadoopConfiguration()); } else resultDF.write().parquet(outDir); } } sc.stop(); }
Example 8
Source File: SQLQueryBAM.java From ViraPipe with MIT License
public static void main(String[] args) throws IOException { SparkConf conf = new SparkConf().setAppName("SQLQueryBAM"); JavaSparkContext sc = new JavaSparkContext(conf); SQLContext sqlContext = new HiveContext(sc.sc()); Options options = new Options(); Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." ); Option queryOpt = new Option( "query", true, "SQL query string." ); Option baminOpt = new Option( "in", true, "" ); options.addOption( opOpt ); options.addOption( queryOpt ); options.addOption( baminOpt ); CommandLineParser parser = new BasicParser(); CommandLine cmd = null; try { cmd = parser.parse( options, args ); } catch( ParseException exp ) { System.err.println( "Parsing failed. Reason: " + exp.getMessage() ); } String bwaOutDir = (cmd.hasOption("out")==true)? cmd.getOptionValue("out"):null; String query = (cmd.hasOption("query")==true)? cmd.getOptionValue("query"):null; String bamin = (cmd.hasOption("in")==true)? cmd.getOptionValue("in"):null; sc.hadoopConfiguration().setBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, true); //Read BAM/SAM from HDFS JavaPairRDD<LongWritable, SAMRecordWritable> bamPairRDD = sc.newAPIHadoopFile(bamin, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, sc.hadoopConfiguration()); //Map to SAMRecord RDD JavaRDD<SAMRecord> samRDD = bamPairRDD.map(v1 -> v1._2().get()); JavaRDD<MyAlignment> rdd = samRDD.map(bam -> new MyAlignment(bam.getReadName(), bam.getStart(), bam.getReferenceName(), bam.getReadLength(), new String(bam.getReadBases(), StandardCharsets.UTF_8), bam.getCigarString(), bam.getReadUnmappedFlag(), bam.getDuplicateReadFlag())); Dataset<Row> samDF = sqlContext.createDataFrame(rdd, MyAlignment.class); samDF.registerTempTable(tablename); if(query!=null) { //Save as parquet file Dataset df2 = sqlContext.sql(query); df2.show(100,false); if(bwaOutDir!=null) df2.write().parquet(bwaOutDir); }else{ if(bwaOutDir!=null) samDF.write().parquet(bwaOutDir); } sc.stop(); }