Java Code Examples for org.apache.spark.api.java.JavaRDD#zip()
The following examples show how to use org.apache.spark.api.java.JavaRDD#zip().
You can go to the original project or source file by following the reference above each example.
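For orientation, here is a minimal, self-contained sketch of JavaRDD#zip() before the project examples. zip() pairs the i-th element of one RDD with the i-th element of another and assumes both RDDs have the same number of partitions and the same number of elements per partition. The class name, variable names, and local master URL below are illustrative only and are not taken from any of the projects listed.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

// Illustrative sketch only: zip two RDDs element-wise into a JavaPairRDD.
public class ZipSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("ZipSketch").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            // Both RDDs are parallelized from equally sized lists with the same
            // number of partitions, so their partitions line up element for element.
            JavaRDD<String> ids = sc.parallelize(Arrays.asList("a", "b", "c", "d"), 2);
            JavaRDD<Integer> counts = sc.parallelize(Arrays.asList(1, 2, 3, 4), 2);

            // zip() pairs the i-th element of `ids` with the i-th element of `counts`.
            JavaPairRDD<String, Integer> zipped = ids.zip(counts);

            List<Tuple2<String, Integer>> pairs = zipped.collect();
            pairs.forEach(p -> System.out.println(p._1() + " -> " + p._2()));
        }
    }
}

If the two sides do not line up, Spark fails the zip at runtime, which is presumably why the ViraPipe examples below build both split lists from paired FASTQ files with the same splitlen.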
Example 1
Source File: InterleaveMulti.java From ViraPipe with MIT License
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);

    zips.foreach(splits -> {
        Path path = splits._1.getPath();
        FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
        FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);

        writeInterleavedSplits(fqreader, fqreader2, new Configuration(),
                splitDir + "/" + path.getParent().getName() + "_" + splits._1.getStart() + ".fq");
    });
}
Example 2
Source File: Decompress.java From ViraPipe with MIT License
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);

    zips.foreach(splits -> {
        Path path = splits._1.getPath();
        FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
        FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);

        writeInterleavedSplits(fqreader, fqreader2, new Configuration(),
                splitDir + "/" + path.getParent().getName() + "_" + splits._1.getStart() + ".fq");
    });
}
Example 3
Source File: DecompressInterleave.java From ViraPipe with MIT License
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    String[] ns = fst.getPath().getName().split("\\."); //TODO: Handle also compressed files
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);

    zips.foreach(splits -> {
        Path path = splits._1.getPath();
        FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
        FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);

        writeInterleavedSplits(fqreader, fqreader2, new Configuration(), splitDir,
                path.getParent().getName() + "_" + splits._1.getStart() + ".fq");
    });
}
Example 4
Source File: Interleave.java From ViraPipe with MIT License
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);

    zips.foreach(splits -> {
        Path path = splits._1.getPath();
        FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
        FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);

        writeInterleavedSplits(fqreader, fqreader2, new Configuration(),
                splitDir + "/" + path.getParent().getName() + "_" + splits._1.getStart() + ".fq");
    });
}
Example 5
Source File: TextPipelineTest.java From deeplearning4j with Apache License 2.0
/**
 * This test checked generations retrieved using stopWords
 *
 * @throws Exception
 */
@Test
@Ignore //AB 2020/04/19 https://github.com/eclipse/deeplearning4j/issues/8849
public void testZipFunction1() throws Exception {
    JavaSparkContext sc = getContext();
    JavaRDD<String> corpusRDD = getCorpusRDD(sc);
    // word2vec.setRemoveStop(false);
    Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vec.getTokenizerVarMap());

    TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
    pipeline.buildVocabCache();
    pipeline.buildVocabWordListRDD();

    JavaRDD<AtomicLong> sentenceCountRDD = pipeline.getSentenceCountRDD();
    JavaRDD<List<VocabWord>> vocabWordListRDD = pipeline.getVocabWordListRDD();

    CountCumSum countCumSum = new CountCumSum(sentenceCountRDD);
    JavaRDD<Long> sentenceCountCumSumRDD = countCumSum.buildCumSum();

    JavaPairRDD<List<VocabWord>, Long> vocabWordListSentenceCumSumRDD =
            vocabWordListRDD.zip(sentenceCountCumSumRDD);
    List<Tuple2<List<VocabWord>, Long>> lst = vocabWordListSentenceCumSumRDD.collect();

    List<VocabWord> vocabWordsList1 = lst.get(0)._1();
    Long cumSumSize1 = lst.get(0)._2();
    assertEquals(3, vocabWordsList1.size());
    assertEquals(vocabWordsList1.get(0).getWord(), "strange");
    assertEquals(vocabWordsList1.get(1).getWord(), "strange");
    assertEquals(vocabWordsList1.get(2).getWord(), "world");
    assertEquals(cumSumSize1, 6L, 0);

    List<VocabWord> vocabWordsList2 = lst.get(1)._1();
    Long cumSumSize2 = lst.get(1)._2();
    assertEquals(2, vocabWordsList2.size());
    assertEquals(vocabWordsList2.get(0).getWord(), "flowers");
    assertEquals(vocabWordsList2.get(1).getWord(), "red");
    assertEquals(cumSumSize2, 9L, 0);

    sc.stop();
}
Example 6
Source File: TextPipelineTest.java From deeplearning4j with Apache License 2.0
@Test
public void testSyn0AfterFirstIteration() throws Exception {
    JavaSparkContext sc = getContext();
    JavaRDD<String> corpusRDD = getCorpusRDD(sc);
    // word2vec.setRemoveStop(false);
    Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vec.getTokenizerVarMap());

    TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
    pipeline.buildVocabCache();
    pipeline.buildVocabWordListRDD();

    VocabCache<VocabWord> vocabCache = pipeline.getVocabCache();

    Huffman huffman = new Huffman(vocabCache.vocabWords());
    huffman.build();

    // Get total word count and put into word2vec variable map
    Map<String, Object> word2vecVarMap = word2vec.getWord2vecVarMap();
    word2vecVarMap.put("totalWordCount", pipeline.getTotalWordCount());
    double[] expTable = word2vec.getExpTable();

    JavaRDD<AtomicLong> sentenceCountRDD = pipeline.getSentenceCountRDD();
    JavaRDD<List<VocabWord>> vocabWordListRDD = pipeline.getVocabWordListRDD();

    CountCumSum countCumSum = new CountCumSum(sentenceCountRDD);
    JavaRDD<Long> sentenceCountCumSumRDD = countCumSum.buildCumSum();

    JavaPairRDD<List<VocabWord>, Long> vocabWordListSentenceCumSumRDD =
            vocabWordListRDD.zip(sentenceCountCumSumRDD);

    Broadcast<Map<String, Object>> word2vecVarMapBroadcast = sc.broadcast(word2vecVarMap);
    Broadcast<double[]> expTableBroadcast = sc.broadcast(expTable);

    FirstIterationFunction firstIterationFunction =
            new FirstIterationFunction(word2vecVarMapBroadcast, expTableBroadcast, pipeline.getBroadCastVocabCache());
    JavaRDD<Pair<VocabWord, INDArray>> pointSyn0Vec =
            vocabWordListSentenceCumSumRDD.mapPartitions(firstIterationFunction).map(new MapToPairFunction());
}
Example 7
Source File: HDFSWriter.java From ViraPipe with MIT License
private static JavaPairRDD<Text, SequencedFragment> interleaveReads(String fastq, String fastq2, int splitlen, JavaSparkContext sc) throws IOException {
    FileSystem fs = FileSystem.get(new Configuration());

    FileStatus fst = fs.getFileStatus(new Path(fastq));
    FileStatus fst2 = fs.getFileStatus(new Path(fastq2));

    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);

    return zips.flatMapToPair(splits -> {
        FastqInputFormat.FastqRecordReader fqreader = new FastqInputFormat.FastqRecordReader(new Configuration(), splits._1);
        FastqInputFormat.FastqRecordReader fqreader2 = new FastqInputFormat.FastqRecordReader(new Configuration(), splits._2);

        ArrayList<Tuple2<Text, SequencedFragment>> reads = new ArrayList<Tuple2<Text, SequencedFragment>>();

        while (fqreader.nextKeyValue()) {
            String key = fqreader.getCurrentKey().toString();
            String[] keysplit = key.split(" ");
            key = keysplit[0];

            SequencedFragment sf = new SequencedFragment();
            sf.setQuality(new Text(fqreader.getCurrentValue().getQuality().toString()));
            sf.setSequence(new Text(fqreader.getCurrentValue().getSequence().toString()));

            if (fqreader2.nextKeyValue()) {
                String key2 = fqreader2.getCurrentKey().toString();
                String[] keysplit2 = key2.split(" ");
                key2 = keysplit2[0];
                //key2 = key2.replace(" 2:N:0:1","/2");

                SequencedFragment sf2 = new SequencedFragment();
                sf2.setQuality(new Text(fqreader2.getCurrentValue().getQuality().toString()));
                sf2.setSequence(new Text(fqreader2.getCurrentValue().getSequence().toString()));

                reads.add(new Tuple2<Text, SequencedFragment>(new Text(key), sf));
                reads.add(new Tuple2<Text, SequencedFragment>(new Text(key2), sf2));
            }
        }

        return reads.iterator();
    });
}
Example 8
Source File: TextPipelineTest.java From deeplearning4j with Apache License 2.0
@Test
@Ignore //AB 2020/04/19 https://github.com/eclipse/deeplearning4j/issues/8849
public void testZipFunction2() throws Exception {
    JavaSparkContext sc = getContext();
    JavaRDD<String> corpusRDD = getCorpusRDD(sc);
    // word2vec.setRemoveStop(false);
    Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vecNoStop.getTokenizerVarMap());

    TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
    pipeline.buildVocabCache();
    pipeline.buildVocabWordListRDD();

    JavaRDD<AtomicLong> sentenceCountRDD = pipeline.getSentenceCountRDD();
    JavaRDD<List<VocabWord>> vocabWordListRDD = pipeline.getVocabWordListRDD();

    CountCumSum countCumSum = new CountCumSum(sentenceCountRDD);
    JavaRDD<Long> sentenceCountCumSumRDD = countCumSum.buildCumSum();

    JavaPairRDD<List<VocabWord>, Long> vocabWordListSentenceCumSumRDD =
            vocabWordListRDD.zip(sentenceCountCumSumRDD);
    List<Tuple2<List<VocabWord>, Long>> lst = vocabWordListSentenceCumSumRDD.collect();

    List<VocabWord> vocabWordsList1 = lst.get(0)._1();
    Long cumSumSize1 = lst.get(0)._2();
    assertEquals(6, vocabWordsList1.size());
    assertEquals(vocabWordsList1.get(0).getWord(), "this");
    assertEquals(vocabWordsList1.get(1).getWord(), "is");
    assertEquals(vocabWordsList1.get(2).getWord(), "a");
    assertEquals(vocabWordsList1.get(3).getWord(), "strange");
    assertEquals(vocabWordsList1.get(4).getWord(), "strange");
    assertEquals(vocabWordsList1.get(5).getWord(), "world");
    assertEquals(cumSumSize1, 6L, 0);

    List<VocabWord> vocabWordsList2 = lst.get(1)._1();
    Long cumSumSize2 = lst.get(1)._2();
    assertEquals(vocabWordsList2.size(), 3);
    assertEquals(vocabWordsList2.get(0).getWord(), "flowers");
    assertEquals(vocabWordsList2.get(1).getWord(), "are");
    assertEquals(vocabWordsList2.get(2).getWord(), "red");
    assertEquals(cumSumSize2, 9L, 0);

    sc.stop();
}
Example 9
Source File: TextPipelineTest.java From deeplearning4j with Apache License 2.0
@Test
public void testFirstIteration() throws Exception {
    JavaSparkContext sc = getContext();
    JavaRDD<String> corpusRDD = getCorpusRDD(sc);
    // word2vec.setRemoveStop(false);
    Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vec.getTokenizerVarMap());

    TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
    pipeline.buildVocabCache();
    pipeline.buildVocabWordListRDD();

    VocabCache<VocabWord> vocabCache = pipeline.getVocabCache();

    /*
    Huffman huffman = new Huffman(vocabCache.vocabWords());
    huffman.build();
    huffman.applyIndexes(vocabCache);
    */

    VocabWord token = vocabCache.tokenFor("strange");
    VocabWord word = vocabCache.wordFor("strange");
    log.info("Strange token: " + token);
    log.info("Strange word: " + word);

    // Get total word count and put into word2vec variable map
    Map<String, Object> word2vecVarMap = word2vec.getWord2vecVarMap();
    word2vecVarMap.put("totalWordCount", pipeline.getTotalWordCount());
    double[] expTable = word2vec.getExpTable();

    JavaRDD<AtomicLong> sentenceCountRDD = pipeline.getSentenceCountRDD();
    JavaRDD<List<VocabWord>> vocabWordListRDD = pipeline.getVocabWordListRDD();

    CountCumSum countCumSum = new CountCumSum(sentenceCountRDD);
    JavaRDD<Long> sentenceCountCumSumRDD = countCumSum.buildCumSum();

    JavaPairRDD<List<VocabWord>, Long> vocabWordListSentenceCumSumRDD =
            vocabWordListRDD.zip(sentenceCountCumSumRDD);

    Broadcast<Map<String, Object>> word2vecVarMapBroadcast = sc.broadcast(word2vecVarMap);
    Broadcast<double[]> expTableBroadcast = sc.broadcast(expTable);

    Iterator<Tuple2<List<VocabWord>, Long>> iterator = vocabWordListSentenceCumSumRDD.collect().iterator();

    FirstIterationFunction firstIterationFunction = new FirstIterationFunction(
            word2vecVarMapBroadcast, expTableBroadcast, pipeline.getBroadCastVocabCache());

    Iterator<Map.Entry<VocabWord, INDArray>> ret = firstIterationFunction.call(iterator);
    assertTrue(ret.hasNext());
}