Java Code Examples for org.apache.spark.api.java.JavaRDD#filter()
The following examples show how to use org.apache.spark.api.java.JavaRDD#filter().
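Before the project examples, here is a minimal, self-contained sketch of the two common ways to pass a predicate to JavaRDD#filter(): the pre-Java-8 anonymous org.apache.spark.api.java.function.Function form and the Java 8 lambda form. The application name, local master, and sample data are illustrative assumptions only and are not taken from any of the projects below.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

public class FilterSketch {
    public static void main(String[] args) {
        // Local context and sample data are assumptions for this sketch.
        SparkConf conf = new SparkConf().setAppName("filter-sketch").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<String> lines = sc.parallelize(Arrays.asList("spark", "hadoop", "spark sql"));

            // Pre-Java-8 style: anonymous org.apache.spark.api.java.function.Function
            JavaRDD<String> withSpark = lines.filter(new Function<String, Boolean>() {
                @Override
                public Boolean call(String line) throws Exception {
                    return line.contains("spark");
                }
            });

            // Java 8 style: the same predicate as a lambda
            JavaRDD<String> withSparkLambda = lines.filter(line -> line.contains("spark"));

            List<String> result = withSpark.collect();
            System.out.println(result);              // [spark, spark sql]
            System.out.println(withSparkLambda.count()); // 2
        }
    }
}

Both calls are lazy: the predicate only runs when an action such as collect() or count() is invoked.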
Example 1
Source File: AbstractJavaEsSparkTest.java From elasticsearch-hadoop with Apache License 2.0 | 7 votes |
public void testEsRDDZReadJson() throws Exception {
    String target = "spark-test-java-basic-json-read/data";

    RestUtils.touch("spark-test-java-basic-json-read");
    RestUtils.postData(target, "{\"message\" : \"Hello World\",\"message_date\" : \"2014-05-25\"}".getBytes());
    RestUtils.postData(target, "{\"message\" : \"Goodbye World\",\"message_date\" : \"2014-05-25\"}".getBytes());
    RestUtils.refresh("spark-test*");

    JavaRDD<String> esRDD = JavaEsSpark.esJsonRDD(sc, target).values();
    System.out.println(esRDD.collect());

    JavaRDD<String> messages = esRDD.filter(new Function<String, Boolean>() {
        @Override
        public Boolean call(String string) throws Exception {
            return string.contains("message");
        }
    });

    // jdk8
    //esRDD.filter(m -> m.contains("message")));

    assertThat((int) messages.count(), is(2));

    System.out.println(messages.take(10));
    System.out.println(messages);
}
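The // jdk8 comment in the snippet above hints at the lambda form. As a sketch, assuming the same esRDD and the same Hamcrest assertions are in scope, the anonymous Function can be replaced with:

// Sketch only: lambda equivalent of the anonymous Function in Example 1.
JavaRDD<String> messages = esRDD.filter(m -> m.contains("message"));
assertThat((int) messages.count(), is(2));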
Example 2
Source File: MarkDuplicatesSparkUnitTest.java From gatk with BSD 3-Clause "New" or "Revised" License | 6 votes |
@Test(dataProvider = "md", groups = "spark") public void markDupesTest(final String input, final long totalExpected, final long dupsExpected) { final GATKPath inputPathSpec = new GATKPath(input); JavaSparkContext ctx = SparkContextFactory.getTestSparkContext(); ReadsSparkSource readSource = new ReadsSparkSource(ctx); JavaRDD<GATKRead> reads = readSource.getParallelReads(inputPathSpec, null); Assert.assertEquals(reads.count(), totalExpected); SAMFileHeader header = readSource.getHeader(inputPathSpec, null); OpticalDuplicatesArgumentCollection opticalDuplicatesArgumentCollection = new OpticalDuplicatesArgumentCollection(); final OpticalDuplicateFinder finder = opticalDuplicatesArgumentCollection.READ_NAME_REGEX != null ? new OpticalDuplicateFinder(opticalDuplicatesArgumentCollection.READ_NAME_REGEX, opticalDuplicatesArgumentCollection.OPTICAL_DUPLICATE_PIXEL_DISTANCE, null) : null; JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.mark(reads, header, MarkDuplicatesScoringStrategy.SUM_OF_BASE_QUALITIES, finder, 1, false, MarkDuplicates.DuplicateTaggingPolicy.DontTag); Assert.assertEquals(markedReads.count(), totalExpected); JavaRDD<GATKRead> dupes = markedReads.filter(GATKRead::isDuplicate); Assert.assertEquals(dupes.count(), dupsExpected); }
Example 3
Source File: Algorithm.java From predictionio-template-java-ecom-recommender with Apache License 2.0 | 6 votes |
private JavaRDD<ItemScore> validScores(JavaRDD<ItemScore> all, final Set<String> whitelist, final Set<String> blacklist,
        final Set<String> categories, final Map<String, Item> items, String userEntityId) {
    final Set<String> seenItemEntityIds = seenItemEntityIds(userEntityId);
    final Set<String> unavailableItemEntityIds = unavailableItemEntityIds();

    return all.filter(new Function<ItemScore, Boolean>() {
        @Override
        public Boolean call(ItemScore itemScore) throws Exception {
            Item item = items.get(itemScore.getItemEntityId());

            return (item != null
                    && passWhitelistCriteria(whitelist, item.getEntityId())
                    && passBlacklistCriteria(blacklist, item.getEntityId())
                    && passCategoryCriteria(categories, item)
                    && passUnseenCriteria(seenItemEntityIds, item.getEntityId())
                    && passAvailabilityCriteria(unavailableItemEntityIds, item.getEntityId()));
        }
    });
}
Example 4
Source File: DeleteHelper.java From hudi with Apache License 2.0 | 5 votes |
public static <T extends HoodieRecordPayload<T>> HoodieWriteMetadata execute(String instantTime, JavaRDD<HoodieKey> keys,
        JavaSparkContext jsc, HoodieWriteConfig config, HoodieTable<T> table, CommitActionExecutor<T> deleteExecutor) {
    try {
        HoodieWriteMetadata result = null;
        // De-dupe/merge if needed
        JavaRDD<HoodieKey> dedupedKeys = config.shouldCombineBeforeDelete() ? deduplicateKeys(keys, table) : keys;

        JavaRDD<HoodieRecord<T>> dedupedRecords = dedupedKeys.map(key -> new HoodieRecord(key, new EmptyHoodieRecordPayload()));
        Instant beginTag = Instant.now();
        // perform index lookup to get the existing location of records
        JavaRDD<HoodieRecord<T>> taggedRecords = ((HoodieTable<T>) table).getIndex().tagLocation(dedupedRecords, jsc, (HoodieTable<T>) table);
        Duration tagLocationDuration = Duration.between(beginTag, Instant.now());

        // filter out non-existent keys/records
        JavaRDD<HoodieRecord<T>> taggedValidRecords = taggedRecords.filter(HoodieRecord::isCurrentLocationKnown);
        if (!taggedValidRecords.isEmpty()) {
            result = deleteExecutor.execute(taggedValidRecords);
            result.setIndexLookupDuration(tagLocationDuration);
        } else {
            // the entire set of keys is non-existent
            deleteExecutor.saveWorkloadProfileMetadataToInflight(new WorkloadProfile(jsc.emptyRDD()), instantTime);
            result = new HoodieWriteMetadata();
            result.setWriteStatuses(jsc.emptyRDD());
            deleteExecutor.commitOnAutoCommit(result);
        }
        return result;
    } catch (Throwable e) {
        if (e instanceof HoodieUpsertException) {
            throw (HoodieUpsertException) e;
        }
        throw new HoodieUpsertException("Failed to delete for commit time " + instantTime, e);
    }
}
Example 5
Source File: CollectBaseDistributionByCycleSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 5 votes |
/**
 * Computes the base distribution by cycle. Creates a metrics file with relevant histograms.
 */
public MetricsFile<BaseDistributionByCycleMetrics, Integer> calculateBaseDistributionByCycle(final JavaRDD<GATKRead> reads) {
    final MetricsReadFilter metricsFilter = new MetricsReadFilter(this.pfReadsOnly, this.alignedReadsOnly);
    final JavaRDD<GATKRead> filteredReads = reads.filter(read -> metricsFilter.test(read));
    final HistogramGenerator hist = filteredReads.aggregate(new HistogramGenerator(),
            (hgp, read) -> hgp.addRead(read),
            (hgp1, hgp2) -> hgp1.merge(hgp2));

    final MetricsFile<BaseDistributionByCycleMetrics, Integer> metricsFile = getMetricsFile();
    hist.addToMetricsFile(metricsFile);
    return metricsFile;
}
Example 6
Source File: Grep.java From flink-perf with Apache License 2.0 | 5 votes |
public static void main(String[] args) {
    String master = args[0];
    String inFile = args[1];
    String outFile = args[2];

    String patterns[] = new String[args.length - 3];
    System.arraycopy(args, 3, patterns, 0, args.length - 3);
    System.err.println("Starting spark with master=" + master + " in=" + inFile);
    System.err.println("Using patterns: " + Arrays.toString(patterns));

    SparkConf conf = new SparkConf().setAppName("Grep job").setMaster(master).set("spark.hadoop.validateOutputSpecs", "false");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> file = sc.textFile(inFile);

    for (int p = 0; p < patterns.length; p++) {
        final String pattern = patterns[p];
        JavaRDD<String> res = file.filter(new Function<String, Boolean>() {
            private static final long serialVersionUID = 1L;

            Pattern p = Pattern.compile(pattern);

            @Override
            public Boolean call(String value) throws Exception {
                if (value == null || value.length() == 0) {
                    return false;
                }
                final Matcher m = p.matcher(value);
                if (m.find()) {
                    return true;
                }
                return false;
            }
        });
        res.saveAsTextFile(outFile + "_" + pattern);
    }
}
Example 7
Source File: ComputeResponse.java From incubator-retired-pirk with Apache License 2.0 | 5 votes |
/**
 * Method to read in the data from elasticsearch, filter, and return an RDD of MapWritable data elements
 */
@SuppressWarnings("unchecked")
public JavaRDD<MapWritable> readDataES() throws IOException, PIRException {
    logger.info("Reading data ");

    JavaRDD<MapWritable> jsonRDD;

    Job job = Job.getInstance();
    String jobName = "pirSpark_ES_" + esQuery + "_" + System.currentTimeMillis();
    job.setJobName(jobName);
    job.getConfiguration().set("es.nodes", SystemConfiguration.getProperty("es.nodes"));
    job.getConfiguration().set("es.port", SystemConfiguration.getProperty("es.port"));
    job.getConfiguration().set("es.resource", esResource);
    job.getConfiguration().set("es.query", esQuery);

    jsonRDD = sc.newAPIHadoopRDD(job.getConfiguration(), EsInputFormat.class, Text.class, MapWritable.class).values().coalesce(numDataPartitions);

    // Filter out by the provided stopListFile entries
    if (qSchema.getFilter() != null) {
        return jsonRDD.filter(new FilterData(accum, bVars));
    } else {
        logger.info("qSchema.getFilter() is null");
        return jsonRDD;
    }
}
Example 8
Source File: QuadUtils.java From rdf2x with Apache License 2.0 | 5 votes |
/**
 * Get quads with specified subjects filtered out, computed by querying an in-memory set of subjects
 *
 * @param quads            RDD of quads to filter
 * @param subjectBlacklist set of requested subject URIs to be filtered out
 * @return filtered RDD with only those quads whose subject is NOT in subjectBlacklist
 */
public static JavaRDD<Quad> filterQuadsByForbiddenSubjects(JavaRDD<Quad> quads, Set<String> subjectBlacklist) {
    if (subjectBlacklist.isEmpty()) {
        return quads;
    }
    return quads.filter(quad ->
            !quad.getSubject().isURI() ||
            !subjectBlacklist.contains(quad.getSubject().getURI())
    );
}
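A common variant, not part of the rdf2x source, is to ship a large blacklist to executors as a Spark broadcast variable instead of capturing the set directly in the lambda's closure. A minimal sketch, assuming a JavaSparkContext named jsc is in scope and org.apache.spark.broadcast.Broadcast is imported:

// Sketch: broadcast a large blacklist once per executor rather than serializing it into each task closure.
Broadcast<Set<String>> blacklistBroadcast = jsc.broadcast(subjectBlacklist);
JavaRDD<Quad> filtered = quads.filter(quad ->
        !quad.getSubject().isURI() || !blacklistBroadcast.value().contains(quad.getSubject().getURI()));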
Example 9
Source File: RddChannel.java From rheem with Apache License 2.0 | 5 votes |
public void accept(JavaRDD<?> rdd, SparkExecutor sparkExecutor) throws RheemException {
    if (this.isMarkedForInstrumentation() && !this.isRddCached()) {
        final Accumulator<Integer> accumulator = sparkExecutor.sc.accumulator(0);
        this.rdd = rdd.filter(dataQuantum -> {
            accumulator.add(1);
            return true;
        });
        this.accumulator = accumulator;
    } else {
        this.rdd = rdd;
    }
}
Example 10
Source File: AnalyzeSpark.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * Randomly sample a set of invalid values from a specified column.
 * Values are considered invalid according to the Schema / ColumnMetaData
 *
 * @param numToSample   Maximum number of invalid values to sample
 * @param columnName    Name of the column from which to sample invalid values
 * @param schema        Data schema
 * @param data          Data
 * @param ignoreMissing If true: ignore missing values (NullWritable or empty/null string) when sampling. If false: include missing values in sampling
 * @return List of invalid examples
 */
public static List<Writable> sampleInvalidFromColumn(int numToSample, String columnName, Schema schema, JavaRDD<List<Writable>> data, boolean ignoreMissing) {
    //First: filter out all valid entries, to leave only invalid entries
    int colIdx = schema.getIndexOfColumn(columnName);

    JavaRDD<Writable> ithColumn = data.map(new SelectColumnFunction(colIdx));

    ColumnMetaData meta = schema.getMetaData(columnName);

    JavaRDD<Writable> invalid = ithColumn.filter(new FilterWritablesBySchemaFunction(meta, false, ignoreMissing));

    return invalid.takeSample(false, numToSample);
}
Example 11
Source File: TransformationRDDTest.java From hui-bigdata-spark with Apache License 2.0 | 5 votes |
/**
 * Union of two sets.
 * Purpose of this demo: find all records whose entry station is Guangzhou South Railway Station (广州南站) or Tianhe Bus Terminal (天河客运站).
 *
 * @since hui_project 1.0.0
 */
@Test
public void testUnionAndFilter() {
    JavaRDD<String> textRDD = sparkContext.textFile(FILE_PATH);
    JavaRDD<String> result = textRDD.filter(x -> x.contains("广州南站"));
    JavaRDD<String> result1 = textRDD.filter(x -> x.contains("天河客运站"));
    JavaRDD<String> union = result.union(result1);
    System.out.println("-------" + union.count() + "-------");
    checkResult(union.collect());
}
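Since this page is about filter(), note that a single filter with an OR predicate comes close to the filter-plus-union above (a sketch, not from the original test). The difference: union() keeps a line twice if it mentions both stations, while a single filter keeps it once.

// Sketch: near-equivalent of the two filters plus union, minus the double-counting of lines matching both stations.
JavaRDD<String> both = textRDD.filter(x -> x.contains("广州南站") || x.contains("天河客运站"));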
Example 12
Source File: SparkGenomeReadCounts.java From gatk-protected with BSD 3-Clause "New" or "Revised" License | 4 votes |
private void collectReads() {
    if (readArguments.getReadFilesNames().size() != 1) {
        throw new UserException("This tool only accepts a single bam/sam/cram as input");
    }

    final SampleCollection sampleCollection = new SampleCollection(getHeaderForReads());
    if (sampleCollection.sampleCount() > 1) {
        throw new UserException.BadInput("We do not support bams with more than one sample.");
    }
    final String sampleName = sampleCollection.sampleIds().get(0);
    final String[] commentsForRawCoverage = {"##fileFormat = tsv",
            "##commandLine = " + getCommandLine(),
            String.format("##title = Coverage counts in %d base bins for WGS", binsize)};
    final ReadFilter filter = makeGenomeReadFilter();
    final SAMSequenceDictionary sequenceDictionary = getReferenceSequenceDictionary();

    logger.info("Starting Spark coverage collection...");
    final long coverageCollectionStartTime = System.currentTimeMillis();
    final JavaRDD<GATKRead> rawReads = getReads();
    final JavaRDD<GATKRead> reads = rawReads.filter(read -> filter.test(read));

    //Note: using a field inside a closure will pull in the whole enclosing object to serialization
    // (which leads to bad performance and can blow up if some objects in the fields are not
    // Serializable - closures always use java Serializable and not Kryo)
    //Solution here is to use a temp variable for binsize because it's just an int.
    final int binsize_tmp = binsize;
    final JavaRDD<SimpleInterval> readIntervals = reads
            .filter(read -> sequenceDictionary.getSequence(read.getContig()) != null)
            .map(read -> SparkGenomeReadCounts.createKey(read, sequenceDictionary, binsize_tmp));
    final Map<SimpleInterval, Long> byKey = readIntervals.countByValue();
    final Set<SimpleInterval> readIntervalKeySet = byKey.keySet();
    final long totalReads = byKey.values().stream().mapToLong(v -> v).sum();
    final long coverageCollectionEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished the spark coverage collection with %d targets and %d reads. Elapse of %d seconds",
            readIntervalKeySet.size(), totalReads, (coverageCollectionEndTime - coverageCollectionStartTime) / 1000));

    final String[] commentsForProportionalCoverage = {commentsForRawCoverage[0], commentsForRawCoverage[1],
            String.format("##title = Proportional coverage counts in %d base bins for WGS (total reads: %d)", binsize, totalReads)};

    logger.info("Creating full genome bins...");
    final long createGenomeBinsStartTime = System.currentTimeMillis();
    final List<SimpleInterval> fullGenomeBins = createFullGenomeBins(binsize);
    List<Target> fullGenomeTargetCollection = createTargetListFromSimpleInterval(fullGenomeBins);
    TargetWriter.writeTargetsToFile(new File(outputFile.getAbsolutePath() + ".targets.tsv"), fullGenomeTargetCollection);
    final long createGenomeBinsEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished creating genome bins. Elapse of %d seconds",
            (createGenomeBinsEndTime - createGenomeBinsStartTime) / 1000));

    logger.info("Creating missing genome bins...");
    final long createMissingGenomeBinsStartTime = System.currentTimeMillis();
    logger.info("Creating missing genome bins: Creating a mutable mapping...");
    final Map<SimpleInterval, Long> byKeyMutable = new HashMap<>();
    byKeyMutable.putAll(byKey);
    logger.info("Creating missing genome bins: Populating mutable mapping with zero counts for empty regions...");
    fullGenomeBins.stream().forEach(b -> byKeyMutable.putIfAbsent(b, 0l));
    final long createMissingGenomeBinsEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished creating missing genome bins. Elapse of %d seconds",
            (createMissingGenomeBinsEndTime - createMissingGenomeBinsStartTime) / 1000));

    logger.info("Creating final map...");
    final long createFinalMapStartTime = System.currentTimeMillis();
    final SortedMap<SimpleInterval, Long> byKeySorted = new TreeMap<>(IntervalUtils.LEXICOGRAPHICAL_ORDER_COMPARATOR);
    byKeySorted.putAll(byKeyMutable);
    final long createFinalMapEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished creating final map. Elapse of %d seconds",
            (createFinalMapEndTime - createFinalMapStartTime) / 1000));

    logger.info("Creating proportional coverage... ");
    final long pCovFileStartTime = System.currentTimeMillis();
    final SortedMap<SimpleInterval, Double> byKeyProportionalSorted = new TreeMap<>(IntervalUtils.LEXICOGRAPHICAL_ORDER_COMPARATOR);
    byKeySorted.entrySet().stream().forEach(e -> byKeyProportionalSorted.put(e.getKey(), (double) e.getValue() / totalReads));
    final long pCovFileEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished creating proportional coverage map. Elapse of %d seconds",
            (pCovFileEndTime - pCovFileStartTime) / 1000));

    logger.info("Writing raw coverage file ...");
    final long writingCovFileStartTime = System.currentTimeMillis();
    ReadCountCollectionUtils.writeReadCountsFromSimpleInterval(new File(outputFile.getAbsolutePath() + RAW_COV_OUTPUT_EXTENSION), sampleName, byKeySorted, commentsForRawCoverage);
    final long writingCovFileEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished writing coverage file. Elapse of %d seconds",
            (writingCovFileEndTime - writingCovFileStartTime) / 1000));

    logger.info("Writing proportional coverage file ...");
    final long writingPCovFileStartTime = System.currentTimeMillis();
    ReadCountCollectionUtils.writeReadCountsFromSimpleInterval(outputFile, sampleName, byKeyProportionalSorted, commentsForProportionalCoverage);
    final long writingPCovFileEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished writing proportional coverage file. Elapse of %d seconds",
            (writingPCovFileEndTime - writingPCovFileStartTime) / 1000));
}
Example 13
Source File: MarkDuplicatesSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 4 votes |
@Override
protected void runTool(final JavaSparkContext ctx) {
    final SAMFileHeader mergedHeader = getHeaderForReads();

    // If we are using multiple inputs, check that the headers are all in the correct query-grouped ordering; if so, set the aggregate header to reflect this
    if (readArguments.getReadPathSpecifiers().size() > 1) {
        final Optional<GATKPath> badlySorted = readArguments.getReadPathSpecifiers().stream()
                .filter(spec -> !treatAsReadGroupOrdered(getHeaderForReadsInput(spec), treatUnsortedAsOrdered))
                .findFirst();
        if (badlySorted.isPresent()) {
            if (allowMultipleSortOrders) {
                //don't set an ordering, the files will all be sorted downstream
                logger.info("Input files are not all grouped by read name so they will be sorted together.");
            } else {
                throw new UserException(
                        "Multiple inputs to MarkDuplicatesSpark detected. MarkDuplicatesSpark requires all inputs to be queryname sorted " +
                                "or querygroup-sorted for multi-input processing but input " + badlySorted.get() +
                                " was sorted in " + getHeaderForReadsInput(badlySorted.get()) + " order");
            }
        } else {
            // The default sort order for merged input files is unsorted, so this will be fed to the tool to be sorted
            if (!allowMultipleSortOrders) {
                mergedHeader.setGroupOrder(SAMFileHeader.GroupOrder.query);
            }
        }

    // If there is only one file and we are in treatUnsortedAsOrdered mode then set its group order accordingly.
    } else {
        if (treatUnsortedAsOrdered && (mergedHeader.getSortOrder().equals(SAMFileHeader.SortOrder.unknown) || mergedHeader.getSortOrder().equals(SAMFileHeader.SortOrder.unsorted))) {
            logger.warn("Input bam was marked as " + mergedHeader.getSortOrder().toString() + " but " + TREAT_UNSORTED_AS_ORDERED + " is specified so it's being treated as read name grouped");
            mergedHeader.setGroupOrder(SAMFileHeader.GroupOrder.query);
        }
    }

    JavaRDD<GATKRead> reads = getReads();
    final OpticalDuplicateFinder finder = opticalDuplicatesArgumentCollection.READ_NAME_REGEX != null ?
            new OpticalDuplicateFinder(opticalDuplicatesArgumentCollection.READ_NAME_REGEX, opticalDuplicatesArgumentCollection.OPTICAL_DUPLICATE_PIXEL_DISTANCE, null) : null;

    // If we need to remove optical duplicates, set the engine to mark optical duplicates using the DT tag.
    if (markDuplicatesSparkArgumentCollection.removeSequencingDuplicates && markDuplicatesSparkArgumentCollection.taggingPolicy == MarkDuplicates.DuplicateTaggingPolicy.DontTag) {
        markDuplicatesSparkArgumentCollection.taggingPolicy = MarkDuplicates.DuplicateTaggingPolicy.OpticalOnly;
    }

    final JavaRDD<GATKRead> finalReadsForMetrics = mark(reads, mergedHeader, finder, markDuplicatesSparkArgumentCollection, getRecommendedNumReducers());

    if (metricsFile != null) {
        final JavaPairRDD<String, GATKDuplicationMetrics> metricsByLibrary = MarkDuplicatesSparkUtils.generateMetrics(
                mergedHeader, finalReadsForMetrics);
        final MetricsFile<GATKDuplicationMetrics, Double> resultMetrics = getMetricsFile();
        MarkDuplicatesSparkUtils.saveMetricsRDD(resultMetrics, mergedHeader, metricsByLibrary, metricsFile);
    }
    JavaRDD<GATKRead> readsForWriting = finalReadsForMetrics;
    // Filter out the duplicates if instructed to do so
    if (markDuplicatesSparkArgumentCollection.removeAllDuplicates) {
        readsForWriting = readsForWriting.filter(r -> !r.isDuplicate());
    } else if (markDuplicatesSparkArgumentCollection.removeSequencingDuplicates) {
        readsForWriting = readsForWriting.filter(r -> !MarkDuplicates.DUPLICATE_TYPE_SEQUENCING.equals(r.getAttributeAsString(MarkDuplicates.DUPLICATE_TYPE_TAG)));
    }

    mergedHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
    writeReads(ctx, output, readsForWriting, mergedHeader, true);
}
Example 14
Source File: PSUtils.java From gatk with BSD 3-Clause "New" or "Revised" License | 4 votes |
public static JavaRDD<GATKRead> primaryReads(final JavaRDD<GATKRead> reads) {
    return reads.filter(read -> !(read.isSecondaryAlignment() || read.isSupplementaryAlignment()));
}
Example 15
Source File: ReadsPipelineSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 4 votes |
@Override
protected void runTool(final JavaSparkContext ctx) {
    String referenceFileName = addReferenceFilesForSpark(ctx, referenceArguments.getReferencePath());
    List<String> localKnownSitesFilePaths = addVCFsForSpark(ctx, knownVariants);

    final JavaRDD<GATKRead> alignedReads;
    final SAMFileHeader header;
    final BwaSparkEngine bwaEngine;
    if (align) {
        bwaEngine = new BwaSparkEngine(ctx, referenceArguments.getReferenceFileName(), bwaArgs.indexImageFile, getHeaderForReads(), getReferenceSequenceDictionary());
        if (bwaArgs.singleEndAlignment) {
            alignedReads = bwaEngine.alignUnpaired(getReads());
        } else {
            // filter reads after alignment in the case of paired reads since filtering does not know about pairs
            final ReadFilter filter = makeReadFilter(bwaEngine.getHeader());
            alignedReads = bwaEngine.alignPaired(getUnfilteredReads()).filter(filter::test);
        }
        header = bwaEngine.getHeader();
    } else {
        bwaEngine = null;
        alignedReads = getReads();
        header = getHeaderForReads();
    }

    final JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.mark(alignedReads, header, new OpticalDuplicateFinder(), markDuplicatesSparkArgumentCollection, getRecommendedNumReducers());

    // always coordinate-sort reads so BQSR can use queryLookaheadBases in FeatureDataSource
    final SAMFileHeader readsHeader = header.clone();
    readsHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
    final JavaRDD<GATKRead> sortedMarkedReads = SparkUtils.sortReadsAccordingToHeader(markedReads, readsHeader, numReducers);

    // The markedReads have already had the WellformedReadFilter applied to them, which
    // is all the filtering that MarkDupes and ApplyBQSR want. BQSR itself wants additional
    // filtering performed, so we do that here.
    //NOTE: this doesn't honor enabled/disabled commandline filters
    final ReadFilter bqsrReadFilter = ReadFilter.fromList(BaseRecalibrator.getBQSRSpecificReadFilterList(), header);

    JavaRDD<GATKRead> markedFilteredReadsForBQSR = sortedMarkedReads.filter(bqsrReadFilter::test);

    JavaPairRDD<GATKRead, Iterable<GATKVariant>> readsWithVariants = JoinReadsWithVariants.join(markedFilteredReadsForBQSR, localKnownSitesFilePaths);
    final RecalibrationReport bqsrReport = BaseRecalibratorSparkFn.apply(readsWithVariants, getHeaderForReads(), referenceFileName, bqsrArgs);

    final Broadcast<RecalibrationReport> reportBroadcast = ctx.broadcast(bqsrReport);
    final JavaRDD<GATKRead> finalReads = ApplyBQSRSparkFn.apply(sortedMarkedReads, reportBroadcast, getHeaderForReads(), applyBqsrArgs.toApplyBQSRArgumentCollection(bqsrArgs));

    if (outputBam != null) { // only write output of BQSR if output BAM is specified
        writeReads(ctx, outputBam, finalReads, header, true);
    }

    // Run Haplotype Caller
    final ReadFilter hcReadFilter = ReadFilter.fromList(HaplotypeCallerEngine.makeStandardHCReadFilters(), header);
    final JavaRDD<GATKRead> filteredReadsForHC = finalReads.filter(hcReadFilter::test);
    SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();

    final List<SimpleInterval> intervals = hasUserSuppliedIntervals() ? getIntervals() : IntervalUtils.getAllIntervalsForReference(sequenceDictionary);

    List<ShardBoundary> intervalShards = intervals.stream()
            .flatMap(interval -> Shard.divideIntervalIntoShards(interval, shardingArgs.readShardSize, shardingArgs.readShardPadding, sequenceDictionary).stream())
            .collect(Collectors.toList());

    HaplotypeCallerSpark.callVariantsWithHaplotypeCallerAndWriteOutput(ctx, filteredReadsForHC, readsHeader, sequenceDictionary, referenceArguments.getReferenceFileName(),
            intervalShards, hcArgs, shardingArgs, assemblyRegionArgs, output, makeVariantAnnotations(), logger, strict, createOutputVariantIndex);

    if (bwaEngine != null) {
        bwaEngine.close();
    }
}
Example 16
Source File: GrepCaching.java From flink-perf with Apache License 2.0 | 4 votes |
public static void main(String[] args) {
    String master = args[0];
    String inFile = args[1];
    String outFile = args[2];
    String storageLevel = args[3];

    String patterns[] = new String[args.length - 4];
    System.arraycopy(args, 4, patterns, 0, args.length - 4);
    System.err.println("Starting spark with master=" + master + " in=" + inFile);
    System.err.println("Using patterns: " + Arrays.toString(patterns));

    SparkConf conf = new SparkConf().setAppName("Grep job").setMaster(master).set("spark.hadoop.validateOutputSpecs", "false");
    JavaSparkContext sc = new JavaSparkContext(conf);

    StorageLevel sl;
    switch (storageLevel) {
        case "MEMORY_ONLY":
            sl = StorageLevel.MEMORY_ONLY();
            break;
        case "MEMORY_AND_DISK":
            sl = StorageLevel.MEMORY_AND_DISK();
            break;
        case "MEMORY_ONLY_SER":
            sl = StorageLevel.MEMORY_ONLY_SER();
            break;
        case "MEMORY_AND_DISK_SER":
            sl = StorageLevel.MEMORY_AND_DISK_SER();
            break;
        case "NONE":
            sl = StorageLevel.NONE();
            break;
        default:
            throw new RuntimeException("Unknown storage level " + storageLevel);
    }

    JavaRDD<String> file = sc.textFile(inFile).persist(sl);
    for (int p = 0; p < patterns.length; p++) {
        final String pattern = patterns[p];
        JavaRDD<String> res = file.filter(new Function<String, Boolean>() {
            private static final long serialVersionUID = 1L;

            Pattern p = Pattern.compile(pattern);

            @Override
            public Boolean call(String value) throws Exception {
                if (value == null || value.length() == 0) {
                    return false;
                }
                final Matcher m = p.matcher(value);
                if (m.find()) {
                    return true;
                }
                return false;
            }
        });
        res.saveAsTextFile(outFile + "_" + pattern);
    }
}
Example 17
Source File: PSFilter.java From gatk with BSD 3-Clause "New" or "Revised" License | 4 votes |
@SuppressWarnings("unchecked") private static JavaRDD<GATKRead> doKmerFiltering(final JavaRDD<GATKRead> reads, final String kmerLibPath, final int countThresh) { return reads.filter(new ContainsKmerReadFilterSpark(kmerLibPath, countThresh)); }
Example 18
Source File: PSFilter.java From gatk with BSD 3-Clause "New" or "Revised" License | 4 votes |
/**
 * Main PathSeq filtering method. See PathSeqFilterSpark for an overview.
 * Returns a tuple containing the paired reads and unpaired reads as separate RDDs.
 * If metricsFile is null, read count metrics will not be collected.
 */
public Tuple2<JavaRDD<GATKRead>, JavaRDD<GATKRead>> doFilter(JavaRDD<GATKRead> reads, final PSFilterLogger filterLogger) {

    Utils.nonNull(reads, "Input reads cannot be null");
    reads = PSUtils.primaryReads(reads);
    filterLogger.logPrimaryReads(reads);

    if (filterArgs.alignedInput) {
        final Set<String> contigsToIgnoreSet = Collections.unmodifiableSet(new HashSet<>(filterArgs.alignmentContigsToIgnore));
        reads = reads.filter(new ReadFilterSparkifier(new HostAlignmentReadFilter(filterArgs.minIdentity, contigsToIgnoreSet)));
    }
    filterLogger.logReadsAfterPrealignedHostFilter(reads);

    //Clear alignment data from the reads
    reads = clearAllAlignments(reads, header);

    //Remove /1 and /2 from read names
    reads = reads.map(new ReadTransformerSparkifier(new StripMateNumberTransformer()));

    if (!filterArgs.skipFilters) {

        //Adapter trimming
        reads = reads.map(new ReadTransformerSparkifier(new AdapterTrimTransformer(filterArgs.maxAdapterMismatches, filterArgs.minAdapterLength, ADAPTER_SEQUENCES)));

        //Apply simple repeat masking
        //See "Low-complexity DNA and simple repeats" at http://www.repeatmasker.org/webrepeatmaskerhelp.html
        reads = reads.map(new ReadTransformerSparkifier(new SimpleRepeatMaskTransformer(MAX_AT_CONTENT_1, MAX_GC_CONTENT_1, REPEAT_WINDOW_SIZE_1)));
        reads = reads.map(new ReadTransformerSparkifier(new SimpleRepeatMaskTransformer(MAX_AT_CONTENT_2, MAX_GC_CONTENT_2, REPEAT_WINDOW_SIZE_2)));

        //Apply DUST masking
        reads = reads.map(new ReadTransformerSparkifier(new DUSTReadTransformer(filterArgs.dustMask, filterArgs.dustW, filterArgs.dustT)));

        //Apply base quality hard clipping
        reads = reads.map(new ReadTransformerSparkifier(new BaseQualityClipReadTransformer(filterArgs.readTrimThresh)));

        //Filter reads with less than minReadLength bases
        reads = reads.filter(new ReadFilterSparkifier(new ReadLengthReadFilter(filterArgs.minReadLength, Integer.MAX_VALUE)));

        //Change low-quality bases to 'N'
        reads = reads.map(new ReadTransformerSparkifier(new BaseQualityReadTransformer(filterArgs.qualPhredThresh)));

        //Filter reads with too many 'N's
        reads = reads.filter(new ReadFilterSparkifier(new AmbiguousBaseReadFilter(filterArgs.maxAmbiguousBases)));
    }
    filterLogger.logReadsAfterQualityFilter(reads);

    //Kmer filtering
    if (filterArgs.kmerFilePath != null) {
        reads = doKmerFiltering(reads, filterArgs.kmerFilePath, filterArgs.hostKmerThresh);
    }

    //Redistribute reads
    if (!filterArgs.skipPreBwaRepartition) {
        reads = repartitionReadsByName(reads);
    }

    //Bwa host alignment filtering
    if (filterArgs.indexImageFile != null) {
        reads = doBwaFilter(reads, filterArgs.indexImageFile, filterArgs.minSeedLength, filterArgs.bwaThreads, filterArgs.minIdentity);
    }
    filterLogger.logReadsAfterHostFilter(reads);

    //Filter duplicates
    if (filterArgs.filterDuplicates) {
        reads = setPairFlags(reads, filterArgs.filterReadsPerPartition);
        reads = filterDuplicateSequences(reads);
    }
    filterLogger.logReadsAfterDeduplication(reads);

    //Sets pairedness flags properly
    reads = setPairFlags(reads, filterArgs.filterReadsPerPartition);
    reads = clearAllAlignments(reads, header);

    //Unset paired read flags for reads that are not paired
    final PSPairedUnpairedSplitterSpark splitter = new PSPairedUnpairedSplitterSpark(reads, filterArgs.filterReadsPerPartition, false);
    final JavaRDD<GATKRead> pairedReads = splitter.getPairedReads();
    final JavaRDD<GATKRead> unpairedReads = splitter.getUnpairedReads();
    filterLogger.logFinalPairedReads(pairedReads);

    return new Tuple2<>(pairedReads, unpairedReads);
}
Example 19
Source File: BatchHeatMapProcessor.java From lambda-arch with Apache License 2.0 | 3 votes |
/**
 * Filter the measurements in a given time period
 *
 * @param measurements | The dataset of measurements
 * @param start        | Start of the time period
 * @param end          | End of the time period
 * @return A set of measurements in the given time period
 */
private JavaRDD<Measurement> filterByTime(JavaRDD<Measurement> measurements, Date start, Date end) {
    return measurements.filter(measurement ->
            (measurement.getTimestamp().equals(start) || measurement.getTimestamp().after(start))
                    && measurement.getTimestamp().before(end)
    );
}
Example 20
Source File: HoodieReadClient.java From hudi with Apache License 2.0 | 2 votes |
/**
 * Filter out HoodieRecords that already exist in the output folder. This is useful in deduplication.
 *
 * @param hoodieRecords Input RDD of Hoodie records.
 * @return A subset of hoodieRecords RDD, with existing records filtered out.
 */
public JavaRDD<HoodieRecord<T>> filterExists(JavaRDD<HoodieRecord<T>> hoodieRecords) {
    JavaRDD<HoodieRecord<T>> recordsWithLocation = tagLocation(hoodieRecords);
    return recordsWithLocation.filter(v1 -> !v1.isCurrentLocationKnown());
}