Java Code Examples for org.apache.spark.api.java.JavaRDD#filter()
The following examples show how to use org.apache.spark.api.java.JavaRDD#filter().
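Before the project examples, here is a minimal, self-contained sketch of the two common ways to pass a predicate to JavaRDD#filter(): the pre-Java-8 anonymous org.apache.spark.api.java.function.Function form and the Java 8 lambda form. The application name, local master, and sample data are illustrative assumptions only and are not taken from any of the projects below.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

public class FilterSketch {
    public static void main(String[] args) {
        // Local context and sample data are assumptions for this sketch.
        SparkConf conf = new SparkConf().setAppName("filter-sketch").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<String> lines = sc.parallelize(Arrays.asList("spark", "hadoop", "spark sql"));

            // Pre-Java-8 style: anonymous org.apache.spark.api.java.function.Function
            JavaRDD<String> withSpark = lines.filter(new Function<String, Boolean>() {
                @Override
                public Boolean call(String line) throws Exception {
                    return line.contains("spark");
                }
            });

            // Java 8 style: the same predicate as a lambda
            JavaRDD<String> withSparkLambda = lines.filter(line -> line.contains("spark"));

            List<String> result = withSpark.collect();
            System.out.println(result);              // [spark, spark sql]
            System.out.println(withSparkLambda.count()); // 2
        }
    }
}

Both calls are lazy: the predicate only runs when an action such as collect() or count() is invoked.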
Example 1
Source File: AbstractJavaEsSparkTest.java From elasticsearch-hadoop with Apache License 2.0 | 7 votes |
public void testEsRDDZReadJson() throws Exception {
    String target = "spark-test-java-basic-json-read/data";

    RestUtils.touch("spark-test-java-basic-json-read");
    RestUtils.postData(target, "{\"message\" : \"Hello World\",\"message_date\" : \"2014-05-25\"}".getBytes());
    RestUtils.postData(target, "{\"message\" : \"Goodbye World\",\"message_date\" : \"2014-05-25\"}".getBytes());
    RestUtils.refresh("spark-test*");

    JavaRDD<String> esRDD = JavaEsSpark.esJsonRDD(sc, target).values();
    System.out.println(esRDD.collect());

    JavaRDD<String> messages = esRDD.filter(new Function<String, Boolean>() {
        @Override
        public Boolean call(String string) throws Exception {
            return string.contains("message");
        }
    });

    // jdk8
    //esRDD.filter(m -> m.contains("message")));

    assertThat((int) messages.count(), is(2));

    System.out.println(messages.take(10));
    System.out.println(messages);
}
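The // jdk8 comment in the snippet above hints at the lambda form. As a sketch, assuming the same esRDD and the same Hamcrest assertions are in scope, the anonymous Function can be replaced with:

// Sketch only: lambda equivalent of the anonymous Function in Example 1.
JavaRDD<String> messages = esRDD.filter(m -> m.contains("message"));
assertThat((int) messages.count(), is(2));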
Example 2
Source File: MarkDuplicatesSparkUnitTest.java From gatk with BSD 3-Clause "New" or "Revised" License | 6 votes |
@Test(dataProvider = "md", groups = "spark") public void markDupesTest(final String input, final long totalExpected, final long dupsExpected) { final GATKPath inputPathSpec = new GATKPath(input); JavaSparkContext ctx = SparkContextFactory.getTestSparkContext(); ReadsSparkSource readSource = new ReadsSparkSource(ctx); JavaRDD<GATKRead> reads = readSource.getParallelReads(inputPathSpec, null); Assert.assertEquals(reads.count(), totalExpected); SAMFileHeader header = readSource.getHeader(inputPathSpec, null); OpticalDuplicatesArgumentCollection opticalDuplicatesArgumentCollection = new OpticalDuplicatesArgumentCollection(); final OpticalDuplicateFinder finder = opticalDuplicatesArgumentCollection.READ_NAME_REGEX != null ? new OpticalDuplicateFinder(opticalDuplicatesArgumentCollection.READ_NAME_REGEX, opticalDuplicatesArgumentCollection.OPTICAL_DUPLICATE_PIXEL_DISTANCE, null) : null; JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.mark(reads, header, MarkDuplicatesScoringStrategy.SUM_OF_BASE_QUALITIES, finder, 1, false, MarkDuplicates.DuplicateTaggingPolicy.DontTag); Assert.assertEquals(markedReads.count(), totalExpected); JavaRDD<GATKRead> dupes = markedReads.filter(GATKRead::isDuplicate); Assert.assertEquals(dupes.count(), dupsExpected); }
Example 3
Source File: Algorithm.java From predictionio-template-java-ecom-recommender with Apache License 2.0 | 6 votes |
private JavaRDD<ItemScore> validScores(JavaRDD<ItemScore> all, final Set<String> whitelist, final Set<String> blacklist,
        final Set<String> categories, final Map<String, Item> items, String userEntityId) {
    final Set<String> seenItemEntityIds = seenItemEntityIds(userEntityId);
    final Set<String> unavailableItemEntityIds = unavailableItemEntityIds();

    return all.filter(new Function<ItemScore, Boolean>() {
        @Override
        public Boolean call(ItemScore itemScore) throws Exception {
            Item item = items.get(itemScore.getItemEntityId());

            return (item != null
                    && passWhitelistCriteria(whitelist, item.getEntityId())
                    && passBlacklistCriteria(blacklist, item.getEntityId())
                    && passCategoryCriteria(categories, item)
                    && passUnseenCriteria(seenItemEntityIds, item.getEntityId())
                    && passAvailabilityCriteria(unavailableItemEntityIds, item.getEntityId()));
        }
    });
}
Example 4
Source File: DeleteHelper.java From hudi with Apache License 2.0 | 5 votes |
public static <T extends HoodieRecordPayload<T>> HoodieWriteMetadata execute(String instantTime, JavaRDD<HoodieKey> keys,
        JavaSparkContext jsc, HoodieWriteConfig config, HoodieTable<T> table, CommitActionExecutor<T> deleteExecutor) {
    try {
        HoodieWriteMetadata result = null;
        // De-dupe/merge if needed
        JavaRDD<HoodieKey> dedupedKeys = config.shouldCombineBeforeDelete() ? deduplicateKeys(keys, table) : keys;

        JavaRDD<HoodieRecord<T>> dedupedRecords = dedupedKeys.map(key -> new HoodieRecord(key, new EmptyHoodieRecordPayload()));
        Instant beginTag = Instant.now();
        // perform index lookup to get the existing location of records
        JavaRDD<HoodieRecord<T>> taggedRecords = ((HoodieTable<T>) table).getIndex().tagLocation(dedupedRecords, jsc, (HoodieTable<T>) table);
        Duration tagLocationDuration = Duration.between(beginTag, Instant.now());

        // filter out non-existent keys/records
        JavaRDD<HoodieRecord<T>> taggedValidRecords = taggedRecords.filter(HoodieRecord::isCurrentLocationKnown);
        if (!taggedValidRecords.isEmpty()) {
            result = deleteExecutor.execute(taggedValidRecords);
            result.setIndexLookupDuration(tagLocationDuration);
        } else {
            // the entire set of keys is non-existent
            deleteExecutor.saveWorkloadProfileMetadataToInflight(new WorkloadProfile(jsc.emptyRDD()), instantTime);
            result = new HoodieWriteMetadata();
            result.setWriteStatuses(jsc.emptyRDD());
            deleteExecutor.commitOnAutoCommit(result);
        }
        return result;
    } catch (Throwable e) {
        if (e instanceof HoodieUpsertException) {
            throw (HoodieUpsertException) e;
        }
        throw new HoodieUpsertException("Failed to delete for commit time " + instantTime, e);
    }
}
Example 5
Source File: CollectBaseDistributionByCycleSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 5 votes |
/**
 * Computes the base distribution by cycle. Creates a metrics file with relevant histograms.
 */
public MetricsFile<BaseDistributionByCycleMetrics, Integer> calculateBaseDistributionByCycle(final JavaRDD<GATKRead> reads) {
    final MetricsReadFilter metricsFilter = new MetricsReadFilter(this.pfReadsOnly, this.alignedReadsOnly);
    final JavaRDD<GATKRead> filteredReads = reads.filter(read -> metricsFilter.test(read));
    final HistogramGenerator hist = filteredReads.aggregate(new HistogramGenerator(),
            (hgp, read) -> hgp.addRead(read),
            (hgp1, hgp2) -> hgp1.merge(hgp2));

    final MetricsFile<BaseDistributionByCycleMetrics, Integer> metricsFile = getMetricsFile();
    hist.addToMetricsFile(metricsFile);
    return metricsFile;
}
Example 6
Source File: Grep.java From flink-perf with Apache License 2.0 | 5 votes |
public static void main(String[] args) {
    String master = args[0];
    String inFile = args[1];
    String outFile = args[2];

    String patterns[] = new String[args.length - 3];
    System.arraycopy(args, 3, patterns, 0, args.length - 3);
    System.err.println("Starting spark with master=" + master + " in=" + inFile);
    System.err.println("Using patterns: " + Arrays.toString(patterns));

    SparkConf conf = new SparkConf().setAppName("Grep job").setMaster(master).set("spark.hadoop.validateOutputSpecs", "false");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> file = sc.textFile(inFile);

    for (int p = 0; p < patterns.length; p++) {
        final String pattern = patterns[p];
        JavaRDD<String> res = file.filter(new Function<String, Boolean>() {
            private static final long serialVersionUID = 1L;

            Pattern p = Pattern.compile(pattern);

            @Override
            public Boolean call(String value) throws Exception {
                if (value == null || value.length() == 0) {
                    return false;
                }
                final Matcher m = p.matcher(value);
                if (m.find()) {
                    return true;
                }
                return false;
            }
        });
        res.saveAsTextFile(outFile + "_" + pattern);
    }
}
Example 7
Source File: ComputeResponse.java From incubator-retired-pirk with Apache License 2.0 | 5 votes |
/**
 * Method to read in the data from elasticsearch, filter, and return an RDD of MapWritable data elements
 */
@SuppressWarnings("unchecked")
public JavaRDD<MapWritable> readDataES() throws IOException, PIRException {
    logger.info("Reading data ");

    JavaRDD<MapWritable> jsonRDD;

    Job job = Job.getInstance();
    String jobName = "pirSpark_ES_" + esQuery + "_" + System.currentTimeMillis();
    job.setJobName(jobName);
    job.getConfiguration().set("es.nodes", SystemConfiguration.getProperty("es.nodes"));
    job.getConfiguration().set("es.port", SystemConfiguration.getProperty("es.port"));
    job.getConfiguration().set("es.resource", esResource);
    job.getConfiguration().set("es.query", esQuery);

    jsonRDD = sc.newAPIHadoopRDD(job.getConfiguration(), EsInputFormat.class, Text.class, MapWritable.class).values().coalesce(numDataPartitions);

    // Filter out by the provided stopListFile entries
    if (qSchema.getFilter() != null) {
        return jsonRDD.filter(new FilterData(accum, bVars));
    } else {
        logger.info("qSchema.getFilter() is null");
        return jsonRDD;
    }
}
Example 8
Source File: QuadUtils.java From rdf2x with Apache License 2.0 | 5 votes |
/**
 * Get quads with specified subjects filtered out, computed by querying an in-memory set of subjects
 *
 * @param quads            RDD of quads to filter
 * @param subjectBlacklist set of requested subject URIs to be filtered out
 * @return filtered RDD with only those quads whose subject is NOT in subjectBlacklist
 */
public static JavaRDD<Quad> filterQuadsByForbiddenSubjects(JavaRDD<Quad> quads, Set<String> subjectBlacklist) {
    if (subjectBlacklist.isEmpty()) {
        return quads;
    }
    return quads.filter(quad ->
            !quad.getSubject().isURI() ||
            !subjectBlacklist.contains(quad.getSubject().getURI())
    );
}
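A common variant, not part of the rdf2x source, is to ship a large blacklist to executors as a Spark broadcast variable instead of capturing the set directly in the lambda's closure. A minimal sketch, assuming a JavaSparkContext named jsc is in scope and org.apache.spark.broadcast.Broadcast is imported:

// Sketch: broadcast a large blacklist once per executor rather than serializing it into each task closure.
Broadcast<Set<String>> blacklistBroadcast = jsc.broadcast(subjectBlacklist);
JavaRDD<Quad> filtered = quads.filter(quad ->
        !quad.getSubject().isURI() || !blacklistBroadcast.value().contains(quad.getSubject().getURI()));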
Example 9
Source File: RddChannel.java From rheem with Apache License 2.0 | 5 votes |
public void accept(JavaRDD<?> rdd, SparkExecutor sparkExecutor) throws RheemException {
    if (this.isMarkedForInstrumentation() && !this.isRddCached()) {
        final Accumulator<Integer> accumulator = sparkExecutor.sc.accumulator(0);
        this.rdd = rdd.filter(dataQuantum -> {
            accumulator.add(1);
            return true;
        });
        this.accumulator = accumulator;
    } else {
        this.rdd = rdd;
    }
}
Example 10
Source File: AnalyzeSpark.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * Randomly sample a set of invalid values from a specified column.
 * Values are considered invalid according to the Schema / ColumnMetaData
 *
 * @param numToSample   Maximum number of invalid values to sample
 * @param columnName    Name of the column from which to sample invalid values
 * @param schema        Data schema
 * @param data          Data
 * @param ignoreMissing If true: ignore missing values (NullWritable or empty/null string) when sampling. If false: include missing values in sampling
 * @return List of invalid examples
 */
public static List<Writable> sampleInvalidFromColumn(int numToSample, String columnName, Schema schema, JavaRDD<List<Writable>> data, boolean ignoreMissing) {
    //First: filter out all valid entries, to leave only invalid entries
    int colIdx = schema.getIndexOfColumn(columnName);

    JavaRDD<Writable> ithColumn = data.map(new SelectColumnFunction(colIdx));

    ColumnMetaData meta = schema.getMetaData(columnName);

    JavaRDD<Writable> invalid = ithColumn.filter(new FilterWritablesBySchemaFunction(meta, false, ignoreMissing));

    return invalid.takeSample(false, numToSample);
}
Example 11
Source File: TransformationRDDTest.java From hui-bigdata-spark with Apache License 2.0 | 5 votes |
/**
 * Union of two sets.
 * Purpose of this demo: find all records whose entry station is Guangzhou South Railway Station (广州南站) or Tianhe Bus Terminal (天河客运站).
 *
 * @since hui_project 1.0.0
 */
@Test
public void testUnionAndFilter() {
    JavaRDD<String> textRDD = sparkContext.textFile(FILE_PATH);
    JavaRDD<String> result = textRDD.filter(x -> x.contains("广州南站"));
    JavaRDD<String> result1 = textRDD.filter(x -> x.contains("天河客运站"));
    JavaRDD<String> union = result.union(result1);
    System.out.println("-------" + union.count() + "-------");
    checkResult(union.collect());
}
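Since this page is about filter(), note that a single filter with an OR predicate comes close to the filter-plus-union above (a sketch, not from the original test). The difference: union() keeps a line twice if it mentions both stations, while a single filter keeps it once.

// Sketch: near-equivalent of the two filters plus union, minus the double-counting of lines matching both stations.
JavaRDD<String> both = textRDD.filter(x -> x.contains("广州南站") || x.contains("天河客运站"));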
Example 12
Source File: SparkGenomeReadCounts.java From gatk-protected with BSD 3-Clause "New" or "Revised" License | 4 votes |
private void collectReads() {
    if (readArguments.getReadFilesNames().size() != 1) {
        throw new UserException("This tool only accepts a single bam/sam/cram as input");
    }

    final SampleCollection sampleCollection = new SampleCollection(getHeaderForReads());
    if (sampleCollection.sampleCount() > 1) {
        throw new UserException.BadInput("We do not support bams with more than one sample.");
    }
    final String sampleName = sampleCollection.sampleIds().get(0);
    final String[] commentsForRawCoverage = {"##fileFormat = tsv",
            "##commandLine = " + getCommandLine(),
            String.format("##title = Coverage counts in %d base bins for WGS", binsize)};
    final ReadFilter filter = makeGenomeReadFilter();
    final SAMSequenceDictionary sequenceDictionary = getReferenceSequenceDictionary();

    logger.info("Starting Spark coverage collection...");
    final long coverageCollectionStartTime = System.currentTimeMillis();
    final JavaRDD<GATKRead> rawReads = getReads();
    final JavaRDD<GATKRead> reads = rawReads.filter(read -> filter.test(read));

    //Note: using a field inside a closure will pull in the whole enclosing object to serialization
    // (which leads to bad performance and can blow up if some objects in the fields are not
    // Serializable - closures always use java Serializable and not Kryo)
    //Solution here is to use a temp variable for binsize because it's just an int.
    final int binsize_tmp = binsize;
    final JavaRDD<SimpleInterval> readIntervals = reads
            .filter(read -> sequenceDictionary.getSequence(read.getContig()) != null)
            .map(read -> SparkGenomeReadCounts.createKey(read, sequenceDictionary, binsize_tmp));
    final Map<SimpleInterval, Long> byKey = readIntervals.countByValue();
    final Set<SimpleInterval> readIntervalKeySet = byKey.keySet();
    final long totalReads = byKey.values().stream().mapToLong(v -> v).sum();
    final long coverageCollectionEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished the spark coverage collection with %d targets and %d reads. Elapse of %d seconds",
            readIntervalKeySet.size(), totalReads, (coverageCollectionEndTime - coverageCollectionStartTime) / 1000));

    final String[] commentsForProportionalCoverage = {commentsForRawCoverage[0], commentsForRawCoverage[1],
            String.format("##title = Proportional coverage counts in %d base bins for WGS (total reads: %d)", binsize, totalReads)};

    logger.info("Creating full genome bins...");
    final long createGenomeBinsStartTime = System.currentTimeMillis();
    final List<SimpleInterval> fullGenomeBins = createFullGenomeBins(binsize);
    List<Target> fullGenomeTargetCollection = createTargetListFromSimpleInterval(fullGenomeBins);
    TargetWriter.writeTargetsToFile(new File(outputFile.getAbsolutePath() + ".targets.tsv"), fullGenomeTargetCollection);
    final long createGenomeBinsEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished creating genome bins. Elapse of %d seconds",
            (createGenomeBinsEndTime - createGenomeBinsStartTime) / 1000));

    logger.info("Creating missing genome bins...");
    final long createMissingGenomeBinsStartTime = System.currentTimeMillis();
    logger.info("Creating missing genome bins: Creating a mutable mapping...");
    final Map<SimpleInterval, Long> byKeyMutable = new HashMap<>();
    byKeyMutable.putAll(byKey);
    logger.info("Creating missing genome bins: Populating mutable mapping with zero counts for empty regions...");
    fullGenomeBins.stream().forEach(b -> byKeyMutable.putIfAbsent(b, 0l));
    final long createMissingGenomeBinsEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished creating missing genome bins. Elapse of %d seconds",
            (createMissingGenomeBinsEndTime - createMissingGenomeBinsStartTime) / 1000));

    logger.info("Creating final map...");
    final long createFinalMapStartTime = System.currentTimeMillis();
    final SortedMap<SimpleInterval, Long> byKeySorted = new TreeMap<>(IntervalUtils.LEXICOGRAPHICAL_ORDER_COMPARATOR);
    byKeySorted.putAll(byKeyMutable);
    final long createFinalMapEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished creating final map. Elapse of %d seconds",
            (createFinalMapEndTime - createFinalMapStartTime) / 1000));

    logger.info("Creating proportional coverage... ");
    final long pCovFileStartTime = System.currentTimeMillis();
    final SortedMap<SimpleInterval, Double> byKeyProportionalSorted = new TreeMap<>(IntervalUtils.LEXICOGRAPHICAL_ORDER_COMPARATOR);
    byKeySorted.entrySet().stream().forEach(e -> byKeyProportionalSorted.put(e.getKey(), (double) e.getValue() / totalReads));
    final long pCovFileEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished creating proportional coverage map. Elapse of %d seconds",
            (pCovFileEndTime - pCovFileStartTime) / 1000));

    logger.info("Writing raw coverage file ...");
    final long writingCovFileStartTime = System.currentTimeMillis();
    ReadCountCollectionUtils.writeReadCountsFromSimpleInterval(new File(outputFile.getAbsolutePath() + RAW_COV_OUTPUT_EXTENSION), sampleName, byKeySorted, commentsForRawCoverage);
    final long writingCovFileEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished writing coverage file. Elapse of %d seconds",
            (writingCovFileEndTime - writingCovFileStartTime) / 1000));

    logger.info("Writing proportional coverage file ...");
    final long writingPCovFileStartTime = System.currentTimeMillis();
    ReadCountCollectionUtils.writeReadCountsFromSimpleInterval(outputFile, sampleName, byKeyProportionalSorted, commentsForProportionalCoverage);
    final long writingPCovFileEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished writing proportional coverage file. Elapse of %d seconds",
            (writingPCovFileEndTime - writingPCovFileStartTime) / 1000));
}
Example 13
Source File: MarkDuplicatesSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 4 votes |
@Override
protected void runTool(final JavaSparkContext ctx) {
    final SAMFileHeader mergedHeader = getHeaderForReads();

    // If we are using multiple inputs, check that the headers are all in the correct query-grouped ordering; if so, set the aggregate header to reflect this
    if (readArguments.getReadPathSpecifiers().size() > 1) {
        final Optional<GATKPath> badlySorted = readArguments.getReadPathSpecifiers().stream()
                .filter(spec -> !treatAsReadGroupOrdered(getHeaderForReadsInput(spec), treatUnsortedAsOrdered))
                .findFirst();
        if (badlySorted.isPresent()) {
            if (allowMultipleSortOrders) {
                //don't set an ordering, the files will all be sorted downstream
                logger.info("Input files are not all grouped by read name so they will be sorted together.");
            } else {
                throw new UserException(
                        "Multiple inputs to MarkDuplicatesSpark detected. MarkDuplicatesSpark requires all inputs to be queryname sorted " +
                                "or querygroup-sorted for multi-input processing but input " + badlySorted.get() +
                                " was sorted in " + getHeaderForReadsInput(badlySorted.get()) + " order");
            }
        } else {
            // The default sort order for merged input files is unsorted, so this will be fed to the tool to be sorted
            if (!allowMultipleSortOrders) {
                mergedHeader.setGroupOrder(SAMFileHeader.GroupOrder.query);
            }
        }

    // If there is only one file and we are in treatUnsortedAsOrdered mode then set its group order accordingly.
    } else {
        if (treatUnsortedAsOrdered && (mergedHeader.getSortOrder().equals(SAMFileHeader.SortOrder.unknown) || mergedHeader.getSortOrder().equals(SAMFileHeader.SortOrder.unsorted))) {
            logger.warn("Input bam was marked as " + mergedHeader.getSortOrder().toString() + " but " + TREAT_UNSORTED_AS_ORDERED + " is specified so it's being treated as read name grouped");
            mergedHeader.setGroupOrder(SAMFileHeader.GroupOrder.query);
        }
    }

    JavaRDD<GATKRead> reads = getReads();
    final OpticalDuplicateFinder finder = opticalDuplicatesArgumentCollection.READ_NAME_REGEX != null ?
            new OpticalDuplicateFinder(opticalDuplicatesArgumentCollection.READ_NAME_REGEX, opticalDuplicatesArgumentCollection.OPTICAL_DUPLICATE_PIXEL_DISTANCE, null) : null;

    // If we need to remove optical duplicates, set the engine to mark optical duplicates using the DT tag.
    if (markDuplicatesSparkArgumentCollection.removeSequencingDuplicates && markDuplicatesSparkArgumentCollection.taggingPolicy == MarkDuplicates.DuplicateTaggingPolicy.DontTag) {
        markDuplicatesSparkArgumentCollection.taggingPolicy = MarkDuplicates.DuplicateTaggingPolicy.OpticalOnly;
    }

    final JavaRDD<GATKRead> finalReadsForMetrics = mark(reads, mergedHeader, finder, markDuplicatesSparkArgumentCollection, getRecommendedNumReducers());

    if (metricsFile != null) {
        final JavaPairRDD<String, GATKDuplicationMetrics> metricsByLibrary = MarkDuplicatesSparkUtils.generateMetrics(
                mergedHeader, finalReadsForMetrics);
        final MetricsFile<GATKDuplicationMetrics, Double> resultMetrics = getMetricsFile();
        MarkDuplicatesSparkUtils.saveMetricsRDD(resultMetrics, mergedHeader, metricsByLibrary, metricsFile);
    }
    JavaRDD<GATKRead> readsForWriting = finalReadsForMetrics;
    // Filter out the duplicates if instructed to do so
    if (markDuplicatesSparkArgumentCollection.removeAllDuplicates) {
        readsForWriting = readsForWriting.filter(r -> !r.isDuplicate());
    } else if (markDuplicatesSparkArgumentCollection.removeSequencingDuplicates) {
        readsForWriting = readsForWriting.filter(r -> !MarkDuplicates.DUPLICATE_TYPE_SEQUENCING.equals(r.getAttributeAsString(MarkDuplicates.DUPLICATE_TYPE_TAG)));
    }

    mergedHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
    writeReads(ctx, output, readsForWriting, mergedHeader, true);
}
Example 14
Source File: PSUtils.java From gatk with BSD 3-Clause "New" or "Revised" License | 4 votes |
public static JavaRDD<GATKRead> primaryReads(final JavaRDD<GATKRead> reads) {
    return reads.filter(read -> !(read.isSecondaryAlignment() || read.isSupplementaryAlignment()));
}
Example 15
Source File: ReadsPipelineSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 4 votes |
@Override
protected void runTool(final JavaSparkContext ctx) {
    String referenceFileName = addReferenceFilesForSpark(ctx, referenceArguments.getReferencePath());
    List<String> localKnownSitesFilePaths = addVCFsForSpark(ctx, knownVariants);

    final JavaRDD<GATKRead> alignedReads;
    final SAMFileHeader header;
    final BwaSparkEngine bwaEngine;
    if (align) {
        bwaEngine = new BwaSparkEngine(ctx, referenceArguments.getReferenceFileName(), bwaArgs.indexImageFile, getHeaderForReads(), getReferenceSequenceDictionary());
        if (bwaArgs.singleEndAlignment) {
            alignedReads = bwaEngine.alignUnpaired(getReads());
        } else {
            // filter reads after alignment in the case of paired reads since filtering does not know about pairs
            final ReadFilter filter = makeReadFilter(bwaEngine.getHeader());
            alignedReads = bwaEngine.alignPaired(getUnfilteredReads()).filter(filter::test);
        }
        header = bwaEngine.getHeader();
    } else {
        bwaEngine = null;
        alignedReads = getReads();
        header = getHeaderForReads();
    }

    final JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.mark(alignedReads, header, new OpticalDuplicateFinder(), markDuplicatesSparkArgumentCollection, getRecommendedNumReducers());

    // always coordinate-sort reads so BQSR can use queryLookaheadBases in FeatureDataSource
    final SAMFileHeader readsHeader = header.clone();
    readsHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
    final JavaRDD<GATKRead> sortedMarkedReads = SparkUtils.sortReadsAccordingToHeader(markedReads, readsHeader, numReducers);

    // The markedReads have already had the WellformedReadFilter applied to them, which
    // is all the filtering that MarkDupes and ApplyBQSR want. BQSR itself wants additional
    // filtering performed, so we do that here.
    //NOTE: this doesn't honor enabled/disabled commandline filters
    final ReadFilter bqsrReadFilter = ReadFilter.fromList(BaseRecalibrator.getBQSRSpecificReadFilterList(), header);

    JavaRDD<GATKRead> markedFilteredReadsForBQSR = sortedMarkedReads.filter(bqsrReadFilter::test);

    JavaPairRDD<GATKRead, Iterable<GATKVariant>> readsWithVariants = JoinReadsWithVariants.join(markedFilteredReadsForBQSR, localKnownSitesFilePaths);
    final RecalibrationReport bqsrReport = BaseRecalibratorSparkFn.apply(readsWithVariants, getHeaderForReads(), referenceFileName, bqsrArgs);

    final Broadcast<RecalibrationReport> reportBroadcast = ctx.broadcast(bqsrReport);
    final JavaRDD<GATKRead> finalReads = ApplyBQSRSparkFn.apply(sortedMarkedReads, reportBroadcast, getHeaderForReads(), applyBqsrArgs.toApplyBQSRArgumentCollection(bqsrArgs));

    if (outputBam != null) { // only write output of BQSR if output BAM is specified
        writeReads(ctx, outputBam, finalReads, header, true);
    }

    // Run Haplotype Caller
    final ReadFilter hcReadFilter = ReadFilter.fromList(HaplotypeCallerEngine.makeStandardHCReadFilters(), header);
    final JavaRDD<GATKRead> filteredReadsForHC = finalReads.filter(hcReadFilter::test);
    SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();

    final List<SimpleInterval> intervals = hasUserSuppliedIntervals() ? getIntervals() : IntervalUtils.getAllIntervalsForReference(sequenceDictionary);

    List<ShardBoundary> intervalShards = intervals.stream()
            .flatMap(interval -> Shard.divideIntervalIntoShards(interval, shardingArgs.readShardSize, shardingArgs.readShardPadding, sequenceDictionary).stream())
            .collect(Collectors.toList());

    HaplotypeCallerSpark.callVariantsWithHaplotypeCallerAndWriteOutput(ctx, filteredReadsForHC, readsHeader, sequenceDictionary, referenceArguments.getReferenceFileName(),
            intervalShards, hcArgs, shardingArgs, assemblyRegionArgs, output, makeVariantAnnotations(), logger, strict, createOutputVariantIndex);

    if (bwaEngine != null) {
        bwaEngine.close();
    }
}
Example 16
Source File: GrepCaching.java From flink-perf with Apache License 2.0 | 4 votes |
public static void main(String[] args) {
    String master = args[0];
    String inFile = args[1];
    String outFile = args[2];
    String storageLevel = args[3];

    String patterns[] = new String[args.length - 4];
    System.arraycopy(args, 4, patterns, 0, args.length - 4);
    System.err.println("Starting spark with master=" + master + " in=" + inFile);
    System.err.println("Using patterns: " + Arrays.toString(patterns));

    SparkConf conf = new SparkConf().setAppName("Grep job").setMaster(master).set("spark.hadoop.validateOutputSpecs", "false");
    JavaSparkContext sc = new JavaSparkContext(conf);

    StorageLevel sl;
    switch (storageLevel) {
        case "MEMORY_ONLY":
            sl = StorageLevel.MEMORY_ONLY();
            break;
        case "MEMORY_AND_DISK":
            sl = StorageLevel.MEMORY_AND_DISK();
            break;
        case "MEMORY_ONLY_SER":
            sl = StorageLevel.MEMORY_ONLY_SER();
            break;
        case "MEMORY_AND_DISK_SER":
            sl = StorageLevel.MEMORY_AND_DISK_SER();
            break;
        case "NONE":
            sl = StorageLevel.NONE();
            break;
        default:
            throw new RuntimeException("Unknown storage level " + storageLevel);
    }

    JavaRDD<String> file = sc.textFile(inFile).persist(sl);
    for (int p = 0; p < patterns.length; p++) {
        final String pattern = patterns[p];
        JavaRDD<String> res = file.filter(new Function<String, Boolean>() {
            private static final long serialVersionUID = 1L;

            Pattern p = Pattern.compile(pattern);

            @Override
            public Boolean call(String value) throws Exception {
                if (value == null || value.length() == 0) {
                    return false;
                }
                final Matcher m = p.matcher(value);
                if (m.find()) {
                    return true;
                }
                return false;
            }
        });
        res.saveAsTextFile(outFile + "_" + pattern);
    }
}
Example 17
Source File: PSFilter.java From gatk with BSD 3-Clause "New" or "Revised" License | 4 votes |
@SuppressWarnings("unchecked") private static JavaRDD<GATKRead> doKmerFiltering(final JavaRDD<GATKRead> reads, final String kmerLibPath, final int countThresh) { return reads.filter(new ContainsKmerReadFilterSpark(kmerLibPath, countThresh)); }
Example 18
Source File: PSFilter.java From gatk with BSD 3-Clause "New" or "Revised" License | 4 votes |
/**
 * Main PathSeq filtering method. See PathSeqFilterSpark for an overview.
 * Returns a tuple containing the paired reads and unpaired reads as separate RDDs.
 * If metricsFile is null, read count metrics will not be collected.
 */
public Tuple2<JavaRDD<GATKRead>, JavaRDD<GATKRead>> doFilter(JavaRDD<GATKRead> reads, final PSFilterLogger filterLogger) {

    Utils.nonNull(reads, "Input reads cannot be null");
    reads = PSUtils.primaryReads(reads);
    filterLogger.logPrimaryReads(reads);

    if (filterArgs.alignedInput) {
        final Set<String> contigsToIgnoreSet = Collections.unmodifiableSet(new HashSet<>(filterArgs.alignmentContigsToIgnore));
        reads = reads.filter(new ReadFilterSparkifier(new HostAlignmentReadFilter(filterArgs.minIdentity, contigsToIgnoreSet)));
    }
    filterLogger.logReadsAfterPrealignedHostFilter(reads);

    //Clear alignment data from the reads
    reads = clearAllAlignments(reads, header);

    //Remove /1 and /2 from read names
    reads = reads.map(new ReadTransformerSparkifier(new StripMateNumberTransformer()));

    if (!filterArgs.skipFilters) {

        //Adapter trimming
        reads = reads.map(new ReadTransformerSparkifier(new AdapterTrimTransformer(filterArgs.maxAdapterMismatches, filterArgs.minAdapterLength, ADAPTER_SEQUENCES)));

        //Apply simple repeat masking
        //See "Low-complexity DNA and simple repeats" at http://www.repeatmasker.org/webrepeatmaskerhelp.html
        reads = reads.map(new ReadTransformerSparkifier(new SimpleRepeatMaskTransformer(MAX_AT_CONTENT_1, MAX_GC_CONTENT_1, REPEAT_WINDOW_SIZE_1)));
        reads = reads.map(new ReadTransformerSparkifier(new SimpleRepeatMaskTransformer(MAX_AT_CONTENT_2, MAX_GC_CONTENT_2, REPEAT_WINDOW_SIZE_2)));

        //Apply DUST masking
        reads = reads.map(new ReadTransformerSparkifier(new DUSTReadTransformer(filterArgs.dustMask, filterArgs.dustW, filterArgs.dustT)));

        //Apply base quality hard clipping
        reads = reads.map(new ReadTransformerSparkifier(new BaseQualityClipReadTransformer(filterArgs.readTrimThresh)));

        //Filter reads with less than minReadLength bases
        reads = reads.filter(new ReadFilterSparkifier(new ReadLengthReadFilter(filterArgs.minReadLength, Integer.MAX_VALUE)));

        //Change low-quality bases to 'N'
        reads = reads.map(new ReadTransformerSparkifier(new BaseQualityReadTransformer(filterArgs.qualPhredThresh)));

        //Filter reads with too many 'N's
        reads = reads.filter(new ReadFilterSparkifier(new AmbiguousBaseReadFilter(filterArgs.maxAmbiguousBases)));
    }
    filterLogger.logReadsAfterQualityFilter(reads);

    //Kmer filtering
    if (filterArgs.kmerFilePath != null) {
        reads = doKmerFiltering(reads, filterArgs.kmerFilePath, filterArgs.hostKmerThresh);
    }

    //Redistribute reads
    if (!filterArgs.skipPreBwaRepartition) {
        reads = repartitionReadsByName(reads);
    }

    //Bwa host alignment filtering
    if (filterArgs.indexImageFile != null) {
        reads = doBwaFilter(reads, filterArgs.indexImageFile, filterArgs.minSeedLength, filterArgs.bwaThreads, filterArgs.minIdentity);
    }
    filterLogger.logReadsAfterHostFilter(reads);

    //Filter duplicates
    if (filterArgs.filterDuplicates) {
        reads = setPairFlags(reads, filterArgs.filterReadsPerPartition);
        reads = filterDuplicateSequences(reads);
    }
    filterLogger.logReadsAfterDeduplication(reads);

    //Sets pairedness flags properly
    reads = setPairFlags(reads, filterArgs.filterReadsPerPartition);
    reads = clearAllAlignments(reads, header);

    //Unset paired read flags for reads that are not paired
    final PSPairedUnpairedSplitterSpark splitter = new PSPairedUnpairedSplitterSpark(reads, filterArgs.filterReadsPerPartition, false);
    final JavaRDD<GATKRead> pairedReads = splitter.getPairedReads();
    final JavaRDD<GATKRead> unpairedReads = splitter.getUnpairedReads();
    filterLogger.logFinalPairedReads(pairedReads);

    return new Tuple2<>(pairedReads, unpairedReads);
}
Example 19
Source File: BatchHeatMapProcessor.java From lambda-arch with Apache License 2.0 | 3 votes |
/**
 * Filter the measurements in a given time period
 *
 * @param measurements | The dataset of measurements
 * @param start        | Start of the time period
 * @param end          | End of the time period
 * @return A set of measurements in the given time period
 */
private JavaRDD<Measurement> filterByTime(JavaRDD<Measurement> measurements, Date start, Date end) {
    return measurements.filter(measurement ->
            (measurement.getTimestamp().equals(start) || measurement.getTimestamp().after(start))
                    && measurement.getTimestamp().before(end)
    );
}
Example 20
Source File: HoodieReadClient.java From hudi with Apache License 2.0 | 2 votes |
/**
 * Filter out HoodieRecords that already exist in the output folder. This is useful in deduplication.
 *
 * @param hoodieRecords Input RDD of Hoodie records.
 * @return A subset of hoodieRecords RDD, with existing records filtered out.
 */
public JavaRDD<HoodieRecord<T>> filterExists(JavaRDD<HoodieRecord<T>> hoodieRecords) {
    JavaRDD<HoodieRecord<T>> recordsWithLocation = tagLocation(hoodieRecords);
    return recordsWithLocation.filter(v1 -> !v1.isCurrentLocationKnown());
}