htsjdk.samtools.filter.SamRecordFilter Java Examples
The following examples show how to use htsjdk.samtools.filter.SamRecordFilter. Each example notes the project it comes from, its source file, and its license.
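The interface declares two methods: filterOut(SAMRecord) for single records and filterOut(SAMRecord, SAMRecord) for read pairs; returning true marks the record (or pair) to be skipped. Before the project examples, here is a minimal sketch of a custom filter wired into a FilteringSamIterator. The MAPQ threshold, method name, and wrapper class are illustrative only, not taken from any project below.

    import htsjdk.samtools.SAMRecord;
    import htsjdk.samtools.filter.FilteringSamIterator;
    import htsjdk.samtools.filter.SamRecordFilter;

    import java.util.Iterator;

    final class MapqFilterExample {
        /** Wraps a SAMRecord iterator so that records below a MAPQ threshold are skipped. */
        static Iterator<SAMRecord> keepWellMapped(final Iterator<SAMRecord> records, final int minMapq) {
            final SamRecordFilter lowMapqFilter = new SamRecordFilter() {
                @Override
                public boolean filterOut(final SAMRecord record) {
                    // true means "drop this record".
                    return record.getMappingQuality() < minMapq;
                }

                @Override
                public boolean filterOut(final SAMRecord first, final SAMRecord second) {
                    // Pair-level variant: drop the pair only when both mates fail.
                    return filterOut(first) && filterOut(second);
                }
            };
            // FilteringSamIterator consumes the underlying iterator and yields only
            // the records the filter lets through.
            return new FilteringSamIterator(records, lowMapqFilter);
        }
    }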
Example #1
Source File: MultiHitAlignedReadIterator.java (from picard, MIT License)

    /**
     * @param querynameOrderIterator
     * @param primaryAlignmentSelectionStrategy Algorithm for selecting primary alignment when it is not clear from
     *                                          the input what should be primary.
     */
    MultiHitAlignedReadIterator(final CloseableIterator<SAMRecord> querynameOrderIterator,
                                final PrimaryAlignmentSelectionStrategy primaryAlignmentSelectionStrategy) {
        this.primaryAlignmentSelectionStrategy = primaryAlignmentSelectionStrategy;
        peekIterator = new PeekableIterator<SAMRecord>(new FilteringSamIterator(querynameOrderIterator,
                new SamRecordFilter() {
                    // Filter unmapped reads.
                    public boolean filterOut(final SAMRecord record) {
                        return record.getReadUnmappedFlag() || SAMUtils.cigarMapsNoBasesToRef(record.getCigar());
                    }

                    public boolean filterOut(final SAMRecord first, final SAMRecord second) {
                        return ((first.getReadUnmappedFlag() || SAMUtils.cigarMapsNoBasesToRef(first.getCigar()))
                                && (second.getReadUnmappedFlag() || SAMUtils.cigarMapsNoBasesToRef(second.getCigar())));
                    }
                }));
        advance();
    }
Example #2
Source File: MultiHitAlignedReadIterator.java (from gatk, BSD 3-Clause "New" or "Revised" License)

This is the same constructor as Example #1; the gatk copy differs only in the @Override annotations and the diamond operator on PeekableIterator.

    /**
     * @param querynameOrderIterator
     * @param primaryAlignmentSelectionStrategy Algorithm for selecting primary alignment when it is not clear from
     *                                          the input what should be primary.
     */
    MultiHitAlignedReadIterator(final CloseableIterator<SAMRecord> querynameOrderIterator,
                                final PrimaryAlignmentSelectionStrategy primaryAlignmentSelectionStrategy) {
        this.primaryAlignmentSelectionStrategy = primaryAlignmentSelectionStrategy;
        peekIterator = new PeekableIterator<>(new FilteringSamIterator(querynameOrderIterator,
                new SamRecordFilter() {
                    // Filter unmapped reads.
                    @Override
                    public boolean filterOut(final SAMRecord record) {
                        return record.getReadUnmappedFlag() || SAMUtils.cigarMapsNoBasesToRef(record.getCigar());
                    }

                    @Override
                    public boolean filterOut(final SAMRecord first, final SAMRecord second) {
                        return ((first.getReadUnmappedFlag() || SAMUtils.cigarMapsNoBasesToRef(first.getCigar()))
                                && (second.getReadUnmappedFlag() || SAMUtils.cigarMapsNoBasesToRef(second.getCigar())));
                    }
                }));
        advance();
    }
Example #3
Source File: BayesianHetPulldownCalculator.java (from gatk-protected, BSD 3-Clause "New" or "Revised" License)

    /**
     * Returns a {@link SamLocusIterator} object for a given {@link SamReader} and {@link IntervalList} with filters
     * on minimum base quality and minimum mapping quality.
     *
     * @param samReader a SamReader object
     * @return a SamLocusIterator object
     */
    private SamLocusIterator getSamLocusIteratorWithDefaultFilters(final SamReader samReader) {
        final SamLocusIterator locusIterator = new SamLocusIterator(samReader, snpIntervals, false);

        /* set read and locus filters */
        final List<SamRecordFilter> samFilters = Arrays.asList(new NotPrimaryAlignmentFilter(),
                new DuplicateReadFilter());
        locusIterator.setSamFilters(samFilters);
        locusIterator.setEmitUncoveredLoci(false);
        locusIterator.setIncludeNonPfReads(false);
        locusIterator.setMappingQualityScoreCutoff(minMappingQuality);
        locusIterator.setQualityScoreCutoff(minBaseQuality);

        return locusIterator;
    }
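A hypothetical caller (not part of BayesianHetPulldownCalculator) might consume the configured iterator like this:

    // Hypothetical caller: walk the filtered pileup locus by locus. Only reads that
    // survive the NotPrimaryAlignment/Duplicate filters and the MAPQ/base-quality
    // cutoffs set above contribute to each LocusInfo.
    final SamLocusIterator locusIterator = getSamLocusIteratorWithDefaultFilters(samReader);
    for (final SamLocusIterator.LocusInfo locus : locusIterator) {
        final int depth = locus.getRecordAndOffsets().size();
        System.out.println(locus.getSequenceName() + ":" + locus.getPosition() + " depth=" + depth);
    }
    locusIterator.close();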
Example #4
Source File: BamToBfqWriter.java (from picard, MIT License)

    /**
     * Writes the binary fastq file(s) to the output directory.
     */
    public void writeBfqFiles() {
        final SamReader reader = SamReaderFactory.makeDefault().open(bamFile);
        final Iterator<SAMRecord> iterator = reader.iterator();

        // Filter out noise reads and reads that fail the quality filter
        final TagFilter tagFilter = new TagFilter(ReservedTagConstants.XN, 1);
        final FailsVendorReadQualityFilter qualityFilter = new FailsVendorReadQualityFilter();
        final WholeReadClippedFilter clippedFilter = new WholeReadClippedFilter();

        if (!pairedReads) {
            List<SamRecordFilter> filters = new ArrayList<SamRecordFilter>();
            filters.add(tagFilter);
            filters.add(clippedFilter);
            if (!this.includeNonPfReads) {
                filters.add(qualityFilter);
            }
            writeSingleEndBfqs(iterator, filters);
            codec1.close();
        } else {
            writePairedEndBfqs(iterator, tagFilter, qualityFilter, clippedFilter);
            codec1.close();
            codec2.close();
        }
        log.info("Wrote " + wrote + " bfq records.");
        CloserUtil.close(reader);
    }
Example #5
Source File: BamToBfqWriter.java (from picard, MIT License)

    /**
     * Path for writing bfqs for single-end reads.
     *
     * @param iterator the iterator with the SAM records to write
     * @param filters  the list of filters to be applied
     */
    private void writeSingleEndBfqs(final Iterator<SAMRecord> iterator, final List<SamRecordFilter> filters) {
        // Open the codecs for writing
        int fileIndex = 0;
        initializeNextBfqFiles(fileIndex++);

        int records = 0;

        final FilteringSamIterator it = new FilteringSamIterator(iterator, new AggregateFilter(filters));
        while (it.hasNext()) {
            final SAMRecord record = it.next();
            records++;

            if (records % increment == 0) {
                record.setReadName(record.getReadName() + "/1");
                writeFastqRecord(codec1, record);
                wrote++;
                if (wrote % 1000000 == 0) {
                    log.info(wrote + " records processed.");
                }
                if (chunk > 0 && wrote % chunk == 0) {
                    initializeNextBfqFiles(fileIndex++);
                }
            }
        }
    }
Example #6
Source File: CollectWgsMetricsTestUtils.java (from picard, MIT License)

    static AbstractLocusIterator createReadEndsIterator(final String exampleSam) {
        final List<SamRecordFilter> filters = new ArrayList<>();
        final CountingFilter dupeFilter = new CountingDuplicateFilter();
        final CountingFilter mapqFilter = new CountingMapQFilter(0);

        filters.add(new SecondaryAlignmentFilter()); // Not a counting filter because we never want to count reads twice
        filters.add(mapqFilter);
        filters.add(dupeFilter);

        SamReader samReader = createSamReader(exampleSam);
        AbstractLocusIterator iterator = new EdgeReadIterator(samReader);
        iterator.setSamFilters(filters);
        iterator.setMappingQualityScoreCutoff(0); // Handled separately because we want to count bases
        iterator.setIncludeNonPfReads(false);
        return iterator;
    }
Example #7
Source File: CollectWgsMetricsTestUtils.java (from picard, MIT License)

    static AbstractLocusIterator createSamLocusIterator(final String exampleSam) {
        final List<SamRecordFilter> filters = new ArrayList<>();
        final CountingFilter dupeFilter = new CountingDuplicateFilter();
        final CountingFilter mapqFilter = new CountingMapQFilter(0);

        filters.add(new SecondaryAlignmentFilter()); // Not a counting filter because we never want to count reads twice
        filters.add(mapqFilter);
        filters.add(dupeFilter);

        SamReader samReader = createSamReader(exampleSam);
        AbstractLocusIterator iterator = new SamLocusIterator(samReader);
        iterator.setSamFilters(filters);
        iterator.setMappingQualityScoreCutoff(0); // Handled separately because we want to count bases
        iterator.setIncludeNonPfReads(false);
        return iterator;
    }
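The two helpers differ only in the iterator class they construct (EdgeReadIterator vs. SamLocusIterator); a test might consume either one through the shared AbstractLocusIterator API, roughly as sketched below (hypothetical test body, with exampleSam assumed in scope):

    // Hypothetical test body: both helpers return an AbstractLocusIterator, so the
    // traversal code is the same. Each emitted AbstractLocusInfo is one reference
    // position that survives the filters configured above.
    final AbstractLocusIterator iterator = createSamLocusIterator(exampleSam);
    while (iterator.hasNext()) {
        final AbstractLocusInfo info = (AbstractLocusInfo) iterator.next();
        System.out.println(info.getSequenceName() + ":" + info.getPosition());
    }
    iterator.close();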
Example #8
Source File: HetPulldownCalculator.java (from gatk-protected, BSD 3-Clause "New" or "Revised" License)

    /**
     * For a normal or tumor sample, returns a data structure giving (intervals, reference counts, alternate counts),
     * where intervals give positions of likely heterozygous SNP sites.
     *
     * <p>
     * For a normal sample:
     * <ul>
     *     The IntervalList snpIntervals gives common SNP sites in 1-based format.
     * </ul>
     * <ul>
     *     The p-value threshold must be specified for a two-sided binomial test,
     *     which is used to determine SNP sites from snpIntervals that are
     *     compatible with a heterozygous SNP, given the sample. Only these sites are output.
     * </ul>
     * </p>
     * <p>
     * For a tumor sample:
     * <ul>
     *     The IntervalList snpIntervals gives heterozygous SNP sites likely to be present in the normal sample.
     *     This should be from {@link HetPulldownCalculator#getNormal} in 1-based format.
     *     Only these sites are output.
     * </ul>
     * </p>
     *
     * @param bamFile         sorted BAM file for sample
     * @param snpIntervals    IntervalList of SNP sites
     * @param sampleType      flag indicating type of sample (SampleType.NORMAL or SampleType.TUMOR)
     *                        (determines whether to perform binomial test)
     * @param pvalThreshold   p-value threshold for two-sided binomial test, used for normal sample
     * @param minimumRawReads minimum number of total reads that must be present at a het site
     * @return Pulldown of heterozygous SNP sites in 1-based format
     */
    private Pulldown getHetPulldown(final File bamFile, final IntervalList snpIntervals, final SampleType sampleType,
                                    final double pvalThreshold, final int minimumRawReads) {
        try (final SamReader bamReader = SamReaderFactory.makeDefault().validationStringency(validationStringency)
                .referenceSequence(refFile).open(bamFile);
             final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(refFile)) {
            if (bamReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
                throw new UserException.BadInput("BAM file " + bamFile.toString() + " must be coordinate sorted.");
            }

            final Pulldown hetPulldown = new Pulldown(bamReader.getFileHeader());
            final int totalNumberOfSNPs = snpIntervals.size();
            final SamLocusIterator locusIterator = new SamLocusIterator(bamReader, snpIntervals,
                    totalNumberOfSNPs < MAX_INTERVALS_FOR_INDEX);

            //set read and locus filters [note: read counts match IGV, but off by a few from pysam.mpileup]
            final List<SamRecordFilter> samFilters = Arrays.asList(new NotPrimaryAlignmentFilter(),
                    new DuplicateReadFilter());
            locusIterator.setSamFilters(samFilters);
            locusIterator.setEmitUncoveredLoci(false);
            locusIterator.setIncludeNonPfReads(false);
            locusIterator.setMappingQualityScoreCutoff(minMappingQuality);
            locusIterator.setQualityScoreCutoff(minBaseQuality);

            logger.info("Examining " + totalNumberOfSNPs + " sites in total...");
            int locusCount = 0;
            for (final SamLocusIterator.LocusInfo locus : locusIterator) {
                if (locusCount % NUMBER_OF_SITES_PER_LOGGED_STATUS_UPDATE == 0) {
                    logger.info("Examined " + locusCount + " covered sites.");
                }
                locusCount++;

                //include N, etc. reads here
                final int totalReadCount = locus.getRecordAndOffsets().size();
                if (totalReadCount < minimumRawReads) {
                    continue;
                }

                final Nucleotide.Counter baseCounts = getPileupBaseCounts(locus);
                //only include total ACGT counts in binomial test (exclude N, etc.)
                final int totalBaseCount = Arrays.stream(BASES).mapToInt(b -> (int) baseCounts.get(b)).sum();

                if (sampleType == SampleType.NORMAL &&
                        !isPileupHetCompatible(baseCounts, totalBaseCount, pvalThreshold)) {
                    continue;
                }

                final Nucleotide refBase = Nucleotide.valueOf(
                        refWalker.get(locus.getSequenceIndex()).getBases()[locus.getPosition() - 1]);
                final int refReadCount = (int) baseCounts.get(refBase);
                final int altReadCount = totalBaseCount - refReadCount;

                hetPulldown.add(new AllelicCount(
                        new SimpleInterval(locus.getSequenceName(), locus.getPosition(), locus.getPosition()),
                        refReadCount, altReadCount));
            }
            logger.info(locusCount + " covered sites out of " + totalNumberOfSNPs + " total sites were examined.");
            return hetPulldown;
        } catch (final IOException | SAMFormatException e) {
            throw new UserException(e.getMessage());
        }
    }
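The Javadoc's two-sided binomial test is encapsulated in isPileupHetCompatible, which is not shown above. As a rough sketch of what such a check can look like (the actual gatk-protected implementation may differ), using commons-math3:

    // Requires org.apache.commons.math3.stat.inference.BinomialTest and AlternativeHypothesis.
    // Sketch only: a site is treated as compatible with a heterozygous SNP when a
    // two-sided binomial test cannot reject an allele fraction of 0.5 for the
    // major base. BASES and Nucleotide.Counter are as used in the example above.
    private static boolean isPileupHetCompatibleSketch(final Nucleotide.Counter baseCounts,
                                                       final int totalBaseCount,
                                                       final double pvalThreshold) {
        final int majorReadCount = (int) Arrays.stream(BASES).mapToLong(baseCounts::get).max().orElse(0);
        final double pval = new BinomialTest().binomialTest(
                totalBaseCount, majorReadCount, 0.5, AlternativeHypothesis.TWO_SIDED);
        return pval >= pvalThreshold;
    }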
Example #9
Source File: AllelicCountCollector.java (from gatk-protected, BSD 3-Clause "New" or "Revised" License)

    /**
     * Returns an {@link AllelicCountCollection} based on the pileup at sites (specified by an interval list)
     * in a sorted BAM file. Reads and bases below the specified mapping quality and base quality, respectively,
     * are filtered out of the pileup. The alt count is defined as the total count minus the ref count, and the
     * alt nucleotide is defined as the non-ref base with the highest count, with ties broken by the order of the
     * bases in {@link AllelicCountCollector#BASES}.
     *
     * @param bamFile           sorted BAM file
     * @param siteIntervals     interval list of sites
     * @param minMappingQuality minimum mapping quality required for reads to be included in pileup
     * @param minBaseQuality    minimum base quality required for bases to be included in pileup
     * @return AllelicCountCollection of ref/alt counts at sites in BAM file
     */
    public AllelicCountCollection collect(final File bamFile,
                                          final IntervalList siteIntervals,
                                          final int minMappingQuality,
                                          final int minBaseQuality) {
        try (final SamReader reader = readerFactory.open(bamFile)) {
            ParamUtils.isPositiveOrZero(minMappingQuality, "Minimum mapping quality must be nonnegative.");
            ParamUtils.isPositiveOrZero(minBaseQuality, "Minimum base quality must be nonnegative.");
            if (reader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
                throw new UserException.BadInput("BAM file " + bamFile.toString() + " must be coordinate sorted.");
            }

            final int numberOfSites = siteIntervals.size();
            final boolean useIndex = numberOfSites < MAX_INTERVALS_FOR_INDEX;
            final SamLocusIterator locusIterator = new SamLocusIterator(reader, siteIntervals, useIndex);

            //set read and locus filters [note: read counts match IGV, but off by a few from pysam.mpileup]
            final List<SamRecordFilter> samFilters = Arrays.asList(new NotPrimaryAlignmentFilter(),
                    new DuplicateReadFilter());
            locusIterator.setSamFilters(samFilters);
            locusIterator.setEmitUncoveredLoci(true);
            locusIterator.setIncludeNonPfReads(false);
            locusIterator.setMappingQualityScoreCutoff(minMappingQuality);
            locusIterator.setQualityScoreCutoff(minBaseQuality);

            logger.info("Examining " + numberOfSites + " sites in total...");
            int locusCount = 0;
            final AllelicCountCollection counts = new AllelicCountCollection();
            for (final SamLocusIterator.LocusInfo locus : locusIterator) {
                if (locusCount % NUMBER_OF_SITES_PER_LOGGED_STATUS_UPDATE == 0) {
                    logger.info("Examined " + locusCount + " sites.");
                }
                locusCount++;

                final Nucleotide refBase = Nucleotide.valueOf(
                        referenceWalker.get(locus.getSequenceIndex()).getBases()[locus.getPosition() - 1]);
                if (!BASES.contains(refBase)) {
                    logger.warn(String.format("The reference position at %d has an unknown base call (value: %s). Skipping...",
                            locus.getPosition(), refBase.toString()));
                    continue;
                }

                final Nucleotide.Counter baseCounts = getPileupBaseCounts(locus);
                //only include total ACGT counts (exclude N, etc.)
                final int totalBaseCount = BASES.stream().mapToInt(b -> (int) baseCounts.get(b)).sum();
                final int refReadCount = (int) baseCounts.get(refBase);
                final int altReadCount = totalBaseCount - refReadCount; //we take alt = total - ref instead of the actual alt count
                final Nucleotide altBase = inferAltFromPileupBaseCounts(baseCounts, refBase);

                counts.add(new AllelicCount(
                        new SimpleInterval(locus.getSequenceName(), locus.getPosition(), locus.getPosition()),
                        refReadCount, altReadCount, refBase, altBase));
            }
            logger.info(locusCount + " sites out of " + numberOfSites + " total sites were examined.");
            return counts;
        } catch (final IOException | SAMFormatException e) {
            throw new UserException("Unable to collect allelic counts from " + bamFile);
        }
    }
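inferAltFromPileupBaseCounts is not shown, but the rule it implements is spelled out in the Javadoc; here is a sketch consistent with that description (the real method may differ):

    // Sketch of the documented rule: the alt base is the non-ref base with the
    // highest count, ties broken by the order of BASES. Assumes BASES is the
    // List<Nucleotide> referenced in the Javadoc above.
    private static Nucleotide inferAltFromPileupBaseCountsSketch(final Nucleotide.Counter baseCounts,
                                                                 final Nucleotide refBase) {
        Nucleotide altBase = null;
        long bestCount = -1;
        for (final Nucleotide base : BASES) {
            // Strict '>' keeps the earliest base in BASES on ties.
            if (base != refBase && baseCounts.get(base) > bestCount) {
                altBase = base;
                bestCount = baseCounts.get(base);
            }
        }
        return altBase;
    }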
Example #10
Source File: BamToBfqWriter.java (from picard, MIT License)

    /**
     * Path for writing bfqs for paired-end reads.
     *
     * @param iterator      the iterator with the SAM records to write
     * @param tagFilter     the filter for noise reads
     * @param qualityFilter the filter for PF reads
     */
    private void writePairedEndBfqs(final Iterator<SAMRecord> iterator, final TagFilter tagFilter,
                                    final FailsVendorReadQualityFilter qualityFilter,
                                    SamRecordFilter... otherFilters) {
        // Open the codecs for writing
        int fileIndex = 0;
        initializeNextBfqFiles(fileIndex++);

        int records = 0;

        RECORD_LOOP:
        while (iterator.hasNext()) {
            final SAMRecord first = iterator.next();
            if (!iterator.hasNext()) {
                throw new PicardException("Mismatched number of records in " + this.bamFile.getAbsolutePath());
            }
            final SAMRecord second = iterator.next();
            if (!second.getReadName().equals(first.getReadName()) ||
                    first.getFirstOfPairFlag() == second.getFirstOfPairFlag()) {
                throw new PicardException("Unmatched read pairs in " + this.bamFile.getAbsolutePath() +
                        ": " + first.getReadName() + ", " + second.getReadName() + ".");
            }

            // If *both* are noise reads, filter them out
            if (tagFilter.filterOut(first) && tagFilter.filterOut(second)) {
                continue;
            }

            // If either fails to pass filter, then exclude them as well
            if (!includeNonPfReads && (qualityFilter.filterOut(first) || qualityFilter.filterOut(second))) {
                continue;
            }

            // If either fails any of the other filters, exclude them both
            for (SamRecordFilter filter : otherFilters) {
                if (filter.filterOut(first) || filter.filterOut(second)) {
                    continue RECORD_LOOP;
                }
            }

            // Otherwise, write them out
            records++;
            if (records % increment == 0) {
                first.setReadName(first.getReadName() + "/1");
                writeFastqRecord(first.getFirstOfPairFlag() ? codec1 : codec2, first);
                second.setReadName(second.getReadName() + "/2");
                writeFastqRecord(second.getFirstOfPairFlag() ? codec1 : codec2, second);
                wrote++;
                if (wrote % 1000000 == 0) {
                    log.info(wrote + " records written.");
                }
                if (chunk > 0 && wrote % chunk == 0) {
                    initializeNextBfqFiles(fileIndex++);
                }
            }
        }
    }
Example #11
Source File: BamToBfqWriter.java (from picard, MIT License)

    /**
     * Count the number of records in the bamFile that could potentially be written.
     *
     * @return the number of records in the Bam file
     */
    private int countWritableRecords() {
        int count = 0;

        final SamReader reader = SamReaderFactory.makeDefault().open(this.bamFile);
        if (!reader.getFileHeader().getSortOrder().equals(SAMFileHeader.SortOrder.queryname)) {
            // This is a fix for issue PIC-274: BamToBfqWriter requires that the input BAM is queryname sorted,
            // but it didn't check this early, nor produce an understandable error message.
            throw new PicardException("Input file (" + this.bamFile.getAbsolutePath() + ") needs to be sorted by queryname.");
        }
        final PeekableIterator<SAMRecord> it = new PeekableIterator<SAMRecord>(reader.iterator());
        if (!this.pairedReads) {
            // Filter out noise reads and reads that fail the quality filter
            final List<SamRecordFilter> filters = new ArrayList<SamRecordFilter>();
            filters.add(new TagFilter(ReservedTagConstants.XN, 1));
            if (!this.includeNonPfReads) {
                filters.add(new FailsVendorReadQualityFilter());
            }
            final FilteringSamIterator itr = new FilteringSamIterator(it, new AggregateFilter(filters));
            while (itr.hasNext()) {
                itr.next();
                count++;
            }
        } else {
            while (it.hasNext()) {
                final SAMRecord first = it.next();
                final SAMRecord second = it.next();
                // If both are noise reads, filter them out
                if (first.getAttribute(ReservedTagConstants.XN) != null &&
                        second.getAttribute(ReservedTagConstants.XN) != null) {
                    // skip it
                }
                // If either fails to pass filter, then exclude them as well
                else if (!this.includeNonPfReads &&
                        (first.getReadFailsVendorQualityCheckFlag() || second.getReadFailsVendorQualityCheckFlag())) {
                    // skip it
                }
                // Otherwise, write them out
                else {
                    count++;
                }
            }
        }
        it.close();
        CloserUtil.close(reader);
        return count;
    }
Example #12
Source File: RevertSam.java (from picard, MIT License)

    private Map<SAMReadGroupRecord, FastqQualityFormat> createReadGroupFormatMap(
            final SAMFileHeader inHeader,
            final File referenceSequence,
            final ValidationStringency validationStringency,
            final File input,
            final boolean restoreOriginalQualities) {

        final Map<SAMReadGroupRecord, FastqQualityFormat> readGroupToFormat = new HashMap<>();

        final SamReaderFactory readerFactory =
                SamReaderFactory.makeDefault().referenceSequence(referenceSequence).validationStringency(validationStringency);

        // Figure out the quality score encoding scheme for each read group.
        for (final SAMReadGroupRecord rg : inHeader.getReadGroups()) {
            final SamRecordFilter filter = new SamRecordFilter() {
                public boolean filterOut(final SAMRecord rec) {
                    return !rec.getReadGroup().getId().equals(rg.getId());
                }

                public boolean filterOut(final SAMRecord first, final SAMRecord second) {
                    throw new UnsupportedOperationException();
                }
            };
            try (final SamReader reader = readerFactory.open(input)) {
                final FilteringSamIterator filterIterator = new FilteringSamIterator(reader.iterator(), filter);
                if (filterIterator.hasNext()) {
                    readGroupToFormat.put(rg, QualityEncodingDetector.detect(
                            QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE, filterIterator,
                            restoreOriginalQualities));
                } else {
                    log.warn("Skipping read group " + rg.getReadGroupId() + " with no reads");
                }
            } catch (IOException e) {
                throw new RuntimeIOException(e);
            }
        }
        for (final SAMReadGroupRecord r : readGroupToFormat.keySet()) {
            log.info("Detected quality format for " + r.getReadGroupId() + ": " + readGroupToFormat.get(r));
        }
        if (readGroupToFormat.values().contains(FastqQualityFormat.Solexa)) {
            throw new PicardException("No quality score encoding conversion implemented for " +
                    FastqQualityFormat.Solexa);
        }
        return readGroupToFormat;
    }
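Pulled out as a named class, the anonymous filter above would look roughly like this (a hypothetical refactoring, not part of RevertSam). The pair-level overload is expected to be unreachable here, since the two-argument FilteringSamIterator constructor filters record by record:

    // Hypothetical named equivalent of the anonymous read-group filter above.
    final class ReadGroupFilter implements SamRecordFilter {
        private final String readGroupId;

        ReadGroupFilter(final String readGroupId) {
            this.readGroupId = readGroupId;
        }

        @Override
        public boolean filterOut(final SAMRecord rec) {
            // Keep only records belonging to the target read group.
            return !rec.getReadGroup().getId().equals(readGroupId);
        }

        @Override
        public boolean filterOut(final SAMRecord first, final SAMRecord second) {
            // Not used when the iterator filters record-by-record.
            throw new UnsupportedOperationException();
        }
    }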
Example #13
Source File: CollectWgsMetrics.java (from picard, MIT License)

    @Override
    protected int doWork() {
        IOUtil.assertFileIsReadable(INPUT);
        IOUtil.assertFileIsWritable(OUTPUT);
        IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
        INTERVALS = intervalArugmentCollection.getIntervalFile();
        if (INTERVALS != null) {
            IOUtil.assertFileIsReadable(INTERVALS);
        }
        if (THEORETICAL_SENSITIVITY_OUTPUT != null) {
            IOUtil.assertFileIsWritable(THEORETICAL_SENSITIVITY_OUTPUT);
        }

        // it doesn't make sense for the locus accumulation cap to be lower than the coverage cap
        if (LOCUS_ACCUMULATION_CAP < COVERAGE_CAP) {
            log.warn("Setting the LOCUS_ACCUMULATION_CAP to be equal to the COVERAGE_CAP (" + COVERAGE_CAP +
                    ") because it should not be lower");
            LOCUS_ACCUMULATION_CAP = COVERAGE_CAP;
        }

        // Setup all the inputs
        final ProgressLogger progress = new ProgressLogger(log, 10000000, "Processed", "loci");
        final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
        final SamReader in = getSamReader();
        final AbstractLocusIterator iterator = getLocusIterator(in);

        final List<SamRecordFilter> filters = new ArrayList<>();
        final CountingFilter adapterFilter = new CountingAdapterFilter();
        final CountingFilter mapqFilter = new CountingMapQFilter(MINIMUM_MAPPING_QUALITY);
        final CountingFilter dupeFilter = new CountingDuplicateFilter();
        final CountingPairedFilter pairFilter = new CountingPairedFilter();
        // The order in which filters are added matters!
        filters.add(new SecondaryAlignmentFilter()); // Not a counting filter because we never want to count reads twice
        filters.add(adapterFilter);
        filters.add(mapqFilter);
        filters.add(dupeFilter);
        if (!COUNT_UNPAIRED) {
            filters.add(pairFilter);
        }
        iterator.setSamFilters(filters);
        iterator.setMappingQualityScoreCutoff(0); // Handled separately because we want to count bases
        iterator.setIncludeNonPfReads(false);

        final AbstractWgsMetricsCollector<?> collector = getCollector(COVERAGE_CAP, getIntervalsToExamine());
        final WgsMetricsProcessor processor = getWgsMetricsProcessor(progress, refWalker, iterator, collector);
        processor.processFile();

        final MetricsFile<WgsMetrics, Integer> out = getMetricsFile();
        processor.addToMetricsFile(out, INCLUDE_BQ_HISTOGRAM, dupeFilter, adapterFilter, mapqFilter, pairFilter);
        out.write(OUTPUT);

        if (THEORETICAL_SENSITIVITY_OUTPUT != null) {
            // Write out theoretical sensitivity results.
            final MetricsFile<TheoreticalSensitivityMetrics, ?> theoreticalSensitivityMetrics = getMetricsFile();
            log.info("Calculating theoretical sensitivity at " + ALLELE_FRACTION.size() + " allele fractions.");
            List<TheoreticalSensitivityMetrics> tsm = TheoreticalSensitivity.calculateSensitivities(SAMPLE_SIZE,
                    collector.getUnfilteredDepthHistogram(), collector.getUnfilteredBaseQHistogram(), ALLELE_FRACTION);
            theoreticalSensitivityMetrics.addAllMetrics(tsm);
            theoreticalSensitivityMetrics.write(THEORETICAL_SENSITIVITY_OUTPUT);
        }
        return 0;
    }