htsjdk.samtools.reference.ReferenceSequenceFileWalker Java Exaples

Source File: SetNmMdAndUqTags.java From picard with MIT License

6 votes

protected int doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);
    final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);

    if (reader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
        throw new SAMException("Input must be coordinate-sorted for this program to run. Found: " + reader.getFileHeader().getSortOrder());
    }

    final SAMFileWriter writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(reader.getFileHeader(), true, OUTPUT);
    writer.setProgressLogger(
            new ProgressLogger(log, (int) 1e7, "Wrote", "records"));

    final ReferenceSequenceFileWalker refSeqWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);

    StreamSupport.stream(reader.spliterator(), false)
            .peek(rec -> fixRecord(rec, refSeqWalker))
            .forEach(writer::addAlignment);
    CloserUtil.close(reader);
    writer.close();
    return 0;
}

Source File: BaseErrorCalculationTest.java From picard with MIT License

6 votes

@Test(dataProvider = "testOverlappingErrorCalculatorWithManyReadsData", timeOut = 5000)
public void testOverlappingErrorCalculatorWithManyReads(final File temp) throws IOException {

    try (final ReferenceSequenceFileWalker referenceSequenceFileWalker =
                 new ReferenceSequenceFileWalker(CommandLineProgramTest.CHR_M_REFERENCE.getAbsoluteFile());
         final SamLocusIterator samLocusIterator = new SamLocusIterator(SamReaderFactory.make().open(temp));
         final SamLocusAndReferenceIterator samLocusAndReferences = new SamLocusAndReferenceIterator(
                 referenceSequenceFileWalker, samLocusIterator)) {

        BaseErrorAggregation<OverlappingReadsErrorCalculator> aggregation =
                new BaseErrorAggregation<>(OverlappingReadsErrorCalculator::new, ReadBaseStratification.baseCycleStratifier);

        for (final SamLocusAndReferenceIterator.SAMLocusAndReference locusAndReference : samLocusAndReferences) {
            for (SamLocusIterator.RecordAndOffset recordAndOffset : locusAndReference.getRecordAndOffsets())

                aggregation.addBase(recordAndOffset, locusAndReference);
        }
    }
}

Source File: BaseErrorCalculationTest.java From picard with MIT License

6 votes

@DataProvider
public Object[][] testOverlappingErrorCalculatorWithManyReadsData() throws IOException {
    final File temp = File.createTempFile("Overlapping", ".bam");
    temp.deleteOnExit();

    try (
            final ReferenceSequenceFileWalker referenceSequenceFileWalker =
                    new ReferenceSequenceFileWalker(CommandLineProgramTest.CHR_M_REFERENCE)) {

        final SAMRecordSetBuilder builder = new SAMRecordSetBuilder();
        builder.getHeader().setSequenceDictionary(referenceSequenceFileWalker.getSequenceDictionary());

        for (int i = 0; i < 4000; i++) {
            builder.addPair("Read" + i, 0, 1, 1,
                    false, false, "36M", "36M", true, false, 20);
        }

        try (final SAMFileWriter writer = new SAMFileWriterFactory()
                .setCompressionLevel(2)
                .makeBAMWriter(builder.getHeader(), false, temp)) {
            builder.forEach(writer::addAlignment);
        }
    }
    return new Object[][]{{temp}};
}

Source File: AllelicCountCollector.java From gatk-protected with BSD 3-Clause "New" or "Revised" License

5 votes

/**
 * Constructs a {@link AllelicCountCollector} object for calculating {@link AllelicCountCollection} objects from
 * BAMs based on a reference genome.
 * @param referenceFile         file containing the reference
 * @param validationStringency  validation stringency to use for reading BAM files
 */
public AllelicCountCollector(final File referenceFile,
                             final ValidationStringency validationStringency) {
    IOUtils.canReadFile(referenceFile);
    readerFactory = SamReaderFactory.makeDefault().validationStringency(validationStringency).referenceSequence(referenceFile);
    referenceWalker = new ReferenceSequenceFileWalker(referenceFile);
}

Source File: SetNmMdAndUqTags.java From picard with MIT License

5 votes

private void fixRecord(SAMRecord record, ReferenceSequenceFileWalker refSeqWalker){
    if (!record.getReadUnmappedFlag()) {
        if (SET_ONLY_UQ) {
            AbstractAlignmentMerger.fixUq(record, refSeqWalker, IS_BISULFITE_SEQUENCE);
        } else {
            AbstractAlignmentMerger.fixNmMdAndUq(record, refSeqWalker, IS_BISULFITE_SEQUENCE);
        }
    }
}

Source File: AbstractAlignmentMerger.java From picard with MIT License

5 votes

/** Calculates and sets the NM, MD, and and UQ tags from the record and the reference
 *
 * @param record the record to be fixed
 * @param refSeqWalker a ReferenceSequenceWalker that will be used to traverse the reference
 * @param isBisulfiteSequence a flag indicating whether the sequence came from bisulfite-sequencing which would imply a different
 * calculation of the NM tag.
 *
 * No return value, modifies the provided record.
 */
public static void fixNmMdAndUq(final SAMRecord record, final ReferenceSequenceFileWalker refSeqWalker, final boolean isBisulfiteSequence) {
    final byte[] referenceBases = refSeqWalker.get(record.getReferenceIndex()).getBases();
    // only recalculate NM if it isn't bisulfite, since it needs to be treated specially below
    SequenceUtil.calculateMdAndNmTags(record, referenceBases, true, !isBisulfiteSequence);
    if (isBisulfiteSequence) {  // recalculate the NM tag for bisulfite data
        record.setAttribute(SAMTag.NM.name(), SequenceUtil.calculateSamNmTag(record, referenceBases, 0, isBisulfiteSequence));
    }
    fixUq(record, refSeqWalker, isBisulfiteSequence);
}

Source File: WgsMetricsProcessorImplTest.java From picard with MIT License

5 votes

@Test
public void testForFilteredBases(){
    AbstractLocusIterator iterator = createReadEndsIterator(exampleSamOneRead);
    CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics();
    FastWgsMetricsCollector collector =  new FastWgsMetricsCollector(collectWgsMetrics, 100, createIntervalList());
    String secondReferenceString = ">ref\nNNNNNNNNNNAATATTCTTC";
    ReferenceSequenceFile referenceSequenceFile = createReferenceSequenceFile(secondReferenceString);
    ReferenceSequenceFileWalker refWalker =  new ReferenceSequenceFileWalker(referenceSequenceFile);
    WgsMetricsProcessorImpl wgsMetricsProcessor =
            new WgsMetricsProcessorImpl(iterator, refWalker, collector, progress);
    wgsMetricsProcessor.processFile();
    assertEquals(10, collector.counter);
}

Source File: WgsMetricsProcessorImpl.java From picard with MIT License

5 votes

/**
 * @param iterator  input {@link htsjdk.samtools.util.AbstractLocusIterator}
 * @param refWalker over processed reference file
 * @param collector input {@link picard.analysis.AbstractWgsMetricsCollector}
 * @param progress  logger
 */
public WgsMetricsProcessorImpl(AbstractLocusIterator<T, AbstractLocusInfo<T>> iterator,
        ReferenceSequenceFileWalker refWalker,
        AbstractWgsMetricsCollector<T> collector,
        ProgressLogger progress) {
    this.iterator = iterator;
    this.collector = collector;
    this.refWalker = refWalker;
    this.progress = progress;
}

Source File: AbstractAlignmentMerger.java From gatk with BSD 3-Clause "New" or "Revised" License

4 votes

protected void resetRefSeqFileWalker() {
    this.refSeq = new ReferenceSequenceFileWalker(referenceFasta);
}

Source File: GatherGeneGCLength.java From Drop-seq with MIT License

4 votes

@Override
   protected int doWork() {

       IOUtil.assertFileIsReadable(ANNOTATIONS_FILE);
       IOUtil.assertFileIsWritable(this.OUTPUT);
       PrintStream out = new ErrorCheckingPrintStream(IOUtil.openFileForWriting(OUTPUT));
       writeHeader(out);

       PrintStream outTranscript = null;
       if (this.OUTPUT_TRANSCRIPT_LEVEL!=null) {
		outTranscript = new ErrorCheckingPrintStream(IOUtil.openFileForWriting(OUTPUT_TRANSCRIPT_LEVEL));
		writeHeaderTranscript(outTranscript);
       }

       FastaSequenceFileWriter  outSequence = null;

       if (this.OUTPUT_TRANSCRIPT_SEQUENCES!=null) {
       	IOUtil.assertFileIsWritable(this.OUTPUT_TRANSCRIPT_SEQUENCES);
		outSequence = new FastaSequenceFileWriter (this.OUTPUT_TRANSCRIPT_SEQUENCES);
       }
       ReferenceSequenceFileWalker refFileWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);

       SAMSequenceDictionary dict= refFileWalker.getSequenceDictionary();
       if (dict==null) {
       	CloserUtil.close(refFileWalker);
       	throw new IllegalArgumentException("Reference file" + this.REFERENCE_SEQUENCE.getAbsolutePath()+" is missing a dictionary file [.dict].  Please make one!");
       }

       OverlapDetector<Gene> geneOverlapDetector= GeneAnnotationReader.loadAnnotationsFile(ANNOTATIONS_FILE, dict);

       List<SAMSequenceRecord> records = dict.getSequences();

	for (SAMSequenceRecord record: records) {
		String seqName = record.getSequenceName();
		int seqIndex=dict.getSequenceIndex(seqName);
		ReferenceSequence fastaRef=refFileWalker.get(seqIndex);

		// get the genes for this contig.
		Interval i = new Interval(seqName, 1, record.getSequenceLength());
		Collection< Gene> genes = geneOverlapDetector.getOverlaps(i);
		for (Gene g: genes) {
			List<GCResult> gcList = calculateGCContentGene(g, fastaRef, dict);
			if (this.OUTPUT_TRANSCRIPT_LEVEL!=null)
				writeResultTranscript(gcList, outTranscript);
			GCIsoformSummary summary = new GCIsoformSummary(g, gcList);
			if (this.OUTPUT_TRANSCRIPT_SEQUENCES!=null)
				writeTranscriptSequence(g, fastaRef, dict, outSequence);

			GCResult gc = calculateGCContentUnionExons(g, fastaRef, dict);

			writeResult(gc, summary, out);
		}
	}
	CloserUtil.close(refFileWalker);
	CloserUtil.close(out);
	if (this.OUTPUT_TRANSCRIPT_LEVEL!=null) CloserUtil.close(outTranscript);
	if (this.OUTPUT_TRANSCRIPT_SEQUENCES!=null) CloserUtil.close(outSequence);
       return 0;
}

Source File: AbstractAlignmentMerger.java From gatk with BSD 3-Clause "New" or "Revised" License

4 votes

/**
 * Constructor
 *
 * @param unmappedBamFile                   The BAM file that was used as the input to the aligner, which will
 *                                          include info on all the reads that did not map.  Required.
 * @param targetBamFile                     The file to which to write the merged SAM records. Required.
 * @param referenceFasta                    The reference sequence for the map files. Required.
 * @param clipAdapters                      Whether adapters marked in unmapped BAM file should be marked as
 *                                          soft clipped in the merged bam. Required.
 * @param bisulfiteSequence                 Whether the reads are bisulfite sequence (used when calculating the
 *                                          NM and UQ tags). Required.
 * @param alignedReadsOnly                  Whether to output only those reads that have alignment data
 * @param programRecord                     Program record for target file SAMRecords created.
 * @param attributesToRetain                private attributes from the alignment record that should be
 *                                          included when merging.  This overrides the exclusion of
 *                                          attributes whose tags start with the reserved characters
 *                                          of X, Y, and Z
 * @param attributesToRemove                attributes from the alignment record that should be
 *                                          removed when merging.  This overrides attributesToRetain if they share
 *                                          common tags.
 * @param read1BasesTrimmed                 The number of bases trimmed from start of read 1 prior to alignment.  Optional.
 * @param read2BasesTrimmed                 The number of bases trimmed from start of read 2 prior to alignment.  Optional.
 * @param expectedOrientations              A List of SamPairUtil.PairOrientations that are expected for
 *                                          aligned pairs.  Used to determine the properPair flag.
 * @param sortOrder                         The order in which the merged records should be output.  If null,
 *                                          output will be coordinate-sorted
 * @param primaryAlignmentSelectionStrategy What to do when there are multiple primary alignments, or multiple
 *                                          alignments but none primary, for a read or read pair.
 * @param addMateCigar                      True if we are to add or maintain the mate CIGAR (MC) tag, false if we are to remove or not include.
 */
public AbstractAlignmentMerger(final File unmappedBamFile, final File targetBamFile,
                               final File referenceFasta, final boolean clipAdapters,
                               final boolean bisulfiteSequence, final boolean alignedReadsOnly,
                               final SAMProgramRecord programRecord, final List<String> attributesToRetain,
                               final List<String> attributesToRemove,
                               final Integer read1BasesTrimmed, final Integer read2BasesTrimmed,
                               final List<SamPairUtil.PairOrientation> expectedOrientations,
                               final SAMFileHeader.SortOrder sortOrder,
                               final PrimaryAlignmentSelectionStrategy primaryAlignmentSelectionStrategy,
                               final boolean addMateCigar) {
    IOUtil.assertFileIsReadable(unmappedBamFile);
    IOUtil.assertFileIsWritable(targetBamFile);
    IOUtil.assertFileIsReadable(referenceFasta);

    this.unmappedBamFile = unmappedBamFile;
    this.targetBamFile = targetBamFile;
    this.referenceFasta = referenceFasta;

    this.refSeq = new ReferenceSequenceFileWalker(referenceFasta);
    this.sequenceDictionary = refSeq.getSequenceDictionary();
    if (this.sequenceDictionary == null) {
        throw new UserException("No sequence dictionary found for " + referenceFasta.getAbsolutePath() +
                ".  Use CreateSequenceDictionary.jar to create a sequence dictionary.");
    }

    this.clipAdapters = clipAdapters;
    this.bisulfiteSequence = bisulfiteSequence;
    this.alignedReadsOnly = alignedReadsOnly;

    this.header = new SAMFileHeader();
    this.sortOrder = sortOrder != null ? sortOrder : SAMFileHeader.SortOrder.coordinate;
    header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
    if (programRecord != null) {
        setProgramRecord(programRecord);
    }
    header.setSequenceDictionary(this.sequenceDictionary);
    if (attributesToRetain != null) {
        this.attributesToRetain.addAll(attributesToRetain);
    }
    if (attributesToRemove != null) {
        this.attributesToRemove.addAll(attributesToRemove);
        // attributesToRemove overrides attributesToRetain
        if (!this.attributesToRetain.isEmpty()) {
            for (String attribute : this.attributesToRemove) {
                if (this.attributesToRetain.contains(attribute)) {
                    logger.info("Overriding retaining the " + attribute + " tag since remove overrides retain.");
                    this.attributesToRetain.remove(attribute);
                }
            }
        }
    }
    this.read1BasesTrimmed = read1BasesTrimmed;
    this.read2BasesTrimmed = read2BasesTrimmed;
    this.expectedOrientations = expectedOrientations;

    this.primaryAlignmentSelectionStrategy = primaryAlignmentSelectionStrategy;

    this.addMateCigar = addMateCigar;
}

Source File: CollectWgsMetricsTestUtils.java From picard with MIT License

4 votes

static ReferenceSequenceFileWalker getReferenceSequenceFileWalker(){
    String referenceString = ">ref\nACCTACGTTCAATATTCTTC";
    return getReferenceSequenceFileWalker(referenceString);
}

Source File: CollectWgsMetricsTestUtils.java From picard with MIT License

4 votes

static ReferenceSequenceFileWalker getReferenceSequenceFileWalker(final String referenceString){
    ReferenceSequenceFile referenceSequenceFile = createReferenceSequenceFile(referenceString);
    return new ReferenceSequenceFileWalker(referenceSequenceFile);
}

Source File: CollectSamErrorMetricsTest.java From picard with MIT License

4 votes

@Test(dataProvider = "provideForTestIndelErrors")
public void testIndelErrors(final String[] readCigars, final String errorSubscript, IndelErrorMetric expectedMetric) throws IOException {

    final File temp = File.createTempFile("Indels", ".bam");
    temp.deleteOnExit();

    final int priorQ = 30;

    try (final ReferenceSequenceFileWalker referenceSequenceFileWalker =
                    new ReferenceSequenceFileWalker(CommandLineProgramTest.CHR_M_REFERENCE)) {

        final SAMRecordSetBuilder builder = new SAMRecordSetBuilder();
        builder.getHeader().setSequenceDictionary(referenceSequenceFileWalker.getSequenceDictionary());

        for (int i = 0; i < readCigars.length; i++) {
            // 10M2I3M4D10M5I
            builder.addFrag("Read" + i, 0, 100, false, false, readCigars[i], null, 30);
        }

        try (final SAMFileWriter writer = new SAMFileWriterFactory()
                .setCompressionLevel(2)
                .makeBAMWriter(builder.getHeader(), false, temp)) {
            builder.forEach(writer::addAlignment);
        }
    }

    final File referenceFile = CHR_M_REFERENCE;
    final File vcf = new File(TEST_DIR, "NIST.selected.vcf");

    final File outputBaseFileName = new File(OUTPUT_DATA_PATH, "test");
    final File errorByAll = new File(outputBaseFileName.getAbsolutePath() + errorSubscript);
    errorByAll.deleteOnExit();
    outputBaseFileName.deleteOnExit();

    final String[] args = {
            "INPUT=" + temp,
            "OUTPUT=" + outputBaseFileName,
            "REFERENCE_SEQUENCE=" + referenceFile.getAbsolutePath(),
            "ERROR_METRICS=" + "INDEL_ERROR",
            "ERROR_METRICS=" + "INDEL_ERROR:INDEL_LENGTH",
            "VCF=" + vcf.getAbsolutePath()
    };

    CollectSamErrorMetrics collectSamErrorMetrics = new CollectSamErrorMetrics();
    collectSamErrorMetrics.ERROR_METRICS.clear();

    Assert.assertEquals(collectSamErrorMetrics.instanceMain(args), 0);

    ErrorMetric.setPriorError(QualityUtil.getErrorProbabilityFromPhredScore(priorQ));
    expectedMetric.calculateDerivedFields();

    // Note that soft clipped bases are not counted
    List<IndelErrorMetric> metrics = MetricsFile.readBeans(errorByAll);

    IndelErrorMetric metric = metrics
            .stream()
            .filter(m -> m.COVARIATE.equals(expectedMetric.COVARIATE))
            .findAny()
            .orElseThrow(() -> new AssertionError("didn't find metric with COVARIATE==" + expectedMetric.COVARIATE));

    Assert.assertEquals(metric, expectedMetric);
}

Source File: CollectWgsMetrics.java From picard with MIT License

4 votes

private <T extends AbstractRecordAndOffset> WgsMetricsProcessorImpl<T> getWgsMetricsProcessor(
        ProgressLogger progress, ReferenceSequenceFileWalker refWalker,
        AbstractLocusIterator<T, AbstractLocusInfo<T>> iterator, AbstractWgsMetricsCollector<T> collector) {
    return new WgsMetricsProcessorImpl<>(iterator, refWalker, collector, progress);
}

Source File: CollectWgsMetrics.java From picard with MIT License

4 votes

@Override
protected int doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);
    IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
    INTERVALS = intervalArugmentCollection.getIntervalFile();
    if (INTERVALS != null) {
        IOUtil.assertFileIsReadable(INTERVALS);
    }
    if (THEORETICAL_SENSITIVITY_OUTPUT != null) {
        IOUtil.assertFileIsWritable(THEORETICAL_SENSITIVITY_OUTPUT);
    }

    // it doesn't make sense for the locus accumulation cap to be lower than the coverage cap
    if (LOCUS_ACCUMULATION_CAP < COVERAGE_CAP) {
        log.warn("Setting the LOCUS_ACCUMULATION_CAP to be equal to the COVERAGE_CAP (" + COVERAGE_CAP + ") because it should not be lower");
        LOCUS_ACCUMULATION_CAP = COVERAGE_CAP;
    }

    // Setup all the inputs
    final ProgressLogger progress = new ProgressLogger(log, 10000000, "Processed", "loci");
    final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
    final SamReader in = getSamReader();
    final AbstractLocusIterator iterator = getLocusIterator(in);

    final List<SamRecordFilter> filters = new ArrayList<>();
    final CountingFilter adapterFilter = new CountingAdapterFilter();
    final CountingFilter mapqFilter = new CountingMapQFilter(MINIMUM_MAPPING_QUALITY);
    final CountingFilter dupeFilter = new CountingDuplicateFilter();
    final CountingPairedFilter pairFilter = new CountingPairedFilter();
    // The order in which filters are added matters!
    filters.add(new SecondaryAlignmentFilter()); // Not a counting filter because we never want to count reads twice
    filters.add(adapterFilter);
    filters.add(mapqFilter);
    filters.add(dupeFilter);
    if (!COUNT_UNPAIRED) {
        filters.add(pairFilter);
    }
    iterator.setSamFilters(filters);
    iterator.setMappingQualityScoreCutoff(0); // Handled separately because we want to count bases
    iterator.setIncludeNonPfReads(false);

    final AbstractWgsMetricsCollector<?> collector = getCollector(COVERAGE_CAP, getIntervalsToExamine());
    final WgsMetricsProcessor processor = getWgsMetricsProcessor(progress, refWalker, iterator, collector);
    processor.processFile();

    final MetricsFile<WgsMetrics, Integer> out = getMetricsFile();
    processor.addToMetricsFile(out, INCLUDE_BQ_HISTOGRAM, dupeFilter, adapterFilter, mapqFilter, pairFilter);
    out.write(OUTPUT);

    if (THEORETICAL_SENSITIVITY_OUTPUT != null) {
        // Write out theoretical sensitivity results.
        final MetricsFile<TheoreticalSensitivityMetrics, ?> theoreticalSensitivityMetrics = getMetricsFile();
        log.info("Calculating theoretical sentitivity at " + ALLELE_FRACTION.size() + " allele fractions.");
        List<TheoreticalSensitivityMetrics> tsm = TheoreticalSensitivity.calculateSensitivities(SAMPLE_SIZE, collector.getUnfilteredDepthHistogram(), collector.getUnfilteredBaseQHistogram(), ALLELE_FRACTION);
        theoreticalSensitivityMetrics.addAllMetrics(tsm);
        theoreticalSensitivityMetrics.write(THEORETICAL_SENSITIVITY_OUTPUT);
    }

    return 0;
}

Source File: CollectRrbsMetrics.java From picard with MIT License

4 votes

@Override
protected int doWork() {
    if (!METRICS_FILE_PREFIX.endsWith(".")) {
        METRICS_FILE_PREFIX = METRICS_FILE_PREFIX + ".";
    }
    final File SUMMARY_OUT = new File(METRICS_FILE_PREFIX + SUMMARY_FILE_EXTENSION);
    final File DETAILS_OUT = new File(METRICS_FILE_PREFIX + DETAIL_FILE_EXTENSION);
    final File PLOTS_OUT = new File(METRICS_FILE_PREFIX + PDF_FILE_EXTENSION);
    assertIoFiles(SUMMARY_OUT, DETAILS_OUT, PLOTS_OUT);

    final SamReader samReader = SamReaderFactory.makeDefault().open(INPUT);
    if (!ASSUME_SORTED && samReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
        throw new PicardException("The input file " + INPUT.getAbsolutePath() + " does not appear to be coordinate sorted");
    }

    final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
    final ProgressLogger progressLogger = new ProgressLogger(log);

    final RrbsMetricsCollector metricsCollector = new RrbsMetricsCollector(METRIC_ACCUMULATION_LEVEL, samReader.getFileHeader().getReadGroups(),
            C_QUALITY_THRESHOLD, NEXT_BASE_QUALITY_THRESHOLD, MINIMUM_READ_LENGTH, MAX_MISMATCH_RATE);

    for (final SAMRecord samRecord : samReader) {
        progressLogger.record(samRecord);
        if (!samRecord.getReadUnmappedFlag() && !isSequenceFiltered(samRecord.getReferenceName())) {
            final ReferenceSequence referenceSequence = refWalker.get(samRecord.getReferenceIndex());
            metricsCollector.acceptRecord(samRecord, referenceSequence);
        }
    }
    metricsCollector.finish();
    final MetricsFile<RrbsMetrics, Comparable<?>> rrbsMetrics = getMetricsFile();
    metricsCollector.addAllLevelsToFile(rrbsMetrics);

    // Using RrbsMetrics as a way to get both of the metrics objects through the MultiLevelCollector. Once
    // we get it out split it apart to the two separate MetricsFiles and write them to file
    final MetricsFile<RrbsSummaryMetrics, ?> summaryFile = getMetricsFile();
    final MetricsFile<RrbsCpgDetailMetrics, ?> detailsFile = getMetricsFile();
    for (final RrbsMetrics rrbsMetric : rrbsMetrics.getMetrics()) {
        summaryFile.addMetric(rrbsMetric.getSummaryMetrics());
        for (final RrbsCpgDetailMetrics detailMetric : rrbsMetric.getDetailMetrics()) {
            detailsFile.addMetric(detailMetric);
        }
    }
    summaryFile.write(SUMMARY_OUT);
    detailsFile.write(DETAILS_OUT);
    RExecutor.executeFromClasspath(R_SCRIPT, DETAILS_OUT.getAbsolutePath(), SUMMARY_OUT.getAbsolutePath(), PLOTS_OUT.getAbsolutePath());
    CloserUtil.close(samReader);
    return 0;
}

Source File: CollectSamErrorMetrics.java From picard with MIT License

4 votes

/**
 * Creates the {@link SamLocusAndReferenceIterator} object to use to traverse the input files.
 * Also initializes the {@link #sequenceDictionary} and performs some checks on the output
 * files in {@link #aggregatorList} to make sure we can write our output.
 * @param sam an open {@link SamReader} from which to pull reads.
 * @param referenceSequenceFileWalker an open {@link ReferenceSequenceFileWalker} from which to get reference sequence information.
 * @return An open {@link SamLocusAndReferenceIterator} ready to iterate.
 */
private SamLocusAndReferenceIterator createSamLocusAndReferenceIterator(final SamReader sam, final ReferenceSequenceFileWalker referenceSequenceFileWalker) {

    sequenceDictionary = referenceSequenceFileWalker.getSequenceDictionary();
    if (sam.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
        throw new PicardException("Input BAM must be sorted by coordinate");
    }

    // Make sure our reference and reads have the same sequence dictionary:
    sequenceDictionary.assertSameDictionary(sam.getFileHeader().getSequenceDictionary());

    final IntervalList regionOfInterest = getIntervals(sequenceDictionary);

    log.info("Getting SamLocusIterator");

    final SamLocusIterator samLocusIterator = new SamLocusIterator(sam, regionOfInterest);

    // We want to know about indels:
    samLocusIterator.setIncludeIndels(true);

    // Now go through and compare information at each of these sites
    samLocusIterator.setEmitUncoveredLoci(false);
    samLocusIterator.setMappingQualityScoreCutoff(MIN_MAPPING_Q);
    samLocusIterator.setQualityScoreCutoff(MIN_BASE_Q);

    log.info("Using " + aggregatorList.size() + " aggregators.");

    aggregatorList.forEach(la ->
            IOUtil.assertFileIsWritable(new File(OUTPUT + la.getSuffix())));

    // iterate over loci
    log.info("Starting iteration over loci");

    final SamLocusAndReferenceIterator iterator = new SamLocusAndReferenceIterator(referenceSequenceFileWalker, samLocusIterator);

    // This hasNext() call has side-effects. It loads up the index and makes sure that
    // the iterator is really ready for
    // action. Calling this allows for the logging to be more accurate.
    iterator.hasNext();
    return iterator;
}

Source File: AbstractAlignmentMerger.java From picard with MIT License

4 votes

protected void resetRefSeqFileWalker() {
    this.refSeq = new ReferenceSequenceFileWalker(referenceFasta);
}

Source File: AbstractAlignmentMerger.java From picard with MIT License

4 votes

/**
 * Constructor
 *
 * @param unmappedBamFile                   The BAM file that was used as the input to the aligner, which will
 *                                          include info on all the reads that did not map.  Required.
 * @param targetBamFile                     The file to which to write the merged SAM records. Required.
 * @param referenceFasta                    The reference sequence for the map files. Required.
 * @param clipAdapters                      Whether adapters marked in unmapped BAM file should be marked as
 *                                          soft clipped in the merged bam. Required.
 * @param bisulfiteSequence                 Whether the reads are bisulfite sequence (used when calculating the
 *                                          NM and UQ tags). Required.
 * @param alignedReadsOnly                  Whether to output only those reads that have alignment data
 * @param programRecord                     Program record for target file SAMRecords created.
 * @param attributesToRetain                private attributes from the alignment record that should be
 *                                          included when merging.  This overrides the exclusion of
 *                                          attributes whose tags start with the reserved characters
 *                                          of X, Y, and Z
 * @param attributesToRemove                attributes from the alignment record that should be
 *                                          removed when merging.  This overrides attributesToRetain if they share
 *                                          common tags.
 * @param read1BasesTrimmed                 The number of bases trimmed from start of read 1 prior to alignment.  Optional.
 * @param read2BasesTrimmed                 The number of bases trimmed from start of read 2 prior to alignment.  Optional.
 * @param expectedOrientations              A List of SamPairUtil.PairOrientations that are expected for
 *                                          aligned pairs.  Used to determine the properPair flag.
 * @param sortOrder                         The order in which the merged records should be output.  If null,
 *                                          output will be coordinate-sorted
 * @param primaryAlignmentSelectionStrategy What to do when there are multiple primary alignments, or multiple
 *                                          alignments but none primary, for a read or read pair.
 * @param addMateCigar                      True if we are to add or maintain the mate CIGAR (MC) tag, false if we are to remove or not include.
 * @param unmapContaminantReads             If true, identify reads having the signature of cross-species contamination (i.e. mostly clipped bases),
 *                                          and mark them as unmapped.
 * @param unmappingReadsStrategy            An enum describing how to deal with reads whose mapping information are being removed (currently this happens due to cross-species
 *                                          contamination). Ignored unless unmapContaminantReads is true.
 */
public AbstractAlignmentMerger(final File unmappedBamFile, final File targetBamFile,
                               final File referenceFasta, final boolean clipAdapters,
                               final boolean bisulfiteSequence, final boolean alignedReadsOnly,
                               final SAMProgramRecord programRecord, final List<String> attributesToRetain,
                               final List<String> attributesToRemove,
                               final Integer read1BasesTrimmed, final Integer read2BasesTrimmed,
                               final List<SamPairUtil.PairOrientation> expectedOrientations,
                               final SortOrder sortOrder,
                               final PrimaryAlignmentSelectionStrategy primaryAlignmentSelectionStrategy,
                               final boolean addMateCigar,
                               final boolean unmapContaminantReads,
                               final UnmappingReadStrategy unmappingReadsStrategy) {
    IOUtil.assertFileIsReadable(unmappedBamFile);
    IOUtil.assertFileIsWritable(targetBamFile);
    IOUtil.assertFileIsReadable(referenceFasta);

    this.unmappedBamFile = unmappedBamFile;
    this.targetBamFile = targetBamFile;
    this.referenceFasta = referenceFasta;

    this.refSeq = new ReferenceSequenceFileWalker(referenceFasta);

    this.clipAdapters = clipAdapters;
    this.bisulfiteSequence = bisulfiteSequence;
    this.alignedReadsOnly = alignedReadsOnly;

    this.header = new SAMFileHeader();
    this.sortOrder = sortOrder != null ? sortOrder : SortOrder.coordinate;
    header.setSortOrder(SortOrder.coordinate);
    if (programRecord != null) {
        setProgramRecord(programRecord);
    }

    if (attributesToRetain != null) {
        this.attributesToRetain.addAll(attributesToRetain);
    }
    if (attributesToRemove != null) {
        this.attributesToRemove.addAll(attributesToRemove);
        // attributesToRemove overrides attributesToRetain
        if (!this.attributesToRetain.isEmpty()) {
            this.attributesToRemove.stream()
                    .filter(this.attributesToRetain::contains)
                    .peek(a->log.info("Overriding retaining the " + a + " tag since 'remove' overrides 'retain'."))
                    .forEach(this.attributesToRetain::remove);
        }
    }
    this.read1BasesTrimmed = read1BasesTrimmed;
    this.read2BasesTrimmed = read2BasesTrimmed;
    this.expectedOrientations = expectedOrientations;

    this.primaryAlignmentSelectionStrategy = primaryAlignmentSelectionStrategy;

    this.addMateCigar = addMateCigar;
    this.unmapContaminantReads = unmapContaminantReads;
    this.unmappingReadsStrategy = unmappingReadsStrategy;
}

Source File: HetPulldownCalculator.java From gatk-protected with BSD 3-Clause "New" or "Revised" License

4 votes

/**
 * For a normal or tumor sample, returns a data structure giving (intervals, reference counts, alternate counts),
 * where intervals give positions of likely heterozygous SNP sites.
 *
 * <p>
 *     For a normal sample:
 *     <ul>
 *         The IntervalList snpIntervals gives common SNP sites in 1-based format.
 *     </ul>
 *     <ul>
 *         The p-value threshold must be specified for a two-sided binomial test,
 *         which is used to determine SNP sites from snpIntervals that are
 *         compatible with a heterozygous SNP, given the sample.  Only these sites are output.
 *     </ul>
 * </p>
 * <p>
 *     For a tumor sample:
 *     <ul>
 *         The IntervalList snpIntervals gives heterozygous SNP sites likely to be present in the normal sample.
 *         This should be from {@link HetPulldownCalculator#getNormal} in 1-based format.
 *         Only these sites are output.
 *     </ul>
 * </p>
 * @param bamFile           sorted BAM file for sample
 * @param snpIntervals      IntervalList of SNP sites
 * @param sampleType        flag indicating type of sample (SampleType.NORMAL or SampleType.TUMOR)
 *                          (determines whether to perform binomial test)
 * @param pvalThreshold     p-value threshold for two-sided binomial test, used for normal sample
 * @param minimumRawReads   minimum number of total reads that must be present at a het site
 * @return                  Pulldown of heterozygous SNP sites in 1-based format
 */
private Pulldown getHetPulldown(final File bamFile, final IntervalList snpIntervals, final SampleType sampleType,
                                final double pvalThreshold, final int minimumRawReads) {
    try (final SamReader bamReader = SamReaderFactory.makeDefault().validationStringency(validationStringency)
            .referenceSequence(refFile).open(bamFile);
         final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(refFile)) {
        if (bamReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
            throw new UserException.BadInput("BAM file " + bamFile.toString() + " must be coordinate sorted.");
        }

        final Pulldown hetPulldown = new Pulldown(bamReader.getFileHeader());

        final int totalNumberOfSNPs = snpIntervals.size();
        final SamLocusIterator locusIterator = new SamLocusIterator(bamReader, snpIntervals,
                totalNumberOfSNPs < MAX_INTERVALS_FOR_INDEX);

        //set read and locus filters [note: read counts match IGV, but off by a few from pysam.mpileup]
        final List<SamRecordFilter> samFilters = Arrays.asList(new NotPrimaryAlignmentFilter(),
                new DuplicateReadFilter());
        locusIterator.setSamFilters(samFilters);
        locusIterator.setEmitUncoveredLoci(false);
        locusIterator.setIncludeNonPfReads(false);
        locusIterator.setMappingQualityScoreCutoff(minMappingQuality);
        locusIterator.setQualityScoreCutoff(minBaseQuality);

        logger.info("Examining " + totalNumberOfSNPs + " sites in total...");
        int locusCount = 0;
        for (final SamLocusIterator.LocusInfo locus : locusIterator) {
            if (locusCount % NUMBER_OF_SITES_PER_LOGGED_STATUS_UPDATE == 0) {
                logger.info("Examined " + locusCount + " covered sites.");
            }
            locusCount++;

            //include N, etc. reads here
            final int totalReadCount = locus.getRecordAndOffsets().size();
            if (totalReadCount < minimumRawReads) {
                continue;
            }

            final Nucleotide.Counter baseCounts = getPileupBaseCounts(locus);
            //only include total ACGT counts in binomial test (exclude N, etc.)
            final int totalBaseCount = Arrays.stream(BASES).mapToInt(b -> (int) baseCounts.get(b)).sum();

            if (sampleType == SampleType.NORMAL &&
                    !isPileupHetCompatible(baseCounts, totalBaseCount, pvalThreshold)) {
                continue;
            }

            final Nucleotide refBase = Nucleotide.valueOf(refWalker.get(locus.getSequenceIndex()).getBases()[locus.getPosition() - 1]);
            final int refReadCount = (int) baseCounts.get(refBase);
            final int altReadCount = totalBaseCount - refReadCount;

            hetPulldown.add(new AllelicCount(
                    new SimpleInterval(locus.getSequenceName(), locus.getPosition(), locus.getPosition()),
                    refReadCount, altReadCount));
        }
        logger.info(locusCount + " covered sites out of " + totalNumberOfSNPs + " total sites were examined.");
        return hetPulldown;
    } catch (final IOException | SAMFormatException e) {
        throw new UserException(e.getMessage());
    }
}

Source File: BayesianHetPulldownCalculator.java From gatk-protected with BSD 3-Clause "New" or "Revised" License

4 votes

/**
 * For a given normal or tumor BAM file, walks through the list of common SNPs,
 * {@link BayesianHetPulldownCalculator#snpIntervals}), detects heterozygous sites, and returns
 * a {@link Pulldown} containing detailed information on the called heterozygous SNP sites.
 *
 * The {@code hetCallingStrigency} parameters sets the threshold posterior for calling a Het SNP site:
 *
 *      hetPosteriorThreshold = 1 - 10^{-hetCallingStringency}
 *      hetThresholdLogOdds = log(hetPosteriorThreshold/(1-hetPosteriorThreshold))
 *                          = log(10^{hetCallingStringency} - 1)
 *
 * (see CNV-methods.pdf for details)
 *
 * @param bamFile sorted BAM file for sample
 * @param hetCallingStringency strigency for calling a Het site
 * @return Pulldown of heterozygous SNP sites in 1-based format
 */
public Pulldown getHetPulldown(final File bamFile, final double hetCallingStringency) {
    /* log odds from stringency */
    final double hetThresholdLogOdds = FastMath.log(FastMath.pow(10, hetCallingStringency) - 1);

    try (final SamReader bamReader = SamReaderFactory.makeDefault().validationStringency(validationStringency)
            .referenceSequence(refFile).open(bamFile);
         final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(refFile)) {
        if (bamReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
            throw new UserException.BadInput("BAM file " + bamFile.toString() + " must be coordinate sorted.");
        }

        final Pulldown hetPulldown = new Pulldown(bamReader.getFileHeader());
        final SamLocusIterator locusIterator = getSamLocusIteratorWithDefaultFilters(bamReader);

        final int totalNumberOfSNPs = snpIntervals.size();
        logger.info("Examining " + totalNumberOfSNPs + " sites in total...");
        int locusCount = 0;
        for (final SamLocusIterator.LocusInfo locus : locusIterator) {
            if (locusCount % NUMBER_OF_SITES_PER_LOGGED_STATUS_UPDATE == 0) {
                logger.info("Examined " + locusCount + " covered sites.");
            }
            locusCount++;

            final int totalReadCount = locus.getRecordAndOffsets().size();
            if (totalReadCount <= readDepthThreshold) {
                continue;
            }
            final Nucleotide refBase = Nucleotide.valueOf(refWalker.get(locus.getSequenceIndex())
                    .getBases()[locus.getPosition() - 1]);
            if (!isProperBase(refBase)) {
                logger.warn(String.format("The reference position at %d has an unknown base call (value: %s). Even though" +
                        " this position is indicated to be a possible heterozygous SNP in the provided SNP interval list," +
                        " no inference can be made. Continuing ...", locus.getPosition(), refBase.toString()));
                continue;
            }

            final Map<Nucleotide, List<BaseQuality>> baseQualities = getPileupBaseQualities(locus);
            final Nucleotide altBase = inferAltFromPileup(baseQualities, refBase);

            /* calculate Het log odds */
            final double hetLogLikelihood = getHetLogLikelihood(baseQualities, refBase, altBase);
            final double homLogLikelihood = getHomLogLikelihood(baseQualities, refBase, altBase,
                    DEFAULT_PRIOR_REF_HOM);
            final double hetLogOdds = (hetLogLikelihood + FastMath.log(DEFAULT_PRIOR_HET)) -
                    (homLogLikelihood + FastMath.log(1 - DEFAULT_PRIOR_HET));

            if (hetLogOdds > hetThresholdLogOdds) {
                hetPulldown.add(new AllelicCount(
                        new SimpleInterval(locus.getSequenceName(), locus.getPosition(), locus.getPosition()),
                        baseQualities.get(refBase).size(), baseQualities.get(altBase).size(),
                        refBase, altBase, totalReadCount, hetLogOdds));
            }
        }

        logger.info(locusCount + " covered sites out of " + totalNumberOfSNPs + " total sites were examined.");

        return hetPulldown;

    } catch (final IOException | SAMFormatException e) {
        throw new UserException(e.getMessage());
    }
}

Source File: AbstractAlignmentMerger.java From picard with MIT License

3 votes

/** Calculates and sets UQ tag from the record and the reference
 *
 * @param record the record to be fixed
 * @param refSeqWalker a ReferenceSequenceWalker that will be used to traverse the reference
 * @param isBisulfiteSequence a flag indicating whether the sequence came from bisulfite-sequencing.
 *
 * No return value, modifies the provided record.
 */
public static void fixUq(final SAMRecord record, final ReferenceSequenceFileWalker refSeqWalker, final boolean isBisulfiteSequence) {
    if (record.getBaseQualities() != SAMRecord.NULL_QUALS) {
        final byte[] referenceBases = refSeqWalker.get(record.getReferenceIndex()).getBases();
        record.setAttribute(SAMTag.UQ.name(), SequenceUtil.sumQualitiesOfMismatches(record, referenceBases, 0, isBisulfiteSequence));
    }
}

htsjdk.samtools.reference.ReferenceSequenceFileWalker Java Examples