htsjdk.samtools.SAMSequenceRecord#getSequenceLength

Source File: RandomDNA.java From gatk with BSD 3-Clause "New" or "Revised" License

6 votes

/**
 * Generates random bases for every contig of a dictionary and writes them, FASTA formatted,
 * into a {@link Writer}.
 * <p>
 * Each contig produces a {@code >name} header line followed by as many base lines as needed;
 * every base line holds {@code basesPerLine} bases except possibly the last one of the contig.
 * </p>
 * @param out the output writer.
 * @param dict the dictionary providing the contig names and their lengths.
 * @param basesPerLine maximum number of bases printed on each line of the output.
 *
 * @throws IOException if the writer fails while appending.
 * @throws IllegalArgumentException if {@code dict} is {@code null}, or {@code out} is {@code null}
 *    or {@code basesPerLine} is 0 or negative.
 */
public void nextFasta(final Writer out, final SAMSequenceDictionary dict, final int basesPerLine)
        throws IOException {
    Utils.nonNull(out);
    Utils.nonNull(dict);
    ParamUtils.isPositive(basesPerLine, "number of base per line must be strictly positive: " + basesPerLine);
    final String eol = System.lineSeparator();
    // One reusable buffer, large enough for a full line.
    final byte[] lineBases = new byte[basesPerLine];
    for (final SAMSequenceRecord contig : dict.getSequences()) {
        int remaining = contig.getSequenceLength();
        out.append(">").append(contig.getSequenceName()).append(eol);
        while (remaining > 0) {
            // The final line of a contig may be shorter than basesPerLine.
            final int count = remaining > basesPerLine ? basesPerLine : remaining;
            nextBases(lineBases, 0, count);
            out.append(new String(lineBases, 0, count)).append(eol);
            remaining -= count;
        }
    }
}

Source File: CachingIndexedFastaSequenceFileUnitTest.java From gatk with BSD 3-Clause "New" or "Revised" License

6 votes

/**
 * Runs {@code mixedCasesTestHelper} over every contig of the example FASTA, first with the
 * {@code -1, -1} coordinates (presumably meaning "whole contig" -- confirm against the helper)
 * and then over consecutive 100-base windows, and asserts that the first kind of call reported
 * at least one mixed-case sequence overall.
 */
@Test
public void testMixedCasesInExample() throws IOException {
    final int step = 100;
    try (final IndexedFastaSequenceFile original = new IndexedFastaSequenceFile(new File(exampleFASTA));
         final CachingIndexedFastaSequenceFile casePreserving = new CachingIndexedFastaSequenceFile(IOUtils.getPath(exampleFASTA), true);
         final CachingIndexedFastaSequenceFile allUpper = new CachingIndexedFastaSequenceFile(IOUtils.getPath(exampleFASTA), CachingIndexedFastaSequenceFile.DEFAULT_CACHE_SIZE, false, true)) {

        int mixedCaseTotal = 0;
        for (final SAMSequenceRecord contig : original.getSequenceDictionary().getSequences()) {
            mixedCaseTotal += mixedCasesTestHelper(original, casePreserving, allUpper, contig.getSequenceName(), -1, -1);

            // Walk the contig in fixed-size windows; only the per-contig call above feeds the counter.
            int windowEnd = step;
            while (windowEnd < contig.getSequenceLength()) {
                mixedCasesTestHelper(original, casePreserving, allUpper, contig.getSequenceName(), windowEnd - step, windowEnd);
                windowEnd += step;
            }
        }

        Assert.assertTrue(mixedCaseTotal > 0, "No mixed cases sequences found in file.  Unexpected test state");
    }
}

Source File: SamRangeUtils.java From rtg-tools with BSD 2-Clause "Simplified" License

6 votes

/**
 * Resolves an initial range (supplied by the user, possibly with unbounded ends) against the
 * sequence it names in the supplied dictionary.
 * A missing start resolves to 0 and a missing end resolves to the sequence length. An end that
 * lies beyond the sequence length is clamped to the sequence length and a warning is issued.
 * @param dictionary the dictionary with which to validate/resolve the range
 * @param range the range
 * @return the resolved range.
 * @throws NoTalkbackSlimException if the named sequence is not in the dictionary, or if the start is out of range.
 */
public static SequenceNameLocus resolveRestriction(SAMSequenceDictionary dictionary, SequenceNameLocus range) {
  final SAMSequenceRecord record = dictionary.getSequence(range.getSequenceName());
  if (record == null) {
    throw new NoTalkbackSlimException("Sequence \"" + range.getSequenceName() + "\" referenced in region was not found in the SAM sequence dictionary.");
  }

  final int start;
  if (range.getStart() == SamRegionRestriction.MISSING) {
    start = 0;
  } else {
    start = range.getStart();
  }
  final int seqLength = record.getSequenceLength();
  // A start equal to the length is only tolerated for an empty sequence (start == 0 == length).
  final boolean startAtEndOfNonEmpty = seqLength != 0 && start == seqLength;
  if (start > seqLength || startAtEndOfNonEmpty) {
    throw new NoTalkbackSlimException("The start position \"" + start + "\" must be less than than length of the sequence \"" + seqLength + "\".");
  }

  int end;
  if (range.getEnd() == LongRange.MISSING) {
    end = seqLength;
  } else {
    end = range.getEnd();
  }
  if (end > seqLength) {
    Diagnostic.warning("The end position \"" + range.getEnd() + "\" is outside the length of the sequence (" + seqLength
      + "). Defaulting end to \"" + seqLength + "\"");
    end = seqLength;
  }
  return new SequenceNameLocusSimple(range.getSequenceName(), start, end);
}

Source File: SamRangeUtils.java From rtg-tools with BSD 2-Clause "Simplified" License

6 votes

/**
 * Checks that every sequence named in a set of reference ranges exists in the SAM header and
 * that the ranges for it do not run past the end of that sequence.
 * Sequences whose header length is not positive are not bounds-checked. Only the last range of
 * each list is compared against the length (presumably the list is ordered -- confirm against
 * <code>RangeList</code>).
 * @param header the SAM header supplying the sequence dictionary
 * @param rangeMap the ranges to validate
 * @param <T> the type of data attached to the ranges
 * @throws NoTalkbackSlimException if a sequence is missing from the dictionary or a range exceeds the sequence length.
 */
static <T> void validateRanges(SAMFileHeader header, ReferenceRanges<T> rangeMap) {
  for (final String name : rangeMap.sequenceNames()) {
    final SAMSequenceRecord record = header.getSequenceDictionary().getSequence(name);
    if (record == null) {
      throw new NoTalkbackSlimException("Sequence \"" + name + "\" referenced in regions not found in the SAM sequence dictionary.");
    }
    if (record.getSequenceLength() <= 0) {
      continue; // no usable length to check against
    }

    final RangeList<T> rangeList = rangeMap.get(name);
    if (rangeList == null) {
      continue;
    }

    final List<? extends Interval> intervals = rangeList.getRangeList();
    final Interval lastInterval = intervals.get(intervals.size() - 1);
    if (lastInterval.getEnd() > record.getSequenceLength()) {
      throw new NoTalkbackSlimException("Specified sequence range (" + record.getSequenceName() + ":" + lastInterval + ") is outside the length of the sequence (" + record.getSequenceLength() + ")");
    }
  }
}

Source File: SamUtils.java From rtg-tools with BSD 2-Clause "Simplified" License

6 votes

/**
 * Checks whether two SAM headers are compatible: they must declare the same sort order and
 * their sequence dictionaries must list the same sequence names and lengths in the same order.
 * @param fh a <code>SAMFileHeader</code> value
 * @param lh a <code>SAMFileHeader</code> value
 * @return true if the headers are compatible.
 */
public static boolean checkHeaderDictionary(final SAMFileHeader fh, final SAMFileHeader lh) {
  if (fh.getSortOrder() != lh.getSortOrder()) {
    return false;
  }
  final List<SAMSequenceRecord> firstSequences = fh.getSequenceDictionary().getSequences();
  final List<SAMSequenceRecord> secondSequences = lh.getSequenceDictionary().getSequences();
  final Iterator<SAMSequenceRecord> firstIt = firstSequences.iterator();
  final Iterator<SAMSequenceRecord> secondIt = secondSequences.iterator();

  // Walk both dictionaries in lock step over their common prefix.
  while (firstIt.hasNext() && secondIt.hasNext()) {
    final SAMSequenceRecord first = firstIt.next();
    final SAMSequenceRecord second = secondIt.next();
    if (!first.getSequenceName().equals(second.getSequenceName())) {
      return false;
    }
    if (first.getSequenceLength() != second.getSequenceLength()) {
      return false;
    }
  }
  // Compatible only when neither dictionary has entries left over.
  return !firstIt.hasNext() && !secondIt.hasNext();
}

Source File: PicardIndexedFastaSequenceFile.java From chipster with MIT License

6 votes

/**
 * Performs basic consistency checks between a sequence dictionary and a FASTA index: both must
 * hold the same number of contigs and, position by position, the same contig names and lengths.
 * @param fastaFile Used for error reporting only.
 * @param sequenceDictionary sequence dictionary to check against the index.
 * @param index index file to check against the dictionary.
 * @throws SAMException if the contig counts differ, or a contig's name or length does not match.
 */
protected static void sanityCheckDictionaryAgainstIndex(final String fastaFile,
                                                        final SAMSequenceDictionary sequenceDictionary,
                                                        final FastaSequenceIndex index) {
    // Make sure dictionary and index are the same size.
    if (sequenceDictionary.getSequences().size() != index.size()) {
        throw new SAMException("Sequence dictionary and index contain different numbers of contigs");
    }

    final Iterator<SAMSequenceRecord> dictionaryEntries = sequenceDictionary.getSequences().iterator();
    final Iterator<FastaSequenceIndexEntry> indexEntries = index.iterator();

    while (dictionaryEntries.hasNext() && indexEntries.hasNext()) {
        final SAMSequenceRecord dictionaryEntry = dictionaryEntries.next();
        final FastaSequenceIndexEntry indexEntry = indexEntries.next();

        // Names must agree at each position.
        final boolean sameName = dictionaryEntry.getSequenceName().equals(indexEntry.getContig());
        if (!sameName) {
            throw new SAMException(String.format("Mismatch between sequence dictionary fasta index for %s, sequence '%s' != '%s'.",
                    fastaFile, dictionaryEntry.getSequenceName(), indexEntry.getContig()));
        }

        // Make sure sequence length matches index length.
        if (dictionaryEntry.getSequenceLength() != indexEntry.getSize()) {
            throw new SAMException("Index length does not match dictionary length for contig: " + dictionaryEntry.getSequenceName() );
        }
    }
}

Source File: FindBadGenomicKmersSparkUnitTest.java From gatk with BSD 3-Clause "New" or "Revised" License

6 votes

/**
 * Cross-checks {@code FindBadGenomicKmersSpark.findBadGenomicKmers} against a direct count:
 * canonical kmers of every contig in the test reference are tallied locally, those occurring
 * no more than {@code MAX_KMER_FREQ} times are discarded, and the survivors must equal the
 * set reported by the Spark implementation (which must also contain no duplicates).
 */
@Test(groups = "sv")
public void miniRefTest() throws IOException {
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final ReferenceMultiSparkSource ref = new ReferenceMultiSparkSource(
            REFERENCE_FILE_NAME, ReferenceWindowFunctions.IDENTITY_FUNCTION);
    final SAMSequenceDictionary dict = ref.getReferenceSequenceDictionary(null);
    if ( dict == null ) {
        throw new GATKException("No reference dictionary available.");
    }

    // Expected result: occurrence count per canonical kmer across all contigs.
    final Map<SVKmer, Long> kmerCounts = new LinkedHashMap<>();
    for ( final SAMSequenceRecord contig : dict.getSequences() ) {
        final SimpleInterval wholeContig = new SimpleInterval(contig.getSequenceName(), 1, contig.getSequenceLength());
        final byte[] contigBases = ref.getReferenceBases(wholeContig).getBases();
        SVKmerizer.canonicalStream(contigBases, KMER_SIZE, new SVKmerLong())
                .forEach(kmer -> kmerCounts.merge(kmer, 1L, Long::sum));
    }
    // Keep only the kmers that are more frequent than the threshold.
    kmerCounts.entrySet().removeIf( entry -> entry.getValue() <= FindBadGenomicKmersSpark.MAX_KMER_FREQ);

    final List<SVKmer> badKmers =
            FindBadGenomicKmersSpark.findBadGenomicKmers(ctx, KMER_SIZE, Integer.MAX_VALUE, ref, null);
    final Set<SVKmer> uniqueBadKmers = new HashSet<>(badKmers);
    Assert.assertEquals(badKmers.size(), uniqueBadKmers.size());
    Assert.assertEquals(uniqueBadKmers, kmerCounts.keySet());
}

Source File: SamRangeUtils.java From rtg-tools with BSD 2-Clause "Simplified" License

5 votes

/**
 * Builds a reference range lookup in which every sequence of the header that has a positive
 * length is covered by a single range spanning <code>0</code> to its length, labelled with the
 * sequence name. Sequences with a non-positive length get no entry.
 * @param header the SAM header containing sequence information
 * @return the ReferenceRanges lookup
 */
public static ReferenceRanges<String> createFullReferenceRanges(SAMFileHeader header) {
  final ReferenceRanges<String> fullRanges = new ReferenceRanges<>(true);
  for (final SAMSequenceRecord record : header.getSequenceDictionary().getSequences()) {
    final int seqLength = record.getSequenceLength();
    if (seqLength <= 0) {
      continue;
    }
    final RangeList.RangeData<String> wholeSequence = new RangeList.RangeData<>(0, seqLength, record.getSequenceName());
    fullRanges.put(record.getSequenceName(), new RangeList<>(wholeSequence));
  }
  fullRanges.setIdMap(SamUtils.getSequenceIdLookup(header.getSequenceDictionary()));
  return fullRanges;
}

Source File: IntervalTagComparatorTest.java From Drop-seq with MIT License

5 votes

/**
 * Builds a list of SAM records, each carrying a randomly generated interval in the
 * {@code intervalTag} attribute.
 * For every record a sequence is picked at random from the dictionary of {@code dictFile}, two
 * random positions in {@code [0, sequenceLength)} are drawn, and the interval spanning from the
 * smaller to the larger position is stored as the tag value (using {@code Interval.toString()}).
 * @param desiredNumRecords the number of records to create
 * @return the tagged records; always exactly {@code desiredNumRecords} long
 * @throws IllegalStateException if the template record cannot be cloned or the reader cannot be closed
 */
private List<SAMRecord> createManyIntervalTaggedSAMRecords (final int desiredNumRecords) {
	List<SAMRecord> data = new ArrayList<>();

	// The reader is only needed for its header; try-with-resources releases it when we are done.
	try (SamReader inputSam = SamReaderFactory.makeDefault().open(this.dictFile)) {
		SAMRecord samRecordTemplate = new SAMRecord (inputSam.getFileHeader());

		SAMSequenceDictionary dict= inputSam.getFileHeader().getSequenceDictionary();
		List<SAMSequenceRecord> recs = dict.getSequences();
		int numRecs = recs.size();

		Random randomGenerator = new Random();
		for (int i=0; i<desiredNumRecords; i++) {
			// nextInt(bound) excludes the bound, so this is always a valid index into recs.
			SAMSequenceRecord r = recs.get(randomGenerator.nextInt(numRecs));
			String chr = r.getSequenceName();
			int seqLen = r.getSequenceLength();
			int s1 = randomGenerator.nextInt(seqLen);
			int s2 = randomGenerator.nextInt(seqLen);
			// Order the two draws so that the interval never has start > end.
			int s = Math.min(s1, s2);
			int e = Math.max(s1, s2);
			Interval interval = new Interval (chr, s, e);
			try {
				SAMRecord r1 = (SAMRecord) samRecordTemplate.clone();
				// I realize that using encoding the full interval can be a bit heavy handed.
				r1.setAttribute(this.intervalTag, interval.toString());
				data.add(r1);
			} catch (CloneNotSupportedException e1) {
				// Should never happen; fail loudly rather than return fewer records than requested.
				throw new IllegalStateException("Unable to clone the template SAMRecord", e1);
			}
		}
	} catch (java.io.IOException ioe) {
		throw new IllegalStateException("Unable to close the reader for " + this.dictFile, ioe);
	}
	return data;
}

Source File: IntervalTagComparatorTest.java From Drop-seq with MIT License

5 votes

/**
 * Make this a little more like actual sequence data, where reads mapped to GL are practically non existent.
 * Only sequences longer than 10,000,000 bases are retained.
 * @param dict the dictionary to filter
 * @return a new dictionary holding, in their original order, the sequences of {@code dict} that pass the length threshold
 */
private SAMSequenceDictionary filterSD (final SAMSequenceDictionary dict) {
	final SAMSequenceDictionary longSequencesOnly = new SAMSequenceDictionary();
	for (final SAMSequenceRecord candidate : dict.getSequences()) {
		final boolean longEnough = candidate.getSequenceLength() > 10000000;
		if (longEnough) {
			longSequencesOnly.addSequence(candidate);
		}
	}
	return longSequencesOnly;
}

Source File: VarDictLauncher.java From VarDictJava with MIT License

5 votes

/**
 * Read map of chromosome lengths from the sequence dictionary in the header of a BAM/SAM file.
 * @param bam BAM file name
 * @return Map of chromosome lengths. Key - chromosome name, value - length
 * @throws IOException if BAM/SAM file can't be opened
 */
public static Map<String, Integer> readChr(String bam) throws IOException {
    try (SamReader reader = SamReaderFactory.makeDefault().open(new File(bam))) {
        SAMFileHeader header = reader.getFileHeader();
        Map<String, Integer> chrs = new HashMap<>();
        for (SAMSequenceRecord record : header.getSequenceDictionary().getSequences()) {
            chrs.put(record.getSequenceName(), record.getSequenceLength());
        }
        return chrs;
    }
}

Source File: IntervalUtils.java From gatk with BSD 3-Clause "New" or "Revised" License

5 votes

/**
 * Tells whether an interval fits on its contig as described by a sequence dictionary.
 * Only the interval's end is compared against the contig length; the start is not examined here.
 *
 * @param interval interval to check
 * @param dictionary dictionary to use to validate contig bounds
 * @return true if the interval's contig exists in the dictionary and the interval's end does not
 *         exceed that contig's length, otherwise false
 */
public static boolean intervalIsOnDictionaryContig( final SimpleInterval interval, final SAMSequenceDictionary dictionary ) {
    Utils.nonNull(interval);
    Utils.nonNull(dictionary);

    final SAMSequenceRecord contig = dictionary.getSequence(interval.getContig());
    // An unknown contig can never contain the interval.
    return contig != null && interval.getEnd() <= contig.getSequenceLength();
}

Source File: BamOverlapChecker.java From systemsgenetics with GNU General Public License v3.0

5 votes

/**
 * Builds, for every sequence in the header of the given reader, an array with one flag per
 * window of {@code stepSize} bases telling whether the reader returns at least one record
 * overlapping that window. The arrays are stored in {@code booleanMap} keyed by sequence name.
 *
 * @param bam_file the reader to query; it is queried once per window, so it presumably needs
 *                 to be indexed -- confirm against the caller
 */
public BamOverlapChecker(SamReader bam_file){

    SAMFileHeader  header = bam_file.getFileHeader();
    SAMSequenceDictionary dict = header.getSequenceDictionary();
    List<SAMSequenceRecord> sequences = dict.getSequences();

    booleanMap = new HashMap<String, boolean[]>();

    for(SAMSequenceRecord sequence : sequences){
        int sequenceEnd = sequence.getSequenceLength();
        // Divide in double: float cannot represent ints above 2^24 exactly, so chromosome-scale
        // lengths could be rounded down and the last window lost.
        int arrayLength = (int) Math.ceil( (double) sequenceEnd / (double) stepSize );
        boolean[] tempArray = new boolean[arrayLength];

        for(int i=0;i<arrayLength;i++){
            SAMRecordIterator bamQuery = bam_file.queryOverlapping(sequence.getSequenceName(), i*stepSize, (i+1)*stepSize);
            try {
                tempArray[i] = bamQuery.hasNext();
            } finally {
                // Always release the iterator, even if hasNext() fails.
                bamQuery.close();
            }
        }
        booleanMap.put(sequence.getSequenceName(),tempArray );
    }

}

Source File: SequenceDictionaryUtils.java From gatk with BSD 3-Clause "New" or "Revised" License

5 votes

/**
 * Trivial helper that returns true if elt has the same name and length as any of the given records
 * @param elt record to test
 * @param recs the list of records to check for name and length equivalence
 * @return true if elt has the same name and length as any of the recs
 */
private static boolean isHumanSeqRecord(SAMSequenceRecord elt, SAMSequenceRecord... recs) {
    for (int i = 0; i < recs.length; i++) {
        final SAMSequenceRecord candidate = recs[i];
        // Compare lengths first; names are only compared when the lengths agree.
        if (elt.getSequenceLength() != candidate.getSequenceLength()) {
            continue;
        }
        if (elt.getSequenceName().equals(candidate.getSequenceName())) {
            return true;
        }
    }
    return false;
}

Source File: GenomeLocParser.java From gatk with BSD 3-Clause "New" or "Revised" License

5 votes

/**
 * Validates a position or interval on the genome.
 *
 * When the validation level is {@code NONE} nothing is checked and {@code contig} is returned as is.
 * Otherwise every problem found is reported through {@code vglHelper}:
 * stop must not be less than start, and the contig's index in the master sequence dictionary must
 * equal {@code contigIndex}. The contig is expected to exist in the dictionary; no check for a
 * missing contig is made here.
 *
 * If mustBeOnReference is true, boundary validation is also performed: start and stop must both
 * be at least 1 and must not exceed the contig length. When the dictionary holds no length for
 * the contig, a warning is logged and the upper bound check is skipped.
 *
 * @param contig the contig name
 * @param contigIndex the index the caller believes the contig has in the dictionary
 * @param start  the start position
 * @param stop   the stop position
 * @param mustBeOnReference whether start and stop must lie within the contig bounds
 *
 * @return the contig name as held by the sequence dictionary record (or {@code contig} itself
 *         when validation is disabled)
 */
protected String validateGenomeLoc(final String contig, final int contigIndex, final int start, final int stop, final boolean mustBeOnReference) {
    if ( validationLevel == ValidationLevel.NONE ) {
        return contig;
    }

    if (stop < start) {
        vglHelper(String.format("The stop position %d is less than start %d in contig %s", stop, start, contig));
    }

    final SAMSequenceRecord record = this.contigInfo.getSequence(contig);
    if ( record.getSequenceIndex() != contigIndex ) {
        vglHelper(String.format("The contig index %d is bad, doesn't equal the contig index %d of the contig from a string %s",
                contigIndex, record.getSequenceIndex(), contig));
    }

    if ( !mustBeOnReference ) {
        return record.getSequenceName();
    }

    if (start < 1) {
        vglHelper(String.format("The start position %d is less than 1", start));
    }
    if (stop < 1) {
        vglHelper(String.format("The stop position %d is less than 1", stop));
    }

    final int contigSize = record.getSequenceLength();
    final boolean lengthUnknown = contigSize == SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH;
    if (lengthUnknown) {
        logger.warn(String.format("The available sequence dictionary does not contain a sequence length for contig (%s). " +
                "Skipping validation of the genome loc end coordinate (%d).",
                contig, stop));
    } else if (start > contigSize || stop > contigSize) {
        vglHelper(String.format("The genome loc coordinates %d-%d exceed the contig size (%d)", start, stop, contigSize));
    }

    return record.getSequenceName();
}

Source File: CachingIndexedFastaSequenceFileUnitTest.java From gatk with BSD 3-Clause "New" or "Revised" License

5 votes

/**
 * Checks that the caching reader agrees with an uncached reader on name, contig index and bases
 * when every comparison query is preceded by an unrelated query against the middle of the contig.
 * Only the first contig is exercised; query starts advance in steps of 10.
 */
@Test(dataProvider = "fastas")
public void testCachingIndexedFastaReaderTwoStage(Path fasta, Path unzipped, int cacheSize, int querySize) throws IOException {
    try (final ReferenceSequenceFile uncached = ReferenceSequenceFileFactory.getReferenceSequenceFile(unzipped);
         final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false)) {

        final SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);

        final int middleStart = (contig.getSequenceLength() - querySize) / 2;
        final int middleStop = middleStart + querySize;

        logger.debug(String.format(
                "Checking contig %s length %d with cache size %d and query size %d with intermediate query",
                contig.getSequenceName(), contig.getSequenceLength(), cacheSize, querySize));

        for (int start = 0; start < contig.getSequenceLength(); start += 10) {
            final int stop = start + querySize;
            if (stop > contig.getSequenceLength()) {
                continue; // the query would run off the end of the contig
            }

            // The result is intentionally unused: the call is made before the query under test.
            ReferenceSequence grabMiddle = caching.getSubsequenceAt(contig.getSequenceName(), middleStart,
                                                                    middleStop);
            final ReferenceSequence fromCache = caching.getSubsequenceAt(contig.getSequenceName(), start, stop);
            final ReferenceSequence fromFile = uncached.getSubsequenceAt(contig.getSequenceName(), start, stop);

            Assert.assertEquals(fromCache.getName(), fromFile.getName());
            Assert.assertEquals(fromCache.getContigIndex(), fromFile.getContigIndex());
            Assert.assertEquals(fromCache.getBases(), fromFile.getBases());
        }
    }
}

Source File: UpdateVCFSequenceDictionary.java From gatk with BSD 3-Clause "New" or "Revised" License

5 votes

/**
 * Validates a variant against the source dictionary and, when it passes, hands it to the VCF writer.
 * A variant is rejected when its contig is absent from the dictionary or when its end position
 * exceeds the length the dictionary records for that contig.
 *
 * @throws CommandLineException.BadArgumentValue if the variant fails either check
 */
@Override
public void apply(final VariantContext vc, final ReadsContext readsContext, final ReferenceContext ref, final FeatureContext featureContext) {
    // Validate each variant against the source dictionary manually
    final SAMSequenceRecord contigRecord = sourceDictionary.getSequence(vc.getContig());

    if (contigRecord == null) {
        throw new CommandLineException.BadArgumentValue(
            String.format(
                "The input variant file contains a variant (ID: \"%s\") with a reference to a sequence (\"%s\") " +
                "that is not present in the provided dictionary",
                vc.getID(),
                vc.getContig()
            )
        );
    }

    if (vc.getEnd() > contigRecord.getSequenceLength()) {
        throw new CommandLineException.BadArgumentValue(
            String.format(
                "The input variant file contains a variant (ID: \"%s\") with a reference to a sequence (\"%s\") " +
                "that ends at a position (%d) that exceeds the length of that sequence (%d) in the provided dictionary",
                vc.getID(),
                vc.getContig(),
                vc.getEnd(),
                contigRecord.getSequenceLength()
            )
        );
    }

    vcfWriter.add(vc);
}

Source File: FilterBam.java From Drop-seq with MIT License

5 votes

/**
 * Creates a copy of a sequence record under a different name.
 * The copy has the original's length and attributes, except that an attribute keyed by
 * {@code SAMSequenceRecord.SEQUENCE_NAME_TAG} is set to the new name instead of the old value.
 * @param sequence the record to copy
 * @param editedSequenceName the name for the copy
 * @return the renamed copy
 */
private SAMSequenceRecord cloneWithNewName(final SAMSequenceRecord sequence, final String editedSequenceName) {
    final SAMSequenceRecord renamed = new SAMSequenceRecord(editedSequenceName, sequence.getSequenceLength());
    for (final Map.Entry<String, String> attribute : sequence.getAttributes()) {
        final boolean isNameTag = attribute.getKey().equals(SAMSequenceRecord.SEQUENCE_NAME_TAG);
        if (isNameTag) {
            renamed.setAttribute(SAMSequenceRecord.SEQUENCE_NAME_TAG, editedSequenceName);
        } else {
            renamed.setAttribute(attribute.getKey(), attribute.getValue());
        }
    }
    return renamed;
}

Source File: SAMFileHeader_Utils.java From cramtools with Apache License 2.0

4 votes

/**
 * Reads a BAM header from a binary stream.
 * The stream is consumed in this order: the 4-byte magic <code>BAM\1</code>, an int giving the
 * length of the text header, the text header itself, an int giving the number of binary
 * sequence records, and then that many sequence records.
 * When the text header declares sequences, the binary records are checked against them (count,
 * name and length); when it declares none, the binary records become the header's dictionary.
 *
 * @param stream the codec to read from
 * @param validationStringency stringency applied while decoding the text header
 * @param source description of the input, used in error messages and passed to the decoder
 * @return the decoded header
 * @throws IOException if the magic bytes are wrong
 * @throws SAMFormatException if the text and binary sequence information disagree
 */
static SAMFileHeader readHeader(final BinaryCodec stream, final ValidationStringency validationStringency,
		final String source) throws IOException {

	final byte[] magic = new byte[4];
	stream.readBytes(magic);
	final boolean magicOk = Arrays.equals(magic, "BAM\1".getBytes());
	if (!magicOk) {
		throw new IOException("Invalid BAM file header");
	}

	final int textLength = stream.readInt();
	final String headerText = stream.readString(textLength);
	final SAMTextHeaderCodec codec = new SAMTextHeaderCodec();
	codec.setValidationStringency(validationStringency);
	final SAMFileHeader header = codec.decode(new StringLineReader(headerText), source);

	final int binaryCount = stream.readInt();
	if (header.getSequenceDictionary().size() <= 0) {
		// If only binary sequences are present, copy them into
		// the header
		final List<SAMSequenceRecord> binaryRecords = new ArrayList<SAMSequenceRecord>(binaryCount);
		for (int index = 0; index < binaryCount; index++) {
			binaryRecords.add(readSequenceRecord(stream, source));
		}
		header.setSequenceDictionary(new SAMSequenceDictionary(binaryRecords));
		return header;
	}

	// It is allowed to have binary sequences but no text sequences, so
	// validation only happens here, where both are present
	if (binaryCount != header.getSequenceDictionary().size()) {
		throw new SAMFormatException("Number of sequences in text header ("
				+ header.getSequenceDictionary().size() + ") != number of sequences in binary header ("
				+ binaryCount + ") for file " + source);
	}
	for (int index = 0; index < binaryCount; index++) {
		final SAMSequenceRecord fromBinary = readSequenceRecord(stream, source);
		final SAMSequenceRecord fromText = header.getSequence(index);
		if (!fromText.getSequenceName().equals(fromBinary.getSequenceName())) {
			throw new SAMFormatException("For sequence " + index
					+ ", text and binary have different names in file " + source);
		}
		if (fromText.getSequenceLength() != fromBinary.getSequenceLength()) {
			throw new SAMFormatException("For sequence " + index
					+ ", text and binary have different lengths in file " + source);
		}
	}

	return header;
}

Source File: BedToIntervalList.java From picard with MIT License

4 votes

/**
 * Converts the BED features of {@code INPUT} into an interval list written to {@code OUTPUT}.
 * Every feature is validated against the dictionary extracted from {@code SEQUENCE_DICTIONARY}:
 * its contig must exist (unless {@code DROP_MISSING_CONTIGS} is set, in which case the feature
 * is skipped and counted), and its start and end must lie on the contig. The resulting list is
 * optionally sorted ({@code SORT}) and uniqued ({@code UNIQUE}) before being written.
 *
 * @return 0 on success
 * @throws PicardException if a feature fails validation
 * @throws RuntimeException wrapping any {@link IOException} raised while reading or writing
 */
@Override
protected int doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsReadable(SEQUENCE_DICTIONARY);
    IOUtil.assertFileIsWritable(OUTPUT);
    try {
        // create a new header that we will assign the dictionary provided by the SAMSequenceDictionaryExtractor to.
        final SAMFileHeader header = new SAMFileHeader();
        final SAMSequenceDictionary dictionary = SAMSequenceDictionaryExtractor.extractDictionary(SEQUENCE_DICTIONARY.toPath());
        header.setSequenceDictionary(dictionary);
        // set the sort order to be sorted by coordinate, which is actually done below
        // by getting the .uniqued() intervals list before we write out the file
        header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
        final IntervalList intervalList = new IntervalList(header);

        final FeatureReader<BEDFeature> bedReader = AbstractFeatureReader.getFeatureReader(INPUT.getAbsolutePath(), new BEDCodec(), false);
        final CloseableTribbleIterator<BEDFeature> features = bedReader.iterator();
        final ProgressLogger progressLogger = new ProgressLogger(LOG, (int) 1e6);

        while (features.hasNext()) {
            final BEDFeature feature = features.next();
            final String sequenceName = feature.getContig();
            final int start = feature.getStart();
            final int end = feature.getEnd();
            // NB: do not use an empty name within an interval
            final String name = feature.getName().isEmpty() ? null : feature.getName();

            final SAMSequenceRecord sequenceRecord = header.getSequenceDictionary().getSequence(sequenceName);

            // Do some validation; every failing check either skips the feature or throws.
            if (null == sequenceRecord) {
                if (!DROP_MISSING_CONTIGS) {
                    throw new PicardException(String.format("Sequence '%s' was not found in the sequence dictionary", sequenceName));
                }
                LOG.info(String.format("Dropping interval with missing contig: %s:%d-%d", sequenceName, feature.getStart(), feature.getEnd()));
                missingIntervals++;
                missingRegion += feature.getEnd() - feature.getStart();
                continue;
            }
            if (start < 1) {
                throw new PicardException(String.format("Start on sequence '%s' was less than one: %d", sequenceName, start));
            }
            if (sequenceRecord.getSequenceLength() < start) {
                throw new PicardException(String.format("Start on sequence '%s' was past the end: %d < %d", sequenceName, sequenceRecord.getSequenceLength(), start));
            }
            // special case: end == 0 is only accepted for a 0-length interval at the start of a contig
            final boolean endBelowOne = (end == 0 && start != 1) || end < 0;
            if (endBelowOne) {
                throw new PicardException(String.format("End on sequence '%s' was less than one: %d", sequenceName, end));
            }
            if (sequenceRecord.getSequenceLength() < end) {
                throw new PicardException(String.format("End on sequence '%s' was past the end: %d < %d", sequenceName, sequenceRecord.getSequenceLength(), end));
            }
            if (end < start - 1) {
                throw new PicardException(String.format("On sequence '%s', end < start-1: %d <= %d", sequenceName, end, start));
            }

            final boolean isNegativeStrand = feature.getStrand() == Strand.NEGATIVE;
            intervalList.add(new Interval(sequenceName, start, end, isNegativeStrand, name));

            progressLogger.record(sequenceName, start);
        }
        CloserUtil.close(bedReader);

        if (DROP_MISSING_CONTIGS) {
            if (missingRegion == 0) {
                LOG.info("There were no missing regions.");
            } else {
                LOG.warn(String.format("There were %d missing regions with a total of %d bases", missingIntervals, missingRegion));
            }
        }

        // Sort and write the output
        IntervalList out = intervalList;
        if (SORT) {
            out = out.sorted();
        }
        if (UNIQUE) {
            out = out.uniqued();
        }
        out.write(OUTPUT);
        LOG.info(String.format("Wrote %d intervals spanning a total of %d bases", out.getIntervals().size(),out.getBaseCount()));

    } catch (final IOException e) {
        throw new RuntimeException(e);
    }

    return 0;
}

Java Code Examples for htsjdk.samtools.SAMSequenceRecord#getSequenceLength()