htsjdk.samtools.seekablestream.SeekableStream Java Examples

Source File: IndexAggregate.java From cramtools with Apache License 2.0

6 votes

/**
 * Builds an {@link IndexAggregate} from the index file located for the given data stream.
 *
 * <p>The index file is looked up with {@code findIndexFileFor} using the stream's source
 * string. A file name ending in {@code .bai} (case-insensitive) is loaded as a BAM index;
 * a name ending in {@code .crai} is read through a gzip stream as a CRAM index.
 *
 * @param stream
 *            data stream whose {@code getSource()} identifies the data file
 * @param dictionary
 *            sequence dictionary handed to the BAI index reader
 * @return an aggregate holding either the BAI or the CRAI index
 * @throws FileNotFoundException
 *             if no index file is found or its extension is not recognised
 * @throws IOException
 *             if the index file cannot be read
 */
public static IndexAggregate forDataFile(SeekableStream stream, SAMSequenceDictionary dictionary)
		throws IOException {
	String path = stream.getSource();
	File indexFile = findIndexFileFor(path);
	if (indexFile == null)
		throw new FileNotFoundException("No index found for file: " + path);

	log.info("Using index file: " + indexFile.getAbsolutePath());
	IndexAggregate a = new IndexAggregate();
	String indexName = indexFile.getName();
	if (indexName.matches("(?i).*\\.bai")) {
		a.bai = new CachingBAMFileIndex(indexFile, dictionary);
		return a;
	}
	if (indexName.matches("(?i).*\\.crai")) {
		// Release the file handle even when the gzip header is invalid or
		// index parsing fails part-way through.
		try (FileInputStream raw = new FileInputStream(indexFile);
				GZIPInputStream unzipped = new GZIPInputStream(raw)) {
			a.crai = CramIndex.readIndex(unzipped);
		}
		return a;
	}

	throw new FileNotFoundException("No index found for file: " + path);
}

Source File: BAMRecordReader.java From Hadoop-BAM with MIT License

6 votes

/**
 * Opens a {@link SamReader} over {@code in} using a factory with file-based index caching
 * enabled, eager decoding disabled and asynchronous I/O disabled.
 *
 * @param in               data stream to read
 * @param inIndex          index stream to attach, or {@code null} for none
 * @param stringency       validation stringency to apply, or {@code null} to keep the default
 * @param useIntelInflater whether to install the inflater factory from {@code IntelGKLAccessor}
 * @return the opened reader
 */
private SamReader createSamReader(SeekableStream in, SeekableStream inIndex,
		ValidationStringency stringency, boolean useIntelInflater) {
	final SamReaderFactory factory = SamReaderFactory.makeDefault()
			.setOption(SamReaderFactory.Option.CACHE_FILE_BASED_INDEXES, true)
			.setOption(SamReaderFactory.Option.EAGERLY_DECODE, false)
			.setUseAsyncIo(false);
	if (stringency != null) {
		factory.validationStringency(stringency);
	}
	if (useIntelInflater) {
		factory.inflaterFactory(IntelGKLAccessor.newInflatorFactor());
	}

	final SamInputResource source = SamInputResource.of(in);
	if (inIndex != null) {
		source.index(inIndex);
	}
	return factory.open(source);
}

Source File: CRAMInputFormat.java From Hadoop-BAM with MIT License

5 votes

/**
 * Collects stream positions while iterating over the containers of a CRAM file.
 *
 * <p>The returned list holds the stream position right after the container iterator is
 * constructed, the position after each call to {@code next()}, and finally the stream
 * length.
 *
 * @param conf     Hadoop configuration used to open the path
 * @param cramFile path of the CRAM file to scan
 * @return the recorded offsets in the order they were observed
 * @throws IOException if the file cannot be opened or read
 */
private static List<Long> getContainerOffsets(Configuration conf, Path cramFile)
    throws IOException {
  List<Long> containerOffsets = new ArrayList<Long>();
  // The stream is only needed while scanning; close it even if iteration fails.
  try (SeekableStream seekableStream = WrapSeekable.openPath(conf, cramFile)) {
    CramContainerIterator cci = new CramContainerIterator(seekableStream);
    containerOffsets.add(seekableStream.position());
    while (cci.hasNext()) {
      cci.next();
      containerOffsets.add(seekableStream.position());
    }
    containerOffsets.add(seekableStream.length());
  }
  return containerOffsets;
}

Source File: Utils.java From cramtools with Apache License 2.0

5 votes

/**
 * Checks whether the stream ends with the EOF marker matching the given CRAM version.
 *
 * @return the result of {@code streamEndsWith} for the v3 marker when the version is
 *         compatible with CRAM v3, otherwise for the v2.1 marker when compatible with
 *         CRAM v2.1, otherwise {@code false}
 * @throws IOException
 *             if the stream cannot be read
 */
private static boolean checkEOF(htsjdk.samtools.cram.common.Version version, SeekableStream seekableStream)
		throws IOException {
	final byte[] expectedMarker;
	if (version.compatibleWith(CramVersions.CRAM_v3)) {
		expectedMarker = CramIO.ZERO_F_EOF_MARKER;
	} else if (version.compatibleWith(CramVersions.CRAM_v2_1)) {
		expectedMarker = CramIO.ZERO_B_EOF_MARKER;
	} else {
		return false;
	}
	return streamEndsWith(seekableStream, expectedMarker);
}

Source File: Utils.java From cramtools with Apache License 2.0

5 votes

/**
 * Tests whether the last {@code marker.length} bytes of the stream equal {@code marker}.
 *
 * <p>If the exact comparison fails, the comparison is retried with the four high bits of
 * byte 8 of the tail forced on. NOTE(review): presumably this tolerates marker variants that
 * differ only in those bits; confirm against the CRAM specification.
 *
 * <p>The stream is left positioned after the bytes that were read.
 *
 * @param seekableStream
 *            stream to inspect
 * @param marker
 *            expected trailing bytes
 * @return {@code true} if the tail matches, {@code false} otherwise (including when the
 *         stream is shorter than the marker)
 * @throws IOException
 *             if seeking or reading fails
 */
private static boolean streamEndsWith(SeekableStream seekableStream, byte[] marker) throws IOException {
	final long markerStart = seekableStream.length() - marker.length;
	if (markerStart < 0)
		return false; // too short to contain the marker; avoid a negative seek

	byte[] tail = new byte[marker.length];
	seekableStream.seek(markerStart);
	InputStreamUtils.readFully(seekableStream, tail, 0, tail.length);
	if (Arrays.equals(tail, marker))
		return true;
	if (tail.length <= 8)
		return false; // no byte 8 to relax, so the exact comparison is final
	tail[8] = (byte) (tail[8] | 0xf0);
	return Arrays.equals(tail, marker);
}

Source File: IndexAggregate.java From cramtools with Apache License 2.0

5 votes

/**
 * Seeks {@code cramStream} to the first container, in ascending offset order, that
 * intersects the query according to {@code intersects}.
 *
 * <p>Candidate offsets are taken from every even-indexed entry of the span's coordinate
 * array with the low 16 bits shifted away.
 *
 * @return the offset the stream was positioned at, or -1 if the index yields no span or no
 *         peeked container intersects the query
 * @throws IOException
 *             if peeking or seeking fails
 */
private static long seek(BAMIndex index, int seqId, int start, int end, SeekableStream cramStream)
		throws IOException {
	BAMFileSpan span = index.getSpanOverlapping(seqId, start, end);
	if (span == null)
		return -1;
	long[] coords = span.toCoordinateArray();
	if (coords.length == 0)
		return -1;
	long[] offsets = new long[coords.length / 2];
	for (int i = 0; i < offsets.length; i++) {
		// Unsigned shift: the value is a packed offset, not a signed quantity.
		offsets[i] = coords[i * 2] >>> 16;
	}
	Arrays.sort(offsets);

	// peek into container in offset ascending order and choose the first
	// that intersects the query:
	for (long offset : offsets) {
		log.debug("Peeking at offset: " + offset);
		IndexAggregate.ContainerBoundary b = peek(cramStream, offset);
		if (b == null)
			continue;

		if (intersects(start, end, b)) {
			log.debug("Found query at offset: " + offset);
			cramStream.seek(offset);
			return offset;
		}
	}
	return -1;
}

Source File: IndexAggregate.java From cramtools with Apache License 2.0

5 votes

/**
 * Seeks {@code cramStream} to the container start offset of the first CRAM index entry
 * returned by {@code CramIndex.find} for the query.
 *
 * @return the container start offset, or -1 if no entry was found
 * @throws IOException
 *             if seeking fails
 */
private static long seek(List<CramIndex.Entry> index, int seqId, int start, int end, SeekableStream cramStream)
		throws IOException {
	final int span = end - start + 1;
	final List<Entry> matches = CramIndex.find(index, seqId, start, span);
	if (matches == null || matches.isEmpty()) {
		return -1;
	}

	final long offset = matches.get(0).containerStartOffset;
	cramStream.seek(offset);
	log.debug("Found query at offset: " + offset);
	return offset;
}

Source File: VariantsSparkSinkUnitTest.java From gatk with BSD 3-Clause "New" or "Revised" License

5 votes

/**
 * Reads the VCF header of the file at {@code vcf}.
 *
 * @param vcf location of the VCF, resolved through {@code IOUtils.getPath}
 * @return the header read from the start of the file
 * @throws UserException if reading (or closing the stream) fails with an {@link IOException}
 */
private VCFHeader getHeader(String vcf) throws IOException {
    final java.nio.file.Path location = IOUtils.getPath(vcf);
    final VCFHeader header;
    try (SeekableStream headerStream = new SeekablePathStream(location)) {
        header = VCFHeaderReader.readHeaderFrom(headerStream);
    } catch (IOException ioe) {
        final String message = "Failed to read VCF header from " + vcf + "\n Caused by:" + ioe.getMessage();
        throw new UserException(message, ioe);
    }
    return header;
}

Source File: CreateHadoopBamSplittingIndex.java From gatk with BSD 3-Clause "New" or "Revised" License

5 votes

/**
 * Writes a splitting index for {@code inputBam} into {@code index}.
 *
 * @param inputBam    BAM file to index; checked with {@code assertIsBam} first
 * @param index       destination file for the index
 * @param granularity granularity passed through to {@code BAMSBIIndexer.createIndex}
 * @throws UserException if an {@link IOException} occurs while reading or writing
 */
private static void createOnlySplittingIndex(final File inputBam, final File index, final long granularity) {
    assertIsBam(inputBam);
    try (SeekableStream bamStream = new SeekableFileStream(inputBam);
         BufferedOutputStream indexOut = new BufferedOutputStream(new FileOutputStream(index))) {
        BAMSBIIndexer.createIndex(bamStream, indexOut, granularity);
    } catch (final IOException ioe) {
        throw new UserException("Couldn't create splitting index", ioe);
    }
}

Source File: SeekingBAMFileReader.java From dataflow-java with Apache License 2.0

5 votes

/**
 * Creates a reader over the unbuffered seekable stream of {@code resource}, without an
 * index stream, and remembers the given offset.
 *
 * <p>NOTE(review): {@code asUnbufferedSeekableStream()} is called twice, once for the
 * superclass and once for the {@code stream} field; whether both calls return the same
 * stream depends on the resource implementation. Confirm.
 *
 * @throws IOException if the underlying stream cannot be obtained or read
 */
public SeekingBAMFileReader(final SamInputResource resource,
    final boolean eagerDecode,
    final ValidationStringency validationStringency,
    final SAMRecordFactory factory,
    long offset)
        throws IOException {
  super(
      resource.data().asUnbufferedSeekableStream(),
      (SeekableStream) null,
      eagerDecode,
      validationStringency,
      factory);
  this.stream = resource.data().asUnbufferedSeekableStream();
  this.offset = offset;
}

Source File: BAMIO.java From dataflow-java with Apache License 2.0

5 votes

/**
 * Wraps the object at {@code gcsStoragePath} in a {@link SamInputResource}, attaching
 * {@code index} when one is supplied.
 *
 * @param index index stream to attach, or {@code null} for none
 * @throws IOException if the GCS stream cannot be created
 */
private static SamInputResource openBAMFile(Storage.Objects storageClient, String gcsStoragePath, SeekableStream index) throws IOException {
  final SamInputResource resource =
      SamInputResource.of(new SeekableGCSStream(storageClient, gcsStoragePath));
  if (index != null) {
    resource.index(index);
  }

  LOG.info("getReadsFromBAMFile - got input resources");
  return resource;
}

Source File: BAMIO.java From dataflow-java with Apache License 2.0

5 votes

/**
 * Tries to open the {@code .bai} companion of {@code gcsStoragePath}.
 *
 * @return a stream over the index, or {@code null} when opening it fails with an
 *         {@link IOException}
 */
private static SeekableStream openIndexForPath(Storage.Objects storageClient,String gcsStoragePath) {
  final String indexPath = gcsStoragePath + ".bai";
  SeekableStream indexStream = null;
  try {
    indexStream = new SeekableGCSStream(storageClient, indexPath);
  } catch (IOException ex) {
    // A missing bai file is tolerated on purpose: log it and fall through to null.
    LOG.info("No index for " + indexPath);
  }
  return indexStream;
}

Source File: TestCRAMOutputFormat.java From Hadoop-BAM with MIT License

5 votes

/**
 * Counts the records that can be read back from a CRAM container stream file.
 *
 * <p>The container stream is first merged into an in-memory CRAM stream via
 * {@code mergeCRAMContainerStream}; that stream is then read with a
 * {@code CRAMFileReader} (no index) and the records are counted.
 *
 * @return the number of records iterated
 */
private int getCRAMRecordCount(
    final File containerStreamFile,
    final SAMFileHeader header,
    final ReferenceSource refSource) throws IOException
{
    // Assemble a proper CRAM file from the container stream shard(s) so the
    // contents can be verified.
    final ByteArrayInputStream cramBytes =
        mergeCRAMContainerStream(containerStreamFile, header, refSource);

    // Read everything back and count what comes out.
    final CRAMFileReader reader = new CRAMFileReader(
        cramBytes,
        (SeekableStream) null,
        refSource,
        ValidationStringency.DEFAULT_STRINGENCY);
    int recordCount = 0;
    for (final Iterator<SAMRecord> records = reader.getIterator(); records.hasNext(); ) {
        records.next();
        recordCount++;
    }
    return recordCount;
}

Source File: TestBAMSplitGuesser.java From Hadoop-BAM with MIT License

5 votes

/**
 * Checks that the split guesser's guess for the range starting at 0 equals the virtual
 * offset of the first record reported by {@code SAMUtils} for {@code test.bam}.
 */
@Test
public void test() throws Exception {
  Configuration conf = new Configuration();
  String bam = getClass().getClassLoader().getResource("test.bam").getFile();
  // Close the stream when the test finishes, whether it passes or fails.
  try (SeekableStream ss = WrapSeekable.openPath(conf, new Path(bam))) {
    BAMSplitGuesser bamSplitGuesser = new BAMSplitGuesser(ss, conf);
    long startGuess = bamSplitGuesser.guessNextBAMRecordStart(0, 3 * 0xffff + 0xfffe);
    assertEquals(SAMUtils.findVirtualOffsetOfFirstRecordInBam(new File(bam)), startGuess);
  }
}

Source File: BAMSplitGuesser.java From Hadoop-BAM with MIT License

5 votes

/**
 * Creates a guesser over {@code ss}, reading the SAM header from {@code headerStream}.
 *
 * @param ss           stream stored as the input file to guess in
 * @param headerStream stream the header is read from
 * @param conf         configuration passed to the header reader
 * @throws IOException if the header cannot be read
 */
public BAMSplitGuesser(
		SeekableStream ss, InputStream headerStream, Configuration conf)
	throws IOException
{
	inFile = ss;
	bamCodec = new BAMRecordCodec(null, new LazyBAMRecordFactory());

	header = SAMHeaderReader.readSAMHeaderFrom(headerStream, conf);
	referenceSequenceCount = header.getSequenceDictionary().size();
}

Source File: BAMSplitGuesser.java From Hadoop-BAM with MIT License

5 votes

/** The stream must point to a valid BAM file, because the header is read
 * from it.
 */
public BAMSplitGuesser(
		SeekableStream ss, Configuration conf)
	throws IOException
{
	this(ss, ss, conf);

	// Secondary check that the header points to a BAM file: Picard can get
	// things wrong due to its autodetection.
	ss.seek(0);
	if (ss.read(buf.array(), 0, 4) != 4 || buf.getInt(0) != BGZF_MAGIC)
		throw new SAMFormatException("Does not seem like a BAM file");
}

Source File: BamSlicerApplication.java From hmftools with GNU General Public License v3.0

5 votes

/**
 * Slices a remote BAM using a downloaded index and writes the result to the file named by
 * the {@code OUTPUT} command-line option.
 *
 * <p>The index is downloaded to a file marked delete-on-exit. Query intervals and the
 * optional unmapped chunk are resolved against the index, a caching reader is created for
 * the resulting chunks, and the selected records are written to a new BAM with an index.
 * Both readers and the writer are closed when the method exits, including on failure.
 *
 * @param indexUrl location of the index (a path containing {@code .crai} is opened as a
 *                 CRAI index presented as a BAI stream)
 * @param bamUrl   location of the BAM data
 * @param cmd      parsed command line
 * @throws IOException if downloading, reading or closing fails
 */
private static void sliceFromURLs(@NotNull URL indexUrl, @NotNull URL bamUrl, @NotNull CommandLine cmd) throws IOException {
    File indexFile = downloadIndex(indexUrl);
    indexFile.deleteOnExit();

    try (SamReader reader = createFromCommandLine(cmd).open(SamInputResource.of(bamUrl).index(indexFile))) {
        BAMIndex bamIndex;
        if (indexFile.getPath().contains(".crai")) {
            SeekableStream craiIndex = CRAIIndex.openCraiFileAsBaiStream(indexFile, reader.getFileHeader().getSequenceDictionary());
            bamIndex = new DiskBasedBAMFileIndex(craiIndex, reader.getFileHeader().getSequenceDictionary());
        } else {
            bamIndex = new DiskBasedBAMFileIndex(indexFile, reader.getFileHeader().getSequenceDictionary(), false);
        }

        Optional<Pair<QueryInterval[], BAMFileSpan>> queryIntervalsAndSpan = queryIntervalsAndSpan(reader, bamIndex, cmd);
        Optional<Chunk> unmappedChunk = getUnmappedChunk(bamIndex, HttpUtils.getHeaderField(bamUrl, "Content-Length"), cmd);
        List<Chunk> sliceChunks = sliceChunks(queryIntervalsAndSpan, unmappedChunk);

        // Resources declared here are closed even when slicing throws.
        try (SamReader cachingReader = createCachingReader(indexFile, bamUrl, cmd, sliceChunks);
                SAMFileWriter writer = new SAMFileWriterFactory().setCreateIndex(true)
                        .makeBAMWriter(reader.getFileHeader(), true, new File(cmd.getOptionValue(OUTPUT)))) {

            queryIntervalsAndSpan.ifPresent(pair -> {
                LOGGER.info("Slicing bam on bed regions...");
                CloseableIterator<SAMRecord> bedIterator = getIterator(cachingReader, pair.getKey(), pair.getValue().toCoordinateArray());
                writeToSlice(writer, bedIterator);
                LOGGER.info("Done writing bed slices.");
            });

            unmappedChunk.ifPresent(chunk -> {
                LOGGER.info("Slicing unmapped reads...");
                CloseableIterator<SAMRecord> unmappedIterator = cachingReader.queryUnmapped();
                writeToSlice(writer, unmappedIterator);
                LOGGER.info("Done writing unmapped reads.");
            });
        }
    }
}

Source File: TestVCFHeaderReader.java From Hadoop-BAM with MIT License

4 votes

/**
 * Loads a resource from the system class loader fully into memory and exposes it as a
 * seekable stream.
 *
 * @param resource resource name to look up
 * @throws IOException if the resource bytes cannot be read
 */
static SeekableStream seekableStream(final String resource) throws IOException {
  final java.net.URL location = ClassLoader.getSystemClassLoader().getResource(resource);
  final byte[] contents = Resources.toByteArray(location);
  return new ByteArraySeekableStream(contents);
}

Source File: LinearBAMIndex.java From Hadoop-BAM with MIT License

4 votes

/**
 * Creates the index by passing the stream and sequence dictionary straight to the
 * superclass constructor.
 *
 * @param stream stream handed to the superclass
 * @param dict   sequence dictionary handed to the superclass
 */
public LinearBAMIndex(SeekableStream stream, SAMSequenceDictionary dict) {
        super(stream, dict);
}

Source File: BAMFileIndexImpl.java From dataflow-java with Apache License 2.0

4 votes

/**
 * Creates the index over {@code stream} and keeps a reference to the sequence dictionary
 * in addition to passing it to the superclass.
 *
 * @param stream stream handed to the superclass
 * @param dict   sequence dictionary, handed to the superclass and stored locally
 */
public BAMFileIndexImpl(SeekableStream stream, SAMSequenceDictionary dict) {
  super(stream, dict);
  this.mBamDictionary = dict;
}

Source File: BAMRecordReader.java From Hadoop-BAM with MIT License

4 votes

/**
 * Prepares this reader for the given split: opens the BAM (and its index, if one is
 * found), creates the underlying reader and selects the record iterator.
 *
 * <p>Three iterator modes are chosen from the configuration and the split:
 * interval-based reads, unmapped reads, or every record in the split's virtual range.
 *
 * @param spl split to read; cast to {@code FileVirtualSplit}
 * @param ctx task context supplying the configuration
 * @throws IOException if the file or index cannot be opened
 * @throws IllegalArgumentException if {@code hadoopbam.bam.keep-paired-reads-together}
 *         is set to true
 */
@Override public void initialize(InputSplit spl, TaskAttemptContext ctx)
           throws IOException
{
	// This method should only be called once (see Hadoop API). However,
	// there seems to be disagreement between implementations that call
	// initialize() and Hadoop-BAM's own code that relies on
	// {@link BAMInputFormat} to call initialize() when the reader is
	// created. Therefore we add this check for the time being. 
	if(isInitialized)
		close();
	isInitialized = true;
	reachedEnd = false;

	final Configuration conf = ctx.getConfiguration();

	final FileVirtualSplit split = (FileVirtualSplit)spl;
	final Path             file  = split.getPath();
	final FileSystem       fs    = file.getFileSystem(conf);

	ValidationStringency stringency = SAMHeaderReader.getValidationStringency(conf);
	boolean useIntelInflater = BAMInputFormat.useIntelInflater(conf);

	// Look for an index next to the BAM; every index-related value stays null
	// when none is found.
	java.nio.file.Path index = SamFiles.findIndex(NIOFileUtil.asPath(fs.makeQualified(file).toUri()));
	Path fileIndex = index == null ? null : new Path(index.toUri());
	SeekableStream indexStream = fileIndex == null ? null : WrapSeekable.openPath(fs, fileIndex);
	in = WrapSeekable.openPath(fs, file);
	SamReader samReader = createSamReader(in, indexStream, stringency, useIntelInflater);
	final SAMFileHeader header = samReader.getFileHeader();

	long virtualStart = split.getStartVirtualOffset();

	// Bits above the low 16 of the virtual offset: logged below as the byte offset.
	fileStart  = virtualStart >>> 16;
	virtualEnd = split.getEndVirtualOffset();

	// Unwrap the adapter to reach the underlying BAMFileReader, whose
	// iterator-creating methods are used below.
	SamReader.PrimitiveSamReader primitiveSamReader =
			((SamReader.PrimitiveSamReaderToSamReaderAdapter) samReader).underlyingReader();
	bamFileReader = (BAMFileReader) primitiveSamReader;

	if (logger.isDebugEnabled()) {
		// Low 16 bits of the virtual offset: logged as the record offset.
		final long recordStart = virtualStart & 0xffff;
		logger.debug("Initialized BAMRecordReader; byte offset: {}, record offset: {}",
			fileStart, recordStart);
	}

	// Reject the retired property explicitly rather than ignoring it.
	if (conf.getBoolean("hadoopbam.bam.keep-paired-reads-together", false)) {
		throw new IllegalArgumentException("Property hadoopbam.bam.keep-paired-reads-together is no longer honored.");
	}

	boolean boundedTraversal = BAMInputFormat.isBoundedTraversal(conf);
	if (boundedTraversal && split.getIntervalFilePointers() != null) {
		// return reads for intervals
		List<Interval> intervals = BAMInputFormat.getIntervals(conf);
		QueryInterval[] queryIntervals = BAMInputFormat.prepareQueryIntervals(intervals, header.getSequenceDictionary());
		iterator = bamFileReader.createIndexIterator(queryIntervals, false, split.getIntervalFilePointers());
	} else if (boundedTraversal && split.getIntervalFilePointers() == null) {
		// return unmapped reads
		iterator = bamFileReader.queryUnmapped();
	} else {
		// return everything
		BAMFileSpan splitSpan = new BAMFileSpan(new Chunk(virtualStart, virtualEnd));
		iterator = bamFileReader.getIterator(splitSpan);
	}
}

Source File: BCFSplitGuesser.java From Hadoop-BAM with MIT License

4 votes

/**
 * Seeks the current input to {@code virt}, casting it to the stream type that matches the
 * {@code bgzf} flag: {@code BlockCompressedInputStream} when set, {@code SeekableStream}
 * otherwise.
 *
 * @throws IOException if the seek fails
 */
private void cinSeek(long virt) throws IOException {
	if (!bgzf) {
		((SeekableStream)cin).seek(virt);
		return;
	}
	((BlockCompressedInputStream)cin).seek(virt);
}

Source File: BCFSplitGuesser.java From Hadoop-BAM with MIT License

4 votes

/** The stream must point to a valid BCF file, because the header is read
 * from it.
 *
 * <p>Delegates to the two-argument constructor, passing {@code ss} for both
 * arguments.
 *
 * @param ss stream to guess in and to read the header from
 * @throws IOException if thrown by the delegated constructor
 */
public BCFSplitGuesser(SeekableStream ss) throws IOException {
	this(ss, ss);
}

Source File: IndexAggregate.java From cramtools with Apache License 2.0

4 votes

/**
 * Builds an {@link IndexAggregate} whose BAI index is read from the given stream.
 *
 * @param baiStream
 *            stream over the BAI index
 * @param dictionary
 *            sequence dictionary handed to the index reader
 * @return an aggregate with only the BAI index set
 * @throws IOException
 *             declared by the signature
 */
public static IndexAggregate fromBaiFile(SeekableStream baiStream, SAMSequenceDictionary dictionary)
		throws IOException {
	final IndexAggregate aggregate = new IndexAggregate();
	aggregate.bai = new CachingBAMFileIndex(baiStream, dictionary);
	return aggregate;
}

Source File: BAMInputFormat.java From Hadoop-BAM with MIT License

4 votes

/**
 * Converts consecutive file splits of one BAM file, starting at index {@code i}, into
 * virtual splits whose start is guessed by a {@code BAMSplitGuesser}.
 *
 * <p>Processing stops at the first split that belongs to a different path. A split in
 * which the guesser finds no record start is merged into the previous virtual split by
 * extending that split's end.
 *
 * @param splits    input splits; entries from {@code i} onward are cast to {@code FileSplit}
 * @param i         index of the first split to process
 * @param newSplits list the created virtual splits are appended to
 * @param cfg       configuration used to open the file and build the guesser
 * @return the index of the first split that was not processed ({@code splits.size()} if
 *         all remaining splits were consumed)
 * @throws IOException if the file cannot be read, or if the first split yields no records
 */
private int addProbabilisticSplits(
		List<InputSplit> splits, int i, List<InputSplit> newSplits,
		Configuration cfg)
	throws IOException
{
	final Path path = ((FileSplit)splits.get(i)).getPath();
       try (final SeekableStream sin = WrapSeekable.openPath(path.getFileSystem(cfg), path)) {

           final BAMSplitGuesser guesser = new BAMSplitGuesser(sin, cfg);

           // Most recently created virtual split; extended when a later split has no records.
           FileVirtualSplit previousSplit = null;

           for (; i < splits.size(); ++i) {
               FileSplit fspl = (FileSplit)splits.get(i);
               // Splits of another file are left for the caller.
               if (!fspl.getPath().equals(path))
                   break;

               long beg =       fspl.getStart();
               long end = beg + fspl.getLength();

               long alignedBeg = guesser.guessNextBAMRecordStart(beg, end);

               // As the guesser goes to the next BGZF block before looking for BAM
               // records, the ending BGZF blocks have to always be traversed fully.
               // Hence force the length to be 0xffff, the maximum possible.
               long alignedEnd = end << 16 | 0xffff;

               // The guesser returning `end` is treated as "no record start found".
               if (alignedBeg == end) {
                   // No records detected in this split: merge it to the previous one.
                   // This could legitimately happen e.g. if we have a split that is
                   // so small that it only contains the middle part of a BGZF block.
                   //
                   // Of course, if it's the first split, then this is simply not a
                   // valid BAM file.
                   //
                   // FIXME: In theory, any number of splits could only contain parts
                   // of the BAM header before we start to see splits that contain BAM
                   // records. For now, we require that the split size is at least as
                   // big as the header and don't handle that case.
                   if (previousSplit == null)
                       throw new IOException("'" + path + "': "+
                           "no reads in first split: bad BAM file or tiny split size?");

                   previousSplit.setEndVirtualOffset(alignedEnd);
               } else {
                   previousSplit = new FileVirtualSplit(
                                           path, alignedBeg, alignedEnd, fspl.getLocations());
                   if (logger.isDebugEnabled()) {
                       // Split the virtual offset into the two parts named in the log message.
                       final long byteOffset  = alignedBeg >>> 16;
                       final long recordOffset = alignedBeg & 0xffff;
                       logger.debug(
                           "Split {}: byte offset: {} record offset: {}, virtual offset: {}",
                           i, byteOffset, recordOffset, alignedBeg);
                   }
                   newSplits.add(previousSplit);
               }
           }
       }
       return i;
}

Source File: KeyIgnoringVCFOutputFormat.java From Hadoop-BAM with MIT License

4 votes

/**
 * Reads a VCF header from {@code in} and stores it in this object's {@code header} field.
 *
 * @param in stream to read the header from; it is not closed here
 * @throws IOException if the header cannot be read
 */
public void readHeaderFrom(SeekableStream in) throws IOException {
	header = VCFHeaderReader.readHeaderFrom(in);
}

Source File: ReferenceSequenceFromSeekable.java From cramtools with Apache License 2.0

4 votes

/**
 * Stores the seekable stream and the FASTA index map it is paired with.
 *
 * @param s
 *            stream kept in field {@code s}
 * @param index
 *            index entries keyed by string, kept in field {@code index}
 */
private ReferenceSequenceFromSeekable(SeekableStream s, Map<String, FastaSequenceIndexEntry> index) {
	this.index = index;
	this.s = s;
}

Source File: KeyIgnoringVCFOutputFormat.java From Hadoop-BAM with MIT License

4 votes

/**
 * Opens {@code path} on {@code fs}, reads the VCF header from it via
 * {@code readHeaderFrom(SeekableStream)} and closes the stream again.
 *
 * @param path file to read the header from
 * @param fs   file system the path lives on
 * @throws IOException if the file cannot be opened, read or closed
 */
public void readHeaderFrom(Path path, FileSystem fs) throws IOException {
	// Close the stream even when header parsing throws.
	try (SeekableStream i = WrapSeekable.openPath(fs, path)) {
		readHeaderFrom(i);
	}
}

Source File: IndexAggregate.java From cramtools with Apache License 2.0

3 votes

/**
 * Positions the data stream at the location of an alignment query, using whichever index
 * this aggregate holds. The CRAI index takes precedence over the BAI index when both are
 * present.
 * 
 * @param seqId
 *            reference sequence id
 * @param start
 *            alignment start, 1-based inclusive
 * @param end
 *            alignment end, 1-based exclusive
 * @param cramStream
 *            the data stream to seek in
 * @return the offset found, or -1 if the query was not found or no index is available
 * @throws IOException
 *             if seeking in the stream fails
 */
public long seek(int seqId, int start, int end, SeekableStream cramStream) throws IOException {
	if (crai != null) {
		return seek(crai, seqId, start, end, cramStream);
	}
	return bai == null ? -1 : seek(bai, seqId, start, end, cramStream);
}

htsjdk.samtools.seekablestream.SeekableStream Java Examples