htsjdk.tribble.AbstractFeatureReader Java Examples
The following examples show how to use htsjdk.tribble.AbstractFeatureReader.
Each example is taken from an open-source project; the source file, project, and license are noted above the code.
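All of the examples below reduce to one core pattern: obtain a reader from the static factory AbstractFeatureReader.getFeatureReader(path, codec, requireIndex), iterate over the decoded features, and close the reader. Here is a minimal sketch of that pattern (the file name "example.vcf" is hypothetical; the htsjdk calls are the same ones used throughout the examples below):

import htsjdk.tribble.AbstractFeatureReader;
import htsjdk.tribble.readers.LineIterator;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFCodec;

import java.io.IOException;

public class AbstractFeatureReaderSketch {
    public static void main(String[] args) throws IOException {
        // Open a plain VCF; requireIndex=false means no .tbi/.idx file is needed.
        try (AbstractFeatureReader<VariantContext, LineIterator> reader =
                     AbstractFeatureReader.getFeatureReader("example.vcf", new VCFCodec(), false)) {
            // reader.iterator() returns a CloseableTribbleIterator, which is also Iterable.
            for (VariantContext variant : reader.iterator()) {
                System.out.println(variant.getContig() + ":" + variant.getStart());
            }
        }
    }
}

Pass requireIndex=true (as in Examples #2, #3, and #15) when you intend to query by interval; the overloads taking two Function&lt;SeekableByteChannel, SeekableByteChannel&gt; wrappers (Examples #4, #7, #21, and #24) let callers wrap the data and index streams, for instance for cloud prefetching.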
Example #1
Source File: VariantHotspotFile.java From hmftools with GNU General Public License v3.0

@NotNull
public static ListMultimap<Chromosome, VariantHotspot> readFromVCF(@NotNull final String fileName) throws IOException {
    ListMultimap<Chromosome, VariantHotspot> result = ArrayListMultimap.create();

    try (final AbstractFeatureReader<VariantContext, LineIterator> reader =
                 AbstractFeatureReader.getFeatureReader(fileName, new VCFCodec(), false)) {
        for (VariantContext variantContext : reader.iterator()) {
            if (HumanChromosome.contains(variantContext.getContig())) {
                result.put(HumanChromosome.fromString(variantContext.getContig()), fromVariantContext(variantContext));
            }
        }
    }

    return result;
}
Example #2
Source File: GenomicsDBImportIntegrationTest.java From gatk with BSD 3-Clause "New" or "Revised" License

@Test
public void testPreserveContigOrderingInHeader() throws IOException {
    final String workspace = createTempDir("testPreserveContigOrderingInHeader-").getAbsolutePath() + "/workspace";
    ArrayList<SimpleInterval> intervals = new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("chr20", 17959479, 17959479)));
    writeToGenomicsDB(Arrays.asList(GENOMICSDB_TEST_DIR + "testHeaderContigLineSorting1.g.vcf",
            GENOMICSDB_TEST_DIR + "testHeaderContigLineSorting2.g.vcf"), intervals, workspace, 0, false, 0, 1);

    try (final FeatureReader<VariantContext> genomicsDBFeatureReader =
                 getGenomicsDBFeatureReader(workspace, b38_reference_20_21);
         final AbstractFeatureReader<VariantContext, LineIterator> inputGVCFReader =
                 AbstractFeatureReader.getFeatureReader(GENOMICSDB_TEST_DIR + "testHeaderContigLineSorting1.g.vcf", new VCFCodec(), true)) {

        final SAMSequenceDictionary dictionaryFromGenomicsDB = ((VCFHeader) genomicsDBFeatureReader.getHeader()).getSequenceDictionary();
        final SAMSequenceDictionary dictionaryFromInputGVCF = ((VCFHeader) inputGVCFReader.getHeader()).getSequenceDictionary();

        Assert.assertEquals(dictionaryFromGenomicsDB, dictionaryFromInputGVCF,
                "Sequence dictionary from GenomicsDB does not match original sequence dictionary from input GVCF");
    }
}
Example #3
Source File: FixCallSetSampleOrderingIntegrationTest.java From gatk with BSD 3-Clause "New" or "Revised" License

@Test(dataProvider = "getBadlySortedFiles")
public void testUnshufflingRestoresCorrectSamples(final File sampleMap, final File shuffled, final int batchSize,
        final int readerThreads, final File gvcfToHeaderSampleMapFile) throws IOException {
    final ArgumentsBuilder args = new ArgumentsBuilder();
    final File fixed = createTempFile("fixed_", ".vcf");
    args.addVCF(shuffled)
            .add(GenomicsDBImport.SAMPLE_NAME_MAP_LONG_NAME, sampleMap)
            .add(GenomicsDBImport.VCF_INITIALIZER_THREADS_LONG_NAME, String.valueOf(readerThreads))
            .add(GenomicsDBImport.BATCHSIZE_ARG_LONG_NAME, String.valueOf(batchSize))
            .add(FixCallSetSampleOrdering.SKIP_PROMPT_LONG_NAME, true)
            .addOutput(fixed);
    if (gvcfToHeaderSampleMapFile != null) {
        args.add("gvcf-to-header-sample-map-file", gvcfToHeaderSampleMapFile);
    }
    runCommandLine(args);

    try (final FeatureReader<VariantContext> reader =
                 AbstractFeatureReader.getFeatureReader(fixed.getAbsolutePath(), new VCFCodec(), true)) {
        final VariantContext vc = reader.iterator().next();
        Assert.assertEquals(vc.getGenotypes().size(), 1000);
        vc.getGenotypes().iterator().forEachRemaining(
                g -> Assert.assertEquals(g.getSampleName(), g.getAnyAttribute(SAMPLE_NAME_KEY)));
    }
}
Example #4
Source File: GatherVcfsCloudIntegrationTest.java From gatk with BSD 3-Clause "New" or "Revised" License

@SuppressWarnings({"unchecked"})
private void assertGatherProducesCorrectVariants(GatherVcfsCloud.GatherType gatherType, File expected, List<File> inputs) throws IOException {
    final File output = createTempFile("gathered", ".vcf.gz");
    final ArgumentsBuilder args = new ArgumentsBuilder();
    inputs.forEach(args::addInput);
    args.addOutput(output)
            .add(GatherVcfsCloud.GATHER_TYPE_LONG_NAME, gatherType.toString());
    runCommandLine(args);

    try (final AbstractFeatureReader<VariantContext, ?> actualReader =
                 AbstractFeatureReader.getFeatureReader(output.getAbsolutePath(), null, new VCFCodec(), false, Function.identity(), Function.identity());
         final AbstractFeatureReader<VariantContext, ?> expectedReader =
                 AbstractFeatureReader.getFeatureReader(expected.getAbsolutePath(), null, new VCFCodec(), false, Function.identity(), Function.identity())) {

        final List<VariantContext> actualVariants = StreamSupport.stream(
                Spliterators.spliteratorUnknownSize(actualReader.iterator(), Spliterator.ORDERED), false).collect(Collectors.toList());
        final List<VariantContext> expectedVariants = StreamSupport.stream(
                Spliterators.spliteratorUnknownSize(expectedReader.iterator(), Spliterator.ORDERED), false).collect(Collectors.toList());
        VariantContextTestUtils.assertEqualVariants(actualVariants, expectedVariants);

        Assert.assertEquals(((VCFHeader) actualReader.getHeader()).getMetaDataInInputOrder(),
                ((VCFHeader) expectedReader.getHeader()).getMetaDataInInputOrder());
    }
}
Example #5
Source File: RevertSamSpark.java From gatk with BSD 3-Clause "New" or "Revised" License

@SuppressWarnings("unchecked")
static List<String> validateOutputParamsByReadGroup(final String output, final String outputMap) throws IOException {
    final List<String> errors = new ArrayList<>();
    if (output != null) {
        if (!Files.isDirectory(IOUtil.getPath(output))) {
            errors.add("When '--output-by-readgroup' is set and output is provided, it must be a directory: " + output);
        }
        return errors;
    }
    // output is null if we reached here
    if (outputMap == null) {
        errors.add("Must provide either output or outputMap when '--output-by-readgroup' is set.");
        return errors;
    }
    if (!Files.isReadable(IOUtil.getPath(outputMap))) {
        errors.add("Cannot read outputMap " + outputMap);
        return errors;
    }
    final FeatureReader<TableFeature> parser = AbstractFeatureReader.getFeatureReader(outputMap, new TableCodec(null), false);
    if (!isOutputMapHeaderValid((List<String>) parser.getHeader())) {
        errors.add("Invalid header: " + outputMap + ". Must be a tab-separated file with +" + OUTPUT_MAP_READ_GROUP_FIELD_NAME + "+ as first column and output as second column.");
    }
    return errors;
}
Example #6
Source File: IndexUtils.java From gatk with BSD 3-Clause "New" or "Revised" License

/**
 * Try to load the tabix index from disk, checking for out of date indexes and old versions
 * @return an Index, or null if we're unable to load
 */
public static Index loadTabixIndex(final File featureFile) {
    Utils.nonNull(featureFile);
    try {
        final String path = featureFile.getAbsolutePath();
        final boolean isTabix = new AbstractFeatureReader.ComponentMethods().isTabix(path, null);
        if (!isTabix) {
            return null;
        }
        final String indexPath = ParsingUtils.appendToPath(path, FileExtensions.TABIX_INDEX);
        logger.debug("Loading tabix index from disk for file " + featureFile);
        final Index index = IndexFactory.loadIndex(indexPath);
        final File indexFile = new File(indexPath);
        checkIndexVersionAndModificationTime(featureFile, indexFile, index);
        return index;
    } catch (final IOException | RuntimeException e) {
        return null;
    }
}
Example #7
Source File: FeatureDataSource.java From gatk with BSD 3-Clause "New" or "Revised" License

private static <T extends Feature> AbstractFeatureReader<T, ?> getTribbleFeatureReader(final FeatureInput<T> featureInput,
        final FeatureCodec<T, ?> codec,
        final Function<SeekableByteChannel, SeekableByteChannel> cloudWrapper,
        final Function<SeekableByteChannel, SeekableByteChannel> cloudIndexWrapper) {
    Utils.nonNull(codec);
    try {
        // Must get the path to the data file from the codec here:
        final String absoluteRawPath = featureInput.getRawInputString();

        // Instruct the reader factory to not require an index. We will require one ourselves as soon as
        // a query by interval is attempted.
        final boolean requireIndex = false;

        // Only apply the wrappers if the feature input is in a remote location which will benefit from prefetching.
        if (BucketUtils.isEligibleForPrefetching(featureInput)) {
            return AbstractFeatureReader.getFeatureReader(absoluteRawPath, null, codec, requireIndex, cloudWrapper, cloudIndexWrapper);
        } else {
            return AbstractFeatureReader.getFeatureReader(absoluteRawPath, null, codec, requireIndex, Utils.identityFunction(), Utils.identityFunction());
        }
    } catch (final TribbleException e) {
        throw new GATKException("Error initializing feature reader for path " + featureInput.getFeaturePath(), e);
    }
}
Example #8
Source File: AmberSiteFactory.java From hmftools with GNU General Public License v3.0

@NotNull
public static ListMultimap<Chromosome, AmberSite> sites(@NotNull final String vcfFile) throws IOException {
    final ListMultimap<Chromosome, AmberSite> result = ArrayListMultimap.create();

    try (final AbstractFeatureReader<VariantContext, LineIterator> reader =
                 getFeatureReader(vcfFile, new VCFCodec(), false)) {
        for (VariantContext variant : reader.iterator()) {
            if (variant.isNotFiltered()) {
                if (HumanChromosome.contains(variant.getContig())) {
                    HumanChromosome chromosome = HumanChromosome.fromString(variant.getContig());
                    result.put(chromosome, ImmutableAmberSite.builder()
                            .chromosome(variant.getContig())
                            .position(variant.getStart())
                            .ref(variant.getReference().getBaseString())
                            .alt(variant.getAlternateAllele(0).getBaseString())
                            .snpCheck(variant.hasAttribute("SNPCHECK"))
                            .build());
                }
            }
        }
    }

    return result;
}
Example #9
Source File: RevertSamSpark.java From gatk with BSD 3-Clause "New" or "Revised" License

private static Map<String, Path> createOutputMapFromFile(final String outputMapFile) throws IOException {
    final Map<String, Path> outputMap = new HashMap<>();
    try (final FeatureReader<TableFeature> parser =
                 AbstractFeatureReader.getFeatureReader(outputMapFile, new TableCodec(null), false)) {
        for (final TableFeature row : parser.iterator()) {
            final String id = row.get(OUTPUT_MAP_READ_GROUP_FIELD_NAME);
            final String output = row.get(OUTPUT_MAP_OUTPUT_FILE_FIELD_NAME);
            final Path outputPath = IOUtils.getPath(output);
            outputMap.put(id, outputPath);
        }
    }
    return outputMap;
}
Example #10
Source File: LongISLNDReadAlignmentMap.java From varsim with BSD 2-Clause "Simplified" License

public LongISLNDReadAlignmentMap(final Collection<String> readAlignmentMapFiles) throws IOException {
    for (final String readAlignmentMapFileName : readAlignmentMapFiles) {
        log.info("Reading in read map from " + readAlignmentMapFileName);
        final AbstractFeatureReader<BEDFeature, LineIterator> featureReader =
                AbstractFeatureReader.getFeatureReader(readAlignmentMapFileName, new BEDCodec(), false);
        try {
            final CloseableTribbleIterator<BEDFeature> featureIterator = featureReader.iterator();
            while (featureIterator.hasNext()) {
                final BEDFeature feature = featureIterator.next();
                readAlignmentMap.put(feature.getName(), new LongISLNDReadMapRecord(feature).toReadMapRecord());
            }
        } finally {
            featureReader.close();
        }
    }
}
Example #11
Source File: FuncotationMapUnitTest.java From gatk with BSD 3-Clause "New" or "Revised" License

@Test(expectedExceptions = GATKException.ShouldNeverReachHereException.class, expectedExceptionsMessageRegExp = ".*a Gencode Funcotation cannot be added.*")
public void testAddingGencodeFuncotationToFuncotationMap() {
    try (final FeatureReader<GencodeGtfFeature> gencodeFeatureReader =
                 AbstractFeatureReader.getFeatureReader(FuncotatorTestConstants.GENCODE_DATA_SOURCE_GTF_PATH_HG19, new GencodeGtfCodec())) {
        // Create some gencode funcotations. The content does not really matter here.
        final List<Funcotation> gencodeFuncotations = createGencodeFuncotations("chr19", 8994200, 8994200, "G", "C",
                FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref(),
                ReferenceDataSource.of(IOUtils.getPath(FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref())),
                gencodeFeatureReader,
                FuncotatorTestConstants.GENCODE_DATA_SOURCE_FASTA_PATH_HG19,
                FuncotatorTestConstants.GENCODE_DATA_SOURCE_GTF_PATH_HG19,
                TranscriptSelectionMode.ALL).stream().map(gf -> (Funcotation) gf).collect(Collectors.toList());

        // Create a funcotationMap with some pre-made funcotations. Content does not really matter.
        final FuncotationMap funcotationMap = FuncotationMap.createNoTranscriptInfo(Arrays.asList(
                TableFuncotation.create(Arrays.asList("TESTFIELD1", "TESTADD1"),
                        createFieldValuesFromNameList("E", Arrays.asList("TESTFIELD1", "TESTADD1")),
                        Allele.create("A"), "TestDataSource5", null),
                TableFuncotation.create(Arrays.asList("TESTFIELD2", "TESTADD2"),
                        createFieldValuesFromNameList("F", Arrays.asList("TESTFIELD2", "TESTADD2")),
                        Allele.create("AG"), "TestDataSource5", null),
                TableFuncotation.create(Arrays.asList("TESTFIELD3", "TESTADD3"),
                        createFieldValuesFromNameList("G", Arrays.asList("TESTFIELD3", "TESTADD3")),
                        Allele.create("AT"), "TestDataSource5", null)));

        // Attempt to add the Gencode funcotations to the funcotation map. This should cause an exception.
        funcotationMap.add(FuncotationMap.NO_TRANSCRIPT_AVAILABLE_KEY, gencodeFuncotations);
    } catch (final IOException ex) {
        throw new GATKException("Could not close gencodeFeatureReader!", ex);
    }
}
Example #12
Source File: FuncotationMapUnitTest.java From gatk with BSD 3-Clause "New" or "Revised" License

@Test(dataProvider = "provideGencodeFuncotationCreation")
public void testGencodeFuncotationCreation(final String contig, final int start, final int end, final String ref,
        final String alt, final String referenceFileName, final ReferenceDataSource referenceDataSource,
        final String transcriptFastaFile, final String transcriptGtfFile,
        final TranscriptSelectionMode transcriptSelectionMode, final List<String> gtTranscripts) {
    try (final FeatureReader<GencodeGtfFeature> gencodeFeatureReader =
                 AbstractFeatureReader.getFeatureReader(FuncotatorTestConstants.GENCODE_DATA_SOURCE_GTF_PATH_HG19, new GencodeGtfCodec())) {
        final List<GencodeFuncotation> gencodeFuncotations = createGencodeFuncotations(contig, start, end, ref, alt,
                referenceFileName, referenceDataSource, gencodeFeatureReader, transcriptFastaFile, transcriptGtfFile,
                transcriptSelectionMode);

        final FuncotationMap funcotationMap = FuncotationMap.createFromGencodeFuncotations(gencodeFuncotations);

        Assert.assertEquals(funcotationMap.getTranscriptList(), gtTranscripts);
        Assert.assertTrue(funcotationMap.getTranscriptList().stream().allMatch(k -> funcotationMap.get(k).size() == 1));
        Assert.assertTrue(funcotationMap.getTranscriptList().stream()
                .noneMatch(k -> ((GencodeFuncotation) funcotationMap.get(k).get(0)).getVariantClassification().equals(GencodeFuncotation.VariantClassification.IGR)));
        Assert.assertTrue(funcotationMap.getTranscriptList().stream()
                .noneMatch(k -> ((GencodeFuncotation) funcotationMap.get(k).get(0)).getVariantClassification().equals(GencodeFuncotation.VariantClassification.COULD_NOT_DETERMINE)));
    } catch (final IOException ex) {
        throw new GATKException("Could not close gencodeFeatureReader!", ex);
    }
}
Example #13
Source File: StructuralVariantFileLoader.java From hmftools with GNU General Public License v3.0

@NotNull
public static List<StructuralVariant> fromFile(@NotNull String vcfFileLocation, @NotNull VariantContextFilter filter) throws IOException {
    final StructuralVariantFactory factory = new StructuralVariantFactory(filter);

    try (final AbstractFeatureReader<VariantContext, LineIterator> reader =
                 AbstractFeatureReader.getFeatureReader(vcfFileLocation, new VCFCodec(), false)) {
        reader.iterator().forEach(factory::addVariantContext);
    }

    return factory.results();
}
Example #14
Source File: SomaticVariantFactory.java From hmftools with GNU General Public License v3.0

public void fromVCFFile(@NotNull final String tumor, @Nullable final String reference, @Nullable final String rna,
        @NotNull final String vcfFile, Consumer<SomaticVariant> consumer) throws IOException {
    final List<VariantContext> variants = Lists.newArrayList();

    try (final AbstractFeatureReader<VariantContext, LineIterator> reader =
                 getFeatureReader(vcfFile, new VCFCodec(), false)) {
        final VCFHeader header = (VCFHeader) reader.getHeader();
        if (!sampleInFile(tumor, header)) {
            throw new IllegalArgumentException("Sample " + tumor + " not found in vcf file " + vcfFile);
        }
        if (reference != null && !sampleInFile(reference, header)) {
            throw new IllegalArgumentException("Sample " + reference + " not found in vcf file " + vcfFile);
        }
        if (rna != null && !sampleInFile(rna, header)) {
            throw new IllegalArgumentException("Sample " + rna + " not found in vcf file " + vcfFile);
        }
        if (!header.hasFormatLine("AD")) {
            throw new IllegalArgumentException("Allelic depths is a required format field in vcf file " + vcfFile);
        }

        for (VariantContext variant : reader.iterator()) {
            if (filter.test(variant)) {
                createVariant(tumor, reference, rna, variant).ifPresent(consumer);
            }
        }
    }
}
Example #15
Source File: GATKVariantContextUtilsUnitTest.java From gatk with BSD 3-Clause "New" or "Revised" License

@Test
public void testOnTheFlyTabixCreation() throws IOException {
    final File inputGZIPFile = new File(publicTestDir + "org/broadinstitute/hellbender/engine/8_mutect2_sorted.vcf.gz");
    final File outputGZIPFile = createTempFile("testOnTheFlyTabixCreation", ".vcf.gz");
    long recordCount = 0;

    try (final VariantContextWriter vcfWriter = GATKVariantContextUtils.createVCFWriter(
                 outputGZIPFile.toPath(), null, false, Options.INDEX_ON_THE_FLY);
         final FeatureReader<VariantContext> inputFileReader =
                 AbstractFeatureReader.getFeatureReader(inputGZIPFile.getAbsolutePath(), new VCFCodec(), false)) {
        vcfWriter.writeHeader((VCFHeader) inputFileReader.getHeader());
        final Iterator<VariantContext> it = inputFileReader.iterator();
        while (it.hasNext()) {
            vcfWriter.add(it.next());
            recordCount++;
        }
    }

    // make sure we got a tabix index
    final File tabixIndexFile = new File(outputGZIPFile.getAbsolutePath() + FileExtensions.TABIX_INDEX);
    Assert.assertTrue(tabixIndexFile.exists());
    Assert.assertTrue(tabixIndexFile.length() > 0);

    // verify the index via query
    final SimpleInterval queryInterval = new SimpleInterval("chr6:33414233-118314029");
    try (final FeatureReader<VariantContext> outputFileReader = AbstractFeatureReader.getFeatureReader(
            outputGZIPFile.getAbsolutePath(), new VCFCodec(), true)) { // require index
        final long actualCount = outputFileReader.query(
                queryInterval.getContig(), queryInterval.getStart(), queryInterval.getEnd()).stream().count();
        Assert.assertEquals(actualCount, recordCount);
    }
}
Example #16
Source File: TargetCodec.java From gatk-protected with BSD 3-Clause "New" or "Revised" License

@Override
public boolean canDecode(final String path) {
    File file;
    try {
        // Use the URI constructor so that we can handle file:// URIs
        final URI uri = new URI(path);
        file = uri.isAbsolute() ? new File(uri) : new File(path);
    } catch (Exception e) {
        // Contract for canDecode() mandates that all exceptions be trapped
        return false;
    }
    if (!file.canRead() || !file.isFile()) {
        return false;
    }
    try {
        new TargetTableReader(file).close();
    } catch (final IOException | RuntimeException ex) {
        // IOException corresponds to low-level IO errors,
        // whereas RuntimeException would be caused by a formatting error in the file.
        return false;
    }
    // disallow .bed extension
    final String toDecode = AbstractFeatureReader.hasBlockCompressedExtension(path)
            ? path.substring(0, path.lastIndexOf("."))
            : path;
    return !toDecode.toLowerCase().endsWith(BED_EXTENSION);
}
Example #17
Source File: AnnotatedIntervalCollection.java From gatk with BSD 3-Clause "New" or "Revised" License

/**
 * Create a collection based on the contents of an input file and a given config file. The config file must be the same as
 * is ingested by {@link XsvLocatableTableCodec}.
 *
 * @param input readable path to use for the xsv file. Must be readable. Never {@code null}.
 * @param inputConfigFile config file for specifying the format of the xsv file. Must be readable. Never {@code null}.
 * @param headersOfInterest Only preserve these headers. Only a warning will be logged if all of these are not
 *                          present in the input file. This parameter should not include the locatable columns
 *                          defined by the config file, which are always preserved.
 *                          Use {@code null} to indicate "all headers are of interest".
 * @return never {@code null}
 */
public static AnnotatedIntervalCollection create(final Path input, final Path inputConfigFile, final Set<String> headersOfInterest) {
    IOUtils.assertFileIsReadable(input);
    IOUtils.assertFileIsReadable(inputConfigFile);

    final AnnotatedIntervalCodec codec = new AnnotatedIntervalCodec(inputConfigFile);
    final List<AnnotatedInterval> regions = new ArrayList<>();

    if (codec.canDecode(input.toUri().toString())) {
        try (final FeatureReader<AnnotatedInterval> reader =
                     AbstractFeatureReader.getFeatureReader(input.toUri().toString(), codec, false)) {
            // This cast is an artifact of the tribble framework.
            @SuppressWarnings("unchecked")
            final AnnotatedIntervalHeader header = (AnnotatedIntervalHeader) reader.getHeader();
            final List<String> finalAnnotations = determineCollectionAnnotations(headersOfInterest, header.getAnnotations());

            final CloseableTribbleIterator<AnnotatedInterval> it = reader.iterator();
            StreamSupport.stream(it.spliterator(), false)
                    .filter(r -> r != null)
                    .map(r -> copyAnnotatedInterval(r, new HashSet<>(finalAnnotations)))
                    .forEach(r -> regions.add(r));

            return new AnnotatedIntervalCollection(header.getSamFileHeader(), finalAnnotations, regions);
        } catch (final IOException ex) {
            throw new GATKException("Error - IO problem with file " + input, ex);
        }
    } else {
        throw new UserException.BadInput("Could not parse xsv file: " + input.toUri().toString());
    }
}
Example #18
Source File: BEDFileLoader.java From hmftools with GNU General Public License v3.0

@NotNull
public static SortedSetMultimap<String, GenomeRegion> fromBedFile(@NotNull String bedFile) throws IOException {
    final SortedSetMultimap<String, GenomeRegion> regionMap = TreeMultimap.create();

    String prevChromosome = null;
    GenomeRegion prevRegion = null;
    try (final AbstractFeatureReader<BEDFeature, LineIterator> reader =
                 getFeatureReader(bedFile, new BEDCodec(), false)) {
        for (final BEDFeature bedFeature : reader.iterator()) {
            final String chromosome = bedFeature.getContig();
            final long start = bedFeature.getStart();
            final long end = bedFeature.getEnd();

            if (end < start) {
                LOGGER.warn("Invalid genome region found in chromosome {}: start={}, end={}", chromosome, start, end);
            } else {
                final GenomeRegion region = GenomeRegions.create(chromosome, start, end);
                if (prevRegion != null && chromosome.equals(prevChromosome) && prevRegion.end() >= start) {
                    LOGGER.warn("BED file is not sorted, please fix! Current={}, Previous={}", region, prevRegion);
                } else {
                    regionMap.put(chromosome, region);
                    prevChromosome = chromosome;
                    prevRegion = region;
                }
            }
        }
    }

    return regionMap;
}
Example #19
Source File: GermlineVcfReader.java From hmftools with GNU General Public License v3.0

private void processVcf(final String vcfFile) {
    try {
        LOGGER.info("processing germline VCF({})", vcfFile);

        mSampleGermlineSVs.clear();
        mSvFactory = new StructuralVariantFactory(new AlwaysPassFilter());

        final AbstractFeatureReader<VariantContext, LineIterator> reader = AbstractFeatureReader.getFeatureReader(
                vcfFile, new VCFCodec(), false);

        reader.iterator().forEach(x -> processVariant(x));

        if (mSampleGermlineSVs.isEmpty())
            return;

        if (mConfig.LinkByAssembly)
            annotateAssembledLinks(mSvAssemblyData);

        if (mConfig.CheckDisruptions) {
            final String sampleId = mSampleGermlineSVs.get(0).SampleId;
            mGeneImpact.findDisruptiveVariants(sampleId, mSampleGermlineSVs);
        }

        writeSVs();
    } catch (IOException e) {
        LOGGER.error("error reading vcf({}): {}", vcfFile, e.toString());
    }
}
Example #20
Source File: FeatureDataSource.java From gatk with BSD 3-Clause "New" or "Revised" License

/**
 * Creates a FeatureDataSource backed by the provided FeatureInput. We will look ahead the specified number of bases
 * during queries that produce cache misses.
 *
 * @param featureInput a FeatureInput specifying a source of Features
 * @param queryLookaheadBases look ahead this many bases during queries that produce cache misses
 * @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs
 *                          that produce this type of Feature. May be null, which results in an unrestricted search.
 * @param cloudPrefetchBuffer MB size of caching/prefetching wrapper for the data, if on Google Cloud (0 to disable).
 * @param cloudIndexPrefetchBuffer MB size of caching/prefetching wrapper for the index, if on Google Cloud (0 to disable).
 * @param genomicsDBOptions options and info for reading from a GenomicsDB; may be null
 */
public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLookaheadBases,
        final Class<? extends Feature> targetFeatureType, final int cloudPrefetchBuffer,
        final int cloudIndexPrefetchBuffer, final GenomicsDBOptions genomicsDBOptions) {
    Utils.validateArg(queryLookaheadBases >= 0, "Query lookahead bases must be >= 0");
    this.featureInput = Utils.nonNull(featureInput, "featureInput must not be null");
    if (IOUtils.isGenomicsDBPath(featureInput)) {
        Utils.nonNull(genomicsDBOptions, "GenomicsDBOptions must not be null. Calling tool may not read from a GenomicsDB data source.");
    }

    // Create a feature reader without requiring an index. We will require one ourselves as soon as
    // a query by interval is attempted.
    this.featureReader = getFeatureReader(featureInput, targetFeatureType,
            BucketUtils.getPrefetchingWrapper(cloudPrefetchBuffer),
            BucketUtils.getPrefetchingWrapper(cloudIndexPrefetchBuffer),
            genomicsDBOptions);

    if (IOUtils.isGenomicsDBPath(featureInput)) {
        // genomics db uri's have no associated index file to read from, but they do support random access
        this.hasIndex = false;
        this.supportsRandomAccess = true;
    } else if (featureReader instanceof AbstractFeatureReader) {
        this.hasIndex = ((AbstractFeatureReader<T, ?>) featureReader).hasIndex();
        this.supportsRandomAccess = hasIndex;
    } else {
        throw new GATKException("Found a feature input that was neither GenomicsDB or a Tribble AbstractFeatureReader. Input was " + featureInput.toString() + ".");
    }

    // Due to a bug in HTSJDK, unindexed block compressed input files may fail to parse completely. For safety,
    // these files have been disabled. See https://github.com/broadinstitute/gatk/issues/4224 for discussion
    if (!hasIndex && IOUtil.hasBlockCompressedExtension(featureInput.getFeaturePath())) {
        throw new UserException.MissingIndex(featureInput.toString(),
                "Support for unindexed block-compressed files has been temporarily disabled. Try running IndexFeatureFile on the input.");
    }

    this.currentIterator = null;
    this.intervalsForTraversal = null;
    this.queryCache = new FeatureCache<>();
    this.queryLookaheadBases = queryLookaheadBases;
}
Example #21
Source File: MakeVcfSampleNameMap.java From picard with MIT License

private static AbstractFeatureReader<VariantContext, LineIterator> getReaderFromPath(final Path variantPath) {
    final String variantURI = variantPath.toAbsolutePath().toUri().toString();
    try {
        return AbstractFeatureReader.getFeatureReader(variantURI, null, new VCFCodec(), false,
                Function.identity(), Function.identity());
    } catch (final TribbleException e) {
        throw new PicardException("Failed to create reader from " + variantURI, e);
    }
}
Example #22
Source File: MakeVcfSampleNameMap.java From picard with MIT License

private static VCFHeader getHeaderFromPath(final Path variantPath) {
    try (final AbstractFeatureReader<VariantContext, LineIterator> reader = getReaderFromPath(variantPath)) {
        final VCFHeader header = (VCFHeader) reader.getHeader();
        if (header == null) {
            throw new PicardException("Null header found in " + variantPath.toUri() + ".");
        }
        return header;
    } catch (final IOException e) {
        throw new PicardException("Error while reading VCF header from " + variantPath.toUri(), e);
    }
}
Example #23
Source File: GatherVcfs.java From picard with MIT License

/**
 * Checks (via filename checking) that all files appear to be block compressed files.
 */
private boolean areAllBlockCompressed(final List<File> input) {
    for (final File f : input) {
        if (VCFFileReader.isBCF(f) || !AbstractFeatureReader.hasBlockCompressedExtension(f)) {
            return false;
        }
    }
    return true;
}
Example #24
Source File: GatherVcfsCloud.java From gatk with BSD 3-Clause "New" or "Revised" License

private static FeatureReader<VariantContext> getReaderFromVCFUri(final Path variantPath, final int cloudPrefetchBuffer) {
    final String variantURI = variantPath.toUri().toString();
    return AbstractFeatureReader.getFeatureReader(variantURI, null, new VCFCodec(), false,
            BucketUtils.getPrefetchingWrapper(cloudPrefetchBuffer),
            Function.identity());
}
Example #25
Source File: ApplyVQSRIntegrationTest.java From gatk with BSD 3-Clause "New" or "Revised" License

@Test
public void testApplyRecalibrationAlleleSpecificINDELmodeGZIPIndex() throws IOException {
    final File tempGZIPOut = createTempFile("testApplyRecalibrationAlleleSpecificINDELmodeGZIP", ".vcf.gz");
    final File expectedFile = new File(getToolTestDataDir(), "expected/applyIndelAlleleSpecificResult.vcf");
    final SimpleInterval queryInterval = new SimpleInterval("3:113005755-195507036");

    // The input file is the same file as used in testApplyRecalibrationAlleleSpecificINDELmode, except that the
    // hg38 sequence dictionary has been transplanted into the header (the header itself has been modified so that
    // contig "chr3" is renamed to "3" to match the variants in this file), and the file is a .gz.
    final String base =
            " -L " + queryInterval.toString() +
            " -mode INDEL -AS" +
            " -ts-filter-level 99.3" +
            " --variant " + getToolTestDataDir() + "VQSR.AStest.postSNPinput.HACKEDhg38header.vcf.gz" +
            " --output " + tempGZIPOut.getAbsolutePath() +
            " --tranches-file " + getToolTestDataDir() + "VQSR.AStest.indels.tranches" +
            " --recal-file " + getToolTestDataDir() + "VQSR.AStest.indels.recal.vcf" +
            " --" + StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE + " false";

    final IntegrationTestSpec spec = new IntegrationTestSpec(base, Collections.emptyList());
    spec.executeTest("testApplyRecalibrationAlleleSpecificINDELmodeGZIP", this);

    // make sure we got a tabix index
    final File tabixIndexFile = new File(tempGZIPOut.getAbsolutePath() + FileExtensions.TABIX_INDEX);
    Assert.assertTrue(tabixIndexFile.exists());
    Assert.assertTrue(tabixIndexFile.length() > 0);

    // Now verify that the resulting index is valid by querying the same interval used above to run ApplyVQSR.
    // This should return all the variants in the file. Compare that with the number of records returned by (a
    // non-query) iterator that returns all variants in the expected results file.
    try (final FeatureReader<VariantContext> expectedFileReader =
                 AbstractFeatureReader.getFeatureReader(expectedFile.getAbsolutePath(), new VCFCodec(), false);
         final FeatureReader<VariantContext> outputFileReader =
                 AbstractFeatureReader.getFeatureReader(tempGZIPOut.getAbsolutePath(), new VCFCodec())) {
        // results from the query should match the expected file results
        final long actualCount = outputFileReader.query(
                queryInterval.getContig(), queryInterval.getStart(), queryInterval.getEnd()).stream().count();
        final long expectedCount = expectedFileReader.iterator().stream().count();
        Assert.assertEquals(actualCount, expectedCount);
    }
}
Example #26
Source File: BedToIntervalList.java From picard with MIT License

@Override
protected int doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsReadable(SEQUENCE_DICTIONARY);
    IOUtil.assertFileIsWritable(OUTPUT);
    try {
        // create a new header that we will assign the dictionary provided by the SAMSequenceDictionaryExtractor to.
        final SAMFileHeader header = new SAMFileHeader();
        final SAMSequenceDictionary samSequenceDictionary = SAMSequenceDictionaryExtractor.extractDictionary(SEQUENCE_DICTIONARY.toPath());
        header.setSequenceDictionary(samSequenceDictionary);
        // set the sort order to be sorted by coordinate, which is actually done below
        // by getting the .uniqued() intervals list before we write out the file
        header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
        final IntervalList intervalList = new IntervalList(header);

        final FeatureReader<BEDFeature> bedReader = AbstractFeatureReader.getFeatureReader(INPUT.getAbsolutePath(), new BEDCodec(), false);
        final CloseableTribbleIterator<BEDFeature> iterator = bedReader.iterator();
        final ProgressLogger progressLogger = new ProgressLogger(LOG, (int) 1e6);

        while (iterator.hasNext()) {
            final BEDFeature bedFeature = iterator.next();
            final String sequenceName = bedFeature.getContig();
            final int start = bedFeature.getStart();
            final int end = bedFeature.getEnd();
            // NB: do not use an empty name within an interval
            final String name;
            if (bedFeature.getName().isEmpty()) {
                name = null;
            } else {
                name = bedFeature.getName();
            }

            final SAMSequenceRecord sequenceRecord = header.getSequenceDictionary().getSequence(sequenceName);

            // Do some validation
            if (null == sequenceRecord) {
                if (DROP_MISSING_CONTIGS) {
                    LOG.info(String.format("Dropping interval with missing contig: %s:%d-%d", sequenceName, bedFeature.getStart(), bedFeature.getEnd()));
                    missingIntervals++;
                    missingRegion += bedFeature.getEnd() - bedFeature.getStart();
                    continue;
                }
                throw new PicardException(String.format("Sequence '%s' was not found in the sequence dictionary", sequenceName));
            } else if (start < 1) {
                throw new PicardException(String.format("Start on sequence '%s' was less than one: %d", sequenceName, start));
            } else if (sequenceRecord.getSequenceLength() < start) {
                throw new PicardException(String.format("Start on sequence '%s' was past the end: %d < %d", sequenceName, sequenceRecord.getSequenceLength(), start));
            } else if ((end == 0 && start != 1) // special case for 0-length interval at the start of a contig
                    || end < 0) {
                throw new PicardException(String.format("End on sequence '%s' was less than one: %d", sequenceName, end));
            } else if (sequenceRecord.getSequenceLength() < end) {
                throw new PicardException(String.format("End on sequence '%s' was past the end: %d < %d", sequenceName, sequenceRecord.getSequenceLength(), end));
            } else if (end < start - 1) {
                throw new PicardException(String.format("On sequence '%s', end < start-1: %d <= %d", sequenceName, end, start));
            }

            final boolean isNegativeStrand = bedFeature.getStrand() == Strand.NEGATIVE;
            final Interval interval = new Interval(sequenceName, start, end, isNegativeStrand, name);
            intervalList.add(interval);

            progressLogger.record(sequenceName, start);
        }
        CloserUtil.close(bedReader);

        if (DROP_MISSING_CONTIGS) {
            if (missingRegion == 0) {
                LOG.info("There were no missing regions.");
            } else {
                LOG.warn(String.format("There were %d missing regions with a total of %d bases", missingIntervals, missingRegion));
            }
        }

        // Sort and write the output
        IntervalList out = intervalList;
        if (SORT) {
            out = out.sorted();
        }
        if (UNIQUE) {
            out = out.uniqued();
        }
        out.write(OUTPUT);
        LOG.info(String.format("Wrote %d intervals spanning a total of %d bases", out.getIntervals().size(), out.getBaseCount()));
    } catch (final IOException e) {
        throw new RuntimeException(e);
    }
    return 0;
}
Example #27
Source File: ReplicationOriginAnnotator.java From hmftools with GNU General Public License v3.0

public void loadReplicationOrigins(final String filename) {
    if (filename.isEmpty())
        return;

    try {
        String currentChr = "";
        List<ReplicationOriginRegion> regionList = null;
        int regionCount = 0;

        final AbstractFeatureReader<BEDFeature, LineIterator> reader =
                getFeatureReader(filename, new BEDCodec(), false);

        for (final BEDFeature bedFeature : reader.iterator()) {
            final String chromosome = refGenomeChromosome(bedFeature.getContig(), RG_VERSION);

            if (!chromosome.equals(currentChr)) {
                currentChr = chromosome;
                regionList = mReplicationOrigins.get(chromosome);

                if (regionList == null) {
                    regionList = Lists.newArrayList();
                    mReplicationOrigins.put(currentChr, regionList);
                }
            }

            ++regionCount;

            double originValue = Double.parseDouble(bedFeature.getName()) / 100;

            regionList.add(new ReplicationOriginRegion(
                    chromosome, bedFeature.getStart(), bedFeature.getEnd(), originValue));
        }

        LNX_LOGGER.debug("loaded {} replication origins", regionCount);
    } catch (IOException exception) {
        LNX_LOGGER.error("Failed to read replication origin BED file({})", filename);
    }
}