Python pybedtools.BedTool() Examples

The following are 30 code examples of pybedtools.BedTool(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pybedtools , or try the search function .
Example #1
Source File: annotate_bed_for_ml.py    From metasv with BSD 2-Clause "Simplified" License 6 votes vote down vote up
def add_weighted_score(in_bed, score_bed):
    out_bed = in_bed.intersect(score_bed, wao=True).saveas(os.path.join(args.tmpdir, "score.bed"))

    bed_array = []
    last_interval = pybedtools.Interval("", 0, 0)
    map_value = 0.0
    for interval in out_bed:
        if interval.chrom != last_interval.chrom or interval.start != last_interval.start or interval.end != last_interval.end:
            if last_interval.chrom:
                bed_array.append(tuple(last_interval.fields[:-5]) + (str(map_value),))
            map_value = 0.0
            last_interval = interval
        if float(interval.fields[-1]) > 0:
            map_value += float(interval.fields[-1]) * float(interval.fields[-2]) / float(interval.length)

    if last_interval.chrom:
        bed_array.append(tuple(last_interval.fields[:-5]) + (str(map_value),))

    return pybedtools.BedTool(bed_array) 
Example #2
Source File: dataloader.py    From models with MIT License 6 votes vote down vote up
def __init__(self,
                 intervals_file,
                 fasta_file,
                 dnase_file,
                 use_linecache=True):

        # intervals
        if use_linecache:
            linecache.clearcache()
            BT = BedToolLinecache
        else:
            BT = BedTool

        self.bt = BT(intervals_file)

        # Fasta
        self.fasta_file = fasta_file
        self.fasta_extractor = None  # initialize later
        # DNase
        self.dnase_file = dnase_file
        self.dnase_extractor = None 
Example #3
Source File: dataloader.py    From models with MIT License 6 votes vote down vote up
def __init__(self,
                 intervals_file,
                 fasta_file,
                 dnase_file,
                 use_linecache=True):

        # intervals
        if use_linecache:
            linecache.clearcache()
            BT = BedToolLinecache
        else:
            BT = BedTool

        self.bt = BT(intervals_file)

        # Fasta
        self.fasta_file = fasta_file
        self.fasta_extractor = None  # initialize later
        # DNase
        self.dnase_file = dnase_file
        self.dnase_extractor = None 
Example #4
Source File: find_sites.py    From qapa with GNU General Public License v3.0 6 votes vote down vote up
def group_reads(bedfile):
    reads = pybedtools.BedTool(bedfile)
   
    forward = reads.filter(lambda x: x.strand == "+").saveas()
    if len(forward) > 0:
        forward = sort_by_strand(forward, strand="+")\
                .groupby(g=[1,3,6], c=[2,4], o=['min','count'], full=True)\
                .cut([0,6,2,3,7,5])\
                .saveas()

    reverse = reads.filter(lambda x: x.strand == "-").saveas()
    if len(reverse) > 0:
        reverse = sort_by_strand(reverse, strand="-")\
                .groupby(g=[1,2,6], c=[3,4], o=['max','count'], full=True)\
                .cut([0,1,6,3,7,5])\
                .saveas()
 
    grouped_reads = forward.cat(reverse, postmerge=False)
    grouped_reads = sort_bed(grouped_reads).saveas()

    return grouped_reads 
Example #5
Source File: utilities.py    From SnapTools with Apache License 2.0 6 votes vote down vote up
def is_bed(fname):
    """Check if a given file is a bed file
    
    Args:
        fname: a bed file name

    Returns:
        True if file is a bed file, otherwise False
    """
    try:
        # open fname using pysam
        fileType = pybedtools.BedTool(fname).file_type;
        if fileType == "bed":
            return True
    except IndexError as e:
        # handle the errors
        return False
    return False 
Example #6
Source File: utilities.py    From SnapTools with Apache License 2.0 6 votes vote down vote up
def is_gff(fname):
    """Check if a given file is a gff file
    
    Args:
        fname: a gff file name

    Returns:
        True if file is a gff or gtf file, otherwise False
    """
    try:
        # open fname using pysam
        fileType = pybedtools.BedTool(fname).file_type;
        if fileType == "gff":
            return True
    except IndexError as e:
        # handle the errors
        return False
    return False 
Example #7
Source File: utilities.py    From SnapTools with Apache License 2.0 6 votes vote down vote up
def is_bam(fname):
    """Check if a given file is a bam file
    
    Args:
        fname: file name

    Returns:
        True if file is a bam file, otherwise False
    """
    try:
        fileType = pybedtools.BedTool(fname).file_type;
        if fileType == "bam":
            return True
    except ValueError as e:
        # handle the errors
        return False
    return True 
Example #8
Source File: dataloader.py    From models with MIT License 6 votes vote down vote up
def __init__(self,
                 intervals_file,
                 fasta_file,
                 dnase_file,
                 use_linecache=True):

        # intervals
        if use_linecache:
            linecache.clearcache()
            BT = BedToolLinecache
        else:
            BT = BedTool

        self.bt = BT(intervals_file)

        # Fasta
        self.fasta_file = fasta_file
        self.fasta_extractor = None  # initialize later
        # DNase
        self.dnase_file = dnase_file
        self.dnase_extractor = None 
Example #9
Source File: utilities.py    From SnapTools with Apache License 2.0 6 votes vote down vote up
def is_gff(fname):
    """Check if a given file is a gff file
    
    Args:
        fname: a gff file name

    Returns:
        True if file is a gff or gtf file, otherwise False
    """
    try:
        # open fname using pysam
        fileType = pybedtools.BedTool(fname).file_type;
        if fileType == "gff":
            return True
    except IndexError as e:
        # handle the errors
        return False
    return False 
Example #10
Source File: dataloader.py    From models with MIT License 6 votes vote down vote up
def __init__(self,
                 intervals_file,
                 fasta_file,
                 dnase_file,
                 use_linecache=True):

        # intervals
        if use_linecache:
            linecache.clearcache()
            BT = BedToolLinecache
        else:
            BT = BedTool

        self.bt = BT(intervals_file)

        # Fasta
        self.fasta_file = fasta_file
        self.fasta_extractor = None  # initialize later
        # DNase
        self.dnase_file = dnase_file
        self.dnase_extractor = None 
Example #11
Source File: utilities.py    From SnapTools with Apache License 2.0 6 votes vote down vote up
def is_bam(fname):
    """Check if a given file is a bam file
    
    Args:
        fname: file name

    Returns:
        True if file is a bam file, otherwise False
    """
    try:
        fileType = pybedtools.BedTool(fname).file_type;
        if fileType == "bam":
            return True
    except ValueError as e:
        # handle the errors
        return False
    return True 
Example #12
Source File: dataloader.py    From models with MIT License 6 votes vote down vote up
def __init__(self,
                 intervals_file,
                 fasta_file,
                 dnase_file,
                 use_linecache=True):

        # intervals
        if use_linecache:
            linecache.clearcache()
            BT = BedToolLinecache
        else:
            BT = BedTool

        self.bt = BT(intervals_file)

        # Fasta
        self.fasta_file = fasta_file
        self.fasta_extractor = None  # initialize later
        # DNase
        self.dnase_file = dnase_file
        self.dnase_extractor = None 
Example #13
Source File: genotyping.py    From grocsvs with MIT License 6 votes vote down vote up
def ensure_file_is_bed(path):
    if path.endswith(".bedpe"):
        table = pandas.read_table(path)
        columns = ["chromx", "startx", "endx", "chromy", "starty", "endy", "name"]
        i = 0
        while len(columns) < len(table.columns):
            columns.append("extra_{}".format(i))

        table.columns = columns
        bedx = pandas.DataFrame()
        bedx["chrom"] = table["chromx"]
        bedx["start"] = table["startx"]
        bedx["end"] =   table["endx"]
        bedx["name"] =  table["name"]

        bedy = pandas.DataFrame()
        bedy["chrom"] = table["chromy"]
        bedy["start"] = table["starty"]
        bedy["end"] =   table["endy"]
        bedy["name"] =  table["name"]

        bed = pandas.concat([bedx, bedy], ignore_index=True)

        return pybedtools.BedTool.from_dataframe(bed).sort()
    return pybedtools.BedTool(path).sort() 
Example #14
Source File: make_annot.py    From ldsc with GNU General Public License v3.0 6 votes vote down vote up
def make_annot_files(args, bed_for_annot):
    print('making annot file')
    df_bim = pd.read_csv(args.bimfile,
            delim_whitespace=True, usecols = [0,1,2,3], names = ['CHR','SNP','CM','BP'])
    iter_bim = [['chr'+str(x1), x2 - 1, x2] for (x1, x2) in np.array(df_bim[['CHR', 'BP']])]
    bimbed = BedTool(iter_bim)
    annotbed = bimbed.intersect(bed_for_annot)
    bp = [x.start + 1 for x in annotbed]
    df_int = pd.DataFrame({'BP': bp, 'ANNOT':1})
    df_annot = pd.merge(df_bim, df_int, how='left', on='BP')
    df_annot.fillna(0, inplace=True)
    df_annot = df_annot[['ANNOT']].astype(int)
    if args.annot_file.endswith('.gz'):
        with gzip.open(args.annot_file, 'wb') as f:
            df_annot.to_csv(f, sep = "\t", index = False)
    else:
        df_annot.to_csv(args.annot_file, sep="\t", index=False) 
Example #15
Source File: TranscriptClean.py    From TranscriptClean with MIT License 6 votes vote down vote up
def find_closest_bound(sj_bound, ref_bounds):
    """ Given one side of a splice junction, find the closest reference """

    # Create a Bedtool object for the bound
    bed_pos = pybedtools.BedTool(sj_bound.getBED(), from_string=True)

    # Run Bedtools Closest operation
    closest = bed_pos.closest(ref_bounds, s=True, D="ref", t="first", nonamecheck = True)[0]

    # Create an object to represent the closest match
    # Coordinates are 0-based since they are coming from BED
    obj_closest = dstruct.Struct()
    obj_closest.chrom = closest[6]
    obj_closest.start = int(closest[7])
    obj_closest.end = int(closest[8])
    obj_closest.dist = int(closest[-1])
     
    return obj_closest 
Example #16
Source File: annotate_bed_for_ml_del.py    From metasv with BSD 2-Clause "Simplified" License 6 votes vote down vote up
def add_weighted_score(in_bed, score_bed):
    out_bed = in_bed.intersect(score_bed, wao=True).saveas(os.path.join(args.tmpdir, "score.bed"))

    bed_array = []
    last_interval = pybedtools.Interval("", 0, 0)
    map_value = 0.0
    for interval in out_bed:
        if interval.chrom != last_interval.chrom or interval.start != last_interval.start or interval.end != last_interval.end:
            if last_interval.chrom:
                bed_array.append(tuple(last_interval.fields[:-5]) + (str(map_value),))
            map_value = 0.0
            last_interval = interval
        if float(interval.fields[-1]) > 0:
            map_value += float(interval.fields[-1]) * float(interval.fields[-2]) / float(interval.length)

    if last_interval.chrom:
        bed_array.append(tuple(last_interval.fields[:-5]) + (str(map_value),))

    return pybedtools.BedTool(bed_array) 
Example #17
Source File: generate_sv_intervals.py    From metasv with BSD 2-Clause "Simplified" License 6 votes vote down vote up
def merge_for_each_sv(bedtool, c, o, svs_to_softclip=SVS_SOFTCLIP_SUPPORTED,
                      overlap_ratio=OVERLAP_RATIO, d=0, reciprocal_for_2bp=True,
                      sv_type_field=[3, 1], inter_tools=False):
    merged_bedtool = pybedtools.BedTool([])
    for svtype in svs_to_softclip:
        sv_bedtool = bedtool.filter(lambda x: svtype in x.fields[sv_type_field[0]].split(',')[sv_type_field[1]]).sort()
        if sv_bedtool.count() == 0:
            continue
        if svtype == "INS" or not reciprocal_for_2bp:
            sv_bedtool = sv_bedtool.merge(c=c, o=o, d=d)
        else:
            sv_bedtool = merge_intervals_bed(sv_bedtool, overlap_ratio=overlap_ratio,
                                             c=c, o=o)
        if len(merged_bedtool) > 0:
            merged_bedtool = sv_bedtool.cat(merged_bedtool, postmerge=False)
        else:
            merged_bedtool = sv_bedtool
    return merged_bedtool.sort() 
Example #18
Source File: utilities.py    From SnapTools with Apache License 2.0 5 votes vote down vote up
def isBedQuerynameSorted(fname):
    """Check if a given bed file is sorted based on the queryname (the 4th column).
    
    Args:
    -----
        fname: a bed file name

    Returns:
    -----
        True if fname is a bed file sorted by read name, otherwise False 
    """
    
    if not is_bed(fname):
        print(("Error @is_bed_queryname_sorted: " + fname + " is not a bed file!"));
        return False;
    
    # read the first 1 million reads and see if the read name is sorted
    fin = pybedtools.BedTool(fname);
    flag = True;
    num_instance = 0
    for item in fin:
        cur_barcode = str(item).split()[3];
        if num_instance == 0:
            prev_barcode = cur_barcode;
        num_instance += 1;
        if num_instance >= 1000000: break;
        if cur_barcode < prev_barcode: 
            flag = False;
            break;
    del fin
    return flag 
Example #19
Source File: TranscriptClean.py    From TranscriptClean with MIT License 5 votes vote down vote up
def find_closest_ref_junction(junction, ref_donors, ref_acceptors):
    """ Given a splice junction object and a splice reference, locate the 
        closest junction in the provided splice reference BedTool object"""

    # Get the splice donor and acceptor of the junction
    donor = junction.get_splice_donor()
    acceptor = junction.get_splice_acceptor()

    # Find the closest match for each
    closest_ref_donor = find_closest_bound(donor, ref_donors)
    closest_ref_acceptor = find_closest_bound(acceptor, ref_acceptors)

    return closest_ref_donor, closest_ref_acceptor 
Example #20
Source File: SNPtools.py    From slamdunk with GNU Affero General Public License v3.0 5 votes vote down vote up
def read(self):        
        if (self._vcfFile != None):
            if(os.path.exists(self._vcfFile)):
                vcfReader = BedTool(self._vcfFile)
                
                if(vcfReader.file_type != "vcf"):
                    print("Wrong file type. Empty or not a vcf file.")
         
                for snp in vcfReader:
                    self._addSNP(snp)
            else:
                print("Warning: SNP file " + self._vcfFile + " not found.") 
Example #21
Source File: getALTsForTargetsSeqsForMutSig.py    From CovGen with MIT License 5 votes vote down vote up
def mergeLoci(bedf):
	import pybedtools
	logger.info("Merging Loci in BedFile ...")
	b = pybedtools.BedTool(bedf)
	b_merged = b.sort().merge()
	newBedFilename = bedf+".merged.vcf"
	b_merged.moveto(newBedFilename)
	logger.info("Merging Loci in BedFile ... DONE")
	return newBedFilename 
Example #22
Source File: test_annotate.py    From qapa with GNU General Public License v3.0 5 votes vote down vote up
def test_gene_at_beginning_of_chr_2(self):
        example = "chr17_KI270861v1_alt	24	5793	ENST00000634102_SLC43A2 5631	-	5631	5793	SLC43A2 24,6624,8277,13231,15940,20829,21296,21593,23214,43222,45006,46589,57742,58821 5793,6748,8351,13364,16079,20976,21499,21727,23307,43299,45062,46797,57948,58864"
        bed = pybedtools.BedTool(example, from_string=True)
        l = 24
        feature = bed[0]
        target = anno.extend_feature(feature, length=l)
        self.assertEqual(target.start, 0)
        self.assertEqual(target.end, 5793)

        target = anno.restore_feature(target, length=l)
        self.assertEqual(target.start, 24) 
Example #23
Source File: tepid.py    From TEPID with GNU General Public License v3.0 5 votes vote down vote up
def check_te_overlaps_dels(te, bamfile, te_list, prefix):
    pybedtools.BedTool(bamfile).bam_to_bed().saveas(prefix + 'split.temp')
    convert_split_pairbed(prefix + 'split.temp', prefix + 'split_bedpe.temp')
    split = pybedtools.BedTool(prefix + 'split_bedpe.temp').each(append_origin, word='split').saveas()
    create_deletion_coords(split, prefix + 'second_pass_del_coords.temp')
    dels = pybedtools.BedTool(prefix + 'second_pass_del_coords.temp').sort()
    intersections = dels.intersect(te, wb=True, nonamecheck=True, sorted=True)
    os.remove(prefix + 'second_pass_del_coords.temp')
    reads = []
    for r in intersections:
        if r[-3] in te_list:
            reads.append(r[3])
        else:
            pass
    return reads 
Example #24
Source File: test_annotate.py    From qapa with GNU General Public License v3.0 5 votes vote down vote up
def setUpClass(cls):
        example = "chrX	74026591	74036494	ENSMUST00000100750_Mecp2	8825	-	74035416	74036494	Mecp2	74026591,74036981,74079908,74085509	74036494,74037332,74080032,74085669"
        cls.bed = pybedtools.BedTool(example, from_string=True) 
Example #25
Source File: fasta.py    From qapa with GNU General Public License v3.0 5 votes vote down vote up
def get_sequences(bed_file, genome):
    bed = pybedtools.BedTool(bed_file)
    logger.info("Extract sequences from %s" % genome)
    return bed.sequence(genome, s=True, name=True, fullHeader=False,
                        split=True) 
Example #26
Source File: annotate.py    From qapa with GNU General Public License v3.0 5 votes vote down vote up
def preprocess_polyasite(polyasite_file, min_polyasite):
    """
    Pre-proecess polyasite file
    """
    logger.info("Preprocessing %s" % polyasite_file)
    # add an empty filter step as hack to read gzipped files
    polyasite = pybedtools.BedTool(polyasite_file)\
        .filter(lambda x: x)\
        .saveas()
    polyasite = sort_bed(polyasite)
    validate(polyasite, polyasite_file)

    pas_filter = re.compile("(DS|TE)$")
    is_v2 = polyasite.field_count() == 11 

    if not is_v2:
        logger.info("Detected PolyASite version 1")
        field_index = 3
    else:
        logger.info("Detected PolyASite version 2")
        field_index = 9
        polyasite = polyasite.each(_add_chr)\
                             .each(_move_num_exp)\
                             .saveas()

    polyasite_te = polyasite\
        .filter(lambda x: int(x[4]) >= min_polyasite)\
        .filter(lambda x: pas_filter.search(x[field_index]))\
        .cut(range(0, 6))\
        .saveas()
    validate(polyasite_te, polyasite_file)
    return polyasite_te 
Example #27
Source File: annotate.py    From qapa with GNU General Public License v3.0 5 votes vote down vote up
def preprocess_gencode_polya(gencode_polya_file):
    """
    Preprocess GENCODE polyA track
    """
    logger.info("Preprocessing %s" % gencode_polya_file)
    gencode = pybedtools.BedTool(gencode_polya_file)\
        .filter(lambda x: x.name == 'polyA_site')\
        .saveas()
    gencode = sort_bed(gencode)
    validate(gencode, gencode_polya_file)
    return gencode 
Example #28
Source File: make_annot.py    From ldsc with GNU General Public License v3.0 5 votes vote down vote up
def gene_set_to_bed(args):
    print('making gene set bed file')
    GeneSet = pd.read_csv(args.gene_set_file, header = None, names = ['GENE'])
    all_genes = pd.read_csv(args.gene_coord_file, delim_whitespace = True)
    df = pd.merge(GeneSet, all_genes, on = 'GENE', how = 'inner')
    df['START'] = np.maximum(1, df['START'] - args.windowsize)
    df['END'] = df['END'] + args.windowsize
    iter_df = [['chr'+(str(x1).lstrip('chr')), x2 - 1, x3] for (x1,x2,x3) in np.array(df[['CHR', 'START', 'END']])]
    return BedTool(iter_df).sort().merge() 
Example #29
Source File: annotate.py    From qapa with GNU General Public License v3.0 5 votes vote down vote up
def sort_bed(bedobj):
    """
    Use GNU sort
    """
    tmpbed = bedobj._tmp()
    os.system('sort {0} -k1,1 -k2,2n -k3,3n > {1}'.format(bedobj.fn, tmpbed))
    return pybedtools.BedTool(tmpbed) 
Example #30
Source File: check_splice_sites.py    From outrigger with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def read_splice_sites(bed, genome, fasta, direction='upstream'):
    """Read splice sites of an exon

    Parameters
    ----------
    bed : pybedtools.BedTool | str
        Exons whose splice sites you're interested in
    genome : str
        Location of a chromosome sizes file for the genome
    fasta : str
        Location of the genome fasta file
    direction : 'upstream' | 'downstream'
        Which direction of splice sites you want for the exon
    """
    if isinstance(bed, str):
        bed = pybedtools.BedTool(bed)

    genome = maybe_read_chromsizes(genome)

    if direction == 'upstream':
        left = NT
        right = 0
    elif direction == 'downstream':
        left = 0
        right = NT

    flanked = bed.flank(l=left, r=right, s=True, genome=genome)
    seqs = flanked.sequence(fi=fasta, s=True)

    with open(seqs.seqfn) as f:
        records = SeqIO.parse(f, 'fasta')
        records = pd.Series([str(r.seq) for r in records],
                            index=[b.name for b in bed])
    # import pdb; pdb.set_trace()
    return records