Python pybedtools.BedTool() Examples
The following are 30
code examples of pybedtools.BedTool().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
pybedtools
, or try the search function
.
Example #1
Source File: annotate_bed_for_ml.py From metasv with BSD 2-Clause "Simplified" License | 6 votes |
def add_weighted_score(in_bed, score_bed): out_bed = in_bed.intersect(score_bed, wao=True).saveas(os.path.join(args.tmpdir, "score.bed")) bed_array = [] last_interval = pybedtools.Interval("", 0, 0) map_value = 0.0 for interval in out_bed: if interval.chrom != last_interval.chrom or interval.start != last_interval.start or interval.end != last_interval.end: if last_interval.chrom: bed_array.append(tuple(last_interval.fields[:-5]) + (str(map_value),)) map_value = 0.0 last_interval = interval if float(interval.fields[-1]) > 0: map_value += float(interval.fields[-1]) * float(interval.fields[-2]) / float(interval.length) if last_interval.chrom: bed_array.append(tuple(last_interval.fields[:-5]) + (str(map_value),)) return pybedtools.BedTool(bed_array)
Example #2
Source File: dataloader.py From models with MIT License | 6 votes |
def __init__(self, intervals_file, fasta_file, dnase_file, use_linecache=True): # intervals if use_linecache: linecache.clearcache() BT = BedToolLinecache else: BT = BedTool self.bt = BT(intervals_file) # Fasta self.fasta_file = fasta_file self.fasta_extractor = None # initialize later # DNase self.dnase_file = dnase_file self.dnase_extractor = None
Example #3
Source File: dataloader.py From models with MIT License | 6 votes |
def __init__(self, intervals_file, fasta_file, dnase_file, use_linecache=True): # intervals if use_linecache: linecache.clearcache() BT = BedToolLinecache else: BT = BedTool self.bt = BT(intervals_file) # Fasta self.fasta_file = fasta_file self.fasta_extractor = None # initialize later # DNase self.dnase_file = dnase_file self.dnase_extractor = None
Example #4
Source File: find_sites.py From qapa with GNU General Public License v3.0 | 6 votes |
def group_reads(bedfile): reads = pybedtools.BedTool(bedfile) forward = reads.filter(lambda x: x.strand == "+").saveas() if len(forward) > 0: forward = sort_by_strand(forward, strand="+")\ .groupby(g=[1,3,6], c=[2,4], o=['min','count'], full=True)\ .cut([0,6,2,3,7,5])\ .saveas() reverse = reads.filter(lambda x: x.strand == "-").saveas() if len(reverse) > 0: reverse = sort_by_strand(reverse, strand="-")\ .groupby(g=[1,2,6], c=[3,4], o=['max','count'], full=True)\ .cut([0,1,6,3,7,5])\ .saveas() grouped_reads = forward.cat(reverse, postmerge=False) grouped_reads = sort_bed(grouped_reads).saveas() return grouped_reads
Example #5
Source File: utilities.py From SnapTools with Apache License 2.0 | 6 votes |
def is_bed(fname): """Check if a given file is a bed file Args: fname: a bed file name Returns: True if file is a bed file, otherwise False """ try: # open fname using pysam fileType = pybedtools.BedTool(fname).file_type; if fileType == "bed": return True except IndexError as e: # handle the errors return False return False
Example #6
Source File: utilities.py From SnapTools with Apache License 2.0 | 6 votes |
def is_gff(fname): """Check if a given file is a gff file Args: fname: a gff file name Returns: True if file is a gff or gtf file, otherwise False """ try: # open fname using pysam fileType = pybedtools.BedTool(fname).file_type; if fileType == "gff": return True except IndexError as e: # handle the errors return False return False
Example #7
Source File: utilities.py From SnapTools with Apache License 2.0 | 6 votes |
def is_bam(fname): """Check if a given file is a bam file Args: fname: file name Returns: True if file is a bam file, otherwise False """ try: fileType = pybedtools.BedTool(fname).file_type; if fileType == "bam": return True except ValueError as e: # handle the errors return False return True
Example #8
Source File: dataloader.py From models with MIT License | 6 votes |
def __init__(self, intervals_file, fasta_file, dnase_file, use_linecache=True): # intervals if use_linecache: linecache.clearcache() BT = BedToolLinecache else: BT = BedTool self.bt = BT(intervals_file) # Fasta self.fasta_file = fasta_file self.fasta_extractor = None # initialize later # DNase self.dnase_file = dnase_file self.dnase_extractor = None
Example #9
Source File: utilities.py From SnapTools with Apache License 2.0 | 6 votes |
def is_gff(fname): """Check if a given file is a gff file Args: fname: a gff file name Returns: True if file is a gff or gtf file, otherwise False """ try: # open fname using pysam fileType = pybedtools.BedTool(fname).file_type; if fileType == "gff": return True except IndexError as e: # handle the errors return False return False
Example #10
Source File: dataloader.py From models with MIT License | 6 votes |
def __init__(self, intervals_file, fasta_file, dnase_file, use_linecache=True): # intervals if use_linecache: linecache.clearcache() BT = BedToolLinecache else: BT = BedTool self.bt = BT(intervals_file) # Fasta self.fasta_file = fasta_file self.fasta_extractor = None # initialize later # DNase self.dnase_file = dnase_file self.dnase_extractor = None
Example #11
Source File: utilities.py From SnapTools with Apache License 2.0 | 6 votes |
def is_bam(fname): """Check if a given file is a bam file Args: fname: file name Returns: True if file is a bam file, otherwise False """ try: fileType = pybedtools.BedTool(fname).file_type; if fileType == "bam": return True except ValueError as e: # handle the errors return False return True
Example #12
Source File: dataloader.py From models with MIT License | 6 votes |
def __init__(self, intervals_file, fasta_file, dnase_file, use_linecache=True): # intervals if use_linecache: linecache.clearcache() BT = BedToolLinecache else: BT = BedTool self.bt = BT(intervals_file) # Fasta self.fasta_file = fasta_file self.fasta_extractor = None # initialize later # DNase self.dnase_file = dnase_file self.dnase_extractor = None
Example #13
Source File: genotyping.py From grocsvs with MIT License | 6 votes |
def ensure_file_is_bed(path): if path.endswith(".bedpe"): table = pandas.read_table(path) columns = ["chromx", "startx", "endx", "chromy", "starty", "endy", "name"] i = 0 while len(columns) < len(table.columns): columns.append("extra_{}".format(i)) table.columns = columns bedx = pandas.DataFrame() bedx["chrom"] = table["chromx"] bedx["start"] = table["startx"] bedx["end"] = table["endx"] bedx["name"] = table["name"] bedy = pandas.DataFrame() bedy["chrom"] = table["chromy"] bedy["start"] = table["starty"] bedy["end"] = table["endy"] bedy["name"] = table["name"] bed = pandas.concat([bedx, bedy], ignore_index=True) return pybedtools.BedTool.from_dataframe(bed).sort() return pybedtools.BedTool(path).sort()
Example #14
Source File: make_annot.py From ldsc with GNU General Public License v3.0 | 6 votes |
def make_annot_files(args, bed_for_annot): print('making annot file') df_bim = pd.read_csv(args.bimfile, delim_whitespace=True, usecols = [0,1,2,3], names = ['CHR','SNP','CM','BP']) iter_bim = [['chr'+str(x1), x2 - 1, x2] for (x1, x2) in np.array(df_bim[['CHR', 'BP']])] bimbed = BedTool(iter_bim) annotbed = bimbed.intersect(bed_for_annot) bp = [x.start + 1 for x in annotbed] df_int = pd.DataFrame({'BP': bp, 'ANNOT':1}) df_annot = pd.merge(df_bim, df_int, how='left', on='BP') df_annot.fillna(0, inplace=True) df_annot = df_annot[['ANNOT']].astype(int) if args.annot_file.endswith('.gz'): with gzip.open(args.annot_file, 'wb') as f: df_annot.to_csv(f, sep = "\t", index = False) else: df_annot.to_csv(args.annot_file, sep="\t", index=False)
Example #15
Source File: TranscriptClean.py From TranscriptClean with MIT License | 6 votes |
def find_closest_bound(sj_bound, ref_bounds): """ Given one side of a splice junction, find the closest reference """ # Create a Bedtool object for the bound bed_pos = pybedtools.BedTool(sj_bound.getBED(), from_string=True) # Run Bedtools Closest operation closest = bed_pos.closest(ref_bounds, s=True, D="ref", t="first", nonamecheck = True)[0] # Create an object to represent the closest match # Coordinates are 0-based since they are coming from BED obj_closest = dstruct.Struct() obj_closest.chrom = closest[6] obj_closest.start = int(closest[7]) obj_closest.end = int(closest[8]) obj_closest.dist = int(closest[-1]) return obj_closest
Example #16
Source File: annotate_bed_for_ml_del.py From metasv with BSD 2-Clause "Simplified" License | 6 votes |
def add_weighted_score(in_bed, score_bed): out_bed = in_bed.intersect(score_bed, wao=True).saveas(os.path.join(args.tmpdir, "score.bed")) bed_array = [] last_interval = pybedtools.Interval("", 0, 0) map_value = 0.0 for interval in out_bed: if interval.chrom != last_interval.chrom or interval.start != last_interval.start or interval.end != last_interval.end: if last_interval.chrom: bed_array.append(tuple(last_interval.fields[:-5]) + (str(map_value),)) map_value = 0.0 last_interval = interval if float(interval.fields[-1]) > 0: map_value += float(interval.fields[-1]) * float(interval.fields[-2]) / float(interval.length) if last_interval.chrom: bed_array.append(tuple(last_interval.fields[:-5]) + (str(map_value),)) return pybedtools.BedTool(bed_array)
Example #17
Source File: generate_sv_intervals.py From metasv with BSD 2-Clause "Simplified" License | 6 votes |
def merge_for_each_sv(bedtool, c, o, svs_to_softclip=SVS_SOFTCLIP_SUPPORTED, overlap_ratio=OVERLAP_RATIO, d=0, reciprocal_for_2bp=True, sv_type_field=[3, 1], inter_tools=False): merged_bedtool = pybedtools.BedTool([]) for svtype in svs_to_softclip: sv_bedtool = bedtool.filter(lambda x: svtype in x.fields[sv_type_field[0]].split(',')[sv_type_field[1]]).sort() if sv_bedtool.count() == 0: continue if svtype == "INS" or not reciprocal_for_2bp: sv_bedtool = sv_bedtool.merge(c=c, o=o, d=d) else: sv_bedtool = merge_intervals_bed(sv_bedtool, overlap_ratio=overlap_ratio, c=c, o=o) if len(merged_bedtool) > 0: merged_bedtool = sv_bedtool.cat(merged_bedtool, postmerge=False) else: merged_bedtool = sv_bedtool return merged_bedtool.sort()
Example #18
Source File: utilities.py From SnapTools with Apache License 2.0 | 5 votes |
def isBedQuerynameSorted(fname): """Check if a given bed file is sorted based on the queryname (the 4th column). Args: ----- fname: a bed file name Returns: ----- True if fname is a bed file sorted by read name, otherwise False """ if not is_bed(fname): print(("Error @is_bed_queryname_sorted: " + fname + " is not a bed file!")); return False; # read the first 1 million reads and see if the read name is sorted fin = pybedtools.BedTool(fname); flag = True; num_instance = 0 for item in fin: cur_barcode = str(item).split()[3]; if num_instance == 0: prev_barcode = cur_barcode; num_instance += 1; if num_instance >= 1000000: break; if cur_barcode < prev_barcode: flag = False; break; del fin return flag
Example #19
Source File: TranscriptClean.py From TranscriptClean with MIT License | 5 votes |
def find_closest_ref_junction(junction, ref_donors, ref_acceptors): """ Given a splice junction object and a splice reference, locate the closest junction in the provided splice reference BedTool object""" # Get the splice donor and acceptor of the junction donor = junction.get_splice_donor() acceptor = junction.get_splice_acceptor() # Find the closest match for each closest_ref_donor = find_closest_bound(donor, ref_donors) closest_ref_acceptor = find_closest_bound(acceptor, ref_acceptors) return closest_ref_donor, closest_ref_acceptor
Example #20
Source File: SNPtools.py From slamdunk with GNU Affero General Public License v3.0 | 5 votes |
def read(self): if (self._vcfFile != None): if(os.path.exists(self._vcfFile)): vcfReader = BedTool(self._vcfFile) if(vcfReader.file_type != "vcf"): print("Wrong file type. Empty or not a vcf file.") for snp in vcfReader: self._addSNP(snp) else: print("Warning: SNP file " + self._vcfFile + " not found.")
Example #21
Source File: getALTsForTargetsSeqsForMutSig.py From CovGen with MIT License | 5 votes |
def mergeLoci(bedf): import pybedtools logger.info("Merging Loci in BedFile ...") b = pybedtools.BedTool(bedf) b_merged = b.sort().merge() newBedFilename = bedf+".merged.vcf" b_merged.moveto(newBedFilename) logger.info("Merging Loci in BedFile ... DONE") return newBedFilename
Example #22
Source File: test_annotate.py From qapa with GNU General Public License v3.0 | 5 votes |
def test_gene_at_beginning_of_chr_2(self): example = "chr17_KI270861v1_alt 24 5793 ENST00000634102_SLC43A2 5631 - 5631 5793 SLC43A2 24,6624,8277,13231,15940,20829,21296,21593,23214,43222,45006,46589,57742,58821 5793,6748,8351,13364,16079,20976,21499,21727,23307,43299,45062,46797,57948,58864" bed = pybedtools.BedTool(example, from_string=True) l = 24 feature = bed[0] target = anno.extend_feature(feature, length=l) self.assertEqual(target.start, 0) self.assertEqual(target.end, 5793) target = anno.restore_feature(target, length=l) self.assertEqual(target.start, 24)
Example #23
Source File: tepid.py From TEPID with GNU General Public License v3.0 | 5 votes |
def check_te_overlaps_dels(te, bamfile, te_list, prefix): pybedtools.BedTool(bamfile).bam_to_bed().saveas(prefix + 'split.temp') convert_split_pairbed(prefix + 'split.temp', prefix + 'split_bedpe.temp') split = pybedtools.BedTool(prefix + 'split_bedpe.temp').each(append_origin, word='split').saveas() create_deletion_coords(split, prefix + 'second_pass_del_coords.temp') dels = pybedtools.BedTool(prefix + 'second_pass_del_coords.temp').sort() intersections = dels.intersect(te, wb=True, nonamecheck=True, sorted=True) os.remove(prefix + 'second_pass_del_coords.temp') reads = [] for r in intersections: if r[-3] in te_list: reads.append(r[3]) else: pass return reads
Example #24
Source File: test_annotate.py From qapa with GNU General Public License v3.0 | 5 votes |
def setUpClass(cls): example = "chrX 74026591 74036494 ENSMUST00000100750_Mecp2 8825 - 74035416 74036494 Mecp2 74026591,74036981,74079908,74085509 74036494,74037332,74080032,74085669" cls.bed = pybedtools.BedTool(example, from_string=True)
Example #25
Source File: fasta.py From qapa with GNU General Public License v3.0 | 5 votes |
def get_sequences(bed_file, genome): bed = pybedtools.BedTool(bed_file) logger.info("Extract sequences from %s" % genome) return bed.sequence(genome, s=True, name=True, fullHeader=False, split=True)
Example #26
Source File: annotate.py From qapa with GNU General Public License v3.0 | 5 votes |
def preprocess_polyasite(polyasite_file, min_polyasite): """ Pre-proecess polyasite file """ logger.info("Preprocessing %s" % polyasite_file) # add an empty filter step as hack to read gzipped files polyasite = pybedtools.BedTool(polyasite_file)\ .filter(lambda x: x)\ .saveas() polyasite = sort_bed(polyasite) validate(polyasite, polyasite_file) pas_filter = re.compile("(DS|TE)$") is_v2 = polyasite.field_count() == 11 if not is_v2: logger.info("Detected PolyASite version 1") field_index = 3 else: logger.info("Detected PolyASite version 2") field_index = 9 polyasite = polyasite.each(_add_chr)\ .each(_move_num_exp)\ .saveas() polyasite_te = polyasite\ .filter(lambda x: int(x[4]) >= min_polyasite)\ .filter(lambda x: pas_filter.search(x[field_index]))\ .cut(range(0, 6))\ .saveas() validate(polyasite_te, polyasite_file) return polyasite_te
Example #27
Source File: annotate.py From qapa with GNU General Public License v3.0 | 5 votes |
def preprocess_gencode_polya(gencode_polya_file): """ Preprocess GENCODE polyA track """ logger.info("Preprocessing %s" % gencode_polya_file) gencode = pybedtools.BedTool(gencode_polya_file)\ .filter(lambda x: x.name == 'polyA_site')\ .saveas() gencode = sort_bed(gencode) validate(gencode, gencode_polya_file) return gencode
Example #28
Source File: make_annot.py From ldsc with GNU General Public License v3.0 | 5 votes |
def gene_set_to_bed(args): print('making gene set bed file') GeneSet = pd.read_csv(args.gene_set_file, header = None, names = ['GENE']) all_genes = pd.read_csv(args.gene_coord_file, delim_whitespace = True) df = pd.merge(GeneSet, all_genes, on = 'GENE', how = 'inner') df['START'] = np.maximum(1, df['START'] - args.windowsize) df['END'] = df['END'] + args.windowsize iter_df = [['chr'+(str(x1).lstrip('chr')), x2 - 1, x3] for (x1,x2,x3) in np.array(df[['CHR', 'START', 'END']])] return BedTool(iter_df).sort().merge()
Example #29
Source File: annotate.py From qapa with GNU General Public License v3.0 | 5 votes |
def sort_bed(bedobj): """ Use GNU sort """ tmpbed = bedobj._tmp() os.system('sort {0} -k1,1 -k2,2n -k3,3n > {1}'.format(bedobj.fn, tmpbed)) return pybedtools.BedTool(tmpbed)
Example #30
Source File: check_splice_sites.py From outrigger with BSD 3-Clause "New" or "Revised" License | 5 votes |
def read_splice_sites(bed, genome, fasta, direction='upstream'): """Read splice sites of an exon Parameters ---------- bed : pybedtools.BedTool | str Exons whose splice sites you're interested in genome : str Location of a chromosome sizes file for the genome fasta : str Location of the genome fasta file direction : 'upstream' | 'downstream' Which direction of splice sites you want for the exon """ if isinstance(bed, str): bed = pybedtools.BedTool(bed) genome = maybe_read_chromsizes(genome) if direction == 'upstream': left = NT right = 0 elif direction == 'downstream': left = 0 right = NT flanked = bed.flank(l=left, r=right, s=True, genome=genome) seqs = flanked.sequence(fi=fasta, s=True) with open(seqs.seqfn) as f: records = SeqIO.parse(f, 'fasta') records = pd.Series([str(r.seq) for r in records], index=[b.name for b in bed]) # import pdb; pdb.set_trace() return records