Python pybedtools.Interval() Examples
The following are 30
code examples of pybedtools.Interval().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
pybedtools
, or try the search function
.
Example #1
Source File: dataloader.py From models with MIT License | 7 votes |
def __getitem__(self, idx): if self.fasta_extractor is None: self.fasta_extractor = FastaStringExtractor(self.fasta_file, use_strand=True, force_upper=self.force_upper) if self.vcf is None: self.vcf = MultiSampleVCF(self.vcf_file) if self.vcf_extractor is None: self.vcf_extractor = VariantSeqExtractor(self.fasta_file) entry = self.bed.iloc[idx] entry_id = entry["id"] entry_chr = entry["chr"] entry_pos = entry["pos"] entry_strand = entry["strand"] ref_exons = [] var_exons = [] exon_pos_strings = [] exon_var_strings = [] for exon in entry_pos: # We get the interval interval = pybedtools.Interval(to_scalar(entry_chr), to_scalar(exon[0]), to_scalar(exon[1]), strand=to_scalar(entry_strand)) exon_pos_strings.append("%s-%s" % (str(exon[0]),str(exon[1]))) # We get the reference sequence ref_seq = self.fasta_extractor.extract(interval) # We get the variants, insert them and also save them as metadata variants = list(self.vcf.fetch_variants(interval)) if len(variants) == 0: ref_exons.append(ref_seq) var_exons.append(ref_seq) else: var_seq = self.vcf_extractor.extract(interval, variants=variants, anchor=0, fixed_len=False) var_string = ";".join([str(var) for var in variants]) ref_exons.append(ref_seq) var_exons.append(var_seq) exon_var_strings.append(var_string) # Combine if entry_strand == "-": ref_exons.reverse() var_exons.reverse() ref_seq = "".join(ref_exons) var_seq = "".join(var_exons) pos_string = ";".join(exon_pos_strings) var_string = ";".join(exon_var_strings) return { "inputs": np.array([ref_seq, var_seq]), "metadata": { "id": entry_id, "chr": entry_chr, "exon_positions": pos_string, "strand": entry_strand, "variants": var_string } }
Example #2
Source File: generate_sv_intervals.py From metasv with BSD 2-Clause "Simplified" License | 6 votes |
def merged_interval_features(feature, bam_handle,find_svtype=False): support_list = feature.name.split(",") locations = sorted(map(int, support_list[0:-1:3])) other_bp_ends = support_list[-1] if not find_svtype else ','.join(support_list[1::3]) num_unique_locations = len(set(locations)) count_str = ",".join(["%s,%s" % (i, c) for (i, c) in collections.Counter(locations).items()]) plus_support = len([i for i in support_list[2::3] if i == "+"]) minus_support = len(locations) - plus_support locations_span = max(locations) - min(locations) interval_readcount = bam_handle.count(reference=str(feature.chrom), start=feature.start, end=feature.end) info = {"SC_PLUS_SUPPORT":plus_support, "SC_MINUS_SUPPORT":minus_support, "SC_LOCATIONS_SPAN":locations_span, "SC_NUM_UNIQUE_LOCATIONS":num_unique_locations, "SC_COUNT_STR": count_str, "SC_COVERAGE":interval_readcount, "SC_OTHER_BP_ENDS": other_bp_ends, "SC_SC_BP_ENDS": "%s-%s"%(feature.start, feature.end)} name = "%s,%s,0,SC" % ( base64.b64encode(json.dumps(info)), feature.fields[6].split(',')[0]) return pybedtools.Interval(feature.chrom, feature.start, feature.end, name=name, score=feature.score, otherfields=[str(interval_readcount)]+feature.fields[6:])
Example #3
Source File: generate_final_vcf.py From metasv with BSD 2-Clause "Simplified" License | 6 votes |
def find_idp(feature, wiggle): n = len(feature.fields) / 2 if feature.chrom != feature.fields[n]: return None start_dup = feature.start end_dup = feature.end start_del = int(feature.fields[n + 1]) end_del = int(feature.fields[n + 2]) if abs(start_del - end_del) > (abs(start_dup - end_dup) - wiggle): return None dist_ends = [abs(start_del - start_dup), abs(end_del - end_dup)] if min(dist_ends) > wiggle: return None del_pos = start_del if dist_ends[0] > dist_ends[1] else end_del name = "%s,%s" % (feature.name, feature.fields[n + 3]) score = "%s,%s" % (feature.score, feature.fields[n + 4]) return pybedtools.Interval(feature.chrom, feature.start, feature.end, name=name, score=score, otherfields=["%d" % del_pos, "%d-%d" % (start_del, end_del)])
Example #4
Source File: generate_sv_intervals.py From metasv with BSD 2-Clause "Simplified" License | 6 votes |
def find_other_bp_interval(feature,pad): name_fields = feature.name.split(",") sv_type = name_fields[1] sv_methods = name_fields[3] sv_length = int(name_fields[2]) info = json.loads(base64.b64decode(name_fields[0])) other_bps=map(lambda y: map(int,y['SC_OTHER_BP_ENDS'].split('-')),info["SC_SUBINTERVAL_INFOs"]) start_interval=min(map(lambda x:x[0],other_bps)) end_interval=max(map(lambda x:x[1],other_bps)) soft_clip_location = (start_interval+end_interval)/2 sc_bp=feature.start if abs(soft_clip_location-feature.start)>abs(soft_clip_location-feature.end) else feature.end other_bp_field="%d-%d"%(max(sc_bp-pad,0),sc_bp+pad) name="%d,%d,+,%s"%(soft_clip_location,sc_bp,other_bp_field) return pybedtools.Interval(feature.chrom, start_interval, end_interval, name=name, score="1", otherfields=[sv_type])
Example #5
Source File: generate_final_vcf.py From metasv with BSD 2-Clause "Simplified" License | 6 votes |
def filter_itxs(feature): n = len(feature.fields) / 2 del_interval_idp = map(int, feature.fields[7].split("-")) del_interval_itx_1 = map(int, feature.fields[n + 7].split(",")[0].split("-")) del_interval_itx_2 = map(int, feature.fields[n + 7].split(",")[1].split("-")) if filter(lambda x: abs(x[0] - del_interval_idp[0]) + abs( x[1] - del_interval_idp[1]) == 0, [del_interval_itx_1, del_interval_itx_2]) and "LowQual" not in \ feature.fields[n + 4]: return None return pybedtools.Interval(feature.chrom, feature.start, feature.end, name=feature.name, score=feature.score, otherfields=feature.fields[6:n])
Example #6
Source File: generate_sv_intervals.py From metasv with BSD 2-Clause "Simplified" License | 6 votes |
def get_full_interval(feature, pad): name_fields = feature.name.split(",") info = json.loads(base64.b64decode(name_fields[0])) other_bp_ends=info["SC_OTHER_BP_ENDS"] start = feature.start end = feature.end sv_type = name_fields[1] if "-" in other_bp_ends: other_bp_start,other_bp_end=map(lambda x:int(x),other_bp_ends.split("-")) if other_bp_start != 0 or other_bp_end != 0: start = min((feature.start+feature.end)/2,(other_bp_start+other_bp_end)/2) end = max((feature.start+feature.end)/2,(other_bp_start+other_bp_end)/2) sv_len = 0 if sv_type == "INS" else max(end-start,0) info["SOURCES"] = "%s-%d-%s-%d-%d-SoftClip" % (feature.chrom, start, feature.chrom, end, sv_len) name = "%s,%s,%d,%s"%(base64.b64encode(json.dumps(info)),sv_type,sv_len,'SC') return pybedtools.Interval(feature.chrom, start, end, name=name, score=feature.score, otherfields=feature.fields[6:])
Example #7
Source File: sv_interval.py From metasv with BSD 2-Clause "Simplified" License | 6 votes |
def to_bed_interval(self, sample_name): if self.start <= 0: return None if self.sv_type not in ["DEL", "INS", "INV", "ITX", "CTX", "DUP"]: return None # if not self.sub_intervals and list(self.sources)[0] == "HaplotypeCaller": return None # if len(self.sources) == 1 and list(self.sources)[0] == "HaplotypeCaller": return None if self.sv_type == "INS": end = self.end + 1 elif self.sv_type in ["ITX","CTX"] : end = self.start + 1 else: end = self.end info = self.get_info() if self.sv_type in ["ITX", "CTX"]: info["POS2"] = self.end info["END"] = end info["CHR2"] = self.chrom2 if not self.is_precise: info.update({"IMPRECISE": True}) return pybedtools.Interval(self.chrom, self.start, end, name="%s,%s,%d,%s" % ( base64.b64encode(json.dumps(info)), self.sv_type, self.length, ";".join(self._get_svmethods())), score=str(len(self.sources)))
Example #8
Source File: annotate_bed_for_ml.py From metasv with BSD 2-Clause "Simplified" License | 6 votes |
def add_weighted_score(in_bed, score_bed): out_bed = in_bed.intersect(score_bed, wao=True).saveas(os.path.join(args.tmpdir, "score.bed")) bed_array = [] last_interval = pybedtools.Interval("", 0, 0) map_value = 0.0 for interval in out_bed: if interval.chrom != last_interval.chrom or interval.start != last_interval.start or interval.end != last_interval.end: if last_interval.chrom: bed_array.append(tuple(last_interval.fields[:-5]) + (str(map_value),)) map_value = 0.0 last_interval = interval if float(interval.fields[-1]) > 0: map_value += float(interval.fields[-1]) * float(interval.fields[-2]) / float(interval.length) if last_interval.chrom: bed_array.append(tuple(last_interval.fields[:-5]) + (str(map_value),)) return pybedtools.BedTool(bed_array)
Example #9
Source File: annotate_bed_for_ml_del.py From metasv with BSD 2-Clause "Simplified" License | 6 votes |
def add_weighted_score(in_bed, score_bed): out_bed = in_bed.intersect(score_bed, wao=True).saveas(os.path.join(args.tmpdir, "score.bed")) bed_array = [] last_interval = pybedtools.Interval("", 0, 0) map_value = 0.0 for interval in out_bed: if interval.chrom != last_interval.chrom or interval.start != last_interval.start or interval.end != last_interval.end: if last_interval.chrom: bed_array.append(tuple(last_interval.fields[:-5]) + (str(map_value),)) map_value = 0.0 last_interval = interval if float(interval.fields[-1]) > 0: map_value += float(interval.fields[-1]) * float(interval.fields[-2]) / float(interval.length) if last_interval.chrom: bed_array.append(tuple(last_interval.fields[:-5]) + (str(map_value),)) return pybedtools.BedTool(bed_array)
Example #10
Source File: sequence.py From kipoiseq with MIT License | 6 votes |
def __getitem__(self, idx): """Returns (pybedtools.Interval, labels) """ row = self.df.iloc[idx] # TODO: use kipoiseq.dataclasses.interval instead of pybedtools import pybedtools interval = pybedtools.create_interval_from_list( [to_scalar(x) for x in row.iloc[:self.bed_columns]]) if self.ignore_targets or self.n_tasks == 0: labels = {} else: labels = row.iloc[self.bed_columns:].values.astype( self.label_dtype) return interval, labels
Example #11
Source File: genotype_with_reference.py From pacbio_variant_caller with MIT License | 6 votes |
def soft_clips_at_breakpoint(read, region): """ Returns True if the given pysam read maps up to the edge of the given pybedtools.Interval, ``region`` and soft clips at that edge. Two cases are possible: 1. The end of the read soft clips such that the read's reference end stops at the breakpoint start and the end of the read is softclipped. 2. The beginning of the read soft clips such that the read's reference start begins at the breakpoint end and the beginning of the read is softclipped. In both cases, the read should be mapped in its best location in the reference ("perfect mapping"). """ return ( (read.reference_end == region.start and read.cigar[-1][0] == BAM_CSOFT_CLIP) or (read.reference_start == region.end + 1 and read.cigar[0][0] == BAM_CSOFT_CLIP) )
Example #12
Source File: genotype_with_reference.py From pacbio_variant_caller with MIT License | 5 votes |
def spans_region(read, region): """ Returns True if the given pysam read spans the given pybedtools.Interval, ``region``. """ return has_perfect_mapping(read) and read.reference_start <= region.start and read.reference_end >= region.end
Example #13
Source File: genotype_with_reference.py From pacbio_variant_caller with MIT License | 5 votes |
def has_gaps_in_region(read, region): """ Returns True if the given pysam read spans the given pybedtools.Interval, ``region``. """ # If the given read has gaps in its alignment to the reference inside the # given interval (more than one block inside the SV event itself), there are # gaps inside the SV. tree = intervaltree.IntervalTree() for block in read.get_blocks(): tree[block[0]:block[1]] = block return len(tree[region.start:region.end]) > 1
Example #14
Source File: generate_sv_intervals.py From metasv with BSD 2-Clause "Simplified" License | 5 votes |
def remove_ins_padding(feature, pad): name_fields = feature.name.split(",") sv_type = name_fields[1] return pybedtools.Interval(feature.chrom, feature.start+pad, max(feature.end-pad,feature.start+pad+1), name=feature.name, score=feature.score) if sv_type == "INS" else feature
Example #15
Source File: genotype_with_reference.py From pacbio_variant_caller with MIT License | 5 votes |
def pair_spans_regions(read_pair, regions): """ Returns True if the given pysam reads spans the given pybedtools.Interval list, ``regions``. Read pairs can span the regions even if either read overlaps the region breakpoints. """ return len(read_pair) == 2 and read_pair[0].reference_start < regions[0].start and read_pair[1].reference_end > regions[-1].end
Example #16
Source File: genotype_with_reference.py From pacbio_variant_caller with MIT License | 5 votes |
def get_insert_sizes_for_region(bam, region): """ Return insert sizes reads in the given pysam.AlignmentFile, ``bam``, and the given pybedtools.Interval, ``region``. """ return [read.tlen for read in bam.fetch(region.chrom, region.start, region.end) if has_perfect_mapping(read) and not read.mate_is_unmapped and read.tlen >= 0 and read.tlen <= 1000]
Example #17
Source File: generate_final_vcf.py From metasv with BSD 2-Clause "Simplified" License | 5 votes |
def find_ctx(feature, overlap_ratio=0.9): n = len(feature.fields) / 2 start_del_ins = int(feature.fields[n + 1]) end_del_ins = int(feature.fields[n + 2]) name = "%s,%s" % (feature.name, feature.fields[n + 3]) score = "%s,%s" % (feature.score, feature.fields[n + 4]) return pybedtools.Interval(feature.chrom, feature.start, feature.end, name=name, score=score, otherfields=[".", "%d-%d" % ( start_del_ins, end_del_ins)])
Example #18
Source File: utils.py From FactorNet with MIT License | 5 votes |
def get_genome_bed(): genome_sizes_info = np.loadtxt(genome_sizes_file, dtype=str) chroms = list(genome_sizes_info[:,0]) chroms_sizes = list(genome_sizes_info[:,1].astype(int)) genome_bed = [] for chrom, chrom_size in zip(chroms, chroms_sizes): genome_bed.append(Interval(chrom, 0, chrom_size)) genome_bed = BedTool(genome_bed) return chroms, chroms_sizes, genome_bed
Example #19
Source File: generate_final_vcf.py From metasv with BSD 2-Clause "Simplified" License | 5 votes |
def build_chr2_ins(feature, thr_top=0.15): sc_chr2_str = feature.fields[6] if sc_chr2_str == ".": return [] sub_str = map(lambda x: [x.split(";")[0], map(int, x.split(";")[1:])], sc_chr2_str.split(",")) chr2_dict = {} for chr2, poses in sub_str: if chr2 not in chr2_dict: chr2_dict[chr2] = [] chr2_dict[chr2].append(poses) chr2_dict = {k: [sum(map(lambda x: x[0], v)), min(map(lambda x: x[1], v)), max(map(lambda x: x[2], v))] for k, v in chr2_dict.iteritems()} sorted_chr2 = sorted(chr2_dict.items(), key=lambda x: x[1][0], reverse=True) n_reads = sum(map(lambda x: x[1][0], sorted_chr2)) top_chr2s = filter( lambda x: x[1][0] > (thr_top * n_reads) and x[0] not in ["-1", feature.chrom], sorted_chr2) if not top_chr2s: return [] ctx_intervals = [] for chr2, [cnt, start, end] in top_chr2s: ctx_intervals.append(pybedtools.Interval(chr2, start, end, name=feature.name, score=feature.score)) return ctx_intervals
Example #20
Source File: utils.py From FactorNet with MIT License | 5 votes |
def data_to_bed(data): intervals = [] for datum in data: chrom = datum[0] start = datum[1] stop = datum[2] intervals.append(Interval(chrom, start, stop)) return BedTool(intervals)
Example #21
Source File: generate_final_vcf.py From metasv with BSD 2-Clause "Simplified" License | 5 votes |
def find_itx(feature, wiggle): n = len(feature.fields) / 2 start_idp1 = feature.start end_idp1 = feature.end start_idp2 = int(feature.fields[n + 1]) end_idp2 = int(feature.fields[n + 2]) dist_ends = [abs(start_idp1 - start_idp2), abs(end_idp1 - end_idp2)] if min(dist_ends) > wiggle: return None del_pos1 = int(feature.fields[6]) del_pos2 = int(feature.fields[n + 6]) if abs(del_pos1 - del_pos2) > wiggle: return None del_interval1 = map(int, feature.fields[7].split("-")) del_interval2 = map(int, feature.fields[n + 7].split("-")) lr_1 = 1 if abs(del_pos1 - del_interval1[0]) < abs( del_pos1 - del_interval1[1]) else 0 lr_2 = 1 if abs(del_pos2 - del_interval2[0]) < abs( del_pos2 - del_interval2[1]) else 0 if lr_1 == lr_2 or lr_2 < lr_1: return None del_id_2 = feature.name.split(",")[-1] del_filter_2 = feature.score.split(",")[-1] name = "%s,%s" % (feature.name, del_id_2) score = "%s,%s" % (feature.score, del_filter_2) return pybedtools.Interval(feature.chrom, feature.start, feature.end, name=name, score=score, otherfields=["%d" % ((del_pos1 + del_pos2) / 2), "%d-%d,%d-%d" % ( del_interval1[0], del_interval1[1], del_interval2[0], del_interval2[1])])
Example #22
Source File: utils.py From FactorNet with MIT License | 5 votes |
def make_blacklist(): blacklist = BedTool(blacklist_file) blacklist = blacklist.slop(g=genome_sizes_file, b=L) # Add ends of the chromosomes to the blacklist genome_sizes_info = np.loadtxt(genome_sizes_file, dtype=str) chroms = list(genome_sizes_info[:,0]) chroms_sizes = list(genome_sizes_info[:,1].astype(int)) blacklist2 = [] for chrom, size in zip(chroms, chroms_sizes): blacklist2.append(Interval(chrom, 0, L)) blacklist2.append(Interval(chrom, size - L, size)) blacklist2 = BedTool(blacklist2) blacklist = blacklist.cat(blacklist2) return blacklist
Example #23
Source File: test_extractors.py From genomelake with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_bigwig_and_intervals(): bw_path = "tests/data/test_bigwig.bw" intervals = [Interval("chr1", 0, 10), Interval("chr2", 0, 10)] expected_chr1 = np.array([0.1] * 10, dtype=np.float32) expected_chr2 = np.array([0] + [9] * 9, dtype=np.float32) expected_data = np.stack([expected_chr1, expected_chr2]) return (bw_path, intervals, expected_data)
Example #24
Source File: test_BedDataset.py From kipoiseq with MIT License | 5 votes |
def test_bed3(tmpdir): bed_file = write_tmp('chr1\t1\t2\nchr1\t1\t3', tmpdir) bt = BedDataset(bed_file) assert bt.n_tasks == 0 assert len(bt) == 2 assert np.all(bt.df[0] == 'chr1') assert bt[0] == (Interval("chr1", 1, 2), {}) assert bt[1] == (Interval("chr1", 1, 3), {})
Example #25
Source File: test_BedDataset.py From kipoiseq with MIT License | 5 votes |
def test_bed3_labels(tmpdir): bed_file = write_tmp('chr1\t1\t2\t1\t0\nchr1\t1\t3\t0\t1', tmpdir) bt = BedDataset(bed_file) assert np.all(bt.get_targets() == np.array([[1, 0], [0, 1]])) assert len(bt) == 2 assert bt.n_tasks == 2 assert np.all(bt.df[0] == 'chr1') assert bt[0][0] == Interval("chr1", 1, 2) assert np.all(bt[0][1] == np.array([1, 0])) assert bt[1][0] == Interval("chr1", 1, 3) assert np.all(bt[1][1] == np.array([0, 1])) assert len(bt) == 2
Example #26
Source File: test_BedDataset.py From kipoiseq with MIT License | 5 votes |
def test_incl_excl_chromosomes(tmpdir): bed_file = write_tmp( 'chr1\t1\t2\t1\t0\nchr2\t1\t3\t0\t1\nchr3\t1\t3\t0\t1', tmpdir) bt = BedDataset(bed_file) assert len(bt) == 3 bt = BedDataset(bed_file, incl_chromosomes=['chr1']) assert len(bt) == 1 assert bt[0][0] == Interval("chr1", 1, 2) bt = BedDataset(bed_file, excl_chromosomes=['chr1']) assert len(bt) == 2 assert bt[0][0] == Interval("chr2", 1, 3)
Example #27
Source File: test_BedDataset.py From kipoiseq with MIT License | 5 votes |
def test_tsvreader(tsv_file, num_chr, label_dtype): ds = BedDataset(tsv_file, label_dtype=label_dtype, num_chr=num_chr) interval, labels = ds[0] assert isinstance(interval, Interval) if not num_chr: assert interval.chrom.startswith("chr") assert isinstance(labels[0], label_dtype) assert interval.start == 2 assert interval.end == 4
Example #28
Source File: dataclasses.py From kipoiseq with MIT License | 5 votes |
def from_pybedtools(cls, interval): """Create the ranges object from `pybedtools.Interval` # Arguments interval: `pybedtools.Interval` instance """ return cls(chrom=interval.chrom, start=interval.start, end=interval.stop, name=interval.name, score=interval.score, strand=interval.strand, attrs=dict(interval.attrs or dict()) )
Example #29
Source File: test_extractors.py From genomelake with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_fasta_extractor_valid_intervals(): extractor = FastaExtractor("tests/data/fasta_test.fa") intervals = [Interval("chr1", 0, 10), Interval("chr2", 0, 10)] expected_data = np.array( [ [ [1., 0., 0., 0.], [0., 1., 0., 0.], [0., 1., 0., 0.], [0., 0., 1., 0.], [0., 0., 0., 1.], [1., 0., 0., 0.], [0., 1., 0., 0.], [0., 1., 0., 0.], [0., 0., 1., 0.], [0., 0., 0., 1.], ], [ [1., 0., 0., 0.], [0., 1., 0., 0.], [0., 0., 1., 0.], [0., 0., 0., 1.], [0.25, 0.25, 0.25, 0.25], [1., 0., 0., 0.], [0., 1., 0., 0.], [0., 0., 1., 0.], [0., 0., 0., 1.], [0.25, 0.25, 0.25, 0.25], ], ], dtype=np.float32, ) data = extractor(intervals) assert (data == expected_data).all()
Example #30
Source File: test_extractors.py From genomelake with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_fasta_extractor_over_chr_end(): extractor = FastaExtractor("tests/data/fasta_test.fa") intervals = [Interval("chr1", 0, 100), Interval("chr1", 1, 101)] with pytest.raises(ValueError): data = extractor(intervals)