Python Bio.SeqIO.index() Examples
The following are 30 code examples of Bio.SeqIO.index().
You may also want to check out all available functions and classes of the Bio.SeqIO module.
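
Bio.SeqIO.index() returns a read-only, dictionary-like object that maps record IDs to SeqRecord objects and parses records lazily from disk, so even very large sequence files can be queried without loading them into memory. A minimal sketch of typical usage (the file name and record ID are placeholders):

from Bio import SeqIO

# Build a lazy, dict-like index; records are parsed from disk only on access.
record_index = SeqIO.index("example.fasta", "fasta")
print(len(record_index))            # number of records in the file
record = record_index["some_id"]    # fetch a single SeqRecord by its ID
print(record.id, len(record.seq))
record_index.close()                # release the underlying file handle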
Example #1
Source File: atlas2.py From ssbio with MIT License

def _load_sequences_to_reference_gene(self, g_id, force_rerun=False):
    """Load orthologous strain sequences to reference Protein object, save as new pickle"""
    protein_seqs_pickle_path = op.join(self.sequences_by_gene_dir, '{}_protein_withseqs.pckl'.format(g_id))

    if ssbio.utils.force_rerun(flag=force_rerun, outfile=protein_seqs_pickle_path):
        protein_pickle_path = self.gene_protein_pickles[g_id]
        protein_pickle = ssbio.io.load_pickle(protein_pickle_path)

        for strain, info in self.strain_infodict.items():
            strain_sequences = SeqIO.index(info['genome_path'], 'fasta')
            strain_gene_functional = info['functional_genes'][g_id]
            if strain_gene_functional:
                # Pull the gene ID of the strain from the orthology matrix
                strain_gene_key = self.df_orthology_matrix.at[g_id, strain]
                new_id = '{}_{}'.format(g_id, strain)
                if protein_pickle.sequences.has_id(new_id):
                    continue
                protein_pickle.load_manual_sequence(seq=strain_sequences[strain_gene_key],
                                                    ident=new_id,
                                                    set_as_representative=False)
        protein_pickle.save_pickle(outfile=protein_seqs_pickle_path)

    return g_id, protein_seqs_pickle_path
Example #2
Source File: paf.py From dgenies with GNU General Public License v3.0

def build_list_no_assoc(self, to):
    """
    Build list of queries that match with None target, or the opposite

    :param to: query or target
    :return: content of the file
    """
    index = self.idx_q if to == "query" else self.idx_t
    name, contigs_list, contigs, reversed, abs_start, c_len = Index.load(index)
    contigs_list = set(contigs_list)
    with open(self.paf, "r") as paf:
        for line in paf:
            c_name = line.strip("\n").split("\t")[0 if to == "query" else 5]
            if c_name in contigs_list:
                contigs_list.remove(c_name)
    return "\n".join(contigs_list) + "\n"
Example #3
Source File: functions.py From dgenies with GNU General Public License v3.0

def compress_and_send_mail(job_name, fasta_file, index_file, lock_file, mailer):
    """
    Compress fasta file and then send mail with its link to the client

    :param job_name: job id
    :type job_name: str
    :param fasta_file: fasta file path
    :type fasta_file: str
    :param index_file: index file path
    :type index_file: str
    :param lock_file: lock file path
    :type lock_file: str
    :param mailer: mailer object (to send mail)
    :type mailer: Mailer
    """
    Functions.compress(fasta_file)
    os.remove(lock_file)
    index, sample_name = Functions.read_index(index_file)
    Functions.send_fasta_ready(mailer, job_name, sample_name, True)
Example #4
Source File: datasets.py From tape with BSD 3-Clause "New" or "Revised" License | 6 votes |
def __getitem__(self, index: int): if not 0 <= index < self._num_examples: raise IndexError(index) # if self._in_memory and self._cache[index] is not None: record = self._cache[index] # else: # key = self._keys[index] # record = self._records[key] # if self._in_memory: # self._cache[index] = record item = {'id': record.id, 'primary': str(record.seq), 'protein_length': len(record.seq)} return item
Example #5
Source File: util.py From pomoxis with Mozilla Public License 2.0

def reverse_bed():
    """Convert bed-file coordinates to coordinates on the reverse strand."""
    parser = argparse.ArgumentParser(
        prog='reverse_bed',
        description='Convert bed-file coordinates to coordinates on the reverse strand.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('bed_in', help='Input bed file.')
    parser.add_argument('ref_fasta', help='Input reference fasta file.')
    parser.add_argument('bed_out', help='Output bed file.')
    args = parser.parse_args()

    fasta = pysam.FastaFile(args.ref_fasta)
    lengths = dict(zip(fasta.references, fasta.lengths))
    d = pd.read_csv(args.bed_in, sep='\t', names=['chrom', 'start', 'stop'])
    d['chrom_length'] = d['chrom'].map(lambda x: lengths[x])
    d['rc_stop'] = d['chrom_length'] - d['start']
    d['rc_start'] = d['chrom_length'] - d['stop']
    d['chrom_rc'] = d['chrom'] + '_rc'
    d[['chrom_rc', 'rc_start', 'rc_stop']].to_csv(args.bed_out, index=False, header=False, sep='\t')
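
Note the coordinate arithmetic above: reversing a chromosome of length L maps position p to L - p, so the original stop becomes the new start (rc_start = chrom_length - stop) and the original start becomes the new stop. Swapping the two endpoints this way preserves the BED convention that start is less than stop, and keeps the half-open interval semantics intact.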
Example #6
Source File: determine_paralogs.py From chewBBACA with GNU General Public License v3.0

def split_blast_inputs_by_core(blast_inputs, threads, blast_files_dir):
    """Greedily distribute cluster id files across threads, balancing by sequence count."""
    splitted_ids = [[] for cpu in range(threads)]
    splitted_values = [[] for cpu in range(threads)]
    cluster_sums = [0] * threads
    i = 0
    for cluster in blast_inputs:
        cluster_file = os.path.join(blast_files_dir, '{0}_ids.txt'.format(cluster))
        with open(cluster_file, 'r') as infile:
            cluster_seqs = [line.strip() for line in infile.readlines()]
        splitted_values[i].append(len(cluster_seqs))
        splitted_ids[i].append(cluster)
        cluster_sums[i] += len(cluster_seqs)
        # always place the next cluster into the currently least-loaded bucket
        i = cluster_sums.index(min(cluster_sums))

    return [s for s in splitted_ids if len(s) > 0]
Example #7
Source File: handleGB.py From PhyloSuite with GNU General Public License v3.0

def remove_by_col_IDs(self, IDs):
    # Rather time-consuming; consider rewriting
    zip_array = self.zip_an_array(self.array)
    indice = [num for num, colH in enumerate(self.header) if colH in IDs]
    for index in sorted(indice, reverse=True):
        zip_array.pop(index)
    return self.zip_an_array(zip_array)
Example #8
Source File: datasets.py From tape with BSD 3-Clause "New" or "Revised" License | 5 votes |
def __getitem__(self, index): item = self.data[index] token_ids = self.tokenizer.encode(item['primary']) input_mask = np.ones_like(token_ids) return token_ids, input_mask, item['clan'], item['family']
Example #9
Source File: datasets.py From tape with BSD 3-Clause "New" or "Revised" License | 5 votes |
def __getitem__(self, index: int): item = self.data[index] token_ids = self.tokenizer.encode(item['primary']) input_mask = np.ones_like(token_ids) return token_ids, input_mask, float(item['log_fluorescence'][0])
Example #10
Source File: datasets.py From tape with BSD 3-Clause "New" or "Revised" License | 5 votes |
def __getitem__(self, index: int): item = self.data[index] token_ids = self.tokenizer.encode(item['primary']) input_mask = np.ones_like(token_ids) return token_ids, input_mask, float(item['stability_score'][0])
Example #11
Source File: datasets.py From tape with BSD 3-Clause "New" or "Revised" License | 5 votes |
def __getitem__(self, index: int): item = self.data[index] token_ids = self.tokenizer.encode(item['primary']) input_mask = np.ones_like(token_ids) return token_ids, input_mask, item['fold_label']
Example #12
Source File: datasets.py From tape with BSD 3-Clause "New" or "Revised" License | 5 votes |
def __getitem__(self, index: int): item = self.data[index] token_ids = self.tokenizer.encode(item['primary']) input_mask = np.ones_like(token_ids) # pad with -1s because of cls/sep tokens labels = np.asarray(item['ss3'], np.int64) labels = np.pad(labels, (1, 1), 'constant', constant_values=-1) return token_ids, input_mask, labels
Example #13
Source File: datasets.py From tape with BSD 3-Clause "New" or "Revised" License | 5 votes |
def __getitem__(self, index): item = self.data[index] msa = item['msa'] dist = item['dist6d'] omega = item['omega6d'] theta = item['theta6d'] phi = item['phi6d'] if self._split == 'train': msa = self._subsample_msa(msa) elif self._split == 'valid': msa = msa[:20000] # runs out of memory if msa is way too big msa, dist, omega, theta, phi = self._slice_long_sequences( msa, dist, omega, theta, phi) mask = dist == 0 dist_bins = np.digitize(dist, self._dist_bins) omega_bins = np.digitize(omega, self._dihedral_bins) + 1 theta_bins = np.digitize(theta, self._dihedral_bins) + 1 phi_bins = np.digitize(phi, self._planar_bins) + 1 dist_bins[mask] = 0 omega_bins[mask] = 0 theta_bins[mask] = 0 phi_bins[mask] = 0 dist_bins[np.diag_indices_from(dist_bins)] = -1 # input_mask = np.ones_like(msa[0]) return msa, dist_bins, omega_bins, theta_bins, phi_bins
Example #14
Source File: sort_fasta.py From STRetch with MIT License

def main():
    # Parse command line arguments
    args = parse_args()
    infile = args.infile
    outfile = args.outfile

    with open(outfile, "w") as outfasta:
        allfasta = SeqIO.parse(infile, "fasta")
        ids = sorted([rec.id for rec in allfasta])
        record_index = SeqIO.index(infile, "fasta")
        records = (record_index[id] for id in ids)
        # write to the already-open handle instead of reopening the path
        SeqIO.write(records, outfasta, "fasta")
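
A note on the design choice here: SeqIO.parse() streams the file once just to collect the record IDs, and SeqIO.index() then provides random access by ID without holding the records themselves in memory, so the script can emit records in sorted order even for FASTA files much larger than available RAM.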
Example #15
Source File: handleGB.py From PhyloSuite with GNU General Public License v3.0

def remove_by_row_Indice(self, indice, base=None, proportion=None, processSig=None):
    sorted_indice = sorted(indice, reverse=True)
    for num, index in enumerate(sorted_indice):
        self.data.pop(index)
        if processSig:
            num += 1
            processSig.emit(base + (num / len(sorted_indice)) * proportion)
    return [self.header] + self.data
Example #16
Source File: datasets.py From tape with BSD 3-Clause "New" or "Revised" License | 5 votes |
def __getitem__(self, index: int): if not 0 <= index < len(self): raise IndexError(index) item = dict(np.load(self._file_list[index])) if not isinstance(item, dict): raise TypeError(f"Expected dataset to contain a list of dictionary " f"records, received record of type {type(item)}") if 'id' not in item: item['id'] = self._file_list[index].stem return item
Example #17
Source File: handleGB.py From PhyloSuite with GNU General Public License v3.0

def merge_gbIndex(self, genbanks):
    # Time-consuming, and can easily hang
    merged_index = OrderedDict()
    for gb in genbanks:
        gb_index = SeqIO.index(gb, "genbank")
        merged_index.update(gb_index)
    return merged_index
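
As the comment warns, calling update() with a SeqIO index forces every record to be fetched eagerly, which defeats the purpose of lazy indexing. When the goal is a single combined lookup across several GenBank files, Biopython's SeqIO.index_db() builds one SQLite-backed index over multiple files in a single call (record IDs must be unique across the files). A minimal sketch with placeholder file names:

from Bio import SeqIO

# One on-disk index covering several files; records are fetched lazily by ID.
merged = SeqIO.index_db("merged.idx", ["batch1.gb", "batch2.gb"], "genbank")
record = merged["SOME_ACCESSION"]
merged.close()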
Example #18
Source File: handleGB.py From PhyloSuite with GNU General Public License v3.0

def fetchInfo4extracter(self, list_IDs):
    '''Deprecated'''
    list_ID_path = [self.fetchRecordPath(ID) for ID in list_IDs]
    list_indice = self.merge_gbIndex(list_ID_path)
    allcontent = self.merge_file_contents(list_ID_path)
    rgx = re.compile(r"\/State=\"(\S+) (\S+)\"")
    unverifiedIDs = [i[0] for i in rgx.findall(allcontent) if i[-1] == "unverified"]
    names4extracter = [list_indice[index].annotations["organism"] + " " + list_indice[index].name
                       for index in list_indice]
    return names4extracter, allcontent, unverifiedIDs
Example #19
Source File: handleGB.py From PhyloSuite with GNU General Public License v3.0

def updateArrayByRow(self, ID, list_row, array):
    for index, row in enumerate(array):  # find the row matching this ID first
        if ID == row[0]:
            break
    array[index] = list_row
    return array
Example #20
Source File: functions.py From dgenies with GNU General Public License v3.0

def read_index(index_file):
    """
    Load index of query or target

    :param index_file: index file path
    :type index_file: str
    :return:
        * [0] index (size of each chromosome) {dict}
        * [1] sample name {str}
    :rtype: (dict, str)
    """
    index = OrderedDict()
    with open(index_file, "r") as index_f:
        # Sample name without special chars:
        sample_name = re.sub('[^A-Za-z0-9_\-.]+', '', index_f.readline().strip("\n").replace(" ", "_"))
        for line in index_f:
            if line != "":
                parts = line.strip("\n").split("\t")
                name = parts[0]
                length = int(parts[1])
                to_reverse = parts[2] == "1" if len(parts) >= 3 else False
                index[name] = {
                    "length": length,
                    "to_reverse": to_reverse
                }
    return index, sample_name
Example #21
Source File: paf.py From dgenies with GNU General Public License v3.0

def _flush_blocks(index_c, new_index_c, new_index_o, current_block):
    """
    When parsing index, build a mix of too-small sequential contigs (if their number exceeds 5),
    else just add each contig to the new index

    :param index_c: current index contigs def
    :type index_c: dict
    :param new_index_o: new index contigs order
    :type new_index_o: list
    :param new_index_c: new index contigs def
    :type new_index_c: dict
    :param current_block: contigs in the current analyzed block
    :type current_block: list
    :return: (new index contigs defs, new index contigs order)
    :rtype: (dict, list)
    """
    if len(current_block) >= 5:
        block_length = 0
        for contig in current_block:
            block_length += index_c[contig]
        b_name = "###MIX###_" + "###".join(current_block)
        new_index_c[b_name] = block_length
        new_index_o.append(b_name)
    elif len(current_block) > 0:
        for b_name in current_block:
            new_index_c[b_name] = index_c[b_name]
            new_index_o.append(b_name)
    return new_index_c, new_index_o
Example #22
Source File: paf.py From dgenies with GNU General Public License v3.0

def _update_query_index(self, contigs_reoriented):
    """
    Write new query index file (including new reoriented contigs info)

    :param contigs_reoriented: reoriented contigs list
    :type contigs_reoriented: list
    """
    with open(self.idx_q, "w") as idx:
        idx.write(self.name_q + "\n")
        for contig in self.q_order:
            idx.write("\t".join([contig,
                                 str(self.q_contigs[contig]),
                                 "1" if contig in contigs_reoriented else "0"]) + "\n")
Example #23
Source File: barcode_rarify.py From amptk with BSD 2-Clause "Simplified" License | 5 votes |
def IndexSeqs(file): global SeqIndex SeqIndex = SeqIO.index(file, 'fastq')
Example #24
Source File: dbotu.py From amplicon_sequencing_pipeline with MIT License

def _process_record(self, record_id):
    '''
    Process the next sequence: run the genetic, abundance, and distribution checks,
    either merging the sequence into an existing OTU or creating a new OTU.
    '''
    assert record_id in self.seq_table.index
    record = self.records[record_id]

    candidate = OTU(record.id, str(record.seq), self.seq_table.loc[record.id])

    if self.log is not None:
        print('seq', candidate.name, sep='\t', file=self.log)

    merged = False
    for otu in self.ga_matches(candidate):
        test_pval = candidate.distribution_pval(otu)

        if self.log is not None:
            print(candidate.name, 'distribution_check', otu.name, test_pval, sep='\t', file=self.log)

        if test_pval > self.threshold_pval:
            otu.absorb(candidate)
            self.membership[otu.name].append(candidate.name)
            merged = True
            break

    if not merged:
        # form own otu
        self.otus.append(candidate)
        self.membership[candidate.name] = [candidate.name]
Example #25
Source File: funannotate-runEVM.py From funannotate with BSD 2-Clause "Simplified" License | 5 votes |
def getBreakPoint(tupList, idx, direction='reverse', gap=2000): # takes list of tuples of coords and a starting index (idx). finds closest # break point in between tuple coordSorted solution = False while not solution: try: start, end, diff = tupList[idx] except IndexError: return False if diff >= gap: phase = int(round(diff/2)) solution = end + phase else: if direction == 'reverse': idx -= 1 else: idx += 1 return solution
Example #26
Source File: atlas.py From ssbio with MIT License

def _load_strain_sequences(self, strain_gempro):
    """Load strain sequences from the orthology matrix into the base model for comparisons,
    and into the strain-specific model itself.
    """
    if self._orthology_matrix_has_sequences:
        # Load directly from the orthology matrix if it contains sequences
        strain_sequences = self.df_orthology_matrix[strain_gempro.id].to_dict()
    else:
        # Otherwise load from the genome file if the orthology matrix contains gene IDs
        # Load the genome FASTA file
        log.debug('{}: loading strain genome CDS file'.format(strain_gempro.genome_path))
        strain_sequences = SeqIO.index(strain_gempro.genome_path, 'fasta')

    for strain_gene in strain_gempro.genes:
        if strain_gene.functional:
            if self._orthology_matrix_has_sequences:
                strain_gene_key = strain_gene.id
            else:
                # Pull the gene ID of the strain from the orthology matrix
                strain_gene_key = self.df_orthology_matrix.loc[strain_gene.id, strain_gempro.id]
                log.debug('{}: original gene ID to be pulled from strain fasta file'.format(strain_gene_key))

            # Load into the base strain for comparisons
            ref_gene = self.reference_gempro.genes.get_by_id(strain_gene.id)
            new_id = '{}_{}'.format(strain_gene.id, strain_gempro.id)
            if ref_gene.protein.sequences.has_id(new_id):
                log.debug('{}: sequence already loaded into reference model'.format(new_id))
                continue
            ref_gene.protein.load_manual_sequence(seq=strain_sequences[strain_gene_key], ident=new_id,
                                                  set_as_representative=False)
            log.debug('{}: loaded sequence into reference model'.format(new_id))

            # Load into the strain GEM-PRO
            strain_gene.protein.load_manual_sequence(seq=strain_sequences[strain_gene_key], ident=new_id,
                                                     set_as_representative=True)
            log.debug('{}: loaded sequence into strain model'.format(new_id))
Example #27
Source File: atlas.py From ssbio with MIT License

def build_strain_specific_models(self, save_models=False):
    """Using the orthologous genes matrix, create and modify the strain specific models based on
    if orthologous genes exist.

    Also store the sequences directly in the reference GEM-PRO protein sequence attribute for the strains.
    """
    if len(self.df_orthology_matrix) == 0:
        raise RuntimeError('Empty orthology matrix')

    # Create an emptied copy of the reference GEM-PRO
    for strain_gempro in tqdm(self.strains):
        log.debug('{}: building strain specific model'.format(strain_gempro.id))

        # For each genome, load the metabolic model or genes from the reference GEM-PRO
        logging.disable(logging.WARNING)
        if self._empty_reference_gempro.model:
            strain_gempro.load_cobra_model(self._empty_reference_gempro.model)
        elif self._empty_reference_gempro.genes:
            strain_gempro.genes = [x.id for x in self._empty_reference_gempro.genes]
        logging.disable(logging.NOTSET)

        # Get a list of genes which do not have orthology in the strain
        not_in_strain = self.df_orthology_matrix[pd.isnull(self.df_orthology_matrix[strain_gempro.id])][strain_gempro.id].index.tolist()

        # Mark genes non-functional
        self._pare_down_model(strain_gempro=strain_gempro, genes_to_remove=not_in_strain)

        # Load sequences into the base and strain models
        self._load_strain_sequences(strain_gempro=strain_gempro)

        if save_models:
            cobra.io.save_json_model(model=strain_gempro.model,
                                     filename=op.join(self.model_dir, '{}.json'.format(strain_gempro.id)))
            strain_gempro.save_pickle(op.join(self.model_dir, '{}_gp.pckl'.format(strain_gempro.id)))

    log.info('Created {} new strain-specific models and loaded in sequences'.format(len(self.strains)))
Example #28
Source File: atlas2.py From ssbio with MIT License

def _write_strain_functional_genes(self, strain_id, ref_functional_genes, orth_matrix, force_rerun=False):
    """Create strain functional genes json file"""
    func_genes_path = op.join(self.model_dir, '{}_funcgenes.json'.format(strain_id))

    if ssbio.utils.force_rerun(flag=force_rerun, outfile=func_genes_path):
        gene_to_func = {k: True for k in ref_functional_genes}
        # Get a list of genes which do not have orthology in the strain
        genes_to_remove = orth_matrix[pd.isnull(orth_matrix[strain_id])][strain_id].index.tolist()

        # Mark genes non-functional
        genes_to_remove = list(set(genes_to_remove).intersection(set(ref_functional_genes)))
        if len(genes_to_remove) > 0:
            for g in genes_to_remove:
                gene_to_func[g] = False

        with open(func_genes_path, 'w') as f:
            json.dump(gene_to_func, f)
    else:
        with open(func_genes_path, 'r') as f:
            gene_to_func = json.load(f)

    return strain_id, gene_to_func
Example #29
Source File: atlas2.py From ssbio with MIT License

def _build_strain_specific_model(self, strain_id, ref_functional_genes, orth_matrix, force_rerun=False):
    """Create strain GEMPRO, set functional genes"""
    gp_noseqs_path = op.join(self.model_dir, '{}_gp.pckl'.format(strain_id))

    if ssbio.utils.force_rerun(flag=force_rerun, outfile=gp_noseqs_path):
        logging.disable(logging.WARNING)

        strain_gp = GEMPRO(gem_name=strain_id)

        # if self.reference_gempro.model:
        #     strain_gp.load_cobra_model(deepcopy(self.reference_gempro.model))
        #     # Reset the GenePro attributes
        #     for x in strain_gp.genes:
        #         x.reset_protein()
        # else:
        # Otherwise, just copy the list of genes over and rename the IDs
        strain_genes = [x for x in ref_functional_genes]
        strain_gp.add_gene_ids(strain_genes)

        logging.disable(logging.NOTSET)

        # Get a list of genes which do not have orthology in the strain
        genes_to_remove = orth_matrix[pd.isnull(orth_matrix[strain_id])][strain_id].index.tolist()

        # Mark genes non-functional
        strain_genes = [x.id for x in strain_gp.genes]
        genes_to_remove = list(set(genes_to_remove).intersection(set(strain_genes)))

        if len(genes_to_remove) > 0:
            # If a COBRApy model exists, utilize the delete_model_genes method
            # if strain_gp.model:
            #     cobra.manipulation.delete_model_genes(strain_gp.model, genes_to_remove)
            # # Otherwise, just mark the genes as non-functional
            # else:
            for g in genes_to_remove:
                strain_gp.genes.get_by_id(g).functional = False

        strain_gp.save_pickle(outfile=gp_noseqs_path)

    return strain_id, gp_noseqs_path
Example #30
Source File: dbotu.py From amplicon_sequencing_pipeline with MIT License

def __init__(self, seq_table, records, max_dist, min_fold, threshold_pval, log=None):
    '''
    seq_table: pandas.DataFrame
        Samples on the columns; sequences on the rows
    records: index of Bio.Seq
        Indexed, unaligned input sequences. This could come from BioPython's
        SeqIO.to_dict or SeqIO.index.
    max_dist: float
        genetic distance cutoff above which a sequence will not be merged into an OTU
    min_fold: float
        Multiply the sequence's abundance by this fold to get the minimum abundance
        of an OTU for merging
    threshold_pval: float
        P-value below which a sequence will not be merged into an OTU
    log: filehandle
        Log file reporting the abundance, genetic, and distribution checks.
    '''
    self.seq_table = seq_table
    self.records = records
    self.max_dist = max_dist
    self.min_fold = min_fold
    self.threshold_pval = threshold_pval
    self.log = log

    # get a list of the names of the sequences in order of their (decreasing) abundance
    self.seq_abunds = self.seq_table.sum(axis=1).sort_values(ascending=False)

    # check that all sequence IDs in the table are in the fasta
    missing_ids = [seq_id for seq_id in self.seq_abunds.index if seq_id not in self.records]
    if len(missing_ids) > 0:
        raise RuntimeError("{} sequence IDs found in the sequence table but not in the fasta: {}".format(len(missing_ids), missing_ids))

    # initialize OTU information
    self.membership = {}
    self.otus = []
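
For orientation, a minimal sketch of how this constructor might be wired up. DBCaller is a stand-in for whichever class in dbotu.py defines this __init__, and the file names and threshold values are illustrative only:

import pandas as pd
from Bio import SeqIO

# Hypothetical inputs: a sequences-by-samples count table and an indexed FASTA.
seq_table = pd.read_csv("seq_table.tsv", sep="\t", index_col=0)
records = SeqIO.index("seqs.fasta", "fasta")  # dict-like, keyed by record ID

# DBCaller is an assumed class name; the thresholds shown are illustrative.
caller = DBCaller(seq_table, records, max_dist=0.1, min_fold=10.0, threshold_pval=0.0005)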