Python Bio.SeqIO.index() Examples

The following are 30 code examples of Bio.SeqIO.index(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module Bio.SeqIO, or try the search function.
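For orientation, here is a minimal sketch of typical Bio.SeqIO.index() usage (the file name and record key below are hypothetical):

from Bio import SeqIO

# Build a lazy, dict-like index of a FASTA file: record offsets are kept
# in memory, but each record is only parsed from disk when accessed
# (unlike SeqIO.to_dict(), which loads every record up front).
record_index = SeqIO.index("example.fasta", "fasta")  # hypothetical file
record = record_index["seq_id_1"]                     # hypothetical record ID
print(record.id, len(record.seq))
record_index.close()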
Example #1
Source File: atlas2.py    From ssbio with MIT License
def _load_sequences_to_reference_gene(self, g_id, force_rerun=False):
        """Load orthologous strain sequences to reference Protein object, save as new pickle"""
        protein_seqs_pickle_path = op.join(self.sequences_by_gene_dir, '{}_protein_withseqs.pckl'.format(g_id))

        if ssbio.utils.force_rerun(flag=force_rerun, outfile=protein_seqs_pickle_path):
            protein_pickle_path = self.gene_protein_pickles[g_id]
            protein_pickle = ssbio.io.load_pickle(protein_pickle_path)

            for strain, info in self.strain_infodict.items():
                strain_sequences = SeqIO.index(info['genome_path'], 'fasta')
                strain_gene_functional = info['functional_genes'][g_id]
                if strain_gene_functional:
                    # Pull the gene ID of the strain from the orthology matrix
                    strain_gene_key = self.df_orthology_matrix.at[g_id, strain]
                    new_id = '{}_{}'.format(g_id, strain)
                    if protein_pickle.sequences.has_id(new_id):
                        continue
                    protein_pickle.load_manual_sequence(seq=strain_sequences[strain_gene_key],
                                                        ident=new_id,
                                                        set_as_representative=False)
            protein_pickle.save_pickle(outfile=protein_seqs_pickle_path)

        return g_id, protein_seqs_pickle_path 
Example #2
Source File: paf.py    From dgenies with GNU General Public License v3.0
def build_list_no_assoc(self, to):
        """
        Build the list of queries that match no target (or, conversely, targets that match no query)

        :param to: query or target
        :return: content of the file
        """
        index = self.idx_q if to == "query" else self.idx_t
        name, contigs_list, contigs, reversed, abs_start, c_len = Index.load(index)
        contigs_list = set(contigs_list)
        with open(self.paf, "r") as paf:
            for line in paf:
                c_name = line.strip("\n").split("\t")[0 if to == "query" else 5]
                if c_name in contigs_list:
                    contigs_list.remove(c_name)
        return "\n".join(contigs_list) + "\n" 
Example #3
Source File: functions.py    From dgenies with GNU General Public License v3.0
def compress_and_send_mail(job_name, fasta_file, index_file, lock_file, mailer):
        """
        Compress the fasta file, then send an email with its download link to the client

        :param job_name: job id
        :type job_name: str
        :param fasta_file: fasta file path
        :type fasta_file: str
        :param index_file: index file path
        :type index_file: str
        :param lock_file: lock file path
        :type lock_file: str
        :param mailer: mailer object (to send mail)
        :type mailer: Mailer
        """
        Functions.compress(fasta_file)
        os.remove(lock_file)
        index, sample_name = Functions.read_index(index_file)
        Functions.send_fasta_ready(mailer, job_name, sample_name, True) 
Example #4
Source File: datasets.py    From tape with BSD 3-Clause "New" or "Revised" License
def __getitem__(self, index: int):
        if not 0 <= index < self._num_examples:
            raise IndexError(index)

        # if self._in_memory and self._cache[index] is not None:
        record = self._cache[index]
        # else:
            # key = self._keys[index]
            # record = self._records[key]
            # if self._in_memory:
                # self._cache[index] = record

        item = {'id': record.id,
                'primary': str(record.seq),
                'protein_length': len(record.seq)}
        return item 
Example #5
Source File: util.py    From pomoxis with Mozilla Public License 2.0
def reverse_bed():
    """Convert bed-file coordinates to coordinates on the reverse strand."""
    parser = argparse.ArgumentParser(
        prog='reverse_bed',
        description='Convert bed-file coordinates to coordinates on the reverse strand.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('bed_in', help='Input bed file.')
    parser.add_argument('ref_fasta', help='Input reference fasta file.')
    parser.add_argument('bed_out', help='Output bed file.')
    args = parser.parse_args()

    fasta = pysam.FastaFile(args.ref_fasta)
    lengths = dict(zip(fasta.references, fasta.lengths))
    d = pd.read_csv(args.bed_in, sep='\t', names=['chrom', 'start', 'stop'])

    d['chrom_length'] = d['chrom'].map(lambda x: lengths[x])
    d['rc_stop'] = d['chrom_length'] - d['start']
    d['rc_start'] = d['chrom_length'] - d['stop']
    d['chrom_rc'] = d['chrom'] + '_rc'
    d[['chrom_rc', 'rc_start', 'rc_stop']].to_csv(args.bed_out, index=False, header=False, sep='\t') 
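A note on the arithmetic above: BED intervals are 0-based and half-open, so an interval [start, stop) on a chromosome of length L maps to [L - stop, L - start) on the reverse strand. For example, [2, 5) on a 10 bp chromosome becomes [5, 8).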
Example #6
Source File: determine_paralogs.py    From chewBBACA with GNU General Public License v3.0
def split_blast_inputs_by_core(blast_inputs, threads, blast_files_dir):
    """
    """

    splitted_ids = [[] for cpu in range(threads)]
    splitted_values = [[] for cpu in range(threads)]
    cluster_sums = [0] * threads
    i = 0
    for cluster in blast_inputs:
        cluster_file = os.path.join(blast_files_dir,
                                    '{0}_ids.txt'.format(cluster))
        with open(cluster_file, 'r') as infile:
            cluster_seqs = [line.strip() for line in infile.readlines()]
        splitted_values[i].append(len(cluster_seqs))
        splitted_ids[i].append(cluster)
        cluster_sums[i] += len(cluster_seqs)
        i = cluster_sums.index(min(cluster_sums))

    return [s for s in splitted_ids if len(s) > 0] 
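The line i = cluster_sums.index(min(cluster_sums)) implements a simple greedy balancing scheme: after each assignment, the next cluster goes to whichever thread bucket currently holds the fewest sequences, keeping per-thread BLAST workloads roughly even.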
Example #7
Source File: handleGB.py    From PhyloSuite with GNU General Public License v3.0
def remove_by_col_IDs(self, IDs):
        ## Time-consuming; consider rewriting
        zip_array = self.zip_an_array(self.array)
        indice = [num for num, colH in enumerate(self.header) if colH in IDs]
        for index in sorted(indice, reverse=True):
            zip_array.pop(index)
        return self.zip_an_array(zip_array) 
Example #8
Source File: datasets.py    From tape with BSD 3-Clause "New" or "Revised" License
def __getitem__(self, index):
        item = self.data[index]
        token_ids = self.tokenizer.encode(item['primary'])
        input_mask = np.ones_like(token_ids)

        return token_ids, input_mask, item['clan'], item['family'] 
Example #9
Source File: datasets.py    From tape with BSD 3-Clause "New" or "Revised" License
def __getitem__(self, index: int):
        item = self.data[index]
        token_ids = self.tokenizer.encode(item['primary'])
        input_mask = np.ones_like(token_ids)
        return token_ids, input_mask, float(item['log_fluorescence'][0]) 
Example #10
Source File: datasets.py    From tape with BSD 3-Clause "New" or "Revised" License
def __getitem__(self, index: int):
        item = self.data[index]
        token_ids = self.tokenizer.encode(item['primary'])
        input_mask = np.ones_like(token_ids)
        return token_ids, input_mask, float(item['stability_score'][0]) 
Example #11
Source File: datasets.py    From tape with BSD 3-Clause "New" or "Revised" License
def __getitem__(self, index: int):
        item = self.data[index]
        token_ids = self.tokenizer.encode(item['primary'])
        input_mask = np.ones_like(token_ids)
        return token_ids, input_mask, item['fold_label'] 
Example #12
Source File: datasets.py    From tape with BSD 3-Clause "New" or "Revised" License
def __getitem__(self, index: int):
        item = self.data[index]
        token_ids = self.tokenizer.encode(item['primary'])
        input_mask = np.ones_like(token_ids)

        # pad with -1s because of cls/sep tokens
        labels = np.asarray(item['ss3'], np.int64)
        labels = np.pad(labels, (1, 1), 'constant', constant_values=-1)

        return token_ids, input_mask, labels 
Example #13
Source File: datasets.py    From tape with BSD 3-Clause "New" or "Revised" License
def __getitem__(self, index):
        item = self.data[index]

        msa = item['msa']
        dist = item['dist6d']
        omega = item['omega6d']
        theta = item['theta6d']
        phi = item['phi6d']

        if self._split == 'train':
            msa = self._subsample_msa(msa)
        elif self._split == 'valid':
            msa = msa[:20000]  # runs out of memory if msa is way too big
        msa, dist, omega, theta, phi = self._slice_long_sequences(
            msa, dist, omega, theta, phi)

        mask = dist == 0

        dist_bins = np.digitize(dist, self._dist_bins)
        omega_bins = np.digitize(omega, self._dihedral_bins) + 1
        theta_bins = np.digitize(theta, self._dihedral_bins) + 1
        phi_bins = np.digitize(phi, self._planar_bins) + 1

        dist_bins[mask] = 0
        omega_bins[mask] = 0
        theta_bins[mask] = 0
        phi_bins[mask] = 0

        dist_bins[np.diag_indices_from(dist_bins)] = -1

        # input_mask = np.ones_like(msa[0])

        return msa, dist_bins, omega_bins, theta_bins, phi_bins 
Example #14
Source File: sort_fasta.py    From STRetch with MIT License
def main():
    # Parse command line arguments
    args = parse_args()
    infile = args.infile
    outfile = args.outfile

    with open(outfile, "w") as outfasta:

        allfasta = SeqIO.parse(infile, "fasta")
        ids = sorted([rec.id for rec in allfasta])
        record_index = SeqIO.index(infile, "fasta")
        records = (record_index[id] for id in ids)
        SeqIO.write(records, outfasta, "fasta")  # write to the already-open handle, not the path
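A note on the design: only the record IDs are sorted in memory, and the records themselves are then streamed back out one at a time through the SeqIO.index() lookup (the generator expression), so this approach works even for FASTA files too large to hold in memory with SeqIO.to_dict().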
Example #15
Source File: handleGB.py    From PhyloSuite with GNU General Public License v3.0
def remove_by_row_Indice(self, indice, base=None, proportion=None, processSig=None):
        sorted_indice = sorted(indice, reverse=True)
        for num, index in enumerate(sorted_indice):
            self.data.pop(index)
            if processSig:
                num += 1
                processSig.emit(base + (num / len(sorted_indice)) * proportion)
        return [self.header] + self.data 
Example #16
Source File: datasets.py    From tape with BSD 3-Clause "New" or "Revised" License
def __getitem__(self, index: int):
        if not 0 <= index < len(self):
            raise IndexError(index)

        item = dict(np.load(self._file_list[index]))
        if not isinstance(item, dict):
            raise TypeError(f"Expected dataset to contain a list of dictionary "
                            f"records, received record of type {type(item)}")
        if 'id' not in item:
            item['id'] = self._file_list[index].stem
        return item 
Example #17
Source File: handleGB.py    From PhyloSuite with GNU General Public License v3.0
def merge_gbIndex(self, genbanks):
        ## Time-consuming and prone to hanging
        merged_index = OrderedDict()
        for gb in genbanks:
            gb_index = SeqIO.index(gb, "genbank")
            merged_index.update(gb_index)
        return merged_index 
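The update() call above forces every record in each index to be parsed and held in memory, which is why the original comment warns that this is slow and prone to hanging. If lazy access across multiple GenBank files is acceptable, Biopython's SeqIO.index_db() can index several files into a single on-disk SQLite database instead; a minimal sketch, with hypothetical file names and accession:

from Bio import SeqIO

# Offsets for all input files are stored in merged.idx on disk;
# individual records are still parsed only on access.
merged = SeqIO.index_db("merged.idx", ["a.gb", "b.gb"], "genbank")
record = merged["SOME_ACCESSION"]  # hypothetical accession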
Example #18
Source File: handleGB.py    From PhyloSuite with GNU General Public License v3.0
def fetchInfo4extracter(self, list_IDs):
        '''Deprecated'''
        list_ID_path = [self.fetchRecordPath(ID) for ID in list_IDs]
        list_indice = self.merge_gbIndex(list_ID_path)
        allcontent = self.merge_file_contents(list_ID_path)
        rgx = re.compile(r"\/State=\"(\S+) (\S+)\"")
        unverifiedIDs = [i[0] for i in rgx.findall(allcontent) if i[-1] == "unverified"]
        names4extracter = [list_indice[index].annotations["organism"] + " " + list_indice[index].name for index in
                           list_indice]
        return names4extracter, allcontent, unverifiedIDs 
Example #19
Source File: handleGB.py    From PhyloSuite with GNU General Public License v3.0
def updateArrayByRow(self, ID, list_row, array):
        for index, row in enumerate(array):
            # First locate the row matching this ID
            if ID == row[0]:
                break
        array[index] = list_row
        return array 
Example #20
Source File: functions.py    From dgenies with GNU General Public License v3.0
def read_index(index_file):
        """
        Load index of query or target

        :param index_file: index file path
        :type index_file: str
        :return:
            * [0] index (size of each chromosome) {dict}
            * [1] sample name {str}
        :rtype: (dict, str)
        """
        index = OrderedDict()
        with open(index_file, "r") as index_f:
            # Sample name without special chars:
            sample_name = re.sub(r'[^A-Za-z0-9_\-.]+', '', index_f.readline().strip("\n").replace(" ", "_"))
            for line in index_f:
                if line != "":
                    parts = line.strip("\n").split("\t")
                    name = parts[0]
                    length = int(parts[1])
                    to_reverse = parts[2] == "1" if len(parts) >= 3 else False
                    index[name] = {
                        "length": length,
                        "to_reverse": to_reverse
                    }
        return index, sample_name 
Example #21
Source File: paf.py    From dgenies with GNU General Public License v3.0
def _flush_blocks(index_c, new_index_c, new_index_o, current_block):
        """
        While parsing the index, merge a run of small sequential contigs into a single mixed block (if the run
        contains at least 5 contigs); otherwise, add each contig to the new index unchanged

        :param index_c: current index contigs def
        :type index_c: dict
        :param new_index_o: new index contigs order
        :type new_index_o: list
        :param new_index_c: new index contigs def
        :type new_index_c: dict
        :param current_block: contigs in the current analyzed block
        :type current_block: list
        :return: (new index contigs defs, new index contigs order)
        :rtype: (dict, list)
        """
        if len(current_block) >= 5:
            block_length = 0
            for contig in current_block:
                block_length += index_c[contig]
            b_name = "###MIX###_" + "###".join(current_block)
            new_index_c[b_name] = block_length
            new_index_o.append(b_name)
        elif len(current_block) > 0:
            for b_name in current_block:
                new_index_c[b_name] = index_c[b_name]
                new_index_o.append(b_name)
        return new_index_c, new_index_o 
Example #22
Source File: paf.py    From dgenies with GNU General Public License v3.0
def _update_query_index(self, contigs_reoriented):
        """
        Write new query index file (including new reoriented contigs info)

        :param contigs_reoriented: reoriented contigs list
        :type contigs_reoriented: list
        """
        with open(self.idx_q, "w") as idx:
            idx.write(self.name_q + "\n")
            for contig in self.q_order:
                idx.write("\t".join([contig, str(self.q_contigs[contig]), "1" if contig in contigs_reoriented else "0"])
                          + "\n") 
Example #23
Source File: barcode_rarify.py    From amptk with BSD 2-Clause "Simplified" License
def IndexSeqs(file):
    global SeqIndex
    SeqIndex = SeqIO.index(file, 'fastq') 
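The module-level global here is presumably the usual multiprocessing pattern: since the dict-like object returned by SeqIO.index() holds an open file handle and cannot be pickled, IndexSeqs can be passed as a pool initializer so that each worker process builds its own index.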
Example #24
Source File: dbotu.py    From amplicon_sequencing_pipeline with MIT License
def _process_record(self, record_id):
        '''
        Process the next sequence: run the genetic, abundance, and distribution checks, either
        merging the sequence into an existing OTU or creating a new OTU.
        '''
        assert record_id in self.seq_table.index
        record = self.records[record_id]

        candidate = OTU(record.id, str(record.seq), self.seq_table.loc[record.id])

        if self.log is not None:
            print('seq', candidate.name, sep='\t', file=self.log)

        merged = False
        for otu in self.ga_matches(candidate):
            test_pval = candidate.distribution_pval(otu)

            if self.log is not None:
                print(candidate.name, 'distribution_check', otu.name, test_pval, sep='\t', file=self.log)

            if test_pval > self.threshold_pval:
                otu.absorb(candidate)
                self.membership[otu.name].append(candidate.name)
                merged = True
                break

        if not merged:
            # form own otu
            self.otus.append(candidate)
            self.membership[candidate.name] = [candidate.name] 
Example #25
Source File: funannotate-runEVM.py    From funannotate with BSD 2-Clause "Simplified" License
def getBreakPoint(tupList, idx, direction='reverse', gap=2000):
    # takes list of tuples of coords and a starting index (idx). finds closest
    # break point in between tuple coordSorted
    solution = False
    while not solution:
        try:
            start, end, diff = tupList[idx]
        except IndexError:
            return False
        if diff >= gap:
            phase = int(round(diff/2))
            solution = end + phase
        else:
            if direction == 'reverse':
                idx -= 1
            else:
                idx += 1
    return solution 
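A hypothetical demonstration: with tuples of the form (start, end, gap-to-next) and the default gap of 2000, the function walks from the starting index until it finds a tuple whose gap is wide enough, then returns the midpoint of that gap:

coords = [(0, 100, 500), (600, 900, 3000), (3900, 4200, 100)]
getBreakPoint(coords, 0, direction='forward')  # 900 + round(3000 / 2) -> 2400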
Example #26
Source File: atlas.py    From ssbio with MIT License
def _load_strain_sequences(self, strain_gempro):
        """Load strain sequences from the orthology matrix into the base model for comparisons, and into the
        strain-specific model itself.

        """
        if self._orthology_matrix_has_sequences:  # Load directly from the orthology matrix if it contains sequences
            strain_sequences = self.df_orthology_matrix[strain_gempro.id].to_dict()
        else:  # Otherwise load from the genome file if the orthology matrix contains gene IDs
            # Load the genome FASTA file
            log.debug('{}: loading strain genome CDS file'.format(strain_gempro.genome_path))
            strain_sequences = SeqIO.index(strain_gempro.genome_path, 'fasta')

        for strain_gene in strain_gempro.genes:
            if strain_gene.functional:
                if self._orthology_matrix_has_sequences:
                    strain_gene_key = strain_gene.id
                else:
                    # Pull the gene ID of the strain from the orthology matrix
                    strain_gene_key = self.df_orthology_matrix.loc[strain_gene.id, strain_gempro.id]
                    log.debug('{}: original gene ID to be pulled from strain fasta file'.format(strain_gene_key))

                # # Load into the base strain for comparisons
                ref_gene = self.reference_gempro.genes.get_by_id(strain_gene.id)
                new_id = '{}_{}'.format(strain_gene.id, strain_gempro.id)
                if ref_gene.protein.sequences.has_id(new_id):
                    log.debug('{}: sequence already loaded into reference model'.format(new_id))
                    continue
                ref_gene.protein.load_manual_sequence(seq=strain_sequences[strain_gene_key], ident=new_id,
                                                      set_as_representative=False)
                log.debug('{}: loaded sequence into reference model'.format(new_id))

                # Load into the strain GEM-PRO
                strain_gene.protein.load_manual_sequence(seq=strain_sequences[strain_gene_key], ident=new_id,
                                                         set_as_representative=True)
                log.debug('{}: loaded sequence into strain model'.format(new_id)) 
Example #27
Source File: atlas.py    From ssbio with MIT License
def build_strain_specific_models(self, save_models=False):
        """Using the orthologous genes matrix, create and modify the strain specific models based on if orthologous
            genes exist.

        Also store the sequences directly in the reference GEM-PRO protein sequence attribute for the strains.
        """

        if len(self.df_orthology_matrix) == 0:
            raise RuntimeError('Empty orthology matrix')

        # Create an emptied copy of the reference GEM-PRO
        for strain_gempro in tqdm(self.strains):
            log.debug('{}: building strain specific model'.format(strain_gempro.id))

            # For each genome, load the metabolic model or genes from the reference GEM-PRO
            logging.disable(logging.WARNING)
            if self._empty_reference_gempro.model:
                strain_gempro.load_cobra_model(self._empty_reference_gempro.model)
            elif self._empty_reference_gempro.genes:
                strain_gempro.genes = [x.id for x in self._empty_reference_gempro.genes]
            logging.disable(logging.NOTSET)

            # Get a list of genes which do not have orthology in the strain
            not_in_strain = self.df_orthology_matrix[pd.isnull(self.df_orthology_matrix[strain_gempro.id])][strain_gempro.id].index.tolist()

            # Mark genes non-functional
            self._pare_down_model(strain_gempro=strain_gempro, genes_to_remove=not_in_strain)

            # Load sequences into the base and strain models
            self._load_strain_sequences(strain_gempro=strain_gempro)

            if save_models:
                cobra.io.save_json_model(model=strain_gempro.model,
                                         filename=op.join(self.model_dir, '{}.json'.format(strain_gempro.id)))
                strain_gempro.save_pickle(op.join(self.model_dir, '{}_gp.pckl'.format(strain_gempro.id)))


        log.info('Created {} new strain-specific models and loaded in sequences'.format(len(self.strains))) 
Example #28
Source File: atlas2.py    From ssbio with MIT License
def _write_strain_functional_genes(self, strain_id, ref_functional_genes, orth_matrix, force_rerun=False):
        """Create strain functional genes json file"""
        func_genes_path = op.join(self.model_dir, '{}_funcgenes.json'.format(strain_id))

        if ssbio.utils.force_rerun(flag=force_rerun, outfile=func_genes_path):
            gene_to_func = {k:True for k in ref_functional_genes}
            # Get a list of genes which do not have orthology in the strain
            genes_to_remove = orth_matrix[pd.isnull(orth_matrix[strain_id])][strain_id].index.tolist()

            # Mark genes non-functional
            genes_to_remove = list(set(genes_to_remove).intersection(set(ref_functional_genes)))

            if len(genes_to_remove) > 0:
                for g in genes_to_remove:
                    gene_to_func[g] = False

            with open(func_genes_path, 'w') as f:
                json.dump(gene_to_func, f)
        else:
            with open(func_genes_path, 'r') as f:
                gene_to_func = json.load(f)

        return strain_id, gene_to_func 
Example #29
Source File: atlas2.py    From ssbio with MIT License
def _build_strain_specific_model(self, strain_id, ref_functional_genes, orth_matrix, force_rerun=False):
        """Create strain GEMPRO, set functional genes"""
        gp_noseqs_path = op.join(self.model_dir, '{}_gp.pckl'.format(strain_id))

        if ssbio.utils.force_rerun(flag=force_rerun, outfile=gp_noseqs_path):
            logging.disable(logging.WARNING)

            strain_gp = GEMPRO(gem_name=strain_id)

            # if self.reference_gempro.model:
            #     strain_gp.load_cobra_model(deepcopy(self.reference_gempro.model))
            #     # Reset the GenePro attributes
            #     for x in strain_gp.genes:
            #         x.reset_protein()
            # else:
                # Otherwise, just copy the list of genes over and rename the IDs
            strain_genes = [x for x in ref_functional_genes]
            strain_gp.add_gene_ids(strain_genes)

            logging.disable(logging.NOTSET)

            # Get a list of genes which do not have orthology in the strain
            genes_to_remove = orth_matrix[pd.isnull(orth_matrix[strain_id])][strain_id].index.tolist()

            # Mark genes non-functional
            strain_genes = [x.id for x in strain_gp.genes]
            genes_to_remove = list(set(genes_to_remove).intersection(set(strain_genes)))

            if len(genes_to_remove) > 0:
                # If a COBRApy model exists, utilize the delete_model_genes method
                # if strain_gp.model:
                #     cobra.manipulation.delete_model_genes(strain_gp.model, genes_to_remove)
                # # Otherwise, just mark the genes as non-functional
                # else:
                for g in genes_to_remove:
                    strain_gp.genes.get_by_id(g).functional = False

            strain_gp.save_pickle(outfile=gp_noseqs_path)

        return strain_id, gp_noseqs_path 
Example #30
Source File: dbotu.py    From amplicon_sequencing_pipeline with MIT License
def __init__(self, seq_table, records, max_dist, min_fold, threshold_pval, log=None):
        '''
        seq_table: pandas.DataFrame
          Samples on the columns; sequences on the rows
        records: index of Bio.Seq
          Indexed, unaligned input sequences. This could come from BioPython's
          SeqIO.to_dict or SeqIO.index.
        max_dist: float
          genetic distance cutoff above which a sequence will not be merged into an OTU
        min_fold: float
          Multiply the sequence's abundance by this fold to get the minimum abundance
          of an OTU for merging
        threshold_pval: float
          P-value below which a sequence will not be merged into an OTU
        log: filehandle
          Log file reporting the abundance, genetic, and distribution checks.
        '''
        self.seq_table = seq_table
        self.records = records
        self.max_dist = max_dist
        self.min_fold = min_fold
        self.threshold_pval = threshold_pval
        self.log = log

        # get a list of the names of the sequences in order of their (decreasing) abundance
        self.seq_abunds = self.seq_table.sum(axis=1).sort_values(ascending=False)

        # check that all sequence IDs in the table are in the fasta
        missing_ids = [seq_id for seq_id in self.seq_abunds.index if seq_id not in self.records]
        if len(missing_ids) > 0:
            raise RuntimeError("{} sequence IDs found in the sequence table but not in the fasta: {}".format(len(missing_ids), missing_ids))

        # initialize OTU information
        self.membership = {}
        self.otus = []
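A hedged usage sketch for this constructor; the enclosing class name Caller, the file names, and the parameter values are all hypothetical:

import pandas as pd
from Bio import SeqIO

seq_table = pd.read_csv("counts.tsv", sep="\t", index_col=0)  # samples on columns, sequences on rows
records = SeqIO.index("seqs.fasta", "fasta")  # lazy dict-like index, as the docstring suggests
caller = Caller(seq_table, records, max_dist=0.1, min_fold=8.0, threshold_pval=0.0005)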