Python Bio.SeqIO.index() Examples

The following are 30 code examples of Bio.SeqIO.index(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module Bio.SeqIO, or try the search function.
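For orientation, here is a minimal sketch of typical Bio.SeqIO.index() usage (the file name and record key below are hypothetical):

from Bio import SeqIO

# Build a lazy, dict-like index of a FASTA file: record offsets are kept
# in memory, but each record is only parsed from disk when accessed
# (unlike SeqIO.to_dict(), which loads every record up front).
record_index = SeqIO.index("example.fasta", "fasta")  # hypothetical file
record = record_index["seq_id_1"]                     # hypothetical record ID
print(record.id, len(record.seq))
record_index.close()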
Example #1
Source File: atlas2.py    From ssbio with MIT License
def _load_sequences_to_reference_gene(self, g_id, force_rerun=False):
        """Load orthologous strain sequences to reference Protein object, save as new pickle"""
        protein_seqs_pickle_path = op.join(self.sequences_by_gene_dir, '{}_protein_withseqs.pckl'.format(g_id))

        if ssbio.utils.force_rerun(flag=force_rerun, outfile=protein_seqs_pickle_path):
            protein_pickle_path = self.gene_protein_pickles[g_id]
            protein_pickle = ssbio.io.load_pickle(protein_pickle_path)

            for strain, info in self.strain_infodict.items():
                strain_sequences = SeqIO.index(info['genome_path'], 'fasta')
                strain_gene_functional = info['functional_genes'][g_id]
                if strain_gene_functional:
                    # Pull the gene ID of the strain from the orthology matrix
                    strain_gene_key = self.df_orthology_matrix.at[g_id, strain]
                    new_id = '{}_{}'.format(g_id, strain)
                    if protein_pickle.sequences.has_id(new_id):
                        continue
                    protein_pickle.load_manual_sequence(seq=strain_sequences[strain_gene_key],
                                                        ident=new_id,
                                                        set_as_representative=False)
            protein_pickle.save_pickle(outfile=protein_seqs_pickle_path)

        return g_id, protein_seqs_pickle_path 
Example #2
Source File: paf.py    From dgenies with GNU General Public License v3.0
def build_list_no_assoc(self, to):
        """
        Build the list of queries that match no target (or, conversely, targets that match no query)

        :param to: query or target
        :return: content of the file
        """
        index = self.idx_q if to == "query" else self.idx_t
        name, contigs_list, contigs, reversed, abs_start, c_len = Index.load(index)
        contigs_list = set(contigs_list)
        with open(self.paf, "r") as paf:
            for line in paf:
                c_name = line.strip("\n").split("\t")[0 if to == "query" else 5]
                if c_name in contigs_list:
                    contigs_list.remove(c_name)
        return "\n".join(contigs_list) + "\n" 
Example #3
Source File: functions.py    From dgenies with GNU General Public License v3.0
def compress_and_send_mail(job_name, fasta_file, index_file, lock_file, mailer):
        """
        Compress the fasta file, then send an email with its download link to the client

        :param job_name: job id
        :type job_name: str
        :param fasta_file: fasta file path
        :type fasta_file: str
        :param index_file: index file path
        :type index_file: str
        :param lock_file: lock file path
        :type lock_file: str
        :param mailer: mailer object (to send mail)
        :type mailer: Mailer
        """
        Functions.compress(fasta_file)
        os.remove(lock_file)
        index, sample_name = Functions.read_index(index_file)
        Functions.send_fasta_ready(mailer, job_name, sample_name, True) 
Example #4
Source File: datasets.py    From tape with BSD 3-Clause "New" or "Revised" License
def __getitem__(self, index: int):
        if not 0 <= index < self._num_examples:
            raise IndexError(index)

        # if self._in_memory and self._cache[index] is not None:
        record = self._cache[index]
        # else:
            # key = self._keys[index]
            # record = self._records[key]
            # if self._in_memory:
                # self._cache[index] = record

        item = {'id': record.id,
                'primary': str(record.seq),
                'protein_length': len(record.seq)}
        return item 
Example #5
Source File: util.py    From pomoxis with Mozilla Public License 2.0
def reverse_bed():
    """Convert bed-file coordinates to coordinates on the reverse strand."""
    parser = argparse.ArgumentParser(
        prog='reverse_bed',
        description='Convert bed-file coordinates to coordinates on the reverse strand.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('bed_in', help='Input bed file.')
    parser.add_argument('ref_fasta', help='Input reference fasta file.')
    parser.add_argument('bed_out', help='Output bed file.')
    args = parser.parse_args()

    fasta = pysam.FastaFile(args.ref_fasta)
    lengths = dict(zip(fasta.references, fasta.lengths))
    d = pd.read_csv(args.bed_in, sep='\t', names=['chrom', 'start', 'stop'])

    d['chrom_length'] = d['chrom'].map(lambda x: lengths[x])
    d['rc_stop'] = d['chrom_length'] - d['start']
    d['rc_start'] = d['chrom_length'] - d['stop']
    d['chrom_rc'] = d['chrom'] + '_rc'
    d[['chrom_rc', 'rc_start', 'rc_stop']].to_csv(args.bed_out, index=False, header=False, sep='\t') 
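A note on the arithmetic above: BED intervals are 0-based and half-open, so an interval [start, stop) on a chromosome of length L maps to [L - stop, L - start) on the reverse strand. For example, [2, 5) on a 10 bp chromosome becomes [5, 8).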
Example #6
Source File: determine_paralogs.py    From chewBBACA with GNU General Public License v3.0
def split_blast_inputs_by_core(blast_inputs, threads, blast_files_dir):
    """
    """

    splitted_ids = [[] for cpu in range(threads)]
    splitted_values = [[] for cpu in range(threads)]
    cluster_sums = [0] * threads
    i = 0
    for cluster in blast_inputs:
        cluster_file = os.path.join(blast_files_dir,
                                    '{0}_ids.txt'.format(cluster))
        with open(cluster_file, 'r') as infile:
            cluster_seqs = [line.strip() for line in infile.readlines()]
        splitted_values[i].append(len(cluster_seqs))
        splitted_ids[i].append(cluster)
        cluster_sums[i] += len(cluster_seqs)
        i = cluster_sums.index(min(cluster_sums))

    return [s for s in splitted_ids if len(s) > 0] 
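The line i = cluster_sums.index(min(cluster_sums)) implements a simple greedy balancing scheme: after each assignment, the next cluster goes to whichever thread bucket currently holds the fewest sequences, keeping per-thread BLAST workloads roughly even.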
Example #7
Source File: handleGB.py    From PhyloSuite with GNU General Public License v3.0
def remove_by_col_IDs(self, IDs):
        ## Time-consuming; consider rewriting
        zip_array = self.zip_an_array(self.array)
        indice = [num for num, colH in enumerate(self.header) if colH in IDs]
        for index in sorted(indice, reverse=True):
            zip_array.pop(index)
        return self.zip_an_array(zip_array) 
Example #8
Source File: datasets.py    From tape with BSD 3-Clause "New" or "Revised" License
def __getitem__(self, index):
        item = self.data[index]
        token_ids = self.tokenizer.encode(item['primary'])
        input_mask = np.ones_like(token_ids)

        return token_ids, input_mask, item['clan'], item['family'] 
Example #9
Source File: datasets.py    From tape with BSD 3-Clause "New" or "Revised" License
def __getitem__(self, index: int):
        item = self.data[index]
        token_ids = self.tokenizer.encode(item['primary'])
        input_mask = np.ones_like(token_ids)
        return token_ids, input_mask, float(item['log_fluorescence'][0]) 
Example #10
Source File: datasets.py    From tape with BSD 3-Clause "New" or "Revised" License
def __getitem__(self, index: int):
        item = self.data[index]
        token_ids = self.tokenizer.encode(item['primary'])
        input_mask = np.ones_like(token_ids)
        return token_ids, input_mask, float(item['stability_score'][0]) 
Example #11
Source File: datasets.py    From tape with BSD 3-Clause "New" or "Revised" License
def __getitem__(self, index: int):
        item = self.data[index]
        token_ids = self.tokenizer.encode(item['primary'])
        input_mask = np.ones_like(token_ids)
        return token_ids, input_mask, item['fold_label'] 
Example #12
Source File: datasets.py    From tape with BSD 3-Clause "New" or "Revised" License
def __getitem__(self, index: int):
        item = self.data[index]
        token_ids = self.tokenizer.encode(item['primary'])
        input_mask = np.ones_like(token_ids)

        # pad with -1s because of cls/sep tokens
        labels = np.asarray(item['ss3'], np.int64)
        labels = np.pad(labels, (1, 1), 'constant', constant_values=-1)

        return token_ids, input_mask, labels 
Example #13
Source File: datasets.py    From tape with BSD 3-Clause "New" or "Revised" License
def __getitem__(self, index):
        item = self.data[index]

        msa = item['msa']
        dist = item['dist6d']
        omega = item['omega6d']
        theta = item['theta6d']
        phi = item['phi6d']

        if self._split == 'train':
            msa = self._subsample_msa(msa)
        elif self._split == 'valid':
            msa = msa[:20000]  # runs out of memory if msa is way too big
        msa, dist, omega, theta, phi = self._slice_long_sequences(
            msa, dist, omega, theta, phi)

        mask = dist == 0

        dist_bins = np.digitize(dist, self._dist_bins)
        omega_bins = np.digitize(omega, self._dihedral_bins) + 1
        theta_bins = np.digitize(theta, self._dihedral_bins) + 1
        phi_bins = np.digitize(phi, self._planar_bins) + 1

        dist_bins[mask] = 0
        omega_bins[mask] = 0
        theta_bins[mask] = 0
        phi_bins[mask] = 0

        dist_bins[np.diag_indices_from(dist_bins)] = -1

        # input_mask = np.ones_like(msa[0])

        return msa, dist_bins, omega_bins, theta_bins, phi_bins 
Example #14
Source File: sort_fasta.py    From STRetch with MIT License
def main():
    # Parse command line arguments
    args = parse_args()
    infile = args.infile
    outfile = args.outfile

    with open(outfile, "w") as outfasta:

        allfasta = SeqIO.parse(infile, "fasta")
        ids = sorted([rec.id for rec in allfasta])
        record_index = SeqIO.index(infile, "fasta")
        records = (record_index[id] for id in ids)
        SeqIO.write(records, outfasta, "fasta")  # write to the already-open handle, not the path
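A note on the design: only the record IDs are sorted in memory, and the records themselves are then streamed back out one at a time through the SeqIO.index() lookup (the generator expression), so this approach works even for FASTA files too large to hold in memory with SeqIO.to_dict().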
Example #15
Source File: handleGB.py    From PhyloSuite with GNU General Public License v3.0
def remove_by_row_Indice(self, indice, base=None, proportion=None, processSig=None):
        sorted_indice = sorted(indice, reverse=True)
        for num, index in enumerate(sorted_indice):
            self.data.pop(index)
            if processSig:
                num += 1
                processSig.emit(base + (num / len(sorted_indice)) * proportion)
        return [self.header] + self.data 
Example #16
Source File: datasets.py    From tape with BSD 3-Clause "New" or "Revised" License
def __getitem__(self, index: int):
        if not 0 <= index < len(self):
            raise IndexError(index)

        item = dict(np.load(self._file_list[index]))
        if not isinstance(item, dict):
            raise TypeError(f"Expected dataset to contain a list of dictionary "
                            f"records, received record of type {type(item)}")
        if 'id' not in item:
            item['id'] = self._file_list[index].stem
        return item 
Example #17
Source File: handleGB.py    From PhyloSuite with GNU General Public License v3.0
def merge_gbIndex(self, genbanks):
        ## Time-consuming and prone to hanging
        merged_index = OrderedDict()
        for gb in genbanks:
            gb_index = SeqIO.index(gb, "genbank")
            merged_index.update(gb_index)
        return merged_index 
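The update() call above forces every record in each index to be parsed and held in memory, which is why the original comment warns that this is slow and prone to hanging. If lazy access across multiple GenBank files is acceptable, Biopython's SeqIO.index_db() can index several files into a single on-disk SQLite database instead; a minimal sketch, with hypothetical file names and accession:

from Bio import SeqIO

# Offsets for all input files are stored in merged.idx on disk;
# individual records are still parsed only on access.
merged = SeqIO.index_db("merged.idx", ["a.gb", "b.gb"], "genbank")
record = merged["SOME_ACCESSION"]  # hypothetical accession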
Example #18
Source File: handleGB.py    From PhyloSuite with GNU General Public License v3.0
def fetchInfo4extracter(self, list_IDs):
        '''Deprecated'''
        list_ID_path = [self.fetchRecordPath(ID) for ID in list_IDs]
        list_indice = self.merge_gbIndex(list_ID_path)
        allcontent = self.merge_file_contents(list_ID_path)
        rgx = re.compile(r"\/State=\"(\S+) (\S+)\"")
        unverifiedIDs = [i[0] for i in rgx.findall(allcontent) if i[-1] == "unverified"]
        names4extracter = [list_indice[index].annotations["organism"] + " " + list_indice[index].name for index in
                           list_indice]
        return names4extracter, allcontent, unverifiedIDs 
Example #19
Source File: handleGB.py    From PhyloSuite with GNU General Public License v3.0
def updateArrayByRow(self, ID, list_row, array):
        for index, row in enumerate(array):
            # First locate the row matching this ID
            if ID == row[0]:
                break
        array[index] = list_row
        return array 
Example #20
Source File: functions.py    From dgenies with GNU General Public License v3.0
def read_index(index_file):
        """
        Load index of query or target

        :param index_file: index file path
        :type index_file: str
        :return:
            * [0] index (size of each chromosome) {dict}
            * [1] sample name {str}
        :rtype: (dict, str)
        """
        index = OrderedDict()
        with open(index_file, "r") as index_f:
            # Sample name without special chars:
            sample_name = re.sub(r'[^A-Za-z0-9_\-.]+', '', index_f.readline().strip("\n").replace(" ", "_"))
            for line in index_f:
                if line != "":
                    parts = line.strip("\n").split("\t")
                    name = parts[0]
                    length = int(parts[1])
                    to_reverse = parts[2] == "1" if len(parts) >= 3 else False
                    index[name] = {
                        "length": length,
                        "to_reverse": to_reverse
                    }
        return index, sample_name 
Example #21
Source File: paf.py    From dgenies with GNU General Public License v3.0
def _flush_blocks(index_c, new_index_c, new_index_o, current_block):
        """
        While parsing the index, merge a run of small sequential contigs into a single mixed block (if the run
        contains at least 5 contigs); otherwise, add each contig to the new index unchanged

        :param index_c: current index contigs def
        :type index_c: dict
        :param new_index_o: new index contigs order
        :type new_index_o: list
        :param new_index_c: new index contigs def
        :type new_index_c: dict
        :param current_block: contigs in the current analyzed block
        :type current_block: list
        :return: (new index contigs defs, new index contigs order)
        :rtype: (dict, list)
        """
        if len(current_block) >= 5:
            block_length = 0
            for contig in current_block:
                block_length += index_c[contig]
            b_name = "###MIX###_" + "###".join(current_block)
            new_index_c[b_name] = block_length
            new_index_o.append(b_name)
        elif len(current_block) > 0:
            for b_name in current_block:
                new_index_c[b_name] = index_c[b_name]
                new_index_o.append(b_name)
        return new_index_c, new_index_o 
Example #22
Source File: paf.py    From dgenies with GNU General Public License v3.0
def _update_query_index(self, contigs_reoriented):
        """
        Write new query index file (including new reoriented contigs info)

        :param contigs_reoriented: reoriented contigs list
        :type contigs_reoriented: list
        """
        with open(self.idx_q, "w") as idx:
            idx.write(self.name_q + "\n")
            for contig in self.q_order:
                idx.write("\t".join([contig, str(self.q_contigs[contig]), "1" if contig in contigs_reoriented else "0"])
                          + "\n") 
Example #23
Source File: barcode_rarify.py    From amptk with BSD 2-Clause "Simplified" License
def IndexSeqs(file):
    global SeqIndex
    SeqIndex = SeqIO.index(file, 'fastq') 
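The module-level global here is presumably the usual multiprocessing pattern: since the dict-like object returned by SeqIO.index() holds an open file handle and cannot be pickled, IndexSeqs can be passed as a pool initializer so that each worker process builds its own index.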
Example #24
Source File: dbotu.py    From amplicon_sequencing_pipeline with MIT License
def _process_record(self, record_id):
        '''
        Process the next sequence: run the genetic, abundance, and distribution checks, either
        merging the sequence into an existing OTU or creating a new OTU.
        '''
        assert record_id in self.seq_table.index
        record = self.records[record_id]

        candidate = OTU(record.id, str(record.seq), self.seq_table.loc[record.id])

        if self.log is not None:
            print('seq', candidate.name, sep='\t', file=self.log)

        merged = False
        for otu in self.ga_matches(candidate):
            test_pval = candidate.distribution_pval(otu)

            if self.log is not None:
                print(candidate.name, 'distribution_check', otu.name, test_pval, sep='\t', file=self.log)

            if test_pval > self.threshold_pval:
                otu.absorb(candidate)
                self.membership[otu.name].append(candidate.name)
                merged = True
                break

        if not merged:
            # form own otu
            self.otus.append(candidate)
            self.membership[candidate.name] = [candidate.name] 
Example #25
Source File: funannotate-runEVM.py    From funannotate with BSD 2-Clause "Simplified" License
def getBreakPoint(tupList, idx, direction='reverse', gap=2000):
    # takes list of tuples of coords and a starting index (idx). finds closest
    # break point in between tuple coordSorted
    solution = False
    while not solution:
        try:
            start, end, diff = tupList[idx]
        except IndexError:
            return False
        if diff >= gap:
            phase = int(round(diff/2))
            solution = end + phase
        else:
            if direction == 'reverse':
                idx -= 1
            else:
                idx += 1
    return solution 
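A hypothetical demonstration: with tuples of the form (start, end, gap-to-next) and the default gap of 2000, the function walks from the starting index until it finds a tuple whose gap is wide enough, then returns the midpoint of that gap:

coords = [(0, 100, 500), (600, 900, 3000), (3900, 4200, 100)]
getBreakPoint(coords, 0, direction='forward')  # 900 + round(3000 / 2) -> 2400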
Example #26
Source File: atlas.py    From ssbio with MIT License
def _load_strain_sequences(self, strain_gempro):
        """Load strain sequences from the orthology matrix into the base model for comparisons, and into the
        strain-specific model itself.

        """
        if self._orthology_matrix_has_sequences:  # Load directly from the orthology matrix if it contains sequences
            strain_sequences = self.df_orthology_matrix[strain_gempro.id].to_dict()
        else:  # Otherwise load from the genome file if the orthology matrix contains gene IDs
            # Load the genome FASTA file
            log.debug('{}: loading strain genome CDS file'.format(strain_gempro.genome_path))
            strain_sequences = SeqIO.index(strain_gempro.genome_path, 'fasta')

        for strain_gene in strain_gempro.genes:
            if strain_gene.functional:
                if self._orthology_matrix_has_sequences:
                    strain_gene_key = strain_gene.id
                else:
                    # Pull the gene ID of the strain from the orthology matrix
                    strain_gene_key = self.df_orthology_matrix.loc[strain_gene.id, strain_gempro.id]
                    log.debug('{}: original gene ID to be pulled from strain fasta file'.format(strain_gene_key))

                # # Load into the base strain for comparisons
                ref_gene = self.reference_gempro.genes.get_by_id(strain_gene.id)
                new_id = '{}_{}'.format(strain_gene.id, strain_gempro.id)
                if ref_gene.protein.sequences.has_id(new_id):
                    log.debug('{}: sequence already loaded into reference model'.format(new_id))
                    continue
                ref_gene.protein.load_manual_sequence(seq=strain_sequences[strain_gene_key], ident=new_id,
                                                      set_as_representative=False)
                log.debug('{}: loaded sequence into reference model'.format(new_id))

                # Load into the strain GEM-PRO
                strain_gene.protein.load_manual_sequence(seq=strain_sequences[strain_gene_key], ident=new_id,
                                                         set_as_representative=True)
                log.debug('{}: loaded sequence into strain model'.format(new_id)) 
Example #27
Source File: atlas.py    From ssbio with MIT License
def build_strain_specific_models(self, save_models=False):
        """Using the orthologous genes matrix, create and modify the strain specific models based on if orthologous
            genes exist.

        Also store the sequences directly in the reference GEM-PRO protein sequence attribute for the strains.
        """

        if len(self.df_orthology_matrix) == 0:
            raise RuntimeError('Empty orthology matrix')

        # Create an emptied copy of the reference GEM-PRO
        for strain_gempro in tqdm(self.strains):
            log.debug('{}: building strain specific model'.format(strain_gempro.id))

            # For each genome, load the metabolic model or genes from the reference GEM-PRO
            logging.disable(logging.WARNING)
            if self._empty_reference_gempro.model:
                strain_gempro.load_cobra_model(self._empty_reference_gempro.model)
            elif self._empty_reference_gempro.genes:
                strain_gempro.genes = [x.id for x in self._empty_reference_gempro.genes]
            logging.disable(logging.NOTSET)

            # Get a list of genes which do not have orthology in the strain
            not_in_strain = self.df_orthology_matrix[pd.isnull(self.df_orthology_matrix[strain_gempro.id])][strain_gempro.id].index.tolist()

            # Mark genes non-functional
            self._pare_down_model(strain_gempro=strain_gempro, genes_to_remove=not_in_strain)

            # Load sequences into the base and strain models
            self._load_strain_sequences(strain_gempro=strain_gempro)

            if save_models:
                cobra.io.save_json_model(model=strain_gempro.model,
                                         filename=op.join(self.model_dir, '{}.json'.format(strain_gempro.id)))
                strain_gempro.save_pickle(op.join(self.model_dir, '{}_gp.pckl'.format(strain_gempro.id)))


        log.info('Created {} new strain-specific models and loaded in sequences'.format(len(self.strains))) 
Example #28
Source File: atlas2.py    From ssbio with MIT License
def _write_strain_functional_genes(self, strain_id, ref_functional_genes, orth_matrix, force_rerun=False):
        """Create strain functional genes json file"""
        func_genes_path = op.join(self.model_dir, '{}_funcgenes.json'.format(strain_id))

        if ssbio.utils.force_rerun(flag=force_rerun, outfile=func_genes_path):
            gene_to_func = {k:True for k in ref_functional_genes}
            # Get a list of genes which do not have orthology in the strain
            genes_to_remove = orth_matrix[pd.isnull(orth_matrix[strain_id])][strain_id].index.tolist()

            # Mark genes non-functional
            genes_to_remove = list(set(genes_to_remove).intersection(set(ref_functional_genes)))

            if len(genes_to_remove) > 0:
                for g in genes_to_remove:
                    gene_to_func[g] = False

            with open(func_genes_path, 'w') as f:
                json.dump(gene_to_func, f)
        else:
            with open(func_genes_path, 'r') as f:
                gene_to_func = json.load(f)

        return strain_id, gene_to_func 
Example #29
Source File: atlas2.py    From ssbio with MIT License
def _build_strain_specific_model(self, strain_id, ref_functional_genes, orth_matrix, force_rerun=False):
        """Create strain GEMPRO, set functional genes"""
        gp_noseqs_path = op.join(self.model_dir, '{}_gp.pckl'.format(strain_id))

        if ssbio.utils.force_rerun(flag=force_rerun, outfile=gp_noseqs_path):
            logging.disable(logging.WARNING)

            strain_gp = GEMPRO(gem_name=strain_id)

            # if self.reference_gempro.model:
            #     strain_gp.load_cobra_model(deepcopy(self.reference_gempro.model))
            #     # Reset the GenePro attributes
            #     for x in strain_gp.genes:
            #         x.reset_protein()
            # else:
                # Otherwise, just copy the list of genes over and rename the IDs
            strain_genes = [x for x in ref_functional_genes]
            strain_gp.add_gene_ids(strain_genes)

            logging.disable(logging.NOTSET)

            # Get a list of genes which do not have orthology in the strain
            genes_to_remove = orth_matrix[pd.isnull(orth_matrix[strain_id])][strain_id].index.tolist()

            # Mark genes non-functional
            strain_genes = [x.id for x in strain_gp.genes]
            genes_to_remove = list(set(genes_to_remove).intersection(set(strain_genes)))

            if len(genes_to_remove) > 0:
                # If a COBRApy model exists, utilize the delete_model_genes method
                # if strain_gp.model:
                #     cobra.manipulation.delete_model_genes(strain_gp.model, genes_to_remove)
                # # Otherwise, just mark the genes as non-functional
                # else:
                for g in genes_to_remove:
                    strain_gp.genes.get_by_id(g).functional = False

            strain_gp.save_pickle(outfile=gp_noseqs_path)

        return strain_id, gp_noseqs_path 
Example #30
Source File: dbotu.py    From amplicon_sequencing_pipeline with MIT License
def __init__(self, seq_table, records, max_dist, min_fold, threshold_pval, log=None):
        '''
        seq_table: pandas.DataFrame
          Samples on the columns; sequences on the rows
        records: index of Bio.Seq
          Indexed, unaligned input sequences. This could come from BioPython's
          SeqIO.to_dict or SeqIO.index.
        max_dist: float
          genetic distance cutoff above which a sequence will not be merged into an OTU
        min_fold: float
          Multiply the sequence's abundance by this fold to get the minimum abundance
          of an OTU for merging
        threshold_pval: float
          P-value below which a sequence will not be merged into an OTU
        log: filehandle
          Log file reporting the abundance, genetic, and distribution checks.
        '''
        self.seq_table = seq_table
        self.records = records
        self.max_dist = max_dist
        self.min_fold = min_fold
        self.threshold_pval = threshold_pval
        self.log = log

        # get a list of the names of the sequences in order of their (decreasing) abundance
        self.seq_abunds = self.seq_table.sum(axis=1).sort_values(ascending=False)

        # check that all sequence IDs in the table are in the fasta
        missing_ids = [seq_id for seq_id in self.seq_abunds.index if seq_id not in self.records]
        if len(missing_ids) > 0:
            raise RuntimeError("{} sequence IDs found in the sequence table but not in the fasta: {}".format(len(missing_ids), missing_ids))

        # initialize OTU information
        self.membership = {}
        self.otus = []
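A hedged usage sketch for this constructor; the enclosing class name Caller, the file names, and the parameter values are all hypothetical:

import pandas as pd
from Bio import SeqIO

seq_table = pd.read_csv("counts.tsv", sep="\t", index_col=0)  # samples on columns, sequences on rows
records = SeqIO.index("seqs.fasta", "fasta")  # lazy dict-like index, as the docstring suggests
caller = Caller(seq_table, records, max_dist=0.1, min_fold=8.0, threshold_pval=0.0005)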