Python rdkit.DataStructs.BulkTanimotoSimilarity() Examples
The following are 15 code examples of rdkit.DataStructs.BulkTanimotoSimilarity().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module rdkit.DataStructs, or try the search function.
Example #1
Source File: sim_to_train.py From PIDGINv2 with MIT License | 7 votes |
def doSimSearch(model_name):
    """Find, for every query fingerprint, the most similar active of a model.

    The compounds of the model are read from ``actives/<mod>.smi.zip`` next to
    this source file, where ``<mod>`` is the file name of ``model_name`` up to
    its first dot.  The archive member ``<mod>.smi`` is split into lines and
    each line into tab-separated fields; field 1 is passed to
    ``calcFingerprints``.

    Reads the module-level ``querymatrix`` (query fingerprints) and ``smiles``
    (indexed in step with ``querymatrix``).

    Args:
        model_name: path to the model file.

    Returns:
        A list with one row per query, of the form
        ``[best_similarity, mod] + compound_fields + [smiles[i]]``, or
        ``None`` when the archive cannot be opened (``IOError``).
    """
    # Model identifier: last path component, truncated at the first dot.
    mod = model_name.split(os.sep)[-1].split('.')[0]
    archive = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'actives', mod + '.smi.zip')
    try:
        with zipfile.ZipFile(archive, 'r') as zfile:
            # Close the member handle explicitly instead of leaking it.
            with zfile.open(mod + '.smi', 'r') as handle:
                raw = handle.read()
    except IOError:
        return
    # Zip members are read as bytes on Python 3; splitting bytes on a str
    # separator would raise TypeError.  On Python 2 ``raw`` is already ``str``
    # and is left untouched.
    if not isinstance(raw, str):
        raw = raw.decode('utf-8')
    comps = [line.split('\t') for line in raw.splitlines()]

    comps2 = []
    afp = []
    for comp in comps:
        try:
            fp = calcFingerprints(comp[1])
        except Exception:
            # Best effort: skip entries that cannot be fingerprinted (or that
            # lack a second field), but let KeyboardInterrupt/SystemExit
            # propagate, which the former bare ``except`` did not.
            continue
        # Keep both lists aligned so that an index into ``afp`` is also valid
        # for ``comps2``.
        afp.append(fp)
        comps2.append(comp)

    ret = []
    for i, fp in enumerate(querymatrix):
        sims = DataStructs.BulkTanimotoSimilarity(fp, afp)
        idx = sims.index(max(sims))
        ret.append([sims[idx], mod] + comps2[idx] + [smiles[i]])
    return ret
#prediction runner
Example #2
Source File: chemistry.py From guacamol with MIT License | 6 votes |
def calculate_internal_pairwise_similarities(smiles_list: Collection[str]) -> np.array:
    """
    Build the all-against-all Tanimoto similarity matrix of a SMILES collection.

    The SMILES are converted with ``get_mols`` and ``get_fingerprints``; a
    warning is logged when more than 10000 SMILES are supplied.

    Args:
        smiles_list: the SMILES strings to compare with each other.

    Returns:
        Symmetric square matrix of pairwise similarities, one row/column per
        fingerprint. The diagonal is left at zero.
    """
    n_smiles = len(smiles_list)
    if n_smiles > 10000:
        logger.warning(f'Calculating internal similarity on large set of '
                       f'SMILES strings ({n_smiles})')

    fingerprints = get_fingerprints(get_mols(smiles_list))
    count = len(fingerprints)
    matrix = np.zeros((count, count))

    # Only the lower triangle is computed; each row of results is mirrored
    # into the matching column to keep the matrix symmetric.
    for row in range(1, count):
        row_sims = DataStructs.BulkTanimotoSimilarity(fingerprints[row], fingerprints[:row])
        matrix[row, :row] = row_sims
        matrix[:row, row] = row_sims

    return matrix
Example #3
Source File: chemistry.py From guacamol with MIT License | 6 votes |
def highest_tanimoto_precalc_fps(mol, fps):
    """
    Return the highest Tanimoto similarity between ``mol`` and ``fps``.

    Args:
        mol: Rdkit molecule
        fps: precalculated ECFP4 bitvectors

    Returns:
        The maximum similarity of the Morgan bit-vector fingerprint of ``mol``
        (computed with arguments ``2, 4096``) to any entry of ``fps``, or ``0``
        when ``fps`` is ``None`` or empty.
    """
    nothing_to_compare = fps is None or len(fps) == 0
    if nothing_to_compare:
        return 0

    query_fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, 4096)
    return np.array(DataStructs.BulkTanimotoSimilarity(query_fp, fps)).max()
Example #4
Source File: predict.py From PIDGINv3 with GNU General Public License v3.0 | 6 votes |
def doPercentileCalculation(model_name):
    """Compute a percentile for every molecule in ``rdkit_mols``.

    For each molecule the Tanimoto similarities to column 0 of the model's
    ``getAdData`` table are divided by ``bias * std_dev`` (columns 2 and 3);
    the largest resulting weight is ranked with ``percentileofscore`` against
    the scores in column 5.
    NOTE(review): the column meanings are taken from the local variable names
    only -- confirm against ``getAdData``.

    Args:
        model_name: identifier passed to ``getAdData`` and, when
            ``options.ad_smiles`` is set, to ``get_training_smiles``.

    Returns:
        ``(model_name, results)`` where ``results`` holds one
        ``(percentile, critical_smiles)`` tuple per molecule;
        ``critical_smiles`` is ``None`` unless ``options.ad_smiles`` is set.
    """
    global rdkit_mols
    # expensive to unzip training file - so only done if smiles requested
    if options.ad_smiles:
        smiles = get_training_smiles(model_name)
    ad_data = getAdData(model_name)

    def calcPercentile(rdkit_mol):
        similarity = DataStructs.BulkTanimotoSimilarity(rdkit_mol, ad_data[:, 0])
        bias, std_dev, scores = (ad_data[:, col].astype(float) for col in (2, 3, 5))
        weights = similarity / (bias * std_dev)
        percentile = percentileofscore(scores, weights.max())
        # The training SMILES at the position of the largest weight is only
        # reported when it was loaded above.
        nearest = smiles[np.argmax(weights)] if options.ad_smiles else None
        return percentile, nearest

    return model_name, [calcPercentile(mol) for mol in rdkit_mols]
#prediction runner for percentile calculation
Example #5
Source File: splitters.py From deepchem with MIT License | 5 votes |
def ClusterFps(fps, cutoff=0.2):
    """Cluster fingerprints with ``Butina.ClusterData`` on Tanimoto distances.

    Args:
        fps: sequence of fingerprints accepted by ``BulkTanimotoSimilarity``.
        cutoff: distance cutoff forwarded to ``Butina.ClusterData``.

    Returns:
        Whatever ``Butina.ClusterData`` returns for the computed distances.
    """
    # (ytz): this is directly copypasta'd from Greg Landrum's clustering example.
    from rdkit import DataStructs
    from rdkit.ML.Cluster import Butina

    count = len(fps)
    distances = []
    # Flattened lower triangle: entry i contributes its distances to all
    # earlier fingerprints.
    for i in range(1, count):
        for sim in DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i]):
            distances.append(1 - sim)
    return Butina.ClusterData(distances, count, cutoff, isDistData=True)
Example #6
Source File: splitters.py From deepchem with MIT License | 5 votes |
def ClusterFps(fps, cutoff=0.2):
    """Group fingerprints by running ``Butina.ClusterData`` on Tanimoto distances.

    Args:
        fps: sequence of fingerprints accepted by ``BulkTanimotoSimilarity``.
        cutoff: distance cutoff forwarded to ``Butina.ClusterData``.

    Returns:
        The result of ``Butina.ClusterData``.
    """
    # (ytz): this is directly copypasta'd from Greg Landrum's clustering example.
    n_fps = len(fps)
    # Distance (1 - similarity) of every fingerprint to each one before it,
    # flattened row by row.
    lower_triangle = [
        1 - sim
        for i in range(1, n_fps)
        for sim in DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
    ]
    return Butina.ClusterData(lower_triangle, n_fps, cutoff, isDistData=True)
Example #7
Source File: splitters.py From PADME with MIT License | 5 votes |
def ClusterFps(fps, cutoff=0.2):
    """Run ``Butina.ClusterData`` over pairwise Tanimoto distances of ``fps``.

    Args:
        fps: sequence of fingerprints accepted by ``BulkTanimotoSimilarity``.
        cutoff: distance cutoff forwarded to ``Butina.ClusterData``.

    Returns:
        The clustering produced by ``Butina.ClusterData``.
    """
    # (ytz): this is directly copypasta'd from Greg Landrum's clustering example.
    total = len(fps)
    pair_dists = []
    for idx in range(1, total):
        # Compare fingerprint ``idx`` only with its predecessors so that each
        # pair is visited once.
        row = DataStructs.BulkTanimotoSimilarity(fps[idx], fps[:idx])
        pair_dists += [1 - value for value in row]
    clusters = Butina.ClusterData(pair_dists, total, cutoff, isDistData=True)
    return clusters
Example #8
Source File: dist_metrics.py From AMPL with MIT License | 5 votes |
def tanimoto_worker(k, fps):
    """Get the Tanimoto distances from fingerprint ``k`` to all later ones.

    Args:
        k: index into ``fps`` of the reference fingerprint.
        fps: sequence of fingerprints.

    Returns:
        A tuple ``(distances, 0)`` where ``distances`` is an array of
        ``1 - similarity`` between ``fps[k]`` and every entry of
        ``fps[k + 1:]``. The second element is always ``0``.
    """
    # pylint: disable=no-member
    reference = fps[k]
    following = fps[(k + 1):]
    similarities = DataStructs.BulkTanimotoSimilarity(reference, following)
    return np.array([1. - value for value in similarities]), 0
Example #9
Source File: dist_metrics.py From AMPL with MIT License | 5 votes |
def tanimoto_single(fp, fps):
    """Get the Tanimoto distances from one fingerprint to a set of fingerprints.

    Args:
        fp: the reference fingerprint.
        fps: fingerprints to compare against.

    Returns:
        A tuple ``(distances, 0)`` where ``distances`` is an array holding
        ``1 - similarity`` for each entry of ``fps``. The second element is
        always ``0``.
    """
    # pylint: disable=no-member
    distances = []
    for similarity in DataStructs.BulkTanimotoSimilarity(fp, fps):
        distances.append(1. - similarity)
    return np.array(distances), 0
Example #10
Source File: molecular_metrics.py From MolGAN with MIT License | 5 votes |
def __compute_diversity(mol, fps):
    """Return the mean Tanimoto distance between ``mol`` and ``fps``.

    Args:
        mol: molecule to fingerprint (Morgan bit vector, arguments ``4`` and
            ``nBits=2048``).
        fps: reference fingerprints.

    Returns:
        ``np.mean`` of the distances reported by ``BulkTanimotoSimilarity``
        with ``returnDistance=True``.
    """
    query_fp = Chem.rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, 4, nBits=2048)
    distances = DataStructs.BulkTanimotoSimilarity(query_fp, fps, returnDistance=True)
    return np.mean(distances)
Example #11
Source File: molecular_metrics.py From graph-nvp with MIT License | 5 votes |
def __compute_diversity(mol, fps):
    """Average the Tanimoto distances from ``mol`` to every entry of ``fps``.

    Args:
        mol: molecule whose Morgan bit-vector fingerprint (arguments ``4`` and
            ``nBits=2048``) is used as the reference.
        fps: fingerprints to measure against.

    Returns:
        The mean of the distances returned by ``BulkTanimotoSimilarity`` when
        called with ``returnDistance=True``.
    """
    fingerprint_of = Chem.rdMolDescriptors.GetMorganFingerprintAsBitVect
    return np.mean(
        DataStructs.BulkTanimotoSimilarity(
            fingerprint_of(mol, 4, nBits=2048), fps, returnDistance=True))
Example #12
Source File: chemistry.py From guacamol with MIT License | 5 votes |
def calculate_pairwise_similarities(smiles_list1: List[str], smiles_list2: List[str]) -> np.array:
    """
    Compare every SMILES of the first list with every SMILES of the second.

    Both lists are converted with ``get_mols`` and ``get_fingerprints`` (ECFP4
    according to the original documentation) and compared by Tanimoto
    similarity. A warning is logged when either list has more than 10000
    entries.

    Args:
        smiles_list1: SMILES strings that index the rows of the result.
        smiles_list2: SMILES strings that index the columns of the result.

    Returns:
        Pairwise similarity matrix as np.array
    """
    n_first, n_second = len(smiles_list1), len(smiles_list2)
    if n_first > 10000 or n_second > 10000:
        logger.warning(f'Calculating similarity between large sets of '
                       f'SMILES strings ({n_first} x {n_second})')

    row_fps = get_fingerprints(get_mols(smiles_list1))
    column_fps = get_fingerprints(get_mols(smiles_list2))

    # One bulk comparison per row fingerprint.
    return np.array([
        DataStructs.BulkTanimotoSimilarity(row_fp, column_fps)
        for row_fp in row_fps
    ])
Example #13
Source File: mol_metrics.py From ORGAN with GNU General Public License v2.0 | 5 votes |
def diversity(smile, fps):
    """Score how far ``smile`` is, on average, from the fingerprints ``fps``.

    The mean Tanimoto distance between the Morgan bit-vector fingerprint of
    ``smile`` (arguments ``4`` and ``nBits=2048``) and ``fps`` is passed
    through ``remap`` with the bounds 0.9 and 0.945, then clipped to
    ``[0.0, 1.0]``.

    Args:
        smile: SMILES string of the molecule to score.
        fps: reference fingerprints.

    Returns:
        The clipped, remapped mean distance.
    """
    low_rand_dst = 0.9
    mean_div_dst = 0.945

    reference = Chem.GetMorganFingerprintAsBitVect(
        Chem.MolFromSmiles(smile), 4, nBits=2048)
    distances = DataStructs.BulkTanimotoSimilarity(
        reference, fps, returnDistance=True)
    remapped = remap(np.mean(np.array(distances)), low_rand_dst, mean_div_dst)
    return np.clip(remapped, 0.0, 1.0)
#==============
Example #14
Source File: mol_distance.py From ORGAN with GNU General Public License v2.0 | 5 votes |
def tanimoto_1d(fps):
    """Return the pairwise Tanimoto distances of ``fps`` as one flat list.

    Each fingerprint is compared with all fingerprints that precede it, so the
    result lists the lower triangle of the distance matrix row by row.

    Args:
        fps: sequence of fingerprints.

    Returns:
        A flat list of distances; empty when ``fps`` has fewer than two
        entries.
    """
    flat = []
    upper = 1
    total = len(fps)
    while upper < total:
        row = DataStructs.BulkTanimotoSimilarity(
            fps[upper], fps[:upper], returnDistance=True)
        flat.extend(row)
        upper += 1
    return flat
Example #15
Source File: metric.py From DrugEx with MIT License | 5 votes |
def diversity(fake_path, real_path=None, is_active=False):
    """Measure molecular diversity from Tanimoto similarity of Morgan fingerprints.

    Fingerprints are computed with ``AllChem.GetMorganFingerprint(mol, 3)``
    (ECFP6 according to the original documentation).

    Without ``real_path`` the intra-diversity is calculated: each molecule's
    value is one minus its mean similarity to all molecules of the
    ``fake_path`` table. With ``real_path`` the inter-diversity is calculated:
    each molecule's value is one minus its minimum similarity to the
    molecules of the reference table.

    Arguments:
        fake_path (str): path of the table of molecules whose diversity is
            measured. Rows are kept when ``SCORE`` is above the threshold and
            duplicates of ``CANONICAL_SMILES`` are dropped.
        real_path (str, optional): path of the reference table; rows are kept
            when ``PCHEMBL_VALUE`` reaches the threshold.
        is_active (bool, optional): if True the thresholds are
            ``SCORE > 0.5`` and ``PCHEMBL_VALUE >= 6.5``; otherwise
            ``SCORE > 0`` and ``PCHEMBL_VALUE >= 0``. (Default: False)

    Returns:
        df (DataFrame): the filtered ``fake_path`` table with an added
            ``DIST`` column holding the diversity value of each molecule.
    """
    def _fingerprints(table):
        # One Morgan fingerprint per CANONICAL_SMILES entry, in table order.
        return [AllChem.GetMorganFingerprint(Chem.MolFromSmiles(smi), 3)
                for smi in table.CANONICAL_SMILES]

    score_floor = 0.5 if is_active else 0
    fake = pd.read_table(fake_path)
    fake = fake[fake.SCORE > score_floor]
    fake = fake.drop_duplicates(subset='CANONICAL_SMILES')
    fake_fps = _fingerprints(fake)

    if real_path:
        activity_floor = 6.5 if is_active else 0
        real = pd.read_table(real_path)
        real = real[real.PCHEMBL_VALUE >= activity_floor]
        real_fps = _fingerprints(real)
        method = np.min
    else:
        # No reference set: compare the molecules with themselves.
        real_fps = fake_fps
        method = np.mean

    aggregated = [method(DataStructs.BulkTanimotoSimilarity(fp, real_fps))
                  for fp in fake_fps]
    fake['DIST'] = 1 - np.array(aggregated)
    return fake