Python rdkit.DataStructs.BulkTanimotoSimilarity() Examples

The following are 15 code examples of rdkit.DataStructs.BulkTanimotoSimilarity(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module rdkit.DataStructs, or try the search function.
Example #1
Source File: sim_to_train.py    From PIDGINv2 with MIT License 7 votes vote down vote up
def doSimSearch(model_name):
	"""Find, for every query fingerprint, the most similar active of one model.

	The model's actives are read from ``actives/<mod>.smi.zip`` next to this
	file, where ``<mod>`` is the base name of ``model_name`` without its
	extension. Each line of the archived ``<mod>.smi`` is split on tabs and
	field 1 is handed to ``calcFingerprints``.

	Relies on the module-level names ``querymatrix``, ``smiles`` and
	``calcFingerprints``, which are defined outside this function.

	Args:
		model_name: path of the model file.

	Returns:
		``None`` if the archive cannot be opened (IOError), an empty list if
		no active produced a fingerprint, otherwise one row per entry of
		``querymatrix``: ``[best_similarity, mod] + <fields of the best
		matching active> + [smiles[i]]``.
	"""
	sep = os.sep
	mod = model_name.split(sep)[-1].split('.')[0]
	archive = os.path.dirname(os.path.abspath(__file__)) + sep + 'actives' + sep + mod + '.smi.zip'
	try:
		with zipfile.ZipFile(archive, 'r') as zfile:
			# NOTE(review): under Python 3 ZipFile.open(...).read() returns bytes,
			# so split('\t') would need a decode first - confirm target interpreter.
			comps = [i.split('\t') for i in zfile.open(mod + '.smi', 'r').read().splitlines()]
	except IOError:
		return
	comps2 = []
	afp = []
	for comp in comps:
		# Best-effort: skip records that cannot be fingerprinted, but do not
		# swallow KeyboardInterrupt/SystemExit the way a bare except would.
		try:
			afp.append(calcFingerprints(comp[1]))
		except Exception:
			continue
		comps2.append(comp)
	if not afp:
		# Nothing to compare against; max() on an empty list would raise.
		return []
	ret = []
	for i, fp in enumerate(querymatrix):
		sims = DataStructs.BulkTanimotoSimilarity(fp, afp)
		idx = sims.index(max(sims))
		ret.append([sims[idx], mod] + comps2[idx] + [smiles[i]])
	return ret

#prediction runner 
Example #2
Source File: chemistry.py    From guacamol with MIT License 6 votes vote down vote up
def calculate_internal_pairwise_similarities(smiles_list: Collection[str]) -> np.array:
    """
    Build the all-against-all Tanimoto similarity matrix for one set of SMILES.

    Fingerprints are obtained through ``get_mols`` and ``get_fingerprints``.
    Only the lower triangle is computed; it is then mirrored into the upper one.

    Args:
        smiles_list: SMILES strings to compare against each other.

    Returns:
        Square symmetric matrix with one row/column per fingerprint.
        The diagonal is left at zero.
    """
    n_smiles = len(smiles_list)
    if n_smiles > 10000:
        logger.warning(f'Calculating internal similarity on large set of '
                       f'SMILES strings ({n_smiles})')

    fingerprints = get_fingerprints(get_mols(smiles_list))
    count = len(fingerprints)
    matrix = np.zeros((count, count))

    for row in range(1, count):
        # Similarities of fingerprint `row` to all fingerprints before it.
        row_sims = DataStructs.BulkTanimotoSimilarity(fingerprints[row], fingerprints[:row])
        matrix[row, :row] = row_sims
        matrix[:row, row] = row_sims

    return matrix
Example #3
Source File: chemistry.py    From guacamol with MIT License 6 votes vote down vote up
def highest_tanimoto_precalc_fps(mol, fps):
    """
    Return the largest Tanimoto similarity between ``mol`` and ``fps``.

    Args:
        mol: Rdkit molecule
        fps: precalculated ECFP4 bitvectors

    Returns:
        The maximum similarity, or 0 when ``fps`` is ``None`` or empty.
    """
    has_references = fps is not None and len(fps) > 0
    if not has_references:
        return 0

    # Morgan bit vector of the query: radius 2, 4096 bits.
    query_fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, 4096)
    return np.array(DataStructs.BulkTanimotoSimilarity(query_fp, fps)).max()
Example #4
Source File: predict.py    From PIDGINv3 with GNU General Public License v3.0 6 votes vote down vote up
def doPercentileCalculation(model_name):
	"""Compute a percentile for every molecule in ``rdkit_mols`` for one model.

	Uses the module-level ``rdkit_mols`` and ``options`` plus the helpers
	``getAdData`` and ``get_training_smiles`` defined outside this function.
	Column 0 of the data returned by ``getAdData`` is passed to
	``BulkTanimotoSimilarity`` as the reference set; columns 2, 3 and 5 are
	read as bias, standard deviation and scores.

	Returns:
		``(model_name, results)`` where ``results`` holds one
		``(percentile, critical_smiles)`` tuple per molecule;
		``critical_smiles`` is ``None`` unless ``options.ad_smiles`` is set.
	"""
	global rdkit_mols
	# Unzipping the training file is expensive, so only do it when SMILES are requested.
	if options.ad_smiles:
		smiles = get_training_smiles(model_name)
	ad_data = getAdData(model_name)

	def percentile_for(rdkit_mol):
		similarity = DataStructs.BulkTanimotoSimilarity(rdkit_mol, ad_data[:,0])
		bias = ad_data[:,2].astype(float)
		std_dev = ad_data[:,3].astype(float)
		scores = ad_data[:,5].astype(float)
		weights = similarity / (bias * std_dev)
		percentile = percentileofscore(scores, weights.max())
		if not options.ad_smiles:
			return percentile, None
		# SMILES of the training compound carrying the largest weight.
		return percentile, smiles[np.argmax(weights)]

	results = []
	for mol in rdkit_mols:
		results.append(percentile_for(mol))
	return model_name, results

#prediction runner for percentile calculation 
Example #5
Source File: splitters.py    From deepchem with MIT License 5 votes vote down vote up
def ClusterFps(fps, cutoff=0.2):
  """Cluster fingerprints with Butina clustering on Tanimoto distances.

  Builds the flat lower-triangle list of ``1 - similarity`` values and hands
  it to ``Butina.ClusterData`` with ``isDistData=True``.

  Args:
    fps: sequence of fingerprints.
    cutoff: threshold forwarded to ``Butina.ClusterData``.

  Returns:
    Whatever ``Butina.ClusterData`` returns for the distance data.
  """
  # (ytz): this is directly copypasta'd from Greg Landrum's clustering example.
  from rdkit import DataStructs
  from rdkit.ML.Cluster import Butina

  n_fingerprints = len(fps)
  distances = []
  for idx in range(1, n_fingerprints):
    row = DataStructs.BulkTanimotoSimilarity(fps[idx], fps[:idx])
    distances += [1 - sim for sim in row]
  return Butina.ClusterData(distances, n_fingerprints, cutoff, isDistData=True)
Example #6
Source File: splitters.py    From deepchem with MIT License 5 votes vote down vote up
def ClusterFps(fps, cutoff=0.2):
  """Run Butina clustering over the pairwise Tanimoto distances of ``fps``.

  Args:
    fps: sequence of fingerprints.
    cutoff: threshold forwarded to ``Butina.ClusterData``.

  Returns:
    The result of ``Butina.ClusterData`` on the flattened lower-triangle
    distance list.
  """
  # (ytz): this is directly copypasta'd from Greg Landrum's clustering example.
  total = len(fps)
  # Flattened lower triangle: fingerprint i against every fingerprint before it.
  lower_triangle = [
      1 - sim
      for i in range(1, total)
      for sim in DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
  ]
  return Butina.ClusterData(lower_triangle, total, cutoff, isDistData=True)
Example #7
Source File: splitters.py    From PADME with MIT License 5 votes vote down vote up
def ClusterFps(fps, cutoff=0.2):
  """Group fingerprints into clusters using Butina on Tanimoto distances.

  Args:
    fps: sequence of fingerprints.
    cutoff: threshold forwarded to ``Butina.ClusterData``.

  Returns:
    The clustering produced by ``Butina.ClusterData``.
  """
  # (ytz): this is directly copypasta'd from Greg Landrum's clustering example.
  count = len(fps)
  pair_distances = []
  for current in range(1, count):
    earlier = fps[:current]
    similarities = DataStructs.BulkTanimotoSimilarity(fps[current], earlier)
    # Convert each similarity into a distance before storing it.
    pair_distances.extend(1 - value for value in similarities)
  clusters = Butina.ClusterData(pair_distances, count, cutoff, isDistData=True)
  return clusters
Example #8
Source File: dist_metrics.py    From AMPL with MIT License 5 votes vote down vote up
def tanimoto_worker(k, fps):
    """Return Tanimoto distances from fingerprint ``k`` to every later fingerprint.

    Args:
        k: index into ``fps`` of the query fingerprint.
        fps: sequence of fingerprints.

    Returns:
        Tuple ``(distances, 0)``: a NumPy array of ``1 - similarity`` for
        ``fps[k + 1:]`` and a constant 0 as second element.
    """
    # pylint: disable=no-member
    similarities = DataStructs.BulkTanimotoSimilarity(fps[k], fps[(k + 1):])
    distances = np.array([1. - value for value in similarities])
    return distances, 0
Example #9
Source File: dist_metrics.py    From AMPL with MIT License 5 votes vote down vote up
def tanimoto_single(fp, fps):
    """Return Tanimoto distances from one fingerprint to a list of fingerprints.

    Args:
        fp: query fingerprint.
        fps: fingerprints to compare against.

    Returns:
        Tuple ``(distances, 0)``: a NumPy array of ``1 - similarity`` per
        entry of ``fps`` and a constant 0 as second element.
    """
    # pylint: disable=no-member
    distances = []
    for similarity in DataStructs.BulkTanimotoSimilarity(fp, fps):
        distances.append(1. - similarity)
    return np.array(distances), 0
Example #10
Source File: molecular_metrics.py    From MolGAN with MIT License 5 votes vote down vote up
def __compute_diversity(mol, fps):
    """Return the mean Tanimoto distance between ``mol`` and the fingerprints ``fps``.

    ``mol`` is fingerprinted as a Morgan bit vector (radius 4, 2048 bits).
    """
    query_fp = Chem.rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, 4, nBits=2048)
    distances = DataStructs.BulkTanimotoSimilarity(query_fp, fps, returnDistance=True)
    return np.mean(distances)
Example #11
Source File: molecular_metrics.py    From graph-nvp with MIT License 5 votes vote down vote up
def __compute_diversity(mol, fps):
    """Score a molecule by its average Tanimoto distance to ``fps``.

    Args:
        mol: molecule to fingerprint (Morgan bit vector, radius 4, 2048 bits).
        fps: fingerprints to measure the distance against.

    Returns:
        Mean of the distances reported by ``BulkTanimotoSimilarity`` with
        ``returnDistance=True``.
    """
    mol_fp = Chem.rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, 4, nBits=2048)
    return np.mean(
        DataStructs.BulkTanimotoSimilarity(mol_fp, fps, returnDistance=True)
    )
Example #12
Source File: chemistry.py    From guacamol with MIT License 5 votes vote down vote up
def calculate_pairwise_similarities(smiles_list1: List[str], smiles_list2: List[str]) -> np.array:
    """
    Compare every molecule of the first SMILES container with every molecule of the second.

    Fingerprints are obtained through ``get_mols`` and ``get_fingerprints``.

    Args:
        smiles_list1: SMILES strings forming the rows of the result.
        smiles_list2: SMILES strings forming the columns of the result.

    Returns:
        np.array of Tanimoto similarities, one row per fingerprint of the
        first set and one column per fingerprint of the second.
    """
    size1, size2 = len(smiles_list1), len(smiles_list2)
    if size1 > 10000 or size2 > 10000:
        logger.warning(f'Calculating similarity between large sets of '
                       f'SMILES strings ({size1} x {size2})')

    row_fps = get_fingerprints(get_mols(smiles_list1))
    col_fps = get_fingerprints(get_mols(smiles_list2))

    return np.array(
        [DataStructs.BulkTanimotoSimilarity(row_fp, col_fps) for row_fp in row_fps]
    )
Example #13
Source File: mol_metrics.py    From ORGAN with GNU General Public License v2.0 5 votes vote down vote up
def diversity(smile, fps):
    """Return a diversity score in [0, 1] for one SMILES against ``fps``.

    The molecule is fingerprinted as a Morgan bit vector (radius 4, 2048
    bits). Its mean Tanimoto distance to ``fps`` is passed through ``remap``
    with the bounds 0.9 and 0.945, and the result is clipped to [0, 1].

    Args:
        smile: SMILES string of the molecule to score.
        fps: fingerprints to measure the distance against.
    """
    low_rand_dst, mean_div_dst = 0.9, 0.945
    reference_fp = Chem.GetMorganFingerprintAsBitVect(
        Chem.MolFromSmiles(smile), 4, nBits=2048)
    distances = DataStructs.BulkTanimotoSimilarity(
        reference_fp, fps, returnDistance=True)
    rescaled = remap(np.mean(np.array(distances)), low_rand_dst, mean_div_dst)
    return np.clip(rescaled, 0.0, 1.0)

#============== 
Example #14
Source File: mol_distance.py    From ORGAN with GNU General Public License v2.0 5 votes vote down vote up
def tanimoto_1d(fps):
    """Return the pairwise Tanimoto distances of ``fps`` as one flat list.

    For each index ``i`` from 1 upwards, the distances from ``fps[i]`` to all
    fingerprints before it are appended in order (lower triangle, row by row).
    """
    return [
        distance
        for i in range(1, len(fps))
        for distance in DataStructs.BulkTanimotoSimilarity(
            fps[i], fps[:i], returnDistance=True)
    ]
Example #15
Source File: metric.py    From DrugEx with MIT License 5 votes vote down vote up
def diversity(fake_path, real_path=None, is_active=False):
    """Measure molecular diversity from Tanimoto similarity on ECFP6 fingerprints.

    The molecules in ``fake_path`` are kept when ``SCORE`` exceeds 0.5
    (``is_active`` true) or 0 (otherwise), then de-duplicated on
    ``CANONICAL_SMILES``. Each remaining molecule receives a ``DIST`` value:

    * with ``real_path`` (inter-diversity): one minus the minimum similarity
      to the reference molecules, which are kept when ``PCHEMBL_VALUE`` is at
      least 6.5 (``is_active`` true) or 0 (otherwise);
    * without ``real_path`` (intra-diversity): one minus the mean similarity
      to the filtered molecules of ``fake_path`` themselves.

    Arguments:
        fake_path (str): path of the table with the molecules to evaluate.
        real_path (str, optional): path of the table with the reference
            molecules. (Default: None)
        is_active (bool, optional): apply the stricter cut-offs described
            above. (Default: False)

    Returns:
        df (DataFrame): the filtered, de-duplicated table read from
            ``fake_path`` with the added ``DIST`` column.
    """
    def _fingerprints(frame):
        # Count-based Morgan fingerprint (radius 3) for each row's SMILES.
        return [
            AllChem.GetMorganFingerprint(Chem.MolFromSmiles(row.CANONICAL_SMILES), 3)
            for _, row in frame.iterrows()
        ]

    generated = pd.read_table(fake_path)
    score_cutoff = 0.5 if is_active else 0
    generated = generated[generated.SCORE > score_cutoff]
    generated = generated.drop_duplicates(subset='CANONICAL_SMILES')
    generated_fps = _fingerprints(generated)

    if real_path:
        reference = pd.read_table(real_path)
        activity_cutoff = 6.5 if is_active else 0
        reference = reference[reference.PCHEMBL_VALUE >= activity_cutoff]
        reference_fps = _fingerprints(reference)
        reduce_fn = np.min
    else:
        reference_fps = generated_fps
        reduce_fn = np.mean

    reduced = [
        reduce_fn(DataStructs.BulkTanimotoSimilarity(fp, reference_fps))
        for fp in generated_fps
    ]
    generated['DIST'] = 1 - np.array(reduced)
    return generated