Python rdkit.DataStructs.DiceSimilarity() Examples
The following are 2 code examples of rdkit.DataStructs.DiceSimilarity().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module rdkit.DataStructs, or try the search function.
Example #1
Source File: similarity.py From chemprop with MIT License | 5 votes |
def morgan_similarity(smiles_1: List[str], smiles_2: List[str], radius: int, sample_rate: float): """ Determines the similarity between the morgan fingerprints of two lists of smiles strings. :param smiles_1: A list of smiles strings. :param smiles_2: A list of smiles strings. :param radius: The radius of the morgan fingerprints. :param sample_rate: Rate at which to sample pairs of molecules for Morgan similarity (to reduce time). """ # Compute similarities similarities = [] num_pairs = len(smiles_1) * len(smiles_2) # Sample to improve speed if sample_rate < 1.0: sample_num_pairs = sample_rate * num_pairs sample_size = math.ceil(math.sqrt(sample_num_pairs)) sample_smiles_1 = np.random.choice(smiles_1, size=sample_size, replace=True) sample_smiles_2 = np.random.choice(smiles_2, size=sample_size, replace=True) else: sample_smiles_1, sample_smiles_2 = smiles_1, smiles_2 sample_num_pairs = len(sample_smiles_1) * len(sample_smiles_2) for smile_1, smile_2 in tqdm(product(sample_smiles_1, sample_smiles_2), total=sample_num_pairs): mol_1, mol_2 = Chem.MolFromSmiles(smile_1), Chem.MolFromSmiles(smile_2) fp_1, fp_2 = AllChem.GetMorganFingerprint(mol_1, radius), AllChem.GetMorganFingerprint(mol_2, radius) similarity = DataStructs.DiceSimilarity(fp_1, fp_2) similarities.append(similarity) similarities = np.array(similarities) # Print results print() print(f'Average dice similarity = {np.mean(similarities):.4f} +/- {np.std(similarities):.4f}') print(f'Minimum dice similarity = {np.min(similarities):.4f}') print(f'Maximum dice similarity = {np.max(similarities):.4f}') print() print('Percentiles for dice similarity') print(' | '.join([f'{i}% = {np.percentile(similarities, i):.4f}' for i in range(0, 101, 10)]))
Example #2
Source File: splitters.py From PADME with MIT License | 4 votes |
def split(self, dataset, seed=None, frac_train=.8, frac_valid=.1, frac_test=.1, log_every_n=None): """ Splits internal compounds randomly into train/validation/test. """ np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.) if seed is None: seed = random.randint(0, 2**30) np.random.seed(seed) num_datapoints = len(dataset) train_cutoff = int(frac_train * num_datapoints) valid_cutoff = int((frac_train + frac_valid) * num_datapoints) num_train = train_cutoff num_valid = valid_cutoff - train_cutoff num_test = num_datapoints - valid_cutoff all_mols = [] for ind, smiles in enumerate(dataset.ids): all_mols.append(Chem.MolFromSmiles(smiles)) fps = [AllChem.GetMorganFingerprintAsBitVect(x, 2, 1024) for x in all_mols] def distance(i, j): return 1 - DataStructs.DiceSimilarity(fps[i], fps[j]) picker = MaxMinPicker() testIndices = picker.LazyPick( distFunc=distance, poolSize=num_datapoints, pickSize=num_test, seed=seed) validTestIndices = picker.LazyPick( distFunc=distance, poolSize=num_datapoints, pickSize=num_valid + num_test, firstPicks=testIndices, seed=seed) allSet = set(range(num_datapoints)) testSet = set(testIndices) validSet = set(validTestIndices) - testSet trainSet = allSet - testSet - validSet assert len(testSet & validSet) == 0 assert len(testSet & trainSet) == 0 assert len(validSet & trainSet) == 0 assert (validSet | trainSet | testSet) == allSet return sorted(list(trainSet)), sorted(list(validSet)), sorted(list(testSet))