Python rdkit.DataStructs.DiceSimilarity() Examples

The following are 2 code examples of rdkit.DataStructs.DiceSimilarity(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module rdkit.DataStructs , or try the search function .
Example #1
Source File: similarity.py    From chemprop with MIT License 5 votes vote down vote up
def morgan_similarity(smiles_1: List[str], smiles_2: List[str], radius: int, sample_rate: float):
    """
    Determines the similarity between the morgan fingerprints of two lists of smiles strings.

    :param smiles_1: A list of smiles strings.
    :param smiles_2: A list of smiles strings.
    :param radius: The radius of the morgan fingerprints.
    :param sample_rate: Rate at which to sample pairs of molecules for Morgan similarity (to reduce time).
    """
    # Compute similarities
    similarities = []
    num_pairs = len(smiles_1) * len(smiles_2)

    # Sample to improve speed
    if sample_rate < 1.0:
        sample_num_pairs = sample_rate * num_pairs
        sample_size = math.ceil(math.sqrt(sample_num_pairs))
        sample_smiles_1 = np.random.choice(smiles_1, size=sample_size, replace=True)
        sample_smiles_2 = np.random.choice(smiles_2, size=sample_size, replace=True)
    else:
        sample_smiles_1, sample_smiles_2 = smiles_1, smiles_2

    sample_num_pairs = len(sample_smiles_1) * len(sample_smiles_2)

    for smile_1, smile_2 in tqdm(product(sample_smiles_1, sample_smiles_2), total=sample_num_pairs):
        mol_1, mol_2 = Chem.MolFromSmiles(smile_1), Chem.MolFromSmiles(smile_2)
        fp_1, fp_2 = AllChem.GetMorganFingerprint(mol_1, radius), AllChem.GetMorganFingerprint(mol_2, radius)
        similarity = DataStructs.DiceSimilarity(fp_1, fp_2)
        similarities.append(similarity)
    similarities = np.array(similarities)

    # Print results
    print()
    print(f'Average dice similarity = {np.mean(similarities):.4f} +/- {np.std(similarities):.4f}')
    print(f'Minimum dice similarity = {np.min(similarities):.4f}')
    print(f'Maximum dice similarity = {np.max(similarities):.4f}')
    print()
    print('Percentiles for dice similarity')
    print(' | '.join([f'{i}% = {np.percentile(similarities, i):.4f}' for i in range(0, 101, 10)])) 
Example #2
Source File: splitters.py    From PADME with MIT License 4 votes vote down vote up
def split(self,
            dataset,
            seed=None,
            frac_train=.8,
            frac_valid=.1,
            frac_test=.1,
            log_every_n=None):
    """
    Splits internal compounds randomly into train/validation/test.
    """
    np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
    if seed is None:
      seed = random.randint(0, 2**30)
    np.random.seed(seed)

    num_datapoints = len(dataset)

    train_cutoff = int(frac_train * num_datapoints)
    valid_cutoff = int((frac_train + frac_valid) * num_datapoints)

    num_train = train_cutoff
    num_valid = valid_cutoff - train_cutoff
    num_test = num_datapoints - valid_cutoff

    all_mols = []
    for ind, smiles in enumerate(dataset.ids):
      all_mols.append(Chem.MolFromSmiles(smiles))

    fps = [AllChem.GetMorganFingerprintAsBitVect(x, 2, 1024) for x in all_mols]

    def distance(i, j):
      return 1 - DataStructs.DiceSimilarity(fps[i], fps[j])

    picker = MaxMinPicker()
    testIndices = picker.LazyPick(
        distFunc=distance,
        poolSize=num_datapoints,
        pickSize=num_test,
        seed=seed)

    validTestIndices = picker.LazyPick(
        distFunc=distance,
        poolSize=num_datapoints,
        pickSize=num_valid + num_test,
        firstPicks=testIndices,
        seed=seed)

    allSet = set(range(num_datapoints))
    testSet = set(testIndices)
    validSet = set(validTestIndices) - testSet

    trainSet = allSet - testSet - validSet

    assert len(testSet & validSet) == 0
    assert len(testSet & trainSet) == 0
    assert len(validSet & trainSet) == 0
    assert (validSet | trainSet | testSet) == allSet

    return sorted(list(trainSet)), sorted(list(validSet)), sorted(list(testSet))