Python Examples of rdkit.Chem.AllChem.GetMorganFingerprint

Source File: chemTopicModel.py From CheTo with BSD 3-Clause "New" or "Revised" License

6 votes

def _generateFPs(mol,fragmentMethod='Morgan'):
    aBits={}
    fp=None
    # circular Morgan fingerprint fragmentation, we use a simple invariant than ususal here
    if fragmentMethod=='Morgan':
        tmp={}
        fp = AllChem.GetMorganFingerprint(mol,radius=2,invariants=utilsFP.generateAtomInvariant(mol),bitInfo=tmp)
        aBits = utilsFP.getMorganEnvironment(mol, tmp, fp=fp, minRad=2)
        fp = fp.GetNonzeroElements()
    # path-based RDKit fingerprint fragmentation
    elif fragmentMethod=='RDK':
        fp = AllChem.UnfoldedRDKFingerprintCountBased(mol,maxPath=5,minPath=3,bitInfo=aBits)
        fp = fp.GetNonzeroElements()
    # get the final BRICS fragmentation (= smallest possible BRICS fragments of a molecule)
    elif fragmentMethod=='Brics':
        fragMol=BRICS.BreakBRICSBonds(mol)
        propSmi = _prepBRICSSmiles(fragMol)
        fp=Counter(propSmi.split('.'))
    else:
        print("Unknown fragment method")
    return fp, aBits

# this function is not part of the class due to parallelisation
# generate the fragments of a molecule, return a map with moleculeID and fragment dict

Source File: mol_utils.py From GLN with MIT License

6 votes

def new_mol(self, name):
        """Parse ``name`` into a MolGraph, or return None when parsing fails.

        ``name`` is read as SMILES when ``self.sanitized`` is truthy and as
        SMARTS otherwise. When ``self.fp_degree`` is positive, the graph also
        receives the non-zero Morgan identifiers (``fingerprints``) and the
        bit-info dict (``fp_info``; None unless ``self.fp_info`` is truthy).
        """
        parse = Chem.MolFromSmiles if self.sanitized else Chem.MolFromSmarts
        mol = parse(name)
        if mol is None:
            return None

        graph = MolGraph(name, self.sanitized, mol=mol)
        if self.fp_degree > 0:
            bit_info = {} if self.fp_info else None
            sparse_fp = AllChem.GetMorganFingerprint(
                mol, self.fp_degree, bitInfo=bit_info, invariants=self._get_inv(mol))
            graph.fingerprints = list(sparse_fp.GetNonzeroElements())
            graph.fp_info = bit_info
        return graph

Source File: vectorizers.py From Deep-Drug-Coder with MIT License

6 votes

def transform_mol(self, mol, misses=False):
        """Project the Morgan counts of ``mol`` onto the fitted key index.

            :parameter mol: the RDKit molecule to be transformed
            :parameter misses: when true, also return how many of the molecule's
                identifiers are absent from the fitted keys
            :returns: a dense array of length ``self.dims``, or the tuple
                ``(array, number_of_misses)`` when ``misses`` is true
        """
        assert type(self.keys) is np.ndarray, "keys are not defined or is not an np.array, has the .fit(mols) function been used?"

        # Sparse identifier -> count mapping for this molecule
        counts = AllChem.GetMorganFingerprint(mol, self.radius).GetNonzeroElements()

        dense = np.zeros((self.dims,))
        n_missing = 0
        for identifier, count in counts.items():
            if identifier not in self.keys:
                n_missing = n_missing + 1
                continue
            # boolean mask selects the slot belonging to this identifier
            dense[self.keys == identifier] = count

        return (dense, n_missing) if misses else dense

Source File: mol_metrics.py From ORGAN with GNU General Public License v2.0

6 votes

def NP_score(smile):
    """Return a score in [0, 1] for the SMILES string ``smile``.

    The ``NP_model`` contributions of every non-zero radius-2 Morgan identifier
    are added up (each identifier once), divided by the atom count, compressed
    logarithmically beyond +/-4, passed through ``remap(score, -3, 1)`` and
    clipped to [0, 1].
    """
    mol = Chem.MolFromSmiles(smile)
    identifiers = Chem.GetMorganFingerprint(mol, 2).GetNonzeroElements()

    # Accumulate in an explicit loop so the floating-point summation order
    # stays exactly left-to-right.
    total = 0.
    for identifier in identifiers:
        total += NP_model.get(identifier, 0)
    score = total / float(mol.GetNumAtoms())

    # Dampen extreme values for exotic molecules
    if score > 4:
        score = 4. + math.log10(score - 4. + 1.)
    elif score < -4:
        score = -4. - math.log10(-4. - score + 1.)

    return np.clip(remap(score, -3, 1), 0.0, 1.0)

Source File: fingerprint.py From PyBioMed with BSD 3-Clause "New" or "Revised" License

6 votes

def CalculateMorganFingerprint(mol, radius=2):
    """
    #################################################################
    Calculate the sparse Morgan fingerprint of a molecule.

    Usage:

        result=CalculateMorganFingerprint(mol)

        Input: mol is a molecule object.

        radius is the Morgan radius (default 2).

        Output: result is a 3-tuple: the length reported by the
        fingerprint, the dict of its non-zero elements, and the
        fingerprint object itself (usable for similarity calculations).
    #################################################################
    """
    sparse_fp = AllChem.GetMorganFingerprint(mol, radius)
    length = sparse_fp.GetLength()
    nonzero = sparse_fp.GetNonzeroElements()
    return length, nonzero, sparse_fp

Source File: scoring_functions.py From GB-GA with MIT License

5 votes

def get_ECFP4(mol):
    """Return the sparse Morgan fingerprint of ``mol`` computed with radius 2."""
    fingerprint = AllChem.GetMorganFingerprint(mol, 2)
    return fingerprint

Source File: drd2_scorer.py From iclr19-graph2graph with MIT License

5 votes

def fingerprints_from_mol(mol):
    """Fold the radius-3 feature-based Morgan counts of ``mol`` into 2048 bins.

    Returns an int32 array of shape (1, 2048); identifiers that land in the
    same bin (identifier modulo 2048) have their counts added together.
    """
    sparse_fp = AllChem.GetMorganFingerprint(mol, 3, useCounts=True, useFeatures=True)
    n_bins = 2048
    folded = np.zeros((1, n_bins), np.int32)
    for identifier, count in sparse_fp.GetNonzeroElements().items():
        folded[0, identifier % n_bins] += int(count)
    return folded

Source File: scoring_functions.py From GB-GA with MIT License

5 votes

def get_ECFP6(mol):
    """Return the sparse Morgan fingerprint of ``mol`` computed with radius 3."""
    fingerprint = AllChem.GetMorganFingerprint(mol, 3)
    return fingerprint

Source File: scoring_functions.py From GB-GA with MIT License

5 votes

def get_FCFP4(mol):
    """Return the feature-based sparse Morgan fingerprint of ``mol`` (radius 2)."""
    fingerprint = AllChem.GetMorganFingerprint(mol, 2, useFeatures=True)
    return fingerprint

Source File: scoring_functions.py From GB-GA with MIT License

5 votes

def get_FCFP6(mol):
    """Return the feature-based sparse Morgan fingerprint of ``mol`` (radius 3)."""
    fingerprint = AllChem.GetMorganFingerprint(mol, 3, useFeatures=True)
    return fingerprint

Source File: fingerprints.py From guacamol with MIT License

5 votes

def get_ECFP4(self, mol: Mol):
        """Return the sparse Morgan fingerprint of ``mol`` computed with radius 2."""
        fingerprint = AllChem.GetMorganFingerprint(mol, 2)
        return fingerprint

Source File: fingerprints.py From guacamol with MIT License

5 votes

def get_ECFP6(self, mol: Mol):
        """Return the sparse Morgan fingerprint of ``mol`` computed with radius 3."""
        fingerprint = AllChem.GetMorganFingerprint(mol, 3)
        return fingerprint

Source File: fingerprints.py From guacamol with MIT License

5 votes

def get_FCFP4(self, mol: Mol):
        """Return the feature-based sparse Morgan fingerprint of ``mol`` (radius 2)."""
        fingerprint = AllChem.GetMorganFingerprint(mol, 2, useFeatures=True)
        return fingerprint

Source File: fingerprints.py From guacamol with MIT License

5 votes

def get_FCFP6(self, mol: Mol):
        """Return the feature-based sparse Morgan fingerprint of ``mol`` (radius 3)."""
        fingerprint = AllChem.GetMorganFingerprint(mol, 3, useFeatures=True)
        return fingerprint

Source File: metric.py From DrugEx with MIT License

5 votes

def diversity(fake_path, real_path=None, is_active=False):
    """ Score molecules by one minus Tanimoto similarity on radius-3 Morgan fingerprints.

    Arguments:
        fake_path (str): path of a tab-separated table with CANONICAL_SMILES and
            SCORE columns holding the molecules to be scored.

        real_path (str, optional): path of a tab-separated reference table with
            CANONICAL_SMILES and PCHEMBL_VALUE columns. When given, every scored
            molecule is compared with the reference molecules and the minimum
            similarity is used; otherwise it is compared with the scored set
            itself and the mean similarity is used.
        is_active (bool, optional): when True only rows with SCORE > 0.5 (and
            reference rows with PCHEMBL_VALUE >= 6.5) are kept; when False the
            thresholds are SCORE > 0 and PCHEMBL_VALUE >= 0. (Default: False)

    Returns:
        df (DataFrame): the filtered rows of ``fake_path``, de-duplicated on
            CANONICAL_SMILES, with an added DIST column.

    """
    score_cutoff = 0.5 if is_active else 0
    fake = pd.read_table(fake_path)
    fake = fake[fake.SCORE > score_cutoff]
    fake = fake.drop_duplicates(subset='CANONICAL_SMILES')

    fake_fps = [
        AllChem.GetMorganFingerprint(Chem.MolFromSmiles(row.CANONICAL_SMILES), 3)
        for _, row in fake.iterrows()
    ]

    if real_path:
        activity_cutoff = 6.5 if is_active else 0
        real = pd.read_table(real_path)
        real = real[real.PCHEMBL_VALUE >= activity_cutoff]
        real_fps = [
            AllChem.GetMorganFingerprint(Chem.MolFromSmiles(row.CANONICAL_SMILES), 3)
            for _, row in real.iterrows()
        ]
        reduce_similarity = np.min
    else:
        # no reference set: compare the scored molecules among themselves
        real_fps = fake_fps
        reduce_similarity = np.mean

    similarities = [
        reduce_similarity(DataStructs.BulkTanimotoSimilarity(fp, real_fps))
        for fp in fake_fps
    ]
    fake['DIST'] = 1 - np.array(similarities)
    return fake

Source File: similarity.py From chemprop with MIT License

5 votes

def morgan_similarity(smiles_1: List[str], smiles_2: List[str], radius: int, sample_rate: float):
    """
    Prints summary statistics of the Dice similarity between the Morgan fingerprints of two lists of smiles strings.

    Every molecule of the first (possibly sampled) list is paired with every molecule of the second one.

    :param smiles_1: A list of smiles strings.
    :param smiles_2: A list of smiles strings.
    :param radius: The radius of the morgan fingerprints.
    :param sample_rate: Rate at which to sample pairs of molecules for Morgan similarity (to reduce time).
                        Below 1.0, both lists are resampled with replacement to ceil(sqrt(rate * pairs)) entries.
    """
    num_pairs = len(smiles_1) * len(smiles_2)

    if sample_rate < 1.0:
        # Draw equally many molecules from each side so that the product of the
        # two samples approximates the requested number of pairs.
        per_side = math.ceil(math.sqrt(sample_rate * num_pairs))
        sample_smiles_1 = np.random.choice(smiles_1, size=per_side, replace=True)
        sample_smiles_2 = np.random.choice(smiles_2, size=per_side, replace=True)
    else:
        sample_smiles_1 = smiles_1
        sample_smiles_2 = smiles_2

    total_pairs = len(sample_smiles_1) * len(sample_smiles_2)

    collected = []
    for left, right in tqdm(product(sample_smiles_1, sample_smiles_2), total=total_pairs):
        left_mol = Chem.MolFromSmiles(left)
        right_mol = Chem.MolFromSmiles(right)
        left_fp = AllChem.GetMorganFingerprint(left_mol, radius)
        right_fp = AllChem.GetMorganFingerprint(right_mol, radius)
        collected.append(DataStructs.DiceSimilarity(left_fp, right_fp))
    similarities = np.array(collected)

    # Report
    percentile_report = ' | '.join([f'{i}% = {np.percentile(similarities, i):.4f}' for i in range(0, 101, 10)])
    print()
    print(f'Average dice similarity = {np.mean(similarities):.4f} +/- {np.std(similarities):.4f}')
    print(f'Minimum dice similarity = {np.min(similarities):.4f}')
    print(f'Maximum dice similarity = {np.max(similarities):.4f}')
    print()
    print('Percentiles for dice similarity')
    print(percentile_report)

Source File: 2_to_fingerprint.py From mhfp with MIT License

5 votes

def convert(subset):
    """Write three fingerprint files for the ChEMBL subset ``subset``.

    Reads the first space-separated column of
    ``/cluster/chembl/chembl.<subset>.smi`` as SMILES and writes, one
    comma-separated line per parsable molecule, the files ``.mhfp6``
    (``MHFPEncoder.encode_mol``), ``.mhecfp4`` (``MHFPEncoder.from_sparse_array``
    over the radius-2 Morgan identifiers) and ``.ecfp4`` (2048-bit radius-2
    Morgan bit vector). Rows whose SMILES cannot be parsed are skipped.
    """
    base_path = '/cluster/chembl/chembl.' + str(subset)
    actives = pd.read_csv(base_path + '.smi', sep=' ', usecols=[0], header=None)

    mh = MHFPEncoder()

    def encode_mhfp6(mol):
        return mh.encode_mol(mol)

    def encode_mhecfp4(mol):
        identifiers = list(AllChem.GetMorganFingerprint(mol, 2).GetNonzeroElements())
        return mh.from_sparse_array(identifiers)

    def encode_ecfp4(mol):
        return AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)

    # Output files are produced one after another, in this order.
    outputs = (
        ('.mhfp6', encode_mhfp6),
        ('.mhecfp4', encode_mhecfp4),
        ('.ecfp4', encode_ecfp4),
    )
    for suffix, encode in outputs:
        with open(base_path + suffix, 'w+') as f:
            for _, row in actives.iterrows():
                mol = AllChem.MolFromSmiles(row[0])
                if not mol:
                    continue
                f.write(','.join(map(str, encode(mol))) + '\n')

Source File: drd2_scorer.py From hgraph2graph with MIT License

5 votes

def fingerprints_from_mol(mol):
    """Fold the radius-3 feature-based Morgan counts of ``mol`` into 2048 bins.

    Returns an int32 array of shape (1, 2048); identifiers that land in the
    same bin (identifier modulo 2048) have their counts added together.
    """
    sparse_fp = AllChem.GetMorganFingerprint(mol, 3, useCounts=True, useFeatures=True)
    n_bins = 2048
    folded = np.zeros((1, n_bins), np.int32)
    for identifier, count in sparse_fp.GetNonzeroElements().items():
        folded[0, identifier % n_bins] += int(count)
    return folded

Source File: fingerprint.py From PyBioMed with BSD 3-Clause "New" or "Revised" License

5 votes

def CalculateFCFP6Fingerprint(mol, radius=3, nBits=1024):
    """
    #################################################################
    Calculate FCFP6 (feature-based Morgan fingerprint, radius 3)

    Usage:

        result=CalculateFCFP6Fingerprint(mol)

        Input: mol is a molecule object.

        radius is the Morgan radius (default 3).

        nBits is the length of the folded bit vector (default 1024).

        Output: result is a 3-tuple: the folded bit vector as a tuple,
        the dict of non-zero elements of the sparse fingerprint, and
        the sparse fingerprint object itself (usable for similarity
        calculations).
    #################################################################
    """
    sparse_fp = AllChem.GetMorganFingerprint(mol, radius, useFeatures=True)

    bit_vector = AllChem.GetMorganFingerprintAsBitVect(
        mol, radius, nBits, useFeatures=True
    )
    bits = tuple(bit_vector)

    return bits, sparse_fp.GetNonzeroElements(), sparse_fp


################################################################

Source File: fingerprint.py From PyBioMed with BSD 3-Clause "New" or "Revised" License

5 votes

def CalculateFCFP2Fingerprint(mol, radius=1, nBits=1024):
    """
    #################################################################
    Calculate FCFP2 (feature-based Morgan fingerprint, radius 1)

    Usage:

        result=CalculateFCFP2Fingerprint(mol)

        Input: mol is a molecule object.

        radius is the Morgan radius (default 1).

        nBits is the length of the folded bit vector (default 1024).

        Output: result is a 3-tuple: the folded bit vector as a tuple,
        the dict of non-zero elements of the sparse fingerprint, and
        the sparse fingerprint object itself (usable for similarity
        calculations).
    #################################################################
    """
    sparse_fp = AllChem.GetMorganFingerprint(mol, radius, useFeatures=True)

    bit_vector = AllChem.GetMorganFingerprintAsBitVect(
        mol, radius, nBits, useFeatures=True
    )
    bits = tuple(bit_vector)

    return bits, sparse_fp.GetNonzeroElements(), sparse_fp

Source File: fingerprint.py From PyBioMed with BSD 3-Clause "New" or "Revised" License

5 votes

def CalculateECFP6Fingerprint(mol, radius=3, nBits=1024):
    """
    #################################################################
    Calculate ECFP6 (Morgan fingerprint, radius 3)

    Usage:

        result=CalculateECFP6Fingerprint(mol)

        Input: mol is a molecule object.

        radius is the Morgan radius (default 3).

        nBits is the length of the folded bit vector. It defaults to
        1024, the value that used to be hard-coded, and mirrors the
        nBits parameter of the FCFP functions.

        Output: result is a 3-tuple: the folded bit vector as a tuple,
        the dict of non-zero elements of the sparse fingerprint, and
        the sparse fingerprint object itself (usable for similarity
        calculations).
    #################################################################
    """
    res = AllChem.GetMorganFingerprint(mol, radius)

    fp = tuple(AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits))

    return fp, res.GetNonzeroElements(), res

Source File: fingerprint.py From PyBioMed with BSD 3-Clause "New" or "Revised" License

5 votes

def CalculateECFP4Fingerprint(mol, radius=2, nBits=1024):
    """
    #################################################################
    Calculate ECFP4 (Morgan fingerprint, radius 2)

    Usage:

        result=CalculateECFP4Fingerprint(mol)

        Input: mol is a molecule object.

        radius is the Morgan radius (default 2).

        nBits is the length of the folded bit vector. It defaults to
        1024, the value that used to be hard-coded, and mirrors the
        nBits parameter of the FCFP functions.

        Output: result is a 3-tuple: the folded bit vector as a tuple,
        the dict of non-zero elements of the sparse fingerprint, and
        the sparse fingerprint object itself (usable for similarity
        calculations).
    #################################################################
    """
    res = AllChem.GetMorganFingerprint(mol, radius)

    fp = tuple(AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits))

    return fp, res.GetNonzeroElements(), res

Source File: fingerprint.py From PyBioMed with BSD 3-Clause "New" or "Revised" License

5 votes

def CalculateECFP2Fingerprint(mol, radius=1, nBits=1024):
    """
    #################################################################
    Calculate ECFP2 (Morgan fingerprint, radius 1)

    Usage:

        result=CalculateECFP2Fingerprint(mol)

        Input: mol is a molecule object.

        radius is the Morgan radius (default 1).

        nBits is the length of the folded bit vector. It defaults to
        1024, the value that used to be hard-coded, and mirrors the
        nBits parameter of the FCFP functions.

        Output: result is a 3-tuple: the folded bit vector as a tuple,
        the dict of non-zero elements of the sparse fingerprint, and
        the sparse fingerprint object itself (usable for similarity
        calculations).
    #################################################################
    """
    res = AllChem.GetMorganFingerprint(mol, radius)

    fp = tuple(AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits))

    return fp, res.GetNonzeroElements(), res

Source File: features.py From mol2vec with BSD 3-Clause "New" or "Revised" License

5 votes

def mol2alt_sentence(mol, radius):
    """Return the alternating 'sentence' of Morgan identifiers for a molecule.

    The Morgan fingerprint is computed with bit information, and the identifier
    found for each (atom, radius) pair is recorded. The sentence lists, atom by
    atom in the order of the input mol object, the identifiers for radius 0,
    1, ... up to ``int(radius)``. Pairs without an identifier (and falsy
    identifiers) are left out, so the sentence can be shorter than
    ``n_atoms * (int(radius) + 1)``.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
    radius : float
        Fingerprint radius

    Returns
    -------
    list
        alternating sentence, every identifier converted to ``str``
    """
    radius_levels = list(range(int(radius) + 1))
    bit_info = {}
    # bit_info: identifier -> iterable of (atom_idx, radius) pairs
    AllChem.GetMorganFingerprint(mol, radius, bitInfo=bit_info)

    # {atom index: {radius: identifier or None}}
    per_atom = {
        atom.GetIdx(): dict.fromkeys(radius_levels)
        for atom in mol.GetAtoms()
    }
    for identifier, environments in bit_info.items():
        for atom_idx, env_radius in environments:
            per_atom[atom_idx][env_radius] = identifier

    # atom 0 radius 0, atom 0 radius 1, ..., atom 1 radius 0, ...
    return [
        str(per_atom[atom_idx][level])
        for atom_idx in per_atom
        for level in radius_levels
        if per_atom[atom_idx][level]
    ]

Source File: helpers.py From mol2vec with BSD 3-Clause "New" or "Revised" License

5 votes

def depict_identifier(mol, identifier, radius, useFeatures=False, **kwargs):
    """Depict the atoms that give rise to one Morgan fingerprint identifier.

    When the identifier does not occur in the fingerprint of ``mol``, the
    plain molecule is drawn instead.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule
    identifier : int or str
        Feature identifier from Morgan fingerprint (converted with ``int``)
    radius : int
        Radius of Morgan FP
    useFeatures : bool
        Use feature-based Morgan FP
    **kwargs
        Forwarded to ``depict_atoms`` / ``mol_to_svg``

    Returns
    -------
    IPython.display.SVG
    """
    wanted = int(identifier)
    bit_info = {}
    AllChem.GetMorganFingerprint(mol, radius, bitInfo=bit_info, useFeatures=useFeatures)

    if wanted not in bit_info:
        return mol_to_svg(mol, **kwargs)

    # split the (atom, radius) pairs into two parallel tuples
    atoms, radii = zip(*bit_info[wanted])
    return depict_atoms(mol, atoms, radii, **kwargs)

Source File: scscore.py From ASKCOS with Mozilla Public License 2.0

5 votes

def load_model(self, FP_len=1024, model_tag='1024bool'):
        """Load the pickled SCScore model and install the matching ``mol_to_fp``.

        :param FP_len: length of the folded fingerprint, stored as ``self.FP_len``
        :param model_tag: one of '1024bool', '1024uint8' or '2048bool'; any other
            value is logged and replaced by '1024bool'

        The model path is looked up in ``gc.SCScore_Prioritiaztion``. If the path
        contains 'uint8', ``self.mol_to_fp`` produces folded count fingerprints
        (uint8); otherwise it produces boolean bit vectors. Both variants read
        ``self.FP_rad``, which is not set here and must be set before
        ``mol_to_fp`` is called. A ``Pricer`` is also created and loaded.
        """
        self.FP_len = FP_len
        if model_tag not in ('1024bool', '1024uint8', '2048bool'):
            MyLogger.print_and_log(
                'Non-existent SCScore model requested: {}. Using "1024bool" model'.format(model_tag), scscore_prioritizer_loc, level=2)
            model_tag = '1024bool'
        filename = 'trained_model_path_'+model_tag
        model_path = gc.SCScore_Prioritiaztion[filename]
        # NOTE(review): pickle.load can execute arbitrary code; the model file
        # must come from a trusted location.
        with open(model_path, 'rb') as fid:
            self.vars = pickle.load(fid)
        if gc.DEBUG:
            MyLogger.print_and_log('Loaded synthetic complexity score prioritization model from {}'.format(
            model_path), scscore_prioritizer_loc)

        if 'uint8' in model_path:
            def mol_to_fp(mol):
                if mol is None:
                    # All-zero vector of the fingerprint length. The previous
                    # np.array((FP_len,), ...) built a 1-element array holding
                    # FP_len itself instead.
                    return np.zeros((self.FP_len,), dtype=np.uint8)
                fp = AllChem.GetMorganFingerprint(
                    mol, self.FP_rad, useChirality=True)  # sparse count vector
                fp_folded = np.zeros((self.FP_len,), dtype=np.uint8)
                for k, v in fp.GetNonzeroElements().items():
                    fp_folded[k % self.FP_len] += v
                return np.array(fp_folded)
        else:
            def mol_to_fp(mol):
                if mol is None:
                    return np.zeros((self.FP_len,), dtype=np.float32)
                # dtype=bool: the np.bool alias was removed from NumPy (1.24)
                return np.array(AllChem.GetMorganFingerprintAsBitVect(mol, self.FP_rad, nBits=self.FP_len,
                                                                      useChirality=True), dtype=bool)
        self.mol_to_fp = mol_to_fp

        self.pricer = Pricer()
        self.pricer.load()
        self._restored = True
        self._loaded = True

Source File: vectorizers.py From Deep-Drug-Coder with MIT License

5 votes

def fit(self, mols):
        """Collect the Morgan identifiers of ``mols`` and build the sorted key index.

        Sets ``self.keys`` (sorted array of all identifiers seen) and
        ``self.dims`` (the number of keys), which define the layout of the
        dense arrays.
        """
        seen = set()
        for mol in mols:
            nonzero = AllChem.GetMorganFingerprint(mol, self.radius).GetNonzeroElements()
            seen.update(nonzero)
        self.keys = np.array(sorted(seen))
        self.dims = len(self.keys)

Source File: scoring_functions.py From REINVENT with MIT License

5 votes

def fingerprints_from_mol(cls, mol):
        """Fold the radius-3 feature-based Morgan counts of ``mol`` into 2048 bins.

        Returns an int32 array of shape (1, 2048); identifiers that land in the
        same bin (identifier modulo 2048) have their counts added together.
        """
        sparse_fp = AllChem.GetMorganFingerprint(mol, 3, useCounts=True, useFeatures=True)
        n_bins = 2048
        folded = np.zeros((1, n_bins), np.int32)
        for identifier, count in sparse_fp.GetNonzeroElements().items():
            folded[0, identifier % n_bins] += int(count)
        return folded

Source File: scoring_functions.py From REINVENT with MIT License

5 votes

def __call__(self, smile):
        """Score ``smile`` by its Tanimoto similarity to the query fingerprint.

        The similarity is capped at ``self.k`` and divided by ``self.k``.
        SMILES that cannot be parsed score 0.0.
        """
        mol = Chem.MolFromSmiles(smile)
        if not mol:
            return 0.0
        candidate_fp = AllChem.GetMorganFingerprint(mol, 2, useCounts=True, useFeatures=True)
        similarity = DataStructs.TanimotoSimilarity(self.query_fp, candidate_fp)
        capped = min(similarity, self.k) / self.k
        return float(capped)

Source File: scoring_functions.py From REINVENT with MIT License

5 votes

def __init__(self):
        """Precompute the query fingerprint from ``self.query_structure``."""
        reference_mol = Chem.MolFromSmiles(self.query_structure)
        self.query_fp = AllChem.GetMorganFingerprint(
            reference_mol, 2, useCounts=True, useFeatures=True)

Python rdkit.Chem.AllChem.GetMorganFingerprint() Examples