Python rdkit.Chem.rdMolDescriptors.GetMorganFingerprintAsBitVect() Examples

The following are 10 code examples of rdkit.Chem.rdMolDescriptors.GetMorganFingerprintAsBitVect(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module rdkit.Chem.rdMolDescriptors , or try the search function .
Example #1
Source File: molecule.py    From rl_graph_generation with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def reward_target_molecule_similarity(mol, target, radius=2, nBits=2048,
                                      useChirality=True):
    """
    Reward for a target molecule similarity, based on tanimoto similarity
    between the ECFP fingerprints of the x molecule and target molecule
    :param mol: rdkit mol object
    :param target: rdkit mol object
    :return: float, [0.0, 1.0]
    """
    x = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=radius,
                                                        nBits=nBits,
                                                        useChirality=useChirality)
    target = rdMolDescriptors.GetMorganFingerprintAsBitVect(target,
                                                            radius=radius,
                                                        nBits=nBits,
                                                        useChirality=useChirality)
    return DataStructs.TanimotoSimilarity(x, target)


### TERMINAL VALUE REWARDS ### 
Example #2
Source File: fingerprints.py    From deepchem with MIT License 5 votes vote down vote up
def _featurize(self, mol):
    """
    Calculate circular fingerprint.

    Parameters
    ----------
    mol : RDKit Mol
        Molecule.
    """
    from rdkit import Chem
    from rdkit.Chem import rdMolDescriptors
    if self.sparse:
      info = {}
      fp = rdMolDescriptors.GetMorganFingerprint(
          mol,
          self.radius,
          useChirality=self.chiral,
          useBondTypes=self.bonds,
          useFeatures=self.features,
          bitInfo=info)
      fp = fp.GetNonzeroElements()  # convert to a dict

      # generate SMILES for fragments
      if self.smiles:
        fp_smiles = {}
        for fragment_id, count in fp.items():
          root, radius = info[fragment_id][0]
          env = Chem.FindAtomEnvironmentOfRadiusN(mol, radius, root)
          frag = Chem.PathToSubmol(mol, env)
          smiles = Chem.MolToSmiles(frag)
          fp_smiles[fragment_id] = {'smiles': smiles, 'count': count}
        fp = fp_smiles
    else:
      fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(
          mol,
          self.radius,
          nBits=self.size,
          useChirality=self.chiral,
          useBondTypes=self.bonds,
          useFeatures=self.features)
    return fp 
Example #3
Source File: fingerprints.py    From PADME with MIT License 5 votes vote down vote up
def __init__(self, mol, radius, nBits=2048, useChirality=False, useBondTypes=True, 
    useFeatures=False, smiles=None):
    self.fingerprint = rdMolDescriptors.GetMorganFingerprintAsBitVect(
      mol, radius, nBits=nBits, useChirality=useChirality, useBondTypes=useBondTypes,
      useFeatures=useFeatures)
    self.fingerprint_array = np.asarray(self.fingerprint)
    self.smiles = smiles 
Example #4
Source File: preprocess.py    From PADME with MIT License 5 votes vote down vote up
def get_highest_similarity(input_file, output_file, comparison_file='../full_toxcast/restructured.csv',
  top_compounds_only=True, num_compounds=1500):
  df_avg = pd.read_csv(input_file, header=0, index_col=False)
  if top_compounds_only:
    df_avg = df_avg.head(num_compounds)
  smiles_list = df_avg['smiles']
  avg_scores = df_avg['avg_score']
  # default_mol = Chem.MolFromSmiles('CCCC')
  # default_fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(default_mol, 2, nBits=1024)
  # mol2 = Chem.MolFromSmiles('CCCC')
  # fp2 = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol2, 2, nBits=1024)
  # sim = DataStructs.FingerprintSimilarity(default_fp, fp2)
  df_comparison = pd.read_csv(comparison_file, header=0, index_col=False)
  #df_comparison = df_comparison.head(100)
  comparison_smiles_list = df_comparison['smiles']
  comparison_fp_list = []
  similarity_list = []
  for c_smiles in comparison_smiles_list:
    comp_mol = Chem.MolFromSmiles(c_smiles)
    #comp_fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(comp_mol, 2, nBits=1024)
    comp_fp = FingerprintMols.FingerprintMol(comp_mol)
    comparison_fp_list.append(comp_fp)

  for i, smiles in enumerate(smiles_list):
    mol_to_test = Chem.MolFromSmiles(smiles)
    #fp_to_test = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol_to_test, 2, nBits=1024)
    fp_to_test = FingerprintMols.FingerprintMol(mol_to_test)
    similarity_list.append(get_highest_similarity_for_mol(fp_to_test, comparison_fp_list))
    if i%500 == 0:
      print(i)

  with open(output_file, 'w', newline='') as csvfile:
    fieldnames = ['smiles', 'avg_score', 'max_similarity']
    writer = csv.DictWriter(csvfile, fieldnames = fieldnames)
    writer.writeheader()
    for i, smiles in enumerate(smiles_list):
      out_line = {'smiles': smiles, 'avg_score': avg_scores[i], 
        'max_similarity': similarity_list[i]}
      writer.writerow(out_line) 
Example #5
Source File: ecfp_preprocessor.py    From chainer-chemistry with MIT License 5 votes vote down vote up
def get_input_features(self, mol):
        try:
            fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol,
                                                                self.radius)
        except Exception as e:
            logger = getLogger(__name__)
            logger.debug('exception caught at ECFPPreprocessor:', e)
            # Extracting feature failed
            raise MolFeatureExtractionError
        # TODO(Nakago): Test it.
        return numpy.asarray(fp, numpy.float32) 
Example #6
Source File: organic_lorentz_lorenz.py    From chemml with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def __represent(self, smiles):
        # The descriptor must be a binary Morgan fingerprint with radius 2 and 1024 bits.

        mol = Chem.MolFromSmiles(smiles.strip())
        if mol is None:
            msg = '%s is not a valid SMILES representation'%smiles
            raise ValueError(msg)
        else:
            return np.array(GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024)) 
Example #7
Source File: fingerprint.py    From XenonPy with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def featurize(self, x):
        if self.input_type == 'smiles':
            x_ = x
            x = Chem.MolFromSmiles(x)
            if x is None:
                raise ValueError('can not convert Mol from SMILES %s' % x_)
        if self.input_type == 'any':
            if not isinstance(x, Chem.rdchem.Mol):
                x_ = x
                x = Chem.MolFromSmiles(x)
                if x is None:
                    raise ValueError('can not convert Mol from SMILES %s' % x_)
        return list(rdMol.GetMorganFingerprintAsBitVect(x, self.radius, nBits=self.n_bits, useFeatures=True)) 
Example #8
Source File: fingerprint.py    From XenonPy with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def featurize(self, x):
        if self.input_type == 'smiles':
            x_ = x
            x = Chem.MolFromSmiles(x)
            if x is None:
                raise ValueError('can not convert Mol from SMILES %s' % x_)
        if self.input_type == 'any':
            if not isinstance(x, Chem.rdchem.Mol):
                x_ = x
                x = Chem.MolFromSmiles(x)
                if x is None:
                    raise ValueError('can not convert Mol from SMILES %s' % x_)
        return list(rdMol.GetMorganFingerprintAsBitVect(x, self.radius, nBits=self.n_bits)) 
Example #9
Source File: splitters.py    From PADME with MIT License 4 votes vote down vote up
def split(self,
            dataset,
            seed=None,
            frac_train=.8,
            frac_valid=.1,
            frac_test=.1,
            log_every_n=None):
    """
    Splits internal compounds randomly into train/validation/test.
    """
    np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
    if seed is None:
      seed = random.randint(0, 2**30)
    np.random.seed(seed)

    num_datapoints = len(dataset)

    train_cutoff = int(frac_train * num_datapoints)
    valid_cutoff = int((frac_train + frac_valid) * num_datapoints)

    num_train = train_cutoff
    num_valid = valid_cutoff - train_cutoff
    num_test = num_datapoints - valid_cutoff

    all_mols = []
    for ind, smiles in enumerate(dataset.ids):
      all_mols.append(Chem.MolFromSmiles(smiles))

    fps = [AllChem.GetMorganFingerprintAsBitVect(x, 2, 1024) for x in all_mols]

    def distance(i, j):
      return 1 - DataStructs.DiceSimilarity(fps[i], fps[j])

    picker = MaxMinPicker()
    testIndices = picker.LazyPick(
        distFunc=distance,
        poolSize=num_datapoints,
        pickSize=num_test,
        seed=seed)

    validTestIndices = picker.LazyPick(
        distFunc=distance,
        poolSize=num_datapoints,
        pickSize=num_valid + num_test,
        firstPicks=testIndices,
        seed=seed)

    allSet = set(range(num_datapoints))
    testSet = set(testIndices)
    validSet = set(validTestIndices) - testSet

    trainSet = allSet - testSet - validSet

    assert len(testSet & validSet) == 0
    assert len(testSet & trainSet) == 0
    assert len(validSet & trainSet) == 0
    assert (validSet | trainSet | testSet) == allSet

    return sorted(list(trainSet)), sorted(list(validSet)), sorted(list(testSet)) 
Example #10
Source File: splitters.py    From PADME with MIT License 4 votes vote down vote up
def split(self,
            dataset,
            frac_train=None,
            frac_valid=None,
            frac_test=None,
            log_every_n=1000,
            cutoff=0.18):
    """
        Splits internal compounds into train and validation based on the butina
        clustering algorithm. This splitting algorithm has an O(N^2) run time, where N
        is the number of elements in the dataset. The dataset is expected to be a classification
        dataset.

        This algorithm is designed to generate validation data that are novel chemotypes.

        Note that this function entirely disregards the ratios for frac_train, frac_valid,
        and frac_test. Furthermore, it does not generate a test set, only a train and valid set.

        Setting a small cutoff value will generate smaller, finer clusters of high similarity,
        whereas setting a large cutoff value will generate larger, coarser clusters of low similarity.
        """
    print("Performing butina clustering with cutoff of", cutoff)
    mols = []
    for ind, smiles in enumerate(dataset.ids):
      mols.append(Chem.MolFromSmiles(smiles))
    n_mols = len(mols)
    fps = [AllChem.GetMorganFingerprintAsBitVect(x, 2, 1024) for x in mols]

    scaffold_sets = ClusterFps(fps, cutoff=cutoff)
    scaffold_sets = sorted(scaffold_sets, key=lambda x: -len(x))

    ys = dataset.y
    valid_inds = []
    for c_idx, cluster in enumerate(scaffold_sets):
      # for m_idx in cluster:
      valid_inds.extend(cluster)
      # continue until we find an active in all the tasks, otherwise we can't
      # compute a meaningful AUC
      # TODO (ytz): really, we want at least one active and inactive in both scenarios.
      # TODO (Ytz): for regression tasks we'd stop after only one cluster.
      active_populations = np.sum(ys[valid_inds], axis=0)
      if np.all(active_populations):
        print("# of actives per task in valid:", active_populations)
        print("Total # of validation points:", len(valid_inds))
        break

    train_inds = list(itertools.chain.from_iterable(scaffold_sets[c_idx + 1:]))
    test_inds = []

    return train_inds, valid_inds, []