Python Examples of rdkit.Chem.SDMolSupplier

Source File: converter.py From 3DGCN with MIT License

7 votes

def rotate_molecule(path, target_path, count=10):
    # Load dataset
    mols = Chem.SDMolSupplier(path)
    rotated_mols = []

    print("Loaded {} Molecules from {}".format(len(mols), path))

    print("Rotating Molecules...")
    for mol in mols:
        for _ in range(count):
            for atom in mol.GetAtoms():
                atom_idx = atom.GetIdx()

                pos = list(mol.GetConformer().GetAtomPosition(atom_idx))
                pos_rotated = np.matmul(random_rotation_matrix(), pos)

                mol.GetConformer().SetAtomPosition(atom_idx, pos_rotated)

            rotated_mols.append(mol)

    w = Chem.SDWriter(target_path)
    for m in rotated_mols:
        if m is not None:
            w.write(m)
    print("Saved {} Molecules to {}".format(len(rotated_mols), target_path))

Source File: curve.py From 3DGCN with MIT License

6 votes

def draw_confusion_matrix(dataset, model, set_trial=None, filename="test_results.sdf"):
    path = find_average_trial(dataset, model, metric="test_pr") if set_trial is None \
        else "../result/{}/{}/{}/".format(model, dataset, set_trial)

    # Load true, pred value
    true_y, pred_y = [], []
    mols = Chem.SDMolSupplier(path + filename)

    for mol in mols:
        true_y.append(float(mol.GetProp("true")))
        pred_y.append(float(mol.GetProp("pred")))

    true_y = np.array(true_y, dtype=float)
    pred_y = np.array(pred_y, dtype=float).round()

    # Get precision and recall
    confusion = confusion_matrix(true_y, pred_y)
    tn, fp, fn, tp = confusion.ravel()

    print("tn: {}, fp: {}, fn: {}, tp: {}".format(tn, fp, fn, tp))

Source File: save.py From PADME with MIT License

6 votes

def load_sdf_files(input_files, clean_mols):
  """Load SDF file into dataframe."""
  dataframes = []
  for input_file in input_files:
    # Tasks are stored in .sdf.csv file
    raw_df = next(load_csv_files([input_file + ".csv"], shard_size=None))
    # Structures are stored in .sdf file
    print("Reading structures from %s." % input_file)
    suppl = Chem.SDMolSupplier(str(input_file), clean_mols, False, False)
    df_rows = []
    for ind, mol in enumerate(suppl):
      if mol is not None:
        smiles = Chem.MolToSmiles(mol)
        df_rows.append([ind, smiles, mol])
    mol_df = pd.DataFrame(df_rows, columns=('mol_id', 'smiles', 'mol'))
    dataframes.append(pd.concat([mol_df, raw_df], axis=1, join='inner'))
  return dataframes

Source File: Getmol.py From PyBioMed with BSD 3-Clause "New" or "Revised" License

6 votes

def ReadMolFromSDF(filename=""):
    """
    Read a set of molecules by SDF file format.

    Note: the output of this function is a set of molecular objects.

    You need to use for statement to call each object.

    Usage:

        res=ReadMolFromSDF(filename)

        Input: filename is a file name with path.

        Output: res is a set of molecular object.

    """
    molset = Chem.SDMolSupplier(filename)
    return molset

Source File: sdf_file_parser.py From chainer-chemistry with MIT License

6 votes

def extract_total_num(self, filepath):
        """Extracts total number of data which can be parsed

        We can use this method to determine the value fed to `target_index`
        option of `parse` method. For example, if we want to extract input
        feature from 10% of whole dataset, we need to know how many samples
        are in a file. The returned value of this method may not to be same as
        the final dataset size.

        Args:
            filepath (str): file path of to check the total number.

        Returns (int): total number of dataset can be parsed.

        """
        mol_supplier = Chem.SDMolSupplier(filepath)
        return len(mol_supplier)

Source File: predict_enriched.py From PIDGINv3 with GNU General Public License v3.0

6 votes

def importQuerySDF(in_file):
	outfp = []
	outmol = []
	query = Chem.SDMolSupplier(in_file)
	for idx, m in enumerate(suppl):
		sys.stdout.write(' Importing SDF file. Compound number: %s\r' % idx)
		sys.stdout.flush()
		try:
			if not m: raise SdfNoneMolError('None mol')
			smi, fp, mol = calcFingerprints(m,qtype='sdf')
			outfp.append(fp)
			outmol.append(mol)
		except SdfNoneMolError: print ' SDF parse error (compound index: ' + str(idx) + ')'
	print
	return np.array(outfp,dtype=np.uint8),outmol

#unzip a pkl model

Source File: predict.py From PIDGINv3 with GNU General Public License v3.0

6 votes

def importQuerySDF(in_file):
	outfp = []
	outid= []
	outmol = []
	query = Chem.SDMolSupplier(in_file)
	for idx, m in enumerate(suppl):
		sys.stdout.write(' Importing SDF file. Compound number: %s\r' % idx)
		sys.stdout.flush()
		try:
			if not m: raise SdfNoneMolError('None mol')
			smi, fp, mol = calcFingerprints(m,qtype='sdf')
			try: outid.append(m.GetProp('_Name'))
			except KeyError: outid.append(smi)
			outfp.append(fp)
			outmol.append(mol)
		except SdfNoneMolError: print ' SDF parse error (compound index: ' + str(idx) + ')'
	print
	return np.array(outfp,dtype=np.uint8),outmol,outid

#unzip a pkl model

Source File: dataset.py From 3DGCN with MIT License

5 votes

def replace_dataset(self, path, subset="test", target_name="target"):
        x, c, y = [], [], []
        mols = Chem.SDMolSupplier(path)

        for mol in mols:
            if mol is not None:
                # Multitask
                if type(target_name) is list:
                    y.append([float(mol.GetProp(t)) if t in mol.GetPropNames() else -1 for t in target_name])
                    self.outputs = len(self.target_name)

                # Singletask
                elif target_name in mol.GetPropNames():
                    _y = float(mol.GetProp(target_name))
                    if _y == -1:
                        continue
                    else:
                        y.append(_y)

                else:
                    continue

                x.append(mol)
                c.append(mol.GetConformer().GetPositions())

        # Normalize
        x = np.array(x)
        c = np.array(c)
        y = (np.array(y) - self.mean) / self.std

        self.x[subset] = x
        self.c[subset] = c
        self.y[subset] = y.astype(int) if self.task != "regression" else y

Source File: scatter_plot.py From 3DGCN with MIT License

5 votes

def find_confusion(dataset, base_path):
    for i in range(1, 11):
        path = base_path + "trial_{}/".format(i)

        # Load true, pred value
        true_y, pred_y, diff_y = [], [], []

        mols = Chem.SDMolSupplier(path + "test.sdf")
        for mol in mols:
            diff_y.append(float(mol.GetProp("true")) - float(mol.GetProp("pred")))

        diff_y = np.array(diff_y, dtype=float)

        # Find largest, smallest error molecules
        idx = np.argsort(diff_y)
        top_1 = mols[int(idx[-1])]
        top_2 = mols[int(idx[-2])]
        btm_1 = mols[int(idx[0])]
        btm_2 = mols[int(idx[1])]

        best_idx = np.argsort(np.abs(diff_y))
        best = mols[int(best_idx[0])]

        # Save example molecules
        writer = Chem.SDWriter(path + "confusion_examples_" + dataset + "_trial" + str(i) + ".sdf")
        for mol in [top_1, top_2, btm_1, btm_2, best]:
            writer.write(mol)

Source File: save.py From deepchem with MIT License

5 votes

def load_sdf_files(input_files, clean_mols, tasks=[]):
  """Load SDF file into dataframe."""
  from rdkit import Chem
  dataframes = []
  for input_file in input_files:
    # Tasks are either in .sdf.csv file or in the .sdf file itself
    has_csv = os.path.isfile(input_file + ".csv")
    # Structures are stored in .sdf file
    print("Reading structures from %s." % input_file)
    suppl = Chem.SDMolSupplier(str(input_file), clean_mols, False, False)
    df_rows = []
    for ind, mol in enumerate(suppl):
      if mol is None:
        continue
      smiles = Chem.MolToSmiles(mol)
      df_row = [ind, smiles, mol]
      if not has_csv:  # Get task targets from .sdf file
        for task in tasks:
          df_row.append(mol.GetProp(str(task)))
      df_rows.append(df_row)
    if has_csv:
      mol_df = pd.DataFrame(df_rows, columns=('mol_id', 'smiles', 'mol'))
      raw_df = next(load_csv_files([input_file + ".csv"], shard_size=None))
      dataframes.append(pd.concat([mol_df, raw_df], axis=1, join='inner'))
    else:
      mol_df = pd.DataFrame(
          df_rows, columns=('mol_id', 'smiles', 'mol') + tuple(tasks))
      dataframes.append(mol_df)
  return dataframes

Source File: rdk.py From oddt with BSD 3-Clause "New" or "Revised" License

5 votes

def readstring(format, string, **kwargs):
    """Read in a molecule from a string.

    Required parameters:
       format - see the informats variable for a list of available
                input formats
       string

    Example:
    >>> input = "C1=CC=CS1"
    >>> mymol = readstring("smi", input)
    >>> len(mymol.atoms)
    5
    """
    string = str(string)
    format = format.lower()
    if format in ["mol", "sdf"]:
        supplier = Chem.SDMolSupplier()
        supplier.SetData(string)
        mol = next(supplier)
        del supplier
    elif format == "mol2":
        mol = Chem.MolFromMol2Block(string, **kwargs)
    elif format == "pdb":
        mol = MolFromPDBBlock(string, **kwargs)
    elif format == 'pdbqt':
        mol = MolFromPDBQTBlock(string, **kwargs)
    elif format == "smi":
        s = string.strip().split('\n')[0].strip().split()
        mol = Chem.MolFromSmiles(s[0], **kwargs)
        if mol:
            mol.SetProp("_Name", ' '.join(s[1:]))
    elif format == 'inchi' and Chem.INCHI_AVAILABLE:
        mol = Chem.inchi.MolFromInchi(string, **kwargs)
    else:
        raise ValueError("%s is not a recognised RDKit format" % format)
    return Molecule(mol)

Source File: spectra_predictor.py From deep-molecular-massspec with Apache License 2.0

5 votes

def get_mol_list_from_sdf(sdf_fname):
  """Reads a sdf file and returns a list of molecules.

  Note: rdkit's Chem.SDMolSupplier only accepts filenames as inputs. As such
  this code only supports local filesystem name environments.

  Args:
    sdf_fname: Path to sdf file.

  Returns:
    List of rdkit.Mol objects.

  Raises:
    ValueError if a molblock in the SDF cannot be parsed.
  """
  suppl = Chem.SDMolSupplier(sdf_fname)
  mols = []

  for idx, mol in enumerate(suppl):
    if mol is not None:
      mols.append(mol)
    else:
      fail_sdf_block = suppl.GetItemText(idx)
      raise ValueError("Unable to parse the following mol block %s" %
                       fail_sdf_block)
  return mols

Source File: chopRDKit03.py From eMolFrag with GNU General Public License v3.0

5 votes

def FragmentSanitize(tempSDFPath):
    try:
        suppl2 = Chem.SDMolSupplier(tempSDFPath,sanitize=True)
        newmol2=Chem.FragmentOnBRICSBonds(suppl2[0])
        mfl=Chem.GetMolFrags(newmol2,asMols=True,sanitizeFrags=False)
        #print('Good True')
        return mfl
    except:
        #print('Not good for true')
        raise RDKitError(1)

Source File: converter.py From 3DGCN with MIT License

4 votes

def converter(path, target_path, name, target_name, process=20):
    # Load dataset
    print("Loading Dataset...")
    if ".csv" in path:
        x, y = load_csv(path, name, target_name)
        mols, props = [], []
        for smi, prop in zip(x, y):
            mol = Chem.MolFromSmiles(smi)
            if mol is not None:
                mols.append(mol)
                props.append(prop)
        mol_idx = list(range(len(mols)))

    elif ".sdf" in path:
        mols = Chem.SDMolSupplier(path)

        props = []
        for mol in mols:
            props.append(mol.GetProp(target_name))
        mol_idx = list(range(len(mols)))

    else:
        raise ValueError("Unsupported file type.")
    print("Loaded {} Molecules from {}".format(len(mols), path))

    # Optimize coordinate using multiprocessing
    print("Optimizing Conformers...")
    pool = mp.Pool(process)
    results = pool.starmap(optimize_conformer, zip(mol_idx, mols, props))

    # Collect results
    mol_list, prop_list = [], []
    for mol, prop in results:
        mol_list.append(mol)
        prop_list.append(prop)

    # Remove None and add properties
    mol_list_filtered = []
    for mol, prop in zip(mol_list, prop_list):
        if mol is not None:
            mol.SetProp("target", str(prop))
            mol_list_filtered.append(mol)
    print("{} Molecules Optimized".format(len(mol_list_filtered)))

    # Save molecules
    print("Saving File...")
    w = Chem.SDWriter(target_path)
    for m in mol_list_filtered:
        w.write(m)
    print("Saved {} Molecules to {}".format(len(mol_list_filtered), target_path))

Source File: sparse_molecular_dataset.py From MolGAN with MIT License

4 votes

def generate(self, filename, add_h=False, filters=lambda x: True, size=None, validation=0.1, test=0.1):
        self.log('Extracting {}..'.format(filename))

        if filename.endswith('.sdf'):
            self.data = list(filter(lambda x: x is not None, Chem.SDMolSupplier(filename)))
        elif filename.endswith('.smi'):
            self.data = [Chem.MolFromSmiles(line) for line in open(filename, 'r').readlines()]

        self.data = list(map(Chem.AddHs, self.data)) if add_h else self.data
        self.data = list(filter(filters, self.data))
        self.data = self.data[:size]

        self.log('Extracted {} out of {} molecules {}adding Hydrogen!'.format(len(self.data),
                                                                              len(Chem.SDMolSupplier(filename)),
                                                                              '' if add_h else 'not '))

        self._generate_encoders_decoders()
        self._generate_AX()

        # it contains the all the molecules stored as rdkit.Chem objects
        self.data = np.array(self.data)

        # it contains the all the molecules stored as SMILES strings
        self.smiles = np.array(self.smiles)

        # a (N, L) matrix where N is the length of the dataset and each L-dim vector contains the 
        # indices corresponding to a SMILE sequences with padding wrt the max length of the longest 
        # SMILES sequence in the dataset (see self._genS)
        self.data_S = np.stack(self.data_S)

        # a (N, 9, 9) tensor where N is the length of the dataset and each 9x9 matrix contains the 
        # indices of the positions of the ones in the one-hot representation of the adjacency tensor
        # (see self._genA)
        self.data_A = np.stack(self.data_A)

        # a (N, 9) matrix where N is the length of the dataset and each 9-dim vector contains the 
        # indices of the positions of the ones in the one-hot representation of the annotation matrix
        # (see self._genX)
        self.data_X = np.stack(self.data_X)

        # a (N, 9) matrix where N is the length of the dataset and each  9-dim vector contains the 
        # diagonal of the correspondent adjacency matrix
        self.data_D = np.stack(self.data_D)

        # a (N, F) matrix where N is the length of the dataset and each F vector contains features 
        # of the correspondent molecule (see self._genF)
        self.data_F = np.stack(self.data_F)

        # a (N, 9) matrix where N is the length of the dataset and each  9-dim vector contains the
        # eigenvalues of the correspondent Laplacian matrix
        self.data_Le = np.stack(self.data_Le)

        # a (N, 9, 9) matrix where N is the length of the dataset and each  9x9 matrix contains the 
        # eigenvectors of the correspondent Laplacian matrix
        self.data_Lv = np.stack(self.data_Lv) 

        self.vertexes = self.data_F.shape[-2]
        self.features = self.data_F.shape[-1]

        self._generate_train_validation_test(validation, test)

Python rdkit.Chem.SDMolSupplier() Examples