Python rdkit.Chem.SDMolSupplier() Examples
The following are 15 code examples of rdkit.Chem.SDMolSupplier().
Example #1
Source File: From 3DGCN with MIT License | 7 votes |
def rotate_molecule(path, target_path, count=10):
    """Write ``count`` randomly rotated copies of every molecule in an SDF file.

    Args:
        path: Path of the input SDF file.
        target_path: Path of the SDF file the rotated molecules are written to.
        count: Number of rotated copies produced per input molecule.

    Records the supplier yields as ``None`` are skipped.
    """
    # Load dataset
    mols = Chem.SDMolSupplier(path)
    rotated_mols = []
    print("Loaded {} Molecules from {}".format(len(mols), path))

    print("Rotating Molecules...")
    for mol in mols:
        if mol is None:
            continue

        for _ in range(count):
            # Rotate a copy: appending the same object repeatedly would leave
            # ``count`` references to one molecule holding only the final
            # coordinates.
            rotated = Chem.Mol(mol)
            conformer = rotated.GetConformer()

            # Draw one matrix per copy so every atom undergoes the same
            # transformation; a fresh matrix per atom would distort the
            # geometry instead of rotating it.
            rotation = random_rotation_matrix()

            for atom in rotated.GetAtoms():
                atom_idx = atom.GetIdx()
                pos = list(conformer.GetAtomPosition(atom_idx))
                pos_rotated = np.matmul(rotation, pos)
                conformer.SetAtomPosition(atom_idx, pos_rotated)

            rotated_mols.append(rotated)

    w = Chem.SDWriter(target_path)
    try:
        for m in rotated_mols:
            w.write(m)
    finally:
        # Close explicitly so buffered records are flushed to disk.
        w.close()

    print("Saved {} Molecules to {}".format(len(rotated_mols), target_path))
Example #2
Source File: From 3DGCN with MIT License | 6 votes |
def draw_confusion_matrix(dataset, model, set_trial=None, filename="test_results.sdf"):
    """Print the confusion-matrix counts for the predictions stored in an SDF file.

    Args:
        dataset: Dataset name, used to build the result directory.
        model: Model name, used to build the result directory.
        set_trial: Trial directory name. When ``None`` the directory is the
            value returned by ``find_average_trial(dataset, model,
            metric="test_pr")``.
        filename: Name of the SDF file inside the result directory.

    Every molecule must carry the ``true`` and ``pred`` properties; predictions
    are rounded before the matrix is computed.
    """
    if set_trial is None:
        path = find_average_trial(dataset, model, metric="test_pr")
    else:
        path = "../result/{}/{}/{}/".format(model, dataset, set_trial)

    # Load true, pred value
    supplier = Chem.SDMolSupplier(path + filename)
    labels = [
        (float(entry.GetProp("true")), float(entry.GetProp("pred")))
        for entry in supplier
    ]
    true_y = np.array([truth for truth, _ in labels], dtype=float)
    pred_y = np.array([guess for _, guess in labels], dtype=float).round()

    # Get precision and recall
    confusion = confusion_matrix(true_y, pred_y)
    tn, fp, fn, tp = confusion.ravel()
    print("tn: {}, fp: {}, fn: {}, tp: {}".format(tn, fp, fn, tp))
Example #3
Source File: From PADME with MIT License | 6 votes |
def load_sdf_files(input_files, clean_mols):
    """Load SDF files and their companion CSV files into dataframes.

    For every path in ``input_files`` the task table is read from
    ``<path>.csv`` and the structures from ``<path>`` itself. Molecules the
    supplier yields as ``None`` are dropped.

    Args:
        input_files: Iterable of SDF file paths.
        clean_mols: Forwarded as the second positional argument of
            ``Chem.SDMolSupplier``.

    Returns:
        A list with one dataframe per input file: the ``mol_id``, ``smiles``
        and ``mol`` columns joined column-wise with the CSV table.
    """
    frames = []
    for sdf_path in input_files:
        # Tasks are stored in .sdf.csv file
        task_table = next(load_csv_files([sdf_path + ".csv"], shard_size=None))

        # Structures are stored in .sdf file
        print("Reading structures from %s." % sdf_path)
        supplier = Chem.SDMolSupplier(str(sdf_path), clean_mols, False, False)
        records = [
            [position, Chem.MolToSmiles(molecule), molecule]
            for position, molecule in enumerate(supplier)
            if molecule is not None
        ]
        structures = pd.DataFrame(records, columns=('mol_id', 'smiles', 'mol'))

        # NOTE(review): concat aligns on the dataframe index, and `structures`
        # is indexed by position among the kept molecules rather than by
        # mol_id -- confirm this matches the CSV rows when records are skipped.
        frames.append(pd.concat([structures, task_table], axis=1, join='inner'))
    return frames
Example #4
Source File: From PyBioMed with BSD 3-Clause "New" or "Revised" License | 6 votes |
def ReadMolFromSDF(filename=""):
    """
    Open an SDF file and return the molecules it contains.

    Usage:

        res = ReadMolFromSDF(filename)

        Input: filename is a file name with path.

        Output: res is the ``Chem.SDMolSupplier`` created for the file;
        loop over it with a ``for`` statement to get each molecular object.
    """
    return Chem.SDMolSupplier(filename)
Example #5
Source File: From chainer-chemistry with MIT License | 6 votes |
def extract_total_num(self, filepath):
    """Return the number of records found in an SDF file.

    The count helps choose the value fed to the `target_index` option of the
    `parse` method, e.g. to extract features from only 10% of a dataset.
    It may differ from the final dataset size.

    Args:
        filepath (str): path of the file whose records are counted.

    Returns (int): length of the ``Chem.SDMolSupplier`` opened on
        ``filepath``.
    """
    return len(Chem.SDMolSupplier(filepath))
Example #6
Source File: From PIDGINv3 with GNU General Public License v3.0 | 6 votes |
def importQuerySDF(in_file):
    """Read an SDF file and compute a fingerprint for every parsable compound.

    Progress is written to stdout on a single, carriage-return-refreshed line.
    Compounds for which ``SdfNoneMolError`` is raised (either because the
    supplier yielded a falsy molecule or from within ``calcFingerprints``)
    are reported and skipped.

    Args:
        in_file: Path of the SDF file.

    Returns:
        Tuple ``(fingerprints, mols)``: a ``uint8`` numpy array built from the
        fingerprints and the list of molecules returned by
        ``calcFingerprints``.
    """
    outfp = []
    outmol = []
    query = Chem.SDMolSupplier(in_file)
    # Iterate the supplier created above; the previous code looped over an
    # undefined name (`suppl`).
    for idx, m in enumerate(query):
        sys.stdout.write(' Importing SDF file. Compound number: %s\r' % idx)
        sys.stdout.flush()
        try:
            if not m:
                raise SdfNoneMolError('None mol')
            smi, fp, mol = calcFingerprints(m, qtype='sdf')
            outfp.append(fp)
            outmol.append(mol)
        except SdfNoneMolError:
            # Single-argument print() produces the same output on Python 2 and 3.
            print(' SDF parse error (compound index: ' + str(idx) + ')')
    # Terminate the progress line.
    print('')
    return np.array(outfp, dtype=np.uint8), outmol

#unzip a pkl model
Example #7
Source File: From PIDGINv3 with GNU General Public License v3.0 | 6 votes |
def importQuerySDF(in_file):
    """Read an SDF file and compute fingerprints and ids for its compounds.

    Progress is written to stdout on a single, carriage-return-refreshed line.
    Compounds for which ``SdfNoneMolError`` is raised (either because the
    supplier yielded a falsy molecule or from within ``calcFingerprints``)
    are reported and skipped.

    Args:
        in_file: Path of the SDF file.

    Returns:
        Tuple ``(fingerprints, mols, ids)``: a ``uint8`` numpy array built
        from the fingerprints, the list of molecules returned by
        ``calcFingerprints``, and one id per compound -- its ``_Name``
        property, or the SMILES returned by ``calcFingerprints`` when that
        property lookup raises ``KeyError``.
    """
    outfp = []
    outid = []
    outmol = []
    query = Chem.SDMolSupplier(in_file)
    # Iterate the supplier created above; the previous code looped over an
    # undefined name (`suppl`).
    for idx, m in enumerate(query):
        sys.stdout.write(' Importing SDF file. Compound number: %s\r' % idx)
        sys.stdout.flush()
        try:
            if not m:
                raise SdfNoneMolError('None mol')
            smi, fp, mol = calcFingerprints(m, qtype='sdf')
            try:
                outid.append(m.GetProp('_Name'))
            except KeyError:
                outid.append(smi)
            outfp.append(fp)
            outmol.append(mol)
        except SdfNoneMolError:
            # Single-argument print() produces the same output on Python 2 and 3.
            print(' SDF parse error (compound index: ' + str(idx) + ')')
    # Terminate the progress line.
    print('')
    return np.array(outfp, dtype=np.uint8), outmol, outid

#unzip a pkl model
Example #8
Source File: From 3DGCN with MIT License | 5 votes |
def replace_dataset(self, path, subset="test", target_name="target"):
    """Replace one subset of the dataset with molecules read from an SDF file.

    Args:
        path: Path of the SDF file.
        subset: Key under which the data is stored in ``self.x``, ``self.c``
            and ``self.y``.
        target_name: Name of the target property, or a list of names for
            multitask data.

    Multitask: every parsable molecule is kept and a missing property is
    recorded as ``-1``. Singletask: molecules lacking the property, or whose
    value equals ``-1``, are skipped. Records the supplier yields as ``None``
    are always skipped.

    Labels are normalised with ``self.mean`` and ``self.std`` and cast to
    ``int`` unless ``self.task`` is ``"regression"``.
    """
    x, c, y = [], [], []
    is_multitask = isinstance(target_name, list)

    mols = Chem.SDMolSupplier(path)
    for mol in mols:
        if mol is None:
            continue

        # Query the property names once per molecule instead of once per task.
        prop_names = set(mol.GetPropNames())

        # Multitask
        if is_multitask:
            y.append([float(mol.GetProp(t)) if t in prop_names else -1
                      for t in target_name])

        # Singletask
        elif target_name in prop_names:
            _y = float(mol.GetProp(target_name))
            if _y == -1:
                continue
            y.append(_y)

        else:
            continue

        x.append(mol)
        c.append(mol.GetConformer().GetPositions())

    if is_multitask and y:
        # Derive the output width from the tasks actually loaded: the label
        # rows are built from the `target_name` argument, so using
        # `self.target_name` could disagree with the shape of `y`.
        self.outputs = len(target_name)

    # Normalize
    x = np.array(x)
    c = np.array(c)
    y = (np.array(y) - self.mean) / self.std

    self.x[subset] = x
    self.c[subset] = c
    self.y[subset] = y.astype(int) if self.task != "regression" else y
Example #9
Source File: From 3DGCN with MIT License | 5 votes |
def find_confusion(dataset, base_path):
    """Save the most over-, under- and best-predicted molecules of each trial.

    For trials 1 to 10 the file ``<base_path>trial_<i>/test.sdf`` is read, the
    signed error ``true - pred`` is computed per molecule, and five molecules
    are written to
    ``<base_path>trial_<i>/confusion_examples_<dataset>_trial<i>.sdf``: the
    two largest errors, the two smallest errors and the smallest absolute
    error, in that order.

    Args:
        dataset: Dataset name, used in the output file name.
        base_path: Directory prefix that contains the ``trial_<i>/`` folders.
    """
    for i in range(1, 11):
        path = base_path + "trial_{}/".format(i)

        # Load true, pred value
        mols = Chem.SDMolSupplier(path + "test.sdf")
        diff_y = np.array(
            [float(mol.GetProp("true")) - float(mol.GetProp("pred")) for mol in mols],
            dtype=float,
        )

        # Find largest, smallest error molecules
        idx = np.argsort(diff_y)
        top_1 = mols[int(idx[-1])]
        top_2 = mols[int(idx[-2])]
        btm_1 = mols[int(idx[0])]
        btm_2 = mols[int(idx[1])]

        best_idx = np.argsort(np.abs(diff_y))
        best = mols[int(best_idx[0])]

        # Save example molecules
        writer = Chem.SDWriter(path + "confusion_examples_" + dataset + "_trial" + str(i) + ".sdf")
        try:
            for mol in [top_1, top_2, btm_1, btm_2, best]:
                writer.write(mol)
        finally:
            # Close explicitly so buffered records are flushed to disk.
            writer.close()
Example #10
Source File: From deepchem with MIT License | 5 votes |
def load_sdf_files(input_files, clean_mols, tasks=[]):
    """Load SDF files into dataframes.

    Task values come from ``<path>.csv`` when that file exists; otherwise they
    are read from the properties of each molecule named in ``tasks``.
    Molecules the supplier yields as ``None`` are dropped.

    Args:
        input_files: Iterable of SDF file paths.
        clean_mols: Forwarded as the second positional argument of
            ``Chem.SDMolSupplier``.
        tasks: Property names read from the SDF records when no companion
            CSV file exists.

    Returns:
        A list with one dataframe per input file, holding the ``mol_id``,
        ``smiles`` and ``mol`` columns plus the task data.
    """
    from rdkit import Chem

    base_columns = ('mol_id', 'smiles', 'mol')
    frames = []
    for sdf_path in input_files:
        # Tasks are either in .sdf.csv file or in the .sdf file itself
        csv_path = sdf_path + ".csv"
        csv_present = os.path.isfile(csv_path)

        # Structures are stored in .sdf file
        print("Reading structures from %s." % sdf_path)
        supplier = Chem.SDMolSupplier(str(sdf_path), clean_mols, False, False)

        records = []
        for position, molecule in enumerate(supplier):
            if molecule is None:
                continue
            record = [position, Chem.MolToSmiles(molecule), molecule]
            if not csv_present:
                # Get task targets from .sdf file
                record.extend(molecule.GetProp(str(task)) for task in tasks)
            records.append(record)

        if csv_present:
            structures = pd.DataFrame(records, columns=base_columns)
            task_table = next(load_csv_files([csv_path], shard_size=None))
            # NOTE(review): concat aligns on the dataframe index, and
            # `structures` is indexed by position among the kept molecules
            # rather than by mol_id -- confirm this matches the CSV rows
            # when records are skipped.
            frames.append(pd.concat([structures, task_table], axis=1, join='inner'))
        else:
            frames.append(pd.DataFrame(records, columns=base_columns + tuple(tasks)))
    return frames
Example #11
Source File: From oddt with BSD 3-Clause "New" or "Revised" License | 5 votes |
def readstring(format, string, **kwargs):
    """Build a molecule from its textual representation.

    Required parameters:
       format - see the informats variable for a list of available
                input formats (matched case-insensitively)
       string - the molecule text; it is converted with ``str()`` first

    Extra keyword arguments are forwarded to the parser of every format
    except "mol" and "sdf". For "smi" only the first line is used: its first
    token is the SMILES and the remaining tokens become the molecule name.

    Raises ValueError when the format is not recognised.

    Example:
    >>> input = "C1=CC=CS1"
    >>> mymol = readstring("smi", input)
    >>> len(mymol.atoms)
    5
    """
    text = str(string)
    fmt = format.lower()

    if fmt in ("mol", "sdf"):
        reader = Chem.SDMolSupplier()
        reader.SetData(text)
        parsed = next(reader)
        del reader
        return Molecule(parsed)

    if fmt == "mol2":
        return Molecule(Chem.MolFromMol2Block(text, **kwargs))

    if fmt == "pdb":
        return Molecule(MolFromPDBBlock(text, **kwargs))

    if fmt == 'pdbqt':
        return Molecule(MolFromPDBQTBlock(text, **kwargs))

    if fmt == "smi":
        tokens = text.strip().split('\n')[0].strip().split()
        parsed = Chem.MolFromSmiles(tokens[0], **kwargs)
        if parsed:
            parsed.SetProp("_Name", ' '.join(tokens[1:]))
        return Molecule(parsed)

    if fmt == 'inchi' and Chem.INCHI_AVAILABLE:
        return Molecule(Chem.inchi.MolFromInchi(text, **kwargs))

    raise ValueError("%s is not a recognised RDKit format" % fmt)
Example #12
Source File: From deep-molecular-massspec with Apache License 2.0 | 5 votes |
def get_mol_list_from_sdf(sdf_fname):
    """Parse every record of an SDF file into a list of molecules.

    Note: rdkit's Chem.SDMolSupplier only accepts filenames as inputs, so
    this code only supports local filesystem paths.

    Args:
      sdf_fname: Path to sdf file.

    Returns:
      List of rdkit.Mol objects, in file order.

    Raises:
      ValueError if a molblock in the SDF cannot be parsed; the message
        contains the text of the offending record.
    """
    supplier = Chem.SDMolSupplier(sdf_fname)
    parsed = []
    for position, molecule in enumerate(supplier):
        if molecule is None:
            raise ValueError("Unable to parse the following mol block %s" %
                             supplier.GetItemText(position))
        parsed.append(molecule)
    return parsed
Example #13
Source File: From eMolFrag with GNU General Public License v3.0 | 5 votes |
def FragmentSanitize(tempSDFPath):
    """Fragment the first molecule of an SDF file on its BRICS bonds.

    The file is read with sanitization enabled; the resulting fragments are
    returned as molecules without being sanitized themselves.

    Args:
        tempSDFPath: Path of the SDF file.

    Returns:
        The value of ``Chem.GetMolFrags(..., asMols=True,
        sanitizeFrags=False)`` for the fragmented molecule.

    Raises:
        RDKitError: with code 1 when reading or fragmenting fails.
    """
    try:
        suppl2 = Chem.SDMolSupplier(tempSDFPath, sanitize=True)
        newmol2 = Chem.FragmentOnBRICSBonds(suppl2[0])
        mfl = Chem.GetMolFrags(newmol2, asMols=True, sanitizeFrags=False)
        return mfl
    except Exception:
        # Catch Exception rather than everything so that KeyboardInterrupt
        # and SystemExit still propagate instead of becoming an RDKitError.
        raise RDKitError(1)
Example #14
Source File: From 3DGCN with MIT License | 4 votes |
def converter(path, target_path, name, target_name, process=20):
    """Optimize conformers for a dataset and save the result as an SDF file.

    Args:
        path: Input file; handled as CSV when the path contains ``.csv`` and
            as SDF when it contains ``.sdf``.
        target_path: Path of the SDF file to write.
        name: Forwarded to ``load_csv`` together with ``target_name``
            (CSV input only).
        target_name: Name of the target; for SDF input it is the molecule
            property read as the target value.
        process: Number of worker processes used for the optimization.

    Molecules that cannot be parsed, or for which ``optimize_conformer``
    returns ``None``, are dropped. Every saved molecule carries its target
    value in the ``target`` property.

    Raises:
        ValueError: if ``path`` contains neither ``.csv`` nor ``.sdf``.
    """
    # Load dataset
    print("Loading Dataset...")
    if ".csv" in path:
        x, y = load_csv(path, name, target_name)
        mols, props = [], []
        for smi, prop in zip(x, y):
            mol = Chem.MolFromSmiles(smi)
            if mol is not None:
                mols.append(mol)
                props.append(prop)

    elif ".sdf" in path:
        # Drop unparsable records here as the CSV branch does; otherwise the
        # property lookup below fails on a None entry.
        mols = [mol for mol in Chem.SDMolSupplier(path) if mol is not None]
        props = [mol.GetProp(target_name) for mol in mols]

    else:
        raise ValueError("Unsupported file type.")

    mol_idx = list(range(len(mols)))
    print("Loaded {} Molecules from {}".format(len(mols), path))

    # Optimize coordinate using multiprocessing
    print("Optimizing Conformers...")
    # The context manager releases the worker processes once the (blocking)
    # starmap call has returned all results.
    with mp.Pool(process) as pool:
        results = pool.starmap(optimize_conformer, zip(mol_idx, mols, props))

    # Remove None and add properties
    mol_list_filtered = []
    for mol, prop in results:
        if mol is not None:
            mol.SetProp("target", str(prop))
            mol_list_filtered.append(mol)
    print("{} Molecules Optimized".format(len(mol_list_filtered)))

    # Save molecules
    print("Saving File...")
    w = Chem.SDWriter(target_path)
    try:
        for m in mol_list_filtered:
            w.write(m)
    finally:
        # Close explicitly so buffered records are flushed to disk.
        w.close()

    print("Saved {} Molecules to {}".format(len(mol_list_filtered), target_path))
Example #15
Source File: From MolGAN with MIT License | 4 votes |
def generate(self, filename, add_h=False, filters=lambda x: True, size=None, validation=0.1, test=0.1):
    """Load molecules from a file and build the dataset arrays.

    Args:
        filename: Path of an ``.sdf`` or ``.smi`` file.
        add_h: When true, ``Chem.AddHs`` is applied to every molecule.
        filters: Predicate; only molecules for which it is truthy are kept.
        size: Upper bound on the number of molecules kept (``None`` keeps all).
        validation: Forwarded to ``_generate_train_validation_test``.
        test: Forwarded to ``_generate_train_validation_test``.
    """
    # NOTE(review): the extracted text had lost one identifier throughout this
    # function (e.g. "= list(filter(...", "len(, ...", "=[:size]"). It is
    # reconstructed here as `self.data` -- confirm against the upstream source.
    self.log('Extracting {}..'.format(filename))

    if filename.endswith('.sdf'):
        self.data = list(filter(lambda x: x is not None, Chem.SDMolSupplier(filename)))
    elif filename.endswith('.smi'):
        # Use a context manager so the file handle is closed after reading.
        with open(filename, 'r') as smiles_file:
            self.data = [Chem.MolFromSmiles(line) for line in smiles_file.readlines()]

    self.data = list(map(Chem.AddHs, self.data)) if add_h else self.data
    self.data = list(filter(filters, self.data))
    self.data = self.data[:size]

    self.log('Extracted {} out of {} molecules {}adding Hydrogen!'.format(
        len(self.data),
        len(Chem.SDMolSupplier(filename)),
        '' if add_h else 'not '))

    self._generate_encoders_decoders()
    self._generate_AX()

    # it contains all the molecules stored as rdkit.Chem objects
    self.data = np.array(self.data)

    # it contains all the molecules stored as SMILES strings
    self.smiles = np.array(self.smiles)

    # a (N, L) matrix where N is the length of the dataset and each L-dim vector contains the
    # indices corresponding to a SMILE sequences with padding wrt the max length of the longest
    # SMILES sequence in the dataset (see self._genS)
    self.data_S = np.stack(self.data_S)

    # a (N, 9, 9) tensor where N is the length of the dataset and each 9x9 matrix contains the
    # indices of the positions of the ones in the one-hot representation of the adjacency tensor
    # (see self._genA)
    self.data_A = np.stack(self.data_A)

    # a (N, 9) matrix where N is the length of the dataset and each 9-dim vector contains the
    # indices of the positions of the ones in the one-hot representation of the annotation matrix
    # (see self._genX)
    self.data_X = np.stack(self.data_X)

    # a (N, 9) matrix where N is the length of the dataset and each 9-dim vector contains the
    # diagonal of the correspondent adjacency matrix
    self.data_D = np.stack(self.data_D)

    # a (N, F) matrix where N is the length of the dataset and each F vector contains features
    # of the correspondent molecule (see self._genF)
    self.data_F = np.stack(self.data_F)

    # a (N, 9) matrix where N is the length of the dataset and each 9-dim vector contains the
    # eigenvalues of the correspondent Laplacian matrix
    self.data_Le = np.stack(self.data_Le)

    # a (N, 9, 9) matrix where N is the length of the dataset and each 9x9 matrix contains the
    # eigenvectors of the correspondent Laplacian matrix
    self.data_Lv = np.stack(self.data_Lv)

    self.vertexes = self.data_F.shape[-2]
    self.features = self.data_F.shape[-1]

    self._generate_train_validation_test(validation, test)